Example #1
def run(bucket_name, prefix, token=None):
    db = S3({})
    keys = []
    if token is not None:
        prefix += token + "/"

    num_keys = None
    while num_keys is None or len(keys) < num_keys:
        try:
            keys = list(
                map(lambda o: o.key,
                    list(db.get_entries("maccoss-tide", prefix))))
            if len(keys) > 0:
                num_keys = util.parse_file_name(keys[0])["num_files"]
        except Exception as e:
            print("Error reading", e)
            keys = []
        time.sleep(10)
    keys.sort(key=lambda k: util.parse_file_name(k)["suffix"])

    species_to_score = {}

    print("Processing...")
    objs = db.get_entries("maccoss-tide", prefix)
    for i, obj in enumerate(objs):
        it = confidence.Iterator(obj, None)
        s = it.sum("q-value")
        specie = util.parse_file_name(obj.key)["suffix"]
        species_to_score[specie] = s
        if s > 0:
            print(keys[i])
            print("***", i + 2, specie, s)
        # else:
        #   print(i+2, util.parse_file_name(obj.key)["suffix"], s)
    return species_to_score
Example #2
def create_payload(table_name: str,
                   key: str,
                   prefix: int,
                   file_id: Optional[int] = None,
                   num_files: Optional[int] = None,
                   offsets: Optional[List[int]] = None):
    extra_params = util.parse_file_name(key)
    extra_params["prefix"] = prefix
    if file_id:
        extra_params["file_id"] = file_id
        extra_params["num_files"] = num_files
        extra_params["bin"] = 1
        extra_params["num_bins"] = 1

    if offsets:
        extra_params["offsets"] = offsets

    return {
        "Records": [{
            "s3": {
                "bucket": {
                    "name": table_name
                },
                "object": {
                    "key": key,
                },
                "extra_params": extra_params
            }
        }]
    }
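create_payload builds the same S3-event-shaped record that the handlers in the other examples consume. A minimal usage sketch, with the bucket, key, and function name borrowed from the test examples further down (all three are illustrative, not part of the original code):

import json

import boto3

payload = create_payload("table1",
                         "0/123.400000-13/1-1/1-1-0-suffix.new_line",
                         prefix=1,
                         file_id=1,
                         num_files=2)

# Fire-and-forget invocation, the same pattern Example #4 uses
# (json.dumps is equivalent to the JSONEncoder call used there).
client = boto3.client("lambda")
client.invoke(FunctionName="an-output-function",
              InvocationType="Event",
              Payload=json.dumps(payload))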
Example #3
 def copy(self,
          data_type,
          file_name,
          copy_dest_dir,
          excelize=False,
          xlsx_formats=None):
     dirs = {
         "current": self.data_dir + "/current",
         "master": self.data_dir + "/master"
     }
     _, naked_file_name, _, ext = parse_file_name(file_name)
     target_file_name = dirs[copy_dest_dir] + "/" + naked_file_name
     if os.path.exists(target_file_name):
         os.remove(target_file_name)
     copyfile(file_name, target_file_name)
     self.log.write("INFO [{}] {} copied to {}".format(
         data_type, naked_file_name, target_file_name))
     if copy_dest_dir == 'current' and excelize:
         if ext.lower() != 'csv':
             print("ERROR : cannt excelize a {} file".format(ext))
             exit()
         formats = [] if xlsx_formats is None else xlsx_formats
         xlsx_file_name = csv_2_xlsx(target_file_name, "", formats)
         self.log.write("INFO [{}] {} re-saved as {}".format(
             data_type, naked_file_name, xlsx_file_name))
Example #4
def run(key, params, input_format, output_format, offsets):
    objects = util.get_objects(params["bucket"], "0/", params)
    assert (len(objects) == 1)
    object_key = objects[0].key
    match = open(key).read()

    payload = {
        "Records": [{
            "s3": {
                "bucket": {
                    "name": params["bucket"],
                },
                "object": {
                    "key": object_key,
                },
                "extra_params": {
                    "prefix": output_format["prefix"],
                    "species": util.parse_file_name(match)["suffix"]
                }
            }
        }]
    }

    client = boto3.client("lambda")
    response = client.invoke(FunctionName=params["output_function"],
                             InvocationType="Event",
                             Payload=json.JSONEncoder().encode(payload))
    assert (response["ResponseMetadata"]["HTTPStatusCode"] == 202)

    return []
Example #5
def get_stage(payload: Dict[str, Any]) -> int:
    s3 = payload["Records"][0]["s3"]
    if "extra_params" in s3:
        extra_params = s3["extra_params"]
        if "prefix" in extra_params:
            return extra_params["prefix"]
    return util.parse_file_name(s3["object"]["key"])["prefix"]
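get_stage prefers the stage number carried in extra_params and only falls back to parsing it out of the object key. A small sketch of both paths, reusing the payload shape from Examples #2 and #4 (the key is illustrative, taken from the tests below):

payload = {
    "Records": [{
        "s3": {
            "bucket": {"name": "table1"},
            "object": {"key": "0/123.400000-13/1-1/1-1-0-suffix.new_line"},
            "extra_params": {"prefix": 3}
        }
    }]
}
assert get_stage(payload) == 3  # extra_params wins

del payload["Records"][0]["s3"]["extra_params"]
assert get_stage(payload) == 0  # falls back to the prefix encoded in the key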
Example #6
 def test_file_name_parser(self):
     m = {
         "prefix": 0,
         "timestamp": 123.4,
         "nonce": 42,
         "bin": 12,
         "num_bins": 13,
         "file_id": 3,
         "execute": False,
         "num_files": 4,
         "suffix": "hello",
         "ext": "txt"
     }
     self.assertDictEqual(m, util.parse_file_name(util.file_name(m)))
     self.assertEqual(
         "0/123.400000-13/1-4/1-0.000000-0-suffix.txt",
         util.file_name(
             util.parse_file_name(
                 "0/123.400000-13/1-4/1-0.000000-0-suffix.txt")))
Example #7
def run_application(database, bucket_name: str, key: str,
                    input_format: Dict[str, Any],
                    output_format: Dict[str, Any],
                    offsets: List[int], params: Dict[str, Any]):
    temp_file = "/tmp/{0:s}".format(key)
    util.make_folder(util.parse_file_name(key))

    if len(offsets) == 0:
        database.download(bucket_name, key, temp_file)
    else:
        obj = database.get_entry(bucket_name, key)
        format_lib = importlib.import_module("formats." +
                                             params["input_format"])
        iterator_class = getattr(format_lib, "Iterator")
        iterator = iterator_class(obj, OffsetBounds(offsets[0], offsets[1]))
        items = iterator.get(iterator.get_start_index(),
                             iterator.get_end_index())
        with open(temp_file, "wb+") as f:
            items = list(items)
            iterator_class.from_array(list(items), f, iterator.get_extra())

    application_lib = importlib.import_module("applications." +
                                              params["application"])
    application_method = getattr(application_lib, "run")
    output_files = application_method(database, temp_file, params,
                                      input_format, output_format)

    found = False
    for output_file in output_files:
        p = util.parse_file_name(output_file.replace("/tmp/", ""))
        if p is None:
            index = output_file.rfind(".")
            ext = output_file[index + 1:]
            output_format["ext"] = ext
            new_key = util.file_name(output_format)
        else:
            new_key = util.file_name(p)

        with open(output_file, "rb") as f:
            database.put(params["bucket"], new_key, f, {})
    return True
Example #8
 def archive(self, data_type, file_names):
     ds = datetime.now().strftime("%Y-%m-%d")
     ts = datetime.now().strftime("%Y-%m-%dT%H%M%S")
     ark_dir = "{}/archive/{}/{}".format(self.data_dir, data_type, ds)
     make_dir(ark_dir)
     files = [file_names] if type(file_names) is not list else file_names
     for file in files:
         _, naked_file_name, stem, ext = parse_file_name(file)
         ark_file_name = "{}/{}--{}.{}".format(ark_dir, stem, ts, ext)
         copyfile(file, ark_file_name)
         self.log.write("INFO [{}] {} archived to {}".format(
             data_type, naked_file_name, ark_file_name))
Example #9
def wait_for_execution_to_finish(db, table, key, num_steps):
    entries = []
    m = util.parse_file_name(key)
    m["prefix"] = num_steps
    prefix = util.key_prefix(util.file_name(m))
    start_time = time.time()
    while len(entries) == 0:
        entries = db.get_entries(table, prefix)
        if time.time() - start_time > num_steps * 60 * 2:
            print("Tiemout", num_steps * 60 * 2)
            raise TimeoutError
        time.sleep(10)
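A usage sketch for the helper above: block until stage-3 output appears in the table, polling every 10 seconds, and raise TimeoutError after num_steps * 120 seconds. The db handle, table name, and key are illustrative, assuming the same database wrapper the other examples use:

key = "0/123.400000-13/1-1/1-1-0-suffix.new_line"
wait_for_execution_to_finish(db, "table1", key, num_steps=3)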
Example #10
  def test_basic(self):
    database: TestDatabase = TestDatabase()
    table1: TestTable = database.create_table("table1")
    with open("spacenet/1-1-1-tide.knn", "rb") as f:
      entry1: TestEntry = table1.add_entry("5/1550529206.039528-957/1-1/1-1-1-tide.knn", f.read())
    with open("spacenet/3band_AOI_1_RIO_img147.tif", "rb") as f:
      entry2: TestEntry = table1.add_entry("0/1550529206.039528-957/1-1/1-1-1-tide.tiff", f.read())

    params = {
      "bucket": table1.name,
      "input_prefix": 0
    }
    input_format = util.parse_file_name(entry1.key)
    output_format = dict(input_format)
    output_format["prefix"] = 6
    util.make_folder(output_format)
    draw_borders.run(database, entry1.key, params, input_format, output_format, [])
Example #11
    def test_basic(self):
        database = TestDatabase()
        table1 = database.create_table("table1")
        log = database.create_table("log")
        entry1 = table1.add_entry(
            "0/123.400000-13/1-1/1-1-0-suffix.new_line",
            "A B C\nD E F\nG H I\nJ K L\nM N O\nP Q R\n")
        input_format = util.parse_file_name(entry1.key)
        output_format = dict(input_format)
        output_format["prefix"] = 1

        params = {
            "execute": 0,
            "file": "split_file",
            "format": "new_line",
            "log": log.name,
            "name": "split",
            "output_function": "an-output-function",
            "ranges": False,
            "split_size": 20,
            "timeout": 60,
        }

        event = tutils.create_event(database, table1.name, entry1.key, params)
        context = tutils.create_context(params)
        split_file.handler(event, context)

        invoke1 = get_invoke("an-output-function",
                             table1.name,
                             entry1.key,
                             prefix=1,
                             offsets=[0, 19],
                             file_id=1,
                             num_files=2)
        invoke2 = get_invoke("an-output-function",
                             table1.name,
                             entry1.key,
                             prefix=1,
                             offsets=[20, 35],
                             file_id=2,
                             num_files=2)

        expected_invokes = [invoke1, invoke2]
        self.check_payload_equality(expected_invokes, database.payloads)
Example #12
  def __process_logs__(self):
    [key, identifier] = self.logger_queue.get()
    body = self.__get_log__(key)
    if body is None:
      self.logger_queue.put([key, identifier])
      return

    for payload in body["payloads"]:
      token = identifier[0]
      s3 = payload["Records"][0]["s3"]
      for ancestry_identifier in s3["ancestry"]:
        self.finished_tasks.add(tuple(ancestry_identifier))
      if "log" in payload:
        child_identifier = tuple(payload["log"])
      else:
        c = util.parse_file_name(s3["object"]["key"])
        if "extra_params" in s3:
          c = {**c, **s3["extra_params"]}
        child_identifier = (token, c["prefix"], c["bin"], c["num_bins"], c["file_id"], c["num_files"])
      self.find_queue.put(child_identifier)
      assert(child_identifier[2] <= child_identifier[3] and child_identifier[4] <= child_identifier[5])
      self.payload_map[token][child_identifier] = payload
Example #13
  def __check_tasks__(self):
    key = self.key_queue.get()
    m = util.parse_file_name(key)
    token: str = "{0:f}-{1:d}".format(m["timestamp"], m["nonce"])
    identifier: Token = (token, m["prefix"], m["bin"], m["num_bins"], m["file_id"], m["num_files"])
    if key not in self.processed_logs:
      self.processed_logs.add(key)
      self.finished_tasks.add(identifier)
      self.logger_queue.put([key, identifier])

      if token not in self.payload_map:
        self.payload_map[token] = {}
        self.tokens.append(token)
        job = Job("", self.bucket_name, token, float(token.split("-")[0]))
        self.__add_task__(job)

    i = 0
    while i < len(self.tasks):
      if not self.tasks[i].running:
        self.tasks[i].join()
        self.tasks.pop(i)
      else:
        i += 1
Example #14
def process_objects(s3, bucket_name, objects, params, subfolder):
    costs = {-1: 0}
    duration_cost = {-1: 0}
    durations = {-1: [sys.maxsize, 0]}
    list_count = {-1: 0}
    read_count = {-1: 0}
    write_count = {-1: 0}
    path = os.path.dirname(os.path.realpath(__file__))
    memory_parameters = json.loads(
        open(os.path.join(path, "../json/memory.json")).read())
    statistics = []

    for stage in params["pipeline"]:
        statistics.append({"name": stage["name"], "messages": []})

    for objSum in objects:
        if subfolder:
            name = subfolder + "/" + objSum.key.replace("/", ".")
        else:
            name = None
        obj_format = util.parse_file_name(objSum.key)
        if name and os.path.isfile(name):
            content = open(name, "r").read()
            if len(content.strip()) > 0:
                body = json.loads(open(name, "r").read())
            else:
                continue
        else:
            if name:
                Path(name).touch()
                print("Not Found", name)
            obj = s3.Object(bucket_name, objSum.key)
            x = obj.get()["Body"].read()
            body = json.loads(x.decode("utf-8"))
            if name:
                with open(name, "wb+") as f:
                    f.write(x)
        duration = body["duration"]
        stage = obj_format["prefix"] - 1

        for prefix in [-1, stage]:
            if prefix not in costs:
                costs[prefix] = 0
                list_count[prefix] = 0
                write_count[prefix] = 0
                read_count[prefix] = 0
                duration_cost[prefix] = 0
                durations[prefix] = [sys.maxsize, 0]
            list_count[prefix] += body["list_count"]
            read_count[prefix] += body["read_count"]
            write_count[prefix] += body["write_count"]
            costs[prefix] += (body["write_count"] +
                              body["list_count"]) / 1000.0 * 0.005
            costs[prefix] += body["read_count"] / 1000.0 * 0.0004
            memory_size = str(params["functions"][body["name"]]["memory_size"])
            lambda_cost = memory_parameters["lambda"][memory_size] * int(
                float(duration + 99) / 100)
            costs[prefix] += lambda_cost
            duration_cost[prefix] += lambda_cost
            start_time = body["start_time"]
            end_time = start_time + body["duration"] / 1000.0

            for p in [-1, prefix]:
                durations[p][0] = min(durations[p][0], start_time)
                durations[p][1] = max(durations[p][1], end_time)

        statistics[stage]["messages"].append({"log": objSum.key, "body": body})

    print("Write count", write_count[-1])
    print("Read count", read_count[-1])
    print("List count", list_count[-1])
    print("Duration cost", duration_cost[-1])
    return [statistics, costs, durations]
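The per-log cost arithmetic above combines request pricing (0.005 per 1000 writes/lists, 0.0004 per 1000 reads) with a duration charge rounded up to 100 ms billing units. A worked sketch for one log body, where the per-100 ms price is an illustrative stand-in for memory_parameters["lambda"][memory_size], not a value from the original:

write_count, list_count, read_count = 2, 1, 10
duration = 250                  # milliseconds
price_per_100ms = 0.000001667   # hypothetical value from memory.json

cost = (write_count + list_count) / 1000.0 * 0.005   # write/list requests
cost += read_count / 1000.0 * 0.0004                 # read requests
cost += price_per_100ms * int(float(duration + 99) / 100)  # 3 billing units
# cost == 0.000015 + 0.000004 + 0.000005001 ≈ 0.000024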