def run(bucket_name, prefix, token=None):
  db = S3({})
  keys = []
  if token is not None:
    prefix += token + "/"

  # Poll until every expected output exists. The first key listed tells us how
  # many files to expect via its embedded "num_files" field.
  num_keys = None
  while num_keys is None or len(keys) < num_keys:
    try:
      keys = list(map(lambda o: o.key, db.get_entries("maccoss-tide", prefix)))
      if len(keys) > 0:
        num_keys = util.parse_file_name(keys[0])["num_files"]
    except Exception as e:
      print("Error reading", e)
      keys = []
    time.sleep(10)

  keys.sort(key=lambda k: util.parse_file_name(k)["suffix"])
  species_to_score = {}
  print("Processing...")
  objs = db.get_entries("maccoss-tide", prefix)
  for i, obj in enumerate(objs):
    it = confidence.Iterator(obj, None)
    s = it.sum("q-value")
    specie = util.parse_file_name(obj.key)["suffix"]
    species_to_score[specie] = s
    if s > 0:
      print(keys[i])
      print("***", i + 2, specie, s)
    # else:
    #   print(i + 2, util.parse_file_name(obj.key)["suffix"], s)
  return species_to_score
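# Hedged sketch of the wait condition above: every key embeds "num_files", so
# the first listed key tells the loop how many outputs to expect. The dict
# mirrors test_file_name_parser below; all values are illustrative.
m = {"prefix": 0, "timestamp": 123.4, "nonce": 42, "bin": 12, "num_bins": 13,
     "file_id": 3, "execute": False, "num_files": 4, "suffix": "hello", "ext": "txt"}
assert util.parse_file_name(util.file_name(m))["num_files"] == 4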
def create_payload(table_name: str,
                   key: str,
                   prefix: int,
                   file_id: Optional[int] = None,
                   num_files: Optional[int] = None,
                   offsets: Optional[List[int]] = None):
  extra_params = util.parse_file_name(key)
  extra_params["prefix"] = prefix
  if file_id:
    extra_params["file_id"] = file_id
    extra_params["num_files"] = num_files
    extra_params["bin"] = 1
    extra_params["num_bins"] = 1
  if offsets:
    extra_params["offsets"] = offsets
  return {
    "Records": [{
      "s3": {
        "bucket": {
          "name": table_name
        },
        "object": {
          "key": key,
        },
        "extra_params": extra_params
      }
    }]
  }
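# Hypothetical usage of create_payload: wrap a generated key for the second
# half of a two-file split. "my-table" is an illustrative name; the key is
# built with util.file_name so it parses cleanly.
m = {"prefix": 0, "timestamp": 123.4, "nonce": 42, "bin": 1, "num_bins": 1,
     "file_id": 1, "execute": False, "num_files": 1, "suffix": "hello", "ext": "txt"}
payload = create_payload("my-table", util.file_name(m), prefix=1,
                         file_id=2, num_files=2, offsets=[20, 35])
extra = payload["Records"][0]["s3"]["extra_params"]
assert extra["prefix"] == 1 and extra["file_id"] == 2 and extra["offsets"] == [20, 35]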
def copy(self, data_type, file_name, copy_dest_dir, excelize=False, xlsx_formats=None):
  dirs = {
    "current": self.data_dir + "/current",
    "master": self.data_dir + "/master"
  }
  _, naked_file_name, _, ext = parse_file_name(file_name)
  target_file_name = dirs[copy_dest_dir] + "/" + naked_file_name
  if os.path.exists(target_file_name):
    os.remove(target_file_name)
  copyfile(file_name, target_file_name)
  self.log.write("INFO [{}] {} copied to {}".format(
    data_type, naked_file_name, target_file_name))
  if copy_dest_dir == 'current' and excelize:
    if ext.lower() != 'csv':
      print("ERROR : cannot excelize a {} file".format(ext))
      exit()
    formats = [] if xlsx_formats is None else xlsx_formats
    xlsx_file_name = csv_2_xlsx(target_file_name, "", formats)
    self.log.write("INFO [{}] {} re-saved as {}".format(
      data_type, naked_file_name, xlsx_file_name))
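# Hypothetical call, assuming a manager instance `dm` whose data_dir holds
# "current" and "master" subdirectories: copy a CSV into current/ and re-save
# it as .xlsx. The instance and path are illustrative, not from the source.
# dm.copy("prices", "incoming/prices.csv", "current", excelize=True)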
def run(key, params, input_format, output_format, offsets):
  objects = util.get_objects(params["bucket"], "0/", params)
  assert len(objects) == 1
  object_key = objects[0].key
  match = open(key).read()
  payload = {
    "Records": [{
      "s3": {
        "bucket": {
          "name": params["bucket"],
        },
        "object": {
          "key": object_key,
        },
        "extra_params": {
          "prefix": output_format["prefix"],
          "species": util.parse_file_name(match)["suffix"]
        }
      }
    }]
  }
  client = boto3.client("lambda")
  response = client.invoke(FunctionName=params["output_function"],
                           InvocationType="Event",
                           Payload=json.dumps(payload))
  assert response["ResponseMetadata"]["HTTPStatusCode"] == 202
  return []
def get_stage(payload: Dict[str, Any]) -> int:
  s3 = payload["Records"][0]["s3"]
  # A prefix carried in extra_params overrides the one encoded in the key.
  if "extra_params" in s3:
    extra_params = s3["extra_params"]
    if "prefix" in extra_params:
      return extra_params["prefix"]
  return util.parse_file_name(s3["object"]["key"])["prefix"]
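# Hedged check of the precedence above: a prefix in extra_params wins over the
# prefix encoded in the key. Built with create_payload from earlier; all
# values are illustrative.
m = {"prefix": 0, "timestamp": 123.4, "nonce": 42, "bin": 1, "num_bins": 1,
     "file_id": 1, "execute": False, "num_files": 1, "suffix": "hello", "ext": "txt"}
event = create_payload("my-table", util.file_name(m), prefix=3)
assert get_stage(event) == 3  # not the 0 encoded in the key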
def test_file_name_parser(self):
  m = {
    "prefix": 0,
    "timestamp": 123.4,
    "nonce": 42,
    "bin": 12,
    "num_bins": 13,
    "file_id": 3,
    "execute": False,
    "num_files": 4,
    "suffix": "hello",
    "ext": "txt"
  }
  self.assertDictEqual(m, util.parse_file_name(util.file_name(m)))
  self.assertEqual(
    "0/123.400000-13/1-4/1-0.000000-0-suffix.txt",
    util.file_name(
      util.parse_file_name("0/123.400000-13/1-4/1-0.000000-0-suffix.txt")))
def run_application(database, bucket_name: str, key: str,
                    input_format: Dict[str, Any], output_format: Dict[str, Any],
                    offsets: List[int], params: Dict[str, Any]):
  temp_file = "/tmp/{0:s}".format(key)
  util.make_folder(util.parse_file_name(key))

  if len(offsets) == 0:
    # No byte range requested: fetch the whole object.
    database.download(bucket_name, key, temp_file)
  else:
    # Materialize only the requested slice of the object.
    obj = database.get_entry(bucket_name, key)
    format_lib = importlib.import_module("formats." + params["input_format"])
    iterator_class = getattr(format_lib, "Iterator")
    iterator = iterator_class(obj, OffsetBounds(offsets[0], offsets[1]))
    items = iterator.get(iterator.get_start_index(), iterator.get_end_index())
    with open(temp_file, "wb+") as f:
      iterator_class.from_array(list(items), f, iterator.get_extra())

  application_lib = importlib.import_module("applications." + params["application"])
  application_method = getattr(application_lib, "run")
  output_files = application_method(database, temp_file, params, input_format,
                                    output_format)

  for output_file in output_files:
    p = util.parse_file_name(output_file.replace("/tmp/", ""))
    if p is None:
      # The application emitted a non-conforming name; keep its extension but
      # rename it according to the expected output format.
      index = output_file.rfind(".")
      ext = output_file[index + 1:]
      output_format["ext"] = ext
      new_key = util.file_name(output_format)
    else:
      new_key = util.file_name(p)
    with open(output_file, "rb") as f:
      database.put(params["bucket"], new_key, f, {})
  return True
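# A hypothetical params dict for run_application. "input_format" and
# "application" name modules resolved by importlib above; "new_line" appears
# in the tests below, while the application name and bucket are made up.
params = {
  "bucket": "my-bucket",
  "input_format": "new_line",       # loads formats/new_line.py
  "application": "my_application",  # loads applications/my_application.py
}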
def archive(self, data_type, file_names):
  ds = datetime.now().strftime("%Y-%m-%d")
  ts = datetime.now().strftime("%Y-%m-%dT%H%M%S")
  ark_dir = "{}/archive/{}/{}".format(self.data_dir, data_type, ds)
  make_dir(ark_dir)
  files = file_names if isinstance(file_names, list) else [file_names]
  for file in files:
    _, naked_file_name, stem, ext = parse_file_name(file)
    ark_file_name = "{}/{}--{}.{}".format(ark_dir, stem, ts, ext)
    copyfile(file, ark_file_name)
    self.log.write("INFO [{}] {} archived to {}".format(
      data_type, naked_file_name, ark_file_name))
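# Minimal runnable sketch of the list-normalization step in archive(), the
# one non-obvious line: a bare path and a list of paths are both accepted.
def normalize(file_names):
  return file_names if isinstance(file_names, list) else [file_names]

assert normalize("a.csv") == ["a.csv"]
assert normalize(["a.csv", "b.csv"]) == ["a.csv", "b.csv"]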
def wait_for_execution_to_finish(db, table, key, num_steps):
  entries = []
  m = util.parse_file_name(key)
  m["prefix"] = num_steps
  prefix = util.key_prefix(util.file_name(m))
  start_time = time.time()
  while len(entries) == 0:
    entries = db.get_entries(table, prefix)
    if time.time() - start_time > num_steps * 60 * 2:
      print("Timeout", num_steps * 60 * 2)
      raise TimeoutError
    time.sleep(10)
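# The poll loop above budgets two minutes per pipeline stage before raising:
num_steps = 5
timeout_seconds = num_steps * 60 * 2  # 600 s for a hypothetical five-stage run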
def test_basic(self):
  database: TestDatabase = TestDatabase()
  table1: TestTable = database.create_table("table1")
  with open("spacenet/1-1-1-tide.knn", "rb") as f:
    entry1: TestEntry = table1.add_entry(
      "5/1550529206.039528-957/1-1/1-1-1-tide.knn", f.read())
  with open("spacenet/3band_AOI_1_RIO_img147.tif", "rb") as f:
    entry2: TestEntry = table1.add_entry(
      "0/1550529206.039528-957/1-1/1-1-1-tide.tiff", f.read())
  params = {
    "bucket": table1.name,
    "input_prefix": 0
  }
  input_format = util.parse_file_name(entry1.key)
  output_format = dict(input_format)
  output_format["prefix"] = 6
  util.make_folder(output_format)
  draw_borders.run(database, entry1.key, params, input_format, output_format, [])
def test_basic(self):
  database = TestDatabase()
  table1 = database.create_table("table1")
  log = database.create_table("log")
  entry1 = table1.add_entry(
    "0/123.400000-13/1-1/1-1-0-suffix.new_line",
    "A B C\nD E F\nG H I\nJ K L\nM N O\nP Q R\n")
  input_format = util.parse_file_name(entry1.key)
  output_format = dict(input_format)
  output_format["prefix"] = 1
  params = {
    "execute": 0,
    "file": "split_file",
    "format": "new_line",
    "log": log.name,
    "name": "split",
    "output_function": "an-output-function",
    "ranges": False,
    "split_size": 20,
    "timeout": 60,
  }
  event = tutils.create_event(database, table1.name, entry1.key, params)
  context = tutils.create_context(params)
  split_file.handler(event, context)

  # A 36-byte input with split_size 20 should fan out into two invocations.
  invoke1 = get_invoke("an-output-function", table1.name, entry1.key,
                       prefix=1, offsets=[0, 19], file_id=1, num_files=2)
  invoke2 = get_invoke("an-output-function", table1.name, entry1.key,
                       prefix=1, offsets=[20, 35], file_id=2, num_files=2)
  expected_invokes = [invoke1, invoke2]
  self.check_payload_equality(expected_invokes, database.payloads)
def __process_logs__(self):
  [key, identifier] = self.logger_queue.get()
  body = self.__get_log__(key)
  if body is None:
    # Log not available yet; requeue and retry later.
    self.logger_queue.put([key, identifier])
    return
  for payload in body["payloads"]:
    token = identifier[0]
    s3 = payload["Records"][0]["s3"]
    for ancestry_identifier in s3["ancestry"]:
      self.finished_tasks.add(tuple(ancestry_identifier))
    if "log" in payload:
      child_identifier = tuple(payload["log"])
    else:
      # Derive the child's identifier from its key, letting any extra_params
      # override the fields encoded in the key.
      c = util.parse_file_name(s3["object"]["key"])
      if "extra_params" in s3:
        c = {**c, **s3["extra_params"]}
      child_identifier = (token, c["prefix"], c["bin"], c["num_bins"],
                          c["file_id"], c["num_files"])
      self.find_queue.put(child_identifier)
    # Identifier layout: (token, prefix, bin, num_bins, file_id, num_files).
    assert (child_identifier[2] <= child_identifier[3]
            and child_identifier[4] <= child_identifier[5])
    self.payload_map[token][child_identifier] = payload
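# The asserted invariant, spelled out on an illustrative identifier tuple:
# positions are (token, prefix, bin, num_bins, file_id, num_files), so an
# index can never exceed its corresponding total.
child = ("123.400000-42", 1, 1, 13, 3, 4)
assert child[2] <= child[3] and child[4] <= child[5]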
def __check_tasks__(self):
  key = self.key_queue.get()
  m = util.parse_file_name(key)
  token: str = "{0:f}-{1:d}".format(m["timestamp"], m["nonce"])
  identifier: Token = (token, m["prefix"], m["bin"], m["num_bins"],
                       m["file_id"], m["num_files"])
  if key not in self.processed_logs:
    self.processed_logs.add(key)
    self.finished_tasks.add(identifier)
    self.logger_queue.put([key, identifier])
    if token not in self.payload_map:
      # First time we see this token: start tracking its job.
      self.payload_map[token] = {}
      self.tokens.append(token)
      job = Job("", self.bucket_name, token, float(token.split("-")[0]))
      self.__add_task__(job)

  # Reap finished worker threads.
  i = 0
  while i < len(self.tasks):
    if not self.tasks[i].running:
      self.tasks[i].join()
      self.tasks.pop(i)
    else:
      i += 1
def process_objects(s3, bucket_name, objects, params, subfolder):
  costs = {-1: 0}
  duration_cost = {-1: 0}
  durations = {-1: [sys.maxsize, 0]}
  list_count = {-1: 0}
  read_count = {-1: 0}
  write_count = {-1: 0}
  path = os.path.dirname(os.path.realpath(__file__))
  memory_parameters = json.loads(open(path + "/../json/memory.json").read())

  statistics = []
  for stage in params["pipeline"]:
    statistics.append({"name": stage["name"], "messages": []})

  for objSum in objects:
    # Cache each log locally under `subfolder` so reruns skip the S3 fetch.
    if subfolder:
      name = subfolder + "/" + objSum.key.replace("/", ".")
    else:
      name = None
    obj_format = util.parse_file_name(objSum.key)
    if name and os.path.isfile(name):
      content = open(name, "r").read()
      if len(content.strip()) > 0:
        body = json.loads(content)
      else:
        continue
    else:
      if name:
        Path(name).touch()
        print("Not Found", name)
      obj = s3.Object(bucket_name, objSum.key)
      x = obj.get()["Body"].read()
      body = json.loads(x.decode("utf-8"))
      if name:
        with open(name, "wb+") as f:
          f.write(x)

    duration = body["duration"]
    stage = obj_format["prefix"] - 1
    # Accumulate per-stage totals plus an overall total under key -1.
    for prefix in [-1, stage]:
      if prefix not in costs:
        costs[prefix] = 0
        list_count[prefix] = 0
        write_count[prefix] = 0
        read_count[prefix] = 0
        duration_cost[prefix] = 0
        durations[prefix] = [sys.maxsize, 0]
      list_count[prefix] += body["list_count"]
      read_count[prefix] += body["read_count"]
      write_count[prefix] += body["write_count"]
      # S3 requests: $0.005 per 1,000 writes/lists, $0.0004 per 1,000 reads.
      costs[prefix] += (body["write_count"] + body["list_count"]) / 1000.0 * 0.005
      costs[prefix] += body["read_count"] / 1000.0 * 0.0004
      # Lambda compute: per-100ms rate for this memory size, rounded up.
      memory_size = str(params["functions"][body["name"]]["memory_size"])
      compute_cost = memory_parameters["lambda"][memory_size] * int(
        float(duration + 99) / 100)
      costs[prefix] += compute_cost
      duration_cost[prefix] += compute_cost
      start_time = body["start_time"]
      end_time = start_time + body["duration"] / 1000.0
      for p in [-1, prefix]:
        durations[p][0] = min(durations[p][0], start_time)
        durations[p][1] = max(durations[p][1], end_time)
    statistics[stage]["messages"].append({"log": objSum.key, "body": body})

  print("Write count", write_count[-1])
  print("Read count", read_count[-1])
  print("List count", list_count[-1])
  print("Duration cost", duration_cost[-1])
  return [statistics, costs, durations]
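# Worked example of the cost arithmetic above, using the request prices
# hard-coded in process_objects and Lambda's 100 ms billing rounding; the
# per-increment rate is assumed to come from memory.json.
duration_ms = 250
billed_increments = int(float(duration_ms + 99) / 100)  # 3 increments (300 ms)
request_cost = (10 + 2) / 1000.0 * 0.005 + 100 / 1000.0 * 0.0004
# 10 writes + 2 lists and 100 reads -> $0.00006 + $0.00004 = $0.0001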