def validate_dict_params(orchestrator_params: Dict) -> None:
    """Ensures dictionary of orchestrator parameters contains necessary
    parameters.

    Parameters
    ----------
    orchestrator_params : dict
        Dictionary containing parameters for AbyssOrchestrator object.

    Returns
    -------
    Returns None if parameters are valid, raises error if invalid.
    """
    try:
        for parameter_name, parameter_type in REQUIRED_ORCHESTRATOR_PARAMETERS:
            parameter = orchestrator_params[parameter_name]
            assert isinstance(parameter, parameter_type)
    except AssertionError:
        raise ValueError(
            f"Parameter {parameter_name} is not of type {parameter_type}")
    except KeyError:
        raise ValueError(f"Required parameter {parameter_name} not found")

    worker_params = orchestrator_params["worker_params"]

    for worker_param in worker_params:
        Worker.validate_dict_params(worker_param)
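# Illustrative usage sketch only: the required keys and their types are defined
# by REQUIRED_ORCHESTRATOR_PARAMETERS, so the parameter names shown here are
# assumptions inferred from AbyssOrchestrator.__init__ rather than the actual
# constant.
#
#     orchestrator_params = {
#         "abyss_id": "1234",
#         "globus_source_eid": "source-endpoint",
#         "transfer_token": "token",
#         "compressed_files": [{"file_path": "/a.zip", "compressed_size": 100}],
#         "worker_params": [{
#             "globus_eid": "0",
#             "funcx_eid": "1",
#             "max_available_space": 10,
#             "transfer_dir": "/transfer",
#             "decompress_dir": "/dir",
#         }],
#     }
#     validate_dict_params(orchestrator_params)  # raises ValueError if invalid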
def test_dispatch(self):
    workers = [
        Worker.from_dict({
            "globus_eid": "0",
            "funcx_eid": "1",
            "max_available_space": 97,
            "transfer_dir": "/transfer",
            "decompress_dir": "/dir",
        }),
        Worker.from_dict({
            "globus_eid": "0",
            "funcx_eid": "1",
            "max_available_space": 57,
            "transfer_dir": "/transfer",
            "decompress_dir": "/dir",
        })
    ]
    worker_batches = {
        workers[0].worker_id: [
            Job.from_dict({
                "file_path": f"/{i}",
                "compressed_size": 0,
                "decompressed_size": i,
            }) for i in range(10, 20)
        ],
        workers[1].worker_id: [
            Job.from_dict({
                "file_path": f"/{i}",
                "compressed_size": 0,
                "decompressed_size": i,
            }) for i in range(0, 10)
        ]
    }

    preserved_batches = {
        workers[0].worker_id: worker_batches[workers[0].worker_id],
        workers[1].worker_id: worker_batches[workers[1].worker_id]
    }

    dispatcher = MaxFirstDispatcher(workers)
    dispatcher.dispatch_batch(worker_batches)
    worker_queues = dispatcher.worker_queues

    for worker_id, worker_batch in preserved_batches.items():
        preserved_batches[worker_id] = sorted(
            worker_batch, reverse=True,
            key=(lambda x: x.decompressed_size))

    for worker_id, worker_queue in worker_queues.items():
        self.assertEqual(list(worker_queue.queue),
                         preserved_batches[worker_id])

    self.assertEqual(worker_batches, {
        workers[0].worker_id: [],
        workers[1].worker_id: []
    })
    self.assertEqual(dispatcher.worker_batches, {
        workers[0].worker_id: [],
        workers[1].worker_id: []
    })
def test_update_worker(self):
    workers = [
        Worker.from_dict({
            "globus_eid": "0",
            "funcx_eid": "1",
            "max_available_space": 97,
            "transfer_dir": "/transfer",
            "decompress_dir": "/dir",
        }),
        Worker.from_dict({
            "globus_eid": "0",
            "funcx_eid": "1",
            "max_available_space": 57,
            "transfer_dir": "/transfer",
            "decompress_dir": "/dir",
        })
    ]
    jobs = [
        Job.from_dict({
            "file_path": "/",
            "compressed_size": 0,
            "decompressed_size": 10
        }),
        Job.from_dict({
            "file_path": "/",
            "compressed_size": 0,
            "decompressed_size": 5
        })
    ]
    worker_0 = workers[0]
    batcher = DummyBatcher(workers, jobs)

    worker_0.curr_available_space += 100
    batcher.update_worker(worker_0)
    self.assertEqual(
        worker_0.curr_available_space,
        batcher.worker_dict[worker_0.worker_id].curr_available_space)

    with self.assertRaises(ValueError):
        batcher.update_worker(
            Worker.from_dict({
                "globus_eid": "0",
                "funcx_eid": "1",
                "max_available_space": 57,
                "transfer_dir": "/transfer",
                "decompress_dir": "/dir",
            }))
def test_batch(self):
    workers = [
        Worker.from_dict({
            "globus_eid": "0",
            "funcx_eid": "1",
            "max_available_space": 97,
            "transfer_dir": "/transfer",
            "decompress_dir": "/dir",
        }),
        Worker.from_dict({
            "globus_eid": "0",
            "funcx_eid": "1",
            "max_available_space": 57,
            "transfer_dir": "/transfer",
            "decompress_dir": "/dir",
        })
    ]
    jobs = [
        Job.from_dict({
            "file_path": f"/{i}",
            "compressed_size": 0,
            "decompressed_size": i,
        }) for i in range(10, 20)
    ]
    batcher = MMDBatcher(workers, jobs)
    batches = batcher.worker_batches

    batch_0 = batches[workers[0].worker_id]
    self.assertEqual(set([job.decompressed_size for job in batch_0]),
                     {10, 12, 13, 15, 16, 18})

    batch_1 = batches[workers[1].worker_id]
    self.assertEqual(set([job.decompressed_size for job in batch_1]),
                     {11, 14, 17})

    queued_jobs = []
    while not batcher.job_queue.empty():
        queued_jobs.append(batcher.job_queue.get())

    self.assertEqual(set([job.decompressed_size for job in queued_jobs]),
                     {19})

    for _, worker_batch in batches.items():
        for job in worker_batch:
            self.assertTrue(job not in batcher.jobs)
def test_multiple_batch(self):
    workers = [
        Worker.from_dict({
            "globus_eid": "0",
            "funcx_eid": "1",
            "max_available_space": 97,
            "transfer_dir": "/transfer",
            "decompress_dir": "/dir",
        }),
        Worker.from_dict({
            "globus_eid": "0",
            "funcx_eid": "1",
            "max_available_space": 57,
            "transfer_dir": "/transfer",
            "decompress_dir": "/dir",
        })
    ]
    jobs = [
        Job.from_dict({
            "file_path": f"/{i}",
            "compressed_size": 0,
            "decompressed_size": i,
        }) for i in range(10, 20)
    ]
    batcher = MMDBatcher(workers, jobs)
    batches = batcher.worker_batches

    queued_jobs = []
    for _ in range(batcher.job_queue.qsize()):
        job = batcher.job_queue.get()
        queued_jobs.append(job)
        batcher.job_queue.put(job)

    self.assertEqual(set([job.decompressed_size for job in queued_jobs]),
                     {19})

    batcher.batch_job(
        Job.from_dict({
            "file_path": f"/{100}",
            "compressed_size": 0,
            "decompressed_size": 100,
        }))
    batches_1 = batcher.worker_batches

    self.assertEqual(batches, batches_1)
def test_from_dict(self):
    good_dict_params = {
        "globus_eid": "0",
        "funcx_eid": "1",
        "max_available_space": 10,
        "transfer_dir": "/transfer",
        "decompress_dir": "/dir",
    }

    worker = Worker.from_dict(good_dict_params)

    self.assertEqual(worker.globus_eid, good_dict_params["globus_eid"])
    self.assertEqual(worker.funcx_eid, good_dict_params["funcx_eid"])
    self.assertEqual(worker.max_available_space,
                     good_dict_params["max_available_space"])
    self.assertEqual(worker.transfer_dir,
                     good_dict_params["transfer_dir"])
    self.assertEqual(worker.decompress_dir,
                     good_dict_params["decompress_dir"])
def test_validate_dict_params(self):
    with self.assertRaises(ValueError):
        bad_dict_params = {
            "globus_eid": 10,
            "funcx_eid": "1",
            "max_available_space": 10,
            "transfer_dir": "/transfer",
            "decompress_dir": "/dir"
        }
        Worker.validate_dict_params(bad_dict_params)

    with self.assertRaises(ValueError):
        bad_dict_params_1 = {
            "globus_eid": "0",
            "funcx_eid": "1",
            "max_available_space": 10,
            "transfer_dir": "/transfer"
        }
        Worker.validate_dict_params(bad_dict_params_1)

    with self.assertRaises(ValueError):
        bad_dict_params_2 = {
            "globus_eid": "0",
            "funcx_eid": "1",
            "max_available_space": 10,
            "transfer_dir": "/transfer",
            "decompress_dir": "/dir",
            "this should": "have an effect"
        }
        Worker.validate_dict_params(bad_dict_params_2)

    good_dict_params = {
        "globus_eid": "0",
        "funcx_eid": "1",
        "max_available_space": 10,
        "transfer_dir": "/transfer",
        "decompress_dir": "/dir"
    }
    Worker.validate_dict_params(good_dict_params)
def test_dispatch(self):
    workers = [
        Worker.from_dict({
            "globus_eid": "0",
            "funcx_eid": "1",
            "max_available_space": 97,
            "transfer_dir": "/transfer",
            "decompress_dir": "/dir",
        })
    ]
    worker_batches = {
        workers[0].worker_id: [
            Job.from_dict({
                "file_path": f"/{i}",
                "compressed_size": 0,
                "decompressed_size": i,
            }) for i in range(1, 11)
        ]
    }

    worker_batch_0 = worker_batches[workers[0].worker_id]
    preserved_batches = {
        workers[0].worker_id: [
            worker_batch_0[9], worker_batch_0[0], worker_batch_0[1],
            worker_batch_0[2], worker_batch_0[3], worker_batch_0[8],
            worker_batch_0[4], worker_batch_0[5], worker_batch_0[7],
            worker_batch_0[6]
        ]
    }

    dispatcher = MaxMinDispatcher(workers)
    dispatcher.dispatch_batch(worker_batches)
    worker_queues = dispatcher.worker_queues

    for worker_id, worker_queue in worker_queues.items():
        self.assertEqual(list(worker_queue.queue),
                         preserved_batches[worker_id])

    self.assertEqual(worker_batches, {workers[0].worker_id: []})
    self.assertEqual(dispatcher.worker_batches, {workers[0].worker_id: []})
def test_is_failed_job(self):
    workers = []
    for i in range(10):
        workers.append(Worker(None, None, None, None, i))

    jobs = [
        Job.from_dict({
            "file_path": "/",
            "compressed_size": 0,
            "decompressed_size": 10
        }),
        Job.from_dict({
            "file_path": "/",
            "compressed_size": 0,
            "decompressed_size": 5
        })
    ]
    batcher = DummyBatcher(workers, jobs)

    self.assertTrue(batcher._is_failed_job(jobs[0]))
    self.assertFalse(batcher._is_failed_job(jobs[1]))
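# Sketch of the condition this test exercises, stated as an assumption rather
# than the Batcher's actual implementation: a job is treated as failed when no
# worker has enough available space for its decompressed output. With worker
# capacities 0-9, a decompressed_size of 10 fails while 5 does not.
#
#     def _is_failed_job(self, job):
#         return all(worker.curr_available_space < job.decompressed_size
#                    for worker in self.worker_dict.values())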
        self.job_queue.append(job)
        worker_idx = (worker_idx + 1) % self.num_workers
        self.curr_idx = (self.curr_idx + 1) % self.num_workers


if __name__ == "__main__":
    import uuid
    import random

    workers = []
    jobs = []

    for _ in range(3):
        workers.append(
            Worker(str(uuid.uuid4()), str(uuid.uuid4()),
                   random.randint(1, 10)))

    for worker in workers:
        print(
            f"Worker {worker.worker_id}: {worker.curr_available_space} bytes")

    for i in range(10):
        jobs.append({
            "file_path": f"{i}",
            "decompressed_size": random.randint(1, 5)
        })

    batcher = RoundRobinBatcher(workers, jobs)

    print(f"Failed jobs {batcher.failed_jobs}")
    for worker_id, batch in batcher.worker_batches.items():
        print(f"Worker {worker_id}: {batch}")
def __init__(self, abyss_id: str, globus_source_eid: str,
             transfer_token: str, compressed_files: List[Dict],
             worker_params: List[Dict], psql_conn, s3_conn, grouper="",
             batcher="mmd", dispatcher="fifo", prediction_mode="ml"):
    """Abyss orchestrator class.

    Parameters
    ----------
    abyss_id : str
        Abyss ID for orchestration.
    globus_source_eid : str
        Globus endpoint of source data storage.
    transfer_token : str
        Globus token to authorize transfers between endpoints.
    compressed_files : list(dict)
        List of dictionaries for compressed files to process.
        Dictionaries contain "file_path" and "compressed_size".
    worker_params : list(dict)
        List of valid worker parameter dictionaries to create workers.
    psql_conn :
        PostgreSQL connection object to update status.
    s3_conn :
        S3 connection object to push results to.
    grouper : str
        Name of grouper to use when crawling.
    batcher : str
        Name of batcher to use.
    dispatcher : str
        Name of dispatcher to use.
    prediction_mode : str
        Mode of prediction to use to predict decompressed file size.
        "ml" to use machine learning method or "header" to use metadata
        stored in the header of compressed files (where possible).
    """
    self.abyss_id = abyss_id
    self.globus_source_eid = globus_source_eid
    self.transfer_token = transfer_token
    self.grouper = grouper
    self.prediction_mode = prediction_mode

    self.worker_dict = dict()
    for worker_param in worker_params:
        worker = Worker.from_dict(worker_param)
        self.worker_dict[worker.worker_id] = worker

    self.prefetchers = dict()
    for worker in self.worker_dict.values():
        globus_dest_eid = worker.globus_eid
        transfer_dir = worker.transfer_dir
        prefetcher = GlobusPrefetcher(self.transfer_token,
                                      self.globus_source_eid,
                                      globus_dest_eid, transfer_dir, 4)
        self.prefetchers[worker.worker_id] = prefetcher

    self.predictors = dict()
    for file_type, predictor in FILE_PREDICTOR_MAPPING.items():
        file_predictor = predictor()
        file_predictor.load_models()
        self.predictors[file_type] = file_predictor

    self.job_statuses = dict(
        zip([x for x in JobStatus],
            [Queue() for _ in range(len(JobStatus))]))
    unpredicted_set = self.job_statuses[JobStatus.UNPREDICTED]
    for compressed_file in compressed_files:
        job = Job.from_dict(compressed_file)
        job.status = JobStatus.UNPREDICTED
        job.file_id = str(uuid.uuid4())
        job.decompressed_size = 0
        unpredicted_set.put(job)
        logger.info(
            f"LATENCY PLACING {job.file_id} INTO UNPREDICTED AT {time.time()}"
        )

    self.scheduler = Scheduler(batcher, dispatcher,
                               list(self.worker_dict.values()), [])
    self.worker_queues = dict()

    self.psql_conn = psql_conn
    self.abyss_metadata = []
    self.s3_conn = s3_conn

    self._unpredicted_preprocessing_thread = threading.Thread(
        target=self._unpredicted_preprocessing, daemon=True)
    self._predictor_thread = threading.Thread(
        target=self._predict_decompressed_size, daemon=True)
    self._scheduler_thread = threading.Thread(
        target=self._thread_schedule_jobs, daemon=True)
    self._prefetcher_thread = threading.Thread(
        target=self._thread_prefetch, daemon=True)
    self._prefetcher_poll_thread = threading.Thread(
        target=self._thread_poll_prefetch, daemon=True)
    self._funcx_process_headers_thread = threading.Thread(
        target=self._thread_funcx_process_headers, daemon=True)
    self._funcx_decompress_thread = threading.Thread(
        target=self._thread_funcx_decompress, daemon=True)
    self._funcx_crawl_thread = threading.Thread(
        target=self._thread_funcx_crawl, daemon=True)
    self._funcx_poll_thread = threading.Thread(
        target=self._thread_funcx_poll, daemon=True)
    self._consolidate_results_thread = threading.Thread(
        target=self._thread_consolidate_crawl_results, daemon=True)
    self._lock = threading.Lock()

    self.thread_statuses = {
        "predictor_thread": True,
        "scheduler_thread": True,
        "prefetcher_thread": True,
        "prefetcher_poll_thread": True,
        "funcx_decompress_thread": True,
        "funcx_crawl_thread": True,
        "funcx_poll_thread": True,
        "consolidate_results_thread": True
    }

    self.funcx_client = FuncXClient()
    self.kill_status = False
    self.crawl_results = Queue()
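    # Illustrative construction sketch only. The worker parameter keys mirror
    # the Worker.from_dict tests above; psql_conn and s3_conn stand in for
    # whatever connection objects the caller already holds, and the Abyss ID,
    # endpoint ID, and transfer token shown here are made-up placeholders.
    #
    #     orchestrator = AbyssOrchestrator(
    #         abyss_id="1234",
    #         globus_source_eid="source-endpoint",
    #         transfer_token="globus-transfer-token",
    #         compressed_files=[{"file_path": "/a.zip", "compressed_size": 100}],
    #         worker_params=[{
    #             "globus_eid": "0",
    #             "funcx_eid": "1",
    #             "max_available_space": 10,
    #             "transfer_dir": "/transfer",
    #             "decompress_dir": "/dir",
    #         }],
    #         psql_conn=psql_conn,
    #         s3_conn=s3_conn,
    #         batcher="mmd",
    #         dispatcher="fifo",
    #         prediction_mode="ml",
    #     )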
def test_batch(self):
    for batcher_name in BATCHER_NAME_MAPPING.keys():
        for dispatcher_name in DISPATCHER_NAME_MAPPING.keys():
            workers = [
                Worker.from_dict({
                    "globus_eid": "0",
                    "funcx_eid": "1",
                    "max_available_space": 10,
                    "transfer_dir": "/transfer",
                    "decompress_dir": "/dir",
                })
            ]
            jobs = [
                Job.from_dict({
                    "file_path": "/0",
                    "compressed_size": 0,
                    "decompressed_size": 10,
                }),
                Job.from_dict({
                    "file_path": "/1",
                    "compressed_size": 0,
                    "decompressed_size": 20,
                })
            ]

            scheduler = Scheduler(batcher_name, dispatcher_name, workers,
                                  jobs)
            worker_queues = scheduler.worker_queues

            # Making sure only the correct job gets scheduled
            self.assertEqual(len(worker_queues), 1)
            worker_queue_0 = list(
                worker_queues[workers[0].worker_id].queue)
            self.assertEqual(len(worker_queue_0), 1)
            self.assertEqual(worker_queue_0[0].decompressed_size, 10)
            self.assertTrue(
                worker_queue_0[0] not in scheduler._batcher.jobs)

            # Making sure no jobs get batched twice
            scheduler.schedule_jobs([])
            new_worker_queues = scheduler.worker_queues
            self.assertEqual(len(new_worker_queues), 1)
            new_worker_queue_0 = list(
                new_worker_queues[workers[0].worker_id].queue)
            self.assertEqual(len(new_worker_queue_0), 1)
            self.assertEqual(new_worker_queue_0[0].decompressed_size, 10)

            # Making sure jobs are appropriately batched after freeing space
            workers[0].curr_available_space += 50
            scheduler.schedule_jobs([])
            new_worker_queues_1 = scheduler.worker_queues
            self.assertEqual(len(new_worker_queues_1), 1)
            new_worker_queue_1 = list(
                new_worker_queues_1[workers[0].worker_id].queue)
            self.assertEqual(len(new_worker_queue_1), 2)
            self.assertEqual(new_worker_queue_1[0].decompressed_size, 10)
            self.assertEqual(new_worker_queue_1[1].decompressed_size, 20)
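    # Usage sketch for the flow this test exercises, using only names that
    # appear above; treat it as an assumption about call order rather than API
    # documentation: a Scheduler is built from batcher/dispatcher names,
    # workers, and an initial job list, and later calls to schedule_jobs()
    # re-batch whatever could not fit once worker space has been freed.
    #
    #     scheduler = Scheduler("mmd", "fifo", workers, jobs)
    #     scheduler.schedule_jobs([])                # nothing new to add
    #     workers[0].curr_available_space += 50      # free up space
    #     scheduler.schedule_jobs([])                # pending job now batched
    #     queue_0 = scheduler.worker_queues[workers[0].worker_id]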