def run_globus_crawler(job_dict: dict, transfer_token: str, globus_eid: str,
                       grouper_name: str, max_crawl_threads=2):
    import os
    import shutil
    import sys
    sys.path.insert(0, "/")
    from abyss.orchestrator.job import Job, JobStatus
    from abyss.crawlers.globus_crawler.globus_crawler import GlobusCrawler

    job = Job.from_dict(job_dict)

    for job_node in job.bfs_iterator(include_root=True):
        if job_node.status == JobStatus.DECOMPRESSED:
            print(job_node.decompress_path)
            crawler = GlobusCrawler(transfer_token,
                                    globus_eid,
                                    job_node.decompress_path,
                                    grouper_name,
                                    max_crawl_threads=max_crawl_threads)
            metadata = crawler.crawl()
            job_node.metadata = metadata
            job_node.status = JobStatus.CRAWLING

            if os.path.exists(job_node.decompress_path):
                if os.path.isfile(job_node.decompress_path):
                    os.remove(job_node.decompress_path)
                    # logger.error(f"REMOVING FILE {job_node.decompress_path}")
                else:
                    shutil.rmtree(job_node.decompress_path)
                    # logger.error(f"REMOVING DIRECTORY {job_node.decompress_path}")

    return Job.to_dict(job)
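# Functions in this file are shipped to workers through funcX: they are
# registered once and then referenced by UUID (e.g. DECOMPRESSOR_FUNCX_UUID and
# PROCESS_HEADER_FUNCX_UUID in the orchestrator threads below). A minimal
# sketch of that registration, assuming the funcx SDK is installed; the helper
# name is hypothetical and not part of abyss.
def _example_register_functions():
    """Hypothetical helper: register abyss funcX functions, return their UUIDs."""
    from funcx import FuncXClient

    fxc = FuncXClient()
    # register_function returns the UUID later passed as function_id
    return {
        "globus_crawler": fxc.register_function(run_globus_crawler),
        "process_headers": fxc.register_function(process_job_headers),
    }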
def process_job_headers(job_dict: dict) -> dict:
    """Takes a job object and reads the file header and determines the
    decompressed size of the job.

    Parameters
    ----------
    job_dict : dict
        Job dictionary.

    Returns
    -------
    dict
        Job dictionary containing the decompressed size.
    """
    import os
    import sys
    sys.path.insert(0, "/")
    from abyss.orchestrator.job import Job, JobStatus
    from abyss.utils.decompressors import get_zip_decompressed_size, get_tar_decompressed_size

    job = Job.from_dict(job_dict)

    if job.status != JobStatus.UNPREDICTED_PREFETCHED:
        raise ValueError(f"Job {job.file_path} status is not UNPREDICTED_PREFETCHED")
    elif job.file_path.endswith(".zip"):
        decompressed_size = get_zip_decompressed_size(job.transfer_path)
    elif job.file_path.endswith(".tar"):
        decompressed_size = get_tar_decompressed_size(job.transfer_path)
    else:
        raise ValueError(f"Cannot process headers of {job.file_path}")

    job.decompressed_size = decompressed_size
    os.remove(job.transfer_path)

    return Job.to_dict(job)
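# A minimal sketch of what the header-based size helpers used above could look
# like, built on the standard library alone. These are assumptions about
# abyss.utils.decompressors, not its actual implementations: both formats
# record per-member uncompressed sizes in their headers, so the total can be
# computed without extracting anything.
def _example_zip_decompressed_size(file_path: str) -> int:
    """Sum the uncompressed sizes recorded in the zip central directory."""
    import zipfile

    with zipfile.ZipFile(file_path, "r") as zip_f:
        return sum(info.file_size for info in zip_f.infolist())


def _example_tar_decompressed_size(file_path: str) -> int:
    """Sum member sizes from the tar headers without extracting."""
    import tarfile

    with tarfile.open(file_path, "r") as tar_f:
        return sum(member.size for member in tar_f.getmembers())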
def test_from_dict(self):
    with self.assertRaises(ValueError):
        bad_dict_params_1 = {
            "compressed_size": 10
        }
        Job.from_dict(bad_dict_params_1)

    good_dict_params = {
        "file_path": "/",
        "compressed_size": 0,
        "decompressed_size": 0,
        "worker_id": "1",
        "transfer_path": "/transfer",
        "decompress_path": "/decompress",
        "funcx_decompress_id": "2",
        "funcx_crawl_id": "3",
        "status": JobStatus.UNPREDICTED
    }
    job = Job.from_dict(good_dict_params)

    self.assertEqual(job.file_path, good_dict_params["file_path"])
    self.assertEqual(job.compressed_size, good_dict_params["compressed_size"])
    self.assertEqual(job.decompressed_size, good_dict_params["decompressed_size"])
    self.assertEqual(job.worker_id, good_dict_params["worker_id"])
    self.assertEqual(job.transfer_path, good_dict_params["transfer_path"])
    self.assertEqual(job.decompress_path, good_dict_params["decompress_path"])
    self.assertEqual(job.funcx_decompress_id, good_dict_params["funcx_decompress_id"])
    self.assertEqual(job.funcx_crawl_id, good_dict_params["funcx_crawl_id"])
    self.assertEqual(job.status, good_dict_params["status"])
def test_dispatch(self):
    workers = [
        Worker.from_dict({
            "globus_eid": "0",
            "funcx_eid": "1",
            "max_available_space": 97,
            "transfer_dir": "/transfer",
            "decompress_dir": "/dir",
        }),
        Worker.from_dict({
            "globus_eid": "0",
            "funcx_eid": "1",
            "max_available_space": 57,
            "transfer_dir": "/transfer",
            "decompress_dir": "/dir",
        })
    ]
    worker_batches = {
        workers[0].worker_id: [
            Job.from_dict({
                "file_path": f"/{i}",
                "compressed_size": 0,
                "decompressed_size": i,
            }) for i in range(10, 20)
        ],
        workers[1].worker_id: [
            Job.from_dict({
                "file_path": f"/{i}",
                "compressed_size": 0,
                "decompressed_size": i,
            }) for i in range(0, 10)
        ]
    }
    preserved_batches = {
        workers[0].worker_id: worker_batches[workers[0].worker_id],
        workers[1].worker_id: worker_batches[workers[1].worker_id]
    }

    dispatcher = MaxFirstDispatcher(workers)
    dispatcher.dispatch_batch(worker_batches)
    worker_queues = dispatcher.worker_queues

    for worker_id, worker_batch in preserved_batches.items():
        preserved_batches[worker_id] = sorted(
            worker_batch, reverse=True,
            key=(lambda x: x.decompressed_size))

    for worker_id, worker_queue in worker_queues.items():
        self.assertEqual(list(worker_queue.queue),
                         preserved_batches[worker_id])

    self.assertEqual(worker_batches, {
        workers[0].worker_id: [],
        workers[1].worker_id: []
    })
    self.assertEqual(dispatcher.worker_batches, {
        workers[0].worker_id: [],
        workers[1].worker_id: []
    })
def test_update_worker(self):
    workers = [
        Worker.from_dict({
            "globus_eid": "0",
            "funcx_eid": "1",
            "max_available_space": 97,
            "transfer_dir": "/transfer",
            "decompress_dir": "/dir",
        }),
        Worker.from_dict({
            "globus_eid": "0",
            "funcx_eid": "1",
            "max_available_space": 57,
            "transfer_dir": "/transfer",
            "decompress_dir": "/dir",
        })
    ]
    jobs = [
        Job.from_dict({
            "file_path": "/",
            "compressed_size": 0,
            "decompressed_size": 10
        }),
        Job.from_dict({
            "file_path": "/",
            "compressed_size": 0,
            "decompressed_size": 5
        })
    ]
    worker_0 = workers[0]

    batcher = DummyBatcher(workers, jobs)
    worker_0.curr_available_space += 100
    batcher.update_worker(worker_0)
    self.assertEqual(
        worker_0.curr_available_space,
        batcher.worker_dict[worker_0.worker_id].curr_available_space)

    with self.assertRaises(ValueError):
        batcher.update_worker(
            Worker.from_dict({
                "globus_eid": "0",
                "funcx_eid": "1",
                "max_available_space": 57,
                "transfer_dir": "/transfer",
                "decompress_dir": "/dir",
            }))
def test_multiple_batch(self):
    workers = [
        Worker.from_dict({
            "globus_eid": "0",
            "funcx_eid": "1",
            "max_available_space": 97,
            "transfer_dir": "/transfer",
            "decompress_dir": "/dir",
        }),
        Worker.from_dict({
            "globus_eid": "0",
            "funcx_eid": "1",
            "max_available_space": 57,
            "transfer_dir": "/transfer",
            "decompress_dir": "/dir",
        })
    ]
    jobs = [
        Job.from_dict({
            "file_path": f"/{i}",
            "compressed_size": 0,
            "decompressed_size": i,
        }) for i in range(10, 20)
    ]

    batcher = MMDBatcher(workers, jobs)
    batches = batcher.worker_batches

    queued_jobs = []
    for _ in range(batcher.job_queue.qsize()):
        job = batcher.job_queue.get()
        queued_jobs.append(job)
        batcher.job_queue.put(job)

    self.assertEqual(set([job.decompressed_size for job in queued_jobs]), {19})

    batcher.batch_job(
        Job.from_dict({
            "file_path": "/100",
            "compressed_size": 0,
            "decompressed_size": 100,
        }))
    batches_1 = batcher.worker_batches

    self.assertEqual(batches, batches_1)
def run_local_crawler(job_dict: dict, grouper_name: str, max_crawl_threads=1):
    import logging
    import os
    import sys
    import shutil
    sys.path.insert(0, "/")
    from abyss.orchestrator.job import Job, JobStatus
    from abyss.crawlers.local_crawler.local_crawler import LocalCrawler

    logger = logging.getLogger(__name__)
    f_handler = logging.FileHandler('/project2/chard/skluzacek/ryan-data/abyss/file.log')
    f_handler.setLevel(logging.ERROR)
    f_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    f_handler.setFormatter(f_format)
    logger.addHandler(f_handler)

    job = Job.from_dict(job_dict)

    for job_node in job.bfs_iterator(include_root=True):
        if job_node.status == JobStatus.DECOMPRESSED:
            logger.error(f"CRAWLING {job_node.decompress_path}")
            crawler = LocalCrawler(job_node.decompress_path,
                                   grouper_name,
                                   max_crawl_threads=max_crawl_threads)
            metadata = crawler.crawl()
            job_node.metadata = metadata
            job_node.status = JobStatus.CRAWLING

            if os.path.exists(job_node.decompress_path):
                if os.path.isfile(job_node.decompress_path):
                    os.remove(job_node.decompress_path)
                    logger.error(f"REMOVING FILE {job_node.decompress_path}")
                else:
                    shutil.rmtree(job_node.decompress_path)
                    logger.error(f"REMOVING DIRECTORY {job_node.decompress_path}")

    return Job.to_dict(job)
def test_is_failed_job(self):
    workers = []
    for i in range(10):
        workers.append(Worker(None, None, None, None, i))

    jobs = [
        Job.from_dict({
            "file_path": "/",
            "compressed_size": 0,
            "decompressed_size": 10
        }),
        Job.from_dict({
            "file_path": "/",
            "compressed_size": 0,
            "decompressed_size": 5
        })
    ]

    batcher = DummyBatcher(workers, jobs)

    self.assertTrue(batcher._is_failed_job(jobs[0]))
    self.assertFalse(batcher._is_failed_job(jobs[1]))
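# For context: in this test a job is "failed" when no worker could ever hold
# it, i.e. its decompressed size exceeds every worker's maximum space (the
# largest maximum above is 9, so size 10 fails and size 5 does not). A minimal
# sketch of such a check, assuming the batcher's worker_dict used elsewhere in
# these tests; the real Batcher._is_failed_job may differ.
def _example_is_failed_job(self, job) -> bool:
    """Return True if the job cannot fit on any worker, even an empty one."""
    max_space = max(worker.max_available_space
                    for worker in self.worker_dict.values())
    return job.decompressed_size > max_space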
def _thread_funcx_decompress(self) -> None:
    """Thread function to submit decompression tasks to funcX.

    Returns
    -------
    None
    """
    while not self.kill_status:
        prefetched_queue = self.job_statuses[JobStatus.PREFETCHED]
        decompressing_queue = self.job_statuses[JobStatus.DECOMPRESSING]

        batch = self.funcx_client.create_batch()
        batched_jobs = []
        while not prefetched_queue.empty():
            self.thread_statuses["funcx_decompress_thread"] = True
            job = prefetched_queue.get()
            job_dict = Job.to_dict(job)

            worker_id = job.worker_id
            worker = self.worker_dict[worker_id]

            batch.add(job_dict, worker.decompress_dir,
                      endpoint_id=worker.funcx_eid,
                      function_id=DECOMPRESSOR_FUNCX_UUID)
            batched_jobs.append(job)

        if len(batch.tasks) > 0:
            batch_res = self.funcx_client.batch_run(batch)
        else:
            batch_res = None

        for idx, job in enumerate(batched_jobs):
            logger.info(f"{job.file_path} DECOMPRESSING")
            for job_node in job.bfs_iterator(include_root=True):
                job_node.funcx_decompress_id = batch_res[idx]
                if job_node.status == JobStatus.PREFETCHED:
                    job_node.status = JobStatus.DECOMPRESSING

            decompressing_queue.put(job)
            logger.info(f"LATENCY PLACING {job.file_id} INTO DECOMPRESSING AT {time.time()}")

        time.sleep(5)
        self.thread_statuses["funcx_decompress_thread"] = False
def test_batch(self):
    workers = [
        Worker.from_dict({
            "globus_eid": "0",
            "funcx_eid": "1",
            "max_available_space": 97,
            "transfer_dir": "/transfer",
            "decompress_dir": "/dir",
        }),
        Worker.from_dict({
            "globus_eid": "0",
            "funcx_eid": "1",
            "max_available_space": 57,
            "transfer_dir": "/transfer",
            "decompress_dir": "/dir",
        })
    ]
    jobs = [
        Job.from_dict({
            "file_path": f"/{i}",
            "compressed_size": 0,
            "decompressed_size": i,
        }) for i in range(10, 20)
    ]

    batcher = MMDBatcher(workers, jobs)
    batches = batcher.worker_batches

    batch_0 = batches[workers[0].worker_id]
    self.assertEqual(set([job.decompressed_size for job in batch_0]),
                     {10, 12, 13, 15, 16, 18})

    batch_1 = batches[workers[1].worker_id]
    self.assertEqual(set([job.decompressed_size for job in batch_1]),
                     {11, 14, 17})

    queued_jobs = []
    while not batcher.job_queue.empty():
        queued_jobs.append(batcher.job_queue.get())

    self.assertEqual(set([job.decompressed_size for job in queued_jobs]), {19})

    for _, worker_batch in batches.items():
        for job in worker_batch:
            self.assertTrue(job not in batcher.jobs)
def _thread_funcx_process_headers(self) -> None:
    """Thread function to submit header processing tasks to funcX.

    Returns
    -------
    None
    """
    while not self.kill_status:
        unpredicted_prefetched_queue = self.job_statuses[JobStatus.UNPREDICTED_PREFETCHED]
        processing_headers_queue = self.job_statuses[JobStatus.PROCESSING_HEADERS]

        batch = self.funcx_client.create_batch()
        batched_jobs = []
        while not unpredicted_prefetched_queue.empty():
            self.thread_statuses["funcx_processing_headers_thread"] = True
            job = unpredicted_prefetched_queue.get()
            logger.info(f"{job.file_path} PROCESSING HEADERS")
            job_dict = Job.to_dict(job)

            worker_id = job.worker_id
            worker = self.worker_dict[worker_id]

            batch.add(job_dict,
                      endpoint_id=worker.funcx_eid,
                      function_id=PROCESS_HEADER_FUNCX_UUID)
            batched_jobs.append(job)

        if len(batch.tasks) > 0:
            batch_res = self.funcx_client.batch_run(batch)
        else:
            batch_res = None

        for idx, job in enumerate(batched_jobs):
            job.funcx_process_headers_id = batch_res[idx]
            job.status = JobStatus.PROCESSING_HEADERS
            processing_headers_queue.put(job)
            logger.info(f"{job.file_path} PROCESSING HEADERS QUEUE")

        time.sleep(5)
        self.thread_statuses["funcx_processing_headers_thread"] = False
def test_dispatch(self):
    workers = [
        Worker.from_dict({
            "globus_eid": "0",
            "funcx_eid": "1",
            "max_available_space": 97,
            "transfer_dir": "/transfer",
            "decompress_dir": "/dir",
        })
    ]
    worker_batches = {
        workers[0].worker_id: [
            Job.from_dict({
                "file_path": f"/{i}",
                "compressed_size": 0,
                "decompressed_size": i,
            }) for i in range(1, 11)
        ]
    }
    worker_batch_0 = worker_batches[workers[0].worker_id]
    preserved_batches = {
        workers[0].worker_id: [
            worker_batch_0[9], worker_batch_0[0], worker_batch_0[1],
            worker_batch_0[2], worker_batch_0[3], worker_batch_0[8],
            worker_batch_0[4], worker_batch_0[5], worker_batch_0[7],
            worker_batch_0[6]
        ]
    }

    dispatcher = MaxMinDispatcher(workers)
    dispatcher.dispatch_batch(worker_batches)
    worker_queues = dispatcher.worker_queues

    for worker_id, worker_queue in worker_queues.items():
        self.assertEqual(list(worker_queue.queue),
                         preserved_batches[worker_id])

    self.assertEqual(worker_batches, {workers[0].worker_id: []})
    self.assertEqual(dispatcher.worker_batches, {workers[0].worker_id: []})
def test_validate_dict_params(self):
    with self.assertRaises(ValueError):
        bad_dict_params = {
            "file_path": 10,
            "compressed_size": 10
        }
        Job.validate_dict_params(bad_dict_params)

    with self.assertRaises(ValueError):
        bad_dict_params_1 = {
            "compressed_size": 10
        }
        Job.validate_dict_params(bad_dict_params_1)

    good_dict_params = {
        "file_path": "/",
        "compressed_size": 10
    }
    Job.validate_dict_params(good_dict_params)
if __name__ == "__main__":
    prefetcher = GlobusPrefetcher(
        "AgEqE5QBmdy5NBEyqM1Gx2N4mN299MWN0Y2pPjOvNxqGjMEBpyiwCegxa3MnylpyjDYoQ1bXKjmVYyTygwbYkcp5gz",
        "4f99675c-ac1f-11ea-bee8-0e716405a293",
        "af7bda53-6d04-11e5-ba46-22000b92c6ec",
        "/project2/chard/skluzacek/ryan-data/transfer_dir",
        max_concurrent_transfers=4,
        max_files_per_batch=10,
        max_batch_size=1 * 10**9)

    import pandas as pd

    deep_blue_crawl_df = pd.read_csv(
        "/Users/ryan/Documents/CS/abyss/data/deep_blue_crawl.csv")
    sorted_files = deep_blue_crawl_df.sort_values(by=["size_bytes"])
    filtered_files = sorted_files.iloc[0:10]

    compressed_files = [{
        "file_path": x[0],
        "compressed_size": x[1]
    } for _, x in filtered_files.iterrows()]

    for compressed_file in compressed_files:
        job = Job.from_dict(compressed_file)
        job.file_id = str(uuid.uuid4())
        prefetcher.transfer_job(job)
def __init__(self, abyss_id: str, globus_source_eid: str, transfer_token: str,
             compressed_files: List[Dict], worker_params: List[Dict],
             psql_conn, s3_conn, grouper="", batcher="mmd",
             dispatcher="fifo", prediction_mode="ml"):
    """Abyss orchestrator class.

    Parameters
    ----------
    abyss_id : str
        Abyss ID for orchestration.
    globus_source_eid : str
        Globus endpoint of source data storage.
    transfer_token : str
        Globus token to authorize transfers between endpoints.
    compressed_files : list(dict)
        List of dictionaries for compressed files to process.
        Dictionaries contain "file_path" and "compressed_size".
    worker_params : list(dict)
        List of valid worker parameter dictionaries to create
        workers.
    psql_conn
        PostgreSQL connection object to update status.
    s3_conn
        S3 connection object to push results to S3.
    grouper : str
        Name of grouper to use when crawling.
    batcher : str
        Name of batcher to use.
    dispatcher : str
        Name of dispatcher to use.
    prediction_mode : str
        Mode of prediction to use to predict decompressed file size.
        "ml" to use machine learning method or "header" to use
        metadata stored in the header of compressed files (where
        possible).
    """
    self.abyss_id = abyss_id
    self.globus_source_eid = globus_source_eid
    self.transfer_token = transfer_token
    self.grouper = grouper
    self.prediction_mode = prediction_mode

    self.worker_dict = dict()
    for worker_param in worker_params:
        worker = Worker.from_dict(worker_param)
        self.worker_dict[worker.worker_id] = worker

    self.prefetchers = dict()
    for worker in self.worker_dict.values():
        globus_dest_eid = worker.globus_eid
        transfer_dir = worker.transfer_dir
        prefetcher = GlobusPrefetcher(self.transfer_token,
                                      self.globus_source_eid,
                                      globus_dest_eid,
                                      transfer_dir,
                                      4)
        self.prefetchers[worker.worker_id] = prefetcher

    self.predictors = dict()
    for file_type, predictor in FILE_PREDICTOR_MAPPING.items():
        file_predictor = predictor()
        file_predictor.load_models()
        self.predictors[file_type] = file_predictor

    self.job_statuses = dict(zip([x for x in JobStatus],
                                 [Queue() for _ in range(len(JobStatus))]))
    unpredicted_set = self.job_statuses[JobStatus.UNPREDICTED]
    for compressed_file in compressed_files:
        job = Job.from_dict(compressed_file)
        job.status = JobStatus.UNPREDICTED
        job.file_id = str(uuid.uuid4())
        job.decompressed_size = 0
        unpredicted_set.put(job)
        logger.info(f"LATENCY PLACING {job.file_id} INTO UNPREDICTED AT {time.time()}")

    self.scheduler = Scheduler(batcher, dispatcher,
                               list(self.worker_dict.values()), [])
    self.worker_queues = dict()

    self.psql_conn = psql_conn
    self.abyss_metadata = []
    self.s3_conn = s3_conn

    self._unpredicted_preprocessing_thread = threading.Thread(
        target=self._unpredicted_preprocessing, daemon=True)
    self._predictor_thread = threading.Thread(
        target=self._predict_decompressed_size, daemon=True)
    self._scheduler_thread = threading.Thread(
        target=self._thread_schedule_jobs, daemon=True)
    self._prefetcher_thread = threading.Thread(
        target=self._thread_prefetch, daemon=True)
    self._prefetcher_poll_thread = threading.Thread(
        target=self._thread_poll_prefetch, daemon=True)
    self._funcx_process_headers_thread = threading.Thread(
        target=self._thread_funcx_process_headers, daemon=True)
    self._funcx_decompress_thread = threading.Thread(
        target=self._thread_funcx_decompress, daemon=True)
    self._funcx_crawl_thread = threading.Thread(
        target=self._thread_funcx_crawl, daemon=True)
    self._funcx_poll_thread = threading.Thread(
        target=self._thread_funcx_poll, daemon=True)
    self._consolidate_results_thread = threading.Thread(
        target=self._thread_consolidate_crawl_results, daemon=True)
    self._lock = threading.Lock()
    self.thread_statuses = {
        "predictor_thread": True,
        "scheduler_thread": True,
        "prefetcher_thread": True,
        "prefetcher_poll_thread": True,
        "funcx_decompress_thread": True,
        "funcx_crawl_thread": True,
        "funcx_poll_thread": True,
        "consolidate_results_thread": True
    }

    self.funcx_client = FuncXClient()
    self.kill_status = False
    self.crawl_results = Queue()
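# An example of the two input lists the constructor above expects, with shapes
# taken from the tests in this section; the endpoint IDs, sizes, and paths are
# placeholders, not real values.
EXAMPLE_WORKER_PARAMS = [{
    "globus_eid": "<globus-endpoint-uuid>",
    "funcx_eid": "<funcx-endpoint-uuid>",
    "max_available_space": 10 * 10**9,
    "transfer_dir": "/transfer",
    "decompress_dir": "/decompress",
}]
EXAMPLE_COMPRESSED_FILES = [{
    "file_path": "/data/archive.tar",
    "compressed_size": 10**6,
}]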
def _thread_consolidate_crawl_results(self) -> None:
    """Thread function to consolidate crawl results and push to SQS.

    Returns
    -------
    None
    """
    while not self.kill_status:
        unpredicted_queue = self.job_statuses[JobStatus.UNPREDICTED]
        consolidating_queue = self.job_statuses[JobStatus.CONSOLIDATING]
        succeeded_queue = self.job_statuses[JobStatus.SUCCEEDED]
        failed_queue = self.job_statuses[JobStatus.FAILED]

        while not consolidating_queue.empty():
            self.thread_statuses["consolidate_results_thread"] = True
            job = consolidating_queue.get()
            logger.info(f"{job.file_path} CONSOLIDATING")

            resubmit_task = False
            for job_node in job.bfs_iterator(include_root=True):
                root_path = job_node.metadata["root_path"]
                for file_path, file_metadata in job_node.metadata["metadata"].items():
                    file_size = file_metadata["physical"]["size"]
                    is_compressed = file_metadata["physical"]["is_compressed"]
                    child_file_path = os.path.join(root_path, file_path)

                    if is_compressed:
                        if "decompressed_size" in file_metadata["physical"]:
                            decompressed_size = file_metadata["physical"]["decompressed_size"]
                        else:
                            decompressed_size = None

                        if child_file_path in job_node.child_jobs:
                            break
                        else:
                            child_job = Job(
                                file_path=child_file_path,
                                file_id=f"{str(uuid.uuid4())}",
                                compressed_size=file_size)

                            if decompressed_size:
                                child_job.decompressed_size = decompressed_size
                                child_job.status = JobStatus.PREDICTED
                            else:
                                child_job.status = JobStatus.UNPREDICTED

                            job_node.child_jobs[child_file_path] = child_job
                            resubmit_task = True

            if resubmit_task:
                logger.info(f"{job.file_path} RESUBMITTING")
                unpredicted_queue.put(job)
                logger.info(f"LATENCY PLACING {job.file_id} INTO UNPREDICTED AT {time.time()}")
                continue

            consolidated_metadata = job.consolidate_metadata()
            self.abyss_metadata.append(consolidated_metadata)

            for job_node in job.bfs_iterator(include_root=True):
                if job_node.status == JobStatus.CONSOLIDATING:
                    job_node.status = JobStatus.SUCCEEDED

            succeeded_queue.put(job)
            logger.info(f"{job.file_path} PLACED INTO SUCCEEDED")
            logger.info(f"LATENCY PLACING {job.file_id} INTO SUCCEEDED AT {time.time()}")

        while not failed_queue.empty():
            job = failed_queue.get()
            logger.info(f"{job.file_path} CONSOLIDATING FROM FAILED")
            consolidated_metadata = job.consolidate_metadata()
            self.abyss_metadata.append(consolidated_metadata)
            succeeded_queue.put(job)
            logger.info(f"LATENCY PLACING {job.file_id} INTO SUCCEEDED AT {time.time()}")

        self.thread_statuses["consolidate_results_thread"] = False
def _thread_funcx_poll(self) -> None:
    """Thread function to poll funcX for results.

    Returns
    -------
    None
    """
    unpredicted_queue = self.job_statuses[JobStatus.UNPREDICTED]
    decompressing_queue = self.job_statuses[JobStatus.DECOMPRESSING]
    decompressed_queue = self.job_statuses[JobStatus.DECOMPRESSED]
    crawling_queue = self.job_statuses[JobStatus.CRAWLING]
    processing_headers_queue = self.job_statuses[JobStatus.PROCESSING_HEADERS]
    predicted_queue = self.job_statuses[JobStatus.PREDICTED]
    consolidating_queue = self.job_statuses[JobStatus.CONSOLIDATING]
    failed_queue = self.job_statuses[JobStatus.FAILED]

    while not self.kill_status:
        processing_headers_funcx_ids = []
        processing_header_jobs = []
        while not processing_headers_queue.empty():
            self.thread_statuses["funcx_poll_thread"] = True
            job = processing_headers_queue.get()
            logger.info(f"{job.file_path} POLLING HEADER PROCESSING")
            processing_headers_funcx_ids.append(job.funcx_process_headers_id)
            processing_header_jobs.append(job)

        processing_headers_statuses = self.funcx_client.get_batch_status(
            task_id_list=processing_headers_funcx_ids)

        for job in processing_header_jobs:
            worker = self.worker_dict[job.worker_id]
            job_status = processing_headers_statuses[job.funcx_process_headers_id]

            if job_status["pending"]:
                processing_headers_queue.put(job)
            elif job_status["status"] == "success":
                logger.info(f"{job.file_path} COMPLETED HEADER PROCESSING")
                job = Job.from_dict(job_status["result"])
                job.status = JobStatus.PREDICTED
                worker.curr_available_space += job.compressed_size
                predicted_queue.put(job)
            elif job_status["status"] == "failed":
                worker.curr_available_space += job.compressed_size
                unpredicted_predict_queue = self.job_statuses[JobStatus.UNPREDICTED_PREDICT]
                job.status = JobStatus.UNPREDICTED_PREDICT
                unpredicted_predict_queue.put(job)

        time.sleep(5)

        decompressing_funcx_ids = []
        decompressing_jobs = []
        while not decompressing_queue.empty():
            self.thread_statuses["funcx_poll_thread"] = True
            job = decompressing_queue.get()
            logger.info(f"{job.file_path} POLLING DECOMPRESS")
            decompressing_funcx_ids.append(job.funcx_decompress_id)
            decompressing_jobs.append(job)

        decompressing_statuses = self.funcx_client.get_batch_status(
            decompressing_funcx_ids)

        for job in decompressing_jobs:
            worker = self.worker_dict[job.worker_id]
            job_status = decompressing_statuses[job.funcx_decompress_id]
            logger.info(job_status)

            if job_status["pending"]:
                decompressing_queue.put(job)
            elif job_status["status"] == "success":
                job = Job.from_dict(job_status["result"])
                logger.info(f"{job.file_path} COMPLETED DECOMPRESS")

                if job.status == JobStatus.FAILED:
                    worker.curr_available_space += job.total_size
                    failed_queue.put(job)
                    logger.info(f"{job.file_path} PLACED INTO FAILED")
                    logger.info(f"LATENCY PLACING {job.file_id} INTO FAILED AT {time.time()}")
                    continue

                has_unpredicted = False
                for job_node in job.bfs_iterator(include_root=True):
                    if job_node.status == JobStatus.DECOMPRESSING:
                        job_node.status = JobStatus.DECOMPRESSED
                    elif job_node.status == JobStatus.UNPREDICTED:
                        has_unpredicted = True

                if has_unpredicted:
                    unpredicted_queue.put(job)
                    logger.info(f"LATENCY PLACING {job.file_id} INTO UNPREDICTED AT {time.time()}")
                    logger.info(f"{job.file_path} PLACED INTO UNPREDICTED")

                worker.curr_available_space += job.compressed_size
                decompressed_queue.put(job)
                logger.info(f"LATENCY PLACING {job.file_id} INTO DECOMPRESSED AT {time.time()}")
                logger.info(f"{job.file_path} PLACED INTO DECOMPRESSED")
            elif job_status["status"] == "failed":
                worker.curr_available_space += job.compressed_size
                logger.info(f"ERROR for {job.file_path}: {job_status['exception']}")
                logger.info(f"{job.file_path} PLACED INTO FAILED")
                failed_queue.put(job)
                logger.info(f"LATENCY PLACING {job.file_id} INTO FAILED AT {time.time()}")

        time.sleep(5)

        crawling_funcx_ids = []
        crawling_jobs = []
        while not crawling_queue.empty():
            self.thread_statuses["funcx_poll_thread"] = True
            job = crawling_queue.get()
            logger.info(f"{job.file_path} POLLING CRAWL")
            crawling_funcx_ids.append(job.funcx_crawl_id)
            crawling_jobs.append(job)

        crawling_statuses = self.funcx_client.get_batch_status(crawling_funcx_ids)

        for job in crawling_jobs:
            worker = self.worker_dict[job.worker_id]
            job_status = crawling_statuses[job.funcx_crawl_id]

            if job_status["pending"]:
                crawling_queue.put(job)
            elif job_status["status"] == "success":
                result = job_status["result"]
                job = Job.from_dict(result)
                logger.info(f"{job.file_path} COMPLETED CRAWL")

                for job_node in job.bfs_iterator(include_root=True):
                    if job_node.status == JobStatus.CRAWLING:
                        job_node.status = JobStatus.CONSOLIDATING

                worker.curr_available_space += (job.total_size - job.compressed_size)
                consolidating_queue.put(job)
                logger.info(f"LATENCY PLACING {job.file_id} INTO CONSOLIDATING AT {time.time()}")
                logger.info(f"{job.file_path} PLACED INTO CONSOLIDATING")
            elif job_status["status"] == "failed":
                worker.curr_available_space += (job.total_size - job.compressed_size)
                failed_queue.put(job)
                logger.info(f"{job.file_path} PLACED INTO FAILED")
                logger.info(f"LATENCY PLACING {job.file_id} INTO FAILED AT {time.time()}")

        time.sleep(5)
        self.thread_statuses["funcx_poll_thread"] = False
def run_decompressor(job_dict: dict, decompress_dir: str):
    """Iterates through a Job and recursively decompresses files.

    Parameters
    ----------
    job_dict : dict
        Job dictionary to iterate through.
    decompress_dir : str
        Location on worker to decompress files to.

    Returns
    -------
    dict
        Job dictionary.
    """
    import os
    import sys
    import logging
    from shutil import rmtree
    sys.path.insert(0, "/")
    from abyss.orchestrator.job import Job, JobStatus
    from abyss.utils.decompressors import decompress
    from abyss.utils.error_utils import is_critical_oom_error, is_critical_decompression_error
    from abyss.utils.funcx_functions import get_directory_size

    job = Job.from_dict(job_dict)

    logger = logging.getLogger(__name__)
    f_handler = logging.FileHandler('/project2/chard/skluzacek/ryan-data/abyss/file.log')
    f_handler.setLevel(logging.ERROR)
    f_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    f_handler.setFormatter(f_format)
    logger.addHandler(f_handler)

    job_nodes = job.to_queue(include_root=True)
    while len(job_nodes):
        job_node = job_nodes.popleft()
        file_path = job_node.transfer_path
        decompress_type = os.path.splitext(job_node.file_path)[1][1:]
        logger.error(f"DECOMPRESSING {file_path}")

        if job_node.status == JobStatus.FAILED:
            continue

        try:
            if decompress_type == "zip":
                full_extract_dir = os.path.join(decompress_dir, job_node.file_id)
                decompress(file_path, decompress_type, full_extract_dir)
            elif decompress_type == "tar":
                full_extract_dir = os.path.join(decompress_dir, job_node.file_id)
                decompress(file_path, decompress_type, full_extract_dir)
            elif decompress_type == "gz":
                extract_dir = os.path.join(
                    os.path.join(decompress_dir, job_node.file_id),
                    os.path.basename(job_node.file_path[:-3]))
                full_extract_dir = os.path.dirname(extract_dir)
                if not os.path.exists(os.path.dirname(extract_dir)):
                    os.makedirs(os.path.dirname(extract_dir))
                decompress(file_path, decompress_type, extract_dir)

            job_node.decompress_path = full_extract_dir
            logger.error(f"DECOMPRESSED {file_path} TO {full_extract_dir}")

            # TODO: Fix this gross if statement. We might want to decompress
            # gz files into a directory.
            for child_job in job_node.child_jobs.values():
                if os.path.basename(full_extract_dir) == child_job.file_path:
                    child_job.transfer_path = full_extract_dir
                else:
                    child_job.transfer_path = os.path.join(decompress_dir,
                                                           child_job.file_path)

            if job_node.status == JobStatus.PREFETCHED:
                job_node.status = JobStatus.DECOMPRESSING

            logger.error(f"REMOVING {job_node.transfer_path}")
            os.remove(job_node.transfer_path)
        except Exception as e:
            logger.error(f"ERROR TYPE {e}")
            logger.error("CAUGHT ERROR", exc_info=True)
            if is_critical_decompression_error(e):
                logger.error("HANDLED DECOMPRESSION ERROR")
                if job_node.status == JobStatus.PREFETCHED:
                    job_node.status = JobStatus.FAILED
                job_node.error = str(e)
                os.remove(job_node.transfer_path)

                if os.path.exists(full_extract_dir):
                    rmtree(full_extract_dir)
            elif is_critical_oom_error(e):
                logger.error("PROCESSING OOM ERROR")
                decompressed_size = get_directory_size(full_extract_dir)

                if decompressed_size > job_node.decompressed_size:
                    logger.error("FILE TOO LARGE")
                    os.remove(job_node.transfer_path)
                    rmtree(full_extract_dir)

                    # Remove descendants from the work queue. Note: iterate over
                    # the Job values, not the dict keys, so deque.remove matches.
                    for child_job in job_node.child_jobs.values():
                        job_nodes.remove(child_job)

                    job_node.status = JobStatus.UNPREDICTED
                    job_node.error = str(e)
                else:
                    logger.error("ATTEMPTING TO REPROCESS")
                    rmtree(full_extract_dir)
                    job_nodes.appendleft(job_node)
            else:
                if job_node.status == JobStatus.PREFETCHED:
                    job_node.status = JobStatus.FAILED
                job_node.error = str(e)
                os.remove(job_node.transfer_path)

                if os.path.exists(full_extract_dir):
                    rmtree(full_extract_dir)

    return Job.to_dict(job)
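# A minimal sketch of the decompress(file_path, decompress_type, extract_dir)
# helper called above, built on the standard library. This is an assumption
# about abyss.utils.decompressors, not its actual implementation.
def _example_decompress(file_path: str, decompress_type: str, extract_dir: str) -> None:
    """Extract a .zip/.tar archive into extract_dir, or gunzip to a file path."""
    import gzip
    import shutil
    import tarfile
    import zipfile

    if decompress_type == "zip":
        with zipfile.ZipFile(file_path, "r") as zip_f:
            zip_f.extractall(extract_dir)
    elif decompress_type == "tar":
        with tarfile.open(file_path, "r") as tar_f:
            tar_f.extractall(extract_dir)
    elif decompress_type == "gz":
        # For .gz files, extract_dir is the output *file* path (see the caller
        # above, which passes a path ending in the stripped file name).
        with gzip.open(file_path, "rb") as gz_f, open(extract_dir, "wb") as out_f:
            shutil.copyfileobj(gz_f, out_f)
    else:
        raise ValueError(f"Unknown decompression type {decompress_type}")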
def test_batch(self):
    for batcher_name in BATCHER_NAME_MAPPING.keys():
        for dispatcher_name in DISPATCHER_NAME_MAPPING.keys():
            workers = [
                Worker.from_dict({
                    "globus_eid": "0",
                    "funcx_eid": "1",
                    "max_available_space": 10,
                    "transfer_dir": "/transfer",
                    "decompress_dir": "/dir",
                })
            ]
            jobs = [
                Job.from_dict({
                    "file_path": "/0",
                    "compressed_size": 0,
                    "decompressed_size": 10,
                }),
                Job.from_dict({
                    "file_path": "/1",
                    "compressed_size": 0,
                    "decompressed_size": 20,
                })
            ]

            scheduler = Scheduler(batcher_name, dispatcher_name,
                                  workers, jobs)
            worker_queues = scheduler.worker_queues

            # Making sure only the correct job gets scheduled
            self.assertEqual(len(worker_queues), 1)
            worker_queue_0 = list(worker_queues[workers[0].worker_id].queue)
            self.assertEqual(len(worker_queue_0), 1)
            self.assertEqual(worker_queue_0[0].decompressed_size, 10)
            self.assertTrue(worker_queue_0[0] not in scheduler._batcher.jobs)

            # Making sure no jobs get batched twice
            scheduler.schedule_jobs([])
            new_worker_queues = scheduler.worker_queues
            self.assertEqual(len(new_worker_queues), 1)
            new_worker_queue_0 = list(new_worker_queues[workers[0].worker_id].queue)
            self.assertEqual(len(new_worker_queue_0), 1)
            self.assertEqual(new_worker_queue_0[0].decompressed_size, 10)

            # Making sure jobs are appropriately batched after freeing space
            workers[0].curr_available_space += 50
            scheduler.schedule_jobs([])
            new_worker_queues_1 = scheduler.worker_queues
            self.assertEqual(len(new_worker_queues_1), 1)
            new_worker_queue_1 = list(new_worker_queues_1[workers[0].worker_id].queue)
            self.assertEqual(len(new_worker_queue_1), 2)
            self.assertEqual(new_worker_queue_1[0].decompressed_size, 10)
            self.assertEqual(new_worker_queue_1[1].decompressed_size, 20)