Example No. 1
def run_globus_crawler(job_dict: dict, transfer_token: str, globus_eid: str,
                grouper_name: str, max_crawl_threads=2):
    import os
    import shutil
    import sys
    sys.path.insert(0, "/")
    from abyss.orchestrator.job import Job, JobStatus
    from abyss.crawlers.globus_crawler.globus_crawler import GlobusCrawler

    job = Job.from_dict(job_dict)

    for job_node in job.bfs_iterator(include_root=True):
        if job_node.status == JobStatus.DECOMPRESSED:
            print(job_node.decompress_path)

            crawler = GlobusCrawler(transfer_token,
                                    globus_eid,
                                    job_node.decompress_path,
                                    grouper_name,
                                    max_crawl_threads=max_crawl_threads)

            metadata = crawler.crawl()
            job_node.metadata = metadata
            job_node.status = JobStatus.CRAWLING

            if os.path.exists(job_node.decompress_path):
                if os.path.isfile(job_node.decompress_path):
                    os.remove(job_node.decompress_path)
                    # logger.error(f"REMOVING FILE {job_node.decompress_path}")
                else:
                    shutil.rmtree(job_node.decompress_path)
                    # logger.error(f"REMOVING DIRECTORY {job_node.decompress_path}")

    return Job.to_dict(job)
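
A minimal local-invocation sketch for run_globus_crawler, assuming the job-dictionary keys accepted by Job.from_dict in the tests below; the token, endpoint UUID, and paths are placeholders, not real values.

from abyss.orchestrator.job import Job, JobStatus

# Placeholder job: only nodes with status DECOMPRESSED are crawled.
job = Job.from_dict({
    "file_path": "/source/archive.zip",
    "compressed_size": 1024,
    "decompressed_size": 4096,
    "decompress_path": "/decompress/archive",
    "status": JobStatus.DECOMPRESSED,
})

result_dict = run_globus_crawler(Job.to_dict(job),
                                 transfer_token="<globus-transfer-token>",
                                 globus_eid="<globus-endpoint-uuid>",
                                 grouper_name="",
                                 max_crawl_threads=2)
# result_dict should carry the crawl metadata, with the crawled node's status set to CRAWLING.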
Example No. 2
def process_job_headers(job_dict: dict) -> dict:
    """Takes a job object and reads the file header and determines the
    decompressed size of the job.

    Parameters
    ----------
    job_dict : dict
        Job dictionary.

    Returns
    -------
    dict
        Job dictionary containing the decompressed size.
    """
    import os
    import sys
    sys.path.insert(0, "/")
    from abyss.orchestrator.job import Job, JobStatus
    from abyss.utils.decompressors import get_zip_decompressed_size, get_tar_decompressed_size

    job = Job.from_dict(job_dict)

    if job.status != JobStatus.UNPREDICTED_PREFETCHED:
        raise ValueError(f"Job {job.file_path} status is not UNPREDICTED_PREFETCHED")
    elif job.file_path.endswith(".zip"):
        decompressed_size = get_zip_decompressed_size(job.transfer_path)
    elif job.file_path.endswith(".tar"):
        decompressed_size = get_tar_decompressed_size(job.transfer_path)
    else:
        raise ValueError(f"Cannot process headers of {job.file_path}")

    job.decompressed_size = decompressed_size
    os.remove(job.transfer_path)

    return Job.to_dict(job)
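
A hedged usage sketch for process_job_headers; paths and sizes are illustrative. The job must already have status UNPREDICTED_PREFETCHED and a transfer_path pointing at the prefetched .zip or .tar file.

from abyss.orchestrator.job import Job, JobStatus

job_dict = Job.to_dict(Job.from_dict({
    "file_path": "/source/data.tar",           # extension selects the header reader
    "compressed_size": 2048,
    "transfer_path": "/transfer/data.tar",     # prefetched local copy, deleted afterwards
    "status": JobStatus.UNPREDICTED_PREFETCHED,
}))

updated = process_job_headers(job_dict)
# updated should expose the decompressed size read from the tar header.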
Example No. 3
    def test_from_dict(self):
        with self.assertRaises(ValueError):
            bad_dict_params_1 = {
                "compressed_size": 10
            }
            Job.from_dict(bad_dict_params_1)

        good_dict_params = {
            "file_path": "/",
            "compressed_size": 0,
            "decompressed_size": 0,
            "worker_id": "1",
            "transfer_path": "/transfer",
            "decompress_path": "/decompress",
            "funcx_decompress_id": "2",
            "funcx_crawl_id": "3",
            "status": JobStatus.UNPREDICTED
        }

        job = Job.from_dict(good_dict_params)
        self.assertEqual(job.file_path, good_dict_params["file_path"])
        self.assertEqual(job.compressed_size,
                         good_dict_params["compressed_size"])
        self.assertEqual(job.decompressed_size,
                         good_dict_params["decompressed_size"])
        self.assertEqual(job.worker_id, good_dict_params["worker_id"])
        self.assertEqual(job.transfer_path,
                         good_dict_params["transfer_path"])
        self.assertEqual(job.decompress_path,
                         good_dict_params["decompress_path"])
        self.assertEqual(job.funcx_decompress_id,
                         good_dict_params["funcx_decompress_id"])
        self.assertEqual(job.funcx_crawl_id,
                         good_dict_params["funcx_crawl_id"])
        self.assertEqual(job.status, good_dict_params["status"])
Example No. 4
    def test_dispatch(self):
        workers = [
            Worker.from_dict({
                "globus_eid": "0",
                "funcx_eid": "1",
                "max_available_space": 97,
                "transfer_dir": "/transfer",
                "decompress_dir": "/dir",
            }),
            Worker.from_dict({
                "globus_eid": "0",
                "funcx_eid": "1",
                "max_available_space": 57,
                "transfer_dir": "/transfer",
                "decompress_dir": "/dir",
            })
        ]
        worker_batches = {
            workers[0].worker_id: [
                Job.from_dict(({
                    "file_path": f"/{i}",
                    "compressed_size": 0,
                    "decompressed_size": i,
                })) for i in range(10, 20)
            ],
            workers[1].worker_id: [
                Job.from_dict(({
                    "file_path": f"/{i}",
                    "compressed_size": 0,
                    "decompressed_size": i,
                })) for i in range(0, 10)
            ]
        }
        preserved_batches = {
            workers[0].worker_id: worker_batches[workers[0].worker_id],
            workers[1].worker_id: worker_batches[workers[1].worker_id]
        }

        dispatcher = MaxFirstDispatcher(workers)
        dispatcher.dispatch_batch(worker_batches)
        worker_queues = dispatcher.worker_queues

        for worker_id, worker_batch in preserved_batches.items():
            preserved_batches[worker_id] = sorted(
                worker_batch,
                reverse=True,
                key=(lambda x: x.decompressed_size))

        for worker_id, worker_queue in worker_queues.items():
            self.assertEqual(list(worker_queue.queue),
                             preserved_batches[worker_id])

        self.assertEqual(worker_batches, {
            workers[0].worker_id: [],
            workers[1].worker_id: []
        })
        self.assertEqual(dispatcher.worker_batches, {
            workers[0].worker_id: [],
            workers[1].worker_id: []
        })
Example No. 5
    def test_update_worker(self):
        workers = [
            Worker.from_dict({
                "globus_eid": "0",
                "funcx_eid": "1",
                "max_available_space": 97,
                "transfer_dir": "/transfer",
                "decompress_dir": "/dir",
            }),
            Worker.from_dict({
                "globus_eid": "0",
                "funcx_eid": "1",
                "max_available_space": 57,
                "transfer_dir": "/transfer",
                "decompress_dir": "/dir",
            })
        ]
        jobs = [
            Job.from_dict({
                "file_path": "/",
                "compressed_size": 0,
                "decompressed_size": 10
            }),
            Job.from_dict({
                "file_path": "/",
                "compressed_size": 0,
                "decompressed_size": 5
            })
        ]

        worker_0 = workers[0]

        batcher = DummyBatcher(workers, jobs)
        worker_0.curr_available_space += 100

        batcher.update_worker(worker_0)
        self.assertEqual(
            worker_0.curr_available_space,
            batcher.worker_dict[worker_0.worker_id].curr_available_space)

        with self.assertRaises(ValueError):
            batcher.update_worker(
                Worker.from_dict({
                    "globus_eid": "0",
                    "funcx_eid": "1",
                    "max_available_space": 57,
                    "transfer_dir": "/transfer",
                    "decompress_dir": "/dir",
                }))
Example No. 6
    def test_multiple_batch(self):
        workers = [
            Worker.from_dict({
                "globus_eid": "0",
                "funcx_eid": "1",
                "max_available_space": 97,
                "transfer_dir": "/transfer",
                "decompress_dir": "/dir",
            }),
            Worker.from_dict({
                "globus_eid": "0",
                "funcx_eid": "1",
                "max_available_space": 57,
                "transfer_dir": "/transfer",
                "decompress_dir": "/dir",
            })
        ]
        jobs = [
            Job.from_dict(({
                "file_path": f"/{i}",
                "compressed_size": 0,
                "decompressed_size": i,
            })) for i in range(10, 20)
        ]

        batcher = MMDBatcher(workers, jobs)
        batches = batcher.worker_batches

        queued_jobs = []
        for _ in range(batcher.job_queue.qsize()):
            job = batcher.job_queue.get()
            queued_jobs.append(job)
            batcher.job_queue.put(job)

        self.assertEqual(set([job.decompressed_size for job in queued_jobs]),
                         {19})

        batcher.batch_job(
            Job.from_dict(({
                "file_path": f"/{100}",
                "compressed_size": 0,
                "decompressed_size": 100,
            })))
        batches_1 = batcher.worker_batches

        self.assertEqual(batches, batches_1)
Example No. 7
def run_local_crawler(job_dict: dict, grouper_name: str, max_crawl_threads=1):
    import logging
    import os
    import sys
    import shutil
    sys.path.insert(0, "/")
    from abyss.orchestrator.job import Job, JobStatus
    from abyss.crawlers.local_crawler.local_crawler import LocalCrawler
    from abyss.definitions import ROOT_DIR

    logger = logging.getLogger(__name__)
    f_handler = logging.FileHandler('/project2/chard/skluzacek/ryan-data/abyss/file.log')
    f_handler.setLevel(logging.ERROR)
    f_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    f_handler.setFormatter(f_format)
    logger.addHandler(f_handler)

    job = Job.from_dict(job_dict)

    for job_node in job.bfs_iterator(include_root=True):
        if job_node.status == JobStatus.DECOMPRESSED:
            logger.error(f"CRAWLING {job_node.decompress_path}")
            crawler = LocalCrawler(job_node.decompress_path,
                                   grouper_name,
                                   max_crawl_threads=max_crawl_threads)

            metadata = crawler.crawl()
            job_node.metadata = metadata
            job_node.status = JobStatus.CRAWLING

        if os.path.exists(job_node.decompress_path):
            if os.path.isfile(job_node.decompress_path):
                os.remove(job_node.decompress_path)
                logger.error(f"REMOVING FILE {job_node.decompress_path}")
            else:
                shutil.rmtree(job_node.decompress_path)
                logger.error(f"REMOVING DIRECTORY {job_node.decompress_path}")

    return Job.to_dict(job)
Example No. 8
    def test_is_failed_job(self):
        workers = []

        for i in range(10):
            workers.append(Worker(None, None, None, None, i))

        jobs = [
            Job.from_dict({
                "file_path": "/",
                "compressed_size": 0,
                "decompressed_size": 10
            }),
            Job.from_dict({
                "file_path": "/",
                "compressed_size": 0,
                "decompressed_size": 5
            })
        ]

        batcher = DummyBatcher(workers, jobs)

        self.assertTrue(batcher._is_failed_job(jobs[0]))
        self.assertFalse(batcher._is_failed_job(jobs[1]))
Example No. 9
    def _thread_funcx_decompress(self) -> None:
        """Thread function to submit decompression tasks to funcX.

        Returns
        -------
        None
        """
        while not self.kill_status:
            prefetched_queue = self.job_statuses[JobStatus.PREFETCHED]
            decompressing_queue = self.job_statuses[JobStatus.DECOMPRESSING]

            batch = self.funcx_client.create_batch()
            batched_jobs = []
            while not prefetched_queue.empty():
                self.thread_statuses["funcx_decompress_thread"] = True
                job = prefetched_queue.get()
                job_dict = Job.to_dict(job)
                worker_id = job.worker_id

                worker = self.worker_dict[worker_id]
                batch.add(job_dict,
                          worker.decompress_dir,
                          endpoint_id=worker.funcx_eid,
                          function_id=DECOMPRESSOR_FUNCX_UUID)
                batched_jobs.append(job)

            if len(batch.tasks) > 0:
                batch_res = self.funcx_client.batch_run(batch)
            else:
                batch_res = None

            for idx, job in enumerate(batched_jobs):
                logger.info(f"{job.file_path} DECOMPRESSING")
                for job_node in job.bfs_iterator(include_root=True):
                    job_node.funcx_decompress_id = batch_res[idx]
                    if job_node.status == JobStatus.PREFETCHED:
                        job_node.status = JobStatus.DECOMPRESSING

                decompressing_queue.put(job)
                logger.info(
                    f"LATENCY PLACING {job.file_id} INTO DECOMPRESSING AT {time.time()}"
                )

            time.sleep(5)

            self.thread_statuses["funcx_decompress_thread"] = False
Example No. 10
    def test_batch(self):
        workers = [
            Worker.from_dict({
                "globus_eid": "0",
                "funcx_eid": "1",
                "max_available_space": 97,
                "transfer_dir": "/transfer",
                "decompress_dir": "/dir",
            }),
            Worker.from_dict({
                "globus_eid": "0",
                "funcx_eid": "1",
                "max_available_space": 57,
                "transfer_dir": "/transfer",
                "decompress_dir": "/dir",
            })
        ]
        jobs = [
            Job.from_dict(({
                "file_path": f"/{i}",
                "compressed_size": 0,
                "decompressed_size": i,
            })) for i in range(10, 20)
        ]

        batcher = MMDBatcher(workers, jobs)
        batches = batcher.worker_batches

        batch_0 = batches[workers[0].worker_id]
        self.assertEqual(set([job.decompressed_size for job in batch_0]),
                         {10, 12, 13, 15, 16, 18})

        batch_1 = batches[workers[1].worker_id]
        self.assertEqual(set([job.decompressed_size for job in batch_1]),
                         {11, 14, 17})

        queued_jobs = []
        while not batcher.job_queue.empty():
            queued_jobs.append(batcher.job_queue.get())

        self.assertEqual(set([job.decompressed_size for job in queued_jobs]),
                         {19})

        for _, worker_batch in batches.items():
            for job in worker_batch:
                self.assertTrue(job not in batcher.jobs)
Example No. 11
    def _thread_funcx_process_headers(self) -> None:
        """Thread function to submit header processing tasks to funcX.

        Returns
        -------
        None
        """
        while not self.kill_status:
            unpredicted_prefetched_queue = self.job_statuses[
                JobStatus.UNPREDICTED_PREFETCHED]
            processing_headers_queue = self.job_statuses[
                JobStatus.PROCESSING_HEADERS]

            batch = self.funcx_client.create_batch()
            batched_jobs = []
            while not unpredicted_prefetched_queue.empty():
                self.thread_statuses["funcx_processing_headers_thread"] = True
                job = unpredicted_prefetched_queue.get()
                logger.info(f"{job.file_path} PROCESSING HEADERS")
                job_dict = Job.to_dict(job)
                worker_id = job.worker_id

                worker = self.worker_dict[worker_id]
                batch.add(job_dict,
                          endpoint_id=worker.funcx_eid,
                          function_id=PROCESS_HEADER_FUNCX_UUID)
                batched_jobs.append(job)

            if len(batch.tasks) > 0:
                batch_res = self.funcx_client.batch_run(batch)
            else:
                batch_res = None

            for idx, job in enumerate(batched_jobs):
                job.funcx_process_headers_id = batch_res[idx]
                job.status = JobStatus.PROCESSING_HEADERS

                processing_headers_queue.put(job)
                logger.info(f"{job.file_path} PROCESSING HEADERS QUEUE")

            time.sleep(5)

            self.thread_statuses["funcx_processing_headers_thread"] = False
Example No. 12
    def test_dispatch(self):
        workers = [
            Worker.from_dict({
                "globus_eid": "0",
                "funcx_eid": "1",
                "max_available_space": 97,
                "transfer_dir": "/transfer",
                "decompress_dir": "/dir",
            })
        ]
        worker_batches = {
            workers[0].worker_id: [
                Job.from_dict(({
                    "file_path": f"/{i}",
                    "compressed_size": 0,
                    "decompressed_size": i,
                })) for i in range(1, 11)
            ]
        }

        worker_batch_0 = worker_batches[workers[0].worker_id]
        preserved_batches = {
            workers[0].worker_id: [
                worker_batch_0[9], worker_batch_0[0], worker_batch_0[1],
                worker_batch_0[2], worker_batch_0[3], worker_batch_0[8],
                worker_batch_0[4], worker_batch_0[5], worker_batch_0[7],
                worker_batch_0[6]
            ]
        }

        dispatcher = MaxMinDispatcher(workers)
        dispatcher.dispatch_batch(worker_batches)
        worker_queues = dispatcher.worker_queues

        for worker_id, worker_queue in worker_queues.items():
            self.assertEqual(list(worker_queue.queue),
                             preserved_batches[worker_id])

        self.assertEqual(worker_batches, {workers[0].worker_id: []})
        self.assertEqual(dispatcher.worker_batches, {workers[0].worker_id: []})
Example No. 13
    def test_validate_dict_params(self):
        with self.assertRaises(ValueError):
            bad_dict_params = {
                "file_path": 10,
                "compressed_size": 10
            }
            Job.validate_dict_params(bad_dict_params)

        with self.assertRaises(ValueError):
            bad_dict_params_1 = {
                "compressed_size": 10
            }
            Job.validate_dict_params(bad_dict_params_1)

        good_dict_params = {
            "file_path": "/",
            "compressed_size": 10
        }
        Job.validate_dict_params(good_dict_params)
Example No. 14

if __name__ == "__main__":
    prefetcher = GlobusPrefetcher(
        "AgEqE5QBmdy5NBEyqM1Gx2N4mN299MWN0Y2pPjOvNxqGjMEBpyiwCegxa3MnylpyjDYoQ1bXKjmVYyTygwbYkcp5gz",
        "4f99675c-ac1f-11ea-bee8-0e716405a293",
        "af7bda53-6d04-11e5-ba46-22000b92c6ec",
        "/project2/chard/skluzacek/ryan-data/transfer_dir",
        max_concurrent_transfers=4,
        max_files_per_batch=10,
        max_batch_size=1 * 10**9)

    import pandas as pd

    deep_blue_crawl_df = pd.read_csv(
        "/Users/ryan/Documents/CS/abyss/data/deep_blue_crawl.csv")

    sorted_files = deep_blue_crawl_df.sort_values(by=["size_bytes"])

    filtered_files = sorted_files.iloc[0:10]

    compressed_files = [{
        "file_path": x[0],
        "compressed_size": x[1]
    } for _, x in filtered_files.iterrows()]

    for compressed_file in compressed_files:
        job = Job.from_dict(compressed_file)
        job.file_id = str(uuid.uuid4())
        prefetcher.transfer_job(job)
Example No. 15
    def __init__(self,
                 abyss_id: str,
                 globus_source_eid: str,
                 transfer_token: str,
                 compressed_files: List[Dict],
                 worker_params: List[Dict],
                 psql_conn,
                 s3_conn,
                 grouper="",
                 batcher="mmd",
                 dispatcher="fifo",
                 prediction_mode="ml"):
        """Abyss orchestrator class.
        Parameters
        ----------
        abyss_id : str
            Abyss ID for orchestration.
        globus_source_eid : str
            Globus endpoint of source data storage.
        transfer_token : str
            Globus token to authorize transfers between endpoints.
        compressed_files : list(dict)
            List of dictionaries for compressed files to process.
            Dictionaries contain "file_path" and "compressed_size".
        worker_params : list(dict)
            List of valid worker parameter dictionaries to create
            workers.
        psql_conn :
            PostgreSQL connection object to update status.
        s3_conn :
            S3 connection object to push results to.
        grouper : str
            Name of grouper to use when crawling.
        batcher : str
            Name of batcher to use.
        dispatcher : str
            Name of dispatchers to use.
        prediction_mode: str
            Mode of prediction to use to predict decompressed file size.
            "ml" to use machine learning method or "header" to use
            metadata stored in the header of compressed files (where
            possible).
        """
        self.abyss_id = abyss_id
        self.globus_source_eid = globus_source_eid
        self.transfer_token = transfer_token
        self.grouper = grouper
        self.prediction_mode = prediction_mode

        self.worker_dict = dict()
        for worker_param in worker_params:
            worker = Worker.from_dict(worker_param)
            self.worker_dict[worker.worker_id] = worker

        self.prefetchers = dict()
        for worker in self.worker_dict.values():
            globus_dest_eid = worker.globus_eid
            transfer_dir = worker.transfer_dir
            prefetcher = GlobusPrefetcher(self.transfer_token,
                                          self.globus_source_eid,
                                          globus_dest_eid, transfer_dir, 4)

            self.prefetchers[worker.worker_id] = prefetcher

        self.predictors = dict()
        for file_type, predictor in FILE_PREDICTOR_MAPPING.items():
            file_predictor = predictor()
            file_predictor.load_models()
            self.predictors[file_type] = file_predictor

        self.job_statuses = dict(
            zip([x for x in JobStatus],
                [Queue() for _ in range(len(JobStatus))]))
        unpredicted_set = self.job_statuses[JobStatus.UNPREDICTED]
        for compressed_file in compressed_files:
            job = Job.from_dict(compressed_file)
            job.status = JobStatus.UNPREDICTED
            job.file_id = str(uuid.uuid4())
            job.decompressed_size = 0
            unpredicted_set.put(job)
            logger.info(
                f"LATENCY PLACING {job.file_id} INTO UNPREDICTED AT {time.time()}"
            )

        self.scheduler = Scheduler(batcher, dispatcher,
                                   list(self.worker_dict.values()), [])
        self.worker_queues = dict()

        self.psql_conn = psql_conn
        self.abyss_metadata = []
        self.s3_conn = s3_conn

        self._unpredicted_preprocessing_thread = threading.Thread(
            target=self._unpredicted_preprocessing, daemon=True)
        self._predictor_thread = threading.Thread(
            target=self._predict_decompressed_size, daemon=True)
        self._scheduler_thread = threading.Thread(
            target=self._thread_schedule_jobs, daemon=True)
        self._prefetcher_thread = threading.Thread(
            target=self._thread_prefetch, daemon=True)
        self._prefetcher_poll_thread = threading.Thread(
            target=self._thread_poll_prefetch, daemon=True)
        self._funcx_process_headers_thread = threading.Thread(
            target=self._thread_funcx_process_headers, daemon=True)
        self._funcx_decompress_thread = threading.Thread(
            target=self._thread_funcx_decompress, daemon=True)
        self._funcx_crawl_thread = threading.Thread(
            target=self._thread_funcx_crawl, daemon=True)
        self._funcx_poll_thread = threading.Thread(
            target=self._thread_funcx_poll, daemon=True)
        self._consolidate_results_thread = threading.Thread(
            target=self._thread_consolidate_crawl_results, daemon=True)
        self._lock = threading.Lock()
        self.thread_statuses = {
            "predictor_thread": True,
            "scheduler_thread": True,
            "prefetcher_thread": True,
            "prefetcher_poll_thread": True,
            "funcx_decompress_thread": True,
            "funcx_crawl_thread": True,
            "funcx_poll_thread": True,
            "consolidate_results_thread": True
        }

        self.funcx_client = FuncXClient()
        self.kill_status = False
        self.crawl_results = Queue()
Example No. 16
    def _thread_consolidate_crawl_results(self) -> None:
        """Thread function to consolidate crawl results and push to SQS.
        Returns
        -------
        None
        """
        while not self.kill_status:
            unpredicted_queue = self.job_statuses[JobStatus.UNPREDICTED]
            consolidating_queue = self.job_statuses[JobStatus.CONSOLIDATING]
            succeeded_queue = self.job_statuses[JobStatus.SUCCEEDED]
            failed_queue = self.job_statuses[JobStatus.FAILED]

            while not consolidating_queue.empty():
                self.thread_statuses["consolidate_results_thread"] = True
                job = consolidating_queue.get()
                logger.info(f"{job.file_path} CONSOLIDATING")

                resubmit_task = False
                for job_node in job.bfs_iterator(include_root=True):
                    root_path = job_node.metadata["root_path"]
                    for file_path, file_metadata in job_node.metadata[
                            "metadata"].items():
                        file_size = file_metadata["physical"]["size"]
                        is_compressed = file_metadata["physical"][
                            "is_compressed"]

                        child_file_path = os.path.join(root_path, file_path)

                        if is_compressed:
                            if "decompressed_size" in file_metadata[
                                    "physical"]:
                                decompressed_size = file_metadata["physical"][
                                    "decompressed_size"]
                            else:
                                decompressed_size = None
                            if child_file_path in job_node.child_jobs:
                                break
                            else:
                                child_job = Job(file_path=child_file_path,
                                                file_id=f"{str(uuid.uuid4())}",
                                                compressed_size=file_size)

                                if decompressed_size:
                                    child_job.decompressed_size = decompressed_size
                                    child_job.status = JobStatus.PREDICTED
                                else:
                                    child_job.status = JobStatus.UNPREDICTED

                                job_node.child_jobs[
                                    child_file_path] = child_job
                                resubmit_task = True

                if resubmit_task:
                    logger.info(f"{job.file_path} RESUBMITTING")
                    unpredicted_queue.put(job)
                    logger.info(
                        f"LATENCY PLACING {job.file_id} INTO UNPREDICTED AT {time.time()}"
                    )
                    continue

                consolidated_metadata = job.consolidate_metadata()
                self.abyss_metadata.append(consolidated_metadata)

                for job_node in job.bfs_iterator(include_root=True):
                    if job_node.status == JobStatus.CONSOLIDATING:
                        job_node.status = JobStatus.SUCCEEDED

                succeeded_queue.put(job)
                logger.info(f"{job.file_path} PLACED INTO SUCCEEDED")
                logger.info(
                    f"LATENCY PLACING {job.file_id} INTO SUCCEEDED AT {time.time()}"
                )

            while not failed_queue.empty():
                job = failed_queue.get()
                logger.info(f"{job.file_path} CONSOLIDATING FROM FAILED")
                consolidated_metadata = job.consolidate_metadata()
                self.abyss_metadata.append(consolidated_metadata)
                succeeded_queue.put(job)
                logger.info(
                    f"LATENCY PLACING {job.file_id} INTO SUCCEEDED AT {time.time()}"
                )

            self.thread_statuses["consolidate_results_thread"] = False
Example No. 17
    def _thread_funcx_poll(self) -> None:
        """Thread function to poll funcX for results.

        Returns
        -------
        None
        """
        unpredicted_queue = self.job_statuses[JobStatus.UNPREDICTED]
        decompressing_queue = self.job_statuses[JobStatus.DECOMPRESSING]
        decompressed_queue = self.job_statuses[JobStatus.DECOMPRESSED]
        crawling_queue = self.job_statuses[JobStatus.CRAWLING]
        processing_headers_queue = self.job_statuses[
            JobStatus.PROCESSING_HEADERS]
        predicted_queue = self.job_statuses[JobStatus.PREDICTED]
        consolidating_queue = self.job_statuses[JobStatus.CONSOLIDATING]
        failed_queue = self.job_statuses[JobStatus.FAILED]

        while not self.kill_status:
            processing_headers_funcx_ids = []
            processing_header_jobs = []
            while not processing_headers_queue.empty():
                self.thread_statuses["funcx_poll_thread"] = True
                job = processing_headers_queue.get()
                logger.info(f"{job.file_path} POLLING HEADER PROCESSING")
                processing_headers_funcx_ids.append(
                    job.funcx_process_headers_id)
                processing_header_jobs.append(job)

            processing_headers_statuses = self.funcx_client.get_batch_status(
                task_id_list=processing_headers_funcx_ids)
            for job in processing_header_jobs:
                worker = self.worker_dict[job.worker_id]
                job_status = processing_headers_statuses[
                    job.funcx_process_headers_id]

                if job_status["pending"]:
                    processing_headers_queue.put(job)
                elif job_status["status"] == "success":
                    logger.info(f"{job.file_path} COMPLETED HEADER PROCESSING")
                    job = Job.from_dict(job_status["result"])
                    job.status = JobStatus.PREDICTED

                    worker.curr_available_space += job.compressed_size
                    predicted_queue.put(job)
                elif job_status["status"] == "failed":
                    worker.curr_available_space += job.compressed_size
                    unpredicted_predict_queue = self.job_statuses[
                        JobStatus.UNPREDICTED_PREDICT]
                    job.status = JobStatus.UNPREDICTED_PREDICT
                    unpredicted_predict_queue.put(job)

            time.sleep(5)

            decompressing_funcx_ids = []
            decompressing_jobs = []
            while not decompressing_queue.empty():
                self.thread_statuses["funcx_poll_thread"] = True
                job = decompressing_queue.get()
                logger.info(f"{job.file_path} POLLING DECOMPRESS")
                decompressing_funcx_ids.append(job.funcx_decompress_id)
                decompressing_jobs.append(job)

            decompressing_statuses = self.funcx_client.get_batch_status(
                decompressing_funcx_ids)
            for job in decompressing_jobs:
                worker = self.worker_dict[job.worker_id]
                job_status = decompressing_statuses[job.funcx_decompress_id]
                logger.info(job_status)

                if job_status["pending"]:
                    decompressing_queue.put(job)
                elif job_status["status"] == "success":
                    job = Job.from_dict(job_status["result"])
                    logger.info(f"{job.file_path} COMPLETED DECOMPRESS")

                    if job.status == JobStatus.FAILED:
                        worker.curr_available_space += job.total_size
                        failed_queue.put(job)
                        logger.info(f"{job.file_path} PLACED INTO FAILED")
                        logger.info(
                            f"LATENCY PLACING {job.file_id} INTO FAILED AT {time.time()}"
                        )
                        continue

                    has_unpredicted = False
                    for job_node in job.bfs_iterator(include_root=True):
                        if job_node.status == JobStatus.DECOMPRESSING:
                            job_node.status = JobStatus.DECOMPRESSED
                        elif job_node.status == JobStatus.UNPREDICTED:
                            has_unpredicted = True

                    if has_unpredicted:
                        unpredicted_queue.put(job)
                        logger.info(
                            f"LATENCY PLACING {job.file_id} INTO UNPREDICTED AT {time.time()}"
                        )
                        logger.info(f"{job.file_path} PLACED INTO UNPREDICTED")

                    worker.curr_available_space += job.compressed_size

                    decompressed_queue.put(job)
                    logger.info(
                        f"LATENCY PLACING {job.file_id} INTO DECOMPRESSED AT {time.time()}"
                    )
                    logger.info(f"{job.file_path} PLACED INTO DECOMPRESSED")
                elif job_status["status"] == "failed":
                    worker.curr_available_space += job.compressed_size
                    logger.info(
                        f"ERROR for {job.file_path}: {job_status['exception']}"
                    )
                    logger.info(f"{job.file_path} PLACED INTO FAILED")
                    failed_queue.put(job)
                    logger.info(
                        f"LATENCY PLACING {job.file_id} INTO FAILED AT {time.time()}"
                    )

            time.sleep(5)

            crawling_funcx_ids = []
            crawling_jobs = []
            while not crawling_queue.empty():
                self.thread_statuses["funcx_poll_thread"] = True
                job = crawling_queue.get()
                logger.info(f"{job.file_path} POLLING CRAWL")
                crawling_funcx_ids.append(job.funcx_crawl_id)
                crawling_jobs.append(job)

            crawling_statuses = self.funcx_client.get_batch_status(
                crawling_funcx_ids)
            for job in crawling_jobs:
                worker = self.worker_dict[job.worker_id]
                job_status = crawling_statuses[job.funcx_crawl_id]

                if job_status["pending"]:
                    crawling_queue.put(job)
                elif job_status["status"] == "success":
                    result = job_status["result"]
                    job = Job.from_dict(result)
                    logger.info(f"{job.file_path} COMPLETED CRAWL")

                    for job_node in job.bfs_iterator(include_root=True):
                        if job_node.status == JobStatus.CRAWLING:
                            job_node.status = JobStatus.CONSOLIDATING

                    worker.curr_available_space += (job.total_size -
                                                    job.compressed_size)
                    consolidating_queue.put(job)
                    logger.info(
                        f"LATENCY PLACING {job.file_id} INTO CONSOLIDATING AT {time.time()}"
                    )
                    logger.info(f"{job.file_path} PLACED INTO CONSOLIDATING")
                elif job_status["status"] == "failed":
                    worker.curr_available_space += (job.total_size -
                                                    job.compressed_size)
                    failed_queue.put(job)
                    logger.info(f"{job.file_path} PLACED INTO FAILED")
                    logger.info(
                        f"LATENCY PLACING {job.file_id} INTO FAILED AT {time.time()}"
                    )

            time.sleep(5)

            self.thread_statuses["funcx_poll_thread"] = False
Example No. 18
def run_decompressor(job_dict: dict, decompress_dir: str):
    """Iterates through a Job and recursively decompresses files.

    Parameters
    ----------
    job_dict : dict
        Job dictionary to iterate through.
    decompress_dir : str
        Location on worker to decompress files to.

    Returns
    -------
    dict
        Job dictionary with updated decompression results.
    """
    import os
    import sys
    import logging
    from shutil import rmtree
    sys.path.insert(0, "/")
    from abyss.orchestrator.job import Job, JobStatus
    from abyss.utils.decompressors import decompress
    from abyss.utils.error_utils import is_critical_oom_error, is_critical_decompression_error
    from abyss.utils.funcx_functions import get_directory_size
    from abyss.definitions import ROOT_DIR
    job = Job.from_dict(job_dict)

    logger = logging.getLogger(__name__)
    f_handler = logging.FileHandler('/project2/chard/skluzacek/ryan-data/abyss/file.log')
    f_handler.setLevel(logging.ERROR)
    f_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    f_handler.setFormatter(f_format)
    logger.addHandler(f_handler)

    job_nodes = job.to_queue(include_root=True)

    while len(job_nodes):
        job_node = job_nodes.popleft()

        file_path = job_node.transfer_path
        decompress_type = os.path.splitext(job_node.file_path)[1][1:]
        logger.error(f"DECOMPRESSING {file_path}")

        if job_node.status == JobStatus.FAILED:
            continue

        try:
            if decompress_type == "zip":
                full_extract_dir = os.path.join(decompress_dir,
                                                job_node.file_id)
                decompress(file_path, decompress_type, full_extract_dir)
            elif decompress_type == "tar":
                full_extract_dir = os.path.join(decompress_dir,
                                                job_node.file_id)
                decompress(file_path, decompress_type, full_extract_dir)
            elif decompress_type == "gz":
                extract_dir = os.path.join(os.path.join(decompress_dir, job_node.file_id),
                                           os.path.basename(job_node.file_path[:-3]))
                full_extract_dir = os.path.dirname(extract_dir)

                if not os.path.exists(os.path.dirname(extract_dir)):
                    os.makedirs(os.path.dirname(extract_dir))

                decompress(file_path, decompress_type, extract_dir)

            job_node.decompress_path = full_extract_dir

            logger.error(f"DECOMPRESSED {file_path} TO {full_extract_dir}")

            for child_job in job_node.child_jobs.values():
                # TODO: Fix this gross if statement. We might want to decompress
                # gz files into a directory
                if os.path.basename(full_extract_dir) == child_job.file_path:
                    child_job.transfer_path = full_extract_dir
                else:
                    child_job.transfer_path = os.path.join(decompress_dir,
                                                           child_job.file_path)

            if job_node.status == JobStatus.PREFETCHED:
                job_node.status = JobStatus.DECOMPRESSING

            logger.error(f"REMOVING {job_node.transfer_path}")
            os.remove(job_node.transfer_path)

        except Exception as e:
            logger.error(f"ERROR TYPE {e}")
            logger.error(f"CAUGHT ERROR", exc_info=True)
            if is_critical_decompression_error(e):
                logger.error("HANDLED DECOMPRESSION ERROR")
                if job_node.status == JobStatus.PREFETCHED:
                    job_node.status = JobStatus.FAILED
                    job_node.error = str(e)

                os.remove(job_node.transfer_path)

                if os.path.exists(full_extract_dir):
                    rmtree(full_extract_dir)
            elif is_critical_oom_error(e):
                logger.error("PROCESSING OOM ERROR")
                decompressed_size = get_directory_size(full_extract_dir)
                if decompressed_size > job_node.decompressed_size:
                    logger.error("FILE TOO LARGE")
                    os.remove(job_node.transfer_path)
                    rmtree(full_extract_dir)

                    # child_jobs maps child file paths to Job objects, so iterate
                    # over the values when pruning them from the work queue
                    for child_job in job_node.child_jobs.values():
                        job_nodes.remove(child_job)

                    job_node.status = JobStatus.UNPREDICTED
                    job_node.error = str(e)

                else:
                    logger.error("ATTEMPTING TO REPROCESS")
                    rmtree(full_extract_dir)
                    job_nodes.appendleft(job_node)

            else:
                if job_node.status == JobStatus.PREFETCHED:
                    job_node.status = JobStatus.FAILED
                    job_node.error = str(e)

                os.remove(job_node.transfer_path)

                if os.path.exists(full_extract_dir):
                    rmtree(full_extract_dir)

    return Job.to_dict(job)
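
A hedged local sketch for run_decompressor, with placeholder paths and a file_id assigned the way Example No. 14 does; the exact serialized form is whatever Job.to_dict produces.

import uuid
from abyss.orchestrator.job import Job, JobStatus

job = Job.from_dict({
    "file_path": "/source/archive.zip",
    "compressed_size": 2048,
    "decompressed_size": 8192,
    "transfer_path": "/transfer/archive.zip",  # removed once decompression succeeds
    "status": JobStatus.PREFETCHED,
})
job.file_id = str(uuid.uuid4())  # names the extraction directory under decompress_dir

result = run_decompressor(Job.to_dict(job), decompress_dir="/decompress")
# On success the node's decompress_path points at /decompress/<file_id> and its
# status advances to DECOMPRESSING; on errors it becomes FAILED or UNPREDICTED.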
Example No. 19
    def test_batch(self):
        for batcher_name in BATCHER_NAME_MAPPING.keys():
            for dispatcher_name in DISPATCHER_NAME_MAPPING.keys():
                workers = [
                    Worker.from_dict({
                        "globus_eid": "0",
                        "funcx_eid": "1",
                        "max_available_space": 10,
                        "transfer_dir": "/transfer",
                        "decompress_dir": "/dir",
                    })
                ]
                jobs = [
                    Job.from_dict(({
                        "file_path": f"/0",
                        "compressed_size": 0,
                        "decompressed_size": 10,
                    })),
                    Job.from_dict(({
                        "file_path": f"/1",
                        "compressed_size": 0,
                        "decompressed_size": 20,
                    }))
                ]

                scheduler = Scheduler(batcher_name, dispatcher_name, workers,
                                      jobs)
                worker_queues = scheduler.worker_queues

                # Making sure only the correct job gets scheduled
                self.assertEqual(len(worker_queues), 1)

                worker_queue_0 = list(
                    worker_queues[workers[0].worker_id].queue)
                self.assertEqual(len(worker_queue_0), 1)
                self.assertEqual(worker_queue_0[0].decompressed_size, 10)

                self.assertTrue(
                    worker_queue_0[0] not in scheduler._batcher.jobs)

                # Making sure no jobs get batched twice
                scheduler.schedule_jobs([])

                new_worker_queues = scheduler.worker_queues

                self.assertEqual(len(new_worker_queues), 1)

                new_worker_queue_0 = list(
                    new_worker_queues[workers[0].worker_id].queue)
                self.assertEqual(len(new_worker_queue_0), 1)
                self.assertEqual(new_worker_queue_0[0].decompressed_size, 10)

                # Making sure jobs are appropriately batched after freeing space
                workers[0].curr_available_space += 50

                scheduler.schedule_jobs([])

                new_worker_queues_1 = scheduler.worker_queues

                self.assertEqual(len(new_worker_queues_1), 1)

                new_worker_queue_1 = list(
                    new_worker_queues_1[workers[0].worker_id].queue)
                self.assertEqual(len(new_worker_queue_1), 2)
                self.assertEqual(new_worker_queue_1[0].decompressed_size, 10)
                self.assertEqual(new_worker_queue_1[1].decompressed_size, 20)