Example #1
    def validate_dict_params(orchestrator_params: Dict) -> None:
        """Ensures dictionary of orchestrator parameters contains
        necessary parameters.
        Parameters
        ----------
        orchestrator_params : dict
            Dictionary containing parameters for AbyssOrchestrator
            object.
        Returns
        -------
            Returns None if parameters are valid, raises error if
            invalid.
        """
        try:
            for parameter_name, parameter_type in REQUIRED_ORCHESTRATOR_PARAMETERS:
                parameter = orchestrator_params[parameter_name]
                assert isinstance(parameter, parameter_type)
        except AssertionError:
            raise ValueError(
                f"Parameter {parameter_name} is not of type {parameter_type}")
        except KeyError:
            raise ValueError(f"Required parameter {parameter_name} not found")

        worker_params = orchestrator_params["worker_params"]
        for worker_param in worker_params:
            Worker.validate_dict_params(worker_param)
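
A minimal usage sketch for the validator above. The loop only implies that REQUIRED_ORCHESTRATOR_PARAMETERS yields (name, type) pairs; the entries below, and the call through AbyssOrchestrator, are assumptions modeled on the constructor in Example #11, while the worker dictionary keys match the ones used throughout the tests.

    # Sketch only: the real REQUIRED_ORCHESTRATOR_PARAMETERS is defined elsewhere
    # in the codebase; these (name, type) pairs are illustrative assumptions.
    REQUIRED_ORCHESTRATOR_PARAMETERS = [
        ("globus_source_eid", str),
        ("transfer_token", str),
        ("compressed_files", list),
        ("worker_params", list),
    ]

    orchestrator_params = {
        "globus_source_eid": "0",
        "transfer_token": "token",
        "compressed_files": [{"file_path": "/archive.zip", "compressed_size": 0}],
        "worker_params": [{
            "globus_eid": "0",
            "funcx_eid": "1",
            "max_available_space": 10,
            "transfer_dir": "/transfer",
            "decompress_dir": "/dir",
        }],
    }

    # Returns None when every required parameter is present with the right type;
    # raises ValueError otherwise (including for each nested worker dictionary).
    AbyssOrchestrator.validate_dict_params(orchestrator_params)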
Example #2
    def test_dispatch(self):
        workers = [
            Worker.from_dict({
                "globus_eid": "0",
                "funcx_eid": "1",
                "max_available_space": 97,
                "transfer_dir": "/transfer",
                "decompress_dir": "/dir",
            }),
            Worker.from_dict({
                "globus_eid": "0",
                "funcx_eid": "1",
                "max_available_space": 57,
                "transfer_dir": "/transfer",
                "decompress_dir": "/dir",
            })
        ]
        worker_batches = {
            workers[0].worker_id: [
                Job.from_dict({
                    "file_path": f"/{i}",
                    "compressed_size": 0,
                    "decompressed_size": i,
                }) for i in range(10, 20)
            ],
            workers[1].worker_id: [
                Job.from_dict({
                    "file_path": f"/{i}",
                    "compressed_size": 0,
                    "decompressed_size": i,
                }) for i in range(0, 10)
            ]
        }
        preserved_batches = {
            workers[0].worker_id: worker_batches[workers[0].worker_id],
            workers[1].worker_id: worker_batches[workers[1].worker_id]
        }

        dispatcher = MaxFirstDispatcher(workers)
        dispatcher.dispatch_batch(worker_batches)
        worker_queues = dispatcher.worker_queues

        for worker_id, worker_batch in preserved_batches.items():
            preserved_batches[worker_id] = sorted(
                worker_batch,
                reverse=True,
                key=lambda x: x.decompressed_size)

        for worker_id, worker_queue in worker_queues.items():
            self.assertEqual(list(worker_queue.queue),
                             preserved_batches[worker_id])

        self.assertEqual(worker_batches, {
            workers[0].worker_id: [],
            workers[1].worker_id: []
        })
        self.assertEqual(dispatcher.worker_batches, {
            workers[0].worker_id: [],
            workers[1].worker_id: []
        })
Example #3
    def test_update_worker(self):
        workers = [
            Worker.from_dict({
                "globus_eid": "0",
                "funcx_eid": "1",
                "max_available_space": 97,
                "transfer_dir": "/transfer",
                "decompress_dir": "/dir",
            }),
            Worker.from_dict({
                "globus_eid": "0",
                "funcx_eid": "1",
                "max_available_space": 57,
                "transfer_dir": "/transfer",
                "decompress_dir": "/dir",
            })
        ]
        jobs = [
            Job.from_dict({
                "file_path": "/",
                "compressed_size": 0,
                "decompressed_size": 10
            }),
            Job.from_dict({
                "file_path": "/",
                "compressed_size": 0,
                "decompressed_size": 5
            })
        ]

        worker_0 = workers[0]

        batcher = DummyBatcher(workers, jobs)
        worker_0.curr_available_space += 100

        batcher.update_worker(worker_0)
        self.assertEqual(
            worker_0.curr_available_space,
            batcher.worker_dict[worker_0.worker_id].curr_available_space)

        with self.assertRaises(ValueError):
            batcher.update_worker(
                Worker.from_dict({
                    "globus_eid": "0",
                    "funcx_eid": "1",
                    "max_available_space": 57,
                    "transfer_dir": "/transfer",
                    "decompress_dir": "/dir",
                }))
Example #4
    def test_batch(self):
        workers = [
            Worker.from_dict({
                "globus_eid": "0",
                "funcx_eid": "1",
                "max_available_space": 97,
                "transfer_dir": "/transfer",
                "decompress_dir": "/dir",
            }),
            Worker.from_dict({
                "globus_eid": "0",
                "funcx_eid": "1",
                "max_available_space": 57,
                "transfer_dir": "/transfer",
                "decompress_dir": "/dir",
            })
        ]
        jobs = [
            Job.from_dict({
                "file_path": f"/{i}",
                "compressed_size": 0,
                "decompressed_size": i,
            }) for i in range(10, 20)
        ]

        batcher = MMDBatcher(workers, jobs)
        batches = batcher.worker_batches

        batch_0 = batches[workers[0].worker_id]
        self.assertEqual(set([job.decompressed_size for job in batch_0]),
                         {10, 12, 13, 15, 16, 18})

        batch_1 = batches[workers[1].worker_id]
        self.assertEqual(set([job.decompressed_size for job in batch_1]),
                         {11, 14, 17})

        queued_jobs = []
        while not batcher.job_queue.empty():
            queued_jobs.append(batcher.job_queue.get())

        self.assertEqual(set([job.decompressed_size for job in queued_jobs]),
                         {19})

        for _, worker_batch in batches.items():
            for job in worker_batch:
                self.assertTrue(job not in batcher.jobs)
Example #5
    def test_multiple_batch(self):
        workers = [
            Worker.from_dict({
                "globus_eid": "0",
                "funcx_eid": "1",
                "max_available_space": 97,
                "transfer_dir": "/transfer",
                "decompress_dir": "/dir",
            }),
            Worker.from_dict({
                "globus_eid": "0",
                "funcx_eid": "1",
                "max_available_space": 57,
                "transfer_dir": "/transfer",
                "decompress_dir": "/dir",
            })
        ]
        jobs = [
            Job.from_dict({
                "file_path": f"/{i}",
                "compressed_size": 0,
                "decompressed_size": i,
            }) for i in range(10, 20)
        ]

        batcher = MMDBatcher(workers, jobs)
        batches = batcher.worker_batches

        queued_jobs = []
        for _ in range(batcher.job_queue.qsize()):
            job = batcher.job_queue.get()
            queued_jobs.append(job)
            batcher.job_queue.put(job)

        self.assertEqual(set([job.decompressed_size for job in queued_jobs]),
                         {19})

        batcher.batch_job(
            Job.from_dict({
                "file_path": "/100",
                "compressed_size": 0,
                "decompressed_size": 100,
            }))
        batches_1 = batcher.worker_batches

        self.assertEqual(batches, batches_1)
Example #6
    def test_from_dict(self):
        good_dict_params = {
            "globus_eid": "0",
            "funcx_eid": "1",
            "max_available_space": 10,
            "transfer_dir": "/transfer",
            "decompress_dir": "/dir",
        }

        worker = Worker.from_dict(good_dict_params)

        self.assertEqual(worker.globus_eid, good_dict_params["globus_eid"])
        self.assertEqual(worker.funcx_eid, good_dict_params["funcx_eid"])
        self.assertEqual(worker.max_available_space,
                         good_dict_params["max_available_space"])
        self.assertEqual(worker.transfer_dir, good_dict_params["transfer_dir"])
        self.assertEqual(worker.decompress_dir,
                         good_dict_params["decompress_dir"])
Example #7
    def test_validate_dict_params(self):
        with self.assertRaises(ValueError):
            bad_dict_params = {
                "globus_eid": 10,
                "funcx_eid": "1",
                "max_available_space": 10,
                "transfer_dir": "/transfer",
                "decompress_dir": "/dir"
            }
            Worker.validate_dict_params(bad_dict_params)

        with self.assertRaises(ValueError):
            bad_dict_params_1 = {
                "globus_eid": "0",
                "funcx_eid": "1",
                "max_available_space": 10,
                "transfer_dir": "/transfer"
            }
            Worker.validate_dict_params(bad_dict_params_1)

        with self.assertRaises(ValueError):
            bad_dict_params_2 = {
                "globus_eid": "0",
                "funcx_eid": "1",
                "max_available_space": 10,
                "transfer_dir": "/transfer",
                "decompress_dir": "/dir",
                "this should": "have an effect"
            }
            Worker.validate_dict_params(bad_dict_params_2)

        good_dict_params = {
            "globus_eid": "0",
            "funcx_eid": "1",
            "max_available_space": 10,
            "transfer_dir": "/transfer",
            "decompress_dir": "/dir"
        }
        Worker.validate_dict_params(good_dict_params)
Example #8
    def test_dispatch(self):
        workers = [
            Worker.from_dict({
                "globus_eid": "0",
                "funcx_eid": "1",
                "max_available_space": 97,
                "transfer_dir": "/transfer",
                "decompress_dir": "/dir",
            })
        ]
        worker_batches = {
            workers[0].worker_id: [
                Job.from_dict({
                    "file_path": f"/{i}",
                    "compressed_size": 0,
                    "decompressed_size": i,
                }) for i in range(1, 11)
            ]
        }

        worker_batch_0 = worker_batches[workers[0].worker_id]
        preserved_batches = {
            workers[0].worker_id: [
                worker_batch_0[9], worker_batch_0[0], worker_batch_0[1],
                worker_batch_0[2], worker_batch_0[3], worker_batch_0[8],
                worker_batch_0[4], worker_batch_0[5], worker_batch_0[7],
                worker_batch_0[6]
            ]
        }

        dispatcher = MaxMinDispatcher(workers)
        dispatcher.dispatch_batch(worker_batches)
        worker_queues = dispatcher.worker_queues

        for worker_id, worker_queue in worker_queues.items():
            self.assertEqual(list(worker_queue.queue),
                             preserved_batches[worker_id])

        self.assertEqual(worker_batches, {workers[0].worker_id: []})
        self.assertEqual(dispatcher.worker_batches, {workers[0].worker_id: []})
Example #9
    def test_is_failed_job(self):
        workers = []

        for i in range(10):
            workers.append(Worker(None, None, None, None, i))

        jobs = [
            Job.from_dict({
                "file_path": "/",
                "compressed_size": 0,
                "decompressed_size": 10
            }),
            Job.from_dict({
                "file_path": "/",
                "compressed_size": 0,
                "decompressed_size": 5
            })
        ]

        batcher = DummyBatcher(workers, jobs)

        self.assertTrue(batcher._is_failed_job(jobs[0]))
        self.assertFalse(batcher._is_failed_job(jobs[1]))
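
The test above pins down what "failed" means here: the largest worker created in the loop has 9 units of available space, so a job whose decompressed_size is 10 cannot fit anywhere and is reported as failed, while a job of size 5 is not. A standalone sketch of that check follows; it is not the actual Batcher._is_failed_job implementation, and it assumes workers expose curr_available_space as in the other tests.

    # Sketch of the behavior the test asserts, not the library's implementation.
    def is_failed_job(job, workers) -> bool:
        """A job fails when no worker has enough space for its decompressed size."""
        return all(job.decompressed_size > worker.curr_available_space
                   for worker in workers)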
Example #10
                    self.job_queue.append(job)

                worker_idx = (worker_idx + 1) % self.num_workers

            self.curr_idx = (self.curr_idx + 1) % self.num_workers


if __name__ == "__main__":
    import uuid
    import random
    workers = []
    jobs = []

    for _ in range(3):
        workers.append(
            Worker(str(uuid.uuid4()), str(uuid.uuid4()),
                   random.randint(1, 10)))

    for worker in workers:
        print(
            f"Worker {worker.worker_id}: {worker.curr_available_space} bytes")

    for i in range(10):
        jobs.append({
            "file_path": f"{i}",
            "decompressed_size": random.randint(1, 5)
        })

    batcher = RoundRobinBatcher(workers, jobs)
    print(f"Failed jobs {batcher.failed_jobs}")

    for worker_id, batch in batcher.worker_batches.items():
        print(f"Worker {worker_id}: {batch}")
Example #11
    def __init__(self,
                 abyss_id: str,
                 globus_source_eid: str,
                 transfer_token: str,
                 compressed_files: List[Dict],
                 worker_params: List[Dict],
                 psql_conn,
                 s3_conn,
                 grouper="",
                 batcher="mmd",
                 dispatcher="fifo",
                 prediction_mode="ml"):
        """Abyss orchestrator class.
        Parameters
        ----------
        abyss_id : str
            Abyss ID for orchestration.
        globus_source_eid : str
            Globus endpoint of source data storage.
        transfer_token : str
            Globus token to authorize transfers between endpoints.
        compressed_files : list(dict)
            List of dictionaries for compressed files to process.
            Dictionaries contain "file_path" and "compressed_size".
        worker_params : list(dict)
            List of valid worker parameter dictionaries to create
            workers.
        psql_conn :
            PostgreSQL connection object to update status.
        s3_conn :
            S3 connection object to push results to S3.
        grouper : str
            Name of grouper to use when crawling.
        batcher : str
            Name of batcher to use.
        dispatcher : str
            Name of dispatcher to use.
        prediction_mode: str
            Mode of prediction to use to predict decompressed file size.
            "ml" to use machine learning method or "header" to use
            metadata stored in the header of compressed files (where
            possible).
        """
        self.abyss_id = abyss_id
        self.globus_source_eid = globus_source_eid
        self.transfer_token = transfer_token
        self.grouper = grouper
        self.prediction_mode = prediction_mode

        self.worker_dict = dict()
        for worker_param in worker_params:
            worker = Worker.from_dict(worker_param)
            self.worker_dict[worker.worker_id] = worker

        self.prefetchers = dict()
        for worker in self.worker_dict.values():
            globus_dest_eid = worker.globus_eid
            transfer_dir = worker.transfer_dir
            prefetcher = GlobusPrefetcher(self.transfer_token,
                                          self.globus_source_eid,
                                          globus_dest_eid, transfer_dir, 4)

            self.prefetchers[worker.worker_id] = prefetcher

        self.predictors = dict()
        for file_type, predictor in FILE_PREDICTOR_MAPPING.items():
            file_predictor = predictor()
            file_predictor.load_models()
            self.predictors[file_type] = file_predictor

        self.job_statuses = {job_status: Queue() for job_status in JobStatus}
        unpredicted_set = self.job_statuses[JobStatus.UNPREDICTED]
        for compressed_file in compressed_files:
            job = Job.from_dict(compressed_file)
            job.status = JobStatus.UNPREDICTED
            job.file_id = str(uuid.uuid4())
            job.decompressed_size = 0
            unpredicted_set.put(job)
            logger.info(
                f"LATENCY PLACING {job.file_id} INTO UNPREDICTED AT {time.time()}"
            )

        self.scheduler = Scheduler(batcher, dispatcher,
                                   list(self.worker_dict.values()), [])
        self.worker_queues = dict()

        self.psql_conn = psql_conn
        self.abyss_metadata = []
        self.s3_conn = s3_conn

        self._unpredicted_preprocessing_thread = threading.Thread(
            target=self._unpredicted_preprocessing, daemon=True)
        self._predictor_thread = threading.Thread(
            target=self._predict_decompressed_size, daemon=True)
        self._scheduler_thread = threading.Thread(
            target=self._thread_schedule_jobs, daemon=True)
        self._prefetcher_thread = threading.Thread(
            target=self._thread_prefetch, daemon=True)
        self._prefetcher_poll_thread = threading.Thread(
            target=self._thread_poll_prefetch, daemon=True)
        self._funcx_process_headers_thread = threading.Thread(
            target=self._thread_funcx_process_headers, daemon=True)
        self._funcx_decompress_thread = threading.Thread(
            target=self._thread_funcx_decompress, daemon=True)
        self._funcx_crawl_thread = threading.Thread(
            target=self._thread_funcx_crawl, daemon=True)
        self._funcx_poll_thread = threading.Thread(
            target=self._thread_funcx_poll, daemon=True)
        self._consolidate_results_thread = threading.Thread(
            target=self._thread_consolidate_crawl_results, daemon=True)
        self._lock = threading.Lock()
        self.thread_statuses = {
            "predictor_thread": True,
            "scheduler_thread": True,
            "prefetcher_thread": True,
            "prefetcher_poll_thread": True,
            "funcx_decompress_thread": True,
            "funcx_crawl_thread": True,
            "funcx_poll_thread": True,
            "consolidate_results_thread": True
        }

        self.funcx_client = FuncXClient()
        self.kill_status = False
        self.crawl_results = Queue()
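
A hedged construction sketch for the initializer above. Endpoint IDs, tokens, and the psql_conn/s3_conn objects are placeholders created elsewhere (a database connection and an S3 client are reasonable guesses, but the source does not say); the dictionary shapes follow the docstring and the worker dictionaries used in the tests.

    import uuid

    # Illustrative only: IDs, tokens, and connection objects are placeholders.
    psql_conn = ...  # assumed: PostgreSQL connection, created elsewhere
    s3_conn = ...    # assumed: S3 client, created elsewhere

    orchestrator = AbyssOrchestrator(
        abyss_id=str(uuid.uuid4()),
        globus_source_eid="<source-globus-endpoint-id>",
        transfer_token="<globus-transfer-token>",
        compressed_files=[
            {"file_path": "/data/archive.zip", "compressed_size": 1024},
        ],
        worker_params=[{
            "globus_eid": "<dest-globus-endpoint-id>",
            "funcx_eid": "<funcx-endpoint-id>",
            "max_available_space": 10 ** 9,
            "transfer_dir": "/transfer",
            "decompress_dir": "/decompress",
        }],
        psql_conn=psql_conn,
        s3_conn=s3_conn,
        batcher="mmd",
        dispatcher="fifo",
        prediction_mode="ml")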
Example #12
    def test_batch(self):
        for batcher_name in BATCHER_NAME_MAPPING.keys():
            for dispatcher_name in DISPATCHER_NAME_MAPPING.keys():
                workers = [
                    Worker.from_dict({
                        "globus_eid": "0",
                        "funcx_eid": "1",
                        "max_available_space": 10,
                        "transfer_dir": "/transfer",
                        "decompress_dir": "/dir",
                    })
                ]
                jobs = [
                    Job.from_dict({
                        "file_path": "/0",
                        "compressed_size": 0,
                        "decompressed_size": 10,
                    }),
                    Job.from_dict({
                        "file_path": "/1",
                        "compressed_size": 0,
                        "decompressed_size": 20,
                    })
                ]

                scheduler = Scheduler(batcher_name, dispatcher_name, workers,
                                      jobs)
                worker_queues = scheduler.worker_queues

                # Making sure only the correct job gets scheduled
                self.assertEqual(len(worker_queues), 1)

                worker_queue_0 = list(
                    worker_queues[workers[0].worker_id].queue)
                self.assertEqual(len(worker_queue_0), 1)
                self.assertEqual(worker_queue_0[0].decompressed_size, 10)

                self.assertTrue(
                    worker_queue_0[0] not in scheduler._batcher.jobs)

                # Making sure no jobs get batched twice
                scheduler.schedule_jobs([])

                new_worker_queues = scheduler.worker_queues

                self.assertEqual(len(new_worker_queues), 1)

                new_worker_queue_0 = list(
                    new_worker_queues[workers[0].worker_id].queue)
                self.assertEqual(len(new_worker_queue_0), 1)
                self.assertEqual(new_worker_queue_0[0].decompressed_size, 10)

                # Making sure jobs are appropriately batched after freeing space
                workers[0].curr_available_space += 50

                scheduler.schedule_jobs([])

                new_worker_queues_1 = scheduler.worker_queues

                self.assertEqual(len(new_worker_queues_1), 1)

                new_worker_queue_1 = list(
                    new_worker_queues_1[workers[0].worker_id].queue)
                self.assertEqual(len(new_worker_queue_1), 2)
                self.assertEqual(new_worker_queue_1[0].decompressed_size, 10)
                self.assertEqual(new_worker_queue_1[1].decompressed_size, 20)