sys_path_add="/",
                                   module_path=f"xtract_{extractor_name}_main",
                                   metadata_write_path=f'/home/tskluzac/{extractor_name}-{repo_name}-completed',
                                   writer='json-np')  # TODO: make an arg.

        current_batch.append(payload)

        # print(current_batch)
        print(f"Len Current Batch: {len(current_batch)}")

    for item in current_batch:
        batch.add(item, endpoint_id=ep_id, function_id=fn_uuid)

    # List of task_ids
    ts = time.time()
    batch_res = fxc.batch_run(batch)
    te = time.time()

    print(f"Total request round-trip time: {te-ts}")

    for item in batch_res:
        poll_queue.put(item)

    batch = fxc.create_batch()  # empty batch.

    # TODO: make this much better.
    print("Moving to phase 2...")
    time.sleep(0.5)


# Cleanup the 'non-full-last-batch-stragglers'
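
# A minimal sketch of that straggler cleanup, assuming the same names as the
# loop above (`fxc`, `ep_id`, `fn_uuid`, `poll_queue`); the original flush
# code is not shown here, so this wiring is illustrative only.
def flush_stragglers(fxc, current_batch, ep_id, fn_uuid, poll_queue):
    """Submit any leftover payloads that never filled a complete batch."""
    if not current_batch:
        return
    batch = fxc.create_batch()
    for item in current_batch:
        batch.add(item, endpoint_id=ep_id, function_id=fn_uuid)
    # batch_run returns the list of task IDs for the submitted batch.
    for task_id in fxc.batch_run(batch):
        poll_queue.put(task_id)
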
# Example #2

class AbyssOrchestrator:
    def __init__(self,
                 abyss_id: str,
                 globus_source_eid: str,
                 transfer_token: str,
                 compressed_files: List[Dict],
                 worker_params: List[Dict],
                 psql_conn,
                 s3_conn,
                 grouper="",
                 batcher="mmd",
                 dispatcher="fifo",
                 prediction_mode="ml"):
        """Abyss orchestrator class.
        Parameters
        ----------
        abyss_id : str
            Abyss ID for orchestration.
        globus_source_eid : str
            Globus endpoint of source data storage.
        transfer_token : str
            Globus token to authorize transfers between endpoints.
        compressed_files : list(dict)
            List of dictionaries for compressed files to process.
            Dictionaries contain "file_path" and "compressed_size".
        worker_params : list(dict)
            List of valid worker parameter dictionaries to create
            workers.
        psql_conn :
            PostgreSQL connection object to update status.
        s3_conn :
            S3 connection object used to upload consolidated metadata.
        grouper : str
            Name of grouper to use when crawling.
        batcher : str
            Name of batcher to use.
        dispatcher : str
            Name of dispatchers to use.
        prediction_mode: str
            Mode of prediction to use to predict decompressed file size.
            "ml" to use machine learning method or "header" to use
            metadata stored in the header of compressed files (where
            possible).
        """
        self.abyss_id = abyss_id
        self.globus_source_eid = globus_source_eid
        self.transfer_token = transfer_token
        self.grouper = grouper
        self.prediction_mode = prediction_mode

        self.worker_dict = dict()
        for worker_param in worker_params:
            worker = Worker.from_dict(worker_param)
            self.worker_dict[worker.worker_id] = worker

        self.prefetchers = dict()
        for worker in self.worker_dict.values():
            globus_dest_eid = worker.globus_eid
            transfer_dir = worker.transfer_dir
            prefetcher = GlobusPrefetcher(self.transfer_token,
                                          self.globus_source_eid,
                                          globus_dest_eid, transfer_dir, 4)

            self.prefetchers[worker.worker_id] = prefetcher

        self.predictors = dict()
        for file_type, predictor in FILE_PREDICTOR_MAPPING.items():
            file_predictor = predictor()
            file_predictor.load_models()
            self.predictors[file_type] = file_predictor

        self.job_statuses = {status: Queue() for status in JobStatus}
        unpredicted_queue = self.job_statuses[JobStatus.UNPREDICTED]
        for compressed_file in compressed_files:
            job = Job.from_dict(compressed_file)
            job.status = JobStatus.UNPREDICTED
            job.file_id = str(uuid.uuid4())
            job.decompressed_size = 0
            unpredicted_queue.put(job)
            logger.info(
                f"LATENCY PLACING {job.file_id} INTO UNPREDICTED AT {time.time()}"
            )

        self.scheduler = Scheduler(batcher, dispatcher,
                                   list(self.worker_dict.values()), [])
        self.worker_queues = dict()

        self.psql_conn = psql_conn
        self.abyss_metadata = []
        self.s3_conn = s3_conn

        self._unpredicted_preprocessing_thread = threading.Thread(
            target=self._unpredicted_preprocessing, daemon=True)
        self._predictor_thread = threading.Thread(
            target=self._predict_decompressed_size, daemon=True)
        self._scheduler_thread = threading.Thread(
            target=self._thread_schedule_jobs, daemon=True)
        self._prefetcher_thread = threading.Thread(
            target=self._thread_prefetch, daemon=True)
        self._prefetcher_poll_thread = threading.Thread(
            target=self._thread_poll_prefetch, daemon=True)
        self._funcx_process_headers_thread = threading.Thread(
            target=self._thread_funcx_process_headers, daemon=True)
        self._funcx_decompress_thread = threading.Thread(
            target=self._thread_funcx_decompress, daemon=True)
        self._funcx_crawl_thread = threading.Thread(
            target=self._thread_funcx_crawl, daemon=True)
        self._funcx_poll_thread = threading.Thread(
            target=self._thread_funcx_poll, daemon=True)
        self._consolidate_results_thread = threading.Thread(
            target=self._thread_consolidate_crawl_results, daemon=True)
        self._lock = threading.Lock()
        self.thread_statuses = {
            "unpredicted_preprocessing_thread": True,
            "predictor_thread": True,
            "scheduler_thread": True,
            "prefetcher_thread": True,
            "prefetcher_poll_thread": True,
            "funcx_processing_headers_thread": True,
            "funcx_decompress_thread": True,
            "funcx_crawl_thread": True,
            "funcx_poll_thread": True,
            "consolidate_results_thread": True
        }

        self.funcx_client = FuncXClient()
        self.kill_status = False
        self.crawl_results = Queue()

    @staticmethod
    def validate_dict_params(orchestrator_params: Dict) -> None:
        """Ensures dictionary of orchestrator parameters contains
        necessary parameters.
        Parameters
        ----------
        orchestrator_params : dict
            Dictionary containing parameters for AbyssOrchestrator
            object.
        Returns
        -------
            Returns None if parameters are valid, raises error if
            invalid.
        """
        try:
            for parameter_name, parameter_type in REQUIRED_ORCHESTRATOR_PARAMETERS:
                parameter = orchestrator_params[parameter_name]
                assert isinstance(parameter, parameter_type)
        except AssertionError:
            raise ValueError(
                f"Parameter {parameter_name} is not of type {parameter_type}")
        except KeyError:
            raise ValueError(f"Required parameter {parameter_name} not found")

        worker_params = orchestrator_params["worker_params"]
        for worker_param in worker_params:
            Worker.validate_dict_params(worker_param)

    def start(self) -> None:
        threading.Thread(target=self._orchestrate).start()

    def _update_kill_status(self) -> None:
        """Sets kill_status to True once every job has either succeeded
        or failed and all worker threads are idle.
        Returns
        -------
        None
        """
        for status in JobStatus:
            if status in (JobStatus.SUCCEEDED, JobStatus.FAILED):
                continue
            if not self.job_statuses[status].empty():
                self.kill_status = False
                return

        for status in self.thread_statuses.values():
            if status:
                self.kill_status = False
                return

        self.kill_status = True
        logger.info(f"KILL STATUS {self.kill_status}")

    def _update_psql_entry(self) -> None:
        """Updates a PostgreSQL entry with orchestration status. Assumes
        that a table entry has already been created.
        Returns
        -------
        None
        """
        table_entry = dict()

        for job_status, job_queue in self.job_statuses.items():
            table_entry[job_status.value.lower()] = job_queue.qsize()

        logger.info(table_entry)
        logger.info(self.thread_statuses)

        for worker in self.worker_dict.values():
            logger.info(
                f"{worker.worker_id} has {worker.curr_available_space}")

        update_table_entry(self.psql_conn, "abyss_status",
                           {"abyss_id": self.abyss_id}, **table_entry)

    def _orchestrate(self) -> None:
        """
        Step 1: Predict sizes of jobs using ML predictors
        Step 2: Batch jobs to worker using Batchers
        Step 3: Begin transferring files one at a time to each worker using
        one Prefetcher item per worker.
        Step 4: Constantly poll prefetcher for file completion.
        Step 5: When a file is done, send a funcx job request to crawl on worker
        Step 6: Poll funcx result
        Step 7: Consolidate crawl results and stage metadata for S3 upload
        Returns
        -------
        None
        """
        logger.info("STARTING ORCHESTRATION")
        self._unpredicted_preprocessing_thread.start()
        self._predictor_thread.start()
        self._scheduler_thread.start()
        self._prefetcher_thread.start()
        self._prefetcher_poll_thread.start()
        self._funcx_process_headers_thread.start()
        self._funcx_decompress_thread.start()
        self._funcx_crawl_thread.start()
        self._funcx_poll_thread.start()
        self._consolidate_results_thread.start()

        t0 = time.time()
        while not self.kill_status:
            time.sleep(1)
            self._update_kill_status()
            self._update_psql_entry()
            logger.info(f"ELAPSED: {time.time() - t0}")

        self._unpredicted_preprocessing_thread.join()
        self._predictor_thread.join()
        self._scheduler_thread.join()
        self._prefetcher_thread.join()
        self._prefetcher_poll_thread.join()
        self._funcx_process_headers_thread.join()
        self._funcx_decompress_thread.join()
        self._funcx_crawl_thread.join()
        self._funcx_poll_thread.join()
        self._consolidate_results_thread.join()

        logger.info(f"PUSHING METADATA TO S3")

        # logger.info(metadata)

        metadata_file_path = os.path.join("/tmp", f"{self.abyss_id}.txt")

        with open(metadata_file_path, "w") as f:
            f.writelines("\n".join(
                [json.dumps(metadata) for metadata in self.abyss_metadata]))

        s3_upload_file(self.s3_conn, "xtract-abyss", metadata_file_path,
                       f"{self.abyss_id}.txt")

        os.remove(metadata_file_path)

    def _unpredicted_preprocessing(self) -> None:
        """Determines whether to use machine learning or file headers
        for decompressed size prediction and places jobs into respective
        queues.

        Returns
        -------
        None
        """
        while not self.kill_status:
            unpredicted_queue = self.job_statuses[JobStatus.UNPREDICTED]
            unpredicted_predict_queue = self.job_statuses[
                JobStatus.UNPREDICTED_PREDICT]
            unpredicted_schedule_queue = self.job_statuses[
                JobStatus.UNPREDICTED_SCHEDULE]

            while not unpredicted_queue.empty():
                self.thread_statuses["unpredicted_preprocessing_thread"] = True
                job = unpredicted_queue.get()

                # If a file is recursively compressed we will use machine learning to predict the file size.
                # We only use file headers if the compressed file is directly stored on our storage source.
                if self.prediction_mode == "ml" or job.status != JobStatus.UNPREDICTED:
                    if job.status == JobStatus.UNPREDICTED:
                        job.status = JobStatus.UNPREDICTED_PREDICT
                    unpredicted_predict_queue.put(job)
                    logger.info(
                        f"PLACING {job.file_path} IN UNPREDICTED PREDICT")
                elif self.prediction_mode == "header":
                    if job.file_path.endswith((".zip", ".tar")):
                        job.status = JobStatus.UNPREDICTED_SCHEDULE
                        unpredicted_schedule_queue.put(job)
                        logger.info(
                            f"PLACING {job.file_path} IN UNPREDICTED SCHEDULE")
                    else:
                        unpredicted_predict_queue.put(job)
                        logger.info(
                            f"PLACING {job.file_path} IN UNPREDICTED PREDICT")
                else:
                    self.kill_status = True
                    raise ValueError(
                        f"Unknown prediction mode \"{self.prediction_mode}\"")

                self.thread_statuses[
                    "unpredicted_preprocessing_thread"] = False

    def _predict_decompressed_size(self) -> None:
        """Runs decompression size predictions on all files in
        self.compressed_files and then places them in
        self.predicted_files.

        Returns
        -------
        None
        """
        while not self.kill_status:
            unpredicted_queue = self.job_statuses[
                JobStatus.UNPREDICTED_PREDICT]
            predicted_queue = self.job_statuses[JobStatus.PREDICTED]

            while not unpredicted_queue.empty():
                self.thread_statuses["predictor_thread"] = True
                job = unpredicted_queue.get()

                for job_node in job.bfs_iterator(include_root=True):
                    if job_node.status in [
                            JobStatus.UNPREDICTED,
                            JobStatus.UNPREDICTED_PREDICT
                    ]:
                        file_path = job_node.file_path
                        file_extension = Predictor.get_extension(file_path)

                        predictor = self.predictors[file_extension]

                        if job_node.decompressed_size:
                            decompressed_size = predictor.repredict(
                                job_node.decompressed_size)
                            logger.info(
                                f"REPREDICTED {job.file_path} WITH DECOMPRESSED SIZE {decompressed_size}"
                            )
                        else:
                            compressed_size = job_node.compressed_size
                            decompressed_size = predictor.predict(
                                file_path, compressed_size)
                            logger.info(
                                f"PREDICTED {job.file_path} WITH DECOMPRESSED SIZE {decompressed_size}"
                            )

                        with self._lock:
                            job_node.decompressed_size = decompressed_size
                            job_node.status = JobStatus.PREDICTED

                logger.info(
                    f"LATENCY PLACING {job.file_id} INTO PREDICTED AT {time.time()}"
                )
                predicted_queue.put(job)

            self.thread_statuses["predictor_thread"] = False

    def _thread_schedule_jobs(self) -> None:
        """Schedules items from self.predicted_files into
        worker queues in self.worker_queues.
        Returns
        -------
        None
        """
        while not self.kill_status:
            predicted_queue = self.job_statuses[JobStatus.PREDICTED]
            unpredicted_schedule_queue = self.job_statuses[
                JobStatus.UNPREDICTED_SCHEDULE]
            unpredicted_scheduled_queue = self.job_statuses[
                JobStatus.UNPREDICTED_SCHEDULED]
            scheduled_queue = self.job_statuses[JobStatus.SCHEDULED]
            failed_queue = self.job_statuses[JobStatus.FAILED]

            with self._lock:
                predicted_list = []
                while not predicted_queue.empty():
                    self.thread_statuses["scheduler_thread"] = True
                    job = predicted_queue.get()
                    logger.info(f"{job.file_path} SCHEDULING")
                    job.calculate_total_size()
                    predicted_list.append(job)

                while not unpredicted_schedule_queue.empty():
                    self.thread_statuses["scheduler_thread"] = True
                    job = unpredicted_schedule_queue.get()
                    logger.info(f"{job.file_path} UNPREDICTED SCHEDULING")
                    job.calculate_total_size()
                    predicted_list.append(job)

                self.scheduler.schedule_jobs(predicted_list)

                self.worker_queues = self.scheduler.worker_queues
                failed_jobs = self.scheduler.failed_jobs

                for job in predicted_list:
                    queue = None  # reset per job so a prior job's routing can't leak
                    for job_node in job.bfs_iterator(include_root=True):
                        if job_node in failed_jobs:
                            job_node.status = JobStatus.FAILED
                            job_node.error = "Could not schedule"
                            logger.info(f"FAILED TO SCHEDULE {job.file_path}")
                        elif job_node.status == JobStatus.PREDICTED:
                            job_node.status = JobStatus.SCHEDULED
                            queue = JobStatus.SCHEDULED
                        elif job_node.status == JobStatus.UNPREDICTED_SCHEDULE:
                            job_node.status = JobStatus.UNPREDICTED_SCHEDULED
                            queue = JobStatus.UNPREDICTED_SCHEDULED

                    if queue:
                        if queue == JobStatus.SCHEDULED:
                            logger.info(
                                f"LATENCY PLACING {job.file_id} INTO SCHEDULED AT {time.time()}"
                            )
                            scheduled_queue.put(job)
                            logger.info(f"{job.file_path} SCHEDULED")
                        elif queue == JobStatus.UNPREDICTED_SCHEDULED:
                            unpredicted_scheduled_queue.put(job)
                            logger.info(
                                f"{job.file_path} UNPREDICTED SCHEDULED")
                    else:
                        logger.info(
                            f"LATENCY PLACING {job.file_id} INTO FAILED AT {time.time()}"
                        )
                        logger.info(f"{job.file_path} PLACED INTO FAILED")
                        failed_queue.put(job)
                self.thread_statuses["scheduler_thread"] = False

    def _thread_prefetch(self) -> None:
        """Places jobs into queue for prefetcher to transfer.
        Returns
        -------
        None
        """
        while not self.kill_status:
            scheduled_queue = self.job_statuses[JobStatus.SCHEDULED]
            unpredicted_scheduled_queue = self.job_statuses[
                JobStatus.UNPREDICTED_SCHEDULED]
            prefetching_queue = self.job_statuses[JobStatus.PREFETCHING]
            unpredicted_prefetching_queue = self.job_statuses[
                JobStatus.UNPREDICTED_PREFETCHING]

            with self._lock:
                for worker_id, worker_queue in self.worker_queues.items():
                    prefetcher = self.prefetchers[worker_id]
                    jobs_to_prefetch = []

                    while len(worker_queue):
                        self.thread_statuses["prefetcher_thread"] = True
                        job = worker_queue.popleft()
                        logger.info(f"{job.file_path} PREFETCHING")

                        worker_id = job.worker_id

                        jobs_to_prefetch.append(job)
                        job.transfer_path = f"{self.worker_dict[worker_id].transfer_dir}/{job.file_id}"

                        for job_node in job.bfs_iterator(include_root=True):
                            if job_node.status == JobStatus.SCHEDULED:
                                job_node.status = JobStatus.PREFETCHING
                            elif job_node.status == JobStatus.UNPREDICTED_SCHEDULED:
                                job_node.status = JobStatus.UNPREDICTED_PREFETCHING

                        if job.status == JobStatus.UNPREDICTED_PREFETCHING:
                            unpredicted_prefetching_queue.put(job)
                            unpredicted_scheduled_queue.get()
                            logger.info(
                                f"{job.file_path} PLACED INTO UNPREDICTED PREFETCHING"
                            )
                        else:
                            prefetching_queue.put(job)
                            scheduled_queue.get()
                            logger.info(
                                f"{job.file_path} PLACED INTO PREFETCHING")

                    prefetcher.transfer_job_batch(jobs_to_prefetch)

                    for job in jobs_to_prefetch:
                        logger.info(
                            f"LATENCY PLACING {job.file_id} INTO PREFETCHING AT {time.time()}"
                        )

            self.thread_statuses["prefetcher_thread"] = False
            time.sleep(4)

    def _thread_poll_prefetch(self) -> None:
        """Thread function to poll prefetcher and update
        self.job_statuses.
        Returns
        -------
        None
        """
        while not self.kill_status:
            prefetching_queue = self.job_statuses[JobStatus.PREFETCHING]
            unpredicted_prefetching_queue = self.job_statuses[
                JobStatus.UNPREDICTED_PREFETCHING]
            unpredicted_prefetched_queue = self.job_statuses[
                JobStatus.UNPREDICTED_PREFETCHED]
            prefetched_queue = self.job_statuses[JobStatus.PREFETCHED]
            failed_queue = self.job_statuses[JobStatus.FAILED]

            for _ in range(prefetching_queue.qsize() +
                           unpredicted_prefetching_queue.qsize()):
                self.thread_statuses["prefetcher_poll_thread"] = True

                if prefetching_queue.empty():
                    job = unpredicted_prefetching_queue.get()
                else:
                    job = prefetching_queue.get()

                logger.info(f"{job.file_path} POLL PREFETCH")
                file_path = job.file_path
                worker_id = job.worker_id
                prefetcher = self.prefetchers[worker_id]

                prefetcher_status = prefetcher.get_transfer_status(file_path)
                if prefetcher_status == PrefetcherStatuses.SUCCEEDED:
                    for job_node in job.bfs_iterator(include_root=True):
                        if job_node.status == JobStatus.PREFETCHING:
                            job_node.status = JobStatus.PREFETCHED
                        elif job_node.status == JobStatus.UNPREDICTED_PREFETCHING:
                            job_node.status = JobStatus.UNPREDICTED_PREFETCHED

                    if job.status == JobStatus.UNPREDICTED_PREFETCHED:
                        unpredicted_prefetched_queue.put(job)
                        logger.info(
                            f"{job.file_path} PLACED INTO UNPREDICTED PREFETCHED"
                        )
                    else:
                        prefetched_queue.put(job)
                        logger.info(
                            f"LATENCY PLACING {job.file_id} INTO PREFETCHED AT {time.time()}"
                        )
                        logger.info(f"{job.file_path} PLACED INTO PREFETCHED")
                elif prefetcher_status == PrefetcherStatuses.FAILED:
                    for job_node in job.bfs_iterator(include_root=True):
                        if job_node.status == JobStatus.PREFETCHING or job_node.status == JobStatus.UNPREDICTED_PREFETCHING:
                            job_node.status = JobStatus.FAILED
                    logger.info(f"{job.file_path} FAILED TO PREFETCH")
                    # Potentially add more logic here or in prefetcher to restart failed transfer
                    failed_queue.put(job)
                else:
                    if job.status == JobStatus.UNPREDICTED_PREFETCHING:
                        unpredicted_prefetching_queue.put(job)
                    else:
                        prefetching_queue.put(job)

            self.thread_statuses["prefetcher_poll_thread"] = False
            time.sleep(5)

    def _thread_funcx_process_headers(self) -> None:
        """Thread function to submit header processing tasks to funcX.

        Returns
        -------
        None
        """
        while not self.kill_status:
            unpredicted_prefetched_queue = self.job_statuses[
                JobStatus.UNPREDICTED_PREFETCHED]
            processing_headers_queue = self.job_statuses[
                JobStatus.PROCESSING_HEADERS]

            batch = self.funcx_client.create_batch()
            batched_jobs = []
            while not unpredicted_prefetched_queue.empty():
                self.thread_statuses["funcx_processing_headers_thread"] = True
                job = unpredicted_prefetched_queue.get()
                logger.info(f"{job.file_path} PROCESSING HEADERS")
                job_dict = Job.to_dict(job)
                worker_id = job.worker_id

                worker = self.worker_dict[worker_id]
                batch.add(job_dict,
                          endpoint_id=worker.funcx_eid,
                          function_id=PROCESS_HEADER_FUNCX_UUID)
                batched_jobs.append(job)

            if len(batch.tasks) > 0:
                batch_res = self.funcx_client.batch_run(batch)
            else:
                batch_res = None

            for idx, job in enumerate(batched_jobs):
                job.funcx_process_headers_id = batch_res[idx]
                job.status = JobStatus.PROCESSING_HEADERS

                processing_headers_queue.put(job)
                logger.info(f"{job.file_path} PROCESSING HEADERS QUEUE")

            time.sleep(5)

            self.thread_statuses["funcx_processing_headers_thread"] = False

    # TODO: Consolidate this and _thread_funcx_crawl into one function
    def _thread_funcx_decompress(self) -> None:
        """Thread function to submit decompression tasks to funcX.

        Returns
        -------
        None
        """
        while not self.kill_status:
            prefetched_queue = self.job_statuses[JobStatus.PREFETCHED]
            decompressing_queue = self.job_statuses[JobStatus.DECOMPRESSING]

            batch = self.funcx_client.create_batch()
            batched_jobs = []
            while not prefetched_queue.empty():
                self.thread_statuses["funcx_decompress_thread"] = True
                job = prefetched_queue.get()
                job_dict = Job.to_dict(job)
                worker_id = job.worker_id

                worker = self.worker_dict[worker_id]
                batch.add(job_dict,
                          worker.decompress_dir,
                          endpoint_id=worker.funcx_eid,
                          function_id=DECOMPRESSOR_FUNCX_UUID)
                batched_jobs.append(job)

            if len(batch.tasks) > 0:
                batch_res = self.funcx_client.batch_run(batch)
            else:
                batch_res = None

            for idx, job in enumerate(batched_jobs):
                logger.info(f"{job.file_path} DECOMPRESSING")
                for job_node in job.bfs_iterator(include_root=True):
                    job_node.funcx_decompress_id = batch_res[idx]
                    if job_node.status == JobStatus.PREFETCHED:
                        job_node.status = JobStatus.DECOMPRESSING

                decompressing_queue.put(job)
                logger.info(
                    f"LATENCY PLACING {job.file_id} INTO DECOMPRESSING AT {time.time()}"
                )

            time.sleep(5)

            self.thread_statuses["funcx_decompress_thread"] = False

    def _thread_funcx_crawl(self) -> None:
        """Thread function to submit crawl tasks to funcX.
        Returns
        -------
        None
        """
        while not self.kill_status:
            decompressed_queue = self.job_statuses[JobStatus.DECOMPRESSED]
            crawling_queue = self.job_statuses[JobStatus.CRAWLING]

            batch = self.funcx_client.create_batch()
            batched_jobs = []
            while not decompressed_queue.empty():
                self.thread_statuses["funcx_crawl_thread"] = True
                job = decompressed_queue.get()
                logger.info(f"{job.file_path} CRAWLING")
                job_dict = Job.to_dict(job)
                worker_id = job.worker_id

                worker = self.worker_dict[worker_id]
                batch.add(job_dict,
                          "",
                          endpoint_id=worker.funcx_eid,
                          function_id=LOCAL_CRAWLER_FUNCX_UUID)
                batched_jobs.append(job)

            if len(batch.tasks) > 0:
                batch_res = self.funcx_client.batch_run(batch)
            else:
                batch_res = None

            for idx, job in enumerate(batched_jobs):
                logger.info(
                    f"LATENCY PLACING {job.file_id} INTO CRAWLING AT {time.time()}"
                )
                for job_node in job.bfs_iterator(include_root=True):
                    job_node.funcx_crawl_id = batch_res[idx]
                    if job_node.status == JobStatus.DECOMPRESSED:
                        job_node.status = JobStatus.CRAWLING

                crawling_queue.put(job)

            time.sleep(5)

            self.thread_statuses["funcx_crawl_thread"] = False

    def _thread_funcx_poll(self) -> None:
        """Thread function to poll funcX for results.

        Returns
        -------
        None
        """
        unpredicted_queue = self.job_statuses[JobStatus.UNPREDICTED]
        decompressing_queue = self.job_statuses[JobStatus.DECOMPRESSING]
        decompressed_queue = self.job_statuses[JobStatus.DECOMPRESSED]
        crawling_queue = self.job_statuses[JobStatus.CRAWLING]
        processing_headers_queue = self.job_statuses[
            JobStatus.PROCESSING_HEADERS]
        predicted_queue = self.job_statuses[JobStatus.PREDICTED]
        consolidating_queue = self.job_statuses[JobStatus.CONSOLIDATING]
        failed_queue = self.job_statuses[JobStatus.FAILED]

        while not self.kill_status:
            processing_headers_funcx_ids = []
            processing_header_jobs = []
            while not processing_headers_queue.empty():
                self.thread_statuses["funcx_poll_thread"] = True
                job = processing_headers_queue.get()
                logger.info(f"{job.file_path} POLLING HEADER PROCESSING")
                processing_headers_funcx_ids.append(
                    job.funcx_process_headers_id)
                processing_header_jobs.append(job)

            processing_headers_statuses = self.funcx_client.get_batch_status(
                task_id_list=processing_headers_funcx_ids)
            for job in processing_header_jobs:
                worker = self.worker_dict[job.worker_id]
                job_status = processing_headers_statuses[
                    job.funcx_process_headers_id]

                if job_status["pending"]:
                    processing_headers_queue.put(job)
                elif job_status["status"] == "success":
                    logger.info(f"{job.file_path} COMPLETED HEADER PROCESSING")
                    job = Job.from_dict(job_status["result"])
                    job.status = JobStatus.PREDICTED

                    worker.curr_available_space += job.compressed_size
                    predicted_queue.put(job)
                elif job_status["status"] == "failed":
                    worker.curr_available_space += job.compressed_size
                    unpredicted_predict_queue = self.job_statuses[
                        JobStatus.UNPREDICTED_PREDICT]
                    job.status = JobStatus.UNPREDICTED_PREDICT
                    unpredicted_predict_queue.put(job)

            time.sleep(5)

            decompressing_funcx_ids = []
            decompressing_jobs = []
            while not decompressing_queue.empty():
                self.thread_statuses["funcx_poll_thread"] = True
                job = decompressing_queue.get()
                logger.info(f"{job.file_path} POLLING DECOMPRESS")
                decompressing_funcx_ids.append(job.funcx_decompress_id)
                decompressing_jobs.append(job)

            decompressing_statuses = self.funcx_client.get_batch_status(
                decompressing_funcx_ids)
            for job in decompressing_jobs:
                worker = self.worker_dict[job.worker_id]
                job_status = decompressing_statuses[job.funcx_decompress_id]
                logger.info(job_status)

                if job_status["pending"]:
                    decompressing_queue.put(job)
                elif job_status["status"] == "success":
                    job = Job.from_dict(job_status["result"])
                    logger.info(f"{job.file_path} COMPLETED DECOMPRESS")

                    if job.status == JobStatus.FAILED:
                        worker.curr_available_space += job.total_size
                        failed_queue.put(job)
                        logger.info(f"{job.file_path} PLACED INTO FAILED")
                        logger.info(
                            f"LATENCY PLACING {job.file_id} INTO FAILED AT {time.time()}"
                        )
                        continue

                    has_unpredicted = False
                    for job_node in job.bfs_iterator(include_root=True):
                        if job_node.status == JobStatus.DECOMPRESSING:
                            job_node.status = JobStatus.DECOMPRESSED
                        elif job_node.status == JobStatus.UNPREDICTED:
                            has_unpredicted = True

                    if has_unpredicted:
                        unpredicted_queue.put(job)
                        logger.info(
                            f"LATENCY PLACING {job.file_id} INTO UNPREDICTED AT {time.time()}"
                        )
                        logger.info(f"{job.file_path} PLACED INTO UNPREDICTED")

                    worker.curr_available_space += job.compressed_size

                    decompressed_queue.put(job)
                    logger.info(
                        f"LATENCY PLACING {job.file_id} INTO DECOMPRESSED AT {time.time()}"
                    )
                    logger.info(f"{job.file_path} PLACED INTO DECOMPRESSED")
                elif job_status["status"] == "failed":
                    worker.curr_available_space += job.compressed_size
                    logger.info(
                        f"ERROR for {job.file_path}: {job_status['exception']}"
                    )
                    logger.info(f"{job.file_path} PLACED INTO FAILED")
                    failed_queue.put(job)
                    logger.info(
                        f"LATENCY PLACING {job.file_id} INTO FAILED AT {time.time()}"
                    )

            time.sleep(5)

            crawling_funcx_ids = []
            crawling_jobs = []
            while not crawling_queue.empty():
                self.thread_statuses["funcx_poll_thread"] = True
                job = crawling_queue.get()
                logger.info(f"{job.file_path} POLLING CRAWL")
                crawling_funcx_ids.append(job.funcx_crawl_id)
                crawling_jobs.append(job)

            crawling_statuses = self.funcx_client.get_batch_status(
                crawling_funcx_ids)
            for job in crawling_jobs:
                worker = self.worker_dict[job.worker_id]
                job_status = crawling_statuses[job.funcx_crawl_id]

                if job_status["pending"]:
                    crawling_queue.put(job)
                elif job_status["status"] == "success":
                    result = job_status["result"]
                    job = Job.from_dict(result)
                    logger.info(f"{job.file_path} COMPLETED CRAWL")

                    for job_node in job.bfs_iterator(include_root=True):
                        if job_node.status == JobStatus.CRAWLING:
                            job_node.status = JobStatus.CONSOLIDATING

                    worker.curr_available_space += (job.total_size -
                                                    job.compressed_size)
                    consolidating_queue.put(job)
                    logger.info(
                        f"LATENCY PLACING {job.file_id} INTO CONSOLIDATING AT {time.time()}"
                    )
                    logger.info(f"{job.file_path} PLACED INTO CONSOLIDATING")
                elif job_status["status"] == "failed":
                    worker.curr_available_space += (job.total_size -
                                                    job.compressed_size)
                    failed_queue.put(job)
                    logger.info(f"{job.file_path} PLACED INTO FAILED")
                    logger.info(
                        f"LATENCY PLACING {job.file_id} INTO FAILED AT {time.time()}"
                    )

            time.sleep(5)

            self.thread_statuses["funcx_poll_thread"] = False

    def _thread_consolidate_crawl_results(self) -> None:
        """Thread function to consolidate crawl results and push to SQS.
        Returns
        -------
        None
        """
        while not self.kill_status:
            unpredicted_queue = self.job_statuses[JobStatus.UNPREDICTED]
            consolidating_queue = self.job_statuses[JobStatus.CONSOLIDATING]
            succeeded_queue = self.job_statuses[JobStatus.SUCCEEDED]
            failed_queue = self.job_statuses[JobStatus.FAILED]

            while not consolidating_queue.empty():
                self.thread_statuses["consolidate_results_thread"] = True
                job = consolidating_queue.get()
                logger.info(f"{job.file_path} CONSOLIDATING")

                resubmit_task = False
                for job_node in job.bfs_iterator(include_root=True):
                    root_path = job_node.metadata["root_path"]
                    for file_path, file_metadata in job_node.metadata[
                            "metadata"].items():
                        file_size = file_metadata["physical"]["size"]
                        is_compressed = file_metadata["physical"][
                            "is_compressed"]

                        child_file_path = os.path.join(root_path, file_path)

                        if is_compressed:
                            if "decompressed_size" in file_metadata[
                                    "physical"]:
                                decompressed_size = file_metadata["physical"][
                                    "decompressed_size"]
                            else:
                                decompressed_size = None
                            if child_file_path in job_node.child_jobs:
                                break
                            else:
                                child_job = Job(file_path=child_file_path,
                                                file_id=f"{str(uuid.uuid4())}",
                                                compressed_size=file_size)

                                if decompressed_size:
                                    child_job.decompressed_size = decompressed_size
                                    child_job.status = JobStatus.PREDICTED
                                else:
                                    child_job.status = JobStatus.UNPREDICTED

                                job_node.child_jobs[
                                    child_file_path] = child_job
                                resubmit_task = True

                if resubmit_task:
                    logger.info(f"{job.file_path} RESUBMITTING")
                    unpredicted_queue.put(job)
                    logger.info(
                        f"LATENCY PLACING {job.file_id} INTO UNPREDICTED AT {time.time()}"
                    )
                    continue

                consolidated_metadata = job.consolidate_metadata()
                self.abyss_metadata.append(consolidated_metadata)

                for job_node in job.bfs_iterator(include_root=True):
                    if job_node.status == JobStatus.CONSOLIDATING:
                        job_node.status = JobStatus.SUCCEEDED

                succeeded_queue.put(job)
                logger.info(f"{job.file_path} PLACED INTO SUCCEEDED")
                logger.info(
                    f"LATENCY PLACING {job.file_id} INTO SUCCEEDED AT {time.time()}"
                )

            while not failed_queue.empty():
                job = failed_queue.get()
                logger.info(f"{job.file_path} CONSOLIDATING FROM FAILED")
                consolidated_metadata = job.consolidate_metadata()
                self.abyss_metadata.append(consolidated_metadata)
                succeeded_queue.put(job)
                logger.info(
                    f"LATENCY PLACING {job.file_id} INTO SUCCEEDED AT {time.time()}"
                )

            self.thread_statuses["consolidate_results_thread"] = False
class test_orch():
    def __init__(self):
        self.current_tasks_on_ep = 0
        self.max_tasks_on_ep = file_cutoff  # IF SET TO FILE_CUTOFF, THEN THIS IS THE MAX.
        self.fxc = FuncXClient()

        self.funcx_batches = Queue()
        self.polling_queue = Queue()

        self.num_poll_reqs = 0
        self.num_send_reqs = 0

        self.total_families_sent = 0

        self.successes = 0
        self.failures = 0

        self.max_outstanding_tasks = max_outstanding_tasks

        self.family_queue = Queue()

        self.fam_batches = []

        # big_json = "/home/ubuntu/old_xtracthub-service/experiments/tyler_everything.json"
        # big_json = "/Users/tylerskluzacek/Desktop/tyler_everything.json"

        import os
        print(os.getcwd())

        #big_json = "../experiments/tyler_30k.json"
        big_json = "experiments/tyler_200k.json"
        # big_json = "/Users/tylerskluzacek/PyCharmProjects/xtracthub-service/experiments/tyler_20k.json"

        t0 = time.time()
        with open(big_json, 'r') as f:
            self.fam_list = json.load(f)

        print(f"Number of famlilies in fam_list: {len(self.fam_list)}")
        t1 = time.time()

        print(f"Time to load families: {t1-t0}")
        time.sleep(5)  # Time to read!!!

        # Transfer the stored list to a queue to promote good concurrency while making batches.
        for i, item in enumerate(self.fam_list):
            if i < skip_n:  # skip the first skip_n families
                continue
            self.family_queue.put(item)

        self.start_time = time.time()

        self.preproc_fam_batches()

        print(f"Number of funcX batches: {self.funcx_batches.qsize()}")
        # exit()

    def path_converter(self, family_id, old_path):
        path_ls = old_path.split('/')
        file_name = path_ls[-1]
        new_path = None
        if system == "midway2":
            new_path = f"/project2/chard/skluzacek/data_to_process/{family_id}/{file_name}"
        elif system == "theta":
            new_path = f"/projects/CSC249ADCD01/skluzacek{old_path}"  #TODO: change this for things
        elif system == "js":
            new_path = f"/home/tskluzac/{family_id}/{file_name}"
        return new_path
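
    # For example, with system == "js":
    #   self.path_converter("fam-123", "/source/dir/image.png")
    #   -> "/home/tskluzac/fam-123/image.png"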

    def preproc_fam_batches(self):

        fam_count = 0

        # Just create an empty one out here so Python doesn't yell at me.
        fam_batch = FamilyBatch()

        num_overloads = 0
        # while we have files and haven't exceeded the weak scaling threshold (file_cutoff)
        while not self.family_queue.empty() and fam_count < file_cutoff:

            fam_batch = FamilyBatch()
            total_fam_batch_size = 0

            # Keep filling the batch until it reaches map_size, we run out
            # of families, or we hit the file_cutoff threshold.
            while (len(fam_batch.families) < map_size
                   and not self.family_queue.empty()
                   and fam_count < file_cutoff):

                fam_count += 1
                fam = self.family_queue.get()

                total_family_size = 0
                # First convert to the correct paths
                for file_obj in fam['files']:
                    old_path = file_obj['path']
                    new_path = self.path_converter(fam['family_id'], old_path)
                    file_obj['path'] = new_path
                    file_size = file_obj['metadata']['physical']['size']
                    total_family_size += file_size

                for group in fam['groups']:
                    for file_obj in group['files']:
                        old_path = file_obj['path']
                        new_path = self.path_converter(fam['family_id'],
                                                       old_path)
                        file_obj['path'] = new_path

                empty_fam = Family()
                empty_fam.from_dict(fam)

                # We will ONLY handle the SIZE issue in here.

                if soft_batch_bytes_max > 0:
                    # So if this last file would put us over the top,
                    if total_fam_batch_size + total_family_size > soft_batch_bytes_max:
                        num_overloads += 1
                        print(f"Num overloads {num_overloads}")
                        # then we append the old batch (if not empty),
                        if len(fam_batch.families) > 0:
                            self.fam_batches.append(fam_batch)

                        # empty the old one
                        fam_batch = FamilyBatch()
                        total_fam_batch_size = total_family_size

                        assert len(fam_batch.families) == 0

                # and then continue (here we either add to our prior fam_batch OR the new one).
                fam_batch.add_family(empty_fam)

            assert len(fam_batch.families) <= map_size

            self.fam_batches.append(fam_batch)

        # img_extractor = NothingExtractor()
        img_extractor = MatioExtractor()

        # TODO: turn this into a real test. Sanity check that every family
        #  made it into a batch.

        ta = time.time()
        num_families = 0
        for item in self.fam_batches:
            num_families += len(item.families)

        print(f"Total families batched: {num_families}")
        tb = time.time()
        print(f"Time to move families: {tb-ta}")
        time.sleep(5)
        # exit()

        # exit()

        # This check makes sure our batches are the correct size to avoid the January 2021 disaster of having vastly
        #  incorrect numbers of batches.
        #
        #  Here we are checking that the number of families we are processing is LESS than the total number of
        #   batches times the batch size (e.g., the last batch can be full or empty), and the number of families
        #   is GREATER than the case where our last map is missing.
        #
        #
        #  This leaves a very small window for error. Could use modulus to be more exact.

        # TODO: Bring this back (but use for grouping by num. files)

        # try:
        #     assert len(self.fam_batches) * (map_size-1) <= fam_count <= len(self.fam_batches) * map_size
        # except AssertionError as e:
        #     print(f"Caught {e} after creating client batches...")
        #     print(f"Number of batches: {len(self.fam_batches)}")
        #     print(f"Family Count: {fam_count}")
        #
        #     print("Cannot continue. Exiting...")
        #     exit()

        print(f"Container type: {container_type}")
        print(f"Location: {location}")
        self.fn_uuid = img_extractor.register_function(
            container_type=container_type,
            location=location,
            ep_id=ep_id,
            group="a31d8dce-5d0a-11ea-afea-0a53601d30b5")

        # funcX batching. Here we take the 'user' FamilyBatch objects and put them into a batch we send to funcX.
        num_fx_batches = 0
        current_batch = []

        print(f"Number of family batches: {len(self.fam_batches)}")
        for fam_batch in self.fam_batches:

            # print(len(current_batch))
            # print(batch_size)

            if len(current_batch) < batch_size:
                current_batch.append(fam_batch)
            else:
                # print("Marking batch!")
                # print(len(current_batch))
                self.funcx_batches.put(current_batch)
                current_batch = [fam_batch]
                num_fx_batches += 1

        # Grab the stragglers.
        if len(current_batch) > 0:
            print("Marking batch!")
            self.funcx_batches.put(current_batch)
            num_fx_batches += 1

        # See same description as above (map example) for explanation.
        try:
            theor_full_batches = math.ceil(len(self.fam_batches) / batch_size)

            # print(f"Theoretical full batches: {}")
            assert theor_full_batches == num_fx_batches
        except AssertionError as e:
            print(f"Caught {e} after creating funcX batches...")
            print(f"Number of batches: {self.funcx_batches.qsize()}")
            print(f"Family Count: {num_fx_batches}")

            print("Cannot continue. Exiting...")
            exit()
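
        # Worked example of the check above: 10 family batches with
        # batch_size == 4 give math.ceil(10 / 4) == 3 funcX batches
        # (two full batches plus one straggler batch).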

    # TODO: let the failures fail.
    def send_batches_thr_loop(self):

        # While there are still batches to send.
        #  Note that this should not be 'limiting' as we do that in preprocessing.
        while not self.funcx_batches.empty():

            # current_tasks_on_ep = tasks_sent - tasks_received
            if self.current_tasks_on_ep > self.max_outstanding_tasks:
                print(f"There are {self.current_tasks_on_ep}. Sleeping...")
                time.sleep(5)
                continue

            # Grab one
            batch = self.funcx_batches.get()
            fx_batch = self.fxc.create_batch()

            # Now we formally pull down each funcX batch and add each of its elements to an fx_batch.
            # TODO: could do this before putting in list.
            for item in batch:

                fam_batch_size = len(item.families)

                fx_batch.add({'family_batch': item},
                             endpoint_id=ep_id,
                             function_id=self.fn_uuid)
                self.current_tasks_on_ep += fam_batch_size

            # TODO: bring back the try/except once we know what errors batch_run raises.
            # try:
            import random
            time.sleep(random.randint(1, 5) / 2)  # jitter sends to spread request load
            res = self.fxc.batch_run(fx_batch)
            self.num_send_reqs += 1
            # except Exception as e:
            #     print("WE CAUGHT AN EXCEPTION WHILE SENDING.")
            #     time.sleep(0.5)
            #     continue

            for tid in res:
                self.polling_queue.put(tid)

            # import random
            # time.sleep(random.randint(1,3))
            # time.sleep(0.75)

    def polling_loop(self):
        while True:

            current_tid_batch = []

            for _ in range(500):  # poll at most 500 task IDs per request

                if self.polling_queue.empty():
                    print("Polling queue empty. Creating batch!")
                    time.sleep(3)
                    break
                else:
                    tid = self.polling_queue.get()
                    current_tid_batch.append(tid)

            if len(current_tid_batch) == 0:
                print("Batch is empty. Sleeping... ")
                time.sleep(5)
                continue

            time.sleep(0.5)

            start_req = time.time()
            res = self.fxc.get_batch_status(current_tid_batch)
            end_req = time.time()
            self.num_poll_reqs += 1

            print(f"Time to process batch: {end_req-start_req}")

            for item in res:

                # print(res[item])
                if 'result' in res[item]:

                    print(f"Received result: {res[item]['result']}")
                    exit()

                    # print(res[item])

                    #print(res[item]['result'])

                    # ret_fam_batch = res[item]['result']['family_batch']
                    ret_fam_batch = res[item]['result']

                    num_finished = ret_fam_batch['finished']

                    print(num_finished)

                    # timer = res[item]['result']['total_time']

                    family_file_size = 0
                    bad_extract_time = 0
                    good_extract_time = 0

                    good_parsers = ""

                    # family_mdata_size = get_deep_size(ret_fam_batch)
                    #
                    # for family in ret_fam_batch.families:
                    #
                    #     # print(family.metadata)
                    #
                    #     for file in family.files:
                    #         family_file_size += file['metadata']['physical']['size']
                    #
                    #     for gid in family.groups:
                    #         g_mdata = family.groups[gid].metadata
                    #         # print(g_mdata)
                    #
                    #         if g_mdata['matio'] != {} and g_mdata['matio'] is not None:
                    #             good_parsers = good_parsers + g_mdata['parser']
                    #             good_extract_time += g_mdata['extract time']
                    #         else:
                    #             bad_extract_time = g_mdata['extract time']
                    #
                    #     # TODO: These are at the family_batch level.
                    #
                    #     import_time = res[item]['result']["import_time"]
                    #     family_fetch_time = res[item]['result']["family_fetch_time"]
                    #     file_unpack_time = res[item]['result']["file_unpack_time"]
                    #     full_extraction_loop_time = res[item]['result']["full_extract_loop_time"]

                    # import_time = 0
                    # family_fetch_time = 0
                    # file_unpack_time = 0
                    # full_extraction_loop_time = 0
                    #
                    # with open('timer_file.txt', 'a') as g:
                    #     csv_writer = csv.writer(g)
                    #     csv_writer.writerow([timer, family_file_size, family_mdata_size, good_extract_time,
                    #                         bad_extract_time, import_time, family_fetch_time, file_unpack_time,
                    #                         full_extraction_loop_time, good_parsers])

                    # fam_len = len(ret_fam_batch.families)

                    with open('timer2.txt', 'a') as g:
                        csv_writer = csv.writer(g)
                        csv_writer.writerow([time.time(), num_finished])

                    self.successes += num_finished

                    self.current_tasks_on_ep -= num_finished

                    # NOTE -- we're doing nothing with the returned metadata here.

                elif 'exception' in res[item]:
                    res[item]['exception'].reraise()

                else:
                    self.polling_queue.put(item)
                """

                else:
                    print("*********ERROR *************")
                    self.failures += 1
                    print(res)
                """

    def stats_loop(self):
        while True:
            print("*********************************")
            print(f"Num successes: {self.successes}")
            print(f"Num failures: {self.failures}")
            print(f"Only {self.current_tasks_on_ep} tasks at endpoint. ")

            print(f"Number of send requests: {self.num_send_reqs}")
            print(f"Number of poll requests: {self.num_poll_reqs}")
            print("*********************************")
            print(f"Elapsed time: {time.time() - self.start_time}")
            time.sleep(5)
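
# A possible driver for test_orch, assuming its three loops are meant to run
# concurrently (the original launch code is not shown, so this wiring is a
# guess):
#
#     orch = test_orch()
#     threading.Thread(target=orch.send_batches_thr_loop, daemon=True).start()
#     threading.Thread(target=orch.stats_loop, daemon=True).start()
#     orch.polling_loop()  # blocks until interrupted
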
# Example #4

class test_orch():
    def __init__(self):
        self.current_tasks_on_ep = 0

        self.max_tasks_on_ep = 90000

        self.fxc = FuncXClient()

        self.funcx_batches = Queue()
        self.polling_queue = Queue()

        self.num_poll_reqs = 0
        self.num_send_reqs = 0

        self.total_families_sent = 0

        self.successes = 0
        self.failures = 0

        self.fam_batches = []

        # NOTE: Changed from loading the JSON family list (below) to loading image paths from CSV.
        # big_json = "/Users/tylerskluzacek/PyCharmProjects/xtracthub-service/experiments/tyler_20k.json"
        #
        # with open(big_json, 'r') as f:
        #     self.fam_list = json.load(f)

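        # Queue of image file names to process, loaded from a one-column CSV manifest.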
        self.image_path_list = Queue()
        with open('train2014_images.csv') as f:
            reader = csv.reader(f)
            for row in reader:
                # print(row[0])
                self.image_path_list.put(row[0])

        # exit()
        self.start_time = time.time()

        self.preproc_fam_batches()

    def path_converter(self, family_id, old_path):
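        # Rebuild a source file's path as its per-family staging location on the target system.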
        path_ls = old_path.split('/')
        file_name = path_ls[-1]
        new_path = None
        if system == "midway2":
            new_path = f"/project2/chard/skluzacek/{family_id}/{file_name}"
        elif system == "theta":
            new_path = f"/projects/CSC249ADCD01/skluzacek/data_to_process/{family_id}/{file_name}"
        return new_path

    def preproc_fam_batches(self):
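        # Pack image paths into FamilyBatch objects of up to map_size families,
        # register the ImageExtractor function, then group the family batches
        # into funcX submission batches of up to batch_size.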

        total_tasks = 0  # NOTE: never incremented; see the commented max_tasks check below.

        print("PREPROCESSING!")
        while not self.image_path_list.empty():

            fam_batch = FamilyBatch()
            # print(len(fam_batch.families))
            while len(fam_batch.families) < map_size:

                if self.image_path_list.empty():
                    break

                path = self.image_path_list.get()
                print(path)
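                # Build a minimal single-file family record for this image.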
                family = dict()

                family['family_id'] = None

                # TODO: CHANGE THIS FOR THETA.
                if system == 'midway2':
                    family['files'] = [{
                        'path':
                        f'/project2/chard/skluzacek/train2014/{path}'
                    }]
                elif system == 'theta':
                    family['files'] = [{
                        'path':
                        f'/projects/CSC249ADCD01/skluzacek/train2014/{path}'
                    }]
                family['metadata'] = dict()
                family['headers'] = None
                family['download_type'] = None
                family['groups'] = []

                empty_fam = Family()
                empty_fam.from_dict(family)
                print("ADDING FAMILY TO FAM BATCH")
                fam_batch.add_family(empty_fam)

            #if total_tasks > max_tasks:
            self.fam_batches.append(fam_batch)

        img_extractor = ImageExtractor()

        print(f"REGISTERING FUNCTION")
        self.fn_uuid = img_extractor.register_function(
            container_type=container_type,
            location=location,
            ep_id=ep_id,
            group="a31d8dce-5d0a-11ea-afea-0a53601d30b5")

        current_batch = []
        for fam_batch in self.fam_batches:
            if len(current_batch) < batch_size:
                current_batch.append(fam_batch)
            else:
                print(f"Length of current batch: {len(current_batch)}")
                self.funcx_batches.put(current_batch)
                current_batch = [fam_batch]

        # Grab the stragglers.
        if len(current_batch) > 0:
            self.funcx_batches.put(current_batch)

        print("Let me see")

        # Debug check (disabled): drain the queue and count batches/families.
        # batch_counter = 0
        # while not self.funcx_batches.empty():
        #     funcx_batch = self.funcx_batches.get()
        #     batch_counter += 1
        #     for batch in funcx_batch:
        #         print(len(batch.families))
        #
        # print(batch_counter)
        #
        # exit()

    # TODO: let the failures fail.
    def send_batches_thr_loop(self):
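        # Drain the queued funcX batches, throttling whenever the endpoint
        # already holds more than max_tasks_on_ep tasks; returned task IDs
        # are handed to the polling loop.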
        while not self.funcx_batches.empty():

            if self.current_tasks_on_ep > self.max_tasks_on_ep:
                print(f"There are {self.current_tasks_on_ep}. Sleeping...")
                time.sleep(5)
                continue

            batch = self.funcx_batches.get()
            fx_batch = self.fxc.create_batch()

            for item in batch:

                fam_batch_size = len(item.families)

                fx_batch.add(
                    {
                        'family_batch': item,
                        'creds': None,
                        'download_file': None
                    },
                    endpoint_id=ep_id,
                    function_id=self.fn_uuid)
                self.current_tasks_on_ep += fam_batch_size

            try:
                res = self.fxc.batch_run(fx_batch)
                self.num_send_reqs += 1
            except Exception:
                # Submission failed: back off and move on. This batch is dropped
                # even though current_tasks_on_ep was already incremented for it
                # (see the TODO above about letting failures fail).
                time.sleep(0.5)
                continue

            num_tids = 0
            for tid in res:
                self.polling_queue.put(tid)
                num_tids += 1

            # print(f"Put {num_tids} tids into polling queue! ")
            if self.current_tasks_on_ep + self.successes > task_stop:
                # This is our unclean (approximate) way of breaking at the 'task send' stage.
                break

            # time.sleep(1)

    def polling_loop(self):
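        # Repeatedly poll funcX with batches of task IDs: finished tasks update
        # the success and endpoint counters, pending tasks are re-queued, and
        # anything unrecognized is counted as a failure.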
        while True:

            current_tid_batch = []
            for _ in range(500):  # Poll-batch size cap. TODO: tune (1000 might be too big).
                if self.polling_queue.empty():
                    print("Polling queue empty. Polling what we have so far...")
                    time.sleep(5)
                    break
                else:
                    tid = self.polling_queue.get()
                    current_tid_batch.append(tid)

            if len(current_tid_batch) == 0:
                print("Batch is empty. Sleeping...")
                time.sleep(5)
                continue  # Nothing to poll; skip the status request.

            res = self.fxc.get_batch_status(current_tid_batch)
            self.num_poll_reqs += 1

            for item in res:

                # print(res[item])
                if 'result' in res[item]:
                    print(res[item])
                    # self.successes += 1

                    ret_fam_batch = res[item]['result']['family_batch']

                    fam_len = len(ret_fam_batch.families)
                    self.successes += fam_len

                    self.current_tasks_on_ep -= fam_len

                    # NOTE -- we're doing nothing with the returned metadata here.

                elif 'exception' in res[item]:
                    res[item]['exception'].reraise()

                elif 'status' in res[item]:
                    self.polling_queue.put(item)
                else:
                    print("*********ERROR *************")
                    self.failures += 1
                    print(res)

    def stats_loop(self):
        while True:
            print("*********************************")
            print(f"Num successes: {self.successes}")
            print(f"Num failures: {self.failures}")
            print(f"Only {self.current_tasks_on_ep} tasks at endpoint. ")

            print(f"Number of send requests: {self.num_send_reqs}")
            print(f"Number of poll requests: {self.num_poll_reqs}")
            print("*********************************")
            print(f"Elapsed time: {time.time() - self.start_time}")
            time.sleep(5)
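

# A minimal wiring sketch (an assumption, not part of the original example):
# the "_thr_" naming suggests send_batches_thr_loop and stats_loop run on
# background threads while polling_loop occupies the main thread. It requires
# train2014_images.csv plus the configuration globals noted above.
if __name__ == "__main__":
    import threading

    orch = test_orch()

    threading.Thread(target=orch.send_batches_thr_loop, daemon=True).start()
    threading.Thread(target=orch.stats_loop, daemon=True).start()

    orch.polling_loop()  # Runs forever; interrupt (Ctrl+C) to stop.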