Example no. 1
 def wrapper(*args: object, **kwargs: object) -> Any:
     import sys
     from parsl.app.errors import RemoteExceptionWrapper
     try:
         return func(*args, **kwargs)  # type: ignore
     except Exception:
         return RemoteExceptionWrapper(*sys.exc_info())
Example no. 2
 def wrapper(*args, **kwargs):
     import sys
     from parsl.app.errors import RemoteExceptionWrapper
     try:
         return func(*args, **kwargs)
     except Exception:
         return RemoteExceptionWrapper(*sys.exc_info())
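The two wrappers above show the remote side of the pattern: instead of letting an exception escape, the wrapped call captures sys.exc_info() in a RemoteExceptionWrapper and returns it as an ordinary value. A minimal sketch of the submitting side follows, assuming only that parsl's RemoteExceptionWrapper exposes reraise(); the unwrap_result helper name is illustrative and not taken from the snippets.

# Hedged sketch of the caller side (unwrap_result is a hypothetical helper).
from parsl.app.errors import RemoteExceptionWrapper

def unwrap_result(value):
    # A RemoteExceptionWrapper returned by wrapper() carries the remote
    # traceback; reraise() restores it in the local process.
    if isinstance(value, RemoteExceptionWrapper):
        value.reraise()
    return value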
Example no. 3
def worker(comm, rank):
    logger.info("Worker started")

    # Sync worker with master
    comm.Barrier()
    logger.debug("Synced")

    task_request = b'TREQ'

    while True:
        comm.send(task_request, dest=0, tag=TASK_REQUEST_TAG)
        # The worker will receive {'task_id':<tid>, 'buffer':<buf>}
        req = comm.recv(source=0, tag=rank)
        logger.debug("Got req: {}".format(req))
        tid = req['task_id']
        logger.debug("Got task: {}".format(tid))

        try:
            result = execute_task(req['buffer'])
        except Exception as e:
            result_package = {'task_id': tid, 'exception': serialize(RemoteExceptionWrapper(*sys.exc_info()))}
            logger.debug("No result due to exception: {} with result package {}".format(e, result_package))
        else:
            result_package = {'task_id': tid, 'result': serialize(result)}
            logger.debug("Result: {}".format(result))

        pkl_package = pickle.dumps(result_package)
        comm.send(pkl_package, dest=0, tag=RESULT_TAG)
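On the receiving side (not shown above), the master unpickles each result package and either deserializes the payload or re-raises the wrapped exception. A rough sketch follows, assuming parsl.serialize.deserialize is the counterpart of the serialize() call used by the worker; handle_result_package is a hypothetical helper, not part of the original code.

import pickle
from parsl.serialize import deserialize  # assumed counterpart of serialize()

def handle_result_package(pkl_package):
    # pkl_package is the bytes produced by pickle.dumps(result_package) in the worker
    package = pickle.loads(pkl_package)
    if 'exception' in package:
        # The exception field holds a serialized RemoteExceptionWrapper;
        # re-raising it surfaces the remote traceback locally.
        deserialize(package['exception']).reraise()
    return package['task_id'], deserialize(package['result'])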
Example no. 4
    def start(self):

        logger.info("Starting worker")

        result = self.registration_message()
        task_type = b'REGISTER'

        while True:

            logger.debug("Sending result")
            # TODO : Swap for our serialization methods
            self.task_socket.send_multipart([
                task_type,  # Byte encoded
                pickle.dumps(result)
            ])

            if task_type == b'WRKR_DIE':
                logger.info("*** WORKER {} ABOUT TO DIE ***".format(
                    self.worker_id))
                exit()  # Kill the worker after accepting death in message to manager.

            logger.debug("Waiting for task")
            p_task_id, msg = self.task_socket.recv_multipart()
            task_id = pickle.loads(p_task_id)
            logger.debug("Received task_id:{} with task:{}".format(
                task_id, msg))

            if task_id == "KILL":
                logger.info("[KILL] -- Worker KILL message received! ")
                task_type = b'WRKR_DIE'
                result = None

            logger.debug("Executing task...")

            try:
                result = self.execute_task(msg)
                logger.debug("Executed result: {}".format(result))
                serialized_result = serialize_object(result)
            except Exception:
                logger.exception("Caught an exception")
                result_package = {
                    'task_id': task_id,
                    'exception': serialize_object(RemoteExceptionWrapper(*sys.exc_info()))
                }
            else:
                logger.debug("Execution completed without exception")
                result_package = {
                    'task_id': task_id,
                    'result': serialized_result
                }

            # TODO: Change this to serialize_object to match IX?
            result = result_package
            task_type = b'TASK_RET'

        logger.warning("Broke out of the loop... dying")
Example no. 5
    def worker_watchdog(self, kill_event):
        """ Keeps workers alive by restarting dead worker processes.

        Parameters:
        -----------
        kill_event : threading.Event
              Event to let the thread know when it is time to die.
        """

        logger.debug("[WORKER_WATCHDOG_THREAD] Starting thread")

        while not kill_event.is_set():
            for worker_id, p in self.procs.items():
                if not p.is_alive():
                    logger.info(
                        "[WORKER_WATCHDOG_THREAD] Worker {} has died".format(
                            worker_id))
                    try:
                        task = self._tasks_in_progress.pop(worker_id)
                        logger.info(
                            "[WORKER_WATCHDOG_THREAD] Worker {} was busy when it died"
                            .format(worker_id))
                        try:
                            raise WorkerLost(worker_id, platform.node())
                        except Exception:
                            logger.info(
                                "[WORKER_WATCHDOG_THREAD] Putting exception for task {} in the pending result queue"
                                .format(task['task_id']))
                            result_package = {
                                'task_id': task['task_id'],
                                'exception': serialize_object(RemoteExceptionWrapper(*sys.exc_info()))
                            }
                            pkl_package = pickle.dumps(result_package)
                            self.pending_result_queue.put(pkl_package)
                    except KeyError:
                        logger.info(
                            "[WORKER_WATCHDOG_THREAD] Worker {} was not busy when it died"
                            .format(worker_id))

                    p = multiprocessing.Process(
                        target=worker,
                        args=(worker_id, self.uid, self.worker_count,
                              self.pending_task_queue,
                              self.pending_result_queue,
                              self.ready_worker_queue,
                              self._tasks_in_progress),
                        name="HTEX-Worker-{}".format(worker_id))
                    self.procs[worker_id] = p
                    logger.info(
                        "[WORKER_WATCHDOG_THREAD] Worker {} has been restarted"
                        .format(worker_id))
                time.sleep(self.poll_period)

        logger.critical("[WORKER_WATCHDOG_THREAD] Exiting")
Example no. 6
def worker(worker_id, pool_id, task_queue, result_queue, worker_queue,
           tasks_in_progress):
    """

    Put request token into queue
    Get task from task_queue
    Pop request from queue
    Put result into result_queue
    """
    start_file_logger('{}/{}/worker_{}.log'.format(args.logdir, pool_id,
                                                   worker_id),
                      worker_id,
                      name="worker_log",
                      level=logging.DEBUG if args.debug else logging.INFO)

    # Sync worker with master
    logger.info('Worker {} started'.format(worker_id))
    if args.debug:
        logger.debug("Debug logging enabled")

    while True:
        worker_queue.put(worker_id)

        # The worker will receive {'task_id':<tid>, 'buffer':<buf>}
        req = task_queue.get()
        tasks_in_progress[worker_id] = req
        tid = req['task_id']
        logger.info("Received task {}".format(tid))

        try:
            worker_queue.get()
        except queue.Empty:
            logger.warning(
                "Worker ID: {} failed to remove itself from ready_worker_queue"
                .format(worker_id))
            pass

        try:
            result = execute_task(req['buffer'])
            serialized_result = serialize_object(result)
        except Exception as e:
            logger.info('Caught an exception: {}'.format(e))
            result_package = {
                'task_id': tid,
                'exception': serialize_object(RemoteExceptionWrapper(*sys.exc_info()))
            }
        else:
            result_package = {'task_id': tid, 'result': serialized_result}
            # logger.debug("Result: {}".format(result))

        logger.info("Completed task {}".format(tid))
        pkl_package = pickle.dumps(result_package)

        result_queue.put(pkl_package)
        tasks_in_progress.pop(worker_id)
Example no. 7
    def worker_watchdog(self, kill_event):
        """Keeps workers alive.

        Parameters:
        -----------
        kill_event : threading.Event
              Event to let the thread know when it is time to die.
        """

        logger.debug("Starting worker watchdog")

        while not kill_event.is_set():
            for worker_id, p in self.procs.items():
                if not p.is_alive():
                    logger.info("Worker {} has died".format(worker_id))
                    try:
                        task = self._tasks_in_progress.pop(worker_id)
                        logger.info("Worker {} was busy when it died".format(
                            worker_id))
                        try:
                            raise WorkerLost(worker_id, platform.node())
                        except Exception:
                            logger.info(
                                "Putting exception for task {} in the pending result queue"
                                .format(task['task_id']))
                            result_package = {
                                'type': 'result',
                                'task_id': task['task_id'],
                                'exception': serialize(RemoteExceptionWrapper(*sys.exc_info()))
                            }
                            pkl_package = pickle.dumps(result_package)
                            self.pending_result_queue.put(pkl_package)
                    except KeyError:
                        logger.info(
                            "Worker {} was not busy when it died".format(
                                worker_id))

                    p = mpProcess(target=worker,
                                  args=(worker_id, self.uid, self.worker_count,
                                        self.pending_task_queue,
                                        self.pending_result_queue,
                                        self.ready_worker_queue,
                                        self._tasks_in_progress,
                                        self.cpu_affinity),
                                  name="HTEX-Worker-{}".format(worker_id))
                    self.procs[worker_id] = p
                    logger.info(
                        "Worker {} has been restarted".format(worker_id))
                time.sleep(self.heartbeat_period)

        logger.critical("Exiting")
Example no. 8
def WorkQueueCollectorThread(collector_queue=multiprocessing.Queue(),
                             tasks={},
                             tasks_lock=threading.Lock(),
                             cancel_value=multiprocessing.Value('i', 1),
                             submit_process=None,
                             executor=None):

    logger.debug("Starting Collector Thread")

    continue_running = True
    while continue_running:
        if cancel_value.value == 0:
            continue_running = False
            continue

        # The WorkQueue process that creates tasks has died
        if not submit_process.is_alive() and cancel_value.value != 0:
            raise ExecutorError(executor,
                                "Workqueue Submit Process is not alive")

        # Get the result message from the collector_queue
        try:
            item = collector_queue.get(timeout=1)
        except queue.Empty:
            continue

        parsl_tid = item["tid"]
        received = item["result_received"]

        # Obtain the future from the tasks dictionary
        tasks_lock.acquire()
        future = tasks[parsl_tid]
        tasks_lock.release()

        # Failed task
        if received is False:
            reason = item["reason"]
            status = item["status"]
            future.set_exception(AppFailure(reason, status))
        # Successful task
        else:
            result = item["result"]
            future_update, _ = deserialize_object(result["result"])
            logger.debug("Updating Future for Parsl Task {}".format(parsl_tid))
            if result["failure"] is False:
                future.set_result(future_update)
            else:
                future.set_exception(RemoteExceptionWrapper(*future_update))

    logger.debug("Exiting Collector Thread")
    return
Example no. 9
        # result_file: any output (including exceptions) will be written to
        #              this file.
        try:
            (map_file, function_file, result_file) = sys.argv[1:]
        except ValueError:
            print("Usage:\n\t{} mapping function result\n".format(sys.argv[0]))
            raise

        try:
            (namespace, function_code, result_name) = load_function(map_file, function_file)
        except Exception:
            print("There was an error setting up the function for execution.")
            raise

        try:
            result = execute_function(namespace, function_code, result_name)
        except Exception:
            print("There was an error executing the function.")
            raise
    except Exception:
        traceback.print_exc()
        result = RemoteExceptionWrapper(*sys.exc_info())

    # Write out function result to the result file
    try:
        dump_result_to_file(result_file, result)
    except Exception:
        print("Could not write to result file.")
        traceback.print_exc()
        sys.exit(1)
Example no. 10
    def start(self, poll_period=None):
        """ Start the interchange

        Parameters:
        ----------
        poll_period : int, optional
              Override for self.poll_period; used as the zmq poll timeout in milliseconds.

        TODO: Move task receiving to a thread
        """
        logger.info("Incoming ports bound")

        if poll_period is None:
            poll_period = self.poll_period

        start = time.time()
        count = 0

        self._kill_event = threading.Event()
        self._task_puller_thread = threading.Thread(
            target=self.migrate_tasks_to_internal,
            args=(self._kill_event, ),
            name="Interchange-Task-Puller")
        self._task_puller_thread.start()

        self._command_thread = threading.Thread(target=self._command_server,
                                                args=(self._kill_event, ),
                                                name="Interchange-Command")
        self._command_thread.start()

        poller = zmq.Poller()
        # poller.register(self.task_incoming, zmq.POLLIN)
        poller.register(self.task_outgoing, zmq.POLLIN)
        poller.register(self.results_incoming, zmq.POLLIN)

        # These are managers which we should examine in an iteration
        # for scheduling a job (or maybe any other attention?).
        # Anything altering the state of the manager should add it
        # onto this list.
        interesting_managers = set()

        while not self._kill_event.is_set():
            self.socks = dict(poller.poll(timeout=poll_period))

            # Listen for requests for work
            if self.task_outgoing in self.socks and self.socks[
                    self.task_outgoing] == zmq.POLLIN:
                logger.debug("[MAIN] starting task_outgoing section")
                message = self.task_outgoing.recv_multipart()
                manager = message[0]

                if manager not in self._ready_manager_queue:
                    reg_flag = False

                    try:
                        msg = json.loads(message[1].decode('utf-8'))
                        msg['reg_time'] = datetime.datetime.strptime(
                            msg['reg_time'], "%Y-%m-%d %H:%M:%S")
                        reg_flag = True
                    except Exception:
                        logger.warning(
                            "[MAIN] Got Exception reading registration message from manager: {}"
                            .format(manager),
                            exc_info=True)
                        logger.debug("[MAIN] Message :\n{}\n".format(
                            message[1]))

                    # By default we set up to ignore bad nodes/registration messages.
                    self._ready_manager_queue[manager] = {
                        'last': time.time(),
                        'free_capacity': 0,
                        'block_id': None,
                        'max_capacity': 0,
                        'worker_count': 0,
                        'active': True,
                        'tasks': []
                    }
                    if reg_flag is True:
                        interesting_managers.add(manager)
                        logger.info(
                            "[MAIN] Adding manager: {} to ready queue".format(
                                manager))
                        self._ready_manager_queue[manager].update(msg)
                        logger.info(
                            "[MAIN] Registration info for manager {}: {}".
                            format(manager, msg))
                        if self.monitoring_enabled:
                            logger.info("Sending message {} to hub".format(
                                self._ready_manager_queue[manager]))
                            self.hub_channel.send_pyobj(
                                (MessageType.NODE_INFO,
                                 self._ready_manager_queue[manager]))

                        if (msg['python_v'].rsplit(".", 1)[0] !=
                                self.current_platform['python_v'].rsplit(
                                    ".", 1)[0] or msg['parsl_v'] !=
                                self.current_platform['parsl_v']):
                            logger.warning(
                                "[MAIN] Manager {} has incompatible version info with the interchange"
                                .format(manager))

                            if self.suppress_failure is False:
                                logger.debug("Setting kill event")
                                self._kill_event.set()
                                e = ManagerLost(
                                    manager,
                                    self._ready_manager_queue[manager]['hostname'])
                                result_package = {
                                    'task_id': -1,
                                    'exception': serialize_object(e)
                                }
                                pkl_package = pickle.dumps(result_package)
                                self.results_outgoing.send(pkl_package)
                                logger.warning(
                                    "[MAIN] Sent failure reports, unregistering manager"
                                )
                            else:
                                logger.debug(
                                    "[MAIN] Suppressing shutdown due to version incompatibility"
                                )
                        else:
                            logger.info(
                                "[MAIN] Manager {} has compatible Parsl version {}"
                                .format(manager, msg['parsl_v']))
                            logger.info(
                                "[MAIN] Manager {} has compatible Python version {}"
                                .format(manager,
                                        msg['python_v'].rsplit(".", 1)[0]))
                    else:
                        # Registration has failed.
                        if self.suppress_failure is False:
                            self._kill_event.set()
                            e = BadRegistration(manager, critical=True)
                            result_package = {
                                'task_id': -1,
                                'exception': serialize_object(e)
                            }
                            pkl_package = pickle.dumps(result_package)
                            self.results_outgoing.send(pkl_package)
                        else:
                            logger.debug(
                                "[MAIN] Suppressing bad registration from manager:{}"
                                .format(manager))

                else:
                    tasks_requested = int.from_bytes(message[1], "little")
                    self._ready_manager_queue[manager]['last'] = time.time()
                    if tasks_requested == HEARTBEAT_CODE:
                        logger.debug(
                            "[MAIN] Manager {} sent heartbeat".format(manager))
                        self.task_outgoing.send_multipart(
                            [manager, b'', PKL_HEARTBEAT_CODE])
                    else:
                        logger.debug(
                            "[MAIN] Manager {} requested {} tasks".format(
                                manager, tasks_requested))
                        self._ready_manager_queue[manager][
                            'free_capacity'] = tasks_requested
                        interesting_managers.add(manager)
                logger.debug("[MAIN] leaving task_outgoing section")

            # If we had received any requests, check if there are tasks that could be passed

            logger.debug("Managers count (total/interesting): {}/{}".format(
                len(self._ready_manager_queue), len(interesting_managers)))

            if interesting_managers and not self.pending_task_queue.empty():
                shuffled_managers = list(interesting_managers)
                random.shuffle(shuffled_managers)

                # cf. the if statement above...
                while shuffled_managers and not self.pending_task_queue.empty():
                    manager = shuffled_managers.pop()
                    tasks_inflight = len(
                        self._ready_manager_queue[manager]['tasks'])
                    real_capacity = min(
                        self._ready_manager_queue[manager]['free_capacity'],
                        self._ready_manager_queue[manager]['max_capacity'] -
                        tasks_inflight)

                    if (real_capacity
                            and self._ready_manager_queue[manager]['active']):
                        tasks = self.get_tasks(real_capacity)
                        if tasks:
                            self.task_outgoing.send_multipart(
                                [manager, b'',
                                 pickle.dumps(tasks)])
                            task_count = len(tasks)
                            count += task_count
                            tids = [t['task_id'] for t in tasks]
                            self._ready_manager_queue[manager][
                                'free_capacity'] -= task_count
                            self._ready_manager_queue[manager]['tasks'].extend(
                                tids)
                            logger.debug(
                                "[MAIN] Sent tasks: {} to manager {}".format(
                                    tids, manager))
                            if self._ready_manager_queue[manager][
                                    'free_capacity'] > 0:
                                logger.debug(
                                    "[MAIN] Manager {} has free_capacity {}".
                                    format(
                                        manager,
                                        self._ready_manager_queue[manager]
                                        ['free_capacity']))
                                # ... so keep it in the interesting_managers list
                            else:
                                logger.debug(
                                    "[MAIN] Manager {} is now saturated".
                                    format(manager))
                                interesting_managers.remove(manager)
                    else:
                        interesting_managers.remove(manager)
                        # logger.debug("Nothing to send to manager {}".format(manager))
                logger.debug(
                    "[MAIN] leaving _ready_manager_queue section, with {} managers still interesting"
                    .format(len(interesting_managers)))
            else:
                logger.debug(
                    "[MAIN] either no interesting managers or no tasks, so skipping manager pass"
                )
            # Receive any results and forward to client
            if self.results_incoming in self.socks and self.socks[
                    self.results_incoming] == zmq.POLLIN:
                logger.debug("[MAIN] entering results_incoming section")
                manager, *b_messages = self.results_incoming.recv_multipart()
                if manager not in self._ready_manager_queue:
                    logger.warning(
                        "[MAIN] Received a result from an unregistered manager: {}"
                        .format(manager))
                else:
                    logger.debug("[MAIN] Got {} result items in batch".format(
                        len(b_messages)))
                    for b_message in b_messages:
                        r = pickle.loads(b_message)
                        # logger.debug("[MAIN] Received result for task {} from {}".format(r['task_id'], manager))
                        self._ready_manager_queue[manager]['tasks'].remove(
                            r['task_id'])
                    self.results_outgoing.send_multipart(b_messages)
                    logger.debug("[MAIN] Current tasks: {}".format(
                        self._ready_manager_queue[manager]['tasks']))
                logger.debug("[MAIN] leaving results_incoming section")

            bad_managers = [
                manager for manager in self._ready_manager_queue
                if time.time() - self._ready_manager_queue[manager]['last'] >
                self.heartbeat_threshold
            ]
            for manager in bad_managers:
                logger.debug("[MAIN] Last: {} Current: {}".format(
                    self._ready_manager_queue[manager]['last'], time.time()))
                logger.warning(
                    "[MAIN] Too many heartbeats missed for manager {}".format(
                        manager))

                for tid in self._ready_manager_queue[manager]['tasks']:
                    try:
                        raise ManagerLost(
                            manager,
                            self._ready_manager_queue[manager]['hostname'])
                    except Exception:
                        result_package = {
                            'task_id': tid,
                            'exception': serialize_object(RemoteExceptionWrapper(*sys.exc_info()))
                        }
                        pkl_package = pickle.dumps(result_package)
                        self.results_outgoing.send(pkl_package)
                        logger.warning(
                            "[MAIN] Sent failure reports, unregistering manager"
                        )
                self._ready_manager_queue.pop(manager, None)
                if manager in interesting_managers:
                    interesting_managers.remove(manager)

        delta = time.time() - start
        logger.info("Processed {} tasks in {} seconds".format(count, delta))
        logger.warning("Exiting")
Example no. 11
def worker(worker_id, pool_id, pool_size, task_queue, result_queue, worker_queue, tasks_in_progress, cpu_affinity):
    """

    Put request token into queue
    Get task from task_queue
    Pop request from queue
    Put result into result_queue
    """
    start_file_logger('{}/block-{}/{}/worker_{}.log'.format(args.logdir, args.block_id, pool_id, worker_id),
                      worker_id,
                      name="worker_log",
                      level=logging.DEBUG if args.debug else logging.INFO)

    # Store worker ID as an environment variable
    os.environ['PARSL_WORKER_RANK'] = str(worker_id)
    os.environ['PARSL_WORKER_COUNT'] = str(pool_size)
    os.environ['PARSL_WORKER_POOL_ID'] = str(pool_id)
    os.environ['PARSL_WORKER_BLOCK_ID'] = str(args.block_id)

    # Sync worker with master
    logger.info('Worker {} started'.format(worker_id))
    if args.debug:
        logger.debug("Debug logging enabled")

    # If desired, set process affinity
    if cpu_affinity != "none":
        # Count the number of cores per worker
        avail_cores = sorted(os.sched_getaffinity(0))  # Get the available processors
        cores_per_worker = len(avail_cores) // pool_size
        assert cores_per_worker > 0, "Affinity does not work if there are more workers than cores"

        # Determine this worker's cores
        if cpu_affinity == "block":
            my_cores = avail_cores[cores_per_worker * worker_id:cores_per_worker * (worker_id + 1)]
        elif cpu_affinity == "alternating":
            my_cores = avail_cores[worker_id::pool_size]
        else:
            raise ValueError("Affinity strategy {} is not supported".format(cpu_affinity))

        # Set the affinity for this worker
        os.sched_setaffinity(0, my_cores)
        logger.info("Set worker CPU affinity to {}".format(my_cores))

    while True:
        worker_queue.put(worker_id)

        # The worker will receive {'task_id':<tid>, 'buffer':<buf>}
        req = task_queue.get()
        tasks_in_progress[worker_id] = req
        tid = req['task_id']
        logger.info("Received task {}".format(tid))

        try:
            worker_queue.get()
        except queue.Empty:
            logger.warning("Worker ID: {} failed to remove itself from ready_worker_queue".format(worker_id))
            pass

        try:
            result = execute_task(req['buffer'])
            serialized_result = serialize(result, buffer_threshold=1e6)
        except Exception as e:
            logger.info('Caught an exception: {}'.format(e))
            result_package = {'task_id': tid, 'exception': serialize(RemoteExceptionWrapper(*sys.exc_info()))}
        else:
            result_package = {'task_id': tid, 'result': serialized_result}
            # logger.debug("Result: {}".format(result))

        logger.info("Completed task {}".format(tid))
        try:
            pkl_package = pickle.dumps(result_package)
        except Exception:
            logger.exception("Caught exception while trying to pickle the result package")
            pkl_package = pickle.dumps({'task_id': tid,
                                        'exception': serialize(RemoteExceptionWrapper(*sys.exc_info()))
            })

        result_queue.put(pkl_package)
        tasks_in_progress.pop(worker_id)
Example no. 12
    def pull_tasks(self, kill_event):
        """Pull tasks from the incoming tasks 0mq pipe onto the internal
        pending task queue


        While :
            receive results and task requests from the workers
            receive tasks/heartbeats from the Interchange
            match tasks to workers
            if task doesn't have appropriate worker type:
                 launch worker of type.. with LRU or some sort of caching strategy.
            if workers >> tasks:
                 advertise available capacity

        Parameters:
        -----------
        kill_event : threading.Event
              Event to let the thread know when it is time to die.
        """
        log.info("[TASK PULL THREAD] starting")

        # Send a registration message
        msg = self.create_reg_message()
        log.debug(f"Sending registration message: {msg}")
        self.task_incoming.send(msg)
        last_interchange_contact = time.time()
        task_recv_counter = 0

        poll_timer = self.poll_period

        new_worker_map = None
        while not kill_event.is_set():
            # Disabling the check on ready_worker_queue disables batching
            log.debug("[TASK_PULL_THREAD] Loop start")
            pending_task_count = task_recv_counter - self.task_done_counter
            ready_worker_count = self.worker_map.ready_worker_count()
            log.debug(
                "[TASK_PULL_THREAD] pending_task_count: %s, ready_worker_count: %s",
                pending_task_count,
                ready_worker_count,
            )

            if pending_task_count < self.max_queue_size and ready_worker_count > 0:
                ads = self.worker_map.advertisement()
                log.debug(f"[TASK_PULL_THREAD] Requesting tasks: {ads}")
                msg = pickle.dumps(ads)
                self.task_incoming.send(msg)

            # Receive results from the workers, if any
            socks = dict(self.poller.poll(timeout=poll_timer))

            if (self.funcx_task_socket in socks
                    and socks[self.funcx_task_socket] == zmq.POLLIN):
                self.poll_funcx_task_socket()

            # Receive task batches from Interchange and forward to workers
            if self.task_incoming in socks and socks[
                    self.task_incoming] == zmq.POLLIN:

                # If we want to wrap the task_incoming polling into a separate function,
                # we need to
                #   self.poll_task_incoming(
                #       poll_timer,
                #       last_interchange_contact,
                #       kill_event,
                #       task_recv_counter
                #   )
                poll_timer = 0
                _, pkl_msg = self.task_incoming.recv_multipart()
                message = pickle.loads(pkl_msg)
                last_interchange_contact = time.time()

                if message == "STOP":
                    log.critical("[TASK_PULL_THREAD] Received stop request")
                    kill_event.set()
                    break

                elif type(message) == tuple and message[0] == "TASK_CANCEL":
                    with self.task_finalization_lock:
                        task_id = message[1]
                        log.info(
                            f"Received TASK_CANCEL request for task: {task_id}"
                        )
                        if task_id not in self.task_worker_map:
                            log.warning(
                                f"Task:{task_id} is not in task_worker_map.")
                            log.warning(
                                "Possible duplicate cancel or race-condition")
                            continue
                        # Cancel task by killing the worker it is on
                        worker_id_raw = self.task_worker_map[task_id]["worker_id"]
                        worker_to_kill = worker_id_raw.decode("utf-8")
                        worker_type = self.task_worker_map[task_id]["task_type"]
                        log.debug(
                            "Cancelling task running on worker: %s",
                            self.task_worker_map[task_id],
                        )
                        try:
                            log.info(
                                f"Removing worker:{worker_id_raw} from map")
                            self.worker_map.start_remove_worker(worker_type)
                            self.worker_map.remove_worker(worker_id_raw)
                            log.info(
                                f"Popping worker:{worker_to_kill} from worker_procs"
                            )
                            proc = self.worker_procs.pop(worker_to_kill)
                            log.warning(
                                f"Sending process:{proc.pid} terminate signal")
                            proc.terminate()
                            try:
                                proc.wait(1)  # Wait 1 second before attempting SIGKILL
                            except subprocess.TimeoutExpired:
                                log.exception(
                                    "Process did not terminate in 1 second")
                                log.warning(
                                    f"Sending process:{proc.pid} kill signal")
                                proc.kill()
                            else:
                                log.debug(
                                    f"Worker process exited with : {proc.returncode}"
                                )

                            raise TaskCancelled(worker_to_kill, self.uid)
                        except Exception as e:
                            log.exception(f"Raised exception, handling: {e}")
                            result_package = {
                                "task_id": task_id,
                                "container_id": worker_type,
                                "exception": self.serializer.serialize(
                                    RemoteExceptionWrapper(*sys.exc_info())),
                            }
                            self.pending_result_queue.put(
                                pickle.dumps(result_package))

                        worker_proc = self.worker_map.add_worker(
                            worker_id=str(self.worker_map.worker_id_counter),
                            worker_type=self.worker_type,
                            container_cmd_options=self.container_cmd_options,
                            address=self.address,
                            debug=self.debug,
                            uid=self.uid,
                            logdir=self.logdir,
                            worker_port=self.worker_port,
                        )
                        self.worker_procs.update(worker_proc)
                        self.task_worker_map.pop(task_id)
                        self.remove_task(task_id)

                elif message == HEARTBEAT_CODE:
                    log.debug("Got heartbeat from interchange")

                else:
                    tasks = [(rt["local_container"],
                              Message.unpack(rt["raw_buffer"]))
                             for rt in message]

                    task_recv_counter += len(tasks)
                    log.debug("[TASK_PULL_THREAD] Got tasks: {} of {}".format(
                        [t[1].task_id for t in tasks], task_recv_counter))

                    for task_type, task in tasks:
                        log.debug(f"[TASK DEBUG] Task is of type: {task_type}")

                        if task_type not in self.task_queues:
                            self.task_queues[task_type] = queue.Queue()
                        if task_type not in self.outstanding_task_count:
                            self.outstanding_task_count[task_type] = 0
                        self.task_queues[task_type].put(task)
                        self.outstanding_task_count[task_type] += 1
                        self.task_type_mapping[task.task_id] = task_type
                        log.debug(
                            "Got task: Outstanding task counts: {}".format(
                                self.outstanding_task_count))
                        log.debug(
                            f"Task {task} pushed to a task queue {task_type}")

            else:
                log.debug("[TASK_PULL_THREAD] No incoming tasks")
                # Limit poll duration to heartbeat_period
                # heartbeat_period is in s vs poll_timer in ms
                if not poll_timer:
                    poll_timer = self.poll_period
                poll_timer = min(self.heartbeat_period * 1000, poll_timer * 2)

                # Only check if no messages were received.
                if time.time() > last_interchange_contact + self.heartbeat_threshold:
                    log.critical(
                        "[TASK_PULL_THREAD] Missing contact with interchange beyond "
                        "heartbeat_threshold")
                    kill_event.set()
                    log.critical("Killing all workers")
                    for proc in self.worker_procs.values():
                        proc.kill()
                    log.critical("[TASK_PULL_THREAD] Exiting")
                    break

            log.debug(f"To-Die Counts: {self.worker_map.to_die_count}")
            log.debug("Alive worker counts: {}".format(
                self.worker_map.total_worker_type_counts))

            new_worker_map = naive_scheduler(
                self.task_queues,
                self.outstanding_task_count,
                self.max_worker_count,
                new_worker_map,
                self.worker_map.to_die_count,
            )
            log.debug(f"[SCHEDULER] New worker map: {new_worker_map}")

            # NOTE: Wipes the queue -- previous scheduling loops don't affect what's
            # needed now.
            self.next_worker_q, need_more = self.worker_map.get_next_worker_q(
                new_worker_map)

            # Spin up any new workers according to the worker queue.
            # Returns the total number of containers that have spun up.
            self.worker_procs.update(
                self.worker_map.spin_up_workers(
                    self.next_worker_q,
                    mode=self.worker_mode,
                    debug=self.debug,
                    container_cmd_options=self.container_cmd_options,
                    address=self.address,
                    uid=self.uid,
                    logdir=self.logdir,
                    worker_port=self.worker_port,
                ))
            log.debug(f"[SPIN UP] Worker processes: {self.worker_procs}")

            #  Count the workers of each type that need to be removed
            spin_downs, container_switch_count = self.worker_map.spin_down_workers(
                new_worker_map,
                worker_max_idletime=self.worker_max_idletime,
                need_more=need_more,
                scheduler_mode=self.scheduler_mode,
            )
            self.container_switch_count += container_switch_count
            log.debug("Container switch count: total {}, cur {}".format(
                self.container_switch_count, container_switch_count))

            for w_type in spin_downs:
                self.remove_worker_init(w_type)

            current_worker_map = self.worker_map.get_worker_counts()
            for task_type in current_worker_map:
                if task_type == "unused":
                    continue

                # *** Match tasks to workers *** #
                else:
                    available_workers = current_worker_map[task_type]
                    log.debug("Available workers of type {}: {}".format(
                        task_type, available_workers))

                    for _i in range(available_workers):
                        if (task_type in self.task_queues
                                and self.task_queues[task_type].qsize() != 0
                                and self.worker_map.worker_queues[task_type].qsize() != 0):

                            log.debug(
                                "Task type {} has task queue size {}".format(
                                    task_type,
                                    self.task_queues[task_type].qsize()))
                            log.debug("... and available workers: {}".format(
                                self.worker_map.worker_queues[task_type].qsize(
                                )))

                            self.send_task_to_worker(task_type)
Example no. 13
    def start(self):

        log.info("Starting worker")

        result = self.registration_message()
        task_type = b"REGISTER"
        log.debug("Sending registration")
        self.task_socket.send_multipart([
            task_type,  # Byte encoded
            pickle.dumps(result)
        ])

        while True:

            log.debug("Waiting for task")
            p_task_id, p_container_id, msg = self.task_socket.recv_multipart()
            task_id = pickle.loads(p_task_id)
            container_id = pickle.loads(p_container_id)
            log.debug(f"Received task_id:{task_id} with task:{msg}")

            result = None
            task_type = None
            if task_id == "KILL":
                task = Message.unpack(msg)
                if task.task_buffer.decode("utf-8") == "KILL":
                    log.info("[KILL] -- Worker KILL message received! ")
                    task_type = b"WRKR_DIE"
                else:
                    log.exception(
                        "Caught an exception of non-KILL message for KILL task"
                    )
                    continue
            else:
                log.debug("Executing task...")

                try:
                    result = self.execute_task(msg)
                    serialized_result = self.serialize(result)

                    if len(serialized_result) > self.result_size_limit:
                        raise MaxResultSizeExceeded(len(serialized_result),
                                                    self.result_size_limit)
                except Exception as e:
                    log.exception(f"Caught an exception {e}")
                    result_package = {
                        "task_id": task_id,
                        "container_id": container_id,
                        "exception": self.serialize(
                            RemoteExceptionWrapper(*sys.exc_info())),
                    }
                else:
                    log.debug("Execution completed without exception")
                    result_package = {
                        "task_id": task_id,
                        "container_id": container_id,
                        "result": serialized_result,
                    }
                result = result_package
                task_type = b"TASK_RET"

            log.debug("Sending result")

            self.task_socket.send_multipart([
                task_type,  # Byte encoded
                pickle.dumps(result)
            ])

            if task_type == b"WRKR_DIE":
                log.info(f"*** WORKER {self.worker_id} ABOUT TO DIE ***")
                # Kill the worker after accepting death in message to manager.
                sys.exit()
                # We need to return here to allow for sys.exit mocking in tests
                return

        log.warning("Broke out of the loop... dying")
Example no. 14
def worker(worker_id, pool_id, pool_size, task_queue, result_queue,
           worker_queue, tasks_in_progress, cpu_affinity,
           accelerator: Optional[str]):
    """

    Put request token into queue
    Get task from task_queue
    Pop request from queue
    Put result into result_queue
    """

    # override the global logger inherited from the __main__ process (which
    # usually logs to manager.log) with one specific to this worker.
    global logger
    logger = start_file_logger(
        '{}/block-{}/{}/worker_{}.log'.format(args.logdir, args.block_id,
                                              pool_id, worker_id),
        worker_id,
        name="worker_log",
        level=logging.DEBUG if args.debug else logging.INFO)

    # Store worker ID as an environment variable
    os.environ['PARSL_WORKER_RANK'] = str(worker_id)
    os.environ['PARSL_WORKER_COUNT'] = str(pool_size)
    os.environ['PARSL_WORKER_POOL_ID'] = str(pool_id)
    os.environ['PARSL_WORKER_BLOCK_ID'] = str(args.block_id)

    # share the result queue with monitoring code so it too can send results down that channel
    import parsl.executors.high_throughput.monitoring_info as mi
    mi.result_queue = result_queue

    # Sync worker with master
    logger.info('Worker {} started'.format(worker_id))
    if args.debug:
        logger.debug("Debug logging enabled")

    # If desired, set process affinity
    if cpu_affinity != "none":
        # Count the number of cores per worker
        avail_cores = sorted(
            os.sched_getaffinity(0))  # Get the available processors
        cores_per_worker = len(avail_cores) // pool_size
        assert cores_per_worker > 0, "Affinity does not work if there are more workers than cores"

        # Determine this worker's cores
        if cpu_affinity == "block":
            my_cores = avail_cores[cores_per_worker *
                                   worker_id:cores_per_worker *
                                   (worker_id + 1)]
        elif cpu_affinity == "alternating":
            my_cores = avail_cores[worker_id::pool_size]
        else:
            raise ValueError(
                "Affinity strategy {} is not supported".format(cpu_affinity))

        # Set the affinity for this worker
        os.sched_setaffinity(0, my_cores)
        logger.info("Set worker CPU affinity to {}".format(my_cores))

    # If desired, pin to accelerator
    if accelerator is not None:
        os.environ["CUDA_VISIBLE_DEVICES"] = accelerator
        os.environ["ROCR_VISIBLE_DEVICES"] = accelerator
        os.environ["SYCL_DEVICE_FILTER"] = f"*:*:{accelerator}"
        logger.info(f'Pinned worker to accelerator: {accelerator}')

    while True:
        worker_queue.put(worker_id)

        # The worker will receive {'task_id':<tid>, 'buffer':<buf>}
        req = task_queue.get()
        tasks_in_progress[worker_id] = req
        tid = req['task_id']
        logger.info("Received task {}".format(tid))

        try:
            worker_queue.get()
        except queue.Empty:
            logger.warning(
                "Worker ID: {} failed to remove itself from ready_worker_queue"
                .format(worker_id))
            pass

        try:
            result = execute_task(req['buffer'])
            serialized_result = serialize(result, buffer_threshold=1e6)
        except Exception as e:
            logger.info('Caught an exception: {}'.format(e))
            result_package = {
                'type': 'result',
                'task_id': tid,
                'exception': serialize(RemoteExceptionWrapper(*sys.exc_info()))
            }
        else:
            result_package = {
                'type': 'result',
                'task_id': tid,
                'result': serialized_result
            }
            # logger.debug("Result: {}".format(result))

        logger.info("Completed task {}".format(tid))
        try:
            pkl_package = pickle.dumps(result_package)
        except Exception:
            logger.exception(
                "Caught exception while trying to pickle the result package")
            pkl_package = pickle.dumps({
                'type': 'result',
                'task_id': tid,
                'exception': serialize(RemoteExceptionWrapper(*sys.exc_info()))
            })

        result_queue.put(pkl_package)
        tasks_in_progress.pop(worker_id)
        logger.info("All processing finished for task {}".format(tid))
Example no. 15
    def start(self):
        """ Start the interchange
        """
        logger.info("Incoming ports bound")

        hub_channel = self._create_monitoring_channel()

        poll_period = self.poll_period

        start = time.time()
        count = 0

        self._kill_event = threading.Event()
        self._task_puller_thread = threading.Thread(
            target=self.task_puller,
            args=(self._kill_event, ),
            name="Interchange-Task-Puller")
        self._task_puller_thread.start()

        self._command_thread = threading.Thread(target=self._command_server,
                                                args=(self._kill_event, ),
                                                name="Interchange-Command")
        self._command_thread.start()

        poller = zmq.Poller()
        poller.register(self.task_outgoing, zmq.POLLIN)
        poller.register(self.results_incoming, zmq.POLLIN)

        # These are managers which we should examine in an iteration
        # for scheduling a job (or maybe any other attention?).
        # Anything altering the state of the manager should add it
        # onto this list.
        interesting_managers: Set[bytes] = set()

        while not self._kill_event.is_set():
            self.socks = dict(poller.poll(timeout=poll_period))

            # Listen for requests for work
            if self.task_outgoing in self.socks and self.socks[
                    self.task_outgoing] == zmq.POLLIN:
                logger.debug("starting task_outgoing section")
                message = self.task_outgoing.recv_multipart()
                manager_id = message[0]

                if manager_id not in self._ready_managers:
                    reg_flag = False

                    try:
                        msg = json.loads(message[1].decode('utf-8'))
                        reg_flag = True
                    except Exception:
                        logger.warning(
                            "Got Exception reading registration message from manager: {}"
                            .format(manager_id),
                            exc_info=True)
                        logger.debug("Message: \n{}\n".format(message[1]))
                    else:
                        # We set up an entry only if registration works correctly
                        self._ready_managers[manager_id] = {
                            'last_heartbeat': time.time(),
                            'idle_since': time.time(),
                            'free_capacity': 0,
                            'block_id': None,
                            'max_capacity': 0,
                            'worker_count': 0,
                            'active': True,
                            'tasks': []
                        }
                    if reg_flag is True:
                        interesting_managers.add(manager_id)
                        logger.info("Adding manager: {} to ready queue".format(
                            manager_id))
                        m = self._ready_managers[manager_id]
                        m.update(msg)
                        logger.info(
                            "Registration info for manager {}: {}".format(
                                manager_id, msg))
                        self._send_monitoring_info(hub_channel, m)

                        if (msg['python_v'].rsplit(".", 1)[0] !=
                                self.current_platform['python_v'].rsplit(
                                    ".", 1)[0] or msg['parsl_v'] !=
                                self.current_platform['parsl_v']):
                            logger.warning(
                                "Manager {} has incompatible version info with the interchange"
                                .format(manager_id))
                            logger.debug("Setting kill event")
                            self._kill_event.set()
                            e = VersionMismatch(
                                "py.v={} parsl.v={}".format(
                                    self.current_platform['python_v'].rsplit(
                                        ".", 1)[0],
                                    self.current_platform['parsl_v']),
                                "py.v={} parsl.v={}".format(
                                    msg['python_v'].rsplit(".", 1)[0],
                                    msg['parsl_v']))
                            result_package = {
                                'type': 'result',
                                'task_id': -1,
                                'exception': serialize_object(e)
                            }
                            pkl_package = pickle.dumps(result_package)
                            self.results_outgoing.send(pkl_package)
                            logger.warning(
                                "Sent failure reports, unregistering manager")
                        else:
                            logger.info("Manager {} has compatible Parsl version {}".format(manager_id, msg['parsl_v']))
                            logger.info("Manager {} has compatible Python version {}".format(
                                manager_id, msg['python_v'].rsplit(".", 1)[0]))
                    else:
                        # Registration has failed.
                        logger.debug("Suppressing bad registration from manager: {}".format(manager_id))

                else:
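                    # Known manager: the second frame is either a heartbeat or a
                    # task-capacity request, encoded as a little-endian integer.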
                    tasks_requested = int.from_bytes(message[1], "little")
                    self._ready_managers[manager_id]['last_heartbeat'] = time.time()
                    if tasks_requested == HEARTBEAT_CODE:
                        logger.debug("Manager {} sent heartbeat via tasks connection".format(manager_id))
                        self.task_outgoing.send_multipart([manager_id, b'', PKL_HEARTBEAT_CODE])
                    else:
                        logger.debug("Manager {} requested {} tasks".format(manager_id, tasks_requested))
                        self._ready_managers[manager_id]['free_capacity'] = tasks_requested
                        interesting_managers.add(manager_id)
                logger.debug("leaving task_outgoing section")

            # If any requests were received, check whether there are tasks that can be dispatched

            logger.debug(
                "Managers count (interesting/total): {interesting}/{total}".
                format(total=len(self._ready_managers),
                       interesting=len(interesting_managers)))

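            # Task dispatch pass: visit interesting managers in random order and send
            # each one up to its real (advertised and uncommitted) capacity.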
            if interesting_managers and not self.pending_task_queue.empty():
                shuffled_managers = list(interesting_managers)
                random.shuffle(shuffled_managers)

                while shuffled_managers and not self.pending_task_queue.empty():  # cf. the if statement above
                    manager_id = shuffled_managers.pop()
                    m = self._ready_managers[manager_id]
                    tasks_inflight = len(m['tasks'])
                    real_capacity = min(m['free_capacity'],
                                        m['max_capacity'] - tasks_inflight)

                    if (real_capacity and m['active']):
                        tasks = self.get_tasks(real_capacity)
                        if tasks:
                            self.task_outgoing.send_multipart(
                                [manager_id, b'',
                                 pickle.dumps(tasks)])
                            task_count = len(tasks)
                            count += task_count
                            tids = [t['task_id'] for t in tasks]
                            m['free_capacity'] -= task_count
                            m['tasks'].extend(tids)
                            m['idle_since'] = None
                            logger.debug("Sent tasks: {} to manager {}".format(
                                tids, manager_id))
                            if m['free_capacity'] > 0:
                                logger.debug(
                                    "Manager {} has free_capacity {}".format(
                                        manager_id, m['free_capacity']))
                                # ... so keep it in the interesting_managers list
                            else:
                                logger.debug(
                                    "Manager {} is now saturated".format(
                                        manager_id))
                                interesting_managers.remove(manager_id)
                    else:
                        interesting_managers.remove(manager_id)
                        # logger.debug("Nothing to send to manager {}".format(manager_id))
                logger.debug(
                    "leaving _ready_managers section, with {} managers still interesting"
                    .format(len(interesting_managers)))
            else:
                logger.debug(
                    "either no interesting managers or no tasks, so skipping manager pass"
                )
            # Receive any results and forward to client
            if self.results_incoming in self.socks and self.socks[self.results_incoming] == zmq.POLLIN:
                logger.debug("entering results_incoming section")
                manager_id, *all_messages = self.results_incoming.recv_multipart()
                if manager_id not in self._ready_managers:
                    logger.warning("Received a result from an unregistered manager: {}".format(manager_id))
                else:
                    logger.debug(
                        f"Got {len(all_messages)} result items in batch from manager {manager_id}"
                    )

                    b_messages = []

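                    # Classify each pickled item: results and heartbeats are kept in
                    # b_messages, monitoring payloads go straight to the hub, and
                    # anything else is discarded.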
                    for p_message in all_messages:
                        r = pickle.loads(p_message)
                        if r['type'] == 'result':
                            # process this for task ID and forward to executor
                            b_messages.append((p_message, r))
                        elif r['type'] == 'monitoring':
                            hub_channel.send_pyobj(r['payload'])
                        elif r['type'] == 'heartbeat':
                            logger.debug(
                                f"Manager {manager_id} sent heartbeat via results connection"
                            )
                            b_messages.append((p_message, r))
                        else:
                            logger.error(
                                "Interchange discarding result_queue message of unknown type: {}"
                                .format(r['type']))

                    m = self._ready_managers[manager_id]
                    for (b_message, r) in b_messages:
                        assert 'type' in r, f"Message is missing type entry: {r}"
                        if r['type'] == 'result':
                            try:
                                logger.debug(
                                    f"Removing task {r['task_id']} from manager record {manager_id}"
                                )
                                m['tasks'].remove(r['task_id'])
                            except Exception:
                                # If we reach here, there's something very wrong.
                                logger.exception(
                                    "Ignoring exception removing task_id {} for manager {} with task list {}"
                                    .format(r['task_id'], manager_id,
                                            m['tasks']))

                    b_messages_to_send = [b_message for (b_message, _) in b_messages]

                    if b_messages_to_send:
                        logger.debug("Sending messages on results_outgoing")
                        self.results_outgoing.send_multipart(
                            b_messages_to_send)
                        logger.debug("Sent messages on results_outgoing")

                    logger.debug(
                        f"Current tasks on manager {manager_id}: {m['tasks']}")
                    if len(m['tasks']) == 0 and m['idle_since'] is None:
                        m['idle_since'] = time.time()
                logger.debug("leaving results_incoming section")

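            # Heartbeat sweep: any manager silent for longer than the threshold is
            # deactivated and its outstanding tasks are failed with ManagerLost.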
            bad_managers = [
                (manager_id, m)
                for (manager_id, m) in self._ready_managers.items()
                if time.time() - m['last_heartbeat'] > self.heartbeat_threshold
            ]
            for (manager_id, m) in bad_managers:
                logger.debug("Last: {} Current: {}".format(
                    m['last_heartbeat'], time.time()))
                logger.warning(
                    f"Too many heartbeats missed for manager {manager_id} - removing manager"
                )
                if m['active']:
                    m['active'] = False
                    self._send_monitoring_info(hub_channel, m)

                logger.warning(
                    f"Cancelling htex tasks {m['tasks']} on removed manager")
                for tid in m['tasks']:
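                    # Raising and immediately catching ManagerLost populates
                    # sys.exc_info() for the RemoteExceptionWrapper below.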
                    try:
                        raise ManagerLost(manager_id, m['hostname'])
                    except Exception:
                        result_package = {
                            'type': 'result',
                            'task_id': tid,
                            'exception': serialize_object(RemoteExceptionWrapper(*sys.exc_info()))
                        }
                        pkl_package = pickle.dumps(result_package)
                        self.results_outgoing.send(pkl_package)
                logger.warning("Sent failure reports, unregistering manager")
                self._ready_managers.pop(manager_id, None)
                if manager_id in interesting_managers:
                    interesting_managers.remove(manager_id)

        delta = time.time() - start
        logger.info("Processed {} tasks in {} seconds".format(count, delta))
        logger.warning("Exiting")
Example n. 16
0
def worker(worker_id, pool_id, pool_size, task_queue, result_queue,
           worker_queue, tasks_in_progress):
    """

    Put request token into queue
    Get task from task_queue
    Pop request from queue
    Put result into result_queue
    """
    start_file_logger('{}/block-{}/{}/worker_{}.log'.format(
        args.logdir, args.block_id, pool_id, worker_id),
                      worker_id,
                      name="worker_log",
                      level=logging.DEBUG if args.debug else logging.INFO)

    # Store worker ID as an environment variable
    os.environ['PARSL_WORKER_RANK'] = str(worker_id)
    os.environ['PARSL_WORKER_COUNT'] = str(pool_size)
    os.environ['PARSL_WORKER_POOL_ID'] = str(pool_id)

    # Sync worker with master
    logger.info('Worker {} started'.format(worker_id))
    if args.debug:
        logger.debug("Debug logging enabled")

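    # Main worker loop: advertise readiness on worker_queue, pull a task from
    # task_queue, execute it, and push a pickled result package to result_queue.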
    while True:
        worker_queue.put(worker_id)

        # The worker will receive {'task_id':<tid>, 'buffer':<buf>}
        req = task_queue.get()
        tasks_in_progress[worker_id] = req
        tid = req['task_id']
        logger.info("Received task {}".format(tid))

        try:
            worker_queue.get()
        except queue.Empty:
            logger.warning(
                "Worker ID: {} failed to remove itself from ready_worker_queue".format(worker_id))

        try:
            result = execute_task(req['buffer'])
            serialized_result = serialize(result, buffer_threshold=1e6)
        except Exception as e:
            logger.info('Caught an exception: {}'.format(e))
            result_package = {
                'task_id': tid,
                'exception': serialize(RemoteExceptionWrapper(*sys.exc_info()))
            }
        else:
            result_package = {'task_id': tid, 'result': serialized_result}
            # logger.debug("Result: {}".format(result))

        logger.info("Completed task {}".format(tid))
        try:
            pkl_package = pickle.dumps(result_package)
        except Exception:
            logger.exception(
                "Caught exception while trying to pickle the result package")
            pkl_package = pickle.dumps({
                'task_id': tid,
                'exception': serialize(RemoteExceptionWrapper(*sys.exc_info()))
            })

        result_queue.put(pkl_package)
        tasks_in_progress.pop(worker_id)
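
On the consuming side of result_queue, each entry is a pickled dict keyed by 'task_id' plus either 'result' or 'exception'. The sketch below assumes the payloads were produced with parsl.serialize.serialize as in the worker above, and that the deserialized exception entry is a RemoteExceptionWrapper whose reraise() restores the remote error; treat the import paths as assumptions rather than a fixed API.

import pickle

# Assumed to be the counterpart of the serialize() used in the worker above.
from parsl.serialize import deserialize


def handle_result_package(pkl_package):
    # Unpack one package pushed onto result_queue by the worker loop.
    package = pickle.loads(pkl_package)
    tid = package['task_id']

    if 'exception' in package:
        # The exception entry wraps a RemoteExceptionWrapper; re-raising it here
        # surfaces the original error from the worker process.
        wrapper = deserialize(package['exception'])
        try:
            wrapper.reraise()
        except Exception as e:
            print("Task {} failed: {}".format(tid, e))
    else:
        print("Task {} returned: {}".format(tid, deserialize(package['result'])))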