def wrapper(*args: object, **kwargs: object) -> Any:
    import sys
    from parsl.app.errors import RemoteExceptionWrapper
    try:
        return func(*args, **kwargs)  # type: ignore
    except Exception:
        return RemoteExceptionWrapper(*sys.exc_info())
def wrapper(*args, **kwargs):
    import sys
    from parsl.app.errors import RemoteExceptionWrapper
    try:
        return func(*args, **kwargs)
    except Exception:
        return RemoteExceptionWrapper(*sys.exc_info())
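# A minimal sketch (not from the original source) of how results produced by
# wrappers like the two above can be consumed on the submitting side. It
# assumes RemoteExceptionWrapper from parsl.app.errors exposes a reraise()
# method; the helper names below are illustrative only.
import sys
from parsl.app.errors import RemoteExceptionWrapper


def safe_call(func, *args, **kwargs):
    """Run func locally, packaging any exception the way the wrappers above do."""
    try:
        return func(*args, **kwargs)
    except Exception:
        return RemoteExceptionWrapper(*sys.exc_info())


def unwrap(result):
    """Re-raise the original exception if the call failed, else pass the result through."""
    if isinstance(result, RemoteExceptionWrapper):
        result.reraise()
    return result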
def worker(comm, rank):
    logger.info("Worker started")

    # Sync worker with master
    comm.Barrier()
    logger.debug("Synced")

    task_request = b'TREQ'

    while True:
        comm.send(task_request, dest=0, tag=TASK_REQUEST_TAG)
        # The worker will receive {'task_id':<tid>, 'buffer':<buf>}
        req = comm.recv(source=0, tag=rank)
        logger.debug("Got req: {}".format(req))
        tid = req['task_id']
        logger.debug("Got task: {}".format(tid))

        try:
            result = execute_task(req['buffer'])
        except Exception as e:
            result_package = {'task_id': tid, 'exception': serialize(RemoteExceptionWrapper(*sys.exc_info()))}
            logger.debug("No result due to exception: {} with result package {}".format(e, result_package))
        else:
            result_package = {'task_id': tid, 'result': serialize(result)}
            logger.debug("Result: {}".format(result))

        pkl_package = pickle.dumps(result_package)
        comm.send(pkl_package, dest=0, tag=RESULT_TAG)
def start(self):
    logger.info("Starting worker")

    result = self.registration_message()
    task_type = b'REGISTER'

    while True:
        logger.debug("Sending result")
        # TODO : Swap for our serialization methods
        self.task_socket.send_multipart([
            task_type,  # Byte encoded
            pickle.dumps(result)
        ])

        if task_type == b'WRKR_DIE':
            logger.info("*** WORKER {} ABOUT TO DIE ***".format(self.worker_id))
            exit()  # Kill the worker after accepting death in message to manager.

        logger.debug("Waiting for task")
        p_task_id, msg = self.task_socket.recv_multipart()
        task_id = pickle.loads(p_task_id)
        logger.debug("Received task_id:{} with task:{}".format(task_id, msg))

        if task_id == "KILL":
            logger.info("[KILL] -- Worker KILL message received! ")
            task_type = b'WRKR_DIE'
            result = None
        else:
            logger.debug("Executing task...")
            try:
                result = self.execute_task(msg)
                logger.debug("Executed result: {}".format(result))
                serialized_result = serialize_object(result)
            except Exception:
                logger.exception("Caught an exception")
                result_package = {
                    'task_id': task_id,
                    'exception': serialize_object(RemoteExceptionWrapper(*sys.exc_info()))
                }
            else:
                logger.debug("Execution completed without exception")
                result_package = {'task_id': task_id, 'result': serialized_result}

            # TODO: Change this to serialize_object to match IX?
            result = result_package
            task_type = b'TASK_RET'

    logger.warning("Broke out of the loop... dying")
def worker_watchdog(self, kill_event): """ Listens on the pending_result_queue and sends out results via 0mq Parameters: ----------- kill_event : threading.Event Event to let the thread know when it is time to die. """ logger.debug("[WORKER_WATCHDOG_THREAD] Starting thread") while not kill_event.is_set(): for worker_id, p in self.procs.items(): if not p.is_alive(): logger.info( "[WORKER_WATCHDOG_THREAD] Worker {} has died".format( worker_id)) try: task = self._tasks_in_progress.pop(worker_id) logger.info( "[WORKER_WATCHDOG_THREAD] Worker {} was busy when it died" .format(worker_id)) try: raise WorkerLost(worker_id, platform.node()) except Exception: logger.info( "[WORKER_WATCHDOG_THREAD] Putting exception for task {} in the pending result queue" .format(task['task_id'])) result_package = { 'task_id': task['task_id'], 'exception': serialize_object( RemoteExceptionWrapper(*sys.exc_info())) } pkl_package = pickle.dumps(result_package) self.pending_result_queue.put(pkl_package) except KeyError: logger.info( "[WORKER_WATCHDOG_THREAD] Worker {} was not busy when it died" .format(worker_id)) p = multiprocessing.Process( target=worker, args=(worker_id, self.uid, self.worker_count, self.pending_task_queue, self.pending_result_queue, self.ready_worker_queue, self._tasks_in_progress), name="HTEX-Worker-{}".format(worker_id)) self.procs[worker_id] = p logger.info( "[WORKER_WATCHDOG_THREAD] Worker {} has been restarted" .format(worker_id)) time.sleep(self.poll_period) logger.critical("[WORKER_WATCHDOG_THREAD] Exiting")
def worker(worker_id, pool_id, task_queue, result_queue, worker_queue, tasks_in_progress):
    """
    Put request token into queue
    Get task from task_queue
    Pop request from queue
    Put result into result_queue
    """
    start_file_logger('{}/{}/worker_{}.log'.format(args.logdir, pool_id, worker_id),
                      worker_id,
                      name="worker_log",
                      level=logging.DEBUG if args.debug else logging.INFO)

    # Sync worker with master
    logger.info('Worker {} started'.format(worker_id))
    if args.debug:
        logger.debug("Debug logging enabled")

    while True:
        worker_queue.put(worker_id)

        # The worker will receive {'task_id':<tid>, 'buffer':<buf>}
        req = task_queue.get()
        tasks_in_progress[worker_id] = req
        tid = req['task_id']
        logger.info("Received task {}".format(tid))

        try:
            worker_queue.get()
        except queue.Empty:
            logger.warning("Worker ID: {} failed to remove itself from ready_worker_queue".format(worker_id))
            pass

        try:
            result = execute_task(req['buffer'])
            serialized_result = serialize_object(result)
        except Exception as e:
            logger.info('Caught an exception: {}'.format(e))
            result_package = {'task_id': tid, 'exception': serialize_object(RemoteExceptionWrapper(*sys.exc_info()))}
        else:
            result_package = {'task_id': tid, 'result': serialized_result}
            # logger.debug("Result: {}".format(result))

        logger.info("Completed task {}".format(tid))
        pkl_package = pickle.dumps(result_package)

        result_queue.put(pkl_package)
        tasks_in_progress.pop(worker_id)
def worker_watchdog(self, kill_event): """Keeps workers alive. Parameters: ----------- kill_event : threading.Event Event to let the thread know when it is time to die. """ logger.debug("Starting worker watchdog") while not kill_event.is_set(): for worker_id, p in self.procs.items(): if not p.is_alive(): logger.info("Worker {} has died".format(worker_id)) try: task = self._tasks_in_progress.pop(worker_id) logger.info("Worker {} was busy when it died".format( worker_id)) try: raise WorkerLost(worker_id, platform.node()) except Exception: logger.info( "Putting exception for task {} in the pending result queue" .format(task['task_id'])) result_package = { 'type': 'result', 'task_id': task['task_id'], 'exception': serialize( RemoteExceptionWrapper(*sys.exc_info())) } pkl_package = pickle.dumps(result_package) self.pending_result_queue.put(pkl_package) except KeyError: logger.info( "Worker {} was not busy when it died".format( worker_id)) p = mpProcess(target=worker, args=(worker_id, self.uid, self.worker_count, self.pending_task_queue, self.pending_result_queue, self.ready_worker_queue, self._tasks_in_progress, self.cpu_affinity), name="HTEX-Worker-{}".format(worker_id)) self.procs[worker_id] = p logger.info( "Worker {} has been restarted".format(worker_id)) time.sleep(self.heartbeat_period) logger.critical("Exiting")
def WorkQueueCollectorThread(collector_queue=multiprocessing.Queue(),
                             tasks={},
                             tasks_lock=threading.Lock(),
                             cancel_value=multiprocessing.Value('i', 1),
                             submit_process=None,
                             executor=None):
    logger.debug("Starting Collector Thread")

    continue_running = True
    while continue_running:
        if cancel_value.value == 0:
            continue_running = False
            continue

        # The WorkQueue process that creates task has died
        if not submit_process.is_alive() and cancel_value.value != 0:
            raise ExecutorError(executor, "Workqueue Submit Process is not alive")

        # Get the result message from the collector_queue
        try:
            item = collector_queue.get(timeout=1)
        except queue.Empty:
            continue

        parsl_tid = item["tid"]
        received = item["result_received"]

        # Obtain the future from the tasks dictionary
        tasks_lock.acquire()
        future = tasks[parsl_tid]
        tasks_lock.release()

        # Failed task
        if received is False:
            reason = item["reason"]
            status = item["status"]
            future.set_exception(AppFailure(reason, status))
        # Successful task
        else:
            result = item["result"]
            future_update, _ = deserialize_object(result["result"])
            logger.debug("Updating Future for Parsl Task {}".format(parsl_tid))
            if result["failure"] is False:
                future.set_result(future_update)
            else:
                future.set_exception(RemoteExceptionWrapper(*future_update))

    logger.debug("Exiting Collector Thread")
    return
# result_file: any output (including exceptions) will be written to
#              this file.
try:
    try:
        (map_file, function_file, result_file) = sys.argv[1:]
    except ValueError:
        print("Usage:\n\t{} function result mapping\n".format(sys.argv[0]))
        raise

    try:
        (namespace, function_code, result_name) = load_function(map_file, function_file)
    except Exception:
        print("There was an error setting up the function for execution.")
        raise

    try:
        result = execute_function(namespace, function_code, result_name)
    except Exception:
        print("There was an error executing the function.")
        raise
except Exception:
    traceback.print_exc()
    result = RemoteExceptionWrapper(*sys.exc_info())

# Write out function result to the result file
try:
    dump_result_to_file(result_file, result)
except Exception:
    print("Could not write to result file.")
    traceback.print_exc()
    sys.exit(1)
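# Hypothetical sketch (not the original Parsl helper): the script above ends by
# handing the result -- which may itself be a RemoteExceptionWrapper -- to
# dump_result_to_file. A minimal stand-in could simply pickle it to the named
# file; the real implementation may use Parsl's own serialization instead.
import pickle


def dump_result_to_file(result_file, result):
    """Write the (possibly exception-wrapping) result object to result_file."""
    with open(result_file, "wb") as f:
        pickle.dump(result, f)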
def start(self, poll_period=None):
    """ Start the NeedNameQeueu

    Parameters:
    ----------
    TODO: Move task receiving to a thread
    """
    logger.info("Incoming ports bound")

    if poll_period is None:
        poll_period = self.poll_period

    start = time.time()
    count = 0

    self._kill_event = threading.Event()
    self._task_puller_thread = threading.Thread(target=self.migrate_tasks_to_internal,
                                                args=(self._kill_event, ),
                                                name="Interchange-Task-Puller")
    self._task_puller_thread.start()

    self._command_thread = threading.Thread(target=self._command_server,
                                            args=(self._kill_event, ),
                                            name="Interchange-Command")
    self._command_thread.start()

    poller = zmq.Poller()
    # poller.register(self.task_incoming, zmq.POLLIN)
    poller.register(self.task_outgoing, zmq.POLLIN)
    poller.register(self.results_incoming, zmq.POLLIN)

    # These are managers which we should examine in an iteration
    # for scheduling a job (or maybe any other attention?).
    # Anything altering the state of the manager should add it
    # onto this list.
    interesting_managers = set()

    while not self._kill_event.is_set():
        self.socks = dict(poller.poll(timeout=poll_period))

        # Listen for requests for work
        if self.task_outgoing in self.socks and self.socks[self.task_outgoing] == zmq.POLLIN:
            logger.debug("[MAIN] starting task_outgoing section")
            message = self.task_outgoing.recv_multipart()
            manager = message[0]

            if manager not in self._ready_manager_queue:
                reg_flag = False

                try:
                    msg = json.loads(message[1].decode('utf-8'))
                    msg['reg_time'] = datetime.datetime.strptime(msg['reg_time'], "%Y-%m-%d %H:%M:%S")
                    reg_flag = True
                except Exception:
                    logger.warning("[MAIN] Got Exception reading registration message from manager: {}".format(manager), exc_info=True)
                    logger.debug("[MAIN] Message :\n{}\n".format(message[0]))

                # By default we set up to ignore bad nodes/registration messages.
                self._ready_manager_queue[manager] = {'last': time.time(),
                                                      'free_capacity': 0,
                                                      'block_id': None,
                                                      'max_capacity': 0,
                                                      'worker_count': 0,
                                                      'active': True,
                                                      'tasks': []}
                if reg_flag is True:
                    interesting_managers.add(manager)
                    logger.info("[MAIN] Adding manager: {} to ready queue".format(manager))
                    self._ready_manager_queue[manager].update(msg)
                    logger.info("[MAIN] Registration info for manager {}: {}".format(manager, msg))

                    if self.monitoring_enabled:
                        logger.info("Sending message {} to hub".format(self._ready_manager_queue[manager]))
                        self.hub_channel.send_pyobj((MessageType.NODE_INFO,
                                                     self._ready_manager_queue[manager]))

                    if (msg['python_v'].rsplit(".", 1)[0] != self.current_platform['python_v'].rsplit(".", 1)[0] or
                            msg['parsl_v'] != self.current_platform['parsl_v']):
                        logger.warn("[MAIN] Manager {} has incompatible version info with the interchange".format(manager))

                        if self.suppress_failure is False:
                            logger.debug("Setting kill event")
                            self._kill_event.set()
                            e = ManagerLost(manager, self._ready_manager_queue[manager]['hostname'])
                            result_package = {'task_id': -1, 'exception': serialize_object(e)}
                            pkl_package = pickle.dumps(result_package)
                            self.results_outgoing.send(pkl_package)
                            logger.warning("[MAIN] Sent failure reports, unregistering manager")
                        else:
                            logger.debug("[MAIN] Suppressing shutdown due to version incompatibility")
                    else:
                        logger.info("[MAIN] Manager {} has compatible Parsl version {}".format(manager, msg['parsl_v']))
                        logger.info("[MAIN] Manager {} has compatible Python version {}".format(manager, msg['python_v'].rsplit(".", 1)[0]))
                else:
                    # Registration has failed.
                    if self.suppress_failure is False:
                        self._kill_event.set()
                        e = BadRegistration(manager, critical=True)
                        result_package = {'task_id': -1, 'exception': serialize_object(e)}
                        pkl_package = pickle.dumps(result_package)
                        self.results_outgoing.send(pkl_package)
                    else:
                        logger.debug("[MAIN] Suppressing bad registration from manager:{}".format(manager))

            else:
                tasks_requested = int.from_bytes(message[1], "little")
                self._ready_manager_queue[manager]['last'] = time.time()
                if tasks_requested == HEARTBEAT_CODE:
                    logger.debug("[MAIN] Manager {} sent heartbeat".format(manager))
                    self.task_outgoing.send_multipart([manager, b'', PKL_HEARTBEAT_CODE])
                else:
                    logger.debug("[MAIN] Manager {} requested {} tasks".format(manager, tasks_requested))
                    self._ready_manager_queue[manager]['free_capacity'] = tasks_requested
                    interesting_managers.add(manager)
            logger.debug("[MAIN] leaving task_outgoing section")

        # If we had received any requests, check if there are tasks that could be passed
        logger.debug("Managers count (total/interesting): {}/{}".format(len(self._ready_manager_queue),
                                                                        len(interesting_managers)))

        if interesting_managers and not self.pending_task_queue.empty():
            shuffled_managers = list(interesting_managers)
            random.shuffle(shuffled_managers)

            while shuffled_managers and not self.pending_task_queue.empty():  # cf. the if statement above...
                manager = shuffled_managers.pop()
                tasks_inflight = len(self._ready_manager_queue[manager]['tasks'])
                real_capacity = min(self._ready_manager_queue[manager]['free_capacity'],
                                    self._ready_manager_queue[manager]['max_capacity'] - tasks_inflight)

                if (real_capacity and self._ready_manager_queue[manager]['active']):
                    tasks = self.get_tasks(real_capacity)
                    if tasks:
                        self.task_outgoing.send_multipart([manager, b'', pickle.dumps(tasks)])
                        task_count = len(tasks)
                        count += task_count
                        tids = [t['task_id'] for t in tasks]
                        self._ready_manager_queue[manager]['free_capacity'] -= task_count
                        self._ready_manager_queue[manager]['tasks'].extend(tids)
                        logger.debug("[MAIN] Sent tasks: {} to manager {}".format(tids, manager))
                        if self._ready_manager_queue[manager]['free_capacity'] > 0:
                            logger.debug("[MAIN] Manager {} has free_capacity {}".format(manager, self._ready_manager_queue[manager]['free_capacity']))
                            # ... so keep it in the interesting_managers list
                        else:
                            logger.debug("[MAIN] Manager {} is now saturated".format(manager))
                            interesting_managers.remove(manager)
                else:
                    interesting_managers.remove(manager)
                    # logger.debug("Nothing to send to manager {}".format(manager))
            logger.debug("[MAIN] leaving _ready_manager_queue section, with {} managers still interesting".format(len(interesting_managers)))
        else:
            logger.debug("[MAIN] either no interesting managers or no tasks, so skipping manager pass")

        # Receive any results and forward to client
        if self.results_incoming in self.socks and self.socks[self.results_incoming] == zmq.POLLIN:
            logger.debug("[MAIN] entering results_incoming section")
            manager, *b_messages = self.results_incoming.recv_multipart()
            if manager not in self._ready_manager_queue:
                logger.warning("[MAIN] Received a result from a un-registered manager: {}".format(manager))
            else:
                logger.debug("[MAIN] Got {} result items in batch".format(len(b_messages)))
                for b_message in b_messages:
                    r = pickle.loads(b_message)
                    # logger.debug("[MAIN] Received result for task {} from {}".format(r['task_id'], manager))
                    self._ready_manager_queue[manager]['tasks'].remove(r['task_id'])
                self.results_outgoing.send_multipart(b_messages)
                logger.debug("[MAIN] Current tasks: {}".format(self._ready_manager_queue[manager]['tasks']))
            logger.debug("[MAIN] leaving results_incoming section")

        bad_managers = [manager for manager in self._ready_manager_queue if
                        time.time() - self._ready_manager_queue[manager]['last'] > self.heartbeat_threshold]
        for manager in bad_managers:
            logger.debug("[MAIN] Last: {} Current: {}".format(self._ready_manager_queue[manager]['last'], time.time()))
            logger.warning("[MAIN] Too many heartbeats missed for manager {}".format(manager))
            for tid in self._ready_manager_queue[manager]['tasks']:
                try:
                    raise ManagerLost(manager, self._ready_manager_queue[manager]['hostname'])
                except Exception:
                    result_package = {'task_id': tid, 'exception': serialize_object(RemoteExceptionWrapper(*sys.exc_info()))}
                    pkl_package = pickle.dumps(result_package)
                    self.results_outgoing.send(pkl_package)
            logger.warning("[MAIN] Sent failure reports, unregistering manager")
            self._ready_manager_queue.pop(manager, 'None')
            if manager in interesting_managers:
                interesting_managers.remove(manager)

    delta = time.time() - start
    logger.info("Processed {} tasks in {} seconds".format(count, delta))
    logger.warning("Exiting")
def worker(worker_id, pool_id, pool_size, task_queue, result_queue, worker_queue, tasks_in_progress, cpu_affinity):
    """
    Put request token into queue
    Get task from task_queue
    Pop request from queue
    Put result into result_queue
    """
    start_file_logger('{}/block-{}/{}/worker_{}.log'.format(args.logdir, args.block_id, pool_id, worker_id),
                      worker_id,
                      name="worker_log",
                      level=logging.DEBUG if args.debug else logging.INFO)

    # Store worker ID as an environment variable
    os.environ['PARSL_WORKER_RANK'] = str(worker_id)
    os.environ['PARSL_WORKER_COUNT'] = str(pool_size)
    os.environ['PARSL_WORKER_POOL_ID'] = str(pool_id)
    os.environ['PARSL_WORKER_BLOCK_ID'] = str(args.block_id)

    # Sync worker with master
    logger.info('Worker {} started'.format(worker_id))
    if args.debug:
        logger.debug("Debug logging enabled")

    # If desired, set process affinity
    if cpu_affinity != "none":
        # Count the number of cores per worker
        avail_cores = sorted(os.sched_getaffinity(0))  # Get the available processors
        cores_per_worker = len(avail_cores) // pool_size
        assert cores_per_worker > 0, "Affinity does not work if there are more workers than cores"

        # Determine this worker's cores
        if cpu_affinity == "block":
            my_cores = avail_cores[cores_per_worker * worker_id:cores_per_worker * (worker_id + 1)]
        elif cpu_affinity == "alternating":
            my_cores = avail_cores[worker_id::pool_size]
        else:
            raise ValueError("Affinity strategy {} is not supported".format(cpu_affinity))

        # Set the affinity for this worker
        os.sched_setaffinity(0, my_cores)
        logger.info("Set worker CPU affinity to {}".format(my_cores))

    while True:
        worker_queue.put(worker_id)

        # The worker will receive {'task_id':<tid>, 'buffer':<buf>}
        req = task_queue.get()
        tasks_in_progress[worker_id] = req
        tid = req['task_id']
        logger.info("Received task {}".format(tid))

        try:
            worker_queue.get()
        except queue.Empty:
            logger.warning("Worker ID: {} failed to remove itself from ready_worker_queue".format(worker_id))
            pass

        try:
            result = execute_task(req['buffer'])
            serialized_result = serialize(result, buffer_threshold=1e6)
        except Exception as e:
            logger.info('Caught an exception: {}'.format(e))
            result_package = {'task_id': tid, 'exception': serialize(RemoteExceptionWrapper(*sys.exc_info()))}
        else:
            result_package = {'task_id': tid, 'result': serialized_result}
            # logger.debug("Result: {}".format(result))

        logger.info("Completed task {}".format(tid))

        try:
            pkl_package = pickle.dumps(result_package)
        except Exception:
            logger.exception("Caught exception while trying to pickle the result package")
            pkl_package = pickle.dumps({'task_id': tid,
                                        'exception': serialize(RemoteExceptionWrapper(*sys.exc_info()))})

        result_queue.put(pkl_package)
        tasks_in_progress.pop(worker_id)
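# Illustrative sketch (not from the original source) of how the "block" and
# "alternating" CPU-affinity strategies in the worker above slice the available
# cores. With 8 cores and 4 workers, worker 1 is pinned to [2, 3] under "block"
# and to [1, 5] under "alternating". The helper name is hypothetical.
def affinity_slice(avail_cores, pool_size, worker_id, strategy):
    """Return the cores a given worker would be pinned to under a strategy."""
    cores_per_worker = len(avail_cores) // pool_size
    if strategy == "block":
        return avail_cores[cores_per_worker * worker_id:cores_per_worker * (worker_id + 1)]
    elif strategy == "alternating":
        return avail_cores[worker_id::pool_size]
    raise ValueError("Affinity strategy {} is not supported".format(strategy))


assert affinity_slice(list(range(8)), 4, 1, "block") == [2, 3]
assert affinity_slice(list(range(8)), 4, 1, "alternating") == [1, 5]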
def pull_tasks(self, kill_event):
    """Pull tasks from the incoming tasks 0mq pipe onto the internal
    pending task queue

    While :
        receive results and task requests from the workers
        receive tasks/heartbeats from the Interchange
        match tasks to workers
        if task doesn't have appropriate worker type:
            launch worker of type.. with LRU or some sort of caching strategy.
        if workers >> tasks:
            advertize available capacity

    Parameters:
    -----------
    kill_event : threading.Event
        Event to let the thread know when it is time to die.
    """
    log.info("[TASK PULL THREAD] starting")

    # Send a registration message
    msg = self.create_reg_message()
    log.debug(f"Sending registration message: {msg}")
    self.task_incoming.send(msg)
    last_interchange_contact = time.time()
    task_recv_counter = 0

    poll_timer = self.poll_period

    new_worker_map = None
    while not kill_event.is_set():
        # Disabling the check on ready_worker_queue disables batching
        log.debug("[TASK_PULL_THREAD] Loop start")
        pending_task_count = task_recv_counter - self.task_done_counter
        ready_worker_count = self.worker_map.ready_worker_count()
        log.debug(
            "[TASK_PULL_THREAD pending_task_count: %s, Ready_worker_count: %s",
            pending_task_count,
            ready_worker_count,
        )

        if pending_task_count < self.max_queue_size and ready_worker_count > 0:
            ads = self.worker_map.advertisement()
            log.debug(f"[TASK_PULL_THREAD] Requesting tasks: {ads}")
            msg = pickle.dumps(ads)
            self.task_incoming.send(msg)

        # Receive results from the workers, if any
        socks = dict(self.poller.poll(timeout=poll_timer))

        if (self.funcx_task_socket in socks
                and socks[self.funcx_task_socket] == zmq.POLLIN):
            self.poll_funcx_task_socket()

        # Receive task batches from Interchange and forward to workers
        if self.task_incoming in socks and socks[self.task_incoming] == zmq.POLLIN:
            # If we want to wrap the task_incoming polling into a separate function,
            # we need to
            # self.poll_task_incoming(
            #     poll_timer,
            #     last_interchange_contact,
            #     kill_event,
            #     task_revc_counter
            # )
            poll_timer = 0
            _, pkl_msg = self.task_incoming.recv_multipart()
            message = pickle.loads(pkl_msg)
            last_interchange_contact = time.time()

            if message == "STOP":
                log.critical("[TASK_PULL_THREAD] Received stop request")
                kill_event.set()
                break

            elif type(message) == tuple and message[0] == "TASK_CANCEL":
                with self.task_finalization_lock:
                    task_id = message[1]
                    log.info(f"Received TASK_CANCEL request for task: {task_id}")
                    if task_id not in self.task_worker_map:
                        log.warning(f"Task:{task_id} is not in task_worker_map.")
                        log.warning("Possible duplicate cancel or race-condition")
                        continue
                    # Cancel task by killing the worker it is on
                    worker_id_raw = self.task_worker_map[task_id]["worker_id"]
                    worker_to_kill = self.task_worker_map[task_id]["worker_id"].decode("utf-8")
                    worker_type = self.task_worker_map[task_id]["task_type"]
                    log.debug(
                        "Cancelling task running on worker: %s",
                        self.task_worker_map[task_id],
                    )
                    try:
                        log.info(f"Removing worker:{worker_id_raw} from map")
                        self.worker_map.start_remove_worker(worker_type)
                        self.worker_map.remove_worker(worker_id_raw)

                        log.info(f"Popping worker:{worker_to_kill} from worker_procs")
                        proc = self.worker_procs.pop(worker_to_kill)
                        log.warning(f"Sending process:{proc.pid} terminate signal")
                        proc.terminate()
                        try:
                            proc.wait(1)  # Wait 1 second before attempting SIGKILL
                        except subprocess.TimeoutExpired:
                            log.exception("Process did not terminate in 1 second")
                            log.warning(f"Sending process:{proc.pid} kill signal")
                            proc.kill()
                        else:
                            log.debug(f"Worker process exited with : {proc.returncode}")
                        raise TaskCancelled(worker_to_kill, self.uid)
                    except Exception as e:
                        log.exception(f"Raise exception, handling: {e}")
                        result_package = {
                            "task_id": task_id,
                            "container_id": worker_type,
                            "exception": self.serializer.serialize(
                                RemoteExceptionWrapper(*sys.exc_info())),
                        }
                        self.pending_result_queue.put(pickle.dumps(result_package))

                    worker_proc = self.worker_map.add_worker(
                        worker_id=str(self.worker_map.worker_id_counter),
                        worker_type=self.worker_type,
                        container_cmd_options=self.container_cmd_options,
                        address=self.address,
                        debug=self.debug,
                        uid=self.uid,
                        logdir=self.logdir,
                        worker_port=self.worker_port,
                    )
                    self.worker_procs.update(worker_proc)
                    self.task_worker_map.pop(task_id)
                    self.remove_task(task_id)

            elif message == HEARTBEAT_CODE:
                log.debug("Got heartbeat from interchange")

            else:
                tasks = [(rt["local_container"], Message.unpack(rt["raw_buffer"]))
                         for rt in message]
                task_recv_counter += len(tasks)
                log.debug("[TASK_PULL_THREAD] Got tasks: {} of {}".format(
                    [t[1].task_id for t in tasks], task_recv_counter))

                for task_type, task in tasks:
                    log.debug(f"[TASK DEBUG] Task is of type: {task_type}")

                    if task_type not in self.task_queues:
                        self.task_queues[task_type] = queue.Queue()
                    if task_type not in self.outstanding_task_count:
                        self.outstanding_task_count[task_type] = 0
                    self.task_queues[task_type].put(task)
                    self.outstanding_task_count[task_type] += 1
                    self.task_type_mapping[task.task_id] = task_type
                    log.debug("Got task: Outstanding task counts: {}".format(
                        self.outstanding_task_count))
                    log.debug(f"Task {task} pushed to a task queue {task_type}")

        else:
            log.debug("[TASK_PULL_THREAD] No incoming tasks")

            # Limit poll duration to heartbeat_period
            # heartbeat_period is in s vs poll_timer in ms
            if not poll_timer:
                poll_timer = self.poll_period
            poll_timer = min(self.heartbeat_period * 1000, poll_timer * 2)

            # Only check if no messages were received.
            if time.time() > last_interchange_contact + self.heartbeat_threshold:
                log.critical("[TASK_PULL_THREAD] Missing contact with interchange beyond "
                             "heartbeat_threshold")
                kill_event.set()
                log.critical("Killing all workers")
                for proc in self.worker_procs.values():
                    proc.kill()
                log.critical("[TASK_PULL_THREAD] Exiting")
                break

        log.debug(f"To-Die Counts: {self.worker_map.to_die_count}")
        log.debug("Alive worker counts: {}".format(self.worker_map.total_worker_type_counts))

        new_worker_map = naive_scheduler(
            self.task_queues,
            self.outstanding_task_count,
            self.max_worker_count,
            new_worker_map,
            self.worker_map.to_die_count,
        )
        log.debug(f"[SCHEDULER] New worker map: {new_worker_map}")

        # NOTE: Wipes the queue -- previous scheduling loops don't affect what's
        # needed now.
        self.next_worker_q, need_more = self.worker_map.get_next_worker_q(new_worker_map)

        # Spin up any new workers according to the worker queue.
        # Returns the total number of containers that have spun up.
        self.worker_procs.update(
            self.worker_map.spin_up_workers(
                self.next_worker_q,
                mode=self.worker_mode,
                debug=self.debug,
                container_cmd_options=self.container_cmd_options,
                address=self.address,
                uid=self.uid,
                logdir=self.logdir,
                worker_port=self.worker_port,
            ))
        log.debug(f"[SPIN UP] Worker processes: {self.worker_procs}")

        # Count the workers of each type that need to be removed
        spin_downs, container_switch_count = self.worker_map.spin_down_workers(
            new_worker_map,
            worker_max_idletime=self.worker_max_idletime,
            need_more=need_more,
            scheduler_mode=self.scheduler_mode,
        )
        self.container_switch_count += container_switch_count
        log.debug("Container switch count: total {}, cur {}".format(
            self.container_switch_count, container_switch_count))

        for w_type in spin_downs:
            self.remove_worker_init(w_type)

        current_worker_map = self.worker_map.get_worker_counts()
        for task_type in current_worker_map:
            if task_type == "unused":
                continue

            # *** Match tasks to workers *** #
            else:
                available_workers = current_worker_map[task_type]
                log.debug("Available workers of type {}: {}".format(task_type, available_workers))

                for _i in range(available_workers):
                    if (task_type in self.task_queues
                            and not self.task_queues[task_type].qsize() == 0
                            and not self.worker_map.worker_queues[task_type].qsize() == 0):
                        log.debug("Task type {} has task queue size {}".format(
                            task_type, self.task_queues[task_type].qsize()))
                        log.debug("... and available workers: {}".format(
                            self.worker_map.worker_queues[task_type].qsize()))

                        self.send_task_to_worker(task_type)
def start(self):
    log.info("Starting worker")

    result = self.registration_message()
    task_type = b"REGISTER"
    log.debug("Sending registration")
    self.task_socket.send_multipart([task_type, pickle.dumps(result)])  # Byte encoded

    while True:
        log.debug("Waiting for task")
        p_task_id, p_container_id, msg = self.task_socket.recv_multipart()
        task_id = pickle.loads(p_task_id)
        container_id = pickle.loads(p_container_id)
        log.debug(f"Received task_id:{task_id} with task:{msg}")

        result = None
        task_type = None

        if task_id == "KILL":
            task = Message.unpack(msg)
            if task.task_buffer.decode("utf-8") == "KILL":
                log.info("[KILL] -- Worker KILL message received! ")
                task_type = b"WRKR_DIE"
            else:
                log.exception("Caught an exception of non-KILL message for KILL task")
                continue
        else:
            log.debug("Executing task...")

            try:
                result = self.execute_task(msg)
                serialized_result = self.serialize(result)

                if len(serialized_result) > self.result_size_limit:
                    raise MaxResultSizeExceeded(len(serialized_result), self.result_size_limit)
            except Exception as e:
                log.exception(f"Caught an exception {e}")
                result_package = {
                    "task_id": task_id,
                    "container_id": container_id,
                    "exception": self.serialize(RemoteExceptionWrapper(*sys.exc_info())),
                }
            else:
                log.debug("Execution completed without exception")
                result_package = {
                    "task_id": task_id,
                    "container_id": container_id,
                    "result": serialized_result,
                }

            result = result_package
            task_type = b"TASK_RET"

        log.debug("Sending result")
        self.task_socket.send_multipart([task_type, pickle.dumps(result)])  # Byte encoded

        if task_type == b"WRKR_DIE":
            log.info(f"*** WORKER {self.worker_id} ABOUT TO DIE ***")
            # Kill the worker after accepting death in message to manager.
            sys.exit()
            # We need to return here to allow for sys.exit mocking in tests
            return

    log.warning("Broke out of the loop... dying")
def worker(worker_id, pool_id, pool_size, task_queue, result_queue, worker_queue,
           tasks_in_progress, cpu_affinity, accelerator: Optional[str]):
    """
    Put request token into queue
    Get task from task_queue
    Pop request from queue
    Put result into result_queue
    """

    # override the global logger inherited from the __main__ process (which
    # usually logs to manager.log) with one specific to this worker.
    global logger
    logger = start_file_logger('{}/block-{}/{}/worker_{}.log'.format(args.logdir, args.block_id, pool_id, worker_id),
                               worker_id,
                               name="worker_log",
                               level=logging.DEBUG if args.debug else logging.INFO)

    # Store worker ID as an environment variable
    os.environ['PARSL_WORKER_RANK'] = str(worker_id)
    os.environ['PARSL_WORKER_COUNT'] = str(pool_size)
    os.environ['PARSL_WORKER_POOL_ID'] = str(pool_id)
    os.environ['PARSL_WORKER_BLOCK_ID'] = str(args.block_id)

    # share the result queue with monitoring code so it too can send results down that channel
    import parsl.executors.high_throughput.monitoring_info as mi
    mi.result_queue = result_queue

    # Sync worker with master
    logger.info('Worker {} started'.format(worker_id))
    if args.debug:
        logger.debug("Debug logging enabled")

    # If desired, set process affinity
    if cpu_affinity != "none":
        # Count the number of cores per worker
        avail_cores = sorted(os.sched_getaffinity(0))  # Get the available processors
        cores_per_worker = len(avail_cores) // pool_size
        assert cores_per_worker > 0, "Affinity does not work if there are more workers than cores"

        # Determine this worker's cores
        if cpu_affinity == "block":
            my_cores = avail_cores[cores_per_worker * worker_id:cores_per_worker * (worker_id + 1)]
        elif cpu_affinity == "alternating":
            my_cores = avail_cores[worker_id::pool_size]
        else:
            raise ValueError("Affinity strategy {} is not supported".format(cpu_affinity))

        # Set the affinity for this worker
        os.sched_setaffinity(0, my_cores)
        logger.info("Set worker CPU affinity to {}".format(my_cores))

    # If desired, pin to accelerator
    if accelerator is not None:
        os.environ["CUDA_VISIBLE_DEVICES"] = accelerator
        os.environ["ROCR_VISIBLE_DEVICES"] = accelerator
        os.environ["SYCL_DEVICE_FILTER"] = f"*:*:{accelerator}"
        logger.info(f'Pinned worker to accelerator: {accelerator}')

    while True:
        worker_queue.put(worker_id)

        # The worker will receive {'task_id':<tid>, 'buffer':<buf>}
        req = task_queue.get()
        tasks_in_progress[worker_id] = req
        tid = req['task_id']
        logger.info("Received task {}".format(tid))

        try:
            worker_queue.get()
        except queue.Empty:
            logger.warning("Worker ID: {} failed to remove itself from ready_worker_queue".format(worker_id))
            pass

        try:
            result = execute_task(req['buffer'])
            serialized_result = serialize(result, buffer_threshold=1e6)
        except Exception as e:
            logger.info('Caught an exception: {}'.format(e))
            result_package = {
                'type': 'result',
                'task_id': tid,
                'exception': serialize(RemoteExceptionWrapper(*sys.exc_info()))
            }
        else:
            result_package = {'type': 'result', 'task_id': tid, 'result': serialized_result}
            # logger.debug("Result: {}".format(result))

        logger.info("Completed task {}".format(tid))

        try:
            pkl_package = pickle.dumps(result_package)
        except Exception:
            logger.exception("Caught exception while trying to pickle the result package")
            pkl_package = pickle.dumps({
                'type': 'result',
                'task_id': tid,
                'exception': serialize(RemoteExceptionWrapper(*sys.exc_info()))
            })

        result_queue.put(pkl_package)
        tasks_in_progress.pop(worker_id)
        logger.info("All processing finished for task {}".format(tid))
def start(self): """ Start the interchange """ logger.info("Incoming ports bound") hub_channel = self._create_monitoring_channel() poll_period = self.poll_period start = time.time() count = 0 self._kill_event = threading.Event() self._task_puller_thread = threading.Thread( target=self.task_puller, args=(self._kill_event, ), name="Interchange-Task-Puller") self._task_puller_thread.start() self._command_thread = threading.Thread(target=self._command_server, args=(self._kill_event, ), name="Interchange-Command") self._command_thread.start() poller = zmq.Poller() poller.register(self.task_outgoing, zmq.POLLIN) poller.register(self.results_incoming, zmq.POLLIN) # These are managers which we should examine in an iteration # for scheduling a job (or maybe any other attention?). # Anything altering the state of the manager should add it # onto this list. interesting_managers: Set[bytes] = set() while not self._kill_event.is_set(): self.socks = dict(poller.poll(timeout=poll_period)) # Listen for requests for work if self.task_outgoing in self.socks and self.socks[ self.task_outgoing] == zmq.POLLIN: logger.debug("starting task_outgoing section") message = self.task_outgoing.recv_multipart() manager_id = message[0] if manager_id not in self._ready_managers: reg_flag = False try: msg = json.loads(message[1].decode('utf-8')) reg_flag = True except Exception: logger.warning( "Got Exception reading registration message from manager: {}" .format(manager_id), exc_info=True) logger.debug("Message: \n{}\n".format(message[1])) else: # We set up an entry only if registration works correctly self._ready_managers[manager_id] = { 'last_heartbeat': time.time(), 'idle_since': time.time(), 'free_capacity': 0, 'block_id': None, 'max_capacity': 0, 'worker_count': 0, 'active': True, 'tasks': [] } if reg_flag is True: interesting_managers.add(manager_id) logger.info("Adding manager: {} to ready queue".format( manager_id)) m = self._ready_managers[manager_id] m.update(msg) logger.info( "Registration info for manager {}: {}".format( manager_id, msg)) self._send_monitoring_info(hub_channel, m) if (msg['python_v'].rsplit(".", 1)[0] != self.current_platform['python_v'].rsplit( ".", 1)[0] or msg['parsl_v'] != self.current_platform['parsl_v']): logger.warning( "Manager {} has incompatible version info with the interchange" .format(manager_id)) logger.debug("Setting kill event") self._kill_event.set() e = VersionMismatch( "py.v={} parsl.v={}".format( self.current_platform['python_v'].rsplit( ".", 1)[0], self.current_platform['parsl_v']), "py.v={} parsl.v={}".format( msg['python_v'].rsplit(".", 1)[0], msg['parsl_v'])) result_package = { 'type': 'result', 'task_id': -1, 'exception': serialize_object(e) } pkl_package = pickle.dumps(result_package) self.results_outgoing.send(pkl_package) logger.warning( "Sent failure reports, unregistering manager") else: logger.info( "Manager {} has compatible Parsl version {}". format(manager_id, msg['parsl_v'])) logger.info( "Manager {} has compatible Python version {}". format(manager_id, msg['python_v'].rsplit(".", 1)[0])) else: # Registration has failed. logger.debug( "Suppressing bad registration from manager: {}". format(manager_id)) else: tasks_requested = int.from_bytes(message[1], "little") self._ready_managers[manager_id][ 'last_heartbeat'] = time.time() if tasks_requested == HEARTBEAT_CODE: logger.debug( "Manager {} sent heartbeat via tasks connection". 
format(manager_id)) self.task_outgoing.send_multipart( [manager_id, b'', PKL_HEARTBEAT_CODE]) else: logger.debug("Manager {} requested {} tasks".format( manager_id, tasks_requested)) self._ready_managers[manager_id][ 'free_capacity'] = tasks_requested interesting_managers.add(manager_id) logger.debug("leaving task_outgoing section") # If we had received any requests, check if there are tasks that could be passed logger.debug( "Managers count (interesting/total): {interesting}/{total}". format(total=len(self._ready_managers), interesting=len(interesting_managers))) if interesting_managers and not self.pending_task_queue.empty(): shuffled_managers = list(interesting_managers) random.shuffle(shuffled_managers) while shuffled_managers and not self.pending_task_queue.empty( ): # cf. the if statement above... manager_id = shuffled_managers.pop() m = self._ready_managers[manager_id] tasks_inflight = len(m['tasks']) real_capacity = min(m['free_capacity'], m['max_capacity'] - tasks_inflight) if (real_capacity and m['active']): tasks = self.get_tasks(real_capacity) if tasks: self.task_outgoing.send_multipart( [manager_id, b'', pickle.dumps(tasks)]) task_count = len(tasks) count += task_count tids = [t['task_id'] for t in tasks] m['free_capacity'] -= task_count m['tasks'].extend(tids) m['idle_since'] = None logger.debug("Sent tasks: {} to manager {}".format( tids, manager_id)) if m['free_capacity'] > 0: logger.debug( "Manager {} has free_capacity {}".format( manager_id, m['free_capacity'])) # ... so keep it in the interesting_managers list else: logger.debug( "Manager {} is now saturated".format( manager_id)) interesting_managers.remove(manager_id) else: interesting_managers.remove(manager_id) # logger.debug("Nothing to send to manager {}".format(manager_id)) logger.debug( "leaving _ready_managers section, with {} managers still interesting" .format(len(interesting_managers))) else: logger.debug( "either no interesting managers or no tasks, so skipping manager pass" ) # Receive any results and forward to client if self.results_incoming in self.socks and self.socks[ self.results_incoming] == zmq.POLLIN: logger.debug("entering results_incoming section") manager_id, *all_messages = self.results_incoming.recv_multipart( ) if manager_id not in self._ready_managers: logger.warning( "Received a result from a un-registered manager: {}". format(manager_id)) else: logger.debug( f"Got {len(all_messages)} result items in batch from manager {manager_id}" ) b_messages = [] for p_message in all_messages: r = pickle.loads(p_message) if r['type'] == 'result': # process this for task ID and forward to executor b_messages.append((p_message, r)) elif r['type'] == 'monitoring': hub_channel.send_pyobj(r['payload']) elif r['type'] == 'heartbeat': logger.debug( f"Manager {manager_id} sent heartbeat via results connection" ) b_messages.append((p_message, r)) else: logger.error( "Interchange discarding result_queue message of unknown type: {}" .format(r['type'])) m = self._ready_managers[manager_id] for (b_message, r) in b_messages: assert 'type' in r, f"Message is missing type entry: {r}" if r['type'] == 'result': try: logger.debug( f"Removing task {r['task_id']} from manager record {manager_id}" ) m['tasks'].remove(r['task_id']) except Exception: # If we reach here, there's something very wrong. 
logger.exception( "Ignoring exception removing task_id {} for manager {} with task list {}" .format(r['task_id'], manager_id, m['tasks'])) b_messages_to_send = [] for (b_message, _) in b_messages: b_messages_to_send.append(b_message) if b_messages_to_send: logger.debug("Sending messages on results_outgoing") self.results_outgoing.send_multipart( b_messages_to_send) logger.debug("Sent messages on results_outgoing") logger.debug( f"Current tasks on manager {manager_id}: {m['tasks']}") if len(m['tasks']) == 0 and m['idle_since'] is None: m['idle_since'] = time.time() logger.debug("leaving results_incoming section") bad_managers = [ (manager_id, m) for (manager_id, m) in self._ready_managers.items() if time.time() - m['last_heartbeat'] > self.heartbeat_threshold ] for (manager_id, m) in bad_managers: logger.debug("Last: {} Current: {}".format( m['last_heartbeat'], time.time())) logger.warning( f"Too many heartbeats missed for manager {manager_id} - removing manager" ) if m['active']: m['active'] = False self._send_monitoring_info(hub_channel, m) logger.warning( f"Cancelling htex tasks {m['tasks']} on removed manager") for tid in m['tasks']: try: raise ManagerLost(manager_id, m['hostname']) except Exception: result_package = { 'type': 'result', 'task_id': tid, 'exception': serialize_object( RemoteExceptionWrapper(*sys.exc_info())) } pkl_package = pickle.dumps(result_package) self.results_outgoing.send(pkl_package) logger.warning("Sent failure reports, unregistering manager") self._ready_managers.pop(manager_id, 'None') if manager_id in interesting_managers: interesting_managers.remove(manager_id) delta = time.time() - start logger.info("Processed {} tasks in {} seconds".format(count, delta)) logger.warning("Exiting")
def worker(worker_id, pool_id, pool_size, task_queue, result_queue, worker_queue, tasks_in_progress):
    """
    Put request token into queue
    Get task from task_queue
    Pop request from queue
    Put result into result_queue
    """
    start_file_logger('{}/block-{}/{}/worker_{}.log'.format(args.logdir, args.block_id, pool_id, worker_id),
                      worker_id,
                      name="worker_log",
                      level=logging.DEBUG if args.debug else logging.INFO)

    # Store worker ID as an environment variable
    os.environ['PARSL_WORKER_RANK'] = str(worker_id)
    os.environ['PARSL_WORKER_COUNT'] = str(pool_size)
    os.environ['PARSL_WORKER_POOL_ID'] = str(pool_id)

    # Sync worker with master
    logger.info('Worker {} started'.format(worker_id))
    if args.debug:
        logger.debug("Debug logging enabled")

    while True:
        worker_queue.put(worker_id)

        # The worker will receive {'task_id':<tid>, 'buffer':<buf>}
        req = task_queue.get()
        tasks_in_progress[worker_id] = req
        tid = req['task_id']
        logger.info("Received task {}".format(tid))

        try:
            worker_queue.get()
        except queue.Empty:
            logger.warning("Worker ID: {} failed to remove itself from ready_worker_queue".format(worker_id))
            pass

        try:
            result = execute_task(req['buffer'])
            serialized_result = serialize(result, buffer_threshold=1e6)
        except Exception as e:
            logger.info('Caught an exception: {}'.format(e))
            result_package = {'task_id': tid, 'exception': serialize(RemoteExceptionWrapper(*sys.exc_info()))}
        else:
            result_package = {'task_id': tid, 'result': serialized_result}
            # logger.debug("Result: {}".format(result))

        logger.info("Completed task {}".format(tid))

        try:
            pkl_package = pickle.dumps(result_package)
        except Exception:
            logger.exception("Caught exception while trying to pickle the result package")
            pkl_package = pickle.dumps({'task_id': tid,
                                        'exception': serialize(RemoteExceptionWrapper(*sys.exc_info()))})

        result_queue.put(pkl_package)
        tasks_in_progress.pop(worker_id)