def worker(comm, rank):
    logger.info("Worker started")

    # Sync worker with master
    comm.Barrier()
    logger.debug("Synced")

    task_request = b'TREQ'

    while True:
        comm.send(task_request, dest=0, tag=TASK_REQUEST_TAG)
        # The worker will receive {'task_id':<tid>, 'buffer':<buf>}
        req = comm.recv(source=0, tag=rank)
        logger.debug("Got req: {}".format(req))
        tid = req['task_id']
        logger.debug("Got task: {}".format(tid))

        try:
            result = execute_task(req['buffer'])
        except Exception as e:
            result_package = {'task_id': tid,
                              'exception': serialize(RemoteExceptionWrapper(*sys.exc_info()))}
            logger.debug("No result due to exception: {} with result package {}".format(e, result_package))
        else:
            result_package = {'task_id': tid, 'result': serialize(result)}
            logger.debug("Result: {}".format(result))

        pkl_package = pickle.dumps(result_package)
        comm.send(pkl_package, dest=0, tag=RESULT_TAG)

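# A minimal sketch of the rank-0 counterpart to the MPI worker above: receive
# one pickled result package from any worker and unpack it. RESULT_TAG matches
# the tag used by the worker; collect_one_result is a hypothetical name, and
# reraise() assumes the RemoteExceptionWrapper API used in the worker.
import pickle

from mpi4py import MPI
from parsl.serialize import deserialize


def collect_one_result(comm):
    status = MPI.Status()
    pkl_package = comm.recv(source=MPI.ANY_SOURCE, tag=RESULT_TAG, status=status)
    result_package = pickle.loads(pkl_package)
    tid = result_package['task_id']
    if 'exception' in result_package:
        # the worker packaged the remote traceback as a RemoteExceptionWrapper
        deserialize(result_package['exception']).reraise()
    return tid, deserialize(result_package['result'])
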
def worker(worker_id, task_url, debug=True, logdir="workers", uid="1"):
    """
    TODO: docstring

    TODO : Cleanup debug, logdir and uid to function correctly
    """
    start_file_logger('{}/{}/worker_{}.log'.format(logdir, uid, worker_id),
                      0,
                      level=logging.DEBUG if debug is True else logging.INFO)

    logger.info("Starting worker {}".format(worker_id))

    task_ids_received = []

    message_q = zmq_pipes.WorkerMessages(task_url)

    while True:
        print("Worker loop iteration starting")
        task_id, buf = message_q.get()
        task_ids_received.append(task_id)

        user_ns = locals()
        user_ns.update({'__builtins__': __builtins__})
        f, args, kwargs = unpack_apply_message(buf, user_ns, copy=False)

        logger.debug("Worker {} received task {}".format(worker_id, task_id))
        result = execute_task(f, args, kwargs, user_ns)
        logger.debug("Worker {} completed task {}".format(worker_id, task_id))

        reply = {"result": result, "worker_id": worker_id}
        message_q.put(task_id, serialize(reply))
        logger.debug("Result sent")

def main():
    """Execute one rank of an MPI application."""
    logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", required=True, help="Input pickle file")
    parser.add_argument("-o", "--output", required=True, help="Output pickle file")
    args = parser.parse_args()
    logging.info("Input : %s", args.input)
    logging.info("Output : %s", args.output)

    returnval = None
    exception = None

    # open and deserialize the task's pickled input package
    with open(args.input, "rb") as file_handle:
        fn_buf = file_handle.read()
    logging.info("Read input pickle file")

    try:
        returnval = execute_task(fn_buf)
    except Exception as exc:
        logging.exception("Parsl task execution failed:")
        exception = exc
    else:
        logging.info("Finished execution")

    # only rank 0 should write/return a result; other ranks exit
    if int(os.environ["FLUX_TASK_RANK"]) == 0:
        # write the result to the output file
        result_buf = serialize(TaskResult(returnval, exception))
        with open(args.output, "wb") as file_handle:
            file_handle.write(result_buf)

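# A hedged sketch of how the submitting side might consume the output file
# written by main() above, assuming TaskResult is a simple container exposing
# the two values packed above as .returnval and .exception; read_task_result
# is a hypothetical name, and deserialize mirrors the serialize call above.
from parsl.serialize import deserialize


def read_task_result(path):
    with open(path, "rb") as file_handle:
        task_result = deserialize(file_handle.read())
    if task_result.exception is not None:
        raise task_result.exception
    return task_result.returnval
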
def worker_watchdog(self, kill_event):
    """Keeps workers alive: polls the worker processes, reports a WorkerLost
    exception for any task that was in flight on a dead worker, and restarts
    that worker process.

    Parameters:
    -----------
    kill_event : threading.Event
          Event to let the thread know when it is time to die.
    """
    logger.debug("[WORKER_WATCHDOG_THREAD] Starting thread")

    while not kill_event.is_set():
        for worker_id, p in self.procs.items():
            if not p.is_alive():
                logger.info("[WORKER_WATCHDOG_THREAD] Worker {} has died".format(worker_id))
                try:
                    task = self._tasks_in_progress.pop(worker_id)
                    logger.info("[WORKER_WATCHDOG_THREAD] Worker {} was busy when it died".format(worker_id))
                    try:
                        raise WorkerLost(worker_id, platform.node())
                    except Exception:
                        logger.info("[WORKER_WATCHDOG_THREAD] Putting exception for task {} in the pending result queue".format(task['task_id']))
                        result_package = {'task_id': task['task_id'],
                                          'exception': serialize(RemoteExceptionWrapper(*sys.exc_info()))}
                        pkl_package = pickle.dumps(result_package)
                        self.pending_result_queue.put(pkl_package)
                except KeyError:
                    logger.info("[WORKER_WATCHDOG_THREAD] Worker {} was not busy when it died".format(worker_id))

                p = multiprocessing.Process(target=worker,
                                            args=(worker_id,
                                                  self.uid,
                                                  self.worker_count,
                                                  self.pending_task_queue,
                                                  self.pending_result_queue,
                                                  self.ready_worker_queue,
                                                  self._tasks_in_progress,
                                                  self.cpu_affinity),
                                            name="HTEX-Worker-{}".format(worker_id))
                self.procs[worker_id] = p
                logger.info("[WORKER_WATCHDOG_THREAD] Worker {} has been restarted".format(worker_id))

        time.sleep(self.poll_period)

    logger.critical("[WORKER_WATCHDOG_THREAD] Exiting")

def id_for_memo_tuple(denormalized_tuple, output_ref=False):
    if type(denormalized_tuple) is not tuple:
        raise ValueError("id_for_memo_tuple cannot work on subclasses of tuple")

    normalized_list = []
    for e in denormalized_tuple:
        normalized_list.append(id_for_memo(e, output_ref=output_ref))

    return serialize(normalized_list)

def id_for_memo_list(denormalized_list, output_ref=False):
    if type(denormalized_list) is not list:
        raise ValueError("id_for_memo_list cannot work on subclasses of list")

    normalized_list = []
    for e in denormalized_list:
        normalized_list.append(id_for_memo(e, output_ref=output_ref))

    return serialize(normalized_list)

def id_for_memo_dict(denormalized_dict, output_ref=False):
    """This normalises the keys and values of the supplied dictionary.

    When output_ref=True, the values are normalised as output refs, but
    the keys are not.
    """
    if type(denormalized_dict) is not dict:
        raise ValueError("id_for_memo_dict cannot work on subclasses of dict")

    keys = sorted(denormalized_dict)

    normalized_list = []
    for k in keys:
        normalized_list.append(id_for_memo(k))
        normalized_list.append(id_for_memo(denormalized_dict[k], output_ref=output_ref))
    return serialize(normalized_list)

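# A small self-contained demonstration of why id_for_memo_dict sorts the keys:
# dicts that differ only in insertion order must normalise to the same memo
# identity. As simplifying assumptions, pickle.dumps stands in for parsl's
# serialize and repr() stands in for the full id_for_memo dispatch.
import pickle


def demo_dict_memo_id(d):
    normalized_list = []
    for k in sorted(d):
        normalized_list.append(repr(k))
        normalized_list.append(repr(d[k]))
    return pickle.dumps(normalized_list)


assert demo_dict_memo_id({'a': 1, 'b': 2}) == demo_dict_memo_id({'b': 2, 'a': 1})
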
def id_for_memo_function(function, output_ref=False):
    """This produces function hash material using the source definition of the function.

    The standard serialize_object based approach cannot be used as it is too
    sensitive to irrelevant facts such as the source line, meaning a whitespace
    line added at the top of a source file will cause the hash to change.
    """
    logger.debug("serialising id_for_memo_function for function {}, type {}".format(function, type(function)))
    try:
        fn_source = getsource(function)
    except Exception as e:
        logger.warning("Unable to get source code for app caching. Recommend creating module. Exception was: {}".format(e))
        fn_source = function.__name__
    return serialize(fn_source.encode('utf-8'))

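# A quick illustration of the docstring's point above: getsource returns only
# the text of the definition itself, so edits elsewhere in the module (such as
# blank lines added above the function) leave the hash material unchanged,
# whereas serializing the code object would capture line-number metadata.
from inspect import getsource


def double(x):
    return x * 2


print(getsource(double))
# prints exactly:
# def double(x):
#     return x * 2
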
def send(self, message: object) -> None:
    logger.info("Sending a monitoring message via filesystem")

    # The filename must be unique across every writer sharing this directory:
    # there will in general be many FilesystemRadio objects writing into the
    # same space, from different hosts, different processes, and different
    # non-overlapping instantiations within the same process. The radio's uid
    # combined with a per-instance counter disambiguates between them.
    unique_id = f"msg-{self.radio_uid}-{self.id_counter}"
    self.id_counter = self.id_counter + 1

    tmp_filename = f"{self.tmp_path}/{unique_id}"
    new_filename = f"{self.new_path}/{unique_id}"
    buffer = (message, "NA")

    # Write the message out to tmp/ first, then atomically move it into new/,
    # so that a partially written file will never be observed in new/.
    with open(tmp_filename, "wb") as f:
        f.write(serialize(buffer))
    os.rename(tmp_filename, new_filename)

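# A hedged sketch of the consuming side implied by the atomic-rename protocol
# above: because writers only rename() fully written files into new/, a reader
# scanning new/ never sees a partial message. drain_new_messages is a
# hypothetical name, and pickle.loads stands in for the matching deserialize
# call paired with the serialize call in send().
import os
import pickle


def drain_new_messages(new_path):
    for name in sorted(os.listdir(new_path)):
        full_path = os.path.join(new_path, name)
        with open(full_path, "rb") as f:
            message, _ = pickle.loads(f.read())  # unpack the (message, "NA") tuple
        os.unlink(full_path)
        yield message
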
def runner(incoming_q, outgoing_q):
    """This is a function that mocks the Swift-T side.

    It listens on the incoming_q for tasks and posts returns on the outgoing_q.

    Args:
       - incoming_q (Queue object) : The queue to listen on
       - outgoing_q (Queue object) : Queue to post results on

    The messages posted on the incoming_q will be of the form:

    .. code:: python

       {
          "task_id" : <uuid.uuid4 string>,
          "buffer"  : serialized buffer containing the fn, args and kwargs
       }

    If ``None`` is received, the runner will exit.

    Response messages should be of the form:

    .. code:: python

       {
          "task_id" : <uuid.uuid4 string>,
          "result"  : serialized buffer containing result,
          "exception" : serialized exception object
       }

    On exiting, the runner will post ``None`` to the outgoing_q.
    """
    logger.debug("[RUNNER] Starting")

    def execute_task(bufs):
        """Deserialize the buffer and execute the task.

        Returns the serialized result or exception.
        """
        user_ns = locals()
        user_ns.update({'__builtins__': __builtins__})

        f, args, kwargs = unpack_apply_message(bufs, user_ns, copy=False)

        fname = getattr(f, '__name__', 'f')
        prefix = "parsl_"
        fname = prefix + "f"
        argname = prefix + "args"
        kwargname = prefix + "kwargs"
        resultname = prefix + "result"

        user_ns.update({fname: f,
                        argname: args,
                        kwargname: kwargs,
                        resultname: resultname})

        code = "{0} = {1}(*{2}, **{3})".format(resultname, fname, argname, kwargname)

        try:
            logger.debug("[RUNNER] Executing: {0}".format(code))
            exec(code, user_ns, user_ns)
        except Exception as e:
            logger.warning("Caught exception; will raise it: {}".format(e))
            raise e
        else:
            logger.debug("[RUNNER] Result: {0}".format(user_ns.get(resultname)))
            return user_ns.get(resultname)

    while True:
        try:
            # Blocking wait on the queue
            msg = incoming_q.get(block=True, timeout=10)
        except queue.Empty:
            # Handle case where no items were in the queue
            logger.debug("[RUNNER] Queue is empty")
        except IOError as e:
            logger.debug("[RUNNER] Broken pipe: {}".format(e))
            try:
                # Attempt to send a stop notification to the management thread
                outgoing_q.put(None)
            except Exception:
                pass
            break
        except Exception as e:
            logger.debug("[RUNNER] Caught unknown exception: {}".format(e))
        else:
            # Handle received message
            if not msg:
                # Empty message is a die request
                logger.debug("[RUNNER] Received exit request")
                outgoing_q.put(None)
                break
            else:
                # Received a valid message, handle it
                logger.debug("[RUNNER] Got a valid task with ID {}".format(msg["task_id"]))
                try:
                    response_obj = execute_task(msg['buffer'])
                    response = {"task_id": msg["task_id"],
                                "result": serialize(response_obj)}
                    logger.debug("[RUNNER] Returning result: {}".format(deserialize(response["result"])))
                except Exception as e:
                    logger.debug("[RUNNER] Caught task exception: {}".format(e))
                    response = {"task_id": msg["task_id"],
                                "exception": serialize(e)}
                outgoing_q.put(response)

    logger.debug("[RUNNER] Terminating")

def id_for_memo_serialize(obj, output_ref=False):
    return serialize(obj)

def worker(worker_id, pool_id, pool_size, task_queue, result_queue,
           worker_queue, tasks_in_progress, cpu_affinity):
    """
    Put request token into queue
    Get task from task_queue
    Pop request from queue
    Put result into result_queue
    """
    start_file_logger('{}/block-{}/{}/worker_{}.log'.format(args.logdir, args.block_id, pool_id, worker_id),
                      worker_id,
                      name="worker_log",
                      level=logging.DEBUG if args.debug else logging.INFO)

    # Store worker ID as an environment variable
    os.environ['PARSL_WORKER_RANK'] = str(worker_id)
    os.environ['PARSL_WORKER_COUNT'] = str(pool_size)
    os.environ['PARSL_WORKER_POOL_ID'] = str(pool_id)
    os.environ['PARSL_WORKER_BLOCK_ID'] = str(args.block_id)

    # Sync worker with master
    logger.info('Worker {} started'.format(worker_id))
    if args.debug:
        logger.debug("Debug logging enabled")

    # If desired, set process affinity
    if cpu_affinity != "none":
        # Count the number of cores per worker
        avail_cores = sorted(os.sched_getaffinity(0))  # Get the available processors
        cores_per_worker = len(avail_cores) // pool_size
        assert cores_per_worker > 0, "Affinity does not work if there are more workers than cores"

        # Determine this worker's cores
        if cpu_affinity == "block":
            my_cores = avail_cores[cores_per_worker * worker_id:cores_per_worker * (worker_id + 1)]
        elif cpu_affinity == "alternating":
            my_cores = avail_cores[worker_id::pool_size]
        else:
            raise ValueError("Affinity strategy {} is not supported".format(cpu_affinity))

        # Set the affinity for this worker
        os.sched_setaffinity(0, my_cores)
        logger.info("Set worker CPU affinity to {}".format(my_cores))

    while True:
        worker_queue.put(worker_id)

        # The worker will receive {'task_id':<tid>, 'buffer':<buf>}
        req = task_queue.get()
        tasks_in_progress[worker_id] = req
        tid = req['task_id']
        logger.info("Received task {}".format(tid))

        try:
            worker_queue.get()
        except queue.Empty:
            logger.warning("Worker ID: {} failed to remove itself from ready_worker_queue".format(worker_id))

        try:
            result = execute_task(req['buffer'])
            serialized_result = serialize(result, buffer_threshold=1e6)
        except Exception as e:
            logger.info('Caught an exception: {}'.format(e))
            result_package = {'task_id': tid,
                              'exception': serialize(RemoteExceptionWrapper(*sys.exc_info()))}
        else:
            result_package = {'task_id': tid, 'result': serialized_result}
            # logger.debug("Result: {}".format(result))

        logger.info("Completed task {}".format(tid))
        try:
            pkl_package = pickle.dumps(result_package)
        except Exception:
            logger.exception("Caught exception while trying to pickle the result package")
            pkl_package = pickle.dumps({'task_id': tid,
                                        'exception': serialize(RemoteExceptionWrapper(*sys.exc_info()))})

        result_queue.put(pkl_package)
        tasks_in_progress.pop(worker_id)

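# A worked example of the two affinity strategies above, for a hypothetical
# node where sched_getaffinity reports cores 0-7 and pool_size is 4: "block"
# gives each worker a contiguous run of cores, "alternating" strides across
# them.
avail_cores = list(range(8))
pool_size = 4
cores_per_worker = len(avail_cores) // pool_size  # 2 cores per worker
for worker_id in range(pool_size):
    block = avail_cores[cores_per_worker * worker_id:cores_per_worker * (worker_id + 1)]
    alternating = avail_cores[worker_id::pool_size]
    print(worker_id, block, alternating)
# 0 [0, 1] [0, 4]
# 1 [2, 3] [1, 5]
# 2 [4, 5] [2, 6]
# 3 [6, 7] [3, 7]
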
def worker(worker_id, pool_id, pool_size, task_queue, result_queue,
           worker_queue, tasks_in_progress, cpu_affinity,
           accelerator: Optional[str]):
    """
    Put request token into queue
    Get task from task_queue
    Pop request from queue
    Put result into result_queue
    """

    # override the global logger inherited from the __main__ process (which
    # usually logs to manager.log) with one specific to this worker.
    global logger
    logger = start_file_logger('{}/block-{}/{}/worker_{}.log'.format(args.logdir, args.block_id, pool_id, worker_id),
                               worker_id,
                               name="worker_log",
                               level=logging.DEBUG if args.debug else logging.INFO)

    # Store worker ID as an environment variable
    os.environ['PARSL_WORKER_RANK'] = str(worker_id)
    os.environ['PARSL_WORKER_COUNT'] = str(pool_size)
    os.environ['PARSL_WORKER_POOL_ID'] = str(pool_id)
    os.environ['PARSL_WORKER_BLOCK_ID'] = str(args.block_id)

    # share the result queue with monitoring code so it too can send results
    # down that channel
    import parsl.executors.high_throughput.monitoring_info as mi
    mi.result_queue = result_queue

    # Sync worker with master
    logger.info('Worker {} started'.format(worker_id))
    if args.debug:
        logger.debug("Debug logging enabled")

    # If desired, set process affinity
    if cpu_affinity != "none":
        # Count the number of cores per worker
        avail_cores = sorted(os.sched_getaffinity(0))  # Get the available processors
        cores_per_worker = len(avail_cores) // pool_size
        assert cores_per_worker > 0, "Affinity does not work if there are more workers than cores"

        # Determine this worker's cores
        if cpu_affinity == "block":
            my_cores = avail_cores[cores_per_worker * worker_id:cores_per_worker * (worker_id + 1)]
        elif cpu_affinity == "alternating":
            my_cores = avail_cores[worker_id::pool_size]
        else:
            raise ValueError("Affinity strategy {} is not supported".format(cpu_affinity))

        # Set the affinity for this worker
        os.sched_setaffinity(0, my_cores)
        logger.info("Set worker CPU affinity to {}".format(my_cores))

    # If desired, pin to accelerator
    if accelerator is not None:
        os.environ["CUDA_VISIBLE_DEVICES"] = accelerator
        os.environ["ROCR_VISIBLE_DEVICES"] = accelerator
        os.environ["SYCL_DEVICE_FILTER"] = f"*:*:{accelerator}"
        logger.info(f'Pinned worker to accelerator: {accelerator}')

    while True:
        worker_queue.put(worker_id)

        # The worker will receive {'task_id':<tid>, 'buffer':<buf>}
        req = task_queue.get()
        tasks_in_progress[worker_id] = req
        tid = req['task_id']
        logger.info("Received task {}".format(tid))

        try:
            worker_queue.get()
        except queue.Empty:
            logger.warning("Worker ID: {} failed to remove itself from ready_worker_queue".format(worker_id))

        try:
            result = execute_task(req['buffer'])
            serialized_result = serialize(result, buffer_threshold=1e6)
        except Exception as e:
            logger.info('Caught an exception: {}'.format(e))
            result_package = {'type': 'result',
                              'task_id': tid,
                              'exception': serialize(RemoteExceptionWrapper(*sys.exc_info()))}
        else:
            result_package = {'type': 'result',
                              'task_id': tid,
                              'result': serialized_result}
            # logger.debug("Result: {}".format(result))

        logger.info("Completed task {}".format(tid))
        try:
            pkl_package = pickle.dumps(result_package)
        except Exception:
            logger.exception("Caught exception while trying to pickle the result package")
            pkl_package = pickle.dumps({'type': 'result',
                                        'task_id': tid,
                                        'exception': serialize(RemoteExceptionWrapper(*sys.exc_info()))})

        result_queue.put(pkl_package)
        tasks_in_progress.pop(worker_id)
        logger.info("All processing finished for task {}".format(tid))

def id_for_memo_serialize(obj: object, output_ref: bool = False) -> bytes:
    return serialize(obj)

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", required=True,
                        help="Input pickle file")
    parser.add_argument("-o", "--output", required=True,
                        help="Output pickle file")
    args = parser.parse_args()

    print(f"Input : {args.input}")
    print(f"Output : {args.output}")

    result = None

    with open(args.input, 'rb') as f:
        fn_buf = f.read()
    print("Read input pkl file")

    try:
        result = execute_task(fn_buf)
        print("Finished execution")
    except Exception as e:
        print(f"Execution failed due to {e}")
        result = e

    result_buf = serialize(result)
    with open(args.output, 'wb') as f:
        f.write(result_buf)

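# Example invocation of the script above (the script and pickle file names are
# illustrative only):
#
#     python execute_task.py -i task.pkl -o result.pkl
#
# The output pickle then holds, in serialized form, either the function's
# return value or the exception it raised.
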
def worker(worker_id, pool_id, pool_size, task_queue, result_queue,
           worker_queue, tasks_in_progress):
    """
    Put request token into queue
    Get task from task_queue
    Pop request from queue
    Put result into result_queue
    """
    start_file_logger('{}/block-{}/{}/worker_{}.log'.format(args.logdir, args.block_id, pool_id, worker_id),
                      worker_id,
                      name="worker_log",
                      level=logging.DEBUG if args.debug else logging.INFO)

    # Store worker ID as an environment variable
    os.environ['PARSL_WORKER_RANK'] = str(worker_id)
    os.environ['PARSL_WORKER_COUNT'] = str(pool_size)
    os.environ['PARSL_WORKER_POOL_ID'] = str(pool_id)

    # Sync worker with master
    logger.info('Worker {} started'.format(worker_id))
    if args.debug:
        logger.debug("Debug logging enabled")

    while True:
        worker_queue.put(worker_id)

        # The worker will receive {'task_id':<tid>, 'buffer':<buf>}
        req = task_queue.get()
        tasks_in_progress[worker_id] = req
        tid = req['task_id']
        logger.info("Received task {}".format(tid))

        try:
            worker_queue.get()
        except queue.Empty:
            logger.warning("Worker ID: {} failed to remove itself from ready_worker_queue".format(worker_id))

        try:
            result = execute_task(req['buffer'])
            serialized_result = serialize(result, buffer_threshold=1e6)
        except Exception as e:
            logger.info('Caught an exception: {}'.format(e))
            result_package = {'task_id': tid,
                              'exception': serialize(RemoteExceptionWrapper(*sys.exc_info()))}
        else:
            result_package = {'task_id': tid, 'result': serialized_result}
            # logger.debug("Result: {}".format(result))

        logger.info("Completed task {}".format(tid))
        try:
            pkl_package = pickle.dumps(result_package)
        except Exception:
            logger.exception("Caught exception while trying to pickle the result package")
            pkl_package = pickle.dumps({'task_id': tid,
                                        'exception': serialize(RemoteExceptionWrapper(*sys.exc_info()))})

        result_queue.put(pkl_package)
        tasks_in_progress.pop(worker_id)