def main():
    """Run all generated tasks through a Work Queue master until they finish.

    Parses the command line, builds the task map with ``generate_tasks``,
    submits every task, and then loops on ``wq.wait`` until ``done_check``
    is true for every task. Successful tasks are post-processed with
    ``handle_success``; tasks that hit the retry limit are noted; any other
    failure is reported and resubmitted.

    The temporary result directory is removed on exit, including when an
    exception escapes the loop.
    """
    args = parse_args()

    # Create a temporary file store for task results.
    tmpdir = tempfile.mkdtemp()

    try:
        # Load the task input data here
        task_inputs = load_inputs()

        print('Creating tasks')
        tasks = generate_tasks(args.command, task_inputs,
                               args.infiles, args.outfile,
                               tmpdir, args.max_retries)

        # Create the Work Queue master that manages task distribution.
        work_queue.cctools_debug_flags_set("all")
        work_queue.cctools_debug_config_file(f'{args.name}.debug')
        work_queue.cctools_debug_config_file_size(0)
        wq = WorkQueue(port=args.port, name=args.name, shutdown=True)
        wq.specify_log(f'{args.name}.log')

        # Submit all tasks to the queue.
        print('Submitting tasks')
        for t in tasks.values():
            wq.submit(t)

        # The main loop waits for a task to get done then handles success or
        # failure accordingly
        print('Entering main loop')
        while not all(done_check(t) for t in tasks.values()):
            t = wq.wait(10)  # This blocks for 10s or until a task is done.
            if t is None:
                continue

            tasks[t.tag] = t  # Update the task map with the correct status

            # On success, post-process the task. If the maximum number of
            # submissions for a task has been reached, make a note. Otherwise,
            # report the failure and resubmit.
            if t.return_status == 0 and t.result == WORK_QUEUE_RESULT_SUCCESS:
                print(f'Task {t.tag} completed successfully.')
                handle_success(t, tmpdir, args.outfile)
            elif t.result == WORK_QUEUE_RESULT_MAX_RETRIES:
                print(f'Task {t.tag} resubmitted too many times.')
            else:
                # Report before resubmitting: resubmission may reset the
                # task's result and captured output (NOTE(review): confirm
                # against the Work Queue version in use).
                print(f'Task {t.tag} failed with result {t.result}')
                print(t.output)
                wq.submit(t)

        print('All tasks completed or hit max retries.')
    finally:
        # Always remove the scratch directory, even if the run aborted.
        print('Cleaning up...')
        shutil.rmtree(tmpdir)
    print('Done')
def _work_queue_submit_wait(task_queue=multiprocessing.Queue(),
                            launch_cmd=None,
                            env=None,
                            collector_queue=multiprocessing.Queue(),
                            data_dir=".",
                            full=False,
                            shared_fs=False,
                            autolabel=False,
                            autolabel_window=None,
                            autocategory=False,
                            should_stop=None,
                            port=WORK_QUEUE_DEFAULT_PORT,
                            wq_log_dir=None,
                            project_password_file=None,
                            project_name=None):
    """Thread to handle Parsl app submissions to the Work Queue objects.

    Takes in Parsl functions submitted using submit(), and creates a
    Work Queue task with the appropriate specifications, which is then
    submitted to Work Queue. After tasks are completed, processes the
    exit status and exit code of the task, and sends results to the
    Work Queue collector thread.

    To avoid python's global interpreter lock with work queue's wait, this
    function should be launched as a process, not as a lightweight thread.
    This means that any communication should be done using the
    multiprocessing module capabilities, rather than shared memory.

    Parameters
    ----------
    task_queue : queue of task descriptions to submit. Each item is read for
        ``id``, ``category``, ``env_pkg``, ``map_file``, ``function_file``,
        ``result_file``, ``input_files`` and ``output_files``.
    launch_cmd : format string with the fields ``package_prefix``,
        ``mapping``, ``function`` and ``result``.
    env : optional mapping of environment variables set on every task.
    collector_queue : queue that receives one ``WqTaskToParsl`` per task.
    data_dir, autocategory : accepted but not used by this function.
    full : enable full resource monitoring (only when ``wq_log_dir`` is set).
    shared_fs : when true, task input/output files are not staged.
    autolabel, autolabel_window : enable monitoring and, optionally, tune
        ``category-steady-n-tasks``.
    should_stop : object whose ``.value`` is polled to end the loop. It must
        be provided; the ``None`` default fails on first access.
    port, wq_log_dir, project_password_file, project_name : WorkQueue setup.

    Returns
    -------
    int
        Always 0, once the loop ends.
    """
    logger.debug("Starting WorkQueue Submit/Wait Process")

    # Enable debugging flags and create logging file
    wq_debug_log = None
    if wq_log_dir is not None:
        logger.debug("Setting debugging flags and creating logging file")
        wq_debug_log = os.path.join(wq_log_dir, "debug_log")

    # Create WorkQueue queue object
    logger.debug("Creating WorkQueue Object")
    try:
        logger.debug("Listening on port {}".format(port))
        q = WorkQueue(port, debug_log=wq_debug_log)
    except Exception as e:
        logger.error("Unable to create WorkQueue object: {}".format(e))
        raise

    # Specify WorkQueue queue attributes
    if project_name:
        q.specify_name(project_name)

    if project_password_file:
        q.specify_password_file(project_password_file)

    if autolabel:
        q.enable_monitoring()
        if autolabel_window is not None:
            q.tune('category-steady-n-tasks', autolabel_window)

    # Only write logs when the wq_log_dir is specified, which it most likely will be
    if wq_log_dir is not None:
        wq_master_log = os.path.join(wq_log_dir, "master_log")
        wq_trans_log = os.path.join(wq_log_dir, "transaction_log")
        if full:
            wq_resource_log = os.path.join(wq_log_dir, "resource_logs")
            q.enable_monitoring_full(dirname=wq_resource_log)
        q.specify_log(wq_master_log)
        q.specify_transactions_log(wq_trans_log)

    orig_ppid = os.getppid()

    # Mapping task tag -> result file, for tasks currently held by Work Queue.
    result_file_of_task_id = {}

    while not should_stop.value:
        # A changed parent pid means the launching process is gone.
        if os.getppid() != orig_ppid:
            logger.debug("new Process")
            break

        # Submit tasks
        while task_queue.qsize() > 0 and not should_stop.value:
            # Obtain task from task_queue
            try:
                task = task_queue.get(timeout=1)
                logger.debug("Removing task from queue")
            except queue.Empty:
                continue

            pkg_pfx = ""
            if task.env_pkg is not None:
                pkg_pfx = "./{} -e {} ".format(
                    os.path.basename(package_run_script),
                    os.path.basename(task.env_pkg))

            # Create command string
            logger.debug(launch_cmd)
            command_str = launch_cmd.format(
                package_prefix=pkg_pfx,
                mapping=os.path.basename(task.map_file),
                function=os.path.basename(task.function_file),
                result=os.path.basename(task.result_file))
            logger.debug(command_str)

            # Create WorkQueue task for the command
            logger.debug("Sending task {} with command: {}".format(
                task.id, command_str))
            try:
                t = Task(command_str)
            except Exception as e:
                logger.error("Unable to create task: {}".format(e))
                collector_queue.put_nowait(
                    WqTaskToParsl(
                        id=task.id,
                        result_received=False,
                        result=None,
                        reason="task could not be created by work queue",
                        status=-1))
                continue

            t.specify_category(task.category)
            if autolabel:
                q.specify_category_mode(
                    task.category, WORK_QUEUE_ALLOCATION_MODE_MAX_THROUGHPUT)

            # Specify environment variables for the task
            if env is not None:
                for var in env:
                    t.specify_environment_variable(var, env[var])

            if task.env_pkg is not None:
                t.specify_input_file(package_run_script, cache=True)
                t.specify_input_file(task.env_pkg, cache=True)

            # Specify script, and data/result files for task
            t.specify_input_file(exec_parsl_function.__file__, cache=True)
            t.specify_input_file(task.function_file, cache=False)
            t.specify_input_file(task.map_file, cache=False)
            t.specify_output_file(task.result_file, cache=False)
            t.specify_tag(str(task.id))

            logger.debug("Parsl ID: {}".format(task.id))

            # Specify input/output files that need to be staged.
            # Absolute paths are assumed to be in shared filesystem, and thus
            # not staged by work queue.
            if not shared_fs:
                for spec in task.input_files:
                    if spec.stage:
                        t.specify_input_file(spec.parsl_name,
                                             spec.parsl_name,
                                             cache=spec.cache)
                for spec in task.output_files:
                    if spec.stage:
                        t.specify_output_file(spec.parsl_name,
                                              spec.parsl_name,
                                              cache=spec.cache)

            # Submit the task to the WorkQueue object
            logger.debug("Submitting task {} to WorkQueue".format(task.id))
            try:
                wq_id = q.submit(t)
            except Exception as e:
                logger.error(
                    "Unable to submit task to work queue: {}".format(e))
                collector_queue.put_nowait(
                    WqTaskToParsl(
                        id=task.id,
                        result_received=False,
                        result=None,
                        reason="task could not be submited to work queue",
                        status=-1))
                continue

            # Register the result file only once Work Queue owns the task, so
            # a failed submission does not leave a stale entry behind.
            result_file_of_task_id[str(task.id)] = task.result_file
            logger.debug("Task {} submitted to WorkQueue with id {}".format(
                task.id, wq_id))

        # If the queue is not empty wait on the WorkQueue queue for a task
        if q.empty():
            continue

        while not should_stop.value:
            # Obtain the task from the queue
            t = q.wait(1)
            if t is None:
                break

            # When a task is found:
            parsl_id = t.tag
            logger.debug("Completed WorkQueue task {}, parsl task {}".format(
                t.id, t.tag))
            result_file = result_file_of_task_id.pop(t.tag)

            # A task completes 'successfully' if it has a result file,
            # and it can be loaded. This may mean that the 'success' is
            # an exception.
            logger.debug("Looking for result in {}".format(result_file))
            try:
                # NOTE(review): unpickling runs code chosen by whoever wrote
                # the result file; this assumes workers are trusted.
                with open(result_file, "rb") as f_in:
                    result = pickle.load(f_in)
            except Exception as e:
                # If a result file could not be generated, explain the
                # failure according to work queue error codes. The exception
                # is forwarded as the result, to match the positive case.
                reason = _explain_work_queue_result(t)
                logger.debug("Did not find result in {}".format(result_file))
                logger.debug(
                    "Wrapper Script status: {}\nWorkQueue Status: {}".format(
                        t.return_status, t.result))
                logger.debug(
                    "Task with id parsl {} / wq {} failed because:\n{}".format(
                        parsl_id, t.id, reason))
                collector_queue.put_nowait(
                    WqTaskToParsl(id=parsl_id,
                                  result_received=False,
                                  result=e,
                                  reason=reason,
                                  status=t.return_status))
            else:
                # Forwarding happens outside the try block so that an error
                # while enqueueing is not misreported as a task failure.
                logger.debug("Found result in {}".format(result_file))
                collector_queue.put_nowait(
                    WqTaskToParsl(id=parsl_id,
                                  result_received=True,
                                  result=result,
                                  reason=None,
                                  status=t.return_status))

    logger.debug("Exiting WorkQueue Monitoring Process")
    return 0
def _wq_wrapper_failure_reason(status, output):
    """Return the failure text for a nonzero wrapper-script exit status."""
    causes = {
        1: "problem parsing command line options",
        2: "problem loading function data",
        3: "problem remapping file names",
        4: "problem writing out function result",
    }
    cause = causes.get(status)
    if cause is None:
        cause = "unable to process wrapper script failure with status = {}".format(
            status)
    return "Wrapper Script Failure: " + cause + "\nTrace:\n" + str(output)


def _wq_system_failure_reason(task_result):
    """Return the failure text for a Work Queue result code."""
    causes = {
        1: "missing input file",
        2: "unable to generate output file",
        4: "stdout has been truncated",
        1 << 3: "task terminated with a signal",
        2 << 3: "task used more resources than requested",
        3 << 3: "task ran past the specified end time",
        4 << 3: "result could not be classified",
        5 << 3: "task failed, but not a task error",
        6 << 3: "unable to complete after specified number of retries",
        7 << 3: "task ran for more than the specified time",
        8 << 3: "task needed more space to complete task",
    }
    return "WorkQueue System Failure: " + causes.get(
        task_result, "unable to process Work Queue system failure")


def WorkQueueSubmitThread(task_queue=multiprocessing.Queue(),
                          queue_lock=threading.Lock(),
                          launch_cmd=None,
                          env=None,
                          collector_queue=multiprocessing.Queue(),
                          see_worker_output=False,
                          data_dir=".",
                          full=False,
                          cancel_value=multiprocessing.Value('i', 1),
                          port=WORK_QUEUE_DEFAULT_PORT,
                          wq_log_dir=None,
                          project_password=None,
                          project_password_file=None,
                          project_name=None):
    """Thread to handle Parsl app submissions to the Work Queue objects.

    Takes in Parsl functions submitted using submit(), and creates a
    Work Queue task with the appropriate specifications, which is then
    submitted to Work Queue. After tasks are completed, processes the
    exit status and exit code of the task, and sends results to the
    Work Queue collector thread.

    Parameters
    ----------
    task_queue : queue of dicts with the keys ``task_id``, ``data_loc``,
        ``result_loc``, ``input_files``, ``output_files`` and ``std_files``.
        File entries are indexed as ``[0]`` local name, ``[1]`` remote name,
        ``[2]`` cache flag and, for inputs, ``[3]`` kind (``"std"`` or other).
    queue_lock : accepted but not used by this function.
    launch_cmd : format string with the fields ``input_file``,
        ``output_file`` and ``remapping_string``.
    env : optional mapping of environment variables set on every task.
    collector_queue : queue receiving one message dict per task, with the
        keys ``tid`` and ``result_received`` plus either ``result`` or
        ``reason``/``status``.
    see_worker_output : print the task output of successful tasks.
    data_dir : directory holding ``task_<id>_function_result`` files.
    full : enable full resource monitoring (only when ``wq_log_dir`` is set).
    cancel_value : shared value; the loop stops when ``.value`` becomes 0.
    port, wq_log_dir, project_password, project_password_file, project_name :
        WorkQueue setup.

    Returns
    -------
    int
        Always 0, after cancelling the tasks still held by Work Queue.
    """
    logger.debug("Starting WorkQueue Submit/Wait Process")

    # Enable debugging flags and create logging file
    if wq_log_dir is not None:
        logger.debug("Setting debugging flags and creating logging file")
        wq_debug_log = os.path.join(wq_log_dir, "debug_log")
        cctools_debug_flags_set("all")
        cctools_debug_config_file(wq_debug_log)

    # Create WorkQueue queue object
    logger.debug("Creating WorkQueue Object")
    try:
        logger.debug("Listening on port {}".format(port))
        q = WorkQueue(port)
    except Exception as e:
        logger.error("Unable to create WorkQueue object: {}".format(e))
        raise

    # Specify WorkQueue queue attributes
    if project_name:
        q.specify_name(project_name)
    if project_password:
        q.specify_password(project_password)
    elif project_password_file:
        q.specify_password_file(project_password_file)

    # Only write logs when the wq_log_dir is specified, which it most likely will be
    if wq_log_dir is not None:
        wq_master_log = os.path.join(wq_log_dir, "master_log")
        wq_trans_log = os.path.join(wq_log_dir, "transaction_log")
        if full:
            wq_resource_log = os.path.join(wq_log_dir, "resource_logs")
            q.enable_monitoring_full(dirname=wq_resource_log)
        q.specify_log(wq_master_log)
        q.specify_transactions_log(wq_trans_log)

    # Work Queue ids of tasks submitted and not yet returned by q.wait().
    wq_tasks = set()
    orig_ppid = os.getppid()
    continue_running = True
    while continue_running:
        # A changed parent pid means the launching process is gone.
        if os.getppid() != orig_ppid:
            logger.debug("new Process")
            continue_running = False
            continue

        # Submit tasks
        while task_queue.qsize() > 0:
            if cancel_value.value == 0:
                logger.debug("cancel value set to cancel")
                continue_running = False
                break

            # Obtain task from task_queue
            try:
                item = task_queue.get(timeout=1)
                logger.debug("Removing task from queue")
            except queue.Empty:
                continue
            parsl_id = item["task_id"]

            # Extract information about the task
            function_data_loc = item["data_loc"]
            function_data_loc_remote = function_data_loc.split("/")[-1]
            function_result_loc = item["result_loc"]
            function_result_loc_remote = function_result_loc.split("/")[-1]
            input_files = item["input_files"]
            output_files = item["output_files"]
            std_files = item["std_files"]

            full_script_name = workqueue_worker.__file__
            script_name = full_script_name.split("/")[-1]

            # "std" inputs are moved into place by the command itself; all
            # other inputs and every output are passed as local:remote pairs.
            remap_pairs = []
            std_moves = []
            logger.debug("Looking at input")
            for spec in input_files:
                if spec[3] == "std":
                    std_moves.append("mv " + spec[1] + " " + spec[0] + "; ")
                else:
                    remap_pairs.append(spec[0] + ":" + spec[1])
            logger.debug(",".join(remap_pairs))

            logger.debug("Looking at output")
            for spec in output_files:
                remap_pairs.append(spec[0] + ":" + spec[1])
            logger.debug(",".join(remap_pairs))

            # Only emit "-r" when there is something to remap; a bare "-r"
            # (e.g. when every input is a "std" file) has no argument.
            remapping_string = ""
            if remap_pairs:
                remapping_string = "-r " + ",".join(remap_pairs)
            std_string = "".join(std_moves)

            # Create command string
            logger.debug(launch_cmd)
            command_str = launch_cmd.format(
                input_file=function_data_loc_remote,
                output_file=function_result_loc_remote,
                remapping_string=remapping_string)
            command_str = std_string + command_str
            logger.debug(command_str)

            # Create WorkQueue task for the command
            logger.debug("Sending task {} with command: {}".format(
                parsl_id, command_str))
            try:
                t = Task(command_str)
            except Exception as e:
                logger.error("Unable to create task: {}".format(e))
                # Report the failure; otherwise the Parsl task never resolves.
                collector_queue.put_nowait({
                    "tid": parsl_id,
                    "result_received": False,
                    "reason": "Workqueue Task Start Failure",
                    "status": 1
                })
                continue

            # Specify environment variables for the task
            if env is not None:
                for var in env:
                    t.specify_environment_variable(var, env[var])

            # Specify script, and data/result files for task
            t.specify_file(full_script_name, script_name,
                           WORK_QUEUE_INPUT, cache=True)
            t.specify_file(function_data_loc, function_data_loc_remote,
                           WORK_QUEUE_INPUT, cache=False)
            t.specify_file(function_result_loc, function_result_loc_remote,
                           WORK_QUEUE_OUTPUT, cache=False)
            t.specify_tag(str(parsl_id))
            logger.debug("Parsl ID: {}".format(parsl_id))

            # Specify all input/output files for task
            for spec in input_files:
                t.specify_file(spec[0], spec[1], WORK_QUEUE_INPUT,
                               cache=spec[2])
            for spec in output_files:
                t.specify_file(spec[0], spec[1], WORK_QUEUE_OUTPUT,
                               cache=spec[2])
            for spec in std_files:
                t.specify_file(spec[0], spec[1], WORK_QUEUE_OUTPUT,
                               cache=spec[2])

            # Submit the task to the WorkQueue object
            logger.debug("Submitting task {} to WorkQueue".format(parsl_id))
            try:
                wq_id = q.submit(t)
            except Exception as e:
                logger.error(
                    "Unable to submit task to WorkQueue: {}".format(e))
                collector_queue.put_nowait({
                    "tid": parsl_id,
                    "result_received": False,
                    "reason": "Workqueue Task Start Failure",
                    "status": 1
                })
                continue
            wq_tasks.add(wq_id)
            logger.debug("Task {} submitted to WorkQueue with id {}".format(
                parsl_id, wq_id))

        if cancel_value.value == 0:
            continue_running = False

        # If the queue is not empty wait on the WorkQueue queue for a task
        if not q.empty() and continue_running:
            while True:
                if cancel_value.value == 0:
                    continue_running = False
                    break

                # Obtain the task from the queue
                t = q.wait(1)
                if t is None:
                    break

                parsl_tid = t.tag
                logger.debug(
                    "Completed WorkQueue task {}, parsl task {}".format(
                        t.id, parsl_tid))
                status = t.return_status
                task_result = t.result

                failed = status != 0 or (
                    task_result != WORK_QUEUE_RESULT_SUCCESS
                    and task_result != WORK_QUEUE_RESULT_OUTPUT_MISSING)

                if failed:
                    logger.debug(
                        "Wrapper Script status: {}\nWorkQueue Status: {}".format(
                            status, task_result))
                    if status != 0:
                        # Wrapper script failure
                        logger.debug(
                            "WorkQueue task {} failed with status {}".format(
                                t.id, status))
                        reason = _wq_wrapper_failure_reason(status, t.output)
                        logger.debug(
                            "WorkQueue runner script failed for task {} because {}\n"
                            .format(parsl_tid, reason))
                    else:
                        # WorkQueue system failure
                        reason = _wq_system_failure_reason(task_result)
                    msg = {
                        "tid": parsl_tid,
                        "result_received": False,
                        "reason": reason,
                        "status": status
                    }
                else:
                    # Print the output from the task
                    if see_worker_output:
                        print(t.output)

                    # Load the function result written by the wrapper script
                    result_loc = os.path.join(
                        data_dir,
                        "task_" + str(parsl_tid) + "_function_result")
                    logger.debug(
                        "Looking for result in {}".format(result_loc))
                    try:
                        # NOTE(review): unpickling runs code chosen by whoever
                        # wrote the file; this assumes workers are trusted.
                        with open(result_loc, "rb") as f:
                            result = pickle.load(f)
                    except Exception as e:
                        # A missing or unreadable result is a task failure,
                        # not a reason to stop serving the other tasks.
                        logger.debug(
                            "Did not find result in {}".format(result_loc))
                        msg = {
                            "tid": parsl_tid,
                            "result_received": False,
                            "reason": "WorkQueue System Failure: "
                                      "unable to load function result: {}".format(e),
                            "status": status
                        }
                    else:
                        msg = {
                            "tid": parsl_tid,
                            "result_received": True,
                            "result": result
                        }

                # The task has left Work Queue whatever its outcome, so it
                # must not be cancelled at shutdown; report it exactly once.
                wq_tasks.discard(t.id)
                collector_queue.put_nowait(msg)

        if continue_running is False:
            logger.debug("Exiting WorkQueue Master Thread event loop")
            break

    # Remove all WorkQueue tasks that remain in the queue object
    for wq_task in wq_tasks:
        logger.debug("Cancelling WorkQueue Task {}".format(wq_task))
        q.cancel_by_taskid(wq_task)

    logger.debug("Exiting WorkQueue Monitoring Process")
    return 0
print task.command, task.algorithm task.specify_buffer('hello from %d' % i, ifile, cache=False) if i % 2: task.specify_output_file(ofile, cache=False) else: task.specify_file(ofile, type=WORK_QUEUE_OUTPUT, cache=False) outputs.append(ofile) wq.submit(task) if wq.empty(): print 'work queue is empty' while not wq.empty(): t = wq.wait(10) if t: print t.tag print wq.stats.workers_init, wq.stats.workers_ready, wq.stats.workers_busy, \ wq.stats.tasks_running, wq.stats.tasks_waiting, wq.stats.tasks_complete map(os.unlink, outputs) for i in range(5): task = Task('hostname && date +%s.%N') task.specify_input_file('/bin/hostname') wq.submit(task) if wq.hungry(): print 'work queue is hungry'
class QMaster(threading.Thread):
    """Background thread that owns a WorkQueue and feeds it trajectories.

    The thread starts itself on construction. Callers use ``submit`` to
    enqueue work, ``wait`` to block until something happens in the loop, and
    ``stop`` to shut the workers down and end the thread.
    """

    def __init__(self, project, port, log_freq=600):  # 600 seconds
        """Initialize the QMaster

        Parameters
        ----------
        project :
            Project object; ``method`` (``'explicit'`` or ``'implicit'``) and
            ``pdb_topology_file`` are read from it.
        port : int
            Port the WorkQueue listens on.
        log_freq : int, optional
            frequency to print info about the status of the work queue.
            In units of seconds. Default is to print every 10 minutes.

        Raises
        ------
        Exception
            If ``project.method`` is neither ``'explicit'`` nor ``'implicit'``.
        """
        threading.Thread.__init__(self)
        self.project = project

        self.log_freq = log_freq  # print time in seconds
        self.wake_freq = 1  # seconds

        self.wq = WorkQueue(port, name='MSMAccelerator', catalog=True, exclusive=False)

        logger.info('WORK QUEUE MASTER LISTENING ON PORT: %d', self.wq.port)
        logger.info('(Start a local worker with >> work_queue_worker -d all localhost %d & )', self.wq.port)

        # method controls whether or not we need to bring back solvated_xtc as well
        if self.project.method == 'explicit':
            self.return_wet_xtc = True
        elif self.project.method == 'implicit':
            self.return_wet_xtc = False
        else:
            raise Exception("project.method must be 'explicit' or 'implicit'")
        logger.info('Return wet xtc set to %s', self.return_wet_xtc)

        # what does this specify algorithm do?
        self.wq.specify_algorithm(WORK_QUEUE_SCHEDULE_FCFS)

        # fast abort kills jobs that appear to be straggling (taking more than 1.5x average)
        #self.wq.activate_fast_abort(1.5)

        # Setting the stop event signals for the thread to die. It must not
        # be named ``_stop``: on Python 3 that would shadow the private
        # ``threading.Thread._stop`` method and break thread teardown/join().
        self._stop_event = threading.Event()

        # the thread sets the event every time a job returns or there are no waiting jobs
        # and it finished post processing. See the wait method
        self._mainloop_wake_event_cause = None
        self._mainloop_wake_event = threading.Event()

        # start the thread
        self.start()

    def run(self):
        """Main thread-loop for the QMaster thread"""
        last_print = time.time()

        while True:
            time.sleep(self.wake_freq)

            if not self.wq.empty():
                t = self.wq.wait(self.wake_freq)
                if t:
                    if t.return_status != 0:
                        logger.error('Worker returned nonzero exit status for job: %d', t.return_status)
                    else:
                        self.on_return(t)
                    self._mainloop_wake_event_cause = 'job returned'
                    self._mainloop_wake_event.set()

            # also set the event if there are no tasks in the queue
            if self.wq.stats.tasks_waiting == 0 and not self._mainloop_wake_event.is_set():
                self._mainloop_wake_event_cause = 'queue empty'
                self._mainloop_wake_event.set()

            if self._stop_event.is_set():
                logger.info('Recieved stop signal. Shutting down all workers')
                self.wq.shutdown_workers(0)  # 0 indicates to shut all of them down
                # Returning ends the thread; sys.exit() inside a thread only
                # raises SystemExit in that thread, which has the same effect.
                return

            if time.time() - last_print > self.log_freq:
                stats = self.wq.stats
                logger.info('workers initialized: %d, ready: %d, busy: %d',
                            stats.workers_init, stats.workers_ready, stats.workers_busy)
                logger.info('tasks running: %d, waiting: %d, complete: %d',
                            stats.tasks_running, stats.tasks_waiting, stats.tasks_complete)
                last_print = time.time()

    def num_jobs_waiting(self):
        """Number of jobs waiting to be sent out

        This number should be kept at 1, and when it drops to zero a new job
        should be generated.

        Returns
        -------
        n : int
            The number
        """
        return self.wq.stats.tasks_waiting

    def num_jobs_in_queue(self):
        """Get the number of jobs currently in the work queue

        This includes both the jobs running remotely and the ones waiting
        here

        Returns
        -------
        n : int
            The number
        """
        stats = self.wq.stats
        return stats.tasks_running + stats.tasks_waiting

    def stop(self):
        """Signal the Qmaster thread to stop"""
        self._stop_event.set()

    def wait(self):
        """Block until some sort of action happens in the main-thread loop.

        This call will return either when a job has returned from the workers,
        or when the queue is empty (last job in the local queue has been sent
        out)

        Returns
        -------
        wakeup_cause : str
            Either 'job returned' or 'queue empty', depending on the reason

        Raises
        ------
        Exception
            If the recorded wake-up cause is neither of the two values above.
        """
        self._mainloop_wake_event.wait()
        self._mainloop_wake_event.clear()
        cause = self._mainloop_wake_event_cause

        if cause not in ('job returned', 'queue empty'):
            raise Exception('Bad wakeup cause')

        return cause

    @with_db_lock
    def submit(self, traj):
        """ Submit a job to the work-queue for further sampling.

        Parameters
        ----------
        traj :
            Trajectory record to run. It must not have been submitted before,
            must carry an ``init_pdb`` attribute, and must have both ``mode``
            and ``forcefield`` set.

        Raises
        ------
        ValueError
            If the trajectory was already submitted, has no ``init_pdb``, or
            has no ``mode``/``forcefield``.
        """
        if traj.submit_time is not None:
            raise ValueError("This traj has already been submitted")
        Session.add(traj)
        Session.flush()
        traj.populate_default_filenames()

        if not hasattr(traj, 'init_pdb'):
            raise ValueError('Traj is supposed to have a pdb object tacked on')
        save_file(traj.init_pdb_fn, traj.init_pdb)

        # Validate before the first dereference of traj.forcefield, so a
        # missing forcefield is reported as a malformed traj rather than as
        # an AttributeError.
        if traj.mode is None or traj.forcefield is None:
            raise ValueError('malformed traj')

        remote_driver_fn = os.path.split(str(traj.forcefield.driver))[1]
        remote_pdb_fn = 'input.pdb'
        remote_output_fn = 'production_dry{}'.format(traj.forcefield.output_extension)

        task = Task('chmod +x ./{driver}; ./{driver} {pdb_fn} {ff} {water} {mode} {threads}'.format(
            pdb_fn=remote_pdb_fn,
            mode=traj.mode,
            driver=remote_driver_fn,
            ff=traj.forcefield.name,
            water=traj.forcefield.water,
            threads=traj.forcefield.threads))

        # why does traj.forcefield.driver come out as unicode?
        task.specify_input_file(str(traj.forcefield.driver), remote_driver_fn)
        task.specify_output_file(traj.wqlog_fn, 'logs/driver.log')
        task.specify_input_file(traj.init_pdb_fn, remote_pdb_fn)
        task.specify_output_file(traj.dry_xtc_fn, remote_output_fn)

        if self.return_wet_xtc:
            # this is the XTC file with waters, generated by the driver
            # when you're doing implicit solvent only, this stuff is not used.
            remote_wet_output_fn = 'production_wet{}'.format(traj.forcefield.output_extension)
            task.specify_output_file(traj.wet_xtc_fn, remote_wet_output_fn)
            task.specify_output_file(traj.last_wet_snapshot_fn, 'last_wet_snapshot.pdb')
        else:
            logger.debug('Not requesting production_wet%s from driver (implicit)',
                         traj.forcefield.output_extension)

        # The tag carries the trajectory id; on_return uses it for the lookup.
        task.specify_tag(str(traj.id))
        task.specify_algorithm(WORK_QUEUE_SCHEDULE_FILES)  # what does this do?

        traj.submit_time = datetime.now()

        self.wq.submit(task)
        logger.info('Submitted to queue: %s', traj)

    @with_db_lock
    def on_return(self, task):
        """Post-process a task returned by the workers.

        Called from the QMaster thread's ``run`` loop for tasks that exited
        with status 0. Converts the returned trajectory to lh5 and records
        host, return time and length on the trajectory record. Any failure
        during the conversion is logged and re-raised.
        """
        logger.info('Retrieved task %s', task.tag)
        traj = Session.query(models.Trajectory).get(int(task.tag))

        try:
            # save lh5 version of the trajectory
            conf = load_file(self.project.pdb_topology_file)
            coordinates = msmbuilder.Trajectory.load_trajectory_file(str(traj.dry_xtc_fn), Conf=conf)
            save_file(traj.lh5_fn, coordinates)
        except Exception as e:
            logger.error('When postprocessing %s, convert to lh5 failed!', traj)
            logger.exception(e)
            raise

        # convert last_wet_snapshot to lh5
        pdb_to_lh5(traj, 'last_wet_snapshot_fn')
        pdb_to_lh5(traj, 'init_pdb_fn')

        traj.host = task.host
        traj.returned_time = datetime.now()
        traj.length = len(coordinates)
        logger.info('Finished converting new traj to lh5 sucessfully')
def WorkQueueSubmitThread(task_queue=multiprocessing.Queue(),
                          queue_lock=threading.Lock(),
                          launch_cmd=None,
                          env=None,
                          collector_queue=multiprocessing.Queue(),
                          see_worker_output=False,
                          data_dir=".",
                          full=False,
                          cancel_value=multiprocessing.Value('i', 1),
                          port=WORK_QUEUE_DEFAULT_PORT,
                          wq_log_dir=None,
                          project_password=None,
                          project_password_file=None,
                          project_name=None):
    """Submit queued Parsl tasks to Work Queue and forward their results.

    Creates a WorkQueue on ``port``, then loops: drains ``task_queue`` into
    Work Queue tasks, waits for completed tasks, and puts one message dict
    per task on ``collector_queue`` (keys ``tid`` and ``result_received``
    plus either ``result`` or ``reason``/``status``). The loop ends when the
    parent pid changes or ``cancel_value.value`` becomes 0; tasks still held
    by Work Queue are then cancelled.

    ``task_queue`` items are dicts with the keys ``task_id``, ``data_loc``,
    ``result_loc``, ``input_files``, ``output_files`` and ``std_files``. File
    entries are indexed as ``[0]`` local name, ``[1]`` remote name, ``[2]``
    cache flag and, for inputs, ``[3]`` kind (``"std"`` or other).
    ``launch_cmd`` is a format string with the fields ``input_file``,
    ``output_file`` and ``remapping_string``. ``queue_lock`` is accepted but
    not used.

    Returns 0 once the loop has ended.
    """
    logger.debug("Starting WorkQueue Submit/Wait Process")

    orig_ppid = os.getppid()

    # Work Queue ids of tasks submitted and not yet returned by q.wait().
    wq_tasks = set()

    continue_running = True

    if wq_log_dir is not None:
        wq_debug_log = os.path.join(wq_log_dir, "debug")
        cctools_debug_flags_set("all")
        cctools_debug_config_file(wq_debug_log)

    logger.debug("Creating Workqueue Object")
    try:
        q = WorkQueue(port)
    except Exception as e:
        # Was `"...{}", format(e)`: the comma passed the text as a logging
        # argument, so the placeholder was never filled in.
        logger.error("Unable to create Workqueue object: {}".format(e))
        raise

    if project_name:
        q.specify_name(project_name)
    if project_password:
        q.specify_password(project_password)
    elif project_password_file:
        q.specify_password_file(project_password_file)

    # Only write logs when the log_dir is specified, which it most likely will be
    if wq_log_dir is not None:
        wq_master_log = os.path.join(wq_log_dir, "master_log")
        wq_trans_log = os.path.join(wq_log_dir, "transaction_log")
        if full:
            wq_resource_log = os.path.join(wq_log_dir, "resource_logs")
            q.enable_monitoring_full(dirname=wq_resource_log)
        q.specify_log(wq_master_log)
        q.specify_transactions_log(wq_trans_log)

    while continue_running:
        # A changed parent pid means the launching process is gone.
        if os.getppid() != orig_ppid:
            continue_running = False
            continue

        # Submit Tasks
        while task_queue.qsize() > 0:
            if cancel_value.value == 0:
                continue_running = False
                break

            try:
                item = task_queue.get(timeout=1)
                logger.debug("Removing task from queue")
            except queue.Empty:
                continue
            parsl_id = item["task_id"]

            function_data_loc = item["data_loc"]
            function_result_loc = item["result_loc"]
            function_result_loc_remote = function_result_loc.split("/")[-1]
            function_data_loc_remote = function_data_loc.split("/")[-1]

            input_files = item["input_files"]
            output_files = item["output_files"]
            std_files = item["std_files"]

            full_script_name = workqueue_worker.__file__
            script_name = full_script_name.split("/")[-1]

            # "std" inputs are moved into place by the command itself; all
            # other inputs and every output are passed as local:remote pairs.
            remap_pairs = []
            std_moves = []
            logger.debug("looking at input")
            for spec in input_files:
                if spec[3] == "std":
                    std_moves.append("mv " + spec[1] + " " + spec[0] + "; ")
                else:
                    remap_pairs.append(spec[0] + ":" + spec[1])
            logger.debug(",".join(remap_pairs))

            logger.debug("looking at output")
            for spec in output_files:
                remap_pairs.append(spec[0] + ":" + spec[1])
            logger.debug(",".join(remap_pairs))

            # Only emit "-r" when there is something to remap; a bare "-r"
            # (e.g. when every input is a "std" file) has no argument.
            remapping_string = ""
            if remap_pairs:
                remapping_string = "-r " + ",".join(remap_pairs)
            std_string = "".join(std_moves)

            logger.debug(launch_cmd)
            command_str = launch_cmd.format(
                input_file=function_data_loc_remote,
                output_file=function_result_loc_remote,
                remapping_string=remapping_string)

            logger.debug(command_str)
            command_str = std_string + command_str
            logger.debug(command_str)

            logger.debug("Sending task {} with command: {}".format(
                parsl_id, command_str))
            try:
                t = Task(command_str)
            except Exception as e:
                logger.error("Unable to create task: {}".format(e))
                # Report the failure; otherwise the Parsl task never resolves.
                collector_queue.put_nowait({
                    "tid": parsl_id,
                    "result_received": False,
                    "reason": "Workqueue Task Start Failure",
                    "status": 1
                })
                continue

            if env is not None:
                for var in env:
                    t.specify_environment_variable(var, env[var])

            t.specify_file(full_script_name, script_name,
                           WORK_QUEUE_INPUT, cache=True)
            t.specify_file(function_result_loc, function_result_loc_remote,
                           WORK_QUEUE_OUTPUT, cache=False)
            t.specify_file(function_data_loc, function_data_loc_remote,
                           WORK_QUEUE_INPUT, cache=False)
            t.specify_tag(str(parsl_id))

            for spec in input_files:
                t.specify_file(spec[0], spec[1], WORK_QUEUE_INPUT,
                               cache=spec[2])
            for spec in output_files:
                t.specify_file(spec[0], spec[1], WORK_QUEUE_OUTPUT,
                               cache=spec[2])
            for spec in std_files:
                t.specify_file(spec[0], spec[1], WORK_QUEUE_OUTPUT,
                               cache=spec[2])

            logger.debug("Submitting task {} to workqueue".format(parsl_id))
            try:
                wq_id = q.submit(t)
            except Exception as e:
                logger.error(
                    "Unable to submit task to workqueue: {}".format(e))
                collector_queue.put_nowait({
                    "tid": parsl_id,
                    "result_received": False,
                    "reason": "Workqueue Task Start Failure",
                    "status": 1
                })
                continue
            wq_tasks.add(wq_id)
            logger.debug("Task {} submitted workqueue with id {}".format(
                parsl_id, wq_id))

        if cancel_value.value == 0:
            continue_running = False

        # Wait for Tasks
        # If the queue is not empty wait on the workqueue queue for a task
        if not q.empty() and continue_running:
            while True:
                if cancel_value.value == 0:
                    continue_running = False
                    break

                t = q.wait(1)
                if t is None:
                    break

                parsl_tid = t.tag
                logger.debug(
                    "Completed workqueue task {}, parsl task {}".format(
                        t.id, parsl_tid))
                status = t.return_status
                task_result = t.result

                failed = status != 0 or (
                    task_result != WORK_QUEUE_RESULT_SUCCESS
                    and task_result != WORK_QUEUE_RESULT_OUTPUT_MISSING)

                if failed:
                    if task_result == WORK_QUEUE_RESULT_SUCCESS:
                        # Work Queue ran the task; the wrapper script failed.
                        logger.debug(
                            "Workqueue task {} failed with status {}".format(
                                t.id, status))
                        reason = "Wrapper Script Failure: "
                        if status == 1:
                            reason += "command line parsing"
                        elif status == 2:
                            reason += "problem loading function data"
                        elif status == 3:
                            reason += "problem remapping file names"
                        elif status == 4:
                            reason += "problem writing out function result"
                        # str(): the captured output may be None.
                        reason += "\nTrace:\n" + str(t.output)
                        logger.debug(
                            "Workqueue runner script failed for task {} because {}\n"
                            .format(parsl_tid, reason))
                    else:
                        reason = "Workqueue system failure\n"

                    msg = {
                        "tid": parsl_tid,
                        "result_received": False,
                        "reason": reason,
                        "status": status
                    }
                else:
                    if see_worker_output:
                        print(t.output)

                    result_loc = os.path.join(
                        data_dir,
                        "task_" + str(parsl_tid) + "_function_result")
                    logger.debug(
                        "Looking for result in {}".format(result_loc))
                    try:
                        # NOTE(review): unpickling runs code chosen by whoever
                        # wrote the file; this assumes workers are trusted.
                        with open(result_loc, "rb") as f:
                            result = pickle.load(f)
                    except Exception as e:
                        # A missing or unreadable result is a task failure,
                        # not a reason to stop serving the other tasks.
                        logger.debug(
                            "Did not find result in {}".format(result_loc))
                        msg = {
                            "tid": parsl_tid,
                            "result_received": False,
                            "reason": "Workqueue system failure\n"
                                      "unable to load function result: {}".format(e),
                            "status": status
                        }
                    else:
                        msg = {
                            "tid": parsl_tid,
                            "result_received": True,
                            "result": result
                        }

                # The task has left Work Queue whatever its outcome, so it
                # must not be cancelled at shutdown; report it exactly once.
                wq_tasks.discard(t.id)
                collector_queue.put_nowait(msg)

        if continue_running is False:
            logger.debug("Exiting WorkQueue Master Thread event loop")
            break

    for wq_task in wq_tasks:
        logger.debug("Cancelling Workqueue Task {}".format(wq_task))
        q.cancel_by_taskid(wq_task)

    logger.debug("Exiting WorkQueue Monitoring Process")
    return 0
def work_queue_main(items, function, accumulator, **kwargs):
    """Execute using Work Queue

    For valid parameters, see :py:func:`work_queue_executor` in :py:mod:`executor`.
    For more information, see :ref:`intro-coffea-wq`

    The module-level ``_wq_queue`` is created on the first call and reused
    afterwards. Returns whatever the preprocessing or processing helper
    returns, depending on ``kwargs["desc"]``.
    """
    global _wq_queue

    _check_dynamic_chunksize_targets(kwargs["dynamic_chunksize"])

    # With compression enabled, both the user function and the accumulation
    # step are wrapped with the same level.
    compression = kwargs["compression"]
    accumulate_fn = accumulate_result_files
    if compression is not None:
        function = _compression_wrapper(compression, function)
        accumulate_fn = _compression_wrapper(compression, accumulate_result_files)

    _vprint.verbose_mode = kwargs["verbose"] or kwargs["print_stdout"]
    _vprint.status_mode = kwargs["status"]

    # No explicit port: use 0 when a master name is given, else 9123.
    if not kwargs["port"]:
        if kwargs["master_name"]:
            kwargs["port"] = 0
        else:
            kwargs["port"] = 9123

    if kwargs["environment_file"] and not kwargs["wrapper"]:
        raise ValueError(
            "Location of python_package_run could not be determined automatically.\nUse 'wrapper' argument to the work_queue_executor."
        )

    if _wq_queue is None:
        queue_options = dict(
            port=kwargs["port"],
            name=kwargs["master_name"],
            debug_log=kwargs["debug_log"],
            stats_log=kwargs["stats_log"],
            transactions_log=kwargs["transactions_log"],
        )
        _wq_queue = WorkQueue(**queue_options)

        # Make use of the stored password file, if enabled.
        password_file = kwargs["password_file"]
        if password_file is not None:
            _wq_queue.specify_password_file(password_file)

        print("Listening for work queue workers on port {}...".format(_wq_queue.port))
        # perform a wait to print any warnings before progress bars
        _wq_queue.wait(0)

    _declare_resources(kwargs)

    # Working within a custom temporary directory:
    scratch = tempfile.TemporaryDirectory(
        prefix="wq-executor-tmp-", dir=kwargs["filepath"]
    )
    with scratch as tmpdir:
        fn_wrapper = _create_fn_wrapper(kwargs["x509_proxy"], tmpdir=tmpdir)
        infile_function = _function_to_file(
            function, prefix_name=kwargs["function_name"], tmpdir=tmpdir
        )
        infile_accum_fn = _function_to_file(
            accumulate_fn, prefix_name="accum", tmpdir=tmpdir
        )

        custom_init = kwargs["custom_init"]
        if custom_init:
            custom_init(_wq_queue)

        if kwargs["desc"] == "Preprocessing":
            return _work_queue_preprocessing(
                items, accumulator, fn_wrapper, infile_function, tmpdir, kwargs
            )

        return _work_queue_processing(
            items,
            accumulator,
            fn_wrapper,
            infile_function,
            infile_accum_fn,
            tmpdir,
            kwargs,
        )