def __init__(self, param_file, out_file, results_file, opt_value, tmpdir,
             name='shadho', port=9123, shutdown=True,
             logfile='shadho_wq.log', debugfile='shadho_wq.debug'):
    # Send all cctools debug output to the debug file, with no size limit.
    WORKQUEUE.cctools_debug_flags_set("all")
    WORKQUEUE.cctools_debug_config_file(debugfile)
    WORKQUEUE.cctools_debug_config_file_size(0)

    # Append the username to the master name so that concurrent users do not collide.
    if os.environ['USER'] not in name:
        name += '-{}'.format(os.environ['USER'])

    super(WQManager, self).__init__(name=name,
                                    port=int(port),
                                    shutdown=shutdown)

    self.specify_log(logfile)

    self.param_file = param_file
    self.out_file = out_file
    self.results_file = results_file
    self.opt_value = opt_value
    self.tmpdir = tmpdir

    self.tasks_submitted = self.stats.tasks_submitted
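# Illustration only, not part of the original module: constructing the manager
# defined above with hypothetical file names. WQManager is assumed to subclass
# work_queue.WorkQueue, so the usual submit()/wait() calls apply afterwards.
manager = WQManager(param_file='hyperparameters.json',
                    out_file='out.tar.gz',
                    results_file='performance.json',
                    opt_value='loss',
                    tmpdir='/tmp/shadho',
                    port=9123)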
def main():
    args = parse_args()

    # Create a temporary file store for task results.
    tmpdir = tempfile.mkdtemp()

    # Load the task input data here
    task_inputs = load_inputs()

    print('Creating tasks')
    tasks = generate_tasks(args.command, task_inputs, args.infiles,
                           args.outfile, tmpdir, args.max_retries)

    # Create the Work Queue master that manages task distribution.
    work_queue.cctools_debug_flags_set("all")
    work_queue.cctools_debug_config_file(f'{args.name}.debug')
    work_queue.cctools_debug_config_file_size(0)
    wq = WorkQueue(port=args.port, name=args.name, shutdown=True)
    wq.specify_log(f'{args.name}.log')

    # Submit all tasks to the queue.
    print('Submitting tasks')
    for t in tasks.values():
        wq.submit(t)

    # The main loop waits for a task to finish, then handles success or
    # failure accordingly.
    print('Entering main loop')
    while not all([done_check(t) for t in tasks.values()]):
        t = wq.wait(10)  # This blocks for 10s or until a task is done.
        if t is not None:
            tasks[t.tag] = t  # Update the task map with the correct status.
            # On success, post-process the task. If the maximum number of
            # submissions for a task has been reached, make a note. Otherwise,
            # report the failure and resubmit.
            if t.return_status == 0 and t.result == WORK_QUEUE_RESULT_SUCCESS:
                print(f'Task {t.tag} completed successfully.')
                input_idx = int(t.tag.split('_')[1])
                handle_success(t, tmpdir, args.outfile)
            elif t.result == WORK_QUEUE_RESULT_MAX_RETRIES:
                print(f'Task {t.tag} resubmitted too many times.')
            else:
                wq.submit(t)
                print(f'Task {t.tag} failed with result {t.result}')
                print(t.output)

    print('All tasks completed or hit max retries.')
    print('Cleaning up...')
    shutil.rmtree(tmpdir)
    print('Done')
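# A minimal sketch of the done_check() helper assumed by the loop above (the
# real implementation is not shown in this snippet): a task counts as done
# once it has either succeeded or been resubmitted too many times. This
# assumes tasks that have not yet come back from wq.wait() do not report a
# zero return status together with WORK_QUEUE_RESULT_SUCCESS.
def done_check(t):
    succeeded = t.return_status == 0 and t.result == WORK_QUEUE_RESULT_SUCCESS
    exhausted = t.result == WORK_QUEUE_RESULT_MAX_RETRIES
    return succeeded or exhausted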
def _mk_wq(self):
    global _AWE_WORK_QUEUE
    if _AWE_WORK_QUEUE is not None:
        ### warn
        awe.log('WARNING: using previously created WorkQueue instance')
    else:
        if self.debug:
            WQ.set_debug_flag(self.debug)
            if self.wq_logfile:
                awe.util.makedirs_parent(self.wq_logfile)
                WQ.cctools_debug_config_file(self.wq_logfile)
                WQ.cctools_debug_config_file_size(0)
        if self.name:
            self.catalog = True
        wq = WQ.WorkQueue(name      = self.name,
                          port      = self.port,
                          shutdown  = self.shutdown,
                          catalog   = self.catalog,
                          exclusive = self.exclusive)
        wq.specify_algorithm(self.schedule)
        if self.monitor:
            wq.enable_monitoring(self.summaryfile)
        if self.capacity:
            wq.estimate_capacity()

        awe.log('Running on port %d...' % wq.port)
        if wq.name:
            awe.log('Using project name %s' % wq.name)
        if self.debug and self.wq_logfile:
            awe.log('Logging WorkQueue to %s' % self.wq_logfile)

        typ = type(self.fastabort)
        if typ is float or typ is int:
            wq.activate_fast_abort(self.fastabort)

        _AWE_WORK_QUEUE = wq

    awe.util.makedirs_parent(self.wqstats_logfile)
    _AWE_WORK_QUEUE.specify_log(self.wqstats_logfile)
    return _AWE_WORK_QUEUE
def __init__(self, param_file, out_file, results_file, opt_value, tmpdir,
             name='shadho', port=9123, exclusive=True, shutdown=True,
             logfile='shadho_wq.log', debugfile='shadho_wq.debug'):
    work_queue.cctools_debug_flags_set("all")
    work_queue.cctools_debug_config_file(debugfile)
    work_queue.cctools_debug_config_file_size(0)

    super(WQManager, self).__init__(name=name,
                                    port=port,
                                    exclusive=exclusive,
                                    shutdown=shutdown,
                                    catalog=False)

    self.specify_log(logfile)

    self.param_file = param_file
    self.out_file = out_file
    self.results_file = results_file
    self.opt_value = opt_value
    self.tmpdir = tmpdir

    self.tasks_submitted = self.stats.tasks_submitted
def _mk_wq(self): """ Only one instance of WorkQueue should be run per process. This grants access to the WorkQueue singleton or else creates a new WorkQueue instance. This also ensures that the cctools WorkQueue object can handle more workers. Parameters: None Returns: The cctools WorkQueue singleton object """ global _AWE_WORK_QUEUE if _AWE_WORK_QUEUE is not None: ### warn awe.log('WARNING: using previously created WorkQueue instance') else: if self.debug: # Set up debugging parameters for the cctools WorkQueue object. # It has inbuilt debugging capabilities. WQ.set_debug_flag(self.debug) if self.wq_logfile: awe.util.makedirs_parent(self.wq_logfile) WQ.cctools_debug_config_file(self.wq_logfile) WQ.cctools_debug_config_file_size(0) if self.name: self.catalog = True # Create the cctools WorkQueue object wq = WQ.WorkQueue(name = self.name, port = self.port, shutdown = self.shutdown, catalog = self.catalog, exclusive = self.exclusive) # Specify the task scheduling algorithm wq.specify_algorithm(self.schedule) # Turn cctools WorkQueue object status monitoring on or off if self.monitor: wq.enable_monitoring(self.summaryfile) if self.capacity: # Determine the number of workers the WorkQueue object can handle wq.estimate_capacity() # Display information about this run of AWE-WQ awe.log('Running on port %d...' % wq.port) if wq.name: awe.log('Using project name %s' % wq.name) if self.debug and self.wq_logfile: awe.log('Logging WorkQueue to %s' % self.wq_logfile) # Set up fast abort procedures typ = type(self.fastabort) if typ is float or typ is int: wq.activate_fast_abort(self.fastabort) # Ensure that the singleton is set to the new instance _AWE_WORK_QUEUE = wq # Ensure that the singleton is logging to the correct files awe.util.makedirs_parent(self.wqstats_logfile) _AWE_WORK_QUEUE.specify_log(self.wqstats_logfile) # Return a reference to teh singleton return _AWE_WORK_QUEUE
def WorkQueueSubmitThread(task_queue=multiprocessing.Queue(),
                          queue_lock=threading.Lock(),
                          launch_cmd=None,
                          env=None,
                          collector_queue=multiprocessing.Queue(),
                          see_worker_output=False,
                          data_dir=".",
                          full=False,
                          cancel_value=multiprocessing.Value('i', 1),
                          port=WORK_QUEUE_DEFAULT_PORT,
                          wq_log_dir=None,
                          project_password=None,
                          project_password_file=None,
                          project_name=None):
    """Thread to handle Parsl app submissions to the Work Queue objects.

    Takes in Parsl functions submitted using submit(), and creates a Work
    Queue task with the appropriate specifications, which is then submitted
    to Work Queue. After tasks are completed, processes the exit status and
    exit code of the task, and sends results to the Work Queue collector
    thread.
    """
    logger.debug("Starting WorkQueue Submit/Wait Process")

    # Enable debugging flags and create logging file
    if wq_log_dir is not None:
        logger.debug("Setting debugging flags and creating logging file")
        wq_debug_log = os.path.join(wq_log_dir, "debug_log")
        cctools_debug_flags_set("all")
        cctools_debug_config_file(wq_debug_log)

    # Create WorkQueue queue object
    logger.debug("Creating WorkQueue Object")
    try:
        logger.debug("Listening on port {}".format(port))
        q = WorkQueue(port)
    except Exception as e:
        logger.error("Unable to create WorkQueue object: {}".format(e))
        raise e

    # Specify WorkQueue queue attributes
    if project_name:
        q.specify_name(project_name)
    if project_password:
        q.specify_password(project_password)
    elif project_password_file:
        q.specify_password_file(project_password_file)

    # Only write logs when the wq_log_dir is specified, which it most likely will be
    if wq_log_dir is not None:
        wq_master_log = os.path.join(wq_log_dir, "master_log")
        wq_trans_log = os.path.join(wq_log_dir, "transaction_log")
        if full:
            wq_resource_log = os.path.join(wq_log_dir, "resource_logs")
            q.enable_monitoring_full(dirname=wq_resource_log)
        q.specify_log(wq_master_log)
        q.specify_transactions_log(wq_trans_log)

    wq_tasks = set()
    orig_ppid = os.getppid()
    continue_running = True
    while (continue_running):
        # Monitor the task queue
        ppid = os.getppid()
        if ppid != orig_ppid:
            logger.debug("new Process")
            continue_running = False
            continue

        # Submit tasks
        while task_queue.qsize() > 0:
            if cancel_value.value == 0:
                logger.debug("cancel value set to cancel")
                continue_running = False
                break

            # Obtain task from task_queue
            try:
                item = task_queue.get(timeout=1)
                logger.debug("Removing task from queue")
            except queue.Empty:
                continue
            parsl_id = item["task_id"]

            # Extract information about the task
            function_data_loc = item["data_loc"]
            function_data_loc_remote = function_data_loc.split("/")[-1]
            function_result_loc = item["result_loc"]
            function_result_loc_remote = function_result_loc.split("/")[-1]
            input_files = item["input_files"]
            output_files = item["output_files"]
            std_files = item["std_files"]

            full_script_name = workqueue_worker.__file__
            script_name = full_script_name.split("/")[-1]

            remapping_string = ""
            std_string = ""

            # Parse input file information
            logger.debug("Looking at input")
            for item in input_files:
                if item[3] == "std":
                    std_string += "mv " + item[1] + " " + item[0] + "; "
                else:
                    remapping_string += item[0] + ":" + item[1] + ","
            logger.debug(remapping_string)

            # Parse output file information
            logger.debug("Looking at output")
            for item in output_files:
                remapping_string += item[0] + ":" + item[1] + ","
            logger.debug(remapping_string)

            if len(input_files) + len(output_files) > 0:
                remapping_string = "-r " + remapping_string
                remapping_string = remapping_string[:-1]

            # Create command string
            logger.debug(launch_cmd)
            command_str = launch_cmd.format(input_file=function_data_loc_remote,
                                            output_file=function_result_loc_remote,
                                            remapping_string=remapping_string)
            command_str = std_string + command_str
            logger.debug(command_str)

            # Create WorkQueue task for the command
            logger.debug("Sending task {} with command: {}".format(parsl_id, command_str))
            try:
                t = Task(command_str)
            except Exception as e:
                logger.error("Unable to create task: {}".format(e))
                continue

            # Specify environment variables for the task
            if env is not None:
                for var in env:
                    t.specify_environment_variable(var, env[var])

            # Specify script, and data/result files for task
            t.specify_file(full_script_name, script_name, WORK_QUEUE_INPUT, cache=True)
            t.specify_file(function_data_loc, function_data_loc_remote, WORK_QUEUE_INPUT, cache=False)
            t.specify_file(function_result_loc, function_result_loc_remote, WORK_QUEUE_OUTPUT, cache=False)
            t.specify_tag(str(parsl_id))
            logger.debug("Parsl ID: {}".format(t.id))

            # Specify all input/output files for task
            for item in input_files:
                t.specify_file(item[0], item[1], WORK_QUEUE_INPUT, cache=item[2])
            for item in output_files:
                t.specify_file(item[0], item[1], WORK_QUEUE_OUTPUT, cache=item[2])
            for item in std_files:
                t.specify_file(item[0], item[1], WORK_QUEUE_OUTPUT, cache=item[2])

            # Submit the task to the WorkQueue object
            logger.debug("Submitting task {} to WorkQueue".format(parsl_id))
            try:
                wq_id = q.submit(t)
                wq_tasks.add(wq_id)
            except Exception as e:
                logger.error("Unable to submit task: {}".format(e))
                msg = {"tid": parsl_id,
                       "result_received": False,
                       "reason": "Workqueue Task Start Failure",
                       "status": 1}
                collector_queue.put_nowait(msg)
                continue
            logger.debug("Task {} submitted to WorkQueue with id {}".format(parsl_id, wq_id))

        if cancel_value.value == 0:
            continue_running = False

        # If the queue is not empty wait on the WorkQueue queue for a task
        task_found = True
        if not q.empty() and continue_running:
            while task_found is True:
                if cancel_value.value == 0:
                    continue_running = False
                    task_found = False
                    continue

                # Obtain the task from the queue
                t = q.wait(1)
                if t is None:
                    task_found = False
                    continue
                else:
                    parsl_tid = t.tag
                    logger.debug("Completed WorkQueue task {}, parsl task {}".format(t.id, parsl_tid))
                    status = t.return_status
                    task_result = t.result
                    msg = None

                    # Task failure
                    if status != 0 or (task_result != WORK_QUEUE_RESULT_SUCCESS
                                       and task_result != WORK_QUEUE_RESULT_OUTPUT_MISSING):
                        logger.debug("Wrapper Script status: {}\nWorkQueue Status: {}".format(status, task_result))
                        # Wrapper script failure
                        if status != 0:
                            logger.debug("WorkQueue task {} failed with status {}".format(t.id, status))
                            reason = "Wrapper Script Failure: "
                            if status == 1:
                                reason += "problem parsing command line options"
                            elif status == 2:
                                reason += "problem loading function data"
                            elif status == 3:
                                reason += "problem remapping file names"
                            elif status == 4:
                                reason += "problem writing out function result"
                            else:
                                reason += "unable to process wrapper script failure with status = {}".format(status)
                            reason += "\nTrace:\n" + str(t.output)
                            logger.debug("WorkQueue runner script failed for task {} because {}\n".format(parsl_tid, reason))
                        # WorkQueue system failure
                        else:
                            reason = "WorkQueue System Failure: "
                            if task_result == 1:
                                reason += "missing input file"
                            elif task_result == 2:
                                reason += "unable to generate output file"
                            elif task_result == 4:
                                reason += "stdout has been truncated"
                            elif task_result == 1 << 3:
                                reason += "task terminated with a signal"
                            elif task_result == 2 << 3:
                                reason += "task used more resources than requested"
                            elif task_result == 3 << 3:
                                reason += "task ran past the specified end time"
                            elif task_result == 4 << 3:
                                reason += "result could not be classified"
                            elif task_result == 5 << 3:
                                reason += "task failed, but not a task error"
                            elif task_result == 6 << 3:
                                reason += "unable to complete after specified number of retries"
                            elif task_result == 7 << 3:
                                reason += "task ran for more than the specified time"
                            elif task_result == 8 << 3:
                                reason += "task needed more space to complete task"
                            else:
                                reason += "unable to process Work Queue system failure"

                        msg = {"tid": parsl_tid,
                               "result_received": False,
                               "reason": reason,
                               "status": status}
                        collector_queue.put_nowait(msg)

                    # Task Success
                    else:
                        # Print the output from the task
                        if see_worker_output:
                            print(t.output)

                        # Load result into result file
                        result_loc = os.path.join(data_dir, "task_" + str(parsl_tid) + "_function_result")
                        logger.debug("Looking for result in {}".format(result_loc))
                        f = open(result_loc, "rb")
                        result = pickle.load(f)
                        f.close()

                        msg = {"tid": parsl_tid,
                               "result_received": True,
                               "result": result}
                        wq_tasks.remove(t.id)
                        collector_queue.put_nowait(msg)

        if continue_running is False:
            logger.debug("Exiting WorkQueue Master Thread event loop")
            break

    # Remove all WorkQueue tasks that remain in the queue object
    for wq_task in wq_tasks:
        logger.debug("Cancelling WorkQueue Task {}".format(wq_task))
        q.cancel_by_taskid(wq_task)

    logger.debug("Exiting WorkQueue Monitoring Process")
    return 0
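# Companion sketch, not part of the Parsl source above: the same numeric
# work_queue result codes and reason strings used in the failure branch,
# collected into a lookup table that is handy when reading logs by hand.
# The helper name decode_wq_result is introduced here only for illustration.
WQ_SYSTEM_FAILURE_REASONS = {
    1: "missing input file",
    2: "unable to generate output file",
    4: "stdout has been truncated",
    1 << 3: "task terminated with a signal",
    2 << 3: "task used more resources than requested",
    3 << 3: "task ran past the specified end time",
    4 << 3: "result could not be classified",
    5 << 3: "task failed, but not a task error",
    6 << 3: "unable to complete after specified number of retries",
    7 << 3: "task ran for more than the specified time",
    8 << 3: "task needed more space to complete task",
}


def decode_wq_result(task_result):
    return WQ_SYSTEM_FAILURE_REASONS.get(
        task_result, "unable to process Work Queue system failure")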
def WorkQueueSubmitThread(task_queue=multiprocessing.Queue(),
                          queue_lock=threading.Lock(),
                          launch_cmd=None,
                          env=None,
                          collector_queue=multiprocessing.Queue(),
                          see_worker_output=False,
                          data_dir=".",
                          full=False,
                          cancel_value=multiprocessing.Value('i', 1),
                          port=WORK_QUEUE_DEFAULT_PORT,
                          wq_log_dir=None,
                          project_password=None,
                          project_password_file=None,
                          project_name=None):
    logger.debug("Starting WorkQueue Submit/Wait Process")

    orig_ppid = os.getppid()
    wq_tasks = set()
    continue_running = True

    if wq_log_dir is not None:
        wq_debug_log = os.path.join(wq_log_dir, "debug")
        cctools_debug_flags_set("all")
        cctools_debug_config_file(wq_debug_log)

    logger.debug("Creating Workqueue Object")
    try:
        q = WorkQueue(port)
    except Exception as e:
        logger.error("Unable to create Workqueue object: {}".format(e))
        raise e

    if project_name:
        q.specify_name(project_name)

    if project_password:
        q.specify_password(project_password)
    elif project_password_file:
        q.specify_password_file(project_password_file)

    # Only write logs when the log_dir is specified, which it most likely always will be
    if wq_log_dir is not None:
        wq_master_log = os.path.join(wq_log_dir, "master_log")
        wq_trans_log = os.path.join(wq_log_dir, "transaction_log")
        if full:
            wq_resource_log = os.path.join(wq_log_dir, "resource_logs")
            q.enable_monitoring_full(dirname=wq_resource_log)
        q.specify_log(wq_master_log)
        q.specify_transactions_log(wq_trans_log)

    while (continue_running):
        # Monitor the Task Queue
        ppid = os.getppid()
        if ppid != orig_ppid:
            continue_running = False
            continue

        # Submit Tasks
        while task_queue.qsize() > 0:
            if cancel_value.value == 0:
                continue_running = False
                break

            try:
                # item = task_queue.get_nowait()
                item = task_queue.get(timeout=1)
                logger.debug("Removing task from queue")
            except queue.Empty:
                continue

            parsl_id = item["task_id"]
            function_data_loc = item["data_loc"]
            function_result_loc = item["result_loc"]
            function_result_loc_remote = function_result_loc.split("/")[-1]
            function_data_loc_remote = function_data_loc.split("/")[-1]
            input_files = item["input_files"]
            output_files = item["output_files"]
            std_files = item["std_files"]

            full_script_name = workqueue_worker.__file__
            script_name = full_script_name.split("/")[-1]

            remapping_string = ""
            std_string = ""
            logger.debug("looking at input")
            for item in input_files:
                if item[3] == "std":
                    std_string += "mv " + item[1] + " " + item[0] + "; "
                else:
                    remapping_string += item[0] + ":" + item[1] + ","
            logger.debug(remapping_string)

            logger.debug("looking at output")
            for item in output_files:
                remapping_string += item[0] + ":" + item[1] + ","
            logger.debug(remapping_string)

            if len(input_files) + len(output_files) > 0:
                remapping_string = "-r " + remapping_string
                remapping_string = remapping_string[:-1]

            logger.debug(launch_cmd)
            command_str = launch_cmd.format(input_file=function_data_loc_remote,
                                            output_file=function_result_loc_remote,
                                            remapping_string=remapping_string)
            logger.debug(command_str)
            command_str = std_string + command_str
            logger.debug(command_str)

            logger.debug("Sending task {} with command: {}".format(parsl_id, command_str))
            try:
                t = Task(command_str)
            except Exception as e:
                logger.error("Unable to create task: {}".format(e))
                continue

            if env is not None:
                for var in env:
                    t.specify_environment_variable(var, env[var])

            t.specify_file(full_script_name, script_name, WORK_QUEUE_INPUT, cache=True)
            t.specify_file(function_result_loc, function_result_loc_remote, WORK_QUEUE_OUTPUT, cache=False)
            t.specify_file(function_data_loc, function_data_loc_remote, WORK_QUEUE_INPUT, cache=False)
            t.specify_tag(str(parsl_id))

            for item in input_files:
                t.specify_file(item[0], item[1], WORK_QUEUE_INPUT, cache=item[2])
            for item in output_files:
                t.specify_file(item[0], item[1], WORK_QUEUE_OUTPUT, cache=item[2])
            for item in std_files:
                t.specify_file(item[0], item[1], WORK_QUEUE_OUTPUT, cache=item[2])

            logger.debug("Submitting task {} to workqueue".format(parsl_id))
            try:
                wq_id = q.submit(t)
                wq_tasks.add(wq_id)
            except Exception as e:
                logger.error("Unable to submit task: {}".format(e))
                msg = {"tid": parsl_id,
                       "result_received": False,
                       "reason": "Workqueue Task Start Failure",
                       "status": 1}
                collector_queue.put_nowait(msg)
                continue

            logger.debug("Task {} submitted to workqueue with id {}".format(parsl_id, wq_id))

        if cancel_value.value == 0:
            continue_running = False

        # Wait for Tasks
        task_found = True
        # If the queue is not empty wait on the workqueue queue for a task
        if not q.empty() and continue_running:
            while task_found is True:
                if cancel_value.value == 0:
                    continue_running = False
                    task_found = False
                    continue
                t = q.wait(1)
                if t is None:
                    task_found = False
                    continue
                else:
                    parsl_tid = t.tag
                    logger.debug("Completed workqueue task {}, parsl task {}".format(t.id, parsl_tid))
                    status = t.return_status
                    task_result = t.result
                    msg = None
                    if status != 0 or (task_result != WORK_QUEUE_RESULT_SUCCESS
                                       and task_result != WORK_QUEUE_RESULT_OUTPUT_MISSING):
                        if task_result == WORK_QUEUE_RESULT_SUCCESS:
                            logger.debug("Workqueue task {} failed with status {}".format(t.id, status))
                            reason = "Wrapper Script Failure: "
                            if status == 1:
                                reason += "command line parsing"
                            if status == 2:
                                reason += "problem loading function data"
                            if status == 3:
                                reason += "problem remapping file names"
                            if status == 4:
                                reason += "problem writing out function result"
                            reason += "\nTrace:\n" + t.output
                            logger.debug("Workqueue runner script failed for task {} because {}\n".format(parsl_tid, reason))
                        else:
                            reason = "Workqueue system failure\n"

                        msg = {"tid": parsl_tid,
                               "result_received": False,
                               "reason": reason,
                               "status": status}
                        collector_queue.put_nowait(msg)
                    else:
                        if see_worker_output:
                            print(t.output)
                        result_loc = os.path.join(data_dir, "task_" + str(parsl_tid) + "_function_result")
                        logger.debug("Looking for result in {}".format(result_loc))
                        f = open(result_loc, "rb")
                        result = pickle.load(f)
                        f.close()

                        msg = {"tid": parsl_tid,
                               "result_received": True,
                               "result": result}
                        wq_tasks.remove(t.id)
                        collector_queue.put_nowait(msg)

        if continue_running is False:
            logger.debug("Exiting WorkQueue Master Thread event loop")
            break

    for wq_task in wq_tasks:
        logger.debug("Cancelling Workqueue Task {}".format(wq_task))
        q.cancel_by_taskid(wq_task)

    logger.debug("Exiting WorkQueue Monitoring Process")
    return 0
def sprint(self):
    with util.PartiallyMutable.unlock():
        self.source = TaskProvider(self.config)
    action = actions.Actions(self.config, self.source)

    logger.info("using wq from {0}".format(wq.__file__))
    logger.info("running Lobster version {0}".format(util.get_version()))
    logger.info("current PID is {0}".format(os.getpid()))

    wq.cctools_debug_flags_set("all")
    wq.cctools_debug_config_file(os.path.join(self.config.workdir, "work_queue_debug.log"))
    wq.cctools_debug_config_file_size(1 << 29)

    self.queue = wq.WorkQueue(self.config.advanced.wq_port)
    self.queue.specify_min_taskid(self.source.max_taskid() + 1)
    self.queue.specify_log(os.path.join(self.config.workdir, "work_queue.log"))
    self.queue.specify_transactions_log(os.path.join(self.config.workdir, "transactions.log"))
    self.queue.specify_name("lobster_" + self.config.label)
    self.queue.specify_keepalive_timeout(300)
    # self.queue.tune("short-timeout", 600)
    self.queue.tune("transfer-outlier-factor", 4)
    self.queue.specify_algorithm(wq.WORK_QUEUE_SCHEDULE_RAND)
    if self.config.advanced.full_monitoring:
        self.queue.enable_monitoring_full(None)
    else:
        self.queue.enable_monitoring(None)

    logger.info("starting queue as {0}".format(self.queue.name))

    abort_active = False
    abort_threshold = self.config.advanced.abort_threshold
    abort_multiplier = self.config.advanced.abort_multiplier

    wq_max_retries = self.config.advanced.wq_max_retries

    if util.checkpoint(self.config.workdir, 'KILLED') == 'PENDING':
        util.register_checkpoint(self.config.workdir, 'KILLED', 'RESTART')

    # time in seconds to wait for WQ to return tasks, with minimum wait
    # time in case no more tasks are waiting
    interval = 120
    interval_minimum = 30

    tasks_left = 0
    units_left = 0
    successful_tasks = 0

    categories = []

    self.setup_logging('all')
    # Workflows can be assigned categories, with each category having
    # different cpu/memory/walltime requirements that WQ will automatically
    # fine-tune
    for category in self.config.categories:
        constraints = category.wq()
        if category.name != 'merge':
            categories.append(category.name)
            self.setup_logging(category.name)
        self.queue.specify_category_mode(category.name, category.mode)
        if category.mode == wq.WORK_QUEUE_ALLOCATION_MODE_FIXED:
            self.queue.specify_category_max_resources(category.name, constraints)
        else:
            self.queue.specify_category_first_allocation_guess(category.name, constraints)
        logger.debug('Category {0}: {1}'.format(category.name, constraints))
        if 'wall_time' not in constraints:
            self.queue.activate_fast_abort_category(category.name, abort_multiplier)

    proxy_email_sent = False
    while not self.source.done():
        with self.measure('status'):
            tasks_left = self.source.tasks_left()
            units_left = self.source.work_left()

            logger.debug("expecting {0} tasks, still".format(tasks_left))
            self.queue.specify_num_tasks_left(tasks_left)

            for c in categories + ['all']:
                self.log(c, units_left)

            if util.checkpoint(self.config.workdir, 'KILLED') == 'PENDING':
                util.register_checkpoint(self.config.workdir, 'KILLED', str(datetime.datetime.utcnow()))

                # let the task source shut down gracefully
                logger.info("terminating task source")
                self.source.terminate()
                logger.info("terminating gracefully")
                break

        with self.measure('create'):
            have = {}
            for c in categories:
                cstats = self.queue.stats_category(c)
                have[c] = {'running': cstats.tasks_running, 'queued': cstats.tasks_waiting}

            stats = self.queue.stats_hierarchy
            tasks = self.source.obtain(stats.total_cores, have)

            expiry = None
            if self.config.advanced.proxy:
                expiry = self.config.advanced.proxy.expires()
                proxy_time_left = self.config.advanced.proxy.time_left()
                if proxy_time_left >= 24 * 3600:
                    proxy_email_sent = False
                if proxy_time_left < 24 * 3600 and not proxy_email_sent:
                    util.sendemail("Your proxy is about to expire.\n" +
                                   "Timeleft: " +
                                   str(datetime.timedelta(seconds=proxy_time_left)),
                                   self.config)
                    proxy_email_sent = True

            for category, cmd, id, inputs, outputs, env, dir in tasks:
                task = wq.Task(cmd)
                task.specify_category(category)
                task.specify_tag(id)
                task.specify_max_retries(wq_max_retries)
                task.specify_monitor_output(os.path.join(dir, 'resource_monitor'))

                for k, v in env.items():
                    task.specify_environment_variable(k, v)

                for (local, remote, cache) in inputs:
                    cache_opt = wq.WORK_QUEUE_CACHE if cache else wq.WORK_QUEUE_NOCACHE
                    if os.path.isfile(local) or os.path.isdir(local):
                        task.specify_input_file(str(local), str(remote), cache_opt)
                    else:
                        logger.critical("cannot send file to worker: {0}".format(local))
                        raise NotImplementedError

                for (local, remote) in outputs:
                    task.specify_output_file(str(local), str(remote))

                if expiry:
                    task.specify_end_time(expiry * 10 ** 6)
                self.queue.submit(task)

        with self.measure('status'):
            stats = self.queue.stats_hierarchy
            logger.info("{0} out of {1} workers busy; {2} tasks running, {3} waiting; {4} units left".format(
                stats.workers_busy,
                stats.workers_busy + stats.workers_ready,
                stats.tasks_running,
                stats.tasks_waiting,
                units_left))

        with self.measure('update'):
            self.source.update(self.queue)

        # recurring actions are triggered here; plotting etc should run
        # while we have WQ hand us back tasks w/o any database
        # interaction
        with self.measure('action'):
            if action:
                action.take()

        with self.measure('fetch'):
            starttime = time.time()
            task = self.queue.wait(interval)
            tasks = []
            while task:
                if task.return_status == 0:
                    successful_tasks += 1
                elif task.return_status in self.config.advanced.bad_exit_codes:
                    logger.warning("blacklisting host {0} due to bad exit code from task {1}".format(task.hostname, task.tag))
                    self.queue.blacklist(task.hostname)
                tasks.append(task)

                remaining = int(starttime + interval - time.time())
                if (interval - remaining < interval_minimum or self.queue.stats.tasks_waiting > 0) and remaining > 0:
                    task = self.queue.wait(remaining)
                else:
                    task = None

        # TODO do we really need this? We have everything based on
        # categories by now, so this should not be needed.
        if abort_threshold > 0 and successful_tasks >= abort_threshold and not abort_active:
            logger.info("activating fast abort with multiplier: {0}".format(abort_multiplier))
            abort_active = True
            self.queue.activate_fast_abort(abort_multiplier)

        if len(tasks) > 0:
            try:
                with self.measure('return'):
                    self.source.release(tasks)
            except Exception:
                tb = traceback.format_exc()
                logger.critical("cannot recover from the following exception:\n" + tb)
                util.sendemail("Your Lobster project has crashed from the following exception:\n" + tb,
                               self.config)
                for task in tasks:
                    logger.critical("tried to return task {0} from {1}".format(task.tag, task.hostname))
                raise

    if units_left == 0:
        logger.info("no more work left to do")
        util.sendemail("Your Lobster project is done!", self.config)
        if self.config.elk:
            self.config.elk.end()
        if action:
            action.take(True)
def run(args):
    dash_checker = cmssw.dash.JobStateChecker(300)

    with open(args.configfile) as configfile:
        config = yaml.load(configfile)

    workdir = config['workdir']
    if not os.path.exists(workdir):
        os.makedirs(workdir)
        util.register_checkpoint(workdir, "version", get_distribution('Lobster').version)
    else:
        util.verify(workdir)

    cmsjob = False
    if config.get('type', 'cmssw') == 'cmssw':
        cmsjob = True

        from ProdCommon.Credential.CredentialAPI import CredentialAPI
        cred = CredentialAPI({'credential': 'Proxy'})
        if cred.checkCredential(Time=60):
            if not 'X509_USER_PROXY' in os.environ:
                os.environ['X509_USER_PROXY'] = cred.credObj.getUserProxy()
        else:
            if config.get('advanced', {}).get('renew proxy', True):
                try:
                    cred.ManualRenewCredential()
                except Exception as e:
                    print("could not renew proxy")
                    sys.exit(1)
            else:
                print("please renew your proxy")
                sys.exit(1)

    print "Saving log to {0}".format(os.path.join(workdir, 'lobster.log'))

    if not args.foreground:
        ttyfile = open(os.path.join(workdir, 'lobster.err'), 'a')
        print "Saving stderr and stdout to {0}".format(os.path.join(workdir, 'lobster.err'))

    signals = daemon.daemon.make_default_signal_map()
    signals[signal.SIGTERM] = lambda num, frame: kill(args)

    with daemon.DaemonContext(
            detach_process=not args.foreground,
            stdout=sys.stdout if args.foreground else ttyfile,
            stderr=sys.stderr if args.foreground else ttyfile,
            working_directory=workdir,
            pidfile=util.get_lock(workdir),
            signal_map=signals):
        fileh = logging.handlers.RotatingFileHandler(os.path.join(workdir, 'lobster.log'),
                                                     maxBytes=500e6, backupCount=10)
        fileh.setFormatter(ShortPathFormatter("%(asctime)s [%(levelname)5s] - %(pathname)-40s %(lineno)4d: %(message)s"))
        fileh.setLevel(config.get('advanced', {}).get('log level', 2) * 10)
        logger.addHandler(fileh)
        logger.setLevel(config.get('advanced', {}).get('log level', 2) * 10)

        if args.foreground:
            console = logging.StreamHandler()
            console.setLevel(config.get('advanced', {}).get('log level', 2) * 10)
            console.setFormatter(ShortPathFormatter("%(asctime)s [%(levelname)5s] - %(pathname)-40s %(lineno)4d: %(message)s"))
            logger.addHandler(console)

        config['configdir'] = args.configdir
        config['filename'] = args.configfile
        config['startdir'] = args.startdir

        if cmsjob:
            job_src = cmssw.JobProvider(config)
            actions = cmssw.Actions(config)
        else:
            job_src = job.SimpleJobProvider(config)
            actions = None

        logger.info("using wq from {0}".format(wq.__file__))

        wq.cctools_debug_flags_set("all")
        wq.cctools_debug_config_file(os.path.join(workdir, "work_queue_debug.log"))
        wq.cctools_debug_config_file_size(1 << 29)

        queue = wq.WorkQueue(-1)
        queue.specify_log(os.path.join(workdir, "work_queue.log"))
        queue.specify_name("lobster_" + config["id"])
        queue.specify_keepalive_timeout(300)
        # queue.tune("short-timeout", 600)
        queue.tune("transfer-outlier-factor", 4)
        queue.specify_algorithm(wq.WORK_QUEUE_SCHEDULE_RAND)

        logger.info("starting queue as {0}".format(queue.name))
        logger.info("submit workers with: condor_submit_workers -M {0} <num>".format(queue.name))

        payload = config.get('advanced', {}).get('payload', 400)
        abort_active = False
        abort_threshold = config.get('advanced', {}).get('abort threshold', 400)
        abort_multiplier = config.get('advanced', {}).get('abort multiplier', 4)

        if util.checkpoint(workdir, 'KILLED') == 'PENDING':
            util.register_checkpoint(workdir, 'KILLED', 'RESTART')

        jobits_left = 0
        successful_jobs = 0

        creation_time = 0
        destruction_time = 0

        with open(os.path.join(workdir, "lobster_stats.log"), "a") as statsfile:
            statsfile.write(
                "#timestamp " +
                "total_workers_connected total_workers_joined total_workers_removed " +
                "workers_busy workers_idle " +
                "tasks_running " +
                "total_send_time total_receive_time " +
                "total_create_time total_return_time " +
                "idle_percentage " +
                "capacity " +
                "efficiency " +
                "total_memory " +
                "total_cores " +
                "jobits_left\n")

        while not job_src.done():
            jobits_left = job_src.work_left()
            stats = queue.stats

            with open(os.path.join(workdir, "lobster_stats.log"), "a") as statsfile:
                now = datetime.datetime.now()
                statsfile.write(" ".join(map(str,
                    [
                        int(int(now.strftime('%s')) * 1e6 + now.microsecond),
                        stats.total_workers_connected,
                        stats.total_workers_joined,
                        stats.total_workers_removed,
                        stats.workers_busy,
                        stats.workers_idle,
                        stats.tasks_running,
                        stats.total_send_time,
                        stats.total_receive_time,
                        creation_time,
                        destruction_time,
                        stats.idle_percentage,
                        stats.capacity,
                        stats.efficiency,
                        stats.total_memory,
                        stats.total_cores,
                        jobits_left
                    ])) + "\n"
                )

            if util.checkpoint(workdir, 'KILLED') == 'PENDING':
                util.register_checkpoint(workdir, 'KILLED', str(datetime.datetime.utcnow()))

                # just in case, check for any remaining not done task that
                # hasn't been reported as aborted
                for task_id in queue._task_table.keys():
                    status = cmssw.dash.status_map[queue.task_state(task_id)]
                    if status not in (cmssw.dash.DONE, cmssw.dash.ABORTED):
                        job_src._JobProvider__dash.update_job(task_id, cmssw.dash.ABORTED)

                logger.info("terminating gracefully")
                break

            logger.info("{0} out of {1} workers busy; {3} jobs running, {4} waiting; {2} jobits left".format(
                stats.workers_busy,
                stats.workers_busy + stats.workers_ready,
                jobits_left,
                stats.tasks_running,
                stats.tasks_waiting))

            hunger = max(payload - stats.tasks_waiting, 0)

            t = time.time()
            while hunger > 0:
                jobs = job_src.obtain(50)

                if jobs == None or len(jobs) == 0:
                    break

                hunger -= len(jobs)
                cores = config.get('cores per job', 1)
                for id, cmd, inputs, outputs in jobs:
                    task = wq.Task(cmd)
                    task.specify_tag(id)
                    task.specify_cores(cores)
                    # temporary work-around?
                    # task.specify_memory(1000)
                    # task.specify_disk(4000)

                    for (local, remote, cache) in inputs:
                        if os.path.isfile(local):
                            cache_opt = wq.WORK_QUEUE_CACHE if cache else wq.WORK_QUEUE_NOCACHE
                            task.specify_input_file(str(local), str(remote), cache_opt)
                        elif os.path.isdir(local):
                            task.specify_directory(local, remote, wq.WORK_QUEUE_INPUT,
                                                   wq.WORK_QUEUE_CACHE, recursive=True)
                        else:
                            logger.critical("cannot send file to worker: {0}".format(local))
                            raise NotImplementedError

                    for (local, remote) in outputs:
                        task.specify_output_file(str(local), str(remote))

                    queue.submit(task)
            creation_time += int((time.time() - t) * 1e6)

            # update dashboard status for all not done tasks
            # report Done status only once when releasing the task
            # WAITING_RETRIEVAL is not a valid status in dashboard
            # so, skipping it for now
            monitor = job_src._JobProvider__dash
            queue = queue
            exclude_states = (cmssw.dash.DONE, cmssw.dash.WAITING_RETRIEVAL)
            try:
                dash_checker.update_dashboard_states(monitor, queue, exclude_states)
            except Exception as e:
                logger.warning("Could not update job states to dashboard")

            task = queue.wait(300)
            tasks = []
            while task:
                if task.return_status == 0:
                    successful_jobs += 1
                tasks.append(task)
                if queue.stats.tasks_complete > 0:
                    task = queue.wait(1)
                else:
                    task = None

            if len(tasks) > 0:
                try:
                    t = time.time()
                    job_src.release(tasks)
                    destruction_time += int((time.time() - t) * 1e6)
                except:
                    tb = traceback.format_exc()
                    logger.critical("cannot recover from the following exception:\n" + tb)
                    for task in tasks:
                        logger.critical("tried to return task {0} from {1}".format(task.tag, task.hostname))
                    raise

            if successful_jobs >= abort_threshold and not abort_active:
                logger.info("activating fast abort with multiplier: {0}".format(abort_multiplier))
                abort_active = True
                queue.activate_fast_abort(abort_multiplier)

            # recurring actions are triggered here
            if actions:
                actions.take()

        if jobits_left == 0:
            logger.info("no more work left to do")
def run(args):
    with open(args.configfile) as configfile:
        config = yaml.load(configfile)

    workdir = config['workdir']
    if not os.path.exists(workdir):
        os.makedirs(workdir)

    cmsjob = False
    if config.get('type', 'cmssw') == 'cmssw':
        cmsjob = True

        from ProdCommon.Credential.CredentialAPI import CredentialAPI
        cred = CredentialAPI({'credential': 'Proxy'})
        if cred.checkCredential(Time=60):
            if not 'X509_USER_PROXY' in os.environ:
                os.environ['X509_USER_PROXY'] = cred.credObj.getUserProxy()
        else:
            if config.get('check proxy', True):
                try:
                    cred.ManualRenewCredential()
                except Exception as e:
                    logging.critical("could not renew proxy")
                    sys.exit(1)
            else:
                logging.critical("please renew your proxy")
                sys.exit(1)

    mode_label = 'merge_' if args.merge else ''
    print "Saving log to {0}".format(os.path.join(workdir, mode_label+'lobster.log'))

    if not args.foreground:
        ttyfile = open(os.path.join(workdir, mode_label+'lobster.err'), 'a')
        print "Saving stderr and stdout to {0}".format(os.path.join(workdir, mode_label+'lobster.err'))

    with daemon.DaemonContext(
            detach_process=not args.foreground,
            stdout=sys.stdout if args.foreground else ttyfile,
            stderr=sys.stderr if args.foreground else ttyfile,
            working_directory=workdir,
            pidfile=get_lock(workdir)):
        logging.basicConfig(
            datefmt="%Y-%m-%d %H:%M:%S",
            format="%(asctime)s [%(levelname)s] - %(filename)s %(lineno)d: %(message)s",
            level=config.get('log level', 2) * 10,
            filename=os.path.join(workdir, mode_label+'lobster.log'))

        if args.foreground:
            console = logging.StreamHandler()
            console.setLevel(config.get('log level', 2) * 10)
            console.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] - %(filename)s %(lineno)d: %(message)s"))
            logging.getLogger('').addHandler(console)

        config['configdir'] = args.configdir
        config['filepath'] = args.configfile
        config['startdir'] = args.startdir

        if args.merge:
            if args.server:
                config['stageout server'] = args.server
            config['max megabytes'] = args.max_megabytes
            job_src = cmssw.MergeProvider(config)
        elif cmsjob:
            job_src = cmssw.JobProvider(config)
        else:
            job_src = job.SimpleJobProvider(config)

        wq.cctools_debug_flags_set("all")
        wq.cctools_debug_config_file(os.path.join(workdir, mode_label+"work_queue_debug.log"))
        wq.cctools_debug_config_file_size(1 << 29)

        queue = wq.WorkQueue(-1)
        queue.specify_log(os.path.join(workdir, mode_label+"work_queue.log"))
        queue.specify_name("lobster_" + mode_label + config["id"])
        queue.specify_keepalive_timeout(300)
        # queue.tune("short-timeout", 600)
        queue.tune("transfer-outlier-factor", 4)

        logging.info("starting queue as {0}".format(queue.name))
        logging.info("submit workers with: condor_submit_workers -M {0} <num>".format(queue.name))

        payload = config.get('tune', {}).get('payload', 400)
        abort_active = False
        abort_threshold = config.get('tune', {}).get('abort threshold', 400)
        abort_multiplier = config.get('tune', {}).get('abort multiplier', 4)

        if util.checkpoint(workdir, 'KILLED') == 'PENDING':
            util.register_checkpoint(workdir, 'KILLED', 'RESTART')

        successful_jobs = 0

        creation_time = 0
        destruction_time = 0

        with open(os.path.join(workdir, mode_label+"lobster_stats.log"), "a") as statsfile:
            statsfile.write(
                "#timestamp " +
                "total_workers_connected total_workers_joined total_workers_removed " +
                "workers_busy workers_idle " +
                "tasks_running " +
                "total_send_time total_receive_time " +
                "total_create_time total_return_time " +
                "idle_percentage " +
                "capacity " +
                "efficiency " +
                "jobits_left\n")

        while not job_src.done():
            jobits_left = job_src.work_left()
            stats = queue.stats

            with open(os.path.join(workdir, mode_label+"lobster_stats.log"), "a") as statsfile:
                now = datetime.datetime.now()
                statsfile.write(" ".join(map(str,
                    [
                        int(int(now.strftime('%s')) * 1e6 + now.microsecond),
                        stats.total_workers_connected,
                        stats.total_workers_joined,
                        stats.total_workers_removed,
                        stats.workers_busy,
                        stats.workers_idle,
                        stats.tasks_running,
                        stats.total_send_time,
                        stats.total_receive_time,
                        creation_time,
                        destruction_time,
                        stats.idle_percentage,
                        stats.capacity,
                        stats.efficiency,
                        jobits_left
                    ])) + "\n"
                )

            if util.checkpoint(workdir, 'KILLED') == 'PENDING':
                util.register_checkpoint(workdir, 'KILLED', str(datetime.datetime.utcnow()))
                logging.info("terminating gracefully")
                break

            logging.info("{0} out of {1} workers busy; {3} jobs running, {4} waiting; {2} jobits left".format(
                stats.workers_busy,
                stats.workers_busy + stats.workers_ready,
                jobits_left,
                stats.tasks_running,
                stats.tasks_waiting))

            hunger = max(payload - stats.tasks_waiting, 0)

            t = time.time()
            while hunger > 0:
                jobs = job_src.obtain(50)

                if jobs == None or len(jobs) == 0:
                    break

                hunger -= len(jobs)

                for id, cmd, inputs, outputs in jobs:
                    task = wq.Task(cmd)
                    task.specify_tag(id)
                    task.specify_cores(1)
                    # temporary work-around?
                    # task.specify_memory(1000)
                    # task.specify_disk(4000)

                    for (local, remote) in inputs:
                        if os.path.isfile(local):
                            task.specify_input_file(str(local), str(remote), wq.WORK_QUEUE_CACHE)
                        elif os.path.isdir(local):
                            task.specify_directory(local, remote, wq.WORK_QUEUE_INPUT,
                                                   wq.WORK_QUEUE_CACHE, recursive=True)
                        else:
                            logging.critical("cannot send file to worker: {0}".format(local))
                            raise NotImplementedError

                    for (local, remote) in outputs:
                        task.specify_output_file(str(local), str(remote))

                    queue.submit(task)
            creation_time += int((time.time() - t) * 1e6)

            task = queue.wait(300)
            tasks = []
            while task:
                if task.return_status == 0:
                    successful_jobs += 1
                tasks.append(task)
                if queue.stats.tasks_complete > 0:
                    task = queue.wait(1)
                else:
                    task = None

            if len(tasks) > 0:
                try:
                    t = time.time()
                    job_src.release(tasks)
                    destruction_time += int((time.time() - t) * 1e6)
                except:
                    tb = traceback.format_exc()
                    logging.critical("cannot recover from the following exception:\n" + tb)
                    for task in tasks:
                        logging.critical("tried to return task {0} from {1}".format(task.tag, task.hostname))
                    raise

            if successful_jobs >= abort_threshold and not abort_active:
                logging.info("activating fast abort with multiplier: {0}".format(abort_multiplier))
                abort_active = True
                queue.activate_fast_abort(abort_multiplier)

        if jobits_left == 0:
            logging.info("no more work left to do")