def touch_control_outputs(self):
    task = self.task

    # create the parent directory
    self._outputs["submission"].parent.touch()

    # get all branch indexes and chunk them by tasks_per_job
    branch_chunks = list(iter_chunks(task.branch_map.keys(), task.tasks_per_job))

    # submission output
    if not self._outputs["submission"].exists():
        submission_data = self.submission_data.copy()
        # set dummy submission data
        submission_data.jobs.clear()
        for i, branches in enumerate(branch_chunks):
            job_num = i + 1
            submission_data.jobs[job_num] = self.submission_data_cls.job_data(branches=branches)
        self._outputs["submission"].dump(submission_data, formatter="json", indent=4)

    # status output
    if "status" in self._outputs and not self._outputs["status"].exists():
        status_data = self.status_data_cls()
        # set dummy status data
        for i, branches in enumerate(branch_chunks):
            job_num = i + 1
            status_data.jobs[job_num] = self.status_data_cls.job_data(
                status=self.job_manager.FINISHED, code=0)
        self._outputs["status"].dump(status_data, formatter="json", indent=4)
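# Illustrative, standalone sketch (not part of the implementation above): the control outputs
# rely on chunking branch numbers by tasks_per_job and mapping the chunks to 1-based job numbers.
# _chunked is a simplified stand-in that assumes iter_chunks yields consecutive chunks of at most
# the given size.
def _chunked(iterable, size):
    chunk = []
    for item in iterable:
        chunk.append(item)
        if len(chunk) == size:
            yield chunk
            chunk = []
    if chunk:
        yield chunk

branches = list(range(8))  # e.g. a workflow with 8 branches and tasks_per_job = 3
job_map = {i + 1: chunk for i, chunk in enumerate(_chunked(branches, 3))}
print(job_map)  # -> {1: [0, 1, 2], 2: [3, 4, 5], 3: [6, 7]}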
def query_batch(self, job_ids, threads=None, chunk_size=20, callback=None, **kwargs):
    """
    Queries the status of a batch of jobs given by *job_ids* via a thread pool of size *threads*
    which defaults to its instance attribute. When *chunk_size* is not negative, *job_ids* is
    split into chunks of that size which are passed to :py:meth:`query`. When *callback* is set,
    it is invoked after each successful job (or job chunk) status query with the job number
    (starting from 0) and the result object. All other *kwargs* are passed to :py:meth:`query`.

    This method returns a tuple containing a dictionary that maps job ids to their status query
    data, and a list of exceptions that occurred during status querying. An empty list means that
    no exceptions occurred.
    """
    # default arguments
    threads = threads or self.threads

    def _callback(i):
        return (lambda r: callback(i, r)) if callable(callback) else None

    # threaded processing
    pool = ThreadPool(max(threads, 1))
    gen = job_ids if chunk_size < 0 else iter_chunks(job_ids, chunk_size)
    results = [pool.apply_async(self.query, (job_id_chunk,), kwargs, callback=_callback(i))
               for i, job_id_chunk in enumerate(gen)]
    pool.close()
    pool.join()

    # store status data per job id
    query_data, errors = {}, []
    for res in results:
        try:
            query_data.update(res.get())
        except Exception as e:
            errors.append(e)

    return query_data, errors
def cleanup_batch(self, job_ids, threads=None, chunk_size=20, callback=None, **kwargs):
    """
    Cleans up a batch of jobs given by *job_ids* via a thread pool of size *threads* which
    defaults to its instance attribute. When *chunk_size* is not negative, *job_ids* is split
    into chunks of that size which are passed to :py:meth:`cleanup`. When *callback* is set, it
    is invoked after each successful job (or job chunk) cleaning with the job number (starting
    from 0) and the result object. All other *kwargs* are passed to :py:meth:`cleanup`.

    Exceptions that occurred during job cleaning are stored in a list and returned. An empty list
    means that no exceptions occurred.
    """
    # default arguments
    threads = threads or self.threads

    def _callback(i):
        return (lambda r: callback(i, r)) if callable(callback) else None

    # threaded processing
    pool = ThreadPool(max(threads, 1))
    gen = job_ids if chunk_size < 0 else iter_chunks(job_ids, chunk_size)
    results = [pool.apply_async(self.cleanup, (job_id_chunk,), kwargs, callback=_callback(i))
               for i, job_id_chunk in enumerate(gen)]
    pool.close()
    pool.join()

    # store errors
    errors = []
    for res in results:
        try:
            res.get()
        except Exception as e:
            errors.append(e)

    return errors
def query_batch(self, job_ids, threads=None, chunk_size=20, callback=None, **kwargs):
    # default arguments
    threads = threads or self.threads

    def _callback(i):
        return (lambda r: callback(r, i)) if callable(callback) else None

    # threaded processing
    pool = ThreadPool(max(threads, 1))
    gen = job_ids if chunk_size < 0 else iter_chunks(job_ids, chunk_size)
    results = [
        pool.apply_async(self.query, (job_id_chunk,), kwargs, callback=_callback(i))
        for i, job_id_chunk in enumerate(gen)
    ]
    pool.close()
    pool.join()

    # store status data per job id
    query_data, errors = {}, []
    for res in results:
        try:
            query_data.update(res.get())
        except Exception as e:
            errors.append(e)

    return query_data, errors
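# Illustrative, self-contained sketch of the threaded batch pattern used by the
# query_batch/cleanup_batch variants above: each chunk is processed via ThreadPool.apply_async
# with a per-index callback built by a small factory, and results or exceptions are collected
# afterwards via res.get(). fake_query is a stand-in worker, not the actual query implementation.
from multiprocessing.pool import ThreadPool

def fake_query(job_ids):
    # pretend every job in the chunk finished
    return {job_id: "finished" for job_id in job_ids}

def make_callback(i, callback):
    # bind the chunk index i, mirroring the _callback helpers above
    return (lambda result: callback(i, result)) if callable(callback) else None

def on_done(i, result):
    print("chunk {} -> {}".format(i, result))

chunks = [[1, 2], [3, 4], [5]]
pool = ThreadPool(2)
results = [
    pool.apply_async(fake_query, (chunk,), callback=make_callback(i, on_done))
    for i, chunk in enumerate(chunks)
]
pool.close()
pool.join()

query_data, errors = {}, []
for res in results:
    try:
        query_data.update(res.get())
    except Exception as e:
        errors.append(e)

print(query_data)  # -> {1: 'finished', 2: 'finished', 3: 'finished', 4: 'finished', 5: 'finished'}
print(errors)      # -> []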
def get_branch_chunks(self, chunk_size):
    """
    Returns a list of chunks of branch numbers defined in this workflow with a certain
    *chunk_size*. Example:

    .. code-block:: python

        wf = SomeWorkflowTask()  # has 8 branches
        print(wf.get_branch_chunks(3))
        # -> [[0, 1, 2], [3, 4, 5], [6, 7]]

        wf2 = SomeWorkflowTask(end_branch=5)  # has 5 branches
        print(wf2.get_branch_chunks(3))
        # -> [[0, 1, 2], [3, 4]]
    """
    if self.is_branch():
        return self.as_workflow().get_branch_chunks(chunk_size)

    # get the branch map and create chunks of its branch values
    branch_chunks = iter_chunks(self.get_branch_map().keys(), chunk_size)
    return list(branch_chunks)
def run(self):
    """
    Actual run method that starts the processing of jobs and initiates the status polling, or
    performs job cancelling or cleaning, depending on the task parameters.
    """
    task = self.task
    self._outputs = self.output()

    # create the job dashboard interface
    self.dashboard = task.create_job_dashboard() or NoJobDashboard()

    # read submission data and reset some values
    submitted = not task.ignore_submission and self._outputs["submission"].exists()
    if submitted:
        self.submission_data.update(self._outputs["submission"].load(formatter="json"))
        task.tasks_per_job = self.submission_data.tasks_per_job
        self.dashboard.apply_config(self.submission_data.dashboard_config)

    # store the initially complete branches
    if "collection" in self._outputs:
        collection = self._outputs["collection"]
        count, keys = collection.count(keys=True)
        self._initially_existing_branches = keys

    # cancel jobs?
    if self._cancel_jobs:
        if submitted:
            self.cancel()
        self._controlled_jobs = True

    # cleanup jobs?
    elif self._cleanup_jobs:
        if submitted:
            self.cleanup()
        self._controlled_jobs = True

    # submit and/or wait while polling
    else:
        # maybe set a tracking url
        tracking_url = self.dashboard.create_tracking_url()
        if tracking_url:
            task.set_tracking_url(tracking_url)

        # ensure the output directory exists
        if not submitted:
            self._outputs["submission"].parent.touch()

        # at this point, when the status file exists, it is considered outdated
        if "status" in self._outputs:
            self._outputs["status"].remove()

        try:
            # instantiate the configured job file factory
            self.job_file_factory = self.create_job_file_factory()

            # submit
            if not submitted:
                # set the initial list of unsubmitted jobs
                branches = sorted(task.branch_map.keys())
                branch_chunks = list(iter_chunks(branches, task.tasks_per_job))
                self.submission_data.unsubmitted_jobs = OrderedDict(
                    (i + 1, branches) for i, branches in enumerate(branch_chunks))
                self.submit()

                # sleep once to give the job interface time to register the jobs
                post_submit_delay = self._get_task_attribute("post_submit_delay")()
                if post_submit_delay:
                    logger.debug("sleep for {} seconds due to post_submit_delay".format(
                        post_submit_delay))
                    time.sleep(post_submit_delay)

            # start status polling when a) no_poll is not set, or b) the jobs were already
            # submitted so that failed jobs are resubmitted after a single polling iteration
            if not task.no_poll or submitted:
                self.poll()

        finally:
            # in any event, cleanup the job file
            if self.job_file_factory:
                self.job_file_factory.cleanup_dir(force=False)
def run(self):
    """
    Actual run method that starts the processing of jobs and initiates the status polling, or
    performs job cancelling or cleaning, depending on the task parameters.
    """
    task = self.task
    self._outputs = self.output()

    # create the job dashboard interface
    self.dashboard = task.create_job_dashboard() or NoJobDashboard()

    # read submission data and reset some values
    submitted = not task.ignore_submission and self._outputs["submission"].exists()
    if submitted:
        self.submission_data.update(self._outputs["submission"].load(formatter="json"))
        task.tasks_per_job = self.submission_data.tasks_per_job
        self.dashboard.apply_config(self.submission_data.dashboard_config)
        for job_num in self.submission_data.jobs:
            self.attempts[int(job_num)] = -1

    # when the branch outputs, i.e. the "collection" exists, just create dummy control outputs
    if "collection" in self._outputs and self._outputs["collection"].exists():
        self.touch_control_outputs()

    # cancel jobs?
    elif self._cancel_jobs:
        if submitted:
            self.cancel()

    # cleanup jobs?
    elif self._cleanup_jobs:
        if submitted:
            self.cleanup()

    # submit and/or wait while polling
    else:
        # maybe set a tracking url
        tracking_url = self.dashboard.create_tracking_url()
        if tracking_url:
            task.set_tracking_url(tracking_url)
            print("tracking url set to {}".format(tracking_url))

        # ensure the output directory exists
        if not submitted:
            self._outputs["submission"].parent.touch()

        # at this point, when the status file exists, it is considered outdated
        if "status" in self._outputs:
            self._outputs["status"].remove()

        try:
            self.job_file_factory = self.create_job_file_factory()

            # submit
            if not submitted:
                # set the initial job waiting list
                branches = sorted(task.branch_map.keys())
                branch_chunks = list(iter_chunks(branches, task.tasks_per_job))
                self.submission_data.waiting_jobs = OrderedDict(
                    (i + 1, branches) for i, branches in enumerate(branch_chunks))
                self.submit()

            # start status polling when a) no_poll is not set, or b) the jobs were already
            # submitted so that failed jobs are resubmitted after a single polling iteration
            if not task.no_poll or submitted:
                self.poll()

        finally:
            # finally, cleanup the job file
            if self.job_file_factory:
                self.job_file_factory.cleanup(force=False)
def query_batch(self, job_ids, threads=None, chunk_size=None, callback=None, **kwargs):
    """
    Queries the status of a batch of jobs given by *job_ids* via a thread pool of size *threads*
    which defaults to its instance attribute. When *chunk_size*, which defaults to
    :py:attr:`chunk_size_query`, is not negative, *job_ids* are split into chunks of that size
    which are passed to :py:meth:`query`. When *callback* is set, it is invoked after each
    successful job (or job chunk) status query with the index of the corresponding job id
    (starting at 0) and the obtained status query data or an exception if any occurred. All other
    *kwargs* are passed to :py:meth:`query`.

    This method returns a dictionary that maps job ids to either the status query data or to an
    exception if any occurred.
    """
    # default arguments
    threads = max(threads or self.threads or 1, 1)

    # is chunking allowed?
    if self.chunk_size_query:
        chunk_size = max(chunk_size or self.chunk_size_query, 0)
    else:
        chunk_size = 0
    chunking = chunk_size > 0

    # build chunks (either job ids one by one, or real chunks of job ids)
    job_ids = make_list(job_ids)
    chunks = list(iter_chunks(job_ids, chunk_size)) if chunking else job_ids

    # factory to call the passed callback for each job id even when chunking
    def cb_factory(i):
        if not callable(callback):
            return None
        elif chunking:
            def wrapper(query_data):
                offset = sum(len(chunk) for chunk in chunks[:i])
                for j in range(len(chunks[i])):
                    data = query_data if isinstance(query_data, Exception) else query_data[j]
                    callback(offset + j, data)
            return wrapper
        else:
            def wrapper(data):
                callback(i, data)
            return wrapper

    # threaded processing
    pool = ThreadPool(threads)
    results = [
        pool.apply_async(self.query, (v,), kwargs, callback=cb_factory(i))
        for i, v in enumerate(chunks)
    ]
    pool.close()
    pool.join()

    # store status data per job id or an exception
    query_data = {}
    if chunking:
        for i, (chunk, res) in enumerate(six.moves.zip(chunks, results)):
            data = get_async_result_silent(res)
            if isinstance(data, Exception):
                data = {job_id: data for job_id in chunk}
            query_data.update(data)
    else:
        for job_id, res in six.moves.zip(job_ids, results):
            query_data[job_id] = get_async_result_silent(res)

    return query_data
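# Illustrative sketch (stand-in data, not the code above) of how a chunk-level failure is
# expanded to per-job-id entries in the dictionary returned by query_batch: when querying a whole
# chunk raised, the same exception is stored for every job id of that chunk.
chunks = [["job1", "job2"], ["job3"]]
chunk_results = [
    {"job1": "running", "job2": "finished"},  # successful chunk query
    OSError("scheduler unreachable"),         # failed chunk query
]

query_data = {}
for chunk, result in zip(chunks, chunk_results):
    if isinstance(result, Exception):
        result = {job_id: result for job_id in chunk}
    query_data.update(result)

print(query_data)
# -> {'job1': 'running', 'job2': 'finished', 'job3': OSError('scheduler unreachable')}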
def cleanup_batch(self, job_ids, threads=None, chunk_size=None, callback=None, **kwargs):
    """
    Cleans up a batch of jobs given by *job_ids* via a thread pool of size *threads* which
    defaults to its instance attribute. When *chunk_size*, which defaults to
    :py:attr:`chunk_size_cleanup`, is not negative, *job_ids* are split into chunks of that size
    which are passed to :py:meth:`cleanup`. When *callback* is set, it is invoked after each
    successful job (or job chunk) cleaning with the index of the corresponding job id (starting
    at 0) and either *None* or an exception if any occurred. All other *kwargs* are passed to
    :py:meth:`cleanup`.

    Exceptions that occurred during job cleaning are stored in a list and returned. An empty list
    means that no exceptions occurred.
    """
    # default arguments
    threads = max(threads or self.threads or 1, 1)

    # is chunking allowed?
    if self.chunk_size_cleanup:
        chunk_size = max(chunk_size or self.chunk_size_cleanup, 0)
    else:
        chunk_size = 0
    chunking = chunk_size > 0

    # build chunks (either job ids one by one, or real chunks of job ids)
    job_ids = make_list(job_ids)
    chunks = list(iter_chunks(job_ids, chunk_size)) if chunking else job_ids

    # factory to call the passed callback for each job id even when chunking
    def cb_factory(i):
        if not callable(callback):
            return None
        elif chunking:
            def wrapper(err):
                offset = sum(len(chunk) for chunk in chunks[:i])
                for j in range(len(chunks[i])):
                    callback(offset + j, err)
            return wrapper
        else:
            def wrapper(err):
                callback(i, err)
            return wrapper

    # threaded processing
    pool = ThreadPool(threads)
    results = [
        pool.apply_async(self.cleanup, (v,), kwargs, callback=cb_factory(i))
        for i, v in enumerate(chunks)
    ]
    pool.close()
    pool.join()

    # store errors
    errors = list(filter(bool, flatten(get_async_result_silent(res) for res in results)))

    return errors
def submit_batch(self, job_files, threads=None, chunk_size=None, callback=None, **kwargs):
    """
    Submits a batch of jobs given by *job_files* via a thread pool of size *threads* which
    defaults to its instance attribute. When *chunk_size*, which defaults to
    :py:attr:`chunk_size_submit`, is not negative, *job_files* are split into chunks of that size
    which are passed to :py:meth:`submit`. When *callback* is set, it is invoked after each
    successful job submission with the index of the corresponding job file (starting at 0) and
    either the assigned job id or an exception if any occurred. All other *kwargs* are passed to
    :py:meth:`submit`.

    The return value is a list containing the return values of the particular :py:meth:`submit`
    calls, in an order that corresponds to *job_files*. When an exception was raised during a
    submission, this exception is added to the returned list.
    """
    # default arguments
    threads = max(threads or self.threads or 1, 1)

    # is chunking allowed?
    if self.chunk_size_submit:
        chunk_size = max(chunk_size or self.chunk_size_submit, 0)
    else:
        chunk_size = 0
    chunking = chunk_size > 0

    # build chunks (either job files one by one, or real chunks of job files)
    job_files = make_list(job_files)
    chunks = list(iter_chunks(job_files, chunk_size)) if chunking else job_files

    # factory to call the passed callback for each job file even when chunking
    def cb_factory(i):
        if not callable(callback):
            return None
        elif chunking:
            def wrapper(job_ids):
                offset = sum(len(chunk) for chunk in chunks[:i])
                for j in range(len(chunks[i])):
                    job_id = job_ids if isinstance(job_ids, Exception) else job_ids[j]
                    callback(offset + j, job_id)
            return wrapper
        else:
            def wrapper(job_id):
                callback(i, job_id)
            return wrapper

    # threaded processing
    pool = ThreadPool(threads)
    results = [
        pool.apply_async(self.submit, (v,), kwargs, callback=cb_factory(i))
        for i, v in enumerate(chunks)
    ]
    pool.close()
    pool.join()

    # store return values or errors, same length as job files, independent of chunking
    if chunking:
        outputs = []
        for i, (chunk, res) in enumerate(six.moves.zip(chunks, results)):
            job_ids = get_async_result_silent(res)
            if isinstance(job_ids, Exception):
                job_ids = len(chunk) * [job_ids]
            outputs.extend(job_ids)
    else:
        outputs = flatten(get_async_result_silent(res) for res in results)

    return outputs
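# Small, self-contained sketch of the index bookkeeping in the cb_factory helpers above: when
# inputs are submitted or queried in chunks, the callback still reports the flat index of each
# individual item, computed as the total length of all previous chunks plus the position inside
# the current chunk. The file names are made up for illustration.
chunks = [["a.jdl", "b.jdl", "c.jdl"], ["d.jdl", "e.jdl"], ["f.jdl"]]
flat = [job_file for chunk in chunks for job_file in chunk]

def flat_index(i, j):
    return sum(len(chunk) for chunk in chunks[:i]) + j

for i, chunk in enumerate(chunks):
    for j, job_file in enumerate(chunk):
        assert flat[flat_index(i, j)] == job_file

print(flat_index(1, 0))  # -> 3, i.e. "d.jdl" is the 4th job file overall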
def _build_merge_forest(self):
    # a node in the tree can be described by a tuple of integers, where each value denotes the
    # branch path to go down the tree to reach the node (e.g. (2, 0) -> 2nd branch, 0th branch),
    # so the length of the tuple defines the depth of the node via ``depth = len(node) - 1``
    # the tree itself is a dict that maps depths to lists of nodes with that depth
    # when multiple trees are used (a forest), each one handles ``n_leaves / n_trees`` leaves
    if self._forest_built:
        return

    # helper to convert nested lists of leaf number chunks into a list of nodes in the format
    # described above
    def nodify(obj, node=None, root_id=0):
        if not isinstance(obj, list):
            return []
        nodes = []
        if node is None:
            node = tuple()
        else:
            nodes.append(node)
        for i, _obj in enumerate(obj):
            nodes += nodify(_obj, node + (i if node else root_id,))
        return nodes

    # first, determine the number of files to merge in total when not already set via params
    if self._n_leaves is None:
        # the following lines build the workflow requirements,
        # which strictly requires this task to be a workflow (and also not the forest)
        # for branches, this block is executed anyway via cached workflow properties
        if self.is_branch():
            raise Exception("number of files to merge should not be computed for a branch")

        # get inputs, i.e. outputs of workflow requirements and trace actual inputs to merge
        # an integer number representing the number of inputs is also valid
        inputs = luigi.task.getpaths(self.merge_workflow_requires())
        inputs = self.trace_merge_workflow_inputs(inputs)
        self._n_leaves = inputs if isinstance(inputs, six.integer_types) else len(inputs)

    # infer the number of trees from the merge output
    output = self.merge_output()
    n_trees = 1 if not isinstance(output, TargetCollection) else len(output)

    if self._n_leaves < n_trees:
        raise Exception("too few leaves ({}) for number of requested trees ({})".format(
            self._n_leaves, n_trees))

    # determine the number of leaves per tree
    n_min = self._n_leaves // n_trees
    n_trees_overlap = self._n_leaves % n_trees
    leaves_per_tree = n_trees_overlap * [n_min + 1] + (n_trees - n_trees_overlap) * [n_min]

    # build the trees
    forest = []
    for i, n_leaves in enumerate(leaves_per_tree):
        # build a nested list of leaf numbers using the merge factor
        # e.g. 9 leaves with factor 3 -> [[0, 1, 2], [3, 4, 5], [6, 7, 8]]
        # TODO: this point defines the actual tree structure, which is bottom-up at the moment,
        # but maybe it's good to have this configurable
        nested_leaves = list(iter_chunks(n_leaves, self.merge_factor))
        while len(nested_leaves) > 1:
            nested_leaves = list(iter_chunks(nested_leaves, self.merge_factor))

        # convert the list of nodes to the tree format described above
        tree = {}
        for node in nodify(nested_leaves, root_id=i):
            depth = len(node) - 1
            tree.setdefault(depth, []).append(node)

        forest.append(tree)

    # store values
    self.leaves_per_tree = leaves_per_tree
    self.merge_forest = forest
    self._forest_built = True

    # complain when the depth is too large
    if self.tree_depth > self.max_depth:
        raise ValueError("tree_depth {} exceeds maximum depth {}".format(
            self.tree_depth, self.max_depth))
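# Standalone sketch of the bottom-up tree construction above for a single tree with 9 leaves and
# merge_factor = 3. nodify is copied from the method above; chunked is a simplified stand-in for
# iter_chunks that is assumed to yield consecutive chunks of at most the given size.
def chunked(items, size):
    items = list(items)
    return [items[i:i + size] for i in range(0, len(items), size)]

def nodify(obj, node=None, root_id=0):
    if not isinstance(obj, list):
        return []
    nodes = []
    if node is None:
        node = tuple()
    else:
        nodes.append(node)
    for i, _obj in enumerate(obj):
        nodes += nodify(_obj, node + (i if node else root_id,))
    return nodes

n_leaves, merge_factor = 9, 3

# chunk the leaves repeatedly until a single root chunk remains
nested_leaves = chunked(range(n_leaves), merge_factor)
while len(nested_leaves) > 1:
    nested_leaves = chunked(nested_leaves, merge_factor)
# nested_leaves is now [[[0, 1, 2], [3, 4, 5], [6, 7, 8]]]

# convert the nesting into the depth -> nodes mapping
tree = {}
for node in nodify(nested_leaves, root_id=0):
    tree.setdefault(len(node) - 1, []).append(node)

print(tree)  # -> {0: [(0,)], 1: [(0, 0), (0, 1), (0, 2)]}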
def submit(self, job_map=None):
    task = self.task

    # map branch numbers to job numbers, chunk by tasks_per_job
    if not job_map:
        branch_chunks = list(iter_chunks(task.branch_map.keys(), task.tasks_per_job))
        job_map = dict((i + 1, branches) for i, branches in enumerate(branch_chunks))

    # when only_missing is True, a job can be skipped when all its tasks are completed
    check_skip = False
    if task.only_missing and self.skipped_job_nums is None:
        self.skipped_job_nums = []
        check_skip = True

    # create job files for each chunk
    job_data = OrderedDict()
    for job_num, branches in six.iteritems(job_map):
        if check_skip and all(task.as_branch(b).complete() for b in branches):
            self.skipped_job_nums.append(job_num)
            self.submission_data.jobs[job_num] = self.submission_data_cls.job_data(
                branches=branches)
            continue

        # create and store the job file
        job_data[job_num] = (branches, self.create_job_file(job_num, branches))

    # actual submission
    job_files = [job_file for _, job_file in six.itervalues(job_data)]
    dst_info = self.destination_info() or ""
    dst_info = dst_info and (", " + dst_info)
    task.publish_message("going to submit {} {} job(s){}".format(
        len(job_files), self.workflow_type, dst_info))

    # pass them to the submit_jobs method for the actual submission
    job_ids = self.submit_jobs(job_files)

    # store submission data
    errors = []
    successful_job_nums = []
    for job_num, job_id in six.moves.zip(job_data, job_ids):
        if isinstance(job_id, Exception):
            errors.append((job_num, job_id))
            job_id = self.submission_data_cls.dummy_job_id
        else:
            successful_job_nums.append(job_num)
        self.submission_data.jobs[job_num] = self.submission_data_cls.job_data(
            job_id=job_id, branches=job_data[job_num][0])

    # dump the submission data to the output file
    self.dump_submission_data()

    # raise exceptions or log
    if errors:
        print("{} error(s) occurred during job submission of task {}:".format(
            len(errors), task.task_id))
        tmpl = "    job {}: {}"
        for i, tpl in enumerate(errors):
            print(tmpl.format(*tpl))
            if i + 1 >= self.show_errors:
                remaining = len(errors) - self.show_errors
                if remaining > 0:
                    print("    ... and {} more".format(remaining))
                break
    else:
        task.publish_message("submitted {} job(s)".format(len(job_files)) + dst_info)

    # inform the dashboard about successful submissions
    for job_num in successful_job_nums:
        job_data = self.submission_data.jobs[job_num]
        task.forward_dashboard_event(self.dashboard, job_num, job_data, "action.submit")
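# Illustrative sketch (made-up job ids, not the code above) of how per-job submission results are
# recorded: exceptions returned by the batch submission are collected as errors and replaced by a
# placeholder id, while successfully submitted job numbers are tracked separately. DUMMY_ID
# merely stands in for submission_data_cls.dummy_job_id.
DUMMY_ID = "-"

job_nums = [1, 2, 3]
job_ids = ["1234.0", RuntimeError("submission failed"), "1234.2"]

errors, successful_job_nums, stored = [], [], {}
for job_num, job_id in zip(job_nums, job_ids):
    if isinstance(job_id, Exception):
        errors.append((job_num, job_id))
        job_id = DUMMY_ID
    else:
        successful_job_nums.append(job_num)
    stored[job_num] = job_id

print(errors)               # -> [(2, RuntimeError('submission failed'))]
print(successful_job_nums)  # -> [1, 3]
print(stored)               # -> {1: '1234.0', 2: '-', 3: '1234.2'}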