def generate_tasks_manual(): """ Return a formatted listing of all tasks with their descriptions. """ from siskin.sources import * from siskin.workflows import * output = StringIO.StringIO() # task_tuples = sorted(Register.get_reg().iteritems()) task_names = Register.task_names() output.write(MAN_HEADER) output.write(' {0} tasks found\n\n'.format(len(task_names))) for name in task_names: klass = Register.get_task_cls(name) doc = klass.__doc__ or colors.red("@todo: docs") output.write('{0} {1}\n'.format(colors.green(name), doc)) try: deps = flatten(klass().requires()) except Exception: # TODO: tasks that have required arguments will fail here formatted = colors.yellow("\tUnavailable since task has required parameters.") else: formatted = '\t{0}'.format(pprint.pformat(deps).replace('\n', '\n\t')) output.write(colors.magenta('\n\tDependencies ({0}):\n\n{1}\n\n'.format(len(deps), formatted))) return output.getvalue()
def requires(self):
    """
    Resolve ``self.task`` in the Luigi registry and wrap each target.

    Returns:
        A list of task instances, one per entry in
        ``self.generic_wrapper_target``; an empty list (plus a warning)
        when the task name is not registered, so a misconfigured wrapper
        degrades gracefully instead of crashing.
    """
    from luigi.task import Register
    # Membership test directly on the registry mapping; `x not in d`
    # replaces the unidiomatic `not x in d.keys()`.
    if self.task not in Register.get_reg():
        # logger.warning replaces the deprecated logger.warn alias.
        logger.warning("No such task {} in registry; skipping".format(self.task))
        return []
    else:
        cls = Register.get_reg()[self.task]
        return [cls(target=x) for x in self.generic_wrapper_target]
def requires(self):
    """
    Look up ``self.task`` in the Luigi task registry and instantiate it
    once per generic wrapper target.

    Unknown task names are logged and skipped (empty requirement list)
    rather than raising.
    """
    from luigi.task import Register
    registry = Register.get_reg()
    # `x not in d` instead of `not x in d.keys()`; logger.warning instead
    # of the deprecated logger.warn alias.
    if self.task not in registry:
        logger.warning("No such task {} in registry; skipping".format(
            self.task))
        return []
    else:
        cls = registry[self.task]
        return [cls(target=x) for x in self.generic_wrapper_target]
def main():
    """Print the siskin version, then every registered task with its docstring."""
    print("siskin %s\n\n" % __version__)
    names = Register.task_names()
    print('{0} tasks found\n'.format(len(names)))
    for task_name in names:
        # All-lowercase names are internal/helper tasks; skip them.
        if task_name.islower():
            continue
        task_cls = Register.get_task_cls(task_name)
        description = task_cls.__doc__ or yellow("@TODO: docs")
        print('{0} {1}\n'.format(green(task_name), description))
def requires(self):
    """
    Build (and cache) the list of missing task instances over the
    configured datehour range.

    Raises:
        ParameterException: if neither start nor stop is given, or if
            start is missing while reverse is False.
    """
    # cache because we anticipate lots of tasks
    if hasattr(self, '_cached_requires'):
        return self._cached_requires

    if not self.start and not self.stop:
        raise ParameterException("At least one of start and stop needs to be specified")
    if not self.start and not self.reverse:
        raise ParameterException("Either start needs to be specified or reverse needs to be True")
    # TODO check overridden complete() and exists()

    now = datetime.utcfromtimestamp(time.time() if self.now is None else self.now)
    now = datetime(now.year, now.month, now.day, now.hour)  # truncate to the hour
    datehours = [now + timedelta(hours=h) for h in range(-self.hours_back, self.hours_forward + 1)]
    # Materialize with list(): on Python 3 filter() returns a lazy iterator,
    # which is always truthy and cannot be indexed with [0]/[-1] below.
    datehours = list(filter(lambda h: (not self.start or h >= self.start) and (not self.stop or h < self.stop), datehours))

    task_cls = Register.get_task_cls(self.of)
    if datehours:
        logger.debug('Actually checking if range [%s, %s] of %s is complete' % (datehours[0], datehours[-1], self.of))
        missing_datehours = sorted(self.missing_datehours(task_cls, datehours))
        logger.debug('Range [%s, %s] lacked %d of expected %d %s instances' % (datehours[0], datehours[-1], len(missing_datehours), len(datehours), self.of))
    else:
        missing_datehours = []

    if self.reverse:
        required_datehours = missing_datehours[-self.task_limit:]
    else:
        required_datehours = missing_datehours[:self.task_limit]
    if required_datehours:
        logger.debug('Requiring %d missing %s instances in range [%s, %s]' % (len(required_datehours), self.of, required_datehours[0], required_datehours[-1]))
        if self.reverse:
            required_datehours.reverse()  # I wish this determined the order tasks were scheduled or executed, but it doesn't. No priorities in Luigi yet

    self._cached_requires = [task_cls(d) for d in required_datehours]
    return self._cached_requires
def requires(self):
    """
    Compute (and cache on the instance) the list of missing task
    instances within the configured datetime range.

    Raises:
        ParameterException: if neither start nor stop is given, if start
            is missing while reverse is False, or if start > stop.
    """
    # cache because we anticipate a fair amount of computation
    if hasattr(self, '_cached_requires'):
        return self._cached_requires

    if not self.start and not self.stop:
        raise ParameterException(
            "At least one of start and stop needs to be specified")
    if not self.start and not self.reverse:
        raise ParameterException(
            "Either start needs to be specified or reverse needs to be True")
    if self.start and self.stop and self.start > self.stop:
        raise ParameterException("Can't have start > stop")
    # TODO check overridden complete() and exists()

    now = datetime.utcfromtimestamp(
        time.time() if self.now is None else self.now)

    # Clamp the moving window against any explicit start/stop parameters.
    moving_start = self.moving_start(now)
    finite_start = moving_start if self.start is None else max(
        self.parameter_to_datetime(self.start), moving_start)
    moving_stop = self.moving_stop(now)
    finite_stop = moving_stop if self.stop is None else min(
        self.parameter_to_datetime(self.stop), moving_stop)

    datetimes = self.finite_datetimes(
        finite_start, finite_stop) if finite_start <= finite_stop else []

    task_cls = Register.get_task_cls(self.of)
    if datetimes:
        # Lazy %-style logging args (instead of eager string formatting):
        # formatting only happens when DEBUG is actually enabled, matching
        # the style used by the sibling implementation.
        logger.debug('Actually checking if range %s of %s is complete',
                     self._format_range(datetimes), self.of)
        missing_datetimes = sorted(
            self.missing_datetimes(task_cls, datetimes))
        logger.debug('Range %s lacked %d of expected %d %s instances',
                     self._format_range(datetimes), len(missing_datetimes),
                     len(datetimes), self.of)
    else:
        missing_datetimes = []
        logger.debug('Empty range. No %s instances expected', self.of)

    self._emit_metrics(missing_datetimes, finite_start, finite_stop)

    if self.reverse:
        required_datetimes = missing_datetimes[-self.task_limit:]
    else:
        required_datetimes = missing_datetimes[:self.task_limit]
    if required_datetimes:
        logger.debug('Requiring %d missing %s instances in range %s',
                     len(required_datetimes), self.of,
                     self._format_range(required_datetimes))
        if self.reverse:
            required_datetimes.reverse(
            )  # TODO priorities, so that within the batch tasks are ordered too

    self._cached_requires = [
        task_cls(self.datetime_to_parameter(d)) for d in required_datetimes
    ]
    return self._cached_requires
def parse(self, cmdline_args=None, main_task_cls=None):
    """
    Parse the command line with optparse and return a single-element list
    containing the fully-configured task instance.

    Two-pass strategy: a pass-through parser first extracts only the
    ``--task`` value (ignoring unknown options), then the real parser is
    (re)built with that task's own parameters registered before the final
    parse.

    :param cmdline_args: argument list; ``None`` means ``sys.argv``.
    :param main_task_cls: optional default task class; used as the
        ``--task`` default when given.
    """
    parser = PassThroughOptionParser()

    def add_task_option(p):
        # --task: the help text lists all registered tasks; a default is
        # only set when the caller supplied a main task class.
        if main_task_cls:
            p.add_option('--task', help='Task to run (one of ' + Register.tasks_str() + ') [default: %default]', default=main_task_cls.task_family)
        else:
            p.add_option('--task', help='Task to run (one of %s)' % Register.tasks_str())

    add_global_parameters(parser, optparse=True)
    add_task_option(parser)
    # First pass: only resolve which task was requested.
    options, args = parser.parse_args(args=cmdline_args)

    task_cls_name = options.task
    # Reuse a caller-provided optparse parser when available, otherwise
    # build a fresh one for the second, strict pass.
    if self.__existing_optparse:
        parser = self.__existing_optparse
    else:
        parser = optparse.OptionParser()
    add_task_option(parser)
    task_cls = Register.get_task_cls(task_cls_name)

    # Register all parameters as a big mess
    add_global_parameters(parser, optparse=True)
    add_task_parameters(parser, task_cls, optparse=True)

    # Parse and run
    options, args = parser.parse_args(args=cmdline_args)

    set_global_parameters(options)
    task_params = get_task_parameters(task_cls, options)

    return [task_cls(**task_params)]
def of_cls(self):
    """
    Resolve ``self.of`` to a task class.

    String values are accepted for backwards compatibility but trigger a
    warning; non-strings are assumed to already be task classes.
    """
    if not isinstance(self.of, six.string_types):
        return self.of
    warnings.warn(
        'When using Range programatically, dont pass "of" param as string!'
    )
    return Register.get_task_cls(self.of)
def add_global_parameters(parser, optparse=False):
    """
    Attach every globally-registered task parameter to ``parser``.

    Parameters shared by several tasks are deduplicated so each one is
    added to the parser exactly once.
    """
    already_added = set()
    for task_name, is_without_section, param_name, param in Register.get_all_params():
        if param in already_added:
            continue
        already_added.add(param)
        param.add_to_cmdline_parser(parser, param_name, task_name, optparse=optparse, glob=True, is_without_section=is_without_section)
def _write_task_import_cache(path):
    """
    Write dict to path.

    The dict maps each registered task name (upper-case first letter by
    convention) to its defining module, serialized as JSON.
    """
    with open(path, 'w') as output:
        # Dict comprehension instead of dict([(k, v) for ...]).
        task_import_cache = {name: Register.get_task_cls(name).__module__
                             for name in Register.task_names() if name[0].isupper()}
        json.dump(task_import_cache, output)
def load_task(module, task_name, params_str):
    """
    Imports task dynamically given a module and a task name.

    The module import side effect registers the task class, which is then
    looked up by name and instantiated from its string parameters.
    """
    __import__(module)
    cls = Register.get_task_cls(task_name)
    return cls.from_str_params(params_str)
def parse_task(self, cmdline_args=None, main_task_cls=None):
    """
    Parse the command line with argparse and return a single-element list
    containing the configured task instance.

    :param cmdline_args: argument list; ``None`` means ``sys.argv``.
    :param main_task_cls: when given, its parameters are added directly
        and no task name is expected on the command line; otherwise the
        first unknown positional argument names the task.
    """
    parser = argparse.ArgumentParser()

    add_global_parameters(parser)

    if main_task_cls:
        add_task_parameters(parser, main_task_cls)

        args = parser.parse_args(args=cmdline_args)
        task_cls = main_task_cls
    else:
        task_names = sorted(Register.get_reg().keys())

        # Parse global arguments and pull out the task name.
        # We used to do this using subparsers+command, but some issues with
        # argparse across different versions of Python (2.7.9) made it hard.
        args, unknown = parser.parse_known_args(args=cmdline_args)
        if len(unknown) == 0:
            raise SystemExit('No task specified')
        task_name = unknown[0]
        if task_name not in task_names:
            error_task_names(task_name, task_names)

        task_cls = Register.get_task_cls(task_name)

        # Add a subparser to parse task-specific arguments
        subparsers = parser.add_subparsers(dest='command')
        subparser = subparsers.add_parser(task_name)

        # Add both task and global params here so that we can support both:
        # test.py --global-param xyz Test --n 42
        # test.py Test --n 42 --global-param xyz
        add_global_parameters(subparser)
        add_task_parameters(subparser, task_cls)

        # Workaround for bug in argparse for Python 2.7.9
        # See https://mail.python.org/pipermail/python-dev/2015-January/137699.html
        subargs = parser.parse_args(args=cmdline_args)
        for key, value in vars(subargs).items():
            if value:  # Either True (for boolean args) or non-None (everything else)
                setattr(args, key, value)

    # Notice that this is not side effect free because it might set global params
    set_global_parameters(args)
    task_params = get_task_parameters(task_cls, args)

    return [task_cls(**task_params)]
def of_cls(self):
    """
    DONT USE. Will be deleted soon. Use ``self.of``!

    Resolves ``self.of`` to a task class; string values are still
    accepted (with a warning) for backwards compatibility.
    """
    if not isinstance(self.of, six.string_types):
        return self.of
    warnings.warn('When using Range programatically, dont pass "of" param as string!')
    return Register.get_task_cls(self.of)
def _write_task_import_cache(path):
    """
    Write dictionary of task name module name mappings to given path.

    Only proper task classes (upper-case first letter by convention) are
    included; the mapping is serialized as JSON.
    """
    with open(path, 'w') as output:
        # Dict comprehension instead of dict([(k, v) for ...]).
        task_import_cache = {name: Register.get_task_cls(name).__module__
                             for name in Register.task_names() if name[0].isupper()}
        json.dump(task_import_cache, output)
def requires(self):
    """
    Compute (and cache on the instance) the missing task instances for
    the configured datehour range, emitting metrics along the way.

    Raises:
        ParameterException: if neither start nor stop is given, or if
            start is missing while reverse is False.
    """
    # cache because we anticipate a fair amount of computation
    if hasattr(self, '_cached_requires'):
        return self._cached_requires
    if not self.start and not self.stop:
        raise ParameterException(
            "At least one of start and stop needs to be specified")
    if not self.start and not self.reverse:
        raise ParameterException(
            "Either start needs to be specified or reverse needs to be True"
        )
    # TODO check overridden complete() and exists()

    now = datetime.utcfromtimestamp(
        time.time() if self.now is None else self.now)
    now = datetime(now.year, now.month, now.day, now.hour)  # truncate to the hour
    datehours = [
        now + timedelta(hours=h)
        for h in range(-self.hours_back, self.hours_forward + 1)
    ]
    # Materialize with list(): on Python 3 filter() returns a lazy
    # iterator, which is always truthy and cannot be indexed below.
    datehours = list(filter(
        lambda h: (not self.start or h >= self.start) and
        (not self.stop or h < self.stop), datehours))

    task_cls = Register.get_task_cls(self.of)
    if datehours:
        logger.debug(
            'Actually checking if range [%s, %s] of %s is complete' %
            (datehours[0], datehours[-1], self.of))
        missing_datehours = sorted(
            self.missing_datehours(task_cls, datehours))
        logger.debug(
            'Range [%s, %s] lacked %d of expected %d %s instances' %
            (datehours[0], datehours[-1], len(missing_datehours),
             len(datehours), self.of))
    else:
        missing_datehours = []

    self._emit_metrics(missing_datehours, now)

    if self.reverse:
        required_datehours = missing_datehours[-self.task_limit:]
    else:
        required_datehours = missing_datehours[:self.task_limit]
    if required_datehours:
        logger.debug(
            'Requiring %d missing %s instances in range [%s, %s]' %
            (len(required_datehours), self.of, required_datehours[0],
             required_datehours[-1]))
        if self.reverse:
            required_datehours.reverse(
            )  # I wish this determined the order tasks were scheduled or executed, but it doesn't. No priorities in Luigi yet

    self._cached_requires = [task_cls(d) for d in required_datehours]
    return self._cached_requires
def of_cls(self):
    """
    DONT USE. Will be deleted soon. Use ``self.of``!

    Accepts a string for backwards compatibility, resolving it through
    the task registry after warning the caller.
    """
    if isinstance(self.of, six.string_types):
        warnings.warn(
            'When using Range programatically, dont pass "of" param as string!'
        )
        resolved = Register.get_task_cls(self.of)
        return resolved
    return self.of
def requires(self):
    """
    Compute (and cache on the instance) the list of missing task
    instances within the configured datetime range.

    Raises:
        ParameterException: if neither start nor stop is given, if start
            is missing while reverse is False, or if start > stop.
    """
    # cache because we anticipate a fair amount of computation
    if hasattr(self, '_cached_requires'):
        return self._cached_requires

    if not self.start and not self.stop:
        raise ParameterException("At least one of start and stop needs to be specified")
    if not self.start and not self.reverse:
        raise ParameterException("Either start needs to be specified or reverse needs to be True")
    if self.start and self.stop and self.start > self.stop:
        raise ParameterException("Can't have start > stop")
    # TODO check overridden complete() and exists()

    now = datetime.utcfromtimestamp(time.time() if self.now is None else self.now)

    # Clamp the moving window against any explicit start/stop parameters.
    moving_start = self.moving_start(now)
    finite_start = moving_start if self.start is None else max(self.parameter_to_datetime(self.start), moving_start)
    moving_stop = self.moving_stop(now)
    finite_stop = moving_stop if self.stop is None else min(self.parameter_to_datetime(self.stop), moving_stop)

    datetimes = self.finite_datetimes(finite_start, finite_stop) if finite_start <= finite_stop else []

    task_cls = Register.get_task_cls(self.of)
    if datetimes:
        logger.debug('Actually checking if range %s of %s is complete', self._format_range(datetimes), self.of)
        missing_datetimes = sorted(self.missing_datetimes(task_cls, datetimes))
        logger.debug('Range %s lacked %d of expected %d %s instances', self._format_range(datetimes), len(missing_datetimes), len(datetimes), self.of)
    else:
        missing_datetimes = []
        logger.debug('Empty range. No %s instances expected', self.of)

    self._emit_metrics(missing_datetimes, finite_start, finite_stop)

    # task_limit caps how many of the missing instances are required per
    # run; reverse selects from the end of the (sorted) list instead.
    if self.reverse:
        required_datetimes = missing_datetimes[-self.task_limit:]
    else:
        required_datetimes = missing_datetimes[:self.task_limit]
    if required_datetimes:
        logger.debug('Requiring %d missing %s instances in range %s', len(required_datetimes), self.of, self._format_range(required_datetimes))
        if self.reverse:
            required_datetimes.reverse()  # TODO priorities, so that within the batch tasks are ordered too

    self._cached_requires = [task_cls(self.datetime_to_parameter(d)) for d in required_datetimes]
    return self._cached_requires
def get_task_import_cache(): """ Load `taskname: modulename` mappings from dictionary. Return a tuple containing the dictionary and the path to the cache file. """ task_import_cache = None path = os.path.join(tempfile.gettempdir(), 'siskin_task_import_cache_%s' % __version__) if not os.path.exists(path): from siskin.sources import * from siskin.workflows import * with open(path, 'w') as output: task_import_cache = dict([(name, Register.get_task_cls(name).__module__) for name in Register.task_names() if name[0].isupper()]) json.dump(task_import_cache, output) if task_import_cache is None: with open(path) as handle: try: task_import_cache = json.load(handle) except Exception as err: print("failed load task import cache, try removing %s and then try again" % path, file=sys.stderr) sys.exit(1) return task_import_cache, path
def gen_sphinx_tasks(entry_point, labels, *_args, **kwargs):
    """
    Writes a file per label, suitable for use by sphinx.ext.autodoc,
    using the classes found from entry_point.

    Also generates toctree.inc, which can be included from the index page
    to provide links to each generated file.

    :param entry_point: stevedore entry point whose modules contain the tasks.
    :param labels: one section heading per category.
    :param kwargs: may contain ``categories``, parallel to ``labels``.
    """
    # Declare file header strings
    warning = '''.. WARNING: DO NOT EDIT THIS FILE DIRECTLY
Generated by sphinx_source/gen_tasks.py on {now}
'''.format(now=time.strftime('%c'))
    toctree_header = '''{warning}
:orphan:

.. toctree::
   :maxdepth: 1
'''
    incfile_header = '''{warning}
.. _{category_slug}:

Back to :doc:`index`

{label_heading}
'''

    # Load modules into memory
    stevedore.ExtensionManager(entry_point)

    # Used to filter the classes under entry_point
    entry_point_dot = '{entry_point}.'.format(entry_point=entry_point)

    # Generate a list of output file arguments from the given labels and categories
    output = []
    categories = kwargs.get('categories', [])
    for idx, label in enumerate(labels):
        try:
            category = ''
            if idx < len(categories):
                category = categories[idx]
            # Create a category slug for sphinx, and name the file with it
            category_slug = category.replace(' ', '_') or 'all'
            file_name = '{slug}.rst'.format(slug=category_slug)
            file_path = os.path.join(SPHINX_DIR, file_name)
            file_pointer = open(file_path, "w")
            output.append({
                'fp': file_pointer,
                'file_name': file_name,
                'category': category,
                'category_slug': category_slug,
                'label': label,
                'label_heading': "{label}\n{_}".format(label=label, _='=' * len(label)),
                'modules': {},
            })
        except IOError:
            sys.exit('Unable to write to {file_path}'.format(file_path=file_path))

    # Write the header to the table of contents file
    tocfile_name = os.path.join(SPHINX_DIR, 'toctree.rst')
    try:
        tocfile = open(tocfile_name, "w")
        tocfile.write(toctree_header.format(warning=warning))
    except IOError:
        sys.exit('Unable to write to {file_name}'.format(file_name=tocfile_name))

    # For each Task, sorted by class name: bucket it into the matching
    # output category, keyed by its (entry_point-stripped) module name.
    tasks = Register.get_reg()
    for name in sorted(tasks):
        cls = tasks[name]
        for out in output:
            # Show only tasks under entry_point
            module = cls.__module__
            if module.startswith(entry_point_dot):
                # Strip off entry_point to avoid redundancy in documentation
                module = module.replace(entry_point_dot, '')
                if getattr(cls, 'task_category', '') == out['category']:
                    if module not in out['modules']:
                        out['modules'][module] = {}
                    out['modules'][module][name] = cls

    # Emit one automodule/autoclass section per module per category file,
    # registering each non-empty file in the toctree.
    for out in output:
        modules = sorted(out['modules'].keys())
        if modules:
            tocfile.write("\n {incfile}".format(incfile=out['file_name']))
            out['fp'].write(incfile_header.format(warning=warning, **out))
            for module in modules:
                module_heading = '{module}'.format(module=module)
                out['fp'].write("\n\n{module_heading}\n{_}".format(
                    module_heading=module_heading, _='-' * len(module_heading)))
                out['fp'].write("\n\n.. automodule:: {module}".format(module=module))
                names = out['modules'][module]
                for name in sorted(names):
                    out['fp'].write("\n\n.. autoclass:: {name}".format(name=name))
        out['fp'].close()
    tocfile.close()
def of_cls(self):
    """
    Resolve ``self.of`` to a task class, tolerating (but warning about)
    string values kept for backwards compatibility.
    """
    if not isinstance(self.of, six.string_types):
        return self.of
    warnings.warn('When using Range programatically, dont pass "of" param as string!')
    return Register.get_task_cls(self.of)
def test_cmdline(self):
    # Exposes issue where wrapped tasks are registered twice under
    # the same name
    from luigi.task import Register
    registry = Register.get_reg()
    self.assertEqual(registry.get('SubtaskDelegator', None), SubtaskDelegator)
def gen_sphinx_tasks(entry_point, labels, *_args, **kwargs):
    """
    Writes a file per label, suitable for use by sphinx.ext.autodoc,
    using the classes found from entry_point.

    Also generates toctree.inc, which can be included from the index page
    to provide links to each generated file.

    :param entry_point: stevedore entry point whose modules contain the tasks.
    :param labels: one section heading per category.
    :param kwargs: may contain ``categories``, parallel to ``labels``.
    """
    # Declare file header strings
    warning = '''.. WARNING: DO NOT EDIT THIS FILE DIRECTLY
Generated by sphinx_source/gen_tasks.py on {now}
'''.format(now=time.strftime('%c'))
    toctree_header = '''{warning}
.. toctree::
   :maxdepth: 1
'''
    incfile_header = '''{warning}
.. _{category_slug}:

Back to :doc:`index`

{label_heading}
'''

    # Load modules into memory
    stevedore.ExtensionManager(entry_point)

    # Used to filter the classes under entry_point
    entry_point_dot = '{entry_point}.'.format(entry_point=entry_point)

    # Generate a list of output file arguments from the given labels and categories
    output = []
    categories = kwargs.get('categories', [])
    for idx, label in enumerate(labels):
        try:
            category = ''
            if idx < len(categories):
                category = categories[idx]
            # Create a category slug for sphinx, and name the file with it
            category_slug = category.replace(' ', '_') or 'all'
            file_name = '{slug}.rst'.format(slug=category_slug)
            file_path = os.path.join(SPHINX_DIR, file_name)
            file_pointer = open(file_path, "w")
            output.append({
                'fp': file_pointer,
                'file_name': file_name,
                'category': category,
                'category_slug': category_slug,
                'label': label,
                'label_heading': "{label}\n{_}".format(label=label, _='=' * len(label)),
                'modules': {},
            })
        except IOError:
            sys.exit(
                'Unable to write to {file_path}'.format(file_path=file_path))

    # Write the header to the table of contents file
    tocfile_name = os.path.join(SPHINX_DIR, 'toctree.rst')
    try:
        tocfile = open(tocfile_name, "w")
        tocfile.write(toctree_header.format(warning=warning))
    except IOError:
        sys.exit(
            'Unable to write to {file_name}'.format(file_name=tocfile_name))

    # For each Task, sorted by class name: bucket it into the matching
    # output category, keyed by its full module name.
    tasks = Register.get_reg()
    for name in sorted(tasks):
        cls = tasks[name]
        module = cls.__module__
        # Show only tasks under entry_point
        if module.startswith(entry_point_dot):
            for out in output:
                # Show only tasks in the output category
                if getattr(cls, 'task_category', '') == out['category']:
                    if module not in out['modules']:
                        out['modules'][module] = {}
                    out['modules'][module][name] = cls

    # Emit one automodule/autoclass section per module per category file,
    # registering each non-empty file in the toctree.
    for out in output:
        modules = sorted(out['modules'].keys())
        if modules:
            tocfile.write("\n {incfile}".format(incfile=out['file_name']))
            out['fp'].write(incfile_header.format(warning=warning, **out))
            for module in modules:
                # Strip off entry_point to avoid redundancy in documentation
                module_heading = '{module}'.format(
                    module=module.replace(entry_point_dot, ''))
                out['fp'].write("\n\n{module_heading}\n{_}".format(
                    module_heading=module_heading, _='-' * len(module_heading)))
                out['fp'].write(
                    "\n\n.. automodule:: {module}".format(module=module))
                names = out['modules'][module]
                for name in sorted(names):
                    out['fp'].write("\n\n.. autoclass:: {name}".format(name=name))
        out['fp'].close()
    tocfile.close()
def set_global_parameters(args):
    """
    Push parsed command-line values from ``args`` into every registered
    global parameter. Note that this is not side effect free.
    """
    for owner_task, without_section, name, param in Register.get_all_params():
        param.set_global_from_args(name, owner_task, args, is_without_section=without_section)
def add_task_option(p):
    # Register the --task option on parser `p`; a default is only set
    # when the enclosing scope provides a main task class.
    if main_task_cls:
        help_text = 'Task to run (one of ' + Register.tasks_str() + ') [default: %default]'
        p.add_option('--task', help=help_text, default=main_task_cls.task_family)
    else:
        help_text = 'Task to run (one of %s)' % Register.tasks_str()
        p.add_option('--task', help=help_text)
def requires(self):
    """Instantiate one ``self.of`` task per file key listed in ``self.data``."""
    cls = Register.get_task_cls(self.of)
    dependencies = []
    for file_key in self.data.file_keys:
        dependencies.append(cls(self.config, file_key.name))
    return dependencies
def requires(self):
    """Instantiate one ``self.of`` task per file key of the (module-level) ``dataset``."""
    cls = Register.get_task_cls(self.of)
    return [cls(file_key.name) for file_key in dataset.file_keys]