def _run_get_new_deps(self): # set task callbacks before running for reporter_attr, task_attr in six.iteritems(self.forward_reporter_callbacks): setattr(self.task, task_attr, getattr(self.status_reporter, reporter_attr)) task_gen = self.task.run() # reset task callbacks for reporter_attr, task_attr in six.iteritems(self.forward_reporter_callbacks): setattr(self.task, task_attr, None) if not isinstance(task_gen, types.GeneratorType): return None next_send = None while True: try: if next_send is None: requires = six.next(task_gen) else: requires = task_gen.send(next_send) except StopIteration: return None new_req = flatten(requires) if all(t.complete() for t in new_req): next_send = getpaths(requires) else: new_deps = [(t.task_module, t.task_family, t.to_str_params()) for t in new_req] return new_deps
def _build_pig_cmd(self):
    """
    Yield the Pig command line as a list of arguments.

    Parameters and properties, when present, are written to temporary files which are passed
    to Pig with ``-param_file`` and ``-propertyFile``. Both files are closed when the ``with``
    block is left, i.e. once this generator is resumed after the ``yield``.
    """
    def render(name, value):
        # The temporary files are opened in binary mode, hence the explicit encoding.
        return ('%s=%s%s' % (name, value, os.linesep)).encode('utf-8')

    opts = self.pig_options()

    with tempfile.NamedTemporaryFile() as param_file, tempfile.NamedTemporaryFile() as prop_file:
        if self.pig_parameters():
            parameters = self.pig_parameters().items()
            param_file.writelines(render(name, value) for name, value in parameters)
            param_file.flush()
            opts.extend(['-param_file', param_file.name])

        if self.pig_properties():
            properties = self.pig_properties().items()
            prop_file.writelines(render(name, value) for name, value in properties)
            prop_file.flush()
            opts.extend(['-propertyFile', prop_file.name])

        cmd = [self.pig_command_path()]
        cmd += opts
        cmd += ["-f", self.pig_script_path()]

        logger.info(subprocess.list2cmdline(cmd))
        yield cmd
def load(self):
    """
    Restore ``self._tasks`` and ``self._active_workers`` from the pickle at ``self._state_path``.

    When the file does not exist, or cannot be loaded, the current state is left untouched and
    a message is logged. After a successful load, ``self._status_tasks`` is rebuilt and state
    written in older formats is upgraded in place.
    """
    if not os.path.exists(self._state_path):
        logger.info("No prior state file exists at %s. Starting with clean slate", self._state_path)
        return

    logger.info("Attempting to load state from %s", self._state_path)
    try:
        with open(self._state_path, 'rb') as fobj:
            # NOTE: pickle must only be used on trusted files; the state file is assumed to
            # have been written by this scheduler.
            state = pickle.load(fobj)
    except Exception:
        # Deliberately not BaseException: KeyboardInterrupt and SystemExit must propagate.
        logger.exception("Error when loading state. Starting from clean slate.")
        return

    self._tasks, self._active_workers = state
    self._status_tasks = collections.defaultdict(dict)
    for task in self._tasks.values():
        self._status_tasks[task.status][task.id] = task

    # Convert from old format
    # TODO: this is really ugly, we need something more future-proof
    # Every time we add an attribute to the Worker class, this code needs to be updated
    for k, v in self._active_workers.items():
        if isinstance(v, float):
            # Only existing keys are reassigned, so the dict size is stable while iterating.
            self._active_workers[k] = Worker(worker_id=k, last_active=v)

    if any(not hasattr(w, 'tasks') for w in self._active_workers.values()):
        # If you load from an old format where Workers don't contain tasks.
        for worker in self._active_workers.values():
            worker.tasks = set()
        for task in self._tasks.values():
            for worker_id in task.workers:
                self._active_workers[worker_id].tasks.add(task)
def _task_expl_groups(self, expls): if not self._config.group_by_error_messages: return [((task,), msg) for task, msg in six.iteritems(expls)] groups = collections.defaultdict(list) for task, msg in six.iteritems(expls): groups[msg].append(task) return [(tasks, msg) for msg, tasks in six.iteritems(groups)]
def _forward_attributes(self): # forward configured attributes to the task for reporter_attr, task_attr in six.iteritems(self.forward_reporter_attributes): setattr(self.task, task_attr, getattr(self.status_reporter, reporter_attr)) try: yield self finally: # reset attributes again for reporter_attr, task_attr in six.iteritems(self.forward_reporter_attributes): setattr(self.task, task_attr, None)
def wrapper(*args, **kwargs):
    """
    Call ``fun`` with the settings from ``self.config`` applied to a newly obtained config.

    The configuration instance that was active before the call is always put back, also when
    applying the settings or ``fun`` itself raises.
    """
    import luigi.configuration
    orig_conf = luigi.configuration.get_config()
    try:
        # Clearing the cached instance makes get_config() hand out a different parser object
        # (presumably a freshly loaded one -- not visible from here).
        luigi.configuration.LuigiConfigParser._instance = None
        conf = luigi.configuration.get_config()
        for (section, settings) in self.config.items():
            if not conf.has_section(section):
                conf.add_section(section)
            for (name, value) in settings.items():
                conf.set(section, name, value)
        return fun(*args, **kwargs)
    finally:
        luigi.configuration.LuigiConfigParser._instance = orig_conf
def wrapper(*args, **kwargs):
    """
    Call ``fun`` with a temporary configuration derived from the current one.

    The current configuration is dumped to a ``{section: {name: value}}`` dict, transformed by
    ``self._make_dict`` and loaded into a new ``LuigiConfigParser`` that is installed for the
    duration of the call. The original instance is always restored, also when building the
    temporary configuration or ``fun`` itself raises.
    """
    import luigi.configuration
    orig_conf = luigi.configuration.LuigiConfigParser.instance()
    new_conf = luigi.configuration.LuigiConfigParser()
    try:
        luigi.configuration.LuigiConfigParser._instance = new_conf
        orig_dict = {k: dict(orig_conf.items(k)) for k in orig_conf.sections()}
        new_dict = self._make_dict(orig_dict)
        for (section, settings) in new_dict.items():
            new_conf.add_section(section)
            for (name, value) in settings.items():
                new_conf.set(section, name, value)
        return fun(*args, **kwargs)
    finally:
        luigi.configuration.LuigiConfigParser._instance = orig_conf
def _build_pig_cmd(self): opts = self.pig_options() for k, v in six.iteritems(self.pig_parameters()): opts.append("-p") opts.append("%s=%s" % (k, v)) if self.pig_properties(): with open('pig_property_file', 'w') as prop_file: prop_file.writelines(["%s=%s%s" % (k, v, os.linesep) for (k, v) in six.iteritems(self.pig_properties())]) opts.append('-propertyFile') opts.append('pig_property_file') cmd = [self.pig_command_path()] + opts + ["-f", self.pig_script_path()] return cmd
def _batchable_tasks(cls): for family, task_class in six.iteritems(cls._get_reg()): if task_class == cls.AMBIGUOUS_CLASS: continue batch_param_names = task_class.batch_param_names() if batch_param_names: yield family, task_class, batch_param_names
def no_unpicklable_properties(self):
    """
    Remove unpicklable properties before dump task and resume them after.

    This method could be called in subtask's dump method, to ensure unpicklable
    properties won't break dump.

    The original values are restored when the block is left, also when the body raises.

    This method is a context-manager which can be called as below:

    .. code-block:: python

        class DummyTask(luigi.Task):

            def _dump(self):
                with self.no_unpicklable_properties():
                    pickle.dumps(self)

    """
    unpicklable_properties = ('set_tracking_url', 'set_status_message', 'set_progress_percentage')
    reserved_properties = {}
    for property_name in unpicklable_properties:
        if hasattr(self, property_name):
            reserved_properties[property_name] = getattr(self, property_name)
            setattr(self, property_name, 'placeholder_during_pickling')

    try:
        yield
    finally:
        # Without the finally clause a failing dump would leave the placeholders in place.
        for property_name, value in reserved_properties.items():
            setattr(self, property_name, value)
def clone(self, cls=None, **kwargs):
    """
    Creates a new instance from an existing instance where some of the args have changed.

    There's at least two scenarios where this is useful (see test/clone_test.py):

    * remove a lot of boiler plate when you have recursive dependencies and lots of args
    * there's task inheritance and some logic is on the base class

    :param cls: class to instantiate; defaults to the class of ``self``.
    :param kwargs: values overriding the ones in ``self.param_kwargs``.
    :return: ``cls`` instantiated with those merged values that ``cls`` declares as parameters.
    """
    target_cls = self.__class__ if cls is None else cls

    merged = self.param_kwargs.copy()
    merged.update(kwargs)

    # Only hand over what the target class actually declares.
    accepted = {}
    for param_name, _ in target_cls.get_params():
        if param_name in merged:
            accepted[param_name] = merged[param_name]
    return target_cls(**accepted)
def _purge_children(self):
    """
    Find dead children and put a response on the result queue.

    A child counts as failed when it is no longer alive and has a non-zero exit code, or when
    its timeout has passed while it is still alive (in which case termination is attempted).
    The failure is logged, and reported on the result queue only once the child is not alive.

    :return:
    """
    for task_id, proc in self._running_tasks.items():
        label = 'Task {} (pid {})'.format(task_id, proc.pid)

        if not proc.is_alive() and proc.exitcode:
            error_msg = '{} died unexpectedly with exit code {}'.format(label, proc.exitcode)
            proc.task.trigger_event(Event.PROCESS_FAILURE, proc.task, error_msg)
        elif proc.timeout_time is not None and time.time() > float(proc.timeout_time) and proc.is_alive():
            terminated = proc.terminate()
            if terminated:
                error_msg = '{} timed out after {} seconds and was terminated.'.format(
                    label, proc.task.worker_timeout)
                proc.task.trigger_event(Event.TIMEOUT, proc.task, error_msg)
            else:
                if proc.timeout_time:
                    sec_since_timeout = int(round(time.time() - proc.timeout_time))
                else:
                    sec_since_timeout = 0
                error_msg = '{} timed out after {} seconds but failed to terminate, {} seconds overdue'.format(
                    label, int(proc.task.worker_timeout), sec_since_timeout)
        else:
            # Still running within its time budget, or exited cleanly: nothing to report.
            continue

        logger.info(error_msg)
        if not proc.is_alive():
            result = (task_id, FAILED, error_msg, [], [])
            self._task_result_queue.put(result)
def no_unpicklable_properties(self):
    """
    Remove unpicklable properties before dump task and resume them after.

    This method could be called in subtask's dump method, to ensure unpicklable
    properties won't break dump.

    The properties are the task attribute names listed as values of
    ``luigi.worker.TaskProcess.forward_reporter_attributes``. Their original values are
    restored when the block is left, also when the body raises.

    This method is a context-manager which can be called as below:

    .. code-block:: python

        class DummyTask(luigi.Task):

            def _dump(self):
                with self.no_unpicklable_properties():
                    pickle.dumps(self)

    """
    unpicklable_properties = tuple(
        luigi.worker.TaskProcess.forward_reporter_attributes.values())
    reserved_properties = {}
    for property_name in unpicklable_properties:
        if hasattr(self, property_name):
            reserved_properties[property_name] = getattr(
                self, property_name)
            setattr(self, property_name, 'placeholder_during_pickling')

    try:
        yield
    finally:
        # Without the finally clause a failing dump would leave the placeholders in place.
        for property_name, value in reserved_properties.items():
            setattr(self, property_name, value)
def get_param_values(cls, params, args, kwargs):
    """
    Get the values of the parameters from the args and kwargs.

    Positional arguments are matched against the positional parameters in declaration order,
    then keyword arguments are applied, and finally every parameter that is still unset takes
    its task value. Explicitly passed values go through the parameter's ``normalize``.

    :param params: list of (param_name, Parameter).
    :param args: positional arguments
    :param kwargs: keyword arguments.
    :returns: list of `(name, value)` tuples, one for each parameter.
    :raises parameter.UnknownParameterException: too many positional arguments, or an unknown
        keyword argument.
    :raises parameter.DuplicateParameterException: a keyword repeats a positional argument.
    :raises parameter.MissingParameterException: a parameter has no value at all.
    """
    def as_hashable(value):
        """ Make tuples out of lists and sets to allow hashing """
        if isinstance(value, (list, set)):
            return tuple(value)
        return value

    family = cls.get_task_family()
    declared = dict(params)
    values = {}

    # In case any exceptions are thrown, create a helpful description of how the Task was invoked
    # TODO: should we detect non-reprable arguments? These will lead to mysterious errors
    exc_desc = '%s[args=%s, kwargs=%s]' % (family, args, kwargs)

    # Fill in the positional arguments
    positional = [(name, param) for name, param in params if param.positional]
    for index, raw in enumerate(args):
        if index >= len(positional):
            raise parameter.UnknownParameterException(
                '%s: takes at most %d parameters (%d given)' % (exc_desc, len(positional), len(args)))
        name, param = positional[index]
        values[name] = param.normalize(raw)

    # Then the keyword arguments
    for name, raw in kwargs.items():
        if name in values:
            raise parameter.DuplicateParameterException(
                '%s: parameter %s was already set as a positional parameter' % (exc_desc, name))
        if name not in declared:
            raise parameter.UnknownParameterException(
                '%s: unknown parameter %s' % (exc_desc, name))
        values[name] = declared[name].normalize(raw)

    # Then use the defaults for anything not filled in
    for name, param in params:
        if name in values:
            continue
        if not param.has_task_value(family, name):
            raise parameter.MissingParameterException(
                "%s: requires the '%s' parameter to be set" % (exc_desc, name))
        values[name] = param.task_value(family, name)

    # Sort it by the correct order and make a list
    return [(name, as_hashable(values[name])) for name, _ in params]
def flatten(struct):
    """
    Creates a flat list of all items in structured output (dicts, lists, items):

    .. code-block:: python

        >>> sorted(flatten({'a': 'foo', 'b': 'bar'}))
        ['bar', 'foo']
        >>> sorted(flatten(['foo', ['bar', 'troll']]))
        ['bar', 'foo', 'troll']
        >>> flatten('foo')
        ['foo']
        >>> flatten(42)
        [42]

    ``None`` flattens to an empty list, dicts contribute their values only, and strings are
    treated as leaves rather than as iterables of characters.
    """
    if struct is None:
        return []
    flat = []
    if isinstance(struct, dict):
        for result in struct.values():
            flat += flatten(result)
        return flat
    if isinstance(struct, six.string_types):
        return [struct]

    try:
        # if iterable
        iterator = iter(struct)
    except TypeError:
        return [struct]

    # Iterating happens outside the try block so that a TypeError raised while producing
    # items is not mistaken for "not iterable" and silently swallowed.
    for result in iterator:
        flat += flatten(result)
    return flat
def flatten(struct):
    """
    Creates a flat list of all all items in structured output (dicts, lists, items):

    .. code-block:: python

        >>> sorted(flatten({'a': 'foo', 'b': 'bar'}))
        ['bar', 'foo']
        >>> sorted(flatten(['foo', ['bar', 'troll']]))
        ['bar', 'foo', 'troll']
        >>> flatten('foo')
        ['foo']
        >>> flatten(42)
        [42]

    ``None`` gives an empty list; dict keys are dropped; strings and non-iterables are leaves.
    """
    if struct is None:
        return []

    if isinstance(struct, dict):
        children = struct.values()
    elif isinstance(struct, six.string_types):
        return [struct]
    else:
        try:
            # if iterable
            children = iter(struct)
        except TypeError:
            return [struct]

    collected = []
    for child in children:
        collected.extend(flatten(child))
    return collected
def new_task(name, cls, workflow_task, **kwargs):
    '''
    Instantiate a new task. Not supposed to be used by the end-user
    (use WorkflowTask.new_task() instead).

    Non-string values in kwargs are converted to strings (JSON where possible, ``str()``
    otherwise) before being passed to ``cls.from_str_params``. A ``SlurmInfo`` value is kept
    as is and also set as the ``slurminfo`` attribute of the new task.
    '''
    slurminfo = None
    # Iterate over a snapshot, since values of kwargs are replaced inside the loop.
    for key, val in list(kwargs.items()):
        # Handle non-string keys
        if not isinstance(key, string_types):
            raise Exception("Key in kwargs to new_task is not string. Must be string: %s" % key)
        # Handle non-string values
        if isinstance(val, sciluigi.slurm.SlurmInfo):
            slurminfo = val
            continue
        if isinstance(val, string_types):
            continue
        try:
            kwargs[key] = json.dumps(val)  # Force conversion into string
        except TypeError:
            kwargs[key] = str(val)

    kwargs.update(instance_name=name, workflow_task=workflow_task, slurminfo=slurminfo)
    newtask = cls.from_str_params(kwargs)
    if slurminfo is not None:
        newtask.slurminfo = slurminfo
    return newtask
def main():
    """
    Command line entry point: query the scheduler and print the matching jobs.

    ``--prefix`` takes precedence over ``--status``; without either nothing is printed.
    For every job its name and status are printed, followed by its dependencies grouped
    by status.
    """
    # The text is a description; passed positionally it would be taken as the program name.
    parser = argparse.ArgumentParser(
        description="luigi-grep is used to search for workflows using the luigi scheduler's json api")
    parser.add_argument(
        "--scheduler-host", default="localhost", help="hostname of the luigi scheduler")
    parser.add_argument(
        "--scheduler-port", default="8082", help="port of the luigi scheduler")
    parser.add_argument("--prefix", help="prefix of a task query to search for", default=None)
    parser.add_argument("--status", help="search for jobs with the given status", default=None)

    args = parser.parse_args()
    grep = LuigiGrep(args.scheduler_host, args.scheduler_port)

    results = []
    if args.prefix:
        results = grep.prefix_search(args.prefix)
    elif args.status:
        results = grep.status_search(args.status)

    for job in results:
        print("{name}: {status}, Dependencies:".format(name=job['name'], status=job['status']))
        for (status, dep_jobs) in job['deps_by_status'].items():
            print(" status={status}".format(status=status))
            # A distinct name keeps the outer loop variable from being shadowed.
            for dep_job in dep_jobs:
                print(" {job}".format(job=dep_job))
def no_unpicklable_properties(self):
    """
    Remove unpicklable properties before dump task and resume them after.

    This method could be called in subtask's dump method, to ensure unpicklable
    properties won't break dump.

    The affected attribute names are the values of
    ``luigi.worker.TaskProcess.forward_reporter_attributes``; each one present on ``self`` is
    replaced by a placeholder string and put back when the block is left, also on error.

    This method is a context-manager which can be called as below:

    .. code-block:: python

        class DummyTask(luigi.Task):

            def _dump(self):
                with self.no_unpicklable_properties():
                    pickle.dumps(self)

    """
    unpicklable_properties = tuple(luigi.worker.TaskProcess.forward_reporter_attributes.values())
    reserved_properties = {}
    for property_name in unpicklable_properties:
        if hasattr(self, property_name):
            reserved_properties[property_name] = getattr(self, property_name)
            setattr(self, property_name, 'placeholder_during_pickling')

    try:
        yield
    finally:
        # Restore even if the body raised, so the task never keeps the placeholders.
        for property_name, value in reserved_properties.items():
            setattr(self, property_name, value)
def run_job(self, job, tracking_url_callback=None):
    """
    Write the job's query to a temporary file and run it through the Hive command line.

    :param job: provides ``query()``, ``hiverc()``, ``hiveconfs()`` and ``set_tracking_url``.
    :param tracking_url_callback: deprecated and ignored apart from emitting a warning.
    :return: whatever ``luigi.contrib.hadoop.run_and_track_hadoop_job`` returns.
    """
    if tracking_url_callback is not None:
        warnings.warn("tracking_url_callback argument is deprecated, task.set_tracking_url is "
                      "used instead.", DeprecationWarning)

    self.prepare_outputs(job)
    with tempfile.NamedTemporaryFile() as query_file:
        query = job.query()
        if isinstance(query, unicode):
            query = query.encode('utf8')
        query_file.write(query)
        query_file.flush()

        arglist = load_hive_cmd() + ['-f', query_file.name]

        hiverc = job.hiverc()
        if hiverc:
            # A single rc file may be given as a plain string.
            rcfiles = [hiverc] if isinstance(hiverc, str) else hiverc
            for rcfile in rcfiles:
                arglist.extend(['-i', rcfile])

        if job.hiveconfs():
            for name, value in job.hiveconfs().items():
                arglist.extend(['--hiveconf', '{0}={1}'.format(name, value)])

        logger.info(arglist)
        return luigi.contrib.hadoop.run_and_track_hadoop_job(arglist, job.set_tracking_url)
def wrapper(*args, **kwargs):
    """
    Call ``fun`` with the settings from ``self.config`` applied to a newly obtained config.

    Sections that already exist are emptied first when ``self.replace_sections`` is set,
    otherwise the settings are merged into them. The configuration instance that was active
    before the call is always put back, also when applying the settings or ``fun`` raises.
    """
    import luigi.configuration
    orig_conf = luigi.configuration.get_config()
    try:
        # Clearing the cached instance makes get_config() hand out a different parser object
        # (presumably a freshly loaded one -- not visible from here).
        luigi.configuration.LuigiConfigParser._instance = None
        conf = luigi.configuration.get_config()
        for (section, settings) in self.config.items():
            if not conf.has_section(section):
                conf.add_section(section)
            elif self.replace_sections:
                conf.remove_section(section)
                conf.add_section(section)
            for (name, value) in settings.items():
                conf.set(section, name, value)
        return fun(*args, **kwargs)
    finally:
        luigi.configuration.LuigiConfigParser._instance = orig_conf
def _used_resources(self): used_resources = collections.defaultdict(int) if self._resources is not None: for task in self._state.get_active_tasks(): if task.status == RUNNING and task.resources: for resource, amount in six.iteritems(task.resources): used_resources[resource] += amount return used_resources
def _used_resources(self): used_resources = collections.defaultdict(int) if self._resources is not None: for task in self._state.get_active_tasks(status=RUNNING): if task.resources: for resource, amount in six.iteritems(task.resources): used_resources[resource] += amount return used_resources
def load(self):
    """
    Restore the scheduler state from the pickle at ``self._state_path``.

    When the file does not exist, or cannot be loaded, the current state is left untouched and
    a message is logged. After a successful load the state is applied with ``set_state``,
    ``self._status_tasks`` is rebuilt and state written in older formats is upgraded in place.
    """
    if not os.path.exists(self._state_path):
        logger.info(
            "No prior state file exists at %s. Starting with clean slate",
            self._state_path)
        return

    logger.info("Attempting to load state from %s", self._state_path)
    try:
        with open(self._state_path, 'rb') as fobj:
            # NOTE: pickle must only be used on trusted files; the state file is assumed to
            # have been written by this scheduler.
            state = pickle.load(fobj)
    except Exception:
        # Deliberately not BaseException: KeyboardInterrupt and SystemExit must propagate.
        logger.exception(
            "Error when loading state. Starting from clean slate.")
        return

    self.set_state(state)
    self._status_tasks = collections.defaultdict(dict)
    for task in self._tasks.values():
        self._status_tasks[task.status][task.id] = task

    # Convert from old format
    # TODO: this is really ugly, we need something more future-proof
    # Every time we add an attribute to the Worker or Task class, this
    # code needs to be updated

    # Compatibility since 2014-06-02
    for k, v in self._active_workers.items():
        if isinstance(v, float):
            # Only existing keys are reassigned, so the dict size is stable while iterating.
            self._active_workers[k] = Worker(worker_id=k, last_active=v)

    # Compatibility since 2015-05-28
    if any(not hasattr(w, 'tasks') for w in self._active_workers.values()):
        # If you load from an old format where Workers don't contain tasks.
        for worker in self._active_workers.values():
            worker.tasks = set()
        for task in self._tasks.values():
            for worker_id in task.workers:
                self._active_workers[worker_id].tasks.add(task)

    # Compatibility since 2015-04-28
    if any(not hasattr(t, 'disable_hard_timeout') for t in self._tasks.values()):
        for t in self._tasks.values():
            t.disable_hard_timeout = None
def to_str_params(self):
    '''
    Convert all parameters to a str->str hash.

    Each value in ``self.param_kwargs`` is serialized by the parameter object of the same name.
    '''
    declared = dict(self.get_params())
    serialized = {}
    for name in self.param_kwargs:
        serialized[name] = declared[name].serialize(self.param_kwargs[name])
    return serialized
def _get_param_visibilities(self): param_visibilities = {} params = dict(self.get_params()) for param_name, param_value in six.iteritems(self.param_kwargs): if params[param_name].visibility != ParameterVisibility.PRIVATE: param_visibilities[param_name] = params[param_name].visibility.serialize() return param_visibilities
def wrapper(*args, **kwargs):
    """
    Call ``fun`` with a temporary configuration derived from the current one.

    The sections of the current configuration are copied into a dict, passed through
    ``self._make_dict`` and written into a new ``LuigiConfigParser`` which replaces the global
    instance during the call. The original instance is always restored, also when building
    the temporary configuration or ``fun`` itself raises.
    """
    import luigi.configuration
    orig_conf = luigi.configuration.LuigiConfigParser.instance()
    new_conf = luigi.configuration.LuigiConfigParser()
    try:
        luigi.configuration.LuigiConfigParser._instance = new_conf
        orig_dict = {
            k: dict(orig_conf.items(k))
            for k in orig_conf.sections()
        }
        new_dict = self._make_dict(orig_dict)
        for (section, settings) in new_dict.items():
            new_conf.add_section(section)
            for (name, value) in settings.items():
                new_conf.set(section, name, value)
        return fun(*args, **kwargs)
    finally:
        luigi.configuration.LuigiConfigParser._instance = orig_conf
def dict_to_recordfile(filehandle, records):
    '''
    Convert a dictionary to a recordfile.

    Every key/value pair of ``records`` is written to ``filehandle`` as one two-column row,
    separated by ``RECORDFILE_DELIMITER``.
    '''
    writer = csv.writer(filehandle, delimiter=RECORDFILE_DELIMITER, skipinitialspace=True)
    writer.writerows([[key, val] for key, val in records.items()])
def get_param_values(cls, params, args, kwargs):
    """
    Get the values of the parameters from the args and kwargs.

    Positional arguments are matched against the positional parameters in declaration order,
    then keyword arguments are applied, and finally every parameter that is still unset takes
    its task value.

    :param params: list of (param_name, Parameter).
    :param args: positional arguments
    :param kwargs: keyword arguments.
    :returns: list of `(name, value)` tuples, one for each parameter.
    :raises parameter.UnknownParameterException: too many positional arguments, or an unknown
        keyword argument.
    :raises parameter.DuplicateParameterException: a keyword repeats a positional argument.
    :raises parameter.MissingParameterException: a parameter has no value at all.
    """
    def as_hashable(value):
        """ Make tuples out of lists and sets to allow hashing """
        if isinstance(value, (list, set)):
            return tuple(value)
        return value

    task_name = cls.task_family
    declared = dict(params)
    values = {}

    # In case any exceptions are thrown, create a helpful description of how the Task was invoked
    # TODO: should we detect non-reprable arguments? These will lead to mysterious errors
    exc_desc = "%s[args=%s, kwargs=%s]" % (task_name, args, kwargs)

    # Fill in the positional arguments
    positional = [(name, param) for name, param in params if param.positional]
    for index, value in enumerate(args):
        if index >= len(positional):
            raise parameter.UnknownParameterException(
                "%s: takes at most %d parameters (%d given)" % (exc_desc, len(positional), len(args))
            )
        name, _ = positional[index]
        values[name] = value

    # Then the optional arguments
    for name, value in kwargs.items():
        if name in values:
            raise parameter.DuplicateParameterException(
                "%s: parameter %s was already set as a positional parameter" % (exc_desc, name)
            )
        if name not in declared:
            raise parameter.UnknownParameterException("%s: unknown parameter %s" % (exc_desc, name))
        values[name] = value

    # Then use the defaults for anything not filled in
    for name, param in params:
        if name in values:
            continue
        if not param.has_task_value(task_name, name):
            raise parameter.MissingParameterException(
                "%s: requires the '%s' parameter to be set" % (exc_desc, name)
            )
        values[name] = param.task_value(task_name, name)

    # Sort it by the correct order and make a list
    return [(name, as_hashable(values[name])) for name, _ in params]
def _get_param_visibilities(self): param_visibilities = {} params = dict(self.get_params()) for param_name, param_value in six.iteritems(self.param_kwargs): if params[param_name].visibility != ParameterVisibility.PRIVATE: param_visibilities[param_name] = params[ param_name].visibility.serialize() return param_visibilities
def most_common(items):
    """
    Wanted functionality from Counters (new in Python 2.7).

    :return: the ``(item, count)`` pair with the highest count.
    :raises ValueError: when ``items`` is empty.
    """
    tally = {}
    for item in items:
        tally[item] = tally.get(item, 0) + 1
    by_count = operator.itemgetter(1)
    return max(tally.items(), key=by_count)
def get_all_outputs(self):
    """
    Retrieve a list of all task outputs (i.e. those that start with 'out_')

    :return: a list of the values of all instance attributes whose name starts with 'out_'
    """
    outputs = []
    for attr_name, attr_val in self.__dict__.items():
        if attr_name.startswith('out_'):
            outputs.append(attr_val)
    return outputs
def test_arglist(self):
    """
    The argument list contains the query file after ``-f`` and every hivevar / hiveconf
    as ``name=value`` directly after its flag.
    """
    task = self.task_class(param='foo')
    f_name = 'my_file'
    arglist = luigi.contrib.hive.HiveQueryRunner().get_arglist(f_name, task)

    self.assertEqual(arglist[arglist.index('-f') + 1], f_name)

    def assert_flagged(flag, settings):
        # Each rendered setting must be immediately preceded by its flag.
        rendered = ['{}={}'.format(k, v) for k, v in settings.items()]
        for entry in rendered:
            position = arglist.index(entry)
            self.assertEqual(arglist[position - 1], flag)

    assert_flagged('--hivevar', task.hivevars())
    assert_flagged('--hiveconf', task.hiveconfs())
def get_arglist(self, f_name, job):
    """
    Build the Hive command line for running the query file ``f_name``.

    :param f_name: path passed after ``-f``.
    :param job: provides ``hiverc()``, ``hiveconfs()`` and ``hivevars()``.
    :return: the argument list, which is also logged.
    """
    arglist = load_hive_cmd() + ['-f', f_name]

    hiverc = job.hiverc()
    if hiverc:
        # A single rc file may be given as a plain string.
        rcfiles = [hiverc] if isinstance(hiverc, str) else hiverc
        for rcfile in rcfiles:
            arglist.extend(['-i', rcfile])

    hiveconfs = job.hiveconfs()
    if hiveconfs:
        for name, value in hiveconfs.items():
            arglist.extend(['--hiveconf', '{0}={1}'.format(name, value)])

    hivevars = job.hivevars()
    if hivevars:
        for name, value in hivevars.items():
            arglist.extend(['--hivevar', '{0}={1}'.format(name, value)])

    logger.info(arglist)
    return arglist
def partition_spec(self, partition):
    """
    Turns a dict into the a Hive partition specification string.

    The entries are rendered as ``key='value'``, ordered by key and joined with commas.
    """
    clauses = []
    for key in sorted(partition):
        clauses.append("{0}='{1}'".format(key, partition[key]))
    return ','.join(clauses)
def _build_pig_cmd(self): opts = self.pig_options() for k, v in six.iteritems(self.pig_parameters()): opts.append("-p") opts.append("%s=%s" % (k, v)) if self.pig_properties(): with open('pig_property_file', 'w') as prop_file: prop_file.writelines([ "%s=%s%s" % (k, v, os.linesep) for (k, v) in six.iteritems(self.pig_properties()) ]) opts.append('-propertyFile') opts.append('pig_property_file') cmd = [self.pig_command_path()] + opts + ["-f", self.pig_script_path()] return cmd
def _flush_batch_incr_counter(self): """ Increments any unflushed counter values. """ for key, count in six.iteritems(self._counter_dict): if count == 0: continue args = list(key) + [count] self._incr_counter(*args)
def _has_resources(self, needed_resources, used_resources): if needed_resources is None: return True available_resources = self._resources or {} for resource, amount in six.iteritems(needed_resources): if amount + used_resources[resource] > available_resources.get(resource, 1): return False return True
def run(self):
    """
    Count how often each key occurs in the inputs and write ``key,count`` lines to the output.

    Every input line must consist of exactly three whitespace separated fields, of which the
    second is the key.
    """
    count = defaultdict(int)

    for source in self.input():
        with source.open('r') as in_file:
            for line in in_file:
                fields = line.strip().split()
                _, key, _ = fields
                count[key] += 1

    with self.output().open('w') as out_file:
        for key in count:
            out_file.write('{},{}\n'.format(key, count[key]))
def get_all_params(cls):
    """
    Compiles and returns all parameters for all :py:class:`Task`.

    Ambiguous registry entries are skipped.

    :return: a generator of tuples ``(task_name, not use_cmdline_section, param_name,
             param_obj)`` (TODO: we should make this more elegant)
    """
    registry = cls._get_reg()
    for task_name, task_cls in registry.items():
        if task_cls != cls.AMBIGUOUS_CLASS:
            for param_name, param_obj in task_cls.get_params():
                is_without_section = not task_cls.use_cmdline_section
                yield task_name, is_without_section, param_name, param_obj
def dict_to_recordfile(filehandle, records):
    """
    Convert a dictionary to a recordfile.

    Writes one ``key``/``value`` row per entry of ``records`` to ``filehandle``, using
    ``RECORDFILE_DELIMITER`` as the column separator.
    """
    rows = [list(entry) for entry in records.items()]
    writer = csv.writer(filehandle, delimiter=RECORDFILE_DELIMITER, skipinitialspace=True)
    writer.writerows(rows)
def to_str_params(self, only_significant=False):
    """
    Convert all parameters to a str->str hash.

    :param only_significant: when true, parameters whose ``significant`` flag is false are
                             left out.
    :return: dict mapping each included parameter name to its serialized value.
    """
    declared = dict(self.get_params())
    serialized = {}
    for name, value in self.param_kwargs.items():
        if only_significant and not declared[name].significant:
            continue
        serialized[name] = declared[name].serialize(value)
    return serialized
def resources(self):
    '''
    get total resources and available ones

    :return: a ``defaultdict`` mapping each configured resource to
             ``{'total': <configured amount>, 'used': <amount in use, 0 if none>}``.
             Empty when no resources are configured.
    '''
    used_resources = self._used_resources()
    ret = collections.defaultdict(dict)
    # self._resources may be None when nothing is configured.
    for resource, total in (self._resources or {}).items():
        ret[resource]['total'] = total
        ret[resource]['used'] = used_resources.get(resource, 0)
    return ret
def args(self):
    """
    Returns an array of args to pass to the job.

    For every entry of ``requires_hadoop()`` an ``--<name>`` option is emitted, followed by the
    output paths of the flattened tasks. ``--output`` with ``self.output()`` and the
    ``job_args()`` come last.
    """
    arglist = []
    for option, tasks in self.requires_hadoop().items():
        arglist.append('--' + option)
        arglist += [task.output().path for task in flatten(tasks)]
    arglist += ['--output', self.output()]
    arglist += self.job_args()
    return arglist
def find_all_by_parameters(self, task_name, session=None, **task_params):
    """
    Find tasks with the given task_name and the same parameters as the kwargs.

    The filtering on parameters is part of the query itself: one aliased join on
    ``TaskParameter`` is added per keyword argument. Results are yielded ordered by event
    timestamp.
    """
    with self._session(session) as session:
        query = session.query(TaskRecord).join(TaskEvent)
        query = query.filter(TaskRecord.name == task_name)

        for (param_name, param_value) in task_params.items():
            alias = sqlalchemy.orm.aliased(TaskParameter)
            query = query.join(alias)
            query = query.filter(alias.name == param_name, alias.value == param_value)

        for task in query.order_by(TaskEvent.ts):
            # Sanity check
            assert all(
                k in task.parameters and v == str(task.parameters[k].value)
                for (k, v) in task_params.items())

            yield task
def _key(self, task_name, family, unbatched_args): if self._config.batch_mode == 'all': return task_name elif self._config.batch_mode == 'family': return family elif self._config.batch_mode == 'unbatched_params': param_str = six.u(', ').join(six.u('{}={}').format(*kv) for kv in six.iteritems(unbatched_args)) return six.u('{}({})').format(family, param_str) else: raise ValueError('Unknown batch mode for batch notifier: {}'.format( self._config.batch_mode))
def _upstream_tasks(self): """ Extract upstream tasks from the TargetInfo objects or functions returning those (or lists of both the earlier) for use in luigi's requires() method. """ upstream_tasks = [] for attrname, attrval in iteritems(self.__dict__): if 'in_' == attrname[0:3]: upstream_tasks = self._parse_inputitem(attrval, upstream_tasks) return upstream_tasks
def run(self):
    """
    Count the lines per artist over all inputs and write tab separated ``artist``/``count``
    lines to the output.

    Every input line must consist of exactly three whitespace separated fields, of which the
    second is the artist.
    """
    artist_count = defaultdict(int)

    for source in self.input():
        with source.open('r') as in_file:
            for line in in_file:
                fields = line.strip().split()
                _, artist, _ = fields
                artist_count[artist] += 1

    with self.output().open('w') as out_file:
        for artist in artist_count:
            out_file.write('{}\t{}\n'.format(artist, artist_count[artist]))
def find_all_by_parameters(self, task_name, session=None, **task_params):
    """
    Find tasks with the given task_name and the same parameters as the kwargs.

    All records with that name are fetched ordered by event timestamp; the comparison against
    ``task_params`` is done here, on the string form of the stored parameter values.
    """
    def matches(task):
        for name, expected in task_params.items():
            if name not in task.parameters:
                return False
            if expected != str(task.parameters[name].value):
                return False
        return True

    with self._session(session) as session:
        query = session.query(TaskRecord).join(TaskEvent)
        query = query.filter(TaskRecord.name == task_name).order_by(TaskEvent.ts)
        for task in query.all():
            if matches(task):
                yield task
def _apply_regex(self, regex, input): import re re_match = re.match(regex, input) if re_match: kwargs = {} has_val = False for k, v in six.iteritems(re_match.groupdict(default="0")): val = int(v) has_val = has_val or val != 0 kwargs[k] = val if has_val: return datetime.timedelta(**kwargs)