Example #1
    def _run_get_new_deps(self):
        # set task callbacks before running
        for reporter_attr, task_attr in six.iteritems(self.forward_reporter_callbacks):
            setattr(self.task, task_attr, getattr(self.status_reporter, reporter_attr))

        task_gen = self.task.run()

        # reset task callbacks
        for reporter_attr, task_attr in six.iteritems(self.forward_reporter_callbacks):
            setattr(self.task, task_attr, None)

        if not isinstance(task_gen, types.GeneratorType):
            return None

        next_send = None
        while True:
            try:
                if next_send is None:
                    requires = six.next(task_gen)
                else:
                    requires = task_gen.send(next_send)
            except StopIteration:
                return None

            new_req = flatten(requires)
            if all(t.complete() for t in new_req):
                next_send = getpaths(requires)
            else:
                new_deps = [(t.task_module, t.task_family, t.to_str_params())
                            for t in new_req]
                return new_deps
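Note: the generator handling above is what backs luigi's dynamic dependencies, where a task's run() yields further requirements and is resumed with their outputs. A minimal sketch of a task written that way (both task classes here are illustrative, not taken from the snippet):

import luigi


class Fetch(luigi.Task):
    # Hypothetical upstream task, used only to make the sketch self-contained.
    date = luigi.Parameter()

    def output(self):
        return luigi.LocalTarget('raw_%s.txt' % self.date)

    def run(self):
        with self.output().open('w') as f:
            f.write('raw data for %s\n' % self.date)


class Process(luigi.Task):
    date = luigi.Parameter()

    def output(self):
        return luigi.LocalTarget('processed_%s.txt' % self.date)

    def run(self):
        # Yielding a task suspends run(); the worker sends back the yielded
        # task's output (via getpaths) once that requirement is complete.
        fetched = yield Fetch(date=self.date)
        with fetched.open('r') as src, self.output().open('w') as dst:
            for line in src:
                dst.write(line.upper())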
Example #2
File: pig.py Project: 01-/luigi
    def _build_pig_cmd(self):
        opts = self.pig_options()

        def line(k, v):
            return ('%s=%s%s' % (k, v, os.linesep)).encode('utf-8')

        with tempfile.NamedTemporaryFile() as param_file, tempfile.NamedTemporaryFile() as prop_file:
            if self.pig_parameters():
                items = six.iteritems(self.pig_parameters())
                param_file.writelines(line(k, v) for (k, v) in items)
                param_file.flush()
                opts.append('-param_file')
                opts.append(param_file.name)

            if self.pig_properties():
                items = six.iteritems(self.pig_properties())
                prop_file.writelines(line(k, v) for (k, v) in items)
                prop_file.flush()
                opts.append('-propertyFile')
                opts.append(prop_file.name)

            cmd = [self.pig_command_path()] + opts + ["-f", self.pig_script_path()]

            logger.info(subprocess.list2cmdline(cmd))
            yield cmd
Example #3
    def load(self):
        if os.path.exists(self._state_path):
            logger.info("Attempting to load state from %s", self._state_path)
            try:
                with open(self._state_path, 'rb') as fobj:
                    state = pickle.load(fobj)
            except BaseException:
                logger.exception("Error when loading state. Starting from clean slate.")
                return

            self._tasks, self._active_workers = state
            self._status_tasks = collections.defaultdict(dict)
            for task in six.itervalues(self._tasks):
                self._status_tasks[task.status][task.id] = task

            # Convert from old format
            # TODO: this is really ugly, we need something more future-proof
            # Every time we add an attribute to the Worker class, this code needs to be updated
            for k, v in six.iteritems(self._active_workers):
                if isinstance(v, float):
                    self._active_workers[k] = Worker(worker_id=k, last_active=v)

            if any(not hasattr(w, 'tasks') for k, w in six.iteritems(self._active_workers)):
                # If you load from an old format where Workers don't contain tasks.
                for k, worker in six.iteritems(self._active_workers):
                    worker.tasks = set()
                for task in six.itervalues(self._tasks):
                    for worker_id in task.workers:
                        self._active_workers[worker_id].tasks.add(task)
        else:
            logger.info("No prior state file exists at %s. Starting with clean slate", self._state_path)
Example #4
    def _task_expl_groups(self, expls):
        if not self._config.group_by_error_messages:
            return [((task,), msg) for task, msg in six.iteritems(expls)]

        groups = collections.defaultdict(list)
        for task, msg in six.iteritems(expls):
            groups[msg].append(task)
        return [(tasks, msg) for msg, tasks in six.iteritems(groups)]
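For illustration, the grouping above collapses tasks that failed with the same message into one entry. A standalone sketch of the same idea with plain dicts (the task names and messages are made up):

import collections


def group_by_message(expls):
    # expls maps task -> error message; invert it so identical messages share a group.
    groups = collections.defaultdict(list)
    for task, msg in expls.items():
        groups[msg].append(task)
    return [(tuple(tasks), msg) for msg, tasks in groups.items()]


expls = {'TaskA()': 'disk full', 'TaskB()': 'disk full', 'TaskC()': 'timeout'}
print(group_by_message(expls))
# [(('TaskA()', 'TaskB()'), 'disk full'), (('TaskC()',), 'timeout')]  (Python 3.7+ dict order)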
Example #5
 def _forward_attributes(self):
     # forward configured attributes to the task
     for reporter_attr, task_attr in six.iteritems(self.forward_reporter_attributes):
         setattr(self.task, task_attr, getattr(self.status_reporter, reporter_attr))
     try:
         yield self
     finally:
         # reset attributes again
         for reporter_attr, task_attr in six.iteritems(self.forward_reporter_attributes):
             setattr(self.task, task_attr, None)
Example #6
 def wrapper(*args, **kwargs):
     import luigi.configuration
     orig_conf = luigi.configuration.get_config()
     luigi.configuration.LuigiConfigParser._instance = None
     conf = luigi.configuration.get_config()
     for (section, settings) in six.iteritems(self.config):
         if not conf.has_section(section):
             conf.add_section(section)
         for (name, value) in six.iteritems(settings):
             conf.set(section, name, value)
     try:
         return fun(*args, **kwargs)
     finally:
         luigi.configuration.LuigiConfigParser._instance = orig_conf
Example #7
 def wrapper(*args, **kwargs):
     import luigi.configuration
     orig_conf = luigi.configuration.LuigiConfigParser.instance()
     new_conf = luigi.configuration.LuigiConfigParser()
     luigi.configuration.LuigiConfigParser._instance = new_conf
     orig_dict = {k: dict(orig_conf.items(k)) for k in orig_conf.sections()}
     new_dict = self._make_dict(orig_dict)
     for (section, settings) in six.iteritems(new_dict):
         new_conf.add_section(section)
         for (name, value) in six.iteritems(settings):
             new_conf.set(section, name, value)
     try:
         return fun(*args, **kwargs)
     finally:
         luigi.configuration.LuigiConfigParser._instance = orig_conf
Example #8
    def _build_pig_cmd(self):
        opts = self.pig_options()

        for k, v in six.iteritems(self.pig_parameters()):
            opts.append("-p")
            opts.append("%s=%s" % (k, v))

        if self.pig_properties():
            with open('pig_property_file', 'w') as prop_file:
                prop_file.writelines(["%s=%s%s" % (k, v, os.linesep) for (k, v) in six.iteritems(self.pig_properties())])
            opts.append('-propertyFile')
            opts.append('pig_property_file')

        cmd = [self.pig_command_path()] + opts + ["-f", self.pig_script_path()]
        return cmd
Example #9
 def _batchable_tasks(cls):
     for family, task_class in six.iteritems(cls._get_reg()):
         if task_class == cls.AMBIGUOUS_CLASS:
             continue
         batch_param_names = task_class.batch_param_names()
         if batch_param_names:
             yield family, task_class, batch_param_names
Example #10
    def no_unpicklable_properties(self):
        """
        Remove unpicklable properties before dumping the task and restore them afterwards.

        This method can be called in a subclass's dump method to ensure that
        unpicklable properties won't break the dump.

        This method is a context manager and can be used as below:

        .. code-block:: python

            class DummyTask(luigi.Task):

                def _dump(self):
                    with self.no_unpicklable_properties():
                        pickle.dumps(self)

        """
        unpicklable_properties = ('set_tracking_url', 'set_status_message', 'set_progress_percentage')
        reserved_properties = {}
        for property_name in unpicklable_properties:
            if hasattr(self, property_name):
                reserved_properties[property_name] = getattr(self, property_name)
                setattr(self, property_name, 'placeholder_during_pickling')

        yield

        for property_name, value in six.iteritems(reserved_properties):
            setattr(self, property_name, value)
Example #11
    def clone(self, cls=None, **kwargs):
        """
        Creates a new instance from an existing instance where some of the args have changed.

        There are at least two scenarios where this is useful (see test/clone_test.py):

        * it removes a lot of boilerplate when you have recursive dependencies and lots of args
        * it helps when there is task inheritance and some logic lives on the base class

        :param cls:
        :param kwargs:
        :return:
        """
        k = self.param_kwargs.copy()
        k.update(six.iteritems(kwargs))

        if cls is None:
            cls = self.__class__

        new_k = {}
        for param_name, param_class in cls.get_params():
            if param_name in k:
                new_k[param_name] = k[param_name]

        return cls(**new_k)
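A hedged usage sketch of clone(); the two task classes below are illustrative and not taken from test/clone_test.py:

import luigi


class Extract(luigi.Task):
    date = luigi.DateParameter()
    source = luigi.Parameter(default='db')


class Report(luigi.Task):
    date = luigi.DateParameter()
    source = luigi.Parameter(default='db')

    def requires(self):
        # clone() copies the parameters Extract also declares (date, source),
        # so recursive dependencies need no hand-written argument passing.
        return self.clone(Extract)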
Example #12
    def _purge_children(self):
        """
        Find dead children and put a response on the result queue.

        :return:
        """
        for task_id, p in six.iteritems(self._running_tasks):
            task_prefix = 'Task {} (pid {})'.format(task_id, p.pid)
            if not p.is_alive() and p.exitcode:
                error_msg = '{} died unexpectedly with exit code {}'.format(task_prefix, p.exitcode)
                p.task.trigger_event(Event.PROCESS_FAILURE, p.task, error_msg)
            elif p.timeout_time is not None and time.time() > float(p.timeout_time) and p.is_alive():
                if p.terminate():
                    error_msg = '{} timed out after {} seconds and was terminated.'.format(
                        task_prefix, p.task.worker_timeout)
                    p.task.trigger_event(Event.TIMEOUT, p.task, error_msg)
                else:
                    sec_since_timeout = \
                        int(round(time.time() - p.timeout_time)) \
                        if p.timeout_time else 0
                    error_msg = '{} timed out after {} seconds but failed to terminate, {} seconds overdue'.format(
                        task_prefix, int(p.task.worker_timeout),
                        sec_since_timeout)
            else:
                continue

            logger.info(error_msg)
            if not p.is_alive():
                self._task_result_queue.put((task_id, FAILED, error_msg, [], []))
Example #13
    def no_unpicklable_properties(self):
        """
        Remove unpicklable properties before dumping the task and restore them afterwards.

        This method can be called in a subclass's dump method to ensure that
        unpicklable properties won't break the dump.

        This method is a context manager and can be used as below:

        .. code-block:: python

            class DummyTask(luigi.Task):

                def _dump(self):
                    with self.no_unpicklable_properties():
                        pickle.dumps(self)

        """
        unpicklable_properties = tuple(
            luigi.worker.TaskProcess.forward_reporter_attributes.values())
        reserved_properties = {}
        for property_name in unpicklable_properties:
            if hasattr(self, property_name):
                reserved_properties[property_name] = getattr(
                    self, property_name)
                setattr(self, property_name, 'placeholder_during_pickling')

        yield

        for property_name, value in six.iteritems(reserved_properties):
            setattr(self, property_name, value)
Example #14
    def get_param_values(cls, params, args, kwargs):
        """
        Get the values of the parameters from the args and kwargs.

        :param params: list of (param_name, Parameter).
        :param args: positional arguments
        :param kwargs: keyword arguments.
        :returns: list of `(name, value)` tuples, one for each parameter.
        """
        result = {}

        params_dict = dict(params)

        task_family = cls.get_task_family()

        # In case any exceptions are thrown, create a helpful description of how the Task was invoked
        # TODO: should we detect non-reprable arguments? These will lead to mysterious errors
        exc_desc = '%s[args=%s, kwargs=%s]' % (task_family, args, kwargs)

        # Fill in the positional arguments
        positional_params = [(n, p) for n, p in params if p.positional]
        for i, arg in enumerate(args):
            if i >= len(positional_params):
                raise parameter.UnknownParameterException(
                    '%s: takes at most %d parameters (%d given)' %
                    (exc_desc, len(positional_params), len(args)))
            param_name, param_obj = positional_params[i]
            result[param_name] = param_obj.normalize(arg)

        # Then the keyword arguments
        for param_name, arg in six.iteritems(kwargs):
            if param_name in result:
                raise parameter.DuplicateParameterException(
                    '%s: parameter %s was already set as a positional parameter'
                    % (exc_desc, param_name))
            if param_name not in params_dict:
                raise parameter.UnknownParameterException(
                    '%s: unknown parameter %s' % (exc_desc, param_name))
            result[param_name] = params_dict[param_name].normalize(arg)

        # Then use the defaults for anything not filled in
        for param_name, param_obj in params:
            if param_name not in result:
                if not param_obj.has_task_value(task_family, param_name):
                    raise parameter.MissingParameterException(
                        "%s: requires the '%s' parameter to be set" %
                        (exc_desc, param_name))
                result[param_name] = param_obj.task_value(
                    task_family, param_name)

        def list_to_tuple(x):
            """ Make tuples out of lists and sets to allow hashing """
            if isinstance(x, list) or isinstance(x, set):
                return tuple(x)
            else:
                return x

        # Sort it by the correct order and make a list
        return [(param_name, list_to_tuple(result[param_name]))
                for param_name, param_obj in params]
Example #15
def flatten(struct):
    """
    Creates a flat list of all items in structured output (dicts, lists, items):

    .. code-block:: python

        >>> sorted(flatten({'a': 'foo', 'b': 'bar'}))
        ['bar', 'foo']
        >>> sorted(flatten(['foo', ['bar', 'troll']]))
        ['bar', 'foo', 'troll']
        >>> flatten('foo')
        ['foo']
        >>> flatten(42)
        [42]
    """
    if struct is None:
        return []
    flat = []
    if isinstance(struct, dict):
        for _, result in six.iteritems(struct):
            flat += flatten(result)
        return flat
    if isinstance(struct, six.string_types):
        return [struct]

    try:
        # if iterable
        for result in struct:
            flat += flatten(result)
        return flat
    except TypeError:
        pass

    return [struct]
Example #16
def flatten(struct):
    """
    Creates a flat list of all items in structured output (dicts, lists, items):

    .. code-block:: python

        >>> sorted(flatten({'a': 'foo', 'b': 'bar'}))
        ['bar', 'foo']
        >>> sorted(flatten(['foo', ['bar', 'troll']]))
        ['bar', 'foo', 'troll']
        >>> flatten('foo')
        ['foo']
        >>> flatten(42)
        [42]
    """
    if struct is None:
        return []
    flat = []
    if isinstance(struct, dict):
        for _, result in six.iteritems(struct):
            flat += flatten(result)
        return flat
    if isinstance(struct, six.string_types):
        return [struct]

    try:
        # if iterable
        iterator = iter(struct)
    except TypeError:
        return [struct]

    for result in iterator:
        flat += flatten(result)
    return flat
Example #17
def new_task(name, cls, workflow_task, **kwargs):
    '''
    Instantiate a new task. Not supposed to be used by the end-user
    (use WorkflowTask.new_task() instead).
    '''
    slurminfo = None
    for key, val in [(key, val) for key, val in iteritems(kwargs)]:
        # Handle non-string keys
        if not isinstance(key, string_types):
            raise Exception("Key in kwargs to new_task is not string. Must be string: %s" % key)
        # Handle non-string values
        if isinstance(val, sciluigi.slurm.SlurmInfo):
            slurminfo = val
            kwargs[key] = val
        elif not isinstance(val, string_types):
            try:
                kwargs[key] = json.dumps(val) # Force conversion into string
            except TypeError:
                kwargs[key] = str(val)
    kwargs['instance_name'] = name
    kwargs['workflow_task'] = workflow_task
    kwargs['slurminfo'] = slurminfo
    newtask = cls.from_str_params(kwargs)
    if slurminfo is not None:
        newtask.slurminfo = slurminfo
    return newtask
Example #18
    def clone(self, cls=None, **kwargs):
        """
        Creates a new instance from an existing instance where some of the args have changed.

        There are at least two scenarios where this is useful (see test/clone_test.py):

        * it removes a lot of boilerplate when you have recursive dependencies and lots of args
        * it helps when there is task inheritance and some logic lives on the base class

        :param cls:
        :param kwargs:
        :return:
        """
        k = self.param_kwargs.copy()
        k.update(six.iteritems(kwargs))

        if cls is None:
            cls = self.__class__

        new_k = {}
        for param_name, param_class in cls.get_params():
            if param_name in k:
                new_k[param_name] = k[param_name]

        return cls(**new_k)
Example #19
def main():
    parser = argparse.ArgumentParser(
        "luigi-grep is used to search for workflows using the luigi scheduler's json api")
    parser.add_argument(
        "--scheduler-host", default="localhost", help="hostname of the luigi scheduler")
    parser.add_argument(
        "--scheduler-port", default="8082", help="port of the luigi scheduler")
    parser.add_argument("--prefix", help="prefix of a task query to search for", default=None)
    parser.add_argument("--status", help="search for jobs with the given status", default=None)

    args = parser.parse_args()
    grep = LuigiGrep(args.scheduler_host, args.scheduler_port)

    results = []
    if args.prefix:
        results = grep.prefix_search(args.prefix)
    elif args.status:
        results = grep.status_search(args.status)

    for job in results:
        print("{name}: {status}, Dependencies:".format(name=job['name'], status=job['status']))
        for (status, jobs) in six.iteritems(job['deps_by_status']):
            print("  status={status}".format(status=status))
            for job in jobs:
                print("    {job}".format(job=job))
Example #20
    def no_unpicklable_properties(self):
        """
        Remove unpicklable properties before dumping the task and restore them afterwards.

        This method can be called in a subclass's dump method to ensure that
        unpicklable properties won't break the dump.

        This method is a context manager and can be used as below:

        .. code-block:: python

            class DummyTask(luigi.Task):

                def _dump(self):
                    with self.no_unpicklable_properties():
                        pickle.dumps(self)

        """
        unpicklable_properties = tuple(luigi.worker.TaskProcess.forward_reporter_attributes.values())
        reserved_properties = {}
        for property_name in unpicklable_properties:
            if hasattr(self, property_name):
                reserved_properties[property_name] = getattr(self, property_name)
                setattr(self, property_name, 'placeholder_during_pickling')

        yield

        for property_name, value in six.iteritems(reserved_properties):
            setattr(self, property_name, value)
Example #21
    def run_job(self, job, tracking_url_callback=None):
        if tracking_url_callback is not None:
            warnings.warn("tracking_url_callback argument is deprecated, task.set_tracking_url is "
                          "used instead.", DeprecationWarning)

        self.prepare_outputs(job)
        with tempfile.NamedTemporaryFile() as f:
            query = job.query()
            if isinstance(query, unicode):
                query = query.encode('utf8')
            f.write(query)
            f.flush()
            arglist = load_hive_cmd() + ['-f', f.name]
            hiverc = job.hiverc()
            if hiverc:
                if isinstance(hiverc, str):
                    hiverc = [hiverc]
                for rcfile in hiverc:
                    arglist += ['-i', rcfile]
            if job.hiveconfs():
                for k, v in six.iteritems(job.hiveconfs()):
                    arglist += ['--hiveconf', '{0}={1}'.format(k, v)]

            logger.info(arglist)
            return luigi.contrib.hadoop.run_and_track_hadoop_job(arglist, job.set_tracking_url)
Example #22
 def wrapper(*args, **kwargs):
     import luigi.configuration
     orig_conf = luigi.configuration.get_config()
     luigi.configuration.LuigiConfigParser._instance = None
     conf = luigi.configuration.get_config()
     for (section, settings) in six.iteritems(self.config):
         if not conf.has_section(section):
             conf.add_section(section)
         elif self.replace_sections:
             conf.remove_section(section)
             conf.add_section(section)
         for (name, value) in six.iteritems(settings):
             conf.set(section, name, value)
     try:
         return fun(*args, **kwargs)
     finally:
         luigi.configuration.LuigiConfigParser._instance = orig_conf
Example #23
 def _used_resources(self):
     used_resources = collections.defaultdict(int)
     if self._resources is not None:
         for task in self._state.get_active_tasks():
             if task.status == RUNNING and task.resources:
                 for resource, amount in six.iteritems(task.resources):
                     used_resources[resource] += amount
     return used_resources
Example #24
 def _used_resources(self):
     used_resources = collections.defaultdict(int)
     if self._resources is not None:
         for task in self._state.get_active_tasks(status=RUNNING):
             if task.resources:
                 for resource, amount in six.iteritems(task.resources):
                     used_resources[resource] += amount
     return used_resources
Example #25
    def load(self):
        if os.path.exists(self._state_path):
            logger.info("Attempting to load state from %s", self._state_path)
            try:
                with open(self._state_path, 'rb') as fobj:
                    state = pickle.load(fobj)
            except BaseException:
                logger.exception(
                    "Error when loading state. Starting from clean slate.")
                return

            self.set_state(state)
            self._status_tasks = collections.defaultdict(dict)
            for task in six.itervalues(self._tasks):
                self._status_tasks[task.status][task.id] = task

            # Convert from old format
            # TODO: this is really ugly, we need something more future-proof
            # Every time we add an attribute to the Worker or Task class, this
            # code needs to be updated

            # Compatibility since 2014-06-02
            for k, v in six.iteritems(self._active_workers):
                if isinstance(v, float):
                    self._active_workers[k] = Worker(worker_id=k,
                                                     last_active=v)

            # Compatibility since 2015-05-28
            if any(not hasattr(w, 'tasks')
                   for k, w in six.iteritems(self._active_workers)):
                # If you load from an old format where Workers don't contain tasks.
                for k, worker in six.iteritems(self._active_workers):
                    worker.tasks = set()
                for task in six.itervalues(self._tasks):
                    for worker_id in task.workers:
                        self._active_workers[worker_id].tasks.add(task)

            # Compatibility since 2015-04-28
            if any(not hasattr(t, 'disable_hard_timeout')
                   for t in six.itervalues(self._tasks)):
                for t in six.itervalues(self._tasks):
                    t.disable_hard_timeout = None
        else:
            logger.info(
                "No prior state file exists at %s. Starting with clean slate",
                self._state_path)
Example #26
    def to_str_params(self):
        ''' Convert all parameters to a str->str hash.'''
        params_str = {}
        params = dict(self.get_params())
        for param_name, param_value in six.iteritems(self.param_kwargs):
            params_str[param_name] = params[param_name].serialize(param_value)

        return params_str
Example #27
    def to_str_params(self):
        ''' Convert all parameters to a str->str hash.'''
        params_str = {}
        params = dict(self.get_params())
        for param_name, param_value in six.iteritems(self.param_kwargs):
            params_str[param_name] = params[param_name].serialize(param_value)

        return params_str
Example #28
    def _get_param_visibilities(self):
        param_visibilities = {}
        params = dict(self.get_params())
        for param_name, param_value in six.iteritems(self.param_kwargs):
            if params[param_name].visibility != ParameterVisibility.PRIVATE:
                param_visibilities[param_name] = params[param_name].visibility.serialize()

        return param_visibilities
Example #29
 def wrapper(*args, **kwargs):
     import luigi.configuration
     orig_conf = luigi.configuration.LuigiConfigParser.instance()
     new_conf = luigi.configuration.LuigiConfigParser()
     luigi.configuration.LuigiConfigParser._instance = new_conf
     orig_dict = {
         k: dict(orig_conf.items(k))
         for k in orig_conf.sections()
     }
     new_dict = self._make_dict(orig_dict)
     for (section, settings) in six.iteritems(new_dict):
         new_conf.add_section(section)
         for (name, value) in six.iteritems(settings):
             new_conf.set(section, name, value)
     try:
         return fun(*args, **kwargs)
     finally:
         luigi.configuration.LuigiConfigParser._instance = orig_conf
Example #30
def dict_to_recordfile(filehandle, records):
    '''
    Convert a dictionary to a recordfile.
    '''
    csvwt = csv.writer(filehandle, delimiter=RECORDFILE_DELIMITER, skipinitialspace=True)
    rows = []
    for key, val in iteritems(records):
        rows.append([key, val])
    csvwt.writerows(rows)
Example #31
    def get_param_values(cls, params, args, kwargs):
        """
        Get the values of the parameters from the args and kwargs.

        :param params: list of (param_name, Parameter).
        :param args: positional arguments
        :param kwargs: keyword arguments.
        :returns: list of `(name, value)` tuples, one for each parameter.
        """
        result = {}

        params_dict = dict(params)

        task_name = cls.task_family

        # In case any exceptions are thrown, create a helpful description of how the Task was invoked
        # TODO: should we detect non-reprable arguments? These will lead to mysterious errors
        exc_desc = "%s[args=%s, kwargs=%s]" % (task_name, args, kwargs)

        # Fill in the positional arguments
        positional_params = [(n, p) for n, p in params if p.positional]
        for i, arg in enumerate(args):
            if i >= len(positional_params):
                raise parameter.UnknownParameterException(
                    "%s: takes at most %d parameters (%d given)" % (exc_desc, len(positional_params), len(args))
                )
            param_name, param_obj = positional_params[i]
            result[param_name] = arg

        # Then the optional arguments
        for param_name, arg in six.iteritems(kwargs):
            if param_name in result:
                raise parameter.DuplicateParameterException(
                    "%s: parameter %s was already set as a positional parameter" % (exc_desc, param_name)
                )
            if param_name not in params_dict:
                raise parameter.UnknownParameterException("%s: unknown parameter %s" % (exc_desc, param_name))
            result[param_name] = arg

        # Then use the defaults for anything not filled in
        for param_name, param_obj in params:
            if param_name not in result:
                if not param_obj.has_task_value(task_name, param_name):
                    raise parameter.MissingParameterException(
                        "%s: requires the '%s' parameter to be set" % (exc_desc, param_name)
                    )
                result[param_name] = param_obj.task_value(task_name, param_name)

        def list_to_tuple(x):
            """ Make tuples out of lists and sets to allow hashing """
            if isinstance(x, list) or isinstance(x, set):
                return tuple(x)
            else:
                return x

        # Sort it by the correct order and make a list
        return [(param_name, list_to_tuple(result[param_name])) for param_name, param_obj in params]
Example #32
    def _get_param_visibilities(self):
        param_visibilities = {}
        params = dict(self.get_params())
        for param_name, param_value in six.iteritems(self.param_kwargs):
            if params[param_name].visibility != ParameterVisibility.PRIVATE:
                param_visibilities[param_name] = params[
                    param_name].visibility.serialize()

        return param_visibilities
Example #33
def most_common(items):
    """
    Reimplements functionality from collections.Counter (which is new in Python 2.7).
    """
    counts = {}
    for i in items:
        counts.setdefault(i, 0)
        counts[i] += 1
    return max(six.iteritems(counts), key=operator.itemgetter(1))
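Since Python 2.7 the same lookup is available from collections.Counter; a minimal equivalence sketch (ties may resolve differently, because max() keeps the first maximum it encounters):

import collections
import operator


def most_common(items):
    # Same idea as above, without six: count occurrences, then take the max by count.
    counts = {}
    for i in items:
        counts.setdefault(i, 0)
        counts[i] += 1
    return max(counts.items(), key=operator.itemgetter(1))


items = ['a', 'b', 'a', 'c', 'a', 'b']
print(most_common(items))                            # ('a', 3)
print(collections.Counter(items).most_common(1)[0])  # ('a', 3)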
Example #34
 def get_all_outputs(self):
     """
     Retrieve a list of all task outputs (i.e. those that start with 'out_')
     :return: a list of all task outputs
     """
     return [
         attr_val for attr_name, attr_val in iteritems(self.__dict__)
         if attr_name.startswith('out_')
     ]
Example #35
    def test_arglist(self):
        task = self.task_class(param='foo')
        f_name = 'my_file'
        runner = luigi.contrib.hive.HiveQueryRunner()
        arglist = runner.get_arglist(f_name, task)

        f_idx = arglist.index('-f')
        self.assertEqual(arglist[f_idx + 1], f_name)

        hivevars = ['{}={}'.format(k, v) for k, v in six.iteritems(task.hivevars())]
        for var in hivevars:
            idx = arglist.index(var)
            self.assertEqual(arglist[idx - 1], '--hivevar')

        hiveconfs = ['{}={}'.format(k, v) for k, v in six.iteritems(task.hiveconfs())]
        for conf in hiveconfs:
            idx = arglist.index(conf)
            self.assertEqual(arglist[idx - 1], '--hiveconf')
Example #36
 def get_arglist(self, f_name, job):
     arglist = load_hive_cmd() + ['-f', f_name]
     hiverc = job.hiverc()
     if hiverc:
         if isinstance(hiverc, str):
             hiverc = [hiverc]
         for rcfile in hiverc:
             arglist += ['-i', rcfile]
     hiveconfs = job.hiveconfs()
     if hiveconfs:
         for k, v in six.iteritems(hiveconfs):
             arglist += ['--hiveconf', '{0}={1}'.format(k, v)]
     hivevars = job.hivevars()
     if hivevars:
         for k, v in six.iteritems(hivevars):
             arglist += ['--hivevar', '{0}={1}'.format(k, v)]
     logger.info(arglist)
     return arglist
Example #37
 def partition_spec(self, partition):
     """
     Turns a dict into a Hive partition specification string.
     """
     return ','.join([
         "{0}='{1}'".format(k, v)
         for (k, v) in sorted(six.iteritems(partition),
                              key=operator.itemgetter(0))
     ])
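A worked example of the string this produces, using made-up partition columns (keys are sorted, each rendered as key='value' and comma-joined):

import operator


def partition_spec(partition):
    # Standalone mirror of the method above, for illustration only.
    return ','.join(
        "{0}='{1}'".format(k, v)
        for k, v in sorted(partition.items(), key=operator.itemgetter(0))
    )


print(partition_spec({'hour': '00', 'date': '2016-06-01'}))
# date='2016-06-01',hour='00'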
Example #38
    def test_arglist(self):
        task = self.task_class(param='foo')
        f_name = 'my_file'
        runner = luigi.contrib.hive.HiveQueryRunner()
        arglist = runner.get_arglist(f_name, task)

        f_idx = arglist.index('-f')
        self.assertEqual(arglist[f_idx + 1], f_name)

        hivevars = ['{}={}'.format(k, v) for k, v in six.iteritems(task.hivevars())]
        for var in hivevars:
            idx = arglist.index(var)
            self.assertEqual(arglist[idx - 1], '--hivevar')

        hiveconfs = ['{}={}'.format(k, v) for k, v in six.iteritems(task.hiveconfs())]
        for conf in hiveconfs:
            idx = arglist.index(conf)
            self.assertEqual(arglist[idx - 1], '--hiveconf')
Example #39
    def _build_pig_cmd(self):
        opts = self.pig_options()

        for k, v in six.iteritems(self.pig_parameters()):
            opts.append("-p")
            opts.append("%s=%s" % (k, v))

        if self.pig_properties():
            with open('pig_property_file', 'w') as prop_file:
                prop_file.writelines([
                    "%s=%s%s" % (k, v, os.linesep)
                    for (k, v) in six.iteritems(self.pig_properties())
                ])
            opts.append('-propertyFile')
            opts.append('pig_property_file')

        cmd = [self.pig_command_path()] + opts + ["-f", self.pig_script_path()]
        return cmd
Example #40
def most_common(items):
    """
    Reimplements functionality from collections.Counter (which is new in Python 2.7).
    """
    counts = {}
    for i in items:
        counts.setdefault(i, 0)
        counts[i] += 1
    return max(six.iteritems(counts), key=operator.itemgetter(1))
Example #41
 def _flush_batch_incr_counter(self):
     """
     Increments any unflushed counter values.
     """
     for key, count in six.iteritems(self._counter_dict):
         if count == 0:
             continue
         args = list(key) + [count]
         self._incr_counter(*args)
Example #42
 def get_arglist(self, f_name, job):
     arglist = load_hive_cmd() + ['-f', f_name]
     hiverc = job.hiverc()
     if hiverc:
         if isinstance(hiverc, str):
             hiverc = [hiverc]
         for rcfile in hiverc:
             arglist += ['-i', rcfile]
     hiveconfs = job.hiveconfs()
     if hiveconfs:
         for k, v in six.iteritems(hiveconfs):
             arglist += ['--hiveconf', '{0}={1}'.format(k, v)]
     hivevars = job.hivevars()
     if hivevars:
         for k, v in six.iteritems(hivevars):
             arglist += ['--hivevar', '{0}={1}'.format(k, v)]
     logger.info(arglist)
     return arglist
Example #43
 def _flush_batch_incr_counter(self):
     """
     Increments any unflushed counter values.
     """
     for key, count in six.iteritems(self._counter_dict):
         if count == 0:
             continue
         args = list(key) + [count]
         self._incr_counter(*args)
Example #44
    def _has_resources(self, needed_resources, used_resources):
        if needed_resources is None:
            return True

        available_resources = self._resources or {}
        for resource, amount in six.iteritems(needed_resources):
            if amount + used_resources[resource] > available_resources.get(resource, 1):
                return False
        return True
Example #45
    def _has_resources(self, needed_resources, used_resources):
        if needed_resources is None:
            return True

        available_resources = self._resources or {}
        for resource, amount in six.iteritems(needed_resources):
            if amount + used_resources[resource] > available_resources.get(resource, 1):
                return False
        return True
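A standalone sketch of the same resource check (a resource with no configured limit defaults to a limit of 1):

def has_resources(needed, used, available):
    # Mirrors the scheduler rule above: the task fits only if, for every resource
    # it needs, need + already-used stays within the configured limit.
    if needed is None:
        return True
    return all(
        amount + used.get(resource, 0) <= available.get(resource, 1)
        for resource, amount in needed.items()
    )


print(has_resources({'db': 1}, {'db': 1}, {'db': 2}))  # True: 1 + 1 <= 2
print(has_resources({'db': 1}, {'db': 2}, {'db': 2}))  # False: 1 + 2 > 2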
Example #46
 def run(self):
     count = defaultdict(int)
     for s in self.input():
         with s.open('r') as in_file:
             for line in in_file:
                 _, key, value = line.strip().split()
                 count[key] += 1
     with self.output().open('w') as out_file:
         for key, value in six.iteritems(count):
             out_file.write('{},{}\n'.format(key, value))
Example #47
 def run(self):
     count = defaultdict(int)
     for s in self.input():
         with s.open('r') as in_file:
             for line in in_file:
                 _, key, value = line.strip().split()
                 count[key] += 1
     with self.output().open('w') as out_file:
         for key, value in six.iteritems(count):
             out_file.write('{},{}\n'.format(key, value))
Example #48
    def get_all_params(cls):
        """
        Compiles and returns all parameters for all :py:class:`Task`.

        :return: a generator of tuples (TODO: we should make this more elegant)
        """
        for task_name, task_cls in six.iteritems(cls._get_reg()):
            if task_cls == cls.AMBIGUOUS_CLASS:
                continue
            for param_name, param_obj in task_cls.get_params():
                yield task_name, (not task_cls.use_cmdline_section), param_name, param_obj
Example #49
def dict_to_recordfile(filehandle, records):
    """
    Convert a dictionary to a recordfile.
    """
    csvwt = csv.writer(filehandle,
                       delimiter=RECORDFILE_DELIMITER,
                       skipinitialspace=True)
    rows = []
    for key, val in iteritems(records):
        rows.append([key, val])
    csvwt.writerows(rows)
Example #50
    def to_str_params(self, only_significant=False):
        """
        Convert all parameters to a str->str hash.
        """
        params_str = {}
        params = dict(self.get_params())
        for param_name, param_value in six.iteritems(self.param_kwargs):
            if (not only_significant) or params[param_name].significant:
                params_str[param_name] = params[param_name].serialize(param_value)

        return params_str
Example #51
 def resources(self):
     ''' get total resources and available ones '''
     used_resources = self._used_resources()
     ret = collections.defaultdict(dict)
     for resource, total in six.iteritems(self._resources):
         ret[resource]['total'] = total
         if resource in used_resources:
             ret[resource]['used'] = used_resources[resource]
         else:
             ret[resource]['used'] = 0
     return ret
Example #52
 def args(self):
     """
     Returns an array of args to pass to the job.
     """
     arglist = []
     for k, v in six.iteritems(self.requires_hadoop()):
         arglist.append('--' + k)
         arglist.extend([t.output().path for t in flatten(v)])
     arglist.extend(['--output', self.output()])
     arglist.extend(self.job_args())
     return arglist
Example #53
    def find_all_by_parameters(self, task_name, session=None, **task_params):
        """
        Find tasks with the given task_name and the same parameters as the kwargs.
        """
        with self._session(session) as session:
            query = session.query(TaskRecord).join(TaskEvent).filter(
                TaskRecord.name == task_name)
            for (k, v) in six.iteritems(task_params):
                alias = sqlalchemy.orm.aliased(TaskParameter)
                query = query.join(alias).filter(alias.name == k,
                                                 alias.value == v)

            tasks = query.order_by(TaskEvent.ts)
            for task in tasks:
                # Sanity check
                assert all(
                    k in task.parameters and v == str(task.parameters[k].value)
                    for (k, v) in six.iteritems(task_params))

                yield task
Example #54
 def _key(self, task_name, family, unbatched_args):
     if self._config.batch_mode == 'all':
         return task_name
     elif self._config.batch_mode == 'family':
         return family
     elif self._config.batch_mode == 'unbatched_params':
         param_str = six.u(', ').join(six.u('{}={}').format(*kv) for kv in six.iteritems(unbatched_args))
         return six.u('{}({})').format(family, param_str)
     else:
         raise ValueError('Unknown batch mode for batch notifier: {}'.format(
             self._config.batch_mode))
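For illustration, with batch_mode='unbatched_params' a key for family 'MyTask' and unbatched_args {'date': '2016-06-01'} would be formatted roughly like this (family and parameter values are made up):

family = 'MyTask'
unbatched_args = {'date': '2016-06-01'}
param_str = ', '.join('{}={}'.format(*kv) for kv in unbatched_args.items())
print('{}({})'.format(family, param_str))  # MyTask(date=2016-06-01)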
Example #55
 def resources(self):
     ''' get total resources and available ones '''
     used_resources = self._used_resources()
     ret = collections.defaultdict(dict)
     for resource, total in six.iteritems(self._resources):
         ret[resource]['total'] = total
         if resource in used_resources:
             ret[resource]['used'] = used_resources[resource]
         else:
             ret[resource]['used'] = 0
     return ret
Example #56
    def get_all_params(cls):
        """
        Compiles and returns all parameters for all :py:class:`Task`.

        :return: a generator of tuples (TODO: we should make this more elegant)
        """
        for task_name, task_cls in six.iteritems(cls._get_reg()):
            if task_cls == cls.AMBIGUOUS_CLASS:
                continue
            for param_name, param_obj in task_cls.get_params():
                yield task_name, (not task_cls.use_cmdline_section), param_name, param_obj
Example #57
    def _upstream_tasks(self):
        """
        Extract upstream tasks from the TargetInfo objects,
        or from functions returning those (or lists of either),
        for use in luigi's requires() method.
        """
        upstream_tasks = []
        for attrname, attrval in iteritems(self.__dict__):
            if 'in_' == attrname[0:3]:
                upstream_tasks = self._parse_inputitem(attrval, upstream_tasks)

        return upstream_tasks
Example #58
    def run(self):
        artist_count = defaultdict(int)

        for t in self.input():
            with t.open('r') as in_file:
                for line in in_file:
                    _, artist, track = line.strip().split()
                    artist_count[artist] += 1

        with self.output().open('w') as out_file:
            for artist, count in six.iteritems(artist_count):
                out_file.write('{}\t{}\n'.format(artist, count))
Example #59
 def find_all_by_parameters(self, task_name, session=None, **task_params):
     """
     Find tasks with the given task_name and the same parameters as the kwargs.
     """
     with self._session(session) as session:
         tasks = session.query(TaskRecord).join(TaskEvent).filter(
             TaskRecord.name == task_name).order_by(TaskEvent.ts).all()
         for task in tasks:
             if all(k in task.parameters
                    and v == str(task.parameters[k].value)
                    for (k, v) in six.iteritems(task_params)):
                 yield task
Example #60
 def _apply_regex(self, regex, input):
     import re
     re_match = re.match(regex, input)
     if re_match:
         kwargs = {}
         has_val = False
         for k, v in six.iteritems(re_match.groupdict(default="0")):
             val = int(v)
             has_val = has_val or val != 0
             kwargs[k] = val
         if has_val:
             return datetime.timedelta(**kwargs)
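A hedged usage sketch of the same pattern; the regex below is hypothetical, chosen only to show how named groups turn into timedelta keyword arguments:

import datetime
import re


def apply_regex(regex, value):
    # Standalone mirror of _apply_regex above: named groups -> timedelta kwargs.
    match = re.match(regex, value)
    if match:
        kwargs = {}
        has_val = False
        for k, v in match.groupdict(default="0").items():
            val = int(v)
            has_val = has_val or val != 0
            kwargs[k] = val
        if has_val:
            return datetime.timedelta(**kwargs)


# Hypothetical "<weeks>w<days>d" pattern: "2w3d" -> 2 weeks + 3 days = 17 days.
print(apply_regex(r'((?P<weeks>\d+)w)?((?P<days>\d+)d)?', '2w3d'))
# 17 days, 0:00:00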