Example #1
def _lineage(**kwargs):
    """Invoke the api.lineage() call from the CLI to find the lineage.

    Args:
        kwargs: command line args or internal dict call, must contain uuid:str and depth:int.

    Returns:
        None

    """

    fs = disdat.fs.DisdatFS()

    if not fs.in_context():
        _logger.warning('Not in a data context')
        return

    ctxt = fs._curr_context.get_local_name()

    # Each element is a (depth, uuid, lineage) tuple.
    lin_tuples = api.lineage(ctxt, kwargs['uuid'], kwargs['depth'])

    for (d, uuid, l) in lin_tuples:
        if l is None:
            print("No lineage found for UUID {}".format(uuid))
        else:
            print_lineage_protobuf(l, d)
            print()

    return
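Since _lineage accepts **kwargs, it can be driven by an internal dict call as well as by the CLI. A minimal sketch, assuming a valid data context (the UUID below is a hypothetical placeholder):

# Hedged sketch: _lineage only requires the 'uuid' and 'depth' keys.
_lineage(uuid='6f1c9a1e-0000-0000-0000-000000000000', depth=2)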
Example #2
    def prepare_pipe_kwargs(self, for_run=False):
        """ Each upstream task produces a bundle.  Prepare that bundle as input
        to the user's pipe_run function.

        Args:
            for_run (bool): prepare args for run -- at that point all upstream tasks have completed.

        Returns:
            (dict): A dictionary with the arguments.

        """

        kwargs = dict()

        # Place upstream task outputs into the kwargs, so the user never calls
        # self.inputs().  If they did, they would get a list of output targets
        # for the bundle, which isn't very helpful.
        if for_run:

            # Reset the stored tags, in case this instance is run multiple times.
            self._input_tags = {}
            self._input_bundle_uuids = {}

            upstream_tasks = [(t.user_arg_name, self.pfs.get_path_cache(t))
                              for t in self.requires()]
            for user_arg_name, pce in [
                    u for u in upstream_tasks if u[1] is not None
            ]:
                hfr = self.pfs.get_hframe_by_uuid(
                    pce.uuid, data_context=self.data_context)
                assert hfr.is_presentable()

                # Download any data that is not local (the linked files are not present).
                # This is the default behavior when running in a container.
                # The non-default behavior is to download and localize ALL bundles in the
                # context before we run.  That's inefficient: we only need metadata to
                # determine what to re-run.
                if self.incremental_pull:
                    DisdatFS()._localize_hfr(hfr, pce.uuid, self.data_context)

                if pce.instance.user_arg_name in kwargs:
                    _logger.warning(
                        'Task human name {} reused when naming task dependencies: Dependency hyperframe shadowed'
                        .format(pce.instance.user_arg_name))

                self._input_tags[user_arg_name] = hfr.tag_dict
                self._input_bundle_uuids[user_arg_name] = pce.uuid
                kwargs[user_arg_name] = self.data_context.present_hfr(hfr)

        return kwargs
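For context, here is a hedged sketch of how these kwargs surface in user code: each upstream bundle's presented data arrives under the name used when declaring the dependency. PipeTask, its import path, add_dependency, and GenData are assumptions, not taken from this listing.

from disdat.pipe import PipeTask  # assumed import path for the base class

class Train(PipeTask):
    def pipe_requires(self):
        # 'training_data' becomes the user_arg_name seen in prepare_pipe_kwargs.
        self.add_dependency('training_data', GenData, {})  # GenData is hypothetical

    def pipe_run(self, training_data=None):
        # training_data is the presented bundle data (e.g., a DataFrame, dict, or scalar).
        return {'rows': len(training_data)}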
Example #3
    def prepare_pipe_kwargs(self, for_run=False):
        """ Each upstream task produces a bundle.  Prepare that bundle as input
        to the user's pipe_run function.

        Args:
            for_run (bool): prepare args for run -- at that point all upstream tasks have completed.

        Returns:
            (dict): A dictionary with the arguments.

        """
        kwargs = dict()

        # Place upstream task outputs into the kwargs, so the user never calls
        # self.inputs().  If they did, they would get a list of output targets
        # for the bundle, which isn't very helpful.
        if for_run:

            # Reset the stored tags, in case this instance is run multiple times.
            self._input_tags = {}
            self._input_bundle_uuids = {}

            upstream_tasks = [(t.user_arg_name, PathCache.get_path_cache(t))
                              for t in self.deps()]
            for user_arg_name, pce in [
                    u for u in upstream_tasks if u[1] is not None
            ]:

                b = api.get(self.data_context.get_local_name(),
                            None,
                            uuid=pce.uuid)
                assert b.is_presentable

                # Download data that is not local (the linked files are not present).
                # This is the default behavior when running in a container.
                if self.incremental_pull:
                    b.pull(localize=True)

                if pce.instance.user_arg_name in kwargs:
                    _logger.warning(
                        'Task human name {} reused when naming task dependencies: Dependency hyperframe shadowed'
                        .format(pce.instance.user_arg_name))

                self._input_tags[user_arg_name] = b.tags
                self._input_bundle_uuids[user_arg_name] = pce.uuid
                kwargs[user_arg_name] = b.data

        return kwargs
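Compared to Example #2, this revision swaps the internal hyperframe calls (self.pfs.get_hframe_by_uuid, present_hfr, _localize_hfr) for the public bundle interface (api.get, b.pull, b.data), so the same preparation logic reads in terms of bundles rather than hyperframes.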
Example #4
    def add_data(self, data):
        """ Attach data to a bundle.   The bundle must be open and not closed.
            One attaches one data item to a bundle (dictionary, list, tuple, scalar, or dataframe).
            Calling this replaces the latest item -- only the latest will be included in the bundle on close.

            Note: One uses `add_data_row` or `add_data` but not both.  Adding a row after `add_data`
            removes the data.   Using `add_data` after `add_data_row` removes all previously added rows.

        Args:
            data (list|tuple|dict|scalar|`pandas.DataFrame`):

        Returns:
            self
        """
        assert self.open and not self.closed
        if self.data is not None:
            _logger.warning("Disdat API add_data replacing existing data on bundle")
        self.data = data
        return self
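A hedged usage sketch, relying only on methods visible in this listing plus an assumed open/close lifecycle (the context name and payloads are hypothetical):

b = api.Bundle('my_context', 'my.bundle')  # constructor shape as in Example #8
b.open()                                   # assumed: the assert requires an open bundle
b.add_data({'answer': 42})                 # attaches one data item
b.add_data([1, 2, 3])                      # replaces it; only the latest survives on close
b.close()                                  # assumed counterpart to open()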
Example #5
def _add(args):
    """Invoke the api.add() call from the CLI to create a bundle.

    Args:
        args: command line args.

    Returns:
        None

    """

    fs = disdat.fs.DisdatFS()

    if not fs.in_context():
        _logger.warning('Not in a data context')
        return

    _ = api.add(fs._curr_context.get_local_name(),
                args.bundle,
                args.path_name,
                tags=common.parse_args_tags(args.tag))

    return
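The CLI wrapper ultimately reduces to a single api.add call; a hedged sketch of the direct form (the context, bundle name, path, and tags are hypothetical):

import disdat.api as api

api.add('my_context', 'my.bundle', '/tmp/data.csv', tags={'stage': 'raw'})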
Example #6
    def _put_subcls_params(cls, ser_params):
        """ Given the child class, create the Luigi parameter dictionary

        Assume that ser_params dictionary keys are the attribute names in the Disdat task class.

        Args:
            self: The instance of the subclass.  To get the normalized values for the Luigi Parameters
            ser_params (dict): Dictionary <str>:<str>
        Returns:
            deser_params (dict): {<name>: Luigi.Parameter,...}
        """
        deser_params = {}
        for param, ser_value in ser_params.items():
            try:
                attribute = getattr(cls, param)
                assert isinstance(attribute, luigi.Parameter)
                deser_params[param] = attribute.parse(ser_value)
            except Exception as e:
                _logger.warning(
                    "Bundle parameter ({}:{}) can't be deserialized by class({}): {}"
                    .format(param, ser_value, cls.__name__, e))
                raise e
        return deser_params
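A hedged sketch of the round trip, assuming _put_subcls_params is exposed as a classmethod on the Disdat task base class (the import path and the task below are assumptions):

import luigi
from disdat.pipe import PipeTask  # assumed import path for the base class

class MyTask(PipeTask):
    n_estimators = luigi.IntParameter(default=10)

# Each serialized string is parsed back into a typed value via the Parameter's parse().
parsed = MyTask._put_subcls_params({'n_estimators': '100'})
assert parsed == {'n_estimators': 100}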
Example #7
def apply(output_bundle, pipe_params, pipe_cls, input_tags, output_tags, force,
          output_bundle_uuid=None, central_scheduler=False, workers=1, data_context=None,
          incremental_push=False, incremental_pull=False):
    """
    Given an input bundle, run the pipeline on the bundle.
    Note, we first make a copy of all tasks that are parameterized identically to the tasks we will run.
    This is so we can figure out what we will need to re-run.
    This is why we make a single uuid for the output bundle of apply (for the driver).

    Args:
        output_bundle: The new bundle to be created
        pipe_params (str): Luigi Task parameters string
        pipe_cls: String <module.ClassName>
        input_tags (dict): Tags used to find the input bundle
        output_tags (dict): Tags that need to be placed on the output bundle
        force (bool): Whether to force recomputation of the pipe and its dependencies
        output_bundle_uuid (str): Optionally specify exactly the UUID of the output bundle IFF we actually need to produce it
        central_scheduler: Use a centralized Luigi scheduler (default False, i.e., --local-scheduler is used)
        workers: The number of Luigi workers to use for this workflow (default 1)
        data_context: Actual context object, or None to read the current context.
        incremental_push (bool): Whether this job should push tasks as they complete to the remote (if configured)
        incremental_pull (bool): Whether this job should localize bundles as needed from the remote (if configured)

    Returns:
        (dict): {'success': (bool) whether the Luigi build succeeded,
                 'did_work': (bool) whether any tasks beyond the wrapper task executed},
        or None if not in a data context.
    """

    _logger.debug("driver {}".format(driver.DriverTask))
    _logger.debug("pipe_cls {}".format(pipe_cls))
    _logger.debug("pipe params: {}".format(pipe_params))
    _logger.debug("force: {}".format(force))
    _logger.debug("input tags: {}".format(input_tags))
    _logger.debug("output tags: {}".format(output_tags))
    _logger.debug("sys.path {}".format(sys.path))
    _logger.debug("central_scheduler {}".format(central_scheduler))
    _logger.debug("workers {}".format(workers))
    _logger.debug("incremental_push {}".format(incremental_push))
    _logger.debug("incremental_pull {}".format(incremental_pull))

    if incremental_push:
        _logger.warning("incremental_push {}".format(incremental_push))

    if incremental_pull:
        _logger.warning("incremental_pull {}".format(incremental_pull))

    pfs = fs.DisdatFS()

    if data_context is None:
        if not pfs.in_context():
            _logger.warning('Not in a data context')
            return None
        data_context = pfs.curr_context

    # Re-execute logic -- make copy of task DAG
    # Creates a cache of {pipe: path_cache_entry} in the DisdatFS object.
    # This "task_path_cache" is used throughout execution to find output bundles.
    reexecute_dag = driver.DriverTask(output_bundle, pipe_params,
                                      pipe_cls, input_tags, output_tags, force,
                                      data_context, incremental_push, incremental_pull)

    # Get version information for pipeline
    users_root_task = reexecute_dag.deps()[0]
    pipeline_path = os.path.dirname(sys.modules[users_root_task.__module__].__file__)
    fs.DisdatFS().get_pipe_version(pipeline_path)

    did_work = resolve_workflow_bundles(reexecute_dag, data_context)

    # At this point the path cache should be full of existing or new UUIDs.
    # we are going to replace the final pipe's UUID if the user has passed one in.
    # this happens when we run the docker container.
    # TODO: don't replace if it already exists.
    if output_bundle_uuid is not None:
        users_root_task = reexecute_dag.deps()[0]
        pce = pfs.get_path_cache(users_root_task)
        if pce.rerun: # if we have to re-run then replace it with our UUID
            # TODO: this is the same code as new_output_hframe, FIX!!!
            dir_name, uuid, _ = data_context.make_managed_path(output_bundle_uuid)
            fs.DisdatFS.put_path_cache(users_root_task,
                                       uuid,
                                       dir_name,
                                       pce.rerun,
                                       pce.is_left_edge_task,
                                       overwrite=True)

    success = build([reexecute_dag], local_scheduler=not central_scheduler, workers=workers)

    # After running a pipeline, blow away our path cache and git hash. Needed if we're run twice in the same process.
    fs.DisdatFS().clear_pipe_version()
    fs.DisdatFS().clear_path_cache()

    return {'success': success, 'did_work': did_work}
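A hedged sketch of a direct call; in practice this is usually reached through api.apply or the dsdt CLI, and the module and class names below are hypothetical:

result = apply(output_bundle='my.output',
               pipe_params='{}',            # serialized Luigi parameters
               pipe_cls='mymodule.MyTask',  # hypothetical <module.ClassName>
               input_tags={},
               output_tags={},
               force=False)
print(result['success'], result['did_work'])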
Example #8
    def add_external_dependency(self,
                                param_name,
                                task_class,
                                params,
                                human_name=None,
                                uuid=None):
        """
        Disdat Pipe API Function

        Add an external task and its parameters to our requirements.  This means the task
        has no run function; in that case, Luigi will ignore the results of task.deps() (which calls
        flatten(self.requires())).  In other words, this requirement can only be satisfied
        by the bundle actually existing.

        NOTE: if you add an external dependency by name, it is possible that someone adds a bundle during
        execution and that your requires function is no longer deterministic.   You must add caching to your
        requires function to handle this scenario.

        Example with class variable bundle_uuid:
        ``
        if self.bundle_uuid is None:
            bundle = self.add_external_dependency('_', MyTaskClass, {}, human_name='some_result')
            self.bundle_uuid = bundle.uuid
        else:
            bundle = self.add_external_dependency('_', MyTaskClass, {}, uuid=self.bundle_uuid)
        ``

        Args:
            param_name (str): The parameter name this bundle assumes when passed to Pipe.run
            task_class (object):  The class of the upstream task.  Must always be set.
            params (dict):  Dictionary of parameters for this task.  Note: if uuid is set, then params are ignored!
            human_name (str): Resolve the dependency by human_name, returning the latest bundle with that human_name.  Trumps task_class and params.
            uuid (str): Resolve the dependency by explicit UUID.  Trumps task_class, params, and human_name.

        Returns:
            `api.Bundle`: The resolved bundle, or None if it could not be resolved.

        """

        # for the bundle object
        import disdat.api as api

        if not isinstance(params, dict):
            error = "add_external_dependency third argument must be a dictionary of parameters"
            raise Exception(error)

        assert param_name not in self.add_deps

        try:
            if uuid is not None:
                hfr = self.pfs.get_hframe_by_uuid(
                    uuid, data_context=self.data_context)
            elif human_name is not None:
                hfr = self.pfs.get_latest_hframe(
                    human_name, data_context=self.data_context)
            else:
                p = task_class(**params)
                hfr = self.pfs.get_hframe_by_proc(
                    p.pipe_id(), data_context=self.data_context)

            bundle = api.Bundle(self.data_context.get_local_name(), 'unknown')

            bundle.fill_from_hfr(hfr)

            if uuid is not None or human_name is not None:
                params = task_class._put_subcls_params(bundle.params)

            self.add_deps[param_name] = (luigi.task.externalize(task_class),
                                         params)

        except Exception as error:
            _logger.warning(
                "Unable to resolve external bundle made by class ({}): {}".
                format(task_class, error))
            return None

        return bundle
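A hedged sketch of declaring an external dependency inside a task's requires hook, following the caching pattern the docstring recommends (pipe_requires, PreprocessTask, and the cached attribute are assumptions):

    def pipe_requires(self):
        # Cache the resolved UUID so requires() stays deterministic across calls.
        if self.cached_uuid is None:  # hypothetical instance attribute
            bundle = self.add_external_dependency('raw_data', PreprocessTask, {})
            if bundle is None:
                raise Exception('external bundle not found; produce it before running')
            self.cached_uuid = bundle.uuid
        else:
            self.add_external_dependency('raw_data', PreprocessTask, {},
                                         uuid=self.cached_uuid)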