def _lineage(**kwargs):
    """Print the lineage of a bundle found in the current data context.

    CLI entry point that forwards to ``api.lineage()``.

    Args:
        kwargs: command line args or an internal dict call; must contain
            ``uuid`` (str) and ``depth`` (int).

    Returns:
        None
    """
    disdat_fs = disdat.fs.DisdatFS()

    # Nothing to look up unless a data context is active.
    if not disdat_fs.in_context():
        _logger.warning('Not in a data context')
        return

    context_name = disdat_fs._curr_context.get_local_name()

    # Each entry unpacks as (depth, uuid, lineage).
    lineage_entries = api.lineage(context_name, kwargs['uuid'], kwargs['depth'])

    for depth, bundle_uuid, lineage in lineage_entries:
        if lineage is not None:
            print_lineage_protobuf(lineage, depth)
            print()
            continue
        print("No lineage found for UUID {}".format(bundle_uuid))
def prepare_pipe_kwargs(self, for_run=False):
    """Build the keyword arguments passed to the user's pipe_run function.

    Every upstream task produces a bundle; each bundle is presented and
    stored under the upstream task's ``user_arg_name``.

    Args:
        for_run (bool): prepare args for run -- at that point all upstream
            tasks have completed.  When False an empty dict is returned and
            no instance state is touched.

    Returns:
        (dict): A dictionary with the arguments.
    """
    pipe_kwargs = {}

    if not for_run:
        return pipe_kwargs

    # Upstream outputs are placed into the kwargs so the user never has to
    # call self.inputs(), which would only return a list of output targets.

    # Reset the stored tags, in case this instance is run multiple times.
    self._input_tags = {}
    self._input_bundle_uuids = {}

    # Look up every upstream task's path cache entry before processing any.
    cache_entries = []
    for task in self.requires():
        cache_entries.append((task.user_arg_name, self.pfs.get_path_cache(task)))

    for arg_name, entry in cache_entries:
        # Tasks without a path cache entry contribute no argument.
        if entry is None:
            continue

        frame = self.pfs.get_hframe_by_uuid(entry.uuid,
                                            data_context=self.data_context)
        assert frame.is_presentable()

        # Download any data that is not local (the linked files are not present).
        # This is the default behavior when running in a container.  The
        # alternative -- localizing ALL bundles in the context before we run --
        # is inefficient: only meta-data is needed to decide what to re-run.
        if self.incremental_pull:
            DisdatFS()._localize_hfr(frame, entry.uuid, self.data_context)

        # A repeated name means the earlier dependency is about to be overwritten.
        if entry.instance.user_arg_name in pipe_kwargs:
            _logger.warning(
                'Task human name {} reused when naming task dependencies: Dependency hyperframe shadowed'
                .format(entry.instance.user_arg_name))

        self._input_tags[arg_name] = frame.tag_dict
        self._input_bundle_uuids[arg_name] = entry.uuid
        pipe_kwargs[arg_name] = self.data_context.present_hfr(frame)

    return pipe_kwargs
def prepare_pipe_kwargs(self, for_run=False):
    """Build the keyword arguments passed to the user's pipe_run function.

    Every upstream task produces a bundle; each bundle's data is stored
    under the upstream task's ``user_arg_name``.

    Args:
        for_run (bool): prepare args for run -- at that point all upstream
            tasks have completed.  When False an empty dict is returned and
            no instance state is touched.

    Returns:
        (dict): A dictionary with the arguments.
    """
    pipe_kwargs = {}

    if not for_run:
        return pipe_kwargs

    # Upstream outputs are placed into the kwargs so the user never has to
    # call self.inputs(), which would only return output targets.

    # Reset the stored tags, in case this instance is run multiple times.
    self._input_tags = {}
    self._input_bundle_uuids = {}

    # Look up every dependency's path cache entry before processing any.
    cache_entries = []
    for task in self.deps():
        cache_entries.append((task.user_arg_name, PathCache.get_path_cache(task)))

    for arg_name, entry in cache_entries:
        # Tasks without a path cache entry contribute no argument.
        if entry is None:
            continue

        bundle = api.get(self.data_context.get_local_name(), None, uuid=entry.uuid)
        # NOTE(review): is_presentable is read as an attribute, not called --
        # confirm it is a property, otherwise this assert can never fail.
        assert bundle.is_presentable

        # Download data that is not local (the linked files are not present).
        # This is the default behavior when running in a container.
        if self.incremental_pull:
            bundle.pull(localize=True)

        # A repeated name means the earlier dependency is about to be overwritten.
        if entry.instance.user_arg_name in pipe_kwargs:
            _logger.warning(
                'Task human name {} reused when naming task dependencies: Dependency hyperframe shadowed'
                .format(entry.instance.user_arg_name))

        self._input_tags[arg_name] = bundle.tags
        self._input_bundle_uuids[arg_name] = entry.uuid
        pipe_kwargs[arg_name] = bundle.data

    return pipe_kwargs
def add_data(self, data):
    """Attach a single data item to an open bundle.

    The bundle must be open and not closed.  Exactly one item (dictionary,
    list, tuple, scalar, or dataframe) is kept: every call replaces the
    previous item, and only the latest is included in the bundle on close.

    Note:
        Use either `add_data_row` or `add_data`, not both.  Adding a row
        after `add_data` removes the data; using `add_data` after
        `add_data_row` removes all previously added rows.

    Args:
        data (list|tuple|dict|scalar|`pandas.DataFrame`): the item to attach.

    Returns:
        self
    """
    bundle_is_writable = self.open and not self.closed
    assert bundle_is_writable

    # Warn before overwriting an item attached by an earlier call.
    already_has_data = self.data is not None
    if already_has_data:
        _logger.warning("Disdat API add_data replacing existing data on bundle")

    self.data = data
    return self
def _add(args):
    """Create a bundle from the CLI by forwarding to ``api.add()``.

    Args:
        args: command line args; ``bundle``, ``path_name`` and ``tag`` are read.

    Returns:
        None
    """
    disdat_fs = disdat.fs.DisdatFS()

    # Nothing can be added unless a data context is active.
    if not disdat_fs.in_context():
        _logger.warning('Not in a data context')
        return

    context_name = disdat_fs._curr_context.get_local_name()
    bundle_name = args.bundle
    path_name = args.path_name
    bundle_tags = common.parse_args_tags(args.tag)

    # The value returned by api.add is intentionally discarded.
    api.add(context_name, bundle_name, path_name, tags=bundle_tags)
def _put_subcls_params(cls, ser_params): """ Given the child class, create the Luigi parameter dictionary Assume that ser_params dictionary keys are the attribute names in the Disdat task class. Args: self: The instance of the subclass. To get the normalized values for the Luigi Parameters ser_params (dict): Dictionary <str>:<str> Returns: deser_params (dict): {<name>: Luigi.Parameter,...} """ deser_params = {} for param, ser_value in ser_params.items(): try: attribute = getattr(cls, param) assert isinstance(attribute, luigi.Parameter) deser_params[param] = attribute.parse(ser_value) except Exception as e: _logger.warning( "Bundle parameter ({}:{}) can't be deserialized by class({}): {}" .format(param, ser_value, cls.__name__, e)) raise e return deser_params
def apply(output_bundle, pipe_params, pipe_cls, input_tags, output_tags, force,
          output_bundle_uuid=None, central_scheduler=False, workers=1, data_context=None,
          incremental_push=False, incremental_pull=False):
    """
    Given an input bundle, run the pipeline on the bundle.  Note, we first make a copy of all tasks that are
    parameterized identically to the tasks we will run.  This is so we can figure out what we will need to
    re-run.  This is why we make a single uuid for the output bundle of apply (for the driver).

    Args:
        output_bundle: The new bundle to be created
        pipe_params (str): Luigi Task parameters string
        pipe_cls: String <module.ClassName>
        input_tags (dict): Tags used to find the input bundle
        output_tags (dict): Tags that need to be placed on the output bundle
        force (bool): whether to re-run this pipe (force recomputation of dependencies)
        output_bundle_uuid (str): Optionally specify exactly the UUID of the output bundle IFF we actually
            need to produce it
        central_scheduler: Use a centralized Luigi scheduler (default False, i.e., --local-scheduler is used)
        workers: The number of luigi workers to use for this workflow (default 1)
        data_context: Actual context object or None and read current context.
        incremental_push (bool): Whether this job should push tasks as they complete to the remote (if configured)
        incremental_pull (bool): Whether this job should localize bundles as needed from the remote (if configured)

    Returns:
        dict or None: ``{'success': <result of build()>, 'did_work': <result of resolve_workflow_bundles()>}``.
            ``did_work`` is documented upstream as True if tasks needed to be run, False if no tasks
            (beyond the wrapper task) executed.  None is returned when ``data_context`` is None and we are
            not currently in a data context.
    """

    _logger.debug("driver {}".format(driver.DriverTask))
    _logger.debug("pipe_cls {}".format(pipe_cls))
    _logger.debug("pipe params: {}".format(pipe_params))
    _logger.debug("force: {}".format(force))
    _logger.debug("input tags: {}".format(input_tags))
    _logger.debug("output tags: {}".format(output_tags))
    _logger.debug("sys.path {}".format(sys.path))
    _logger.debug("central_scheduler {}".format(central_scheduler))
    _logger.debug("workers {}".format(workers))
    _logger.debug("incremental_push {}".format(incremental_push))
    _logger.debug("incremental_pull {}".format(incremental_pull))

    # Logger.warn is a deprecated alias; use warning() like the rest of the code base.
    if incremental_push:
        _logger.warning("incremental_push {}".format(incremental_push))

    if incremental_pull:
        _logger.warning("incremental_pull {}".format(incremental_pull))

    pfs = fs.DisdatFS()

    if data_context is None:
        if not pfs.in_context():
            _logger.warning('Not in a data context')
            return None
        data_context = pfs.curr_context

    # Re-execute logic -- make copy of task DAG
    # Creates a cache of {pipe:path_cache_entry} in the pipesFS object.
    # This "task_path_cache" is used throughout execution to find output bundles.
    reexecute_dag = driver.DriverTask(output_bundle, pipe_params,
                                      pipe_cls, input_tags, output_tags, force,
                                      data_context, incremental_push, incremental_pull)

    # Get version information for pipeline
    users_root_task = reexecute_dag.deps()[0]
    pipeline_path = os.path.dirname(sys.modules[users_root_task.__module__].__file__)
    fs.DisdatFS().get_pipe_version(pipeline_path)

    did_work = resolve_workflow_bundles(reexecute_dag, data_context)

    # At this point the path cache should be full of existing or new UUIDs.
    # we are going to replace the final pipe's UUID if the user has passed one in.
    # this happens when we run the docker container.
    # TODO: don't replace if it already exists.
    if output_bundle_uuid is not None:
        # Re-fetched after resolve_workflow_bundles, exactly as the original code did.
        users_root_task = reexecute_dag.deps()[0]
        pce = pfs.get_path_cache(users_root_task)
        if pce.rerun:  # if we have to re-run then replace it with our UUID
            # TODO: this is the same code as new_output_hframe, FIX!!!
            managed_dir, managed_uuid, _ = data_context.make_managed_path(output_bundle_uuid)
            fs.DisdatFS.put_path_cache(users_root_task,
                                       managed_uuid,
                                       managed_dir,
                                       pce.rerun,
                                       pce.is_left_edge_task,
                                       overwrite=True)

    success = build([reexecute_dag], local_scheduler=not central_scheduler, workers=workers)

    # After running a pipeline, blow away our path cache and git hash. Needed if we're run twice in the same process.
    fs.DisdatFS().clear_pipe_version()
    fs.DisdatFS().clear_path_cache()

    return {'success': success, 'did_work': did_work}
def add_external_dependency(self, param_name, task_class, params, human_name=None, uuid=None):
    """
    Disdat Pipe API Function

    Register an external task, together with its parameters, as one of our
    requirements.  An external task has no run function, so Luigi ignores the
    results of task.deps() (which calls flatten(self.requires())).  The
    requirement can therefore only be satisfied by the bundle actually existing.

    NOTE: if you add an external dependency by name, someone may add a bundle
    during execution and your requires function is then no longer
    deterministic.  You must add caching to your requires function to handle
    this scenario.

    Example with class variable bundle_uuid:
    ``
    if self.bundle_uuid is None:
        bundle = self.add_external_dependency('_', MyTaskClass, {}, human_name='some_result')
        self.bundle_uuid = bundle.uuid
    else:
        bundle = self.add_external_dependency('_', MyTaskClass, {}, uuid=self.bundle_uuid)
    ``

    Args:
        param_name (str): The parameter name this bundle assumes when passed to Pipe.run
        task_class (:object): Must always set class name of upstream task.
        params (:dict): Dictionary of parameters for this task.  Note if UUID is set, then params are ignored!
        human_name (str): Resolve dependency by human_name, return the latest bundle with that human_name.
            Trumps task_class and params.
        uuid (str): Resolve dependency by explicit UUID, trumps task_class and params, and human_name.

    Returns:
        The resolved bundle object, or None when the bundle could not be
        resolved (a warning is logged in that case).

    Raises:
        Exception: if ``params`` is not a dictionary.
    """
    # for the bundle object
    import disdat.api as api

    if not isinstance(params, dict):
        raise Exception("add_dependency third argument must be a dictionary of parameters")

    assert param_name not in self.add_deps

    try:
        # Lookup precedence: explicit uuid, then human_name, then task_class + params.
        if uuid is not None:
            frame = self.pfs.get_hframe_by_uuid(uuid, data_context=self.data_context)
        elif human_name is not None:
            frame = self.pfs.get_latest_hframe(human_name, data_context=self.data_context)
        else:
            upstream_task = task_class(**params)
            frame = self.pfs.get_hframe_by_proc(upstream_task.pipe_id(),
                                                data_context=self.data_context)

        bundle = api.Bundle(self.data_context.get_local_name(), 'unknown')
        bundle.fill_from_hfr(frame)

        # When resolved by uuid or human_name, the caller's params are replaced
        # by the ones recorded on the bundle.
        if not (uuid is None and human_name is None):
            params = task_class._put_subcls_params(bundle.params)

        self.add_deps[param_name] = (luigi.task.externalize(task_class), params)
    except Exception as exc:
        _logger.warning(
            "Unable to resolve external bundle made by class ({}): {}".format(task_class, exc))
        return None
    else:
        return bundle