def getRunInfoByPartialName(self, name, runConfig):
    """Fetch the last-run information for a module and all its dependencies.

    This differs from `runModule()`, whose collector only covers the run it
    just performed. When no module was actually run (e.g. the code did not
    change, so the data is read from persistent storage), the
    SmvRunInfoCollector returned by `runModule()` would be empty, whereas
    the one returned here would still contain the latest run information
    about all dependent modules.

    Args:
        name (str): unique suffix to the fqn of the target module
        runConfig (dict): runConfig to apply when collecting info. If the
            module was run with a config, the same config needs to be
            specified here to retrieve the info.

    Returns:
        SmvRunInfoCollector
    """
    client = self.j_smvPyClient
    j_run_info = client.getRunInfoByPartialName(name, runConfig)
    # Wrap the JVM-side result in the Python collector type.
    return SmvRunInfoCollector(j_run_info)
def run(self, forceRun=False):
    """Run the root modules and return their data plus a run-info collector.

    Args:
        forceRun (bool): forwarded unchanged to `_create_df`.

    Returns:
        (list, SmvRunInfoCollector) tuple
        - list holds the `data` attribute of every module in `self.roots`
        - SmvRunInfoCollector is the collector handed to `_create_df`
          and `_force_post`
    """
    # Modules that still owe a post_action. The set is tracked so that each
    # post_action runs once and only once per transaction; it is updated by
    # _create_df, _create_meta and _force_post and is eventually emptied.
    # See docs/dev/SmvGenericModule/SmvModuleRunner.md for details
    pending_post_action = set(self.visitor.modules_needed_for_run)

    # Map from fqn to the already-run DF. The `run` interface of SmvModule
    # takes a map of class => df, so this map has to be keyed by the class
    # method rather than by `versioned_fqn`, which only exists on the
    # resolved instance.
    known_dfs = {}

    run_info = SmvRunInfoCollector()

    # The real module calculation. Where there is persistence, this also
    # runs the post_actions of the module and of its ancestor ephemeral
    # modules.
    self._create_df(known_dfs, pending_post_action, run_info, forceRun)

    # Ephemeral modules with no persisting module downstream (these must be
    # part of the roots) need a forced action so their post actions run.
    self._force_post(pending_post_action, run_info)

    root_data = [root.data for root in self.roots]
    return (root_data, run_info)
def runModuleByName(self, name, forceRun=False, version=None, runConfig=None, quickRun=False):
    """Run a SmvModule identified by its name (can be a partial FQN).

    See the `runModule` method for the FQN/URN based variant.

    Args:
        name (str): The unique name of a module. Does not have to be the FQN.
        forceRun (bool): True if the module should be forced to run even if
            it has persisted output. False otherwise.
        version (str): The name of the published version to load from
        runConfig (dict): runtime configuration to use when running the module
        quickRun (bool): skip computing dqm+metadata and persisting csv

    Returns:
        (DataFrame, SmvRunInfoCollector) tuple
        - DataFrame is the computed result of the module
        - SmvRunInfoCollector contains additional information
          about the run, such as validation results.
    """
    # TODO call setDynamicRunConfig() here not on scala side
    client = self.j_smvPyClient
    j_version = self.scalaOption(version)
    j_outcome = client.runModuleByName(name, forceRun, j_version, runConfig, quickRun)

    # Wrap both halves of the JVM result in their Python counterparts.
    df = DataFrame(j_outcome.df(), self.sqlContext)
    run_info = SmvRunInfoCollector(j_outcome.collector())
    return (df, run_info)
def get_runinfo(self):
    """Build a SmvRunInfoCollector by visiting the modules of this runner.

    For every module visited (with `need_to_run_only=True`), the module's
    fqn, its metadata and its metadata history are added to the collector.

    Returns:
        SmvRunInfoCollector
    """
    run_info = SmvRunInfoCollector()

    def _record(mod, coll):
        # History is read through the app; metadata comes from the module.
        history = self.smvApp._read_meta_hist(mod)
        coll.add_runinfo(mod.fqn(), mod._get_metadata(), history)

    self.visitor.dfs_visit(_record, run_info, need_to_run_only=True)
    return run_info
def runModuleByName(self, name, forceRun=False, version=None, runConfig=None):
    """Run a SmvModule identified by its name (can be a partial FQN).

    See the `runModule` method for the FQN/URN based variant.

    Args:
        name (str): module name; can be a partial FQN
        forceRun: forwarded unchanged to the JVM client
        version: wrapped with `self.scalaOption` before being forwarded
        runConfig: forwarded unchanged to the JVM client

    Returns:
        (DataFrame, SmvRunInfoCollector) tuple
        - DataFrame is the computed result of the module
        - SmvRunInfoCollector contains additional information
          about the run, such as validation results.
    """
    # TODO call setDynamicRunConfig() here not on scala side
    client = self.j_smvPyClient
    j_version = self.scalaOption(version)
    j_outcome = client.runModuleByName(name, forceRun, j_version, runConfig)

    # Wrap both halves of the JVM result in their Python counterparts.
    df = DataFrame(j_outcome.df(), self.sqlContext)
    run_info = SmvRunInfoCollector(j_outcome.collector())
    return (df, run_info)
def runModule(self, urn, forceRun=False, version=None, runConfig=None):
    """Run either a Scala or a Python SmvModule by its Fully Qualified Name (fqn).

    The call goes through j_smvPyClient instead of j_smvApp directly so the
    SmvRunCollector does not have to be constructed from the python side.

    Args:
        urn (str): identifier of the module, e.g.
            'mod:package.module.SmvModuleClass'
        forceRun: forwarded unchanged to the JVM client
        version: wrapped with `self.scalaOption` before being forwarded
        runConfig: forwarded unchanged to the JVM client

    Example:
        To get just the dataframe of the module:
            dataframe = smvApp.runModule('mod:package.module.SmvModuleClass')[0]
        To get both the dataframe and the run info collector:
            dataframe, collector = smvApp.runModule('mod:package.module.SmvModuleClass')

    Returns:
        (DataFrame, SmvRunInfoCollector) tuple
        - DataFrame is the computed result of the module
        - SmvRunInfoCollector contains additional information
          about the run, such as validation results.
    """
    # TODO call setDynamicRunConfig() here not on scala side
    client = self.j_smvPyClient
    j_version = self.scalaOption(version)
    j_outcome = client.runModule(urn, forceRun, j_version, runConfig)

    # Wrap both halves of the JVM result in their Python counterparts.
    df = DataFrame(j_outcome.df(), self.sqlContext)
    run_info = SmvRunInfoCollector(j_outcome.collector())
    return (df, run_info)
def runModule(self, urn, forceRun=False, version=None, runConfig=None, quickRun=False):
    """Run either a Scala or a Python SmvModule by its Fully Qualified Name (fqn).

    The call goes through j_smvPyClient instead of j_smvApp directly so the
    SmvRunCollector does not have to be constructed from the python side.

    Args:
        urn (str): The URN of a module
        forceRun (bool): True if the module should be forced to run even if
            it has persisted output. False otherwise.
        version (str): The name of the published version to load from
        runConfig (dict): runtime configuration to use when running the module
        quickRun (bool): skip computing dqm+metadata and persisting csv

    Example:
        To get just the dataframe of the module:
            dataframe = smvApp.runModule('mod:package.module.SmvModuleClass')[0]
        To get both the dataframe and the run info collector:
            dataframe, collector = smvApp.runModule('mod:package.module.SmvModuleClass')

    Returns:
        (DataFrame, SmvRunInfoCollector) tuple
        - DataFrame is the computed result of the module
        - SmvRunInfoCollector contains additional information
          about the run, such as validation results.
    """
    # TODO call setDynamicRunConfig() here not on scala side
    client = self.j_smvPyClient
    j_version = self.scalaOption(version)
    j_outcome = client.runModule(urn, forceRun, j_version, runConfig, quickRun)

    # Wrap both halves of the JVM result in their Python counterparts.
    df = DataFrame(j_outcome.df(), self.sqlContext)
    run_info = SmvRunInfoCollector(j_outcome.collector())
    return (df, run_info)
def quick_run(self, forceRun=False):
    """Run the root modules through `_create_df` with `is_quick_run=True`.

    Unlike `run`, no set of pending post actions is tracked (an empty set
    is passed) and `_force_post` is not called.

    Args:
        forceRun (bool): forwarded positionally to `_create_df`.

    Returns:
        (list, SmvRunInfoCollector) tuple
        - list holds the `data` attribute of every module in `self.roots`
        - SmvRunInfoCollector is created here and is not passed to
          `_create_df`
    """
    known_dfs = {}
    run_info = SmvRunInfoCollector()

    # NOTE(review): `run` (as it appears in this file) passes a collector as
    # the third positional argument of `_create_df`; here `forceRun` occupies
    # that position. Confirm against `_create_df`'s actual signature.
    self._create_df(known_dfs, set(), forceRun, is_quick_run=True)

    root_data = [root.data for root in self.roots]
    return (root_data, run_info)