def colsWithLabel(self, labels=None):
    """Return names of all columns whose labels contain all specified labels.

    When ``labels`` is empty or None, return the names of unlabeled columns.

    Raises:
        SmvRuntimeError: when no column matches the criteria
    """
    def labelMatched(meta):
        colLabels = set(_getMetaLabels(meta))
        if labels:
            # labels given: column must carry every one of them
            return set(labels) <= colLabels
        # no labels given: match only columns carrying no label at all
        return not colLabels

    matched = [field.name for field in self.fields if labelMatched(field.metadata)]
    if matched:
        return matched

    if labels:
        raise SmvRuntimeError("there are no columns labeled with {{{}}} in {}"\
            .format(", ".join(labels), self.df))
    raise SmvRuntimeError("there are no unlabeled columns in {}"\
        .format(self.df))
def infer_full_name_from_part(full_names, part_name):
    """Infer the unique full name that ends with the given partial name.

    Args:
        full_names (list(str)): candidate full names
        part_name (str): postfix to match

    Returns:
        (str) the single full name ending with part_name

    Raises:
        SmvRuntimeError: when no candidate, or more than one candidate, matches
    """
    matches = [name for name in full_names if name.endswith(part_name)]
    if len(matches) == 1:
        return matches[0]
    if not matches:
        raise SmvRuntimeError("Can't find name {}".format(part_name))
    raise SmvRuntimeError("Partial name {} is ambiguous".format(part_name))
def run_delayed_postAction(mod, state):
    """Run the delayed post-action phase for one module, if still pending.

    NOTE(review): this def references ``self`` (self.fqn(), self.module_meta),
    so it appears to be a closure nested inside an instance method — confirm
    enclosing scope before moving it.

    Args:
        mod: the module whose post-action should run
        state: (run_set, collector) pair; run_set tracks modules whose
            post-action has not run yet, coll collects run info
    """
    (_run_set, coll) = state
    if (mod in _run_set):
        smv.logger.debug("Run post_action of {} from {}".format(
            mod.fqn(), self.fqn()))
        mod._post_action()
        meta_io_strategy = mod.metaStrategy()
        if (not_persisted_or_no_edd_when_forced(meta_io_strategy)):
            # data cache should be populated by this step
            if (mod.data is None):
                raise SmvRuntimeError(
                    "Module {}'s data is None, can't run postAction".
                    format(mod.fqn()))
            # Since the ancestor list will be visited as depth-first, although
            # user_meta may trigger actions, the upper stream modules' post action
            # are already run. No need to call _run_ancestor_and_me_postAction
            # in the calculate_user_meta() any more
            mod._calculate_user_meta()
            mod._finalize_meta()
            mod._validate_meta()
            mod._persist_meta()
            mod._collect_runinfo_and_update_hist(coll)
        else:
            # meta already persisted: just load it back instead of recomputing
            meta_json = meta_io_strategy.read()
            self.module_meta = SmvMetaData().fromJson(meta_json)
        # mark this module's post-action as done
        _run_set.discard(mod)
def _all_providers(self): """scans user libraries and smv libraries for "provider" classes. Returns list of discovered provider classes """ def is_provider(klass): """A class is a provider if it has `IS_PROVIDER` and is not the base `SmvProvider` which returns empty string for provider type. """ try: klass_is_provider = (klass.IS_PROVIDER is True) and (klass.provider_type()) except AttributeError: klass_is_provider = False return klass_is_provider # providers can be in user libs dir or builtin smv prov_libs_names = self.smvApp.userLibs() + self.smvApp.smvLibs() prov_dict = {} for prov_lib_name in prov_libs_names: prov_lib = self.load_pymodule(prov_lib_name) providers = self._matchingClassesInPyModule(prov_lib, is_provider) for p in providers: p_fqn = p.provider_type_fqn() if p_fqn in prov_dict: raise SmvRuntimeError( "multiple providers with same fqn: " + p_fqn) prov_dict[p_fqn] = p return prov_dict
def _assure_output_type(self, run_output):
    """Verify that the run method's output is an h2o.H2OFrame.

    Raises:
        SmvRuntimeError: when run_output is not an H2OFrame
    """
    # TODO move this back to top
    import h2o
    if isinstance(run_output, h2o.H2OFrame):
        return
    raise SmvRuntimeError(
        'The run method output should be an H2OFrame, but {} is given.'
        .format(type(run_output)))
def getStageFromModuleFqn(self, fqn):
    """Returns the stage name for a given fqn

    Raises:
        SmvRuntimeError: when the fqn is not under any configured stage
    """
    matched = [stage for stage in self.stages() if fqn.startswith(stage + ".")]
    if not matched:
        raise SmvRuntimeError("Can't find {} from stages {}".format(
            fqn, ", ".join(self.stages())))
    # first matching stage wins
    return matched[0]
def all_data_dirs(self):
    """Assemble all the data dir configs.

    Base data dir resolution order: command line, then merged props, then the
    deprecated DATA_DIR environment variable.

    Returns:
        (dict) keys: dataDir, inputDir, outputDir, lockDir, historyDir,
        publishDir, publishVersion

    Raises:
        SmvRuntimeError: when no data dir is configured anywhere
    """
    props = self.merged_props()

    if self.cmdline.get('dataDir'):
        data_dir = self.cmdline.get('dataDir')
    elif props.get('smv.dataDir'):
        data_dir = props.get('smv.dataDir')
    elif os.getenv('DATA_DIR', None):
        data_dir = os.getenv('DATA_DIR')
        print("WARNING: use of DATA_DIR environment variable is deprecated. use smv.dataDir instead!!!")
    else:
        raise SmvRuntimeError("Must specify a data-dir either on command line or in conf.")

    def sub_dir(name, default):
        # command line wins over props; fall back to <data_dir>/<default>
        from_cmdline = self.cmdline.get(name)
        if from_cmdline:
            return from_cmdline
        from_props = props.get('smv.' + name)
        if from_props:
            return from_props
        return "{}/{}".format(data_dir, default)

    return {
        'dataDir': data_dir,
        'inputDir': sub_dir('inputDir', "input"),
        'outputDir': sub_dir('outputDir', "output"),
        'lockDir': sub_dir('lockDir', "lock"),
        'historyDir': sub_dir('historyDir', "history"),
        'publishDir': sub_dir('publishDir', 'publish'),
        'publishVersion': self.cmdline.get('publish')
    }
def dependencies(self):
    """Return the model module followed by all other required datasets.

    Raises:
        SmvRuntimeError: when requiresModel() does not return an SmvModel
    """
    model = self.requiresModel()
    if self._targetIsSmvModel(model):
        return [model] + self.requiresDS()
    raise SmvRuntimeError(
        "requiresModel method must return an SmvModel or a link to one"
    )
def smvGetRunConfig(self, key):
    """Return the current user run configuration value for the given key.

    Args:
        key (str): a key declared in requiresConfig()

    Raises:
        SmvRuntimeError: when the key was not declared in requiresConfig()
    """
    if key not in self.requiresConfig():
        # Fix: the original format string read "...method{}." which rendered
        # as "method['k']." with no separator; add ": " before the list.
        raise SmvRuntimeError(
            "RunConfig key {} was not specified in requiresConfig method: {}."
            .format(key, self.requiresConfig()))
    return self.smvApp.getConf(key)
def metadataJson(self, jdf):
    """Get user's metadata and jsonify it for py4j transport

    Raises:
        SmvRuntimeError: when the user's metadata() does not return a dict
    """
    user_meta = self.metadata(DataFrame(jdf, self.smvApp.sqlContext))
    if not isinstance(user_meta, dict):
        raise SmvRuntimeError("User metadata {} is not a dict".format(repr(user_meta)))
    return json.dumps(user_meta)
def write(self, smvSchema):
    """Persist the schema (with meta) to self._file_path via JVM HDFS helper.

    Only "overwrite" write mode is supported.

    Raises:
        SmvRuntimeError: for any write mode other than overwrite
    """
    lines = scala_seq_to_list(self.smvApp._jvm, smvSchema.toStringsWithMeta())
    schema_str = "\n".join(lines)
    if self._write_mode.lower() != "overwrite":
        raise SmvRuntimeError("Write mode {} is not implemented yet. (Only support overwrite)".format(self._write_mode))
    # overwrite: clear the existing file before writing
    self._remove()
    self.smvApp._jvm.SmvHDFS.writeToFile(schema_str, self._file_path)
def validateMetadataJson(self, currentJson, historyJson):
    """Load metadata (jsonified for py4j transport) and run user's validation on it

    Raises:
        SmvRuntimeError: when the validation result is neither None nor a string
    """
    current_meta = json.loads(currentJson)
    history_meta = [json.loads(entry) for entry in historyJson]
    message = self.validateMetadata(current_meta, history_meta)
    if message is not None and not is_string(message):
        raise SmvRuntimeError("Validation failure message {} is not a string".format(repr(message)))
    return message
def _validate_meta(self):
    """Run user's metadata validation against the metadata history.

    Raises:
        SmvRuntimeError: when validation returns a non-None, non-string value
        SmvMetadataValidationError: when validation returns a non-empty message
    """
    hist = self.smvApp._read_meta_hist(self)
    message = self.validateMetadata(self.module_meta, hist)
    if message is None:
        return
    if not is_string(message):
        raise SmvRuntimeError(
            "Validation failure message {} is not a string".format(
                repr(message)))
    if len(message) > 0:
        raise SmvMetadataValidationError(message)
def write(self, raw_data):
    """Save the given DataFrame as CSV to self._file_path.

    Only "overwrite" write mode is supported.

    Raises:
        SmvRuntimeError: for any write mode other than overwrite
    """
    jdf = raw_data._jdf
    if self._write_mode.lower() != "overwrite":
        raise SmvRuntimeError("Write mode {} is not implemented yet. (Only support overwrite)".format(self._write_mode))
    # overwrite: clear the existing output before writing
    self._remove()
    handler = self.smvApp.j_smvPyClient.createFileIOHandler(self._file_path)
    handler.saveAsCsv(jdf, self._smv_schema)
def _assert_single_input(self): """Make sure SmvOutput only depends on a single module This method will not be called, when SmvOutput is used for mixin. It should be called by the doRun method when SmvOutput is used for base class """ if (len(self.requiresDS()) != 1): raise SmvRuntimeError("SmvOutput modules depend on a single input, more are given: {}"\ .format(", ".join([m.fqn() for m in self.requiresDS()])) )
def _checkColExistence(self, colNames): """Check if the given column names exist in the DataFrame Will throw if some of the column names are not found Args: colNames (list(string)) a list of column names to check """ invalidCols = set(colNames) - set(self.df.columns) if invalidCols: raise SmvRuntimeError("{} does not have columns {}".format( self.df, ", ".join(invalidCols)))
def _calculate_user_meta(self): """Calculate user defined metadata could have action on the result df """ self.module_meta.addSystemMeta(self) (user_meta, self.userMetadataTimeElapsed) = self._do_action_on_df( self.metadata, self.data, "GENERATE USER METADATA") if not isinstance(user_meta, dict): raise SmvRuntimeError("User metadata {} is not a dict".format( repr(user_meta))) self.module_meta.addUserMeta(user_meta)
def get_connection_by_name(self, name):
    """Get connection instance from name

    The connection type must be configured as smv.conn.<name>.type; the
    matching provider class is looked up as "conn.<type>".

    Raises:
        SmvRuntimeError: when no type is configured for the connection name
    """
    props = self.py_smvconf.merged_props()
    type_key = "smv.conn.{}.type".format(name)
    if type_key not in props:
        raise SmvRuntimeError(
            "Connection name {} is not configured with a type".format(
                name))
    conn_class = self.get_provider_by_fqn("conn.{}".format(props.get(type_key)))
    return conn_class(name, props)
def resolveDataSet(self, ds):
    """Return cached resolved version of given SmvGenericModule if it exists,
    or resolve it otherwise.

    Raises:
        SmvRuntimeError: when a dependency cycle is detected
    """
    fqn = ds.fqn()
    if fqn in self.resolveStack:
        # Re-entering a module that is mid-resolution means a dependency cycle
        raise SmvRuntimeError("Cycle found while resolving {}: {}".format(
            fqn, ", ".join(self.resolveStack)))
    if fqn in self.fqn2res:
        return self.fqn2res.get(fqn)
    # track the in-flight resolution so cycles can be detected
    self.resolveStack.append(fqn)
    resolved = ds._resolve(self)
    resolved._setTimestamp(self.transaction_time)
    self.fqn2res.update({fqn: resolved})
    self.resolveStack.pop()
    return resolved
def _getMetaByName(self, colName): """Returns the metadata of the first column that matches the column name Will throw if there's no column matching the specified name Args: colName (string) the name of the column that is being looked for Returns: (dict) the metadata of the given column """ try: meta = next(col.metadata for col in self.fields if col.name == colName) except StopIteration: raise SmvRuntimeError("column name {} not found".format(colName)) return meta
def loadDataSet(self, fqns):
    """Given a list of FQNs, return cached resolved version SmvGenericModules
    if exists, or otherwise load unresolved version from source and resolve them.

    Raises:
        SmvRuntimeError: when an fqn cannot be loaded from the repo
    """
    resolved_list = []
    for fqn in fqns:
        # Caller need to check whether the fqn is in a stage of the SmvConfig stages
        if fqn in self.fqn2res:
            resolved_list.append(self.fqn2res.get(fqn))
            continue
        mod = self.repo.loadDataSet(fqn)
        if mod is None:
            raise SmvRuntimeError(
                "Module {} does not exist".format(fqn))
        resolved_list.append(self.resolveDataSet(mod))
    return resolved_list
def get_connection(self):
    """Get data connection instance from connectionName()

    Connection should be configured in conf file with at least a class FQN
    Ex: smv.conn.con_name.class=smv.conn.SmvJdbcConnectionInfo

    Raises:
        SmvRuntimeError: when the configured connection's type does not match
            the type this module requires
    """
    name = self.connectionName()
    conn = self.smvApp.get_connection_by_name(name)
    # check whether the connection provided by name has the type as expected
    actual_type = conn.provider_type()
    expected_type = self.connectionType()
    if actual_type != expected_type:
        raise SmvRuntimeError(
            "Connection {} has type {}, while {} need connection type {}".
            format(name, actual_type, self.__class__.__name__,
                   expected_type))
    return conn
def addDesc(self, *colDescs):
    """Adds column descriptions

    Args:
        *colDescs: (name, description) pairs

    Raises:
        SmvRuntimeError: when no pair is provided
    """
    if not colDescs:
        raise SmvRuntimeError(
            "must provide (name, description) pair to add")
    # validate the referenced columns before touching any metadata
    self._checkColExistence([pair[0] for pair in colDescs])
    descByName = dict(colDescs)
    return self._updateColMeta(
        lambda col: col.name in descByName,
        lambda col: _setMetaDesc(col.metadata, descByName[col.name]))
def addLabel(self, colNames, labels):
    """Adds labels to the specified columns

    If colNames are empty, adds the same set of labels to all columns

    Raises:
        SmvRuntimeError: when labels is empty
    """
    if not labels:
        raise SmvRuntimeError("must provide a list of labels to add")
    if colNames:
        self._checkColExistence(colNames)
    targets = set(colNames) if colNames else None

    def shouldUpdate(col):
        # no colNames means every column receives the labels
        return targets is None or col.name in targets

    return self._updateColMeta(
        shouldUpdate,
        lambda col: _setMetaLabel(col.metadata, labels))
def readAsDF(self, readerLogger):
    """Read every non-hidden file in the data dir and union them into one DataFrame.

    Raises:
        SmvRuntimeError: when the data dir contains no data files
    """
    base_path = self.fullPath()
    # ignore all hidden files in the data dir
    data_files = [
        "{}/{}".format(base_path, name)
        for name in self.smvApp.j_smvPyClient.getDirList(base_path)
        if not name.startswith(".")
    ]
    if not data_files:
        raise SmvRuntimeError("There are no data files in {}".format(base_path))
    combined = None
    for file_path in data_files:
        jdf = self.smvApp.j_smvPyClient.readCsvFromFile(
            file_path, self.smvSchema(), self.csvAttr(), readerLogger)
        combined = jdf if combined is None else combined.unionAll(jdf)
    return DataFrame(combined, self.smvApp.sqlContext)
def _all_providers(self): """scans user libraries and smv libraries for "provider" classes. Returns list of discovered provider classes """ def is_provider(klass): """A class is a provider if it has `IS_PROVIDER` and is not the base `SmvProvider` which returns empty string for provider type. """ try: klass_is_provider = (klass.IS_PROVIDER is True) and (klass.provider_type()) except AttributeError: klass_is_provider = False return klass_is_provider # providers can be in user libs dir or builtin smv prov_libs_names = self.smvApp.userLibs() + self.smvApp.semiLibs( ) + self.smvApp.smvLibs() prov_dict = {} for prov_lib_name in prov_libs_names: try: prov_lib = self.load_pymodule(prov_lib_name) except Exception as err: # ignore the prov_lib_name if there is any loading error traceback.print_exc() message = "{0}({1!r})".format(type(err).__name__, err.args) smv.logger.debug("Ignoring {} because it has error: {}".format( prov_lib_name, message)) continue providers = self._matchingClassesInPyModule(prov_lib, is_provider, skip_abs=False) for p in providers: p_fqn = p.provider_type_fqn() if p_fqn in prov_dict: raise SmvRuntimeError( "multiple providers with same fqn: " + p_fqn) prov_dict[p_fqn] = p return prov_dict
def doRun(self, known):
    """Read every non-hidden csv file under the connection's dir and union them.

    Raises:
        SmvRuntimeError: when the dir contains no data files
    """
    dir_path = os.path.join(self.get_connection().path, self.dirName())
    smv_schema = self.smvSchema()
    file_names = self.smvApp._jvm.SmvHDFS.dirList(dir_path).array()
    # ignore all hidden files in the data dir
    data_files = [
        os.path.join(dir_path, n) for n in file_names if not n.startswith(".")
    ]
    if not data_files:
        raise SmvRuntimeError(
            "There are no data files in {}".format(dir_path))
    logger = self._readerLogger()
    combined = None
    for file_path in data_files:
        df = SmvCsvOnHdfsIoStrategy(self.smvApp, file_path, smv_schema,
                                    logger).read()
        combined = df if combined is None else combined.unionAll(df)
    return combined
def __init__(self, arglist, _sparkSession, py_module_hotload=True):
    """Bootstrap the SMV app: JVM gateway, config, caches, and providers.

    Args:
        arglist: command-line style argument list forwarded to SmvConfig
        _sparkSession: an existing SparkSession, or None to stand up a bare
            py4j gateway (JVM helpers available, no Spark)
        py_module_hotload (bool): hot-reload flag stored for module loading

    Raises:
        SmvRuntimeError: if the SMV_HOME environment variable is not set
    """
    self.smvHome = os.environ.get("SMV_HOME")
    if (self.smvHome is None):
        raise SmvRuntimeError("SMV_HOME env variable not set!")

    self.sparkSession = _sparkSession

    if (self.sparkSession is not None):
        # Spark available: wire up contexts and the JVM-side SMV client
        sc = self.sparkSession.sparkContext
        sc.setLogLevel("ERROR")
        self.sc = sc
        self.sqlContext = self.sparkSession._wrapped
        self._jvm = sc._jvm
        self.j_smvPyClient = self._jvm.org.tresamigos.smv.python.SmvPyClientFactory.init(
            self.sparkSession._jsparkSession)
        self.j_smvApp = self.j_smvPyClient.j_smvApp()
    else:
        # No Spark: launch a bare py4j gateway so JVM helpers still work
        _gw = launch_gateway(None)
        self._jvm = _gw.jvm

    self.py_module_hotload = py_module_hotload

    # Make the JVM-side helper classes visible to py4j
    java_import(self._jvm, "org.tresamigos.smv.ColumnHelper")
    java_import(self._jvm, "org.tresamigos.smv.SmvDFHelper")
    java_import(self._jvm, "org.tresamigos.smv.dqm.*")
    java_import(self._jvm, "org.tresamigos.smv.panel.*")
    java_import(self._jvm, "org.tresamigos.smv.python.SmvPythonHelper")
    java_import(self._jvm, "org.tresamigos.smv.SmvHDFS")
    java_import(self._jvm, "org.tresamigos.smv.DfCreator")

    self.smvSchemaObj = self._jvm.SmvPythonHelper.getSmvSchema()

    self.py_smvconf = SmvConfig(arglist, self._jvm)

    # configure spark sql params
    if (self.sparkSession is not None):
        for k, v in self.py_smvconf.spark_sql_props().items():
            self.sqlContext.setConf(k, v)

    # issue #429 set application name from smv config
    if (self.sparkSession is not None):
        sc._conf.setAppName(self.appName())

    # CmdLine is static, so can be an attribute
    cl = self.py_smvconf.cmdline
    self.cmd_line = namedtuple("CmdLine", cl.keys())(*cl.values())

    # shortcut is meant for internal use only
    self.dsm = DataSetMgr(self._jvm, self.py_smvconf)

    # computed df cache, keyed by m.versioned_fqn
    self.data_cache = {}

    # AFTER app is available but BEFORE stages,
    # use the dynamically configured app dir to set the source path, library path
    self.prependDefaultDirs()

    self.repoFactory = DataSetRepoFactory(self)
    self.dsm.register(self.repoFactory)

    # provider cache, keyed by providers' fqn
    self.provider_cache = {}
    self.refresh_provider_cache()

    # Initialize DataFrame and Column with helper methods
    smv.helpers.init_helpers()
def getInstance(cls):
    """Return the singleton SmvApp instance.

    Raises:
        SmvRuntimeError: when no instance has been created yet
    """
    if cls._instance is not None:
        return cls._instance
    raise SmvRuntimeError("An instance of SmvApp has not been created")
def jdbcDriver(self):
    """Return the configured JDBC driver class name.

    Raises:
        SmvRuntimeError: when smv.jdbc.driver is absent from merged props
    """
    driver = self.py_smvconf.merged_props().get('smv.jdbc.driver')
    if driver is None:
        raise SmvRuntimeError("JDBC driver is not specified in SMV config")
    return driver