def exportToHive(dsname):
    """Publish a dataset's running result to a Hive table

        Delegates to ``publishModuleToHiveByName`` on the singleton SmvApp.

        Args:
            dsname (str): The name of an SmvModule
    """
    smv_app = SmvApp.getInstance()
    smv_app.publishModuleToHiveByName(dsname)
def smvDiscoverSchemaToFile(path, n=100000, ca=None):
    """Best-effort schema discovery from a raw CSV file

        A schema file with postfix ".toBeReviewed" is saved in the local
        directory.

        Args:
            path (str): Path to the CSV file
            n (int): Number of records to check for schema discovery, default 100k
            ca (CsvAttributes): Defaults to CsvWithHeader
    """
    helper = SmvApp.getInstance()._jvm.SmvPythonHelper
    # A falsy `ca` (including the default None) selects the app's
    # header-style CSV attributes.
    if ca:
        csv_attrs = ca
    else:
        csv_attrs = SmvApp.getInstance().defaultCsvWithHeader()
    helper.smvDiscoverSchemaToFile(path, n, csv_attrs)
def __init__(self, leftId, rightId, exactMatchFilter, groupCondition, levelLogics):
    """Create the JVM-side matcher and keep it in ``self.jem``

        Args:
            leftId: passed through to ``SmvPythonHelper.createMatcher``
            rightId: passed through to ``SmvPythonHelper.createMatcher``
            exactMatchFilter: passed through to ``SmvPythonHelper.createMatcher``
            groupCondition: passed through to ``SmvPythonHelper.createMatcher``
            levelLogics: sequence of level logic objects; copied element by
                element into a JVM ``LevelLogic`` array
    """
    smv_app = SmvApp.getInstance()
    gateway = smv_app.sc._gateway
    level_logic_cls = smv_app._jvm.org.tresamigos.smv.matcher.LevelLogic

    # The JVM entry point needs a typed Java array, not a Python list
    java_levels = gateway.new_array(level_logic_cls, len(levelLogics))
    for idx, logic in enumerate(levelLogics):
        java_levels[idx] = logic

    helper = smv_app._jvm.org.tresamigos.smv.python.SmvPythonHelper
    self.jem = helper.createMatcher(
        leftId, rightId, exactMatchFilter, groupCondition, java_levels)
def openCsv(path, validate=False):
    """Load a CSV file into a DataFrame

        Args:
            path (str): The path of the CSV file
            validate (bool): If true, validate the CSV before return DataFrame
                (raise error if malformatted)

        Returns:
            (DataFrame): The resulting DataFrame
    """
    smv_app = SmvApp.getInstance()
    java_df = smv_app.j_smvPyClient.shellOpenCsv(path, validate)
    # Wrap the JVM DataFrame with the app's SQL context
    return DataFrame(java_df, smv_app.sqlContext)
def smvStrCat(head, *others):
    """Concatenate multiple columns to a single string.

        Similar to the `concat` and `concat_ws` functions in Spark, but with
        different null handling. The Spark version returns null if any of the
        inputs is null. smvStrCat returns null only if all of the inputs are
        null; otherwise null columns are coalesced to blank.

        Two call forms are accepted:

        - smvStrCat(sep, col1, col2, ...)
        - smvStrCat(col1, col2, ...)

        Args:
            sep (String): separator placed between the concatenated columns
            col. (Column): columns to be concatenated

        Returns:
            (Column): a StringType column

        Raises:
            RuntimeError: if the first argument is neither a string nor a Column
    """
    head_is_sep = isinstance(head, basestring)
    if not head_is_sep and not isinstance(head, Column):
        raise RuntimeError(
            "first parameter must be either a String or a Column")

    if head_is_sep:
        separator = head
        columns = list(others)
    else:
        # No separator given: the first argument is itself a column
        separator = ""
        columns = [head] + list(others)

    smv_app = SmvApp.getInstance()
    helper = smv_app._jvm.org.tresamigos.smv.python.SmvPythonHelper
    return Column(helper.smvStrCat(separator, smv_copy_array(smv_app.sc, *columns)))
def create_smv_app(self, smv_args, driver_args):
    """Override this to define how this driver's SmvApp is created

        This implementation builds a Hive-enabled SparkSession configured from
        the kernel config file properties, then calls `SmvApp.createInstance`
        with it. Note that it's important to use `createInstance` to ensure
        that the singleton app is set.

        SmvDriver will parse the full CLI args to distinguish the SMV args
        from the args to your driver.

        Args:
            smv_args (list(str)): CLI args for SMV - should be passed to `SmvApp`
            driver_args (list(str)): CLI args for the driver
    """
    builder = SparkSession.builder.enableHiveSupport()

    # Properties read from the kernel config file become Spark conf entries
    kernel_props = SmvConfig(smv_args).read_props_from_kernel_config_file()
    for prop_name in kernel_props:
        prop_value = kernel_props.get(prop_name)
        if prop_name != 'master':
            builder = builder.config(prop_name, prop_value)
        else:
            # A 'master' entry in the config file selects the Spark master
            builder = builder.master(prop_value)

    session = builder.getOrCreate()

    # When SmvDriver is in use, the user calls smv-run and interacts through
    # the command line, so python module hotload is not needed
    return SmvApp.createInstance(smv_args, session, py_module_hotload=False)
def getStageFromFqn(fqn):
    '''Return the stage of a dataset given the dataset's fqn

        Args:
            fqn (str): fully qualified name of the dataset

        Returns:
            The stage from ``SmvApp.getStageFromModuleFqn``, utf-8 encoded

        Raises:
            ValueError: if the stage lookup (or its encoding) fails
    '''
    try:
        stage = SmvApp.getInstance().getStageFromModuleFqn(fqn).encode("utf-8")
    except Exception:
        # Deliberately not a bare `except:` so that KeyboardInterrupt and
        # SystemExit propagate instead of being reported as a bad fqn.
        raise ValueError("Could not retrive stage with the given fqn: " + str(fqn))
    return stage
def openCsv(path, validate=False):
    """Load a CSV file into a DataFrame

        A throw-away `SmvCsvInputFile` subclass bound to `path` and `validate`
        is defined and run.

        Args:
            path (str): The path of the CSV file
            validate (bool): If true, validate the CSV before return DataFrame
                (raise error if malformatted)

        Returns:
            (DataFrame): The resulting DataFrame
    """
    smv_app = SmvApp.getInstance()

    class TmpCsv(SmvCsvInputFile):
        # The file to read is the path given to openCsv
        def fileName(self):
            return path

        # Parsing errors are fatal only when validation was requested
        def failAtParsingError(self):
            return validate

        def connectionName(self):
            return None

        def get_connection(self):
            return SmvHdfsEmptyConn

    tmp_input = TmpCsv(smv_app)
    return tmp_input.doRun(None)
def doMatch(self, df1, df2, keepOriginalCols=True):
    """Run this `SmvEntityMatcher` against two DataFrames

        Args:
            df1 (DataFrame): DataFrame 1 with an id column with name "id"
            df2 (DataFrame): DataFrame 2 with an id column with name "id"
            keepOriginalCols (boolean): whether to keep all input columns of
                df1 and df2, defaults to true

        Example:
            code::

                SmvEntityMatcher("id", "_id",
                    ExactMatchPreFilter("Full_Name_Match", col("full_name") == col("_full_name")),
                    GroupCondition(soundex("first_name") == soundex("_first_name")),
                    [
                        ExactLogic("First_Name_Match", col("first_name") == col("_first_name")),
                        FuzzyLogic("Levenshtein_City", lit(True), normlevenshtein(col("city"),col("_city")), 0.9)
                    ]
                ).doMatch(df1, df2, False)

        Returns:
            (DataFrame): a DataFrame with df1's id and df2's id and match flags
                of all the levels. For levels with fuzzy logic, the matching
                score is also provided. A column named "MatchBitmap" is also
                provided to summarize all the matching flags. When
                keepOriginalCols is true, input columns are also kept
    """
    # The JVM matcher works on the underlying Java DataFrames
    java_result = self.jem.doMatch(df1._jdf, df2._jdf, keepOriginalCols)
    sql_ctx = SmvApp.getInstance().sqlContext
    return DataFrame(java_result, sql_ctx)
def test_readSchemaWhenFileExist(self):
    """A schema file next to the data path is read with entries and CSV attributes"""
    test_cls = self.__class__
    smv_app = SmvApp.getInstance()

    schema_name = "schemaToBeRead1.schema"
    schema_path = os.path.join(test_cls.tmpInputDir(), schema_name)
    schema_text = "\n".join([
        '@delimiter = ,',
        '@has-header = true',
        '@quote-char = "',
        'a: String',
        'b: Integer',
    ])
    self.createTempInputFile(schema_name, schema_text)

    # The reader is handed the data path; only the schema file was created
    csv_path = schema_path.replace(".schema", ".csv")
    smv_schema = smv_app.j_smvPyClient.readSchemaFromDataPathAsSmvSchema(csv_path)
    entry_strs = smv_schema.getEntriesStr()
    csv_attrs = smv_schema.extractCsvAttributes()

    self.assertEqual(len(entry_strs), 2)
    self.assertEqual(entry_strs[0], 'a: String')
    self.assertEqual(entry_strs[1], 'b: Integer')
    self.assertTrue(csv_attrs.hasHeader())
    self.assertEqual(csv_attrs.delimiter(), ',')
    self.assertEqual(csv_attrs.quotechar(), '"')
def get_graph_json():
    '''
    body: none
    function: return the json of the entire dependency graph, wrapped under
              the "graph" key of the response
    '''
    graph_json = SmvApp.getInstance().get_graph_json()
    return jsonify(graph=graph_json)
def getFqnsInApp():
    """Return all known module FQNs in the app. Note: links are excluded"""
    repo = DataSetRepoFactory(SmvApp.getInstance()).createRepo()

    # One list of URNs per stage
    urns_by_stage = []
    for stage in getStagesInApp():
        urns_by_stage.append(repo.dataSetsForStage(stage))

    # Flatten, keeping the part after the first ":" (drops the "mod:" prefix)
    fqns = []
    for stage_urns in urns_by_stage:
        for urn in stage_urns:
            fqns.append(urn.split(":")[1])
    return fqns
def props():
    """The current app properties used by SMV after the app, user,
        command-line and dynamic props are merged.

        Returns:
            (dict): The 'mergedProps' or final props used by SMV
    """
    smv_app = SmvApp.getInstance()
    return smv_app.getCurrentProperties()
def getH2oContext():
    """Get or create the hc (H2OContext) from the current sparkSession.

        Use this instead of h2o.init()
    """
    session = SmvApp.getInstance().sparkSession
    # Imported locally so pysparkling is only needed when this is called
    import pysparkling
    return pysparkling.H2OContext.getOrCreate(session)
def get_run_info(name, runConfig=None):
    """Get the SmvRunInfoCollector with full information about a module and
        its dependencies

        Args:
            name (str): name of the module whose information to collect
            runConfig (dict): runConfig to apply when collecting info. If the
                module was run with a config, the same config needs to be
                specified here to retrieve the info.
    """
    smv_app = SmvApp.getInstance()
    return smv_app.getRunInfoByPartialName(name, runConfig)
def test_readSchemaWhenFileNotExist(self):
    """Reading a schema for a data path with no schema file yields None"""
    test_cls = self.__class__
    smv_app = SmvApp.getInstance()

    # No schema file is created for this name
    schema_name = "schemaToBeRead2.schema"
    schema_path = os.path.join(test_cls.tmpInputDir(), schema_name)
    csv_path = schema_path.replace(".schema", ".csv")

    smv_schema = smv_app.j_smvPyClient.readSchemaFromDataPathAsSmvSchema(csv_path)
    self.assertIsNone(smv_schema)
def openHive(tableName):
    """Load a Hive table into a DataFrame

        Args:
            tableName (str): The name of the Hive table

        Returns:
            (DataFrame): The resulting DataFrame
    """
    java_df = _jvmShellCmd().openHive(tableName)
    sql_ctx = SmvApp.getInstance().sqlContext
    return DataFrame(java_df, sql_ctx)
def setUpClass(cls):
    """Set up the shared Spark contexts and the SmvApp used by the test class"""
    cls.sparkContext = TestConfig.sparkContext()
    cls.sqlContext = TestConfig.sqlContext()
    cls.sparkContext.setLogLevel("ERROR")

    # A random callback-server port is picked for this test class
    import random
    cbs_port = random.randint(20000, 65535)

    base_args = TestConfig.smv_args()
    init_args = cls.smvAppInitArgs()
    extra_args = ['--cbs-port', str(cbs_port), '--data-dir', cls.DataDir]
    cls.smvApp = SmvApp.createInstance(
        base_args + init_args + extra_args, cls.sparkContext, cls.sqlContext)
def openCsv(path):
    """Load a CSV file into a DataFrame

        Args:
            path (str): The path of the CSV file

        Returns:
            (DataFrame): The resulting DataFrame
    """
    java_df = _jvmShellCmd().openCsv(path)
    sql_ctx = SmvApp.getInstance().sqlContext
    return DataFrame(java_df, sql_ctx)
def dshash(name):
    """The current hashOfHash for the named module

        Args:
            name (str): The unique name of a module. Does not have to be the FQN.

        Returns:
            The hashOfHash of the named module, exactly as returned by
            ``SmvApp.getDsHash``.
            NOTE(review): earlier documentation described this both as a hex
            string and as an int - confirm the actual type.
    """
    smv_app = SmvApp.getInstance()
    return smv_app.getDsHash(name)
def smvCollectSet(col, datatype):
    """An aggregate function which collects all the values of the given
        column and creates a set as an array typed column.

        Spark 1.6 introduced its own collect_set function, so once migrated to
        Spark 1.6 or later this smvCollectSet will be deprecated.

        Args:
            col (Column): column to be aggregated on
            datatype (DataType): datatype of the input column
    """
    helper = SmvApp.getInstance()._jvm.org.tresamigos.smv.python.SmvPythonHelper
    # The data type crosses to the JVM as its JSON representation
    java_col = helper.smvCollectSet(col._jc, datatype.json())
    return Column(java_col)
def __init__(self):
    """Parse the server options, create the SmvApp singleton and start the server"""
    server_opts = self.parseArgs()

    # init Smv context
    smv_app = SmvApp.createInstance([])

    # to reduce complexity in SmvApp, keep the rest server single-threaded
    app.run(
        host=server_opts.ip,
        port=int(server_opts.port),
        threaded=False,
        processes=1)
def run_module():
    '''
    body: fqn = 'xxx' (fqn)
    function: run the module named by the "fqn" form field and return the
              stringified run result in an ok response
    '''
    try:
        module_fqn = request.form['fqn'].encode("utf-8")
    except Exception:
        # Any failure to read the field is reported as "module not provided".
        # Not a bare `except:` so KeyboardInterrupt/SystemExit still propagate.
        raise err_res('MODULE_NOT_PROVIDED_ERR')
    run_result = SmvApp.getInstance().runModule("mod:{}".format(module_fqn))
    return ok_res(str(run_result))
def __init__(self):
    """Parse the server options, create the SmvApp singleton on a Hive-enabled
        SparkSession and start the server
    """
    server_opts = self.parseArgs()

    # init Smv context
    session_builder = SparkSession.builder.enableHiveSupport()
    spark_session = session_builder.getOrCreate()
    smv_app = SmvApp.createInstance([], spark_session)

    # to reduce complexity in SmvApp, keep the rest server single-threaded
    app.run(
        host=server_opts.ip,
        port=int(server_opts.port),
        threaded=False,
        processes=1)
def df(name, forceRun=False, version=None):
    """The DataFrame result of running the named module

        Args:
            name (str): The name of a module. Does not have to be the FQN.
            forceRun (bool): True if the module should be forced to run even
                if it has persisted output. False otherwise.
            version (str): The name of the published version to load from

        Returns:
            (DataFrame): The result of running the named module.
    """
    smv_app = SmvApp.getInstance()
    return smv_app.runModuleByName(name, forceRun, version)
def df(name, forceRun=False, quickRun=True):
    """The DataFrame result of running the named module

        Args:
            name (str): The unique name of a module. Does not have to be the FQN.
            forceRun (bool): True if the module should be forced to run even
                if it has persisted output. False otherwise.
            quickRun (bool): skip computing dqm+metadata and persisting csv

        Returns:
            (DataFrame): The result of running the named module.
    """
    run_result = SmvApp.getInstance().runModuleByName(name, forceRun, quickRun)
    # Only the first element of the run result is returned
    return run_result[0]
def nGram3(c1, c2):
    """3-gram UDF with formula
        (number of overlapped gramCnt)/max(s1.gramCnt, s2.gramCnt)

        Args:
            c1 (Column): first column
            c2 (Column): second column

        Returns:
            (Column): 3-gram
    """
    smv_funcs = SmvApp.getInstance()._jvm.org.tresamigos.smv.smvfuncs
    java_col = smv_funcs.nGram3(c1._jc, c2._jc)
    return Column(java_col)
def dshash(name):
    """The current hashOfHash for the named module

        The lookup uses the module name only; this function takes no runConfig
        argument.

        Args:
            name (str): The unique name of a module. Does not have to be the FQN.

        Returns:
            The hashOfHash of the named module, exactly as returned by
            ``SmvApp.getDsHash``.
            NOTE(review): earlier documentation described this both as a hex
            string and as an int - confirm the actual type.
    """
    smv_app = SmvApp.getInstance()
    return smv_app.getDsHash(name)
def df(name, forceRun = False, version = None, runConfig = None):
    """The DataFrame result of running the named module

        Args:
            name (str): The unique name of a module. Does not have to be the FQN.
            forceRun (bool): True if the module should be forced to run even
                if it has persisted output. False otherwise.
            version (str): The name of the published version to load from
            runConfig (dict): runtime configuration to use when running the module

        Returns:
            (DataFrame): The result of running the named module.
    """
    smv_app = SmvApp.getInstance()
    run_result = smv_app.runModuleByName(name, forceRun, version, runConfig)
    # Only the first element of the run result is returned
    return run_result[0]
def normlevenshtein(c1, c2):
    """Levenshtein edit distance metric UDF

        Args:
            c1 (Column): first column
            c2 (Column): second column

        Returns:
            (Column): distances
    """
    smv_funcs = SmvApp.getInstance()._jvm.org.tresamigos.smv.smvfuncs
    java_col = smv_funcs.normlevenshtein(c1._jc, c2._jc)
    return Column(java_col)
def create_smv_app(self, smv_args, driver_args):
    """Override this to define how this driver's SmvApp is created

        The default simply calls SmvApp.createInstance(smv_args). Note that
        it's important to use `createInstance` to ensure that the singleton
        app is set.

        SmvDriver will parse the full CLI args to distinguish the SMV args
        from the args to your driver.

        Args:
            smv_args (list(str)): CLI args for SMV - should be passed to `SmvApp`
            driver_args (list(str)): CLI args for the driver (not used by the
                default implementation)
    """
    smv_app = SmvApp.createInstance(smv_args)
    return smv_app
def jaroWinkler(c1, c2):
    """Jaro-Winkler edit distance metric UDF

        Args:
            c1 (Column): first column
            c2 (Column): second column

        Returns:
            (Column): distances
    """
    smv_funcs = SmvApp.getInstance()._jvm.org.tresamigos.smv.smvfuncs
    java_col = smv_funcs.jaroWinkler(c1._jc, c2._jc)
    return Column(java_col)
def getModel(name, forceRun = False):
    """Get the result of running the named SmvModel module

        Args:
            name (str): The name of a module. Does not have to be the FQN.
            forceRun (bool): True if the module should be forced to run even
                if it has persisted output. False otherwise.

        Returns:
            (object): The result of running the named module
    """
    smv_app = SmvApp.getInstance()
    # Resolve the possibly partial name to a full FQN first
    module_fqn = smv_app.dsm.inferFqn(name)
    return smv_app.getModuleResult(module_fqn, forceRun)
def openHive(tableName):
    """Load a Hive table into a DataFrame

        A throw-away `SmvHiveTable` subclass bound to `tableName` is defined
        and run.

        Args:
            tableName (str): The name of the Hive table

        Returns:
            (DataFrame): The resulting DataFrame
    """
    smv_app = SmvApp.getInstance()
    # Separate name so the method below does not read as self-referential
    hive_table_name = tableName

    class TmpHive(SmvHiveTable):
        def tableName(self):
            return hive_table_name

    java_df = TmpHive(smv_app).doRun(None)
    return DataFrame(java_df, smv_app.sqlContext)
def help():
    """Print a list of the SMV helper functions available in the shell

        Each name in ``__all__`` is listed with its signature, followed by a
        link to the documentation for the running SMV version.
    """
    shell_module = sys.modules[__name__]

    pieces = ["SMV shell commands:"]
    for func_name in __all__:
        func = getattr(shell_module, func_name)
        signature = formatargspec(*getargspec(func))
        pieces.append("\n* {}{}".format(func_name, signature))

    smv_version = SmvApp.getInstance().smvVersion()
    doc_url = ("http://tresamigossd.github.io/SMV/pythondocs/{}/smv.html#module-smv.smvshell"
        .format(smv_version))
    pieces.append("\nDocumentation may be found at " + doc_url)

    print("".join(pieces))
def run_test(test_name):
    """Run a test with the given name without creating new Spark context

        First reloads SMV and the test from source, then runs the test.

        Args:
            test_name (str): Name of the test to run
    """
    # Ensure TestConfig has a canonical SmvApp (this will eventually be used
    # to restore the singleton SmvApp)
    TestConfig.setSmvApp(SmvApp.getInstance())

    # Root name is everything before the first "." (the whole name if no dot)
    test_root_name = test_name.partition(".")[0]

    _clear_from_sys_modules(["smv", test_root_name])

    runner = SmvTestRunner("src/test/python")
    runner.run([test_name])
def getStagesInApp():
    """Return the list of all stages defined in the app"""
    smv_app = SmvApp.getInstance()
    return list(smv_app.stages())
def fullRun(name):
    """Run module and return result.

        Persist and run DQM if given. Calls ``runModuleByName`` with
        ``forceRun=False`` and ``quickRun=False`` and returns the first
        element of its result.
    """
    smv_app = SmvApp.getInstance()
    run_result = smv_app.runModuleByName(name, forceRun=False, quickRun=False)
    return run_result[0]
#
# This file is licensed under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from smv import SmvApp
import sys

# Launcher script: create the SmvApp singleton from the command-line
# arguments and run it. argv[0] is this program itself, so it is skipped.
app = SmvApp.createInstance(
    sys.argv[1:]
)
app.run()
def runModule(fqn, run_config=None):
    '''Run the module with the given fqn and runtime configuration

        Returns the first element of the result of ``SmvApp.runModule``.
    '''
    smv_app = SmvApp.getInstance()
    run_result = smv_app.runModule(fqn, runConfig=run_config)
    return run_result[0]
def getMetadataHistoryJson(fqn):
    '''Return the metadata history for the given fqn'''
    smv_app = SmvApp.getInstance()
    return smv_app.getMetadataHistoryJson(fqn)
def getDatasetInstance(fqn):
    '''Return the dataset object for the given fqn

        A fresh repo is created from the current SmvApp on every call.
    '''
    repo_factory = DataSetRepoFactory(SmvApp.getInstance())
    repo = repo_factory.createRepo()
    return repo.loadDataSet(fqn)
def _appInfo():
    """Build an SmvAppInfo for the current singleton SmvApp"""
    smv_app = SmvApp.getInstance()
    return SmvAppInfo(smv_app)
def svg_graph(*stageNames):
    """Build an SmvDependencyGraph for the current app

        Args:
            stageNames (str): optional stage names; when any are given they are
                passed to SmvDependencyGraph as a list, otherwise the graph is
                built from the app alone
    """
    if stageNames:
        return SmvDependencyGraph(SmvApp.getInstance(), list(stageNames))
    return SmvDependencyGraph(SmvApp.getInstance())
def quickRun(name):
    """Run module and return result.

        No persist, but use existing persisted if possible. No DQM. Calls
        ``runModuleByName`` with ``forceRun=False`` and ``quickRun=True`` and
        returns the first element of its result.
    """
    smv_app = SmvApp.getInstance()
    run_result = smv_app.runModuleByName(name, forceRun=False, quickRun=True)
    return run_result[0]
# Shell start-up fragment. `historyPath`, `save_history`, `sc` and `spark`
# are not defined in this fragment - presumably set up earlier in the file or
# by the hosting pyspark shell; verify against the full script.

# Restore readline history if a history file exists. An unreadable history
# file is deleted (best effort) rather than aborting shell start-up.
if os.path.exists(historyPath):
    try:
        readline.read_history_file(historyPath)
    except:
        os.remove(historyPath)
        print("Unable to read history, deleting history file.")

# Register save_history to run at interpreter exit
atexit.register(save_history)

# Import commonly used pyspark lib
import pyspark.sql.functions as F
import pyspark.sql.types as T

sc.setLogLevel("ERROR")

# The SMV arguments are read from the first line of a file in the current
# directory and split on whitespace
with open(".smv_shell_all_args") as fp:
    args = fp.readline()
user_args = args.split()
app = SmvApp.createInstance(user_args, spark)

# Bring the SMV shell helper functions into the interactive namespace
from smv.smvshell import *

# Import user-defined helpers
app.prepend_source("conf/")
if os.path.exists("conf/smv_shell_app_init.py"):
    from smv_shell_app_init import *

# Remove start-up-only names from the interactive namespace
del os, atexit, readline, rlcompleter, save_history, historyPath
def _jvmShellCmd():
    """Return the JVM-side ``org.tresamigos.smv.shell.ShellCmd`` handle"""
    jvm = SmvApp.getInstance()._jvm
    return jvm.org.tresamigos.smv.shell.ShellCmd