def create_smv_pyclient(self, arglist):
    """Build an SmvPyClient through the JVM-side factory and return it.

    Args:
        arglist: iterable of command-line style arguments; they are
            unpacked into ``smv_copy_array`` to obtain the Java String
            array that the factory expects.

    Returns:
        Whatever ``SmvPyClientFactory.init`` returns for the converted
        arguments and the wrapped SQL context.
    """
    # The factory takes a Java array, not a Python list.
    jvm_args = smv_copy_array(self.sc, *arglist)
    factory = self._jvm.org.tresamigos.smv.python.SmvPyClientFactory
    return factory.init(jvm_args, self.sqlContext._ssql_ctx)
def _moduleUrnsForStage(self, stageName, fn):
    """Collect the URNs of the objects in a stage that satisfy ``fn``.

    Every name yielded by ``iter_submodules([stageName])`` that lies under
    the stage package is imported, and each attribute of the resulting
    module is tested with ``fn``.

    Args:
        stageName (str): dotted name of the stage package to scan.
        fn (callable): predicate applied to every attribute of every
            scanned module; objects for which it is truthy are candidates.

    Returns:
        The result of ``smv_copy_array`` applied to the collected URNs
        (presumably a Java array -- confirm against ``smv_copy_array``).
    """
    import importlib

    # The trailing "." prevents false positives, e.g. a module named
    # "stage_2.M1" must not be attributed to the stage "stage".
    stage_prefix = stageName + "."

    urns = []
    # Only walk the packages found under the stage itself, recursively.
    for name in iter_submodules([stageName]):
        if not name.startswith(stage_prefix):
            continue

        # import_module returns the leaf module directly, unlike
        # __import__, which returns the top-level package.
        pymod = importlib.import_module(name)

        for attr_name in dir(pymod):
            obj = getattr(pymod, attr_name)
            try:
                # A module namespace also holds everything that was
                # imported into it. Requiring the object's fqn to start
                # with this module's name excludes those, so each object
                # is counted only once.
                if fn(obj) and obj.fqn().startswith(name):
                    urns.append(obj.urn())
            except AttributeError:
                # Objects lacking fqn()/urn() are not of interest here.
                continue

    return smv_copy_array(self.smvApp.sc, *urns)
def smvStrCat(head, *others):
    """Join several columns into one string column.

    Comparable to Spark's `concat` / `concat_ws`, except for null
    handling: the Spark functions return null if any of the inputs is
    null, whereas smvStrCat returns null only if all of the inputs are
    null and otherwise treats null columns as blank.

    Two call forms are accepted:

    - smvStrCat(sep, col1, col2, ...)
    - smvStrCat(col1, col2, ...)

    Args:
        sep (String): separator placed between the concatenated values;
            defaults to the empty string when the first argument is a Column
        col. (Column): columns to be concatenated

    Return:
        (col): a StringType column

    Raises:
        RuntimeError: if the first argument is neither a String nor a Column
    """
    head_is_separator = is_string(head)
    if not (head_is_separator or isinstance(head, Column)):
        raise RuntimeError("first parameter must be either a String or a Column")

    if head_is_separator:
        separator, columns = head, list(others)
    else:
        # No separator supplied: the first argument is itself a column.
        separator, columns = "", list((head,) + others)

    app = SmvApp.getInstance()
    helper = app._jvm.org.tresamigos.smv.python.SmvPythonHelper
    java_columns = smv_copy_array(app.sc, *columns)
    return Column(helper.smvStrCat(separator, java_columns))
def smvHashKey(head, *others):
    """Build a unique key as a prefix followed by the MD5 of the given columns.

    The result is "Prefix" + MD5 hex string (a 32-character string)
    computed on the concatenated columns.

    Why MD5 is sufficient: its collision rate on real data records can be
    ignored, based on the discussion at
    https://marc-stevens.nl/research/md5-1block-collision/

    The shortest known messages sharing an MD5 are 512-bit (64-byte)
    messages, namely

    4dc968ff0ee35c209572d4777b721587d36fa7b21bdc56b74a3dc0783e7b9518afbfa200a8284bf36e8e4b55b35f427593d849676da0d1555d8360fb5f07fea2

    and (differing by two bits)

    4dc968ff0ee35c209572d4777b721587d36fa7b21bdc56b74a3dc0783e7b9518afbfa202a8284bf36e8e4b55b35f427593d849676da0d1d55d8360fb5f07fea2

    which both hash to 008ee33a9d58b51cfeb425b0959121c9.

    Other such pairs exist, but all of them are carefully constructed.
    In theory, random collisions appear when the data size approaches
    2^64 (MD5 being 128-bit), far beyond the number of records normally
    handled (a billion is about 2^30). Therefore hashing primary key
    columns with MD5 is good enough for creating a unique key.

    Two call forms are accepted:

    - smvHashKey(prefix, col1, col2, ...)
    - smvHashKey(col1, col2, ...)

    Args:
        prefix (String): prefix of the returned string; defaults to the
            empty string when the first argument is a Column
        col. (Column): columns to be part of the hash

    Return:
        (col): a StringType column as Prefix + MD5 Hex string

    Raises:
        RuntimeError: if the first argument is neither a String nor a Column
    """
    head_is_prefix = is_string(head)
    if not (head_is_prefix or isinstance(head, Column)):
        raise RuntimeError("first parameter must be either a String or a Column")

    if head_is_prefix:
        prefix, columns = head, list(others)
    else:
        # No prefix supplied: the first argument is itself a key column.
        prefix, columns = "", list((head,) + others)

    app = SmvApp.getInstance()
    helper = app._jvm.org.tresamigos.smv.python.SmvPythonHelper
    java_columns = smv_copy_array(app.sc, *columns)
    return Column(helper.smvHashKey(prefix, java_columns))
def dependencyUrns(self):
    """Return the URNs of this object's dependencies via ``smv_copy_array``.

    Each item returned by ``self.dependencies()`` contributes its
    ``urn()``; the collected values are unpacked into ``smv_copy_array``
    together with the application's Spark context.
    """
    urns = []
    for dependency in self.dependencies():
        urns.append(dependency.urn())
    return smv_copy_array(self.smvApp.sc, *urns)
def dataSetsForStage(self, stageName):
    """Return the dataset URNs of a stage via ``smv_copy_array``.

    Args:
        stageName: name of the stage, forwarded unchanged to
            ``self._dataSetsForStage``.

    Returns:
        The result of ``smv_copy_array`` applied to the URNs produced by
        ``self._dataSetsForStage(stageName)``.
    """
    stage_urns = self._dataSetsForStage(stageName)
    spark_context = self.smvApp.sc
    return smv_copy_array(spark_context, *stage_urns)