Esempio n. 1
0
 def create_smv_pyclient(self, arglist):
     """Build an SmvPyClient through the JVM-side factory.

     Args:
         arglist: iterable of arguments; it is unpacked into
             `smv_copy_array` to obtain the Java-side array.

     Returns:
         whatever `SmvPyClientFactory.init` returns for the converted
         arguments and the underlying `_ssql_ctx` of `self.sqlContext`.
     """
     # The factory expects a Java array rather than a Python list
     jvm_arg_array = smv_copy_array(self.sc, *arglist)
     client_factory = self._jvm.org.tresamigos.smv.python.SmvPyClientFactory
     return client_factory.init(jvm_arg_array, self.sqlContext._ssql_ctx)
Esempio n. 2
0
 def create_smv_pyclient(self, arglist):
     """Build an SmvPyClient through the JVM-side factory.

     Args:
         arglist: iterable of arguments; it is unpacked into
             `smv_copy_array` to obtain the Java-side array.

     Returns:
         whatever `SmvPyClientFactory.init` returns for the converted
         arguments and the underlying `_ssql_ctx` of `self.sqlContext`.
     """
     # The factory expects a Java array rather than a Python list
     jvm_arg_array = smv_copy_array(self.sc, *arglist)
     client_factory = self._jvm.org.tresamigos.smv.python.SmvPyClientFactory
     return client_factory.init(jvm_arg_array, self.sqlContext._ssql_ctx)
Esempio n. 3
0
    def _moduleUrnsForStage(self, stageName, fn):
        """Collect the URNs of objects found in a stage's modules that satisfy `fn`.

        Every submodule name yielded by `iter_submodules([stageName])` that
        lies under `stageName` is imported, and each attribute of that module
        is tested with `fn`.

        Args:
            stageName (str): dotted name of the stage package to scan
            fn (callable): predicate applied to every attribute of each module

        Returns:
            the result of `smv_copy_array` applied to the collected URNs
        """
        buf = []
        # The additional "." is necessary to prevent false positives,
        # e.g. stage_2.M1 would otherwise match stage
        prefix = stageName + "."
        # import the stage and only walk the packages in the path of that stage, recursively
        for name in iter_submodules([stageName]):
            if not name.startswith(prefix):
                continue

            # `__import__` returns the top-level package for a dotted name, so
            # walk down the remaining components to reach the module itself
            pymod = __import__(name)
            for c in name.split('.')[1:]:
                pymod = getattr(pymod, c)

            for n in dir(pymod):
                try:
                    # Some modules (e.g. Gtk) are not designed to be used with
                    # reflection or introspection and can raise AttributeError
                    # on lookup; the attribute fetch is therefore guarded too,
                    # and such names are simply skipped.
                    obj = getattr(pymod, n)
                    # Class should have an fqn which begins with the module name.
                    # Each package will contain among other things all of
                    # the modules that were imported into it, and we need
                    # to exclude these (so that we only count each module once)
                    if fn(obj) and obj.fqn().startswith(name):
                        buf.append(obj.urn())
                except AttributeError:
                    continue

        return smv_copy_array(self.smvApp.sc, *buf)
Esempio n. 4
0
def smvStrCat(head, *others):
    """Concatenate several columns into a single string column.

       Comparable to Spark's `concat` and `concat_ws`, but with different
       null handling: the Spark functions yield null as soon as any input is
       null, whereas smvStrCat yields null only when every input is null and
       otherwise treats null columns as blank.

       Two call forms are accepted:
       - smvStrCat(sep, col1, col2, ...)
       - smvStrCat(col1, col2, ...)

       Args:
           sep (String): separator placed between the concatenated values;
               an empty string is used when the first argument is a Column
           col. (Column): columns to be concatenated

       Return:
           (col): a StringType column

       Raises:
           RuntimeError: if the first argument is neither a string nor a Column
    """
    head_is_separator = is_string(head)
    if not head_is_separator and not isinstance(head, Column):
        raise RuntimeError("first parameter must be either a String or a Column")

    if head_is_separator:
        separator = head
        columns = list(others)
    else:
        # No separator supplied: the first argument is itself a column
        separator = ""
        columns = list((head,) + others)

    smv_app = SmvApp.getInstance()
    helper = smv_app._jvm.org.tresamigos.smv.python.SmvPythonHelper
    return Column(helper.smvStrCat(separator, smv_copy_array(smv_app.sc, *columns)))
Esempio n. 5
0
def smvHashKey(head, *others):
    """Build a unique key as an optional prefix followed by the MD5 of the columns.

    The result is "Prefix" + the 32-character MD5 hex string computed over
    the concatenated columns.

    Why MD5 collisions can be ignored for real data records:

    https://marc-stevens.nl/research/md5-1block-collision/
    The shortest known messages sharing an MD5 are 512-bit (64-byte)
    messages, namely

    4dc968ff0ee35c209572d4777b721587d36fa7b21bdc56b74a3dc0783e7b9518afbfa200a8284bf36e8e4b55b35f427593d849676da0d1555d8360fb5f07fea2
    and (differing by two bits)
    4dc968ff0ee35c209572d4777b721587d36fa7b21bdc56b74a3dc0783e7b9518afbfa202a8284bf36e8e4b55b35f427593d849676da0d1d55d8360fb5f07fea2
    which both hash to
    008ee33a9d58b51cfeb425b0959121c9

    Other such pairs exist, but all of them are carefully constructed.
    Theoretically, random collisions become likely only as the data size
    approaches 2^64 (MD5 being a 128-bit hash), which is far beyond the
    record counts we deal with (a billion is about 2^30). Hashing the
    primary key columns with MD5 is therefore good enough to produce a
    unique key.

    Two call forms are accepted:
    - smvHashKey(prefix, col1, col2, ...)
    - smvHashKey(col1, col2, ...)

    Args:
     prefix (String): prefix of the returned string; an empty string is
         used when the first argument is a Column
     col. (Column): columns to be part of the hash

    Return:
     (col): a StringType column as Prefix + MD5 Hex string

    Raises:
     RuntimeError: if the first argument is neither a string nor a Column
    """
    head_is_prefix = is_string(head)
    if not head_is_prefix and not isinstance(head, Column):
        raise RuntimeError("first parameter must be either a String or a Column")

    if head_is_prefix:
        key_prefix = head
        key_columns = list(others)
    else:
        # No prefix supplied: the first argument is itself a column
        key_prefix = ""
        key_columns = list((head,) + others)

    smv_app = SmvApp.getInstance()
    helper = smv_app._jvm.org.tresamigos.smv.python.SmvPythonHelper
    return Column(helper.smvHashKey(key_prefix, smv_copy_array(smv_app.sc, *key_columns)))
Esempio n. 6
0
 def dependencyUrns(self):
     """Return the URNs of this object's dependencies via `smv_copy_array`.

     Each item returned by `self.dependencies()` contributes its `urn()`,
     in the same order.
     """
     urns = []
     for dependency in self.dependencies():
         urns.append(dependency.urn())
     spark_ctx = self.smvApp.sc
     return smv_copy_array(spark_ctx, *urns)
Esempio n. 7
0
 def dataSetsForStage(self, stageName):
     """Return the result of `_dataSetsForStage(stageName)` via `smv_copy_array`.

     Args:
         stageName: stage identifier forwarded to `_dataSetsForStage`
     """
     stage_urns = self._dataSetsForStage(stageName)
     spark_ctx = self.smvApp.sc
     return smv_copy_array(spark_ctx, *stage_urns)