コード例 #1
0
def smvStrCat(head, *others):
    """Concatenate several columns into a single string column.

       Similar to Spark's `concat` and `concat_ws`, but with different null
       handling: Spark returns null as soon as any input is null, while
       smvStrCat returns null only when ALL inputs are null. Otherwise each
       null column is coalesced to the empty string before concatenation.

       This function can take 2 forms:
       - smvStrCat(sep, col1, col2, ...)
       - smvStrCat(col1, col2, ...)

       Args:
           sep (String): separator used between the concatenated values
           col. (Column): columns to be concatenated

       Return:
           (col): a StringType column
    """
    if isinstance(head, basestring):
        # first form: an explicit separator followed by columns
        sep, cols = head, list(others)
    elif isinstance(head, Column):
        # second form: no separator, every argument is a column
        sep, cols = "", [head] + list(others)
    else:
        raise RuntimeError(
            "first parameter must be either a String or a Column")
    app = SmvApp.getInstance()
    jcol = app._jvm.org.tresamigos.smv.python.SmvPythonHelper.smvStrCat(
        sep, smv_copy_array(app.sc, *cols))
    return Column(jcol)
コード例 #2
0
 def create_smv_pyclient(self, arglist):
     '''
     Build and return a smvPyClient instance.

     The python argument list is first converted to a java String array,
     then handed to the JVM-side factory together with the sql context.
     '''
     jargs = smv_copy_array(self.sc, *arglist)
     factory = self._jvm.org.tresamigos.smv.python.SmvPyClientFactory
     return factory.init(jargs, self.sqlContext._ssql_ctx)
コード例 #3
0
    def smvPivotCoalesce(self, pivotCols, valueCols, baseOutput):
        """Apply SmvPivot and coalesce the pivoted output.

            Args:
                pivotCols (list(list(str))): lists of names of column names to pivot
                valueCols (list(string)): names of value columns to coalesce
                baseOutput (list(str)): expected names pivoted column

            Returns:
                (Dataframe): result of pivot coalesce
        """
        sc = self.df._sc
        jres = self.sgd.smvPivotCoalesce(
            smv_copy_array(sc, *pivotCols),
            smv_copy_array(sc, *valueCols),
            smv_copy_array(sc, *baseOutput))
        return DataFrame(jres, self.df.sql_ctx)
コード例 #4
0
ファイル: smvpydataset.py プロジェクト: hubertp/SMV
    def dependencies(self):
        """Return the urns of this dataset's requirements as a java array.

        The try/except block is a short-term workaround (read: hack) so the
        user still gets a full stack trace when a user-defined SmvPyDataSet
        method raises.
        """
        try:
            urns = [ds.urn() for ds in self.requiresDS()]
            arr = smv_copy_array(self.smvApp.sc, *urns)
        except BaseException as err:
            traceback.print_exc()
            raise err

        return arr
コード例 #5
0
    def smvDedupByKey(self, *keys):
        """Drop duplicate records, arbitrarily keeping the first record among those sharing the same primary key or key combination.

            Args:
                keys (\*string or \*Column): column names or Columns that define the dedup key

            Example:
                input DataFrame:

                +-----+---------+---------+
                | id  | product | Company |
                +=====+=========+=========+
                | 1   | A       | C1      |
                +-----+---------+---------+
                | 1   | C       | C2      |
                +-----+---------+---------+
                | 2   | B       | C3      |
                +-----+---------+---------+
                | 2   | B       | C4      |
                +-----+---------+---------+

                >>> df.dedupByKey("id")

                output DataFrame:

                +-----+---------+---------+
                | id  | product | Company |
                +=====+=========+=========+
                | 1   | A       | C1      |
                +-----+---------+---------+
                | 2   | B       | C3      |
                +-----+---------+---------+

                >>> df.dedupByKey("id", "product")

                output DataFrame:

                +-----+---------+---------+
                | id  | product | Company |
                +=====+=========+=========+
                | 1   | A       | C1      |
                +-----+---------+---------+
                | 1   | C       | C2      |
                +-----+---------+---------+
                | 2   | B       | C3      |
                +-----+---------+---------+

            Returns:
                (DataFrame): a DataFrame without duplicates for the specified keys
        """
        jkeys = smv_copy_array(self._sc, *keys)
        deduped = self._jPythonHelper.smvDedupByKey(self._jdf, jkeys)
        return DataFrame(deduped, self._sql_ctx)
コード例 #6
0
    def smvRenameField(self, *namePairs):
        """Rename one or more columns of this `DataFrame`.

            Args:
                namePairs (\*tuple): (source name, target name) string pairs

            Example:
                >>> df.smvRenameField(("a", "aa"), ("c", "cc"))

            Returns:
                (DataFrame): the DataFrame with renamed fields
        """
        jpairs = smv_copy_array(self._sc, *namePairs)
        renamed = self._jPythonHelper.smvRenameField(self._jdf, jpairs)
        return DataFrame(renamed, self._sql_ctx)
コード例 #7
0
    def smvDesc(self, *colDescs):
        """Attach descriptions to columns.

            Args:
                colDescs (\*tuple): (column name, description) string pairs

            Example:
                >>> df.smvDesc(("a", "description of col a"), ("b", "description of col b"))

            Returns:
                (DataFrame): the DataFrame with column descriptions added
        """
        jdescs = smv_copy_array(self._sc, *colDescs)
        described = self._jPythonHelper.smvDesc(self._jdf, jdescs)
        return DataFrame(described, self._sql_ctx)
コード例 #8
0
    def smvRemoveDesc(self, *colNames):
        """Strip the description from the given columns of this Dataframe.

            Args:
                colNames (\*string): columns whose descriptions are removed

            Example:
                >>> df.smvRemoveDesc("col_a", "col_b")

            Returns:
                (DataFrame): the DataFrame with column descriptions removed
        """
        jnames = smv_copy_array(self._sc, *colNames)
        stripped = self._jPythonHelper.smvRemoveDesc(self._jdf, jnames)
        return DataFrame(stripped, self._sql_ctx)
コード例 #9
0
    def smvSelectMinus(self, *cols):
        """Drop one or more columns from the current DataFrame.

            Args:
                cols (\*string or \*Column): column names or Columns to drop

            Example:
                >>> df.smvSelectMinus("col1", "col2")
                >>> df.smvSelectMinus(col("col1"), col("col2"))

            Returns:
                (DataFrame): the resulting DataFrame after removal of columns
        """
        jcols = smv_copy_array(self._sc, *cols)
        remaining = self._jPythonHelper.smvSelectMinus(self._jdf, jcols)
        return DataFrame(remaining, self._sql_ctx)
コード例 #10
0
    def smvTopNRecs(self, maxElems, *cols):
        """Within each group, keep only the top N records under the given ordering.

            Example:
                # keeps the 3 largest amt records for each id
                df.smvGroupBy("id").smvTopNRecs(3, col("amt").desc())

            Args:
                maxElems (int): cap on the number of records kept per group
                cols (\*str): columns defining the ordering

            Returns:
                (DataFrame): result of taking top records from groups

        """
        ordering = smv_copy_array(self.df._sc, *cols)
        jres = self.sgd.smvTopNRecs(maxElems, ordering)
        return DataFrame(jres, self.df.sql_ctx)
コード例 #11
0
    def smvJoinMultipleByKey(self, keys, joinType = 'inner'):
        """Create a builder for joining multiple DataFrames by key.

            Used together with `joinWith` and `doJoin`.

            Args:
                keys (list(string)): column names on which to join
                joinType (string): one of ['inner', 'outer', 'leftouter', 'rightouter', 'leftsemi']

            Example:
                >>> df.joinMultipleByKey(["k1", "k2"], "inner").joinWith(df2, "_df2").joinWith(df3, "_df3", "leftouter").doJoin()

            Returns:
                (SmvMultiJoin): the builder object for the multi join operation
        """
        jkeys = smv_copy_array(self._sc, *keys)
        jbuilder = self._jPythonHelper.smvJoinMultipleByKey(self._jdf, jkeys, joinType)
        return SmvMultiJoin(self._sql_ctx, jbuilder)
コード例 #12
0
def smvHashKey(head, *others):
    """Create an MD5 hash over the concatenated columns.

    Returns "Prefix" + MD5 hex string (a 32-character string) as the unique key.

    MD5's collision rate on real data records can be ignored, based on the
    following discussion.

    https://marc-stevens.nl/research/md5-1block-collision/
    The shortest messages with the same MD5 are 512-bit (64-byte) messages as below

    4dc968ff0ee35c209572d4777b721587d36fa7b21bdc56b74a3dc0783e7b9518afbfa200a8284bf36e8e4b55b35f427593d849676da0d1555d8360fb5f07fea2
    and the (different by two bits)
    4dc968ff0ee35c209572d4777b721587d36fa7b21bdc56b74a3dc0783e7b9518afbfa202a8284bf36e8e4b55b35f427593d849676da0d1d55d8360fb5f07fea2
    both have MD5 hash
    008ee33a9d58b51cfeb425b0959121c9

    Other such pairs exist, but all are carefully constructed.
    Theoretically, random collisions appear only as data size approaches 2^64
    (since MD5 is 128-bit), far beyond the number of records we deal with
    (a billion is about 2^30). Therefore, hashing the primary key columns
    with MD5 is good enough for creating a unique key.

    This function can take 2 forms:
    - smvHashKey(prefix, col1, col2, ...)
    - smvHashKey(col1, col2, ...)

    Args:
     prefix (String): return string's prefix
     col. (Column): columns to be part of hash

    Return:
     (col): a StringType column as Prefix + MD5 Hex string
    """

    if isinstance(head, basestring):
        # first form: explicit prefix followed by columns
        pre, cols = head, list(others)
    elif isinstance(head, Column):
        # second form: no prefix, every argument is a column
        pre, cols = "", [head] + list(others)
    else:
        raise RuntimeError(
            "first parameter must be either a String or a Column")
    app = SmvApp.getInstance()
    jcol = app._jvm.org.tresamigos.smv.python.SmvPythonHelper.smvHashKey(
        pre, smv_copy_array(app.sc, *cols))
    return Column(jcol)
コード例 #13
0
    def smvPivotSum(self, pivotCols, valueCols, baseOutput):
        """Apply SmvPivot and sum the pivoted results.

            The caller must supply the list of expected pivot column output
            names, to avoid an extra action on the input DataFrame. If an
            empty sequence is provided, the base output columns will instead
            be extracted from the values in the pivot columns (which causes
            an action on the entire DataFrame!)

            Args:
                pivotCols (list(list(str))): lists of names of column names to pivot
                valueCols (list(string)): names of value columns to sum
                baseOutput (list(str)): expected names pivoted column

            Examples:
                For example, given a DataFrame df that represents the table

                +-----+-------+---------+-------+
                | id  | month | product | count |
                +=====+=======+=========+=======+
                | 1   | 5/14  |   A     |   100 |
                +-----+-------+---------+-------+
                | 1   | 6/14  |   B     |   200 |
                +-----+-------+---------+-------+
                | 1   | 5/14  |   B     |   300 |
                +-----+-------+---------+-------+

                we can use

                >>> df.smvGroupBy("id").smvPivotSum(Seq("month", "product"))("count")("5_14_A", "5_14_B", "6_14_A", "6_14_B")

                to produce the following output

                +-----+--------------+--------------+--------------+--------------+
                | id  | count_5_14_A | count_5_14_B | count_6_14_A | count_6_14_B |
                +=====+==============+==============+==============+==============+
                | 1   | 100          | 300          | NULL         | 200          |
                +-----+--------------+--------------+--------------+--------------+

            Returns:
                (DataFrame): result of pivot sum
        """
        sc = self.df._sc
        jres = self.sgd.smvPivotSum(
            smv_copy_array(sc, *pivotCols),
            smv_copy_array(sc, *valueCols),
            smv_copy_array(sc, *baseOutput))
        return DataFrame(jres, self.df.sql_ctx)
コード例 #14
0
    def smvExpandStruct(self, *cols):
        """Flatten struct-typed columns into one column per struct field.

            Args:
                cols (\*string): names of struct columns to expand

            Example:
                input DF:
                    [id: string, address: struct<state:string, zip:string, street:string>]

                >>> df.smvExpandStruct("address")

                output DF:
                    [id: string, state: string, zip: string, street: string]

            Returns:
                (DataFrame): DF with expanded columns
        """
        jcols = smv_copy_array(self._sc, *cols)
        expanded = self._jPythonHelper.smvExpandStruct(self._jdf, jcols)
        return DataFrame(expanded, self._sql_ctx)
コード例 #15
0
    def smvGroupBy(self, *cols):
        """Like groupBy, but produces an `SmvGroupedData` object instead of GroupedData.

            See [[org.tresamigos.smv.SmvGroupedDataFunc]] for the functions that can be applied to the grouped data.

            Args:
                cols (\*string or \*Column): column names or Column objects to group on

            Note:
                This is going away shortly and user will be able to use standard Spark `groupBy` method directly.

            Example:
                >>> df.smvGroupBy(col("k"))
                >>> df.smvGroupBy("k")

            Returns:
                (SmvGroupedData): grouped data object

        """
        jcols = smv_copy_array(self._sc, *cols)
        jgrouped = self._jPythonHelper.smvGroupBy(self._jdf, jcols)
        return SmvGroupedData(self.df, jgrouped)
コード例 #16
0
    def _moduleUrnsForStage(self, stageName, fn):
        """Collect the urns of all objects in a stage that satisfy predicate `fn`.

        Imports the stage package, recursively walks every python module
        under its path, and for each object accepted by `fn` whose fqn
        belongs to that module, records its urn. The urns are returned as a
        java array.

        Args:
            stageName (str): name of the stage package to scan
            fn (callable): predicate applied to each module attribute;
                accepted objects must expose `fqn()` and `urn()`

        Returns:
            java array of urn strings (possibly empty)
        """
        # `walk_packages` can generate AttributeError if the system has
        # Gtk modules, which are not designed to use with reflection or
        # introspection. Best action to take in this situation is probably
        # to simply suppress the error.
        def err(name):
            pass

        buf = []
        # import the stage and only walk the packages in the path of that stage, recursively
        try:
            stagemod = __import__(stageName)
        except ImportError:
            # may be a scala-only stage with no python package; any other
            # error (e.g. a syntax error in a user module) should propagate
            pass
        else:
            for loader, name, is_pkg in pkgutil.walk_packages(
                    stagemod.__path__, stagemod.__name__ + '.', onerror=err):
                # The additional "." is necessary to prevent false positive, e.g. stage_2.M1 matches stage
                if name.startswith(stageName + ".") and not is_pkg:
                    pymod = __import__(name)
                    # __import__ returns the top-level package; descend to
                    # the actual submodule
                    for c in name.split('.')[1:]:
                        pymod = getattr(pymod, c)

                    for n in dir(pymod):
                        obj = getattr(pymod, n)
                        try:
                            # Class should have an fqn which begins with the stageName.
                            # Each package will contain among other things all of
                            # the modules that were imported into it, and we need
                            # to exclude these (so that we only count each module once)
                            if fn(obj) and obj.fqn().startswith(name):
                                buf.append(obj.urn())
                        except AttributeError:
                            continue

        return smv_copy_array(self.smvApp.sc, *buf)
コード例 #17
0
 def _smvBinHist(self, *colWithBin):
     """Delegate a binned-histogram computation to the JVM helper.

     Args:
         colWithBin (\*tuple): each element is a (column name, bin size)
             tuple; the bin size is coerced to float before the JVM call.

     Returns:
         whatever the JVM-side `smvBinHist` returns
     """
     for elem in colWithBin:
         # validate the shape of each argument up front
         assert type(elem) is tuple, "smvBinHist takes a list of tuple(string, double) as parameter"
         assert len(elem) == 2, "smvBinHist takes a list of tuple(string, double) as parameter"
     # coerce every bin size to double so the JVM sees a consistent type
     insureDouble = [(name, binSize * 1.0) for (name, binSize) in colWithBin]
     return self._jPythonHelper.smvBinHist(self._jdf, smv_copy_array(self._sc, *insureDouble))
コード例 #18
0
 def _smvConcatHist(self, *cols):
     """Delegate a concatenated-column histogram computation to the JVM helper."""
     jcols = smv_copy_array(self._sc, *cols)
     return self._jPythonHelper.smvConcatHist(self._jdf, jcols)
コード例 #19
0
 def __doFill(*valueCols):
     # NOTE: `self` and `orderCols` are captured from the enclosing scope
     sc = self.df._sc
     jres = self.sgd.smvFillNullWithPrevValue(
         smv_copy_array(sc, *orderCols), smv_copy_array(sc, *valueCols))
     return DataFrame(jres, self.df.sql_ctx)
コード例 #20
0
 def _check(*dfothers):
     # NOTE: `self` and `keyColName` are captured from the enclosing scope
     jothers = smv_copy_array(self._sc, *dfothers)
     checked = self._jPythonHelper.smvOverlapCheck(self._jdf, keyColName, jothers)
     return DataFrame(checked, self._sql_ctx)
コード例 #21
0
 def _withOrder(*orderCols):
     # NOTE: `self` and `keys` are captured from the enclosing scope
     jkeys = smv_copy_array(self._sc, *keys)
     jorder = smv_copy_array(self._sc, *orderCols)
     deduped = self._jPythonHelper.smvDedupByKeyWithOrder(self._jdf, jkeys, jorder)
     return DataFrame(deduped, self._sql_ctx)