Example #1
from pyspark.sql.functions import col, explode

def flattenProductHierarchyRecursive(df):
    # "explode" creates a new row for each element in the given array or map column.
    # When exploding 'categories' yields no rows, there is nothing left to flatten.
    if df.select(explode('categories')).count() == 0:
        return df.select('parentId', 'childId', 'friendlyName')
    else:
        dfR = df.select('childId', explode('categories').alias('CatArray'))\
            .select(col('childId').alias('parentId'),
                    col('CatArray.id').alias('childId'),
                    col('CatArray.friendlyName').alias('friendlyName'),
                    col('CatArray.categories').alias('categories'))
    return df.select('parentId', 'childId', 'friendlyName')\
        .union(flattenProductHierarchyRecursive(dfR).select('parentId', 'childId', 'friendlyName'))
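A minimal usage sketch for the function above, assuming the nested categories JSON has already been loaded into `df` (as done in Example #26 below); the column names follow that example:

from pyspark.sql.functions import col

# Each top-level category starts out as its own parent.
dfRoot = df.select(
    col('id').alias('parentId'),
    col('id').alias('childId'), 'friendlyName', 'categories')

# Flatten nested category arrays into (parentId, childId, friendlyName) rows.
flattenProductHierarchyRecursive(dfRoot).show(truncate=False)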
Example #2
 def temporal_key_column(self) -> Column:
     """
     Fetch the temporal key column, if any.
     :return: Temporal key column, or None.
     """
     col = self._jrfctx.temporalKeyColumn(self._jdf)
     return col and Column(col)
Example #3
 def spatial_key_column(self) -> Column:
     """
     Fetch the tagged spatial key column.
     :return: Spatial key column
     """
     col = self._jrfctx.spatialKeyColumn(self._jdf)
     return Column(col)
Example #4
 def tile_columns(self) -> List[Column]:
     """
     Fetches columns of type Tile.
     :return: One or more Column instances associated with Tiles.
     """
     cols = self._jrfctx.tileColumns(self._jdf)
     return [Column(c) for c in cols]
Example #5
def __withField(self: Column, fieldName: str, fieldValue: Column):
    """
    An expression that adds/replaces a field by name in a `StructType`.
    If schema contains multiple fields with fieldName, they will all be replaced with fieldValue.
    """
    sc = SparkContext._active_spark_context
    _columnWithCustomMethods = sc._jvm.com.github.fqaiser94.mse.methods.ColumnWithCustomMethods(
        self._jc)
    _column = _columnWithCustomMethods.withField(fieldName, fieldValue._jc)
    return Column(_column)
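A hedged usage sketch for the helper above: it is meant to be attached to `Column` so it can be called as a method, and it requires the mse (`com.github.fqaiser94.mse`) JAR on the Spark classpath. The session setup and data here are illustrative only.

from pyspark.sql import Column, SparkSession
from pyspark.sql.functions import col, lit

spark = SparkSession.builder.getOrCreate()

# Attach the helper as a Column method (illustrative monkey-patch).
Column.withField = __withField

df = spark.createDataFrame([((1, 2),)], "a struct<x:int,y:int>")
# Add a field 'z' to the struct column 'a' (an existing 'z' would be replaced).
df.withColumn("a", col("a").withField("z", lit(3))).printSchema()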
Example #6
def at_least_n_distinct(col, limit):
    """Count distinct that works with windows

    The standard distinct count in spark sql can't be applied in
    a window. This implementation allows that to work
    """
    sc = SparkContext._active_spark_context
    j_cols = _to_seq(sc, [_to_java_column(col), _to_java_column(F.lit(limit))])
    jc = sc._jvm.org.wikimedia.search.mjolnir.AtLeastNDistinct().apply(j_cols)
    return Column(jc)
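A sketch of the windowed use this helper enables, assuming the Wikimedia mjolnir JAR (which provides `org.wikimedia.search.mjolnir.AtLeastNDistinct`) is on the classpath; the data and column names are illustrative.

from pyspark.sql import SparkSession, Window

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [('q1', 'u1'), ('q1', 'u2'), ('q2', 'u1')],
    ['query', 'user'])

# Flag queries issued by at least 2 distinct users, per query window.
w = Window.partitionBy('query')
df.withColumn('popular_query', at_least_n_distinct('user', 2).over(w)).show()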
Example #7
    def generate_uuid(self):
        """ Generate V4 UUID.

        Returns:
            Spark Column (StringType): containing v4 UUIDs.
        """
        sc = self.spark.sparkContext
        # noinspection PyUnresolvedReferences, PyProtectedMember
        _generate_uuid = sc._jvm.com.civicboost.spark.etl.utilities.GeneralUDFs.generateUUID_UDF()
        return Column(_generate_uuid.apply(_to_seq(sc, [], _to_java_column)))
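This and the other civicboost wrappers in this listing (`normalize_timestamp_dm`, `clean_string`, `normalize_date_md`, `empty_string_to_null`, `string_is_number`, `string_to_double_cfd`, `map_booleans_ynu`) are all called the same way. A hedged sketch, where `udfs` is a hypothetical instance of the wrapper class and the spark-etl-utilities JAR is assumed to be on the classpath:

# `udfs` is a hypothetical instance of the wrapper class exposing these methods.
df = udfs.spark.createDataFrame([(' Alice ',), ('',)], ['name'])
df = df.withColumn('row_id', udfs.generate_uuid())
df = df.withColumn('name_clean', udfs.clean_string(df['name']))
df.show(truncate=False)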
Example #8
def with_meta(self, alias, meta):
    """
    In pyspark 2.1 there is no simple way to change the metdata of a column, that only became available in pyspark 2.2.
    This is a function that takes a column and modifies its metadata.
    :param self: A pyspark column
    :param alias:
    :param meta: New meta data for the column
    """
    sc = SparkContext._active_spark_context
    jmeta = sc._gateway.jvm.org.apache.spark.sql.types.Metadata
    return Column(getattr(self._jc, "as")(alias, jmeta.fromJson(json.dumps(meta))))
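A small usage sketch for `with_meta`, reading the metadata back from the resulting schema (a minimal example, assuming an active `SparkSession` and that the imports the helper needs — `json`, `SparkContext`, `Column` — are in scope):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(42,)], ['age'])

# The helper aliases the column and attaches the metadata in one step.
df2 = df.select(with_meta(df['age'], 'age', {'comment': 'age in years'}))
print(df2.schema['age'].metadata)   # {'comment': 'age in years'}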
Example #9
 def _(*cols):
     jcontainer = self.get_java_container(
         package_name=package_name,
         object_name=object_name,
         java_class_instance=java_class_instance)
     # Convert each argument to a Java column before applying the UDF
     function = getattr(jcontainer, name)
     judf = function()
     jc = judf.apply(
         self.to_scala_seq([_to_java_column(c) for c in cols]))
     return Column(jc)
Example #10
def __dropFields(self: Column, *fieldNames: str):
    """
    An expression that drops fields by name in a `StructType`.
    This is a no-op if schema doesn't contain given field names.
    If schema contains multiple fields matching any one of the given fieldNames, they will all be dropped.
    """
    sc = SparkContext._active_spark_context
    _columnWithCustomMethods = sc._jvm.com.github.fqaiser94.mse.methods.ColumnWithCustomMethods(
        self._jc)
    _fieldNames = sc._jvm.PythonUtils.toSeq(fieldNames)
    _column = _columnWithCustomMethods.dropFields(_fieldNames)
    return Column(_column)
Example #11
    def normalize_timestamp_dm(self, target_col):
        """ Convert string to timestamp where DAY is BEFORE MONTH.

        Args:
            target_col (Spark Column): containing strings representing timestamps.

        Returns:
            Spark Column (TimestampType): containing timestamps converted from strings.
        """
        sc = self.spark.sparkContext
        # noinspection PyUnresolvedReferences, PyProtectedMember
        _normalize_timestamp_dm = sc._jvm.com.civicboost.spark.etl.utilities.DateTimeUDFs.normalizeTimestampDM_UDF()
        return Column(_normalize_timestamp_dm.apply(_to_seq(sc, [target_col], _to_java_column)))
Example #12
    def clean_string(self, target_col):
        """ Remove Java ISO control characters from, and trim, string.

        Args:
            target_col (Spark Column): target column to be cleaned.

        Returns:
            Spark Column (StringType): cleaned version of input column.
        """
        sc = self.spark.sparkContext
        # noinspection PyUnresolvedReferences, PyProtectedMember
        _clean_string = sc._jvm.com.civicboost.spark.etl.utilities.GeneralUDFs.cleanString_UDF()
        return Column(_clean_string.apply(_to_seq(sc, [target_col], _to_java_column)))
Example #13
    def normalize_date_md(self, target_col):
        """ Convert string to date where MONTH is BEFORE DAY.

        Args:
            target_col (Spark Column): containing strings representing dates.

        Returns:
            Spark Column (DateType): containing dates converted from strings.
        """
        sc = self.spark.sparkContext
        # noinspection PyUnresolvedReferences, PyProtectedMember
        _normalize_date_md = sc._jvm.com.civicboost.spark.etl.utilities.DateTimeUDFs.normalizeDateMD_UDF()
        return Column(_normalize_date_md.apply(_to_seq(sc, [target_col], _to_java_column)))
Example #14
    def empty_string_to_null(self, target_col):
        """ Convert empty strings to nulls.

        Args:
            target_col (Spark Column): target column to convert.

        Returns:
            Spark Column (StringType): target column with empty values converted to nulls.
        """
        sc = self.spark.sparkContext
        # noinspection PyUnresolvedReferences, PyProtectedMember
        _empty_string_to_null = sc._jvm.com.civicboost.spark.etl.utilities.GeneralUDFs.emptyStringToNull_UDF()
        return Column(_empty_string_to_null.apply(_to_seq(sc, [target_col], _to_java_column)))
Example #15
def add_struct_field(nestedStruct: str, fieldName: str, fieldValue: Column):
    """
    A convenience method for adding/replacing a field by name inside a deeply nested struct.

    :param nestedStruct : e.g. "a.b.c", where a, b, and c are StructType columns; a is the top-level StructType column and c is the StructType column in which to add/replace the field.
    :param fieldName    : The name of the StructField to add (if it does not already exist) or replace (if it already exists).
    :param fieldValue   : The value to assign to fieldName.
    :return: a copy of the top-level struct column (a) with the field added/replaced.
    """
    sc = SparkContext._active_spark_context
    _add_struct_field = sc._jvm.com.github.fqaiser94.mse.methods.add_struct_field
    _column = _add_struct_field(nestedStruct, fieldName, fieldValue._jc)
    return Column(_column)
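A hedged usage sketch, assuming the mse JAR is on the Spark classpath: it adds a field `d` inside the nested struct `a.b` and writes the rebuilt top-level struct back to column `a`. The session and data are illustrative.

from pyspark.sql import SparkSession
from pyspark.sql.functions import lit

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(((1,),),)], "a struct<b:struct<c:int>>")

# a.b gains a new field d; the returned column is the rebuilt top-level struct a.
df.withColumn("a", add_struct_field("a.b", "d", lit(2))).printSchema()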
Example #16
def __withFieldRenamed(self: Column, existingFieldName: str,
                       newFieldName: str):
    """
    An expression that renames a field by name in a `StructType`.
    This is a no-op if schema doesn't contain any field with existingFieldName.
    If schema contains multiple fields with existingFieldName, they will all be renamed to newFieldName.
    """
    sc = SparkContext._active_spark_context
    _columnWithCustomMethods = sc._jvm.com.github.fqaiser94.mse.methods.ColumnWithCustomMethods(
        self._jc)
    _column = _columnWithCustomMethods.withFieldRenamed(
        existingFieldName, newFieldName)
    return Column(_column)
Example #17
    def string_is_number(self, target_col):
        """ Return boolean if string can be converted to a number.

        Args:
            target_col (Spark Column): containing string to check for convertability to number.

        Returns:
            Spark Column (BooleanType): whether the string can be converted to a number.
        """
        sc = self.spark.sparkContext
        # noinspection PyUnresolvedReferences, PyProtectedMember
        _string_is_number = sc._jvm.com.civicboost.spark.etl.utilities.GeneralUDFs.stringIsNumber_UDF()
        return Column(_string_is_number.apply(_to_seq(sc, [target_col], _to_java_column)))
Example #18
    def string_to_double_cfd(self, target_col):
        """ Convert string to doubles where commas represents decimal places (`cfd`).

        Args:
            target_col (Spark Column): containing double values in string format.

        Returns:
            Spark Column (DoubleType): containing double values converted from strings.
        """
        sc = self.spark.sparkContext
        # noinspection PyUnresolvedReferences, PyProtectedMember
        _string_to_double = sc._jvm.com.civicboost.spark.etl.utilities.GeneralUDFs.stringToDoubleCommaForDecimal_UDF()
        return Column(_string_to_double.apply(_to_seq(sc, [target_col], _to_java_column)))
Example #19
    def map_booleans_ynu(self, target_col):
        """ Map boolean values to `Y`, `N`, `Unknown`

        Args:
            target_col (Spark Column): target column containing boolean values to map.

        Returns:
            Spark Column (StringType): mapped values (`Y`, `N`, `Unknown`)
        """
        sc = self.spark.sparkContext
        # noinspection PyUnresolvedReferences, PyProtectedMember
        _map_booleans_ynu = sc._jvm.com.civicboost.spark.etl.utilities.GeneralUDFs.mapBooleansYNU_UDF()
        return Column(_map_booleans_ynu.apply(_to_seq(sc, [target_col], _to_java_column)))
Example #20
 def _(*cols):
     jcontainer = self.get_java_container(
         package_name=package_name,
         object_name=object_name,
         java_class_instance=java_class_instance)
     # Ensure that your argument is a column
     col_args = [
         col._jc if isinstance(col, Column) else _make_col(col)._jc
         for col in cols
     ]
     function = getattr(jcontainer, name)
     args = col_args
     jc = function(*args)
     return Column(jc)
Example #21
def add_meta(sc, col, metadata):
    """Add metadata to a column

    Adds metadata to a column for describing extra properties. This metadata survives
    serialization from dataframe to parquet and back to dataframe. Any manipulation
    of the column, such as aliasing, will lose the metadata.

    Parameters
    ----------
    sc : pyspark.SparkContext
    col : pyspark.sql.Column
    metadata : dict

    Returns
    -------
    pyspark.sql.Column
    """
    meta = sc._jvm.org.apache.spark.sql.types \
        .Metadata.fromJson(json.dumps(metadata))
    return Column(getattr(col._jc, 'as')('', meta))
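A short usage sketch for `add_meta` (minimal example, assuming an active `SparkSession`); the helper sets an empty alias, so read the metadata off the resulting field rather than by the original name:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext
df = spark.createDataFrame([(1.5,)], ['score'])

# Attach descriptive metadata to the column.
df2 = df.select(add_meta(sc, df['score'], {'units': 'points'}))
print(df2.schema.fields[0].metadata)   # {'units': 'points'}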
Example #22
def test_udf(cols):
    _test_udf = sc._jvm.org.opensource.gis.polygon.PolygonUtils.scala_pip()
    return Column(_test_udf.apply(_to_seq(sc, cols, _to_java_column)))
Example #23
 def int_to_ip_udf(col):
     return Column(sc._jvm.CustomUDFs.intToIpUDF().apply(
         _to_seq(sc, [col], _to_java_column)))
Example #24
 def _cast_spark_column_timestamp_to_long(self, scol: Column) -> Column:
     jvm = SparkContext._active_spark_context._jvm  # type: ignore[attr-defined]
     return Column(jvm.PythonSQLUtils.castTimestampNTZToLong(scol._jc))
Example #25
    def sort_values(self,
                    by,
                    ascending=True,
                    inplace=False,
                    na_position='last'):
        """
        Sort by the values along either axis.

        Parameters
        ----------
        by : str or list of str
        ascending : bool or list of bool, default True
             Sort ascending vs. descending. Specify list for multiple sort
             orders.  If this is a list of bools, must match the length of
             the by.
        inplace : bool, default False
             if True, perform operation in-place
        na_position : {'first', 'last'}, default 'last'
             `first` puts NaNs at the beginning, `last` puts NaNs at the end

        Returns
        -------
        sorted_obj : DataFrame

        Examples
        --------
        >>> df = ks.DataFrame({
        ...     'col1': ['A', 'A', 'B', None, 'D', 'C'],
        ...     'col2': [2, 1, 9, 8, 7, 4],
        ...     'col3': [0, 1, 9, 4, 2, 3],
        ... })
        >>> df
           col1  col2  col3
        0     A     2     0
        1     A     1     1
        2     B     9     9
        3  None     8     4
        4     D     7     2
        5     C     4     3

        Sort by col1

        >>> df.sort_values(by=['col1'])
           col1  col2  col3
        0     A     2     0
        1     A     1     1
        2     B     9     9
        5     C     4     3
        4     D     7     2
        3  None     8     4


        Sort by multiple columns

        >>> df.sort_values(by=['col1', 'col2'])
           col1  col2  col3
        1     A     1     1
        0     A     2     0
        2     B     9     9
        5     C     4     3
        4     D     7     2
        3  None     8     4

        Sort Descending

        >>> df.sort_values(by='col1', ascending=False)
           col1  col2  col3
        4     D     7     2
        5     C     4     3
        2     B     9     9
        0     A     2     0
        1     A     1     1
        3  None     8     4
        """
        if isinstance(by, string_types):
            by = [by]
        if isinstance(ascending, bool):
            ascending = [ascending] * len(by)
        if len(ascending) != len(by):
            raise ValueError(
                'Length of ascending ({}) != length of by ({})'.format(
                    len(ascending), len(by)))
        if na_position not in ('first', 'last'):
            raise ValueError("invalid na_position: '{}'".format(na_position))

        # Mapper: Get a spark column function for (ascending, na_position) combination
        # Note that 'asc_nulls_first' and friends were added as of Spark 2.4, see SPARK-23847.
        mapper = {
            (True, 'first'):
            lambda x: Column(getattr(x._jc, "asc_nulls_first")()),
            (True, 'last'):
            lambda x: Column(getattr(x._jc, "asc_nulls_last")()),
            (False, 'first'):
            lambda x: Column(getattr(x._jc, "desc_nulls_first")()),
            (False, 'last'):
            lambda x: Column(getattr(x._jc, "desc_nulls_last")()),
        }
        by = [
            mapper[(asc, na_position)](self[colname]._scol)
            for colname, asc in zip(by, ascending)
        ]
        kdf = DataFrame(self._sdf.sort(*by), self._metadata.copy())
        if inplace:
            self._sdf: spark.DataFrame = kdf._sdf
            self._metadata = kdf._metadata
        else:
            return kdf
Example #26
    # Convert the RDD of Categories into a DataFrame
    df = sqlContext.read.json(rdd)

    # This will print the Schema of the Categories Object
    df.printSchema()

    # Displays the contents of the Categories Object
    df.show()
    from pyspark.sql.functions import col, explode
    df.select(explode('categories')).show()

    from pyspark.sql import Row, Column
    # Select a few columns from the JSON file
    dfRoot = df.select(
        col('id').alias('parentId'),
        col('id').alias('childId'), 'friendlyName', 'categories')

    # Display the results of the selected Columns
    dfRoot.show()

    # Recursive function to assign the Parent and Child Ids appropriately
    def flattenProductHierarchyRecursive(df):
        # "explode" creates a new row for each element in the given array or map column.
        # When exploding 'categories' yields no rows, there is nothing left to flatten.
        if df.select(explode('categories')).count() == 0:
            return df.select('parentId', 'childId', 'friendlyName')
        else:
            dfR = df.select('childId', explode('categories').alias('CatArray'))\
                .select(col('childId').alias('parentId'),
                        col('CatArray.id').alias('childId'),
                        col('CatArray.friendlyName').alias('friendlyName'),
                        col('CatArray.categories').alias('categories'))