def compare_null_last(
    left: Column,
    right: Column,
    comp: Callable[[Column, Column], Column],
) -> Column:
    return (left.isNotNull() & right.isNotNull() & comp(left, right)) | (
        left.isNotNull() & right.isNull()
    )

def compare_null_last(
    left: spark.Column,
    right: spark.Column,
    comp: Callable[[spark.Column, spark.Column], spark.Column],
) -> spark.Column:
    return (left.isNotNull() & right.isNotNull() & comp(left, right)) | (
        left.isNotNull() & right.isNull()
    )

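# Hypothetical usage sketch (DataFrame and column names below are illustrative,
# not from the source): compare_null_last treats a null right-hand side as
# "greater", so a non-null left compares as true against a null right, and two
# non-null values fall back to the supplied comparator.
import operator

from pyspark.sql import SparkSession, functions as F

spark_session = SparkSession.builder.getOrCreate()
pairs = spark_session.createDataFrame([(1, 2), (3, None), (None, 4)], ["a", "b"])
pairs.select(
    "a", "b",
    compare_null_last(F.col("a"), F.col("b"), operator.lt).alias("a_lt_b_nulls_last"),
).show()
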
def flattenProductHierarchyRecursive(df):
    # "explode" creates a new row for each element in the given array or map column.
    # If exploding 'categories' yields no rows, there is nothing left to flatten.
    if df.select(explode('categories')).count() <= 0:
        return df.select('parentId', 'childId', 'friendlyName')
    else:
        dfR = df.select('childId', explode('categories').alias('CatArray')) \
                .select(col('childId').alias('parentId'),
                        col('CatArray.id').alias('childId'),
                        col('CatArray.friendlyName').alias('friendlyName'),
                        col('CatArray.categories').alias('categories'))
        return df.select('parentId', 'childId', 'friendlyName') \
                 .union(flattenProductHierarchyRecursive(dfR)
                        .select('parentId', 'childId', 'friendlyName'))

def temporal_key_column(self) -> Column:
    """
    Fetch the temporal key column, if any.

    :return: Temporal key column, or None.
    """
    col = self._jrfctx.temporalKeyColumn(self._jdf)
    return col and Column(col)

def spatial_key_column(self) -> Column:
    """
    Fetch the tagged spatial key column.

    :return: Spatial key column.
    """
    col = self._jrfctx.spatialKeyColumn(self._jdf)
    return Column(col)

def tile_columns(self) -> List[Column]:
    """
    Fetch the columns of type Tile.

    :return: One or more Column instances associated with Tiles.
    """
    cols = self._jrfctx.tileColumns(self._jdf)
    return [Column(c) for c in cols]

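# Hypothetical usage sketch: the three accessors above read like methods on a
# DataFrame wrapper that holds a JVM context (self._jrfctx) and a JVM DataFrame
# (self._jdf). Assuming such a wrapper instance named `rf`, the tagged key and
# tile columns can be pulled out and reused in an ordinary select; this is an
# illustration under that assumption, not API documentation.
tiles = rf.tile_columns()                 # list of Column, one per Tile column
spatial_key = rf.spatial_key_column()     # tagged spatial key
temporal_key = rf.temporal_key_column()   # None if no temporal key is tagged
if temporal_key is not None:
    keyed = rf.select(spatial_key, temporal_key, *tiles)
else:
    keyed = rf.select(spatial_key, *tiles)
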
def at_least_n_distinct(col, limit):
    """Count distinct that works with windows

    The standard distinct count in Spark SQL can't be applied in a window.
    This implementation allows that to work.
    """
    sc = SparkContext._active_spark_context
    j_cols = _to_seq(sc, [_to_java_column(col), _to_java_column(F.lit(limit))])
    jc = sc._jvm.org.wikimedia.search.mjolnir.AtLeastNDistinct().apply(j_cols)
    return Column(jc)

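# Hypothetical usage sketch: because the helper returns an ordinary Column
# wrapping the JVM aggregate, it can be applied over a window, which
# F.countDistinct does not support. This assumes the jar providing
# org.wikimedia.search.mjolnir.AtLeastNDistinct is on the classpath; the
# DataFrame, column, and window names below are illustrative.
from pyspark.sql import Window, functions as F

w = Window.partitionBy("query_id")
hits = hits.withColumn(
    "has_3_distinct_pages",
    at_least_n_distinct(F.col("page_id"), 3).over(w),
)
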
def __withField(self: Column, fieldName: str, fieldValue: Column):
    """
    An expression that adds/replaces a field by name in a `StructType`.
    If schema contains multiple fields with fieldName, they will all be
    replaced with fieldValue.
    """
    sc = SparkContext._active_spark_context
    _columnWithCustomMethods = sc._jvm.com.github.fqaiser94.mse.methods.ColumnWithCustomMethods(
        self._jc)
    _column = _columnWithCustomMethods.withField(fieldName, fieldValue._jc)
    return Column(_column)

def generate_uuid(self):
    """
    Generate V4 UUID.

    Returns:
        Spark Column (StringType): containing v4 UUIDs.
    """
    sc = self.spark.sparkContext
    # noinspection PyUnresolvedReferences, PyProtectedMember
    _generate_uuid = sc._jvm.com.civicboost.spark.etl.utilities.GeneralUDFs.generateUUID_UDF()
    return Column(_generate_uuid.apply(_to_seq(sc, [], _to_java_column)))

def with_meta(self, alias, meta):
    """
    In PySpark 2.1 there is no simple way to change the metadata of a column;
    that only became available in PySpark 2.2. This function takes a column
    and modifies its metadata.

    :param self: A PySpark column
    :param alias: Alias to apply to the column
    :param meta: New metadata for the column
    """
    sc = SparkContext._active_spark_context
    jmeta = sc._gateway.jvm.org.apache.spark.sql.types.Metadata
    return Column(getattr(self._jc, "as")(alias, jmeta.fromJson(json.dumps(meta))))

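# Hypothetical usage sketch: with_meta takes the column as its first argument,
# so it can be called directly (or bound onto Column as a method). The column
# and metadata names below are illustrative.
from pyspark.sql import functions as F

scored = df.select(
    with_meta(F.col("score"), "score", {"comment": "model output in [0, 1]"})
)
scored.schema["score"].metadata   # -> {'comment': 'model output in [0, 1]'}
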
def _(*cols):
    jcontainer = self.get_java_container(
        package_name=package_name,
        object_name=object_name,
        java_class_instance=java_class_instance)
    # Ensure that your argument is a column
    function = getattr(jcontainer, name)
    judf = function()
    jc = judf.apply(
        self.to_scala_seq([_to_java_column(c) for c in cols]))
    return Column(jc)

def __dropFields(self: Column, *fieldNames: str):
    """
    An expression that drops fields by name in a `StructType`.
    This is a no-op if schema doesn't contain given field names.
    If schema contains multiple fields matching any one of the given
    fieldNames, they will all be dropped.
    """
    sc = SparkContext._active_spark_context
    _columnWithCustomMethods = sc._jvm.com.github.fqaiser94.mse.methods.ColumnWithCustomMethods(
        self._jc)
    _fieldNames = sc._jvm.PythonUtils.toSeq(fieldNames)
    _column = _columnWithCustomMethods.dropFields(_fieldNames)
    return Column(_column)

def clean_string(self, target_col):
    """
    Remove Java ISO control characters from, and trim, string.

    Args:
        target_col (Spark Column): target column to be cleaned.

    Returns:
        Spark Column (StringType): cleaned version of input column.
    """
    sc = self.spark.sparkContext
    # noinspection PyUnresolvedReferences, PyProtectedMember
    _clean_string = sc._jvm.com.civicboost.spark.etl.utilities.GeneralUDFs.cleanString_UDF()
    return Column(_clean_string.apply(_to_seq(sc, [target_col], _to_java_column)))

def normalize_date_md(self, target_col):
    """
    Convert string to date where MONTH is BEFORE DAY.

    Args:
        target_col (Spark Column): containing strings representing dates.

    Returns:
        Spark Column (DateType): containing dates converted from strings.
    """
    sc = self.spark.sparkContext
    # noinspection PyUnresolvedReferences, PyProtectedMember
    _normalize_date_md = sc._jvm.com.civicboost.spark.etl.utilities.DateTimeUDFs.normalizeDateMD_UDF()
    return Column(_normalize_date_md.apply(_to_seq(sc, [target_col], _to_java_column)))

def string_to_double_cfd(self, target_col):
    """
    Convert strings to doubles where a comma represents the decimal separator (`cfd`).

    Args:
        target_col (Spark Column): containing double values in string format.

    Returns:
        Spark Column (DoubleType): containing double values converted from strings.
    """
    sc = self.spark.sparkContext
    # noinspection PyUnresolvedReferences, PyProtectedMember
    _string_to_double = sc._jvm.com.civicboost.spark.etl.utilities.GeneralUDFs.stringToDoubleCommaForDecimal_UDF()
    return Column(_string_to_double.apply(_to_seq(sc, [target_col], _to_java_column)))

def map_booleans_ynu(self, target_col):
    """
    Map boolean values to `Y`, `N`, `Unknown`.

    Args:
        target_col (Spark Column): target column containing boolean values to map.

    Returns:
        Spark Column (StringType): mapped values (`Y`, `N`, `Unknown`).
    """
    sc = self.spark.sparkContext
    # noinspection PyUnresolvedReferences, PyProtectedMember
    _map_booleans_ynu = sc._jvm.com.civicboost.spark.etl.utilities.GeneralUDFs.mapBooleansYNU_UDF()
    return Column(_map_booleans_ynu.apply(_to_seq(sc, [target_col], _to_java_column)))

def normalize_timestamp_dm(self, target_col):
    """
    Convert string to timestamp where DAY is BEFORE MONTH.

    Args:
        target_col (Spark Column): containing strings representing timestamps.

    Returns:
        Spark Column (TimestampType): containing timestamps converted from strings.
    """
    sc = self.spark.sparkContext
    # noinspection PyUnresolvedReferences, PyProtectedMember
    _normalize_timestamp_dm = sc._jvm.com.civicboost.spark.etl.utilities.DateTimeUDFs.normalizeTimestampDM_UDF()
    return Column(_normalize_timestamp_dm.apply(_to_seq(sc, [target_col], _to_java_column)))

def empty_string_to_null(self, target_col):
    """
    Convert empty strings to nulls.

    Args:
        target_col (Spark Column): target column to convert.

    Returns:
        Spark Column (StringType): target column with empty values converted to nulls.
    """
    sc = self.spark.sparkContext
    # noinspection PyUnresolvedReferences, PyProtectedMember
    _empty_string_to_null = sc._jvm.com.civicboost.spark.etl.utilities.GeneralUDFs.emptyStringToNull_UDF()
    return Column(_empty_string_to_null.apply(_to_seq(sc, [target_col], _to_java_column)))

def string_is_number(self, target_col):
    """
    Return a boolean indicating whether the string can be converted to a number.

    Args:
        target_col (Spark Column): containing strings to check for convertibility to a number.

    Returns:
        Spark Column (BooleanType): whether the string can be converted to a number.
    """
    sc = self.spark.sparkContext
    # noinspection PyUnresolvedReferences, PyProtectedMember
    _string_is_number = sc._jvm.com.civicboost.spark.etl.utilities.GeneralUDFs.stringIsNumber_UDF()
    return Column(_string_is_number.apply(_to_seq(sc, [target_col], _to_java_column)))

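# Hypothetical usage sketch: the wrappers above read like methods on a helper
# object that holds a SparkSession (self.spark). Assuming such an instance
# named `udfs` and the civicboost UDF jar on the classpath, a raw DataFrame can
# be cleaned in a normal withColumn chain. Column names are illustrative.
from pyspark.sql import functions as F

cleaned = (
    raw_df
    .withColumn("name", udfs.clean_string(F.col("name")))
    .withColumn("name", udfs.empty_string_to_null(F.col("name")))
    .withColumn("amount_is_numeric", udfs.string_is_number(F.col("amount")))
    .withColumn("amount", udfs.string_to_double_cfd(F.col("amount")))
    .withColumn("is_active", udfs.map_booleans_ynu(F.col("active")))
    .withColumn("signup_date", udfs.normalize_date_md(F.col("signup_date")))
    .withColumn("last_seen", udfs.normalize_timestamp_dm(F.col("last_seen")))
    .withColumn("row_uuid", udfs.generate_uuid())
)
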
def __withFieldRenamed(self: Column, existingFieldName: str, newFieldName: str):
    """
    An expression that renames a field by name in a `StructType`.
    This is a no-op if schema doesn't contain any field with existingFieldName.
    If schema contains multiple fields with existingFieldName, they will all be
    renamed to newFieldName.
    """
    sc = SparkContext._active_spark_context
    _columnWithCustomMethods = sc._jvm.com.github.fqaiser94.mse.methods.ColumnWithCustomMethods(
        self._jc)
    _column = _columnWithCustomMethods.withFieldRenamed(
        existingFieldName, newFieldName)
    return Column(_column)

def add_struct_field(nestedStruct: str, fieldName: str, fieldValue: Column):
    """
    A convenience method for adding/replacing a field by name inside a deeply
    nested struct.

    :param nestedStruct: e.g. "a.b.c" where a, b, and c are StructType columns,
                         a is the top-level StructType column, and c is the
                         StructType column to add/replace the field in.
    :param fieldName: The name of the StructField to add (if it does not
                      already exist) or replace (if it already exists).
    :param fieldValue: The value to assign to fieldName.
    :return: a copy of the top-level struct column (a) with the field
             added/replaced.
    """
    sc = SparkContext._active_spark_context
    _add_struct_field = sc._jvm.com.github.fqaiser94.mse.methods.add_struct_field
    _column = _add_struct_field(nestedStruct, fieldName, fieldValue._jc)
    return Column(_column)

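# Hypothetical usage sketch: the double-underscore functions above take a
# Column as `self`, i.e. they are presumably meant to be bound onto Column as
# withField / dropFields / withFieldRenamed, after which nested struct fields
# can be manipulated directly. Struct and field names below are illustrative.
from pyspark.sql import functions as F

df = df.withColumn("a", F.col("a").withField("d", F.lit(0)))            # add or replace a.d
df = df.withColumn("a", F.col("a").dropFields("obsolete"))              # drop a.obsolete if present
df = df.withColumn("a", F.col("a").withFieldRenamed("b", "b_renamed"))  # rename a.b -> a.b_renamed
df = df.withColumn("a", add_struct_field("a.b_renamed.c", "flag", F.lit(True)))  # deep add/replace
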
def _(*cols):
    jcontainer = self.get_java_container(
        package_name=package_name,
        object_name=object_name,
        java_class_instance=java_class_instance)
    # Ensure that your argument is a column
    col_args = [
        col._jc if isinstance(col, Column) else _make_col(col)._jc
        for col in cols
    ]
    function = getattr(jcontainer, name)
    args = col_args
    jc = function(*args)
    return Column(jc)

def add_meta(sc, col, metadata):
    """Add metadata to a column

    Adds metadata to a column for describing extra properties. This metadata
    survives serialization from dataframe to parquet and back to dataframe.
    Any manipulation of the column, such as aliasing, will lose the metadata.

    Parameters
    ----------
    sc : pyspark.SparkContext
    col : pyspark.sql.Column
    metadata : dict

    Returns
    -------
    pyspark.sql.Column
    """
    meta = sc._jvm.org.apache.spark.sql.types \
        .Metadata.fromJson(json.dumps(metadata))
    return Column(getattr(col._jc, 'as')('', meta))

def pow_func(left, right):
    return F.when(left == 1, left).otherwise(Column.__pow__(left, right))

def rpow_func(left, right):
    return F.when(F.lit(right == 1), right).otherwise(Column.__rpow__(left, right))

def compare_allow_null(
    left: Column,
    right: Column,
    comp: Callable[[Column, Column], Column],
) -> Column:
    return left.isNull() | right.isNull() | comp(left, right)

def compare_disallow_null(
    left: Column,
    right: Column,
    comp: Callable[[Column, Column], Column],
) -> Column:
    return left.isNotNull() & right.isNotNull() & comp(left, right)

def compare_allow_null(
    left: spark.Column,
    right: spark.Column,
    comp: Callable[[spark.Column, spark.Column], spark.Column],
) -> spark.Column:
    return left.isNull() | right.isNull() | comp(left, right)

def compare_disallow_null(
    left: spark.Column,
    right: spark.Column,
    comp: Callable[[spark.Column, spark.Column], spark.Column],
) -> spark.Column:
    return left.isNotNull() & right.isNotNull() & comp(left, right)

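# Hypothetical usage sketch: the two helpers build null-aware predicates from
# any column comparator, e.g. for validation checks where nulls should either
# fail (disallow) or pass (allow). DataFrame and column names are illustrative.
import operator

from pyspark.sql import functions as F

strict = compare_disallow_null(F.col("start_ts"), F.col("end_ts"), operator.le)   # null on either side -> False
lenient = compare_allow_null(F.col("start_ts"), F.col("end_ts"), operator.le)     # null on either side -> True
events = events.withColumn("range_ok_strict", strict).withColumn("range_ok_lenient", lenient)
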
# Convert the RDD of Categories into a DataFrame
df = sqlContext.read.json(rdd)

# This will print the schema of the Categories object
df.printSchema()

# Display the contents of the Categories object
df.show()

# from pyspark.sql.functions import split, explode
from pyspark.sql.functions import *

df.select(explode('categories')).show()

from pyspark.sql import Row

# Select a few columns from the JSON file
dfRoot = df.select(col('id').alias('parentId'),
                   col('id').alias('childId'),
                   'friendlyName',
                   'categories')

# Display the results of the selected columns
dfRoot.show()

# Recursive function to assign the parent and child ids appropriately
def flattenProductHierarchyRecursive(df):
    # "explode" creates a new row for each element in the given array or map column.
    # If exploding 'categories' yields no rows, there is nothing left to flatten.
    if df.select(explode('categories')).count() <= 0:
        return df.select('parentId', 'childId', 'friendlyName')
    else:
        dfR = df.select('childId', explode('categories').alias('CatArray')) \
                .select(col('childId').alias('parentId'),
                        col('CatArray.id').alias('childId'),
                        col('CatArray.friendlyName').alias('friendlyName'),
                        col('CatArray.categories').alias('categories'))
        return df.select('parentId', 'childId', 'friendlyName') \
                 .union(flattenProductHierarchyRecursive(dfR)
                        .select('parentId', 'childId', 'friendlyName'))

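# Hypothetical next step (not part of the original excerpt): flatten the
# hierarchy built above and display every parent/child edge.
flattenProductHierarchyRecursive(dfRoot).show()
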