def compare_dataframes(cls,
                       df1: DataFrame,
                       df2: DataFrame,
                       excluded_keys: Union[List, str, None] = None) -> bool:
    """Compares 2 DataFrames for an exact match.

    Internally it uses pandas.testing.assert_frame_equal.

    :param df1: processed data
    :type df1: DataFrame
    :param df2: gold standard expected data
    :type df2: DataFrame
    :param excluded_keys: columns to be excluded from comparison, optional
    :type excluded_keys: Union[List, str, None]
    :return: True
    :rtype: bool
    :raises AssertionError: DataFrame mismatch
    """
    if excluded_keys is None:
        excluded_keys = []
    elif not isinstance(excluded_keys, list):
        excluded_keys = [excluded_keys]
    df1 = df1.drop(*excluded_keys)
    df2 = df2.drop(*excluded_keys)
    # Sort both frames on every remaining column so row order does not affect the comparison
    sort_columns = [cols[0] for cols in df1.dtypes]
    df1_sorted = df1.toPandas().sort_values(by=sort_columns, ignore_index=True)
    df2_sorted = df2.toPandas().sort_values(by=sort_columns, ignore_index=True)
    assert_frame_equal(df1_sorted, df2_sorted)
    return True
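# Minimal usage sketch for compare_dataframes. Assumptions (not in the original
# code): a SparkSession is created here, the sample data is made up, and None is
# passed for `cls` because the body never uses it.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
actual = spark.createDataFrame([(1, "a", "run_1")], ["id", "val", "run_id"])
expected = spark.createDataFrame([(1, "a", "run_2")], ["id", "val", "run_id"])
# run_id differs between runs, so it is excluded from the comparison
compare_dataframes(None, actual, expected, excluded_keys="run_id")  # returns True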
def standardise_names(df: DataFrame, name_cols: list, drop_orig: bool = True):
    """Take one or more name columns in a list and standardise the names so that
    one name consistently appears in each column.

    Args:
        df (DataFrame): Spark DataFrame
        name_cols (list): A list of columns that contain names, in order from first name to last name
        drop_orig (bool, optional): Drop the original columns after standardisation. Defaults to True.

    Returns:
        DataFrame: A Spark DataFrame with standardised name columns
    """
    name_col_joined = ", ".join(name_cols)
    surname_col_name = name_cols[-1]

    # Concatenate all name columns, lowercase them and replace hyphens/dots with spaces
    df = df.withColumn('name_concat',
                       expr(f"concat_ws(' ', {name_col_joined})"))
    df = df.withColumn('name_concat', expr('lower(name_concat)'))
    df = df.withColumn('name_concat',
                       expr("regexp_replace(name_concat, '[\\-\\.]', ' ')"))
    df = df.withColumn('name_arr', expr("split(name_concat, ' ')"))

    # The last token becomes the surname; earlier tokens become forename1..forename5
    df = df.withColumn(
        'surname_std',
        expr(f"case when {surname_col_name} is not null then element_at(name_arr, -1) else null end"))
    df = df.withColumn(
        'forename1_std',
        expr("case when size(name_arr) > 1 then element_at(name_arr, 1) else null end"))
    df = df.withColumn(
        'forename2_std',
        expr("case when size(name_arr) > 2 then element_at(name_arr, 2) else null end"))
    df = df.withColumn(
        'forename3_std',
        expr("case when size(name_arr) > 3 then element_at(name_arr, 3) else null end"))
    df = df.withColumn(
        'forename4_std',
        expr("case when size(name_arr) > 4 then element_at(name_arr, 4) else null end"))
    df = df.withColumn(
        'forename5_std',
        expr("case when size(name_arr) > 5 then element_at(name_arr, 5) else null end"))

    df = df.drop("name_arr", "name_concat")
    if drop_orig:
        for n in name_cols:
            df = df.drop(n)

    return df
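# Illustrative usage of standardise_names. Assumptions (not in the original code):
# a SparkSession named `spark` exists, `expr` is imported from
# pyspark.sql.functions, and the sample data is made up.
people = spark.createDataFrame(
    [("John-Paul", "Smith"), ("Anne", "O'Neill")],
    ["first_name", "surname"],
)
people_std = standardise_names(people, ["first_name", "surname"])
# Adds surname_std and forename1_std..forename5_std; drops first_name and surname
# because drop_orig defaults to True.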
def _unpack_struct(self, df: DataFrame, col_name):
    # Flatten a struct column into one top-level column per struct field,
    # then recurse in case the new columns are themselves nested
    sub_df = df.select(col_name + '.*')
    for subcol_name in sub_df.columns:
        df = df.withColumn(f'{col_name}_{subcol_name}',
                           df[col_name][subcol_name])
    df = df.drop(col_name)
    return self.unpack_nested(df)
def null_out_values_array(df: DataFrame, array_colname: str, values_to_null: list):
    """Null out a user-defined list of undesirable values in a column that contains
    an array of values.

    Useful for columns that mostly contain valid data but occasionally contain
    other values such as 'unknown'.

    Args:
        df (DataFrame): The dataframe to clean
        array_colname (str): The name of the array column to clean
        values_to_null (list): A list of values to be nulled

    Returns:
        DataFrame: The cleaned dataframe, with the values in values_to_null removed from the array column
    """
    if len(values_to_null) > 0:
        if str(dict(df.dtypes)[array_colname]).startswith("array"):
            # Build a literal array of the unwanted values and strip them out with array_except
            array_args = [f.lit(v) for v in values_to_null]
            df = df.withColumn("vals_to_remove", f.array(*array_args))
            df = df.withColumn(
                array_colname,
                f.expr(f"array_except({array_colname}, vals_to_remove)"))
            df = df.drop("vals_to_remove")
        else:
            # If the column is not an array, raise a warning instead of failing
            warnings.warn(
                f"Column {array_colname} is not an array. "
                "Please use the function null_out_values instead.")
    return df
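# Illustrative usage of null_out_values_array. Assumptions (not in the original
# code): pyspark.sql.functions is imported as `f`, a SparkSession named `spark`
# exists, and the sample data is made up.
ethnicities = spark.createDataFrame(
    [(1, ["white", "unknown"]), (2, ["other", "unknown"])],
    ["id", "ethnicity_arr"],
)
cleaned = null_out_values_array(ethnicities, "ethnicity_arr", ["unknown"])
# ethnicity_arr becomes ["white"] and ["other"]; the listed values are removed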
def clean_immigration(df: SparkDataFrame) -> SparkDataFrame:
    """Clean immigration data.

    :param df: immigration data frame to be cleaned
    :return: cleaned immigration data frame
    """
    drop_cols = [
        'visapost', 'occup', 'entdepu', 'insnum', 'count', 'entdepa',
        'entdepd', 'matflag', 'dtaddto', 'biryear', 'admnum'
    ]
    int_cols = [
        'cicid', 'i94yr', 'i94mon', 'i94cit', 'i94res', 'i94mode', 'i94bir',
        'i94visa', 'dtadfile'
    ]
    date_cols = ['arrdate', 'depdate']

    # Convert day offsets from 1960-01-01 into yyyy-MM-dd strings (null-safe via `x and ...`)
    date_udf = udf(lambda x: x and (timedelta(days=int(x)) + datetime(
        1960, 1, 1)).strftime('%Y-%m-%d'))

    df = df.drop(*drop_cols)
    df = convert_column_type(df, 'integer', int_cols)
    for col in date_cols:
        df = df.withColumn(col, date_udf(df[col]))

    # Remove the row if any of the foreign key columns is missing
    fk_columns = ['i94cit', 'i94port', 'i94addr']
    df = reduce(lambda df, idx: df.filter(df[fk_columns[idx]].isNotNull()),
                range(len(fk_columns)), df)
    return df
def postcode_to_inward_outward(df: DataFrame, pc_field: str, drop_orig: bool = True):
    """Given a field containing a postcode, creates new columns in the dataframe
    called outward_postcode_std and inward_postcode_std.

    The original postcode may or may not contain spaces and can be in any case.

    Args:
        df (DataFrame): Spark DataFrame
        pc_field (str): Name of the field containing the postcode
        drop_orig (bool, optional): Drop the original postcode column. Defaults to True.

    Returns:
        DataFrame: Spark DataFrame with outward_postcode_std and inward_postcode_std columns
    """
    # Uppercase the postcode and strip spaces before splitting it
    sql = f"upper(replace({pc_field}, ' ', ''))"
    df = df.withColumn("pc_nospace_temp__", expr(sql))

    # If the postcode is long enough, parse out the inward and outward parts.
    # If it's too short, assume we only have the outward part.
    sql = """
    case
        when length(pc_nospace_temp__) >= 5 then left(pc_nospace_temp__, length(pc_nospace_temp__) - 3)
        else left(pc_nospace_temp__, 4)
    end
    """
    df = df.withColumn("outward_postcode_std", expr(sql))

    sql = """
    case
        when length(pc_nospace_temp__) >= 5 then right(pc_nospace_temp__, 3)
        else null
    end
    """
    df = df.withColumn("inward_postcode_std", expr(sql))

    df = df.drop("pc_nospace_temp__")

    if drop_orig:
        df = df.drop(pc_field)

    return df
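# Illustrative usage of postcode_to_inward_outward. Assumptions (not in the
# original code): `expr` is imported from pyspark.sql.functions, a SparkSession
# named `spark` exists, and the sample postcodes are made up.
addresses = spark.createDataFrame([("sw1a 1aa",), ("CB4",)], ["postcode"])
addresses_std = postcode_to_inward_outward(addresses, "postcode")
# "sw1a 1aa" -> outward_postcode_std "SW1A", inward_postcode_std "1AA"
# "CB4"      -> outward_postcode_std "CB4",  inward_postcode_std null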
def get_categoricals_multiplier(self,
                                df: DataFrame,
                                col_list: list = [],
                                ignore_cols: list = [],
                                approx_distinct=100,
                                rsd=0.05):
    """
    Builds a feature Multiplier from categorical columns and the distinct values in each column.

    :param df:
    :param col_list: subset list of columns to use as categoricals; if empty, all string
        columns with at most approx_distinct values will be considered categoricals
    :param ignore_cols: when not selecting a subset of columns using col_list, a list of
        columns that will be skipped when searching for categoricals
    :param approx_distinct: threshold on the approximate number of distinct values; a warning
        is logged (or the column is skipped) when a column exceeds it
    :param rsd: relative standard deviation allowed when approximating the distinct count
    :return:
    """
    # TODO - Add logging of findings
    filter_vals = []
    filter_cols = col_list
    if len(col_list) == 0:
        # No subset supplied: scan all string columns (minus ignore_cols) for categoricals
        for (dcol, dtype) in df.drop(*ignore_cols).dtypes:
            if dtype == 'string':
                if self._get_approx_distinct_count_for_col(
                        df, dcol, _rsd=rsd) <= approx_distinct:
                    # LOG print("{} has approx {} distincts".format(dcol, cnt))
                    # LOG print("appending {}".format(dcol))
                    filter_vals.append(df.select(col(dcol))
                                       .filter((col(dcol).isNotNull()) &
                                               (col(dcol).isin("", "Y", "N") == False))
                                       .distinct().rdd.map(lambda row: str(row[0])).collect())
                    filter_cols.append(dcol)
        # ?? TODO - What about the rest of the potential categorical types (i.e. bools/ints/floats/etc)
        return feature_factory.feature.Multiplier.create_from_cats(
            filter_cols, filter_vals)
    else:
        # Subset supplied: warn on high-cardinality columns but collect values for all of them
        for dcol in col_list:
            if self._get_approx_distinct_count_for_col(
                    df, dcol) > approx_distinct:
                print("WARN! {} has more than {} distinct values".format(
                    dcol, approx_distinct))
            filter_vals.append(df.select(col(dcol))
                               .filter((col(dcol).isNotNull()) &
                                       (col(dcol).isin("", "Y", "N") == False))
                               .distinct().rdd.map(lambda row: str(row[0])).collect())
        return feature_factory.feature.Multiplier._create_from_cats(
            filter_cols, filter_vals)
def standardise_dob(
    df: DataFrame,
    dob_col: str,
    date_fmt_if_string: str = "yyyy-MM-dd",
    drop_orig: bool = True,
):
    """Create a column called dob_std with the dob as a string in yyyy-MM-dd format,
    or null otherwise.

    Args:
        df (DataFrame): Spark dataframe
        dob_col (str): Name of dob column
        date_fmt_if_string (str, optional): Date format if incoming dates are already strings. Defaults to "yyyy-MM-dd".
        drop_orig (bool, optional): Drop original date of birth column. Defaults to True.

    Returns:
        DataFrame: Spark DataFrame with new standardised dob column called dob_std
    """
    dtypes = dict(df.dtypes)

    if dtypes[dob_col] == "date":
        df = df.withColumn("dob_std", date_format(dob_col, "yyyy-MM-dd"))

    if dtypes[dob_col] == "timestamp":
        df = df.withColumn("dob_std", date_format(dob_col, "yyyy-MM-dd"))

    if dtypes[dob_col] == "string":
        df = df.withColumn("dob_std", to_timestamp(dob_col, date_fmt_if_string))
        df = df.withColumn("dob_std", date_format("dob_std", "yyyy-MM-dd"))

    if drop_orig:
        if dob_col != "dob_std":
            df = df.drop(dob_col)

    return df
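# Illustrative usage of standardise_dob. Assumptions (not in the original code):
# to_timestamp and date_format are imported from pyspark.sql.functions, a
# SparkSession named `spark` exists, and the sample data is made up.
patients = spark.createDataFrame([("01/02/1980",), (None,)], ["date_of_birth"])
patients_std = standardise_dob(patients, "date_of_birth",
                               date_fmt_if_string="dd/MM/yyyy")
# dob_std is "1980-02-01" for the first row and null for the second;
# date_of_birth is dropped because drop_orig defaults to True.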
def append_features(self, df: DataFrame, groupBy_cols,
                    feature_sets: [FeatureSet],
                    withTrendsForFeatures: [FeatureSet] = None):
    """
    Appends features to the incoming df. The feature columns and groupby cols will be
    deduped and validated. If there is a group by, the groupby cols will be applied
    before appending features. If there is no group by and no agg features, the
    features will simply be appended to df.

    :param df:
    :param groupBy_cols:
    :param feature_sets: input of FeatureSet
    :return:
    """
    # If the groupBy column is passed in as something other than a list, convert it to a list
    # Validation - if features are passed in as a dict, convert to a list of vals, etc.
    # groupBy_cols = self.helpers._to_list(groupBy_cols)
    groupBy_cols, groupBy_joiners = self.helpers._extract_groupby_joiner(
        groupBy_cols)
    features, dups = self.helpers._dedup_fast(df, [
        feature for feature_set in feature_sets
        for feature in feature_set.features.values()
    ])
    df = self.helpers._resolve_feature_joiners(
        df, features, groupBy_joiners).repartition(*groupBy_cols)

    # feature_cols = []
    agg_cols = []
    non_agg_cols = {}
    features_to_drop = []
    # base_cols = [f.base_col for f in features]

    # column validation
    # valid_result, undef_cols = self.helpers.validate_col(df, *base_cols)
    # assert valid_result, "base cols {} are not defined in df columns {}".format(undef_cols, df.columns)

    # valid_result, undef_cols = self.helpers._validate_col(df, *groupBy_cols)
    # assert valid_result, "groupby cols {} are not defined in df columns {}".format(undef_cols, df.columns)

    for feature in features:
        assert (len(feature.aggs) > 0 and len(groupBy_cols) > 0) or feature.agg_func is None, \
            "{} has either aggs or groupBys but not both, ensure both are present".format(feature.name)
        # feature_cols.append(feature.assembled_column)
        # feature_cols.append(F.col(feature.output_alias))
        agg_cols += [agg_col for agg_col in feature.aggs]
        if feature.agg_func is None:
            non_agg_cols[feature.output_alias] = feature.assembled_column
        else:
            df = df.withColumn(feature.output_alias, feature.assembled_column)
        if feature.is_temporary:
            features_to_drop.append(feature.name)

    if len(groupBy_cols) > 0:
        df = df.groupBy(*groupBy_cols) \
            .agg(*agg_cols)
    for fn, col in non_agg_cols.items():
        df = df.withColumn(fn, col)
    final_df = df.drop(*features_to_drop)
    # else:
    #     new_df = df.select(*df.columns + feature_cols)

    return final_df
def process_log_data(spark, input_data: str, output_data: str,
                     schema: StructType, songs_df: DataFrame) -> None:
    """
    Extract the raw data from S3, transform the data to our liking, and load it
    back into S3 in parquet format.

    Arguments:
        spark: An active Spark connection.
        input_data: Path to the S3 bucket with input data.
        output_data: Path to the S3 bucket where the transformed data is going to be stored.
        schema: DataFrame schema of the log data .json files.
        songs_df: DataFrame from the song_data function which is needed to create the songplays table.

    Returns:
        None.
    """
    log_data = spark.read.json(path=input_data, schema=schema)
    log_data_cached = log_data.repartition(8).cache()
    print(log_data_cached.count())

    # Users table: keep only the most recent record per user (latest ts)
    users_df = (log_data_cached.select(
        col('userId').alias('user_id'),
        col('firstName').alias('first_name'),
        col('lastName').alias('last_name'),
        col('gender'),
        col('level'),
        col('ts')).withColumn(
            'row_number',
            sql_f.row_number().over(
                Window.partitionBy('user_id').orderBy(
                    col('ts').desc()))).where("row_number = 1").drop(
                        'row_number', 'ts').repartition(8))

    users_df_write = (users_df.write.mode('overwrite').parquet(
        f"{output_data}/analytical/users"))

    # Time table: derive calendar attributes from the event timestamp
    time_df = (log_data_cached.where("page = 'NextSong'").withColumn(
        'start_time',
        sql_f.from_unixtime(col('ts') / 1000).cast(TimestampType())).select(
            col('start_time'),
            hour('start_time').alias('hour'),
            dayofmonth('start_time').alias('day'),
            weekofyear('start_time').alias('week'),
            month('start_time').alias('month'),
            year('start_time').alias('year')).withColumn(
                'weekday',
                sql_f.when(dayofweek(col('start_time')) < 6,
                           True).otherwise(False)).dropDuplicates(
                               ['start_time']).repartition(8, 'year', 'month'))

    time_df_export = (time_df.write.mode('overwrite').partitionBy(
        'year', 'month').parquet(f"{output_data}/analytical/time"))

    # Songplays table: join the log events to the songs dimension on title and duration
    events_df = (log_data_cached.where("page = 'NextSong'").withColumn(
        'songplay_id', monotonically_increasing_id()).withColumn(
            'start_time',
            sql_f.from_unixtime(col('ts') / 1000).cast(TimestampType())).select(
                col('songplay_id'),
                col('start_time'),
                col('userId').alias('user_id'),
                col('level'),
                col('sessionId').alias('session_id'),
                col('location'),
                col('userAgent').alias('user_agent'),
                col('song'),
                col('length'),
                month('start_time').alias('month'),
                year('start_time').alias('year')).repartition(8))

    join_condition = [
        songs_df.title == events_df.song,
        songs_df.duration == events_df.length
    ]

    songplays_df = (events_df.join(broadcast(songs_df.drop('year')),
                                   on=join_condition,
                                   how='left').select(
                                       col('songplay_id'), col('start_time'),
                                       col('user_id'), col('level'),
                                       col('song_id'), col('artist_id'),
                                       col('session_id'), col('location'),
                                       col('user_agent'), col('month'),
                                       col('year')).repartition(
                                           8, 'year', 'month'))

    songplays_df_write = (songplays_df.write.mode('overwrite').partitionBy(
        'year', 'month').parquet(f"{output_data}/analytical/songplays"))
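# Illustrative call of process_log_data. Assumptions (not in the original code):
# the bucket paths are placeholders, and `log_schema` / `songs_df` are produced
# elsewhere (e.g. by the song_data processing step mentioned in the docstring).
# process_log_data(spark,
#                  input_data="s3a://my-bucket/log_data/*/*/*.json",
#                  output_data="s3a://my-bucket/output",
#                  schema=log_schema,
#                  songs_df=songs_df)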
from pyspark.sql.dataframe import DataFrame

# Obtain the dataset through the SAX Spark session wrapper
sax = spark._jvm.com.ralib.notebook.spark.SAXSparkSession.getSession(
    spark._jsparkSession)
ds_Churn_Modelling = DataFrame(sax.getDataSet("Churn_Modelling"), spark)

# imports
from pyspark.ml import Pipeline
from pyspark.ml.feature import StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import StringIndexer, VectorIndexer, VectorAssembler

# data preparation
ds_Churn_Modelling = ds_Churn_Modelling.drop('RowNumber', 'Gender',
                                              'CustomerId', 'Surname',
                                              'Geography')

assembler = VectorAssembler(inputCols=[
    'CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
    'IsActiveMember', 'EstimatedSalary'
],
                            outputCol="features")

scaler = StandardScaler(inputCol="features",
                        outputCol="scaledFeatures",
                        withStd=True,
                        withMean=False)

# model preparation
# create Logistic Regression object
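# Hedged continuation sketch: the original snippet stops at "create Logistic
# Regression object", so the label column name ('Exited') and the pipeline wiring
# below are assumptions rather than the original author's code.
lr = LogisticRegression(featuresCol="scaledFeatures", labelCol="Exited")

# Chain the prepared stages and fit on the cleaned dataset
pipeline = Pipeline(stages=[assembler, scaler, lr])
model = pipeline.fit(ds_Churn_Modelling)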