def check_total_rows(left_df: pyspark.sql.DataFrame,
                     right_df: pyspark.sql.DataFrame) -> None:
    """Assert that two Spark DataFrames contain the same number of rows."""
    left_df_count = left_df.count()
    right_df_count = right_df.count()
    assert left_df_count == right_df_count, \
        f"Number of rows is not the same.\n\n" \
        f"Actual Rows (left_df): {left_df_count}\n" \
        f"Expected Rows (right_df): {right_df_count}\n"
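# Hypothetical usage sketch for check_total_rows (not part of the original
# source): build two small DataFrames on a local SparkSession and check that
# they have the same row count. The name `spark` and the toy data are
# assumptions for illustration only.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("row-count-check").getOrCreate()
left = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "value"])
right = spark.createDataFrame([(1, "x"), (2, "y")], ["id", "value"])
check_total_rows(left, right)  # passes silently; raises AssertionError on mismatch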
def test_read_write_parquet(
    test_parquet_in_asset: PySparkDataAsset,
    iris_spark: pyspark.sql.DataFrame,
    fake_airflow_context: Any,
    spark_session: pyspark.sql.SparkSession,
) -> None:
    p = path.abspath(
        path.join(test_parquet_in_asset.staging_pickedup_path(fake_airflow_context)))
    os.makedirs(path.dirname(p), exist_ok=True)
    iris_spark.write.mode("overwrite").parquet(p)

    count_before = iris_spark.count()
    columns_before = len(iris_spark.columns)

    with pytest.raises(expected_exception=ValueError):
        PySparkDataAssetIO.read_data_asset(test_parquet_in_asset, source_files=[p])

    x = PySparkDataAssetIO.read_data_asset(test_parquet_in_asset,
                                           source_files=[p],
                                           spark_session=spark_session)
    assert count_before == x.count()
    assert columns_before == len(x.columns)

    # try with additional kwargs:
    x = PySparkDataAssetIO.read_data_asset(
        asset=test_parquet_in_asset,
        source_files=[p],
        spark_session=spark_session,
        mergeSchema=True,
    )
    assert count_before == x.count()
def assert_test_dfs_equal(expected_df: pyspark.sql.DataFrame,
                          generated_df: pyspark.sql.DataFrame) -> None:
    """
    Used to compare two dataframes (typically, in a unit test).

    Better than the direct df1.equals(df2) method, as this function allows for
    tolerances in the floating point columns, and is also more descriptive
    about which parts of the two dataframes are in disagreement.

    :param expected_df: First dataframe to compare
    :param generated_df: Second dataframe to compare
    """
    row_limit = 10000
    e_count = expected_df.count()
    g_count = generated_df.count()
    if (e_count > row_limit) or (g_count > row_limit):
        raise Exception(
            f"One or both of the dataframes passed has too many rows (>{row_limit}). "
            f"Please limit your test sizes to be lower than this number.")
    assert e_count == g_count, "The dataframes have a different number of rows."

    expected_pdf = expected_df.toPandas()
    generated_pdf = generated_df.toPandas()
    assert list(expected_pdf.columns) == list(generated_pdf.columns), \
        "The two dataframes have different columns."

    for col in expected_pdf.columns:
        error_msg = f"The columns with name: `{col}` were not equal."
        if expected_pdf[col].dtype.type == np.object_:
            assert expected_pdf[[col]].equals(generated_pdf[[col]]), error_msg
        else:
            # Numpy will not equate nulls on both sides. Filter them out.
            expected_pdf = expected_pdf[expected_pdf[col].notnull()]
            generated_pdf = generated_pdf[generated_pdf[col].notnull()]
            try:
                is_close = np.allclose(expected_pdf[col].values,
                                       generated_pdf[col].values)
            except ValueError:
                logging.error(
                    f"Problem encountered while equating column '{col}'.")
                raise
            assert is_close, error_msg
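# Hypothetical usage sketch for assert_test_dfs_equal (not in the original
# source): two DataFrames whose float column differs only within np.allclose's
# default tolerance still compare as equal. `spark` is assumed to be an active
# SparkSession.
expected = spark.createDataFrame([("a", 1.0), ("b", 2.0)], ["name", "score"])
generated = spark.createDataFrame([("a", 1.0 + 1e-9), ("b", 2.0)], ["name", "score"])
assert_test_dfs_equal(expected, generated)  # passes: float difference is within tolerance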
def test_read_write_csv(
    test_csv_asset: PySparkDataAsset,
    iris_spark: pyspark.sql.DataFrame,
    spark_session: pyspark.sql.SparkSession,
) -> None:
    # try without any extra kwargs:
    PySparkDataAssetIO.write_data_asset(asset=test_csv_asset, data=iris_spark)

    # try with additional kwargs:
    PySparkDataAssetIO.write_data_asset(asset=test_csv_asset,
                                        data=iris_spark,
                                        header=True)

    # test mode; default is overwrite, switch to error (if exists) should raise:
    with pytest.raises(AnalysisException):
        PySparkDataAssetIO.write_data_asset(asset=test_csv_asset,
                                            data=iris_spark,
                                            header=True,
                                            mode="error")

    # test retrieval
    # before we can retrieve, we need to move the data from 'staging' to 'ready'
    os.makedirs(test_csv_asset.ready_path, exist_ok=True)
    # load the prepared data
    shutil.rmtree(test_csv_asset.ready_path)
    shutil.move(test_csv_asset.staging_ready_path, test_csv_asset.ready_path)

    retrieved = PySparkDataAssetIO.retrieve_data_asset(test_csv_asset,
                                                       spark_session=spark_session,
                                                       inferSchema=True,
                                                       header=True)
    assert retrieved.count() == iris_spark.count()

    # Test check for missing 'spark_session' kwarg
    with pytest.raises(ValueError):
        PySparkDataAssetIO.retrieve_data_asset(test_csv_asset)

    # Test check for invalid 'spark_session' kwarg
    with pytest.raises(TypeError):
        PySparkDataAssetIO.retrieve_data_asset(test_csv_asset, spark_session=42)
def show_df(df: pyspark.sql.DataFrame,
            columns: list,
            rows: int = 10,
            sample=False,
            truncate=True):
    """
    Prints out rows of the selected columns of a pyspark df, optionally sampled

    :param df: pyspark dataframe
    :param columns: list of columns to print
    :param rows: how many rows to print - default 10
    :param sample: should we sample - default False
    :param truncate: truncate output - default True
    :return:
    """
    if sample:
        sample_percent = min(rows / df.count(), 1.0)
        log.info(f'sampling percentage: {sample_percent}')
        df.select(columns).sample(False, sample_percent,
                                  seed=1).show(rows, truncate=truncate)
    else:
        df.select(columns).show(rows, truncate=truncate)
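# Hypothetical usage sketch for show_df (not in the original source): prints a
# random sample of roughly ten rows from two columns of a larger DataFrame.
# Assumes an active SparkSession named `spark` and a module-level `log` logger,
# which show_df uses internally.
from pyspark.sql import functions as F

big_df = spark.range(0, 1000).withColumn("squared", F.col("id") * F.col("id"))
show_df(big_df, columns=["id", "squared"], rows=10, sample=True, truncate=False)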
def prepare_df(
    df: pyspark.sql.DataFrame,
    store_csv: pyspark.sql.DataFrame,
    store_states_csv: pyspark.sql.DataFrame,
    state_names_csv: pyspark.sql.DataFrame,
    google_trend_csv: pyspark.sql.DataFrame,
    weather_csv: pyspark.sql.DataFrame,
) -> pyspark.sql.DataFrame:
    num_rows = df.count()

    # expand dates
    df = expand_date(df)

    # cast the special-event flag columns (store open, promo, holidays) from
    # "0"/"1" strings to booleans
    df = (df.withColumn("Open", df.Open != "0")
            .withColumn("Promo", df.Promo != "0")
            .withColumn("StateHoliday", df.StateHoliday != "0")
            .withColumn("SchoolHoliday", df.SchoolHoliday != "0"))

    # merge store information
    store = store_csv.join(store_states_csv, "Store")
    df = df.join(store, "Store")

    # merge Google Trend information
    google_trend_all = prepare_google_trend(google_trend_csv)
    df = df.join(google_trend_all,
                 ["State", "Year", "Week"]).select(df["*"],
                                                   google_trend_all.trend)

    # merge in Google Trend for whole Germany
    google_trend_de = google_trend_all[google_trend_all.file ==
                                       "Rossmann_DE"].withColumnRenamed(
                                           "trend", "trend_de")
    df = df.join(google_trend_de,
                 ["Year", "Week"]).select(df["*"], google_trend_de.trend_de)

    # merge weather
    weather = weather_csv.join(state_names_csv,
                               weather_csv.file == state_names_csv.StateName)
    df = df.join(weather, ["State", "Date"])

    # fix null values
    df = (df.withColumn(
        "CompetitionOpenSinceYear",
        F.coalesce(df.CompetitionOpenSinceYear, F.lit(1900)),
    ).withColumn(
        "CompetitionOpenSinceMonth",
        F.coalesce(df.CompetitionOpenSinceMonth, F.lit(1)),
    ).withColumn(
        "Promo2SinceYear",
        F.coalesce(df.Promo2SinceYear, F.lit(1900)),
    ).withColumn(
        "Promo2SinceWeek",
        F.coalesce(df.Promo2SinceWeek, F.lit(1)),
    ))

    # days and months since the competition has been open, capped to 2 years
    df = df.withColumn(
        "CompetitionOpenSince",
        F.to_date(
            F.format_string("%s-%s-15", df.CompetitionOpenSinceYear,
                            df.CompetitionOpenSinceMonth)),
    )
    df = df.withColumn(
        "CompetitionDaysOpen",
        F.when(
            df.CompetitionOpenSinceYear > 1900,
            F.greatest(
                F.lit(0),
                F.least(F.lit(360 * 2),
                        F.datediff(df.Date, df.CompetitionOpenSince)),
            ),
        ).otherwise(0),
    )
    df = df.withColumn("CompetitionMonthsOpen",
                       (df.CompetitionDaysOpen / 30).cast(T.IntegerType()))

    # days and weeks of promotion, capped to 25 weeks
    df = df.withColumn(
        "Promo2Since",
        F.expr(
            'date_add(format_string("%s-01-01", Promo2SinceYear), (cast(Promo2SinceWeek as int) - 1) * 7)'
        ),
    )
    df = df.withColumn(
        "Promo2Days",
        F.when(
            df.Promo2SinceYear > 1900,
            F.greatest(
                F.lit(0),
                F.least(F.lit(25 * 7), F.datediff(df.Date, df.Promo2Since))),
        ).otherwise(0),
    )
    df = df.withColumn("Promo2Weeks",
                       (df.Promo2Days / 7).cast(T.IntegerType()))

    # ensure that no row was lost through inner joins
    assert num_rows == df.count(), "lost rows in joins"
    return df
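# Standalone sketch (not from the original source) of the pattern prepare_df
# uses for "CompetitionDaysOpen": take the day difference between two dates,
# cap it with greatest/least, and fall back to 0 when the start date is
# missing. Column names and the toy data here are illustrative only; `spark`
# is assumed to be an active SparkSession.
from pyspark.sql import functions as F

toy = spark.createDataFrame(
    [("2015-07-31", "2014-01-15"), ("2015-07-31", None)],
    ["Date", "OpenSince"],
)
toy = (toy.withColumn("Date", F.to_date("Date"))
          .withColumn("OpenSince", F.to_date("OpenSince")))
toy = toy.withColumn(
    "DaysOpenCapped",
    F.when(
        F.col("OpenSince").isNotNull(),
        F.greatest(F.lit(0),
                   F.least(F.lit(360 * 2), F.datediff("Date", "OpenSince"))),
    ).otherwise(0),
)
toy.show()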
def metrics(session: SparkSession, dataframe: pyspark.sql.DataFrame,
            actual: str, predicted: str) -> pyspark.sql.DataFrame:
    '''
    Calculates evaluation metrics from predicted results

    :param dataframe: spark.sql.dataframe with the real and predicted values
    :param actual: Name of column with observed target values
    :param predicted: Name of column with predicted values
    :return: spark dataframe with accuracy, sensitivity, specificity,
             precision, recall and F1-score
    '''
    # Along each row are the actual values and down each column are the predicted
    dataframe = dataframe.withColumn(actual, col(actual).cast('integer'))
    dataframe = dataframe.withColumn(predicted, col(predicted).cast('integer'))
    cm = dataframe.crosstab(actual, predicted)
    cm = cm.sort(cm.columns[0], ascending=True)

    # Adds missing column in case just one class was predicted
    if '0' not in cm.columns:
        cm = cm.withColumn('0', lit(0))
    if '1' not in cm.columns:
        cm = cm.withColumn('1', lit(0))

    # Subsets values from confusion matrix
    zero = cm.filter(cm[cm.columns[0]] == 0.0)
    first_0 = zero.take(1)
    one = cm.filter(cm[cm.columns[0]] == 1.0)
    first_1 = one.take(1)

    tn = first_0[0][1]
    fp = first_0[0][2]
    fn = first_1[0][1]
    tp = first_1[0][2]

    # Calculate metrics from values in the confusion matrix
    if tp == 0:
        acc = float((tp + tn) / (tp + tn + fp + fn))
        sen = 0
        spe = float(tn / (tn + fp))
        prec = 0
        rec = 0
        f1 = 0
    elif tn == 0:
        acc = float((tp + tn) / (tp + tn + fp + fn))
        sen = float(tp / (tp + fn))
        spe = 0
        prec = float(tp / (tp + fp))
        rec = float(tp / (tp + fn))
        f1 = 2 * float((prec * rec) / (prec + rec))
    else:
        acc = float((tp + tn) / (tp + tn + fp + fn))
        sen = float(tp / (tp + fn))
        spe = float(tn / (tn + fp))
        prec = float(tp / (tp + fp))
        rec = float(tp / (tp + fn))
        f1 = 2 * float((prec * rec) / (prec + rec))

    # Print results
    print('Confusion Matrix and Statistics: \n')
    cm.show()

    print('True Positives:', tp)
    print('True Negatives:', tn)
    print('False Positives:', fp)
    print('False Negatives:', fn)
    print('Total:', dataframe.count(), '\n')

    print('Accuracy: {0:.2f}'.format(acc))
    print('Sensitivity: {0:.2f}'.format(sen))
    print('Specificity: {0:.2f}'.format(spe))
    print('Precision: {0:.2f}'.format(prec))
    print('Recall: {0:.2f}'.format(rec))
    print('F1-score: {0:.2f}'.format(f1))

    # Create spark dataframe with results
    l = [(acc, sen, spe, prec, rec, f1)]
    df = session.createDataFrame(
        l, ['Accuracy', 'Sensitivity', 'Specificity', 'Precision', 'Recall', 'F1'])
    return df
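# Hypothetical usage sketch for metrics (not in the original source): a tiny
# DataFrame of actual vs. predicted binary labels where both classes appear.
# Assumes an active SparkSession named `spark`.
preds = spark.createDataFrame(
    [(1, 1), (1, 0), (0, 0), (0, 0), (1, 1), (0, 1)],
    ["label", "prediction"],
)
metrics_df = metrics(spark, preds, actual="label", predicted="prediction")
metrics_df.show()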
def read_spark_df(df: pyspark.sql.DataFrame) -> int:
    """Return the row count of the given Spark DataFrame."""
    return df.count()
def impact(df: pyspark.sql.DataFrame, response_col: str,
           prob_mod: mlc.Model) -> Tuple[float, float, float]:
    r"""Observe the impact of treatment on the response variable.

    Currently the response must be binary. If the df is small enough, return
    the naive difference in the response mean grouped by label. Otherwise do
    an additional regression on the response col with the label as a predictor
    and use its coefficient as a measure of its impact. Binning and
    dimensionality reduction will occur if necessary to do an effective
    regression.

    Parameters
    ----------
    df: pyspark.sql.DataFrame
    response_col: str
    prob_mod: mlc.Model
        propensity model, mostly used to keep track of feature_col, label_col,
        pred_cols

    Returns
    -------
    treatment_rate : float
        treatment response rate
    control_rate : float
        control response rate
    adjusted_response : float
        impact of treatment on response, which may be
        `control_rate` - `treatment_rate` or may have further bias adjustment

    Raises
    ------
    ValueError
        when number of rows is less than `MINIMUM_POS_COUNT`*2
    UncaughtExceptions

    See Also
    --------
    bin_features
    _reduce_dimensionality
    """
    _persist_if_unpersisted(df)

    label_col = prob_mod.getOrDefault('labelCol')
    features_col = prob_mod.getOrDefault('featuresCol')
    pred_cols = _get_pred_cols(df, features_col)

    all_count = df.count()
    # safety check
    if all_count < MINIMUM_POS_COUNT * 2:
        logging.getLogger(__name__).critical(
            "somehow have less than MINIMUM_POS_COUNT*2 rows")
        raise ValueError(
            "Have less than MINIMUM_POS_COUNT*2 rows, this shouldn't be happening"
        )

    # dict because 1, 0 for label col are not guaranteed to be ordered
    naive_response_dict = dict()
    response_list = df.groupby(label_col).mean(response_col).collect()
    naive_response_dict[response_list[0][label_col]] = response_list[0][
        "avg({col})".format(col=response_col)]
    naive_response_dict[response_list[1][label_col]] = response_list[1][
        "avg({col})".format(col=response_col)]
    treatment_rate, control_rate = naive_response_dict[1], naive_response_dict[0]
    logging.getLogger(__name__).info(
        "treatment_rate:{tr:.2f} control_rate:{cr:.2f}".format(
            tr=treatment_rate, cr=control_rate))

    # return early if additional bias reduction is not applicable
    if all_count < NAIVE_THRESHOLD_COUNT:
        logging.getLogger(__name__).info(
            "additional bias adjustment inapplicable, returning naive difference"
        )
        return treatment_rate, control_rate, (control_rate - treatment_rate)

    logging.getLogger(__name__).info("additional bias adjustment possible")

    # choose fewer features if appropriate to prevent overfit; round down
    num_preds = int(
        df.where(F.col(label_col) == 1).count() // SAMPLES_PER_FEATURE) - 1
    logging.getLogger(__name__).info(
        "need max {n:,} predictors".format(n=num_preds))
    if num_preds < len(list(pred_cols)):
        logging.getLogger(__name__).info(
            "desired predictors {np:,} is less than existing {ep:,}, reducing dimensionality"
            .format(np=num_preds, ep=len(pred_cols)))
        kwargs = {
            'df': df,
            'label_col': label_col,
            'binned_features_col': features_col,
            'ncols': num_preds
        }
        df, pred_cols = reduce_dimensionality(args=kwargs, method='chi')

    pred_cols_r = pred_cols + [label_col]
    assembler_r = mlf.VectorAssembler(inputCols=pred_cols_r,
                                      outputCol='features_r')
    df = assembler_r.transform(df)
    _persist_if_unpersisted(df)

    lre_r = mlc.LogisticRegression(
        featuresCol='features_r',
        labelCol=response_col,
        predictionCol='prediction_{0}'.format(response_col),
        rawPredictionCol='rawPrediction_{0}'.format(response_col),
        probabilityCol='probability_{0}'.format(response_col))
    lrm_r = lre_r.fit(df)

    coeff_dict = dict(zip(pred_cols_r, lrm_r.coefficients))
    adjusted_response = control_rate * (1 - math.exp(coeff_dict[label_col]))
    logging.getLogger(__name__).info(
        "bias adjusted response is {ar:.2f}".format(ar=adjusted_response))

    return treatment_rate, control_rate, adjusted_response
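# Hypothetical call sketch for impact (not in the original source): assumes a
# DataFrame `matched_df` that already carries the propensity model's label and
# features columns plus the response column, and a fitted
# pyspark.ml.classification model `propensity_model`. Shown only to illustrate
# the expected inputs and the three returned rates; both names are placeholders.
treatment_rate, control_rate, adjusted = impact(
    df=matched_df,
    response_col="converted",
    prob_mod=propensity_model,
)
print("treatment={0:.3f} control={1:.3f} adjusted impact={2:.3f}".format(
    treatment_rate, control_rate, adjusted))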