Example #1
def check_total_rows(left_df: pyspark.sql.DataFrame,
                     right_df: pyspark.sql.DataFrame) -> None:
    left_df_count = left_df.count()
    right_df_count = right_df.count()
    assert left_df_count == right_df_count, \
        f"Number of rows are not same.\n\n" \
        f"Actual Rows (left_df): {left_df_count}\n" \
        f"Expected Rows (right_df): {right_df_count}\n"
Example #2
def test_read_write_parquet(
    test_parquet_in_asset: PySparkDataAsset,
    iris_spark: pyspark.sql.DataFrame,
    fake_airflow_context: Any,
    spark_session: pyspark.sql.SparkSession,
) -> None:
    p = path.abspath(
        test_parquet_in_asset.staging_pickedup_path(fake_airflow_context))
    os.makedirs(path.dirname(p), exist_ok=True)
    iris_spark.write.mode("overwrite").parquet(p)

    count_before = iris_spark.count()
    columns_before = len(iris_spark.columns)

    # reading without a spark_session should raise:
    with pytest.raises(expected_exception=ValueError):
        PySparkDataAssetIO.read_data_asset(test_parquet_in_asset,
                                           source_files=[p])

    x = PySparkDataAssetIO.read_data_asset(test_parquet_in_asset,
                                           source_files=[p],
                                           spark_session=spark_session)

    assert count_before == x.count()
    assert columns_before == len(x.columns)

    # try with additional kwargs:
    x = PySparkDataAssetIO.read_data_asset(
        asset=test_parquet_in_asset,
        source_files=[p],
        spark_session=spark_session,
        mergeSchema=True,
    )

    assert count_before == x.count()
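The test above depends on project-specific fixtures (PySparkDataAsset, PySparkDataAssetIO). A minimal sketch of the Parquet round-trip it exercises, using plain PySpark; the temporary path and session setup are assumptions:

# Plain-PySpark sketch of the Parquet write/read round-trip checked above.
import tempfile
from os import path

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("parquet-roundtrip").getOrCreate()
df = spark.createDataFrame([(1, 0.5), (2, 1.5)], ["id", "value"])

out_dir = path.join(tempfile.mkdtemp(), "data.parquet")
df.write.mode("overwrite").parquet(out_dir)

# mergeSchema is a standard Parquet reader option, comparable to the extra kwarg above.
read_back = spark.read.option("mergeSchema", "true").parquet(out_dir)

assert read_back.count() == df.count()
assert len(read_back.columns) == len(df.columns)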
Example #3
def assert_test_dfs_equal(expected_df: pyspark.sql.DataFrame,
                          generated_df: pyspark.sql.DataFrame) -> None:
    """
    Used to compare two dataframes (typically, in a unit test).
    Better than the direct df1.equals(df2) method, as this function
    allows for tolerances in the floating point columns, and is
    also more descriptive with which parts of the two dataframes
    are in disagreement.
    :param expected_df: First dataframe to compare
    :param generated_df: Second dataframe to compare
    """

    row_limit = 10000

    e_count = expected_df.count()
    g_count = generated_df.count()

    if (e_count > row_limit) or (g_count > row_limit):
        raise Exception(
            f"One or both of the dataframes passed has too many rows (>{row_limit})."
            f"Please limit your test sizes to be lower than this number.")

    assert e_count == g_count, "The dataframes have a different number of rows."

    expected_pdf = expected_df.toPandas()
    generated_pdf = generated_df.toPandas()

    assert list(expected_pdf.columns) == list(generated_pdf.columns), \
        "The two dataframes have different columns."

    for col in expected_pdf.columns:
        error_msg = f"The columns with name: `{col}` were not equal."
        if expected_pdf[col].dtype.type == np.object_:
            assert expected_pdf[[col]].equals(generated_pdf[[col]]), error_msg
        else:
            # Numpy will not equate nulls on both sides. Filter them out.
            expected_pdf = expected_pdf[expected_pdf[col].notnull()]
            generated_pdf = generated_pdf[generated_pdf[col].notnull()]
            try:
                is_close = np.allclose(expected_pdf[col].values,
                                       generated_pdf[col].values)
            except ValueError:
                logging.error(
                    f"Problem encountered while equating column '{col}'.")
                raise
            assert is_close, error_msg
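A hedged usage example for assert_test_dfs_equal: two DataFrames whose float column differs only within np.allclose tolerance (the session and values are illustrative):

# Illustrative call to assert_test_dfs_equal; session and data are assumptions.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("df-compare").getOrCreate()

expected = spark.createDataFrame([("a", 1.0), ("b", 2.0)], ["key", "score"])
generated = spark.createDataFrame([("a", 1.0 + 1e-9), ("b", 2.0)], ["key", "score"])

# Passes: the string column matches exactly and the float column is within tolerance.
assert_test_dfs_equal(expected, generated)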
Example #4
def test_read_write_csv(
    test_csv_asset: PySparkDataAsset,
    iris_spark: pyspark.sql.DataFrame,
    spark_session: pyspark.sql.SparkSession,
) -> None:
    # try without any extra kwargs:
    PySparkDataAssetIO.write_data_asset(asset=test_csv_asset, data=iris_spark)

    # try with additional kwargs:
    PySparkDataAssetIO.write_data_asset(asset=test_csv_asset,
                                        data=iris_spark,
                                        header=True)

    # test write mode: the default is overwrite; switching to "error" (error if exists) should raise:
    with pytest.raises(AnalysisException):
        PySparkDataAssetIO.write_data_asset(asset=test_csv_asset,
                                            data=iris_spark,
                                            header=True,
                                            mode="error")

    # test retrieval
    # before we can retrieve, we need to move the data from 'staging' to 'ready'
    os.makedirs(test_csv_asset.ready_path, exist_ok=True)

    # clear any existing 'ready' data and move the staged output into place
    shutil.rmtree(test_csv_asset.ready_path)
    shutil.move(test_csv_asset.staging_ready_path, test_csv_asset.ready_path)

    retrieved = PySparkDataAssetIO.retrieve_data_asset(
        test_csv_asset,
        spark_session=spark_session,
        inferSchema=True,
        header=True)
    assert retrieved.count() == iris_spark.count()

    # Test check for missing 'spark_session' kwarg
    with pytest.raises(ValueError):
        PySparkDataAssetIO.retrieve_data_asset(test_csv_asset)

    # Test check for invalid 'spark_session' kwarg
    with pytest.raises(TypeError):
        PySparkDataAssetIO.retrieve_data_asset(test_csv_asset,
                                               spark_session=42)
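As with the Parquet test, the fixtures are project-specific; here is a minimal plain-PySpark sketch of the CSV behaviour being checked (overwrite vs. error save modes, header and inferSchema on read), with an assumed temporary path:

# Plain-PySpark sketch of the CSV write/read behaviour checked above.
import tempfile
from os import path

from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException

spark = SparkSession.builder.master("local[1]").appName("csv-roundtrip").getOrCreate()
df = spark.createDataFrame([(1, 0.5), (2, 1.5)], ["id", "value"])

out_dir = path.join(tempfile.mkdtemp(), "data.csv")
df.write.mode("overwrite").csv(out_dir, header=True)

# mode="error" (a.k.a. "errorifexists") raises because the target already exists.
try:
    df.write.mode("error").csv(out_dir, header=True)
except AnalysisException:
    pass

read_back = spark.read.csv(out_dir, header=True, inferSchema=True)
assert read_back.count() == df.count()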
Example #5
def show_df(df: pyspark.sql.DataFrame,
            columns: list,
            rows: int = 10,
            sample=False,
            truncate=True):
    """
    Prints rows from a pyspark df

    :param df: pyspark dataframe
    :param columns: list of columns to print
    :param rows: how many rows to print - default 10
    :param sample: should we sample - default False
    :param truncate: truncate output - default True
    :return:
    """
    if sample:
        sample_percent = min(rows / df.count(), 1.0)
        log.info(f'sampling percentage: {sample_percent}')
        df.select(columns).sample(False, sample_percent,
                                  seed=1).show(rows, truncate=truncate)
    else:
        df.select(columns).show(rows, truncate=truncate)
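A short usage sketch for show_df with a small illustrative DataFrame (note that the sample=True branch also requires the module-level log logger used above to be configured):

# Hypothetical call to show_df; the DataFrame and column names are assumptions.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("show-df-demo").getOrCreate()
sales = spark.createDataFrame(
    [("store_1", 100.0), ("store_2", 250.0)], ["store", "revenue"])

# Print up to 10 rows of the selected columns without sampling.
show_df(sales, columns=["store", "revenue"], rows=10, truncate=False)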
def prepare_df(
    df: pyspark.sql.DataFrame,
    store_csv: pyspark.sql.DataFrame,
    store_states_csv: pyspark.sql.DataFrame,
    state_names_csv: pyspark.sql.DataFrame,
    google_trend_csv: pyspark.sql.DataFrame,
    weather_csv: pyspark.sql.DataFrame,
) -> pyspark.sql.DataFrame:
    num_rows = df.count()

    # expand dates
    df = expand_date(df)

    # convert the raw string flags (Open, Promo, StateHoliday, SchoolHoliday) to boolean columns
    df = (df.withColumn("Open", df.Open != "0").withColumn(
        "Promo",
        df.Promo != "0").withColumn("StateHoliday",
                                    df.StateHoliday != "0").withColumn(
                                        "SchoolHoliday",
                                        df.SchoolHoliday != "0"))

    # merge store information
    store = store_csv.join(store_states_csv, "Store")
    df = df.join(store, "Store")

    # merge Google Trend information
    google_trend_all = prepare_google_trend(google_trend_csv)
    df = (df.join(google_trend_all, ["State", "Year", "Week"])
            .select(df["*"], google_trend_all.trend))

    # merge in Google Trend for whole Germany
    google_trend_de = (google_trend_all[google_trend_all.file == "Rossmann_DE"]
                       .withColumnRenamed("trend", "trend_de"))
    df = (df.join(google_trend_de, ["Year", "Week"])
            .select(df["*"], google_trend_de.trend_de))

    # merge weather
    weather = weather_csv.join(state_names_csv,
                               weather_csv.file == state_names_csv.StateName)
    df = df.join(weather, ["State", "Date"])

    # fix null values
    df = (df.withColumn("CompetitionOpenSinceYear",
                        F.coalesce(df.CompetitionOpenSinceYear, F.lit(1900)))
            .withColumn("CompetitionOpenSinceMonth",
                        F.coalesce(df.CompetitionOpenSinceMonth, F.lit(1)))
            .withColumn("Promo2SinceYear",
                        F.coalesce(df.Promo2SinceYear, F.lit(1900)))
            .withColumn("Promo2SinceWeek",
                        F.coalesce(df.Promo2SinceWeek, F.lit(1))))

    # days and months since the competition has been open, cap it to 2 years
    df = df.withColumn(
        "CompetitionOpenSince",
        F.to_date(
            F.format_string("%s-%s-15", df.CompetitionOpenSinceYear,
                            df.CompetitionOpenSinceMonth)),
    )
    df = df.withColumn(
        "CompetitionDaysOpen",
        F.when(
            df.CompetitionOpenSinceYear > 1900,
            F.greatest(
                F.lit(0),
                F.least(F.lit(360 * 2),
                        F.datediff(df.Date, df.CompetitionOpenSince)),
            ),
        ).otherwise(0),
    )
    df = df.withColumn("CompetitionMonthsOpen",
                       (df.CompetitionDaysOpen / 30).cast(T.IntegerType()))

    # days and weeks of promotion, cap it to 25 weeks
    df = df.withColumn(
        "Promo2Since",
        F.expr(
            'date_add(format_string("%s-01-01", Promo2SinceYear), (cast(Promo2SinceWeek as int) - 1) * 7)'
        ),
    )
    df = df.withColumn(
        "Promo2Days",
        F.when(
            df.Promo2SinceYear > 1900,
            F.greatest(
                F.lit(0),
                F.least(F.lit(25 * 7), F.datediff(df.Date, df.Promo2Since))),
        ).otherwise(0),
    )
    df = df.withColumn("Promo2Weeks",
                       (df.Promo2Days / 7).cast(T.IntegerType()))

    # ensure that no row was lost through inner joins
    assert num_rows == df.count(), "lost rows in joins"
    return df
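prepare_df guards against rows being silently dropped by its inner joins by asserting that the row count is unchanged. A minimal sketch of that pattern in isolation, with made-up table and column names:

# Sketch of the "no rows lost in joins" guard used at the end of prepare_df.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("join-guard").getOrCreate()

sales = spark.createDataFrame([(1, 10.0), (2, 20.0)], ["Store", "Sales"])
stores = spark.createDataFrame([(1, "HE"), (2, "BY")], ["Store", "State"])

num_rows = sales.count()
joined = sales.join(stores, "Store")  # inner join: rows vanish if a Store has no match

assert num_rows == joined.count(), "lost rows in joins"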
Example #7
def metrics(session: SparkSession, dataframe: pyspark.sql.DataFrame, actual: str,
            predicted: str) -> pyspark.sql.DataFrame:
    '''
    Calculates evaluation metrics from predicted results

    :param session: Spark session used to build the results dataframe
    :param dataframe: spark.sql.dataframe with the real and predicted values
    :param actual: Name of column with observed target values
    :param predicted: Name of column with predicted values
    :return: One-row spark.sql.dataframe with accuracy, sensitivity, specificity,
        precision, recall and F1-score
    '''

    # Along each row are the actual values and down each column are the predicted
    dataframe = dataframe.withColumn(actual, col(actual).cast('integer'))
    dataframe = dataframe.withColumn(predicted, col(predicted).cast('integer'))
    cm = dataframe.crosstab(actual, predicted)
    cm = cm.sort(cm.columns[0], ascending=True)

    # Adds missing column in case just one class was predicted
    if '0' not in cm.columns:
        cm = cm.withColumn('0', lit(0))
    if '1' not in cm.columns:
        cm = cm.withColumn('1', lit(0))

    # Subsets values from confusion matrix
    zero = cm.filter(cm[cm.columns[0]] == 0.0)
    first_0 = zero.take(1)

    one = cm.filter(cm[cm.columns[0]] == 1.0)
    first_1 = one.take(1)

    tn = first_0[0][1]
    fp = first_0[0][2]
    fn = first_1[0][1]
    tp = first_1[0][2]

    # Calculate metrics from values in the confusion matrix
    if (tp == 0):
        acc = float((tp + tn) / (tp + tn + fp + fn))
        sen = 0
        spe = float((tn) / (tn + fp))
        prec = 0
        rec = 0
        f1 = 0
    elif (tn == 0):
        acc = float((tp + tn) / (tp + tn + fp + fn))
        sen = float((tp) / (tp + fn))
        spe = 0
        prec = float((tp) / (tp + fp))
        rec = float((tp) / (tp + fn))
        f1 = 2 * float((prec * rec) / (prec + rec))
    else:
        acc = float((tp + tn) / (tp + tn + fp + fn))
        sen = float((tp) / (tp + fn))
        spe = float((tn) / (tn + fp))
        prec = float((tp) / (tp + fp))
        rec = float((tp) / (tp + fn))
        f1 = 2 * float((prec * rec) / (prec + rec))

    # Print results
    print('Confusion Matrix and Statistics: \n')
    cm.show()

    print('True Positives:', tp)
    print('True Negatives:', tn)
    print('False Positives:', fp)
    print('False Negatives:', fn)
    print('Total:', dataframe.count(), '\n')

    print('Accuracy: {0:.2f}'.format(acc))
    print('Sensitivity: {0:.2f}'.format(sen))
    print('Specificity: {0:.2f}'.format(spe))
    print('Precision: {0:.2f}'.format(prec))
    print('Recall: {0:.2f}'.format(rec))
    print('F1-score: {0:.2f}'.format(f1))

    # Create spark dataframe with results
    results = [(acc, sen, spe, prec, rec, f1)]
    df = session.createDataFrame(
        results,
        ['Accuracy', 'Sensitivity', 'Specificity', 'Precision', 'Recall', 'F1'])
    return df
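A hedged sketch of calling metrics on a tiny predictions DataFrame; the data and column names are assumptions, and the module-level imports the function relies on (col and lit from pyspark.sql.functions) are presumed to be in place:

# Illustrative call to metrics; the values below are made up.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("metrics-demo").getOrCreate()

preds = spark.createDataFrame(
    [(0, 0), (0, 1), (1, 0), (1, 1), (1, 1)],
    ["label", "prediction"],
)

results = metrics(spark, preds, actual="label", predicted="prediction")
results.show()  # one row with Accuracy, Sensitivity, Specificity, Precision, Recall, F1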
Example #8
def read_spark_df(df: pyspark.sql.DataFrame) -> int:
    return df.count()
def impact(df: pyspark.sql.DataFrame, response_col: str,
           prob_mod: mlc.Model) -> Tuple[float, float, float]:
    r"""observe impact of treatment on response variable

    currently response must be binary
    if the df is small enough return naive difference in groupby label
    response mean. otherwise do additional regression on response col
    with label as predictor and use its coefficient as a measure of its
    impact. binning and dimensionality reduction will occur if necessary
    to do an effective regression

    Parameters
    ----------
    df: pyspark.sql.DataFrame
    response_col: str
    prob_mod: mlc.Model
        propensity model, mostly used to keep track of feature_col,
        label_col, pred_cols

    Returns
    -------
    treatment_rate : float
        treatment response rate
    control_rate : float
        control response rate
    adjusted_response : float
        impact of treatment on response, which may be
        `control_rate`-`treatment_rate` or may have further bias adjustment

    Raises
    ------
    ValueError
        when number of rows is less than `MINIMUM_POS_COUNT`*2
    UncaughtExceptions

    See Also
    --------
    bin_features
    _reduce_dimensionality

    Notes
    -----

    """

    _persist_if_unpersisted(df)

    label_col = prob_mod.getOrDefault('labelCol')
    features_col = prob_mod.getOrDefault('featuresCol')
    pred_cols = _get_pred_cols(df, features_col)

    all_count = df.count()

    # safety check
    if all_count < MINIMUM_POS_COUNT * 2:
        logging.getLogger(__name__).critical(
            "somehow have less than 2*MINIMUM_POS_COUNT*2 rows")
        raise ValueError(
            "Have less than MINIMUM_POS_COUNT*2 rows, this shouldnt be happening"
        )

    # dict because 1, 0 for label col are not guaranteed to be ordered
    naive_response_dict = dict()
    response_list = df.groupby(label_col).mean(response_col).collect()
    naive_response_dict[response_list[0][label_col]] = \
        response_list[0]["avg({col})".format(col=response_col)]
    naive_response_dict[response_list[1][label_col]] = \
        response_list[1]["avg({col})".format(col=response_col)]
    treatment_rate, control_rate = naive_response_dict[1], naive_response_dict[0]
    logging.getLogger(__name__).info(
        "treatment_rate:{tr:.2f}   control_rate:{cr:.2f}".format(
            tr=treatment_rate, cr=control_rate))

    # return early if additional bias reduction is not applicable
    if all_count < NAIVE_THRESHOLD_COUNT:
        logging.getLogger(__name__).info(
            "additional bias adjustment inapplicable, returning naive difference"
        )
        return treatment_rate, control_rate, (control_rate - treatment_rate)

    logging.getLogger(__name__).info("additional bias adjustment possible")
    # choose fewer features if appropriate to prevent overfit. round down
    num_preds = int(
        df.where(F.col(label_col) == 1).count() // SAMPLES_PER_FEATURE) - 1
    logging.getLogger(__name__).info(
        "need max {n:,} predictors".format(n=num_preds))
    if num_preds < len(list(pred_cols)):
        logging.getLogger(__name__).info(
            "desired predictors {np:,} is less than existing {ep:,}, reducing dimensionality"
            .format(np=num_preds, ep=len(pred_cols)))
        kwargs = {
            'df': df,
            'label_col': label_col,
            'binned_features_col': features_col,
            'ncols': num_preds
        }
        df, pred_cols = reduce_dimensionality(args=kwargs, method='chi')

    pred_cols_r = pred_cols + [label_col]
    assembler_r = mlf.VectorAssembler(inputCols=pred_cols_r,
                                      outputCol='features_r')
    df = assembler_r.transform(df)
    _persist_if_unpersisted(df)
    lre_r = mlc.LogisticRegression(
        featuresCol='features_r',
        labelCol=response_col,
        predictionCol='prediction_{0}'.format(response_col),
        rawPredictionCol='rawPrediction_{0}'.format(response_col),
        probabilityCol='probability_{0}'.format(response_col))
    lrm_r = lre_r.fit(df)

    coeff_dict = dict(zip(pred_cols_r, lrm_r.coefficients))

    adjusted_response = control_rate * (1 - math.exp(coeff_dict[label_col]))
    logging.getLogger(__name__).info(
        "bias asjusted response is {ar:.2f}".format(ar=adjusted_response))
    return treatment_rate, control_rate, adjusted_response
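The naive treatment/control comparison at the start of impact can be reproduced in isolation: group by the propensity label and average the binary response. A small sketch under assumed column names:

# Sketch of the naive group-by response-rate step used inside impact; names are assumptions.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("naive-impact").getOrCreate()

df = spark.createDataFrame(
    [(1, 1), (1, 0), (1, 1), (0, 0), (0, 1), (0, 0)],
    ["label", "response"],
)

rates = {row["label"]: row["avg(response)"]
         for row in df.groupby("label").mean("response").collect()}
treatment_rate, control_rate = rates[1], rates[0]

# Without further bias adjustment, the naive impact is the difference in rates.
print(treatment_rate, control_rate, control_rate - treatment_rate)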