import pytest

# NOTE: the import paths below are placeholders added for readability; these
# tests depend on ValidateSparkDataFrame, ValidationError, the AssertDf and
# AssertValidationResult helpers, the *_schema definitions, and the empty_*_df
# factories defined elsewhere in the project, so the module names must be
# adjusted to the actual layout. The spark_session fixture is expected to be
# provided by conftest.py or a pytest Spark plugin.
from validate_df import ValidateSparkDataFrame, ValidationError
from test_commons import (AssertDf, AssertValidationResult,
                          empty_integer_df, empty_string_df,
                          single_integer_column_schema, single_string_column_schema,
                          two_integer_columns_schema, two_string_columns_schema)


def test_should_return_rows_that_pass_all_checks_and_reject_rows_that_violate_any_test(spark_session):
    not_between = [25, 1]
    max_exceeded = [3, 30]
    correct = [3, 15]
    less_than_min = [1, 15]
    both_wrong = [7, 30]

    df = spark_session.createDataFrame([not_between, max_exceeded, correct, less_than_min, both_wrong], schema=two_integer_columns_schema)
    expected_correct = spark_session.createDataFrame([correct], schema=two_integer_columns_schema)
    expected_errors = spark_session.createDataFrame([not_between, max_exceeded, less_than_min, both_wrong], schema=two_integer_columns_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .is_between("col1", 0, 5) \
        .is_min("col1", 3) \
        .is_max("col2", 20) \
        .execute()

    AssertDf(result.correct_data, order_by_column="col1") \
        .contains_exactly(expected_correct.toPandas()) \
        .has_columns(["col1", "col2"])

    AssertDf(result.erroneous_data, order_by_column="col2") \
        .contains_exactly(expected_errors.toPandas()) \
        .has_columns(["col1", "col2"])

    assert result.errors == [ValidationError("col1", "between", 2), ValidationError("col1", "min", 1), ValidationError("col2", "max", 2)]
def test_min_should_check_all_given_columns_separately(spark_session):
    df = spark_session.createDataFrame([[5, 1], [10, 2], [15, 3]],
                                       schema=two_integer_columns_schema)
    expected_correct = spark_session.createDataFrame(
        [], schema=two_integer_columns_schema)
    expected_errors = spark_session.createDataFrame(
        [[5, 1], [10, 2], [15, 3]], schema=two_integer_columns_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .is_min("col1", 20) \
        .is_min("col2", 5) \
        .execute()

    AssertDf(result.correct_data, order_by_column="col1") \
        .contains_exactly(expected_correct.toPandas()) \
        .has_columns(["col1", "col2"])

    AssertDf(result.erroneous_data, order_by_column="col2") \
        .contains_exactly(expected_errors.toPandas()) \
        .has_columns(["col1", "col2"])

    assert result.errors == [
        ValidationError("col1", "min", 3),
        ValidationError("col2", "min", 3)
    ]
def test_should_check_all_given_columns_separately(spark_session):
    df = spark_session.createDataFrame(
        [["a", "12"], ["abcde", "56"], ["def", "123"]],
        schema=two_string_columns_schema)

    expected_correct = spark_session.createDataFrame(
        [], schema=two_string_columns_schema)
    expected_errors = spark_session.createDataFrame(
        [["a", "12"], ["abcde", "56"], ["def", "123"]],
        schema=two_string_columns_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .has_length_between("col1", 2, 4) \
        .has_length_between("col2", 1, 2) \
        .execute()

    AssertDf(result.correct_data, order_by_column="col1") \
        .contains_exactly(expected_correct.toPandas()) \
        .has_columns(["col1", "col2"])

    AssertDf(result.erroneous_data, order_by_column="col2") \
        .contains_exactly(expected_errors.toPandas()) \
        .has_columns(["col1", "col2"])

    assert result.errors == [
        ValidationError("col1", "text_length", 2),
        ValidationError("col2", "text_length", 1)
    ]
def test_should_pass_empty_df_if_there_are_no_rules(spark_session):
    df = empty_string_df(spark_session)

    result = ValidateSparkDataFrame(spark_session, df).execute()

    AssertValidationResult(column_name="col1", constraint_name="") \
        .check(
        actual=result,
        expected_correct=df,
        expected_erroneous=df
    )
def test_should_pass_df_if_there_are_no_rules(spark_session):
    df = spark_session.createDataFrame([["abc"], ["def"]],
                                       schema=single_string_column_schema)

    result = ValidateSparkDataFrame(spark_session, df).execute()

    AssertValidationResult(column_name="col1", constraint_name="") \
        .check(
        actual=result,
        expected_correct=df,
        expected_erroneous=empty_string_df(spark_session)
    )
def test_should_pass_empty_df_with_not_null_constraint(spark_session):
    df = empty_string_df(spark_session)

    result = ValidateSparkDataFrame(spark_session, df) \
        .is_not_null("col1") \
        .execute()

    AssertValidationResult(column_name="col1", constraint_name="not_null") \
        .check(
        actual=result,
        expected_correct=df,
        expected_erroneous=df
    )
def test_should_return_df_without_changes_if_regex_matches_the_text(spark_session):
    df = spark_session.createDataFrame([["abc"], ["def"], ["ghi"]], schema=single_string_column_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .text_matches_regex("col1", ".*") \
        .execute()

    AssertValidationResult(column_name="col1", constraint_name="regex_match") \
        .check(
        actual=result,
        expected_correct=df,
        expected_erroneous=empty_string_df(spark_session)
    )
def test_should_return_df_without_changes_if_empty_df_with_is_unique_constraint(spark_session):
    df = empty_string_df(spark_session)

    result = ValidateSparkDataFrame(spark_session, df) \
        .is_unique("col1") \
        .execute()

    AssertValidationResult(column_name="col1", constraint_name="unique") \
        .check(
        actual=result,
        expected_correct=df,
        expected_erroneous=df
    )
def test_should_return_df_without_changes_if_all_rows_are_unique(spark_session):
    df = spark_session.createDataFrame([["abc"], ["def"], ["ghi"]], schema=single_string_column_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .is_unique("col1") \
        .execute()

    AssertValidationResult(column_name="col1", constraint_name="unique") \
        .check(
        actual=result,
        expected_correct=df,
        expected_erroneous=empty_string_df(spark_session)
    )
def test_should_return_df_without_changes_if_all_are_between(spark_session):
    df = spark_session.createDataFrame([[5], [10], [15]],
                                       schema=single_integer_column_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .is_between("col1", 5, 15) \
        .execute()

    AssertValidationResult(column_name="col1", constraint_name="between") \
        .check(
        actual=result,
        expected_correct=df,
        expected_erroneous=empty_integer_df(spark_session)
    )
def test_should_reject_all_rows_if_smaller_than_min(spark_session):
    df = spark_session.createDataFrame([[5], [10], [15]],
                                       schema=single_integer_column_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .is_min("col1", 20) \
        .execute()

    AssertValidationResult(column_name="col1", constraint_name="min") \
        .check(
        actual=result,
        expected_correct=empty_integer_df(spark_session),
        expected_erroneous=df
    )
def test_should_reject_all_rows_if_regex_match_fails(spark_session):
    df = spark_session.createDataFrame([["abc"], ["a"], ["abcdefghi"]], schema=single_string_column_schema)
    expected_errors = spark_session.createDataFrame([["abc"], ["a"], ["abcdefghi"]], schema=single_string_column_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .text_matches_regex("col1", "[0-9]+") \
        .execute()

    AssertValidationResult(column_name="col1", constraint_name="regex_match") \
        .check(
        actual=result,
        expected_correct=empty_string_df(spark_session),
        expected_erroneous=expected_errors
    )
def test_should_return_df_without_changes_if_empty_df_with_mean_constraint(
        spark_session):
    df = empty_integer_df(spark_session)

    result = ValidateSparkDataFrame(spark_session, df) \
        .mean_column_value("col1", 0, 1) \
        .execute()

    AssertValidationResult(column_name="col1", constraint_name="mean_between") \
        .check(
        actual=result,
        expected_correct=df,
        expected_erroneous=df
    )
def test_should_reject_all_rows_if_none_of_them_is_in_the_list(spark_session):
    df = spark_session.createDataFrame([["abc"], ["a"], ["abcdefghi"]], schema=single_string_column_schema)
    expected_errors = spark_session.createDataFrame([["abc"], ["a"], ["abcdefghi"]], schema=single_string_column_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .one_of("col1", ["ab", "b"]) \
        .execute()

    AssertValidationResult(column_name="col1", constraint_name="one_of") \
        .check(
        actual=result,
        expected_correct=empty_string_df(spark_session),
        expected_erroneous=expected_errors
    )
def test_should_return_both_correct_and_incorrect_rows(spark_session):
    df = spark_session.createDataFrame([["a"], ["abc"], ["defg"], ["hijkl"]], schema=single_string_column_schema)
    expected_correct = spark_session.createDataFrame([["abc"], ["defg"]], schema=single_string_column_schema)
    expected_errors = spark_session.createDataFrame([["a"], ["hijkl"]], schema=single_string_column_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .text_matches_regex("col1", "^[a-z]{3,4}$") \
        .execute()

    AssertValidationResult(column_name="col1", constraint_name="regex_match") \
        .check(
        actual=result,
        expected_correct=expected_correct,
        expected_erroneous=expected_errors
    )
def test_should_return_df_without_changes_if_all_are_longer_than_lower_bound(
        spark_session):
    df = spark_session.createDataFrame([["abcdef"], ["ghijkl"]],
                                       schema=single_string_column_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .has_length_between("col1", 5, 20) \
        .execute()

    AssertValidationResult(column_name="col1", constraint_name="text_length") \
        .check(
        actual=result,
        expected_correct=df,
        expected_erroneous=empty_string_df(spark_session)
    )
def test_should_reject_all_rows_if_mean_is_larger_than_given_values(
        spark_session):
    df = spark_session.createDataFrame([[5], [10], [15]],
                                       schema=single_integer_column_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .mean_column_value("col1", 5, 8) \
        .execute()

    AssertValidationResult(column_name="col1", constraint_name="mean_between") \
        .check(
        actual=result,
        expected_correct=empty_integer_df(spark_session),
        expected_erroneous=df
    )
def test_one_of_of_other_columns_is_ignored(spark_session):
    df = spark_session.createDataFrame([["a", "123"], ["bcd", "45"], ["cd", "12345"]], schema=two_string_columns_schema)

    expected_correct = spark_session.createDataFrame([["cd", "12345"]], schema=two_string_columns_schema)
    expected_errors = spark_session.createDataFrame([["a", "123"], ["bcd", "45"]], schema=two_string_columns_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .one_of("col1", ["cd", "123", "45"]) \
        .execute()

    AssertValidationResult(column_name="col1", constraint_name="one_of") \
        .check(
        actual=result,
        expected_correct=expected_correct,
        expected_erroneous=expected_errors
    )
def test_nulls_in_other_columns_are_ignored(spark_session):
    df = spark_session.createDataFrame([["abc", "123"], [None, "456"], ["def", None]], schema=two_string_columns_schema)

    expected_correct = spark_session.createDataFrame([["abc", "123"], ["def", None]], schema=two_string_columns_schema)
    expected_errors = spark_session.createDataFrame([[None, "456"]], schema=two_string_columns_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .is_not_null("col1") \
        .execute()

    AssertValidationResult(column_name="col1", constraint_name="not_null") \
        .check(
        actual=result,
        expected_correct=expected_correct,
        expected_erroneous=expected_errors
    )
def test_not_null_should_return_both_correct_and_incorrect_rows(spark_session):
    df = spark_session.createDataFrame([["abc"], [None]], schema=single_string_column_schema)

    expected_correct = spark_session.createDataFrame([["abc"]], schema=single_string_column_schema)
    expected_errors = spark_session.createDataFrame([[None]], schema=single_string_column_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .is_not_null("col1") \
        .execute()

    AssertValidationResult(column_name="col1", constraint_name="not_null") \
        .check(
        actual=result,
        expected_correct=expected_correct,
        expected_erroneous=expected_errors
    )
def test_should_return_both_correct_and_incorrect_rows_numeric_values(spark_session):
    df = spark_session.createDataFrame([[1], [2], [3], [4]], schema=single_integer_column_schema)

    expected_correct = spark_session.createDataFrame([[1], [3]], schema=single_integer_column_schema)
    expected_errors = spark_session.createDataFrame([[2], [4]], schema=single_integer_column_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .one_of("col1", [1, 3, 5]) \
        .execute()

    AssertValidationResult(column_name="col1", constraint_name="one_of") \
        .check(
        actual=result,
        expected_correct=expected_correct,
        expected_erroneous=expected_errors
    )
def test_mean_value_of_other_columns_is_ignored(spark_session):
    df = spark_session.createDataFrame([[5, 1], [10, 2], [15, 3]],
                                       schema=two_integer_columns_schema)
    expected_errors = spark_session.createDataFrame(
        [], schema=two_integer_columns_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .mean_column_value("col1", 10, 10) \
        .execute()

    AssertValidationResult(column_name="col1", constraint_name="mean_between") \
        .check(
        actual=result,
        expected_correct=df,
        expected_erroneous=expected_errors
    )
def test_not_null_should_check_all_given_columns_separately_even_if_all_of_them_are_defined_at_once(spark_session):
    df = spark_session.createDataFrame([["abc", None], [None, "456"], [None, None]], schema=two_string_columns_schema)
    expected_errors = spark_session.createDataFrame([["abc", None], [None, "456"], [None, None]], schema=two_string_columns_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .are_not_null(["col1", "col2"]) \
        .execute()

    AssertDf(result.correct_data) \
        .is_empty() \
        .has_columns(["col1", "col2"])

    AssertDf(result.erroneous_data, order_by_column=["col1", "col2"]) \
        .contains_exactly(expected_errors.toPandas()) \
        .has_columns(["col1", "col2"])

    assert result.errors == [ValidationError("col1", "not_null", 2), ValidationError("col2", "not_null", 2)]
def test_should_reject_all_rows_if_all_are_the_same(spark_session):
    df = spark_session.createDataFrame([["abc"], ["abc"], ["abc"]], schema=single_string_column_schema)
    expected_errors = spark_session.createDataFrame([["abc"]], schema=single_string_column_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .is_unique("col1") \
        .execute()

    AssertDf(result.correct_data) \
        .is_empty() \
        .has_columns(["col1"])

    AssertDf(result.erroneous_data, order_by_column="col1") \
        .contains_exactly(expected_errors.toPandas()) \
        .has_columns(["col1"])

    assert result.errors == [ValidationError("col1", "unique", 3)]
def test_should_reject_all_rows_if_all_are_too_short_or_too_long(
        spark_session):
    df = spark_session.createDataFrame([["abc"], ["a"], ["abcdefghi"]],
                                       schema=single_string_column_schema)
    expected_errors = spark_session.createDataFrame(
        [["abc"], ["a"], ["abcdefghi"]], schema=single_string_column_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .has_length_between("col1", 5, 8) \
        .execute()

    AssertValidationResult(column_name="col1", constraint_name="text_length") \
        .check(
        actual=result,
        expected_correct=empty_string_df(spark_session),
        expected_erroneous=expected_errors
    )
def test_uniqueness_should_check_all_given_columns_separately_when_defining_all_columns_at_once(spark_session):
    df = spark_session.createDataFrame([["abc", "123"], ["abc", "456"], ["def", "123"]], schema=two_string_columns_schema)
    expected_correct = spark_session.createDataFrame([], schema=two_string_columns_schema)
    expected_errors = spark_session.createDataFrame([["abc", "123"], ["abc", "456"], ["def", "123"]], schema=two_string_columns_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .are_unique(["col1", "col2"]) \
        .execute()

    AssertDf(result.correct_data, order_by_column="col1") \
        .contains_exactly(expected_correct.toPandas()) \
        .has_columns(["col1", "col2"])

    AssertDf(result.erroneous_data, order_by_column="col2") \
        .contains_exactly(expected_errors.toPandas()) \
        .has_columns(["col1", "col2"])

    assert result.errors == [ValidationError("col1", "unique", 2), ValidationError("col2", "unique", 2)]
def test_min_should_return_both_correct_and_incorrect_rows(spark_session):
    df = spark_session.createDataFrame([[5], [10], [15]],
                                       schema=single_integer_column_schema)
    expected_correct = spark_session.createDataFrame(
        [[10], [15]], schema=single_integer_column_schema)
    expected_errors = spark_session.createDataFrame(
        [[5]], schema=single_integer_column_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .is_min("col1", 10) \
        .execute()

    AssertValidationResult(column_name="col1", constraint_name="min") \
        .check(
        actual=result,
        expected_correct=expected_correct,
        expected_erroneous=expected_errors
    )
def test_uniqueness_of_other_columns_is_ignored(spark_session):
    df = spark_session.createDataFrame([["abc", "123"], ["abc", "456"], ["def", "123"]], schema=two_string_columns_schema)
    expected_correct = spark_session.createDataFrame([["def", "123"]], schema=two_string_columns_schema)
    expected_errors = spark_session.createDataFrame([["abc", "123"], ["abc", "456"]], schema=two_string_columns_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .is_unique("col1") \
        .execute()

    AssertDf(result.correct_data, order_by_column="col1") \
        .contains_exactly(expected_correct.toPandas()) \
        .has_columns(["col1", "col2"])

    AssertDf(result.erroneous_data, order_by_column="col2") \
        .contains_exactly(expected_errors.toPandas()) \
        .has_columns(["col1", "col2"])

    assert result.errors == [ValidationError("col1", "unique", 2)]
def test_between_ignores_the_other_column(spark_session):
    df = spark_session.createDataFrame([[5, 8], [10, 20], [15, 8]],
                                       schema=two_integer_columns_schema)
    expected_correct = spark_session.createDataFrame(
        [[5, 8], [10, 20]], schema=two_integer_columns_schema)
    expected_errors = spark_session.createDataFrame(
        [[15, 8]], schema=two_integer_columns_schema)

    result = ValidateSparkDataFrame(spark_session, df) \
        .is_between("col1", 5, 10) \
        .execute()

    AssertDf(result.correct_data, order_by_column="col1") \
        .contains_exactly(expected_correct.toPandas()) \
        .has_columns(["col1", "col2"])

    AssertDf(result.erroneous_data, order_by_column="col2") \
        .contains_exactly(expected_errors.toPandas()) \
        .has_columns(["col1", "col2"])

    assert result.errors == [ValidationError("col1", "between", 1)]
def test_should_throw_error_if_there_are_duplicate_constraints(spark_session):
    with pytest.raises(ValueError):
        ValidateSparkDataFrame(spark_session, empty_integer_df(spark_session)) \
            .mean_column_value("col1", 10, 10) \
            .mean_column_value("col1", 5, 5) \
            .execute()
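

# --- Usage sketch (illustrative, not part of the test suite) -----------------
# A minimal example of driving the validator outside of tests, using only the
# calls exercised above. The column names and bounds are placeholders; `spark`
# and `raw_df` are assumed to be supplied by the caller.
def split_valid_and_invalid_rows(spark, raw_df):
    result = ValidateSparkDataFrame(spark, raw_df) \
        .is_not_null("col1") \
        .is_between("col2", 0, 100) \
        .execute()

    # correct_data holds the rows that satisfied every constraint,
    # erroneous_data holds the rows that violated at least one, and errors
    # lists one ValidationError(column, constraint, count) per violated constraint.
    return result.correct_data, result.erroneous_data, result.errors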