Example #1
def test_is_true(spark):
    source_df = spark.create_df(
        [(True, True), (False, False), (None, None)],
        [("has_stuff", BooleanType(), True),
         ("expected", BooleanType(), True)],
    )
    actual_df = source_df.withColumn("is_stuff_true",
                                     F.col("has_stuff").isTrue())
    chispa.assert_column_equality(actual_df, "is_stuff_true", "expected")
Example #2
def test_regexp_extract_all(spark):
    df = spark.create_df(
        [("200 - 300 PA.", ["200", "300"]), ("400 PA.", ["400"]),
         (None, None)],
        [
            ("str", StringType(), True),
            ("expected", ArrayType(StringType(), True), True),
        ],
    )
    actual_df = df.withColumn(
        "all_numbers", quinn.regexp_extract_all(F.col("str"), F.lit(r"(\d+)")))
    chispa.assert_column_equality(actual_df, "all_numbers", "expected")
Example #3
def test_ip_country_match(spark):
    schema = StructType([
        StructField("host", StringType()),
        StructField("expected_country", StringType())
    ])
    data = [
        Row(host='130.119.171.217', expected_country="US"),
        Row(host='2001:888:197d:0:250:fcff:fe23:3879', expected_country="NL")
    ]
    df = spark.createDataFrame(data, schema)
    ip_country_added_df = get_country_from_ip(df)
    assert_column_equality(ip_country_added_df, "expected_country",
                           "ip_country")
Example #4
def test_ip_country_no_match(spark):
    schema = StructType([
        StructField("host", StringType()),
        StructField("expected_country", StringType())
    ])
    data = [
        Row(host='127.0.0.1', expected_country="NoMatch"),
        Row(host='random.domain.com', expected_country="NotIP")
    ]
    df = spark.createDataFrame(data, schema)
    ip_country_added_df = get_country_from_ip(df)
    assert_column_equality(ip_country_added_df, "expected_country",
                           "ip_country")
Example #5
def test_is_not_in(spark):
    source_df = spark.create_df(
        [
            ("surfing", True),
            ("swimming", True),
            ("dancing", False),
        ],
        [
            ("fun_thing", StringType(), True),
            ("expected", BooleanType(), True),
        ]
    )
    bobs_hobbies = ["dancing", "snowboarding"]
    actual_df = source_df.withColumn("is_not_bobs_hobby", F.col("fun_thing").isNotIn(bobs_hobbies))
    chispa.assert_column_equality(actual_df, "is_not_bobs_hobby", "expected")
Example #6
def test_time_parser(spark):
    schema = StructType([
        StructField("timestamp", StringType()),
        StructField("expected_timestamp", TimestampType())
    ])
    data = [
        Row(timestamp='01/Jul/1995:12:30:23 -0400',
            expected_timestamp=datetime(1995, 7, 1, 19, 30, 23)),
        Row(timestamp='10/Aug/2003:20:28:01 +0200',
            expected_timestamp=datetime(2003, 8, 10, 21, 28, 1))
    ]
    df = spark.createDataFrame(data, schema)
    parsed_df = parse_log_time(df)
    parsed_df.show(truncate=False)
    assert_column_equality(parsed_df, "expected_timestamp", "parsed_time")
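parse_log_time is likewise user code that the page doesn't show. A sketch under the assumption that it parses Apache-style log timestamps with to_timestamp; the format string is an assumption, and the expected values above suggest the result is rendered in the Spark session's timezone rather than UTC:

from pyspark.sql import functions as F


def parse_log_time(df):
    # "01/Jul/1995:12:30:23 -0400" -> timestamp column named parsed_time
    return df.withColumn(
        "parsed_time",
        F.to_timestamp(F.col("timestamp"), "dd/MMM/yyyy:HH:mm:ss Z"),
    )
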
Example #7
def test_is_null_or_blank(spark):
    source_df = spark.create_df(
        [
            ("", True),
            ("   ", True),
            (None, True),
            ("hi", False),
        ],
        [
            ("blah", StringType(), True),
            ("expected", BooleanType(), True),
        ]
    )
    actual_df = source_df.withColumn("is_blah_null_or_blank", F.col("blah").isNullOrBlank())
    chispa.assert_column_equality(actual_df, "is_blah_null_or_blank", "expected")
Example #8
def test_anti_trim(spark):
    df = spark.create_df(
        [
            ("  I like     fish  ", "  Ilikefish  "),
            ("    zombies", "    zombies"),
            ("  simpsons   cat lady   ", "  simpsonscatlady   "),
            (None, None),
        ],
        [
            ("words", StringType(), True),
            ("expected", StringType(), True),
        ],
    )
    actual_df = df.withColumn("words_anti_trimmed",
                              quinn.anti_trim(F.col("words")))
    chispa.assert_column_equality(actual_df, "words_anti_trimmed", "expected")
Example #9
def test_exists(spark):
    df = spark.createDataFrame(
        [
            ([1, 2, 3], False),
            ([4, 5, 6], True),
            ([10, 11, 12], True),
        ],
        StructType([
            StructField("nums", ArrayType(IntegerType(), True), True),
            StructField("expected", BooleanType(), True),
        ]),
    )
    actual_df = df.withColumn("any_num_greater_than_5",
                              quinn.exists(lambda n: n > 5)(F.col("nums")))
    chispa.assert_column_equality(actual_df, "any_num_greater_than_5",
                                  "expected")
Example #10
def test_remove_non_word_characters(spark):
    df = spark.create_df(
        [
            ("I?like!fish>", "Ilikefish"),
            ("%%%zombies", "zombies"),
            ("si%$#@!#$!@#mpsons", "simpsons"),
            (None, None),
        ],
        [
            ("words", StringType(), True),
            ("expected", StringType(), True),
        ],
    )
    actual_df = df.withColumn("words_without_nonword_chars",
                              quinn.remove_non_word_characters(F.col("words")))
    chispa.assert_column_equality(actual_df, "words_without_nonword_chars",
                                  "expected")
Example #11
def test_remove_all_whitespace(spark):
    df = spark.create_df(
        [
            ("  I like     fish  ", "Ilikefish"),
            ("    zombies", "zombies"),
            ("simpsons   cat lady", "simpsonscatlady"),
            (None, None),
        ],
        [
            ("words", StringType(), True),
            ("expected", StringType(), True),
        ],
    )
    actual_df = df.withColumn("words_without_whitespace",
                              quinn.remove_all_whitespace(F.col("words")))
    chispa.assert_column_equality(actual_df, "words_without_whitespace",
                                  "expected")
Example #12
def it_works_with_end_date_of_sunday(spark):
    df = spark.create_df(
        [
            # converts a Thursday to the Sunday after
            (datetime.datetime(2020, 1, 2), datetime.datetime(2020, 1, 5)),
            # converts a Wednesday to the Sunday after
            (datetime.datetime(2020, 7, 15), datetime.datetime(2020, 7, 19)),
            # doesn't change if the day is a Sunday
            (datetime.datetime(2020, 7, 19), datetime.datetime(2020, 7, 19)),
            (None, None),
        ],
        [("some_date", DateType(), True), ("expected", DateType(), True)],
    )
    actual_df = df.withColumn(
        "week_end_date", quinn.week_end_date(F.col("some_date"), "Sun"))
    chispa.assert_column_equality(actual_df, "week_end_date", "expected")
Example #13
def it_defaults_to_saturday_week_end(spark):
    df = spark.create_df(
        [
            # converts a Thursday to the Saturday after
            (datetime.datetime(2020, 1, 2), datetime.datetime(2020, 1, 4)),
            # converts a Wednesday to the Saturday after
            (datetime.datetime(2020, 7, 15), datetime.datetime(2020, 7, 18)),
            # doesn't change if the day is a Saturday
            (datetime.datetime(2020, 7, 25), datetime.datetime(2020, 7, 25)),
            (None, None),
        ],
        [("some_date", DateType(), True), ("expected", DateType(), True)],
    )
    actual_df = df.withColumn("week_end_date",
                              quinn.week_end_date(F.col("some_date")))
    chispa.assert_column_equality(actual_df, "week_end_date", "expected")
Example #14
def test_multi_equals(spark):
    df = spark.create_df(
        [
            ("cat", "cat", True),
            ("cat", "dog", False),
            ("pig", "pig", False),
            ("", "", False),
            (None, None, False),
        ],
        [
            ("s1", StringType(), True),
            ("s2", StringType(), True),
            ("expected", BooleanType(), True),
        ],
    )
    actual_df = df.withColumn(
        "are_s1_and_s2_cat",
        quinn.multi_equals("cat")(F.col("s1"), F.col("s2")))
    chispa.assert_column_equality(actual_df, "are_s1_and_s2_cat", "expected")
Example #15
def it_defaults_to_sunday_start_date(spark):
    df = spark.create_df(
        [
            # converts a Thursday to the Sunday before
            (datetime.datetime(2020, 1, 2), datetime.datetime(2019, 12, 29)),
            # converts a Wednesday to the Sunday before
            (datetime.datetime(2020, 7, 15), datetime.datetime(2020, 7, 12)),
            # doesn't change if the day is a Sunday
            (datetime.datetime(2020, 7, 26), datetime.datetime(2020, 7, 26)),
            (None, None),
        ],
        [("some_date", DateType(), True), ("expected", DateType(), True)],
    )
    actual_df = df.withColumn("week_start_date",
                              quinn.week_start_date(F.col("some_date")))
    chispa.assert_column_equality(actual_df, "week_start_date", "expected")
Example #16
def it_works_with_integer_values(spark):
    df = spark.create_df(
        [
            (12, 14, True),
            (20, 26, False),
            (44, 41, True),
            (32, 9, False),
            (None, None, None)
        ],
        [
            ("num1", IntegerType(), True),
            ("num2", IntegerType(), True),
            ("expected", BooleanType(), True)
        ]
    )
    actual_df = df.withColumn(
        "are_nums_approx_equal",
        quinn.approx_equal(F.col("num1"), F.col("num2"), F.lit(5))
    )
    chispa.assert_column_equality(actual_df, "are_nums_approx_equal", "expected")
Example #17
def it_works_with_floating_values(spark):
    df = spark.create_df(
        [
            (1.1, 1.05, True),
            (1.1, 11.6, False),
            (1.02, 1.09, True),
            (1.02, 1.34, False),
            (None, None, None)
        ],
        [
            ("num1", FloatType(), True),
            ("num2", FloatType(), True),
            ("expected", BooleanType(), True)
        ]
    )
    actual_df = df.withColumn(
        "are_nums_approx_equal",
        quinn.approx_equal(F.col("num1"), F.col("num2"), F.lit(0.1))
    )
    chispa.assert_column_equality(actual_df, "are_nums_approx_equal", "expected")
Example #18
def it_works_with_start_date_of_monday(spark):
    df = spark.create_df(
        [
            # converts a Thursday to the Monday before
            (datetime.datetime(2020, 1, 2), datetime.datetime(2019, 12, 30)),
            # converts a Wednesday to the Monday before
            (datetime.datetime(2020, 7, 15), datetime.datetime(2020, 7, 13)),
            # doesn't change if the day is a Monday
            (datetime.datetime(2020, 7, 20), datetime.datetime(2020, 7, 20)),
            (None, None),
        ],
        [("some_date", DateType(), True), ("expected", DateType(), True)],
    )
    actual_df = df.withColumn(
        "week_start_date", quinn.week_start_date(F.col("some_date"), "Mon"))
    chispa.assert_column_equality(actual_df, "week_start_date", "expected")
Example #19
def test_null_between(spark):
    source_df = spark.create_df(
        [
            (17, None, 94, True),
            (17, None, 10, False),
            (None, 10, 5, True),
            (None, 10, 88, False),
            (10, 15, 11, True),
            (None, None, 11, False),
            (3, 5, None, False),
            (None, None, None, False),
        ],
        [
            ("lower_age", IntegerType(), True),
            ("upper_age", IntegerType(), True),
            ("age", IntegerType(), True),
            ("expected", BooleanType(), True),
        ],
    )
    actual_df = source_df.withColumn(
        "is_between",
        F.col("age").nullBetween(F.col("lower_age"), F.col("upper_age")))
    chispa.assert_column_equality(actual_df, "is_between", "expected")
Example #20
def test_ip_anonymizer(spark):
    schema = StructType([
        StructField("host", StringType()),
        StructField("ip_country", StringType()),
        StructField("expected_anonymized_ip", StringType())
    ])
    data = [
        Row(host='130.119.171.217',
            ip_country="US",
            expected_anonymized_ip="130.119.171.US"),
        Row(host='2001:888:197d:0:250:fcff:fe23:3879',
            ip_country="NL",
            expected_anonymized_ip="2001:888:197d:0:250:fcff:fe23:NL"),
        Row(host='random.domain.com',
            ip_country="NotIP",
            expected_anonymized_ip='random.domain.com'),
        Row(host='127.0.0.1',
            ip_country="NoMatch",
            expected_anonymized_ip="127.0.0.NaN")
    ]
    df = spark.createDataFrame(data, schema)
    anonymized_df = anonymize_ip(df)
    assert_column_equality(anonymized_df, "expected_anonymized_ip",
                           "anonymized_ip")
Example #21
def test_cast_arraytype(spark):
    data = [(['200', '300'], [200, 300]), (['400'], [400]), (None, None)]
    df = spark.createDataFrame(data, ["nums", "expected"])\
        .withColumn("actual", F.col("nums").cast(ArrayType(IntegerType(), True)))
    assert_column_equality(df, "actual", "expected")