# Example 1
def test_run_transform():
    """ct_a's transform should append a constant "a_hi" column named col_a."""
    source_rows = [("jose", "jose"), ("li", "li"), ("luisa", "laura")]
    source_df = spark.createDataFrame(source_rows, ["name", "expected_name"])

    result_df = source_df.transform(ct_a.transform())

    # Every input row gains the same "a_hi" value in col_a.
    expected_rows = [(name, expected, "a_hi") for name, expected in source_rows]
    expected_df = spark.createDataFrame(
        expected_rows, ["name", "expected_name", "col_a"])
    chispa.assert_df_equality(result_df, expected_df, ignore_nullable=True)
# Example 2
 def test_with_greeting2(self, spark):
     """with_greeting2("hi") should append a constant "hi" column named greeting."""
     source_data = [("jose", 1), ("li", 2)]
     source_df = spark.spark_session.createDataFrame(
         source_data, ["name", "age"])
     # T.with_greeting2 is curried: calling it with "hi" yields a
     # DataFrame -> DataFrame function suitable for transform().
     actual_df = source_df.transform(T.with_greeting2("hi"))
     expected_data = [("jose", 1, "hi"), ("li", 2, "hi")]
     expected_df = spark.spark_session.createDataFrame(
         expected_data, ["name", "age", "greeting"])
     chispa.assert_df_equality(actual_df, expected_df, ignore_nullable=True)
def test_column_names(spark):
    """SO.column_names rewrites spaces to "_&" and dots to "_$" in column names."""
    rows = [("jose", "oak", "switch")]
    source_df = spark.createDataFrame(
        rows, ["some first name", "some.tree.type", "a gaming.system"])

    actual_df = SO.column_names(source_df)

    # Row data is untouched; only the column names are rewritten.
    expected_df = spark.createDataFrame(
        rows, ["some_&first_&name", "some_$tree_$type", "a_&gaming_$system"])
    assert_df_equality(actual_df, expected_df)
# Example 4
def test_transform_with_lambda(spark):
    """DataFrame.transform accepts a plain function that derives a new column."""
    people = [("jose", 1), ("li", 2), ("liz", 3)]
    source_df = spark.createDataFrame(people, ["name", "age"])

    def double_age(df):
        # Adds age_times_two = age * 2.
        return df.withColumn("age_times_two", col("age") * 2)

    actual_df = source_df.transform(double_age)

    expected_rows = [(name, age, age * 2) for name, age in people]
    expected_df = spark.createDataFrame(
        expected_rows, ["name", "age", "age_times_two"])
    chispa.assert_df_equality(actual_df, expected_df)
# Example 5
def test_compare_dataframes(spark):
    """Two independently built DataFrames with identical rows compare equal."""
    rows = [("Alice", 1500), ("Bob", 1000), ("Charlie", 150), ("Dexter", 100)]
    columns = ["name", "count"]

    df1 = spark.spark_session.createDataFrame(rows, columns)
    df2 = spark.spark_session.createDataFrame(rows, columns)

    chispa.assert_df_equality(df1, df2)
# Example 6
def test_add_column_d():
    """Requesting column "d" also adds its upstream graph dependencies a and b."""
    names = [("jose",), ("li",), ("luisa",)]
    source_df = spark.createDataFrame(names, ["name"])

    actual_df = unicron.add_column(source_df, graph, "d")

    # Dependencies a ("aaa") and b ("bbb") are materialized before d ("ddd").
    expected_rows = [(name, "aaa", "bbb", "ddd") for (name,) in names]
    expected_df = spark.createDataFrame(expected_rows, ["name", "a", "b", "d"])
    chispa.assert_df_equality(actual_df, expected_df, ignore_nullable=True)
# Example 7
def test_root_to_e():
    """Running only the root→e path of the graph adds columns a and e."""
    names = [("jose",), ("li",), ("luisa",)]
    source_df = spark.createDataFrame(names, ["name"])

    path_transforms = unicron.transforms_to_run(source_df, graph, root, e)
    actual_df = unicron.run_custom_transforms(source_df, path_transforms)

    # Only a ("aaa") and e ("eee") lie on the requested path.
    expected_rows = [(name, "aaa", "eee") for (name,) in names]
    expected_df = spark.createDataFrame(expected_rows, ["name", "a", "e"])
    chispa.assert_df_equality(actual_df, expected_df, ignore_nullable=True)
# Example 8
 def it_runs_transformations():
     """run_custom_transforms applies ct_a, ct_ab, ct_abc in list order,
     adding the col_a, col_ab, and col_abc columns."""
     data = [("jose", "jose"), ("li", "li"), ("luisa", "laura")]
     df = spark.createDataFrame(data, ["name", "expected_name"])
     transforms = [ct_a, ct_ab, ct_abc]
     actual_df = unicron.run_custom_transforms(df, transforms)
     # Each transform appends one column; values are constant across rows.
     expected_data = [("jose", "jose", "a", "aba", "abcaba"),
                      ("li", "li", "a", "aba", "abcaba"),
                      ("luisa", "laura", "a", "aba", "abcaba")]
     expected_df = spark.createDataFrame(
         expected_data,
         ["name", "expected_name", "col_a", "col_ab", "col_abc"])
     chispa.assert_df_equality(actual_df, expected_df, ignore_nullable=True)
# Example 9
def test_verbose_code_without_transform(spark):
    """Chaining helpers by hand (no DataFrame.transform) yields the same result."""
    source_df = spark.createDataFrame(
        [("jose", 1), ("li", 2), ("liz", 3)], ["name", "age"])

    # Intermediate DataFrames are named explicitly instead of chained.
    greeted_df = with_greeting(source_df)
    result_df = with_something(greeted_df, "moo")

    expected_df = spark.createDataFrame(
        [("jose", 1, "hi", "moo"),
         ("li", 2, "hi", "moo"),
         ("liz", 3, "hi", "moo")],
        ["name", "age", "greeting", "something"])
    chispa.assert_df_equality(result_df, expected_df, ignore_nullable=True)
# Example 10
 def test_with_clean_first_name(self, spark):
     """with_clean_first_name derives clean_first_name from first_name with
     the special characters in the fixtures (&, #, !, *) removed."""
     source_df = spark.spark_session.create_df(
         [("jo&&se", "a"), ("##li", "b"),
          ("!!sam**", "c")], [("first_name", StringType(), True),
                              ("letter", StringType(), True)])
     actual_df = T.with_clean_first_name(source_df)
     # Original columns are preserved; only clean_first_name is added.
     expected_df = spark.spark_session.create_df(
         [("jo&&se", "a", "jose"), ("##li", "b", "li"),
          ("!!sam**", "c", "sam")],
         [("first_name", StringType(), True),
          ("letter", StringType(), True),
          ("clean_first_name", StringType(), True)])
     chispa.assert_df_equality(actual_df, expected_df, ignore_nullable=True)
# Example 11
 def it_snake_cases_col_names(spark):
     """quinn.snake_case_col_names lower-cases column names and replaces
     spaces with underscores; row data is untouched."""
     schema = StructType([
         StructField("I like CHEESE", StringType(), True),
         StructField("YUMMMMY stuff", StringType(), True)
     ])
     data = [("jose", "a"), ("li", "b"), ("sam", "c")]
     source_df = spark.createDataFrame(data, schema)
     actual_df = quinn.snake_case_col_names(source_df)
     expected_df = spark.create_df([("jose", "a"), ("li", "b"),
                                    ("sam", "c")],
                                   [("i_like_cheese", StringType(), True),
                                    ("yummmmy_stuff", StringType(), True)])
     chispa.assert_df_equality(actual_df, expected_df)
# Example 12
def test_chain_transforms(spark):
    """Two transform calls chain: with_greeting, then with_something("crazy")."""
    source_df = spark.createDataFrame(
        [("jose", 1), ("li", 2), ("liz", 3)], ["name", "age"])

    # with_something needs an extra argument, so it is wrapped in a lambda.
    actual_df = (source_df
                 .transform(with_greeting)
                 .transform(lambda df: with_something(df, "crazy")))

    expected_rows = [("jose", 1, "hi", "crazy"),
                     ("li", 2, "hi", "crazy"),
                     ("liz", 3, "hi", "crazy")]
    expected_df = spark.createDataFrame(
        expected_rows, ["name", "age", "greeting", "something"])
    chispa.assert_df_equality(actual_df, expected_df, ignore_nullable=True)
# Example 13
def test_currying(spark):
    """Curried transforms combined with compose(); stuff2's column appears
    before stuff1's in the result."""
    source_df = spark.createDataFrame(
        [("jose", 1), ("li", 2), ("liz", 3)], ["name", "age"])

    run_pipeline = compose(with_stuff1("nice", "person"), with_stuff2("yoyo"))
    actual_df = run_pipeline(source_df)

    expected_df = spark.createDataFrame(
        [("jose", 1, "yoyo", "nice person"),
         ("li", 2, "yoyo", "nice person"),
         ("liz", 3, "yoyo", "nice person")],
        ["name", "age", "stuff2", "stuff1"])
    chispa.assert_df_equality(actual_df, expected_df, ignore_nullable=True)
# Example 14
def test_transform_with_closure(spark):
    """with_funny("haha") returns a closure, so transform needs no lambda."""
    source_df = spark.createDataFrame(
        [("jose", 1), ("li", 2), ("liz", 3)], ["name", "age"])

    # with_funny pre-binds "haha" and hands back a DataFrame -> DataFrame
    # function — no lambda required.
    actual_df = source_df.transform(with_greeting).transform(with_funny("haha"))

    expected_df = spark.createDataFrame(
        [("jose", 1, "hi", "haha"),
         ("li", 2, "hi", "haha"),
         ("liz", 3, "hi", "haha")],
        ["name", "age", "greeting", "funny"])
    chispa.assert_df_equality(actual_df, expected_df, ignore_nullable=True)
# Example 15
def test_create_df(spark):
    """spark.create_df is terse shorthand for createDataFrame + StructType."""
    verbose_schema = StructType([
        StructField("name", StringType(), True),
        StructField("blah", StringType(), True),
    ])
    rows = [("jose", "a"), ("li", "b"), ("sam", "c")]
    verbose_df = spark.createDataFrame(rows, verbose_schema)

    # Same schema expressed as (name, type, nullable) tuples.
    terse_df = spark.create_df(
        rows, [("name", StringType(), True), ("blah", StringType(), True)])

    chispa.assert_df_equality(terse_df, verbose_df)
# Example 16
 def test_group_visits_by_video(self, spark):
     """group_visits_by_video counts visit rows per video id."""
     source_data = [
         (1234, 11111),
         (1234, 22222),
         (5678, 33333),
     ]
     source_df = spark.spark_session.createDataFrame(
         source_data,
         [Constants.VISITS_VIDEO_ID, Constants.VISITS_LOCATION_ID])
     actual_df = Processor.group_visits_by_video(source_df)
     # Video 1234 has two visits, 5678 has one.
     expected_data = [(1234, 2), (5678, 1)]
     expected_df = spark.spark_session.createDataFrame(
         expected_data,
         [Constants.VISITSXVIDEO_VIDEO_ID, Constants.VISITSXVIDEO_COUNT])
     chispa.assert_df_equality(actual_df, expected_df, ignore_nullable=True)
# Example 17
def test_transform_with_functools_partial(spark):
    """functools.partial pre-binds extra arguments for DataFrame.transform."""
    source_df = spark.createDataFrame(
        [("jose", 1), ("li", 2), ("liz", 3)], ["name", "age"])

    # partial is optional for transformations that only take a single
    # DataFrame argument (with_greeting), but it is what pre-binds "warm"
    # for with_jacket.
    actual_df = (source_df
                 .transform(partial(with_greeting))
                 .transform(partial(with_jacket, "warm")))

    expected_df = spark.createDataFrame(
        [("jose", 1, "hi", "warm"),
         ("li", 2, "hi", "warm"),
         ("liz", 3, "hi", "warm")],
        ["name", "age", "greeting", "jacket"])
    chispa.assert_df_equality(actual_df, expected_df, ignore_nullable=True)
# Example 18
    def it_converts_a_show_string_to_a_dataframe(spark):
        """quinn.show_output_to_df parses the ASCII table emitted by
        DataFrame.show() back into a DataFrame; every parsed value is a
        string (note age "1", "2", "3")."""
        s = """+----+---+-----------+------+
|name|age|     stuff1|stuff2|
+----+---+-----------+------+
|jose|  1|nice person|  yoyo|
|  li|  2|nice person|  yoyo|
| liz|  3|nice person|  yoyo|
+----+---+-----------+------+"""
        actual_df = quinn.show_output_to_df(s, spark)
        expected_data = [("jose", "1", "nice person", "yoyo"),
                         ("li", "2", "nice person", "yoyo"),
                         ("liz", "3", "nice person", "yoyo")]
        expected_df = spark.createDataFrame(
            expected_data, ["name", "age", "stuff1", "stuff2"])
        chispa.assert_df_equality(expected_df, actual_df)
# Example 19
    def it_renames_dots_to_underscores(spark):
        """quinn.with_columns_renamed applies the given rename function to
        every column name (here: dots become underscores)."""
        def dots_to_underscores(s):
            return s.replace(".", "_")

        schema = StructType([
            StructField("i.like.cheese", StringType(), True),
            StructField("yummy.stuff", StringType(), True)
        ])
        data = [("jose", "a"), ("li", "b"), ("sam", "c")]
        source_df = spark.createDataFrame(data, schema)
        # with_columns_renamed is curried: it returns a DataFrame transform.
        actual_df = quinn.with_columns_renamed(dots_to_underscores)(source_df)
        expected_df = spark.create_df([("jose", "a"), ("li", "b"),
                                       ("sam", "c")],
                                      [("i_like_cheese", StringType(), True),
                                       ("yummy_stuff", StringType(), True)])
        chispa.assert_df_equality(actual_df, expected_df)
# Example 20
    def test_normalize_count_by_videos(self, spark):
        """normalize_count_by_videos collapses to one row per video id.

        The fixture expects (1234, 3) and (5678, 10) — consistent with
        keeping the max count per video. TODO(review): confirm the exact
        aggregation against Normalizer's implementation.
        """
        source_data = [
            (1234, 2),
            (1234, 3),
            (5678, 10),
            (5678, 1),
        ]
        source_df = spark.spark_session.createDataFrame(
            source_data,
            [Constants.VISITSXVIDEO_VIDEO_ID, Constants.VISITSXVIDEO_COUNT])

        actual_df = Normalizer.normalize_count_by_videos(source_df)
        expected_data = [(1234, 3), (5678, 10)]
        expected_df = spark.spark_session.createDataFrame(
            expected_data,
            [Constants.VISITSXVIDEO_VIDEO_ID, Constants.VISITSXVIDEO_COUNT])
        chispa.assert_df_equality(actual_df, expected_df, ignore_nullable=True)
# Example 21
def test_regex_matching_for_ipv4_ipv6(spark):
    """parse_access_log_to_df splits access-log lines into typed columns,
    handling both IPv4 and IPv6 host addresses."""
    # One IPv4 (1995) and one IPv6 (2003) log line in Common Log Format.
    data = [
        Row(value=
            '130.119.171.217 - - [01/Jul/1995:12:30:23 -0400] "GET /ksc.html HTTP/1.0" 200 7074'
            ),
        Row(value=
            '2001:888:197d:0:250:fcff:fe23:3879 - - [10/Aug/2003:20:28:01 +0200] "GET /ipv6/ksc.html HTTP/1.1" 200 472'
            ),
    ]
    test_schema = StructType([StructField('value', StringType())])
    test_df_raw = spark.createDataFrame(data, test_schema)
    test_df = parse_access_log_to_df(test_df_raw)

    # status and content_size are parsed as integers; everything else stays a string.
    expected_schema = StructType([
        StructField('host', StringType()),
        StructField('rfc1413', StringType()),
        StructField('user', StringType()),
        StructField('timestamp', StringType()),
        StructField('method', StringType()),
        StructField('endpoint', StringType()),
        StructField('protocol', StringType()),
        StructField('status', IntegerType()),
        StructField('content_size', IntegerType())
    ])
    # NOTE(review): the raw lines carry '-' for the user field, but the
    # expected rows say '******' — this looks like a redaction artifact in
    # this copy of the test; confirm the real expected value.
    expected_data = [
        Row(host='130.119.171.217',
            rfc1413='-',
            user='******',
            timestamp='01/Jul/1995:12:30:23 -0400',
            method='GET',
            endpoint='/ksc.html',
            protocol='HTTP/1.0',
            status=200,
            content_size=7074),
        Row(host='2001:888:197d:0:250:fcff:fe23:3879',
            rfc1413='-',
            user='******',
            timestamp='10/Aug/2003:20:28:01 +0200',
            method='GET',
            endpoint='/ipv6/ksc.html',
            protocol='HTTP/1.1',
            status=200,
            content_size=472)
    ]
    expected_df = spark.createDataFrame(expected_data, expected_schema)
    assert_df_equality(test_df, expected_df)
# Example 22
def test_modify_column_names_error(spark):
    """T.modify_column_names applies the rename helper to every column name."""
    rows = [
        ("jose", 8),
        ("li", 23),
        ("luisa", 48),
    ]
    source_df = spark.createDataFrame(
        rows, ["first.name", "person.favorite.number"])

    actual_df = T.modify_column_names(source_df, SH.dots_to_underscores)

    # Data is unchanged; dots in column names become underscores.
    expected_df = spark.createDataFrame(
        rows, ["first_name", "person_favorite_number"])
    assert_df_equality(actual_df, expected_df)
# Example 23
def test_sort_columns_asc(spark):
    """T.sort_columns(df, "asc") orders columns alphabetically, values follow."""
    source_df = spark.createDataFrame(
        [("jose", "oak", "switch"),
         ("li", "redwood", "xbox"),
         ("luisa", "maple", "ps4")],
        ["name", "tree", "gaming_system"])

    actual_df = T.sort_columns(source_df, "asc")

    # Alphabetical column order: gaming_system, name, tree.
    expected_df = spark.createDataFrame(
        [("switch", "jose", "oak"),
         ("xbox", "li", "redwood"),
         ("ps4", "luisa", "maple")],
        ["gaming_system", "name", "tree"])
    assert_df_equality(actual_df, expected_df)
# Example 24
 def it_sorts_columns_in_desc_order(spark):
     """quinn.sort_columns(df, "desc") orders columns reverse-alphabetically
     (tree, name, gaming_system); row values follow their columns."""
     source_df = spark.create_df([
         ("jose", "oak", "switch"),
         ("li", "redwood", "xbox"),
         ("luisa", "maple", "ps4"),
     ], [
         ("name", StringType(), True),
         ("tree", StringType(), True),
         ("gaming_system", StringType(), True),
     ])
     actual_df = quinn.sort_columns(source_df, "desc")
     expected_df = spark.create_df([
         ("oak", "jose", "switch"),
         ("redwood", "li", "xbox"),
         ("maple", "luisa", "ps4"),
     ], [
         ("tree", StringType(), True),
         ("name", StringType(), True),
         ("gaming_system", StringType(), True),
     ])
     chispa.assert_df_equality(actual_df, expected_df)
# Example 25
    def it_renames_some_columns_with_dots(spark):
        """with_some_columns_renamed renames only columns matched by the
        predicate (names starting with "a"); "b.person" is left untouched."""
        def dots_to_underscores(s):
            # Rename function applied to the matching columns.
            return s.replace(".", "_")

        def change_col_name(s):
            # Predicate selecting which columns to rename.
            return s.startswith("a")

        schema = StructType([
            StructField("a.person", StringType(), True),
            StructField("a.thing", StringType(), True),
            StructField("b.person", StringType(), True)
        ])
        data = [("frank", "hot dog", "mia")]
        source_df = spark.createDataFrame(data, schema)
        actual_df = quinn.with_some_columns_renamed(dots_to_underscores,
                                                    change_col_name)(source_df)
        expected_df = spark.create_df([("frank", "hot dog", "mia")],
                                      [("a_person", StringType(), True),
                                       ("a_thing", StringType(), True),
                                       ("b.person", StringType(), True)])
        chispa.assert_df_equality(actual_df, expected_df)
# Example 26
 def test_normalize_visit(self, spark):
     """normalize_visit appends a visit-id column built by joining the five
     key columns with underscores (e.g. '1_2_3_4_5')."""
     source_data = [
         (1, 2, 3, 4, 5),
         (10, 20, 30, 40, 50),
     ]
     source_df = spark.spark_session.createDataFrame(
         source_data, [
             Constants.VISITS_USER_ID, Constants.VISITS_VIDEO_ID,
             Constants.VISITS_DEVICE_ID, Constants.VISITS_LOCATION_ID,
             Constants.VISITS_VISIT_TIMESTAMP
         ])
     actual_df = Normalizer.normalize_visit(source_df)
     expected_data = [
         (1, 2, 3, 4, 5, '1_2_3_4_5'),
         (10, 20, 30, 40, 50, '10_20_30_40_50'),
     ]
     expected_df = spark.spark_session.createDataFrame(
         expected_data, [
             Constants.VISITS_USER_ID, Constants.VISITS_VIDEO_ID,
             Constants.VISITS_DEVICE_ID, Constants.VISITS_LOCATION_ID,
             Constants.VISITS_VISIT_TIMESTAMP, Constants.VISITS_ID
         ])
     chispa.assert_df_equality(actual_df, expected_df, ignore_nullable=True)
# Example 27
    def it_renames_columns_based_on_a_map(spark):
        """with_some_columns_renamed can drive renames from a dict: columns
        present in the mapping are renamed, others ("hi") are untouched."""
        mapping = {"chips": "french_fries", "petrol": "gas"}

        def british_to_american(s):
            # Rename function: look up the American name for a British one.
            return mapping[s]

        def change_col_name(s):
            # Predicate: only rename columns that appear in the mapping.
            return s in mapping

        schema = StructType([
            StructField("chips", StringType(), True),
            StructField("hi", StringType(), True),
            StructField("petrol", StringType(), True)
        ])
        data = [("potato", "hola!", "disel")]
        source_df = spark.createDataFrame(data, schema)
        actual_df = quinn.with_some_columns_renamed(british_to_american,
                                                    change_col_name)(source_df)
        expected_df = spark.create_df([("potato", "hola!", "disel")],
                                      [("french_fries", StringType(), True),
                                       ("hi", StringType(), True),
                                       ("gas", StringType(), True)])
        chispa.assert_df_equality(actual_df, expected_df)