def test_run_transform():
    """ct_a.transform() yields a transform that appends a constant col_a."""
    rows = [("jose", "jose"), ("li", "li"), ("luisa", "laura")]
    input_df = spark.createDataFrame(rows, ["name", "expected_name"])
    result_df = input_df.transform(ct_a.transform())
    expected_df = spark.createDataFrame(
        [("jose", "jose", "a_hi"),
         ("li", "li", "a_hi"),
         ("luisa", "laura", "a_hi")],
        ["name", "expected_name", "col_a"])
    chispa.assert_df_equality(result_df, expected_df, ignore_nullable=True)
def test_with_greeting2(self, spark):
    """T.with_greeting2 appends a greeting column holding the given text."""
    input_df = spark.spark_session.createDataFrame(
        [("jose", 1), ("li", 2)], ["name", "age"])
    result_df = input_df.transform(T.with_greeting2("hi"))
    expected_df = spark.spark_session.createDataFrame(
        [("jose", 1, "hi"), ("li", 2, "hi")],
        ["name", "age", "greeting"])
    chispa.assert_df_equality(result_df, expected_df, ignore_nullable=True)
def test_column_names(spark):
    """SO.column_names maps spaces to _& and dots to _$ in column names."""
    input_df = spark.createDataFrame(
        [("jose", "oak", "switch")],
        ["some first name", "some.tree.type", "a gaming.system"])
    result_df = SO.column_names(input_df)
    expected_df = spark.createDataFrame(
        [("jose", "oak", "switch")],
        ["some_&first_&name", "some_$tree_$type", "a_&gaming_$system"])
    assert_df_equality(result_df, expected_df)
def test_transform_with_lambda(spark):
    """DataFrame.transform accepts an inline lambda deriving age_times_two."""
    input_df = spark.createDataFrame(
        [("jose", 1), ("li", 2), ("liz", 3)], ["name", "age"])
    result_df = input_df.transform(
        lambda df: df.withColumn("age_times_two", col("age") * 2))
    expected_df = spark.createDataFrame(
        [("jose", 1, 2), ("li", 2, 4), ("liz", 3, 6)],
        ["name", "age", "age_times_two"])
    chispa.assert_df_equality(result_df, expected_df)
def test_compare_dataframes(spark):
    """Two DataFrames built from identical rows compare equal."""
    rows = [("Alice", 1500), ("Bob", 1000), ("Charlie", 150), ("Dexter", 100)]
    left = spark.spark_session.createDataFrame(rows, ["name", "count"])
    right = spark.spark_session.createDataFrame(rows, ["name", "count"])
    chispa.assert_df_equality(left, right)
def test_add_column_d():
    """unicron.add_column for "d" also materializes its graph dependencies a and b."""
    input_df = spark.createDataFrame(
        [("jose",), ("li",), ("luisa",)], ["name"])
    result_df = unicron.add_column(input_df, graph, "d")
    expected_df = spark.createDataFrame(
        [("jose", "aaa", "bbb", "ddd"),
         ("li", "aaa", "bbb", "ddd"),
         ("luisa", "aaa", "bbb", "ddd")],
        ["name", "a", "b", "d"])
    chispa.assert_df_equality(result_df, expected_df, ignore_nullable=True)
def test_root_to_e():
    """transforms_to_run walks the graph from root to e; running them adds a and e."""
    input_df = spark.createDataFrame(
        [("jose",), ("li",), ("luisa",)], ["name"])
    to_run = unicron.transforms_to_run(input_df, graph, root, e)
    result_df = unicron.run_custom_transforms(input_df, to_run)
    expected_df = spark.createDataFrame(
        [("jose", "aaa", "eee"),
         ("li", "aaa", "eee"),
         ("luisa", "aaa", "eee")],
        ["name", "a", "e"])
    chispa.assert_df_equality(result_df, expected_df, ignore_nullable=True)
def it_runs_transformations():
    """run_custom_transforms applies each transform in list order."""
    rows = [("jose", "jose"), ("li", "li"), ("luisa", "laura")]
    input_df = spark.createDataFrame(rows, ["name", "expected_name"])
    result_df = unicron.run_custom_transforms(input_df, [ct_a, ct_ab, ct_abc])
    expected_df = spark.createDataFrame(
        [("jose", "jose", "a", "aba", "abcaba"),
         ("li", "li", "a", "aba", "abcaba"),
         ("luisa", "laura", "a", "aba", "abcaba")],
        ["name", "expected_name", "col_a", "col_ab", "col_abc"])
    chispa.assert_df_equality(result_df, expected_df, ignore_nullable=True)
def test_verbose_code_without_transform(spark):
    """Calling the helpers directly (no .transform chaining) gives the same result."""
    input_df = spark.createDataFrame(
        [("jose", 1), ("li", 2), ("liz", 3)], ["name", "age"])
    greeted_df = with_greeting(input_df)
    result_df = with_something(greeted_df, "moo")
    expected_df = spark.createDataFrame(
        [("jose", 1, "hi", "moo"),
         ("li", 2, "hi", "moo"),
         ("liz", 3, "hi", "moo")],
        ["name", "age", "greeting", "something"])
    chispa.assert_df_equality(result_df, expected_df, ignore_nullable=True)
def test_with_clean_first_name(self, spark):
    """T.with_clean_first_name strips non-letter characters into clean_first_name."""
    input_df = spark.spark_session.create_df(
        [("jo&&se", "a"), ("##li", "b"), ("!!sam**", "c")],
        [("first_name", StringType(), True),
         ("letter", StringType(), True)])
    result_df = T.with_clean_first_name(input_df)
    expected_df = spark.spark_session.create_df(
        [("jo&&se", "a", "jose"), ("##li", "b", "li"), ("!!sam**", "c", "sam")],
        [("first_name", StringType(), True),
         ("letter", StringType(), True),
         ("clean_first_name", StringType(), True)])
    chispa.assert_df_equality(result_df, expected_df, ignore_nullable=True)
def it_snake_cases_col_names(spark):
    """quinn.snake_case_col_names lowercases and underscores column names."""
    input_df = spark.createDataFrame(
        [("jose", "a"), ("li", "b"), ("sam", "c")],
        StructType([
            StructField("I like CHEESE", StringType(), True),
            StructField("YUMMMMY stuff", StringType(), True),
        ]))
    result_df = quinn.snake_case_col_names(input_df)
    expected_df = spark.create_df(
        [("jose", "a"), ("li", "b"), ("sam", "c")],
        [("i_like_cheese", StringType(), True),
         ("yummmmy_stuff", StringType(), True)])
    chispa.assert_df_equality(result_df, expected_df)
def test_chain_transforms(spark):
    """Transforms chain: a bare helper followed by a lambda-wrapped one."""
    input_df = spark.createDataFrame(
        [("jose", 1), ("li", 2), ("liz", 3)], ["name", "age"])
    result_df = (
        input_df
        .transform(with_greeting)
        .transform(lambda df: with_something(df, "crazy"))
    )
    expected_df = spark.createDataFrame(
        [("jose", 1, "hi", "crazy"),
         ("li", 2, "hi", "crazy"),
         ("liz", 3, "hi", "crazy")],
        ["name", "age", "greeting", "something"])
    chispa.assert_df_equality(result_df, expected_df, ignore_nullable=True)
def test_currying(spark):
    """Curried transforms glued together with compose() form a reusable pipeline."""
    input_df = spark.createDataFrame(
        [("jose", 1), ("li", 2), ("liz", 3)], ["name", "age"])
    pipeline = compose(with_stuff1("nice", "person"), with_stuff2("yoyo"))
    result_df = pipeline(input_df)
    expected_df = spark.createDataFrame(
        [("jose", 1, "yoyo", "nice person"),
         ("li", 2, "yoyo", "nice person"),
         ("liz", 3, "yoyo", "nice person")],
        ["name", "age", "stuff2", "stuff1"])
    chispa.assert_df_equality(result_df, expected_df, ignore_nullable=True)
def test_transform_with_closure(spark):
    """A closure-returning helper slots straight into transform."""
    input_df = spark.createDataFrame(
        [("jose", 1), ("li", 2), ("liz", 3)], ["name", "age"])
    # no lambda required
    result_df = input_df.transform(with_greeting).transform(with_funny("haha"))
    expected_df = spark.createDataFrame(
        [("jose", 1, "hi", "haha"),
         ("li", 2, "hi", "haha"),
         ("liz", 3, "hi", "haha")],
        ["name", "age", "greeting", "funny"])
    chispa.assert_df_equality(result_df, expected_df, ignore_nullable=True)
def test_create_df(spark):
    """The create_df shorthand matches an explicitly built StructType schema."""
    explicit_schema = StructType([
        StructField("name", StringType(), True),
        StructField("blah", StringType(), True),
    ])
    rows = [("jose", "a"), ("li", "b"), ("sam", "c")]
    built_df = spark.createDataFrame(rows, explicit_schema)
    shorthand_df = spark.create_df(
        rows,
        [("name", StringType(), True), ("blah", StringType(), True)])
    chispa.assert_df_equality(shorthand_df, built_df)
def test_group_visits_by_video(self, spark):
    """Processor.group_visits_by_video counts visit rows per video id."""
    input_df = spark.spark_session.createDataFrame(
        [(1234, 11111), (1234, 22222), (5678, 33333)],
        [Constants.VISITS_VIDEO_ID, Constants.VISITS_LOCATION_ID])
    result_df = Processor.group_visits_by_video(input_df)
    expected_df = spark.spark_session.createDataFrame(
        [(1234, 2), (5678, 1)],
        [Constants.VISITSXVIDEO_VIDEO_ID, Constants.VISITSXVIDEO_COUNT])
    chispa.assert_df_equality(result_df, expected_df, ignore_nullable=True)
def test_transform_with_functools_partial(spark):
    """functools.partial adapts multi-argument helpers for DataFrame.transform."""
    input_df = spark.createDataFrame(
        [("jose", 1), ("li", 2), ("liz", 3)], ["name", "age"])
    # partial is optional for transformations that only take a single DataFrame argument
    result_df = (
        input_df
        .transform(partial(with_greeting))
        .transform(partial(with_jacket, "warm"))
    )
    expected_df = spark.createDataFrame(
        [("jose", 1, "hi", "warm"),
         ("li", 2, "hi", "warm"),
         ("liz", 3, "hi", "warm")],
        ["name", "age", "greeting", "jacket"])
    chispa.assert_df_equality(result_df, expected_df, ignore_nullable=True)
def it_converts_a_show_string_to_a_dataframe(spark):
    """quinn.show_output_to_df parses a DataFrame.show() dump back into a DataFrame.

    NOTE(review): every parsed value comes back as a string (age is "1", not 1).
    The show-string's column alignment below is kept exactly as found — confirm
    it survived any reformatting, since the parser reads this text verbatim.
    """
    show_output = """+----+---+-----------+------+
|name|age| stuff1|stuff2|
+----+---+-----------+------+
|jose| 1|nice person| yoyo|
| li| 2|nice person| yoyo|
| liz| 3|nice person| yoyo|
+----+---+-----------+------+"""
    result_df = quinn.show_output_to_df(show_output, spark)
    expected_df = spark.createDataFrame(
        [("jose", "1", "nice person", "yoyo"),
         ("li", "2", "nice person", "yoyo"),
         ("liz", "3", "nice person", "yoyo")],
        ["name", "age", "stuff1", "stuff2"])
    chispa.assert_df_equality(expected_df, result_df)
def it_renames_dots_to_underscores(spark):
    """quinn.with_columns_renamed applies the rename function to every column."""

    def dots_to_underscores(col_name):
        return col_name.replace(".", "_")

    input_df = spark.createDataFrame(
        [("jose", "a"), ("li", "b"), ("sam", "c")],
        StructType([
            StructField("i.like.cheese", StringType(), True),
            StructField("yummy.stuff", StringType(), True),
        ]))
    result_df = quinn.with_columns_renamed(dots_to_underscores)(input_df)
    expected_df = spark.create_df(
        [("jose", "a"), ("li", "b"), ("sam", "c")],
        [("i_like_cheese", StringType(), True),
         ("yummy_stuff", StringType(), True)])
    chispa.assert_df_equality(result_df, expected_df)
def test_normalize_count_by_videos(self, spark):
    """normalize_count_by_videos collapses duplicate video ids to one count each
    (the fixture's expected rows keep the larger count per id)."""
    input_df = spark.spark_session.createDataFrame(
        [(1234, 2), (1234, 3), (5678, 10), (5678, 1)],
        [Constants.VISITSXVIDEO_VIDEO_ID, Constants.VISITSXVIDEO_COUNT])
    result_df = Normalizer.normalize_count_by_videos(input_df)
    expected_df = spark.spark_session.createDataFrame(
        [(1234, 3), (5678, 10)],
        [Constants.VISITSXVIDEO_VIDEO_ID, Constants.VISITSXVIDEO_COUNT])
    chispa.assert_df_equality(result_df, expected_df, ignore_nullable=True)
def test_regex_matching_for_ipv4_ipv6(spark):
    """parse_access_log_to_df extracts fields from both IPv4 and IPv6 log lines."""
    raw_rows = [
        Row(value='130.119.171.217 - - [01/Jul/1995:12:30:23 -0400] "GET /ksc.html HTTP/1.0" 200 7074'),
        Row(value='2001:888:197d:0:250:fcff:fe23:3879 - - [10/Aug/2003:20:28:01 +0200] "GET /ipv6/ksc.html HTTP/1.1" 200 472'),
    ]
    raw_df = spark.createDataFrame(
        raw_rows, StructType([StructField('value', StringType())]))
    parsed_df = parse_access_log_to_df(raw_df)
    parsed_schema = StructType([
        StructField('host', StringType()),
        StructField('rfc1413', StringType()),
        StructField('user', StringType()),
        StructField('timestamp', StringType()),
        StructField('method', StringType()),
        StructField('endpoint', StringType()),
        StructField('protocol', StringType()),
        StructField('status', IntegerType()),
        StructField('content_size', IntegerType()),
    ])
    expected_rows = [
        Row(host='130.119.171.217', rfc1413='-', user='******',
            timestamp='01/Jul/1995:12:30:23 -0400', method='GET',
            endpoint='/ksc.html', protocol='HTTP/1.0', status=200,
            content_size=7074),
        Row(host='2001:888:197d:0:250:fcff:fe23:3879', rfc1413='-', user='******',
            timestamp='10/Aug/2003:20:28:01 +0200', method='GET',
            endpoint='/ipv6/ksc.html', protocol='HTTP/1.1', status=200,
            content_size=472),
    ]
    expected_df = spark.createDataFrame(expected_rows, parsed_schema)
    assert_df_equality(parsed_df, expected_df)
def test_modify_column_names_error(spark):
    """T.modify_column_names rewrites dotted column names via SH.dots_to_underscores.

    NOTE(review): despite the _error suffix, this asserts the success path —
    consider renaming the test or adding a pytest.raises case.
    """
    rows = [("jose", 8), ("li", 23), ("luisa", 48)]
    input_df = spark.createDataFrame(
        rows, ["first.name", "person.favorite.number"])
    result_df = T.modify_column_names(input_df, SH.dots_to_underscores)
    expected_df = spark.createDataFrame(
        rows, ["first_name", "person_favorite_number"])
    assert_df_equality(result_df, expected_df)
def test_sort_columns_asc(spark):
    """T.sort_columns with "asc" orders columns alphabetically left-to-right."""
    input_df = spark.createDataFrame(
        [("jose", "oak", "switch"),
         ("li", "redwood", "xbox"),
         ("luisa", "maple", "ps4")],
        ["name", "tree", "gaming_system"])
    result_df = T.sort_columns(input_df, "asc")
    expected_df = spark.createDataFrame(
        [("switch", "jose", "oak"),
         ("xbox", "li", "redwood"),
         ("ps4", "luisa", "maple")],
        ["gaming_system", "name", "tree"])
    assert_df_equality(result_df, expected_df)
def it_sorts_columns_in_desc_order(spark):
    """quinn.sort_columns with "desc" orders columns reverse-alphabetically."""
    input_df = spark.create_df(
        [("jose", "oak", "switch"),
         ("li", "redwood", "xbox"),
         ("luisa", "maple", "ps4")],
        [("name", StringType(), True),
         ("tree", StringType(), True),
         ("gaming_system", StringType(), True)])
    result_df = quinn.sort_columns(input_df, "desc")
    expected_df = spark.create_df(
        [("oak", "jose", "switch"),
         ("redwood", "li", "xbox"),
         ("maple", "luisa", "ps4")],
        [("tree", StringType(), True),
         ("name", StringType(), True),
         ("gaming_system", StringType(), True)])
    chispa.assert_df_equality(result_df, expected_df)
def it_renames_some_columns_with_dots(spark):
    """with_some_columns_renamed only renames columns matching the predicate."""

    def dots_to_underscores(col_name):
        return col_name.replace(".", "_")

    def change_col_name(col_name):
        return col_name.startswith("a")

    input_df = spark.createDataFrame(
        [("frank", "hot dog", "mia")],
        StructType([
            StructField("a.person", StringType(), True),
            StructField("a.thing", StringType(), True),
            StructField("b.person", StringType(), True),
        ]))
    result_df = quinn.with_some_columns_renamed(
        dots_to_underscores, change_col_name)(input_df)
    # b.person fails the predicate, so it keeps its dotted name
    expected_df = spark.create_df(
        [("frank", "hot dog", "mia")],
        [("a_person", StringType(), True),
         ("a_thing", StringType(), True),
         ("b.person", StringType(), True)])
    chispa.assert_df_equality(result_df, expected_df)
def test_normalize_visit(self, spark):
    """Normalizer.normalize_visit appends an id column joining all visit fields
    with underscores."""
    visit_cols = [
        Constants.VISITS_USER_ID,
        Constants.VISITS_VIDEO_ID,
        Constants.VISITS_DEVICE_ID,
        Constants.VISITS_LOCATION_ID,
        Constants.VISITS_VISIT_TIMESTAMP,
    ]
    input_df = spark.spark_session.createDataFrame(
        [(1, 2, 3, 4, 5), (10, 20, 30, 40, 50)], visit_cols)
    result_df = Normalizer.normalize_visit(input_df)
    expected_df = spark.spark_session.createDataFrame(
        [(1, 2, 3, 4, 5, '1_2_3_4_5'),
         (10, 20, 30, 40, 50, '10_20_30_40_50')],
        visit_cols + [Constants.VISITS_ID])
    chispa.assert_df_equality(result_df, expected_df, ignore_nullable=True)
def it_renames_columns_based_on_a_map(spark):
    """A dict-backed rename function plus a membership predicate drives renames."""
    mapping = {"chips": "french_fries", "petrol": "gas"}

    def british_to_american(col_name):
        return mapping[col_name]

    def change_col_name(col_name):
        return col_name in mapping

    input_df = spark.createDataFrame(
        [("potato", "hola!", "disel")],
        StructType([
            StructField("chips", StringType(), True),
            StructField("hi", StringType(), True),
            StructField("petrol", StringType(), True),
        ]))
    result_df = quinn.with_some_columns_renamed(
        british_to_american, change_col_name)(input_df)
    # "hi" is not in the mapping, so it passes through unchanged
    expected_df = spark.create_df(
        [("potato", "hola!", "disel")],
        [("french_fries", StringType(), True),
         ("hi", StringType(), True),
         ("gas", StringType(), True)])
    chispa.assert_df_equality(result_df, expected_df)