def test_default_value_defaults_to_none(self, input_df, expected_output_df):
    """Missing 'default' attribute is set to the default: None"""
    cleaning_definition = dict(b=dict(elements=["positive"], mode="allow"))
    output_df = EnumCleaner(
        cleaning_definitions=cleaning_definition).transform(input_df)
    assert_df_equality(expected_output_df, output_df)

def test_missing_mode_defaults_to_allow(self, input_df, expected_output_df):
    """Missing 'mode' attribute is set to the default: 'allow'"""
    cleaning_definition = dict(b=dict(elements=["positive"], default=None))
    output_df = EnumCleaner(
        cleaning_definitions=cleaning_definition).transform(input_df)
    assert_df_equality(expected_output_df, output_df)

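# Hypothetical fixtures assumed by the two EnumCleaner default-attribute tests
# above (the real ones live elsewhere in the suite). The input values are a
# sketch inferred from the "..._with_default_substitute" test further below:
# only "positive" is an allowed element, so "negative" gets cleansed to None.
@pytest.fixture()
def input_df(spark_session):
    return spark_session.createDataFrame(
        [Row(b="positive"), Row(b="negative"), Row(b="positive")])


@pytest.fixture()
def expected_output_df(spark_session):
    return spark_session.createDataFrame(
        [Row(b="positive"), Row(b=None), Row(b="positive")])
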
def test_multiple_cleansing_rules(self, spark_session):
    input_df = spark_session.createDataFrame([
        Row(a="stay", b="positive", c="or", d="healthy"),
        Row(a="stay", b="negative", c="and", d="healthy"),
        Row(a="stay", b="positive", c="xor", d="healthy"),
    ])
    cleansing_definitions = dict(
        b=dict(elements=["positive"]),
        c=dict(elements=["xor"], mode="disallow", default="or"),
    )
    expected_output_df = spark_session.createDataFrame([
        Row(a="stay", b="positive", c="or", d="healthy",
            cleansed_values_enum=Row(b=None, c=None)),
        Row(a="stay", b=None, c="and", d="healthy",
            cleansed_values_enum=Row(b="negative", c=None)),
        Row(a="stay", b="positive", c="or", d="healthy",
            cleansed_values_enum=Row(b=None, c="xor")),
    ])
    output_df = EnumCleaner(
        cleansing_definitions,
        column_to_log_cleansed_values="cleansed_values_enum").transform(
            input_df)
    assert_df_equality(expected_output_df, output_df, ignore_nullable=True)

def assert_mapping_equality(mapping_1, mapping_2, spark):
    if mapping_1 == mapping_2:
        return True
    # Fall back to a DataFrame comparison for an easier-to-read diff
    assert_df_equality(
        spark.createDataFrame(mapping_1, ["name", "source", "type"]),
        spark.createDataFrame(mapping_2, ["name", "source", "type"]))

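# Hypothetical usage example (not part of the original suite): the mappings
# are assumed to be lists of (name, source, type) triples, matching the column
# names used in the helper above. Identical mappings pass silently; diverging
# ones fail with a readable DataFrame diff.
def test_assert_mapping_equality_usage(spark_session):
    mapping = [("id", "meta.id", "IntegerType")]
    assert_mapping_equality(mapping, list(mapping), spark_session)
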
def test_single_array_with_other_columns_keeping_original_columns(
        self, spark_session):
    input_df = spark_session.createDataFrame([
        Row(array_val=[4789, 4790, 4791],
            timestamp_val=datetime.datetime(2021, 1, 14, 8, 10, 14))
    ])
    flattener = Flattener(keep_original_columns=True)
    output_df = flattener.transform(input_df)
    expected_output_df = spark_session.createDataFrame([
        Row(original_columns=Row(
                array_val=[4789, 4790, 4791],
                timestamp_val=datetime.datetime(2021, 1, 14, 8, 10, 14)),
            timestamp_val=datetime.datetime(2021, 1, 14, 8, 10, 14),
            array_val=4789),
        Row(original_columns=Row(
                array_val=[4789, 4790, 4791],
                timestamp_val=datetime.datetime(2021, 1, 14, 8, 10, 14)),
            timestamp_val=datetime.datetime(2021, 1, 14, 8, 10, 14),
            array_val=4790),
        Row(original_columns=Row(
                array_val=[4789, 4790, 4791],
                timestamp_val=datetime.datetime(2021, 1, 14, 8, 10, 14)),
            timestamp_val=datetime.datetime(2021, 1, 14, 8, 10, 14),
            array_val=4791),
    ])
    expected_output_df.schema["original_columns"].nullable = False
    assert_df_equality(expected_output_df, output_df)

def test_simple_renames(self, flattener, spark_session):
    input_df = spark_session.createDataFrame(
        [Row(struct_val=Row(int_val=4789, string_val="Hello World"))])
    expected_output_df = spark_session.createDataFrame(
        [(4789, "Hello World")], schema=["int_val", "string_val"])
    output_df = flattener.transform(input_df)
    assert_df_equality(output_df, expected_output_df)

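# A minimal sketch of the shared `flattener` fixture assumed by the plain
# flattening tests (its actual definition lives elsewhere in the suite, and
# different test classes appear to parameterize it differently; this sketch
# covers only the default case -- tests that need keep_original_columns=True
# either construct their own instance or use a differently configured fixture).
@pytest.fixture()
def flattener():
    return Flattener()
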
def test_single_array(self, spark_session, flattener):
    input_df = spark_session.createDataFrame(
        [Row(array_val=[4789, 4790, 4791])])
    output_df = flattener.transform(input_df)
    expected_output_df = spark_session.createDataFrame(
        [(4789, ), (4790, ), (4791, )], schema=["array_val"])
    assert_df_equality(expected_output_df, output_df)

def test_ensemble_array(spark, input_df, config_array, expected_result_df):
    ensembler = Ensembler.from_config(config_array)
    result_df = ensembler.ensemble(input_df, spark)
    expected_df = input_df.join(
        expected_result_df.toDF('customer_id',
                                config_array.result.column_name),
        on='customer_id')
    assert_df_equality(result_df, expected_df, ignore_row_order=True)

def test_single_column(self, spark_session, flattener):
    input_df = spark_session.createDataFrame(
        [Row(string_val="Hello World")])
    output_df = flattener.transform(input_df)
    expected_output_df = spark_session.createDataFrame(
        [("Hello World", )], schema=["string_val"])
    assert_df_equality(expected_output_df, output_df)

def test_single_struct_single_attribute(self, spark_session, flattener):
    input_df = spark_session.createDataFrame(
        [Row(struct_val=Row(int_val=4789))])
    output_df = flattener.transform(input_df)
    expected_output_df = spark_session.createDataFrame(
        [(4789, )], schema=["struct_val_int_val"])
    assert_df_equality(expected_output_df, output_df)

def test_multiple_columns_of_same_datatype(self, spark_session, flattener):
    input_df = spark_session.createDataFrame(
        [Row(int_val_1=4789, int_val_2=4790, int_val_3=4791)])
    output_df = flattener.transform(input_df)
    expected_output_df = spark_session.createDataFrame(
        [(4789, 4790, 4791)],
        schema=["int_val_1", "int_val_2", "int_val_3"])
    assert_df_equality(expected_output_df, output_df)

def test_timestamp_to_date(spark):
    source_data = [(1606946341, "a"), (1606946341, "a")]
    source_df = spark.createDataFrame(source_data, ["timestamp", "name"])
    actual_df = common_manipulations.timestamp_to_date(source_df)
    expected_data = [
        (1606946341, "a", datetime.fromtimestamp(1606946341), "22", "59", "4"),
        (1606946341, "a", datetime.fromtimestamp(1606946341), "22", "59", "4"),
    ]
    expected_df = spark.createDataFrame(
        expected_data,
        ["timestamp", "name", "normal_type", "godzina", "minuta", "dzien"])
    assert_df_equality(actual_df, expected_df)

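# A hedged sketch of what common_manipulations.timestamp_to_date appears to
# do, reconstructed from the expected columns above (not the module's actual
# code): cast the unix timestamp and derive hour ("godzina"), minute
# ("minuta") and day-of-week ("dzien") as strings. Assumes a session timezone
# in which 1606946341 falls on 22:59 (e.g. CET).
import pyspark.sql.functions as F


def timestamp_to_date_sketch(df):
    df = df.withColumn("normal_type", F.col("timestamp").cast("timestamp"))
    return (df.withColumn("godzina", F.hour("normal_type").cast("string"))
              .withColumn("minuta", F.minute("normal_type").cast("string"))
              .withColumn("dzien", F.dayofweek("normal_type").cast("string")))
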
def test_simple_struct(self, flattener, spark_session):
    input_df = spark_session.createDataFrame(
        [Row(struct_val=Row(int_val=4789, string_val="Hello World"))])
    expected_output_df = spark_session.createDataFrame([
        Row(original_columns=Row(
                struct_val=Row(int_val=4789, string_val="Hello World")),
            int_val=4789,
            string_val="Hello World")
    ])
    expected_output_df.schema["original_columns"].nullable = False
    output_df = flattener.transform(input_df)
    assert_df_equality(output_df, expected_output_df)

def test_array_nested_in_struct(self, spark_session, flattener):
    input_df = spark_session.createDataFrame([
        Row(struct_val=Row(array_val=[4789, 4790, 4791],
                           string_val="How are you?"))
    ])
    output_df = flattener.transform(input_df)
    expected_output_df = spark_session.createDataFrame(
        [("How are you?", 4789), ("How are you?", 4790),
         ("How are you?", 4791)],
        schema=["string_val", "array_val"])
    assert_df_equality(expected_output_df, output_df)

def test_multiple_columns_of_different_datatype(self, spark_session,
                                                flattener):
    input_df = spark_session.createDataFrame([
        Row(int_val=4789, string_val="Hello World",
            date_val=datetime.date(2021, 1, 14))
    ])
    output_df = flattener.transform(input_df)
    expected_output_df = spark_session.createDataFrame(
        [(4789, "Hello World", datetime.date(2021, 1, 14))],
        schema=["int_val", "string_val", "date_val"])
    assert_df_equality(expected_output_df, output_df)

def test_duplicated_column_names(self, flattener, spark_session):
    input_df = spark_session.createDataFrame([
        Row(struct_val=Row(int_val=4789, string_val="Hello World"),
            struct_val_2=Row(int_val=4790, string_val="How are you?"))
    ])
    expected_output_df = spark_session.createDataFrame(
        [(4789, "Hello World", 4790, "How are you?")],
        schema=[
            "int_val", "string_val", "struct_val_2_int_val",
            "struct_val_2_string_val"
        ])
    output_df = flattener.transform(input_df)
    assert_df_equality(output_df, expected_output_df)

def test_single_array_with_other_columns(self, spark_session, flattener):
    input_df = spark_session.createDataFrame([
        Row(array_val=[4789, 4790, 4791],
            timestamp_val=datetime.datetime(2021, 1, 14, 8, 10, 14))
    ])
    output_df = flattener.transform(input_df)
    expected_output_df = spark_session.createDataFrame(
        [(datetime.datetime(2021, 1, 14, 8, 10, 14), 4789),
         (datetime.datetime(2021, 1, 14, 8, 10, 14), 4790),
         (datetime.datetime(2021, 1, 14, 8, 10, 14), 4791)],
        schema=["timestamp_val", "array_val"])
    assert_df_equality(expected_output_df, output_df)

def test_multiple_arrays(self, spark_session, flattener):
    input_df = spark_session.createDataFrame([
        Row(array_val_1=[4789, 4790, 4791],
            array_val_2=["How", "Are", "You", "?"])
    ])
    output_df = flattener.transform(input_df)
    expected_output_df = spark_session.createDataFrame(
        [(4789, "How"), (4789, "Are"), (4789, "You"), (4789, "?"),
         (4790, "How"), (4790, "Are"), (4790, "You"), (4790, "?"),
         (4791, "How"), (4791, "Are"), (4791, "You"), (4791, "?")],
        schema=["array_val_1", "array_val_2"])
    assert_df_equality(expected_output_df, output_df)

def test_single_cleansed_value_is_stored_in_separate_column(
        self, transformer, input_df_integers, spark_session):
    thresholds = dict(integers=dict(min=0, max=10))
    expected_output_df = spark_session.createDataFrame([
        Row(id=0, integers=None, cleansed_values_threshold=Row(integers=-5)),
        Row(id=1, integers=5, cleansed_values_threshold=Row(integers=None)),
        Row(id=2, integers=None, cleansed_values_threshold=Row(integers=15)),
    ])
    output_df = ThresholdCleaner(
        thresholds,
        column_to_log_cleansed_values="cleansed_values_threshold").transform(
            input_df_integers)
    assert_df_equality(expected_output_df, output_df, ignore_nullable=True)

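# Hypothetical input fixture backing the ThresholdCleaner test above; the
# values are inferred directly from its expected output (-5 and 15 fall
# outside the min=0/max=10 thresholds and get cleansed, 5 is kept).
@pytest.fixture()
def input_df_integers(spark_session):
    return spark_session.createDataFrame(
        [Row(id=0, integers=-5), Row(id=1, integers=5),
         Row(id=2, integers=15)])
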
def test_current_date(self, input_df, spark_session):
    """Substitute the cleansed values with the current date"""
    cleaning_definitions = dict(status=dict(
        elements=["active", "inactive"], default=F.current_date()))
    expected_output_df = spark_session.createDataFrame([
        Row(id=1, status="active"),
        Row(id=2, status=str(dt.date.today())),
        Row(id=3, status=str(dt.date.today())),
        Row(id=4, status="inactive"),
        Row(id=5, status=None),
        Row(id=6, status=str(dt.date.today())),
    ])
    output_df = EnumCleaner(
        cleaning_definitions=cleaning_definitions).transform(input_df)
    assert_df_equality(expected_output_df, output_df)

def test_array_nested_in_array(self, spark_session, flattener):
    input_df = spark_session.createDataFrame([
        Row(array_val=[["Here's", "My", "Number", ":"], [555, 127, 53, 90]],
            string_val="How are you?")
    ])
    output_df = flattener.transform(input_df)
    expected_output_df = spark_session.createDataFrame(
        [("How are you?", "Here's"), ("How are you?", "My"),
         ("How are you?", "Number"), ("How are you?", ":"),
         ("How are you?", "555"), ("How are you?", "127"),
         ("How are you?", "53"), ("How are you?", "90")],
        schema=["string_val", "array_val"])
    assert_df_equality(expected_output_df, output_df)

def test_column_reference(self, input_df, spark_session):
    """Substitute the cleansed values with a string calculated from another column"""
    default_value_func = (F.col("id") * 10).cast(T.StringType())
    cleaning_definitions = dict(status=dict(
        elements=["active", "inactive"], default=default_value_func))
    expected_output_df = spark_session.createDataFrame([
        Row(id=1, status="active"),
        Row(id=2, status="20"),
        Row(id=3, status="30"),
        Row(id=4, status="inactive"),
        Row(id=5, status=None),
        Row(id=6, status="60"),
    ])
    output_df = EnumCleaner(
        cleaning_definitions=cleaning_definitions).transform(input_df)
    assert_df_equality(expected_output_df, output_df)

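# Hypothetical `input_df` fixture assumed by the two EnumCleaner status tests
# above (test_current_date and test_column_reference). The statuses of ids 2,
# 3 and 6 are placeholders -- any value outside ["active", "inactive"] gets
# cleansed to the configured default, while None is passed through untouched.
@pytest.fixture()
def input_df(spark_session):
    return spark_session.createDataFrame([
        Row(id=1, status="active"),
        Row(id=2, status="pending"),
        Row(id=3, status="unknown"),
        Row(id=4, status="inactive"),
        Row(id=5, status=None),
        Row(id=6, status="other"),
    ])
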
def test_nested_struct_attributes(self, flattener, spark_session):
    input_df = spark_session.createDataFrame([
        Row(struct_val_1=Row(
                struct_val_2=Row(
                    struct_val_3=Row(struct_val_4=Row(int_val=4789),
                                     int_val=4790),
                    string_val="Hello"),
                double_val=43.12),
            timestamp_val=datetime.datetime(2021, 1, 1, 12, 30, 15))
    ])
    output_df = flattener.transform(input_df)
    expected_output_df = spark_session.createDataFrame(
        [(4789, 4790, "Hello", 43.12,
          datetime.datetime(2021, 1, 1, 12, 30, 15))],
        schema=[
            "int_val", "struct_val_3_int_val", "string_val", "double_val",
            "timestamp_val"
        ])
    assert_df_equality(expected_output_df, output_df)

def test_multiple_arrays_with_other_columns(self, flattener, spark_session):
    input_df = spark_session.createDataFrame([
        Row(array_val_1=[4789, 4790, 4791],
            array_val_2=["How", "Are", "You", "?"],
            double_val=43.102)
    ])
    output_df = flattener.transform(input_df)
    expected_output_df = spark_session.createDataFrame(
        [(43.102, 4789, "How"), (43.102, 4789, "Are"), (43.102, 4789, "You"),
         (43.102, 4789, "?"), (43.102, 4790, "How"), (43.102, 4790, "Are"),
         (43.102, 4790, "You"), (43.102, 4790, "?"), (43.102, 4791, "How"),
         (43.102, 4791, "Are"), (43.102, 4791, "You"), (43.102, 4791, "?")],
        schema=["double_val", "array_val_1", "array_val_2"])
    assert_df_equality(expected_output_df, output_df)

def test_remove_space(spark):
    source_data = [
        (" toto ", "*****@*****.**"),
        ("titi ", "*****@*****.**"),
        (" tata", "*****@*****.**"),
    ]
    source_df = spark.createDataFrame(source_data, ["name", "mail"])
    actual_df = remove_space(source_df, "name", "a")
    expected_data = [
        ("toto", "*****@*****.**"),
        ("titi", "*****@*****.**"),
        ("tata", "*****@*****.**"),
    ]
    expected_df = spark.createDataFrame(expected_data, ["name", "mail"])
    source_df.show()
    actual_df.show()
    expected_df.show()
    assert_df_equality(actual_df, expected_df, ignore_nullable=True)

def test_single_cleansed_value_is_stored_in_separate_column_with_default_substitute(
        self, input_df, spark_session):
    expected_output_df = spark_session.createDataFrame([
        Row(b="positive", cleansed_values_enum=Row(b=None)),
        Row(b="cleansed_value", cleansed_values_enum=Row(b="negative")),
        Row(b="positive", cleansed_values_enum=Row(b=None)),
    ])
    cleansing_definitions = {
        "b": {
            "elements": ["positive"],
            "default": "cleansed_value",
        }
    }
    transformer = EnumCleaner(
        cleansing_definitions,
        column_to_log_cleansed_values="cleansed_values_enum")
    output_df = transformer.transform(input_df)
    assert_df_equality(expected_output_df, output_df, ignore_nullable=True)

def test_add_greeting(spark):
    source_data = [
        ("toto", ),
        ("titi", ),
        ("tata", ),
    ]
    source_df = spark.createDataFrame(source_data, ["name"])
    actual_df = add_greeting(source_df)
    expected_data = [
        ("toto", "hello!"),
        ("titi", "hello!"),
        ("tata", "hello!"),
    ]
    expected_df = spark.createDataFrame(expected_data, ["name", "greeting"])
    source_df.show()
    actual_df.show()
    expected_df.show()
    assert_df_equality(actual_df, expected_df, ignore_nullable=True)

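# Hedged sketch of add_greeting, inferred from the expected output above (an
# assumed implementation, not the actual source): it appends a constant
# "hello!" greeting column.
import pyspark.sql.functions as F


def add_greeting_sketch(df):
    return df.withColumn("greeting", F.lit("hello!"))
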
def test_urzedy_preprocessing(spark):
    source_data = [
        (1606946341, "a", datetime.fromtimestamp(1606946341), "22", "59", "4"),
        (1606946341, "a", datetime.fromtimestamp(1606946341), "22", "59", "4"),
    ]
    source_df = spark.createDataFrame(
        source_data,
        ["timestamp", "name", "normal_type", "godzina", "minuta", "dzien"])
    actual_df = urzedy_manipulation.urzedy_preprocessing(source_df)
    expected_data = [(1606946341, "a", "22", "59", "4"),
                     (1606946341, "a", "22", "59", "4")]
    expected_df = spark.createDataFrame(
        expected_data, ["timestamp", "name", "godzina", "minuta", "dzien"])
    assert_df_equality(actual_df, expected_df)

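# Hedged sketch of urzedy_manipulation.urzedy_preprocessing, inferred from the
# input/expected columns above (not the module's actual code): it drops the
# intermediate "normal_type" column and keeps everything else.
def urzedy_preprocessing_sketch(df):
    return df.drop("normal_type")
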
def test_struct_nested_in_array(self, spark_session, flattener):
    input_df = spark_session.createDataFrame([
        Row(array_val=[
                Row(int_val=4789, string_val="Hello Darkness",
                    date_val=datetime.date(2021, 1, 14)),
                Row(int_val=4790, string_val="My Old Friend",
                    date_val=datetime.date(2021, 1, 15))
            ],
            double_val=43.102)
    ])
    output_df = flattener.transform(input_df)
    expected_output_df = spark_session.createDataFrame(
        [(43.102, 4789, "Hello Darkness", datetime.date(2021, 1, 14)),
         (43.102, 4790, "My Old Friend", datetime.date(2021, 1, 15))],
        schema=["double_val", "int_val", "string_val", "date_val"])
    assert_df_equality(expected_output_df, output_df)

def test_multiple_columns_of_different_datatype_keeping_original_columns(
        self, spark_session):
    input_df = spark_session.createDataFrame([
        Row(int_val=4789, string_val="Hello World",
            date_val=datetime.date(2021, 1, 14))
    ])
    flattener = Flattener(keep_original_columns=True)
    output_df = flattener.transform(input_df)
    expected_output_df = spark_session.createDataFrame([
        Row(original_columns=Row(int_val=4789,
                                 string_val="Hello World",
                                 date_val=datetime.date(2021, 1, 14)),
            int_val=4789,
            string_val="Hello World",
            date_val=datetime.date(2021, 1, 14))
    ])
    expected_output_df.schema["original_columns"].nullable = False
    assert_df_equality(output_df, expected_output_df)