Example #1
 def test_default_value_defaults_to_none(self, input_df,
                                         expected_output_df):
     """Missing 'default' attribute is set to the default: None"""
     cleaning_definition = dict(b=dict(elements=["positive"], mode="allow"))
     output_df = EnumCleaner(
         cleaning_definitions=cleaning_definition).transform(input_df)
     assert_df_equality(expected_output_df, output_df)
Example #2
 def test_missing_mode_defaults_to_allow(self, input_df,
                                         expected_output_df):
     """Missing 'mode' attribute is set to the default: 'allow'"""
     cleaning_definition = dict(b=dict(elements=["positive"], default=None))
     output_df = EnumCleaner(
         cleaning_definitions=cleaning_definition).transform(input_df)
     assert_df_equality(expected_output_df, output_df)
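The `input_df` and `expected_output_df` fixtures used by the two tests above are not part of this excerpt. A minimal sketch of what they might look like, assuming a single column `b` whose only allowed value is "positive" (names and shapes inferred from Example #26 below); the real fixtures are not shown and likely differ:

import pytest
from pyspark.sql import Row


@pytest.fixture()
def input_df(spark_session):
    # Hypothetical reconstruction; the original fixture is not shown.
    return spark_session.createDataFrame([
        Row(b="positive"),
        Row(b="negative"),
        Row(b="positive"),
    ])


@pytest.fixture()
def expected_output_df(spark_session):
    # "negative" is not among the allowed elements, so it is replaced
    # by the default substitute (None).
    return spark_session.createDataFrame([
        Row(b="positive"),
        Row(b=None),
        Row(b="positive"),
    ])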
Example #3
    def test_multiple_cleansing_rules(self, spark_session):
        input_df = spark_session.createDataFrame([
            Row(a="stay", b="positive", c="or", d="healthy"),
            Row(a="stay", b="negative", c="and", d="healthy"),
            Row(a="stay", b="positive", c="xor", d="healthy"),
        ])

        cleansing_definitions = dict(
            b=dict(elements=["positive"]),
            c=dict(elements=["xor"], mode="disallow", default="or"),
        )

        expected_output_df = spark_session.createDataFrame([
            Row(a="stay",
                b="positive",
                c="or",
                d="healthy",
                cleansed_values_enum=Row(b=None, c=None)),
            Row(a="stay",
                b=None,
                c="and",
                d="healthy",
                cleansed_values_enum=Row(b="negative", c=None)),
            Row(a="stay",
                b="positive",
                c="or",
                d="healthy",
                cleansed_values_enum=Row(b=None, c="xor")),
        ])

        output_df = EnumCleaner(
            cleansing_definitions,
            column_to_log_cleansed_values="cleansed_values_enum").transform(
                input_df)
        assert_df_equality(expected_output_df, output_df, ignore_nullable=True)
Example #4
def assert_mapping_equality(mapping_1, mapping_2, spark):
    if mapping_1 == mapping_2:
        return True
    else:  # for easier debugging
        assert_df_equality(
            spark.createDataFrame(mapping_1, ["name", "source", "type"]),
            spark.createDataFrame(mapping_2, ["name", "source", "type"]))
Example #5
 def test_single_array_with_other_columns_keeping_original_columns(
         self, spark_session):
     input_df = spark_session.createDataFrame([
         Row(array_val=[4789, 4790, 4791],
             timestamp_val=datetime.datetime(2021, 1, 14, 8, 10, 14))
     ])
     flattener = Flattener(keep_original_columns=True)
     output_df = flattener.transform(input_df)
     expected_output_df = spark_session.createDataFrame([
         Row(original_columns=Row(array_val=[4789, 4790, 4791],
                                  timestamp_val=datetime.datetime(
                                      2021, 1, 14, 8, 10, 14)),
             timestamp_val=datetime.datetime(2021, 1, 14, 8, 10, 14),
             array_val=4789),
         Row(original_columns=Row(array_val=[4789, 4790, 4791],
                                  timestamp_val=datetime.datetime(
                                      2021, 1, 14, 8, 10, 14)),
             timestamp_val=datetime.datetime(2021, 1, 14, 8, 10, 14),
             array_val=4790),
         Row(original_columns=Row(array_val=[4789, 4790, 4791],
                                  timestamp_val=datetime.datetime(
                                      2021, 1, 14, 8, 10, 14)),
             timestamp_val=datetime.datetime(2021, 1, 14, 8, 10, 14),
             array_val=4791),
     ])
     expected_output_df.schema["original_columns"].nullable = False
     assert_df_equality(expected_output_df, output_df)
Example #6
 def test_simple_renames(self, flattener, spark_session):
     input_df = spark_session.createDataFrame(
         [Row(struct_val=Row(int_val=4789, string_val="Hello World"))])
     expected_output_df = spark_session.createDataFrame(
         [(4789, "Hello World")], schema=["int_val", "string_val"])
     output_df = flattener.transform(input_df)
     assert_df_equality(output_df, expected_output_df)
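Most Flattener tests in this listing receive a `flattener` fixture that is not shown. Judging from the expected outputs, it is usually a Flattener with default settings; a minimal sketch (the import path is an assumption):

import pytest
from spooq.transformer import Flattener  # import path assumed


@pytest.fixture()
def flattener():
    # Hypothetical fixture; some test classes (e.g. Example #13) appear
    # to use Flattener(keep_original_columns=True) instead.
    return Flattener()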
Example #7
    def test_single_array(self, spark_session, flattener):
        input_df = spark_session.createDataFrame(
            [Row(array_val=[4789, 4790, 4791])])
        output_df = flattener.transform(input_df)
        expected_output_df = spark_session.createDataFrame(
            [(4789, ), (4790, ), (4791, )], schema=["array_val"])

        assert_df_equality(expected_output_df, output_df)
Example #8
def test_ensemble_array(spark, input_df, config_array, expected_result_df):
    ensembler = Ensembler.from_config(config_array)
    result_df = ensembler.ensemble(input_df, spark)

    expected_df = input_df.join(expected_result_df.toDF(
        'customer_id', config_array.result.column_name),
                                on='customer_id')
    assert_df_equality(result_df, expected_df, ignore_row_order=True)
Example #9
    def test_single_column(self, spark_session, flattener):
        input_df = spark_session.createDataFrame(
            [Row(string_val="Hello World")])
        output_df = flattener.transform(input_df)
        expected_output_df = spark_session.createDataFrame(
            [("Hello World", )], schema=["string_val"])

        assert_df_equality(expected_output_df, output_df)
Example #10
    def test_single_struct_single_attribute(self, spark_session, flattener):
        input_df = spark_session.createDataFrame(
            [Row(struct_val=Row(int_val=4789))])
        output_df = flattener.transform(input_df)
        expected_output_df = spark_session.createDataFrame(
            [(4789, )], schema=["struct_val_int_val"])

        assert_df_equality(expected_output_df, output_df)
Example #11
    def test_multiple_columns_of_same_datatype(self, spark_session, flattener):
        input_df = spark_session.createDataFrame(
            [Row(int_val_1=4789, int_val_2=4790, int_val_3=4791)])
        output_df = flattener.transform(input_df)
        expected_output_df = spark_session.createDataFrame(
            [(4789, 4790, 4791)],
            schema=["int_val_1", "int_val_2", "int_val_3"])

        assert_df_equality(expected_output_df, output_df)
Example #12
def test_timestamp_to_date(spark):
    source_data = [(1606946341, "a"), (1606946341, "a")]
    source_df = spark.createDataFrame(source_data, ["timestamp", "name"])
    actual_df = common_manipulations.timestamp_to_date(source_df)
    expected_data = [
        (1606946341, "a", datetime.fromtimestamp(1606946341), "22", "59", "4"),
        (1606946341, "a", datetime.fromtimestamp(1606946341), "22", "59", "4")
    ]
    expected_df = spark.createDataFrame(
        expected_data,
        ["timestamp", "name", "normal_type", "godzina", "minuta", "dzien"])
    assert_df_equality(actual_df, expected_df)
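The implementation of `common_manipulations.timestamp_to_date` is not included here. A plausible sketch, inferred from the expected columns above (godzina, minuta, and dzien are Polish for hour, minute, and day; the value "4" suggests day of week):

from pyspark.sql import functions as F


def timestamp_to_date(df):
    # Hypothetical reconstruction based on the expected output above.
    df = df.withColumn("normal_type",
                       F.from_unixtime("timestamp").cast("timestamp"))
    return (df.withColumn("godzina", F.hour("normal_type").cast("string"))
              .withColumn("minuta", F.minute("normal_type").cast("string"))
              .withColumn("dzien", F.dayofweek("normal_type").cast("string")))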
Example #13
 def test_simple_struct(self, flattener, spark_session):
     input_df = spark_session.createDataFrame(
         [Row(struct_val=Row(int_val=4789, string_val="Hello World"))])
     expected_output_df = spark_session.createDataFrame([
         Row(original_columns=Row(
             struct_val=Row(int_val=4789, string_val="Hello World")),
             int_val=4789,
             string_val="Hello World")
     ])
     expected_output_df.schema["original_columns"].nullable = False
     output_df = flattener.transform(input_df)
     assert_df_equality(output_df, expected_output_df)
Example #14
    def test_array_nested_in_struct(self, spark_session, flattener):
        input_df = spark_session.createDataFrame([
            Row(struct_val=Row(array_val=[4789, 4790, 4791],
                               string_val="How are you?"))
        ])
        output_df = flattener.transform(input_df)
        expected_output_df = spark_session.createDataFrame(
            [("How are you?", 4789), ("How are you?", 4790),
             ("How are you?", 4791)],
            schema=["string_val", "array_val"])

        assert_df_equality(expected_output_df, output_df)
Example #15
    def test_multiple_columns_of_different_datatype(self, spark_session,
                                                    flattener):
        input_df = spark_session.createDataFrame([
            Row(int_val=4789,
                string_val="Hello World",
                date_val=datetime.date(2021, 1, 14))
        ])
        output_df = flattener.transform(input_df)
        expected_output_df = spark_session.createDataFrame(
            [(4789, "Hello World", datetime.date(2021, 1, 14))],
            schema=["int_val", "string_val", "date_val"])

        assert_df_equality(expected_output_df, output_df)
Example #16
 def test_duplicated_column_names(self, flattener, spark_session):
     input_df = spark_session.createDataFrame([
         Row(struct_val=Row(int_val=4789, string_val="Hello World"),
             struct_val_2=Row(int_val=4790, string_val="How are you?"))
     ])
     expected_output_df = spark_session.createDataFrame(
         [(4789, "Hello World", 4790, "How are you?")],
         schema=[
             "int_val", "string_val", "struct_val_2_int_val",
             "struct_val_2_string_val"
         ])
     output_df = flattener.transform(input_df)
     assert_df_equality(output_df, expected_output_df)
Example #17
    def test_single_array_with_other_columns(self, spark_session, flattener):
        input_df = spark_session.createDataFrame([
            Row(array_val=[4789, 4790, 4791],
                timestamp_val=datetime.datetime(2021, 1, 14, 8, 10, 14))
        ])
        output_df = flattener.transform(input_df)
        expected_output_df = spark_session.createDataFrame(
            [(datetime.datetime(2021, 1, 14, 8, 10, 14), 4789),
             (datetime.datetime(2021, 1, 14, 8, 10, 14), 4790),
             (datetime.datetime(2021, 1, 14, 8, 10, 14), 4791)],
            schema=["timestamp_val", "array_val"])

        assert_df_equality(expected_output_df, output_df)
Example #18
    def test_multiple_arrays(self, spark_session, flattener):
        input_df = spark_session.createDataFrame([
            Row(array_val_1=[4789, 4790, 4791],
                array_val_2=["How", "Are", "You", "?"])
        ])
        output_df = flattener.transform(input_df)
        expected_output_df = spark_session.createDataFrame(
            [(4789, "How"), (4789, "Are"), (4789, "You"), (4789, "?"),
             (4790, "How"), (4790, "Are"), (4790, "You"), (4790, "?"),
             (4791, "How"), (4791, "Are"), (4791, "You"), (4791, "?")],
            schema=["array_val_1", "array_val_2"])

        assert_df_equality(expected_output_df, output_df)
Example #19
    def test_single_cleansed_value_is_stored_in_separate_column(
            self, transformer, input_df_integers, spark_session):
        thresholds = dict(integers=dict(min=0, max=10))

        expected_output_df = spark_session.createDataFrame(
            [
                Row(id=0, integers=None, cleansed_values_threshold=Row(integers=-5)),
                Row(id=1, integers=5, cleansed_values_threshold=Row(integers=None)),
                Row(id=2, integers=None, cleansed_values_threshold=Row(integers=15)),
            ]
        )
        output_df = ThresholdCleaner(thresholds, column_to_log_cleansed_values="cleansed_values_threshold").transform(
            input_df_integers
        )
        assert_df_equality(expected_output_df, output_df, ignore_nullable=True)
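The `input_df_integers` fixture is not part of this excerpt; a minimal reconstruction from the expected rows above (values outside [0, 10], i.e. -5 and 15, are cleansed to None):

import pytest
from pyspark.sql import Row


@pytest.fixture()
def input_df_integers(spark_session):
    # Hypothetical fixture inferred from the expected output above.
    return spark_session.createDataFrame([
        Row(id=0, integers=-5),
        Row(id=1, integers=5),
        Row(id=2, integers=15),
    ])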
Example #20
 def test_current_date(self, input_df, spark_session):
     """Substitute the cleansed values with the current date"""
     cleaning_definitions = dict(status=dict(
         elements=["active", "inactive"], default=F.current_date()))
     expected_output_df = spark_session.createDataFrame([
         Row(id=1, status="active"),
         Row(id=2, status=str(dt.date.today())),
         Row(id=3, status=str(dt.date.today())),
         Row(id=4, status="inactive"),
         Row(id=5, status=None),
         Row(id=6, status=str(dt.date.today())),
     ])
     output_df = EnumCleaner(
         cleaning_definitions=cleaning_definitions).transform(input_df)
     assert_df_equality(expected_output_df, output_df)
Example #21
    def test_array_nested_in_array(self, spark_session, flattener):
        input_df = spark_session.createDataFrame([
            Row(array_val=[["Here's", "My", "Number", ":"], [555, 127, 53,
                                                             90]],
                string_val="How are you?")
        ])
        output_df = flattener.transform(input_df)
        expected_output_df = spark_session.createDataFrame(
            [("How are you?", "Here's"), ("How are you?", "My"),
             ("How are you?", "Number"), ("How are you?", ":"),
             ("How are you?", "555"), ("How are you?", "127"),
             ("How are you?", "53"), ("How are you?", "90")],
            schema=["string_val", "array_val"])

        assert_df_equality(expected_output_df, output_df)
Example #22
 def test_column_reference(self, input_df, spark_session):
     """Substitute the cleansed values with the calculated string based on another column"""
     default_value_func = (F.col("id") * 10).cast(T.StringType())
     cleaning_definitions = dict(status=dict(
         elements=["active", "inactive"], default=default_value_func))
     expected_output_df = spark_session.createDataFrame([
         Row(id=1, status="active"),
         Row(id=2, status="20"),
         Row(id=3, status="30"),
         Row(id=4, status="inactive"),
         Row(id=5, status=None),
         Row(id=6, status="60"),
     ])
     output_df = EnumCleaner(
         cleaning_definitions=cleaning_definitions).transform(input_df)
     assert_df_equality(expected_output_df, output_df)
Example #23
 def test_nested_struct_attributes(self, flattener, spark_session):
     input_df = spark_session.createDataFrame([
         Row(struct_val_1=Row(struct_val_2=Row(struct_val_3=Row(
             struct_val_4=Row(int_val=4789), int_val=4790),
                                               string_val="Hello"),
                              double_val=43.12),
             timestamp_val=datetime.datetime(2021, 1, 1, 12, 30, 15))
     ])
     output_df = flattener.transform(input_df)
     expected_output_df = spark_session.createDataFrame(
         [(4789, 4790, "Hello", 43.12,
           datetime.datetime(2021, 1, 1, 12, 30, 15))],
         schema=[
             "int_val", "struct_val_3_int_val", "string_val", "double_val",
             "timestamp_val"
         ])
     assert_df_equality(expected_output_df, output_df)
Example #24
    def test_multiple_arrays_with_other_columns(self, flattener,
                                                spark_session):
        input_df = spark_session.createDataFrame([
            Row(array_val_1=[4789, 4790, 4791],
                array_val_2=["How", "Are", "You", "?"],
                double_val=43.102)
        ])
        output_df = flattener.transform(input_df)
        expected_output_df = spark_session.createDataFrame(
            [(43.102, 4789, "How"), (43.102, 4789, "Are"),
             (43.102, 4789, "You"), (43.102, 4789, "?"), (43.102, 4790, "How"),
             (43.102, 4790, "Are"), (43.102, 4790, "You"), (43.102, 4790, "?"),
             (43.102, 4791, "How"), (43.102, 4791, "Are"),
             (43.102, 4791, "You"), (43.102, 4791, "?")],
            schema=["double_val", "array_val_1", "array_val_2"])

        assert_df_equality(expected_output_df, output_df)
Example #25
def test_remove_space(spark):
    source_data = [
        ("  toto  ", "*****@*****.**"),
        ("titi  ", "*****@*****.**"),
        ("  tata", "*****@*****.**"),
    ]
    source_df = spark.createDataFrame(source_data, ["name", "mail"])
    actual_df = remove_space(source_df, "name", "a")
    expected_data = [
        ("toto", "*****@*****.**"),
        ("titi", "*****@*****.**"),
        ("tata", "*****@*****.**"),
    ]
    expected_df = spark.createDataFrame(expected_data, ["name", "mail"])
    source_df.show()
    actual_df.show()
    expected_df.show()
    assert_df_equality(actual_df, expected_df, ignore_nullable=True)
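`remove_space` itself is not shown. Judging from the data, it trims leading and trailing whitespace from the named column; a minimal sketch (the role of the third argument, "a" above, is not visible in this excerpt):

from pyspark.sql import functions as F


def remove_space(df, column, _flag):
    # Hypothetical sketch; only the trimming behaviour is observable
    # from the test data above.
    return df.withColumn(column, F.trim(F.col(column)))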
Example #26
 def test_single_cleansed_value_is_stored_in_separate_column_with_default_substitute(
         self, input_df, spark_session):
     expected_output_df = spark_session.createDataFrame([
         Row(b="positive", cleansed_values_enum=Row(b=None)),
         Row(b="cleansed_value", cleansed_values_enum=Row(b="negative")),
         Row(b="positive", cleansed_values_enum=Row(b=None)),
     ])
     cleansing_definitions = {
         "b": {
             "elements": ["positive"],
             "default": "cleansed_value"
         }
     }
     transformer = EnumCleaner(
         cleansing_definitions,
         column_to_log_cleansed_values="cleansed_values_enum")
     output_df = transformer.transform(input_df)
     assert_df_equality(expected_output_df, output_df, ignore_nullable=True)
Example #27
def test_add_greeting(spark):
    source_data = [
        ("toto", ),
        ("titi", ),
        ("tata", ),
    ]

    source_df = spark.createDataFrame(source_data, ["name"])
    actual_df = add_greeting(source_df)
    expected_data = [
        ("toto", "hello!"),
        ("titi", "hello!"),
        ("tata", "hello!"),
    ]
    expected_df = spark.createDataFrame(expected_data, ["name", "greeting"])
    source_df.show()
    actual_df.show()
    expected_df.show()
    assert_df_equality(actual_df, expected_df, ignore_nullable=True)
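`add_greeting` is likewise not shown; the test only reveals that it appends a constant "greeting" column. A minimal sketch:

from pyspark.sql import functions as F


def add_greeting(df):
    # Hypothetical sketch inferred from the expected output above.
    return df.withColumn("greeting", F.lit("hello!"))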
Example #28
def test_velib_preprocessing(spark):
    source_data = [
        (1606946341, "a", datetime.fromtimestamp(1606946341), "22", "59", "4"),
        (1606946341, "a", datetime.fromtimestamp(1606946341), "22", "59", "4")
    ]
    source_df = spark.createDataFrame(
        source_data,
        ["timestamp", "name", "normal_type", "godzina", "minuta", "dzien"])

    actual_df = urzedy_manipulation.urzedy_preprocessing(source_df)

    expected_data = [(1606946341, "a", "22", "59", "4"),
                     (1606946341, "a", "22", "59", "4")]

    expected_df = spark.createDataFrame(
        expected_data, ["timestamp", "name", "godzina", "minuta", "dzien"])

    assert_df_equality(actual_df, expected_df)
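Comparing the source and expected columns, `urzedy_manipulation.urzedy_preprocessing` appears to drop the "normal_type" column; a minimal sketch of that assumption:

def urzedy_preprocessing(df):
    # Hypothetical sketch: the only observable difference between input
    # and expected output is the missing "normal_type" column.
    return df.drop("normal_type")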
Example #29
    def test_struct_nested_in_array(self, spark_session, flattener):
        input_df = spark_session.createDataFrame([
            Row(array_val=[
                Row(int_val=4789,
                    string_val="Hello Darkness",
                    date_val=datetime.date(2021, 1, 14)),
                Row(int_val=4790,
                    string_val="My Old Friend",
                    date_val=datetime.date(2021, 1, 15))
            ],
                double_val=43.102)
        ])
        output_df = flattener.transform(input_df)
        expected_output_df = spark_session.createDataFrame(
            [(43.102, 4789, "Hello Darkness", datetime.date(2021, 1, 14)),
             (43.102, 4790, "My Old Friend", datetime.date(2021, 1, 15))],
            schema=["double_val", "int_val", "string_val", "date_val"])

        assert_df_equality(expected_output_df, output_df)
Example #30
 def test_multiple_columns_of_different_datatype_keeping_original_columns(
         self, spark_session):
     input_df = spark_session.createDataFrame([
         Row(int_val=4789,
             string_val="Hello World",
             date_val=datetime.date(2021, 1, 14))
     ])
     flattener = Flattener(keep_original_columns=True)
     output_df = flattener.transform(input_df)
     expected_output_df = spark_session.createDataFrame([
         Row(original_columns=Row(int_val=4789,
                                  string_val="Hello World",
                                  date_val=datetime.date(2021, 1, 14)),
             int_val=4789,
             string_val="Hello World",
             date_val=datetime.date(2021, 1, 14))
     ])
     expected_output_df.schema["original_columns"].nullable = False
     assert_df_equality(output_df, expected_output_df)