Example #1
 def test_extended_string_to_long(self, spark_session, input_value,
                                  expected_value):
     input_df = self.create_input_df(input_value, spark_session)
     output_df = Mapper(
         mapping=[("output_key", "input_key",
                   "extended_string_to_long")]).transform(input_df)
     assert output_df.first().output_key == expected_value
     assert isinstance(output_df.schema["output_key"].dataType, T.LongType)
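The snippets in this collection rely on imports and a create_input_df helper that are defined elsewhere in the test suite. The following is a minimal sketch of those assumed pieces; the exact import paths (spooq vs. spooq2, chispa for DataFrame comparison) and the helper's schema handling are guesses and may differ from the real code.

import datetime

import pandas as pd
import pytest
from chispa import assert_df_equality  # assumed; used by the ambiguous-column examples below
from pyspark.sql import Row
from pyspark.sql import functions as F, types as T
from spooq.transformer import Mapper  # older releases ship this as spooq2.transformer


def create_input_df(input_value, spark_session):
    # Hypothetical helper: wraps a single value into a one-row DataFrame with a
    # nullable "input_key" column. The real helper may derive the column type
    # from the input value instead of pinning it to StringType.
    return spark_session.createDataFrame(
        [Row(input_key=input_value)],
        schema=T.StructType([T.StructField("input_key", T.StringType(), True)]),
    )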
Example #2
 def test_extended_string_unix_timestamp_ms_to_date_spark2(
         self, spark_session, input_value, expected_value):
     input_df = self.create_input_df(input_value, spark_session)
     output_df = Mapper(mapping=[(
         "output_key", "input_key",
         "extended_string_unix_timestamp_ms_to_date")]).transform(input_df)
     actual_value = output_df.first().output_key
     assert actual_value == expected_value
     assert isinstance(output_df.schema["output_key"].dataType, T.DateType)
Example #3
 def test_extended_string_to_float(self, spark_session, input_value,
                                   expected_value):
     input_df = self.create_input_df(input_value, spark_session)
     output_df = Mapper(
         mapping=[("output_key", "input_key",
                   "extended_string_to_float")]).transform(input_df)
     actual_value = output_df.first().output_key
     if actual_value is not None:
         assert pytest.approx(actual_value) == expected_value
     else:
         assert actual_value == expected_value
     assert isinstance(output_df.schema["output_key"].dataType, T.FloatType)
Example #4
 def test_spark_sql_object(self, spark_session, input_value_1,
                           input_value_2, mapper_function, expected_value):
     input_df = self.create_input_df(input_value_1, input_value_2,
                                     spark_session)
     output_df = Mapper(mapping=[("output_key", mapper_function,
                                  "as_is")]).transform(input_df)
     actual = output_df.first().output_key
     if isinstance(expected_value, datetime.datetime):
         assert (expected_value - datetime.timedelta(seconds=30)
                 ) < actual < datetime.datetime.now()
     else:
         assert actual == expected_value
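The mapper_function in Example #4 is a Spark SQL Column object that the "as_is" transformation passes through unchanged. A hypothetical parametrization sketch (column names and cases are illustrative assumptions, not taken from the original suite):

spark_sql_object_test_cases = [
    # (input_value_1, input_value_2, mapper_function, expected_value)
    ("Hello", " World",
     F.concat(F.col("first_key"), F.col("second_key")), "Hello World"),
    # For non-deterministic columns the test only checks a 30-second window
    # around a timestamp captured at parametrization time.
    (None, None, F.current_timestamp(), datetime.datetime.now()),
]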
Example #5
 def test_extended_string_to_date_spark2(self, spark_session, input_value,
                                         expected_value):
     input_df = self.create_input_df(input_value, spark_session)
     output_df = Mapper(
         mapping=[("output_key", "input_key",
                   "extended_string_to_date")]).transform(input_df)
     try:
         actual_value = output_df.first().output_key
     except ValueError:
         # If input is in milliseconds it will still be stored in the DF but cannot be collected in Python
         actual_value = "out_of_range_for_python"
     assert actual_value == expected_value
     assert isinstance(output_df.schema["output_key"].dataType, T.DateType)
Example #6
    def test_multiple_columns_are_accessed(self, spark_session):
        input_df = spark_session.createDataFrame([
            Row(first_name="David", last_name="Eigenstuhler"),
            Row(first_name="Katharina", last_name="Hohensinn"),
            Row(first_name="Nora", last_name="Hohensinn"),
        ])
        input_values = input_df.rdd.map(lambda x: x.asDict()).collect()
        expected_values = [
            d["first_name"] + "_" + d["last_name"] for d in input_values
        ]

        def _first_and_last_name(source_column, name):
            return F.concat_ws("_", source_column,
                               F.col("last_name")).alias(name)

        custom_types.add_custom_data_type(function_name="fullname",
                                          func=_first_and_last_name)

        output_df = Mapper([("full_name", "first_name", "fullname")
                            ]).transform(input_df)

        output_values = output_df.rdd.map(
            lambda x: x.asDict()["full_name"]).collect()

        assert expected_values == output_values
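The custom_types module used to register the "fullname" transformation presumably comes from spooq's mapper transformer package; the exact import path is an assumption and may differ between spooq versions.

from spooq.transformer import mapper_custom_data_types as custom_types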
Example #7
 def test_prepending_a_mapping(self, mapped_df, new_mapping, input_columns,
                               new_columns):
     """Output schema is correct for added mapping at the beginning of the input schema"""
     new_mapped_df = Mapper(
         mapping=new_mapping, mode="prepend",
         ignore_missing_columns=True).transform(mapped_df)
     assert new_columns + input_columns == new_mapped_df.columns
Example #8
 def test_generate_select_expression_for_unix_timestamp_ms_to_spark_timestamp(
         self, input_value, spark_session):
     input_df = spark_session.createDataFrame(
         [Row(input_column=input_value)],
         schema=T.StructType(
             [T.StructField("input_column", T.LongType(), True)]))
     output_df = Mapper(mapping=[(
         "output_column", "input_column",
         "unix_timestamp_ms_to_spark_timestamp")]).transform(input_df)
     expected_value = datetime.datetime.fromtimestamp(input_value / 1000.0)
     assert output_df.first().output_column == expected_value, "Processing of column value"
     assert output_df.schema.fieldNames() == ["output_column"], "Renaming of column"
     assert output_df.schema["output_column"].dataType.typeName() == "timestamp", "Casting of column"
Example #9
 def test_generate_select_expression_for_meters_to_cm(
         self, input_value, expected_value, spark_session):
     input_df = spark_session.createDataFrame(
         data=[Row(input_key=input_value)],
         schema=T.StructType([
             T.StructField("input_key", get_spark_data_type(input_value),
                           True)
         ]),
     )
     output_df = Mapper(mapping=[("output_column", "input_key",
                                  "meters_to_cm")]).transform(input_df)
     assert output_df.first().output_column == expected_value, "Processing of column value"
     assert output_df.schema.fieldNames() == ["output_column"], "Renaming of column"
     assert output_df.schema["output_column"].dataType.typeName() == "integer", "Casting of column"
Example #10
 def test_extended_string_to_timestamp(self, spark_session, input_value,
                                       expected_value):
     # test uses timezone set to GMT / UTC (pytest.ini)!
     input_df = self.create_input_df(input_value, spark_session)
     output_df = Mapper(
         mapping=[("output_key", "input_key",
                   "extended_string_to_timestamp")]).transform(input_df)
     # workaround via pandas necessary due to bug with direct conversion
     # to python datetime wrt timezone conversions (https://issues.apache.org/jira/browse/SPARK-32123)
     output_pd_df = output_df.toPandas()
     output_value = output_pd_df.iloc[0]["output_key"]
     if isinstance(output_value, type(pd.NaT)):
         actual_value = None
     else:
         actual_value = output_value.to_pydatetime()
     assert actual_value == expected_value
     assert isinstance(output_df.schema["output_key"].dataType,
                       T.TimestampType)
Example #11
 def test_extended_string_to_timestamp_spark2(self, spark_session,
                                              input_value, expected_value):
     # test uses timezone set to GMT / UTC (pytest.ini)!
     input_df = self.create_input_df(input_value, spark_session)
     output_df = Mapper(
         mapping=[("output_key", "input_key",
                   "extended_string_to_timestamp")]).transform(input_df)
     # workaround via pandas necessary due to bug with direct conversion
     # to python datetime wrt timezone conversions (https://issues.apache.org/jira/browse/SPARK-32123)
     try:
         output_pd_df = output_df.toPandas()
         actual_value = output_pd_df.iloc[0]["output_key"].to_pydatetime()
     except ValueError:
         # If input is in milliseconds it will still be stored in the DF but cannot be collected in Python
         actual_value = "out_of_range_for_python"
     except AttributeError:
         # `.to_pydatetime()` can only be used on datetimes and throws AttributeErrors on other objects / None
         actual_value = None
     assert actual_value == expected_value
     assert isinstance(output_df.schema["output_key"].dataType,
                       T.TimestampType)
Example #12
 def test_extended_string_unix_timestamp_ms_to_timestamp_spark2(
         self, spark_session, input_value, expected_value):
     # test uses timezone set to GMT / UTC (pytest.ini)!
     input_df = self.create_input_df(input_value, spark_session)
     output_df = Mapper(
         mapping=[("output_key", "input_key",
                   "extended_string_unix_timestamp_ms_to_timestamp"
                   )]).transform(input_df)
     # workaround via pandas necessary due to bug with direct conversion
     # to python datetime wrt timezone conversions (https://issues.apache.org/jira/browse/SPARK-32123)
     try:
         output_pd_df = output_df.toPandas()
         actual_value = output_pd_df.iloc[0]["output_key"].to_pydatetime()
         # An assert on a parenthesized tuple is always truthy; keep the
         # comparison and its failure message as separate assert operands.
         assert actual_value.toordinal() == expected_value.toordinal(), (
             "actual_value: {act_val}, expected value: {expected_val}".format(
                 act_val=actual_value, expected_val=expected_value))
     except AttributeError:
         # `.to_pydatetime()` can only be used on datetimes and throws AttributeErrors on None
         assert expected_value is None
     assert isinstance(output_df.schema["output_key"].dataType,
                       T.TimestampType)
Example #13
 def test_prepending_a_mapping_with_duplicated_columns(
         self, input_columns, mapped_df):
     """Output schema is correct for newly prepended mapping with columns
     that are also included in the input schema"""
     new_mapping = [
         ("created_date", "meta.created_at_sec", "DateType"),
         ("birthday", "birthday", "DateType"),
     ]
     new_columns = [name for (name, path, data_type) in new_mapping]
     new_columns_deduplicated = [
         x for x in new_columns if x not in input_columns
     ]
     new_mapped_df = Mapper(
         mapping=new_mapping, mode="prepend",
         ignore_missing_columns=True).transform(mapped_df)
     assert new_columns_deduplicated + input_columns == new_mapped_df.columns
     assert mapped_df.schema["birthday"].dataType == T.TimestampType()
     assert new_mapped_df.schema["birthday"].dataType == T.DateType()
Example #14
    def test_function_name_is_shortened(self, spark_session):
        input_df = spark_session.createDataFrame([
            Row(first_name="David"),
            Row(first_name="Katharina"),
            Row(first_name="Nora"),
        ])
        input_values = input_df.rdd.map(
            lambda x: x.asDict()["first_name"]).collect()
        expected_values = [fn.lower() for fn in input_values]

        def _lowercase(source_column, name):
            return F.lower(source_column).alias(name)

        custom_types.add_custom_data_type(function_name="lowercase",
                                          func=_lowercase)

        output_df = Mapper([("first_name", "first_name", "lowercase")
                            ]).transform(input_df)
        output_values = output_df.rdd.map(
            lambda x: x.asDict()["first_name"]).collect()

        assert expected_values == output_values
Example #15
 def test_ambiguous_column_names_exception_is_ignored(
         self, input_df, mapping, expected_output_df):
     transformer = Mapper(mapping, ignore_ambiguous_columns=True)
     output_df = transformer.transform(input_df)
     assert_df_equality(expected_output_df, output_df)
Example #16
 def test_ambiguous_column_names_raise_exception(self, input_df, mapping):
     transformer = Mapper(mapping)
     with pytest.raises(AnalysisException):
         transformer.transform(input_df)
Example #17
 def transformer(self, mapping):
     return Mapper(mapping=mapping, ignore_missing_columns=False)
Example #18
def transformer(mapping):
    return Mapper(mapping=mapping, ignore_missing_columns=True)
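Examples #17 and #18 read like pytest fixtures whose @pytest.fixture decorators were dropped by the excerpt. A minimal sketch of how such a fixture is typically declared and consumed (assumed, not taken from the original source):

@pytest.fixture
def transformer(mapping):
    return Mapper(mapping=mapping, ignore_missing_columns=True)


def test_transform_tolerates_missing_source_columns(transformer, input_df):
    # With ignore_missing_columns=True, source columns that are absent from
    # input_df should not cause the transformation to raise.
    transformer.transform(input_df)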