Example #1
 def test_generate_select_expression_for_meters_to_cm(
         self, input_value, expected_value, spark_session):
     input_df = spark_session.createDataFrame(
         data=[Row(input_key=input_value)],
         schema=T.StructType([
             T.StructField("input_key", get_spark_data_type(input_value),
                           True)
         ]))
     output_df = Mapper(mapping=[("output_column", "input_key",
                                  "meters_to_cm")]).transform(input_df)
     assert output_df.first().output_column == expected_value, \
         "Processing of column value"
     assert output_df.schema.fieldNames() == ["output_column"], \
         "Renaming of column"
     assert output_df.schema["output_column"].dataType.typeName() == "integer", \
         "Casting of column"
Example #2
 def test_extended_string_to_timestamp(self, spark_session, input_value,
                                       expected_value):
     # test uses timezone set to GMT / UTC (pytest.ini)!
     input_df = self.create_input_df(input_value, spark_session)
     output_df = Mapper(
         mapping=[("output_key", "input_key",
                   "extended_string_to_timestamp")]).transform(input_df)
     # workaround via pandas necessary due to bug with direct conversion
     # to python datetime wrt timezone conversions (https://issues.apache.org/jira/browse/SPARK-32123)
     output_pd_df = output_df.toPandas()
     output_value = output_pd_df.iloc[0]["output_key"]
     if isinstance(output_value, type(pd.NaT)):
         actual_value = None
     else:
         actual_value = output_value.to_pydatetime()
     assert actual_value == expected_value
     assert isinstance(output_df.schema["output_key"].dataType,
                       T.TimestampType)
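
create_input_df is likewise referenced but not shown. A minimal sketch, assuming it wraps the value into a one-row DataFrame with a single nullable StringType column (plausible given the extended_string_* transformations, but an assumption):

from pyspark.sql import Row
from pyspark.sql import types as T


def create_input_df(self, input_value, spark_session):
    # Sketch only: one row, one nullable "input_key" column of StringType,
    # matching the source column referenced in the mappings above.
    return spark_session.createDataFrame(
        data=[Row(input_key=input_value)],
        schema=T.StructType(
            [T.StructField("input_key", T.StringType(), True)]))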
Example #3
 def test_prepending_a_mapping_with_duplicated_columns(
         self, input_columns, mapped_df):
     """Output schema is correct for newly prepended mapping with columns
     that are also included in the input schema"""
     new_mapping = [
         ("created_date", "meta.created_at_sec", "DateType"),
         ("birthday", "birthday", "DateType"),
     ]
     new_columns = [name for (name, path, data_type) in new_mapping]
     new_columns_deduplicated = [
         x for x in new_columns if x not in input_columns
     ]
     new_mapped_df = Mapper(
         mapping=new_mapping, mode="prepend",
         ignore_missing_columns=True).transform(mapped_df)
     assert new_columns_deduplicated + input_columns == new_mapped_df.columns
     assert mapped_df.schema["birthday"].dataType == T.TimestampType()
     assert new_mapped_df.schema["birthday"].dataType == T.DateType()
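
A concrete trace of the deduplication logic above, with illustrative column names:

# input_columns            == ["birthday", "email"]
# new_columns              == ["created_date", "birthday"]
# new_columns_deduplicated == ["created_date"]
# new_mapped_df.columns    == ["created_date", "birthday", "email"]
#
# "created_date" is genuinely new and is prepended; "birthday" already
# exists in the input, keeps its position, and is only re-cast from
# TimestampType to DateType.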
Example #4
 def test_extended_string_unix_timestamp_ms_to_timestamp_spark2(
         self, spark_session, input_value, expected_value):
     # test uses timezone set to GMT / UTC (pytest.ini)!
     input_df = self.create_input_df(input_value, spark_session)
     output_df = Mapper(
         mapping=[("output_key", "input_key",
                   "extended_string_unix_timestamp_ms_to_timestamp"
                   )]).transform(input_df)
     # workaround via pandas necessary due to bug with direct conversion
     # to python datetime wrt timezone conversions (https://issues.apache.org/jira/browse/SPARK-32123)
     try:
         output_pd_df = output_df.toPandas()
         actual_value = output_pd_df.iloc[0]["output_key"].to_pydatetime()
         # compare dates at day granularity via toordinal()
         assert actual_value.toordinal() == expected_value.toordinal(), \
             "actual_value: {act_val}, expected value: {expected_val}".format(
                 act_val=actual_value, expected_val=expected_value)
     except AttributeError:
         # `.to_pydatetime()` can only be used on datetimes and throws AttributeErrors on None
         assert expected_value is None
     assert isinstance(output_df.schema["output_key"].dataType,
                       T.TimestampType)
Example #5
 def test_extended_string_to_timestamp_spark2(self, spark_session,
                                              input_value, expected_value):
     # test uses timezone set to GMT / UTC (pytest.ini)!
     input_df = self.create_input_df(input_value, spark_session)
     output_df = Mapper(
         mapping=[("output_key", "input_key",
                   "extended_string_to_timestamp")]).transform(input_df)
     # workaround via pandas necessary due to bug with direct conversion
     # to python datetime wrt timezone conversions (https://issues.apache.org/jira/browse/SPARK-32123)
     try:
         output_pd_df = output_df.toPandas()
         actual_value = output_pd_df.iloc[0]["output_key"].to_pydatetime()
     except ValueError:
         # If input is in milliseconds it will still be stored in the DF but cannot be collected in Python
         actual_value = "out_of_range_for_python"
     except AttributeError:
         # `.to_pydatetime()` can only be used on datetimes and throws AttributeErrors on other objects / None
         actual_value = None
     assert actual_value == expected_value
     assert isinstance(output_df.schema["output_key"].dataType,
                       T.TimestampType)
Example #6
 def test_function_name_is_shortened(self, spark_session):
     input_df = spark_session.createDataFrame([
         Row(first_name="David"),
         Row(first_name="Katharina"),
         Row(first_name="Nora"),
     ])
     input_values = input_df.rdd.map(
         lambda x: x.asDict()["first_name"]).collect()
     expected_values = [fn.lower() for fn in input_values]

     def _lowercase(source_column, name):
         return F.lower(source_column).alias(name)

     custom_types.add_custom_data_type(function_name="lowercase",
                                       func=_lowercase)

     output_df = Mapper([
         ("first_name", "first_name", "lowercase"),
     ]).transform(input_df)
     output_values = output_df.rdd.map(
         lambda x: x.asDict()["first_name"]).collect()

     assert expected_values == output_values
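
The registration pattern generalizes: any function that takes the source Column and the output column name and returns an aliased Column expression can be registered under a short name. A further illustrative sketch (the "trimmed" name is made up; custom_types refers to the same module as in the snippet above):

import pyspark.sql.functions as F


def _trimmed(source_column, name):
    # Custom data types receive the source Column and the target column
    # name and must return an aliased Column expression.
    return F.trim(source_column).alias(name)


custom_types.add_custom_data_type(function_name="trimmed", func=_trimmed)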
Example #7
 def transformer(self, mapping):
     return Mapper(mapping=mapping, ignore_missing_columns=False)
Example #8
def transformer(mapping):
    return Mapper(mapping=mapping, ignore_missing_columns=True)
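
Examples #7 and #8 read like pytest fixtures that hand a preconfigured Mapper to the tests. A minimal sketch of how such a fixture might be declared and consumed (the import path, the mapping/input_df fixtures, and the test body are assumptions):

import pytest

from spooq.transformer import Mapper  # assumed import path


@pytest.fixture()
def transformer(mapping):
    return Mapper(mapping=mapping, ignore_missing_columns=True)


def test_output_columns_follow_the_mapping(transformer, mapping, input_df):
    output_df = transformer.transform(input_df)
    assert output_df.columns == [name for (name, path, data_type) in mapping]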
Example #9
 def transform(self, input_df):
     exploded_df, mapping = self._explode_and_get_mapping(input_df)
     mapped_df = Mapper(
         mapping=mapping,
         ignore_ambiguous_columns=self.ignore_ambiguous_columns,
     ).transform(exploded_df)
     return mapped_df
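
Example #9 chains an explode step with a Mapper. A self-contained sketch of that pattern in plain PySpark (the nested column names and the mapping are illustrative; _explode_and_get_mapping stays specific to the original class):

import pyspark.sql.functions as F

from spooq.transformer import Mapper  # assumed import path


def explode_and_map(input_df):
    # Flatten one level of nesting: each element of the "items" array
    # becomes its own row, then the Mapper projects the nested fields
    # into flat output columns.
    exploded_df = input_df.withColumn("item", F.explode(F.col("items")))
    mapping = [
        ("item_id", "item.id", "StringType"),
        ("item_name", "item.name", "StringType"),
    ]
    return Mapper(mapping=mapping,
                  ignore_ambiguous_columns=True).transform(exploded_df)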