def test_extended_string_to_long(self, spark_session, input_value, expected_value):
    input_df = self.create_input_df(input_value, spark_session)
    output_df = Mapper(
        mapping=[("output_key", "input_key", "extended_string_to_long")]
    ).transform(input_df)

    assert output_df.first().output_key == expected_value
    assert isinstance(output_df.schema["output_key"].dataType, T.LongType)
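# `create_input_df` is a helper defined on the test class outside of this excerpt. A minimal
# sketch of what such a helper could look like (an assumption for illustration only, not the
# actual implementation):
#
#     def create_input_df(self, input_value, spark_session):
#         return spark_session.createDataFrame(
#             [Row(input_key=input_value)],
#             schema=T.StructType([T.StructField("input_key", T.StringType(), True)]),
#         )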
def test_extended_string_unix_timestamp_ms_to_date_spark2(
        self, spark_session, input_value, expected_value):
    input_df = self.create_input_df(input_value, spark_session)
    output_df = Mapper(
        mapping=[("output_key", "input_key", "extended_string_unix_timestamp_ms_to_date")]
    ).transform(input_df)
    actual_value = output_df.first().output_key

    assert actual_value == expected_value
    assert isinstance(output_df.schema["output_key"].dataType, T.DateType)
def test_extended_string_to_float(self, spark_session, input_value, expected_value):
    input_df = self.create_input_df(input_value, spark_session)
    output_df = Mapper(
        mapping=[("output_key", "input_key", "extended_string_to_float")]
    ).transform(input_df)
    actual_value = output_df.first().output_key

    if actual_value is not None:
        # FloatType is single precision, so the value is only compared approximately
        assert pytest.approx(actual_value) == expected_value
    else:
        assert actual_value == expected_value
    assert isinstance(output_df.schema["output_key"].dataType, T.FloatType)
def test_spark_sql_object(self, spark_session, input_value_1, input_value_2,
                          mapper_function, expected_value):
    input_df = self.create_input_df(input_value_1, input_value_2, spark_session)
    output_df = Mapper(mapping=[("output_key", mapper_function, "as_is")]).transform(input_df)
    actual = output_df.first().output_key

    if isinstance(expected_value, datetime.datetime):
        # dynamically generated timestamps cannot be compared exactly,
        # so the value only has to fall within a 30 second window
        assert (expected_value - datetime.timedelta(seconds=30)) < actual < datetime.datetime.now()
    else:
        assert actual == expected_value
def test_extended_string_to_date_spark2(self, spark_session, input_value, expected_value):
    input_df = self.create_input_df(input_value, spark_session)
    output_df = Mapper(
        mapping=[("output_key", "input_key", "extended_string_to_date")]
    ).transform(input_df)
    try:
        actual_value = output_df.first().output_key
    except ValueError:
        # If input is in milliseconds it will still be stored in the DF
        # but cannot be collected in Python
        actual_value = "out_of_range_for_python"

    assert actual_value == expected_value
    assert isinstance(output_df.schema["output_key"].dataType, T.DateType)
def test_multiple_columns_are_accessed(self, spark_session):
    input_df = spark_session.createDataFrame([
        Row(first_name="David", last_name="Eigenstuhler"),
        Row(first_name="Katharina", last_name="Hohensinn"),
        Row(first_name="Nora", last_name="Hohensinn"),
    ])
    input_values = input_df.rdd.map(lambda x: x.asDict()).collect()
    expected_values = [d["first_name"] + "_" + d["last_name"] for d in input_values]

    def _first_and_last_name(source_column, name):
        return F.concat_ws("_", source_column, F.col("last_name")).alias(name)

    custom_types.add_custom_data_type(function_name="fullname", func=_first_and_last_name)

    output_df = Mapper([("full_name", "first_name", "fullname")]).transform(input_df)
    output_values = output_df.rdd.map(lambda x: x.asDict()["full_name"]).collect()

    assert expected_values == output_values
def test_prepending_a_mapping(self, mapped_df, new_mapping, input_columns, new_columns):
    """Output schema is correct when a new mapping is prepended to the input schema"""
    new_mapped_df = Mapper(
        mapping=new_mapping,
        mode="prepend",
        ignore_missing_columns=True,
    ).transform(mapped_df)

    assert new_columns + input_columns == new_mapped_df.columns
def test_generate_select_expression_for_unix_timestamp_ms_to_spark_timestamp(
        self, input_value, spark_session):
    input_df = spark_session.createDataFrame(
        [Row(input_column=input_value)],
        schema=T.StructType([T.StructField("input_column", T.LongType(), True)]),
    )
    output_df = Mapper(
        mapping=[("output_column", "input_column", "unix_timestamp_ms_to_spark_timestamp")]
    ).transform(input_df)
    expected_value = datetime.datetime.fromtimestamp(input_value / 1000.0)

    assert output_df.first().output_column == expected_value, "Processing of column value"
    assert output_df.schema.fieldNames() == ["output_column"], "Renaming of column"
    assert output_df.schema["output_column"].dataType.typeName() == "timestamp", "Casting of column"
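# Worked example of the expected-value arithmetic with an illustrative input (not taken from
# the actual parametrization): 1_591_627_696_000 milliseconds corresponds to
# datetime.datetime.fromtimestamp(1_591_627_696_000 / 1000.0)
# == datetime.datetime(2020, 6, 8, 14, 48, 16) when the session timezone is UTC.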
def test_generate_select_expression_for_meters_to_cm(
        self, input_value, expected_value, spark_session):
    input_df = spark_session.createDataFrame(
        data=[Row(input_key=input_value)],
        schema=T.StructType([
            T.StructField("input_key", get_spark_data_type(input_value), True)
        ]),
    )
    output_df = Mapper(
        mapping=[("output_column", "input_key", "meters_to_cm")]
    ).transform(input_df)

    assert output_df.first().output_column == expected_value, "Processing of column value"
    assert output_df.schema.fieldNames() == ["output_column"], "Renaming of column"
    assert output_df.schema["output_column"].dataType.typeName() == "integer", "Casting of column"
def test_extended_string_to_timestamp(self, spark_session, input_value, expected_value):
    # test uses timezone set to GMT / UTC (pytest.ini)!
    input_df = self.create_input_df(input_value, spark_session)
    output_df = Mapper(
        mapping=[("output_key", "input_key", "extended_string_to_timestamp")]
    ).transform(input_df)
    # workaround via pandas necessary due to bug with direct conversion to python datetime
    # wrt timezone conversions (https://issues.apache.org/jira/browse/SPARK-32123)
    output_pd_df = output_df.toPandas()
    output_value = output_pd_df.iloc[0]["output_key"]
    if isinstance(output_value, type(pd.NaT)):
        actual_value = None
    else:
        actual_value = output_value.to_pydatetime()

    assert actual_value == expected_value
    assert isinstance(output_df.schema["output_key"].dataType, T.TimestampType)
def test_extended_string_to_timestamp_spark2(self, spark_session, input_value, expected_value):
    # test uses timezone set to GMT / UTC (pytest.ini)!
    input_df = self.create_input_df(input_value, spark_session)
    output_df = Mapper(
        mapping=[("output_key", "input_key", "extended_string_to_timestamp")]
    ).transform(input_df)
    # workaround via pandas necessary due to bug with direct conversion to python datetime
    # wrt timezone conversions (https://issues.apache.org/jira/browse/SPARK-32123)
    try:
        output_pd_df = output_df.toPandas()
        actual_value = output_pd_df.iloc[0]["output_key"].to_pydatetime()
    except ValueError:
        # If input is in milliseconds it will still be stored in the DF
        # but cannot be collected in Python
        actual_value = "out_of_range_for_python"
    except AttributeError:
        # `.to_pydatetime()` can only be used on datetimes and throws
        # AttributeErrors on other objects / None
        actual_value = None

    assert actual_value == expected_value
    assert isinstance(output_df.schema["output_key"].dataType, T.TimestampType)
def test_extended_string_unix_timestamp_ms_to_timestamp_spark2(
        self, spark_session, input_value, expected_value):
    # test uses timezone set to GMT / UTC (pytest.ini)!
    input_df = self.create_input_df(input_value, spark_session)
    output_df = Mapper(
        mapping=[("output_key", "input_key", "extended_string_unix_timestamp_ms_to_timestamp")]
    ).transform(input_df)
    # workaround via pandas necessary due to bug with direct conversion to python datetime
    # wrt timezone conversions (https://issues.apache.org/jira/browse/SPARK-32123)
    try:
        output_pd_df = output_df.toPandas()
        actual_value = output_pd_df.iloc[0]["output_key"].to_pydatetime()
        # the assertion message has to be the second operand of the assert statement;
        # wrapping both in parentheses would assert an always-truthy tuple
        assert actual_value.toordinal() == expected_value.toordinal(), (
            "actual_value: {act_val}, expected value: {expected_val}".format(
                act_val=actual_value, expected_val=expected_value))
    except AttributeError:
        # `.to_pydatetime()` can only be used on datetimes and throws AttributeErrors on None
        assert expected_value is None

    assert isinstance(output_df.schema["output_key"].dataType, T.TimestampType)
def test_prepending_a_mapping_with_duplicated_columns(self, input_columns, mapped_df):
    """Output schema is correct when the prepended mapping contains columns
    that are also included in the input schema"""
    new_mapping = [
        ("created_date", "meta.created_at_sec", "DateType"),
        ("birthday", "birthday", "DateType"),
    ]
    new_columns = [name for (name, path, data_type) in new_mapping]
    new_columns_deduplicated = [x for x in new_columns if x not in input_columns]
    new_mapped_df = Mapper(
        mapping=new_mapping,
        mode="prepend",
        ignore_missing_columns=True,
    ).transform(mapped_df)

    assert new_columns_deduplicated + input_columns == new_mapped_df.columns
    assert mapped_df.schema["birthday"].dataType == T.TimestampType()
    assert new_mapped_df.schema["birthday"].dataType == T.DateType()
def test_function_name_is_shortened(self, spark_session):
    input_df = spark_session.createDataFrame([
        Row(first_name="David"),
        Row(first_name="Katharina"),
        Row(first_name="Nora"),
    ])
    input_values = input_df.rdd.map(lambda x: x.asDict()["first_name"]).collect()
    expected_values = [fn.lower() for fn in input_values]

    def _lowercase(source_column, name):
        return F.lower(source_column).alias(name)

    custom_types.add_custom_data_type(function_name="lowercase", func=_lowercase)

    output_df = Mapper([("first_name", "first_name", "lowercase")]).transform(input_df)
    output_values = output_df.rdd.map(lambda x: x.asDict()["first_name"]).collect()

    assert expected_values == output_values
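# A minimal sketch of the custom data type registration pattern exercised above, with made-up
# names (`trimmed_upper`, `clean_name`, `raw_name`) purely for illustration:
#
#     def _trimmed_upper(source_column, name):
#         return F.upper(F.trim(source_column)).alias(name)
#
#     custom_types.add_custom_data_type(function_name="trimmed_upper", func=_trimmed_upper)
#     Mapper([("clean_name", "raw_name", "trimmed_upper")]).transform(input_df)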
def test_ambiguous_column_names_exception_is_ignored(self, input_df, mapping, expected_output_df):
    transformer = Mapper(mapping, ignore_ambiguous_columns=True)
    output_df = transformer.transform(input_df)
    assert_df_equality(expected_output_df, output_df)
def test_ambiguous_column_names_raise_exception(self, input_df, mapping):
    transformer = Mapper(mapping)
    with pytest.raises(AnalysisException):
        transformer.transform(input_df)
def transformer(self, mapping):
    return Mapper(mapping=mapping, ignore_missing_columns=False)
def transformer(mapping):
    return Mapper(mapping=mapping, ignore_missing_columns=True)