def test_extended_string_to_long(self, spark_session, input_value, expected_value):
    input_df = self.create_input_df(input_value, spark_session)
    output_df = Mapper(
        mapping=[("output_key", "input_key", "extended_string_to_long")]
    ).transform(input_df)
    assert output_df.first().output_key == expected_value
    assert isinstance(output_df.schema["output_key"].dataType, T.LongType)
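# The tests in this module rely on a create_input_df helper defined on the test
# class. A minimal sketch of what such a helper might look like is given below,
# assuming a single-row DataFrame with one string column named "input_key"
# (matching its use in the tests) and the module's existing `Row` and `T`
# imports; the actual helper in the suite may differ.
def create_input_df(self, input_value, spark_session):
    """Build a single-row DataFrame with one nullable string column 'input_key'."""
    return spark_session.createDataFrame(
        [Row(input_key=input_value)],
        schema=T.StructType([T.StructField("input_key", T.StringType(), True)]),
    )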
def test_extended_string_unix_timestamp_ms_to_date_spark2(
    self, spark_session, input_value, expected_value
):
    input_df = self.create_input_df(input_value, spark_session)
    output_df = Mapper(
        mapping=[("output_key", "input_key", "extended_string_unix_timestamp_ms_to_date")]
    ).transform(input_df)
    actual_value = output_df.first().output_key
    assert actual_value == expected_value
    assert isinstance(output_df.schema["output_key"].dataType, T.DateType)
def test_extended_string_to_float(self, spark_session, input_value, expected_value):
    input_df = self.create_input_df(input_value, spark_session)
    output_df = Mapper(
        mapping=[("output_key", "input_key", "extended_string_to_float")]
    ).transform(input_df)
    actual_value = output_df.first().output_key
    if actual_value is not None:
        assert pytest.approx(actual_value) == expected_value
    else:
        assert actual_value == expected_value
    assert isinstance(output_df.schema["output_key"].dataType, T.FloatType)
def test_spark_sql_object(
    self, spark_session, input_value_1, input_value_2, mapper_function, expected_value
):
    input_df = self.create_input_df(input_value_1, input_value_2, spark_session)
    output_df = Mapper(mapping=[("output_key", mapper_function, "as_is")]).transform(input_df)
    actual = output_df.first().output_key
    if isinstance(expected_value, datetime.datetime):
        # Timestamps produced at runtime can only be checked against a tolerance window
        assert (expected_value - datetime.timedelta(seconds=30)) < actual < datetime.datetime.now()
    else:
        assert actual == expected_value
def test_extended_string_to_date_spark2(self, spark_session, input_value, expected_value):
    input_df = self.create_input_df(input_value, spark_session)
    output_df = Mapper(
        mapping=[("output_key", "input_key", "extended_string_to_date")]
    ).transform(input_df)
    try:
        actual_value = output_df.first().output_key
    except ValueError:
        # If input is in milliseconds it will still be stored in the DF
        # but cannot be collected in Python
        actual_value = "out_of_range_for_python"
    assert actual_value == expected_value
    assert isinstance(output_df.schema["output_key"].dataType, T.DateType)
def test_generate_select_expression_for_unix_timestamp_ms_to_spark_timestamp(
    self, input_value, spark_session
):
    input_df = spark_session.createDataFrame(
        [Row(input_column=input_value)],
        schema=T.StructType([T.StructField("input_column", T.LongType(), True)]),
    )
    output_df = Mapper(
        mapping=[("output_column", "input_column", "unix_timestamp_ms_to_spark_timestamp")]
    ).transform(input_df)
    expected_value = datetime.datetime.fromtimestamp(input_value / 1000.0)
    assert output_df.first().output_column == expected_value, "Processing of column value"
    assert output_df.schema.fieldNames() == ["output_column"], "Renaming of column"
    assert output_df.schema["output_column"].dataType.typeName() == "timestamp", "Casting of column"
def test_generate_select_expression_for_meters_to_cm(
    self, input_value, expected_value, spark_session
):
    input_df = spark_session.createDataFrame(
        data=[Row(input_key=input_value)],
        schema=T.StructType(
            [T.StructField("input_key", get_spark_data_type(input_value), True)]
        ),
    )
    output_df = Mapper(
        mapping=[("output_column", "input_key", "meters_to_cm")]
    ).transform(input_df)
    assert output_df.first().output_column == expected_value, "Processing of column value"
    assert output_df.schema.fieldNames() == ["output_column"], "Renaming of column"
    assert output_df.schema["output_column"].dataType.typeName() == "integer", "Casting of column"
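# The test above derives the Spark type of the input column from the Python
# value via a get_spark_data_type helper. A minimal sketch of such a mapping is
# shown below, assuming it only needs to cover the plain Python types used in
# the parametrized inputs; the suite's real helper may behave differently.
def get_spark_data_type(input_value):
    """Map a plain Python value to a matching Spark SQL data type."""
    return {
        bool: T.BooleanType(),
        int: T.LongType(),
        float: T.DoubleType(),
        str: T.StringType(),
    }.get(type(input_value), T.StringType())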