Example #1
 def test_extended_string_to_timestamp(self, spark_session, input_value,
                                       expected_value):
     # test uses timezone set to GMT / UTC (pytest.ini)!
     input_df = self.create_input_df(input_value, spark_session)
     output_df = Mapper(
         mapping=[("output_key", "input_key",
                   "extended_string_to_timestamp")]).transform(input_df)
     # workaround via pandas necessary due to bug with direct conversion
     # to python datetime wrt timezone conversions (https://issues.apache.org/jira/browse/SPARK-32123)
     output_pd_df = output_df.toPandas()
     output_value = output_pd_df.iloc[0]["output_key"]
     if output_value is pd.NaT:
         # pandas marks missing timestamps with the NaT singleton;
         # normalize it to None so it compares cleanly against expected_value
         actual_value = None
     else:
         actual_value = output_value.to_pydatetime()
     assert actual_value == expected_value
     assert isinstance(output_df.schema["output_key"].dataType,
                       T.TimestampType)
Example #2
 def test_extended_string_unix_timestamp_ms_to_timestamp_spark2(
         self, spark_session, input_value, expected_value):
     # test uses timezone set to GMT / UTC (pytest.ini)!
     input_df = self.create_input_df(input_value, spark_session)
     output_df = Mapper(
         mapping=[("output_key", "input_key",
                   "extended_string_unix_timestamp_ms_to_timestamp"
                   )]).transform(input_df)
     # workaround via pandas necessary due to bug with direct conversion
     # to python datetime wrt timezone conversions (https://issues.apache.org/jira/browse/SPARK-32123)
     try:
         output_pd_df = output_df.toPandas()
         actual_value = output_pd_df.iloc[0]["output_key"].to_pydatetime()
         # note: `assert (expr, msg)` would always pass, since a non-empty
         # tuple is truthy; the comma form below actually checks the expression
         assert actual_value.toordinal() == expected_value.toordinal(), (
             "actual_value: {act_val}, expected value: {expected_val}".format(
                 act_val=actual_value, expected_val=expected_value))
     except AttributeError:
         # `.to_pydatetime()` can only be used on datetimes and throws AttributeErrors on None
         assert expected_value is None
     assert isinstance(output_df.schema["output_key"].dataType,
                       T.TimestampType)
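The comma form matters here: wrapping the expression and the message in one parenthesized tuple -- `assert (expr, msg)` -- asserts the truthiness of a two-element tuple, which always passes; CPython 3.8+ emits a SyntaxWarning for exactly this mistake. A minimal standalone demonstration:

 # Broken: asserts a two-element tuple, which is truthy -- never fails.
 assert (1 == 2, "unreachable message")   # passes silently (SyntaxWarning on 3.8+)

 # Correct: the comma separates the checked expression from the failure message.
 assert 1 == 2, "now this raises AssertionError"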
Example #3
 def test_extended_string_to_timestamp_spark2(self, spark_session,
                                              input_value, expected_value):
     # test uses timezone set to GMT / UTC (pytest.ini)!
     input_df = self.create_input_df(input_value, spark_session)
     output_df = Mapper(
         mapping=[("output_key", "input_key",
                   "extended_string_to_timestamp")]).transform(input_df)
     # workaround via pandas necessary due to bug with direct conversion
     # to python datetime wrt timezone conversions (https://issues.apache.org/jira/browse/SPARK-32123)
     try:
         output_pd_df = output_df.toPandas()
         actual_value = output_pd_df.iloc[0]["output_key"].to_pydatetime()
     except ValueError:
         # If input is in milliseconds it will still be stored in the DF but cannot be collected in Python
         actual_value = "out_of_range_for_python"
     except AttributeError:
         # `.to_pydatetime()` can only be used on datetimes and throws AttributeErrors on other objects / None
         actual_value = None
     assert actual_value == expected_value
     assert isinstance(output_df.schema["output_key"].dataType,
                       T.TimestampType)
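All three tests rely on the Spark session timezone being pinned to GMT/UTC via pytest.ini, which is not shown in the snippets. A hedged sketch of one way to get the same guarantee with a conftest.py fixture follows; the fixture name matches what the tests request, but the project's actual setup (e.g. a pytest-spark option in pytest.ini) may differ.

 import pytest
 from pyspark.sql import SparkSession


 @pytest.fixture(scope="session")
 def spark_session():
     # Pin the JVM session timezone so string-to-timestamp conversions are
     # deterministic regardless of the host machine's locale.
     spark = (
         SparkSession.builder
         .master("local[1]")
         .appName("mapper-timestamp-tests")
         .config("spark.sql.session.timeZone", "UTC")
         .getOrCreate()
     )
     yield spark
     spark.stop()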