Ejemplo n.º 1
0
 def test_vertica_export_options_datetimeformattz(self):
     """Check which datetimeformattz hints Vertica export accepts.

     Vertica doesn't currently allow any configurability on output
     datetimeformattz, so only its native format should be accepted.
     Check again before adding any test cases here!
     """
     # Maps hint value -> whether vertica_export_options() should
     # reject it with NotImplementedError.
     should_raise = {
         'YYYY-MM-DD HH:MI:SS': True,
         'YYYY-MM-DD HH24:MI:SSOF': False,
         'MM/DD/YY HH24:MI': True,
     }
     for datetimeformattz in DATETIMETZ_CASES:
         records_format = DelimitedRecordsFormat(
             variant='vertica',
             hints={'datetimeformattz': datetimeformattz})
         unhandled_hints = set(records_format.hints)
         processing_instructions = ProcessingInstructions(
             max_failure_rows=123)
         load_plan = RecordsLoadPlan(
             processing_instructions=processing_instructions,
             records_format=records_format)
         try:
             vertica_export_options(unhandled_hints, load_plan)
         except NotImplementedError:
             if not should_raise[datetimeformattz]:
                 self.fail(f'Unexpected NotImplementedError for '
                           f'{datetimeformattz}')
         else:
             # Bug fix: the original test silently passed when an
             # expected NotImplementedError was never raised.
             if should_raise.get(datetimeformattz, False):
                 self.fail(f'Expected NotImplementedError for '
                           f'{datetimeformattz}')
Ejemplo n.º 2
0
 def test_vertica_import_options_datetimeformat(self):
     """Check which datetimeformat hints Vertica import accepts.

     Vertica doesn't currently allow any configurability on input
     datetimeformat.  Check again before adding any test cases here!
     """
     # Maps hint value -> whether vertica_import_options() should
     # reject it with NotImplementedError.
     should_raise = {
         'YYYY-MM-DD HH:MI:SS': True,
         'YYYY-MM-DD HH24:MI:SS': False,
         'MM/DD/YY HH24:MI': True,
         'YYYY-MM-DD HH12:MI AM': True,
     }
     for datetimeformat in DATETIME_CASES:
         records_format = DelimitedRecordsFormat(variant='vertica',
                                                 hints={
                                                     'datetimeformat':
                                                     datetimeformat,
                                                 })
         unhandled_hints = set(records_format.hints)
         processing_instructions = ProcessingInstructions(
             max_failure_rows=123)
         load_plan = RecordsLoadPlan(
             processing_instructions=processing_instructions,
             records_format=records_format)
         try:
             vertica_import_options(unhandled_hints, load_plan)
         except NotImplementedError:
             if not should_raise[datetimeformat]:
                 self.fail(f'Unexpected NotImplementedError for '
                           f'{datetimeformat}')
         else:
             # Bug fix: the original test silently passed when an
             # expected NotImplementedError was never raised.
             if should_raise.get(datetimeformat, False):
                 self.fail(f'Expected NotImplementedError for '
                           f'{datetimeformat}')
Ejemplo n.º 3
0
    def test_dateformat(self) -> None:
        """Check pandas_read_csv_options against every supported dateformat.

        For each format in DATE_CASES, build a delimited records format
        whose date/datetime hints share the same date layout, request
        read_csv options, and confirm pandas parses a sample date back
        to the expected year/month/day.
        """
        class DateFormatExpectations(TypedDict):
            # Use the datetimeformat/datetimeformattz which is
            # compatible, as pandas doesn't let you configure those
            # separately
            dayfirst: bool

        # Expected read_csv options (currently just dayfirst) keyed by
        # the dateformat hint under test.
        testcases: Dict[HintDateFormat, DateFormatExpectations] = {
            'YYYY-MM-DD': {
                'dayfirst': False,
            },
            'MM-DD-YYYY': {
                'dayfirst': False,
            },
            'DD-MM-YYYY': {
                'dayfirst': True,
            },
            'MM/DD/YY': {
                'dayfirst': False,
            },
            'DD/MM/YY': {
                'dayfirst': True,
            },
            'DD-MM-YY': {
                'dayfirst': True,
            },
        }
        for dateformat in DATE_CASES:
            # Keep datetimeformat/datetimeformattz consistent with the
            # dateformat under test (see note in DateFormatExpectations).
            records_format = DelimitedRecordsFormat(hints={
                'dateformat': dateformat,
                'datetimeformat': f"{dateformat} HH:MI:SS",
                'datetimeformattz': f"{dateformat} HH:MI:SSOF",
                'compression': None,
            })
            records_schema = RecordsSchema.from_data({
                'schema': 'bltypes/v1',
                'fields': {
                    'first': {
                        'type': 'date'
                    }
                },
            })
            unhandled_hints = set(records_format.hints)
            processing_instructions = ProcessingInstructions()
            expectations = testcases[dateformat]
            try:
                options = pandas_read_csv_options(records_format,
                                                  records_schema,
                                                  unhandled_hints,
                                                  processing_instructions)
            except NotImplementedError:
                self.fail(f'Could not handle combination for {dateformat}')
            # Every expected option must appear among the generated
            # read_csv options.
            self.assertTrue(all(item in options.items() for item in expectations.items()))
            # Round-trip: parse a sample date string using the options.
            fileobj = io.StringIO(create_sample(dateformat))
            df = pandas.read_csv(filepath_or_buffer=fileobj,
                                 **options)
            timestamp = df['untitled_0'][0]
            self.assertEqual(timestamp.year, SAMPLE_YEAR)
            self.assertEqual(timestamp.month, SAMPLE_MONTH)
            self.assertEqual(timestamp.day, SAMPLE_DAY)
Ejemplo n.º 4
0
 def test_vertica_export_options_timeonlyformat(self):
     """Check which timeonlyformat hints Vertica export accepts.

     Vertica doesn't currently allow any configurability on output
     timeonlyformat.  Check again before adding any test cases here!
     """
     # Maps hint value -> whether vertica_export_options() should
     # reject it with NotImplementedError.
     should_raise = {
         'HH:MI:SS': False,
         'HH24:MI:SS': False,
         'HH24:MI': True,
         'HH12:MI AM': True,
     }
     for timeonlyformat in TIMEONLY_CASES:
         records_format = DelimitedRecordsFormat(variant='vertica',
                                                 hints={
                                                     'timeonlyformat':
                                                     timeonlyformat,
                                                 })
         unhandled_hints = set(records_format.hints)
         processing_instructions = ProcessingInstructions(
             max_failure_rows=123)
         load_plan = RecordsLoadPlan(
             processing_instructions=processing_instructions,
             records_format=records_format)
         try:
             vertica_export_options(unhandled_hints, load_plan)
         except NotImplementedError:
             if not should_raise[timeonlyformat]:
                 # Re-raise to preserve the traceback for an
                 # unexpected rejection.
                 raise
         else:
             # Bug fix: the original test silently passed when an
             # expected NotImplementedError was never raised.
             if should_raise.get(timeonlyformat, False):
                 self.fail(f'Expected NotImplementedError for '
                           f'{timeonlyformat}')
Ejemplo n.º 5
0
 def test_bluelabs_with_compression(self):
     """GZIP-compressed bluelabs input is rejected for Postgres COPY."""
     records_format = DelimitedRecordsFormat(variant='bluelabs',
                                             hints={'compression': 'GZIP'})
     remaining_hints = set(records_format.hints)
     plan = RecordsLoadPlan(ProcessingInstructions(), records_format)
     with self.assertRaises(NotImplementedError):
         postgres_copy_from_options(remaining_hints, plan)
Ejemplo n.º 6
0
 def test_vertica(self):
     """A backslash escape on the vertica variant is rejected for COPY."""
     records_format = DelimitedRecordsFormat(variant='vertica',
                                             hints={'compression': None})
     # Inject an escape hint the vertica variant doesn't normally carry.
     records_format.hints['escape'] = '\\'
     remaining_hints = set(records_format.hints)
     plan = RecordsLoadPlan(ProcessingInstructions(), records_format)
     with self.assertRaises(NotImplementedError):
         postgres_copy_from_options(remaining_hints, plan)
Ejemplo n.º 7
0
 def test_new_compression_hint(self):
     """An unrecognized encoding hint value is rejected for COPY."""
     records_format = DelimitedRecordsFormat(variant='bluelabs',
                                             hints={'compression': None})
     # Simulate a hint value the options builder has never seen.
     records_format.hints['encoding'] = 'NEWNEWENCODING'
     remaining_hints = set(records_format.hints)
     plan = RecordsLoadPlan(ProcessingInstructions(), records_format)
     with self.assertRaises(NotImplementedError):
         postgres_copy_from_options(remaining_hints, plan)
Ejemplo n.º 8
0
    def test_prep_df_for_csv_output_include_index(self):
        """With include_index=True, index and time columns become strings.

        Builds a one-row dataframe with a Timestamp index and a
        'time'-typed column, runs prep_df_for_csv_output, and checks
        both are converted to their CSV string forms.
        """
        schema_data = {
            'schema': "bltypes/v1",
            'fields': {
                "date": {
                    "type": "date",
                    "index": 1,
                },
                "time": {
                    "type": "time",
                    "index": 2,
                },
                "timetz": {
                    "type": "timetz",
                    "index": 3,
                },
            }
        }
        records_format = DelimitedRecordsFormat(variant='bluelabs')
        records_schema = RecordsSchema.from_data(schema_data)
        processing_instructions = ProcessingInstructions()
        # us_eastern = pytz.timezone('US/Eastern')
        data = {
            'time': [
                pd.Timestamp(year=1970, month=1, day=1,
                             hour=12, minute=33, second=53, microsecond=1234)
            ],
            # timetz is not well supported in records mover yet.  For
            # instance, specifying how it's turned into a CSV is not
            # currently part of the records spec:
            #
            #   https://github.com/bluelabsio/records-mover/issues/76
            #
            # In addition, Vertica suffers from a driver limitation:
            #
            #   https://github.com/bluelabsio/records-mover/issues/77
            #
            # 'timetz': [
            #     us_eastern.localize(pd.Timestamp(year=1970, month=1, day=1,
            #                                      hour=12, minute=33, second=53,
            #                                      microsecond=1234)),
            # ],
        }
        df = pd.DataFrame(data,
                          index=[pd.Timestamp(year=1970, month=1, day=1)],
                          columns=['time', 'timetz'])

        new_df = prep_df_for_csv_output(df=df,
                                        include_index=True,
                                        records_schema=records_schema,
                                        records_format=records_format,
                                        processing_instructions=processing_instructions)
        # The Timestamp index should be rendered as a date string, and
        # the time column as a time string (fractional seconds dropped).
        self.assertEqual(new_df.index[0], '1970-01-01')
        self.assertEqual(new_df['time'][0], '12:33:53')
        # self.assertEqual(new_df['timetz'][0], '12:33:53-05')
        self.assertIsNotNone(new_df)
 def test_pandas_read_csv_options_bzip(self):
     """The BZIP hint maps to pandas' 'bz2' compression option."""
     records_format = DelimitedRecordsFormat(hints={'compression': 'BZIP'})
     records_schema = RecordsSchema.from_data({'schema': 'bltypes/v1'})
     remaining_hints = set(records_format.hints)
     instructions = ProcessingInstructions()
     options = pandas_read_csv_options(records_format, records_schema,
                                       remaining_hints, instructions)
     self.assertEqual(options.get('compression'), 'bz2')
Ejemplo n.º 10
0
 def test_csv_quote_all(self):
     """Quoting 'all' on the csv variant is rejected for COPY FROM."""
     records_format = DelimitedRecordsFormat(variant='csv',
                                             hints={
                                                 'compression': None,
                                                 'quoting': 'all'
                                             })
     remaining_hints = set(records_format.hints)
     plan = RecordsLoadPlan(ProcessingInstructions(), records_format)
     with self.assertRaises(NotImplementedError):
         postgres_copy_from_options(remaining_hints, plan)
Ejemplo n.º 11
0
    def test_timeonlyformat(self):
        """prep_df_for_csv_output formats time values per timeonlyformat.

        Both a full Timestamp and a plain datetime.time stored in
        'time'-typed fields should be rendered to the same sample
        time string for every format in TIMEONLY_CASES.
        """
        schema_data = {
            'schema': "bltypes/v1",
            'fields': {
                "time_as_timestamp": {
                    "type": "time",
                    "index": 1,
                },
                "time_as_time": {
                    "type": "time",
                    "index": 2,
                },
            }
        }
        records_schema = RecordsSchema.from_data(schema_data)
        processing_instructions = ProcessingInstructions()
        for timeonlyformat in TIMEONLY_CASES:
            records_format = DelimitedRecordsFormat(variant='bluelabs',
                                                    hints={
                                                        'timeonlyformat': timeonlyformat,
                                                    })
            # us_eastern = pytz.timezone('US/Eastern')
            # Same wall-clock time expressed two ways: as a pandas
            # Timestamp and as a plain datetime.time.
            time_as_timestamp = pd.Timestamp(year=SAMPLE_YEAR, month=SAMPLE_MONTH, day=SAMPLE_DAY,
                                             hour=SAMPLE_HOUR, minute=SAMPLE_MINUTE,
                                             second=SAMPLE_SECOND)
            time_as_time = datetime.time(hour=SAMPLE_HOUR,
                                         minute=SAMPLE_MINUTE,
                                         second=SAMPLE_SECOND)
            data = {
                'time_as_timestamp': [
                    time_as_timestamp
                ],
                'time_as_time': [
                    time_as_time
                ],
            }
            df = pd.DataFrame(data, columns=['time_as_timestamp', 'time_as_time'])

            new_df = prep_df_for_csv_output(df=df,
                                            include_index=False,
                                            records_schema=records_schema,
                                            records_format=records_format,
                                            processing_instructions=processing_instructions)
            # Both representations should render to the sample string
            # for this timeonlyformat (hint passed as assert message).
            self.assertEqual(new_df['time_as_timestamp'][0],
                             create_sample(timeonlyformat),
                             timeonlyformat)
            self.assertEqual(new_df['time_as_time'][0],
                             create_sample(timeonlyformat),
                             timeonlyformat)
            # self.assertEqual(new_df['timetz'][0], '12:33:53-05')
            self.assertIsNotNone(new_df)
Ejemplo n.º 12
0
 def test_numeric_schema_fields_created(self) -> None:
     """Export a numeric table to a directory and validate its schema."""
     self.numeric_fixture.bring_up()
     with tempfile.TemporaryDirectory(
             prefix='test_records_numeric_schema') as tempdir:
         target_url = pathlib.Path(tempdir).resolve().as_uri() + '/'
         instructions = ProcessingInstructions()
         source = self.records.sources.table(schema_name=self.schema_name,
                                             table_name=self.table_name,
                                             db_engine=self.engine)
         target = self.records.targets.directory_from_url(
             target_url, records_format=DelimitedRecordsFormat())
         result = self.records.move(source, target, instructions)
         # Some backends report the moved-row count, others report None.
         self.assertIn(result.move_count, [1, None])
         self.validate_records_schema(tempdir)
Ejemplo n.º 13
0
 def test_vertica_import_options_max_failure_rows_specified(self):
     """max_failure_rows should surface as Vertica's rejectmax option."""
     records_format = DelimitedRecordsFormat(variant='vertica')
     remaining_hints = set(records_format.hints)
     instructions = ProcessingInstructions(max_failure_rows=123)
     plan = RecordsLoadPlan(processing_instructions=instructions,
                            records_format=records_format)
     options = vertica_import_options(remaining_hints, plan)
     expected = {
         'trailing_nullcols': True,
         'rejectmax': 123,
         'enforcelength': None,
         'error_tolerance': None,
         'abort_on_error': None,
     }
     # Each expected option must be present with exactly this value.
     for key, value in expected.items():
         self.assertIn(key, options)
         self.assertEqual(options[key], value)
 def test_bluelabs_uncompressed(self):
     """Uncompressed bluelabs variant maps cleanly to COPY options."""
     records_format = DelimitedRecordsFormat(variant='bluelabs',
                                             hints={'compression': None})
     remaining_hints = set(records_format.hints)
     plan = RecordsLoadPlan(ProcessingInstructions(), records_format)
     date_style, copy_options = postgres_copy_from_options(
         remaining_hints, plan)
     # No special date input style is needed for this variant.
     self.assertIsNone(date_style)
     expected_options = {
         'format': 'text',
         'encoding': 'UTF8',
         'header': False,
         'delimiter': ','
     }
     self.assertEqual(copy_options, expected_options)
 def test_load_known_formats(self):
     """Every advertised loadable format must be fully handled."""
     loader = PostgresLoader(url_resolver=Mock(name='url_resolver'),
                             meta=Mock(name='meta'),
                             db=Mock(name='db'))
     for records_format in loader.known_supported_records_formats_for_load():
         remaining_hints = set(records_format.hints)
         plan = RecordsLoadPlan(ProcessingInstructions(),
                                records_format)
         # Must complete without raising for a supported format...
         postgres_copy_from_options(remaining_hints, plan)
         # ...and must account for every hint along the way.
         self.assertFalse(remaining_hints)
Ejemplo n.º 16
0
    def test_dateformat(self):
        """prep_df_for_csv_output renders Timestamp and date columns as text.

        For each dateformat in DATE_CASES, both a pandas Timestamp and
        a plain datetime.date stored in 'date'-typed fields should be
        rendered to the same sample date string.
        """
        schema_data = {
            'schema': "bltypes/v1",
            'fields': {
                "date_as_timestamp": {
                    "type": "date",
                    "index": 1,
                },
                "date_as_date": {
                    "type": "date",
                    # Bug fix: this index previously duplicated the
                    # first field's index of 1.
                    "index": 2,
                },
            }
        }
        records_schema = RecordsSchema.from_data(schema_data)
        processing_instructions = ProcessingInstructions()
        for dateformat in DATE_CASES:
            records_format = DelimitedRecordsFormat(variant='bluelabs',
                                                    hints={
                                                        'dateformat': dateformat
                                                    })
            # us_eastern = pytz.timezone('US/Eastern')
            # Same calendar date expressed two ways: as a pandas
            # Timestamp and as a plain datetime.date.
            data = {
                'date_as_timestamp': [
                    pd.Timestamp(year=SAMPLE_YEAR, month=SAMPLE_MONTH, day=SAMPLE_DAY)
                ],
                'date_as_date': [
                    datetime.date(year=SAMPLE_YEAR, month=SAMPLE_MONTH, day=SAMPLE_DAY)
                ],
            }
            df = pd.DataFrame(data,
                              columns=['date_as_timestamp', 'date_as_date'])

            new_df = prep_df_for_csv_output(df=df,
                                            include_index=False,
                                            records_schema=records_schema,
                                            records_format=records_format,
                                            processing_instructions=processing_instructions)
            # Both representations should render to the sample string.
            self.assertEqual(new_df['date_as_timestamp'][0],
                             create_sample(dateformat))
            self.assertEqual(new_df['date_as_date'][0],
                             create_sample(dateformat))
            self.assertIsNotNone(new_df)
Ejemplo n.º 17
0
    def save_and_verify(self, records_format, processing_instructions=None):
        """Save a sample dataframe via records_format and verify the output.

        Builds a one-row dataframe exercising tricky CSV cases (commas,
        quotes, embedded newlines, dates/times/timestamps), moves it to
        a temporary records directory, and verifies the directory
        contents against the format's hints.

        Returns the move result, or None when pandas is unavailable.
        (Bug fix: the signature previously claimed '-> None' even
        though the method returns the move result.)
        """
        if not self.has_pandas():
            logger.warning("Skipping test as we don't have Pandas to save with.")
            return None

        from pandas import DataFrame

        if processing_instructions is None:
            processing_instructions = ProcessingInstructions()
        us_eastern = pytz.timezone('US/Eastern')
        df = DataFrame.from_dict([{
            'num': 123,
            'numstr': '123',
            'str': 'foo',
            'comma': ',',
            'doublequote': '"',
            'quotecommaquote': '","',
            'newlinestr': ("* SQL unload would generate multiple files (one for each slice/part)\n"
                           "* Filecat would produce a single data file"),
            'date': datetime.date(2000, 1, 1),
            'time': datetime.time(0, 0),
            'timestamp': datetime.datetime(2000, 1, 2, 12, 34, 56, 789012),
            'timestamptz': us_eastern.localize(datetime.datetime(2000, 1, 2, 12, 34, 56, 789012))
        }])

        records_schema = RecordsSchema.from_dataframe(df,
                                                      processing_instructions,
                                                      include_index=False)
        # Refine the inferred field types against the actual data.
        records_schema = records_schema.refine_from_dataframe(df, processing_instructions)

        with tempfile.TemporaryDirectory(prefix='test_records_save_df') as tempdir:
            output_url = pathlib.Path(tempdir).resolve().as_uri() + '/'
            source = self.records.sources.dataframe(df=df,
                                                    records_schema=records_schema,
                                                    processing_instructions=processing_instructions)
            target = self.records.targets.directory_from_url(output_url,
                                                             records_format=records_format)
            out = self.records.move(source, target, processing_instructions)
            self.verify_records_directory(records_format.format_type,
                                          records_format.variant,
                                          tempdir,
                                          records_format.hints)
            return out
Ejemplo n.º 18
0
    def test_datetimeformat(self):
        """Datetime columns pass through prep_df_for_csv_output unchanged.

        Pandas' CSV writer formats datetimes itself, so no
        pre-conversion is expected for datetime-typed values.
        """
        # NOTE(review): the schema field is named 'datetimez' while the
        # dataframe column is 'datetime' — verify the mismatch doesn't
        # simply bypass schema-driven handling.
        schema_data = {
            'schema': "bltypes/v1",
            'fields': {
                "datetimez": {
                    "type": "datetime",
                    "index": 1,
                },
            }
        }
        records_schema = RecordsSchema.from_data(schema_data)
        processing_instructions = ProcessingInstructions()
        for datetimeformat in DATETIME_CASES:
            records_format = DelimitedRecordsFormat(variant='bluelabs',
                                                    hints={
                                                        'datetimeformat': datetimeformat
                                                    })
            # us_eastern = pytz.timezone('US/Eastern')
            timestamp = pd.Timestamp(year=SAMPLE_YEAR, month=SAMPLE_MONTH, day=SAMPLE_DAY,
                                     hour=SAMPLE_HOUR, minute=SAMPLE_MINUTE,
                                     second=SAMPLE_SECOND)

            data = {
                'datetime': [
                    timestamp
                ],
            }
            df = pd.DataFrame(data, columns=['datetime'])

            new_df = prep_df_for_csv_output(df=df,
                                            include_index=False,
                                            records_schema=records_schema,
                                            records_format=records_format,
                                            processing_instructions=processing_instructions)
            # No conversion is done of datetime as pandas' CSV
            # outputter handles it properly, so we should expect the
            # original again
            self.assertEqual(new_df['datetime'][0],
                             timestamp,
                             create_sample(datetimeformat))
            # self.assertEqual(new_df['timetz'][0], '12:33:53-05')
            self.assertIsNotNone(new_df)
Ejemplo n.º 19
0
    def can_load_this_format(self,
                             source_records_format: BaseRecordsFormat) -> bool:
        """Report whether data in source_records_format can be loaded.

        Builds a trial load plan and checks that every delimited hint
        can be handled; any NotImplementedError along the way means
        the format is unsupported.
        """
        try:
            processing_instructions = ProcessingInstructions()
            load_plan = RecordsLoadPlan(
                records_format=source_records_format,
                processing_instructions=processing_instructions)
            # Only delimited formats are loadable here.
            if not isinstance(load_plan.records_format,
                              DelimitedRecordsFormat):
                return False

            unhandled_hints = set(load_plan.records_format.hints.keys())
            # (Removed a dead reassignment of processing_instructions
            # from the plan; the value was never used afterwards.)
            mysql_load_options(unhandled_hints,
                               load_plan.records_format,
                               fail_if_cant_handle_hint=True)
            complain_on_unhandled_hints(fail_if_dont_understand=True,
                                        unhandled_hints=unhandled_hints,
                                        hints=load_plan.records_format.hints)
            return True
        except NotImplementedError:
            return False
Ejemplo n.º 20
0
 def test_timeonlyformat(self) -> None:
     """Check pandas_read_csv_options against every timeonlyformat.

     For each format in TIMEONLY_CASES, request read_csv options,
     parse a sample time-only value, and confirm pandas produces a
     Timestamp with the expected hour/minute/second.
     """
     for timeonlyformat in TIMEONLY_CASES:
         records_format = DelimitedRecordsFormat(hints={
             'timeonlyformat': timeonlyformat,
             'compression': None,
         })
         records_schema = RecordsSchema.from_data({
             'schema': 'bltypes/v1',
             'fields': {
                 'first': {
                     'type': 'time'
                 }
             },
         })
         unhandled_hints = set(records_format.hints)
         processing_instructions = ProcessingInstructions()
         try:
             options = pandas_read_csv_options(records_format,
                                               records_schema,
                                               unhandled_hints,
                                               processing_instructions)
         except NotImplementedError:
             self.fail(f'Could not handle combination for {timeonlyformat}')
         # The lone time column (index 0) must be marked for parsing.
         self.assertEqual(options['parse_dates'], [0])
         timeonly = create_sample(timeonlyformat)
         fileobj = io.StringIO(timeonly)
         df = pandas.read_csv(filepath_or_buffer=fileobj,
                              **options)
         timestamp = df['untitled_0'][0]
         self.assertIsInstance(timestamp, pandas.Timestamp,
                               f"Pandas did not parse {timeonly} as a timestamp object")
         self.assertEqual(timestamp.hour, SAMPLE_HOUR)
         self.assertEqual(timestamp.minute, SAMPLE_MINUTE)
         # Formats without a seconds component should parse second == 0.
         if 'SS' in timeonlyformat:
             self.assertEqual(timestamp.second, SAMPLE_SECOND)
         else:
             self.assertEqual(timestamp.second, 0,
                              timeonly)
Ejemplo n.º 21
0
 def test_numeric_database_columns_created(self):
     """Load an empty file with a numeric schema and validate the table."""
     records_schema = RecordsSchema.from_data(
         example_numeric_records_schema)
     instructions = ProcessingInstructions()
     # Pick the delimited variant each database loads most naturally.
     variant_by_db = {
         'redshift': 'bluelabs',
         'bigquery': 'bigquery',
         'vertica': 'vertica',
         'postgresql': 'bluelabs',
         'mysql': 'bluelabs',
     }
     records_format = DelimitedRecordsFormat(
         variant=variant_by_db[self.engine.name])
     source = self.records.sources.local_file(
         '/dev/null',
         records_format=records_format,
         records_schema=records_schema)
     target = self.records.targets.table(schema_name=self.schema_name,
                                         table_name=self.table_name,
                                         db_engine=self.engine)
     result = self.records.move(source, target, instructions)
     # An empty input moves zero rows; some backends report None.
     self.assertIn(result.move_count, [0, None])
     self.validate_table()
Ejemplo n.º 22
0
    def test_datetimeformattz(self) -> None:
        """Check pandas_to_csv_options' date_format for each datetime case.

        Maps each format in DATETIME_CASES to the strftime string
        expected in the generated to_csv options, then round-trips a
        sample Timestamp through DataFrame.to_csv to confirm the
        rendered output.
        """
        known_failures: Set[str] = set()
        # Expected strftime 'date_format' option per datetimeformat hint.
        expectations = {
            'YYYY-MM-DD HH24:MI:SS': '%Y-%m-%d %H:%M:%S.%f',
            'YYYY-MM-DD HH:MI:SS': '%Y-%m-%d %H:%M:%S.%f',
            'MM/DD/YY HH24:MI': '%m/%d/%y %H:%M',
            'YYYY-MM-DD HH12:MI AM': '%Y-%m-%d %I:%M %p',
        }
        # Date-only layout matching each datetimeformat's date portion.
        compatible_dateformat = {
            'YYYY-MM-DD HH24:MI:SS': 'YYYY-MM-DD',
            'YYYY-MM-DD HH:MI:SS': 'YYYY-MM-DD',
            'YYYY-MM-DD HH12:MI AM': 'YYYY-MM-DD',
            'MM/DD/YY HH24:MI': 'MM/DD/YY',
        }
        for datetimeformat in DATETIME_CASES:
            records_format = DelimitedRecordsFormat(
                hints={
                    # Pandas doesn't consider dateformats to be separate
                    # from datetime/datetimetz formats, so they need to be
                    # consistent
                    'dateformat': compatible_dateformat[datetimeformat],
                    'datetimeformat': datetimeformat,
                    'datetimeformattz': datetimeformat,
                    'compression': None,
                })
            unhandled_hints = set(records_format.hints)
            processing_instructions = ProcessingInstructions()
            try:
                options = pandas_to_csv_options(records_format,
                                                unhandled_hints,
                                                processing_instructions)
            except NotImplementedError:
                if datetimeformat in known_failures:
                    continue
                else:
                    raise
            self.assertEqual(options['date_format'],
                             expectations[datetimeformat], datetimeformat)
            self.assertNotIn(datetimeformat, known_failures)

            # Reuse the buffer as the to_csv output target; the write
            # starts at position 0.
            fileobj = io.StringIO(create_sample(datetimeformat))
            df = pandas.DataFrame(data={
                'datetime': [
                    pandas.Timestamp(day=SAMPLE_DAY,
                                     month=SAMPLE_MONTH,
                                     year=SAMPLE_YEAR,
                                     hour=SAMPLE_HOUR,
                                     minute=SAMPLE_MINUTE,
                                     second=SAMPLE_SECOND)
                ]
            },
                                  columns=['datetime'])
            df.to_csv(path_or_buf=fileobj, index=False, **options)
            output = fileobj.getvalue()
            # In reality this isn't used raw, as Pandas doesn't really
            # try to handle lone dates or times.  Instead, we use
            # prep_for_csv() to preconvert these Serieses into strings.
            sample = create_sample(datetimeformat)
            if 'SS' in datetimeformat:
                # Pandas doesn't truncate fractional seconds in the
                # same way other tools do.
                self.assertEqual(output, f"{sample}.000000\n")
            else:
                self.assertEqual(output, f"{sample}\n", datetimeformat)
Ejemplo n.º 23
0
    def test_datetimeformattz(self) -> None:
        """Check pandas_read_csv_options against every datetimeformattz.

        For each format in DATETIMETZ_CASES, request read_csv options,
        parse a sample value, and confirm pandas produces a Timestamp
        with the expected date and time components.
        """
        class DateTimeFormatTzExpectations(TypedDict):
            # Use the datetimeformat/datetimeformattz which is
            # compatible, as pandas doesn't let you configure those
            # separately
            dayfirst: bool

        # Expected read_csv options (currently just dayfirst) keyed by
        # the datetimeformattz hint under test.
        testcases: Dict[HintDateTimeFormatTz, DateTimeFormatTzExpectations] = {
            'YYYY-MM-DD HH:MI:SSOF': {
                'dayfirst': False,
            },
            'YYYY-MM-DD HH:MI:SS': {
                'dayfirst': False,
            },
            'YYYY-MM-DD HH24:MI:SSOF': {
                'dayfirst': False,
            },
            'MM/DD/YY HH24:MI': {
                'dayfirst': False,
            },
        }
        for datetimeformattz in DATETIMETZ_CASES:
            records_format = DelimitedRecordsFormat(hints={
                'datetimeformattz': datetimeformattz,
                'compression': None,
            })
            records_schema = RecordsSchema.from_data({
                'schema': 'bltypes/v1',
                'fields': {
                    'first': {
                        'type': 'datetimetz'
                    }
                },
            })
            unhandled_hints = set(records_format.hints)
            processing_instructions = ProcessingInstructions()
            expectations = testcases[datetimeformattz]
            try:
                options = pandas_read_csv_options(records_format,
                                                  records_schema,
                                                  unhandled_hints,
                                                  processing_instructions)
            except NotImplementedError:
                self.fail(f'Could not handle combination for {datetimeformattz}')
            # The lone datetimetz column (index 0) must be parsed, and
            # every expected option must appear in the generated options.
            self.assertEqual(options['parse_dates'], [0])
            self.assertTrue(all(item in options.items() for item in expectations.items()))
            datetimetz = create_sample(datetimeformattz)
            fileobj = io.StringIO(datetimetz)
            df = pandas.read_csv(filepath_or_buffer=fileobj,
                                 **options)
            timestamp = df['untitled_0'][0]
            self.assertIsInstance(timestamp, pandas.Timestamp,
                                  f"Pandas did not parse {datetimetz} as a timestamp object")
            self.assertEqual(timestamp.year, SAMPLE_YEAR)
            self.assertEqual(timestamp.month, SAMPLE_MONTH)
            self.assertEqual(timestamp.day, SAMPLE_DAY)
            self.assertEqual(timestamp.hour, SAMPLE_HOUR)
            self.assertEqual(timestamp.minute, SAMPLE_MINUTE)
            # Formats without a seconds component should parse second == 0.
            if 'SS' in datetimeformattz:
                self.assertEqual(timestamp.second, SAMPLE_SECOND)
            else:
                self.assertEqual(timestamp.second, 0)