def test_vertica_export_options_datetimeformattz(self):
    """Check which datetimeformattz hints Vertica export rejects.

    Vertica doesn't currently allow any configurability on
    output datetimeformattz.  Check again before adding any test
    cases here!
    """
    should_raise = {
        'YYYY-MM-DD HH:MI:SS': True,
        'YYYY-MM-DD HH24:MI:SSOF': False,
        'MM/DD/YY HH24:MI': True,
    }
    for datetimeformattz in DATETIMETZ_CASES:
        records_format = DelimitedRecordsFormat(
            variant='vertica',
            hints={'datetimeformattz': datetimeformattz})
        unhandled_hints = set(records_format.hints)
        processing_instructions = ProcessingInstructions(
            max_failure_rows=123)
        load_plan = RecordsLoadPlan(
            processing_instructions=processing_instructions,
            records_format=records_format)
        # Records Mover passes no particular option for dateformat on
        # export in Vertica; it always uses YYYY-MM-DD as a result.
        try:
            vertica_export_options(unhandled_hints, load_plan)
        except NotImplementedError:
            if not should_raise[datetimeformattz]:
                self.fail(
                    f'Unexpected NotImplementedError for {datetimeformattz}')
        else:
            # Bug fix: previously a format marked should_raise=True
            # that did NOT raise would silently pass.
            if should_raise[datetimeformattz]:
                self.fail(
                    f'Expected NotImplementedError for {datetimeformattz}')
def test_vertica_import_options_datetimeformat(self):
    """Check which datetimeformat hints Vertica import rejects.

    Vertica doesn't currently allow any configurability on
    input datetimeformat.  Check again before adding any test cases
    here!
    """
    should_raise = {
        'YYYY-MM-DD HH:MI:SS': True,
        'YYYY-MM-DD HH24:MI:SS': False,
        'MM/DD/YY HH24:MI': True,
        'YYYY-MM-DD HH12:MI AM': True,
    }
    for datetimeformat in DATETIME_CASES:
        records_format = DelimitedRecordsFormat(
            variant='vertica',
            hints={'datetimeformat': datetimeformat})
        unhandled_hints = set(records_format.hints)
        processing_instructions = ProcessingInstructions(
            max_failure_rows=123)
        load_plan = RecordsLoadPlan(
            processing_instructions=processing_instructions,
            records_format=records_format)
        try:
            vertica_import_options(unhandled_hints, load_plan)
        except NotImplementedError:
            if not should_raise[datetimeformat]:
                self.fail(
                    f'Unexpected NotImplementedError for {datetimeformat}')
        else:
            # Bug fix: previously a format marked should_raise=True
            # that did NOT raise would silently pass.
            if should_raise[datetimeformat]:
                self.fail(
                    f'Expected NotImplementedError for {datetimeformat}')
def test_dateformat(self) -> None:
    """Each supported dateformat hint should parse a sample date."""
    class DateFormatExpectations(TypedDict):
        # Use the datetimeformat/datetimeformattz which is
        # compatible, as pandas doesn't let you configure those
        # separately
        dayfirst: bool

    day_first_formats = ('DD-MM-YYYY', 'DD/MM/YY', 'DD-MM-YY')
    testcases: Dict[HintDateFormat, DateFormatExpectations] = {
        fmt: {'dayfirst': fmt in day_first_formats}
        for fmt in ('YYYY-MM-DD', 'MM-DD-YYYY', 'DD-MM-YYYY',
                    'MM/DD/YY', 'DD/MM/YY', 'DD-MM-YY')
    }
    for dateformat in DATE_CASES:
        records_format = DelimitedRecordsFormat(hints={
            'dateformat': dateformat,
            'datetimeformat': f"{dateformat} HH:MI:SS",
            'datetimeformattz': f"{dateformat} HH:MI:SSOF",
            'compression': None,
        })
        records_schema = RecordsSchema.from_data({
            'schema': 'bltypes/v1',
            'fields': {
                'first': {
                    'type': 'date'
                }
            },
        })
        unhandled_hints = set(records_format.hints)
        processing_instructions = ProcessingInstructions()
        # KeyError here means a DATE_CASES entry lacks expectations.
        expectations = testcases[dateformat]
        try:
            options = pandas_read_csv_options(records_format,
                                              records_schema,
                                              unhandled_hints,
                                              processing_instructions)
        except NotImplementedError:
            self.fail(f'Could not handle combination for {dateformat}')
        self.assertTrue(all(item in options.items()
                            for item in expectations.items()))
        df = pandas.read_csv(
            filepath_or_buffer=io.StringIO(create_sample(dateformat)),
            **options)
        timestamp = df['untitled_0'][0]
        self.assertEqual(timestamp.year, SAMPLE_YEAR)
        self.assertEqual(timestamp.month, SAMPLE_MONTH)
        self.assertEqual(timestamp.day, SAMPLE_DAY)
def test_vertica_export_options_timeonlyformat(self):
    """Check which timeonlyformat hints Vertica export rejects.

    Vertica doesn't currently allow any configurability on
    output timeonlyformat.  Check again before adding any test
    cases here!
    """
    should_raise = {
        'HH:MI:SS': False,
        'HH24:MI:SS': False,
        'HH24:MI': True,
        'HH12:MI AM': True,
    }
    for timeonlyformat in TIMEONLY_CASES:
        records_format = DelimitedRecordsFormat(
            variant='vertica',
            hints={'timeonlyformat': timeonlyformat})
        unhandled_hints = set(records_format.hints)
        processing_instructions = ProcessingInstructions(
            max_failure_rows=123)
        load_plan = RecordsLoadPlan(
            processing_instructions=processing_instructions,
            records_format=records_format)
        # Records Mover passes no particular option for dateformat on
        # export in Vertica; it always uses YYYY-MM-DD as a result.
        try:
            vertica_export_options(unhandled_hints, load_plan)
        except NotImplementedError:
            if not should_raise[timeonlyformat]:
                # Unexpected: surface the original exception.
                raise
        else:
            # Bug fix: previously a format marked should_raise=True
            # that did NOT raise would silently pass.
            if should_raise[timeonlyformat]:
                self.fail(
                    f'Expected NotImplementedError for {timeonlyformat}')
def test_bluelabs_with_compression(self):
    # GZIP-compressed input can't be expressed as Postgres COPY FROM
    # options, so option generation must refuse it.
    records_format = DelimitedRecordsFormat(variant='bluelabs',
                                            hints={'compression': 'GZIP'})
    hints_left = set(records_format.hints)
    plan = RecordsLoadPlan(ProcessingInstructions(), records_format)
    with self.assertRaises(NotImplementedError):
        postgres_copy_from_options(hints_left, plan)
def test_vertica(self):
    # The vertica variant with a backslash escape hint has no
    # Postgres COPY FROM equivalent, so this must raise.
    records_format = DelimitedRecordsFormat(variant='vertica',
                                            hints={'compression': None})
    records_format.hints['escape'] = '\\'
    hints_left = set(records_format.hints)
    plan = RecordsLoadPlan(ProcessingInstructions(), records_format)
    with self.assertRaises(NotImplementedError):
        postgres_copy_from_options(hints_left, plan)
def test_new_compression_hint(self):
    # An unknown encoding hint value must be rejected rather than
    # silently ignored.
    records_format = DelimitedRecordsFormat(variant='bluelabs',
                                            hints={'compression': None})
    records_format.hints['encoding'] = 'NEWNEWENCODING'
    hints_left = set(records_format.hints)
    plan = RecordsLoadPlan(ProcessingInstructions(), records_format)
    with self.assertRaises(NotImplementedError):
        postgres_copy_from_options(hints_left, plan)
def test_prep_df_for_csv_output_include_index(self):
    """Index and time columns should be rendered as strings in the output."""
    records_schema = RecordsSchema.from_data({
        'schema': "bltypes/v1",
        'fields': {
            "date": {
                "type": "date",
                "index": 1,
            },
            "time": {
                "type": "time",
                "index": 2,
            },
            "timetz": {
                "type": "timetz",
                "index": 3,
            },
        }
    })
    records_format = DelimitedRecordsFormat(variant='bluelabs')
    processing_instructions = ProcessingInstructions()
    # us_eastern = pytz.timezone('US/Eastern')
    sample_time = pd.Timestamp(year=1970, month=1, day=1,
                               hour=12, minute=33, second=53,
                               microsecond=1234)
    # timetz is not well supported in records mover yet. For
    # instance, specifying how it's turned into a CSV is not
    # currently part of the records spec:
    #
    # https://github.com/bluelabsio/records-mover/issues/76
    #
    # In addition, Vertica suffers from a driver limitation:
    #
    # https://github.com/bluelabsio/records-mover/issues/77
    #
    # 'timetz': [
    #    us_eastern.localize(pd.Timestamp(year=1970, month=1, day=1,
    #                                     hour=12, minute=33, second=53,
    #                                     microsecond=1234)),
    # ],
    df = pd.DataFrame({'time': [sample_time]},
                      index=[pd.Timestamp(year=1970, month=1, day=1)],
                      columns=['time', 'timetz'])
    new_df = prep_df_for_csv_output(
        df=df,
        include_index=True,
        records_schema=records_schema,
        records_format=records_format,
        processing_instructions=processing_instructions)
    self.assertEqual(new_df.index[0], '1970-01-01')
    self.assertEqual(new_df['time'][0], '12:33:53')
    # self.assertEqual(new_df['timetz'][0], '12:33:53-05')
    self.assertIsNotNone(new_df)
def test_pandas_read_csv_options_bzip(self):
    # A BZIP compression hint should translate to pandas' 'bz2' codec.
    records_format = DelimitedRecordsFormat(hints={'compression': 'BZIP'})
    records_schema = RecordsSchema.from_data({'schema': 'bltypes/v1'})
    hints_left = set(records_format.hints)
    instructions = ProcessingInstructions()
    actual = pandas_read_csv_options(records_format,
                                     records_schema,
                                     hints_left,
                                     instructions)
    self.assertEqual(actual['compression'], 'bz2')
def test_csv_quote_all(self):
    # The csv variant's 'quoting: all' has no Postgres COPY FROM
    # equivalent, so this must raise.
    records_format = DelimitedRecordsFormat(variant='csv',
                                            hints={
                                                'compression': None,
                                                'quoting': 'all',
                                            })
    hints_left = set(records_format.hints)
    plan = RecordsLoadPlan(ProcessingInstructions(), records_format)
    with self.assertRaises(NotImplementedError):
        postgres_copy_from_options(hints_left, plan)
def test_timeonlyformat(self):
    """Each timeonlyformat hint should format Timestamp and time alike."""
    records_schema = RecordsSchema.from_data({
        'schema': "bltypes/v1",
        'fields': {
            "time_as_timestamp": {
                "type": "time",
                "index": 1,
            },
            "time_as_time": {
                "type": "time",
                "index": 2,
            },
        }
    })
    processing_instructions = ProcessingInstructions()
    for timeonlyformat in TIMEONLY_CASES:
        records_format = DelimitedRecordsFormat(
            variant='bluelabs',
            hints={'timeonlyformat': timeonlyformat})
        # us_eastern = pytz.timezone('US/Eastern')
        as_timestamp = pd.Timestamp(year=SAMPLE_YEAR,
                                    month=SAMPLE_MONTH,
                                    day=SAMPLE_DAY,
                                    hour=SAMPLE_HOUR,
                                    minute=SAMPLE_MINUTE,
                                    second=SAMPLE_SECOND)
        as_time = datetime.time(hour=SAMPLE_HOUR,
                                minute=SAMPLE_MINUTE,
                                second=SAMPLE_SECOND)
        df = pd.DataFrame({'time_as_timestamp': [as_timestamp],
                           'time_as_time': [as_time]},
                          columns=['time_as_timestamp', 'time_as_time'])
        new_df = prep_df_for_csv_output(
            df=df,
            include_index=False,
            records_schema=records_schema,
            records_format=records_format,
            processing_instructions=processing_instructions)
        self.assertEqual(new_df['time_as_timestamp'][0],
                         create_sample(timeonlyformat),
                         timeonlyformat)
        self.assertEqual(new_df['time_as_time'][0],
                         create_sample(timeonlyformat),
                         timeonlyformat)
        # self.assertEqual(new_df['timetz'][0], '12:33:53-05')
        self.assertIsNotNone(new_df)
def test_numeric_schema_fields_created(self) -> None:
    """Move the numeric fixture table out to a directory and validate
    the records schema written alongside the data."""
    self.numeric_fixture.bring_up()
    with tempfile.TemporaryDirectory(
            prefix='test_records_numeric_schema') as tempdir:
        target_url = pathlib.Path(tempdir).resolve().as_uri() + '/'
        source = self.records.sources.table(schema_name=self.schema_name,
                                            table_name=self.table_name,
                                            db_engine=self.engine)
        target = self.records.targets.directory_from_url(
            target_url, records_format=DelimitedRecordsFormat())
        result = self.records.move(source, target, ProcessingInstructions())
        # Some movers can't report a row count; None is acceptable.
        self.assertIn(result.move_count, [1, None])
        self.validate_records_schema(tempdir)
def test_vertica_import_options_max_failure_rows_specified(self):
    # max_failure_rows should flow through to Vertica's rejectmax and
    # relax the strict error-handling options.
    records_format = DelimitedRecordsFormat(variant='vertica')
    hints_left = set(records_format.hints)
    instructions = ProcessingInstructions(max_failure_rows=123)
    plan = RecordsLoadPlan(processing_instructions=instructions,
                           records_format=records_format)
    options = vertica_import_options(hints_left, plan)
    expected_subset = {
        'trailing_nullcols': True,
        'rejectmax': 123,
        'enforcelength': None,
        'error_tolerance': None,
        'abort_on_error': None,
    }
    self.assertTrue(set(expected_subset.items()).issubset(options.items()))
def test_bluelabs_uncompressed(self):
    # The uncompressed bluelabs variant maps cleanly onto Postgres
    # COPY FROM text-format options.
    records_format = DelimitedRecordsFormat(variant='bluelabs',
                                            hints={'compression': None})
    hints_left = set(records_format.hints)
    plan = RecordsLoadPlan(ProcessingInstructions(), records_format)
    date_input_style, copy_options = postgres_copy_from_options(
        hints_left, plan)
    self.assertIsNone(date_input_style)
    expected_options = {
        'format': 'text',
        'encoding': 'UTF8',
        'header': False,
        'delimiter': ',',
    }
    self.assertEqual(copy_options, expected_options)
def test_load_known_formats(self):
    """Every format the loader advertises must translate without error
    and consume all of its hints."""
    loader = PostgresLoader(url_resolver=Mock(name='url_resolver'),
                            meta=Mock(name='meta'),
                            db=Mock(name='db'))
    for records_format in loader.known_supported_records_formats_for_load():
        hints_left = set(records_format.hints)
        plan = RecordsLoadPlan(ProcessingInstructions(), records_format)
        # ensure no exception thrown
        postgres_copy_from_options(hints_left, plan)
        self.assertFalse(hints_left)
def test_dateformat(self):
    """prep_df_for_csv_output should render both Timestamp and date
    columns per the dateformat hint."""
    schema_data = {
        'schema': "bltypes/v1",
        'fields': {
            "date_as_timestamp": {
                "type": "date",
                "index": 1,
            },
            "date_as_date": {
                "type": "date",
                # Fix: this was a copy-paste duplicate of index 1;
                # sibling tests number their fields 1, 2, ... distinctly.
                "index": 2,
            },
        }
    }
    records_schema = RecordsSchema.from_data(schema_data)
    processing_instructions = ProcessingInstructions()
    for dateformat in DATE_CASES:
        records_format = DelimitedRecordsFormat(
            variant='bluelabs',
            hints={'dateformat': dateformat})
        # us_eastern = pytz.timezone('US/Eastern')
        data = {
            'date_as_timestamp': [
                pd.Timestamp(year=SAMPLE_YEAR,
                             month=SAMPLE_MONTH,
                             day=SAMPLE_DAY)
            ],
            'date_as_date': [
                datetime.date(year=SAMPLE_YEAR,
                              month=SAMPLE_MONTH,
                              day=SAMPLE_DAY)
            ],
        }
        df = pd.DataFrame(data,
                          columns=['date_as_timestamp', 'date_as_date'])
        new_df = prep_df_for_csv_output(
            df=df,
            include_index=False,
            records_schema=records_schema,
            records_format=records_format,
            processing_instructions=processing_instructions)
        self.assertEqual(new_df['date_as_timestamp'][0],
                         create_sample(dateformat))
        self.assertEqual(new_df['date_as_date'][0],
                         create_sample(dateformat))
        self.assertIsNotNone(new_df)
def save_and_verify(self, records_format, processing_instructions=None):
    """Save a one-row sample dataframe to a records directory and verify
    the resulting files against the given format.

    Returns the result of records.move(), or None when pandas is not
    installed (the test is skipped in that case).

    Fix: the signature previously declared ``-> None`` while the method
    actually returned the move result; the annotation was removed to
    match the real contract.
    """
    if not self.has_pandas():
        logger.warning("Skipping test as we don't have Pandas to save with.")
        return None
    from pandas import DataFrame

    if processing_instructions is None:
        processing_instructions = ProcessingInstructions()
    us_eastern = pytz.timezone('US/Eastern')
    # One row exercising tricky delimiter/quoting cases plus each
    # date/time type.
    df = DataFrame.from_dict([{
        'num': 123,
        'numstr': '123',
        'str': 'foo',
        'comma': ',',
        'doublequote': '"',
        'quotecommaquote': '","',
        'newlinestr': ("* SQL unload would generate multiple files (one for each slice/part)\n"
                       "* Filecat would produce a single data file"),
        'date': datetime.date(2000, 1, 1),
        'time': datetime.time(0, 0),
        'timestamp': datetime.datetime(2000, 1, 2, 12, 34, 56, 789012),
        'timestamptz': us_eastern.localize(datetime.datetime(2000, 1, 2, 12, 34, 56, 789012))
    }])
    records_schema = RecordsSchema.from_dataframe(df,
                                                  processing_instructions,
                                                  include_index=False)
    # Refine types (e.g. string widths) from the actual data.
    records_schema = records_schema.refine_from_dataframe(
        df, processing_instructions)
    with tempfile.TemporaryDirectory(prefix='test_records_save_df') as tempdir:
        output_url = pathlib.Path(tempdir).resolve().as_uri() + '/'
        source = self.records.sources.dataframe(
            df=df,
            records_schema=records_schema,
            processing_instructions=processing_instructions)
        target = self.records.targets.directory_from_url(
            output_url, records_format=records_format)
        out = self.records.move(source, target, processing_instructions)
        self.verify_records_directory(records_format.format_type,
                                      records_format.variant,
                                      tempdir,
                                      records_format.hints)
        return out
def test_datetimeformat(self):
    """Datetime columns should pass through prep_df_for_csv_output
    unchanged for every datetimeformat hint."""
    records_schema = RecordsSchema.from_data({
        'schema': "bltypes/v1",
        'fields': {
            "datetimez": {
                "type": "datetime",
                "index": 1,
            },
        }
    })
    processing_instructions = ProcessingInstructions()
    for datetimeformat in DATETIME_CASES:
        records_format = DelimitedRecordsFormat(
            variant='bluelabs',
            hints={'datetimeformat': datetimeformat})
        # us_eastern = pytz.timezone('US/Eastern')
        sample_timestamp = pd.Timestamp(year=SAMPLE_YEAR,
                                        month=SAMPLE_MONTH,
                                        day=SAMPLE_DAY,
                                        hour=SAMPLE_HOUR,
                                        minute=SAMPLE_MINUTE,
                                        second=SAMPLE_SECOND)
        df = pd.DataFrame({'datetime': [sample_timestamp]},
                          columns=['datetime'])
        new_df = prep_df_for_csv_output(
            df=df,
            include_index=False,
            records_schema=records_schema,
            records_format=records_format,
            processing_instructions=processing_instructions)
        # No conversion is done of datetime as pandas' CSV
        # outputter handles it properly, so we should expect the
        # original again
        self.assertEqual(new_df['datetime'][0],
                         sample_timestamp,
                         create_sample(datetimeformat))
        # self.assertEqual(new_df['timetz'][0], '12:33:53-05')
        self.assertIsNotNone(new_df)
def can_load_this_format(self, source_records_format: BaseRecordsFormat) -> bool:
    """Return True if this loader can load the given records format.

    Builds a load plan and attempts to translate every hint; any
    NotImplementedError means the format is unsupported.
    """
    try:
        plan = RecordsLoadPlan(
            records_format=source_records_format,
            processing_instructions=ProcessingInstructions())
        records_format = plan.records_format
        if not isinstance(records_format, DelimitedRecordsFormat):
            return False
        unhandled_hints = set(records_format.hints.keys())
        mysql_load_options(unhandled_hints,
                           records_format,
                           fail_if_cant_handle_hint=True)
        complain_on_unhandled_hints(fail_if_dont_understand=True,
                                    unhandled_hints=unhandled_hints,
                                    hints=records_format.hints)
    except NotImplementedError:
        return False
    return True
def test_timeonlyformat(self) -> None:
    """pandas_read_csv_options should parse each supported time-only
    format into a Timestamp with the expected components."""
    for timeonlyformat in TIMEONLY_CASES:
        records_format = DelimitedRecordsFormat(hints={
            'timeonlyformat': timeonlyformat,
            'compression': None,
        })
        records_schema = RecordsSchema.from_data({
            'schema': 'bltypes/v1',
            'fields': {
                'first': {
                    'type': 'time'
                }
            },
        })
        unhandled_hints = set(records_format.hints)
        processing_instructions = ProcessingInstructions()
        try:
            options = pandas_read_csv_options(records_format,
                                              records_schema,
                                              unhandled_hints,
                                              processing_instructions)
        except NotImplementedError:
            self.fail(f'Could not handle combination for {timeonlyformat}')
        self.assertEqual(options['parse_dates'], [0])
        timeonly = create_sample(timeonlyformat)
        df = pandas.read_csv(filepath_or_buffer=io.StringIO(timeonly),
                             **options)
        timestamp = df['untitled_0'][0]
        self.assertIsInstance(
            timestamp, pandas.Timestamp,
            f"Pandas did not parse {timeonly} as a timestamp object")
        self.assertEqual(timestamp.hour, SAMPLE_HOUR)
        self.assertEqual(timestamp.minute, SAMPLE_MINUTE)
        if 'SS' in timeonlyformat:
            self.assertEqual(timestamp.second, SAMPLE_SECOND)
        else:
            # Formats without a seconds component parse with second == 0.
            self.assertEqual(timestamp.second, 0, timeonly)
def test_numeric_database_columns_created(self):
    """Load an empty file with the numeric schema and validate the
    resulting table's column types."""
    records_schema = RecordsSchema.from_data(example_numeric_records_schema)
    preferred_records_format = {
        'redshift': 'bluelabs',
        'bigquery': 'bigquery',
        'vertica': 'vertica',
        'postgresql': 'bluelabs',
        'mysql': 'bluelabs',
    }
    variant = preferred_records_format[self.engine.name]
    records_format = DelimitedRecordsFormat(variant=variant)
    source = self.records.sources.local_file(
        '/dev/null',
        records_format=records_format,
        records_schema=records_schema)
    target = self.records.targets.table(schema_name=self.schema_name,
                                        table_name=self.table_name,
                                        db_engine=self.engine)
    out = self.records.move(source, target, ProcessingInstructions())
    # /dev/null carries no rows; some movers report None instead of 0.
    self.assertIn(out.move_count, [0, None])
    self.validate_table()
def test_datetimeformattz(self) -> None:
    """Verify pandas_to_csv_options maps each datetime hint to the
    expected strftime date_format, and that pandas actually emits the
    expected CSV text for a sample timestamp."""
    # Hints expected to raise NotImplementedError; currently none.
    known_failures: Set[str] = set()
    # Expected pandas `date_format` strftime string per records hint.
    expectations = {
        'YYYY-MM-DD HH24:MI:SS': '%Y-%m-%d %H:%M:%S.%f',
        'YYYY-MM-DD HH:MI:SS': '%Y-%m-%d %H:%M:%S.%f',
        'MM/DD/YY HH24:MI': '%m/%d/%y %H:%M',
        'YYYY-MM-DD HH12:MI AM': '%Y-%m-%d %I:%M %p',
    }
    compatible_dateformat = {
        'YYYY-MM-DD HH24:MI:SS': 'YYYY-MM-DD',
        'YYYY-MM-DD HH:MI:SS': 'YYYY-MM-DD',
        'YYYY-MM-DD HH12:MI AM': 'YYYY-MM-DD',
        'MM/DD/YY HH24:MI': 'MM/DD/YY',
    }
    for datetimeformat in DATETIME_CASES:
        records_format = DelimitedRecordsFormat(
            hints={
                # Pandas doesn't consider dateformats to be separate
                # from datetime/datetimetz formats, so they need to be
                # consistent
                'dateformat': compatible_dateformat[datetimeformat],
                'datetimeformat': datetimeformat,
                'datetimeformattz': datetimeformat,
                'compression': None,
            })
        unhandled_hints = set(records_format.hints)
        processing_instructions = ProcessingInstructions()
        try:
            options = pandas_to_csv_options(records_format,
                                            unhandled_hints,
                                            processing_instructions)
        except NotImplementedError:
            if datetimeformat in known_failures:
                continue
            else:
                raise
        self.assertEqual(options['date_format'],
                         expectations[datetimeformat],
                         datetimeformat)
        self.assertNotIn(datetimeformat, known_failures)
        # NOTE(review): StringIO is seeded with the sample text and then
        # overwritten from position 0 by to_csv below; an empty StringIO()
        # looks intended — confirm before relying on this.
        fileobj = io.StringIO(create_sample(datetimeformat))
        df = pandas.DataFrame(data={
            'datetime': [
                pandas.Timestamp(day=SAMPLE_DAY,
                                 month=SAMPLE_MONTH,
                                 year=SAMPLE_YEAR,
                                 hour=SAMPLE_HOUR,
                                 minute=SAMPLE_MINUTE,
                                 second=SAMPLE_SECOND)
            ]
        }, columns=['datetime'])
        df.to_csv(path_or_buf=fileobj,
                  index=False,
                  **options)
        output = fileobj.getvalue()
        # In reality this isn't used raw, as Pandas doesn't really
        # try to handle lone dates or times. Instead, we use
        # prep_for_csv() to preconvert these Serieses into strings.
        sample = create_sample(datetimeformat)
        if 'SS' in datetimeformat:
            # Pandas doesn't truncate fractional seconds in the
            # same way other tools do.
            self.assertEqual(output, f"{sample}.000000\n")
        else:
            self.assertEqual(output, f"{sample}\n", datetimeformat)
def test_datetimeformattz(self) -> None:
    """pandas_read_csv_options should configure parsing for every
    supported datetimeformattz hint so that a sample value parses
    into a Timestamp with the expected components."""
    class DateTimeFormatTzExpectations(TypedDict):
        # Use the datetimeformat/datetimeformattz which is
        # compatible, as pandas doesn't let you configure those
        # separately
        dayfirst: bool

    testcases: Dict[HintDateTimeFormatTz, DateTimeFormatTzExpectations] = {
        'YYYY-MM-DD HH:MI:SSOF': {
            'dayfirst': False,
        },
        'YYYY-MM-DD HH:MI:SS': {
            'dayfirst': False,
        },
        'YYYY-MM-DD HH24:MI:SSOF': {
            'dayfirst': False,
        },
        'MM/DD/YY HH24:MI': {
            'dayfirst': False,
        },
    }
    for datetimeformattz in DATETIMETZ_CASES:
        records_format = DelimitedRecordsFormat(hints={
            'datetimeformattz': datetimeformattz,
            'compression': None,
        })
        records_schema = RecordsSchema.from_data({
            'schema': 'bltypes/v1',
            'fields': {
                'first': {
                    'type': 'datetimetz'
                }
            },
        })
        unhandled_hints = set(records_format.hints)
        processing_instructions = ProcessingInstructions()
        # KeyError here means a new DATETIMETZ_CASES entry has no
        # expectations above -- an intentional guard.
        expectations = testcases[datetimeformattz]
        try:
            options = pandas_read_csv_options(records_format,
                                              records_schema,
                                              unhandled_hints,
                                              processing_instructions)
        except NotImplementedError:
            self.fail(f'Could not handle combination for {datetimeformattz}')
        # Column 0 should be parsed as a date...
        self.assertEqual(options['parse_dates'], [0])
        # ...with the expected keyword options (e.g. dayfirst) included.
        self.assertTrue(all(item in options.items()
                            for item in expectations.items()))
        datetimetz = create_sample(datetimeformattz)
        fileobj = io.StringIO(datetimetz)
        df = pandas.read_csv(filepath_or_buffer=fileobj,
                             **options)
        timestamp = df['untitled_0'][0]
        self.assertIsInstance(timestamp, pandas.Timestamp,
                              f"Pandas did not parse {datetimetz} as a timestamp object")
        self.assertEqual(timestamp.year, SAMPLE_YEAR)
        self.assertEqual(timestamp.month, SAMPLE_MONTH)
        self.assertEqual(timestamp.day, SAMPLE_DAY)
        self.assertEqual(timestamp.hour, SAMPLE_HOUR)
        self.assertEqual(timestamp.minute, SAMPLE_MINUTE)
        # Formats without a seconds component parse with second == 0.
        if 'SS' in datetimeformattz:
            self.assertEqual(timestamp.second, SAMPLE_SECOND)
        else:
            self.assertEqual(timestamp.second, 0)