Example #1
 def load(self, hints, fail_if):
     processing_instructions = ProcessingInstructions()
     processing_instructions.fail_if_cant_handle_hint = fail_if
     processing_instructions.fail_if_dont_understand = fail_if
     processing_instructions.fail_if_row_invalid = fail_if
     self.mock_records_load_plan.records_format = DelimitedRecordsFormat(
         hints=hints)
     self.mock_records_load_plan.processing_instructions = processing_instructions
     return self.redshift_db_driver.loader().\
         load(schema='myschema',
              table='mytable',
              load_plan=self.mock_records_load_plan,
              directory=self.mock_directory)
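
The helper above fans its single fail_if argument out to all three strictness switches on ProcessingInstructions. Both spellings seen across these examples are equivalent; a minimal sketch:

 # Attribute style, as in Example #1 above:
 strict = ProcessingInstructions()
 strict.fail_if_cant_handle_hint = True
 strict.fail_if_dont_understand = True
 strict.fail_if_row_invalid = True

 # Constructor-keyword style, as in the examples that follow:
 permissive = ProcessingInstructions(fail_if_cant_handle_hint=False,
                                     fail_if_dont_understand=False,
                                     fail_if_row_invalid=False)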
Example #2
 def test_vertica_format_permissive(self):
     vertica_format = DelimitedRecordsFormat(variant='vertica')
     processing_instructions = ProcessingInstructions(fail_if_row_invalid=False)
     load_plan = RecordsLoadPlan(processing_instructions=processing_instructions,
                                 records_format=vertica_format)
     unhandled_hints = set(load_plan.records_format.hints.keys())
     options = vertica_import_options(unhandled_hints, load_plan)
     expected_options = {
         'abort_on_error': False,
         'delimiter': '\x01',
         'enclosed_by': None,
         'enforcelength': False,
         'error_tolerance': True,
         'escape_as': None,
         'gzip': False,
         'load_method': 'AUTO',
         'no_commit': False,
         'null_as': None,
         'record_terminator': '\x02',
         'rejectmax': None,
         'skip': 0,
         'trailing_nullcols': True,
     }
     self.assertDictEqual(options, expected_options)
     self.assertEqual(unhandled_hints, set())
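
The final assertion documents the contract these option builders follow: they receive the set of all hint names and discard each hint they manage to translate, so an empty set afterwards proves nothing was silently dropped. A hypothetical sketch of that contract (not the library's actual implementation):

 def build_options(hints: dict, unhandled_hints: set) -> dict:
     """Translate records-format hints, consuming each one we handle."""
     options = {}
     if 'field-delimiter' in hints:
         options['delimiter'] = hints['field-delimiter']
         unhandled_hints.discard('field-delimiter')  # mark as handled
     return options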
Example #3
 def test_datetimeformat(self):
     # Double check this before adding anything else in here to see
     # if it has changed, but YYYY-MM-DD HH:MI:SS and YYYY-MM-DD
     # HH24:MI:SS are the only formats accepted by BigQuery as of
     # this writing
     should_raise = {
         'YYYY-MM-DD HH12:MI AM': True,
         'MM/DD/YY HH24:MI': True,
     }
     processing_instructions = ProcessingInstructions(
         fail_if_dont_understand=True,
         fail_if_cant_handle_hint=True,
         fail_if_row_invalid=True)
     for datetimeformat in DATETIME_CASES:
         records_format =\
             DelimitedRecordsFormat(variant='bigquery',
                                    hints={
                                        'datetimeformat': datetimeformat
                                    })
         load_plan = RecordsLoadPlan(
             processing_instructions=processing_instructions,
             records_format=records_format)
         unhandled_hints = set(records_format.hints.keys())
         try:
             load_job_config(unhandled_hints, load_plan)
         except NotImplementedError:
             if should_raise[datetimeformat]:
                 pass
             else:
                 raise
Example #4
    def test_dataframe_to_int64_and_back_to_object_produces_int_columns(self) -> None:
        # This reproduces a situation found when a user worked around
        # a separate historical Records Mover limitation by doing an
        # unusual cast on their dataframe...and then hit a separate
        # limitation:
        #
        # https://github.com/bluelabsio/records-mover/pull/103
        data = {'Population': [11190846, 1303171035, 207847528]}
        df = DataFrame(data, columns=['Population'])

        df['Population'] = df['Population'].astype("Int64")
        df['Population'] = df['Population'].astype("object")

        source = DataframesRecordsSource(dfs=[df])
        processing_instructions = ProcessingInstructions()
        schema = source.initial_records_schema(processing_instructions)
        dialect = RedshiftDialect()
        mock_engine = Mock(name='engine')
        mock_engine.dialect = dialect
        driver = RedshiftDBDriver(db=mock_engine)
        schema_sql = schema.to_schema_sql(driver=driver,
                                          schema_name='my_schema_name',
                                          table_name='my_table_name')
        expected_schema_sql = """
CREATE TABLE my_schema_name.my_table_name (
\t"Population" INTEGER
)

"""
        self.assertEqual(schema_sql, expected_schema_sql)
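
The cast chain in this test relies on plain pandas behavior: going through the nullable Int64 dtype and back to object leaves integer values (not floats) inside an object-dtype column, which the schema inference must still map to an INTEGER column. A quick standalone check:

 import pandas as pd

 s = pd.Series([11190846, 1303171035, 207847528])
 s = s.astype("Int64").astype("object")
 print(s.dtype)  # object -- but the values are still integers, not floats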
Example #5
 def test_christmas_tree_format_1_permissive(self):
     vertica_format = DelimitedRecordsFormat(variant='dumb', hints=christmas_tree_format_1_hints)
     processing_instructions = ProcessingInstructions(fail_if_cant_handle_hint=False)
     load_plan = RecordsLoadPlan(processing_instructions=processing_instructions,
                                 records_format=vertica_format)
     unhandled_hints = set(load_plan.records_format.hints.keys())
     with patch.object(driver_logger, 'warning') as mock_warning:
         options = vertica_import_options(unhandled_hints, load_plan)
     expected_options = {
         'abort_on_error': True,
         'delimiter': '\x01',
         'enforcelength': True,
         'error_tolerance': False,
         'escape_as': '\\',
         'load_method': 'AUTO',
         'no_commit': False,
         'null_as': None,
         'record_terminator': '\x02',
         'rejectmax': 1,
         'skip': 1,
         'trailing_nullcols': False,
     }
     self.assertDictEqual(options, expected_options)
     self.assertListEqual(mock_warning.mock_calls,
                          [call("Ignoring hint compression = 'LZO'"),
                           call("Ignoring hint quoting = 'nonnumeric'")])
     self.assertEqual(unhandled_hints, set())
Example #6
 def test_pandas_read_csv_options_vertica(self):
     self.maxDiff = None
     expected = {
         'dayfirst': False,
         'compression': None,
         'delimiter': '\x01',
         'doublequote': False,
         'engine': 'c',
         'error_bad_lines': True,
         'header': None,
         'lineterminator': '\x02',
         'prefix': 'untitled_',
         'quotechar': '"',
         'quoting': 3,
         'warn_bad_lines': True,
         'parse_dates': [0, 1, 2, 3],
     }
     processing_instructions = ProcessingInstructions()
     records_format = DelimitedRecordsFormat(hints=vertica_format_hints)
     unhandled_hints = set(records_format.hints)
     actual = pandas_read_csv_options(records_format,
                                      self.records_schema,
                                      unhandled_hints,
                                      processing_instructions)
     self.assertEqual(expected, actual)
     self.assertFalse(unhandled_hints)
Example #7
 def test_pandas_read_csv_options_bluelabs(self):
     expected = {
         'dayfirst': False,
         'compression': 'gzip',
         'delimiter': ',',
         'doublequote': False,
         'encoding': 'UTF8',
         'engine': 'python',
         'error_bad_lines': True,
         'escapechar': '\\',
         'header': None,
         'prefix': 'untitled_',
         'quotechar': '"',
         'quoting': 3,
         'warn_bad_lines': True,
         'parse_dates': [0, 1, 2, 3],
     }
     processing_instructions = ProcessingInstructions()
     records_format = DelimitedRecordsFormat(hints=bluelabs_format_hints)
     unhandled_hints = set(records_format.hints)
     actual = pandas_read_csv_options(records_format,
                                      self.records_schema,
                                      unhandled_hints,
                                      processing_instructions)
     self.assertEqual(expected, actual)
     self.assertFalse(unhandled_hints)
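
Both read-side tests compute keyword arguments destined for pandas.read_csv. A sketch of the consuming side, assuming a pre-2.0 pandas (error_bad_lines, warn_bad_lines, and prefix were removed in pandas 2.0) and a placeholder file path:

 import pandas as pd

 options = pandas_read_csv_options(records_format,
                                   records_schema,
                                   unhandled_hints,
                                   processing_instructions)
 # compression and delimiter handling come entirely from the options dict
 df = pd.read_csv('data.csv.gz', **options)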
Example #8
 def test_timeonlyformat(self):
     # Double check this before adding anything else in here to see
     # if it has changed, but HH:MI:SS is the only format accepted
     # by BigQuery as of this writing
     should_raise = {
         'HH:MI:SS': False,
         'HH24:MI:SS': False,
         'HH12:MI AM': True,
     }
     processing_instructions = ProcessingInstructions(
         fail_if_dont_understand=True,
         fail_if_cant_handle_hint=True,
         fail_if_row_invalid=True)
     for timeonlyformat in TIMEONLY_CASES:
         records_format =\
             DelimitedRecordsFormat(variant='bigquery',
                                    hints={
                                        'timeonlyformat': timeonlyformat,
                                    })
         load_plan = RecordsLoadPlan(
             processing_instructions=processing_instructions,
             records_format=records_format)
         unhandled_hints = set(records_format.hints.keys())
         try:
             load_job_config(unhandled_hints, load_plan)
         except NotImplementedError:
             if should_raise[timeonlyformat]:
                 pass
             else:
                 raise
Example #9
 def test_load_job_config_permissive(self):
     records_format = DelimitedRecordsFormat(variant='bigquery')
     processing_instructions = ProcessingInstructions(
         fail_if_dont_understand=True,
         fail_if_cant_handle_hint=True,
         fail_if_row_invalid=False)
     load_plan = RecordsLoadPlan(
         processing_instructions=processing_instructions,
         records_format=records_format)
     unhandled_hints = set(records_format.hints.keys())
     out = load_job_config(unhandled_hints, load_plan)
     expectations = {
         'allowJaggedRows': True,
         'allowQuotedNewlines': True,
         'autodetect': False,
         'createDisposition': 'CREATE_NEVER',
         'destinationTableProperties': {},
         'encoding': 'UTF-8',
         'fieldDelimiter': ',',
         'ignoreUnknownValues': False,
         'maxBadRecords': 999999,
         'quote': '"',
         'schemaUpdateOptions': None,
         'skipLeadingRows': '1',
         'sourceFormat': 'CSV',
         'writeDisposition': 'WRITE_APPEND'
     }
     self.assertEqual(out.to_api_repr()['load'], expectations)
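
load_job_config returns a google.cloud.bigquery LoadJobConfig (hence to_api_repr()). A sketch of how such a config would be consumed, where the client, GCS URI, and table ID are assumptions rather than part of the test:

 from google.cloud import bigquery

 client = bigquery.Client()
 job_config = load_job_config(unhandled_hints, load_plan)
 load_job = client.load_table_from_uri('gs://mybucket/data/*.csv',
                                       'myproject.mydataset.mytable',
                                       job_config=job_config)
 load_job.result()  # block until the load finishes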
Example #10
 def test_pandas_to_csv_options_christmas_tree_format_3(self):
     expected = {
         'compression': 'bz2',
         'date_format': '%d-%m-%Y %H:%M:%S.%f%z',
         'doublequote': True,
         'encoding': 'UTF8',
         'escapechar': '\\',
         'header': False,
         'line_terminator': '\x02',
         'quotechar': '"',
         'quoting': 0,
         'sep': '\x01',
     }
     processing_instructions =\
         ProcessingInstructions(fail_if_cant_handle_hint=False)
     records_format = DelimitedRecordsFormat(
         hints=christmas_tree_format_3_hints)
     unhandled_hints = set(records_format.hints)
     with patch.object(driver_logger, 'warning') as mock_warning:
         actual = pandas_to_csv_options(records_format, unhandled_hints,
                                        processing_instructions)
         self.assertEqual(expected, actual)
         self.assertListEqual(mock_warning.mock_calls, [
             call("Ignoring hint quoting = "
                  "'some_future_option_not_supported_now'"),
             call("Ignoring hint escape = '@'"),
             call("Ignoring hint datetimeformattz = 'HH:MI:SSOF YYYY-MM-DD'"
                  ),
             call("Ignoring hint datetimeformattz = "
                  "'YYYY-MM-DD HH24:MI:SSOF'"),
             call("Ignoring hint datetimeformat = 'YYYY-MM-DD HH24:MI:SS'")
         ])
         self.assertFalse(unhandled_hints)
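
pandas_to_csv_options is the write-side mirror: its result feeds DataFrame.to_csv directly. A sketch of the consuming side (the dataframe and output path are assumptions; note that line_terminator is the pre-pandas-2.0 spelling of today's lineterminator keyword):

 options = pandas_to_csv_options(records_format, unhandled_hints,
                                 processing_instructions)
 # compression='bz2' in the options governs the output encoding,
 # regardless of the file suffix
 df.to_csv('out.csv.bz2', **options)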
Example #11
    def setUp(self):
        self.mock_db_engine = MagicMock()
        self.mock_db_engine.dialect = create_autospec(VerticaDialect)
        self.mock_db_engine.dialect.preparer.return_value.quote = fake_quote
        self.mock_db_engine.engine = self.mock_db_engine
        self.mock_s3_temp_base_loc = MagicMock(name='s3_temp_base_loc')
        self.mock_url_resolver = Mock(name='url_resolver')
        self.mock_directory_url = self.mock_url_resolver.directory_url
        self.mock_s3_temp_base_loc.url = 's3://fakebucket/fakedir/fakesubdir/'
        with patch('records_mover.db.vertica.vertica_db_driver.VerticaLoader') \
                as mock_VerticaLoader:
            self.vertica_db_driver = VerticaDBDriver(
                db=self.mock_db_engine,
                s3_temp_base_loc=self.mock_s3_temp_base_loc,
                url_resolver=self.mock_url_resolver)
            self.mock_VerticaLoader = mock_VerticaLoader
            self.mock_vertica_loader = mock_VerticaLoader.return_value

        mock_records_unload_plan = create_autospec(RecordsUnloadPlan)
        mock_records_unload_plan.records_format = create_autospec(
            DelimitedRecordsFormat)
        mock_records_unload_plan.records_format.format_type = 'delimited'
        mock_records_unload_plan.records_format.variant = None
        mock_records_unload_plan.processing_instructions = ProcessingInstructions()
        self.mock_records_unload_plan = mock_records_unload_plan

        mock_records_load_plan = Mock()
        mock_records_load_plan.processing_instructions = ProcessingInstructions()
        self.mock_records_load_plan = mock_records_load_plan

        mock_directory = Mock()

        mock_directory.loc.url = 's3://mybucket/myparent/mychild/'
        mock_directory.loc.aws_creds.return_value = Mock(name='aws creds')
        mock_directory.loc.aws_creds.return_value.access_key = 'fake_aws_id'
        mock_directory.loc.aws_creds.return_value.secret_key = 'fake_aws_secret'
        mock_directory.loc.aws_creds.return_value.token = None

        self.mock_directory = mock_directory
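
One line in this setUp deserves a note: self.mock_db_engine.engine = self.mock_db_engine mirrors SQLAlchemy, where both Engine and Connection expose an .engine attribute and an Engine's .engine is itself. A quick check against a real engine:

 from sqlalchemy import create_engine

 engine = create_engine('sqlite://')  # in-memory SQLite, no server needed
 assert engine.engine is engine       # .engine on an Engine returns itself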
Example #12
 def test_weird_timeonlyformat(self):
     vertica_format = DelimitedRecordsFormat(variant='dumb', hints={
         'timeonlyformat': 'something else'
     })
     processing_instructions = ProcessingInstructions()
     load_plan = RecordsLoadPlan(processing_instructions=processing_instructions,
                                 records_format=vertica_format)
     unhandled_hints = set(load_plan.records_format.hints.keys())
     with self.assertRaisesRegex(NotImplementedError,
                                 "Implement hint timeonlyformat='something else' or try again "
                                 "with fail_if_cant_handle_hint=False"):
         vertica_import_options(unhandled_hints, load_plan)
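
The error message names its own escape hatch: rebuilding the plan with fail_if_cant_handle_hint=False downgrades the unknown hint to a logged warning, as Example #5 demonstrates. A minimal sketch reusing the objects above:

 permissive = ProcessingInstructions(fail_if_cant_handle_hint=False)
 load_plan = RecordsLoadPlan(processing_instructions=permissive,
                             records_format=vertica_format)
 # Logs "Ignoring hint timeonlyformat = ..." instead of raising
 options = vertica_import_options(set(load_plan.records_format.hints.keys()),
                                  load_plan)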
Example #13
 def test_load_job_config_vertica(self):
     records_format = DelimitedRecordsFormat(variant='vertica')
     processing_instructions = ProcessingInstructions(fail_if_dont_understand=True,
                                                      fail_if_cant_handle_hint=True,
                                                      fail_if_row_invalid=True)
     load_plan = RecordsLoadPlan(processing_instructions=processing_instructions,
                                 records_format=records_format)
     unhandled_hints = set(records_format.hints.keys())
     with self.assertRaisesRegex(NotImplementedError,
                                 r"Implement hint record-terminator='\\x02' "
                                 "or try again with fail_if_cant_handle_hint=False"):
         load_job_config(unhandled_hints, load_plan)
Example #14
 def test_quote_all_with_doublequote(self):
     vertica_format = DelimitedRecordsFormat(variant='csv', hints={
         'quoting': 'all'
     })
     processing_instructions = ProcessingInstructions()
     load_plan = RecordsLoadPlan(processing_instructions=processing_instructions,
                                 records_format=vertica_format)
     unhandled_hints = set(load_plan.records_format.hints.keys())
     with self.assertRaisesRegex(NotImplementedError,
                                 r"Implement hint doublequote=True or try again with "
                                 "fail_if_cant_handle_hint=False"):
         vertica_import_options(unhandled_hints, load_plan)
Example #15
 def test_load_job_config_unknown_quoting(self):
     records_format = DelimitedRecordsFormat(variant='bigquery',
                                             hints={'quoting': 'blah'})
     processing_instructions = ProcessingInstructions(
         fail_if_dont_understand=True,
         fail_if_cant_handle_hint=True,
         fail_if_row_invalid=True)
     load_plan = RecordsLoadPlan(
         processing_instructions=processing_instructions,
         records_format=records_format)
     unhandled_hints = set(records_format.hints.keys())
     with self.assertRaises(NotImplementedError):
         load_job_config(unhandled_hints, load_plan)
Example #16
 def test_load_job_config_no_bzip_support(self):
     records_format = DelimitedRecordsFormat(variant='bigquery',
                                             hints={'compression': 'BZIP'})
     processing_instructions = ProcessingInstructions(
         fail_if_dont_understand=True,
         fail_if_cant_handle_hint=True,
         fail_if_row_invalid=True)
     load_plan = RecordsLoadPlan(
         processing_instructions=processing_instructions,
         records_format=records_format)
     unhandled_hints = set(records_format.hints.keys())
     with self.assertRaisesRegex(
             NotImplementedError, r"Implement hint compression='BZIP' "
             "or try again with fail_if_cant_handle_hint=False"):
         load_job_config(unhandled_hints, load_plan)
Example #17
 def test_pandas_read_csv_options_inconsistent_date_format(self):
     processing_instructions = ProcessingInstructions()
     hints = bluelabs_format_hints.copy()
     hints.update({
         'dateformat': 'DD-MM-YYYY',
         'datetimeformattz': 'MM-DD-YYYY HH24:MIOF',
         'datetimeformat': 'DD-MM-YYYY HH24:MI',
     })
     records_format = DelimitedRecordsFormat(hints=hints)
     unhandled_hints = set(records_format.hints)
     with self.assertRaises(NotImplementedError):
         pandas_read_csv_options(records_format,
                                 self.records_schema,
                                 unhandled_hints,
                                 processing_instructions)
Example #18
 def test_load_job_config_unsupported_datetimeformattz(self):
     records_format = DelimitedRecordsFormat(
         variant='bigquery',
         hints={'datetimeformattz': 'MM/DD/YY HH:MI:SSOF'})
     processing_instructions = ProcessingInstructions(
         fail_if_dont_understand=True,
         fail_if_cant_handle_hint=True,
         fail_if_row_invalid=True)
     load_plan = RecordsLoadPlan(
         processing_instructions=processing_instructions,
         records_format=records_format)
     unhandled_hints = set(records_format.hints.keys())
     with self.assertRaisesRegex(
             NotImplementedError,
             r"Implement hint datetimeformattz='MM/DD/YY HH:MI:SSOF' "
             "or try again with fail_if_cant_handle_hint=False"):
         load_job_config(unhandled_hints, load_plan)
Example #19
 def test_quote_all_without_doublequote(self):
     vertica_format = DelimitedRecordsFormat(variant='csv', hints={
         'quoting': 'all',
         'doublequote': False,
         # Vertica doesn't support exporting CSV variant style dates by
         # default, so let's pick some it can for purposes of this
         # test:
         'dateformat': 'YYYY-MM-DD',
         'datetimeformat': 'YYYY-MM-DD HH:MI:SS',
         'datetimeformattz': 'YYYY-MM-DD HH:MI:SSOF',
     })
     processing_instructions = ProcessingInstructions()
     load_plan = RecordsLoadPlan(processing_instructions=processing_instructions,
                                 records_format=vertica_format)
     unhandled_hints = set(load_plan.records_format.hints.keys())
     out = vertica_import_options(unhandled_hints, load_plan)
     self.assertEqual(out['enclosed_by'], '"')
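
As the in-test comment says, the CSV variant's default date hints are overridden because Vertica's bulk import cannot be told to parse them; under the default (strict) ProcessingInstructions they would trigger the same NotImplementedError seen in the earlier examples. A sketch of the failing configuration (the call is kept commented, since the exact failing hint and message text are assumptions):

 csv_default = DelimitedRecordsFormat(variant='csv', hints={'quoting': 'all',
                                                            'doublequote': False})
 plan = RecordsLoadPlan(processing_instructions=ProcessingInstructions(),
                        records_format=csv_default)
 # vertica_import_options(set(plan.records_format.hints.keys()), plan)
 # ^ expected to raise NotImplementedError over the variant's date hints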
Example #20
 def test_pandas_to_csv_options_vertica(self):
     expected = {
         'date_format': '%Y-%m-%d %H:%M:%S.%f%z',
         'doublequote': False,
         'encoding': 'UTF8',
         'header': False,
         'line_terminator': '\x02',
         'quotechar': '"',
         'quoting': 3,
         'sep': '\x01',
     }
     processing_instructions = ProcessingInstructions()
     records_format = DelimitedRecordsFormat(hints=vertica_format_hints)
     unhandled_hints = set(records_format.hints)
     actual = pandas_to_csv_options(records_format, unhandled_hints,
                                    processing_instructions)
     self.assertEqual(expected, actual)
     self.assertFalse(unhandled_hints)
Example #21
 def test_pandas_to_csv_options_csv(self):
     expected = {
         'compression': 'gzip',
         'date_format': '%m/%d/%y %H:%M',
         'doublequote': True,
         'encoding': 'UTF8',
         'header': True,
         'line_terminator': '\n',
         'quotechar': '"',
         'quoting': 0,
         'sep': ','
     }
     processing_instructions =\
         ProcessingInstructions(fail_if_cant_handle_hint=True)
     records_format = DelimitedRecordsFormat(hints=csv_format_hints)
     unhandled_hints = set(records_format.hints)
     actual = pandas_to_csv_options(records_format, unhandled_hints,
                                    processing_instructions)
     self.assertEqual(expected, actual)
     self.assertFalse(unhandled_hints)
Example #22
 def test_load_job_config_parquet(self):
     records_format = ParquetRecordsFormat()
     processing_instructions = ProcessingInstructions(
         fail_if_dont_understand=True,
         fail_if_cant_handle_hint=True,
         fail_if_row_invalid=True)
     load_plan = RecordsLoadPlan(
         processing_instructions=processing_instructions,
         records_format=records_format)
     unhandled_hints = set()
     out = load_job_config(unhandled_hints, load_plan)
     expectations = {
         'allowJaggedRows': False,
         'autodetect': False,
         'createDisposition': 'CREATE_NEVER',
         'destinationTableProperties': {},
         'ignoreUnknownValues': True,
         'maxBadRecords': 0,
         'schemaUpdateOptions': None,
         'sourceFormat': 'PARQUET',
         'writeDisposition': 'WRITE_APPEND'
     }
     self.assertEqual(expectations, out.to_api_repr()['load'])
Example #23
 def test_pandas_to_csv_options_christmas_tree_format_1(self):
     expected = {
         'date_format': '%Y-%m-%d %H:%M:%S.%f%z',
         'doublequote': False,
         'encoding': 'UTF8',
         'escapechar': '\\',
         'header': True,
         'line_terminator': '\x02',
         'quotechar': '"',
         'quoting': 2,
         'sep': '\x01'
     }
     processing_instructions =\
         ProcessingInstructions(fail_if_cant_handle_hint=False)
     records_format = DelimitedRecordsFormat(
         hints=christmas_tree_format_1_hints)
     unhandled_hints = set(records_format.hints)
     with patch.object(driver_logger, 'warning') as mock_warning:
         actual = pandas_to_csv_options(records_format, unhandled_hints,
                                        processing_instructions)
         self.assertEqual(expected, actual)
         self.assertListEqual(mock_warning.mock_calls,
                              [call("Ignoring hint compression = 'LZO'")])
         self.assertFalse(unhandled_hints)
Example #24
 def test_pandas_numeric_types_and_constraints(self):
     self.maxDiff = None
     # https://docs.scipy.org/doc/numpy/reference/arrays.scalars.html
     # https://stackoverflow.com/a/53828986/9795956
     dtypes = np.dtype([
         ('int8', np.int8),
         ('int16', np.int16),
         ('int32', np.int32),
         ('int64', np.int64),
         ('ubyte', np.ubyte),
         ('uint8', np.uint8),
         ('uint16', np.uint16),
         ('uint32', np.uint32),
         ('uint64', np.uint64),
         ('float16', np.float16),
         ('float32', np.float32),
         ('float64', np.float64),
         # ('float96', np.float96),  # not supported by numpy on macOS on amd64, apparently
         ('float128', np.float128),
     ])
     data = np.empty(0, dtype=dtypes)
     df = pd.DataFrame(data)
     processing_instructions = ProcessingInstructions()
     schema = RecordsSchema.from_dataframe(df,
                                           processing_instructions,
                                           include_index=False)
     data = schema.to_data()
     fields = data['fields']
     fields_and_constraints = {
         field_name: fields[field_name]['constraints']
         for field_name in fields
     }
     expected_fields = {
         'int8': {
             'required': False,
             'unique': False,
             'min': '-128',
             'max': '127'
         },
         'float128': {
             'fp_significand_bits': 64,
             'fp_total_bits': 80,
             'required': False,
             'unique': False
         },
         'float16': {
             'fp_significand_bits': 11,
             'fp_total_bits': 16,
             'required': False,
             'unique': False
         },
         'float32': {
             'fp_significand_bits': 23,
             'fp_total_bits': 32,
             'required': False,
             'unique': False
         },
         'float64': {
             'fp_significand_bits': 53,
             'fp_total_bits': 64,
             'required': False,
             'unique': False
         },
         'int16': {
             'max': '32767',
             'min': '-32768',
             'required': False,
             'unique': False
         },
         'int32': {
             'max': '2147483647',
             'min': '-2147483648',
             'required': False,
             'unique': False
         },
         'int64': {
             'max': '9223372036854775807',
             'min': '-9223372036854775808',
             'required': False,
             'unique': False
         },
         'ubyte': {
             'max': '255',
             'min': '0',
             'required': False,
             'unique': False
         },
         'uint16': {
             'max': '65535',
             'min': '0',
             'required': False,
             'unique': False
         },
         'uint32': {
             'max': '4294967295',
             'min': '0',
             'required': False,
             'unique': False
         },
         'uint64': {
             'max': '18446744073709551615',
             'min': '0',
             'required': False,
             'unique': False
         },
         'uint8': {
             'max': '255',
             'min': '0',
             'required': False,
             'unique': False
         }
     }
     self.assertEqual(fields_and_constraints, expected_fields)
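
The integer min/max constraints and the float significand sizes in expected_fields come straight from numpy's own type metadata; a quick way to cross-check a few of them:

 import numpy as np

 print(np.iinfo(np.int16).min, np.iinfo(np.int16).max)  # -32768 32767
 print(np.iinfo(np.uint64).max)                         # 18446744073709551615
 # np.finfo exposes the explicit mantissa size that the
 # fp_significand_bits values are derived from:
 print(np.finfo(np.float64).nmant)                      # 52 (53-bit significand)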