def schema_for_column(c):
    '''Build the Schema describing a single column.

    Only ``c.data_type`` is read (case-insensitively).

    Returns a Schema with inclusion 'available' and a nullable JSON type for
    the recognised data types, or a Schema with inclusion 'unsupported' and
    a description naming the data type for anything else.
    '''
    sql_type = c.data_type.lower()

    # Resolve the JSON type (and optional format) first; bail out early
    # for anything that is not recognised.
    if sql_type == 'boolean':
        json_type, json_format = 'boolean', None
    elif sql_type in ('number', 'real', 'float', 'fixed'):
        json_type, json_format = 'number', None
    elif sql_type == 'timestamp_ntz':
        json_type, json_format = 'string', 'date-time'
    elif sql_type in ('text', 'variant', 'array'):
        json_type, json_format = 'string', None
    else:
        return Schema(None,
                      inclusion='unsupported',
                      description='Unsupported column type {}'.format(sql_type))

    schema = Schema(inclusion='available')
    schema.type = ['null', json_type]
    if json_format is not None:
        schema.format = json_format
    return schema
class TestSchema(unittest.TestCase):
    """Round-trip tests between Schema objects and their plain-dict form.

    Each fixture exists twice: once as a raw dict and once as the equivalent
    Schema object. The tests check ``to_dict``, ``from_dict`` and that
    ``repr`` produces an expression that evaluates back to an equal object.
    """

    # Raw data structures for several schema types
    string_dict = {'type': 'string', 'maxLength': 32}
    integer_dict = {'type': 'integer', 'maximum': 1000000}
    array_dict = {'type': 'array', 'items': integer_dict}
    object_dict = {
        'type': 'object',
        'properties': {
            'a_string': string_dict,
            'an_array': array_dict
        }
    }

    # Schema object forms of the same schemas as above
    string_obj = Schema(type='string', maxLength=32)
    integer_obj = Schema(type='integer', maximum=1000000)
    array_obj = Schema(type='array', items=integer_obj)
    object_obj = Schema(type='object', properties={
        'a_string': string_obj,
        'an_array': array_obj
    })

    # NOTE: assertEqual is used throughout; the assertEquals alias was
    # deprecated and no longer exists in Python 3.12+.

    def test_string_to_dict(self):
        self.assertEqual(self.string_dict, self.string_obj.to_dict())

    def test_integer_to_dict(self):
        self.assertEqual(self.integer_dict, self.integer_obj.to_dict())

    def test_array_to_dict(self):
        self.assertEqual(self.array_dict, self.array_obj.to_dict())

    def test_object_to_dict(self):
        self.assertEqual(self.object_dict, self.object_obj.to_dict())

    def test_string_from_dict(self):
        self.assertEqual(self.string_obj, Schema.from_dict(self.string_dict))

    def test_integer_from_dict(self):
        self.assertEqual(self.integer_obj, Schema.from_dict(self.integer_dict))

    def test_array_from_dict(self):
        self.assertEqual(self.array_obj, Schema.from_dict(self.array_dict))

    def test_object_from_dict(self):
        self.assertEqual(self.object_obj, Schema.from_dict(self.object_dict))

    def test_repr_atomic(self):
        # eval() only ever sees the repr of a fixture defined above,
        # never external input.
        self.assertEqual(self.string_obj, eval(repr(self.string_obj)))

    def test_repr_recursive(self):
        # Same as above, but for a Schema that nests other Schemas.
        self.assertEqual(self.object_obj, eval(repr(self.object_obj)))
def schema_for_column(c, pks_for_table):
    """Map a column description to its Schema.

    Args:
        c: Column-like object. Reads ``data_type``, ``column_name`` and
            ``numeric_scale``; string columns also read ``character_used``
            and ``char_length``.
        pks_for_table: Passed through unchanged to ``nullable_column``
            together with the column name and the JSON type.

    Returns:
        A populated Schema for recognised data types, or ``Schema(None)``
        when the column has no data type or the type is not handled here.
    """
    # Bail out with Schema(None) before calling lower() on a missing datatype
    if c.data_type is None:
        LOGGER.info('Skipping column %s since it had no datatype', c.column_name)
        return Schema(None)

    sql_type = c.data_type.lower()
    schema = Schema()

    # Scale of None indicates default of 6 digits
    scale = c.numeric_scale

    if sql_type == 'number':
        if scale is not None and scale <= 0:
            # No fractional digits: expose as an integer.
            schema.type = nullable_column(c.column_name, 'integer', pks_for_table)
            return schema

        # NB: Due to scale and precision variations in Oracle version, and
        # among numeric types, we're using a custom `singer.decimal` string
        # formatter for this, with no opinion on scale/precision.
        schema.type = nullable_column(c.column_name, 'string', pks_for_table)
        schema.format = 'singer.decimal'
        return schema

    if sql_type == 'date' or sql_type.startswith("timestamp"):
        schema.type = nullable_column(c.column_name, 'string', pks_for_table)
        schema.format = 'date-time'
        return schema

    if sql_type in FLOAT_TYPES:
        schema.type = nullable_column(c.column_name, 'number', pks_for_table)
        return schema

    if sql_type in STRING_TYPES:
        length_semantics = c.character_used
        schema.type = nullable_column(c.column_name, 'string', pks_for_table)
        # maxLength is only emitted when character_used is 'C'.
        if length_semantics == 'C':
            schema.maxLength = c.char_length
        return schema

    # These column types are insane: they are NOT actually ieee754 floats.
    # Instead they are represented as decimals, but despite this it appears
    # we can say nothing about their max or min.
    # "float", "double_precision", "real"
    if sql_type in ['float', 'double_precision']:
        schema.type = nullable_column(c.column_name, 'string', pks_for_table)
        schema.format = 'singer.decimal'
        return schema

    return Schema(None)
def test_unknown_inclusion(self):
    """Expect desired_columns to raise when the selected column's inclusion is 'unknown'."""
    properties = {
        'col1': Schema(None, inclusion='unknown'),
        'col2': Schema(None, inclusion='unsupported'),
    }
    table_schema = Schema(type='object', properties=properties)
    selected_cols = {'col1'}

    deferred_call = calling(tap_redshift.resolve.desired_columns).with_args(
        selected_cols, table_schema)
    assert_that(deferred_call, raises(Exception))
def schema_for_column(c):
    """Build the Schema for a single column.

    Reads ``c.data_type`` (case-insensitively) and ``c.is_primary_key``;
    precise numeric columns also read ``c.numeric_scale`` and string columns
    ``c.character_maximum_length``.

    Returns a Schema whose inclusion is "automatic" when ``is_primary_key``
    equals 1 and "available" otherwise, or a Schema with inclusion
    "unsupported" when the data type matches none of the known groups.
    """
    sql_type = c.data_type.lower()
    schema = Schema(inclusion="automatic" if c.is_primary_key == 1 else "available")

    if sql_type == "bit":
        schema.type = ["null", "boolean"]
        return schema

    # In SQL Server tinyint is unsigned.
    if sql_type in BYTES_FOR_UNSIGNED_INTEGER_TYPE:
        width = BYTES_FOR_UNSIGNED_INTEGER_TYPE[sql_type] * 8
        schema.type = ["null", "integer"]
        schema.minimum = 0
        schema.maximum = 2**width - 1
        return schema

    if sql_type in BYTES_FOR_INTEGER_TYPE:
        width = BYTES_FOR_INTEGER_TYPE[sql_type] * 8
        half_range = 2**(width - 1)
        schema.type = ["null", "integer"]
        schema.minimum = -half_range
        schema.maximum = half_range - 1
        return schema

    if sql_type in FLOAT_TYPES:
        schema.type = ["null", "number"]
        return schema

    if sql_type in PRECISE_NUMERIC_TYPES:
        schema.type = ["null", "number"]
        # Smallest step the column's scale allows, e.g. scale 2 -> 0.01.
        schema.multipleOf = 10**(-c.numeric_scale)
        return schema

    if sql_type in STRING_TYPES:
        schema.type = ["null", "string"]
        max_length = c.character_maximum_length
        # Only a positive length is emitted as maxLength.
        if max_length is not None and max_length > 0:
            schema.maxLength = max_length
        return schema

    if sql_type in DATETIME_TYPES:
        schema.type = ["null", "string"]
        schema.format = "date-time"
        return schema

    if sql_type in VARIANT_TYPES:
        schema.type = ["null", "object"]
        return schema

    return Schema(
        None,
        inclusion="unsupported",
        description="Unsupported column type",
    )
def schema_for_column(c):
    '''Returns the Schema object for the given Column.

    Reads ``data_type``, ``column_type`` and ``column_key`` from ``c``;
    decimal columns also read ``numeric_precision`` and ``numeric_scale``,
    and string columns ``character_maximum_length``.

    The result has inclusion 'automatic' when ``column_key`` is 'pri'
    (case-insensitive) and 'available' otherwise. A column matching none of
    the known type groups yields a Schema with inclusion 'unsupported' and a
    description naming the column type.
    '''
    data_type = c.data_type.lower()
    column_type = c.column_type.lower()

    # We want to automatically include all primary key columns
    inclusion = 'automatic' if c.column_key.lower() == 'pri' else 'available'
    result = Schema(inclusion=inclusion)

    # Checked against the lower-cased column type so that every branch
    # detects "unsigned" case-insensitively.
    is_unsigned = 'unsigned' in column_type

    if data_type == 'bit' or column_type.startswith('tinyint(1)'):
        result.type = ['null', 'boolean']

    elif data_type in BYTES_FOR_INTEGER_TYPE:
        result.type = ['null', 'integer']
        bits = BYTES_FOR_INTEGER_TYPE[data_type] * 8
        if is_unsigned:
            result.minimum = 0
            result.maximum = 2**bits - 1
        else:
            result.minimum = 0 - 2**(bits - 1)
            result.maximum = 2**(bits - 1) - 1

    elif data_type in FLOAT_TYPES:
        result.type = ['null', 'number']

    elif data_type == 'decimal':
        result.type = ['null', 'number']
        # The maximum (and the signed minimum) are marked as exclusive bounds.
        bound = 10**(c.numeric_precision - c.numeric_scale)
        result.exclusiveMaximum = True
        result.maximum = bound
        result.multipleOf = 10**(0 - c.numeric_scale)
        if is_unsigned:
            result.minimum = 0
        else:
            result.exclusiveMinimum = True
            result.minimum = -bound

    elif data_type in STRING_TYPES:
        result.type = ['null', 'string']
        result.maxLength = c.character_maximum_length

    elif data_type in DATETIME_TYPES:
        result.type = ['null', 'string']
        result.format = 'date-time'

    else:
        result = Schema(
            None,
            inclusion='unsupported',
            description='Unsupported column type {}'.format(column_type))

    return result
def test_select_desired_column(self):
    """Check which columns desired_columns keeps for a mixed-inclusion schema."""
    properties = {
        'col1': Schema(None, inclusion='available'),
        'col2': Schema(None, inclusion='unsupported'),
        'col4': Schema(None, inclusion='automatic'),
    }
    table_schema = Schema(type='object', properties=properties)
    selected_cols = {'col1', 'col2', 'col3'}

    actual = tap_redshift.resolve.desired_columns(selected_cols, table_schema)

    expected = {'col1', 'col4'}
    assert_that(actual, equal_to(expected))
def schema_for_column(c):
    """Returns the Schema object for the given Column.

    Reads ``data_type``, ``column_type`` and ``column_key`` from ``c``;
    decimal columns also read ``numeric_scale`` and string columns
    ``character_maximum_length``.

    The result has inclusion "automatic" when ``column_key`` is "pri"
    (case-insensitive) and "available" otherwise. A column matching none of
    the known type groups yields a Schema with inclusion "unsupported" and a
    description naming the column type.
    """
    data_type = c.data_type.lower()
    column_type = c.column_type.lower()

    # We want to automatically include all primary key columns
    inclusion = "automatic" if c.column_key.lower() == "pri" else "available"
    result = Schema(inclusion=inclusion)

    if data_type == "bit" or column_type.startswith("tinyint(1)"):
        result.type = ["null", "boolean"]

    elif data_type in BYTES_FOR_INTEGER_TYPE:
        result.type = ["null", "integer"]
        bits = BYTES_FOR_INTEGER_TYPE[data_type] * 8
        # Use the lower-cased column type so "unsigned" is detected
        # case-insensitively, like the tinyint(1) check above.
        if "unsigned" in column_type:
            result.minimum = 0
            result.maximum = 2 ** bits - 1
        else:
            result.minimum = 0 - 2 ** (bits - 1)
            result.maximum = 2 ** (bits - 1) - 1

    elif data_type in FLOAT_TYPES:
        result.type = ["null", "number"]
        # Floats are declared with a fixed step of six decimal places.
        result.multipleOf = 10 ** (0 - 6)

    elif data_type == "decimal":
        result.type = ["null", "number"]
        # Smallest step the column's scale allows, e.g. scale 2 -> 0.01.
        result.multipleOf = 10 ** (0 - c.numeric_scale)

    elif data_type in STRING_TYPES:
        result.type = ["null", "string"]
        result.maxLength = c.character_maximum_length

    elif data_type in DATETIME_TYPES:
        result.type = ["null", "string"]
        result.format = "date-time"

    elif data_type in VARIANT_TYPES:
        result.type = ["null", "object"]

    else:
        result = Schema(
            None,
            inclusion="unsupported",
            description="Unsupported column type {}".format(column_type),
        )

    return result
def runTest(self):
    """Check desired_columns against a schema mixing all three inclusion values."""
    properties = {
        'a': Schema(None, inclusion='available'),
        'b': Schema(None, inclusion='unsupported'),
        'c': Schema(None, inclusion='automatic'),
    }
    table_schema = Schema(type='object', properties=properties)
    selected_cols = {'a', 'b', 'd'}

    got_cols = tap_mysql.discover_utils.desired_columns(selected_cols, table_schema)

    self.assertEqual(
        got_cols,
        {'a', 'c'},
        'Keep automatic as well as selected, available columns.')
def test_one_selected_stream(self):
    """Only the entry whose metadata marks it as selected should be returned."""
    selected_entry = CatalogEntry(
        tap_stream_id='a',
        schema=Schema(),
        metadata=[{'metadata': {'selected': True}, 'breadcrumb': []}])
    catalog = Catalog([
        selected_entry,
        CatalogEntry(tap_stream_id='b', schema=Schema(), metadata=[]),
        CatalogEntry(tap_stream_id='c', schema=Schema(), metadata=[]),
    ])
    state = {}

    selected_streams = catalog.get_selected_streams(state)

    # assertEqual, not assertEquals: the alias no longer exists in Python 3.12+.
    # list() materialises the result in case it is a generator.
    self.assertEqual(list(selected_streams), [selected_entry])
def populate_schema_node(schema, field_info, id_field_map, breadcrumb, metadata):
    """
    Populates a node in the schema. A node corresponds to a JSON object,
    which has properties (children).

    ``schema`` is filled in place, one metadata entry for this node is
    appended to ``metadata``, and ``breadcrumb`` is extended while each
    child is processed and restored afterwards.
    """
    # add metadata (copy the breadcrumb: the list itself is mutated below)
    node_metadata = {
        'metadata': {'inclusion': 'available'},
        'breadcrumb': list(breadcrumb),
    }
    metadata.append(node_metadata)

    # populate schema
    schema.type = ['null', 'object']
    schema.properties = {}

    for id_num in field_info.get('composite_fields'):
        child_info = id_field_map[id_num]
        child_name = child_info.get('name')
        breadcrumb.extend(['properties', child_name])

        child_schema = Schema()
        if child_info.get('composite_fields'):
            # The child has children of its own: recurse.
            populate_schema_node(child_schema, child_info, id_field_map,
                                 breadcrumb, metadata)
        else:
            populate_schema_leaf(child_schema, child_info, id_num,
                                 breadcrumb, metadata)
        schema.properties[child_name] = child_schema

        # remove 'child_field_name' and 'properties' from breadcrumb
        breadcrumb.pop()
        breadcrumb.pop()
def test_tinyint_1_unsigned(self):
    """Check the schema (nullable boolean) and metadata of c_tinyint_1_unsigned."""
    column = 'c_tinyint_1_unsigned'
    expected_schema = Schema(['null', 'boolean'], inclusion='available')
    expected_metadata = {
        'selected-by-default': True,
        'sql-datatype': 'tinyint(1) unsigned',
    }

    self.assertEqual(self.schema.properties[column], expected_schema)
    self.assertEqual(self.get_metadata_for_column(column), expected_metadata)
def runTest(self):
    """Check desired_columns against a schema mixing all three inclusion values."""
    properties = {
        "a": Schema(None, inclusion="available"),
        "b": Schema(None, inclusion="unsupported"),
        "c": Schema(None, inclusion="automatic"),
    }
    table_schema = Schema(type="object", properties=properties)
    selected_cols = {"a", "b", "d"}

    got_cols = tap_mysql.desired_columns(selected_cols, table_schema)

    expected_cols = {"a", "c"}
    self.assertEqual(
        got_cols,
        expected_cols,
        "Keep automatic as well as selected, available columns.")
def test_retries_on_facebook_request_error_sync_batches(
        self, mocked_schema, mocked_api):
    """
    AdCreative.sync_batches calls a `facebook_business` method,
    `api_batch.execute()`, to get a batch of ad creatives. We mock this
    method to raise a `FacebookRequestError` and expect the tap to retry
    that function up to 5 times, which is the current hard coded
    `max_tries` value.
    """
    # Mock new_batch() function of API so that it raises FacebookRequestError
    mocked_api.new_batch = Mock()
    mocked_api.new_batch.return_value = MockBatch(
        exception="FacebookRequestError"
    )

    # Initialize AdCreative and mock catalog_entry
    mock_catalog_entry = CatalogEntry(schema=Schema())
    ad_creative_object = AdCreative('', '', '', '')
    ad_creative_object.catalog_entry = mock_catalog_entry

    # Call sync_batches() function of AdCreatives and verify
    # FacebookRequestError is raised
    with self.assertRaises(FacebookRequestError):
        ad_creative_object.sync_batches([])

    # Verify calls inside sync_batches are made 5 times, as a maximum of
    # 5 retries is configured for the function.
    # assertEqual, not assertEquals: the alias no longer exists in Python 3.12+.
    self.assertEqual(5, mocked_api.new_batch.call_count)
    self.assertEqual(5, mocked_schema.call_count)
def generate_catalog(streams: List["StreamCatalogEntry"]) -> Catalog:
    """Generates a catalog with an entry for each stream in `streams`.

    Entries missing "replication_method" or "replication_key" are updated in
    place with the defaults "INCREMENTAL" and "updated_date". Every entry
    must provide "tap_stream_id" and "selected".
    """
    catalog_entries = []
    for stream_entry in streams:
        # Fill in the defaults on the caller's mapping itself.
        stream_entry.setdefault("replication_method", "INCREMENTAL")
        stream_entry.setdefault("replication_key", "updated_date")

        catalog_entry = CatalogEntry(
            tap_stream_id=stream_entry["tap_stream_id"],
            schema=Schema(),
            replication_method=stream_entry["replication_method"],
            replication_key=stream_entry["replication_key"],
            metadata=[
                {
                    # Empty breadcrumb: this metadata applies to the stream itself.
                    "breadcrumb": (),
                    "metadata": {"selected": stream_entry["selected"]},
                },
            ],
        )
        catalog_entries.append(catalog_entry)

    return Catalog(catalog_entries)
def test_float(self):
    """Check the schema (nullable number) and metadata of C_FLOAT."""
    column = 'C_FLOAT'
    expected_schema = Schema(['null', 'number'], inclusion='available')
    expected_metadata = {
        'selected-by-default': True,
        'sql-datatype': 'float',
    }

    self.assertEqual(self.dt_schema.properties[column], expected_schema)
    self.assertEqual(self.get_dt_metadata_for_column(column), expected_metadata)
def test_decimal_with_defined_scale_and_precision(self):
    """Check the schema (nullable number) and metadata of C_DECIMAL_2."""
    column = 'C_DECIMAL_2'
    expected_schema = Schema(['null', 'number'], inclusion='available')
    expected_metadata = {
        'selected-by-default': True,
        'sql-datatype': 'number',
    }

    self.assertEqual(self.dt_schema.properties[column], expected_schema)
    self.assertEqual(self.get_dt_metadata_for_column(column), expected_metadata)
def test_double(self):
    """Check the schema (nullable number) and metadata of c_double."""
    column = 'c_double'
    expected_schema = Schema(['null', 'number'], inclusion='available')
    expected_metadata = {
        'selected-by-default': True,
        'sql-datatype': 'double',
    }

    self.assertEqual(self.schema.properties[column], expected_schema)
    self.assertEqual(self.get_metadata_for_column(column), expected_metadata)
def test_bit(self):
    """Check the schema (nullable boolean) and metadata of c_bit."""
    column = 'c_bit'
    expected_schema = Schema(['null', 'boolean'], inclusion='available')
    expected_metadata = {
        'selected-by-default': True,
        'sql-datatype': 'bit(4)',
    }

    self.assertEqual(self.schema.properties[column], expected_schema)
    self.assertEqual(self.get_metadata_for_column(column), expected_metadata)
def add_automatic_properties(catalog_entry, columns):
    """Add the SDC_DELETED_AT property to the entry's schema and to ``columns``.

    Both arguments are modified in place: the schema gains a nullable
    date-time string property keyed by SDC_DELETED_AT, and that key is
    appended to ``columns``.

    Returns:
        The same ``columns`` list that was passed in.
    """
    deleted_at_schema = Schema(type=["null", "string"], format="date-time")
    catalog_entry.schema.properties[SDC_DELETED_AT] = deleted_at_schema

    columns.append(SDC_DELETED_AT)
    return columns
def test_decimal_with_defined_scale_and_precision(self):
    """Check the schema (nullable number, multipleOf 0.01) and metadata of c_decimal_2."""
    column = 'c_decimal_2'
    expected_schema = Schema(['null', 'number'],
                             inclusion='available',
                             multipleOf=0.01)
    expected_metadata = {
        'selected-by-default': True,
        'sql-datatype': 'decimal(11,2)',
    }

    self.assertEqual(self.schema.properties[column], expected_schema)
    self.assertEqual(self.get_metadata_for_column(column), expected_metadata)
def test_decimal(self):
    """Check the schema (nullable number, multipleOf 1) and metadata of c_decimal."""
    column = 'c_decimal'
    expected_schema = Schema(['null', 'number'],
                             inclusion='available',
                             multipleOf=1)
    expected_metadata = {
        'selected-by-default': True,
        'sql-datatype': 'decimal(10,0)',
    }

    self.assertEqual(self.schema.properties[column], expected_schema)
    self.assertEqual(self.get_metadata_for_column(column), expected_metadata)
def test_varbinary(self):
    """Check the schema (nullable string, maxLength 32) and metadata of c_varbinary."""
    column = 'c_varbinary'
    expected_schema = Schema(['null', 'string'],
                             maxLength=32,
                             inclusion='available')
    expected_metadata = {
        'selected-by-default': True,
        'sql-datatype': 'varbinary(16)',
    }

    self.assertEqual(self.schema.properties[column], expected_schema)
    self.assertEqual(self.get_metadata_for_column(column), expected_metadata)
def test_array(self):
    """Check the schema (nullable object-or-array) and metadata of C_ARRAY."""
    column = 'C_ARRAY'
    expected_schema = Schema(['null', 'object', 'array'],
                             inclusion='available')
    expected_metadata = {
        'selected-by-default': True,
        'sql-datatype': 'array',
    }

    self.assertEqual(self.dt_schema.properties[column], expected_schema)
    self.assertEqual(self.get_dt_metadata_for_column(column), expected_metadata)
def test_varbinary(self):
    """Check the schema (nullable string, format 'binary') and metadata of C_VARBINARY."""
    column = 'C_VARBINARY'
    expected_schema = Schema(['null', 'string'],
                             format='binary',
                             inclusion='available')
    expected_metadata = {
        'selected-by-default': True,
        'sql-datatype': 'binary',
    }

    self.assertEqual(self.dt_schema.properties[column], expected_schema)
    self.assertEqual(self.get_dt_metadata_for_column(column), expected_metadata)
def test_time(self):
    """Check the schema (nullable string, format 'time') and metadata of C_TIME."""
    column = 'C_TIME'
    expected_schema = Schema(['null', 'string'],
                             format='time',
                             inclusion='available')
    expected_metadata = {
        'selected-by-default': True,
        'sql-datatype': 'time',
    }

    self.assertEqual(self.dt_schema.properties[column], expected_schema)
    self.assertEqual(self.get_dt_metadata_for_column(column), expected_metadata)
def discover_columns(connection, table_info):
    """Build a CatalogEntry for every table described in ``table_info``.

    Args:
        connection: Database connection. ``connection.cursor(...)`` is used
            as a context manager to run ``SELECT current_database()``.
        table_info: Mapping of schema name -> table name -> table details.
            Each table's details must provide 'columns' (column name ->
            column info) and 'row_count', and may provide 'is_view'.

    Returns:
        A list with one CatalogEntry per table. Each entry carries the
        table-level metadata, per-column inclusion metadata and an object
        Schema built from ``schema_for_column``.
    """
    entries = []

    # The database name does not depend on the table, so it is queried only
    # once. It is fetched lazily so that no query is issued when there are
    # no tables at all.
    database_name = None

    for schema_name, tables in table_info.items():
        for table_name, table in tables.items():
            mdata = {}
            columns = table['columns']
            table_pks = [
                col_name for col_name, col_info in columns.items()
                if col_info.is_primary_key
            ]

            if database_name is None:
                with connection.cursor(
                        cursor_factory=psycopg2.extras.DictCursor) as cur:
                    cur.execute(" SELECT current_database()")
                    database_name = cur.fetchone()[0]

            # Table-level metadata lives under the empty breadcrumb.
            mdata = metadata.write(mdata, (), 'table-key-properties', table_pks)
            mdata = metadata.write(mdata, (), 'schema-name', schema_name)
            mdata = metadata.write(mdata, (), 'database-name', database_name)
            mdata = metadata.write(mdata, (), 'row-count', table['row_count'])
            mdata = metadata.write(mdata, (), 'is-view', table.get('is_view'))

            column_schemas = {
                col_name: schema_for_column(col_info)
                for col_name, col_info in columns.items()
            }
            schema = Schema(type='object', properties=column_schemas)

            for c_name, c_schema in column_schemas.items():
                col_info = columns[c_name]
                mdata = write_sql_data_type_md(mdata, col_info)

                # A schema without a type means the column type is not
                # supported; otherwise primary keys are always included.
                if c_schema.type is None:
                    inclusion, selected_by_default = 'unsupported', False
                elif col_info.is_primary_key:
                    inclusion, selected_by_default = 'automatic', True
                else:
                    inclusion, selected_by_default = 'available', True

                breadcrumb = ('properties', c_name)
                mdata = metadata.write(mdata, breadcrumb,
                                       'inclusion', inclusion)
                mdata = metadata.write(mdata, breadcrumb,
                                       'selected-by-default',
                                       selected_by_default)

            entry = CatalogEntry(
                table=table_name,
                stream=table_name,
                metadata=metadata.to_list(mdata),
                tap_stream_id=database_name + '-' + schema_name + '-' + table_name,
                schema=schema)
            entries.append(entry)

    return entries
def test_resumes_currently_syncing_stream(self):
    """The stream named by state['currently_syncing'] should come first."""
    selected_metadata = [{'metadata': {'selected': True}, 'breadcrumb': []}]
    selected_entry_a = CatalogEntry(tap_stream_id='a',
                                    schema=Schema(),
                                    metadata=list(selected_metadata))
    selected_entry_c = CatalogEntry(tap_stream_id='c',
                                    schema=Schema(),
                                    metadata=list(selected_metadata))
    catalog = Catalog([
        selected_entry_a,
        CatalogEntry(tap_stream_id='b', schema=Schema(), metadata=[]),
        selected_entry_c,
    ])
    state = {'currently_syncing': 'c'}

    selected_streams = catalog.get_selected_streams(state)

    # assertEqual, not assertEquals: the alias no longer exists in Python 3.12+.
    # list() materialises the result in case it is a generator.
    self.assertEqual(list(selected_streams)[0], selected_entry_c)
def test_bigint(self):
    """Check the schema (nullable integer with signed 64-bit bounds) and metadata of c_bigint."""
    column = 'c_bigint'
    expected_schema = Schema(['null', 'integer'],
                             inclusion='available',
                             minimum=-9223372036854775808,
                             maximum=9223372036854775807)
    expected_metadata = {
        'selected-by-default': True,
        'sql-datatype': 'bigint(20)',
    }

    self.assertEqual(self.schema.properties[column], expected_schema)
    self.assertEqual(self.get_metadata_for_column(column), expected_metadata)
def test_tinyint(self):
    """Check the schema (nullable integer, -128..127) and metadata of c_tinyint."""
    column = 'c_tinyint'
    expected_schema = Schema(['null', 'integer'],
                             inclusion='available',
                             minimum=-128,
                             maximum=127)
    expected_metadata = {
        'selected-by-default': True,
        'sql-datatype': 'tinyint(4)',
    }

    self.assertEqual(self.schema.properties[column], expected_schema)
    self.assertEqual(self.get_metadata_for_column(column), expected_metadata)