def add_synthetic_keys_to_stream_schema(stream):
    """Add synthetic keys to stream's schema."""
    stream.schema.properties["_sdc_report_datetime"] = Schema.from_dict({
        "description": "DateTime of Report Run",
        "type": "string",
        "format": "date-time",
    })
    if stream.tap_stream_id in STATISTICS_REPORT_TYPES:
        stream.schema.properties["_sdc_report_currency"] = Schema.from_dict({
            "description": "Currency of all costs in report",
            "type": "string",
        })
        stream.schema.properties["_sdc_report_ignore_x_device"] = Schema.from_dict({
            "description": "Ignore cross-device data. Also can explicitly "
                           "set to null for TransactionID ReportType to get all data.",
            "type": "boolean",
        })
    return stream
def do_sync(client, catalog, state, config):
    selected_stream_names = get_selected_streams(catalog)
    validate_dependencies(selected_stream_names)
    populate_class_schemas(catalog, selected_stream_names)
    all_sub_stream_names = get_sub_stream_names()

    for stream in catalog.streams:
        stream_name = stream.tap_stream_id
        mdata = metadata.to_map(stream.metadata)
        if stream_name not in selected_stream_names:
            LOGGER.info("%s: Skipping - not selected", stream_name)
            continue

        key_properties = metadata.get(mdata, (), 'table-key-properties')

        sideload_objects = metadata.get(mdata, (), 'sideload-objects')
        if sideload_objects:
            stream_schema = get_side_load_schemas(sideload_objects, stream)
            stream.schema = Schema.from_dict(stream_schema)

        singer.write_schema(stream_name, stream.schema.to_dict(),
                            key_properties)

        sub_stream_names = SUB_STREAMS.get(stream_name)
        if sub_stream_names:
            for sub_stream_name in sub_stream_names:
                if sub_stream_name not in selected_stream_names:
                    continue

                sub_stream = STREAMS[sub_stream_name].stream
                sub_mdata = metadata.to_map(sub_stream.metadata)
                sub_key_properties = metadata.get(sub_mdata, (),
                                                  'table-key-properties')

                sideload_objects = metadata.get(mdata, (), 'sideload-objects')
                if sideload_objects:
                    sub_stream_schema = get_side_load_schemas(
                        sideload_objects, sub_stream)
                    sub_stream.schema = Schema.from_dict(sub_stream_schema)

                singer.write_schema(sub_stream.tap_stream_id,
                                    sub_stream.schema.to_dict(),
                                    sub_key_properties)

        # parent stream will sync sub stream
        if stream_name in all_sub_stream_names:
            continue

        LOGGER.info("%s: Starting sync", stream_name)
        instance = STREAMS[stream_name](client, config)
        counter_value = sync_stream(state, config.get('start_date'), instance)
        singer.write_state(state)
        LOGGER.info("%s: Completed sync (%s rows)", stream_name, counter_value)
        zendesk_metrics.log_aggregate_rates()

    singer.write_state(state)
    LOGGER.info("Finished sync")
    zendesk_metrics.log_aggregate_rates()
def add_automatic_properties(catalog_entry, columns):
    catalog_entry.schema.properties[SDC_DELETED_AT] = Schema(
        type=["null", "string"], format="date-time")
    catalog_entry.schema.properties[SYS_UPDATED_AT] = Schema(
        type=["null", "string"], format="date-time")
    catalog_entry.schema.properties[SYS_EVENT_TYPE] = Schema(type="integer")
    catalog_entry.schema.properties[SYS_HASHDIFF] = Schema(
        type=["null", "string"], format="date-time")
    catalog_entry.schema.properties[SYS_HASHKEY] = Schema(
        type=["null", "string"], format="date-time")
    catalog_entry.schema.properties[SYS_LOG_POS] = Schema(type='integer')
    catalog_entry.schema.properties[SYS_LOG_FILE] = Schema(type='integer')
    catalog_entry.schema.properties[SYS_LINENO] = Schema(type='integer')

    columns.append(SDC_DELETED_AT)
    columns.append(SYS_UPDATED_AT)
    columns.append(SYS_EVENT_TYPE)
    columns.append(SYS_HASHKEY)
    columns.append(SYS_HASHDIFF)
    columns.append(SYS_LINENO)
    columns.append(SYS_LOG_POS)
    columns.append(SYS_LOG_FILE)

    return columns
def generate_catalog(client, report_config, standard_fields, custom_fields,
                     all_cubes, cubes_lookup, profile_ids):
    """
    Generate a catalog entry for each report specified in `report_config`
    """
    catalog_entries = []
    # for report in PREMADE_REPORTS:
    for report in report_config:
        # change to safe name for bigquery
        temp = report['name'].replace(' ', '_').lower()
        report['name'] = temp

        metrics_dimensions = set(report['metrics'] + report['dimensions'])
        selected_by_default = {
            *report['metrics'][:10],  # Use first 10 metrics in definition
            *report.get('default_dimensions', [])
        }
        premade_fields = [
            field for field in standard_fields
            if field['id'] in metrics_dimensions
        ]
        schema, mdata = generate_premade_catalog_entry(premade_fields,
                                                       all_cubes, cubes_lookup)

        mdata = reduce(
            lambda mdata, field_name: metadata.write(
                mdata, ("properties", field_name), "selected-by-default", True),
            selected_by_default, mdata)

        catalog_entries.append(
            CatalogEntry(schema=Schema.from_dict(schema),
                         key_properties=['_sdc_record_hash'],
                         stream=report['name'],
                         tap_stream_id=report['name'],
                         metadata=metadata.to_list(mdata)))

    # for report in report_config:
    for report in []:
        schema, mdata = generate_catalog_entry(client, standard_fields,
                                               custom_fields, all_cubes,
                                               cubes_lookup, profile_ids)

        catalog_entries.append(
            CatalogEntry(schema=Schema.from_dict(schema),
                         key_properties=['_sdc_record_hash'],
                         stream=report['name'],
                         tap_stream_id=report['id'],
                         metadata=metadata.to_list(mdata)))

    return Catalog(catalog_entries)
def test_one_pk_value_with_bookmark(self):
    catalog_entry = CatalogEntry(tap_stream_id='foo',
                                 schema=Schema.from_dict({
                                     'properties': {
                                         'id': {'type': ['integer']}
                                     }
                                 }),
                                 metadata=[{
                                     'breadcrumb': (),
                                     'metadata': {
                                         'table-key-properties': ['id']
                                     }
                                 }])

    state = {
        'bookmarks': {
            'foo': {
                'last_pk_fetched': {'id': 4},
                'max_pk_values': {'id': 10}
            }
        }
    }

    expected = ' WHERE ((`id` > 4)) AND `id` <= 10 ORDER BY `id` ASC'
    actual = generate_pk_clause(catalog_entry, state)

    self.assertEqual(expected, actual)
def test_should_ignore_fields_that_dont_parse(self):
    record = {
        "id": "5d6ca50762a07c00045125fb",
        "created_at": "not a date",
        "edited_at": "2019-09-02T05:13:43.151"
    }
    schema = Schema.from_dict({
        "properties": {
            "created_at": {"format": "date-time", "type": "string"},
            "edited_at": {"format": "date-time", "type": "string"},
            "id": {"type": "string"},
        }
    })

    updated_record = Stream.convert_dates_to_rfc3339(record, schema)

    assert updated_record["created_at"] == "not a date"
    assert updated_record["edited_at"] == "2019-09-02T05:13:43.151000+00:00"
def add_automatic_properties(catalog_entry, columns):
    catalog_entry.schema.properties[SDC_DELETED_AT] = Schema(
        type=["null", "string"], format="date-time")

    columns.append(SDC_DELETED_AT)

    return columns
def test_fails_with_null_bookmark(self):
    catalog_entry = CatalogEntry(schema=Schema.from_dict({'properties': {}}))
    key_properties = []
    last_pk_fetched = None

    with self.assertRaises(AssertionError):
        generate_pk_bookmark_clause(key_properties, last_pk_fetched,
                                    catalog_entry)
def test_no_pk_values(self):
    catalog_entry = CatalogEntry(schema=Schema.from_dict({'properties': {}}),
                                 metadata=[])
    state = {}

    expected = ''
    actual = generate_pk_clause(catalog_entry, state)

    self.assertEqual(expected, actual)
def test_empty_pk(self):
    catalog_entry = CatalogEntry(schema=Schema.from_dict({'properties': {}}))
    key_properties = []
    last_pk_fetched = {}

    expected = ''
    actual = generate_pk_bookmark_clause(key_properties, last_pk_fetched,
                                         catalog_entry)

    self.assertEqual(expected, actual)
def generate_catalog(client, standard_fields, custom_fields, all_cubes,
                     cubes_lookup, profile_id):
    schema, mdata = generate_catalog_entry(client, standard_fields,
                                           custom_fields, all_cubes,
                                           cubes_lookup, profile_id)

    # Wrap the generated schema and metadata in a single 'report' catalog entry
    catalog_entry = CatalogEntry(schema=Schema.from_dict(schema),
                                 key_properties=['_sdc_record_hash'],
                                 stream='report',
                                 tap_stream_id='report',
                                 metadata=metadata.to_list(mdata))
    return Catalog([catalog_entry])
def test_sync_users(self, m):
    loop = asyncio.get_event_loop()
    record_value = load_file_current('teams_output.json', 'data_test')
    with mock.patch('tap_sentry.SentryClient.users',
                    return_value=[record_value]):
        dataSync = SentrySync(self.client)
        schema = load_file('users.json', 'tap_sentry/schemas')
        resp = dataSync.sync_users(Schema(schema))
        with mock.patch('singer.write_record') as patching:
            task = asyncio.gather(resp)
            loop.run_until_complete(task)
            patching.assert_called_with('users', record_value)
def test_one_pk(self):
    catalog_entry = CatalogEntry(schema=Schema.from_dict(
        {'properties': {
            'id1': {'type': ['integer']}
        }}))
    key_properties = ['id1']
    last_pk_fetched = {'id1': 4}

    expected = '(`id1` > 4)'
    actual = generate_pk_bookmark_clause(key_properties, last_pk_fetched,
                                         catalog_entry)

    self.assertEqual(expected, actual)
def generate_streams(conn, table_info):
    entries = []
    for schema_name in table_info.keys():
        for table in table_info[schema_name].keys():
            with conn.cursor() as cur:
                sql = f"""
                    SELECT COLUMN_NAME
                    FROM INFORMATION_SCHEMA.KEY_COLUMN_USAGE AS kcu
                    INNER JOIN INFORMATION_SCHEMA.TABLE_CONSTRAINTS AS tc
                        ON tc.CONSTRAINT_NAME = kcu.CONSTRAINT_NAME
                        AND tc.CONSTRAINT_TYPE = 'PRIMARY KEY'
                    WHERE kcu.TABLE_SCHEMA = '{schema_name}'
                        AND kcu.TABLE_NAME = '{table}'"""
                cur.execute(sql)
                table_pks = [
                    col['COLUMN_NAME'] for col in convert_result_to_dict(cur)
                ]

                sql = """SELECT db_name()"""
                cur.execute(sql)
                database = cur.fetchone()[0]

            meta = {}
            columns = table_info[schema_name][table]['columns']

            metadata.write(meta, (), 'table-key-properties', table_pks)
            metadata.write(meta, (), 'schema-name', schema_name)
            metadata.write(meta, (), 'database-name', database)
            metadata.write(meta, (), 'row-count',
                           table_info[schema_name][table]['row_count'])
            metadata.write(meta, (), 'is-view',
                           table_info[schema_name][table]['is_view'])

            column_schemas = {
                col_name: schema_for_column(col_info, table_pks)
                for col_name, col_info in columns.items()
            }

            schema = Schema(type='object', properties=column_schemas)

            entry = CatalogEntry(table=table,
                                 stream=table,
                                 metadata=metadata.to_list(meta),
                                 tap_stream_id=get_tap_stream_id(
                                     database, schema_name, table),
                                 schema=schema)

            entries.append(entry)

    return Catalog(entries)
def resolve_catalog(discovered_catalog, streams_to_sync):
    result = Catalog(streams=[])

    # Iterate over the streams in the input catalog and match each one up
    # with the same stream in the discovered catalog.
    for catalog_entry in streams_to_sync:
        catalog_metadata = metadata.to_map(catalog_entry.metadata)
        replication_key = catalog_metadata.get((), {}).get("replication-key")

        discovered_table = discovered_catalog.get_stream(catalog_entry.tap_stream_id)
        database_name = common.get_database_name(catalog_entry)

        if not discovered_table:
            LOGGER.warning(
                "Database %s table %s was selected but does not exist",
                database_name,
                catalog_entry.table,
            )
            continue

        selected = {
            k
            for k, v in catalog_entry.schema.properties.items()
            if common.property_is_selected(catalog_entry, k) or k == replication_key
        }

        # These are the columns we need to select
        columns = desired_columns(selected, discovered_table.schema)

        result.streams.append(
            CatalogEntry(
                tap_stream_id=catalog_entry.tap_stream_id,
                metadata=catalog_entry.metadata,
                stream=catalog_entry.tap_stream_id,
                table=catalog_entry.table,
                schema=Schema(
                    type="object",
                    properties={
                        col: discovered_table.schema.properties[col]
                        for col in columns
                    },
                ),
            )
        )

    return result
def schema(self):
    if self._schema is None:
        raw_schema = self._client.item_schema(self.api)

        if self._config.merge_fields_array:
            # Replace merge fields object with array to make a separate table.
            mf_desc = raw_schema['properties']['merge_fields']['description']
            raw_schema['properties']['merge_fields'] = {
                'description': mf_desc,
                'type': 'array',
                'items': {
                    'type': 'object',
                    'properties': {
                        'merge_id': {'type': 'number'},
                        'tag': {'type': 'string'},
                        'name': {'type': 'string'},
                        'type': {'type': 'string'},
                        'value': {'type': 'string'}
                    }
                }
            }

        if self._config.interests_array:
            # Replace interest object with array to make a separate table.
            int_desc = raw_schema['properties']['interests']['description']
            raw_schema['properties']['interests'] = {
                'description': int_desc,
                'type': 'array',
                'items': {'type': 'object'}
            }

        self._schema = Schema.from_dict(raw_schema)
    return self._schema
def test_two_pks(self):
    catalog_entry = CatalogEntry(schema=Schema.from_dict({
        'properties': {
            'id1': {'type': ['integer']},
            'str': {'type': ['string']}
        }
    }))
    key_properties = ['id1', 'str']
    last_pk_fetched = {'id1': 4, 'str': 'apples'}

    expected = '(`id1` > 4) OR (`id1` = 4 AND `str` > \'apples\')'
    actual = generate_pk_bookmark_clause(key_properties, last_pk_fetched,
                                         catalog_entry)

    self.assertEqual(expected, actual)
def test_three_pk_values_with_bookmark(self):
    catalog_entry = CatalogEntry(tap_stream_id='foo',
                                 schema=Schema.from_dict({
                                     'properties': {
                                         'id1': {'type': ['integer']},
                                         'id2': {'type': ['string']},
                                         'id3': {'type': ['integer']}
                                     }
                                 }),
                                 metadata=[{
                                     'breadcrumb': (),
                                     'metadata': {
                                         'table-key-properties': ['id1', 'id2', 'id3']
                                     }
                                 }])

    state = {
        'bookmarks': {
            'foo': {
                'last_pk_fetched': {'id1': 4, 'id2': 6, 'id3': 2},
                'max_pk_values': {'id1': 10, 'id2': 8, 'id3': 3}
            }
        }
    }

    expected = ' WHERE ((`id1` > 4) OR (`id1` = 4 AND `id2` > \'6\') OR (`id1` = 4 AND `id2` = \'6\' AND `id3` > 2)) AND `id1` <= 10 AND `id2` <= \'8\' AND `id3` <= 3 ORDER BY `id1`, `id2`, `id3` ASC'
    actual = generate_pk_clause(catalog_entry, state)

    self.assertEqual(expected, actual)
def get_schema_for_table(table):
    """ Given a table object, output its schema """
    schema = {"type": "object", "properties": {}, "required": []}
    table_fields = table.get("table_fields", [])

    # Add __id (record id) to every table
    record_id_field = {
        "id": "__id",
        "type": "integer",
        "required": True,
        "is_multiple": False
    }
    table_fields.append(record_id_field)

    for field in table_fields:
        property_schema = {"inclusion": "automatic"}
        property_schema['type'] = []

        if field["required"]:
            schema["required"].append(field["id"])
        else:
            property_schema['type'].append("null")

        if field["type"] in (STRING_TYPES | DATE_TYPES) or field["is_multiple"]:
            property_schema['type'].append("string")

        if field["type"] in DATE_TYPES:
            property_schema["format"] = "date-time"

        if field["type"] in NUMERIC_TYPES:
            property_schema["type"].append("number")

        if field["type"] in INTEGER_TYPES or field["type"] == "integer":
            property_schema["type"].append("integer")

        schema["properties"][field["id"]] = property_schema

    return Schema.from_dict(schema)
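# A small, hypothetical usage sketch for get_schema_for_table above (not part
# of the original source). The field dicts below are invented, and the exact
# output depends on what the module's STRING_TYPES / DATE_TYPES / NUMERIC_TYPES /
# INTEGER_TYPES sets contain; the sketch assumes "string" is in STRING_TYPES
# and "date" is in DATE_TYPES.
example_table = {
    "table_fields": [
        {"id": "title", "type": "string", "required": True, "is_multiple": False},
        {"id": "due_date", "type": "date", "required": False, "is_multiple": False},
    ]
}

example_schema = get_schema_for_table(example_table).to_dict()
# Under those assumptions, roughly:
#   example_schema["required"]                        -> ["title", "__id"]
#   example_schema["properties"]["title"]["type"]     -> ["string"]
#   example_schema["properties"]["due_date"]["type"]  -> ["null", "string"],
#                                                        with "format": "date-time"
#   example_schema["properties"]["__id"]["type"]      -> includes "integer"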
def test_should_not_update_datetime_that_contains_timezone(self):
    record = {
        "id": "5d6ca50762a07c00045125fb",
        "domain": "pageup",
        "created_at": "2019-09-02T05:13:43.151+10:00"
    }
    schema = Schema.from_dict({
        "properties": {
            "created_at": {"format": "date-time", "type": "string"},
            "domain": {"type": "string"},
            "id": {"type": "string"},
        }
    })

    updated_record = Stream.convert_dates_to_rfc3339(record, schema)

    assert updated_record["created_at"] == "2019-09-02T05:13:43.151+10:00"
def test_should_append_timezone_to_nested_datetime_fields(self):
    record = {
        "id": "5d6ca50762a07c00045125fb",
        "date": "2019-08-13T00:00:00",
        "edited": {
            "author": "Sam Witwicky",
            "date": "2019-10-09T06:14:58.877"
        },
    }
    schema = Schema.from_dict({
        "properties": {
            "edited": {
                "properties": {
                    "author": {"type": "string"},
                    "date": {"format": "date-time", "type": "string"}
                },
                "type": "object",
            },
            "date": {"format": "date-time", "type": "string"},
            "id": {"type": "string"},
        }
    })

    updated_record = Stream.convert_dates_to_rfc3339(record, schema)

    assert updated_record["date"] == "2019-08-13T00:00:00+00:00"
    assert updated_record["edited"]["date"] == "2019-10-09T06:14:58.877000+00:00"
def generate_catalog(
    client,
    report_config,
    standard_fields,
    custom_fields,
    all_cubes,
    cubes_lookup,
    profile_ids,
):
    """
    Generate a catalog entry for each report specified in `report_config`
    """
    catalog_entries = []
    for report in report_config:
        selected_by_default = {
            *report['metrics'][:10],
            *report.get('dimensions', [])
        }
        schema, mdata = generate_catalog_entry(client, standard_fields,
                                               custom_fields, all_cubes,
                                               cubes_lookup, profile_ids)

        mdata = reduce(
            lambda mdata, field_name: metadata.write(
                mdata, ("properties", field_name), "selected-by-default", True),
            selected_by_default, mdata)

        catalog_entries.append(
            CatalogEntry(
                schema=Schema.from_dict(schema),
                key_properties=['_sdc_record_hash'],
                stream=report['name'],
                tap_stream_id=report['name'],
                metadata=metadata.to_list(mdata),
            ))

    return Catalog(catalog_entries)
def discover_catalog(mysql_conn: Dict, dbs: str = None, tables: Optional[str] = None):
    """Returns a Catalog describing the structure of the database."""
    if dbs:
        filter_dbs_clause = ",".join(
            [f"'{db_name}'" for db_name in dbs.split(",")])
        table_schema_clause = f"WHERE table_schema IN ({filter_dbs_clause})"
    else:
        table_schema_clause = """
        WHERE table_schema NOT IN (
            'information_schema',
            'performance_schema',
            'mysql',
            'sys'
        )"""

    tables_clause = ''
    if tables is not None and tables != '':
        filter_tables_clause = ",".join(
            [f"'{table_name}'" for table_name in tables.split(",")])
        tables_clause = f" AND table_name IN ({filter_tables_clause})"

    with connect_with_backoff(mysql_conn) as open_conn:
        with open_conn.cursor() as cur:
            cur.execute(f"""
            SELECT table_schema,
                   table_name,
                   table_type,
                   table_rows
                FROM information_schema.tables
                {table_schema_clause}{tables_clause}
            """)

            table_info = {}

            for (db_name, table, table_type, rows) in cur.fetchall():
                if db_name not in table_info:
                    table_info[db_name] = {}

                table_info[db_name][table] = {
                    'row_count': rows,
                    'is_view': table_type == 'VIEW'
                }

            cur.execute(f"""
            SELECT table_schema,
                   table_name,
                   column_name,
                   data_type,
                   character_maximum_length,
                   numeric_precision,
                   numeric_scale,
                   column_type,
                   column_key
                FROM information_schema.columns
                {table_schema_clause}{tables_clause}
                ORDER BY table_schema, table_name
            """)

            columns = []
            rec = cur.fetchone()
            while rec is not None:
                columns.append(Column(*rec))
                rec = cur.fetchone()

            entries = []
            for (k, cols) in itertools.groupby(
                    columns, lambda c: (c.table_schema, c.table_name)):
                cols = list(cols)
                (table_schema, table_name) = k
                schema = Schema(type='object',
                                properties={
                                    c.column_name: schema_for_column(c)
                                    for c in cols
                                })
                mdata = create_column_metadata(cols)
                md_map = metadata.to_map(mdata)

                md_map = metadata.write(md_map, (), 'database-name',
                                        table_schema)

                is_view = table_info[table_schema][table_name]['is_view']

                if table_schema in table_info and table_name in table_info[table_schema]:
                    row_count = table_info[table_schema][table_name].get(
                        'row_count')

                    if row_count is not None:
                        md_map = metadata.write(md_map, (), 'row-count',
                                                row_count)

                    md_map = metadata.write(md_map, (), 'is-view', is_view)

                column_is_key_prop = lambda c, s: (
                    c.column_key == 'PRI' and
                    s.properties[c.column_name].inclusion != 'unsupported')

                key_properties = [
                    c.column_name for c in cols if column_is_key_prop(c, schema)
                ]

                if not is_view:
                    md_map = metadata.write(md_map, (), 'table-key-properties',
                                            key_properties)

                entry = CatalogEntry(
                    table=table_name,
                    stream=table_name,
                    metadata=metadata.to_list(md_map),
                    tap_stream_id=common.generate_tap_stream_id(
                        table_schema, table_name),
                    schema=schema)

                entries.append(entry)

    return Catalog(entries)
def schema_for_column(column):  # pylint: disable=too-many-branches
    """Returns the Schema object for the given Column."""
    data_type = column.data_type.lower()
    column_type = column.column_type.lower()

    inclusion = 'available'
    # We want to automatically include all primary key columns
    if column.column_key.lower() == 'pri':
        inclusion = 'automatic'

    result = Schema(inclusion=inclusion)

    if data_type == 'bit' or column_type.startswith('tinyint(1)'):
        result.type = ['null', 'boolean']

    elif data_type in BYTES_FOR_INTEGER_TYPE:
        result.type = ['null', 'integer']
        bits = BYTES_FOR_INTEGER_TYPE[data_type] * 8
        if 'unsigned' in column.column_type:
            result.minimum = 0
            result.maximum = 2**bits - 1
        else:
            result.minimum = 0 - 2**(bits - 1)
            result.maximum = 2**(bits - 1) - 1

    elif data_type in FLOAT_TYPES:
        result.type = ['null', 'number']

    elif data_type == 'json':
        result.type = ['null', 'object']

    elif data_type == 'decimal':
        result.type = ['null', 'number']
        result.multipleOf = 10**(0 - column.numeric_scale)
        return result

    elif data_type in STRING_TYPES:
        result.type = ['null', 'string']
        result.maxLength = column.character_maximum_length

    elif data_type in DATETIME_TYPES:
        result.type = ['null', 'string']
        result.format = 'date-time'

    elif data_type == 'time':
        result.type = ['null', 'string']
        result.format = 'time'

    elif data_type in BINARY_TYPES:
        result.type = ['null', 'string']
        result.format = 'binary'

    elif data_type in SPATIAL_TYPES:
        result.type = ['null', 'object']
        result.format = 'spatial'

    else:
        result = Schema(None,
                        inclusion='unsupported',
                        description=f'Unsupported column type {column_type}')
    return result
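# A minimal, hypothetical illustration of schema_for_column above (not part of
# the original source). FakeColumn only mimics the attributes the function
# reads; the real Column type carries more fields. It also assumes 'decimal'
# is not listed in the module's integer/float type sets, so the decimal branch
# is reached.
import collections

FakeColumn = collections.namedtuple(
    'FakeColumn',
    ['data_type', 'column_type', 'column_key',
     'character_maximum_length', 'numeric_scale'])

# tinyint(1) is treated as a boolean; a primary key column is marked
# 'automatic' so it is always included in the schema.
flag_col = FakeColumn('tinyint', 'tinyint(1)', 'PRI', None, None)
# decimal(12,2) becomes a nullable number with multipleOf 0.01.
price_col = FakeColumn('decimal', 'decimal(12,2)', '', None, 2)

assert schema_for_column(flag_col).type == ['null', 'boolean']
assert schema_for_column(price_col).multipleOf == 0.01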
def schema_for_column(c):
    '''Returns the Schema object for the given Column.'''
    data_type = c.data_type.lower()
    # column_type = c.column_type.lower()

    inclusion = 'available'
    # We want to automatically include all primary key columns
    # if c.column_key.lower() == 'pri':
    #     inclusion = 'automatic'

    result = Schema(inclusion=inclusion)

    # if data_type == 'bit' or column_type.startswith('tinyint(1)'):
    #     result.type = ['null', 'boolean']

    if data_type in BYTES_FOR_INTEGER_TYPE:
        result.type = ['null', 'integer']
        bits = BYTES_FOR_INTEGER_TYPE[data_type] * 8
        # if 'unsigned' in c.column_type:
        #     result.minimum = 0
        #     result.maximum = 2 ** bits - 1
        # else:
        #     result.minimum = 0 - 2 ** (bits - 1)
        #     result.maximum = 2 ** (bits - 1) - 1

    elif data_type in FLOAT_TYPES:
        result.type = ['null', 'number']

    elif data_type == 'decimal':
        result.type = ['null', 'number']
        result.multipleOf = 10**(0 - c.numeric_scale)
        return result

    elif data_type in STRING_TYPES:
        result.type = ['null', 'string']
        result.maxLength = c.character_maximum_length

    elif data_type in DATETIME_TYPES:
        result.type = ['null', 'string']
        result.format = 'date-time'

    else:
        result = Schema(None,
                        inclusion='unsupported',
                        description='Unsupported column type')
    return result
def discover(conn, config):
    with connect_with_backoff(conn) as open_conn:
        with open_conn.cursor() as cur:
            cur.execute("""
                SELECT TABLE_CATALOG, TABLE_SCHEMA, TABLE_NAME, TABLE_TYPE
                FROM INFORMATION_SCHEMA.TABLES
            """)

            table_info = {}
            schemas = cur.fetchall()
            for (db, schema, table, table_type) in schemas:
                if db not in table_info:
                    table_info[db] = {}
                if schema not in table_info[db]:
                    table_info[db][schema] = {}

                table_info[db][schema][table] = {
                    # 'row_count': rows,
                    'is_view': table_type == 'VIEW'
                }

            cur.execute("""
                SELECT C.TABLE_SCHEMA,
                       C.TABLE_NAME,
                       C.COLUMN_NAME,
                       C.DATA_TYPE,
                       C.CHARACTER_MAXIMUM_LENGTH,
                       C.NUMERIC_PRECISION,
                       C.NUMERIC_PRECISION,
                       TC.CONSTRAINT_TYPE
                FROM INFORMATION_SCHEMA.COLUMNS C
                LEFT JOIN INFORMATION_SCHEMA.CONSTRAINT_COLUMN_USAGE CCU
                    ON C.COLUMN_NAME = CCU.COLUMN_NAME
                LEFT JOIN INFORMATION_SCHEMA.TABLE_CONSTRAINTS TC
                    ON CCU.CONSTRAINT_NAME = TC.CONSTRAINT_NAME
                ORDER BY C.TABLE_SCHEMA, C.TABLE_NAME
            """)

            # res = cur.fetchall()
            columns = []
            rec = cur.fetchone()
            while rec is not None:
                columns.append(Column(*rec))
                rec = cur.fetchone()

            entries = []
            for (k, cols) in itertools.groupby(
                    columns, lambda c: (c.table_schema, c.table_name)):
                cols = list(cols)
                (table_schema, table_name) = k
                schema = Schema(type='object',
                                properties={
                                    c.column_name: schema_for_column(c)
                                    for c in cols
                                })
                md = create_column_metadata(cols)
                md_map = metadata.to_map(md)

                md_map = metadata.write(md_map, (), 'database-name',
                                        table_schema)

                is_view = table_info[db][table_schema][table_name]['is_view']

                if table_schema in table_info and table_name in table_info[table_schema]:
                    row_count = table_info[table_schema][table_name].get(
                        'row_count')

                    if row_count is not None:
                        md_map = metadata.write(md_map, (), 'row-count',
                                                row_count)

                    md_map = metadata.write(md_map, (), 'is-view', is_view)

                column_is_key_prop = lambda c, s: (
                    c.constraint_type == 'PRI' and
                    s.properties[c.column_name].inclusion != 'unsupported')

                key_properties = [
                    c.column_name for c in cols if column_is_key_prop(c, schema)
                ]

                if not is_view:
                    md_map = metadata.write(md_map, (), 'table-key-properties',
                                            key_properties)

                entry = CatalogEntry(
                    table=table_name,
                    stream=table_name,
                    metadata=metadata.to_list(md_map),
                    tap_stream_id=common.generate_tap_stream_id(
                        table_schema, table_name),
                    schema=schema)

                entries.append(entry)

    return Catalog(entries)

    # NOTE: everything below is unreachable; it appears to be leftover
    # scaffolding from the singer tap template.
    raw_schemas = load_schemas()
    streams = []
    for schema_name, schema in raw_schemas.items():
        # TODO: populate any metadata and stream's key properties here..
        stream_metadata = []
        stream_key_properties = []

        # create and add catalog entry
        catalog_entry = {
            'stream': schema_name,
            'tap_stream_id': schema_name,
            'schema': schema,
            'metadata': [],
            'key_properties': []
        }
        streams.append(catalog_entry)

    return {'streams': streams}
def schema_for_column(column):  # pylint: disable=too-many-branches
    """Returns the Schema object for the given Column."""
    data_type = column.data_type.lower()
    column_type = column.column_type.lower()

    inclusion = "available"
    # We want to automatically include all primary key columns
    if column.column_key.lower() == "pri":
        inclusion = "automatic"

    result = Schema(inclusion=inclusion)

    if data_type == "bit" or column_type.startswith("tinyint(1)"):
        result.type = ["null", "boolean"]

    elif data_type in BYTES_FOR_INTEGER_TYPE:
        result.type = ["null", "integer"]
        bits = BYTES_FOR_INTEGER_TYPE[data_type] * 8
        if "unsigned" in column.column_type:
            result.minimum = 0
            result.maximum = 2 ** bits - 1
        else:
            result.minimum = 0 - 2 ** (bits - 1)
            result.maximum = 2 ** (bits - 1) - 1

    elif data_type in FLOAT_TYPES:
        result.type = ["null", "number"]

    elif data_type == "json":
        result.type = ["null", "object"]

    elif data_type == "decimal":
        result.type = ["null", "number"]
        result.multipleOf = 10 ** (0 - column.numeric_scale)
        return result

    elif data_type in STRING_TYPES:
        result.type = ["null", "string"]
        if data_type in ("longtext", "mediumtext"):
            result.maxLength = 65535
        else:
            result.maxLength = column.character_maximum_length

    elif data_type in DATETIME_TYPES:
        result.type = ["null", "string"]
        result.format = "date-time"

    elif data_type == "time":
        result.type = ["null", "string"]
        result.format = "time"

    elif data_type in BINARY_TYPES:
        result.type = ["null", "string"]
        result.format = "binary"
        if data_type in ("longblob", "blob"):
            result.maxLength = 65535

    else:
        result = Schema(
            None,
            inclusion="unsupported",
            description=f"Unsupported column type {column_type}",
        )
    return result
def load_static_schemas(streams):
    """ Load default schemas for all streams """
    for stream in streams:
        LOGGER.info('Loading schema for %s', stream.tap_stream_id)
        stream.discovered_schema.update(load_static_schema(stream))


STATIC_STREAMS = "members pipes cards tables".split()

catalog_entries = [
    CatalogEntry(tap_stream_id=stream,
                 stream=stream,
                 key_properties=["id"],
                 schema=Schema.from_dict(load_static_schema(stream)))
    for stream in STATIC_STREAMS
]

CATALOG = Catalog(catalog_entries)

LOGGER.info("There are %s static streams", len(CATALOG.streams))
LOGGER.info("STREAMS: %s", [stream.stream for stream in CATALOG.streams])


def get_schema_for_table(table):
    """ Given a table object, output its schema """
    schema = {"type": "object", "properties": {}, "required": []}
    table_fields = table.get("table_fields", [])
def schema(self):
    if self._schema is None:
        self._schema = Schema.from_dict(self._client.item_schema(self.api))
    return self._schema
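# A self-contained sketch (not from the original source) of the lazy schema
# caching used by the schema() method above. FakeClient and the 'contacts'
# api name are invented for illustration; the real client's item_schema() is
# assumed to return a JSON-schema dict for the given endpoint.
from singer.schema import Schema


class FakeClient:
    def item_schema(self, api):
        # Pretend the API returned a minimal JSON schema for this endpoint.
        return {'properties': {'id': {'type': 'string'}}}


class FakeStream:
    api = 'contacts'

    def __init__(self, client):
        self._client = client
        self._schema = None

    # Same shape as the schema() method above: build once, then reuse.
    @property
    def schema(self):
        if self._schema is None:
            self._schema = Schema.from_dict(self._client.item_schema(self.api))
        return self._schema


stream = FakeStream(FakeClient())
assert stream.schema.properties['id'].type == 'string'
# The second access reuses the cached Schema instead of calling the client again.
assert stream.schema is stream.schema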
def schema_for_column(col_info: Column, table_pks: List[str]):
    data_type = col_info.sql_data_type

    result = Schema()

    if data_type in INTEGER_TYPES:
        result.type = nullable_column(col_info.column_name, 'integer',
                                      table_pks)
        result.minimum = -1 * (2**(col_info.numeric_precision - 1))
        result.maximum = 2**(col_info.numeric_precision - 1)
        return result

    elif data_type in DATE_TIME_TYPES:
        result.type = nullable_column(col_info.column_name, 'string',
                                      table_pks)
        if data_type == 'time':
            result.format = 'time'
        else:
            result.format = 'date-time'
        return result

    elif data_type in FLOAT_TYPES:
        result.type = nullable_column(col_info.column_name, 'number',
                                      table_pks)
        result.exclusiveMaximum = True
        result.exclusiveMinimum = True
        result.maximum = 10**(col_info.numeric_precision -
                              col_info.numeric_scale)
        result.minimum = -10**(col_info.numeric_precision -
                               col_info.numeric_scale)
        result.multipleOf = 10**(0 - col_info.numeric_scale)
        return result

    elif data_type == 'bit':
        result.type = nullable_column(col_info.column_name, 'boolean',
                                      table_pks)
        return result

    elif data_type in STRING_TYPES:
        result.type = nullable_column(col_info.column_name, 'string',
                                      table_pks)
        result.maxLength = col_info.character_maximum_length
        return result

    elif data_type in INFERRED_STRING_TYPES:
        result.type = nullable_column(col_info.column_name, 'string',
                                      table_pks)
        return result

    return Schema(None)