def test_decimal_unsigned(self):
    """An unsigned DECIMAL(5,2) column maps to a nullable number with 0.01 granularity."""
    expected_schema = Schema(['null', 'number'],
                             inclusion='available',
                             multipleOf=0.01)
    self.assertEqual(self.schema.properties['c_decimal_2_unsigned'],
                     expected_schema)
    expected_metadata = {
        'selected-by-default': True,
        'sql-datatype': 'decimal(5,2) unsigned'
    }
    self.assertEqual(self.get_metadata_for_column('c_decimal_2_unsigned'),
                     expected_metadata)
def test_bigint(self):
    """Signed BIGINT maps to a nullable integer bounded to the signed 64-bit range."""
    expected_schema = Schema(['null', 'integer'],
                             inclusion='available',
                             minimum=-9223372036854775808,
                             maximum=9223372036854775807)
    self.assertEqual(self.schema.properties['c_bigint'], expected_schema)
    expected_metadata = {
        'selected-by-default': True,
        'sql-datatype': 'bigint(20)'
    }
    self.assertEqual(self.get_metadata_for_column('c_bigint'),
                     expected_metadata)
def test_bigint_unsigned(self):
    """Unsigned BIGINT maps to a nullable integer bounded to the unsigned 64-bit range."""
    expected_schema = Schema(['null', 'integer'],
                             inclusion='available',
                             minimum=0,
                             maximum=18446744073709551615)
    self.assertEqual(self.schema.properties['c_bigint_unsigned'],
                     expected_schema)
    expected_metadata = {
        'selected-by-default': True,
        'sql-datatype': 'bigint(20) unsigned'
    }
    self.assertEqual(self.get_metadata_for_column('c_bigint_unsigned'),
                     expected_metadata)
def test_int(self):
    """Signed INT maps to a nullable integer bounded to the signed 32-bit range."""
    expected_schema = Schema(['null', 'integer'],
                             inclusion='available',
                             minimum=-2147483648,
                             maximum=2147483647)
    self.assertEqual(self.schema.properties['c_int'], expected_schema)
    expected_metadata = {
        'selected-by-default': True,
        'sql-datatype': 'int(11)'
    }
    self.assertEqual(self.get_metadata_for_column('c_int'), expected_metadata)
def test_mediumint(self):
    """MEDIUMINT maps to a nullable integer bounded to the signed 24-bit range."""
    expected_schema = Schema(['null', 'integer'],
                             inclusion='available',
                             minimum=-8388608,
                             maximum=8388607)
    self.assertEqual(self.schema.properties['c_mediumint'], expected_schema)
    expected_metadata = {
        'selected-by-default': True,
        'sql-datatype': 'mediumint(9)'
    }
    self.assertEqual(self.get_metadata_for_column('c_mediumint'),
                     expected_metadata)
def test_smallint(self):
    """SMALLINT maps to a nullable integer bounded to the signed 16-bit range."""
    expected_schema = Schema(['null', 'integer'],
                             inclusion='available',
                             minimum=-32768,
                             maximum=32767)
    self.assertEqual(self.schema.properties['c_smallint'], expected_schema)
    expected_metadata = {
        'selected-by-default': True,
        'sql-datatype': 'smallint(6)'
    }
    self.assertEqual(self.get_metadata_for_column('c_smallint'),
                     expected_metadata)
def load_schemas():
    """Read every file under the schemas folder into a dict of singer Schemas,
    keyed by the filename with any '.json' suffix stripped."""
    schemas = {}
    schema_dir = get_abs_path('schemas')
    for filename in os.listdir(schema_dir):
        stream_name = filename.replace('.json', '')
        with open(schema_dir + '/' + filename) as schema_file:
            schemas[stream_name] = Schema.from_dict(json.load(schema_file))
    return schemas
def load_schemas() -> Dict[str, Any]:
    """Read every file under the schemas folder into a dict of singer Schemas,
    keyed by the filename with any '.json' suffix stripped."""
    folder = _get_abs_path("schemas")
    schemas = {}
    for entry in os.listdir(folder):
        stream_name = entry.replace(".json", "")
        with open(folder + "/" + entry) as handle:
            schemas[stream_name] = Schema.from_dict(json.load(handle))
    return schemas
def discover_catalog(connection):
    """Build a singer Catalog describing every column visible through
    information_schema on the given DB-API connection.

    Key/replication settings are inferred from the table name:
    tables containing "events" get UUID / EVENT_TIME, tables containing
    "merge" get no key properties but MERGE_EVENT_TIME, everything else
    gets neither. Replication method is always INCREMENTAL.
    """
    cursor = connection.cursor()
    cursor.execute(""" SELECT table_schema, table_name, column_name, data_type, character_maximum_length, numeric_precision, numeric_scale FROM information_schema.columns WHERE table_schema != 'INFORMATION_SCHEMA' ORDER BY table_schema, table_name """)
    columns = []
    rec = cursor.fetchone()
    while rec is not None:
        columns.append(Column(*rec))
        rec = cursor.fetchone()
    entries = []
    # itertools.groupby only groups adjacent rows with equal keys; the
    # ORDER BY in the query above guarantees that adjacency.
    for (k, cols) in itertools.groupby(columns, lambda c: (c.table_schema, c.table_name)):
        cols = list(cols)
        (table_schema, table_name) = k
        schema = Schema(type='object',
                        properties={c.column_name: schema_for_column(c)
                                    for c in cols})
        md = create_column_metadata(cols)
        md_map = metadata.to_map(md)
        # Name-based heuristics for keys; see docstring.
        if "events" in table_name.lower():
            key_properties = ['UUID']
            replication_key = "EVENT_TIME"
        elif "merge" in table_name.lower():
            key_properties = []
            replication_key = "MERGE_EVENT_TIME"
        else:
            replication_key = ""
            key_properties = []
        md_map = metadata.write(md_map, (), 'table-key-properties',
                                key_properties)
        entry = CatalogEntry(
            stream=table_name,
            metadata=metadata.to_list(md_map),
            tap_stream_id=table_schema + "-" + table_name,
            replication_key=replication_key,  # This is a non-discoverable key.
            replication_method="INCREMENTAL",  # This is a non-discoverable key.
            schema=schema)
        entries.append(entry)
    return Catalog(entries)
def test_should_output_records(self, mock_stdout, requests_mock):
    """End-to-end sync of a selected "users" stream: page 0 returns two users,
    page 1 is empty (stop condition); expects one SCHEMA message, two RECORD
    messages on stdout, and the request/paging log lines in order.
    """
    # First page has data, second page is empty so pagination stops.
    requests_mock.get(
        "https://api.nikabot.com/api/v1/users?limit=1000&page=0",
        json=json.loads(USERS_RESPONSE))
    requests_mock.get(
        "https://api.nikabot.com/api/v1/users?limit=1000&page=1",
        json=json.loads(EMPTY_RESPONSE))
    config = {"access_token": "my-access-token", "page_size": 1000}
    state = {}
    # Stream is marked selected via its top-level metadata entry.
    catalog = Catalog(streams=[
        CatalogEntry(
            tap_stream_id="users",
            stream="users",
            schema=Schema.from_dict({}),
            key_properties=["id"],
            metadata=[{
                "breadcrumb": [],
                "metadata": {
                    "selected": True
                }
            }],
        )
    ])
    sync(config, state, catalog)
    # Exact singer messages written to stdout, in emission order.
    assert mock_stdout.mock_calls == [
        call(
            '{"type": "SCHEMA", "stream": "users", "schema": {}, "key_properties": ["id"]}\n'
        ),
        call(
            '{"type": "RECORD", "stream": "users", "record": {"id": "5de459977292020014fb601c", "name": "Billy", "deleted": true, "presence": "away", "user_id": "UR5B0QABX", "team_id": "T034F9NPW", "is_restricted": false, "is_ultra_restricted": false, "is_admin": false, "is_nikabot_admin": false, "tz": "Australia/Canberra", "tz_label": "Australian Eastern Standard Time", "tz_offset": 36000, "is_checkin_excluded": true, "created_at": "2019-12-02T00:23:51.087", "groups": [], "updated_at": "2020-06-14T22:47:29.617"}, "time_extracted": "2020-01-01T00:00:00.000000Z"}\n'
        ),
        call(
            '{"type": "RECORD", "stream": "users", "record": {"id": "68QMxnnt8YcpPdfmM", "name": "paul.heasley", "deleted": false, "presence": "active", "user_id": "U04AX35QP", "team_id": "T034F9NPW", "is_restricted": false, "is_ultra_restricted": false, "is_admin": false, "is_nikabot_admin": true, "tz": "Australia/Canberra", "tz_label": "Australian Eastern Standard Time", "tz_offset": 36000, "is_checkin_excluded": false, "create_date": "2019-09-02T05:13:47.88", "created_at": "2019-09-02T05:13:47.882", "role": "0.1", "groups": ["TA Stream", "TA Squad 1", "TA Squad 2", "TA Squad 3", "TA Squad 4", "Learning Applications", "Notification Capability"], "updated_at": "2020-06-15T06:07:58.272"}, "time_extracted": "2020-01-01T00:00:00.000000Z"}\n'
        ),
    ]
    # One log line for the stream plus one per HTTP request made.
    assert LOGGER.info.mock_calls == [
        call("Syncing stream: %s", "users"),
        call(
            "Making %s request to %s with params %s",
            "GET",
            "https://api.nikabot.com/api/v1/users",
            {
                "limit": "1000",
                "page": "0"
            },
        ),
        call(
            "Making %s request to %s with params %s",
            "GET",
            "https://api.nikabot.com/api/v1/users",
            {
                "limit": "1000",
                "page": "1"
            },
        ),
    ]
def test_bit(self):
    """BIT columns map to a nullable boolean."""
    expected_schema = Schema(["null", "boolean"], inclusion="available")
    self.assertEqual(self.schema.properties["c_bit"], expected_schema)
    expected_metadata = {
        "selected-by-default": True,
        "sql-datatype": "bit(4)"
    }
    self.assertEqual(self.get_metadata_for_column("c_bit"), expected_metadata)
def test_double(self):
    """DOUBLE columns map to a nullable number with no precision constraint."""
    expected_schema = Schema(["null", "number"], inclusion="available")
    self.assertEqual(self.schema.properties["c_double"], expected_schema)
    expected_metadata = {
        "selected-by-default": True,
        "sql-datatype": "double"
    }
    self.assertEqual(self.get_metadata_for_column("c_double"),
                     expected_metadata)
def test_time(self):
    """TIME columns map to a nullable date-time formatted string."""
    expected_schema = Schema(['null', 'string'],
                             format='date-time',
                             inclusion='available')
    self.assertEqual(self.schema.properties['c_time'], expected_schema)
    expected_metadata = {
        'selected-by-default': True,
        'sql-datatype': 'time'
    }
    self.assertEqual(self.get_metadata_for_column('c_time'),
                     expected_metadata)
def test_decimal_with_defined_scale_and_precision(self):
    """DECIMAL(11,2) maps to a nullable number with 0.01 granularity."""
    expected_schema = Schema(["null", "number"],
                             inclusion="available",
                             multipleOf=0.01)
    self.assertEqual(self.schema.properties["c_decimal_2"], expected_schema)
    expected_metadata = {
        "selected-by-default": True,
        "sql-datatype": "decimal(11,2)"
    }
    self.assertEqual(self.get_metadata_for_column("c_decimal_2"),
                     expected_metadata)
def test_decimal_unsigned(self):
    """Unsigned DECIMAL(5,2) maps to a nullable number with 0.01 granularity."""
    expected_schema = Schema(["null", "number"],
                             inclusion="available",
                             multipleOf=0.01)
    self.assertEqual(self.schema.properties["c_decimal_2_unsigned"],
                     expected_schema)
    expected_metadata = {
        "selected-by-default": True,
        "sql-datatype": "decimal(5,2) unsigned"
    }
    self.assertEqual(self.get_metadata_for_column("c_decimal_2_unsigned"),
                     expected_metadata)
def test_decimal_with_defined_scale_and_precision(self):
    """DECIMAL(11,2) maps to a nullable number with exclusive +/-1e9 bounds
    and 0.01 granularity."""
    expected_schema = Schema(['null', 'number'],
                             inclusion='available',
                             maximum=1000000000,
                             exclusiveMaximum=True,
                             minimum=-1000000000,
                             exclusiveMinimum=True,
                             multipleOf=0.01)
    self.assertEqual(self.schema.properties['c_decimal_2'], expected_schema)
    expected_metadata = {
        'selected-by-default': True,
        'sql-datatype': 'decimal(11,2)'
    }
    self.assertEqual(self.get_metadata_for_column('c_decimal_2'),
                     expected_metadata)
def test_tinyint_1_unsigned(self):
    """Unsigned TINYINT(1) maps to a nullable boolean."""
    expected_schema = Schema(["null", "boolean"], inclusion="available")
    self.assertEqual(self.schema.properties["c_tinyint_1_unsigned"],
                     expected_schema)
    expected_metadata = {
        "selected-by-default": True,
        "sql-datatype": "tinyint(1) unsigned"
    }
    self.assertEqual(self.get_metadata_for_column("c_tinyint_1_unsigned"),
                     expected_metadata)
def discover(client, custom_reports):
    """Build the tap Catalog: one entry per statically defined stream in
    STREAMS, plus one entry per user-configured custom report.

    Args:
        client: API client used to derive the schema of each custom report.
        custom_reports: optional iterable of report config dicts, each with a
            'stream_id' and optional 'key_properties' /
            'valid_replication_keys'.

    Returns:
        singer Catalog covering all streams.
    """
    raw_schemas = load_schemas()
    streams = []
    for stream_id, schema in raw_schemas.items():
        stream_instance = STREAMS[stream_id]
        stream_metadata = metadata.get_standard_metadata(
            schema=schema.to_dict(),
            key_properties=stream_instance.key_properties,
            valid_replication_keys=stream_instance.replication_key,
            replication_method=stream_instance.replication_method)
        streams.append(
            CatalogEntry(
                tap_stream_id=stream_id,
                stream=stream_id,
                schema=schema,
                key_properties=stream_instance.key_properties,
                metadata=stream_metadata,
                replication_key=stream_instance.replication_key,
                is_view=None,
                database=None,
                table=None,
                row_count=None,
                stream_alias=None,
                replication_method=stream_instance.replication_method,
            ))
    if custom_reports:
        for report in custom_reports:
            # Custom report schemas are derived dynamically from the API.
            schema = build_schema(client, report)
            schema = Schema.from_dict(schema)
            key_properties = report.get('key_properties')
            replication_key = report.get('valid_replication_keys')
            stream_metadata = metadata.get_standard_metadata(
                schema=schema.to_dict(),
                key_properties=key_properties,
                valid_replication_keys=replication_key,
                replication_method=None)
            streams.append(
                CatalogEntry(
                    tap_stream_id=report['stream_id'],
                    stream=report['stream_id'],
                    schema=schema,
                    key_properties=report.get('key_properties'),
                    metadata=stream_metadata,
                    replication_key=report.get('valid_replication_keys'),
                    is_view=None,
                    database=None,
                    table=None,
                    row_count=None,
                    # NOTE(review): this passes the whole report dict as the
                    # alias, while the static entries above use None — confirm
                    # a string (e.g. report['stream_id']) wasn't intended.
                    stream_alias=report,
                    replication_method=None,
                ))
    return Catalog(streams)
def test_time(self):
    """TIME columns map to a nullable date-time formatted string."""
    expected_schema = Schema(["null", "string"],
                             format="date-time",
                             inclusion="available")
    self.assertEqual(self.schema.properties["c_time"], expected_schema)
    expected_metadata = {
        "selected-by-default": True,
        "sql-datatype": "time"
    }
    self.assertEqual(self.get_metadata_for_column("c_time"),
                     expected_metadata)
def produce_column_metadata(connection, table_info, table_schema, table_name,
                            pk_constraints, column_schemas, cols):
    """Build the singer metadata map for one table.

    Writes table-level entries (key properties, schema/database names,
    is-view, row-count when known) and per-column entries (sql-datatype,
    inclusion, selected-by-default). PK columns are 'automatic', columns
    with an unsupported type are 'unsupported' and deselected, everything
    else is 'available' and selected by default.

    Returns the metadata map (dict keyed by breadcrumb tuples).
    """
    mdata = {}
    table_pks = pk_constraints.get(table_schema, {}).get(table_name, [])
    #NB> sadly, some system tables like XDB$STATS have P constraints for columns that do not exist so we must protect against this
    table_pks = list(
        filter(
            lambda pk: column_schemas.get(pk, Schema(None)).type is not None,
            table_pks))
    database_name = get_database_name(connection)
    # metadata.write mutates mdata in place; the early calls discard the
    # returned map and rely on that mutation.
    metadata.write(mdata, (), 'table-key-properties', table_pks)
    metadata.write(mdata, (), 'schema-name', table_schema)
    metadata.write(mdata, (), 'database-name', database_name)
    if table_schema in table_info and table_name in table_info[table_schema]:
        metadata.write(mdata, (), 'is-view',
                       table_info[table_schema][table_name]['is_view'])
        row_count = table_info[table_schema][table_name].get('row_count')
        if row_count is not None:
            metadata.write(mdata, (), 'row-count', row_count)
    for c in cols:
        c_name = c.column_name
        # Write the data_type or "None" when the column has no datatype
        metadata.write(mdata, ('properties', c_name), 'sql-datatype',
                       (c.data_type or "None"))
        if column_schemas[c_name].type is None:
            # Type could not be mapped: exclude unless explicitly selected.
            mdata = metadata.write(mdata, ('properties', c_name), 'inclusion',
                                   'unsupported')
            mdata = metadata.write(mdata, ('properties', c_name),
                                   'selected-by-default', False)
        elif c_name in pk_constraints.get(table_schema, {}).get(table_name, []):
            # Note: checks the *unfiltered* PK list here, not table_pks.
            mdata = metadata.write(mdata, ('properties', c_name), 'inclusion',
                                   'automatic')
            mdata = metadata.write(mdata, ('properties', c_name),
                                   'selected-by-default', True)
        else:
            mdata = metadata.write(mdata, ('properties', c_name), 'inclusion',
                                   'available')
            mdata = metadata.write(mdata, ('properties', c_name),
                                   'selected-by-default', True)
    return mdata
def mock_catalog():
    """Return a single-stream Catalog ("records"): selected, keyed on "id",
    replicated incrementally on "date"."""
    records_entry = CatalogEntry(
        tap_stream_id="records",
        stream="records",
        schema=Schema.from_dict(json.loads(SCHEMA)),
        key_properties=["id"],
        metadata=[{"breadcrumb": [], "metadata": {"selected": True}}],
        replication_key="date",
        replication_method="INCREMENTAL",
    )
    return Catalog(streams=[records_entry])
def generate_schema(table_spec, samples):
    """Infer a Schema from sampled rows, add the _smart_source_* bookkeeping
    columns, then apply any per-table overrides from the table spec."""
    bookkeeping_schema = {
        '_smart_source_bucket': {'type': 'string'},
        '_smart_source_file': {'type': 'string'},
        '_smart_source_lineno': {'type': 'integer'},
    }
    data_schema = conversion.generate_schema(
        samples,
        prefer_number_vs_integer=table_spec.get('prefer_number_vs_integer',
                                                False))
    inferred = {
        'type': 'object',
        'properties': merge_dicts(data_schema, bookkeeping_schema),
    }
    return Schema.from_dict(override_schema_with_config(inferred, table_spec))
def build_schema(query_resource):
    """Schema for one report query: each groupBy column is a nullable string,
    each metric a nullable number; no other properties are allowed."""
    params = query_resource['params']
    properties = {}
    for group_by in params['groupBys']:
        properties[group_by] = {'type': ['null', 'string']}
    for metric in params['metrics']:
        properties[metric] = {'type': ['null', 'number']}
    return Schema.from_dict({
        'type': ['null', 'object'],
        'additionalProperties': False,
        'properties': properties,
    })
def test_int(self):
    """Signed INT maps to a nullable integer bounded to the signed 32-bit range."""
    expected_schema = Schema(["null", "integer"],
                             inclusion="available",
                             minimum=-2147483648,
                             maximum=2147483647)
    self.assertEqual(self.schema.properties["c_int"], expected_schema)
    expected_metadata = {
        "selected-by-default": True,
        "sql-datatype": "int(11)"
    }
    self.assertEqual(self.get_metadata_for_column("c_int"), expected_metadata)
def test_bigint_unsigned(self):
    """Unsigned BIGINT maps to a nullable integer bounded to the unsigned 64-bit range."""
    expected_schema = Schema(["null", "integer"],
                             inclusion="available",
                             minimum=0,
                             maximum=18446744073709551615)
    self.assertEqual(self.schema.properties["c_bigint_unsigned"],
                     expected_schema)
    expected_metadata = {
        "selected-by-default": True,
        "sql-datatype": "bigint(20) unsigned"
    }
    self.assertEqual(self.get_metadata_for_column("c_bigint_unsigned"),
                     expected_metadata)
def discover():
    """Assemble the Catalog from the statically defined schemas and the
    per-stream field metadata returned by get_schemas()."""
    schemas, field_metadata = get_schemas()
    catalog = Catalog([])
    for stream_name, schema_dict in schemas.items():
        catalog.streams.append(
            CatalogEntry(
                stream=stream_name,
                tap_stream_id=stream_name,
                key_properties=STREAMS[stream_name]['key_properties'],
                schema=Schema.from_dict(schema_dict),
                metadata=field_metadata[stream_name]))
    return catalog
def test_should_output_nothing_given_no_streams_selected(self, mock_stdout):
    """A catalog entry with no 'selected' metadata must be skipped:
    nothing on stdout, one skip log line."""
    config = {"access_token": "my-access-token", "page_size": 1000}
    unselected_entry = CatalogEntry(
        tap_stream_id="users",
        stream="users",
        schema=Schema.from_dict({}),
        key_properties=["id"],
        metadata=[],
    )
    sync(config, {}, Catalog(streams=[unselected_entry]))
    mock_stdout.assert_not_called()
    assert LOGGER.info.mock_calls == [call("Skipping stream: %s", "users")]
def resolve_catalog(discovered_catalog, streams_to_sync):
    """Project each requested stream onto what discovery actually found.

    For every entry in streams_to_sync, look up the same stream in the
    discovered catalog, keep only the columns that are selected (plus the
    replication key), and emit a new CatalogEntry restricted to those
    columns. Streams missing from discovery are logged and dropped.
    """
    result = Catalog(streams=[])
    for requested in streams_to_sync:
        stream_md = metadata.to_map(requested.metadata)
        replication_key = stream_md.get((), {}).get("replication-key")
        discovered = discovered_catalog.get_stream(requested.tap_stream_id)
        db_name = common.get_database_name(requested)
        if not discovered:
            # Selected but no longer present in the source database.
            LOGGER.warning(
                "Database %s table %s was selected but does not exist",
                db_name,
                requested.table,
            )
            continue
        # The replication key is always kept, selected or not.
        selected = {
            name
            for name in discovered.schema.properties
            if common.property_is_selected(requested, name)
            or name == replication_key
        }
        columns = desired_columns(selected, discovered.schema)
        projected_schema = Schema(
            type="object",
            properties={
                col: discovered.schema.properties[col]
                for col in columns
            },
        )
        result.streams.append(
            CatalogEntry(
                tap_stream_id=requested.tap_stream_id,
                metadata=requested.metadata,
                stream=requested.tap_stream_id,
                table=requested.table,
                schema=projected_schema,
            ))
    return result
def discover():
    """Build the Catalog for all supported streams.

    For each stream: load its JSON schema, emit one table-level metadata
    entry (key properties, schema name, selected), and one entry per
    property whose inclusion is 'automatic' for key properties and
    'start_date', 'available' otherwise.

    Returns:
        singer Catalog with one CatalogEntry per stream.
    """
    entries = []
    for stream in streams:
        schema = Schema.from_dict(stream.get_schema())
        key_properties = stream.key_properties
        # Bug fix: the table-level (breadcrumb []) entry was previously
        # appended inside the per-property loop, duplicating it once per
        # column. It belongs in the metadata exactly once per stream.
        stream_metadata = [{
            'breadcrumb': [],
            'metadata': {
                'inclusion': 'available',
                'table-key-properties': key_properties,
                'schema-name': stream.tap_stream_id,
                'selected': True,
            }
        }]
        for prop in schema.properties:
            # Key properties and the bookmark column must always be synced.
            inclusion = 'available'
            if prop in key_properties or prop == 'start_date':
                inclusion = 'automatic'
            stream_metadata.append({
                'breadcrumb': ['properties', prop],
                'metadata': {
                    'inclusion': inclusion
                }
            })
        entries.append(
            CatalogEntry(
                tap_stream_id=stream.tap_stream_id,
                stream=stream.tap_stream_id,
                schema=schema,
                key_properties=key_properties,
                metadata=stream_metadata,
                replication_key=None,
                is_view=None,
                database=None,
                table=None,
                row_count=None,
                stream_alias=None,
                replication_method=None,
            )
        )
    return Catalog(entries)
def load_schemas(config):
    """Load stream schemas from the configured schema directory.

    Args:
        config: tap config dict; config['schema_dir'] names the folder
            (resolved via get_abs_path) holding one JSON file per stream.

    Returns:
        dict mapping the filename stem (``.json`` stripped) to a singer
        Schema. Files that are not valid JSON are skipped with a warning;
        a missing/invalid directory yields an empty dict with a warning.
    """
    schemas = {}
    schema_dir_path = get_abs_path(config['schema_dir'])
    # Guard clause: nothing to load if the directory is absent.
    if not os.path.isdir(schema_dir_path):
        LOGGER.warning("%s : Is not a valid directory", schema_dir_path)
        return schemas
    for filename in os.listdir(schema_dir_path):
        # Reuse the already-resolved directory instead of resolving it
        # again for every file (the original re-called get_abs_path here).
        path = os.path.join(schema_dir_path, filename)
        file_raw = filename.replace('.json', '')
        if not os.path.isfile(path):
            continue
        with open(path) as file:
            try:
                schemas[file_raw] = Schema.from_dict(json.load(file))
            except json.decoder.JSONDecodeError as err:
                # Lazy %-style args: the message is only built if emitted.
                LOGGER.warning(
                    "Schema file : %s is invalid or not JSON : %s",
                    file_raw, err.msg)
    return schemas