def discover():
    """Build a catalog with one entry per configured endpoint.

    Returns:
        A ``Catalog`` holding a ``CatalogEntry`` for every
        ``(stream_name, endpoint_config)`` pair yielded by ``get_endpoints()``.
        The endpoint's ``'pk'`` value is used as the entry's key properties.
    """
    result = Catalog([])

    for stream_name, endpoint_config in get_endpoints():
        raw_schema, stream_metadata = get_schema(stream_name, endpoint_config)
        parsed_schema = Schema.from_dict(raw_schema)
        entry = CatalogEntry(
            stream=stream_name,
            tap_stream_id=stream_name,
            key_properties=endpoint_config['pk'],
            schema=parsed_schema,
            metadata=stream_metadata,
        )
        result.streams.append(entry)

    return result
def resolve_catalog(discovered_catalog, streams_to_sync):
    """Match the requested streams against what discovery actually found.

    Each entry in ``streams_to_sync`` is looked up in ``discovered_catalog``
    by ``tap_stream_id``. Entries with no discovered counterpart are logged
    with a warning and skipped. For every other entry a new ``CatalogEntry``
    is produced whose schema contains only the columns returned by
    ``desired_columns`` for the selected properties; the replication key
    (read from the stream-level metadata) always counts as selected.

    Args:
        discovered_catalog: catalog produced by discovery.
        streams_to_sync: iterable of catalog entries requested for sync.

    Returns:
        A new ``Catalog`` containing the resolved entries.
    """
    resolved = Catalog(streams=[])

    for requested in streams_to_sync:
        stream_md = metadata.to_map(requested.metadata)
        replication_key = stream_md.get((), {}).get("replication-key")

        found = discovered_catalog.get_stream(requested.tap_stream_id)
        database_name = common.get_database_name(requested)

        if not found:
            LOGGER.warning(
                "Database %s table %s was selected but does not exist",
                database_name,
                requested.table,
            )
            continue

        discovered_props = found.schema.properties

        # A property is kept when it is selected in the requested entry or
        # when it is the replication key.
        selected = set()
        for prop_name in discovered_props:
            if (common.property_is_selected(requested, prop_name)
                    or prop_name == replication_key):
                selected.add(prop_name)

        # These are the columns we need to select.
        columns = desired_columns(selected, found.schema)

        narrowed_schema = Schema(
            type="object",
            properties={col: discovered_props[col] for col in columns},
        )
        resolved.streams.append(
            CatalogEntry(
                tap_stream_id=requested.tap_stream_id,
                metadata=requested.metadata,
                stream=requested.tap_stream_id,
                table=requested.table,
                schema=narrowed_schema,
            )
        )

    return resolved
def discover(service):
    """Build a catalog with one entry per entity exposed by ``service``.

    Args:
        service: object whose ``entities`` mapping goes from entity name to an
            entity carrying an ``__odata_schema__`` attribute.

    Returns:
        A ``Catalog`` with one ``CatalogEntry`` per entity; schema, metadata
        and primary keys all come from ``get_schema``.
    """
    result = Catalog([])

    for entity_name, entity in service.entities.items():
        optionsets = get_optionset_metadata(service, entity_name)
        raw_schema, stream_metadata, primary_keys = get_schema(
            entity.__odata_schema__, optionsets)
        parsed_schema = Schema.from_dict(raw_schema)

        entry = CatalogEntry(
            stream=entity_name,
            tap_stream_id=entity_name,
            key_properties=primary_keys,
            schema=parsed_schema,
            metadata=stream_metadata,
        )
        result.streams.append(entry)

    return result
def discover(client):
    """Build a catalog with one entry per item in ``RESOURCES``.

    ``RESOURCES`` maps a resource name (used to fetch the schema) to the
    stream name used in the catalog. Every stream is keyed on ``'Id'``.

    Args:
        client: passed through to ``get_schema``.

    Returns:
        The populated ``Catalog``.
    """
    result = Catalog([])

    for resource_name, stream_name in RESOURCES.items():
        raw_schema, stream_metadata = get_schema(client, resource_name)
        parsed_schema = Schema.from_dict(raw_schema)
        entry = CatalogEntry(
            stream=stream_name,
            tap_stream_id=stream_name,
            key_properties=['Id'],
            schema=parsed_schema,
            metadata=stream_metadata,
        )
        result.streams.append(entry)

    return result
def discover(client):
    """Build a catalog from the schemas returned by ``get_schemas(client)``.

    For every stream, the metadata comes from the second value returned by
    ``get_schemas`` and the key properties from ``get_pk(stream_name)``.

    Args:
        client: passed through to ``get_schemas``.

    Returns:
        The populated ``Catalog``.
    """
    schemas, field_metadata = get_schemas(client)
    result = Catalog([])

    for stream_name in schemas:
        parsed_schema = Schema.from_dict(schemas[stream_name])
        stream_metadata = field_metadata[stream_name]
        primary_key = get_pk(stream_name)

        entry = CatalogEntry(
            stream=stream_name,
            tap_stream_id=stream_name,
            key_properties=primary_key,
            schema=parsed_schema,
            metadata=stream_metadata,
        )
        result.streams.append(entry)

    return result
def discover():
    """Build a catalog from ``get_schemas()`` and the ``STREAMS`` table.

    Returns:
        A ``Catalog`` with one entry per schema; key properties are read from
        ``STREAMS[schema_name]['key_properties']``.
    """
    schemas, schemas_metadata = get_schemas()
    entries = []

    for name, raw_schema in schemas.items():
        parsed_schema = Schema.from_dict(raw_schema)
        stream_metadata = schemas_metadata[name]
        entry = CatalogEntry(
            tap_stream_id=name,
            stream=name,
            schema=parsed_schema,
            key_properties=STREAMS[name]['key_properties'],
            metadata=stream_metadata,
        )
        entries.append(entry)

    return Catalog(entries)
def add_stream_to_catalog(catalog, ctx, stream):
    """Append a catalog entry for ``stream`` to ``catalog.streams``.

    The schema is loaded via ``load_schema``; standard metadata is generated
    from it and then every top-level property is marked with
    ``inclusion = 'automatic'``.

    Args:
        catalog: catalog whose ``streams`` list is mutated in place.
        ctx: passed through to ``load_schema``.
        stream: object exposing ``tap_stream_id`` and ``pk_fields``.
    """
    raw_schema = load_schema(ctx, stream.tap_stream_id)
    parsed_schema = Schema.from_dict(raw_schema)

    md_map = metadata.to_map(
        metadata.get_standard_metadata(raw_schema,
                                       key_properties=stream.pk_fields))
    for field_name in raw_schema['properties']:
        md_map = metadata.write(
            md_map, ('properties', field_name), 'inclusion', 'automatic')

    entry = CatalogEntry(
        stream=stream.tap_stream_id,
        tap_stream_id=stream.tap_stream_id,
        key_properties=stream.pk_fields,
        schema=parsed_schema,
        metadata=metadata.to_list(md_map),
    )
    catalog.streams.append(entry)
def discover():
    """Build a catalog with one entry per item in the module-level ``streams``.

    For each stream the metadata list contains exactly one stream-level
    record (empty breadcrumb) followed by one record per schema property.
    A property is marked ``'automatic'`` when it is a key property or is
    named ``'start_date'``; otherwise it is ``'available'``.

    Returns:
        A ``Catalog`` holding the generated entries.
    """
    entries = []

    for stream in streams:
        schema = Schema.from_dict(stream.get_schema())
        key_properties = stream.key_properties

        # The stream-level record is emitted once per stream. Previously it
        # was appended inside the property loop, which duplicated it for every
        # property and omitted it for a schema with no properties.
        stream_metadata = [{
            'breadcrumb': [],
            'metadata': {
                'inclusion': 'available',
                'table-key-properties': key_properties,
                'schema-name': stream.tap_stream_id,
                'selected': True,
            },
        }]

        # ``properties`` may be unset on a schema without fields.
        for prop in (schema.properties or {}):
            is_automatic = prop in key_properties or prop == 'start_date'
            stream_metadata.append({
                'breadcrumb': ['properties', prop],
                'metadata': {
                    'inclusion': 'automatic' if is_automatic else 'available',
                },
            })

        entries.append(
            CatalogEntry(
                tap_stream_id=stream.tap_stream_id,
                stream=stream.tap_stream_id,
                schema=schema,
                key_properties=key_properties,
                metadata=stream_metadata,
                replication_key=None,
                is_view=None,
                database=None,
                table=None,
                row_count=None,
                stream_alias=None,
                replication_method=None,
            )
        )

    return Catalog(entries)
def discover_streams(service, config):
    """Discover one stream per report listed for the configured profile.

    Reports are fetched through ``service.reports().list(...)``, ordered by
    their ``id``, and turned into catalog entries. Every schema property is
    marked ``inclusion = 'automatic'`` and the report id is stored in the
    stream-level metadata.

    Args:
        service: API client exposing ``reports().list(profileId=...)``.
        config: mapping from which ``'profile_id'`` is read.

    Returns:
        The catalog serialised with ``Catalog.to_dict()``.
    """
    profile_id = config.get('profile_id')
    response = service.reports().list(profileId=profile_id).execute()

    # The response may carry no 'items' key; treat that as "no reports"
    # instead of letting ``sorted(None)`` raise a TypeError.
    reports = sorted(response.get('items') or [], key=lambda x: x['id'])

    report_configs = {}
    for report in reports:
        stream_name = sanitize_name(report['name'])
        tap_stream_id = '{}_{}'.format(stream_name, report['id'])
        report_configs[(stream_name, tap_stream_id)] = report

    field_type_lookup = get_field_type_lookup()
    catalog = Catalog([])

    for (stream_name, tap_stream_id), report in report_configs.items():
        fieldmap = get_fields(field_type_lookup, report)
        schema_dict = get_schema(stream_name, fieldmap)
        schema = Schema.from_dict(schema_dict)

        stream_metadata = [{
            'metadata': {
                'tap-doubleclick-campaign-manager.report-id': report['id']
            },
            'breadcrumb': []
        }]
        stream_metadata.extend(
            {
                'metadata': {'inclusion': 'automatic'},
                'breadcrumb': ['properties', prop],
            }
            for prop in schema_dict['properties']
        )

        catalog.streams.append(
            CatalogEntry(stream=stream_name,
                         stream_alias=stream_name,
                         tap_stream_id=tap_stream_id,
                         key_properties=[],
                         schema=schema,
                         metadata=stream_metadata))

    return catalog.to_dict()
def discover(ctx):
    """Build the catalog, leaving out the event-log stream when unavailable.

    Args:
        ctx: passed to ``has_access_to_event_log`` and ``schemas.load_schema``.

    Returns:
        A ``Catalog`` with one entry per id in ``streams_.stream_ids``. The
        ``schemas.IDS.EVENT_LOG`` stream is skipped when
        ``has_access_to_event_log(ctx)`` is falsy.
    """
    LOGGER.info("Running discover")
    event_log_allowed = has_access_to_event_log(ctx)
    result = Catalog([])

    for stream_id in streams_.stream_ids:
        if stream_id == schemas.IDS.EVENT_LOG and not event_log_allowed:
            continue

        loaded = schemas.load_schema(ctx, stream_id)
        parsed_schema = Schema.from_dict(loaded, inclusion="automatic")
        entry = CatalogEntry(
            stream=stream_id,
            tap_stream_id=stream_id,
            key_properties=schemas.PK_FIELDS[stream_id],
            schema=parsed_schema,
        )
        result.streams.append(entry)

    return result
def discover() -> Catalog:  # noqa: WPS210
    """Load the Stream catalog.

    Every schema returned by ``load_schemas()`` becomes one catalog entry
    keyed on ``id``. Replication settings are looked up in the per-stream
    settings dict, which only defines ``key_properties``, so they resolve
    to ``None``.

    Returns:
        Catalog -- The catalog
    """
    loaded: dict = load_schemas()
    entries: list = []

    for stream_id, schema in loaded.items():
        # Built fresh on every iteration so entries do not share one list.
        settings: dict = {'key_properties': ['id']}

        key_properties = settings.get('key_properties', None)
        replication_method = settings.get('replication_method', None)

        # Create metadata
        mdata: list = metadata.get_standard_metadata(
            schema=schema.to_dict(),
            key_properties=key_properties,
            valid_replication_keys=settings.get('replication_keys', None),
            replication_method=replication_method,
        )

        # Create a catalog entry
        entry = CatalogEntry(
            tap_stream_id=stream_id,
            stream=stream_id,
            schema=schema,
            key_properties=key_properties,
            metadata=mdata,
            replication_key=settings.get('replication_key', None),
            replication_method=replication_method,
        )
        entries.append(entry)

    return Catalog(entries)
def discover(ctx):
    """Refresh credentials, then build a catalog of all known streams.

    Args:
        ctx: object exposing ``refresh_credentials()``.

    Returns:
        A ``Catalog`` with one entry per item in ``streams.all_streams``.
    """
    ctx.refresh_credentials()
    result = Catalog([])

    for stream in streams.all_streams:
        stream_id = stream.tap_stream_id
        raw_schema = load_correct_schema(stream_id)
        stream_metadata = load_metadata(stream, raw_schema)
        parsed_schema = Schema.from_dict(raw_schema)

        entry = CatalogEntry(
            stream=stream_id,
            tap_stream_id=stream_id,
            key_properties=stream.pk_fields,
            schema=parsed_schema,
            metadata=stream_metadata,
        )
        result.streams.append(entry)

    return result
def discover():
    """Build a catalog from ``load_schemas()`` with generated metadata.

    Metadata for each stream is produced by ``gen_metadata`` using
    ``BASE_METADATA``; the entry's key properties are read back from the
    ``'key_properties'`` value stored at the empty breadcrumb.

    Returns:
        A ``Catalog`` holding one entry per loaded schema.
    """
    entries = []

    for stream_id, schema in load_schemas().items():
        generated = gen_metadata(stream_id, schema, BASE_METADATA)
        primary_keys = metadata.get(
            metadata.to_map(generated), (), 'key_properties')
        entry = CatalogEntry(
            tap_stream_id=stream_id,
            stream=stream_id,
            schema=schema,
            key_properties=primary_keys,
            metadata=generated,
        )
        entries.append(entry)

    return Catalog(entries)
def discover():
    """Build a catalog from ``get_schemas()`` and the ``STREAMS`` table.

    Returns:
        A ``Catalog`` with one entry per schema. Metadata comes from the
        second value returned by ``get_schemas``; key properties come from
        ``STREAMS[stream_name]['key_properties']``.
    """
    schemas, field_metadata = get_schemas()
    result = Catalog([])

    for name, raw_schema in schemas.items():
        parsed_schema = Schema.from_dict(raw_schema)
        stream_metadata = field_metadata[name]
        entry = CatalogEntry(
            stream=name,
            tap_stream_id=name,
            key_properties=STREAMS[name]['key_properties'],
            schema=parsed_schema,
            metadata=stream_metadata,
        )
        result.streams.append(entry)

    return result
def mock_catalog():
    """Return a catalog with a single selected ``records`` stream.

    The schema is parsed from the ``SCHEMA`` JSON string; the stream is keyed
    on ``id`` and replicated incrementally on ``date``.
    """
    selected_metadata = [{
        "breadcrumb": [],
        "metadata": {
            "selected": True
        }
    }]
    records_entry = CatalogEntry(
        tap_stream_id="records",
        stream="records",
        schema=Schema.from_dict(json.loads(SCHEMA)),
        key_properties=["id"],
        metadata=selected_metadata,
        replication_key="date",
        replication_method="INCREMENTAL",
    )
    return Catalog(streams=[records_entry])
def test_insights_start_dates(self):
    """Check the first and fifth job params produced from a bookmark.

    With a ``date_start`` bookmark of 2017-01-31, the first job is expected
    to cover 2017-01-03 and the fifth 2017-01-07.
    """
    entry = CatalogEntry(
        schema={'properties': {'something': {'type': 'object'}}},
        metadata=[{'breadcrumb': ('properties', 'something'),
                   'metadata': {'selected' : True}}])
    bookmark_state = {'bookmarks': {'insights': {'date_start': '2017-01-31'}}}

    stream = AdsInsights(
        name='insights',
        account=None,
        stream_alias="insights",
        options={},
        catalog_entry=entry,
        state=bookmark_state)

    first_five = list(itertools.islice(stream.job_params(), 5))

    self.assertEqual(first_five[0]['time_ranges'],
                     [{'since': '2017-01-03', 'until': '2017-01-03'}])
    self.assertEqual(first_five[4]['time_ranges'],
                     [{'since': '2017-01-07', 'until': '2017-01-07'}])
def discover(config):
    """Build a catalog with one entry per stream in ``STREAM_OBJECTS``.

    Args:
        config: passed to the ``Client`` constructor.

    Returns:
        A ``Catalog`` whose entries carry standard metadata generated from
        each stream's schema, keys and replication settings.
    """
    # NOTE(review): the client is constructed but not used below; kept in
    # case construction has side effects (e.g. config validation) — confirm.
    client = Client(config)

    entries = []
    for stream in STREAM_OBJECTS.values():
        stream_id = stream.tap_stream_id
        raw_schema = load_schema(stream_id)
        parsed_schema = Schema.from_dict(raw_schema)
        stream_metadata = metadata.get_standard_metadata(
            schema=raw_schema,
            schema_name=stream_id,
            key_properties=stream.pk_fields,
            valid_replication_keys=stream.replication_keys,
            replication_method=stream.replication_method)
        entries.append(
            CatalogEntry(stream=stream_id,
                         tap_stream_id=stream_id,
                         key_properties=stream.pk_fields,
                         schema=parsed_schema,
                         metadata=stream_metadata))

    return Catalog(entries)
def get_dynamic_streams():
    """Get dynamic table schemas.

    Fetches the organization named by ``CONFIG["organization_id"]`` and
    creates one catalog entry per table node found under
    ``tables -> edges -> node``. Each stream is named ``table_<id>`` and keyed
    on ``__id``.

    Returns:
        A list of ``CatalogEntry`` objects (empty when the organization has
        no ``tables`` section).
    """
    org = get_organization(CONFIG["organization_id"])

    # Default to a dict: the previous list default had no ``.get`` and raised
    # AttributeError whenever "tables" was absent from the organization.
    tables_section = org.pop("tables", None) or {}
    tables = [edge["node"] for edge in tables_section.get("edges", [])]

    entries = []
    for table in tables:
        stream = "table_{}".format(table["id"])
        entries.append(
            CatalogEntry(tap_stream_id=stream,
                         stream=stream,
                         key_properties=["__id"],
                         schema=get_schema_for_table(table)))

    LOGGER.info("There are %s tables (dynamic schemas)", len(tables))
    return entries
def test_two_pks(self):
    """Bookmark clause for a two-column key (integer, then string)."""
    schema = Schema.from_dict({
        'properties': {
            'id1': {
                'type': ['integer']
            },
            'str': {
                'type': ['string']
            }
        }
    })
    entry = CatalogEntry(schema=schema)

    pk_columns = ['id1', 'str']
    bookmark = {'id1': 4, 'str': 'apples'}

    expected = '(`id1` > 4) OR (`id1` = 4 AND `str` > \'apples\')'
    actual = generate_pk_bookmark_clause(pk_columns, bookmark, entry)

    self.assertEqual(expected, actual)
def test_three_pk_values_with_bookmark(self):
    """PK clause for a three-column key with both bookmark and max values."""
    schema = Schema.from_dict({
        'properties': {
            'id1': {
                'type': ['integer']
            },
            'id2': {
                'type': ['string']
            },
            'id3': {
                'type': ['integer']
            }
        }
    })
    key_metadata = [{
        'breadcrumb': (),
        'metadata': {
            'table-key-properties': ['id1', 'id2', 'id3']
        }
    }]
    entry = CatalogEntry(tap_stream_id='foo',
                         schema=schema,
                         metadata=key_metadata)

    foo_bookmark = {
        'last_pk_fetched': {
            'id1': 4,
            'id2': 6,
            'id3': 2
        },
        'max_pk_values': {
            'id1': 10,
            'id2': 8,
            'id3': 3
        }
    }
    state = {'bookmarks': {'foo': foo_bookmark}}

    expected = ' WHERE ((`id1` > 4) OR (`id1` = 4 AND `id2` > \'6\') OR (`id1` = 4 AND `id2` = \'6\' AND `id3` > 2)) AND `id1` <= 10 AND `id2` <= \'8\' AND `id3` <= 3 ORDER BY `id1`, `id2`, `id3` ASC'
    actual = generate_pk_clause(entry, state)

    self.assertEqual(expected, actual)
def discover():
    """Build a catalog from ``get_schemas()``, logging each stream.

    Key properties are looked up in the mapping returned by
    ``flatten_streams()``; a stream missing there gets ``None``.

    Returns:
        The populated ``Catalog``.
    """
    schemas, field_metadata = get_schemas()
    result = Catalog([])
    stream_settings = flatten_streams()

    for name, raw_schema in schemas.items():
        LOGGER.info('discover schema for stream: {}'.format(name))
        parsed_schema = Schema.from_dict(raw_schema)
        stream_metadata = field_metadata[name]
        primary_keys = stream_settings.get(name, {}).get(
            'key_properties', None)

        entry = CatalogEntry(
            stream=name,
            tap_stream_id=name,
            key_properties=primary_keys,
            schema=parsed_schema,
            metadata=stream_metadata,
        )
        result.streams.append(entry)

    LOGGER.info('Returning catalog: {}'.format(result))
    return result
def tabsInfo(sheetsService, row):
    """Create one catalog entry per tab of the spreadsheet described by ``row``.

    Names are normalised by lower-casing and removing spaces. The
    ``tap_stream_id`` is the ``?``-joined sequence
    ``<sheet id>?<sheet name>?<tab index>?<tab name>?<sheet name>_<tab name>``.

    Args:
        sheetsService: passed to ``makeRequestWithExponentialBackoff``.
        row: mapping with at least ``'id'`` and ``'name'``.

    Returns:
        A list of ``CatalogEntry`` objects, one per element of
        ``tabs["sheets"]``.
    """
    entries = []

    # The request is made under the rate limiter.
    with rate_limiter:
        tabs = makeRequestWithExponentialBackoff(sheetsService, row)

    for index, tab in enumerate(tabs["sheets"]):
        sheet_id = row['id']
        sheet_name = row['name'].lower().replace(" ", "")
        tab_index = str(index)
        tab_name = tab["properties"]["title"].lower().replace(" ", "")

        stream_id = "?".join([
            sheet_id,
            sheet_name,
            tab_index,
            tab_name,
            sheet_name + "_" + tab_name,
        ])

        entries.append(
            CatalogEntry(
                tap_stream_id=stream_id,
                stream=tab_name,
                database=sheet_name + '&' + sheet_id,
                table=tab_name + '&' + tab_index,
            )
        )

    return entries
def discover(config):
    """Build the catalog after checking that the credentials are authorized.

    The account stream (``streams_.ACCOUNT``) is left out when
    ``is_account_endpoint_authorized`` reports that the endpoint is not
    accessible.

    Args:
        config: passed to the ``Client`` constructor.

    Returns:
        A ``Catalog`` with one entry per remaining stream in
        ``streams_.all_streams``.
    """
    client = Client(config)
    ensure_credentials_are_authorized(client)
    account_allowed = is_account_endpoint_authorized(client)

    result = Catalog([])
    for stream in streams_.all_streams:
        is_account = stream.tap_stream_id == streams_.ACCOUNT.tap_stream_id
        if is_account and not account_allowed:
            continue

        parsed_schema = Schema.from_dict(load_schema(stream.tap_stream_id),
                                         inclusion="automatic")
        entry = CatalogEntry(
            stream=stream.tap_stream_id,
            tap_stream_id=stream.tap_stream_id,
            key_properties=stream.pk_fields,
            schema=parsed_schema,
        )
        result.streams.append(entry)

    return result
def discover_base(cls, base_id, base_name=None):
    """Discover the tables of one base and return a catalog entry for each.

    The table list is fetched from ``cls.metadata_url + base_id``. Every
    table gets an ``id`` column plus one column per field; a field whose
    column schema has ``inclusion == "automatic"`` is added to the entry's
    key properties.

    Args:
        base_id: identifier appended to ``cls.metadata_url``.
        base_name: value used for the entry's ``database``; ``base_id`` is
            used when it is falsy.

    Returns:
        A list of ``CatalogEntry`` objects, one per table in the response.

    Raises:
        requests.HTTPError: via ``raise_for_status()`` on a non-success reply.
    """
    cls.logger.info("discover base " + base_id)

    headers = cls.__get_auth_header()
    response = requests.get(url=cls.metadata_url + base_id, headers=headers)
    response.raise_for_status()

    discovered = []
    for table in response.json()["tables"]:
        columns = {
            "id": Schema(inclusion="automatic", type=['null', "string"])
        }
        table_name = table["name"]
        key_fields = []

        mdata = metadata.write({}, (), "inclusion", "available")
        # NOTE(review): this breadcrumb is the plain string 'database_name',
        # unlike the tuple breadcrumbs used elsewhere here — confirm intended.
        mdata = metadata.write(mdata, 'database_name', 'base_id', base_id)

        for field in table["fields"]:
            column = cls.column_schema(field)
            field_name = field["name"]
            if column.inclusion == "automatic":
                key_fields.append(field_name)
            columns[field_name] = column

            breadcrumb = ('properties', field_name)
            mdata = metadata.write(mdata, breadcrumb, 'inclusion', 'available')
            mdata = metadata.write(mdata, breadcrumb, 'airtable_type',
                                   field["config"]["type"] or None)

        table_schema = Schema(type='object', properties=columns)
        discovered.append(
            CatalogEntry(tap_stream_id=table["id"],
                         database=base_name or base_id,
                         table=table_name,
                         stream=table_name,
                         metadata=metadata.to_list(mdata),
                         key_properties=key_fields,
                         schema=table_schema))

    return discovered
def discover():
    """Build a catalog from ``get_schemas()`` with stream-level metadata only.

    Each entry has no key properties and a single metadata record at the
    empty breadcrumb, holding ``field_metadata[stream_name]`` (or an empty
    dict when the stream has none).

    Returns:
        The populated ``Catalog``.
    """
    schemas, field_metadata = get_schemas()
    result = Catalog([])

    for name, raw_schema in schemas.items():
        parsed_schema = Schema.from_dict(raw_schema)
        stream_level = {
            "metadata": field_metadata.get(name, {}),
            "breadcrumb": []
        }
        entry = CatalogEntry(
            stream=name,
            tap_stream_id=name,
            key_properties=[],
            schema=parsed_schema,
            metadata=[stream_level],
        )
        result.streams.append(entry)

    return result
def discover():
    """Build a catalog from ``get_schemas()``, selecting every stream.

    Each stream is assumed to be keyed on ``id``. The metadata list holds one
    stream-level record (marked ``selected``) followed by one record per
    schema property; key properties are ``'automatic'``, all others
    ``'available'``.

    Returns:
        The populated ``Catalog``.
    """
    # Per-property metadata is generated below from the schema itself, so the
    # metadata returned by get_schemas() is not needed here.
    schemas, _ = get_schemas()
    catalog = Catalog([])

    for stream_name, schema_dict in schemas.items():
        schema = Schema.from_dict(schema_dict)

        # Assume id key for each stream for now...
        key_properties = ['id']

        # Selects all streams by default
        stream_metadata = [{
            'metadata': {
                'inclusion': 'available',
                'table-key-properties': key_properties,
                'selected': True,
                'schema-name': stream_name
            },
            'breadcrumb': []
        }]

        # Iterate the schema's properties. The previous loop unpacked the
        # stream's metadata records as (prop, json_schema) pairs, which does
        # not yield property names.
        for prop in (schema.properties or {}):
            inclusion = 'automatic' if prop in key_properties else 'available'
            stream_metadata.append({
                'breadcrumb': ['properties', prop],
                'metadata': {
                    'inclusion': inclusion
                }
            })

        catalog.streams.append(
            CatalogEntry(stream=stream_name,
                         tap_stream_id=stream_name,
                         key_properties=key_properties,
                         schema=schema,
                         metadata=stream_metadata))

    return catalog
def do_discover(self):
    """Build a catalog with one entry per stream in ``self.streams``.

    Each stream gets this tap assigned as ``stream.tap``. A property is
    marked ``"automatic"`` when it is a key property or equals the stream's
    (truthy) ``state_field``; every other property is ``"available"``.

    Returns:
        The populated ``Catalog``.
    """
    logger.info("Starting discover")
    result = Catalog([])

    for stream in self.streams:
        stream.tap = self
        parsed_schema = Schema.from_dict(stream.get_schema())
        key_properties = stream.key_properties

        # NOTE(review): "table-key-properties" is hard-coded to ["id"] while
        # the entry itself uses stream.key_properties — confirm intended.
        stream_metadata = [{
            "metadata": {
                "inclusion": "available",
                "table-key-properties": ["id"],
                "selected": True,
                "schema-name": stream.get_name()
            },
            "breadcrumb": []
        }]

        for prop in parsed_schema.properties:
            is_state_field = bool(stream.state_field) and prop == stream.state_field
            if prop in key_properties or is_state_field:
                inclusion = "automatic"
            else:
                inclusion = "available"
            stream_metadata.append({
                "breadcrumb": ["properties", prop],
                "metadata": {
                    "inclusion": inclusion
                },
            })

        entry = CatalogEntry(
            stream=stream.schema,
            tap_stream_id=stream.schema,
            key_properties=key_properties,
            schema=parsed_schema,
            metadata=stream_metadata,
        )
        result.streams.append(entry)

    return result
def discover():
    """Build a catalog from ``get_schemas()`` and the ``STREAMS`` table.

    Returns:
        A ``Catalog`` with one entry per schema; key properties are read from
        ``STREAMS[stream_name]['key_properties']``.
    """
    schemas, field_metadata = get_schemas()
    result = Catalog([])

    for name, raw_schema in schemas.items():
        parsed_schema = Schema.from_dict(raw_schema)
        stream_metadata = field_metadata[name]
        entry = CatalogEntry(
            stream=name,
            tap_stream_id=name,
            key_properties=STREAMS[name]['key_properties'],
            schema=parsed_schema,
            metadata=stream_metadata,
        )
        result.streams.append(entry)

    return result


# Commented-out alternative implementation (not executed):
# def discover():
#     raw_schemas = load_schemas()
#     streams = []
#     for stream_id, schema in raw_schemas.items():
#         # TODO: populate any metadata and stream's key properties here..
#         stream_metadata = []
#         key_properties = []
#         streams.append(
#             CatalogEntry(
#                 tap_stream_id=stream_id,
#                 stream=stream_id,
#                 schema=schema,
#                 key_properties=key_properties,
#                 metadata=stream_metadata,
#                 replication_key=None,
#                 is_view=None,
#                 database=None,
#                 table=None,
#                 row_count=None,
#                 stream_alias=None,
#                 replication_method=None,
#             )
#         )
#     return Catalog(streams)
def discover():
    """Build a catalog with standard metadata for every known schema.

    Keys, valid replication keys and the replication method for each schema
    are read from the ``STREAMS`` table.

    Returns:
        A ``Catalog`` with one entry per schema returned by ``get_schemas()``.
    """
    entries = []

    for name, raw_schema in get_schemas().items():
        parsed_schema = Schema.from_dict(raw_schema)
        stream_metadata = metadata.get_standard_metadata(
            schema=raw_schema,
            schema_name=name,
            key_properties=STREAMS[name]['key_properties'],
            valid_replication_keys=STREAMS[name]['replication_keys'],
            replication_method=STREAMS[name]['replication_method'])
        entry = CatalogEntry(
            tap_stream_id=name,
            stream=name,
            schema=parsed_schema,
            key_properties=STREAMS[name]['key_properties'],
            metadata=stream_metadata,
        )
        entries.append(entry)

    return Catalog(entries)
def discover(client):
    """Build a catalog from the schemas returned by ``get_schemas(client)``.

    Key properties come from ``STREAMS`` when the stream is listed there;
    otherwise they default to ``['dimensions_hash_key', 'date']``.

    Args:
        client: passed through to ``get_schemas``.

    Returns:
        The populated ``Catalog``.
    """
    schemas, field_metadata = get_schemas(client)
    result = Catalog([])

    for name, raw_schema in schemas.items():
        parsed_schema = Schema.from_dict(raw_schema)
        stream_metadata = field_metadata[name]
        primary_keys = STREAMS.get(name, {}).get(
            'key_properties', ['dimensions_hash_key', 'date'])

        entry = CatalogEntry(
            stream=name,
            tap_stream_id=name,
            key_properties=primary_keys,
            schema=parsed_schema,
            metadata=stream_metadata,
        )
        result.streams.append(entry)

    return result