def get_schemas(client, spreadsheet_id):
    schemas = {}
    field_metadata = {}

    for stream_name, stream_metadata in STREAMS.items():
        schema_path = get_abs_path('schemas/{}.json'.format(stream_name))
        with open(schema_path) as file:
            schema = json.load(file)
        schemas[stream_name] = schema
        mdata = metadata.new()

        # Documentation:
        # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#singer-python-helper-functions
        # Reference:
        # https://github.com/singer-io/singer-python/blob/master/singer/metadata.py#L25-L44
        mdata = metadata.get_standard_metadata(
            schema=schema,
            key_properties=stream_metadata.get('key_properties', None),
            valid_replication_keys=stream_metadata.get('replication_keys', None),
            replication_method=stream_metadata.get('replication_method', None))
        field_metadata[stream_name] = mdata

        if stream_name == 'spreadsheet_metadata':
            api = stream_metadata.get('api', 'sheets')
            params = stream_metadata.get('params', {})
            querystring = '&'.join(
                ['%s=%s' % (key, value) for (key, value) in params.items()])
            path = '{}?{}'.format(
                stream_metadata.get('path').replace('{spreadsheet_id}', spreadsheet_id),
                querystring)

            # GET spreadsheet_metadata, which includes sheets (basic metadata for each worksheet)
            spreadsheet_md_results = client.get(path=path, params=querystring, api=api,
                                                endpoint=stream_name)

            sheets = spreadsheet_md_results.get('sheets')
            if sheets:
                # Loop through each worksheet in the spreadsheet
                for sheet in sheets:
                    # GET sheet_json_schema for each worksheet (from function above)
                    sheet_json_schema, columns = get_sheet_metadata(
                        sheet, spreadsheet_id, client)

                    # SKIP empty sheets (where sheet_json_schema and columns are None)
                    if sheet_json_schema and columns:
                        sheet_title = sheet.get('properties', {}).get('title')
                        schemas[sheet_title] = sheet_json_schema
                        sheet_mdata = metadata.new()
                        sheet_mdata = metadata.get_standard_metadata(
                            schema=sheet_json_schema,
                            key_properties=['__sdc_row'],
                            valid_replication_keys=None,
                            replication_method='FULL_TABLE')
                        field_metadata[sheet_title] = sheet_mdata

    return schemas, field_metadata
def discover(client, custom_reports):
    raw_schemas = load_schemas()
    streams = []
    for stream_id, schema in raw_schemas.items():
        stream_instance = STREAMS[stream_id]
        stream_metadata = metadata.get_standard_metadata(
            schema=schema.to_dict(),
            key_properties=stream_instance.key_properties,
            valid_replication_keys=stream_instance.replication_key,
            replication_method=stream_instance.replication_method)
        streams.append(
            CatalogEntry(
                tap_stream_id=stream_id,
                stream=stream_id,
                schema=schema,
                key_properties=stream_instance.key_properties,
                metadata=stream_metadata,
                replication_key=stream_instance.replication_key,
                is_view=None,
                database=None,
                table=None,
                row_count=None,
                stream_alias=None,
                replication_method=stream_instance.replication_method,
            ))

    if custom_reports:
        for report in custom_reports:
            schema = build_schema(client, report)
            schema = Schema.from_dict(schema)
            key_properties = report.get('key_properties')
            replication_key = report.get('valid_replication_keys')
            stream_metadata = metadata.get_standard_metadata(
                schema=schema.to_dict(),
                key_properties=key_properties,
                valid_replication_keys=replication_key,
                replication_method=None)
            streams.append(
                CatalogEntry(
                    tap_stream_id=report['stream_id'],
                    stream=report['stream_id'],
                    schema=schema,
                    key_properties=key_properties,
                    metadata=stream_metadata,
                    replication_key=replication_key,
                    is_view=None,
                    database=None,
                    table=None,
                    row_count=None,
                    stream_alias=report,
                    replication_method=None,
                ))

    return Catalog(streams)
def generate_catalog(self):
    cls = self.__class__

    # get the reference schemas
    refs = load_schema_references()
    # resolve the schema references and make the final schema
    schema = singer.resolve_schema_references(load_schema(cls.TABLE), refs)
    mdata = metadata.new()

    # use 'get_standard_metadata' with primary key, replication key and replication method
    mdata = metadata.get_standard_metadata(
        schema=schema,
        key_properties=self.KEY_PROPERTIES,
        valid_replication_keys=self.REPLICATION_KEYS if self.REPLICATION_KEYS else None,
        replication_method=self.REPLICATION_METHOD)

    mdata_map = metadata.to_map(mdata)

    # mark replication keys with 'automatic' inclusion
    for replication_key in self.REPLICATION_KEYS:
        mdata_map[('properties', replication_key)]['inclusion'] = 'automatic'

    return [{
        'tap_stream_id': cls.TABLE,
        'stream': cls.TABLE,
        'key_properties': cls.KEY_PROPERTIES,
        'schema': schema,
        'metadata': metadata.to_list(mdata_map)
    }]
def discover():
    '''
    Run discovery mode
    '''
    streams = []

    for stream_id, stream_object in STREAMS.items():
        raw_schema = load_schema(stream_id)
        schema = Schema.from_dict(raw_schema)
        mdata = metadata.to_map(
            metadata.get_standard_metadata(
                schema=raw_schema,
                schema_name=stream_id,
                key_properties=stream_object.key_properties,
                valid_replication_keys=[stream_object.replication_key],
                replication_method=stream_object.replication_method))

        # make sure the replication key field is always included
        if stream_object.replication_key:
            mdata = metadata.write(mdata, ('properties', stream_object.replication_key),
                                   'inclusion', 'automatic')

        streams.append(
            CatalogEntry(stream=stream_id,
                         tap_stream_id=stream_id,
                         key_properties=stream_object.key_properties,
                         schema=schema,
                         metadata=metadata.to_list(mdata)))

    return Catalog(streams)
def discover_streams(config):
    streams = []

    reports = json.loads(config['reports'])
    username = config['username']
    password = config['password']

    for report in reports:
        LOGGER.info('Downloading XSD to determine table schema "%s".', report['report_name'])
        xsd = download_xsd(report['report_url'], username, password)
        schema = generate_schema_for_report(xsd)
        stream_md = metadata.get_standard_metadata(
            schema,
            key_properties=report.get('key_properties'),
            replication_method='FULL_TABLE')
        streams.append({
            'stream': report['report_name'],
            'tap_stream_id': report['report_name'],
            'schema': schema,
            'metadata': stream_md
        })

    return streams
def get_schemas():
    schemas = {}
    schemas_metadata = {}

    for stream_name, stream_object in STREAMS.items():
        schema_path = get_abs_path('schemas/{}.json'.format(stream_name))
        with open(schema_path) as file:
            schema = json.load(file)

        meta = metadata.get_standard_metadata(
            schema=schema,
            key_properties=stream_object.key_properties,
            replication_method=stream_object.replication_method)

        meta = metadata.to_map(meta)

        if stream_object.valid_replication_keys:
            meta = metadata.write(meta, (), 'valid-replication-keys',
                                  stream_object.valid_replication_keys)
        if stream_object.replication_key:
            meta = metadata.write(
                meta, ('properties', stream_object.replication_key),
                'inclusion', 'automatic')

        meta = metadata.to_list(meta)

        schemas[stream_name] = schema
        schemas_metadata[stream_name] = meta

    return schemas, schemas_metadata
def do_discover():
    raw_schemas = _load_schemas()
    catalog_entries = []

    for stream_name, schema in raw_schemas.items():
        # create and add catalog entry
        stream = STREAM_OBJECTS[stream_name]
        catalog_entry = {
            "stream": stream_name,
            "tap_stream_id": stream_name,
            "schema": schema,
            "metadata": metadata.get_standard_metadata(
                schema=schema,
                key_properties=stream.key_properties,
                valid_replication_keys=stream.replication_keys,
                replication_method=stream.replication_method,
            ),
            "key_properties": stream.key_properties,
        }
        catalog_entries.append(catalog_entry)

    return Catalog.from_dict({"streams": catalog_entries})
def do_discover(client):
    raw_schemas = _load_schemas()
    catalog_entries = []

    major_ver = client.request_feed("gbfs_versions").get("version")[0]
    feed_names = client.feed_names

    for feed_name in feed_names:
        versioned_feed = f"{feed_name}_v{major_ver}"
        # create and add catalog entry
        stream = STREAM_OBJECTS.get(versioned_feed)
        if stream is None:
            continue
        schema = raw_schemas[versioned_feed]
        catalog_entry = {
            "stream": versioned_feed,
            "tap_stream_id": versioned_feed,
            "schema": schema,
            "metadata": metadata.get_standard_metadata(
                schema=schema,
                key_properties=stream.key_properties,
                valid_replication_keys=stream.replication_keys,
                replication_method=stream.replication_method,
            ),
            "key_properties": stream.key_properties,
        }
        catalog_entries.append(catalog_entry)

    return Catalog.from_dict({"streams": catalog_entries})
def discover():
    raw_schemas = load_schemas()
    streams = []
    for stream_id, schema in raw_schemas.items():
        key_properties = ['uuid']
        replication_key = None
        if stream_id == 'qa':
            replication_key = 'sequence_id'
        stream_metadata = metadata.get_standard_metadata(
            schema=schema.to_dict(),
            key_properties=key_properties,
            # valid_replication_keys expects a list of field names
            valid_replication_keys=[replication_key] if replication_key else None,
            replication_method=None)
        streams.append(
            CatalogEntry(
                tap_stream_id=stream_id,
                stream=stream_id,
                schema=schema,
                key_properties=key_properties,
                metadata=stream_metadata,
                replication_key=replication_key,
                is_view=None,
                database=None,
                table=None,
                row_count=None,
                stream_alias=None,
                replication_method=None,
            ))
    return Catalog(streams)
def get_schemas(config):
    schemas = {}
    schemas_metadata = {}

    client = S3Client(config['aws_access_key_id'], config['aws_secret_access_key'])

    for tap_stream_id, table_spec in config['tables'].items():
        LOGGER.info(f'Starting discovery for {tap_stream_id}')

        stream_object = Stream(client, table_spec, None)
        stream_schema = stream_object.get_schema()

        meta = metadata.get_standard_metadata(
            schema=stream_schema,
            key_properties=stream_object.key_properties,
            replication_method=stream_object.replication_method
        )

        meta = metadata.to_map(meta)

        if stream_object.valid_replication_keys:
            meta = metadata.write(meta, (), 'valid-replication-keys',
                                  stream_object.valid_replication_keys)
        if stream_object.replication_key:
            meta = metadata.write(meta, ('properties', stream_object.replication_key),
                                  'inclusion', 'automatic')

        meta = metadata.to_list(meta)

        schemas[tap_stream_id] = stream_schema
        schemas_metadata[tap_stream_id] = meta

    return schemas, schemas_metadata
def discover():
    raw_schemas = load_schemas()
    streams = []
    for stream_id, schema in raw_schemas.items():
        # TODO: populate any metadata and stream's key properties here..
        mock_mdata = metadata.get_standard_metadata(schema.to_dict())
        # mark the stream as selected on the top-level breadcrumb and keep the result,
        # otherwise the write has no effect on the metadata that goes into the catalog
        mdata_map = metadata.to_map(mock_mdata)
        mdata_map = metadata.write(mdata_map, (), "selected", True)
        mock_keyprops = ['id']
        stream_metadata = metadata.to_list(mdata_map)
        key_properties = mock_keyprops
        streams.append(
            CatalogEntry(
                tap_stream_id=stream_id,
                stream=stream_id,
                schema=schema,
                key_properties=key_properties,
                metadata=stream_metadata,
                replication_key=None,
                is_view=None,
                database=None,
                table=None,
                row_count=None,
                stream_alias=None,
                replication_method=None,
            ))
    return Catalog(streams)
def do_discover():
    raw_schemas = _load_schemas()
    catalog_entries = []

    for stream_name, schema in raw_schemas.items():
        stream = STREAM_OBJECTS[stream_name]

        mdata = metadata.get_standard_metadata(
            schema=schema,
            key_properties=stream.key_properties,
            replication_method=stream.replication_method
        )

        mdata = metadata.to_map(mdata)

        if stream.replication_key:
            mdata = metadata.write(mdata, (), 'valid-replication-keys',
                                   [stream.replication_key])

        for field_name in schema['properties'].keys():
            if field_name in stream.key_properties or field_name == stream.replication_key:
                mdata = metadata.write(mdata, ('properties', field_name),
                                       'inclusion', 'automatic')
            else:
                mdata = metadata.write(mdata, ('properties', field_name),
                                       'inclusion', 'available')

        catalog_entry = {
            'stream': stream_name,
            'tap_stream_id': stream_name,
            'schema': schema,
            'metadata': metadata.to_list(mdata),
            'key_properties': stream.key_properties}

        catalog_entries.append(catalog_entry)

    return Catalog.from_dict({'streams': catalog_entries})
def get_schemas():
    schemas = {}
    field_metadata = {}

    for stream_name, stream_metadata in STREAMS.items():
        schema_path = get_abs_path('schemas/{}.json'.format(stream_name))
        with open(schema_path) as file:
            schema = json.load(file)
        schemas[stream_name] = schema
        mdata = metadata.new()

        # Documentation:
        # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md
        # Reference:
        # https://github.com/singer-io/singer-python/blob/master/singer/metadata.py#L25-L44
        mdata = metadata.get_standard_metadata(
            schema=schema,
            key_properties=stream_metadata.get('key_properties', None),
            valid_replication_keys=stream_metadata.get('replication_keys', None),
            replication_method=stream_metadata.get('replication_method', None))

        # Add additional metadata
        if stream_name in ('ad_analytics_by_campaign', 'ad_analytics_by_creative'):
            mdata_map = metadata.to_map(mdata)
            mdata_map[('properties', 'date_range')]['inclusion'] = 'automatic'
            mdata_map[('properties', 'pivot')]['inclusion'] = 'automatic'
            mdata_map[('properties', 'pivot_value')]['inclusion'] = 'automatic'
            mdata = metadata.to_list(mdata_map)

        field_metadata[stream_name] = mdata

    return schemas, field_metadata
def discover(): raw_schemas = load_schemas() streams = [] for stream_id, schema in raw_schemas.items(): mdata = metadata.get_standard_metadata( schema=schema.to_dict(), key_properties=["report_id", "row_id"], valid_replication_keys=["report_date"], replication_method="INCREMENTAL", ) streams.append( CatalogEntry( tap_stream_id=stream_id, stream=stream_id, schema=schema, key_properties=["report_id", "row_id"], metadata=mdata, replication_key=["report_date"], is_view=None, database=None, table=None, row_count=None, stream_alias=None, replication_method=None, ) ) return Catalog(streams)
def do_discover(self):
    logger.info('Starting discover')

    catalog = Catalog([])

    for stream in self.streams:
        stream.tap = self

        schema = Schema.from_dict(stream.get_schema())
        key_properties = stream.key_properties

        meta = metadata.get_standard_metadata(
            schema=schema.to_dict(),
            key_properties=key_properties,
            valid_replication_keys=[stream.state_field] if stream.state_field else None,
            replication_method=stream.replication_method)

        # If the stream has a state_field, mark that property with automatic inclusion
        if stream.state_field:
            meta = metadata.to_map(meta)
            meta[('properties', stream.state_field)]['inclusion'] = 'automatic'
            meta = metadata.to_list(meta)

        catalog.streams.append(
            CatalogEntry(stream=stream.schema,
                         tap_stream_id=stream.schema,
                         key_properties=key_properties,
                         schema=schema,
                         metadata=meta))

    return catalog
def discover(ctx):
    check_credentials_are_authorized(ctx)
    catalog = Catalog([])
    for tap_stream_id in schemas.stream_ids:
        schema_dict = schemas.load_schema(tap_stream_id)
        schema = Schema.from_dict(schema_dict)

        mdata = metadata.get_standard_metadata(
            schema_dict,
            key_properties=schemas.PK_FIELDS[tap_stream_id])
        mdata = metadata.to_map(mdata)

        # NB: `lists` and `messages` are required for their substreams.
        # This is an approximation of the initial functionality using
        # metadata, which marked them as `selected=True` in the schema.
        if tap_stream_id in ['lists', 'messages']:
            mdata = metadata.write(mdata, (), 'inclusion', 'automatic')
            for field_name in schema_dict['properties'].keys():
                mdata = metadata.write(mdata, ('properties', field_name),
                                       'inclusion', 'automatic')

        catalog.streams.append(
            CatalogEntry(stream=tap_stream_id,
                         tap_stream_id=tap_stream_id,
                         key_properties=schemas.PK_FIELDS[tap_stream_id],
                         schema=schema,
                         metadata=metadata.to_list(mdata)))
    return catalog
def do_discover():
    raw_schemas = _load_schemas()
    catalog_entries = []

    for stream_name, schema in raw_schemas.items():
        # create and add catalog entry
        stream = STREAM_OBJECTS[stream_name]
        mdata = metadata.get_standard_metadata(
            schema=schema,
            key_properties=stream.key_properties,
            valid_replication_keys=stream.replication_keys,
            replication_method=stream.replication_method,
        )
        mdata = metadata.to_map(mdata)
        for field_name in stream.replication_keys:
            mdata = metadata.write(mdata, ('properties', field_name),
                                   'inclusion', 'automatic')
        catalog_entry = {
            "stream": stream_name,
            "tap_stream_id": stream_name,
            "schema": schema,
            "metadata": metadata.to_list(mdata),
            "key_properties": stream.key_properties,
        }
        catalog_entries.append(catalog_entry)

    return Catalog.from_dict({"streams": catalog_entries})
def load_metadata(self):
    return get_standard_metadata(
        schema=self.schema,
        key_properties=self.key_properties,
        valid_replication_keys=[self.replication_key],
        replication_method=self.replication_method,
    )
def load_metadata(self):
    return metadata.get_standard_metadata(
        schema=self.load_schema(),
        schema_name=self.name,
        key_properties=self.key_properties,
        valid_replication_keys=[self.replication_key],
        replication_method=self.replication_method)
def get_schemas(client, properties_flag, denest_properties_flag):
    schemas = {}
    field_metadata = {}

    for stream_name, stream_metadata in STREAMS.items():
        # When the client detects disable_engage_endpoint, skip discovering the stream
        if stream_name == 'engage' and client.disable_engage_endpoint:
            LOGGER.warning(
                'Mixpanel returned a 402 indicating the Engage endpoint and stream is unavailable. Skipping.')
            continue

        schema = get_schema(client, properties_flag, denest_properties_flag, stream_name)
        schemas[stream_name] = schema
        mdata = metadata.new()

        # Documentation:
        # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#singer-python-helper-functions
        # Reference:
        # https://github.com/singer-io/singer-python/blob/master/singer/metadata.py#L25-L44
        mdata = metadata.get_standard_metadata(
            schema=schema,
            key_properties=stream_metadata.get('key_properties', None),
            valid_replication_keys=stream_metadata.get('replication_keys', None),
            replication_method=stream_metadata.get('replication_method', None))

        field_metadata[stream_name] = mdata

    return schemas, field_metadata
def discover():
    raw_schemas = load_schemas()
    streams = []
    for stream_id, schema in raw_schemas.items():
        # TODO: populate any metadata and stream's key properties here..
        key_properties = STREAM_CONFIGS[stream_id]['key_properties']
        stream_metadata = metadata.get_standard_metadata(
            schema=schema.to_dict(),
            key_properties=key_properties,
            # TODO: Verify this works / is necessary
            valid_replication_keys=['date'],
            replication_method=None)
        streams.append(
            CatalogEntry(
                tap_stream_id=stream_id,
                stream=stream_id,
                schema=schema,
                key_properties=key_properties,
                metadata=stream_metadata,
                replication_key='date',
                is_view=None,
                database=None,
                table=None,
                row_count=None,
                stream_alias=None,
                replication_method=None,
            ))
    return Catalog(streams)
def do_discover():
    raw_schemas = _load_schemas()
    catalog_entries = []

    for stream_name, stream in STREAM_OBJECTS.items():
        # create and add catalog entry
        schema = raw_schemas[stream_name]

        mdata = metadata.to_map(
            metadata.get_standard_metadata(
                schema=schema,
                key_properties=stream.key_properties,
                valid_replication_keys=stream.replication_keys,
                replication_method=stream.replication_method,
            ))

        # Set the replication key's metadata to automatic as well
        mdata = metadata.write(mdata, ('properties', stream.replication_keys[0]),
                               'inclusion', 'automatic')

        catalog_entry = {
            "stream": stream_name,
            "tap_stream_id": stream_name,
            "schema": schema,
            "metadata": metadata.to_list(mdata),
            "key_properties": stream.key_properties
        }
        catalog_entries.append(catalog_entry)

    return Catalog.from_dict({"streams": catalog_entries})
def discover_schemas():
    # Load Facebook's shared schemas
    refs = load_shared_schema_refs()

    result = {'streams': []}
    streams = initialize_streams_for_discovery()
    for stream in streams:
        LOGGER.info('Loading schema for %s', stream.name)
        schema = singer.resolve_schema_references(load_schema(stream), refs)

        mdata = metadata.to_map(
            metadata.get_standard_metadata(
                schema,
                key_properties=stream.key_properties))

        bookmark_key = BOOKMARK_KEYS.get(stream.name)
        if bookmark_key == UPDATED_TIME_KEY or bookmark_key == CREATED_TIME_KEY:
            mdata = metadata.write(mdata, ('properties', bookmark_key),
                                   'inclusion', 'automatic')

        result['streams'].append({
            'stream': stream.name,
            'tap_stream_id': stream.name,
            'schema': schema,
            'metadata': metadata.to_list(mdata)
        })
    return result
def discover():
    raw_schemas = load_schemas()
    streams = []
    for stream_id, schema in raw_schemas.items():
        key_properties = ['id']
        valid_replication_keys = None
        if stream_id == 'issues':
            valid_replication_keys = ['updated_at']
        elif stream_id == 'messages':
            valid_replication_keys = ['created_at']
        stream_metadata = metadata.get_standard_metadata(
            schema=schema.to_dict(),
            key_properties=key_properties,
            valid_replication_keys=valid_replication_keys,
            replication_method=None)
        streams.append(
            CatalogEntry(
                tap_stream_id=stream_id,
                stream=stream_id,
                schema=schema,
                key_properties=key_properties,
                metadata=stream_metadata,
                replication_key=None,
                is_view=None,
                database=None,
                table=None,
                row_count=None,
                stream_alias=None,
                replication_method=None,
            ))
    return Catalog(streams)
def get_schemas(config, config_path):
    schemas = {}
    schemas_metadata = {}

    streams = get_streams(config, config_path)

    LOGGER.info('There are {:d} valid streams in MS Dynamics'.format(len(streams)))

    for stream_name, stream_object in streams.items():
        schema = stream_object.schema

        meta = metadata.get_standard_metadata(
            schema=schema,
            key_properties=stream_object.key_properties,
            replication_method=stream_object.replication_method)

        meta = metadata.to_map(meta)

        if stream_object.valid_replication_keys:
            meta = metadata.write(meta, (), 'valid-replication-keys',
                                  stream_object.valid_replication_keys)
        if stream_object.replication_key:
            meta = metadata.write(
                meta, ('properties', stream_object.replication_key),
                'inclusion', 'automatic')

        meta = metadata.to_list(meta)

        schemas[stream_name] = schema
        schemas_metadata[stream_name] = meta

    return schemas, schemas_metadata
def get_schemas():
    schemas = {}
    field_metadata = {}

    for stream_name, stream_metadata in STREAMS.items():
        schema_path = get_abs_path('schemas/{}.json'.format(stream_name))
        with open(schema_path) as file:
            schema = json.load(file)
        schemas[stream_name] = schema
        mdata = metadata.new()

        # Documentation:
        # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#singer-python-helper-functions
        # Reference:
        # https://github.com/singer-io/singer-python/blob/master/singer/metadata.py#L25-L44
        mdata = metadata.get_standard_metadata(
            schema=schema,
            key_properties=stream_metadata.get('key_properties', None),
            selected=stream_metadata.get('selected', True),
            valid_replication_keys=stream_metadata.get('replication_keys', None),
            replication_method=stream_metadata.get('replication_method', None))

        field_metadata[stream_name] = mdata

    return schemas, field_metadata
def discover():
    raw_schemas = load_schemas()
    streams = []
    for stream_id, schema in raw_schemas.items():
        key_properties = ['gradable_id']
        if stream_id == 'section_scores':
            key_properties.append('section_id')
        stream_metadata = metadata.get_standard_metadata(
            schema=schema.to_dict(),
            key_properties=key_properties,
            # valid_replication_keys expects a list of field names
            valid_replication_keys=['date_graded'],
            replication_method=None)
        streams.append(
            CatalogEntry(
                tap_stream_id=stream_id,
                stream=stream_id,
                schema=schema,
                key_properties=key_properties,
                metadata=stream_metadata,
                replication_key='date_graded',
                is_view=None,
                database=None,
                table=None,
                row_count=None,
                stream_alias=None,
                replication_method=None,
            ))
    return Catalog(streams)
def get_schemas():
    schemas = {}
    field_metadata = {}

    for stream_name, stream_class in STREAMS.items():
        base_schema_path = 'schemas/{}.json'.format(stream_name)
        schema_file_path = stream_class.json_schema or base_schema_path
        schema_path = get_abs_path(schema_file_path)
        with open(schema_path) as file:
            schema = json.load(file)
        schemas[stream_name] = schema
        mdata = metadata.new()

        # Documentation:
        # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#singer-python-helper-functions
        # Reference:
        # https://github.com/singer-io/singer-python/blob/master/singer/metadata.py#L25-L44
        mdata = metadata.get_standard_metadata(
            schema=schema,
            key_properties=stream_class.key_properties or None,
            valid_replication_keys=stream_class.replication_keys or None,
            replication_method=stream_class.replication_method or None)

        mdata_map = metadata.to_map(mdata)

        # update inclusion of replication keys to "automatic"
        for replication_key in (stream_class.replication_keys or []):
            mdata_map[('properties', replication_key)]['inclusion'] = 'automatic'

        field_metadata[stream_name] = metadata.to_list(mdata_map)

    return schemas, field_metadata
def get_schemas():
    schemas = {}
    schemas_metadata = {}

    streams = flatten_streams(STREAMS, {})

    for stream_name, stream_object in streams.items():
        LOGGER.info('Getting schema for {}'.format(stream_name))
        schema_path = get_abs_path('schemas/{}.json'.format(stream_name))
        with open(schema_path) as file:
            schema = json.load(file)

        refs = schema.pop("definitions", {})
        if refs:
            schema = singer.resolve_schema_references(schema, refs)

        meta = metadata.get_standard_metadata(
            schema=schema,
            key_properties=stream_object.key_properties,
            replication_method=stream_object.replication_method
        )

        meta = metadata.to_map(meta)

        if stream_object.replication_key:
            meta = metadata.write(
                meta, ('properties', stream_object.replication_key),
                'inclusion', 'automatic')

        meta = metadata.to_list(meta)

        schemas[stream_name] = schema
        schemas_metadata[stream_name] = meta

    return schemas, schemas_metadata
def test_empty_valid_replication_keys_are_written(self):
    mdata = get_standard_metadata(valid_replication_keys=[])
    self.assertEqual(mdata, [{
        'breadcrumb': (),
        'metadata': {
            'valid-replication-keys': []
        }
    }])
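# The functions above share one discovery pattern: build standard metadata, compile it
# into a map keyed by breadcrumb for targeted tweaks, then convert it back to a list for
# the catalog entry. Below is a minimal, self-contained sketch of that pattern -- not
# taken from any tap above. The example schema and field names ('id', 'updated_at',
# 'name') are hypothetical; it only assumes singer-python is installed.
from singer import metadata

example_schema = {
    'type': 'object',
    'properties': {
        'id': {'type': 'integer'},
        'updated_at': {'type': 'string', 'format': 'date-time'},
        'name': {'type': ['null', 'string']},
    },
}

# get_standard_metadata returns a list of {'breadcrumb': ..., 'metadata': ...} entries
mdata = metadata.get_standard_metadata(
    schema=example_schema,
    key_properties=['id'],
    valid_replication_keys=['updated_at'],
    replication_method='INCREMENTAL')

# compile to a map keyed by breadcrumb, make targeted edits, then convert back;
# explicitly marking the replication key as 'automatic' mirrors the snippets above
mdata_map = metadata.to_map(mdata)
mdata_map = metadata.write(
    mdata_map, ('properties', 'updated_at'), 'inclusion', 'automatic')
mdata = metadata.to_list(mdata_map)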