def get_schemas():
    """Load the JSON schema and standard Singer metadata for each stream.

    Iterates over the streams returned by ``flatten_streams()``. A stream
    whose ``replication_ind`` setting is falsy is skipped (the setting
    defaults to ``True`` when absent); for every other stream the file
    ``schemas/<stream_name>.json`` is loaded and metadata is generated
    from the stream's ``key_properties``, ``replication_keys`` and
    ``replication_method`` settings.

    Returns:
        A ``(schemas, field_metadata)`` tuple of dicts, both keyed by
        stream name: the parsed schema and the metadata produced by
        ``metadata.get_standard_metadata``.
    """
    schemas = {}
    field_metadata = {}

    flat_streams = flatten_streams()
    for stream_name, stream_metadata in flat_streams.items():
        if not stream_metadata.get('replication_ind', True):
            continue

        schema_path = get_abs_path('schemas/{}.json'.format(stream_name))
        # Read as UTF-8 explicitly so parsing does not depend on the
        # platform's default text encoding.
        with open(schema_path, encoding='utf-8') as schema_file:
            schema = json.load(schema_file)
        schemas[stream_name] = schema

        # Documentation:
        # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#singer-python-helper-functions
        # Reference:
        # https://github.com/singer-io/singer-python/blob/master/singer/metadata.py#L25-L44
        field_metadata[stream_name] = metadata.get_standard_metadata(
            schema=schema,
            key_properties=stream_metadata.get('key_properties', None),
            valid_replication_keys=stream_metadata.get(
                'replication_keys', None),
            replication_method=stream_metadata.get(
                'replication_method', None))

    return schemas, field_metadata
def sync(client, config, catalog, state):
    """Sync every selected top-level stream in ``STREAMS``.

    The selection is built from ``catalog.get_selected_streams(state)``;
    when a selected stream declares a ``parent_stream`` in the flattened
    stream configuration, that parent is added to the selection as well.
    For each entry of ``STREAMS`` that is in the selection, the stream is
    marked as currently syncing, its schema is written (unless its
    ``replication_ind`` setting is falsy), ``sync_endpoint`` is called,
    and the currently-syncing marker is cleared afterwards.

    Args:
        client: Passed through to ``sync_endpoint``.
        config: Mapping of tap settings; ``start_date`` is read from it.
        catalog: Catalog object providing ``get_selected_streams``.
        state: State object used for the currently-syncing marker and
            passed through to ``sync_endpoint``.

    Returns:
        None.
    """
    # Use .get() so start_date is always bound. Previously a config
    # without 'start_date' left the name undefined and the sync_endpoint
    # call below raised UnboundLocalError.
    # NOTE(review): whether sync_endpoint copes with start_date=None is
    # not visible from here -- confirm.
    start_date = config.get('start_date')

    # Get selected_streams from catalog, based on state last_stream
    # last_stream = Previous currently synced stream, if the load was
    # interrupted. It is only logged here.
    last_stream = singer.get_currently_syncing(state)
    LOGGER.info('last/currently syncing stream: {}'.format(last_stream))

    selected_streams = []
    flat_streams = flatten_streams()
    for stream in catalog.get_selected_streams(state):
        selected_streams.append(stream.stream)
        # A selected child stream pulls its parent into the selection.
        parent_stream = flat_streams.get(stream.stream, {}).get('parent_stream')
        if parent_stream and parent_stream not in selected_streams:
            selected_streams.append(parent_stream)
    LOGGER.info('selected_streams: {}'.format(selected_streams))

    if not selected_streams:
        return

    # Loop through the top-level streams, syncing only the selected ones.
    for stream_name, endpoint_config in STREAMS.items():
        if stream_name not in selected_streams:
            continue

        LOGGER.info('START Syncing: {}'.format(stream_name))
        update_currently_syncing(state, stream_name)
        path = endpoint_config.get('path', stream_name)
        # First configured replication key, or None when there is none.
        bookmark_field = next(
            iter(endpoint_config.get('replication_keys', [])), None)

        replication_ind = endpoint_config.get('replication_ind', True)
        if replication_ind:
            selected_fields = get_selected_fields(catalog, stream_name)
            LOGGER.info('Stream: {}, selected_fields: {}'.format(
                stream_name, selected_fields))
            write_schema(catalog, stream_name)

        total_records = sync_endpoint(
            client=client,
            catalog=catalog,
            state=state,
            start_date=start_date,
            stream_name=stream_name,
            path=path,
            endpoint_config=endpoint_config,
            static_params=endpoint_config.get('params', {}),
            bookmark_query_field=endpoint_config.get(
                'bookmark_query_field', None),
            bookmark_field=bookmark_field,
            bookmark_type=endpoint_config.get('bookmark_type', None),
            data_key=endpoint_config.get('data_key', stream_name),
            id_fields=endpoint_config.get('key_properties'),
            selected_streams=selected_streams,
            replication_ind=replication_ind)

        update_currently_syncing(state, None)
        LOGGER.info('FINISHED Syncing: {}, total_records: {}'.format(
            stream_name, total_records))
def discover():
    """Assemble a Catalog with one CatalogEntry per schema from get_schemas().

    Each entry uses the stream name as both ``stream`` and
    ``tap_stream_id``, takes ``key_properties`` from the flattened stream
    configuration (``None`` when not configured), and carries the metadata
    returned by ``get_schemas()`` unchanged.

    NOTE(review): a second ``discover`` is defined after this one in the
    same source; if both live in one module, the later definition is the
    one bound to the name.

    Returns:
        The populated ``Catalog``.
    """
    schemas, field_metadata = get_schemas()
    catalog = Catalog([])
    stream_configs = flatten_streams()

    for name, raw_schema in schemas.items():
        stream_config = stream_configs.get(name, {})
        entry = CatalogEntry(
            stream=name,
            tap_stream_id=name,
            key_properties=stream_config.get('key_properties', None),
            schema=Schema.from_dict(raw_schema),
            metadata=field_metadata[name],
        )
        catalog.streams.append(entry)

    return catalog
def discover():
    """Build the Catalog, marking replication keys as automatically included.

    For each schema returned by ``get_schemas()`` the stream's metadata is
    converted to a map. When the stream's ``replication_method`` is
    ``'INCREMENTAL'``, every field listed in its ``replication_keys`` gets
    ``inclusion`` set to ``'automatic'``. The metadata is then converted
    back to a list and stored on a ``CatalogEntry`` that uses the stream
    name as both ``stream`` and ``tap_stream_id``.

    Returns:
        The populated ``Catalog``.
    """
    schemas, field_metadata = get_schemas()
    catalog = Catalog([])
    flat_streams = flatten_streams()

    for stream_name, schema_dict in schemas.items():
        schema = Schema.from_dict(schema_dict)
        mdata = metadata.to_map(field_metadata[stream_name])
        stream = flat_streams.get(stream_name, {})

        if stream.get('replication_method') == 'INCREMENTAL':
            # `or []` covers an INCREMENTAL stream with no replication_keys
            # configured; iterating the None from .get() raised TypeError.
            for field_name in stream.get('replication_keys') or []:
                metadata.write(
                    mdata, ('properties', field_name),
                    'inclusion', 'automatic')

        catalog.streams.append(CatalogEntry(
            stream=stream_name,
            tap_stream_id=stream_name,
            key_properties=stream.get('key_properties', None),
            schema=schema,
            metadata=metadata.to_list(mdata)
        ))

    return catalog