Example #1
0
def get_schemas():
    """Load the JSON schema and build standard metadata for each stream.

    Walks the mapping returned by ``flatten_streams()``. A stream whose
    ``replication_ind`` entry is falsy is skipped; the flag is treated as
    ``True`` when absent. For every remaining stream the file
    ``schemas/<stream_name>.json`` (resolved through ``get_abs_path``) is
    parsed, and ``metadata.get_standard_metadata`` is called with the
    stream's ``key_properties``, ``replication_keys`` and
    ``replication_method`` entries (``None`` when an entry is missing).

    Returns:
        tuple: ``(schemas, field_metadata)``, two dicts keyed by stream
        name. ``schemas`` holds the parsed JSON schema and
        ``field_metadata`` holds the value returned by
        ``metadata.get_standard_metadata`` for that stream.

    Raises:
        OSError: if a schema file cannot be opened.
        ValueError: if a schema file does not contain valid JSON
            (``json.JSONDecodeError`` is a subclass).
    """
    schemas = {}
    field_metadata = {}

    for stream_name, stream_metadata in flatten_streams().items():
        if not stream_metadata.get('replication_ind', True):
            continue

        schema_path = get_abs_path('schemas/{}.json'.format(stream_name))
        # Read as UTF-8 explicitly rather than relying on the platform's
        # default text encoding.
        with open(schema_path, encoding='utf-8') as file:
            schema = json.load(file)
        schemas[stream_name] = schema

        # Documentation:
        # https://github.com/singer-io/getting-started/blob/master/docs/DISCOVERY_MODE.md#singer-python-helper-functions
        # Reference:
        # https://github.com/singer-io/singer-python/blob/master/singer/metadata.py#L25-L44
        field_metadata[stream_name] = metadata.get_standard_metadata(
            schema=schema,
            key_properties=stream_metadata.get('key_properties', None),
            valid_replication_keys=stream_metadata.get(
                'replication_keys', None),
            replication_method=stream_metadata.get('replication_method',
                                                   None))

    return schemas, field_metadata
Example #2
0
def sync(client, config, catalog, state):
    """Sync every selected top-level stream listed in ``STREAMS``.

    The selected stream names are collected from
    ``catalog.get_selected_streams(state)``; when ``flatten_streams()``
    records a ``parent_stream`` for a selected stream, that parent is added
    as well. Each entry of ``STREAMS`` whose name is in that list is then
    passed to ``sync_endpoint``, bracketed by ``update_currently_syncing``
    calls that set and then clear the stream name in ``state``.

    Args:
        client: passed through unchanged to ``sync_endpoint``.
        config: mapping of tap settings; only ``'start_date'`` is read here.
            When the key is absent, ``None`` is passed on as ``start_date``.
        catalog: object providing ``get_selected_streams(state)``; it is also
            handed to ``get_selected_fields``, ``write_schema`` and
            ``sync_endpoint``.
        state: state object handed to ``singer.get_currently_syncing``,
            ``update_currently_syncing`` and ``sync_endpoint``.

    Returns:
        None. Returns early when no stream is selected.
    """
    # Use .get() so a config without 'start_date' yields None here instead of
    # an UnboundLocalError at the sync_endpoint call further down.
    # NOTE(review): confirm sync_endpoint accepts start_date=None.
    start_date = config.get('start_date')

    # Get selected_streams from catalog, based on state last_stream
    #   last_stream = Previous currently synced stream, if the load was interrupted
    # (last_stream is only logged in this function.)
    last_stream = singer.get_currently_syncing(state)
    LOGGER.info('last/currently syncing stream: {}'.format(last_stream))
    selected_streams = []

    flat_streams = flatten_streams()

    for stream in catalog.get_selected_streams(state):
        selected_streams.append(stream.stream)
        # Selecting a stream that declares a parent implicitly selects the parent.
        parent_stream = flat_streams.get(stream.stream, {}).get('parent_stream')
        if parent_stream and parent_stream not in selected_streams:
            selected_streams.append(parent_stream)
    LOGGER.info('selected_streams: {}'.format(selected_streams))

    if not selected_streams:
        return

    # Loop through selected_streams
    # Only the top-level entries of STREAMS are iterated; the full
    # selected_streams list is handed to sync_endpoint.
    for stream_name, endpoint_config in STREAMS.items():
        if stream_name not in selected_streams:
            continue

        LOGGER.info('START Syncing: {}'.format(stream_name))
        update_currently_syncing(state, stream_name)
        path = endpoint_config.get('path', stream_name)
        # First configured replication key, or None when there is none.
        bookmark_field = next(iter(endpoint_config.get('replication_keys', [])), None)
        replication_ind = endpoint_config.get('replication_ind', True)
        if replication_ind:
            selected_fields = get_selected_fields(catalog, stream_name)
            LOGGER.info('Stream: {}, selected_fields: {}'.format(stream_name, selected_fields))
            write_schema(catalog, stream_name)

        total_records = sync_endpoint(
            client=client,
            catalog=catalog,
            state=state,
            start_date=start_date,
            stream_name=stream_name,
            path=path,
            endpoint_config=endpoint_config,
            static_params=endpoint_config.get('params', {}),
            bookmark_query_field=endpoint_config.get('bookmark_query_field', None),
            bookmark_field=bookmark_field,
            bookmark_type=endpoint_config.get('bookmark_type', None),
            data_key=endpoint_config.get('data_key', stream_name),
            id_fields=endpoint_config.get('key_properties'),
            selected_streams=selected_streams,
            replication_ind=replication_ind)

        update_currently_syncing(state, None)
        LOGGER.info('FINISHED Syncing: {}, total_records: {}'.format(
            stream_name,
            total_records))
Example #3
0
def discover():
    """Build a ``Catalog`` with one ``CatalogEntry`` per loaded schema.

    Schemas and their metadata come from ``get_schemas()``; each stream's
    ``key_properties`` is looked up in ``flatten_streams()`` and is ``None``
    when the stream or the key is missing there.

    Returns:
        Catalog: the catalog populated with an entry for every stream
        returned by ``get_schemas()``, using the stream name as both
        ``stream`` and ``tap_stream_id``.
    """
    schemas, field_metadata = get_schemas()
    catalog = Catalog([])
    stream_configs = flatten_streams()

    for name, raw_schema in schemas.items():
        parsed_schema = Schema.from_dict(raw_schema)
        stream_mdata = field_metadata[name]
        stream_config = stream_configs.get(name, {})

        entry = CatalogEntry(
            stream=name,
            tap_stream_id=name,
            key_properties=stream_config.get('key_properties', None),
            schema=parsed_schema,
            metadata=stream_mdata,
        )
        catalog.streams.append(entry)

    return catalog
Example #4
0
def discover():
    """Build a ``Catalog`` with one ``CatalogEntry`` per loaded schema.

    Schemas and their metadata come from ``get_schemas()``. For streams whose
    ``replication_method`` in ``flatten_streams()`` is ``'INCREMENTAL'``,
    every field named in ``replication_keys`` gets its ``inclusion``
    metadata written as ``'automatic'`` before the entry is created.

    Returns:
        Catalog: the catalog populated with an entry for every stream
        returned by ``get_schemas()``, using the stream name as both
        ``stream`` and ``tap_stream_id``.
    """
    schemas, field_metadata = get_schemas()
    catalog = Catalog([])

    flat_streams = flatten_streams()
    for stream_name, schema_dict in schemas.items():
        schema = Schema.from_dict(schema_dict)
        # Convert to the map form so individual entries can be written,
        # then back to the list form when the entry is created.
        mdata = metadata.to_map(field_metadata[stream_name])

        stream = flat_streams.get(stream_name, {})
        if stream.get('replication_method') == 'INCREMENTAL':
            # An INCREMENTAL stream may have no (or a None) 'replication_keys'
            # entry; fall back to an empty sequence instead of iterating None.
            for field_name in stream.get('replication_keys') or ():
                metadata.write(mdata, ('properties', field_name), 'inclusion', 'automatic')

        catalog.streams.append(CatalogEntry(
            stream=stream_name,
            tap_stream_id=stream_name,
            key_properties=stream.get('key_properties', None),
            schema=schema,
            metadata=metadata.to_list(mdata)
        ))

    return catalog