Example #1
def sync_stream(config: Dict, state: Dict, table_spec: Dict, stream: Dict) -> int:
    """
    Sync the stream
    :param config: Connection and stream config
    :param state: current state
    :param table_spec: table specs
    :param stream: stream
    :return: count of streamed records
    """
    table_name = table_spec['table_name']
    modified_since = utils.strptime_with_tz(get_bookmark(state, table_name, 'modified_since') or
                                            config['start_date'])

    LOGGER.info('Syncing table "%s".', table_name)
    LOGGER.info('Getting files modified since %s.', modified_since)

    s3_files = s3.get_input_files_for_table(
        config, table_spec, modified_since)

    records_streamed = 0

    # We sort here so that tracking the modified_since bookmark makes
    # sense. This means that we can't sync s3 buckets that are larger than
    # we can sort in memory which is suboptimal. If we could bookmark
    # based on anything else then we could just sync files as we see them.
    for s3_file in sorted(s3_files, key=lambda item: item['last_modified']):
        records_streamed += sync_table_file(
            config, s3_file['key'], table_spec, stream)

        state = write_bookmark(state, table_name, 'modified_since', s3_file['last_modified'].isoformat())
        write_state(state)

    LOGGER.info('Wrote %s records for table "%s".', records_streamed, table_name)

    return records_streamed
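
All of the variants below follow the same singer bookmark pattern shown here: after each file is synced, its last_modified timestamp is written into the state and the state is emitted, so an interrupted or repeated run only has to fetch newer files. A minimal sketch of that pattern using plain dicts (the helpers are illustrative stand-ins for singer-python's get_bookmark/write_bookmark, assuming the usual state['bookmarks'][stream][key] layout; timestamps are shown as ISO strings here, while the real code works with datetime objects and calls .isoformat()):

def write_bookmark(state, table_name, key, value):
    # Mirrors the state shape singer-python maintains: state['bookmarks'][table][key].
    state.setdefault('bookmarks', {}).setdefault(table_name, {})[key] = value
    return state

def get_bookmark(state, table_name, key, default=None):
    return state.get('bookmarks', {}).get(table_name, {}).get(key, default)

state = {}
s3_files = [
    {'key': 'exports/2021-01-02.csv', 'last_modified': '2021-01-02T00:00:00Z'},
    {'key': 'exports/2021-01-01.csv', 'last_modified': '2021-01-01T00:00:00Z'},
]

# Processing files in last_modified order means the bookmark only moves forward,
# which is what makes it safe to persist after every file.
for s3_file in sorted(s3_files, key=lambda item: item['last_modified']):
    state = write_bookmark(state, 'orders', 'modified_since', s3_file['last_modified'])

print(get_bookmark(state, 'orders', 'modified_since'))  # 2021-01-02T00:00:00Z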
Example #2
def sync_stream(config, state, table_spec, stream):
    table_name = table_spec['table_name']
    modified_since = utils.strptime_with_tz(
        singer.get_bookmark(state, table_name, 'modified_since')
        or config['start_date'])

    LOGGER.info('Syncing table "%s".', table_name)
    LOGGER.info('Getting files modified since %s.', modified_since)

    s3_files = s3.get_input_files_for_table(config, table_spec, modified_since)

    LOGGER.info('Found %s files to be synced.', len(s3_files))

    records_streamed = 0
    if not s3_files:
        return records_streamed

    for s3_file in s3_files:
        records_streamed += sync_table_file(config, s3_file['key'], table_spec,
                                            stream)

        state = singer.write_bookmark(state, table_name, 'modified_since',
                                      s3_file['last_modified'].isoformat())
        singer.write_state(state)

    LOGGER.info('Wrote %s records for table "%s".', records_streamed,
                table_name)

    return records_streamed
Example #3
def get_sampled_schema_for_table(config, table_spec):
    logger.info('Sampling records to determine table schema.')

    s3_files = s3.get_input_files_for_table(config, table_spec)

    samples = s3.sample_files(config, table_spec, s3_files)

    metadata_schema = {
        '_s3_source_bucket': {
            'type': 'string'
        },
        '_s3_source_file': {
            'type': 'string'
        },
        '_s3_source_lineno': {
            'type': 'integer'
        },
    }

    data_schema = conversion.generate_schema(samples)

    return {
        'type': 'object',
        'properties': merge_dicts(data_schema, metadata_schema)
    }
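
The returned schema is just the inferred column schema with the three _s3_source_* metadata columns mixed in. conversion.generate_schema and merge_dicts are not shown in this snippet; for this flat, non-overlapping case a plain dict merge gives the same result, as in this sketch (the data columns are made up):

data_schema = {                      # what sampling might infer from the CSV rows
    'id': {'type': 'integer'},
    'name': {'type': 'string'},
}
metadata_schema = {
    '_s3_source_bucket': {'type': 'string'},
    '_s3_source_file': {'type': 'string'},
    '_s3_source_lineno': {'type': 'integer'},
}

schema = {
    'type': 'object',
    'properties': {**data_schema, **metadata_schema},
}
# schema['properties'] now holds both the inferred columns and the
# _s3_source_* columns that identify where each record came from.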
Example #4
def sync_stream(config, state, table_spec, stream):
    table_name = table_spec['table_name']
    modified_since = singer_utils.strptime_with_tz(
        singer.get_bookmark(state, table_name, 'modified_since')
        or config['start_date'])

    LOGGER.info('Syncing table "%s".', table_name)
    LOGGER.info('Getting files modified since %s.', modified_since)

    s3_files = s3.get_input_files_for_table(
        config, table_spec, modified_since)

    records_streamed = 0

    # We sort here so that tracking the modified_since bookmark makes
    # sense. This means that we can't sync s3 buckets that are larger than
    # we can sort in memory which is suboptimal. If we could bookmark
    # based on anything else then we could just sync files as we see them.
    for s3_file in sorted(s3_files, key=lambda item: item['last_modified']):
        records_streamed += sync_table_file(
            config, s3_file['key'], table_spec, stream)

        state = singer.write_bookmark(state, table_name, 'modified_since', s3_file['last_modified'].isoformat())
        singer.write_state(state)

    if s3.skipped_files_count:
        LOGGER.warning("%s files were skipped during the last sync.", s3.skipped_files_count)

    LOGGER.info('Wrote %s records for table "%s".', records_streamed, table_name)

    return records_streamed
Example #5
def sync_stream(config, state, table_spec, stream):
    table_name = table_spec["table_name"]
    modified_since = utils.strptime_with_tz(
        singer.get_bookmark(state, table_name, "modified_since")
        or config["start_date"])

    LOGGER.info('Syncing table "%s".', table_name)
    LOGGER.info("Getting files modified since %s.", modified_since)

    s3_files = s3.get_input_files_for_table(config, table_spec, modified_since)

    records_streamed = 0

    # We sort here so that tracking the modified_since bookmark makes
    # sense. This means that we can't sync s3 buckets that are larger than
    # we can sort in memory which is suboptimal. If we could bookmark
    # based on anything else then we could just sync files as we see them.
    for s3_file in sorted(s3_files, key=lambda item: item["last_modified"]):
        records_streamed += sync_table_file(config, s3_file["key"], table_spec,
                                            stream, s3_file["last_modified"])

        state = singer.write_bookmark(state, table_name, "modified_since",
                                      s3_file["last_modified"].isoformat())
        singer.write_state(state)

    LOGGER.info('Wrote %s records for table "%s".', records_streamed,
                table_name)

    return records_streamed
Example #6
def sync_stream(config, state, table_spec, stream):
    table_name = table_spec['table_name']
    bookmark = singer.get_bookmark(state, table_name, 'modified_since')
    modified_since = utils.strptime_with_tz(bookmark or '1990-01-01T00:00:00Z')

    LOGGER.info('Syncing table "%s".', table_name)
    LOGGER.info('Getting files modified since %s.', modified_since)

    s3_files = s3.get_input_files_for_table(config, table_spec, modified_since)

    records_streamed = 0

    # The original implementation sorted by 'last_modified' so that the modified_since
    # bookmark makes sense. We sort by 'key' instead because we import multiple part
    # files generated by Spark, whose names are in incremental order.
    # This means that we can't sync s3 buckets that are larger than
    # we can sort in memory which is suboptimal. If we could bookmark
    # based on anything else then we could just sync files as we see them.
    for s3_file in sorted(s3_files, key=lambda item: item['key']):
        records_streamed += sync_table_file(config, s3_file['key'], table_spec,
                                            stream)

        state = singer.write_bookmark(state, table_name, 'modified_since',
                                      s3_file['last_modified'].isoformat())
        singer.write_state(state)

    LOGGER.info('Wrote %s records for table "%s".', records_streamed,
                table_name)

    return records_streamed
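
This variant keys the sort on the object key rather than last_modified, which works for Spark output because part files carry zero-padded, incrementing suffixes. A short sketch of that ordering (file names invented for illustration); note that the bookmark is still written from last_modified, so with key ordering it is not guaranteed to end at the newest timestamp:

s3_files = [
    {'key': 'exports/run1/part-00002.csv', 'last_modified': '2021-01-01T00:00:05Z'},
    {'key': 'exports/run1/part-00000.csv', 'last_modified': '2021-01-01T00:00:07Z'},
    {'key': 'exports/run1/part-00001.csv', 'last_modified': '2021-01-01T00:00:06Z'},
]

# Lexicographic order on the key matches the order Spark wrote the parts.
for s3_file in sorted(s3_files, key=lambda item: item['key']):
    print(s3_file['key'])
# exports/run1/part-00000.csv
# exports/run1/part-00001.csv
# exports/run1/part-00002.csv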
Example #7
def sync_table(config, state, table_spec):
    table_name = table_spec['name']
    modified_since = dateutil.parser.parse(
        state.get(table_name, {}).get('modified_since') or
        config['start_date'])

    logger.info('Syncing table "{}".'.format(table_name))
    logger.info('Getting files modified since {}.'.format(modified_since))

    s3_files = s3.get_input_files_for_table(
        config, table_spec, modified_since)

    logger.info('Found {} files to be synced.'
                .format(len(s3_files)))

    if not s3_files:
        return state

    inferred_schema = get_sampled_schema_for_table(config, table_spec)
    override_schema = {'properties': table_spec.get('schema_overrides', {})}
    schema = merge_dicts(
        inferred_schema,
        override_schema)

    singer.write_schema(
        table_name,
        schema,
        key_properties=table_spec['key_properties'])

    records_streamed = 0

    for s3_file in s3_files:
        records_streamed += sync_table_file(
            config, s3_file['key'], table_spec, schema)

        state[table_name] = {
            'modified_since': s3_file['last_modified'].isoformat()
        }

        singer.write_state(state)

    logger.info('Wrote {} records for table "{}".'
                .format(records_streamed, table_name))

    return state
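
Here the inferred schema is combined with per-table schema_overrides before the schema is written out. merge_dicts is not shown in this snippet; for an override to replace an individual inferred column it has to merge nested dicts with the second argument winning, which is what this sketch assumes (column names invented):

def merge_dicts(first, second):
    # Assumed recursive merge: keys from `second` win, nested dicts are merged.
    merged = dict(first)
    for key, value in second.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = merge_dicts(merged[key], value)
        else:
            merged[key] = value
    return merged

inferred_schema = {
    'type': 'object',
    'properties': {
        'id': {'type': 'integer'},
        'amount': {'type': 'string'},   # sampling only saw quoted values
    },
}
override_schema = {'properties': {'amount': {'type': 'number'}}}

schema = merge_dicts(inferred_schema, override_schema)
# schema['properties']['amount'] is now {'type': 'number'}; 'id' is untouched.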