Example 1
def do_sync(args):
    logger.info('Starting sync.')

    meltano_config = load_json_file(args.config)
    bucket_files_definition = meltano_config.get("bucket_files_definition",
                                                 None)
    if bucket_files_definition:
        if os.path.isfile(bucket_files_definition):
            config = tap_s3_csv.config.load(bucket_files_definition)
        else:
            logger.error("tap_s3_csv: '{}' file not found".format(
                bucket_files_definition))
            exit(1)
    else:
        check_config(CONFIG, REQUIRED_CONFIG_KEYS)
        config = CONFIG  # fall back to the module-level CONFIG so config['tables'] below is defined

    state = load_state(args.state)

    for table in config['tables']:
        state = sync_table(config, state, table)

    state = {'COMPLETED': True}
    singer.write_state(state)

    logger.info('Done syncing.')
Example 2
def get_sampled_schema_for_table(config, table_spec):
    logger.info('Sampling records to determine table schema.')

    s3_files = s3.get_input_files_for_table(config, table_spec)

    samples = s3.sample_files(config, table_spec, s3_files)

    metadata_schema = {
        '_s3_source_bucket': {
            'type': 'string'
        },
        '_s3_source_file': {
            'type': 'string'
        },
        '_s3_source_lineno': {
            'type': 'integer'
        },
    }

    data_schema = conversion.generate_schema(samples)

    return {
        'type': 'object',
        'properties': merge_dicts(data_schema, metadata_schema)
    }
Example 3
def list_files_in_bucket(config, bucket, search_prefix=None):
    s3_client = boto3.client('s3')

    s3_objects = []

    max_results = 1000
    args = {
        'Bucket': bucket,
        'MaxKeys': max_results,
    }

    if search_prefix is not None:
        args['Prefix'] = search_prefix

    result = s3_client.list_objects_v2(**args)

    s3_objects += result.get('Contents', [])  # 'Contents' is absent when no keys match
    next_continuation_token = result.get('NextContinuationToken')

    while next_continuation_token is not None:
        logger.debug('Continuing pagination with token "{}".'.format(
            next_continuation_token))

        continuation_args = args.copy()
        continuation_args['ContinuationToken'] = next_continuation_token

        result = s3_client.list_objects_v2(**continuation_args)

        s3_objects += result.get('Contents', [])
        next_continuation_token = result.get('NextContinuationToken')

    logger.info("Found {} files.".format(len(s3_objects)))

    return s3_objects
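
The hand-rolled continuation-token loop above can also be written with boto3's built-in paginator, which tracks NextContinuationToken internally. A minimal sketch of that alternative, assuming the same bucket/prefix inputs and the default AWS credential chain (list_files_in_bucket_paginated is an illustrative name, not part of the tap):

import boto3


def list_files_in_bucket_paginated(bucket, search_prefix=None):
    # Same listing as above, but boto3's paginator handles the
    # NextContinuationToken bookkeeping for us.
    s3_client = boto3.client('s3')
    paginator = s3_client.get_paginator('list_objects_v2')

    kwargs = {'Bucket': bucket}
    if search_prefix is not None:
        kwargs['Prefix'] = search_prefix

    s3_objects = []
    for page in paginator.paginate(**kwargs):
        # 'Contents' is omitted from pages with no matching keys.
        s3_objects += page.get('Contents', [])

    return s3_objects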
Example 4
def sync_table_file(config, s3_file, table_spec, schema):
    logger.info('Syncing file "{}".'.format(s3_file))

    bucket = config['bucket']
    table_name = table_spec['name']

    iterator = tap_s3_csv.format_handler.get_row_iterator(
        config, table_spec, s3_file)

    records_synced = 0

    for row in iterator:
        metadata = {
            '_s3_source_bucket': bucket,
            '_s3_source_file': s3_file,

            # records_synced is zero-based; +2 accounts for the header row
            # and 1-based line numbering
            '_s3_source_lineno': records_synced + 2
        }

        try:
            to_write = [{**conversion.convert_row(row, schema), **metadata}]
            singer.write_records(table_name, to_write)
        except BrokenPipeError as bpe:
            logger.error(
                f'Pipe to loader broke after {records_synced} records were '
                f'written from {s3_file}: troubled line was {row}'
            )
            raise bpe

        records_synced += 1

    return records_synced
Example 5
def sync_table_file(config, s3_file, table_spec, schema):
    logger.info('Syncing file "{}".'.format(s3_file))

    bucket = config['bucket']
    table_name = table_spec['name']

    iterator = tap_s3_csv.format_handler.get_row_iterator(
        config, table_spec, s3_file)

    records_synced = 0

    for row in iterator:
        metadata = {
            '_s3_source_bucket': bucket,
            '_s3_source_file': s3_file,

            # records_synced is zero-based; +2 accounts for the header row
            # and 1-based line numbering
            '_s3_source_lineno': records_synced + 2
        }

        to_write = [{**conversion.convert_row(row, schema), **metadata}]
        singer.write_records(table_name, to_write)
        records_synced += 1

    return records_synced
Example 6
def do_sync(args):
    logger.info('Starting sync.')

    config = tap_s3_csv.config.load(args.config)
    state = load_state(args.state)

    for table in config['tables']:
        state = sync_table(config, state, table)

    logger.info('Done syncing.')
Example 7
def sample_file(config, table_spec, s3_path, sample_rate, max_records):
    logger.info('Sampling {} ({} records, every {}th record).'.format(
        s3_path, max_records, sample_rate))

    samples = []

    iterator = tap_s3_csv.format_handler.get_row_iterator(
        config, table_spec, s3_path)

    current_row = 0

    for row in iterator:
        if (current_row % sample_rate) == 0:
            samples.append(row)

        current_row += 1

        if len(samples) >= max_records:
            break

    logger.info('Sampled {} records.'.format(len(samples)))

    return samples
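
For context, the sampler keeps every sample_rate-th row (rows 0, sample_rate, 2*sample_rate, ...) until max_records samples have been collected. An illustrative call site (the path and literals here are made up, not taken from the tap):

# Keeps at most 1,000 rows, one out of every 10, from a single S3 key.
samples = sample_file(config, table_spec, 'exports/users.csv',
                      sample_rate=10, max_records=1000)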
Example 8
def sync_table(config, state, table_spec):
    table_name = table_spec['name']
    modified_since = dateutil.parser.parse(
        state.get(table_name, {}).get('modified_since') or
        config['start_date'])

    logger.info('Syncing table "{}".'.format(table_name))
    logger.info('Getting files modified since {}.'.format(modified_since))

    s3_files = s3.get_input_files_for_table(
        config, table_spec, modified_since)

    logger.info('Found {} files to be synced.'
                .format(len(s3_files)))

    if not s3_files:
        return state

    inferred_schema = get_sampled_schema_for_table(config, table_spec)
    override_schema = {'properties': table_spec.get('schema_overrides', {})}
    schema = merge_dicts(
        inferred_schema,
        override_schema)

    singer.write_schema(
        table_name,
        schema,
        key_properties=table_spec['key_properties'])

    records_streamed = 0

    for s3_file in s3_files:
        records_streamed += sync_table_file(
            config, s3_file['key'], table_spec, schema)

        state[table_name] = {
            'modified_since': s3_file['last_modified'].isoformat()
        }

        singer.write_state(state)

    logger.info('Wrote {} records for table "{}".'
                .format(records_streamed, table_name))

    return state
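
After each file, the state passed to singer.write_state carries one bookmark per table, keyed by the table name and holding the LastModified timestamp of the most recently synced file. Roughly (values illustrative):

{
    'users': {
        'modified_since': '2021-06-01T12:34:56+00:00'
    }
}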
Example 9
def retry_handler(details):
    logger.info("Received retryable error -- Retry %s/%s",
                details['tries'], MAX_RETRIES)
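
The details dict used here matches the event-handler signature of the backoff library, so a handler like this is normally attached through on_backoff. A minimal sketch of that wiring, assuming backoff drives the retries (the decorated function and the retried exception type are illustrative):

import backoff


@backoff.on_exception(backoff.expo,
                      ConnectionError,           # illustrative retryable error
                      max_tries=MAX_RETRIES,
                      on_backoff=retry_handler)  # the handler defined above
def fetch_object(s3_client, bucket, key):
    # Every failed attempt calls retry_handler with a details dict that
    # includes 'tries', 'wait', 'elapsed', 'target', 'args' and 'kwargs'.
    return s3_client.get_object(Bucket=bucket, Key=key)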