Code example #1
def sync_table_file(config, s3_path, table_spec, stream):
    LOGGER.info('Syncing file "%s".', s3_path)

    bucket = config['bucket']
    table_name = table_spec['table_name']

    s3_file_handle = s3.get_file_handle(config, s3_path)
    iterator = csv.get_row_iterator(s3_file_handle._raw_stream, table_spec)  #pylint:disable=protected-access

    records_synced = 0

    for row in iterator:
        custom_columns = {
            s3.SDC_SOURCE_BUCKET_COLUMN: bucket,
            s3.SDC_SOURCE_FILE_COLUMN: s3_path,

            # index zero, +1 for header row
            s3.SDC_SOURCE_LINENO_COLUMN: records_synced + 2
        }
        rec = {**row, **custom_columns}

        with Transformer() as transformer:
            to_write = transformer.transform(
                rec, stream['schema'], metadata.to_map(stream['metadata']))

        singer.write_record(table_name, to_write)
        records_synced += 1

    return records_synced
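As a side note, the merge above can be illustrated with a tiny self-contained snippet. The _sdc_* string values below are assumptions for illustration (the real constants live in the tap's s3 module); the "+ 2" makes the first data row report line 2, since line 1 of the CSV is the header.

row = {"id": "1", "name": "Ada"}
custom_columns = {
    "_sdc_source_bucket": "my-bucket",        # assumed value of s3.SDC_SOURCE_BUCKET_COLUMN
    "_sdc_source_file": "exports/users.csv",  # assumed value of s3.SDC_SOURCE_FILE_COLUMN
    "_sdc_source_lineno": 0 + 2,              # first record: records_synced == 0
}
print({**row, **custom_columns})  # custom columns win on any name collision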
Code example #2
File: sync.py  Project: tomwang-varicent/tap-s3-csv
def sync_table_file(config, s3_path, table_spec, stream):
    LOGGER.info('Syncing file "%s".', s3_path)

    bucket = config['bucket']
    table_name = table_spec['table_name']

    s3_file_handle = s3.get_file_handle(config, s3_path)
    # We observed data whose field size exceeded the default maximum of
    # 131072. We believe the primary consequence of the following setting
    # is that a malformed, wide CSV would potentially parse into a single
    # large field rather than giving this error, but we also think the
    # chances of that are very small and at any rate the source data would
    # need to be fixed. The other consequence of this could be larger
    # memory consumption but that's acceptable as well.
    csv.field_size_limit(sys.maxsize)
    iterator = csv_iterator.get_row_iterator(s3_file_handle._raw_stream,
                                             table_spec)  #pylint:disable=protected-access

    records_synced = 0

    for row in iterator:

        with Transformer() as transformer:
            to_write = transformer.transform(
                row, stream['schema'], metadata.to_map(stream['metadata']))

        singer.write_record(table_name, to_write)
        records_synced += 1

    return records_synced
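A quick self-contained sketch of why the csv.field_size_limit(sys.maxsize) call matters: the standard csv module raises "field larger than field limit (131072)" as soon as a single field exceeds the default limit, and raising the limit lets such rows parse.

import csv
import io
import sys

csv.field_size_limit(sys.maxsize)

big_field = "x" * 200_000  # larger than the default limit of 131072
reader = csv.DictReader(io.StringIO("col\n" + big_field + "\n"))
print(len(next(iter(reader))["col"]))  # 200000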
Code example #3
def sync_gz_file(config, s3_path, table_spec, stream, file_handler):
    if s3_path.endswith(".tar.gz"):
        LOGGER.warning('Skipping "%s" file as .tar.gz extension is not supported',s3_path)
        s3.skipped_files_count = s3.skipped_files_count + 1
        return 0

    # If the file was extracted from a zip, use that file object; otherwise get the file object from the S3 bucket
    file_object = file_handler if file_handler else s3.get_file_handle(config, s3_path)

    file_bytes = file_object.read()
    gz_file_obj = gzip.GzipFile(fileobj=io.BytesIO(file_bytes))

    # pylint: disable=duplicate-code
    try:
        gz_file_name = utils.get_file_name_from_gzfile(fileobj=io.BytesIO(file_bytes))
    except AttributeError as err:
        # If a file is compressed using the gzip command with the --no-name option,
        # it will not return the file name and timestamp, so we skip such files.
        # We have also seen this issue occur when tar is used to compress the file.
        LOGGER.warning('Skipping "%s" file as we did not get the original file name',s3_path)
        s3.skipped_files_count = s3.skipped_files_count + 1
        return 0

    if gz_file_name:

        if gz_file_name.endswith(".gz"):
            LOGGER.warning('Skipping "%s" file as it contains nested compression.',s3_path)
            s3.skipped_files_count = s3.skipped_files_count + 1
            return 0

        gz_file_extension = gz_file_name.split(".")[-1].lower()
        return handle_file(config, s3_path + "/" + gz_file_name, table_spec, stream, gz_file_extension, io.BytesIO(gz_file_obj.read()))

    raise Exception('"{}" file has some error(s)'.format(s3_path))
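The --no-name behaviour mentioned in the comment comes from the gzip header itself: a member only carries its original file name when the FNAME flag is set. The sketch below is not the tap's utils.get_file_name_from_gzfile implementation, just a minimal illustration of reading that name by hand per RFC 1952.

import gzip
import io
import struct

FEXTRA, FNAME = 0x04, 0x08  # gzip FLG bits (RFC 1952)

def read_gzip_member_name(fileobj):
    # 10-byte fixed header: magic, method, flags, mtime, xfl, os
    header = fileobj.read(10)
    if len(header) < 10 or header[:2] != b"\x1f\x8b":
        raise ValueError("not a gzip stream")
    flags = header[3]
    if flags & FEXTRA:
        extra_len, = struct.unpack("<H", fileobj.read(2))
        fileobj.read(extra_len)
    if not flags & FNAME:
        return None  # e.g. compressed with gzip --no-name: no original name stored
    name = bytearray()
    while (byte := fileobj.read(1)) not in (b"", b"\x00"):
        name += byte
    return name.decode("latin-1")

# One in-memory member that stores a name, and one that does not.
buf = io.BytesIO()
with gzip.GzipFile(filename="report.csv", mode="wb", fileobj=buf) as gz:
    gz.write(b"a,b\n1,2\n")
print(read_gzip_member_name(io.BytesIO(buf.getvalue())))           # report.csv
print(read_gzip_member_name(io.BytesIO(gzip.compress(b"a,b\n"))))  # None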
Code example #4
def sync_gz_file(config, s3_path, table_spec, stream, file_handler):
    if s3_path.endswith(".tar.gz"):
        LOGGER.warning(
            'Skipping "%s" file as .tar.gz extension is not supported',
            s3_path)
        return 0

    # If the file was extracted from a zip, use that file object; otherwise get the file object from the S3 bucket
    file_object = file_handler if file_handler else s3.get_file_handle(
        config, s3_path)

    file_bytes = file_object.read()
    gz_file_obj = gzip.GzipFile(fileobj=io.BytesIO(file_bytes))

    gz_file_name = utils.get_file_name_from_gzfile(
        fileobj=io.BytesIO(file_bytes))

    if gz_file_name:

        if gz_file_name.endswith(".gz"):
            LOGGER.warning(
                'Skipping "%s" file as it contains nested compression.',
                s3_path)
            return 0

        gz_file_extension = gz_file_name.split(".")[-1].lower()
        return handle_file(config, s3_path + "/" + gz_file_name, table_spec,
                           stream, gz_file_extension,
                           io.BytesIO(gz_file_obj.read()))

    raise Exception('"{}" file has some error(s)'.format(s3_path))
Code example #5
File: sync.py  Project: kchan-varicent/tap-s3-csv
def handle_file(config, s3_path, table_spec, stream, extension, file_handler=None):
    """
    Used to sync normal supported files
    """

    # Check whether the file is missing an extension
    if not extension or s3_path.lower() == extension:
        LOGGER.warning('"%s" without extension will not be synced.', s3_path)
        s3.skipped_files_count = s3.skipped_files_count + 1
        return 0
    if extension == "gz":
        return sync_gz_file(config, s3_path, table_spec, stream, file_handler)

    if extension in ["csv", "txt"]:

        # If the file was extracted from a zip or gz, use that file object; otherwise get the file object from the S3 bucket
        file_handle = file_handler if file_handler else s3.get_file_handle(
            config, s3_path)._raw_stream  # pylint:disable=protected-access
        return sync_csv_file(config, file_handle, s3_path, table_spec, stream)

    if extension == "jsonl":

        # If the file was extracted from a zip or gz, use that file object; otherwise get the file object from the S3 bucket
        file_handle = file_handler if file_handler else s3.get_file_handle(
            config, s3_path)._raw_stream
        records = sync_jsonl_file(
            config, file_handle, s3_path, table_spec, stream)
        if records == 0:
            # Whitespace alone is not valid JSON, but it is a valid CSV header, so we skip a JSONL file that contains only whitespace.
            s3.skipped_files_count = s3.skipped_files_count + 1
            LOGGER.warning('Skipping "%s" file as it is empty', s3_path)
        return records

    if extension == "zip":
        LOGGER.warning(
            'Skipping "%s" file as it contains nested compression.', s3_path)
        s3.skipped_files_count = s3.skipped_files_count + 1
        return 0

    LOGGER.warning(
        '"%s" having the ".%s" extension will not be synced.', s3_path, extension)
    s3.skipped_files_count = s3.skipped_files_count + 1
    return 0
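Below is a small hypothetical driver only to show how the extension argument handed to handle_file is typically derived: the last dot-separated segment, lower-cased, matching how sync_gz_file computes gz_file_extension. The keys are made up.

def extension_of(key):
    # Last dot-separated segment, lower-cased; empty string when there is no dot.
    return key.rsplit(".", 1)[-1].lower() if "." in key else ""

for key in ["exports/2023/orders.csv", "exports/2023/orders.csv.gz", "README"]:
    print(key, "->", extension_of(key) or "<no extension>")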
Code example #6
File: sync.py  Project: oncoramedical/tap-s3-csv
def sync_table_file(config, s3_path, table_spec, stream, last_modified):
    LOGGER.info('Syncing file "%s".', s3_path)

    bucket = config["bucket"]
    table_name = table_spec["table_name"]

    s3_file_handle = s3.get_file_handle(config, s3_path)
    # We observed data whose field size exceeded the default maximum of
    # 131072. We believe the primary consequence of the following setting
    # is that a malformed, wide CSV would potentially parse into a single
    # large field rather than giving this error, but we also think the
    # chances of that are very small and at any rate the source data would
    # need to be fixed. The other consequence of this could be larger
    # memory consumption but that's acceptable as well.
    csv.field_size_limit(sys.maxsize)

    encoding_module = singer_encodings_csv
    if 'encoding_module' in config:
        try:
            encoding_module = importlib.import_module(
                config['encoding_module'])
        except ModuleNotFoundError:
            LOGGER.warning(
                f'Failed to load encoding module [{config["encoding_module"]}]. Defaulting to [singer_encodings.csv]'
            )

    iterator = encoding_module.get_row_iterator(s3_file_handle._raw_stream,
                                                table_spec)  # pylint:disable=protected-access

    records_synced = 0

    for row in iterator:
        custom_columns = {
            s3.SDC_SOURCE_BUCKET_COLUMN: bucket,
            s3.SDC_SOURCE_FILE_COLUMN: s3_path,
            # index zero, +1 for header row
            s3.SDC_SOURCE_LINENO_COLUMN: records_synced + 2,
        }
        rec = {**row, **custom_columns}

        with Transformer() as transformer:
            to_write = transformer.transform(
                rec, stream["schema"], metadata.to_map(stream["metadata"]))

        to_write_with_sequence = RecordMessageWithSequence(
            singer.RecordMessage(stream=table_name, record=to_write),
            last_modified)

        singer.write_message(to_write_with_sequence)
        records_synced += 1

    return records_synced
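The importlib hook above only relies on the configured module exposing a get_row_iterator(stream, table_spec) callable that yields one dict per record. Here is a minimal hypothetical drop-in module under that assumption; the latin-1 decoding and the 'delimiter' key are illustrative choices, not the tap's documented behaviour.

# my_encodings.py  (hypothetical; referenced via "encoding_module": "my_encodings")
import csv
import io

def get_row_iterator(stream, table_spec):
    # Wrap the raw byte stream in a text reader and hand back a dict-per-row
    # iterator, which is all sync_table_file consumes.
    delimiter = table_spec.get("delimiter", ",")
    text_stream = io.TextIOWrapper(stream, encoding="latin-1", errors="replace")
    return csv.DictReader(text_stream, delimiter=delimiter)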
Code example #7
def handle_file(config,
                s3_path,
                table_spec,
                stream,
                extension,
                file_handler=None):
    """
    Used to sync normal supported files
    """

    # Check whether the file is missing an extension
    if not extension or s3_path.lower() == extension:
        LOGGER.warning('"%s" without extension will not be synced.', s3_path)
        return 0
    if extension == "gz":
        return sync_gz_file(config, s3_path, table_spec, stream, file_handler)

    if extension in ["csv", "txt"]:

        # If the file was extracted from a zip or gz, use that file object; otherwise get the file object from the S3 bucket
        file_handle = file_handler if file_handler else s3.get_file_handle(
            config, s3_path)._raw_stream  #pylint:disable=protected-access
        return sync_csv_file(config, file_handle, s3_path, table_spec, stream)

    if extension == "jsonl":

        # If the file was extracted from a zip or gz, use that file object; otherwise get the file object from the S3 bucket
        file_handle = file_handler if file_handler else s3.get_file_handle(
            config, s3_path)._raw_stream
        return sync_jsonl_file(config, file_handle, s3_path, table_spec,
                               stream)

    if extension == "zip":
        LOGGER.warning('Skipping "%s" file as it contains nested compression.',
                       s3_path)
        return 0

    LOGGER.warning('"%s" having the ".%s" extension will not be synced.',
                   s3_path, extension)
    return 0
Code example #8
def sync_table_file(config: Dict, s3_path: str, table_spec: Dict, stream: Dict) -> int:
    """
    Sync a given CSV file found on S3
    :param config: tap configuration
    :param s3_path: file path given by S3
    :param table_spec: table specs
    :param stream: Stream data
    :return: number of streamed records
    """
    LOGGER.info('Syncing file "%s".', s3_path)

    bucket = config['bucket']
    table_name = table_spec['table_name']

    s3_file_handle = s3.get_file_handle(config, s3_path)
    # We observed data whose field size exceeded the default maximum of
    # 131072. We believe the primary consequence of the following setting
    # is that a malformed, wide CSV would potentially parse into a single
    # large field rather than giving this error, but we also think the
    # chances of that are very small and at any rate the source data would
    # need to be fixed. The other consequence of this could be larger
    # memory consumption but that's acceptable as well.
    csv.field_size_limit(sys.maxsize)
    iterator = get_row_iterator(s3_file_handle._raw_stream, table_spec)  # pylint:disable=protected-access

    records_synced = 0

    for row in iterator:
        time_extracted = utils.now()

        custom_columns = {
            s3.SDC_SOURCE_BUCKET_COLUMN: bucket,
            s3.SDC_SOURCE_FILE_COLUMN: s3_path,

            # index zero, +1 for header row
            s3.SDC_SOURCE_LINENO_COLUMN: records_synced + 2
        }
        rec = {**row, **custom_columns}

        with Transformer() as transformer:
            to_write = transformer.transform(rec, stream['schema'], metadata.to_map(stream['metadata']))

        write_record(table_name, to_write, time_extracted=time_extracted)
        records_synced += 1

    return records_synced
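For reference, passing time_extracted through write_record is roughly equivalent to emitting a RecordMessage yourself with singer-python, as sketched below; the stream name and record are made up.

import singer
from singer import utils

singer.write_message(singer.RecordMessage(
    stream="my_table",
    record={"id": 1},
    time_extracted=utils.now(),  # timezone-aware UTC timestamp
))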
Code example #9
def sync_compressed_file(config, s3_path, table_spec, stream):
    LOGGER.info('Syncing Compressed file "%s".', s3_path)

    records_streamed = 0
    s3_file_handle = s3.get_file_handle(config, s3_path)

    decompressed_files = compression.infer(io.BytesIO(s3_file_handle.read()), s3_path)

    for decompressed_file in decompressed_files:
        extension = decompressed_file.name.split(".")[-1].lower()

        if extension in ["csv", "jsonl", "gz", "txt"]:
            # Append the extracted file name to the zip file's path.
            s3_file_path = s3_path + "/" + decompressed_file.name

            records_streamed += handle_file(config, s3_file_path, table_spec, stream, extension, file_handler=decompressed_file)

    return records_streamed
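compression.infer here comes from the singer-encodings package; the sketch below is not that implementation, just an illustrative stand-in showing how decompressed members with a usable .name attribute could be produced for the loop above. NamedBytesIO and infer_members are invented names.

import gzip
import io
import zipfile

class NamedBytesIO(io.BytesIO):
    """In-memory file object that also carries the member's file name."""
    def __init__(self, data, name):
        super().__init__(data)
        self.name = name

def infer_members(file_bytes, s3_path):
    if s3_path.endswith(".zip"):
        with zipfile.ZipFile(io.BytesIO(file_bytes)) as archive:
            for member in archive.namelist():
                yield NamedBytesIO(archive.read(member), member)
    elif s3_path.endswith(".gz"):
        yield NamedBytesIO(gzip.decompress(file_bytes), s3_path[:-3])
    else:
        yield NamedBytesIO(file_bytes, s3_path)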
Code example #10
def sync_table_file(config: Dict, s3_path: str, table_spec: Dict,
                    stream: Dict) -> int:
    """
    Sync a given CSV file found on S3
    :param config: tap configuration
    :param s3_path: file path given by S3
    :param table_spec: table specs
    :param stream: Stream data
    :return: number of streamed records
    """
    LOGGER.info('Syncing file "%s".', s3_path)

    bucket = config['bucket']
    table_name = table_spec['table_name']

    s3_file_handle = s3.get_file_handle(config, s3_path)
    iterator = get_row_iterator(s3_file_handle._raw_stream, table_spec)  # pylint:disable=protected-access

    records_synced = 0

    for row in iterator:
        custom_columns = {
            s3.SDC_SOURCE_BUCKET_COLUMN: bucket,
            s3.SDC_SOURCE_FILE_COLUMN: s3_path,

            # index zero, +1 for header row
            s3.SDC_SOURCE_LINENO_COLUMN: records_synced + 2
        }
        rec = {**row, **custom_columns}

        with Transformer() as transformer:
            to_write = transformer.transform(
                rec, stream['schema'], metadata.to_map(stream['metadata']))

        write_record(table_name, to_write)
        records_synced += 1

    return records_synced
Code example #11
File: sync.py  Project: crimsonmacaw/tap-s3-csv
def sync_table_file(config, s3_path, table_spec, stream):
    LOGGER.info('Syncing file "%s".', s3_path)

    bucket = config['bucket']
    table_name = table_spec['table_name']

    s3_file_handle = s3.get_file_handle(config, s3_path)
    # We observed data whose field size exceeded the default maximum of
    # 131072. We believe the primary consequence of the following setting
    # is that a malformed, wide CSV would potentially parse into a single
    # large field rather than giving this error, but we also think the
    # chances of that are very small and at any rate the source data would
    # need to be fixed. The other consequence of this could be larger
    # memory consumption but that's acceptable as well.
    csv.field_size_limit(sys.maxsize)
    iterator = singer_encodings_csv.get_row_iterator(
        s3_file_handle._raw_stream, table_spec) #pylint:disable=protected-access

    records_synced = 0

    for row in iterator:
        custom_columns = {
            s3.SDC_SOURCE_BUCKET_COLUMN: bucket,
            s3.SDC_SOURCE_FILE_COLUMN: s3_path,

            # index zero, +1 for header row
            s3.SDC_SOURCE_LINENO_COLUMN: records_synced + 2
        }
        rec = {**row, **custom_columns}

        with Transformer() as transformer:
            to_write = transformer.transform(rec, stream.schema.to_dict(), metadata.to_map(stream.metadata))

        singer.write_record(table_name, to_write)
        records_synced += 1

    return records_synced
Code example #12
File: sync.py  Project: valulucchesi/tap-s3-csv
def sync_table_file(config, s3_path, table_spec, stream, modified):
    LOGGER.info('Syncing file "%s".', s3_path)

    bucket = config['bucket']
    table_name = table_spec['table_name']

    s3_file_handle = s3.get_file_handle(config, s3_path)
    # We observed data whose field size exceeded the default maximum of
    # 131072. We believe the primary consequence of the following setting
    # is that a malformed, wide CSV would potentially parse into a single
    # large field rather than giving this error, but we also think the
    # chances of that are very small and at any rate the source data would
    # need to be fixed. The other consequence of this could be larger
    # memory consumption but that's acceptable as well.
    csv.field_size_limit(sys.maxsize)
    rows = []
    if s3_path.endswith('zip'):
        with io.BytesIO(s3_file_handle.read()) as tf:
            tf.seek(0)

            # Read the file as a zipfile and process the members
            with zipfile.ZipFile(tf, mode='r') as zipf:
                for subfile in zipf.namelist():
                    with zipf.open(subfile) as myfile:
                        iterator = singer_encodings_csv.get_row_iterator(
                            myfile, table_spec)
                        # Accumulate rows from every member instead of
                        # keeping only the last one
                        rows.extend(iterator)

    else:
        iterator = singer_encodings_csv.get_row_iterator(
            s3_file_handle._raw_stream, table_spec)  #pylint:disable=protected-access
        rows = list(iterator)

    records_synced = 0
    for row in rows:

        custom_columns = {
            s3.SDC_SOURCE_BUCKET_COLUMN: bucket,
            s3.SDC_SOURCE_FILE_COLUMN: s3_path,

            # index zero, +1 for header row
            s3.SDC_SOURCE_LINENO_COLUMN: records_synced + 2
        }
        rec = {**row, **custom_columns}

        with Transformer() as transformer:
            to_write = transformer.transform(
                rec, stream['schema'], metadata.to_map(stream['metadata']))
            if "preprocess" in config and config['preprocess'] != '':
                preprocess_items = json.loads(config['preprocess'])
                for preprocess in preprocess_items:
                    if table_name == preprocess['table_name']:
                        for value in preprocess['values']:
                            to_get, to_del = value.split("|")[:2]
                            if to_get in rec:
                                if to_del in rec:
                                    if rec[to_get] == rec[to_del]:
                                        if to_del in to_write:
                                            del to_write[to_del]
                                    else:
                                        LOGGER.warning(
                                            'removing record: %s %s and %s are not equal',
                                            json.dumps(rec), to_get, to_del)

                            elif to_del in rec:
                                to_write[to_get] = rec[to_del]
                                if to_del in to_write:
                                    del to_write[to_del]
                            else:
                                to_write[to_get] = ""

        to_write['last_modified'] = str(modified)
        singer.write_record(table_name, to_write)
        records_synced += 1

    return records_synced
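The shape of the preprocess setting is only implied by the loop above, so the value below is a hypothetical example that would satisfy it: a JSON-encoded list of per-table rules whose entries are "column_to_keep|column_to_drop" pairs. The table and column names are invented.

import json

config = {
    # ...other tap settings...
    "preprocess": json.dumps([
        {
            "table_name": "orders",
            "values": ["customer_id|legacy_customer_id"],
        }
    ]),
}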