Example #1
def sync_csv_file(config, file_handle, s3_path, table_spec, stream):
    LOGGER.info('Syncing file "%s".', s3_path)

    bucket = config['bucket']
    table_name = table_spec['table_name']

    # We observed data whose field size exceeded the default maximum of
    # 131072. We believe the primary consequence of the following setting
    # is that a malformed, wide CSV would potentially parse into a single
    # large field rather than giving this error, but we also think the
    # chances of that are very small and at any rate the source data would
    # need to be fixed. The other consequence of this could be larger
    # memory consumption but that's acceptable as well.
    csv.field_size_limit(sys.maxsize)

    if "properties" in stream["schema"]:
        iterator = csv_helper.get_row_iterator(
            file_handle, table_spec, stream["schema"]["properties"].keys(), True)
    else:
        iterator = csv_helper.get_row_iterator(file_handle, table_spec, None, True)

    records_synced = 0

    if iterator:
        for row in iterator:

            # Skip empty lines in the CSV
            if len(row) == 0:
                continue

            custom_columns = {
                s3.SDC_SOURCE_BUCKET_COLUMN: bucket,
                s3.SDC_SOURCE_FILE_COLUMN: s3_path,

                # index zero, +1 for header row
                s3.SDC_SOURCE_LINENO_COLUMN: records_synced + 2
            }
            rec = {**row, **custom_columns}

            with Transformer() as transformer:
                to_write = transformer.transform(rec, stream['schema'], metadata.to_map(stream['metadata']))

            singer.write_record(table_name, to_write)
            records_synced += 1
    else:
        LOGGER.warning('Skipping "%s" file as it is empty', s3_path)
        s3.skipped_files_count = s3.skipped_files_count + 1

    return records_synced
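
The field_size_limit comment above can be reproduced with the standard library alone; a minimal stand-alone sketch (independent of the tap, using a made-up 200,000-character field):

import csv
import io
import sys

# A field longer than the default 131072-character cap makes the csv module
# raise "field larger than field limit"; raising the limit, as the tap does,
# lets the row parse at the cost of potentially larger memory use.
data = io.StringIO("columnA,columnB\n" + "x" * 200_000 + ",2\n")

try:
    list(csv.DictReader(data))
except csv.Error as err:
    print(err)                      # field larger than field limit (131072)

data.seek(0)
csv.field_size_limit(sys.maxsize)   # the same call used in sync_csv_file
rows = list(csv.DictReader(data))
print(len(rows[0]["columnA"]))      # 200000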
Example #2
    def test(self):
        options = {'quoting': 'MINIMAL', 'delimiter': '\t'}
        row_iterator = csv.get_row_iterator(self.csv_data, options)
        rows = [r for r in row_iterator]
        # with csv.QUOTE_MINIMAL, DictReader treats all lines within a quote pair
        # as a single record
        self.assertEqual(len(rows), 1)
Example #3
def sample_file(config, table_spec, s3_path, sample_rate, max_records):
    LOGGER.info('Sampling %s (%s records, every %sth record).', s3_path,
                max_records, sample_rate)

    samples = []

    file_handle = get_file_handle(config, s3_path)
    iterator = csv.get_row_iterator(file_handle._raw_stream, table_spec)  #pylint:disable=protected-access

    current_row = 0

    for row in iterator:
        if (current_row % sample_rate) == 0:
            if row.get(csv.SDC_EXTRA_COLUMN):
                row.pop(csv.SDC_EXTRA_COLUMN)
            samples.append(row)

        current_row += 1

        if len(samples) >= max_records:
            break

    LOGGER.info('Sampled %s records.', len(samples))

    return samples
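
The sampling loop above is easy to exercise in isolation; a stand-alone sketch with plain dict rows (take_samples and its input are illustrative stand-ins, not part of the tap):

SDC_EXTRA_COLUMN = '_sdc_extra'

def take_samples(rows, sample_rate, max_records):
    samples = []
    for current_row, row in enumerate(rows):
        if current_row % sample_rate == 0:
            row.pop(SDC_EXTRA_COLUMN, None)   # overflow values are not useful for discovery
            samples.append(row)
        if len(samples) >= max_records:
            break
    return samples

rows = [{'id': str(i)} for i in range(100)]
print(len(take_samples(rows, sample_rate=10, max_records=5)))  # 5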
Example #4
    def test_csv_records(self):
        table_spec = {}
        file_handle = [
            b"columnA,columnB,columnC", b"1,2,3", b"1,2,3", b"1,2,3", b"1,2,3",
            b"1,2,3", b"4,5,6"
        ]
        s3_path = "unittest_compressed_files/sample.csv"

        iterator = csv.get_row_iterator(file_handle, table_spec)

        expected_output = [{
            "columnA": "1",
            "columnB": "2",
            "columnC": "3"
        }, {
            "columnA": "4",
            "columnB": "5",
            "columnC": "6"
        }]

        actual_output = [
            record for record in s3.get_records_for_csv(s3_path, 5, iterator)
        ]

        self.assertEqual(expected_output, actual_output)
Example #5
def sync_table_file(config, s3_path, stream):
    LOGGER.info('Syncing file "%s".', s3_path)

    bucket = config['bucket']
    table_name = stream['tap_stream_id']

    s3_file_handle = s3.get_file_handle(config, s3_path)
    iterator = csv.get_row_iterator(s3_file_handle._raw_stream)

    records_synced = 0

    for row in iterator:
        custom_columns = {
            '_sdc_source_bucket': bucket,
            '_sdc_source_file': s3_path,

            # index zero, +1 for header row
            '_sdc_source_lineno': records_synced + 2
        }
        rec = {**row, **custom_columns}

        with Transformer() as transformer:
            to_write = transformer.transform(
                rec, stream['schema'], metadata.to_map(stream['metadata']))

        singer.write_record(table_name, to_write)
        records_synced += 1

    return records_synced
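
The recurring "# index zero, +1 for header row" comment is easier to see with concrete values; a tiny stand-alone illustration (the rows here are made up):

rows = [{'columnA': 'a'}, {'columnA': 'b'}]
for records_synced, row in enumerate(rows):
    # the loop index is zero-based and line 1 of the file is the header,
    # so the first data row lives on source line 2
    rec = {**row, '_sdc_source_lineno': records_synced + 2}
    print(rec)
# {'columnA': 'a', '_sdc_source_lineno': 2}
# {'columnA': 'b', '_sdc_source_lineno': 3}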
Example #6
def sample_file(conn, table_name, f, sample_rate, max_records):
    plurality = "s" if sample_rate != 1 else ""
    LOGGER.info('Sampling %s (%s records, every %s record%s).', f['filepath'],
                max_records, sample_rate, plurality)

    samples = []

    file_handle = conn.get_file_handle(f)

    raw_stream = sftp.RawStream(file_handle)
    iterator = csv.get_row_iterator(raw_stream)

    current_row = 0

    for row in iterator:
        if (current_row % sample_rate) == 0:
            if row.get(csv.SDC_EXTRA_COLUMN):
                row.pop(csv.SDC_EXTRA_COLUMN)
            samples.append(row)

        current_row += 1

        if len(samples) >= max_records:
            break

    LOGGER.info('Sampled %s records.', len(samples))

    # Empty sample to show field selection, if needed
    empty_file = False
    if len(samples) == 0:
        empty_file = True
        samples.append({name: None for name in iterator.fieldnames})

    return (empty_file, samples)
Example #7
def sync_table_file(conn, f, stream):
    LOGGER.info('Syncing file "%s".', f["filepath"])

    table_name = stream.tap_stream_id

    file_handle = conn.get_file_handle(f)
    raw_stream = sftp.RawStream(file_handle)
    iterator = csv.get_row_iterator(raw_stream)

    records_synced = 0

    for row in iterator:
        custom_columns = {
            '_sdc_source_file': f["filepath"],

            # index zero, +1 for header row
            '_sdc_source_lineno': records_synced + 2
        }
        rec = {**row, **custom_columns}

        with Transformer() as transformer:
            to_write = transformer.transform(rec, stream.schema.to_dict(),
                                             metadata.to_map(stream.metadata))

        singer.write_record(table_name, to_write)
        records_synced += 1

    return records_synced
Example #8
def sample_file(config: Dict, table_spec: Dict, s3_path: str,
                sample_rate: int) -> Generator:
    """
    Get a sample data from the given S3 file
    :param config:
    :param table_spec:
    :param s3_path:
    :param sample_rate:
    :return: generator containing the samples as dictionaries
    """
    file_handle = get_file_handle(config, s3_path)
    # _raw_stream seems like the wrong way to access this..
    iterator = get_row_iterator(file_handle._raw_stream, table_spec)  # pylint:disable=protected-access

    current_row = 0

    sampled_row_count = 0

    for row in iterator:
        if (current_row % sample_rate) == 0:
            if row.get(SDC_EXTRA_COLUMN):
                row.pop(SDC_EXTRA_COLUMN)
            sampled_row_count += 1
            if (sampled_row_count % 200) == 0:
                LOGGER.info("Sampled %s rows from %s", sampled_row_count,
                            s3_path)
            yield row

        current_row += 1

    LOGGER.info("Sampled %s rows from %s", sampled_row_count, s3_path)
Example #9
def sync_table_file(config, s3_path, table_spec, stream):
    LOGGER.info('Syncing file "%s".', s3_path)

    bucket = config['bucket']
    table_name = table_spec['table_name']

    s3_file_handle = s3.get_file_handle(config, s3_path)
    iterator = csv.get_row_iterator(s3_file_handle._raw_stream, table_spec)  #pylint:disable=protected-access

    records_synced = 0

    for row in iterator:
        custom_columns = {
            s3.SDC_SOURCE_BUCKET_COLUMN:
            bucket,
            s3.SDC_SOURCE_FILE_COLUMN:
            s3_path,

            # index zero, +1 for header row
            s3.SDC_SOURCE_LINENO_COLUMN:
            records_synced + 2
        }
        rec = {**row, **custom_columns}

        with Transformer() as transformer:
            to_write = transformer.transform(
                rec, stream['schema'], metadata.to_map(stream['metadata']))

        singer.write_record(table_name, to_write)
        records_synced += 1

    return records_synced
Example #10
    def test(self):
        row_iterator = csv.get_row_iterator(
            self.csv_data, options={'key_properties': ['columnA']})
        rows = [r for r in row_iterator]
        self.assertEqual(rows[0]['columnA'], '1')

        with self.assertRaises(Exception):
            row_iterator = csv.get_row_iterator(
                self.csv_data, options={'key_properties': ['fizz']})

        row_iterator = csv.get_row_iterator(
            self.csv_data, options={'date_overrides': ['columnA']})
        rows = [r for r in row_iterator]
        self.assertEqual(rows[0]['columnA'], '1')

        with self.assertRaises(Exception):
            row_iterator = csv.get_row_iterator(
                self.csv_data, options={'date_overrides': ['fizz']})
Example #11
    def _get_file_records(
        self, s3_path: str, table_spec: Dict, records: List[Dict], headers: Set
    ) -> None:
        """
        Reads the file in s3_path and inserts the rows in records
        :param s3_path: full path of file in S3 bucket
        :param table_spec: dict of table with its specs
        :param records: list into which to insert the rows from file
        :param headers: set to update with any new column names
        :return: None
        """
        bucket = self.connection_config['bucket']

        s3_file_handle = S3Helper.get_file_handle(self.connection_config, s3_path)

        # We observed data whose field size exceeded the default maximum of
        # 131072. We believe the primary consequence of the following setting
        # is that a malformed, wide CSV would potentially parse into a single
        # large field rather than giving this error, but we also think the
        # chances of that are very small and at any rate the source data would
        # need to be fixed. The other consequence of this could be larger
        # memory consumption but that's acceptable as well.
        csv.field_size_limit(sys.maxsize)

        # pylint:disable=protected-access
        iterator = singer_encodings_csv.get_row_iterator(
            s3_file_handle._raw_stream, table_spec
        )

        records_copied = len(records)

        for row in iterator:
            now_datetime = datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S.%f')
            custom_columns = {
                S3Helper.SDC_SOURCE_BUCKET_COLUMN: bucket,
                S3Helper.SDC_SOURCE_FILE_COLUMN: s3_path,
                S3Helper.SDC_SOURCE_LINENO_COLUMN: records_copied + 1,
                '_SDC_EXTRACTED_AT': now_datetime,
                '_SDC_BATCHED_AT': now_datetime,
                '_SDC_DELETED_AT': None,
            }

            new_row = {}

            # make all columns safe
            # pylint: disable=invalid-name
            for k, v in row.items():
                new_row[safe_column_name(k, self.target_quote)] = v

            record = {**new_row, **custom_columns}

            records.append(record)
            headers.update(record.keys())

            records_copied += 1
Example #12
def sample_file(config, table_spec, s3_path, sample_rate):
    file_handle = get_file_handle(config, s3_path)
    if s3_path.endswith('zip'):
        with io.BytesIO(file_handle.read()) as tf:
            if tf is not None:
                tf.seek(0)

            # Read the file as a zipfile and process the members
            with zipfile.ZipFile(tf, mode='r') as zipf:
                for subfile in zipf.namelist():
                    if "MAC" not in subfile:
                        with zipf.open(subfile) as myfile:
                            iterator = csv_singer.get_row_iterator(
                                myfile, table_spec)
                            rows = list(iterator)
                            longitud = len(rows)
    else:
        iterator = csv_singer.get_row_iterator(file_handle._raw_stream,
                                               table_spec)  #pylint:disable=protected-access
        rows = list(iterator)
        longitud = len(rows)

    current_row = 0

    sampled_row_count = 0

    for row in rows:
        if (current_row % sample_rate) == 0:
            if row.get(csv_singer.SDC_EXTRA_COLUMN):
                row.pop(csv_singer.SDC_EXTRA_COLUMN)
            sampled_row_count += 1
            if (sampled_row_count % 200) == 0:
                LOGGER.info("Sampled %s rows from %s", sampled_row_count,
                            s3_path)
            yield row

        current_row += 1

    LOGGER.info("Sampled %s rows from %s", sampled_row_count, s3_path)
Example #13
    def test(self):
        row_iterator = csv.get_row_iterator(self.csv_data, options={'key_properties': ['columnA']})
        rows = [r for r in row_iterator]
        self.assertEqual(rows[0]['columnA'], '1')

        with self.assertRaises(Exception) as ctx:
            csv.get_row_iterator(self.csv_data, options={'key_properties': ['fizz']})
        self.assertEqual("CSV file missing required headers: {'fizz'}", str(ctx.exception))

        row_iterator = csv.get_row_iterator(self.csv_data, options={'date_overrides': ['columnA']})
        rows = [r for r in row_iterator]
        self.assertEqual(rows[0]['columnA'], '1')

        with self.assertRaises(Exception) as ctx:
            csv.get_row_iterator(self.csv_data, options={'date_overrides': ['fizz']})
        self.assertEqual("CSV file missing date_overrides headers: {'fizz'}", str(ctx.exception))
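
The error messages asserted above come down to a set difference between the configured columns and the CSV header row; an illustrative stand-alone sketch (not the library's actual implementation):

fieldnames = {'columnA', 'columnB'}
key_properties = {'fizz'}

missing = key_properties - fieldnames
if missing:
    raise Exception(f"CSV file missing required headers: {missing}")
# Exception: CSV file missing required headers: {'fizz'}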
Example #14
def sync_table_file(config: Dict, s3_path: str, table_spec: Dict, stream: Dict) -> int:
    """
    Sync a given CSV file found in S3
    :param config: tap configuration
    :param s3_path: file path given by S3
    :param table_spec: tables specs
    :param stream: Stream data
    :return: number of streamed records
    """
    LOGGER.info('Syncing file "%s".', s3_path)

    bucket = config['bucket']
    sync_one_one = config.get('sync_one_one', "True")
    if sync_one_one is True or str(sync_one_one).lower() == "true":
        sync_one_one = True
    elif sync_one_one is False or str(sync_one_one).lower() == "false":
        sync_one_one = False
    else:
        raise Exception("Don't understand sync_one_one param in config, must be boolean")
    table_name = table_spec['table_name']
    s3_file_handle, tags = s3.get_file_handle_custom(config, s3_path)
    # We observed data whose field size exceeded the default maximum of
    # 131072. We believe the primary consequence of the following setting
    # is that a malformed, wide CSV would potentially parse into a single
    # large field rather than giving this error, but we also think the
    # chances of that are very small and at any rate the source data would
    # need to be fixed. The other consequence of this could be larger
    # memory consumption but that's acceptable as well.
    csv.field_size_limit(sys.maxsize)
    iterator = get_row_iterator(s3_file_handle._raw_stream, table_spec)  # pylint:disable=protected-access

    records_synced = 0
    for row in iterator:
        if not sync_one_one:
            custom_columns = {
                s3.SDC_SOURCE_BUCKET_COLUMN: bucket,
                s3.SDC_SOURCE_FILE_COLUMN: s3_path,

                # index zero, +1 for header row
                s3.SDC_SOURCE_LINENO_COLUMN: records_synced + 2
            }
            rec = {**row, **custom_columns}
            with Transformer() as transformer:
                to_write = transformer.transform(rec, stream['schema'], metadata.to_map(stream['metadata']))
            write_record(table_name, to_write)
        if sync_one_one:
            write_message(
                OneOneMessage(table_name, row, TagSet=tags, sync_one_one=sync_one_one, _sdc_source_file=s3_path))

        records_synced += 1

    return records_synced
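
A one-liner shows why the sync_one_one flag above has to be compared against the strings explicitly rather than relying on truthiness:

# Any non-empty string is truthy, so a bare `if sync_one_one:` would treat
# the string "False" as enabled.
print(bool("False"))  # True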
Example #15
def sample_file(table_spec, s3_path, file_handle, sample_rate, extension):
    global skipped_files_count

    # Check whether file is without extension or not
    if not extension or s3_path.lower() == extension:
        LOGGER.warning('"%s" without extension will not be sampled.', s3_path)
        skipped_files_count = skipped_files_count + 1
        return []
    if extension in ["csv", "txt"]:
        # Use the raw stream if the object came straight from the S3 bucket;
        # otherwise it is a file object already extracted from a zip or gz archive
        file_handle = file_handle._raw_stream if hasattr(
            file_handle, "_raw_stream") else file_handle  #pylint:disable=protected-access
        iterator = csv.get_row_iterator(file_handle, table_spec, None, True)
        csv_records = []
        if iterator:
            csv_records = get_records_for_csv(s3_path, sample_rate, iterator)
        else:
            LOGGER.warning('Skipping "%s" file as it is empty', s3_path)
            skipped_files_count = skipped_files_count + 1
        return csv_records
    if extension == "gz":
        return sampling_gz_file(table_spec, s3_path, file_handle, sample_rate)
    if extension == "jsonl":
        # Use the raw stream if the object came straight from the S3 bucket;
        # otherwise it is a file object already extracted from a zip or gz archive
        file_handle = file_handle._raw_stream if hasattr(
            file_handle, "_raw_stream") else file_handle
        records = get_records_for_jsonl(s3_path, sample_rate, file_handle)
        check_jsonl_sample_records, records = itertools.tee(records)
        jsonl_sample_records = list(check_jsonl_sample_records)
        if len(jsonl_sample_records) == 0:
            LOGGER.warning('Skipping "%s" file as it is empty', s3_path)
            skipped_files_count = skipped_files_count + 1
        check_key_properties_and_date_overrides_for_jsonl_file(
            table_spec, jsonl_sample_records, s3_path)

        return records
    if extension == "zip":
        LOGGER.warning('Skipping "%s" file as it contains nested compression.',
                       s3_path)
        skipped_files_count = skipped_files_count + 1
        return []
    LOGGER.warning('"%s" having the ".%s" extension will not be sampled.',
                   s3_path, extension)
    skipped_files_count = skipped_files_count + 1
    return []
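
The itertools.tee call above lets the sampler inspect a generator's contents (for the emptiness check and the key/date-override validation) while still handing the full record stream back to the caller; a minimal stand-alone sketch:

import itertools

def records():
    yield {'id': 1}
    yield {'id': 2}

peek, stream = itertools.tee(records())
sample = list(peek)      # materialise one copy for validation
print(len(sample))       # 2
print(list(stream))      # the second copy is still fully consumable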
Example #16
def sync_table_file(config: Dict, s3_path: str, table_spec: Dict,
                    stream: Dict) -> int:
    """
    Sync a given CSV file found in S3
    :param config: tap configuration
    :param s3_path: file path given by S3
    :param table_spec: tables specs
    :param stream: Stream data
    :return: number of streamed records
    """
    LOGGER.info('Syncing file "%s".', s3_path)

    bucket = config['bucket']
    table_name = table_spec['table_name']

    s3_file_handle = s3.get_file_handle(config, s3_path)
    iterator = get_row_iterator(s3_file_handle._raw_stream, table_spec)  # pylint:disable=protected-access

    records_synced = 0

    for row in iterator:
        custom_columns = {
            s3.SDC_SOURCE_BUCKET_COLUMN:
            bucket,
            s3.SDC_SOURCE_FILE_COLUMN:
            s3_path,

            # index zero, +1 for header row
            s3.SDC_SOURCE_LINENO_COLUMN:
            records_synced + 2
        }
        rec = {**row, **custom_columns}

        with Transformer() as transformer:
            to_write = transformer.transform(
                rec, stream['schema'], metadata.to_map(stream['metadata']))

        write_record(table_name, to_write)
        records_synced += 1

    return records_synced
Example #17
def sync_table_file(config, s3_path, table_spec, stream):
    LOGGER.info('Syncing file "%s".', s3_path)

    bucket = config['bucket']
    table_name = table_spec['table_name']

    s3_file_handle = s3.get_file_handle(config, s3_path)
    # We observed data whose field size exceeded the default maximum of
    # 131072. We believe the primary consequence of the following setting
    # is that a malformed, wide CSV would potentially parse into a single
    # large field rather than giving this error, but we also think the
    # chances of that are very small and at any rate the source data would
    # need to be fixed. The other consequence of this could be larger
    # memory consumption but that's acceptable as well.
    csv.field_size_limit(sys.maxsize)
    iterator = singer_encodings_csv.get_row_iterator(
        s3_file_handle._raw_stream, table_spec)  #pylint:disable=protected-access

    records_synced = 0

    for row in iterator:
        custom_columns = {
            s3.SDC_SOURCE_BUCKET_COLUMN:
            bucket,
            s3.SDC_SOURCE_FILE_COLUMN:
            s3_path,

            # index zero, +1 for header row
            s3.SDC_SOURCE_LINENO_COLUMN:
            records_synced + 2
        }
        rec = {**row, **custom_columns}

        with Transformer() as transformer:
            to_write = transformer.transform(
                rec, stream['schema'], metadata.to_map(stream['metadata']))

        singer.write_record(table_name, to_write)
        records_synced += 1

    return records_synced
Example #18
def sample_file(config, table_spec, s3_path, sample_rate):
    file_handle = get_file_handle(config, s3_path)
    iterator = csv.get_row_iterator(file_handle._raw_stream, table_spec)  #pylint:disable=protected-access

    current_row = 0

    sampled_row_count = 0

    for row in iterator:
        if (current_row % sample_rate) == 0:
            if row.get(csv.SDC_EXTRA_COLUMN):
                row.pop(csv.SDC_EXTRA_COLUMN)
            sampled_row_count += 1
            if (sampled_row_count % 200) == 0:
                LOGGER.info("Sampled %s rows from %s", sampled_row_count,
                            s3_path)
            yield row

        current_row += 1

    LOGGER.info("Sampled %s rows from %s", sampled_row_count, s3_path)
Example #19
        def get_rows():
            LOGGER.info("Fetching rows from path: %s", s3_path)

            with tempfile.NamedTemporaryFile(mode='w+b', suffix=".csv.gz") as tmpfile:
                s3_file_handle = S3Helper.get_file_handle(self.connection_config, s3_path)
                gzip_file = gzip.GzipFile(mode='wb', fileobj=tmpfile)
                shutil.copyfileobj(s3_file_handle, gzip_file)
                gzip_file.close()
                s3_file_handle.close()

                LOGGER.info("Downloaded %s", s3_path)

                tmpfile.seek(0)
                gzip_file = gzip.GzipFile(mode='rb', fileobj=tmpfile)
                # pylint:disable=protected-access
                row_iterator = singer_encodings_csv.get_row_iterator(gzip_file, table_spec)

                for row in row_iterator:
                    now_datetime = datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S.%f')
                    custom_columns = {
                        S3Helper.SDC_SOURCE_BUCKET_COLUMN: bucket,
                        S3Helper.SDC_SOURCE_FILE_COLUMN: s3_path,
                        S3Helper.SDC_SOURCE_LINENO_COLUMN: next(count),
                        '_SDC_EXTRACTED_AT': now_datetime,
                        '_SDC_BATCHED_AT': now_datetime,
                        '_SDC_DELETED_AT': None
                    }

                    new_row = {}

                    # make all columns safe
                    # pylint: disable=invalid-name
                    for k, v in row.items():
                        new_row[safe_column_name(k)] = v

                    yield {**new_row, **custom_columns}
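
The temp-file trick in get_rows (compress the downloaded stream to disk, then read it back decompressed) round-trips cleanly with only the standard library; a self-contained sketch where source stands in for the S3 handle:

import gzip
import io
import shutil
import tempfile

source = io.BytesIO(b"columnA,columnB\n1,2\n")   # stand-in for the S3 file handle

with tempfile.NamedTemporaryFile(mode='w+b', suffix=".csv.gz") as tmpfile:
    with gzip.GzipFile(mode='wb', fileobj=tmpfile) as gz_out:
        shutil.copyfileobj(source, gz_out)       # spool to disk compressed

    tmpfile.seek(0)
    with gzip.GzipFile(mode='rb', fileobj=tmpfile) as gz_in:
        print(gz_in.read())                      # b'columnA,columnB\n1,2\n'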
Example #20
    def test(self):
        row_iterator = csv.get_row_iterator(self.csv_data, None, None, True)
        rows = [r for r in row_iterator]
        self.assertEqual(rows[0]['_sdc_extra'], [{"no_headers": ["4"]}])
Example #21
    def test(self):
        row_iterator = csv.get_row_iterator(self.csv_data)
        rows = [r for r in row_iterator]
        self.assertEqual(rows[0]['columnB'], '2')
Example #22
    def test(self):
        row_iterator = csv.get_row_iterator(self.csv_data)
        self.assertEqual(row_iterator.fieldnames, ["columnA", "columnB"])
Example #23
    def test(self):
        row_iterator = csv.get_row_iterator([])
        self.assertEqual(row_iterator.fieldnames, None)
Example #24
def sync_table_file(config, s3_path, table_spec, stream, modified):
    LOGGER.info('Syncing file "%s".', s3_path)

    bucket = config['bucket']
    table_name = table_spec['table_name']

    s3_file_handle = s3.get_file_handle(config, s3_path)
    # We observed data whose field size exceeded the default maximum of
    # 131072. We believe the primary consequence of the following setting
    # is that a malformed, wide CSV would potentially parse into a single
    # large field rather than giving this error, but we also think the
    # chances of that are very small and at any rate the source data would
    # need to be fixed. The other consequence of this could be larger
    # memory consumption but that's acceptable as well.
    csv.field_size_limit(sys.maxsize)
    longitud = 0
    if s3_path.endswith('zip'):
        with io.BytesIO(s3_file_handle.read()) as tf:
            if tf is not None:
                tf.seek(0)

            # Read the file as a zipfile and process the members
            with zipfile.ZipFile(tf, mode='r') as zipf:
                for subfile in zipf.namelist():
                    with zipf.open(subfile) as myfile:
                        iterator = singer_encodings_csv.get_row_iterator(
                            myfile, table_spec)
                        rows = list(iterator)
                        longitud = len(rows)

    else:
        iterator = singer_encodings_csv.get_row_iterator(
            s3_file_handle._raw_stream, table_spec)  #pylint:disable=protected-access
        rows = list(iterator)
        longitud = len(rows)

    records_synced = 0
    current_row = 0
    for row in rows:

        custom_columns = {
            s3.SDC_SOURCE_BUCKET_COLUMN:
            bucket,
            s3.SDC_SOURCE_FILE_COLUMN:
            s3_path,

            # index zero, +1 for header row
            s3.SDC_SOURCE_LINENO_COLUMN:
            records_synced + 2
        }
        rec = {**row, **custom_columns}

        with Transformer() as transformer:
            to_write = transformer.transform(
                rec, stream['schema'], metadata.to_map(stream['metadata']))
            if "preprocess" in config and config['preprocess'] != '':
                preprocess_items = json.loads(config['preprocess'])
                for i in preprocess_items:
                    preprocess = i
                    if (table_name == preprocess['table_name']):
                        for value in preprocess['values']:
                            to_get = value.split("|")[0]
                            to_del = value.split("|")[1]
                            if to_get in rec:
                                if to_del in rec:
                                    if rec[to_get] == rec[to_del]:
                                        if to_del in to_write:
                                            del to_write[to_del]
                                    else:
                                        LOGGER.warning('removing record: ' +
                                                       json.dumps(rec) + ' ' +
                                                       to_get + ' and ' +
                                                       to_del +
                                                       ' are not equals')

                            elif to_del in rec:
                                to_write[to_get] = rec[to_del]
                                if to_del in to_write:
                                    del to_write[to_del]
                            else:
                                to_write[to_get] = ""

        to_write['last_modified'] = str(modified)
        singer.write_record(table_name, to_write)
        records_synced += 1
        current_row += 1

    return records_synced
Example #25
    def test(self):
        row_iterator = csv.get_row_iterator(self.csv_data, None, None, True)
        rows = [r for r in row_iterator]
        self.assertEqual(rows[0]['_sdc_extra'], [{"columnB": "4"}, {"columnC": ["5", "6"]}])
        self.assertEqual(list(rows[0].keys()), ["columnA", "columnB", "columnC", "_sdc_extra"])
Example #26
    def test(self):
        row_iterator = csv.get_row_iterator(self.csv_data, None, None, True)
        rows = [r for r in row_iterator]
        self.assertEqual(list(rows[0].keys()), ["columnA", "columnB"])
Example #27
    def test(self):
        row_iterator = csv.get_row_iterator(self.csv_data)
        rows = [r for r in row_iterator]
        self.assertEqual(rows[0]['_sdc_extra'], ['4'])
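
The '_sdc_extra' value asserted above matches csv.DictReader's restkey behaviour, which is presumably what the helper builds on; a stdlib-only sketch:

import csv

lines = ['columnA,columnB', '1,2,4']              # one more value than there are headers
rows = list(csv.DictReader(lines, restkey='_sdc_extra'))
print(rows[0]['_sdc_extra'])                      # ['4']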
Example #28
    def test(self, mocked_logger_warn):
        row_iterator = csv.get_row_iterator(self.csv_data, None, None, True)
        rows = [r for r in row_iterator]
        self.assertEqual(list(rows[0].keys()), ["columnA", "columnB", "columnC"])

        mocked_logger_warn.assert_called_with('Duplicate Header(s) %s found in the csv and its value will be stored in the \"_sdc_extra\" field.', {'columnC'})
Example #29
    def test_get_row_iterator_return_none_for_empty_csv(self, mocked_logger_warn):
        row_iterator = csv.get_row_iterator([], None, None, True)
        self.assertEqual(row_iterator, None)
Example #30
    def test(self):
        options = {'quoting': 'NONE', 'delimiter': '\t'}
        row_iterator = csv.get_row_iterator(self.csv_data, options)
        rows = [r for r in row_iterator]
        # with csv.QUOTE_NONE, lines that span a quote pair are parsed individually
        self.assertEqual(len(rows), 2)
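
The two quoting tests (Examples #2 and #30) can be reproduced with the standard library alone, assuming the options map onto csv.DictReader's quoting and delimiter parameters; a minimal sketch:

import csv

lines = ['columnA\tcolumnB', '"1\t2', '3"\t4']    # one quote pair spans two physical lines

minimal = list(csv.DictReader(lines, delimiter='\t', quoting=csv.QUOTE_MINIMAL))
no_quotes = list(csv.DictReader(lines, delimiter='\t', quoting=csv.QUOTE_NONE))

print(len(minimal))    # 1 -- the quote pair joins both physical lines into one record
print(len(no_quotes))  # 2 -- quotes are literal characters, so two records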