def sync_csv_file(config, file_handle, s3_path, table_spec, stream):
    LOGGER.info('Syncing file "%s".', s3_path)

    bucket = config['bucket']
    table_name = table_spec['table_name']

    # We observed data whose field size exceeded the default maximum of
    # 131072. We believe the primary consequence of the following setting
    # is that a malformed, wide CSV would potentially parse into a single
    # large field rather than giving this error, but we also think the
    # chances of that are very small and at any rate the source data would
    # need to be fixed. The other consequence of this could be larger
    # memory consumption but that's acceptable as well.
    csv.field_size_limit(sys.maxsize)

    if "properties" in stream["schema"]:
        iterator = csv_helper.get_row_iterator(
            file_handle, table_spec, stream["schema"]["properties"].keys(), True)
    else:
        iterator = csv_helper.get_row_iterator(file_handle, table_spec, None, True)

    records_synced = 0

    if iterator:
        for row in iterator:
            # Skip empty lines of the CSV
            if len(row) == 0:
                continue

            custom_columns = {
                s3.SDC_SOURCE_BUCKET_COLUMN: bucket,
                s3.SDC_SOURCE_FILE_COLUMN: s3_path,
                # index zero, +1 for header row
                s3.SDC_SOURCE_LINENO_COLUMN: records_synced + 2
            }
            rec = {**row, **custom_columns}

            with Transformer() as transformer:
                to_write = transformer.transform(rec, stream['schema'], metadata.to_map(stream['metadata']))

            singer.write_record(table_name, to_write)
            records_synced += 1
    else:
        LOGGER.warning('Skipping "%s" file as it is empty', s3_path)
        s3.skipped_files_count = s3.skipped_files_count + 1

    return records_synced
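# Hedged usage sketch for sync_csv_file above. The config keys, table_spec
# fields, and stream/catalog shape shown here are inferred from what the
# function actually reads; they are assumptions, not a documented interface.
def example_sync_one_file(s3_path, file_handle):
    config = {'bucket': 'my-example-bucket'}      # only 'bucket' is read above
    table_spec = {'table_name': 'orders'}         # only 'table_name' is read above
    stream = {                                    # minimal Singer catalog entry
        'schema': {'properties': {'id': {'type': ['null', 'string']}}},
        'metadata': [],
    }
    return sync_csv_file(config, file_handle, s3_path, table_spec, stream)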
def test(self):
    options = {'quoting': 'MINIMAL', 'delimiter': '\t'}
    row_iterator = csv.get_row_iterator(self.csv_data, options)
    rows = [r for r in row_iterator]

    # if csv.QUOTE_MINIMAL is used, DictReader interprets all lines within a
    # quote-pair as a single line
    self.assertEqual(len(rows), 1)
def sample_file(config, table_spec, s3_path, sample_rate, max_records):
    LOGGER.info('Sampling %s (%s records, every %sth record).',
                s3_path, max_records, sample_rate)

    samples = []

    file_handle = get_file_handle(config, s3_path)
    iterator = csv.get_row_iterator(file_handle._raw_stream, table_spec)  # pylint:disable=protected-access

    current_row = 0
    for row in iterator:
        if (current_row % sample_rate) == 0:
            if row.get(csv.SDC_EXTRA_COLUMN):
                row.pop(csv.SDC_EXTRA_COLUMN)
            samples.append(row)

        current_row += 1

        if len(samples) >= max_records:
            break

    LOGGER.info('Sampled %s records.', len(samples))

    return samples
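# Hedged usage sketch for sample_file above: a discovery step might collect
# the sampled rows and derive the set of column headers from them. The
# sample_rate and max_records values are arbitrary illustrations.
def example_collect_headers(config, table_spec, s3_path):
    samples = sample_file(config, table_spec, s3_path,
                          sample_rate=5, max_records=1000)
    headers = set()
    for row in samples:
        headers.update(row.keys())
    return sorted(headers)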
def test_csv_records(self):
    table_spec = {}
    file_handle = [
        b"columnA,columnB,columnC",
        b"1,2,3",
        b"1,2,3",
        b"1,2,3",
        b"1,2,3",
        b"1,2,3",
        b"4,5,6"
    ]
    s3_path = "unittest_compressed_files/sample.csv"
    iterator = csv.get_row_iterator(file_handle, table_spec)

    expected_output = [
        {"columnA": "1", "columnB": "2", "columnC": "3"},
        {"columnA": "4", "columnB": "5", "columnC": "6"}
    ]

    actual_output = [record for record in s3.get_records_for_csv(s3_path, 5, iterator)]

    self.assertEqual(expected_output, actual_output)
def sync_table_file(config, s3_path, stream):
    LOGGER.info('Syncing file "%s".', s3_path)

    bucket = config['bucket']
    table_name = stream['tap_stream_id']

    s3_file_handle = s3.get_file_handle(config, s3_path)
    iterator = csv.get_row_iterator(s3_file_handle._raw_stream)

    records_synced = 0

    for row in iterator:
        custom_columns = {
            '_sdc_source_bucket': bucket,
            '_sdc_source_file': s3_path,
            # index zero, +1 for header row
            '_sdc_source_lineno': records_synced + 2
        }
        rec = {**row, **custom_columns}

        with Transformer() as transformer:
            to_write = transformer.transform(rec, stream['schema'], metadata.to_map(stream['metadata']))

        singer.write_record(table_name, to_write)
        records_synced += 1

    return records_synced
def sample_file(conn, table_name, f, sample_rate, max_records):
    plurality = "s" if sample_rate != 1 else ""
    LOGGER.info('Sampling %s (%s records, every %s record%s).',
                f['filepath'], max_records, sample_rate, plurality)

    samples = []

    file_handle = conn.get_file_handle(f)
    raw_stream = sftp.RawStream(file_handle)
    iterator = csv.get_row_iterator(raw_stream)

    current_row = 0
    for row in iterator:
        if (current_row % sample_rate) == 0:
            if row.get(csv.SDC_EXTRA_COLUMN):
                row.pop(csv.SDC_EXTRA_COLUMN)
            samples.append(row)

        current_row += 1

        if len(samples) >= max_records:
            break

    LOGGER.info('Sampled %s records.', len(samples))

    # Empty sample to show field selection, if needed
    empty_file = False
    if len(samples) == 0:
        empty_file = True
        samples.append({name: None for name in iterator.fieldnames})

    return (empty_file, samples)
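# Hedged usage sketch for the SFTP sample_file above, showing how a caller
# might unpack the (empty_file, samples) tuple. The connection object and the
# shape of the file dict are assumptions taken from the function body.
def example_sample_sftp_file(conn, table_name, f):
    empty_file, samples = sample_file(conn, table_name, f,
                                      sample_rate=10, max_records=500)
    if empty_file:
        # Only the header names were recovered; the single sample row has all
        # values set to None so field selection can still be offered.
        LOGGER.info('File %s is empty; using a header-only sample.', f['filepath'])
    return samples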
def sync_table_file(conn, f, stream):
    LOGGER.info('Syncing file "%s".', f["filepath"])

    table_name = stream.tap_stream_id

    file_handle = conn.get_file_handle(f)
    raw_stream = sftp.RawStream(file_handle)
    iterator = csv.get_row_iterator(raw_stream)

    records_synced = 0

    for row in iterator:
        custom_columns = {
            '_sdc_source_file': f["filepath"],
            # index zero, +1 for header row
            '_sdc_source_lineno': records_synced + 2
        }
        rec = {**row, **custom_columns}

        with Transformer() as transformer:
            to_write = transformer.transform(rec, stream.schema.to_dict(), metadata.to_map(stream.metadata))

        singer.write_record(table_name, to_write)
        records_synced += 1

    return records_synced
def sample_file(config: Dict, table_spec: Dict, s3_path: str, sample_rate: int) -> Generator:
    """
    Get a sample of the data from the given S3 file
    :param config: tap configuration
    :param table_spec: table specs
    :param s3_path: file path in the S3 bucket
    :param sample_rate: sample every Nth row
    :return: generator containing the samples as dictionaries
    """
    file_handle = get_file_handle(config, s3_path)

    # _raw_stream seems like the wrong way to access this..
    iterator = get_row_iterator(file_handle._raw_stream, table_spec)  # pylint:disable=protected-access

    current_row = 0
    sampled_row_count = 0

    for row in iterator:
        if (current_row % sample_rate) == 0:
            if row.get(SDC_EXTRA_COLUMN):
                row.pop(SDC_EXTRA_COLUMN)

            sampled_row_count += 1

            if (sampled_row_count % 200) == 0:
                LOGGER.info("Sampled %s rows from %s", sampled_row_count, s3_path)

            yield row

        current_row += 1

    LOGGER.info("Sampled %s rows from %s", sampled_row_count, s3_path)
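# Hedged usage sketch for the generator-based sample_file above: because rows
# are yielded lazily, a caller can cap the number of samples without reading
# the whole S3 object. itertools is standard library; the cap of 1000 is an
# arbitrary illustration.
import itertools

def example_take_samples(config, table_spec, s3_path):
    rows = sample_file(config, table_spec, s3_path, sample_rate=5)
    return list(itertools.islice(rows, 1000))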
def sync_table_file(config, s3_path, table_spec, stream):
    LOGGER.info('Syncing file "%s".', s3_path)

    bucket = config['bucket']
    table_name = table_spec['table_name']

    s3_file_handle = s3.get_file_handle(config, s3_path)
    iterator = csv.get_row_iterator(s3_file_handle._raw_stream, table_spec)  # pylint:disable=protected-access

    records_synced = 0

    for row in iterator:
        custom_columns = {
            s3.SDC_SOURCE_BUCKET_COLUMN: bucket,
            s3.SDC_SOURCE_FILE_COLUMN: s3_path,
            # index zero, +1 for header row
            s3.SDC_SOURCE_LINENO_COLUMN: records_synced + 2
        }
        rec = {**row, **custom_columns}

        with Transformer() as transformer:
            to_write = transformer.transform(rec, stream['schema'], metadata.to_map(stream['metadata']))

        singer.write_record(table_name, to_write)
        records_synced += 1

    return records_synced
def test(self):
    row_iterator = csv.get_row_iterator(self.csv_data, options={'key_properties': ['columnA']})
    rows = [r for r in row_iterator]
    self.assertEqual(rows[0]['columnA'], '1')

    with self.assertRaises(Exception):
        row_iterator = csv.get_row_iterator(self.csv_data, options={'key_properties': ['fizz']})

    row_iterator = csv.get_row_iterator(self.csv_data, options={'date_overrides': ['columnA']})
    rows = [r for r in row_iterator]
    self.assertEqual(rows[0]['columnA'], '1')

    with self.assertRaises(Exception):
        row_iterator = csv.get_row_iterator(self.csv_data, options={'date_overrides': ['fizz']})
def _get_file_records(
    self, s3_path: str, table_spec: Dict, records: List[Dict], headers: Set
) -> None:
    """
    Reads the file in s3_path and inserts the rows in records
    :param s3_path: full path of file in S3 bucket
    :param table_spec: dict of table with its specs
    :param records: list into which to insert the rows from file
    :param headers: set to update with any new column names
    :return: None
    """
    bucket = self.connection_config['bucket']

    s3_file_handle = S3Helper.get_file_handle(self.connection_config, s3_path)

    # We observed data whose field size exceeded the default maximum of
    # 131072. We believe the primary consequence of the following setting
    # is that a malformed, wide CSV would potentially parse into a single
    # large field rather than giving this error, but we also think the
    # chances of that are very small and at any rate the source data would
    # need to be fixed. The other consequence of this could be larger
    # memory consumption but that's acceptable as well.
    csv.field_size_limit(sys.maxsize)

    # pylint:disable=protected-access
    iterator = singer_encodings_csv.get_row_iterator(s3_file_handle._raw_stream, table_spec)

    records_copied = len(records)

    for row in iterator:
        now_datetime = datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S.%f')
        custom_columns = {
            S3Helper.SDC_SOURCE_BUCKET_COLUMN: bucket,
            S3Helper.SDC_SOURCE_FILE_COLUMN: s3_path,
            S3Helper.SDC_SOURCE_LINENO_COLUMN: records_copied + 1,
            '_SDC_EXTRACTED_AT': now_datetime,
            '_SDC_BATCHED_AT': now_datetime,
            '_SDC_DELETED_AT': None,
        }

        new_row = {}

        # make all columns safe
        # pylint: disable=invalid-name
        for k, v in row.items():
            new_row[safe_column_name(k, self.target_quote)] = v

        record = {**new_row, **custom_columns}

        records.append(record)
        headers.update(record.keys())

        records_copied += 1
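# Hedged usage sketch for _get_file_records above: the records list and the
# headers set are mutated in place, so they accumulate across files. The tap
# instance and the list of S3 paths are assumptions for illustration.
def example_collect_file_records(tap, table_spec, s3_paths):
    records = []
    headers = set()
    for s3_path in s3_paths:
        # pylint: disable=protected-access
        tap._get_file_records(s3_path, table_spec, records, headers)
    return records, sorted(headers)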
def sample_file(config, table_spec, s3_path, sample_rate):
    file_handle = get_file_handle(config, s3_path)

    if s3_path.endswith('zip'):
        with io.BytesIO(file_handle.read()) as tf:
            if tf is not None:
                tf.seek(0)

                # Read the file as a zipfile and process the members
                with zipfile.ZipFile(tf, mode='r') as zipf:
                    for subfile in zipf.namelist():
                        if "MAC" not in subfile:
                            with zipf.open(subfile) as myfile:
                                iterator = csv_singer.get_row_iterator(myfile, table_spec)
                                rows = list(iterator)
                                longitud = len(rows)
    else:
        iterator = csv_singer.get_row_iterator(file_handle._raw_stream, table_spec)  # pylint:disable=protected-access
        rows = list(iterator)
        longitud = len(rows)

    current_row = 0
    sampled_row_count = 0
    i = 0

    for row in rows:
        if (current_row % sample_rate) == 0:
            if row.get(csv_singer.SDC_EXTRA_COLUMN):
                row.pop(csv_singer.SDC_EXTRA_COLUMN)

            sampled_row_count += 1

            if (sampled_row_count % 200) == 0:
                LOGGER.info("Sampled %s rows from %s", sampled_row_count, s3_path)

            yield row

        if i == longitud:
            continue

        current_row += 1

    LOGGER.info("Sampled %s rows from %s", sampled_row_count, s3_path)
def test(self):
    row_iterator = csv.get_row_iterator(self.csv_data, options={'key_properties': ['columnA']})
    rows = [r for r in row_iterator]
    self.assertEqual(rows[0]['columnA'], '1')

    try:
        row_iterator = csv.get_row_iterator(self.csv_data, options={'key_properties': ['fizz']})
    except Exception as ex:
        expected_message = "CSV file missing required headers: {'fizz'}"
        self.assertEqual(expected_message, str(ex))

    row_iterator = csv.get_row_iterator(self.csv_data, options={'date_overrides': ['columnA']})
    rows = [r for r in row_iterator]
    self.assertEqual(rows[0]['columnA'], '1')

    try:
        row_iterator = csv.get_row_iterator(self.csv_data, options={'date_overrides': ['fizz']})
    except Exception as ex:
        expected_message = "CSV file missing date_overrides headers: {'fizz'}"
        self.assertEqual(expected_message, str(ex))
def sync_table_file(config: Dict, s3_path: str, table_spec: Dict, stream: Dict) -> int:
    """
    Sync a given CSV file found in S3
    :param config: tap configuration
    :param s3_path: file path given by S3
    :param table_spec: tables specs
    :param stream: Stream data
    :return: number of streamed records
    """
    LOGGER.info('Syncing file "%s".', s3_path)

    bucket = config['bucket']

    sync_one_one = config.get('sync_one_one', "True")
    if sync_one_one is True or str(sync_one_one).lower() == "true":
        sync_one_one = True
    elif sync_one_one is False or str(sync_one_one).lower() == "false":
        sync_one_one = False
    else:
        raise Exception("Don't understand sync_one_one param in config, must be boolean")

    table_name = table_spec['table_name']

    s3_file_handle, tags = s3.get_file_handle_custom(config, s3_path)

    # We observed data whose field size exceeded the default maximum of
    # 131072. We believe the primary consequence of the following setting
    # is that a malformed, wide CSV would potentially parse into a single
    # large field rather than giving this error, but we also think the
    # chances of that are very small and at any rate the source data would
    # need to be fixed. The other consequence of this could be larger
    # memory consumption but that's acceptable as well.
    csv.field_size_limit(sys.maxsize)

    iterator = get_row_iterator(s3_file_handle._raw_stream, table_spec)  # pylint:disable=protected-access

    records_synced = 0

    for row in iterator:
        if not sync_one_one:
            custom_columns = {
                s3.SDC_SOURCE_BUCKET_COLUMN: bucket,
                s3.SDC_SOURCE_FILE_COLUMN: s3_path,
                # index zero, +1 for header row
                s3.SDC_SOURCE_LINENO_COLUMN: records_synced + 2
            }
            rec = {**row, **custom_columns}

            with Transformer() as transformer:
                to_write = transformer.transform(rec, stream['schema'], metadata.to_map(stream['metadata']))

            write_record(table_name, to_write)

        if sync_one_one:
            write_message(
                OneOneMessage(table_name, row, TagSet=tags, sync_one_one=sync_one_one, _sdc_source_file=s3_path))

        records_synced += 1

    return records_synced
def sample_file(table_spec, s3_path, file_handle, sample_rate, extension):
    global skipped_files_count

    # Check whether the file has an extension
    if not extension or s3_path.lower() == extension:
        LOGGER.warning('"%s" without extension will not be sampled.', s3_path)
        skipped_files_count = skipped_files_count + 1
        return []
    if extension in ["csv", "txt"]:
        # If the file object was read from the S3 bucket, use its raw stream;
        # otherwise use the file object extracted from a zip or gz archive.
        file_handle = file_handle._raw_stream if hasattr(file_handle, "_raw_stream") else file_handle  # pylint:disable=protected-access
        iterator = csv.get_row_iterator(file_handle, table_spec, None, True)
        csv_records = []
        if iterator:
            csv_records = get_records_for_csv(s3_path, sample_rate, iterator)
        else:
            LOGGER.warning('Skipping "%s" file as it is empty', s3_path)
            skipped_files_count = skipped_files_count + 1
        return csv_records
    if extension == "gz":
        return sampling_gz_file(table_spec, s3_path, file_handle, sample_rate)
    if extension == "jsonl":
        # If the file object was read from the S3 bucket, use its raw stream;
        # otherwise use the file object extracted from a zip or gz archive.
        file_handle = file_handle._raw_stream if hasattr(file_handle, "_raw_stream") else file_handle
        records = get_records_for_jsonl(s3_path, sample_rate, file_handle)
        check_jsonl_sample_records, records = itertools.tee(records)
        jsonl_sample_records = list(check_jsonl_sample_records)
        if len(jsonl_sample_records) == 0:
            LOGGER.warning('Skipping "%s" file as it is empty', s3_path)
            skipped_files_count = skipped_files_count + 1
        check_key_properties_and_date_overrides_for_jsonl_file(table_spec, jsonl_sample_records, s3_path)
        return records
    if extension == "zip":
        LOGGER.warning('Skipping "%s" file as it contains nested compression.', s3_path)
        skipped_files_count = skipped_files_count + 1
        return []

    LOGGER.warning('"%s" having the ".%s" extension will not be sampled.', s3_path, extension)
    skipped_files_count = skipped_files_count + 1
    return []
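# Hedged usage sketch for the extension-dispatching sample_file above. How the
# extension is derived from the S3 key is an assumption; the tap may compute
# it elsewhere (e.g. during discovery).
def example_sample_by_extension(table_spec, s3_path, file_handle, sample_rate=5):
    extension = s3_path.split('.')[-1].lower() if '.' in s3_path else None
    return sample_file(table_spec, s3_path, file_handle, sample_rate, extension)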
def sync_table_file(config: Dict, s3_path: str, table_spec: Dict, stream: Dict) -> int:
    """
    Sync a given CSV file found in S3
    :param config: tap configuration
    :param s3_path: file path given by S3
    :param table_spec: tables specs
    :param stream: Stream data
    :return: number of streamed records
    """
    LOGGER.info('Syncing file "%s".', s3_path)

    bucket = config['bucket']
    table_name = table_spec['table_name']

    s3_file_handle = s3.get_file_handle(config, s3_path)
    iterator = get_row_iterator(s3_file_handle._raw_stream, table_spec)  # pylint:disable=protected-access

    records_synced = 0

    for row in iterator:
        custom_columns = {
            s3.SDC_SOURCE_BUCKET_COLUMN: bucket,
            s3.SDC_SOURCE_FILE_COLUMN: s3_path,
            # index zero, +1 for header row
            s3.SDC_SOURCE_LINENO_COLUMN: records_synced + 2
        }
        rec = {**row, **custom_columns}

        with Transformer() as transformer:
            to_write = transformer.transform(rec, stream['schema'], metadata.to_map(stream['metadata']))

        write_record(table_name, to_write)
        records_synced += 1

    return records_synced
def sync_table_file(config, s3_path, table_spec, stream):
    LOGGER.info('Syncing file "%s".', s3_path)

    bucket = config['bucket']
    table_name = table_spec['table_name']

    s3_file_handle = s3.get_file_handle(config, s3_path)

    # We observed data whose field size exceeded the default maximum of
    # 131072. We believe the primary consequence of the following setting
    # is that a malformed, wide CSV would potentially parse into a single
    # large field rather than giving this error, but we also think the
    # chances of that are very small and at any rate the source data would
    # need to be fixed. The other consequence of this could be larger
    # memory consumption but that's acceptable as well.
    csv.field_size_limit(sys.maxsize)

    iterator = singer_encodings_csv.get_row_iterator(
        s3_file_handle._raw_stream, table_spec)  # pylint:disable=protected-access

    records_synced = 0

    for row in iterator:
        custom_columns = {
            s3.SDC_SOURCE_BUCKET_COLUMN: bucket,
            s3.SDC_SOURCE_FILE_COLUMN: s3_path,
            # index zero, +1 for header row
            s3.SDC_SOURCE_LINENO_COLUMN: records_synced + 2
        }
        rec = {**row, **custom_columns}

        with Transformer() as transformer:
            to_write = transformer.transform(rec, stream['schema'], metadata.to_map(stream['metadata']))

        singer.write_record(table_name, to_write)
        records_synced += 1

    return records_synced
def sample_file(config, table_spec, s3_path, sample_rate):
    file_handle = get_file_handle(config, s3_path)
    iterator = csv.get_row_iterator(file_handle._raw_stream, table_spec)  # pylint:disable=protected-access

    current_row = 0
    sampled_row_count = 0

    for row in iterator:
        if (current_row % sample_rate) == 0:
            if row.get(csv.SDC_EXTRA_COLUMN):
                row.pop(csv.SDC_EXTRA_COLUMN)

            sampled_row_count += 1

            if (sampled_row_count % 200) == 0:
                LOGGER.info("Sampled %s rows from %s", sampled_row_count, s3_path)

            yield row

        current_row += 1

    LOGGER.info("Sampled %s rows from %s", sampled_row_count, s3_path)
def get_rows():
    LOGGER.info("Fetching rows from path: %s", s3_path)

    with tempfile.NamedTemporaryFile(mode='w+b', suffix=".csv.gz") as tmpfile:
        s3_file_handle = S3Helper.get_file_handle(self.connection_config, s3_path)

        gzip_file = gzip.GzipFile(mode='wb', fileobj=tmpfile)
        shutil.copyfileobj(s3_file_handle, gzip_file)
        gzip_file.close()
        s3_file_handle.close()

        LOGGER.info("Downloaded %s", s3_path)

        tmpfile.seek(0)
        gzip_file = gzip.GzipFile(mode='rb', fileobj=tmpfile)

        # pylint:disable=protected-access
        row_iterator = singer_encodings_csv.get_row_iterator(gzip_file, table_spec)

        for row in row_iterator:
            now_datetime = datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S.%f')

            custom_columns = {
                S3Helper.SDC_SOURCE_BUCKET_COLUMN: bucket,
                S3Helper.SDC_SOURCE_FILE_COLUMN: s3_path,
                S3Helper.SDC_SOURCE_LINENO_COLUMN: next(count),
                '_SDC_EXTRACTED_AT': now_datetime,
                '_SDC_BATCHED_AT': now_datetime,
                '_SDC_DELETED_AT': None
            }

            new_row = {}

            # make all columns safe
            # pylint: disable=invalid-name
            for k, v in row.items():
                new_row[safe_column_name(k)] = v

            yield {**new_row, **custom_columns}
def test(self):
    row_iterator = csv.get_row_iterator(self.csv_data, None, None, True)
    rows = [r for r in row_iterator]
    self.assertEqual(rows[0]['_sdc_extra'], [{"no_headers": ["4"]}])
def test(self):
    row_iterator = csv.get_row_iterator(self.csv_data)
    rows = [r for r in row_iterator]
    self.assertEqual(rows[0]['columnB'], '2')
def test(self):
    row_iterator = csv.get_row_iterator(self.csv_data)
    self.assertEqual(row_iterator.fieldnames, ["columnA", "columnB"])
def test(self):
    row_iterator = csv.get_row_iterator([])
    self.assertEqual(row_iterator.fieldnames, None)
def sync_table_file(config, s3_path, table_spec, stream, modified):
    LOGGER.info('Syncing file "%s".', s3_path)

    bucket = config['bucket']
    table_name = table_spec['table_name']

    s3_file_handle = s3.get_file_handle(config, s3_path)

    # We observed data whose field size exceeded the default maximum of
    # 131072. We believe the primary consequence of the following setting
    # is that a malformed, wide CSV would potentially parse into a single
    # large field rather than giving this error, but we also think the
    # chances of that are very small and at any rate the source data would
    # need to be fixed. The other consequence of this could be larger
    # memory consumption but that's acceptable as well.
    csv.field_size_limit(sys.maxsize)

    longitud = 0
    if s3_path.endswith('zip'):
        with io.BytesIO(s3_file_handle.read()) as tf:
            if tf is not None:
                tf.seek(0)

                # Read the file as a zipfile and process the members
                with zipfile.ZipFile(tf, mode='r') as zipf:
                    for subfile in zipf.namelist():
                        with zipf.open(subfile) as myfile:
                            iterator = singer_encodings_csv.get_row_iterator(myfile, table_spec)
                            rows = list(iterator)
                            longitud = len(rows)
    else:
        iterator = singer_encodings_csv.get_row_iterator(
            s3_file_handle._raw_stream, table_spec)  # pylint:disable=protected-access
        rows = list(iterator)
        longitud = len(rows)

    records_synced = 0
    current_row = 0
    i = 0

    for row in rows:
        custom_columns = {
            s3.SDC_SOURCE_BUCKET_COLUMN: bucket,
            s3.SDC_SOURCE_FILE_COLUMN: s3_path,
            # index zero, +1 for header row
            s3.SDC_SOURCE_LINENO_COLUMN: records_synced + 2
        }
        rec = {**row, **custom_columns}

        with Transformer() as transformer:
            to_write = transformer.transform(rec, stream['schema'], metadata.to_map(stream['metadata']))

        if "preprocess" in config and config['preprocess'] != '':
            preprocess_items = json.loads(config['preprocess'])
            for i in preprocess_items:
                preprocess = i
                if table_name == preprocess['table_name']:
                    for value in preprocess['values']:
                        to_get = value.split("|")[0]
                        to_del = value.split("|")[1]
                        if to_get in rec:
                            if to_del in rec:
                                if rec[to_get] == rec[to_del]:
                                    if to_del in to_write:
                                        del to_write[to_del]
                                else:
                                    LOGGER.warning('removing record: ' + json.dumps(rec) + ' ' +
                                                   to_get + ' and ' + to_del + ' are not equals')
                        elif to_del in rec:
                            to_write[to_get] = rec[to_del]
                            if to_del in to_write:
                                del to_write[to_del]
                        else:
                            to_write[to_get] = ""

        to_write['last_modified'] = modified.__str__()
        singer.write_record(table_name, to_write)
        records_synced += 1
        current_row += 1

        if i == longitud:
            continue

    return records_synced
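# Hedged example of the `preprocess` config consumed above: a JSON-encoded
# list of {"table_name", "values"} objects where each value is a
# "column_to_keep|column_to_drop" pair, matching the split("|") logic in the
# function. The table and column names here are illustrative only.
import json

example_config = {
    "bucket": "my-example-bucket",
    "preprocess": json.dumps([
        {
            "table_name": "orders",
            # Validate/merge customerId into customer_id, then drop customerId
            "values": ["customer_id|customerId"]
        }
    ])
}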
def test(self):
    row_iterator = csv.get_row_iterator(self.csv_data, None, None, True)
    rows = [r for r in row_iterator]
    self.assertEqual(rows[0]['_sdc_extra'], [{"columnB": "4"}, {"columnC": ["5", "6"]}])
    self.assertEqual(list(rows[0].keys()), ["columnA", "columnB", "columnC", "_sdc_extra"])
def test(self):
    row_iterator = csv.get_row_iterator(self.csv_data, None, None, True)
    rows = [r for r in row_iterator]
    self.assertEqual(list(rows[0].keys()), ["columnA", "columnB"])
def test(self):
    row_iterator = csv.get_row_iterator(self.csv_data)
    rows = [r for r in row_iterator]
    self.assertEqual(rows[0]['_sdc_extra'], ['4'])
def test(self, mocked_logger_warn):
    row_iterator = csv.get_row_iterator(self.csv_data, None, None, True)
    rows = [r for r in row_iterator]
    self.assertEqual(list(rows[0].keys()), ["columnA", "columnB", "columnC"])
    mocked_logger_warn.assert_called_with(
        'Duplicate Header(s) %s found in the csv and its value will be stored in the "_sdc_extra" field.',
        {'columnC'})
def test_get_row_iterator_return_none_for_empty_csv(self, mocked_logger_warn):
    row_iterator = csv.get_row_iterator([], None, None, True)
    self.assertEqual(row_iterator, None)
def test(self):
    options = {'quoting': 'NONE', 'delimiter': '\t'}
    row_iterator = csv.get_row_iterator(self.csv_data, options)
    rows = [r for r in row_iterator]

    # if csv.QUOTE_NONE is used, lines spread across a quote-pair are parsed individually
    self.assertEqual(len(rows), 2)