def test_smart_columns(self):
    with patch('sys.stdout', new_callable=StringIO) as fake_out:
        records_streamed = 0
        table_spec = TEST_TABLE_SPEC['tables'][7]
        modified_since = dateutil.parser.parse(table_spec['start_date'])
        target_files = file_utils.get_matching_objects(table_spec, modified_since)
        samples = file_utils.sample_files(table_spec, target_files, sample_rate=1)
        schema = generate_schema(table_spec, samples)
        for t_file in target_files:
            records_streamed += file_utils.write_file(t_file['key'], table_spec, schema.to_dict())

    # Parse the Singer messages captured from stdout and verify the record shape.
    raw_records = fake_out.getvalue().split('\n')
    records = [json.loads(raw) for raw in raw_records if raw]
    self.assertEqual(
        records_streamed, len(records),
        "Number of records written to the pipe differed from records read from the pipe.")
    self.assertTrue(records[0]['type'] == "RECORD")
    self.assertTrue(len(records[0]) == 3)
    self.assertTrue(len(records[0]['record']) == 7)
    self.assertTrue("_smart_source_bucket" in records[0]['record'])
    self.assertTrue("_smart_source_lineno" in records[0]['record'])
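# For orientation only: a hypothetical Singer RECORD message matching the assertions
# above (three top-level keys; seven record fields, including the injected
# _smart_source_bucket / _smart_source_file / _smart_source_lineno metadata columns).
# The stream name and data values are illustrative assumptions, not taken from the fixture.
EXAMPLE_SMART_COLUMNS_MESSAGE = {
    "type": "RECORD",
    "stream": "smart_columns_example",
    "record": {
        "id": "1",
        "name": "example",
        "value": "42",
        "another_field": "x",
        "_smart_source_bucket": "example-bucket",
        "_smart_source_file": "exports/example.csv",
        "_smart_source_lineno": 2,
    },
}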
def sync(config, state, catalog):
    # Loop over selected streams in catalog
    LOGGER.info(f"Processing {len(list(catalog.get_selected_streams(state)))} selected streams from Catalog")
    for stream in catalog.get_selected_streams(state):
        LOGGER.info("Syncing stream: " + stream.tap_stream_id)

        catalog_schema = stream.schema.to_dict()
        table_spec = next((x for x in config['tables'] if x['name'] == stream.tap_stream_id), None)
        if table_spec is not None:
            # Allow updates to our tables specification to override any previously extracted schema in the catalog
            merged_schema = override_schema_with_config(catalog_schema, table_spec)
            singer.write_schema(
                stream_name=stream.tap_stream_id,
                schema=merged_schema,
                key_properties=stream.key_properties,
            )

            modified_since = dateutil.parser.parse(
                state.get(stream.tap_stream_id, {}).get('modified_since') or table_spec['start_date'])
            target_files = file_utils.get_matching_objects(table_spec, modified_since)

            max_records_per_run = table_spec.get('max_records_per_run', -1)
            records_streamed = 0
            for t_file in target_files:
                records_streamed += file_utils.write_file(
                    t_file['key'], table_spec, merged_schema,
                    max_records=max_records_per_run - records_streamed)
                if 0 < max_records_per_run <= records_streamed:
                    LOGGER.info(f'Processed the per-run limit of {records_streamed} records for stream '
                                f'"{stream.tap_stream_id}". Stopping sync for this stream.')
                    break
                state[stream.tap_stream_id] = {'modified_since': t_file['last_modified'].isoformat()}
                singer.write_state(state)

            LOGGER.info(f'Wrote {records_streamed} records for stream "{stream.tap_stream_id}".')
        else:
            LOGGER.warning(f'Skipping processing for stream [{stream.tap_stream_id}] without a config block.')
    return
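# A minimal sketch of a config['tables'] entry that sync() and discover() consume.
# Only the keys are taken from the code in this section; the values (bucket URI,
# pattern, dates, key names) are illustrative assumptions, not from a real deployment.
EXAMPLE_CONFIG = {
    "tables": [
        {
            "name": "products",                     # becomes the stream's tap_stream_id
            "path": "s3://example-bucket",          # hypothetical bucket URI
            "pattern": "exports/products.*\\.csv",  # hypothetical file-matching pattern
            "start_date": "2020-01-01T00:00:00Z",   # fallback when state has no modified_since
            "key_properties": ["id"],
            # Optional knobs, shown with the defaults used in discover()/sync():
            "sample_rate": 5,
            "max_sampling_read": 1000,
            "max_sampled_files": 50,
            "max_records_per_run": -1,              # -1 disables the per-run limit
            "prefer_number_vs_integer": False,
        }
    ]
}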
def discover(config):
    streams = []
    for table_spec in config['tables']:
        try:
            modified_since = dateutil.parser.parse(table_spec['start_date'])
            target_files = file_utils.get_matching_objects(table_spec, modified_since)
            sample_rate = table_spec.get('sample_rate', 5)
            max_sampling_read = table_spec.get('max_sampling_read', 1000)
            max_sampled_files = table_spec.get('max_sampled_files', 50)
            samples = file_utils.sample_files(table_spec, target_files,
                                              sample_rate=sample_rate,
                                              max_records=max_sampling_read,
                                              max_files=max_sampled_files)

            schema = generate_schema(table_spec, samples)

            stream_metadata = []
            key_properties = table_spec.get('key_properties', [])
            streams.append(
                CatalogEntry(
                    tap_stream_id=table_spec['name'],
                    stream=table_spec['name'],
                    schema=schema,
                    key_properties=key_properties,
                    metadata=stream_metadata,
                    replication_key=None,
                    is_view=None,
                    database=None,
                    table=None,
                    row_count=None,
                    stream_alias=None,
                    replication_method=None,
                )
            )
        except Exception as err:
            LOGGER.error(f"Unable to write Catalog entry for '{table_spec['name']}' - "
                         f"it will be skipped due to error {err}")

    return Catalog(streams)
def test_https_bucket(self):
    table_spec = TEST_TABLE_SPEC['tables'][4]
    modified_since = dateutil.parser.parse(table_spec['start_date'])
    target_files = file_utils.get_matching_objects(table_spec, modified_since)
    assert len(target_files) == 1

    target_uri = table_spec['path'] + '/' + target_files[0]["key"]
    iterator = get_row_iterator(TEST_TABLE_SPEC['tables'][4], target_uri)
    row = next(iterator)
    self.assertTrue(int(row['id']) > 0, row['id'] + " was not positive")
def test_renamed_https_object(self):
    table_spec = TEST_TABLE_SPEC['tables'][6]
    modified_since = dateutil.parser.parse(table_spec['start_date'])
    target_files = file_utils.get_matching_objects(table_spec, modified_since)
    assert len(target_files) == 1

    target_uri = table_spec['path'] + '/' + table_spec['pattern']
    iterator = get_row_iterator(TEST_TABLE_SPEC['tables'][6], target_uri)
    row = next(iterator)
    self.assertTrue(len(row) > 1, "Not able to read a row.")
def test_indirect_https_bucket(self):
    table_spec = TEST_TABLE_SPEC['tables'][5]
    modified_since = dateutil.parser.parse(table_spec['start_date'])
    target_files = file_utils.get_matching_objects(table_spec, modified_since)
    assert len(target_files) == 1

    target_uri = table_spec['path'] + '/' + table_spec['pattern']
    iterator = get_row_iterator(TEST_TABLE_SPEC['tables'][5], target_uri)
    row = next(iterator)
    self.assertTrue(row['1976'] == '1976', "Row did not contain expected data")
def discover(config):
    streams = []
    for table_spec in config['tables']:
        try:
            modified_since = dateutil.parser.parse(table_spec['start_date'])
            target_files = file_utils.get_matching_objects(table_spec, modified_since)
            sample_rate = table_spec.get('sample_rate', 5)
            max_sampling_read = table_spec.get('max_sampling_read', 1000)
            max_sampled_files = table_spec.get('max_sampled_files', 50)
            prefer_number_vs_integer = table_spec.get('prefer_number_vs_integer', False)
            samples = file_utils.sample_files(table_spec, target_files,
                                              sample_rate=sample_rate,
                                              max_records=max_sampling_read,
                                              max_files=max_sampled_files)

            metadata_schema = {
                '_smart_source_bucket': {'type': 'string'},
                '_smart_source_file': {'type': 'string'},
                '_smart_source_lineno': {'type': 'integer'},
            }
            data_schema = conversion.generate_schema(
                samples, prefer_number_vs_integer=prefer_number_vs_integer)
            inferred_schema = {
                'type': 'object',
                'properties': merge_dicts(data_schema, metadata_schema)
            }

            merged_schema = override_schema_with_config(inferred_schema, table_spec)
            schema = Schema.from_dict(merged_schema)

            stream_metadata = []
            key_properties = table_spec.get('key_properties', [])
            streams.append(
                CatalogEntry(
                    tap_stream_id=table_spec['name'],
                    stream=table_spec['name'],
                    schema=schema,
                    key_properties=key_properties,
                    metadata=stream_metadata,
                    replication_key=None,
                    is_view=None,
                    database=None,
                    table=None,
                    row_count=None,
                    stream_alias=None,
                    replication_method=None,
                )
            )
        except Exception as err:
            LOGGER.error(f"Unable to write Catalog entry for '{table_spec['name']}' - "
                         f"it will be skipped due to error {err}")

    return Catalog(streams)
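# A minimal sketch of emitting the catalog produced by discover() in discovery mode.
# The entry-point name and the use of json/sys here are assumptions; singer-python's
# Catalog.to_dict() serializes the CatalogEntry objects built above.
import json
import sys

def do_discover(config):
    catalog = discover(config)
    json.dump(catalog.to_dict(), sys.stdout, indent=2)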
def test_local_bucket(self):
    table_spec = TEST_TABLE_SPEC['tables'][1]
    modified_since = dateutil.parser.parse(table_spec['start_date'])
    target_files = file_utils.get_matching_objects(table_spec, modified_since)
    assert len(target_files) == 1