def test_smart_columns(self):
    with patch('sys.stdout', new_callable=StringIO) as fake_out:
        records_streamed = 0
        table_spec = TEST_TABLE_SPEC['tables'][7]
        modified_since = dateutil.parser.parse(table_spec['start_date'])
        target_files = file_utils.get_matching_objects(table_spec, modified_since)
        samples = file_utils.sample_files(table_spec, target_files, sample_rate=1)
        schema = generate_schema(table_spec, samples)
        # Stream every matching file through the patched stdout, counting records written
        for t_file in target_files:
            records_streamed += file_utils.write_file(t_file['key'], table_spec, schema.to_dict())

        # Parse the captured output back into records and verify the smart-column metadata
        raw_records = fake_out.getvalue().split('\n')
        records = [json.loads(raw) for raw in raw_records if raw]
        self.assertEqual(
            records_streamed, len(records),
            "Number of records written to the pipe differed from the number read from the pipe.")
        self.assertTrue(records[0]['type'] == "RECORD")
        self.assertTrue(len(records[0]) == 3)
        self.assertTrue(len(records[0]['record']) == 7)
        self.assertTrue("_smart_source_bucket" in records[0]['record'])
        self.assertTrue("_smart_source_lineno" in records[0]['record'])
def discover(config):
    streams = []
    for table_spec in config['tables']:
        try:
            # Find objects modified since the configured start date and sample them
            modified_since = dateutil.parser.parse(table_spec['start_date'])
            target_files = file_utils.get_matching_objects(table_spec, modified_since)
            sample_rate = table_spec.get('sample_rate', 5)
            max_sampling_read = table_spec.get('max_sampling_read', 1000)
            max_sampled_files = table_spec.get('max_sampled_files', 50)
            samples = file_utils.sample_files(table_spec, target_files,
                                              sample_rate=sample_rate,
                                              max_records=max_sampling_read,
                                              max_files=max_sampled_files)
            schema = generate_schema(table_spec, samples)

            stream_metadata = []
            key_properties = table_spec.get('key_properties', [])
            streams.append(
                CatalogEntry(
                    tap_stream_id=table_spec['name'],
                    stream=table_spec['name'],
                    schema=schema,
                    key_properties=key_properties,
                    metadata=stream_metadata,
                    replication_key=None,
                    is_view=None,
                    database=None,
                    table=None,
                    row_count=None,
                    stream_alias=None,
                    replication_method=None,
                )
            )
        except Exception as err:
            LOGGER.error(f"Unable to write Catalog entry for '{table_spec['name']}' - it will be skipped due to error {err}")

    return Catalog(streams)
def discover(config):
    streams = []
    for table_spec in config['tables']:
        modified_since = dateutil.parser.parse(table_spec['start_date'])
        target_files = file_utils.get_input_files_for_table(table_spec, modified_since)
        sample_rate = table_spec.get('sample_rate', 10)
        max_sampling_read = table_spec.get('max_sampling_read', 1000)
        max_sampled_files = table_spec.get('max_sampled_files', 5)
        prefer_number_vs_integer = table_spec.get('prefer_number_vs_integer', False)
        samples = file_utils.sample_files(table_spec, target_files,
                                          sample_rate=sample_rate,
                                          max_records=max_sampling_read,
                                          max_files=max_sampled_files)

        metadata_schema = {
            '_smart_source_bucket': {'type': 'string'},
            '_smart_source_file': {'type': 'string'},
            '_smart_source_lineno': {'type': 'integer'},
        }
        data_schema = conversion.generate_schema(
            samples, prefer_number_vs_integer=prefer_number_vs_integer)
        inferred_schema = {
            'type': 'object',
            'properties': merge_dicts(data_schema, metadata_schema)
        }

        merged_schema = override_schema_with_config(inferred_schema, table_spec)
        schema = Schema.from_dict(merged_schema)

        stream_metadata = []
        key_properties = table_spec.get('key_properties', [])
        streams.append(
            CatalogEntry(
                tap_stream_id=table_spec['name'],
                stream=table_spec['name'],
                schema=schema,
                key_properties=key_properties,
                metadata=stream_metadata,
                replication_key=None,
                is_view=None,
                database=None,
                table=None,
                row_count=None,
                stream_alias=None,
                replication_method=None,
            )
        )

    return Catalog(streams)
def discover(config):
    streams = []
    for table_spec in config['tables']:
        try:
            # Find objects modified since the configured start date and sample them
            modified_since = dateutil.parser.parse(table_spec['start_date'])
            target_files = file_utils.get_matching_objects(table_spec, modified_since)
            sample_rate = table_spec.get('sample_rate', 5)
            max_sampling_read = table_spec.get('max_sampling_read', 1000)
            max_sampled_files = table_spec.get('max_sampled_files', 50)
            prefer_number_vs_integer = table_spec.get('prefer_number_vs_integer', False)
            samples = file_utils.sample_files(table_spec, target_files,
                                              sample_rate=sample_rate,
                                              max_records=max_sampling_read,
                                              max_files=max_sampled_files)

            # Merge the inferred data schema with the smart-column metadata fields
            metadata_schema = {
                '_smart_source_bucket': {'type': 'string'},
                '_smart_source_file': {'type': 'string'},
                '_smart_source_lineno': {'type': 'integer'},
            }
            data_schema = conversion.generate_schema(
                samples, prefer_number_vs_integer=prefer_number_vs_integer)
            inferred_schema = {
                'type': 'object',
                'properties': merge_dicts(data_schema, metadata_schema)
            }

            merged_schema = override_schema_with_config(inferred_schema, table_spec)

            # Ignore empty columns
            merged_schema['properties'] = {
                key: value
                for key, value in merged_schema['properties'].items()
                if key != ''
            }

            schema = Schema.from_dict(merged_schema)

            stream_metadata = []
            key_properties = table_spec.get('key_properties', [])
            streams.append(
                CatalogEntry(
                    tap_stream_id=table_spec['name'],
                    stream=table_spec['name'],
                    schema=schema,
                    key_properties=key_properties,
                    metadata=stream_metadata,
                    replication_key=None,
                    is_view=None,
                    database=None,
                    table=None,
                    row_count=None,
                    stream_alias=None,
                    replication_method=None,
                )
            )
        except Exception as err:
            LOGGER.error(
                f"Unable to write Catalog entry for '{table_spec['name']}' - it will be skipped due to error {err}"
            )

    return Catalog(streams)
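# For reference, a minimal config sketch for discover(). It lists only the table_spec keys the
# code above actually reads ('name', 'start_date', 'key_properties', and the sampling knobs);
# the values are hypothetical, and any file-location settings consumed inside file_utils
# (bucket, path patterns, format) are intentionally omitted here.
example_config = {
    'tables': [
        {
            'name': 'orders',                      # hypothetical stream name
            'start_date': '2020-01-01T00:00:00Z',  # hypothetical; parsed with dateutil
            'key_properties': ['id'],              # hypothetical key column
            'sample_rate': 5,
            'max_sampling_read': 1000,
            'max_sampled_files': 50,
            'prefer_number_vs_integer': False,
        }
    ]
}
# catalog = discover(example_config)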