def test_generate_schema(self):
    """generate_schema infers integer, number, and string column types from string samples."""
    int_and_string_rows = [
        {'id': '1', 'first_name': 'Connor'},
        {'id': '2', 'first_name': '1'},
    ]
    self.assertEqual(
        generate_schema(int_and_string_rows),
        {
            'id': {'type': ['null', 'integer'], '_conversion_type': 'integer'},
            'first_name': {'type': ['null', 'string'], '_conversion_type': 'string'},
        })

    # A mix of integer-like and decimal-like strings resolves to 'number'.
    numeric_rows = [
        {'id': '1', 'cost': '1'},
        {'id': '2', 'cost': '1.25'},
    ]
    self.assertEqual(
        generate_schema(numeric_rows),
        {
            'id': {'type': ['null', 'integer'], '_conversion_type': 'integer'},
            'cost': {'type': ['null', 'number'], '_conversion_type': 'number'},
        })

    # Date-looking values stay plain strings without an override.
    date_rows = [
        {'id': '1', 'date': '2017-01-01'},
        {'id': '2', 'date': '2017-01-02'},
    ]
    self.assertEqual(
        generate_schema(date_rows),
        {
            'id': {'type': ['null', 'integer'], '_conversion_type': 'integer'},
            'date': {'type': ['null', 'string'], '_conversion_type': 'string'},
        })
def get_sampled_schema_for_table(config, table_spec):
    """Build a JSON schema for a table by sampling records from its S3 files.

    Merges the schema inferred from sampled records with the fixed
    ``_s3_source_*`` metadata columns.
    """
    logger.info('Sampling records to determine table schema.')

    s3_files = s3.get_input_files_for_table(config, table_spec)
    samples = s3.sample_files(config, table_spec, s3_files)

    # Columns describing where each record came from.
    metadata_schema = {
        '_s3_source_bucket': {'type': 'string'},
        '_s3_source_file': {'type': 'string'},
        '_s3_source_lineno': {'type': 'integer'},
    }

    data_schema = conversion.generate_schema(samples)

    return {
        'type': 'object',
        'properties': merge_dicts(data_schema, metadata_schema),
    }
def get_sampled_schema_for_table(config, table_spec):
    """Detect a JSON schema for a table by sampling its input files.

    :param config: tap configuration dict; ``string_max_length`` is forwarded
        to the schema generator (defaults to False when absent)
    :param table_spec: single table specification dict
    :return: a JSON-schema 'object' dict; properties are empty when no
        samples were found, so every incoming record is accepted
    """
    LOGGER.info('Sampling records to determine table schema.')

    s3_files_gen = get_input_files_for_table(config, table_spec)
    # list() instead of an identity comprehension (ruff C416/PERF402).
    samples = list(sample_files(config, table_spec, s3_files_gen))

    # NOTE(review): skipped_files_count appears to be a module-level counter
    # maintained by sample_files — confirm it is reset between sampling runs.
    if skipped_files_count:
        LOGGER.warning(
            "%s files got skipped during the last sampling.",
            skipped_files_count)

    if not samples:
        # Return empty properties for accept everything from data if no samples found
        return {'type': 'object', 'properties': {}}

    data_schema = conversion.generate_schema(
        samples, table_spec, config.get('string_max_length', False))

    return {'type': 'object', 'properties': data_schema}
def get_sampled_schema_for_table(config: Dict, table_spec: Dict) -> Dict:
    """
    Detects json schema using a sample of table/stream data

    :param config: Tap config
    :param table_spec: tables specs
    :return: detected schema, or an empty dict when no samples are available
    """
    LOGGER.info('Sampling records to determine table schema.')

    # Only consider files modified on or after the configured start date.
    modified_since = utils.strptime_with_tz(config['start_date'])
    file_gen = get_input_files_for_table(config, table_spec, modified_since)
    samples = list(sample_files(config, table_spec, file_gen))

    if not samples:
        return {}

    # Fixed metadata columns attached to every emitted record.
    metadata_schema = {
        SDC_SOURCE_BUCKET_COLUMN: {'type': 'string'},
        SDC_SOURCE_FILE_COLUMN: {'type': 'string'},
        SDC_SOURCE_LINENO_COLUMN: {'type': 'integer'},
        SDC_EXTRA_COLUMN: {'type': 'array', 'items': {'type': 'string'}},
    }

    detected = conversion.generate_schema(samples, table_spec)

    return {
        'type': 'object',
        'properties': merge_dicts(detected, metadata_schema),
    }
def get_sampled_schema_for_table(config, table_spec):
    """Sample a table's input files and derive its JSON schema.

    Returns an empty dict when the table has no input files at all.
    """
    LOGGER.info('Sampling records to determine table schema.')

    input_files = get_input_files_for_table(config, table_spec)
    if not input_files:
        return {}

    samples = sample_files(config, table_spec, input_files)

    # Metadata columns merged alongside the sampled data columns.
    metadata_schema = {
        SDC_SOURCE_BUCKET_COLUMN: {'type': 'string'},
        SDC_SOURCE_FILE_COLUMN: {'type': 'string'},
        SDC_SOURCE_LINENO_COLUMN: {'type': 'integer'},
        csv.SDC_EXTRA_COLUMN: {
            'type': 'array',
            'items': {'type': 'string'},
        },
    }

    detected = conversion.generate_schema(samples, table_spec)

    return {
        'type': 'object',
        'properties': merge_dicts(detected, metadata_schema),
    }
def get_sampled_schema_for_table(config, table_spec):
    """Infer a JSON schema for a table from a sample of its files.

    :param config: tap configuration dict
    :param table_spec: single table specification dict
    :return: a JSON-schema 'object' dict, or {} when no samples were found
    """
    LOGGER.info('Sampling records to determine table schema.')

    s3_files_gen = get_input_files_for_table(config, table_spec)
    # list() instead of an identity comprehension (ruff C416/PERF402).
    samples = list(sample_files(config, table_spec, s3_files_gen))

    if not samples:
        return {}

    data_schema = conversion.generate_schema(samples, table_spec)

    return {'type': 'object', 'properties': data_schema}
def test_generate_schema(self):
    """Column types are inferred from samples; date_overrides columns get a date-time format."""
    samples = [
        {'id': '1', 'name': 'productA', 'added_at': '2017/05/18 10:40:22',
         'price': '22.99', 'sold': 'true', 'sold_at': '2019-11-29'},
        {'id': '4', 'name': 'productB', 'added_at': '2017/05/18 10:40:22',
         'price': '18', 'sold': 'false'},
        {'id': '6', 'name': 'productC', 'added_at': '2017/05/18 10:40:22',
         'price': '14.6', 'sold': 'true', 'sold_at': '2019-12-11'},
    ]
    table_specs = {'date_overrides': ['added_at']}

    schema = generate_schema(samples, table_specs)

    expected = {
        'id': {'type': ['null', 'integer']},
        'name': {'type': ['null', 'string']},
        # 'added_at' is listed in date_overrides, hence the date-time format.
        'added_at': {'type': ['null', 'string'], 'format': 'date-time'},
        'price': {'type': ['null', 'number']},
        'sold': {'type': ['null', 'string']},
        # 'sold_at' is NOT overridden, so it stays a plain string.
        'sold_at': {'type': ['null', 'string']},
    }
    self.assertDictEqual(expected, schema)
def get_sampled_schema_for_table(config, table_spec):
    """Sample a table's files and build its JSON schema.

    Merges the schema inferred from sampled records with the singer metadata
    columns. When sampling yields no records, an empty-properties schema is
    returned so that every incoming record is accepted.

    :param config: tap configuration dict
    :param table_spec: single table specification dict
    :return: a JSON-schema 'object' dict
    """
    LOGGER.info('Sampling records to determine table schema.')

    s3_files_gen = get_input_files_for_table(config, table_spec)
    # list() instead of an identity comprehension (ruff C416/PERF402).
    samples = list(sample_files(config, table_spec, s3_files_gen))

    # NOTE(review): skipped_files_count appears to be a module-level counter
    # maintained by sample_files — confirm it is reset between sampling runs.
    if skipped_files_count:
        LOGGER.warning("%s files got skipped during the last sampling.",
                       skipped_files_count)

    if not samples:
        #Return empty properties for accept everything from data if no samples found
        return {'type': 'object', 'properties': {}}

    metadata_schema = {
        SDC_SOURCE_BUCKET_COLUMN: {'type': 'string'},
        SDC_SOURCE_FILE_COLUMN: {'type': 'string'},
        SDC_SOURCE_LINENO_COLUMN: {'type': 'integer'},
        # Extra CSV cells may be captured either as raw strings or as an
        # object keyed by header, hence the anyOf.
        SDC_EXTRA_COLUMN: {
            'type': 'array',
            'items': {
                'anyOf': [
                    {'type': 'object', 'properties': {}},
                    {'type': 'string'},
                ]
            }
        }
    }

    data_schema = conversion.generate_schema(samples, table_spec)

    return {
        'type': 'object',
        'properties': merge_dicts(data_schema, metadata_schema)
    }
def get_sampled_schema_for_table(config: Dict, table_spec: Dict) -> Dict:
    """
    Detects json schema using a sample of table/stream data

    Files are sampled newest-first (descending 'last_modified').

    :param config: Tap config
    :param table_spec: tables specs
    :return: detected schema, or {} when no samples were found
    """
    LOGGER.info('Sampling records to determine table schema.')

    s3_files_gen = get_input_files_for_table(config, table_spec)
    # sorted() consumes the generator directly; the intermediate identity
    # comprehension was unnecessary (ruff C416/PERF402).
    sorted_s3_files = sorted(s3_files_gen,
                             reverse=True,
                             key=lambda k: k['last_modified'])

    samples = list(sample_files(config, table_spec, sorted_s3_files))

    if not samples:
        return {}

    metadata_schema = {
        SDC_SOURCE_BUCKET_COLUMN: {'type': 'string'},
        SDC_SOURCE_FILE_COLUMN: {'type': 'string'},
        SDC_SOURCE_LINENO_COLUMN: {'type': 'integer'},
        SDC_EXTRA_COLUMN: {'type': 'array', 'items': {'type': 'string'}},
    }

    data_schema = conversion.generate_schema(samples, table_spec)

    return {
        'type': 'object',
        'properties': merge_dicts(data_schema, metadata_schema)
    }
def get_sampled_schema_for_table(config, table_spec):
    """Detect a table's JSON schema from sampled records.

    Adds singer metadata columns (source bucket/file/lineno, extra cells)
    plus a LAST_MODIFIED date-time column to the inferred data schema.

    :param config: tap configuration dict
    :param table_spec: single table specification dict
    :return: a JSON-schema 'object' dict, or {} when no samples were found
    """
    LOGGER.info('Sampling records to determine table schema.')

    s3_files_gen = get_input_files_for_table(config, table_spec)
    # list() instead of an identity comprehension (ruff C416/PERF402).
    samples = list(sample_files(config, table_spec, s3_files_gen))

    if not samples:
        return {}

    metadata_schema = {
        SDC_SOURCE_BUCKET_COLUMN: {'type': 'string'},
        SDC_SOURCE_FILE_COLUMN: {'type': 'string'},
        SDC_SOURCE_LINENO_COLUMN: {'type': 'integer'},
        csv_singer.SDC_EXTRA_COLUMN: {
            'type': 'array',
            'items': {'type': 'string'}
        },
        LAST_MODIFIED: {
            'type': 'string',
            'format': 'date-time'
        }
    }

    data_schema = conversion.generate_schema(samples, table_spec)

    return {
        'type': 'object',
        'properties': merge_dicts(data_schema, metadata_schema)
    }
def get_sampled_schema_for_table(config, table_spec):
    """Detect a table's JSON schema from sampled records.

    :param config: tap configuration dict
    :param table_spec: single table specification dict
    :return: a JSON-schema 'object' dict, or {} when no samples were found
    """
    LOGGER.info("Sampling records to determine table schema.")

    s3_files_gen = get_input_files_for_table(config, table_spec)
    # list() instead of an identity comprehension (ruff C416/PERF402).
    samples = list(sample_files(config, table_spec, s3_files_gen))

    if not samples:
        return {}

    metadata_schema = {
        SDC_SOURCE_BUCKET_COLUMN: {"type": "string"},
        SDC_SOURCE_FILE_COLUMN: {"type": "string"},
        SDC_SOURCE_LINENO_COLUMN: {"type": "integer"},
        csv.SDC_EXTRA_COLUMN: {
            "type": "array",
            "items": {"type": "string"}
        },
    }

    data_schema = conversion.generate_schema(samples, table_spec)

    return {
        "type": "object",
        "properties": merge_dicts(data_schema, metadata_schema)
    }
def test_generate_schema(self, mock_count_sample):
    """JSONL samples with nested/list/date values produce the expected anyOf schema."""
    samples = [{
        'name': 'test',
        'id': 3,
        'marks': [45.85, 25.38],
        'students': {'no': 5, 'col': 6},
        'created_at': '20-05-2021',
        'tota': [],
    }]
    table_spec = {
        'search_prefix': '',
        'search_pattern': 'test\\/.*\\.jsonl',
        'table_name': 'jsonl_table',
        'key_properties': ['id'],
        'date_overrides': ['created_at'],
        'delimiter': ',',
    }

    actual = conversion.generate_schema(samples, table_spec)

    expected = {
        # Scalars keep simple nullable types; integers also allow strings.
        'name': {'type': ['null', 'string']},
        'id': {'type': ['null', 'integer', 'string']},
        # Lists and dicts fall back to anyOf with a string alternative.
        'marks': {
            'anyOf': [
                {'type': 'array',
                 'items': {'type': ['null', 'number', 'string']}},
                {'type': ['null', 'string']},
            ]
        },
        'students': {
            'anyOf': [
                {'type': 'object', 'properties': {}},
                {'type': ['null', 'string']},
            ]
        },
        # date_overrides columns gain a date-time alternative.
        'created_at': {
            'anyOf': [
                {'type': ['null', 'string'], 'format': 'date-time'},
                {'type': ['null', 'string']},
            ]
        },
        'tota': {
            'anyOf': [
                {'type': 'array', 'items': ['null', 'string']},
                {'type': ['null', 'string']},
            ]
        },
    }
    self.assertEqual(actual, expected)
def test_generate_schema(self):
    """generate_schema output matches the data_schema.json golden fixture."""
    sample = self.load_file("sample.json", "data_test")
    spec = self.load_file("table_spec_without_key.json", "data_test")
    expected = self.load_file("data_schema.json", "data_test")

    actual = generate_schema([sample], spec)

    # Compare the serialized forms, as the original assertion does.
    self.assertEqual(simplejson.dumps(expected), simplejson.dumps(actual))