def get_sampled_schema_for_table(config, table_spec):
    logger.info('Sampling records to determine table schema.')

    s3_files = s3.get_input_files_for_table(config, table_spec)
    samples = s3.sample_files(config, table_spec, s3_files)

    metadata_schema = {
        '_s3_source_bucket': {'type': 'string'},
        '_s3_source_file': {'type': 'string'},
        '_s3_source_lineno': {'type': 'integer'},
    }

    data_schema = conversion.generate_schema(samples)

    return {
        'type': 'object',
        'properties': merge_dicts(data_schema, metadata_schema)
    }
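# `merge_dicts` is used above but not defined in this excerpt. A minimal sketch
# of the assumed behavior (a shallow left-to-right merge, so the `_s3_source_*`
# metadata columns win over same-named sampled columns); the real helper may
# merge nested dicts recursively:
def merge_dicts(*dicts):
    merged = {}
    for d in dicts:
        merged.update(d)
    return merged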
def test_sample_file(self):
    config = self.load_file("config-file.json", "data_test")
    s3_files = [{'key': 'value'}]
    table_input = self.load_file("table_spec_without_key.json", "data_test")
    # Stub per-file sampling so the test never touches S3.
    tap_s3_csv.s3.sample_file = Mock(return_value=[])
    # Patch the module-level logger: `tap_s3_csv.s3.LOGGER` is created at
    # import time, so patching `singer.get_logger` here would have no effect.
    with mock.patch('tap_s3_csv.s3.LOGGER.info') as patching:
        # Exhaust the generator first; with `sample_file` stubbed to return an
        # empty list it yields nothing, so the assertions must run after the
        # loop rather than inside it.
        for _ in sample_files(config, table_input, s3_files):
            pass
        patching.assert_any_call("Sampling files (max files: %s)", 2)
        patching.assert_any_call(
            "Sampling %s (max records: %s, sample rate: %s)", "value", 1000, 2)
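# The assignment to `tap_s3_csv.s3.sample_file` in the test above is never
# undone, so the stub leaks into any test that runs afterwards. A hedged sketch
# of a decorator-based variant (assuming the same imports and fixtures as the
# test above; `test_sample_file_restores_stub` and its final assertion are
# illustrative, not part of the original suite). `mock.patch` restores the real
# function automatically when the test returns:
@mock.patch('tap_s3_csv.s3.sample_file', return_value=[])
def test_sample_file_restores_stub(self, mocked_sample_file):
    config = self.load_file("config-file.json", "data_test")
    table_input = self.load_file("table_spec_without_key.json", "data_test")
    # Exhaust the generator; the stub yields no records.
    list(sample_files(config, table_input, [{'key': 'value'}]))
    # Assumes sample_files() calls sample_file() once per listed file.
    mocked_sample_file.assert_called_once()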
def test_sampling_of_gz_file_stored_with_jsonl_extension(
        self, mock_jsonl_sample_file, mock_get_file_handle,
        mock_get_files_to_sample_csv, mocked_logger):
    # The four mock arguments are injected by @mock.patch decorators that are
    # not shown in this excerpt.
    table_spec = {}
    s3_files = "unittest_compressed_files/gz_stored_as_jsonl.jsonl"
    sample_rate = 5
    config = []

    actual_output = [
        sample
        for sample in s3.sample_files(config, table_spec, s3_files, sample_rate)
    ]
    self.assertEqual(len(actual_output), 0)

    new_s3_path = "unittest_compressed_files/gz_stored_as_jsonl.jsonl"
    mocked_logger.assert_called_with(
        'Skipping %s file as parsing failed. Verify an extension of the file.',
        new_s3_path)