Example 1
# Assumes tap-s3-csv module-level names: s3, conversion, merge_dicts, logger.
def get_sampled_schema_for_table(config, table_spec):
    logger.info('Sampling records to determine table schema.')

    s3_files = s3.get_input_files_for_table(config, table_spec)

    samples = s3.sample_files(config, table_spec, s3_files)

    # Metadata columns injected alongside the sampled data columns.
    metadata_schema = {
        '_s3_source_bucket': {
            'type': 'string'
        },
        '_s3_source_file': {
            'type': 'string'
        },
        '_s3_source_lineno': {
            'type': 'integer'
        },
    }

    data_schema = conversion.generate_schema(samples)

    return {
        'type': 'object',
        'properties': merge_dicts(data_schema, metadata_schema)
    }
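Example 1 calls merge_dicts without showing it. The sketch below is an assumed implementation (a non-mutating merge where keys from the second dict, here the metadata schema, win on conflict); it is not the actual helper from tap-s3-csv.

def merge_dicts(first, second):
    # Copy so neither input is mutated; keys in `second` take precedence.
    merged = dict(first)
    merged.update(second)
    return merged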
Example 2
def test_sample_file(self):
    config = self.load_file("config-file.json", "data_test")
    s3_files = [{'key': 'value'}]
    table_input = self.load_file("table_spec_without_key.json", "data_test")
    tap_s3_csv.s3.sample_file = Mock(return_value=[])
    with mock.patch('singer.get_logger') as patching:
        for sample in sample_files(config, table_input, s3_files):
            patching.assert_called_with("Sampling files (max files: %s)", 2)
            patching.assert_called_with(
                "Sampling %s (max records: %s, sample rate: %s)",
                "value", 1000, 2)
Example 3
    def test_sampling_of_gz_file_stored_with_jsonl_Extension(
            self, mock_jsonl_sample_file, mock_get_file_handle,
            mock_get_files_to_sample_csv, mocked_logger):
        table_spec = {}
        s3_files = "unittest_compressed_files/gz_stored_as_jsonl.jsonl"
        sample_rate = 5
        config = []

        actual_output = [
            sample for sample in s3.sample_files(config, table_spec, s3_files,
                                                 sample_rate)
        ]

        self.assertEqual(len(actual_output), 0)

        new_s3_path = "unittest_compressed_files/gz_stored_as_jsonl.jsonl"

        mocked_logger.assert_called_with(
            'Skipping %s file as parsing failed. Verify an extension of the file.',
            new_s3_path)
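Example 3's signature takes four mock arguments, which implies a stack of mock.patch decorators on the test method; decorators apply bottom-up, so the innermost patch binds to the first parameter after self. The patch targets and class name below are assumptions inferred from the argument names, not the real test module.

import unittest
from unittest import mock

class TestSampling(unittest.TestCase):  # hypothetical enclosing class
    # Decorators are applied bottom-up: the innermost patch supplies the
    # first mock argument after `self`. All patch targets are assumed.
    @mock.patch('tap_s3_csv.s3.LOGGER.warning')       # -> mocked_logger
    @mock.patch('tap_s3_csv.s3.get_files_to_sample')  # -> mock_get_files_to_sample_csv
    @mock.patch('tap_s3_csv.s3.get_file_handle')      # -> mock_get_file_handle
    @mock.patch('tap_s3_csv.s3.sample_file')          # -> mock_jsonl_sample_file
    def test_sampling_of_gz_file_stored_with_jsonl_Extension(
            self, mock_jsonl_sample_file, mock_get_file_handle,
            mock_get_files_to_sample_csv, mocked_logger):
        ...  # body as shown above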