def test_get_record_for_jsonl_called_in_sample_file_for_jsonl_file(
        self, mock_get_records_for_jsonl, mock_key_properties):
    """Sampling with the ``jsonl`` extension calls the JSONL reader mock once.

    Both mock arguments are presumably injected by patch decorators that sit
    outside this view; ``mock_key_properties`` is accepted but never
    inspected by this test.
    """
    # Five newline-terminated JSON documents; only the last one carries the
    # additional "marks" field.
    jsonl_lines = [
        b'{"name":"test","id":"1"}\n',
        b'{"name":"test1","id":"2"}\n',
        b'{"name":"test2","id":"3"}\n',
        b'{"name":"test3","id":"4"}\n',
        b'{"name":"test4","id":"5","marks":"[\'221\',\'222\']"}\n',
    ]

    # The return value is intentionally ignored: only the call count of the
    # mocked reader is checked.
    s3.sample_file(
        {'table_name': 'test_table'},
        "test\\abc.jsonl",
        jsonl_lines,
        5,
        "jsonl",
    )

    self.assertEqual(mock_get_records_for_jsonl.call_count, 1)
def test_sampling_of_gz_file_contains_zip_file_samples(self, mocked_logger):
    """A gz fixture that wraps a zip yields no samples and logs a skip message.

    ``mocked_logger`` is the logging mock supplied to this test (presumably
    by a patch decorator outside this view); its most recent call must be
    the "nested compression" message for the inner zip path.
    """
    table_spec = {}
    s3_path = "unittest_compressed_files/sample_compressed_gz_file_contains_zip.gz"
    sample_rate = 5
    extension = s3_path.split(".")[-1].lower()
    gz_file_path = get_resources_path(
        "sample_compressed_gz_file_contains_zip.gz", COMPRESSION_FOLDER_PATH)

    with gzip.GzipFile(gz_file_path) as gz_file:
        # Hand over the underlying raw file object, not the decompressing
        # wrapper, and materialize the samples while the file is still open.
        actual_output = list(
            s3.sample_file(table_spec, s3_path, gz_file.fileobj,
                           sample_rate, extension))

    # assertEqual (rather than assertTrue(len(...) == 0)) shows the
    # unexpected samples in the failure message.
    self.assertEqual([], actual_output)

    new_s3_path = "unittest_compressed_files/sample_compressed_gz_file_contains_zip.gz/csv_jsonl.zip"
    mocked_logger.assert_called_with(
        'Skipping "%s" file as it contains nested compression.', new_s3_path)
def test_gz_samples_for_jsonl(self):
    """Sampling a gz fixture holding JSONL data returns the expected record.

    The fixture name indicates two records; with a sample rate of 5 the
    test expects exactly one sampled record back.
    """
    archive_path = get_resources_path(
        "sample_compressed_gz_file_with_json_file_2_records.gz",
        JSONL_FOLDER_PATH)
    s3_path = "unittest_compressed_files/sample_compressed_gz_file_with_json_file_2_records.gz"
    expected_output = [
        {"id": 1, "name": "abc", "semester": 1, "created_at": "2021-05-21"},
    ]

    with gzip.GzipFile(archive_path) as gz_file:
        # The raw underlying file object is passed, and the samples are
        # collected while it is still open.
        actual_output = list(
            s3.sample_file({}, s3_path, gz_file.fileobj, 5, "gz"))

    self.assertEqual(expected_output, actual_output)
def test_gz_samples_for_csv(self):
    """Sampling a gz fixture holding CSV data returns the expected row.

    The fixture name indicates two records; with a sample rate of 5 the
    test expects exactly one sampled row, with every value as a string.
    """
    archive_path = get_resources_path(
        "sample_compressed_gz_file_with_csv_file_2_records.gz",
        CSV_FOLDER_PATH)
    s3_path = "unittest_compressed_files/sample_compressed_gz_file_with_csv_file_2_records.gz"
    expected_row = {
        'id': '1',
        'location': 'Eldon Base for stackable storage shelf, platinum',
        'name': 'Muhammed MacIntyre',
        'count': '3',
        'decimal1': '-213.25',
        'decimal2': '38.94',
        'decimal3': '35',
        'category': 'Nunavut',
        'point': 'Storage & Organization',
    }
    expected_output = [expected_row]

    with gzip.GzipFile(archive_path) as gz_file:
        # The raw underlying file object is passed, and the samples are
        # collected while it is still open.
        actual_output = list(
            s3.sample_file({}, s3_path, gz_file.fileobj, 5, "gz"))

    self.assertEqual(expected_output, actual_output)
def test_get_records_for_jsonl_in_sample_file_for_10_records_of_file_with_sample_rate_3(
        self, mock_check_key_properties_and_date_overrides_for_jsonl_file):
    """Sampling ten JSONL records at rate 3 yields records 1, 4, 7 and 10.

    The mock argument is presumably injected by a patch decorator outside
    this view; it is accepted but not inspected here.
    """
    s3_path = "test/abc.jsonl"
    file_handle = [
        b'{"name":"test","id":"1"}\n',
        b'{"name":"test1","id":"2"}\n',
        b'{"name":"test2","id":"3"}\n',
        b'{"name":"test3","id":"4"}\n',
        b'{"name":"test4","id":"5","marks":"[\'221\',\'222\']"}\n',
        b'{"name":"test5","id":"6"}\n',
        b'{"name":"test6","id":"7"}\n',
        b'{"name":"test7","id":"8"}\n',
        b'{"name":"test8","id":"9"}\n',
        b'{"name":"test9","id":"10","marks":"[\'111\',\'112\']"}\n',
    ]
    sample_rate = 3
    # This test passes no table spec to the sampler.
    table_spec = None

    # Every third record starting from the first one.
    expected_result = [
        {"name": "test", "id": "1"},
        {"name": "test3", "id": "4"},
        {"name": "test6", "id": "7"},
        {"name": "test9", "id": "10", "marks": "['111','112']"},
    ]

    result = s3.sample_file(table_spec, s3_path, file_handle, sample_rate,
                            "jsonl")

    self.assertListEqual(list(result), expected_result)
def test_sampling_of_error_gz_file_samples(self, mocked_gz_file_name, mocked_logger):
    """Sampling a gz file whose inner name resolves to ``None`` must raise.

    ``mocked_gz_file_name`` is configured to return ``None``; the test then
    requires an exception whose message is
    ``'"<s3_path>" file has some error(s)'``. Both mock arguments are
    presumably injected by patch decorators outside this view.

    The assertion is made with ``assertRaises`` so the test fails when no
    exception is raised, instead of passing without checking anything.
    """
    table_spec = {}
    s3_path = "unittest_compressed_files/sample_compressed_gz_file.gz"
    sample_rate = 5
    extension = s3_path.split(".")[-1].lower()
    gz_file_path = get_resources_path("sample_compressed_gz_file.gz",
                                      COMPRESSION_FOLDER_PATH)
    expected_message = '"{}" file has some error(s)'.format(s3_path)

    with gzip.GzipFile(gz_file_path) as gz_file:
        mocked_gz_file_name.return_value = None
        # The concrete exception class is not visible from this test, so the
        # broad base class is kept and the message is checked below.
        with self.assertRaises(Exception) as raised:
            # Materialize the result so that a lazily evaluated sampler
            # would still surface its error inside this block.
            list(s3.sample_file(table_spec, s3_path, gz_file.fileobj,
                                sample_rate, extension))

    self.assertEqual(expected_message, str(raised.exception))
def test_get_record_for_csv_called_in_sample_file_for_txt_file(self):
    """Sampling a ``txt`` file with a header and one row gives one record."""
    # First entry is the header line, second is the only data row.
    header_and_row = [b'a,b,c', b'a0,b0,c0']

    sampled = s3.sample_file(
        {'table_name': 'test_table'},
        "test\\abc.txt",
        header_and_row,
        5,
        "txt",
    )

    self.assertEqual(len(list(sampled)), 1)
def test_csv_records_as_samples(self):
    """Sampling a CSV returns the data row keyed by the header columns.

    The data row holds four values against a three-column header; the
    expected sample contains only the three named columns.
    """
    csv_lines = [b"columnA,columnB,columnC", b"1,2,3,4"]
    expected_output = [{"columnA": "1", "columnB": "2", "columnC": "3"}]

    actual_output = list(
        s3.sample_file({}, "unittest_compressed_files/sample.csv", csv_lines,
                       5, "csv"))

    self.assertEqual(expected_output, actual_output)
def test_sampling_of_file_without_extension_samples(self, mocked_logger):
    """A path with no extension yields no samples and logs a message.

    ``mocked_logger`` is the logging mock supplied to this test (presumably
    by a patch decorator outside this view).
    """
    table_spec = {}
    file_handle = None
    s3_path = "unittest_compressed_files/sample_compressed"
    sample_rate = 5
    # The path contains no ".", so this evaluates to the whole lowercased
    # path rather than a real extension.
    extension = s3_path.split(".")[-1].lower()

    actual_output = list(
        s3.sample_file(table_spec, s3_path, file_handle, sample_rate,
                       extension))

    # assertEqual (rather than assertTrue(len(...) == 0)) shows the
    # unexpected samples in the failure message.
    self.assertEqual([], actual_output)
    mocked_logger.assert_called_with(
        '"%s" without extension will not be sampled.', s3_path)
def test_sampling_of_unsupported_file_samples(self, mocked_logger):
    """An ``exe`` extension yields no samples and logs a message.

    ``mocked_logger`` is the logging mock supplied to this test (presumably
    by a patch decorator outside this view).
    """
    table_spec = {}
    file_handle = None
    s3_path = "unittest_compressed_files/sample_compressed.exe"
    sample_rate = 5
    extension = "exe"

    actual_output = list(
        s3.sample_file(table_spec, s3_path, file_handle, sample_rate,
                       extension))

    # assertEqual (rather than assertTrue(len(...) == 0)) shows the
    # unexpected samples in the failure message.
    self.assertEqual([], actual_output)
    mocked_logger.assert_called_with(
        '"%s" having the ".%s" extension will not be sampled.', s3_path,
        extension)
def test_sampling_of_tar_gz_file_samples(self, mocked_logger):
    """A ``.tar.gz`` path yields no samples and logs a skip message.

    No file handle is supplied (``None``); the test relies on the path
    alone. ``mocked_logger`` is the logging mock supplied to this test
    (presumably by a patch decorator outside this view).
    """
    table_spec = {}
    file_handle = None
    s3_path = "unittest_compressed_files/sample_compressed.tar.gz"
    sample_rate = 5
    extension = "gz"

    actual_output = list(
        s3.sample_file(table_spec, s3_path, file_handle, sample_rate,
                       extension))

    # assertEqual (rather than assertTrue(len(...) == 0)) shows the
    # unexpected samples in the failure message.
    self.assertEqual([], actual_output)
    mocked_logger.assert_called_with(
        'Skipping "%s" file as .tar.gz extension is not supported', s3_path)
def test_sampling_of_file_gzip_using_no_name(self, mocked_logger):
    """The ``sample_compressed_no_name.gz`` fixture yields no samples.

    The test also requires the "did not get the original file name" skip
    message to be the logger mock's most recent call. ``mocked_logger`` is
    presumably injected by a patch decorator outside this view.
    """
    table_spec = {}
    s3_path = "unittest_compressed_files/sample_compressed.gz"
    sample_rate = 5
    extension = "gz"
    gz_file_path = get_resources_path("sample_compressed_no_name.gz",
                                      COMPRESSION_FOLDER_PATH)

    with gzip.GzipFile(gz_file_path) as gz_file:
        # Hand over the underlying raw file object, not the decompressing
        # wrapper, and materialize the samples while the file is still open.
        actual_output = list(
            s3.sample_file(table_spec, s3_path, gz_file.fileobj,
                           sample_rate, extension))

    # assertEqual (rather than assertTrue(len(...) == 0)) shows the
    # unexpected samples in the failure message.
    self.assertEqual([], actual_output)
    mocked_logger.assert_called_with(
        'Skipping "%s" file as we did not get the original file name',
        s3_path)
def test_sampling_of_empty_csv_converted_to_gz(self, mocked_logger):
    """The ``empty_csv_gz.gz`` fixture yields no samples and logs a skip.

    The expected log call names the inner file path
    (``<s3_path>/empty_csv.csv``). ``mocked_logger`` is presumably injected
    by a patch decorator outside this view.
    """
    table_spec = {}
    s3_path = "unittest_compressed_files/empty_csv_gz.gz"
    sample_rate = 5
    extension = s3_path.split(".")[-1].lower()
    gz_file_path = get_resources_path("empty_csv_gz.gz",
                                      COMPRESSION_FOLDER_PATH)

    with gzip.GzipFile(gz_file_path) as gz_file:
        # Hand over the underlying raw file object, not the decompressing
        # wrapper, and materialize the samples while the file is still open.
        actual_output = list(
            s3.sample_file(table_spec, s3_path, gz_file.fileobj,
                           sample_rate, extension))

    # assertEqual (rather than assertTrue(len(...) == 0)) shows the
    # unexpected samples in the failure message.
    self.assertEqual([], actual_output)

    new_s3_path = "unittest_compressed_files/empty_csv_gz.gz/empty_csv.csv"
    mocked_logger.assert_called_with(
        'Skipping "%s" file as it is empty', new_s3_path)