Beispiel #1
0
    def test_get_record_for_jsonl_called_in_sample_file_for_jsonl_file(
            self, mock_get_records_for_jsonl, mock_key_properties):
        # Sampling with the "jsonl" extension must invoke the (mocked) JSONL
        # record reader exactly once.
        # NOTE(review): both mock arguments are presumably injected by patch
        # decorators that are outside this view -- confirm in the full file.

        # Five newline-terminated JSON documents supplied as raw byte lines.
        jsonl_lines = [
            b'{"name":"test","id":"1"}\n',
            b'{"name":"test1","id":"2"}\n',
            b'{"name":"test2","id":"3"}\n',
            b'{"name":"test3","id":"4"}\n',
            b'{"name":"test4","id":"5","marks":"[\'221\',\'222\']"}\n',
        ]

        s3.sample_file(
            {'table_name': 'test_table'},  # table_spec
            "test\\abc.jsonl",             # s3_path
            jsonl_lines,                   # file_handle
            5,                             # sample_rate
            "jsonl",                       # extension
        )

        self.assertEqual(mock_get_records_for_jsonl.call_count, 1)
Beispiel #2
0
    def test_sampling_of_gz_file_contains_zip_file_samples(
            self, mocked_logger):
        # A gz fixture that wraps a zip archive must produce no samples and
        # must log the nested-compression skip message for the inner path.
        # NOTE(review): mocked_logger is presumably injected by a patch
        # decorator that is outside this view -- confirm in the full file.
        table_spec = {}
        s3_path = "unittest_compressed_files/sample_compressed_gz_file_contains_zip.gz"
        sample_rate = 5
        # Derived from the path: the text after the last "." lower-cased.
        extension = s3_path.split(".")[-1].lower()

        gz_file_path = get_resources_path(
            "sample_compressed_gz_file_contains_zip.gz",
            COMPRESSION_FOLDER_PATH)

        with gzip.GzipFile(gz_file_path) as gz_file:
            # The underlying file object is handed over, not the GzipFile
            # wrapper itself.
            actual_output = list(
                s3.sample_file(table_spec, s3_path, gz_file.fileobj,
                               sample_rate, extension))

            # assertEqual reports the unexpected samples on failure, unlike
            # assertTrue(len(...) == 0).
            self.assertEqual(actual_output, [])

            new_s3_path = "unittest_compressed_files/sample_compressed_gz_file_contains_zip.gz/csv_jsonl.zip"

            mocked_logger.assert_called_with(
                'Skipping "%s" file as it contains nested compression.',
                new_s3_path)
Beispiel #3
0
    def test_gz_samples_for_jsonl(self):
        # Samples a gz-compressed JSONL fixture at sample rate 5 and expects
        # exactly one record back.
        expected_output = [{
            "id": 1,
            "name": "abc",
            "semester": 1,
            "created_at": "2021-05-21"
        }]
        s3_path = "unittest_compressed_files/sample_compressed_gz_file_with_json_file_2_records.gz"
        fixture_path = get_resources_path(
            "sample_compressed_gz_file_with_json_file_2_records.gz",
            JSONL_FOLDER_PATH)

        with gzip.GzipFile(fixture_path) as archive:
            # The underlying file object is handed over, not the GzipFile
            # wrapper itself.
            actual_output = list(
                s3.sample_file({}, s3_path, archive.fileobj, 5, "gz"))

            self.assertEqual(expected_output, actual_output)
Beispiel #4
0
    def test_gz_samples_for_csv(self):
        # Samples a gz-compressed CSV fixture at sample rate 5 and expects
        # exactly one row back, with every value kept as a string.
        expected_output = [{
            'id': '1',
            'location': 'Eldon Base for stackable storage shelf, platinum',
            'name': 'Muhammed MacIntyre',
            'count': '3',
            'decimal1': '-213.25',
            'decimal2': '38.94',
            'decimal3': '35',
            'category': 'Nunavut',
            'point': 'Storage & Organization'
        }]
        s3_path = "unittest_compressed_files/sample_compressed_gz_file_with_csv_file_2_records.gz"
        fixture_path = get_resources_path(
            "sample_compressed_gz_file_with_csv_file_2_records.gz",
            CSV_FOLDER_PATH)

        with gzip.GzipFile(fixture_path) as archive:
            # The underlying file object is handed over, not the GzipFile
            # wrapper itself.
            actual_output = list(
                s3.sample_file({}, s3_path, archive.fileobj, 5, "gz"))

            self.assertEqual(expected_output, actual_output)
Beispiel #5
0
    def test_get_records_for_jsonl_in_sample_file_for_10_records_of_file_with_sample_rate_3(
            self, mock_check_key_properties_and_date_overrides_for_jsonl_file):
        # With 10 JSONL lines and sample_rate 3 the expected samples are the
        # 1st, 4th, 7th and 10th records.
        # NOTE(review): the mock argument is presumably injected by a patch
        # decorator that is outside this view -- confirm in the full file.

        s3_path = "test/abc.jsonl"
        file_handle = [
            b'{"name":"test","id":"1"}\n', b'{"name":"test1","id":"2"}\n',
            b'{"name":"test2","id":"3"}\n', b'{"name":"test3","id":"4"}\n',
            b'{"name":"test4","id":"5","marks":"[\'221\',\'222\']"}\n',
            b'{"name":"test5","id":"6"}\n', b'{"name":"test6","id":"7"}\n',
            b'{"name":"test7","id":"8"}\n', b'{"name":"test8","id":"9"}\n',
            b'{"name":"test9","id":"10","marks":"[\'111\',\'112\']"}\n'
        ]
        sample_rate = 3
        # No table spec is supplied for this scenario.
        table_spec = None
        expected_result = [{
            "name": "test",
            "id": "1"
        }, {
            "name": "test3",
            "id": "4"
        }, {
            "name": "test6",
            "id": "7"
        }, {
            "name": "test9",
            "id": "10",
            "marks": "['111','112']"
        }]
        result = s3.sample_file(table_spec, s3_path, file_handle, sample_rate,
                                "jsonl")
        # Materialise the result so it can be compared as a list.
        self.assertListEqual(list(result), expected_result)
Beispiel #6
0
    def test_sampling_of_error_gz_file_samples(self, mocked_gz_file_name,
                                               mocked_logger):
        # Sampling must raise an exception whose message names the offending
        # file when mocked_gz_file_name returns None.
        # NOTE(review): both mock arguments are presumably injected by patch
        # decorators outside this view; mocked_gz_file_name looks like the
        # helper that reads the original file name from a gz archive --
        # confirm in the full file.
        table_spec = {}
        s3_path = "unittest_compressed_files/sample_compressed_gz_file.gz"
        sample_rate = 5
        # Derived from the path: the text after the last "." lower-cased.
        extension = s3_path.split(".")[-1].lower()

        gz_file_path = get_resources_path("sample_compressed_gz_file.gz",
                                          COMPRESSION_FOLDER_PATH)
        expected_message = '"{}" file has some error(s)'.format(s3_path)

        with gzip.GzipFile(gz_file_path) as gz_file:

            mocked_gz_file_name.return_value = None

            # assertRaises fails the test when nothing is raised. The previous
            # try/except form only asserted inside the except branch and so
            # passed silently if the call returned normally.
            with self.assertRaises(Exception) as raised:
                s3.sample_file(table_spec, s3_path, gz_file.fileobj,
                               sample_rate, extension)

        self.assertEqual(expected_message, str(raised.exception))
    def test_get_record_for_csv_called_in_sample_file_for_txt_file(self):
        # A "txt" file holding one header line and one data line is expected
        # to yield exactly one sampled record.
        text_lines = [
            b'a,b,c',
            b'a0,b0,c0',
        ]

        sampled = s3.sample_file(
            {'table_name': 'test_table'},  # table_spec
            "test\\abc.txt",               # s3_path
            text_lines,                    # file_handle
            5,                             # sample_rate
            "txt",                         # extension
        )

        # Materialise the result before counting it.
        sampled_records = list(sampled)
        self.assertEqual(len(sampled_records), 1)
Beispiel #8
0
    def test_csv_records_as_samples(self):
        # One header line with three columns followed by one data line that
        # carries a fourth value; only the three named columns are expected
        # in the sampled record.
        csv_lines = [b"columnA,columnB,columnC", b"1,2,3,4"]
        expected_output = [{"columnA": "1", "columnB": "2", "columnC": "3"}]

        actual_output = list(
            s3.sample_file({}, "unittest_compressed_files/sample.csv",
                           csv_lines, 5, "csv"))

        self.assertEqual(expected_output, actual_output)
Beispiel #9
0
    def test_sampling_of_file_without_extension_samples(self, mocked_logger):
        # A path without an extension must produce no samples and must log
        # the "without extension" message.
        # NOTE(review): mocked_logger is presumably injected by a patch
        # decorator that is outside this view -- confirm in the full file.
        table_spec = {}
        file_handle = None
        s3_path = "unittest_compressed_files/sample_compressed"
        sample_rate = 5
        # The path contains no ".", so split() returns the whole path and the
        # derived "extension" is simply the lower-cased path.
        extension = s3_path.split(".")[-1].lower()

        actual_output = list(
            s3.sample_file(table_spec, s3_path, file_handle, sample_rate,
                           extension))

        # assertEqual reports the unexpected samples on failure, unlike
        # assertTrue(len(...) == 0).
        self.assertEqual(actual_output, [])

        mocked_logger.assert_called_with(
            '"%s" without extension will not be sampled.', s3_path)
Beispiel #10
0
    def test_sampling_of_unsupported_file_samples(self, mocked_logger):
        # An ".exe" file must produce no samples and must log the
        # unsupported-extension message.
        # NOTE(review): mocked_logger is presumably injected by a patch
        # decorator that is outside this view -- confirm in the full file.
        table_spec = {}
        file_handle = None
        s3_path = "unittest_compressed_files/sample_compressed.exe"
        sample_rate = 5
        extension = "exe"

        actual_output = list(
            s3.sample_file(table_spec, s3_path, file_handle, sample_rate,
                           extension))

        # assertEqual reports the unexpected samples on failure, unlike
        # assertTrue(len(...) == 0).
        self.assertEqual(actual_output, [])

        mocked_logger.assert_called_with(
            '"%s" having the ".%s" extension will not be sampled.', s3_path,
            extension)
Beispiel #11
0
    def test_sampling_of_tar_gz_file_samples(self, mocked_logger):
        # A ".tar.gz" path passed with extension "gz" must produce no samples
        # and must log the tar.gz-not-supported message.
        # NOTE(review): mocked_logger is presumably injected by a patch
        # decorator that is outside this view -- confirm in the full file.
        table_spec = {}
        file_handle = None
        s3_path = "unittest_compressed_files/sample_compressed.tar.gz"
        sample_rate = 5
        extension = "gz"

        actual_output = list(
            s3.sample_file(table_spec, s3_path, file_handle, sample_rate,
                           extension))

        # assertEqual reports the unexpected samples on failure, unlike
        # assertTrue(len(...) == 0).
        self.assertEqual(actual_output, [])

        mocked_logger.assert_called_with(
            'Skipping "%s" file as .tar.gz extension is not supported',
            s3_path)
Beispiel #12
0
    def test_sampling_of_file_gzip_using_no_name(self, mocked_logger):
        # The "sample_compressed_no_name.gz" fixture must produce no samples
        # and must log that the original file name could not be obtained.
        # NOTE(review): mocked_logger is presumably injected by a patch
        # decorator that is outside this view -- confirm in the full file.
        table_spec = {}
        s3_path = "unittest_compressed_files/sample_compressed.gz"
        sample_rate = 5
        extension = "gz"

        gz_file_path = get_resources_path("sample_compressed_no_name.gz",
                                          COMPRESSION_FOLDER_PATH)

        with gzip.GzipFile(gz_file_path) as gz_file:
            # The underlying file object is handed over, not the GzipFile
            # wrapper itself.
            actual_output = list(
                s3.sample_file(table_spec, s3_path, gz_file.fileobj,
                               sample_rate, extension))

            # assertEqual reports the unexpected samples on failure, unlike
            # assertTrue(len(...) == 0).
            self.assertEqual(actual_output, [])

            mocked_logger.assert_called_with(
                'Skipping "%s" file as we did not get the original file name',
                s3_path)
Beispiel #13
0
    def test_sampling_of_empty_csv_converted_to_gz(self, mocked_logger):
        # The "empty_csv_gz.gz" fixture must produce no samples and must log
        # the empty-file skip message for the inner CSV path.
        # NOTE(review): mocked_logger is presumably injected by a patch
        # decorator that is outside this view -- confirm in the full file.
        table_spec = {}
        s3_path = "unittest_compressed_files/empty_csv_gz.gz"
        sample_rate = 5
        # Derived from the path: the text after the last "." lower-cased.
        extension = s3_path.split(".")[-1].lower()

        gz_file_path = get_resources_path("empty_csv_gz.gz",
                                          COMPRESSION_FOLDER_PATH)

        with gzip.GzipFile(gz_file_path) as gz_file:
            # The underlying file object is handed over, not the GzipFile
            # wrapper itself.
            actual_output = list(
                s3.sample_file(table_spec, s3_path, gz_file.fileobj,
                               sample_rate, extension))

            # assertEqual reports the unexpected samples on failure, unlike
            # assertTrue(len(...) == 0).
            self.assertEqual(actual_output, [])

            new_s3_path = "unittest_compressed_files/empty_csv_gz.gz/empty_csv.csv"

            mocked_logger.assert_called_with(
                'Skipping "%s" file as it is empty', new_s3_path)