Code example #1
0
    def test_header_check_files(self):
        """
        Determine if files with no header are properly determined.

        Extends the shared ``self.input_file_names`` fixture list with extra
        files whose first lines are author/description text rather than the
        header, then asserts that ``CSVData._guess_header_row`` returns a row
        index contained in each file's ``has_header`` expectation list.
        """
        from itertools import islice
        from dataprofiler.data_readers import data_utils

        # add some more files to the list to test the header detection
        # these files have some first lines which are not the header
        test_dir = os.path.join(test_root_path, 'data')
        file_with_header_and_authors = [
            dict(path=os.path.join(test_dir, 'csv/sparse-first-and-last-column-header-and-author.txt'),
                 count=6, delimiter=',', has_header=[1],
                 num_columns=3, encoding='utf-8'),
            dict(path=os.path.join(test_dir, 'csv/sparse-first-and-last-column-header-and-author-description.txt'),
                 count=6, delimiter=',', has_header=[3],
                 num_columns=3, encoding='utf-8'),
            dict(path=os.path.join(test_dir, 'csv/sparse-first-and-last-column-empty-first-row.txt'),
                 count=11, delimiter=',', has_header=[1],
                 num_columns=3, encoding='utf-8'),
        ]

        # Copy the shared fixture list so extending it here does not leak
        # the extra entries into other tests that reuse the same attribute.
        input_file_names = self.input_file_names[:]
        input_file_names += file_with_header_and_authors
        for input_file in input_file_names:
            file_encoding = data_utils.detect_file_encoding(input_file['path'])
            with open(input_file['path'], encoding=file_encoding) as csvfile:
                # islice is already iterable; joining it directly avoids
                # materializing an intermediate list of the first 5 lines.
                data_as_str = ''.join(islice(csvfile, 5))
            header_line = CSVData._guess_header_row(data_as_str,
                                                    input_file['delimiter'])
            self.assertIn(header_line, input_file['has_header'],
                          input_file['path'])
Code example #2
0
    def test_file_UTF_encoding_detection(self):
        """
        Tests the ability for `data_utils.detect_file_encoding` to detect the
        encoding of text files. This test is specifically for UTF-8, UTF-16,
        and UTF-32 of csv or JSON.
        :return:
        """
        test_dir = os.path.join(test_root_path, "data")
        # (relative path, expected encoding) pairs covering csv and JSON
        # fixtures in each of the three UTF widths.
        expected_encodings = [
            ("csv/iris-utf-8.csv", "utf-8"),
            ("csv/iris-utf-16.csv", "utf-16"),
            ("csv/iris-utf-32.csv", "utf-32"),
            ("json/iris-utf-8.json", "utf-8"),
            ("json/iris-utf-16.json", "utf-16"),
            ("json/iris-utf-32.json", "utf-32"),
        ]

        for rel_path, expected_encoding in expected_encodings:
            file_path = os.path.join(test_dir, rel_path)
            detected_encoding = data_utils.detect_file_encoding(
                file_path=file_path)
            # Detection may report e.g. "UTF-8"; compare case-insensitively.
            self.assertEqual(detected_encoding.lower(), expected_encoding)
Code example #3
0
    def test_header_check_files(self):
        """
        Determine if files with no header are properly determined.

        For each configured fixture file, detect its encoding, read its first
        five lines, and assert that ``CSVData._guess_header_row`` returns a
        row index contained in the file's ``has_header`` expectation list.
        """
        from itertools import islice
        from dataprofiler.data_readers import data_utils

        # add some more files to the list to test the header detection
        # these files have some first lines which are not the header
        input_file_names = self.input_file_names
        for input_file in input_file_names:
            file_encoding = data_utils.detect_file_encoding(input_file['path'])
            with open(input_file['path'], encoding=file_encoding) as csvfile:
                # islice is already iterable; joining it directly avoids
                # materializing an intermediate list of the first 5 lines.
                data_as_str = ''.join(islice(csvfile, 5))
            header_line = CSVData._guess_header_row(data_as_str,
                                                    input_file['delimiter'])
            self.assertIn(header_line, input_file['has_header'],
                          input_file['path'])
Code example #4
0
    def test_file_UTF_encoding_detection(self):
        """
        Tests the ability for `data_utils.detect_file_encoding` to detect the
        encoding of text files. This test is specifically for UTF-8, UTF-16,
        and UTF-32 of csv or JSON.
        :return:
        """
        test_dir = os.path.join(test_root_path, "data")
        input_files = [
            dict(path=os.path.join(test_dir, "csv/iris-utf-8.csv"),
                 encoding="utf-8"),
            dict(path=os.path.join(test_dir, "csv/iris-utf-16.csv"),
                 encoding="utf-16"),
            dict(path=os.path.join(test_dir, "csv/iris-utf-32.csv"),
                 encoding="utf-32"),
            dict(path=os.path.join(test_dir, "json/iris-utf-8.json"),
                 encoding="utf-8"),
            dict(path=os.path.join(test_dir, "json/iris-utf-16.json"),
                 encoding="utf-16"),
            dict(path=os.path.join(test_dir, "json/iris-utf-32.json"),
                 encoding="utf-32"),
            dict(path=os.path.join(test_dir, "txt/utf8.txt"),
                 encoding="utf-8"),
            dict(path=os.path.join(test_dir, "csv/zomato.csv"),
                 encoding="ISO-8859-1"),
            dict(path=os.path.join(test_dir, "csv/reddit_wsb.csv"),
                 encoding="utf-8"),
        ]

        def get_match_acc(expected, actual):
            """Return the fraction of ``expected`` correctly decoded.

            ``zip`` stops at the shorter string, so a truncated or padded
            decode counts as trailing mismatches instead of raising
            IndexError as indexed comparison over ``range(len(expected))``
            would when the decodings differ in length.
            """
            if not expected:
                # Empty file: nothing to mis-decode; avoid ZeroDivisionError.
                return 1.0
            matches = sum(a == b for a, b in zip(expected, actual))
            return matches / len(expected)

        for input_file in input_files:
            detected_encoding = data_utils.detect_file_encoding(
                file_path=input_file["path"])
            with open(input_file["path"], "rb") as infile:
                # Read a max of 1 MB of data
                content = infile.read(1024 * 1024)
                # Assert at least 99.9% of the content was correctly decoded
                match_acc = get_match_acc(
                    content.decode(input_file["encoding"]),
                    content.decode(detected_encoding),
                )
                self.assertGreaterEqual(match_acc, 0.999)