def test_header_check_files(self):
        """
        Check that the header row is guessed correctly for files and buffers.

        For every entry in ``self.input_file_names`` and ``self.buffer_list``
        the first five lines are read and passed, together with the entry's
        delimiter, to ``CSVData._guess_header_row``. The returned row must be
        one of the values listed under the entry's ``has_header`` key.
        """
        from itertools import islice

        # file paths: open with the encoding recorded in the fixture
        for input_file in self.input_file_names:
            with open(input_file['path'],
                      encoding=input_file['encoding']) as csvfile:
                data_as_str = ''.join(islice(csvfile, 5))
            header_line = CSVData._guess_header_row(data_as_str,
                                                    input_file['delimiter'])
            self.assertIn(header_line, input_file['has_header'],
                          input_file['path'])

        for input_buf in self.buffer_list:
            # BytesIO is wrapped so that it is fed into guess header row
            # the same way it would internally
            buffer = input_buf['path']
            if isinstance(buffer, BytesIO):
                buffer = TextIOWrapper(buffer,
                                       encoding=input_buf['encoding'])

            try:
                data_as_str = ''.join(islice(buffer, 5))
                header_line = CSVData._guess_header_row(
                    data_as_str, input_buf['delimiter'])
                self.assertIn(header_line, input_buf['has_header'],
                              input_buf['path'])
            finally:
                # Detach even when the assertion fails: a TextIOWrapper that
                # is garbage-collected while still attached closes the
                # underlying buffer, which would break later uses of the
                # same fixture buffer.
                if isinstance(buffer, TextIOWrapper):
                    buffer.detach()
    def test_options(self):
        """
        Validate the values accepted and rejected for each CSVData option.

        Every value in a ``valid`` list must construct a CSVData without
        raising; every value in an ``invalid`` list must raise a ValueError
        whose message matches ``expected_error``.
        """

        def _test_options(option, valid, invalid, expected_error):
            """
            Construct CSVData with each candidate value of ``option``.

            :param option: name of the option under test
            :param valid: values that must be accepted
            :param invalid: values that must raise ValueError
            :param expected_error: regex the error message must match
            """
            # Test Valid
            for value in valid:
                CSVData(options={option: value})

            # Test Invalid
            for value in invalid:
                with self.assertRaisesRegex(ValueError, expected_error):
                    CSVData(options={option: value})

        _test_options(
            "header",
            valid=["auto", None, 0, 1],
            invalid=["error", CSVData(), -1],
            expected_error='`header` must be one of following: auto, ')

        _test_options(
            "delimiter",
            valid=[',', '\t', '', None],
            invalid=[CSVData(), 1],
            expected_error="'delimiter' must be a string or None")

        _test_options(
            "data_format",
            valid=['dataframe', 'records'],
            invalid=["error", CSVData(), 1, None],
            expected_error="'data_format' must be one of the following: ")

        _test_options(
            "selected_columns",
            valid=[['hello', 'world'], ["test"], []],
            invalid=["error", CSVData(), 1, None],
            expected_error="'selected_columns' must be a list")

        # a list is accepted only when all of its items are strings
        _test_options(
            "selected_columns",
            valid=[],
            invalid=[[0, 1, 2, 3]],
            expected_error="'selected_columns' must be a list of strings")
    def test_is_structured(self):
        """
        Check ``is_structured`` for the default and each explicit data_format.
        """
        # (constructor keyword arguments, expected truthiness of the flag)
        scenarios = (
            # default construction
            ({}, True),
            # option specifying dataframe as data_format
            ({"options": {"data_format": "dataframe"}}, True),
            # option specifying records as data_format
            ({"options": {"data_format": "records"}}, False),
        )
        for ctor_kwargs, should_be_structured in scenarios:
            reader = CSVData(**ctor_kwargs)
            if should_be_structured:
                self.assertTrue(reader.is_structured)
            else:
                self.assertFalse(reader.is_structured)
 def test_is_match(self):
     """
     Check that ``CSVData.is_match`` returns a truthy value for the 'path'
     of every entry in ``self.file_or_buf_list``.
     """
     for candidate in self.file_or_buf_list:
         source = candidate['path']
         matched = CSVData.is_match(source)
         self.assertTrue(matched)
    def test_header_check_files(self):
        """
        Check header-row detection, including files with leading non-header
        lines.

        The first five lines of each file are read with the encoding reported
        by ``data_utils.detect_file_encoding`` and handed to
        ``CSVData._guess_header_row``; the result must appear in the entry's
        ``has_header`` list.
        """
        from itertools import islice
        from dataprofiler.data_readers import data_utils

        # extra files whose first lines are not the header:
        # (path relative to the data dir, 'count' value, expected header rows)
        data_dir = os.path.join(test_root_path, 'data')
        extra_specs = (
            ('csv/sparse-first-and-last-column-header-and-author.txt',
             6, [1]),
            ('csv/sparse-first-and-last-column-header-and-author-description.txt',
             6, [3]),
            ('csv/sparse-first-and-last-column-empty-first-row.txt',
             11, [1]),
        )
        extra_files = [
            {
                'path': os.path.join(data_dir, relative_path),
                'count': count,
                'delimiter': ',',
                'has_header': header_rows,
                'num_columns': 3,
                'encoding': 'utf-8',
            }
            for relative_path, count, header_rows in extra_specs
        ]

        # work on a copy so the shared fixture list is left untouched
        candidates = self.input_file_names[:]
        candidates.extend(extra_files)

        for entry in candidates:
            path = entry['path']
            detected_encoding = data_utils.detect_file_encoding(path)
            with open(path, encoding=detected_encoding) as handle:
                head = ''.join(islice(handle, 5))
            guessed_row = CSVData._guess_header_row(head, entry['delimiter'])
            self.assertIn(guessed_row, entry['has_header'], entry['path'])
    def test_options(self):
        """
        Validate CSVData option handling and the header bookkeeping flags.

        First, each option is constructed with values that must be accepted
        and values that must raise a ValueError matching the given pattern.
        Then the ``header`` / ``_checked_header`` attributes are checked
        before and after data is loaded from a real file.
        """

        def _assert_option_values(option, valid, invalid, expected_error):
            """
            Construct CSVData with each value of ``option``.

            :param option: name of the option under test
            :param valid: values that must be accepted
            :param invalid: values that must raise ValueError
            :param expected_error: regex the error message must match
            """
            for accepted in valid:
                CSVData(options={option: accepted})

            for rejected in invalid:
                with self.assertRaisesRegex(ValueError, expected_error):
                    CSVData(options={option: rejected})

        _assert_option_values(
            "header",
            valid=["auto", None, 0, 1],
            invalid=["error", CSVData(), -1],
            expected_error='`header` must be one of following: auto, ',
        )

        _assert_option_values(
            "delimiter",
            valid=[',', '\t', '', None],
            invalid=[CSVData(), 1],
            expected_error="'delimiter' must be a string or None",
        )

        _assert_option_values(
            "data_format",
            valid=['dataframe', 'records'],
            invalid=["error", CSVData(), 1, None],
            expected_error="'data_format' must be one of the following: ",
        )

        _assert_option_values(
            "selected_columns",
            valid=[['hello', 'world'], ["test"], []],
            invalid=["error", CSVData(), 1, None],
            expected_error="'selected_columns' must be a list",
        )

        _assert_option_values(
            "selected_columns",
            valid=[],
            invalid=[[0, 1, 2, 3]],
            expected_error="'selected_columns' must be a list of strings",
        )

        # NOTE(review): `invalid` holds ONE value here (a list), not five
        # separate values -- possibly unintended; confirm against CSVData's
        # validation before flattening.
        _assert_option_values(
            "record_samples_per_line",
            valid=[1, 10],
            invalid=[[-1, int, '', None, dict()]],
            expected_error="'record_samples_per_line' must be an int more than 0",
        )

        # edge case: header bookkeeping before and after a file is read
        first_file = self.input_file_names[0]
        csv_path = first_file['path']
        header_from_fixture = first_file['has_header'][0]
        default_options = dict(header='auto', delimiter=',')  # default values

        reader = CSVData(options=default_options)
        self.assertEqual('auto', reader.header)
        self.assertFalse(reader._checked_header)

        reader = CSVData(csv_path, options=default_options)
        # the value is unused; presumably the access is what loads the file
        # and resolves the header -- confirm against CSVData
        _ = reader.data
        self.assertEqual(header_from_fixture, reader.header)
        self.assertTrue(reader._checked_header)
    def test_set_header(self):
        """
        Check how the ``header`` option is validated and applied.

        Invalid settings (a negative integer, an unknown string) must raise a
        ValueError. For each valid setting the resulting ``header`` attribute
        and the first cell of the loaded data are compared with the values
        expected for the two-header sample file.
        """
        test_dir = os.path.join(test_root_path, 'data')
        filename = os.path.join(
            test_dir, 'csv/sparse-first-and-last-column-two-headers.txt')

        # The final piece is a raw string: '\(' and '\)' are regex escapes,
        # and in a normal string literal they are invalid escape sequences.
        header_error = (
            '`header` must be one of following: auto, '
            'none for no header, or a non-negative '
            'integer for the row that represents the '
            r'header \(0 based index\)')

        # set bad header setting
        for bad_header in (-2, 'abcdef'):
            with self.assertRaisesRegex(ValueError, header_error):
                csv_data = CSVData(filename,
                                   options=dict(header=bad_header))
                # data is also accessed inside the context, so an error
                # raised at either point is caught
                _ = csv_data.data.loc[0][0]

        # (header option, expected `header` attribute, expected first cell)
        header_cases = (
            ('auto', 1, '1'),
            (None, None, 'COUNT'),  # no header
            (0, 0, 'CONTAR'),
            (1, 1, '1'),
        )
        for header_option, expected_header, expected_first in header_cases:
            csv_data = CSVData(filename, options=dict(header=header_option))
            first_value = csv_data.data.loc[0][0]
            if expected_header is None:
                self.assertIsNone(csv_data.header)  # should be None
            else:
                self.assertEqual(expected_header, csv_data.header)
            self.assertEqual(expected_first, first_value)
# Example #9
# 0
    def test_header_check_files(self):
        """
        Check that the guessed header row matches the fixture expectations.

        Each file in ``self.input_file_names`` is opened with the encoding
        reported by ``data_utils.detect_file_encoding``; its first five lines
        are passed to ``CSVData._guess_header_row`` and the result must be
        listed under the entry's ``has_header`` key.
        """
        from itertools import islice
        from dataprofiler.data_readers import data_utils

        for entry in self.input_file_names:
            path = entry['path']
            detected_encoding = data_utils.detect_file_encoding(path)
            with open(path, encoding=detected_encoding) as handle:
                # only the head of the file is needed to guess the header
                head = ''.join(islice(handle, 5))
            guessed_row = CSVData._guess_header_row(head, entry['delimiter'])
            self.assertIn(guessed_row, entry['has_header'], entry['path'])