def test_header_check_files(self):
    """
    Verify that `CSVData._guess_header_row` returns an expected header row.

    The check runs over every entry of `self.input_file_names` (opened from
    disk) and every entry of `self.buffer_list` (in-memory streams). Each
    entry is read through the keys `path`, `encoding`, `delimiter` and
    `has_header`; `has_header` holds the acceptable header row values.
    """
    from itertools import islice

    for input_file in self.input_file_names:
        with open(input_file['path'],
                  encoding=input_file['encoding']) as csvfile:
            # only the first five lines are handed to the header guesser
            data_as_str = ''.join(islice(csvfile, 5))
        header_line = CSVData._guess_header_row(
            data_as_str, input_file['delimiter'])
        self.assertIn(header_line, input_file['has_header'],
                      input_file['path'])

    for input_buf in self.buffer_list:
        # BytesIO is wrapped so that it is fed into guess header row
        # the same way it would internally
        buffer = input_buf['path']
        if isinstance(buffer, BytesIO):
            buffer = TextIOWrapper(buffer, encoding=input_buf['encoding'])

        try:
            data_as_str = ''.join(islice(buffer, 5))
            header_line = CSVData._guess_header_row(
                data_as_str, input_buf['delimiter'])
            self.assertIn(header_line, input_buf['has_header'],
                          input_buf['path'])
        finally:
            # Detach even when the assertion fails: a wrapper that is
            # garbage collected while still attached closes the underlying
            # stream, which would break any later use of the same buffer.
            if isinstance(buffer, TextIOWrapper):
                buffer.detach()
def test_options(self):
    """
    Check which option values `CSVData` accepts and which it rejects.

    For each option, every accepted value must construct a `CSVData`
    without error and every rejected value must raise a `ValueError` whose
    message matches the given pattern.
    """

    def check_option(name, accepted, rejected, message):
        # accepted values: construction alone is the assertion
        for candidate in accepted:
            CSVData(options={name: candidate})

        # rejected values: construction must fail with the given message
        for candidate in rejected:
            with self.assertRaisesRegex(ValueError, message):
                CSVData(options={name: candidate})

    check_option(
        "header",
        accepted=["auto", None, 0, 1],
        rejected=["error", CSVData(), -1],
        message='`header` must be one of following: auto, ',
    )
    check_option(
        "delimiter",
        accepted=[',', '\t', '', None],
        rejected=[CSVData(), 1],
        message="'delimiter' must be a string or None",
    )
    check_option(
        "data_format",
        accepted=['dataframe', 'records'],
        rejected=["error", CSVData(), 1, None],
        message="'data_format' must be one of the following: ",
    )
    check_option(
        "selected_columns",
        accepted=[['hello', 'world'], ["test"], []],
        rejected=["error", CSVData(), 1, None],
        message="'selected_columns' must be a list",
    )
    # a list is supplied, but its elements are not strings
    check_option(
        "selected_columns",
        accepted=[],
        rejected=[[0, 1, 2, 3]],
        message="'selected_columns' must be a list of strings",
    )
def _test_options(option, valid, invalid, expected_error):
    """
    Exercise one `CSVData` option with accepted and rejected values.

    `self` is not a parameter here; it is looked up from the surrounding
    scope, so this helper is expected to live inside a test method.

    :param option: name of the option key passed to `CSVData`
    :param valid: values that must construct a `CSVData` without raising
    :param invalid: values that must raise `ValueError`
    :param expected_error: regex the `ValueError` message must match
    """
    # Test Valid
    for good_value in valid:
        CSVData(options={option: good_value})

    # Test Invalid
    for bad_value in invalid:
        with self.assertRaisesRegex(ValueError, expected_error):
            CSVData(options={option: bad_value})
def test_is_structured(self):
    """
    Check `is_structured` for the default reader and each `data_format`.

    The default reader and the `dataframe` format must report structured
    data; the `records` format must not.
    """
    # Default construction
    default_reader = CSVData()
    self.assertTrue(default_reader.is_structured)

    # Explicit data_format values paired with the assertion they must pass
    format_checks = (
        ("dataframe", self.assertTrue),
        ("records", self.assertFalse),
    )
    for data_format, check in format_checks:
        reader = CSVData(options={"data_format": data_format})
        check(reader.is_structured)
def test_is_match(self):
    """
    Check that `CSVData.is_match` accepts every entry of
    `self.file_or_buf_list`, passing each entry's `path` value.
    """
    for source in self.file_or_buf_list:
        matched = CSVData.is_match(source['path'])
        self.assertTrue(matched)
def test_header_check_files(self):
    """
    Check header-row detection on the shared input files plus three extra
    files whose first lines are not the header.

    For each file the encoding is detected, the first five lines are read,
    and the row guessed by `CSVData._guess_header_row` must be one of the
    values listed under the entry's `has_header` key.
    """
    from itertools import islice
    from dataprofiler.data_readers import data_utils

    data_dir = os.path.join(test_root_path, 'data')

    # files with leading author / description / empty lines before the header
    extra_files = [
        {
            'path': os.path.join(
                data_dir,
                'csv/sparse-first-and-last-column-header-and-author.txt'),
            'count': 6,
            'delimiter': ',',
            'has_header': [1],
            'num_columns': 3,
            'encoding': 'utf-8',
        },
        {
            'path': os.path.join(
                data_dir,
                'csv/sparse-first-and-last-column-header-and-author-description.txt'),
            'count': 6,
            'delimiter': ',',
            'has_header': [3],
            'num_columns': 3,
            'encoding': 'utf-8',
        },
        {
            'path': os.path.join(
                data_dir,
                'csv/sparse-first-and-last-column-empty-first-row.txt'),
            'count': 11,
            'delimiter': ',',
            'has_header': [1],
            'num_columns': 3,
            'encoding': 'utf-8',
        },
    ]

    # new list, so the shared fixture list is left untouched
    all_files = self.input_file_names + extra_files

    for file_info in all_files:
        path = file_info['path']
        detected_encoding = data_utils.detect_file_encoding(path)
        with open(path, encoding=detected_encoding) as csvfile:
            head = ''.join(islice(csvfile, 5))
        guessed_row = CSVData._guess_header_row(head, file_info['delimiter'])
        self.assertIn(guessed_row, file_info['has_header'], path)
def test_options(self):
    """
    Check option validation in `CSVData` and the `header` auto-detect flag.

    First, each option is tried with accepted values (must construct) and
    rejected values (must raise `ValueError` matching a message pattern).
    Then the `header='auto'` case is checked: without a file the header
    stays `'auto'` and `_checked_header` is false; with a file, after
    `.data` is accessed, the header equals the file's expected value and
    `_checked_header` is true.
    """

    def check_option(name, accepted, rejected, message):
        # accepted values: construction alone is the assertion
        for candidate in accepted:
            CSVData(options={name: candidate})

        # rejected values: construction must fail with the given message
        for candidate in rejected:
            with self.assertRaisesRegex(ValueError, message):
                CSVData(options={name: candidate})

    check_option(
        "header",
        accepted=["auto", None, 0, 1],
        rejected=["error", CSVData(), -1],
        message='`header` must be one of following: auto, ',
    )
    check_option(
        "delimiter",
        accepted=[',', '\t', '', None],
        rejected=[CSVData(), 1],
        message="'delimiter' must be a string or None",
    )
    check_option(
        "data_format",
        accepted=['dataframe', 'records'],
        rejected=["error", CSVData(), 1, None],
        message="'data_format' must be one of the following: ",
    )
    check_option(
        "selected_columns",
        accepted=[['hello', 'world'], ["test"], []],
        rejected=["error", CSVData(), 1, None],
        message="'selected_columns' must be a list",
    )
    # a list is supplied, but its elements are not strings
    check_option(
        "selected_columns",
        accepted=[],
        rejected=[[0, 1, 2, 3]],
        message="'selected_columns' must be a list of strings",
    )
    # NOTE(review): `rejected` holds ONE value here (the inner list), so
    # -1, int, '', None and dict() are never tried individually. Confirm
    # whether the double nesting is intended.
    check_option(
        "record_samples_per_line",
        accepted=[1, 10],
        rejected=[[-1, int, '', None, dict()]],
        message="'record_samples_per_line' must be an int more than 0",
    )

    # test edge case for header being set
    first_file = self.input_file_names[0]
    first_path = first_file['path']
    expected_header = first_file['has_header'][0]
    auto_options = {'header': 'auto', 'delimiter': ','}

    # no file given: header keeps its configured value
    unloaded = CSVData(options=auto_options)
    self.assertEqual('auto', unloaded.header)
    self.assertFalse(unloaded._checked_header)

    # file given: `.data` is accessed before the header is inspected
    # (presumably this access triggers the read -- confirm)
    loaded = CSVData(first_path, options=auto_options)
    _ = loaded.data
    self.assertEqual(expected_header, loaded.header)
    self.assertTrue(loaded._checked_header)
def test_set_header(self):
    """
    Check how the `header` option affects loading of a two-header CSV file.

    Invalid settings (`-2`, `'abcdef'`) must raise `ValueError` with the
    full validation message. For `'auto'`, `None`, `0` and `1` the test
    checks both the resulting `header` attribute and the first cell of the
    loaded data.
    """
    test_dir = os.path.join(test_root_path, 'data')
    filename = os.path.join(
        test_dir, 'csv/sparse-first-and-last-column-two-headers.txt')

    # Raw strings: the pattern is a regex and `\(` / `\)` must reach the
    # regex engine as escaped parentheses. In a non-raw literal they are
    # invalid escape sequences and trigger a warning at compile time.
    header_error = (
        r'`header` must be one of following: auto, '
        r'none for no header, or a non-negative '
        r'integer for the row that represents the '
        r'header \(0 based index\)'
    )

    # set bad header setting
    for bad_header in (-2, 'abcdef'):
        options = dict(header=bad_header)
        with self.assertRaisesRegex(ValueError, header_error):
            csv_data = CSVData(filename, options=options)
            first_value = csv_data.data.loc[0][0]

    # set header auto
    options = dict(header='auto')
    csv_data = CSVData(filename, options=options)
    first_value = csv_data.data.loc[0][0]
    self.assertEqual(1, csv_data.header)
    self.assertEqual('1', first_value)

    # set header None (no header)
    options = dict(header=None)
    csv_data = CSVData(filename, options=options)
    first_value = csv_data.data.loc[0][0]
    self.assertIsNone(csv_data.header)  # should be None
    self.assertEqual('COUNT', first_value)

    # set header 0
    options = dict(header=0)
    csv_data = CSVData(filename, options=options)
    first_value = csv_data.data.loc[0][0]
    self.assertEqual(0, csv_data.header)
    self.assertEqual('CONTAR', first_value)

    # set header 1
    options = dict(header=1)
    csv_data = CSVData(filename, options=options)
    first_value = csv_data.data.loc[0][0]
    self.assertEqual(1, csv_data.header)
    self.assertEqual('1', first_value)
def test_header_check_files(self):
    """
    Check header-row detection on every entry of `self.input_file_names`.

    For each file the encoding is detected, the first five lines are read,
    and the row guessed by `CSVData._guess_header_row` must be one of the
    values listed under the entry's `has_header` key.
    """
    from itertools import islice
    from dataprofiler.data_readers import data_utils

    for file_info in self.input_file_names:
        path = file_info['path']
        detected_encoding = data_utils.detect_file_encoding(path)
        with open(path, encoding=detected_encoding) as csvfile:
            head = ''.join(islice(csvfile, 5))
        guessed_row = CSVData._guess_header_row(head, file_info['delimiter'])
        self.assertIn(guessed_row, file_info['has_header'], path)