def test_simple_case(self):
    """Run a basic RBQL query over an in-memory CSV table and verify the selected records."""
    input_table = [
        ['name', 'value'],
        ['abc', '1234'],
        ['abc', '1234'],
        ['efg', '100'],
        ['abc', '100'],
        ['cde', '12999'],
        ['aaa', '2000'],
        ['abc', '100'],
    ]
    expected_table = [
        ['abc', '12340'],
        ['abc', '12340'],
        ['abc', '1000'],
        ['abc', '1000'],
    ]
    delim = ','
    policy = 'quoted'
    csv_data = table_to_csv_string_random(input_table, delim, policy)
    input_stream, encoding = string_to_randomly_encoded_stream(csv_data)
    input_iterator = rbql_csv.CSVRecordIterator(input_stream, True, encoding, delim=delim, policy=policy)
    # Binary buffer when the stream is encoded, text buffer otherwise.
    output_stream = io.StringIO() if encoding is None else io.BytesIO()
    output_writer = rbql_csv.CSVWriter(output_stream, False, encoding, delim, policy)
    error_info, warnings = rbql.generic_run('select a.name, int(a.value) * 10 where NR > 1 and a.name == "abc"', input_iterator, output_writer)
    self.assertEqual(error_info, None)
    self.assertEqual(warnings, [])
    # Parse the produced output back and compare against the expectation.
    output_stream.seek(0)
    output_iterator = rbql_csv.CSVRecordIterator(output_stream, True, encoding, delim=delim, policy=policy)
    output_table = output_iterator.get_all_records()
    self.assertEqual(expected_table, output_table)
def test_utf_decoding_errors(self):
    """Latin-1 data must round-trip under its own encoding but raise a decode error under utf-8."""
    table = [
        ['hello', u'\x80\x81\xffThis unicode string encoded as latin-1 is not a valid utf-8\xaa\xbb\xcc'],
        ['hello', 'world'],
    ]
    delim = ','
    policy = 'simple'
    encoding = 'latin-1'
    csv_data = table_to_csv_string_random(table, delim, policy)
    # NOTE(review): this block passes (stream, encoding, delim, policy) positionally while other
    # tests in this file insert a boolean second argument — confirm the intended constructor signature.
    stream = io.BytesIO(csv_data.encode('latin-1'))
    record_iterator = rbql_csv.CSVRecordIterator(stream, encoding, delim, policy)
    parsed_table = record_iterator.get_all_records()
    stream.close()
    self.assertEqual(table, parsed_table)
    parsed_table = write_and_parse_back(table, encoding, delim, policy)
    self.assertEqual(table, parsed_table)
    # Reading the same latin-1 bytes as utf-8 must fail with a decoding error.
    stream = io.BytesIO(csv_data.encode('latin-1'))
    with self.assertRaises(Exception) as cm:
        record_iterator = rbql_csv.CSVRecordIterator(stream, 'utf-8', delim=delim, policy=policy)
        parsed_table = record_iterator.get_all_records()
    stream.close()
    e = cm.exception
    self.assertTrue('Unable to decode input table as UTF-8' in str(e))
def test_bom_warning(self):
    """A UTF-8 BOM at the start of the input should be stripped and reported as a warning."""
    table = [
        [u'\xef\xbb\xbfcde', '1234'],  # first field starts with the UTF-8 BOM byte sequence
        ['abc', '1234'],
        ['abc', '1234'],
        ['efg', '100'],
        ['abc', '100'],
        ['cde', '12999'],
        ['aaa', '2000'],
        ['abc', '100'],
    ]
    delim = ','
    policy = 'simple'
    encoding = 'latin-1'
    csv_data = table_to_csv_string_random(table, delim, policy)
    stream = io.BytesIO(csv_data.encode('latin-1'))
    record_iterator = rbql_csv.CSVRecordIterator(stream, encoding, delim, policy)
    parsed_table = record_iterator.get_all_records()
    stream.close()
    expected_warnings = ['UTF-8 Byte Order Mark (BOM) was found and skipped in input table']
    actual_warnings = record_iterator.get_warnings()
    self.assertEqual(expected_warnings, actual_warnings)
    # The parsed table should match the input except for the stripped BOM prefix.
    expected_table = copy.deepcopy(table)
    expected_table[0][0] = 'cde'
    self.assertEqual(expected_table, parsed_table)
def test_multiline_fields(self):
    """Parse quoted_rfc data where quoted fields span multiple physical lines."""
    data_lines = [
        'foo, bar,aaa',
        'test,"hello, bar", "aaa ',
        'test","hello, bar", "bbb ',
        'foo, bar,aaa',
        'foo, ""bar"",aaa',
        'foo, test","hello, bar", "bbb "',
        'foo, bar,aaa',
    ]
    csv_data = '\n'.join(data_lines)
    stream, encoding = string_to_randomly_encoded_stream(csv_data)
    # The middle logical record absorbs several physical lines through its quoted fields.
    table = [
        ['foo', ' bar', 'aaa'],
        ['test', 'hello, bar', 'aaa \ntest', 'hello, bar', 'bbb \nfoo, bar,aaa\nfoo, "bar",aaa\nfoo, test', 'hello, bar', 'bbb '],
        ['foo', ' bar', 'aaa'],
    ]
    delim = ','
    policy = 'quoted_rfc'
    record_iterator = rbql_csv.CSVRecordIterator(stream, encoding, delim=delim, policy=policy)
    parsed_table = record_iterator.get_all_records()
    stream.close()
    self.assertEqual(table, parsed_table)
    parsed_table = write_and_parse_back(table, encoding, delim, policy)
    self.assertEqual(table, parsed_table)
def test_iterator_rfc_comments(self):
    """Fuzz the quoted_rfc record iterator with random tables interleaved with comment lines."""
    for _test_num in xrange6(200):
        table = generate_random_decoded_binary_table(10, 10, None)
        comment_prefix = random.choice(['#', '>>'])
        if table_has_records_with_comment_prefix(table, comment_prefix):
            # Instead of complicating the generation procedure just skip the tables which were generated "incorrectly"
            continue
        delim = random.choice(['\t', ',', ';', '|'])
        policy = 'quoted_rfc'
        csv_data = table_to_csv_string_random(table, delim, policy, comment_prefix=comment_prefix)
        # XXX normalizing '\r' -> '\n' because record iterator doesn't preserve original separators
        normalize_newlines_in_fields(table)
        stream, encoding = string_to_randomly_encoded_stream(csv_data)
        record_iterator = rbql_csv.CSVRecordIterator(stream, encoding, delim=delim, policy=policy, comment_prefix=comment_prefix)
        parsed_table = record_iterator.get_all_records()
        stream.close()
        self.assertEqual(table, parsed_table)
        parsed_table = write_and_parse_back(table, encoding, delim, policy)
        self.assertEqual(table, parsed_table)
def write_and_parse_back(table, encoding, delim, policy):
    """Serialize `table` with CSVWriter (random line separator) and parse it back with CSVRecordIterator.

    Returns the re-parsed table so callers can assert it matches the original.
    """
    buffer_stream = io.StringIO() if encoding is None else io.BytesIO()
    line_separator = random.choice(line_separators)
    writer = rbql_csv.CSVWriter(buffer_stream, False, encoding, delim, policy, line_separator)
    writer._write_all(table)
    # Writing a well-formed table must not produce any warnings.
    assert not len(writer.get_warnings())
    buffer_stream.seek(0)
    record_iterator = rbql_csv.CSVRecordIterator(buffer_stream, True, encoding, delim=delim, policy=policy)
    return record_iterator.get_all_records()
def test_split_lines_custom(self):
    """Line-mode iterator must split on '\\n', '\\r' and '\\r\\n' and drop a trailing terminator."""
    test_cases = [
        ('', []),
        ('hello', ['hello']),
        ('hello\nworld', ['hello', 'world']),
        ('hello\rworld\n', ['hello', 'world']),
        ('hello\r\nworld\rhello world\nhello\n', ['hello', 'world', 'hello world', 'hello']),
    ]
    for src, expected_res in test_cases:
        stream, encoding = string_to_randomly_encoded_stream(src)
        # Tiny chunk_size forces separators to straddle chunk boundaries.
        line_iterator = rbql_csv.CSVRecordIterator(stream, True, encoding, delim=None, policy=None, chunk_size=6, line_mode=True)
        test_res = line_iterator._get_all_rows()
        self.assertEqual(expected_res, test_res)
def test_split_chunk_sizes(self):
    """Fuzz line splitting against str.splitlines across many random chunk sizes."""
    source_tokens = ['', 'defghIJKLMN', 'a', 'bc'] + ['\n', '\r\n', '\r']
    for test_case in xrange6(1000):
        num_tokens = random.randint(0, 12)
        # Mostly very small chunks, occasionally larger ones.
        if random.randint(0, 1):
            chunk_size = random.randint(1, 5)
        else:
            chunk_size = random.randint(1, 100)
        src = ''.join(random.choice(source_tokens) for tnum in xrange6(num_tokens))
        stream, encoding = string_to_randomly_encoded_stream(src)
        line_iterator = rbql_csv.CSVRecordIterator(stream, True, encoding, delim=None, policy=None, chunk_size=chunk_size, line_mode=True)
        test_res = line_iterator._get_all_rows()
        expected_res = src.splitlines()
        self.assertEqual(expected_res, test_res)
def test_multicharacter_separator_parsing(self):
    """Parse rows that use a multi-character field separator with the 'simple' policy."""
    data_lines = [
        'aaa:=)bbb:=)ccc',
        'aaa :=) bbb :=)ccc ',
    ]
    # Surrounding whitespace belongs to the fields under the 'simple' policy.
    expected_table = [['aaa', 'bbb', 'ccc'], ['aaa ', ' bbb ', 'ccc ']]
    csv_data = '\n'.join(data_lines)
    stream = io.StringIO(csv_data)
    delim = ':=)'
    policy = 'simple'
    encoding = None
    record_iterator = rbql_csv.CSVRecordIterator(stream, True, encoding, delim, policy)
    parsed_table = record_iterator.get_all_records()
    self.assertEqual(expected_table, parsed_table)
    parsed_table = write_and_parse_back(expected_table, encoding, delim, policy)
    self.assertEqual(expected_table, parsed_table)
def test_iterator_rfc(self):
    """Fuzz the quoted_rfc iterator round-trip with random binary-derived tables."""
    for _test_num in xrange6(100):
        table = generate_random_decoded_binary_table(10, 10, None)
        delim = random.choice(['\t', ',', ';', '|'])
        policy = 'quoted_rfc'
        csv_data = table_to_csv_string_random(table, delim, policy)
        # XXX normalizing '\r' -> '\n' because record iterator doesn't preserve original separators
        normalize_newlines_in_fields(table)
        stream, encoding = string_to_randomly_encoded_stream(csv_data)
        record_iterator = rbql_csv.CSVRecordIterator(stream, True, encoding, delim=delim, policy=policy)
        parsed_table = record_iterator.get_all_records()
        self.assertEqual(table, parsed_table)
        parsed_table = write_and_parse_back(table, encoding, delim, policy)
        self.assertEqual(table, parsed_table)
def test_iterator(self):
    """Fuzz the record iterator round-trip with random tables and random delimiters."""
    for _test_num in xrange6(100):
        table = generate_random_decoded_binary_table(10, 10, ['\r', '\n'])
        delim = random.choice(['\t', ',', ';', '|'])
        table_has_delim = find_in_table(table, delim)
        # 'simple' policy cannot represent fields that contain the delimiter itself.
        if table_has_delim:
            policy = 'quoted'
        else:
            policy = random.choice(['quoted', 'simple'])
        csv_data = table_to_csv_string_random(table, delim, policy)
        stream, encoding = string_to_randomly_encoded_stream(csv_data)
        record_iterator = rbql_csv.CSVRecordIterator(stream, True, encoding, delim=delim, policy=policy)
        parsed_table = record_iterator.get_all_records()
        self.assertEqual(table, parsed_table)
        parsed_table = write_and_parse_back(table, encoding, delim, policy)
        self.assertEqual(table, parsed_table)
def test_iterator_unicode(self):
    """Fuzz the round-trip with unicode tables, including non-ASCII delimiters, under utf-8."""
    for _test_num in xrange6(100):
        table = generate_random_unicode_table(10, 10, ['\r', '\n'])
        delim = random.choice(['\t', ',', ';', '|', 'Д', 'Ф', '\u2063'])
        table_has_delim = find_in_table(table, delim)
        # 'simple' policy cannot represent fields that contain the delimiter itself.
        if table_has_delim:
            policy = 'quoted'
        else:
            policy = random.choice(['quoted', 'simple'])
        csv_data = table_to_csv_string_random(table, delim, policy)
        encoding = 'utf-8'
        stream = io.BytesIO(csv_data.encode(encoding))
        record_iterator = rbql_csv.CSVRecordIterator(stream, True, encoding, delim=delim, policy=policy)
        parsed_table = record_iterator.get_all_records()
        self.assertEqual(table, parsed_table)
        parsed_table = write_and_parse_back(table, encoding, delim, policy)
        self.assertEqual(table, parsed_table)
def test_monocolumn_separated_parsing(self):
    """Round-trip random single-column tables through the 'monocolumn' policy."""
    for i in xrange6(10):
        self.maxDiff = None
        table = []
        num_rows = random.randint(1, 30)
        for irow in xrange6(num_rows):
            # Only the final row is forced to be non-empty.
            min_len = 1 if irow + 1 == num_rows else 0
            entry = make_random_decoded_binary_csv_entry(min_len, 20, restricted_chars=['\r', '\n'])
            table.append([entry])
        csv_data = table_to_csv_string_random(table, None, 'monocolumn')
        stream = io.StringIO(csv_data)
        delim = None
        policy = 'monocolumn'
        encoding = None
        record_iterator = rbql_csv.CSVRecordIterator(stream, True, encoding, delim, policy)
        parsed_table = record_iterator.get_all_records()
        self.assertEqual(table, parsed_table)
        parsed_table = write_and_parse_back(table, encoding, delim, policy)
        self.assertEqual(table, parsed_table)
def test_whitespace_separated_parsing(self):
    """Under the 'whitespace' policy runs of spaces separate fields and edge spaces are ignored."""
    data_lines = [
        'hello world',
        'hello   world  ',
        'hello world  ',
        '  hello   ',
        '  hello world',
    ]
    expected_table = [
        ['hello', 'world'],
        ['hello', 'world'],
        ['hello', 'world'],
        ['hello'],
        ['hello', 'world'],
    ]
    csv_data = '\n'.join(data_lines)
    stream = io.StringIO(csv_data)
    delim = ' '
    policy = 'whitespace'
    encoding = None
    record_iterator = rbql_csv.CSVRecordIterator(stream, True, encoding, delim, policy)
    parsed_table = record_iterator.get_all_records()
    self.assertEqual(expected_table, parsed_table)
    parsed_table = write_and_parse_back(expected_table, encoding, delim, policy)
    self.assertEqual(expected_table, parsed_table)
def _do_test_random_headers(self):
    """Build a random table with a unique header row, run a two-column filter query
    through the CSV iterator/writer pipeline, and verify the selected rows.

    The query keeps rows where one randomly chosen column ends with "good" and
    another randomly chosen column equals '10'; `expected_table` is built in
    lockstep while the input rows are generated.
    """
    num_rows = natural_random(0, 10)
    num_cols = natural_random(2, 10)
    input_table = list()
    expected_table = list()
    header_row = list()
    for col in range(num_cols):
        while True:
            if random.choice([True, False]):
                # Random printable-ASCII column name.
                field_name_len = natural_random(1, 10)
                field_name_bytes = []
                for c in range(field_name_len):
                    field_name_bytes.append(random.randint(32, 126))
                field_name = bytes(bytearray(field_name_bytes)).decode('ascii')
            else:
                field_name = random.choice(['_foo', 'bar', 'Bar', '__foo', 'a', 'b', 'A', 'B'])
            # Header names must be unique; retry on collision.
            if field_name not in header_row:
                header_row.append(field_name)
                break
    input_table.append(header_row[:])
    expected_table.append(header_row[:])
    # Pick two distinct columns to use in the query predicate.
    all_col_nums = list(range(num_cols))
    query_col_1 = random.choice(all_col_nums)
    all_col_nums.remove(query_col_1)
    query_col_2 = random.choice(all_col_nums)
    for row_id in range(num_rows):
        is_good_row = True
        row = list()
        for col_id in range(num_cols):
            if col_id == query_col_1:
                field_value = random.choice(['foo bar good', 'foo bar bad'])
                if field_value != 'foo bar good':
                    is_good_row = False
            elif col_id == query_col_2:
                field_value = random.choice(['10', '0'])
                if field_value != '10':
                    is_good_row = False
            else:
                field_value = make_random_decoded_binary_csv_entry(0, 10, restricted_chars=['\r', '\n'])
            row.append(field_value)
        input_table.append(row[:])
        if is_good_row:
            expected_table.append(row[:])
    query_col_name_1 = make_column_variable(header_row[query_col_1])
    query_col_name_2 = make_column_variable(header_row[query_col_2])
    query = 'select * where ({}.endswith("good") and int({}) * 2 == 20)'.format(query_col_name_1, query_col_name_2)
    delim = ','
    policy = 'quoted'
    csv_data = table_to_csv_string_random(input_table, delim, policy)
    # BUGFIX: removed a dead `io.BytesIO(csv_data.encode('latin-1'))` stream that was
    # created here but never used and never closed; the randomly-encoded stream below
    # is the one actually fed into the iterator.
    input_stream, encoding = string_to_randomly_encoded_stream(csv_data)
    input_iterator = rbql_csv.CSVRecordIterator(input_stream, encoding, delim=delim, policy=policy, has_header=True)
    output_stream = io.BytesIO() if encoding is not None else io.StringIO()
    output_writer = rbql_csv.CSVWriter(output_stream, False, encoding, delim, policy)
    warnings = []
    rbql.query(query, input_iterator, output_writer, warnings)
    input_stream.close()
    self.assertEqual(warnings, [])
    output_stream.seek(0)
    output_iterator = rbql_csv.CSVRecordIterator(output_stream, encoding, delim=delim, policy=policy)
    output_table = output_iterator.get_all_records()
    output_stream.close()
    self.assertEqual(expected_table, output_table)