def test_simple_case(self):
        """End-to-end RBQL run: keep non-header "abc" rows and multiply their values by 10."""
        input_table = [
            ['name', 'value'],
            ['abc', '1234'],
            ['abc', '1234'],
            ['efg', '100'],
            ['abc', '100'],
            ['cde', '12999'],
            ['aaa', '2000'],
            ['abc', '100'],
        ]

        # Only the "abc" data rows survive the NR > 1 filter; values are scaled by 10.
        expected_table = [
            ['abc', '12340'],
            ['abc', '12340'],
            ['abc', '1000'],
            ['abc', '1000'],
        ]

        delim = ','
        policy = 'quoted'
        csv_data = table_to_csv_string_random(input_table, delim, policy)
        input_stream, encoding = string_to_randomly_encoded_stream(csv_data)

        input_iterator = rbql_csv.CSVRecordIterator(input_stream, True, encoding, delim=delim, policy=policy)

        # Binary output buffer when an encoding is in play, text buffer otherwise.
        output_stream = io.StringIO() if encoding is None else io.BytesIO()
        output_writer = rbql_csv.CSVWriter(output_stream, False, encoding, delim, policy)

        error_info, warnings = rbql.generic_run('select a.name, int(a.value) * 10 where NR > 1 and a.name == "abc"', input_iterator, output_writer)
        self.assertEqual(error_info, None)
        self.assertEqual(warnings, [])

        # Parse the produced CSV back and compare against the expectation.
        output_stream.seek(0)
        output_iterator = rbql_csv.CSVRecordIterator(output_stream, True, encoding, delim=delim, policy=policy)
        self.assertEqual(expected_table, output_iterator.get_all_records())
# Example #2
    def test_utf_decoding_errors(self):
        """Latin-1 bytes must parse correctly as latin-1 and raise a decode error as utf-8."""
        table = [[
            'hello',
            u'\x80\x81\xffThis unicode string encoded as latin-1 is not a valid utf-8\xaa\xbb\xcc'
        ], ['hello', 'world']]
        delim = ','
        policy = 'simple'
        encoding = 'latin-1'
        csv_data = table_to_csv_string_random(table, delim, policy)

        # Parsing with the matching encoding round-trips the table unchanged.
        stream = io.BytesIO(csv_data.encode('latin-1'))
        record_iterator = rbql_csv.CSVRecordIterator(stream, encoding, delim,
                                                     policy)
        parsed_table = record_iterator.get_all_records()
        stream.close()
        self.assertEqual(table, parsed_table)

        self.assertEqual(table, write_and_parse_back(table, encoding, delim, policy))

        # The very same bytes are invalid utf-8, so iteration must fail loudly.
        stream = io.BytesIO(csv_data.encode('latin-1'))
        with self.assertRaises(Exception) as cm:
            record_iterator = rbql_csv.CSVRecordIterator(stream,
                                                         'utf-8',
                                                         delim=delim,
                                                         policy=policy)
            record_iterator.get_all_records()
            stream.close()
        self.assertTrue(
            str(cm.exception).find('Unable to decode input table as UTF-8') != -1)
# Example #3
 def test_bom_warning(self):
     """A UTF-8 BOM at the start of the first field must be stripped and reported as a warning."""
     table = [
         [u'\xef\xbb\xbfcde', '1234'],
         ['abc', '1234'],
         ['abc', '1234'],
         ['efg', '100'],
         ['abc', '100'],
         ['cde', '12999'],
         ['aaa', '2000'],
         ['abc', '100'],
     ]
     delim = ','
     policy = 'simple'
     encoding = 'latin-1'
     csv_data = table_to_csv_string_random(table, delim, policy)
     stream = io.BytesIO(csv_data.encode('latin-1'))
     record_iterator = rbql_csv.CSVRecordIterator(stream, encoding, delim,
                                                  policy)
     parsed_table = record_iterator.get_all_records()
     stream.close()
     # Exactly one BOM warning is expected.
     self.assertEqual(
         ['UTF-8 Byte Order Mark (BOM) was found and skipped in input table'],
         record_iterator.get_warnings())
     # The parsed data must equal the input with the BOM removed from the first cell.
     expected_table = copy.deepcopy(table)
     expected_table[0][0] = 'cde'
     self.assertEqual(expected_table, parsed_table)
# Example #4
    def test_multiline_fields(self):
        """RFC-4180 quoted fields may span physical lines; verify parsing and round-trip."""
        csv_data = '\n'.join([
            'foo, bar,aaa',
            'test,"hello, bar", "aaa ',
            'test","hello, bar", "bbb ',
            'foo, bar,aaa',
            'foo, ""bar"",aaa',
            'foo, test","hello, bar", "bbb "',
            'foo, bar,aaa',
        ])
        stream, encoding = string_to_randomly_encoded_stream(csv_data)
        # The middle logical record swallows several physical lines inside its quoted fields.
        expected_table = [['foo', ' bar', 'aaa'],
                          [
                              'test', 'hello, bar', 'aaa \ntest', 'hello, bar',
                              'bbb \nfoo, bar,aaa\nfoo, "bar",aaa\nfoo, test',
                              "hello, bar", 'bbb '
                          ], ['foo', ' bar', 'aaa']]
        delim = ','
        policy = 'quoted_rfc'

        record_iterator = rbql_csv.CSVRecordIterator(stream,
                                                     encoding,
                                                     delim=delim,
                                                     policy=policy)
        parsed_table = record_iterator.get_all_records()
        stream.close()
        self.assertEqual(expected_table, parsed_table)
        parsed_table = write_and_parse_back(expected_table, encoding, delim, policy)
        self.assertEqual(expected_table, parsed_table)
# Example #5
    def test_iterator_rfc_comments(self):
        """Randomized round-trip of quoted_rfc tables that contain comment lines."""
        for _test_num in xrange6(200):
            table = generate_random_decoded_binary_table(10, 10, None)
            comment_prefix = random.choice(['#', '>>'])
            # Instead of complicating the generation procedure just skip the tables which were generated "incorrectly"
            if table_has_records_with_comment_prefix(table, comment_prefix):
                continue
            delim = random.choice(['\t', ',', ';', '|'])
            policy = 'quoted_rfc'
            csv_data = table_to_csv_string_random(
                table, delim, policy, comment_prefix=comment_prefix)
            # XXX normalizing '\r' -> '\n' because record iterator doesn't preserve original separators
            normalize_newlines_in_fields(table)
            stream, encoding = string_to_randomly_encoded_stream(csv_data)

            record_iterator = rbql_csv.CSVRecordIterator(
                stream,
                encoding,
                delim=delim,
                policy=policy,
                comment_prefix=comment_prefix)
            parsed_table = record_iterator.get_all_records()
            stream.close()
            self.assertEqual(table, parsed_table)

            self.assertEqual(table, write_and_parse_back(table, encoding, delim, policy))
def write_and_parse_back(table, encoding, delim, policy):
    """Serialize `table` with CSVWriter (random line separator) and parse it back.

    Returns the re-parsed table; asserts that writing produced no warnings.
    """
    out_stream = io.StringIO() if encoding is None else io.BytesIO()
    separator = random.choice(line_separators)
    writer = rbql_csv.CSVWriter(out_stream, False, encoding, delim, policy, separator)
    writer._write_all(table)
    assert not len(writer.get_warnings())
    out_stream.seek(0)
    reader = rbql_csv.CSVRecordIterator(out_stream, True, encoding, delim=delim, policy=policy)
    return reader.get_all_records()
 def test_split_lines_custom(self):
     """Line-mode iterator must split on \\n, \\r and \\r\\n and drop a trailing newline."""
     test_cases = [
         ('', []),
         ('hello', ['hello']),
         ('hello\nworld', ['hello', 'world']),
         ('hello\rworld\n', ['hello', 'world']),
         ('hello\r\nworld\rhello world\nhello\n', ['hello', 'world', 'hello world', 'hello']),
     ]
     for src, expected_res in test_cases:
         stream, encoding = string_to_randomly_encoded_stream(src)
         line_iterator = rbql_csv.CSVRecordIterator(stream, True, encoding, delim=None, policy=None, chunk_size=6, line_mode=True)
         self.assertEqual(expected_res, line_iterator._get_all_rows())
 def test_split_chunk_sizes(self):
     """Line splitting must be invariant to the read chunk size (matches str.splitlines)."""
     source_tokens = ['', 'defghIJKLMN', 'a', 'bc'] + ['\n', '\r\n', '\r']
     for _test_case in xrange6(1000):
         num_tokens = random.randint(0, 12)
         # Half of the time use tiny chunks so boundaries often land inside '\r\n'.
         chunk_size = random.randint(1, 5) if random.randint(0, 1) else random.randint(1, 100)
         src = ''.join(random.choice(source_tokens) for _tnum in xrange6(num_tokens))
         stream, encoding = string_to_randomly_encoded_stream(src)
         line_iterator = rbql_csv.CSVRecordIterator(stream, True, encoding, delim=None, policy=None, chunk_size=chunk_size, line_mode=True)
         self.assertEqual(src.splitlines(), line_iterator._get_all_rows())
    def test_multicharacter_separator_parsing(self):
        """A multi-character delimiter (':=)') must be honored by the 'simple' policy."""
        csv_data = '\n'.join(['aaa:=)bbb:=)ccc', 'aaa :=) bbb :=)ccc '])
        expected_table = [['aaa', 'bbb', 'ccc'], ['aaa ', ' bbb ', 'ccc ']]
        stream = io.StringIO(csv_data)
        delim = ':=)'
        policy = 'simple'
        encoding = None
        record_iterator = rbql_csv.CSVRecordIterator(stream, True, encoding, delim, policy)
        self.assertEqual(expected_table, record_iterator.get_all_records())

        # Round-trip through the writer must preserve the table exactly.
        parsed_table = write_and_parse_back(expected_table, encoding, delim, policy)
        self.assertEqual(expected_table, parsed_table)
    def test_iterator_rfc(self):
        """Randomized round-trip of quoted_rfc tables (no comment lines)."""
        for _test_num in xrange6(100):
            table = generate_random_decoded_binary_table(10, 10, None)
            delim = random.choice(['\t', ',', ';', '|'])
            policy = 'quoted_rfc'
            csv_data = table_to_csv_string_random(table, delim, policy)
            # XXX normalizing '\r' -> '\n' because record iterator doesn't preserve original separators
            normalize_newlines_in_fields(table)
            stream, encoding = string_to_randomly_encoded_stream(csv_data)

            record_iterator = rbql_csv.CSVRecordIterator(stream, True, encoding, delim=delim, policy=policy)
            self.assertEqual(table, record_iterator.get_all_records())

            self.assertEqual(table, write_and_parse_back(table, encoding, delim, policy))
    def test_iterator(self):
        """Randomized round-trip with 'simple'/'quoted' policies over binary-ish tables."""
        for _test_num in xrange6(100):
            table = generate_random_decoded_binary_table(10, 10, ['\r', '\n'])
            delim = random.choice(['\t', ',', ';', '|'])
            # 'simple' policy cannot escape the delimiter, so force 'quoted' when a field contains it.
            if find_in_table(table, delim):
                policy = 'quoted'
            else:
                policy = random.choice(['quoted', 'simple'])
            csv_data = table_to_csv_string_random(table, delim, policy)
            stream, encoding = string_to_randomly_encoded_stream(csv_data)

            record_iterator = rbql_csv.CSVRecordIterator(stream, True, encoding, delim=delim, policy=policy)
            self.assertEqual(table, record_iterator.get_all_records())

            self.assertEqual(table, write_and_parse_back(table, encoding, delim, policy))
    def test_iterator_unicode(self):
        """Randomized round-trip of unicode tables through a utf-8 encoded byte stream."""
        for _test_num in xrange6(100):
            table = generate_random_unicode_table(10, 10, ['\r', '\n'])
            delim = random.choice(['\t', ',', ';', '|', 'Д', 'Ф', '\u2063'])
            # 'simple' policy cannot escape the delimiter, so force 'quoted' when a field contains it.
            if find_in_table(table, delim):
                policy = 'quoted'
            else:
                policy = random.choice(['quoted', 'simple'])
            csv_data = table_to_csv_string_random(table, delim, policy)
            encoding = 'utf-8'
            stream = io.BytesIO(csv_data.encode(encoding))

            record_iterator = rbql_csv.CSVRecordIterator(stream, True, encoding, delim=delim, policy=policy)
            self.assertEqual(table, record_iterator.get_all_records())

            self.assertEqual(table, write_and_parse_back(table, encoding, delim, policy))
    def test_monocolumn_separated_parsing(self):
        """Round-trip random single-column tables through the 'monocolumn' policy."""
        for i in xrange6(10):
            self.maxDiff = None
            num_rows = random.randint(1, 30)
            table = []
            for irow in xrange6(num_rows):
                # Last row gets min length 1 — presumably so the table doesn't end
                # with an empty line that would be lost on parsing; TODO confirm.
                min_len = 0 if irow + 1 < num_rows else 1
                table.append([make_random_decoded_binary_csv_entry(min_len, 20, restricted_chars=['\r', '\n'])])
            csv_data = table_to_csv_string_random(table, None, 'monocolumn')
            stream = io.StringIO(csv_data)
            delim = None
            policy = 'monocolumn'
            encoding = None
            record_iterator = rbql_csv.CSVRecordIterator(stream, True, encoding, delim, policy)
            self.assertEqual(table, record_iterator.get_all_records())

            self.assertEqual(table, write_and_parse_back(table, encoding, delim, policy))
    def test_whitespace_separated_parsing(self):
        """'whitespace' policy collapses runs of spaces and strips leading/trailing blanks."""
        csv_data = '\n'.join([
            'hello world',
            '   hello   world  ',
            'hello   world  ',
            '  hello   ',
            '  hello   world',
        ])
        expected_table = [['hello', 'world'], ['hello', 'world'], ['hello', 'world'], ['hello'], ['hello', 'world']]
        stream = io.StringIO(csv_data)
        delim = ' '
        policy = 'whitespace'
        encoding = None
        record_iterator = rbql_csv.CSVRecordIterator(stream, True, encoding, delim, policy)
        self.assertEqual(expected_table, record_iterator.get_all_records())

        parsed_table = write_and_parse_back(expected_table, encoding, delim, policy)
        self.assertEqual(expected_table, parsed_table)
# Example #15
    def _do_test_random_headers(self):
        """Run one randomized end-to-end query against a table with a generated header.

        Builds a random header row plus random data rows, picks two distinct
        columns to filter on, runs a `select * where ...` query referencing the
        columns by their header names, and checks that exactly the expected rows
        come back.
        """
        num_rows = natural_random(0, 10)
        num_cols = natural_random(2, 10)
        input_table = list()
        expected_table = list()

        # Generate a unique name per column: either random printable ASCII or
        # one of a few fixed identifier-like names (to exercise both code paths
        # of header-name-to-variable mapping).
        header_row = list()
        for col in range(num_cols):
            while True:
                if random.choice([True, False]):
                    field_name_len = natural_random(1, 10)
                    field_name_bytes = []
                    for c in range(field_name_len):
                        field_name_bytes.append(random.randint(32, 126))
                    field_name = bytes(
                        bytearray(field_name_bytes)).decode('ascii')
                else:
                    field_name = random.choice(
                        ['_foo', 'bar', 'Bar', '__foo', 'a', 'b', 'A', 'B'])
                # Header names must be unique, otherwise retry.
                if field_name not in header_row:
                    header_row.append(field_name)
                    break
        input_table.append(header_row[:])
        expected_table.append(header_row[:])
        # Pick two distinct columns the query will filter on.
        all_col_nums = list(range(num_cols))
        query_col_1 = random.choice(all_col_nums)
        all_col_nums.remove(query_col_1)
        query_col_2 = random.choice(all_col_nums)
        for row_id in range(num_rows):
            is_good_row = True
            row = list()
            for col_id in range(num_cols):
                if col_id == query_col_1:
                    # Row matches only if this field ends with "good"...
                    field_value = random.choice(
                        ['foo bar good', 'foo bar bad'])
                    if field_value != 'foo bar good':
                        is_good_row = False
                elif col_id == query_col_2:
                    # ...and this field equals '10'.
                    field_value = random.choice(['10', '0'])
                    if field_value != '10':
                        is_good_row = False
                else:
                    field_value = make_random_decoded_binary_csv_entry(
                        0, 10, restricted_chars=['\r', '\n'])
                row.append(field_value)
            input_table.append(row[:])
            if is_good_row:
                expected_table.append(row[:])
        query_col_name_1 = make_column_variable(header_row[query_col_1])
        query_col_name_2 = make_column_variable(header_row[query_col_2])
        query = 'select * where ({}.endswith("good") and int({}) * 2 == 20)'.format(
            query_col_name_1, query_col_name_2)

        delim = ','
        policy = 'quoted'
        csv_data = table_to_csv_string_random(input_table, delim, policy)
        # BUGFIX: removed a dead `encoding = 'latin-1'` assignment and an unused
        # (never-read, never-closed) io.BytesIO stream that were immediately
        # shadowed by the call below.
        input_stream, encoding = string_to_randomly_encoded_stream(csv_data)

        input_iterator = rbql_csv.CSVRecordIterator(input_stream,
                                                    encoding,
                                                    delim=delim,
                                                    policy=policy,
                                                    has_header=True)

        output_stream = io.BytesIO() if encoding is not None else io.StringIO()
        output_writer = rbql_csv.CSVWriter(output_stream, False, encoding,
                                           delim, policy)

        warnings = []
        rbql.query(query, input_iterator, output_writer, warnings)
        input_stream.close()
        self.assertEqual(warnings, [])

        # Parse the query output back and compare against the expected rows.
        output_stream.seek(0)
        output_iterator = rbql_csv.CSVRecordIterator(output_stream,
                                                     encoding,
                                                     delim=delim,
                                                     policy=policy)
        output_table = output_iterator.get_all_records()
        output_stream.close()
        self.assertEqual(expected_table, output_table)