def test_row_generator_from_file_with_text_columns(self):
    """Rows read back from a file with a text column equal the source rows.

    The generator is drained into a list before comparing: ``zip`` stops as
    soon as ``data`` is exhausted, so iterating the generator lazily would
    never notice surplus rows. The final assertion pins the exact row count.
    """
    data = make_data(2, 3, text_columns=(1,))
    # Keep the file object referenced for the whole test; presumably it is a
    # temporary file that disappears once released -- TODO confirm.
    tmp_file = make_file(data)
    received_rows = list(row_generator_from_file(tmp_file.name))
    count = 0
    for expected, received in zip(data, received_rows):
        self.assertListEqual(expected, received)
        count += 1
    self.assertEqual(count, len(data))
    # Guards against the generator yielding more rows than were written.
    self.assertEqual(len(received_rows), len(data))
def test_row_generator_from_file_with_text_columns(self):
    """Check that a file with a text column round-trips row for row.

    ``zip`` truncates to its shorter input, so comparing against the lazy
    generator would let extra rows slip through unnoticed. The rows are
    therefore collected first and their total number is asserted as well.
    """
    data = make_data(2, 3, text_columns=(1,))
    # The handle stays bound to a local so the underlying file outlives the
    # read (presumably a temp file tied to the object -- TODO confirm).
    written = make_file(data)
    rows_read = list(row_generator_from_file(written.name))
    count = 0
    for expected, received in zip(data, rows_read):
        self.assertListEqual(expected, received)
        count += 1
    self.assertEqual(count, len(data))
    # Fails if the generator produced rows beyond those in ``data``.
    self.assertEqual(len(rows_read), len(data))
def test_row_generator_from_quoted_file(self):
    """Rows read from a quoted file equal the data with quotes removed.

    The data is created with ``with_quotes=True`` and written to a file; the
    rows read back are compared with ``remove_quotes(data, (1,))``. The
    generator is drained up front because ``zip`` stops at its shorter
    input and would otherwise hide surplus rows.
    """
    data = make_data(2, 3, text_columns=(1,), with_quotes=True)
    # Keep the file object referenced for the whole test; presumably it is a
    # temporary file that disappears once released -- TODO confirm.
    tmp_file = make_file(data)
    unquoted = remove_quotes(data, (1,))
    received_rows = list(row_generator_from_file(tmp_file.name))
    count = 0
    for expected, received in zip(unquoted, received_rows):
        self.assertListEqual(expected, received)
        count += 1
    self.assertEqual(count, len(data))
    # Guards against the generator yielding more rows than were written.
    self.assertEqual(len(received_rows), len(data))
def test_row_generator_from_quoted_file(self):
    """Check that quoted cells come back from the file without quotes.

    Expected rows are ``remove_quotes(data, (1,))``. Because ``zip`` is
    truncated to the shorter input, the generator output is collected into
    a list first so that its exact length can be asserted too.
    """
    data = make_data(2, 3, text_columns=(1,), with_quotes=True)
    # The handle stays bound to a local so the underlying file outlives the
    # read (presumably a temp file tied to the object -- TODO confirm).
    written = make_file(data)
    unquoted = remove_quotes(data, (1,))
    rows_read = list(row_generator_from_file(written.name))
    count = 0
    for expected, received in zip(unquoted, rows_read):
        self.assertListEqual(expected, received)
        count += 1
    self.assertEqual(count, len(data))
    # Fails if the generator produced rows beyond those in ``data``.
    self.assertEqual(len(rows_read), len(data))
def test_sentence_splitting(self):
    """A two-sentence cell yields one token list per sentence.

    The text cell is written to a tab-separated file, read with the
    ``'plain'`` dialect and passed through ``Extractor(decap=True)``. The
    expected tokens start with lower-case ``'this'`` and ``'and'``.
    """
    rows = make_data(1, 3, text_columns=(1,))
    rows[0][1] = "This is a sentence. And it is from cell 1."
    tmp_file = make_file(rows, sep='\t')
    extractor = Extractor(
        row_generator_from_file(tmp_file.name, dialect='plain'),
        decap=True,
    )
    # The file is already written; from here on ``rows`` holds the
    # expected output instead of the raw input text.
    rows[0][1] = [
        ['this', 'is', 'a', 'sentence', '.'],
        ['and', 'it', 'is', 'from', 'cell', '1', '.'],
    ]
    # Stays at -1 if the extractor yields nothing, which fails the check.
    last_index = -1
    for last_index, received in enumerate(extractor):
        self.assertEqual(list(rows[last_index]), received)
    self.assertEqual(0, last_index)
def _decap_lower_helper(self, decap=False, lower=False):
    """Run the extractor with the given ``decap``/``lower`` flags.

    Both rows are expected to come back with their text cell tokenized as
    ``['this', 'is', 'cell', '1', '.']``, i.e. with a lower-case first
    token. NOTE(review): presumably callers set at least one flag to True;
    confirm against the tests that use this helper.
    """
    source_rows = make_data(2, 3, text_columns=(1,))
    tmp_file = make_file(source_rows, sep='\t')
    extractor = Extractor(
        row_generator_from_file(tmp_file.name, dialect='plain'),
        decap=decap,
        lower=lower,
    )
    expected_rows = remove_quotes(source_rows, (1,))
    for row_index in (0, 1):
        expected_rows[row_index][1] = [['this', 'is', 'cell', '1', '.']]
    # Stays at -1 if the extractor yields nothing, which fails the check.
    last_index = -1
    for last_index, received in enumerate(extractor):
        self.assertEqual(list(expected_rows[last_index]), received)
    self.assertEqual(1, last_index)
def test_standard_process(self):
    """Default extraction of a quoted file tokenizes the text cell.

    Data created with ``with_quotes=True`` is written with the default
    settings and run through ``Extractor`` without options. Each text cell
    is expected back as ``[['This', 'is', 'cell', '1', '.']]``.
    """
    source_rows = make_data(2, 3, text_columns=(1,), with_quotes=True)
    tmp_file = make_file(source_rows)
    extractor = Extractor(row_generator_from_file(tmp_file.name))
    expected_rows = remove_quotes(source_rows, (1,))
    for row_index in (0, 1):
        expected_rows[row_index][1] = [['This', 'is', 'cell', '1', '.']]
    # Stays at -1 if the extractor yields nothing, which fails the check.
    last_index = -1
    for last_index, received in enumerate(extractor):
        self.assertEqual(list(expected_rows[last_index]), received)
    self.assertEqual(1, last_index)
def test_sentence_splitting(self):
    """Verify that the extractor splits a cell into separate sentences.

    One row whose text cell holds two sentences is written tab-separated,
    read back with the ``'plain'`` dialect and extracted with
    ``decap=True``. The cell must come back as two token lists.
    """
    table = make_data(1, 3, text_columns=(1,))
    table[0][1] = "This is a sentence. And it is from cell 1."
    handle = make_file(table, sep='\t')
    reader = row_generator_from_file(handle.name, dialect='plain')
    extractor = Extractor(reader, decap=True)

    first_sentence = ['this', 'is', 'a', 'sentence', '.']
    second_sentence = ['and', 'it', 'is', 'from', 'cell', '1', '.']
    # Overwrite the input text with the expected result; the file has
    # already been written at this point.
    table[0][1] = [first_sentence, second_sentence]

    # -1 marks "no rows seen" so an empty extractor fails the final check.
    index = -1
    for index, produced in enumerate(extractor):
        self.assertEqual(list(table[index]), produced)
    self.assertEqual(0, index)
def _decap_lower_helper(self, decap=False, lower=False):
    """Shared body for tests of the ``decap`` and ``lower`` options.

    Builds a tab-separated file, extracts it with the supplied flags and
    expects the text cell of both rows as
    ``[['this', 'is', 'cell', '1', '.']]``. NOTE(review): the expectation
    has a lower-case first token, so this presumably only passes when a
    flag is enabled -- verify against callers.
    """
    table = make_data(2, 3, text_columns=(1,))
    handle = make_file(table, sep='\t')
    reader = row_generator_from_file(handle.name, dialect='plain')
    extractor = Extractor(reader, decap=decap, lower=lower)

    wanted = remove_quotes(table, (1,))
    wanted[0][1] = [['this', 'is', 'cell', '1', '.']]
    wanted[1][1] = [['this', 'is', 'cell', '1', '.']]

    # -1 marks "no rows seen" so an empty extractor fails the final check.
    index = -1
    for index, produced in enumerate(extractor):
        self.assertEqual(list(wanted[index]), produced)
    self.assertEqual(1, index)
def test_standard_process(self):
    """Verify extraction with default options on a quoted input file.

    The expected rows are the quote-stripped data with each text cell
    replaced by ``[['This', 'is', 'cell', '1', '.']]``; the original
    capitalization is kept.
    """
    table = make_data(2, 3, text_columns=(1,), with_quotes=True)
    handle = make_file(table)
    reader = row_generator_from_file(handle.name)
    extractor = Extractor(reader)

    wanted = remove_quotes(table, (1,))
    wanted[0][1] = [['This', 'is', 'cell', '1', '.']]
    wanted[1][1] = [['This', 'is', 'cell', '1', '.']]

    # -1 marks "no rows seen" so an empty extractor fails the final check.
    index = -1
    for index, produced in enumerate(extractor):
        self.assertEqual(list(wanted[index]), produced)
    self.assertEqual(1, index)
def test_tile_process(self):
    """With ``has_title=True`` the first row is exposed as ``names``.

    A header tuple is inserted ahead of the data rows before the file is
    written. The extractor must report it through ``.names`` and then
    yield only the two data rows, with their text cells tokenized.
    """
    header = ("id", "text", "class")
    table = make_data(2, 3, text_columns=(1,))
    table.insert(0, header)
    tmp_file = make_file(table, sep='\t')
    extractor = Extractor(
        row_generator_from_file(tmp_file.name, dialect='plain'),
        has_title=True,
    )
    # Index 0 is the header, so the data rows sit at indices 1 and 2.
    for row_index in (1, 2):
        table[row_index][1] = [['This', 'is', 'cell', '1', '.']]
    self.assertEqual(header, extractor.names)
    # Enumeration starts at 1 so the index lines up with ``table``.
    last_index = -1
    for last_index, received in enumerate(extractor, 1):
        self.assertEqual(list(table[last_index]), received)
    self.assertEqual(2, last_index)
def test_plain_row_generator_with_escapechar(self):
    """The ``'plain'`` dialect honours backslash-escaped tab separators.

    Expected behaviour pinned here: a tab preceded by a backslash stays
    inside its cell as a literal tab, while an unescaped tab inside a cell
    splits it into two cells.

    The generator is drained into a list before comparing: ``zip`` stops as
    soon as ``data`` is exhausted, so iterating the generator lazily would
    never notice surplus rows. The final assertion pins the exact row count.
    """
    data = make_data(2, 3)
    data[0][1] = "cell\\\tA"
    data[1][2] = "cell\tB"
    # Keep the file object referenced for the whole test; presumably it is a
    # temporary file that disappears once released -- TODO confirm.
    tmp_file = make_file(data, sep='\t')
    received_rows = list(
        row_generator_from_file(tmp_file.name, dialect='plain'))
    # The file is written; turn ``data`` into the rows expected back.
    data[0][1] = "cell\tA"
    data[1][2] = "cell"
    data[1].append("B")
    count = 0
    for expected, received in zip(data, received_rows):
        self.assertListEqual(expected, received)
        count += 1
    self.assertEqual(count, len(data))
    # Guards against the generator yielding more rows than were written.
    self.assertEqual(len(received_rows), len(data))
def test_tile_process(self):
    """Verify title-row handling of the extractor.

    The file starts with the row ``("id", "text", "class")``. With
    ``has_title=True`` that row must appear as ``extractor.names`` and the
    iteration must produce exactly the two remaining rows.
    """
    titles = ("id", "text", "class")
    table = make_data(2, 3, text_columns=(1,))
    table.insert(0, titles)
    handle = make_file(table, sep='\t')
    reader = row_generator_from_file(handle.name, dialect='plain')
    extractor = Extractor(reader, has_title=True)

    # Rows 1 and 2 are the data rows; row 0 is the title row.
    table[1][1] = [['This', 'is', 'cell', '1', '.']]
    table[2][1] = [['This', 'is', 'cell', '1', '.']]

    self.assertEqual(titles, extractor.names)

    # Counting from 1 keeps the index aligned with ``table``.
    index = -1
    for index, produced in enumerate(extractor, 1):
        self.assertEqual(list(table[index]), produced)
    self.assertEqual(2, index)
def test_row_generator_with_escapechar(self):
    """Quoted cells with commas and doubled quotes are read back intact.

    Expected behaviour pinned here: a comma inside a quoted cell does not
    split the cell, and a doubled quote (``""``) inside a quoted cell is
    read as a single quote character.

    The generator is drained into a list before comparing: ``zip`` stops as
    soon as its first input is exhausted, so iterating the generator lazily
    would never notice surplus rows. The final assertion pins the exact row
    count.
    """
    data = make_data(3, 3, text_columns=(1,), with_quotes=True)
    data[0][1] = '"Cell with comma, here A."'
    data[1][1] = '"Cell with quote "" char."'
    data[2][1] = '"Cell with both "","" chars."'
    # Keep the file object referenced for the whole test; presumably it is a
    # temporary file that disappears once released -- TODO confirm.
    tmp_file = make_file(data)
    unquoted = remove_quotes(data, (1,))
    received_rows = list(row_generator_from_file(tmp_file.name))
    # Rows 1 and 2 contain doubled quotes, so their expected text is set
    # explicitly rather than taken from ``remove_quotes``.
    unquoted[1][1] = 'Cell with quote " char.'
    unquoted[2][1] = 'Cell with both "," chars.'
    count = 0
    for expected, received in zip(unquoted, received_rows):
        self.assertListEqual(expected, received)
        count += 1
    self.assertEqual(count, len(data))
    # Guards against the generator yielding more rows than were written.
    self.assertEqual(len(received_rows), len(data))
def test_row_generator_with_escapechar(self):
    """Check reading of quoted cells containing commas and quote chars.

    Three text cells are written: one with an embedded comma, one with a
    doubled quote, one with both. The comma must not split the cell and
    each ``""`` must come back as a single ``"``.

    Because ``zip`` is truncated to the shorter input, the generator output
    is collected into a list first so that its exact length can be asserted
    too.
    """
    data = make_data(3, 3, text_columns=(1,), with_quotes=True)
    data[0][1] = '"Cell with comma, here A."'
    data[1][1] = '"Cell with quote "" char."'
    data[2][1] = '"Cell with both "","" chars."'
    # The handle stays bound to a local so the underlying file outlives the
    # read (presumably a temp file tied to the object -- TODO confirm).
    written = make_file(data)
    unquoted = remove_quotes(data, (1,))
    rows_read = list(row_generator_from_file(written.name))
    # Expected text for the cells with doubled quotes is spelled out here
    # instead of relying on ``remove_quotes``.
    unquoted[1][1] = 'Cell with quote " char.'
    unquoted[2][1] = 'Cell with both "," chars.'
    count = 0
    for expected, received in zip(unquoted, rows_read):
        self.assertListEqual(expected, received)
        count += 1
    self.assertEqual(count, len(data))
    # Fails if the generator produced rows beyond those in ``data``.
    self.assertEqual(len(rows_read), len(data))