Example #1
    def test_extend(self):
        tv1 = TokenVector.make_token_vector(
            ["A", "B", "C", "D"],
            sep_size=1)
        tv2 = TokenVector.make_token_vector(
            ["E", "F", "G", "H"],
            sep_size=1)
        tvUnion = TokenVector.make_token_vector(
            ["A", "B", "C", "D", "E", "F", "G", "H"],
            sep_size=1)
        tv1.extend(tv2, sep_size=1)
        self.assertListEqual(tv1, tvUnion)
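With `sep_size=1`, `extend` shifts the appended tokens past the receiver's index span, so the result is index-identical to tokenizing all eight strings in a single `make_token_vector` call.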
Example #2
    def test_get_index_length(self):
        vector = TokenVector.make_token_vector(
            ["A", "B", "C", "D"],
            sep_size=1,
            sentence_idx=0)
        self.assertEqual(vector.get_index_length(), 7)
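Four single-character tokens with one-character separators span indices 0 through 6, hence an index length of 7.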
Example #3
    def test_get_sentences(self):
        sentence_one_vector = TokenVector.make_token_vector(
            ["A", "B", "C", "D"],
            sep_size=1,
            sentence_idx=0)
        sentence_two_vector = TokenVector.make_token_vector(
            ["E", "F", "G", "H"],
            relindex=9,
            sep_size=1,
            sentence_idx=1)
        union_vector = TokenVector.make_token_vector(
            ["A", "B", "C", "D"],
            sep_size=1,
            sentence_idx=0)
        union_vector.extend(sentence_two_vector)
        sentences = union_vector.get_sentences()
        self.assertListEqual(sentences[0], sentence_one_vector)
        self.assertListEqual(sentences[1], sentence_two_vector)
Example #4
    def test_garbage_collection(self):
        tv = TokenVector.make_token_vector(
            ["A", "", "B", "C", "D", "", "E", ""],
            sep_size=1,
            sentence_idx=0)
        clean_tv = TokenVector([])
        clean_tv.append(Token('A', sindex=0, eindex=0, sentence_idx=0))
        clean_tv.append(Token('B', sindex=3, eindex=3, sentence_idx=0))
        clean_tv.append(Token('C', sindex=5, eindex=5, sentence_idx=0))
        clean_tv.append(Token('D', sindex=7, eindex=7, sentence_idx=0))
        clean_tv.append(Token('E', sindex=10, eindex=10, sentence_idx=0))

        tv.collect_garbage()
        self.assertSequenceEqual(tv, clean_tv)
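`collect_garbage` removes the empty tokens in place while leaving the surviving tokens' indices untouched, which is why the cleaned vector keeps the gapped index sequence 0, 3, 5, 7, 10.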
Example #5
    def test_make_token_vector(self):
        made_tv = TokenVector.make_token_vector(
            ["A", "", "B", "C", "D", "", "E", ""],
            sep_size=1,
            sentence_idx=0)
        real_tv = TokenVector([])
        real_tv.append(Token('A', sindex=0, eindex=0, sentence_idx=0))
        real_tv.append(Token('', sindex=2, eindex=2, sentence_idx=0))
        real_tv.append(Token('B', sindex=3, eindex=3, sentence_idx=0))
        real_tv.append(Token('C', sindex=5, eindex=5, sentence_idx=0))
        real_tv.append(Token('D', sindex=7, eindex=7, sentence_idx=0))
        real_tv.append(Token('', sindex=9, eindex=9, sentence_idx=0))
        real_tv.append(Token('E', sindex=10, eindex=10, sentence_idx=0))
        real_tv.append(Token('', sindex=12, eindex=12, sentence_idx=0))
        self.assertSequenceEqual(made_tv, real_tv)
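Each token's `sindex` advances by `len(text) + sep_size`; an empty token still occupies one index position, so its `eindex` equals its `sindex`.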
Example #6
    def setUp(self):
        self.tv = TokenVector([Token("foo"),
                               Token("abc defg ijk", sindex=4),
                               Token("bla", sindex=17)])
Example #7
from unittest import TestCase

# Token and TokenVector belong to the project under test; the module path
# below is a placeholder, not the real import.
from tokenvector import Token, TokenVector


class TestTokenVector(TestCase):
    def setUp(self):
        self.tv = TokenVector([Token("foo"),
                               Token("abc defg ijk", sindex=4),
                               Token("bla", sindex=17)])

    def test_constructor_end_indices(self):
        self.assertEqual(self.tv[0].eindex, 2)
        self.assertEqual(self.tv[1].eindex, 15)
        self.assertEqual(self.tv[2].eindex, 19)

    def test_replace_tokens(self):
        print(self.tv)
        new_tv = TokenVector([Token("abc"), Token("defg"), Token("ijk")])
        self.tv.replace_tokens_with_list({1: new_tv})
        print(self.tv)
        self.assertEqual(len(self.tv), 5, "wrong size")

        for index, token in enumerate(new_tv):
            self.assertEqual(self.tv[index + 1], new_tv[index])

    def test_update_indices(self):
        self.tv.update_indices(5)
        self.assertEqual(self.tv[0].eindex, 7)
        self.assertEqual(self.tv[1].eindex, 20)
        self.assertEqual(self.tv[2].eindex, 24)
        self.assertEqual(self.tv[0].sindex, 5)
        self.assertEqual(self.tv[1].sindex, 9)
        self.assertEqual(self.tv[2].sindex, 22)

    def test_get_index_length(self):
        vector = TokenVector.make_token_vector(
            ["A", "B", "C", "D"],
            sep_size=1,
            sentence_idx=0)
        self.assertEqual(vector.get_index_length(), 7)

    def test_extend(self):
        tv1 = TokenVector.make_token_vector(
            ["A", "B", "C", "D"],
            sep_size=1)
        tv2 = TokenVector.make_token_vector(
            ["E", "F", "G", "H"],
            sep_size=1)
        tvUnion = TokenVector.make_token_vector(
            ["A", "B", "C", "D", "E", "F", "G", "H"],
            sep_size=1)
        tv1.extend(tv2, sep_size=1)
        self.assertListEqual(tv1, tvUnion)

    def test_get_sentences(self):
        sentence_one_vector = TokenVector.make_token_vector(
            ["A", "B", "C", "D"],
            sep_size=1,
            sentence_idx=0)
        sentence_two_vector = TokenVector.make_token_vector(
            ["E", "F", "G", "H"],
            relindex=9,
            sep_size=1,
            sentence_idx=1)
        union_vector = TokenVector.make_token_vector(
            ["A", "B", "C", "D"],
            sep_size=1,
            sentence_idx=0)
        union_vector.extend(sentence_two_vector)
        sentences = union_vector.get_sentences()
        self.assertListEqual(sentences[0], sentence_one_vector)
        self.assertListEqual(sentences[1], sentence_two_vector)

    def test_make_token_vector(self):
        made_tv = TokenVector.make_token_vector(
            ["A", "", "B", "C", "D", "", "E", ""],
            sep_size=1,
            sentence_idx=0)
        real_tv = TokenVector([])
        real_tv.append(Token('A', sindex=0, eindex=0, sentence_idx=0))
        real_tv.append(Token('', sindex=2, eindex=2, sentence_idx=0))
        real_tv.append(Token('B', sindex=3, eindex=3, sentence_idx=0))
        real_tv.append(Token('C', sindex=5, eindex=5, sentence_idx=0))
        real_tv.append(Token('D', sindex=7, eindex=7, sentence_idx=0))
        real_tv.append(Token('', sindex=9, eindex=9, sentence_idx=0))
        real_tv.append(Token('E', sindex=10, eindex=10, sentence_idx=0))
        real_tv.append(Token('', sindex=12, eindex=12, sentence_idx=0))
        self.assertSequenceEqual(made_tv, real_tv)

    def test_garbage_collection(self):
        tv = TokenVector.make_token_vector(
            ["A", "", "B", "C", "D", "", "E", ""],
            sep_size=1,
            sentence_idx=0)
        clean_tv = TokenVector([])
        clean_tv.append(Token('A', sindex=0, eindex=0, sentence_idx=0))
        clean_tv.append(Token('B', sindex=3, eindex=3, sentence_idx=0))
        clean_tv.append(Token('C', sindex=5, eindex=5, sentence_idx=0))
        clean_tv.append(Token('D', sindex=7, eindex=7, sentence_idx=0))
        clean_tv.append(Token('E', sindex=10, eindex=10, sentence_idx=0))

        tv.collect_garbage()
        self.assertSequenceEqual(tv, clean_tv)

    def _get_tokens(self):  # tokenizer dummy
        return TokenVector([Token("abc", sindex=0),
                            Token("defg", sindex=4),
                            Token("ijk", sindex=9)])