from unittest import TestCase

# NOTE: Token and TokenVector are assumed to live in a project-local module;
# adjust this import path to wherever they are defined in this repository.
from tokenvector import Token, TokenVector


class TestTokenVector(TestCase):

    def setUp(self):
        self.tv = TokenVector([Token("foo"),
                               Token("abc defg ijk", sindex=4),
                               Token("bla", sindex=17)])

    def test_constructor_end_indices(self):
        # the constructor derives eindex from sindex and the token length
        self.assertEqual(self.tv[0].eindex, 2)
        self.assertEqual(self.tv[1].eindex, 15)
        self.assertEqual(self.tv[2].eindex, 19)

    def test_replace_tokens(self):
        new_tv = TokenVector([Token("abc"), Token("defg"), Token("ijk")])
        self.tv.replace_tokens_with_list({1: new_tv})
        self.assertEqual(len(self.tv), 5, "size wrong after replacement")
        for index, token in enumerate(new_tv):
            self.assertEqual(self.tv[index + 1], token)

    def test_update_indices(self):
        self.tv.update_indices(5)
        self.assertEqual(self.tv[0].eindex, 7)
        self.assertEqual(self.tv[1].eindex, 20)
        self.assertEqual(self.tv[2].eindex, 24)
        self.assertEqual(self.tv[0].sindex, 5)
        self.assertEqual(self.tv[1].sindex, 9)
        self.assertEqual(self.tv[2].sindex, 22)

    def test_get_index_length(self):
        vector = TokenVector.make_token_vector(
            ["A", "B", "C", "D"], sep_size=1, sentence_idx=0)
        self.assertEqual(vector.get_index_length(), 7)

    def test_extend(self):
        tv1 = TokenVector.make_token_vector(
            ["A", "B", "C", "D"], sep_size=1)
        tv2 = TokenVector.make_token_vector(
            ["E", "F", "G", "H"], sep_size=1)
        tv_union = TokenVector.make_token_vector(
            ["A", "B", "C", "D", "E", "F", "G", "H"], sep_size=1)
        tv1.extend(tv2, sep_size=1)
        self.assertListEqual(tv1, tv_union)

    def test_get_sentences(self):
        sentence_one_vector = TokenVector.make_token_vector(
            ["A", "B", "C", "D"], sep_size=1, sentence_idx=0)
        sentence_two_vector = TokenVector.make_token_vector(
            ["E", "F", "G", "H"], relindex=9, sep_size=1, sentence_idx=1)
        union_vector = TokenVector.make_token_vector(
            ["A", "B", "C", "D"], sep_size=1, sentence_idx=0)
        union_vector.extend(sentence_two_vector)
        sentences = union_vector.get_sentences()
        self.assertListEqual(sentences[0], sentence_one_vector)
        self.assertListEqual(sentences[1], sentence_two_vector)

    def test_make_token_vector(self):
        made_tv = TokenVector.make_token_vector(
            ["A", "", "B", "C", "D", "", "E", ""], sep_size=1, sentence_idx=0)
        real_tv = TokenVector([])
        real_tv.append(Token('A', sindex=0, eindex=0, sentence_idx=0))
        real_tv.append(Token('', sindex=2, eindex=2, sentence_idx=0))
        real_tv.append(Token('B', sindex=3, eindex=3, sentence_idx=0))
        real_tv.append(Token('C', sindex=5, eindex=5, sentence_idx=0))
        real_tv.append(Token('D', sindex=7, eindex=7, sentence_idx=0))
        real_tv.append(Token('', sindex=9, eindex=9, sentence_idx=0))
        real_tv.append(Token('E', sindex=10, eindex=10, sentence_idx=0))
        real_tv.append(Token('', sindex=12, eindex=12, sentence_idx=0))
        self.assertSequenceEqual(made_tv, real_tv)

    def test_garbage_collection(self):
        # collect_garbage drops empty tokens but keeps the survivors' indices
        tv = TokenVector.make_token_vector(
            ["A", "", "B", "C", "D", "", "E", ""], sep_size=1, sentence_idx=0)
        clean_tv = TokenVector([])
        clean_tv.append(Token('A', sindex=0, eindex=0, sentence_idx=0))
        clean_tv.append(Token('B', sindex=3, eindex=3, sentence_idx=0))
        clean_tv.append(Token('C', sindex=5, eindex=5, sentence_idx=0))
        clean_tv.append(Token('D', sindex=7, eindex=7, sentence_idx=0))
        clean_tv.append(Token('E', sindex=10, eindex=10, sentence_idx=0))
        tv.collect_garbage()
        self.assertSequenceEqual(tv, clean_tv)

    def _get_tokens(self):
        # tokenizer dummy
        return TokenVector([Token("abc", sindex=0),
                            Token("defg", sindex=4),
                            Token("ijk", sindex=9)])
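
# --- Illustration only ------------------------------------------------------
# The expected values in test_make_token_vector pin down the index layout
# without showing the algorithm. The sketch below is an assumption inferred
# from those assertions, not the project's actual implementation: each token
# ends at sindex + max(len(text) - 1, 0), and the next token starts one index
# past that, plus sep_size after non-empty tokens only (empty tokens are not
# followed by a separator).

def _make_token_vector_sketch(words, relindex=0, sep_size=1, sentence_idx=None):
    """Reproduce the token layout asserted in test_make_token_vector."""
    tokens = []
    pos = relindex
    for word in words:
        # an empty token still occupies its start index (eindex == sindex)
        eindex = pos + max(len(word) - 1, 0)
        tokens.append(Token(word, sindex=pos, eindex=eindex,
                            sentence_idx=sentence_idx))
        # advance past the token; only non-empty tokens get a separator
        pos = eindex + 1 + (sep_size if word else 0)
    return TokenVector(tokens)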