def testAddAnnotationsAreUnique(self):
    # Adding the very same (type, start, end) annotation twice must leave
    # one annotation type holding exactly one annotation.
    sentence = Sentence(TEST_TOKENS)
    sentence.addAnnotation('type1', 2, 4)
    sentence.addAnnotation('type1', 2, 4)
    self.assertEqual(len(sentence.annotations), 1)
    total = sum(len(group) for group in sentence.annotations.values())
    self.assertEqual(total, 1)
def testGetAnnotations(self):
    # getAnnotations(type) must return the set of annotations that were
    # added under that type, and nothing from other types.
    sentence = Sentence(TEST_TOKENS)
    first = sentence.addAnnotation('type1', 2, 4)
    second = sentence.addAnnotation('type2', 6)
    third = sentence.addAnnotation('type2', 8, 9)
    type1_found = sentence.getAnnotations('type1')
    self.assertEqual(type1_found, {first})
    type2_found = sentence.getAnnotations('type2')
    self.assertEqual(type2_found, {second, third})
def testGetStems(self):
    # stems(3, 7) is expected to yield the plain stems 'stem3'..'stem6'
    # even though 'mask' annotations were added on the sentence.
    sentence = Sentence(TEST_TOKENS)
    sentence.addAnnotation('mask', 2, 4)
    sentence.addAnnotation('mask', 6)
    actual = list(sentence.stems(3, 7))
    expected = ['stem3', 'stem4', 'stem5', 'stem6']
    self.assertListEqual(actual, expected)
def testGetPoSTags(self):
    # posTags(3, 7) is expected to yield 'pos3'..'pos6' even though
    # 'mask' annotations were added on the sentence.
    sentence = Sentence(TEST_TOKENS)
    sentence.addAnnotation('mask', 2, 4)
    sentence.addAnnotation('mask', 6)
    actual = list(sentence.posTags(3, 7))
    expected = ['pos3', 'pos4', 'pos5', 'pos6']
    self.assertListEqual(actual, expected)
def testGetPhraseTag(self):
    # Each annotation reports the tag of the phrase it sits in: 'O' for
    # the one at token 2, 'NP' for the ones at tokens 0-1 and token 4.
    sentence = Sentence(TEST_TOKENS)
    at_two = sentence.addAnnotation('t', 2)
    at_zero_one = sentence.addAnnotation('t', 0, 2)
    at_four = sentence.addAnnotation('t', 4)
    cases = ((at_two, 'O'), (at_zero_one, 'NP'), (at_four, 'NP'))
    for annotation, tag in cases:
        self.assertEqual(annotation.getPhraseTag_(), tag)
def testGetPhraseNumber(self):
    # Expected phrase numbers: 0 for the annotation at token 8, 1 for the
    # one spanning tokens 0-1, and 2 for the one at token 4.
    sentence = Sentence(TEST_TOKENS)
    at_eight = sentence.addAnnotation('t', 8)
    at_zero_one = sentence.addAnnotation('t', 0, 2)
    at_four = sentence.addAnnotation('t', 4)
    cases = ((at_eight, 0), (at_zero_one, 1), (at_four, 2))
    for annotation, number in cases:
        self.assertEqual(annotation.getPhraseNumber_(), number)
def testVerbPhraseBetweenExactOverlap(self):
    # Two annotations placed on the same token (3) have nothing between
    # them, so verbPhraseBetween must return None -- even though a verb
    # phrase (tokens 0-1, stem "sentinel") exists elsewhere in the sentence.
    tokens = list(TEST_TOKENS)
    tokens[0] = tokens[0].replace(chunk="B-VP")
    tokens[1] = tokens[1].replace(chunk="I-VP", stem="sentinel")
    s = Sentence(tokens)
    this = s.addAnnotation('this', 3)
    other = s.addAnnotation('other', 3)
    # Identity check: assertEqual(x, None) would also pass for any object
    # whose __eq__ happens to accept None.
    self.assertIsNone(this.verbPhraseBetween(other))
def testPhraseDistanceIfOverlapping(self):
    # Annotations whose token spans overlap are expected to be at phrase
    # distance 0 from each other.
    sentence = Sentence(TEST_TOKENS)
    wide_left = sentence.addAnnotation('A', 0, 2)
    inner_left = sentence.addAnnotation('B', 1)
    inner_right = sentence.addAnnotation('C', 7)
    wide_right = sentence.addAnnotation('D', 6, 8)
    left_distance = wide_left.phraseDistanceTo(inner_left)
    self.assertEqual(left_distance, 0)
    right_distance = inner_right.phraseDistanceTo(wide_right)
    self.assertEqual(right_distance, 0)
def testIsInsidePhrase(self):
    # All three annotations (token 3, tokens 0-1, token 8) are expected to
    # report isInsidePhrase() as true.
    sentence = Sentence(TEST_TOKENS)
    at_three = sentence.addAnnotation('true', 3)
    at_zero_one = sentence.addAnnotation('true', 0, 2)
    at_eight = sentence.addAnnotation('false', 8)
    for annotation in (at_three, at_zero_one, at_eight):
        self.assertTrue(annotation.isInsidePhrase())
def testGetPrepositionedNounPhrase(self):
    # Tokens 3-4 are turned into a prepositional phrase; the annotation at
    # token 6 must then report ['sentinel', 'stem1'], where token 0 is
    # covered by an annotation of type 'sentinel'.
    tokens = list(TEST_TOKENS)
    for index, chunk_tag in ((3, "B-PP"), (4, "I-PP")):
        tokens[index] = tokens[index].replace(chunk=chunk_tag)
    sentence = Sentence(tokens)
    sentence.addAnnotation('sentinel', 0)
    target = sentence.addAnnotation('type', 6)
    actual = list(target.getPrepositionedNounPhrase_())
    self.assertListEqual(actual, ['sentinel', 'stem1'])
def testVerbPhraseBetween(self):
    # Tokens 3-4 are turned into a verb phrase with sentinel stems; that
    # phrase lies between the annotations at tokens 0 and 6 and must be
    # what verbPhraseBetween yields.
    tokens = list(TEST_TOKENS)
    replacements = (
        (3, "B-VP", "sentinel1"),
        (4, "I-VP", "sentinel2"),
    )
    for index, chunk_tag, stem in replacements:
        tokens[index] = tokens[index].replace(chunk=chunk_tag, stem=stem)
    sentence = Sentence(tokens)
    this = sentence.addAnnotation('this', 0)
    other = sentence.addAnnotation('other', 6)
    between = list(this.verbPhraseBetween(other))
    self.assertListEqual(between, ['sentinel1', 'sentinel2'])
def testTokenDistanceIfOverlapping(self):
    # The annotation over tokens 1-2 overlaps each of the others, and the
    # token distance to every one of them is expected to be -1.
    sentence = Sentence(TEST_TOKENS)
    ann_a = sentence.addAnnotation('A', 2, 4)
    ann_b = sentence.addAnnotation('B', 2, 3)
    reference = sentence.addAnnotation('C', 1, 3)
    ann_d = sentence.addAnnotation('D', 0, 2)
    ann_e = sentence.addAnnotation('E', 1, 2)
    for overlapping in (ann_a, ann_b, ann_d, ann_e):
        self.assertEqual(reference.tokenDistanceTo(overlapping), -1)
def testPhraseDistance(self):
    # Phrase distances measured from the annotation over tokens 6-7; the
    # distance to itself is expected to be -1. The failing annotation's
    # repr is used as the assertion message.
    sentence = Sentence(TEST_TOKENS)
    ann_a = sentence.addAnnotation('A', 0, 2)
    ann_b = sentence.addAnnotation('B', 3)
    reference = sentence.addAnnotation('C', 6, 8)
    ann_d = sentence.addAnnotation('D', 9)
    ann_e = sentence.addAnnotation('E', 3, 5)
    expectations = (
        (ann_a, 1),
        (ann_b, 1),
        (reference, -1),
        (ann_d, 0),
        (ann_e, 0),
    )
    for target, expected in expectations:
        self.assertEqual(reference.phraseDistanceTo(target), expected,
                         msg=repr(target))
def testTokenDistance(self):
    # Token distances measured from the annotation over tokens 6-7 to
    # annotations that do not overlap it; directly adjacent ones give 0.
    sentence = Sentence(TEST_TOKENS)
    ann_a = sentence.addAnnotation('A', 0, 2)
    ann_b = sentence.addAnnotation('B', 3)
    reference = sentence.addAnnotation('C', 6, 8)
    ann_d = sentence.addAnnotation('D', 8)
    ann_e = sentence.addAnnotation('E', 4, 6)
    expectations = ((ann_a, 4), (ann_b, 2), (ann_d, 0), (ann_e, 0))
    for target, expected in expectations:
        self.assertEqual(reference.tokenDistanceTo(target), expected)
def testPhraseTagsTo(self):
    # phraseTagsBetween yields the tags of the phrases lying between two
    # annotations; it is empty for an annotation paired with itself.
    sentence = Sentence(TEST_TOKENS)
    at_five = sentence.addAnnotation('true', 5)
    at_zero_one = sentence.addAnnotation('true', 0, 2)
    at_three_four = sentence.addAnnotation('true', 3, 5)
    at_zero = sentence.addAnnotation('true', 0)
    at_nine = sentence.addAnnotation('true', 9)
    cases = (
        (at_five, at_zero_one, ['NP']),
        (at_five, at_five, []),
        (at_zero_one, at_three_four, []),
        (at_zero, at_nine, ['NP', 'NP', 'NP']),
    )
    for source, target, expected in cases:
        self.assertEqual(list(source.phraseTagsBetween(target)), expected)
def testPosTagsTo(self):
    # posTagsBetween yields the PoS tags of the tokens lying between two
    # annotations; it is empty for an annotation paired with itself.
    sentence = Sentence(TEST_TOKENS)
    at_five = sentence.addAnnotation('true', 5)
    at_zero_one = sentence.addAnnotation('true', 0, 2)
    at_three_four = sentence.addAnnotation('true', 3, 5)
    at_zero = sentence.addAnnotation('true', 0)
    at_nine = sentence.addAnnotation('true', 9)

    tags = list(at_five.posTagsBetween(at_zero_one))
    self.assertEqual(tags, ['pos2', 'pos3', 'pos4'])

    tags = list(at_five.posTagsBetween(at_five))
    self.assertEqual(tags, [])

    tags = list(at_zero_one.posTagsBetween(at_three_four))
    self.assertEqual(tags, ['pos2'])

    tags = list(at_zero.posTagsBetween(at_nine))
    self.assertEqual(tags, ['pos%d' % index for index in range(1, 9)])
def testGetPhraseStemsMultiPhrase(self):
    # Old behaviour reported the mask once per covered token, i.e.
    # ['mask'] * 5; the new behaviour reports a single mask token rather
    # than one for each token in a row.
    sentence = Sentence(TEST_TOKENS)
    masked = sentence.addAnnotation('mask', 3, 8)
    phrase_stems = list(masked.getPhraseStems())
    self.assertListEqual(phrase_stems, ['mask'])
def testGetMaskedStems(self):
    # maskedStems(3, 7) must substitute the annotation type for annotated
    # tokens and keep the plain stem for the others.
    sentence = Sentence(TEST_TOKENS)
    sentence.addAnnotation('type1', 2, 4)
    sentence.addAnnotation('type2', 6)
    sentence.addAnnotation('type2', 8, 9)
    masked = list(sentence.maskedStems(3, 7))
    self.assertListEqual(masked, ['type1', 'stem4', 'stem5', 'type2'])
def testGetMaskedStems(self):
    # In the 3..7 window, annotated tokens are expected to show up as
    # their annotation type and the remaining tokens as their stem.
    annotated = Sentence(TEST_TOKENS)
    spans = (('type1', 2, 4), ('type2', 6), ('type2', 8, 9))
    for span in spans:
        annotated.addAnnotation(*span)
    expected = ['type1', 'stem4', 'stem5', 'type2']
    self.assertListEqual(list(annotated.maskedStems(3, 7)), expected)
def testGetWords(self):
    # words() must return the plain words regardless of annotations: all
    # of them without arguments, and 'word7'..'word9' when called with 7.
    sentence = Sentence(TEST_TOKENS)
    sentence.addAnnotation('mask', 2, 4)
    sentence.addAnnotation('mask', 6)
    sentence.addAnnotation('mask', 8, 9)
    every_word = list(sentence.words())
    expected_all = ['word%d' % index for index in range(len(sentence))]
    self.assertListEqual(every_word, expected_all)
    tail_words = list(sentence.words(7))
    self.assertListEqual(tail_words, ['word7', 'word8', 'word9'])
def testAddAnnotation(self):
    # Three annotations of two types must end up as two keys in
    # `annotations`, each mapping to a set, holding three items overall.
    sentence = Sentence(TEST_TOKENS)
    sentence.addAnnotation('type1', 2, 4)
    sentence.addAnnotation('type2', 6)
    sentence.addAnnotation('type2', 8, 9)
    self.assertEqual(len(sentence.annotations), 2)
    type_names = set(sentence.annotations.keys())
    self.assertEqual(type_names, {'type1', 'type2'})
    only_sets = all(
        isinstance(group, set) for group in sentence.annotations.values()
    )
    self.assertTrue(only_sets)
    total = sum(len(group) for group in sentence.annotations.values())
    self.assertEqual(total, 3)
def testGetWords(self):
    # Annotations must not alter words(): the full call gives every
    # 'wordN', and words(7) gives 'word7', 'word8' and 'word9'.
    annotated = Sentence(TEST_TOKENS)
    spans = (('mask', 2, 4), ('mask', 6), ('mask', 8, 9))
    for span in spans:
        annotated.addAnnotation(*span)
    self.assertListEqual(
        list(annotated.words()),
        ['word%d' % position for position in range(len(annotated))],
    )
    self.assertListEqual(
        list(annotated.words(7)),
        ['word7', 'word8', 'word9'],
    )
def testAddAnnotation(self):
    # After adding one 'type1' and two 'type2' annotations, `annotations`
    # must have exactly those two keys, with set values and three
    # annotations in total.
    annotated = Sentence(TEST_TOKENS)
    spans = (('type1', 2, 4), ('type2', 6), ('type2', 8, 9))
    for span in spans:
        annotated.addAnnotation(*span)
    self.assertEqual(len(annotated.annotations), 2)
    self.assertEqual(set(annotated.annotations.keys()), {'type1', 'type2'})
    values_are_sets = all(
        isinstance(members, set)
        for members in annotated.annotations.values()
    )
    self.assertTrue(values_are_sets)
    member_count = sum(
        len(members) for members in annotated.annotations.values()
    )
    self.assertEqual(member_count, 3)
def testGetMaskedWords(self):
    # maskedWords() replaces annotated tokens with the annotation type; a
    # multi-token annotation contributes one masked token only.
    sentence = Sentence(TEST_TOKENS)
    # should only fetch one masked token ("type1")
    sentence.addAnnotation('type1', 2, 4)
    sentence.addAnnotation('type2', 6)
    sentence.addAnnotation('type2', 8, 9)
    self.maxDiff = None  # show the complete diff if the lists differ
    all_masked = list(sentence.maskedWords())
    expected_all = [
        'word0', 'word1', 'type1', 'word4', 'word5',
        'type2', 'word7', 'type2', 'word9',
    ]
    self.assertListEqual(all_masked, expected_all)
    tail_masked = list(sentence.maskedWords(7))
    self.assertListEqual(tail_masked, ['word7', 'type2', 'word9'])
def testGetMaskedWords(self):
    # Annotated tokens must appear as their annotation type in
    # maskedWords(), with the two-token 'type1' span collapsed to one entry.
    annotated = Sentence(TEST_TOKENS)
    # should only fetch one masked token ("type1")
    annotated.addAnnotation('type1', 2, 4)
    annotated.addAnnotation('type2', 6)
    annotated.addAnnotation('type2', 8, 9)
    self.maxDiff = None  # do not truncate the diff on failure
    self.assertListEqual(
        list(annotated.maskedWords()),
        ['word0', 'word1', 'type1', 'word4', 'word5', 'type2', 'word7',
         'type2', 'word9'],
    )
    self.assertListEqual(
        list(annotated.maskedWords(7)),
        ['word7', 'type2', 'word9'],
    )
def testIsNotInsidePhrase(self):
    # An annotation over tokens 1-2 is expected NOT to be inside a phrase.
    sentence = Sentence(TEST_TOKENS)
    straddling = sentence.addAnnotation('false', 1, 3)
    inside = straddling.isInsidePhrase()
    self.assertFalse(inside)
def testTokenDistanceIfEqual(self):
    # Two annotations covering the identical two-token span are expected
    # to have a token distance of -2.
    sentence = Sentence(TEST_TOKENS)
    first = sentence.addAnnotation('A', 0, 2)
    second = sentence.addAnnotation('B', 0, 2)
    distance = first.tokenDistanceTo(second)
    self.assertEqual(distance, -2)
def testGetPhraseOffsetMultiPhrase(self):
    # For the annotation added with offsets 4, 7 the phrase offset is
    # expected to be (4, 7).
    sentence = Sentence(TEST_TOKENS)
    masked = sentence.addAnnotation('mask', 4, 7)
    offset = masked.getPhraseOffset()
    self.assertEqual(offset, (4, 7))
def testTokenDistanceOnDifferentSentences(self):
    # Measuring the token distance between annotations that belong to two
    # separate Sentence instances must raise ValueError.
    first_sentence = Sentence(TEST_TOKENS)
    second_sentence = Sentence(TEST_TOKENS)
    in_first = first_sentence.addAnnotation('type', 0, 2)
    in_second = second_sentence.addAnnotation('type', 6, 8)
    with self.assertRaises(ValueError):
        in_first.tokenDistanceTo(in_second)
def testPhraseDistanceIfBothNotInPhrase(self):
    # The annotations at tokens 2 and 5 are expected to be at phrase
    # distance 1 from each other.
    sentence = Sentence(TEST_TOKENS)
    at_two = sentence.addAnnotation('A', 2)
    at_five = sentence.addAnnotation('B', 5)
    distance = at_two.phraseDistanceTo(at_five)
    self.assertEqual(distance, 1)
def testGetPhraseWordsOutside(self):
    # For the annotation at token 2, getPhraseWords() is expected to
    # yield only the mask itself.
    sentence = Sentence(TEST_TOKENS)
    masked = sentence.addAnnotation('mask', 2)
    phrase_words = list(masked.getPhraseWords())
    self.assertListEqual(phrase_words, ['mask'])
def testGetPhraseStems(self):
    # For the annotation at token 3, getPhraseStems() is expected to
    # yield the mask followed by 'stem4'.
    sentence = Sentence(TEST_TOKENS)
    masked = sentence.addAnnotation('mask', 3)
    phrase_stems = list(masked.getPhraseStems())
    self.assertListEqual(phrase_stems, ['mask', 'stem4'])
def testCopyConstructor(self):
    # Building a Sentence from another Sentence must carry over its
    # annotations.
    original = Sentence(TEST_TOKENS)
    original.addAnnotation("ann", 1)
    clone = Sentence(original)
    copied = clone.getAnnotations("ann")
    source = original.getAnnotations("ann")
    self.assertEqual(copied, source)
def testAddAnnotationsAreUnique(self):
    # A repeated addAnnotation call with identical arguments must not
    # create a second entry: one type, one annotation.
    annotated = Sentence(TEST_TOKENS)
    for _ in range(2):
        annotated.addAnnotation('type1', 2, 4)
    self.assertEqual(len(annotated.annotations), 1)
    member_count = sum(
        len(members) for members in annotated.annotations.values()
    )
    self.assertEqual(member_count, 1)
def testGetPhraseTagFailure(self):
    # Asking for the phrase tag of either wide annotation (tokens 0-4 and
    # tokens 2-8) must raise ValueError.
    sentence = Sentence(TEST_TOKENS)
    wide_left = sentence.addAnnotation('true', 0, 5)
    wide_right = sentence.addAnnotation('true', 2, 9)
    for annotation in (wide_left, wide_right):
        with self.assertRaises(ValueError):
            annotation.getPhraseTag_()
def testPhraseDistanceIfBothInOverlappingPhrase(self):
    # The annotations over tokens 0-4 and 6-9 are expected to be at
    # phrase distance 0.
    sentence = Sentence(TEST_TOKENS)
    left = sentence.addAnnotation('A', 0, 5)
    right = sentence.addAnnotation('B', 6, 10)
    distance = left.phraseDistanceTo(right)
    self.assertEqual(distance, 0)
def testGetPhraseOffsetOutside(self):
    # For the single-token annotation at token 2 the phrase offset is
    # expected to be (2, 3).
    sentence = Sentence(TEST_TOKENS)
    masked = sentence.addAnnotation('mask', 2)
    offset = masked.getPhraseOffset()
    self.assertEqual(offset, (2, 3))