def test_multiple_matches(self):
    """Run WikipediaResolver on a chain with two same-URL candidates.

    Candidates '122' and '123' carry the identical ``John_Smith`` URL while
    '124' points at ``John_P_Smith``.  After resolving, the test expects a
    single chain holding two candidates and no selected entity.
    """
    mention = Mention('John Smith', 'doc1', (4, 8), (), EntityType.PER)
    document = unittest.mock.Mock()
    document.doc_id = 'doc1'
    document.mention_chains = [MentionChain([mention])]
    document.mention_chains[0].candidates = [
        Entity(
            '122', EntityType.PER, 'John Smith', EntityOrigin.WLL,
            urls=['http://en.wikipedia.org/wiki/John_Smith'],
        ),
        Entity(
            '123', EntityType.PER, 'John Smith', EntityOrigin.WLL,
            urls=['http://en.wikipedia.org/wiki/John_Smith'],
        ),
        Entity(
            '124', EntityType.PER, 'Jake Smith', EntityOrigin.WLL,
            urls=['http://en.wikipedia.org/wiki/John_P_Smith'],
        ),
    ]

    WikipediaResolver().resolve(document)

    # Re-read the chain list after resolving rather than reusing a local.
    chains_after = document.mention_chains
    self.assertEqual(1, len(chains_after))
    self.assertEqual(2, len(chains_after[0].candidates))
    self.assertIsNone(chains_after[0].entity)
def test(self):
    """Run ExactNameResolver then WikipediaResolver through a cascade.

    The three candidates mix names ('John Smith' twice, 'Jake Smith' once)
    and URLs (``John_Smith`` twice, ``John_H_Smith`` once).  The test
    expects two candidates to remain and entity '122' to be selected.
    """
    mention = Mention('John Smith', 'doc1', (4, 8), (), EntityType.PER)
    document = unittest.mock.Mock()
    document.doc_id = 'doc1'
    document.mention_chains = [MentionChain([mention])]
    document.mention_chains[0].candidates = [
        Entity(
            '122', EntityType.PER, 'John Smith', EntityOrigin.WLL,
            urls=['http://en.wikipedia.org/wiki/John_Smith'],
        ),
        Entity(
            '123', EntityType.PER, 'John Smith', EntityOrigin.WLL,
            urls=['http://en.wikipedia.org/wiki/John_H_Smith'],
        ),
        Entity(
            '124', EntityType.PER, 'Jake Smith', EntityOrigin.WLL,
            urls=['http://en.wikipedia.org/wiki/John_Smith'],
        ),
    ]

    stages = [ExactNameResolver(), WikipediaResolver()]
    CascadeResolver(stages).resolve(document)

    chains_after = document.mention_chains
    self.assertEqual(1, len(chains_after))
    self.assertEqual(2, len(chains_after[0].candidates))
    self.assertEqual('122', chains_after[0].entity.id)
def test(self):
    """Check the value SharedTokensFeature writes for 'Nueva York'.

    The entity is named 'New York City' with extra names 'NYC' and
    'New York'; the first slot of the feature vector is expected to be
    approximately 0.5.
    """
    features = FeatureVector()
    candidate = Entity(
        '1', EntityType.GPE, 'New York City', EntityOrigin.GEO)
    candidate.names = {'NYC', 'New York'}
    mention = Mention('Nueva York', 'doc1', (), (), EntityType.GPE)
    mention_chain = MentionChain([mention])

    SharedTokensFeature().extract(mention_chain, candidate, None, features)

    self.assertAlmostEqual(0.5, features.data[0])
def test_match(self):
    """Check ExactMatchFeature is truthy when a translation is present.

    The mention text is 'Nueva York' and its ``translate_string`` is set to
    'new york'; the entity's names include 'New York'.
    """
    features = FeatureVector()
    candidate = Entity('1', EntityType.GPE, 'New York', EntityOrigin.GEO)
    candidate.names = {'New York', 'New York City', 'NYC'}
    mention_chain = MentionChain(
        [Mention('Nueva York', 'doc1', (), (), EntityType.GPE)])
    # Set the translation through the chain, after it has been built.
    first_mention = mention_chain.mentions[0]
    first_mention.translate_string = 'new york'

    ExactMatchFeature().extract(mention_chain, candidate, None, features)

    self.assertTrue(features.data[0])
def test_positive(self):
    """Check WikipediaFeature is truthy for a 'New York City' mention.

    The entity carries the ``New_York_City`` Wikipedia URL and the names
    'New York', 'New York City' and 'NYC'.
    """
    features = FeatureVector()
    candidate = Entity(
        '1', EntityType.GPE, 'New York', EntityOrigin.GEO,
        urls=['http://en.wikipedia.org/wiki/New_York_City'],
    )
    candidate.names = {'New York', 'New York City', 'NYC'}
    mention = Mention('New York City', 'doc1', (), (), EntityType.GPE)
    mention_chain = MentionChain([mention])

    WikipediaFeature().extract(mention_chain, candidate, None, features)

    self.assertTrue(features.data[0])
def test(self):
    """Write a two-chain document and verify the first output line.

    The first chain ('Henry') is linked to entity '67'; the second chain
    has no entity.  Only the first line emitted by OutputWriter is checked,
    after stripping surrounding whitespace.
    """
    henry_chain = MentionChain([
        Mention('Henry', '_WL_doc34', (123, 128), (17, 17),
                EntityType.PER, 'Men1'),
    ])
    smith_chain = MentionChain([
        Mention('Ed Smith', '_WL_doc34', (141, 149), (22, 23),
                EntityType.PER, 'Men2'),
        Mention('Ed', '_WL_doc34', (197, 199), (44, 44),
                EntityType.PER, 'Men3'),
    ])
    henry_chain.entity = Entity(
        '67', EntityType.PER, 'Henry', EntityOrigin.WLL)
    smith_chain.entity = None

    all_mentions = henry_chain.mentions + smith_chain.mentions
    document = Document(all_mentions, DocType.WL, Lang.ENG, [], [])
    document.mention_chains = [henry_chain, smith_chain]

    out = io.StringIO()
    OutputWriter(out, 'test', 0.75).write(document)

    # Rewind so the written content can be read back line by line.
    out.seek(0)
    written = out.readlines()
    expected_first = "test\tMen1\tHenry\t_WL_doc34:123-128\t67\tPER\tNAM\t0.75"
    self.assertEqual(expected_first, written[0].strip())
def test_multiple_matches(self):
    """Run ExactNameResolver when two candidates share the mention's name.

    Candidates '122' and '123' are both named 'John Smith' and '124' is
    'Jake Smith'.  The test expects two candidates to remain and no entity
    to be selected.
    """
    mention = Mention('John Smith', 'doc1', (4, 8), (), EntityType.PER)
    document = unittest.mock.Mock()
    document.doc_id = 'doc1'
    document.mention_chains = [MentionChain([mention])]
    document.mention_chains[0].candidates = [
        Entity('122', EntityType.PER, 'John Smith', EntityOrigin.WLL),
        Entity('123', EntityType.PER, 'John Smith', EntityOrigin.WLL),
        Entity('124', EntityType.PER, 'Jake Smith', EntityOrigin.WLL),
    ]

    ExactNameResolver().resolve(document)

    chains_after = document.mention_chains
    self.assertEqual(1, len(chains_after))
    self.assertEqual(2, len(chains_after[0].candidates))
    self.assertIsNone(chains_after[0].entity)
def test_no_match(self):
    """Check ExactMatchFeature is falsy for 'New York' vs 'Nueva York'.

    The entity is named 'Nueva York' and the mention reads 'New York'; no
    extra names or translation are set.
    """
    features = FeatureVector()
    candidate = Entity('1', EntityType.GPE, 'Nueva York', EntityOrigin.GEO)
    mention = Mention('New York', 'doc1', (), (), EntityType.GPE)
    mention_chain = MentionChain([mention])

    ExactMatchFeature().extract(mention_chain, candidate, None, features)

    self.assertFalse(features.data[0])
def test_not_multi_token_name(self):
    """Check LastNameFeature is falsy for the one-token mention 'Smith'.

    The candidate entity is named 'John Smith'.
    """
    features = FeatureVector()
    candidate = Entity('1', EntityType.PER, 'John Smith', EntityOrigin.WLL)
    mention = Mention('Smith', 'doc1', (), (), EntityType.PER)
    mention_chain = MentionChain([mention])

    LastNameFeature().extract(mention_chain, candidate, None, features)

    self.assertFalse(features.data[0])
def test_positive(self):
    """Check LastNameFeature is truthy for 'Jep Smith' vs 'John Smith'."""
    features = FeatureVector()
    candidate = Entity('1', EntityType.PER, 'John Smith', EntityOrigin.WLL)
    mention = Mention('Jep Smith', 'doc1', (), (), EntityType.PER)
    mention_chain = MentionChain([mention])

    LastNameFeature().extract(mention_chain, candidate, None, features)

    self.assertTrue(features.data[0])
def test(self):
    """Feed one document to ResolverScorer and check two PER counters.

    Ground truth holds five PER spans for 'doc1' (four links, one NIL).
    The document has four chains with hand-assigned candidates and
    entities.  The test expects 2 mentions with a correct candidate and
    1 mention with the correct entity.
    """
    def per_link(entity_ids):
        # Ground-truth PER link to the given entity ids.
        return Link(EntityType.PER, LinkType.LINK, entity_ids, None)

    def per_mention(span):
        # Empty-text PER mention in 'doc1' at the given span.
        return Mention('', 'doc1', span, (), EntityType.PER)

    def per_entity(entity_id):
        # Nameless PER entity with the given id.
        return Entity(entity_id, EntityType.PER, '', EntityOrigin.WLL)

    doc1_links = {
        (0, 2): per_link(['123', '122']),
        (4, 8): Link(EntityType.PER, LinkType.NIL, [], 'NIL999'),
        (10, 12): per_link(['222']),
        (16, 17): per_link(['333']),
        (18, 19): per_link(['444']),
    }
    ground_truth = {'doc1': doc1_links}

    document = unittest.mock.Mock()
    document.doc_id = 'doc1'
    document.mention_chains = [
        MentionChain([
            per_mention((0, 2)),
            per_mention((3, 7)),
            per_mention((16, 17)),
        ]),
        MentionChain([per_mention((4, 8))]),
        MentionChain([per_mention((10, 12))]),
        MentionChain([per_mention((18, 19))]),
    ]

    # Per chain: candidate ids, and whether the first candidate is chosen
    # as the chain's entity (otherwise the entity is None).
    chain_setup = [
        (['122'], True),
        (['147'], True),
        (['198', '222'], False),
        (['17'], True),
    ]
    for chain, (ids, pick_first) in zip(document.mention_chains,
                                        chain_setup):
        chain.candidates = [per_entity(entity_id) for entity_id in ids]
        if pick_first:
            chain.entity = chain.candidates[0]
        else:
            chain.entity = None

    scorer = ResolverScorer(ground_truth)
    scorer.update(document)

    self.assertEqual(
        2,
        scorer.report.num_mentions_with_correct_candidate[EntityType.PER])
    self.assertEqual(
        1,
        scorer.report.num_mentions_correct_entity[EntityType.PER])
def test_one_match(self):
    """Run ExactNameResolver when one candidate lists 'john smith'.

    The first candidate is named 'John' but its ``names`` set contains
    'john smith'; the second is named 'Not John'.  The test expects both
    candidates to remain and the first one to be the selected entity.
    """
    mention = Mention('John Smith', 'doc1', (4, 8), (), EntityType.PER)
    document = unittest.mock.Mock()
    document.doc_id = 'doc1'
    document.mention_chains = [MentionChain([mention])]

    expected = Entity('122', EntityType.PER, 'John', EntityOrigin.WLL)
    expected.names = {'John', 'J. Smith', 'john smith'}
    other = Entity('123', EntityType.PER, 'Not John', EntityOrigin.WLL)
    document.mention_chains[0].candidates = [expected, other]

    ExactNameResolver().resolve(document)

    chains_after = document.mention_chains
    self.assertEqual(1, len(chains_after))
    self.assertEqual(2, len(chains_after[0].candidates))
    self.assertEqual(expected, chains_after[0].entity)
def test(self):
    """Resolve one chain with SvmResolver and check the chosen entity id.

    The classifier comes from ``self.trainClassifier()`` and features from
    an EntityFeatureExtractor wrapping SimpleFeature.  Of the four
    candidates, the test expects '124' ('John P. Smith') to be selected.
    """
    trained = self.trainClassifier()
    feature_extractor = EntityFeatureExtractor(SimpleFeature())
    svm_resolver = SvmResolver(trained, feature_extractor)

    mention = Mention('John Smith', 'doc1', (), (), EntityType.PER)
    document = unittest.mock.Mock()
    document.mention_chains = [MentionChain([mention])]
    document.mention_chains[0].candidates = [
        Entity('122', EntityType.PER, 'John Smith', EntityOrigin.WLL),
        Entity('123', EntityType.PER, 'John Smith', EntityOrigin.WLL),
        Entity('124', EntityType.PER, 'John P. Smith', EntityOrigin.WLL),
        Entity('125', EntityType.PER, 'Jake Smith', EntityOrigin.WLL),
    ]

    svm_resolver.resolve(document)

    chosen = document.mention_chains[0].entity
    self.assertEqual('124', chosen.id)
def test_one_match(self):
    """Run WikipediaResolver when only one candidate carries a URL.

    The first candidate has the ``John_Smith`` Wikipedia URL; the second
    has no URLs passed.  The test expects both candidates to remain and
    the first one to be the selected entity.
    """
    mention = Mention('John Smith', 'doc1', (4, 8), (), EntityType.PER)
    document = unittest.mock.Mock()
    document.doc_id = 'doc1'
    document.mention_chains = [MentionChain([mention])]

    expected = Entity(
        '122', EntityType.PER, 'John', EntityOrigin.WLL,
        urls=['http://en.wikipedia.org/wiki/John_Smith'],
    )
    other = Entity('123', EntityType.PER, 'Not John', EntityOrigin.WLL)
    document.mention_chains[0].candidates = [expected, other]

    WikipediaResolver().resolve(document)

    chains_after = document.mention_chains
    self.assertEqual(1, len(chains_after))
    self.assertEqual(2, len(chains_after[0].candidates))
    self.assertEqual(expected, chains_after[0].entity)
def test_no_match(self):
    """Run WikipediaResolver when the sole candidate has no URLs passed.

    The test expects the single candidate to remain and no entity to be
    selected.
    """
    mention = Mention('John Smith', 'doc1', (4, 8), (), EntityType.PER)
    document = unittest.mock.Mock()
    document.doc_id = 'doc1'
    document.mention_chains = [MentionChain([mention])]
    document.mention_chains[0].candidates = [
        Entity('122', EntityType.PER, 'John', EntityOrigin.WLL),
    ]

    WikipediaResolver().resolve(document)

    chains_after = document.mention_chains
    self.assertEqual(1, len(chains_after))
    self.assertEqual(1, len(chains_after[0].candidates))
    self.assertIsNone(chains_after[0].entity)
def test2(self):
    """Check OriginFeature writes 1 for an entity with origin AUG.

    No mention chain or document is passed to ``extract``; only the entity
    is supplied.
    """
    features = FeatureVector()
    candidate = Entity(
        '1', EntityType.PER, 'Christopher', EntityOrigin.AUG)

    OriginFeature().extract(None, candidate, None, features)

    self.assertEqual(1, features.data[0])