Exemple #1
0
    def test(self):
        """Run a GPE/LOC TypeSpecificStage over five singleton chains.

        The input holds one LOC, two GPE, one ORG and one PER chain. After
        the update the document is expected to hold three chains whose
        sizes are 1, 1 and 3.
        """
        specs = [
            ('New York', (141, 149), (22, 23), EntityType.LOC, 'Men1'),
            ('New York City', (146, 154), (24, 25), EntityType.GPE, 'Men2'),
            ('NY Yankees', (173, 181), (36, 37), EntityType.ORG, 'Men3'),
            ('Ed Koch', (186, 194), (51, 52), EntityType.PER, 'Men4'),
            ('NYC', (237, 245), (71, 72), EntityType.GPE, 'Men5'),
        ]
        location_stage = TypeSpecificStage(CombineEverything(),
                                           EntityType.GPE, EntityType.LOC)
        doc = unittest.mock.Mock()
        doc.mention_chains = [
            MentionChain(
                [Mention(text, '_DF_doc34', chars, tokens, etype, mid)])
            for text, chars, tokens, etype, mid in specs
        ]

        location_stage.update(doc)

        self.assertEqual(3, len(doc.mention_chains))
        chain_sizes = sorted(len(chain) for chain in doc.mention_chains)
        self.assertEqual([1, 1, 3], chain_sizes)
Exemple #2
0
    def test(self):
        """Write a two-chain document and verify the first emitted line.

        The first chain is given an Entity with id '67'; the second chain's
        entity is None. Only the first line written to the buffer is checked.
        """
        henry_chain = MentionChain([
            Mention('Henry', '_WL_doc34', (123, 128), (17, 17),
                    EntityType.PER, 'Men1'),
        ])
        ed_chain = MentionChain([
            Mention('Ed Smith', '_WL_doc34', (141, 149), (22, 23),
                    EntityType.PER, 'Men2'),
            Mention('Ed', '_WL_doc34', (197, 199), (44, 44),
                    EntityType.PER, 'Men3'),
        ])
        henry_chain.entity = Entity('67', EntityType.PER, 'Henry',
                                    EntityOrigin.WLL)
        ed_chain.entity = None

        all_mentions = henry_chain.mentions + ed_chain.mentions
        doc = Document(all_mentions, DocType.WL, Lang.ENG, [], [])
        doc.mention_chains = [henry_chain, ed_chain]

        out = io.StringIO()
        OutputWriter(out, 'test', 0.75).write(doc)

        # Rewind so the freshly written content can be read back.
        out.seek(0)
        written = out.readlines()
        expected = "test\tMen1\tHenry\t_WL_doc34:123-128\t67\tPER\tNAM\t0.75"
        self.assertEqual(expected, written[0].strip())
Exemple #3
0
 def test_update1(self):
     """Score two predicted chains against one four-mention gold cluster.

     With the MUC metric the scorer is expected to report recall 2/3 and
     precision 2/2.
     """
     # first example from table 1 in https://www.aclweb.org/anthology/M95-1005
     gold_spans = [(0, 1), (2, 5), (8, 11), (14, 17)]
     gt = {
         'doc1': {
             span: Link(EntityType.PER, LinkType.LINK, 'NIL1', None)
             for span in gold_spans
         },
     }

     first_chain = MentionChain([
         Mention('1', 'doc1', (0, 1), (), EntityType.PER, 'NIL10'),
         Mention('2', 'doc1', (2, 5), (), EntityType.PER, 'NIL10'),
     ])
     second_chain = MentionChain([
         Mention('3', 'doc1', (8, 11), (), EntityType.PER, 'NIL11'),
         Mention('4', 'doc1', (14, 17), (), EntityType.PER, 'NIL11'),
     ])
     doc = unittest.mock.Mock()
     doc.doc_id = 'doc1'
     doc.mention_chains = [first_chain, second_chain]

     muc_scorer = CorefScorer(gt, CorefMetric.MUC)
     muc_scorer.update(doc)

     self.assertEqual(2, muc_scorer.recall_numerator)
     self.assertEqual(3, muc_scorer.recall_denominator)
     self.assertEqual(2, muc_scorer.precision_numerator)
     self.assertEqual(2, muc_scorer.precision_denominator)
Exemple #4
0
    def test(self):
        """Run an English-only LanguageSpecificStage on two document languages.

        With doc.lang set to Lang.AKA the five singleton chains are expected
        to stay five chains; with Lang.ENG the same input is expected to end
        up as a single chain holding all five mentions.
        """
        specs = (
            ('Ed Smith', (141, 149), (22, 23), EntityType.PER, 'Men1'),
            ('Ed Smith', (146, 154), (24, 25), EntityType.PER, 'Men2'),
            ('Ben Smith', (173, 181), (36, 37), EntityType.PER, 'Men3'),
            ('ed Smith', (186, 194), (51, 52), EntityType.PER, 'Men4'),
            ('Ed Smith', (237, 245), (71, 72), EntityType.ORG, 'Men5'),
        )

        def fresh_chains():
            # Build brand-new singleton chains for every run of the stage.
            return [
                MentionChain(
                    [Mention(text, '_DF_doc34', chars, tokens, etype, mid)])
                for text, chars, tokens, etype, mid in specs
            ]

        doc = unittest.mock.Mock()
        stage = LanguageSpecificStage(CombineEverything(), Lang.ENG)

        # Document language differs from the language the stage was built for.
        doc.lang = Lang.AKA
        doc.mention_chains = fresh_chains()
        stage.update(doc)
        self.assertEqual(5, len(doc.mention_chains))

        # Document language equals the language the stage was built for.
        doc.lang = Lang.ENG
        doc.mention_chains = fresh_chains()
        stage.update(doc)
        self.assertEqual(1, len(doc.mention_chains))
        self.assertEqual(5, len(doc.mention_chains[0].mentions))
Exemple #5
0
 def testWithNoStages(self):
     """A CascadeCoref built with no stages yields three chains for three mentions."""
     specs = [
         ('Ed Smith', (141, 149), (22, 23), 'Men1'),
         ('Ed Smith', (146, 154), (24, 25), 'Men2'),
         ('Ben Smith', (173, 181), (36, 37), 'Men3'),
     ]
     doc = unittest.mock.Mock()
     doc.mentions = [
         Mention(text, '_DF_doc34', chars, tokens, EntityType.PER, mid)
         for text, chars, tokens, mid in specs
     ]

     CascadeCoref([]).coref(doc)

     self.assertEqual(3, len(doc.mention_chains))
Exemple #6
0
 def test_across_entity_type(self):
     """A GPE 'South Carolina' chain and an ORG 'SC' chain stay two chains.

     The document is updated with AcronymStage(2) and the chain count is
     expected to remain 2.
     """
     full_name = Mention('South Carolina', '_DF_doc34', (141, 149), (22, 23),
                         EntityType.GPE, 'Men1')
     short_name = Mention('SC', '_DF_doc34', (146, 154), (24, 25),
                          EntityType.ORG, 'Men6')
     doc = unittest.mock.Mock()
     doc.mention_chains = [
         MentionChain([full_name]),
         MentionChain([short_name]),
     ]

     AcronymStage(2).update(doc)

     self.assertEqual(2, len(doc.mention_chains))
Exemple #7
0
 def test_no_match(self):
     """ExactMatchFeature gives a falsy first value for 'New York' vs 'Nueva York'."""
     features = FeatureVector()
     candidate = Entity('1', EntityType.GPE, 'Nueva York', EntityOrigin.GEO)
     mention = Mention('New York', 'doc1', (), (), EntityType.GPE)
     chain = MentionChain([mention])

     ExactMatchFeature().extract(chain, candidate, None, features)

     self.assertFalse(features.data[0])
Exemple #8
0
 def test_multiple_matches(self):
     """Two candidates share the same URL, so no entity is expected.

     After WikipediaResolver.resolve the chain is expected to keep two
     candidates and to have entity None.
     """

     def wll_person(entity_id, name, url):
         return Entity(entity_id,
                       EntityType.PER,
                       name,
                       EntityOrigin.WLL,
                       urls=[url])

     chain = MentionChain(
         [Mention('John Smith', 'doc1', (4, 8), (), EntityType.PER)])
     doc = unittest.mock.Mock()
     doc.doc_id = 'doc1'
     doc.mention_chains = [chain]
     doc.mention_chains[0].candidates = [
         wll_person('122', 'John Smith',
                    'http://en.wikipedia.org/wiki/John_Smith'),
         wll_person('123', 'John Smith',
                    'http://en.wikipedia.org/wiki/John_Smith'),
         wll_person('124', 'Jake Smith',
                    'http://en.wikipedia.org/wiki/John_P_Smith'),
     ]

     WikipediaResolver().resolve(doc)

     self.assertEqual(1, len(doc.mention_chains))
     resolved = doc.mention_chains[0]
     self.assertEqual(2, len(resolved.candidates))
     self.assertIsNone(doc.mention_chains[0].entity)
Exemple #9
0
 def test_simple(self):
     """Assigning to the same mention twice gives ids 'M1' and then 'M2'."""
     ted = Mention("Ted", "IL9_SM_001", (4, 8), (0, 1), EntityType.PER)
     assigner = InProcessIncremental()
     for expected_id in ('M1', 'M2'):
         assigner.assign(ted)
         self.assertEqual(expected_id, ted.id)
Exemple #10
0
 def test_positive(self):
     """LastNameFeature gives a truthy first value for 'Jep Smith' vs 'John Smith'."""
     features = FeatureVector()
     candidate = Entity('1', EntityType.PER, 'John Smith', EntityOrigin.WLL)
     mention = Mention('Jep Smith', 'doc1', (), (), EntityType.PER)
     chain = MentionChain([mention])

     LastNameFeature().extract(chain, candidate, None, features)

     self.assertTrue(features.data[0])
Exemple #11
0
 def test_not_multi_token_name(self):
     """LastNameFeature gives a falsy first value for the single-token mention 'Smith'."""
     features = FeatureVector()
     candidate = Entity('1', EntityType.PER, 'John Smith', EntityOrigin.WLL)
     mention = Mention('Smith', 'doc1', (), (), EntityType.PER)
     chain = MentionChain([mention])

     LastNameFeature().extract(chain, candidate, None, features)

     self.assertFalse(features.data[0])
Exemple #12
0
 def test(self):
     """Resolve with ExactNameResolver followed by WikipediaResolver.

     Of the three candidates, the chain is expected to keep two and to be
     linked to the entity with id '122'.
     """

     def wll_person(entity_id, name, url):
         return Entity(entity_id,
                       EntityType.PER,
                       name,
                       EntityOrigin.WLL,
                       urls=[url])

     chain = MentionChain(
         [Mention('John Smith', 'doc1', (4, 8), (), EntityType.PER)])
     doc = unittest.mock.Mock()
     doc.doc_id = 'doc1'
     doc.mention_chains = [chain]
     doc.mention_chains[0].candidates = [
         wll_person('122', 'John Smith',
                    'http://en.wikipedia.org/wiki/John_Smith'),
         wll_person('123', 'John Smith',
                    'http://en.wikipedia.org/wiki/John_H_Smith'),
         wll_person('124', 'Jake Smith',
                    'http://en.wikipedia.org/wiki/John_Smith'),
     ]

     cascade = CascadeResolver([ExactNameResolver(), WikipediaResolver()])
     cascade.resolve(doc)

     self.assertEqual(1, len(doc.mention_chains))
     self.assertEqual(2, len(doc.mention_chains[0].candidates))
     self.assertEqual('122', doc.mention_chains[0].entity.id)
Exemple #13
0
 def test(self):
     """SharedTokensFeature is expected to give about 0.5 for 'Nueva York'.

     The entity is named 'New York City' and its names are set to
     {'NYC', 'New York'}.
     """
     features = FeatureVector()
     candidate = Entity('1', EntityType.GPE, 'New York City',
                        EntityOrigin.GEO)
     candidate.names = {'NYC', 'New York'}
     mention = Mention('Nueva York', 'doc1', (), (), EntityType.GPE)
     chain = MentionChain([mention])

     SharedTokensFeature().extract(chain, candidate, None, features)

     self.assertAlmostEqual(0.5, features.data[0])
Exemple #14
0
 def test_merge(self):
     """Merge the first and third of three chains via DummyStage.merge.

     The document is expected to end with two chains, the last of which
     holds three mentions.
     """

     def person(text, mention_id):
         return Mention(text, '_NW_1', (), (), EntityType.PER, mention_id)

     pair_chain = MentionChain([person('1', 'Men1'), person('4', 'Men4')])
     untouched_chain = MentionChain([person('2', 'Men2')])
     single_chain = MentionChain([person('3', 'Men3')])
     chains = [pair_chain, untouched_chain, single_chain]

     doc = unittest.mock.Mock()
     doc.mention_chains = chains

     CorefStageTest.DummyStage().merge(doc, [chains[0], chains[2]])

     self.assertEqual(2, len(doc.mention_chains))
     self.assertEqual(3, len(doc.mention_chains[-1].mentions))
Exemple #15
0
 def test_match(self):
     """ExactMatchFeature gives a truthy first value when translate_string is 'new york'.

     The mention text is 'Nueva York'; the entity names include 'New York'.
     """
     features = FeatureVector()
     candidate = Entity('1', EntityType.GPE, 'New York', EntityOrigin.GEO)
     candidate.names = {'New York', 'New York City', 'NYC'}
     mention = Mention('Nueva York', 'doc1', (), (), EntityType.GPE)
     chain = MentionChain([mention])
     chain.mentions[0].translate_string = 'new york'

     ExactMatchFeature().extract(chain, candidate, None, features)

     self.assertTrue(features.data[0])
Exemple #16
0
 def test_negative(self):
     """WikipediaFeature gives a falsy first value for 'New York City' vs the New_York URL."""
     features = FeatureVector()
     page_urls = ['http://en.wikipedia.org/wiki/New_York']
     candidate = Entity('1',
                        EntityType.GPE,
                        'New York',
                        EntityOrigin.GEO,
                        urls=page_urls)
     mention = Mention('New York City', 'doc1', (), (), EntityType.GPE)
     chain = MentionChain([mention])

     WikipediaFeature().extract(chain, candidate, None, features)

     self.assertFalse(features.data[0])
Exemple #17
0
    def test(self):
        """Run SingleTokenMatchStage(index=-1) over six singleton chains.

        The document is expected to end with four chains whose sizes are
        1, 1, 1 and 3.
        """
        specs = [
            ('Ed Smith', (141, 149), (22, 23), EntityType.PER, 'Men1'),
            ('Smith', (146, 154), (24, 25), EntityType.PER, 'Men2'),
            ('Ben Smithy', (173, 181), (36, 37), EntityType.PER, 'Men3'),
            ('ed smith', (186, 194), (51, 52), EntityType.PER, 'Men4'),
            ('Tony Smith', (237, 245), (71, 72), EntityType.ORG, 'Men5'),
            ('Smith Jones', (298, 306), (36, 37), EntityType.PER, 'Men6'),
        ]
        doc = unittest.mock.Mock()
        doc.mention_chains = [
            MentionChain(
                [Mention(text, '_DF_doc34', chars, tokens, etype, mid)])
            for text, chars, tokens, etype, mid in specs
        ]

        last_token_stage = SingleTokenMatchStage(index=-1)
        last_token_stage.update(doc)

        self.assertEqual(4, len(doc.mention_chains))
        chain_sizes = sorted(len(chain) for chain in doc.mention_chains)
        self.assertEqual([1, 1, 1, 3], chain_sizes)
Exemple #18
0
 def test(self):
     """Run AcronymStage(2) over four chains of mixed entity types.

     The document is expected to end with three chains whose mention counts
     form the set {1, 2, 3}.
     """

     def df_mention(text, chars, tokens, etype, mention_id):
         return Mention(text, '_DF_doc34', chars, tokens, etype, mention_id)

     state_chain = MentionChain([
         df_mention('South Carolina', (141, 149), (22, 23), EntityType.GPE,
                    'Men1'),
         df_mention('south carolina', (173, 181), (36, 37), EntityType.GPE,
                    'Men3'),
     ])
     gpe_acronym_chain = MentionChain(
         [df_mention('SC', (146, 154), (24, 25), EntityType.GPE, 'Men2')])
     org_acronym_chain = MentionChain(
         [df_mention('SC', (146, 154), (24, 25), EntityType.ORG, 'Men6')])
     person_chain = MentionChain([
         df_mention('ed Smith', (186, 194), (51, 52), EntityType.PER,
                    'Men4'),
         df_mention('Ed Smith', (237, 245), (71, 72), EntityType.PER,
                    'Men5'),
     ])

     doc = unittest.mock.Mock()
     doc.mention_chains = [
         state_chain,
         gpe_acronym_chain,
         org_acronym_chain,
         person_chain,
     ]

     AcronymStage(2).update(doc)

     self.assertEqual(3, len(doc.mention_chains))
     mention_counts = {len(chain.mentions) for chain in doc.mention_chains}
     self.assertEqual({1, 2, 3}, mention_counts)
Exemple #19
0
    def test(self):
        """Run ExactMatchStage over five singleton chains.

        The document is expected to end with three chains of sizes 1, 1
        and 3. The PER chain whose name lower-cases to 'ed smith' must
        contain mentions Men1, Men2 and Men4 with their original offsets.
        """
        doc = unittest.mock.Mock()
        doc.mention_chains = [
            MentionChain([
                Mention('Ed Smith', '_DF_doc34', (141, 149), (22, 23),
                        EntityType.PER, 'Men1')
            ]),
            MentionChain([
                Mention('Ed Smith', '_DF_doc34', (146, 154), (24, 25),
                        EntityType.PER, 'Men2')
            ]),
            MentionChain([
                Mention('Ben Smith', '_DF_doc34', (173, 181), (36, 37),
                        EntityType.PER, 'Men3')
            ]),
            MentionChain([
                Mention('ed Smith', '_DF_doc34', (186, 194), (51, 52),
                        EntityType.PER, 'Men4')
            ]),
            MentionChain([
                Mention('Ed Smith', '_DF_doc34', (237, 245), (71, 72),
                        EntityType.ORG, 'Men5')
            ]),
        ]
        stage = ExactMatchStage()
        stage.update(doc)

        self.assertEqual(3, len(doc.mention_chains))
        self.assertEqual([1, 1, 3], sorted(map(len, doc.mention_chains)))

        ed_smith_chains = [
            chain for chain in doc.mention_chains
            if chain.name.lower() == 'ed smith'
            and chain.type == EntityType.PER
        ]
        # Without this guard the content checks below would be skipped
        # silently whenever no chain satisfies the filter, letting the test
        # pass without verifying anything about the merged chain.
        self.assertTrue(ed_smith_chains,
                        "no PER chain named 'ed smith' was found")
        for chain in ed_smith_chains:
            self.assertEqual({'Men1', 'Men2', 'Men4'},
                             {m.id
                              for m in chain.mentions})
            self.assertEqual({(141, 149), (146, 154), (186, 194)},
                             {m.offsets
                              for m in chain.mentions})
Exemple #20
0
 def test_no_matches(self):
     """ExactNameResolver leaves entity None when the only candidate is named 'John'.

     The chain's single candidate is expected to be kept.
     """
     chain = MentionChain(
         [Mention('John Smith', 'doc1', (4, 8), (), EntityType.PER)])
     doc = unittest.mock.Mock()
     doc.doc_id = 'doc1'
     doc.mention_chains = [chain]
     only_candidate = Entity('122', EntityType.PER, 'John', EntityOrigin.WLL)
     doc.mention_chains[0].candidates = [only_candidate]

     ExactNameResolver().resolve(doc)

     self.assertEqual(1, len(doc.mention_chains))
     self.assertEqual(1, len(doc.mention_chains[0].candidates))
     self.assertIsNone(doc.mention_chains[0].entity)
Exemple #21
0
 def test_one_match(self):
     """ExactNameResolver links the chain to the candidate whose names include 'john smith'.

     Both candidates are expected to remain on the chain.
     """
     entity1 = Entity('122', EntityType.PER, 'John', EntityOrigin.WLL)
     entity1.names = {'John', 'J. Smith', 'john smith'}
     other = Entity('123', EntityType.PER, 'Not John', EntityOrigin.WLL)

     chain = MentionChain(
         [Mention('John Smith', 'doc1', (4, 8), (), EntityType.PER)])
     doc = unittest.mock.Mock()
     doc.doc_id = 'doc1'
     doc.mention_chains = [chain]
     doc.mention_chains[0].candidates = [entity1, other]

     ExactNameResolver().resolve(doc)

     self.assertEqual(1, len(doc.mention_chains))
     self.assertEqual(2, len(doc.mention_chains[0].candidates))
     self.assertEqual(entity1, doc.mention_chains[0].entity)
Exemple #22
0
 def test_one_match(self):
     """WikipediaResolver links the chain to the candidate carrying the John_Smith URL.

     Both candidates are expected to remain on the chain.
     """
     page_urls = ['http://en.wikipedia.org/wiki/John_Smith']
     entity1 = Entity('122',
                      EntityType.PER,
                      'John',
                      EntityOrigin.WLL,
                      urls=page_urls)
     other = Entity('123', EntityType.PER, 'Not John', EntityOrigin.WLL)

     chain = MentionChain(
         [Mention('John Smith', 'doc1', (4, 8), (), EntityType.PER)])
     doc = unittest.mock.Mock()
     doc.doc_id = 'doc1'
     doc.mention_chains = [chain]
     doc.mention_chains[0].candidates = [entity1, other]

     WikipediaResolver().resolve(doc)

     self.assertEqual(1, len(doc.mention_chains))
     self.assertEqual(2, len(doc.mention_chains[0].candidates))
     self.assertEqual(entity1, doc.mention_chains[0].entity)
Exemple #23
0
    def test(self):
        """Resolve one chain with an SvmResolver built from trainClassifier().

        Out of four candidates the chain is expected to be linked to the
        entity with id '124'.
        """
        resolver = SvmResolver(self.trainClassifier(),
                               EntityFeatureExtractor(SimpleFeature()))

        candidate_names = [
            ('122', 'John Smith'),
            ('123', 'John Smith'),
            ('124', 'John P. Smith'),
            ('125', 'Jake Smith'),
        ]
        chain = MentionChain(
            [Mention('John Smith', 'doc1', (), (), EntityType.PER)])
        doc = unittest.mock.Mock()
        doc.mention_chains = [chain]
        doc.mention_chains[0].candidates = [
            Entity(entity_id, EntityType.PER, name, EntityOrigin.WLL)
            for entity_id, name in candidate_names
        ]

        resolver.resolve(doc)

        self.assertEqual('124', doc.mention_chains[0].entity.id)
Exemple #24
0
 def test(self):
     """Score four predicted chains against five gold links with ResolverScorer.

     For EntityType.PER the report is expected to count two mentions with a
     correct candidate and one mention with the correct entity.
     """

     def gold_link(entity_ids):
         return Link(EntityType.PER, LinkType.LINK, entity_ids, None)

     def mention_at(span):
         return Mention('', 'doc1', span, (), EntityType.PER)

     def kb_entity(entity_id):
         return Entity(entity_id, EntityType.PER, '', EntityOrigin.WLL)

     gt = {
         'doc1': {
             (0, 2): gold_link(['123', '122']),
             (4, 8): Link(EntityType.PER, LinkType.NIL, [], 'NIL999'),
             (10, 12): gold_link(['222']),
             (16, 17): gold_link(['333']),
             (18, 19): gold_link(['444']),
         }
     }

     doc = unittest.mock.Mock()
     doc.doc_id = 'doc1'
     doc.mention_chains = [
         MentionChain(
             [mention_at((0, 2)),
              mention_at((3, 7)),
              mention_at((16, 17))]),
         MentionChain([mention_at((4, 8))]),
         MentionChain([mention_at((10, 12))]),
         MentionChain([mention_at((18, 19))]),
     ]
     first, second, third, fourth = doc.mention_chains

     first.candidates = [kb_entity('122')]
     first.entity = first.candidates[0]

     second.candidates = [kb_entity('147')]
     second.entity = second.candidates[0]

     # The third chain has candidates but no selected entity.
     third.candidates = [kb_entity('198'), kb_entity('222')]
     third.entity = None

     fourth.candidates = [kb_entity('17')]
     fourth.entity = fourth.candidates[0]

     scorer = ResolverScorer(gt)
     scorer.update(doc)

     report = scorer.report
     self.assertEqual(
         2, report.num_mentions_with_correct_candidate[EntityType.PER])
     self.assertEqual(1,
                      report.num_mentions_correct_entity[EntityType.PER])
Exemple #25
0
 def get_example1_test_data(self):
     """Build the ground truth and predicted document for example 1.

     The ground truth maps twelve spans (0, 1) .. (0, 12) of 'doc1' onto
     three clusters (NIL1, NIL2, NIL3); the mocked document holds two
     predicted chains covering spans 1-5 and 6-12.

     Returns:
         A ``(gt, doc)`` tuple.
     """
     # first example from "Algorithms for scoring coreference chains"
     gold_clusters = (
         ('NIL1', (1, 2, 3, 4, 5)),
         ('NIL2', (6, 7)),
         ('NIL3', (8, 9, 10, 11, 12)),
     )
     gold_links = {}
     for cluster_id, ends in gold_clusters:
         for end in ends:
             gold_links[(0, end)] = Link(EntityType.PER, LinkType.LINK,
                                         cluster_id, None)
     gt = {'doc1': gold_links}

     def predicted(label, mention_id):
         # The mention text doubles as the end offset of its span.
         return Mention(label, 'doc1', (0, int(label)), (), EntityType.PER,
                        mention_id)

     first_labels = ('1', '2', '3', '4', '5')
     second_labels = ('6', '7', '8', '9', '10', '11', '12')

     doc = unittest.mock.Mock()
     doc.doc_id = 'doc1'
     doc.mention_chains = [
         MentionChain([predicted(label, 'NIL10') for label in first_labels]),
         MentionChain([predicted(label, 'NIL11') for label in second_labels]),
     ]
     return gt, doc