Example #1
    def test_two_entities_separated(self):
        sentences = [
            Sentence(0, 2), Sentence(2, 3), Sentence(3, 5), Sentence(5, 8),
            Sentence(8, 10), Sentence(10, 15)
        ]
        entities = [Entity('', 2, 4, 'test'), Entity('', 9, 11, 'test')]

        expected_sentences = [
            Sentence(0, 2), Sentence(2, 5), Sentence(5, 8), Sentence(8, 15)
        ]
        got_sentences = adjust_sentences(sentences, entities)
        self.assertEqual(expected_sentences, got_sentences)
Example #2
    def test_bio2_strategy_encoding(self):
        strategy = BIO2LabellingStrategy()
        sent = Sentence(110, 120)
        ents = [
            Entity("_", 110, 112, "T1"),
            Entity("_", 112, 113, "T2"),
            Entity("_", 114, 115, "T3"),
            Entity("_", 115, 118, "T3"),
            Entity("_", 119, 120, "T1")
        ]

        expected_possible_categories = {
            "O", "I-T1", "I-T2", "I-T3", "B-T1", "B-T2", "B-T3"
        }
        actual_possible_categories = strategy.get_possible_categories(
            self.ent_types)
        self.assertEqual(expected_possible_categories,
                         actual_possible_categories)

        expected_encoding = [
            "I-T1", "I-T1", "I-T2", "O", "I-T3", "B-T3", "I-T3", "I-T3", "O",
            "I-T1"
        ]
        actual_encoding = strategy.encode_labels(sent, ents)
        self.assertEqual(expected_encoding, actual_encoding)
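Note that in the expected encoding above, the B- prefix appears only on token 115, where one T3 entity immediately follows another T3 entity; entity-initial tokens elsewhere (110, 112, 114, 119) take plain I- tags, i.e. this strategy appears to use B- only to separate adjacent entities of the same type.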
Example #3
    def test_nouns(self):
        sentences = [Sentence(0, 10)]
        paragraphs = [Paragraph(0, 1)]
        entities = [
            Entity('_', 0, 1, 'noun'),
            Entity('_', 1, 2, 'noun'),
            Entity('_', 2, 3, 'noun'),
            Entity('_', 3, 4, 'noun'),
        ]
        rels = {
            Relation(Entity('_', 0, 1, 'noun'), Entity('_', 1, 2, 'noun'),
                     '1'),
            Relation(Entity('_', 0, 1, 'noun'), Entity('_', 2, 3, 'noun'),
                     '1'),
            Relation(Entity('_', 2, 3, 'noun'), Entity('_', 3, 4, 'noun'),
                     '1'),
        }
        doc = Document('test', [], sentences, paragraphs, entities, rels)

        max_distance = 3

        actual_samples = get_pron_samples(doc, max_distance, False)
        expected_samples = []

        self.assertEqual(actual_samples, expected_samples)
Example #4
    def test_intersecting(self):
        ents = _create_ents([(0, 3), (0, 4), (1, 2), (5, 7), (5, 7), (6, 7),
                             (8, 9), (10, 13), (12, 14), (13, 16)])

        rels = {
            _create_rel(ents[1], ents[2]),
            _create_rel(ents[0], ents[1]),
            _create_rel(ents[5], ents[2]),
            _create_rel(ents[3], ents[4]),
            _create_rel(ents[6], ents[7]),
            _create_rel(ents[8], ents[6])
        }

        expected_ents = [
            Entity("0", 0, 4, "T1"),
            Entity("3", 5, 7, "T1"),
            Entity("6", 8, 9, "T1"),
            Entity("7", 10, 16, "T1")
        ]

        expected_rels = {
            _create_rel(expected_ents[0], expected_ents[0]),
            _create_rel(expected_ents[0], expected_ents[0]),
            _create_rel(expected_ents[1], expected_ents[0]),
            _create_rel(expected_ents[1], expected_ents[1]),
            _create_rel(expected_ents[2], expected_ents[3]),
            _create_rel(expected_ents[3], expected_ents[2])
        }

        self.assertEqual((expected_ents, expected_rels),
                         collapse_intersecting_entities(ents, rels))
Example #5
    def test_contained_entities(self):
        sentences = [
            Sentence(0, 2), Sentence(2, 3), Sentence(3, 5), Sentence(5, 8),
            Sentence(8, 10), Sentence(10, 15)
        ]
        entities = [
            Entity('', 2, 6, 'test'),
            Entity('', 6, 7, 'test'),
            Entity('', 7, 8, 'test')
        ]

        expected_sentences = [
            Sentence(0, 2), Sentence(2, 8), Sentence(8, 10), Sentence(10, 15)
        ]
        got_sentences = adjust_sentences(sentences, entities)
        self.assertEqual(expected_sentences, got_sentences)
Example #6
    def _get_direction_feature(e1: Entity, e2: Entity):
        if e1.contains(e2):
            return "e2_in_e1"
        if e2.contains(e1):
            return "e1_in_e2"
        if e1.start_token < e2.start_token:
            return "e1_e2"

        return "e2_e1"
Example #7
    def test_assert_equal_with_different_types(self):
        ents = [
            # equal lengths, different types
            Entity("8", 200, 204, "T1"),
            Entity("9", 200, 204, "T2"),
        ]
        rels = set()
        self.assertRaises(Exception, collapse_intersecting_entities, ents,
                          rels)
Example #8
    def test_inner_entities_collapse(self):
        expected_tokens = [
            "Recurrence", "of", "Pelecypod-associated", "cholera", "in",
            "$Geographical$", ".", "From", "Oct.", "30", "to", "Nov.", "7",
            ",", "1979", ",", "10", "people", "in", "the", "$Geographical$",
            "of", "$Geographical$", "had", "onset", "of", "bacteriologically",
            "confirmed", "cholera", "."
        ]
        expected_sentences = [Sentence(0, 7), Sentence(7, 30)]

        expected_entities = [
            Entity("T1", 2, 3, "Habitat"),
            Entity("T2", 2, 4, "Bacteria"),
            Entity("T3", 3, 4, "Bacteria"),
            Entity("T4", 5, 6, "Geographical"),
            Entity("T5", 17, 18, "Habitat"),
            Entity("T6", 17, 23, "Habitat"),
            Entity("T7", 20, 21, "Geographical"),
            Entity("T8", 22, 23, "Geographical"),
            Entity("T9", 28, 29, "Bacteria")
        ]

        expected_paragraphs = [Paragraph(0, 1), Paragraph(1, 2)]
        expected_relations = [
            Relation(expected_entities[0], expected_entities[1], "Lives_in"),
            Relation(expected_entities[8], expected_entities[6], "Lives_in")
        ]

        expected_doc = Document("_", expected_tokens, expected_sentences,
                                expected_paragraphs, expected_entities,
                                expected_relations)

        actual_doc = EntitiesCollapser({"Geographical"}).transform(self.doc)
        self.assertEqual(expected_doc, actual_doc)
Example #9
    def test_3_entity_paragraphs(self):
        sentences = [
            Sentence(0, 5),
            Sentence(5, 10),
        ]
        paragraphs = [
            Paragraph(0, 1),
            Paragraph(1, 2),
        ]
        entities = [
            Entity('_', 0, 1, '1'),
            Entity('_', 1, 2, '1'),
            Entity('_', 5, 6, '2'),
        ]
        doc = Document('test', [], sentences, paragraphs, entities)

        max_distance = 3

        actual_samples = get_samples(doc, max_distance, False)

        expected_samples = [
            (Entity('_', 0, 1, '1'), Entity('_', 1, 2, '1'), None),
            (Entity('_', 0, 1, '1'), Entity('_', 5, 6, '2'), None),
            (Entity('_', 1, 2, '1'), Entity('_', 5, 6, '2'), None),
        ]

        self.assertEqual(expected_samples, actual_samples)
Example #10
    def setUp(self):
        tokens = [
            "Recurrence", "of", "Pelecypod-associated", "cholera", "in",
            "Sardinia", ".", "From", "Oct.", "30", "to", "Nov.", "7", ",",
            "1979", ",", "10", "people", "in", "the", "Sardinian", "province",
            "of", "Cagliari", "had", "onset", "of", "bacteriologically",
            "confirmed", "cholera", "."
        ]
        sentences = [Sentence(0, 7), Sentence(7, 31)]

        entities = [
            Entity("T1", 2, 3, "Habitat"),
            Entity("T2", 2, 4, "Bacteria"),
            Entity("T3", 3, 4, "Bacteria"),
            Entity("T4", 5, 6, "Geographical"),
            Entity("T5", 17, 18, "Habitat"),
            Entity("T6", 17, 24, "Habitat"),
            Entity("T7", 20, 22, "Geographical"),
            Entity("T8", 23, 24, "Geographical"),
            Entity("T9", 29, 30, "Bacteria")
        ]

        paragraphs = [Paragraph(0, 1), Paragraph(1, 2)]
        relations = [
            Relation(entities[0], entities[1], "Lives_in"),
            Relation(entities[8], entities[6], "Lives_in")
        ]

        self.doc = Document("_", tokens, sentences, paragraphs, entities,
                            relations)
Example #11
    def test_eq(self):
        self.assertEqual(Entity("_", 1, 2, "T"), Entity("_", 1, 2, "T"))
        self.assertNotEqual(Entity("_", 1, 2, "T"), Entity("_", 1, 2, "P"))
        self.assertNotEqual(Entity("__", 1, 2, "T"), Entity("_", 1, 2, "T"))
        self.assertNotEqual(Entity("_", 1, 2, "T"), Entity("_", 1, 3, "T"))

        self.assertNotEqual(Entity("_", 1, 2, "T"), Sentence(1, 2))
Example #12
    def test_entities_with_nesting_collapse(self):
        expected_tokens = [
            "Recurrence", "of", "$Bacteria$", "in", "Sardinia", ".", "From",
            "Oct.", "30", "to", "Nov.", "7", ",", "1979", ",", "10", "people",
            "in", "the", "Sardinian", "province", "of", "Cagliari", "had",
            "onset", "of", "bacteriologically", "confirmed", "$Bacteria$", "."
        ]
        expected_sentences = [Sentence(0, 6), Sentence(6, 30)]

        expected_entities = [
            Entity("T1", 2, 3, "Habitat"),
            Entity("T2", 2, 3, "Bacteria"),
            Entity("T3", 2, 3, "Bacteria"),
            Entity("T4", 4, 5, "Geographical"),
            Entity("T5", 16, 17, "Habitat"),
            Entity("T6", 16, 23, "Habitat"),
            Entity("T7", 19, 21, "Geographical"),
            Entity("T8", 22, 23, "Geographical"),
            Entity("T9", 28, 29, "Bacteria")
        ]

        expected_paragraphs = [Paragraph(0, 1), Paragraph(1, 2)]
        expected_relations = [
            Relation(expected_entities[0], expected_entities[1], "Lives_in"),
            Relation(expected_entities[8], expected_entities[6], "Lives_in")
        ]

        expected_doc = Document("_", expected_tokens, expected_sentences,
                                expected_paragraphs, expected_entities,
                                expected_relations)

        actual_doc = EntitiesCollapser({"Bacteria"}).transform(self.doc)
        self.assertEqual(expected_doc, actual_doc)
Example #13
    def test_leq(self):
        self.assertTrue(Entity("_", 2, 3, "A") < Entity("_", 2, 4, "A"))
        self.assertTrue(Entity("_", 2, 3, "A") < Entity("_", 2, 3, "B"))
        self.assertTrue(Entity("1", 2, 3, "B") < Entity("11", 2, 3, "B"))
        self.assertTrue(Entity("1", 0, 1, "B") < Entity("1", 2, 3, "B"))

        self.assertRaises(Exception,
                          lambda: Entity("_", 0, 1, "A") < TokenSpan(2, 3))
Example #14
    def test_ne_features(self):
        ents = [Entity("_", 4, 5, "PER"), Entity("_", 6, 7, "PER")]
        doc = Document('',
                       ['Planning', 'of', 'work', 'of', 'Elon', "by", "Elon"],
                       [Sentence(0, 7)], [],
                       extras={'ne': SortedSpansSet(ents)})
        fe, meta = ne_fe_factory([doc], {"ne_emb_size": 10})
        features = fe.extract_features_from_doc(doc, 3, 7)['ne']
        self.assertEqual(len(meta.get_embedded_features()), 1)
        self.assertEqual(len(features), 4)
        self.assertEqual(features[0], features[2])  # O O
        self.assertEqual(features[1], features[3])  # I-PER I-PER
        self.assertNotEqual(features[0], features[1])  # O I-PER
Example #15
    def test_collapsement_of_same_spans(self):
        tokens = ["Elon", "Musk", "is", "CEO", "of", "Tesla", "."]
        sentences = [Sentence(0, 7)]
        entities = [
            Entity("_", 0, 2, "ELON"),
            Entity("_", 0, 2, "MUSK"),
            Entity("_", 5, 6, "COMP"),
            Entity("_", 5, 6, "ORG")
        ]

        input_doc = Document("_", tokens, sentences, [], entities)

        expected_tokens = ["$ELON$", "is", "CEO", "of", "$COMP$", "."]
        expected_sentences = [Sentence(0, 6)]
        expected_entities = [
            Entity("_", 0, 1, "ELON"),
            Entity("_", 0, 1, "MUSK"),
            Entity("_", 4, 5, "COMP"),
            Entity("_", 4, 5, "ORG")
        ]

        expected_doc = Document("_", expected_tokens, expected_sentences, [],
                                expected_entities)

        actual_doc = EntitiesCollapser({"ELON", "COMP"}).transform(input_doc)
        self.assertEqual(expected_doc, actual_doc)
Example #16
    def test_multi_sentence(self):
        sentences = [
            Sentence(0, 2), Sentence(2, 3), Sentence(3, 5), Sentence(5, 8),
            Sentence(8, 10), Sentence(10, 15)
        ]
        entities = [Entity('', 2, 9, 'test')]

        expected_sentences = [Sentence(0, 2), Sentence(2, 10), Sentence(10, 15)]
        got_sentences = adjust_sentences(sentences, entities)
        self.assertEqual(expected_sentences, got_sentences)
Example #17
    def test_one_entity(self):
        sentences = [Sentence(0, 2), Sentence(2, 3), Sentence(3, 5)]
        entities = [Entity('', 2, 4, 'test')]

        expected_sentences = [Sentence(0, 2), Sentence(2, 5)]
        got_sentences = adjust_sentences(sentences, entities)
        self.assertEqual(expected_sentences, got_sentences)
Example #18
    def decode_labels(self, sent: Sentence,
                      sent_labels: List[str]) -> List[Entity]:
        return [
            Entity("generated", start + sent.start_token,
                   end + sent.start_token, t)
            for start, end, t in self.decoder.decode(sent_labels)
        ]
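A small sketch of the offset arithmetic above, assuming the decoder yields sentence-relative (start, end, type) triples: a span (2, 4, "T3") decoded inside Sentence(110, 120) becomes a document-level entity over tokens 112..114.

    sent = Sentence(110, 120)
    start, end, t = 2, 4, "T3"
    assert Entity("generated", start + sent.start_token,
                  end + sent.start_token, t) == Entity("generated", 112, 114, "T3")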
Example #19
    def _get_token_distance(self, doc: Document, e1: Entity, e2: Entity):
        # the feature is optional: emit it only if a converter was fitted
        if 'entities_token_distance_in_classifier' not in self.classifier_converters:
            return {}
        return {
            'entities_token_distance_in_classifier':
            self.classifier_converters['entities_token_distance_in_classifier']
            [e1.token_distance_to(e2)]
        }
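A hypothetical sketch of the converter contract assumed above: each converter maps a raw value (here a token distance) to an integer feature id, for example by clipping large distances into a shared bucket. The class below is illustrative, not from the source:

    class ClippedDistanceConverter:
        def __init__(self, max_distance=10):
            self.max_distance = max_distance

        def __getitem__(self, distance):
            # distances beyond the cap share a single feature id
            return min(distance, self.max_distance)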
Example #20
    def test_io_strategy_decoding(self):
        strategy = IOLabellingStrategy()
        sent = Sentence(110, 120)
        labels = [
            "I-T1", "I-T1", "I-T2", "O", "I-T3", "I-T3", "I-T3", "I-T3", "O",
            "I-T1"
        ]

        expected = [
            Entity("generated", 110, 112, "T1"),
            Entity("generated", 112, 113, "T2"),
            Entity("generated", 114, 118, "T3"),
            Entity("generated", 119, 120, "T1")
        ]

        actual = strategy.decode_labels(sent, labels)
        self.assertEqual(expected, actual)
Example #21
def _map_ne(doc: Document, ne: Entity):
    ents_at_ne = doc.entities.contained_in(ne)

    for ent in ents_at_ne:
        if ne.coincides(ent):
            return ent.type

    return None
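A minimal usage sketch, assuming Document wraps its entities in a span set exposing contained_in and coincides, as _map_ne requires: a named-entity span takes the type of the annotated entity it exactly coincides with, and maps to None otherwise.

    ents = [Entity("T1", 4, 5, "PER")]
    doc = Document("_", ["w"] * 7, [Sentence(0, 7)], [], ents)
    assert _map_ne(doc, Entity("gen", 4, 5, "PERORG")) == "PER"
    assert _map_ne(doc, Entity("gen", 0, 1, "PERORG")) is None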
Example #22
    def test_2_entity(self):
        sentences = [Sentence(0, 10)]
        paragraphs = [Paragraph(0, 1)]
        entities = [
            Entity('_', 0, 1, '1'),
            Entity('_', 1, 2, '1'),
        ]
        doc = Document('test', [], sentences, paragraphs, entities)

        max_distance = 3

        actual_samples = get_samples(doc, max_distance, False)

        expected_samples = [
            (Entity('_', 0, 1, '1'), Entity('_', 1, 2, '1'), None)
        ]

        self.assertEqual(expected_samples, actual_samples)
Example #23
    def test_without_labels(self):
        ents = [Entity("_", 4, 5, "PER"), Entity("_", 6, 7, "PER")]
        doc = Document('',
                       ['Planning', 'of', 'work', 'of', 'Elon', "by", "Elon"],
                       [Sentence(0, 7)], [], ents)

        ner_fe, token_meta = ner_fe_factory([doc], {"internal_emb_size": 10})
        doc = doc.without_entities()
        features, = ner_fe.extract_features_from_doc(doc)

        words = features['words_0']
        self.assertEqual(features['seq_len'], 7)
        self.assertEqual(len(words), 7)

        self.assertNotEqual(words[0], words[1])  # Planning of
        self.assertEqual(words[1], words[3])  # of of

        self.assertRaises(KeyError, lambda: features['labels'])
Example #24
    def _create_entities_from(self, span2position: dict,
                              fre_objects: list) -> List[Entity]:
        entities = []
        for fre_object in fre_objects:
            # order the covered token positions so the entity stretches
            # from the first covered token to the last
            tokens = sorted(
                [span2position[span] for span in fre_object["spans"]],
                key=lambda x: x['start'])
            start = tokens[0]["start"]
            end = tokens[-1]["end"]
            ent_type = self.convert_locorg(fre_object["type"])
            entities.append(Entity(fre_object["id"], start, end, ent_type))
        return entities
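A hypothetical sketch of the inputs assumed above (keys and values are illustrative): span2position maps span ids to token positions, and each fre_object lists the spans it covers, so the resulting entity stretches from the first covered token to the last.

    span2position = {"s1": {"start": 3, "end": 4}, "s2": {"start": 5, "end": 7}}
    fre_objects = [{"id": "O1", "spans": ["s2", "s1"], "type": "locorg"}]
    # sorting by 'start' orders the positions as 3..7, so the entity
    # created above would span tokens (3, 7)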
Example #25
    def setUp(self) -> None:
        tokens = [
            'Planning', 'of', 'work', 'of', 'Elon', "by", "Elon", "in", "LA",
            "in", "USA", "."
        ]
        sents = [Sentence(0, 12)]
        ents = [
            Entity("_", 4, 5, "PER"),
            Entity("_", 6, 7, "PER"),
            Entity("_", 8, 9, "ORG"),
            Entity("_", 10, 11, "ORG")
        ]
        nes = SortedSpansSet([
                Entity("gen", 0, 1, "STUFF"),
                Entity("gen", 4, 5, "PERORG"), Entity("gen", 6, 7, "PERORG"),
                Entity("gen", 8, 9, "PERORG"), Entity("gen", 10, 11, "PERORG")
        ])

        self.doc = Document('', tokens, sents, [], ents, extras={"ne": nes})
Example #26
    def setUp(self) -> None:
        tokens = [
            "I", "will", "do", "my", "homework", "today", ".", "It", "is",
            "very", "hard", "but", "i", "don't", "care", "."
        ]
        sentences = [Sentence(0, 7), Sentence(7, 16)]
        paragraphs = [Paragraph(0, 2)]
        entities = [
            Entity("_", 0, 1, "t1"),
            Entity("_", 3, 5, "t2"),
            Entity("_", 7, 8, "t1"),
            Entity("_", 9, 11, "t2"),
            Entity("_", 10, 11, "t4")
        ]

        self.doc = Document("_", tokens, sentences, paragraphs, entities)
        self.relations = {
            Relation(entities[2], entities[3], "t1"),
            Relation(entities[3], entities[4], "t2")
        }
Example #27
    def test_1_entity(self):
        sentences = [Sentence(0, 10)]
        paragraphs = [Paragraph(0, 1)]
        entities = [Entity('_', 0, 1, 'noun')]
        doc = Document('test', [], sentences, paragraphs, entities)

        max_distance = 3

        actual_samples = get_noun_samples(doc, max_distance, False)
        expected_samples = []

        self.assertEqual(expected_samples, actual_samples)
Example #28
    def test_unify_types(self):
        expected = [
            Entity("_", 0, 2, "League"),
            Entity("_", 4, 5, "Location1"),
            Entity("_", 6, 7, "Location1"),
            Entity("_", 8, 10, "League"),
            Entity("_", 11, 12, "League"),
            Entity("_", 17, 19, "Person"),
            Entity("_", 20, 21, "Organization"),
            Entity("_", 22, 23, "Person")
        ]
        self.assertListEqual(
            expected,
            unify_types_of_similar_entities(self.doc, self.doc.entities))
Example #29
    def _get_relation_features(doc, e1: Entity, e2: Entity, converters,
                               name_postfix):
        features = {}

        feature_name = "rel_args_in_{}".format(name_postfix)
        converter = converters.get(feature_name, None)
        if converter is not None:
            features[feature_name] = converter[(e1.type, e2.type)]

        feature_name = "entities_token_distance_in_{}".format(name_postfix)
        converter = converters.get(feature_name, None)
        if converter is not None:
            features[feature_name] = converter[e1.token_distance_to(e2)]

        feature_name = "entities_token_log_distance_in_{}".format(name_postfix)
        converter = converters.get(feature_name, None)
        if converter is not None:
            features[feature_name] = converter[e1.token_distance_to(e2)]

        feature_name = "entities_sent_distance_in_{}".format(name_postfix)
        converter = converters.get(feature_name, None)
        if converter is not None:
            features[feature_name] = converter[
                get_sentence_distance_between_entities(doc, e1, e2)]

        feature_name = "rel_dir_in_{}".format(name_postfix)
        converter = converters.get(feature_name, None)
        if converter is not None:
            features[feature_name] = converter[
                RelExtFeatureExtractor._get_direction_feature(e1, e2)]

        for ent_num, ent in enumerate((e1, e2)):
            feature_name = "entities_types_in_{}_{}".format(
                name_postfix, ent_num)
            converter = converters.get(feature_name, None)
            if converter is not None:
                features[feature_name] = converter[ent.type]

        return features
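A hypothetical sketch of the gating behaviour above, assuming the method is a staticmethod of RelExtFeatureExtractor (as the internal reference to _get_direction_feature suggests): only features whose converter is present are emitted, so a converters dict holding one plain-dict converter yields exactly one feature, and doc may stay unused.

    converters = {"rel_args_in_classifier": {("PER", "ORG"): 3}}
    features = RelExtFeatureExtractor._get_relation_features(
        None, Entity("_", 0, 1, "PER"), Entity("_", 2, 3, "ORG"),
        converters, "classifier")
    assert features == {"rel_args_in_classifier": 3}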
Example #30
    def test_with_labels(self):
        ents = [Entity("_", 4, 5, "PER"), Entity("_", 6, 7, "PER")]
        doc = Document('',
                       ['Planning', 'of', 'work', 'of', 'Elon', "by", "Elon"],
                       [Sentence(0, 7)], [], ents)

        ner_fe, token_meta = ner_fe_factory([doc], {"internal_emb_size": 10})
        # one sentence in doc -> one sample
        features, = ner_fe.extract_features_from_doc(doc, include_labels=True)

        words = features['words_0']
        self.assertEqual(features['seq_len'], 7)
        self.assertEqual(len(words), 7)

        self.assertNotEqual(words[0], words[1])  # Planning of
        self.assertEqual(words[1], words[3])  # of of

        labels = features["labels"]
        self.assertEqual(len(labels), 7)
        self.assertEqual(labels[4], labels[6])  # B-PER, B-PER
        self.assertNotEqual(labels[3], labels[4])  # O, B-PER
        self.assertEqual(labels[0], labels[1])  # O, O