def test_entities_with_nesting_collapse(self):
    expected_tokens = [
        "Recurrence", "of", "$Bacteria$", "in", "Sardinia", ".",
        "From", "Oct.", "30", "to", "Nov.", "7", ",", "1979", ",", "10", "people", "in", "the",
        "Sardinian", "province", "of", "Cagliari", "had", "onset", "of",
        "bacteriologically", "confirmed", "$Bacteria$", "."
    ]
    expected_sentences = [Sentence(0, 6), Sentence(6, 30)]
    expected_entities = [
        Entity("T1", 2, 3, "Habitat"),
        Entity("T2", 2, 3, "Bacteria"),
        Entity("T3", 2, 3, "Bacteria"),
        Entity("T4", 4, 5, "Geographical"),
        Entity("T5", 16, 17, "Habitat"),
        Entity("T6", 16, 23, "Habitat"),
        Entity("T7", 19, 21, "Geographical"),
        Entity("T8", 22, 23, "Geographical"),
        Entity("T9", 28, 29, "Bacteria")
    ]
    expected_paragraphs = [Paragraph(0, 1), Paragraph(1, 2)]
    expected_relations = [
        Relation(expected_entities[0], expected_entities[1], "Lives_in"),
        Relation(expected_entities[8], expected_entities[6], "Lives_in")
    ]
    expected_doc = Document("_", expected_tokens, expected_sentences, expected_paragraphs,
                            expected_entities, expected_relations)

    actual_doc = EntitiesCollapser({"Bacteria"}).transform(self.doc)
    self.assertEqual(expected_doc, actual_doc)
def test_inner_entities_collapse(self):
    expected_tokens = [
        "Recurrence", "of", "Pelecypod-associated", "cholera", "in", "$Geographical$", ".",
        "From", "Oct.", "30", "to", "Nov.", "7", ",", "1979", ",", "10", "people", "in", "the",
        "$Geographical$", "of", "$Geographical$", "had", "onset", "of",
        "bacteriologically", "confirmed", "cholera", "."
    ]
    expected_sentences = [Sentence(0, 7), Sentence(7, 30)]
    expected_entities = [
        Entity("T1", 2, 3, "Habitat"),
        Entity("T2", 2, 4, "Bacteria"),
        Entity("T3", 3, 4, "Bacteria"),
        Entity("T4", 5, 6, "Geographical"),
        Entity("T5", 17, 18, "Habitat"),
        Entity("T6", 17, 23, "Habitat"),
        Entity("T7", 20, 21, "Geographical"),
        Entity("T8", 22, 23, "Geographical"),
        Entity("T9", 28, 29, "Bacteria")
    ]
    expected_paragraphs = [Paragraph(0, 1), Paragraph(1, 2)]
    expected_relations = [
        Relation(expected_entities[0], expected_entities[1], "Lives_in"),
        Relation(expected_entities[8], expected_entities[6], "Lives_in")
    ]
    expected_doc = Document("_", expected_tokens, expected_sentences, expected_paragraphs,
                            expected_entities, expected_relations)

    actual_doc = EntitiesCollapser({"Geographical"}).transform(self.doc)
    self.assertEqual(expected_doc, actual_doc)
def setUp(self):
    tokens = [
        "Recurrence", "of", "Pelecypod-associated", "cholera", "in", "Sardinia", ".",
        "From", "Oct.", "30", "to", "Nov.", "7", ",", "1979", ",", "10", "people", "in", "the",
        "Sardinian", "province", "of", "Cagliari", "had", "onset", "of",
        "bacteriologically", "confirmed", "cholera", "."
    ]
    sentences = [Sentence(0, 7), Sentence(7, 31)]
    entities = [
        Entity("T1", 2, 3, "Habitat"),
        Entity("T2", 2, 4, "Bacteria"),
        Entity("T3", 3, 4, "Bacteria"),
        Entity("T4", 5, 6, "Geographical"),
        Entity("T5", 17, 18, "Habitat"),
        Entity("T6", 17, 24, "Habitat"),
        Entity("T7", 20, 22, "Geographical"),
        Entity("T8", 23, 24, "Geographical"),
        Entity("T9", 29, 30, "Bacteria")
    ]
    paragraphs = [Paragraph(0, 1), Paragraph(1, 2)]
    relations = [
        Relation(entities[0], entities[1], "Lives_in"),
        Relation(entities[8], entities[6], "Lives_in")
    ]
    self.doc = Document("_", tokens, sentences, paragraphs, entities, relations)
def test_2_chains_2_pron(self):
    sentences = [Sentence(0, 10)]
    paragraphs = [Paragraph(0, 1)]
    entities = [
        Entity('_', 0, 1, 'noun'),
        Entity('_', 1, 2, 'pron'),
        Entity('_', 2, 3, 'pron'),
        Entity('_', 3, 4, 'noun'),
        Entity('_', 5, 6, 'noun'),
    ]
    rels = {
        Relation(Entity('_', 0, 1, 'noun'), Entity('_', 2, 3, 'pron'), '1'),
        Relation(Entity('_', 1, 2, 'pron'), Entity('_', 3, 4, 'noun'), '1'),
        Relation(Entity('_', 3, 4, 'noun'), Entity('_', 5, 6, 'noun'), '1'),
    }
    doc = Document('test', [], sentences, paragraphs, entities, rels)

    max_distance = 3
    actual_samples = get_pron_samples(doc, max_distance, True)
    expected_samples = [
        (Entity('_', 0, 1, 'noun'), Entity('_', 1, 2, 'pron'), None),
        (Entity('_', 1, 2, 'pron'), Entity('_', 3, 4, 'noun'), '1'),
        (Entity('_', 0, 1, 'noun'), Entity('_', 2, 3, 'pron'), '1'),
        (Entity('_', 2, 3, 'pron'), Entity('_', 3, 4, 'noun'), None),
    ]
    self.assertEqual(actual_samples, expected_samples)
def _get_relations(raw_relations: list, entities_dict: dict, symmetric_types: set):
    relations = set()
    for rel in raw_relations:
        e1 = entities_dict[rel['first']]
        e2 = entities_dict[rel['second']]
        rel_type = rel['type']
        relations.add(Relation(e1, e2, rel_type))
        if symmetric_types and rel_type in symmetric_types:
            relations.add(Relation(e2, e1, rel_type))
    return relations
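A minimal usage sketch of the raw-relation format this reader consumes; the entity ids, spans, and types below are illustrative, and Entity / Relation are the project's own data-model classes.

entities_dict = {"T1": Entity("T1", 0, 1, "Bacteria"), "T2": Entity("T2", 3, 5, "Habitat")}
raw_relations = [{"first": "T1", "second": "T2", "type": "Lives_In"}]
rels = _get_relations(raw_relations, entities_dict, symmetric_types={"Lives_In"})
# symmetric types are added in both directions
assert Relation(entities_dict["T2"], entities_dict["T1"], "Lives_In") in rels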
def _get_relations(self, predictions: dict) -> set:
    rels = set()
    for (e1, e2), label in predictions.items():
        rel_type = self.extractor.get_type(label)
        if rel_type is not None:
            rels.add(Relation(e1, e2, rel_type))
    return rels
def setUp(self) -> None:
    tokens = [
        "I", "will", "do", "my", "homework", "today", ".",
        "It", "is", "very", "hard", "but", "i", "don't", "care", "."
    ]
    sentences = [Sentence(0, 7), Sentence(7, 16)]
    paragraphs = [Paragraph(0, 2)]
    entities = [
        Entity("_", 0, 1, "t1"),
        Entity("_", 3, 5, "t2"),
        Entity("_", 7, 8, "t1"),
        Entity("_", 9, 11, "t2"),
        Entity("_", 10, 11, "t4")
    ]
    self.doc = Document("_", tokens, sentences, paragraphs, entities)
    self.relations = {
        Relation(entities[2], entities[3], "t1"),
        Relation(entities[3], entities[4], "t2")
    }
def collapse_intersecting_entities(entities: List[Entity], relations: Set[Relation]):
    # assume the entities list is sorted by start token
    entities_to_process = list(entities)
    entities_mapping = {}
    new_entities = []

    while entities_to_process:
        ent1 = entities_to_process.pop(0)
        ent_end = ent1.end_token
        type_ent = ent1.type
        ents_to_collapse = []

        for ent2 in entities_to_process:
            if ent2.start_token >= ent_end:
                continue
            if ent1.type != ent2.type:
                warn(f"Intersecting entities have different types: {ent1} absorbed {ent2}")
                assert not ent1.coincides(ent2), "Two entities of different types on the same span"
                assert ent1.contains(ent2) or ent2.contains(ent1) or not ent1.intersects(ent2), \
                    "Two entities of different types intersect but neither contains the other"
            ents_to_collapse.append(ent2)
            ent_end = max(ent_end, ent2.end_token)
            # the collapsed entity takes the type of the longest absorbed entity
            if len(ent2) > len(ent1):
                type_ent = ent2.type

        if not ents_to_collapse:
            new_ent = ent1
        else:
            new_ent = ent1.relocated(ent1.start_token, ent_end).with_type(type_ent)
        new_entities.append(new_ent)

        entities_mapping[ent1] = new_ent
        for ent2 in ents_to_collapse:
            entities_mapping[ent2] = new_ent
            entities_to_process.remove(ent2)

    new_relations = {
        Relation(entities_mapping[r.first_entity], entities_mapping[r.second_entity], r.type)
        for r in relations
    }

    # new_entities was constructed in sorted order
    return new_entities, new_relations
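A small hedged sketch of how the collapser behaves on nested mentions; the spans, ids, and types below are made up for illustration, and the behaviour follows the function above.

# "T2" is nested inside "T1" (same type), so both map to a single collapsed span.
entities = [Entity("T1", 0, 3, "Habitat"), Entity("T2", 1, 3, "Habitat"), Entity("T3", 5, 6, "Bacteria")]
relations = {Relation(entities[2], entities[1], "Lives_In")}
new_entities, new_relations = collapse_intersecting_entities(entities, relations)
assert len(new_entities) == 2  # the relation now points at the collapsed entity instead of "T2"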
def chain_similar_entities(
        doc: Document, entities: List[Entity],
        entity_comparator: Callable[[Document, Entity, Entity], bool] = compare_entities_by_tokens) \
        -> List[CoreferenceChain]:
    relations = set()
    for i, e1 in enumerate(entities):
        for e2 in entities[:i]:
            if entity_comparator(doc, e1, e2):
                relations.add(Relation(e1, e2, "match"))
    return collect_chains(relations, entities)
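Illustrative calls, assuming a Document with entities is already at hand; the default comparator matches mentions by their surface tokens, and any (doc, e1, e2) -> bool callable can be swapped in (the lambda below is illustrative only).

chains = chain_similar_entities(doc, list(doc.entities))
type_chains = chain_similar_entities(doc, list(doc.entities), lambda d, e1, e2: e1.type == e2.type)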
def predict_doc(self, doc, include_probs=False):
    doc = self.feature_computer.create_features_for_doc(doc)
    # parallel lists of segment features and segment entity pairs for all doc segments
    samples, entity_pairs = self.extractor.extract_features_from_doc(doc, use_filter=True)
    entity_pairs = sum(entity_pairs, [])

    outputs = ["predictions"]
    if include_probs:
        outputs.append("scores")
    # labels, [scores]
    out = predict_for_samples(self.graph, self.session, outputs,
                              get_coref_batcher_factory(samples, 300, self.extractor, False, False))

    relations = self._collect_pair_results(out[0], entity_pairs)
    relations = self._get_relations(relations)

    postprocessing_result = self.classifiers.apply(doc)
    postprocessing_rels = set()
    for (e1, e2), scores in postprocessing_result.items():
        label = max(scores, key=scores.get)
        if label is not None:
            postprocessing_rels.add(Relation(e1, e2, label))
    relations |= postprocessing_rels

    try:
        relations |= doc.relations
    except ValueError:
        pass

    ret = relations
    if include_probs:
        scores = self._collect_pair_results(out[1], entity_pairs)
        scores = self._get_scores(scores)
        scores = {**scores, **postprocessing_result}
        try:
            scores = {**scores, **get_known_rel_scores(doc.relations)}
        except ValueError:
            pass
        ret = (relations, scores)

    return ret
def test_2_entity_rel(self):
    sentences = [Sentence(0, 10)]
    paragraphs = [Paragraph(0, 1)]
    entities = [
        Entity('_', 0, 1, '1'),
        Entity('_', 1, 2, '1'),
    ]
    rels = {Relation(Entity('_', 0, 1, '1'), Entity('_', 1, 2, '1'), '1')}
    doc = Document('test', [], sentences, paragraphs, entities, rels)

    max_distance = 3
    actual_samples = get_samples(doc, max_distance, True)
    expected_samples = [(Entity('_', 0, 1, '1'), Entity('_', 1, 2, '1'), '1')]
    self.assertEqual(expected_samples, actual_samples)
def _get_rank_rels(entities, pairs):
    relations = set()
    for entity in entities:
        # rank all candidate antecedents of this entity and keep the best-scoring labelled one
        best_score = 0
        best_candidate = None
        best_label = None
        for (e1, e2), scores in pairs.items():
            if entity != e2:
                continue
            max_scores = max(scores.values())
            label = max(scores, key=scores.get)
            if max_scores > best_score and label is not None:
                best_score = max_scores
                best_candidate = e1
                best_label = label
        if best_candidate is not None:
            relations.add(Relation(entity, best_candidate, best_label))
    return relations
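A hedged sketch of the pairs layout this ranking expects: keys are (candidate, mention) tuples and values map labels to scores, with None assumed to be the "no relation" label, as the is-not-None check above suggests. All entities and scores below are illustrative.

e1, e2, e3 = Entity('_', 0, 1, 'noun'), Entity('_', 3, 4, 'pron'), Entity('_', 5, 6, 'noun')
pairs = {
    (e1, e2): {"COREF": 0.8, None: 0.2},
    (e3, e2): {"COREF": 0.4, None: 0.6},
}
rels = _get_rank_rels([e2], pairs)
assert rels == {Relation(e2, e1, "COREF")}  # e1 is the best-scoring antecedent of e2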
def collect_pron_vote_rank(pairs: Dict[tuple, dict], known_rels):
    """
    Collects coreference relations from scored mention pairs, using the known relations
    to recover the already established clusters of noun mentions. Each pronoun mention is
    scored against every cluster with the mean of its pairwise scores and is attached to
    the closest mention of the best-scoring cluster.

    :param pairs: scores of mention pairs
    :param known_rels: known relations
    :return: relations selected from pairs
    """
    entities = sum(map(lambda x: [x[0], x[1]], pairs), [])
    nouns = set(filter(lambda x: x.type != 'pron', entities))
    chains = collect_chains(known_rels, list(nouns))
    entities = set(filter(lambda x: x.type == 'pron', entities))

    rels = set()
    for entity in entities:
        best_score = 0
        best_candidate = None
        for chain in chains:
            if not chain.entities:
                continue
            chain_scores = []
            candidate = get_closest_entity(chain, entity, False)
            for e in chain.entities:
                score = None
                if (e, entity) in pairs:
                    score = pairs[(e, entity)]["COREF"]
                if (entity, e) in pairs:
                    score = pairs[(entity, e)]["COREF"]
                if score is not None:
                    chain_scores.append(score)
            chain_score = np.mean(chain_scores) if chain_scores else 0
            if best_candidate is None or best_score < chain_score:
                best_candidate = candidate
                best_score = chain_score
        if best_candidate is not None:
            rels.add(Relation(best_candidate, entity, "COREF"))
    return rels
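A toy sketch of the cluster-vote strategy; the mentions and scores are illustrative, and collect_chains / get_closest_entity behave as referenced in the function above.

noun1, noun2, pron = Entity('_', 0, 1, 'noun'), Entity('_', 3, 4, 'noun'), Entity('_', 6, 7, 'pron')
known_rels = {Relation(noun1, noun2, 'COREF')}                    # one existing cluster {noun1, noun2}
pairs = {(noun1, pron): {'COREF': 0.9}, (noun2, pron): {'COREF': 0.7}}
rels = collect_pron_vote_rank(pairs, known_rels)
# the pronoun is attached to the closest mention of the cluster; the cluster score is mean(0.9, 0.7)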
def _fix_entity_types(docs):
    ret = []
    for doc in docs:
        new_entities = []
        entity_mapping = {}
        new_rels = []
        for entity in doc.entities:
            # retype each mention as 'pron' or 'noun' based on the POS tag of its span head token
            head = find_span_head_token(doc, entity)
            if doc.token_features['pos'][head] == 'PRON':
                e_type = 'pron'
            else:
                e_type = 'noun'
            new_entity = entity.with_type(e_type)
            entity_mapping[entity] = new_entity
            new_entities.append(new_entity)
        for rel in doc.relations:
            new_rels.append(
                Relation(entity_mapping[rel.first_entity], entity_mapping[rel.second_entity], rel.type))
        ret.append(
            doc.without_relations().without_entities().with_entities(new_entities).with_relations(new_rels))
    return ret
def to_relations_set(self) -> List[Relation]:
    relations = []
    for i, entity in enumerate(self.entities):
        for next_entity in self.entities[i + 1:]:
            relations.append(Relation(entity, next_entity, "COREF"))
    return relations
def to_relations_chain(self) -> List[Relation]:
    relations = []
    for prev_entity, entity in zip(self.entities[:-1], self.entities[1:]):
        relations.append(Relation(prev_entity, entity, "COREF"))
    return relations
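The two conversions differ in how densely a chain is expressed as relations. A hedged sketch, assuming CoreferenceChain wraps an ordered list of mentions (the constructor signature below is an assumption):

chain = CoreferenceChain([Entity('_', 0, 1, 'noun'), Entity('_', 3, 4, 'pron'), Entity('_', 7, 8, 'pron')])
chain.to_relations_set()    # every pair: 3 COREF relations for 3 mentions
chain.to_relations_chain()  # consecutive mentions only: 2 COREF relations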
def _create_rel(idx1, idx2):
    return Relation(_create_entity(idx1), _create_entity(idx2), '_')
def _create_rel(e1, e2):
    return Relation(e1, e2, "T1")
def setUp(self) -> None:
    self.docs = [
        Document(
            '1',
            ['Во', 'время', 'своих', 'прогулок', 'в', 'окрестностях', 'Симеиза', 'я', 'обратил', 'внимание',
             'на', 'одинокую', 'дачу', ',', 'стоявшую', 'на', 'крутом', 'склоне', 'горы', '.', 'К', 'этой',
             'даче', 'не', 'было', 'проведено', 'даже', 'дороги', '.', 'Кругом', 'она', 'была', 'обнесена',
             'высоким', 'забором', ',', 'с', 'единственной', 'низкой', 'калиткой', ',', 'которая', 'всегда',
             'была', 'плотно', 'прикрыта', '.'],
            [Sentence(0, 20), Sentence(20, 29), Sentence(29, 47)],
            [Paragraph(0, 3)],
            [Entity('1', 2, 3, 'pron'), Entity('1', 7, 8, 'pron'), Entity('1', 11, 13, 'noun'),
             Entity('1', 21, 23, 'noun'), Entity('1', 30, 31, 'pron'), Entity('1', 33, 35, 'noun'),
             Entity('1', 37, 38, 'noun'), Entity('1', 37, 40, 'noun'), Entity('1', 41, 42, 'pron')],
            {
                Relation(Entity('1', 2, 3, 'pron'), Entity('1', 7, 8, 'pron'), 'COREF'),
                Relation(Entity('1', 11, 13, 'noun'), Entity('1', 21, 23, 'noun'), 'COREF'),
                Relation(Entity('1', 11, 13, 'noun'), Entity('1', 30, 31, 'pron'), 'COREF'),
                Relation(Entity('1', 21, 23, 'noun'), Entity('1', 30, 31, 'pron'), 'COREF'),
                Relation(Entity('1', 37, 40, 'noun'), Entity('1', 41, 42, 'pron'), 'COREF'),
            },
            {
                'pos': ['ADP', 'NOUN', 'DET', 'NOUN', 'ADP', 'NOUN', 'PROPN', 'PRON', 'VERB', 'NOUN', 'ADP', 'ADJ', 'NOUN', 'PUNCT', 'VERB', 'ADP', 'ADJ', 'NOUN', 'NOUN', 'PUNCT', 'ADP', 'DET', 'NOUN', 'PART', 'AUX', 'VERB', 'PART', 'NOUN', 'PUNCT', 'ADV', 'PRON', 'AUX', 'VERB', 'ADJ', 'NOUN', 'PUNCT', 'ADP', 'ADJ', 'ADJ', 'NOUN', 'PUNCT', 'PRON', 'ADV', 'AUX', 'ADV', 'VERB', 'PUNCT'],
                'dt_labels': ['case', 'fixed', 'amod', 'obl', 'case', 'nmod', 'nmod', 'nsubj', 'root', 'obj', 'case', 'amod', 'nmod', 'punct', 'amod', 'case', 'amod', 'obl', 'nmod', 'punct', 'case', 'amod', 'obl', 'advmod', 'aux:pass', 'root', 'advmod', 'nsubj', 'punct', 'advmod', 'nsubj', 'aux:pass', 'root', 'amod', 'obl', 'punct', 'case', 'amod', 'amod', 'conj', 'punct', 'nsubj', 'advmod', 'aux:pass', 'advmod', 'acl:relcl', 'punct'],
                'dt_head_distances': [3, -1, 1, 5, 1, -2, -1, 1, 0, -1, 2, 1, -3, -1, -2, 2, 1, -3, -1, -1, 2, 1, 3, 2, 1, 0, 1, -2, -1, 3, 2, 1, 0, 1, -2, -1, 3, 2, 1, -5, -1, 4, 3, 2, 1, -11, -1],
                'lemmas': ['во', 'время', 'свой', 'прогулка', 'в', 'окрестность', 'Симеиза', 'я', 'обращать', 'внимание', 'на', 'одинокий', 'дача', ',', 'стоять', 'на', 'крутой', 'склон', 'гора', '.', 'к', 'этот', 'дача', 'не', 'быть', 'проводить', 'даже', 'дорога', '.', 'кругом', 'она', 'быть', 'обнесен', 'высокий', 'забор', ',', 'с', 'единственный', 'низкий', 'калитка', ',', 'который', 'всегда', 'быть', 'плотно', 'прикрывать', '.'],
                'feats': [{}, {'Case': 'Accusative', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Neuter'}, {'Number': 'Plural', 'Pronoun': 'REFLEXIVE', 'Case': 'Genitive'}, {'Case': 'Genitive', 'Animacy': 'Inanimated', 'Number': 'Plural', 'Gender': 'Neuter'}, {}, {'Case': 'Prepositional', 'Animacy': 'Inanimated', 'Number': 'Plural', 'Gender': 'Masculine'}, {'Case': 'Genitive', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Masculine'}, {'Animacy': 'Animated', 'Gender': 'Masculine', 'Number': 'Singular', 'Pronoun': 'DEICTIC', 'Case': 'Nominative'}, {'Number': 'Singular', 'Gender': 'Masculine', 'Tense': 'Past', 'Mode': 'Indicative'}, {'Case': 'Accusative', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Neuter'}, {}, {'Case': 'Accusative', 'Representation': 'Participle', 'Number': 'Singular', 'Gender': 'Feminine', 'Tense': 'NotPast'}, {'Case': 'Accusative', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Feminine'}, {}, {'Case': 'Accusative', 'Representation': 'Participle', 'Number': 'Singular', 'Gender': 'Feminine', 'Tense': 'Past'}, {}, {'Case': 'Prepositional', 'Number': 'Singular', 'Gender': 'Masculine'}, {'Case': 'Prepositional', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Masculine'}, {'Case': 'Genitive', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Feminine'}, {}, {}, {'Case': 'Dative', 'Number': 'Singular', 'Gender': 'Feminine'}, {'Case': 'Dative', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Feminine'}, {}, {'Number': 'Singular', 'Gender': 'Neuter', 'Tense': 'Past', 'Mode': 'Indicative'}, {'Representation': 'Participle', 'Number': 'Singular', 'Gender': 'Neuter', 'Shortness': 'Short', 'Tense': 'Past', 'Voice': 'Passive'}, {}, {'Case': 'Genitive', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Feminine'}, {}, {}, {'Animacy': 'Animated', 'Gender': 'Feminine', 'Number': 'Singular', 'Pronoun': 'PERSONAL', 'Case': 'Nominative'}, {'Number': 'Singular', 'Gender': 'Feminine', 'Tense': 'Past', 'Mode': 'Indicative'}, {'Representation': 'Participle', 'Number': 'Singular', 'Gender': 'Feminine', 'Shortness': 'Short', 'Tense': 'Past', 'Voice': 'Passive'}, {'Case': 'Instrumental', 'Number': 'Singular', 'Gender': 'Masculine'}, {'Case': 'Instrumental', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Masculine'}, {}, {}, {'Case': 'Instrumental', 'Number': 'Singular', 'Gender': 'Feminine'}, {'Case': 'Instrumental', 'Number': 'Singular', 'Gender': 'Feminine'}, {'Case': 'Instrumental', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Feminine'}, {}, {'Case': 'Nominative', 'Number': 'Singular', 'Gender': 'Feminine'}, {}, {'Number': 'Singular', 'Gender': 'Feminine', 'Tense': 'Past', 'Mode': 'Indicative'}, {}, {'Representation': 'Participle', 'Number': 'Singular', 'Gender': 'Feminine', 'Shortness': 'Short', 'Tense': 'Past', 'Voice': 'Passive'}, {}],
                'said': ['O'] * 47,
            },
            {'ne': SortedSpansSet([Entity('1', 6, 7, 'GPE_CITY')])}
        ),
        Document(
            '1',
            ['Когда', 'мы', 'шли', 'по', 'тропинке', ',', 'каждый', 'был', 'доволен', 'и', 'думал', ',', 'что',
             'надул', 'другого', '.', 'Петька', 'изредка', 'посапывал', 'носом', '.', 'Давно', 'он', 'зарился',
             'на', 'моих', 'голубей', ',', 'еще', 'с', 'прошлой', 'зимы', ',', 'а', 'теперь', 'вот', 'счастье',
             'неожиданно', 'привалило', '.', 'А', 'у', 'меня', 'будет', 'пистолет', '.'],
            [Sentence(0, 16), Sentence(16, 21), Sentence(21, 40), Sentence(40, 46)],
            [Paragraph(0, 3)],
            [Entity('1', 1, 2, 'pron'), Entity('1', 16, 17, 'noun'), Entity('1', 22, 23, 'pron'),
             Entity('1', 25, 26, 'pron'), Entity('1', 25, 27, 'noun'), Entity('1', 42, 43, 'pron'),
             Entity('1', 44, 45, 'noun')],
            {
                Relation(Entity('1', 16, 17, 'noun'), Entity('1', 22, 23, 'pron'), 'COREF'),
                Relation(Entity('1', 25, 26, 'pron'), Entity('1', 42, 43, 'pron'), 'COREF'),
            },
            {
                'pos': ['SCONJ', 'PRON', 'VERB', 'ADP', 'NOUN', 'PUNCT', 'ADJ', 'AUX', 'ADJ', 'CCONJ', 'VERB', 'PUNCT', 'SCONJ', 'VERB', 'ADJ', 'PUNCT', 'NOUN', 'ADV', 'VERB', 'NOUN', 'PUNCT', 'ADV', 'PRON', 'VERB', 'ADP', 'DET', 'NOUN', 'PUNCT', 'ADV', 'ADP', 'NOUN', 'NOUN', 'PUNCT', 'CCONJ', 'ADV', 'PART', 'NOUN', 'ADV', 'VERB', 'PUNCT', 'CCONJ', 'ADP', 'PRON', 'VERB', 'NOUN', 'PUNCT'],
                'dt_labels': ['mark', 'nsubj', 'advcl', 'case', 'obl', 'punct', 'nsubj', 'cop', 'root', 'cc', 'conj', 'punct', 'mark', 'advcl', 'obj', 'punct', 'nsubj', 'advmod', 'root', 'obl', 'punct', 'advmod', 'nsubj', 'root', 'case', 'amod', 'obl', 'punct', 'advmod', 'case', 'obl', 'nmod', 'punct', 'cc', 'advmod', 'advmod', 'nsubj', 'advmod', 'conj', 'punct', 'cc', 'case', 'root', 'cop', 'nsubj', 'punct'],
                'dt_head_distances': [8, 1, 6, 1, -2, -1, 2, 1, 0, 1, -2, -1, -2, -3, -1, -1, 2, 1, 0, -1, -1, 2, 1, 0, 2, 1, -3, -1, 2, 1, -7, -1, -1, 5, 4, 1, 2, 1, -15, -1, 2, 1, 0, -1, -2, -1],
                'lemmas': ['когда', 'мы', 'идти', 'по', 'тропинка', ',', 'каждый', 'быть', 'довольный', 'и', 'думать', ',', 'что', 'надуть', 'другой', '.', 'Петька', 'изредка', 'посапывать', 'нос', '.', 'давно', 'он', 'зариться', 'на', 'мой', 'голубь', ',', 'еще', 'с', 'прошлый', 'зима', ',', 'а', 'теперь', 'вот', 'счастье', 'неожиданно', 'приваливать', '.', 'а', 'у', 'я', 'быть', 'пистолет', '.'],
                'feats': [{}, {'Animacy': 'Animated', 'Number': 'Plural', 'Pronoun': 'DEICTIC', 'Case': 'Nominative'}, {'Number': 'Plural', 'Tense': 'Past', 'Mode': 'Indicative'}, {}, {'Case': 'Dative', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Feminine'}, {}, {'Case': 'Nominative', 'Number': 'Singular', 'Gender': 'Masculine'}, {'Number': 'Singular', 'Gender': 'Masculine', 'Tense': 'Past', 'Mode': 'Indicative'}, {'Number': 'Singular', 'Gender': 'Masculine', 'Shortness': 'Short'}, {}, {'Number': 'Singular', 'Gender': 'Masculine', 'Tense': 'Past', 'Mode': 'Indicative'}, {}, {}, {'Number': 'Singular', 'Gender': 'Masculine', 'Tense': 'Past', 'Mode': 'Indicative'}, {'Case': 'Accusative', 'Animacy': 'Animated', 'Number': 'Singular', 'Gender': 'Masculine'}, {}, {'Case': 'Nominative', 'Animacy': 'Animated', 'Number': 'Singular', 'Gender': 'Masculine'}, {}, {'Number': 'Singular', 'Gender': 'Masculine', 'Tense': 'Past', 'Mode': 'Indicative'}, {'Case': 'Instrumental', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Masculine'}, {}, {}, {'Animacy': 'Animated', 'Gender': 'Masculine', 'Number': 'Singular', 'Pronoun': 'PERSONAL', 'Case': 'Nominative'}, {'Number': 'Singular', 'Gender': 'Masculine', 'Tense': 'Past', 'Mode': 'Indicative'}, {}, {'Animacy': 'Animated', 'Number': 'Plural', 'Pronoun': 'POSSESSIVE', 'Case': 'Accusative'}, {'Case': 'Accusative', 'Animacy': 'Animated', 'Number': 'Plural', 'Gender': 'Masculine'}, {}, {}, {}, {'Case': 'Genitive', 'Number': 'Singular', 'Gender': 'Feminine'}, {'Case': 'Genitive', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Feminine'}, {}, {}, {}, {}, {'Case': 'Nominative', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Neuter'}, {}, {'Number': 'Singular', 'Gender': 'Neuter', 'Tense': 'Past', 'Mode': 'Indicative'}, {}, {}, {}, {'Animacy': 'Animated', 'Gender': 'Masculine', 'Number': 'Singular', 'Pronoun': 'DEICTIC', 'Case': 'Genitive'}, {'Person': 'Third', 'Number': 'Singular', 'Tense': 'NotPast', 'Mode': 'Indicative'}, {'Case': 'Nominative', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Masculine'}, {}],
                'said': ['O'] * 46,
            },
            {'ne': SortedSpansSet([Entity('1', 16, 17, 'PERSON')])}
        )
    ]

    # empty sets are "known" rels
    self.hook = get_hook([doc.without_relations().with_relations(set()) for doc in self.docs])

    self.base_props = {
        "seed": 12345, "distance": 10, "max_distance": 10,
        "loss": "cross_entropy", "optimizer": "momentum", "lr_decay": 0.05, "momentum": 0.9,
        "dropout": 0.5, "internal_size": 10, "epoch": 1, "batch_size": 64, "learning_rate": 0.1,
        "clip_norm": 5, "max_candidate_distance": 50, "max_entity_distance": 50, "max_word_distance": 50,
        "max_sent_distance": 10, "max_dt_distance": 10, "dist_size": 50, "pos_emb_size": 0,
        "morph_feats_emb_size": 0, "entities_types_size": 20, "morph_feats_size": 0,
        "morph_feats_list": ["Gender", "Animacy", "Number"], "encoding_type": "lstm",
        "entity_encoding_size": 10, "encoding_size": 10,
        "classifiers": ["exact_match", "intersecting_mentions"], "use_filter": False,
        "max_sent_entities_distance": 10, "max_token_entities_distance": 20,
        "agreement_types": ["Gender", "Animacy", "Number"], "classifier_agreement_size": 0,
        "head_str_match_size": 0, "partial_str_match_size": 0, "ordered_partial_str_match_size": 0,
        "mention_interrelation_size": 0, "mention_distance_size": 0, "max_mention_distance": 50,
        "classifier_entity_distance_size": 0, "entities_types_in_classifier_size": 0,
        "head_ne_types_size": 0, "entities_token_distance_in_classifier_size": 0,
        "entities_sent_distance_in_classifier_size": 0, "encoder_entity_types_size": 0,
        "encoder_entity_ne_size": 0, "speech_types": ["said"], "speech_size": 0,
        "entity_encoding_type": "rnn", "classification_dense_size": 20,
    }
    self.experiment_props = {
        "sampling_strategy": ["coref_noun", "coref_pron_cluster", "coref_pron_cluster_strict", "coref_pron"]
    }
def _collapse_entities_in_doc(doc, entities_to_collapse: Iterable[Entity],
                              entity_types_to_collapse: Union[set, frozenset]):
    if set(doc.extras.keys()).difference({"ne"}):
        raise Exception("Currently only 'ne' extras are supported")

    # copy tokens and features so the original document is not affected
    tokens_to_process = list(doc.tokens)
    token_features_to_process = {k: list(v) for k, v in doc.token_features.items()}

    borders_to_change = {
        'entities_to_collapse': build_borders_dict(entities_to_collapse),
        'sentences': build_borders_dict(doc.sentences)
    }
    try:
        borders_to_change["entities"] = build_borders_dict(doc.entities)
    except ValueError:
        pass
    if "ne" in doc.extras:
        borders_to_change["ne"] = build_borders_dict(doc.extras["ne"])

    _collapse_entities_and_correct_features(entities_to_collapse, tokens_to_process,
                                            token_features_to_process, entity_types_to_collapse,
                                            borders_to_change)

    sentences_mapping = create_objects_with_new_borders(doc.sentences, borders_to_change['sentences'])
    collapsed_entities_mapping = create_objects_with_new_borders(
        entities_to_collapse, borders_to_change['entities_to_collapse'])

    if 'entities' in borders_to_change:
        doc_entities_mapping = create_objects_with_new_borders(doc.entities, borders_to_change['entities'])
        doc_entities = doc_entities_mapping.values()
    else:
        doc_entities = None

    if "ne" in doc.extras:
        ne_mapping = create_objects_with_new_borders(doc.extras["ne"], borders_to_change["ne"])
        extras = {"ne": SortedSpansSet(ne_mapping.values())}
    else:
        extras = None

    doc_to_process = Document(doc.name, tokens_to_process, sentences_mapping.values(), doc.paragraphs,
                              doc_entities, token_features=token_features_to_process, extras=extras)

    try:
        relations = [
            Relation(doc_entities_mapping[r.first_entity], doc_entities_mapping[r.second_entity], r.type)
            for r in doc.relations
        ]
        doc_to_process = doc_to_process.with_relations(relations)
    except ValueError:
        pass

    return doc_to_process, collapsed_entities_mapping
def make_document_from_json_file(file_path):
    d = load_json_file_as_dict(file_path)

    tokens = d.get('tokens', [])
    entities = d.get('entities', [])
    sentences = d.get('sentences', [])
    paragraphs = d.get('paragraphs', [])

    token_features = {}
    for feature in ['pos', 'entities_types', 'entities_depths', 'borders', 'dt_labels',
                    'dt_head_distances', 'dt_depths', 'dt_deltas_forward', 'dt_deltas_backward',
                    'dt_breakups_forward', 'dt_breakups_backward']:
        if feature in d:
            token_features[feature] = d[feature]

    relations = d.get('relations', [])

    doc_entities = []
    for ent in entities:
        id_, start_token, end_token, ent_type = tuple(ent)
        doc_entities.append(Entity(id_, start_token, end_token, ent_type))

    doc_sentences = []
    for sent in sentences:
        start_token, end_token = tuple(sent)
        doc_sentences.append(Sentence(start_token, end_token))

    doc_paragraphs = []
    for par in paragraphs:
        start_sentence, end_sentence = tuple(par)
        doc_paragraphs.append(Paragraph(start_sentence, end_sentence))

    doc_relations = []
    for rel in relations:
        e1 = None
        e2 = None
        e1_id, e2_id, rel_type = tuple(rel)
        for entity in doc_entities:
            if entity.id == e1_id:
                e1 = entity
            if entity.id == e2_id:
                e2 = entity
            if e1 is not None and e2 is not None:
                break
        doc_relations.append(Relation(e1, e2, rel_type))

    doc = Document("", tokens, doc_sentences, doc_paragraphs, token_features=token_features)
    if 'entities' in d:
        doc = doc.with_entities(doc_entities)
    if 'relations' in d:
        doc = doc.with_relations(doc_relations)

    return doc
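A hedged sketch of the JSON layout this loader expects, following the parsing code above; the file name and values are illustrative, and only keys present in the file end up on the Document.

# example_doc.json (hypothetical):
# {
#   "tokens": ["Recurrence", "of", "cholera", "."],
#   "sentences": [[0, 4]],
#   "paragraphs": [[0, 1]],
#   "entities": [["T1", 2, 3, "Bacteria"], ["T2", 0, 1, "Habitat"]],
#   "relations": [["T1", "T2", "Lives_In"]],
#   "pos": ["NOUN", "ADP", "NOUN", "PUNCT"]
# }
doc = make_document_from_json_file("example_doc.json")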
def setUp(self) -> None:
    self.docs = []

    # BB-event-4329237
    tokens = [
        "The", "in", "vitro", "assay", "of", "tuberculin", "hypersensitivity", "in", "Macaca", "mulatta",
        "sensitized", "with", "bacille", "Calmette", "Guerin", "cell", "wall", "vaccine", "and-or",
        "infected", "with", "virulent", "Mycobacterium", "tuberculosis", "."
    ]
    sentences = [Sentence(0, 25)]
    paragraphs = [Paragraph(0, 1)]
    entities = [
        Entity("T2", 8, 18, "Habitat"),
        Entity("T3", 8, 24, "Habitat"),
        Entity("T4", 12, 18, "Habitat"),
        Entity("T5", 12, 15, "Bacteria"),
        Entity("T6", 22, 24, "Bacteria")
    ]
    relations = {Relation(entities[4], entities[1], "Lives_In")}

    # token features generated by UDPipe
    pos = ['DET', 'ADP', 'NOUN', 'NOUN', 'ADP', 'NOUN', 'NOUN', 'ADP', 'PROPN', 'PROPN', 'VERB', 'ADP',
           'PROPN', 'PROPN', 'PROPN', 'NOUN', 'NOUN', 'NUM', 'NOUN', 'VERB', 'ADP', 'ADJ', 'PROPN',
           'NOUN', 'PUNCT']
    dt_labels = ['det', 'case', 'compound', 'nsubj', 'case', 'compound', 'nmod', 'case', 'compound',
                 'nmod', 'root', 'case', 'compound', 'flat', 'compound', 'compound', 'obl', 'nummod',
                 'appos', 'acl', 'case', 'amod', 'compound', 'obl', 'punct']
    dt_head_distances = [3, 2, 1, 7, 2, 1, -3, 2, 1, -6, 0, 5, 2, -1, 2, 1, -6, 1, -2, -1, 3, 2, 1, -4, -14]
    token_features = {"pos": pos, "dt_labels": dt_labels, "dt_head_distances": dt_head_distances}
    self.docs.append(Document("_", tokens, sentences, paragraphs, entities, relations, token_features))

    # BB-event-9564489
    tokens = ['Gingivomandibular', 'infection', 'due', 'to', 'Mycobacterium', 'kansasii', 'in', 'a',
              'patient', 'with', 'AIDS', '.']
    sentences = [Sentence(0, 12)]
    paragraphs = [Paragraph(0, 1)]
    entities = [
        Entity("T2", 0, 1, "Habitat"),
        Entity("T3", 4, 6, "Bacteria"),
        Entity("T4", 8, 11, "Habitat")
    ]
    relations = {
        Relation(entities[1], entities[0], "Lives_In"),
        Relation(entities[1], entities[2], "Lives_In")
    }

    # token features generated by UDPipe
    pos = ['ADJ', 'NOUN', 'ADP', 'ADP', 'PROPN', 'PROPN', 'ADP', 'DET', 'NOUN', 'ADP', 'NOUN', 'PUNCT']
    dt_labels = ['amod', 'root', 'case', 'fixed', 'compound', 'nmod', 'case', 'det', 'nmod', 'case',
                 'nmod', 'punct']
    dt_head_distances = [1, 0, 3, -1, 1, -4, 2, 1, -7, 1, -2, -10]
    token_features = {"pos": pos, "dt_labels": dt_labels, "dt_head_distances": dt_head_distances}
    self.docs.append(Document("_", tokens, sentences, paragraphs, entities, relations, token_features))

    self.docs_no_rels = [doc.without_relations() for doc in self.docs]

    self.props = {
        "shared": {
            "internal_emb_size": 10, "token_position_size": 10, "max_word_distance": 20,
            "dt_distance_emb_size": 10, "max_dt_distance": 10, "dt_depth_emb_size": 10,
            "max_dt_depth": 10, "pos_emb_size": 10
        },
        "add_we": "true",
        "add_shared": "true",
        "optimizer": "adam",
        "learning_rate": 0.01,
        "epoch": 2,
        "loss": "cross_entropy",
        "l2": 0.0001,
        "lr_decay": 0.1,
        "dropout": 0.5,
        "clip_norm": 1,
        "max_candidate_distance": 20,
        "batcher": {"batch_size": 8},
        "token_position_size": 10,
        "max_word_distance": 10,
        "encoding_size": 10,
        "entities_types_emb_size": 20,
        "entities_depth_emb_size": 10,
        "max_entities_depth": 2,
        "specific_encoder_size": 10,
        "aggregation": {"attention": {}, "max_pooling": {}, "mean_pooling": {}, "take_spans": {}, "last_hiddens": {}},
        "seed": 100
    }

    # GENIA id=10022435
    tokens = [
        "Glucocorticoid", "resistance", "in", "the", "squirrel", "monkey", "is", "associated", "with",
        "overexpression", "of", "the", "immunophilin", "FKBP51", "."
    ]
    sentences = [Sentence(0, 15)]
    paragraphs = [Paragraph(0, 1)]
    pos = ["NN", "NN", "IN", "DT", "NN", "NN", "VBZ", "VBN", "IN", "NN", "IN", "DT", "NN", "NN", "PERIOD"]
    dt_labels = ["compound", "nsubjpass", "case", "det", "compound", "nmod", "auxpass", "root", "case",
                 "nmod", "case", "det", "compound", "nmod", "dep"]
    dt_head_distances = [1, 6, 3, 2, 1, -4, 1, 0, 1, -2, 3, 2, 1, -4, -7]
    token_features = {"pos": pos, "dt_labels": dt_labels, "dt_head_distances": dt_head_distances}
    self.unlabeled_docs = [Document("_", tokens, sentences, paragraphs, token_features=token_features)]

    self.sdp_config = {
        "context_encoding_non_linearity_size": 10,
        "loss": "cross_entropy",
        "learning_rate": 0.02,
        "query_dense_size": 10,
        "clip_norm": 1,
        "batcher": {"batch_size": 1}
    }
    self.parser_config = {
        "context_encoding_non_linearity_size": 10,
        "loss": "cross_entropy",
        "learning_rate": 0.02,
        "clip_norm": 1,
        "batcher": {"batch_size": 1},
        "add_shared": True,
        "specific_encoder_size": 10,
        "sampling_strategy": "pos_filtering",
        "arc_token_distance_in_classifier_size": 10,
        "arc_token_distance_in_attention_size": 10,
        "max_arc_token_distance": 10,
        "aggregation": {
            "attention": {"type": "luong", "normalise_coefficients": True},
            "take_spans": {}
        }
    }