def test_entities_with_nesting_collapse(self):
    expected_tokens = [
        "Recurrence", "of", "$Bacteria$", "in", "Sardinia", ".",
        "From", "Oct.", "30", "to", "Nov.", "7", ",", "1979", ",", "10", "people", "in", "the",
        "Sardinian", "province", "of", "Cagliari", "had", "onset", "of",
        "bacteriologically", "confirmed", "$Bacteria$", "."
    ]
    expected_sentences = [Sentence(0, 6), Sentence(6, 30)]
    expected_entities = [
        Entity("T1", 2, 3, "Habitat"),
        Entity("T2", 2, 3, "Bacteria"),
        Entity("T3", 2, 3, "Bacteria"),
        Entity("T4", 4, 5, "Geographical"),
        Entity("T5", 16, 17, "Habitat"),
        Entity("T6", 16, 23, "Habitat"),
        Entity("T7", 19, 21, "Geographical"),
        Entity("T8", 22, 23, "Geographical"),
        Entity("T9", 28, 29, "Bacteria")
    ]
    expected_paragraphs = [Paragraph(0, 1), Paragraph(1, 2)]
    expected_relations = [
        Relation(expected_entities[0], expected_entities[1], "Lives_in"),
        Relation(expected_entities[8], expected_entities[6], "Lives_in")
    ]
    expected_doc = Document("_", expected_tokens, expected_sentences, expected_paragraphs,
                            expected_entities, expected_relations)

    actual_doc = EntitiesCollapser({"Bacteria"}).transform(self.doc)
    self.assertEqual(expected_doc, actual_doc)
def test_inner_entities_collapse(self):
    expected_tokens = [
        "Recurrence", "of", "Pelecypod-associated", "cholera", "in", "$Geographical$", ".",
        "From", "Oct.", "30", "to", "Nov.", "7", ",", "1979", ",", "10", "people", "in", "the",
        "$Geographical$", "of", "$Geographical$", "had", "onset", "of",
        "bacteriologically", "confirmed", "cholera", "."
    ]
    expected_sentences = [Sentence(0, 7), Sentence(7, 30)]
    expected_entities = [
        Entity("T1", 2, 3, "Habitat"),
        Entity("T2", 2, 4, "Bacteria"),
        Entity("T3", 3, 4, "Bacteria"),
        Entity("T4", 5, 6, "Geographical"),
        Entity("T5", 17, 18, "Habitat"),
        Entity("T6", 17, 23, "Habitat"),
        Entity("T7", 20, 21, "Geographical"),
        Entity("T8", 22, 23, "Geographical"),
        Entity("T9", 28, 29, "Bacteria")
    ]
    expected_paragraphs = [Paragraph(0, 1), Paragraph(1, 2)]
    expected_relations = [
        Relation(expected_entities[0], expected_entities[1], "Lives_in"),
        Relation(expected_entities[8], expected_entities[6], "Lives_in")
    ]
    expected_doc = Document("_", expected_tokens, expected_sentences, expected_paragraphs,
                            expected_entities, expected_relations)

    actual_doc = EntitiesCollapser({"Geographical"}).transform(self.doc)
    self.assertEqual(expected_doc, actual_doc)
def setUp(self):
    tokens = [
        "Recurrence", "of", "Pelecypod-associated", "cholera", "in", "Sardinia", ".",
        "From", "Oct.", "30", "to", "Nov.", "7", ",", "1979", ",", "10", "people", "in", "the",
        "Sardinian", "province", "of", "Cagliari", "had", "onset", "of",
        "bacteriologically", "confirmed", "cholera", "."
    ]
    sentences = [Sentence(0, 7), Sentence(7, 31)]
    entities = [
        Entity("T1", 2, 3, "Habitat"),
        Entity("T2", 2, 4, "Bacteria"),
        Entity("T3", 3, 4, "Bacteria"),
        Entity("T4", 5, 6, "Geographical"),
        Entity("T5", 17, 18, "Habitat"),
        Entity("T6", 17, 24, "Habitat"),
        Entity("T7", 20, 22, "Geographical"),
        Entity("T8", 23, 24, "Geographical"),
        Entity("T9", 29, 30, "Bacteria")
    ]
    paragraphs = [Paragraph(0, 1), Paragraph(1, 2)]
    relations = [
        Relation(entities[0], entities[1], "Lives_in"),
        Relation(entities[8], entities[6], "Lives_in")
    ]
    self.doc = Document("_", tokens, sentences, paragraphs, entities, relations)
def test_2_chains_2_pron(self):
    sentences = [Sentence(0, 10)]
    paragraphs = [Paragraph(0, 1)]
    entities = [
        Entity('_', 0, 1, 'noun'),
        Entity('_', 1, 2, 'pron'),
        Entity('_', 2, 3, 'pron'),
        Entity('_', 3, 4, 'noun'),
        Entity('_', 5, 6, 'noun'),
    ]
    rels = {
        Relation(Entity('_', 0, 1, 'noun'), Entity('_', 2, 3, 'pron'), '1'),
        Relation(Entity('_', 1, 2, 'pron'), Entity('_', 3, 4, 'noun'), '1'),
        Relation(Entity('_', 3, 4, 'noun'), Entity('_', 5, 6, 'noun'), '1'),
    }
    doc = Document('test', [], sentences, paragraphs, entities, rels)

    max_distance = 3
    actual_samples = get_pron_samples(doc, max_distance, True)
    expected_samples = [
        (Entity('_', 0, 1, 'noun'), Entity('_', 1, 2, 'pron'), None),
        (Entity('_', 1, 2, 'pron'), Entity('_', 3, 4, 'noun'), '1'),
        (Entity('_', 0, 1, 'noun'), Entity('_', 2, 3, 'pron'), '1'),
        (Entity('_', 2, 3, 'pron'), Entity('_', 3, 4, 'noun'), None),
    ]
    self.assertEqual(actual_samples, expected_samples)
def _get_relations(raw_relations: list, entities_dict: dict, symmetric_types: set):
    relations = set()
    for rel in raw_relations:
        e1 = entities_dict[rel['first']]
        e2 = entities_dict[rel['second']]
        rel_type = rel['type']
        relations.add(Relation(e1, e2, rel_type))
        if symmetric_types and rel_type in symmetric_types:
            relations.add(Relation(e2, e1, rel_type))
    return relations
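A minimal usage sketch of the raw-relation format this reader consumes; the entity ids, spans, and types below are illustrative, and Entity / Relation are the project's own data-model classes.

entities_dict = {"T1": Entity("T1", 0, 1, "Bacteria"), "T2": Entity("T2", 3, 5, "Habitat")}
raw_relations = [{"first": "T1", "second": "T2", "type": "Lives_In"}]
rels = _get_relations(raw_relations, entities_dict, symmetric_types={"Lives_In"})
# symmetric types are added in both directions
assert Relation(entities_dict["T2"], entities_dict["T1"], "Lives_In") in rels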
def _get_relations(self, predictions: dict) -> set:
    rels = set()
    for (e1, e2), label in predictions.items():
        rel_type = self.extractor.get_type(label)
        if rel_type is not None:
            rels.add(Relation(e1, e2, rel_type))
    return rels
def setUp(self) -> None:
    tokens = [
        "I", "will", "do", "my", "homework", "today", ".",
        "It", "is", "very", "hard", "but", "i", "don't", "care", "."
    ]
    sentences = [Sentence(0, 7), Sentence(7, 16)]
    paragraphs = [Paragraph(0, 2)]
    entities = [
        Entity("_", 0, 1, "t1"),
        Entity("_", 3, 5, "t2"),
        Entity("_", 7, 8, "t1"),
        Entity("_", 9, 11, "t2"),
        Entity("_", 10, 11, "t4")
    ]
    self.doc = Document("_", tokens, sentences, paragraphs, entities)
    self.relations = {
        Relation(entities[2], entities[3], "t1"),
        Relation(entities[3], entities[4], "t2")
    }
def collapse_intersecting_entities(entities: List[Entity], relations: Set[Relation]):
    # assume the entities list is sorted by start token
    entities_to_process = list(entities)
    entities_mapping = {}
    new_entities = []

    while entities_to_process:
        ent1 = entities_to_process.pop(0)
        ent_end = ent1.end_token
        type_ent = ent1.type
        ents_to_collapse = []

        for ent2 in entities_to_process:
            if ent2.start_token >= ent_end:
                continue
            if ent1.type != ent2.type:
                warn(f"Intersecting entities have different types: {ent1} absorbed {ent2}")
                assert not ent1.coincides(ent2), "Two entities of different types on the same span"
                assert ent1.contains(ent2) or ent2.contains(ent1) or not ent1.intersects(ent2), \
                    "Two entities of different types intersect but neither contains the other"
            ents_to_collapse.append(ent2)
            ent_end = max(ent_end, ent2.end_token)
            # the collapsed entity takes the type of the longest absorbed entity
            if len(ent2) > len(ent1):
                type_ent = ent2.type

        if not ents_to_collapse:
            new_ent = ent1
        else:
            new_ent = ent1.relocated(ent1.start_token, ent_end).with_type(type_ent)
        new_entities.append(new_ent)

        entities_mapping[ent1] = new_ent
        for ent2 in ents_to_collapse:
            entities_mapping[ent2] = new_ent
            entities_to_process.remove(ent2)

    new_relations = {
        Relation(entities_mapping[r.first_entity], entities_mapping[r.second_entity], r.type)
        for r in relations
    }

    # new_entities was constructed in sorted order
    return new_entities, new_relations
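A small hedged sketch of how the collapser behaves on nested mentions; the spans, ids, and types below are made up for illustration, and the behaviour follows the function above.

# "T2" is nested inside "T1" (same type), so both map to a single collapsed span.
entities = [Entity("T1", 0, 3, "Habitat"), Entity("T2", 1, 3, "Habitat"), Entity("T3", 5, 6, "Bacteria")]
relations = {Relation(entities[2], entities[1], "Lives_In")}
new_entities, new_relations = collapse_intersecting_entities(entities, relations)
assert len(new_entities) == 2  # the relation now points at the collapsed entity instead of "T2"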
def chain_similar_entities(
        doc: Document, entities: List[Entity],
        entity_comparator: Callable[[Document, Entity, Entity], bool] = compare_entities_by_tokens) \
        -> List[CoreferenceChain]:
    relations = set()
    for i, e1 in enumerate(entities):
        for e2 in entities[:i]:
            if entity_comparator(doc, e1, e2):
                relations.add(Relation(e1, e2, "match"))
    return collect_chains(relations, entities)
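Illustrative calls, assuming a Document with entities is already at hand; the default comparator matches mentions by their surface tokens, and any (doc, e1, e2) -> bool callable can be swapped in (the lambda below is illustrative only).

chains = chain_similar_entities(doc, list(doc.entities))
type_chains = chain_similar_entities(doc, list(doc.entities), lambda d, e1, e2: e1.type == e2.type)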
def predict_doc(self, doc, include_probs=False):
    doc = self.feature_computer.create_features_for_doc(doc)
    # parallel lists of segment features and segment entity pairs for all doc segments
    samples, entity_pairs = self.extractor.extract_features_from_doc(doc, use_filter=True)
    entity_pairs = sum(entity_pairs, [])

    outputs = ["predictions"]
    if include_probs:
        outputs.append("scores")
    # labels, [scores]
    out = predict_for_samples(self.graph, self.session, outputs,
                              get_coref_batcher_factory(samples, 300, self.extractor, False, False))

    relations = self._collect_pair_results(out[0], entity_pairs)
    relations = self._get_relations(relations)

    postprocessing_result = self.classifiers.apply(doc)
    postprocessing_rels = set()
    for (e1, e2), scores in postprocessing_result.items():
        label = max(scores, key=scores.get)
        if label is not None:
            postprocessing_rels.add(Relation(e1, e2, label))
    relations |= postprocessing_rels

    try:
        relations |= doc.relations
    except ValueError:
        pass

    ret = relations
    if include_probs:
        scores = self._collect_pair_results(out[1], entity_pairs)
        scores = self._get_scores(scores)
        scores = {**scores, **postprocessing_result}
        try:
            scores = {**scores, **get_known_rel_scores(doc.relations)}
        except ValueError:
            pass
        ret = (relations, scores)

    return ret
def test_2_entity_rel(self):
    sentences = [Sentence(0, 10)]
    paragraphs = [Paragraph(0, 1)]
    entities = [
        Entity('_', 0, 1, '1'),
        Entity('_', 1, 2, '1'),
    ]
    rels = {Relation(Entity('_', 0, 1, '1'), Entity('_', 1, 2, '1'), '1')}
    doc = Document('test', [], sentences, paragraphs, entities, rels)

    max_distance = 3
    actual_samples = get_samples(doc, max_distance, True)
    expected_samples = [(Entity('_', 0, 1, '1'), Entity('_', 1, 2, '1'), '1')]
    self.assertEqual(expected_samples, actual_samples)
def _get_rank_rels(entities, pairs):
    relations = set()
    for entity in entities:
        # rank all candidate antecedents of this entity and keep the best-scoring labelled one
        best_score = 0
        best_candidate = None
        best_label = None
        for (e1, e2), scores in pairs.items():
            if entity != e2:
                continue
            max_scores = max(scores.values())
            label = max(scores, key=scores.get)
            if max_scores > best_score and label is not None:
                best_score = max_scores
                best_candidate = e1
                best_label = label
        if best_candidate is not None:
            relations.add(Relation(entity, best_candidate, best_label))
    return relations
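A hedged sketch of the pairs layout this ranking expects: keys are (candidate, mention) tuples and values map labels to scores, with None assumed to be the "no relation" label, as the is-not-None check above suggests. All entities and scores below are illustrative.

e1, e2, e3 = Entity('_', 0, 1, 'noun'), Entity('_', 3, 4, 'pron'), Entity('_', 5, 6, 'noun')
pairs = {
    (e1, e2): {"COREF": 0.8, None: 0.2},
    (e3, e2): {"COREF": 0.4, None: 0.6},
}
rels = _get_rank_rels([e2], pairs)
assert rels == {Relation(e2, e1, "COREF")}  # e1 is the best-scoring antecedent of e2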
def collect_pron_vote_rank(pairs: Dict[tuple, dict], known_rels):
    """
    Collects coreference relations from scored mention pairs, using the known relations
    to recover the already established clusters of noun mentions. Each pronoun mention is
    scored against every cluster with the mean of its pairwise scores and is attached to
    the closest mention of the best-scoring cluster.

    :param pairs: scores of mention pairs
    :param known_rels: known relations
    :return: relations selected from pairs
    """
    entities = sum(map(lambda x: [x[0], x[1]], pairs), [])
    nouns = set(filter(lambda x: x.type != 'pron', entities))
    chains = collect_chains(known_rels, list(nouns))
    entities = set(filter(lambda x: x.type == 'pron', entities))

    rels = set()
    for entity in entities:
        best_score = 0
        best_candidate = None
        for chain in chains:
            if not chain.entities:
                continue
            chain_scores = []
            candidate = get_closest_entity(chain, entity, False)
            for e in chain.entities:
                score = None
                if (e, entity) in pairs:
                    score = pairs[(e, entity)]["COREF"]
                if (entity, e) in pairs:
                    score = pairs[(entity, e)]["COREF"]
                if score is not None:
                    chain_scores.append(score)
            chain_score = np.mean(chain_scores) if chain_scores else 0
            if best_candidate is None or best_score < chain_score:
                best_candidate = candidate
                best_score = chain_score
        if best_candidate is not None:
            rels.add(Relation(best_candidate, entity, "COREF"))
    return rels
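A toy sketch of the cluster-vote strategy; the mentions and scores are illustrative, and collect_chains / get_closest_entity behave as referenced in the function above.

noun1, noun2, pron = Entity('_', 0, 1, 'noun'), Entity('_', 3, 4, 'noun'), Entity('_', 6, 7, 'pron')
known_rels = {Relation(noun1, noun2, 'COREF')}                    # one existing cluster {noun1, noun2}
pairs = {(noun1, pron): {'COREF': 0.9}, (noun2, pron): {'COREF': 0.7}}
rels = collect_pron_vote_rank(pairs, known_rels)
# the pronoun is attached to the closest mention of the cluster; the cluster score is mean(0.9, 0.7)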
def _fix_entity_types(docs):
    ret = []
    for doc in docs:
        new_entities = []
        entity_mapping = {}
        new_rels = []
        for entity in doc.entities:
            # retype each mention as 'pron' or 'noun' based on the POS tag of its span head token
            head = find_span_head_token(doc, entity)
            if doc.token_features['pos'][head] == 'PRON':
                e_type = 'pron'
            else:
                e_type = 'noun'
            new_entity = entity.with_type(e_type)
            entity_mapping[entity] = new_entity
            new_entities.append(new_entity)
        for rel in doc.relations:
            new_rels.append(
                Relation(entity_mapping[rel.first_entity], entity_mapping[rel.second_entity], rel.type))
        ret.append(
            doc.without_relations().without_entities().with_entities(new_entities).with_relations(new_rels))
    return ret
def to_relations_set(self) -> List[Relation]:
    relations = []
    for i, entity in enumerate(self.entities):
        for next_entity in self.entities[i + 1:]:
            relations.append(Relation(entity, next_entity, "COREF"))
    return relations
def to_relations_chain(self) -> List[Relation]:
    relations = []
    for prev_entity, entity in zip(self.entities[:-1], self.entities[1:]):
        relations.append(Relation(prev_entity, entity, "COREF"))
    return relations
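The two conversions differ in how densely a chain is expressed as relations. A hedged sketch, assuming CoreferenceChain wraps an ordered list of mentions (the constructor signature below is an assumption):

chain = CoreferenceChain([Entity('_', 0, 1, 'noun'), Entity('_', 3, 4, 'pron'), Entity('_', 7, 8, 'pron')])
chain.to_relations_set()    # every pair: 3 COREF relations for 3 mentions
chain.to_relations_chain()  # consecutive mentions only: 2 COREF relations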
def _create_rel(idx1, idx2):
    return Relation(_create_entity(idx1), _create_entity(idx2), '_')
def _create_rel(e1, e2):
    return Relation(e1, e2, "T1")
def setUp(self) -> None:
    self.docs = [
        Document(
            '1',
            ['Во', 'время', 'своих', 'прогулок', 'в', 'окрестностях', 'Симеиза', 'я', 'обратил', 'внимание',
             'на', 'одинокую', 'дачу', ',', 'стоявшую', 'на', 'крутом', 'склоне', 'горы', '.', 'К', 'этой',
             'даче', 'не', 'было', 'проведено', 'даже', 'дороги', '.', 'Кругом', 'она', 'была', 'обнесена',
             'высоким', 'забором', ',', 'с', 'единственной', 'низкой', 'калиткой', ',', 'которая', 'всегда',
             'была', 'плотно', 'прикрыта', '.'],
            [Sentence(0, 20), Sentence(20, 29), Sentence(29, 47)],
            [Paragraph(0, 3)],
            [Entity('1', 2, 3, 'pron'), Entity('1', 7, 8, 'pron'), Entity('1', 11, 13, 'noun'),
             Entity('1', 21, 23, 'noun'), Entity('1', 30, 31, 'pron'), Entity('1', 33, 35, 'noun'),
             Entity('1', 37, 38, 'noun'), Entity('1', 37, 40, 'noun'), Entity('1', 41, 42, 'pron')],
            {
                Relation(Entity('1', 2, 3, 'pron'), Entity('1', 7, 8, 'pron'), 'COREF'),
                Relation(Entity('1', 11, 13, 'noun'), Entity('1', 21, 23, 'noun'), 'COREF'),
                Relation(Entity('1', 11, 13, 'noun'), Entity('1', 30, 31, 'pron'), 'COREF'),
                Relation(Entity('1', 21, 23, 'noun'), Entity('1', 30, 31, 'pron'), 'COREF'),
                Relation(Entity('1', 37, 40, 'noun'), Entity('1', 41, 42, 'pron'), 'COREF'),
            },
            {
                'pos': ['ADP', 'NOUN', 'DET', 'NOUN', 'ADP', 'NOUN', 'PROPN', 'PRON', 'VERB', 'NOUN', 'ADP', 'ADJ', 'NOUN', 'PUNCT', 'VERB', 'ADP', 'ADJ', 'NOUN', 'NOUN', 'PUNCT', 'ADP', 'DET', 'NOUN', 'PART', 'AUX', 'VERB', 'PART', 'NOUN', 'PUNCT', 'ADV', 'PRON', 'AUX', 'VERB', 'ADJ', 'NOUN', 'PUNCT', 'ADP', 'ADJ', 'ADJ', 'NOUN', 'PUNCT', 'PRON', 'ADV', 'AUX', 'ADV', 'VERB', 'PUNCT'],
                'dt_labels': ['case', 'fixed', 'amod', 'obl', 'case', 'nmod', 'nmod', 'nsubj', 'root', 'obj', 'case', 'amod', 'nmod', 'punct', 'amod', 'case', 'amod', 'obl', 'nmod', 'punct', 'case', 'amod', 'obl', 'advmod', 'aux:pass', 'root', 'advmod', 'nsubj', 'punct', 'advmod', 'nsubj', 'aux:pass', 'root', 'amod', 'obl', 'punct', 'case', 'amod', 'amod', 'conj', 'punct', 'nsubj', 'advmod', 'aux:pass', 'advmod', 'acl:relcl', 'punct'],
                'dt_head_distances': [3, -1, 1, 5, 1, -2, -1, 1, 0, -1, 2, 1, -3, -1, -2, 2, 1, -3, -1, -1, 2, 1, 3, 2, 1, 0, 1, -2, -1, 3, 2, 1, 0, 1, -2, -1, 3, 2, 1, -5, -1, 4, 3, 2, 1, -11, -1],
                'lemmas': ['во', 'время', 'свой', 'прогулка', 'в', 'окрестность', 'Симеиза', 'я', 'обращать', 'внимание', 'на', 'одинокий', 'дача', ',', 'стоять', 'на', 'крутой', 'склон', 'гора', '.', 'к', 'этот', 'дача', 'не', 'быть', 'проводить', 'даже', 'дорога', '.', 'кругом', 'она', 'быть', 'обнесен', 'высокий', 'забор', ',', 'с', 'единственный', 'низкий', 'калитка', ',', 'который', 'всегда', 'быть', 'плотно', 'прикрывать', '.'],
                'feats': [{}, {'Case': 'Accusative', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Neuter'}, {'Number': 'Plural', 'Pronoun': 'REFLEXIVE', 'Case': 'Genitive'}, {'Case': 'Genitive', 'Animacy': 'Inanimated', 'Number': 'Plural', 'Gender': 'Neuter'}, {}, {'Case': 'Prepositional', 'Animacy': 'Inanimated', 'Number': 'Plural', 'Gender': 'Masculine'}, {'Case': 'Genitive', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Masculine'}, {'Animacy': 'Animated', 'Gender': 'Masculine', 'Number': 'Singular', 'Pronoun': 'DEICTIC', 'Case': 'Nominative'}, {'Number': 'Singular', 'Gender': 'Masculine', 'Tense': 'Past', 'Mode': 'Indicative'}, {'Case': 'Accusative', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Neuter'}, {}, {'Case': 'Accusative', 'Representation': 'Participle', 'Number': 'Singular', 'Gender': 'Feminine', 'Tense': 'NotPast'}, {'Case': 'Accusative', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Feminine'}, {}, {'Case': 'Accusative', 'Representation': 'Participle', 'Number': 'Singular', 'Gender': 'Feminine', 'Tense': 'Past'}, {}, {'Case': 'Prepositional', 'Number': 'Singular', 'Gender': 'Masculine'}, {'Case': 'Prepositional', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Masculine'}, {'Case': 'Genitive', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Feminine'}, {}, {}, {'Case': 'Dative', 'Number': 'Singular', 'Gender': 'Feminine'}, {'Case': 'Dative', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Feminine'}, {}, {'Number': 'Singular', 'Gender': 'Neuter', 'Tense': 'Past', 'Mode': 'Indicative'}, {'Representation': 'Participle', 'Number': 'Singular', 'Gender': 'Neuter', 'Shortness': 'Short', 'Tense': 'Past', 'Voice': 'Passive'}, {}, {'Case': 'Genitive', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Feminine'}, {}, {}, {'Animacy': 'Animated', 'Gender': 'Feminine', 'Number': 'Singular', 'Pronoun': 'PERSONAL', 'Case': 'Nominative'}, {'Number': 'Singular', 'Gender': 'Feminine', 'Tense': 'Past', 'Mode': 'Indicative'}, {'Representation': 'Participle', 'Number': 'Singular', 'Gender': 'Feminine', 'Shortness': 'Short', 'Tense': 'Past', 'Voice': 'Passive'}, {'Case': 'Instrumental', 'Number': 'Singular', 'Gender': 'Masculine'}, {'Case': 'Instrumental', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Masculine'}, {}, {}, {'Case': 'Instrumental', 'Number': 'Singular', 'Gender': 'Feminine'}, {'Case': 'Instrumental', 'Number': 'Singular', 'Gender': 'Feminine'}, {'Case': 'Instrumental', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Feminine'}, {}, {'Case': 'Nominative', 'Number': 'Singular', 'Gender': 'Feminine'}, {}, {'Number': 'Singular', 'Gender': 'Feminine', 'Tense': 'Past', 'Mode': 'Indicative'}, {}, {'Representation': 'Participle', 'Number': 'Singular', 'Gender': 'Feminine', 'Shortness': 'Short', 'Tense': 'Past', 'Voice': 'Passive'}, {}],
                'said': ['O'] * 47,
            },
            {'ne': SortedSpansSet([Entity('1', 6, 7, 'GPE_CITY')])}
        ),
        Document(
            '1',
            ['Когда', 'мы', 'шли', 'по', 'тропинке', ',', 'каждый', 'был', 'доволен', 'и', 'думал', ',', 'что',
             'надул', 'другого', '.', 'Петька', 'изредка', 'посапывал', 'носом', '.', 'Давно', 'он', 'зарился',
             'на', 'моих', 'голубей', ',', 'еще', 'с', 'прошлой', 'зимы', ',', 'а', 'теперь', 'вот', 'счастье',
             'неожиданно', 'привалило', '.', 'А', 'у', 'меня', 'будет', 'пистолет', '.'],
            [Sentence(0, 16), Sentence(16, 21), Sentence(21, 40), Sentence(40, 46)],
            [Paragraph(0, 3)],
            [Entity('1', 1, 2, 'pron'), Entity('1', 16, 17, 'noun'), Entity('1', 22, 23, 'pron'),
             Entity('1', 25, 26, 'pron'), Entity('1', 25, 27, 'noun'), Entity('1', 42, 43, 'pron'),
             Entity('1', 44, 45, 'noun')],
            {
                Relation(Entity('1', 16, 17, 'noun'), Entity('1', 22, 23, 'pron'), 'COREF'),
                Relation(Entity('1', 25, 26, 'pron'), Entity('1', 42, 43, 'pron'), 'COREF'),
            },
            {
                'pos': ['SCONJ', 'PRON', 'VERB', 'ADP', 'NOUN', 'PUNCT', 'ADJ', 'AUX', 'ADJ', 'CCONJ', 'VERB', 'PUNCT', 'SCONJ', 'VERB', 'ADJ', 'PUNCT', 'NOUN', 'ADV', 'VERB', 'NOUN', 'PUNCT', 'ADV', 'PRON', 'VERB', 'ADP', 'DET', 'NOUN', 'PUNCT', 'ADV', 'ADP', 'NOUN', 'NOUN', 'PUNCT', 'CCONJ', 'ADV', 'PART', 'NOUN', 'ADV', 'VERB', 'PUNCT', 'CCONJ', 'ADP', 'PRON', 'VERB', 'NOUN', 'PUNCT'],
                'dt_labels': ['mark', 'nsubj', 'advcl', 'case', 'obl', 'punct', 'nsubj', 'cop', 'root', 'cc', 'conj', 'punct', 'mark', 'advcl', 'obj', 'punct', 'nsubj', 'advmod', 'root', 'obl', 'punct', 'advmod', 'nsubj', 'root', 'case', 'amod', 'obl', 'punct', 'advmod', 'case', 'obl', 'nmod', 'punct', 'cc', 'advmod', 'advmod', 'nsubj', 'advmod', 'conj', 'punct', 'cc', 'case', 'root', 'cop', 'nsubj', 'punct'],
                'dt_head_distances': [8, 1, 6, 1, -2, -1, 2, 1, 0, 1, -2, -1, -2, -3, -1, -1, 2, 1, 0, -1, -1, 2, 1, 0, 2, 1, -3, -1, 2, 1, -7, -1, -1, 5, 4, 1, 2, 1, -15, -1, 2, 1, 0, -1, -2, -1],
                'lemmas': ['когда', 'мы', 'идти', 'по', 'тропинка', ',', 'каждый', 'быть', 'довольный', 'и', 'думать', ',', 'что', 'надуть', 'другой', '.', 'Петька', 'изредка', 'посапывать', 'нос', '.', 'давно', 'он', 'зариться', 'на', 'мой', 'голубь', ',', 'еще', 'с', 'прошлый', 'зима', ',', 'а', 'теперь', 'вот', 'счастье', 'неожиданно', 'приваливать', '.', 'а', 'у', 'я', 'быть', 'пистолет', '.'],
                'feats': [{}, {'Animacy': 'Animated', 'Number': 'Plural', 'Pronoun': 'DEICTIC', 'Case': 'Nominative'}, {'Number': 'Plural', 'Tense': 'Past', 'Mode': 'Indicative'}, {}, {'Case': 'Dative', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Feminine'}, {}, {'Case': 'Nominative', 'Number': 'Singular', 'Gender': 'Masculine'}, {'Number': 'Singular', 'Gender': 'Masculine', 'Tense': 'Past', 'Mode': 'Indicative'}, {'Number': 'Singular', 'Gender': 'Masculine', 'Shortness': 'Short'}, {}, {'Number': 'Singular', 'Gender': 'Masculine', 'Tense': 'Past', 'Mode': 'Indicative'}, {}, {}, {'Number': 'Singular', 'Gender': 'Masculine', 'Tense': 'Past', 'Mode': 'Indicative'}, {'Case': 'Accusative', 'Animacy': 'Animated', 'Number': 'Singular', 'Gender': 'Masculine'}, {}, {'Case': 'Nominative', 'Animacy': 'Animated', 'Number': 'Singular', 'Gender': 'Masculine'}, {}, {'Number': 'Singular', 'Gender': 'Masculine', 'Tense': 'Past', 'Mode': 'Indicative'}, {'Case': 'Instrumental', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Masculine'}, {}, {}, {'Animacy': 'Animated', 'Gender': 'Masculine', 'Number': 'Singular', 'Pronoun': 'PERSONAL', 'Case': 'Nominative'}, {'Number': 'Singular', 'Gender': 'Masculine', 'Tense': 'Past', 'Mode': 'Indicative'}, {}, {'Animacy': 'Animated', 'Number': 'Plural', 'Pronoun': 'POSSESSIVE', 'Case': 'Accusative'}, {'Case': 'Accusative', 'Animacy': 'Animated', 'Number': 'Plural', 'Gender': 'Masculine'}, {}, {}, {}, {'Case': 'Genitive', 'Number': 'Singular', 'Gender': 'Feminine'}, {'Case': 'Genitive', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Feminine'}, {}, {}, {}, {}, {'Case': 'Nominative', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Neuter'}, {}, {'Number': 'Singular', 'Gender': 'Neuter', 'Tense': 'Past', 'Mode': 'Indicative'}, {}, {}, {}, {'Animacy': 'Animated', 'Gender': 'Masculine', 'Number': 'Singular', 'Pronoun': 'DEICTIC', 'Case': 'Genitive'}, {'Person': 'Third', 'Number': 'Singular', 'Tense': 'NotPast', 'Mode': 'Indicative'}, {'Case': 'Nominative', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Masculine'}, {}],
                'said': ['O'] * 46,
            },
            {'ne': SortedSpansSet([Entity('1', 16, 17, 'PERSON')])}
        )
    ]

    # empty sets are "known" rels
    self.hook = get_hook([doc.without_relations().with_relations(set()) for doc in self.docs])

    self.base_props = {
        "seed": 12345, "distance": 10, "max_distance": 10,
        "loss": "cross_entropy", "optimizer": "momentum", "lr_decay": 0.05, "momentum": 0.9,
        "dropout": 0.5, "internal_size": 10, "epoch": 1, "batch_size": 64, "learning_rate": 0.1,
        "clip_norm": 5, "max_candidate_distance": 50, "max_entity_distance": 50, "max_word_distance": 50,
        "max_sent_distance": 10, "max_dt_distance": 10, "dist_size": 50, "pos_emb_size": 0,
        "morph_feats_emb_size": 0, "entities_types_size": 20, "morph_feats_size": 0,
        "morph_feats_list": ["Gender", "Animacy", "Number"], "encoding_type": "lstm",
        "entity_encoding_size": 10, "encoding_size": 10,
        "classifiers": ["exact_match", "intersecting_mentions"], "use_filter": False,
        "max_sent_entities_distance": 10, "max_token_entities_distance": 20,
        "agreement_types": ["Gender", "Animacy", "Number"], "classifier_agreement_size": 0,
        "head_str_match_size": 0, "partial_str_match_size": 0, "ordered_partial_str_match_size": 0,
        "mention_interrelation_size": 0, "mention_distance_size": 0, "max_mention_distance": 50,
        "classifier_entity_distance_size": 0, "entities_types_in_classifier_size": 0,
        "head_ne_types_size": 0, "entities_token_distance_in_classifier_size": 0,
        "entities_sent_distance_in_classifier_size": 0, "encoder_entity_types_size": 0,
        "encoder_entity_ne_size": 0, "speech_types": ["said"], "speech_size": 0,
        "entity_encoding_type": "rnn", "classification_dense_size": 20,
    }
    self.experiment_props = {
        "sampling_strategy": ["coref_noun", "coref_pron_cluster", "coref_pron_cluster_strict", "coref_pron"]
    }
def _collapse_entities_in_doc(doc, entities_to_collapse: Iterable[Entity],
                              entity_types_to_collapse: Union[set, frozenset]):
    if set(doc.extras.keys()).difference({"ne"}):
        raise Exception("Currently only 'ne' extras are supported")

    # copy tokens and features so the original document is not affected
    tokens_to_process = list(doc.tokens)
    token_features_to_process = {k: list(v) for k, v in doc.token_features.items()}

    borders_to_change = {
        'entities_to_collapse': build_borders_dict(entities_to_collapse),
        'sentences': build_borders_dict(doc.sentences)
    }
    try:
        borders_to_change["entities"] = build_borders_dict(doc.entities)
    except ValueError:
        pass
    if "ne" in doc.extras:
        borders_to_change["ne"] = build_borders_dict(doc.extras["ne"])

    _collapse_entities_and_correct_features(entities_to_collapse, tokens_to_process,
                                            token_features_to_process, entity_types_to_collapse,
                                            borders_to_change)

    sentences_mapping = create_objects_with_new_borders(doc.sentences, borders_to_change['sentences'])
    collapsed_entities_mapping = create_objects_with_new_borders(
        entities_to_collapse, borders_to_change['entities_to_collapse'])

    if 'entities' in borders_to_change:
        doc_entities_mapping = create_objects_with_new_borders(doc.entities, borders_to_change['entities'])
        doc_entities = doc_entities_mapping.values()
    else:
        doc_entities = None

    if "ne" in doc.extras:
        ne_mapping = create_objects_with_new_borders(doc.extras["ne"], borders_to_change["ne"])
        extras = {"ne": SortedSpansSet(ne_mapping.values())}
    else:
        extras = None

    doc_to_process = Document(doc.name, tokens_to_process, sentences_mapping.values(), doc.paragraphs,
                              doc_entities, token_features=token_features_to_process, extras=extras)

    try:
        relations = [
            Relation(doc_entities_mapping[r.first_entity], doc_entities_mapping[r.second_entity], r.type)
            for r in doc.relations
        ]
        doc_to_process = doc_to_process.with_relations(relations)
    except ValueError:
        pass

    return doc_to_process, collapsed_entities_mapping
def make_document_from_json_file(file_path):
    d = load_json_file_as_dict(file_path)

    tokens = d.get('tokens', [])
    entities = d.get('entities', [])
    sentences = d.get('sentences', [])
    paragraphs = d.get('paragraphs', [])

    token_features = {}
    for feature in ['pos', 'entities_types', 'entities_depths', 'borders', 'dt_labels',
                    'dt_head_distances', 'dt_depths', 'dt_deltas_forward', 'dt_deltas_backward',
                    'dt_breakups_forward', 'dt_breakups_backward']:
        if feature in d:
            token_features[feature] = d[feature]

    relations = d.get('relations', [])

    doc_entities = []
    for ent in entities:
        id_, start_token, end_token, ent_type = tuple(ent)
        doc_entities.append(Entity(id_, start_token, end_token, ent_type))

    doc_sentences = []
    for sent in sentences:
        start_token, end_token = tuple(sent)
        doc_sentences.append(Sentence(start_token, end_token))

    doc_paragraphs = []
    for par in paragraphs:
        start_sentence, end_sentence = tuple(par)
        doc_paragraphs.append(Paragraph(start_sentence, end_sentence))

    doc_relations = []
    for rel in relations:
        e1 = None
        e2 = None
        e1_id, e2_id, rel_type = tuple(rel)
        for entity in doc_entities:
            if entity.id == e1_id:
                e1 = entity
            if entity.id == e2_id:
                e2 = entity
            if e1 is not None and e2 is not None:
                break
        doc_relations.append(Relation(e1, e2, rel_type))

    doc = Document("", tokens, doc_sentences, doc_paragraphs, token_features=token_features)
    if 'entities' in d:
        doc = doc.with_entities(doc_entities)
    if 'relations' in d:
        doc = doc.with_relations(doc_relations)

    return doc
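A hedged sketch of the JSON layout this loader expects, following the parsing code above; the file name and values are illustrative, and only keys present in the file end up on the Document.

# example_doc.json (hypothetical):
# {
#   "tokens": ["Recurrence", "of", "cholera", "."],
#   "sentences": [[0, 4]],
#   "paragraphs": [[0, 1]],
#   "entities": [["T1", 2, 3, "Bacteria"], ["T2", 0, 1, "Habitat"]],
#   "relations": [["T1", "T2", "Lives_In"]],
#   "pos": ["NOUN", "ADP", "NOUN", "PUNCT"]
# }
doc = make_document_from_json_file("example_doc.json")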
def setUp(self) -> None:
    self.docs = []

    # BB-event-4329237
    tokens = [
        "The", "in", "vitro", "assay", "of", "tuberculin", "hypersensitivity", "in", "Macaca", "mulatta",
        "sensitized", "with", "bacille", "Calmette", "Guerin", "cell", "wall", "vaccine", "and-or",
        "infected", "with", "virulent", "Mycobacterium", "tuberculosis", "."
    ]
    sentences = [Sentence(0, 25)]
    paragraphs = [Paragraph(0, 1)]
    entities = [
        Entity("T2", 8, 18, "Habitat"),
        Entity("T3", 8, 24, "Habitat"),
        Entity("T4", 12, 18, "Habitat"),
        Entity("T5", 12, 15, "Bacteria"),
        Entity("T6", 22, 24, "Bacteria")
    ]
    relations = {Relation(entities[4], entities[1], "Lives_In")}

    # token features generated by UDPipe
    pos = ['DET', 'ADP', 'NOUN', 'NOUN', 'ADP', 'NOUN', 'NOUN', 'ADP', 'PROPN', 'PROPN', 'VERB', 'ADP',
           'PROPN', 'PROPN', 'PROPN', 'NOUN', 'NOUN', 'NUM', 'NOUN', 'VERB', 'ADP', 'ADJ', 'PROPN',
           'NOUN', 'PUNCT']
    dt_labels = ['det', 'case', 'compound', 'nsubj', 'case', 'compound', 'nmod', 'case', 'compound',
                 'nmod', 'root', 'case', 'compound', 'flat', 'compound', 'compound', 'obl', 'nummod',
                 'appos', 'acl', 'case', 'amod', 'compound', 'obl', 'punct']
    dt_head_distances = [3, 2, 1, 7, 2, 1, -3, 2, 1, -6, 0, 5, 2, -1, 2, 1, -6, 1, -2, -1, 3, 2, 1, -4, -14]
    token_features = {"pos": pos, "dt_labels": dt_labels, "dt_head_distances": dt_head_distances}
    self.docs.append(Document("_", tokens, sentences, paragraphs, entities, relations, token_features))

    # BB-event-9564489
    tokens = ['Gingivomandibular', 'infection', 'due', 'to', 'Mycobacterium', 'kansasii', 'in', 'a',
              'patient', 'with', 'AIDS', '.']
    sentences = [Sentence(0, 12)]
    paragraphs = [Paragraph(0, 1)]
    entities = [
        Entity("T2", 0, 1, "Habitat"),
        Entity("T3", 4, 6, "Bacteria"),
        Entity("T4", 8, 11, "Habitat")
    ]
    relations = {
        Relation(entities[1], entities[0], "Lives_In"),
        Relation(entities[1], entities[2], "Lives_In")
    }

    # token features generated by UDPipe
    pos = ['ADJ', 'NOUN', 'ADP', 'ADP', 'PROPN', 'PROPN', 'ADP', 'DET', 'NOUN', 'ADP', 'NOUN', 'PUNCT']
    dt_labels = ['amod', 'root', 'case', 'fixed', 'compound', 'nmod', 'case', 'det', 'nmod', 'case',
                 'nmod', 'punct']
    dt_head_distances = [1, 0, 3, -1, 1, -4, 2, 1, -7, 1, -2, -10]
    token_features = {"pos": pos, "dt_labels": dt_labels, "dt_head_distances": dt_head_distances}
    self.docs.append(Document("_", tokens, sentences, paragraphs, entities, relations, token_features))

    self.docs_no_rels = [doc.without_relations() for doc in self.docs]

    self.props = {
        "shared": {
            "internal_emb_size": 10, "token_position_size": 10, "max_word_distance": 20,
            "dt_distance_emb_size": 10, "max_dt_distance": 10, "dt_depth_emb_size": 10,
            "max_dt_depth": 10, "pos_emb_size": 10
        },
        "add_we": "true",
        "add_shared": "true",
        "optimizer": "adam",
        "learning_rate": 0.01,
        "epoch": 2,
        "loss": "cross_entropy",
        "l2": 0.0001,
        "lr_decay": 0.1,
        "dropout": 0.5,
        "clip_norm": 1,
        "max_candidate_distance": 20,
        "batcher": {"batch_size": 8},
        "token_position_size": 10,
        "max_word_distance": 10,
        "encoding_size": 10,
        "entities_types_emb_size": 20,
        "entities_depth_emb_size": 10,
        "max_entities_depth": 2,
        "specific_encoder_size": 10,
        "aggregation": {"attention": {}, "max_pooling": {}, "mean_pooling": {}, "take_spans": {}, "last_hiddens": {}},
        "seed": 100
    }

    # GENIA id=10022435
    tokens = [
        "Glucocorticoid", "resistance", "in", "the", "squirrel", "monkey", "is", "associated", "with",
        "overexpression", "of", "the", "immunophilin", "FKBP51", "."
    ]
    sentences = [Sentence(0, 15)]
    paragraphs = [Paragraph(0, 1)]
    pos = ["NN", "NN", "IN", "DT", "NN", "NN", "VBZ", "VBN", "IN", "NN", "IN", "DT", "NN", "NN", "PERIOD"]
    dt_labels = ["compound", "nsubjpass", "case", "det", "compound", "nmod", "auxpass", "root", "case",
                 "nmod", "case", "det", "compound", "nmod", "dep"]
    dt_head_distances = [1, 6, 3, 2, 1, -4, 1, 0, 1, -2, 3, 2, 1, -4, -7]
    token_features = {"pos": pos, "dt_labels": dt_labels, "dt_head_distances": dt_head_distances}
    self.unlabeled_docs = [Document("_", tokens, sentences, paragraphs, token_features=token_features)]

    self.sdp_config = {
        "context_encoding_non_linearity_size": 10,
        "loss": "cross_entropy",
        "learning_rate": 0.02,
        "query_dense_size": 10,
        "clip_norm": 1,
        "batcher": {"batch_size": 1}
    }
    self.parser_config = {
        "context_encoding_non_linearity_size": 10,
        "loss": "cross_entropy",
        "learning_rate": 0.02,
        "clip_norm": 1,
        "batcher": {"batch_size": 1},
        "add_shared": True,
        "specific_encoder_size": 10,
        "sampling_strategy": "pos_filtering",
        "arc_token_distance_in_classifier_size": 10,
        "arc_token_distance_in_attention_size": 10,
        "max_arc_token_distance": 10,
        "aggregation": {
            "attention": {"type": "luong", "normalise_coefficients": True},
            "take_spans": {}
        }
    }