def test_ne_extras_collapse(self):
    nes = SortedSpansSet([
        Entity("_", 0, 1, "left"), Entity("_", 2, 4, "same"), Entity("_", 3, 4, "include"),
        Entity("_", 5, 6, "same"), Entity("_", 15, 19, "intersect"), Entity("_", 17, 20, "include"),
        Entity("_", 22, 25, "intersect")
    ])
    expected_nes = SortedSpansSet([
        Entity("_", 0, 1, "left"), Entity("_", 2, 3, "same"), Entity("_", 2, 3, "include"),
        Entity("_", 4, 5, "same"), Entity("_", 14, 17, "intersect"), Entity("_", 16, 17, "include"),
        Entity("_", 16, 18, "intersect")
    ])

    input_doc = self.doc.with_additional_extras({"ne": nes})
    actual_doc = EntitiesCollapser({"Habitat", "Bacteria", "Geographical"}).transform(input_doc)
    actual_extras = actual_doc.extras

    self.assertDictEqual(actual_extras, {"ne": expected_nes})
def test_collapsing_with_ne(self):
    input_doc = self.doc.with_additional_extras({"ne": self.doc.entities})
    input_doc = input_doc.without_relations().without_entities()

    entities = SortedSpansSet([
        Entity("_", 0, 1, "left"), Entity("_", 2, 4, "same"), Entity("_", 3, 4, "include"),
        Entity("_", 5, 6, "same"), Entity("_", 15, 19, "intersect"), Entity("_", 17, 20, "include"),
        Entity("_", 22, 25, "intersect")
    ])
    input_doc = input_doc.with_entities(entities)

    expected_tokens = [
        "Recurrence", "of", "$Bacteria$", "in", "$Geographical$", ".",
        "From", "Oct.", "30", "to", "Nov.", "7", ",", "1979", ",", "10", "$Habitat$",
        "had", "onset", "of", "bacteriologically", "confirmed", "$Bacteria$", "."
    ]
    expected_sentences = [Sentence(0, 6), Sentence(6, 24)]
    expected_paragraphs = [Paragraph(0, 1), Paragraph(1, 2)]

    expected_nes = SortedSpansSet([
        Entity("T1", 2, 3, "Habitat"), Entity("T2", 2, 3, "Bacteria"), Entity("T3", 2, 3, "Bacteria"),
        Entity("T4", 4, 5, "Geographical"), Entity("T5", 16, 17, "Habitat"),
        Entity("T6", 16, 17, "Habitat"), Entity("T7", 16, 17, "Geographical"),
        Entity("T8", 16, 17, "Geographical"), Entity("T9", 22, 23, "Bacteria")
    ])
    expected_entities = SortedSpansSet([
        Entity("_", 0, 1, "left"), Entity("_", 2, 3, "same"), Entity("_", 2, 3, "include"),
        Entity("_", 4, 5, "same"), Entity("_", 14, 17, "intersect"), Entity("_", 16, 17, "include"),
        Entity("_", 16, 18, "intersect")
    ])

    expected_doc = Document("_", expected_tokens, expected_sentences, expected_paragraphs,
                            expected_entities, extras={"ne": expected_nes})

    actual_doc = EntitiesCollapser({"Habitat", "Bacteria", "Geographical"}, True).transform(input_doc)
    self.assertEqual(expected_doc, actual_doc)
class TestSpansStorage(unittest.TestCase):
    def setUp(self):
        self.sents = SortedSpansSet([
            Sentence(6, 9), Sentence(0, 10), Sentence(4, 12), Sentence(6, 9), Sentence(6, 12)
        ])
        self.ents = SortedSpansSet([
            Entity('', 0, 5, ''), Entity('', 0, 5, ''), Entity('1', 2, 6, ''),
            Entity('2', 2, 7, ''), Entity('', 7, 9, ''), Entity('', 7, 9, '')
        ])

    def test_sent_contains(self):
        self.assertTrue(Sentence(0, 10) in self.sents)

    def test_sent_at_token_no_values(self):
        self.assertEqual([], self.sents.at_token(12))

    def test_sent_at_token_single_value(self):
        self.assertEqual([Sentence(0, 10)], self.sents.at_token(0))

    def test_sent_at_token_multiple_values(self):
        self.assertEqual([Sentence(0, 10), Sentence(4, 12)], self.sents.at_token(4))

    def test_ent_indexed_at_token_middle(self):
        self.assertEqual([(1, Entity('1', 2, 6, '')), (2, Entity('2', 2, 7, ''))],
                         self.ents.indexed_at_token(5))

    def test_ent_indexed_at_token_end(self):
        self.assertEqual([(3, Entity('', 7, 9, ''))], self.ents.indexed_at_token(8))

    def test_sent_contained_in_exact(self):
        self.assertEqual([Sentence(6, 9)], self.sents.contained_in(Sentence(6, 9)))

    def test_sent_contained_in_intersect(self):
        self.assertEqual([Sentence(0, 10), Sentence(6, 9)], self.sents.contained_in(Sentence(0, 11)))

    def test_ent_indexed_contained_inexact(self):
        self.assertEqual([(1, Entity('1', 2, 6, '')), (2, Entity('2', 2, 7, ''))],
                         self.ents.indexed_contained_in(Sentence(1, 8)))

    def test_ent_indexed_contained_exact(self):
        self.assertEqual([(1, Entity('1', 2, 6, '')), (2, Entity('2', 2, 7, ''))],
                         self.ents.indexed_contained_in(Sentence(2, 7)))
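# Hedged usage sketch (not part of the original suite) of the SortedSpansSet
# behaviour exercised above: duplicates are dropped, spans are kept sorted by
# position, and queries return the spans covering a token or contained in a span.
def _sorted_spans_set_demo():
    spans = SortedSpansSet([Sentence(6, 9), Sentence(0, 10), Sentence(6, 9)])
    assert Sentence(0, 10) in spans                                # membership test
    assert spans.at_token(7) == [Sentence(0, 10), Sentence(6, 9)]  # spans covering token 7
    assert spans.contained_in(Sentence(0, 9)) == [Sentence(6, 9)]  # spans fully inside [0, 9)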
def test_net_preprocessor(self):
    filter_types = {"TeamFilter"}
    ne_replacements = {"PlayerCoach1": "Coach", "PlayerCoach2": "Coach"}
    ent_replacements = {"PlayerCoach1": "PlayerCoach", "PlayerCoach2": "PlayerCoach"}
    preprocessor = NETPreprocessor(filter_types, ne_replacements, ent_replacements)

    expected_entities = [
        Entity("T1", 4, 5, "Team"), Entity("T2", 6, 8, "PlayerCoach"),
        Entity("T3", 23, 24, "PlayerCoach"), Entity("T5", 30, 31, "Team"),
        Entity("T6", 39, 41, "Coach"), Entity("T7", 42, 44, "Coach")
    ]
    expected_nes = SortedSpansSet([
        Entity("T1", 4, 5, "Team"), Entity("T2", 6, 8, "Coach"),
        Entity("T3", 23, 24, "Coach"), Entity("T5", 30, 31, "Team"),
        Entity("T6", 39, 41, "Coach"), Entity("T7", 42, 44, "Coach")
    ])

    expected_doc = self.doc.without_entities().with_entities(expected_entities). \
        with_additional_extras({"ne": expected_nes})

    self.assertEqual(expected_doc, preprocessor.process_doc(self.doc))

    props = {
        "ent_types_to_filter": ["TeamFilter"],
        "ne_types_merge_pattern": {"Coach": ["PlayerCoach1", "PlayerCoach2"]},
        "ent_types_merge_pattern": {"PlayerCoach": ["PlayerCoach1", "PlayerCoach2"]}
    }
    preprocessor = NETPreprocessor.from_props(props)
    self.assertEqual(expected_doc, preprocessor.process_doc(self.doc))
def setUp(self) -> None:
    tokens = ['Planning', 'of', 'work', 'of', 'Elon', "by", "Elon", "in", "LA", "in", "USA", "."]
    sents = [Sentence(0, 12)]
    ents = [Entity("_", 4, 5, "PER"), Entity("_", 6, 7, "PER"),
            Entity("_", 8, 9, "ORG"), Entity("_", 10, 11, "ORG")]
    nes = SortedSpansSet([
        Entity("gen", 0, 1, "STUFF"), Entity("gen", 4, 5, "PERORG"), Entity("gen", 6, 7, "PERORG"),
        Entity("gen", 8, 9, "PERORG"), Entity("gen", 10, 11, "PERORG")
    ])

    self.doc = Document('', tokens, sents, [], ents, extras={"ne": nes})
def test_ne_features(self):
    ents = [Entity("_", 4, 5, "PER"), Entity("_", 6, 7, "PER")]
    doc = Document('', ['Planning', 'of', 'work', 'of', 'Elon', "by", "Elon"], [Sentence(0, 7)], [],
                   extras={'ne': SortedSpansSet(ents)})

    fe, meta = ne_fe_factory([doc], {"ne_emb_size": 10})
    features = fe.extract_features_from_doc(doc, 3, 7)['ne']

    self.assertEqual(len(meta.get_embedded_features()), 1)
    self.assertEqual(len(features), 4)
    self.assertEqual(features[0], features[2])     # O == O
    self.assertEqual(features[1], features[3])     # I-PER == I-PER
    self.assertNotEqual(features[0], features[1])  # O != I-PER
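# Hedged reading of the assertions above (inferred from the test, not a spec):
# the factory builds one embedded feature from doc.extras["ne"], encoding each
# token in the requested range BIO-style, so tokens 3..6 come out as
# O, I-PER, O, I-PER, and equal labels map to equal feature values.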
def process_doc(self, doc: Document) -> Document:
    new_entities = []
    nes = []

    for ent in doc.entities:
        # drop entities of filtered types entirely
        if ent.type in self.__filter:
            continue
        # the same span may get different types as a named entity and as a task entity
        new_ne_type = self.__ne_replacements.get(ent.type, ent.type)
        new_ent_type = self.__ents_replacements.get(ent.type, ent.type)
        new_entities.append(ent.with_type(new_ent_type))
        nes.append(ent.with_type(new_ne_type))

    return doc.without_relations().without_entities().with_entities(new_entities). \
        with_additional_extras({"ne": SortedSpansSet(nes)})
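# Hedged usage sketch (illustrative document and type names, mirroring
# test_net_preprocessor): the same source span gets "Coach" as its named-entity
# type and "PlayerCoach" as its task-entity type.
def _net_preprocessor_demo():
    doc = Document("demo", ["Guardiola", "joined", "City"], [Sentence(0, 3)], [],
                   [Entity("T1", 0, 1, "PlayerCoach1"), Entity("T2", 2, 3, "Team")])
    preprocessor = NETPreprocessor(set(), {"PlayerCoach1": "Coach"}, {"PlayerCoach1": "PlayerCoach"})
    processed = preprocessor.process_doc(doc)
    assert [e.type for e in processed.entities] == ["PlayerCoach", "Team"]
    assert [e.type for e in processed.extras["ne"]] == ["Coach", "Team"]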
def get_extras(self, tokens, sentences):
    sents, sent_starts, raw_tokens = _get_space_joined_sentences(tokens, sentences)
    ne_doc = list(self.api.named_entities(sents, language=self.lang))

    raw_entities = []
    for sent_start, ne_sent in zip(sent_starts, ne_doc):
        for ne in ne_sent:
            # ne is (start, end, ..., type) with sentence-local offsets;
            # shift by the sentence start to get document-level offsets
            raw_entities.append({
                'id': str(len(raw_entities)),
                'type': ne[-1],
                'start': sent_start + ne[0],
                'end': sent_start + ne[1]
            })

    entities = align_raw_entities(raw_entities, raw_tokens)
    if self.remove_quotes:
        entities = self.__remove_quotes(tokens, entities)

    return {'ne': SortedSpansSet(entities)}
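# Hedged sketch of the intermediate payload (shape inferred from the code
# above; values illustrative): each raw entity carries document-level offsets
# obtained by shifting sentence-local ones, and align_raw_entities then snaps
# them onto token boundaries:
# raw_entities = [{'id': '0', 'type': 'PER', 'start': 17, 'end': 26}, ...]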
def test_fully_augmented(self):
    tokens = [
        "Elon", "Musk", "must", "donate", "Tesla", "to", "our", "subscribers", ".",
        "It", "is", "important", "!"
    ]
    sentences = [Sentence(0, 9), Sentence(9, 13)]
    entities = [
        Entity("_", 0, 2, "CEO"), Entity("_", 3, 4, "donate"), Entity("_", 4, 5, "Tesla"),
        Entity("_", 7, 8, "subscribers"), Entity("_", 9, 10, "It"), Entity("_", 11, 12, "important")
    ]
    nes = SortedSpansSet([Entity("_", 4, 5, "Tesla"), Entity("_", 11, 12, "important")])
    token_features = {
        "tokens": list(tokens),
        "pos": [
            "NNP", "NNP", "VB", "VB", "NNP", "TO", "NNPS", "NNS", "DOT",
            "NNP", "VB", "RB", "DOT"
        ]
    }

    expected_doc = Document("_", tokens, sentences, [], entities,
                            token_features=token_features, extras={"ne": nes})

    to_augment = ["CEO", "donate", "Tesla", "subscribers", "It", "important"]
    actual_doc = EntitiesUnquoteAugmentor(1.0, to_augment).transform(self.doc)

    self.assertEqual(expected_doc, actual_doc)
def setUp(self) -> None: tokens = [ "Elon", "Musk", "must", "«", "donate", "»", "\'", "Tesla", "\'", "to", "our", "\"", "subscribers", "\"", ".", "It", "is", "\"", "important", "\"", "!" ] sentences = [Sentence(0, 15), Sentence(15, 21)] entities = [ Entity("_", 0, 2, "CEO"), Entity("_", 4, 5, "donate"), Entity("_", 7, 8, "Tesla"), Entity("_", 12, 13, "subscribers"), Entity("_", 15, 16, "It"), Entity("_", 18, 19, "important") ] nes = SortedSpansSet( [Entity("_", 6, 9, "Tesla"), Entity("_", 17, 20, "important")]) token_features = { "tokens": list(tokens), "pos": [ "NNP", "NNP", "VB", "QUOTE", "VB", "QUOTE", "QUOTE", "NNP", "QUOTE", "TO", "NNPS", "QUOTE", "NNS", "QUOTE", "DOT", "NNP", "VB", "QUOTE", "RB", "QUOTE", "DOT" ] } self.doc = Document("_", tokens, sentences, [], entities, token_features=token_features, extras={"ne": nes})
def setUp(self) -> None:
    self.docs = []

    tokens = [
        "Главный", "тренер", "римского", "«", "Лацио", "»", "Симоне", "Индзаги", "продолжит",
        "работу", "с", "командой", ",", "сообщает", "пресс-служба", "клуба", ".",
        "Ранее", "сообщалось", ",", "что", "в", "услугах", "Индзаги", "заинтересованы",
        "«", "Милан", "»", "и", "«", "Ювентус", "»", ",", "которые", "пребывают", "без",
        "наставников", "после", "ухода", "Дженнаро", "Гаттузо", "и", "Массимилиано", "Аллегри", "."
    ]
    sentences = [Sentence(0, 17), Sentence(17, 45)]
    paragraphs = [Paragraph(0, 1)]
    entities = [
        Entity("T1", 4, 5, "Team"), Entity("T2", 6, 8, "Coach"), Entity("T3", 23, 24, "Coach"),
        Entity("T4", 26, 27, "Team"), Entity("T5", 30, 31, "Team"),
        Entity("T6", 39, 41, "Coach"), Entity("T7", 42, 44, "Coach")
    ]
    named_entities = [
        Entity("generated", 3, 6, "ORG"), Entity("generated", 6, 8, "PER"),
        Entity("generated", 23, 24, "PER"), Entity("generated", 25, 28, "ORG"),
        Entity("generated", 29, 32, "ORG"), Entity("generated", 39, 41, "PER"),
        Entity("generated", 42, 44, "PER")
    ]
    doc = Document("_", tokens, sentences, paragraphs, entities,
                   extras={"ne": SortedSpansSet(named_entities)})
    self.docs.append(doc)

    tokens = [
        "Врачи", "сборной", "Бразилии", "подтвердили", "травму", "нападающего",
        "«", "Пари", "Сен-Жермен", "»", "Неймара", ",", "полученную", "во", "время",
        "товарищеского", "матча", "с", "Катаром", "."
    ]
    sentences = [Sentence(0, 20)]
    paragraphs = [Paragraph(0, 1)]
    entities = [
        Entity("T1", 1, 3, "Team"), Entity("T2", 7, 9, "Team"),
        Entity("T3", 10, 11, "Player"), Entity("T4", 18, 19, "Team")
    ]
    named_entities = [
        Entity("generated", 1, 3, "ORG"), Entity("generated", 6, 10, "ORG"),
        Entity("generated", 10, 11, "PER"), Entity("generated", 18, 19, "ORG")
    ]
    doc = Document("_", tokens, sentences, paragraphs, entities,
                   extras={"ne": SortedSpansSet(named_entities)})
    self.docs.append(doc)

    self.common_props = {
        "seed": 1,
        "internal_emb_size": 10,
        "learning_rate": 0.005,
        "batcher": {
            "batch_size": 4,
        },
        "encoding_size": 1,
        "dropout": 0.5,
        "optimizer": "adam",
        "epoch": 2,
        "clip_norm": 5
    }

    self.docs_no_entities = [d.without_entities() for d in self.docs]
def transform(self, doc: Document) -> Document:
    if self.__prob == 0.0 or not self.__types:
        return doc

    borders_dicts = {
        'entities': build_borders_dict(doc.entities),
        'sentences': build_borders_dict(doc.sentences),
    }
    if 'ne' in doc.extras:
        borders_dicts['ne'] = build_borders_dict(doc.extras["ne"])
    if set(doc.extras.keys()).difference({"ne"}):
        raise Exception("Can only work with ne extras")

    quotes_idx = set()

    for ent in doc.entities:
        if ent.type not in self.__types or not self.__quoted(doc, ent):
            continue
        # keep the quotes with probability 1 - prob
        if random() < 1.0 - self.__prob:
            continue

        quotes_idx.add(ent.start_token - 1)
        quotes_idx.add(ent.end_token)

        ent_shifted_start, ent_shifted_end = borders_dicts["entities"][ent]

        for key, val in borders_dicts.items():
            shift_borders_after_collapse(val, ent_shifted_start - 1, ent_shifted_start, new_length=0)
            # shift second quote span after first quote replacement
            shift_borders_after_collapse(val, ent_shifted_end - 1, ent_shifted_end, new_length=0)

    new_tokens = [tok for idx, tok in enumerate(doc.tokens) if idx not in quotes_idx]
    new_sentences = create_objects_with_new_borders(doc.sentences, borders_dicts["sentences"])
    new_entities = create_objects_with_new_borders(doc.entities, borders_dicts["entities"])

    if "ne" in doc.extras:
        new_extras = {
            "ne": SortedSpansSet(
                create_objects_with_new_borders(doc.extras["ne"], borders_dicts["ne"]).values())
        }
    else:
        new_extras = None

    new_token_features = {
        k: [v for idx, v in enumerate(val) if idx not in quotes_idx]
        for k, val in doc.token_features.items()
    }

    return Document(doc.name, new_tokens, new_sentences.values(), doc.paragraphs,
                    new_entities.values(), token_features=new_token_features, extras=new_extras)
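# Hedged note on the border bookkeeping above (inferred from usage here, not a
# documented contract): build_borders_dict snapshots each span's current
# (start, end) borders; removing a quote token is announced with
# shift_borders_after_collapse(borders, start, end, new_length=0), which pulls
# every later border one position left; create_objects_with_new_borders then
# rebuilds the spans from the updated borders.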
def _collapse_entities_in_doc(doc, entities_to_collapse: Iterable[Entity],
                              entity_types_to_collapse: Union[set, frozenset]):
    if set(doc.extras.keys()).difference({"ne"}):
        raise Exception("Currently support only ne extras")

    # copy features not to affect default document
    tokens_to_process = list(doc.tokens)
    token_features_to_process = {k: list(v) for k, v in doc.token_features.items()}

    borders_to_change = {
        'entities_to_collapse': build_borders_dict(entities_to_collapse),
        'sentences': build_borders_dict(doc.sentences)
    }
    try:
        borders_to_change["entities"] = build_borders_dict(doc.entities)
    except ValueError:
        # the document has no entities of its own
        pass
    if "ne" in doc.extras:
        borders_to_change["ne"] = build_borders_dict(doc.extras["ne"])

    _collapse_entities_and_correct_features(entities_to_collapse, tokens_to_process,
                                            token_features_to_process, entity_types_to_collapse,
                                            borders_to_change)

    sentences_mapping = create_objects_with_new_borders(doc.sentences, borders_to_change['sentences'])
    collapsed_entities_mapping = create_objects_with_new_borders(
        entities_to_collapse, borders_to_change['entities_to_collapse'])

    if 'entities' in borders_to_change:
        doc_entities_mapping = create_objects_with_new_borders(doc.entities, borders_to_change['entities'])
        doc_entities = doc_entities_mapping.values()
    else:
        doc_entities = None

    if "ne" in doc.extras:
        ne_mapping = create_objects_with_new_borders(doc.extras["ne"], borders_to_change["ne"])
        extras = {"ne": SortedSpansSet(ne_mapping.values())}
    else:
        extras = None

    doc_to_process = Document(doc.name, tokens_to_process, sentences_mapping.values(),
                              doc.paragraphs, doc_entities,
                              token_features=token_features_to_process, extras=extras)

    try:
        # remap relations onto the shifted entities; accessing doc.relations
        # raises ValueError if the document has none, which we ignore
        relations = [
            Relation(doc_entities_mapping[r.first_entity], doc_entities_mapping[r.second_entity], r.type)
            for r in doc.relations
        ]
        doc_to_process = doc_to_process.with_relations(relations)
    except ValueError:
        pass

    return doc_to_process, collapsed_entities_mapping
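# Hedged usage sketch grounded in test_collapsing_with_ne above: a collapsed
# span becomes a single "$Type$" token and later spans shift left. The document
# below is illustrative and assumes extras defaults to an empty dict.
def _entities_collapser_demo():
    doc = Document("demo", ["Elon", "Musk", "founded", "Tesla"], [Sentence(0, 4)], [],
                   [Entity("T1", 0, 2, "PER"), Entity("T2", 3, 4, "ORG")])
    collapsed = EntitiesCollapser({"PER"}).transform(doc)
    assert list(collapsed.tokens) == ["$PER$", "founded", "Tesla"]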
def setUp(self) -> None:
    self.docs = [
        Document(
            '1',
            ['Во', 'время', 'своих', 'прогулок', 'в', 'окрестностях', 'Симеиза', 'я', 'обратил',
             'внимание', 'на', 'одинокую', 'дачу', ',', 'стоявшую', 'на', 'крутом', 'склоне',
             'горы', '.', 'К', 'этой', 'даче', 'не', 'было', 'проведено', 'даже', 'дороги', '.',
             'Кругом', 'она', 'была', 'обнесена', 'высоким', 'забором', ',', 'с', 'единственной',
             'низкой', 'калиткой', ',', 'которая', 'всегда', 'была', 'плотно', 'прикрыта', '.'],
            [Sentence(0, 20), Sentence(20, 29), Sentence(29, 47)],
            [Paragraph(0, 3)],
            [Entity('1', 2, 3, 'pron'), Entity('1', 7, 8, 'pron'), Entity('1', 11, 13, 'noun'),
             Entity('1', 21, 23, 'noun'), Entity('1', 30, 31, 'pron'), Entity('1', 33, 35, 'noun'),
             Entity('1', 37, 38, 'noun'), Entity('1', 37, 40, 'noun'), Entity('1', 41, 42, 'pron')],
            {
                Relation(Entity('1', 2, 3, 'pron'), Entity('1', 7, 8, 'pron'), 'COREF'),
                Relation(Entity('1', 11, 13, 'noun'), Entity('1', 21, 23, 'noun'), 'COREF'),
                Relation(Entity('1', 11, 13, 'noun'), Entity('1', 30, 31, 'pron'), 'COREF'),
                Relation(Entity('1', 21, 23, 'noun'), Entity('1', 30, 31, 'pron'), 'COREF'),
                Relation(Entity('1', 37, 40, 'noun'), Entity('1', 41, 42, 'pron'), 'COREF'),
            },
            {
                'pos': ['ADP', 'NOUN', 'DET', 'NOUN', 'ADP', 'NOUN', 'PROPN', 'PRON', 'VERB',
                        'NOUN', 'ADP', 'ADJ', 'NOUN', 'PUNCT', 'VERB', 'ADP', 'ADJ', 'NOUN',
                        'NOUN', 'PUNCT', 'ADP', 'DET', 'NOUN', 'PART', 'AUX', 'VERB', 'PART',
                        'NOUN', 'PUNCT', 'ADV', 'PRON', 'AUX', 'VERB', 'ADJ', 'NOUN', 'PUNCT',
                        'ADP', 'ADJ', 'ADJ', 'NOUN', 'PUNCT', 'PRON', 'ADV', 'AUX', 'ADV',
                        'VERB', 'PUNCT'],
                'dt_labels': ['case', 'fixed', 'amod', 'obl', 'case', 'nmod', 'nmod', 'nsubj',
                              'root', 'obj', 'case', 'amod', 'nmod', 'punct', 'amod', 'case',
                              'amod', 'obl', 'nmod', 'punct', 'case', 'amod', 'obl', 'advmod',
                              'aux:pass', 'root', 'advmod', 'nsubj', 'punct', 'advmod', 'nsubj',
                              'aux:pass', 'root', 'amod', 'obl', 'punct', 'case', 'amod', 'amod',
                              'conj', 'punct', 'nsubj', 'advmod', 'aux:pass', 'advmod',
                              'acl:relcl', 'punct'],
                'dt_head_distances': [3, -1, 1, 5, 1, -2, -1, 1, 0, -1, 2, 1, -3, -1, -2, 2, 1,
                                      -3, -1, -1, 2, 1, 3, 2, 1, 0, 1, -2, -1, 3, 2, 1, 0, 1,
                                      -2, -1, 3, 2, 1, -5, -1, 4, 3, 2, 1, -11, -1],
                'lemmas': ['во', 'время', 'свой', 'прогулка', 'в', 'окрестность', 'Симеиза', 'я',
                           'обращать', 'внимание', 'на', 'одинокий', 'дача', ',', 'стоять', 'на',
                           'крутой', 'склон', 'гора', '.', 'к', 'этот', 'дача', 'не', 'быть',
                           'проводить', 'даже', 'дорога', '.', 'кругом', 'она', 'быть', 'обнесен',
                           'высокий', 'забор', ',', 'с', 'единственный', 'низкий', 'калитка', ',',
                           'который', 'всегда', 'быть', 'плотно', 'прикрывать', '.'],
                'feats': [
                    {},
                    {'Case': 'Accusative', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Neuter'},
                    {'Number': 'Plural', 'Pronoun': 'REFLEXIVE', 'Case': 'Genitive'},
                    {'Case': 'Genitive', 'Animacy': 'Inanimated', 'Number': 'Plural', 'Gender': 'Neuter'},
                    {},
                    {'Case': 'Prepositional', 'Animacy': 'Inanimated', 'Number': 'Plural', 'Gender': 'Masculine'},
                    {'Case': 'Genitive', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Masculine'},
                    {'Animacy': 'Animated', 'Gender': 'Masculine', 'Number': 'Singular', 'Pronoun': 'DEICTIC', 'Case': 'Nominative'},
                    {'Number': 'Singular', 'Gender': 'Masculine', 'Tense': 'Past', 'Mode': 'Indicative'},
                    {'Case': 'Accusative', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Neuter'},
                    {},
                    {'Case': 'Accusative', 'Representation': 'Participle', 'Number': 'Singular', 'Gender': 'Feminine', 'Tense': 'NotPast'},
                    {'Case': 'Accusative', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Feminine'},
                    {},
                    {'Case': 'Accusative', 'Representation': 'Participle', 'Number': 'Singular', 'Gender': 'Feminine', 'Tense': 'Past'},
                    {},
                    {'Case': 'Prepositional', 'Number': 'Singular', 'Gender': 'Masculine'},
                    {'Case': 'Prepositional', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Masculine'},
                    {'Case': 'Genitive', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Feminine'},
                    {},
                    {},
                    {'Case': 'Dative', 'Number': 'Singular', 'Gender': 'Feminine'},
                    {'Case': 'Dative', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Feminine'},
                    {},
                    {'Number': 'Singular', 'Gender': 'Neuter', 'Tense': 'Past', 'Mode': 'Indicative'},
                    {'Representation': 'Participle', 'Number': 'Singular', 'Gender': 'Neuter', 'Shortness': 'Short', 'Tense': 'Past', 'Voice': 'Passive'},
                    {},
                    {'Case': 'Genitive', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Feminine'},
                    {},
                    {},
                    {'Animacy': 'Animated', 'Gender': 'Feminine', 'Number': 'Singular', 'Pronoun': 'PERSONAL', 'Case': 'Nominative'},
                    {'Number': 'Singular', 'Gender': 'Feminine', 'Tense': 'Past', 'Mode': 'Indicative'},
                    {'Representation': 'Participle', 'Number': 'Singular', 'Gender': 'Feminine', 'Shortness': 'Short', 'Tense': 'Past', 'Voice': 'Passive'},
                    {'Case': 'Instrumental', 'Number': 'Singular', 'Gender': 'Masculine'},
                    {'Case': 'Instrumental', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Masculine'},
                    {},
                    {},
                    {'Case': 'Instrumental', 'Number': 'Singular', 'Gender': 'Feminine'},
                    {'Case': 'Instrumental', 'Number': 'Singular', 'Gender': 'Feminine'},
                    {'Case': 'Instrumental', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Feminine'},
                    {},
                    {'Case': 'Nominative', 'Number': 'Singular', 'Gender': 'Feminine'},
                    {},
                    {'Number': 'Singular', 'Gender': 'Feminine', 'Tense': 'Past', 'Mode': 'Indicative'},
                    {},
                    {'Representation': 'Participle', 'Number': 'Singular', 'Gender': 'Feminine', 'Shortness': 'Short', 'Tense': 'Past', 'Voice': 'Passive'},
                    {}
                ],
                'said': ['O'] * 47,  # one label per token
            },
            {'ne': SortedSpansSet([Entity('1', 6, 7, 'GPE_CITY')])}
        ),
        Document(
            '1',
            ['Когда', 'мы', 'шли', 'по', 'тропинке', ',', 'каждый', 'был', 'доволен', 'и',
             'думал', ',', 'что', 'надул', 'другого', '.', 'Петька', 'изредка', 'посапывал',
             'носом', '.', 'Давно', 'он', 'зарился', 'на', 'моих', 'голубей', ',', 'еще', 'с',
             'прошлой', 'зимы', ',', 'а', 'теперь', 'вот', 'счастье', 'неожиданно', 'привалило',
             '.', 'А', 'у', 'меня', 'будет', 'пистолет', '.'],
            [Sentence(0, 16), Sentence(16, 21), Sentence(21, 40), Sentence(40, 46)],
            [Paragraph(0, 3)],
            [Entity('1', 1, 2, 'pron'), Entity('1', 16, 17, 'noun'), Entity('1', 22, 23, 'pron'),
             Entity('1', 25, 26, 'pron'), Entity('1', 25, 27, 'noun'), Entity('1', 42, 43, 'pron'),
             Entity('1', 44, 45, 'noun')],
            {
                Relation(Entity('1', 16, 17, 'noun'), Entity('1', 22, 23, 'pron'), 'COREF'),
                Relation(Entity('1', 25, 26, 'pron'), Entity('1', 42, 43, 'pron'), 'COREF'),
            },
            {
                'pos': ['SCONJ', 'PRON', 'VERB', 'ADP', 'NOUN', 'PUNCT', 'ADJ', 'AUX', 'ADJ',
                        'CCONJ', 'VERB', 'PUNCT', 'SCONJ', 'VERB', 'ADJ', 'PUNCT', 'NOUN', 'ADV',
                        'VERB', 'NOUN', 'PUNCT', 'ADV', 'PRON', 'VERB', 'ADP', 'DET', 'NOUN',
                        'PUNCT', 'ADV', 'ADP', 'NOUN', 'NOUN', 'PUNCT', 'CCONJ', 'ADV', 'PART',
                        'NOUN', 'ADV', 'VERB', 'PUNCT', 'CCONJ', 'ADP', 'PRON', 'VERB', 'NOUN',
                        'PUNCT'],
                'dt_labels': ['mark', 'nsubj', 'advcl', 'case', 'obl', 'punct', 'nsubj', 'cop',
                              'root', 'cc', 'conj', 'punct', 'mark', 'advcl', 'obj', 'punct',
                              'nsubj', 'advmod', 'root', 'obl', 'punct', 'advmod', 'nsubj',
                              'root', 'case', 'amod', 'obl', 'punct', 'advmod', 'case', 'obl',
                              'nmod', 'punct', 'cc', 'advmod', 'advmod', 'nsubj', 'advmod',
                              'conj', 'punct', 'cc', 'case', 'root', 'cop', 'nsubj', 'punct'],
                'dt_head_distances': [8, 1, 6, 1, -2, -1, 2, 1, 0, 1, -2, -1, -2, -3, -1, -1, 2,
                                      1, 0, -1, -1, 2, 1, 0, 2, 1, -3, -1, 2, 1, -7, -1, -1, 5,
                                      4, 1, 2, 1, -15, -1, 2, 1, 0, -1, -2, -1],
                'lemmas': ['когда', 'мы', 'идти', 'по', 'тропинка', ',', 'каждый', 'быть',
                           'довольный', 'и', 'думать', ',', 'что', 'надуть', 'другой', '.',
                           'Петька', 'изредка', 'посапывать', 'нос', '.', 'давно', 'он',
                           'зариться', 'на', 'мой', 'голубь', ',', 'еще', 'с', 'прошлый', 'зима',
                           ',', 'а', 'теперь', 'вот', 'счастье', 'неожиданно', 'приваливать', '.',
                           'а', 'у', 'я', 'быть', 'пистолет', '.'],
                'feats': [
                    {},
                    {'Animacy': 'Animated', 'Number': 'Plural', 'Pronoun': 'DEICTIC', 'Case': 'Nominative'},
                    {'Number': 'Plural', 'Tense': 'Past', 'Mode': 'Indicative'},
                    {},
                    {'Case': 'Dative', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Feminine'},
                    {},
                    {'Case': 'Nominative', 'Number': 'Singular', 'Gender': 'Masculine'},
                    {'Number': 'Singular', 'Gender': 'Masculine', 'Tense': 'Past', 'Mode': 'Indicative'},
                    {'Number': 'Singular', 'Gender': 'Masculine', 'Shortness': 'Short'},
                    {},
                    {'Number': 'Singular', 'Gender': 'Masculine', 'Tense': 'Past', 'Mode': 'Indicative'},
                    {},
                    {},
                    {'Number': 'Singular', 'Gender': 'Masculine', 'Tense': 'Past', 'Mode': 'Indicative'},
                    {'Case': 'Accusative', 'Animacy': 'Animated', 'Number': 'Singular', 'Gender': 'Masculine'},
                    {},
                    {'Case': 'Nominative', 'Animacy': 'Animated', 'Number': 'Singular', 'Gender': 'Masculine'},
                    {},
                    {'Number': 'Singular', 'Gender': 'Masculine', 'Tense': 'Past', 'Mode': 'Indicative'},
                    {'Case': 'Instrumental', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Masculine'},
                    {},
                    {},
                    {'Animacy': 'Animated', 'Gender': 'Masculine', 'Number': 'Singular', 'Pronoun': 'PERSONAL', 'Case': 'Nominative'},
                    {'Number': 'Singular', 'Gender': 'Masculine', 'Tense': 'Past', 'Mode': 'Indicative'},
                    {},
                    {'Animacy': 'Animated', 'Number': 'Plural', 'Pronoun': 'POSSESSIVE', 'Case': 'Accusative'},
                    {'Case': 'Accusative', 'Animacy': 'Animated', 'Number': 'Plural', 'Gender': 'Masculine'},
                    {},
                    {},
                    {},
                    {'Case': 'Genitive', 'Number': 'Singular', 'Gender': 'Feminine'},
                    {'Case': 'Genitive', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Feminine'},
                    {},
                    {},
                    {},
                    {},
                    {'Case': 'Nominative', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Neuter'},
                    {},
                    {'Number': 'Singular', 'Gender': 'Neuter', 'Tense': 'Past', 'Mode': 'Indicative'},
                    {},
                    {},
                    {},
                    {'Animacy': 'Animated', 'Gender': 'Masculine', 'Number': 'Singular', 'Pronoun': 'DEICTIC', 'Case': 'Genitive'},
                    {'Person': 'Third', 'Number': 'Singular', 'Tense': 'NotPast', 'Mode': 'Indicative'},
                    {'Case': 'Nominative', 'Animacy': 'Inanimated', 'Number': 'Singular', 'Gender': 'Masculine'},
                    {}
                ],
                'said': ['O'] * 46,  # one label per token
            },
            {'ne': SortedSpansSet([Entity('1', 16, 17, 'PERSON')])}
        )
    ]

    # empty sets are "known" rels
    self.hook = get_hook([doc.without_relations().with_relations(set()) for doc in self.docs])

    self.base_props = {
        "seed": 12345,
        "distance": 10,
        "max_distance": 10,
        "loss": "cross_entropy",
        "optimizer": "momentum",
        "lr_decay": 0.05,
        "momentum": 0.9,
        "dropout": 0.5,
        "internal_size": 10,
        "epoch": 1,
        "batch_size": 64,
        "learning_rate": 0.1,
        "clip_norm": 5,
        "max_candidate_distance": 50,
        "max_entity_distance": 50,
        "max_word_distance": 50,
        "max_sent_distance": 10,
        "max_dt_distance": 10,
        "dist_size": 50,
        "pos_emb_size": 0,
        "morph_feats_emb_size": 0,
        "entities_types_size": 20,
        "morph_feats_size": 0,
        "morph_feats_list": ["Gender", "Animacy", "Number"],
        "encoding_type": "lstm",
        "entity_encoding_size": 10,
        "encoding_size": 10,
        "classifiers": ["exact_match", "intersecting_mentions"],
        "use_filter": False,
        "max_sent_entities_distance": 10,
        "max_token_entities_distance": 20,
        "agreement_types": ["Gender", "Animacy", "Number"],
        "classifier_agreement_size": 0,
        "head_str_match_size": 0,
        "partial_str_match_size": 0,
        "ordered_partial_str_match_size": 0,
        "mention_interrelation_size": 0,
        "mention_distance_size": 0,
        "max_mention_distance": 50,
        "classifier_entity_distance_size": 0,
        "entities_types_in_classifier_size": 0,
        "head_ne_types_size": 0,
        "entities_token_distance_in_classifier_size": 0,
        "entities_sent_distance_in_classifier_size": 0,
        "encoder_entity_types_size": 0,
        "encoder_entity_ne_size": 0,
        "speech_types": ["said"],
        "speech_size": 0,
        "entity_encoding_type": "rnn",
        "classification_dense_size": 20,
    }
    self.experiment_props = {
        "sampling_strategy": ["coref_noun", "coref_pron_cluster", "coref_pron_cluster_strict", "coref_pron"]
    }