Example #1
0
    def setUp(self):
        sent_1_tokens = [
            "Human", "and", "tick", "spotted", "fever", "group", "Rickettsia",
            "isolates", "from", "Israel", ":", "a", "genotypic", "analysis",
            "."
        ]
        sent_1_head_distances = [
            3, -1, -2, 0, 2, 1, 1, 7, -1, -1, 4, 2, 1, 1, -11
        ]

        self.doc_with_1_sent = Document(
            "",
            sent_1_tokens, [Sentence(0, len(sent_1_tokens))],
            [Paragraph(0, 1)],
            token_features={"dt_head_distances": sent_1_head_distances})

        sent_2_tokens = [
            "The", "precise", "mechanisms", "that", "initiate", "bacterial",
            "uptake", "have", "not", "yet", "been", "elucidated", "."
        ]
        sent_2_head_distances = [2, 1, 9, 1, -2, 1, -2, 4, 3, 2, 1, 0, -1]

        self.doc_with_2_sent = Document(
            "",
            sent_1_tokens + sent_2_tokens, [
                Sentence(0, len(sent_1_tokens)),
                Sentence(len(sent_1_tokens),
                         len(sent_1_tokens) + len(sent_2_tokens))
            ], [Paragraph(0, 2)],
            token_features={
                "dt_head_distances":
                sent_1_head_distances + sent_2_head_distances
            })
Example #2
0
    def setUp(self):
        tokens = [
            "Recurrence", "of", "Pelecypod-associated", "cholera", "in",
            "Sardinia", ".", "From", "Oct.", "30", "to", "Nov.", "7", ",",
            "1979", ",", "10", "people", "in", "the", "Sardinian", "province",
            "of", "Cagliari", "had", "onset", "of", "bacteriologically",
            "confirmed", "cholera", "."
        ]
        sentences = [Sentence(0, 7), Sentence(7, 31)]

        entities = [
            Entity("T1", 2, 3, "Habitat"),
            Entity("T2", 2, 4, "Bacteria"),
            Entity("T3", 3, 4, "Bacteria"),
            Entity("T4", 5, 6, "Geographical"),
            Entity("T5", 17, 18, "Habitat"),
            Entity("T6", 17, 24, "Habitat"),
            Entity("T7", 20, 22, "Geographical"),
            Entity("T8", 23, 24, "Geographical"),
            Entity("T9", 29, 30, "Bacteria")
        ]

        paragraphs = [Paragraph(0, 1), Paragraph(1, 2)]
        relations = [
            Relation(entities[0], entities[1], "Lives_in"),
            Relation(entities[8], entities[6], "Lives_in")
        ]

        self.doc = Document("_", tokens, sentences, paragraphs, entities,
                            relations)
Example #3
0
    def test_inner_entities_collapse(self):
        expected_tokens = [
            "Recurrence", "of", "Pelecypod-associated", "cholera", "in",
            "$Geographical$", ".", "From", "Oct.", "30", "to", "Nov.", "7",
            ",", "1979", ",", "10", "people", "in", "the", "$Geographical$",
            "of", "$Geographical$", "had", "onset", "of", "bacteriologically",
            "confirmed", "cholera", "."
        ]
        expected_sentences = [Sentence(0, 7), Sentence(7, 30)]

        expected_entities = [
            Entity("T1", 2, 3, "Habitat"),
            Entity("T2", 2, 4, "Bacteria"),
            Entity("T3", 3, 4, "Bacteria"),
            Entity("T4", 5, 6, "Geographical"),
            Entity("T5", 17, 18, "Habitat"),
            Entity("T6", 17, 23, "Habitat"),
            Entity("T7", 20, 21, "Geographical"),
            Entity("T8", 22, 23, "Geographical"),
            Entity("T9", 28, 29, "Bacteria")
        ]

        expected_paragraphs = [Paragraph(0, 1), Paragraph(1, 2)]
        expected_relations = [
            Relation(expected_entities[0], expected_entities[1], "Lives_in"),
            Relation(expected_entities[8], expected_entities[6], "Lives_in")
        ]

        expected_doc = Document("_", expected_tokens, expected_sentences,
                                expected_paragraphs, expected_entities,
                                expected_relations)

        actual_doc = EntitiesCollapser({"Geographical"}).transform(self.doc)
        self.assertEqual(expected_doc, actual_doc)
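
The expected spans above follow from a simple re-indexing rule: collapsing an entity replaces its tokens with one placeholder, so every boundary past the collapsed region shifts left by the region length minus one, and boundaries that fall strictly inside it clamp to the placeholder. A minimal sketch of that arithmetic (an illustration only, not the actual EntitiesCollapser code; collapse_shift is a made-up name):

def collapse_shift(span, region):
    """Re-index a half-open (start, end) token span after the tokens in
    `region` are collapsed into a single placeholder token."""
    (start, end), (r_start, r_end) = span, region
    removed = (r_end - r_start) - 1

    def shift(pos, is_end):
        if pos <= r_start:
            return pos
        if pos >= r_end:
            return pos - removed
        # boundary strictly inside the collapsed region: clamp to the placeholder
        return r_start + 1 if is_end else r_start

    return shift(start, False), shift(end, True)

# After "Sardinian province", span (20, 22), collapses to "$Geographical$",
# "Cagliari" moves from (23, 24) to (22, 23), matching expected_entities above.
assert collapse_shift((23, 24), (20, 22)) == (22, 23)
assert collapse_shift((17, 24), (20, 22)) == (17, 23)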
Example #4
0
    def test_3_entity_paragraphs(self):
        sentences = [
            Sentence(0, 5),
            Sentence(5, 10),
        ]
        paragraphs = [
            Paragraph(0, 1),
            Paragraph(1, 2),
        ]
        entities = [
            Entity('_', 0, 1, '1'),
            Entity('_', 1, 2, '1'),
            Entity('_', 5, 6, '2'),
        ]
        doc = Document('test', [], sentences, paragraphs, entities)

        max_distance = 3

        actual_samples = get_samples(doc, max_distance, False)

        expected_samples = [
            (Entity('_', 0, 1, '1'), Entity('_', 1, 2, '1'), None),
            (Entity('_', 0, 1, '1'), Entity('_', 5, 6, '2'), None),
            (Entity('_', 1, 2, '1'), Entity('_', 5, 6, '2'), None),
        ]

        self.assertEqual(expected_samples, actual_samples)
Example #5
0
    def test_entities_with_nesting_collapse(self):
        expected_tokens = [
            "Recurrence", "of", "$Bacteria$", "in", "Sardinia", ".", "From",
            "Oct.", "30", "to", "Nov.", "7", ",", "1979", ",", "10", "people",
            "in", "the", "Sardinian", "province", "of", "Cagliari", "had",
            "onset", "of", "bacteriologically", "confirmed", "$Bacteria$", "."
        ]
        expected_sentences = [Sentence(0, 6), Sentence(6, 30)]

        expected_entities = [
            Entity("T1", 2, 3, "Habitat"),
            Entity("T2", 2, 3, "Bacteria"),
            Entity("T3", 2, 3, "Bacteria"),
            Entity("T4", 4, 5, "Geographical"),
            Entity("T5", 16, 17, "Habitat"),
            Entity("T6", 16, 23, "Habitat"),
            Entity("T7", 19, 21, "Geographical"),
            Entity("T8", 22, 23, "Geographical"),
            Entity("T9", 28, 29, "Bacteria")
        ]

        expected_paragraphs = [Paragraph(0, 1), Paragraph(1, 2)]
        expected_relations = [
            Relation(expected_entities[0], expected_entities[1], "Lives_in"),
            Relation(expected_entities[8], expected_entities[6], "Lives_in")
        ]

        expected_doc = Document("_", expected_tokens, expected_sentences,
                                expected_paragraphs, expected_entities,
                                expected_relations)

        actual_doc = EntitiesCollapser({"Bacteria"}).transform(self.doc)
        self.assertEqual(expected_doc, actual_doc)
Example #6
0
    def test_collapsing_with_ne(self):
        input_doc = self.doc.with_additional_extras({"ne": self.doc.entities})
        input_doc = input_doc.without_relations().without_entities()

        entities = SortedSpansSet([
            Entity("_", 0, 1, "left"),
            Entity("_", 2, 4, "same"),
            Entity("_", 3, 4, "include"),
            Entity("_", 5, 6, "same"),
            Entity("_", 15, 19, "intersect"),
            Entity("_", 17, 20, "include"),
            Entity("_", 22, 25, "intersect")
        ])

        input_doc = input_doc.with_entities(entities)

        expected_tokens = [
            "Recurrence", "of", "$Bacteria$", "in", "$Geographical$", ".",
            "From", "Oct.", "30", "to", "Nov.", "7", ",", "1979", ",", "10",
            "$Habitat$", "had", "onset", "of", "bacteriologically",
            "confirmed", "$Bacteria$", "."
        ]
        expected_sentences = [Sentence(0, 6), Sentence(6, 24)]
        expected_paragraphs = [Paragraph(0, 1), Paragraph(1, 2)]

        expected_nes = SortedSpansSet([
            Entity("T1", 2, 3, "Habitat"),
            Entity("T2", 2, 3, "Bacteria"),
            Entity("T3", 2, 3, "Bacteria"),
            Entity("T4", 4, 5, "Geographical"),
            Entity("T5", 16, 17, "Habitat"),
            Entity("T6", 16, 17, "Habitat"),
            Entity("T7", 16, 17, "Geographical"),
            Entity("T8", 16, 17, "Geographical"),
            Entity("T9", 22, 23, "Bacteria")
        ])

        expected_entities = SortedSpansSet([
            Entity("_", 0, 1, "left"),
            Entity("_", 2, 3, "same"),
            Entity("_", 2, 3, "include"),
            Entity("_", 4, 5, "same"),
            Entity("_", 14, 17, "intersect"),
            Entity("_", 16, 17, "include"),
            Entity("_", 16, 18, "intersect")
        ])

        expected_doc = Document("_",
                                expected_tokens,
                                expected_sentences,
                                expected_paragraphs,
                                expected_entities,
                                extras={"ne": expected_nes})

        actual_doc = EntitiesCollapser({"Habitat", "Bacteria", "Geographical"},
                                       True).transform(input_doc)
        self.assertEqual(expected_doc, actual_doc)
Example #7
0
def convert_from_digger_to_derek(diggerdoc: DiggerDoc,
                                 doc_name: str) -> Document:
    tokens = []
    token_features = {
        "pos": [],
        "dt_labels": [],
        "dt_head_distances": [],
        "lemmas": [],
        "feats": []
    }

    for i, token in enumerate(diggerdoc.tokens):
        tokens.append(token.doc_text)
        token_features["pos"].append(token.pos.upos)
        token_features["dt_labels"].append(token.deprel)
        token_features["dt_head_distances"].append(
            token.head_index - i if token.head_index != -1 else 0)
        token_features["lemmas"].append(token.lemma)
        token_features["feats"].append(token.pos.feats)

    sentences = list(
        Sentence(sent.start, sent.end)
        for sent in diggerdoc.sentences_boundaries)
    # here we assume all the doc's sentences to be in one paragraph
    paragraphs = [Paragraph(0, len(sentences))]

    return Document(doc_name,
                    tokens,
                    sentences,
                    paragraphs,
                    token_features=token_features)
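
The loop above encodes each token's syntactic head as a relative offset: dt_head_distances[i] = head_index - i, with 0 reserved for the root (head_index == -1 in the digger representation). Recovering absolute head indices is therefore a one-liner; decode_head_indices below is a hypothetical helper shown only to illustrate the convention:

def decode_head_indices(dt_head_distances):
    # distance 0 marks the sentence root (returned as -1, as in the digger
    # representation); otherwise the head sits at token index + distance
    return [-1 if d == 0 else i + d for i, d in enumerate(dt_head_distances)]

# decode_head_indices([1, 0, -1]) -> [1, -1, 1]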
Example #8
0
    def test_2_chains_2_pron(self):
        sentences = [Sentence(0, 10)]
        paragraphs = [Paragraph(0, 1)]
        entities = [
            Entity('_', 0, 1, 'noun'),
            Entity('_', 1, 2, 'pron'),
            Entity('_', 2, 3, 'pron'),
            Entity('_', 3, 4, 'noun'),
            Entity('_', 5, 6, 'noun'),
        ]
        rels = {
            Relation(Entity('_', 0, 1, 'noun'), Entity('_', 2, 3, 'pron'),
                     '1'),
            Relation(Entity('_', 1, 2, 'pron'), Entity('_', 3, 4, 'noun'),
                     '1'),
            Relation(Entity('_', 3, 4, 'noun'), Entity('_', 5, 6, 'noun'),
                     '1'),
        }
        doc = Document('test', [], sentences, paragraphs, entities, rels)

        max_distance = 3

        actual_samples = get_pron_samples(doc, max_distance, True)
        expected_samples = [
            (Entity('_', 0, 1, 'noun'), Entity('_', 1, 2, 'pron'), None),
            (Entity('_', 1, 2, 'pron'), Entity('_', 3, 4, 'noun'), '1'),
            (Entity('_', 0, 1, 'noun'), Entity('_', 2, 3, 'pron'), '1'),
            (Entity('_', 2, 3, 'pron'), Entity('_', 3, 4, 'noun'), None),
        ]
        self.assertEqual(actual_samples, expected_samples)
Example #9
0
    def _get_doc_from_raw_text(self, raw_text, doc_name) -> Document:
        tokens, sentences, raw_tokens = self.segmenter.segment(raw_text)

        # here we assume all text to be one paragraph
        paragraphs = [Paragraph(0, len(sentences))]

        return Document(splitext(doc_name)[0], tokens, sentences, paragraphs,
                        token_features={"char_spans": raw_tokens})
Example #10
0
    def _create_doc(self, doc_raw_tokens: List[List[str]], doc_idx) -> Document:
        tokens, sentences, entities, pos_tags = [], [], [], []

        sent_tokens, sent_pos_tags, sent_entities_labels = [], [], []
        sent_start = 0
        for raw_token in doc_raw_tokens:
            if not raw_token:
                if sent_tokens:
                    tokens.extend(sent_tokens)
                    pos_tags.extend(sent_pos_tags)
                    sentences.append(Sentence(sent_start, sent_start + len(sent_tokens)))
                    sent_start += len(sent_tokens)
                    entities.extend(self._decode_strategy.decode_labels(sentences[-1], sent_entities_labels))
                    sent_tokens, sent_pos_tags, sent_entities_labels = [], [], []
                continue

            token, pos_tag, _, ent_label = raw_token
            sent_tokens.append(token)
            sent_entities_labels.append(ent_label)
            sent_pos_tags.append(pos_tag)

        if sent_tokens:
            tokens.extend(sent_tokens)
            pos_tags.extend(sent_pos_tags)
            sentences.append(Sentence(sent_start, sent_start + len(sent_tokens)))
            entities.extend(self._decode_strategy.decode_labels(sentences[-1], sent_entities_labels))

        return Document(
            str(doc_idx), tokens, sentences, [Paragraph(0, len(sentences))], entities, token_features={"pos": pos_tags})
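
For reference, _create_doc consumes a CoNLL-style token table: each row is a 4-element sequence unpacked as (token, pos_tag, ignored column, entity label), and an empty row closes a sentence. A hedged illustration of that shape (the values, including the label scheme, are made up; the real labels depend on the configured decode strategy):

doc_raw_tokens = [
    ["Recurrence", "NN", "_", "O"],
    ["of", "IN", "_", "O"],
    ["cholera", "NN", "_", "B-Bacteria"],
    [".", ".", "_", "O"],
    [],  # empty row: sentence boundary
    ["It", "PRP", "_", "O"],
    ["recurred", "VBD", "_", "O"],
    [".", ".", "_", "O"],
]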
Example #11
0
    def test_borders_extraction_3(self):
        tokens = ["bacteria", "spotted", ".", ".", "it's", "."]
        sentences = [Sentence(0, 3), Sentence(3, 4), Sentence(4, 6)]
        broken_doc = Document("", tokens, sentences, [Paragraph(0, 2)])

        borders = ["start", "in", "end", "start", "start", "end"]
        self.assertEqual(get_sentence_borders_feature(broken_doc), borders)
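
The expected labels in this test are consistent with a simple rule: a token is labelled "start" if it opens a sentence, "end" if it closes one, and "in" otherwise, with "start" winning for one-token sentences. A small sketch of that rule (a hypothetical re-implementation used only to illustrate the expected output, not the actual get_sentence_borders_feature code):

def sentence_borders(sentences, n_tokens):
    labels = ["in"] * n_tokens
    for sent in sentences:
        labels[sent.end_token - 1] = "end"
        labels[sent.start_token] = "start"  # overwrites "end" for one-token sentences
    return labels

# sentence_borders([Sentence(0, 3), Sentence(3, 4), Sentence(4, 6)], 6)
# -> ["start", "in", "end", "start", "start", "end"]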
Example #12
0
def _merge(raw_tokens: list, sentences: list, raw_paragraphs: list, raw_entities: list, raw_relations: list, *,
           symmetric_types: set = None) -> Tuple[List[Sentence], List[Paragraph], List[Entity], Set[Relation]]:
    """
    :param raw_tokens: list of tuples: (start, end, text)
    :param sentences: list of Sentence objects
    :param raw_paragraphs: list of tuples: (start, end)
    :param raw_entities: list of dicts: {'id', 'type', 'start', 'end'}
    :param raw_relations: list of dicts: {'type', 'first', 'second'}
    """
    paragraphs = []

    cur_par_idx = 0
    par_start = 0

    entities = sorted(align_raw_entities(raw_entities, raw_tokens))
    entities_dict = {ent.id: ent for ent in entities}
    sentences = adjust_sentences(sentences, entities)

    for i, sentence in enumerate(sentences):
        for token in raw_tokens[sentence.start_token: sentence.end_token]:
            if par_start != i + 1 and (_end_of_text(sentences, raw_tokens, sentence, token, i)
                                       or _end_of_paragraph(raw_paragraphs, cur_par_idx, token)):
                paragraphs.append(Paragraph(par_start, i + 1))
                par_start = i + 1
                cur_par_idx += 1

    return sentences, paragraphs, entities, _get_relations(raw_relations, entities_dict, symmetric_types)
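
To make the docstring concrete, a minimal illustration of the raw structures _merge receives (the values are made up, and treating entity and paragraph boundaries as character offsets is an assumption based on align_raw_entities pairing them with the raw token spans):

raw_tokens = [(0, 10, "Recurrence"), (11, 13, "of"), (14, 21, "cholera"),
              (22, 24, "in"), (25, 33, "Sardinia"), (33, 34, ".")]
raw_paragraphs = [(0, 34)]
raw_entities = [{"id": "T1", "type": "Bacteria", "start": 14, "end": 21},
                {"id": "T2", "type": "Geographical", "start": 25, "end": 33}]
raw_relations = [{"type": "Lives_in", "first": "T1", "second": "T2"}]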
Example #13
0
    def read(self, path: str) -> List[Document]:
        all_files = listdir(path)
        file_names = sorted(set(splitext(f)[0] for f in all_files))
        docs = []

        for f in file_names:
            with open(join(path, f"{f}.txt"), "r", encoding="utf-8") as g:
                raw_text = g.read()

            if f"{f}.ann" not in all_files:
                warn(f"Skipping {f}.txt, no {f}.ann file found")
                continue

            with open(join(path, f"{f}.ann"), "r", encoding="utf-8") as g:
                annotations = g.read()

            tokens, sentences, raw_tokens = self.segmenter.segment(raw_text)
            raw_entities, raw_relations = _read_brat_annotations(annotations)
            raw_entities = _expand_spans(raw_entities)
            sentences, _, entities, relations = _merge(raw_tokens, sentences, [], raw_entities, raw_relations)

            if self.collapse_intersecting:
                entities, relations = collapse_intersecting_entities(entities, relations)

            # here we assume the whole doc to be in one paragraph
            doc = Document(f, tokens, sentences, [Paragraph(0, len(sentences))], entities, relations)
            docs.append(doc)

        return docs
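
Each document read here is a <name>.txt / <name>.ann pair in brat standoff format. A minimal hedged example of such a pair (offsets computed by hand for this sentence; how _read_brat_annotations and _expand_spans interpret the lines is not shown here):

doc.txt:
    Recurrence of Pelecypod-associated cholera in Sardinia.

doc.ann (tab-separated, with character offsets into doc.txt):
    T1  Bacteria 35 42  cholera
    T2  Geographical 46 54  Sardinia
    R1  Lives_in Arg1:T1 Arg2:T2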
Example #14
0
    def test_1_entity(self):
        sentences = [Sentence(0, 10)]
        paragraphs = [Paragraph(0, 1)]
        entities = [Entity('_', 0, 1, '1')]
        doc = Document('test', [], sentences, paragraphs, entities)

        max_distance = 3

        actual_samples = get_samples(doc, max_distance, False)
        expected_samples = []

        self.assertEqual(expected_samples, actual_samples)
Example #15
0
    def test_no_entities(self):
        sentences = [Sentence(0, 10)]
        paragraphs = [Paragraph(0, 1)]
        entities = []
        doc = Document('test', [], sentences, paragraphs, entities)

        max_distance = 3

        actual_samples = get_noun_samples(doc, max_distance, False)
        expected_samples = []

        self.assertEqual(actual_samples, expected_samples)
Example #16
0
def _conllu_text(text) -> str:
    tokens, sentences, _ = segmentor.segment(text)
    token_features = processor.get_token_features(tokens, sentences)
    # assume the whole doc is a single paragraph
    paragraphs = [Paragraph(0, len(sentences))]

    doc = Document("",
                   tokens,
                   sentences,
                   paragraphs,
                   token_features=token_features)
    return writer.write_to_str(convert_from_derek_to_digger(doc))
Example #17
0
    def test_2_entity(self):
        sentences = [Sentence(0, 10)]
        paragraphs = [Paragraph(0, 1)]
        entities = [
            Entity('_', 0, 1, 'noun'),
            Entity('_', 1, 2, 'noun'),
        ]
        doc = Document('test', [], sentences, paragraphs, entities)

        max_distance = 3

        actual_samples = get_noun_samples(doc, max_distance, False)

        expected_samples = [(Entity('_', 0, 1,
                                    'noun'), Entity('_', 1, 2, 'noun'), None)]

        self.assertEqual(expected_samples, actual_samples)
Example #18
0
    def test_2_entity_rel(self):
        sentences = [Sentence(0, 10)]
        paragraphs = [Paragraph(0, 1)]
        entities = [
            Entity('_', 0, 1, '1'),
            Entity('_', 1, 2, '1'),
        ]
        rels = {Relation(Entity('_', 0, 1, '1'), Entity('_', 1, 2, '1'), '1')}
        doc = Document('test', [], sentences, paragraphs, entities, rels)

        max_distance = 3

        actual_samples = get_samples(doc, max_distance, True)

        expected_samples = [(Entity('_', 0, 1, '1'), Entity('_', 1, 2,
                                                            '1'), '1')]

        self.assertEqual(expected_samples, actual_samples)
Example #19
0
    def setUp(self):
        doc_tokens = [
            "Human", "and", "tick", "spotted", "fever", "group", "Rickettsia",
            "isolates", "from", "Israel", ":", "a", "genotypic", "analysis",
            "."
        ] + [
            "The", "precise", "mechanisms", "that", "initiate", "bacterial",
            "uptake", "have", "not", "yet", "been", "elucidated", "."
        ]

        doc_sentences = [Sentence(0, 15), Sentence(15, 28)]
        doc_paragraphs = [Paragraph(0, 2)]

        doc_head_distances = [
            3, -1, -2, 0, 2, 1, 1, 7, -1, -1, 4, 2, 1, 1, -11
        ] + [2, 1, 9, 1, -2, 1, -2, 4, 3, 2, 1, 0, -1]

        doc_dt_labels = ["test"] * len(doc_tokens)
        doc_token_features = {
            "dt_head_distances": doc_head_distances,
            "dt_labels": doc_dt_labels
        }

        self.entity_with_one_token_no_root = (6, 7, 0)
        self.entity_with_several_tokens_no_root = (12, 14, 0)
        self.entity_with_one_token_root = (3, 4, 0)
        self.entity_with_several_tokens_root = (22, 27, 1)

        doc_entities = [
            self.entity_with_one_token_no_root,
            self.entity_with_several_tokens_no_root,
            self.entity_with_one_token_root,
            self.entity_with_several_tokens_root
        ]
        doc_entities = [
            Entity("", start, end, "") for start, end, _ in doc_entities
        ]

        self.doc = Document("",
                            doc_tokens,
                            doc_sentences,
                            doc_paragraphs,
                            doc_entities,
                            token_features=doc_token_features)
Example #20
0
    def _get_docs(self, raw_docs: Dict[str, List[dict]],
                  groups: Dict[str, list]) -> Dict[str, Document]:
        docs = {}
        for doc_id, raw_tokens in raw_docs.items():
            tokens = []
            token_features = {}
            sentences = []
            sent_start = 0
            shift2idx = {}

            for i, raw_token in enumerate(raw_tokens):
                tokens.append(raw_token['token'])
                token_features.setdefault('lemma',
                                          []).append(raw_token['lemma'])
                token_features.setdefault('gram', []).append(raw_token['gram'])
                if "speech" in raw_token:
                    token_features.setdefault("speech",
                                              []).append(raw_token['speech'])
                    token_features.setdefault("said",
                                              []).append(raw_token['said'])
                    token_features.setdefault("author_comment", []).append(
                        raw_token['author_comment'])
                    token_features.setdefault("speech_verb", []).append(
                        raw_token['speech_verb'])
                shift2idx[raw_token['shift']] = i

                if raw_token['gram'] == 'SENT':
                    sentences.append(Sentence(sent_start, i + 1))
                    sent_start = i + 1
            if sentences[-1].end_token != len(tokens):
                sentences.append(Sentence(sent_start, len(tokens)))
            entities = self._get_entities(groups, shift2idx, doc_id)
            sentences = adjust_sentences(sentences, entities)

            doc = Document(doc_id,
                           tokens,
                           sentences, [Paragraph(0, len(sentences))],
                           entities,
                           token_features=token_features)
            docs[doc_id] = doc

        return docs
Example #21
0
    def setUp(self) -> None:
        tokens = [
            "I", "will", "do", "my", "homework", "today", ".", "It", "is",
            "very", "hard", "but", "i", "don't", "care", "."
        ]
        sentences = [Sentence(0, 7), Sentence(7, 16)]
        paragraphs = [Paragraph(0, 2)]
        entities = [
            Entity("_", 0, 1, "t1"),
            Entity("_", 3, 5, "t2"),
            Entity("_", 7, 8, "t1"),
            Entity("_", 9, 11, "t2"),
            Entity("_", 10, 11, "t4")
        ]

        self.doc = Document("_", tokens, sentences, paragraphs, entities)
        self.relations = {
            Relation(entities[2], entities[3], "t1"),
            Relation(entities[3], entities[4], "t2")
        }
Example #22
0
    def setUp(self) -> None:
        tokens = [
            "Главный", "тренер", "римского", "«", "Лацио", "»", "Симоне", "Индзаги", "продолжит", "работу", "с",
            "командой", ",", "сообщает", "пресс-служба", "клуба", ".", "Ранее", "сообщалось", ",", "что", "в",
            "услугах", "Индзаги", "заинтересованы", "«", "Милан", "»", "и", "«", "Ювентус", "»", ",", "которые",
            "пребывают", "без", "наставников", "после", "ухода", "Дженнаро", "Гаттузо", "и", "Массимилиано", "Аллегри",
            "."
        ]

        sentences = [Sentence(0, 17), Sentence(17, 45)]
        paragraphs = [Paragraph(0, 1)]
        entities = [
            Entity("T1", 4, 5, "Team"),
            Entity("T2", 6, 8, "PlayerCoach1"),
            Entity("T3", 23, 24, "PlayerCoach2"),
            Entity("T4", 26, 27, "TeamFilter"),
            Entity("T5", 30, 31, "Team"),
            Entity("T6", 39, 41, "Coach"),
            Entity("T7", 42, 44, "Coach")
        ]

        self.doc = Document("_", tokens, sentences, paragraphs, entities)
Example #23
0
    def predict_doc(self, text, raw_entities, need_entities, need_relations):
        """
        :param raw_entities: list of {"id","start","end","type"} dicts
        :return: (raw_entities, raw_relations) where:
          raw_entities is list of {"id","start","end","type"} dicts or None
          raw_relations is list of {"first","second","type"} dicts or None
        """

        if self.ent_clf is None and raw_entities is None and (need_entities or
                                                              need_relations):
            raise BadRequest("Server doesn't support entities recognition")

        if self.rel_clf is None and need_relations:
            raise BadRequest("Server doesn't support relation extraction")

        tokens, sentences, raw_tokens = self.segmenter.segment(text)
        doc = Document("_", tokens, sentences, [Paragraph(0, len(sentences))])
        doc = self.transformer.transform(doc)

        entities = None
        if raw_entities is not None:
            if need_relations:
                entities = align_raw_entities(raw_entities, raw_tokens)
            if not need_entities:
                raw_entities = None
        else:
            if need_entities or need_relations:
                entities = self.ent_clf.predict_doc(doc)
            if need_entities:
                raw_entities = self._to_raw_entities(entities, raw_tokens)

        raw_relations = None
        if need_relations:
            doc = doc.with_entities(entities)
            relations = self.rel_clf.predict_doc(doc)
            raw_relations = self._to_raw_relations(relations)

        return raw_entities, raw_relations
Example #24
0
    def test_2_entity_long(self):
        sentences = [
            Sentence(0, 3),
            Sentence(3, 5),
            Sentence(5, 10),
        ]
        paragraphs = [Paragraph(0, 3)]
        entities = [
            Entity('_', 0, 1, '1'),
            Entity('_', 3, 4, '1'),
            Entity('_', 5, 6, '1'),
        ]
        doc = Document('test', [], sentences, paragraphs, entities)

        max_distance = 1

        actual_samples = get_samples(doc, max_distance, False)

        expected_samples = [(Entity('_', 0, 1, '1'), Entity('_', 3, 4,
                                                            '1'), None),
                            (Entity('_', 3, 4, '1'), Entity('_', 5, 6,
                                                            '1'), None)]

        self.assertEqual(expected_samples, actual_samples)
Example #25
0
    def _read_document(self, directory, name):
        path = join(directory, name)
        tokens, fre_id2token_id, sentences, char_spans = self._get_tokens_and_sentences(path)
        paragraphs = [Paragraph(0, len(sentences))]
        entities = self._get_entities(path, fre_id2token_id)
        return Document(name,
                        tokens,
                        sentences,
                        paragraphs,
                        entities,
                        token_features={"char_spans": char_spans})
Example #26
0
    def setUp(self) -> None:
        self.docs = []

        tokens = [
            "Главный", "тренер", "римского", "«", "Лацио", "»", "Симоне", "Индзаги", "продолжит", "работу", "с",
            "командой", ",", "сообщает", "пресс-служба", "клуба", ".", "Ранее", "сообщалось", ",",  "что", "в",
            "услугах", "Индзаги", "заинтересованы", "«", "Милан", "»", "и", "«", "Ювентус", "»", ",", "которые",
            "пребывают", "без", "наставников", "после", "ухода", "Дженнаро", "Гаттузо", "и", "Массимилиано", "Аллегри",
            "."
        ]

        sentences = [Sentence(0, 17), Sentence(17, 45)]
        paragraphs = [Paragraph(0, 1)]
        entities = [
            Entity("T1", 4, 5, "Team"),
            Entity("T2", 6, 8, "Coach"),
            Entity("T3", 23, 24, "Coach"),
            Entity("T4", 26, 27, "Team"),
            Entity("T5", 30, 31, "Team"),
            Entity("T6", 39, 41, "Coach"),
            Entity("T7", 42, 44, "Coach")
        ]
        named_entities = [
            Entity("generated", 3, 6, "ORG"),
            Entity("generated", 6, 8, "PER"),
            Entity("generated", 23, 24, "PER"),
            Entity("generated", 25, 28, "ORG"),
            Entity("generated", 29, 32, "ORG"),
            Entity("generated", 39, 41, "PER"),
            Entity("generated", 42, 44, "PER")
        ]

        doc = Document("_", tokens, sentences, paragraphs, entities, extras={"ne": SortedSpansSet(named_entities)})
        self.docs.append(doc)

        tokens = [
            "Врачи", "сборной", "Бразилии", "подтвердили", "травму", "нападающего", "«", "Пари", "Сен-Жермен", "»",
            "Неймара", ",", "полученную", "во", "время", "товарищеского", "матча", "с", "Катаром", "."
        ]

        sentences = [Sentence(0, 20)]
        paragraphs = [Paragraph(0, 1)]
        entities = [
            Entity("T1", 1, 3, "Team"),
            Entity("T2", 7, 9, "Team"),
            Entity("T3", 10, 11, "Player"),
            Entity("T4", 18, 19, "Team")
        ]
        named_entities = [
            Entity("generated", 1, 3, "ORG"),
            Entity("generated", 6, 10, "ORG"),
            Entity("generated", 10, 11, "PER"),
            Entity("generated", 18, 19, "ORG")
        ]

        doc = Document("_", tokens, sentences, paragraphs, entities, extras={"ne": SortedSpansSet(named_entities)})
        self.docs.append(doc)

        self.common_props = {
            "seed": 1,
            "internal_emb_size": 10,
            "learning_rate": 0.005,
            "batcher": {
                "batch_size": 4,
            },
            "encoding_size": 1,
            "dropout": 0.5,
            "optimizer": "adam",
            "epoch": 2,
            "clip_norm": 5
        }

        self.docs_no_entities = [d.without_entities() for d in self.docs]
Example #27
0
    def setUp(self) -> None:
        self.docs = [
            Document('1',
                     ['Во', 'время', 'своих', 'прогулок', 'в', 'окрестностях', 'Симеиза', 'я', 'обратил', 'внимание',
                      'на', 'одинокую', 'дачу', ',', 'стоявшую', 'на', 'крутом', 'склоне', 'горы', '.',
                      'К', 'этой', 'даче', 'не', 'было', 'проведено', 'даже', 'дороги', '.',
                      'Кругом', 'она', 'была', 'обнесена', 'высоким', 'забором', ',', 'с', 'единственной', 'низкой',
                      'калиткой', ',', 'которая', 'всегда', 'была', 'плотно', 'прикрыта', '.'],
                     [Sentence(0, 20), Sentence(20, 29), Sentence(29, 47)],
                     [Paragraph(0, 3)],
                     [Entity('1', 2, 3, 'pron'),
                      Entity('1', 7, 8, 'pron'),
                      Entity('1', 11, 13, 'noun'),
                      Entity('1', 21, 23, 'noun'),
                      Entity('1', 30, 31, 'pron'),
                      Entity('1', 33, 35, 'noun'),
                      Entity('1', 37, 38, 'noun'),
                      Entity('1', 37, 40, 'noun'),
                      Entity('1', 41, 42, 'pron')],
                     {
                         Relation(Entity('1', 2, 3, 'pron'), Entity('1', 7, 8, 'pron'), 'COREF'),
                         Relation(Entity('1', 11, 13, 'noun'), Entity('1', 21, 23, 'noun'), 'COREF'),
                         Relation(Entity('1', 11, 13, 'noun'), Entity('1', 30, 31, 'pron'), 'COREF'),
                         Relation(Entity('1', 21, 23, 'noun'), Entity('1', 30, 31, 'pron'), 'COREF'),
                         Relation(Entity('1', 37, 40, 'noun'), Entity('1', 41, 42, 'pron'), 'COREF'),
                     },
                     {
                         'pos': ['ADP', 'NOUN', 'DET', 'NOUN', 'ADP', 'NOUN', 'PROPN', 'PRON', 'VERB', 'NOUN', 'ADP',
                                 'ADJ', 'NOUN',
                                 'PUNCT', 'VERB', 'ADP', 'ADJ', 'NOUN', 'NOUN', 'PUNCT', 'ADP', 'DET', 'NOUN', 'PART',
                                 'AUX', 'VERB',
                                 'PART', 'NOUN', 'PUNCT', 'ADV', 'PRON', 'AUX', 'VERB', 'ADJ', 'NOUN', 'PUNCT', 'ADP',
                                 'ADJ', 'ADJ',
                                 'NOUN', 'PUNCT', 'PRON', 'ADV', 'AUX', 'ADV', 'VERB', 'PUNCT'],
                         'dt_labels': ['case', 'fixed', 'amod', 'obl', 'case', 'nmod', 'nmod', 'nsubj', 'root', 'obj',
                                       'case',
                                       'amod', 'nmod', 'punct', 'amod', 'case', 'amod', 'obl', 'nmod', 'punct', 'case',
                                       'amod',
                                       'obl', 'advmod', 'aux:pass', 'root', 'advmod', 'nsubj', 'punct', 'advmod',
                                       'nsubj',
                                       'aux:pass', 'root', 'amod', 'obl', 'punct', 'case', 'amod', 'amod', 'conj',
                                       'punct', 'nsubj',
                                       'advmod', 'aux:pass', 'advmod', 'acl:relcl', 'punct'],
                         'dt_head_distances': [3, -1, 1, 5, 1, -2, -1, 1, 0, -1, 2, 1, -3, -1, -2, 2, 1, -3, -1, -1, 2,
                                               1, 3, 2, 1,
                                               0, 1, -2, -1, 3, 2, 1, 0, 1, -2, -1, 3, 2, 1, -5, -1, 4, 3, 2, 1, -11,
                                               -1],
                         'lemmas': ['во', 'время', 'свой', 'прогулка', 'в', 'окрестность', 'Симеиза', 'я', 'обращать',
                                    'внимание',
                                    'на', 'одинокий', 'дача', ',', 'стоять', 'на', 'крутой', 'склон', 'гора', '.', 'к',
                                    'этот',
                                    'дача', 'не', 'быть', 'проводить', 'даже', 'дорога', '.', 'кругом', 'она', 'быть',
                                    'обнесен',
                                    'высокий', 'забор', ',', 'с', 'единственный', 'низкий', 'калитка', ',', 'который',
                                    'всегда',
                                    'быть', 'плотно', 'прикрывать', '.'],
                         'feats': [{}, {'Case': 'Accusative', 'Animacy': 'Inanimated', 'Number': 'Singular',
                                        'Gender': 'Neuter'},
                                   {'Number': 'Plural', 'Pronoun': 'REFLEXIVE', 'Case': 'Genitive'},
                                   {'Case': 'Genitive', 'Animacy': 'Inanimated', 'Number': 'Plural',
                                    'Gender': 'Neuter'}, {},
                                   {'Case': 'Prepositional', 'Animacy': 'Inanimated', 'Number': 'Plural',
                                    'Gender': 'Masculine'},
                                   {'Case': 'Genitive', 'Animacy': 'Inanimated', 'Number': 'Singular',
                                    'Gender': 'Masculine'},
                                   {'Animacy': 'Animated', 'Gender': 'Masculine', 'Number': 'Singular',
                                    'Pronoun': 'DEICTIC',
                                    'Case': 'Nominative'},
                                   {'Number': 'Singular', 'Gender': 'Masculine', 'Tense': 'Past', 'Mode': 'Indicative'},
                                   {'Case': 'Accusative', 'Animacy': 'Inanimated', 'Number': 'Singular',
                                    'Gender': 'Neuter'}, {},
                                   {'Case': 'Accusative', 'Representation': 'Participle', 'Number': 'Singular',
                                    'Gender': 'Feminine',
                                    'Tense': 'NotPast'},
                                   {'Case': 'Accusative', 'Animacy': 'Inanimated', 'Number': 'Singular',
                                    'Gender': 'Feminine'}, {},
                                   {'Case': 'Accusative', 'Representation': 'Participle', 'Number': 'Singular',
                                    'Gender': 'Feminine',
                                    'Tense': 'Past'}, {},
                                   {'Case': 'Prepositional', 'Number': 'Singular', 'Gender': 'Masculine'},
                                   {'Case': 'Prepositional', 'Animacy': 'Inanimated', 'Number': 'Singular',
                                    'Gender': 'Masculine'},
                                   {'Case': 'Genitive', 'Animacy': 'Inanimated', 'Number': 'Singular',
                                    'Gender': 'Feminine'}, {}, {},
                                   {'Case': 'Dative', 'Number': 'Singular', 'Gender': 'Feminine'},
                                   {'Case': 'Dative', 'Animacy': 'Inanimated', 'Number': 'Singular',
                                    'Gender': 'Feminine'}, {},
                                   {'Number': 'Singular', 'Gender': 'Neuter', 'Tense': 'Past', 'Mode': 'Indicative'},
                                   {'Representation': 'Participle', 'Number': 'Singular', 'Gender': 'Neuter',
                                    'Shortness': 'Short',
                                    'Tense': 'Past', 'Voice': 'Passive'}, {},
                                   {'Case': 'Genitive', 'Animacy': 'Inanimated', 'Number': 'Singular',
                                    'Gender': 'Feminine'}, {}, {},
                                   {'Animacy': 'Animated', 'Gender': 'Feminine', 'Number': 'Singular',
                                    'Pronoun': 'PERSONAL',
                                    'Case': 'Nominative'},
                                   {'Number': 'Singular', 'Gender': 'Feminine', 'Tense': 'Past', 'Mode': 'Indicative'},
                                   {'Representation': 'Participle', 'Number': 'Singular', 'Gender': 'Feminine',
                                    'Shortness': 'Short',
                                    'Tense': 'Past', 'Voice': 'Passive'},
                                   {'Case': 'Instrumental', 'Number': 'Singular', 'Gender': 'Masculine'},
                                   {'Case': 'Instrumental', 'Animacy': 'Inanimated', 'Number': 'Singular',
                                    'Gender': 'Masculine'},
                                   {}, {}, {'Case': 'Instrumental', 'Number': 'Singular', 'Gender': 'Feminine'},
                                   {'Case': 'Instrumental', 'Number': 'Singular', 'Gender': 'Feminine'},
                                   {'Case': 'Instrumental', 'Animacy': 'Inanimated', 'Number': 'Singular',
                                    'Gender': 'Feminine'}, {},
                                   {'Case': 'Nominative', 'Number': 'Singular', 'Gender': 'Feminine'}, {},
                                   {'Number': 'Singular', 'Gender': 'Feminine', 'Tense': 'Past', 'Mode': 'Indicative'},
                                   {},
                                   {'Representation': 'Participle', 'Number': 'Singular', 'Gender': 'Feminine',
                                    'Shortness': 'Short',
                                    'Tense': 'Past', 'Voice': 'Passive'}, {}],
                         'said': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
                                  'O', 'O', 'O',
                                  'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
                                  'O', 'O', 'O',
                                  'O', 'O', 'O', 'O', 'O', 'O', 'O'],
                     },
                     {
                         'ne': SortedSpansSet([Entity('1', 6, 7, 'GPE_CITY')])
                     }
                     ),
            Document('1',
                     ['Когда', 'мы', 'шли', 'по', 'тропинке', ',', 'каждый', 'был', 'доволен', 'и', 'думал', ',', 'что',
                      'надул', 'другого', '.',
                      'Петька', 'изредка', 'посапывал', 'носом', '.',
                      'Давно', 'он', 'зарился', 'на', 'моих', 'голубей', ',', 'еще', 'с', 'прошлой', 'зимы', ',', 'а',
                      'теперь', 'вот', 'счастье', 'неожиданно', 'привалило', '.',
                      'А', 'у', 'меня', 'будет', 'пистолет', '.'],
                     [Sentence(0, 16), Sentence(16, 21), Sentence(21, 40), Sentence(40, 46)],
                     [Paragraph(0, 3)],
                     [
                         Entity('1', 1, 2, 'pron'),
                         Entity('1', 16, 17, 'noun'),
                         Entity('1', 22, 23, 'pron'),
                         Entity('1', 25, 26, 'pron'),
                         Entity('1', 25, 27, 'noun'),
                         Entity('1', 42, 43, 'pron'),
                         Entity('1', 44, 45, 'noun'),
                     ],
                     {
                         Relation(Entity('1', 16, 17, 'noun'), Entity('1', 22, 23, 'pron'), 'COREF'),
                         Relation(Entity('1', 25, 26, 'pron'), Entity('1', 42, 43, 'pron'), 'COREF'),
                     },
                     {
                         'pos': ['SCONJ', 'PRON', 'VERB', 'ADP', 'NOUN', 'PUNCT', 'ADJ', 'AUX', 'ADJ', 'CCONJ', 'VERB',
                                 'PUNCT',
                                 'SCONJ', 'VERB', 'ADJ', 'PUNCT', 'NOUN', 'ADV', 'VERB', 'NOUN', 'PUNCT', 'ADV', 'PRON',
                                 'VERB',
                                 'ADP', 'DET', 'NOUN', 'PUNCT', 'ADV', 'ADP', 'NOUN', 'NOUN', 'PUNCT', 'CCONJ', 'ADV',
                                 'PART',
                                 'NOUN', 'ADV', 'VERB', 'PUNCT', 'CCONJ', 'ADP', 'PRON', 'VERB', 'NOUN', 'PUNCT'],
                         'dt_labels': ['mark', 'nsubj', 'advcl', 'case', 'obl', 'punct', 'nsubj', 'cop', 'root', 'cc',
                                       'conj',
                                       'punct', 'mark', 'advcl', 'obj', 'punct', 'nsubj', 'advmod', 'root', 'obl',
                                       'punct', 'advmod',
                                       'nsubj', 'root', 'case', 'amod', 'obl', 'punct', 'advmod', 'case', 'obl', 'nmod',
                                       'punct',
                                       'cc', 'advmod', 'advmod', 'nsubj', 'advmod', 'conj', 'punct', 'cc', 'case',
                                       'root', 'cop',
                                       'nsubj', 'punct'],
                         'dt_head_distances': [8, 1, 6, 1, -2, -1, 2, 1, 0, 1, -2, -1, -2, -3, -1, -1, 2, 1, 0, -1, -1,
                                               2, 1, 0, 2,
                                               1, -3, -1, 2, 1, -7, -1, -1, 5, 4, 1, 2, 1, -15, -1, 2, 1, 0, -1, -2,
                                               -1],
                         'lemmas': ['когда', 'мы', 'идти', 'по', 'тропинка', ',', 'каждый', 'быть', 'довольный', 'и',
                                    'думать', ',',
                                    'что', 'надуть', 'другой', '.', 'Петька', 'изредка', 'посапывать', 'нос', '.',
                                    'давно', 'он',
                                    'зариться', 'на', 'мой', 'голубь', ',', 'еще', 'с', 'прошлый', 'зима', ',', 'а',
                                    'теперь', 'вот',
                                    'счастье', 'неожиданно', 'приваливать', '.', 'а', 'у', 'я', 'быть', 'пистолет',
                                    '.'],
                         'feats': [{}, {'Animacy': 'Animated', 'Number': 'Plural', 'Pronoun': 'DEICTIC',
                                        'Case': 'Nominative'},
                                   {'Number': 'Plural', 'Tense': 'Past', 'Mode': 'Indicative'}, {},
                                   {'Case': 'Dative', 'Animacy': 'Inanimated', 'Number': 'Singular',
                                    'Gender': 'Feminine'}, {},
                                   {'Case': 'Nominative', 'Number': 'Singular', 'Gender': 'Masculine'},
                                   {'Number': 'Singular', 'Gender': 'Masculine', 'Tense': 'Past', 'Mode': 'Indicative'},
                                   {'Number': 'Singular', 'Gender': 'Masculine', 'Shortness': 'Short'}, {},
                                   {'Number': 'Singular', 'Gender': 'Masculine', 'Tense': 'Past', 'Mode': 'Indicative'},
                                   {}, {},
                                   {'Number': 'Singular', 'Gender': 'Masculine', 'Tense': 'Past', 'Mode': 'Indicative'},
                                   {'Case': 'Accusative', 'Animacy': 'Animated', 'Number': 'Singular',
                                    'Gender': 'Masculine'}, {},
                                   {'Case': 'Nominative', 'Animacy': 'Animated', 'Number': 'Singular',
                                    'Gender': 'Masculine'}, {},
                                   {'Number': 'Singular', 'Gender': 'Masculine', 'Tense': 'Past', 'Mode': 'Indicative'},
                                   {'Case': 'Instrumental', 'Animacy': 'Inanimated', 'Number': 'Singular',
                                    'Gender': 'Masculine'},
                                   {}, {},
                                   {'Animacy': 'Animated', 'Gender': 'Masculine', 'Number': 'Singular',
                                    'Pronoun': 'PERSONAL',
                                    'Case': 'Nominative'},
                                   {'Number': 'Singular', 'Gender': 'Masculine', 'Tense': 'Past', 'Mode': 'Indicative'},
                                   {},
                                   {'Animacy': 'Animated', 'Number': 'Plural', 'Pronoun': 'POSSESSIVE',
                                    'Case': 'Accusative'},
                                   {'Case': 'Accusative', 'Animacy': 'Animated', 'Number': 'Plural',
                                    'Gender': 'Masculine'}, {}, {},
                                   {}, {'Case': 'Genitive', 'Number': 'Singular', 'Gender': 'Feminine'},
                                   {'Case': 'Genitive', 'Animacy': 'Inanimated', 'Number': 'Singular',
                                    'Gender': 'Feminine'}, {}, {},
                                   {}, {}, {'Case': 'Nominative', 'Animacy': 'Inanimated', 'Number': 'Singular',
                                            'Gender': 'Neuter'},
                                   {},
                                   {'Number': 'Singular', 'Gender': 'Neuter', 'Tense': 'Past', 'Mode': 'Indicative'},
                                   {}, {}, {},
                                   {'Animacy': 'Animated', 'Gender': 'Masculine', 'Number': 'Singular',
                                    'Pronoun': 'DEICTIC',
                                    'Case': 'Genitive'},
                                   {'Person': 'Third', 'Number': 'Singular', 'Tense': 'NotPast', 'Mode': 'Indicative'},
                                   {'Case': 'Nominative', 'Animacy': 'Inanimated', 'Number': 'Singular',
                                    'Gender': 'Masculine'}, {}],
                         'said': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
                                  'O', 'O', 'O',
                                  'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O',
                                  'O', 'O', 'O',
                                  'O', 'O', 'O', 'O', 'O', 'O'],
                     },
                     {
                         'ne': SortedSpansSet([Entity('1', 16, 17, 'PERSON')])
                     }
                     )
        ]
        # empty sets are "known" rels
        self.hook = get_hook([doc.without_relations().with_relations(set()) for doc in self.docs])
        self.base_props = {
            "seed": 12345,

            "distance": 10,
            "max_distance": 10,
            "loss": "cross_entropy",
            "optimizer": "momentum",
            "lr_decay": 0.05,
            "momentum": 0.9,
            "dropout": 0.5,
            "internal_size": 10,
            "epoch": 1,
            "batch_size": 64,
            "learning_rate": 0.1,

            "clip_norm": 5,

            "max_candidate_distance": 50,
            "max_entity_distance": 50,
            "max_word_distance": 50,
            "max_sent_distance": 10,
            "max_dt_distance": 10,
            "dist_size": 50,

            "pos_emb_size": 0,
            "morph_feats_emb_size": 0,
            "entities_types_size": 20,

            "morph_feats_size": 0,
            "morph_feats_list": ["Gender", "Animacy", "Number"],

            "encoding_type": "lstm",
            "entity_encoding_size": 10,
            "encoding_size": 10,
            "classifiers": ["exact_match", "intersecting_mentions"],
            "use_filter": False,

            "max_sent_entities_distance": 10,
            "max_token_entities_distance": 20,

            "agreement_types": ["Gender", "Animacy", "Number"],
            "classifier_agreement_size": 0,

            "head_str_match_size": 0,
            "partial_str_match_size": 0,
            "ordered_partial_str_match_size": 0,

            "mention_interrelation_size": 0,
            "mention_distance_size": 0,
            "max_mention_distance": 50,
            "classifier_entity_distance_size": 0,
            "entities_types_in_classifier_size": 0,
            "head_ne_types_size": 0,
            "entities_token_distance_in_classifier_size": 0,
            "entities_sent_distance_in_classifier_size": 0,

            "encoder_entity_types_size": 0,
            "encoder_entity_ne_size": 0,

            "speech_types": ["said"],
            "speech_size": 0,

            "entity_encoding_type": "rnn",

            "classification_dense_size": 20,
        }
        self.experiment_props = {
            "sampling_strategy": ["coref_noun", "coref_pron_cluster", 'coref_pron_cluster_strict', 'coref_pron']
        }
Example #28
0
    def setUp(self) -> None:
        self.docs = []

        # BB-event-4329237
        tokens = [
            "The", "in", "vitro", "assay", "of", "tuberculin",
            "hypersensitivity", "in", "Macaca", "mulatta", "sensitized",
            "with", "bacille", "Calmette", "Guerin", "cell", "wall", "vaccine",
            "and-or", "infected", "with", "virulent", "Mycobacterium",
            "tuberculosis", "."
        ]
        sentences = [Sentence(0, 25)]
        paragraphs = [Paragraph(0, 1)]
        entities = [
            Entity("T2", 8, 18, "Habitat"),
            Entity("T3", 8, 24, "Habitat"),
            Entity("T4", 12, 18, "Habitat"),
            Entity("T5", 12, 15, "Bacteria"),
            Entity("T6", 22, 24, "Bacteria")
        ]
        relations = {Relation(entities[4], entities[1], "Lives_In")}

        # token features generated by UDPipe
        pos = [
            'DET', 'ADP', 'NOUN', 'NOUN', 'ADP', 'NOUN', 'NOUN', 'ADP',
            'PROPN', 'PROPN', 'VERB', 'ADP', 'PROPN', 'PROPN', 'PROPN', 'NOUN',
            'NOUN', 'NUM', 'NOUN', 'VERB', 'ADP', 'ADJ', 'PROPN', 'NOUN',
            'PUNCT'
        ]

        dt_labels = [
            'det', 'case', 'compound', 'nsubj', 'case', 'compound', 'nmod',
            'case', 'compound', 'nmod', 'root', 'case', 'compound', 'flat',
            'compound', 'compound', 'obl', 'nummod', 'appos', 'acl', 'case',
            'amod', 'compound', 'obl', 'punct'
        ]

        dt_head_distances = [
            3, 2, 1, 7, 2, 1, -3, 2, 1, -6, 0, 5, 2, -1, 2, 1, -6, 1, -2, -1,
            3, 2, 1, -4, -14
        ]

        token_features = {
            "pos": pos,
            "dt_labels": dt_labels,
            "dt_head_distances": dt_head_distances
        }
        self.docs.append(
            Document("_", tokens, sentences, paragraphs, entities, relations,
                     token_features))

        # BB-event-9564489
        tokens = [
            'Gingivomandibular', 'infection', 'due', 'to', 'Mycobacterium',
            'kansasii', 'in', 'a', 'patient', 'with', 'AIDS', '.'
        ]
        sentences = [Sentence(0, 12)]
        paragraphs = [Paragraph(0, 1)]
        entities = [
            Entity("T2", 0, 1, "Habitat"),
            Entity("T3", 4, 6, "Bacteria"),
            Entity("T4", 8, 11, "Habitat")
        ]
        relations = {
            Relation(entities[1], entities[0], "Lives_In"),
            Relation(entities[1], entities[2], "Lives_In")
        }

        # token features generated by UDPipe
        pos = [
            'ADJ', 'NOUN', 'ADP', 'ADP', 'PROPN', 'PROPN', 'ADP', 'DET',
            'NOUN', 'ADP', 'NOUN', 'PUNCT'
        ]

        dt_labels = [
            'amod', 'root', 'case', 'fixed', 'compound', 'nmod', 'case', 'det',
            'nmod', 'case', 'nmod', 'punct'
        ]

        dt_head_distances = [1, 0, 3, -1, 1, -4, 2, 1, -7, 1, -2, -10]

        token_features = {
            "pos": pos,
            "dt_labels": dt_labels,
            "dt_head_distances": dt_head_distances
        }
        self.docs.append(
            Document("_", tokens, sentences, paragraphs, entities, relations,
                     token_features))
        self.docs_no_rels = [doc.without_relations() for doc in self.docs]
        self.props = {
            "shared": {
                "internal_emb_size": 10,
                "token_position_size": 10,
                "max_word_distance": 20,
                "dt_distance_emb_size": 10,
                "max_dt_distance": 10,
                "dt_depth_emb_size": 10,
                "max_dt_depth": 10,
                "pos_emb_size": 10
            },
            "add_we": "true",
            "add_shared": "true",
            "optimizer": "adam",
            "learning_rate": 0.01,
            "epoch": 2,
            "loss": "cross_entropy",
            "l2": 0.0001,
            "lr_decay": 0.1,
            "dropout": 0.5,
            "clip_norm": 1,
            "max_candidate_distance": 20,
            "batcher": {
                "batch_size": 8
            },
            "token_position_size": 10,
            "max_word_distance": 10,
            "encoding_size": 10,
            "entities_types_emb_size": 20,
            "entities_depth_emb_size": 10,
            'max_entities_depth': 2,
            "specific_encoder_size": 10,
            "aggregation": {
                "attention": {},
                "max_pooling": {},
                "mean_pooling": {},
                "take_spans": {},
                "last_hiddens": {}
            },
            "seed": 100
        }

        # GENIA id=10022435
        tokens = [
            "Glucocorticoid", "resistance", "in", "the", "squirrel", "monkey",
            "is", "associated", "with", "overexpression", "of", "the",
            "immunophilin", "FKBP51", "."
        ]
        sentences = [Sentence(0, 15)]
        paragraphs = [Paragraph(0, 1)]

        pos = [
            "NN", "NN", "IN", "DT", "NN", "NN", "VBZ", "VBN", "IN", "NN", "IN",
            "DT", "NN", "NN", "PERIOD"
        ]

        dt_labels = [
            "compound", "nsubjpass", "case", "det", "compound", "nmod",
            "auxpass", "root", "case", "nmod", "case", "det", "compound",
            "nmod", "dep"
        ]

        dt_head_distances = [1, 6, 3, 2, 1, -4, 1, 0, 1, -2, 3, 2, 1, -4, -7]

        token_features = {
            "pos": pos,
            "dt_labels": dt_labels,
            "dt_head_distances": dt_head_distances
        }
        self.unlabeled_docs = [
            Document("_",
                     tokens,
                     sentences,
                     paragraphs,
                     token_features=token_features)
        ]

        self.sdp_config = {
            "context_encoding_non_linearity_size": 10,
            "loss": "cross_entropy",
            "learning_rate": 0.02,
            "query_dense_size": 10,
            "clip_norm": 1,
            "batcher": {
                "batch_size": 1
            }
        }

        self.parser_config = {
            "context_encoding_non_linearity_size": 10,
            "loss": "cross_entropy",
            "learning_rate": 0.02,
            "clip_norm": 1,
            "batcher": {
                "batch_size": 1
            },
            "add_shared": True,
            "specific_encoder_size": 10,
            "sampling_strategy": "pos_filtering",
            "arc_token_distance_in_classifier_size": 10,
            "arc_token_distance_in_attention_size": 10,
            "max_arc_token_distance": 10,
            "aggregation": {
                "attention": {
                    "type": "luong",
                    "normalise_coefficients": True
                },
                "take_spans": {}
            }
        }
Example #29
0
def _get_lemma(token, transformer):
    doc = Document("", [token], [Sentence(0, 1)], [Paragraph(0, 1)])
    featured_doc = transformer.transform(doc)
    return featured_doc.token_features['lemmas'][0]
Example #30
0
def make_document_from_json_file(file_path):
    d = load_json_file_as_dict(file_path)

    tokens = d.get('tokens', [])
    entities = d.get('entities', [])
    sentences = d.get('sentences', [])
    paragraphs = d.get('paragraphs', [])
    token_features = {}

    for feature in [
            'pos', 'entities_types', 'entities_depths', 'borders', 'dt_labels',
            'dt_head_distances', 'dt_depths', 'dt_deltas_forward',
            'dt_deltas_backward', 'dt_breakups_forward', 'dt_breakups_backward'
    ]:
        if feature in d:
            token_features[feature] = d[feature]

    relations = d.get('relations', [])

    doc_entities = []
    for ent in entities:
        id_, start_token, end_token, ent_type = tuple(ent)
        doc_entities.append(Entity(id_, start_token, end_token, ent_type))

    doc_sentences = []

    for sent in sentences:
        start_token, end_token = tuple(sent)
        doc_sentences.append(Sentence(start_token, end_token))

    doc_paragraphs = []

    for par in paragraphs:
        start_sentence, end_sentence = tuple(par)
        doc_paragraphs.append(Paragraph(start_sentence, end_sentence))

    doc_relations = []

    for rel in relations:
        e1 = None
        e2 = None
        e1_id, e2_id, rel_type = tuple(rel)

        for entity in doc_entities:
            if entity.id == e1_id:
                e1 = entity
            if entity.id == e2_id:
                e2 = entity

            if e1 is not None and e2 is not None:
                break

        doc_relations.append(Relation(e1, e2, rel_type))

    doc = Document("",
                   tokens,
                   doc_sentences,
                   doc_paragraphs,
                   token_features=token_features)
    if 'entities' in d:
        doc = doc.with_entities(doc_entities)
    if 'relations' in d:
        doc = doc.with_relations(doc_relations)
    return doc
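
For reference, the JSON layout this function expects mirrors the parsing above; a minimal hedged example (the field names come straight from the code, the values are illustrative):

{
    "tokens": ["Recurrence", "of", "cholera", "in", "Sardinia", "."],
    "sentences": [[0, 6]],
    "paragraphs": [[0, 1]],
    "entities": [["T1", 2, 3, "Bacteria"], ["T2", 4, 5, "Geographical"]],
    "relations": [["T1", "T2", "Lives_in"]],
    "pos": ["NN", "IN", "NN", "IN", "NNP", "."]
}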