Example #1
0
    def test_different_entities(self):
        # Token character spans as (start, end); there are gaps between some
        # tokens (4-5, 12-14, 17-18) and none between (5, 7) and (7, 12).
        raw_tokens = [(0, 4), (5, 7), (7, 12), (14, 17), (18, 20)]

        # Raw entity character spans, in the order they are fed to the aligner.
        char_spans = [
            (0, 4),
            (0, 5),
            (0, 6),
            (0, 7),
            (0, 8),
            (4, 7),
            (4, 8),
            (4, 12),
            (14, 20),
        ]
        # Token-index spans expected for the raw spans above, position by
        # position: boundaries that fall inside a token or in a gap are
        # expected to resolve to whole tokens.
        token_spans = [
            (0, 1),
            (0, 1),
            (0, 2),
            (0, 2),
            (0, 3),
            (1, 2),
            (1, 3),
            (1, 3),
            (3, 5),
        ]

        raw_entities = [self._r_e(begin, end) for begin, end in char_spans]
        expected = [self._ent(first, last) for first, last in token_spans]

        aligned = align_raw_entities(raw_entities, raw_tokens)
        self.assertEqual(expected, aligned)
Example #2
0
    def test_inner_entities(self):
        # Raw spans whose boundaries fall strictly inside tokens (6 is inside
        # (5, 7); 10 and 16 are inside (7, 12) and (14, 17)) are expected to
        # come back as whole-token index spans.
        raw_tokens = [(0, 4), (5, 7), (7, 12), (14, 17)]

        inside_second_token = self._r_e(6, 7)
        across_two_tokens = self._r_e(10, 16)
        expected = [self._ent(1, 2), self._ent(2, 4)]

        aligned = align_raw_entities(
            [inside_second_token, across_two_tokens], raw_tokens)
        self.assertEqual(expected, aligned)
Example #3
0
    def test_normal_alignment(self):
        # Raw spans that coincide exactly with token spans: (5, 7) is the
        # token at index 1 and (14, 17) is the token at index 3.
        raw_tokens = [(0, 4), (5, 7), (7, 12), (14, 17), (18, 20)]

        second_token_entity = self._r_e(5, 7)
        fourth_token_entity = self._r_e(14, 17)
        expected = [self._ent(1, 2), self._ent(3, 4)]

        aligned = align_raw_entities(
            [second_token_entity, fourth_token_entity], raw_tokens)
        self.assertEqual(expected, aligned)
Example #4
0
def _merge(raw_tokens: list, sentences: list, raw_paragraphs: list, raw_entities: list, raw_relations: list, *,
           symmetric_types: set = None) -> Tuple[List[Sentence], List[Paragraph], List[Entity], Set[Relation]]:
    """
    Combine raw annotations into aligned sentences, paragraphs, entities and relations.

    :param raw_tokens: list of tuples: (start, end, text)
    :param sentences: list of Sentence objects
    :param raw_paragraphs: list of tuples: (start, end)
    :param raw_entities: list of dicts: {'id', 'type', 'start', 'end'}
    :param raw_relations: list of dicts: {'type', 'first', 'second'}
    :param symmetric_types: passed through unchanged to ``_get_relations``
    :return: tuple of (sentences returned by ``adjust_sentences``,
        paragraphs expressed as sentence-index ranges, sorted aligned entities,
        result of ``_get_relations``)
    """
    # Entities are aligned to tokens first, because sentence adjustment and
    # relation building both consume the aligned entities.
    entities = align_raw_entities(raw_entities, raw_tokens)
    entities = sorted(entities)
    entities_dict = {entity.id: entity for entity in entities}
    sentences = adjust_sentences(sentences, entities)

    paragraphs = []
    par_start = 0      # index of the first sentence of the paragraph being built
    cur_par_idx = 0    # position in raw_paragraphs currently being matched

    for sent_idx, sentence in enumerate(sentences):
        next_sent_idx = sent_idx + 1
        for token in raw_tokens[sentence.start_token: sentence.end_token]:
            # A paragraph has already been closed at this sentence: the
            # boundary helpers must not be consulted again for it.
            if par_start == next_sent_idx:
                continue

            boundary_reached = (
                _end_of_text(sentences, raw_tokens, sentence, token, sent_idx)
                or _end_of_paragraph(raw_paragraphs, cur_par_idx, token)
            )
            if not boundary_reached:
                continue

            paragraphs.append(Paragraph(par_start, next_sent_idx))
            par_start = next_sent_idx
            cur_par_idx += 1

    relations = _get_relations(raw_relations, entities_dict, symmetric_types)
    return sentences, paragraphs, entities, relations
Example #5
0
    def get_extras(self, tokens, sentences):
        """
        Run the named-entity API over the document and return aligned entities.

        The sentences are joined by ``_get_space_joined_sentences`` and sent to
        ``self.api.named_entities`` with ``language=self.lang``. For every
        returned item, ``ne[0]`` and ``ne[1]`` are shifted by the start offset
        of the sentence it belongs to and ``ne[-1]`` is used as the type.

        :return: dict with a single key ``'ne'`` holding a ``SortedSpansSet``
            of the aligned entities (post-processed by ``__remove_quotes``
            when ``self.remove_quotes`` is truthy)
        """
        sents, sent_starts, raw_tokens = _get_space_joined_sentences(
            tokens, sentences)
        ne_doc = list(self.api.named_entities(sents, language=self.lang))

        # Flatten to (sentence offset, entity) pairs so that ids can be
        # assigned from one running counter across all sentences.
        offset_ne_pairs = (
            (offset, ne)
            for offset, ne_sent in zip(sent_starts, ne_doc)
            for ne in ne_sent
        )
        raw_entities = [
            {
                'id': str(number),
                'type': ne[-1],
                'start': offset + ne[0],
                'end': offset + ne[1]
            }
            for number, (offset, ne) in enumerate(offset_ne_pairs)
        ]

        aligned = align_raw_entities(raw_entities, raw_tokens)
        if self.remove_quotes:
            aligned = self.__remove_quotes(tokens, aligned)

        return {'ne': SortedSpansSet(aligned)}
Example #6
0
    def predict_doc(self, text, raw_entities, need_entities, need_relations):
        """
        Segment ``text`` and produce the requested entities and/or relations.

        :param text: text handed to ``self.segmenter.segment``
        :param raw_entities: list of {"id","start","end","type"} dicts, or
          None when entities have to be predicted by ``self.ent_clf``
        :param need_entities: whether entities should be part of the result
        :param need_relations: whether relations should be part of the result
        :return: (raw_entities, raw_relations) where:
          raw_entities is list of {"id","start","end","type"} dicts or None
          raw_relations is list of {"first","second","type"} dicts or None
        :raises BadRequest: if entities are required but neither supplied nor
          predictable (no ``ent_clf``), or if relations are requested while
          there is no ``rel_clf``
        """
        no_entity_source = self.ent_clf is None and raw_entities is None
        if no_entity_source and (need_entities or need_relations):
            raise BadRequest("Server doesn't support entities recognition")

        if self.rel_clf is None and need_relations:
            raise BadRequest("Server doesn't support relation extraction")

        tokens, sentences, raw_tokens = self.segmenter.segment(text)
        # The whole text is wrapped into a single paragraph.
        whole_text = Paragraph(0, len(sentences))
        doc = self.transformer.transform(
            Document("_", tokens, sentences, [whole_text]))

        if raw_entities is None:
            # Nothing supplied by the caller: predict, but only if some
            # requested output actually depends on entities.
            entities = (self.ent_clf.predict_doc(doc)
                        if need_entities or need_relations else None)
            if need_entities:
                raw_entities = self._to_raw_entities(entities, raw_tokens)
        else:
            # Supplied entities need token alignment only as relation input.
            entities = (align_raw_entities(raw_entities, raw_tokens)
                        if need_relations else None)
            if not need_entities:
                raw_entities = None

        if not need_relations:
            return raw_entities, None

        relations = self.rel_clf.predict_doc(doc.with_entities(entities))
        return raw_entities, self._to_raw_relations(relations)