Exemple #1
0
    def __call__(self, doc):
        """Annotate ``doc`` with the entities reported by the remote tagger.

        The document text is URL-quoted, appended to ``self.url`` and fetched.
        The ``AnnotationSet`` element of the XML response is scanned for
        annotations of type ``Token``; each one carries an ``NER-BIO1`` value
        of the form ``<prefix>-<label>`` (or ``O`` for "no entity").

        Prefixes handled:

        * ``1`` -- single-token entity, added to ``doc.ents`` immediately.
        * ``B`` -- start of a multi-token entity; the token index is stored
          in ``self.ner_begin_idx``.
        * ``E`` -- end of the entity opened by the last ``B``; the span is
          merged into one token and added to ``doc.ents``.

        Any other prefix is ignored.

        :param doc: the document to annotate (modified in place).
        :return: the same ``doc``.
        :raises Exception: if an ``E`` tag is met while no ``B`` is pending.
        """
        text = urllib.parse.quote_plus(doc.text)
        # Use the response as a context manager so the connection is closed
        # deterministically instead of being left to the garbage collector.
        with urllib.request.urlopen(self.url + text) as response:
            result = response.read()
        annotationset = ET.fromstring(result).find('AnnotationSet')

        # Element.getchildren() was removed in Python 3.9; iterating the
        # element itself yields the same direct children.
        for annotation in annotationset:
            if annotation.get('Type') != 'Token':
                continue

            word_index = int(annotation.get('StartNode'))
            token = self.get_token_by_idx(word_index, doc)
            ner_tag = self.get_value_from_annotation(annotation, 'NER-BIO1')
            ner_tag = ner_tag.strip()
            if ner_tag == 'O':  # this is a capital o letter
                continue

            ner_tag_parts = ner_tag.split('-')
            label = doc.vocab.strings[ner_tag_parts[1]]
            prefix = ner_tag_parts[0]
            if prefix == '1':
                doc.ents = (list(doc.ents) +
                            [Span(doc, token.i, token.i + 1, label=label)])
            elif prefix == 'B':
                # Remember where the entity starts until its 'E' tag arrives.
                self.ner_begin_idx = token.i
            elif prefix == 'E':
                if self.ner_begin_idx is None:
                    raise Exception('Found end of ner,'
                                    'when looking for beginning')
                span = Span(doc,
                            self.ner_begin_idx,
                            token.i + 1,
                            label=label)
                span.merge()
                doc.ents = (list(doc.ents) + [span])
                self.ner_begin_idx = None

        return doc
Exemple #2
0
    def __call__(self, doc):
        """Run every component of ``self.pipe_`` and register the entities found.

        Each component is called as ``component(doc, entities)`` and its return
        value becomes the document handed to the next component. The shared
        ``entities`` list is expected to end up holding ``(start, end, label)``
        triples (presumably appended by the components -- confirm against
        their implementations).

        When ``self.merge_entity_spans`` is truthy, every entity span is merged
        into a single token afterwards.

        :param doc: the document to process.
        :return: the document returned by the last component.
        """
        entities = []
        for component in self.pipe_:
            doc = component(doc, entities)

        # Register the collected entities on the document one at a time.
        for start, end, label in entities:
            new_ent = Span(doc, start, end, label=label)
            doc.ents = [*doc.ents, new_ent]

        # Optionally collapse each entity into one token.
        if self.merge_entity_spans:
            for ent in doc.ents:
                ent.merge()

        return doc
Exemple #3
0
    def init_doc(self, words, spaces, spans):
        """Build a spaCy ``Doc`` from tokens and attach the supplied entities.

        Steps, in order:

        1. Create the ``Doc`` from ``words`` / ``spaces``.
        2. Turn every item of ``spans`` into a ``Span`` with custom
           extension attributes, and assign them all to ``doc.ents``.
        3. Merge each entity into one token and copy the entity's extension
           attributes onto its token(s).
        4. Run ``self.nlp.tagger`` and ``self.nlp.parser`` on the document.
        5. Merge the spans returned by ``self.extract_phrases_ents`` that do
           not touch any entity, so that entity type information is kept.

        :param words: token texts, passed to ``Doc(words=...)``.
        :param spaces: passed to ``Doc(spaces=...)``.
        :param spans: iterable of dicts with the keys ``tokenStart``,
            ``tokenEnd``, ``type`` and ``id``, and optionally
            ``boundingBoxes``, ``formattedText``, ``links`` and ``linkable``.
        :return: the populated ``Doc``.
        """
        # Creating a new document with the text
        doc = Doc(self.nlp.vocab, words=words, spaces=spaces)

        # Optional input key -> name of the span extension attribute it fills.
        optional_attrs = (
            ('boundingBoxes', 'bounding_boxes'),
            ('formattedText', 'formattedText'),
            ('links', 'links'),
            ('linkable', 'linkable'),
        )

        # Loading GROBID entities in the spaCy document
        entities = []
        for s in spans:
            span = Span(doc=doc,
                        start=s['tokenStart'],
                        end=s['tokenEnd'],
                        label=s['type'])
            span._.set('id', str(s['id']))
            for key, attr in optional_attrs:
                if key in s:
                    span._.set(attr, s[key])
            entities.append(span)

        doc.ents = entities

        for span in entities:
            # Merge each entity into one token. This is done after setting
            # doc.ents -- otherwise it would cause mismatched indices.
            span.merge()
            for token in span:
                token._.id = span._.id
                token._.bounding_boxes = span._.bounding_boxes
                token._.formattedText = span._.formattedText
                token._.links = span._.links
                token._.linkable = span._.linkable

        self.nlp.tagger(doc)
        self.nlp.parser(doc)

        # Merge noun phrases, but only when they do not overlap an entity,
        # to avoid losing the entity type information.
        phrases_ents = self.extract_phrases_ents(doc)
        for span in phrases_ents:
            # Inclusive interval-overlap test: two ranges intersect exactly
            # when each one starts no later than the other ends. The previous
            # condition contained ``span.start <= ent.end >= span.end``, which
            # is true for ANY entity ending at or after the phrase's end, so
            # phrases located entirely before an entity were never merged.
            # Boundaries stay inclusive (a phrase directly adjacent to an
            # entity still counts as overlapping), as in the other original
            # clauses.
            overlapping = any(
                span.start <= ent.end and ent.start <= span.end
                for ent in entities
            )

            # Entities and phrase noun are not overlapping
            if not overlapping:
                span.merge()

        return doc