Python Span.merge Examples

Programming Language: Python

Namespace/Package Name: spacy.tokens

Class/Type: Span

Method/Function: merge

Examples at hotexamples.com: 3

Python Span.merge - 3 examples found. These are the top-rated real-world Python examples of spacy.tokens.Span.merge extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

set_extension(30)

Span(30)

has_extension(20)

merge(3)

get_extension(2)

kb_id_(2)

label_(2)

id_(1)

lower(1)

similarity(1)

Example #1

Show file

File: NER.py Project: Prodinal/GateSpacyWrapping

    def __call__(self, doc):
        """Add named entities to ``doc`` from a remote tagger's XML response.

        The document text is URL-quoted, appended to ``self.url`` and
        fetched. The reply is parsed as XML and every child of its
        ``AnnotationSet`` element whose ``Type`` is ``'Token'`` is inspected
        for an ``'NER-BIO1'`` tag of the form ``<prefix>-<label>``:

        * prefix ``'1'``: the token alone becomes an entity;
        * prefix ``'B'``: the token index is remembered in
          ``self.ner_begin_idx`` as the start of a multi-token entity;
        * prefix ``'E'``: a span from the remembered start through this
          token is merged into one token and added as an entity.

        Tags equal to ``'O'`` and any other prefix are ignored.

        Args:
            doc: the spaCy document to annotate; modified in place.

        Returns:
            The same ``doc`` with ``doc.ents`` extended.

        Raises:
            Exception: if an ``'E'`` tag is seen while no ``'B'`` tag is
                pending.
        """
        text = urllib.parse.quote_plus(doc.text)
        # Use a context manager so the HTTP response is always closed.
        with urllib.request.urlopen(self.url + text) as response:
            result = response.read()
        annotationset = ET.fromstring(result).find('AnnotationSet')
        # Iterate the element directly: Element.getchildren() was removed
        # from xml.etree.ElementTree in Python 3.9.
        for annotation in annotationset:
            if annotation.get('Type') != 'Token':
                continue

            # 'StartNode' is handed to get_token_by_idx to locate the token.
            word_index = int(annotation.get('StartNode'))
            token = self.get_token_by_idx(word_index, doc)
            ner_tag = self.get_value_from_annotation(annotation, 'NER-BIO1')
            ner_tag = ner_tag.strip()
            if ner_tag == 'O':  # this is a capital o letter
                continue

            ner_tag_parts = ner_tag.split('-')
            label = doc.vocab.strings[ner_tag_parts[1]]
            if ner_tag_parts[0] == '1':
                doc.ents = (list(doc.ents) +
                            [Span(doc, token.i, token.i + 1, label=label)])
            elif ner_tag_parts[0] == 'B':
                self.ner_begin_idx = token.i
            elif ner_tag_parts[0] == 'E':
                if self.ner_begin_idx is None:
                    # The two literals are concatenated; keep the space
                    # after the comma so the message reads correctly.
                    raise Exception('Found end of ner, '
                                    'when looking for beginning')
                span = Span(doc,
                            self.ner_begin_idx,
                            token.i + 1,
                            label=label)
                span.merge()
                doc.ents = (list(doc.ents) + [span])
                self.ner_begin_idx = None

        return doc

Example #2

Show file

    def __call__(self, doc):
        """Run the components in ``self.pipe_`` and register their entities.

        Each component is called as ``component(doc, entities)`` and its
        return value becomes the document passed to the next component. The
        shared ``entities`` list is then read as ``(start, end, label)``
        triples, each of which is appended to ``doc.ents`` as a ``Span``.
        When ``self.merge_entity_spans`` is truthy, every entity span is
        finally merged into a single token.

        Returns:
            The document produced by the last component, with its entities
            set.
        """
        found = []
        for component in self.pipe_:
            doc = component(doc, found)

        # Register the extracted triples one at a time on doc.ents.
        for start, end, label in found:
            new_entity = Span(doc, start, end, label=label)
            doc.ents = list(doc.ents) + [new_entity]

        # Collapsing each entity into one token is optional.
        if not self.merge_entity_spans:
            return doc

        for entity in doc.ents:
            entity.merge()

        return doc

Example #3

Show file

    def init_doc(self, words, spaces, spans):
        """Build a ``Doc`` from tokens, attach entity spans and merge them.

        Args:
            words: passed to ``Doc`` as ``words``.
            spaces: passed to ``Doc`` as ``spaces``.
            spans: iterable of dicts describing entities (GROBID entities,
                per the original comments). Each dict must provide
                ``'tokenStart'``, ``'tokenEnd'``, ``'type'`` and ``'id'``;
                ``'boundingBoxes'``, ``'formattedText'``, ``'links'`` and
                ``'linkable'`` are copied to the span when present.

        Returns:
            The new ``Doc``. Its entities have been merged into single
            tokens, ``self.nlp.tagger`` and ``self.nlp.parser`` have been
            run on it, and the spans from ``self.extract_phrases_ents`` that
            do not overlap an entity have been merged too.
        """
        # Create a new document from the pre-tokenised text.
        doc = Doc(self.nlp.vocab, words=words, spaces=spaces)

        # (key in the input dict, extension attribute on the span) pairs
        # that are copied only when the key is present.
        optional_fields = (
            ('boundingBoxes', 'bounding_boxes'),
            ('formattedText', 'formattedText'),
            ('links', 'links'),
            ('linkable', 'linkable'),
        )
        # Extension attributes propagated from each span to its tokens.
        propagated = ('id', 'bounding_boxes', 'formattedText', 'links',
                      'linkable')

        # Load the entities into the spaCy document.
        entities = []
        for s in spans:
            span = Span(doc=doc,
                        start=s['tokenStart'],
                        end=s['tokenEnd'],
                        label=s['type'])
            span._.set('id', str(s['id']))
            for key, attr in optional_fields:
                if key in s:
                    span._.set(attr, s[key])

            entities.append(span)

        doc.ents = entities
        for span in entities:
            # Merge each span into one token. This is done after setting
            # the entities - otherwise, it would cause mismatched indices!
            span.merge()
            for token in span:
                for attr in propagated:
                    setattr(token._, attr, getattr(span._, attr))
        self.nlp.tagger(doc)
        self.nlp.parser(doc)

        # Merge entities and phrase nouns, but only when they are not
        # overlapping, to avoid losing the entity type information.
        phrases_ents = self.extract_phrases_ents(doc)
        for span in phrases_ents:
            # Inclusive interval intersection. The original second clause,
            # `span.start <= ent.end >= span.end`, was a mistyped chained
            # comparison that flagged every entity ending at or after the
            # phrase, even one lying entirely to its right.
            overlapping = any(
                span.start <= ent.end and ent.start <= span.end
                for ent in entities
            )

            # Entities and phrase noun are not overlapping
            if not overlapping:
                span.merge()
        # NOTE(review): the tagger and parser are not re-run after the
        # phrase merges above.

        return doc