Example #1
    def __call__(self, doc: Doc):
        """
        The spacy pipeline caller
        :param doc: The Doc token.
        """

        # get matches
        phrase_matches = self.phrase_matcher(doc)
        matches = self.matcher(doc)

        # process them
        spans = []
        for match_id, start, end in phrase_matches + matches:
            # wrap each match in a Span and collect it for the entities list
            span = Span(doc, start, end, label=match_id)
            spans.append(span)
        # print('Before', [(ent.label_, ent.text) for ent in doc.ents])
        doc.ents = self._filter_spans(spans + list(doc.ents))
        # print('After', [(ent.label_, ent.text) for ent in doc.ents])
        return doc
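
Example #1 delegates overlap resolution to a `self._filter_spans` helper whose body is not shown. spaCy ships an equivalent utility, `spacy.util.filter_spans`, which keeps the longest span from each overlapping group; the sketch below wires the helper to it, an assumption rather than the original implementation.

# A minimal sketch of the _filter_spans helper called above (an assumption:
# the original body is not shown). spacy.util.filter_spans keeps the longest
# span from each overlapping group, with ties broken by earlier start offset.
from spacy.util import filter_spans

def _filter_spans(self, spans):
    return filter_spans(spans)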
Example #2
    def __call__(self, doc: Doc):
        """Apply the pipeline component on a Doc object and modify it if matches
        are found. Return the Doc, so it can be processed by the next component
        in the pipeline, if available.

        References:
            - ``https://spacy.io/usage/processing-pipelines#component-example2``.

        Args:
            doc (Doc): spaCy document.

        Returns:
            doc
        """
        if not self.crf_extractor:
            raise RuntimeError("`CRFEntityExtractor` was not initialized. "
                               "Did you call the `.from_disk()` method?")

        example = {"doc": doc, "text": doc.text}
        self.spacy_tokenizer.tokenize(example, attribute="doc")

        spans = [
            doc.char_span(entity_dict["start"],
                          entity_dict["end"],
                          label=entity_dict["entity"])
            for entity_dict in self.crf_extractor.process(example)
        ]

        doc.ents = list(doc.ents) + spans
        for span in spans:
            # Iterate over all spans and merge them into one token. This is done
            # after setting the entities – otherwise, it would cause mismatched
            # indices!
            span.merge()

        return doc
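
Note: `Span.merge()` as used above was deprecated in spaCy 2.1 and removed in spaCy 3. On newer versions the same post-processing step can be written with the `Doc.retokenize` context manager; a sketch, assuming spaCy >= 2.1:

# Sketch of the merging step for newer spaCy versions (assumes spaCy >= 2.1,
# where Doc.retokenize is available). Performing all merges inside a single
# retokenize block also avoids the index-invalidation issue noted above.
with doc.retokenize() as retokenizer:
    for span in spans:
        retokenizer.merge(span)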
Example #3
def prepare_spacy_doc(doc: Doc, prediction: Dict) -> Doc:
    doc_rels = []
    doc_evs = []
    # Store events as relations, including confidence scores in each event dict (TODO: add a relation property).
    for evs, ds in zip(prediction.get("predicted_events", []), doc.sents):
        sent_evs = []
        for ev in evs:
            if len(ev) >= 3:
                trig = [r for r in ev if r[1] == "TRIGGER"]
                arg0s = [r for r in ev if r[2] == "ARG0"]
                # example arg0s: [[40, 43, 'ARG0', 12.1145, 1.0], [45, 45, 'ARG0', 11.3498, 1.0]]
                arg1s = [r for r in ev if r[2] == "ARG1"]
                e_trig = doc[trig[0][0]:trig[0][0] + 1]
                for arg0 in arg0s:
                    e_arg0 = doc[arg0[0]:arg0[1] + 1]
                    for arg1 in arg1s:
                        e_arg1 = doc[arg1[0]:arg1[1] + 1]
                        # Confidence is set to the minimum among {trigger, args}, as a conservative measure.
                        sent_evs.append({
                            "ARG0": e_arg0,
                            "ARG1": e_arg1,
                            "RELATION_TRIGGER": e_trig,
                            "CONF": min(arg0[4], arg1[4], trig[0][3]),
                        })

        doc_evs.append(sent_evs)
        ds._.events = sent_evs
    doc._.events = doc_evs
    # TODO: add doc._.span_ents too.

    for rels, ds in zip(prediction.get("predicted_relations", []), doc.sents):
        sent_rels = []
        for rel in rels:
            e1 = doc[rel[0]:rel[1] + 1]
            e2 = doc[rel[2]:rel[3] + 1]
            tag = rel[4]
            sent_rels.append((e1, e2, tag))
        doc_rels.append(sent_rels)
        ds._.rels = sent_rels
    doc._.rels = doc_rels
    if "predicted_ner" not in prediction:
        return doc
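    # Flatten the per-sentence NER predictions into one list of
    # [token_start, token_end, label, score, confidence] rows.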
    preds = [p for r in prediction.get("predicted_ner", []) for p in r]
    # Store all span-based entities in doc._.span_ents.
    span_ents = []
    for sent in prediction["predicted_ner"]:
        ent_sent = []
        for ent in sent:
            d = doc[ent[0]:ent[1] + 1]
            d._.label_ = ent[2]
            ent_sent.append(d)
        span_ents.append(ent_sent)
    doc._.span_ents = span_ents
    # Store entities in doc.ents. Because spaCy does not support overlapping
    # entities, overlapping predictions are merged into the longest covering span.
    dist_ents = []
    prc = []
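    # `dist_ents` collects groups of mutually overlapping predictions;
    # `prc` records the indices of predictions already absorbed into a group.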
    for i, p1 in enumerate(preds):
        if i in prc:
            continue
        t = [p1]
        for j, p2 in enumerate(preds[i + 1:]):
            if p2[0] <= p1[1]:
                t.append(p2)
                prc.append(j + i + 1)
        dist_ents.append(t)
    res = []
    for t in dist_ents:
        if len(t) == 1:
            res.append(t[0])
        elif len(t) > 1:
            # Widen to the union of the overlapping token ranges, keeping the
            # first member's label and scores.
            mn = min(p[0] for p in t)
            mx = max(p[1] for p in t)
            res.append([mn, mx, t[0][2], t[0][3], t[0][4]])
    sel_ents = []
    for ent in res:
        try:
            d = doc[ent[0]:ent[1] + 1]
            s = doc.char_span(d.start_char, d.end_char, label=ent[2])
            if s:
                sel_ents.append(s)
        except Exception as e:
            print("error creating spaCy span:", e)
            raise
    doc.ents = sel_ents
    return doc
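
The grouping loop in Example #3 unions each cluster of overlapping predictions into one wide span. If keeping only the longest candidate is acceptable instead of unioning, spaCy's built-in `spacy.util.filter_spans` resolves the overlaps in one call; a sketch under that assumption:

# Alternative overlap policy (a sketch, not byte-for-byte equivalent to the
# loop above): keep the longest span in each overlapping cluster and drop
# the rest, using spaCy's built-in utility.
from spacy.tokens import Span
from spacy.util import filter_spans

candidate_spans = [Span(doc, p[0], p[1] + 1, label=p[2]) for p in preds]
doc.ents = filter_spans(candidate_spans)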
Example #4
 def process_without_overlaps(self, doc: Doc,
                              sorted_spans: _OrderedDictItemsView,
                              classes: OrderedDict, attributes: OrderedDict,
                              relations: OrderedDict) -> Doc:
     """:arg a SpaCy Doc, can be overwriten by the subclass as needed.
         This function will add spans to doc.ents (defined by SpaCy as default)
         which doesn't allow overlapped annotations.
         @param doc: initiated SpaCy Doc
         @param sorted_spans: a sorted OrderedDict Items ( spans[entity_id] = (start, end, span_text))
         @param classes: a OrderedDict to map a entity id to [entity label, [attr_ids]]
         @param attributes: a OrderedDict to map a attribute id to (attribute_name, attribute_value)
         @param relations: a OrderedDict to map a relation_id to (label, (relation_component_ids))
         @return: annotated Doc
     """
     existing_entities = list(doc.ents)
     new_entities = list()
     # token_left_bound = 0
     token_right_bound = len(doc) - 1
     token_start = -1
     token_end = -1
     for id, span_tuple in sorted_spans:
         # because SpaCy uses token offset instead of char offset to define Spans, we need to match them,
         # binary search is used here to speed up
         if self.store_anno_string:
             start, end, span_txt = span_tuple
         else:
             start, end = span_tuple
         if start < doc[0].idx:
              # If the annotation falls before the 1st spaCy token, snap the
              # span to the 1st token
             token_start = 0
             token_end = 1
         elif token_start >= token_right_bound:
              # If the annotation falls after the last spaCy token, snap the
              # span to the last token
             token_start = token_right_bound - 1
             token_end = token_right_bound
         else:
             token_start = self.find_start_token(start, token_start,
                                                 token_right_bound, doc)
              if end >= doc[-1].idx + len(doc[-1]):
                 token_end = token_right_bound + 1
             else:
                 token_end = self.find_end_token(end, token_start,
                                                 token_right_bound, doc)
         if token_start < 0 or token_start >= token_right_bound or token_end < 0 or token_end > token_right_bound:
              raise ValueError(
                  "It is likely your annotations overlap, which process_without_overlaps does not "
                  "support. Initialize the EhostDocReader with 'support_overlap=True' instead."
              )
         if token_start >= 0 and token_end > 0:
             span = Span(doc, token_start, token_end, label=classes[id][0])
             for attr_id in classes[id][1]:
                 if attr_id not in attributes:
                     continue
                 attr_name = attributes[attr_id][0]
                 attr_value = attributes[attr_id][1]
                 setattr(span._, attr_name, attr_value)
             if self.store_anno_string and span_txt is not None:
                 setattr(span._, "span_txt", span_txt)
             new_entities.append(span)
             token_start = token_end
         else:
             raise OverflowError(
                 'The span of the annotation: {}[{}:{}] is out of document boundary.'
                 .format(classes[id][0], start, end))
     doc.ents = existing_entities + new_entities
     return doc
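
Example #4 calls `find_start_token` and `find_end_token`, which the comments describe as binary searches mapping character offsets to token indices. A minimal sketch of what such a helper could look like (the body is an assumption for illustration; the real implementation lives on the reader class):

# Sketch of the character-offset-to-token binary search delegated to above
# (an assumption for illustration). Returns the index of the first token in
# [lo, hi] whose character span ends after char_offset.
def find_start_token(char_offset, lo, hi, doc):
    while lo < hi:
        mid = (lo + hi) // 2
        if doc[mid].idx + len(doc[mid]) <= char_offset:
            lo = mid + 1  # token mid ends at/before the offset; search right
        else:
            hi = mid      # token mid may contain the offset; search left
    return lo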