Example #1
    def __call__(self, doc: Doc):
        """
        The spacy pipeline caller
        :param doc: The Doc token.
        """

        # get matches
        phrase_matches = self.phrase_matcher(doc)
        matches = self.matcher(doc)

        # process them
        spans = []
        for match_id, start, end in phrase_matches + matches:
            # wrap each match in a Span and collect it for the entities list
            span = Span(doc, start, end, label=match_id)
            spans.append(span)
        # print('Before', [(ent.label_, ent.text) for ent in doc.ents])
        doc.ents = self._filter_spans(spans + list(doc.ents))
        # print('After', [(ent.label_, ent.text) for ent in doc.ents])
        return doc
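
Example #1 delegates overlap resolution to a `self._filter_spans` helper whose body is not shown. spaCy ships an equivalent utility, `spacy.util.filter_spans`, which keeps the longest span from each overlapping group; the sketch below wires the helper to it, an assumption rather than the original implementation.

# A minimal sketch of the _filter_spans helper called above (an assumption:
# the original body is not shown). spacy.util.filter_spans keeps the longest
# span from each overlapping group, with ties broken by earlier start offset.
from spacy.util import filter_spans

def _filter_spans(self, spans):
    return filter_spans(spans)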
Example #2
    def __call__(self, doc: Doc):
        """Apply the pipeline component on a Doc object and modify it if matches
        are found. Return the Doc, so it can be processed by the next component
        in the pipeline, if available.

        References:
            - ``https://spacy.io/usage/processing-pipelines#component-example2``.

        Args:
            doc (Doc): spaCy document.

        Returns:
            doc
        """
        if not self.crf_extractor:
            raise RuntimeError("`CRFEntityExtractor` was not initialized. "
                               "Did you call the `.from_disk()` method?")

        example = {"doc": doc, "text": doc.text}
        self.spacy_tokenizer.tokenize(example, attribute="doc")

        spans = [
            doc.char_span(entity_dict["start"],
                          entity_dict["end"],
                          label=entity_dict["entity"])
            for entity_dict in self.crf_extractor.process(example)
        ]

        doc.ents = list(doc.ents) + spans
        for span in spans:
            # Iterate over all spans and merge them into one token. This is done
            # after setting the entities – otherwise, it would cause mismatched
            # indices!
            span.merge()

        return doc
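
Note: `Span.merge()` as used above was deprecated in spaCy 2.1 and removed in spaCy 3. On newer versions the same post-processing step can be written with the `Doc.retokenize` context manager; a sketch, assuming spaCy >= 2.1:

# Sketch of the merging step for newer spaCy versions (assumes spaCy >= 2.1,
# where Doc.retokenize is available). Performing all merges inside a single
# retokenize block also avoids the index-invalidation issue noted above.
with doc.retokenize() as retokenizer:
    for span in spans:
        retokenizer.merge(span)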
Example #3
def prepare_spacy_doc(doc: Doc, prediction: Dict) -> Doc:
    doc_rels = []
    doc_evs = []
    # Store events as relations, including confidence scores in each event dict (TODO: add a relation property).
    for evs, ds in zip(prediction.get("predicted_events", []), doc.sents):
        sent_evs = []
        for ev in evs:
            if len(ev) >= 3:
                trig = [r for r in ev if r[1] == "TRIGGER"]
                arg0s = [r for r in ev if r[2] == "ARG0"]
                # example arg0s: [[40, 43, 'ARG0', 12.1145, 1.0], [45, 45, 'ARG0', 11.3498, 1.0]]
                arg1s = [r for r in ev if r[2] == "ARG1"]
                e_trig = doc[trig[0][0]:trig[0][0] + 1]
                for arg0 in arg0s:
                    e_arg0 = doc[arg0[0]:arg0[1] + 1]
                    for arg1 in arg1s:
                        e_arg1 = doc[arg1[0]:arg1[1] + 1]
                        # Confidence is set to the minimum among {trigger, args}, as a conservative measure.
                        sent_evs.append({
                            "ARG0": e_arg0,
                            "ARG1": e_arg1,
                            "RELATION_TRIGGER": e_trig,
                            "CONF": min(arg0[4], arg1[4], trig[0][3]),
                        })

        doc_evs.append(sent_evs)
        ds._.events = sent_evs
    doc._.events = doc_evs
    # TODO: add doc._.span_ents too.

    for rels, ds in zip(prediction.get("predicted_relations", []), doc.sents):
        sent_rels = []
        for rel in rels:
            e1 = doc[rel[0]:rel[1] + 1]
            e2 = doc[rel[2]:rel[3] + 1]
            tag = rel[4]
            sent_rels.append((e1, e2, tag))
        doc_rels.append(sent_rels)
        ds._.rels = sent_rels
    doc._.rels = doc_rels
    if "predicted_ner" not in prediction:
        return doc
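    # Flatten the per-sentence NER predictions into one list of
    # [token_start, token_end, label, score, confidence] rows.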
    preds = [p for r in prediction.get("predicted_ner", []) for p in r]
    # Store all span-based entities in doc._.span_ents.
    span_ents = []
    for sent in prediction["predicted_ner"]:
        ent_sent = []
        for ent in sent:
            d = doc[ent[0]:ent[1] + 1]
            d._.label_ = ent[2]
            ent_sent.append(d)
        span_ents.append(ent_sent)
    doc._.span_ents = span_ents
    # Store entities in doc.ents. Because spaCy does not support overlapping
    # entities, overlapping predictions are merged into the longest covering span.
    dist_ents = []
    prc = []
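    # `dist_ents` collects groups of mutually overlapping predictions;
    # `prc` records the indices of predictions already absorbed into a group.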
    for i, p1 in enumerate(preds):
        if i in prc:
            continue
        t = [p1]
        for j, p2 in enumerate(preds[i + 1:]):
            if p2[0] <= p1[1]:
                t.append(p2)
                prc.append(j + i + 1)
        dist_ents.append(t)
    res = []
    for t in dist_ents:
        if len(t) == 1:
            res.append(t[0])
        elif len(t) > 1:
            # Widen to the union of the overlapping token ranges, keeping the
            # first member's label and scores.
            mn = min(p[0] for p in t)
            mx = max(p[1] for p in t)
            res.append([mn, mx, t[0][2], t[0][3], t[0][4]])
    sel_ents = []
    for ent in res:
        try:
            d = doc[ent[0]:ent[1] + 1]
            s = doc.char_span(d.start_char, d.end_char, label=ent[2])
            if s:
                sel_ents.append(s)
        except Exception as e:
            print("error creating spaCy span:", e)
            raise
    doc.ents = sel_ents
    return doc
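
The grouping loop in Example #3 unions each cluster of overlapping predictions into one wide span. If keeping only the longest candidate is acceptable instead of unioning, spaCy's built-in `spacy.util.filter_spans` resolves the overlaps in one call; a sketch under that assumption:

# Alternative overlap policy (a sketch, not byte-for-byte equivalent to the
# loop above): keep the longest span in each overlapping cluster and drop
# the rest, using spaCy's built-in utility.
from spacy.tokens import Span
from spacy.util import filter_spans

candidate_spans = [Span(doc, p[0], p[1] + 1, label=p[2]) for p in preds]
doc.ents = filter_spans(candidate_spans)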
Example #4
 def process_without_overlaps(self, doc: Doc,
                              sorted_spans: _OrderedDictItemsView,
                              classes: OrderedDict, attributes: OrderedDict,
                              relations: OrderedDict) -> Doc:
     """:arg a SpaCy Doc, can be overwriten by the subclass as needed.
         This function will add spans to doc.ents (defined by SpaCy as default)
         which doesn't allow overlapped annotations.
         @param doc: initiated SpaCy Doc
         @param sorted_spans: a sorted OrderedDict Items ( spans[entity_id] = (start, end, span_text))
         @param classes: a OrderedDict to map a entity id to [entity label, [attr_ids]]
         @param attributes: a OrderedDict to map a attribute id to (attribute_name, attribute_value)
         @param relations: a OrderedDict to map a relation_id to (label, (relation_component_ids))
         @return: annotated Doc
     """
     existing_entities = list(doc.ents)
     new_entities = list()
     # token_left_bound = 0
     token_right_bound = len(doc) - 1
     token_start = -1
     token_end = -1
     for id, span_tuple in sorted_spans:
         # because SpaCy uses token offset instead of char offset to define Spans, we need to match them,
         # binary search is used here to speed up
         if self.store_anno_string:
             start, end, span_txt = span_tuple
         else:
             start, end = span_tuple
         if start < doc[0].idx:
              # If the annotation falls before the 1st spaCy token, snap the
              # span to the 1st token
             token_start = 0
             token_end = 1
         elif token_start >= token_right_bound:
              # If the annotation falls after the last spaCy token, snap the
              # span to the last token
             token_start = token_right_bound - 1
             token_end = token_right_bound
         else:
             token_start = self.find_start_token(start, token_start,
                                                 token_right_bound, doc)
              if end >= doc[-1].idx + len(doc[-1]):
                 token_end = token_right_bound + 1
             else:
                 token_end = self.find_end_token(end, token_start,
                                                 token_right_bound, doc)
         if token_start < 0 or token_start >= token_right_bound or token_end < 0 or token_end > token_right_bound:
              raise ValueError(
                  "It is likely your annotations overlap, which process_without_overlaps does not "
                  "support. Initialize the EhostDocReader with 'support_overlap=True' instead."
              )
         if token_start >= 0 and token_end > 0:
             span = Span(doc, token_start, token_end, label=classes[id][0])
             for attr_id in classes[id][1]:
                 if attr_id not in attributes:
                     continue
                 attr_name = attributes[attr_id][0]
                 attr_value = attributes[attr_id][1]
                 setattr(span._, attr_name, attr_value)
             if self.store_anno_string and span_txt is not None:
                 setattr(span._, "span_txt", span_txt)
             new_entities.append(span)
             token_start = token_end
         else:
             raise OverflowError(
                 'The span of the annotation: {}[{}:{}] is out of document boundary.'
                 .format(classes[id][0], start, end))
     doc.ents = existing_entities + new_entities
     return doc
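
Example #4 calls `find_start_token` and `find_end_token`, which the comments describe as binary searches mapping character offsets to token indices. A minimal sketch of what such a helper could look like (the body is an assumption for illustration; the real implementation lives on the reader class):

# Sketch of the character-offset-to-token binary search delegated to above
# (an assumption for illustration). Returns the index of the first token in
# [lo, hi] whose character span ends after char_offset.
def find_start_token(char_offset, lo, hi, doc):
    while lo < hi:
        mid = (lo + hi) // 2
        if doc[mid].idx + len(doc[mid]) <= char_offset:
            lo = mid + 1  # token mid ends at/before the offset; search right
        else:
            hi = mid      # token mid may contain the offset; search left
    return lo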