Example #1
    def _make_span(self, doc: Doc, start: int, end: int, label: str,
                   is_char: bool, retok: bool):
        span: Optional[Span]
        if is_char:
            if label is None:
                span = doc.char_span(start, end)
            else:
                span = doc.char_span(start, end, label=label)
        else:
            if label is None:
                span = Span(doc, start, end)
            else:
                span = Span(doc, start, end, label=label)
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(f'span ({start}, {end}) for {label}: {span}')
        # char_span returns None if the match does not map to a valid token
        # sequence
        if span is not None:
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug(f'match: {span.text}')
            if label is not None:
                doc.ents += (span,)
            if retok:
                # https://github.com/explosion/spaCy/discussions/4806
                with doc.retokenize() as retokenizer:
                    # Merge the span into one token. This is done after
                    # setting the entities; otherwise it would cause
                    # mismatched indices!
                    retokenizer.merge(span)
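
A minimal runnable sketch of the same pattern (the text and offsets are made up): char_span returns None for offsets that do not line up with token boundaries, which is why the None check above matters.

import spacy

nlp = spacy.blank('en')
doc = nlp('I work at Apple Inc in Cupertino')
assert doc.char_span(11, 19) is None       # starts mid-token -> None
span = doc.char_span(10, 19, label='ORG')  # 'Apple Inc', token-aligned
if span is not None:
    doc.ents += (span,)
    with doc.retokenize() as retokenizer:
        retokenizer.merge(span)            # 'Apple Inc' becomes one token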
Example #2
    def extract_entity(self, doc: Doc) -> List[Span]:
        food_spans = []
        lower_text = doc.text.lower()
        for food in self.food_names:
            food_index = lower_text.find(food)
            if food_index > -1:
                # char_span returns None when the match does not line up
                # with token boundaries; skip those matches
                span = doc.char_span(food_index, food_index + len(food))
                if span is not None:
                    food_spans.append(span)
        return food_spans
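
Note that str.find only reports the first occurrence of each name. A sketch for matching every occurrence (the use of re here is an assumption, not part of the original):

import re

def find_all_food_spans(doc, food_names):
    # hypothetical helper: collect a token-aligned span for every occurrence
    spans = []
    for food in food_names:
        for m in re.finditer(re.escape(food), doc.text.lower()):
            span = doc.char_span(m.start(), m.end())
            if span is not None:
                spans.append(span)
    return spans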
Example #3
def split_span(doc: Doc, span: Span) -> List[Span]:
    """
    Split a span into multiple spans (one single-token span per token).
    """
    s = doc.text
    new_spans = list()
    label = span.label_
    start_search = span.start_char
    for word in span:
        start = s.index(word.text, start_search, span.end_char)
        end = start + len(word.text)
        new_spans.append(doc.char_span(start, end, label=label))
        # resume the search after this token's end; advancing by the token
        # length alone could re-match text inside the current token when
        # whitespace separates tokens
        start_search = end
    return new_spans
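
A brief usage sketch (the sentence and label are made up):

import spacy
from spacy.tokens import Span

nlp = spacy.blank('en')
doc = nlp('The New York Stock Exchange opened.')
span = Span(doc, 1, 5, label='ORG')  # 'New York Stock Exchange'
print(split_span(doc, span))  # [New, York, Stock, Exchange], each labeled ORG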
Example #4
def offsets_from_tags(
    doc: Doc,
    tags: List[str],
    label_encoding: Optional[str] = "BIOUL",
    only_token_spans: bool = False,
) -> List[Dict]:
    """Converts BIOUL or BIO tags to offsets

    Parameters
    ----------
    doc
        A spaCy Doc created with `text` and the backbone tokenizer
    tags
        A list of BIOUL or BIO tags
    label_encoding
        The label encoding of the tags: BIOUL or BIO
    only_token_spans
        If True, the returned dicts contain only token index references. Default is False

    Returns
    -------
    offsets
        A list of dicts with the start/end character and token indices with respect to the doc, plus the span label:
        `{"start": int, "end": int, "start_token": int, "end_token": int, "label": str}`
    """
    # spacy's biluo_tags_to_offsets surprisingly does not check this ...
    if len(doc) != len(tags):
        raise ValueError(
            f"Number of tokens and tags must be the same, "
            f"but len({list(doc)}) != len({tags})"
        )

    if label_encoding == "BIO":
        tags = to_bioul(tags, encoding="BIO")

    offsets = []
    for start, end, label in biluo_tags_to_offsets(doc, tags):
        span = doc.char_span(start, end)
        data = {
            "start_token": span.start,
            "end_token": span.end,
            "label": label,
        }
        if not only_token_spans:
            data.update({"start": start, "end": end})
        offsets.append(data)
    return offsets
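
A minimal sketch of a call (the sentence and tags are made up; it assumes biluo_tags_to_offsets is imported from spacy.training and, for BIO input, to_bioul from allennlp, as the original appears to do):

import spacy

nlp = spacy.blank('en')
doc = nlp('Alice visited New York')
tags = ['U-PER', 'O', 'B-LOC', 'L-LOC']
offsets_from_tags(doc, tags)
# -> [{'start_token': 0, 'end_token': 1, 'label': 'PER', 'start': 0, 'end': 5},
#     {'start_token': 2, 'end_token': 4, 'label': 'LOC', 'start': 14, 'end': 22}]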
Example #5
    def __call__(self, doc: Doc):
        """Apply the pipeline component on a Doc object and modify it if matches
        are found. Return the Doc, so it can be processed by the next component
        in the pipeline, if available.

        References:
            - ``https://spacy.io/usage/processing-pipelines#component-example2``.

        Args:
            doc (Doc): spaCy document.

        Returns:
            doc
        """
        if not self.crf_extractor:
            raise RuntimeError("`CRFEntityExtractor` was not initialized. "
                               "Did you call the `.from_disk()` method?")

        example = {"doc": doc, "text": doc.text}
        self.spacy_tokenizer.tokenize(example, attribute="doc")

        spans = [
            doc.char_span(entity_dict["start"],
                          entity_dict["end"],
                          label=entity_dict["entity"])
            for entity_dict in self.crf_extractor.process(example)
        ]
        # char_span returns None for offsets that do not align with token
        # boundaries; drop those before touching doc.ents
        spans = [span for span in spans if span is not None]

        doc.ents = list(doc.ents) + spans
        with doc.retokenize() as retokenizer:
            for span in spans:
                # Merge each entity span into one token. This is done after
                # setting the entities; otherwise it would cause mismatched
                # indices! (Span.merge was removed in spaCy v3.)
                retokenizer.merge(span)

        return doc
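
A hypothetical wiring sketch (the constructor arguments, model path, and pipe name are all assumptions; the v2-style add_pipe matches the era of the original Span.merge call):

import spacy

nlp = spacy.load('en_core_web_sm')
crf = CRFEntityExtractor().from_disk('/path/to/crf_model')  # hypothetical ctor and path
nlp.add_pipe(crf, name='crf_entity_extractor', last=True)   # spaCy v2-style add_pipe
doc = nlp('book a table at a thai restaurant for two')
# doc.ents now also contains the CRF-predicted entities, each merged to one token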
Example #6
    def parse_read_doc(self, doc: Doc = None) -> List[Span]:
        # doc._.entities is expected to hold (start_char, end_char) pairs
        return [doc.char_span(span[0], span[1]) for span in doc._.entities]
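
If the stored offsets are not guaranteed to fall on token boundaries, the comprehension above yields None entries; a defensive variant (still assuming the custom doc._.entities extension holds (start_char, end_char) pairs):

    def parse_read_doc(self, doc: Doc = None) -> List[Span]:
        spans = (doc.char_span(start, end) for start, end in doc._.entities)
        return [s for s in spans if s is not None]  # drop misaligned offsets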
Example #7
def prepare_spacy_doc(doc: Doc, prediction: Dict) -> Doc:
    doc_rels = []
    doc_evs = []
    # Store events as relations; include confidence scores in each event dict
    # (TODO: add a relation property)
    for evs, ds in zip(prediction.get("predicted_events", []), doc.sents):
        sent_evs = []
        for ev in evs:
            if len(ev) >= 3:
                trig = [r for r in ev if r[1] == "TRIGGER"]
                arg0s = [r for r in ev if r[2] == "ARG0"]
                # example arg0s: [[40, 43, 'ARG0', 12.1145, 1.0], [45, 45, 'ARG0', 11.3498, 1.0]]
                arg1s = [r for r in ev if r[2] == "ARG1"]
                e_trig = doc[trig[0][0]:trig[0][0] + 1]
                for arg0 in arg0s:
                    e_arg0 = doc[arg0[0]:arg0[1] + 1]
                    for arg1 in arg1s:
                        e_arg1 = doc[arg1[0]:arg1[1] + 1]
                        # Confidence is set to the minimum among trigger and
                        # args, as a conservative measure.
                        sent_evs.append({
                            "ARG0": e_arg0,
                            "ARG1": e_arg1,
                            "RELATION_TRIGGER": e_trig,
                            "CONF": min([arg0[4], arg1[4], trig[0][3]]),
                        })

        doc_evs.append(sent_evs)
        ds._.events = sent_evs
    doc._.events = doc_evs
    # TODO: add doc._.span_ents too.

    for rels, ds in zip(prediction.get("predicted_relations", []), doc.sents):
        sent_rels = []
        for rel in rels:
            e1 = doc[rel[0]:rel[1] + 1]
            e2 = doc[rel[2]:rel[3] + 1]
            tag = rel[4]
            sent_rels.append((e1, e2, tag))
        doc_rels.append(sent_rels)
        ds._.rels = sent_rels
    doc._.rels = doc_rels
    if "predicted_ner" not in prediction:
        return doc
    preds = [p for r in prediction.get("predicted_ner", []) for p in r]
    # store all span-based entities in doc._.span_ents
    span_ents = []
    for sent in prediction["predicted_ner"]:
        ent_sent = []
        for ent in sent:
            d = doc[ent[0]:ent[1] + 1]
            d._.label_ = ent[2]
            ent_sent.append(d)
        span_ents.append(ent_sent)
    doc._.span_ents = span_ents
    # Store entities in doc.ents. spaCy does not support overlapping entities,
    # so overlapping predictions are merged into the longest span.
    dist_ents = []
    prc = []
    for i, p1 in enumerate(preds):
        t = [p1]
        if i in prc:
            continue
        for j, p2 in enumerate(preds[i + 1:]):
            if p2[0] <= p1[1]:
                # p2 overlaps p1: collect it so the whole group can be
                # merged into one span below
                t.append(p2)
                prc.append(j + i + 1)
        dist_ents.append(t)
    res = []
    for t in dist_ents:
        if len(t) == 1:
            res.append(t[0])
        elif len(t) > 1:
            mn = t[0][0]
            mx = t[0][1]
            for p in t[1:]:
                if p[0] < mn:
                    mn = p[0]
                if p[1] > mx:
                    mx = p[1]
            res.append([mn, mx, t[0][2], t[0][3], t[0][4]])
    sel_ents = []
    for ent in res:
        try:
            d = doc[ent[0]:ent[1] + 1]
            s = doc.char_span(d.start_char, d.end_char, label=ent[2])
            if s:
                sel_ents.append(s)
        except Exception as e:
            print("error in spacy span", e)
            raise e
    doc.ents = sel_ents
    return doc
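
A sketch of the prediction payload this function appears to expect, inferred from how the fields are indexed (the shape resembles DyGIE++-style output with document-level token indices; the extension defaults and all sample values are assumptions):

import spacy
from spacy.tokens import Doc, Span

# the custom extensions used above must be registered first
for ext in ('events', 'rels', 'span_ents'):
    Doc.set_extension(ext, default=None)
for ext in ('events', 'rels', 'label_'):
    Span.set_extension(ext, default=None)

nlp = spacy.blank('en')
nlp.add_pipe('sentencizer')  # doc.sents must be populated (spaCy v3 API)
doc = nlp('Alice founded Acme. Bob joined Acme.')
prediction = {
    # one inner list per sentence; spans are [start_tok, end_tok, label, ...]
    'predicted_ner': [[[0, 0, 'PER', 0.9, 1.0], [2, 2, 'ORG', 0.8, 1.0]],
                      [[4, 4, 'PER', 0.9, 1.0], [6, 6, 'ORG', 0.8, 1.0]]],
    'predicted_relations': [[[0, 0, 2, 2, 'FOUNDED']],
                            [[4, 4, 6, 6, 'JOINED']]],
    'predicted_events': [[], []],
}
doc = prepare_spacy_doc(doc, prediction)
print(doc.ents)  # (Alice, Acme, Bob, Acme)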