def __call__(self, doc: Doc):
    """Run both matchers over the Doc and register every hit as an entity.

    :param doc: The Doc token.
    :return: the same Doc, with ``doc.ents`` extended by the matched spans.
    """
    # Collect hits from the phrase matcher and the token matcher alike;
    # each match tuple is (match_id, token_start, token_end).
    hits = self.phrase_matcher(doc) + self.matcher(doc)
    spans = [Span(doc, start, end, label=match_id)
             for match_id, start, end in hits]
    # Merge with any pre-existing entities; the filter resolves overlaps.
    doc.ents = self._filter_spans(spans + list(doc.ents))
    return doc
def __call__(self, doc: Doc):
    """Apply the pipeline component on a Doc object and modify it if matches
    are found. Return the Doc, so it can be processed by the next component
    in the pipeline, if available.

    References:
        - ``https://spacy.io/usage/processing-pipelines#component-example2``.

    Args:
        doc (Doc): spaCy document.

    Returns:
        doc

    Raises:
        RuntimeError: when the wrapped CRF extractor was never loaded.
    """
    if not self.crf_extractor:
        raise RuntimeError("`CRFEntityExtractor` was not initialized. "
                           "Did you call `.from_disk()` method ?")

    # The CRF extractor consumes a message-style dict keyed by "doc"/"text".
    message = {"doc": doc, "text": doc.text}
    self.spacy_tokenizer.tokenize(message, attribute="doc")

    # Turn each predicted entity dict into a character-offset Span.
    spans = []
    for entity_dict in self.crf_extractor.process(message):
        spans.append(doc.char_span(entity_dict["start"],
                                   entity_dict["end"],
                                   label=entity_dict["entity"]))

    doc.ents = list(doc.ents) + spans

    # Merge every span into a single token. This must happen only after
    # doc.ents has been set — merging first would cause mismatched indices!
    for span in spans:
        span.merge()
    return doc
def prepare_spacy_doc(doc: Doc, prediction: Dict) -> Doc:
    """Copy token-offset model predictions onto a spaCy Doc.

    Events and relations are attached per sentence via the custom extensions
    ``Span._.events`` / ``Span._.rels`` and aggregated on ``Doc._.events`` /
    ``Doc._.rels``.  Span-based NER predictions are stored (possibly
    overlapping) on ``Doc._.span_ents``; ``doc.ents`` receives a
    non-overlapping subset, because spaCy cannot represent overlapped
    entities — overlapping predictions are merged into one maximal span.

    Args:
        doc: tokenized spaCy Doc that the token offsets in *prediction*
            refer to.
        prediction: dict with optional keys ``predicted_events``,
            ``predicted_relations`` and ``predicted_ner``, each a
            per-sentence list of token-offset records.

    Returns:
        The same Doc, annotated in place.
    """
    doc_rels = []
    doc_evs = []
    # Store events as relations. Confidence scores ride along in the event
    # dict (TODO: add a proper relation property).
    for evs, ds in zip(prediction.get("predicted_events", []), doc.sents):
        sent_evs = []
        for ev in evs:
            if len(ev) >= 3:
                trig = [r for r in ev if r[1] == "TRIGGER"]
                # example arg0s: [[40, 43, 'ARG0', 12.1145, 1.0], [45, 45, 'ARG0', 11.3498, 1.0]]
                arg0s = [r for r in ev if r[2] == "ARG0"]
                arg1s = [r for r in ev if r[2] == "ARG1"]
                e_trig = doc[trig[0][0]:trig[0][0] + 1]
                for arg0 in arg0s:
                    e_arg0 = doc[arg0[0]:arg0[1] + 1]
                    for arg1 in arg1s:
                        e_arg1 = doc[arg1[0]:arg1[1] + 1]
                        # Confidence is the minimum over {trigger, args},
                        # as a conservative measure.
                        sent_evs.append({"ARG0": e_arg0,
                                         "ARG1": e_arg1,
                                         "RELATION_TRIGGER": e_trig,
                                         "CONF": min([arg0[4], arg1[4], trig[0][3]])})
        doc_evs.append(sent_evs)
        ds._.events = sent_evs
    doc._.events = doc_evs

    # TODO add doc._.span_ents too.
    for rels, ds in zip(prediction.get("predicted_relations", []), doc.sents):
        sent_rels = []
        for rel in rels:
            e1 = doc[rel[0]:rel[1] + 1]
            e2 = doc[rel[2]:rel[3] + 1]
            tag = rel[4]
            sent_rels.append((e1, e2, tag))
        doc_rels.append(sent_rels)
        ds._.rels = sent_rels
    doc._.rels = doc_rels

    if "predicted_ner" not in prediction:
        return doc
    preds = [p for r in prediction.get("predicted_ner", []) for p in r]

    # Store all span-based entities (overlaps allowed) on doc._.span_ents.
    span_ents = []
    for sent in prediction["predicted_ner"]:
        ent_sent = []
        for ent in sent:
            d = doc[ent[0]:ent[1] + 1]
            d._.label_ = ent[2]
            ent_sent.append(d)
        span_ents.append(ent_sent)
    doc._.span_ents = span_ents

    # Store entities to doc.ents of spaCy. Because spaCy can't support
    # overlapped entities we have to merge overlapped entities into the
    # longest ones. First, group mutually overlapping predictions.
    dist_ents = []
    prc = []
    for i, p1 in enumerate(preds):
        t = [p1]
        if i in prc:
            continue
        for j, p2 in enumerate(preds[i + 1:]):
            if p2[0] <= p1[1]:
                # BUG FIX: append the overlapping prediction itself (was
                # `t.append(p1)`), otherwise the merged span below could
                # never grow past p1's own right boundary.
                t.append(p2)
                prc.append(j + i + 1)
        dist_ents.append(t)

    # Collapse each group to one [start, end, label, score, score2] record,
    # keeping the label/scores of the group's first (leftmost) member.
    res = []
    for t in dist_ents:
        if len(t) == 1:
            res.append(t[0])
        elif len(t) > 1:
            mn = min(p[0] for p in t)
            mx = max(p[1] for p in t)
            res.append([mn, mx, t[0][2], t[0][3], t[0][4]])

    # Convert the surviving token ranges into char-aligned entity spans.
    sel_ents = []
    for ent in res:
        try:
            d = doc[ent[0]:ent[1] + 1]
            s = doc.char_span(d.start_char, d.end_char, label=ent[2])
            if s:
                sel_ents.append(s)
        except Exception as e:
            print("error in spacy span", e)
            raise e
    doc.ents = sel_ents
    return doc
def process_without_overlaps(self, doc: Doc, sorted_spans: _OrderedDictItemsView, classes: OrderedDict,
                             attributes: OrderedDict, relations: OrderedDict) -> Doc:
    """Attach non-overlapping annotations to ``doc.ents``.

    Adds spans to doc.ents (defined by SpaCy as default), which doesn't allow
    overlapped annotations; can be overwritten by a subclass as needed.
    Character offsets from the annotation file are aligned to spaCy token
    offsets via binary search (``find_start_token`` / ``find_end_token``).

    @param doc: initiated SpaCy Doc
    @param sorted_spans: a sorted OrderedDict Items ( spans[entity_id] = (start, end, span_text))
    @param classes: a OrderedDict to map a entity id to [entity label, [attr_ids]]
    @param attributes: a OrderedDict to map a attribute id to (attribute_name, attribute_value)
    @param relations: a OrderedDict to map a relation_id to (label, (relation_component_ids))
    @return: annotated Doc
    @raise ValueError: when a computed token range falls outside the document
        (likely caused by overlapping annotations, which this method rejects)
    @raise OverflowError: when a span resolves to a clearly invalid token range
    """
    existing_entities = list(doc.ents)
    new_entities = list()
    # token_left_bound = 0
    token_right_bound = len(doc) - 1
    # token_start/token_end persist across loop iterations: because the
    # spans arrive sorted, the previous span's end token is reused as the
    # search lower bound for the next span (see `token_start = token_end`
    # at the bottom of the loop).
    token_start = -1
    token_end = -1
    for id, span_tuple in sorted_spans:
        # because SpaCy uses token offset instead of char offset to define Spans, we need to match them,
        # binary search is used here to speed up
        if self.store_anno_string:
            # span_tuple carries the raw annotated text as a third element.
            start, end, span_txt = span_tuple
        else:
            start, end = span_tuple
        # because SpaCy uses token offset instead of char offset to define Spans, we need to match them,
        # binary search is used here to speed up
        if start < doc[0].idx:
            # If the annotation fall into a span that is before the 1st Spacy token, adjust the span to the 1st
            # token
            token_start = 0
            token_end = 1
        elif token_start >= token_right_bound:
            # If the annotation fall into a span that is after the last Spacy token, adjust the span to the last
            # token
            # NOTE(review): `token_start` here is still the value left over
            # from the PREVIOUS iteration, not this span's position — confirm
            # this is intentional (it holds only because spans are sorted).
            token_start = token_right_bound - 1
            token_end = token_right_bound
        else:
            # Binary-search the token whose char offset covers `start`,
            # resuming from the previous span's end token.
            token_start = self.find_start_token(start, token_start, token_right_bound, doc)
            if end >= doc[-1].idx + doc[-1].__len__():
                # Annotation runs past the last token's final character.
                # NOTE(review): this assigns token_right_bound + 1, which the
                # guard below (`token_end > token_right_bound`) immediately
                # rejects with ValueError — verify whether the guard or this
                # assignment is off by one.
                token_end = token_right_bound + 1
            else:
                token_end = self.find_end_token(end, token_start, token_right_bound, doc)
        if token_start < 0 or token_start >= token_right_bound or token_end < 0 or token_end > token_right_bound:
            raise ValueError(
                "It is likely your annotations overlapped, which process_without_overlaps doesn't support parsing "
                "those. You will need to initiate the EhostDocReader with 'support_overlap=True' in the arguements"
            )
        if token_start >= 0 and token_end > 0:
            span = Span(doc, token_start, token_end, label=classes[id][0])
            # Copy every attribute registered for this entity class onto the
            # span's extension namespace.
            for attr_id in classes[id][1]:
                if attr_id not in attributes:
                    continue
                attr_name = attributes[attr_id][0]
                attr_value = attributes[attr_id][1]
                setattr(span._, attr_name, attr_value)
            if self.store_anno_string and span_txt is not None:
                # Keep the original annotated text for later verification.
                setattr(span._, "span_txt", span_txt)
            new_entities.append(span)
            # Reuse this span's end as the next span's search lower bound.
            token_start = token_end
        else:
            raise OverflowError(
                'The span of the annotation: {}[{}:{}] is out of document boundary.'
                .format(classes[id][0], start, end))
        pass
    doc.ents = existing_entities + new_entities
    return doc