def get_extended_token(self, tok):
    """Given a token, find a more descriptive span by extending it with its children.

    :param tok: the token to extend
    :return: a Span covering the token and its contiguous children
    """
    allowed_dep = [
        "nsubj", "nsubjpass", "csubj", "csubjpass", "agent", "expl",
        "dobj", "attr", "oprd", "pobj",
        # "conj",
        "compound", "amod", "meta", "npadvmod", "nmod",
    ]  # add "prep" to extend for "of" and "in"
    extended_tokens = [
        i for i in tok.subtree
        if (i.dep_ in allowed_dep and i in tok.children) or (i == tok)
    ]
    # just keep tokens contiguous with tok
    span_range = [tok.i, tok.i]
    ext_tokens_i = [i.i for i in extended_tokens]
    max_bound = max(ext_tokens_i)
    min_bound = min(ext_tokens_i)
    # walk right from tok, extending while the indices stay consecutive
    curr_pos = tok.i
    for cursor in range(tok.i, max_bound + 1):
        if cursor in ext_tokens_i and cursor == curr_pos + 1:
            span_range[1] = cursor
            curr_pos = cursor
    # walk left from tok the same way
    curr_pos = tok.i
    for cursor in range(tok.i, min_bound - 1, -1):
        if cursor in ext_tokens_i and cursor == curr_pos - 1:
            span_range[0] = cursor
            curr_pos = cursor
    return Span(self.doc, span_range[0], span_range[1] + 1)
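# Hypothetical usage sketch (not from the original source): the method reads
# self.doc, so we call it with a minimal holder object; `_Holder`, the model,
# and the sentence are illustrative only, and the output is parse-dependent.
import spacy
from spacy.tokens import Span

class _Holder:
    def __init__(self, doc):
        self.doc = doc

nlp = spacy.load("en_core_web_sm")
doc = nlp("The big red dog barked.")
span = get_extended_token(_Holder(doc), doc[3])  # doc[3] == "dog"
print(span.text)  # expected (parse-dependent): "big red dog"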
def __call__(self, doc):
    matches = self.matcher(doc)
    spans = []
    for _, start, end in matches:
        entity = Span(doc, start, end, label=self.label)
        spans.append(entity)
        for token in entity:
            token._.set('is_nfl_player', True)
        doc.ents = list(doc.ents) + [entity]
    for span in spans:
        # Note: Span.merge() is deprecated since spaCy v2.1; doc.retokenize() is preferred
        span.merge()
    return doc
def __call__(self, doc: Doc):
    phrases = []
    last_start = 0
    for _, start, end in self._matcher(doc):
        # skip matches that end inside the previously kept match
        if last_start >= end:
            continue
        last_start = end
        span = Span(doc, start, end)
        phrases.append(span)
    # order by start, preferring longer spans on ties
    phrases.sort(key=lambda x: (x.start, -len(x)))
    setattr(doc._, self._phrases_name, _fix_overlappings(phrases))
    return doc
def _extract_content(tag: Span) -> Span:
    # collect the first contiguous run of content tokens, stopping at the
    # first non-content token
    start = None
    end = None
    for token in tag:
        if _is_content(token):
            if start is None:
                start = token.i
            end = token.i + 1
        else:
            break
    assert start is not None and end is not None
    return Span(tag.doc, start, end)
from typing import List, Sequence

from spacy.tokens import Doc, Span, Token


def decode_bilou(labels: Sequence[str], tokens: Sequence[Token], doc: Doc) -> List[Span]:
    """Convert a sequence of BILOU labels (e.g. "B-PER") into Spans."""
    outside = True
    current_type = None
    start_index = 0
    res = []
    for label, token in zip(labels, tokens):
        if "-" in label:
            # maxsplit=1 keeps hyphenated entity types such as "WORK-OF-ART" intact
            tag, entity_type = label.split("-", 1)
        else:
            tag, entity_type = "O", None
        if outside and tag != "O":
            # open a new entity
            current_type = entity_type
            outside = False
            start_index = token.i
        elif not outside:
            if tag == "O":
                # close the current entity
                outside = True
                res.append(
                    Span(doc=doc, start=start_index, end=token.i, label=current_type))
            elif tag == "B" or tag == "U" or entity_type != current_type:
                # close the current entity and open a new one
                res.append(
                    Span(doc=doc, start=start_index, end=token.i, label=current_type))
                current_type = entity_type
                start_index = token.i
    if not outside:
        # flush the entity still open at the end of the sequence
        res.append(
            Span(doc=doc, start=start_index, end=tokens[-1].i + 1, label=current_type))
    return res
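# Hypothetical usage sketch (not from the original source): decode BILOU
# labels produced by an external tagger back into spaCy Spans.
import spacy

nlp = spacy.blank("en")
doc = nlp("Alice met Bob in Paris")
labels = ["U-PER", "O", "U-PER", "O", "U-LOC"]
spans = decode_bilou(labels, list(doc), doc)
print([(s.text, s.label_) for s in spans])
# [('Alice', 'PER'), ('Bob', 'PER'), ('Paris', 'LOC')]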
def custom_pipe_component_Name_et_al(self, doc):
    new_ents = []
    for ent in doc.ents:
        replaced = False
        if ent.label_ == "PERSON":
            # assign the new label if "et al." is already inside the PERSON span
            if 'et' in ent.text and ('al' in ent.text or 'al.' in ent.text):
                new_ent = Span(doc, ent.start, ent.end, label="REF")
                replaced = True
            elif ent.end + 1 < len(doc):
                # check whether the tokens following the entity are "et al";
                # ent.end is exclusive, so doc[ent.end] is the next token
                # (the original read doc[ent.end + 1], skipping one token)
                next_token = doc[ent.end]
                next_next_token = doc[ent.end + 1]
                if next_token.text == "et" and next_next_token.text in ("al.", "al"):
                    new_ent = Span(doc, ent.start, ent.end + 2, label="REF")
                    replaced = True
        if replaced:
            # append the relabeled entity
            new_ents.append(new_ent)
        else:
            # carry over the old entity unchanged
            new_ents.append(ent)
    doc.ents = new_ents
    return doc
def tag_placing(doc, tag, string, sent):
    """Add tag to the word `string` in text `sent`."""
    doc.vocab.strings.add(tag)  # add the new tag to the StringStore
    TAG = doc.vocab.strings[tag]  # resolve it to a vocab hash
    indexes = list(find_all(sent, string))
    if len(indexes) > 0:
        st = string.split()
        # find the token indices of matching first words (allowing a plural "s")
        token_pos = [token.i for token in doc
                     if token.text.lower() in (st[0], st[0] + "s")]
        token_pos_real = []
        number_of_tokens = len(doc)
        for i in token_pos:
            if i + len(st) <= number_of_tokens:
                spantext = Span(doc, i, i + len(st)).text.lower()
                if spantext in (string, string + "s"):
                    token_pos_real.append(i)
        tag1 = tag  # keep a copy of our tag
        for i in token_pos_real:
            tag = tag1
            fb_ent = Span(doc, i, i + len(st))  # create a Span for the new entity
            if fb_ent in doc.ents:
                # handle words that carry several tags,
                # e.g. "Signalisation--BF Batiment--BF"
                j = doc.ents.index(fb_ent)
                ent = doc.ents[j].label_
                if tag not in ent:  # pre-existing tag
                    if ent not in ['PER', 'ORG', 'LOC', 'MISC']:
                        tag = ent + ' ' + tag
                    doc.vocab.strings.add(tag)
                    TAG = doc.vocab.strings[tag]
                    fb_ent = Span(doc, i, i + len(st), label=TAG)
                    doc.ents = list(doc.ents) + [fb_ent]  # update the NER tag list
            else:
                fb_ent = Span(doc, i, i + len(st), label=TAG)
                doc.ents = list(doc.ents) + [fb_ent]  # update the NER tag list
        return len(token_pos_real)
    return 0
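# `find_all` is not defined in this excerpt; a minimal compatible helper
# (hypothetical) that yields each start offset of `string` within `sent`:
def find_all(sent, string):
    start = sent.find(string)
    while start != -1:
        yield start
        start = sent.find(string, start + 1)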
def _fix_overlabelings(doc):
    good_labelings = set()
    for span in doc._.labelings:
        should_add_span = False
        for other_span in doc._.labelings:
            # good if spans are identical or they aren't overlapping
            if (span.start == other_span.start and span.end == other_span.end
                    or span.start >= other_span.end
                    or span.end <= other_span.start):
                should_add_span = True
                continue
            # exit loop as spans overlap but one is larger
            if (span.start > other_span.start and span.end <= other_span.end
                    or span.start >= other_span.start and span.end < other_span.end):
                should_add_span = False
                break
            # merge spans overlapping in a tail-head manner (last label wins)
            if (span.start < other_span.start
                    and span.end > other_span.start
                    and span.end < other_span.end) or (
                    span.start > other_span.start
                    and span.start < other_span.end
                    and span.end > other_span.end):
                if span.start < other_span.start:
                    start = span.start
                    end = other_span.end
                    label = other_span.label
                else:
                    start = other_span.start
                    end = span.end
                    label = span.label
                merge_span = Span(doc, start, end, label)
                good_labelings.add(merge_span)
                should_add_span = False
                break
        if should_add_span:
            good_labelings.add(span)
    doc._.labelings = list(good_labelings)
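# Hypothetical usage sketch (not from the original source): the function
# assumes a custom "labelings" extension holding candidate Spans; the label
# and sentence are illustrative only.
import spacy
from spacy.tokens import Doc, Span

Doc.set_extension("labelings", default=[], force=True)
nlp = spacy.blank("en")
doc = nlp("acute kidney failure after surgery")
doc._.labelings = [Span(doc, 0, 3, "DISEASE"), Span(doc, 1, 3, "DISEASE")]
_fix_overlabelings(doc)
print([s.text for s in doc._.labelings])
# the contained span is dropped; only "acute kidney failure" remains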
def _merge_abbrs_labelings(doc):
    num_labelings = len(doc._.labelings)
    chunk2label = {s.text: s.label for s in doc._.labelings}
    for abbr in doc._.abbrs:
        # first check if the long form is labeled and the short one is not
        if (abbr._.long_form.text in chunk2label
                and abbr.text not in chunk2label):
            label = chunk2label[abbr._.long_form.text]
            short_span = Span(doc, abbr.start, abbr.end, label)
            doc._.labelings.append(short_span)
        # otherwise check if the short form is labeled and the long one is not
        elif (abbr.text in chunk2label
              and abbr._.long_form.text not in chunk2label):
            lf = abbr._.long_form
            label = chunk2label[abbr.text]
            long_span = Span(doc, lf.start, lf.end, label)
            doc._.labelings.append(long_span)
    # re-sort labelings only if there were changes
    if num_labelings < len(doc._.labelings):
        _sort_labelings(doc)
def test_is_negated(self):
    doc = nlp("There is no evidence of pneumonia.")
    context = ConTextComponent(nlp, add_attrs=True, rules=None)
    rules = [
        ConTextRule("no evidence of", "NEGATED_EXISTENCE", direction="forward")
    ]
    context.add(rules)
    doc.ents = (Span(doc, 5, 6, "CONDITION"),)
    context(doc)
    assert doc.ents[0]._.is_negated is True
def product_component(doc):
    # apply the matcher to the doc
    matches = matcher(doc)
    # create a Span for each match and assign the label 'PRODUCT'
    spans = [
        Span(doc, start, end, label="PRODUCT")
        for match_id, start, end in matches
    ]
    # overwrite the doc.ents with the matched spans
    doc.ents = spans
    return doc
def animal_component(self, doc):
    # apply the matcher to the doc
    matches = self.matcher(doc)
    # create a Span for each match and assign the label 'ANIMAL'
    spans = [
        Span(doc, start, end, label="ANIMAL")
        for match_id, start, end in matches
    ]
    # extend the existing entities with the matched spans
    doc.ents = doc.ents + tuple(spans)
    return doc
def test_doc_is_nered(en_vocab):
    words = ["I", "live", "in", "New", "York"]
    doc = Doc(en_vocab, words=words)
    assert not doc.has_annotation("ENT_IOB")
    doc.ents = [Span(doc, 3, 5, label="GPE")]
    assert doc.has_annotation("ENT_IOB")
    # Test creating doc from array with unknown values
    arr = numpy.array([[0, 0], [0, 0], [0, 0], [384, 3], [384, 1]], dtype="uint64")
    doc = Doc(en_vocab, words=words).from_array([ENT_TYPE, ENT_IOB], arr)
    assert doc.has_annotation("ENT_IOB")
    # Test serialization
    new_doc = Doc(en_vocab).from_bytes(doc.to_bytes())
    assert new_doc.has_annotation("ENT_IOB")
def test_spans(doc):
    print(doc)
    print(doc.spans)
    for encoding in ["IO", "BIO", "BILUO"]:
        aggregator = aggregation.BaseAggregator("", ["GPE", "NORP", "ORG", "PERSON"],
                                                prefixes=encoding)
        obs = aggregator.get_observation_df(doc)
        print(obs)
        for source in ["name", "org", "place"]:
            spans = utils.token_array_to_spans(obs[source].values, aggregator.out_labels)
            spans = [Span(doc, start, end, label=label)
                     for (start, end), label in spans.items()]
            all_spans = utils.get_spans(doc, [source])
            assert spans == all_spans
def test_hmm3(doc2, combi_annotator):
    hmm = aggregation.HMM("hmm", ["GPE", "PRODUCT", "MONEY", "PERSON", "ORG", "DATE"])
    hmm.add_underspecified_label("ENT", {"GPE", "PRODUCT", "MONEY", "PERSON", "ORG", "DATE"})
    combi_annotator(doc2)
    hmm.fit([doc2] * 100)
    doc2 = hmm(doc2)
    assert len(doc2.spans["hmm"]) > 30
    assert len(doc2.spans["hmm"]) < 45
    assert Span(doc2, 1, 2, "GPE") in doc2.spans["hmm"]
    found_entities = {(span.text, span.label_) for span in doc2.spans["hmm"]}
    assert ('Scott Moore', 'PERSON') in found_entities
    assert (('197', 'MONEY') in found_entities
            or ("$197", "MONEY") in found_entities)
    assert ('iPhone 3Gs', 'PRODUCT') in found_entities
def __call__(self, doc):
    nlp = self.nlp
    with doc.retokenize() as retokenizer:
        # match and tag volume units
        matches = self.volume_matcher(doc)
        for match_id, start, end in matches:
            span = Span(doc, start, end, label=nlp.vocab.strings['volume_unit'])
            for token in span:
                token._.feature_is_volume_unit = True
            if len(span) > 1:
                retokenizer.merge(span)
            doc.ents = list(doc.ents) + [span]
    return doc
def animal_component(doc):
    # apply the matcher to the doc
    matches = matcher(doc)
    # create a Span for each match and assign the label 'ANIMAL'
    spans = [
        Span(doc, start, end, label="ANIMAL")
        for match_id, start, end in matches
    ]
    # overwrite the doc.ents with the matched spans
    doc.ents = spans
    return doc
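# Hypothetical registration sketch (not from the original source), assuming
# spaCy v3 and a module-level `matcher` like the one the component reads.
import spacy
from spacy.language import Language
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")
matcher = PhraseMatcher(nlp.vocab)
matcher.add("ANIMAL", [nlp.make_doc("cat"), nlp.make_doc("turtle")])

Language.component("animal_component", func=animal_component)
nlp.add_pipe("animal_component")
doc = nlp("I have a cat and a turtle")
print([(ent.text, ent.label_) for ent in doc.ents])
# [('cat', 'ANIMAL'), ('turtle', 'ANIMAL')]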
def test_docbins(nlp_small, temp_file="data/temporary_test.docbin"):
    doc = nlp_small("Pierre Lison is working at the Norwegian Computing Center.")
    doc2 = nlp_small("He is working on various NLP topics.")
    doc.spans["test"] = [Span(doc, 0, 2, label="PERSON")]
    utils.docbin_writer([doc, doc2], temp_file)
    doc3, doc4 = list(utils.docbin_reader(temp_file, "en_core_web_sm"))
    assert doc.text == doc3.text
    assert doc2.text == doc4.text
    assert [(e.text, e.label_) for e in doc.ents] == [(e.text, e.label_) for e in doc3.ents]
    assert doc.user_data == doc3.user_data
    os.remove(temp_file)
def merge_phrases(matcher, doc, i, matches):
    """Merge a phrase. We have to be careful here because we'll change the
    token indices. To avoid problems, merge all the phrases once we're called
    on the last match."""
    if i != len(matches) - 1:
        return None
    spans = [Span(doc, start, end, label=label) for label, start, end in matches]
    with doc.retokenize() as retokenizer:
        for span in spans:
            tag = "NNP" if span.label_ else span.root.tag_
            attrs = {"tag": tag, "lemma": span.text}
            retokenizer.merge(span, attrs=attrs)
            doc.ents = doc.ents + (span,)
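# Hypothetical registration sketch (not from the original source): hook
# merge_phrases up as an on_match callback of a PhraseMatcher (spaCy v3 API).
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

nlp = spacy.blank("en")
matcher = PhraseMatcher(nlp.vocab)
matcher.add("GPE", [nlp.make_doc("New York")], on_match=merge_phrases)
doc = nlp("I like New York in Autumn.")
matcher(doc)
print([t.text for t in doc])  # "New York" is now a single token
print(doc.ents)               # and also an entity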
def __call__(self, doc):
    nlp = self.nlp
    with doc.retokenize() as retokenizer:
        # match units of measurement (x/y etc.)
        matches = self.unit_of_measurement_matcher(doc)
        for match_id, start, end in matches:
            span = Span(doc, start, end, label=nlp.vocab.strings['measurement_unit'])
            for token in span:
                token._.feature_is_measurement_unit = True
            if len(span) > 1:
                retokenizer.merge(span)
            doc.ents = list(doc.ents) + [span]
    return doc
def __call__(self, doc):
    relations = []
    for match_id, start, end in self.matcher(doc):
        match = doc[start:end]
        for tok in match:
            tok.ent_type_ = self.label
            relations.append(tok)
        span = Span(doc, start, end, label=match_id)
        if span not in doc.ents:
            doc.ents = list(doc.ents) + [span]
    doc.user_data[self.label] = relations
    return doc
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc, Span


def animal_component(doc, vocab):
    animals = [
        "Golden Retriever", "cat", "turtle", "dog", "fish",
        "Rattus norvegicus", "snake", "lion", "tigger"
    ]
    matcher = PhraseMatcher(vocab)
    # the original never added patterns, so the matcher could not match;
    # build pattern Docs from the animal names (assumes simple whitespace
    # tokenization)
    patterns = [Doc(vocab, words=animal.split()) for animal in animals]
    matcher.add("ANIMAL", patterns)
    matches = matcher(doc)
    spans = [
        Span(doc, start, end, label="ANIMAL")
        for match_id, start, end in matches
    ]
    doc.ents = spans
    return doc
def _add_ann(self, cui, doc, tkns, acc=-1, name=None, is_disamb=False):
    """Add an annotation to a document.

    cui:  concept id
    doc:  spacy document where the concept was found
    tkns: tokens for this cui
    acc:  accuracy for this annotation
    name: concept name
    """
    # Skip if a TUI filter is set and this concept's TUI is not in it
    if self.TUI_FILTER is None or self.cdb.cui2tui[cui] in self.TUI_FILTER:
        if not is_disamb and cui in self.cdb.cui_disamb_always:
            self.to_disamb.append((list(tkns), name))
        else:
            if self.LBL_STYLE == 'long':
                lbl = "{} - {} - {} - {} - {:.2}".format(
                    cui,
                    self.cdb.cui2pretty_name.get(cui, ''),
                    self.cdb.cui2tui.get(cui, ''),
                    self.cdb.tui2name.get(self.cdb.cui2tui.get(cui, ''), ''),
                    float(acc))
            elif self.LBL_STYLE == 'ent':
                lbl = "{} - {:.2}".format(
                    self.cdb.tui2name.get(self.cdb.cui2tui.get(cui, ''), ''),
                    float(acc))
            else:
                lbl = cui
            lbl = doc.vocab.strings.add(lbl)
            ent = Span(doc, tkns[0].i, tkns[-1].i + 1, label=lbl)
            if self.ACC_ALWAYS:
                acc = self._calc_acc(cui, doc, tkns, name)
            ent._.acc = acc
            ent._.cui = cui
            ent._.tui = self.cdb.cui2tui.get(cui, 'None')
            ent._.id = self.ent_id
            self.ent_id += 1
            doc._.ents.append(ent)
            # Increase counter for cui_count_ext if not already added
            if cui not in self._cuis:
                if cui in self.cdb.cui_count_ext:
                    self.cdb.cui_count_ext[cui] += 1
                else:
                    self.cdb.cui_count_ext[cui] = 1
            if self.train or self.force_train:
                self._add_cntx_vec(cui, doc, tkns)
            self._cuis.add(cui)
def Serialization():
    print("\nThe outcomes of Serialization are:")
    try:
        # 'r' decodes the file as text and read() returns str;
        # 'rb' would read raw bytes and read() would return bytes.
        text = open("/home/wangdi498/SpaCy/diary2.txt", 'r').read()
        print("\nInfo: The Serialization file can be read.\n")
    except FileNotFoundError:
        print("\nError! The Serialization file cannot be read!\n")
        # unlike os._exit(), sys.exit() raises SystemExit and allows cleanup to run
        sys.exit(0)
    except:
        print("\nError! The .txt file must be UTF-8 encoded format!\n")
    doc = nlp(text)
    doc.to_disk("/home/wangdi498/SpaCy/diary1.bin")

    from spacy.tokens import Doc
    from spacy.vocab import Vocab
    doc = Doc(Vocab()).from_disk("/home/wangdi498/SpaCy/diary1.bin")
    print("The texts are:\n{}".format(doc))

    from spacy.tokens import Span
    doc = nlp(text)
    print("\nThe 1st round of Save and Load is:")
    for ent in doc.ents:
        # index by ent.start to get the entity's first token
        # (the original indexed doc by the entity's position in doc.ents)
        print("\t{}\t\t{}\t{}\t{}\t{}\t{}".format(
            ent.text, ent.start_char, ent.end_char, ent.label_,
            doc[ent.start].ent_iob_, doc[ent.start].ent_type_))
    assert len(doc.ents) != 0, "\nError! This document cannot be empty!"  # guard against an empty Doc
    augment = [Span(doc, 0, 2, label=doc.vocab.strings[u'EVENT'])]
    doc.ents = list(doc.ents) + augment
    doc.to_disk("/home/wangdi498/SpaCy/diary2.bin")
    print("\nThe 2nd round of Save and Load is:")
    for ent in doc.ents:
        print("\t{}\t\t{}\t{}\t{}\t{}\t{}".format(
            ent.text, ent.start_char, ent.end_char, ent.label_,
            doc[ent.start].ent_iob_, doc[ent.start].ent_type_))
    paragraph = Doc(Vocab()).from_disk("/home/wangdi498/SpaCy/diary2.bin")
    assert len(paragraph.ents) != 0, "\nError! This document cannot be empty!"  # guard against an empty Doc
    print("\nThe 3rd round of Save and Load is:")
    for ent in paragraph.ents:
        print("\t{}\t\t{}\t{}\t{}\t{}\t{}".format(
            ent.text, ent.start_char, ent.end_char, ent.label_,
            paragraph[ent.start].ent_iob_, paragraph[ent.start].ent_type_))
    assert [(ent.text, ent.label_) for ent in paragraph.ents] != [
        (u'2018年9月27日', u'EVENT')
    ], "\nHere! The entity '%s' has matched the specified one." % ent.text
def test_span_idx2i(nlp, start_idx, end_idx, start_i, end_i, slice_at):
    doc = nlp(_TEXT_SAMPLE)
    doc_bounds = span_idx2i(doc, start_idx, end_idx)
    assert doc_bounds == (start_i, end_i)
    offset_idx = len(doc[:slice_at].text_with_ws)
    fix_start_idx = start_idx - offset_idx
    fix_end_idx = end_idx - offset_idx
    span = Span(doc, slice_at, len(doc))
    span_bounds = span_idx2i(span, fix_start_idx, fix_end_idx)
    tokens = doc[slice_at:]
    tokens_bounds = span_idx2i(tokens, fix_start_idx, fix_end_idx)
    fix_bounds = (start_i - slice_at, end_i - slice_at)
    assert span_bounds == fix_bounds
    assert tokens_bounds == fix_bounds
def test_context_attributes(self):
    sectionizer = Sectionizer(
        nlp,
        rules=None,
        add_attrs={"past_medical_history": {"is_negated": True}})
    sectionizer.add([SectionRule("Past Medical History:", "past_medical_history")])
    doc = nlp("Past Medical History: Pneumonia")
    from spacy.tokens import Span

    doc.ents = (Span(doc, 4, 5),)
    sectionizer(doc)
    assert doc.ents[0]._.is_negated is True
def test_displacy_render_wrapper(en_vocab):
    """Test that displaCy accepts custom rendering wrapper."""

    def wrapper(html):
        return "TEST" + html + "TEST"

    displacy.set_render_wrapper(wrapper)
    doc = get_doc(en_vocab, words=["But", "Google", "is", "starting", "from", "behind"])
    doc.ents = [Span(doc, 1, 2, label=doc.vocab.strings["ORG"])]
    html = displacy.render(doc, style="ent")
    assert html.startswith("TEST<div")
    assert html.endswith("/div>TEST")
    # Restore the default no-op wrapper
    displacy.set_render_wrapper(lambda html: html)
def bunsetu_spans(doc):
    if type(doc) == Doc:
        # collect the indices of "B" labels, then pair consecutive boundaries
        b = [i for i, j in enumerate(doc.user_data["bunsetu_bi_labels"]) if j == "B"]
        b.append(len(doc))
        return [Span(doc, i, j) for i, j in zip(b, b[1:])]
    elif type(doc) == Span:
        b = doc[0].doc.user_data["bunsetu_bi_labels"]
        # include the bunsetu the span starts inside of
        s = [bunsetu_span(doc[0])] if b[doc[0].i] == "I" else []
        for t in doc:
            if b[t.i] == "B":
                s.append(bunsetu_span(t))
        return s
    elif type(doc) == Token:
        return [bunsetu_span(doc)]
def doc2(en_vocab):
    words = ["I", "like", "New", "York", "in", "Autumn", "."]
    heads = [1, 1, 3, 1, 1, 4, 1]
    tags = ["PRP", "IN", "NNP", "NNP", "IN", "NNP", "."]
    pos = ["PRON", "VERB", "PROPN", "PROPN", "ADP", "PROPN", "PUNCT"]
    deps = ["ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"]
    doc = Doc(en_vocab, words=words, heads=heads, tags=tags, pos=pos, deps=deps)
    doc.ents = [Span(doc, 2, 4, label="GPE")]
    return doc
def Entity():
    print("\nThe outcomes of Entity Extraction are:")
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(u"Apple isn't looking at buying U.S.A. startup for $1 billion.")
    for ent in doc.ents:
        print("\t{}\t\t{}\t{}\t{}".format(ent.text, ent.start_char, ent.end_char, ent.label_))

    from spacy.tokens import Span
    doc = nlp(u"FB is hiring a new VP of global policy.")
    doc.ents = [Span(doc, 0, 1, label=doc.vocab.strings[u'ORG'])]
    for ent in doc.ents:
        print("\t{}\t\t{}\t{}\t{}".format(ent.text, ent.start_char, ent.end_char, ent.label_))