def __init__(self, wordvecs=False, sentiws_path='data/sentiws/', polarity_modifiers_path='data/polarity_modifiers.pickle'):
    """
    Parameters:
    - wordvecs: if True, load the German spaCy model de_core_news_md (with word vectors); otherwise de_core_news_sm
    - sentiws_path: path to your SentiWS data
    - polarity_modifiers_path: path to your polarity modifier dict as a pickle
    """
    # load the German spaCy model
    if wordvecs:
        self.nlp = spacy.load('de_core_news_md')
    else:
        self.nlp = spacy.load('de_core_news_sm')
    # integrate the SentiWS corpus as a token attribute
    sentiws = spaCySentiWS(sentiws_path=sentiws_path)
    self.nlp.add_pipe(sentiws)
    self.doc = None
    self.modifiers = pickle.load(open(polarity_modifiers_path, 'rb'))
    if not Token.has_extension("modified"):
        Token.set_extension("modified", getter=self.modify_polarity)
    if not Token.has_extension("negated"):
        Token.set_extension("negated", getter=self.negate)
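# Note on the add_pipe call above: passing a component instance directly is the spaCy 2.x
# convention. A minimal sketch of the spaCy 3.x registration style, where components are
# added by a registered string name (the "sentiws_demo" factory below is an illustrative
# assumption, not the actual spaCySentiWS API):
import spacy
from spacy.language import Language

@Language.component("sentiws_demo")
def sentiws_demo(doc):
    # a real component would attach sentiment scores to each token here
    return doc

nlp_v3 = spacy.load("de_core_news_sm")
nlp_v3.add_pipe("sentiws_demo", last=True)  # spaCy 3.x: add components by name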
def set_extensions():
    if not Doc.has_extension('coref_chains'):
        Doc.set_extension('coref_chains', default=None)
    if not Token.has_extension('coref_chains'):
        Token.set_extension('coref_chains', default=None)
    if not Doc.has_extension('holmes_document_info'):
        Doc.set_extension('holmes_document_info', default=None)
    if not Token.has_extension('holmes'):
        Token.set_extension('holmes', default=None)
def fijar_extensiones(self):
    """Registers the extensions globally."""
    if not Token.has_extension("ok_token"):
        Token.set_extension("ok_token", default=True)
    if self.grupos:
        for grupo in self.grupos:
            if not Token.has_extension(grupo):
                Token.set_extension(grupo, default=False)
def test_call_lexicon_component(self):
    """
    Test running a doc through the lexicon component and properly overlaying features from the lexicon.
    """
    lexicon_component = LexiconOverlayer(self.nlp, self.lexicon)
    self.assertFalse(Token.has_extension('feature_is_ADE_from_lexicon'))
    self.assertFalse(Token.has_extension('feature_is_DRUG_from_lexicon'))
    lexicon_component(self.doc)
    self.assertTrue(Token.has_extension('feature_is_ADE_from_lexicon'))
    self.assertTrue(Token.has_extension('feature_is_DRUG_from_lexicon'))
def test_call_lexicon_component(self):
    """
    Test running a doc through the lexicon component and properly overlaying features from the lexicon.
    :return:
    """
    lexicon_component = LexiconComponent(self.nlp, self.lexicon)
    self.assertIs(Token.has_extension('feature_is_ADE_from_lexicon'), False)
    self.assertIs(Token.has_extension('feature_is_DRUG_from_lexicon'), False)
    doc = lexicon_component(self.doc)
    self.assertIs(Token.has_extension('feature_is_ADE_from_lexicon'), True)
    self.assertIs(Token.has_extension('feature_is_DRUG_from_lexicon'), True)
def _set_extensions(self):
    """Sets the default extensions if they do not exist yet."""
    for obj in Doc, Span, Token:
        if not obj.has_extension(self.ext_names["conll_str"]):
            obj.set_extension(self.ext_names["conll_str"], default=None)
        if not obj.has_extension(self.ext_names["conll"]):
            obj.set_extension(self.ext_names["conll"], default=None)
        if PD_AVAILABLE and not self.disable_pandas:
            if not obj.has_extension(self.ext_names["conll_pd"]):
                obj.set_extension(self.ext_names["conll_pd"], default=None)
    # Adds fields from the CoNLL-U format that are not available in spaCy.
    # However, ConllParser might set these fields when it has read a CoNLL string into spaCy.
    if not Token.has_extension("conll_deps_graphs_field"):
        Token.set_extension("conll_deps_graphs_field", default="_")
    if not Token.has_extension("conll_misc_field"):
        Token.set_extension("conll_misc_field", default="_")
    if not Span.has_extension("conll_metadata"):
        Span.set_extension("conll_metadata", default=None)
def __init__(self, nlp: Language):
    """Initialise components"""
    if not Token.has_extension("dependency_distance"):
        Token.set_extension("dependency_distance", getter=self.token_dependency)
    if not Span.has_extension("dependency_distance"):
        Span.set_extension("dependency_distance", getter=self.span_dependency)
    if not Doc.has_extension("dependency_distance"):
        Doc.set_extension("dependency_distance", getter=self.doc_dependency)
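# The getter-based registration above means the value is recomputed lazily on every
# attribute access rather than stored on the object. A small self-contained sketch of
# the same pattern (the char_count extension here is illustrative, not part of the
# snippet above):
import spacy
from spacy.tokens import Doc, Token

if not Token.has_extension("char_count"):
    Token.set_extension("char_count", getter=lambda token: len(token.text))
if not Doc.has_extension("char_count"):
    Doc.set_extension("char_count", getter=lambda doc: sum(t._.char_count for t in doc))

demo_nlp = spacy.blank("en")
demo_doc = demo_nlp("dependency distance demo")
print(demo_doc._.char_count)  # computed on access via the getters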
def __init__(self, in_names, extension_name="is_name"):
    """in_names is an iterable of names, where each name consists of at most one word (so no spaces).
    Names that contain spaces are removed from the set of names."""
    in_names_set = set(in_names)
    names = {
        AccentRemover.remove_accents(name.casefold())
        for name in in_names_set
        if len(name.split(" ")) == 1
    }
    self.names = names
    self.extension_name = extension_name
    if not Token.has_extension(extension_name):
        Token.set_extension(extension_name, default=False)
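# AccentRemover.remove_accents is not shown in this snippet. A minimal standard-library
# sketch of what such a helper could look like (an assumption, not the original
# implementation):
import unicodedata

def remove_accents_demo(text: str) -> str:
    # decompose characters (NFKD) and drop the combining marks
    decomposed = unicodedata.normalize("NFKD", text)
    return "".join(ch for ch in decomposed if not unicodedata.combining(ch))

print(remove_accents_demo("José".casefold()))  # -> "jose"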
def write_conllu(docs, file_):
    if not Token.has_extension("get_conllu_lines"):
        Token.set_extension("get_conllu_lines", method=get_token_conllu)
    if not Token.has_extension("begins_fused"):
        Token.set_extension("begins_fused", default=False)
    if not Token.has_extension("inside_fused"):
        Token.set_extension("inside_fused", default=False)
    merger = Matcher(docs[0].vocab)
    merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
    for i, doc in enumerate(docs):
        matches = []
        if doc.is_parsed:
            matches = merger(doc)
        spans = [doc[start:end + 1] for _, start, end in matches]
        seen_tokens = set()
        with doc.retokenize() as retokenizer:
            for span in spans:
                span_tokens = set(range(span.start, span.end))
                if not span_tokens.intersection(seen_tokens):
                    retokenizer.merge(span)
                    seen_tokens.update(span_tokens)
        file_.write("# newdoc id = {i}\n".format(i=i))
        for j, sent in enumerate(doc.sents):
            file_.write("# sent_id = {i}.{j}\n".format(i=i, j=j))
            file_.write("# text = {text}\n".format(text=sent.text))
            for k, token in enumerate(sent):
                if token.head.i > sent[-1].i or token.head.i < sent[0].i:
                    for word in doc[sent[0].i - 10:sent[0].i]:
                        print(word.i, word.head.i, word.text, word.dep_)
                    for word in sent:
                        print(word.i, word.head.i, word.text, word.dep_)
                    for word in doc[sent[-1].i:sent[-1].i + 10]:
                        print(word.i, word.head.i, word.text, word.dep_)
                    raise ValueError(
                        "Invalid parse: head outside sentence (%s)" % token.text)
                file_.write(token._.get_conllu_lines(k) + "\n")
            file_.write("\n")
def __init__(self):
    # register Token attributes if they are not registered already
    from spacy.tokens import Token
    for attr_name in [
        "speaker", "start_time", "end_time", "confidence", "entity_linking", "addressee"
    ]:
        if not Token.has_extension(attr_name):
            Token.set_extension(attr_name, default=None)
    # register Span attributes if they are not registered already
    from spacy.tokens import Span
    if not Span.has_extension("speaker"):
        Span.set_extension("speaker", getter=self.span_speaker)
    if not Span.has_extension("start_time"):
        Span.set_extension("start_time", getter=self.span_start_time)
    if not Span.has_extension("end_time"):
        Span.set_extension("end_time", getter=self.span_end_time)
    if not Span.has_extension("confidence"):
        Span.set_extension("confidence", getter=self.span_average_confidence)
    if not Span.has_extension("entity_linking"):
        Span.set_extension("entity_linking", getter=self.span_entity_linking)
    if not Span.has_extension("addressee"):
        Span.set_extension("addressee", getter=self.span_addressee)
    # minimalist spaCy pipeline (used only for its tokenizer)
    self.tokenizer = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"])
    # custom spaCy pipeline (that adds forced alignment attributes and ensures
    # that a new sentence starts at every speaker change)
    self.nlp = spacy.load("en_core_web_sm")
    self.nlp.add_pipe(self.placeholder, name="forced_alignment", first=True)
    self.nlp.add_pipe(self.start_sentence_at_speaker_change, after="forced_alignment")
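# start_sentence_at_speaker_change is not shown above. A rough sketch of how such a
# component could be written, assuming every token already carries the custom `speaker`
# attribute registered above (names and logic here are an assumption, not the original
# implementation):
def start_sentence_at_speaker_change_demo(doc):
    for i, token in enumerate(doc):
        if i > 0 and token._.speaker != doc[i - 1]._.speaker:
            token.is_sent_start = True  # must run before the parser to take effect
    return doc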
def test_overlays_cuis(self):
    """Tests that the MetaMapOverlayer overlays CUIs correctly given a document that hasn't been metamapped"""
    doc = self.nlp('I took Tylenol and it gave me nausea and chest pain')
    metamap = MetaMap(metamap_path)
    metamap_component = MetaMapOverlayer(self.nlp, metamap)
    metamap_component(doc)
    self.assertTrue(Token.has_extension('feature_cui'))
    cuis = [token._.feature_cui for token in doc]
    # Test that at least one of the features is a CUI
    any_match = any(re.match(r'C\d+', c) for c in cuis)
    self.assertTrue(any_match)
    # Test that all features are a CUI or '-1'
    all_match = all(re.match(r'(C\d+)|(-1)', c) for c in cuis)
    self.assertTrue(all_match)
def __init__(self, first_name_extension_name=FirstNameListMatcher.EXTENSION_NAME,
             last_name_extension_name=LastNameListMatcher.EXTENSION_NAME):
    self.token_extension_name = self.TOKEN_EXTENSION_NAME
    self.span_extension_name = self.SPAN_EXTENSION_NAME
    self.doc_extension_name = self.DOC_EXTENSION_NAME
    self.first_name_extension_name = first_name_extension_name
    self.last_name_extension_name = last_name_extension_name
    if not Token.has_extension(self.token_extension_name):
        Token.set_extension(self.token_extension_name, default=self.ANOT_NONE)
    if not Span.has_extension(self.span_extension_name):
        Span.set_extension(self.span_extension_name, getter=self.is_full_name_getter)
    if not Doc.has_extension(self.doc_extension_name):
        Doc.set_extension(self.doc_extension_name, default=[])
def test_overlays_cuis(self):
    """Tests that the MetaMapOverlayer overlays CUIs correctly given a document that hasn't been metamapped"""
    sample_doc = sample_dataset.data_files[0].txt_path
    with open(sample_doc) as f:
        sample_text = f.read()
    doc = self.nlp(sample_text)
    metamap = MetaMap(metamap_path)
    metamap_component = MetaMapOverlayer(self.nlp, metamap)
    metamap_component(doc)
    self.assertTrue(Token.has_extension('feature_cui'))
    cuis = [token._.feature_cui for token in doc]
    # Test that at least one of the features is a CUI
    any_match = any(re.match(r'C\d+', c) for c in cuis)
    self.assertTrue(any_match)
    # Test that all features are a CUI or '-1'
    all_match = all(re.match(r'(C\d+)|(-1)', c) for c in cuis)
    self.assertTrue(all_match)
def __init__(self, nlp, keywords, label, tokentag, doctag=None, spantag=None):
    nlp.vocab.strings.add(label)
    self.label = nlp.vocab.strings[label]
    self._label_str = label
    self._token_tag = tokentag
    self._doctag = doctag
    self._spantag = spantag
    self._keywordtag = "is_keyword"
    self._labeltag = "label_"
    # Set up the PhraseMatcher – it can take Doc objects as patterns,
    # so even if the list of keywords is long, it's very efficient
    patterns = [nlp(key) for key in keywords]
    self.matcher = PhraseMatcher(nlp.vocab)
    self.matcher.add(self._token_tag, None, *patterns)
    # Register attribute on the Token. We'll be overwriting this based on
    # the matches, so we're only setting a default value, not a getter.
    Token.set_extension(self._token_tag, default=False)
    if not Token.has_extension(self._keywordtag):
        Token.set_extension(self._keywordtag, default=False)
    Token.set_extension(self._labeltag, default=None)
    # Register attributes on Doc and Span via a getter that checks if one of
    # the contained tokens has the token tag set to True.
    Doc.set_extension(self._doctag, getter=lambda tokens: any(
        [t._.get(self._token_tag) for t in tokens]))
    Span.set_extension(self._spantag, getter=lambda tokens: any(
        [t._.get(self._token_tag) for t in tokens]))
    if not Span.has_extension("dep_"):
        Span.set_extension("dep_", default="")
        Span.set_extension("head_", default=None)
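# The patterns above are built with nlp(key), which runs the full pipeline on every
# keyword. When only tokenization is needed for PhraseMatcher patterns, spaCy's documented
# shortcut is nlp.make_doc, which is much faster for long keyword lists. A small
# self-contained sketch (keyword list and names are illustrative):
import spacy
from spacy.matcher import PhraseMatcher

kw_nlp = spacy.blank("en")
keyword_list = ["machine learning", "neural network"]
phrase_patterns = [kw_nlp.make_doc(kw) for kw in keyword_list]  # tokenizer only
phrase_matcher = PhraseMatcher(kw_nlp.vocab)
phrase_matcher.add("KEYWORD", phrase_patterns)  # spaCy 3.x signature; 2.x takes (key, None, *patterns)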
def tool_hashtags(text: str):
    # nlp = es_core_news_md.load()
    matcher = Matcher(nlp.vocab)
    matcher.add("HASHTAG", [[{"ORTH": "#"}, {"IS_ASCII": True}]])
    # This used to raise an error because "is_hashtag" was already registered on Token.
    # Use Token.has_extension to check whether it already exists; otherwise register it.
    # See the documentation: https://spacy.io/api/token
    if not Token.has_extension("is_hashtag"):
        Token.set_extension("is_hashtag", default=False)
    within_hash = []
    with_hash = []
    # doc = nlp(" ".join(text))
    doc = nlp(text)
    matches = matcher(doc)
    hashtags = []
    for match_id, start, end in matches:
        if doc.vocab.strings[match_id] == "HASHTAG":
            hashtags.append(doc[start:end])
    with doc.retokenize() as retokenizer:
        for span in hashtags:
            # print(hashtags)
            retokenizer.merge(span)
            for token in span:
                if not token._.is_hashtag:
                    token._.is_hashtag = True
                # print(token)
    for token in doc:
        # print(token.text, token._.is_hashtag)
        if not token._.is_hashtag:
            within_hash.append(token.text)
            with_hash.append(token.text)
        else:
            with_hash.append(token.text)
    # print(within_hash, with_hash, len(hashtags))
    vector = within_hash, with_hash, len(hashtags)
    # print(vector[1])
    return vector
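# A quick usage sketch for tool_hashtags (assumes the module-level Spanish `nlp` model
# referenced above is loaded; the sample tweet is illustrative):
# plain_tokens, all_tokens, n_hashtags = tool_hashtags("me encanta #python y #nlp")
# `plain_tokens` holds the tokens outside hashtags, `all_tokens` every token with the
# hashtags merged into single tokens, and `n_hashtags` the number of matched spans.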
def extract(self, text):
    # instantiate the parser
    nlp = spacy.load('ja_ginza')
    # instantiate the matcher
    matcher = Matcher(nlp.vocab)
    # TextRank
    tr = pytextrank.TextRank()
    nlp.add_pipe(tr.PipelineComponent, name='textrank', last=True)
    if not Token.has_extension("_5w1h"):
        Token.set_extension("_5w1h", default='None')
    if not Token.has_extension("_type"):
        Token.set_extension("_type", default=None)

    # 5W1H patterns
    Where_pattern1 = [{"POS": {"REGEX": "NOUN|PROPN|PRON"}, "ENT_TYPE": {"REGEX": "Country|Province"}}]
    Where_pattern2 = [{"TEXT": {"REGEX": "Amazon"}}]
    Where_pattern3 = [{"TEXT": "中"}, {"TEXT": "に"}]
    When_pattern1 = [{"TEXT": {"REGEX": "今日|昨日|おととい|明日|先日|明後日|今回|前回|後ほど|その後"}}]
    When_pattern2 = [{"POS": "NUM"}, {"TEXT": {"REGEX": "時|日|月|年"}, "LENGTH": 1}]
    When_pattern3 = [{"TEXT": {"REGEX": "今"}, "LENGTH": 1}]
    When_pattern4 = [{"TEXT": "この"}, {"TEXT": {"REGEX": "後|前"}}]
    Why_pattern1 = [{"POS": {"REGEX": "VERB|ADJ"}, "OP": "+"}, {"POS": {"REGEX": "AUX"}, "OP": "?"}, {"TEXT": "から"}]
    How_pattern1 = [{"POS": {"REGEX": "VERB|ADJ|AUX|PART"}, "DEP": {"REGEX": "ROOT|punct"}}]
    How_pattern2 = [{"POS": {"REGEX": "VERB|ADJ"}, "OP": "+"}, {"DEP": {"REGEX": "ROOT|cc"}}]
    How_pattern3 = [{"POS": {"REGEX": "VERB|ADJ|NOUN"}, "OP": "+"}, {"POS": {"REGEX": "AUX|ADP"}, "OP": "?"}, {"DEP": {"REGEX": "ROOT"}}]
    How_pattern4 = [{"POS": {"REGEX": "VERB|ADJ"}, "OP": "+"}, {"OP": "?"}, {"POS": "PUNCT"}]
    How_pattern5 = [{"POS": {"REGEX": "VERB|ADJ|AUX"}, "OP": "*"}, {"TEXT": "の"}, {"TEXT": "に"}]
    How_pattern6 = [{"POS": "VERB", "OP": "?"}, {"POS": "AUX", "OP": "+"}, {"TEXT": "が", "POS": "CCONJ"}, {"POS": "PUNCT", "OP": "?"}]
    How_pattern7 = [{"POS": "VERB"}, {"POS": "AUX"}, {"TEXT": "ところ", "POS": "NOUN"}]
    How_pattern8 = [{"POS": "VERB", "DEP": "advcl"}, {"TEXT": "し", "POS": "CCONJ"}]
    How_pattern9 = [{"POS": {"REGEX": "VERB|ADJ"}}, {"TEXT": "ん", "OP": "?"}, {"TEXT": "です", "OP": "?"}, {"LEMMA": "けれど"}, {"TEXT": "も", "OP": "?"}, {"TEXT": "、", "OP": "?"}]
    How_pattern10 = [{"POS": "AUX", "TEXT": {"NOT_IN": ["で"]}}, {"POS": {"REGEX": "PUNCT"}}]
    How_pattern11 = [{"POS": {"REGEX": "VERB|ADJ|AUX"}, "OP": "*"}, {"TEXT": "けれど"}, {"TEXT": "も"}]
    How_pattern12 = [{"POS": "VERB", "OP": "?"}, {"POS": "AUX", "OP": "+"}, {"TEXT": "と"}, {"TEXT": "か", "OP": "?"}, {"POS": "PUNCT", "OP": "?"}]
    How_pattern13 = [{"POS": "AUX"}, {"TEXT": "が", "POS": {"REGEX": "CCONJ"}}]
    How_pattern14 = [{"POS": "NOUN"}, {"TEXT": "です"}, {"POS": {"REGEX": "PART"}, "OP": "*"}]
    How_pattern15 = [{"POS": "NOUN"}, {"TEXT": "か"}, {"TEXT": "な"}, {"POS": {"REGEX": "PUNCT"}}]
    Who_pattern1 = [{"TEXT": "の", "OP": "?"}, {"POS": "NOUN", "DEP": "compound", "OP": "*"}, {"POS": {"REGEX": "NOUN|PRON|PROPN"}, "DEP": {"REGEX": "iobj|obl|nsubj"}, "TAG": {"NOT_IN": ["名詞-普通名詞-助数詞可能"]}, "TEXT": {"NOT_IN": ["幾つ"]}}, {"TEXT": {"REGEX": "が|は|も"}}, {"TEXT": {"REGEX": "です|ね|、"}, "OP": "*"}]
    Who_pattern2 = [{"DEP": {"REGEX": "amod|advmod|acl"}, "OP": "+"}, {"TEXT": "ところ", "DEP": {"REGEX": "compound"}}]
    Who_pattern3 = [{"POS": {"REGEX": "NOUN|PRON|PROPN"}, "DEP": {"REGEX": "iobj|obl|nsubj"}}, {"TEXT": {"REGEX": "に"}}, {"TEXT": {"REGEX": "は"}}]
    Who_pattern4 = [{"POS": {"REGEX": "NOUN|PRON|PROPN"}, "DEP": {"REGEX": "obl|nmod|dep"}, "TEXT": {"NOT_IN": ["幾つ"]}, "TAG": {"NOT_IN": ["名詞-普通名詞-助数詞可能"]}}, {"TEXT": {"REGEX": "が|は|も|って"}}]
    Who_pattern5 = [{"TEXT": "こと", "DEP": "compound"}, {"TEXT": {"REGEX": "が|は|も"}}]
    Who_pattern6 = [{"POS": "NUM", "OP": "!"}, {"POS": {"REGEX": "NOUN|PRON|PROPN"}, "DEP": {"REGEX": "iobj|obl|nsubj"}, "TAG": "名詞-普通名詞-助数詞可能"}, {"TEXT": {"REGEX": "が|は|も"}}]
    Who_pattern7 = [{"POS": {"REGEX": "VERB|ADJ"}, "OP": "+"}, {"POS": "NOUN", "OP": "?"}, {"TEXT": "の"}, {"TEXT": "が"}]
    Who_pattern8 = [{"TEXT": "と"}, {"TEXT": "いう"}, {"TEXT": "の"}, {"TEXT": "は"}]
    Who_pattern9 = [{"POS": "NOUN"}, {"TEXT": "に"}, {"TEXT": "おい"}, {"TEXT": "て"}, {"TEXT": "は"}]
    What_pattern1 = [{"POS": {"REGEX": "NOUN|PRON|PROPN"}, "DEP": {"REGEX": "obl|obj|iobj"}}, {"TEXT": {"REGEX": "を"}}]
    What_pattern2 = [{"POS": {"REGEX": "SYM"}, "DEP": {"REGEX": "dep"}}, {"TEXT": {"REGEX": "を"}}]
    Mod_pattern1 = [{"DEP": {"REGEX": "amod|advmod|nmod|case|obl|case|acl|aux|det|nsubj|dep|mark|compound|nummod|advcl|iobj|det|obj"}}]
    Mod_pattern2 = [{"TEXT": "いつ"}]
    Task_pattern1 = [{"TEXT": "たい"}, {"TEXT": "の"}, {"POS": {"REGEX": "PUNCT"}, "OP": "*"}]
    Task_pattern2 = [{"TEXT": "ない"}, {"TEXT": "か"}, {"TEXT": "な"}, {"POS": {"REGEX": "PUNCT"}, "OP": "*"}]
    Task_pattern3 = [{"POS": {"REGEX": "VERB|AUX"}}, {"TEXT": "ない"}, {"TEXT": "で"}, {"TEXT": "ね"}, {"POS": {"REGEX": "PUNCT"}, "OP": "*"}]
    Task_pattern4 = [{"POS": "AUX"}, {"POS": "SCONJ"}, {"TEXT": "ください"}, {"POS": {"REGEX": "PUNCT"}, "OP": "*"}]

    # matcher callback functions
    # assign only the 5W1H label
    def add_label(matcher, doc, id, matches):
        l = list(matches[id])
        for t in doc[l[1]:l[2]]:
            if t._._5w1h == 'None' or t._._5w1h == "Mod":
                t._._5w1h = nlp.vocab.strings[l[0]]

    def add_right(matcher, doc, id, matches):
        l = list(matches[id])
        tag = nlp.vocab.strings[l[0]]
        end = l[-1]
        if end != len(doc):
            while l[1] <= doc[end].head.i <= l[-1] - 1:
                end = end + 1
                if end == len(doc):
                    break
        l[-1] = end - 1
        if end < len(doc):
            if (re.search("ので|だから", doc[l[-1] - 2:l[-1]].text)
                    or re.search("ので、|だから、", doc[l[-1] - 3:l[-1]].text)
                    or re.search("ため", doc[l[-1] - 1:l[-1]].text)
                    or re.search("ため、", doc[l[-1] - 2:l[-1] - 1].text)) and doc[l[-1]].pos_ != "ADP":
                tag = "Why"
        for t in doc[l[1]:end]:
            if t._._5w1h == 'None' or t._._5w1h == "Mod" or tag == "Why":
                t._._5w1h = tag
        matches[id] = tuple(l)

    def add_right_left(matcher, doc, id, matches):
        l = list(matches[id])
        tag = nlp.vocab.strings[l[0]]
        end = l[-1]
        start = l[1]
        if end != len(doc):
            while l[1] <= doc[end].head.i <= l[-1] - 1:
                end = end + 1
                if end == len(doc):
                    break
        if start != 0:
            while doc[start].head.i == l[1]:
                start = start - 1
                if start == 0:
                    break
        l[1] = start
        l[-1] = end
        if end < len(doc):
            if (re.search("ので|だから", doc[l[-1] - 2:l[-1]].text)
                    or re.search("ので、|だから、", doc[l[-1] - 3:l[-1]].text)
                    or re.search("ため", doc[l[-1] - 1:l[-1]].text)
                    or re.search("ため、", doc[l[-1] - 2:l[-1]].text)) and doc[l[-1]].pos_ != "ADP":
                tag = "Why"
        for t in doc[l[1]:l[2]]:
            if t._._5w1h == 'None' or t._._5w1h == "Mod" or tag == "Why":
                t._._5w1h = tag
        matches[id] = tuple(l)

    def add_label_type(matcher, doc, id, matches):
        l = list(matches[id])
        for t in doc[l[1]:l[2]]:
            t._._type = nlp.vocab.strings[l[0]]

    # register the matchers
    matcher.add("When", add_right, When_pattern1, When_pattern2, When_pattern3, When_pattern4)
    matcher.add("Where", add_right, Where_pattern1, Where_pattern2, Where_pattern3)
    matcher.add("How", add_right, How_pattern1, How_pattern2, How_pattern3, How_pattern4,
                How_pattern5, How_pattern6, How_pattern7, How_pattern8, How_pattern9,
                How_pattern10, How_pattern11, How_pattern12, How_pattern13, How_pattern14,
                How_pattern15)
    matcher.add("Who", add_label, Who_pattern1, Who_pattern2, Who_pattern3, Who_pattern4,
                Who_pattern5, Who_pattern6, Who_pattern7, Who_pattern8, Who_pattern9)
    matcher.add("What", add_right, What_pattern1, What_pattern2)
    matcher.add("Why", add_label, Why_pattern1)
    matcher.add("Mod", add_right_left, Mod_pattern1, Mod_pattern2)
    matcher.add("Task", add_label_type, Task_pattern1, Task_pattern2, Task_pattern3, Task_pattern4)

    doc = nlp(text)
    text2 = [s.text for s in doc if not re.fullmatch("まあ|まぁ|ま|えー|あのー|あ", s.text)]
    text2 = ''.join(text2)
    doc = nlp(text2)
    for sent in doc.sents:
        matches = matcher(doc)
    num = 0
    start = 0
    end = 0
    tmp_label = None

    # other processing
    for token in doc:
        if (token._._5w1h == "Who" or token._._5w1h == "What") and re.search(
                "VERB|ADJ", doc[token.head.i].pos_) and doc[token.head.i].i > token.i:
            tag2 = "How"
            start = token.head.i
            end = start + 1
            while end < len(doc):
                if doc[end].head.i != start:
                    break
                end = end + 1
            if (re.search("ので|だから", doc[end - 2:end - 1].text)
                    or re.search("ので、|だから、", doc[end - 3:end - 1].text)
                    or re.search("ため", doc[end - 1:end].text)
                    or re.search("ため、", doc[end - 2:end - 1].text)) and doc[end - 1].pos_ != "ADP":
                tag2 = "Why"
            for t in doc[start:end]:
                if not t._._5w1h or t._._5w1h == "Mod" or tag2 == "Why":
                    t._._5w1h = tag2
    for token in reversed(doc):
        if token._._5w1h != "Mod":
            if tmp_label == "Who" and token._._5w1h == "What":
                token._._5w1h = "Who"
            tmp_label = token._._5w1h
        else:
            token._._5w1h = tmp_label
    self.doc = doc
def set_extensions():
    if not Doc.has_extension('coref_chains'):
        Doc.set_extension('coref_chains', default=None)
    if not Token.has_extension('coref_chains'):
        Token.set_extension('coref_chains', default=None)
def __init__(self, extension_name="sin_accents"):
    self.extension_name = extension_name
    if not Token.has_extension(extension_name):
        Token.set_extension(extension_name, default=False)
import spacy
from pdfminer.high_level import extract_text
from rich.progress import track
from spacy.tokens import Doc, Token, Span

from .console import console
from .parsers import pdfminer
from .parsers.base import BaseParser
from ._utils import _filter_doc_by_page, _get_number_of_pages

# Set up the spacy custom extensions.
if not Token.has_extension("page_number"):
    Token.set_extension("page_number", default=None)
if not Doc.has_extension("pdf_file_name"):
    Doc.set_extension("pdf_file_name", default=None)
if not Doc.has_extension("page"):
    Doc.set_extension("page", method=_filter_doc_by_page)
if not Doc.has_extension("first_page"):
    Doc.set_extension("first_page", getter=lambda doc: doc[0]._.page_number)
if not Doc.has_extension("last_page"):
    Doc.set_extension("last_page", getter=lambda doc: doc[-1]._.page_number)
if not Doc.has_extension("page_range"):
    # The original snippet is cut off here; completed with a getter that mirrors the
    # first_page/last_page extensions above (an assumption about the intended value).
    Doc.set_extension("page_range", getter=lambda doc: (doc._.first_page, doc._.last_page))
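# The `page` extension above is registered with method=, which exposes a callable on the
# doc (presumably invoked as doc._.page(page_number) to select that page's tokens). A tiny
# illustration of the same registration style with a made-up extension name, unrelated to
# the PDF parser internals:
if not Doc.has_extension("nth_token_text"):
    Doc.set_extension("nth_token_text", method=lambda doc, i: doc[i].text)
# spacy.blank("en")("custom method extensions")._.nth_token_text(1) -> "method"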
def parse_conll_text_as_spacy(
    self,
    text: str,
    ner_tag_pattern: str = "^((?:name|NE)=)?([BILU])-([A-Z_]+)|O$",
    ner_map: Dict[str, str] = None,
) -> Doc:
    """Parses a given CoNLL-U string into a spaCy doc. Parsed sentence sections must be separated
     by a blank line (\n\n). Note that we do our best to retain as much information as possible,
     but not all CoNLL-U fields are supported in spaCy. We add a Token._.conll_misc_field extension
     to save the CoNLL-U MISC field, and a Token._.conll_deps_graphs_field extension to save the
     CoNLL-U DEPS field. The metadata (lines starting with #) is saved in Span._.conll_metadata
     of sentence Spans.
    This method has been adapted from the work by spaCy.
    See: https://github.com/explosion/spaCy/blob/a1c5b694be117ac92e21f9860309821ad6da06f7/spacy/cli/converters/conllu2json.py#L179
    Multi-word tokens and empty nodes are not supported.
    :param text: CoNLL-U formatted text
    :param ner_tag_pattern: Regex pattern for entity tag in the MISC field
    :param ner_map: Map old NER tag names to new ones, '' maps to O
    :return: a spaCy Doc containing all the tokens and sentences from the CoNLL file including
     the custom CoNLL extensions
    """
    if not Token.has_extension("conll_misc_field"):
        Token.set_extension("conll_misc_field", default="_")
    if not Token.has_extension("conll_deps_graphs_field"):
        Token.set_extension("conll_deps_graphs_field", default="_")
    if not Span.has_extension("conll_metadata"):
        Span.set_extension("conll_metadata", default=None)

    docs = []
    for chunk in text.split("\n\n"):
        lines = [l for l in chunk.splitlines() if l and not l.startswith("#")]
        words, spaces, tags, poses, morphs, lemmas, miscs = [], [], [], [], [], [], []
        heads, deps, deps_graphs = [], [], []
        for i in range(len(lines)):
            line = lines[i]
            parts = line.split("\t")
            if any(not p for p in parts):
                raise ValueError(
                    "According to the CoNLL-U Format, fields cannot be empty. See"
                    " https://universaldependencies.org/format.html")
            id_, word, lemma, pos, tag, morph, head, dep, deps_graph, misc = parts
            if any(" " in f for f in (id_, pos, tag, morph, head, dep, deps_graph)):
                raise ValueError(
                    "According to the CoNLL-U Format, only FORM, LEMMA, and MISC fields can contain"
                    " spaces. See https://universaldependencies.org/format.html"
                )
            if "." in id_ or "-" in id_:
                raise NotImplementedError(
                    "Multi-word tokens and empty nodes are not supported in spacy_conll"
                )
            words.append(word)
            if "SpaceAfter=No" in misc:
                spaces.append(False)
            else:
                spaces.append(True)
            id_ = int(id_) - 1
            lemmas.append(lemma)
            poses.append(pos)
            tags.append(pos if tag == "_" else tag)
            morphs.append(morph if morph != "_" else "")
            heads.append((int(head) - 1) if head not in ("0", "_") else id_)
            deps.append("ROOT" if dep == "root" else dep)
            deps_graphs.append(deps_graph)
            miscs.append(misc)
        doc = Doc(
            self.nlp.vocab,
            words=words,
            spaces=spaces,
            tags=tags,
            pos=poses,
            morphs=morphs,
            lemmas=lemmas,
            heads=heads,
            deps=deps,
        )
        # Set custom Token extensions
        for i in range(len(doc)):
            doc[i]._.conll_misc_field = miscs[i]
            doc[i]._.conll_deps_graphs_field = deps_graphs[i]
        ents = get_entities(lines, ner_tag_pattern, ner_map)
        doc.ents = spans_from_biluo_tags(doc, ents)
        # The deprel relations ensure that this CoNLL chunk is one sentence.
        # Deprel can therefore not be empty, or each word is considered a separate sentence.
        if len(list(doc.sents)) != 1:
            raise ValueError(
                "Your data is in an unexpected format. Make sure that it follows the CoNLL-U format"
                " requirements. See https://universaldependencies.org/format.html. Particularly make"
                " sure that the DEPREL field is filled in.")
        # Save the metadata in a custom sentence Span attribute so that the formatter can use it
        metadata = "\n".join([l for l in chunk.splitlines() if l.startswith("#")])
        # We really only expect one sentence
        for sent in doc.sents:
            sent._.conll_metadata = f"{metadata}\n" if metadata else ""
        docs.append(doc)
    # Add CoNLL custom extensions
    return self.nlp.get_pipe("conll_formatter")(Doc.from_docs(docs))
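# A minimal usage sketch for parse_conll_text_as_spacy, assuming `parser` is an instance of
# the surrounding class (e.g. a spacy_conll ConllParser-style wrapper) built on an English
# pipeline; the two-token sentence below is illustrative:
conll_sample = (
    "# text = Hello world\n"
    "1\tHello\thello\tINTJ\tUH\t_\t2\tdiscourse\t_\t_\n"
    "2\tworld\tworld\tNOUN\tNN\t_\t0\troot\t_\tSpaceAfter=No\n"
)
# doc = parser.parse_conll_text_as_spacy(conll_sample)
# for token in doc:
#     print(token.text, token.dep_, token._.conll_misc_field)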
def __call__(self, doc):
    """
    Runs a document through the metamap_annotator pipeline component. This overlays rich medical
    features by utilizing MetaMap output and aligning it with a passed spaCy Doc object. By medaCy
    conventions, each overlayed feature is available as a token extension starting with 'feature_'.
    This component overlays 'feature_cui' and a separate boolean feature for each semantic type to
    detect, available under 'feature_is_{type}'. This component was originally designed to increase
    recall on Drug entities, hence by default it overlays 'feature_is_orch' and 'feature_is_phsu',
    where orch and phsu are semantic types corresponding to organic chemicals and pharmacological
    substances respectively.
    :param doc: spaCy Doc object to run through pipeline
    :return: the same Doc object
    """
    logging.debug("Called MetaMap Component")

    # register all extensions
    if self.cuis:
        Token.set_extension('feature_cui', default="-1", force=True)  # cui feature
    for label in self.semantic_type_labels:  # is_semantic_type features
        Token.set_extension('feature_is_' + label, default=False, force=True)

    if not hasattr(doc._, 'file_name'):
        metamap_dict = self.metamap.map_text(str(doc))
    elif doc._.file_name is None or doc._.file_name == 'STRING_INPUT':
        metamap_dict = self.metamap.map_text(str(doc))
    elif os.path.isfile(doc._.file_name):
        # Check if pre-metamapped file exists at expected location
        txt_file_path = doc._.file_name
        metamapped_path = _get_metamapped_path(txt_file_path)
        if not os.path.isfile(metamapped_path):
            warnings.warn(
                f"No metamapped file was found for '{txt_file_path}'; attempting to run MetaMap over document (results in slower runtime); ensure MetaMap is running"
            )
            metamap_dict = self.metamap.map_text(str(doc))
        else:
            # This branch of the decision tree is reached if the file is already metamapped
            metamap_dict = self.metamap.load(metamapped_path)

    # TODO refactor second part of if statement when implementing live model prediction
    if metamap_dict == '' or metamap_dict['metamap'] is None:
        if hasattr(doc._, 'file_name'):
            warnings.warn(f"MetaMap produced no output for given file: {doc._.file_name}")
        warnings.warn("MetaMap failed")
        return doc

    mapped_terms = self.metamap.extract_mapped_terms(metamap_dict)  # parse terms out of mappings dictionary
    spans = []  # for displaying NER output with displacy

    # Overlays semantic type presence if the given semantic type is set in metamap span.
    for semantic_type_label in self.semantic_type_labels:
        entity_name = semantic_type_label
        self.nlp.entity.add_label(entity_name)  # register entity label
        entity_tags = self.metamap.get_term_by_semantic_type(mapped_terms, include=[semantic_type_label])
        entity_annotations = self.metamap.mapped_terms_to_spacy_ann(entity_tags, semantic_type_label)
        with doc.retokenize() as retokenizer:
            for start, end, label in entity_annotations:
                span = doc.char_span(start, end, label=self.nlp.vocab.strings[entity_name])
                # TODO spans are None when indices and token boundaries don't line up.
                if span not in spans:
                    if span is not None:
                        logging.debug(
                            "Found from metamap: (label=%s,raw_text=\"%s\",location=(%i, %i))"
                            % (label, span.text, start, end))
                        spans.append(span)
                        for token in span:
                            token._.set('feature_is_' + label, True)
                        if self.merge_tokens:
                            try:
                                retokenizer.merge(span)
                            except BaseException:
                                continue
                    else:
                        logging.debug(
                            "Metamap span could not be overlayed due to tokenization mis-match: (%i, %i)"
                            % (start, end))

    # Overlays CUI of each term
    if Token.has_extension('feature_cui'):
        with doc.retokenize() as retokenizer:
            for term in mapped_terms:
                cui = term['CandidateCUI']
                start, end = self.metamap.get_span_by_term(term)[0]
                span = doc.char_span(start, end)
                if span is not None:
                    for token in span:
                        token._.set('feature_cui', cui)
                    if self.merge_tokens:
                        try:
                            retokenizer.merge(span)
                        except BaseException:
                            continue
    return doc
def __call__(self, doc):
    """
    Runs a document through the metamap_annotator pipeline component. This overlays rich medical
    features by utilizing MetaMap output and aligning it with a passed spaCy Doc object. By medaCy
    conventions, each overlayed feature is available as a token extension starting with 'feature_'.
    This component overlays 'feature_cui' and a separate boolean feature for each semantic type to
    detect, available under 'feature_is_{type}'. This component was originally designed to increase
    recall on Drug entities, hence by default it overlays 'feature_is_orch' and 'feature_is_phsu',
    where orch and phsu are semantic types corresponding to organic chemicals and pharmacological
    substances respectively.
    :param doc: document to run through pipeline
    :return: the same document
    """
    logging.debug("Called MetaMap Component")
    metamap = self.metamap
    nlp = self.nlp
    semantic_type_labels = self.semantic_type_labels

    # register all extensions
    if self.cuis:
        Token.set_extension('feature_cui', default="-1", force=True)  # cui feature
    for semantic_type_label in semantic_type_labels:  # is_semantic_type features
        Token.set_extension('feature_is_' + semantic_type_label, default=False, force=True)

    # check if a pre-metamapped file has been assigned to the document
    if hasattr(doc._, 'metamapped_file'):
        metamap_dict = metamap.load(doc._.metamapped_file)
    else:
        if hasattr(doc._, 'file_name'):
            logging.debug("%s: Could not find metamap file for document." % doc._.file_name)
        metamap_dict = metamap.map_text(doc.text)  # TODO metamap.map_text is broken currently
        if not hasattr(doc._, 'file_name'):  # TODO REMOVE when implementing live model prediction
            return doc

    # TODO refactor second part of if statement when implementing live model prediction
    if metamap_dict['metamap'] is None:
        if hasattr(doc._, 'metamapped_file'):
            warnings.warn(
                "%s: This metamap file is invalid and cannot be parsed in MetaMapComponent: %s \n"
                " Ignore this warning if this is a unittest - all may be fine."
                % (doc._.file_name, doc._.metamapped_file))
        else:
            warnings.warn(
                "Metamapping text on the fly failed - aborting. Try to pre-metamap with DataLoader.")
        return doc

    mapped_terms = metamap.extract_mapped_terms(metamap_dict)  # parse terms out of mappings dictionary
    spans = []  # for displaying NER output with displacy

    # Overlays semantic type presence if the given semantic type is set in metamap span.
    for semantic_type_label in semantic_type_labels:
        entity_name = semantic_type_label
        nlp.entity.add_label(entity_name)  # register entity label
        entity_tags = metamap.get_term_by_semantic_type(mapped_terms, include=[semantic_type_label])
        entity_annotations = metamap.mapped_terms_to_spacy_ann(entity_tags, semantic_type_label)
        for start, end, label in [
            entity_annotations['entities'][key]
            for key in entity_annotations['entities'].keys()
        ]:
            span = doc.char_span(start, end, label=nlp.vocab.strings[entity_name])
            # TODO spans are None when indices and token boundaries don't line up.
            if span not in spans:
                if span is not None:
                    logging.debug(
                        "Found from metamap: (label=%s,raw_text=\"%s\",location=(%i, %i))"
                        % (label, span.text, start, end))
                    spans.append(span)
                    for token in span:
                        token._.set('feature_is_' + label, True)
                else:
                    logging.debug(
                        "Metamap span could not be overlayed due to tokenization mis-match: (%i, %i)"
                        % (start, end))

    # adds labels for displaying NER output with displacy
    for span in spans:
        try:
            doc.ents = list(doc.ents) + [span]
        except ValueError as error:
            # This gets called when the same token may match multiple semantic types
            logging.warning(str(error))

    # Overlays CUI of each term
    if Token.has_extension('feature_cui'):
        for term in mapped_terms:
            cui = term['CandidateCUI']
            start, end = metamap.get_span_by_term(term)[0]
            span = doc.char_span(start, end)
            if span is not None:
                for token in span:
                    token._.set('feature_cui', cui)
    return doc
# The Language class with the English model 'en_core_web_sm' is loaded
nlp = spacy.load('en_core_web_sm')

# The input text string is converted to a Doc object
doc = nlp(
    "The French Revolution was a period of time in France when the people overthrew the monarchy and took control of the government."
)

# Define the extension attribute on the token level with the name
# 'context' and a default value of False
Token.set_extension('context', default=False, force=True)

# Print each token of the Doc object and the value stored by the
# extension attribute. All the values default to False
for d in doc:
    print(d.text, d._.context)

# The entity types of the previous, current and next tokens are combined
# and stored via the 'set' function
for i, d in enumerate(doc):
    if i > 0 and (i < len(doc) - 1):
        meaning = '|' + doc[i - 1].ent_type_ + '-' + d.ent_type_ + '-' + doc[i + 1].ent_type_
        d._.set('context', meaning)

# Print the tokens again to see the modified values
for d in doc:
    print(d.text, d._.context)

Token.has_extension('context')     # returns True
Token.remove_extension('context')  # removes the attribute
Token.has_extension('context')     # returns False