class TechCompanyRecognizer(object):
    """Example of a spaCy v2.0 pipeline component that sets entity annotations
    based on a list of single- or multiple-word company names. Companies are
    labelled as ORG and their spans are merged into one token. Additionally,
    ._.has_tech_org and ._.is_tech_org are set on the Doc/Span and Token
    respectively."""
    name = 'tech_companies'  # component name, will show up in the pipeline

    def __init__(self, nlp, companies=tuple(), label='ORG'):
        """Initialise the pipeline component. The shared nlp instance is used
        to initialise the matcher with the shared vocab, get the label ID and
        generate Doc objects as phrase match patterns.
        """
        self.label = nlp.vocab.strings[label]  # get entity label ID

        # Set up the PhraseMatcher – it can now take Doc objects as patterns,
        # so even if the list of companies is long, it's very efficient
        patterns = [nlp(org) for org in companies]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add('TECH_ORGS', None, *patterns)

        # Register attribute on the Token. We'll be overwriting this based on
        # the matches, so we're only setting a default value, not a getter.
        Token.set_extension('is_tech_org', default=False)

        # Register attributes on Doc and Span via a getter that checks if one of
        # the contained tokens is set to is_tech_org == True.
        Doc.set_extension('has_tech_org', getter=self.has_tech_org)
        Span.set_extension('has_tech_org', getter=self.has_tech_org)

    def __call__(self, doc):
        """Apply the pipeline component on a Doc object and modify it if matches
        are found. Return the Doc, so it can be processed by the next component
        in the pipeline, if available.
        """
        matches = self.matcher(doc)
        spans = []  # keep the spans for later so we can merge them afterwards
        for _, start, end in matches:
            # Generate Span representing the entity & set label
            entity = Span(doc, start, end, label=self.label)
            spans.append(entity)
            # Set custom attribute on each token of the entity
            for token in entity:
                token._.set('is_tech_org', True)
            # Overwrite doc.ents and add entity – be careful not to replace!
            doc.ents = list(doc.ents) + [entity]
        for span in spans:
            # Iterate over all spans and merge them into one token. This is done
            # after setting the entities – otherwise, it would cause mismatched
            # indices!
            span.merge()
        return doc  # don't forget to return the Doc!

    def has_tech_org(self, tokens):
        """Getter for Doc and Span attributes. Returns True if one of the tokens
        is a tech org. Since the getter is only called when we access the
        attribute, we can refer to the Token's 'is_tech_org' attribute here,
        which is already set in the processing step."""
        return any([t._.get('is_tech_org') for t in tokens])
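# Added usage sketch (not part of the original component): with the spaCy v2.x
# API used above, the component is instantiated with the shared nlp object and
# added to the pipeline. The company names below are illustrative only.
import spacy

nlp = spacy.load('en_core_web_sm')
companies = ['Alphabet Inc.', 'Google', 'Netflix', 'Apple']  # example values
component = TechCompanyRecognizer(nlp, companies)
nlp.add_pipe(component, last=True)  # run it after the built-in components

doc = nlp("Alphabet Inc. is the company behind Google")
print([(ent.text, ent.label_) for ent in doc.ents])  # merged ORG spans
print('has_tech_org:', doc._.has_tech_org)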
def test_issue3248_1():
    """Test that the PhraseMatcher correctly reports its number of rules, not
    total number of patterns."""
    nlp = English()
    matcher = PhraseMatcher(nlp.vocab)
    matcher.add("TEST1", None, nlp("a"), nlp("b"), nlp("c"))
    matcher.add("TEST2", None, nlp("d"))
    assert len(matcher) == 2
def test_issue3248_2():
    """Test that the PhraseMatcher can be pickled correctly."""
    nlp = English()
    matcher = PhraseMatcher(nlp.vocab)
    matcher.add("TEST1", None, nlp("a"), nlp("b"), nlp("c"))
    matcher.add("TEST2", None, nlp("d"))
    data = pickle.dumps(matcher)
    new_matcher = pickle.loads(data)
    assert len(new_matcher) == len(matcher)
def get_matches(tokenizer, phrases, texts, max_length=6):
    matcher = PhraseMatcher(tokenizer.vocab, max_length=max_length)
    matcher.add('Phrase', None, *phrases)
    for text in texts:
        doc = tokenizer(text)
        for w in doc:
            _ = doc.vocab[w.text]
        matches = matcher(doc)
        for ent_id, start, end in matches:
            yield (ent_id, doc[start:end].text)
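# Added usage sketch for get_matches above (illustrative inputs; assumes the
# spaCy v2.0-style PhraseMatcher(vocab, max_length=...) signature used in the
# snippet):
import spacy

nlp = spacy.load('en_core_web_sm')
phrases = [nlp.tokenizer(t) for t in ('machine learning', 'data science')]
texts = ['I work on machine learning and data science.']
for ent_id, matched_text in get_matches(nlp.tokenizer, phrases, texts):
    print(ent_id, matched_text)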
def test_issue3331(en_vocab):
    """Test that duplicate patterns for different rules result in multiple
    matches, one per rule.
    """
    matcher = PhraseMatcher(en_vocab)
    matcher.add("A", None, Doc(en_vocab, words=["Barack", "Obama"]))
    matcher.add("B", None, Doc(en_vocab, words=["Barack", "Obama"]))
    doc = Doc(en_vocab, words=["Barack", "Obama", "lifts", "America"])
    matches = matcher(doc)
    assert len(matches) == 2
    match_ids = [en_vocab.strings[matches[0][0]], en_vocab.strings[matches[1][0]]]
    assert sorted(match_ids) == ["A", "B"]
def test_issue3410():
    texts = ["Hello world", "This is a test"]
    nlp = English()
    matcher = Matcher(nlp.vocab)
    phrasematcher = PhraseMatcher(nlp.vocab)
    with pytest.deprecated_call():
        docs = list(nlp.pipe(texts, n_threads=4))
    with pytest.deprecated_call():
        docs = list(nlp.tokenizer.pipe(texts, n_threads=4))
    with pytest.deprecated_call():
        list(matcher.pipe(docs, n_threads=4))
    with pytest.deprecated_call():
        list(phrasematcher.pipe(docs, n_threads=4))
class RESTCountriesComponent(object):
    """spaCy v2.0 pipeline component that requests all countries via the
    REST Countries API, merges country names into one token, assigns entity
    labels and sets attributes on country tokens.
    """
    name = 'rest_countries'  # component name, will show up in the pipeline

    def __init__(self, nlp, label='GPE'):
        """Initialise the pipeline component. The shared nlp instance is used
        to initialise the matcher with the shared vocab, get the label ID and
        generate Doc objects as phrase match patterns.
        """
        # Make request once on initialisation and store the data
        r = requests.get('https://restcountries.eu/rest/v2/all')
        r.raise_for_status()  # make sure requests raises an error if it fails
        countries = r.json()

        # Convert API response to dict keyed by country name for easy lookup
        # This could also be extended using the alternative and foreign language
        # names provided by the API
        self.countries = {c['name']: c for c in countries}
        self.label = nlp.vocab.strings[label]  # get entity label ID

        # Set up the PhraseMatcher with Doc patterns for each country name
        patterns = [nlp(c) for c in self.countries.keys()]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add('COUNTRIES', None, *patterns)

        # Register attribute on the Token. We'll be overwriting this based on
        # the matches, so we're only setting a default value, not a getter.
        # If no default value is set, it defaults to None.
        Token.set_extension('is_country', default=False)
        Token.set_extension('country_capital')
        Token.set_extension('country_latlng')
        Token.set_extension('country_flag')

        # Register attributes on Doc and Span via a getter that checks if one of
        # the contained tokens is set to is_country == True.
        Doc.set_extension('has_country', getter=self.has_country)
        Span.set_extension('has_country', getter=self.has_country)

    def __call__(self, doc):
        """Apply the pipeline component on a Doc object and modify it if matches
        are found. Return the Doc, so it can be processed by the next component
        in the pipeline, if available.
        """
        matches = self.matcher(doc)
        spans = []  # keep the spans for later so we can merge them afterwards
        for _, start, end in matches:
            # Generate Span representing the entity & set label
            entity = Span(doc, start, end, label=self.label)
            spans.append(entity)
            # Set custom attribute on each token of the entity
            # Can be extended with other data returned by the API, like
            # currencies, country code, flag, calling code etc.
            for token in entity:
                token._.set('is_country', True)
                token._.set('country_capital', self.countries[entity.text]['capital'])
                token._.set('country_latlng', self.countries[entity.text]['latlng'])
                token._.set('country_flag', self.countries[entity.text]['flag'])
            # Overwrite doc.ents and add entity – be careful not to replace!
            doc.ents = list(doc.ents) + [entity]
        for span in spans:
            # Iterate over all spans and merge them into one token. This is done
            # after setting the entities – otherwise, it would cause mismatched
            # indices!
            span.merge()
        return doc  # don't forget to return the Doc!

    def has_country(self, tokens):
        """Getter for Doc and Span attributes. Returns True if one of the tokens
        is a country. Since the getter is only called when we access the
        attribute, we can refer to the Token's 'is_country' attribute here,
        which is already set in the processing step."""
        return any([t._.get('is_country') for t in tokens])
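# Added note: Span.merge(), used by the v2.0 components above, was deprecated in
# later spaCy v2 releases and removed in v3; Doc.retokenize() is the documented
# replacement. A minimal, self-contained sketch of merging PhraseMatcher hits
# that way (blank English pipeline, example phrase only):
from spacy.lang.en import English
from spacy.matcher import PhraseMatcher

nlp = English()
matcher = PhraseMatcher(nlp.vocab)
matcher.add("COUNTRIES", None, nlp("Czech Republic"))

doc = nlp("The Czech Republic borders Germany")
with doc.retokenize() as retokenizer:
    for match_id, start, end in matcher(doc):
        retokenizer.merge(doc[start:end])  # merge each matched span into one token
print([token.text for token in doc])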
def test_phrase_matcher_basic_check(en_vocab):
    matcher = PhraseMatcher(en_vocab)
    # Potential mistake: pass in pattern instead of list of patterns
    pattern = Doc(en_vocab, words=["hello", "world"])
    with pytest.raises(ValueError):
        matcher.add("TEST", pattern)
def test_phrase_matcher_sent_start(en_vocab, attr):
    _ = PhraseMatcher(en_vocab, attr=attr)  # noqa: F841
class key_word_recognizer(object):
    """Initialise the pipeline component. The shared nlp instance is used
    to initialise the matcher with the shared vocab, get the label ID and
    generate Doc objects as phrase match patterns.
    """

    def __init__(self, nlp, keywords, label, tokentag, doctag=None, spantag=None):
        nlp.vocab.strings.add(label)
        self.label = nlp.vocab.strings[label]
        self._label_str = label
        self._token_tag = tokentag
        self._doctag = doctag
        self._spantag = spantag
        self._keywordtag = "is_keyword"
        self._labeltag = "label_"

        # Set up the PhraseMatcher – it can now take Doc objects as patterns,
        # so even if the list of keywords is long, it's very efficient
        patterns = [nlp(key) for key in keywords]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add(self._token_tag, None, *patterns)

        # Register attribute on the Token. We'll be overwriting this based on
        # the matches, so we're only setting a default value, not a getter.
        Token.set_extension(self._token_tag, default=False)
        if not Token.has_extension(self._keywordtag):
            Token.set_extension(self._keywordtag, default=False)
            Token.set_extension(self._labeltag, default=None)

        # Register attributes on Doc and Span via a getter that checks if one of
        # the contained tokens has the token tag set to True.
        Doc.set_extension(self._doctag, getter=lambda tokens: any(
            [t._.get(self._token_tag) for t in tokens]))
        Span.set_extension(self._spantag, getter=lambda tokens: any(
            [t._.get(self._token_tag) for t in tokens]))
        if not Span.has_extension("dep_"):
            Span.set_extension("dep_", default="")
            Span.set_extension("head_", default=None)

    def __call__(self, doc):
        """Apply the pipeline component on a Doc object and modify it if matches
        are found. Return the Doc, so it can be processed by the next component
        in the pipeline, if available.
        """
        matches = self.matcher(doc)
        spans = []  # keep the spans for later so we can merge them afterwards
        for _, start, end in matches:
            # Generate Span representing the entity & set label
            entity = Span(doc, start, end, label=self.label)
            spans.append(entity)
            # Set custom attribute on each token of the entity
            for token in entity:
                token._.set(self._token_tag, True)
                token._.set(self._labeltag, self._label_str)
                entity._.set("dep_", token.dep_)
                entity._.set("head_", {
                    "text": token.head.text,
                    "index": token.head.i
                })
                if not token._.get(self._keywordtag):
                    token._.set(self._keywordtag, True)
            # Overwrite doc.ents and add entity – be careful not to replace!
            # print(doc.ents)
            # print(entity)
            if entity not in list(doc.ents):
                doc.ents = list(doc.ents) + [entity]
        for span in spans:
            # Iterate over all spans and merge them into one token. This is done
            # after setting the entities – otherwise, it would cause mismatched
            # indices!
            span.merge()
        return doc
import json
from spacy.lang.es import Spanish
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher

with open("exercises/es/countries.json", encoding="utf8") as f:
    COUNTRIES = json.loads(f.read())
with open("exercises/es/capitals.json", encoding="utf8") as f:
    CAPITALS = json.loads(f.read())

nlp = Spanish()
matcher = PhraseMatcher(nlp.vocab)
matcher.add("COUNTRY", None, *list(nlp.pipe(COUNTRIES)))


def countries_component(doc):
    # Create an entity Span with the label "LOC" for all matches
    matches = matcher(doc)
    doc.ents = [
        Span(doc, start, end, label="LOC")
        for match_id, start, end in matches
    ]
    return doc


# Add the component to the pipeline
nlp.add_pipe(countries_component)
print(nlp.pipe_names)

# The getter that looks up the span text in a dictionary of countries'
# capital cities
# To add a new cell, type '# %%'
# To add a new markdown cell, type '# %% [markdown]'
# %%
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from nlp import nlp as nlp
from collections import Counter
from fuzzywuzzy import fuzz
import Levenshtein as lev
import spacy

Spnlp = spacy.load("en_core_web_sm")
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(Spnlp.vocab)
import matplotlib.pyplot as plt
from wordcloud import WordCloud

LangProcessor = nlp()

# %%
# load the job description
with open('identity.txt') as job:
    text = job.read()

# %%
# load cv
with open('cv') as cv:
nlp = spacy.load('en_core_web_sm')

doc = nlp(
    "When “Mr. A” logs in to his application, he should only see things "
    "pertaining to him and not things pertaining to “Mr. B”"
)

# for token in doc:
#     print(token)

print("Token \t\tLemma \t\tStopword")
print("-" * 40)
for token in doc:
    print(f"{str(token)}\t\t{token.lemma_}\t\t{token.is_stop}")

from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
terms = ['Galaxy Note', 'iPhone 11', 'iPhone XS', 'Google Pixel']
patterns = [nlp(text) for text in terms]
matcher.add("TerminologyList", patterns)

text_doc = nlp(
    "Glowing review overall, and some really interesting side-by-side "
    "photography tests pitting the iPhone 11 Pro against the "
    "Galaxy Note 10 Plus and last year’s iPhone XS and Google Pixel 3.")
matches = matcher(text_doc)
print(matches)

match_id, start, end = matches[0]
print(nlp.vocab.strings[match_id], text_doc[start:end])
def create_Data_Scientist_profile(file): text = pdfextract(file) text = str(text) text = text.replace("\\n", "") text = text.lower() #below is the csv where we have all the keywords, you can customize your own keyword_dict = pd.read_csv('data_science_keywords.csv') keyword_total = list(keyword_dict.count()) global total_sum total_sum = 0 for i in keyword_total: total_sum = total_sum + i print('ee',total_sum) stats_words = [nlp(text) for text in keyword_dict['Statistics'].dropna(axis = 0)] NLP_words = [nlp(text) for text in keyword_dict['NLP'].dropna(axis = 0)] ML_words = [nlp(text) for text in keyword_dict['Machine Learning'].dropna(axis = 0)] DL_words = [nlp(text) for text in keyword_dict['Deep Learning'].dropna(axis = 0)] R_words = [nlp(text) for text in keyword_dict['R Language'].dropna(axis = 0)] python_words = [nlp(text) for text in keyword_dict['Python Language'].dropna(axis = 0)] Data_Engineering_words = [nlp(text) for text in keyword_dict['Data Engineering'].dropna(axis = 0)] matcher = PhraseMatcher(nlp.vocab) matcher.add('Stats', None, *stats_words) matcher.add('NLP', None, *NLP_words) matcher.add('ML', None, *ML_words) matcher.add('DL', None, *DL_words) matcher.add('R', None, *R_words) matcher.add('Python', None, *python_words) matcher.add('DE', None, *Data_Engineering_words) doc = nlp(text) d = [] matches = matcher(doc) for match_id, start, end in matches: rule_id = nlp.vocab.strings[match_id] # get the unicode ID, i.e. 'COLOR' span = doc[start : end] # get the matched slice of the doc d.append((rule_id, span.text)) keywords = "\n".join(f'{i[0]} {i[1]} ({j})' for i,j in Counter(d).items()) ## convertimg string of keywords to dataframe df = pd.read_csv(StringIO(keywords),names = ['Keywords_List']) df1 = pd.DataFrame(df.Keywords_List.str.split(' ',1).tolist(),columns = ['Subject','Keyword']) df2 = pd.DataFrame(df1.Keyword.str.split('(',1).tolist(),columns = ['Keyword', 'Count']) df3 = pd.concat([df1['Subject'],df2['Keyword'], df2['Count']], axis =1) df3['Count'] = df3['Count'].apply(lambda x: x.rstrip(")")) base = os.path.basename(file) filename = os.path.splitext(base)[0] name = filename.split('_') name2 = name[0] name2 = name2.lower() ## converting str to dataframe name3 = pd.read_csv(StringIO(name2),names = ['Candidate Name']) dataf = pd.concat([name3['Candidate Name'], df3['Subject'], df3['Keyword'], df3['Count']], axis = 1) dataf['Candidate Name'].fillna(dataf['Candidate Name'].iloc[0], inplace = True) print(dataf) return(dataf)
def create_web_dev_profile(file): text = pdfextract(file) text = str(text) text = text.replace("\\n", "") text = text.lower() #below is the csv where we have all the keywords, you can customize your own keyword_dict = pd.read_csv('web_developer_keywords.csv') keyword_total = list(keyword_dict.count()) global total_sum total_sum = 0 for i in keyword_total: total_sum = total_sum + i print('ee',total_sum) front_end = [nlp(text) for text in keyword_dict['Front End'].dropna(axis = 0)] back_end = [nlp(text) for text in keyword_dict['Back End'].dropna(axis = 0)] database = [nlp(text) for text in keyword_dict['Database'].dropna(axis = 0)] project = [nlp(text) for text in keyword_dict['Projects'].dropna(axis = 0)] frameworks = [nlp(text) for text in keyword_dict['Frameworks'].dropna(axis = 0)] #print(front_end) # print(back_end) #print(database) matcher = PhraseMatcher(nlp.vocab) matcher.add('FrontEnd', None, *front_end) matcher.add('BackEnd', None, *back_end) matcher.add('Database', None, *database) matcher.add('Projects', None, *project) matcher.add('Frameworks', None, *frameworks) doc = nlp(text) #print(doc) d = [] matches = matcher(doc) # print(matches) for match_id, start, end in matches: rule_id = nlp.vocab.strings[match_id] # get the unicode ID, i.e. 'COLOR' span = doc[start : end] # get the matched slice of the doc d.append((rule_id, span.text)) keywords = "\n".join(f'{i[0]} {i[1]} ({j})' for i,j in Counter(d).items()) ## convertimg string of keywords to dataframe df = pd.read_csv(StringIO(keywords),names = ['Keywords_List']) df1 = pd.DataFrame(df.Keywords_List.str.split(' ',1).tolist(),columns = ['Subject','Keyword']) df2 = pd.DataFrame(df1.Keyword.str.split('(',1).tolist(),columns = ['Keyword', 'Count']) df3 = pd.concat([df1['Subject'],df2['Keyword'], df2['Count']], axis =1) df3['Count'] = df3['Count'].apply(lambda x: x.rstrip(")")) base = os.path.basename(file) filename = os.path.splitext(base)[0] name = filename.split('_') name2 = name[0] name2 = name2.lower() ## converting str to dataframe name3 = pd.read_csv(StringIO(name2),names = ['Candidate Name']) dataf = pd.concat([name3['Candidate Name'], df3['Subject'], df3['Keyword'], df3['Count']], axis = 1) dataf['Candidate Name'].fillna(dataf['Candidate Name'].iloc[0], inplace = True) print(dataf) return(dataf)
def __init__(self, nlp, terms):
    self.terms = terms
    self.matcher = PhraseMatcher(nlp.vocab)
    patterns = [nlp.make_doc(text) for text in terms]
    self.matcher.add("TerminologyList", None, *patterns)
    # The getter is the matcher itself, so reading doc._.phrase_matches runs
    # the matcher on the doc and returns the raw match tuples.
    Doc.set_extension("phrase_matches", getter=self.matcher, force=True)
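# Added note: because a PhraseMatcher instance is callable on a Doc, it can be
# used directly as the getter of a Doc extension, which is what the __init__
# above does. A self-contained sketch of the same idea (names are illustrative):
from spacy.lang.en import English
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc

nlp = English()
matcher = PhraseMatcher(nlp.vocab)
matcher.add("TerminologyList", None, nlp.make_doc("machine learning"))
Doc.set_extension("phrase_matches", getter=matcher, force=True)

doc = nlp("We apply machine learning to text.")
print(doc._.phrase_matches)  # [(match_id, start, end)] computed lazily on access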
class CkSpacyModel(): def __init__(self, xml_dir, output_dir, section_names): self.xml_dir = xml_dir self.output_dir = output_dir self.section_names = section_names self.__current_xml_files_for_spacy_preprocessing = [] self.__filenames = [] self._TEXTS = [] self._current_TEXTS_idx = 0 self.nlp = spacy.load('en_core_web_md') self.ruler = EntityRuler(self.nlp,overwrite_ents=True ).from_disk("./patterns.jsonl") #self.ruler = EntityRuler(self.nlp) self._current_sentence_idx = 0 self.TRAIN_DATA = [] self.stringstore = 0 self.matcher = Matcher(self.nlp.vocab) Token.set_extension("is_unit", getter= self.is_unit) Token.set_extension("alt_text", default = None) # getter= self.get_alt_text) Token.set_extension("alt_text_keep", default = True) # whether this word should be keeped in the alternative text (necessary because of trailing whitespaces)) Token.set_extension("alt_text_trailing_whitespace_", default = " ") self.matcher_units = PhraseMatcher(self.nlp.vocab) # der PhraseMatcher fuer die Uniterkennung fuer alternative words self.matcher_alt_text = Matcher(self.nlp.vocab) self.pattern_file_custom_matcher_alt_text = "./Lib/units.jsonl" def pre_process(self): print('starting preprocess') self.nlp.add_pipe(self.ruler, after="ner") self.nlp.add_pipe(self.custom_pipe_component_phrase_entity, before="ner") #self.nlp.add_pipe(self.custom_pipe_component_Name_et_al, after="ner") #self.nlp.add_pipe(self.custom_pipe_component_Quantity, last=True) #self.nlp.add_pipe(self.custom_pipe_component_set_extension_unit, last=True) # lade die pattern in den Matcher self.custom_matcher_alt_text() # self.nlp.add_pipe(self.custom_pipe_component_set_extension_unit_text, last=True) self.nlp.add_pipe(self.custom_pipe_comp_alt_text, last = True) # als letztes kommt dann die Wortersetzung fuer das simplified english ... 10 mg = xy mg self.extract_text() def reintegrate_patterns_to_ruler(self, file): self.ruler = EntityRuler(self.nlp).from_disk(file) #self.nlp.remove_pipe("ruler") self.nlp.replace_pipe("entity_ruler", self.ruler) #self.nlp.add_pipe(self.ruler, before="ner") #* The entity ruler is designed to integrate with spaCy’s existing statistical models #* and enhance the named entity recognizer. If it’s added before the "ner" component, #* the entity recognizer will respect the existing entity spans and adjust its #* predictions around it. This can significantly improve accuracy in some cases. #* If it’s added after the "ner" component, the entity ruler will only add spans to #* the doc.ents if they don’t overlap with existing entities predicted by the model. #* To overwrite overlapping entities, you can set overwrite_ents=True on initialization. 
def show_ents(self, doc): if doc.ents: for ent in doc.ents: print(ent.text+' - '+ent.label_+' - '+str(spacy.explain(ent.label_))) else: print('No named entities found.') def get_next_sentence(self): self._current_TEXT = self._TEXTS[self._current_TEXTS_idx] self._current_doc = self.nlp(self._current_TEXT) sentences = list(self._current_doc.sents) sentence = sentences[self._current_sentence_idx] if self._current_sentence_idx < len(sentences)-1: self._current_sentence_idx += 1 else: self._current_sentence_idx = 0 print('next document') if self._current_TEXTS_idx < len(self._TEXTS)-1: self._current_TEXTS_idx += 1 else: print('end of Text list') sentence = self.nlp(sentence.text) unknown_words = [] for token in sentence: #print("check for : " + token.text) if token.is_oov: unknown_words.append(token) #print("not found: " + token.text) print(f"token.text = {token.text:{18}} : token._.alt_text = {token._.alt_text:{10}}") return (sentence, unknown_words) def add_pattern_to_entity_ruler(self,patterns,file): # die Prufung auf gleiche Lines hab ich nicht hinbekommen # daher pruefung auf doppelte und Loeschung von diesen self.ruler.add_patterns(patterns) self.ruler.to_disk(file) uniqlines = set(open(file).readlines()) with open(file,'w',encoding='utf8') as fp: for line in uniqlines: fp.write(line) def add_sentence_to_TRAIN_DATA(self,sentence, filename): exists = os.path.isfile(filename) if exists: with open(filename,'r',encoding='utf8') as fh: for line in fh: one_line = line[:-1] self.TRAIN_DATA.append(one_line) self.TRAIN_DATA.append(sentence) if exists: # haenge nur den einen aktuellen Listeneintrag an with open(filename,'a',encoding='utf8') as fh: listitem = self.TRAIN_DATA.pop() fh.write('%s\n' % listitem) if not exists: with open(filename,'w+',encoding='utf8') as fh: for listitem in self.TRAIN_DATA: fh.write('%s\n' % listitem) def add_word_to_stringstore(self, word, path): try: self.stringstore = StringStore().from_disk(path) self.stringstore.add(word) except: self.stringstore = StringStore(word) self.stringstore.to_disk(path) def add_word_to_vocab_permanently(self,word): pass def add_word_to_vocab_temporarely(self, word): pass def add_stringstore_to_vocab_temporarely(self, file): try: self.stringstore = StringStore().from_disk(file) for word in self.stringstore: lex = self.nlp.vocab[word] self.nlp.vocab[word].is_oov = False except: print("cannot read stringstore in file " + file) def add_pattern_jsonl_file_to_vocab_and_entity_matcher(self, pattern_file): (ents, pattern) = self.read_gazetteer(pattern_file) for i in range(len(ents)-1): #print(ents[i]) #print(pattern[i]) #print(type(pattern[i])) self.matcher.add(ents[i], None, pattern[i]) # self.matcher.add(entity, None, *phrases) def read_gazetteer(self, loc): pattern = [] ents = [] idx = 0 for i, line in enumerate(open(loc)): idx +=1 data = eval(line.strip()) # data = json.loads(line.strip()) # ich fuege zum Vocab den String #phrase = self.nlp.tokenizer(data["pattern"]) #phrase = data["pattern"][0] ents.append(data["label"]) # ich fuege zum matcher das pattenr pattern.append(data["pattern"]) # adde die Worte zum vocab #print(f"laenge der phrases = {len(phrases)}") # print(phrase) try: phrase = ["pattern"][1]["lower"] for w in phrase: _ = self.nlp.tokenizer.vocab[w.text] except: pass return (ents, pattern) # for i, line in enumerate(open(loc)): # data = json.loads(line.strip()) # #! 
dann duerfen es aber nur einzelne Worte sein # phrase = self.nlp.tokenizer(data["pattern"]) # # adde die Worte zum vocab # print(f"laenge der phrases = {len(phrase)}") # for w in phrase: # _ = self.nlp.tokenizer.vocab[w.text] # if len(phrase) >= 2: # yield phrase #*___________________________________________________________ #*___________________________________________________________ #* CUSTOM PIPE COMPONENTS #* Hier kommen die Cusom Pipe Components #*Aufgabe hauptsaechlich Entitaeten mittels Matchern zu verbessern #*Diese werden in der Funktion preproces in die Pipeline integriert def custom_pipe_component_phrase_entity(self, doc): # for ent in doc.ents: # print(ent.text) # Apply the matcher to the doc matches = self.matcher(doc) # Create a Span for each match and assign the label 'ANIMAL' spans = [Span(doc, start, end, label=match_id) for match_id, start, end in matches] # Overwrite the doc.ents with the matched spans try: doc.ents = list(doc.ents) + spans except: print(f"overlapping Entities with {spans}") # doc.ents = spans return doc def custom_pipe_component_Name_et_al(self, doc): print("entering_custom_pipe_component Name et al") new_ents = [] for ent in doc.ents: print(f"ent = {ent}") # Only check for title if it's a person and not the first token replaced = False if ent.label_ == "PERSON":# and ent.end<len(doc)-2: # gib das neue label if et al. is in person or after Person if 'et' in ent.text and ('al' in ent.text or 'al.' in ent.text): new_ent = Span(doc, ent.start, ent.end, label="REF") replaced = True print("new ents") else: # wir schauen ob die danach folgenden et al sind print("within label Person") next_token = doc[ent.end + 1] next_next_token = doc[ent.end + 2] print(next_token.text) print(next_next_token.text) if next_token.text == "et" and next_next_token.text in ("al.", "al"): new_ent = Span(doc, ent.start, ent.end+2, label="REF") new_ents.append(new_ent) replaced = True print("new_ent") # es wird das neue angehangen if replaced: new_ents.append(new_ent) print('new ent') else: # es wird die alte Entitaet uveraendert uebertragen new_ents.append(ent) print("old ents") doc.ents = new_ents print(doc.ents) return doc def custom_pipe_component_Quantity(self, doc): # 10 mg macht er meist als 10(CARDINAL) mg # Ziel 10 mg (QUANTITY) print("entering_custom_pipe_component Quantity") print(doc.text) new_ents = [] for ent in doc.ents: print(ent.text) print(ent.label_) # Only check for title if it's a person and not the first token replaced = False if ent.label_ == "CARDINAL":# and ent.end<len(doc)-2: next_token = doc[ent.end] if next_token.text in ["mg", "g"]: new_ent = Span(doc, ent.start, ent.end+1, label="QUANTITY") replaced = True # es wird das neue angehangen if replaced: new_ents.append(new_ent) print('new ent') else: # es wird die alte Entitaet uveraendert uebertragen new_ents.append(ent) print("old ents") try: doc.ents = new_ents except: print("overlapping Entities in Quantity") for ent in new_ents: print(f"ent = {ent.text} start = {ent.start} stop = {ent.end} label = {ent.label_}") #print(doc.ents) return doc def custom_pipe_component_set_extension_unit(self, doc): pass #*___________________________________________________________ #*___________________________________________________________ #* EXTENSION Methods # Hier kommen die EXTENSION Methods # Hauptaufgabe ist das setzen von user defined Attributes, Propertien and Methods #Hauptziel fuer bestimmte Tokens ein neues text Token mit simplified english #zu def custom_pipe_comp_alt_text(self, doc): # setze standardmaessig 
den alternativ text auf den Orginaltext for token in doc: token._.alt_text = token.text token._.alt_text_trailing_whitespace_ = token.whitespace_ # nun wird der Matcher aufgerufen, der nach verschiedenen Regeln sucht # diese gefundenen Regeln werden danach abgefangen und der Alternativtext # wird entsprechend dieser Regeln gesetzt matches = self.matcher_alt_text(doc) # Create a Span for each match and assign the label 'ANIMAL' for match_id, start, end in matches: # Zahl die allein steht und als ent Type Cardinal ist if self.nlp.vocab.strings[match_id]=="NUMCARDINAL": doc[start]._.alt_text = "NUM" # UNITS # Wenn UNITS allein stehen if self.nlp.vocab.strings[match_id]=="UNITS": doc[start]._.alt_text = "UNITS" # Wenn Units nach einer Zahl als eigenes Token stehen if self.nlp.vocab.strings[match_id]=="NUM_UNIT": doc[start]._.alt_text = "99" doc[start+1]._.alt_text = "UNITS" # WEnn Units nach einer Zahl in einem Token stehen if self.nlp.vocab.strings[match_id]=="NUMUNIT": # zahl und Einheit wurde zusammen geschrieben doc[start]._.alt_text = "99UNITS" if self.nlp.vocab.strings[match_id]=="DRUGNAME": doc[start]._.alt_text = "DRUGNAME" if self.nlp.vocab.strings[match_id]=="NAMEETAL": doc[start]._.alt_text = "REF" doc[start+1]._.alt_text = "not to keep" doc[start+1]._.alt_text_keep = False doc[start+2]._.alt_text = "not to keep" doc[start+2]._.alt_text_keep = False doc[start+3]._.alt_text = "not to keep" doc[start+3]._.alt_text_keep = False if self.nlp.vocab.strings[match_id]=="REFx": doc[start]._.alt_text = "REF" if self.nlp.vocab.strings[match_id]=="REFS": doc[start]._.alt_text = "REF" if self.nlp.vocab.strings[match_id]=="REFpunkt": doc[start]._.alt_text = "REF" if self.nlp.vocab.strings[match_id]=="XYMIN": doc[start]._.alt_text = "XYMIN" if self.nlp.vocab.strings[match_id]=="XY-YEARREG": doc[start]._.alt_text = "99-year" if self.nlp.vocab.strings[match_id]=="XYYEARREG": doc[start]._.alt_text = "99year" if self.nlp.vocab.strings[match_id]=="XYMINREG": doc[start]._.alt_text = "99min" if self.nlp.vocab.strings[match_id]=="XY-MINREG": doc[start]._.alt_text = "99-min" if self.nlp.vocab.strings[match_id]=="XY_PROCENT": doc[start]._.alt_text = "99" doc[start+1]._.alt_text = "%" if self.nlp.vocab.strings[match_id]=="XY-RECEPTOR": doc[start]._.alt_text = "XY" doc[start+1]._.alt_text = "-" doc[start+2]._.alt_text = "receptor" if self.nlp.vocab.strings[match_id]=="XY_RECEPTOR": doc[start]._.alt_text = "XY" doc[start+1]._.alt_text = "receptor" # {"label":"REFS","pattern":[{"TEXT": "AuthorsEtAl"}]} # {"label":"REFx","pattern":[{"TEXT": "AuthorEtAl"}]} # doc[start]._.alt_text = doc[start].text + " " + self.nlp.vocab.strings[match_id] + " gefunden" # spans = [Span(doc, start, end, label=match_id) for match_id, start, end in matches] return doc def custom_matcher_alt_text(self): pattern_file = self.pattern_file_custom_matcher_alt_text (ents, pattern) = self.read_pattern_matcher_file(pattern_file) for i in range(len(ents)-1): self.matcher_alt_text.add(ents[i], None, pattern[i]) # self.matcher.add(entity, None, *phrases) # pattern = [] # pattern.append([{'IS_DIGIT': True}, {'LOWER':'ng'}]) # pattern.append([{'IS_DIGIT': True}, {'LOWER':'mg'}]) # self.matcher_units2.add('UNITS', None, *pattern) # diese Funktion soll den Text jedes Tokens setzen def custom_pipe_component_set_extension_unit_text(self, doc): # rufe den PhraseMatcher fuer die units auf #self.matcher_units2 = Matcher(self.nlp.vocab) self.add_pattern_jsonl_file_Phrasematcher("./Lib/units.jsonl") matches = self.matcher_units(doc) # Create a Span for 
each match and assign the label 'ANIMAL' for match_id, start, end in matches: doc[start]._.alt_text = doc[start].text + "_ unit gefunden" # spans = [Span(doc, start, end, label=match_id) for match_id, start, end in matches] return doc def is_unit(self,token): return token.text == "mg" #def get_alt_text(self,token): # return token._.alt_text def add_pattern_jsonl_file_Phrasematcher(self, pattern_file): (ents, unit_pattern) = self.read_gazetteer2(pattern_file) for i in range(len(ents)-1): #matcher_units.add("Units", None, *list(nlp.pipe(COUNTRIES))) self.matcher_units.add("UNITS", None, *list(self.nlp.pipe(unit_pattern))) # self.matcher_units.add(ents[i], None, pattern[i]) # self.matcher.add(entity, None, *phrases) def read_gazetteer2(self, loc): pattern = [] ents = [] idx = 0 for i, line in enumerate(open(loc)): idx +=1 data = eval(line.strip()) ents.append(data["label"]) # ich fuege zum matcher das pattenr pattern.append(data["pattern"]) return (ents, pattern) def read_pattern_matcher_file(self, loc): pattern = [] ents = [] for i, line in enumerate(open(loc)): data = eval(line.strip()) ents.append(data["label"]) pattern.append(data["pattern"]) return (ents, pattern) #*___________________________________________________________ #*___________________________________________________________ #* Text Extraction von XML to txt # Wandelt den Text von den XML Dokumenten in reinen Text um #diese werden dann im self.output_dir gespeichert # def extract_text(self): idx = 0 for file in os.listdir(self.xml_dir): print(f'schleife extract text with : {idx} ') if file.endswith('.xml'): input_filename = os.path.join(self.xml_dir, file) if len(self.section_names)==1: prefix = self.section_names[0] else: prefix = 'section_mix' output_filename = os.path.join(self.output_dir, prefix + '_' + file) print(output_filename) self.__current_xml_files_for_spacy_preprocessing.append(input_filename) with open(input_filename, "r", encoding="utf8") as f1: print('-------------------------') print('filename:' + input_filename) xml = f1.read() P = RP.Research_Paper_XMLJSON(xml, "json") P.development_test() #P.analyse_parsing() rtext = '' for section_name in self.section_names: rtext = rtext + P.get_part_of_text(section_name) #print(rtext) with open(output_filename,"w+", encoding="utf8") as f2: self._TEXTS.append(rtext) f2.write(rtext) idx += 1 # ! This has to be removed in further versions if idx > 10: break def get_sentence_alt_text(self, sent): # uebergabe eines doc objects /// sentence # rueckgabe eines TExtes das den alternativen TExt nutzt alt_text = "" sent_org_text = sent.text for token in sent: if token._.alt_text_keep: alt_text = alt_text + token._.alt_text + token._.alt_text_trailing_whitespace_ return alt_text
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span
from spacy.lang.en import English

nlp = spacy.load("en_core_web_md")
animal_patterns = ["Golden Retriever", "cat", "turtle", "Rattus norvegicus"]
patterns = list(nlp.pipe(animal_patterns))
matcher = PhraseMatcher(nlp.vocab)
matcher.add("ANIMALS", None, *patterns)


# define custom component
def animal_component(doc):
    # create span for each match and assign label animal
    # and overwrite the doc.ents with the matched spans
    doc.ents = [
        Span(doc, start, end, label=nlp.vocab.strings["ANIMAL"])
        for match_id, start, end in matcher(doc)
    ]
    return doc


# add the component to the pipeline after the ner component
nlp.add_pipe(animal_component, after='ner')

# process the text and print the text and label for the doc.ents
doc = nlp('I have a cat and a Golden Retriever')
print([(ent.text, ent.label_) for ent in doc.ents])
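# Added note: overwriting doc.ents with every raw match, as animal_component
# does above, raises an error if two matches overlap. A common guard in recent
# spaCy releases is spacy.util.filter_spans, which keeps the longest
# non-overlapping spans. A minimal sketch reusing the matcher defined above:
from spacy.util import filter_spans

def animal_component_safe(doc):
    spans = [Span(doc, start, end, label="ANIMAL")
             for match_id, start, end in matcher(doc)]
    doc.ents = filter_spans(spans)  # drop overlapping spans before assignment
    return doc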
def __get_country_matcher__(self):
    matcher = PhraseMatcher(self.nlp.vocab)
    countries = ['Czech Republic', 'Australia', 'Germany', 'Slovakia']
    patterns = list(self.nlp.pipe(countries))
    matcher.add('COUNTRY', None, *patterns)
    return matcher
def __get_animal_matcher__(self):
    matcher = PhraseMatcher(self.nlp.vocab)
    animals = ['dog', 'cat', 'mouse', 'dogs', 'cats', 'mice']
    patterns = list(self.nlp.pipe(animals))
    matcher.add('ANIMAL', None, *patterns)
    return matcher
import spacy
from spacy.language import Language
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

nlp = spacy.load("pt_core_news_sm")
animals = ["Golden Retriever", "gato", "tartaruga", "Rattus norvegicus"]
animal_patterns = list(nlp.pipe(animals))
print("animal_patterns:", animal_patterns)
matcher = PhraseMatcher(nlp.vocab)
matcher.add("ANIMAL", animal_patterns)


# Define the custom component
@Language.component("animal_component")
def animal_component_function(doc):
    # Apply the matcher to the doc
    matches = matcher(doc)
    # Create a Span for each match and assign the label "ANIMAL"
    spans = [Span(doc, start, end, label="ANIMAL")
             for match_id, start, end in matches]
    # Overwrite doc.ents with the matched spans
    doc.ents = spans
    return doc


# Add the component to the pipeline after the "ner" component
nlp.add_pipe("animal_component", after="ner")
print(nlp.pipe_names)

# Process the text and print the text and label of doc.ents
doc = nlp("Eu tenho um gato e um Golden Retriever")
print([(ent.text, ent.label_) for ent in doc.ents])
def get_validator_matches(inputText):
    match_ents.clear()
    matcher = Matcher(nlp.vocab)
    phraseMatcher = PhraseMatcher(nlp.vocab, attr="LEMMA")

    adverbPattern = [{"POS": "ADV"}]
    matcher.add("Adverbs", match_adverb, adverbPattern)

    adjectivePattern = [{"POS": "ADJ"}]
    matcher.add("Adjectives", match_adjective, adjectivePattern)

    pastTenseVerbPattern1 = [{"TAG": "VBD"}]
    pastTenseVerbPattern2 = [{"TAG": "VBN"}]
    matcher.add("Passive Voice", match_passive, pastTenseVerbPattern1, pastTenseVerbPattern2)

    infinitivePattern1 = [{"LOWER": "be"}, {"POS": "ADJ"}, {"POS": "ADP"}]
    infinitivePattern2 = [{"LOWER": "to"}, {"POS": "VERB"}]
    matcher.add("Infinitive", match_infinitive, infinitivePattern1, infinitivePattern2)

    pronounPattern = [{"POS": "PRON"}]
    matcher.add("Pronoun", match_pronoun, pronounPattern)

    indefiniteArticles = ["a", "an"]
    indefiniteArticlePatterns = [nlp(text) for text in indefiniteArticles]
    phraseMatcher.add("Indefinite Articles", match_indefinite_articles, *indefiniteArticlePatterns)

    vagueTerms = [
        "some", "any", "allowable", "several", "many", "lot of", "a few",
        "almost always", "very nearly", "nearly", "about", "close to",
        "almost", "approximate"
    ]
    vagueTermsPatterns = [nlp(text) for text in vagueTerms]
    phraseMatcher.add("Vague Terms", match_vague_terms, *vagueTermsPatterns)

    escapeClauses = [
        "so far as is possible", "as possible", "as little as possible",
        "where possible", "as much as possible", "if it should prove necessary",
        "if necessary", "to the extent necessary", "as appropriate",
        "as required", "to the extent practical", "if practicable"
    ]
    escapeClausesPatterns = [nlp(text) for text in escapeClauses]
    phraseMatcher.add("Escape Clauses", match_escape_clauses, *escapeClausesPatterns)

    openEndedClauses = ["including but not limited to", "etc", "and so on"]
    openEndedPatterns = [nlp(text) for text in openEndedClauses]
    phraseMatcher.add("Open Ended Clauses", match_open_ended_clauses, *openEndedPatterns)

    notTerms = ["not"]
    notPatterns = [nlp(text) for text in notTerms]
    phraseMatcher.add("Negations", match_negations, *notPatterns)

    universalQuantifiers = [
        "all", "any", "both", "completely", "prompt", "fast", "minimum",
        "maximum", "optimum"
    ]
    universalPatterns = [nlp(text) for text in universalQuantifiers]
    phraseMatcher.add("Immeasurable Quantifiers", match_universal_quantifier, *universalPatterns)

    temporalDependencies = [
        "eventually", "before", "when", "after", "as", "once", "earliest",
        "latest", "instantaneous", "simultaneous", "while", "at last"
    ]
    temporalPatterns = [nlp(text) for text in temporalDependencies]
    phraseMatcher.add("Temporal Dependencies", match_temporal, *temporalPatterns)

    doc = nlp(inputText)
    matches = matcher(doc)
    lowercaseDoc = nlp(inputText.lower())
    phraseMatches = phraseMatcher(lowercaseDoc)

    match_ents.sort(key=lambda x: x["start"])
    return match_ents
import io
import re
import spacy

nlp = spacy.load('en_core_web_sm')
from spacy.matcher import PhraseMatcher
from spacy.matcher import Matcher

phrase_matcher = PhraseMatcher(nlp.vocab)
matcher = Matcher(nlp.vocab)
matcher1 = Matcher(nlp.vocab)


def hasNumbers(inputString):
    return any(char.isdigit() for char in inputString)


def most_common(lst):
    return (max(lst, key=lst.count))


def listToString(s):
    str1 = " "
    return (str1.join(s))


def extract_full_name(nlp_doc, no_of_word):
    names = []
    if no_of_word >= 3:
        pattern = [{'POS': 'PROPN'}, {'POS': 'PROPN'}, {'POS': 'PROPN'}]
def __init__(self, nlp, terms, label):
    patterns = [nlp(term) for term in terms]
    self.matcher = PhraseMatcher(nlp.vocab)
    self.matcher.add(label, None, *patterns)
class Sectionizer: name = "sectionizer" def __init__(self, nlp, patterns="default", add_attrs=False, max_scope=None): self.nlp = nlp self.add_attrs = add_attrs self.matcher = Matcher(nlp.vocab) self.max_scope = max_scope self.phrase_matcher = PhraseMatcher(nlp.vocab, attr="LOWER") self.assertion_attributes_mapping = None self._patterns = [] self._section_titles = set() if patterns is not None: if patterns == "default": import os if not os.path.exists(DEFAULT_RULES_FILEPATH): raise FileNotFoundError( "The expected location of the default patterns file cannot be found. Please either " "add patterns manually or add a jsonl file to the following location: ", DEFAULT_RULES_FILEPATH) self.add(self.load_patterns_from_jsonl(DEFAULT_RULES_FILEPATH)) # If a list, add each of the patterns in the list elif isinstance(patterns, list): self.add(patterns) elif isinstance(patterns, str): import os assert os.path.exists(patterns) self.add(self.load_patterns_from_jsonl(patterns)) if add_attrs is False: self.add_attrs = False elif add_attrs is True: self.assertion_attributes_mapping = DEFAULT_ATTRS self.register_default_attributes() elif isinstance(add_attrs, dict): # Check that each of the attributes being added has been set for modifier in add_attrs.keys(): attr_dict = add_attrs[modifier] for attr_name, attr_value in attr_dict.items(): if not Span.has_extension(attr_name): raise ValueError( "Custom extension {0} has not been set. Call Span.set_extension." ) self.add_attrs = True self.assertion_attributes_mapping = add_attrs else: raise ValueError( "add_attrs must be either True (default), False, or a dictionary, not {0}" .format(add_attrs)) @property def patterns(self): return self._patterns @property def section_titles(self): return self._section_titles @classmethod def load_patterns_from_jsonl(self, filepath): import json patterns = [] with open(filepath) as f: for line in f: if line.startswith("//"): continue patterns.append(json.loads(line)) return patterns def register_default_attributes(self): """Register the default values for the Span attributes defined in DEFAULT_ATTRS.""" for attr_name in [ "is_negated", "is_uncertain", "is_historical", "is_hypothetical", "is_family", ]: try: Span.set_extension(attr_name, default=False) except ValueError: # Extension already set pass def add(self, patterns): """Add a list of patterns to the clinical_sectionizer. Each pattern should be a dictionary with two keys: 'section': The normalized section name of the section, such as 'pmh'. 'pattern': The spaCy pattern matching a span of text. Either a string for exact matching (case insensitive) or a list of dicts. Example: >>> patterns = [ \ {"section_title": "past_medical_history", "pattern": "pmh"}\ {"section_title": "past_medical_history", "pattern": [{"LOWER": "past", "OP": "?"}, \ {"LOWER": "medical"}, \ {"LOWER": "history"}]\ },\ {"section_title": "assessment_and_plan", "pattern": "a/p:"}\ ] >>> clinical_sectionizer.add(patterns) """ for pattern_dict in patterns: name = pattern_dict["section_title"] pattern = pattern_dict["pattern"] if isinstance(pattern, str): self.phrase_matcher.add(name, None, self.nlp.make_doc(pattern)) else: self.matcher.add(name, [pattern]) self._patterns.append(pattern_dict) self._section_titles.add(name) def set_assertion_attributes(self, ents): """Add Span-level attributes to entities based on which section they occur in. 
Args: edges: the edges to modify """ for ent in ents: if ent._.section_title in self.assertion_attributes_mapping: attr_dict = self.assertion_attributes_mapping[ ent._.section_title] for (attr_name, attr_value) in attr_dict.items(): setattr(ent._, attr_name, attr_value) def __call__(self, doc): matches = self.matcher(doc) matches += self.phrase_matcher(doc) matches = prune_overlapping_matches(matches) if len(matches) == 0: doc._.sections.append((None, None, doc[0:])) return doc first_match = matches[0] section_spans = [] if first_match[1] != 0: section_spans.append((None, None, doc[0:first_match[1]])) for i, match in enumerate(matches): (match_id, start, end) = match section_header = doc[start:end] name = self.nlp.vocab.strings[match_id] # If this is the last match, it should include the rest of the doc if i == len(matches) - 1: if self.max_scope is None: section_spans.append((name, section_header, doc[start:])) else: section_spans.append((name, section_header, doc[start:end + self.max_scope])) # Otherwise, go until the next section header else: next_match = matches[i + 1] _, next_start, _ = next_match if self.max_scope is None: section_spans.append( (name, section_header, doc[start:next_start])) else: section_spans.append((name, section_header, doc[start:end + self.max_scope])) for name, header, section in section_spans: doc._.sections.append((name, header, section)) for token in section: token._.section_span = section token._.section_title = name token._.section_header = header # If it is specified to add assertion attributes, # iterate through the entities in doc and add them if self.add_attrs is True: self.set_assertion_attributes(doc.ents) return doc
# # show_ents(doc3)
# from spacy.tokens import Span
# doc = nlp(u'Tesla to build a BR factory for alot of money')
# ORG = doc.vocab.strings[u'ORG']
# print(ORG)
# # print(doc.ents)

from spacy.tokens import Span

doc = nlp(u'Our company created a brand new vacuum cleaner This new vacuum-cleaner is the best in show'
          u'This new vacuum-cleaner is the best in show')
show_ents(doc)

from spacy.matcher import PhraseMatcher

encontrador = PhraseMatcher(nlp.vocab)
lista_frase = ['vacuum cleaner', 'vacuum-cleaner']
padroes_frase = [nlp(text) for text in lista_frase]
encontrador.add('novoproduto', None, *padroes_frase)
found_matches = encontrador(doc)
print(found_matches)

from spacy.tokens import Span

# PRODUCT is the tag, from the built-in label list, that you assign to the words you want to add
PROD = doc.vocab.strings[u'PRODUCT']
print(found_matches)
# here we use match[1] and match[2], the second and third items of each match tuple,
# which are respectively where the word we want to add starts and ends
new_ents = [Span(doc, match[1], match[2], label=PROD) for match in found_matches]
doc.ents = list(doc.ents) + new_ents
show_ents(doc)

doc_encontra = nlp(u'Originally I paid $29.95 for this card, but now this card is much more expencive. It is now 50 dollars')
test = len([ent for ent in doc_encontra.ents if ent.label_ == 'MONEY'])
print(test)
class MultiExtractorComponent(object): def __init__(self, nlp, ontoDict): # add ontology and label from ontoDict self.ontoDict = ontoDict self.all_labels = "" # stop words, don't try to match these stopwords = nlp.Defaults.stop_words stopwords.add("ands") stopwords.add("ends") stopwords.add("ci") self.ontols = [] ontologies = ontoDict["ontologies"] for ontology in ontologies: for key, value in ontology.items(): if (key == "label"): self.all_labels = self.all_labels + value if (key == "ontology"): self.ontols.append(value) # print("self.ontols: ", self.ontols) # for x in self.ontols: # print("got x: ", x) # print("all_labels = ", self.all_labels) # for making plural forms of labels for text matching engine = inflect.engine() # init terms and patterns self.terms = {} patterns = [] #build unified table of all ID, IRI, Label and Synonyms: for ontol in self.ontols: #should be all ontols in print("checking ontol: ", ontol) for termid in ontol.get_classes(): # print("k is: ", k) termshortid = ontol.get_id_for_iri(termid) label = ontol.get_annotation(termid, RDFSLABEL) definition = ontol.get_annotation(termid, DEFINITION) if label: term_entry = { 'id': termid if termshortid is None else termshortid, 'name': label.strip(), 'definition': definition } if label is not None and label.strip().lower( ) not in stopwords: self.terms[label.strip().lower()] = term_entry patterns.append(nlp.make_doc(label.strip().lower())) plural = engine.plural(label.strip()) self.terms[plural.lower()] = term_entry patterns.append(nlp.make_doc(plural.lower())) synonyms = ontol.get_annotations(termid, SYN) for s in synonyms: # print("adding SYNONYM in ontotagtext: ", s) if s.strip().lower() not in stopwords: self.terms[s.strip().lower()] = term_entry patterns.append(nlp.make_doc(s.strip().lower())) try: plural = engine.plural(s.strip().lower()) self.terms[plural.lower()] = term_entry patterns.append(nlp.make_doc(plural.lower())) except: print("Problem getting plural of ", s) continue # initialize matcher and add patterns self.matcher = PhraseMatcher(nlp.vocab, attr='LOWER') self.matcher.add(self.all_labels, None, *patterns) # set extensions to tokens, spans and docs Token.set_extension("is_ontol_term", default=False, force=True) Token.set_extension("ontol_id", default=False, force=True) Token.set_extension("merged_concept", default=False, force=True) Doc.set_extension("has_ontols", getter=self.has_ontols, force=True) Doc.set_extension("ontols", default=[], force=True) Span.set_extension("has_ontols", getter=self.has_ontols, force=True) def __call__(self, doc): matches = self.matcher(doc) spans = [ Span(doc, match[1], match[2], label=self.all_labels) for match in matches ] for i, span in enumerate(spans): span._.set("has_ontols", True) for token in span: if span.text.lower() in self.terms: token._.set("is_ontol_term", True) token._.set("ontol_id", self.terms[span.text.lower()]["id"]) else: print("Term not found: ", span.text.lower()) with doc.retokenize() as retokenizer: for span in filter_spans(spans): retokenizer.merge(span, attrs={"_": {"merged_concept": True}}) doc._.ontols = list(doc._.ontols) + [span] return doc # getter function for doc level def has_ontols(self, tokens): return any([t._.get("is_ontol_term") for t in tokens]) def get_term(self, term_id): # print("getting term") if term_id in [v['id'] for v in self.terms.values()]: keys = [ k for k, v in self.terms.items() if v['id'].strip() == term_id.strip() ] return self.terms[keys[0]] else: return None def get_label(self, label): # print("getting label") if 
label.strip().lower() in [ v['name'].strip().lower() for v in self.terms.values() ]: keys = [ k for k, v in self.terms.items() if v['name'].strip().lower() == label.strip().lower() ] return self.terms[keys[0]] else: return None
print("Please wait whilst spaCy language library is loaded...") nlp = spacy.load('en_core_web_md') """ ////////////////////////////////////////////////////// Change global values for bad words here ////////////////////////////////////////////////////// """ BAD_STEM_WORDS_LIST = [ "you", "option", "accurate", "correct", "true", "can be", "only", "statement" ] BAD_OPTION_WORDS_LIST = ["only", "statement", "all of the above"] # Create spaCy PhraseMatchers (lowercase for case-insensitivity) dnd_matcher = PhraseMatcher(nlp.vocab, attr="LOWER") dnd_term = ["Drag and drop the"] dnd_patterns = [nlp.make_doc(text) for text in dnd_term] dnd_matcher.add("TerminologyList", None, *dnd_patterns) canbe_matcher = PhraseMatcher(nlp.vocab, attr="LOWER") canbe_term = ["can be"] canbe_patterns = [nlp.make_doc(text) for text in canbe_term] canbe_matcher.add("TerminologyList", None, *canbe_patterns) negative_matcher = Matcher(nlp.vocab) negative_matcher.add("NegativeList", None, [{ 'POS': 'VERB' }, { 'DEP': 'neg' }], [{
def test_issue4373():
    """Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab)."""
    matcher = Matcher(Vocab())
    assert isinstance(matcher.vocab, Vocab)
    matcher = PhraseMatcher(Vocab())
    assert isinstance(matcher.vocab, Vocab)
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span
import json

with open("exercises/en/countries.json", encoding="utf8") as f:
    COUNTRIES = json.loads(f.read())
with open("exercises/en/country_text.txt", encoding="utf8") as f:
    TEXT = f.read()

nlp = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp.vocab)
patterns = list(nlp.pipe(COUNTRIES))
matcher.add("COUNTRY", patterns)

# Create a doc and reset existing entities
doc = nlp(TEXT)
doc.ents = []

# Iterate over the matches
for match_id, start, end in matcher(doc):
    # Create a Span with the label for "GPE"
    span = ____(____, ____, ____, label=____)

    # Overwrite the doc.ents and add the span
    doc.ents = list(doc.ents) + [____]

    # Get the span's root head token
    span_root_head = ____.____.____
    # Print the text of the span root's head token and the span text
    print(span_root_head.____, "-->", span.text)
python -m spacy download en
'''
import spacy

nlp = spacy.load('en')

# load a text object
# doc = nlp("Tea is healthy and calming, don't you think?")
# doc is a document object, containing tokens

# iterate over the doc and print each token
# for token in doc:
#     print(token)

# check lemma and stopword status
# print("Token\t\tLemma\t\tStopword")
# print('-' * 40)
# for token in doc:
#     print(f"{str(token)}\t\t{token.lemma_}\t\t{token.is_stop}")

from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab, attr='LOWER')  # build the matcher (case-insensitive)
terms = ['Galaxy Note', 'iPhone 11', 'iPhone XS', 'Google pixel']
patterns = [nlp(text) for text in terms]  # build Doc patterns for the terms to match
matcher.add("TerminologyList", patterns)  # register the patterns under the rule ID "TerminologyList"

text_doc = nlp("Glowing review overall, and some really interesting side-by-side "
               "photography tests pitting the iPhone 11 Pro against the "
               "Galaxy Note 10 Plus and last year’s iPhone XS and Google Pixel 3.")  # the text to match against
matches = matcher(text_doc)  # run the matcher rules over text_doc
print(matches)

match_id, start, end = matches[0]
print(nlp.vocab.strings[match_id], text_doc[start:end])
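# Follow-up sketch (assumed, not in the original tutorial): print every match,
# not just the first one, together with its rule name.
for match_id, start, end in matches:
    print(nlp.vocab.strings[match_id], "->", text_doc[start:end].text)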
def test_attr_pipeline_checks(en_vocab):
    doc1 = Doc(en_vocab, words=["Test"])
    doc1[0].dep_ = "ROOT"
    doc2 = Doc(en_vocab, words=["Test"])
    doc2[0].tag_ = "TAG"
    doc2[0].pos_ = "X"
    doc2[0].set_morph("Feat=Val")
    doc2[0].lemma_ = "LEMMA"
    doc3 = Doc(en_vocab, words=["Test"])
    # DEP requires DEP
    matcher = PhraseMatcher(en_vocab, attr="DEP")
    matcher.add("TEST1", [doc1])
    with pytest.raises(ValueError):
        matcher.add("TEST2", [doc2])
    with pytest.raises(ValueError):
        matcher.add("TEST3", [doc3])
    # TAG, POS, LEMMA require those values
    for attr in ("TAG", "POS", "LEMMA"):
        matcher = PhraseMatcher(en_vocab, attr=attr)
        matcher.add("TEST2", [doc2])
        with pytest.raises(ValueError):
            matcher.add("TEST1", [doc1])
        with pytest.raises(ValueError):
            matcher.add("TEST3", [doc3])
    # TEXT/ORTH only require tokens
    matcher = PhraseMatcher(en_vocab, attr="ORTH")
    matcher.add("TEST3", [doc3])
    matcher = PhraseMatcher(en_vocab, attr="TEXT")
    matcher.add("TEST3", [doc3])
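# Illustrative sketch of what the checks above guard against (assumed example,
# not part of the test suite): a PhraseMatcher with attr="LEMMA" needs pattern
# Docs whose lemmas were set by a pipeline, otherwise add() raises the
# ValueError exercised above. With a full pipeline the lemma-based match works.
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.load("en_core_web_sm")
matcher = PhraseMatcher(nlp.vocab, attr="LEMMA")
matcher.add("BUY", [nlp("buy")])  # pattern Doc has lemmas set by the pipeline
doc = nlp("She bought two phones.")
print([doc[start:end].text for _, start, end in matcher(doc)])  # matches "bought" via its lemma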
import os

import spacy
from spacy.matcher import Matcher, PhraseMatcher

# load pre-trained models
base_path = os.path.dirname(__file__)
nlp = spacy.load('en_core_web_sm')
custom_nlp2 = spacy.load(os.path.join(base_path, "degree", "model"))
custom_nlp3 = spacy.load(os.path.join(base_path, "company_working", "model"))

# initialize matcher with a vocab
matcher = Matcher(nlp.vocab)

# job titles, one per line; only keep phrases shorter than 10 tokens
file = os.path.join(base_path, "titles_combined.txt")
file = open(file, "r", encoding='utf-8')
designation = [line.strip().lower() for line in file]
designitionmatcher = PhraseMatcher(nlp.vocab)
patterns = [nlp.make_doc(text) for text in designation if len(nlp.make_doc(text)) < 10]
designitionmatcher.add("Job title", None, *patterns)

# skills, one per line; same length cut-off
file = os.path.join(base_path, "LINKEDIN_SKILLS_ORIGINAL.txt")
file = open(file, "r", encoding='utf-8')
skill = [line.strip().lower() for line in file]
skillsmatcher = PhraseMatcher(nlp.vocab)
patterns = [nlp.make_doc(text) for text in skill if len(nlp.make_doc(text)) < 10]
skillsmatcher.add("Job title", None, *patterns)


class resumeparse(object):
    objective = (
        'career goal',
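# Usage sketch (assumed, not part of the original parser): running skillsmatcher
# over a resume's plain text. Which spans are found depends on the entries in
# LINKEDIN_SKILLS_ORIGINAL.txt; the text below is only illustrative.
resume_text = "Experienced in python, machine learning and project management."
doc = nlp(resume_text)
skills_found = {doc[start:end].text for _, start, end in skillsmatcher(doc)}
print(skills_found)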
class MatcherPipe(object):
    name = EXCELCY_MATCHER

    def __init__(self, nlp, patterns: list = None):
        """
        SpaCy pipe to match entities based on multiple patterns.

        Pattern examples:
        patterns = [
            {'kind': 'phrase', 'value': 'amazon', 'entity': 'PRODUCT'},
            {'kind': 'regex', 'value': 'ama(.+)', 'entity': 'PRODUCT'}
        ]

        :param nlp: The NLP object
        :param patterns: The matcher patterns
        """
        self.nlp = nlp
        self.phrase_matcher = PhraseMatcher(nlp.vocab)
        self.matcher = Matcher(nlp.vocab)
        self.extra_patterns = []

        # start adding patterns
        self.add_patterns(patterns=patterns or [])

    def add_patterns(self, patterns: list):
        """
        Add a list of patterns to the matchers.

        :param patterns: List of patterns
        """
        for pattern in patterns:
            kind, value, entity = pattern.get('kind'), pattern.get('value'), pattern.get('entity')
            self.add_pattern(kind=kind, value=value, entity=entity)

    def add_pattern(self, kind: str, value, entity: str):
        """
        Add a single pattern to the matching algorithm. There are two kinds:
        - phrase: uses PhraseMatcher, described in
          https://spacy.io/usage/linguistic-features#adding-phrase-patterns
        - regex: uses Matcher, described in
          https://spacy.io/usage/linguistic-features#regex

        :param kind: Pattern matcher type, either 'phrase' or 'regex'
        :param value: Entity pattern matcher
        :param entity: Entity to be matched
        """
        if kind == 'phrase':
            self.phrase_matcher.add(entity, None, *[self.nlp(value)])
        elif kind == 'regex':
            regex_flag = self.nlp.vocab.add_flag(
                lambda text: self.eval_regex(pattern=value, text=text))
            self.matcher.add(entity, None, [{regex_flag: True}])

    def eval_regex(self, pattern, text):
        return bool(re.compile(pattern).match(text))

    def __call__(self, doc: Doc):
        """
        The spaCy pipeline caller.

        :param doc: The Doc to process.
        """
        # get matches
        phrase_matches = self.phrase_matcher(doc)
        matches = self.matcher(doc)

        # add the matches to the entities list
        for match_id, start, end in phrase_matches + matches:
            entity = (match_id, start, end)
            doc.ents += (entity,)

        return doc
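# Hypothetical usage sketch (not from the source): registering MatcherPipe on a
# blank pipeline with one phrase pattern and one regex pattern. Assumes a spaCy
# v2.x install, where vocab.add_flag, add_pipe(component) and tuple entity
# assignment (as used by the pipe above) are available.
import spacy

nlp = spacy.blank("en")
pipe = MatcherPipe(nlp, patterns=[
    {'kind': 'phrase', 'value': 'amazon', 'entity': 'PRODUCT'},
    {'kind': 'regex', 'value': 'ama(.+)', 'entity': 'PRODUCT'},
])
nlp.add_pipe(pipe, last=True)

doc = nlp("I ordered it from amazon yesterday.")
print([(ent.text, ent.label_) for ent in doc.ents])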
class NLP_Processer:
    # Yahnis' Windows version of the default pattern locations
    def __init__(self,
                 disease_pattern_loc=Path(sourcepath, "NLP_PhaseMatcher_version/disease_pattern.json"),
                 search_pattern_loc=Path(sourcepath, "NLP_PhaseMatcher_version/search_pattern.json"),
                 syndrome_pattern_loc=Path(sourcepath, "NLP_PhaseMatcher_version/syndrome_pattern.json"),
                 disease_catogary_loc=Path(sourcepath, "NLP_PhaseMatcher_version/disease_catogary.json"),
                 geocode_service=True):
        self.nlp = spacy.load('en_core_web_sm')
        self.matcher = PhraseMatcher(self.nlp.vocab, attr='LOWER', max_length=5)
        self.load_pattern(disease_pattern_loc)
        self.load_pattern(syndrome_pattern_loc)
        self.load_search_pattern(search_pattern_loc)
        self.location_checker = Location_Checker()
        self.publication_date = "2020-xx-xx xx:xx:xx"
        self.keyword_location = []
        self.keyword_frequency = []
        self.keyword_list = []
        self.disease_to_family_dic = {}
        self.family_to_syndrome_dic = {}
        self.load_dictionary(disease_catogary_loc)

    def set_publication_date(self, date):
        self.publication_date = date

    def get_keyword_location(self):
        return self.keyword_location

    def get_keyword_frequency(self):
        return self.keyword_frequency

    def get_keyword_list(self):
        return self.keyword_list

    def load_dictionary(self, location):
        with open(location) as f:
            datas = json.load(f)
        for data in datas:
            family = data["family"]
            for disease in data["disease"]:
                self.disease_to_family_dic[disease] = family
            self.family_to_syndrome_dic[family] = data["syndrome"]

    def load_pattern(self, location):
        with open(location) as f:
            datas = json.load(f)
        for data in datas:
            name = data["name"]
            general_names = data["general_names"]
            patterns = [self.nlp.make_doc(text) for text in general_names]
            self.matcher.add(name, None, *patterns)

    def load_search_pattern(self, location):
        with open(location) as f:
            data = json.load(f)
        patterns = [self.nlp.make_doc(text) for text in data["keywords"]]
        self.matcher.add(data["name"], None, *patterns)

    def category_report(self, event_date, locations, diseases, syndromes):
        syndrome_usage = {}
        diseases_not_captured = []
        disease_same_family = {}
        reports = []
        for syndrome in syndromes:
            syndrome_usage[syndrome] = False
        # group diseases by family
        for disease in diseases:
            if disease in self.disease_to_family_dic.keys():
                family_name = self.disease_to_family_dic[disease]
                if family_name in disease_same_family.keys():
                    disease_same_family[family_name].append(disease)
                else:
                    disease_same_family[family_name] = [disease]
            else:
                diseases_not_captured.append(disease)
        # one report per disease family, with the syndromes that can belong to it
        for k, v in disease_same_family.items():
            syndrome_list = []
            possible_sydromes = self.family_to_syndrome_dic[k]
            for syndrome in syndromes:
                if syndrome in possible_sydromes:
                    syndrome_list.append(syndrome)
                    syndrome_usage[syndrome] = True
            d = {}
            d["event_date"] = event_date
            d["locations"] = locations
            d["diseases"] = v
            d["syndromes"] = syndrome_list
            reports.append(d)
        if diseases_not_captured == []:
            if reports == []:
                print("weird article!")
                d = {}
                d["event_date"] = event_date
                d["locations"] = locations
                d["diseases"] = diseases
                d["syndromes"] = syndromes
                reports.append(d)  # fall back to a single report covering everything
            else:
                for k, v in syndrome_usage.items():
                    if v == False:
                        reports[0]["syndromes"].append(k)
        else:
            d = {}
            d["event_date"] = event_date
            d["locations"] = locations
            d["diseases"] = diseases_not_captured
            syndrome_list = []
            for k, v in syndrome_usage.items():
                if v == False:
                    syndrome_list.append(k)
            d["syndromes"] = syndrome_list
            reports.append(d)
        return reports

    def make_reports(self, text):
        doc = self.nlp(text)
        matches = self.matcher(doc)
        text_length = len([token.text for token in doc])
        disease_dic = {}
        syndrome_dic = {}
        search_dic = {}
        for match_id, start, end in matches:
            category = self.nlp.vocab.strings[match_id]
            span = doc[start:end]
            temp = re.search("^([A-Z]{3})-(.+)$", category)
            if temp is None:
                if str(span).lower() in search_dic:
                    search_dic[str(span).lower()] += 1
                else:
                    search_dic[str(span).lower()] = 1
            elif temp.group(1) == "DIS":
                if temp.group(2) in disease_dic:
                    disease_dic[temp.group(2)] += 1
                else:
                    disease_dic[temp.group(2)] = 1
            elif temp.group(1) == "SYN":
                if temp.group(2) in syndrome_dic:
                    syndrome_dic[temp.group(2)] += 1
                else:
                    syndrome_dic[temp.group(2)] = 1

        # at this stage the disease and syndrome parts are done
        temp = dict(disease_dic, **syndrome_dic)
        keyword_dic = dict(temp, **search_dic)
        keyword_dic = dict(sorted(keyword_dic.items(), key=lambda kv: kv[1], reverse=True))
        keyword_dic = dict((k.lower(), round(v / text_length, 8)) for k, v in keyword_dic.items())

        # this is for date and location
        temp = re.search(
            "^([0-9]{4})-([0-9]{2})-([0-9]{2}) ([0-9]{2}|x{2}):([0-9]{2}|x{2}):([0-9]{2}|x{2})$",
            self.publication_date)
        if temp is None:
            print("error error error nlp processer publication date!")
        else:
            test = Date_Formater(year=temp.group(1), month=int(temp.group(2)))
        country_dic = {}
        location_dic = {}
        for ent in doc.ents:
            text = ent.text
            if ent.label_ == "TIME":
                test.add_time(text)
            elif ent.label_ == "DATE" or ent.label_ == "ORG":
                test.add_date(text)
            elif ent.label_ == "GPE":
                text = text.replace(".", "")
                country = self.location_checker.get_country(text)
                if country is None:
                    temp = re.search(r"[0-9]|:|;|\(|\)|\"|\'|\\|\/|@|Discover|\`|\=|\+|\?|\!", text)
                    if temp is None and len(text) > 3:
                        if text in location_dic:
                            location_dic[text] += 1
                        else:
                            location_dic[text] = 1
                else:
                    if country in country_dic:
                        country_dic[country] += 1
                    else:
                        country_dic[country] = 1

        # convert to json
        location_handler = Geocode_Location()
        location_handler.load_locations_countires(sorted(location_dic.keys()), sorted(country_dic.keys()))
        event_date = test.get_event_date()
        locations = location_handler.get_locations()
        diseases = sorted(disease_dic.keys())
        syndromes = sorted(syndrome_dic.keys())
        self.keyword_location = sorted(location_handler.get_location_keywords())
        self.keyword_frequency = keyword_dic
        temp = sorted(self.keyword_frequency.keys())
        for a in temp:
            self.keyword_list.append(a)
        return self.category_report(event_date, locations, diseases, syndromes)