def test_usage_multiple_components(nlp):
    entity1 = Entity(nlp, keywords_list=keyword_list(), label='ACME_1')
    nlp.add_pipe(entity1, first=False, name='entity1')
    entity2 = Entity(nlp, keywords_dict=keyword_dict(), label='ACME_2')
    nlp.add_pipe(entity2, first=False, name='entity2')
    doc = nlp(u"I am a product manager for a java_2e platform and python.")
    assert doc._.has_entities
    assert len(doc._.entities) == 3
    assert doc[:8]._.has_entities
    assert doc[3]._.entity_desc == 'product manager'
    assert len(doc[:4]._.entities) == 1
    assert doc[6]._.entity_desc == 'java_2e'
    assert doc[9]._.entity_desc == 'python'
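# The tests in this listing rely on an `nlp` fixture plus `keyword_list()` and
# `keyword_dict()` helpers that are not shown. A minimal sketch of what they
# might look like; the exact keyword values are an assumption and may not
# reproduce every assertion above.
import pytest
import spacy

@pytest.fixture()
def nlp():
    # assumption: any small English pipeline is enough for these lookup tests
    return spacy.load('en_core_web_sm')

def keyword_list():
    # assumption: plain surface forms to match
    return ['python', 'java platform', 'product manager']

def keyword_dict():
    # assumption: surface form -> description exposed via token._.entity_desc
    return {'product manager': 'product manager', 'java_2e': 'java_2e', 'python': 'python'}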
def __init__(self, entities: list, remove_stopwords: bool = True):
    self.spacyload = spacy.load('en_core_web_sm')
    self.ext = Entity(keywords_list=entities)
    self.spacyload.add_pipe(self.ext, last=True)
    self.error_text = 0
    self.no_stopwords = remove_stopwords
    self.stopwords = set(stopwords.words('english'))
def test_usage_no_entity(nlp):
    entity = Entity(nlp, keywords_list=keyword_list(), label='ACME')
    nlp.add_pipe(entity, last=True)
    doc = nlp(u"This is a sentence without entities.")
    assert not doc._.has_entities
    for token in doc:
        assert not token._.is_entity
def test_usage_multiple_entities_from_list(nlp):
    entity = Entity(nlp, keywords_list=keyword_list(), label='ACME')
    nlp.add_pipe(entity, last=True)
    doc = nlp(u"I am a product manager for a java platform and python.")
    assert doc._.has_entities
    assert len(doc._.entities) == 2
    assert doc[:8]._.has_entities
    assert len(doc[:8]._.entities) == 1
def create_ner_detector(file_directory):
    nlp = spacy.blank('en')
    with open(file_directory) as f:
        reference_data = f.read().splitlines()
    entity = Entity(keywords_list=reference_data)
    nlp.add_pipe(entity)
    return nlp
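# A minimal usage sketch for create_ner_detector(); 'keywords.txt' (one keyword
# per line) is a hypothetical file, not one shipped with the project.
nlp = create_ner_detector('keywords.txt')
doc = nlp(u"some text mentioning one of the keywords")
print([token.text for token in doc if token._.is_entity])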
def test_usage_multiple_entities_from_dict(nlp):
    entity = Entity(nlp, keywords_dict=keyword_dict(), label='ACME')
    nlp.add_pipe(entity, last=True)
    doc = nlp(u"I am a product manager for a java_2e platform and python.")
    assert doc._.has_entities
    assert len(doc._.entities) == 2
    assert doc[:8]._.has_entities
    assert doc[3]._.entity_desc == 'product manager'
    assert len(doc[:4]._.entities) == 1
    assert doc[6]._.entity_desc == 'java_2e'
def test_usage_entities_from_file(nlp, file_name, keyword_dict):
    keyword_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), file_name)
    entity = Entity(keywords_file=keyword_file, keywords_dict=keyword_dict, label='ACME')
    nlp.add_pipe(entity, last=True)
    doc = nlp(u"I am a product manager for a java_2e platform and python.")
    assert doc._.has_entities
    assert len(doc._.entities) == 2
    assert doc[:8]._.has_entities
    assert doc[3]._.entity_desc == 'product manager'
    assert len(doc[:4]._.entities) == 1
    assert doc[6]._.entity_desc == 'java_2e platform'
def extract_tags(query):
    with open('tags.txt', 'rb') as fp:
        tags = pickle.load(fp)
    entity = Entity(keywords_list=tags)
    nlp.add_pipe(entity, name="entity", last=True)
    doc = nlp(query)
    tags_taken, tag_indices = [], []
    for name, index, named in doc._.entities:
        tag_indices.append(index)
        tags_taken.append(name)
    nlp.remove_pipe("entity")
    return tags_taken, tag_indices
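# A minimal usage sketch for extract_tags(); it assumes, as the function above
# does, that a module-level `nlp` pipeline exists and that 'tags.txt' holds a
# pickled keyword list. The query string is made up.
tags_taken, tag_indices = extract_tags(u"looking for a python developer")
print(tags_taken, tag_indices)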
def extractTags():
    query = request.args.get('query')
    with open('tags.txt', 'rb') as fp:
        tags = pickle.load(fp)
    nlp = spacy.load('en')
    entity = Entity(keywords_list=tags)
    nlp.add_pipe(entity, last=True)
    # txt = unicode(query,encoding="utf-8")
    doc = nlp(query)
    tagstaken = []
    for name, index, named in doc._.entities:
        tagstaken.append(name)
    response = {"data": tagstaken}
    return json.dumps(response)
def __init__(self, email: str, outprefix: str):
    Entrez.email = email
    self.paths = PhenoXPaths(outprefix)
    self.pmid_abstracts = dict()
    # disease and human phenotype NER
    self.pmid_dner = {}
    # raw entity text
    self.pmid_ent_text = {}
    self.dner_cluster = {}
    self.total_dner = []
    self.nlp = spacy.load('en')
    self.id2kw = pickle.load(
        open(os.path.join(self.paths.data_dir, 'id2kw_dict.pkl'), 'rb'))
    self.kw2id = pickle.load(
        open(os.path.join(self.paths.data_dir, 'kw2id_dict.pkl'), 'rb'))
    entity = Entity(keywords_list=list(self.kw2id.keys()), label='DO/HPO')
    self.nlp.add_pipe(entity, last=True)
    # read synonyms from HGNC
    with open(os.path.join(self.paths.data_dir, 'hgnc_synonyms.json'), 'r') as f:
        hgnc_syn = f.read()
    self.hgnc = json.loads(hgnc_syn)
import spacy
from spacy.matcher import PhraseMatcher
from spacy_lookup import Entity
from spacy.lang.es import Spanish

nlp = Spanish()
entity = Entity(nlp, keywords_list=['pera en Dulce', 'manzana', 'tentacion'], label='FOOD')
nlp.add_pipe(entity, name='Food')
entity2 = Entity(nlp, keywords_list=['#mora'], label='FOOD_HASHTAGS')
nlp.add_pipe(entity2, name='FoodHashtags')

text = "Me gustan mucho la manzana y tambien la pera en dulce en salsa de #mora. También me gusta la paleta tentación."
doc = nlp(text)
for e in doc:
    print(e.text, e._.is_entity, e.ent_type_)
def main():
    # change input file name here
    file = "input.txt"
    outputfile = file[:-4] + ".json"  # drop the ".txt" extension
    content = None
    try:
        f = open(file, "r")
        content = f.read()
    except UnicodeDecodeError:
        f = open(file, "r", encoding='utf8')
        content = f.read()

    # Create a dataframe from csv
    df = pd.read_csv('./titles.csv')
    # Use list comprehension to create a list of job titles from the dataframe rows
    job_titles = [row[0] for row in df.values]
    # nlp.remove_pipe('entity')
    entity = Entity(keywords_list=job_titles, label='Job-Title')
    nlp = spacy.load('en_core_web_sm')
    nlp.add_pipe(entity, before='ner')
    neuralcoref.add_to_pipe(nlp)

    doc = nlp(content)
    # coreference resolution is done on the whole document
    content = doc._.coref_resolved

    # Extracting the sentences from the text document
    sentences = []
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences.extend(tokenizer.tokenize(content))

    output = {}
    output["document"] = file
    output["extraction"] = []
    for sentence in sentences:
        sen = nlp(sentence)

        # Find PART template
        locations_count = 0
        for ents in sen.ents:
            if ents.label_ == 'GPE' or ents.label_ == 'LOC':
                locations_count = locations_count + 1
        if locations_count >= 2:
            resultset = parse_part_template(sen)
            for x in resultset:
                if x[0] != '' and x[1] != '':
                    part_temp = {}
                    part_temp["template"] = "PART"
                    part_temp["sentences"] = []
                    part_temp["sentences"].append(sentence)
                    part_temp["arguments"] = {}
                    part_temp["arguments"]["1"] = x[0]
                    part_temp["arguments"]["2"] = x[1]
                    output["extraction"].append(part_temp)

        # Find WORK template
        isJobTitleGiven = False
        isOrgGiven = False
        for ents in sen.ents:
            if ents.label_ == 'Job-Title':
                isJobTitleGiven = True
            if ents.label_ == 'ORG':
                isOrgGiven = True
        if isOrgGiven and isJobTitleGiven:
            results = parse_work_template(sen)
            for res in results:
                if res[0] != '' and res[1] != '' and res[2] != '':
                    work_temp = {}
                    work_temp["template"] = "WORK"
                    work_temp["sentences"] = []
                    work_temp["sentences"].append(sentence)
                    work_temp["arguments"] = {}
                    work_temp["arguments"]["1"] = res[0]
                    work_temp["arguments"]["2"] = res[2]
                    work_temp["arguments"]["3"] = res[1]
                    work_temp["arguments"]["4"] = res[3]
                    output["extraction"].append(work_temp)

        # Find BUY template
        for i, tok in enumerate(sen):
            if tok.dep_ == 'ROOT' or tok.pos_ == 'VERB':
                if tok.text.lower() in ['buy', 'bought', 'shop', 'acquire', 'acquired',
                                        'purchase', 'invest in', 'invested', 'get',
                                        'obtain', 'obtained', 'secure', 'redeem',
                                        'land', 'spent']:
                    results = parse_buy_template(sen)
                    if results[0] != '' and results[1] != '':
                        buy_temp = {}
                        buy_temp["template"] = "BUY"
                        buy_temp["sentences"] = []
                        buy_temp["sentences"].append(sentence)
                        buy_temp["arguments"] = {}
                        buy_temp["arguments"]["1"] = results[0]
                        buy_temp["arguments"]["2"] = results[1]
                        buy_temp["arguments"]["3"] = results[2]
                        buy_temp["arguments"]["4"] = results[3]
                        buy_temp["arguments"]["5"] = results[4]
                        output["extraction"].append(buy_temp)

    # Serializing json
    json_object = json.dumps(output, indent=4)

    # Writing to the output file
    with open(outputfile, 'w+') as f:
        f.write(json_object)
def modify_nlp_pipe(nlp, taxon_names, label):
    entity = Entity(keywords_list=taxon_names, label=label)
    nlp.add_pipe(entity)
    nlp.remove_pipe("ner")
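# A minimal usage sketch for modify_nlp_pipe(); the taxon names here are a
# hypothetical example, not data from the project.
import spacy

nlp = spacy.load('en_core_web_sm')
modify_nlp_pipe(nlp, ['Escherichia coli', 'Homo sapiens'], 'TAXON')
doc = nlp(u"Escherichia coli was isolated from the sample.")
print([(token.text, token.ent_type_) for token in doc if token._.is_entity])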
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import spacy
from spacy.lang.es.examples import sentences
from spacy_lookup import Entity

nlp = spacy.load('es_core_news_sm')
entity = Entity(keywords_list=[
    'invico', 'victoria', 'montelibano', 'san', 'luis', 'nicolas', 'utp', 'dosquebradas'
])
nlp.add_pipe(entity, last=True)  # add this entity to the pipeline

# doc = nlp('notas de Yerson victoria')
doc = nlp(u'ruta entre san nicolas y luis')
# print(doc.text)

# nouns = dict()  # dictionary to store the nouns
recorrido = dict()
for token in doc:
    print("categorization: {} {} {}".format(token.text, token.pos_, token.dep_))
    # print(token.is_stop)  # whether the token is a stop word (words that repeat a lot)
    if token._.is_entity:
        recorrido[token.text] = token.pos_

print(recorrido)
# print(recorrido.keys())
# print(recorrido.values())
    dictionary = []
    for index, row in df_clean.iterrows():
        if (type(row[2]) == str):  # Avoid numbers
            # print(row[2].lower())
            dictionary.append(row[2].lower())
    # language_index: 1 spanish and 2 english?
    # print(dictionary)  # IOPub data rate exceeded
    # print('length of the dictionary loaded: ', len(dictionary))
    return(dictionary)

## English
print("---Load dictionary english")
diagnostics_dictionary = load_dictionary(df_codeD, language_index=2)

# Add Named Entities metadata to Doc objects in spaCy. First, we load the model and
# replace the NER module with the entity diagnosticEnt. We do this to avoid
# overlapping of entities. Then, we also add procedureEnt.
diagnosticEnt = Entity(keywords_list=diagnostics_dictionary, label="DIAGNOSTICO")

# Detect Named Entities using dictionaries. We can process a text and show its entities.
nlp.replace_pipe("ner", diagnosticEnt)  # replace the default entities with diagnostics
print('entities loaded in nlp')


def Dict_code_reference(df):
    # Build a dictionary such as {'A00.0': 'Cólera debido a Vibrio cholerae 01, biotipo cholerae',
    #                             'A00.1': 'Cólera debido a Vibrio cholerae 01, biotipo El Tor', ...}
    df_clean = df.dropna()  # drop NaN values
    code_list = []
    reference_list = []
    for index, row in df_clean.iterrows():
        if (row[2] != None):
            code_list.append(row[0])
            reference_list.append(row[2])
    # 1 spanish, 2 english?
def add_service_entities(self):
    service_data = self.gen_service_tags_df[self.gen_service_tags_df['Type'] == 1]
    service_list = service_data['Tag'].tolist()
    service_entity = Entity(keywords_list=service_list, label='GEN_SERVICE')
    self.nlp.add_pipe(service_entity, last=True)
Iterates through a bunch of docs and creates a list of GOV_ORG entities for each doc.

Usage: python3 govorg_matcher.py df
"""
import get_govorg_list
import spacy
from spacy_lookup import Entity

# stuff to run always here such as class/def
# if we call the function after import, need nlp defined
nlp = spacy.load('en_core_web_sm')

# get lookup
GOV_ORG = get_govorg_list.main()

# Create custom entity list
entity = Entity(keywords_list=GOV_ORG, label='GOV_ORG')

# happens after other processing steps
nlp.add_pipe(entity, last=True)


def text_gov_org_match(text):
    """Return a list of GOV_ORG entities found at least once in a str."""
    doc = nlp(text)
    # remove any duplicates because dictionaries cannot have duplicate keys
    list_of_gov_org_entities_with_duplicates = [
        token._.canonical for token in doc if token._.is_entity
    ]
    list_no_duplicates = list(
        dict.fromkeys(list_of_gov_org_entities_with_duplicates))
    return list_no_duplicates
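# A minimal usage sketch for text_gov_org_match(); the sentence is made up and
# only yields matches if the GOV_ORG lookup list contains these names.
matches = text_gov_org_match("The Home Office and HMRC published joint guidance.")
print(matches)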
def test_integration(nlp):
    entity = Entity(nlp)
    nlp.add_pipe(entity, last=True)
    assert nlp.pipe_names[-1] == 'entity'
def create_species_entity(nlp: Language, name: str):
    return Entity(name=name, keywords_dict=species_dict, label="SPECIES")
def fast_spacy(fast_df):
    entity = Entity(keywords_dict=generate_labels(fast_df), label="FAST_TOPIC")
    nlp = spacy.load("en_core_web_md")
    nlp.add_pipe(entity)
    nlp.remove_pipe("ner")
    return nlp, entity
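# A minimal usage sketch for fast_spacy(); it assumes fast_df is already loaded
# and that generate_labels() turns it into the {surface form: description} dict
# the function expects. The query text is made up.
nlp, entity = fast_spacy(fast_df)
doc = nlp(u"an article about cartography and urban planning")
print(doc._.entities)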
def create_location_entity(nlp: Language, name: str):
    return Entity(name=name, keywords_dict=location_dict, label="LOCATION")
def create_habitat_entity(nlp: Language, name: str):
    habitats_list = list(habitats.Habitat)
    return Entity(name=name, keywords_list=habitats_list, label="HABITAT")
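# The three create_*_entity functions above use the (nlp, name) signature spaCy
# expects from component factories. One way they could be wired up, assuming
# spaCy 3's @Language.factory registration; the decorator, pipe name, and
# species_dict value here are assumptions, not code from the original project.
import spacy
from spacy.language import Language

species_dict = {"brown bear": "Ursus arctos"}  # hypothetical lookup data

@Language.factory("species_entity")
def species_entity_factory(nlp: Language, name: str):
    return create_species_entity(nlp, name)

nlp = spacy.blank("en")
nlp.add_pipe("species_entity")
print(nlp.pipe_names)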