def __init__(self, content):
    self._content = content
    self._nlp = en_core_web_sm.load()
    self._processed = self._nlp(self._content)
    self._lemma = {}
    self._pos = {}
    self._pos_ = {}
    self._word = {}
    self._sentiment = {}
    self._tag = {}
    self._dep = {}
    self._prob = {}
    self._idx = {}
    self.process()
def nlp_init(tries=0):
    if tries > 0:
        print('Retrying, try%d' % tries)
    global loading_status
    if loading_status == 'none' and 'nlp' not in data:
        try:
            print('NER init...')
            loading_status = 'loading'
            data['nlp'] = english_model.load()
            loading_status = 'done'
        except Exception as exc:
            print(exc)
            loading_status = 'none'
    elif loading_status == 'loading' and tries < 30:
        time.sleep(10)
        if loading_status == 'loading':
            new_tries = tries + 1
            return nlp_init(tries=new_tries)
    return data['nlp']
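# A minimal sketch (not from the original file) of the module-level state that
# nlp_init() appears to rely on; the names english_model, data and loading_status
# are assumptions inferred from the function body.
import time
import en_core_web_sm as english_model  # assumed alias for the spaCy model package

data = {}                # shared cache; the loaded pipeline is stored under data['nlp']
loading_status = 'none'  # 'none' -> not loaded, 'loading' -> in progress, 'done' -> ready

# nlp = nlp_init()       # first call loads the model; later calls return the cached pipeline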
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
from pprint import pprint
import re
import itertools
import contractions
import pandas as pd  # needed for pd.read_csv below; missing from the original imports
import nltk

nltk.download('stopwords')
from nltk.corpus import stopwords

nlp = en_core_web_sm.load()

L = pd.read_csv("lyrics.csv", index_col=0)
# drop NaN values
L = L.dropna()
# replace line breaks (end-of-sentence markers) with spaces
L = L.replace({'\n': ' '}, regex=True)
def __init__(self, stopWordsFilePath):
    self.nlp = en_core_web_sm.load()
    self.stop_words_path = stopWordsFilePath
    self.noOfClasses = ["Negative", "Positive"]
def aqgParse(self, sentence):
    # nlp = spacy.load("en")
    nlp = en_core_web_sm.load()
    singleSentences = sentence.split(".")
    questionsList = []
    if len(singleSentences) != 0:
        for i in range(len(singleSentences)):
            segmentSets = singleSentences[i].split(",")
            ner = nerTagger(nlp, singleSentences[i])
            if len(segmentSets) != 0:
                for j in range(len(segmentSets)):
                    try:
                        questionsList += clause.howmuch_2(segmentSets, j, ner)
                    except Exception:
                        pass
                    if identification.clause_identify(segmentSets[j]) == 1:
                        # segment is a full clause: run every clause-based question builder
                        for builder in (clause.whom_1, clause.whom_2, clause.whom_3,
                                        clause.whose, clause.what_to_do, clause.who,
                                        clause.howmuch_1, clause.howmuch_3):
                            try:
                                questionsList += builder(segmentSets, j, ner)
                            except Exception:
                                pass
                    else:
                        s = ""  # guard against subjectphrase_search failing
                        try:
                            s = identification.subjectphrase_search(segmentSets, j)
                        except Exception:
                            pass
                        if len(s) != 0:
                            # prepend the recovered subject phrase and retry the clause builders
                            segmentSets[j] = s + segmentSets[j]
                            for builder in (clause.whom_1, clause.whom_2, clause.whom_3,
                                            clause.whose, clause.what_to_do, clause.who):
                                try:
                                    questionsList += builder(segmentSets, j, ner)
                                except Exception:
                                    pass
                        else:
                            # no subject phrase found: fall back to the non-clause builders
                            for builder in (nonClause.what_whom1, nonClause.what_whom2,
                                            nonClause.whose, nonClause.howmany,
                                            nonClause.howmuch_1):
                                try:
                                    questionsList += builder(segmentSets, j, ner)
                                except Exception:
                                    pass
            questionsList.append('\n')
    return questionsList
def retrieve_input():
    s = textBox.get("1.0", "end-1c")
    name = []
    ent = []
    nlp = en_core_web_sm.load()
    doc = nlp(s)
    for X in doc.ents:
        name.append(X.text)
        ent.append(X.label_)
    s = pos_tag(word_tokenize(s))
    ls = []
    ls2 = []
    for i in s:
        # skip wh-words, adjectives, determiners, modals and adverbs
        if i[1][0] in ("W", "J", "D", "M", "R"):
            continue
        ls.append(i)
        ls2.append(i[0].lower())
    c = 1  # index for dataframe
    if len(ent):
        for i in range(len(ent)):
            if ent[i] == 'GPE':
                df.iloc[c]["Team"] = name[i]
                if "win" in ls2:
                    df.iloc[c]["Team_win"] = 1
                    df.iloc[c]["Valid Before"] = "first Innings"
                elif "wickets" in ls2 or "wicket" in ls2:
                    df.iloc[c]["Team_wickets"] = 1
                    df.iloc[c]["Valid Before"] = 20
                elif "runs" in ls2 or "run" in ls2:
                    df.iloc[c]["Team_run"] = 1
                    df.iloc[c]["Valid Before"] = 20
                elif "boundaries" in ls2:
                    df.iloc[c]["Boundaries"] = 1
                    df.iloc[c]["Valid Before"] = 20
                elif "sixes" in ls2:
                    df.iloc[c]["Sixes"] = 1
                    df.iloc[c]["Valid Before"] = 20
            elif ent[i] == 'ORDINAL' or ent[i] == 'CARDINAL' or ent[i] == 'DATE':
                if "over" in ls2 or "overs" in ls2:
                    if "." in name[i]:
                        parts = name[i].split(".")
                        df.iloc[c]["Over"] = int(parts[0])
                        df.iloc[c]["Ball"] = int(parts[1])
                    else:
                        # strip an ordinal suffix, e.g. "15th" -> 15
                        df.iloc[c]["Over"] = int(name[i][0:-2])
                else:
                    if "wickets" in ls2 or "wicket" in ls2:
                        df.iloc[c]["Bowler"] = name[i]
                        df.iloc[c]["Valid Before"] = 5
                    else:
                        df.iloc[c]["Batsmen"] = name[i]
                        df.iloc[c]["Valid Before"] = 20
                    if "boundaries" in ls2:
                        df.iloc[c]["Boundaries"] = 1
                        df.iloc[c]["Valid Before"] = 20
                    elif "sixes" in ls2:
                        df.iloc[c]["Sixes"] = 1
                        df.iloc[c]["Valid Before"] = 20
    elif "wickets" in ls2 or "wicket" in ls2:
        df.iloc[c]["Team"] = "Both"
        df.iloc[c]["Team_wickets"] = 1
        df.iloc[c]["Valid Before"] = 20
    elif "win" in ls2:
        df.iloc[c]["Team"] = "Both"
        df.iloc[c]["Team_win"] = 1
        df.iloc[c]["Valid Before"] = 20
    elif "runs" in ls2 or "run" in ls2:
        df.iloc[c]["Team"] = "Both"
        df.iloc[c]["Team_run"] = 1
        df.iloc[c]["Valid Before"] = 20
    elif "boundaries" in ls2:
        df.iloc[c]["Boundaries"] = 1
        df.iloc[c]["Team"] = "Both"
        df.iloc[c]["Valid Before"] = 20
    elif "sixes" in ls2:
        df.iloc[c]["Sixes"] = 1
        df.iloc[c]["Team"] = "Both"
        df.iloc[c]["Valid Before"] = 20
    else:
        print("INVALID QUESTION")
all_lectures = json.load(f)

# Days of the week
days = {
    "Mon": "Monday",
    "Tue": "Tuesday",
    "Wed": "Wednesday",
    "Thu": "Thursday",
    "Fri": "Friday",
    "Sat": "Saturday",
    "Sun": "Sunday"
}

# Lectures
lectures = {}
parser = en_core_web_sm.load()
for entry in all_lectures:
    lecture = all_lectures[entry]["name"]
    tree = parser(lecture)
    acronym = ""
    for token in tree:
        if (token.tag_ != "IN" and token.tag_ != "TO"
                and token.tag_ != "DT" and token.tag_ != "CC"):
            acronym = acronym + token.text[0].upper()
    lectures[acronym] = lecture

with open('lectures_acr.json', 'w') as fp:
    json.dump(lectures, fp)

with open('days_acr.json', 'w') as fp:
    json.dump(days, fp)
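# Illustrative example (not from the original data): the tag filter above skips
# prepositions (IN/TO), determiners (DT) and conjunctions (CC), so a lecture named
# "Introduction to Machine Learning" would be keyed under the acronym "IML".
demo_acronym = "".join(t.text[0].upper()
                       for t in parser("Introduction to Machine Learning")
                       if t.tag_ not in ("IN", "TO", "DT", "CC"))
# demo_acronym == "IML"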
from flask import jsonify
from flask import Flask, request, send_from_directory, send_file
import en_core_web_sm
from dateparser import parse
# from nltk import word_tokenize, ngrams
import string
# from nltk.stem import WordNetLemmatizer
# from gensim.models import KeyedVectors
import random
import pdfrw
from nameparser import HumanName

# wordnet_lemmatizer = WordNetLemmatizer()
spacy_nlp = en_core_web_sm.load()
app = Flask(__name__, static_url_path='/documents')
# spacy_nlp = spacy.load('en')

# topics
# DISABILITY = 'DISABILITY'
# ILLNESS = 'ILLNESS'
# DEATH = 'DEATH'
# CHILDREN = 'CHILDREN'

# seed words for the word2vec model
# REASONS = {
#     'disability': DISABILITY,
#     'deaf': DISABILITY,
#     'disabled': DISABILITY,
#     'illness': ILLNESS,
def loadProdReviewData(PROD_DICT):
    global category1, category2, main_cat, cond_date
    rev_db_col = 'new_reviews2'
    review_cursor = mongo.searchInDB(mongoObj.getReviewKey(), db_col=rev_db_col)
    docCount = review_cursor.count()
    print("make product reviews feature from %s reviews..." % (docCount))
    print("Search reviews finished...")
    nlp = en_core_web_sm.load()
    with tqdm(total=docCount) as pbar:
        for i2, rev in enumerate(review_cursor):
            asin, review, overall, vote, summary, review_ID = \
                rev["asin"], rev["reviewText"], rev['overall'], rev['vote'], \
                rev['summary'], str(rev['unixReviewTime'])
            if asin not in PROD_DICT.keys():
                asin = 'UNK'
            if "REVIEW_ITEM_LIST" not in PROD_DICT[asin].keys():
                DATA_DICT = PROD_DICT[asin]
                DATA_DICT["REVIEW_ITEM_LIST"] = []
            DATA_DICT = PROD_DICT[asin]
            REVIEW_ITEM_LIST = DATA_DICT["REVIEW_ITEM_LIST"]

            # clean and lemmatize the review body
            review = re.sub(r'http\S+', '', review)
            review = remove_word4(review)
            lemm_review = lemm_sent_process4(review, remove_stopwords=False,
                                             summary=False, mode="spacy",
                                             withdot=True)
            lemm_review_len = len(lemm_review.split(" "))
            lemm_review = lemm_review.split(" .")
            lemm_review = [line + " . " for line in lemm_review if len(line) > 2]

            # clean and lemmatize the summary
            summary = re.sub(r'http\S+', '', summary)
            summary = remove_word4(summary)
            lemm_summary = lemm_sent_process4(summary, remove_stopwords=False,
                                              summary=True, mode="spacy",
                                              withdot=False)
            lemm_summary_len = len(lemm_summary.split(" "))

            item_dict = {
                'review_ID': review_ID,
                "review": review,
                "overall": overall,
                "vote": vote,
                'summary': summary,
                'lemm_review': lemm_review,
                'lemm_review_len': lemm_review_len,
                'lemm_summary': lemm_summary,
                'lemm_summary_len': lemm_summary_len
            }
            REVIEW_ITEM_LIST.append(item_dict)
            DATA_DICT["REVIEW_ITEM_LIST"] = REVIEW_ITEM_LIST
            PROD_DICT[asin] = DATA_DICT
            pbar.set_description("%s REVIEW_ITEM_LIST " % (category1))
            pbar.update(1)

    # drop products that ended up with no reviews
    DEL_ASIN = []
    for asin, DATA_DICT in PROD_DICT.items():
        if "REVIEW_ITEM_LIST" not in DATA_DICT:
            DEL_ASIN.append(asin)
    for asin in DEL_ASIN:
        del PROD_DICT[asin]
    return PROD_DICT
def parseTranscripts():
    # loading to a dataframe
    # df = pd.read_table("feedback_ansi.txt", header=(0))
    df = pd.read_excel("clean_reviews_numbered.xlsx", header=(0))
    df = df.dropna()

    # throw error if output file already exists:
    if os.path.isfile('adhoc_chunked_output.csv'):
        exit('Older output file already exists!')
    else:
        with open('adhoc_chunked_output.csv', 'a') as f:
            f.write('doc_id,cleanPhrase,mergedPhrase,parsedNouns' + '\n')

    # cleaning transcripts after collecting lemmas using spacy lemmatizer
    df_clean_trans = rc.DataFrame(columns=['phrase', 'nouns'])
    print(datetime.datetime.now())
    start_time = datetime.datetime.now()
    print('Starting the process')
    rowNum = 0
    nlp = en_core_web_sm.load()
    spacyPhraseDict = {}
    nounDict = {}
    # phrased and cleaned trans dataframe:
    df = phraseBuilder(df)
    replaceDict = {}
    # load word2vec model
    model = word2vec.Word2Vec.load('../VEC_MODELS/fid_w2vec_pos_model')

    for index, row in df.iterrows():
        phrase = row['finalTrans']
        doc_id = row['RowNum']
        rowNum += 1
        print('Processing Row : %s' % rowNum)
        # spacy processing:
        spacyPhrase = nlp(phrase.decode('utf-8'))
        # buildWordTagDict(wordTagDict, spacyPhrase)
        posTaggedWordPhrase = ' '.join(
            string.replace(w.lemma_, '_', '-') + '_' + wordPosTag(w)
            for w in spacyPhrase if w.lemma_ not in spacy.en.STOP_WORDS)
        nounTaggedWords = ' '.join(
            string.replace(w.lemma_, '_', '-') for w in spacyPhrase
            if (w.pos_ == 'NOUN' or w.pos_ == 'PROPN' or w.pos_ == 'NUM')
            and w.lemma_ not in spacy.en.STOP_WORDS)
        interaction_key = doc_id
        if interaction_key not in spacyPhraseDict:
            spacyPhraseDict[interaction_key] = posTaggedWordPhrase
            nounDict[interaction_key] = nounTaggedWords

    # use spacy tags to create word-tag freq dictionary for the entire corpus
    wordFreqDict = wordListToFreqDict(list(spacyPhraseDict.values()))

    # now we can use the word-tag dict to start replacing words and reducing our sample space
    print('Reducing word feature space so as to reduce variants of words')
    rowNum = 0
    newWordCounter = 0
    for index, row in df.iterrows():
        rowNum += 1
        interaction_key = row['RowNum']
        spacyPhrase = spacyPhraseDict[interaction_key]
        nounPhrase = nounDict[interaction_key]
        print('Processing Row : %s' % rowNum)
        # calculate similarity and reduce feature space by replacing words using the
        # word2vec and wordnet intersection, building the sentence one word at a time
        sen = []
        for taggedWord in spacyPhrase.split():
            wordKey = taggedWord  # already in the key format, can be used later
            baseWord, tag = taggedWord.split('_')
            # make sure we see each word only once
            if wordKey in wordFreqDict and baseWord != '' and tag != '':
                wordFreq = wordFreqDict[wordKey]
                # first check the replaceDict; if this key is already present, replace and skip the iteration
                if wordKey in replaceDict:
                    replaceWord = replaceDict[wordKey]
                    w, t = replaceWord.split('_')
                    sen.append(w)
                    continue
                # continue if no replacement word found in the replaceDict
                newWordCounter += 1
                # word that is not present in the model or is a unique word
                if not wordnet.synsets(baseWord, tag) or baseWord not in model.wv.vocab:
                    sen.append(baseWord)
                    replaceDict[wordKey] = wordKey
                # getting best replacement from word2vec context; note that synset only gets the pos tag synonyms,
                # hence even though we don't fetch word2vec context we should be finding the correct replacement word
                else:
                    synonyms = wordnet.synsets(baseWord, tag)
                    lemmas = set(chain.from_iterable(
                        [word.lemma_names() for word in synonyms]))
                    synlist = [str(term) for term in lemmas]
                    model_similar_words = model.most_similar(positive=[baseWord], topn=10)
                    modellist = []
                    for item in model_similar_words:
                        item = str(item)
                        item = item.replace('\'', '')
                        item = item.replace('(', '')
                        item = item.replace(')', '')
                        term, score = item.split(',')
                        modellist.append(term)
                    # common words between synsets and w2vec may be more than one
                    commonWord = [word for word in modellist if word in synlist]
                    if len(commonWord) != 0:
                        score = -1
                        for word in commonWord:
                            key = word + '_' + tag
                            score = wordFreq
                            if key in wordFreqDict:
                                score_ = wordFreqDict[key]
                            else:
                                score_ = score
                            if score_ > score:
                                score = score_
                                maxScoreWord = word
                        if score > wordFreq:
                            sen.append(maxScoreWord)
                            # word substitution happening, cache it in the dict
                            replaceDict[wordKey] = maxScoreWord + '_' + tag
                            print('Replacing word :: ' + wordKey + ' with :: ' + maxScoreWord)
                        else:
                            # substitution not happening for the word, add original to the dict
                            sen.append(baseWord)
                            replaceDict[wordKey] = wordKey
                    else:
                        sen.append(baseWord)
                        if wordKey not in replaceDict:
                            replaceDict[wordKey] = wordKey
            else:
                sen.append(baseWord)

        # phrase = ' '.join(wordnet.synsets(w)[0].lemmas()[0].name() for w in phrase.split())
        phrase = ' '.join(w for w in sen)
        phrase = phrase.replace('-PRON-', '')
        phrase = re.sub(r'\b\w{1,2}\b', '', phrase)
        phrase = phrase.replace('\'', '')
        phrase = re.sub(r'(\s)\w+-+\s', ' ', phrase)
        phrase = re.sub(r'(\s)-+\w+\s', ' ', phrase)
        phrase = phrase.replace(' - ', '')
        phrase = ' '.join(phrase.split())

        nounPhrase = nounPhrase.replace('-PRON-', '')
        nounPhrase = re.sub(r'\b\w{1,2}\b', '', nounPhrase)
        nounPhrase = nounPhrase.replace('\'', '')
        nounPhrase = re.sub(r'(\s)\w+-+\s', ' ', nounPhrase)
        nounPhrase = re.sub(r'(\s)-+\w+\s', ' ', nounPhrase)
        nounPhrase = nounPhrase.replace(' - ', '')
        nounPhrase = ' '.join(nounPhrase.split())

        # collecting all clean lemmatized phrases by interaction id in a dataframe; this df will be used with the gensim phraser
        df_clean_trans.append_row(index, {
            'doc_id': interaction_key,
            'phrase': phrase,
            'nouns': nounPhrase
        })
        # except:
        #     print('Encountered issue with Spacy unicode token!!')

    # modify this function to directly read data from disk using the LineSentences function of Gensim
    # function to build gensim phraser
    # convert to normal dataframe
    print('New words seen :: ' + str(newWordCounter))
    print('Num of words replaced : ' + str(len(replaceDict.keys())))
    # pickle the dictionary:
    # with open('../REPLACE_DICT_PICKLE', 'wb') as handle:
    #     pickle.dump(replaceDict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    data_dict = df_clean_trans.to_dict(index=False)
    df_clean_trans = pd.DataFrame(data_dict,
                                  columns=df_clean_trans.columns,
                                  index=df_clean_trans.index)
    # feed tokenized transcript through the POS context extractor logic, which will create the final file for LDA input
    chunkProcessedTrans(df_clean_trans)
    # write gensim phrase to file
    # nlpOutput(df_clean_trans)
    end_time = datetime.datetime.now()
    print('Completed process in ::' + str(end_time - start_time))
    exit(0)
""" __init__.py library module for the app """ import json import os import tempfile import en_core_web_sm nlp = en_core_web_sm.load() from app import app def nlp_over_lines_as_blob(lines, *extractors): """Given an iterable collection of lines of text, generates complete sentences and runs a series of extractor functions over each sentence. Yields a tuple for each sentence containing the results of each extractor """ blob = ' '.join(lines) doc = nlp(blob) for sent in doc.sents: extracted = [ext(sent) for ext in extractors] yield tuple(extracted) def nlp_over_lines(lines, *extractors): """Given an iterable collection of lines of text, runs a series of extractor functions over each line.
wordsByRake.append(rake.apply(docs[i]))


def listOfLists(lst):
    temp = []
    for i in range(0, len(lst)):
        if lst[i][1] > 4.0:  # this threshold can be changed according to the required number of tags
            temp.append(lst[i][0])
    return [elem for elem in temp]


for i in range(0, len(wordsByRake)):
    trainDocs.insert(i, listOfLists(wordsByRake[i]))

wordsByRake = []  # to avoid memory usage
docs = []  # to avoid memory usage

nlp = en_core_web_sm.load()  # loading the model
ruler = EntityRuler(nlp)
pattern = []  # to store the pattern for tagging each doc


def labelDecider(txt, tag, count):
    patternDict = {'label': str(tag), 'pattern': txt[count]}
    return patternDict


for i in range(0, len(trainDocs)):  # 0 to 80
    docCount = 0
    case = trainDocs[i]
    if len(trainDocs[i]) != 0:
        if len(trainTags[i]) == 1:  # for docs with one label
            tagCase = trainTags[i]
import pandas as pd
import pickle5 as pickle

from app.lrs.base_model import BaseModel

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import en_core_web_sm

nlp = en_core_web_sm.load()

# https://spacy.io/usage/models#production
# https://pypi.org/project/spacy-langdetect/
from spacy_langdetect import LanguageDetector

nlp.add_pipe(LanguageDetector(), name="language_detector", last=True)

MODEL_PATH = 'app/models/'
REPLACED_WORDS = pickle.load(open(MODEL_PATH + "replaced_words.pkl", "rb"))
stop_list = pickle.load(open(MODEL_PATH + "stop_list.pkl", "rb"))

ENGINE_TYPES = ['LDA', 'LDA', 'NMF', 'NMF']
ENGINE_VERSIONS = [
    'model_12_content_3_topic_80_20',
    'model_12_content_10_topic_80_20',
    'nmf_model_13_content_3_topic_80_20',
    'nmf_model_13_content_10_topic_80_20'
]


# Utility functions
def make_suggestions(TAR_6, engine):
    # P R E D I C T I O N
    norms = pickle.load(
def parseTranscripts():
    # loading to a dataframe
    df = pd.read_table("outTrans.tsv", sep='\t', header=(0))

    # throw error if output file already exists:
    if os.path.isfile('SCRIPT_NLP_CONTEXT.csv'):
        exit('Older output file already exists!')
    else:
        with open('SCRIPT_NLP_CONTEXT.csv', 'a') as f:
            f.write('interaction_id,nice_interaction_id,customer_id,call_date,cleanpPhrase,mergedPhrase' + '\n')

    # cleaning transcripts after collecting lemmas using spacy lemmatizer
    df_clean_trans = rc.DataFrame(columns=['interaction_id', 'nice_interaction_id',
                                           'customer_id', 'call_date', 'phrase'])
    print(datetime.datetime.now())
    start_time = datetime.datetime.now()
    print('Starting the process')
    rowNum = 0
    nlp = en_core_web_sm.load()
    spacyPhraseDict = {}
    '''
    if os.path.isfile('../REPLACE_DICT_PICKLE'):
        with open('../REPLACE_DICT_PICKLE', 'rb') as handle:
            replaceDict = pickle.load(handle)
    else:
        replaceDict = {}
    '''
    replaceDict = {}
    # load word2vec model
    print('Number of words in the list :: ' + str(len(replaceDict.keys())))
    model = word2vec.Word2Vec.load('../VEC_MODELS/fid_w2vec_pos_model')

    for index, row in df.iterrows():
        phrase = row['transcript']
        interaction_id = row['interaction_id']
        nice_interaction_id = row['nice_interaction_id']
        # nice_interaction_id = '123'
        customer_id = row['customer_id']
        call_date = row['call_date']
        rowNum += 1
        print('Processing Row : %s' % rowNum)
        # rawPhrase = basicClean(phrase)
        # clean trans as per business logic, excluding generic rep conversations:
        cleanPhrase = cleanTranscript(phrase)
        # spacy processing:
        spacyPhrase = nlp(cleanPhrase.decode('utf-8'))
        # buildWordTagDict(wordTagDict, spacyPhrase)
        posTaggedWordPhrase = ' '.join(w.lemma_ + '_' + wordPosTag(w)
                                       for w in spacyPhrase
                                       if w.lemma_ not in spacy.en.STOP_WORDS)
        if interaction_id not in spacyPhraseDict:
            spacyPhraseDict[interaction_id] = posTaggedWordPhrase

    # use spacy tags to create word-tag freq dictionary for the entire corpus
    wordFreqDict = wordListToFreqDict(list(spacyPhraseDict.values()))

    # now we can use the word-tag dict to start replacing words and reducing our sample space
    print('Reducing word feature space so as to reduce variants of words')
    rowNum = 0
    newWordCounter = 0
    for index, row in df.iterrows():
        phrase = row['transcript']
        interaction_id = row['interaction_id']
        nice_interaction_id = row['nice_interaction_id']
        # nice_interaction_id = '123'
        customer_id = row['customer_id']
        call_date = row['call_date']
        spacyPhrase = spacyPhraseDict[interaction_id]
        rowNum += 1
        print('Processing Row : %s' % rowNum)
        # calculate similarity and reduce feature space by replacing words using the
        # word2vec and wordnet intersection, building the sentence one word at a time
        sen = []
        for taggedWord in spacyPhrase.split():
            wordKey = taggedWord  # already in the key format, can be used later
            baseWord, tag = taggedWord.split('_')
            # make sure we see each word only once
            if wordKey in wordFreqDict and baseWord != '' and tag != '':
                wordFreq = wordFreqDict[wordKey]
                # first check the replaceDict; if this key is already present, replace and skip the iteration
                if wordKey in replaceDict:
                    replaceWord = replaceDict[wordKey]
                    w, t = replaceWord.split('_')
                    sen.append(w)
                    continue
                # continue if no replacement word found in the replaceDict
                newWordCounter += 1
                # word that is not present in the model or is a unique word
                if not wordnet.synsets(baseWord, tag) or baseWord not in model.wv.vocab:
                    sen.append(baseWord)
                    replaceDict[wordKey] = wordKey
                # getting best replacement from word2vec context; note that synset only gets the pos tag synonyms,
                # hence even though we don't fetch word2vec context we should be finding the correct replacement word
                else:
                    synonyms = wordnet.synsets(baseWord, tag)
                    lemmas = set(chain.from_iterable([word.lemma_names() for word in synonyms]))
                    synlist = [str(term) for term in lemmas]
                    model_similar_words = model.most_similar(positive=[baseWord], topn=10)
                    modellist = []
                    for item in model_similar_words:
                        item = str(item)
                        item = item.replace('\'', '')
                        item = item.replace('(', '')
                        item = item.replace(')', '')
                        term, score = item.split(',')
                        modellist.append(term)
                    # common words between synsets and w2vec may be more than one
                    commonWord = [word for word in modellist if word in synlist]
                    if len(commonWord) != 0:
                        score = -1
                        for word in commonWord:
                            key = word + '_' + tag
                            score = wordFreq
                            if key in wordFreqDict:
                                score_ = wordFreqDict[key]
                            else:
                                score_ = score
                            if score_ > score:
                                score = score_
                                maxScoreWord = word
                        if score > wordFreq:
                            sen.append(maxScoreWord)
                            # word substitution happening, cache it in the dict
                            replaceDict[wordKey] = maxScoreWord + '_' + tag
                            print('Replacing word :: ' + wordKey + ' with :: ' + maxScoreWord)
                        else:
                            # substitution not happening for the word, add original to the dict
                            sen.append(baseWord)
                            replaceDict[wordKey] = wordKey
                    # nothing common was found with w2vec, pick up best replacement from synsets
                    else:
                        '''
                        # pick max frequency from the wordfreqdict
                        for syn in lemmas:
                            synKey = syn + '_' + tag
                            # word-tag distribution likely to follow zipf's law
                            if synKey in wordFreqDict and wordFreqDict[synKey] > wordFreq:
                                # replace the word
                                wordFreq = wordFreqDict[synKey]  # if there is a different higher freq word in synset
                                replaceDict[wordKey] = synKey
                                baseWord = syn
                        '''
                        sen.append(baseWord)
                        if wordKey not in replaceDict:
                            replaceDict[wordKey] = wordKey

        # phrase = ' '.join(wordnet.synsets(w)[0].lemmas()[0].name() for w in phrase.split())
        phrase = ' '.join(w for w in sen)
        phrase = phrase.replace('-PRON-_n', '')
        phrase = phrase.replace(' whatev ', ' ')
        phrase = re.sub(r'\b\w{1,2}\b', '', phrase)
        phrase = phrase.replace('\'', '')
        phrase = phrase.replace('  ', ' ')
        phrase = phrase.strip()

        # collecting all clean lemmatized phrases by interaction id in a dataframe; this df will be used with the gensim phraser
        df_clean_trans.append_row(index, {'interaction_id': interaction_id,
                                          'nice_interaction_id': nice_interaction_id,
                                          'customer_id': customer_id,
                                          'call_date': call_date,
                                          'phrase': phrase})
        # except:
        #     print('Encountered issue with Spacy unicode token!!')

    # modify this function to directly read data from disk using the LineSentences function of Gensim
    # function to build gensim phraser
    # convert to normal dataframe
    print('New words seen :: ' + str(newWordCounter))
    print('Num of words replaced : ' + str(len(replaceDict.keys())))
    # pickle the dictionary:
    # with open('../REPLACE_DICT_PICKLE', 'wb') as handle:
    #     pickle.dump(replaceDict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    data_dict = df_clean_trans.to_dict(index=False)
    df_clean_trans = pd.DataFrame(data_dict,
                                  columns=df_clean_trans.columns,
                                  index=df_clean_trans.index)
    df_phrased = phraseBuilder(df_clean_trans)
    # feed tokenized transcript through the POS context extractor logic, which will create the final file for LDA input
    chunkProcessedTrans(df_phrased)
    # write gensim phrase to file
    # nlpOutput(df_phrased)
    end_time = datetime.datetime.now()
    print('Completed process in ::' + str(end_time - start_time))
    exit(0)
def spacy_tokenizer(text):
    # note: this reloads the spaCy model on every call, which is slow for large corpora
    nlp = en_core_web_sm.load()
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc]
    tokens = [i for i in tokens if i != '-PRON-']
    return tokens
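# Usage sketch (not part of the original snippet): caching the pipeline in a
# module-level variable avoids reloading the model on every call while keeping
# the same lemmatized output.
_nlp = en_core_web_sm.load()

def cached_spacy_tokenizer(text):
    # same behaviour as spacy_tokenizer(), reusing the already-loaded pipeline
    return [t.lemma_ for t in _nlp(text) if t.lemma_ != '-PRON-']

# cached_spacy_tokenizer("The cats were running")  # e.g. ['the', 'cat', 'be', 'run']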
# coding: utf-8
from datetime import datetime, timedelta

import en_core_web_sm
from flask import g, render_template, request, jsonify, make_response
from flask_sqlalchemy import SQLAlchemy
from sqlalchemy.orm import relationship
from flask_oauthlib.provider import OAuth2Provider
from flask_oauthlib.contrib.oauth2 import bind_sqlalchemy
from flask_oauthlib.contrib.oauth2 import bind_cache_grant

from Pipelines import ner, ner_negation

# initialize spacy for preprocessing
spacy_model = en_core_web_sm.load()

# initialize SQLAlchemy instance to hold users
db = SQLAlchemy()


class User(db.Model):
    id = db.Column(db.Integer, primary_key=True)
    username = db.Column(db.String(40), unique=True, index=True, nullable=False)

    def check_password(self, password):
        return True


class Client(db.Model):
    id = db.Column(db.Integer, primary_key=True)
#!/usr/bin/env python
"""Create annotation tiers using spacy"""
from annotator import Annotator, AnnoSpan, AnnoTier
import re
import en_core_web_sm as spacy_model

spacy_nlp = spacy_model.load()


class TokenSpan(AnnoSpan):
    def __init__(self, token, doc):
        self.doc = doc
        self.start = token.idx
        self.end = token.idx + len(token)
        self.label = token.text
        self.token = token


class SentSpan(AnnoSpan):
    def __init__(self, span, doc):
        self.doc = doc
        self.start = span.start_char
        self.end = span.end_char
        self.label = span.text
        self.span = span


class SpacyAnnotator(Annotator):
    def annotate(self, doc):
        ne_spans = []
        token_spans = []
        ne_chunk_start = None
def load_mode(model_name):
    if model_name == 'en':
        return enmodel.load()
    return None
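# Usage sketch (assumption: `enmodel` is en_core_web_sm imported under that alias,
# e.g. `import en_core_web_sm as enmodel`; the import is not shown in the snippet).
# nlp = load_mode('en')   # returns the loaded English pipeline
# nlp = load_mode('de')   # any other model name returns None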
import sys
sys.path.insert(0, "..")
import os

import en_core_web_sm

from utils.data_preprocess.annotation_loader import AnnotationLoader
from utils.params import TRAIN_ANNOTATION_SRC, DEV_ANNOTATION_SRC

NLP = en_core_web_sm.load()


def _get_np_root(np_phrase):
    for w in NLP(np_phrase):
        if w.dep_ == "ROOT":
            return w.text


# Because the gold annotation labels and the NP chunks can differ slightly
# (e.g. the gold value is "Air Line Pilots Association" while the NP chunk is
# "The Air Line Pilots Association"), we decide whether they refer to the same
# object by comparing the ROOT word of the NP chunk / gold value.
def _get_label(labels, sent_id, per, org):
    label = 0
    org_root = _get_np_root(org)
    # loop over a list of [ .. (per, org, org_root) .. ]
    for sent_per, sent_org, sent_org_root in labels.get(sent_id, []):
        # check similarity between PERSON and ORGANIZATION-ROOT
        if _get_np_root(per) == _get_np_root(sent_per) and org_root == sent_org_root:
            label = 1
            break
    return label
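# Illustrative calls (not from the original file), reusing the example in the
# comment above: both phrasings share the same ROOT token, so they are treated
# as referring to the same organization.
# _get_np_root("Air Line Pilots Association")       # expected: "Association"
# _get_np_root("The Air Line Pilots Association")   # expected: "Association"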
import en_core_web_sm
from spacy.matcher import Matcher

nlp = en_core_web_sm.load()  # load the pretrained English spaCy model
pPt = Matcher(nlp.vocab)  # create a matcher that we can add patterns to

# ------------------------- patterns being added to the matcher ----------------------------

# pattern to recognize "(name) (optional ,) (optional the) (any lemmatized title)"
pPt.add("pat1", None, [
    {"POS": "PROPN"},
    {"POS": "PUNCT", "OP": "?"},
    {"POS": "DET", "OP": "?"},
    {"LEMMA": {"IN": [
        "director", "engineer", "governor", "mayor", "manager", "official",
        "CEO", "COO", "commissioner", "spokesperson", "spokeswoman",
        "spokesman", "representative", "chief", "coordinator"
    ]}}
])

# pattern to recognize "(lemmatized verb) (optional noun) (any lemmatized title)"
pPt.add("pat2", None, [
    {"LEMMA": {"IN": ["announce", "hazard", "say", "stated", "issued", "warned"]}},
    {
# POST request demo
from fastapi import FastAPI
from pydantic import BaseModel
import spacy
import en_core_web_sm

en_core_web_sm.load()
nlp = spacy.load("en_core_web_sm")

app = FastAPI()


@app.get('/test/a={a}/b={b}')
def calculate(a: int = None, b: int = None):
    c = a + b
    res = {"res": c}
    return res


@app.get('/')
def hello():
    return "Hello1"


class Item(BaseModel):
    POS: str = None


@app.post('/test')
def calculate(request_data: Item):
    if request_data.POS:
        doc = nlp(request_data.POS)
def preprocess(self):
    self.logger.info("****Preparing dataset****")
    if cfg.EMBS_TYPE == 'glove':
        embedding_reader = GloVeReader()
    elif cfg.EMBS_TYPE == 'fasttext':
        embedding_reader = FastTextReader()
    else:
        raise ValueError(f"Unsupported embeddings type {cfg.EMBS_TYPE}")
    pretrained_vectors = embedding_reader.read(cfg.EMBS_FILE)
    vocab = Vocab(embedding_reader.START, embedding_reader.END,
                  embedding_reader.PAD, embedding_reader.UNK,
                  cfg.CSEQ_LEN, cfg.QSEQ_LEN)
    pardir = os.path.dirname(cfg.VOCAB_SAVE)
    if not os.path.exists(pardir):
        os.makedirs(pardir)
    ner = NERTagger(cfg.NER_TAGS_FILE, cfg.CSEQ_LEN)
    pos = PosTagger(cfg.POS_TAGS_FILE, cfg.CSEQ_LEN)
    self.nlp = en_core_web_sm.load()

    train = tfds.load("squad", data_dir="/tf/data/tf_data", split='train')
    AUTOTUNE = tf.data.experimental.AUTOTUNE
    train_context = train.map(lambda x: x['context'], num_parallel_calls=AUTOTUNE)
    train_question = train.map(lambda x: x['question'], num_parallel_calls=AUTOTUNE)
    train_ans = train.map(lambda x: x['answers']['text'][0], num_parallel_calls=AUTOTUNE)
    test = tfds.load("squad", data_dir="/tf/data/tf_data", split='validation')
    test_context = test.map(lambda x: x['context'], num_parallel_calls=AUTOTUNE)
    test_question = test.map(lambda x: x['question'], num_parallel_calls=AUTOTUNE)
    test_ans = test.map(lambda x: x['answers']['text'][0], num_parallel_calls=AUTOTUNE)

    mr = MapReduce()
    self.logger.info("****Preparing training split****")
    train_context = train_context.as_numpy_iterator()
    train_context = mr.process(self.utf8_decoder, train_context)
    train_question = train_question.as_numpy_iterator()
    train_question = mr.process(self.utf8_decoder, train_question)
    train_ans = train_ans.as_numpy_iterator()
    train_ans = mr.process(self.utf8_decoder, train_ans)

    self.logger.info("****Tokenizing training split****")
    train_context = self.nlp.pipe(train_context, batch_size=128, n_process=6)
    train_question = self.nlp.pipe(train_question, batch_size=128, n_process=6)
    train_ans = self.nlp.pipe(train_ans, batch_size=128, n_process=6)
    self.logger.info("****Tokenized training split****")

    training_context = []
    training_question = []
    training_ans = []
    for context, ques, ans in zip(train_context, train_question, train_ans):
        ans_start, al = self.substrSearch(ans, context)
        ans_start += 1
        if len(ques) >= 20 or ans_start == -1 or ans_start + al >= 250:
            continue
        training_context.append(context)
        training_question.append(ques)
        ans = np.zeros(cfg.CSEQ_LEN, dtype=np.uint8)
        ans[ans_start:ans_start + al] = 1
        training_ans.append(ans)
    self.logger.info("****Filtered training split****")

    vocab.fit(training_context, training_question, pretrained_vectors, 0, 0)
    vocab.save(cfg.VOCAB_SAVE)
    train_cidx = vocab.transform(training_context, "source")
    train_ner = ner.transform(training_context)
    train_pos = pos.transform(training_context)
    train_qidx = vocab.transform(training_question, "target")
    cseq = cfg.CSEQ_LEN
    qseq = cfg.QSEQ_LEN

    def gen():
        for cidx, ner, pos, qidx, ans in zip(train_cidx, train_ner, train_pos,
                                             train_qidx, training_ans):
            yield (cidx, ans, qidx, ner, pos)

    train_dataset = tf.data.Dataset.from_generator(
        gen, (tf.int32, tf.uint8, tf.int32, tf.uint8, tf.uint8),
        (tf.TensorShape([cseq]), tf.TensorShape([cseq]), tf.TensorShape([qseq]),
         tf.TensorShape([cseq]), tf.TensorShape([cseq])))

    self.logger.info("****Preparing test split****")
    test_context = test_context.as_numpy_iterator()
    test_context = mr.process(self.utf8_decoder, test_context)
    test_question = test_question.as_numpy_iterator()
    test_question = mr.process(self.utf8_decoder, test_question)
    test_ans = test_ans.as_numpy_iterator()
    test_ans = mr.process(self.utf8_decoder, test_ans)

    self.logger.info("****Tokenizing test split****")
    test_context = self.nlp.pipe(test_context, batch_size=128, n_process=6)
    test_question = self.nlp.pipe(test_question, batch_size=128, n_process=6)
    test_ans = self.nlp.pipe(test_ans, batch_size=128, n_process=6)
    self.logger.info("****Tokenized test split****")

    testing_context = []
    testing_question = []
    testing_ans = []
    for context, ques, ans in zip(test_context, test_question, test_ans):
        ans_start, al = self.substrSearch(ans, context)
        ans_start += 1
        if len(ques) >= 20 or ans_start == -1 or ans_start + al >= 250:
            continue
        testing_context.append(context)
        testing_question.append(ques)
        ans = np.zeros(cfg.CSEQ_LEN, dtype=np.uint8)
        ans[ans_start:ans_start + al] = 1
        testing_ans.append(ans)
    self.logger.info("****Filtered test split****")

    test_cidx = vocab.transform(testing_context, "source")
    test_ner = ner.transform(testing_context)
    test_pos = pos.transform(testing_context)
    test_qidx = vocab.transform(testing_question, "target")
    cseq = cfg.CSEQ_LEN
    qseq = cfg.QSEQ_LEN

    def gen():
        for cidx, ner, pos, qidx, ans in zip(test_cidx, test_ner, test_pos,
                                             test_qidx, testing_ans):
            yield (cidx, ans, qidx, ner, pos)

    test_dataset = tf.data.Dataset.from_generator(
        gen, (tf.int32, tf.uint8, tf.int32, tf.uint8, tf.uint8),
        (tf.TensorShape([cseq]), tf.TensorShape([cseq]), tf.TensorShape([qseq]),
         tf.TensorShape([cseq]), tf.TensorShape([cseq])))

    train_dataset = train_dataset.map(self.make_example, num_parallel_calls=-1)
    test_dataset = test_dataset.map(self.make_example, num_parallel_calls=-1)
    self.save(train_dataset, test_dataset)
    self.logger.debug(f"Memory freed: {gc.collect()}")