def process(sentence, doc_2, answer):
    # Processes user input and outputs the correct response
    invalid_responses = [
        "I do not understand the question",
        "That question is not in my database",
        "I cannot answer that question",
        "I am not familiar with that question",
        "I am sorry. Could you please ask another question?"
    ]
    similarity_index = 0
    index = 0
    nlp = en_core_web_lg.load()
    doc_1 = nlp(preprocess(sentence))
    similarity = 0
    for i in range(len(doc_2)):
        if doc_2[i].vector_norm and doc_1.vector_norm:
            similarity = doc_1.similarity(doc_2[i])
            if similarity > similarity_index:
                similarity_index = similarity
                index = i
    if similarity_index > 0.60:
        return answer[index]
    else:
        return random.choice(invalid_responses)

def prepare_mag_data(base_dir):
    print("reading file")
    mag_file = os.path.join(base_dir, "mag_subset.txt")
    mag_df = pd.read_csv(mag_file, sep="\t")
    samples = []
    print("file read in")

    # prepare tokenization functions
    nlp = en_core_web_lg.load()
    tokenizer = Tokenizer(nlp.vocab)
    print("vocab loaded")

    # take samples with at least 10 words in citation context
    for index, row in mag_df.iterrows():
        context = row['citationcontext']
        text = re.sub("[" + re.escape(string.punctuation) + "]", " ", context)
        text = [token.lemma_ for token in tokenizer(text) if not token.like_num]
        text = [token for token in text if token.strip()]
        if len(text) < MIN_CONTEXT_LENGTH:
            continue

        # generate sample in correct format
        sample = {"context": context,
                  "authors_citing": row['citingauthors'],
                  "title_cited": row['citedtitle'],
                  "authors_cited": row['citedauthors'],
                  "year": row['year']}
        samples.append(pd.DataFrame(sample, index=[0]))

    print("processing done")
    logger.info("mag samples ready to load to file...")
    dataset = pd.concat(samples, axis=0)
    save_path = os.path.join(base_dir, "mag_data.csv")
    dataset.to_csv(save_path, sep="\t", compression=None, index=False, index_label=False)
    print("done")

def calculate_similarity(src_files, bug_reports):
    # Loading word vectors
    nlp = en_core_web_lg.load()

    src_docs = [nlp(' '.join(src.file_name['unstemmed'] + src.class_names['unstemmed']
                             + src.attributes['unstemmed'] + src.comments['unstemmed']
                             + src.method_names['unstemmed']))
                for src in src_files.values()]

    min_max_scaler = MinMaxScaler()

    all_simis = []
    for report in bug_reports.values():
        report_doc = nlp(' '.join(report.summary['unstemmed']
                                  + report.pos_tagged_description['unstemmed']))
        scores = []
        for src_doc in src_docs:
            simi = report_doc.similarity(src_doc)
            scores.append(simi)

        scores = np.array([float(count) for count in scores]).reshape(-1, 1)
        normalized_scores = np.concatenate(min_max_scaler.fit_transform(scores))
        all_simis.append(normalized_scores.tolist())

    return all_simis

def vectorizer(question):
    # Turns questions into a vectorized list
    question_list = []
    nlp = en_core_web_lg.load()
    for index in range(len(question)):
        vectorized_question = nlp(preprocess(question[index]))
        question_list.append(vectorized_question)
    return question_list

def clean_mag_data(dataframe, save_path):
    samples = []

    # prepare tokenization functions
    # nlp = spacy.load("en_core_web_lg")  # did not work here, so load via the model package instead
    nlp = en_core_web_lg.load()
    tokenizer = Tokenizer(nlp.vocab)

    # take samples with at least 10 words in citation context
    for index, row in dataframe.iterrows():
        context = row['context']
        text = re.sub("[" + re.escape(string.punctuation) + "]", " ", context)
        text = [token.lemma_ for token in tokenizer(text) if not token.like_num]
        text = [token for token in text if token.strip()]
        if len(text) < MIN_CONTEXT_LENGTH:
            continue

        # generate sample in correct format
        sample = {"context": context,
                  "authors_citing": row['authors_citing'],
                  "title_cited": row['title_cited'],
                  "authors_cited": row['authors_cited']}
        samples.append(pd.DataFrame(sample, index=[0]))

    logger.info("mag samples ready to load to file...")
    dataset = pd.concat(samples, axis=0)
    dataset.to_csv(save_path, sep="\t", compression=None, index=False, index_label=False)

def semantic_similarity(word_set):
    # Pair up words whose vector similarity is at least 0.8.
    nlp = en_core_web_lg.load()
    word_vec = [nlp(word) for word in word_set]
    word_similarity = dict()
    for i in range(len(word_vec)):
        for j in range(i):
            similarity = word_vec[i].similarity(word_vec[j])
            # print(word_vec[i], word_vec[j], similarity)
            if similarity >= 0.8:
                if word_vec[i] not in word_similarity:
                    word_similarity[word_vec[i]] = list()
                word_similarity[word_vec[i]].append(word_vec[j])
                if word_vec[j] not in word_similarity:
                    word_similarity[word_vec[j]] = list()
                word_similarity[word_vec[j]].append(word_vec[i])
    return word_similarity

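# Usage sketch for semantic_similarity (the word set below is made up; whether a given
# pair crosses the 0.8 threshold depends on the en_core_web_lg vectors):
groups = semantic_similarity({"car", "automobile", "banana"})
for doc, similar_docs in groups.items():
    print(doc.text, "->", [d.text for d in similar_docs])
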
class SpacyEntityExtractor:
    nlp = en_core_web_lg.load()
    # spaCy entity labels are uppercase; 'Product'/'Event' would never match, so use
    # 'PRODUCT'/'EVENT' here.
    accepted_entity_types = ['PERSON', 'GPE', 'ORG', 'PRODUCT', 'EVENT']

    def process_item(self, item, _):
        text = item.get('text')
        doc = self.nlp(text)
        entities_in_doc = {t: [] for t in self.accepted_entity_types}
        for entity in doc.ents:
            label = entity.label_
            if label not in entities_in_doc:
                continue
            entities_in_doc[label].append({
                'spacy_entity': entity.orth_,
                'spacy_position_start': entity.start_char,
                'spacy_position_end': entity.end_char,
            })
        item['spacy_entities'] = entities_in_doc
        return item

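# Usage sketch for SpacyEntityExtractor (the item dict and text are hypothetical; the
# second argument of process_item is unused here, so None is passed):
extractor = SpacyEntityExtractor()
result = extractor.process_item({'text': 'Satya Nadella spoke for Microsoft in Redmond.'}, None)
print(result['spacy_entities']['PERSON'], result['spacy_entities']['ORG'], result['spacy_entities']['GPE'])
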
def main():
    args = parse()
    run = check_if_should_run(args)
    if run:
        nlp = en_core_web_lg.load()
        path = "../lyrics/" + args.dataset + "/"
        if args.dataset != "GBDS":
            data = import_artist_files(path)
            datas = split_train_dev_test(data)
            datas = {
                data_type: form_x_of_songs_and_verses(data)
                for data_type, data in datas.items()
            }
        else:
            datas = create_datasets_for_GBDS(path)
        if "CADS" in args.dataset:
            datas = {'train': datas['train'] + datas['dev'] + datas['test']}
            data = import_duo_artist_file(path)
            data = convert_to_verse_classification_duo_artist(data)
            data = preprocess(data, nlp)
            write_to_csv(data, "test", args.dataset)
        datas = {
            data_type: preprocess(data, nlp)
            for data_type, data in datas.items()
        }
        for data_type, data in datas.items():
            write_to_csv(data, data_type, args.dataset)
        print("Successfully preprocessed all lyrics to .../datasets/"
              + args.dataset + "_'train/dev/test'.csv")

def tokenize_ft_extraction(df: pd.DataFrame, col_name: str) -> pd.DataFrame:
    """
    Takes a dataframe and the name of a text column and applies the following to the
    text: remove punctuation; convert to lower case; tokenize, replacing numbers with
    their entity type; lemmatize.

    Args:
        df (pd.DataFrame): dataframe to be transformed
        col_name (str): column name of the text

    Returns:
        pd.DataFrame: transformed dataframe
    """
    punctuation = string.punctuation
    df[col_name] = df[col_name].str.replace('[' + punctuation + ']', '', regex=True)
    df[col_name] = df[col_name].str.lower().str.strip()

    nlp = en_core_web_lg.load()
    nlp.add_pipe(merge_entities)

    df[col_name] = df[col_name].apply(_regex_clean)

    lemmatized_text = []
    df['ents_rep'] = None
    df['vocab'] = None
    df['ppo_rep'] = None
    df['no_ents_text'] = None

    for idx, text in enumerate(df[col_name]):
        doc = nlp(text)
        tokens = []
        ents = []
        texts = []
        ppo = []
        for token in doc:
            if token.lemma_ == '-PRON-':
                tokens.append(token.text)
            elif not token.ent_type_:
                texts.append(token.text)
                tokens.append(token.lemma_)
            else:
                tokens.append(token.ent_type_)
                ents.append(token.text.lower())
                if token.ent_type_ in [*PERSON, *PLACE, *ORG]:
                    ppo.append(token.text.lower())
        lemmatized_text.append(tokens)
        df['ents_rep'][idx] = len(ents) / len(set(ents))
        df['vocab'][idx] = len(set(texts)) / len(texts)
        # np.exp(float('-inf')) evaluates to 0.0, so the original expression did not
        # actually guard against an empty ppo list; use a small epsilon instead.
        df['ppo_rep'][idx] = len(ppo) / (len(set(ppo)) + 1e-8)
        df['no_ents_text'][idx] = ' '.join(texts)

    df['lem_text'] = lemmatized_text
    df = _feature_extraction(df, 'lem_text')
    return df

def getModelWithAbbrQRAndSpeller():
    nlp = model_lg.load()
    return CliNlpModel("lgd_lgm_abbrqr_speller", getIntentSet(), nlp,
                       rewriteDataQuery=rewriteAbbrInQuery,
                       rewriteUserQuery=combineQueryRewriters(
                           [rewriteAbbrInQuery, correctSpellingErrors]))

def load(self):
    if self._nlp is None:
        print("Create nlpWithAzureResourceRecognizer")
        nlp = model_lg.load()
        azureResourceRecognizer = AzureResourceRecognizer(nlp)
        nlp.add_pipe(azureResourceRecognizer, last=True)
        self._nlp = nlp
    return self._nlp

def main():
    news = ('Rancho Mirage, a 310-unit multifamily property located in the Las Colinas '
            'master-planned community, recently underwent $2 million in property '
            'improvements to overhaul units and amenities for 3 Columbus Circle.')
    nlp = en_core_web_lg.load()
    doc = nlp(news)
    # merge each noun chunk into a single token (legacy pre-spaCy-3 merge API)
    for np in list(doc.noun_chunks):
        np.merge(np.root.tag_, np.root.lemma_, np.root.ent_type_)
    for ent in doc.ents:
        print(ent.text, ent.label_, ent.lemma_, ent.root.ent_type_)

def main():
    # load the GloVe model with 300 dimensions
    nlp = en_core_web_lg.load()
    with open("output/analogy.txt", 'w') as f:
        f.write(print_analogy("king", "man", "queen", nlp) + '\n')
        f.write(print_analogy("London", "England", "Paris", nlp) + '\n')
        f.write(print_analogy("Dog", "Puppy", "Cat", nlp) + '\n')
        f.write(print_analogy("Sister", "Brother", "Aunt", nlp) + '\n')
        f.write(print_analogy("Slow", "Slower", "Fast", nlp) + '\n')

def getModelWithAbbrQRAndSpeller():
    nlp = model_lg.load()
    return CliNlpModel("lgd_lgm_abbrqr_speller", getAllAsQueries, data.cliData, nlp,
                       rewriteDataQuery=rewriteAbbrInQuery,
                       rewriteUserQuery=lambda q: correctSpellingErrors(rewriteAbbrInQuery(q)))

def get_model():
    """ Lazy initializer of model """
    global model
    if model is None:
        print("Loading model...")
        model = en_core_web_lg.load()
    return model

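# Usage sketch for the lazy initializer above (assumes the module defines the
# module-level cache `model = None`, as shown here; later calls reuse the cached pipeline):
model = None

nlp = get_model()        # first call prints "Loading model..." and loads en_core_web_lg
nlp_again = get_model()  # subsequent calls return the cached instance without reloading
assert nlp is nlp_again
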
def main(argv):
    start_time = time.time()

    # Config
    csv_file_name = 'user_queries.csv'
    data_target_name = 'query.pickle'
    x_index = 0
    y_index = 1
    keyword_index = 2
    # query_index = 16
    # Let's take a query from the query file
    query_index = int(argv[0])
    keyword_delimiter = ' '
    csv_delimiter = ';'
    csv_quotechar = '"'
    file_allow_overwrite = True

    # Code
    if query_index <= 0:
        print('The query row has to be positive.')
    else:
        print('Loading CSV', csv_file_name)
        data = load_csv(file_name=csv_file_name,
                        x_coordinate_index=x_index,
                        y_coordinate_index=y_index,
                        keywords_index=keyword_index,
                        keywords_delimiter=keyword_delimiter,
                        delimiter=csv_delimiter,
                        quotechar=csv_quotechar,
                        max_read_length=query_index + 1,
                        query_load=True)
        if len(data) > 0:
            print('Query Datapoint:',
                  data[query_index - 1].coordinates.x,
                  data[query_index - 1].coordinates.y,
                  data[query_index - 1].keywords)
            write_pickle(data=data[query_index - 1],
                         file_name=data_target_name,
                         file_allow_overwrite=file_allow_overwrite)
        else:
            print('Could not load any data.')

        nlp = en_core_web_lg.load()
        df_poi_encoded = pd.read_csv(os.path.dirname(os.path.abspath(__file__))
                                     + '/../files/' + 'poi_keywords_encoded.csv',
                                     index_col='poi_name', encoding='utf-8')
        # print(df_poi_encoded)

        for kw in data[query_index - 1].keywords:
            df_poi_encoded[kw] = df_poi_encoded.apply(
                lambda row: nlp(row['nlp_keywords_encoded']).similarity(nlp(kw)),
                axis=1)

        df_poi_encoded.to_csv(os.path.dirname(os.path.abspath(__file__))
                              + '/../files/' + 'poi_queries_similarities.csv',
                              encoding='utf-8')

    print("--- %s seconds ---" % (time.time() - start_time))

def get_spacy_nlp():
    """Handles lazy loading of the Spacy NLP model.

    Returns:
        The loaded Spacy model.
    """
    global _nlp
    if _nlp is None:
        _nlp = en_core_web_lg.load()
    return _nlp

def wordEmbedding(question):
    # change all questions in the corpora to vectors and store in a list
    embeddingList = []
    nlp = en_core_web_lg.load()
    for x in range(len(question)):
        doc = nlp(preprocess(question[x]))
        embeddingList.append(doc)
    return embeddingList

def getModelWithAzureResourceRecognizer():
    nlp = model_lg.load()
    intentSet = getIntentSet()
    azureResourceRecognizer = AzureResourceRecognizer(nlp)
    nlp.add_pipe(azureResourceRecognizer, last=True)
    return CliNlpModel("lgd_lgm_azRecognizer", intentSet, nlp, rewriteAbbrInQuery,
                       preProcessDoc=updateDocVector)

def wordEmbedding(question):
    # change all questions in the corpora to vectors and store in a list
    embeddingList = []
    nlp = en_core_web_lg.load()
    for x in range(len(question)):
        doc = nlp(preprocess(question[x]))
        pre = [doc]
        pre.append(prep.findsenti(question[x]))  # also include the sentiment
        embeddingList.append(pre)
    return embeddingList

def getModelWithAbbrQrStopsQrAndSpeller():
    nlp = model_lg.load()
    intentSet = getIntentSet()
    return CliNlpModel("lgd_lgm_abbrQrStopsQrAndSpeller", intentSet, nlp,
                       rewriteDataQuery=combineQueryRewriters(
                           [rewriteAbbrInQuery, rewriteStopWords]),
                       rewriteUserQuery=combineQueryRewriters(
                           [rewriteAbbrInQuery, rewriteStopWords, correctSpellingErrors]))

def generate(intputSen, doc2, answer):
    index = 0
    nlp = en_core_web_lg.load()
    doc1 = nlp(preprocess(intputSen))
    inputsenti = prep.findsenti(intputSen)
    similarity = 0
    bestlist = []
    for x in range(len(doc2)):
        if doc2[x][0].vector_norm and doc1.vector_norm:
            # compare the input sentence with the questions stored in the list
            similarity = doc1.similarity(doc2[x][0])
            # 0.60 is the threshold: the higher this value, the more similar the input
            # must be to a question in the corpora before it counts as a match
            if similarity > 0.60:
                index = x
                bestlist.append([similarity, index, doc2[x][1]])
    if len(bestlist) == 0:
        # at least 5 different reasonable responses when the user enters something
        # outside the two topics
        listReply = [
            'Sorry, your question is not included in my database',
            'Sorry, I do not know how to reply to that',
            'Whoops! My brain is dead, maybe next question',
            'Pass that, bro, I cannot remember',
            'This question is too difficult, next question please',
            'Your question is hard for me, sorry about that'
        ]
        replyOutsideTopic = random.choice(listReply)
        print(replyOutsideTopic)
        return replyOutsideTopic
    sortedanswer = sorted(bestlist, key=operator.itemgetter(0))
    if len(sortedanswer) == 1:
        print(answer[sortedanswer[0][1]])
        return answer[sortedanswer[0][1]]
    else:
        if sortedanswer[-1][0] != sortedanswer[-2][0]:
            print(answer[sortedanswer[-1][1]])
            return answer[sortedanswer[-1][1]]
        else:
            # if the top 2 answers have the same similarity, fall back to the sentiment
            if abs(sortedanswer[-1][2] - inputsenti) > abs(sortedanswer[-2][2] - inputsenti):
                print(answer[sortedanswer[-2][1]])
                return answer[sortedanswer[-2][1]]
            else:
                print(answer[sortedanswer[-1][1]])
                return answer[sortedanswer[-1][1]]

def train(train_df, dev_df, lstm_shape, lstm_settings, model_name,
          batch_size=100, nb_epoch=15):
    logger.info("Loading spaCy")
    nlp = en_core_web_lg.load()  # spacy.load('en_vectors_web_lg')
    nlp.add_pipe(nlp.create_pipe('sentencizer'))
    embeddings = get_embeddings(nlp.vocab)
    if model_name == "lstm":
        model = compile_lstm(embeddings, lstm_shape, lstm_settings)
    elif model_name == "lstm_with_attention":
        model = compile_lstm_attention(embeddings, lstm_shape, lstm_settings)
    else:  # model_name == "lstm_with_visualization"
        model = compile_visualizable_lstm_attention(embeddings, lstm_shape, lstm_settings)

    tensorboard_dir = os.path.join(logdir, "checkpoints")
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)
    tbCallBack = TensorBoard(log_dir=tensorboard_dir, histogram_freq=0, write_graph=True)

    logger.info("Start training...")
    model.fit_generator(
        DataGenerator(train_df,
                      nlp=nlp,
                      batch_size=batch_size,
                      max_sentences=lstm_shape["max_sentences"],
                      max_sentence_length=lstm_shape["max_sentence_length"],
                      shuffle=True),
        validation_data=DataGenerator(dev_df,
                                      nlp=nlp,
                                      batch_size=batch_size,
                                      max_sentences=lstm_shape["max_sentences"],
                                      max_sentence_length=lstm_shape["max_sentence_length"],
                                      shuffle=False),
        steps_per_epoch=int(np.floor(len(train_df) / batch_size)),
        validation_steps=int(np.floor(len(dev_df) / batch_size)),
        nb_epoch=nb_epoch,
        callbacks=[tbCallBack])
    return model, nlp

def main():
    # read in human judgement scores
    fname = "data/wordsim-353.txt"
    df = read_human_judgements(fname)

    # load the GloVe model with 300 dimensions
    nlp = en_core_web_lg.load()

    # determine similarity scores using cosine similarity of embeddings
    df['score_embeddings'] = df.apply(
        lambda row: similarity_score(row['word1'], row['word2'], nlp), axis=1)

    # write results to file
    outfile = "output/word_similarity.txt"
    generate_output(df, outfile)

def annotate_DEP(dataset_file):
    TRAIN_DATA = []
    model = en_core_web_lg.load()
    with open(dataset_file, 'r') as cve_dataset_f:
        cve_reader = csv.DictReader(cve_dataset_f, delimiter=';')
        for cve in cve_reader:
            tagged_desc = model(unicode(cve['Avail.']))  # unicode(): this snippet targets Python 2
            heads = [tok.head.i for tok in tagged_desc]
            deps = ['-'] * len(heads)
            TRAIN_DATA += [[cve['Avail.'], {'heads': heads, 'deps': deps}]]
    with open('annotated_{}_DEP_train.json'.format(dataset_file.replace('.csv', '')),
              'w') as annotated_f:
        json.dump(TRAIN_DATA, annotated_f)

def get_people_orgs_batch(court="Chancery", jx="Delaware", model="large",
                          write=True, overwrite=False):
    """Gets the people and orgs from opinions for an entire court at a time.
    Options allow saving to the database."""
    bigDict = {}
    db, ct = check_court_jx(court, jx)
    if not db:
        return
    docs = db.objects.filter(Court__exact=ct)
    print("Total records:", len(docs))
    print("Done: ", end="")
    # load the spaCy model once, outside the document loop, rather than per document
    nlp = en_core_web_lg.load()
    for i, doc in enumerate(docs):
        if i % 10 == 0:
            print(i, ". . . ", end="")
        # INITIAL CONDITION: IF OVERWRITE IS NOT SET
        if write and (not overwrite):
            if not ((doc.People == None) or (doc.People == "")) and \
                    ((doc.Organizations == None) or (doc.Organizations == "")):
                continue
        text = doc.MainText
        clean_text = clean_MT(text)
        cites = getDocCites(clean_text)
        full_cites = [getWholeCite(cite, clean_text) for cite in cites]
        processed_text = removeCites(clean_text, full_cites)
        people, orgs = people_orgs_batchNLP(processed_text, nlp)
        people = [p for p, _ in people]
        orgs = [o for o, _ in orgs]
        docID = doc.id
        if write:
            doc.People = json.dumps(people)
            doc.Organizations = json.dumps(orgs)
            doc.save()
        newDict = {"people": people, "orgs": orgs}
        bigDict[docID] = newDict
    print("Done")
    return bigDict

def criar_tabela_associacao_frases_recentes_com_tabela_de_frases_agrupada_usando_linguagem_natural_como_criterio(self):
    # Builds the association table between recent phrases and the grouped phrase table,
    # using natural-language (semantic) equivalence as the matching criterion.
    print("criar_tabela_associacao_frases_recentes_com_tabela_de_frases_agrupada_usando_linguagem_natural_como_criterio")
    self.nlp = en_core_web_lg.load()
    repositorio = PostagensRepository()
    frases_recentes = repositorio.listar_frases_recentes()
    frases = repositorio.listar_frases_com_tendencia()
    resultado = list()
    for frase in frases:
        frase_equivalente: str = self.equivalencia_semantica(frase, frases_recentes)
        if frase_equivalente is not None:
            nova = dict()
            nova["frase_recente"] = frase_equivalente
            nova["frase"] = frase
            resultado.append(nova)
    repositorio.insere_tabela_associacoes(resultado, "Equivalencia Semantica")

def get_most_similar_item(item, old_items):
    """
    Given the name of a new item and a list of names of old items,
    returns the index of the most similar old item.
    """
    nlp = en_core_web_lg.load()
    item_token = nlp(item.lower().replace('_', ' '))
    similarities = []
    for old_item in old_items:
        similarities.append(item_token.similarity(nlp(old_item.lower().replace('_', ' '))))
    similar_index = similarities.index(max(similarities))
    # similar_name = old_items[similar_index]
    return similar_index

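# Usage sketch for get_most_similar_item (the item names are made up; which index wins
# depends on the en_core_web_lg vectors):
new_item = "laptop_bag"
old_items = ["backpack", "coffee_mug", "desk_lamp"]
best_index = get_most_similar_item(new_item, old_items)
print(old_items[best_index])  # name of the old item closest to the new one in vector space
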
def categorize_jobs(self):
    # Predefined categories
    # Compare similarities of word embeddings
    # nlp = spacy.load('en_core_web_lg')
    nlp = en_core_web_lg.load()
    job_id = self.df2.loc[:, 'Job_Id'].tolist()[:self.training_range]
    job_titles = self.df2.loc[:, 'jobtitle'].tolist()[:self.training_range]
    job_descriptions = self.df2.loc[:, 'jobdescription'].tolist()[:self.training_range]
    final_cat = pd.DataFrame(index=job_id)
    # categories = ['Network Engineer', 'Application Development', 'Big Data', 'Data Analyst',
    #               'Software Developer', 'DevOps', 'Software Testing', 'Front End', 'Back End',
    #               'Full Stack', 'Web Development', 'Information Security', 'Mobile developer',
    #               'System Administrator', 'Business Analyst', 'Manager', 'Cloud']
    categories = ['Network Engineer', 'Full stack', 'QA/Test Developer', 'Enterprise application',
                  'DevOps', 'Mobile Developer', 'Back End', 'Database Administrator(DBA)',
                  'Front End', 'Game developer', 'System Administrator', 'Data Scientist',
                  'Business analyst', 'Sales professional', 'Product Manager',
                  'Information Security', 'Software Developer/Java Developer',
                  'Web Developer', 'Cloud Computing']
    for category in categories:
        final_cat[category] = np.nan
    for job_t_d in list(zip(job_id, job_titles, job_descriptions)):
        id_job = job_t_d[0]
        job_i = job_t_d[1]
        job_d = job_t_d[2]
        job_title = nlp(job_i.lower())
        job_description = nlp(job_d.lower())
        match_cat_title = dict()
        match_cat_description = dict()
        for category in categories:
            word = nlp(category.lower())
            match_cat_title[category] = job_title.similarity(word)
            match_cat_description[category] = job_description.similarity(word)
        match_cat_title = sorted(match_cat_title.items(), key=lambda x: x[1], reverse=True)
        match_cat_description = sorted(match_cat_description.items(), key=lambda x: x[1], reverse=True)
        # a is the best-matching category for the job title
        # if(match_cat_title[0][1] > 0.5 or match_cat_description[0][1] > 0.5):
        a = match_cat_title[0]
        # print(a)
        match_cat_description = list(filter(lambda x: self.check_threshold(match_cat_title, x),
                                            match_cat_description))
        if len(match_cat_description) != 0:
            print(match_cat_description)
            print(id_job)
            # b = match_cat_description[0]
            final_cat.loc[id_job, a[0]] = 1
            match_cat_description.extend([(match_cat_title[0][0], 1)])
            sum_proportion = sum([x[1] for x in match_cat_description])
            for ele in match_cat_description:
                final_cat.loc[id_job, ele[0]] = ele[1] / sum_proportion
        else:
            print(id_job)
            final_cat.loc[id_job, a[0]] = 1
    return final_cat

def __init__(self):
    """Constructor

    Load spacy model once
    """
    # Set log level
    loglevel = os.environ.get("LOG_LEVEL", "INFO")
    self.logger = logging.getLogger(__name__)
    self.logger.setLevel(loglevel)
    logging.getLogger('tldextract').setLevel(loglevel)

    # Caching top level domains
    tldextract.extract("")

    # Load spaCy lg model
    self.logger.info("Loading NLP model...")
    self.nlp = en_core_web_lg.load(disable=['parser', 'tagger'])