def match(inp, filename):
    # Note: str.replace removes these tokens anywhere they occur,
    # including inside longer words (e.g. the "in" of "inside").
    exclude = ["iit", "mandi", "!", "?", "-", ".", ",", "in", "at", "on"]
    for i in exclude:
        inp = inp.replace(i, "")
    data = pd.read_csv(filename)
    processedInp = lemmatize_sentence(inp.strip().lower())
    maxSim = -1
    ans = None
    bestQues = None
    for idx in range(data.shape[0]):
        processedQues = lemmatize_sentence(data.question[idx].strip().lower())
        sim = len(set(processedInp).intersection(set(processedQues)))
        if sim > maxSim:
            maxSim = sim
            bestQues = data.question[idx]
            ans = data.answers[idx]
    if maxSim == 0:
        return "Sorry, I cannot answer you... Some secrets are best not to be revealed :)"
    else:
        return ("Your question matched the following question in the database...\n"
                + bestQues + "\nAnswer: \n" + ans)
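A minimal, hypothetical usage sketch for match() above. The file name faq.csv and its question/answers columns are assumptions for illustration, not part of the original snippet, and pandas plus pywsd's lemmatize_sentence must be importable.

import pandas as pd
from pywsd.utils import lemmatize_sentence

# Assumed CSV layout (hypothetical):
# question,answers
# "Where is the campus located?","The campus is in Kamand."
print(match("Where is the campus located?", "faq.csv"))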
def calculate_jaccard_score(sen1, sen2):
    array1 = set(lemmatize_sentence(sen1))
    array2 = set(lemmatize_sentence(sen2))
    intersection = array1.intersection(array2)
    union = array1.union(array2)
    return float(len(intersection)) / len(union)
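A quick check of calculate_jaccard_score(); the exact value depends on how pywsd's lemmatize_sentence tokenizes and lemmatizes, so treat it only as a similarity in [0, 1].

from pywsd.utils import lemmatize_sentence

# Shared lemmas (e.g. "cat", "run") raise the score; disjoint lemma sets give 0.0.
print(calculate_jaccard_score("The cats are running", "A cat runs quickly"))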
def simple_lesk(context_sentence, ambiguous_word,
                pos=None, lemma=True, stem=False, hyperhypo=True,
                stop=True, context_is_lemmatized=False,
                nbest=False, keepscore=False, normalizescore=False):
    """
    Simple Lesk is somewhere in between using more than the original
    Lesk algorithm (1986) and using fewer signature words than adapted
    Lesk (Banerjee and Pederson, 2002).
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    # Get the signatures for each synset.
    ss_sign = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)
    # Disambiguate the sense in context.
    if context_is_lemmatized:
        context_sentence = context_sentence.split()
    else:
        context_sentence = lemmatize_sentence(context_sentence)
    best_sense = compare_overlaps(context_sentence, ss_sign,
                                  nbest=nbest, keepscore=keepscore,
                                  normalizescore=normalizescore)
    return best_sense
def cosine_lesk(context_sentence, ambiguous_word, pos=None,
                lemma=True, stem=True, hyperhypo=True, stop=True,
                context_is_lemmatized=False, nbest=False, from_cache=True):
    """
    In line with vector space models, we can use cosine to calculate overlaps
    instead of using raw overlap counts. Essentially, the idea of using
    signatures (aka 'sense paraphrases') is lesk-like.
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    ss_sign = simple_signatures(ambiguous_word, pos, lemma, stem,
                                hyperhypo, stop, from_cache=from_cache)
    if context_is_lemmatized:
        context_sentence = " ".join(context_sentence.split())
    else:
        context_sentence = " ".join(lemmatize_sentence(context_sentence))

    scores = []
    for ss, signature in ss_sign.items():
        # Lowercase and replace "_" with spaces.
        signature = " ".join(map(str, signature)).lower().replace("_", " ")
        scores.append((cos_sim(context_sentence, signature), ss))

    scores = sorted(scores, reverse=True)
    return scores if nbest else scores[0][1]
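pywsd ships a cosine_lesk with this signature, so a typical call (assuming NLTK's WordNet corpora are downloaded) looks like the following; the example sentence is purely illustrative.

from pywsd.lesk import cosine_lesk

sense = cosine_lesk('I went to the bank to deposit my money', 'bank')
print(sense, '-', sense.definition() if sense else None)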
def adapted_lesk(context_sentence, ambiguous_word, pos=None,
                 lemma=True, stem=False, hyperhypo=True, stop=True,
                 context_is_lemmatized=False, nbest=False,
                 keepscore=False, normalizescore=False, from_cache=True):
    """
    This function is the implementation of the Adapted Lesk algorithm,
    described in Banerjee and Pederson (2002). It makes use of the lexical
    items from semantically related senses within the wordnet
    hierarchies to generate more lexical items for each sense.
    see www.d.umn.edu/~tpederse/Pubs/cicling2002-b.pdf
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    # Get the signatures for each synset.
    ss_sign = signatures(ambiguous_word, pos=pos, hyperhypo=hyperhypo, adapted=True,
                         remove_stopwords=stop, to_lemmatize=lemma,
                         remove_numbers=True, lowercase=True, to_stem=stem,
                         from_cache=from_cache)
    # Disambiguate the sense in context.
    context_sentence = (context_sentence.split() if context_is_lemmatized
                        else lemmatize_sentence(context_sentence))
    return compare_overlaps(context_sentence, ss_sign, nbest=nbest,
                            keepscore=keepscore, normalizescore=normalizescore)
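Likewise for adapted_lesk; this mirrors the call pattern in pywsd's documentation, with the printed Synset hedged as an expectation rather than a guarantee.

from pywsd.lesk import adapted_lesk

sense = adapted_lesk('I went to the bank to deposit my money', 'bank', pos='n')
print(sense)  # expected: something like Synset('depository_financial_institution.n.01')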
def evaluate_algorithm(similarity_option, chunk):
    match = 0
    total = 0
    chunk_text = tree_to_list(chunk)
    surface_words, lemmas, morphy_poss = lemmatize_sentence(chunk_text, keepWordPOS=True)
    assert len(lemmas) == len(chunk)
    for i in range(len(chunk)):
        semcor_word = chunk[i]
        # Skip stop-words and punctuation, since neither is in WordNet.
        if not isinstance(semcor_word, nltk.tree.Tree):
            continue
        if not isinstance(semcor_word.label(), nltk.corpus.reader.wordnet.Lemma):
            # TODO: semcor_word.label() == 'such.s.00'
            continue
        # Skip named entities
        if (semcor_word.label() == nltk.corpus.wordnet.lemma('group.n.01.group')
                and "') (NE " in semcor_word.pformat()):
            continue
        context = [lemma for lemma in lemmas[max(0, i - 15):i + 9]]
        lemma = lemmas[i]
        pos = morphy_poss[i]
        synset = max_similarity(context, lemma, pos=pos, option=similarity_option)
        if synset is None:
            # TODO: possibly a bug; for example, "over-all" should be converted
            # to "overall" before looking it up in the WordNet database.
            continue
        if semcor_word.label().synset() == synset:
            match += 1
        total += 1
    accuracy = match / total
    return match, total, accuracy
def adapted_lesk(context_sentence, ambiguous_word,
                 pos=None, lemma=True, stem=True, hyperhypo=True,
                 stop=True, context_is_lemmatized=False,
                 nbest=False, keepscore=False, normalizescore=False):
    """
    This function is the implementation of the Adapted Lesk algorithm,
    described in Banerjee and Pederson (2002). It makes use of the lexical
    items from semantically related senses within the wordnet
    hierarchies to generate more lexical items for each sense.
    see www.d.umn.edu/~tpederse/Pubs/cicling2002-b.pdf
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    # Get the signatures for each synset.
    ss_sign = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)
    for ss in ss_sign:
        # Includes holonyms.
        ss_mem_holonyms = synset_properties(ss, 'member_holonyms')
        ss_part_holonyms = synset_properties(ss, 'part_holonyms')
        ss_sub_holonyms = synset_properties(ss, 'substance_holonyms')
        # Includes meronyms.
        ss_mem_meronyms = synset_properties(ss, 'member_meronyms')
        ss_part_meronyms = synset_properties(ss, 'part_meronyms')
        ss_sub_meronyms = synset_properties(ss, 'substance_meronyms')
        # Includes similar_tos
        ss_simto = synset_properties(ss, 'similar_tos')

        related_senses = list(set(ss_mem_holonyms + ss_part_holonyms +
                                  ss_sub_holonyms + ss_mem_meronyms +
                                  ss_part_meronyms + ss_sub_meronyms + ss_simto))

        signature = [j for j in chain(*[synset_properties(i, 'lemma_names')
                                        for i in related_senses])
                     if j not in EN_STOPWORDS]

        # Lemmatized context is preferred over stemmed context.
        if lemma:
            signature = [lemmatize(i) for i in signature]
        # Matching exact words causes sparsity, so optional matching for stems.
        if stem:
            signature = [porter.stem(i) for i in signature]
        # Adds the extended signature to the simple signatures.
        ss_sign[ss] += signature

    # Disambiguate the sense in context.
    if context_is_lemmatized:
        context_sentence = context_sentence.split()
    else:
        context_sentence = lemmatize_sentence(context_sentence)
    best_sense = compare_overlaps(context_sentence, ss_sign,
                                  nbest=nbest, keepscore=keepscore,
                                  normalizescore=normalizescore)
    return best_sense
def disambiguate(sentence, algorithm=simple_lesk, context_is_lemmatized=False,
                 similarity_option='path', keepLemmas=False, prefersNone=True,
                 from_cache=True):
    tagged_sentence = []
    # Pre-lemmatize the sentence before WSD
    if not context_is_lemmatized:
        surface_words, lemmas, morphy_poss = lemmatize_sentence(sentence, keepWordPOS=True)
        lemma_sentence = " ".join(lemmas)
    else:
        lemma_sentence = sentence  # TODO: Miss out on POS specification, how to resolve?
        # The caller passed a pre-lemmatized sentence: treat each token as its
        # own surface form and lemma; POS is unknown here.
        surface_words = lemmas = lemma_sentence.split()
        morphy_poss = [None] * len(lemmas)
    for word, lemma, pos in zip(surface_words, lemmas, morphy_poss):
        if lemma not in stopwords:  # Checks if it is a content word
            try:
                wn.synsets(lemma)[0]
                if algorithm == original_lesk:
                    # Note: Original doesn't care about lemmas
                    synset = algorithm(lemma_sentence, lemma, from_cache=from_cache)
                elif algorithm == max_similarity:
                    synset = algorithm(lemma_sentence, lemma, pos=pos,
                                       option=similarity_option)
                else:
                    synset = algorithm(lemma_sentence, lemma, pos=pos,
                                       context_is_lemmatized=True,
                                       from_cache=from_cache)
            except Exception:  # In case the content word is not in WordNet
                synset = '#NOT_IN_WN#'
        else:
            synset = '#STOPWORD/PUNCTUATION#'
        if keepLemmas:
            tagged_sentence.append((word, lemma, synset))
        else:
            tagged_sentence.append((word, synset))
    # Change #NOT_IN_WN# and #STOPWORD/PUNCTUATION# into None.
    if prefersNone and not keepLemmas:
        tagged_sentence = [(word, None) if str(tag).startswith('#')
                           else (word, tag) for word, tag in tagged_sentence]
    if prefersNone and keepLemmas:
        tagged_sentence = [(word, lemma, None) if str(tag).startswith('#')
                           else (word, lemma, tag)
                           for word, lemma, tag in tagged_sentence]
    return tagged_sentence
def clean_text(text):
    clean_text = []
    text = text.lower()
    cleanr = re.compile('<.*?>')
    text = re.sub(cleanr, ' ', text)  # remove HTML tags
    text = re.sub(r'[?|!|\'|"|#]', r'', text)
    text = re.sub(r'[.|,|)|(|\|/]', r' ', text)  # remove punctuation
    text = neg_pattern.sub(lambda x: negations_dic[x.group()], text)
    for word in text.split():
        if word not in stopwords.words('english'):
            word = lemmatize_sentence(word)
            word = word[0]
            clean_text.append(word)
    return " ".join(clean_text)
def document_tf(doc):
    term_counts = dict()
    sent_tokens = sent_tokenize(doc)
    word_count = 0
    for sent in sent_tokens:
        word_tokens = lemmatize_sentence(sent)
        for word in word_tokens:
            if (word not in stop_words) and (word not in punctuators):
                word_count += 1  # keep track of the total no. of words in the doc
                # dict.get returns the default value specified if the key is not found
                term_counts[word] = term_counts.get(word, 0) + 1
    for k in term_counts:
        # convert raw occurrence count to relative frequency
        term_counts[k] /= word_count
    return term_counts
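A minimal sketch of calling document_tf(). stop_words and punctuators are the module-level globals the function relies on; they are defined here under the assumption that NLTK's stopword list and string punctuation are what the author intended.

import string
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from pywsd.utils import lemmatize_sentence

stop_words = set(stopwords.words('english'))
punctuators = set(string.punctuation)

# Each value is count(lemma) / total content words, so the values sum to 1.0.
print(document_tf("Cats chase mice. Mice fear cats."))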
def preprocessing(self, sentence):
    clean_text = []
    sentence = sentence.lower()
    cleanr = re.compile('<.*?>')
    sentence = re.sub(cleanr, ' ', sentence)  # remove HTML tags
    sentence = re.sub(r'[?|!|\'|"|#]', r'', sentence)
    sentence = re.sub(r'[.|,|)|(|\|/]', r' ', sentence)  # remove punctuation
    # expand negations so the negation words are not lost
    sentence = self.neg_pattern.sub(lambda x: self.negations_dic[x.group()], sentence)
    for word in sentence.split():
        if word not in stopwords.words('english'):
            word = lemmatize_sentence(word)
            word = word[0]
            clean_text.append(word)
    return " ".join(clean_text)
def cosine_lesk(context_sentence, ambiguous_word,
                pos=None, lemma=True, stem=True, hyperhypo=True,
                stop=True, context_is_lemmatized=False,
                nbest=False):
    """
    In line with vector space models, we can use cosine to calculate overlaps
    instead of using raw overlap counts. Essentially, the idea of using
    signatures (aka 'sense paraphrases') is lesk-like.
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    synsets_signatures = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)

    if context_is_lemmatized:
        context_sentence = " ".join(context_sentence.split())
    else:
        context_sentence = " ".join(lemmatize_sentence(context_sentence))

    scores = []
    for ss, signature in synsets_signatures.items():
        # Lowercase and replace "_" with spaces.
        signature = " ".join(map(str, signature)).lower().replace("_", " ")
        # Removes punctuation.
        signature = [i for i in word_tokenize(signature)
                     if i not in string.punctuation]
        # Optional: remove stopwords.
        if stop:
            signature = [i for i in signature if i not in EN_STOPWORDS]
        # Optional: lemmatize the tokens.
        if lemma:
            signature = [lemmatize(i) for i in signature]
        # Optional: stem the tokens.
        if stem:
            signature = [porter.stem(i) for i in signature]
        scores.append((cos_sim(context_sentence, " ".join(signature)), ss))

    if not nbest:
        return sorted(scores, reverse=True)[0][1]
    else:
        return [(j, i) for i, j in sorted(scores, reverse=True)]
def simple_lesk(context_sentence: str, ambiguous_word: str,
                pos: str = None, lemma=True, stem=False, hyperhypo=True,
                stop=True, context_is_lemmatized=False,
                nbest=False, keepscore=False, normalizescore=False,
                from_cache=True) -> "wn.Synset":
    """
    Simple Lesk is somewhere in between using more than the original
    Lesk algorithm (1986) and using fewer signature words than adapted
    Lesk (Banerjee and Pederson, 2002).

    :param context_sentence: String, sentence or document.
    :param ambiguous_word: String, a single word.
    :param pos: String, one of 'a', 'r', 's', 'n', 'v', or None.
    :return: A Synset for the estimated best sense.
    """
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word, pos=pos)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    # Get the signatures for each synset.
    ss_sign = simple_signatures(ambiguous_word, pos, lemma, stem,
                                hyperhypo, stop, from_cache=from_cache)
    # Disambiguate the sense in context.
    context_sentence = (context_sentence.split() if context_is_lemmatized
                        else lemmatize_sentence(context_sentence))
    return compare_overlaps(context_sentence, ss_sign, nbest=nbest,
                            keepscore=keepscore, normalizescore=normalizescore)
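This variant matches pywsd's shipped simple_lesk, so the README-style call below should work as-is once the WordNet corpora are downloaded.

from pywsd.lesk import simple_lesk

answer = simple_lesk('I went to the bank to deposit my money', 'bank', pos='n')
print(answer)  # e.g. Synset('depository_financial_institution.n.01')
print(answer.definition())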
def recurse(key, d, ans, altans):
    key_lem = None
    if key in courses:
        key_lem = key.lower().strip()
    else:
        key_lem = lemmatize_sentence(key)[0]
    # print("Lemmatized key: " + key_lem)
    # print("Original Key: " + key)
    if key_lem in keywords:
        ans.add(d[key]['resp0nse'])
        for j in d:
            if j != key and j != 'resp0nse':
                # try:
                #     altans[j] += d[j]['resp0nse']
                # except:
                altans.update({j: d[j]['resp0nse']})
    if type(d[key]) == dict:
        for i in d[key]:
            if i != 'resp0nse':
                ans, altans = recurse(i, d[key], ans, altans)
    return ans, altans
def simple_lesk(context_sentence, ambiguous_word,
                pos=None, lemma=True, stem=False, hyperhypo=True,
                stop=True, context_is_lemmatized=False,
                nbest=False, keepscore=False, normalizescore=False):
    # Ensure that ambiguous word is a lemma.
    ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    if not wn.synsets(ambiguous_word):
        return None
    # Get the signatures for each synset.
    ss_sign = simple_signature(ambiguous_word, pos, lemma, stem, hyperhypo)
    # Disambiguate the sense in context.
    if context_is_lemmatized:
        context_sentence = context_sentence.split()
    else:
        context_sentence = lemmatize_sentence(context_sentence)
    best_sense = compare_overlaps(context_sentence, ss_sign,
                                  nbest=nbest, keepscore=keepscore,
                                  normalizescore=normalizescore)
    return best_sense
def clean_sentence(sentence):
    negations_dic = {"isn't": "is not", "aren't": "are not", "wasn't": "was not",
                     "weren't": "were not", "haven't": "have not", "hasn't": "has not",
                     "hadn't": "had not", "won't": "will not", "wouldn't": "would not",
                     "don't": "do not", "doesn't": "does not", "didn't": "did not",
                     "can't": "can not", "couldn't": "could not", "shouldn't": "should not",
                     "mightn't": "might not", "mustn't": "must not"}
    neg_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')
    clean_text = []
    sentence = sentence.lower()
    cleanr = re.compile('<.*?>')
    sentence = re.sub(cleanr, ' ', sentence)  # remove HTML tags
    sentence = re.sub(r'[?|!|\'|"|#]', r'', sentence)
    sentence = re.sub(r'[.|,|)|(|\|/]', r' ', sentence)  # remove punctuation
    # expand negations so the negation words are not lost
    sentence = neg_pattern.sub(lambda x: negations_dic[x.group()], sentence)
    for word in sentence.split():
        if word not in stopwords.words('english'):
            word = lemmatize_sentence(word)
            word = word[0]
            clean_text.append(word)
    return " ".join(clean_text)
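A small usage sketch for clean_sentence(). Note a quirk as written: the apostrophe-stripping regex runs before neg_pattern, so "didn't" has already become "didnt" by the time the negation mapping is applied, and the mapping never matches; the steps would need reordering for the expansion to take effect.

import re
from nltk.corpus import stopwords
from pywsd.utils import lemmatize_sentence

print(clean_sentence("I didn't like <b>this</b> movie!"))  # e.g. 'didnt like movie'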
def disambiguate(sentence, algorithm=simple_lesk, context_is_lemmatized=False,
                 similarity_option='path', keepLemmas=False, prefersNone=True,
                 from_cache=True, tokenizer=word_tokenize):
    tagged_sentence = []
    # Pre-lemmatize the sentence before WSD
    if not context_is_lemmatized:
        surface_words, lemmas, morphy_poss = lemmatize_sentence(sentence,
                                                                keepWordPOS=True,
                                                                tokenizer=tokenizer)
        lemma_sentence = " ".join(lemmas)
    else:
        lemma_sentence = sentence  # TODO: Miss out on POS specification, how to resolve?
        # The caller passed a pre-lemmatized sentence: treat each token as its
        # own surface form and lemma; POS is unknown here.
        surface_words = lemmas = lemma_sentence.split()
        morphy_poss = [None] * len(lemmas)
    for word, lemma, pos in zip(surface_words, lemmas, morphy_poss):
        if lemma not in stopwords:  # Checks if it is a content word
            if wn.synsets(lemma):
                if algorithm == original_lesk:
                    # Note: Original doesn't care about lemmas
                    synset = algorithm(lemma_sentence, lemma, from_cache=from_cache)
                elif algorithm == max_similarity:
                    synset = algorithm(lemma_sentence, lemma, pos=pos,
                                       option=similarity_option)
                else:
                    synset = algorithm(lemma_sentence, lemma, pos=pos,
                                       context_is_lemmatized=True,
                                       from_cache=from_cache)
            else:  # In case the content word is not in WordNet.
                synset = '#NOT_IN_WN#'
        else:
            synset = '#STOPWORD/PUNCTUATION#'
        if keepLemmas:
            tagged_sentence.append((word, lemma, synset))
        else:
            tagged_sentence.append((word, synset))
    # Change #NOT_IN_WN# and #STOPWORD/PUNCTUATION# into None.
    if prefersNone and not keepLemmas:
        tagged_sentence = [(word, None) if str(tag).startswith('#')
                           else (word, tag) for word, tag in tagged_sentence]
    if prefersNone and keepLemmas:
        tagged_sentence = [(word, lemma, None) if str(tag).startswith('#')
                           else (word, lemma, tag)
                           for word, lemma, tag in tagged_sentence]
    return tagged_sentence
def normalize_corpus(corpus):
    from pywsd.utils import lemmatize_sentence
    normalized_corpus = []
    for doc in corpus:
        doc = re.sub(r"\b[A-Z\.]{2,}s?\b", "", doc)  # drop all-caps acronyms
        doc = remove_stopwords(doc, is_lower_case=True)
        doc = remove_accented_chars(doc)
        doc = expand_contractions(doc)
        doc = doc.lower()
        # remove extra newlines
        doc = re.sub(r'[\r\n]+', ' ', doc)
        special_char_pattern = re.compile(r'([{.(-)!}])')
        doc = special_char_pattern.sub(" \\1 ", doc)
        doc = remove_special_characters(doc, remove_digits=True)
        # remove extra whitespace
        doc = re.sub(' +', ' ', doc)
        doc = lemmatize_sentence(doc)
        doc = ' '.join(doc)
        doc = remove_proper_nouns(doc)
        normalized_corpus.append(doc)
    return normalized_corpus
def cosine_lesk_inventario_estendido(context_sentence, ambiguous_word,
                                     pos=None, lemma=True, stem=True,
                                     hyperhypo=True, stop=True,
                                     context_is_lemmatized=False,
                                     nbest=False, synsets_signatures=None,
                                     busca_ampla=False):
    """
    In line with vector space models, we can use cosine to calculate overlaps
    instead of using raw overlap counts. Essentially, the idea of using
    signatures (aka 'sense paraphrases') is lesk-like.
    """
    # Ensure that ambiguous word is a lemma.
    if lemma:
        ambiguous_word = lemmatize(ambiguous_word)
    # If ambiguous word not in WordNet return None
    # if not wn.synsets(ambiguous_word):
    if not criar_inventario_des_wn(ambiguous_word, busca_ampla=busca_ampla):
        return None

    if context_is_lemmatized:
        context_sentence = " ".join(context_sentence.split())
    else:
        context_sentence = " ".join(lemmatize_sentence(context_sentence))

    scores = []
    chave_assinatura = "%s.%s.%s.%s.%s.%s" % (ambiguous_word, pos, lemma,
                                              stem, hyperhypo, busca_ampla)

    if chave_assinatura not in DesWordnet.cache_assinaturas:
        synsets_signatures = simple_signature(ambiguous_word, pos, lemma, stem,
                                              hyperhypo, busca_ampla=busca_ampla)
        DesWordnet.cache_assinaturas[chave_assinatura] = []
        for ss, signature in synsets_signatures.items():
            # Lowercase and replace "_" with spaces.
            signature = " ".join(map(str, signature)).lower().replace("_", " ")
            # Removes punctuation.
            signature = [i for i in Util.word_tokenize(signature)
                         if i not in string.punctuation]
            signature = Util.normalizar_ctx(signature, stop=stop,
                                            lematizar=lemma, stem=stem)
            scores.append((cos_sim(context_sentence, " ".join(signature)), ss))
            DesWordnet.cache_assinaturas[chave_assinatura].append((ss, signature))
    else:
        synsets_signatures = DesWordnet.cache_assinaturas[chave_assinatura]
        for ss, signature in synsets_signatures:
            scores.append((cos_sim(context_sentence, " ".join(signature)), ss))

    if not nbest:
        return sorted(scores, reverse=True)[0][1]
    else:
        return [(j, i) for i, j in sorted(scores, reverse=True)]
def searchAPI(searchterm):
    # print(vocab)
    query = searchterm
    query_tokens = lemmatize_sentence(query)  # lemmatize tokens to match the vocabulary
    query_vector = []
    query_tf = {}
    total_query_vocab = 0
    print(query_tokens)
    for i in range(len(query_tokens)):
        tok = query_tokens[i]
        try:
            indexvalue = vocab.index(tok)
            query_vector.append(indexvalue)
            query_tf[indexvalue] = 1 + query_tf.get(indexvalue, 0)
            total_query_vocab += 1
        except ValueError:
            # Token doesn't exist in vocab - ignored
            # print(tok, "does not exist in the vocabulary. - Ignoring")
            if tok not in stop_words:
                misspelled = list(spell.unknown([tok]))
                print("invalid -> ", tok)
                if len(misspelled) == 0:
                    print("trying synonyms")
                    syn = list()
                    for synset in wordnet.synsets(tok):
                        for lemma in synset.lemmas():
                            syn.append(lemma.name())  # add the synonyms
                    # print('Synonyms: ' + str(list(set(syn))))
                    found_word = False
                    new_word = ''
                    for word in syn:
                        if word in vocab:
                            new_word = word
                            found_word = True
                            break
                    if found_word:
                        # print('Synonym present in vocab -> ', new_word)
                        indexvalue = vocab.index(new_word)
                        query_vector.append(indexvalue)
                        query_tf[indexvalue] = 1 + query_tf.get(indexvalue, 0)
                        total_query_vocab += 1
                    else:
                        print("None of the synonyms present in the vocabulary")
                else:
                    print("trying spelling correction")
                    candidate_list = spell.candidates(misspelled[0])
                    found_word = False
                    corrected_tok = ''
                    # print("Candidates -> ", candidate_list)
                    for word in candidate_list:
                        new_query_tokens = query_tokens
                        new_query_tokens[i] = word
                        new_query = ''
                        for j in range(len(new_query_tokens)):
                            new_query += new_query_tokens[j] + ' '
                        new_query = new_query[0:len(new_query) - 1]
                        new_lem_query = lemmatize_sentence(new_query)
                        lem_word = new_lem_query[i]
                        # print("word -> ", word, ", present in vocab ->", word in vocab)
                        # print("lem_word -> ", lem_word, ", present in vocab ->", lem_word in vocab)
                        # print("new query -> ", new_query)
                        print("new lemmatized query -> ", new_lem_query)
                        if lem_word in vocab:
                            corrected_tok = lem_word
                            found_word = True
                            break
                    if found_word:
                        # print("corrected -> " + corrected_tok)
                        indexvalue = vocab.index(corrected_tok)
                        query_vector.append(indexvalue)
                        query_tf[indexvalue] = 1 + query_tf.get(indexvalue, 0)
                        total_query_vocab += 1
                    else:
                        print("couldn't find any word")

    print("Query as vocab indices:", query_vector)
    print()
    start_time = time.time()  # Timer starts

    # First we obtain the list of all possible documents we actually need to search.
    # This is a union of the docs in each query term's posting list -
    # not an intersection, because we use cosine similarity and not boolean retrieval.
    possible_docs = set()
    query_tf_vector = []
    for q in query_vector:
        possible_docs = possible_docs.union(posting_list[q].keys())
        # We also generate a TF vector for the query.
        # It does not make sense to scale it with IDF.
        query_tf_vector.append(query_tf[q] / total_query_vocab)

    # Run through each doc and generate the vector corresponding to the query terms,
    # then compute the cosine similarity of it vs the TF vector of the query.
    # Ties are broken by the magnitude of the vector - note that this is obtained
    # by only considering the query terms. Plus, these query term weights were
    # scaled with relative TF, so a higher magnitude means the terms were more important.
    doc_scores = {}
    for doc in possible_docs:
        doc_vector = []
        for q in query_vector:
            doc_vector.append(posting_list[q].get(doc, 0))
        doc_scores[doc] = (cosine_similarity(doc_vector, query_tf_vector),
                           norm(doc_vector))

    # Results are sorted
    sorted_results = sorted(doc_scores.items(), key=operator.itemgetter(1), reverse=True)
    end_time = time.time()  # Timer ends as search portion is complete
    search_time = end_time - start_time

    ct = 0
    print("-------------- SEARCH RESULTS --------------")
    results = {}
    results['Documents'] = {}
    for i in sorted_results:
        fname, rownum = file_dict[i[0]].split(' ')
        rownum = int(rownum[3:])
        search_res = lines[i[0]]
        search_res = search_res.split('\t')[2]
        results['Documents'][i[0]] = {'Name': fname, 'Row': rownum,
                                      'Score': i[1], 'Results': search_res}
        ct += 1
        if ct == 10:
            break
    results['Time'] = end_time - start_time
    return results
def process(inpText):
    """This function lemmatizes the input sentence."""
    l = lemmatize_sentence(inpText, keepWordPOS=True)
    return l[1]
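A quick check of process(); since keepWordPOS=True makes lemmatize_sentence return (words, lemmas, POS tags), the function yields just the lemma list.

from pywsd.utils import lemmatize_sentence

print(process("The bats were hanging on their feet"))
# expected: roughly ['the', 'bat', 'be', 'hang', 'on', 'their', 'foot']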
def remove_stop(sentence):
    words = lemmatize_sentence(sentence)
    words = [w for w in words if w not in stopwords]
    return ' '.join(words)
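A sketch of remove_stop(); the function's membership test implies the global stopwords is a plain collection of words, so it is built here from NLTK's list (an assumption about the original module).

from nltk.corpus import stopwords as nltk_stopwords
from pywsd.utils import lemmatize_sentence

stopwords = set(nltk_stopwords.words('english'))
print(remove_stop("The cats are sitting on the mat"))  # e.g. 'cat sit mat'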
from sklearn.preprocessing import normalize
import scipy.cluster.hierarchy as shc
from sklearn.cluster import AgglomerativeClustering

### data reading + prep
lemmatizer = WordNetLemmatizer()
loc = "tagged_comments.xlsx"
wb = xlrd.open_workbook(loc)
sheet = wb.sheet_by_index(0)

# lemmatize the sentences
temp_comment = []
for i in range(sheet.nrows):
    temp_comment.append(" ".join(
        lemmatize_sentence(
            re.sub(r'[^a-zA-Z0-9 ]', '', str(sheet.cell_value(i, 0))))))

temp_speaker = []
for j in range(sheet.nrows):
    temp_speaker.append(str(sheet.cell_value(j, 1)))

temp_tag = []
for k in range(sheet.nrows):
    temp_tag.append(str(sheet.cell_value(k, 2)))

data_tuples = list(zip(temp_comment, temp_speaker, temp_tag))
data = pd.DataFrame(data_tuples, columns=["comments", "speaker", "tag"])

### SWEETNAM
sweetnam_comments = data[data['speaker'] == "Sweetnam"]
# remove punctuation
Descriptions.Descriptions = Descriptions.Descriptions.str.replace('[^\w\s]', '')
# remove numeric
Descriptions.Descriptions = Descriptions.Descriptions.str.replace("\d+", "")
# Change to lowercase
Descriptions.Descriptions = Descriptions.Descriptions.str.lower()

# not sure I want to remove stop words yet
from nltk.corpus import stopwords
stop = stopwords.words('english')
Descriptions.Descriptions = Descriptions.Descriptions.apply(
    lambda x: " ".join(x for x in x.split() if x not in stop))
Descriptions.Descriptions = Descriptions.Descriptions.apply(
    lambda x: " ".join(lemmatize_sentence(x)))

# Let's create a DTM
dtMatrix = cv.fit_transform(Descriptions.Descriptions).transpose().toarray()
# Inspect it
# print(dtMatrix.shape)
# Print the names of the skills
featurenames = cv.get_feature_names()
# print(featurenames)

# Create a tf-idf transformer
tfidf = TfidfTransformer()
# Turn the DTM into a tf-idf matrix
tfidfMatrix = tfidf.fit_transform(dtMatrix).toarray()
# print(tfidfMatrix.shape)
del dtMatrix
from pywsd.utils import lemmatize_sentence

print(lemmatize_sentence("Really scared."))
# The actual data is also loaded to display the search results
fileobj = open('tvnews_corpus.tsv', 'r')
lines = fileobj.readlines()
fileobj.close()

# Using a fixed query for testing. Remove and use free input later
# query = input("Query:")
# query = "Donald Trump accuses China of artificially creating climate change"
# query = "Climate change is very important"
# print("Input Query:", query)

while True:
    query = input("Query:")
    query_tokens = lemmatize_sentence(query)  # lemmatize tokens to match the vocabulary
    query_vector = []
    query_tf = {}
    total_query_vocab = 0
    for tok in query_tokens:
        try:
            indexvalue = vocab.index(tok)
            query_vector.append(indexvalue)
            query_tf[indexvalue] = 1 + query_tf.get(indexvalue, 0)
            total_query_vocab += 1
        except ValueError:
            # Token doesn't exist in vocab - ignored
            print(tok, "does not exist in the vocabulary. - Ignoring")
    print("Query as vocab indices:", query_vector)
    print()
    start_time = time.time()  # Timer starts
def lemmatize_words(sentence):
    from pywsd.utils import lemmatize_sentence
    return lemmatize_sentence(sentence)
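Calling the wrapper is then a one-liner; the exact lemmas depend on pywsd's POS tagging.

print(lemmatize_words("The dogs are barking loudly"))
# e.g. ['the', 'dog', 'be', 'bark', 'loudly']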
            except:  # In case the content word is not in WordNet
                synset = ''  # '#NOT_IN_WN#'
        else:
            synset = ''  # '#STOPWORD/PUNCTUATION#'
        if keepLemmas:
            tagged_sentence.append((word, lemma, pos_, synset))
        # else:
        #     tagged_sentence.append((word, synset))
    # Change #NOT_IN_WN# and #STOPWORD/PUNCTUATION# into None.
    # if prefersNone and not keepLemmas:
    #     tagged_sentence = [(word, None) if str(tag).startswith('#')
    #                        else (word, tag) for word, tag in tagged_sentence]
    # if prefersNone and keepLemmas:
    #     tagged_sentence = [(word, lemma, None) if str(tag).startswith('#')
    #                        else (word, lemma, tag) for word, lemma, tag in tagged_sentence]
    return tagged_sentence


if __name__ == '__main__':
    sentence = "I went to the bank to get a loan."
    surface_words, lemmas, morphy_poss = lemmatize_sentence(sentence, keepWordPOS=True)
    print(surface_words, lemmas, morphy_poss)
    result = disambiguate('I went to the bank to deposit my money',
                          keepLemmas=True, prefersNone=False)
    print(result)