def uniqueWord(train_set, test_set):
    """Rank vocabulary words by how often they recur in ``test_set``.

    Every whitespace-separated token of each ``train_set`` element is
    lemmatized, stripped of ASCII punctuation and recorded once in the
    module-level ``unique_word1`` list.  The raw tokens of each ``test_set``
    element are appended to the module-level ``non_unique_word2`` list.
    Each vocabulary word is then set to 1 in the module-level ``dictionary``
    and incremented once per identical test token.

    Elements of both inputs are decoded as UTF-8 before splitting.

    Returns the ``(key, value)`` pairs of ``dictionary`` sorted by value in
    ascending order.
    """
    punctuation = set(string.punctuation)
    lemmatizer = WordNetLemmatizer()

    for train_doc in train_set:
        for token in train_doc.decode('utf-8').split():
            lemma = lemmatizer.lemmatize(token)
            cleaned = ''.join(c for c in lemma if c not in punctuation)
            if cleaned not in unique_word1:
                unique_word1.append(cleaned)

    for test_doc in test_set:
        non_unique_word2.extend(test_doc.decode('utf-8').split())

    # NOTE(review): this placeholder entry stores a list under the key
    # "list"; it is sorted together with the integer counts below.
    dictionary.setdefault("list", []).append("list_item")
    for known in unique_word1:
        dictionary[known] = 1
    for seen in non_unique_word2:
        if seen in dictionary:
            dictionary[seen] += 1

    return sorted(dictionary.items(), key=operator.itemgetter(1))
def _VERBAL_PREDICATE_FEATURE_Lemma(self):
    """Return the lemma of ``self.word`` for its POS tag, or ``False``.

    ``self.pos`` is looked up in ``pos_penn_to_wordnet``.  When it is
    present, the mapped value is handed to the WordNet lemmatizer as the
    part of speech; when it is absent the feature is ``False``.
    """
    from nltk.stem.wordnet import WordNetLemmatizer

    lemmatizer = WordNetLemmatizer()
    if self.pos not in pos_penn_to_wordnet:
        return False
    wordnet_pos = pos_penn_to_wordnet[self.pos]
    return lemmatizer.lemmatize(self.word, wordnet_pos)
def get_2prev_pos_lemma_verb(arg_clauses, clause_index, parse_dict):
    """Build the ``prev2_pos|prev1_pos|verb_lemma`` feature for a clause.

    The first token of the clause whose ``PartOfSpeech`` is one of the VB*
    tags is lemmatized.  The two POS slots are the tags of the one and two
    tokens that precede that verb in the sentence; a slot is ``"NULL"``
    when the verb is the first (both slots) or second (``prev2`` only)
    token of the clause.  Without any verb the result is
    ``"NULL|NULL|NULL"``.

    Args:
        arg_clauses: object exposing ``DocID``, ``sent_index`` and
            ``clauses``; ``clauses[clause_index][0]`` is the list of token
            indices of the clause.
        clause_index: position of the clause inside ``arg_clauses.clauses``.
        parse_dict: parse data indexed as
            ``parse_dict[DocID]["sentences"][sent_index]["words"][i]``,
            each word being ``(surface, {"PartOfSpeech": ...})``.
    """
    DocID = arg_clauses.DocID
    sent_index = arg_clauses.sent_index
    verb_tags = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]
    clause_token_indices = arg_clauses.clauses[clause_index][0]  # ([1,2,3],yes)
    lemmatizer = WordNetLemmatizer()

    def token_at(i):
        # Looked up lazily so an empty clause never touches parse_dict.
        return parse_dict[DocID]["sentences"][sent_index]["words"][i]

    def pos_at(i):
        return token_at(i)[1]["PartOfSpeech"]

    found = None
    for offset, token_index in enumerate(clause_token_indices):
        surface = token_at(token_index)[0]
        if pos_at(token_index) in verb_tags:
            # NOTE(review): lemmatize() is called without a pos argument
            # here even though the token is a verb -- confirm this is
            # intended before changing it (it alters feature values).
            found = (lemmatizer.lemmatize(surface), token_index, offset)
            break

    if found is None:
        return "NULL|NULL|NULL"

    lemma, verb_index, offset = found
    if offset == 0:
        prev2_pos, prev1_pos = "NULL", "NULL"
    elif offset == 1:
        prev2_pos, prev1_pos = "NULL", pos_at(verb_index - 1)
    else:
        prev1_pos = pos_at(verb_index - 1)
        prev2_pos = pos_at(verb_index - 2)
    return "%s|%s|%s" % (prev2_pos, prev1_pos, lemma)
def parseLine(line, stopWords_, wordInd, currWrd):
    """Return the valid, lemmatized words of ``line``.

    Punctuation is removed with ``re``; each remaining word is lower-cased
    and lemmatized with NLTK; stop words and numbers are dropped.

    Args:
        line: text to process.
        stopWords_: container of words to ignore.
        wordInd: dict mapping word -> index; a word seen for the first
            time is assigned the next free index (mutated in place).
        currWrd: one-element list holding the next free index; advanced in
            place whenever a new word is registered.

    Returns:
        List of the kept words, in order of appearance (repeats included).
    """
    # Hyphens inside hyphenated words are dropped, e.g. wi-fi ==> wifi.
    line = re.sub('(\w)-(\w)',r'\1\2',line)
    # An underscore between word characters becomes a space.
    line = re.sub('(\w)_(\w)',r'\1 \2',line)
    # Remaining punctuation marks are deleted.
    line = re.sub("[',~`@#$%^&*|<>{}[\]\\\/.:;?!\(\)_+\"-]",r'',line)

    lemmatizer = WordNetLemmatizer()
    kept = []
    for raw in line.split():
        # Case is ignored.
        noun_form = lemmatizer.lemmatize(raw.lower(), 'n')
        verb_form = lemmatizer.lemmatize(noun_form, 'v')
        # Prefer the verb lemma only when it is strictly shorter.
        lemma = verb_form if len(verb_form) < len(noun_form) else noun_form

        if lemma in stopWords_:
            continue
        if re.match('^\d+x?\d*$',lemma) is not None:
            continue

        # First sighting: hand out the next index.
        if lemma not in wordInd:
            wordInd[lemma] = currWrd[0]
            currWrd[0] += 1
        kept.append(lemma)
    return kept
def parseLyrics2(outlist):
    """Pick, for every band, the ten theme words closest to its lyrics.

    For each band the lyrics are lower-cased, split into ``\\w+`` tokens and
    lemmatized.  Every distinct lemma is compared with every theme word via
    the Wu-Palmer similarity of their first WordNet synsets, and each theme
    keeps the highest similarity it reached.

    Args:
        outlist: dict mapping band name -> raw lyrics text.

    Returns:
        dict mapping band name -> list of ten one-element lists
        ``[theme]``, ordered from most to least similar (ties keep the
        order of the theme list below).
    """
    themes = ['death', 'violence', 'sacrifice', 'nature', 'peace', 'storm',
              'spirit', 'dark', 'scream', 'pain', 'blood', 'flesh', 'love',
              'greed', 'poison', 'anger', 'revenge', 'misery', 'hell',
              'heaven', 'hate', 'soul', 'battle', 'ghost', 'joy', 'light',
              'omen', 'miracle', 'magic', 'universe', 'disease', 'god',
              'satan', 'struggle', 'heart']
    # Theme synsets depend on neither the band nor the lyric word, so they
    # are looked up once instead of once per (word, theme) pair.
    theme_synsets = [wordnet.synsets(theme) for theme in themes]
    lmt = WordNetLemmatizer()

    bandLyricInfo = {}
    for key in outlist:
        raw = outlist[key].lower()
        words = re.findall(r'\w+', raw, flags=re.UNICODE | re.LOCALE)  # punctuation
        # NOTE(review): the original also built a stopword-filtered list
        # here but never used it; the lemmas are still taken from all words.
        vocabulary = set(lmt.lemmatize(x) for x in words)

        best = [0] * len(themes)
        for word in vocabulary:
            word_synsets = wordnet.synsets(word)
            if not word_synsets:
                continue
            first_sense = word_synsets[0]
            for idx, synsets in enumerate(theme_synsets):
                if not synsets:
                    continue
                similarity = first_sense.wup_similarity(synsets[0])
                # wup_similarity may yield None; such pairs never win.
                if similarity is not None and similarity > best[idx]:
                    best[idx] = similarity

        # Stable sort: equally scored themes keep their declaration order.
        ranked = sorted(range(len(themes)), key=lambda idx: best[idx],
                        reverse=True)[0:10]
        bandLyricInfo[key] = [[themes[idx]] for idx in ranked]
        print('Done  %s' % (key,))
    return bandLyricInfo
def _add_heading_features(features, prefix, heading, stop_words):
    """Flag ``heading`` and each of its non-stopword words under ``prefix``."""
    features[prefix + '_' + heading.lower()] = True
    for word in heading.split(' '):
        lowered = word.lower()
        if lowered not in stop_words and len(word) > 1:
            features[prefix + 'word_' + lowered] = True


def feature_extractor(data):
    """Extract features from a relation for the classifier.

    Args:
        data: ``(h2, h3, paragraph)``; ``h3`` may be ``None``.

    Returns:
        dict of boolean features.  Headings contribute ``h2_``/``h3_`` and
        generic ``h_`` keys plus ``h2word_``/``h3word_``/``hword_`` keys for
        their words; paragraph tokens contribute the raw token, a
        ``lower_`` key and a ``lmtzr_`` (lemma) key.  Stop words and
        one-character tokens are ignored.
    """
    features = dict()
    lmtzr = WordNetLemmatizer()
    # Read the stopword list once; it was previously re-read for every word.
    stop_words = set(stopwords.words('english'))

    h2, h3, paragraph = data
    _add_heading_features(features, 'h2', h2, stop_words)
    _add_heading_features(features, 'h', h2, stop_words)
    if h3 is not None:
        _add_heading_features(features, 'h3', h3, stop_words)
        _add_heading_features(features, 'h', h3, stop_words)

    for word in nltk.wordpunct_tokenize(paragraph):
        lowered = word.lower()
        if lowered not in stop_words and len(word) > 1:
            features[word] = True
            features['lower_' + lowered] = True
            features['lmtzr_' + lmtzr.lemmatize(word).lower()] = True
    return features
def _normalize_token(token):
    """Strip edge punctuation and the 's / 'm / n't suffixes from ``token``."""
    punct = '.,?/><";![]:@#$%&*()' + "'"
    if r'\\xe2' in token:
        token = '*'
    if len(token) > 1:
        # Up to two trailing punctuation marks (handles doubles like "?!").
        for _ in range(2):
            if token and token[-1] in punct:
                token = token[:-1]
        # One leading punctuation mark.
        if token and token[0] in punct:
            token = token[1:]
    if len(token) > 2:
        if token.endswith("'s"):
            token = token[:-2]  # so that Jim's --> Jim.
        if token.endswith("'m"):
            token = token[:-2]  # so that I'm --> I.
    if len(token) > 3:
        if token.endswith("n't"):
            token = token[:-3]  # so that isn't --> is. Not is a stop word.
    return token


def cleanUp(rawWords):
    """Return the lemmatized, non-stopword tokens of ``rawWords``.

    Each element of ``rawWords`` is split on whitespace and lower-cased;
    stop words are dropped.  Tokens that still contain an apostrophe, are
    stop words, or are empty are removed before lemmatization.

    NOTE(review): as in the original (``for i in range(0,1)``), only the
    FIRST surviving token gets the punctuation/suffix normalisation; the
    commented-out loop over all tokens suggests that was once intended for
    every token -- confirm before widening it.

    Unlike the original, empty input and tokens that shrink to nothing
    while being stripped no longer raise ``IndexError``.
    """
    stops = set(t.lower() for t in stopwords.words('english'))
    sumarr = [
        word
        for phrase in rawWords
        for word in (t.lower() for t in phrase.split())
        if word not in stops
    ]
    if sumarr:
        sumarr[0] = _normalize_token(sumarr[0])

    lmtzr = WordNetLemmatizer()
    return [lmtzr.lemmatize(t) for t in sumarr
            if t and "'" not in t and t not in stops]
def wordMatch(story, ques):
    """Score every story sentence against the question and emit the best.

    The question (``ques.ques``) is tokenized and lemmatized.  Each sentence
    of ``story.sentences`` is cleaned of ``.``, ``,``, newlines and
    backslashes and then scored by four rules (see inline comments) using
    ``story.sentLemmaWords`` and ``story.sentPosTags``, both keyed by the
    cleaned sentence.  The last sentence reaching the highest score wins;
    its words (``story.sentWords``) minus the question words are printed
    and written to the module-level ``answerFile``.
    """
    quesWords = word_tokenize(ques.ques)
    lemmatizer = WordNetLemmatizer()
    question_lemmas = [lemmatizer.lemmatize(w) for w in quesWords]

    best_sentence = ''
    best_score = 0
    best_words = []
    for sent in story.sentences:
        for old, new in ((".", ""), (",", ""), ("\n", " "), ("\s", ""), ("\\", "")):
            sent = sent.replace(old, new)

        sentence_lemmas = story.sentLemmaWords[sent]
        tag_of = {}
        proper_nouns = []
        mentions_noun = False
        for tagged in story.sentPosTags[sent]:
            tag_of[tagged[0]] = tagged[1]
            if 'NNP' in tagged[1]:
                proper_nouns.append(tagged[0])
            if 'NN' in tagged[1]:
                mentions_noun = True

        score = 0
        # Rule 1: shared lemma; verbs weigh double.
        for q_lemma in question_lemmas:
            if q_lemma in sentence_lemmas:
                score += 6 if 'VB' in tag_of[q_lemma] else 3

        # Rule 2: a proper noun of the sentence also occurs in the question.
        for proper in proper_nouns:
            if proper in question_lemmas:
                score += 6

        # Rule 3: question has a proper noun and the sentence says 'name'.
        for tagged in nltk.pos_tag(question_lemmas):
            if 'NNP' in tagged[1] and 'name' in sentence_lemmas:
                score += 4

        # Rule 4: sentence contains any proper noun or noun tag.
        if proper_nouns or mentions_noun:
            score += 4

        if score >= best_score:
            best_score = score
            best_sentence = sent
            best_words = story.sentWords[sent]

    print("Answer: " + removeCommonWords(best_words, quesWords))
    finalString = "\nAnswer: " + removeCommonWords(best_words, quesWords) + "\n\n"
    answerFile.write(finalString)
def convertToVec(self, line):
    """Turn ``line`` into a sparse ``[(word_id, count), ...]`` vector.

    Unicode input is NFKD-normalised and reduced to ASCII.  Characters
    outside letters, digits, ``,`` and ``.`` (and any ``^``) become spaces,
    the text is lower-cased, split on single spaces and lemmatized.  Words
    found in ``self.Dic.words`` use their stored id; every other occurrence
    gets a fresh id counted up locally from ``self.Dic.count`` (the
    dictionary itself is not modified).

    Returns:
        list of ``(id, float_count)`` tuples.
    """
    lmtzr = WordNetLemmatizer()
    if isinstance(line, unicode):
        line = str(unicodedata.normalize('NFKD', line).encode('ascii','ignore'))
    # Strip off special characters.
    line = re.sub(r'[^a-z^A-Z^0-9^,^.]|\^', ' ', line).lower()

    known_words = self.Dic.words
    next_id = self.Dic.count
    occurrences = {}
    for token in line.split(' '):
        token = lmtzr.lemmatize(token)
        if isinstance(token, unicode):
            token = str(unicodedata.normalize('NFKD', token).encode('ascii','ignore'))
        if token in known_words:
            word_id = known_words[token]
        else:
            word_id = next_id
            next_id += 1
        occurrences[word_id] = occurrences.get(word_id, 0) + 1

    return [(word_id, seen + 0.0) for word_id, seen in occurrences.items()]
def lemmatize(tokens):
    """Lemmatize a nested token list.

    Args:
        tokens: iterable of iterables of words.

    Returns:
        list of lists with every word replaced by its WordNet lemma, in the
        same order and grouping as the input.
    """
    lemma_of = WordNetLemmatizer().lemmatize
    return [[lemma_of(item) for item in items] for items in tokens]
def MakeLemmaList(tagged):
    """Lemmatize POS-tagged words and return them grouped by word class.

    The first character of each tag selects the class: ``N`` noun, ``V``
    verb, ``J`` adjective, ``R`` adverb; any other tag is dropped.  The
    verb lemmas "be", "have", "do", "done" and "should" are skipped.

    Args:
        tagged: iterable of ``(word, tag)`` entries.

    Returns:
        One flat list: nouns, then verbs, then adjectives, then adverbs.
    """
    lm = WordNetLemmatizer()
    ignored_verbs = ("be", "have", "do", "done", "should")
    nouns, verbs, adjectives, adverbs = [], [], [], []

    for entry in tagged:
        initial = entry[1][0:1]
        if initial == "N":
            nouns.append(lm.lemmatize(entry[0], "n"))
        elif initial == "V":
            verb = lm.lemmatize(entry[0], "v")
            if verb not in ignored_verbs:
                verbs.append(verb)
        elif initial == "J":
            adjectives.append(lm.lemmatize(entry[0], "a"))
        elif initial == "R":
            adverbs.append(lm.lemmatize(entry[0], "r"))

    return nouns + verbs + adjectives + adverbs
def get_skill_for_entity(entity_name):
    """Return the ``Skill`` rows whose ``lemma_name`` matches the entity.

    Underscores in ``entity_name`` are replaced by spaces and the result is
    lemmatized before it is used as the ``lemma_name`` filter value.
    """
    lemmatizer = WordNetLemmatizer()
    lemma = lemmatizer.lemmatize(entity_name.replace('_', ' '))
    return Skill.objects.filter(lemma_name=lemma)
def get_cooc(chunk_trees,stoplist=True):
    """Collect co-occurrence triples between the NP chunks of each tree.

    For every ``Tree`` child labelled ``'NP'`` the leaves are re-parsed with
    ``parser_smp`` (kept for later triple extraction) and an entity string
    is built by joining its words with ``SEP``; nouns and adjectives are
    lemmatized, other words are kept verbatim.  Words without any
    alphanumeric character are skipped, as are ``STOPLIST`` words when
    ``stoplist`` is true.  Every unordered pair of entities from the same
    tree yields two triples, one per direction.

    Returns:
        ``(triples, simple_trees)``.
    """
    triples, simple_trees = [], []
    lmtzr = WordNetLemmatizer()
    for tree in chunk_trees:
        entities = []
        for chunk in tree[:]:
            if not (isinstance(chunk,Tree) and chunk.node == 'NP'):
                continue
            # Tree kept for later processing of triples from the simple
            # noun phrases.
            simple_trees.append(parser_smp.parse(chunk.leaves()))
            parts = []
            for word, tag in chunk[:]:
                if stoplist and word in STOPLIST:
                    continue
                if not any(ch.isalnum() for ch in word):
                    continue
                if tag.startswith('N'):
                    parts.append(lmtzr.lemmatize(word,'n'))
                elif tag.startswith('J'):
                    parts.append(lmtzr.lemmatize(word,'a'))
                else:
                    parts.append(word)
            if parts:
                entities.append(SEP.join(parts))
        for left, right in combinations(entities,2):
            triples.append((left,util.COOC_RELNAME,right))
            triples.append((right,util.COOC_RELNAME,left))
    return triples, simple_trees
def getting_sentiment(word,pos):
    """Return (and print) a sentiment score for ``word`` tagged ``pos``.

    The word is lemmatized with the WordNet class derived from ``pos``
    (NN* -> n, JJ* -> a, VB* -> v, RB* -> r, otherwise the lemmatizer
    default).  The score comes from the module-level ``score_dic``; when the
    lemma is missing there, it is five times the positive score of the
    first SentiWordNet synset, or ``100`` when no synset exists.

    Superlative adjectives (``pos == 'JJS'``) are intensified by one step:
    scores in ``[0, 4)`` gain 1 and scores in ``(-4, 0)`` lose 1.

    Fix: the original tested ``score < 4`` first, which also matched every
    negative score, so the negative branch was unreachable and negative
    superlatives were pushed towards zero instead of away from it.
    """
    is_superlative = False
    if 'NN' in pos:
        tag = 'n'
    elif 'JJ' in pos:
        tag = 'a'
        is_superlative = pos == 'JJS'
    elif 'VB' in pos:
        tag = 'v'
    elif 'RB' in pos:
        tag = 'r'
    else:
        tag = ''

    stemmer = WordNetLemmatizer()
    x = stemmer.lemmatize(word, tag) if tag != '' else stemmer.lemmatize(word)

    try:
        score = float(score_dic[x])
    except KeyError:
        # Query once; list() also copes with an iterator result.
        senti = list(swn.senti_synsets(x, tag))
        score = senti[0].pos_score() * 5 if senti else 100

    if is_superlative and score != -100:
        if -4 < score < 0:
            score = score - 1
        elif 0 <= score < 4:
            score = score + 1

    print(word + '--->' + str(score))
    return score
def processwords(words):
    """Build a filtered lemma -> count table from ``words``.

    Steps, each followed by a progress print of the current size:
      1. lemmatize every word;
      2. count the lemmas;
      3. drop lemmas occurring fewer than 100 times;
      4. drop stop words, words unknown to WordNet and one-letter words.

    Returns:
        the resulting ``Counter``.
    """
    print('Lemmatizing...')
    lmtzr = WNL()
    lemmatized = [lmtzr.lemmatize(w) for w in words]
    print(len(lemmatized))

    # A Counter both de-duplicates the words and counts their occurrences.
    print('Mapping words to counts...')
    word_dict = Counter(lemmatized)
    print(len(word_dict))

    print('Removing words that appear less than 100 times...')
    for key in [k for k, seen in word_dict.items() if seen < 100]:
        del word_dict[key]
    print(len(word_dict))

    print('Filtering out stopwords, non-English words, and single-lettered words...')
    # Load the stopword list once rather than once per vocabulary word.
    stop_words = set(stopwords.words('english'))
    for w in list(word_dict):
        if w in stop_words or len(w) == 1 or not wordnet.synsets(w):
            del word_dict[w]
    print(len(word_dict))
    return word_dict
def get_bag_of_senses(temp_words1):
    """Return the WordNet definitions related to the words of a text.

    The text is split on whitespace and POS-tagged.  For every token its
    synsets are collected (verbs are lemmatized as verbs first), together
    with the hypernyms and hyponyms of each synset.  The ``definition`` of
    every member of each group that holds more than one synset is gathered,
    and all of them are joined with single spaces.

    NOTE(review): ``definition`` is used as an attribute and joined as a
    string, which presumably targets an NLTK release where it is not a
    method -- verify against the installed version.
    """
    lmtzr = WordNetLemmatizer()
    senses = []
    for tagged in nltk.pos_tag(temp_words1.split()):
        try:
            if 'VB' in tagged[1]:
                lookup = lmtzr.lemmatize(tagged[0],'v')
            else:
                lookup = tagged[0]
            senses.append(wordnet.synsets(lookup))
        except:
            # Best effort: tokens that cannot be looked up are skipped.
            pass

    hypernyms = [s.hypernyms() for group in senses for s in group]
    hyponyms = [s.hyponyms() for group in senses for s in group]
    # Meronym / holonym expansion was disabled in the original as well.

    definitions = [
        s.definition
        for group in senses + hypernyms + hyponyms
        if len(group) > 1
        for s in group
    ]
    return ' '.join(definitions)
def run(self):
    """
    How do I run this Task?
    Luigi will call this method if the Task needs to be run.

    Each input target is read line by line: the first line is the label,
    every later line is tokenized on ``\\w+``.  Stop words are removed and
    the remaining tokens lemmatized; one ``label,token<TAB>token...`` line
    per input is written to the output target.

    Fixes over the original: the filtered/lemmatized tokens are the ones
    written (they were computed and then discarded in favour of the raw
    tokens); the stopword set built up front is used instead of re-reading
    the stopword list per token; input and output handles are closed via
    ``with``; an empty input no longer leaves ``label`` unbound.
    """
    # remove stop words and punctuation
    stop = set(stopwords.words('english'))
    tokenizer = RegexpTokenizer(r'\w+')
    lemmatizer = WordNetLemmatizer()

    docs = []
    # The input() method is a wrapper around requires() that returns
    # Target objects.
    for f in self.input():
        label = ''
        words = []
        with f.open('r') as handle:
            for line_number, line in enumerate(handle):
                if line_number == 0:
                    label = line
                else:
                    words.extend(tokenizer.tokenize(line))
        words_filtered = [lemmatizer.lemmatize(w) for w in words if w not in stop]
        docs.append((label, '\t'.join(words_filtered)))

    with self.output().open('w') as out:
        for label, tokens in docs:
            out.write("%s,%s\n" % (label.strip(), tokens.strip()))
def lemmat(str):
    """Noun-lemmatize a space-separated string and record per-word offsets.

    The input is split on single spaces; empty pieces (from repeated
    spaces) are not emitted.  For every word an entry
    ``[lemma, shift, start]`` is created where ``start`` begins at 1 and
    advances by ``len(lemma) + 1`` per word, and ``shift`` is the previous
    entry's shift plus the length lost by lemmatizing the previous word
    plus the number of empty pieces skipped since then.

    Returns:
        ``[text, table]``: ``text`` is the lemmas, each followed by one
        space; ``table`` maps ``start`` -> its entry list.
    """
    lemmatizer = WordNetLemmatizer()
    next_start = 1
    # Sentinel entry so the first word can read a previous shift of 0.
    entries = [['', 0, 0]]
    by_start = {}
    pending_shift = 0

    for word in str.split(' '):
        if word == '':
            pending_shift += 1
            continue
        lemma = lemmatizer.lemmatize(word,'n')
        entry = [lemma, pending_shift + entries[-1][1], next_start]
        by_start[next_start] = entry
        entries.append(entry)
        next_start += len(lemma) + 1
        pending_shift = len(word) - len(lemma)

    # The sentinel contributes one leading space, removed by the slice.
    text = ''.join(entry[0] + ' ' for entry in entries)[1:]
    return [text, by_start]
def process():
    """Lemmatize every raw story and dump the results as JSON.

    Stories are loaded with ``loadRaw('data/cnn-stories')``.  Each text is
    tokenized, POS-tagged and lemmatized (lower-cased) using the tag mapped
    through ``wordnetPos``.  The list of ``{'filename', 'text', 'words'}``
    records is written to ``all_stories.json``.  Progress is printed every
    100 documents.
    """
    print("Loading...")
    documentDict = loadRaw('data/cnn-stories')

    print("Cleaning...")
    documents = []
    for position, (filename, documentText) in enumerate(documentDict.items()):
        tagged_tokens = pos_tag(tokenize(documentText))
        wnl = WordNetLemmatizer()
        lemmas = [
            wnl.lemmatize(word, wordnetPos(tag)).lower()
            for word, tag in tagged_tokens
        ]
        documents.append({
            'filename': filename,
            'text': documentText,
            'words': lemmas,
        })
        if position % 100 == 0:
            print(position)

    print("Writing to disk...")
    serialized = json.dumps(documents)
    with open('all_stories.json', 'w') as outfile:
        outfile.write(serialized)
    print("Done!")
def lemmatize(w,p):
    """Return ``(lemma, p)`` for nouns and verbs, ``(w, p)`` otherwise.

    Tags starting with ``N`` are lemmatized as nouns, tags starting with
    ``V`` as verbs; any other tag leaves the word untouched.

    NOTE(review): ``wnl`` is passed explicitly as the first argument of
    ``wnl.lemmatize``; presumably ``wnl`` is the lemmatizer class rather
    than an instance -- verify against its definition.
    """
    for prefix, wordnet_pos in (("N", 'n'), ("V", 'v')):
        if p.startswith(prefix):
            return (wnl.lemmatize(wnl,w,wordnet_pos),p)
    return (w,p)
def convert_speeches_into_matrix(features,speech_list,label):
    """Vectorize speeches as per-feature token counts.

    Every speech is stripped of URLs and ``%``-prefixed numbers, has its
    punctuation replaced by spaces, is split on whitespace and lemmatized
    (noun pass, then verb pass).  The sample for a speech holds, for each
    entry of ``features`` in order, how often that token occurs.

    Returns:
        ``(sample_matrix, label_vector)`` -- one row and one copy of
        ``label`` per speech.
    """
    sample_matrix = []
    label_vector = []
    # NOTE(review): in the third pattern '$' is unescaped, i.e. the
    # end-of-string anchor rather than a dollar sign; left unchanged so the
    # tokens stay in step with the vocabulary-building code.
    cleanup_patterns = ('http://[a-zA-Z0-9|/|.]*', '%[0-9|.]*', '$[0-9|.]*')

    for speech in speech_list:
        for pattern in cleanup_patterns:
            speech = re.sub(pattern,' ',speech)
        for ch in " \"$!'@#%&()*+,-./:;<=>?[\\]^_`{|}~ ":
            speech = speech.replace(ch,' ')

        # Word lemmatization: noun lemma first, then verb lemma.
        raw_tokens = speech.split()
        lmtzr = WordNetLemmatizer()
        tokens = [lmtzr.lemmatize(lmtzr.lemmatize(token),'v') for token in raw_tokens]

        # A Counter yields 0 for tokens that never occurred.
        token_counts = collections.Counter(tokens)
        sample_matrix.append([token_counts[fea] for fea in features])
        label_vector.append(label)
    return sample_matrix,label_vector
def lemma_tokenize(paragraph):
    """Return the lower-cased lemmas of every word in ``paragraph``.

    ``tokenize(paragraph)`` supplies the sentences (iterables of words).
    If the lookup fails with ``LookupError`` the ``wordnet`` resource is
    downloaded through NLTK and the lemmatization is attempted once more.
    """
    lmtzr = WordNetLemmatizer()

    def collect():
        return [lmtzr.lemmatize(word).lower()
                for sentence in tokenize(paragraph)
                for word in sentence]

    try:
        return collect()
    except LookupError:
        nltk.download('wordnet')
        return collect()
def wordLemmatization(self):
    """Replace ``self.file`` with the list of lemmas of its words."""
    lemma_of = WordNetLemmatizer().lemmatize
    self.file = [lemma_of(word) for word in self.file]
def main():
    """Extract RAKE keywords from the input file and write n-gram counts.

    The text of ``input_file`` is cleaned with ``text_clean``, lemmatized
    token by token and fed to RAKE.  Keywords with a positive score are
    written to ``key_score_file``; the n-grams produced from that file by
    ``generate_unibitrigrams`` are counted, printed and written, most
    frequent first, to ``bcom_ngramfr.csv``.

    Fix: the input file is now read inside a ``with`` block; the original
    never closed it.
    """
    rake = RAKE.Rake('SmartStoplist.txt')
    with open(input_file, 'r') as fp:
        text = fp.read()
    text = text_clean(text)

    wnl = WordNetLemmatizer()
    text = ' '.join([wnl.lemmatize(i.strip()) for i in nltk.word_tokenize(text)])
    keywords = rake.run(text)

    with open(key_score_file, 'wb') as out:
        csv_out = csv.writer(out)
        csv_out.writerow(['KEYWORD', 'SCORE'])
        for row in keywords:
            if row[1] > 0:
                csv_out.writerow(row)

    # Read back only after the keyword file has been closed (flushed).
    unibitrigram_list = generate_unibitrigrams(key_score_file)
    ngram_freq = Counter(unibitrigram_list)
    sorted_ngram_freq = sorted(ngram_freq.items(), key=lambda x: x[1], reverse=True)
    print(ngram_freq)

    with open('bcom_ngramfr.csv', 'wb') as nf_csv:
        csv_wr = csv.writer(nf_csv)
        for item in sorted_ngram_freq:
            if item[0] != '' or item[1] > 0:
                csv_wr.writerow(item)
def clean_single_word(word, lemmatizing="wordnet"):
    """
    Performs stemming or lemmatizing on a single word.

    A word looked up in a clean bag-of-words has to go through the same
    preprocessing as the bag itself; this applies it to one word.

    Inputs:  - word: A string containing the source word.
             - lemmatizing: A string containing one of the following: "porter", "snowball" or "wordnet".

    Output:  - lemma: The resulting clean lemma or stem.

    Raises:  - RuntimeError: after printing a message, when ``lemmatizing`` is none of the above.
    """
    if lemmatizing == "porter":
        return PorterStemmer().stem(word)
    if lemmatizing == "snowball":
        return SnowballStemmer('english').stem(word)
    if lemmatizing == "wordnet":
        return WordNetLemmatizer().lemmatize(word)

    print("Invalid lemmatizer argument.")
    raise RuntimeError
def stemWordMatch(question,sentence):
    """Score the word overlap between a question and a sentence.

    Both texts are tokenized into sets of distinct tokens.  Each sentence
    token adds 3 when it equals a question token ignoring case; otherwise
    it adds 6 when its verb lemma (lower-cased) equals the verb lemma of a
    question token.

    The lowered and lemmatized question tokens are computed once up front;
    the original rebuilt both lists for every sentence token.

    Returns:
        the accumulated integer score.
    """
    lmtzr = WordNetLemmatizer()
    question_tokens = set(nltk.word_tokenize(question))
    sentence_tokens = set(nltk.word_tokenize(sentence))

    question_lowered = set(x.lower() for x in question_tokens)
    question_lemmas = set(lmtzr.lemmatize(x, 'v').lower() for x in question_tokens)

    count = 0
    for token in sentence_tokens:
        if token.lower() in question_lowered:
            count = count + 3
        elif lmtzr.lemmatize(token, 'v').lower() in question_lemmas:
            count = count + 6
    return count
def data_preprocessing(file_path):
    """Load a speech file and build its lemmatized token statistics.

    The file content is split on ``###`` into speeches (the piece after the
    last separator is discarded).  Separately, the whole content is
    lower-cased, stripped of URLs and ``%``-prefixed numbers, has its
    punctuation replaced by spaces, and is split and lemmatized (noun pass,
    then verb pass).

    The file is read once inside a ``with`` block; the original opened it
    twice and never closed the second handle.

    Returns:
        ``[speech_list, unique_tokens_dict, total_tokens_count]`` where
        ``unique_tokens_dict`` is a ``Counter`` (missing keys read as 0).
    """
    with open(file_path,'r') as f:
        raw = f.read()

    # Speeches are separated by ###; the trailing piece is dropped.
    speech_list = raw.split("###")
    del speech_list[-1]

    speeches = raw.lower()
    speeches = re.sub('http://[a-zA-Z0-9|/|.]*',' ',speeches)
    speeches = re.sub('%[0-9|.]*', ' ', speeches)
    # NOTE(review): '$' is unescaped here, i.e. the end-of-string anchor
    # rather than a dollar sign; left unchanged so the vocabulary stays in
    # step with the code that vectorizes speeches.
    speeches = re.sub('$[0-9|.]*',' ', speeches)
    for ch in " \"$!'@#%&()*+,-./:;<=>?[\\]^_`{|}~ ":
        speeches = speeches.replace(ch,' ')

    # Word lemmatization: noun lemma first, then verb lemma.
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(lmtzr.lemmatize(token),'v') for token in speeches.split()]

    total_tokens_count = len(tokens)
    unique_tokens_dict = collections.Counter(tokens)
    return [ speech_list, unique_tokens_dict, total_tokens_count ]
def __tokenize(self,text):
    """ function: tokenize
        ------------------
        generate list of tokens given a block of @text

        Non-ASCII characters, digits and punctuation are removed before
        tokenizing.  Class labels (``Document.banned_words``), stop words
        and words without a WordNet synset are dropped; the rest is
        lemmatized, stemmed (Porter) and kept when the stem has at least
        four characters.

        The stopword list is loaded once per call; the original re-read it
        for every token.

        :param text: string representing article text field
        :returns: list of stems
    """
    ascii_text = text.encode('ascii', 'ignore')
    # remove digits & punctuation in one pass
    stripped = ascii_text.translate(None, string.digits + string.punctuation)
    # separate text blocks into tokens
    tokens = nltk.word_tokenize(stripped)

    # remove class labels, stopwords, and non-english words
    stop_words = set(stopwords.words('english'))
    eng = [w for w in tokens
           if w not in Document.banned_words
           and w not in stop_words
           and wordnet.synsets(w)]

    # lemmatization followed by stemming
    lmtzr = WordNetLemmatizer()
    stemmer = PorterStemmer()
    stems = []
    for token in eng:
        stem = stemmer.stem(lmtzr.lemmatize(token)).encode('ascii', 'ignore')
        if len(stem) >= 4:
            stems.append(stem)
    return stems
def weed_out_lexelts(tweets_file):
    """Return the distinct ``(lemma, pos)`` pairs found in a tweets file.

    Each line is expected to contain `` :: ``; the part after the first
    separator is tokenized and POS-tagged.  Tags are converted with
    ``get_sanitized_pos`` (not the Treebank style) and each word is
    lemmatized with the converted tag; when lemmatization raises
    ``KeyError`` the word is kept unchanged.  A line whose tagging raises
    ``TypeError`` is printed and contributes nothing.

    The de-duplicated list is printed and returned.
    """
    pairs = []
    lemmatizer = WordNetLemmatizer()
    with open(tweets_file, 'r') as twh:
        for line in twh:
            line = line.strip().split(' :: ')[1]
            try:
                tagged = pos_tag(word_tokenize(line))
            except TypeError:
                tagged = []
                print(line)
            for w, p in tagged:
                new_p = get_sanitized_pos(p)
                try:
                    new_w = lemmatizer.lemmatize(w, new_p)
                except KeyError:
                    new_w = w
                # Tuples are immutable, so a fresh pair is stored.
                pairs.append((new_w, new_p))
    lexelts = list(set(pairs))
    print(lexelts)
    return lexelts
def getlemmas(tokens):
    """Map every token to a dict of related word forms with integer marks.

    Returns a list parallel to ``tokens``.  Tokens shorter than two
    characters, tokens rejected by ``isWord`` and the token "the" get an
    empty dict.  For the others the dict is filled in the order below;
    a later assignment to the same key overwrites an earlier one, so the
    final value of a form is the one written last.
    """
    lemmas = []
    for token in tokens:
        # Skipped tokens still occupy a slot so indices stay aligned.
        if len(token) < 2 or not isWord(token) or token == "the":
            lemmas.append({})
            continue
        tokenLemmas = {}
        #Synonyms
        for syn in wn.synsets(token):
            #Derived Forms and their Syns
            for lemma in syn.lemmas():
                for df in lemma.derivationally_related_forms():
                    # Lemma names of the derived form's synset: 4.
                    for ln in df.synset().lemma_names():
                        tokenLemmas[ln] = 4
                    # The derived form itself: 3.
                    tokenLemmas[df.name()] = 3
            # Lemma names of the token's own synset: 2.
            for lname in syn.lemma_names():
                tokenLemmas[lname] = 2
        #Wordnet lemmas
        l = WordNetLemmatizer()
        for x in ('v','a','s','r','n'):
            # Lemmatize for each POS, then lemmatize the result once more.
            tmp = l.lemmatize(token, x)
            tokenLemmas[tmp] = 1
            tmp = l.lemmatize(tmp, x)
            tokenLemmas[tmp] = 1
        #Exact
        tokenLemmas[token] = 1
        lemmas.append(tokenLemmas)
    return lemmas
from gensim.models import Word2Vec from pprint import pprint google = gensim.models.KeyedVectors.load_word2vec_format( '~/word2vec-model/GoogleNews-vectors-negative300.bin', binary=True) with open('data/requirements.txt', 'r') as myfile: data = myfile.read().replace('\n', ' ') #stemmer = PorterStemmer() #stemmed_text1 = [stemmer.stem(i) for i in word_tokenize(data1)] #s1 = ' '.join(stemmed_text1) #print 'Stemmed text1: %s \n\n\n' % s1 lemma = WordNetLemmatizer() #lemma_text = [lemma.lemmatize(i, pos="n") for i in word_tokenize(data1)] # Remove stopwords stops = set(stopwords.words("english")) #lemma_filtered = [word for word in lemma_text if word not in stops] #ls1 = ' '. join(lemma_text) #print 'Lemma text1: %s \n\n\n' % ls1 #with open("data/ls1.txt", 'w') as f: # f.write(ls1) #print 'Text1 %s' % string.join(stemmed_text1, " ")
from gensim import corpora, models, similarities from nltk.tokenize import sent_tokenize, word_tokenize from nltk.corpus import stopwords from nltk import pos_tag from nltk.stem.wordnet import WordNetLemmatizer from nltk.stem.snowball import SnowballStemmer import re from collections import defaultdict lemmatizer = WordNetLemmatizer() snb = SnowballStemmer('english') def remove_punctuation(text): return re.sub(ur"\p{P}+", "", text) def stem_with_replacement(texts): ''' attempts to find a common form of each word. keeps the shortest full word for each stem rather than nonsensible root. common form is taken from the entire corpus rather than just the single document ''' stem_dict = defaultdict(set) unstem = lambda x: min(stem_dict[x], key=len) words = word_tokenize(' '.join(texts).lower()) for word in words: stemmed = snb.stem(word) stem_dict[stemmed].add(word) new_texts = []
'auguments', 'get', 'string', 'prototype', 'nodeType', 'slice', 'header', 'top', 'li', 'style', 'Appendix','Table', 'owl', 'hover', 'pageination'] for i in extra: stop_words.append(i) keywords = [word for word in tokens if not word in stop_words and not word in punctuations] #nltk.download('wordnet') from nltk.stem.wordnet import WordNetLemmatizer ##Convert to list from string text = text.split() #Lemmatisation lem = WordNetLemmatizer() text = [lem.lemmatize(word) for word in text if not word in stop_words] text = " ".join(text) corpus = [] corpus.append(text) #Word cloud from os import path from PIL import Image from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator import matplotlib.pyplot as plt wordcloud = WordCloud( background_color='white', stopwords=stop_words,
def success(text):
    """Clean a lyrics text, print its top words and return a translation.

    ``text`` is placed in a one-row DataFrame (column ``Lyrics``), cleaned
    in several stages (each stage works on a copy of the previous frame),
    vectorized with ``CountVectorizer`` to print the 30 most frequent
    words, and finally the cleaned lyrics of row 0 are translated with
    ``Translator().translate(..., dest='hi')``.

    Returns the translated text formatted with ``'%s'``.
    """
    data = {}
    data[0] = text
    pd.set_option('max_colwidth', 200)
    # One row (index 0) holding the raw text.
    df1 = pd.DataFrame.from_dict(data, orient='index')
    df1.columns = ['Lyrics']

    def round1(text):
        """Lower-case, drop digits, turn non-word characters into spaces."""
        # lower the Text
        text = text.lower()
        # Remove Numbers
        text = re.sub(r"\d+", "", text)
        # Remove Symbols and special characters:
        # every character that is not a word character becomes a space
        text = re.sub(r'[^\w]', ' ', text)
        # Remove more than a single whitespace
        text = ' '.join(text.split())
        # Remove Leading and Trailing Whitespaces
        text = text.strip()
        return text

    rnd1 = lambda x: round1(x)
    df2 = df1.copy()
    df2['Lyrics'] = df2['Lyrics'].apply(rnd1)
    # Punctuation characters are the tokens to discard below.
    stop = list(string.punctuation)

    def cleaning(text):
        """Tokenize each entry and drop tokens that are punctuation marks."""
        clean_doc = []
        for x in text:
            clean_sent = []
            for i in word_tokenize(x):
                # for i in x.lower():
                if i not in stop:
                    clean_sent.append(i)
            clean_doc.append(clean_sent)
        return clean_doc

    df3 = df2.copy()
    df3['Lyrics'] = cleaning(df3['Lyrics'])
    s = ' '
    # Join each token list back into one space-separated string.
    # NOTE(review): this is a chained assignment (column, then .loc);
    # whether it writes through depends on the pandas version -- verify.
    for i in range(len(df3)):
        df3['Lyrics'].loc[i] = s.join(df3['Lyrics'].loc[i])
    wordnet = WordNetLemmatizer()

    def Lemmatizing(text):
        """Apply the lemmatizer to each element of ``text``."""
        # NOTE(review): the elements passed in below are whole lyric
        # strings, not single words -- confirm this is intended.
        pre_doc = []
        for word in text:
            pre_doc.append(wordnet.lemmatize(word))
        return pre_doc

    df4 = df3.copy()
    df4['Lyrics'] = Lemmatizing(df4['Lyrics'])
    # Document-term counts with English stop words removed.
    cv = CountVectorizer(stop_words='english')
    df5 = cv.fit_transform(df4['Lyrics'])
    df6 = pd.DataFrame(df5.toarray(), columns=cv.get_feature_names())
    df6.index = df4.index
    # After the transpose: rows are terms, columns are documents.
    df7 = df6.transpose()
    top_dict = {}
    for c in df7.columns:
        # 30 highest-count terms of this column as (term, count) pairs.
        top = df7[c].sort_values(ascending=False).head(30)
        top_dict[c] = list(zip(top.index, top.values))
    for album, top_words in top_dict.items():
        print(album)
        print(', '.join([word for word, count in top_words]))
        print('------------')
    ts = Translator()
    # Translate the cleaned lyrics of row 0 with dest='hi'.
    res = ts.translate(df4['Lyrics'].loc[0], dest='hi')
    hitext = res.text
    return '%s' % hitext
temptext = re.sub('[^a-zA-Z]', ' ', str(content)) temptext = temptext.lower() tokens = nltk.word_tokenize(temptext) #tokens = [word for word in tokens if word not in set(builtinstopwords)] cleanbody= [lm.lemmatize(word) for word in tokens if not word in set(builtinstopwords)] return (str(cleanbody)[1:-1]) def exec_time(start, end): if (end - start) <= 60: print("Total Execution time was {} seconds".format(end - start)) else: print("Total Execution time was {} minutes".format((end - start)/60)) #%% # TODO - check the difference in time when lemmatizer is instantiated inside the clean function vs outside lm = WordNetLemmatizer() df['cleaned']=df['text'].apply(lambda x : clean(x)) df.reset_index(drop=True,inplace=True) #%% # FIT THE TFIDF VECTORIZER AND PICKLE THE VOCAB #tfidf_obj = TfidfVectorizer(max_df=0.5,min_df=0.01,use_idf=True) tfidf_obj = TfidfVectorizer(max_features = 5000) X_train_tfidf = tfidf_obj.fit_transform(df.cleaned) # X_train_tfidf.shape # feature_list = tfidf_obj.vocabulary_ # feature_list #%%
def __init__(self):
    """Set up the lemmatizer and the tag mapping kept on the instance."""
    # Lemmatizer stored on the instance for later use.
    self.wordnet_lemmatizer = WordNetLemmatizer()
    # Result of tagset_mapping('en-ptb', 'universal'); presumably a lookup
    # from Penn Treebank tags to universal tags -- verify against
    # tagset_mapping.
    self.mapping = tagset_mapping('en-ptb', 'universal')
#exit() Roles_Entities = Extract_Roles_entities(processed_input, CC_resolve_pos_dict, conjunction_index) print("\n\n") print("processed_information: ", processed_information) print("\n\n") print("schema Identification.....") possible_schemas = [] possible_schemas_sent_index = [] for schema, values in schemas_keys.items(): for ind, sent in enumerate(processed_input): sent_tokens = nltk.word_tokenize(sent) for w in sent_tokens: word_lemmatization = WordNetLemmatizer().lemmatize(w, 'v') if (word_lemmatization in values): possible_schemas.append(schema) # Schema and sent's index possible_schemas_sent_index.append(ind) print("\n\n") print("possible_schemas and their sent's indices: ", possible_schemas, possible_schemas_sent_index) print("\n") print("Identified Unique Schemas: ", set(possible_schemas)) Unique_Schemas = set(possible_schemas) print("\n") for pred in Unique_Schemas: #print("Test sample",i+1,pred[i])
pat_s2 = re.compile("(?<=s)\'s?") # to find the abbreviation of not pat_not = re.compile("(?<=[a-zA-Z])n\'t") # to find the abbreviation of would pat_would = re.compile("(?<=[a-zA-Z])\'d") # to find the abbreviation of will pat_will = re.compile("(?<=[a-zA-Z])\'ll") # to find the abbreviation of am pat_am = re.compile("(?<=[I|i])\'m") # to find the abbreviation of are pat_are = re.compile("(?<=[a-zA-Z])\'re") # to find the abbreviation of have pat_ve = re.compile("(?<=[a-zA-Z])\'ve") lmtzr = WordNetLemmatizer() def get_words(file): with open (file) as f: words_box=[] pat = re.compile(r'[^a-zA-Z \']+') for line in f: #if re.match(r'[a-zA-Z]*',line): # words_box.extend(line.strip().strip('\'\"\.,').lower().split()) # words_box.extend(pat.sub(' ', line).strip().lower().split()) words_box.extend(merge(replace_abbreviations(line).split())) return collections.Counter(words_box) def merge(words):
def get_wordnet_pos(treebank_tag): if treebank_tag.startswith('J'): return wordnet.ADJ elif treebank_tag.startswith('V'): return wordnet.VERB elif treebank_tag.startswith('N'): return wordnet.NOUN elif treebank_tag.startswith('R'): return wordnet.ADV else: return '' lemmatizer = WordNetLemmatizer() filename = "../resources/semeval/train/english-lexical-sample.train.xml" output_dir = "../resources/semeval/lexelts" lemmas = [] tree = etree.parse(filename) for lexelt_idx, lexelt in enumerate(tree.findall("lexelt")): lexelt_lemma = lexelt.attrib['item'] lemmas.append(lexelt_lemma) print "Analysing {} (number {})".format(lexelt_lemma, lexelt_idx) sentences = []
# Constituency patterns keyed by the first word of the question.  The first
# pass selects a coarse subtree of the best-matching sentence; the second pass
# narrows it down inside that subtree.
_FIRST_PASS_PATTERNS = {
    'where': "(VP (*) (PP))",
    'when': "(VP (*) (PP))",
    'who': "(NP)",
    'what': "(NP)",
    'why': "(SBAR)",
    'how': "(RB)",
    # don't know how to deal with 'did' questions
    'did': "(S)",
}
_SECOND_PASS_PATTERNS = {
    'where': "(VP)",
    'when': "(VP)",
    'who': "(NP)",
    'what': "(NP)",
    'why': "(SBAR (IN) (S))",
    'how': "(RB)",
    'did': "(S)",
}


def _answer_easy_question(question, story, stemmer, stop_words):
    """Return the chunk-based answer for an 'Easy' question, or None.

    None means no candidate chunk was found and the caller should fall back
    to the baseline.
    """
    # Tokenize the question without punctuation, drop stopwords, stem the rest.
    tokenizer = RegexpTokenizer(r'\w+')
    keywords = [
        stemmer.stem(word)
        for word in tokenizer.tokenize(question["text"])
        if word not in stop_words
    ]

    # Find the sentences that contain the keywords, then chunk them.
    target_sentences = find_sentences(keywords, get_sentences(story["text"]))
    chunker = nltk.RegexpParser(GRAMMAR)
    candidates_forest = find_candidates(target_sentences, chunker,
                                        question["text"])
    if not candidates_forest:
        return None

    # Each candidate is a tree whose leaves are (token, tag) pairs.  The
    # joined text still contains some surrounding words from chunking.
    return " ".join(
        " ".join(token[0] for token in candidate.leaves())
        for candidate in candidates_forest
    )


def _answer_medium_question(question, story, stemmer, stop_words):
    """Answer a 'Medium' question by pattern-matching the best-overlap parse.

    Falls back to ``doBaseline`` when the question word has no pattern or
    when either pattern match fails.
    """
    question_tokens = nltk.word_tokenize(question["text"].lower())
    question_word = question_tokens[0] if question_tokens else None
    if question_word not in _FIRST_PASS_PATTERNS:
        # Previously this case reached pattern_matcher with `pattern` unbound
        # and raised NameError (IndexError for an empty question).
        return doBaseline(question, story)

    # question["qid"] has the form "fables-04-7".
    driver = QABase()
    current_story = driver.get_story(driver.get_question(question["qid"])["sid"])

    use_sch = question["type"] != 'Story'
    sentences = get_sentences(current_story["sch" if use_sch else "text"])
    stemmed_sentences = [
        [(stemmer.stem(word), pos) for word, pos in sent]
        for sent in sentences
    ]

    qbow = get_bow(get_sentences(question["text"])[0], stop_words)
    stemmed_qbow = {stemmer.stem(word) for word in qbow}
    best_idx = best_overlap_index(stemmed_qbow, stemmed_sentences, stop_words)
    tree = current_story["sch_par" if use_sch else "story_par"][best_idx]

    first_pattern = nltk.ParentedTree.fromstring(
        _FIRST_PASS_PATTERNS[question_word])
    subtree1 = pattern_matcher(first_pattern, tree)
    if subtree1 is None:
        return doBaseline(question, story)

    # Match a narrower pattern inside the first subtree.
    second_pattern = nltk.ParentedTree.fromstring(
        _SECOND_PASS_PATTERNS[question_word])
    subtree2 = pattern_matcher(second_pattern, subtree1)
    if subtree2 is None:
        # Previously crashed with AttributeError on subtree2.leaves().
        return doBaseline(question, story)

    # cheat for dealing with 'did' questions
    if question_word == 'did':
        return "yes"
    return " ".join(subtree2.leaves())


def get_answer(question, story):
    """Return the answer string for one question about one story.

    :param question: dict
    :param story: dict
    :return: str

    question is a dictionary with keys:
        dep -- A list of dependency graphs for the question sentence.
        par -- A list of constituency parses for the question sentence.
        text -- The raw text of the question.
        sid -- The story id.
        difficulty -- this function compares it against 'Easy' and 'Medium';
            any other value goes straight to the baseline.
        type -- whether to use the 'sch' or 'story' version of the story
            ('Story' selects the story version, anything else selects sch).
        qid -- The id of the question.

    story is a dictionary with keys:
        story_dep -- list of dependency graphs for each sentence of the
            story version.
        sch_dep -- list of dependency graphs for each sentence of the sch
            version.
        sch_par -- list of constituency parses for each sentence of the sch
            version.
        story_par -- list of constituency parses for each sentence of the
            story version.
        sch -- the raw text for the sch version.
        text -- the raw text for the story version.
        sid -- the story id

    'Easy' questions are answered from chunked candidate phrases, 'Medium'
    questions from constituency-pattern matching.  Whenever a strategy
    finds nothing, the answer comes from ``doBaseline``.
    """
    stemmer = SnowballStemmer("english")
    stop_words = set(nltk.corpus.stopwords.words("english"))

    difficulty = question["difficulty"]
    if difficulty == 'Easy':
        answer = _answer_easy_question(question, story, stemmer, stop_words)
        if answer is not None:
            return answer
    elif difficulty == 'Medium':
        return _answer_medium_question(question, story, stemmer, stop_words)

    return doBaseline(question, story)
# if ele[6] == '331786748': # s = ele[1] + " " + ele[2] # print(s,file=tfile) docs = [] with open( '/Users/shrey/AnacondaProjects/Application_reviews/Experiments/CNNnouns/RawData/CNNnews.txt', 'r') as tfile: for line in tfile: docs.append(line) tokenizer = RegexpTokenizer(r'\w+') en_stop = get_stop_words('en') en_stop.extend(['app', 'cnn', 'news']) #p_stemmer = PorterStemmer() lemma = WordNetLemmatizer() texts = [] # loop through document list for i in docs: # clean and tokenize document string raw = i.lower() tokens = tokenizer.tokenize(raw) # remove stop words from tokens stopped_tokens = [i for i in tokens if not i in en_stop] tagged = nltk.pos_tag(stopped_tokens) nouns = [i[0] for i in tagged if i[1][0] == 'N'] # stem tokens
# Converting symptoms scraper to the corresponding root word ( Precalculation )
import re

# Read the whole symptom list.  The context manager closes the handle, which
# the previous bare open() never did.
with open('symptoms.txt') as file:
    data = file.read()
arr = data.split('\n')

# One token list per input line; a token is a run of word characters and
# apostrophes.
lls = list()
for line in arr:
    lls.append(re.findall(r"[\w']+", line))

from nltk.stem.wordnet import WordNetLemmatizer
lmtzr = WordNetLemmatizer()

# Replace every token with a root form.  The verb lemma is used unless the
# verb lemma left the word unchanged while the noun lemma did change it; in
# that one case the noun lemma is used.
nlls = list()
for il in lls:
    nls = list()
    for tmp in il:
        word = tmp.lower()
        temp_n = lmtzr.lemmatize(word)
        temp_v = lmtzr.lemmatize(word, 'v')
        if temp_n != word and temp_v == word:
            nls.append(str(temp_n))
        else:
            nls.append(str(temp_v))
    nlls.append(list(nls))
# Build one instance of each stemmer/lemmatizer so their output can be
# compared on the same words.
from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()
from nltk.stem import PorterStemmer
pt = PorterStemmer()
from nltk.stem.snowball import EnglishStemmer
sb = EnglishStemmer()
from nltk.stem.wordnet import WordNetLemmatizer
wn = WordNetLemmatizer()
## let's examine the word "better"
# NOTE: these are bare expression statements, so their results are only
# visible when run interactively; as a script they are discarded.
st.stem('better')
pt.stem('better')
sb.stem('better')
# The lemmatizer takes a part-of-speech argument ('a' and 'n' here).
wn.lemmatize('better', 'a')
wn.lemmatize('families', 'n')
##
## applying the porter stemmer to the gettysburg address
# NOTE(review): text_4 is defined outside this view -- presumably the
# tokenised address; confirm against the preceding code.
text_5 = map(pt.stem, text_4)
## now creating a dictionary that will count the occurrence of the words
# Substring rewrites applied to every raw token, in this order (order matters:
# e.g. hyphens are removed before the later entries are tried).
_TOKEN_REPLACEMENTS = (
    ("'s", " "), ("n't", "not"), ('-', ''), ("'ll", "will"), ('my—', 'my'),
    ("'cross", 'across'), ("'ve", 'have'), ("'bout", "about"), ("'m", "am"),
    ("'d", "would"), ("'re", 'are'), ('wantt', 'want'), ('mr.', 'mister'),
    ('ms.', 'miss'), ('murda', 'murder'), ('like-', 'like'),
    ('smallz', 'small'),
)

# Whole-token expansions applied after lowercasing.
_SLANG_EXPANSIONS = {
    'wo': 'would', 'gon': 'going', 'wan': 'want', 'na': 'to', 'ta': 'to',
    'ya': 'you', 'lil': 'little', 'ain': 'am', "'em": 'them', 'em': 'them',
    'cause': 'because', "'cause": 'because', 't': 'not', 'till': 'until',
    "'till": 'until', "'til": 'until', 'til': 'until', 'hol': 'hold',
    'l': 'lost', 'cali': 'california',
}

# Punctuation and filler tokens that are always dropped.  The original code
# carried two copies of this list; the second was missing a comma
# ('”' + 'mme'), so the copies disagreed.  A single set is used now.
_NOISE_TOKENS = frozenset([
    '[]', '[:’', ':', '[', ']', '?', ',', ')', '(', ' ', ';', '—', '!', "'",
    '’', '.', '"', "...", '“', '”', 'mme', "''", '``', 'si', 'vv', 'c', 'ii',
    '+', '$',
])

# Project-specific stopwords added on top of NLTK's English list.
_EXTRA_STOPWORDS = frozenset([
    'ap', 'i', "y'all", 'm.', 'mme', 'donot', 'rah', '&', 'de', 'b', 'ca',
    'of', 'us', 'the', 'at', 'in', 'and', 'be', 'it', 'what', 'sv', 'lo',
    'd', 'n', 'spotify', 'record', 'studios', 'chorus', 'verse', 'intro',
    'outro',
])

# A token consisting solely of one to three digits.
_SHORT_NUMBER_RE = re.compile(r'^[0-9]{1,3}$')


def clean_tokens(df):
    """Normalise, filter and verb-lemmatise tokenised sentences.

    :param df: iterable of sentences, each an iterable of string tokens.
    :return: list with one list of lemmas per input sentence, in order.

    Each token is rewritten with ``_TOKEN_REPLACEMENTS``, blanked if it is a
    1-3 digit number, lowercased, expanded via ``_SLANG_EXPANSIONS`` (or cut
    at a trailing em dash), and split on underscores.  Parts that are noise
    tokens, stopwords or artist-name tokens are dropped; the rest are
    lemmatised as verbs.

    NOTE: artist names are read from the module-level ``dff['artists']``,
    not from the ``df`` argument.
    """
    # Sets give O(1) membership tests; these were lists scanned per token.
    artist_tokens = {
        item.lower()
        for artist in dff['artists']
        for item in nltk.word_tokenize(artist)
    }
    stop_set = set(nltk.corpus.stopwords.words('english')) | _EXTRA_STOPWORDS
    lmtzr = WordNetLemmatizer()

    hh = []
    for sent in df:
        kept = []
        for token in sent:
            tt = token
            for old, new in _TOKEN_REPLACEMENTS:
                tt = tt.replace(old, new)
            # A blanked number becomes ' ', which is a noise token below.
            tt = _SHORT_NUMBER_RE.sub(' ', tt).lower()

            if tt in _SLANG_EXPANSIONS:
                tt = _SLANG_EXPANSIONS[tt]
            elif tt.endswith('—'):
                tt = tt.split('—')[0]

            for part in tt.split('_'):
                if (part not in _NOISE_TOKENS
                        and part not in stop_set
                        and part not in artist_tokens):
                    kept.append(part)
        hh.append([lmtzr.lemmatize(word, 'v') for word in kept])
    return hh
import nltk # nltk.download() # Just for one time downloading with GUI # nltk.download("stopwords") # Just for one time downloading with command # https://www.nltk.org/book/ # 2. Text Pre-processing : Lexicon Normalization # Stemming from nltk.stem.porter import PorterStemmer stem = PorterStemmer() word = "races" print(stem.stem(word)) # Lemmatization from nltk.stem.wordnet import WordNetLemmatizer lem = WordNetLemmatizer() word = "playing" print(lem.lemmatize(word, "v")) # 2. Text Pre-processing : Object Standardization dictionary = { "brb": "be right back", "cb": "call back", "awsm": "awesome", "lol": "laugh out loud" } def objectStandardization(text): words = text.split() substitutedWords = []
from nltk.stem.wordnet import WordNetLemmatizer
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# word_tokenize accepts a string as an input, not a file.
wordlist = ["one", "im", "would", "also", "ive", "lol"]
stop_words = set(stopwords.words('english'))

# Raw strings keep the backslashes of the Windows paths literal.
with open(r"C:\TuDiabetes_Code\Diabetes_Text_New\All.txt") as file1:
    line = file1.read()

# Append every non-stopword to the clean-text file, each preceded by a space.
# The file is opened once instead of once per word, and only when there is
# something to write (as before, nothing is created otherwise).
words = line.split()
kept_words = [r for r in words if r not in stop_words]
if kept_words:
    with open(r'C:\TuDiabetes_Code\Diabetes_Text_New\CleanText.txt',
              'a') as appendFile:
        appendFile.write("".join(" " + r for r in kept_words))

lemma = WordNetLemmatizer()
exclude = set(string.punctuation)
stoplist = stopwords.words('english')
stoplist = stoplist + wordlist
stop = set(stoplist)


# NOTE(review): the original statements below sat at module level, where
# `return` is a SyntaxError and `doc` is undefined, so the function header
# had evidently been lost.  The name `clean` is an assumption -- confirm
# against the callers.
def clean(doc):
    """Return *doc* lowercased, without stop words or punctuation, lemmatised.

    :param doc: text to normalise.
    :return: the remaining words, lemmatised and joined by single spaces.
    """
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized
def _head_word_lemmas(tokens, lmt):
    """Return verb lemmas of every token that has at least one dependent.

    ``tokens`` are dependency tokens where field 1 is the word form and
    field 6 is the 1-based position of the token's head (0 for a root).
    Tokens are visited breadth-first starting from the roots.
    """
    pending = [(i + 1, token) for i, token in enumerate(tokens)
               if token[6] == 0]
    heads = []
    cursor = 0  # index-based FIFO; avoids O(n) removals from the list front
    while cursor < len(pending):
        position, token = pending[cursor]
        cursor += 1
        dependents = [(i + 1, word) for i, word in enumerate(tokens)
                      if word[6] == position]
        pending.extend(dependents)
        if dependents:
            heads.append(lmt.lemmatize(token[1], 'v'))
    return heads


def _related_lemma_names(head_words):
    """Collect WordNet lemma names related to *head_words*.

    For every synset of every word this gathers the synset's own lemma
    names plus those of its direct hypernyms and hyponyms.  Duplicates are
    kept.
    """
    related = []
    for word in head_words:
        for synset in wn.synsets(word):
            related.extend(lemma.name() for lemma in synset.lemmas())
            for hypernym in synset.hypernyms():
                related.extend(hypernym.lemma_names())
            for hyponym in synset.hyponyms():
                related.extend(hyponym.lemma_names())
    return related


def head_related(query, candidate):
    """Score how well the head words of *candidate* match those of *query*.

    :param query: query text.
    :param candidate: candidate text.
    :return: ``(relHeadScore, exactHeadScore)``.  ``exactHeadScore`` is the
        number of equal (query head, candidate head) pairs divided by the
        number of candidate head words.  ``relHeadScore`` is the number of
        candidate related lemma names that also occur among the query's
        related lemma names, divided by the number of candidate related
        names.  Either score is 0 when its denominator is 0.

    NOTE(review): both scores use ``/`` on integers, which truncates under
    Python 2 unless true division is enabled -- confirm the intended
    interpreter.
    """
    lmt = WordNetLemmatizer()
    sd = StanfordDependencies.get_instance(backend='subprocess')
    a = Annotator()

    def head_words(text):
        syn_tree = a.getAnnotations(text)['syntax_tree']
        return _head_word_lemmas(sd.convert_tree(syn_tree), lmt)

    qHeadWords = head_words(query)
    cHeadWords = head_words(candidate)

    # Only membership is tested on the query side, so a set is enough.
    queryRel = set(_related_lemma_names(qHeadWords))
    candidateRel = _related_lemma_names(cHeadWords)

    # Explicit zero checks replace the former bare `except:` clauses.
    exactHeadScore = 0
    if cHeadWords:
        exact_matches = sum(qHeadWords.count(word) for word in cHeadWords)
        exactHeadScore = exact_matches / len(cHeadWords)

    relHeadScore = 0
    if candidateRel:
        related_matches = sum(1 for name in candidateRel if name in queryRel)
        relHeadScore = related_matches / len(candidateRel)

    return relHeadScore, exactHeadScore
# Load every book; the context manager closes each handle even if a read
# fails part-way.
for filename in files:
    with open(path + filename, 'r') as file:
        books.append(file.read())

# corpuses = categorized + plaintext
corpuses = [books]

# Reading Input File
with open("test.txt", 'r') as file:
    text = file.read()

# Lowercase, then lemmatise, every token of the input text.
lmtzr = WordNetLemmatizer()
words = [lmtzr.lemmatize(w.lower()) for w in word_tokenize(text)]
count_of_words = len(words)
fd = nltk.FreqDist(words)

# Blob Parsing
# blob = TextBlob(text)
# words = [n.lower() for n,t in blob.tags if t == 'NN' or t == 'NNP']

# Stop Words Removal
# NOTE: this rebinds `stop_words` from the callable to its result, so it
# cannot be called again after this line.
stop_words = stop_words()
words = [
    w for w in words
    if w not in stop_words and w.isalpha() and len(w) > 1
]
words = list(set(words))  # keep each remaining word once; order is arbitrary
words_dict = {}
text_file = "fables-01.sch"
dep_file = "fables-01.sch.dep"
q_file = "fables-01.questions.dep"

# Load the dependency graphs for the story sentences and for the questions.
sgraphs = read_dep_parses(dep_file)
qgraphs = read_dep_parses(q_file)

# TODO: find_answer() may need different rules for different question
# types.  The current rule suits "Where was the crow sitting?" but not
# necessarily the others; work this out as in the chunking demo.
for qgraph in qgraphs:
    print("Question:", pretty_question(qgraph), "?")
    answer = find_answer(qgraph, sgraphs)
    print("Answer:", answer)
    print()

# Lemmatizer example: print the lemma of every word in the second story
# graph, treating V* tags as verbs and everything else as nouns.
print("\nLemma:")
lmtzr = WordNetLemmatizer()
for node in sgraphs[1].nodes.values():
    tag = node["tag"]
    word = node["word"]
    if word is None:
        continue
    print(lmtzr.lemmatize(word, 'v' if tag.startswith("V") else 'n'))
print()
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
from collections import defaultdict

# First letter of a POS tag -> WordNet POS.  Any letter not listed falls
# back to NOUN through the defaultdict factory.
tag_map = defaultdict(lambda: wn.NOUN)
tag_map.update({'J': wn.ADJ, 'V': wn.VERB, 'R': wn.ADV})

text = "Another way of achieving this task. I ate an apple."
text2 = "ate"

# Lemmatise each token using the POS derived from its tag.
tokens = word_tokenize(text)
lmtzr = WordNetLemmatizer()
for token, tag in pos_tag(tokens):
    wordnet_pos = tag_map[tag[0]]
    lemma = lmtzr.lemmatize(token, wordnet_pos)
    print(token, "=>", lemma, tag)

# '0' is not a mapped letter, so this lookup yields the NOUN default.
lemma2 = lmtzr.lemmatize(text2, tag_map['0'])
print(lemma2)
df = pd.read_csv('breast-cancer-wisconsin.data')
# NOTE(review): `X` is not defined in the visible code -- confirm where it
# comes from.
X = X._get_numeric_data()
# delete 'Survived', the response vector (Series)
X.drop('Survived', axis=1, inplace=True)
# we drop age for the sake of this example because it contains NaN in some examples
X.drop('Age', axis=1, inplace=True)

# Length of text before cleaning
sum([len(x) for x in data])  #160

# Cleaning and tokenizing data
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()


def clean(doc):
    """Return *doc* lowercased, stripped of noise, and lemmatised.

    Stop words are removed first, then punctuation characters, then digit
    runs; the remaining words are lemmatised and joined by single spaces.
    """
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    remove_numbers = re.sub(r"[0-9]+", "", punc_free)
    normalized = " ".join(
        lemma.lemmatize(word) for word in remove_numbers.split())
    return normalized


texts = [text for text in data if len(text) > 2]
doc_clean = [clean(doc).split() for doc in texts]
# Flatten the nested lists into one list.  sum(doc_clean, []) copied the
# accumulated list on every step (quadratic); this is a single pass.
all_words = [word for doc in doc_clean for word in doc]
#dictionary = corpora.Dictionary(doc_clean)
#doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
def lemmatize(self, tweet):
    """Lemmatise every word of *tweet*.

    The text is split on single spaces, each piece is passed through
    ``WordNetLemmatizer.lemmatize`` with its default part of speech, and
    the pieces are re-joined with single spaces.

    :param tweet: text to lemmatise.
    :return: the lemmatised text as a ``str``.
    """
    lem = WordNetLemmatizer()
    # str.join takes any iterable of strings; the intermediate NumPy array
    # the original built here added only an extra copy.
    return " ".join(lem.lemmatize(word) for word in tweet.split(" "))
def lemmatize(features):
    """Return the WordNet lemma of each item in *features*, in order."""
    lemmatize_one = WordNetLemmatizer().lemmatize
    return list(map(lemmatize_one, features))
""" Created on Wed Mar 22 14:12:34 2017 @author: Pooja Lahoti """ import nltk import re import json from nltk.stem.wordnet import WordNetLemmatizer from wordcloud import WordCloud import matplotlib.pyplot as plt from nltk.stem.lancaster import LancasterStemmer ls = LancasterStemmer() wnl = WordNetLemmatizer() #reload(sys) #sys.setdefaultencoding('utf8') #from nltk.stem.porter import PorterStemmer #ps = PorterStemmer() # #from nltk.stem.snowball import SnowballStemmer #ss = SnowballStemmer("english") # #stopwords.append(unicode("trump", "utf-8")) #stopwords.append(unicode("https", "utf-8")) #stopwords.append(unicode("Donald", "utf-8")) #stopwords.append(unicode("@realdonald", "utf-8")) #stopwords.append(unicode("RT", "utf-8")) #stopwords = set(stopwords) #stopwords.update(("https","geo","trump"))
def clean_data(data):
    """Apply ``clean`` to every document in *data*.

    The English stopword set, the punctuation set and one shared
    lemmatizer are built once and passed to each ``clean`` call.

    :param data: iterable of documents.
    :return: list of the ``clean`` results, in input order.
    """
    stop_set = set(stopwords.words('english'))
    punctuation = set(string.punctuation)
    lemmatizer = WordNetLemmatizer()

    cleaned = []
    for doc in data:
        cleaned.append(clean(doc, stop_set, punctuation, lemmatizer))
    return cleaned
class tfidf:
    """TF-IDF cosine-similarity search over ``movies_metadata.csv``.

    ``inverted_index`` maps token -> {doc_id: term count, ..., "df": number
    of documents containing the token}.  ``document_vector`` maps doc_id ->
    {token: length-normalised tf-idf weight, ..., "_sum_": bookkeeping}.
    Both are cached on disk as pickles next to the script.
    """

    _INDEX_PICKLE = 'invertedIndexPickle.pkl'
    _VECTOR_PICKLE = 'documentVectorPickle.pkl'

    def __init__(self):
        # Data Fetch
        self.meta_cols = {"id": None, "original_title": None,
                          "overview": None, "release_date": None}
        meta_data = pd.read_csv('movies_metadata.csv',
                                usecols=list(self.meta_cols),
                                index_col="id")
        self.meta_data = meta_data.dropna(subset=["overview"])
        self.N = self.meta_data.shape[0]

        # Pre-processing
        self.tokenizer = RegexpTokenizer(r'[a-zA-Z0-9]+')
        self.stopword = stopwords.words('english')
        # SnowballStemmer requires a language; the bare call raised TypeError.
        self.stemmer = SnowballStemmer("english")
        self.lemmatizer = WordNetLemmatizer()

        self.inverted_index = {}
        self.document_vector = {}
        # Use the cache only when BOTH files exist; otherwise rebuild.
        if (os.path.isfile(self._INDEX_PICKLE)
                and os.path.isfile(self._VECTOR_PICKLE)):
            # NOTE: pickle executes code on load; only trust cache files this
            # program wrote itself.
            with open(self._INDEX_PICKLE, 'rb') as handle:
                self.inverted_index = pickle.load(handle)
            with open(self._VECTOR_PICKLE, 'rb') as handle:
                self.document_vector = pickle.load(handle)
        else:
            self.build()
            self.save()

    def build(self):
        """Build the inverted index and the normalised document vectors."""
        self.create_inverted_index()
        self.build_doc_vector()

    def save(self):
        """Write both structures to their pickle cache files."""
        with open(self._INDEX_PICKLE, 'wb') as handle:
            pickle.dump(self.inverted_index, handle)
        with open(self._VECTOR_PICKLE, 'wb') as handle:
            pickle.dump(self.document_vector, handle)

    def create_inverted_index(self):
        """Index the text of every metadata row under its row index."""
        for row in self.meta_data.itertuples():
            index = getattr(row, 'Index')
            data = []
            for col, parameters in self.meta_cols.items():
                if col == "id":
                    continue
                col_values = getattr(row, col)
                if parameters is None:
                    # Plain text column; non-string cells (e.g. NaN) count
                    # as empty.
                    data.append(col_values if isinstance(col_values, str)
                                else "")
                    continue
                # Structured column: a literal list of dicts from which the
                # configured keys are extracted.
                col_values = ast.literal_eval(
                    col_values if isinstance(col_values, str) else '[]')
                if isinstance(col_values, bool):
                    continue
                for col_value in col_values:
                    for param in parameters:
                        data.append(col_value[param])
            self.insert(index, self.pre_processing(' '.join(data)))

    def build_doc_vector(self):
        """Compute a length-normalised tf-idf vector for every document."""
        for token_key, token_values in self.inverted_index.items():
            idf = math.log10(self.N / token_values["df"])
            for doc_key, term_count in token_values.items():
                if doc_key == "df":
                    continue
                tf_idf = (1 + math.log10(term_count)) * idf
                vector = self.document_vector.setdefault(doc_key,
                                                         {"_sum_": 0.0})
                vector[token_key] = tf_idf
                vector["_sum_"] += math.pow(tf_idf, 2)

        for tf_idf_vector in self.document_vector.values():
            normalize = math.sqrt(tf_idf_vector["_sum_"])
            if normalize == 0:
                # All weights are zero (every token occurs in every
                # document); dividing would raise ZeroDivisionError.
                continue
            for tf_idf_key in tf_idf_vector:
                tf_idf_vector[tf_idf_key] /= normalize

    def insert(self, index, tokens):
        """Record one occurrence of each token in *tokens* for doc *index*."""
        for token in tokens:
            value = self.inverted_index.get(token)
            if value is None:
                self.inverted_index[token] = {index: 1, "df": 1}
            elif index in value:
                value[index] += 1
            else:
                # First occurrence of this token in this document.
                value[index] = 1
                value["df"] += 1

    def pre_processing(self, data_string):
        """Tokenise, lowercase, drop stopwords and lemmatise *data_string*.

        Tokens are lowercased BEFORE the stopword test, so capitalised stop
        words such as "The" are filtered as well.  Caches written before
        this fix may contain such tokens; delete the pickles to rebuild.
        """
        processed_data = []
        for t in self.tokenizer.tokenize(data_string):
            lowered = t.lower()
            if lowered not in self.stopword:
                processed_data.append(self.lemmatizer.lemmatize(lowered))
        return processed_data

    def get_relevant_docs(self, query_list):
        """Return the set of doc ids containing at least one query token."""
        relevant_docs = set()
        for query in query_list:
            relevant_docs.update(self.inverted_index.get(query, ()))
        relevant_docs.discard("df")  # bookkeeping key, not a document
        return relevant_docs

    def build_query_vector(self, processed_query):
        """Return ``(query_vector, idf_vector, tf_vector)`` for a query.

        ``query_vector`` holds length-normalised tf-idf weights.  Only
        tokens present in the index are included.
        """
        query_vector = {}
        tf_vector = {}
        idf_vector = {}
        squared_sum = 0
        # Visit each distinct token once: repeated tokens used to add their
        # square to the norm several times and deflate the weights.
        for token in dict.fromkeys(processed_query):
            if token not in self.inverted_index:
                continue
            tf = 1 + math.log10(processed_query.count(token))
            idf = math.log10(self.N / self.inverted_index[token]["df"])
            tf_vector[token] = tf
            idf_vector[token] = idf
            tf_idf = tf * idf
            query_vector[token] = tf_idf
            squared_sum += math.pow(tf_idf, 2)

        norm = math.sqrt(squared_sum)
        if norm:  # a zero norm means every weight is already zero
            for token in query_vector:
                query_vector[token] /= norm
        return query_vector, idf_vector, tf_vector

    def similarity(self, relevant_docs, query_vector, idf_vector, tf_vector):
        """Rank *relevant_docs* by cosine similarity to the query.

        :return: ``(top, tf_terms, idf_terms, tf_idf_terms)``.  ``top`` is
            the 50 best ``(doc_id, score)`` pairs, highest score first.
            The other three map doc_id -> list of ``(token, value)`` pairs
            with the query tf, the query idf and the per-token score
            contribution respectively.

        NOTE(review): the original body used several names that were never
        defined (``score_final``, ``score_tf_idf_term``, ``TermTf_keys``,
        ``tf_term_new``, ...).  The per-document term lists were rebuilt
        from what ``get_movie_info`` consumes -- confirm against the
        intended output.
        """
        final_score = {}
        tf_term_new = {}
        idf_term_new = {}
        tf_idf_term_new = {}

        term_tf = [(token, tf_vector.get(token, 0)) for token in query_vector]
        term_idf = [(token, idf_vector.get(token, 0))
                    for token in query_vector]

        for doc in relevant_docs:
            doc_vector = self.document_vector.get(doc, {})
            contributions = [
                (token, weight * doc_vector.get(token, 0))
                for token, weight in query_vector.items()
            ]
            final_score[doc] = sum(score for _, score in contributions)
            tf_term_new[doc] = list(term_tf)
            idf_term_new[doc] = list(term_idf)
            tf_idf_term_new[doc] = contributions

        sorted_final_score = sorted(final_score.items(),
                                    key=operator.itemgetter(1), reverse=True)
        return (sorted_final_score[:50], tf_term_new, idf_term_new,
                tf_idf_term_new)

    def get_movie_info(self, sorted_score_list, tf_new, idf_new, tf_idf_new):
        """Return one display tuple per ranked document.

        Each tuple is ``(title, overview, score, idf terms, tf terms,
        tf-idf terms, release date)``.
        """
        result = []
        for doc_id, score in sorted_score_list:
            row = self.meta_data.loc[doc_id]
            overview = row["overview"]
            result.append((
                row["original_title"],
                overview if isinstance(overview, str) else "",
                score,
                idf_new[doc_id],
                tf_new[doc_id],
                tf_idf_new[doc_id],
                row["release_date"],
            ))
        return result
class ToxicComment:
    """One labelled comment, cleaned, tokenised and mapped to GloVe indices."""

    _eng_stopwords = set(stopwords.words("english"))
    _lemmatizer = WordNetLemmatizer()
    _tokenizer = TweetTokenizer()

    # Patterns are raw strings (the originals relied on invalid escape
    # sequences such as '\d') and are compiled once.
    _IP_ADDRESS_RE = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
    # NOTE(review): '.*' is greedy, so everything from the first '[[' to the
    # last ']' is removed -- confirm that is intended.
    _USERNAME_RE = re.compile(r'\[\[.*\]')
    _NON_WORD_RE = re.compile(r"\W+")

    # Contraction -> expansion.  Duplicate keys from the original literal
    # were removed, keeping the value that was effective at runtime (the
    # last one: "i'd" -> "I had").  "we'll" used to map to " will".
    _appos = {
        "aren't": "are not", "can't": "cannot", "couldn't": "could not",
        "didn't": "did not", "doesn't": "does not", "don't": "do not",
        "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
        "he'd": "he would", "he'll": "he will", "he's": "he is",
        "i'd": "I had", "i'll": "I will", "i'm": "I am", "isn't": "is not",
        "it's": "it is", "it'll": "it will", "i've": "I have",
        "let's": "let us", "mightn't": "might not", "mustn't": "must not",
        "shan't": "shall not", "she'd": "she would", "she'll": "she will",
        "she's": "she is", "shouldn't": "should not", "that's": "that is",
        "there's": "there is", "they'd": "they would",
        "they'll": "they will", "they're": "they are",
        "they've": "they have", "we'd": "we would", "we're": "we are",
        "weren't": "were not", "we've": "we have", "what'll": "what will",
        "what're": "what are", "what's": "what is", "what've": "what have",
        "where's": "where is", "who'd": "who would", "who'll": "who will",
        "who're": "who are", "who's": "who is", "who've": "who have",
        "won't": "will not", "wouldn't": "would not", "you'd": "you would",
        "you'll": "you will", "you're": "you are", "you've": "you have",
        "'re": " are", "wasn't": "was not", "we'll": "we will",
    }

    @staticmethod
    def _clean(comment):
        """Return *comment* normalised to space-separated word characters.

        Steps: lowercase; replace newlines with spaces; strip IP addresses
        and ``[[...]`` spans; tokenise; expand contractions; lemmatise as
        verbs; drop English stopwords; collapse non-word runs to a space.
        """
        comment = comment.lower().replace('\n', ' ')
        comment = ToxicComment._IP_ADDRESS_RE.sub('', comment)
        comment = ToxicComment._USERNAME_RE.sub('', comment)

        words = ToxicComment._tokenizer.tokenize(comment)
        # e.g. "that's" -> "that is"
        words = [ToxicComment._appos.get(word, word) for word in words]
        words = [ToxicComment._lemmatizer.lemmatize(word, "v")
                 for word in words]
        words = [word for word in words
                 if word not in ToxicComment._eng_stopwords]

        # Collapsing every non-word run to one space leaves no double
        # spaces, so the former no-op space substitution is gone.
        return ToxicComment._NON_WORD_RE.sub(" ", " ".join(words))

    def __init__(self, csv_row, glove_model, comment_max_length):
        """Build the comment from one CSV row.

        :param csv_row: mapping with ``id``, ``comment_text`` and the six
            label columns read below.
        :param glove_model: object whose ``token_to_embedding`` mapping
            yields entries with an ``index`` attribute.
        :param comment_max_length: number of token slots in
            ``indexed_tokens``; extra tokens are ignored.
        """
        self._id = csv_row['id']
        self._comment_text = csv_row['comment_text']
        self._tokens = word_tokenize(
            ToxicComment._clean(csv_row['comment_text']))
        self._labels = np.array([
            float(csv_row['toxic']),
            float(csv_row['severe_toxic']),
            float(csv_row['obscene']),
            float(csv_row['threat']),
            float(csv_row['insult']),
            float(csv_row['identity_hate'])
        ])
        self._indexed_tokens = np.zeros(shape=[comment_max_length],
                                        dtype=np.int32)
        self._token_count = min(len(self._tokens), comment_max_length)

        embeddings = glove_model.token_to_embedding
        for i, token in enumerate(self._tokens[:comment_max_length]):
            token = token.lower()
            # Tokens missing from the vocabulary share the index of the
            # word 'something'.
            if token in embeddings:
                entry = embeddings[token]
            else:
                entry = embeddings['something']
            self._indexed_tokens[i] = entry.index

    @property
    def tokens(self):
        return self._tokens

    @property
    def labels(self):
        return self._labels

    @property
    def indexed_tokens(self):
        return self._indexed_tokens

    @property
    def token_count(self):
        return self._token_count

    @property
    def id(self):
        return self._id

    @property
    def comment_text(self):
        return self._comment_text
def _build_arg_parser():
    """Create the command-line parser for the training script."""
    parser = argparse.ArgumentParser(description='PyTorch PennTreeBank RNN/LSTM Language Model')
    parser.add_argument('--data', type=str, default='../data/', help='location of the data corpus')
    parser.add_argument('--presaved', action='store_true', help='use presaved data')
    parser.add_argument('--glovedata', type=str, default='../data/glove.6B', help='location of the pretrained glove embeddings')
    parser.add_argument('--din', type=int, default=30, help='length of LSTM')
    parser.add_argument('--demb', type=int, default=100, help='size of word embeddings')
    parser.add_argument('--dhid', type=int, default=100, help='number of hidden units per layer')
    parser.add_argument('--dout', type=int, default=2, help='number of output classes')
    parser.add_argument('--nlayers', type=int, default=1, help='number of layers')
    parser.add_argument('--lr', type=float, default=0.001, help='initial learning rate')
    parser.add_argument('--clip', type=float, default=0.25, help='gradient clipping')
    parser.add_argument('--embinit', type=str, default='random', help='embedding weight initialization type')
    parser.add_argument('--decinit', type=str, default='random', help='decoder weight initialization type')
    parser.add_argument('--hidinit', type=str, default='random', help='recurrent hidden weight initialization type')
    parser.add_argument('--dropout', type=float, default=0.0, help='dropout applied to layers (0 = no dropout)')
    parser.add_argument('--epochs', type=int, default=40, help='upper epoch limit')
    parser.add_argument('--batchsize', type=int, default=20, metavar='N', help='batch size')
    # NOTE(review): --seed is parsed but never applied anywhere in main().
    parser.add_argument('--seed', type=int, default=3, help='random seed')
    parser.add_argument('--vocabsize', type=int, default=200000, help='vocabulary size')
    # NOTE(review): main() always builds an Adam optimizer, so this flag
    # currently has no effect on training (it is only echoed in the config).
    parser.add_argument('--optimizer', action='store_true', help='use ADAM optimizer')
    parser.add_argument('--pipeline', action='store_true', help='use pipeline file')
    parser.add_argument('--psw', type=int, default=1, help='remove stop words')
    parser.add_argument('--ppunc', action='store_true', help='remove punctuation')
    parser.add_argument('--pntok', action='store_true', help='use number tokens')
    parser.add_argument('--pkq', action='store_true', help='keep question words')
    parser.add_argument('--stem', action='store_true', help='use stemmer')
    parser.add_argument('--lemma', action='store_true', help='use lemmatizer')
    # NOTE(review): --freezeemb is parsed but not read in main().
    parser.add_argument('--freezeemb', action='store_false', help='freezes embeddings')
    parser.add_argument('--cuda', action='store_true', help='use CUDA')
    parser.add_argument('--loginterval', type=int, default=100, metavar='N', help='report interval')
    parser.add_argument('--save', type=str, default='', help='path to save the final model')
    return parser


def main():
    """Prepare (or load) the question-pair tensors, then train and evaluate.

    Without ``--presaved`` the train/validation CSVs are read from fixed paths
    under ``../data/``, cleaned, turned into hand-crafted features plus padded
    token indices, and cached to disk.  With ``--presaved`` those cached
    tensors are loaded instead.  The model is then trained for ``--epochs``
    epochs; whenever the validation log-loss improves on the best seen so far
    and ``--save`` is set, the corpus, model, state dict and config are saved.
    """
    args = _build_arg_parser().parse_args()

    if not args.presaved:
        pipe = None
        if args.pipeline:
            stemmer, lemmatizer = None, None
            # Stemming takes precedence; the lemmatizer is only used when
            # --stem is not given.
            if args.stem:
                stemmer = SnowballStemmer('english')
            elif args.lemma:
                lemmatizer = WordNetLemmatizer()
            pipe = functools.partial(pipeline,
                                     rm_stop_words=args.psw,
                                     rm_punc=args.ppunc,
                                     number_token=args.pntok,
                                     keep_questions=args.pkq,
                                     stemmer=stemmer,
                                     lemmatizer=lemmatizer)

        corpus = TacoText(args.vocabsize, lower=True, vocab_pipe=pipe)

        print('Loading Data')
        # The split is read from fixed files rather than derived from
        # args.data (presumably pre-shuffled and pre-split - TODO confirm).
        train_data = pd.read_csv('../data/train_data_shuffle.csv')
        val_data = pd.read_csv('../data/val_data_shuffle.csv')

        print('Cleaning and Tokenizing')
        q1, q2, y = clean_and_tokenize(train_data, corpus)
        q1_val, q2_val, y_val = clean_and_tokenize(val_data, corpus)

        train_feat = list(map(feature_gen, zip(q1, q2)))
        val_feat = list(map(feature_gen, zip(q1_val, q2_val)))
        # Fit the scaler on training features only, then reuse it for
        # validation.
        scalar = preprocessing.StandardScaler()
        train_feat = scalar.fit_transform(train_feat)
        val_feat = scalar.transform(val_feat)

        print('Piping Data')
        q1 = corpus.pipe_data(q1)
        q2 = corpus.pipe_data(q2)
        q1_val = corpus.pipe_data(q1_val)
        q2_val = corpus.pipe_data(q2_val)

        corpus.gen_vocab(q1 + q2 + q2_val + q1_val)

        n_feat = train_feat.shape[1]
        d_in = args.din
        # One tensor carries both questions and the feature row, so its last
        # dimension must fit the longer of the two.
        feat_max = int(np.max([n_feat, d_in]))

        # Layout along dim 2: 0 = question 1, 1 = question 2, 2 = features.
        X = torch.Tensor(len(train_data), 1, 3, feat_max)
        X[:, 0, 0, :] = torch.from_numpy(corpus.pad_numericalize(q1, feat_max)).long()
        X[:, 0, 1, :] = torch.from_numpy(corpus.pad_numericalize(q2, feat_max)).long()
        X[:, 0, 2, :n_feat] = torch.from_numpy(np.array(train_feat))
        y = torch.from_numpy(np.array(y)).long()

        X_val = torch.Tensor(len(val_data), 1, 3, feat_max)
        X_val[:, 0, 0, :] = torch.from_numpy(corpus.pad_numericalize(q1_val, feat_max)).long()
        X_val[:, 0, 1, :] = torch.from_numpy(corpus.pad_numericalize(q2_val, feat_max)).long()
        X_val[:, 0, 2, :n_feat] = torch.from_numpy(np.array(val_feat))
        y_val = torch.from_numpy(np.array(y_val)).long()

        torch.save(X, '../data/X_featd.t')
        torch.save(y, '../data/y_featd.t')
        torch.save(X_val, '../data/X_val_featd.t')
        torch.save(y_val, '../data/y_val_featd.t')
        with open('../data/corpus_featd.pkl', 'wb') as corp_f:
            pkl.dump(corpus, corp_f, protocol=pkl.HIGHEST_PROTOCOL)
    else:
        # The feature count is not stored with the tensors, so it is fixed
        # here (assumed to match the cached data - TODO confirm).
        n_feat = 22
        d_in = args.din
        print('Loading Presaved Data')
        X = torch.load(args.data + 'X_featd.t')
        y = torch.load(args.data + 'y_featd.t')
        X_val = torch.load(args.data + 'X_val_featd.t')
        y_val = torch.load(args.data + 'y_val_featd.t')
        # NOTE(review): the corpus is read from a fixed path, not args.data.
        with open('../data/corpus_featd.pkl', 'rb') as f:
            corpus = pkl.load(f)

    if args.cuda:
        X, y = X.cuda(), y.cuda()
        X_val, y_val = X_val.cuda(), y_val.cuda()

    print('Generating Data Loaders')
    train_dataset = TensorDataset(X, y)
    train_loader = DataLoader(train_dataset,
                              batch_size=args.batchsize,
                              shuffle=True)
    valid_loader = DataLoader(TensorDataset(X_val, y_val),
                              batch_size=args.batchsize,
                              shuffle=False)

    ntokens = len(corpus)
    glove_embeddings = None
    if args.embinit == 'glove':
        assert args.demb in (50, 100, 200, 300)
        glove_embeddings = get_glove_embeddings(args.glovedata,
                                                corpus.dictionary.word2idx,
                                                ntokens, args.demb)

    model = LSTMModelMLPFeatDist(args.din, args.dhid, args.nlayers, args.dout,
                                 args.demb, n_feat, args.vocabsize,
                                 args.dropout, args.embinit, args.hidinit,
                                 args.decinit, glove_embeddings, args.cuda)
    if args.cuda:
        model.cuda()

    criterion = nn.NLLLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    model_config = '\t'.join([str(x) for x in (
        torch.__version__, args.clip, args.nlayers, args.din, args.demb,
        args.dhid, args.embinit, args.decinit, args.hidinit, args.dropout,
        args.optimizer, args.lr, args.vocabsize, args.pipeline, args.psw,
        args.ppunc, args.pntok, args.pkq, args.stem, args.lemma)])
    print('Pytorch | Clip | #Layers | InSize | EmbDim | HiddenDim | EncoderInit | DecoderInit | WeightInit | Dropout | Optimizer| LR | VocabSize | pipeline | stop | punc | ntoken | keep_ques | stem | lemma')
    print(model_config)

    # A checkpoint is only written once validation log-loss beats this value.
    best_ll = 0.5
    for epoch in range(args.epochs):
        model.train()
        total_cost = 0
        start_time = time.time()
        cur_loss = 0
        for ind, (qs, duplicate) in enumerate(train_loader):
            model.zero_grad()
            # Only the first d_in token positions / n_feat feature columns of
            # the shared last dimension are meaningful for each row.
            pred = model(qs[:, 0, 0, :d_in].long(),
                         qs[:, 0, 1, :d_in].long(),
                         qs[:, 0, 2, :n_feat])
            if args.cuda:
                pred = pred.cuda()
                duplicate = duplicate.cuda()
            duplicate = Variable(duplicate)
            loss = criterion(pred, duplicate)
            loss.backward()
            clip_grad_norm(model.parameters(), args.clip)

            if optimizer:
                optimizer.step()
            else:
                # Plain SGD fallback; unreachable while Adam is always built.
                for p in model.parameters():
                    p.data.add_(-args.lr, p.grad.data)

            total_cost += loss.data[0]
            cur_loss += loss.data[0]

            if ind % args.loginterval == 0 and ind > 0:
                # Average the loss accumulated since the last report (this
                # used to be overwritten with the last batch's loss).
                cur_loss = cur_loss / args.loginterval
                elapsed = time.time() - start_time
                print('| Epoch {:3d} | {:5d}/{:5d} Batches | ms/batch {:5.2f} | '
                      'Loss {:.6f}'.format(
                          epoch, ind, len(X) // args.batchsize,
                          elapsed * 1000.0 / args.loginterval, cur_loss))
                start_time = time.time()
                cur_loss = 0

        model.eval()
        train_acc, train_ll = evaluate(model, train_loader, args.cuda, d_in, n_feat)
        val_acc, val_ll = evaluate(model, valid_loader, args.cuda, d_in, n_feat)

        if args.save and (val_ll < best_ll):
            with open(args.save + '_corpus.pkl', 'wb') as corp_f:
                pkl.dump(corpus, corp_f, protocol=pkl.HIGHEST_PROTOCOL)
            # The model is moved to the CPU before it is serialised.
            torch.save(model.cpu(), args.save)
            torch.save(model.cpu().state_dict(), args.save + ".state_dict")
            with open(args.save + ".state_dict.config", "w") as f:
                f.write(model_config)
            best_ll = val_ll
            # Saving moved the model to the CPU; move it back for training.
            if args.cuda:
                model.cuda()

        print('Epoch: {} | Train Loss: {:.4f} | Train Accuracy: {:.4f} | Val Accuracy: {:.4f} | Train LL: {:.4f} | Val LL: {:.4f}'.format(
            epoch, total_cost, train_acc, val_acc, train_ll, val_ll))
        print('-' * 89)
import string
from copy import copy

import networkx as nx
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from scipy import argmax
from scipy.spatial.distance import cosine

from galaxy.vector import vectorize

LEMMATIZER = WordNetLemmatizer()
# Punctuation characters and English stop words, merged into one lookup set.
STOPWORDS = set(string.punctuation).union(stopwords.words('english'))

# Level Definitions
HIGH = 0
MED = 1
LOW = 2

# Multiplier associated with each level.
WEIGHTS = {HIGH: 2.0, MED: 1.5, LOW: 1.0}


def sentencize(plain_text):
    """Split ``plain_text`` into sentences and wrap each as a RankedSentence.

    Every sentence is passed through ``tokenize`` first, and each resulting
    ``RankedSentence`` starts at the ``LOW`` level.
    """
    tokenized = list(map(tokenize, sent_tokenize(plain_text)))
    return [RankedSentence(sentence=tokens, level=LOW) for tokens in tokenized]
# 保留的词性 expected_tags = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ", \ "NN", "NNS", "NNP", "NNPS", \ "JJ", "JJR", "JJS"] # 加载停词表 fs = open('../../stopwords_en.txt') stopwords = fs.read() swlist = stopwords.splitlines() fs.close() print("step1:加载语料库及预处理") timestamp = time.time() corpus = [] #存放语料库,每个元素代表一篇文档 if not os.path.exists('../../corpus/segwords.txt'): lemmatizer = WordNetLemmatizer() with open('../../corpus/news.txt', 'r') as df: for line in df: if len(line.strip()) != 0: words = word_tokenize(line.strip()) tags = pos_tag(words) seglist = [] for i in range(len(words)): if tags[i][1] in expected_tags and words[ i] not in swlist and words[i].isalpha(): taghead = tags[i][1][0].lower() # {ADJ:a, ADJ_SAT:s, ADV:r, NOUN:n or VERB:v} 词形还原 seglist.append( lemmatizer.lemmatize( words[i], wordnet.ADJ if taghead == 'j' else taghead))