def _pos_tag(words_all):
    """POS-tag each tokenized sentence.

    :param words_all: list of sentences, each a list of word tokens.
    :return: list parallel to ``words_all`` where element i is
             ``nltk.pos_tag(words_all[i])`` — a list of (word, tag) tuples.
    """
    # Local import keeps the (heavy) nltk dependency out of module import time.
    from nltk import pos_tag as pt
    # The original preallocated a list of empty lists and assigned by index;
    # a comprehension expresses the same mapping directly.
    return [pt(words) for words in words_all]
def add_lemmatizer():
    # Rewrites a "word_id:topic_id" assignment file into a
    # "word_id:topic_id:lexeme_id" file by lemmatizing each word with WordNet.
    # NOTE(review): Python 2 code (`has_key`, `print` statement). Relies on
    # module-level globals not visible here: word_topic_file,
    # word_topic_lexeme_file, max_line_num, id_word_dict, word_id_dict,
    # word_lexeme_id_dict, pt (nltk.pos_tag), penn_to_wn — confirm at call site.
    in_fp = open(word_topic_file)
    out_fp = open(word_topic_lexeme_file, 'w')
    wnl = WordNetLemmatizer()
    ###
    line = ''
    line_num = 0
    # Processes exactly max_line_num lines; `while 1 and` is redundant but kept.
    while 1 and line_num < max_line_num:
        line = in_fp.readline()
        line = line.strip()
        # Each line is space-separated "word_id:topic_id" tokens.
        line_words = line.split(' ')
        line_write = ''
        for words in line_words:
            word_topic = words.split(':')
            word_id = word_topic[0]
            topic_id = word_topic[1]
            # Echo the original pair, then append the lexeme id.
            line_write += word_id
            line_write += ':'
            line_write += topic_id
            line_write += ':'
            ##
            if id_word_dict.has_key(word_id):
                word = id_word_dict[word_id]
                if word_lexeme_id_dict.has_key(word):
                    # Lemma id already cached for this surface form.
                    line_write += word_lexeme_id_dict[word]
                    line_write += ' '
                else:
                    # POS-tag the single word so the lemmatizer gets the
                    # right WordNet part of speech.
                    word_list = []
                    word_list.append(word)
                    pos = pt(word_list)
                    tag = pos[0][1]
                    lexeme = wnl.lemmatize(word, penn_to_wn(tag))
                    #print ': ', word, lexeme
                    if word_id_dict.has_key(lexeme):
                        # Lemma exists in the vocabulary: cache and emit its id.
                        lexeme_id = word_id_dict[lexeme]
                        word_lexeme_id_dict[word] = lexeme_id
                        line_write += lexeme_id
                        line_write += ' '
                    else:
                        # Lemma not in vocabulary: fall back to the word's own id.
                        word_lexeme_id_dict[word] = word_id
                        line_write += word_id
                        line_write += ' '
        ##
        line_write = line_write.strip()
        out_fp.write(line_write)
        # Newline after every line except the last.
        if line_num < max_line_num - 1:
            out_fp.write('\n')
        line_num += 1
        if line_num % 1000 == 0:
            print 'line: ', line_num
    ###
    in_fp.close()
    out_fp.close()
def add_lemmatizer():
    # NOTE(review): this is a byte-near duplicate of another add_lemmatizer
    # definition in this file (only whitespace differs); in Python the later
    # definition silently replaces the earlier one at import time — consider
    # deleting one copy.
    # Rewrites a "word_id:topic_id" file into "word_id:topic_id:lexeme_id"
    # by lemmatizing each word with WordNet. Python 2 code depending on
    # module-level globals (word_topic_file, id_word_dict, word_id_dict,
    # word_lexeme_id_dict, max_line_num, pt, penn_to_wn) — confirm upstream.
    in_fp = open(word_topic_file)
    out_fp = open(word_topic_lexeme_file, 'w')
    wnl = WordNetLemmatizer()
    ###
    line = ''
    line_num = 0
    while 1 and line_num < max_line_num:
        line = in_fp.readline()
        line = line.strip()
        # Space-separated "word_id:topic_id" tokens per line.
        line_words = line.split(' ')
        line_write = ''
        for words in line_words:
            word_topic = words.split(':')
            word_id = word_topic[0]
            topic_id = word_topic[1]
            line_write += word_id
            line_write += ':'
            line_write += topic_id
            line_write += ':'
            ##
            if id_word_dict.has_key(word_id):
                word = id_word_dict[word_id]
                if word_lexeme_id_dict.has_key(word):
                    # Cached lemma id for this word.
                    line_write += word_lexeme_id_dict[word]
                    line_write += ' '
                else:
                    # Tag the word so lemmatization uses the right POS.
                    word_list = []
                    word_list.append(word)
                    pos = pt(word_list)
                    tag = pos[0][1]
                    lexeme = wnl.lemmatize(word, penn_to_wn(tag))
                    #print ': ', word, lexeme
                    if word_id_dict.has_key(lexeme):
                        lexeme_id = word_id_dict[lexeme]
                        word_lexeme_id_dict[word] = lexeme_id
                        line_write += lexeme_id
                        line_write += ' '
                    else:
                        # Lemma missing from vocabulary: reuse the word id.
                        word_lexeme_id_dict[word] = word_id
                        line_write += word_id
                        line_write += ' '
        ##
        line_write = line_write.strip()
        out_fp.write(line_write)
        if line_num < max_line_num - 1:
            out_fp.write('\n')
        line_num += 1
        if line_num % 1000 == 0:
            print 'line: ', line_num
    ###
    in_fp.close()
    out_fp.close()
def word_freq(self):
    """Build a frequency distribution over standalone nouns.

    POS-tags ``self.clean_words``, keeps only tokens tagged ``'NN'``,
    discards any token listed in ``self.loose_words``, and stores the
    resulting ``FreqDist`` on ``self.fdist``.

    :return: the frequency distribution object (also kept as ``self.fdist``).
    """
    tagged = pt(self.clean_words)
    kept_nouns = []
    for token, tag in tagged:
        # Single filter pass: must be a singular noun and not a loose word.
        if tag == 'NN' and token not in self.loose_words:
            kept_nouns.append(token)
    self.fdist = FreqDist(kept_nouns)
    return self.fdist
# Load the module transcript, normalise glued-together tokens so the
# tokenizer can split them, then POS-tag every sentence and print the tags.
# (Dead commented-out experiment code for fsm_m1 removed.)
with open('data/FMFS_Module_1_Verified_Post_Verbatim_TScript.txt', mode='r') as f:
    # Skip the first three lines (details about the module, not transcript).
    for _ in range(3):
        f.readline()
    module_text = f.read()

# Put spaces around "/" between letters so e.g. "yes/no" splits into tokens.
module_text = re.sub(r'([A-Za-z])/([a-zA-Z])', r'\1 / \2', module_text)
# Split run-on sentences like "end.Next" into "end. Next".
# BUG FIX: the replacement was r'\1\. \2' — "\." is not a valid escape in a
# re.sub replacement template (raises "bad escape \." on Python 3.7+, and
# inserted a stray backslash on older versions). Escaping is only needed in
# the pattern, not the replacement.
module_text = re.sub(r'([a-z])\.([A-Z])', r'\1. \2', module_text)

sentences = tk.sent_tokenize(module_text)
tagged_data = [pt(tk.word_tokenize(sent)) for sent in sentences]

# Print one "token  tag" row per token.
for sent in tagged_data:
    for token in sent:
        print("%-25s %-4s" % (token[0], token[1]))
    # Tail of a function whose `def` line is outside this chunk: true when
    # `order` is an ASCII digit (48-57), uppercase (65-90), lowercase
    # (97-122), tab (9) or space (32).
    return (order >= 48 and order <= 57) or (order >= 65 and order <= 90) or (order >= 97 and order <= 122) or order == 9 or order == 32

def intersection(keywords, text):
    # Binary membership vector: resultVec[i] is 1 iff keywords[i] occurs
    # anywhere in `text`. The set makes each membership test O(1).
    resultVec = []
    textSet = set(text)
    for i in keywords:
        if i in textSet:
            resultVec.append(1)
        else:
            resultVec.append(0)
    return resultVec

if __name__ == '__main__':
    # Python 2 CLI: classify one review file as good/bad with a pre-trained
    # model. Depends on names defined elsewhere: loadModule, loadWords,
    # asciify, pt, movieReviewer, argv — confirm they are imported above.
    loadModule(argv[1])
    os.system("echo -n 'loading keywords...\t\t'")
    wordList = loadWords(argv[1])
    os.system("echo -n '[done]\n'")
    # Prompt for a path, read + ASCII-normalise the review, split to tokens.
    review = asciify(open(raw_input("Please enter the review location: "), 'r').read()).split()
    # Mode 'P' presumably switches to POS-tagged features — verify upstream.
    if argv[1] == 'P':
        review = pt(review)
    X = intersection(wordList, review)
    result = movieReviewer.predict(X)[0]
    # Positive prediction => positive review.
    if (result > 0):
        print "Good movie!"
        #print result
    else:
        print "BAD movie!"
        #print result
        # Tail of intersection() (its `def` line is outside this chunk):
        # append 1 when the keyword is present in the text set, else 0.
        if i in textSet:
            resultVec.append(1)
        else:
            resultVec.append(0)
    return resultVec

# Python 2 training-data loader: builds feature matrix X and labels y from
# up to 3500 positive and 3500 negative review files, printing a crude
# progress bar via echo. Depends on names defined elsewhere: asciify, pt,
# wordList, X, y, argv — confirm they are in scope above.
os.system("echo -n 'loading training files:\n[05%'")
files = os.listdir('./train/pos')[:3500]
# `total` counts both the pos and neg passes for the progress percentage.
total = float(len(files)) * 2
done = 0
progress = 0
for i in files:
    f = asciify(open('./train/pos/' + i, 'r').read()).split()
    # Mode 'P' presumably uses POS-tagged tokens as features — verify upstream.
    if argv[1] == 'P':
        f = pt(f)
    X.append(intersection(wordList, f))
    y.append(1)  # positive label
    done += 1
    # Advance the progress bar in 5% steps.
    if (done / total * 100 >= progress + 5):
        progress += 5
        os.system("echo -n '\b\b\b=%d%%'" % progress)

files = os.listdir('./train/neg')[:3500]
# NOTE(review): this loop never increments `done` or updates the progress
# bar, although `total` was sized for both passes — likely an oversight.
for i in files:
    f = asciify(open('./train/neg/' + i, 'r').read()).split()
    if argv[1] == 'P':
        f = pt(f)
    X.append(intersection(wordList, f))
    y.append(-1)  # negative label
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 12 15:32:07 2013

@author: ibews
"""

from nltk import pos_tag as pt

# Sample (German) token list to experiment with.
t = ['Dies', 'ist', 'eine', 'Beispiel', 'Game', '.']

# Lengths of every token that contains the letter "a".
a = [len(x) for x in t if "a" in x]

# (Translated from German:) What is meant by "types of length 4"?
# Is a list of the types of "words with length == 4" required?
# Unique, sorted POS tags of the 4-character tokens. The intermediate
# list() around set() was redundant — sorted() accepts any iterable and
# already returns a list.
b = sorted(set(x[1] for x in pt(t) if len(x[0]) == 4))