def main():
    """Run the tf-idf demo on the sample corpus and print the results.

    Reads ``..//dataset//computer.txt`` line by line, collecting the words
    and sentences that ``util`` extracts from each line. It then computes
    tf-idf scores, prints every word with its score in the order returned
    by ``util.sort_dict_by_value``, and finally prints the sentences picked
    by ``best_sentences`` together with their scores.

    NOTE(review): ``sample_doc_ids`` is not defined inside this function;
    presumably it is a module-level name - confirm.
    """
    words = []
    sentences = []
    with open('..//dataset//computer.txt', 'r') as corpus_file:
        for raw_line in corpus_file:
            words.extend(util.split_words(raw_line))
            sentences.extend(util.split_sentences(raw_line))

    # Tf-idf evaluation
    tfidf = Tfidf(words, sample_doc_ids)
    tfidf.calc_tfidf()

    # Show tf-idf values, one (word, score) entry per line
    ranked_entries = util.sort_dict_by_value(tfidf.tfidf)
    for entry in ranked_entries:
        print('Word: {0:25}; tfidf = {1}'.format(entry[0], entry[1]))

    # Work out summary (second argument to best_sentences is 100)
    for picked in tfidf.best_sentences(sentences, 100):
        print(picked.text)
        print("Score: {0}\n".format(picked.score))

    print("-----------\nDONE")
def preprocessing(self, is_tfidf=False, is_cosinesimilarty=False, is_Nmf=False):
    """Clean the documents from ``load_set1`` and optionally vectorise them.

    Every document goes through tokenization, stop-word handling, the
    regular-expression step and stemming (in that order, one step applied
    to all documents before the next begins), and its tokens are then
    joined with single spaces.

    Args:
        is_tfidf: when true, the cleaned documents are passed to ``Tfidf``.
        is_cosinesimilarty: when true, the current result is passed to
            ``Cosine``.
        is_Nmf: when true, the current result is passed to ``NMF``.

    Returns:
        The output of the last enabled stage, or ``None`` when no stage is
        enabled. The stages chain: each one receives the previous result,
        which is ``None`` if no earlier stage ran.
    """
    # The results of these three calls are not used below; the calls are
    # kept because the helpers may have side effects (cannot tell from here).
    everything = self.Alllist()
    joined = self.listtostring(everything)
    self.unique_list(joined)

    docs = self.load_set1()
    cleaning_steps = (
        self.tokenization,
        self.stop_words,
        self.regular_exp,
        self.port_stem,
    )
    for step in cleaning_steps:
        docs = [step(doc) for doc in docs]
    docs = [" ".join(tokens) for tokens in docs]

    result = None
    if is_tfidf:
        result = Tfidf(docs).tfidf()
    if is_cosinesimilarty:
        result = Cosine(result).cosinesimilarty()
    if is_Nmf:
        result = NMF(result).non_negative_matrices()
    return result
def reveal_doc(self, text_panel):
    """Load a user-chosen file into one of the document text boxes.

    Called when a "choose file" button is clicked. Asks for a file path,
    shows the document's raw text in the matching text box and its file
    name in the matching title variable, then stores the document's
    ``Data`` and ``Tfidf`` objects in ``self.status[text_panel]`` for later
    keyword extraction or similarity computation.

    Args:
        text_panel:
            'dp': the text box of the keyword display module
            'dp_r1': the first text box of the document similarity module
            'dp_r2': the second text box of the document similarity module
    """
    if text_panel == 'dp':
        panel, title_var = self.doc_panel, self.doc_title
    elif text_panel == 'dp_r1':
        panel, title_var = self.doc_panel_r1, self.doc_title_r1
    elif text_panel == 'dp_r2':
        panel, title_var = self.doc_panel_r2, self.doc_title_r2

    chosen_path = filedialog.askopenfilename()
    print('file_path', chosen_path)

    # Nothing was selected: tell the user and stop.
    if not chosen_path:
        messagebox.showinfo(message='请选择正确的文件。')
        return

    try:
        # Clear the text box before showing the new document.
        panel.delete('1.0', 'end')
        doc = Data(chosen_path)
        file_name = os.path.split(chosen_path)[-1]
        title_var.set(file_name)
        print('file:', file_name)
        panel.insert('end', doc.raw_text)

        # Load the idf dictionary, then build the tf-idf object for this document.
        idf_table = load_idf(self.idf_dir)
        scorer = Tfidf(doc.corpus, idf_table)

        # Keep the document's data and tf-idf objects for later keyword
        # extraction or similarity computation.
        self.status[text_panel] = {'data': doc, 'tfidf': scorer}

        if text_panel == 'dp':
            # Keyword module: default to 20 keywords and refresh the list.
            self.topk_var.set(20)
            self.refresh_key_words()
        elif text_panel in ('dp_r1', 'dp_r2'):
            # A new document was chosen in the similarity module: clear the
            # similarity output (it is not recomputed here).
            self.sim_panel.delete(0, 'end')
    except Exception:
        traceback.print_exc()
def reveal_doc(self, text_panel):
    """Load a user-chosen file into one of the document text boxes.

    Called when a "choose file" button is clicked. Asks for a file path,
    shows the document's corpus in the matching text box and its file name
    in the matching title variable, then stores the document's ``Data`` and
    ``Tfidf`` objects in ``self.status[text_panel]`` for later keyword
    extraction or similarity computation.

    The root window's ``-topmost`` attribute is set to 0 for the duration
    of the call and is always set back to 1 on the way out, including when
    no file is chosen or loading fails.

    Args:
        text_panel:
            'dp': the text box of the keyword display module
            'dp_r1': the first text box of the document similarity module
            'dp_r2': the second text box of the document similarity module
    """
    # Presumably done so the dialogs are not hidden behind the main window.
    self.root.wm_attributes('-topmost', 0)
    try:
        if text_panel == 'dp':
            t = self.doc_panel
            title = self.doc_title
        elif text_panel == 'dp_r1':
            t = self.doc_panel_r1
            title = self.doc_title_r1
        elif text_panel == 'dp_r2':
            t = self.doc_panel_r2
            title = self.doc_title_r2

        file_path = filedialog.askopenfilename()
        print('file_path', file_path)

        # Nothing was selected: tell the user and stop.
        if not file_path:
            self.show_error('请选择正确的文件。')
            return

        try:
            # Clear the text box before showing the new document.
            t.delete('1.0', 'end')
            data = Data(file_path)
            file_name = os.path.split(file_path)[-1]
            title.set(file_name)
            print('file:', file_name)
            t.insert('end', data.corpus)

            # Build the tf-idf object; the second argument is the corpus
            # length divided by 20 (its meaning is defined by Tfidf).
            tf_idf = Tfidf(data.corpus, len(data.corpus) // 20)

            # Keep the document's data and tf-idf objects for later keyword
            # extraction or similarity computation.
            self.status[text_panel] = {'data': data, 'tfidf': tf_idf}

            if text_panel == 'dp':
                # Keyword module: reset both column headers and clear both
                # keyword lists.
                self.kw_ours_var.set('Ours')
                self.kw_jieba_var.set('Jieba')
                self.kw_panel_jieba.delete(0, 'end')
                self.kw_panel_ours.delete(0, 'end')
            elif text_panel in ('dp_r1', 'dp_r2'):
                # A new document was chosen in the similarity module: clear
                # the similarity output.
                self.sim_panel.delete(0, 'end')
        except Exception:
            traceback.print_exc()
    finally:
        # Restore the flag on every exit path, not only on success.
        self.root.wm_attributes('-topmost', 1)
def parse_cbr(self):
    """Parse the file at ``self.path`` into ``News`` objects in ``self.AllNews``.

    Each line is split on whitespace. Fields 0 and 1 are converted to int
    (user id, news id), field 2 is the scan time, field 3 the title and the
    last field the creation time. The fields between the title and the last
    one are concatenated into the news content.

    When ``self.isTfidf`` is true, keywords (5 per document) are derived
    from the collected contents and stored as ``tags`` on the news item at
    the same position in ``self.AllNews``.

    NOTE(review): tags are matched to ``self.AllNews`` by position, which
    assumes ``self.AllNews`` was empty before this call - confirm.
    """
    all_content = []
    with codecs.open(self.path, 'r', 'utf-8-sig') as lines:
        for raw_line in lines:
            # The return value is not used here; the call is kept because
            # its side effects, if any, cannot be seen from this method.
            utility.split_data_by_date(raw_line)

            fields = raw_line.strip().split()
            userid = int(fields[0])
            newsid = int(fields[1])
            scan_time, title, create_time = fields[2], fields[3], fields[-1]

            self.AllNews.append(
                News(userid, newsid, title, scan_time, [], create_time))
            all_content.append("".join(fields[4:-1]))

    if self.isTfidf:
        tags = Tfidf(all_content).derive_keyword_zh(keyword_num=5)
        # enumerate works on both Python 2 and 3, unlike xrange.
        for index, doc_tags in enumerate(tags):
            self.AllNews[index].tags = doc_tags
def __init__(self, posting_list_path, data_path):
    """Remember both paths and build the ``Tfidf`` helper from them.

    Args:
        posting_list_path: stored on the instance and passed to ``Tfidf``
            as its first argument.
        data_path: stored on the instance and passed to ``Tfidf`` as its
            second argument.
    """
    super(QuerySearch, self).__init__()
    self.posting_list_path, self.data_path = posting_list_path, data_path
    self.TfidfObj = Tfidf(posting_list_path, data_path)
else: print "language error!" exit() tfidfText.append(text1) vsText.append(text2) return tfidfText, vsText, np.array(polarities), categories if __name__ == "__main__": if len(sys.argv) < 4: print "sys.argv[1]: input train corpus" print "sys.argv[2]: input test corpus" print "sys.argv[3]: corpus language, 'en' for English and 'zh-CN' for Chinese" exit() tfidfInstance = Tfidf() sentimentInstance = Sentiment() trCorpus1, trCorpus2, trPolarity, trCategory = readData( sys.argv[1], sys.argv[3]) teCorpus1, teCorpus2, tePolarity, teCategory = readData( sys.argv[2], sys.argv[3]) trainTfidf, testTfidf = tfidfInstance.tfidf(trCorpus1, teCorpus1) trainVS = sentimentInstance.VSPolarity(trCorpus2) testVS = sentimentInstance.VSPolarity(teCorpus2) trainMatrix = combineFeature(trainTfidf, trainVS) testMatrix = combineFeature(testTfidf, testVS) print sys.argv[2] trainMatrix, testMatrix = featureSelection(trainMatrix, trPolarity,
from brown import get_indexed
from tfidf import Tfidf

if __name__ == '__main__':
    # Load the indexed documents and the word -> index mapping
    # (get_indexed is called with 10000).
    documents_indexed, word2idx = get_indexed(10000)
    vocab_size = len(word2idx)
    document_count = len(documents_indexed)
    print("Data loaded | Vocab size:", vocab_size,
          '| Document size:', document_count)

    # Fit the tf-idf model on the indexed documents.
    model = Tfidf()
    TD = model.fit(documents_indexed, vocab_size)

    # Inverse mapping (index -> word), passed to find_closest with the rest.
    idx2word = {index: token for token, index in word2idx.items()}
    query_words = ['london', 'king', 'italy', 'queen']
    model.find_closest(query_words, TD, word2idx, idx2word)
# print("Starting with bigrams...")
# bigram_perceptron = Bigram(train_ratio=0.8)
# print("Bigram accuracy", bigram_perceptron.accuracy)

# PART C: Compare the data representations
# For every training ratio, build one perceptron per representation
# (unigram, tf-idf, bigram), record its accuracy and print it.
ratios = np.arange(0.05, 1.05, 0.05)
unigram_accuracies = []
tfidf_accuracies = []
bigram_accuracies = []
for r in ratios:
    unigram_perceptron = Unigram(train_ratio=r)
    unigram_accuracy = unigram_perceptron.accuracy
    unigram_accuracies.append(unigram_accuracy)
    print(r, "unigram_perceptron", unigram_accuracy)

    tfidf_perceptron = Tfidf(train_ratio=r)
    tfidf_accuracy = tfidf_perceptron.accuracy
    tfidf_accuracies.append(tfidf_accuracy)
    print(r, "tfidf_perceptron", tfidf_accuracy)

    bigram_perceptron = Bigram(train_ratio=r)
    bigram_accuracy = bigram_perceptron.accuracy
    bigram_accuracies.append(bigram_accuracy)
    print(r, "bigram_perceptron", bigram_accuracy)

# Save the three accuracy lists. Each file is opened in a `with` block so
# the handle is flushed and closed as soon as the dump finishes.
with open("unigram_accuracies.pkl", "wb") as out_file:
    pickle.dump(unigram_accuracies, out_file)
with open("tfidf_accuracies.pkl", "wb") as out_file:
    pickle.dump(tfidf_accuracies, out_file)
with open("bigram_accuracies.pkl", "wb") as out_file:
    pickle.dump(bigram_accuracies, out_file)

# To reuse saved results instead of recomputing them:
# unigram_accuracies = pickle.load(open("unigram_accuracies.pkl", "rb"))
# tfidf_accuracies = pickle.load(open("tfidf_accuracies.pkl", "rb"))
# bigram_accuracies = pickle.load(open("bigram_accuracies.pkl", "rb"))