def main():
    words = []
    sentences = []
    with open('..//dataset//computer.txt', 'r') as f:
        for line in f:
            words += util.split_words(line)
            sentences += util.split_sentences(line)

    # Tf-idf evaluation
    tfidf = Tfidf(words, sample_doc_ids)
    tfidf.calc_tfidf()

    # Show tf-idf values
    sorted_tfidf = util.sort_dict_by_value(tfidf.tfidf)
    for i in range(len(sorted_tfidf)):
        print('Word: {0:25}; tfidf = {1}'.format(sorted_tfidf[i][0], sorted_tfidf[i][1]))

    # Work out summary
    summary = tfidf.best_sentences(sentences, 100)
    for sentence in summary:
        print(sentence.text)
        print("Score: {0}\n".format(sentence.score))

    print("-----------\nDONE")
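# A minimal, self-contained sketch of the same idea (hypothetical helper names, not
# the util/Tfidf API used above): score each sentence by the summed tf-idf of its
# words and keep the highest-scoring ones as a summary.
import math
from collections import Counter

def summarize(sentences, top_k=3):
    # Treat each sentence as a "document" when computing idf.
    tokenized = [s.lower().split() for s in sentences]
    n = len(tokenized)
    df = Counter(w for toks in tokenized for w in set(toks))
    idf = {w: math.log(n / df[w]) for w in df}

    def score(toks):
        tf = Counter(toks)
        return sum(tf[w] / len(toks) * idf[w] for w in tf)

    ranked = sorted(zip(sentences, tokenized), key=lambda p: score(p[1]), reverse=True)
    return [s for s, _ in ranked[:top_k]]

# Example usage:
# print(summarize(["GPUs accelerate training.", "The cat sat.", "The cat sat."], top_k=1))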
def preprocessing(self, is_tfidf=False, is_cosinesimilarty=False, is_Nmf=False):
    all = self.Alllist()
    # print("all uniq list")
    # print(all)
    lst = self.listtostring(all)
    uniq = self.unique_list(lst)

    concerns = self.load_set1()
    concerns = [self.tokenization(concerns[i]) for i in range(len(concerns))]
    concerns = [self.stop_words(concerns[i]) for i in range(len(concerns))]
    concerns = [self.regular_exp(concerns[i]) for i in range(len(concerns))]
    concerns = [self.port_stem(concerns[i]) for i in range(len(concerns))]
    concerns = [" ".join(concerns[i]) for i in range(len(concerns))]

    op1 = None
    if is_tfidf:
        op1 = Tfidf(concerns).tfidf()
    if is_cosinesimilarty:
        op1 = Cosine(op1).cosinesimilarty()
    if is_Nmf:
        op1 = NMF(op1).non_negative_matrices()
    return op1
class Feature:
    def __init__(self):
        self.tfidf = Tfidf()
        self.docs = []
        self.words = []
        self.title = Set()
        self.body = Set()
        self.topics = Set()
        self.places = Set()
        self.matrix = {}
        self.dfs = {}

    def add(self, doc):
        if not doc.topics:
            return
        self.docs.append(doc)
        self.tfidf.add(doc)
        self.title.update(doc.title)
        self.body.update(doc.body)
        self.topics.update(doc.topics)
        self.places.update(doc.places)

    def _compress(self):
        self.title = list(self.title)
        self.body = list(self.body)
        self.topics = sorted(self.topics)
        self.places = sorted(self.places)
        self.words = self.title + self.body
        self.title = []
        self.body = []

    def build(self):
        self._compress()
        print "Building tf-idf matrix"
        for d in self.docs:
            self.matrix[d.id] = {}
        for w in self.words:
            df = self.tfidf.get_df(w)
            if df > 2:
                self.dfs[w] = df
                for d in self.docs:
                    tfidf = self.tfidf.get_tfidf(w, d.id)
                    if tfidf:
                        self.matrix[d.id][w] = tfidf
        self.words = []
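# Tiny illustration of the df > 2 pruning step in build(): compute document frequency
# over a toy corpus and keep only terms that appear in more than two documents
# (the toy data is invented; the real counts come from the project's Tfidf object).
from collections import Counter

docs = [
    "acquisitions and mergers",
    "grain exports rise",
    "grain prices fall",
    "grain and oil exports",
]
df = Counter(w for d in docs for w in set(d.split()))
kept = {w: c for w, c in df.items() if c > 2}
print(kept)  # {'grain': 3}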
def reveal_doc(self, text_panel):
    """
    Called when the "Select file" button is clicked: gets the file path and
    displays the document content in the corresponding text panel.

    Args:
        text_panel:
            'dp': the text panel of the keyword display module
            'dp_r1': the first text panel of the document similarity module
            'dp_r2': the second text panel of the document similarity module
    """
    if text_panel == 'dp':
        t = self.doc_panel
        title = self.doc_title
    elif text_panel == 'dp_r1':
        t = self.doc_panel_r1
        title = self.doc_title_r1
    elif text_panel == 'dp_r2':
        t = self.doc_panel_r2
        title = self.doc_title_r2

    file_path = filedialog.askopenfilename()
    print('file_path', file_path)
    # Check whether a file was actually selected
    if not file_path:
        messagebox.showinfo(message='Please select a valid file.')
        return

    try:
        # Clear the text panel first
        t.delete('1.0', 'end')
        data = Data(file_path)
        title.set(os.path.split(file_path)[-1])
        print('file:', os.path.split(file_path)[-1])
        t.insert('end', data.raw_text)

        # Load the idf dictionary
        idf = load_idf(self.idf_dir)
        # Compute the tf-idf scores
        tf_idf = Tfidf(data.corpus, idf)
        # Cache the Data and Tfidf objects of the current document for later
        # keyword extraction or similarity computation
        self.status[text_panel] = {
            'data': data,
            'tfidf': tf_idf
        }

        if text_panel == 'dp':
            # Keyword extraction module: extract 20 keywords by default
            self.topk_var.set(20)
            self.refresh_key_words()
        elif text_panel == 'dp_r1' or text_panel == 'dp_r2':
            # if self.status.get('dp_r1', None) and self.status.get('dp_r2', None):
            #     self.show_similarity()
            # When the similarity module selects a new document, clear the similarity output panel
            self.sim_panel.delete(0, 'end')
    except Exception:
        traceback.print_exc()
def jaccard_score_tfidf(locu, four, p1, p2, field):
    # make a tfidf object so that we don't need to re-compute the list of all names every time
    tfidf = Tfidf(locu, four, field)
    name1 = p1[field]
    name2 = p2[field]
    if name1 == "":
        set1 = set()
    else:
        set1 = set(name1.lower().split())
    if name2 == "":
        set2 = set()
    else:
        set2 = set(name2.lower().split())
    i = list(set1.intersection(set2))
    u = list(set1.union(set2))
    # compute idf score (decided to ignore tf)
    iscore = sum([tfidf.get_score(word) for word in i])
    uscore = sum([tfidf.get_score(word) for word in u])
    return 0 if uscore == 0 else float(iscore) / uscore
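# A self-contained sketch of the idf-weighted Jaccard used above, with the
# Tfidf.get_score lookup stubbed out by a plain dict of idf weights (the real object
# computes these from the locu/four name lists; the values here are made up for illustration).
def weighted_jaccard(name1, name2, idf):
    set1 = set(name1.lower().split()) if name1 else set()
    set2 = set(name2.lower().split()) if name2 else set()
    iscore = sum(idf.get(w, 0.0) for w in set1 & set2)
    uscore = sum(idf.get(w, 0.0) for w in set1 | set2)
    return 0.0 if uscore == 0 else iscore / uscore

# Rare words ("shack") dominate the score; common ones ("the") barely matter.
idf = {'the': 0.1, 'pizza': 1.2, 'shack': 3.5, 'joes': 2.8}
print(weighted_jaccard("The Pizza Shack", "Joes Pizza Shack", idf))  # ~0.62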
def reveal_doc(self, text_panel):
    """
    Called when the "Select file" button is clicked: gets the file path and
    displays the document content in the corresponding text panel.

    Args:
        text_panel:
            'dp': the text panel of the keyword display module
            'dp_r1': the first text panel of the document similarity module
            'dp_r2': the second text panel of the document similarity module
    """
    self.root.wm_attributes('-topmost', 0)
    if text_panel == 'dp':
        t = self.doc_panel
        title = self.doc_title
    elif text_panel == 'dp_r1':
        t = self.doc_panel_r1
        title = self.doc_title_r1
    elif text_panel == 'dp_r2':
        t = self.doc_panel_r2
        title = self.doc_title_r2

    file_path = filedialog.askopenfilename()
    print('file_path', file_path)
    # Check whether a file was actually selected
    if not file_path:
        self.show_error('Please select a valid file.')
        return

    try:
        # Clear the text panel first
        t.delete('1.0', 'end')
        data = Data(file_path)
        title.set(os.path.split(file_path)[-1])
        print('file:', os.path.split(file_path)[-1])
        t.insert('end', data.corpus)

        # Compute the tf-idf scores
        tf_idf = Tfidf(data.corpus, len(data.corpus) // 20)
        # Cache the Data and Tfidf objects of the current document for later
        # keyword extraction or similarity computation
        self.status[text_panel] = {'data': data, 'tfidf': tf_idf}

        if text_panel == 'dp':
            # Keyword extraction module: reset both keyword panels
            self.kw_ours_var.set('Ours')
            self.kw_jieba_var.set('Jieba')
            self.kw_panel_jieba.delete(0, 'end')
            self.kw_panel_ours.delete(0, 'end')
        elif text_panel == 'dp_r1' or text_panel == 'dp_r2':
            # When the similarity module selects a new document, clear the similarity output panel
            self.sim_panel.delete(0, 'end')
        self.root.wm_attributes('-topmost', 1)
    except Exception:
        traceback.print_exc()
def parse_cbr(self):
    all_content = []
    with codecs.open(self.path, 'r', 'utf-8-sig') as lines:
        for lin in lines:
            create_time = utility.split_data_by_date(lin)
            lin = lin.strip().split()
            userid, newsid, scan_time, title, create_time_ = int(lin[0]), int(lin[1]), lin[2], lin[3], lin[-1]
            news = News(int(userid), int(newsid), title, scan_time, [], create_time_)
            self.AllNews.append(news)
            content = "".join(lin[4:-1])
            all_content.append(content)
    if self.isTfidf:
        tags = Tfidf(all_content).derive_keyword_zh(keyword_num=5)
        for index in xrange(len(tags)):
            self.AllNews[index].tags = tags[index]
            exit()
        else:
            print "language error!"
            exit()
        tfidfText.append(text1)
        vsText.append(text2)
    return tfidfText, vsText, np.array(polarities), categories


if __name__ == "__main__":
    if len(sys.argv) < 4:
        print "sys.argv[1]: input train corpus"
        print "sys.argv[2]: input test corpus"
        print "sys.argv[3]: corpus language, 'en' for English and 'zh-CN' for Chinese"
        exit()
    tfidfInstance = Tfidf()
    sentimentInstance = Sentiment()
    trCorpus1, trCorpus2, trPolarity, trCategory = readData(sys.argv[1], sys.argv[3])
    teCorpus1, teCorpus2, tePolarity, teCategory = readData(sys.argv[2], sys.argv[3])
    trainTfidf, testTfidf = tfidfInstance.tfidf(trCorpus1, teCorpus1)
    trainVS = sentimentInstance.VSPolarity(trCorpus2)
    testVS = sentimentInstance.VSPolarity(teCorpus2)
    trainMatrix = combineFeature(trainTfidf, trainVS)
    testMatrix = combineFeature(testTfidf, testVS)
    print sys.argv[2]
    trainMatrix, testMatrix = featureSelection(trainMatrix, trPolarity, testMatrix, tePolarity)
class QuerySearch(object):
    """docstring for QuerySearch"""

    def __init__(self, posting_list_path, data_path):
        super(QuerySearch, self).__init__()
        self.posting_list_path = posting_list_path
        self.data_path = data_path
        self.TfidfObj = Tfidf(self.posting_list_path, self.data_path)

    def find_best_doc(self, data_dict, num_docs_retrieval=2):
        '''
        Find the sum of the tf-idfs of all words in data_dict for each document.
        '''
        # Dictionary: {docid: sum of tf-idf of all words from the query}
        lst_tfidf = {}
        for key in data_dict:
            for docid in data_dict[key]:
                try:
                    lst_tfidf[docid] += self.TfidfObj.tfidf(key, docid)
                except KeyError:
                    lst_tfidf[docid] = self.TfidfObj.tfidf(key, docid)
        # Sort the documents by score in descending order
        # lst_tfidf = {k: v for k, v in sorted(lst_tfidf.items(), key=lambda item: -1 * item[1])}
        arr = sorted(lst_tfidf.items(), key=lambda item: -1 * item[1])
        return arr[:num_docs_retrieval]

    def search_query(self, query):
        '''
        Assuming query is similar to N:new* | new*
        '''
        temp = query.split(':')
        res = {}
        if temp[0] == 'P':
            res = {key: val for key, val in self.TfidfObj.data.items()
                   if key.startswith('person_' + temp[-1].split('*')[0])}
        elif temp[0] == 'O':
            res = {key: val for key, val in self.TfidfObj.data.items()
                   if key.startswith('org_' + temp[-1].split('*')[0])}
        elif temp[0] == 'L':
            res = {key: val for key, val in self.TfidfObj.data.items()
                   if key.startswith('loc_' + temp[-1].split('*')[0])}
        elif temp[0] == 'N':
            res = {key: val for key, val in self.TfidfObj.data.items()
                   if (key.startswith('person_' + temp[-1].split('*')[0])
                       or key.startswith('loc_' + temp[-1].split('*')[0])
                       or key.startswith('org_' + temp[-1].split('*')[0]))}
        else:
            res = {key: val for key, val in self.TfidfObj.data.items()
                   if key.startswith(temp[-1].split('*')[0])}
        # print('Refined posting list to relevant queries')
        return self.find_best_doc(res)

    def search_queries(self, queries, num_docs_retrieval=2):
        res = {}
        for query in queries:
            temp = query.split(':')
            if len(temp) > 1:
                res = dict({key: val for key, val in self.TfidfObj.data.items()
                            if (temp[0] == 'P' and key.startswith('person_' + temp[-1].split('*')[0]))
                            or (temp[0] == 'O' and key.startswith('org_' + temp[-1].split('*')[0]))
                            or (temp[0] == 'L' and key.startswith('loc_' + temp[-1].split('*')[0]))
                            or (temp[0] == 'N' and (key.startswith('person_' + temp[-1].split('*')[0])
                                                    or key.startswith('loc_' + temp[-1].split('*')[0])
                                                    or key.startswith('org_' + temp[-1].split('*')[0])))},
                           **res)
            else:
                res = dict({key: val for key, val in self.TfidfObj.data.items()
                            if key.startswith(temp[-1].split('*')[0])}, **res)
        return self.find_best_doc(res, num_docs_retrieval=num_docs_retrieval)
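# Illustration of what find_best_doc accumulates, using plain dicts instead of the
# file-backed Tfidf object above (the posting lists and scores are invented for the
# example): sum the tf-idf of every matched term per document, then take the top k.
def best_docs(posting, scores, k=2):
    totals = {}
    for term, docids in posting.items():
        for docid in docids:
            totals[docid] = totals.get(docid, 0.0) + scores.get((term, docid), 0.0)
    return sorted(totals.items(), key=lambda item: -item[1])[:k]

posting = {'person_newton': [1, 3], 'loc_newcastle': [3]}
scores = {('person_newton', 1): 0.4, ('person_newton', 3): 0.2, ('loc_newcastle', 3): 0.7}
print(best_docs(posting, scores))  # [(3, 0.9...), (1, 0.4)]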
def convert_to_tfidf(filename):
    with open(filename, "w+") as f:
        query_keyword = Tfidf("query_ids.txt", "keyword_ids.txt")
        query_title = Tfidf("query_ids.txt", "title_ids.txt")
        query_description = Tfidf("query_ids.txt", "desc_ids.txt")
        keyword_title = Tfidf("keyword_ids.txt", "title_ids.txt")
        keyword_description = Tfidf("keyword_ids.txt", "desc_ids.txt")
        title_description = Tfidf("title_ids.txt", "desc_ids.txt")
        data = csv_io.read_train("10percent_5lakh_preprocessed_training_data.txt")
        count = 0
        with open("2lakh_training_data.txt") as f1:
            for line in f1:
                count = count + 1
                sample = csv_io.split(line, [','])
                queryid = sample[7]
                keywordid = '' + sample[8]
                titleid = '' + sample[9]
                descriptionid = '' + sample[10]
                qk_sim = query_keyword.classify(queryid, keywordid)
                qt_sim = query_title.classify(queryid, titleid)
                qd_sim = query_description.classify(queryid, descriptionid)
                kt_sim = keyword_title.classify(keywordid, titleid)
                kd_sim = keyword_description.classify(keywordid, descriptionid)
                td_sim = title_description.classify(titleid, descriptionid)
                sample.append('%.2f' % qk_sim[0][0])
                sample.append('%.2f' % qt_sim[0][0])
                sample.append('%.2f' % qd_sim[0][0])
                sample.append('%.2f' % kt_sim[0][0])
                sample.append('%.2f' % kd_sim[0][0])
                sample.append('%.2f' % td_sim[0][0])
                f.write(",".join(sample))
                f.write("\n")
def dummy():
    id1 = '1'
    id2 = '3'
    tfidf = Tfidf("dummy.txt", "dummy2.txt")
    return tfidf.classify(id1, id2)
from brown import get_indexed
from tfidf import Tfidf

if __name__ == '__main__':
    documents_indexed, word2idx = get_indexed(10000)
    vocab_size = len(word2idx)
    print("Data loaded | Vocab size:", vocab_size, '| Document size:', len(documents_indexed))

    model = Tfidf()
    TD = model.fit(documents_indexed, vocab_size)

    idx2word = {idx: word for word, idx in word2idx.items()}
    model.find_closest(['london', 'king', 'italy', 'queen'], TD, word2idx, idx2word)
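# A rough sketch of what a find_closest-style lookup might do (the Tfidf.fit /
# find_closest signatures above belong to the project's own tfidf module; this toy
# version just compares rows of a small made-up term-document matrix by cosine similarity).
import numpy as np

def closest(word, TD, word2idx, idx2word, n=1):
    v = TD[word2idx[word]]
    sims = TD @ v / (np.linalg.norm(TD, axis=1) * np.linalg.norm(v) + 1e-12)
    sims[word2idx[word]] = -np.inf          # exclude the query word itself
    return [idx2word[i] for i in np.argsort(-sims)[:n]]

word2idx = {'king': 0, 'queen': 1, 'italy': 2}
idx2word = {i: w for w, i in word2idx.items()}
TD = np.array([[2.0, 0.0, 1.0],   # king
               [1.5, 0.0, 1.2],   # queen
               [0.0, 3.0, 0.1]])  # italy
print(closest('king', TD, word2idx, idx2word))  # ['queen']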
# print("Starting with bigrams...") # bigram_perceptron = Bigram(train_ratio=0.8) # print("Bigram accuracy", bigram_perceptron.accuracy) # PART C: Compare the data representations ratios = np.arange(0.05, 1.05, 0.05) unigram_accuracies = [] tfidf_accuracies = [] bigram_accuracies = [] for r in ratios: unigram_perceptron = Unigram(train_ratio=r) unigram_accuracy = unigram_perceptron.accuracy unigram_accuracies.append(unigram_accuracy) print(r, "unigram_perceptron", unigram_accuracy) tfidf_perceptron = Tfidf(train_ratio=r) tfidf_accuracy = tfidf_perceptron.accuracy tfidf_accuracies.append(tfidf_accuracy) print(r, "tfidf_perceptron", tfidf_accuracy) bigram_perceptron = Bigram(train_ratio=r) bigram_accuracy = bigram_perceptron.accuracy bigram_accuracies.append(bigram_accuracy) print(r, "bigram_perceptron", bigram_accuracy) pickle.dump(unigram_accuracies, open("unigram_accuracies.pkl", "wb")) pickle.dump(tfidf_accuracies, open("tfidf_accuracies.pkl", "wb")) pickle.dump(bigram_accuracies, open("bigram_accuracies.pkl", "wb")) # unigram_accuracies = pickle.load(open("unigram_accuracies.pkl", "rb")) # tfidf_accuracies = pickle.load(open("tfidf_accuracies.pkl", "rb")) # bigram_accuracies = pickle.load(open("bigram_accuracies.pkl", "rb"))