Example #1
def main():
    words = []
    sentences = []
    with open('../dataset/computer.txt', 'r') as f:
        for line in f:
            words += util.split_words(line)
            sentences += util.split_sentences(line)

    # Tf-idf evaluation (sample_doc_ids comes from earlier in the script,
    # not shown in this excerpt)
    tfidf = Tfidf(words, sample_doc_ids)
    tfidf.calc_tfidf()

    # Show tf-idf values
    sorted_tfidf = util.sort_dict_by_value(tfidf.tfidf)
    for word, score in sorted_tfidf:
        print('Word: {0:25}; tfidf = {1}'.format(word, score))

    # Work out summary
    summary = tfidf.best_sentences(sentences, 100)

    for sentence in summary:
        print(sentence.text)
        print("Score: {0}\n".format(sentence.score))

    print("-----------\nDONE")
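`util.sort_dict_by_value` is not shown in this excerpt; a minimal sketch of such a helper, assuming it returns (word, score) pairs in descending score order, as the loop above expects:

# Hypothetical stand-in for util.sort_dict_by_value as used in Example #1:
# turn a {word: score} dict into (word, score) pairs, highest score first.
def sort_dict_by_value(d):
    return sorted(d.items(), key=lambda item: item[1], reverse=True)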
Example #2
    def preprocessing(self, is_tfidf=False, is_cosinesimilarty=False, is_Nmf=False):
        all_items = self.Alllist()
        lst = self.listtostring(all_items)
        uniq = self.unique_list(lst)  # unique word list (not used below)

        # Clean each concern: tokenize, drop stop words, apply the
        # regular-expression filter, stem, then re-join into one string.
        concerns = self.load_set1()
        concerns = [self.tokenization(c) for c in concerns]
        concerns = [self.stop_words(c) for c in concerns]
        concerns = [self.regular_exp(c) for c in concerns]
        concerns = [self.port_stem(c) for c in concerns]
        concerns = [" ".join(c) for c in concerns]

        # Each enabled stage consumes the previous stage's output:
        # tf-idf -> cosine similarity -> NMF.
        op1 = None
        if is_tfidf:
            op1 = Tfidf(concerns).tfidf()
        if is_cosinesimilarty:
            op1 = Cosine(op1).cosinesimilarty()
        if is_Nmf:
            op1 = NMF(op1).non_negative_matrices()

        return op1
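`Tfidf`, `Cosine`, and `NMF` here are project-local wrappers. For orientation, a minimal sketch of the same tf-idf / cosine-similarity / NMF chain built on scikit-learn; this is an assumption about what the wrappers do, not their actual implementation:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import NMF

def preprocess_sketch(concerns, n_topics=5):
    # concerns: list of cleaned document strings, as produced above
    tfidf_matrix = TfidfVectorizer().fit_transform(concerns)  # docs x terms
    similarity = cosine_similarity(tfidf_matrix)              # docs x docs
    topics = NMF(n_components=n_topics, init='nndsvda').fit_transform(tfidf_matrix)
    return tfidf_matrix, similarity, topics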
Example #3
 def reveal_doc(self, text_panel):
     """
     Called when the "Select File" button is clicked: obtains the file
     path and displays the document's content in the corresponding text box.
     Args:
         text_panel: 'dp': the text box of the keyword-display module
                     'dp_r1': the first text box of the document-similarity module
                     'dp_r2': the second text box of the document-similarity module
     """
     if text_panel == 'dp':
         t = self.doc_panel
         title = self.doc_title
     elif text_panel == 'dp_r1':
         t = self.doc_panel_r1
         title = self.doc_title_r1
     elif text_panel == 'dp_r2':
         t = self.doc_panel_r2
         title = self.doc_title_r2
     file_path = filedialog.askopenfilename()
     print('file_path', file_path)
     # Check whether a file was actually selected
     if not file_path:
         messagebox.showinfo(message='Please select a valid file.')
         return
     try:
         # Clear the text box first
         t.delete('1.0', 'end')
         data = Data(file_path)
         title.set(os.path.split(file_path)[-1])
         print('file:', os.path.split(file_path)[-1])
         t.insert('end', data.raw_text)
         # Load the idf dictionary
         idf = load_idf(self.idf_dir)
         # Compute the tf-idf scores
         tf_idf = Tfidf(data.corpus, idf)
         # Cache this document's Data and Tfidf objects for later keyword
         # extraction or similarity computation
         self.status[text_panel] = {
             'data': data,
             'tfidf': tf_idf
         }
         if text_panel == 'dp':
             # Keyword-extraction module: 20 keywords are extracted by default
             self.topk_var.set(20)
             self.refresh_key_words()
         elif text_panel == 'dp_r1' or text_panel == 'dp_r2':
             # if self.status.get('dp_r1', None) and self.status.get('dp_r2', None):
             #     self.show_similarity()
             # When a new document is selected for the similarity module,
             # clear the similarity output box
             self.sim_panel.delete(0, 'end')
     except Exception:
         traceback.print_exc()
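`load_idf` and the two-argument `Tfidf(corpus, idf)` constructor are specific to this project. A minimal sketch of how such a scorer could combine term frequencies with a precomputed idf dictionary; the function name and the fallback idf value are assumptions:

from collections import Counter

def tfidf_scores(corpus, idf, default_idf=10.0):
    # corpus: list of tokens for one document; idf: {term: idf weight}.
    # Terms missing from the idf dictionary get a high default weight.
    tf = Counter(corpus)
    total = len(corpus)
    return {term: (count / total) * idf.get(term, default_idf)
            for term, count in tf.items()}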
Example #4
 def reveal_doc(self, text_panel):
     """
     Called when the "Select File" button is clicked: obtains the file
     path and displays the document's content in the corresponding text box.
     Args:
         text_panel: 'dp': the text box of the keyword-display module
                     'dp_r1': the first text box of the document-similarity module
                     'dp_r2': the second text box of the document-similarity module
     """
     self.root.wm_attributes('-topmost', 0)
     if text_panel == 'dp':
         t = self.doc_panel
         title = self.doc_title
     elif text_panel == 'dp_r1':
         t = self.doc_panel_r1
         title = self.doc_title_r1
     elif text_panel == 'dp_r2':
         t = self.doc_panel_r2
         title = self.doc_title_r2
     file_path = filedialog.askopenfilename()
     print('file_path', file_path)
     # Check whether a file was actually selected
     if not file_path:
         self.show_error('Please select a valid file.')
         return
     try:
         # Clear the text box first
         t.delete('1.0', 'end')
         data = Data(file_path)
         title.set(os.path.split(file_path)[-1])
         print('file:', os.path.split(file_path)[-1])
         t.insert('end', data.corpus)
         # Compute the tf-idf scores
         tf_idf = Tfidf(data.corpus, len(data.corpus) // 20)
         # Cache this document's Data and Tfidf objects for later keyword
         # extraction or similarity computation
         self.status[text_panel] = {'data': data, 'tfidf': tf_idf}
         if text_panel == 'dp':
             # Keyword-extraction module: 20 keywords are extracted by default
             self.kw_ours_var.set('Ours')
             self.kw_jieba_var.set('Jieba')
             self.kw_panel_jieba.delete(0, 'end')
             self.kw_panel_ours.delete(0, 'end')
         elif text_panel == 'dp_r1' or text_panel == 'dp_r2':
             # When a new document is selected for the similarity module,
             # clear the similarity output box
             self.sim_panel.delete(0, 'end')
         self.root.wm_attributes('-topmost', 1)
     except Exception:
         traceback.print_exc()
Example #5
 def parse_cbr(self):
     all_content = []
     with codecs.open(self.path, 'r', 'utf-8-sig') as lines:
         for lin in lines:
             create_time = utility.split_data_by_date(lin)  # parsed but not used below
             lin = lin.strip().split()
             userid, newsid, scan_time, title, create_time_ = (
                 int(lin[0]), int(lin[1]), lin[2], lin[3], lin[-1])
             news = News(userid, newsid, title, scan_time, [], create_time_)
             self.AllNews.append(news)
             content = "".join(lin[4:-1])
             all_content.append(content)
     if self.isTfidf:
         # Tag each news item with its top five extracted keywords.
         tags = Tfidf(all_content).derive_keyword_zh(keyword_num=5)
         for index, tag in enumerate(tags):
             self.AllNews[index].tags = tag
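`derive_keyword_zh` is project-specific. For Chinese keyword extraction, one common minimal approach is jieba's built-in TF-IDF ranking; a sketch under that assumption, not this project's implementation:

import jieba.analyse

def derive_keywords_zh_sketch(all_content, keyword_num=5):
    # Top keyword_num keywords per document via jieba's tf-idf ranking.
    return [jieba.analyse.extract_tags(text, topK=keyword_num)
            for text in all_content]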
Example #6
	def __init__(self, posting_list_path, data_path):
		super(QuerySearch, self).__init__()
		self.posting_list_path = posting_list_path
		self.data_path = data_path
		self.TfidfObj = Tfidf(self.posting_list_path, self.data_path)
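`Tfidf` is built here from a posting-list file plus the raw data, which suggests an inverted index queried at search time. A minimal sketch of scoring a query against such an index; the posting-list layout ({term: {doc_id: weight}}) is an assumption:

def search(query_terms, posting_list, top_k=10):
    # Accumulate each document's tf-idf weight over the query terms.
    scores = {}
    for term in query_terms:
        for doc_id, weight in posting_list.get(term, {}).items():
            scores[doc_id] = scores.get(doc_id, 0.0) + weight
    return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:top_k]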
Example #7
            else:
                print("language error!")
                exit()
            tfidfText.append(text1)
            vsText.append(text2)
    return tfidfText, vsText, np.array(polarities), categories


if __name__ == "__main__":
    if len(sys.argv) < 4:
        print("sys.argv[1]: input train corpus")
        print("sys.argv[2]: input test corpus")
        print("sys.argv[3]: corpus language, 'en' for English and 'zh-CN' for Chinese")
        exit()

    tfidfInstance = Tfidf()
    sentimentInstance = Sentiment()

    trCorpus1, trCorpus2, trPolarity, trCategory = readData(
        sys.argv[1], sys.argv[3])
    teCorpus1, teCorpus2, tePolarity, teCategory = readData(
        sys.argv[2], sys.argv[3])

    trainTfidf, testTfidf = tfidfInstance.tfidf(trCorpus1, teCorpus1)
    trainVS = sentimentInstance.VSPolarity(trCorpus2)
    testVS = sentimentInstance.VSPolarity(teCorpus2)
    trainMatrix = combineFeature(trainTfidf, trainVS)
    testMatrix = combineFeature(testTfidf, testVS)

    print(sys.argv[2])
    trainMatrix, testMatrix = featureSelection(trainMatrix, trPolarity,
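The snippet above is cut off mid-call, and `combineFeature` is not shown. A plausible sketch, assuming it appends the sentiment features as extra columns beside the sparse tf-idf matrix:

import numpy as np
from scipy.sparse import csr_matrix, hstack

def combineFeature(tfidf_matrix, vs_features):
    # One row of sentiment features per document, stacked as new columns.
    extra = csr_matrix(np.asarray(vs_features).reshape(len(vs_features), -1))
    return hstack([tfidf_matrix, extra])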
Example #8
from brown import get_indexed
from tfidf import Tfidf

if __name__ == '__main__':
    documents_indexed, word2idx = get_indexed(10000)
    vocab_size = len(word2idx)
    print("Data loaded | Vocab size:", vocab_size, '| Document size:',
          len(documents_indexed))

    model = Tfidf()
    TD = model.fit(documents_indexed, vocab_size)

    idx2word = {idx: word for word, idx in word2idx.items()}
    model.find_closest(['london', 'king', 'italy', 'queen'], TD, word2idx,
                       idx2word)
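`find_closest` suggests a nearest-neighbour lookup over term vectors. A minimal sketch, assuming `TD` is a dense term-document matrix whose rows are compared by cosine similarity; the project's actual method may differ:

import numpy as np

def find_closest(words, TD, word2idx, idx2word, top_k=5):
    # Normalise each term's row so dot products become cosine similarities.
    unit = TD / (np.linalg.norm(TD, axis=1, keepdims=True) + 1e-12)
    for word in words:
        sims = unit @ unit[word2idx[word]]
        best = np.argsort(-sims)[1:top_k + 1]  # index 0 is the word itself
        print(word, '->', [idx2word[i] for i in best])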
Example #9
    # print("Starting with bigrams...")
    # bigram_perceptron = Bigram(train_ratio=0.8)
    # print("Bigram accuracy", bigram_perceptron.accuracy)

    # PART C: Compare the data representations
    ratios = np.arange(0.05, 1.05, 0.05)
    unigram_accuracies = []
    tfidf_accuracies = []
    bigram_accuracies = []
    for r in ratios:
        unigram_perceptron = Unigram(train_ratio=r)
        unigram_accuracy = unigram_perceptron.accuracy
        unigram_accuracies.append(unigram_accuracy)
        print(r, "unigram_perceptron", unigram_accuracy)

        tfidf_perceptron = Tfidf(train_ratio=r)
        tfidf_accuracy = tfidf_perceptron.accuracy
        tfidf_accuracies.append(tfidf_accuracy)
        print(r, "tfidf_perceptron", tfidf_accuracy)

        bigram_perceptron = Bigram(train_ratio=r)
        bigram_accuracy = bigram_perceptron.accuracy
        bigram_accuracies.append(bigram_accuracy)
        print(r, "bigram_perceptron", bigram_accuracy)

    pickle.dump(unigram_accuracies, open("unigram_accuracies.pkl", "wb"))
    pickle.dump(tfidf_accuracies, open("tfidf_accuracies.pkl", "wb"))
    pickle.dump(bigram_accuracies, open("bigram_accuracies.pkl", "wb"))
    # unigram_accuracies = pickle.load(open("unigram_accuracies.pkl", "rb"))
    # tfidf_accuracies = pickle.load(open("tfidf_accuracies.pkl", "rb"))
    # bigram_accuracies = pickle.load(open("bigram_accuracies.pkl", "rb"))
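The pickled accuracy lists are presumably consumed by a plotting step. A minimal matplotlib sketch that reloads them and compares the three representations over the training ratios (file names taken from the code above):

import pickle
import numpy as np
import matplotlib.pyplot as plt

ratios = np.arange(0.05, 1.05, 0.05)
for name in ('unigram', 'tfidf', 'bigram'):
    with open('{}_accuracies.pkl'.format(name), 'rb') as f:
        plt.plot(ratios, pickle.load(f), label=name)
plt.xlabel('train ratio')
plt.ylabel('accuracy')
plt.legend()
plt.show()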