# coding: utf-8
import logging

from gensim import corpora, models

import myConfig

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

dictionary = corpora.Dictionary.load(myConfig.dict_file)
corpus = corpora.MmCorpus(myConfig.corpora_file)

# Optionally re-weight the corpus with TF-IDF before training LSI
if myConfig.useTFIDF:
    tfidf = models.TfidfModel(corpus)
    corpus = tfidf[corpus]

lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=myConfig.num_topics)
corpus_lsi = lsi[corpus]

lsi.save(myConfig.topic_model_file)
lsi.print_topics(myConfig.num_topics)
lsi.print_debug(myConfig.num_topics, num_words=20)
import numpy as np
from gensim import corpora, models
from gensim.models.doc2vec import Doc2Vec, TaggedDocument


def gensim_feature(corpus=None):
    # Example `corpus` argument (a list of tokenized documents):
    # corpus = [["我", "来到", "成都", "春熙路"],
    #           ["今天", "在", "宽窄巷子", "耍", "了", "一天"],
    #           ["成都", "整体", "来说", "还是", "挺", "安逸", "的"],
    #           ["成都", "的", "美食", "真", "巴适", "惨", "了"]]
    dictionary = corpora.Dictionary(corpus)  # build the corpus dictionary
    # # Collect the ids of stop words and of words that appear only once
    # stop_ids = [dictionary.token2id[stopword] for stopword in user_stop_word_list
    #             if stopword in dictionary.token2id]
    # once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
    # dictionary.filter_tokens(stop_ids + once_ids)  # remove stop words and words appearing only once
    # dictionary.compactify()  # close the id gaps left by the removed tokens
    # dictionary.save('mycorpus.dict')  # save the dictionary for later reuse

    # Document-frequency statistics
    dfs = dictionary.dfs  # token id -> document frequency
    for key_id, c in dfs.items():
        print(dictionary[key_id], c)

    # Convert each document into a bag-of-words vector
    doc_bow_corpus = [dictionary.doc2bow(doc_cut) for doc_cut in corpus]

    # TF-IDF features
    tfidf_model = models.TfidfModel(dictionary=dictionary)  # build the TF-IDF model
    tfidf_corpus = [tfidf_model[doc_bow] for doc_bow in doc_bow_corpus]  # map each doc_bow to its tfidf vector

    # LSI features (Latent Semantic Indexing)
    lsi_model = models.LsiModel(corpus=tfidf_corpus, id2word=dictionary, num_topics=100)
    lsi_corpus = [lsi_model[tfidf_doc] for tfidf_doc in tfidf_corpus]  # map to LSI vectors

    # LDA features (topic model)
    lda_model = models.LdaModel(corpus=tfidf_corpus, id2word=dictionary, num_topics=100)
    lda_corpus = [lda_model[tfidf_doc] for tfidf_doc in tfidf_corpus]  # map to LDA vectors

    # Random Projections (RP): reduces dimensionality, easy on CPU and memory
    rp_model = models.RpModel(tfidf_corpus, num_topics=500)
    rp_corpus = [rp_model[tfidf_doc] for tfidf_doc in tfidf_corpus]

    # Hierarchical Dirichlet Process (HDP), a non-parametric Bayesian method
    hdp_model = models.HdpModel(doc_bow_corpus, id2word=dictionary)
    hdp_corpus = [hdp_model[doc_bow] for doc_bow in doc_bow_corpus]

    # Document and word vectors (Doc2Vec and Word2Vec)
    tld_list = []
    for ind, line_list in enumerate(corpus):
        tld_list.append(TaggedDocument(line_list, tags=[str(ind)]))
    d2v_model = Doc2Vec(tld_list, min_count=5, window=3, size=100,
                        sample=1e-3, negative=5, iter=15)
    # Training Doc2Vec also trains Word2Vec, so both models can be saved:
    # model.save(save_model_d2v_file_path)
    # model.save_word2vec_format(save_model_w2v_file_path, binary=True)

    # Convert the documents into a vector matrix
    docvecs = d2v_model.docvecs
    docvecs_matrix = np.asarray(docvecs)
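The snippet above targets pre-4.0 gensim. Under gensim >= 4.0 the Doc2Vec keywords size and iter were renamed vector_size and epochs, and model.docvecs became model.dv; a hedged equivalent of the last few lines would be:

# Assumption: gensim >= 4.0; not part of the original snippet.
d2v_model = Doc2Vec(tld_list, min_count=5, window=3, vector_size=100,
                    sample=1e-3, negative=5, epochs=15)
docvecs_matrix = np.asarray(d2v_model.dv.vectors)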
from gensim import corpora, models, similarities

# Prepare `texts` in advance: a list of tokenized (wakati-gaki) sentences
texts = [["きのう", "も", "私", "は", "その", "料理", "を", "食べました"]]
num_topics = 3

dictionary = corpora.Dictionary(texts)  # build a dictionary from the input texts
corpus = [dictionary.doc2bow(text) for text in texts]  # build the corpus
tfidf = models.TfidfModel(corpus)  # build the TF-IDF model
corpus_tfidf = tfidf[corpus]  # re-weight the corpus so important words dominate
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics)  # build an LSI model from corpus_tfidf

# Show the topics
print(lsi.show_topics(num_topics, formatted=True))

corpus_lsi = lsi[corpus_tfidf]  # project every TF-IDF document into LSI space
for doc in corpus_lsi:
    x = sorted(doc, key=lambda u: u[1], reverse=True)  # sort the topic weights, descending
    print(x)
print(final)  # `final` is the list of tokenized documents
dictionary = corpora.Dictionary(final)
dictionary.save('dictionary/mydict.dic')
corpus = [dictionary.doc2bow(text) for text in final]
corpora.MmCorpus.serialize('corpus/corpus.mm', corpus)

# Reload dictionary and corpus
dictionary = corpora.Dictionary.load('dictionary/mydict.dic')
corpus = corpora.MmCorpus('corpus/corpus.mm')

tfidf = models.TfidfModel(corpus=corpus)
tfidf.save('model/model.tfidf')

# Serialize the TF-IDF corpus
tfidf_corpus = tfidf[corpus]
corpora.MmCorpus.serialize('model/tfidf_corpus.mm', tfidf_corpus)

# lsi
lsi = models.LsiModel(corpus=tfidf_corpus, id2word=dictionary, num_topics=120)  # TODO
lsi_corpus = lsi[tfidf_corpus]
lsi.save('model/lsi/model.lsi')
corpora.MmCorpus.serialize('model/lsi/lsi_corpus.mm', lsi_corpus)
print('LSI Topics:')
print(lsi.print_topics(120))

# lda
lda = models.LdaModel(corpus=tfidf_corpus, id2word=dictionary, num_topics=120)
lda_corpus = lda[tfidf_corpus]
lda.save('model/lda/model.lda')
corpora.MmCorpus.serialize('model/lda/lda_corpus.mm', lda_corpus)

# index the LSI corpus (not the raw bow corpus) so it matches the saved file name
index = similarities.MatrixSimilarity(lsi_corpus)
index.save('similarity/lsi_similarity.sim')
def lsi_model(corpus_tfidf, dictionary, lsi_save_path):
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=50)
    lsi.print_topics(50)
    lsi.save(lsi_save_path)
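A minimal usage sketch for lsi_model above; the toy token lists and the save path are invented for illustration, and corpora/models are assumed to be imported from gensim as in the surrounding snippets.

texts = [["human", "computer", "interaction"],
         ["survey", "of", "user", "opinion"],
         ["graph", "of", "trees"]]
dictionary = corpora.Dictionary(texts)
bow_corpus = [dictionary.doc2bow(text) for text in texts]
tfidf = models.TfidfModel(bow_corpus)
lsi_model(tfidf[bow_corpus], dictionary, 'model/toy.lsi')  # hypothetical path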
from gensim import corpora, models, similarities
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

dictionary = corpora.Dictionary.load('resources/today.dict')
corpus = corpora.MmCorpus('resources/today.mm')

tfidf = models.TfidfModel(corpus)  # step 1 -- initialize a model
corpus_tfidf = tfidf[corpus]  # apply the trained model to a corpus

lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)  # initialize an LSI transformation
corpus_lsi = lsi[corpus_tfidf]  # create a double wrapper over the original corpus: bow -> tfidf -> fold-in-lsi
lsi.print_topics(5)
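A possible follow-up to the chain above: persist the trained LSI model and the transformed corpus with gensim's standard save/load API. The file names here are illustrative, not from the original snippet.

lsi.save('resources/today.lsi')  # illustrative file name
lsi = models.LsiModel.load('resources/today.lsi')
corpora.MmCorpus.serialize('resources/today_lsi.mm', corpus_lsi)  # optionally persist the bow->tfidf->lsi corpus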
# Transform the collection of texts to a numerical form
corpus = [dictionary.doc2bow(text) for text in tokenized_data]

# Have a look at what the 20th document looks like: [(word_id, count), ...]
# print(corpus[20])
# [(12, 3), (14, 1), (21, 1), (25, 5), (30, 2), (31, 5), (33, 1), (42, 1), (43, 2), ...

# Build the LDA model
lda_model = models.LdaModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

# Build the LSI model
lsi_model = models.LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

print("LDA Model:")
for idx in range(NUM_TOPICS):
    # Print the 10 most representative terms for each topic
    print("Topic #%s:" % idx, lda_model.print_topic(idx, 10))
print("=" * 20)

print("LSI Model:")
for idx in range(NUM_TOPICS):
    # Print the 10 most representative terms for each topic
    print("Topic #%s:" % idx, lsi_model.print_topic(idx, 10))
# dictionary = corpora.Dictionary(sentTokens[0])
# dictionary.save(os.path.join(TEMP_FOLDER, 'execsSententceTokens.dict'))
# print(dictionary)

# create and save corpus
# corpus = [dictionary.doc2bow(sentTk) for sentTk in sentTokens[0]]
# corpora.MmCorpus.serialize(os.path.join(TEMP_FOLDER, 'execsSententceTokens.mm'), corpus)  # store to disk, for later use

# load saved dictionary and corpus (os.path.join avoids unescaped-backslash path bugs)
dictionary = corpora.Dictionary.load(os.path.join(TEMP_FOLDER, 'execsAnnotatedtext.dict'))
corpus = corpora.MmCorpus(os.path.join(TEMP_FOLDER, 'execsSententceTokens.mm'))

# create LSI model with 250 topics
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=250)

# transform corpus to LSI space and index it
index = similarities.MatrixSimilarity(lsi[corpus])  # , num_features=208

# save the index
# index.save(os.path.join(TEMP_FOLDER, 'execsSententceTokensIndex.index'))
# load the saved index
# index = similarities.MatrixSimilarity.load(os.path.join(TEMP_FOLDER, 'execsSententceTokensIndex.index'))
# lsi.show_topic(1, topn=15)

docSentLkup = {}
docSentAll = {}
sentNdoc = []
# print(new_vec)  # the word "interaction" does not appear in the dictionary and is ignored
corpus = [dictionary.doc2bow(text) for text in texts]
# for c in corpus:
#     print(c)

from gensim import models
from gensim import similarities

# TF-IDF transformation
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

# LSI transformation
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary)

doc = 'What worries me about AI'  # documents[0]
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow]  # convert the query to LSI space
# print(vec_lsi)

index = similarities.MatrixSimilarity(lsi[corpus_tfidf])  # transform corpus to LSI space and index it
sims = index[vec_lsi]  # perform a similarity query against the corpus
# for i, sim in enumerate(sims):
#     print('{} - {}'.format(sim, documents[i]))

sims_s = sorted(list(enumerate(sims)), key=lambda tup: tup[1], reverse=True)
for item in sims_s:
    i = item[0]
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
# texts = [[token for token in text if frequency[token] > 1] for text in texts]
# pprint(texts)

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
tfidf = models.TfidfModel(corpus)
# lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=10)
lsi = models.LsiModel(tfidf[corpus], id2word=dictionary, num_topics=10)
# index = similarities.MatrixSimilarity(lsi[corpus])  # transform corpus to LSI and index it
# print(corpus)


@app.route('/_analyse')
def _analyse():
    return_message = ""
    doc = request.args.get('usrinp')
    if doc.lower() == "yes":
        session.clear()
        return jsonify(
def calcLSI(tfidfList, dictionary):
    global corpus_lsi
    topics = 4  # this is arbitrary; worth experimenting with
    lsi = models.LsiModel(tfidfList, id2word=dictionary, num_topics=topics)
    corpus_lsi = lsi[tfidfList]
    lsi.print_topics(topics)
p_ans_dictionary.save('/tmp/perfect_answer.dict')  # store the dictionary, for future reference
print(p_ans_dictionary)
print(p_ans_dictionary.token2id)

corpus = p_ans_dictionary.doc2bow(recorded_answer.lower().split())
corpora.MmCorpus.serialize('/tmp/p_ans.mm', [corpus])

from gensim import models

lsi = models.LsiModel([corpus], id2word=p_ans_dictionary, num_topics=2)
vec_bow = p_ans_dictionary.doc2bow(r_ans.lower().split())
vec_lsi = lsi[vec_bow]

from gensim import similarities

index = similarities.MatrixSimilarity(lsi[[corpus]])  # transform corpus to LSI space and index it
sims = index[vec_lsi]
print(list(enumerate(sims)))  # print (document_number, document_similarity) 2-tuples
import jieba
from gensim import corpora
from gensim import models
from gensim import similarities

# l1 = ["你的名字是什么", "你今年几岁了", "你有多高你胸多大", "你胸多大"]
# a = "你今年多大了"
from Config import MongoDB

l1 = list(MongoDB.Content.find({}))  # fetch the question bank from the database

all_doc_list = []
for doc in l1:
    # segment each title in the Content table with jieba and collect the token lists
    doc_list = list(jieba.cut_for_search(doc.get("title")))
    all_doc_list.append(doc_list)

# build the corpus
dictionary = corpora.Dictionary(all_doc_list)  # build the bag-of-words dictionary
corpus = [dictionary.doc2bow(doc) for doc in all_doc_list]
# LSI is fairly accurate on small data sets but much less so on large ones (fine up to ~5M docs);
# LsiModel extracts the shared latent structure used for matching search queries
lsi = models.LsiModel(corpus)
index = similarities.SparseMatrixSimilarity(lsi[corpus], num_features=len(dictionary.keys()))


def my_gensim_nlp(a):
    doc_test_list = list(jieba.cut_for_search(a))
    doc_test_vec = dictionary.doc2bow(doc_test_list)
    sim = index[lsi[doc_test_vec]]
    cc = sorted(enumerate(sim), key=lambda item: -item[1])
    if cc[0][1] >= 0.55:  # compare the best similarity score (not the document index) to the threshold
        text = l1[cc[0][0]]
        return text
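A hypothetical call to my_gensim_nlp; the query string reuses the commented example above, and the returned value is a MongoDB document (or None when nothing clears the threshold).

matched = my_gensim_nlp("你今年多大了")
if matched is not None:
    print(matched.get("title"))  # the best-matching question title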
def TFIDFH():
    document1 = []
    for w in brown.words(categories='mystery'):
        document1.append(w.lower())
    B = document1[:len(document1) // 2]

    doc = ""
    for w in brown.words(categories='mystery'):
        doc += str(w.lower())
    C, D = doc[:int(len(doc) / 2)], doc[int(len(doc) / 2):]

    stoplist = set('for a of the and to in - , is'.split())
    texts = [[word for word in document.lower().split() if word not in stoplist]
             for document in B]
    texts[0] = [text.replace(',', '') for text in texts[0]]

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)

    vec_bow = dictionary.doc2bow(D.lower().split())
    vec_lsi = lsi[vec_bow]
    index = similarities.MatrixSimilarity(lsi[corpus])
    sims = index[vec_lsi]  # perform a similarity query against the corpus
    print("results: ", sims)

    bow1 = document1
    bow2 = doc
    wordSet = set(bow1).union(set(bow2))
    wordDict1 = dict.fromkeys(wordSet, 0)
    wordDict2 = dict.fromkeys(wordSet, 0)
    for word in bow1:
        wordDict1[word] += 1
    for word in bow2:
        wordDict2[word] += 1

    def computeTF(wordDict, bow):
        tfDict = {}
        bowCount = len(bow)
        for word, count in wordDict.items():
            tfDict[word] = count / float(bowCount)
        return tfDict

    tfBow1 = computeTF(wordDict1, bow1)
    tfBow2 = computeTF(wordDict2, bow2)

    def computeIDF(docList):
        import math
        N = len(docList)
        idfDict = dict.fromkeys(docList[0].keys(), 0)
        for doc in docList:
            for word, val in doc.items():
                if val > 0:
                    idfDict[word] += 1
        for word, val in idfDict.items():
            idfDict[word] = math.log10(N / float(val))
        return idfDict

    idfs = computeIDF([wordDict1, wordDict2])
    print("IDF")
    print(idfs)

    def computeTFIDF(tfBow, idfs):
        tfidf = {}
        for word, val in tfBow.items():
            tfidf[word] = val * idfs[word]
        return tfidf

    tfidfBow1 = computeTFIDF(tfBow1, idfs)
    tfidfBow2 = computeTFIDF(tfBow2, idfs)
    print("TF-IDF Document1: ")
    print(tfidfBow1)
    print("TF-IDF Document2: ")
    print(tfidfBow2)
# Create a dictionary of the words
dictionary = corpora.Dictionary(feature_none)
# print(dictionary.token2id)

# Transform the documents to bag-of-words vectors
corpus = [dictionary.doc2bow(text) for text in feature_none]
# print(corpus[:2])

# Transform to TF-IDF
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

# Extract top topics using Latent Semantic Indexing
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=total_topics)

# Print the top topics
print_topics_gensim(topic_model=lsi,
                    total_topics=total_topics,
                    num_terms=15,
                    display_weights=False)

# In[ ]:
'''
Observations of the topics by MBTI type:
- Across the MBTI types some common themes occur:
    Personality
    Relationship
    Music
# Step 4: Creating the bag-of-words model and generating the corpus sparse vectors
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
# print(dictionary.token2id)

# Step 5: Generating TF-IDF
tfidf = models.TfidfModel(corpus, normalize=True)
corpus_tfidf = tfidf[corpus]
# for document in corpus_tfidf:
#     print(document)

# TODO: learn how to read this output, annotate the topics, and inspect the results
num_topic = [100, 200]

#### LSI ##########
for num_topics in num_topic:
    lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary,
                                num_topics=num_topics, power_iters=300)
    corpus_lsi = lsi_model[corpus_tfidf]

    # write the per-document topic assignments to file
    count = 0
    filename = 'lsi_list_' + str(num_topics) + '.csv'
    with open(filename, 'w') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        for document in corpus_lsi:
            count += 1
            try:
                writer.writerow(max(document, key=lambda item: item[1]))
            except ValueError:
                print(count)

    name_of_file = 'lsi_topics_' + str(num_topics) + '.csv'
def LSI_analysis(texts, nTopics):
    # Build a bag-of-words dictionary mapping each document token to an id
    dictionary = corpora.Dictionary(texts)
    # Convert each string document into an id-based document vector
    corpus = [dictionary.doc2bow(text) for text in texts]
    # print('Corpus: ')
    # print(corpus)

    # Fit a TF-IDF model on these "training" documents
    tfidf = models.TfidfModel(corpus)
    # print('tfidf model:')
    # print(tfidf.dfs)
    # print(tfidf.idfs)

    # Re-express the term-frequency document vectors as tf-idf document vectors
    corpus_tfidf = tfidf[corpus]
    print('Text vector formed by tf-idf')
    # for doc in corpus_tfidf:
    #     print(doc)

    # Train an LSI model
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=nTopics)
    print('Top 2 topics of LSI model:')
    print(lsi.print_topics(2))

    # Project the documents into the n-dimensional topic space
    print('The text projection in the n-dim topic space:')
    corpus_lsi = lsi[corpus_tfidf]  # n_docs x n_topics
    for doc in corpus_lsi:
        print(doc)

    # LDA model; each topic word in an LDA model has a probabilistic meaning: the
    # weights sum to 1 and larger values mean larger weights, so the interpretation
    # is clearer than with LSI
    # lda = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=nTopics)
    # print('Top 2 topics of LDA model:')
    # print(lda.print_topics(2))
    # Project the documents into the topic space
    # print('The text projection in the n-dim topic space:')
    # corpus_lda = lda[corpus_tfidf]
    # for doc in corpus_lda:
    #     print(doc)

    # To compute document similarity, or to find the most relevant documents for a
    # given query, first build an index
    index = similarities.MatrixSimilarity(lsi[corpus])

    # Vectorize the query
    query = "shipment of silver arrived"
    query_bow = dictionary.doc2bow(query.lower().split())
    print("query: " + query + "; the bow vector: ")
    # print(query_bow)

    # Map the query into the n-dimensional topic space with the trained LSI model
    query_lsi = lsi[query_bow]
    print('The projection of query in the n-dim topic space:')
    print(query_lsi)

    # Compute the cosine similarity between the query and the indexed documents
    sims = index[query_lsi]
    print('The cos similarity between query and doc:')
    print(sims)
    print(list(enumerate(sims)))

    # The results can also be sorted by similarity
    sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])
    print(sort_sims)

    query_tfidf = tfidf[query_bow]
    query_tfidf_lsi = lsi[query_tfidf]
    index_tfidf = similarities.MatrixSimilarity(lsi[corpus_tfidf])
    sims_tfid = index_tfidf[query_tfidf_lsi]
    print('The cos similarity between query and doc:')
    print(list(enumerate(sims_tfid)))
    return
def suggest_next_video(original_id, input_chunks, search_term):
    if search_term == '':
        global last_search
        search_term = last_search

    # This video_id is just a test case
    # if original_id == 'R9npBuS9AsE':
    #     output_id_list = get_canned_search_results()
    # else:
    output_video_list = query_video_ids(search_term)
    output_name_map = dict(output_video_list)
    output_id_list = [video[0] for video in output_video_list]

    # Truncate the possible video list for performance reasons
    try:
        output_id_list.remove(original_id)
    except ValueError:
        pass
    output_id_list = output_id_list[:40]

    chunk_lookup_dict = {}
    start = time.time()
    chunk_counter = 0
    output_chunks = []

    # Fetch transcripts concurrently
    myq = queue.Queue()
    threads = list()
    for video_id in output_id_list:
        thread = threading.Thread(target=queueTranscript, args=(video_id, myq))
        threads.append(thread)
        thread.start()
    for thread in threads:
        thread.join()

    for transcript in list(myq.queue):
        transcript_counter = 0
        # try:
        #     output_video_list = yttapi.get_transcript(str(video_id))
        # except yttapi.CouldNotRetrieveTranscript:
        #     continue
        output_video_list = transcript[1]
        video_length = len(transcript[1])
        video_id = transcript[0]
        # Group the transcript lines into chunks of 10
        for i in range(video_length // 10):
            chunk_text_list = []
            for j in range(10):
                try:
                    chunk_text_list.append(output_video_list[transcript_counter]['text'])
                except Exception:
                    break
                transcript_counter += 1
            chunk_text = ' '.join(chunk_text_list)
            output_chunks.append(chunk_text)
            chunk_lookup_dict[chunk_counter] = video_id
            chunk_counter += 1
    print("After chunking output: " + str(time.time() - start))
    start = time.time()

    # Exclude common stop words and those used frequently in YouTube transcripts
    my_stop_words = STOPWORDS.union(set(['[Music]', '[music]', '[Applause]',
                                         'subscribe', 'channel', 'youtube']))
    # stoplist = set('for a of the and to in [music]'.split())
    texts = [[word for word in document.lower().split() if word not in my_stop_words]
             for document in output_chunks]

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=10)

    # generates an index of the corpus, need only do this once
    index = similarities.MatrixSimilarity(lsi[corpus])
    print("After building index: " + str(time.time() - start))

    video_average_score = {}
    for video_id in output_id_list:
        video_average_score[video_id] = []

    start = time.time()
    # Go through each input chunk and get an average score for each video
    for i in range(len(input_chunks)):
        # Skip over chunks the user didn't watch
        watched_score = input_chunks[i][1]
        if watched_score == 0:
            continue
        doc = input_chunks[i][0]
        # doc = input_chunks[0][0]
        vec_bow = dictionary.doc2bow(doc.lower().split())
        vec_lsi = lsi[vec_bow]
        similarity_score = index[vec_lsi]
        # sorts based on descending relevance (earlier sort order = more useful)
        similarity_scores = sorted(enumerate(similarity_score), key=lambda item: -item[1])
        # chunk_ranking = [(documents[x], y) for (x, y) in similarity_scores]

        video_total_score = {}
        video_chunk_counts = {}
        for video_id in output_id_list:
            video_total_score[video_id] = 0
            video_chunk_counts[video_id] = 0
        for chunk_id, score in similarity_scores:
            video_total_score[chunk_lookup_dict[chunk_id]] += score
            video_chunk_counts[chunk_lookup_dict[chunk_id]] += 1

        # Multiply the similarity ranking by the 'score' given to us that represents
        # how slowly they watched the video chunk and how many times they repeated it.
        # We append this to a list of average scores for the video.
        for video_id in output_id_list:
            if video_chunk_counts[video_id] == 0:
                video_average_score[video_id].append(0)
            else:
                avg_score = video_total_score[video_id] / video_chunk_counts[video_id]
                video_average_score[video_id].append(avg_score)
    print("After looping through input chunks: " + str(time.time() - start))

    video_sum = {}
    for idx, video_id in enumerate(video_average_score.keys()):
        total_score = sum(x for x in video_average_score[video_id])
        # video_sum[video_id] = (total_score * (1 + RL_WEIGHT_FACTOR * rl_network.weights['param_' + str(idx)]), output_name_map[video_id])
        video_sum[video_id] = (total_score, output_name_map[video_id])

    sorted_videos = list(sorted(video_sum.items(), key=lambda kv: -kv[1][0]))

    # now apply geva
    return_videos = []
    for i in range(0, 10):
        return_videos.append((sorted_videos[i][0],
                              (sorted_videos[i][1][0] * (1.0 + RL_WEIGHT_FACTOR * rl_network.weights['param_' + str(i)]),
                               sorted_videos[i][1][1])))
    return return_videos
def data():
    pos = pd.read_excel("E:/dataset/words_classification/dataset/pos.xls", header=None, index_col=None)
    neg = pd.read_excel("E:/dataset/words_classification/dataset/neg.xls", header=None, index_col=None)

    # load the stop-word list
    stop_words = []
    with open('E:/dataset/NLP/stopwords/stopwords_TUH.txt', 'r', encoding='gbk') as f:
        line = f.readline()
        while line:
            stop_words.append(line[:-1])
            line = f.readline()
    stop_words = set(stop_words)

    pos_split = []
    for index, seq in enumerate(pos[0]):
        pos_seq = list(jieba.cut(seq, cut_all=False))
        pos_line = [word for word in pos_seq if word not in stop_words]
        pos_split.append(pos_line)

    neg_split = []
    for index, seq in enumerate(neg[0]):
        neg_seq = list(jieba.cut(seq, cut_all=False))
        neg_line = [word for word in neg_seq if word not in stop_words]
        neg_split.append(neg_line)

    # plain list concatenation; np.concatenate on ragged token lists is fragile
    data = pos_split + neg_split

    # tokenized-word dictionary
    data_dic = corpora.Dictionary(data)
    data_dic.save('E:/dataset/words_classification/dataset/tf-idf_model/data_dict')

    # convert to sparse frequency vectors and save the generated corpus
    corpus = [data_dic.doc2bow(text) for text in data]
    corpora.MmCorpus.serialize('E:/dataset/words_classification/dataset/tf-idf_model/data_corpus', corpus)

    # tf-idf
    tfidf_model = models.TfidfModel(corpus=corpus, dictionary=data_dic)
    # corpus_tfidf = tfidf_model[corpus]
    corpus_tfidf = [tfidf_model[doc] for doc in corpus]
    tfidf_model.save('E:/dataset/words_classification/dataset/tf-idf_model/data_tf-idf.tfidf')

    # lsi
    lsi_model = models.LsiModel(corpus=corpus, id2word=data_dic, num_topics=100)
    # corpus_lsi = lsi_model[tfidf_corpus]
    corpus_lsi = [lsi_model[doc] for doc in corpus]
    lsi_model.save("E:/dataset/words_classification/dataset/tf-idf_model/data_lsi")
    return corpus_lsi
def get_recmd(self, id, type):
    texts1 = self.get_behaviors(id, type)  # get the user's behaviors
    if texts1 == []:
        print("user's behavior is null")
        if type == 'Home':
            return_articles = Article.objects.all().order_by('?')[:20]
        else:
            return_articles = Article.objects.filter(articleType1=type).order_by('?')[:20]
        return return_articles
    else:
        print("user has behavior")
        if type == 'Home':
            num = len(texts1)
            study = 0
            news = 0
            fun = 0
            for i in texts1:
                t = Article.objects.get(articleId=i).articleType1
                if t == 'Study':
                    study += 1
                elif t == 'News':
                    news += 1
                else:
                    fun += 1
            s_num = int(1.0 * study / num * 20)
            n_num = int(1.0 * news / num * 20)
            f_num = int(1.0 * fun / num * 20)
            return_articles = []
            return_articles.extend(Article.objects.filter(articleType1='Study').order_by('?')[0:s_num])
            return_articles.extend(Article.objects.filter(articleType1='News').order_by('?')[0:n_num])
            return_articles.extend(Article.objects.filter(articleType1='Fun').order_by('?')[0:f_num])
            return return_articles
        else:
            texts = []
            articles = Article.objects.filter(articleType1=type)
            id_map = {}  # row index -> articleId (renamed from `dict` to avoid shadowing the builtin)
            i = 0
            for article in articles:
                id_map[i] = article.articleId
                i += 1
                l = article.articleEnglishText.lower().replace(',', " ").replace('.', " ").replace('"', " ").split()
                new_l = [word for word in l if word not in stopwords]
                texts.append(new_l)

            dictionary = corpora.Dictionary(texts)
            corpus = [dictionary.doc2bow(text) for text in texts]
            tfidf = models.TfidfModel(corpus)
            corpus_tfidf = tfidf[corpus]
            lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)
            corpus_lsi = lsi[corpus_tfidf]
            index = similarities.MatrixSimilarity(lsi[corpus])

            simi = zeros((len(texts1), len(texts)))
            for i in range(len(texts1)):
                query = texts[i]
                query_bow = dictionary.doc2bow(query)
                query_lsi = lsi[query_bow]
                sims = index[query_lsi]
                l = list(enumerate(sims))
                for j in range(len(l)):
                    simi[i][l[j][0]] = l[j][1]

            s1 = numpy.array(zeros(len(texts)))
            for i in range(len(simi)):
                s1 += numpy.array(simi[i])
            s1 = s1 / len(texts1)  # average over the behavior rows (in-place loop was a no-op)
            simi1 = s1

            d1 = {}
            for i in range(len(simi1)):
                d1[id_map[i]] = simi1[i]
            sorted_dict = sorted(d1.items(), key=lambda x: x[1], reverse=True)
            return_articles = []
            for i in range(20):
                return_articles.append(Article.objects.get(articleId=sorted_dict[i][0]))
            return return_articles
def train_lsi(self):
    lsi = models.LsiModel(self.corpus_tfidf,
                          id2word=self.dictionary,
                          num_topics=self.num_topics)
    return lsi
file_names = [file_name for file_name in file_names
              if filter_file_by_content(file_name, unanalyzed_senders) is not None]
logging.debug('after filtering we are using %d files', len(file_names))

corpus = MyCorpus([file_name for file_name in file_names])
corpus.dictionary.save(dictionary_file_name)
corpora.MmCorpus.save_corpus(corpus_file_name, corpus)

model = models.LsiModel(corpus,
                        num_topics=topics_count,
                        id2word=corpus.dictionary,
                        chunksize=20000,
                        distributed=False,
                        onepass=True)
logging.debug('built LSI model')
model.save(model_file_name)
logging.debug('saved LSI model as %s', model_file_name)

model.show_topics(num_topics=topics_count)
topics_matrix = model.show_topics(formatted=False, num_words=top_words_count)
logging.debug(topics_matrix)
model.print_topics(-1)
def LSI(request):
    query = ""
    query_response = None
    file_list = None
    file_list_dictionary = None
    search_result_dictionary = None

    # Read the 1033 documents of the collection
    documents = []
    for counter in range(1033):
        temp = open("IR/" + str(counter + 1) + ".txt", 'r')
        documents.append(temp.read())
        temp.close()

    stop_words = stopwords.words('english')
    texts = [[word for word in document.lower().split() if word not in stop_words]
             for document in documents]

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('/tmp/ir.mm', corpus)
    lsi = models.LsiModel(corpus, num_topics=43, id2word=dictionary)

    if request.method == "POST":
        form = SearchForm(request.POST)
        if form.is_valid():
            query_response = list()
            user_query = form.save()
            query = user_query.query
            user_query.save()

            index = similarities.MatrixSimilarity(lsi[corpus])
            doc = user_query.query
            vec_bow = dictionary.doc2bow(doc.lower().split())
            vec_lsi = lsi[vec_bow]
            sims = index[vec_lsi]
            sims = sorted(enumerate(sims, 1), key=lambda item: -item[1])

            file_list = list()
            for element in sims[0:5]:
                file_list.append(element[0])

            for text in file_list:
                temp = open("IR/" + str(text) + ".txt", 'r')
                query_response.append(temp.read())
                temp.close()
            # print(query_response)

            file_list_dictionary = {i: file_list[i - 1]
                                    for i in range(1, len(file_list) + 1)}
            search_result_dictionary = {i: query_response[i - 1]
                                        for i in range(1, len(query_response) + 1)}
    else:
        form = SearchForm()

    return render(
        request, "lsi.html", {
            'form': form,
            'query': query,
            'answer': file_list,
            'search_results': query_response,
            'file_dictionary': file_list_dictionary,
            'search_result_dictionary': search_result_dictionary
        })
def create_lsi_model(texts):
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
    return dictionary, corpus, lsi
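A short driver for create_lsi_model, sketched with invented toy token lists; similarities is assumed to be importable from gensim as elsewhere in these snippets.

from gensim import similarities

texts = [["human", "computer", "interaction"],
         ["graph", "minors", "survey"],
         ["computer", "graph", "trees"]]
dictionary, corpus, lsi = create_lsi_model(texts)
index = similarities.MatrixSimilarity(lsi[corpus])          # index the corpus in LSI space
query_vec = lsi[dictionary.doc2bow(["human", "computer"])]  # project a query the same way
print(sorted(enumerate(index[query_vec]), key=lambda t: -t[1]))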
print('Text = ')
pprint(texts)

dictionary = corpora.Dictionary(texts)
print(dictionary)
V = len(dictionary)

corpus = [dictionary.doc2bow(text) for text in texts]
corpus_tfidf = models.TfidfModel(corpus)[corpus]
corpus_tfidf = corpus  # NOTE: this overrides the TF-IDF weighting with the raw counts

print('TF-IDF:')
for c in corpus_tfidf:
    print(c)

print('\nLSI Model:')
lsi = models.LsiModel(corpus_tfidf, num_topics=2, id2word=dictionary)
topic_result = [a for a in lsi[corpus_tfidf]]
pprint(topic_result)

print('LSI Topics:')
pprint(lsi.print_topics(num_topics=2, num_words=5))

similarity = similarities.MatrixSimilarity(lsi[corpus_tfidf])  # or similarities.Similarity()
print('Similarity:')
pprint(list(similarity))

print('\nLDA Model:')
num_topics = 2
lda = models.LdaModel(corpus_tfidf,
                      num_topics=num_topics,
                      id2word=dictionary,
                      alpha='auto',
                      eta='auto',
                      minimum_probability=0.001,
                      passes=10)
doc_topic = [doc_t for doc_t in lda[corpus_tfidf]]
print('Document-Topic:\n')
pprint(doc_topic)
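If the LSI document-topic weights are needed as a dense array downstream, gensim's matutils can convert the sparse output; a small sketch assuming the two-topic model above.

from gensim import matutils

# corpus2dense returns a (num_topics x num_docs) array; transpose for docs x topics
doc_topic_matrix = matutils.corpus2dense(lsi[corpus_tfidf], num_terms=2).T
print(doc_topic_matrix.shape)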
cleaned_text = [p_stemmer.stem(i) for i in tokens]
all_sentences.append(cleaned_text)
for token in cleaned_text:
    all_words.append(token)

# Generate corpus and dictionary.
dictionary = gensim.corpora.Dictionary(all_sentences)
corpus = [dictionary.doc2bow(sentence) for sentence in all_sentences]

### Step 3: Train LSA and t-SNE model.
# Here I train an LSA model and reduce it to a 2D t-SNE space.

# Train.
lsa_model = models.LsiModel(corpus, id2word=dictionary, num_topics=1000)

# Get the most frequent unique words and prepare the t-SNE input.
all_words_unique = nltk.FreqDist(all_words).most_common(4100)
all_words_unique_vec = []
all_words_unique_word = []
for index2, item2 in enumerate(all_words_unique):
    all_words_unique_vec.append(list(model.wv[item2[0]]))
    all_words_unique_word.append(item2[0])

# Set up t-SNE model.
tsne_model = TSNE(n_components=2, random_state=10, perplexity=50.0)
X = np.array(all_words_unique_vec)
thunderbird_rss_list = list()
for row in cursor_1:
    thunderbird_rss_list.append(row[4])

tokenizer = RegexpTokenizer(r'\w+')
en_stop = set(stopwords.words('english'))
p_stemmer = PorterStemmer()
doc_set = thunderbird_rss_list

texts = []
for i in doc_set:
    raw = i.lower()
    tokens = tokenizer.tokenize(raw)
    stopped_tokens = [i for i in tokens if i not in en_stop]
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    texts.append(stemmed_tokens)

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
lsimodel = models.LsiModel(corpus, num_topics=2, id2word=dictionary)
print(lsimodel.print_topics(2))

end_time = time()
time_taken = end_time - start_time
print("Total time taken in seconds: ", time_taken)
def CallTransformationModel(self, Dict, Bowvec, **kwarg):
    '''Invoke specific transformation models of the Gensim module.

    # Arguments:
        Dict: Dictionary made from all tokenized news (articles/documents).
        Bowvec: Bow-vectors created from all tokenized news (articles/documents).
        modelType: Transformation model type: 'lsi', 'lda' or 'None' ('None' means the TF-IDF model).
        tfDim: The number of topics that will be extracted from each news item (article/document).
        renewModel: Whether to re-train the transformation models (bool).
        modelPath: The path for saving trained transformation models.
    '''
    if kwarg['renewModel']:
        tfidf = models.TfidfModel(Bowvec)  # initialize the TF-IDF model
        tfidfVec = tfidf[Bowvec]  # use the model to transform the whole corpus
        tfidf.save(kwarg['modelPath'] + "tfidf_model.tfidf")
        if kwarg['modelType'] == 'lsi':
            model = models.LsiModel(tfidfVec, id2word=Dict,
                                    num_topics=kwarg['tfDim'])  # initialize an LSI transformation
            modelVec = model[tfidfVec]  # double wrapper over the original corpus: bow -> tfidf -> fold-in-lsi
            model.save(kwarg['modelPath'] + "lsi_model.lsi")  # save with the same file name the load branch expects
        elif kwarg['modelType'] == 'lda':
            model = models.LdaModel(tfidfVec, id2word=Dict, num_topics=kwarg['tfDim'])
            modelVec = model[tfidfVec]  # LDA vector per document; sparse, each value is the membership weight of a topic
            model.save(kwarg['modelPath'] + "lda_model.lda")
        elif kwarg['modelType'] == 'None':
            model = tfidf
            modelVec = tfidfVec
    else:
        if not os.path.exists(kwarg['modelPath'] + "tfidf_model.tfidf"):
            tfidf = models.TfidfModel(Bowvec)  # initialize the TF-IDF model
            tfidfVec = tfidf[Bowvec]
            tfidf.save(kwarg['modelPath'] + "tfidf_model.tfidf")
        else:
            tfidf = models.TfidfModel.load(kwarg['modelPath'] + "tfidf_model.tfidf")
            tfidfVec = tfidf[Bowvec]  # use the model to transform the whole corpus
        if kwarg['modelType'] == 'lsi':
            if not os.path.exists(kwarg['modelPath'] + "lsi_model.lsi"):
                tfidf = models.TfidfModel.load(kwarg['modelPath'] + "tfidf_model.tfidf")
                tfidfVec = tfidf[Bowvec]  # use the model to transform the whole corpus
                model = models.LsiModel(tfidfVec, id2word=Dict,
                                        num_topics=kwarg['tfDim'])  # initialize an LSI transformation
                modelVec = model[tfidfVec]  # double wrapper over the original corpus: bow -> tfidf -> fold-in-lsi
                model.save(kwarg['modelPath'] + "lsi_model.lsi")
            else:
                model = models.LsiModel.load(kwarg['modelPath'] + "lsi_model.lsi")
                modelVec = model[tfidfVec]
        elif kwarg['modelType'] == 'lda':
            if not os.path.exists(kwarg['modelPath'] + "lda_model.lda"):
                tfidf = models.TfidfModel.load(kwarg['modelPath'] + "tfidf_model.tfidf")
                tfidfVec = tfidf[Bowvec]  # use the model to transform the whole corpus
                model = models.LdaModel(tfidfVec, id2word=Dict, num_topics=kwarg['tfDim'])
                modelVec = model[tfidfVec]  # LDA vector per document; sparse, each value is the membership weight of a topic
                model.save(kwarg['modelPath'] + "lda_model.lda")
            else:
                model = models.LdaModel.load(kwarg['modelPath'] + "lda_model.lda")
                modelVec = model[tfidfVec]
        elif kwarg['modelType'] == 'None':
            model = tfidf
            modelVec = tfidfVec
    return tfidfVec, modelVec
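A hypothetical invocation of CallTransformationModel; `analyzer`, `dictionary`, `bow_corpus` and the model path are placeholders, not names from the original code.

tfidfVec, modelVec = analyzer.CallTransformationModel(
    Dict=dictionary,        # gensim Dictionary built elsewhere
    Bowvec=bow_corpus,      # bag-of-words corpus built elsewhere
    modelType='lsi',        # 'lsi', 'lda', or 'None' for plain TF-IDF
    tfDim=100,              # number of topics to extract
    renewModel=False,       # load saved models instead of re-training
    modelPath='./models/')  # placeholder directory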
corpus = [dictionary.doc2bow(text) for text in texts]
corpora.MmCorpus.serialize('lyrics.mm', corpus)

##################
if os.path.exists("lyrics.dict"):
    dictionary = corpora.Dictionary.load('lyrics.dict')
    corpus = corpora.MmCorpus('lyrics.mm')
    print("Used files generated from first tutorial")
else:
    print("Please run first tutorial to generate data set")

tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)
corpus_lsi = lsi[corpus_tfidf]
lsi.save('lyrics.lsi')
lsi = models.LsiModel.load('lyrics.lsi')

##################
dictionary = corpora.Dictionary.load('lyrics.dict')
corpus = corpora.MmCorpus('lyrics.mm')
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=10)

# a pre-segmented lyrics document used as the query
doc = "望著 滿天星斗 的 塗鴉 好像 看見 自己 童年 的 模樣 總是 說 著 淘氣 浪漫 的 願望 夢想 能夠 飛往 燦爛 的 天堂 而 那天 真的 心願 正 溫柔 地 對 我 說 當你 陷入 絕望 中請 記得 我 用 美麗 的 幻想 讓 真心 永遠 純真 而 不變 當你 寂寞 的 時候 請 想念 我 用 單純 的 信仰 給 自己 溫暖 的 回答 閉上 雙眼 靜靜地 徜徉 彷彿 穿越時空 回到 了 過往 以為 銀河 就 在 不遠 的 前方 星星 月亮 都 在 我 面前 玩耍 而 那 微小 的 喜悅 正 溫柔 地 對 我 說 當你 陷入 絕望 中請 記得 我 用 美麗 的 幻想 讓 真心 永遠 純真 而 不變 當你 寂寞 的 時候 請 想念 我 用 單純 的 信仰 給 自己 溫暖 的 回答 ( 和 童 年時 無邪 的 希望 ) 親愛 的 我 親愛 的 我 願 你 永遠 像 我 一樣 帶著 勇氣 和 倔強 歲月 改變 你 的 模樣 無法 改變 你 的 去向"
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow]
# for idx, f in enumerate(sys.argv[3:]):
#     sample_map[idx] = f

# A magic number from the original program
num_topics = round(len(sys.argv[3:]) * 0.8)
# set it to at least 5 (for testing)
if num_topics < 5:
    num_topics = 5

# from the paper
READING_CHUNK_SIZE = 200000

lsi = models.LsiModel(
    corpus,
    num_topics=num_topics,
    # id2word=sample_map,
    distributed=False,
    chunksize=READING_CHUNK_SIZE)
# lsi.save(outdir + '/kmer_lsi.gensim')

# Done training the LSI; now we cluster kmers.
# Map the kmer_docs into topic space.
corpus_transform = lsi[corpus]
num_kmer_docs = len(corpus_transform)
num_kmer_docs_to_sample = num_kmer_docs * PERCENTAGE_OF_KMER_DOCS_FOR_SEEDING
NUM_CHUNKS = int(round((num_kmer_docs * PERCENTAGE_OF_KMER_DOCS_FOR_SEEDING) / KMER_DOC_CHUNK_SIZE))