# NOTE: tail of a larger script -- `texts`, `frequency`, `titles`, `abstracts`,
# `corpora`, `models` and `similarities` are defined earlier in the file.

# Drop tokens that occur only once across the whole collection.
texts = [[token for token in text if frequency[token] > 1] for text in texts]
dictionary = corpora.Dictionary(texts)
#print(dictionary.token2id)
# Bag-of-words representation of every document.
corpus = [dictionary.doc2bow(text) for text in texts]
# Pipeline: bow -> tf-idf -> 400-topic LSI space.
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=400)
corpus_lsi = lsi[corpus_tfidf]
#lsi.print_topics(20)
#print(corpus_lsi[0], max(corpus_lsi[0], key=lambda x:x[1]))
#print(corpus_lsi[1], max(corpus_lsi[0], key=lambda x:x[1]))
# Dense cosine-similarity index over the LSI vectors.
index = similarities.MatrixSimilarity(corpus_lsi)
#vec_lsi = corpus_lsi[0]
#sims = index[vec_lsi]
#sims = sorted(enumerate(sims), key=lambda item: -item[1])
#print(list(enumerate(sims))[0:9])
#print(titles[0])
#for i in range(5):
#    print(titles[sims[i][0]])
#For now, just print out first 5 papers in DB and top 3 recommended papers
# NOTE(review): range(6) prints 6 papers, not the 5 the comment above
# claims -- confirm which count was intended.
for i in range(6):
    print("Title:", titles[i])
    print(
        "Abstract:",
        abstracts[i],
    )
# 10. Compute distance between texts # The results of the tf-idf algorithm now return stemmed tokens which are specific to each book. # We can, for example, see that topics such as selection, breeding or domestication are defining "On the Origin of Species" (and yes, in this book, # Charles Darwin talks quite a lot about pigeons too). Now that we have a model associating tokens to how specific they are to each book, # we can measure how related to books are between each other. # To this purpose, we will use a measure of similarity called cosine similarity # and we will visualize the results as a distance matrix, i.e., a matrix showing all pairwise distances between Darwin's books. # Load the library allowing similarity computations from gensim import similarities # Compute the similarity matrix (pairwise distance between all texts) sims = similarities.MatrixSimilarity(model[bows]) # Transform the resulting list into a dataframe sim_df = pd.DataFrame(list(sims)) # Add the titles of the books as columns and index of the dataframe sim_df.columns = titles sim_df.index = titles # Print the resulting matrix sim_df # 11. The book most similar to "On the Origin of Species"
# lsi_model.save(os.getcwd() + '/tmp/model.lsi') # same for tfidf, lda, ... lsi_model = models.LsiModel.load(os.getcwd() + '/tmp/model.lsi') # # # ********************************************************* # # # # ## LDA模型 ************************************************** # lda_model = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=2) # corpus_lda = lda_model[corpus_tfidf] # # Show2dCorpora(corpus_lsi) # print("===========corpus_ldacorpus_lda") # nodes = list(corpus_lda) # pprint(list(corpus_lda)) # # # # # 此外,还有Random Projections, Hierarchical Dirichlet Process等模型 corpus_simi_matrix = similarities.MatrixSimilarity(corpus_lsi) # 计算一个新的文本与既有文本的相关度 #要处理的对象登场 target_courses = ['环境还行,但是感觉不是很好吃,排队的人太多了'] target_text = tp.seg_fil_rew(target_courses) print(target_text) test_bow = dictionary.doc2bow(target_text[0]) #转换成次数 test_tfidf = tfidf_model[test_bow] test_lsi = lsi_model[test_tfidf] test_simi = corpus_simi_matrix[test_lsi] print(list(enumerate(test_simi))) # 排序,为输出方便 sort_sims = sorted(enumerate(test_simi), key=lambda item: -item[1]) # 查看结果 print(sort_sims[0:10]) # 看下前10个最相似的,第一个是基准数据自身
# Build an LSI similarity index over the panda corpus and find the sentence
# most similar to a small hand-picked positive query.
FILENAME = 'panda_corpus.txt'
panda_g = corpus(FILENAME)
si = SenSimi(panda_g)
panda_raw = si.reconstructdata()
print(type(panda_raw))
bowlist = si.bowcorpus(panda_raw)
print(bowlist[1])
panda_tfidfmodel = si.tfidfmodel(bowlist)
panda_tfidf = panda_tfidfmodel[bowlist]
# FIXME: cannot build the index from the full corpus -- exceeds numpy.array limits
# FIXME: with a partial corpus, the sentences being compared are not included
#        in the feature vectors (basis) of the index matrix
print('using lsi model...')
panda_lsi = LsiModel(corpus=panda_tfidf, id2word=si.word_dict, num_topics=300)
index = similarities.MatrixSimilarity(panda_lsi[panda_tfidf])

# Query: project the keyword list bow -> tf-idf -> LSI, then rank the corpus.
good = ['可爱', '萌', '喜欢', '国宝', '神奇']
good_bow = si.word_dict.doc2bow(good)
good_tfidf = panda_tfidfmodel[good_bow]
good_lsi = panda_lsi[good_tfidf]
simi = index[good_lsi]
simi_list = list(simi)
# Compute the best score once (it was previously recomputed for .index()).
best_score = max(simi_list)
print(best_score)
where = simi_list.index(best_score)
print(panda_raw[where])
logging.root.level = logging.INFO

# Load the dictionary, bow corpus and trained LDA model from disk.
dictionary = corpora.Dictionary.load('D:/workspace/scrap_sg/dictionary.dict')
corpus = corpora.MmCorpus('D:/workspace/scrap_sg/corpus.mm')
lda = models.ldamodel.LdaModel.load('D:/workspace/scrap_sg/lda.model')


def print_text(filename):
    # Dump the raw contents of *filename* to stdout.
    # NOTE(review): the context-manager variable shadows the builtin `input`.
    with open(filename, "r") as input:
        raw = input.read()
        print raw


lda.print_topics(50)
# Index the whole corpus in LDA topic space for similarity queries, and save it.
index = similarities.MatrixSimilarity(lda[corpus])
index.save("D:/workspace/scrap_sg/simIndex.index")
doc_lda = lda[corpus]


# inspect one doc
def inspect_corpus(index, doc_lda, file_lst):
    # Pretty-print the topic distribution of document *index*, print the top
    # words of each topic the document belongs to, then print the document text.
    pprint(doc_lda[index])
    print file_lst[index]
    topics = [topic for topic, weight in doc_lda[index]]
    for i in range(0, lda.num_topics):
        if i in topics:
            print "TOPIC " + str(i) + ":" + str(lda.print_topic(i))
            print "\n"
    print_text(file_lst[index])
# Tokenize: lowercase, split on whitespace and drop common stop words.
stoplist = set('for a of the and to in'.split())
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in documents]

# remove words that appear only once
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
texts = [[token for token in text if frequency[token] > 1] for text in texts]

from pprint import pprint  # pretty-printer
#pprint(texts)

dictionary = corpora.Dictionary(texts)
#print dictionary.token2id

# The query document, converted into the same bag-of-words space.
new_doc = "Human computer interaction"
new_vec = dictionary.doc2bow(new_doc.lower().split())
#print new_vec

corpus = [dictionary.doc2bow(text) for text in texts]
#print corpus

# tf-idf model plus a cosine-similarity index over the tf-idf corpus.
tfidf = models.TfidfModel(corpus)
index = similarities.MatrixSimilarity(tfidf[corpus])
sims = index[tfidf[new_vec]]
# NOTE(review): [1][0] selects the SECOND most similar document, not the top
# hit -- confirm this is intentional.
i = sorted(list(enumerate(sims)), key=lambda x: -x[1])[1][0]
print documents[i]
def LSA_Kmeans(clusters, textoTreinamento, nomeUsuarios, textoComparacao=None):
    """Cluster the textoComparacao documents via LSA similarities + KMeans.

    clusters         -- number of KMeans clusters
    textoTreinamento -- documents used to build the dictionary / semantic space
    nomeUsuarios     -- user names (only used by the commented-out debug dump)
    textoComparacao  -- documents to be compared and clustered
    Returns a defaultdict mapping cluster label -> list of document indices.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    ##########################################################################################
    # PRE-PROCESSING OF THE TEXT USED TO BUILD THE SEMANTIC-RELATIONS DICTIONARY             #
    ##########################################################################################
    # Strip accents and punctuation (removeA / removePontuacao helpers).
    textoTrein = [removeA(removePontuacao(i)) for i in textoTreinamento]
    #print textoTrein
    textoComp = [removeA(removePontuacao(i)) for i in textoComparacao]

    # Load the NLTK Portuguese stop-word list and strip its accents too.
    stop = stopwords.words('portuguese')
    stoplist = [(removeA(s)) for s in stop]
    # print stoplist

    # Remove stop words and words shorter than 4 characters.
    textoTrein = [[word for word in document.lower().split()
                   if word not in stoplist and len(word) > 3] \
                  for document in textoTrein]
    # print sw_textoTrein
    textoComp = [[word for word in document.lower().split()
                  if word not in stoplist and len(word) > 3] \
                 for document in textoComp]
    # print textoComp

    ##############################################################################################
    # START OF LSA - BUILDING THE TERM/FREQUENCY DICTIONARY                                      #
    ##############################################################################################
    # Count how often each token appears across the whole training collection.
    frequencia = defaultdict(int)
    for t in textoTrein:
        for token in t:
            frequencia[token] += 1
    # pprint(frequencia)

    # Frequency-1 words carry no co-occurrence information; drop them.
    textoTrein = [[token for token in palavra if frequencia[token] > 1]\
                  for palavra in textoTrein]
    # pprint(textoTrein)

    ##########################################################################################
    # Dictionary encapsulates the mapping between normalized words and their integer ids.   #
    # The main function is `doc2bow`, which converts a collection of words to its           #
    # bag-of-words representation: a list of (word_id, word_frequency) 2-tuples.            #
    ##########################################################################################
    dicionario = corpora.Dictionary(textoTrein)
    # print dicionario

    # Ids of words that appear in only one document (py2 iteritems).
    once_ids = [
        tokenId for tokenId, docfreq in dicionario.dfs.iteritems()
        if docfreq == 1
    ]
    # print once_ids
    dicionario.filter_tokens(once_ids)  # remove all frequency-1 words
    dicionario.compactify()  # re-pack the ids freed by the removal
    # print dicionario.token2id  # token -> tokenId
    # print dicionario.dfs  # document frequencies: tokenId -> in how many documents this token appeared

    # Bag-of-words corpora for both document sets.
    corpus_textoTrein = [dicionario.doc2bow(texto) for texto in textoTrein]
    # pprint(corpus_textoTrein)
    corpus_textoComp = [dicionario.doc2bow(textoC) for textoC in textoComp]
    # pprint(corpus_textoComp)

    ##########################################################################################
    # TRANSFORMATION MODEL - BAG-OF-WORDS TO TF-IDF                                          #
    ##########################################################################################
    tfidf_TextoTrein = models.TfidfModel(corpus=corpus_textoTrein)
    # print tfidf_TextoTrein
    # NOTE(review): the tf-idf model is trained on the training corpus but
    # applied to the comparison corpus here -- confirm that is intended.
    corpus_tfidf_TextoTrein = tfidf_TextoTrein[corpus_textoComp]
    # print list(corpus_tfidf_TextoTrein)

    # Transform the tf-idf matrix into LSA space, one topic per dictionary entry.
    modelo_lsa = models.LsiModel(corpus_tfidf_TextoTrein,
                                 id2word=dicionario,
                                 num_topics=len(dicionario))

    # Project every comparison document into the LSA space as a query.
    query = []
    for q in textoComparacao:
        vec_bow = dicionario.doc2bow(q.lower().split())
        vec_lsi = modelo_lsa[
            vec_bow]  # convert the comparison query into LSI space
        query.append(vec_lsi)
    # print "query"
    # pprint(query)

    # Transform corpus_textoComp into LSA space and index it.
    indexComp = similarities.MatrixSimilarity(modelo_lsa[corpus_textoComp])
    # print "indexComp"
    # pprint(list(indexComp))

    # To obtain similarities of our query document against the indexed documents:
    # perform a similarity query against the corpus
    sims = indexComp[query]
    # pprint(sims)

    ##########################################################################################
    # COMBINATION WITH K-MEANS TO PRODUCE THE CLUSTERS                                       #
    ##########################################################################################
    ## Ideal value, after experiments = 100000
    km_model = KMeans(n_clusters=clusters, n_init=100000)
    km_model.fit_transform(sims)
    clustering = collections.defaultdict(list)
    for idx, label in enumerate(km_model.labels_):
        clustering[label].append(idx)
    ### debug dump for the console
    # print "clustering _LSA_KMEANS"
    # pprint(clustering)
    # print len(clustering)
    # for i in range(len(clustering)):
    #     for j in clustering[i]:
    #         print "grupo", i
    #         print j, nomeUsuarios[j]
    #         print textoComparacao[j]
    return clustering
f.close() #语料太大的情况下可以强制GC回收内存空间 #gc.collect() #生成字典 dictionary = corpora.Dictionary(train_set) #去除极低频的杂质词 dictionary.filter_extremes(no_below=1, no_above=1, keep_n=None) #将词典保存下来,将语料也保存下来,语料转换成bow形式,方便后续使用 dictionary.save(output + "all.dic") corpus = [dictionary.doc2bow(text) for text in train_set] saveObject(output + "all.cps", corpus) #存储原始的数据 saveObject(output + "all.info", docinfos) #TF*IDF模型生成 #使用原始数据生成TFIDF模型 tfidfModel = models.TfidfModel(corpus) #通过TFIDF模型生成TFIDF向量 tfidfVectors = tfidfModel[corpus] #存储tfidfModel tfidfModel.save(output + "allTFIDF.mdl") indexTfidf = similarities.MatrixSimilarity(tfidfVectors) indexTfidf.save(output + "allTFIDF.idx") #LDA模型 lda = models.LdaModel(tfidfVectors, id2word=dictionary, num_topics=50) lda.save(output + "allLDA50Topic.mdl") corpus_lda = lda[tfidfVectors] indexLDA = similarities.MatrixSimilarity(corpus_lda) indexLDA.save(output + "allLDA50Topic.idx")
# NOTE(review): TfidfModel is fitted on `corpus_tfidf` (named as if it were
# already tf-idf vectors) rather than a raw bow corpus -- confirm upstream naming.
tfidf = models.TfidfModel(corpus_tfidf)
vec = [(0, 1), (4, 1)]  # tiny hand-made bow vector for spot checks
# print(tfidf[vec])
print(corpus_tfidf[0])
print(corpus_tfidf[-1])

from sim import print_similaries

# Train a 250-topic LDA model in batch mode (update_every=0), 20 passes.
num_topic = 250
lda = models.LdaModel(corpus_tfidf,
                      id2word=dictionary,
                      num_topics=num_topic,
                      update_every=0,
                      passes=20)
lda.print_topics(num_topic)

# Similarity indexes: sparse for the tf-idf vectors, dense for the LDA vectors.
tfidf_index = similarities.SparseMatrixSimilarity(tfidf[corpus_tfidf],
                                                  num_features=len(tfidf.dfs))
lda_index = similarities.MatrixSimilarity(lda[corpus_tfidf])
print_similaries(tfidf, corpus_tfidf, zip_data, tfidf_index)
print_similaries(lda, corpus_tfidf, zip_data, lda_index)

# num_topic = 10
# lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topic)
# corpus_lsi = lsi[corpus_tfidf]
# lsi.print_topics(num_topic)
from gensim import corpora

# Dictionary over the tokenized contexts; drop very rare (<3 docs) and very
# common (>10000 docs) tokens before building the bow corpus.
dct = corpora.Dictionary(contexts)
low_freq_ids = [id_ for id_, freq in dct.dfs.items() if freq < 3]
high_freq_ids = [id_ for id_, freq in dct.dfs.items() if freq > 10000]
freq_ids = low_freq_ids + high_freq_ids
dct.filter_tokens(freq_ids)
dct.compactify()
corpus = [dct.doc2bow(s) for s in contexts]

from gensim import models

# tf-idf transformation of the bow corpus.
tfidf_model = models.TfidfModel(corpus)
corpus_mm = tfidf_model[corpus]

from gensim import similarities

# Dense cosine-similarity index over the tf-idf vectors.
index = similarities.MatrixSimilarity(corpus_mm, num_features=len(dct))


def text2vec(text):
    # Convert a token list to its tf-idf vector in the shared vector space.
    bow = dct.doc2bow(text)
    return tfidf_model[bow]


# Query: segment the input sentence with jieba, then rank all contexts by
# similarity in descending order.
input_text = '花呗透支了为什么不可以继续用了'
my_text = list(jieba.cut(input_text))
vec = text2vec(my_text)
sims = index[vec]
sim_sort = sorted(list(enumerate(sims)), key=lambda item: item[1], reverse=True)
def computeSimilarityMatrix(self, corpus, numFeatures, num_best=7):
    """Project *corpus* through ``self.model`` and cache a MatrixSimilarity
    index on the instance.

    corpus      -- bow/streamed corpus to index
    numFeatures -- dimensionality passed to the index
    num_best    -- how many top matches each query should return
    """
    projected = self.model[corpus]
    index = similarities.MatrixSimilarity(projected,
                                          num_features=numFeatures,
                                          num_best=num_best)
    self.similarityMatrix = index
# NOTE: fragment -- the matching `if` branch precedes this chunk.
else:
    print("Please run first tutorial to generate data set")

# Pipeline: bow -> tf-idf -> 10-topic LSI, then round-trip the model to disk.
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)
corpus_lsi = lsi[corpus_tfidf]
lsi.save('lyrics.lsi')
lsi = models.LsiModel.load('lyrics.lsi')

##################
# Reload dictionary/corpus and retrain LSI directly on the bow corpus.
dictionary = corpora.Dictionary.load('lyrics.dict')
corpus = corpora.MmCorpus('lyrics.mm')
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=10)

# Query lyric (pre-segmented, space-separated tokens).
doc = "望著 滿天星斗 的 塗鴉 好像 看見 自己 童年 的 模樣 總是 說 著 淘氣 浪漫 的 願望 夢想 能夠 飛往 燦爛 的 天堂 而 那天 真的 心願 正 溫柔 地 對 我 說 當你 陷入 絕望 中請 記得 我 用 美麗 的 幻想 讓 真心 永遠 純真 而 不變 當你 寂寞 的 時候 請 想念 我 用 單純 的 信仰 給 自己 溫暖 的 回答 閉上 雙眼 靜靜地 徜徉 彷彿 穿越時空 回到 了 過往 以為 銀河 就 在 不遠 的 前方 星星 月亮 都 在 我 面前 玩耍 而 那 微小 的 喜悅 正 溫柔 地 對 我 說 當你 陷入 絕望 中請 記得 我 用 美麗 的 幻想 讓 真心 永遠 純真 而 不變 當你 寂寞 的 時候 請 想念 我 用 單純 的 信仰 給 自己 溫暖 的 回答 ( 和 童 年時 無邪 的 希望 ) 親愛 的 我 親愛 的 我 願 你 永遠 像 我 一樣 帶著 勇氣 和 倔強 歲月 改變 你 的 模樣 無法 改變 你 的 去向"
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow]

# Index the LSI-projected corpus, save it, and rank all songs vs. the query.
index = similarities.MatrixSimilarity(lsi[corpus], num_features=500)
index.save('lyrics.index')
sims = index[vec_lsi]
sims = sorted(enumerate(sims), key=lambda item: -item[1])
print(sims[:10])
def get_similarity(self, lda, query_vector):
    """Return the similarity scores of *query_vector* against every document
    of ``self.corpus`` projected through *lda*.
    """
    transformed_corpus = lda[self.corpus]
    return similarities.MatrixSimilarity(transformed_corpus)[query_vector]
        word for word in document if '/w' not in word and '/y' not in word
        and '/u' not in word and '/c' not in word and '/k' not in word
    ] for document in documents]
# NOTE: fragment -- the opening of the comprehension above precedes this chunk.

# Count token occurrences across all pages and drop tokens seen only once.
times = defaultdict(int)
for page in ptexts:
    for word in page:
        times[word] += 1
ptexts = [[word for word in text if times[word] > 1] for text in ptexts]

# bow -> tf-idf, then a pairwise cosine-similarity matrix over the corpus.
dictionary = corpora.Dictionary(ptexts)
corpus = [dictionary.doc2bow(text) for text in ptexts]
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
result = similarities.MatrixSimilarity(corpus_tfidf)

# Full date-by-date similarity matrix, written out as CSV.
data = pd.DataFrame(result[corpus_tfidf], index=date_indexs, columns=date_indexs)
data.to_csv("text_result.csv")

# For the first 100 documents, rank all dates by similarity (descending).
output = open("text_result_100.csv", "w")
for i in range(0, 100):
    tmp = sorted(enumerate(result[corpus_tfidf[i]]),
                 key=lambda x: x[1],
                 reverse=True)
    result100 = []
    for j, m in tmp:
        result100.append([date_indexs[j], m])
def main():
    """Train tf-idf + 40-topic LSI on the 0814 corpus and print the articles
    most similar to a segmented test document."""
    logging.basicConfig(format='[%(levelname)s] : %(message)s', level=logging.INFO)
    if (os.path.exists("output/0814.dict")):
        dictionary = corpora.Dictionary.load("output/0814.dict")
        corpus = corpora.MmCorpus("output/0814.mm")
        logging.info("Load model success")
    else:
        # NOTE(review): execution continues after this message, so the code
        # below will fail with NameError when the files are missing -- confirm
        # whether an early return was intended.
        logging.info("Please run the train2.py to create dict & data flow")
    # Create tf-idf model
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    # Transfer to LSI model (latent semantic indexing)
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=40)
    corpus_lsi = lsi[corpus_tfidf]
    lsi.save('output/0814.lsi')
    corpora.MmCorpus.serialize('output/0814_lsi.mm', corpus_lsi)
    """
    print("LSI topics:")
    results = lsi.print_topics(5)
    for result in results:
        print(result)
    """
    # test_data = ''
    # with open('input/test.txt', 'r', encoding='utf-8') as f:
    #     for line in f:
    #         words = jieba.cut(line)
    #         test_data += ' '.join(words)
    # # print(test_data.split())

    # Load and segment the test article, then project it into LSI space.
    test_data = []
    init_stopword()
    test_data = getTestData('input/test.txt')
    test_data_seg = getSingleSegment(test_data)
    vec_bow = dictionary.doc2bow(test_data_seg)
    vec_lsi = lsi[vec_bow]
    print("\nAriticle:\n%s" % test_data)
    # Create index
    index = similarities.MatrixSimilarity(lsi[corpus])
    index.save("output/0814.index")
    # Similarity: rank every corpus document against the test article.
    sims = index[vec_lsi]
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    print("result:")
    print(sims[:5])
    # Print results: show the three most similar articles.
    articles = getArticle()
    for idx in sims[:3]:
        print("\nSimilar Ariticle:\n", articles[idx[0]])
        print("\nSimilarity:", idx[1])
def visual():
    '''
    get data depending on algo
    create a similarity matrix
    feed it to a templete for visulization
    '''
    # Pull the algorithm, granularity level and documents out of the form.
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    ALGORITHM = request.form['algorithm']
    LEVEL = request.form['level']
    DOC_COUNT = int(request.form['num-of-docs'])
    DOCUMENTS = []
    for i in range(DOC_COUNT):
        DOCUMENTS.append(request.form['document' + str(i + 1)])
    raw_sentences = []
    if LEVEL == "sentence":
        for each in DOCUMENTS:
            # raw sentences will be each document splited into sentences
            raw_sentences += tokenizer.tokenize(each.decode('utf8').strip())
    else:
        raw_sentences = DOCUMENTS  # raw sentence will be the whole doc itself.
    matrix = []
    if ALGORITHM == "TF-IDF":
        # Need to write functions for each. Wrote for TF-IDF.
        tfidf = TfidfVectorizer().fit_transform(raw_sentences)
        matrix = (tfidf * tfidf.T).A
    # For each algo the idea is to form a matrix of similarities.
    #---------
    #Algo 2: Latent Semantic Indexing
    if ALGORITHM == "LSI":
        #added by sneha git:coder477 .
        texts = []
        matrix = np.zeros(shape=(len(raw_sentences), len(raw_sentences)))
        for each in raw_sentences:
            texts.append(document_to_wordlist(each))
        dictionary = corpora.Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]
        lsii = models.LsiModel(corpus)
        matrix = np.zeros(shape=(len(raw_sentences), len(raw_sentences)))
        for i in range(len(raw_sentences)):
            vec = corpus[i]
            doc = raw_sentences[i]
            vec_bow = dictionary.doc2bow(doc.lower().split())
            vec_lsi = lsii[vec_bow]  # convert the query to LSI space
            # NOTE(review): the MatrixSimilarity index is rebuilt on every
            # iteration over an unchanged corpus -- it could be hoisted out
            # of the loop.
            index = similarities.MatrixSimilarity(lsii[corpus])
            sims = index[
                vec_lsi]  # perform a similarity query against the corpus
            cosine = list(enumerate(sims))
            for j in range(len(raw_sentences)):
                matrix[i][j] = cosine[j][1]
    #---------
    #Algo 3
    if ALGORITHM == "WORDNET":
        print("here---------------------------")
        # Pairwise similarity of noun-phrase lexical semantic vectors.
        matrix = []
        for each in range(len(raw_sentences)):
            li = []
            for each1 in range(len(raw_sentences)):
                li.append(0)
            matrix.append(li)
        for i in range(0, len(raw_sentences)):
            for j in range(0, len(raw_sentences)):
                input1 = raw_sentences[i].encode('ascii', 'ignore')
                input2 = raw_sentences[j].encode('ascii', 'ignore')
                input1_nps = nps(input1)
                input2_nps = nps(input2)
                common_nps = common1(input1_nps, input2_nps)
                lsv_input1 = lsv(common_nps, input1_nps)
                lsv_input2 = lsv(common_nps, input2_nps)
                matrix[i][j] = cosine_similarity(lsv_input1, lsv_input2)
    #---------
    #Algo 4
    #Got pretrained vectors from GIT.
    #added by sneha git:coder477 .
    if ALGORITHM == "WORD2VEC":
        word_vector = load_word2vec('static\\vectors')
        # Sentence vector = sum of its word vectors; similarity is cosine.
        matrix = []
        for each in range(len(raw_sentences)):
            li = []
            for each1 in range(len(raw_sentences)):
                li.append(0)
            matrix.append(li)
        for i in range(0, len(raw_sentences)):
            for j in range(0, len(raw_sentences)):
                sen1 = raw_sentences[i]
                sen2 = raw_sentences[j]
                sen1_words = document_to_wordlist(sen1)
                sen2_words = document_to_wordlist(sen2)
                sen1_vectors = []
                for each in sen1_words:
                    if each in word_vector:
                        sen1_vectors.append(word_vector[each])
                sen1_vector = np.array(sen1_vectors).sum(axis=0)
                sen2_vectors = []
                for each in sen2_words:
                    if each in word_vector:
                        sen2_vectors.append(word_vector[each])
                sen2_vector = np.array(sen2_vectors).sum(axis=0)
                matrix[i][j] = cosine_similarity(sen1_vector, sen2_vector)[0][0]
    #---------
    #Forming nodes and links for graph.
    #code might as well be same for all algos.
    #Refine note : Think of creating private funcs and moving code.
    force = {}
    force["nodes"] = []
    force["links"] = []
    for each in raw_sentences:
        temp = {}
        temp["name"] = each
        temp["length"] = len(document_to_wordlist(each))
        force["nodes"].append(temp)
    # One link per unordered sentence pair, weighted by the similarity matrix.
    for ((i, _), (j, _)) in itertools.combinations(enumerate(raw_sentences), 2):
        temp = {}
        temp["source"] = i
        temp["target"] = j
        temp["value"] = matrix[i][j]
        force["links"].append(temp)
    graph = json.dumps(force)
    # Word cloud: word frequency over all sentences, scaled for display.
    wordlist = []
    for each in raw_sentences:
        wordlist += document_to_wordlist(each)
    c = Counter(wordlist)
    wordcloud = []
    for each in c:
        temp = {}
        temp["text"] = each
        temp["size"] = c[each] * 20
        wordcloud.append(temp)
    wordcloud = json.dumps(wordcloud)
    return render_template('visual.html',
                           graph=graph,
                           sentences=raw_sentences,
                           wordcloud=wordcloud)
def LDA():
    """Train LSI/LDA over segmented text, dump each topic's top words, and
    merge near-duplicate input lines (LSI similarity > 0.99) into one line."""
    train = []  # training data
    fp = codecs.open(r'F:\github\WBFL\uploadpath\output\output.txt', 'r', encoding='utf-8')
    for line in fp:
        line = line.split()
        train.append([w for w in line])
    dictionary = corpora.Dictionary(train)  # build the dictionary
    corpus = [dictionary.doc2bow(text) for text in train]  # sparse vector for each text
    tfidf = models.TfidfModel(corpus)  # tf-idf statistics
    corpus_tfidf = tfidf[corpus]
    # Feed the tf-idf vectors into an LSI model; num_topics is the number of
    # topics (column count of the SVD factorization); id2word is the dictionary.
    lsi = models.LsiModel(corpus_tfidf, num_topics=50, id2word=dictionary)
    topic_result = [a for a in lsi[corpus_tfidf]]  # index lsi by the tf-idf vectors
    print(lsi)  # print the LSI model / topic_result
    # print(lsi.print_topics(num_topics=50, num_words=5))  # print 5 topics with 5 keywords each; the leading coefficients are weights, not probabilities
    similarity = similarities.MatrixSimilarity(
        lsi[corpus_tfidf])  # document-to-document similarity from LSI
    # print(list(similarity))
    # alpha and eta are the LDA priors; topics with probability below
    # minimum_probability (e.g. 0.001) are discarded.
    lda = models.LdaModel(corpus_tfidf,
                          num_topics=50,
                          id2word=dictionary,
                          alpha='auto',
                          eta='auto',
                          minimum_probability=0.001)
    # for doc_topic in lda.get_document_topics(corpus_tfidf):  # per-document topic distribution
    #     print(doc_topic)
    with open(r'F:\github\WBFL\uploadpath\output\wordlistOutput.txt', 'w', encoding='utf-8') as f1:
        for topic_id in range(50):
            print('Topic', topic_id)
            # print(lda.get_topic_terms(topicid=topic_id))  # word distribution of an LDA topic (10 words by default)
            print(lda.show_topic(topicid=topic_id))
            word_list = lda.show_topic(topicid=topic_id)
            for i in range(10):
                f1.write(word_list[i][0] + '\n')
    # Boolean matrix marking near-duplicate document pairs.
    a = np.array(list(similarity))
    result_index = a > 0.99000000
    # print(result_index)
    inputs = open(r'F:\github\WBFL\uploadpath\input\input.txt', 'r', encoding='utf-8')
    text_list = inputs.readlines()
    # print(text_list)
    count = len(text_list)
    # Walk the lines from last to first, folding every later near-duplicate
    # into the earlier line (escaping its newline) and blanking the duplicate.
    for line in range(count):
        nowline = count - 1 - line
        num = 0
        for item in result_index[nowline][nowline:count - 1]:
            if item and num != 0:
                text_list[nowline] = text_list[nowline + num].replace(
                    '\n', '\\n') + text_list[nowline]
                text_list[nowline + num] = ''
            num += 1
    # print(text_list)
    with open(r'F:\github\WBFL\uploadpath\output\finaloutput.txt', 'w', encoding='utf-8') as f:
        for text in text_list:
            f.write(text)
# Run the same train/save/load/query round-trip for both LSI and LDA.
for choice in (models.LsiModel, models.LdaModel):
    model = choice(corpus_tfidf, id2word=dictionary, num_topics=2)
    corpus_mod = model[
        corpus_tfidf]  # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
    fname = '/tmp/%s' % repr(choice)
    model.save(fname)  # same for tfidf, lda, ...
    model = choice.load(fname)

    # topic
    model.print_topics(2)
    for doc in corpus_mod:  # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly
        print doc

    # test example
    doc = "Human computer interaction"
    vec_bow = dictionary.doc2bow(doc.lower().split())
    vec_mod = model[vec_bow]  # convert the query to MODEL space
    print vec_mod

    # Initializing query structures
    index = similarities.MatrixSimilarity(
        model[corpus])  # transform corpus to MODEL space and index it
    index.save('/tmp/deerwester.index')
    index = similarities.MatrixSimilarity.load('/tmp/deerwester.index')

    # Performing queries
    sims = index[vec_mod]  # perform a similarity query against the corpus
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    print sims  # print sorted (document number, similarity score) 2-tuples
content.close() # textdict里面是词典 # textcorpus就是分开的一篇篇文章,也叫做语料库 textdict = corpora.Dictionary(corpus) # 词到数字{"数据挖掘":0,"篮球":1,} # 通过词典,将语料中的文字转化为数字(编号) # 最终形如[[(0,1),(1,4),(2,2)],[(1,3),(2,2)],[(3,1),(4,1),(5,1)]] # 小括号第一个是词的编号,第二个是个数 textcorpus = [textdict.doc2bow(i) for i in corpus] # model = models.ldamodel.LdaModel( # textcorpus,num_topics=3,id2word = textdict) # topics = [model[c] for c in textcorpus] # print topics # for i in range(3): # print model.print_topic(i) # tfidf模型 tfidf = models.TfidfModel(textcorpus) corpus_tfidf = tfidf[textcorpus] # lsa模型 lsi = models.LsiModel(corpus_tfidf, id2word=textdict, num_topics=2) corpus_lsi = lsi[textcorpus] # print lsi.print_topics(2) print corpus_lsi index = similarities.MatrixSimilarity(lsi[textcorpus]) sims = index[corpus_lsi] print list(enumerate(sims)) sims = sorted(enumerate(sims), key=lambda item: -item[1]) print sims
def computeSimilarity_lsm(X, query):
    """Return the similarity score of *query* against each document in *X*.

    X     -- corpus of vectors to index (gensim-style)
    query -- query vector projected into the same space
    Returns a list of per-document similarity scores, in corpus order.
    """
    index = similarities.MatrixSimilarity(X)
    sims = index[query]
    # `sims` is already the per-document score vector in corpus order; the old
    # enumerate-then-reindex round trip produced exactly the same list.
    return list(sims)
def main():
    """Build a corpus from MongoDB travel articles, train tf-idf + 50-topic
    LDA, group articles by dominant topic and interactively label them."""
    board = 'Japan_Travel'
    conn = MongoClient('localhost', 27017)
    db = conn['bdhackthon']
    collection = db[board]
    d_start = datetime.datetime(2016, 1, 1, 0)
    d_end = datetime.datetime(2016, 3, 1, 0)
    t_start = time.time()
    # bulil corpus
    # Reuse cached features when available so re-runs skip re-segmentation.
    if os.path.exists('corpus_data.json'):
        corpus_data = load_json('corpus_data.json')
    else:
        corpus_data = {}
    corpus = []
    # Food/travel/lodging articles about Tokyo within the date window.
    articles = collection.find(
        {
            "$or": [{
                "article_title": {
                    "$regex": "\[[遊食]記\].*(東京)+.*"
                },
                "date": {
                    "$gt": d_start,
                    "$lt": d_end
                }
            }, {
                "article_title": {
                    "$regex": "\[住宿\].*(東京)+.*"
                },
                "date": {
                    "$gt": d_start,
                    "$lt": d_end
                }
            }]
        },
        no_cursor_timeout=True).batch_size(20)
    print('Total:', articles.count())
    index_aid = {}  # map index of corpus to article_id
    i = 0
    tmp_data = {}
    for article in articles:
        #if i==80:
        #    break
        tmp_data[article['article_id']] = (article['article_title'],
                                           article['content'])
        index_aid[str(i)] = article['article_id']
        print(i)
        #print(article, article['article_title'])
        print(article['article_title'])
        #print(article['content'])
        #print(article)
        if article['article_id'] in corpus_data.keys():
            # Cached article: reuse its feature list, just refresh the index.
            corpus.append(corpus_data[article['article_id']]['feature'])
            corpus_data[article['article_id']]['index'] = i
            i = i + 1
            continue
        else:
            # New article: segment title + content into the feature list.
            doc = []
            doc += splitWord(article['article_title'])
            doc += splitWord(article['content'])
            corpus_data[article['article_id']] = {
                'feature': doc,
                'topic': [],
                'index': i
            }
            corpus.append(doc)
            i = i + 1
    #input()
    t_end = time.time()
    write_json(corpus_data, 'corpus_data.json')
    print('time elapsed for building corpus: %f minutes' %
          ((t_end - t_start) / 60.0))
    dictionary = corpora.Dictionary(corpus)
    stoplist = [
        line.lower().split()[0] for line in open('stop_words.txt', 'r')
    ]
    # remove stop words and words that appear only once
    stop_ids = [
        dictionary.token2id[stopword] for stopword in stoplist
        if stopword in dictionary.token2id
    ]
    once_ids = [
        tokenid for tokenid, docfreq in dictionary.dfs.items()
        if docfreq == 1
    ]
    #once_ids = []
    dictionary.filter_tokens(
        stop_ids + once_ids)  # remove stop words and words that appear only once
    dictionary.compactify(
    )  # remove gaps in id sequence after words that were removed
    #print(dictionary)
    #print(dictionary.dfs)
    #pprint(dictionary.token2id)
    dictionary.save('train.dict')  # store the dictionary, for future reference
    corpus_bow = [dictionary.doc2bow(doc) for doc in corpus]
    corpora.MmCorpus.serialize('train.mm', corpus_bow)  # store to disk, for later use
    tfidf = models.TfidfModel(corpus_bow)  # initialize (train) a model
    tfidf.save('train.tfidf')
    corpus_tfidf = tfidf[corpus_bow]
    lda = models.ldamodel.LdaModel(corpus=corpus_tfidf,
                                   id2word=dictionary,
                                   alpha='auto',
                                   num_topics=50)
    #print(lda.print_topics(50))
    lda.save('train.lda')
    corpus_lda = lda[corpus_tfidf]
    index = similarities.MatrixSimilarity(
        corpus_lda)  # transform corpus to LDA space and index it
    index.save('train.index')
    # Group documents by their strongest topic (largest |weight|).
    topic = {}
    for i in range(len(corpus_lda)):
        #print(corpus_lda[i])
        #print(corpus[i])
        key = max(corpus_lda[i], key=lambda x: abs(x[1]))[0]
        if key in topic.keys():
            topic[key].append(i)
        else:
            topic[key] = [i]
    #input()
    vec_topic = {}
    print('%d topics identified. Classify them:' % len(topic))
    # Carry over manual labels from the previous model run; prompt the user
    # only for articles that were never labeled before.
    old_corpus_data = load_json('old_model/corpus_data.json')
    for k, v in topic.items():
        print('Group %s (%d):' % (k, len(v)))
        for c_index in v:
            a_id = index_aid[str(c_index)]
            #if a_id in corpus_data.keys():
            if a_id in old_corpus_data.keys():
                #print(corpus_data[a_id]['topic'])
                if not old_corpus_data[a_id]['topic']:
                    #print(corpus_data[a_id]['feature'])
                    print(tmp_data[a_id])
                    line = input('Enter topics, separate by space: ')
                    corpus_data[a_id]['topic'] = line.split(' ')
                else:
                    corpus_data[a_id]['topic'] = old_corpus_data[a_id]['topic']
            else:
                raise ValueError('Empty article_id')
    write_json(corpus_data, 'corpus_data_labeled.json')
def generateModel(fileName='RawData20160307.json'):
    """Train and persist the app-description similarity pipeline.

    Reads one JSON record per line from *fileName* (each record has
    "AppName" and "Description" fields), tokenizes AppName + Description
    with ``simpleTokenize``/``getStopWords``, and saves under ./data:
    a gensim dictionary, the bag-of-words corpus, a TF-IDF model, an LSI
    model (300 topics) and a MatrixSimilarity index.  Existing artifact
    files are overwritten.

    Changes vs. original: Python 2 ``print``/``iteritems`` modernized to
    Python 3 (the rest of this file is Python 3), each line is parsed with
    ``json.loads`` once instead of twice, the input file is closed via a
    context manager, and the repeated remove-then-save dance is factored
    into a helper.
    """
    stopWords = getStopWords()
    print("prepocessing the RawData...", end='')
    texts = []
    with open(fileName, 'r') as RawData:
        for line in RawData:
            record = json.loads(line)  # parse once (original parsed twice per line)
            texts.append(
                simpleTokenize(record['AppName'] + ' ' + record['Description'],
                               stopWords))
    print("prepocessing the RawData Done!")

    print("generating a dictionary...", end='')
    dictionary = corpora.Dictionary(texts)
    # Drop hapax legomena: tokens appearing in exactly one document.
    once_ids = [
        tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1
    ]
    dictionary.filter_tokens(once_ids)
    dictionary.compactify()
    print("generating a dictionary Done!")
    print(dictionary)

    # All artifacts live in ./data; create the directory on first run.
    datapath = os.path.join(os.getcwd(), 'data')
    if not os.path.isdir(datapath):
        os.mkdir(datapath)

    def _fresh_path(name):
        """Return the artifact path under ./data, removing any stale copy."""
        path = os.path.join(datapath, name)
        if os.path.isfile(path):
            os.remove(path)
        return path

    dictionary.save(_fresh_path('appdesc.dict'))

    print("generating a Coupus...", end='')
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize(_fresh_path('appdesc.mm'), corpus,
                               progress_cnt=10000)
    print("Done!")

    # Creating a transformation: train the TF-IDF model.
    print("training the tfidf model...", end='')
    tfidf = models.TfidfModel(corpus)
    tfidf.save(_fresh_path('model.tfidf_model'))
    corpus_tfidf = tfidf[corpus]
    print("Done!")

    # Map TF-IDF vectors into a 300-dimensional LSI space.
    print("Mapping from tfidf to lsi...", end='')
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=300)
    corpus_lsi = lsi[corpus_tfidf]
    lsi.save(_fresh_path('model.lsi'))
    print("Done!")

    # Transform corpus to LSI space and index it for similarity queries.
    print("Generating the index...", end='')
    index = similarities.MatrixSimilarity(corpus_lsi)
    index.save(_fresh_path('appdesc.index'))
    print("done!")
words = ' '.join(jieba.cut(line)).split(' ') texts.append(words) frequency = defaultdict(int) # 构建一个字典对象 for text in texts: for word in text: frequency[word] += 1 texts = [[word for word in text if frequency[word] > 1] for text in texts] dictionary = corpora.Dictionary(texts) corpus = [dictionary.doc2bow(text) for text in texts] tfidf = models.TfidfModel(corpus) corpus_tfidf = tfidf[corpus] index = similarities.MatrixSimilarity(corpus_tfidf) def get_similar(token): words = [] token = ' '.join(jieba.cut(token)).split(' ') for word in token: words.append(word.lower()) print(words) new_vec = dictionary.doc2bow(words) new_vec_tfidf = tfidf[new_vec] # 将待比较文档转换为tfidf表示方法 sims = index[new_vec_tfidf] sims_list = sims.tolist() if max(sims_list) < 0.5: return 'NO DATA' else:
# -- tail of the duplicate-page detection loop; the enclosing loop header is
#    truncated in this chunk.  A minimal `try:` is reconstructed here because
#    the original `except IndexError` requires one. --
try:
    res = get_cosine(A, B)
    # Pages whose cosine similarity exceeds 0.95 are treated as duplicates.
    if (res > 0.95):
        UrlList.remove(UrlList[i])
except IndexError:
    pass

#######################################
######### APPLICATION TESTS ###########
#######################################

fileList = getFileList()
deleteSimilarPage(fileList)

# Determine the page closest (most semantically relevant) to a given query.
from gensim import corpora, similarities

# corpus is your text, tokenized
dictionary = corpora.Dictionary(corpus)

# Transform the corpus into vectors.
# Bag of words (BOW) is an algorithm like word2vec, to transform words into vectors.
vectors_corpus = [dictionary.doc2bow(text) for text in corpus]

# Build your similarity matrix
matrix = similarities.MatrixSimilarity(vectors_corpus)

# Query is your search query
query = "Does it work"
# BUG FIX: the original used `query.lower.split()` — missing parentheses on
# `lower`, which would raise AttributeError at runtime (a bound method has
# no `split`).
vector_query = dictionary.doc2bow(query.lower().split())
similarity = matrix[vector_query]

# Now we see which document is closer to the search query
print(list(enumerate(similarity)))
# Build a dictionary / bag-of-words corpus from `texts` (tokenized upstream,
# outside this chunk), then compare TF-IDF, LSI and LDA representations.
dictionary = corpora.Dictionary(texts)
V = len(dictionary)  # vocabulary size
corpus = [dictionary.doc2bow(text) for text in texts]
# Train TF-IDF and immediately apply it to the whole corpus.
corpus_tfidf = models.TfidfModel(corpus)[corpus]
print('TF-IDF:')
for c in corpus_tfidf:
    print(c)
print('\nLSI Model:')
# Two-topic latent semantic indexing over the TF-IDF vectors.
lsi = models.LsiModel(corpus_tfidf, num_topics=2, id2word=dictionary)
topic_result = [a for a in lsi[corpus_tfidf]]
pprint(topic_result)
print('LSI Topics:')
pprint(lsi.print_topics(num_topics=2, num_words=5))
# Pairwise document similarities in LSI space (dense in-memory index;
# similarities.Similarity() is the disk-backed alternative).
similarity = similarities.MatrixSimilarity(
    lsi[corpus_tfidf])  # similarities.Similarity()
print('Similarity:')
pprint(list(similarity))
print('\nLDA Model:')
num_topics = 2
lda = models.LdaModel(corpus_tfidf,
                      num_topics=num_topics,
                      id2word=dictionary,
                      alpha='auto',
                      eta='auto',
                      minimum_probability=0.001)
doc_topic = [doc_t for doc_t in lda[corpus_tfidf]]
print('Document-Topic:\n')
pprint(doc_topic)
# NOTE(review): the body of this loop is truncated at the chunk boundary.
for doc_topic in lda.get_document_topics(corpus_tfidf):
corpus_tfidf = corpora.MmCorpus('../Save/scopus_corpus.mm') #index = similarities.MatrixSimilarity.load('../Save/scopus_research.index') ''' Similarities between pairs of documents ''' similarities_between_pairs = True if similarities_between_pairs == True: query = "The challenge of a purposeful design addressed in this article is to align offshore energy systems not only with technical and economic values like efficiency and profitability but also with moral and social values more generally We elaborate a theoretical framework that allows us to make a systematic inventory of embedded values of offshore energy systems and relate them to their societal acceptability By characterizing both objects and subjects of acceptability we shed light on ways to identify areas of value conflicts that must be addressed in purposeful design We suggest the capabilities approach as a normative theory to deal with the arising value conflicts" split_lower_query = query.lower().split() stopped_query = [f for f in split_lower_query if not f in en_stop] stemmed_query = [p_stemmer.stem(h) for h in stopped_query] vec_bow = dictionary.doc2bow(stemmed_query) vec_lda = lda[vec_bow] index = similarities.MatrixSimilarity( lda[corpus_tfidf] ) # only possible if the total memory required is lower than the RAM. In any other case, you should use similarities. Can also add: ,num_features=len(dictionary) index.save('../Save/scopus_research.index') sims = index[vec_lda] sims = sorted(enumerate(sims), key=lambda item: -item[1]) pprint(sims) ''' Get topics ''' num_topics = 100 num_words = 5 #pprint(lda.get_topic_terms(0)) # terms for one topic #pprint(lda.show_topics(num_topics, num_words)) #pprint(lda.print_topics())
# gensim tutorial-style demo: LSI transformation and a similarity query over
# the "deerwester" toy corpus (corpus_lsi / lsi come from upstream code).
print("---for doc in corpus_lsi")
for doc in corpus_lsi:  # bow->tfidf and tfidf->lsi transforms execute lazily here
    print(doc)
lsi.save('/tmp/model.lsi')  # same for tfidf, lda, ...
lsi = models.LsiModel.load('/tmp/model.lsi')
dictionary = corpora.Dictionary.load('/tmp/deerwester.dict')
corpus = corpora.MmCorpus('/tmp/deerwester.mm')
print("---corpus")
print(corpus)
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
doc = "Human computer interaction"
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow]  # convert the query to LSI space
print("---vec_lsi")
print(vec_lsi)
# Transform the corpus to LSI space and index it.
index = similarities.MatrixSimilarity(lsi[corpus])
# Round-trip the index through disk (save, then load it back).
index.save('/tmp/deerwester.index')
index = similarities.MatrixSimilarity.load('/tmp/deerwester.index')
sims = index[vec_lsi]  # perform a similarity query against the corpus
print("list(enumerate(sims))"
      )  # print (document_number, document_similarity) 2-tuples
print(list(enumerate(
    sims)))  # print (document_number, document_similarity) 2-tuples
# Sort by descending similarity.
sims = sorted(enumerate(sims), key=lambda item: -item[1])
print("---sims")
print(sims)
def main(transcript_dir, testfile, xls_pathname):
    """Rank transcripts by similarity to a query document and intersect tags.

    Builds a 10-topic model over the transcripts under *transcript_dir*,
    queries it with the text in *testfile*, prints transcripts ranked by
    similarity, then looks up per-case tags in the worksheet at
    *xls_pathname* and prints the intersection of the top matches' tag sets.

    NOTE(review): `reload(sys)` / `sys.setdefaultencoding` and
    `str.decode(...)` are Python 2 constructs; this function will not run
    unmodified on Python 3.
    """
    reload(sys)
    sys.setdefaultencoding('utf8')
    print('\ntranscript_dir:', transcript_dir)
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.WARNING)
    # doc_index entries appear to be (seq, case_id, filename) tuples —
    # presumed from the unpacking below; confirm against create_doc_index.
    documents, doc_index = create_doc_index(transcript_dir)
    # Remove common words and tokenize
    stoplist = set('for a of the and to in'.split())
    texts = [[word for word in document.lower().split()
              if word not in stoplist] for document in documents]
    # Remove words that appear only once
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    texts = [[token for token in text if frequency[token] > 1]
             for text in texts]
    print("\nCreate and save dictionary:")
    dictionary = corpora.Dictionary(texts)
    dictionary.save('./tmp/TopicClassifier.dict')  # store the dictionary, for future reference
    print('\nOpen testfile:', testfile)
    tf = open(testfile, 'r')
    new_doc = (str.decode(tf.read(), "UTF-8", "ignore"))  # Python 2 style decode
    tf.close()
    print("\nQuery document:\n")
    print(new_doc)
    print("\nVector representation of query document:\n")
    new_vec = dictionary.doc2bow(new_doc.lower().split())
    print(new_vec)
    print("\nCreate and save corpus:")
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('./tmp/TopicClassifier.mm', corpus)
    print("\nCorpus:")
    print(corpus)
    print("\nBuild LSI Model:")
    # NOTE(review): despite the message above and the variable name, this
    # trains an LDA model, not LSI.
    lsi = models.LdaModel(corpus, id2word=dictionary, num_topics=10)
    # Transform testfile document (question) to bag of words
    vec_bow = new_vec
    vec_lsi = lsi[vec_bow]  # convert the query to topic space
    print(vec_lsi)
    # Transform corpus to topic space and index it
    index = similarities.MatrixSimilarity(lsi[corpus])
    print("Creating sims...")
    sims = index[vec_lsi]  # perform a similarity query against the corpus
    print(list(enumerate(sims)))  # print (document_number, document_similarity) 2-tuples
    print("Sorting sims...")
    # Sort by descending similarity score.
    sims_sorted = sorted(enumerate(sims), key=lambda item: -item[1])
    print(sims_sorted)  # print sorted (document number, similarity score) 2-tuples
    print("Doc Index...")
    print(doc_index)
    print("Combined Results...")
    print("Len(sims): {} Len(Index): {}".format(len(sims), len(doc_index)))
    # Pair each doc_index entry with its (unsorted) similarity score as
    # (case_id, filename, score), then sort by descending score.
    combined = []
    for item in zip(doc_index, sims):
        combined.append((item[0][1], item[0][2], item[1]))
    combined_sorted = sorted(combined, key=lambda x: -x[-1])
    for item in combined_sorted:
        print(item)
    tw = TagWorksheet(xls_pathname)
    s = []
    # Collect the tag sets of the top four matches (slice [0:4]) and print
    # their intersection.
    for item in combined_sorted[0:4]:
        tl = tw.get_tags(int(item[0]))
        print(item, '\t', tl)
        s.append(set(tl))
    print("\ntag sets:", s)
    y = set.intersection(*s)
    print("\ntag set intersection:", y)
# Notebook-style snippet: project every document into a 2-topic LSI space
# and densify the projections into a matrix (documents as rows after .T).
np.array(corpus).shape  # notebook inspection; result is discarded
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
p = []
for i in range(0, len(documents)):
    doc1 = documents[i]
    vec_bow2 = dictionary.doc2bow(doc1.lower().split())
    vec_lsi2 = lsi[vec_bow2]  # convert the query to LSI space
    p.append(vec_lsi2)
p  # notebook echo; discarded outside a notebook
# transform corpus to LSI space and index it
index = similarities.MatrixSimilarity(lsi[corpus])
# Round-trip the index through disk.
index.save('/tmp/deerwester4.index')
index = similarities.MatrixSimilarity.load('/tmp/deerwester4.index')
#################
import gensim
import numpy as np
import matplotlib.colors as colors
import matplotlib.cm as cmx
import matplotlib as mpl
# Densify the sparse LSI vectors into a (num_terms x num_docs) array.
# NOTE(review): num_terms=4 while the model has num_topics=2 — the extra
# rows will be zero-padded; confirm whether 2 was intended.
matrix1 = gensim.matutils.corpus2dense(p, num_terms=4)
matrix3 = matrix1.T  # one row per document
matrix3  # notebook echo
def NMF(request):
    """Django view: search 1033 documents with a gensim NMF topic model.

    On GET (or an invalid POST), renders an empty/bound search form.  On a
    valid POST it saves the submitted query, ranks all documents in IR/ by
    similarity to the query in 43-topic NMF space, and renders the top five
    file numbers plus their contents.

    Changes vs. original: file handles are opened with ``with`` so they are
    closed even if ``read()`` raises (the original leaked handles on error),
    and the unused ``temp = None`` was removed.  The view's interface and
    rendered context are unchanged.

    NOTE(review): the corpus and NMF model are rebuilt from 1033 files on
    every request — consider caching at module level if this view is hot.
    """
    query = ""
    query_response = None
    file_list = None
    file_list_dictionary = None
    search_result_dictionary = None

    # Load the corpus: IR/1.txt .. IR/1033.txt.
    documents = []
    for counter in range(1033):
        with open("IR/" + str(counter + 1) + ".txt", 'r') as doc_file:
            documents.append(doc_file.read())

    # Tokenize, drop English stop words, and train the NMF topic model.
    stop_words = stopwords.words('english')
    texts = [[
        word for word in document.lower().split() if word not in stop_words
    ] for document in documents]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('/tmp/ir.mm', corpus)
    nmfmodel = nmf.Nmf(corpus,
                       num_topics=43,
                       id2word=dictionary,
                       normalize=True)

    if request.method == "POST":
        form = SearchForm(request.POST)
        if form.is_valid():
            query_response = list()
            user_query = form.save()
            user_query.save()
            query = user_query.query
            doc = user_query.query
            # Index the corpus in NMF topic space and query it.
            index = similarities.MatrixSimilarity(nmfmodel[corpus])
            vec_bow = dictionary.doc2bow(doc.split())
            vec_nmf = nmfmodel[vec_bow]
            sims = index[vec_nmf]
            # enumerate(..., 1): 1-based document numbers, best match first.
            sims = sorted(enumerate(sims, 1), key=lambda item: -item[1])
            file_list = list()
            for element in sims[0:5]:
                file_list.append(element[0])
            for text in file_list:
                with open("IR/" + str(text) + ".txt", 'r') as doc_file:
                    query_response.append(doc_file.read())
            # 1-based display dictionaries for the template.
            file_list_dictionary = {
                i: file_list[i - 1]
                for i in range(1, len(file_list) + 1)
            }
            search_result_dictionary = {
                i: query_response[i - 1]
                for i in range(1, len(query_response) + 1)
            }
    else:
        form = SearchForm()
    return render(
        request, "nmf.html", {
            'form': form,
            'query': query,
            'answer': file_list,
            'search_results': query_response,
            'file_dictionary': file_list_dictionary,
            'search_result_dictionary': search_result_dictionary
        })