# Demo: retrieve the corpus document most similar to a query text under two
# different metrics, then build dense topic vectors for the whole corpus.
# NOTE(review): similar_euc_documents, similar_cos_documents, lda_output,
# data, lda, serial_corp, gensim and np are defined elsewhere in this file /
# notebook — not visible in this chunk.
mytext = ["Some text about christianity and bible"]
# Nearest document by Euclidean distance in topic space (top_n=1 → best match).
doc_ids, docs = similar_euc_documents(text=mytext, doc_topic_probs=lda_output, documents = data, top_n=1, verbose=True)
print('\n', docs[0][:500])
# Nearest document by cosine similarity in topic space, for comparison.
doc_ids, docs = similar_cos_documents(text=mytext, doc_topic_probs=lda_output, documents = data, top_n=1, verbose=True)
print('\n', docs[0][:500])
# Topic distribution for every document; minimum_probability=0 keeps
# zero-weight topics so all vectors cover the same topic set and are
# directly comparable with cossim.
all_top_vecs = [lda.get_document_topics(serial_corp[n], minimum_probability=0) \
    for n in range(len(serial_corp))]
def find_most_similar(sim_vec, all_top_vecs, title_lst, vec_in_corp='Y', n_results=7):
    '''
    Calculates cosine similarity across the entire corpus and returns the
    n_results number of most similar documents

    sim_vec:      topic vector of the query document (gensim sparse format)
    all_top_vecs: topic vectors for every corpus document (same format)
    title_lst:    document titles — not used in the visible portion; presumably
                  used to label results further down (TODO confirm)
    vec_in_corp:  'Y' if sim_vec belongs to the corpus (its self-match is
                  then dropped), 'N' otherwise
    n_results:    number of most-similar documents to keep
    '''
    # Cosine similarity of the query vector against every corpus document.
    cos_sims = [gensim.matutils.cossim(sim_vec, vec) for vec in all_top_vecs]
    if vec_in_corp == 'N':
        # Descending sort: highest similarity first.
        most_similar_ind = np.argsort(cos_sims)[::-1][:n_results]
    if vec_in_corp == 'Y':
        # Query is itself in the corpus: take one extra hit, then drop the
        # first (the query's self-match with similarity 1.0).
        most_similar_ind = np.argsort(cos_sims)[::-1][:n_results+1][1:]
    # NOTE(review): the function body appears truncated in this chunk — no
    # return statement is visible; the remainder presumably follows elsewhere.
# ---- tail of a topic-model builder whose `def` is above this chunk ----
# (nesting reconstructed from context — verify against the full file)
            words.append(word)
        # One line per topic: "<topic id>\t<comma-joined keywords>".
        f_keyword.write(str(topic[0]) + '\t' + ','.join(words) + '\n')
    return lsi
## main
if __name__ == '__main__':
    # Output files for per-cluster keyword listings of each model.
    cluster_keyword_lda_filepath = './data_out/cluster_keywords_lda.txt'
    cluster_keyword_lsi_filepath = './data_out/cluster_keywords_lsi.txt'
    # Raw corpus (Excel); create_data / lda_model / lsi_model / data_process
    # are defined elsewhere in this file.
    corpus_path = './data/self_info_results_all.xls'
    dictionary, corpus, corpus_tfidf = create_data(corpus_path)
    # Train both models with 6 topics and write their keyword files.
    lda = lda_model(dictionary, corpus, corpus_tfidf, 6, cluster_keyword_lda_filepath)
    lsi = lsi_model(dictionary, corpus, corpus_tfidf, 6, cluster_keyword_lsi_filepath)
    # show cluster keywords for LDA
    f = open(cluster_keyword_lda_filepath, 'r', encoding='utf-8')
    cluster_keyword_lda = f.read()
    print(cluster_keyword_lda)
    f.close()
    # test
    # NOTE(review): this file handle is never closed — consider
    # `with open(...) as fh: test_data = fh.readlines()`.
    # utf-8-sig strips a leading BOM if present.
    test_data = open('./data/test_chictr.txt', 'r', encoding='utf-8-sig').readlines()
    test_dictionary, test_corpus, test_corpus_tfidf = data_process(test_data)
    # Per-document topic distributions for the held-out test corpus.
    topics_test = lda.get_document_topics(test_corpus)
    for i in range(len(test_data)):
        print(i, 'topic distribution: ', topics_test[i], '\n')