Ejemplo n.º 1
0

# Query text, wrapped in a single-element list as passed to both lookups.
mytext = ["Some text about christianity and bible"]

# Keyword arguments shared by both similarity lookups, so the two calls
# below are guaranteed to query with identical settings.
similarity_kwargs = dict(
    text=mytext,
    doc_topic_probs=lda_output,
    documents=data,
    top_n=1,
    verbose=True,
)

# "_euc_" variant (presumably Euclidean-distance based -- confirm against
# the function definition). Show the first 500 characters of the top hit.
doc_ids, docs = similar_euc_documents(**similarity_kwargs)
print('\n', docs[0][:500])

# "_cos_" variant (presumably cosine-similarity based -- confirm against
# the function definition). Same query, same preview length.
doc_ids, docs = similar_cos_documents(**similarity_kwargs)
print('\n', docs[0][:500])







# One topic vector per document in serial_corp, in corpus order.
# minimum_probability=0 is passed so that low-probability topics are not
# filtered out of each vector (see gensim's get_document_topics docs).
all_top_vecs = [
    lda.get_document_topics(serial_corp[doc_idx], minimum_probability=0)
    for doc_idx in range(len(serial_corp))
]


def find_most_similar(sim_vec, all_top_vecs, title_lst, vec_in_corp='Y', n_results=7):
    '''
    Calculates cosine similarity between sim_vec and every vector in
    all_top_vecs, then selects the indices of the n_results most similar
    vectors (highest similarity first).

    Parameters:
        sim_vec      -- query vector, passed as the first argument to
                        gensim.matutils.cossim.
        all_top_vecs -- iterable of vectors to compare against; each one is
                        passed as the second argument to cossim.
        title_lst    -- not used in the lines visible here.
        vec_in_corp  -- 'N': keep the top n_results indices.
                        'Y': take the top n_results + 1 and drop the first
                        (presumably because the query itself is part of
                        all_top_vecs and would rank first -- TODO confirm).
        n_results    -- number of indices to keep.

    NOTE(review): the visible part of this function ends right after
    most_similar_ind is computed; what is returned is not shown here.
    '''

    # One similarity score per entry of all_top_vecs, in the same order.
    cos_sims = [gensim.matutils.cossim(sim_vec, vec) for vec in all_top_vecs]

    # argsort is ascending, so [::-1] reverses it to put the highest
    # similarity first before slicing.
    # NOTE(review): if vec_in_corp is neither 'N' nor 'Y', most_similar_ind
    # is never assigned.
    if vec_in_corp == 'N':
        most_similar_ind = np.argsort(cos_sims)[::-1][:n_results]
    if vec_in_corp == 'Y':
        # Take one extra result and discard the top-ranked one.
        most_similar_ind = np.argsort(cos_sims)[::-1][:n_results+1][1:]
            words.append(word)
        f_keyword.write(str(topic[0]) + '\t' + ','.join(words) + '\n')

    return lsi


## main
if __name__ == '__main__':
    # Script entry point: build the corpus, fit an LDA and an LSI model,
    # print the LDA cluster keywords, then print the LDA topic distribution
    # of every line in a test file.

    # Paths handed to lda_model / lsi_model for their cluster keywords.
    cluster_keyword_lda_filepath = './data_out/cluster_keywords_lda.txt'
    cluster_keyword_lsi_filepath = './data_out/cluster_keywords_lsi.txt'
    corpus_path = './data/self_info_results_all.xls'
    dictionary, corpus, corpus_tfidf = create_data(corpus_path)
    # The literal 6 is presumably the number of topics/clusters -- confirm
    # against the lda_model / lsi_model signatures.
    lda = lda_model(dictionary, corpus, corpus_tfidf, 6,
                    cluster_keyword_lda_filepath)
    lsi = lsi_model(dictionary, corpus, corpus_tfidf, 6,
                    cluster_keyword_lsi_filepath)

    # show cluster keywords for LDA
    # `with` guarantees the handle is closed even if read() raises.
    with open(cluster_keyword_lda_filepath, 'r', encoding='utf-8') as f:
        cluster_keyword_lda = f.read()
    print(cluster_keyword_lda)

    # test
    # 'utf-8-sig' strips a leading BOM if the file has one. The file was
    # previously opened without ever being closed; `with` fixes that leak.
    with open('./data/test_chictr.txt', 'r',
              encoding='utf-8-sig') as test_file:
        test_data = test_file.readlines()
    test_dictionary, test_corpus, test_corpus_tfidf = data_process(test_data)
    topics_test = lda.get_document_topics(test_corpus)
    # Index by position in test_data (not by iterating topics_test) so the
    # printed index always refers to the input line number, as before.
    for i in range(len(test_data)):
        print(i, 'topic distribution: ', topics_test[i], '\n')