from corpora.util import select_top
from matplotlib.pyplot import savefig
import numpy as np

if __name__ == '__main__':
    # Script entry point: load the preprocessed corpus, fit and save a
    # 10-topic LDA model, print the words of every topic, then prepare
    # per-group topic data and the average topic weights for the
    # specificity analysis that follows.

    # Load the corpus and report the two size attributes it exposes.
    corpus = load_vraagtekst_corpus('data/preprocessedData.pkl')
    print("nSamples (docs) : {0}".format(corpus.num_samples))
    print("nFeatures(words): {0}".format(corpus.num_features))

    # Store the corpus dictionary next to the input data.
    print("saving dictionary")
    corpus.save_dictionary('data/preprocessedData.dic')

    # Fit an LDA model with 10 topics on the corpus.
    print("computing LDA")
    lda = ScikitLda(corpus=corpus, n_topics=10)
    lda.fit()

    # Persist the fitted model; the "_10" in the file name mirrors n_topics.
    print("saving LDA")
    lda.save('data/preprocessedData.lda_10.pkl')

    # Print each topic's words on one line, preceded by the topic index.
    # topicWeightedWords is not used in this part of the script.
    topicWords, topicWeightedWords = lda.topic_words()
    for topic_idx, wordsInTopic in enumerate(topicWords):
        print("Topic #{0}:".format(topic_idx))
        print(" ".join(wordsInTopic))

    # Split topic data by the 'individu of groep' metadata column
    # (Dutch for "individual or group").
    # NOTE(review): presumably topicsByOrg holds per-group topic weights and
    # orgs the distinct group labels -- confirm against
    # topics_by_discrete_property.
    topicsByOrg, orgs = topics_by_discrete_property(
        lda, corpus.metadata_frame['individu of groep'])

    # Average lda.weights along axis 0.
    # NOTE(review): assumes rows are documents and columns are topics, which
    # would make this one mean weight per topic -- TODO confirm.
    averageWeights = np.average(lda.weights, axis=0)

    # get topic specificity by comparing with the average topic weights
    # normalize by average topic weights