def load_topics(self, dirname):
    """Collect normalized topic distributions from every LDA pickle below *dirname*.

    Walks all subdirectories of *dirname* (the root itself is skipped),
    loads each file whose name ends in ``pkl`` with ``ScikitLda.load``,
    and appends each topic normalized to unit sum onto ``self.topics``.
    """
    self.topics = []
    # os.walk yields (dirpath, dirnames, filenames); [1:] drops the root
    # directory so only its subdirectories are scanned.
    for subdir in [entry[0] for entry in os.walk(dirname)][1:]:
        # 'fname' instead of 'file' — avoids shadowing the builtin.
        for fname in os.listdir(subdir):
            if fname.endswith('pkl'):
                print("attempting... ", fname)
                # os.path.join is portable, unlike manual "/" concatenation.
                lda = ScikitLda.load(os.path.join(subdir, fname))
                for topic in lda.topics:
                    # Normalize so each stored topic is a probability
                    # distribution (sums to 1).
                    self.topics.append(topic / topic.sum())
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from scipy.spatial.distance import cosine
from corpora.scikit import ScikitLda
import argparse

if __name__ == "__main__":
    # Command line: a single required option naming the pickled topic model.
    parser = argparse.ArgumentParser(description="Calculate distance between topics")
    parser.add_argument("-t", "--topic-model", default=None, required=True)
    args = parser.parse_args()

    lda = ScikitLda.load(args.topic_model)

    # Normalize every topic to a unit-sum distribution before comparing.
    topics = [t / t.sum() for t in lda.topics]

    # Print the cosine distance for each unordered pair of topics.
    n_topics = len(topics)
    for i in range(n_topics):
        for j in range(i + 1, n_topics):
            print("Topic#{0} Topic#{1} {2}".format(i, j, cosine(topics[i], topics[j])))
from corpora.corpus import load_vraagtekst_corpus
from corpora.util import select_top
from matplotlib.pyplot import savefig
import numpy as np

if __name__ == '__main__':
    # Load the preprocessed question ("vraagtekst") corpus from disk.
    corpus = load_vraagtekst_corpus('data/preprocessedData.pkl')

    print("nSamples (docs) : {0}".format(corpus.num_samples))
    print("nFeatures(words): {0}".format(corpus.num_features))

    print("saving dictionary")
    corpus.save_dictionary('data/preprocessedData.dic')

    print("computing LDA")
    # NOTE(review): ScikitLda is not imported in this chunk — presumably
    # imported elsewhere in the file; verify.
    lda = ScikitLda(corpus=corpus, n_topics=10)
    lda.fit()
    print("saving LDA")
    lda.save('data/preprocessedData.lda_10.pkl')

    # Print the top words of each fitted topic.
    topicWords, topicWeightedWords = lda.topic_words()
    for topic_idx, wordsInTopic in enumerate(topicWords):
        print("Topic #{0}:".format(topic_idx))
        print(" ".join(wordsInTopic))

    # Group topic weights by the 'individu of groep' metadata column.
    # NOTE(review): topics_by_discrete_property is also not defined or
    # imported in this chunk — confirm it exists elsewhere in the file.
    topicsByOrg, orgs = topics_by_discrete_property(
        lda, corpus.metadata_frame['individu of groep'])

    # Mean topic weight over all documents (axis 0 = documents).
    averageWeights = np.average(lda.weights, axis=0)
    # get topic specificity by comparing with the average topic weights
    # NOTE(review): the script appears to continue beyond this chunk.