import preprocess
import util.util as util
import os.path
from sklearn.decomposition import LatentDirichletAllocation

n_samples = 2000
n_topics = 2
n_top_words = 20


def print_top_words(model, feature_names, n_top_words):
    # Print the n_top_words highest-weighted terms for each topic of the fitted model.
    for topic_idx, topic in enumerate(model.components_):
        print "Topic #%d:" % topic_idx
        print " ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]])


count_vector, vectorizer, raw_data = preprocess.get_segmented_data("segmented_data.csv")
# util.save_object(vectorizer, "vectorizer.pkl")

# Reuse a cached model if one exists on disk; otherwise train and cache a new one.
if os.path.isfile("lda.pkl"):
    print "load lda model"
    lda = util.load_object("lda.pkl")
else:
    print "train lda model"
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=50,
                                    learning_method='online',
                                    learning_offset=10.,
                                    random_state=0, n_jobs=-1)
    lda.fit(count_vector)
    util.save_object(lda, "lda.pkl")

print "Topics in LDA model:"
tf_feature_names = vectorizer.get_feature_names()
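# The script above builds tf_feature_names but never prints the topics; the
# assumed final step (using only names already defined in this script) would
# be a single call to the helper:
print_top_words(lda, tf_feature_names, n_top_words)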
f.write("\n\n") # close everything for f in cluster_to_file.values(): f.close() """ Tag all the training data by its clustering number. Ignore clusters with less than 20 counts. """ def get_cluster_tagged_data(clusters, X): print "getting clustering results with X" new_X = [] new_Y = [] cluster_counts = Counter(clusters) for i in range(len(clusters)): cluster = clusters[i] if cluster_counts[cluster] > 20: new_X.append(X[i]) new_Y.append(cluster) return (new_X, new_Y) X, cv, raw_content = preprocess.get_segmented_data() #hierarchical_clustering(X) clusters = get_cluster_number(X) print clusters #write_result_to_file(clusters, raw_content)