Example 1
import os.path

from sklearn.decomposition import LatentDirichletAllocation

import preprocess
import util.util as util


n_samples = 2000
n_topics = 2
n_top_words = 20


def print_top_words(model, feature_names, n_top_words):
    # Print the n_top_words highest-weighted terms of each LDA topic.
    for topic_idx, topic in enumerate(model.components_):
        print "Topic #%d:" % topic_idx
        print " ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]])

# Build the document-term count matrix from the pre-segmented corpus.
count_vector, vectorizer, raw_data = preprocess.get_segmented_data("segmented_data.csv")
# util.save_object(vectorizer, "vectorizer.pkl")

if os.path.isfile("lda.pkl"):
    print "load lda model"
    lda = util.load_object("lda.pkl")
else:
    print "train lda model"
    # Note: n_topics was renamed n_components in scikit-learn 0.19+.
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=50,
                                    learning_method='online',
                                    learning_offset=10.,
                                    random_state=0, n_jobs=-1)
    lda.fit(count_vector)
    util.save_object(lda, "lda.pkl")

print "Topics in LDA model:"
tf_feature_names = vectorizer.get_feature_names()
# The excerpt ends here; presumably the topics are then printed:
print_top_words(lda, tf_feature_names, n_top_words)
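Both examples persist models via util.save_object and util.load_object, which are not shown. A minimal sketch, assuming they are thin pickle wrappers (the names come from the call sites above; everything else is an assumption):

import pickle


def save_object(obj, path):
    # Serialize any Python object to disk.
    with open(path, "wb") as f:
        pickle.dump(obj, f)


def load_object(path):
    # Restore an object previously written by save_object.
    with open(path, "rb") as f:
        return pickle.load(f)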
Example 2
            f.write("\n\n")
    # Close every per-cluster output file.
    for f in cluster_to_file.values():
        f.close()

"""
Tag all the training data by its clustering number.
Ignore clusters with less than 20 counts.
"""
def get_cluster_tagged_data(clusters, X):
    print "getting clustering results with X"
    new_X = []
    new_Y = []
    cluster_counts = Counter(clusters)
    for i, cluster in enumerate(clusters):
        if cluster_counts[cluster] > 20:
            new_X.append(X[i])
            new_Y.append(cluster)
    return new_X, new_Y
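get_cluster_number is not defined in this excerpt; from the call below it takes the count matrix X and returns one cluster label per sample. A minimal sketch, assuming K-means with a hypothetical cluster count:

from sklearn.cluster import KMeans


def get_cluster_number(X, n_clusters=50):
    # Fit K-means and return one cluster label per sample.
    # n_clusters=50 is a placeholder; the value used originally is not shown.
    return KMeans(n_clusters=n_clusters, random_state=0).fit_predict(X)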



X, cv, raw_content = preprocess.get_segmented_data()
#hierarchical_clustering(X)

clusters = get_cluster_number(X)
print clusters

#write_result_to_file(clusters, raw_content)
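Both examples also depend on preprocess.get_segmented_data, which is not included. A minimal sketch, assuming the CSV holds one pre-segmented (whitespace-tokenized) document per row and that a plain CountVectorizer builds the counts; the default filename mirrors the call in Example 1:

import csv

from sklearn.feature_extraction.text import CountVectorizer


def get_segmented_data(path="segmented_data.csv"):
    # One pre-segmented document per CSV row (first column).
    with open(path) as f:
        raw_data = [row[0] for row in csv.reader(f) if row]
    # Term-count matrix shared by the LDA and clustering examples.
    vectorizer = CountVectorizer()
    count_vector = vectorizer.fit_transform(raw_data)
    return count_vector, vectorizer, raw_data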