def get_lda(out_dir, topic_name):
    """Train an LDA model on the global corpus and write two output files.

    Relies on module-level globals: ``corp`` (list of tokenized documents),
    ``n_topic`` (number of topics) and ``get_id`` (doc index -> file id
    mapping).  Writes, under *out_dir*:
      * a human-readable topic listing (``...-lda_topics-rocchio-nyt.txt``)
      * a JSON dict of file id -> per-document topic distribution
        (``...-fileid_lda-rocchio-nyt.txt``)
    File names embed ``sys.argv[1:7]`` plus *topic_name*.

    Args:
        out_dir: output directory prefix (expected to end with a separator).
        topic_name: label embedded in the output file names.
    """
    global corp
    dictionary = corpora.Dictionary(corp)
    corpus = [dictionary.doc2bow(text) for text in corp]
    # random_state pinned so repeated runs yield reproducible topics
    lda = models.LdaModel(corpus, id2word=dictionary,
                          num_topics=n_topic, random_state=1)

    # Shared name components for both output files
    name_parts = tuple(sys.argv[1:7]) + (topic_name,)

    # Human-readable topic listing; `with` guarantees the handle is closed
    # (the original leaked both file handles).
    ldaOut = lda.print_topics(n_topic)
    with codecs.open(
            out_dir + "{}-{}-{}-{}-{}-{}-{}-lda_topics-rocchio-nyt.txt".format(
                *name_parts),
            "w", "utf-8") as file_ldaOut:
        file_ldaOut.write(str(ldaOut))

    # Per-document topic distribution keyed by original file id.
    corpus_lda_doc = lda[corpus]
    dict_id_lda = {}
    for index_id, doc in enumerate(corpus_lda_doc):
        dict_id_lda[get_id[index_id]] = doc
    with codecs.open(
            out_dir + "{}-{}-{}-{}-{}-{}-{}-fileid_lda-rocchio-nyt.txt".format(
                *name_parts),
            "w", "utf-8") as file_id_lda:
        file_id_lda.write(json.dumps(dict_id_lda))
# Parse data/1.txt .. data/99.txt into two parallel corpora:
#   vector  - one joined title+content string per document (for TF-IDF)
#   vectors - tokenized content lines (for the gensim dictionary / LDA)
# Files are assumed to mark sections with "# title #" / "# / title #" and
# "# content #" / "# / content #" delimiters.
tfidf = TfidfVectorizer()
vector = []
vectors = []
for i in range(1, 100):
    with open('data/%s.txt' % i) as f:
        content = []
        is_content = False
        for line in f:  # iterate lazily instead of readlines()
            line = line.rstrip()
            if line.startswith("# title #") and line.endswith('# / title #'):
                line = line.replace("# title #", "").replace("# / title #", "")
                content.append(line)
            elif line.startswith("# content #") and is_content is False:
                content.append(line.replace("# content #", ""))
                is_content = True
            elif line.startswith("# / content #"):
                is_content = False
            elif is_content is True:
                content.append(line)
                vectors.append(line.split())
        vector.append(' '.join(content))

# Fit TF-IDF on all but the last 3 documents and cluster the training matrix.
tfidf_vector = tfidf.fit_transform(vector[:-3])
cluster = KMeans(init='k-means++', n_clusters=3)
cluster_result = cluster.fit(tfidf_vector)
# BUG FIX: the original called fit_transform() here, which re-fits the
# vectorizer on the 3 held-out documents and discards the training
# vocabulary the KMeans model was built on.  transform() keeps the test
# matrix in the same feature space as the training data.
tfidf_vector_test = tfidf.transform(vector[-3:])

# Gensim LDA over the tokenized content lines.
dc = Dictionary(vectors)
corpus = [dc.doc2bow(vec) for vec in vectors]
lda = LdaModel(corpus=corpus, id2word=dc, num_topics=3)
# Parenthesized print: identical output for a single expression under
# Python 2, and valid under Python 3 (original used Py2 print statements).
print(lda.show_topics())
print(lda.print_topics(3)[0])
# Build two corpora from data/1.txt .. data/99.txt:
#   vector  - one whitespace-joined title+content string per file (TF-IDF input)
#   vectors - one token list per content line (gensim dictionary / LDA input)
# Section boundaries are marked "# title #" / "# / title #" and
# "# content #" / "# / content #".
tfidf = TfidfVectorizer()
vector = []
vectors = []
for doc_idx in range(1, 100):
    with open('data/%s.txt' % doc_idx) as fh:
        buf = []
        in_body = False
        for raw in fh.readlines():
            raw = raw.rstrip()
            if raw.startswith("# title #") and raw.endswith('# / title #'):
                # Strip the title markers and keep the bare title text.
                buf.append(raw.replace("# title #", "").replace("# / title #", ""))
            elif raw.startswith("# content #") and not in_body:
                buf.append(raw.replace("# content #", ""))
                in_body = True
            elif raw.startswith("# / content #"):
                in_body = False
            elif in_body:
                buf.append(raw)
                vectors.append(raw.split())
        vector.append(' '.join(buf))

# TF-IDF over all but the last three documents, then k-means into 3 clusters.
tfidf_vector = tfidf.fit_transform(vector[:-3])
cluster = KMeans(init='k-means++', n_clusters=3)
cluster_result = cluster.fit(tfidf_vector)
# NOTE(review): this re-fits the vectorizer on the held-out slice, so the
# test matrix is NOT in the training feature space — preserved as-is here,
# but worth confirming against the author's intent.
tfidf_vector_test = tfidf.fit_transform(vector[-3:])

# Gensim LDA over the tokenized content lines.
dc = Dictionary(vectors)
corpus = [dc.doc2bow(vec) for vec in vectors]
lda = LdaModel(corpus=corpus, id2word=dc, num_topics=3)
print(lda.show_topics())
print(lda.print_topics(3)[0])