class TestTopicIO(TestCase):
    """Round-trip test for TopicIO: write topics to disk, then read them back."""

    def setUp(self):
        # Fresh TopicIO per test so no state leaks between test methods.
        self.topic_io = TopicIO()

    def test_write_and_read_topics(self):
        # NOTE(review): LdaModel() is built with no corpus/dictionary here;
        # presumably write_topics only inspects the model object — confirm.
        lda = models.LdaModel()
        dname = "topic_io"
        # Write topics for model `lda` under directory `dname`.
        # NOTE(review): the meaning of "orig", 2 and 4 is defined by
        # TopicIO.write_topics — looks like source label, topic count and
        # word count; verify against TopicIO.
        self.topic_io.write_topics(lda, "orig", 2, 4, dname)
        # The output directory must exist after a successful write.
        self.assertTrue(os.path.exists(dname))
        # Read the topics back (assertions on t_list presumably continue
        # beyond this visible chunk).
        t_list = self.topic_io.read_topics(dname)
# Script preamble: resolve the corpus flavour, topic count and source corpus
# from the command line, then derive the working output directory.
from topic.topicio import TopicIO
from similarity.SimTopicLists import SimTopicLists
import sys
import utils.name_convention as name

topics_io = TopicIO()
stl = SimTopicLists()

# argv[1]: corpus flavour flag — "t" -> tfidf, "b" -> binary, anything
# else (or absent) -> plain bag-of-words.
_FLAG_TO_CORPUS = {"t": "tfidf", "b": "binary"}
if len(sys.argv) > 1:
    corpus_type = _FLAG_TO_CORPUS.get(sys.argv[1], "bow")
else:
    corpus_type = "bow"

# argv[2]: number of topics (defaults to 3).
topics_count = int(sys.argv[2]) if len(sys.argv) > 2 else 3

# argv[3]: source corpus name (defaults to "pp_reuters").
src = sys.argv[3] if len(sys.argv) > 3 else "pp_reuters"

# Output directory for this (corpus_type, topics_count, src) configuration.
dtw = name.get_output_dir(corpus_type, topics_count, src)
# print the topics for LSI # print "=========== topics" topic = lsi.print_topics(topics_count, 50) for x in topic: print x # # LDA # Pick the corpus to use # topics_io = TopicIO() print "=========== start LDA" start_time = time() lda = models.LdaModel(corpus, id2word=dictionary, num_topics=topics_count, minimum_probability=-1) corpus_lda = lda[corpus] print "======" # topic = lda.print_topics(topics_count, 10) # for x in topic: # print x topics_io.write_topics(model=lda, orig_dir=src,
# Load directory dictionary = corpora.Dictionary.load(dname + "/dict.dict") print(dictionary) # Load required corpus according to the argument corpus_type if corpus_type == 'tfidf': corpus_fname = dname + '/tfidf_corpus.mm' elif corpus_type == 'binary': corpus_fname = dname + '/binary_corpus.mm' else: corpus_fname = dname + '/bow_corpus.mm' print "Load Corpus File " + corpus_fname corpus = corpora.MmCorpus(corpus_fname) topics_io = TopicIO() # Perform LDA and store the results to an output file print "=========== start LDA" if alpha_set: lda = models.LdaModel(corpus, id2word=dictionary, num_topics=topics_count, minimum_probability=-1, alpha=alpha) lda_fname = dname + "/" + corpus_type + "_t" + str( topics_count) + "_alpha" + str(sys.argv[4]) + ".lda" elif eta_set: lda = models.LdaModel(corpus, id2word=dictionary, num_topics=topics_count,
print "corpus type :" + corpus_type print "# of topics : " + str(topics_count) print "src : " + src print "# of words used for topic coherence: " + str(word_count) print "output : " + output print "word count : " + str(word_count) print "startw : " + str(startw) print "Tfidf : " + str(tfidf) print "\n" # Load directory dictionary = corpora.Dictionary.load(dname + "/dict.dict") print(dictionary) # Init helpers topics_io = TopicIO() tc = TopicCoherence() # get all topics tlist = topics_io.read_topics(output + "/topics") # sort all words by decreasing frequency tlist2 = [] for topic in tlist: topic.sort() tlist2.append(topic.list(word_count, start=startw)) # prepare output file tf_file = name.tc_tf_file(dname, corpus_type, topics_count, startw, tfidf) co_occur_file = name.tc_co_occur_file(dname, corpus_type, topics_count, startw, tfidf)
# Flag for information-content based measures; hard-coded off here, so the
# `if not need_ic:` branch below always runs in this chunk.
need_ic = False
# argv[6]: number of words per topic to evaluate (default 10).
if len(sys.argv) <= 6:
    words_count = 10
else:
    words_count = int(sys.argv[6])
# argv[7]: index of the first word to evaluate (default 0).
if len(sys.argv) <= 7:
    startw = 0
else:
    startw = int(sys.argv[7])
dname = name.get_output_dir(corpus_type, topics_count, src)
# read topics
tio = TopicIO()
tlist = tio.read_topics(dname + name.topics_dir())
# generate te file name
fname = dname + name.te_preprocess(tc, words_count, startw)
prefile = open(fname, "w")
# NOTE(review): `zerofile` is first a path string, then rebound to the
# open file object on the next line — intentional but easy to misread.
zerofile = dname + "/zeros_" + tc + "_w" + str(words_count) + ".txt"
zerofile = open(zerofile, "w")
# calculate topic evaluation values
tclist = []
te = WordNetEvaluator()
if not need_ic:
    # Score each topic with the WordNet evaluator; results are collected
    # as [topic_index, score] pairs. (The loop/branch may continue beyond
    # this visible chunk.)
    for index, topic in enumerate(tlist):
        tclist.append([index, te.evaluate(topic, words_count, tc,
                                          prefile, zerofile,
                                          startw=startw)])
def setUp(self):
    """Create a fresh TopicIO instance before each test."""
    self.topic_io = TopicIO()