class TestTopicIO(TestCase):
    """Round-trip test for TopicIO: write topics to disk, read them back.

    NOTE(review): the test method appears truncated in this fragment --
    it reads the topics back but no assertions on ``t_list`` are
    visible. Confirm the remainder against the full file.
    """

    def setUp(self):
        # Fresh serialization helper for each test case.
        self.topic_io = TopicIO()

    def test_write_and_read_topics(self):
        # Train-free model: LdaModel() with no corpus -- presumably
        # enough for write_topics to produce output; TODO confirm.
        lda = models.LdaModel()
        dname = "topic_io"
        self.topic_io.write_topics(lda, "orig", 2, 4, dname)

        # write_topics must create the output directory.
        self.assertTrue(os.path.exists(dname))
        t_list = self.topic_io.read_topics(dname)
from topic.topicio import TopicIO
from similarity.SimTopicLists import SimTopicLists
import sys
import utils.name_convention as name

# Helper objects: topic serialization and topic-list similarity.
topics_io = TopicIO()
stl = SimTopicLists()

# argv[1] selects the corpus weighting: "t" -> tfidf, "b" -> binary,
# anything else (or absent) -> plain bag-of-words.
if len(sys.argv) > 1:
    corpus_type = {"t": "tfidf", "b": "binary"}.get(sys.argv[1], "bow")
else:
    corpus_type = "bow"

# argv[2]: number of topics to extract (default 3).
topics_count = int(sys.argv[2]) if len(sys.argv) > 2 else 3

# argv[3]: source corpus name (default "pp_reuters").
src = sys.argv[3] if len(sys.argv) > 3 else "pp_reuters"

# Output directory derived from the run configuration.
dtw = name.get_output_dir(corpus_type, topics_count, src)

#  print the topics for LSI
#  (``lsi`` and ``topics_count`` are defined earlier in the full
#  script, outside this fragment)

print "=========== topics"

# Each element of ``topic`` is presumably one formatted topic string
# covering the top 50 words -- confirm against gensim's print_topics.
topic = lsi.print_topics(topics_count, 50)

for x in topic:
    print x

#
#  LDA
#  Pick the corpus to use
#

# Topic serialization helper.
topics_io = TopicIO()

print "=========== start LDA"
# Start timestamp for the training run; the matching elapsed-time
# report is presumably later in the script (outside this fragment).
start_time = time()
# minimum_probability=-1 presumably disables thresholding so every
# topic/word probability is reported -- TODO confirm against the
# gensim LdaModel documentation.
lda = models.LdaModel(corpus,
                      id2word=dictionary,
                      num_topics=topics_count,
                      minimum_probability=-1)
# Per-document topic distributions under the trained model.
corpus_lda = lda[corpus]

print "======"
# topic = lda.print_topics(topics_count, 10)
# for x in topic:
#    print x
# NOTE(review): this call is truncated at the end of the fragment;
# the remaining keyword arguments are outside the visible span.
topics_io.write_topics(model=lda,
                       orig_dir=src,
# Load directory
dictionary = corpora.Dictionary.load(dname + "/dict.dict")
print(dictionary)

# Load required corpus according to the argument corpus_type
if corpus_type == 'tfidf':
    corpus_fname = dname + '/tfidf_corpus.mm'
elif corpus_type == 'binary':
    corpus_fname = dname + '/binary_corpus.mm'
else:
    corpus_fname = dname + '/bow_corpus.mm'
print "Load Corpus File " + corpus_fname
corpus = corpora.MmCorpus(corpus_fname)

topics_io = TopicIO()

# Perform LDA and store the results to an output file
print "=========== start LDA"
# alpha_set / eta_set come from argv parsing outside this fragment;
# they choose which hyperparameter override to train with, and the
# output filename records the override value.
if alpha_set:
    lda = models.LdaModel(corpus,
                          id2word=dictionary,
                          num_topics=topics_count,
                          minimum_probability=-1,
                          alpha=alpha)
    lda_fname = dname + "/" + corpus_type + "_t" + str(
        topics_count) + "_alpha" + str(sys.argv[4]) + ".lda"
elif eta_set:
    # NOTE(review): this branch is truncated at the end of the fragment.
    lda = models.LdaModel(corpus,
                          id2word=dictionary,
                          num_topics=topics_count,
# --- Esempio n. 5 ("Example no. 5"): separator between scraped code fragments ---
# 0 (vote-count artifact from the source page, not code)
# Echo the run configuration (all values parsed from argv earlier,
# outside this fragment).
print "corpus type :" + corpus_type
print "# of topics : " + str(topics_count)
print "src : " + src
print "# of words used for topic coherence: " + str(word_count)
print "output : " + output
# NOTE(review): word_count is printed twice (here and two lines up)
# under different labels -- possibly a leftover duplicate.
print "word count : " + str(word_count)
print "startw : " + str(startw)
print "Tfidf : " + str(tfidf)
print "\n"

# Load directory
dictionary = corpora.Dictionary.load(dname + "/dict.dict")
print(dictionary)

# Init helpers
topics_io = TopicIO()
tc = TopicCoherence()

# get all topics
tlist = topics_io.read_topics(output + "/topics")

# sort all words by decreasing frequency
tlist2 = []
for topic in tlist:
    topic.sort()
    # topic.list(word_count, start=startw) presumably returns the top
    # word_count words beginning at offset startw -- confirm in TopicIO.
    tlist2.append(topic.list(word_count, start=startw))

# prepare output file
tf_file = name.tc_tf_file(dname, corpus_type, topics_count, startw, tfidf)
co_occur_file = name.tc_co_occur_file(dname, corpus_type, topics_count, startw,
                                      tfidf)
# --- Esempio n. 6 ("Example no. 6"): separator between scraped code fragments ---
# 0 (vote-count artifact from the source page, not code)
        need_ic = False

# argv[6]: number of words per topic to evaluate (default 10).
words_count = int(sys.argv[6]) if len(sys.argv) > 6 else 10

# argv[7]: offset of the first word to use (default 0).
startw = int(sys.argv[7]) if len(sys.argv) > 7 else 0

# Output directory derived from the run configuration (corpus_type,
# topics_count and src are parsed earlier, outside this fragment).
dname = name.get_output_dir(corpus_type, topics_count, src)

# Load the per-topic word lists produced by the LDA step.
tio = TopicIO()
tlist = tio.read_topics(dname + name.topics_dir())

# Output file for the topic-evaluation preprocessing results.
fname = dname + name.te_preprocess(tc, words_count, startw)
prefile = open(fname, "w")

# File collecting entries whose evaluation score came out zero.
zerofile = open(dname + "/zeros_" + tc + "_w" + str(words_count) + ".txt", "w")

# calculate topic evaluation values
# tclist collects [topic_index, score] pairs.
tclist = []
te = WordNetEvaluator()
# need_ic is set by argv parsing cut off before this fragment.
# NOTE(review): an else-branch may follow beyond the visible span.
if not need_ic:
    for index, topic in enumerate(tlist):
        tclist.append([index, te.evaluate(topic, words_count, tc, prefile, zerofile, startw=startw)])
 def setUp(self):
     """Create a fresh TopicIO instance for each test."""
     self.topic_io = TopicIO()