Ejemplo n.º 1
0
 def __init__(self, corpus, dictionary, num_topics, outfile):
     """Store the inputs, build the helpers and start the training thread.

     ``corpus`` and ``outfile`` are kept on the instance as given.
     ``dictionary`` is handed to both ``Tokenizer`` and
     ``StoppableLdaModel``; ``num_topics`` goes to the latter only.
     A ``threading.Thread`` targeting ``self.train`` is created and
     started before this constructor returns.
     """
     self.corpus, self.outfile = corpus, outfile
     self.tokenizer = Tokenizer(dictionary)
     self.lda = StoppableLdaModel(dictionary, num_topics)
     # Every attribute is assigned before the thread starts, so the
     # thread target sees a fully initialised instance.
     worker = threading.Thread(target=self.train)
     self.training_thread = worker
     worker.start()
Ejemplo n.º 2
0
 def __init__(self, corpus, dictionary, num_topics, outfile):
     """Set up the instance and immediately launch training in a thread.

     Keeps ``corpus`` and ``outfile`` as attributes, creates a
     ``Tokenizer`` from ``dictionary`` and a ``StoppableLdaModel`` from
     ``dictionary`` and ``num_topics``, then starts a
     ``threading.Thread`` whose target is ``self.train``.
     """
     self.corpus = corpus
     self.outfile = outfile

     self.tokenizer = Tokenizer(dictionary)
     self.lda = StoppableLdaModel(dictionary, num_topics)

     # The thread is started last, once all attributes above exist.
     trainer = threading.Thread(target=self.train)
     self.training_thread = trainer
     trainer.start()
Ejemplo n.º 3
0
class Model(object):
    """Pair a ``StoppableLdaModel`` with a ``Tokenizer``.

    The constructor starts a thread that runs :meth:`train`; the other
    methods forward to the underlying LDA model.
    """

    def __init__(self, corpus, dictionary, num_topics, outfile):
        """Store the inputs, build the helpers and start the training thread.

        ``dictionary`` is passed to both ``Tokenizer`` and
        ``StoppableLdaModel``; ``num_topics`` only to the latter.
        """
        self.corpus, self.outfile = corpus, outfile
        self.tokenizer = Tokenizer(dictionary)
        self.lda = StoppableLdaModel(dictionary, num_topics)
        # All attributes are assigned before the thread starts, so
        # ``train`` never sees a half-built instance.
        worker = threading.Thread(target=self.train)
        self.training_thread = worker
        worker.start()

    def train(self):
        """Update the LDA model with the corpus, then save it to ``outfile``."""
        logging.info('starting model training...')
        model = self.lda
        model.update(self.corpus)
        model.save(self.outfile)

    def topics(self, text):
        """Return ``str()`` of the LDA model indexed by the bag-of-words of ``text``."""
        bag_of_words = self.tokenizer.text2bow(text)
        distribution = self.lda[bag_of_words]
        return str(distribution)

    def show_topic(self, topic, topn=10):
        """Return ``str()`` of ``lda.show_topic(topic, topn=topn)``."""
        description = self.lda.show_topic(topic, topn=topn)
        return str(description)

    def print_topic(self, topic, topn=10):
        """Return ``lda.print_topic(topic, topn=topn)`` unchanged."""
        rendered = self.lda.print_topic(topic, topn=topn)
        return rendered

    def request_stop(self):
        """Forward a stop request to the LDA model and return ``'stopping'``."""
        self.lda.request_stop()
        return 'stopping'
Ejemplo n.º 4
0
class Model(object):
    """LDA model wrapper that begins training as soon as it is constructed.

    Holds a ``Tokenizer`` and a ``StoppableLdaModel`` built from the same
    dictionary; training runs on a thread started by ``__init__``.
    """

    def __init__(self, corpus, dictionary, num_topics, outfile):
        """Keep ``corpus``/``outfile``, create the helpers, start the thread."""
        self.corpus = corpus
        self.outfile = outfile

        self.tokenizer = Tokenizer(dictionary)
        self.lda = StoppableLdaModel(dictionary, num_topics)

        # Start the thread only after every attribute above is in place.
        trainer = threading.Thread(target=self.train)
        self.training_thread = trainer
        trainer.start()

    def train(self):
        """Log a start message, update the model with the corpus, save it."""
        logging.info('starting model training...')
        lda_model = self.lda
        lda_model.update(self.corpus)
        lda_model.save(self.outfile)

    def topics(self, text):
        """Tokenize ``text`` to a bag-of-words and return ``str(self.lda[bow])``."""
        as_bow = self.tokenizer.text2bow(text)
        result = self.lda[as_bow]
        return str(result)

    def show_topic(self, topic, topn=10):
        """Return the string form of the model's ``show_topic`` result."""
        shown = self.lda.show_topic(topic, topn=topn)
        return str(shown)

    def print_topic(self, topic, topn=10):
        """Return the model's ``print_topic`` result as-is."""
        printed = self.lda.print_topic(topic, topn=topn)
        return printed

    def request_stop(self):
        """Ask the LDA model to stop and report ``'stopping'``."""
        self.lda.request_stop()
        return 'stopping'
Ejemplo n.º 5
0
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel

from news.document import Tokenizer

# Interactive prompt: load a dictionary and a saved LDA model from the
# directory given on the command line, then print the topics of each line
# of text the user enters.

# `sys` and `itemgetter` are used below but are not among the imports
# visible above; import them here so the script does not fail with NameError.
import sys
from operator import itemgetter

# `raw_input` exists only on Python 2; fall back to `input` on Python 3.
try:
    read_line = raw_input
except NameError:
    read_line = input

if len(sys.argv) != 2:
    print('Usage: {0} rcv1_data_dir'.format(sys.argv[0]))
    raise SystemExit(1)

data_dir = sys.argv[1]
dictionary_file = data_dir + '/id_token_df'
model_file = data_dir + '/lda_model'

print('creating tokenizer...')
dictionary = Dictionary.load_from_text(dictionary_file)
tok = Tokenizer(dictionary)

print('loading model...')
lda = LdaModel.load(model_file)

while True:
    try:
        text = read_line('enter text (q to quit): ')
    except EOFError:
        # End of input (Ctrl-D, or stdin exhausted when piped): quit the
        # same way as an explicit 'q' instead of dying with a traceback.
        text = 'q'
    if text == 'q':
        print('bye!')
        break
    doc = tok.text2bow(text)
    topics = lda[doc]
    # Highest weight first; each entry unpacks as (topic, weight).
    for topic, weight in sorted(topics, key=itemgetter(1), reverse=True):
        # '%s' applies str() to each value, matching what the original
        # two-argument print statement wrote.
        print('%s %s' % (weight, lda.show_topic(topic, topn=4)))
Ejemplo n.º 6
0
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel

from news.document import Tokenizer

# Interactive prompt: load a dictionary and a saved LDA model from the
# directory given on the command line, then print the topics of each line
# of text the user enters.

# `sys` and `itemgetter` are used below but are not among the imports
# visible above; import them here so the script does not fail with NameError.
import sys
from operator import itemgetter

# `raw_input` exists only on Python 2; fall back to `input` on Python 3.
try:
    read_line = raw_input
except NameError:
    read_line = input

if len(sys.argv) != 2:
    print('Usage: {0} rcv1_data_dir'.format(sys.argv[0]))
    raise SystemExit(1)

data_dir = sys.argv[1]
dictionary_file = data_dir + '/id_token_df'
model_file = data_dir + '/lda_model'

print('creating tokenizer...')
dictionary = Dictionary.load_from_text(dictionary_file)
tok = Tokenizer(dictionary)

print('loading model...')
lda = LdaModel.load(model_file)

while True:
    try:
        text = read_line('enter text (q to quit): ')
    except EOFError:
        # End of input (Ctrl-D, or stdin exhausted when piped): quit the
        # same way as an explicit 'q' instead of dying with a traceback.
        text = 'q'
    if text == 'q':
        print('bye!')
        break
    doc = tok.text2bow(text)
    topics = lda[doc]
    # Highest weight first; each entry unpacks as (topic, weight).
    for topic, weight in sorted(topics, key=itemgetter(1), reverse=True):
        # '%s' applies str() to each value, matching what the original
        # two-argument print statement wrote.
        print('%s %s' % (weight, lda.show_topic(topic, topn=4)))