Example #1
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (C) 2017  Serge Sharoff
# This program is free software under GPL 3, see http://www.gnu.org/licenses/
'''
A script for inferring topics for documents with an existing model
'''
import sys
from gensim.models.ldamulticore import LdaMulticore
from gensim.corpora import Dictionary, TextCorpus

mname = sys.argv[1]
cname = sys.argv[2]

lda = LdaMulticore.load(mname)
dictionary = Dictionary.load_from_text(cname + '_wordids.txt.bz2')
wiki = TextCorpus.load(cname + '_corpus.pkl.bz2')

for d in wiki.get_texts():
    # find the single most probable topic for each document
    t = lda.get_document_topics(dictionary.doc2bow(d))
    besttop, besttopval = -1, 0.0
    for topicid, prob in t:
        if prob > besttopval:
            besttop, besttopval = topicid, prob
    print('%d %.3f' % (besttop, besttopval))
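For comparison, the per-document loop can be condensed with max(); a minimal equivalent sketch (the default=(-1, 0.0) keeps a document with an empty topic list from raising an error):

for d in wiki.get_texts():
    t = lda.get_document_topics(dictionary.doc2bow(d))
    # pick the (topic_id, probability) pair with the highest probability
    besttop, besttopval = max(t, key=lambda p: p[1], default=(-1, 0.0))
    print('%d %.3f' % (besttop, besttopval))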
Example #2

import os
import sys
import gensim
from gensim.corpora import Dictionary, TextCorpus
from gensim.models import TfidfModel

DEFAULT_DICT_SIZE = 100000  # dictionary size used by gensim's wiki scripts

inp, outp = sys.argv[1:3]

if len(sys.argv) > 3:
    ntopics = int(sys.argv[3])
else:
    ntopics = 100  # assumed default; the original snippet leaves this unset

if len(sys.argv) > 4:
    keep_words = int(sys.argv[4])
else:
    keep_words = DEFAULT_DICT_SIZE

if os.path.exists(outp + '_wordids.txt.bz2') and \
        os.path.exists(outp + '_corpus.pkl.bz2'):
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')
    wiki = TextCorpus.load(outp + '_corpus.pkl.bz2')
else:
    wiki = TextCorpus(inp)
    # only keep the most frequent words
    wiki.dictionary.filter_extremes(no_below=20,
                                    no_above=0.1,
                                    keep_n=keep_words)
    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
    wiki.save(outp + '_corpus.pkl.bz2')
    # load back the id->word mapping directly from file
    # this seems to save more memory, compared to keeping the wiki.dictionary object from above
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')

# build tf-idf, reusing a cached matrix if one exists
if os.path.exists(outp + '_tfidf.mm'):
    mm = gensim.corpora.MmCorpus(outp + '_tfidf.mm')
else:
    # the original snippet is truncated here; building and serializing the
    # tf-idf matrix is the natural completion of this if/else
    tfidf = TfidfModel(wiki, id2word=dictionary, normalize=True)
    gensim.corpora.MmCorpus.serialize(outp + '_tfidf.mm', tfidf[wiki],
                                      progress_cnt=10000)
    mm = gensim.corpora.MmCorpus(outp + '_tfidf.mm')
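The command line above parses ntopics, which suggests an LDA training step follows the tf-idf build. A minimal sketch of that step, assuming the same LdaMulticore class used in Example #1 (the workers and passes values are illustrative, not from the original):

from gensim.models.ldamulticore import LdaMulticore

# train the topic model on the streamed tf-idf matrix and save it
# alongside the other artefacts (ntopics comes from the command line above)
lda = LdaMulticore(mm, id2word=dictionary, num_topics=ntopics,
                   workers=3, passes=1)
lda.save(outp + '.lda')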
Example #3
	if wiki: # models will be trained on the Dutch Wikipedia corpus
		if os.path.exists(f_bow):
			corpus = WikiCorpus.load(f_bow)
		else:
			# download wikipedia training corpus (2015/10/14 18:45, 132MB)
			if not os.path.exists(f_corpus):
				wiki_lang, wiki_size, wiki_url = wikis[lang]
				if input("About to download {0} Wikipedia corpus ({1}). Do you want to proceed? (y/n) ".format(wiki_lang, wiki_size)).startswith("y"):
					util.download_file(wiki_url, f_corpus, progress=True)
				else:
					sys.exit()
			corpus = WikiCorpus(f_corpus)
#			corpus.save(f_bow)
	else: # models will be trained on your own corpus
		if os.path.exists(f_bow):
			corpus = TextCorpus.load(f_bow)
		else:
			corpus = TextCorpus(f_corpus)
#			corpus.save(f_bow)

	# filter dictionary
	corpus.dictionary.filter_extremes(no_below=0, no_above=1, keep_n=voc_size)
	corpus.dictionary.save(f_dict)
	corpus.save(f_bow)

	# tf-idf model
	if os.path.exists(f_tfidf):
		tfidf = TfidfModel.load(f_tfidf)
	else:
		tfidf = TfidfModel(corpus, id2word=corpus.dictionary)
		tfidf.save(f_tfidf)
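
Once loaded or trained, the tf-idf model is applied by indexing it with bag-of-words vectors; a minimal sketch using the corpus built above (the variable name corpus_tfidf is only illustrative):

	# stream each document as a list of (token_id, tf-idf weight) pairs
	corpus_tfidf = tfidf[corpus]
	first_doc = next(iter(corpus_tfidf))
	print(first_doc[:5])  # first few weighted terms of the first document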