#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (C) 2017 Serge Sharoff
# This program is free software under GPL 3, see http://www.gnu.org/licenses/
'''
A script for inferring topics for documents with an existing model
'''
import sys

from gensim.models.ldamulticore import LdaMulticore
from gensim.corpora import Dictionary, TextCorpus

mname = sys.argv[1]  # path to the trained LDA model
cname = sys.argv[2]  # prefix of the preprocessed corpus files

lda = LdaMulticore.load(mname)
dictionary = Dictionary.load_from_text(cname + '_wordids.txt.bz2')
wiki = TextCorpus.load(cname + '_corpus.pkl.bz2')

for d in wiki.get_texts():
    # get_texts() already yields a list of tokens per document,
    # so it can go straight into doc2bow without splitting
    t = lda.get_document_topics(dictionary.doc2bow(d))
    # pick the topic with the highest probability for this document
    besttop, besttopval = 0, 0.0
    for topicid, weight in t:
        if weight > besttopval:
            besttop, besttopval = topicid, weight
    print('%d %.3f' % (besttop, besttopval))
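# A hypothetical invocation (the file names are illustrative, not from the original):
#   python3 infer_topics.py enwiki.lda enwiki
# This prints one "best-topic-id probability" line per document. If a more readable
# summary is wanted, the winning topic's top words can be appended via show_topic(),
# which is part of gensim's LdaModel API:
#   words = ', '.join(w for w, _ in lda.show_topic(besttop, topn=5))
#   print('%d %.3f %s' % (besttop, besttopval, words))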
import os
import sys

import gensim
from gensim.corpora import Dictionary, TextCorpus

DEFAULT_DICT_SIZE = 100000  # same default as gensim's make_wikicorpus.py

if len(sys.argv) < 3:
    print('usage: %s INPUT_CORPUS MODEL_NAME [NTOPICS [KEEP_WORDS]]' % sys.argv[0])
    sys.exit(1)
inp, model_name = sys.argv[1:3]
if len(sys.argv) > 3:
    ntopics = int(sys.argv[3])
if len(sys.argv) > 4:
    keep_words = int(sys.argv[4])
else:
    keep_words = DEFAULT_DICT_SIZE
# prefix for the cached dictionary/corpus/tf-idf files; the fragment does not define
# it separately, so the input prefix is reused here
outp = inp

if os.path.exists(outp + '_wordids.txt.bz2') and os.path.exists(outp + '_corpus.pkl.bz2'):
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')
    wiki = TextCorpus.load(outp + '_corpus.pkl.bz2')
else:
    wiki = TextCorpus(inp)
    # only keep the most frequent words
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=keep_words)
    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
    wiki.save(outp + '_corpus.pkl.bz2')
    # load the id->word mapping back directly from file;
    # this seems to save more memory than keeping the wiki.dictionary object from above
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')

# build tf-idf
if os.path.exists(outp + '_tfidf.mm'):
    mm = gensim.corpora.MmCorpus(outp + '_tfidf.mm')
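# A sketch of how this fragment presumably continues (the else branch and the training
# call are not shown above), following the same pattern as gensim's make_wikicorpus.py:
# build and serialize the tf-idf corpus once, then train the LDA model that the
# inference script loads under model_name; ntopics is assumed to have been passed as
# the optional third command-line argument.
else:
    tfidf = gensim.models.TfidfModel(wiki, id2word=dictionary, normalize=True)
    gensim.corpora.MmCorpus.serialize(outp + '_tfidf.mm', tfidf[wiki], progress_cnt=10000)
    mm = gensim.corpora.MmCorpus(outp + '_tfidf.mm')

lda = gensim.models.LdaMulticore(mm, id2word=dictionary, num_topics=ntopics)
lda.save(model_name)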
import os
import sys

from gensim.corpora import TextCorpus, WikiCorpus
from gensim.models import TfidfModel

if wiki:
    # models will be trained on the Dutch Wikipedia corpus
    if os.path.exists(f_bow):
        corpus = WikiCorpus.load(f_bow)
    else:
        # download the Wikipedia training corpus (2015/10/14 18:45, 132MB)
        if not os.path.exists(f_corpus):
            wiki_lang, wiki_size, wiki_url = wikis[lang]
            if input("About to download {0} Wikipedia corpus ({1}). Do you want to proceed? (y/n) ".format(wiki_lang, wiki_size)).startswith("y"):
                util.download_file(wiki_url, f_corpus, progress=True)  # project-local download helper
            else:
                sys.exit()
        corpus = WikiCorpus(f_corpus)
        # corpus.save(f_bow)
else:
    # models will be trained on your own corpus
    if os.path.exists(f_bow):
        corpus = TextCorpus.load(f_bow)
    else:
        corpus = TextCorpus(f_corpus)
        # corpus.save(f_bow)
        # filter the dictionary, then save dictionary and bag-of-words corpus
        corpus.dictionary.filter_extremes(no_below=0, no_above=1, keep_n=voc_size)
        corpus.dictionary.save(f_dict)
        corpus.save(f_bow)

# tf-idf model
if os.path.exists(f_tfidf):
    tfidf = TfidfModel.load(f_tfidf)
else:
    tfidf = TfidfModel(corpus, id2word=corpus.dictionary)
    tfidf.save(f_tfidf)
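# A sketch of a typical continuation (assumed, not part of the fragment above): apply
# the tf-idf weighting and train topic models on top of the corpus. LSI is commonly
# trained on the tf-idf-weighted vectors, while LDA works on the raw bag-of-words
# counts; the num_topics values are illustrative only.
from gensim.models import LdaModel, LsiModel

lsi = LsiModel(tfidf[corpus], id2word=corpus.dictionary, num_topics=200)
lda = LdaModel(corpus, id2word=corpus.dictionary, num_topics=100)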