# Script entry point: trains one model for a single language and saves it.
# Expects two command-line arguments, <language> and <method>; with fewer, the
# module docstring is printed as usage text and the script exits with status 1.
# <method> is stripped, lower-cased and compared against 'tfidf', 'lda' and 'lsi'.
# Steps visible here: configure INFO-level logging, build a DmlConfig named
# 'gensim_<language>', load the word-id mapping from 'wordids.txt', open the
# 'bow.mm' corpus, then fit the selected model and save it
# ('model_tfidf.pkl' or 'model_lda.pkl').
# NOTE(review): the line breaks of this block appear to have been lost. As written,
# everything after the first '#' is lexically a comment, so the layout must be
# restored before this can run.
# NOTE(review): the 'lsi' branch is cut off right after its first comment; its
# body is not visible here.
# NOTE(review): `print globals()[...]` is the Python 2 print statement.
if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s') logging.root.setLevel(level = logging.INFO) logging.info("running %s" % ' '.join(sys.argv)) program = os.path.basename(sys.argv[0]) # check and process input arguments if len(sys.argv) < 3: print globals()['__doc__'] % locals() sys.exit(1) language = sys.argv[1] method = sys.argv[2].strip().lower() logging.info("loading corpus mappings") config = dmlcorpus.DmlConfig('gensim_%s' % language, resultDir = gensim_build.RESULT_DIR, acceptLangs = [language]) logging.info("loading word id mapping from %s" % config.resultFile('wordids.txt')) id2word = dmlcorpus.DmlCorpus.loadDictionary(config.resultFile('wordids.txt')) logging.info("loaded %i word ids" % len(id2word)) corpus = MmCorpus(config.resultFile('bow.mm')) if method == 'tfidf': model = tfidfmodel.TfidfModel(corpus, id2word = id2word, normalize = True) model.save(config.resultFile('model_tfidf.pkl')) elif method == 'lda': model = ldamodel.LdaModel(corpus, id2word = id2word, numTopics = DIM_LDA) model.save(config.resultFile('model_lda.pkl')) elif method == 'lsi': # first, transform word counts to tf-idf weights
dml = dmlcorpus.DmlCorpus() dml.processConfig(config, shuffle = True) dml.buildDictionary() dml.dictionary.filterExtremes(noBelow=5, noAbove=0.3) # ignore too (in)frequent words dml.save(config.resultFile('.pkl')) # save the mappings as binary data (actual documents are not saved, only their URIs) dml.saveAsText() # save id mappings and documents as text data (matrix market format) return dml if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s') logging.root.setLevel(level=logging.INFO) logging.info("running %s" % ' '.join(sys.argv)) program = os.path.basename(sys.argv[0]) # check and process input arguments if len(sys.argv) < 2: print globals()['__doc__'] % locals() sys.exit(1) language = sys.argv[1] # construct the config, which holds information about sources, data file filenames etc. config = dmlcorpus.DmlConfig('%s_%s' % (PREFIX, language), resultDir=RESULT_DIR, acceptLangs=[language]) for source in SOURCE_LIST: config.addSource(source) buildDmlCorpus(config) logging.info("finished running %s" % program)
# Script body: for the language and method given on the command line, load the
# word-id mapping, the pickled corpus and the method-specific matrix-market
# corpus, and verify that the two corpora have the same number of documents.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
logging.root.setLevel(level=logging.INFO)
logging.info("running %s", ' '.join(sys.argv))

# `program` is defined before the usage check because the module docstring is
# interpolated with locals() below.
program = os.path.basename(sys.argv[0])

# check and process input arguments
if len(sys.argv) < 3:
    print(globals()['__doc__'] % locals())
    sys.exit(1)
language = sys.argv[1]
method = sys.argv[2].strip().lower()

logging.info("loading corpus mappings")
config = dmlcorpus.DmlConfig(
    '%s_%s' % (gensim_build.PREFIX, language),
    resultDir=gensim_build.RESULT_DIR,
    acceptLangs=[language])

logging.info("loading word id mapping from %s", config.resultFile('wordids.txt'))
id2word = dmlcorpus.DmlCorpus.loadDictionary(config.resultFile('wordids.txt'))
logging.info("loaded %i word ids", len(id2word))

corpus = dmlcorpus.DmlCorpus.load(config.resultFile('.pkl'))
# NOTE(review): `input` shadows the builtin; the name is kept because code past
# this chunk may still refer to it.
input = MmCorpus(config.resultFile('_%s.mm' % method))
# The two files are produced by separate runs; a size mismatch means the
# method-specific corpus is stale and must be regenerated.
assert len(input) == len(corpus), (
    "corpus size mismatch (%i vs %i): run ./gensim_genmodel.py again"
    % (len(input), len(corpus)))