def buildDmlCorpus(config, language):
    """Create a DmlCorpus from `config`, prune its dictionary and save the results.

    The corpus is populated via `processConfig` (with shuffling enabled), its
    dictionary is built and filtered, and the outcome is written out both as a
    binary file and as text.

    `language` is accepted for interface compatibility; it is not used here.

    Returns the populated DmlCorpus instance.
    """
    corpus = DmlCorpus()
    corpus.processConfig(config, shuffle=True)
    corpus.buildDictionary()

    # ignore too (in)frequent words
    corpus.dictionary.filterExtremes(noBelow=5, noAbove=0.3)

    # save the mappings as binary data
    # (actual documents are not saved, only their uris)
    pickle_path = config.resultFile('.pkl')
    corpus.save(pickle_path)

    # save id mappings and documents as text data (matrix market format)
    corpus.saveAsText()

    return corpus
def buildDmlCorpus(config, language):
    """Build, filter and persist a DmlCorpus described by `config`.

    Steps, in order: process the configuration (shuffled), build the
    dictionary, filter extremes from the dictionary, save the corpus to the
    '.pkl' result file and finally save it as text.

    `language` is part of the signature but is not read by this function.

    Returns the DmlCorpus that was built.
    """
    dml_corpus = DmlCorpus()
    dml_corpus.processConfig(config, shuffle=True)
    dml_corpus.buildDictionary()

    # ignore too (in)frequent words
    dictionary = dml_corpus.dictionary
    dictionary.filterExtremes(noBelow=5, noAbove=0.3)

    # binary dump of the mappings; actual documents are not saved, only their uris
    binary_fname = config.resultFile('.pkl')
    dml_corpus.save(binary_fname)

    # text dump of id mappings and documents (matrix market format)
    dml_corpus.saveAsText()

    return dml_corpus
# Script body: configure logging, read LANGUAGE and METHOD from the command
# line, load the previously built word-count corpus and its word id mapping,
# then load the model(s) needed for the requested method.
logging.basicConfig(level=logging.INFO)
logging.root.level = logging.DEBUG
logging.info("running %s" % ' '.join(sys.argv))

program = os.path.basename(sys.argv[0])

# check and process input arguments
if len(sys.argv) < 3:
    # the module docstring doubles as the usage message; the parenthesised
    # single-argument form prints identically on Python 2 and Python 3
    print(globals()['__doc__'] % program)
    sys.exit(1)
language = sys.argv[1]
method = sys.argv[2].strip().lower()

logging.info("loading corpus mappings")
# NOTE(review): `config` is read here but its only visible assignment is the
# `config = dml.config` line below -- it must already be bound by code outside
# this fragment, otherwise this raises NameError. Confirm.
corpus_fname = config.resultFile('.pkl')
try:
    dml = DmlCorpus.load(corpus_fname)
except IOError:
    # fill in the %s placeholder so the message names the missing file
    # (it was previously raised with a literal, unformatted "%s")
    raise IOError(
        "no word-count corpus found at %s; you must first generate it through gensim_build.py"
        % corpus_fname
    )
config = dml.config

wordids_fname = config.resultFile('wordids.txt')
logging.info("loading word id mapping from %s" % wordids_fname)
id2word = DmlCorpus.loadDictionary(wordids_fname)
logging.info("loaded %i word ids" % len(id2word))

# NOTE(review): `bow` is not defined anywhere in this fragment; presumably a
# path such as config.resultFile('bow.mm') was intended -- confirm before
# relying on this line. The name `input` shadows the builtin but is kept
# because code outside this fragment may refer to it.
input = MmCorpus(bow.mm)

if method == 'tfidf':
    model = tfidfmodel.TfidfModel.load(modelfname('tfidf'))
elif method == 'lsi':
    tfidf = tfidfmodel.TfidfModel.load(modelfname('tfidf'))
program = os.path.basename(sys.argv[0]) # check and process input arguments if len(sys.argv) < 3: print globals()['__doc__'] % program sys.exit(1) language = sys.argv[1] method = sys.argv[2].strip().lower() logging.info("loading corpus mappings") config = DmlConfig('gensim_%s' % language, resultDir=gensim_build.RESULT_DIR, acceptLangs=[language]) try: dml = DmlCorpus.load(config.resultFile('.pkl')) except IOError, e: raise IOError( "no word-count corpus found at %s; you must first generate it through gensim_build.py" ) logging.info("loading word id mapping from %s" % config.resultFile('wordids.txt')) id2word = DmlCorpus.loadDictionary(config.resultFile('wordids.txt')) logging.info("loaded %i word ids" % len(id2word)) if method == 'tfidf': corpus = MmCorpus(config.resultFile('bow.mm')) model = tfidfmodel.TfidfModel(corpus, id2word=id2word, normalize=True) model.save(config.resultFile('tfidfmodel.pkl')) elif method == 'lda':