コード例 #1
0
if __name__ == '__main__':
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level = logging.INFO)
    logging.info("running %s" % ' '.join(sys.argv))

    program = os.path.basename(sys.argv[0])

    # check and process input arguments
    if len(sys.argv) < 3:
        print globals()['__doc__'] % locals()
        sys.exit(1)
    language = sys.argv[1]
    method = sys.argv[2].strip().lower()
    
    logging.info("loading corpus mappings")
    config = dmlcorpus.DmlConfig('gensim_%s' % language, resultDir = gensim_build.RESULT_DIR, acceptLangs = [language])

    logging.info("loading word id mapping from %s" % config.resultFile('wordids.txt'))
    id2word = dmlcorpus.DmlCorpus.loadDictionary(config.resultFile('wordids.txt'))
    logging.info("loaded %i word ids" % len(id2word))
    
    corpus = MmCorpus(config.resultFile('bow.mm'))

    if method == 'tfidf':
        model = tfidfmodel.TfidfModel(corpus, id2word = id2word, normalize = True)
        model.save(config.resultFile('model_tfidf.pkl'))
    elif method == 'lda':
        model = ldamodel.LdaModel(corpus, id2word = id2word, numTopics = DIM_LDA)
        model.save(config.resultFile('model_lda.pkl'))
    elif method == 'lsi':
        # first, transform word counts to tf-idf weights
コード例 #2
0
ファイル: gensim_build.py プロジェクト: lucky7323/gensimPy3
    dml = dmlcorpus.DmlCorpus()
    dml.processConfig(config, shuffle = True)
    dml.buildDictionary()
    dml.dictionary.filterExtremes(noBelow=5, noAbove=0.3) # ignore too (in)frequent words

    dml.save(config.resultFile('.pkl')) # save the mappings as binary data (actual documents are not saved, only their URIs)
    dml.saveAsText() # save id mappings and documents as text data (matrix market format)
    return dml


if __name__ == '__main__':
    # Script entry point: configure logging, read the language from the
    # command line, register every source in SOURCE_LIST on a fresh config,
    # and hand that config to buildDmlCorpus().
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logging.info("running %s", ' '.join(sys.argv))

    # NOTE: `program` looks unused, but the usage text below is formatted
    # with locals(), so the module docstring may refer to it by name.
    program = os.path.basename(sys.argv[0])

    # check and process input arguments
    if len(sys.argv) < 2:
        # print() call form instead of the Python 2 print statement, which
        # is a SyntaxError on Python 3; this form works on both versions.
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    language = sys.argv[1]

    # construct the config, which holds information about sources, data file filenames etc.
    config = dmlcorpus.DmlConfig(
        '%s_%s' % (PREFIX, language),
        resultDir=RESULT_DIR,
        acceptLangs=[language],
    )
    for source in SOURCE_LIST:
        config.addSource(source)
    buildDmlCorpus(config)

    logging.info("finished running %s", program)
コード例 #3
0
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logging.info("running %s" % ' '.join(sys.argv))

    program = os.path.basename(sys.argv[0])

    # check and process input arguments
    if len(sys.argv) < 3:
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    language = sys.argv[1]
    method = sys.argv[2].strip().lower()

    logging.info("loading corpus mappings")
    config = dmlcorpus.DmlConfig('%s_%s' % (gensim_build.PREFIX, language),
                                 resultDir=gensim_build.RESULT_DIR,
                                 acceptLangs=[language])

    logging.info("loading word id mapping from %s" %
                 config.resultFile('wordids.txt'))
    id2word = dmlcorpus.DmlCorpus.loadDictionary(
        config.resultFile('wordids.txt'))
    logging.info("loaded %i word ids" % len(id2word))

    corpus = dmlcorpus.DmlCorpus.load(config.resultFile('.pkl'))
    input = MmCorpus(config.resultFile('_%s.mm' % method))
    assert len(input) == len(
        corpus
    ), "corpus size mismatch (%i vs %i): run ./gensim_genmodel.py again" % (
        len(input), len(corpus))