def buildDmlCorpus(config, language):
    """
    Build a `DmlCorpus` from `config`, prune its dictionary and persist it.

    The configuration is processed with shuffling enabled, the dictionary is
    built and then filtered with `noBelow=5` and `noAbove=0.3`. The corpus is
    saved to `config.resultFile('.pkl')` and additionally exported as text.

    `language` is accepted for the caller's convenience but is not used here.

    Returns the constructed corpus object.
    """
    corpus = DmlCorpus()
    corpus.processConfig(config, shuffle=True)
    corpus.buildDictionary()

    # ignore too (in)frequent words
    corpus.dictionary.filterExtremes(noBelow=5, noAbove=0.3)

    # save the mappings as binary data (actual documents are not saved,
    # only their uris)
    pickle_path = config.resultFile('.pkl')
    corpus.save(pickle_path)

    # save id mappings and documents as text data (matrix market format)
    corpus.saveAsText()
    return corpus
Example #2
0
def buildDmlCorpus(config, language):
    """
    Create a `DmlCorpus` for `config`, filter its dictionary and store it.

    Steps, in order: process `config` (shuffled), build the dictionary,
    filter it with `noBelow=5` / `noAbove=0.3`, save the corpus under
    `config.resultFile('.pkl')`, and write the text export.

    The `language` argument is not referenced by this function.

    Returns the corpus that was built.
    """
    built = DmlCorpus()
    built.processConfig(config, shuffle=True)
    built.buildDictionary()

    # ignore too (in)frequent words
    vocabulary = built.dictionary
    vocabulary.filterExtremes(noBelow=5, noAbove=0.3)

    # save the mappings as binary data (actual documents are not saved,
    # only their uris)
    built.save(config.resultFile('.pkl'))

    # save id mappings and documents as text data (matrix market format)
    built.saveAsText()

    return built
    # Script body: configure logging, parse <language> and <method> from the
    # command line, load the corpus mappings and word ids, then load the
    # model(s) selected by `method`.
    logging.basicConfig(level=logging.INFO)
    logging.root.setLevel(logging.DEBUG)
    logging.info("running %s", ' '.join(sys.argv))

    program = os.path.basename(sys.argv[0])

    # check and process input arguments; both positional arguments are
    # required, otherwise print the module docstring as usage and exit
    if len(sys.argv) < 3:
        print(globals()['__doc__'] % program)
        sys.exit(1)
    language = sys.argv[1]
    method = sys.argv[2].strip().lower()

    logging.info("loading corpus mappings")
    # NOTE(review): `config` is read here but is only assigned further down
    # (from the loaded corpus); it must already be bound by code outside this
    # fragment -- confirm.
    corpus_fname = config.resultFile('.pkl')
    try:
        dml = DmlCorpus.load(corpus_fname)
    except IOError:
        # report which file was missing (the placeholder was previously
        # left unfilled)
        raise IOError(
            "no word-count corpus found at %s; you must first generate it through gensim_build.py"
            % corpus_fname)
    # from here on use the configuration stored with the loaded corpus
    config = dml.config

    wordids_fname = config.resultFile('wordids.txt')
    logging.info("loading word id mapping from %s", wordids_fname)
    id2word = DmlCorpus.loadDictionary(wordids_fname)
    logging.info("loaded %i word ids", len(id2word))

    # load the bag-of-words matrix from the 'bow.mm' result file
    # NOTE(review): the name `input` shadows the builtin; it is kept because
    # code outside this fragment may refer to it.
    input = MmCorpus(config.resultFile('bow.mm'))

    # `modelfname` is defined outside this fragment
    if method == 'tfidf':
        model = tfidfmodel.TfidfModel.load(modelfname('tfidf'))
    elif method == 'lsi':
        # NOTE(review): only the tf-idf model is loaded in the visible part
        # of this branch -- confirm the rest of the branch is not missing.
        tfidf = tfidfmodel.TfidfModel.load(modelfname('tfidf'))
    # Script body: parse <language> and <method> from the command line, then
    # load the pickled corpus mappings and the word-id dictionary.
    program = os.path.basename(sys.argv[0])

    # check and process input arguments; both positional arguments are
    # required, otherwise print the module docstring as usage and exit
    if len(sys.argv) < 3:
        print(globals()['__doc__'] % program)
        sys.exit(1)
    language = sys.argv[1]
    method = sys.argv[2].strip().lower()

    logging.info("loading corpus mappings")
    config = DmlConfig('gensim_%s' % language,
                       resultDir=gensim_build.RESULT_DIR,
                       acceptLangs=[language])
    corpus_fname = config.resultFile('.pkl')
    try:
        dml = DmlCorpus.load(corpus_fname)
    except IOError:
        # report which file was missing (the placeholder was previously
        # left unfilled)
        raise IOError(
            "no word-count corpus found at %s; you must first generate it through gensim_build.py"
            % corpus_fname)

    wordids_fname = config.resultFile('wordids.txt')
    logging.info("loading word id mapping from %s", wordids_fname)
    id2word = DmlCorpus.loadDictionary(wordids_fname)
    logging.info("loaded %i word ids", len(id2word))

    if method == 'tfidf':
        corpus = MmCorpus(config.resultFile('bow.mm'))
        model = tfidfmodel.TfidfModel(corpus, id2word=id2word, normalize=True)
        model.save(config.resultFile('tfidfmodel.pkl'))
    elif method == 'lda':