Python DmlCorpus Examples

Programming Language: Python

Namespace/Package Name: gensim.corpora

Class/Type: DmlCorpus

Examples at hotexamples.com: 4

Python DmlCorpus - 4 examples found. These are the top rated real world Python examples of gensim.corpora.DmlCorpus extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

DmlCorpus(1)

Example #1

Show file

File: gensim_build.py Project: beibeiyang/Latent-Dirichlet-Allocation

def buildDmlCorpus(config, language):
    """Create a DmlCorpus from `config`, persist it, and return it.

    Steps, in order: process the configuration (with shuffling enabled),
    build the dictionary, prune it with filterExtremes(noBelow=5,
    noAbove=0.3), save the corpus to config.resultFile('.pkl'), and
    finally write it out via saveAsText().

    `language` is accepted for the caller's convenience but is not used
    inside this function.
    """
    corpus = DmlCorpus()
    corpus.processConfig(config, shuffle=True)
    corpus.buildDictionary()

    # Drop words that are too rare or too frequent.
    vocabulary = corpus.dictionary
    vocabulary.filterExtremes(noBelow=5, noAbove=0.3)

    # Binary dump of the mappings (actual documents are not saved,
    # only their uris).
    pickle_path = config.resultFile('.pkl')
    corpus.save(pickle_path)

    # Text dump of id mappings and documents (matrix market format).
    corpus.saveAsText()
    return corpus

Example #2

Show file

def buildDmlCorpus(config, language):
    """Build, prune, save and return a DmlCorpus described by `config`.

    The corpus is populated through processConfig(config, shuffle=True),
    its dictionary is built and filtered (noBelow=5, noAbove=0.3), and
    the result is stored both with save() at config.resultFile('.pkl')
    and with saveAsText().

    The `language` argument is not referenced by this function.
    """
    dml_corpus = DmlCorpus()
    dml_corpus.processConfig(config, shuffle=True)
    dml_corpus.buildDictionary()

    # ignore too (in)frequent words
    frequency_bounds = {'noBelow': 5, 'noAbove': 0.3}
    dml_corpus.dictionary.filterExtremes(**frequency_bounds)

    # save the mappings as binary data (actual documents are not saved,
    # only their uris)
    binary_target = config.resultFile('.pkl')
    dml_corpus.save(binary_target)

    # save id mappings and documents as text data (matrix market format)
    dml_corpus.saveAsText()

    return dml_corpus

Example #3

Show file

File: gensim_xml.py Project: beibeiyang/Latent-Dirichlet-Allocation

    logging.basicConfig(level = logging.INFO)
    logging.root.level = logging.DEBUG
    logging.info("running %s" % ' '.join(sys.argv))

    program = os.path.basename(sys.argv[0])
    
    # check and process input arguments
    if len(sys.argv) < 3:
        print globals()['__doc__'] % program
        sys.exit(1)
    language = sys.argv[1]
    method = sys.argv[2].strip().lower()
    
    logging.info("loading corpus mappings")
    try:
        dml = DmlCorpus.load(config.resultFile('.pkl'))
    except IOError, e:
        raise IOError("no word-count corpus found at %s; you must first generate it through gensim_build.py")
    config = dml.config

    logging.info("loading word id mapping from %s" % config.resultFile('wordids.txt'))
    id2word = DmlCorpus.loadDictionary(config.resultFile('wordids.txt'))
    logging.info("loaded %i word ids" % len(id2word))


    input = MmCorpus(bow.mm)
    
    if method == 'tfidf':
        model = tfidfmodel.TfidfModel.load(modelfname('tfidf'))
    elif method == 'lsi':
        tfidf = tfidfmodel.TfidfModel.load(modelfname('tfidf'))

Example #4

Show file

File: gensim_genmodel.py Project: zitingtang/Latent-Dirichlet-Allocation

    program = os.path.basename(sys.argv[0])

    # check and process input arguments
    if len(sys.argv) < 3:
        print globals()['__doc__'] % program
        sys.exit(1)
    language = sys.argv[1]
    method = sys.argv[2].strip().lower()

    logging.info("loading corpus mappings")
    config = DmlConfig('gensim_%s' % language,
                       resultDir=gensim_build.RESULT_DIR,
                       acceptLangs=[language])
    try:
        dml = DmlCorpus.load(config.resultFile('.pkl'))
    except IOError, e:
        raise IOError(
            "no word-count corpus found at %s; you must first generate it through gensim_build.py"
        )

    logging.info("loading word id mapping from %s" %
                 config.resultFile('wordids.txt'))
    id2word = DmlCorpus.loadDictionary(config.resultFile('wordids.txt'))
    logging.info("loaded %i word ids" % len(id2word))

    if method == 'tfidf':
        corpus = MmCorpus(config.resultFile('bow.mm'))
        model = tfidfmodel.TfidfModel(corpus, id2word=id2word, normalize=True)
        model.save(config.resultFile('tfidfmodel.pkl'))
    elif method == 'lda':