Example #1
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (C) 2017  Serge Sharoff
# This program is free software under GPL 3, see http://www.gnu.org/licenses/
'''
A script for inferring topics for documents with an existing model
'''
import sys
from gensim.models.ldamulticore import LdaMulticore
from gensim.corpora import Dictionary, TextCorpus

mname = sys.argv[1]
cname = sys.argv[2]

lda = LdaMulticore.load(mname)
dictionary = Dictionary.load_from_text(cname + '_wordids.txt.bz2')
wiki = TextCorpus.load(cname + '_corpus.pkl.bz2')

for d in wiki.get_texts():
    # find the single most probable topic for each document
    t = lda.get_document_topics(dictionary.doc2bow(d))
    besttop, besttopval = -1, 0.0
    for topicid, prob in t:
        if prob > besttopval:
            besttop, besttopval = topicid, prob
    print('%d %.3f' % (besttop, besttopval))
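For comparison, the per-document loop can be condensed with max(); a minimal equivalent sketch (the default=(-1, 0.0) keeps a document with an empty topic list from raising an error):

for d in wiki.get_texts():
    t = lda.get_document_topics(dictionary.doc2bow(d))
    # pick the (topic_id, probability) pair with the highest probability
    besttop, besttopval = max(t, key=lambda p: p[1], default=(-1, 0.0))
    print('%d %.3f' % (besttop, besttopval))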
Example #2

import os
import sys
import gensim
from gensim.corpora import Dictionary, TextCorpus
from gensim.models import TfidfModel

DEFAULT_DICT_SIZE = 100000  # dictionary size used by gensim's wiki scripts

inp, outp = sys.argv[1:3]

if len(sys.argv) > 3:
    ntopics = int(sys.argv[3])
else:
    ntopics = 100  # assumed default; the original snippet leaves this unset

if len(sys.argv) > 4:
    keep_words = int(sys.argv[4])
else:
    keep_words = DEFAULT_DICT_SIZE

if os.path.exists(outp + '_wordids.txt.bz2') and \
        os.path.exists(outp + '_corpus.pkl.bz2'):
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')
    wiki = TextCorpus.load(outp + '_corpus.pkl.bz2')
else:
    wiki = TextCorpus(inp)
    # only keep the most frequent words
    wiki.dictionary.filter_extremes(no_below=20,
                                    no_above=0.1,
                                    keep_n=keep_words)
    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
    wiki.save(outp + '_corpus.pkl.bz2')
    # load back the id->word mapping directly from file
    # this seems to save more memory, compared to keeping the wiki.dictionary object from above
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')

# build tf-idf, reusing a cached matrix if one exists
if os.path.exists(outp + '_tfidf.mm'):
    mm = gensim.corpora.MmCorpus(outp + '_tfidf.mm')
else:
    # the original snippet is truncated here; building and serializing the
    # tf-idf matrix is the natural completion of this if/else
    tfidf = TfidfModel(wiki, id2word=dictionary, normalize=True)
    gensim.corpora.MmCorpus.serialize(outp + '_tfidf.mm', tfidf[wiki],
                                      progress_cnt=10000)
    mm = gensim.corpora.MmCorpus(outp + '_tfidf.mm')
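The command line above parses ntopics, which suggests an LDA training step follows the tf-idf build. A minimal sketch of that step, assuming the same LdaMulticore class used in Example #1 (the workers and passes values are illustrative, not from the original):

from gensim.models.ldamulticore import LdaMulticore

# train the topic model on the streamed tf-idf matrix and save it
# alongside the other artefacts (ntopics comes from the command line above)
lda = LdaMulticore(mm, id2word=dictionary, num_topics=ntopics,
                   workers=3, passes=1)
lda.save(outp + '.lda')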
Example #3
	if wiki: # models will be trained on the Dutch Wikipedia corpus
		if os.path.exists(f_bow):
			corpus = WikiCorpus.load(f_bow)
		else:
			# download wikipedia training corpus (2015/10/14 18:45, 132MB)
			if not os.path.exists(f_corpus):
				wiki_lang, wiki_size, wiki_url = wikis[lang]
				if input("About to download {0} Wikipedia corpus ({1}). Do you want to proceed? (y/n) ".format(wiki_lang, wiki_size)).startswith("y"):
					util.download_file(wiki_url, f_corpus, progress=True)
				else:
					sys.exit()
			corpus = WikiCorpus(f_corpus)
#			corpus.save(f_bow)
	else: # models will be trained on your own corpus
		if os.path.exists(f_bow):
			corpus = TextCorpus.load(f_bow)
		else:
			corpus = TextCorpus(f_corpus)
#			corpus.save(f_bow)

	# filter dictionary
	corpus.dictionary.filter_extremes(no_below=0, no_above=1, keep_n=voc_size)
	corpus.dictionary.save(f_dict)
	corpus.save(f_bow)

	# tf-idf model
	if os.path.exists(f_tfidf):
		tfidf = TfidfModel.load(f_tfidf)
	else:
		tfidf = TfidfModel(corpus, id2word=corpus.dictionary)
		tfidf.save(f_tfidf)
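
Once loaded or trained, the tf-idf model is applied by indexing it with bag-of-words vectors; a minimal sketch using the corpus built above (the variable name corpus_tfidf is only illustrative):

	# stream each document as a list of (token_id, tf-idf weight) pairs
	corpus_tfidf = tfidf[corpus]
	first_doc = next(iter(corpus_tfidf))
	print(first_doc[:5])  # first few weighted terms of the first document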