def hashdictionary_corpus(dataframe, id_range=32000):
    """Return a HashDictionary mapping words to ids.

    Precomputed HashDictionaries are read from file if previously cached,
    or generated then cached otherwise.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        The DataFrame containing the documents to process.
    id_range : int
        The maximum number of ids available.

    Returns
    -------
    gensim.corpora.HashDictionary
        HashDictionary mapping words to ids.
    """
    filename = 'caches/models/dictionary_{}.model'.format(id_range)
    if os.path.isfile(filename):
        dictionary = HashDictionary.load(filename)
    else:
        # Ensure the cache directory exists; save() would otherwise raise
        # FileNotFoundError on a fresh checkout with no caches/ tree.
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        corpus = text_corpus(dataframe)
        dictionary = HashDictionary(corpus, id_range=id_range)
        dictionary.save(filename)
    return dictionary
tokens = list(filter(None, tokens)) return tokens class Corpus(object): def __iter__(self): for file in glob.glob("*.txt"): print(file) paper = Path(file).read_text(encoding='utf8') yield paper corpus_memory_friendly = Corpus() papers = list(corpus_memory_friendly) texts = [list(preprocess(t)) for t in papers] # define the dictionary: dictionary = Dictionary(texts) dictionary.save('reasoning_corpura.dict') corpus = [dictionary.doc2bow(text) for text in texts] MmCorpus.serialize('reasoning_bow.mm', corpus) hash_dictionary = HashDictionary(texts ) hash_dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) hash_dictionary.save_as_text('reasoning_wordids.txt.bz2') hash_dictionary.save('reasoning_corpura_hash.dict')
Download it from: https://www.kaggle.com/benhamner/nips-papers/downloads/papers.csv/2 """ papers = pd.read_csv('papers.csv') corpus = list(papers['paper_text']) print("corpus size: ", len(corpus)) # ToDo: check performance with lemmatization: gensim.utils.lemmatize tokenized_corpus = [[ utils.to_unicode(token) for token in utils.tokenize(corpus_item, lower=True, errors='ignore') ] for corpus_item in corpus] hash_dictionary = HashDictionary(tokenized_corpus) bow_corpus = [hash_dictionary.doc2bow(text) for text in corpus] MmCorpus.serialize('nips_bow.mm', bow_corpus, progress_cnt=10000) hash_dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) hash_dictionary.save_as_text('nips_wordids.txt.bz2') hash_dictionary.save('nips_corpura_hash.dict') dictionary = Dictionary(tokenized_corpus) dictionary.save('nips_corpura.dict')