def __init__(self, fname, fname_vocab=None):
    """
    Initialize the corpus from a file.

    `fname_vocab` is the file with vocabulary; if not specified, it defaults
    to `fname.vocab`.
    """
    IndexedCorpus.__init__(self, fname)
    logger.info("loading corpus from %s", fname)

    if fname_vocab is None:
        fname_base, _ = path.splitext(fname)
        fname_dir = path.dirname(fname)
        for fname_vocab in [
            utils.smart_extension(fname, '.vocab'),
            utils.smart_extension(fname, '/vocab.txt'),
            utils.smart_extension(fname_base, '.vocab'),
            utils.smart_extension(fname_dir, '/vocab.txt'),
        ]:
            if path.exists(fname_vocab):
                break
        else:
            raise IOError('BleiCorpus: could not find vocabulary file')

    self.fname = fname
    with utils.smart_open(fname_vocab) as fin:
        words = [utils.to_unicode(word).rstrip() for word in fin]
    self.id2word = dict(enumerate(words))
def load(cls, fname, *args, **kwargs):
    """
    Load a previously saved object from file (also see `save`).

    Large arrays can be memmap'ed back as read-only (shared memory) by setting `mmap='r'`:

        >>> LdaModel.load(fname, mmap='r')

    """
    kwargs['mmap'] = kwargs.get('mmap', None)
    result = super(LdaModel, cls).load(fname, *args, **kwargs)

    # check if `random_state` attribute has been set after main pickle load
    # if set -> the model to be loaded was saved using a >= 0.13.2 version of Gensim
    # if not set -> the model to be loaded was saved using a < 0.13.2 version of Gensim,
    # so set `random_state` as the default value
    if not hasattr(result, 'random_state'):
        result.random_state = utils.get_random_state(None)  # using default value `get_random_state(None)`
        logging.warning("random_state not set so using default value")

    state_fname = utils.smart_extension(fname, '.state')
    try:
        result.state = super(LdaModel, cls).load(state_fname, *args, **kwargs)
    except Exception as e:
        logging.warning("failed to load state from %s: %s", state_fname, e)

    id2word_fname = utils.smart_extension(fname, '.id2word')
    # check if `id2word_fname` file is present on disk
    # if present -> the model to be loaded was saved using a >= 0.13.2 version of Gensim,
    # so set `result.id2word` using the `id2word_fname` file
    # if not present -> the model to be loaded was saved using a < 0.13.2 version of Gensim,
    # so `result.id2word` already set after the main pickle load
    if os.path.isfile(id2word_fname):
        try:
            result.id2word = utils.unpickle(id2word_fname)
        except Exception as e:
            logging.warning("failed to load id2word dictionary from %s: %s", id2word_fname, e)
    return result
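# Usage sketch (not part of the library source; a minimal round-trip under the
# assumption of a recent gensim where this `load` lives on LdaModel). It shows
# which sidecar files the paired `save` derives with utils.smart_extension and
# how `load` picks them back up. Texts, paths and num_topics are illustrative only.
from gensim.corpora import Dictionary
from gensim.models import LdaModel

texts = [["human", "interface", "computer"], ["graph", "trees", "minors"]]
dictionary = Dictionary(texts)
bow_corpus = [dictionary.doc2bow(text) for text in texts]

lda = LdaModel(corpus=bow_corpus, id2word=dictionary, num_topics=2)
lda.save("model.lda")  # also writes model.lda.state (and model.lda.id2word in newer versions)
lda_loaded = LdaModel.load("model.lda", mmap='r')  # large arrays come back memmapped read-only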
def __init__(self, fname, fname_vocab=None):
    """
    Initialize the corpus from a file.

    `fname_vocab` is the file with vocabulary; if not specified, it defaults
    to `fname.vocab`.
    """
    IndexedCorpus.__init__(self, fname)
    logger.info("loading corpus from %s" % fname)

    if fname_vocab is None:
        fname_base, _ = path.splitext(fname)
        fname_dir = path.dirname(fname)
        for fname_vocab in [
            utils.smart_extension(fname, '.vocab'),
            utils.smart_extension(fname, '/vocab.txt'),
            utils.smart_extension(fname_base, '.vocab'),
            utils.smart_extension(fname_dir, '/vocab.txt'),
        ]:
            if path.exists(fname_vocab):
                break
        else:
            raise IOError('BleiCorpus: could not find vocabulary file')

    self.fname = fname
    with utils.smart_open(fname_vocab) as fin:
        words = [utils.to_unicode(word).rstrip() for word in fin]
    self.id2word = dict(enumerate(words))
def load(cls, fname, *args, **kwargs):
    """
    Load a previously saved object from file (also see `save`).

    Large arrays can be memmap'ed back as read-only (shared memory) by setting `mmap='r'`:

        >>> LdaModel.load(fname, mmap='r')

    """
    kwargs['mmap'] = kwargs.get('mmap', None)
    result = super(LdaModel, cls).load(fname, *args, **kwargs)

    state_fname = utils.smart_extension(fname, '.state')
    try:
        result.state = super(LdaModel, cls).load(state_fname, *args, **kwargs)
    except Exception as e:
        logging.warning("failed to load state from %s: %s", state_fname, e)

    id2word_fname = utils.smart_extension(fname, '.id2word')
    if os.path.isfile(id2word_fname):
        try:
            result.id2word = utils.unpickle(id2word_fname)
        except Exception as e:
            logging.warning("failed to load id2word dictionary from %s: %s", id2word_fname, e)
    else:
        result.id2word = None
    return result
def load(cls, fname, *args, **kwargs):
    """
    Load a previously saved object from file (also see `save`).

    Large arrays can be memmap'ed back as read-only (shared memory) by setting `mmap='r'`:

        >>> LdaModel.load(fname, mmap='r')

    """
    kwargs['mmap'] = kwargs.get('mmap', None)
    result = super(LdaModel, cls).load(fname, *args, **kwargs)

    state_fname = utils.smart_extension(fname, '.state')
    try:
        result.state = super(LdaModel, cls).load(state_fname, *args, **kwargs)
    except Exception as e:
        logging.warning("failed to load state from %s: %s", state_fname, e)

    id2word_fname = utils.smart_extension(fname, '.id2word')
    if os.path.isfile(id2word_fname):
        try:
            result.id2word = utils.unpickle(id2word_fname)
        except Exception as e:
            logging.warning(
                "failed to load id2word dictionary from %s: %s", id2word_fname, e)
    else:
        result.id2word = None
    return result
def save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False):
    """
    Save a corpus in the UCI Bag-of-Words format.

    There are actually two files saved: `fname` and `fname.vocab`, where
    `fname.vocab` is the vocabulary file.

    This function is automatically called by `UciCorpus.serialize`; don't
    call it directly, call `serialize` instead.
    """
    if id2word is None:
        logger.info("no word id mapping provided; initializing from corpus")
        id2word = utils.dict_from_corpus(corpus)
        num_terms = len(id2word)
    else:
        num_terms = 1 + max([-1] + id2word.keys())

    # write out vocabulary
    fname_vocab = utils.smart_extension(fname, '.vocab')
    logger.info("saving vocabulary of %i words to %s", num_terms, fname_vocab)
    with utils.smart_open(fname_vocab, 'wb') as fout:
        for featureid in xrange(num_terms):
            fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

    logger.info("storing corpus in UCI Bag-of-Words format: %s", fname)

    return UciWriter.write_corpus(fname, corpus, index=True, progress_cnt=progress_cnt)
def __init__(self, fname, fname_vocab=None):
    """
    Parameters
    ----------
    fname : str
        Path to corpus in UCI format.
    fname_vocab : str, optional
        Path to vocab.

    Examples
    --------
    >>> from gensim.corpora import UciCorpus
    >>> from gensim.test.utils import datapath
    >>>
    >>> corpus = UciCorpus(datapath('testcorpus.uci'))
    >>> for document in corpus:
    ...     pass

    """
    IndexedCorpus.__init__(self, fname)
    UciReader.__init__(self, fname)

    if fname_vocab is None:
        fname_vocab = utils.smart_extension(fname, '.vocab')

    self.fname = fname
    with utils.smart_open(fname_vocab) as fin:
        words = [word.strip() for word in fin]
    self.id2word = dict(enumerate(words))

    self.transposed = True
def save(self, fname, *args, **kwargs):
    """Save the model to a file.

    Notes
    -----
    Large internal arrays may be stored into separate files, with `fname` as prefix.

    Warnings
    --------
    Do not save as a compressed file if you intend to load the file back with `mmap`.

    Parameters
    ----------
    fname : str
        Path to output file.
    *args
        Variable length argument list, see :meth:`gensim.utils.SaveLoad.save`.
    **kwargs
        Arbitrary keyword arguments, see :meth:`gensim.utils.SaveLoad.save`.

    See Also
    --------
    :meth:`~gensim.models.lsimodel.LsiModel.load`

    """
    if self.projection is not None:
        self.projection.save(utils.smart_extension(fname, '.projection'), *args, **kwargs)
    super(LsiModel, self).save(fname, *args, ignore=['projection', 'dispatcher'], **kwargs)
def __init__(self, fname, index_fname=None):
    """
    Initialize this abstract base class, by loading a previously saved index
    from `index_fname` (or `fname.index` if `index_fname` is not set).
    This index will allow subclasses to support the `corpus[docno]` syntax
    (random access to document #`docno` in O(1)).

    >>> # save corpus in SvmLightCorpus format with an index
    >>> corpus = [[(1, 0.5)], [(0, 1.0), (1, 2.0)]]
    >>> gensim.corpora.SvmLightCorpus.serialize('testfile.svmlight', corpus)
    >>> # load back as a document stream (*not* plain Python list)
    >>> corpus_with_random_access = gensim.corpora.SvmLightCorpus('testfile.svmlight')
    >>> print(corpus_with_random_access[1])
    [(0, 1.0), (1, 2.0)]

    """
    try:
        if index_fname is None:
            index_fname = utils.smart_extension(fname, '.index')
        self.index = utils.unpickle(index_fname)
        # change self.index into a numpy.ndarray to support fancy indexing
        self.index = numpy.asarray(self.index)
        logger.info("loaded corpus index from %s" % index_fname)
    except:
        self.index = None
    self.length = None
def __init__(self, fname, index_fname=None):
    """
    Initialize this abstract base class, by loading a previously saved index
    from `index_fname` (or `fname.index` if `index_fname` is not set).
    This index will allow subclasses to support the `corpus[docno]` syntax
    (random access to document #`docno` in O(1)).

    >>> # save corpus in SvmLightCorpus format with an index
    >>> corpus = [[(1, 0.5)], [(0, 1.0), (1, 2.0)]]
    >>> gensim.corpora.SvmLightCorpus.serialize('testfile.svmlight', corpus)
    >>> # load back as a document stream (*not* plain Python list)
    >>> corpus_with_random_access = gensim.corpora.SvmLightCorpus('testfile.svmlight')
    >>> print(corpus_with_random_access[1])
    [(0, 1.0), (1, 2.0)]

    """
    try:
        if index_fname is None:
            index_fname = utils.smart_extension(fname, '.index')
        self.index = utils.unpickle(index_fname)
        # change self.index into a numpy.ndarray to support fancy indexing
        self.index = numpy.asarray(self.index)
        logger.info("loaded corpus index from %s", index_fname)
    except Exception:
        self.index = None
    self.length = None
def __init__(self, fname, fname_vocab=None):
    """
    Parameters
    ----------
    fname : str
        Path to corpus in UCI format.
    fname_vocab : str, optional
        Path to vocab.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.corpora import UciCorpus
        >>> from gensim.test.utils import datapath
        >>>
        >>> corpus = UciCorpus(datapath('testcorpus.uci'))
        >>> for document in corpus:
        ...     pass

    """
    IndexedCorpus.__init__(self, fname)
    UciReader.__init__(self, fname)

    if fname_vocab is None:
        fname_vocab = utils.smart_extension(fname, '.vocab')

    self.fname = fname
    with utils.open(fname_vocab, 'rb') as fin:
        words = [word.strip() for word in fin]
    self.id2word = dict(enumerate(words))

    self.transposed = True
def save_corpus(fname, corpus, id2word=None, metadata=False):
    """
    Save a corpus in the LDA-C format.

    There are actually two files saved: `fname` and `fname.vocab`, where
    `fname.vocab` is the vocabulary file.

    This function is automatically called by `BleiCorpus.serialize`; don't
    call it directly, call `serialize` instead.
    """
    if id2word is None:
        logger.info("no word id mapping provided; initializing from corpus")
        id2word = utils.dict_from_corpus(corpus)
        num_terms = len(id2word)
    else:
        num_terms = 1 + max([-1] + id2word.keys())

    logger.info("storing corpus in Blei's LDA-C format into %s", fname)
    with utils.smart_open(fname, 'wb') as fout:
        offsets = []
        for doc in corpus:
            doc = list(doc)
            offsets.append(fout.tell())
            parts = ["%i:%g" % p for p in doc if abs(p[1]) > 1e-7]
            fout.write(utils.to_utf8("%i %s\n" % (len(doc), ' '.join(parts))))

    # write out vocabulary, in a format compatible with Blei's topics.py script
    fname_vocab = utils.smart_extension(fname, '.vocab')
    logger.info("saving vocabulary of %i words to %s", num_terms, fname_vocab)
    with utils.smart_open(fname_vocab, 'wb') as fout:
        for featureid in xrange(num_terms):
            fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

    return offsets
def save(self, fname, *args, **kwargs):
    """
    Save the model to file.

    Large internal arrays may be stored into separate files, with `fname` as prefix.

    Note: do not save as a compressed file if you intend to load the file back with `mmap`.

    Note: If you intend to use models across Python 2/3 versions there are a few things to
    keep in mind:

      1. The pickled Python dictionaries will not work across Python versions
      2. The `save` method does not automatically save all NumPy arrays using NumPy, only
         those ones that exceed `sep_limit` set in `gensim.utils.SaveLoad.save`. The main
         concern here is the `alpha` array if for instance using `alpha='auto'`.

    Please refer to the wiki recipes section
    (https://github.com/piskvorky/gensim/wiki/Recipes-&-FAQ#q9-how-do-i-load-a-model-in-python-3-that-was-trained-and-saved-using-python-2)
    for an example on how to work around these issues.
    """
    if self.state is not None:
        self.state.save(utils.smart_extension(fname, '.state'), *args, **kwargs)
    super(LdaModel, self).save(fname, *args, ignore=['state', 'dispatcher'], **kwargs)
def serialize(serializer, fname, corpus, id2word=None, index_fname=None,
              progress_cnt=None, labels=None, metadata=False):
    """
    Iterate through the document stream `corpus`, saving the documents to `fname`
    and recording byte offset of each document. Save the resulting index
    structure to file `index_fname` (or `fname.index` if `index_fname` is not set).

    This relies on the underlying corpus class `serializer` providing (in
    addition to standard iteration):

    * `save_corpus` method that returns a sequence of byte offsets, one for
      each saved document,
    * the `docbyoffset(offset)` method, which returns a document
      positioned at `offset` bytes within the persistent storage (file).
    * metadata if set to true will ensure that serialize will write out
      article titles to a pickle file.

    Example:

    >>> MmCorpus.serialize('test.mm', corpus)
    >>> mm = MmCorpus('test.mm')  # `mm` document stream now has random access
    >>> print(mm[42])  # retrieve document no. 42, etc.
    """
    if getattr(corpus, 'fname', None) == fname:
        raise ValueError(
            "identical input vs. output corpus filename, refusing to serialize: %s" % fname)

    if index_fname is None:
        index_fname = utils.smart_extension(fname, '.index')

    kwargs = {'metadata': metadata}
    if progress_cnt is not None:
        kwargs['progress_cnt'] = progress_cnt

    if labels is not None:
        kwargs['labels'] = labels

    offsets = serializer.save_corpus(fname, corpus, id2word, **kwargs)

    if offsets is None:
        raise NotImplementedError(
            "Called serialize on class %s which doesn't support indexing!" % serializer.__name__)

    # store offsets persistently, using pickle
    # we shouldn't have to worry about self.index being a numpy.ndarray as the serializer will return
    # the offsets that are actually stored on disk - we're not storing self.index in any case, the
    # load just needs to turn whatever is loaded from disk back into a ndarray - this should also ensure
    # backwards compatibility
    logger.info("saving %s index to %s", serializer.__name__, index_fname)
    utils.pickle(offsets, index_fname)
def __init__(self, fname):
    super(IdMmCorpus, self).__init__(fname)
    try:
        dockeys_fname = utils.smart_extension(fname, '.dockeys')
        self.dockeys = utils.unpickle(dockeys_fname)
        self.key_to_index = {k: n for (n, k) in enumerate(self.dockeys)}
        logger.info("loaded dockey index from %s" % dockeys_fname)
    except:
        self.dockeys = None
def __init__(self, fname):
    super(IdMmCorpus, self).__init__(fname)
    try:
        dockeys_fname = utils.smart_extension(fname, '.dockeys')
        self.dockeys = utils.unpickle(dockeys_fname)
        self.key_to_index = {k: n for (n, k) in enumerate(self.dockeys)}
        logger.info("loaded dockey index from %s" % dockeys_fname)
    except:
        self.dockeys = None
def save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False):
    """Save a corpus in the UCI Bag-of-Words format.

    Warnings
    --------
    This function is automatically called by :meth:`gensim.corpora.ucicorpus.UciCorpus.serialize`,
    don't call it directly, call :meth:`gensim.corpora.ucicorpus.UciCorpus.serialize` instead.

    Parameters
    ----------
    fname : str
        Path to output file.
    corpus : iterable of iterable of (int, int)
        Corpus in BoW format.
    id2word : {dict of (int, str), :class:`gensim.corpora.dictionary.Dictionary`}, optional
        Mapping between words and their ids. If None - will be inferred from `corpus`.
    progress_cnt : int, optional
        Progress counter, write log message each `progress_cnt` documents.
    metadata : bool, optional
        THIS PARAMETER WILL BE IGNORED.

    Notes
    -----
    There are actually two files saved: `fname` and `fname.vocab`, where `fname.vocab` is the vocabulary file.

    """
    if id2word is None:
        logger.info("no word id mapping provided; initializing from corpus")
        id2word = utils.dict_from_corpus(corpus)
        num_terms = len(id2word)
    elif id2word:
        num_terms = 1 + max(id2word)
    else:
        num_terms = 0

    # write out vocabulary
    fname_vocab = utils.smart_extension(fname, '.vocab')
    logger.info("saving vocabulary of %i words to %s", num_terms, fname_vocab)
    with utils.smart_open(fname_vocab, 'wb') as fout:
        for featureid in range(num_terms):
            fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

    logger.info("storing corpus in UCI Bag-of-Words format: %s", fname)

    return UciWriter.write_corpus(fname, corpus, index=True, progress_cnt=progress_cnt)
def __init__(self, fname, fname_vocab=None):
    """
    Parameters
    ----------
    fname : str
        Path to corpus.
    fname_vocab : str, optional
        Vocabulary file. If `fname_vocab` is None, searching one of variants:

        * `fname`.vocab
        * `fname`/vocab.txt
        * `fname_without_ext`.vocab
        * `fname_folder`/vocab.txt

    Raises
    ------
    IOError
        If vocabulary file doesn't exist.

    """
    IndexedCorpus.__init__(self, fname)
    logger.info("loading corpus from %s", fname)

    if fname_vocab is None:
        fname_base, _ = path.splitext(fname)
        fname_dir = path.dirname(fname)
        for fname_vocab in [
            utils.smart_extension(fname, '.vocab'),
            utils.smart_extension(fname, '/vocab.txt'),
            utils.smart_extension(fname_base, '.vocab'),
            utils.smart_extension(fname_dir, '/vocab.txt'),
        ]:
            if path.exists(fname_vocab):
                break
        else:
            raise IOError('BleiCorpus: could not find vocabulary file')

    self.fname = fname
    with utils.open(fname_vocab, 'rb') as fin:
        words = [utils.to_unicode(word).rstrip() for word in fin]
    self.id2word = dict(enumerate(words))
def __init__(self, fname, fname_vocab=None):
    """
    Parameters
    ----------
    fname : str
        Path to corpus.
    fname_vocab : str, optional
        Vocabulary file. If `fname_vocab` is None, searching one of variants:

        * `fname`.vocab
        * `fname`/vocab.txt
        * `fname_without_ext`.vocab
        * `fname_folder`/vocab.txt

    Raises
    ------
    IOError
        If vocabulary file doesn't exist.

    """
    IndexedCorpus.__init__(self, fname)
    logger.info("loading corpus from %s", fname)

    if fname_vocab is None:
        fname_base, _ = path.splitext(fname)
        fname_dir = path.dirname(fname)
        for fname_vocab in [
            utils.smart_extension(fname, '.vocab'),
            utils.smart_extension(fname, '/vocab.txt'),
            utils.smart_extension(fname_base, '.vocab'),
            utils.smart_extension(fname_dir, '/vocab.txt'),
        ]:
            if path.exists(fname_vocab):
                break
        else:
            raise IOError('BleiCorpus: could not find vocabulary file')

    self.fname = fname
    with utils.smart_open(fname_vocab) as fin:
        words = [utils.to_unicode(word).rstrip() for word in fin]
    self.id2word = dict(enumerate(words))
def save_corpus(fname, corpus, id2word=None, metadata=False):
    """Save a corpus in the LDA-C format.

    Notes
    -----
    There are actually two files saved: `fname` and `fname.vocab`, where `fname.vocab` is the vocabulary file.

    Parameters
    ----------
    fname : str
        Path to output file.
    corpus : iterable of iterable of (int, float)
        Input corpus in BoW format.
    id2word : dict of (str, str), optional
        Mapping id -> word for `corpus`.
    metadata : bool, optional
        THIS PARAMETER WILL BE IGNORED.

    Returns
    -------
    list of int
        Offsets for each line in file (in bytes).

    """
    if id2word is None:
        logger.info("no word id mapping provided; initializing from corpus")
        id2word = utils.dict_from_corpus(corpus)
        num_terms = len(id2word)
    elif id2word:
        num_terms = 1 + max(id2word)
    else:
        num_terms = 0

    logger.info("storing corpus in Blei's LDA-C format into %s", fname)
    with utils.open(fname, 'wb') as fout:
        offsets = []
        for doc in corpus:
            doc = list(doc)
            offsets.append(fout.tell())
            parts = ["%i:%g" % p for p in doc if abs(p[1]) > 1e-7]
            fout.write(utils.to_utf8("%i %s\n" % (len(doc), ' '.join(parts))))

    # write out vocabulary, in a format compatible with Blei's topics.py script
    fname_vocab = utils.smart_extension(fname, '.vocab')
    logger.info("saving vocabulary of %i words to %s", num_terms, fname_vocab)
    with utils.open(fname_vocab, 'wb') as fout:
        for featureid in range(num_terms):
            fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

    return offsets
def save(self, fname, *args, **kwargs):
    """
    Save the model to file.

    Large internal arrays may be stored into separate files, with `fname` as prefix.

    Note: do not save as a compressed file if you intend to load the file back with `mmap`.
    """
    if self.projection is not None:
        self.projection.save(utils.smart_extension(fname, '.projection'), *args, **kwargs)
    super(LsiModel, self).save(fname, *args, ignore=['projection', 'dispatcher'], **kwargs)
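# Usage sketch (illustration only, not library source): the `save` above, paired
# with the corresponding `load`, keeps the projection in a separate
# '<fname>.projection' file derived via utils.smart_extension. The corpus, topic
# count and file name below are assumptions made for the example.
from gensim.corpora import Dictionary
from gensim.models import LsiModel

texts = [["human", "interface", "computer"], ["graph", "trees", "minors"]]
dictionary = Dictionary(texts)
bow_corpus = [dictionary.doc2bow(text) for text in texts]

lsi = LsiModel(corpus=bow_corpus, id2word=dictionary, num_topics=2)
lsi.save("model.lsi")                              # writes model.lsi and model.lsi.projection
lsi_loaded = LsiModel.load("model.lsi", mmap='r')  # projection is loaded from the sidecar file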
def __init__(self, fname, fname_vocab=None):
    IndexedCorpus.__init__(self, fname)
    UciReader.__init__(self, fname)

    if fname_vocab is None:
        fname_vocab = utils.smart_extension(fname, '.vocab')

    self.fname = fname
    with utils.smart_open(fname_vocab) as fin:
        words = [word.strip() for word in fin]
    self.id2word = dict(enumerate(words))

    self.transposed = True
def save_corpus(fname, corpus, id2word=None, metadata=False):
    """Save a corpus in the LDA-C format.

    Notes
    -----
    There are actually two files saved: `fname` and `fname.vocab`, where `fname.vocab` is the vocabulary file.

    Parameters
    ----------
    fname : str
        Path to output file.
    corpus : iterable of iterable of (int, float)
        Input corpus in BoW format.
    id2word : dict of (str, str), optional
        Mapping id -> word for `corpus`.
    metadata : bool, optional
        THIS PARAMETER WILL BE IGNORED.

    Returns
    -------
    list of int
        Offsets for each line in file (in bytes).

    """
    if id2word is None:
        logger.info("no word id mapping provided; initializing from corpus")
        id2word = utils.dict_from_corpus(corpus)
        num_terms = len(id2word)
    elif id2word:
        num_terms = 1 + max(id2word)
    else:
        num_terms = 0

    logger.info("storing corpus in Blei's LDA-C format into %s", fname)
    with utils.smart_open(fname, 'wb') as fout:
        offsets = []
        for doc in corpus:
            doc = list(doc)
            offsets.append(fout.tell())
            parts = ["%i:%g" % p for p in doc if abs(p[1]) > 1e-7]
            fout.write(utils.to_utf8("%i %s\n" % (len(doc), ' '.join(parts))))

    # write out vocabulary, in a format compatible with Blei's topics.py script
    fname_vocab = utils.smart_extension(fname, '.vocab')
    logger.info("saving vocabulary of %i words to %s", num_terms, fname_vocab)
    with utils.smart_open(fname_vocab, 'wb') as fout:
        for featureid in range(num_terms):
            fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

    return offsets
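# Usage sketch (illustration only; the corpus and paths are assumptions): callers
# go through BleiCorpus.serialize rather than save_corpus directly, which writes
# the LDA-C file plus the '<fname>.vocab' and '<fname>.index' sidecars produced
# via utils.smart_extension, enabling random access on reload.
from gensim.corpora import BleiCorpus

bow_corpus = [[(0, 1.0), (1, 2.0)], [(1, 0.5)]]
BleiCorpus.serialize("corpus.lda-c", bow_corpus)  # corpus.lda-c, corpus.lda-c.vocab, corpus.lda-c.index
loaded = BleiCorpus("corpus.lda-c")
print(loaded[1])                                  # random access through the pickled index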
def serialize(serializer, fname, corpus, id2word=None, index_fname=None,
              progress_cnt=None, labels=None, metadata=False, dockeys_fname=None):
    key_order = []

    def corpus_as_list():
        for k, c in corpus:
            key_order.append(k)
            yield c

    IndexedCorpus.serialize.im_func(
        serializer, fname, corpus_as_list(), id2word, index_fname, progress_cnt, labels, metadata)
    dockeys_fname = dockeys_fname or utils.smart_extension(fname, '.dockeys')
    utils.pickle(key_order, dockeys_fname)
def serialize(serializer, fname, corpus, id2word=None, index_fname=None,
              progress_cnt=None, labels=None, metadata=False):
    """
    Iterate through the document stream `corpus`, saving the documents to `fname`
    and recording byte offset of each document. Save the resulting index
    structure to file `index_fname` (or `fname.index` if `index_fname` is not set).

    This relies on the underlying corpus class `serializer` providing (in
    addition to standard iteration):

    * `save_corpus` method that returns a sequence of byte offsets, one for
      each saved document,
    * the `docbyoffset(offset)` method, which returns a document
      positioned at `offset` bytes within the persistent storage (file).

    Example:

    >>> MmCorpus.serialize('test.mm', corpus)
    >>> mm = MmCorpus('test.mm')  # `mm` document stream now has random access
    >>> print(mm[42])  # retrieve document no. 42, etc.
    """
    if getattr(corpus, 'fname', None) == fname:
        raise ValueError("identical input vs. output corpus filename, refusing to serialize: %s" % fname)

    if index_fname is None:
        index_fname = utils.smart_extension(fname, '.index')

    if progress_cnt is not None:
        if labels is not None:
            offsets = serializer.save_corpus(fname, corpus, id2word, labels=labels, progress_cnt=progress_cnt, metadata=metadata)
        else:
            offsets = serializer.save_corpus(fname, corpus, id2word, progress_cnt=progress_cnt, metadata=metadata)
    else:
        if labels is not None:
            offsets = serializer.save_corpus(fname, corpus, id2word, labels=labels, metadata=metadata)
        else:
            offsets = serializer.save_corpus(fname, corpus, id2word, metadata=metadata)

    if offsets is None:
        raise NotImplementedError("called serialize on class %s which doesn't support indexing!" % serializer.__name__)

    # store offsets persistently, using pickle
    # we shouldn't have to worry about self.index being a numpy.ndarray as the serializer will return
    # the offsets that are actually stored on disk - we're not storing self.index in any case, the
    # load just needs to turn whatever is loaded from disk back into a ndarray - this should also ensure
    # backwards compatibility
    logger.info("saving %s index to %s" % (serializer.__name__, index_fname))
    utils.pickle(offsets, index_fname)
def load(cls, fname, *args, **kwargs):
    """
    Load a previously saved object from file (also see `save`).

    Large arrays can be memmap'ed back as read-only (shared memory) by setting `mmap='r'`:

        >>> LsiModel.load(fname, mmap='r')

    """
    kwargs['mmap'] = kwargs.get('mmap', None)
    result = super(LsiModelAltered, cls).load(fname, *args, **kwargs)
    projection_fname = utils.smart_extension(fname, '.projection')
    try:
        result.projection = super(LsiModelAltered, cls).load(projection_fname, *args, **kwargs)
    except Exception as e:
        logging.warning("failed to load projection from %s: %s" % (projection_fname, e))
    return result
def load(cls, fname, *args, **kwargs):
    """
    Load a previously saved object from file (also see `save`).

    Large arrays can be memmap'ed back as read-only (shared memory) by setting `mmap='r'`:

        >>> LdaModel.load(fname, mmap='r')

    """
    kwargs["mmap"] = kwargs.get("mmap", None)
    result = super(LdaModel, cls).load(fname, *args, **kwargs)
    state_fname = utils.smart_extension(fname, ".state")
    try:
        result.state = super(LdaModel, cls).load(state_fname, *args, **kwargs)
    except Exception as e:
        logging.warning("failed to load state from %s: %s", state_fname, e)
    return result
def load(cls, fname, *args, **kwargs):
    """
    Load a previously saved object from file (also see `save`).

    Large arrays can be memmap'ed back as read-only (shared memory) by setting `mmap='r'`:

        >>> LsiModel.load(fname, mmap='r')

    """
    kwargs['mmap'] = kwargs.get('mmap', None)
    result = super(LsiModel, cls).load(fname, *args, **kwargs)
    projection_fname = utils.smart_extension(fname, '.projection')
    try:
        result.projection = super(LsiModel, cls).load(projection_fname, *args, **kwargs)
    except Exception as e:
        logging.warning("failed to load projection from %s: %s" % (projection_fname, e))
    return result
def save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False):
    """Save a corpus in the UCI Bag-of-Words format.

    Warnings
    --------
    This function is automatically called by :meth:`gensim.corpora.ucicorpus.UciCorpus.serialize`,
    don't call it directly, call :meth:`gensim.corpora.ucicorpus.UciCorpus.serialize` instead.

    Parameters
    ----------
    fname : str
        Path to output file.
    corpus : iterable of iterable of (int, int)
        Corpus in BoW format.
    id2word : {dict of (int, str), :class:`gensim.corpora.dictionary.Dictionary`}, optional
        Mapping between words and their ids. If None - will be inferred from `corpus`.
    progress_cnt : int, optional
        Progress counter, write log message each `progress_cnt` documents.
    metadata : bool, optional
        THIS PARAMETER WILL BE IGNORED.

    Notes
    -----
    There are actually two files saved: `fname` and `fname.vocab`, where `fname.vocab` is the vocabulary file.

    """
    if id2word is None:
        logger.info("no word id mapping provided; initializing from corpus")
        id2word = utils.dict_from_corpus(corpus)
        num_terms = len(id2word)
    elif id2word:
        num_terms = 1 + max(id2word)
    else:
        num_terms = 0

    # write out vocabulary
    fname_vocab = utils.smart_extension(fname, '.vocab')
    logger.info("saving vocabulary of %i words to %s", num_terms, fname_vocab)
    with utils.smart_open(fname_vocab, 'wb') as fout:
        for featureid in range(num_terms):
            fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

    logger.info("storing corpus in UCI Bag-of-Words format: %s", fname)

    return UciWriter.write_corpus(fname, corpus, index=True, progress_cnt=progress_cnt)
def load(cls, fname, *args, **kwargs):
    """Load a previously saved object using :meth:`~gensim.models.lsimodel.LsiModel.save` from file.

    Notes
    -----
    Large arrays can be memmap'ed back as read-only (shared memory) by setting the `mmap='r'` parameter.

    Parameters
    ----------
    fname : str
        Path to file that contains LsiModel.
    *args
        Variable length argument list, see :meth:`gensim.utils.SaveLoad.load`.
    **kwargs
        Arbitrary keyword arguments, see :meth:`gensim.utils.SaveLoad.load`.

    See Also
    --------
    :meth:`~gensim.models.lsimodel.LsiModel.save`

    Returns
    -------
    :class:`~gensim.models.lsimodel.LsiModel`
        Loaded instance.

    Raises
    ------
    IOError
        When methods are called on instance (should be called from class).

    """
    kwargs['mmap'] = kwargs.get('mmap', None)
    result = super(LsiModel, cls).load(fname, *args, **kwargs)
    projection_fname = utils.smart_extension(fname, '.projection')
    try:
        result.projection = super(LsiModel, cls).load(projection_fname, *args, **kwargs)
    except Exception as e:
        logging.warning("failed to load projection from %s: %s", projection_fname, e)
    return result
def save(self, fname, ignore=['state', 'dispatcher'], *args, **kwargs):
    """
    Save the model to file.

    Large internal arrays may be stored into separate files, with `fname` as prefix.

    `separately` can be used to define which arrays should be stored in separate files.

    `ignore` parameter can be used to define which variables should be ignored, i.e. left
    out from the pickled lda model. By default the internal `state` is ignored as it uses
    its own serialisation not the one provided by `LdaModel`. The `state` and `dispatcher`
    will be added to any ignore parameter defined.

    Note: do not save as a compressed file if you intend to load the file back with `mmap`.

    Note: If you intend to use models across Python 2/3 versions there are a few things to
    keep in mind:

      1. The pickled Python dictionaries will not work across Python versions
      2. The `save` method does not automatically save all NumPy arrays using NumPy, only
         those ones that exceed `sep_limit` set in `gensim.utils.SaveLoad.save`. The main
         concern here is the `alpha` array if for instance using `alpha='auto'`.

    Please refer to the wiki recipes section
    (https://github.com/piskvorky/gensim/wiki/Recipes-&-FAQ#q9-how-do-i-load-a-model-in-python-3-that-was-trained-and-saved-using-python-2)
    for an example on how to work around these issues.
    """
    if self.state is not None:
        self.state.save(utils.smart_extension(fname, '.state'), *args, **kwargs)

    # make sure 'state' and 'dispatcher' are ignored from the pickled object, even if
    # someone sets the ignore list themselves
    if ignore is not None and ignore:
        if isinstance(ignore, six.string_types):
            ignore = [ignore]
        ignore = [e for e in ignore if e]  # make sure None and '' are not in the list
        ignore = list(set(['state', 'dispatcher']) | set(ignore))
    else:
        ignore = ['state', 'dispatcher']
    super(LdaModel, self).save(fname, *args, ignore=ignore, **kwargs)
def __init__(self, fname, index_fname=None):
    """
    Parameters
    ----------
    fname : str
        Path to corpus.
    index_fname : str, optional
        Path to the index; if not provided, `fname.index` is used.

    """
    try:
        if index_fname is None:
            index_fname = utils.smart_extension(fname, '.index')
        self.index = utils.unpickle(index_fname)
        # change self.index into a numpy.ndarray to support fancy indexing
        self.index = numpy.asarray(self.index)
        logger.info("loaded corpus index from %s", index_fname)
    except Exception:
        self.index = None
    self.length = None
def save(self, fname, ignore=['state', 'dispatcher'], *args, **kwargs):
    """
    Save the model to file.

    Large internal arrays may be stored into separate files, with `fname` as prefix.

    `separately` can be used to define which arrays should be stored in separate files.

    `ignore` parameter can be used to define which variables should be ignored, i.e. left
    out from the pickled lda model. By default the internal `state` is ignored as it uses
    its own serialisation not the one provided by `LdaModel`. The `state` and `dispatcher`
    will be added to any ignore parameter defined.

    Note: do not save as a compressed file if you intend to load the file back with `mmap`.

    Note: If you intend to use models across Python 2/3 versions there are a few things to
    keep in mind:

      1. The pickled Python dictionaries will not work across Python versions
      2. The `save` method does not automatically save all NumPy arrays using NumPy, only
         those ones that exceed `sep_limit` set in `gensim.utils.SaveLoad.save`. The main
         concern here is the `alpha` array if for instance using `alpha='auto'`.

    Please refer to the wiki recipes section
    (https://github.com/piskvorky/gensim/wiki/Recipes-&-FAQ#q9-how-do-i-load-a-model-in-python-3-that-was-trained-and-saved-using-python-2)
    for an example on how to work around these issues.
    """
    if self.state is not None:
        self.state.save(utils.smart_extension(fname, '.state'), *args, **kwargs)

    # make sure 'state' and 'dispatcher' are ignored from the pickled object, even if
    # someone sets the ignore list themselves
    if ignore is not None and ignore:
        if isinstance(ignore, six.string_types):
            ignore = [ignore]
        ignore = [e for e in ignore if e]  # make sure None and '' are not in the list
        ignore = list(set(['state', 'dispatcher']) | set(ignore))
    else:
        ignore = ['state', 'dispatcher']
    super(LdaModel, self).save(fname, *args, ignore=ignore, **kwargs)
def serialize(serializer, fname, corpus, id2word=None, index_fname=None,
              progress_cnt=None, labels=None, metadata=False, dockeys_fname=None):
    key_order = []

    def corpus_as_list():
        for k, c in corpus:
            key_order.append(k)
            yield c

    IndexedCorpus.serialize.__func__(
        serializer, fname, corpus_as_list(), id2word, index_fname, progress_cnt, labels, metadata)
    dockeys_fname = dockeys_fname or utils.smart_extension(fname, '.dockeys')
    utils.pickle(key_order, dockeys_fname)
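# Usage sketch (assumptions throughout -- IdMmCorpus is not a gensim class, so its
# import location is unknown; this only illustrates the (key, document) protocol
# implied by the serialize/__init__ snippets above, kept as comments rather than
# runnable code):
#
#   keyed_corpus = [("doc-a", [(0, 1.0), (1, 2.0)]), ("doc-b", [(1, 0.5)])]
#   IdMmCorpus.serialize("corpus.mm", keyed_corpus)  # writes corpus.mm, corpus.mm.index, corpus.mm.dockeys
#   corpus = IdMmCorpus("corpus.mm")
#   doc = corpus[corpus.key_to_index["doc-b"]]       # random access by document key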
def save_corpus(fname, corpus, id2word=None, metadata=False):
    """
    Save a corpus in the LDA-C format.

    There are actually two files saved: `fname` and `fname.vocab`, where
    `fname.vocab` is the vocabulary file.

    This function is automatically called by `BleiCorpus.serialize`; don't
    call it directly, call `serialize` instead.
    """
    if id2word is None:
        logger.info("no word id mapping provided; initializing from corpus")
        id2word = utils.dict_from_corpus(corpus)
        num_terms = len(id2word)
    else:
        num_terms = 1 + max([-1] + id2word.keys())

    logger.info("storing corpus in Blei's LDA-C format into %s" % fname)
    with utils.smart_open(fname, 'wb') as fout:
        offsets = []
        for doc in corpus:
            doc = list(doc)
            offsets.append(fout.tell())
            parts = ["%i:%g" % p for p in doc if abs(p[1]) > 1e-7]
            fout.write(utils.to_utf8("%i %s\n" % (len(doc), ' '.join(parts))))

    # write out vocabulary, in a format compatible with Blei's topics.py script
    fname_vocab = utils.smart_extension(fname, '.vocab')
    logger.info("saving vocabulary of %i words to %s" % (num_terms, fname_vocab))
    with utils.smart_open(fname_vocab, 'wb') as fout:
        for featureid in xrange(num_terms):
            fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

    return offsets
def save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False):
    """
    Save a corpus in the UCI Bag-of-Words format.

    There are actually two files saved: `fname` and `fname.vocab`, where
    `fname.vocab` is the vocabulary file.

    This function is automatically called by `UciCorpus.serialize`; don't
    call it directly, call `serialize` instead.
    """
    if id2word is None:
        logger.info("no word id mapping provided; initializing from corpus")
        id2word = utils.dict_from_corpus(corpus)
        num_terms = len(id2word)
    else:
        num_terms = 1 + max([-1] + list(id2word))

    # write out vocabulary
    fname_vocab = utils.smart_extension(fname, '.vocab')
    logger.info("saving vocabulary of %i words to %s", num_terms, fname_vocab)
    with utils.smart_open(fname_vocab, 'wb') as fout:
        for featureid in xrange(num_terms):
            fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

    logger.info("storing corpus in UCI Bag-of-Words format: %s", fname)

    return UciWriter.write_corpus(fname, corpus, index=True, progress_cnt=progress_cnt)
def serialize(serializer, fname, corpus, id2word=None, index_fname=None,
              progress_cnt=None, labels=None, metadata=False):
    """Serialize corpus with offset metadata, allows to use direct indexes after loading.

    Parameters
    ----------
    fname : str
        Path to output file.
    corpus : iterable of iterable of (int, float)
        Corpus in BoW format.
    id2word : dict of (str, str), optional
        Mapping id -> word.
    index_fname : str, optional
        Where to save resulting index, if None - store index to `fname`.index.
    progress_cnt : int, optional
        Number of documents after which progress info is printed.
    labels : bool, optional
        If True - ignore first column (class labels).
    metadata : bool, optional
        If True - ensure that serialize will write out article titles to a pickle file.

    Examples
    --------
    >>> from gensim.corpora import MmCorpus
    >>> from gensim.test.utils import get_tmpfile
    >>>
    >>> corpus = [[(1, 0.3), (2, 0.1)], [(1, 0.1)], [(2, 0.3)]]
    >>> output_fname = get_tmpfile("test.mm")
    >>>
    >>> MmCorpus.serialize(output_fname, corpus)
    >>> mm = MmCorpus(output_fname)  # `mm` document stream now has random access
    >>> print(mm[1])  # retrieve document no. 1
    [(1, 0.1)]

    """
    if getattr(corpus, 'fname', None) == fname:
        raise ValueError("identical input vs. output corpus filename, refusing to serialize: %s" % fname)

    if index_fname is None:
        index_fname = utils.smart_extension(fname, '.index')

    kwargs = {'metadata': metadata}
    if progress_cnt is not None:
        kwargs['progress_cnt'] = progress_cnt

    if labels is not None:
        kwargs['labels'] = labels

    offsets = serializer.save_corpus(fname, corpus, id2word, **kwargs)

    if offsets is None:
        raise NotImplementedError(
            "Called serialize on class %s which doesn't support indexing!" % serializer.__name__
        )

    # store offsets persistently, using pickle
    # we shouldn't have to worry about self.index being a numpy.ndarray as the serializer will return
    # the offsets that are actually stored on disk - we're not storing self.index in any case, the
    # load just needs to turn whatever is loaded from disk back into a ndarray - this should also ensure
    # backwards compatibility
    logger.info("saving %s index to %s", serializer.__name__, index_fname)
    utils.pickle(offsets, index_fname)
def serialize(serializer, fname, corpus, id2word=None, index_fname=None,
              progress_cnt=None, labels=None, metadata=False):
    """Serialize corpus with offset metadata, allows to use direct indexes after loading.

    Parameters
    ----------
    fname : str
        Path to output file.
    corpus : iterable of iterable of (int, float)
        Corpus in BoW format.
    id2word : dict of (str, str), optional
        Mapping id -> word.
    index_fname : str, optional
        Where to save resulting index, if None - store index to `fname`.index.
    progress_cnt : int, optional
        Number of documents after which progress info is printed.
    labels : bool, optional
        If True - ignore first column (class labels).
    metadata : bool, optional
        If True - ensure that serialize will write out article titles to a pickle file.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.corpora import MmCorpus
        >>> from gensim.test.utils import get_tmpfile
        >>>
        >>> corpus = [[(1, 0.3), (2, 0.1)], [(1, 0.1)], [(2, 0.3)]]
        >>> output_fname = get_tmpfile("test.mm")
        >>>
        >>> MmCorpus.serialize(output_fname, corpus)
        >>> mm = MmCorpus(output_fname)  # `mm` document stream now has random access
        >>> print(mm[1])  # retrieve document no. 1
        [(1, 0.1)]

    """
    if getattr(corpus, 'fname', None) == fname:
        raise ValueError(
            "identical input vs. output corpus filename, refusing to serialize: %s" % fname)

    if index_fname is None:
        index_fname = utils.smart_extension(fname, '.index')

    kwargs = {'metadata': metadata}
    if progress_cnt is not None:
        kwargs['progress_cnt'] = progress_cnt

    if labels is not None:
        kwargs['labels'] = labels

    offsets = serializer.save_corpus(fname, corpus, id2word, **kwargs)

    if offsets is None:
        raise NotImplementedError(
            "Called serialize on class %s which doesn't support indexing!" % serializer.__name__)

    # store offsets persistently, using pickle
    # we shouldn't have to worry about self.index being a numpy.ndarray as the serializer will return
    # the offsets that are actually stored on disk - we're not storing self.index in any case, the
    # load just needs to turn whatever is loaded from disk back into a ndarray - this should also ensure
    # backwards compatibility
    logger.info("saving %s index to %s", serializer.__name__, index_fname)
    utils.pickle(offsets, index_fname)
def save(self, fname, ignore=['state', 'dispatcher'], separately=None, *args, **kwargs):
    """
    Save the model to file.

    Large internal arrays may be stored into separate files, with `fname` as prefix.

    `separately` can be used to define which arrays should be stored in separate files.

    `ignore` parameter can be used to define which variables should be ignored, i.e. left
    out from the pickled lda model. By default the internal `state` is ignored as it uses
    its own serialisation not the one provided by `LdaModel`. The `state` and `dispatcher`
    will be added to any ignore parameter defined.

    Note: do not save as a compressed file if you intend to load the file back with `mmap`.

    Note: If you intend to use models across Python 2/3 versions there are a few things to
    keep in mind:

      1. The pickled Python dictionaries will not work across Python versions
      2. The `save` method does not automatically save all np arrays using np, only
         those ones that exceed `sep_limit` set in `gensim.utils.SaveLoad.save`. The main
         concern here is the `alpha` array if for instance using `alpha='auto'`.

    Please refer to the wiki recipes section
    (https://github.com/piskvorky/gensim/wiki/Recipes-&-FAQ#q9-how-do-i-load-a-model-in-python-3-that-was-trained-and-saved-using-python-2)
    for an example on how to work around these issues.
    """
    if self.state is not None:
        self.state.save(utils.smart_extension(fname, '.state'), *args, **kwargs)

    # Save the dictionary separately if not in 'ignore'.
    if 'id2word' not in ignore:
        utils.pickle(self.id2word, utils.smart_extension(fname, '.id2word'))

    # make sure 'state', 'id2word' and 'dispatcher' are ignored from the pickled object, even if
    # someone sets the ignore list themselves
    if ignore is not None and ignore:
        if isinstance(ignore, six.string_types):
            ignore = [ignore]
        ignore = [e for e in ignore if e]  # make sure None and '' are not in the list
        ignore = list(set(['state', 'dispatcher', 'id2word']) | set(ignore))
    else:
        ignore = ['state', 'dispatcher', 'id2word']

    # make sure 'expElogbeta' and 'sstats' are ignored from the pickled object, even if
    # someone sets the separately list themselves.
    separately_explicit = ['expElogbeta', 'sstats']
    # Also add 'alpha' and 'eta' to separately list if they are set 'auto' or some
    # array manually.
    if (isinstance(self.alpha, six.string_types) and self.alpha == 'auto') or \
            (isinstance(self.alpha, np.ndarray) and len(self.alpha.shape) != 1):
        separately_explicit.append('alpha')
    if (isinstance(self.eta, six.string_types) and self.eta == 'auto') or \
            (isinstance(self.eta, np.ndarray) and len(self.eta.shape) != 1):
        separately_explicit.append('eta')
    # Merge separately_explicit with separately.
    if separately:
        if isinstance(separately, six.string_types):
            separately = [separately]
        separately = [e for e in separately if e]  # make sure None and '' are not in the list
        separately = list(set(separately_explicit) | set(separately))
    else:
        separately = separately_explicit
    super(LdaModel, self).save(fname, ignore=ignore, separately=separately, *args, **kwargs)
def serialize(serializer, fname, corpus, id2word=None, index_fname=None,
              progress_cnt=None, labels=None, metadata=False):
    """
    Iterate through the document stream `corpus`, saving the documents to `fname`
    and recording byte offset of each document. Save the resulting index
    structure to file `index_fname` (or `fname.index` if `index_fname` is not set).

    This relies on the underlying corpus class `serializer` providing (in
    addition to standard iteration):

    * `save_corpus` method that returns a sequence of byte offsets, one for
      each saved document,
    * the `docbyoffset(offset)` method, which returns a document
      positioned at `offset` bytes within the persistent storage (file).

    Example:

    >>> MmCorpus.serialize('test.mm', corpus)
    >>> mm = MmCorpus('test.mm')  # `mm` document stream now has random access
    >>> print(mm[42])  # retrieve document no. 42, etc.
    """
    if getattr(corpus, 'fname', None) == fname:
        raise ValueError(
            "identical input vs. output corpus filename, refusing to serialize: %s" % fname)

    if index_fname is None:
        index_fname = utils.smart_extension(fname, '.index')

    if progress_cnt is not None:
        if labels is not None:
            offsets = serializer.save_corpus(fname, corpus, id2word, labels=labels, progress_cnt=progress_cnt, metadata=metadata)
        else:
            offsets = serializer.save_corpus(fname, corpus, id2word, progress_cnt=progress_cnt, metadata=metadata)
    else:
        if labels is not None:
            offsets = serializer.save_corpus(fname, corpus, id2word, labels=labels, metadata=metadata)
        else:
            offsets = serializer.save_corpus(fname, corpus, id2word, metadata=metadata)

    if offsets is None:
        raise NotImplementedError(
            "called serialize on class %s which doesn't support indexing!" % serializer.__name__)

    # store offsets persistently, using pickle
    logger.info("saving %s index to %s" % (serializer.__name__, index_fname))
    utils.pickle(offsets, index_fname)
def save(self, fname, ignore=['state', 'dispatcher'], separately=None, *args, **kwargs):
    """
    Save the model to file.

    Large internal arrays may be stored into separate files, with `fname` as prefix.

    `separately` can be used to define which arrays should be stored in separate files.

    `ignore` parameter can be used to define which variables should be ignored, i.e. left
    out from the pickled lda model. By default the internal `state` is ignored as it uses
    its own serialisation not the one provided by `LdaModel`. The `state` and `dispatcher`
    will be added to any ignore parameter defined.

    Note: do not save as a compressed file if you intend to load the file back with `mmap`.

    Note: If you intend to use models across Python 2/3 versions there are a few things to
    keep in mind:

      1. The pickled Python dictionaries will not work across Python versions
      2. The `save` method does not automatically save all np arrays using np, only
         those ones that exceed `sep_limit` set in `gensim.utils.SaveLoad.save`. The main
         concern here is the `alpha` array if for instance using `alpha='auto'`.

    Please refer to the wiki recipes section
    (https://github.com/piskvorky/gensim/wiki/Recipes-&-FAQ#q9-how-do-i-load-a-model-in-python-3-that-was-trained-and-saved-using-python-2)
    for an example on how to work around these issues.
    """
    if self.state is not None:
        self.state.save(utils.smart_extension(fname, '.state'), *args, **kwargs)

    # Save the dictionary separately if not in 'ignore'.
    if 'id2word' not in ignore:
        utils.pickle(self.id2word, utils.smart_extension(fname, '.id2word'))

    # make sure 'state', 'id2word' and 'dispatcher' are ignored from the pickled object, even if
    # someone sets the ignore list themselves
    if ignore is not None and ignore:
        if isinstance(ignore, six.string_types):
            ignore = [ignore]
        ignore = [e for e in ignore if e]  # make sure None and '' are not in the list
        ignore = list(set(['state', 'dispatcher', 'id2word']) | set(ignore))
    else:
        ignore = ['state', 'dispatcher', 'id2word']

    # make sure 'expElogbeta' and 'sstats' are ignored from the pickled object, even if
    # someone sets the separately list themselves.
    separately_explicit = ['expElogbeta', 'sstats']
    # Also add 'alpha' and 'eta' to separately list if they are set 'auto' or some
    # array manually.
    if (isinstance(self.alpha, six.string_types) and self.alpha == 'auto') or len(self.alpha.shape) != 1:
        separately_explicit.append('alpha')
    if (isinstance(self.eta, six.string_types) and self.eta == 'auto') or len(self.eta.shape) != 1:
        separately_explicit.append('eta')
    # Merge separately_explicit with separately.
    if separately:
        if isinstance(separately, six.string_types):
            separately = [separately]
        separately = [e for e in separately if e]  # make sure None and '' are not in the list
        separately = list(set(separately_explicit) | set(separately))
    else:
        separately = separately_explicit
    super(LdaModel, self).save(fname, ignore=ignore, separately=separately, *args, **kwargs)