Example #1
    def __init__(self, fname, fname_vocab=None):
        """
        Initialize the corpus from a file.

        `fname_vocab` is the file with vocabulary; if not specified, it defaults to
        `fname.vocab`.
        """
        IndexedCorpus.__init__(self, fname)
        logger.info("loading corpus from %s", fname)

        if fname_vocab is None:
            fname_base, _ = path.splitext(fname)
            fname_dir = path.dirname(fname)
            for fname_vocab in [
                        utils.smart_extension(fname, '.vocab'),
                        utils.smart_extension(fname, '/vocab.txt'),
                        utils.smart_extension(fname_base, '.vocab'),
                        utils.smart_extension(fname_dir, '/vocab.txt'),
                        ]:
                if path.exists(fname_vocab):
                    break
            else:
                raise IOError('BleiCorpus: could not find vocabulary file')

        self.fname = fname
        with utils.smart_open(fname_vocab) as fin:
            words = [utils.to_unicode(word).rstrip() for word in fin]
        self.id2word = dict(enumerate(words))
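
A minimal usage sketch for the constructor above; the paths are hypothetical, and the vocabulary argument can be omitted thanks to the auto-detection shown:

    from gensim.corpora import BleiCorpus

    corpus = BleiCorpus('/data/corpus.lda-c', fname_vocab='/data/corpus.lda-c.vocab')
    for doc in corpus:
        print(doc[:5])          # each document is a list of (word_id, count) pairs
    print(corpus.id2word[0])    # first vocabulary entry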
Example #2
    def load(cls, fname, *args, **kwargs):
        """
        Load a previously saved object from file (also see `save`).

        Large arrays can be memmap'ed back as read-only (shared memory) by setting `mmap='r'`:

            >>> LdaModel.load(fname, mmap='r')

        """
        kwargs['mmap'] = kwargs.get('mmap', None)
        result = super(LdaModel, cls).load(fname, *args, **kwargs)

        # check if `random_state` attribute has been set after main pickle load
        # if set -> the model to be loaded was saved using a >= 0.13.2 version of Gensim
        # if not set -> the model to be loaded was saved using a < 0.13.2 version of Gensim, so set `random_state` as the default value
        if not hasattr(result, 'random_state'):
            result.random_state = utils.get_random_state(None)  # using default value `get_random_state(None)`
            logging.warning("random_state not set so using default value")

        state_fname = utils.smart_extension(fname, '.state')
        try:
            result.state = super(LdaModel, cls).load(state_fname, *args, **kwargs)
        except Exception as e:
            logging.warning("failed to load state from %s: %s", state_fname, e)

        id2word_fname = utils.smart_extension(fname, '.id2word')
        # check if `id2word_fname` file is present on disk
        # if present -> the model to be loaded was saved using a >= 0.13.2 version of Gensim, so set `result.id2word` using the `id2word_fname` file
        # if not present -> the model to be loaded was saved using a < 0.13.2 version of Gensim, so `result.id2word` already set after the main pickle load
        if os.path.isfile(id2word_fname):
            try:
                result.id2word = utils.unpickle(id2word_fname)
            except Exception as e:
                logging.warning("failed to load id2word dictionary from %s: %s", id2word_fname, e)
        return result
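
A round-trip sketch of the load pattern documented above (toy data, hypothetical path):

    from gensim.corpora import Dictionary
    from gensim.models import LdaModel

    texts = [['human', 'computer', 'interface'], ['graph', 'trees', 'minors']]
    dictionary = Dictionary(texts)
    bow = [dictionary.doc2bow(text) for text in texts]

    model = LdaModel(bow, id2word=dictionary, num_topics=2)
    model.save('/tmp/lda.model')  # this version also writes .state and .id2word side files
    loaded = LdaModel.load('/tmp/lda.model', mmap='r')  # large arrays memmap'ed read-only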
Example #3
    def load(cls, fname, *args, **kwargs):
        """
        Load a previously saved object from file (also see `save`).

        Large arrays can be memmap'ed back as read-only (shared memory) by setting `mmap='r'`:

            >>> LdaModel.load(fname, mmap='r')

        """
        kwargs['mmap'] = kwargs.get('mmap', None)
        result = super(LdaModel, cls).load(fname, *args, **kwargs)
        state_fname = utils.smart_extension(fname, '.state')
        try:
            result.state = super(LdaModel, cls).load(state_fname, *args, **kwargs)
        except Exception as e:
            logging.warning("failed to load state from %s: %s", state_fname, e)
        id2word_fname = utils.smart_extension(fname, '.id2word')
        if os.path.isfile(id2word_fname):
            try:
                result.id2word = utils.unpickle(id2word_fname)
            except Exception as e:
                logging.warning("failed to load id2word dictionary from %s: %s", id2word_fname, e)
        else:
            result.id2word = None
        return result
Example #4
    def save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False):
        """
        Save a corpus in the UCI Bag-of-Words format.

        There are actually two files saved: `fname` and `fname.vocab`, where
        `fname.vocab` is the vocabulary file.

        This function is automatically called by `UciCorpus.serialize`; don't
        call it directly, call `serialize` instead.
        """
        if id2word is None:
            logger.info("no word id mapping provided; initializing from corpus")
            id2word = utils.dict_from_corpus(corpus)
            num_terms = len(id2word)
        else:
            num_terms = 1 + max([-1] + list(id2word))  # list() so this also works on Python 3

        # write out vocabulary
        fname_vocab = utils.smart_extension(fname, '.vocab')
        logger.info("saving vocabulary of %i words to %s", num_terms, fname_vocab)
        with utils.smart_open(fname_vocab, 'wb') as fout:
            for featureid in range(num_terms):
                fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

        logger.info("storing corpus in UCI Bag-of-Words format: %s", fname)

        return UciWriter.write_corpus(fname, corpus, index=True, progress_cnt=progress_cnt)
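
Because the docstring says to go through `serialize` rather than calling `save_corpus` directly, a sketch with a toy corpus (hypothetical path):

    from gensim.corpora import UciCorpus

    corpus = [[(0, 1), (1, 2)], [(1, 1)]]  # bag-of-words: (word_id, count)
    # writes /tmp/corpus.uci plus the .vocab and .index side files
    UciCorpus.serialize('/tmp/corpus.uci', corpus)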
Example #5
    def __init__(self, fname, fname_vocab=None):
        """
        Parameters
        ----------
        fname : str
            Path to corpus in UCI format.
        fname_vocab : str, optional
            Path to vocab.

        Examples
        --------
        >>> from gensim.corpora import UciCorpus
        >>> from gensim.test.utils import datapath
        >>>
        >>> corpus = UciCorpus(datapath('testcorpus.uci'))
        >>> for document in corpus:
        ...     pass

        """
        IndexedCorpus.__init__(self, fname)
        UciReader.__init__(self, fname)

        if fname_vocab is None:
            fname_vocab = utils.smart_extension(fname, '.vocab')

        self.fname = fname
        with utils.smart_open(fname_vocab) as fin:
            words = [word.strip() for word in fin]
        self.id2word = dict(enumerate(words))

        self.transposed = True
Example #6
    def save(self, fname, *args, **kwargs):
        """Save the model to a file.

        Notes
        -----
        Large internal arrays may be stored into separate files, with `fname` as prefix.

        Warnings
        --------
        Do not save as a compressed file if you intend to load the file back with `mmap`.

        Parameters
        ----------
        fname : str
            Path to output file.
        *args
            Variable length argument list, see :meth:`gensim.utils.SaveLoad.save`.
        **kwargs
            Arbitrary keyword arguments, see :meth:`gensim.utils.SaveLoad.save`.

        See Also
        --------
        :meth:`~gensim.models.lsimodel.LsiModel.load`

        """

        if self.projection is not None:
            self.projection.save(utils.smart_extension(fname, '.projection'), *args, **kwargs)
        super(LsiModel, self).save(fname, *args, ignore=['projection', 'dispatcher'], **kwargs)
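
A sketch of the side file this produces, assuming a toy model and a hypothetical path:

    from gensim.corpora import Dictionary
    from gensim.models import LsiModel

    texts = [['human', 'computer', 'interface'], ['graph', 'trees', 'minors']]
    dictionary = Dictionary(texts)
    bow = [dictionary.doc2bow(text) for text in texts]

    model = LsiModel(bow, id2word=dictionary, num_topics=2)
    model.save('/tmp/lsi.model')  # the projection is stored in /tmp/lsi.model.projection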
Example #7
    def __init__(self, fname, index_fname=None):
        """
        Initialize this abstract base class, by loading a previously saved index
        from `index_fname` (or `fname.index` if `index_fname` is not set).
        This index will allow subclasses to support the `corpus[docno]` syntax
        (random access to document #`docno` in O(1)).

        >>> # save corpus in SvmLightCorpus format with an index
        >>> corpus = [[(1, 0.5)], [(0, 1.0), (1, 2.0)]]
        >>> gensim.corpora.SvmLightCorpus.serialize('testfile.svmlight', corpus)
        >>> # load back as a document stream (*not* plain Python list)
        >>> corpus_with_random_access = gensim.corpora.SvmLightCorpus('testfile.svmlight')
        >>> print(corpus_with_random_access[1])
        [(0, 1.0), (1, 2.0)]

        """
        try:
            if index_fname is None:
                index_fname = utils.smart_extension(fname, '.index')
            self.index = utils.unpickle(index_fname)
            # change self.index into a numpy.ndarray to support fancy indexing
            self.index = numpy.asarray(self.index)
            logger.info("loaded corpus index from %s" % index_fname)
        except:
            self.index = None
        self.length = None
Example #8
    def __init__(self, fname, fname_vocab=None):
        """
        Parameters
        ----------
        fname : str
            Path to corpus in UCI format.
        fname_vocab : str, optional
            Path to vocab.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora import UciCorpus
            >>> from gensim.test.utils import datapath
            >>>
            >>> corpus = UciCorpus(datapath('testcorpus.uci'))
            >>> for document in corpus:
            ...     pass

        """
        IndexedCorpus.__init__(self, fname)
        UciReader.__init__(self, fname)

        if fname_vocab is None:
            fname_vocab = utils.smart_extension(fname, '.vocab')

        self.fname = fname
        with utils.open(fname_vocab, 'rb') as fin:
            words = [word.strip() for word in fin]
        self.id2word = dict(enumerate(words))

        self.transposed = True
Example #9
    def save_corpus(fname, corpus, id2word=None, metadata=False):
        """
        Save a corpus in the LDA-C format.

        There are actually two files saved: `fname` and `fname.vocab`, where
        `fname.vocab` is the vocabulary file.

        This function is automatically called by `BleiCorpus.serialize`; don't
        call it directly, call `serialize` instead.
        """
        if id2word is None:
            logger.info("no word id mapping provided; initializing from corpus")
            id2word = utils.dict_from_corpus(corpus)
            num_terms = len(id2word)
        else:
            num_terms = 1 + max([-1] + list(id2word))  # list() so this also works on Python 3

        logger.info("storing corpus in Blei's LDA-C format into %s", fname)
        with utils.smart_open(fname, 'wb') as fout:
            offsets = []
            for doc in corpus:
                doc = list(doc)
                offsets.append(fout.tell())
                parts = ["%i:%g" % p for p in doc if abs(p[1]) > 1e-7]
                fout.write(utils.to_utf8("%i %s\n" % (len(doc), ' '.join(parts))))

        # write out vocabulary, in a format compatible with Blei's topics.py script
        fname_vocab = utils.smart_extension(fname, '.vocab')
        logger.info("saving vocabulary of %i words to %s", num_terms, fname_vocab)
        with utils.smart_open(fname_vocab, 'wb') as fout:
            for featureid in range(num_terms):
                fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

        return offsets
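
The LDA-C lines written above are plain text and easy to inspect; a sketch with a toy corpus (hypothetical path):

    from gensim.corpora import BleiCorpus

    corpus = [[(0, 1.0), (2, 3.0)], [(1, 2.0)]]
    BleiCorpus.serialize('/tmp/corpus.lda-c', corpus)
    with open('/tmp/corpus.lda-c') as fin:
        print(fin.read())
    # one "<num_entries> <id>:<count> ..." line per document:
    # 2 0:1 2:3
    # 1 1:2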
Example #10
    def save(self, fname, *args, **kwargs):
        """
        Save the model to file.

        Large internal arrays may be stored into separate files, with `fname` as prefix.

        Note: do not save as a compressed file if you intend to load the file back with `mmap`.

        Note: If you intend to use models across Python 2/3 versions there are a few things to
        keep in mind:

        1. The pickled Python dictionaries will not work across Python versions
        2. The `save` method does not automatically save all NumPy arrays using NumPy, only
        those ones that exceed `sep_limit` set in `gensim.utils.SaveLoad.save`. The main
        concern here is the `alpha` array if for instance using `alpha='auto'`.

        Please refer to the wiki recipes section (https://github.com/piskvorky/gensim/wiki/Recipes-&-FAQ#q9-how-do-i-load-a-model-in-python-3-that-was-trained-and-saved-using-python-2)
        for an example on how to work around these issues.
        """
        if self.state is not None:
            self.state.save(utils.smart_extension(fname, '.state'), *args,
                            **kwargs)
        super(LdaModel, self).save(fname,
                                   *args,
                                   ignore=['state', 'dispatcher'],
                                   **kwargs)
Example #11
    def serialize(serializer,
                  fname,
                  corpus,
                  id2word=None,
                  index_fname=None,
                  progress_cnt=None,
                  labels=None,
                  metadata=False):
        """
        Iterate through the document stream `corpus`, saving the documents to `fname`
        and recording byte offset of each document. Save the resulting index
        structure to file `index_fname` (or `fname`.index if `index_fname` is not set).

        This relies on the underlying corpus class `serializer` providing (in
        addition to standard iteration):

        * `save_corpus` method that returns a sequence of byte offsets, one for
           each saved document,
        * the `docbyoffset(offset)` method, which returns a document
          positioned at `offset` bytes within the persistent storage (file).
        * if `metadata` is set to True, `serialize` will also write out article titles to a pickle file.

        Example:

        >>> MmCorpus.serialize('test.mm', corpus)
        >>> mm = MmCorpus('test.mm') # `mm` document stream now has random access
        >>> print(mm[42]) # retrieve document no. 42, etc.
        """
        if getattr(corpus, 'fname', None) == fname:
            raise ValueError(
                "identical input vs. output corpus filename, refusing to serialize: %s"
                % fname)

        if index_fname is None:
            index_fname = utils.smart_extension(fname, '.index')

        kwargs = {'metadata': metadata}
        if progress_cnt is not None:
            kwargs['progress_cnt'] = progress_cnt

        if labels is not None:
            kwargs['labels'] = labels

        offsets = serializer.save_corpus(fname, corpus, id2word, **kwargs)

        if offsets is None:
            raise NotImplementedError(
                "Called serialize on class %s which doesn't support indexing!"
                % serializer.__name__)

        # store offsets persistently, using pickle
        # we shouldn't have to worry about self.index being a numpy.ndarray as the serializer will return
        # the offsets that are actually stored on disk - we're not storing self.index in any case, the
        # load just needs to turn whatever is loaded from disk back into a ndarray - this should also ensure
        # backwards compatibility
        logger.info("saving %s index to %s", serializer.__name__, index_fname)
        utils.pickle(offsets, index_fname)
Example #12
 def __init__(self, fname):
     super(IdMmCorpus, self).__init__(fname)
     try:
         dockeys_fname = utils.smart_extension(fname, '.dockeys')
         self.dockeys = utils.unpickle(dockeys_fname)
         self.key_to_index = {k: n for (n, k) in enumerate(self.dockeys)}
         logger.info("loaded dockey index from %s", dockeys_fname)
     except Exception:
         self.dockeys = None
         self.key_to_index = None  # keep the attribute defined even when no dockey index exists
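
`IdMmCorpus` looks like a project-specific `MmCorpus` subclass rather than part of gensim; assuming that, a hypothetical lookup of a document by its key:

    corpus = IdMmCorpus('/tmp/corpus.mm')            # hypothetical path
    if corpus.dockeys is not None:
        doc = corpus[corpus.key_to_index['doc-42']]  # 'doc-42' is a made-up key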
Example #13
    def save_corpus(fname,
                    corpus,
                    id2word=None,
                    progress_cnt=10000,
                    metadata=False):
        """Save a corpus in the UCI Bag-of-Words format.

        Warnings
        --------
        This function is automatically called by :meth:`gensim.corpora.ucicorpus.UciCorpus.serialize`,
        don't call it directly, call :meth:`gensim.corpora.ucicorpus.UciCorpus.serialize` instead.

        Parameters
        ----------
        fname : str
            Path to output file.
        corpus: iterable of iterable of (int, int)
            Corpus in BoW format.
        id2word : {dict of (int, str), :class:`gensim.corpora.dictionary.Dictionary`}, optional
            Mapping between words and their ids. If None - will be inferred from `corpus`.
        progress_cnt : int, optional
            Progress counter, write log message each `progress_cnt` documents.
        metadata : bool, optional
            THIS PARAMETER WILL BE IGNORED.

        Notes
        -----
        There are actually two files saved: `fname` and `fname.vocab`, where `fname.vocab` is the vocabulary file.

        """
        if id2word is None:
            logger.info(
                "no word id mapping provided; initializing from corpus")
            id2word = utils.dict_from_corpus(corpus)
            num_terms = len(id2word)
        elif id2word:
            num_terms = 1 + max(id2word)
        else:
            num_terms = 0

        # write out vocabulary
        fname_vocab = utils.smart_extension(fname, '.vocab')
        logger.info("saving vocabulary of %i words to %s", num_terms,
                    fname_vocab)
        with utils.smart_open(fname_vocab, 'wb') as fout:
            for featureid in range(num_terms):
                fout.write(
                    utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

        logger.info("storing corpus in UCI Bag-of-Words format: %s", fname)

        return UciWriter.write_corpus(fname,
                                      corpus,
                                      index=True,
                                      progress_cnt=progress_cnt)
Example #14
    def __init__(self, fname, fname_vocab=None):
        """

        Parameters
        ----------
        fname : str
            Path to corpus.
        fname_vocab : str, optional
            Vocabulary file. If `fname_vocab` is None, one of the following variants is tried:

            * `fname`.vocab
            * `fname`/vocab.txt
            * `fname_without_ext`.vocab
            * `fname_folder`/vocab.txt

        Raises
        ------
        IOError
            If vocabulary file doesn't exist.

        """
        IndexedCorpus.__init__(self, fname)
        logger.info("loading corpus from %s", fname)

        if fname_vocab is None:
            fname_base, _ = path.splitext(fname)
            fname_dir = path.dirname(fname)
            for fname_vocab in [
                    utils.smart_extension(fname, '.vocab'),
                    utils.smart_extension(fname, '/vocab.txt'),
                    utils.smart_extension(fname_base, '.vocab'),
                    utils.smart_extension(fname_dir, '/vocab.txt'),
            ]:
                if path.exists(fname_vocab):
                    break
            else:
                raise IOError('BleiCorpus: could not find vocabulary file')

        self.fname = fname
        with utils.open(fname_vocab, 'rb') as fin:
            words = [utils.to_unicode(word).rstrip() for word in fin]
        self.id2word = dict(enumerate(words))
Example #15
    def __init__(self, fname, fname_vocab=None):
        """

        Parameters
        ----------
        fname : str
            Path to corpus.
        fname_vocab : str, optional
            Vocabulary file. If `fname_vocab` is None, one of the following variants is tried:

            * `fname`.vocab
            * `fname`/vocab.txt
            * `fname_without_ext`.vocab
            * `fname_folder`/vocab.txt

        Raises
        ------
        IOError
            If vocabulary file doesn't exist.

        """
        IndexedCorpus.__init__(self, fname)
        logger.info("loading corpus from %s", fname)

        if fname_vocab is None:
            fname_base, _ = path.splitext(fname)
            fname_dir = path.dirname(fname)
            for fname_vocab in [
                        utils.smart_extension(fname, '.vocab'),
                        utils.smart_extension(fname, '/vocab.txt'),
                        utils.smart_extension(fname_base, '.vocab'),
                        utils.smart_extension(fname_dir, '/vocab.txt'),
                        ]:
                if path.exists(fname_vocab):
                    break
            else:
                raise IOError('BleiCorpus: could not find vocabulary file')

        self.fname = fname
        with utils.smart_open(fname_vocab) as fin:
            words = [utils.to_unicode(word).rstrip() for word in fin]
        self.id2word = dict(enumerate(words))
Example #16
    def save_corpus(fname, corpus, id2word=None, metadata=False):
        """Save a corpus in the LDA-C format.

        Notes
        -----
        There are actually two files saved: `fname` and `fname.vocab`, where `fname.vocab` is the vocabulary file.

        Parameters
        ----------
        fname : str
            Path to output file.
        corpus : iterable of iterable of (int, float)
            Input corpus in BoW format.
        id2word : dict of (int, str), optional
            Mapping id -> word for `corpus`.
        metadata : bool, optional
            THIS PARAMETER WILL BE IGNORED.

        Returns
        -------
        list of int
            Offsets for each line in file (in bytes).

        """
        if id2word is None:
            logger.info(
                "no word id mapping provided; initializing from corpus")
            id2word = utils.dict_from_corpus(corpus)
            num_terms = len(id2word)
        elif id2word:
            num_terms = 1 + max(id2word)
        else:
            num_terms = 0

        logger.info("storing corpus in Blei's LDA-C format into %s", fname)
        with utils.open(fname, 'wb') as fout:
            offsets = []
            for doc in corpus:
                doc = list(doc)
                offsets.append(fout.tell())
                parts = ["%i:%g" % p for p in doc if abs(p[1]) > 1e-7]
                fout.write(
                    utils.to_utf8("%i %s\n" % (len(doc), ' '.join(parts))))

        # write out vocabulary, in a format compatible with Blei's topics.py script
        fname_vocab = utils.smart_extension(fname, '.vocab')
        logger.info("saving vocabulary of %i words to %s", num_terms,
                    fname_vocab)
        with utils.open(fname_vocab, 'wb') as fout:
            for featureid in range(num_terms):
                fout.write(
                    utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

        return offsets
Example #17
    def save(self, fname, *args, **kwargs):
        """
        Save the model to file.

        Large internal arrays may be stored into separate files, with `fname` as prefix.

        Note: do not save as a compressed file if you intend to load the file back with `mmap`.

        """
        if self.projection is not None:
            self.projection.save(utils.smart_extension(fname, '.projection'), *args, **kwargs)
        super(LsiModel, self).save(fname, *args, ignore=['projection', 'dispatcher'], **kwargs)
Example #18
    def __init__(self, fname, fname_vocab=None):
        IndexedCorpus.__init__(self, fname)
        UciReader.__init__(self, fname)

        if fname_vocab is None:
            fname_vocab = utils.smart_extension(fname, '.vocab')

        self.fname = fname
        with utils.smart_open(fname_vocab) as fin:
            words = [word.strip() for word in fin]
        self.id2word = dict(enumerate(words))

        self.transposed = True
Example #19
    def save_corpus(fname, corpus, id2word=None, metadata=False):
        """Save a corpus in the LDA-C format.

        Notes
        -----
        There are actually two files saved: `fname` and `fname.vocab`, where `fname.vocab` is the vocabulary file.

        Parameters
        ----------
        fname : str
            Path to output file.
        corpus : iterable of iterable of (int, float)
            Input corpus in BoW format.
        id2word : dict of (int, str), optional
            Mapping id -> word for `corpus`.
        metadata : bool, optional
            THIS PARAMETER WILL BE IGNORED.

        Returns
        -------
        list of int
            Offsets for each line in file (in bytes).

        """
        if id2word is None:
            logger.info("no word id mapping provided; initializing from corpus")
            id2word = utils.dict_from_corpus(corpus)
            num_terms = len(id2word)
        elif id2word:
            num_terms = 1 + max(id2word)
        else:
            num_terms = 0

        logger.info("storing corpus in Blei's LDA-C format into %s", fname)
        with utils.smart_open(fname, 'wb') as fout:
            offsets = []
            for doc in corpus:
                doc = list(doc)
                offsets.append(fout.tell())
                parts = ["%i:%g" % p for p in doc if abs(p[1]) > 1e-7]
                fout.write(utils.to_utf8("%i %s\n" % (len(doc), ' '.join(parts))))

        # write out vocabulary, in a format compatible with Blei's topics.py script
        fname_vocab = utils.smart_extension(fname, '.vocab')
        logger.info("saving vocabulary of %i words to %s", num_terms, fname_vocab)
        with utils.smart_open(fname_vocab, 'wb') as fout:
            for featureid in range(num_terms):
                fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

        return offsets
Example #20
    def serialize(serializer, fname, corpus, id2word=None, index_fname=None,
                  progress_cnt=None, labels=None, metadata=False,
                  dockeys_fname=None):
        key_order = []

        def corpus_as_list():
            for k, c in corpus:
                key_order.append(k)
                yield c
        IndexedCorpus.serialize.__func__(  # __func__, not the Python-2-only im_func
            serializer, fname, corpus_as_list(), id2word,
            index_fname, progress_cnt, labels, metadata)
        dockeys_fname = dockeys_fname or utils.smart_extension(
            fname, '.dockeys')
        utils.pickle(key_order, dockeys_fname)
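
Assuming the same project-specific class as above, this `serialize` consumes (key, document) pairs and pickles the key order next to the corpus file; a hypothetical call:

    keyed_corpus = [('doc-a', [(0, 1.0)]), ('doc-b', [(1, 2.0)])]
    IdMmCorpus.serialize('/tmp/corpus.mm', keyed_corpus)  # also writes /tmp/corpus.mm.dockeys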
Example #21
    def serialize(serializer, fname, corpus, id2word=None, index_fname=None, progress_cnt=None, labels=None, metadata=False):
        """
        Iterate through the document stream `corpus`, saving the documents to `fname`
        and recording byte offset of each document. Save the resulting index
        structure to file `index_fname` (or `fname`.index if `index_fname` is not set).

        This relies on the underlying corpus class `serializer` providing (in
        addition to standard iteration):

        * `save_corpus` method that returns a sequence of byte offsets, one for
           each saved document,
        * the `docbyoffset(offset)` method, which returns a document
          positioned at `offset` bytes within the persistent storage (file).

        Example:

        >>> MmCorpus.serialize('test.mm', corpus)
        >>> mm = MmCorpus('test.mm') # `mm` document stream now has random access
        >>> print(mm[42]) # retrieve document no. 42, etc.
        """
        if getattr(corpus, 'fname', None) == fname:
            raise ValueError("identical input vs. output corpus filename, refusing to serialize: %s" % fname)

        if index_fname is None:
            index_fname = utils.smart_extension(fname, '.index')

        if progress_cnt is not None:
            if labels is not None:
                offsets = serializer.save_corpus(fname, corpus, id2word, labels=labels, progress_cnt=progress_cnt, metadata=metadata)
            else:
                offsets = serializer.save_corpus(fname, corpus, id2word, progress_cnt=progress_cnt, metadata=metadata)
        else:
            if labels is not None:
                offsets = serializer.save_corpus(fname, corpus, id2word, labels=labels, metadata=metadata)
            else:
                offsets = serializer.save_corpus(fname, corpus, id2word, metadata=metadata)

        if offsets is None:
            raise NotImplementedError("called serialize on class %s which doesn't support indexing!" %
                serializer.__name__)

        # store offsets persistently, using pickle
        # we shouldn't have to worry about self.index being a numpy.ndarray as the serializer will return
        # the offsets that are actually stored on disk - we're not storing self.index in any case, the
        # load just needs to turn whatever is loaded from disk back into a ndarray - this should also ensure
        # backwards compatibility
        logger.info("saving %s index to %s" % (serializer.__name__, index_fname))
        utils.pickle(offsets, index_fname)
Example #22
 def load(cls, fname, *args, **kwargs):
     """
     Load a previously saved object from file (also see `save`).
     Large arrays can be memmap'ed back as read-only (shared memory) by setting `mmap='r'`:
         >>> LsiModel.load(fname, mmap='r')
     """
     kwargs['mmap'] = kwargs.get('mmap', None)
     result = super(LsiModelAltered, cls).load(fname, *args, **kwargs)
     projection_fname = utils.smart_extension(fname, '.projection')
     try:
         result.projection = super(LsiModelAltered,
                                   cls).load(projection_fname, *args,
                                             **kwargs)
     except Exception as e:
         logging.warning("failed to load projection from %s: %s" %
                         (projection_fname, e))
     return result
Example #23
    def load(cls, fname, *args, **kwargs):
        """
        Load a previously saved object from file (also see `save`).

        Large arrays can be memmap'ed back as read-only (shared memory) by setting `mmap='r'`:

            >>> LdaModel.load(fname, mmap='r')

        """
        kwargs["mmap"] = kwargs.get("mmap", None)
        result = super(LdaModel, cls).load(fname, *args, **kwargs)
        state_fname = utils.smart_extension(fname, ".state")
        try:
            result.state = super(LdaModel, cls).load(state_fname, *args, **kwargs)
        except Exception as e:
            logging.warning("failed to load state from %s: %s", state_fname, e)
        return result
Example #24
    def load(cls, fname, *args, **kwargs):
        """
        Load a previously saved object from file (also see `save`).

        Large arrays can be memmap'ed back as read-only (shared memory) by setting `mmap='r'`:

            >>> LsiModel.load(fname, mmap='r')

        """
        kwargs['mmap'] = kwargs.get('mmap', None)
        result = super(LsiModel, cls).load(fname, *args, **kwargs)
        projection_fname = utils.smart_extension(fname, '.projection')
        try:
            result.projection = super(LsiModel, cls).load(projection_fname, *args, **kwargs)
        except Exception as e:
            logging.warning("failed to load projection from %s: %s" % (projection_fname, e))
        return result
Example #25
    def load(cls, fname, *args, **kwargs):
        """Load a previously saved object using :meth:`~gensim.models.lsimodel.LsiModel.save` from file.

        Notes
        -----
        Large arrays can be memmap'ed back as read-only (shared memory) by setting the `mmap='r'` parameter.

        Parameters
        ----------
        fname : str
            Path to file that contains LsiModel.
        *args
            Variable length argument list, see :meth:`gensim.utils.SaveLoad.load`.
        **kwargs
            Arbitrary keyword arguments, see :meth:`gensim.utils.SaveLoad.load`.

        See Also
        --------
        :meth:`~gensim.models.lsimodel.LsiModel.save`

        Returns
        -------
        :class:`~gensim.models.lsimodel.LsiModel`
            Loaded instance.

        Raises
        ------
        IOError
            When methods are called on instance (should be called from class).

        """
        kwargs['mmap'] = kwargs.get('mmap', None)
        result = super(LsiModel, cls).load(fname, *args, **kwargs)
        projection_fname = utils.smart_extension(fname, '.projection')
        try:
            result.projection = super(LsiModel,
                                      cls).load(projection_fname, *args,
                                                **kwargs)
        except Exception as e:
            logging.warning("failed to load projection from %s: %s",
                            projection_fname, e)
        return result
Example #26
    def save(self, fname, ignore=['state', 'dispatcher'], *args, **kwargs):
        """
        Save the model to file.

        Large internal arrays may be stored into separate files, with `fname` as prefix.

        `separately` can be used to define which arrays should be stored in separate files.

        `ignore` parameter can be used to define which variables should be ignored, i.e. left
        out from the pickled lda model. By default the internal `state` is ignored as it uses
        its own serialisation, not the one provided by `LdaModel`. The `state` and `dispatcher`
        will be added to any `ignore` parameter defined.


        Note: do not save as a compressed file if you intend to load the file back with `mmap`.

        Note: If you intend to use models across Python 2/3 versions there are a few things to
        keep in mind:

          1. The pickled Python dictionaries will not work across Python versions
          2. The `save` method does not automatically save all NumPy arrays using NumPy, only
             those ones that exceed `sep_limit` set in `gensim.utils.SaveLoad.save`. The main
             concern here is the `alpha` array if for instance using `alpha='auto'`.

        Please refer to the wiki recipes section (https://github.com/piskvorky/gensim/wiki/Recipes-&-FAQ#q9-how-do-i-load-a-model-in-python-3-that-was-trained-and-saved-using-python-2)
        for an example on how to work around these issues.
        """
        if self.state is not None:
            self.state.save(utils.smart_extension(fname, '.state'), *args,
                            **kwargs)

        # make sure 'state' and 'dispatcher' are ignored from the pickled object, even if
        # someone sets the ignore list themselves
        if ignore is not None and ignore:
            if isinstance(ignore, six.string_types):
                ignore = [ignore]
            ignore = [e for e in ignore
                      if e]  # make sure None and '' are not in the list
            ignore = list(set(['state', 'dispatcher']) | set(ignore))
        else:
            ignore = ['state', 'dispatcher']
        super(LdaModel, self).save(fname, *args, ignore=ignore, **kwargs)
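
A sketch of the `ignore` merging described in the docstring: user-supplied names are unioned with the mandatory ones (toy model, hypothetical path):

    from gensim.corpora import Dictionary
    from gensim.models import LdaModel

    texts = [['human', 'computer', 'interface'], ['graph', 'trees', 'minors']]
    dictionary = Dictionary(texts)
    model = LdaModel([dictionary.doc2bow(t) for t in texts], id2word=dictionary, num_topics=2)
    model.save('/tmp/lda.model', ignore='random_state')  # a bare string is wrapped into a list
    # effective ignore set becomes {'state', 'dispatcher', 'random_state'}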
Example #27
    def __init__(self, fname, index_fname=None):
        """

        Parameters
        ----------
        fname : str
            Path to corpus.
        index_fname : str, optional
            Path to index file; if not provided, `fname.index` is used.

        """
        try:
            if index_fname is None:
                index_fname = utils.smart_extension(fname, '.index')
            self.index = utils.unpickle(index_fname)
            # change self.index into a numpy.ndarray to support fancy indexing
            self.index = numpy.asarray(self.index)
            logger.info("loaded corpus index from %s", index_fname)
        except Exception:
            self.index = None
        self.length = None
Example #28
    def serialize(serializer, fname, corpus, id2word=None, index_fname=None,
                  progress_cnt=None, labels=None, metadata=False):
        """Serialize corpus with offset metadata, allows to use direct indexes after loading.

        Parameters
        ----------
        fname : str
            Path to output file.
        corpus : iterable of iterable of (int, float)
            Corpus in BoW format.
        id2word : dict of (int, str), optional
            Mapping id -> word.
        index_fname : str, optional
             Where to save the resulting index; if None, the index is stored to `fname`.index.
        progress_cnt : int, optional
            Number of documents after which progress info is printed.
        labels : bool, optional
             If True - ignore first column (class labels).
        metadata : bool, optional
            If True - ensure that serialize will write out article titles to a pickle file.

        Examples
        --------
        >>> from gensim.corpora import MmCorpus
        >>> from gensim.test.utils import get_tmpfile
        >>>
        >>> corpus = [[(1, 0.3), (2, 0.1)], [(1, 0.1)], [(2, 0.3)]]
        >>> output_fname = get_tmpfile("test.mm")
        >>>
        >>> MmCorpus.serialize(output_fname, corpus)
        >>> mm = MmCorpus(output_fname) # `mm` document stream now has random access
        >>> print(mm[1])  # retrieve document no. 1
        [(1, 0.1)]

        """
        if getattr(corpus, 'fname', None) == fname:
            raise ValueError("identical input vs. output corpus filename, refusing to serialize: %s" % fname)

        if index_fname is None:
            index_fname = utils.smart_extension(fname, '.index')

        kwargs = {'metadata': metadata}
        if progress_cnt is not None:
            kwargs['progress_cnt'] = progress_cnt

        if labels is not None:
            kwargs['labels'] = labels

        offsets = serializer.save_corpus(fname, corpus, id2word, **kwargs)

        if offsets is None:
            raise NotImplementedError(
                "Called serialize on class %s which doesn't support indexing!" % serializer.__name__
            )

        # store offsets persistently, using pickle
        # we shouldn't have to worry about self.index being a numpy.ndarray as the serializer will return
        # the offsets that are actually stored on disk - we're not storing self.index in any case, the
        # load just needs to turn whatever is loaded from disk back into a ndarray - this should also ensure
        # backwards compatibility
        logger.info("saving %s index to %s", serializer.__name__, index_fname)
        utils.pickle(offsets, index_fname)
Ejemplo n.º 44
0
    def serialize(serializer,
                  fname,
                  corpus,
                  id2word=None,
                  index_fname=None,
                  progress_cnt=None,
                  labels=None,
                  metadata=False):
        """Serialize corpus with offset metadata, allows to use direct indexes after loading.

        Parameters
        ----------
        fname : str
            Path to output file.
        corpus : iterable of iterable of (int, float)
            Corpus in BoW format.
        id2word : dict of (str, str), optional
            Mapping id -> word.
        index_fname : str, optional
             Where to save resulting index, if None - store index to `fname`.index.
        progress_cnt : int, optional
            Number of documents after which progress info is printed.
        labels : bool, optional
             If True - ignore first column (class labels).
        metadata : bool, optional
            If True - ensure that serialize will write out article titles to a pickle file.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.corpora import MmCorpus
            >>> from gensim.test.utils import get_tmpfile
            >>>
            >>> corpus = [[(1, 0.3), (2, 0.1)], [(1, 0.1)], [(2, 0.3)]]
            >>> output_fname = get_tmpfile("test.mm")
            >>>
            >>> MmCorpus.serialize(output_fname, corpus)
            >>> mm = MmCorpus(output_fname)  # `mm` document stream now has random access
            >>> print(mm[1])  # retrieve document no. 42, etc.
            [(1, 0.1)]

        """
        if getattr(corpus, 'fname', None) == fname:
            raise ValueError(
                "identical input vs. output corpus filename, refusing to serialize: %s"
                % fname)

        if index_fname is None:
            index_fname = utils.smart_extension(fname, '.index')

        kwargs = {'metadata': metadata}
        if progress_cnt is not None:
            kwargs['progress_cnt'] = progress_cnt

        if labels is not None:
            kwargs['labels'] = labels

        offsets = serializer.save_corpus(fname, corpus, id2word, **kwargs)

        if offsets is None:
            raise NotImplementedError(
                "Called serialize on class %s which doesn't support indexing!"
                % serializer.__name__)

        # Store offsets persistently, using pickle.
        # We shouldn't have to worry about self.index being a numpy.ndarray, since the serializer
        # returns the offsets actually stored on disk; we're not storing self.index in any case.
        # On load, whatever comes back from disk just needs to be turned into an ndarray again,
        # which also keeps old indexes backwards compatible.
        logger.info("saving %s index to %s", serializer.__name__, index_fname)
        utils.pickle(offsets, index_fname)
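As a hedged usage sketch of the optional arguments documented above (toy values, temporary file):

    from gensim.corpora import MmCorpus
    from gensim.test.utils import get_tmpfile

    corpus = [[(1, 0.3), (2, 0.1)], [(1, 0.1)], [(2, 0.3)]]
    output_fname = get_tmpfile("test_progress.mm")

    # log progress every 2 documents; metadata=False is the default shown above
    MmCorpus.serialize(output_fname, corpus, progress_cnt=2, metadata=False)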
Example #45
    def save(self, fname, ignore=['state', 'dispatcher'], separately=None, *args, **kwargs):
        """
        Save the model to file.

        Large internal arrays may be stored into separate files, with `fname` as prefix.

        `separately` can be used to define which arrays should be stored in separate files.

        The `ignore` parameter can be used to define which variables should be ignored, i.e. left
        out from the pickled LDA model. By default the internal `state` is ignored, as it uses
        its own serialization rather than the one provided by `LdaModel`. `state` and `dispatcher`
        are always added to whatever `ignore` list is passed in.

        Note: do not save as a compressed file if you intend to load the file back with `mmap`.

        Note: If you intend to use models across Python 2/3 versions there are a few things to
        keep in mind:

          1. Pickled Python dictionaries will not work across Python versions.
          2. The `save` method does not automatically save all numpy arrays separately, only
             those that exceed `sep_limit` set in `gensim.utils.SaveLoad.save`. The main
             concern here is the `alpha` array if, for instance, `alpha='auto'` is used.

        Please refer to the wiki recipes section (https://github.com/piskvorky/gensim/wiki/Recipes-&-FAQ#q9-how-do-i-load-a-model-in-python-3-that-was-trained-and-saved-using-python-2)
        for an example of how to work around these issues.
        """
        if self.state is not None:
            self.state.save(utils.smart_extension(fname, '.state'), *args, **kwargs)
        # Save the dictionary separately if not in 'ignore'.
        if 'id2word' not in ignore:
            utils.pickle(self.id2word, utils.smart_extension(fname, '.id2word'))

        # make sure 'state', 'id2word' and 'dispatcher' are ignored from the pickled object, even if
        # someone sets the ignore list themselves
        if ignore is not None and ignore:
            if isinstance(ignore, six.string_types):
                ignore = [ignore]
            ignore = [e for e in ignore if e]  # make sure None and '' are not in the list
            ignore = list(set(['state', 'dispatcher', 'id2word']) | set(ignore))
        else:
            ignore = ['state', 'dispatcher', 'id2word']

        # make sure 'expElogbeta' and 'sstats' are ignored from the pickled object, even if
        # someone sets the separately list themselves.
        separately_explicit = ['expElogbeta', 'sstats']
        # Also add 'alpha' and 'eta' to separately list if they are set 'auto' or some
        # array manually.
        if (isinstance(self.alpha, six.string_types) and self.alpha == 'auto') or (isinstance(self.alpha, np.ndarray) and len(self.alpha.shape) != 1):
            separately_explicit.append('alpha')
        if (isinstance(self.eta, six.string_types) and self.eta == 'auto') or (isinstance(self.eta, np.ndarray) and len(self.eta.shape) != 1):
            separately_explicit.append('eta')
        # Merge separately_explicit with separately.
        if separately:
            if isinstance(separately, six.string_types):
                separately = [separately]
            separately = [e for e in separately if e]  # make sure None and '' are not in the list
            separately = list(set(separately_explicit) | set(separately))
        else:
            separately = separately_explicit
        super(LdaModel, self).save(fname, ignore=ignore, separately=separately, *args, **kwargs)
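A minimal end-to-end sketch of the save/load cycle this docstring describes (toy corpus assumed): save to an uncompressed path so the large arrays stored next to the model can be memory-mapped back read-only via `LdaModel.load(fname, mmap='r')`.

    from gensim.corpora import Dictionary
    from gensim.models import LdaModel
    from gensim.test.utils import get_tmpfile

    texts = [['human', 'computer'], ['graph', 'trees']]  # toy corpus (assumption)
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(t) for t in texts]

    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2)
    fname = get_tmpfile('lda_model')       # uncompressed path, so mmap works on load
    lda.save(fname)                        # also writes fname.state and fname.id2word
    lda2 = LdaModel.load(fname, mmap='r')  # large arrays memory-mapped read-only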
Example #46
    def serialize(serializer,
                  fname,
                  corpus,
                  id2word=None,
                  index_fname=None,
                  progress_cnt=None,
                  labels=None,
                  metadata=False):
        """
        Iterate through the document stream `corpus`, saving the documents to `fname`
        and recording the byte offset of each document. Save the resulting index
        structure to file `index_fname` (or `fname`.index, if `index_fname` is not set).

        This relies on the underlying corpus class `serializer` providing (in
        addition to standard iteration):

        * a `save_corpus` method that returns a sequence of byte offsets, one for
          each saved document,
        * a `docbyoffset(offset)` method, which returns a document
          positioned at `offset` bytes within the persistent storage (file).

        Example:

        >>> MmCorpus.serialize('test.mm', corpus)
        >>> mm = MmCorpus('test.mm')  # `mm` document stream now has random access
        >>> print(mm[42])  # retrieve document no. 42, etc.
        """
        if getattr(corpus, 'fname', None) == fname:
            raise ValueError(
                "identical input vs. output corpus filename, refusing to serialize: %s"
                % fname)

        if index_fname is None:
            index_fname = utils.smart_extension(fname, '.index')

        if progress_cnt is not None:
            if labels is not None:
                offsets = serializer.save_corpus(fname,
                                                 corpus,
                                                 id2word,
                                                 labels=labels,
                                                 progress_cnt=progress_cnt,
                                                 metadata=metadata)
            else:
                offsets = serializer.save_corpus(fname,
                                                 corpus,
                                                 id2word,
                                                 progress_cnt=progress_cnt,
                                                 metadata=metadata)
        else:
            if labels is not None:
                offsets = serializer.save_corpus(fname,
                                                 corpus,
                                                 id2word,
                                                 labels=labels,
                                                 metadata=metadata)
            else:
                offsets = serializer.save_corpus(fname,
                                                 corpus,
                                                 id2word,
                                                 metadata=metadata)

        if offsets is None:
            raise NotImplementedError(
                "called serialize on class %s which doesn't support indexing!"
                % serializer.__name__)

        # store offsets persistently, using pickle
        logger.info("saving %s index to %s" %
                    (serializer.__name__, index_fname))
        utils.pickle(offsets, index_fname)
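To make the `docbyoffset` contract above concrete, a small sketch (it assumes 'test.mm' and its '.index' were written as in the example): the pickled offsets let you jump straight to a document's bytes.

    from gensim import utils
    from gensim.corpora import MmCorpus

    mm = MmCorpus('test.mm')                   # assumes the file from the example above
    offsets = utils.unpickle('test.mm.index')  # the index written by serialize
    doc = mm.docbyoffset(offsets[0])           # same document as mm[0]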
Example #47
    def save(self,
             fname,
             ignore=['state', 'dispatcher'],
             separately=None,
             *args,
             **kwargs):
        """
        Save the model to file.

        Large internal arrays may be stored into separate files, with `fname` as prefix.

        `separately` can be used to define which arrays should be stored in separate files.

        The `ignore` parameter can be used to define which variables should be ignored, i.e. left
        out from the pickled LDA model. By default the internal `state` is ignored, as it uses
        its own serialization rather than the one provided by `LdaModel`. `state` and `dispatcher`
        are always added to whatever `ignore` list is passed in.

        Note: do not save as a compressed file if you intend to load the file back with `mmap`.

        Note: If you intend to use models across Python 2/3 versions there are a few things to
        keep in mind:

          1. Pickled Python dictionaries will not work across Python versions.
          2. The `save` method does not automatically save all numpy arrays separately, only
             those that exceed `sep_limit` set in `gensim.utils.SaveLoad.save`. The main
             concern here is the `alpha` array if, for instance, `alpha='auto'` is used.

        Please refer to the wiki recipes section (https://github.com/piskvorky/gensim/wiki/Recipes-&-FAQ#q9-how-do-i-load-a-model-in-python-3-that-was-trained-and-saved-using-python-2)
        for an example of how to work around these issues.
        """
        if self.state is not None:
            self.state.save(utils.smart_extension(fname, '.state'), *args,
                            **kwargs)
        # Save the dictionary separately if not in 'ignore'.
        if 'id2word' not in ignore:
            utils.pickle(self.id2word,
                         utils.smart_extension(fname, '.id2word'))

        # make sure 'state', 'id2word' and 'dispatcher' are ignored from the pickled object, even if
        # someone sets the ignore list themselves
        if ignore is not None and ignore:
            if isinstance(ignore, six.string_types):
                ignore = [ignore]
            ignore = [e for e in ignore
                      if e]  # make sure None and '' are not in the list
            ignore = list(
                set(['state', 'dispatcher', 'id2word']) | set(ignore))
        else:
            ignore = ['state', 'dispatcher', 'id2word']

        # make sure 'expElogbeta' and 'sstats' are ignored from the pickled object, even if
        # someone sets the separately list themselves.
        separately_explicit = ['expElogbeta', 'sstats']
        # Also add 'alpha' and 'eta' to separately list if they are set 'auto' or some
        # array manually.
        # Guard with isinstance checks: a non-ndarray alpha/eta has no `.shape` attribute.
        if (isinstance(self.alpha, six.string_types) and self.alpha == 'auto') \
                or (isinstance(self.alpha, np.ndarray) and len(self.alpha.shape) != 1):
            separately_explicit.append('alpha')
        if (isinstance(self.eta, six.string_types) and self.eta == 'auto') \
                or (isinstance(self.eta, np.ndarray) and len(self.eta.shape) != 1):
            separately_explicit.append('eta')
        # Merge separately_explicit with separately.
        if separately:
            if isinstance(separately, six.string_types):
                separately = [separately]
            separately = [e for e in separately
                          if e]  # make sure None and '' are not in the list
            separately = list(set(separately_explicit) | set(separately))
        else:
            separately = separately_explicit
        super(LdaModel, self).save(fname,
                                   ignore=ignore,
                                   separately=separately,
                                   *args,
                                   **kwargs)
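Finally, a pure-Python sketch of the `separately` merging above, showing that the explicit entries always survive and a bare string from the caller is normalized to a list first (values here are illustrative):

    separately_explicit = ['expElogbeta', 'sstats', 'alpha']  # e.g. the alpha='auto' case
    separately = 'eta'                 # caller passes a bare string
    if isinstance(separately, str):
        separately = [separately]
    separately = [e for e in separately if e]  # drop None and ''
    separately = sorted(set(separately_explicit) | set(separately))
    print(separately)  # ['alpha', 'eta', 'expElogbeta', 'sstats']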