Example #1
    def save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False):
        """
        Save a corpus in the UCI Bag-of-Words format.

        There are actually two files saved: `fname` and `fname.vocab`, where
        `fname.vocab` is the vocabulary file.

        This function is automatically called by `UciCorpus.serialize`; don't
        call it directly, call `serialize` instead.
        """
        if id2word is None:
            logger.info("no word id mapping provided; initializing from corpus")
            id2word = utils.dict_from_corpus(corpus)
            num_terms = len(id2word)
        else:
            num_terms = 1 + max([-1] + id2word.keys())

        # write out vocabulary
        fname_vocab = fname + '.vocab'
        logger.info("saving vocabulary of %i words to %s" % (num_terms, fname_vocab))
        with utils.smart_open(fname_vocab, 'wb') as fout:
            for featureid in xrange(num_terms):
                fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

        logger.info("storing corpus in UCI Bag-of-Words format: %s" % fname)

        return UciWriter.write_corpus(fname, corpus, index=True, progress_cnt=progress_cnt)
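
As the docstring says, this static method is meant to be reached through `UciCorpus.serialize`. A minimal sketch of that call path; the texts, file name and `Dictionary` below are illustrative:

    from gensim.corpora import Dictionary
    from gensim.corpora.ucicorpus import UciCorpus

    texts = [["human", "interface", "computer"], ["survey", "user", "computer"]]
    dictionary = Dictionary(texts)                          # serves as the id2word mapping
    bow_corpus = [dictionary.doc2bow(text) for text in texts]

    # serialize() calls save_corpus() internally, producing 'corpus.uci',
    # 'corpus.uci.vocab' and an offset index for random access
    UciCorpus.serialize('corpus.uci', bow_corpus, id2word=dictionary)
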
Example #2
    def save_corpus(fname, corpus, id2word=None):
        """
        Save a corpus in the List-of-words format.

        This function is automatically called by `LowCorpus.serialize`; don't
        call it directly, call `serialize` instead.
        """
        if id2word is None:
            logger.info("no word id mapping provided; initializing from corpus")
            id2word = utils.dict_from_corpus(corpus)

        logger.info("storing corpus in List-Of-Words format: %s" % fname)
        truncated = 0
        offsets = []
        with open(fname, 'w') as fout:
            fout.write('%i\n' % len(corpus))
            for doc in corpus:
                words = []
                for wordid, value in doc:
                    if abs(int(value) - value) > 1e-6:
                        truncated += 1
                    words.extend([str(id2word[wordid])] * int(value))
                offsets.append(fout.tell())
                fout.write('%s\n' % ' '.join(words))

        if truncated:
            logger.warning("List-of-words format can only save vectors with "
                            "integer elements; %i float entries were truncated to integer value" %
                            truncated)
        return offsets
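
The same convention applies here: go through `LowCorpus.serialize` rather than calling `save_corpus` yourself. A short sketch with an illustrative corpus and file name:

    from gensim.corpora import Dictionary
    from gensim.corpora.lowcorpus import LowCorpus

    texts = [["human", "interface", "computer"], ["survey", "user", "computer"]]
    dictionary = Dictionary(texts)
    bow_corpus = [dictionary.doc2bow(text) for text in texts]

    # first line of the file is the document count, then one space-separated document per line
    LowCorpus.serialize('corpus.low', bow_corpus, id2word=dictionary)
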
Example #3
    def save_corpus(fname, corpus, id2word=None, metadata=False):
        """
        Save a corpus in the List-of-words format.

        This function is automatically called by `LowCorpus.serialize`; don't
        call it directly, call `serialize` instead.
        """
        if id2word is None:
            logger.info(
                "no word id mapping provided; initializing from corpus")
            id2word = utils.dict_from_corpus(corpus)

        logger.info("storing corpus in List-Of-Words format into %s" % fname)
        truncated = 0
        offsets = []
        with utils.smart_open(fname, 'wb') as fout:
            fout.write(utils.to_utf8('%i\n' % len(corpus)))
            for doc in corpus:
                words = []
                for wordid, value in doc:
                    if abs(int(value) - value) > 1e-6:
                        truncated += 1
                    words.extend([utils.to_unicode(id2word[wordid])] *
                                 int(value))
                offsets.append(fout.tell())
                fout.write(utils.to_utf8('%s\n' % ' '.join(words)))

        if truncated:
            logger.warning(
                "List-of-words format can only save vectors with integer elements; "
                "%i float entries were truncated to integer value", truncated)
        return offsets
Example #4
    def initialize(self, corpus):
        """Initialize the random projection matrix.

        Parameters
        ----------
        corpus : iterable of iterable of (int, int)
          Input corpus.

        """
        if self.id2word is None:
            logger.info("no word id mapping provided; initializing from corpus, assuming identity")
            self.id2word = utils.dict_from_corpus(corpus)
            self.num_terms = len(self.id2word)
        elif self.id2word:
            self.num_terms = 1 + max(self.id2word)
        else:
            self.num_terms = 0

        shape = self.num_topics, self.num_terms
        logger.info("constructing %s random matrix", str(shape))
        # Now construct the projection matrix itself.
        # Here i use a particular form, derived in "Achlioptas: Database-friendly random projection",
        # and his (1) scenario of Theorem 1.1 in particular (all entries are +1/-1).
        randmat = 1 - 2 * np.random.binomial(1, 0.5, shape)  # convert from 0/1 to +1/-1
        # convert from int32 to floats, for faster multiplications
        self.projection = np.asfortranarray(randmat, dtype=np.float32)
    def __init__(self, mallet_path, corpus=None, covariates=None, num_topics=100, id2word=None, workers=4, prefix=None,
                 optimize_interval=0, iterations=1000):
        """
        `mallet_path` is path to the mallet executable, e.g. `/home/kofola/mallet-2.0.7/bin/mallet`.
        `corpus` is a gensim corpus, aka a stream of sparse document vectors.
        `covariates` is a numpy array of covariates for the documents in the corpus.
        `id2word` is a mapping between token ids and tokens.
        `workers` is the number of threads, for parallel training.
        `prefix` is the string prefix under which all data files will be stored; default: system temp + random filename prefix.
        `optimize_interval` optimize hyperparameters every N iterations (sometimes leads to Java exception; 0 to switch off hyperparameter optimization).
        `iterations` is the number of sampling iterations.

        """
        self.mallet_path = mallet_path
        self.id2word = id2word
        if self.id2word is None:
            logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
            self.id2word = utils.dict_from_corpus(corpus)
            self.num_terms = len(self.id2word)
        else:
            self.num_terms = 0 if not self.id2word else 1 + max(self.id2word.keys())
        if self.num_terms == 0:
            raise ValueError("cannot compute DMR over an empty collection (no terms)")
        self.num_topics = num_topics
        if prefix is None:
            rand_prefix = hex(random.randint(0, 0xffffff))[2:] + '_'
            prefix = os.path.join(tempfile.gettempdir(), rand_prefix)
        self.prefix = prefix
        self.workers = workers
        self.optimize_interval = optimize_interval
        self.iterations = iterations

        if corpus is not None and covariates is not None:
            self.train(corpus)
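
For reference, the default `prefix` built above (and by the other Mallet-style wrappers in these examples) is just the system temp directory plus a short random hex token. A standalone sketch of that expansion:

    import os
    import random
    import tempfile

    # same construction as in the constructor above
    rand_prefix = hex(random.randint(0, 0xffffff))[2:] + '_'
    prefix = os.path.join(tempfile.gettempdir(), rand_prefix)
    print(prefix)   # e.g. /tmp/1a2b3c_  (exact value is random)
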
Example #6
    def save_corpus(fname, corpus, id2word=None, metadata=False):
        """
        Save a corpus in the LDA-C format.

        There are actually two files saved: `fname` and `fname.vocab`, where
        `fname.vocab` is the vocabulary file.

        This function is automatically called by `BleiCorpus.serialize`; don't
        call it directly, call `serialize` instead.
        """
        if id2word is None:
            logger.info("no word id mapping provided; initializing from corpus")
            id2word = utils.dict_from_corpus(corpus)
            num_terms = len(id2word)
        else:
            num_terms = 1 + max([-1] + id2word.keys())

        logger.info("storing corpus in Blei's LDA-C format into %s" % fname)
        with utils.smart_open(fname, 'wb') as fout:
            offsets = []
            for doc in corpus:
                doc = list(doc)
                offsets.append(fout.tell())
                parts = ["%i:%s" % p for p in doc if abs(p[1]) > 1e-7]
                fout.write(utils.to_utf8("%i %s\n" % (len(doc), ' '.join(parts))))

        # write out vocabulary, in a format compatible with Blei's topics.py script
        fname_vocab = fname + '.vocab'
        logger.info("saving vocabulary of %i words to %s" % (num_terms, fname_vocab))
        with utils.smart_open(fname_vocab, 'wb') as fout:
            for featureid in xrange(num_terms):
                fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

        return offsets
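
Again, `BleiCorpus.serialize` is the intended entry point; it writes the LDA-C file plus the `.vocab` companion described above. A minimal sketch with illustrative names:

    from gensim.corpora import Dictionary
    from gensim.corpora.bleicorpus import BleiCorpus

    texts = [["human", "interface", "computer"], ["survey", "user", "computer"]]
    dictionary = Dictionary(texts)
    bow_corpus = [dictionary.doc2bow(text) for text in texts]

    # each output line is "<num_entries> <id>:<count> <id>:<count> ..."
    BleiCorpus.serialize('corpus.lda-c', bow_corpus, id2word=dictionary)
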
Example #7
    def initialize(self, corpus):
        """Initialize the random projection matrix.

        Parameters
        ----------
        corpus : iterable of iterable of (int, int)
          Input corpus.

        """
        if self.id2word is None:
            logger.info(
                "no word id mapping provided; initializing from corpus, assuming identity"
            )
            self.id2word = utils.dict_from_corpus(corpus)
            self.num_terms = len(self.id2word)
        elif self.id2word:
            self.num_terms = 1 + max(self.id2word)
        else:
            self.num_terms = 0

        shape = self.num_topics, self.num_terms
        logger.info("constructing %s random matrix", str(shape))
        # Now construct the projection matrix itself.
        # Here i use a particular form, derived in "Achlioptas: Database-friendly random projection",
        # and his (1) scenario of Theorem 1.1 in particular (all entries are +1/-1).
        randmat = 1 - 2 * np.random.binomial(
            1, 0.5, shape)  # convert from 0/1 to +1/-1
        # convert from int32 to floats, for faster multiplications
        self.projection = np.asfortranarray(randmat, dtype=np.float32)
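
The matrix built here is a plain dense +1/-1 matrix in the Achlioptas style; projecting a document is then a single matrix-vector product. A self-contained numpy sketch of the same construction, with arbitrary illustrative dimensions:

    import numpy as np

    num_topics, num_terms = 4, 10
    randmat = 1 - 2 * np.random.binomial(1, 0.5, (num_topics, num_terms))  # entries are +1/-1
    projection = np.asfortranarray(randmat, dtype=np.float32)

    # project one sparse BoW document [(word_id, count), ...] into the topic space
    doc = [(0, 2), (3, 1), (7, 5)]
    dense = np.zeros(num_terms, dtype=np.float32)
    for word_id, count in doc:
        dense[word_id] = count
    topic_vector = projection.dot(dense)   # shape: (num_topics,)
    print(topic_vector)
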
Example #8
    def save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False):
        """
        Save a corpus in the UCI Bag-of-Words format.

        There are actually two files saved: `fname` and `fname.vocab`, where
        `fname.vocab` is the vocabulary file.

        This function is automatically called by `UciCorpus.serialize`; don't
        call it directly, call `serialize` instead.
        """
        if id2word is None:
            logger.info("no word id mapping provided; initializing from corpus")
            id2word = utils.dict_from_corpus(corpus)
            num_terms = len(id2word)
        else:
            num_terms = 1 + max([-1] + id2word.keys())

        # write out vocabulary
        fname_vocab = fname + '.vocab'
        logger.info("saving vocabulary of %i words to %s" % (num_terms, fname_vocab))
        with utils.smart_open(fname_vocab, 'wb') as fout:
            for featureid in xrange(num_terms):
                fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

        logger.info("storing corpus in UCI Bag-of-Words format: %s" % fname)

        return UciWriter.write_corpus(fname, corpus, index=True, progress_cnt=progress_cnt)
Example #9
    def __init__(self,
                 corpus=None,
                 num_topics=100,
                 id2word=None,
                 passes=1,
                 threshold=0.001,
                 iterations=10,
                 alpha=None,
                 eta=None,
                 offset=1.0,
                 decay=0.5,
                 eval_every=1,
                 random_state=None):

        if alpha is None:
            alpha = 1.0 / num_topics
        if eta is None:
            eta = 1.0 / num_topics

        self.id2word = id2word
        if corpus is None and self.id2word is None:
            raise ValueError(
                'at least one of corpus/id2word must be specified, to establish input space dimensionality'
            )

        if self.id2word is None:
            logger.warning(
                "no word id mapping provided; initializing from corpus, assuming identity"
            )
            self.id2word = utils.dict_from_corpus(corpus)
            self.num_terms = len(self.id2word)
        elif len(self.id2word) > 0:
            self.num_terms = 1 + max(self.id2word.keys())
        else:
            self.num_terms = 0

        if self.num_terms == 0:
            raise ValueError(
                "cannot compute LDA over an empty collection (no terms)")

        logger.info('Vocabulary consists of %d words.', self.num_terms)

        self.corpus = corpus
        self.iterations = iterations
        self.passes = passes
        self.num_topics = num_topics
        self.threshold = threshold
        self.alpha = alpha
        self.eta = eta
        self.offset = offset
        self.decay = decay
        self.num_docs = len(corpus)
        self.eval_every = eval_every
        self.random_state = get_random_state(random_state)

        if corpus is not None:
            self.inference(corpus)
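
The `id2word`/`num_terms` bookkeeping at the top of this constructor recurs in almost every example on this page. A small standalone sketch of the two branches, assuming `utils` here is `gensim.utils` (which provides the `dict_from_corpus` helper used above):

    from gensim import utils

    bow_corpus = [[(0, 1), (2, 3)], [(1, 2), (4, 1)]]

    # branch 1: no mapping supplied -> build an identity-style mapping from the corpus
    id2word = utils.dict_from_corpus(bow_corpus)
    num_terms = len(id2word)                         # 5: word ids 0..4

    # branch 2: mapping supplied -> vocabulary size is one past the largest word id
    id2word_given = {0: 'human', 2: 'computer', 4: 'survey'}
    num_terms_given = 1 + max(id2word_given.keys())  # also 5, even though only 3 ids are mapped
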
Example #10
    def __init__(self,
                 mallet_path,
                 corpus=None,
                 num_topics=100,
                 alpha=50,
                 id2word=None,
                 workers=4,
                 prefix=None,
                 optimize_interval=0,
                 iterations=1000,
                 topic_threshold=0.0):
        """
        `mallet_path` is path to the mallet executable, e.g. `/home/kofola/mallet-2.0.7/bin/mallet`.

        `corpus` is a gensim corpus, aka a stream of sparse document vectors.

        `id2word` is a mapping between token ids and tokens.

        `workers` is the number of threads, for parallel training.

        `prefix` is the string prefix under which all data files will be stored;
        default: system temp + random filename prefix.

        `optimize_interval` optimize hyperparameters every N iterations (sometimes leads to Java exception;
        0 to switch off hyperparameter optimization).

        `iterations` is the number of sampling iterations.

        `topic_threshold` is the threshold of the probability above which we consider a topic.
        This is basically for sparse topic distribution.

        """
        self.mallet_path = mallet_path
        self.id2word = id2word
        if self.id2word is None:
            logger.warning(
                "no word id mapping provided; initializing from corpus, assuming identity"
            )
            self.id2word = utils.dict_from_corpus(corpus)
            self.num_terms = len(self.id2word)
        else:
            self.num_terms = 0 if not self.id2word else 1 + max(
                self.id2word.keys())
        if self.num_terms == 0:
            raise ValueError(
                "cannot compute LDA over an empty collection (no terms)")
        self.num_topics = num_topics
        self.topic_threshold = topic_threshold
        self.alpha = alpha
        if prefix is None:
            rand_prefix = hex(random.randint(0, 0xffffff))[2:] + '_'
            prefix = os.path.join(tempfile.gettempdir(), rand_prefix)
        self.prefix = prefix
        self.workers = workers
        self.optimize_interval = optimize_interval
        self.iterations = iterations
        if corpus is not None:
            self.train(corpus)
    def save_corpus(fname,
                    corpus,
                    id2word=None,
                    progress_cnt=10000,
                    metadata=False):
        """Save a corpus in the UCI Bag-of-Words format.

        Warnings
        --------
        This function is automatically called by :meth:`gensim.corpora.ucicorpus.UciCorpus.serialize`,
        don't call it directly, call :meth:`gensim.corpora.ucicorpus.UciCorpus.serialize` instead.

        Parameters
        ----------
        fname : str
            Path to output file.
        corpus: iterable of iterable of (int, int)
            Corpus in BoW format.
        id2word : {dict of (int, str), :class:`gensim.corpora.dictionary.Dictionary`}, optional
            Mapping between words and their ids. If None - will be inferred from `corpus`.
        progress_cnt : int, optional
            Progress counter, write log message each `progress_cnt` documents.
        metadata : bool, optional
            THIS PARAMETER WILL BE IGNORED.

        Notes
        -----
        There are actually two files saved: `fname` and `fname.vocab`, where `fname.vocab` is the vocabulary file.

        """
        if id2word is None:
            logger.info(
                "no word id mapping provided; initializing from corpus")
            id2word = utils.dict_from_corpus(corpus)
            num_terms = len(id2word)
        elif id2word:
            num_terms = 1 + max(id2word)
        else:
            num_terms = 0

        # write out vocabulary
        fname_vocab = utils.smart_extension(fname, '.vocab')
        logger.info("saving vocabulary of %i words to %s", num_terms,
                    fname_vocab)
        with utils.smart_open(fname_vocab, 'wb') as fout:
            for featureid in range(num_terms):
                fout.write(
                    utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

        logger.info("storing corpus in UCI Bag-of-Words format: %s", fname)

        return UciWriter.write_corpus(fname,
                                      corpus,
                                      index=True,
                                      progress_cnt=progress_cnt)
Example #12
    def __init__(self, mallet_path, corpus=None, num_topics=100, alpha=50, id2word=None, workers=4, prefix=None,
                 optimize_interval=0, iterations=1000, topic_threshold=0.0, random_seed=0):
        """

        Parameters
        ----------
        mallet_path : str
            Path to the mallet binary, e.g. `/home/username/mallet-2.0.7/bin/mallet`.
        corpus : iterable of iterable of (int, int), optional
            Collection of texts in BoW format.
        num_topics : int, optional
            Number of topics.
        alpha : int, optional
            Alpha parameter of LDA.
        id2word : :class:`~gensim.corpora.dictionary.Dictionary`, optional
            Mapping between tokens ids and words from corpus, if not specified - will be inferred from `corpus`.
        workers : int, optional
            Number of threads that will be used for training.
        prefix : str, optional
            Prefix for produced temporary files.
        optimize_interval : int, optional
            Optimize hyperparameters every `optimize_interval` iterations
            (sometimes leads to a Java exception; set to 0 to switch off hyperparameter optimization).
        iterations : int, optional
            Number of training iterations.
        topic_threshold : float, optional
            Threshold of the probability above which we consider a topic.
        random_seed: int, optional
            Random seed to ensure consistent results, if 0 - use system clock.

        """
        self.mallet_path = mallet_path
        self.id2word = id2word
        if self.id2word is None:
            logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
            self.id2word = utils.dict_from_corpus(corpus)
            self.num_terms = len(self.id2word)
        else:
            self.num_terms = 0 if not self.id2word else 1 + max(self.id2word.keys())
        if self.num_terms == 0:
            raise ValueError("cannot compute LDA over an empty collection (no terms)")
        self.num_topics = num_topics
        self.topic_threshold = topic_threshold
        self.alpha = alpha
        if prefix is None:
            rand_prefix = hex(random.randint(0, 0xffffff))[2:] + '_'
            prefix = os.path.join(tempfile.gettempdir(), rand_prefix)
        self.prefix = prefix
        self.workers = workers
        self.optimize_interval = optimize_interval
        self.iterations = iterations
        self.random_seed = random_seed
        if corpus is not None:
            self.train(corpus)
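
This constructor matches the Mallet LDA wrapper shipped with older gensim releases (`gensim.models.wrappers.LdaMallet`; the wrappers package was dropped in gensim 4.x). Assuming such a release and a local Mallet installation, a minimal usage sketch with an illustrative corpus and path:

    from gensim.corpora import Dictionary
    from gensim.models.wrappers import LdaMallet   # gensim < 4.0

    texts = [["human", "interface", "computer"], ["survey", "user", "computer"]]
    dictionary = Dictionary(texts)
    bow_corpus = [dictionary.doc2bow(text) for text in texts]

    mallet_path = '/home/username/mallet-2.0.7/bin/mallet'   # path format from the docstring above
    lda = LdaMallet(mallet_path, corpus=bow_corpus, num_topics=2,
                    id2word=dictionary, iterations=100, random_seed=42)
    print(lda.show_topics(num_topics=2, num_words=3))
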
Example #13
    def __init__(self, mallet_path, corpus=None, num_topics=100, alpha=50, id2word=None, workers=4, prefix=None,
                 optimize_interval=0, iterations=1000, topic_threshold=0.0, random_seed=0):
        """

        Parameters
        ----------
        mallet_path : str
            Path to the mallet binary, e.g. `/home/username/mallet-2.0.7/bin/mallet`.
        corpus : iterable of iterable of (int, int), optional
            Collection of texts in BoW format.
        num_topics : int, optional
            Number of topics.
        alpha : int, optional
            Alpha parameter of LDA.
        id2word : :class:`~gensim.corpora.dictionary.Dictionary`, optional
            Mapping between tokens ids and words from corpus, if not specified - will be inferred from `corpus`.
        workers : int, optional
            Number of threads that will be used for training.
        prefix : str, optional
            Prefix for produced temporary files.
        optimize_interval : int, optional
            Optimize hyperparameters every `optimize_interval` iterations
            (sometimes leads to a Java exception; set to 0 to switch off hyperparameter optimization).
        iterations : int, optional
            Number of training iterations.
        topic_threshold : float, optional
            Threshold of the probability above which we consider a topic.
        random_seed: int, optional
            Random seed to ensure consistent results, if 0 - use system clock.

        """
        self.mallet_path = mallet_path
        self.id2word = id2word
        if self.id2word is None:
            logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
            self.id2word = utils.dict_from_corpus(corpus)
            self.num_terms = len(self.id2word)
        else:
            self.num_terms = 0 if not self.id2word else 1 + max(self.id2word.keys())
        if self.num_terms == 0:
            raise ValueError("cannot compute LDA over an empty collection (no terms)")
        self.num_topics = num_topics
        self.topic_threshold = topic_threshold
        self.alpha = alpha
        if prefix is None:
            rand_prefix = hex(random.randint(0, 0xffffff))[2:] + '_'
            prefix = os.path.join(tempfile.gettempdir(), rand_prefix)
        self.prefix = prefix
        self.workers = workers
        self.optimize_interval = optimize_interval
        self.iterations = iterations
        self.random_seed = random_seed
        if corpus is not None:
            self.train(corpus)
Example #14
    def save_corpus(fname, corpus, id2word=None, metadata=False):
        """Save a corpus in the LDA-C format.

        Notes
        -----
        There are actually two files saved: `fname` and `fname.vocab`, where `fname.vocab` is the vocabulary file.

        Parameters
        ----------
        fname : str
            Path to output file.
        corpus : iterable of iterable of (int, float)
            Input corpus in BoW format.
        id2word : dict of (int, str), optional
            Mapping id -> word for `corpus`.
        metadata : bool, optional
            THIS PARAMETER WILL BE IGNORED.

        Returns
        -------
        list of int
            Offsets for each line in file (in bytes).

        """
        if id2word is None:
            logger.info(
                "no word id mapping provided; initializing from corpus")
            id2word = utils.dict_from_corpus(corpus)
            num_terms = len(id2word)
        elif id2word:
            num_terms = 1 + max(id2word)
        else:
            num_terms = 0

        logger.info("storing corpus in Blei's LDA-C format into %s", fname)
        with utils.open(fname, 'wb') as fout:
            offsets = []
            for doc in corpus:
                doc = list(doc)
                offsets.append(fout.tell())
                parts = ["%i:%g" % p for p in doc if abs(p[1]) > 1e-7]
                fout.write(
                    utils.to_utf8("%i %s\n" % (len(doc), ' '.join(parts))))

        # write out vocabulary, in a format compatible with Blei's topics.py script
        fname_vocab = utils.smart_extension(fname, '.vocab')
        logger.info("saving vocabulary of %i words to %s", num_terms,
                    fname_vocab)
        with utils.open(fname_vocab, 'wb') as fout:
            for featureid in range(num_terms):
                fout.write(
                    utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

        return offsets
    def save_corpus(fname, corpus, id2word=None, metadata=False):
        """Save a corpus in the GibbsLda++ format.

        Warnings
        --------
        This function is automatically called by :meth:`gensim.corpora.lowcorpus.LowCorpus.serialize`,
        don't call it directly, call :meth:`gensim.corpora.lowcorpus.LowCorpus.serialize` instead.

        Parameters
        ----------
        fname : str
            Path to output file.
        corpus : iterable of iterable of (int, int)
            Corpus in BoW format.
        id2word : {dict of (int, str), :class:`~gensim.corpora.dictionary.Dictionary`}, optional
            Mapping between word_ids (integers) and words (strings).
            If not provided, the mapping is constructed directly from `corpus`.
        metadata : bool, optional
            THIS PARAMETER WILL BE IGNORED.

        Returns
        -------
        list of int
            List of offsets in resulting file for each document (in bytes),
            can be used for :meth:`~gensim.corpora.lowcorpus.LowCorpus.docbyoffset`

        """
        if id2word is None:
            logger.info(
                "no word id mapping provided; initializing from corpus")
            id2word = utils.dict_from_corpus(corpus)

        logger.info("storing corpus in List-Of-Words format into %s" % fname)
        truncated = 0
        offsets = []
        with utils.smart_open(fname, 'wb') as fout:
            fout.write(utils.to_utf8('%i\n' % len(corpus)))
            for doc in corpus:
                words = []
                for wordid, value in doc:
                    if abs(int(value) - value) > 1e-6:
                        truncated += 1
                    words.extend([utils.to_unicode(id2word[wordid])] *
                                 int(value))
                offsets.append(fout.tell())
                fout.write(utils.to_utf8('%s\n' % ' '.join(words)))

        if truncated:
            logger.warning(
                "List-of-words format can only save vectors with integer elements; "
                "%i float entries were truncated to integer value", truncated)
        return offsets
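
The byte offsets returned here are what make random access possible later; `serialize` stores them in a side index so that `docbyoffset` (mentioned in the docstring) can seek straight to a document. A short sketch, file names illustrative:

    from gensim.corpora import Dictionary
    from gensim.corpora.lowcorpus import LowCorpus

    texts = [["human", "interface", "computer"], ["survey", "user", "computer"]]
    dictionary = Dictionary(texts)
    bow_corpus = [dictionary.doc2bow(text) for text in texts]

    LowCorpus.serialize('corpus.low', bow_corpus, id2word=dictionary)
    corpus = LowCorpus('corpus.low', id2word=dictionary)   # picks up 'corpus.low.index'

    print(corpus[1])                              # second document, via the saved offsets
    print(corpus.docbyoffset(corpus.index[1]))    # same thing, using the offset directly
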
Example #16
    def save_corpus(fname, corpus, id2word=None, metadata=False):
        """Save a corpus in the GibbsLda++ format.

        Warnings
        --------
        This function is automatically called by :meth:`gensim.corpora.lowcorpus.LowCorpus.serialize`,
        don't call it directly, call :meth:`gensim.corpora.lowcorpus.LowCorpus.serialize` instead.

        Parameters
        ----------
        fname : str
            Path to output file.
        corpus : iterable of iterable of (int, int)
            Corpus in BoW format.
        id2word : {dict of (int, str), :class:`~gensim.corpora.dictionary.Dictionary`}, optional
            Mapping between word_ids (integers) and words (strings).
            If not provided, the mapping is constructed directly from `corpus`.
        metadata : bool, optional
            THIS PARAMETER WILL BE IGNORED.

        Returns
        -------
        list of int
            List of offsets in resulting file for each document (in bytes),
            can be used for :meth:`~gensim.corpora.lowcorpus.LowCorpus.docbyoffset`

        """
        if id2word is None:
            logger.info("no word id mapping provided; initializing from corpus")
            id2word = utils.dict_from_corpus(corpus)

        logger.info("storing corpus in List-Of-Words format into %s" % fname)
        truncated = 0
        offsets = []
        with utils.smart_open(fname, 'wb') as fout:
            fout.write(utils.to_utf8('%i\n' % len(corpus)))
            for doc in corpus:
                words = []
                for wordid, value in doc:
                    if abs(int(value) - value) > 1e-6:
                        truncated += 1
                    words.extend([utils.to_unicode(id2word[wordid])] * int(value))
                offsets.append(fout.tell())
                fout.write(utils.to_utf8('%s\n' % ' '.join(words)))

        if truncated:
            logger.warning(
                "List-of-words format can only save vectors with integer elements; "
                "%i float entries were truncated to integer value", truncated
            )
        return offsets
    def save_corpus(fname, corpus, id2word=None, metadata=False):
        """Save a corpus in the LDA-C format.

        Notes
        -----
        There are actually two files saved: `fname` and `fname.vocab`, where `fname.vocab` is the vocabulary file.

        Parameters
        ----------
        fname : str
            Path to output file.
        corpus : iterable of iterable of (int, float)
            Input corpus in BoW format.
        id2word : dict of (int, str), optional
            Mapping id -> word for `corpus`.
        metadata : bool, optional
            THIS PARAMETER WILL BE IGNORED.

        Returns
        -------
        list of int
            Offsets for each line in file (in bytes).

        """
        if id2word is None:
            logger.info("no word id mapping provided; initializing from corpus")
            id2word = utils.dict_from_corpus(corpus)
            num_terms = len(id2word)
        elif id2word:
            num_terms = 1 + max(id2word)
        else:
            num_terms = 0

        logger.info("storing corpus in Blei's LDA-C format into %s", fname)
        with utils.smart_open(fname, 'wb') as fout:
            offsets = []
            for doc in corpus:
                doc = list(doc)
                offsets.append(fout.tell())
                parts = ["%i:%g" % p for p in doc if abs(p[1]) > 1e-7]
                fout.write(utils.to_utf8("%i %s\n" % (len(doc), ' '.join(parts))))

        # write out vocabulary, in a format compatible with Blei's topics.py script
        fname_vocab = utils.smart_extension(fname, '.vocab')
        logger.info("saving vocabulary of %i words to %s", num_terms, fname_vocab)
        with utils.smart_open(fname_vocab, 'wb') as fout:
            for featureid in range(num_terms):
                fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

        return offsets
    def save_corpus(fname, corpus, id2word=None, metadata=False):
        """
        Save a corpus in the Mallet format.

        The document id will be generated by enumerating the corpus.
        That is, it will range between 0 and number of documents in the corpus.

        Since Mallet has a language field in the format, this defaults to the string '__unknown__'.
        If the language needs to be saved, post-processing will be required.

        This function is automatically called by `MalletCorpus.serialize`; don't
        call it directly, call `serialize` instead.

        """
        if id2word is None:
            logger.info(
                "no word id mapping provided; initializing from corpus")
            id2word = utils.dict_from_corpus(corpus)

        logger.info("storing corpus in Mallet format into %s" % fname)

        truncated = 0
        offsets = []
        with utils.smart_open(fname, 'wb') as fout:
            for doc_id, doc in enumerate(corpus):
                if metadata:
                    doc_id, doc_lang = doc[1]
                    doc = doc[0]
                else:
                    doc_lang = '__unknown__'

                words = []
                for wordid, value in doc:
                    if abs(int(value) - value) > 1e-6:
                        truncated += 1
                    words.extend([utils.to_unicode(id2word[wordid])] *
                                 int(value))
                offsets.append(fout.tell())
                fout.write(
                    utils.to_utf8('%s %s %s\n' %
                                  (doc_id, doc_lang, ' '.join(words))))

        if truncated:
            logger.warning(
                "Mallet format can only save vectors with "
                "integer elements; %i float entries were truncated to integer value"
                % truncated)

        return offsets
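
With `metadata=True`, each corpus item is expected to be a pair `(bow_doc, (doc_id, language))`, which is exactly what the unpacking above consumes; otherwise the language defaults to `'__unknown__'`. A small sketch through `MalletCorpus.serialize` (names illustrative):

    from gensim.corpora import Dictionary
    from gensim.corpora.malletcorpus import MalletCorpus

    texts = [["human", "interface", "computer"], ["survey", "user", "computer"]]
    dictionary = Dictionary(texts)

    # explicit document ids and languages
    corpus_with_meta = [(dictionary.doc2bow(t), (str(i), 'en')) for i, t in enumerate(texts)]
    MalletCorpus.serialize('corpus.mallet', corpus_with_meta, id2word=dictionary, metadata=True)

    # plain BoW corpus: ids are enumerated, language becomes '__unknown__'
    MalletCorpus.serialize('corpus2.mallet', [dictionary.doc2bow(t) for t in texts], id2word=dictionary)
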
Example #19
    def save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False):
        """Save a corpus in the UCI Bag-of-Words format.

        Warnings
        --------
        This function is automatically called by :meth:`gensim.corpora.ucicorpus.UciCorpus.serialize`,
        don't call it directly, call :meth:`gensim.corpora.ucicorpus.UciCorpus.serialize` instead.

        Parameters
        ----------
        fname : str
            Path to output file.
        corpus: iterable of iterable of (int, int)
            Corpus in BoW format.
        id2word : {dict of (int, str), :class:`gensim.corpora.dictionary.Dictionary`}, optional
            Mapping between words and their ids. If None - will be inferred from `corpus`.
        progress_cnt : int, optional
            Progress counter, write log message each `progress_cnt` documents.
        metadata : bool, optional
            THIS PARAMETER WILL BE IGNORED.

        Notes
        -----
        There are actually two files saved: `fname` and `fname.vocab`, where `fname.vocab` is the vocabulary file.

        """
        if id2word is None:
            logger.info("no word id mapping provided; initializing from corpus")
            id2word = utils.dict_from_corpus(corpus)
            num_terms = len(id2word)
        elif id2word:
            num_terms = 1 + max(id2word)
        else:
            num_terms = 0

        # write out vocabulary
        fname_vocab = utils.smart_extension(fname, '.vocab')
        logger.info("saving vocabulary of %i words to %s", num_terms, fname_vocab)
        with utils.smart_open(fname_vocab, 'wb') as fout:
            for featureid in range(num_terms):
                fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

        logger.info("storing corpus in UCI Bag-of-Words format: %s", fname)

        return UciWriter.write_corpus(fname, corpus, index=True, progress_cnt=progress_cnt)
Example #20
    def save_corpus(fname, corpus, id2word=None, metadata=False):
        """
        Save a corpus in the Mallet format.

        The document id will be generated by enumerating the corpus.
        That is, it will range between 0 and number of documents in the corpus.

        Since Mallet has a language field in the format, this defaults to the string '__unknown__'.
        If the language needs to be saved, post-processing will be required.

        This function is automatically called by `MalletCorpus.serialize`; don't
        call it directly, call `serialize` instead.

        """
        if id2word is None:
            logger.info("no word id mapping provided; initializing from corpus")
            id2word = utils.dict_from_corpus(corpus)

        logger.info("storing corpus in Mallet format into %s", fname)

        truncated = 0
        offsets = []
        with utils.smart_open(fname, 'wb') as fout:
            for doc_id, doc in enumerate(corpus):
                if metadata:
                    doc_id, doc_lang = doc[1]
                    doc = doc[0]
                else:
                    doc_lang = '__unknown__'

                words = []
                for wordid, value in doc:
                    if abs(int(value) - value) > 1e-6:
                        truncated += 1
                    words.extend([utils.to_unicode(id2word[wordid])] * int(value))
                offsets.append(fout.tell())
                fout.write(utils.to_utf8('%s %s %s\n' % (doc_id, doc_lang, ' '.join(words))))

        if truncated:
            logger.warning(
                "Mallet format can only save vectors with integer elements; "
                "%i float entries were truncated to integer value", truncated
            )

        return offsets
Example #21
    def save_corpus(fname, corpus, id2word=None, metadata=False):
        """
        Save a corpus in the LDA-C format.

        There are actually two files saved: `fname` and `fname.vocab`, where
        `fname.vocab` is the vocabulary file.

        This function is automatically called by `BleiCorpus.serialize`; don't
        call it directly, call `serialize` instead.
        """
        if id2word is None:
            logger.info(
                "no word id mapping provided; initializing from corpus")
            id2word = utils.dict_from_corpus(corpus)
            num_terms = len(id2word)
        else:
            num_terms = 1 + max([-1] + id2word.keys())

        logger.info("storing corpus in Blei's LDA-C format into %s" % fname)
        with utils.smart_open(fname, 'w') as fout:
            offsets = []
            for doc in corpus:
                doc = list(doc)
                offsets.append(fout.tell())
                fout.write(
                    "%i %s\n" %
                    (len(doc), ' '.join("%i:%s" % p
                                        for p in doc if abs(p[1]) > 1e-12)))

        # write out vocabulary, in a format compatible with Blei's topics.py script
        fname_vocab = fname + '.vocab'
        logger.info("saving vocabulary of %i words to %s" %
                    (num_terms, fname_vocab))
        with utils.smart_open(fname_vocab, 'wb') as fout:
            for featureid in xrange(num_terms):
                fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

        return offsets
    def __init__(self,
                 vw_path,
                 corpus=None,
                 num_topics=100,
                 id2word=None,
                 chunksize=256,
                 passes=1,
                 alpha=0.1,
                 eta=0.1,
                 decay=0.5,
                 offset=1,
                 gamma_threshold=0.001,
                 random_seed=None,
                 cleanup_files=True,
                 tmp_prefix='tmp'):
        """

        Parameters
        ----------
        vw_path : str
            Path to Vowpal Wabbit's binary.
        corpus : iterable of list of (int, int), optional
            Collection of texts in BoW format. If given, training will start immediately,
            otherwise, you should call :meth:`~gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit.train` or
            :meth:`~gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit.update` manually for training.
        num_topics : int, optional
            Number of requested latent topics to be extracted from the training corpus.
            Corresponds to VW's ``--lda <num_topics>`` argument.
        id2word : :class:`~gensim.corpora.dictionary.Dictionary`, optional
            Mapping from word ids (integers) to words (strings).
        chunksize : int, optional
            Number of documents examined in each batch.
            Corresponds to VW's ``--minibatch <batch_size>`` argument.
        passes : int, optional
            Number of passes over the dataset to use.
            Corresponds to VW's ``--passes <passes>`` argument.
        alpha : float, optional
            Float affecting the sparsity of per-document topic weights.
            This is applied symmetrically, and should be set higher when documents are thought to look more similar.
            Corresponds to VW's ``--lda_alpha <alpha>`` argument.
        eta : float, optional
            Affects the sparsity of topic distributions.
            This is applied symmetrically, and should be set higher when topics
            are thought to look more similar.
            Corresponds to VW's ``--lda_rho <rho>`` argument.
        decay : float, optional
            Learning rate decay, affects how quickly learnt values are forgotten.
            Should be set to a value between 0.5 and 1.0 to guarantee convergence.
            Corresponds to VW's ``--power_t <tau>`` argument.
        offset: int, optional
            Learning offset, set to higher values to slow down learning on early iterations of the algorithm.
            Corresponds to VW's ``--initial_t <tau>`` argument.
        gamma_threshold : float, optional
            Affects when learning loop will be broken out of, higher values will result in earlier loop completion.
            Corresponds to VW's ``--epsilon <eps>`` argument.
        random_seed : int, optional
            Sets random seed when learning.
            Corresponds to VW's ``--random_seed <seed>`` argument.
        cleanup_files : bool, optional
            Whether or not to delete temporary directory and files used by this wrapper.
            Setting to False can be useful for debugging, or for re-using Vowpal Wabbit files elsewhere.
        tmp_prefix : str, optional
            To prefix temporary working directory name.

        """
        # default parameters are taken from Vowpal Wabbit's defaults, and
        # parameter names changed to match Gensim's LdaModel where possible
        self.vw_path = vw_path
        self.id2word = id2word

        if self.id2word is None:
            if corpus is None:
                raise ValueError(
                    "at least one of corpus/id2word must be specified, to establish input space dimensionality"
                )
            logger.warning(
                "no word id mapping provided; initializing from corpus, assuming identity"
            )
            self.id2word = utils.dict_from_corpus(corpus)
            self.num_terms = len(self.id2word)
        elif len(self.id2word) > 0:
            self.num_terms = 1 + max(self.id2word.keys())
        else:
            self.num_terms = 0

        if self.num_terms == 0:
            raise ValueError(
                "cannot compute LDA over an empty collection (no terms)")

        # LDA parameters
        self.num_topics = num_topics
        self.chunksize = chunksize
        self.passes = passes
        self.alpha = alpha
        self.eta = eta
        self.gamma_threshold = gamma_threshold
        self.offset = offset
        self.decay = decay
        self.random_seed = random_seed
        self._initial_offset = offset

        # temporary files used for Vowpal Wabbit input/output
        self.tmp_dir = None
        self.tmp_prefix = tmp_prefix
        self.cleanup_files = cleanup_files
        self._init_temp_dir(tmp_prefix)

        # used for saving/loading this model's state
        self._model_data = None
        self._topics_data = None

        # cache loaded topics as numpy array
        self._topics = None

        if corpus is not None:
            self.train(corpus)
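
The full class path is given in the docstring (`gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit`), so this is the pre-4.0 Vowpal Wabbit wrapper. Assuming such a gensim release and a local `vw` binary, a minimal sketch; the binary path and corpus are illustrative:

    from gensim.corpora import Dictionary
    from gensim.models.wrappers import LdaVowpalWabbit   # gensim < 4.0

    texts = [["human", "interface", "computer"], ["survey", "user", "computer"]]
    dictionary = Dictionary(texts)
    bow_corpus = [dictionary.doc2bow(text) for text in texts]

    lda = LdaVowpalWabbit('/usr/local/bin/vw',            # wherever the vw binary lives
                          corpus=bow_corpus, num_topics=2, id2word=dictionary,
                          passes=2, random_seed=42)
    print(lda.show_topics(num_topics=2, num_words=3))
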
Example #23
    def __init__(
        self,
        corpus=None,
        num_topics=100,
        id2word=None,
        chunksize=2000,
        passes=1,
        kappa=1.0,
        minimum_probability=0.01,
        w_max_iter=200,
        w_stop_condition=1e-4,
        h_max_iter=50,
        h_stop_condition=1e-3,
        eval_every=10,
        normalize=True,
        random_state=None,
    ):
        r"""

        Parameters
        ----------
        corpus : iterable of list of (int, float) or `csc_matrix` with the shape (n_tokens, n_documents), optional
            Training corpus.
            Can be either iterable of documents, which are lists of `(word_id, word_count)`,
            or a sparse csc matrix of BOWs for each document.
            If not specified, the model is left uninitialized (presumably, to be trained later with `self.train()`).
        num_topics : int, optional
            Number of topics to extract.
        id2word: {dict of (int, str), :class:`gensim.corpora.dictionary.Dictionary`}
            Mapping from word IDs to words. It is used to determine the vocabulary size, as well as for
            debugging and topic printing.
        chunksize: int, optional
            Number of documents to be used in each training chunk.
        passes: int, optional
            Number of full passes over the training corpus.
            Leave at default `passes=1` if your input is an iterator.
        kappa : float, optional
            Gradient descent step size.
            Larger value makes the model train faster, but could lead to non-convergence if set too large.
        minimum_probability:
            If `normalize` is True, topics with smaller probabilities are filtered out.
            If `normalize` is False, topics with smaller factors are filtered out.
            If set to None, a value of 1e-8 is used to prevent 0s.
        w_max_iter: int, optional
            Maximum number of iterations to train W per each batch.
        w_stop_condition: float, optional
            If error difference gets less than that, training of ``W`` stops for the current batch.
        h_max_iter: int, optional
            Maximum number of iterations to train h per each batch.
        h_stop_condition: float
            If error difference gets less than that, training of ``h`` stops for the current batch.
        eval_every: int, optional
            Number of batches after which l2 norm of (v - Wh) is computed. Decreases performance if set too low.
        normalize: bool or None, optional
            Whether to normalize the result. Allows for estimation of perplexity, coherence, etc.
        random_state: {np.random.RandomState, int}, optional
            Seed for random generator. Needed for reproducibility.

        """
        self.num_topics = num_topics
        self.id2word = id2word
        self.chunksize = chunksize
        self.passes = passes
        self._kappa = kappa
        self.minimum_probability = minimum_probability
        self._w_max_iter = w_max_iter
        self._w_stop_condition = w_stop_condition
        self._h_max_iter = h_max_iter
        self._h_stop_condition = h_stop_condition
        self.eval_every = eval_every
        self.normalize = normalize
        self.random_state = utils.get_random_state(random_state)

        self.v_max = None

        if self.id2word is None:
            self.id2word = utils.dict_from_corpus(corpus)

        self.num_tokens = len(self.id2word)

        self.A = None
        self.B = None

        self._W = None
        self.w_std = None
        self._w_error = np.inf

        self._h = None

        if corpus is not None:
            self.update(corpus)
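
This docstring matches gensim's online NMF model (`gensim.models.nmf.Nmf`); assuming that class, a minimal sketch with an illustrative corpus:

    from gensim.corpora import Dictionary
    from gensim.models.nmf import Nmf

    texts = [["human", "interface", "computer"], ["survey", "user", "computer"]]
    dictionary = Dictionary(texts)
    bow_corpus = [dictionary.doc2bow(text) for text in texts]

    nmf = Nmf(corpus=bow_corpus, num_topics=2, id2word=dictionary,
              passes=5, random_state=42)
    print(nmf.show_topics(num_topics=2, num_words=3))
    print(nmf[bow_corpus[0]])   # topic distribution of the first document
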
Example #24
    def __init__(
        self,
        corpus=None,
        num_topics=100,
        id2word=None,
        chunksize=2000,
        passes=1,
        lambda_=1.0,
        kappa=1.0,
        minimum_probability=0.01,
        use_r=False,
        w_max_iter=200,
        w_stop_condition=1e-4,
        h_r_max_iter=50,
        h_r_stop_condition=1e-3,
        eval_every=10,
        v_max=None,
        normalize=True,
        sparse_coef=3,
        random_state=None,
    ):
        """

        Parameters
        ----------
        corpus : iterable of list of (int, float), optional
            Training corpus. If not given, model is left untrained.
        num_topics : int, optional
            Number of topics to extract.
        id2word: gensim.corpora.Dictionary, optional
            Mapping from token id to token. If not set, words get replaced with word ids.
        chunksize: int, optional
            Number of documents to be used in each training chunk.
        passes: int, optional
            Number of full passes over the training corpus.
        lambda_ : float, optional
            Residuals regularizer coefficient. Increasing it helps prevent overfitting. Has no effect if `use_r` is set
            to False.
        kappa : float, optional
            Optimizer step coefficient. Increasing it makes the model train faster, but adds a risk that it won't converge.
        w_max_iter: int, optional
            Maximum number of iterations to train W matrix per each batch.
        w_stop_condition: float, optional
            If error difference gets less than that, training of matrix ``W`` stops for current batch.
        h_r_max_iter: int, optional
            Maximum number of iterations to train h and r matrices per each batch.
        h_r_stop_condition: float
            If error difference gets less than that, training of matrices ``h`` and ``r`` stops for current batch.
        eval_every: int, optional
            Number of batches after which model will be evaluated.
        v_max: int, optional
            Maximum number of word occurrences in the corpora. Inferred if not set. Rarely needs to be set explicitly.
        normalize: bool, optional
            Whether to normalize results. Offers "kind-of-probabilistic" result.
        sparse_coef: float, optional
            The larger it is, the sparser the matrices. Significantly increases performance.
        random_state: {np.random.RandomState, int}, optional
            Seed for random generator. Useful for reproducibility.

        """
        self._w_error = None
        self.num_tokens = None
        self.num_topics = num_topics
        self.id2word = id2word
        self.chunksize = chunksize
        self.passes = passes
        self._lambda_ = lambda_
        self._kappa = kappa
        self.minimum_probability = minimum_probability
        self.use_r = use_r
        self._w_max_iter = w_max_iter
        self._w_stop_condition = w_stop_condition
        self._h_r_max_iter = h_r_max_iter
        self._h_r_stop_condition = h_r_stop_condition
        self.v_max = v_max
        self.eval_every = eval_every
        self.normalize = normalize
        self.sparse_coef = sparse_coef
        self.random_state = utils.get_random_state(random_state)

        if self.id2word is None:
            self.id2word = utils.dict_from_corpus(corpus)

        self.num_tokens = len(self.id2word)

        self.A = None
        self.B = None

        self._W = None
        self.w_std = None

        self._h = None
        self._r = None

        if corpus is not None:
            self.update(corpus)
Example #25
    def __init__(self,
                 corpus=None,
                 num_topics=100,
                 id2word=None,
                 estep_convergence=0.001,
                 em_convergence=0.0001,
                 em_max_iterations=50):
        """
        If given, start training from the iterable `corpus` straight away.
        If not given, the model is left untrained (presumably because you
        want to call `update()` manually).

        `num_topics` is the number of requested latent topics to be extracted
        from the training corpus.

        `id2word` is a mapping from word ids (integers) to words (strings).
        It is used to determine the vocabulary size, as well as for debugging
        and topic printing.

        The variational EM runs until the relative change in the likelihood
        bound is less than `em_convergence`.

        In each EM iteration, the E-step runs until the relative change in
        the likelihood bound is less than `estep_convergence`.

        """

        # store user-supplied parameters
        self.id2word = id2word
        self.estep_convergence = estep_convergence  # relative change we need to achieve in E-step
        self.em_convergence = em_convergence  # relative change we need to achieve in Expectation-Maximization
        self.em_max_iterations = em_max_iterations

        if corpus is None and self.id2word is None:
            raise ValueError(
                'at least one of corpus/id2word must be specified, to establish input space dimensionality'
            )

        if self.id2word is None:
            logger.warning(
                "no word id mapping provided; initializing from corpus, assuming identity"
            )
            self.id2word = utils.dict_from_corpus(corpus)
            self.num_terms = len(self.id2word)
        elif len(self.id2word) > 0:
            self.num_terms = 1 + max(self.id2word.keys())
        else:
            self.num_terms = 0

        if self.num_terms == 0:
            raise ValueError(
                "cannot compute CTL over an empty collection (no terms)")

        self.num_topics = int(num_topics)

        # initialize a model with zero-mean, diagonal covariance gaussian and
        # random topics seeded from the corpus
        self.mu = numpy.zeros(self.num_topics)
        self.sigma = numpy.diagflat([1.0] * self.num_topics)
        self.sigma_inverse = inv(self.sigma)
        self.beta = numpy.random.uniform(0, 1,
                                         (self.num_topics, self.num_terms))

        # variational parameters
        self.lamda = numpy.zeros(self.num_topics)
        self.nu2 = numpy.ones(self.num_topics)  # nu^2
        self.phi = 1 / float(self.num_topics) * numpy.ones(
            [self.num_terms, self.num_topics])
        self.optimize_zeta()

        # in order to get the topics graph, we need to store the
        # optimized lamda for each document
        self.observed_lamda = numpy.zeros([len(corpus)])

        # if a training corpus was provided, start estimating the model right away
        if corpus is not None:
            self.expectation_maximization(corpus)
Example #26
    def __init__(self, dtm_path, corpus=None, time_slices=None, mode='fit', model='dtm', num_topics=100,
                 id2word=None, prefix=None, lda_sequence_min_iter=6, lda_sequence_max_iter=20, lda_max_em_iter=10,
                 alpha=0.01, top_chain_var=0.005, rng_seed=0, initialize_lda=True):
        """
        `dtm_path` is path to the dtm executable, e.g. `C:/dtm/dtm-win64.exe`.

        `corpus` is a gensim corpus, aka a stream of sparse document vectors.

        `id2word` is a mapping between token ids and tokens.

        `mode` controls the mode of the model: 'fit' is for training, 'time' for
        analyzing documents through time according to a DTM, basically a held out set.

        `model` controls the choice of model. 'fixed' is for DIM and 'dtm' for DTM.

        `lda_sequence_min_iter` min iteration of LDA.

        `lda_sequence_max_iter` max iteration of LDA.

        `lda_max_em_iter` max EM optimization iterations in LDA.

        `alpha` is a hyperparameter that affects sparsity of the document-topics for the LDA models in each timeslice.

        `top_chain_var` is a hyperparameter that affects the variance of the topic evolution chain across time slices.

        `rng_seed` is the random seed.

        `initialize_lda` initialize DTM with LDA.

        """
        if not os.path.isfile(dtm_path):
            raise ValueError("dtm_path must point to the binary file, not to a folder")

        self.dtm_path = dtm_path
        self.id2word = id2word
        if self.id2word is None:
            logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
            self.id2word = utils.dict_from_corpus(corpus)
            self.num_terms = len(self.id2word)
        else:
            self.num_terms = 0 if not self.id2word else 1 + max(self.id2word.keys())
        if self.num_terms == 0:
            raise ValueError("cannot compute DTM over an empty collection (no terms)")
        self.num_topics = num_topics

        try:
            lencorpus = len(corpus)
        except TypeError:
            logger.warning("input corpus stream has no len(); counting documents")
            lencorpus = sum(1 for _ in corpus)
        if lencorpus == 0:
            raise ValueError("cannot compute DTM over an empty corpus")
        if model == "fixed" and any(not text for text in corpus):
            raise ValueError("""There is a text without words in the input corpus.
                    This breaks method='fixed' (The DIM model).""")
        if lencorpus != sum(time_slices):
            raise ValueError(
                "mismatched timeslices %{slices} for corpus of len {clen}"
                .format(slices=sum(time_slices), clen=lencorpus)
            )
        self.lencorpus = lencorpus
        if prefix is None:
            rand_prefix = hex(random.randint(0, 0xffffff))[2:] + '_'
            prefix = os.path.join(tempfile.gettempdir(), rand_prefix)

        self.prefix = prefix
        self.time_slices = time_slices
        self.lda_sequence_min_iter = int(lda_sequence_min_iter)
        self.lda_sequence_max_iter = int(lda_sequence_max_iter)
        self.lda_max_em_iter = int(lda_max_em_iter)
        self.alpha = alpha
        self.top_chain_var = top_chain_var
        self.rng_seed = rng_seed
        self.initialize_lda = str(initialize_lda).lower()

        self.lambda_ = None
        self.obs_ = None
        self.lhood_ = None
        self.gamma_ = None
        self.init_alpha = None
        self.init_beta = None
        self.init_ss = None
        self.em_steps = []
        self.influences_time = []

        if corpus is not None:
            self.train(corpus, time_slices, mode, model)
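
This is the wrapper around Blei's DTM/DIM binary from pre-4.0 gensim (`gensim.models.wrappers.DtmModel`). Assuming that release and a compiled `dtm` executable, a minimal sketch; the binary path, corpus and time slices are illustrative, and `sum(time_slices)` must equal the number of documents, as the constructor checks above:

    from gensim.corpora import Dictionary
    from gensim.models.wrappers import DtmModel   # gensim < 4.0

    texts = [["human", "interface", "computer"], ["survey", "user", "computer"],
             ["graph", "trees", "minors"], ["graph", "flow", "network"]]
    dictionary = Dictionary(texts)
    bow_corpus = [dictionary.doc2bow(text) for text in texts]

    # two time slices of two documents each
    model = DtmModel('/path/to/dtm/main', corpus=bow_corpus, time_slices=[2, 2],
                     num_topics=2, id2word=dictionary, initialize_lda=True)
    print(model.show_topic(topicid=0, time=0, topn=3))
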
Exemple #27
0
    def __init__(self,
                 corpus=None,
                 num_topics=100,
                 id2word=None,
                 author2doc=None,
                 doc2author=None,
                 chunksize=2000,
                 passes=1,
                 iterations=50,
                 decay=0.5,
                 offset=1.0,
                 alpha='symmetric',
                 eta='symmetric',
                 update_every=1,
                 eval_every=10,
                 gamma_threshold=0.001,
                 serialized=False,
                 serialization_path=None,
                 minimum_probability=0.01,
                 random_state=None):
        """
        If the iterable corpus and one of author2doc/doc2author dictionaries are given,
        start training straight away. If not given, the model is left untrained
        (presumably because you want to call the `update` method manually).

        `num_topics` is the number of requested latent topics to be extracted from
        the training corpus.

        `id2word` is a mapping from word ids (integers) to words (strings). It is
        used to determine the vocabulary size, as well as for debugging and topic
        printing.

        `author2doc` is a dictionary where the keys are the names of authors, and the
        values are lists of documents that the author contributes to.

        `doc2author` is a dictionary where the keys are document IDs (indexes to corpus)
        and the values are lists of author names. I.e. this is the reverse mapping of
        `author2doc`. Only one of the two, `author2doc` and `doc2author` have to be
        supplied.

        `passes` is the number of times the model makes a pass over the entire training
        data.

        `iterations` is the maximum number of times the model loops over each document
        (M-step). The iterations stop when convergence is reached.

        `chunksize` controls the size of the mini-batches.

        `alpha` and `eta` are hyperparameters that affect sparsity of the author-topic
        (theta) and topic-word (lambda) distributions. Both default to a symmetric
        1.0/num_topics prior.

        `alpha` can be set to an explicit array = prior of your choice. It also
        supports special values of 'asymmetric' and 'auto': the former uses a fixed
        normalized asymmetric 1.0/topicno prior, the latter learns an asymmetric
        prior directly from your data.

        `eta` can be a scalar for a symmetric prior over topic/word
        distributions, or a vector of shape num_words, which can be used to
        impose (user defined) asymmetric priors over the word distribution.
        It also supports the special value 'auto', which learns an asymmetric
        prior over words directly from your data. `eta` can also be a matrix
        of shape num_topics x num_words, which can be used to impose
        asymmetric priors over the word distribution on a per-topic basis
        (can not be learned from data).

        Calculate and log perplexity estimate from the latest mini-batch every
        `eval_every` model updates. Set to None to disable perplexity estimation.

        `decay` and `offset` parameters are the same as Kappa and Tau_0 in
        Hoffman et al, respectively. `decay` controls how quickly old documents are
        forgotten, while `offset` down-weights early iterations.

        `minimum_probability` controls filtering the topics returned for a document (bow).

        `random_state` can be an integer or a numpy.random.RandomState object. Set the
        state of the random number generator inside the author-topic model, to ensure
        reproducibility of your experiments, for example.

        `serialized` indicates whether the input corpora to the model are simple
        in-memory lists (`serialized = False`) or saved to the hard-drive
        (`serialized = True`). Note that this behaviour is quite different from
        other Gensim models. If your data is too large to fit in to memory, use
        this functionality. Note that calling `AuthorTopicModel.update` with new
        data may be cumbersome as it requires all the existing data to be
        re-serialized.

        `serialization_path` must be set to a filepath if `serialized = True` is
        used. Use, for example, `serialization_path = '/tmp/serialized_model.mm'`, or use your
        working directory by setting `serialization_path = 'serialized_model.mm'`. An existing
        file *cannot* be overwritten; either delete the old file or choose a different
        name.

        Example:

        >>> model = AuthorTopicModel(corpus, num_topics=100, author2doc=author2doc, id2word=id2word)  # train model
        >>> model.update(corpus2)  # update the author-topic model with additional documents

        >>> model = AuthorTopicModel(corpus, num_topics=50, author2doc=author2doc, id2word=id2word, alpha='auto', eval_every=5)  # train asymmetric alpha from data

        """

        # NOTE: as distributed version of this model is not implemented, "distributed" is set to false. Some of the
        # infrastructure to implement a distributed author-topic model is already in place, such as the AuthorTopicState.
        distributed = False
        self.dispatcher = None
        self.numworkers = 1

        self.id2word = id2word
        if corpus is None and self.id2word is None:
            raise ValueError(
                'at least one of corpus/id2word must be specified, to establish input space dimensionality'
            )

        if self.id2word is None:
            logger.warning(
                "no word id mapping provided; initializing from corpus, assuming identity"
            )
            self.id2word = utils.dict_from_corpus(corpus)
            self.num_terms = len(self.id2word)
        elif len(self.id2word) > 0:
            self.num_terms = 1 + max(self.id2word.keys())
        else:
            self.num_terms = 0

        if self.num_terms == 0:
            raise ValueError(
                "cannot compute the author-topic model over an empty collection (no terms)"
            )

        logger.info('Vocabulary consists of %d words.', self.num_terms)

        self.author2doc = {}
        self.doc2author = {}

        self.distributed = distributed
        self.num_topics = num_topics
        self.num_authors = 0
        self.chunksize = chunksize
        self.decay = decay
        self.offset = offset
        self.minimum_probability = minimum_probability
        self.num_updates = 0
        self.total_docs = 0

        self.passes = passes
        self.update_every = update_every
        self.eval_every = eval_every

        self.author2id = {}
        self.id2author = {}

        self.serialized = serialized
        if serialized and not serialization_path:
            raise ValueError(
                "If serialized corpora are used, the path where the corpus should be saved must be provided (serialization_path)."
            )
        if serialized and serialization_path:
            assert not isfile(
                serialization_path
            ), "A file already exists at the serialization_path path; choose a different serialization_path, or delete the file."
        self.serialization_path = serialization_path

        # Initialize an empty self.corpus.
        self.init_empty_corpus()

        self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha')

        assert self.alpha.shape == (
            self.num_topics,
        ), "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(
            self.alpha.shape), self.num_topics)

        if isinstance(eta, six.string_types):
            if eta == 'asymmetric':
                raise ValueError(
                    "The 'asymmetric' option cannot be used for eta")

        self.eta, self.optimize_eta = self.init_dir_prior(eta, 'eta')

        self.random_state = utils.get_random_state(random_state)

        assert (
            self.eta.shape == (self.num_terms, )
            or self.eta.shape == (self.num_topics, self.num_terms)
        ), ("Invalid eta shape. Got shape %s, but expected (%d, ) or (%d, %d)"
            % (str(self.eta.shape), self.num_terms, self.num_topics,
               self.num_terms))

        # VB constants
        self.iterations = iterations
        self.gamma_threshold = gamma_threshold

        # Initialize the variational distributions q(beta|lambda) and q(theta|gamma)
        self.state = AuthorTopicState(self.eta,
                                      (self.num_topics, self.num_terms),
                                      (self.num_authors, self.num_topics))
        self.state.sstats = self.random_state.gamma(
            100., 1. / 100., (self.num_topics, self.num_terms))
        self.expElogbeta = np.exp(dirichlet_expectation(self.state.sstats))

        # if a training corpus was provided, start estimating the model right away
        if corpus is not None and (author2doc is not None
                                   or doc2author is not None):
            use_numpy = self.dispatcher is not None
            self.update(corpus,
                        author2doc,
                        doc2author,
                        chunks_as_numpy=use_numpy)
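
A short sketch of driving the constructor above (assuming it is gensim's `AuthorTopicModel`); the corpus and author names are made up:

from gensim.corpora import Dictionary
from gensim.models import AuthorTopicModel

docs = [["topic", "model", "inference"], ["neural", "network", "training"],
        ["topic", "coherence", "evaluation"]]
dictionary = Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]

# Keys are author names, values are indices into `corpus` (the author2doc form).
author2doc = {"alice": [0, 2], "bob": [1]}

model = AuthorTopicModel(corpus=corpus, num_topics=2, id2word=dictionary,
                         author2doc=author2doc, passes=10)
print(model.get_author_topics("alice"))  # topic distribution for one author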
Exemple #28
0
    def __init__(
        self, dtm_path, corpus=None, time_slices=None, num_topics=100, id2word=None, prefix=None,
            lda_sequence_min_iter=6, lda_sequence_max_iter=20, lda_max_em_iter=10, alpha=0.01, top_chain_var=0.005, rng_seed=0, initialize_lda=False):
        """
        `dtm_path` is path to the dtm executable, e.g. `C:/dtm/dtm-win64.exe`.

        `corpus` is a gensim corpus, aka a stream of sparse document vectors.

        `id2word` is a mapping between token ids and tokens.

        `lda_sequence_min_iter` is the minimum number of LDA sequence iterations.

        `lda_sequence_max_iter` is the maximum number of LDA sequence iterations.

        `lda_max_em_iter` is the maximum number of EM optimization iterations in LDA.

        `alpha` is a hyperparameter that affects sparsity of the document-topics for the LDA models in each timeslice.

        `top_chain_var` is a hyperparameter that affects the variance of topics over time (the topic chain variance).

        `rng_seed` is the random seed.

        `initialize_lda` controls whether to initialize the DTM with LDA.

        """
        self.dtm_path = dtm_path
        self.id2word = id2word
        if self.id2word is None:
            logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
            self.id2word = utils.dict_from_corpus(corpus)
            self.num_terms = len(self.id2word)
        else:
            self.num_terms = 0 if not self.id2word else 1 + max(self.id2word.keys())
        if self.num_terms == 0:
            raise ValueError("cannot compute DTM over an empty collection (no terms)")
        self.num_topics = num_topics

        try:
            lencorpus = len(corpus)
        except TypeError:
            logger.warning("input corpus stream has no len(); counting documents")
            lencorpus = sum(1 for _ in corpus)
        if lencorpus == 0:
            raise ValueError("cannot compute DTM over an empty corpus")
        if lencorpus != sum(time_slices):
            raise ValueError("mismatched timeslices %{slices} for corpus of len {clen}".format(
                slices=sum(time_slices), clen=lencorpus))
        self.lencorpus = lencorpus
        if prefix is None:
            rand_prefix = hex(random.randint(0, 0xffffff))[2:] + '_'
            prefix = os.path.join(tempfile.gettempdir(), rand_prefix)

        self.prefix = prefix
        self.time_slices = time_slices
        self.lda_sequence_min_iter = int(lda_sequence_min_iter)
        self.lda_sequence_max_iter = int(lda_sequence_max_iter)
        self.lda_max_em_iter = int(lda_max_em_iter)
        self.alpha = alpha
        self.top_chain_var = top_chain_var
        self.rng_seed = rng_seed
        self.initialize_lda = str(initialize_lda).lower()

        self.lambda_ = None
        self.obs_ = None
        self.lhood_ = None
        self.gamma_ = None
        self.init_alpha = None
        self.init_beta = None
        self.init_ss = None
        self.em_steps = []
        self.influences_time = []

        if corpus is not None:
            self.train(corpus, time_slices)
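
Both DTM wrappers require `time_slices` whose sum equals the corpus length. A small helper along these lines (our own sketch, not part of the wrapper) can derive the slices from per-document timestamps, assuming the corpus is already sorted chronologically:

from collections import Counter

def time_slices_from_years(years):
    """Count documents per year; `years` must be sorted chronologically.

    Returns e.g. [num_docs_year1, num_docs_year2, ...], whose sum equals
    len(years), as the DTM wrappers above require.
    """
    counts = Counter(years)
    return [counts[year] for year in sorted(counts)]

print(time_slices_from_years([2019, 2019, 2019, 2020]))  # -> [3, 1]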
Exemple #29
0
    def __init__(self,
                 corpus=None,
                 num_topics=100,
                 id2word=None,
                 distributed=False,
                 chunksize=2000,
                 passes=1,
                 update_every=1,
                 alpha='symmetric',
                 eta=None,
                 decay=0.5,
                 offset=1.0,
                 eval_every=10,
                 iterations=50,
                 gamma_threshold=0.001,
                 minimum_probability=0.01,
                 defined_kws={},
                 tfMod=None):
        """
        If given, start training from the iterable `corpus` straight away. If not given,
        the model is left untrained (presumably because you want to call `update()` manually).

        `num_topics` is the number of requested latent topics to be extracted from
        the training corpus.

        `id2word` is a mapping from word ids (integers) to words (strings). It is
        used to determine the vocabulary size, as well as for debugging and topic
        printing.

        `alpha` and `eta` are hyperparameters that affect sparsity of the document-topic
        (theta) and topic-word (lambda) distributions. Both default to a symmetric
        1.0/num_topics prior.

        `alpha` can be set to an explicit array = prior of your choice. It also
        supports special values of 'asymmetric' and 'auto': the former uses a fixed
        normalized asymmetric 1.0/topicno prior, the latter learns an asymmetric
        prior directly from your data.

        `eta` can be a scalar for a symmetric prior over topic/word
        distributions, or a matrix of shape num_topics x num_words, which can
        be used to impose asymmetric priors over the word distribution on a
        per-topic basis. This may be useful if you want to seed certain topics
        with particular words by boosting the priors for those words.  It also
        supports the special value 'auto', which learns an asymmetric prior
        directly from your data.

        Turn on `distributed` to force distributed computing (see the `web tutorial <http://radimrehurek.com/gensim/distributed.html>`_
        on how to set up a cluster of machines for gensim).

        Calculate and log perplexity estimate from the latest mini-batch every
        `eval_every` model updates (setting this to 1 slows down training ~2x;
        default is 10 for better performance). Set to None to disable perplexity estimation.

        `decay` and `offset` parameters are the same as Kappa and Tau_0 in
        Hoffman et al, respectively.

        `minimum_probability` controls filtering the topics returned for a document (bow).

        Example:

        >>> lda = LdaModel(corpus, num_topics=100)  # train model
        >>> print(lda[doc_bow]) # get topic probability distribution for a document
        >>> lda.update(corpus2) # update the LDA model with additional documents
        >>> print(lda[doc_bow])

        >>> lda = LdaModel(corpus, num_topics=50, alpha='auto', eval_every=5)  # train asymmetric alpha from data

        """
        # store user-supplied parameters
        self.id2word = id2word
        if corpus is None and self.id2word is None:
            raise ValueError(
                'at least one of corpus/id2word must be specified, to establish input space dimensionality'
            )

        if self.id2word is None:
            logger.warning(
                "no word id mapping provided; initializing from corpus, assuming identity"
            )
            self.id2word = utils.dict_from_corpus(corpus)
            self.num_terms = len(self.id2word)
        elif len(self.id2word) > 0:
            self.num_terms = 1 + max(self.id2word.keys())
        else:
            self.num_terms = 0

        if self.num_terms == 0:
            raise ValueError(
                "cannot compute LDA over an empty collection (no terms)")

        self.distributed = bool(distributed)
        self.num_topics = int(num_topics)
        self.chunksize = chunksize
        self.decay = decay
        self.offset = offset
        self.minimum_probability = minimum_probability
        self.num_updates = 0

        self.passes = passes
        self.update_every = update_every
        self.eval_every = eval_every

        self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha')

        assert self.alpha.shape == (
            num_topics,
        ), "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(
            self.alpha.shape), num_topics)

        self.eta, self.optimize_eta = self.init_dir_prior(eta, 'eta')

        assert (
            self.eta.shape == (num_topics, 1)
            or self.eta.shape == (num_topics, self.num_terms)
        ), ("Invalid eta shape. Got shape %s, but expected (%d, 1) or (%d, %d)"
            % (str(self.eta.shape), num_topics, num_topics, self.num_terms))

        # VB constants
        self.iterations = iterations
        self.gamma_threshold = gamma_threshold

        # set up distributed environment if necessary
        if not distributed:
            logger.info("using serial LDA version on this node")
            self.dispatcher = None
            self.numworkers = 1
        else:
            if self.optimize_alpha:
                raise NotImplementedError(
                    "auto-optimizing alpha not implemented in distributed LDA")
            # set up distributed version
            try:
                import Pyro4
                dispatcher = Pyro4.Proxy('PYRONAME:gensim.lda_dispatcher')
                logger.debug("looking for dispatcher at %s" %
                             str(dispatcher._pyroUri))
                dispatcher.initialize(id2word=self.id2word,
                                      num_topics=num_topics,
                                      chunksize=chunksize,
                                      alpha=alpha,
                                      eta=eta,
                                      distributed=False)
                self.dispatcher = dispatcher
                self.numworkers = len(dispatcher.getworkers())
                logger.info("using distributed version with %i workers" %
                            self.numworkers)
            except Exception as err:
                logger.error("failed to initialize distributed LDA (%s)", err)
                raise RuntimeError(
                    "failed to initialize distributed LDA (%s)" % err)

        # Initialize the variational distribution q(beta|lambda)
        self.state = LdaState(self.eta, (self.num_topics, self.num_terms))
        self.state.sstats = numpy.random.gamma(
            100., 1. / 100., (self.num_topics, self.num_terms))

        #reassign word/topic for specific words
        self.word2id = utils.revdict(self.id2word)
        sstats = self.state.sstats
        self.defined_kws = defined_kws
        self.defined_wordids = {}
        for w, t in defined_kws.iteritems():
            if w in self.word2id:
                wid = self.word2id[w]
                self.defined_wordids[wid] = numpy.array(list(t))

        for wid, t in self.defined_wordids.iteritems():
            sstats[:, wid] = numpy.random.gamma(0.1, 0.05, (self.num_topics, ))

        for wid, topics in self.defined_wordids.iteritems():
            if tfMod is not None:
                score = self.num_topics * tfMod.idfs.get(wid, 1.0)
            else:
                score = self.num_topics
            if topics.shape[0] > 1:
                # score = self.num_topics / math.log(len(topics) + 1)
                score = 0.4 * self.num_topics
                # print 'score :{}'.format(score)
                # print self.num_topics
                # print topics

            # for t in topics:
            sstats[topics, wid] = score

        self.expElogbeta = numpy.exp(dirichlet_expectation(self.state.sstats))

        # if a training corpus was provided, start estimating the model right away
        if corpus is not None:
            use_numpy = self.dispatcher is not None
            self.update(corpus, chunks_as_numpy=use_numpy)
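
The customized constructor above seeds chosen words into chosen topics by overwriting rows of the sufficient statistics before training. The docstring's own suggestion of boosting an asymmetric `eta` prior achieves a similar effect with a stock `LdaModel`; a sketch with made-up seed words and values:

import numpy as np
from gensim.corpora import Dictionary
from gensim.models import LdaModel

docs = [["price", "market", "stock"], ["goal", "match", "team"],
        ["market", "trade"], ["team", "player"]]
dictionary = Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]

num_topics = 2
seed_words = {"market": 0, "team": 1}  # hypothetical word -> topic assignments

# Start from a small symmetric prior and boost the seeded (topic, word) cells.
eta = np.full((num_topics, len(dictionary)), 0.01)
for word, topic in seed_words.items():
    eta[topic, dictionary.token2id[word]] = 1.0

lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics,
               eta=eta, passes=10)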
Exemple #30
0
    def __init__(self, corpus=None, num_topics=200, id2word=None, chunksize=20000,
                 decay=1.0, distributed=False, onepass=True,
                 power_iters=P2_EXTRA_ITERS, extra_samples=P2_EXTRA_DIMS, dtype=np.float64):
        """Construct an `LsiModel` object.

        Either `corpus` or `id2word` must be supplied in order to train the model.

        Parameters
        ----------
        corpus : {iterable of list of (int, float), scipy.sparse.csc}, optional
            Stream of document vectors or a sparse matrix of shape (`num_terms`, `num_documents`).
        num_topics : int, optional
            Number of requested factors (latent dimensions).
        id2word : dict of {int: str}, optional
            ID to word mapping.
        chunksize : int, optional
            Number of documents to be used in each training chunk.
        decay : float, optional
            Weight of existing observations relative to new ones.
        distributed : bool, optional
            If True, distributed mode (parallel execution on several machines) will be used.
        onepass : bool, optional
            Whether the one-pass algorithm should be used for training.
            Pass `False` to force a multi-pass stochastic algorithm.
        power_iters : int, optional
            Number of power iteration steps to be used.
            Increasing the number of power iterations improves accuracy, but lowers performance.
        extra_samples : int, optional
            Extra samples to be used besides the rank `k`. Can improve accuracy.
        dtype : type, optional
            Enforces a type for elements of the decomposed matrix.

        """
        self.id2word = id2word
        self.num_topics = int(num_topics)
        self.chunksize = int(chunksize)
        self.decay = float(decay)
        if distributed:
            if not onepass:
                logger.warning("forcing the one-pass algorithm for distributed LSA")
                onepass = True
        self.onepass = onepass
        self.extra_samples, self.power_iters = extra_samples, power_iters
        self.dtype = dtype

        if corpus is None and self.id2word is None:
            raise ValueError(
                'at least one of corpus/id2word must be specified, to establish input space dimensionality'
            )

        if self.id2word is None:
            logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
            self.id2word = utils.dict_from_corpus(corpus)
            self.num_terms = len(self.id2word)
        else:
            self.num_terms = 1 + (max(self.id2word.keys()) if self.id2word else -1)

        self.docs_processed = 0
        self.projection = Projection(
            self.num_terms, self.num_topics, power_iters=self.power_iters, extra_dims=self.extra_samples, dtype=dtype
        )

        self.numworkers = 1
        if not distributed:
            logger.info("using serial LSI version on this node")
            self.dispatcher = None
        else:
            if not onepass:
                raise NotImplementedError(
                    "distributed stochastic LSA not implemented yet; "
                    "run either distributed one-pass, or serial randomized."
                )
            try:
                import Pyro4
                dispatcher = Pyro4.Proxy('PYRONAME:gensim.lsi_dispatcher')
                logger.debug("looking for dispatcher at %s", str(dispatcher._pyroUri))
                dispatcher.initialize(
                    id2word=self.id2word, num_topics=num_topics, chunksize=chunksize, decay=decay,
                    power_iters=self.power_iters, extra_samples=self.extra_samples, distributed=False, onepass=onepass
                )
                self.dispatcher = dispatcher
                self.numworkers = len(dispatcher.getworkers())
                logger.info("using distributed version with %i workers", self.numworkers)
            except Exception as err:
                # distributed version was specifically requested, so this is an error state
                logger.error("failed to initialize distributed LSI (%s)", err)
                raise RuntimeError("failed to initialize distributed LSI (%s)" % err)

        if corpus is not None:
            self.add_documents(corpus)
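
A minimal end-to-end sketch for the constructor above (assuming it is gensim's `LsiModel`), with an optional tf-idf weighting step and toy data:

from gensim.corpora import Dictionary
from gensim.models import LsiModel, TfidfModel

docs = [["human", "computer", "interface"], ["survey", "user", "computer"],
        ["graph", "trees"], ["graph", "minors", "survey"]]
dictionary = Dictionary(docs)
bow = [dictionary.doc2bow(doc) for doc in docs]

tfidf = TfidfModel(bow)                 # optional weighting step
lsi = LsiModel(tfidf[bow], id2word=dictionary, num_topics=2)

print(lsi.print_topics(num_topics=2))
print(lsi[tfidf[bow[0]]])               # project one document into LSI space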
Exemple #31
0
    def __init__(self, corpus=None, num_topics=100, id2word=None, distributed=False,
                 chunksize=2000, passes=1, update_every=1, alpha=None, eta=None, decay=0.5):
        """
        `num_topics` is the number of requested latent topics to be extracted from
        the training corpus.

        `id2word` is a mapping from word ids (integers) to words (strings). It is
        used to determine the vocabulary size, as well as for debugging and topic
        printing.

        `alpha` and `eta` are hyperparameters that affect sparsity of the document-topic
        (theta) and topic-word (lambda) distributions. Both default to a symmetric
        1.0/num_topics prior (but can be set to a vector, for asymmetric priors).

        Turn on `distributed` to force distributed computing (see the web tutorial
        on how to set up a cluster of machines for gensim).

        Example:

        >>> lda = LdaModel(corpus, num_topics=100)
        >>> print lda[doc_bow] # get topic probability distribution for a document
        >>> lda.update(corpus2) # update the LDA model with additional documents
        >>> print lda[doc_bow]

        """
        # store user-supplied parameters
        self.id2word = id2word
        if corpus is None and self.id2word is None:
            raise ValueError('at least one of corpus/id2word must be specified, to establish input space dimensionality')

        if self.id2word is None:
            logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
            self.id2word = utils.dict_from_corpus(corpus)
            self.num_terms = len(self.id2word)
        else:
            self.num_terms = 1 + max([-1] + self.id2word.keys())

        if self.num_terms == 0:
            raise ValueError("cannot compute LDA over an empty collection (no terms)")

        self.distributed = bool(distributed)
        self.num_topics = int(num_topics)
        self.chunksize = chunksize
        self.decay = decay
        self.num_updates = 0

        self.passes = passes
        self.update_every = update_every

        if alpha is None:
            self.alpha = 1.0 / num_topics
        else:
            self.alpha = alpha
        if eta is None:
            self.eta = 1.0 / num_topics
        else:
            self.eta = eta

        # VB constants
        self.VAR_MAXITER = 50
        self.VAR_THRESH = 0.001

        # set up distributed environment if necessary
        if not distributed:
            logger.info("using serial LDA version on this node")
            self.dispatcher = None
            self.numworkers = 1
        else:
            # set up distributed version
            try:
                import Pyro4
                dispatcher = Pyro4.Proxy('PYRONAME:gensim.lda_dispatcher')
                dispatcher._pyroOneway.add("exit")
                logger.debug("looking for dispatcher at %s" % str(dispatcher._pyroUri))
                dispatcher.initialize(id2word=self.id2word, num_topics=num_topics,
                                      chunksize=chunksize, alpha=alpha, eta=eta, distributed=False)
                self.dispatcher = dispatcher
                self.numworkers = len(dispatcher.getworkers())
                logger.info("using distributed version with %i workers" % self.numworkers)
            except Exception, err:
                logger.error("failed to initialize distributed LDA (%s)" % err)
                raise RuntimeError("failed to initialize distributed LDA (%s)" % err)
Exemple #32
0
    def __init__(self,
                 vw_path,
                 corpus=None,
                 num_topics=100,
                 id2word=None,
                 chunksize=256,
                 passes=1,
                 alpha=0.1,
                 eta=0.1,
                 decay=0.5,
                 offset=1,
                 gamma_threshold=0.001,
                 random_seed=None,
                 cleanup_files=True,
                 tmp_prefix='tmp'):
        """`vw_path` is the path to Vowpal Wabbit's 'vw' executable.

        `corpus` is an iterable training corpus. If given, training will
        start immediately, otherwise the model is left untrained (presumably
        because you want to call `update()` manually).

        `num_topics` is the number of requested latent topics to be extracted
        from the training corpus.
        Corresponds to VW's '--lda <num_topics>' argument.

        `id2word` is a mapping from word ids (integers) to words (strings).
        It is used to determine the vocabulary size, as well as for debugging
        and topic printing.

        `chunksize` is the number of documents examined in each batch.
        Corresponds to VW's '--minibatch <batch_size>' argument.

        `passes` is the number of passes over the dataset to use.
        Corresponds to VW's '--passes <passes>' argument.

        `alpha` is a float affecting the sparsity of per-document topic weights.
        This is applied symmetrically, and should be set higher when
        documents are thought to look more similar.
        Corresponds to VW's '--lda_alpha <alpha>' argument.

        `eta` is a float which affects the sparsity of topic distributions.
        This is applied symmetrically, and should be set higher when topics
        are thought to look more similar.
        Corresponds to VW's '--lda_rho <rho>' argument.

        `decay` learning rate decay, affects how quickly learnt values
        are forgotten. Should be set to a value between 0.5 and 1.0 to
        guarantee convergence.
        Corresponds to VW's '--power_t <tau>' argument.

        `offset` integer learning offset, set to higher values to slow down
        learning on early iterations of the algorithm.
        Corresponds to VW's '--initial_t <tau>' argument.

        `gamma_threshold` affects when learning loop will be broken out of,
        higher values will result in earlier loop completion.
        Corresponds to VW's '--epsilon <eps>' argument.

        `random_seed` sets Vowpal Wabbit's random seed when learning.
        Corresponds to VW's '--random_seed <seed>' argument.

        `cleanup_files` whether or not to delete temporary directory and files
        used by this wrapper. Setting to False can be useful for debugging,
        or for re-using Vowpal Wabbit files elsewhere.

        `tmp_prefix` used to prefix temporary working directory name.
        """
        # default parameters are taken from Vowpal Wabbit's defaults, and
        # parameter names changed to match Gensim's LdaModel where possible
        self.vw_path = vw_path
        self.id2word = id2word

        if self.id2word is None:
            if corpus is None:
                raise ValueError(
                    "at least one of corpus/id2word must be specified, to establish input space dimensionality"
                )
            logger.warning(
                "no word id mapping provided; initializing from corpus, assuming identity"
            )
            self.id2word = utils.dict_from_corpus(corpus)
            self.num_terms = len(self.id2word)
        elif len(self.id2word) > 0:
            self.num_terms = 1 + max(self.id2word.keys())
        else:
            self.num_terms = 0

        if self.num_terms == 0:
            raise ValueError(
                "cannot compute LDA over an empty collection (no terms)")

        # LDA parameters
        self.num_topics = num_topics
        self.chunksize = chunksize
        self.passes = passes
        self.alpha = alpha
        self.eta = eta
        self.gamma_threshold = gamma_threshold
        self.offset = offset
        self.decay = decay
        self.random_seed = random_seed
        self._initial_offset = offset

        # temporary files used for Vowpal Wabbit input/output
        self.tmp_dir = None
        self.tmp_prefix = tmp_prefix
        self.cleanup_files = cleanup_files
        self._init_temp_dir(tmp_prefix)

        # used for saving/loading this model's state
        self._model_data = None
        self._topics_data = None

        # cache loaded topics as numpy array
        self._topics = None

        if corpus is not None:
            self.train(corpus)
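
A hedged usage sketch for the wrapper above (assuming it is gensim's `LdaVowpalWabbit`); it needs a local `vw` binary, so the training calls are left commented out:

from gensim.corpora import Dictionary

docs = [["cat", "dog", "pet"], ["stock", "market", "price"], ["dog", "leash"]]
dictionary = Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]

# model = LdaVowpalWabbit("/usr/local/bin/vw", corpus=corpus, num_topics=2,
#                         id2word=dictionary, passes=5, cleanup_files=True)
# print(model.print_topics())
# print(model[corpus[0]])  # topic distribution for one document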
Exemple #33
0
    def __init__(self, corpus=None, num_topics=100, id2word=None,
                 distributed=False, chunksize=2000, passes=1, update_every=1,
                 alpha='symmetric', eta=None, decay=0.5, offset=1.0,
                 eval_every=10, iterations=50, gamma_threshold=0.001,
                 minimum_probability=0.01, random_state=None, ns_conf={},
                 minimum_phi_value=0.01, per_word_topics=False):
        """
        If given, start training from the iterable `corpus` straight away. If not given,
        the model is left untrained (presumably because you want to call `update()` manually).

        `num_topics` is the number of requested latent topics to be extracted from
        the training corpus.

        `id2word` is a mapping from word ids (integers) to words (strings). It is
        used to determine the vocabulary size, as well as for debugging and topic
        printing.

        `alpha` and `eta` are hyperparameters that affect sparsity of the document-topic
        (theta) and topic-word (lambda) distributions. Both default to a symmetric
        1.0/num_topics prior.

        `alpha` can be set to an explicit array = prior of your choice. It also
        supports special values of 'asymmetric' and 'auto': the former uses a fixed
        normalized asymmetric 1.0/topicno prior, the latter learns an asymmetric
        prior directly from your data.

        `eta` can be a scalar for a symmetric prior over topic/word
        distributions, or a vector of shape num_words, which can be used to
        impose (user defined) asymmetric priors over the word distribution.
        It also supports the special value 'auto', which learns an asymmetric
        prior over words directly from your data. `eta` can also be a matrix
        of shape num_topics x num_words, which can be used to impose
        asymmetric priors over the word distribution on a per-topic basis
        (can not be learned from data).

        Turn on `distributed` to force distributed computing (see the `web tutorial <http://radimrehurek.com/gensim/distributed.html>`_
        on how to set up a cluster of machines for gensim).

        Calculate and log perplexity estimate from the latest mini-batch every
        `eval_every` model updates (setting this to 1 slows down training ~2x;
        default is 10 for better performance). Set to None to disable perplexity estimation.

        `decay` and `offset` parameters are the same as Kappa and Tau_0 in
        Hoffman et al, respectively.

        `minimum_probability` controls filtering the topics returned for a document (bow).

        `random_state` can be a np.random.RandomState object or the seed for one.

        Example:

        >>> lda = LdaModel(corpus, num_topics=100)  # train model
        >>> print(lda[doc_bow]) # get topic probability distribution for a document
        >>> lda.update(corpus2) # update the LDA model with additional documents
        >>> print(lda[doc_bow])

        >>> lda = LdaModel(corpus, num_topics=50, alpha='auto', eval_every=5)  # train asymmetric alpha from data

        """

        # store user-supplied parameters
        self.id2word = id2word
        if corpus is None and self.id2word is None:
            raise ValueError('at least one of corpus/id2word must be specified, to establish input space dimensionality')

        if self.id2word is None:
            logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
            self.id2word = utils.dict_from_corpus(corpus)
            self.num_terms = len(self.id2word)
        elif len(self.id2word) > 0:
            self.num_terms = 1 + max(self.id2word.keys())
        else:
            self.num_terms = 0

        if self.num_terms == 0:
            raise ValueError("cannot compute LDA over an empty collection (no terms)")

        self.distributed = bool(distributed)
        self.num_topics = int(num_topics)
        self.chunksize = chunksize
        self.decay = decay
        self.offset = offset
        self.minimum_probability = minimum_probability
        self.num_updates = 0

        self.passes = passes
        self.update_every = update_every
        self.eval_every = eval_every
        self.minimum_phi_value = minimum_phi_value
        self.per_word_topics = per_word_topics

        self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha')

        assert self.alpha.shape == (self.num_topics,), "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics)

        if isinstance(eta, six.string_types):
            if eta == 'asymmetric':
                raise ValueError("The 'asymmetric' option cannot be used for eta")

        self.eta, self.optimize_eta = self.init_dir_prior(eta, 'eta')

        self.random_state = utils.get_random_state(random_state)

        assert self.eta.shape == (self.num_terms,) or self.eta.shape == (self.num_topics, self.num_terms), (
            "Invalid eta shape. Got shape %s, but expected (%d, ) or (%d, %d)" %
            (str(self.eta.shape), self.num_terms, self.num_topics, self.num_terms))

        # VB constants
        self.iterations = iterations
        self.gamma_threshold = gamma_threshold

        # set up distributed environment if necessary
        if not distributed:
            logger.info("using serial LDA version on this node")
            self.dispatcher = None
            self.numworkers = 1
        else:
            if self.optimize_alpha:
                raise NotImplementedError("auto-optimizing alpha not implemented in distributed LDA")
            # set up distributed version
            try:
                import Pyro4
                with utils.getNS(**ns_conf) as ns:
                    from gensim.models.lda_dispatcher import LDA_DISPATCHER_PREFIX
                    self.dispatcher = Pyro4.Proxy(ns.list(prefix=LDA_DISPATCHER_PREFIX)[LDA_DISPATCHER_PREFIX])
                    logger.debug("looking for dispatcher at %s" % str(self.dispatcher._pyroUri))
                    self.dispatcher.initialize(id2word=self.id2word, num_topics=self.num_topics,
                                               chunksize=chunksize, alpha=alpha, eta=eta, distributed=False)
                    self.numworkers = len(self.dispatcher.getworkers())
                    logger.info("using distributed version with %i workers" % self.numworkers)
            except Exception as err:
                logger.error("failed to initialize distributed LDA (%s)", err)
                raise RuntimeError("failed to initialize distributed LDA (%s)" % err)

        # Initialize the variational distribution q(beta|lambda)
        self.state = LdaState(self.eta, (self.num_topics, self.num_terms))
        self.state.sstats = self.random_state.gamma(100., 1. / 100., (self.num_topics, self.num_terms))
        self.expElogbeta = np.exp(dirichlet_expectation(self.state.sstats))

        # if a training corpus was provided, start estimating the model right away
        if corpus is not None:
            use_numpy = self.dispatcher is not None
            self.update(corpus, chunks_as_numpy=use_numpy)
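
The `minimum_probability`, `minimum_phi_value` and `per_word_topics` parameters above mostly affect how inferred topics are reported per document. A small sketch on toy data (thresholds are illustrative):

from gensim.corpora import Dictionary
from gensim.models import LdaModel

docs = [["bank", "loan", "money"], ["river", "bank", "water"], ["loan", "interest"]]
dictionary = Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]

lda = LdaModel(corpus, id2word=dictionary, num_topics=2, passes=10,
               minimum_probability=0.01, per_word_topics=True)

# Document-level topics plus per-word topic assignments and phi values.
doc_topics, word_topics, phi_values = lda.get_document_topics(
    corpus[0], per_word_topics=True, minimum_phi_value=0.01)
print(doc_topics)
print(word_topics)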
Exemple #34
0
    def __init__(self,
                 corpus=None,
                 time_slice=None,
                 id2word=None,
                 alphas=0.01,
                 num_topics=10,
                 initialize='gensim',
                 sstats=None,
                 lda_model=None,
                 obs_variance=0.5,
                 chain_variance=0.005,
                 passes=10,
                 random_state=None,
                 lda_inference_max_iter=25,
                 em_min_iter=6,
                 em_max_iter=20,
                 chunksize=100):
        """
        `corpus` is any iterable gensim corpus

        `time_slice` as described above is a list which contains the number of documents in each time-slice

        `id2word` is a mapping from word ids (integers) to words (strings). It is used to determine the vocabulary size and printing topics.

        `alphas` is a prior of your choice and should be a double or float value; the default is 0.01.

        `num_topics` is the number of requested latent topics to be extracted from the training corpus.

        `initialize` allows the user to decide how to initialise the DTM model. Default is through gensim LDA.
        You can also use the sstats of a previously trained LDA model by specifying 'own' and passing a np matrix through `sstats`.
        If you wish to just pass a previously trained LDA model, pass it through `lda_model`.
        The shape of `sstats` is (vocab_len, num_topics).

        `chain_variance` is a constant which dictates how the beta values evolve - it is a gaussian parameter defined in the
        beta distribution.

        `passes` is the number of passes of the initial LdaModel.

        `random_state` can be a np.random.RandomState object or the seed for one, for the LdaModel.
        """
        self.id2word = id2word
        if corpus is None and self.id2word is None:
            raise ValueError(
                'at least one of corpus/id2word must be specified, to establish input space dimensionality'
            )

        if self.id2word is None:
            logger.warning(
                "no word id mapping provided; initializing from corpus, assuming identity"
            )
            self.id2word = utils.dict_from_corpus(corpus)
            self.vocab_len = len(self.id2word)
        elif len(self.id2word) > 0:
            self.vocab_len = len(self.id2word)
        else:
            self.vocab_len = 0

        if corpus is not None:
            try:
                self.corpus_len = len(corpus)
            except TypeError:
                logger.warning(
                    "input corpus stream has no len(); counting documents")
                self.corpus_len = sum(1 for _ in corpus)

        self.time_slice = time_slice
        if self.time_slice is not None:
            self.num_time_slices = len(time_slice)

        max_doc_len = 0
        if corpus is not None:
            for line_no, line in enumerate(corpus):
                if len(line) > max_doc_len:
                    max_doc_len = len(line)
        self.max_doc_len = max_doc_len

        self.num_topics = num_topics
        self.num_time_slices = len(time_slice)
        self.alphas = np.full(num_topics, alphas)

        # topic_chains contains for each topic a 'state space language model' object which in turn has information about each topic
        # the sslm class is described below and contains information on topic-word probabilities and doc-topic probabilities.
        self.topic_chains = []
        for topic in range(0, num_topics):
            sslm_ = sslm(num_time_slices=self.num_time_slices,
                         vocab_len=self.vocab_len,
                         num_topics=self.num_topics,
                         chain_variance=chain_variance,
                         obs_variance=obs_variance)
            self.topic_chains.append(sslm_)

        # the following are class variables which are to be integrated during Document Influence Model
        self.top_doc_phis = None
        self.influence = None
        self.renormalized_influence = None
        self.influence_sum_lgl = None

        # if a corpus and time_slice is provided, depending on the user choice of initializing LDA, we start DTM.
        if corpus is not None and time_slice is not None:
            if initialize == 'gensim':
                lda_model = ldamodel.LdaModel(corpus,
                                              id2word=self.id2word,
                                              num_topics=self.num_topics,
                                              passes=passes,
                                              alpha=self.alphas,
                                              random_state=random_state)
                self.sstats = np.transpose(lda_model.state.sstats)
            if initialize == 'ldamodel':
                self.sstats = np.transpose(lda_model.state.sstats)
            if initialize == 'own':
                self.sstats = sstats

            # initialize model from sstats
            self.init_ldaseq_ss(chain_variance, obs_variance, self.alphas,
                                self.sstats)

            # fit DTM
            self.fit_lda_seq(corpus, lda_inference_max_iter, em_min_iter,
                             em_max_iter, chunksize)
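
A sketch of training this native DTM implementation (assuming it is gensim's `LdaSeqModel`) on toy data; the first two documents fall in slice one, the last two in slice two:

from gensim.corpora import Dictionary
from gensim.models import LdaSeqModel

docs = [["economy", "bank", "crisis"], ["bank", "loan"],
        ["election", "vote"], ["vote", "campaign", "election"]]
dictionary = Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]

ldaseq = LdaSeqModel(corpus=corpus, id2word=dictionary,
                     time_slice=[2, 2], num_topics=2, passes=5)

print(ldaseq.print_topics(time=0))  # topics in the first time slice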
Exemple #35
0
    def __init__(self,
                 corpus=None,
                 num_topics=100,
                 id2word=None,
                 distributed=False,
                 chunksize=2000,
                 passes=1,
                 update_every=1,
                 alpha='symmetric',
                 eta=None,
                 decay=0.5,
                 offset=1.0,
                 eval_every=10,
                 iterations=50,
                 gamma_threshold=0.001,
                 minimum_probability=0.01):
        """
        If given, start training from the iterable `corpus` straight away. If not given,
        the model is left untrained (presumably because you want to call `update()` manually).

        :param corpus: the training corpus, an iterable stream of sparse (BoW) document vectors.

        :param num_topics: is the number of requested latent topics to be extracted from
        the training corpus.

        :param id2word: is a mapping from word ids (integers) to words (strings). It is
        used to determine the vocabulary size, as well as for debugging and topic
        printing.

        :param eta and alpha: are hyperparameters that affect sparsity of the document-topic
        (theta) and topic-word (lambda) distributions. Both default to a symmetric
        1.0/num_topics prior.

        :param alpha: can be set to an explicit array = prior of your choice. It also supports
        special values of 'asymmetric' and 'auto': the former uses a fixed normalized asymmetric
        1.0/topicno prior, the latter learns an asymmetric prior directly from your data.

        :param eta : can be a scalar for a symmetric prior over topic/word distributions, or a
        matrix of shape num_topics * num_words, which can be used to impose asymmetric priors over
        the word distribution on a per-topic basis. This may be useful if you want to seed certain
        topics with particular words by boosting the priors for those words. It also supports
        the special value 'auto', which learns an asymmetric prior directly from your data.

        :param distributed: Turn on 'distributed' to force distributed computing
        (see the `web tutorial <http://radimrehurek.com/gensim/distributed.html>`_
        on how to set up a cluster of machines for gensim).

        :param eval_every: Calculate and log perplexity estimate from the latest mini-batch every
        `eval_every` model updates (setting this to 1 slows down training ~2x;
        default is 10 for better performance). Set to None to disable perplexity estimation.

        :param decay and offset: the `decay` and `offset` parameters are the same as Kappa and Tau_0 in
        Hoffman et al, respectively.

        :param minimum_probability: controls filtering the topics returned for a document (bow).

        Example:
        >>> lda = LdaModel(corpus, num_topics=100)  # train model
        >>> print(lda[doc_bow]) # get topic probability distribution for a document
        >>> lda.update(corpus2) # update the LDA model with additional documents
        >>> print(lda[doc_bow])
        >>> lda = LdaModel(corpus, num_topics=50, alpha='auto', eval_every=5)  # train asymmetric alpha from data
        """
        # store user-supplied parameters
        self.id2word = id2word
        if corpus is None and self.id2word is None:
            raise ValueError(
                'at least one of corpus/id2word must be specified, to establish input space dimensionality'
            )

        if self.id2word is None:
            logger.warning(
                "no word id mapping provided; initializing from corpus, assuming identity"
            )
            self.id2word = utils.dict_from_corpus(corpus)
            self.num_terms = len(self.id2word)
        elif len(self.id2word) > 0:
            self.num_terms = 1 + max(self.id2word.keys())
        else:
            self.num_terms = 0
        if self.num_terms == 0:
            raise ValueError(
                "cannot compute LDA over an empty collection (no terms)")

        self.distributed = bool(distributed)
        self.num_topics = int(num_topics)
        self.chunksize = chunksize
        # this variant has no distributed implementation, so always run serially
        self.dispatcher = None
        self.numworkers = 1
        self.decay = decay
        self.offset = offset
        self.minimum_probability = minimum_probability
        self.num_updates = 0

        self.iterations = iterations
        self.gamma_threshold = gamma_threshold

        self.passes = passes
        self.update_every = update_every
        self.eval_every = eval_every

        self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha')

        assert self.alpha.shape == (
            num_topics,
        ), "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(
            self.alpha.shape), num_topics)

        self.eta, self.optimize_eta = self.init_dir_prior(eta, 'eta')

        assert self.eta.shape == (num_topics, 1) or self.eta.shape == (
            num_topics, self.num_terms
        ), ("Invalid eta shape. Got shape %s, but expected (%d, 1) or (%d, %d)"
            % (str(self.eta.shape), num_topics, num_topics, self.num_terms))

        # Initialize the variational distribution q(beta|lambda)
        self.state = LdaState(self.eta, (self.num_topics, self.num_terms))
        self.state.sstats = numpy.random.gamma(
            100., 1. / 100., (self.num_topics, self.num_terms))
        self.expElogbeta = numpy.exp(dirichlet_expectation(self.state.sstats))

        # if a training corpus was provided, start estimating the model right away
        if corpus is not None:
            use_numpy = self.dispatcher is not None
            self.update(corpus, chunks_as_numpy=use_numpy)
Exemple #36
0
    def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None, doc2author=None,
                 chunksize=2000, passes=1, iterations=50, decay=0.5, offset=1.0,
                 alpha='symmetric', eta='symmetric', update_every=1, eval_every=10,
                 gamma_threshold=0.001, serialized=False, serialization_path=None,
                 minimum_probability=0.01, random_state=None):
        """
        If the iterable corpus and one of author2doc/doc2author dictionaries are given,
        start training straight away. If not given, the model is left untrained
        (presumably because you want to call the `update` method manually).

        `num_topics` is the number of requested latent topics to be extracted from
        the training corpus.

        `id2word` is a mapping from word ids (integers) to words (strings). It is
        used to determine the vocabulary size, as well as for debugging and topic
        printing.

        `author2doc` is a dictionary where the keys are the names of authors, and the
        values are lists of documents that the author contributes to.

        `doc2author` is a dictionary where the keys are document IDs (indexes to corpus)
        and the values are lists of author names. I.e. this is the reverse mapping of
        `author2doc`. Only one of the two, `author2doc` and `doc2author` have to be
        supplied.

        `passes` is the number of times the model makes a pass over the entire training
        data.

        `iterations` is the maximum number of times the model loops over each document
        (M-step). The iterations stop when convergence is reached.

        `chunksize` controls the size of the mini-batches.

        `alpha` and `eta` are hyperparameters that affect sparsity of the author-topic
        (theta) and topic-word (lambda) distributions. Both default to a symmetric
        1.0/num_topics prior.

        `alpha` can be set to an explicit array = prior of your choice. It also
        supports special values of 'asymmetric' and 'auto': the former uses a fixed
        normalized asymmetric 1.0/topicno prior, the latter learns an asymmetric
        prior directly from your data.

        `eta` can be a scalar for a symmetric prior over topic/word
        distributions, or a vector of shape num_words, which can be used to
        impose (user defined) asymmetric priors over the word distribution.
        It also supports the special value 'auto', which learns an asymmetric
        prior over words directly from your data. `eta` can also be a matrix
        of shape num_topics x num_words, which can be used to impose
        asymmetric priors over the word distribution on a per-topic basis
        (can not be learned from data).

        Calculate and log perplexity estimate from the latest mini-batch every
        `eval_every` model updates. Set to None to disable perplexity estimation.

        `decay` and `offset` parameters are the same as Kappa and Tau_0 in
        Hoffman et al, respectively. `decay` controls how quickly old documents are
        forgotten, while `offset` down-weights early iterations.

        `minimum_probability` controls filtering the topics returned for a document (bow).

        `random_state` can be an integer or a numpy.random.RandomState object. Set the
        state of the random number generator inside the author-topic model, to ensure
        reproducibility of your experiments, for example.

        `serialized` indicates whether the input corpora to the model are simple
        in-memory lists (`serialized = False`) or saved to the hard-drive
        (`serialized = True`). Note that this behaviour is quite different from
        other Gensim models. If your data is too large to fit in to memory, use
        this functionality. Note that calling `AuthorTopicModel.update` with new
        data may be cumbersome as it requires all the existing data to be
        re-serialized.

        `serialization_path` must be set to a filepath if `serialized = True` is
        used. Use, for example, `serialization_path = '/tmp/serialized_model.mm'`, or use your
        working directory by setting `serialization_path = 'serialized_model.mm'`. An existing
        file *cannot* be overwritten; either delete the old file or choose a different
        name.

        Example:

        >>> model = AuthorTopicModel(corpus, num_topics=100, author2doc=author2doc, id2word=id2word)  # train model
        >>> model.update(corpus2)  # update the author-topic model with additional documents

        >>> model = AuthorTopicModel(corpus, num_topics=50, author2doc=author2doc, id2word=id2word, alpha='auto', eval_every=5)  # train asymmetric alpha from data

        """

        # NOTE: as distributed version of this model is not implemented, "distributed" is set to false. Some of the
        # infrastructure to implement a distributed author-topic model is already in place, such as the AuthorTopicState.
        distributed = False
        self.dispatcher = None
        self.numworkers = 1

        self.id2word = id2word
        if corpus is None and self.id2word is None:
            raise ValueError(
                "at least one of corpus/id2word must be specified, to establish input space dimensionality"
            )

        if self.id2word is None:
            logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
            self.id2word = utils.dict_from_corpus(corpus)
            self.num_terms = len(self.id2word)
        elif len(self.id2word) > 0:
            self.num_terms = 1 + max(self.id2word.keys())
        else:
            self.num_terms = 0

        if self.num_terms == 0:
            raise ValueError("cannot compute the author-topic model over an empty collection (no terms)")

        logger.info('Vocabulary consists of %d words.', self.num_terms)

        self.author2doc = {}
        self.doc2author = {}

        self.distributed = distributed
        self.num_topics = num_topics
        self.num_authors = 0
        self.chunksize = chunksize
        self.decay = decay
        self.offset = offset
        self.minimum_probability = minimum_probability
        self.num_updates = 0
        self.total_docs = 0

        self.passes = passes
        self.update_every = update_every
        self.eval_every = eval_every

        self.author2id = {}
        self.id2author = {}

        self.serialized = serialized
        if serialized and not serialization_path:
            raise ValueError("If serialized corpora are used, a the path to a folder where the corpus should be saved must be provided (serialized_path).")
        if serialized and serialization_path:
            assert not isfile(serialization_path), \
                "A file already exists at the serialization_path path; " \
                "choose a different serialization_path, or delete the file."
        self.serialization_path = serialization_path

        # Initialize an empty self.corpus.
        self.init_empty_corpus()

        self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha')

        assert self.alpha.shape == (self.num_topics,), \
            "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics)

        if isinstance(eta, six.string_types):
            if eta == 'asymmetric':
                raise ValueError("The 'asymmetric' option cannot be used for eta")

        self.eta, self.optimize_eta = self.init_dir_prior(eta, 'eta')

        self.random_state = utils.get_random_state(random_state)

        assert (self.eta.shape == (self.num_terms,) or self.eta.shape == (self.num_topics, self.num_terms)), (
                "Invalid eta shape. Got shape %s, but expected (%d,) or (%d, %d)" %
                (str(self.eta.shape), self.num_terms, self.num_topics, self.num_terms)
        )

        # VB constants
        self.iterations = iterations
        self.gamma_threshold = gamma_threshold

        # Initialize the variational distributions q(beta|lambda) and q(theta|gamma)
        self.state = AuthorTopicState(self.eta, (self.num_topics, self.num_terms), (self.num_authors, self.num_topics))
        self.state.sstats = self.random_state.gamma(100., 1. / 100., (self.num_topics, self.num_terms))
        self.expElogbeta = np.exp(dirichlet_expectation(self.state.sstats))

        # if a training corpus was provided, start estimating the model right away
        if corpus is not None and (author2doc is not None or doc2author is not None):
            use_numpy = self.dispatcher is not None
            self.update(corpus, author2doc, doc2author, chunks_as_numpy=use_numpy)
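A minimal usage sketch of the serialized-corpus mode described in the docstring above, assuming this is gensim's AuthorTopicModel; the toy texts, the author2doc mapping and the output path are invented for illustration.

    from gensim.corpora import Dictionary
    from gensim.models import AuthorTopicModel

    # Toy data: two documents and a mapping from each author to the indices
    # of the documents they wrote.
    texts = [["topic", "model", "author"], ["graph", "minors", "survey"]]
    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    author2doc = {"alice": [0], "bob": [1]}

    # serialized=True keeps the corpus on disk; serialization_path must point
    # to a file that does not exist yet (the constructor checks this).
    model = AuthorTopicModel(
        corpus, num_topics=2, id2word=dictionary, author2doc=author2doc,
        serialized=True, serialization_path='/tmp/serialized_corpus.mm',
    )

    # Topic distribution for one author.
    print(model.get_author_topics('alice'))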
Exemple #37
0
    def save_corpus(fname, corpus, id2word=None, metadata=False):
        """Save a corpus in the Mallet format.

        Warnings
        --------
        This function is automatically called by :meth:`gensim.corpora.malletcorpus.MalletCorpus.serialize`,
        don't call it directly, call :meth:`gensim.corpora.malletcorpus.MalletCorpus.serialize` instead.

        Parameters
        ----------
        fname : str
            Path to output file.
        corpus : iterable of iterable of (int, int)
            Corpus in BoW format.
        id2word : {dict of (int, str), :class:`~gensim.corpora.dictionary.Dictionary`}, optional
            Mapping between word_ids (integers) and words (strings).
            If not provided, the mapping is constructed directly from `corpus`.
        metadata : bool, optional
            If True, each corpus item is expected to be a `(doc, (doc_id, doc_lang))` pair and the
            document id and language from that metadata are written out instead of the defaults.

        Returns
        -------
        list of int
            List of offsets in resulting file for each document (in bytes),
            can be used for :meth:`~gensim.corpora.malletcorpus.MalletCorpus.docbyoffset`.

        Notes
        -----
        The document id will be generated by enumerating the corpus.
        That is, it will range from 0 to the number of documents minus one.

        Since Mallet has a language field in the format, this defaults to the string '__unknown__'.
        If the language needs to be saved, post-processing will be required.

        """
        if id2word is None:
            logger.info(
                "no word id mapping provided; initializing from corpus")
            id2word = utils.dict_from_corpus(corpus)

        logger.info("storing corpus in Mallet format into %s", fname)

        truncated = 0
        offsets = []
        with utils.smart_open(fname, 'wb') as fout:
            for doc_id, doc in enumerate(corpus):
                if metadata:
                    doc_id, doc_lang = doc[1]
                    doc = doc[0]
                else:
                    doc_lang = '__unknown__'

                words = []
                for wordid, value in doc:
                    if abs(int(value) - value) > 1e-6:
                        truncated += 1
                    words.extend([utils.to_unicode(id2word[wordid])] *
                                 int(value))
                offsets.append(fout.tell())
                fout.write(
                    utils.to_utf8('%s %s %s\n' %
                                  (doc_id, doc_lang, ' '.join(words))))

        if truncated:
            logger.warning(
                "Mallet format can only save vectors with integer elements; "
                "%i float entries were truncated to integer value", truncated)

        return offsets
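A usage sketch for the snippet above: `save_corpus` is normally reached through `MalletCorpus.serialize`, roughly as below; the output path and toy texts are invented.

    from gensim.corpora import Dictionary
    from gensim.corpora.malletcorpus import MalletCorpus

    texts = [["human", "interface", "computer"], ["survey", "user", "system"]]
    dictionary = Dictionary(texts)
    bow_corpus = [dictionary.doc2bow(text) for text in texts]

    # serialize() calls save_corpus() internally and also writes an offset
    # index, so documents can later be retrieved by position.
    MalletCorpus.serialize('/tmp/corpus.mallet', bow_corpus, id2word=dictionary)

    # Stream the documents back from disk.
    mallet_corpus = MalletCorpus('/tmp/corpus.mallet')
    for doc in mallet_corpus:
        print(doc)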
Exemple #38
0
    def __init__(self, dtm_path, corpus=None, time_slices=None, mode='fit', model='dtm', num_topics=100,
                 id2word=None, prefix=None, lda_sequence_min_iter=6, lda_sequence_max_iter=20, lda_max_em_iter=10,
                 alpha=0.01, top_chain_var=0.005, rng_seed=0, initialize_lda=True):
        """

        Parameters
        ----------
        dtm_path : str
            Path to the dtm binary, e.g. `/home/username/dtm/dtm/main`.
        corpus : iterable of iterable of (int, int)
            Collection of texts in BoW format.
        time_slices : list of int
            Sequence of timestamps.
        mode : {'fit', 'time'}, optional
            Controls the mode of the model: 'fit' is for training, 'time' for analyzing documents through time
            according to a DTM, basically a held-out set.
        model : {'fixed', 'dtm'}, optional
            Controls which model will be run: 'fixed' is for DIM and 'dtm' for DTM.
        num_topics : int, optional
            Number of topics.
        id2word : :class:`~gensim.corpora.dictionary.Dictionary`, optional
            Mapping between tokens ids and words from corpus, if not specified - will be inferred from `corpus`.
        prefix : str, optional
            Prefix for produced temporary files.
        lda_sequence_min_iter : int, optional
             Min iteration of LDA.
        lda_sequence_max_iter : int, optional
            Max iteration of LDA.
        lda_max_em_iter : int, optional
             Max em optimization iterations in LDA.
        alpha : float, optional
            Hyperparameter that affects sparsity of the document-topics for the LDA models in each timeslice.
        top_chain_var : float, optional
            Hyperparameter that affects how much the topics can evolve between consecutive time slices (variance of the topic chain).
        rng_seed : int, optional
             Random seed.
        initialize_lda : bool, optional
             If True - initialize DTM with LDA.

        """
        if not os.path.isfile(dtm_path):
            raise ValueError("dtm_path must point to the binary file, not to a folder")

        self.dtm_path = dtm_path
        self.id2word = id2word
        if self.id2word is None:
            logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
            self.id2word = utils.dict_from_corpus(corpus)
            self.num_terms = len(self.id2word)
        else:
            self.num_terms = 0 if not self.id2word else 1 + max(self.id2word.keys())
        if self.num_terms == 0:
            raise ValueError("cannot compute DTM over an empty collection (no terms)")
        self.num_topics = num_topics

        try:
            lencorpus = len(corpus)
        except TypeError:
            logger.warning("input corpus stream has no len(); counting documents")
            lencorpus = sum(1 for _ in corpus)
        if lencorpus == 0:
            raise ValueError("cannot compute DTM over an empty corpus")
        if model == "fixed" and any(not text for text in corpus):
            raise ValueError("""There is a text without words in the input corpus.
                    This breaks method='fixed' (The DIM model).""")
        if lencorpus != sum(time_slices):
            raise ValueError(
                "mismatched timeslices %{slices} for corpus of len {clen}"
                .format(slices=sum(time_slices), clen=lencorpus)
            )
        self.lencorpus = lencorpus
        if prefix is None:
            rand_prefix = hex(random.randint(0, 0xffffff))[2:] + '_'
            prefix = os.path.join(tempfile.gettempdir(), rand_prefix)

        self.prefix = prefix
        self.time_slices = time_slices
        self.lda_sequence_min_iter = int(lda_sequence_min_iter)
        self.lda_sequence_max_iter = int(lda_sequence_max_iter)
        self.lda_max_em_iter = int(lda_max_em_iter)
        self.alpha = alpha
        self.top_chain_var = top_chain_var
        self.rng_seed = rng_seed
        self.initialize_lda = str(initialize_lda).lower()

        self.lambda_ = None
        self.obs_ = None
        self.lhood_ = None
        self.gamma_ = None
        self.init_alpha = None
        self.init_beta = None
        self.init_ss = None
        self.em_steps = []
        self.influences_time = []

        if corpus is not None:
            self.train(corpus, time_slices, mode, model)
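A hedged sketch of how this wrapper is typically used, assuming it is gensim's DtmModel from the older gensim.models.wrappers package; the binary path is a placeholder and the toy corpus is invented, so this is a sketch rather than a runnable test.

    from gensim.corpora import Dictionary
    from gensim.models.wrappers.dtmmodel import DtmModel

    # Toy data split into two time slices of one document each;
    # sum(time_slices) must equal the number of documents.
    texts = [["economy", "bank", "market"], ["market", "trade", "economy"]]
    dictionary = Dictionary(texts)
    bow_corpus = [dictionary.doc2bow(text) for text in texts]

    # '/path/to/dtm-binary' is hypothetical and must point to a compiled DTM executable.
    dtm = DtmModel(
        '/path/to/dtm-binary', corpus=bow_corpus, time_slices=[1, 1],
        num_topics=2, id2word=dictionary, initialize_lda=True,
    )

    # Top words of topic 0 in the second time slice.
    print(dtm.show_topic(topicid=0, time=1, topn=3))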
Exemple #39
0
    def save_corpus(fname, corpus, id2word=None, metadata=False):
        """Save a corpus in the Mallet format.

        Warnings
        --------
        This function is automatically called by :meth:`gensim.corpora.malletcorpus.MalletCorpus.serialize`,
        don't call it directly, call :meth:`gensim.corpora.malletcorpus.MalletCorpus.serialize` instead.

        Parameters
        ----------
        fname : str
            Path to output file.
        corpus : iterable of iterable of (int, int)
            Corpus in BoW format.
        id2word : {dict of (int, str), :class:`~gensim.corpora.dictionary.Dictionary`}, optional
            Mapping between word_ids (integers) and words (strings).
            If not provided, the mapping is constructed directly from `corpus`.
        metadata : bool, optional
            If True, each corpus item is expected to be a `(doc, (doc_id, doc_lang))` pair and the
            document id and language from that metadata are written out instead of the defaults.

        Returns
        -------
        list of int
            List of offsets in resulting file for each document (in bytes),
            can be used for :meth:`~gensim.corpora.malletcorpus.MalletCorpus.docbyoffset`.

        Notes
        -----
        The document id will be generated by enumerating the corpus.
        That is, it will range from 0 to the number of documents minus one.

        Since Mallet has a language field in the format, this defaults to the string '__unknown__'.
        If the language needs to be saved, post-processing will be required.

        """
        if id2word is None:
            logger.info("no word id mapping provided; initializing from corpus")
            id2word = utils.dict_from_corpus(corpus)

        logger.info("storing corpus in Mallet format into %s", fname)

        truncated = 0
        offsets = []
        with utils.smart_open(fname, 'wb') as fout:
            for doc_id, doc in enumerate(corpus):
                if metadata:
                    doc_id, doc_lang = doc[1]
                    doc = doc[0]
                else:
                    doc_lang = '__unknown__'

                words = []
                for wordid, value in doc:
                    if abs(int(value) - value) > 1e-6:
                        truncated += 1
                    words.extend([utils.to_unicode(id2word[wordid])] * int(value))
                offsets.append(fout.tell())
                fout.write(utils.to_utf8('%s %s %s\n' % (doc_id, doc_lang, ' '.join(words))))

        if truncated:
            logger.warning(
                "Mallet format can only save vectors with integer elements; "
                "%i float entries were truncated to integer value", truncated
            )

        return offsets
Exemple #40
0
    def __init__(self,
                 sstm_jar_path,
                 model,
                 corpus,
                 id2word=None,
                 vectors=None,
                 num_topics=20,
                 alpha=0.1,
                 beta=0.01,
                 iterations=2000,
                 prefix='results/',
                 name='model',
                 twords=20,
                 sstep=0
                ):
        """

        Parameters
        ----------
        sstm_jar_path : str
            Path to the SSTM jar file.
        corpus : iterable of iterable of (int, int), optional
            Collection of texts in BoW format.
        id2word : :class:`~gensim.corpora.dictionary.Dictionary`, optional
            Mapping between tokens ids and words from corpus, if not specified - will be inferred from `corpus`.
        vectors:
            Path to the word2vec file.
        num_topics : int, optional
            Number of topics.
        alpha : float, optional
            Alpha hyperparameter.
        beta : float, optional
            Beta hyperparameter.
        iterations : int, optional
            Number of training iterations.
        prefix : str, optional
            Prefix for produced temporary files.
        name : str, optional
            Name of topic model experiment.
        twords: int, optional
            Number of the most probable topical words.
        sstep : int, optional
            Step to save the sampling outputs.

        """
        self.avaliable_models = AVALIABLE_MODELS
        self.sstm_jar_path = sstm_jar_path
        self.model = model.upper()
        if self.model not in self.avaliable_models:
            raise ValueError("unknown model")
            
        self.id2word = id2word
        self.vectors = vectors
        if self.id2word is None:
            # a word id mapping is required here; without it the vocabulary size cannot be determined
            raise ValueError("no word id mapping provided")
        else:
            self.num_terms = 0 if not self.id2word else 1 + max(self.id2word.keys())
            
        if self.num_terms == 0:
            raise ValueError("empty collection (no terms)")
            
        self.num_topics = num_topics
        
        self.alpha = [alpha] * num_topics
        self.beta = beta
        
        if prefix is None:
            rand_prefix = hex(random.randint(0, 0xffffff))[2:] + '_'
            prefix = os.path.join(tempfile.gettempdir(), rand_prefix)
            
        self.prefix = prefix
        self.name = name
        self.twords = twords
        self.iterations = iterations
        self.sstep = sstep
        
        if corpus is not None:
            self.train(corpus)
Exemple #41
0
    def __init__(self, corpus=None, num_topics=100, id2word=None, distributed=False,
                 chunksize=2000, passes=1, update_every=1, alpha=None, eta=None, decay=0.5):
        """
        `num_topics` is the number of requested latent topics to be extracted from
        the training corpus.

        `id2word` is a mapping from word ids (integers) to words (strings). It is
        used to determine the vocabulary size, as well as for debugging and topic
        printing.

        `alpha` and `eta` are hyperparameters that affect sparsity of the document-topic
        (theta) and topic-word (lambda) distributions. Both default to a symmetric
        1.0/num_topics (but can be set to a vector, for asymmetric priors).

        Turn on `distributed` to force distributed computing (see the web tutorial
        on how to set up a cluster of machines for gensim).

        Example:

        >>> lda = LdaModel(corpus, num_topics=100)
        >>> print(lda[doc_bow]) # get topic probability distribution for a document
        >>> lda.update(corpus2) # update the LDA model with additional documents
        >>> print(lda[doc_bow])

        """
        # store user-supplied parameters
        self.id2word = id2word
        if corpus is None and self.id2word is None:
            raise ValueError('at least one of corpus/id2word must be specified, to establish input space dimensionality')

        if self.id2word is None:
            logger.info("no word id mapping provided; initializing from corpus, assuming identity")
            self.id2word = utils.dict_from_corpus(corpus)
            self.num_terms = len(self.id2word)
        else:
            self.num_terms = 1 + max([-1] + self.id2word.keys())

        if self.num_terms == 0:
            raise ValueError("cannot compute LDA over an empty collection (no terms)")

        self.distributed = bool(distributed)
        self.num_topics = int(num_topics)
        self.chunksize = chunksize
        self.decay = decay
        self.num_updates = 0

        self.passes = passes
        self.update_every = update_every

        if alpha is None:
            self.alpha = 1.0 / num_topics
        else:
            self.alpha = alpha
        if eta is None:
            self.eta = 1.0 / num_topics
        else:
            self.eta = eta

        # VB constants
        self.VAR_MAXITER = 50
        self.VAR_THRESH = 0.001

        # set up distributed environment if necessary
        if not distributed:
            logger.info("using serial LDA version on this node")
            self.dispatcher = None
            self.numworkers = 1
        else:
            # set up distributed version
            try:
                import Pyro4
                dispatcher = Pyro4.Proxy('PYRONAME:gensim.lda_dispatcher')
                dispatcher._pyroOneway.add("exit")
                logger.debug("looking for dispatcher at %s" % str(dispatcher._pyroUri))
                dispatcher.initialize(id2word=self.id2word, num_topics=num_topics,
                                      chunksize=chunksize, alpha=alpha, eta=eta, distributed=False)
                self.dispatcher = dispatcher
                self.numworkers = len(dispatcher.getworkers())
                logger.info("using distributed version with %i workers" % self.numworkers)
            except Exception as err:
                logger.error("failed to initialize distributed LDA (%s)" % err)
                raise RuntimeError("failed to initialize distributed LDA (%s)" % err)
Exemple #42
0
    def __init__(self, corpus=None, num_topics=200, id2word=None, chunksize=20000,
                 decay=1.0, distributed=False, onepass=True,
                 power_iters=P2_EXTRA_ITERS, extra_samples=P2_EXTRA_DIMS):
        """
        `num_topics` is the number of requested factors (latent dimensions).

        After the model has been trained, you can estimate topics for an
        arbitrary, unseen document, using the ``topics = self[document]`` dictionary
        notation. You can also add new training documents, with ``self.add_documents``,
        so that training can be stopped and resumed at any time, and the
        LSI transformation is available at any point.

        If you specify a `corpus`, it will be used to train the model. See the
        method `add_documents` for a description of the `chunksize` and `decay` parameters.

        Turn `onepass` off to force a multi-pass stochastic algorithm.

        `power_iters` and `extra_samples` affect the accuracy of the stochastic
        multi-pass algorithm, which is used either internally (`onepass=True`) or
        as the front-end algorithm (`onepass=False`). Increasing the number of
        power iterations improves accuracy, but lowers performance. See [3]_ for
        some hard numbers.

        Turn on `distributed` to enable distributed computing.

        Example:

        >>> lsi = LsiModel(corpus, num_topics=10)
        >>> print(lsi[doc_tfidf]) # project some document into LSI space
        >>> lsi.add_documents(corpus2) # update LSI on additional documents
        >>> print(lsi[doc_tfidf])

        .. [3] http://nlp.fi.muni.cz/~xrehurek/nips/rehurek_nips.pdf

        """
        self.id2word = id2word
        self.num_topics = int(num_topics)
        self.chunksize = int(chunksize)
        self.decay = float(decay)
        if distributed:
            if not onepass:
                logger.warning("forcing the one-pass algorithm for distributed LSA")
                onepass = True
        self.onepass = onepass
        self.extra_samples, self.power_iters = extra_samples, power_iters

        if corpus is None and self.id2word is None:
            raise ValueError('at least one of corpus/id2word must be specified, to establish input space dimensionality')

        if self.id2word is None:
            logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
            self.id2word = utils.dict_from_corpus(corpus)
            self.num_terms = len(self.id2word)
        else:
            self.num_terms = 1 + max([-1] + self.id2word.keys())

        self.docs_processed = 0
        self.projection = Projection(self.num_terms, self.num_topics, power_iters=self.power_iters, extra_dims=self.extra_samples)

        self.numworkers = 1
        if not distributed:
            logger.info("using serial LSI version on this node")
            self.dispatcher = None
        else:
            if not onepass:
                raise NotImplementedError("distributed stochastic LSA not implemented yet; "
                                          "run either distributed one-pass, or serial randomized.")
            try:
                import Pyro4
                dispatcher = Pyro4.Proxy('PYRONAME:gensim.lsi_dispatcher')
                logger.debug("looking for dispatcher at %s" % str(dispatcher._pyroUri))
                dispatcher.initialize(id2word=self.id2word, num_topics=num_topics,
                                      chunksize=chunksize, decay=decay,
                                      power_iters=self.power_iters, extra_samples=self.extra_samples,
                                      distributed=False, onepass=onepass)
                self.dispatcher = dispatcher
                self.numworkers = len(dispatcher.getworkers())
                logger.info("using distributed version with %i workers" % self.numworkers)
            except Exception as err:
                # distributed version was specifically requested, so this is an error state
                logger.error("failed to initialize distributed LSI (%s)" % err)
                raise RuntimeError("failed to initialize distributed LSI (%s)" % err)

        if corpus is not None:
            self.add_documents(corpus)
Exemple #43
0
    def __init__(self,
                 dtm_path,
                 corpus=None,
                 time_slices=None,
                 mode='fit',
                 model='dtm',
                 num_topics=100,
                 id2word=None,
                 prefix=None,
                 lda_sequence_min_iter=6,
                 lda_sequence_max_iter=20,
                 lda_max_em_iter=10,
                 alpha=0.01,
                 top_chain_var=0.005,
                 rng_seed=0,
                 initialize_lda=True):
        """

        Parameters
        ----------
        dtm_path : str
            Path to the dtm binary, e.g. `/home/username/dtm/dtm/main`.
        corpus : iterable of iterable of (int, int)
            Collection of texts in BoW format.
        time_slices : list of int
            Sequence of timestamps.
        mode : {'fit', 'time'}, optional
            Controls the mode of the model: 'fit' is for training, 'time' for analyzing documents through time
            according to a DTM, basically a held-out set.
        model : {'fixed', 'dtm'}, optional
            Controls which model will be run: 'fixed' is for DIM and 'dtm' for DTM.
        num_topics : int, optional
            Number of topics.
        id2word : :class:`~gensim.corpora.dictionary.Dictionary`, optional
            Mapping between tokens ids and words from corpus, if not specified - will be inferred from `corpus`.
        prefix : str, optional
            Prefix for produced temporary files.
        lda_sequence_min_iter : int, optional
             Min iteration of LDA.
        lda_sequence_max_iter : int, optional
            Max iteration of LDA.
        lda_max_em_iter : int, optional
             Max em optimization iterations in LDA.
        alpha : float, optional
            Hyperparameter that affects sparsity of the document-topics for the LDA models in each timeslice.
        top_chain_var : float, optional
            Hyperparameter that affects how much the topics can evolve between consecutive time slices (variance of the topic chain).
        rng_seed : int, optional
             Random seed.
        initialize_lda : bool, optional
             If True - initialize DTM with LDA.

        """
        if not os.path.isfile(dtm_path):
            raise ValueError(
                "dtm_path must point to the binary file, not to a folder")

        self.dtm_path = dtm_path
        self.id2word = id2word
        if self.id2word is None:
            logger.warning(
                "no word id mapping provided; initializing from corpus, assuming identity"
            )
            self.id2word = utils.dict_from_corpus(corpus)
            self.num_terms = len(self.id2word)
        else:
            self.num_terms = 0 if not self.id2word else 1 + max(
                self.id2word.keys())
        if self.num_terms == 0:
            raise ValueError(
                "cannot compute DTM over an empty collection (no terms)")
        self.num_topics = num_topics

        try:
            lencorpus = len(corpus)
        except TypeError:
            logger.warning(
                "input corpus stream has no len(); counting documents")
            lencorpus = sum(1 for _ in corpus)
        if lencorpus == 0:
            raise ValueError("cannot compute DTM over an empty corpus")
        if model == "fixed" and any(not text for text in corpus):
            raise ValueError(
                """There is a text without words in the input corpus.
                    This breaks method='fixed' (The DIM model).""")
        if lencorpus != sum(time_slices):
            raise ValueError(
                "mismatched timeslices %{slices} for corpus of len {clen}".
                format(slices=sum(time_slices), clen=lencorpus))
        self.lencorpus = lencorpus
        if prefix is None:
            rand_prefix = hex(random.randint(0, 0xffffff))[2:] + '_'
            prefix = os.path.join(tempfile.gettempdir(), rand_prefix)

        self.prefix = prefix
        self.time_slices = time_slices
        self.lda_sequence_min_iter = int(lda_sequence_min_iter)
        self.lda_sequence_max_iter = int(lda_sequence_max_iter)
        self.lda_max_em_iter = int(lda_max_em_iter)
        self.alpha = alpha
        self.top_chain_var = top_chain_var
        self.rng_seed = rng_seed
        self.initialize_lda = str(initialize_lda).lower()

        self.lambda_ = None
        self.obs_ = None
        self.lhood_ = None
        self.gamma_ = None
        self.init_alpha = None
        self.init_beta = None
        self.init_ss = None
        self.em_steps = []
        self.influences_time = []

        if corpus is not None:
            self.train(corpus, time_slices, mode, model)
Exemple #44
0
    def __init__(self, vw_path, corpus=None, num_topics=100, id2word=None,
                 chunksize=256, passes=1, alpha=0.1, eta=0.1, decay=0.5,
                 offset=1, gamma_threshold=0.001, random_seed=None,
                 cleanup_files=True, tmp_prefix='tmp'):
        """`vw_path` is the path to Vowpal Wabbit's 'vw' executable.

        `corpus` is an iterable training corpus. If given, training will
        start immediately, otherwise the model is left untrained (presumably
        because you want to call `update()` manually).

        `num_topics` is the number of requested latent topics to be extracted
        from the training corpus.
        Corresponds to VW's '--lda <num_topics>' argument.

        `id2word` is a mapping from word ids (integers) to words (strings).
        It is used to determine the vocabulary size, as well as for debugging
        and topic printing.

        `chunksize` is the number of documents examined in each batch.
        Corresponds to VW's '--minibatch <batch_size>' argument.

        `passes` is the number of passes over the dataset to use.
        Corresponds to VW's '--passes <passes>' argument.

        `alpha` is a float affecting sparsity of per-document topic weights.
        This is applied symmetrically, and should be set higher when
        documents are thought to look more similar.
        Corresponds to VW's '--lda_alpha <alpha>' argument.

        `eta` is a float which affects the sparsity of topic distributions.
        This is applied symmetrically, and should be set higher when topics
        are thought to look more similar.
        Corresponds to VW's '--lda_rho <rho>' argument.

        `decay` learning rate decay, affects how quickly learnt values
        are forgotten. Should be set to a value between 0.5 and 1.0 to
        guarantee convergence.
        Corresponds to VW's '--power_t <tau>' argument.

        `offset` integer learning offset, set to higher values to slow down
        learning on early iterations of the algorithm.
        Corresponds to VW's '--initial_t <tau>' argument.

        `gamma_threshold` affects when learning loop will be broken out of,
        higher values will result in earlier loop completion.
        Corresponds to VW's '--epsilon <eps>' argument.

        `random_seed` sets Vowpal Wabbit's random seed when learning.
        Corresponds to VW's '--random_seed <seed>' argument.

        `cleanup_files` whether or not to delete temporary directory and files
        used by this wrapper. Setting to False can be useful for debugging,
        or for re-using Vowpal Wabbit files elsewhere.

        `tmp_prefix` used to prefix temporary working directory name.
        """
        # default parameters are taken from Vowpal Wabbit's defaults, and
        # parameter names changed to match Gensim's LdaModel where possible
        self.vw_path = vw_path
        self.id2word = id2word

        if self.id2word is None:
            if corpus is None:
                raise ValueError(
                    "at least one of corpus/id2word must be specified, to establish input space dimensionality"
                )
            logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
            self.id2word = utils.dict_from_corpus(corpus)
            self.num_terms = len(self.id2word)
        elif len(self.id2word) > 0:
            self.num_terms = 1 + max(self.id2word.keys())
        else:
            self.num_terms = 0

        if self.num_terms == 0:
            raise ValueError("cannot compute LDA over an empty collection (no terms)")

        # LDA parameters
        self.num_topics = num_topics
        self.chunksize = chunksize
        self.passes = passes
        self.alpha = alpha
        self.eta = eta
        self.gamma_threshold = gamma_threshold
        self.offset = offset
        self.decay = decay
        self.random_seed = random_seed
        self._initial_offset = offset

        # temporary files used for Vowpal Wabbit input/output
        self.tmp_dir = None
        self.tmp_prefix = tmp_prefix
        self.cleanup_files = cleanup_files
        self._init_temp_dir(tmp_prefix)

        # used for saving/loading this model's state
        self._model_data = None
        self._topics_data = None

        # cache loaded topics as numpy array
        self._topics = None

        if corpus is not None:
            self.train(corpus)
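A hedged usage sketch for this wrapper, assuming it is gensim's LdaVowpalWabbit class; the 'vw' path is a placeholder for an installed Vowpal Wabbit executable and the toy corpus is invented.

    from gensim.corpora import Dictionary
    from gensim.models.wrappers.ldavowpalwabbit import LdaVowpalWabbit

    texts = [["human", "interface", "computer"], ["survey", "user", "system"]]
    dictionary = Dictionary(texts)
    bow_corpus = [dictionary.doc2bow(text) for text in texts]

    lda_vw = LdaVowpalWabbit(
        '/usr/bin/vw', corpus=bow_corpus, num_topics=2, id2word=dictionary,
        passes=2, chunksize=256, alpha=0.1, eta=0.1,
    )

    # Incremental update with more documents, then inspect the topics.
    lda_vw.update([dictionary.doc2bow(["computer", "system"])])
    print(lda_vw.print_topics(2))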
Exemple #45
0
    def __init__(
        self,
        corpus=None,
        num_topics=100,
        id2word=None,
        chunksize=2000,
        passes=1,
        kappa=1.0,
        minimum_probability=0.01,
        w_max_iter=200,
        w_stop_condition=1e-4,
        h_max_iter=50,
        h_stop_condition=1e-3,
        eval_every=10,
        normalize=True,
        random_state=None,
    ):
        r"""

        Parameters
        ----------
        corpus : iterable of list of (int, float) or `csc_matrix` with the shape (n_tokens, n_documents), optional
            Training corpus.
            Can be either iterable of documents, which are lists of `(word_id, word_count)`,
            or a sparse csc matrix of BOWs for each document.
            If not specified, the model is left uninitialized (presumably, to be trained later with `self.train()`).
        num_topics : int, optional
            Number of topics to extract.
        id2word: {dict of (int, str), :class:`gensim.corpora.dictionary.Dictionary`}
            Mapping from word IDs to words. It is used to determine the vocabulary size, as well as for
            debugging and topic printing.
        chunksize: int, optional
            Number of documents to be used in each training chunk.
        passes: int, optional
            Number of full passes over the training corpus.
            Leave at default `passes=1` if your input is an iterator.
        kappa : float, optional
            Gradient descent step size.
            Larger value makes the model train faster, but could lead to non-convergence if set too large.
        minimum_probability: float, optional
            If `normalize` is True, topics with smaller probabilities are filtered out.
            If `normalize` is False, topics with smaller factors are filtered out.
            If set to None, a value of 1e-8 is used to prevent 0s.
        w_max_iter: int, optional
            Maximum number of iterations to train W per each batch.
        w_stop_condition: float, optional
            If error difference gets less than that, training of ``W`` stops for the current batch.
        h_max_iter: int, optional
            Maximum number of iterations to train h per each batch.
        h_stop_condition: float
            If error difference gets less than that, training of ``h`` stops for the current batch.
        eval_every: int, optional
            Number of batches after which l2 norm of (v - Wh) is computed. Decreases performance if set too low.
        normalize: bool or None, optional
            Whether to normalize the result. Allows for estimation of perplexity, coherence, etc.
        random_state: {np.random.RandomState, int}, optional
            Seed for random generator. Needed for reproducibility.

        """
        self.num_topics = num_topics
        self.id2word = id2word
        self.chunksize = chunksize
        self.passes = passes
        self._kappa = kappa
        self.minimum_probability = minimum_probability
        self._w_max_iter = w_max_iter
        self._w_stop_condition = w_stop_condition
        self._h_max_iter = h_max_iter
        self._h_stop_condition = h_stop_condition
        self.eval_every = eval_every
        self.normalize = normalize
        self.random_state = utils.get_random_state(random_state)

        self.v_max = None

        if self.id2word is None:
            self.id2word = utils.dict_from_corpus(corpus)

        self.num_tokens = len(self.id2word)

        self.A = None
        self.B = None

        self._W = None
        self.w_std = None
        self._w_error = np.inf

        self._h = None

        if corpus is not None:
            self.update(corpus)
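A small usage sketch grounding the parameter list above, assuming this is gensim's Nmf model (gensim.models.nmf); the toy corpus is invented.

    from gensim.corpora import Dictionary
    from gensim.models.nmf import Nmf

    texts = [
        ["graph", "minors", "trees"],
        ["graph", "survey", "minors"],
        ["human", "interface", "user"],
    ]
    dictionary = Dictionary(texts)
    bow_corpus = [dictionary.doc2bow(text) for text in texts]

    nmf = Nmf(corpus=bow_corpus, num_topics=2, id2word=dictionary, passes=5, random_state=42)

    # Topic factors for an unseen document, filtered by minimum_probability,
    # and the top words of each learned topic.
    print(nmf[dictionary.doc2bow(["graph", "trees"])])
    print(nmf.show_topics(num_topics=2, num_words=3))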
Exemple #46
0
    def __init__(self, corpus=None, num_topics=100, id2word=None,
                 distributed=False, chunksize=2000, passes=1, update_every=1,
                 alpha='symmetric', eta=None, decay=0.5, offset=1.0,
                 eval_every=10, iterations=50, gamma_threshold=0.001):
        """
        If given, start training from the iterable `corpus` straight away. If not given,
        the model is left untrained (presumably because you want to call `update()` manually).

        `num_topics` is the number of requested latent topics to be extracted from
        the training corpus.

        `id2word` is a mapping from word ids (integers) to words (strings). It is
        used to determine the vocabulary size, as well as for debugging and topic
        printing.

        `alpha` and `eta` are hyperparameters that affect sparsity of the document-topic
        (theta) and topic-word (lambda) distributions. Both default to a symmetric
        1.0/num_topics prior.

        `alpha` can be set to an explicit array (a prior of your choice). It also
        supports the special values 'asymmetric' and 'auto': the former uses a fixed
        normalized asymmetric 1.0/topicno prior, the latter learns an asymmetric
        prior directly from your data.

        `eta` can be a scalar for a symmetric prior over topic/word
        distributions, or a matrix of shape num_topics x num_words,
        which can be used to impose asymmetric priors over the word
        distribution on a per-topic basis. This may be useful if you
        want to seed certain topics with particular words by boosting
        the priors for those words.

        Turn on `distributed` to force distributed computing (see the `web tutorial <http://radimrehurek.com/gensim/distributed.html>`_
        on how to set up a cluster of machines for gensim).

        Calculate and log perplexity estimate from the latest mini-batch every
        `eval_every` model updates (setting this to 1 slows down training ~2x;
        default is 10 for better performance). Set to None to disable perplexity estimation.

        `decay` and `offset` parameters are the same as Kappa and Tau_0 in
        Hoffman et al, respectively.

        Example:

        >>> lda = LdaModel(corpus, num_topics=100)  # train model
        >>> print(lda[doc_bow]) # get topic probability distribution for a document
        >>> lda.update(corpus2) # update the LDA model with additional documents
        >>> print(lda[doc_bow])

        >>> lda = LdaModel(corpus, num_topics=50, alpha='auto', eval_every=5)  # train asymmetric alpha from data

        """
        # store user-supplied parameters
        self.id2word = id2word
        if corpus is None and self.id2word is None:
            raise ValueError('at least one of corpus/id2word must be specified, to establish input space dimensionality')

        if self.id2word is None:
            logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
            self.id2word = utils.dict_from_corpus(corpus)
            self.num_terms = len(self.id2word)
        elif len(self.id2word) > 0:
            self.num_terms = 1 + max(self.id2word.keys())
        else:
            self.num_terms = 0

        if self.num_terms == 0:
            raise ValueError("cannot compute LDA over an empty collection (no terms)")

        self.distributed = bool(distributed)
        self.num_topics = int(num_topics)
        self.chunksize = chunksize
        self.decay = decay
        self.offset = offset
        self.num_updates = 0

        self.passes = passes
        self.update_every = update_every
        self.eval_every = eval_every

        self.optimize_alpha = alpha == 'auto'
        if alpha == 'symmetric' or alpha is None:
            logger.info("using symmetric alpha at %s" % (1.0 / num_topics))
            self.alpha = numpy.asarray([1.0 / num_topics for i in xrange(num_topics)])
        elif alpha == 'asymmetric':
            self.alpha = numpy.asarray([1.0 / (i + numpy.sqrt(num_topics)) for i in xrange(num_topics)])
            self.alpha /= self.alpha.sum()
            logger.info("using asymmetric alpha %s" % list(self.alpha))
        elif alpha == 'auto':
            self.alpha = numpy.asarray([1.0 / num_topics for i in xrange(num_topics)])
            logger.info("using autotuned alpha, starting with %s" % list(self.alpha))
        else:
            # must be either float or an array of floats, of size num_topics
            self.alpha = alpha if isinstance(alpha, numpy.ndarray) else numpy.asarray([alpha] * num_topics)
            if len(self.alpha) != num_topics:
                raise RuntimeError("invalid alpha shape (must match num_topics)")

        if eta is None:
            self.eta = 1.0 / num_topics
        else:
            self.eta = eta

        # VB constants
        self.iterations = iterations
        self.gamma_threshold = gamma_threshold

        # set up distributed environment if necessary
        if not distributed:
            logger.info("using serial LDA version on this node")
            self.dispatcher = None
            self.numworkers = 1
        else:
            if self.optimize_alpha:
                raise NotImplementedError("auto-optimizing alpha not implemented in distributed LDA")
            # set up distributed version
            try:
                import Pyro4
                dispatcher = Pyro4.Proxy('PYRONAME:gensim.lda_dispatcher')
                logger.debug("looking for dispatcher at %s" % str(dispatcher._pyroUri))
                dispatcher.initialize(id2word=self.id2word, num_topics=num_topics,
                                      chunksize=chunksize, alpha=alpha, eta=eta, distributed=False)
                self.dispatcher = dispatcher
                self.numworkers = len(dispatcher.getworkers())
                logger.info("using distributed version with %i workers" % self.numworkers)
            except Exception as err:
                logger.error("failed to initialize distributed LDA (%s)" % err)
                raise RuntimeError("failed to initialize distributed LDA (%s)" % err)

        # Initialize the variational distribution q(beta|lambda)
        self.state = LdaState(self.eta, (self.num_topics, self.num_terms))
        self.state.sstats = numpy.random.gamma(100., 1. / 100., (self.num_topics, self.num_terms))
        self.sync_state()

        # if a training corpus was provided, start estimating the model right away
        if corpus is not None:
            self.update(corpus)
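The docstring above notes that `eta` may be a num_topics x num_words matrix used to seed topics with particular words. A hedged sketch of that idea, assuming a gensim version whose LdaModel accepts a full eta matrix; the toy corpus and the boost factor are invented.

    import numpy as np
    from gensim.corpora import Dictionary
    from gensim.models import LdaModel

    texts = [["bank", "river", "water"], ["bank", "money", "loan"], ["river", "water", "flow"]]
    dictionary = Dictionary(texts)
    bow_corpus = [dictionary.doc2bow(text) for text in texts]

    num_topics = 2
    # Start from a flat symmetric prior, then boost "money" in topic 0 and
    # "river" in topic 1 to nudge those topics toward those words.
    eta = np.full((num_topics, len(dictionary)), 1.0 / num_topics)
    eta[0, dictionary.token2id["money"]] *= 10
    eta[1, dictionary.token2id["river"]] *= 10

    lda = LdaModel(bow_corpus, num_topics=num_topics, id2word=dictionary, eta=eta, passes=10)
    print(lda.show_topics(num_words=4))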
Exemple #47
0
    def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_topics=10,
                initialize='gensim', sstats=None,  lda_model=None, obs_variance=0.5, chain_variance=0.005, passes=10, 
                random_state=None, lda_inference_max_iter=25, em_min_iter=6, em_max_iter=20, chunksize=100):
        """
        `corpus` is any iterable gensim corpus

        `time_slice` as described above is a list which contains the number of documents in each time-slice

        `id2word` is a mapping from word ids (integers) to words (strings). It is used to determine the vocabulary size and printing topics.

        `alphas` is a prior of your choice and should be a double or float value. Default is 0.01.

        `num_topics` is the number of requested latent topics to be extracted from the training corpus.

        `initialize` allows the user to decide how to initialise the DTM model. Default is through gensim LDA.
        You can also use your own sstats from a previously trained LDA model by specifying 'own' and passing a numpy matrix through `sstats`.
        If you wish to just pass a previously used LDA model, pass it through `lda_model`.
        Shape of sstats is (vocab_len, num_topics).

        `chain_variance` is a constant which dictates how the beta values evolve - it is a gaussian parameter defined in the
        beta distribution.

        `passes` is the number of passes of the initial LdaModel.

        `random_state` can be a numpy.random.RandomState object or the seed for one, for the LdaModel.
        """
        self.id2word = id2word
        if corpus is None and self.id2word is None:
            raise ValueError('at least one of corpus/id2word must be specified, to establish input space dimensionality')

        if self.id2word is None:
            logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
            self.id2word = utils.dict_from_corpus(corpus)
            self.vocab_len = len(self.id2word)
        elif len(self.id2word) > 0:
            self.vocab_len = len(self.id2word)
        else:
            self.vocab_len = 0

        if corpus is not None:
            try:
                self.corpus_len = len(corpus)
            except TypeError:
                logger.warning("input corpus stream has no len(); counting documents")
                self.corpus_len = sum(1 for _ in corpus)

        self.time_slice = time_slice
        if self.time_slice is not None:
            self.num_time_slices = len(time_slice)

        max_doc_len = 0
        for line_no, line in enumerate(corpus):
            if len(line) > max_doc_len:
                max_doc_len = len(line)
        self.max_doc_len = max_doc_len

        self.num_topics = num_topics
        self.num_time_slices = len(time_slice)
        self.alphas = numpy.full(num_topics, alphas)

        # topic_chains contains for each topic a 'state space language model' object which in turn has information about each topic
        # the sslm class is described below and contains information on topic-word probabilities and doc-topic probabilities.
        self.topic_chains = []
        for topic in range(0, num_topics):
            sslm_ = sslm(num_time_slices=self.num_time_slices, vocab_len=self.vocab_len, num_topics=self.num_topics, chain_variance=chain_variance, obs_variance=obs_variance)
            self.topic_chains.append(sslm_)

        # the following are class variables which are to be integrated during Document Influence Model
        self.top_doc_phis = None
        self.influence = None
        self.renormalized_influence = None
        self.influence_sum_lgl = None

        # if a corpus and time_slice is provided, depending on the user choice of initializing LDA, we start DTM.
        if corpus is not None and time_slice is not None:
            if initialize == 'gensim':
                lda_model = ldamodel.LdaModel(corpus, id2word=self.id2word, num_topics=self.num_topics, passes=passes, alpha=self.alphas, random_state=random_state)
                self.sstats = numpy.transpose(lda_model.state.sstats)
            if initialize == 'ldamodel':
                self.sstats = numpy.transpose(lda_model.state.sstats)
            if initialize == 'own':
                self.sstats = sstats

            # initialize model from sstats
            self.init_ldaseq_ss(chain_variance, obs_variance, self.alphas, self.sstats)

            # fit DTM
            self.fit_lda_seq(corpus, lda_inference_max_iter, em_min_iter, em_max_iter, chunksize)
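A hedged usage sketch, assuming the class above is gensim's LdaSeqModel; the toy corpus and time slices are invented, and sum(time_slice) must equal the number of documents.

    from gensim.corpora import Dictionary
    from gensim.models.ldaseqmodel import LdaSeqModel

    # Three documents split into three chronological slices of one document each.
    texts = [["bank", "river", "water"], ["bank", "money", "loan"], ["money", "loan", "credit"]]
    dictionary = Dictionary(texts)
    bow_corpus = [dictionary.doc2bow(text) for text in texts]

    ldaseq = LdaSeqModel(corpus=bow_corpus, id2word=dictionary, time_slice=[1, 1, 1], num_topics=2)

    # Topic 0 as it looks in the first and the last time slice.
    print(ldaseq.print_topic(topic=0, time=0, top_terms=3))
    print(ldaseq.print_topic(topic=0, time=2, top_terms=3))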
Exemple #48
0
    def __init__(self,
                 corpus=None,
                 num_topics=200,
                 id2word=None,
                 chunksize=20000,
                 decay=1.0,
                 distributed=False,
                 onepass=True,
                 power_iters=P2_EXTRA_ITERS,
                 extra_samples=P2_EXTRA_DIMS):
        """
        `num_topics` is the number of requested factors (latent dimensions).

        After the model has been trained, you can estimate topics for an
        arbitrary, unseen document, using the ``topics = self[document]`` dictionary
        notation. You can also add new training documents, with ``self.add_documents``,
        so that training can be stopped and resumed at any time, and the
        LSI transformation is available at any point.

        If you specify a `corpus`, it will be used to train the model. See the
        method `add_documents` for a description of the `chunksize` and `decay` parameters.

        Turn `onepass` off to force a multi-pass stochastic algorithm.

        `power_iters` and `extra_samples` affect the accuracy of the stochastic
        multi-pass algorithm, which is used either internally (`onepass=True`) or
        as the front-end algorithm (`onepass=False`). Increasing the number of
        power iterations improves accuracy, but lowers performance. See [3]_ for
        some hard numbers.

        Turn on `distributed` to enable distributed computing.

        Example:

        >>> lsi = LsiModel(corpus, num_topics=10)
        >>> print(lsi[doc_tfidf]) # project some document into LSI space
        >>> lsi.add_documents(corpus2) # update LSI on additional documents
        >>> print(lsi[doc_tfidf])

        .. [3] http://nlp.fi.muni.cz/~xrehurek/nips/rehurek_nips.pdf

        """
        self.id2word = id2word
        self.num_topics = int(num_topics)
        self.chunksize = int(chunksize)
        self.decay = float(decay)
        if distributed:
            if not onepass:
                logger.warning(
                    "forcing the one-pass algorithm for distributed LSA")
                onepass = True
        self.onepass = onepass
        self.extra_samples, self.power_iters = extra_samples, power_iters

        if corpus is None and self.id2word is None:
            raise ValueError(
                'at least one of corpus/id2word must be specified, to establish input space dimensionality'
            )

        if self.id2word is None:
            logger.warning(
                "no word id mapping provided; initializing from corpus, assuming identity"
            )
            self.id2word = utils.dict_from_corpus(corpus)
            self.num_terms = len(self.id2word)
        else:
            self.num_terms = 1 + max([-1] + self.id2word.keys())

        self.docs_processed = 0
        self.projection = Projection(self.num_terms,
                                     self.num_topics,
                                     power_iters=self.power_iters,
                                     extra_dims=self.extra_samples)

        self.numworkers = 1
        if not distributed:
            logger.info("using serial LSI version on this node")
            self.dispatcher = None
        else:
            if not onepass:
                raise NotImplementedError(
                    "distributed stochastic LSA not implemented yet; "
                    "run either distributed one-pass, or serial randomized.")
            try:
                import Pyro4
                dispatcher = Pyro4.Proxy('PYRONAME:gensim.lsi_dispatcher')
                logger.debug("looking for dispatcher at %s",
                             str(dispatcher._pyroUri))
                dispatcher.initialize(id2word=self.id2word,
                                      num_topics=num_topics,
                                      chunksize=chunksize,
                                      decay=decay,
                                      power_iters=self.power_iters,
                                      extra_samples=self.extra_samples,
                                      distributed=False,
                                      onepass=onepass)
                self.dispatcher = dispatcher
                self.numworkers = len(dispatcher.getworkers())
                logger.info("using distributed version with %i workers",
                            self.numworkers)
            except Exception as err:
                # distributed version was specifically requested, so this is an error state
                logger.error("failed to initialize distributed LSI (%s)", err)
                raise RuntimeError(
                    "failed to initialize distributed LSI (%s)" % err)

        if corpus is not None:
            self.add_documents(corpus)
Exemple #49
0
    def __init__(self,
                 dtm_path,
                 corpus=None,
                 time_slices=None,
                 mode='fit',
                 model='dtm',
                 num_topics=100,
                 id2word=None,
                 prefix=None,
                 lda_sequence_min_iter=6,
                 lda_sequence_max_iter=20,
                 lda_max_em_iter=10,
                 alpha=0.01,
                 top_chain_var=0.005,
                 rng_seed=0,
                 initialize_lda=True):
        """
        `dtm_path` is path to the dtm executable, e.g. `C:/dtm/dtm-win64.exe`.

        `corpus` is a gensim corpus, aka a stream of sparse document vectors.

        `id2word` is a mapping between tokens ids and token.

        `mode` controls the mode of the model: 'fit' is for training, 'time' for
        analyzing documents through time according to a DTM, basically a held-out set.

        `model` controls the choice of model. 'fixed' is for DIM and 'dtm' for DTM.

        `lda_sequence_min_iter` min iteration of LDA.

        `lda_sequence_max_iter` max iteration of LDA.

        `lda_max_em_iter` max EM optimization iterations in LDA.

        `alpha` is a hyperparameter that affects sparsity of the document-topics for the LDA models in each timeslice.

        `top_chain_var` is a hyperparameter that affects how much the topics can evolve between consecutive time slices (variance of the topic chain).

        `rng_seed` is the random seed.

        `initialize_lda` initialize DTM with LDA.

        """
        if not os.path.isfile(dtm_path):
            raise ValueError(
                "dtm_path must point to the binary file, not to a folder")

        self.dtm_path = dtm_path
        self.id2word = id2word
        if self.id2word is None:
            logger.warning(
                "no word id mapping provided; initializing from corpus, assuming identity"
            )
            self.id2word = utils.dict_from_corpus(corpus)
            self.num_terms = len(self.id2word)
        else:
            self.num_terms = 0 if not self.id2word else 1 + max(
                self.id2word.keys())
        if self.num_terms == 0:
            raise ValueError(
                "cannot compute DTM over an empty collection (no terms)")
        self.num_topics = num_topics

        try:
            lencorpus = len(corpus)
        except TypeError:
            logger.warning(
                "input corpus stream has no len(); counting documents")
            lencorpus = sum(1 for _ in corpus)
        if lencorpus == 0:
            raise ValueError("cannot compute DTM over an empty corpus")
        if model == "fixed" and any(not text for text in corpus):
            raise ValueError(
                """There is a text without words in the input corpus.
                    This breaks method='fixed' (The DIM model).""")
        if lencorpus != sum(time_slices):
            raise ValueError(
                "mismatched timeslices %{slices} for corpus of len {clen}".
                format(slices=sum(time_slices), clen=lencorpus))
        self.lencorpus = lencorpus
        if prefix is None:
            rand_prefix = hex(random.randint(0, 0xffffff))[2:] + '_'
            prefix = os.path.join(tempfile.gettempdir(), rand_prefix)

        self.prefix = prefix
        self.time_slices = time_slices
        self.lda_sequence_min_iter = int(lda_sequence_min_iter)
        self.lda_sequence_max_iter = int(lda_sequence_max_iter)
        self.lda_max_em_iter = int(lda_max_em_iter)
        self.alpha = alpha
        self.top_chain_var = top_chain_var
        self.rng_seed = rng_seed
        self.initialize_lda = str(initialize_lda).lower()

        self.lambda_ = None
        self.obs_ = None
        self.lhood_ = None
        self.gamma_ = None
        self.init_alpha = None
        self.init_beta = None
        self.init_ss = None
        self.em_steps = []
        self.influences_time = []

        if corpus is not None:
            self.train(corpus, time_slices, mode, model)
Exemple #50
0
    def __init__(self, corpus=None, num_topics=200, id2word=None, chunksize=20000,
                 decay=1.0, distributed=False, onepass=True,
                 power_iters=P2_EXTRA_ITERS, extra_samples=P2_EXTRA_DIMS, dtype=np.float64):
        """Construct an `LsiModel` object.

        Either `corpus` or `id2word` must be supplied in order to train the model.

        Parameters
        ----------
        corpus : {iterable of list of (int, float), scipy.sparse.csc}, optional
            Stream of document vectors or sparse matrix of shape (`num_documents`, `num_terms`).
        num_topics : int, optional
            Number of requested factors (latent dimensions)
        id2word : dict of {int: str}, optional
            ID to word mapping, optional.
        chunksize :  int, optional
            Number of documents to be used in each training chunk.
        decay : float, optional
            Weight of existing observations relative to new ones.
        distributed : bool, optional
            If True - distributed mode (parallel execution on several machines) will be used.
        onepass : bool, optional
            Whether the one-pass algorithm should be used for training.
            Pass `False` to force a multi-pass stochastic algorithm.
        power_iters : int, optional
            Number of power iteration steps to be used.
            Increasing the number of power iterations improves accuracy, but lowers performance.
        extra_samples : int, optional
            Extra samples to be used besides the rank `k`. Can improve accuracy.
        dtype : type, optional
            Enforces a type for elements of the decomposed matrix.

        """
        self.id2word = id2word
        self.num_topics = int(num_topics)
        self.chunksize = int(chunksize)
        self.decay = float(decay)
        if distributed:
            if not onepass:
                logger.warning("forcing the one-pass algorithm for distributed LSA")
                onepass = True
        self.onepass = onepass
        self.extra_samples, self.power_iters = extra_samples, power_iters
        self.dtype = dtype

        if corpus is None and self.id2word is None:
            raise ValueError(
                'at least one of corpus/id2word must be specified, to establish input space dimensionality'
            )

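        # Infer the vocabulary size: scan the corpus when no id2word mapping is
        # given; otherwise take the largest word id in the mapping (0 if empty).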
        if self.id2word is None:
            logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
            self.id2word = utils.dict_from_corpus(corpus)
            self.num_terms = len(self.id2word)
        else:
            self.num_terms = 1 + (max(self.id2word.keys()) if self.id2word else -1)

        self.docs_processed = 0
        self.projection = Projection(
            self.num_terms, self.num_topics, power_iters=self.power_iters, extra_dims=self.extra_samples, dtype=dtype
        )

        self.numworkers = 1
        if not distributed:
            logger.info("using serial LSI version on this node")
            self.dispatcher = None
        else:
            if not onepass:
                raise NotImplementedError(
                    "distributed stochastic LSA not implemented yet; "
                    "run either distributed one-pass, or serial randomized."
                )
            try:
                import Pyro4
                dispatcher = Pyro4.Proxy('PYRONAME:gensim.lsi_dispatcher')
                logger.debug("looking for dispatcher at %s", str(dispatcher._pyroUri))
                dispatcher.initialize(
                    id2word=self.id2word, num_topics=num_topics, chunksize=chunksize, decay=decay,
                    power_iters=self.power_iters, extra_samples=self.extra_samples, distributed=False, onepass=onepass
                )
                self.dispatcher = dispatcher
                self.numworkers = len(dispatcher.getworkers())
                logger.info("using distributed version with %i workers", self.numworkers)
            except Exception as err:
                # distributed version was specifically requested, so this is an error state
                logger.error("failed to initialize distributed LSI (%s)", err)
                raise RuntimeError("failed to initialize distributed LSI (%s)" % err)

        if corpus is not None:
            self.add_documents(corpus)
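
As a quick illustration of the `corpus`/`id2word` requirement above, a minimal serial-mode sketch on toy data (the texts below are made up for the example):

from gensim.corpora import Dictionary
from gensim.models import LsiModel

texts = [["human", "interface", "computer"],
         ["survey", "user", "computer", "system"],
         ["graph", "trees", "minors"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# at least one of corpus/id2word is required; here both are supplied
lsi = LsiModel(corpus=corpus, id2word=dictionary, num_topics=2)
print(lsi.print_topics(num_topics=2))
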
    def __init__(self, vw_path, corpus=None, num_topics=100, id2word=None,
                 chunksize=256, passes=1, alpha=0.1, eta=0.1, decay=0.5,
                 offset=1, gamma_threshold=0.001, random_seed=None,
                 cleanup_files=True, tmp_prefix='tmp'):
        """

        Parameters
        ----------
        vw_path : str
            Path to Vowpal Wabbit's binary.
        corpus : iterable of list of (int, int), optional
            Collection of texts in BoW format. If given, training will start immediately;
            otherwise, call :meth:`~gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit.train` or
            :meth:`~gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit.update` manually to train the model.
        num_topics : int, optional
            Number of requested latent topics to be extracted from the training corpus.
            Corresponds to VW's ``--lda <num_topics>`` argument.
        id2word : :class:`~gensim.corpora.dictionary.Dictionary`, optional
            Mapping from word ids (integers) to words (strings).
        chunksize : int, optional
            Number of documents examined in each batch.
            Corresponds to VW's ``--minibatch <batch_size>`` argument.
        passes : int, optional
            Number of passes over the dataset to use.
            Corresponds to VW's ``--passes <passes>`` argument.
        alpha : float, optional
            Float affecting sparsity of per-document topic weights.
            This is applied symmetrically, and should be set higher when documents are thought to look more similar.
            Corresponds to VW's ``--lda_alpha <alpha>`` argument.
        eta : float, optional
            Affects the sparsity of topic distributions.
            This is applied symmetrically, and should be set higher when topics
            are thought to look more similar.
            Corresponds to VW's ``--lda_rho <rho>`` argument.
        decay : float, optional
            Learning rate decay, affects how quickly learnt values are forgotten.
            Should be set to a value between 0.5 and 1.0 to guarantee convergence.
            Corresponds to VW's ``--power_t <tau>`` argument.
        offset : int, optional
            Learning offset, set to higher values to slow down learning on early iterations of the algorithm.
            Corresponds to VW's ``--initial_t <tau>`` argument.
        gamma_threshold : float, optional
            Affects when the learning loop will be broken out of; higher values will result in earlier loop completion.
            Corresponds to VW's ``--epsilon <eps>`` argument.
        random_seed : int, optional
            Sets random seed when learning.
            Corresponds to VW's ``--random_seed <seed>`` argument.
        cleanup_files : bool, optional
            Whether or not to delete the temporary directory and files used by this wrapper.
            Setting to False can be useful for debugging, or for re-using Vowpal Wabbit files elsewhere.
        tmp_prefix : str, optional
            Prefix for the temporary working directory name.

        """
        # default parameters are taken from Vowpal Wabbit's defaults, and
        # parameter names changed to match Gensim's LdaModel where possible
        self.vw_path = vw_path
        self.id2word = id2word

        if self.id2word is None:
            if corpus is None:
                raise ValueError(
                    "at least one of corpus/id2word must be specified, to establish input space dimensionality"
                )
            logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
            self.id2word = utils.dict_from_corpus(corpus)
            self.num_terms = len(self.id2word)
        elif len(self.id2word) > 0:
            self.num_terms = 1 + max(self.id2word.keys())
        else:
            self.num_terms = 0

        if self.num_terms == 0:
            raise ValueError("cannot compute LDA over an empty collection (no terms)")

        # LDA parameters
        self.num_topics = num_topics
        self.chunksize = chunksize
        self.passes = passes
        self.alpha = alpha
        self.eta = eta
        self.gamma_threshold = gamma_threshold
        self.offset = offset
        self.decay = decay
        self.random_seed = random_seed
        self._initial_offset = offset

        # temporary files used for Vowpal Wabbit input/output
        self.tmp_dir = None
        self.tmp_prefix = tmp_prefix
        self.cleanup_files = cleanup_files
        self._init_temp_dir(tmp_prefix)

        # used for saving/loading this model's state
        self._model_data = None
        self._topics_data = None

        # cache loaded topics as numpy array
        self._topics = None

        if corpus is not None:
            self.train(corpus)
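
A minimal usage sketch, assuming a gensim release that still ships the `gensim.models.wrappers.ldavowpalwabbit` module and a Vowpal Wabbit binary installed at the placeholder path below:

from gensim.corpora import Dictionary
from gensim.models.wrappers import LdaVowpalWabbit  # wrapper module; removed in gensim 4.x

texts = [["cat", "dog", "fish"], ["dog", "bird"], ["fish", "bird", "cat"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

lda = LdaVowpalWabbit(
    "/usr/local/bin/vw",   # placeholder path to the Vowpal Wabbit binary
    corpus=corpus,
    num_topics=2,
    id2word=dictionary,
    passes=2,
)
print(lda.print_topics())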