def save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False):
    """Save a corpus in the UCI Bag-of-Words format.

    Two files are written: `fname` (the corpus) and `fname.vocab` (the
    vocabulary, one entry per line for ids ``0 .. num_terms - 1``).

    This function is automatically called by `UciCorpus.serialize`; don't
    call it directly, call `serialize` instead.

    Parameters
    ----------
    fname : str
        Path of the corpus file; the vocabulary goes to ``fname + '.vocab'``.
    corpus : iterable
        Corpus passed unchanged to `UciWriter.write_corpus`.
    id2word : mapping, optional
        Mapping from word id to word. When None, it is built from `corpus`
        with `utils.dict_from_corpus`.
    progress_cnt : int, optional
        Forwarded to `UciWriter.write_corpus`.
    metadata : bool, optional
        Not used by this function.

    Returns
    -------
    object
        Whatever `UciWriter.write_corpus` returns.
    """
    if id2word is None:
        logger.info("no word id mapping provided; initializing from corpus")
        id2word = utils.dict_from_corpus(corpus)
        num_terms = len(id2word)
    else:
        # `keys()` is a view (not a list) on Python 3, so materialise it before
        # concatenating; the -1 sentinel makes an empty mapping yield 0 terms.
        num_terms = 1 + max([-1] + list(id2word.keys()))

    # write out vocabulary
    fname_vocab = fname + '.vocab'
    logger.info("saving vocabulary of %i words to %s", num_terms, fname_vocab)
    with utils.smart_open(fname_vocab, 'wb') as fout:
        for featureid in range(num_terms):
            # ids missing from the mapping are written as the '---' placeholder
            fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

    logger.info("storing corpus in UCI Bag-of-Words format: %s", fname)
    return UciWriter.write_corpus(fname, corpus, index=True, progress_cnt=progress_cnt)
def save_corpus(fname, corpus, id2word=None):
    """Save a corpus in the List-of-words format.

    The first line holds ``len(corpus)``; each following line is one document,
    with every word repeated as many times as its integer-truncated weight.

    This function is automatically called by `LowCorpus.serialize`; don't
    call it directly, call `serialize` instead.

    Parameters
    ----------
    fname : str
        Path of the output file.
    corpus : sized iterable of iterable of (int, number)
        Documents as (word id, weight) pairs; must support ``len()``.
    id2word : mapping, optional
        Mapping from word id to word. When None, it is built from `corpus`
        with `utils.dict_from_corpus`.

    Returns
    -------
    list of int
        Byte offset of the start of each document line in `fname`.
    """
    if id2word is None:
        logger.info("no word id mapping provided; initializing from corpus")
        id2word = utils.dict_from_corpus(corpus)

    logger.info("storing corpus in List-Of-Words format: %s", fname)
    truncated = 0
    offsets = []
    # Binary mode with an explicit UTF-8 encoding: the output no longer depends
    # on the platform's default encoding/newline handling, and `tell()` is a
    # plain byte offset.
    with open(fname, 'wb') as fout:
        fout.write(('%i\n' % len(corpus)).encode('utf-8'))
        for doc in corpus:
            words = []
            for wordid, value in doc:
                if abs(int(value) - value) > 1e-6:
                    truncated += 1
                words.extend([str(id2word[wordid])] * int(value))
            offsets.append(fout.tell())
            fout.write(('%s\n' % ' '.join(words)).encode('utf-8'))

    if truncated:
        logger.warning(
            "List-of-words format can only save vectors with "
            "integer elements; %i float entries were truncated to integer value",
            truncated,
        )
    return offsets
def save_corpus(fname, corpus, id2word=None, metadata=False):
    """Save a corpus in the List-of-words format.

    The first line holds ``len(corpus)``; each following line is one document,
    with every word repeated as many times as its integer-truncated weight.

    This function is automatically called by `LowCorpus.serialize`; don't
    call it directly, call `serialize` instead.

    Parameters
    ----------
    fname : str
        Path of the output file.
    corpus : sized iterable of iterable of (int, number)
        Documents as (word id, weight) pairs; must support ``len()``.
    id2word : mapping, optional
        Mapping from word id to word. When None, it is built from `corpus`
        with `utils.dict_from_corpus`.
    metadata : bool, optional
        Not used by this function.

    Returns
    -------
    list of int
        Value of ``fout.tell()`` taken just before each document is written.
    """
    if id2word is None:
        logger.info("no word id mapping provided; initializing from corpus")
        id2word = utils.dict_from_corpus(corpus)

    # lazy %-style arguments, matching the warning call at the end
    logger.info("storing corpus in List-Of-Words format into %s", fname)
    truncated = 0
    offsets = []
    with utils.smart_open(fname, 'wb') as fout:
        fout.write(utils.to_utf8('%i\n' % len(corpus)))
        for doc in corpus:
            words = []
            for wordid, value in doc:
                # count weights that lose information when cast to int
                if abs(int(value) - value) > 1e-6:
                    truncated += 1
                words.extend([utils.to_unicode(id2word[wordid])] * int(value))
            offsets.append(fout.tell())
            fout.write(utils.to_utf8('%s\n' % ' '.join(words)))

    if truncated:
        logger.warning(
            "List-of-words format can only save vectors with integer elements; "
            "%i float entries were truncated to integer value", truncated)
    return offsets
def initialize(self, corpus):
    """Build the random projection matrix and store it in `self.projection`.

    Sets `self.num_terms` (and `self.id2word` when it was None) as a side
    effect.

    Parameters
    ----------
    corpus : iterable of iterable of (int, int)
        Input corpus; only consulted when `self.id2word` is None.

    """
    mapping = self.id2word
    if mapping is None:
        logger.info("no word id mapping provided; initializing from corpus, assuming identity")
        mapping = self.id2word = utils.dict_from_corpus(corpus)
        term_count = len(mapping)
    else:
        # an empty mapping means zero terms; otherwise ids run up to max(mapping)
        term_count = 1 + max(mapping) if mapping else 0
    self.num_terms = term_count

    dims = (self.num_topics, term_count)
    logger.info("constructing %s random matrix", str(dims))

    # Projection form taken from "Achlioptas: Database-friendly random projection",
    # scenario (1) of Theorem 1.1: every entry is +1 or -1.
    coin_flips = np.random.binomial(1, 0.5, dims)
    signs = 1 - 2 * coin_flips  # map 0/1 draws onto +1/-1

    # stored as Fortran-ordered float32, for faster multiplications
    self.projection = np.asfortranarray(signs, dtype=np.float32)
def __init__(self, mallet_path, corpus=None, covariates=None, num_topics=100, id2word=None, workers=4,
             prefix=None, optimize_interval=0, iterations=1000):
    """Set up the wrapper and, when both `corpus` and `covariates` are given, train it.

    `mallet_path` is path to the mallet executable, e.g. `/home/kofola/mallet-2.0.7/bin/mallet`.

    `corpus` is a gensim corpus, aka a stream of sparse document vectors.

    `covariates` is a numpy array of covariates to the corpus.

    `num_topics` is stored on the instance as the number of topics.

    `id2word` is a mapping between tokens ids and token; when None it is built
    from `corpus` with `utils.dict_from_corpus`.

    `workers` is the number of threads, for parallel training.

    `prefix` is the string prefix under which all data files will be stored;
    default: system temp + random filename prefix.

    `optimize_interval` optimize hyperparameters every N iterations (sometimes leads to Java exception;
    0 to switch off hyperparameter optimization).

    `iterations` is the number of sampling iterations.

    Raises `ValueError` when the resulting vocabulary is empty.

    """
    self.mallet_path = mallet_path
    self.id2word = id2word
    if self.id2word is None:
        logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
        self.id2word = utils.dict_from_corpus(corpus)
        self.num_terms = len(self.id2word)
    else:
        self.num_terms = 0 if not self.id2word else 1 + max(self.id2word.keys())
    if self.num_terms == 0:
        raise ValueError("cannot compute DMR over an empty collection (no terms)")
    self.num_topics = num_topics
    if prefix is None:
        rand_prefix = hex(random.randint(0, 0xffffff))[2:] + '_'
        prefix = os.path.join(tempfile.gettempdir(), rand_prefix)
    self.prefix = prefix
    self.workers = workers
    self.optimize_interval = optimize_interval
    self.iterations = iterations
    # Use logical `and`: the former bitwise `&` bound tighter than `is not`,
    # so `None & covariates` was evaluated first and raised TypeError.
    # NOTE(review): `covariates` is neither stored nor passed on to `train`
    # here -- confirm how `train` is expected to obtain it.
    if corpus is not None and covariates is not None:
        self.train(corpus)
def save_corpus(fname, corpus, id2word=None, metadata=False):
    """Save a corpus in the LDA-C format.

    Two files are written: `fname` (the corpus) and `fname.vocab` (the
    vocabulary, one entry per line for ids ``0 .. num_terms - 1``).

    This function is automatically called by `BleiCorpus.serialize`; don't
    call it directly, call `serialize` instead.

    Parameters
    ----------
    fname : str
        Path of the corpus file; the vocabulary goes to ``fname + '.vocab'``.
    corpus : iterable of iterable of (int, number)
        Documents as (word id, weight) pairs.
    id2word : mapping, optional
        Mapping from word id to word. When None, it is built from `corpus`
        with `utils.dict_from_corpus`.
    metadata : bool, optional
        Not used by this function.

    Returns
    -------
    list of int
        Value of ``fout.tell()`` taken just before each document is written.
    """
    if id2word is None:
        logger.info("no word id mapping provided; initializing from corpus")
        id2word = utils.dict_from_corpus(corpus)
        num_terms = len(id2word)
    else:
        # `keys()` is a view (not a list) on Python 3, so materialise it before
        # concatenating; the -1 sentinel makes an empty mapping yield 0 terms.
        num_terms = 1 + max([-1] + list(id2word.keys()))

    logger.info("storing corpus in Blei's LDA-C format into %s", fname)
    with utils.smart_open(fname, 'wb') as fout:
        offsets = []
        for doc in corpus:
            offsets.append(fout.tell())
            parts = ["%i:%s" % (wordid, weight) for wordid, weight in doc if abs(weight) > 1e-7]
            # The leading count must match the number of pairs actually written;
            # near-zero weights are dropped above, so count `parts`, not `doc`.
            fout.write(utils.to_utf8("%i %s\n" % (len(parts), ' '.join(parts))))

    # write out vocabulary, in a format compatible with Blei's topics.py script
    fname_vocab = fname + '.vocab'
    logger.info("saving vocabulary of %i words to %s", num_terms, fname_vocab)
    with utils.smart_open(fname_vocab, 'wb') as fout:
        for featureid in range(num_terms):
            # ids missing from the mapping are written as the '---' placeholder
            fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

    return offsets
def initialize(self, corpus):
    """Create `self.projection`, a random sign matrix of shape (num_topics, num_terms).

    Also sets `self.num_terms`, and fills in `self.id2word` from `corpus` when
    no mapping was provided.

    Parameters
    ----------
    corpus : iterable of iterable of (int, int)
        Input corpus; only read when `self.id2word` is None.

    """
    if self.id2word is None:
        logger.info(
            "no word id mapping provided; initializing from corpus, assuming identity"
        )
        self.id2word = utils.dict_from_corpus(corpus)
        self.num_terms = len(self.id2word)
    else:
        # -1 stands in for "no ids at all", which gives zero terms below
        highest_id = max(self.id2word) if self.id2word else -1
        self.num_terms = 1 + highest_id

    n_rows, n_cols = self.num_topics, self.num_terms
    logger.info("constructing %s random matrix", str((n_rows, n_cols)))

    # Matrix form from "Achlioptas: Database-friendly random projection"
    # (scenario (1) of Theorem 1.1): all entries are +1/-1.
    bits = np.random.binomial(1, 0.5, (n_rows, n_cols))

    # 0/1 -> +1/-1, then int -> float32 (Fortran order) for faster multiplications
    self.projection = np.asfortranarray(1 - 2 * bits, dtype=np.float32)
def __init__(self, corpus=None, num_topics=100, id2word=None, passes=1, threshold=0.001,
             iterations=10, alpha=None, eta=None, offset=1.0, decay=0.5, eval_every=1,
             random_state=None):
    """Store the model settings and, when a corpus is given, run inference on it.

    Parameters
    ----------
    corpus : sized iterable, optional
        Training corpus. When given, `self.inference(corpus)` is called at the
        end of construction.
    num_topics : int, optional
        Number of topics; also used for the default `alpha` and `eta`.
    id2word : mapping, optional
        Mapping from word id to word. When None, it is built from `corpus`
        with `utils.dict_from_corpus`.
    passes, threshold, iterations, offset, decay, eval_every : optional
        Stored unchanged on the instance under the same names.
    alpha : float, optional
        Defaults to ``1.0 / num_topics`` when None.
    eta : float, optional
        Defaults to ``1.0 / num_topics`` when None.
    random_state : optional
        Passed through `get_random_state`; the result is stored as
        `self.random_state`.

    Raises
    ------
    ValueError
        If both `corpus` and `id2word` are None, or the vocabulary is empty.

    """
    if alpha is None:
        alpha = 1.0 / num_topics
    if eta is None:
        eta = 1.0 / num_topics

    self.id2word = id2word
    if corpus is None and self.id2word is None:
        raise ValueError(
            'at least one of corpus/id2word must be specified, to establish input space dimensionality'
        )

    if self.id2word is None:
        logger.warning(
            "no word id mapping provided; initializing from corpus, assuming identity"
        )
        self.id2word = utils.dict_from_corpus(corpus)
        self.num_terms = len(self.id2word)
    elif len(self.id2word) > 0:
        self.num_terms = 1 + max(self.id2word.keys())
    else:
        self.num_terms = 0

    if self.num_terms == 0:
        raise ValueError(
            "cannot compute LDA over an empty collection (no terms)")

    logger.info('Vocabulary consists of %d words.', self.num_terms)

    self.corpus = corpus
    self.iterations = iterations
    self.passes = passes
    self.num_topics = num_topics
    self.threshold = threshold
    self.alpha = alpha
    self.eta = eta
    self.offset = offset
    self.decay = decay
    # `corpus` may legitimately be None (only `id2word` given); calling len()
    # on it unconditionally raised TypeError in that supported case.
    self.num_docs = len(corpus) if corpus is not None else 0
    self.eval_every = eval_every
    self.random_state = get_random_state(random_state)

    if corpus is not None:
        self.inference(corpus)
def __init__(self, mallet_path, corpus=None, num_topics=100, alpha=50, id2word=None,
             workers=4, prefix=None, optimize_interval=0, iterations=1000,
             topic_threshold=0.0):
    """Record the wrapper settings and train straight away when `corpus` is given.

    `mallet_path` is path to the mallet executable, e.g. `/home/kofola/mallet-2.0.7/bin/mallet`.

    `corpus` is a gensim corpus, aka a stream of sparse document vectors.

    `num_topics` and `alpha` are stored on the instance unchanged.

    `id2word` is a mapping between tokens ids and token; when None it is built
    from `corpus`.

    `workers` is the number of threads, for parallel training.

    `prefix` is the string prefix under which all data files will be stored;
    default: system temp + random filename prefix.

    `optimize_interval` optimize hyperparameters every N iterations (sometimes leads to Java exception;
    0 to switch off hyperparameter optimization).

    `iterations` is the number of sampling iterations.

    `topic_threshold` is the threshold of the probability above which we consider a topic.
    This is basically for sparse topic distribution.

    A `ValueError` is raised when the vocabulary turns out to be empty.

    """
    self.mallet_path = mallet_path
    self.id2word = id2word

    if self.id2word is None:
        logger.warning(
            "no word id mapping provided; initializing from corpus, assuming identity"
        )
        self.id2word = utils.dict_from_corpus(corpus)
        self.num_terms = len(self.id2word)
    elif self.id2word:
        self.num_terms = 1 + max(self.id2word.keys())
    else:
        self.num_terms = 0

    if self.num_terms == 0:
        raise ValueError(
            "cannot compute LDA over an empty collection (no terms)")

    self.num_topics = num_topics
    self.topic_threshold = topic_threshold
    self.alpha = alpha

    if prefix is None:
        # up to six random hex digits plus '_' inside the system temp directory
        random_tag = hex(random.randint(0, 0xffffff))[2:] + '_'
        self.prefix = os.path.join(tempfile.gettempdir(), random_tag)
    else:
        self.prefix = prefix

    self.workers = workers
    self.optimize_interval = optimize_interval
    self.iterations = iterations

    if corpus is None:
        return
    self.train(corpus)
def save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False):
    """Write `corpus` in the UCI Bag-of-Words format, plus its vocabulary file.

    Warnings
    --------
    This function is automatically called by :meth:`gensim.corpora.ucicorpus.UciCorpus.serialize`,
    don't call it directly, call :meth:`gensim.corpora.ucicorpus.UciCorpus.serialize` instead.

    Parameters
    ----------
    fname : str
        Path to output file.
    corpus: iterable of iterable of (int, int)
        Corpus in BoW format.
    id2word : {dict of (int, str), :class:`gensim.corpora.dictionary.Dictionary`}, optional
        Mapping between words and their ids. If None - will be inferred from `corpus`.
    progress_cnt : int, optional
        Progress counter, forwarded to `UciWriter.write_corpus`.
    metadata : bool, optional
        THIS PARAMETER WILL BE IGNORED.

    Notes
    -----
    Two files are produced: `fname` itself and a vocabulary file whose path is
    obtained from ``utils.smart_extension(fname, '.vocab')``.

    """
    if id2word is None:
        logger.info(
            "no word id mapping provided; initializing from corpus")
        id2word = utils.dict_from_corpus(corpus)
        vocab_size = len(id2word)
    else:
        vocab_size = 1 + max(id2word) if id2word else 0

    # vocabulary first: one line per id, '---' where the mapping has no entry
    vocab_path = utils.smart_extension(fname, '.vocab')
    logger.info("saving vocabulary of %i words to %s", vocab_size, vocab_path)
    with utils.smart_open(vocab_path, 'wb') as vocab_file:
        for term_id in range(vocab_size):
            vocab_line = "%s\n" % id2word.get(term_id, '---')
            vocab_file.write(utils.to_utf8(vocab_line))

    logger.info("storing corpus in UCI Bag-of-Words format: %s", fname)

    result = UciWriter.write_corpus(fname, corpus, index=True, progress_cnt=progress_cnt)
    return result
def __init__(self, mallet_path, corpus=None, num_topics=100, alpha=50, id2word=None,
             workers=4, prefix=None, optimize_interval=0, iterations=1000,
             topic_threshold=0.0, random_seed=0):
    """Record the wrapper settings; train immediately if `corpus` is supplied.

    Parameters
    ----------
    mallet_path : str
        Path to the mallet binary, e.g. `/home/username/mallet-2.0.7/bin/mallet`.
    corpus : iterable of iterable of (int, int), optional
        Collection of texts in BoW format.
    num_topics : int, optional
        Number of topics.
    alpha : int, optional
        Alpha parameter of LDA.
    id2word : :class:`~gensim.corpora.dictionary.Dictionary`, optional
        Mapping between tokens ids and words from corpus, if not specified - will be inferred from `corpus`.
    workers : int, optional
        Number of threads that will be used for training.
    prefix : str, optional
        Prefix for produced temporary files; when None, a random prefix inside
        the system temp directory is generated.
    optimize_interval : int, optional
        Optimize hyperparameters every `optimize_interval` iterations
        (sometimes leads to Java exception; 0 to switch off hyperparameter optimization).
    iterations : int, optional
        Number of training iterations.
    topic_threshold : float, optional
        Threshold of the probability above which we consider a topic.
    random_seed: int, optional
        Random seed to ensure consistent results, if 0 - use system clock.

    Raises
    ------
    ValueError
        If the vocabulary is empty.

    """
    self.mallet_path = mallet_path
    self.id2word = id2word

    if self.id2word is None:
        logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
        self.id2word = utils.dict_from_corpus(corpus)
        self.num_terms = len(self.id2word)
    elif not self.id2word:
        self.num_terms = 0
    else:
        self.num_terms = 1 + max(self.id2word.keys())

    if self.num_terms == 0:
        raise ValueError("cannot compute LDA over an empty collection (no terms)")

    self.num_topics = num_topics
    self.topic_threshold = topic_threshold
    self.alpha = alpha

    if prefix is None:
        # random hex tag followed by '_' under the system temp directory
        hex_tag = hex(random.randint(0, 0xffffff))[2:]
        prefix = os.path.join(tempfile.gettempdir(), hex_tag + '_')
    self.prefix = prefix

    self.workers = workers
    self.optimize_interval = optimize_interval
    self.iterations = iterations
    self.random_seed = random_seed

    train_now = corpus is not None
    if train_now:
        self.train(corpus)
def save_corpus(fname, corpus, id2word=None, metadata=False):
    """Save a corpus in the LDA-C format.

    Notes
    -----
    There are actually two files saved: `fname` and a vocabulary file whose
    path comes from ``utils.smart_extension(fname, '.vocab')``.

    Parameters
    ----------
    fname : str
        Path to output file.
    corpus : iterable of iterable of (int, float)
        Input corpus in BoW format.
    id2word : dict of (int, str), optional
        Mapping id -> word for `corpus`. If None, it is built from `corpus`.
    metadata : bool, optional
        THIS PARAMETER WILL BE IGNORED.

    Returns
    -------
    list of int
        Offsets for each line in file (in bytes).

    """
    if id2word is None:
        logger.info(
            "no word id mapping provided; initializing from corpus")
        id2word = utils.dict_from_corpus(corpus)
        num_terms = len(id2word)
    elif id2word:
        num_terms = 1 + max(id2word)
    else:
        num_terms = 0

    logger.info("storing corpus in Blei's LDA-C format into %s", fname)
    with utils.open(fname, 'wb') as fout:
        offsets = []
        for doc in corpus:
            offsets.append(fout.tell())
            parts = ["%i:%g" % (wordid, weight) for wordid, weight in doc if abs(weight) > 1e-7]
            # The leading count must match the number of pairs actually written;
            # near-zero weights are dropped above, so count `parts`, not `doc`.
            fout.write(
                utils.to_utf8("%i %s\n" % (len(parts), ' '.join(parts))))

    # write out vocabulary, in a format compatible with Blei's topics.py script
    fname_vocab = utils.smart_extension(fname, '.vocab')
    logger.info("saving vocabulary of %i words to %s", num_terms, fname_vocab)
    with utils.open(fname_vocab, 'wb') as fout:
        for featureid in range(num_terms):
            # ids missing from the mapping are written as the '---' placeholder
            fout.write(
                utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

    return offsets
def save_corpus(fname, corpus, id2word=None, metadata=False):
    """Save a corpus in the GibbsLda++ format.

    Warnings
    --------
    This function is automatically called by :meth:`gensim.corpora.lowcorpus.LowCorpus.serialize`,
    don't call it directly, call :meth:`gensim.corpora.lowcorpus.LowCorpus.serialize` instead.

    Parameters
    ----------
    fname : str
        Path to output file.
    corpus : iterable of iterable of (int, int)
        Corpus in BoW format; must support ``len()``.
    id2word : {dict of (int, str), :class:`~gensim.corpora.dictionary.Dictionary`}, optional
        Mapping between word_ids (integers) and words (strings).
        If not provided, the mapping is constructed directly from `corpus`.
    metadata : bool, optional
        THIS PARAMETER WILL BE IGNORED.

    Returns
    -------
    list of int
        List of offsets in resulting file for each document (in bytes),
        can be used for :meth:`~gensim.corpora.lowcorpus.LowCorpus.docbyoffset`

    """
    if id2word is None:
        logger.info(
            "no word id mapping provided; initializing from corpus")
        id2word = utils.dict_from_corpus(corpus)

    # lazy %-style arguments, matching the warning call at the end
    logger.info("storing corpus in List-Of-Words format into %s", fname)
    truncated = 0
    offsets = []
    with utils.smart_open(fname, 'wb') as fout:
        # header line: number of documents
        fout.write(utils.to_utf8('%i\n' % len(corpus)))

        for doc in corpus:
            words = []
            for wordid, value in doc:
                # count weights that lose information when cast to int
                if abs(int(value) - value) > 1e-6:
                    truncated += 1
                words.extend([utils.to_unicode(id2word[wordid])] * int(value))
            offsets.append(fout.tell())
            fout.write(utils.to_utf8('%s\n' % ' '.join(words)))

    if truncated:
        logger.warning(
            "List-of-words format can only save vectors with integer elements; "
            "%i float entries were truncated to integer value", truncated)
    return offsets
def save_corpus(fname, corpus, id2word=None, metadata=False):
    """Save a corpus in the GibbsLda++ format.

    Warnings
    --------
    This function is automatically called by :meth:`gensim.corpora.lowcorpus.LowCorpus.serialize`,
    don't call it directly, call :meth:`gensim.corpora.lowcorpus.LowCorpus.serialize` instead.

    Parameters
    ----------
    fname : str
        Path to output file.
    corpus : iterable of iterable of (int, int)
        Corpus in BoW format; must support ``len()``.
    id2word : {dict of (int, str), :class:`~gensim.corpora.dictionary.Dictionary`}, optional
        Mapping between word_ids (integers) and words (strings).
        If not provided, the mapping is constructed directly from `corpus`.
    metadata : bool, optional
        THIS PARAMETER WILL BE IGNORED.

    Returns
    -------
    list of int
        List of offsets in resulting file for each document (in bytes),
        can be used for :meth:`~gensim.corpora.lowcorpus.LowCorpus.docbyoffset`

    """
    if id2word is None:
        logger.info("no word id mapping provided; initializing from corpus")
        id2word = utils.dict_from_corpus(corpus)

    # Pass `fname` as a logging argument instead of pre-formatting the message,
    # as the warning below already does.
    logger.info("storing corpus in List-Of-Words format into %s", fname)
    truncated = 0
    offsets = []
    with utils.smart_open(fname, 'wb') as fout:
        # header line: number of documents
        fout.write(utils.to_utf8('%i\n' % len(corpus)))

        for doc in corpus:
            words = []
            for wordid, value in doc:
                # non-integer weights are truncated by int() below; count them
                if abs(int(value) - value) > 1e-6:
                    truncated += 1
                words.extend([utils.to_unicode(id2word[wordid])] * int(value))
            offsets.append(fout.tell())
            fout.write(utils.to_utf8('%s\n' % ' '.join(words)))

    if truncated:
        logger.warning(
            "List-of-words format can only save vectors with integer elements; "
            "%i float entries were truncated to integer value", truncated
        )
    return offsets
def save_corpus(fname, corpus, id2word=None, metadata=False):
    """Save a corpus in the LDA-C format.

    Notes
    -----
    There are actually two files saved: `fname` and a vocabulary file whose
    path comes from ``utils.smart_extension(fname, '.vocab')``.

    Parameters
    ----------
    fname : str
        Path to output file.
    corpus : iterable of iterable of (int, float)
        Input corpus in BoW format.
    id2word : dict of (int, str), optional
        Mapping id -> word for `corpus`. If None, it is built from `corpus`.
    metadata : bool, optional
        THIS PARAMETER WILL BE IGNORED.

    Returns
    -------
    list of int
        Offsets for each line in file (in bytes).

    """
    if id2word is None:
        logger.info("no word id mapping provided; initializing from corpus")
        id2word = utils.dict_from_corpus(corpus)
        num_terms = len(id2word)
    elif id2word:
        num_terms = 1 + max(id2word)
    else:
        num_terms = 0

    logger.info("storing corpus in Blei's LDA-C format into %s", fname)
    with utils.smart_open(fname, 'wb') as fout:
        offsets = []
        for doc in corpus:
            offsets.append(fout.tell())
            parts = ["%i:%g" % (wordid, weight) for wordid, weight in doc if abs(weight) > 1e-7]
            # Entries with near-zero weight are skipped, so the leading count
            # has to be the number of pairs written, not the raw document size.
            fout.write(utils.to_utf8("%i %s\n" % (len(parts), ' '.join(parts))))

    # write out vocabulary, in a format compatible with Blei's topics.py script
    fname_vocab = utils.smart_extension(fname, '.vocab')
    logger.info("saving vocabulary of %i words to %s", num_terms, fname_vocab)
    with utils.smart_open(fname_vocab, 'wb') as fout:
        for featureid in range(num_terms):
            # ids missing from the mapping are written as the '---' placeholder
            fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

    return offsets
def save_corpus(fname, corpus, id2word=None, metadata=False):
    """Save a corpus in the Mallet format.

    Each output line is ``<doc_id> <doc_lang> <words...>``, with every word
    repeated as many times as its integer-truncated weight.

    The document id will be generated by enumerating the corpus.
    That is, it will range between 0 and number of documents in the corpus.

    Since Mallet has a language field in the format, this defaults to the string '__unknown__'.
    If the language needs to be saved, post-processing will be required.

    When `metadata` is true, each corpus item is instead read as
    ``(document, (doc_id, doc_lang))`` and those values are written.

    This function is automatically called by `MalletCorpus.serialize`; don't
    call it directly, call `serialize` instead.

    Returns the list of ``fout.tell()`` positions taken just before each
    document is written.

    """
    if id2word is None:
        logger.info(
            "no word id mapping provided; initializing from corpus")
        id2word = utils.dict_from_corpus(corpus)

    # logging arguments are passed lazily rather than pre-formatted with `%`
    logger.info("storing corpus in Mallet format into %s", fname)

    truncated = 0
    offsets = []
    with utils.smart_open(fname, 'wb') as fout:
        for doc_id, doc in enumerate(corpus):
            if metadata:
                doc_id, doc_lang = doc[1]
                doc = doc[0]
            else:
                doc_lang = '__unknown__'

            words = []
            for wordid, value in doc:
                # count weights that lose information when cast to int
                if abs(int(value) - value) > 1e-6:
                    truncated += 1
                words.extend([utils.to_unicode(id2word[wordid])] * int(value))

            offsets.append(fout.tell())
            fout.write(
                utils.to_utf8('%s %s %s\n' % (doc_id, doc_lang, ' '.join(words))))

    if truncated:
        logger.warning(
            "Mallet format can only save vectors with "
            "integer elements; %i float entries were truncated to integer value",
            truncated)

    return offsets
def save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False):
    """Serialize `corpus` to disk in the UCI Bag-of-Words format.

    Warnings
    --------
    This function is automatically called by :meth:`gensim.corpora.ucicorpus.UciCorpus.serialize`,
    don't call it directly, call :meth:`gensim.corpora.ucicorpus.UciCorpus.serialize` instead.

    Parameters
    ----------
    fname : str
        Path to output file.
    corpus: iterable of iterable of (int, int)
        Corpus in BoW format.
    id2word : {dict of (int, str), :class:`gensim.corpora.dictionary.Dictionary`}, optional
        Mapping between words and their ids. If None - will be inferred from `corpus`.
    progress_cnt : int, optional
        Progress counter, forwarded to `UciWriter.write_corpus`.
    metadata : bool, optional
        THIS PARAMETER WILL BE IGNORED.

    Notes
    -----
    The vocabulary is written to a second file, whose path is
    ``utils.smart_extension(fname, '.vocab')``.

    """
    if id2word is None:
        logger.info("no word id mapping provided; initializing from corpus")
        id2word = utils.dict_from_corpus(corpus)
        num_terms = len(id2word)
    elif not id2word:
        num_terms = 0
    else:
        num_terms = 1 + max(id2word)

    # write out vocabulary
    fname_vocab = utils.smart_extension(fname, '.vocab')
    logger.info("saving vocabulary of %i words to %s", num_terms, fname_vocab)
    with utils.smart_open(fname_vocab, 'wb') as fout:
        featureid = 0
        while featureid < num_terms:
            # ids absent from the mapping are written as '---'
            entry = id2word.get(featureid, '---')
            fout.write(utils.to_utf8("%s\n" % entry))
            featureid += 1

    logger.info("storing corpus in UCI Bag-of-Words format: %s", fname)

    return UciWriter.write_corpus(
        fname, corpus, index=True, progress_cnt=progress_cnt
    )
def save_corpus(fname, corpus, id2word=None, metadata=False):
    """Write `corpus` to `fname` in the Mallet format.

    Every line has the form ``<doc_id> <doc_lang> <words...>``.

    Without `metadata`, the document id is the position of the document in the
    corpus (starting at 0) and the language field is the string '__unknown__';
    if the language needs to be saved, post-processing will be required.
    With `metadata`, every corpus item is unpacked as
    ``(document, (doc_id, doc_lang))``.

    This function is automatically called by `MalletCorpus.serialize`; don't
    call it directly, call `serialize` instead.

    Returns the list of ``tell()`` positions recorded before each document.

    """
    if id2word is None:
        logger.info("no word id mapping provided; initializing from corpus")
        id2word = utils.dict_from_corpus(corpus)

    logger.info("storing corpus in Mallet format into %s", fname)

    n_truncated = 0
    positions = []
    with utils.smart_open(fname, 'wb') as out:
        for position, item in enumerate(corpus):
            if metadata:
                doc_id, doc_lang = item[1]
                bow = item[0]
            else:
                doc_id, doc_lang = position, '__unknown__'
                bow = item

            tokens = []
            for wordid, value in bow:
                repeats = int(value)
                # weights that are not whole numbers get truncated; count them
                if abs(repeats - value) > 1e-6:
                    n_truncated += 1
                tokens += [utils.to_unicode(id2word[wordid])] * repeats

            positions.append(out.tell())
            line = '%s %s %s\n' % (doc_id, doc_lang, ' '.join(tokens))
            out.write(utils.to_utf8(line))

    if n_truncated:
        logger.warning(
            "Mallet format can only save vectors with integer elements; "
            "%i float entries were truncated to integer value", n_truncated
        )

    return positions
def save_corpus(fname, corpus, id2word=None, metadata=False):
    """Save a corpus in the LDA-C format.

    Two files are written: `fname` (the corpus) and `fname.vocab` (the
    vocabulary, one entry per line for ids ``0 .. num_terms - 1``).

    This function is automatically called by `BleiCorpus.serialize`; don't
    call it directly, call `serialize` instead.

    Parameters
    ----------
    fname : str
        Path of the corpus file; the vocabulary goes to ``fname + '.vocab'``.
    corpus : iterable of iterable of (int, number)
        Documents as (word id, weight) pairs.
    id2word : mapping, optional
        Mapping from word id to word. When None, it is built from `corpus`
        with `utils.dict_from_corpus`.
    metadata : bool, optional
        Not used by this function.

    Returns
    -------
    list of int
        Value of ``fout.tell()`` taken just before each document is written.
    """
    if id2word is None:
        logger.info(
            "no word id mapping provided; initializing from corpus")
        id2word = utils.dict_from_corpus(corpus)
        num_terms = len(id2word)
    else:
        # `keys()` is a view (not a list) on Python 3, so materialise it before
        # concatenating; the -1 sentinel makes an empty mapping yield 0 terms.
        num_terms = 1 + max([-1] + list(id2word.keys()))

    logger.info("storing corpus in Blei's LDA-C format into %s", fname)
    # Binary mode plus explicit UTF-8 encoding, so tell() reports byte offsets.
    with utils.smart_open(fname, 'wb') as fout:
        offsets = []
        for doc in corpus:
            offsets.append(fout.tell())
            parts = ["%i:%s" % (wordid, weight) for wordid, weight in doc if abs(weight) > 1e-12]
            # The leading count must match the number of pairs actually written;
            # near-zero weights are dropped above, so count `parts`, not `doc`.
            fout.write(utils.to_utf8("%i %s\n" % (len(parts), ' '.join(parts))))

    # write out vocabulary, in a format compatible with Blei's topics.py script
    fname_vocab = fname + '.vocab'
    logger.info("saving vocabulary of %i words to %s", num_terms, fname_vocab)
    with open(fname_vocab, 'wb') as fout:
        for featureid in range(num_terms):
            # Encode the complete line: formatting a bytes object into a str
            # template and writing that to a binary file fails on Python 3.
            fout.write(utils.to_utf8("%s\n" % id2word.get(featureid, '---')))

    return offsets
def __init__(self, vw_path, corpus=None, num_topics=100, id2word=None,
             chunksize=256, passes=1, alpha=0.1, eta=0.1, decay=0.5, offset=1,
             gamma_threshold=0.001, random_seed=None, cleanup_files=True,
             tmp_prefix='tmp'):
    """Store the wrapper configuration and train right away if `corpus` is given.

    Parameters
    ----------
    vw_path : str
        Path to Vowpal Wabbit's binary.
    corpus : iterable of list of (int, int), optional
        Collection of texts in BoW format. If given, training will start immediately,
        otherwise, you should call
        :meth:`~gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit.train` or
        :meth:`~gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit.update` manually for training.
    num_topics : int, optional
        Number of requested latent topics to be extracted from the training corpus.
        Corresponds to VW's ``--lda <num_topics>`` argument.
    id2word : :class:`~gensim.corpora.dictionary.Dictionary`, optional
        Mapping from word ids (integers) to words (strings). Built from `corpus` when None.
    chunksize : int, optional
        Number of documents examined in each batch.
        Corresponds to VW's ``--minibatch <batch_size>`` argument.
    passes : int, optional
        Number of passes over the dataset to use.
        Corresponds to VW's ``--passes <passes>`` argument.
    alpha : float, optional
        Float affecting sparsity of per-document topic weights.
        This is applied symmetrically, and should be set higher when documents are thought to look more similar.
        Corresponds to VW's ``--lda_alpha <alpha>`` argument.
    eta : float, optional
        Affects the sparsity of topic distributions.
        This is applied symmetrically, and should be set higher when topics are thought to look more similar.
        Corresponds to VW's ``--lda_rho <rho>`` argument.
    decay : float, optional
        Learning rate decay, affects how quickly learnt values are forgotten.
        Should be set to a value between 0.5 and 1.0 to guarantee convergence.
        Corresponds to VW's ``--power_t <tau>`` argument.
    offset: int, optional
        Learning offset, set to higher values to slow down learning on early iterations of the algorithm.
        Corresponds to VW's ``--initial_t <tau>`` argument.
    gamma_threshold : float, optional
        Affects when learning loop will be broken out of, higher values will result in earlier loop completion.
        Corresponds to VW's ``--epsilon <eps>`` argument.
    random_seed : int, optional
        Sets random seed when learning.
        Corresponds to VW's ``--random_seed <seed>`` argument.
    cleanup_files : bool, optional
        Whether or not to delete temporary directory and files used by this wrapper.
        Setting to False can be useful for debugging, or for re-using Vowpal Wabbit files elsewhere.
    tmp_prefix : str, optional
        To prefix temporary working directory name.

    Raises
    ------
    ValueError
        If both `corpus` and `id2word` are None, or the vocabulary is empty.

    """
    # default parameters are taken from Vowpal Wabbit's defaults, and
    # parameter names changed to match Gensim's LdaModel where possible
    self.vw_path = vw_path
    self.id2word = id2word

    if self.id2word is not None:
        # ids are assumed to run from 0 to the largest key
        self.num_terms = 1 + max(self.id2word.keys()) if len(self.id2word) > 0 else 0
    else:
        if corpus is None:
            raise ValueError(
                "at least one of corpus/id2word must be specified, to establish input space dimensionality"
            )
        logger.warning(
            "no word id mapping provided; initializing from corpus, assuming identity"
        )
        self.id2word = utils.dict_from_corpus(corpus)
        self.num_terms = len(self.id2word)

    if self.num_terms == 0:
        raise ValueError(
            "cannot compute LDA over an empty collection (no terms)")

    # LDA parameters
    self.num_topics = num_topics
    self.chunksize = chunksize
    self.passes = passes
    self.alpha = alpha
    self.eta = eta
    self.gamma_threshold = gamma_threshold
    self.offset = offset
    self.decay = decay
    self.random_seed = random_seed
    self._initial_offset = offset

    # temporary files used for Vowpal Wabbit input/output
    self.tmp_dir = None
    self.tmp_prefix = tmp_prefix
    self.cleanup_files = cleanup_files
    self._init_temp_dir(tmp_prefix)

    # used for saving/loading this model's state
    self._model_data = self._topics_data = None

    # cache loaded topics as numpy array
    self._topics = None

    if corpus is None:
        return
    self.train(corpus)
def __init__(
    self,
    corpus=None,
    num_topics=100,
    id2word=None,
    chunksize=2000,
    passes=1,
    kappa=1.0,
    minimum_probability=0.01,
    w_max_iter=200,
    w_stop_condition=1e-4,
    h_max_iter=50,
    h_stop_condition=1e-3,
    eval_every=10,
    normalize=True,
    random_state=None,
):
    r"""Store the hyperparameters, reset the model state, and train if `corpus` is given.

    Parameters
    ----------
    corpus : iterable of list of (int, float) or `csc_matrix` with the shape (n_tokens, n_documents), optional
        Training corpus.
        Can be either iterable of documents, which are lists of `(word_id, word_count)`,
        or a sparse csc matrix of BOWs for each document.
        If given, `self.update(corpus)` is called at the end of construction;
        otherwise the model is left untrained.
    num_topics : int, optional
        Number of topics to extract.
    id2word: {dict of (int, str), :class:`gensim.corpora.dictionary.Dictionary`}
        Mapping from word IDs to words. It is used to determine the vocabulary size, as well as for
        debugging and topic printing. When None, it is built from `corpus`.
    chunksize: int, optional
        Number of documents to be used in each training chunk.
    passes: int, optional
        Number of full passes over the training corpus.
        Leave at default `passes=1` if your input is an iterator.
    kappa : float, optional
        Gradient descent step size.
        Larger value makes the model train faster, but could lead to non-convergence if set too large.
    minimum_probability:
        If `normalize` is True, topics with smaller probabilities are filtered out.
        If `normalize` is False, topics with smaller factors are filtered out.
        If set to None, a value of 1e-8 is used to prevent 0s.
    w_max_iter: int, optional
        Maximum number of iterations to train W per each batch.
    w_stop_condition: float, optional
        If error difference gets less than that, training of ``W`` stops for the current batch.
    h_max_iter: int, optional
        Maximum number of iterations to train h per each batch.
    h_stop_condition: float
        If error difference gets less than that, training of ``h`` stops for the current batch.
    eval_every: int, optional
        Number of batches after which l2 norm of (v - Wh) is computed. Decreases performance if set too low.
    normalize: bool or None, optional
        Whether to normalize the result. Allows for estimation of perplexity, coherence, etc.
    random_state: {np.random.RandomState, int}, optional
        Seed for random generator. Needed for reproducibility.

    """
    # public settings
    self.num_topics = num_topics
    self.id2word = id2word
    self.chunksize = chunksize
    self.passes = passes

    # optimisation settings
    self._kappa = kappa
    self.minimum_probability = minimum_probability
    self._w_max_iter = w_max_iter
    self._w_stop_condition = w_stop_condition
    self._h_max_iter = h_max_iter
    self._h_stop_condition = h_stop_condition
    self.eval_every = eval_every
    self.normalize = normalize
    self.random_state = utils.get_random_state(random_state)

    self.v_max = None

    # vocabulary: fall back to a mapping derived from the corpus
    if id2word is None:
        self.id2word = utils.dict_from_corpus(corpus)
    self.num_tokens = len(self.id2word)

    # model state, filled in by training
    self.A = self.B = None
    self._W = None
    self.w_std = None
    self._w_error = np.inf
    self._h = None

    if corpus is None:
        return
    self.update(corpus)
def __init__(
    self,
    corpus=None,
    num_topics=100,
    id2word=None,
    chunksize=2000,
    passes=1,
    lambda_=1.0,
    kappa=1.0,
    minimum_probability=0.01,
    use_r=False,
    w_max_iter=200,
    w_stop_condition=1e-4,
    h_r_max_iter=50,
    h_r_stop_condition=1e-3,
    eval_every=10,
    v_max=None,
    normalize=True,
    sparse_coef=3,
    random_state=None,
):
    """Store the hyper-parameters and, if `corpus` is given, train on it.

    Parameters
    ----------
    corpus : iterable of list of (int, float), optional
        Training corpus. If not given, the model is left untrained.
    num_topics : int, optional
        Number of topics to extract.
    id2word : gensim.corpora.Dictionary, optional
        Mapping from token id to token. Built from `corpus` when omitted;
        its length becomes `self.num_tokens`.
    chunksize : int, optional
        Number of documents to be used in each training chunk.
    passes : int, optional
        Number of full passes over the training corpus.
    lambda_ : float, optional
        Residuals regularizer coefficient. Increasing it helps prevent
        overfitting. Has no effect if `use_r` is set to False.
    kappa : float, optional
        Optimizer step coefficient. Increasing it makes the model train
        faster, but adds a risk that it won't converge.
    minimum_probability : float, optional
        Stored as `self.minimum_probability`; it is not used inside this
        constructor.
    use_r : bool, optional
        Stored as `self.use_r`. `lambda_` only has an effect when this is
        True; presumably it toggles the residuals matrix ``r`` (confirm
        against the training code).
    w_max_iter : int, optional
        Maximum number of iterations to train the W matrix per each batch.
    w_stop_condition : float, optional
        If the error difference gets less than that, training of matrix
        ``W`` stops for the current batch.
    h_r_max_iter : int, optional
        Maximum number of iterations to train the h and r matrices per
        each batch.
    h_r_stop_condition : float, optional
        If the error difference gets less than that, training of matrices
        ``h`` and ``r`` stops for the current batch.
    eval_every : int, optional
        Number of batches after which the model will be evaluated.
    v_max : int, optional
        Maximum number of word occurrences in the corpora. Inferred if not
        set. Rarely needs to be set explicitly.
    normalize : bool, optional
        Whether to normalize results. Offers a "kind-of-probabilistic"
        result.
    sparse_coef : float, optional
        The larger it is, the sparser the matrices are. Significantly
        increases performance.
    random_state : {np.random.RandomState, int}, optional
        Seed for the random generator. Useful for reproducibility.

    Raises
    ------
    ValueError
        If neither `corpus` nor `id2word` is given, because the vocabulary
        size cannot be established.

    """
    # Fail early with a clear message instead of crashing inside
    # `utils.dict_from_corpus(None)` further down.
    if corpus is None and id2word is None:
        raise ValueError(
            'at least one of corpus/id2word must be specified, to establish input space dimensionality'
        )

    self._w_error = None
    self.num_tokens = None
    self.num_topics = num_topics
    self.id2word = id2word
    self.chunksize = chunksize
    self.passes = passes
    self._lambda_ = lambda_
    self._kappa = kappa
    self.minimum_probability = minimum_probability
    self.use_r = use_r
    self._w_max_iter = w_max_iter
    self._w_stop_condition = w_stop_condition
    self._h_r_max_iter = h_r_max_iter
    self._h_r_stop_condition = h_r_stop_condition
    self.v_max = v_max
    self.eval_every = eval_every
    self.normalize = normalize
    self.sparse_coef = sparse_coef
    self.random_state = utils.get_random_state(random_state)

    if self.id2word is None:
        self.id2word = utils.dict_from_corpus(corpus)

    self.num_tokens = len(self.id2word)

    # Training state; everything stays empty until `update` fills it in.
    self.A = None
    self.B = None

    self._W = None
    self.w_std = None

    self._h = None
    self._r = None

    if corpus is not None:
        self.update(corpus)
def __init__(self, corpus=None, num_topics=100, id2word=None,
             estep_convergence=0.001, em_convergence=0.0001, em_max_iterations=50):
    """
    If given, start training from the iterable `corpus` straight away. If not given,
    the model is left untrained (presumably because you want to call `update()` manually).

    `num_topics` is the number of requested latent topics to be extracted from
    the training corpus.

    `id2word` is a mapping from word ids (integers) to words (strings). It is
    used to determine the vocabulary size, as well as for debugging and topic
    printing.

    The variational EM runs until the relative change in the likelihood
    bound is less than `em_convergence`; `em_max_iterations` is stored as the
    iteration cap.

    In each EM iteration, the E-step runs until the relative change in the
    likelihood bound is less than `estep_convergence`.

    Raises `ValueError` if neither `corpus` nor `id2word` is given, or if the
    resulting vocabulary is empty.

    """
    # store user-supplied parameters
    self.id2word = id2word
    self.estep_convergence = estep_convergence  # relative change we need to achieve in E-step
    self.em_convergence = em_convergence  # relative change we need to achieve in Expectation-Maximization
    self.em_max_iterations = em_max_iterations

    if corpus is None and self.id2word is None:
        raise ValueError(
            'at least one of corpus/id2word must be specified, to establish input space dimensionality'
        )

    if self.id2word is None:
        logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
        self.id2word = utils.dict_from_corpus(corpus)
        self.num_terms = len(self.id2word)
    elif len(self.id2word) > 0:
        # ids need not be contiguous, so size the vocabulary by the largest id
        self.num_terms = 1 + max(self.id2word.keys())
    else:
        self.num_terms = 0

    if self.num_terms == 0:
        raise ValueError("cannot compute CTL over an empty collection (no terms)")

    self.num_topics = int(num_topics)

    # initialize a model with zero-mean, diagonal covariance gaussian and
    # uniformly random topics
    self.mu = numpy.zeros(self.num_topics)
    self.sigma = numpy.diagflat([1.0] * self.num_topics)
    self.sigma_inverse = inv(self.sigma)
    self.beta = numpy.random.uniform(0, 1, (self.num_topics, self.num_terms))

    # variational parameters
    self.lamda = numpy.zeros(self.num_topics)
    self.nu2 = numpy.ones(self.num_topics)  # nu^2
    self.phi = 1 / float(self.num_topics) * numpy.ones([self.num_terms, self.num_topics])
    self.optimize_zeta()

    # in order to get the topics graph, we need to store the
    # optimized lamda for each document.
    # Without a corpus there are no documents yet, so keep an empty array
    # rather than calling len(None).
    num_docs = len(corpus) if corpus is not None else 0
    self.observed_lamda = numpy.zeros([num_docs])

    # if a training corpus was provided, start estimating the model right away
    if corpus is not None:
        self.expectation_maximization(corpus)
def __init__(self, dtm_path, corpus=None, time_slices=None, mode='fit', model='dtm', num_topics=100,
             id2word=None, prefix=None, lda_sequence_min_iter=6, lda_sequence_max_iter=20,
             lda_max_em_iter=10, alpha=0.01, top_chain_var=0.005, rng_seed=0, initialize_lda=True):
    """
    `dtm_path` is path to the dtm executable, e.g. `C:/dtm/dtm-win64.exe`.

    `corpus` is a gensim corpus, aka a stream of sparse document vectors. If
    given, it is validated against `time_slices` and training starts straight
    away; if omitted, the model is left untrained and `self.lencorpus` is not set.

    `time_slices` is a sequence of per-slice document counts; their sum must
    equal the number of documents in `corpus`.

    `id2word` is a mapping between tokens ids and token.

    `mode` controls the mode of the model: 'fit' is for training, 'time' for
    analyzing documents through time according to a DTM, basically a held out set.

    `model` controls the choice of model. 'fixed' is for DIM and 'dtm' for DTM.

    `prefix` is the path prefix stored in `self.prefix`; a random prefix in the
    system temporary directory is generated when omitted.

    `lda_sequence_min_iter` min iteration of LDA.

    `lda_sequence_max_iter` max iteration of LDA.

    `lda_max_em_iter` max em optimization iterations in LDA.

    `alpha` is a hyperparameter that affects sparsity of the document-topics
    for the LDA models in each timeslice.

    `top_chain_var` is a hyperparameter; its effect is not documented here.

    `rng_seed` is the random seed.

    `initialize_lda` initialize DTM with LDA (stored as a lower-case string).

    Raises `ValueError` if `dtm_path` is not a file, if neither `corpus` nor
    `id2word` is given, if the vocabulary or corpus is empty, if `model` is
    'fixed' and a document is empty, or if `time_slices` does not add up to
    the corpus length.

    """
    if not os.path.isfile(dtm_path):
        raise ValueError("dtm_path must point to the binary file, not to a folder")

    if corpus is None and id2word is None:
        raise ValueError(
            'at least one of corpus/id2word must be specified, to establish input space dimensionality'
        )

    self.dtm_path = dtm_path
    self.id2word = id2word
    if self.id2word is None:
        logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
        self.id2word = utils.dict_from_corpus(corpus)
        self.num_terms = len(self.id2word)
    else:
        self.num_terms = 0 if not self.id2word else 1 + max(self.id2word.keys())
    if self.num_terms == 0:
        raise ValueError("cannot compute DTM over an empty collection (no terms)")
    self.num_topics = num_topics

    # Corpus checks only make sense when a corpus was actually passed in;
    # `corpus=None` is the documented "train later" path.
    if corpus is not None:
        try:
            lencorpus = len(corpus)
        except TypeError:
            logger.warning("input corpus stream has no len(); counting documents")
            lencorpus = sum(1 for _ in corpus)
        if lencorpus == 0:
            raise ValueError("cannot compute DTM over an empty corpus")
        if model == "fixed" and any(not text for text in corpus):
            raise ValueError(
                "There is a text without words in the input corpus. "
                "This breaks method='fixed' (The DIM model)."
            )
        if lencorpus != sum(time_slices):
            raise ValueError(
                "mismatched timeslices {slices} for corpus of len {clen}"
                .format(slices=sum(time_slices), clen=lencorpus)
            )
        self.lencorpus = lencorpus

    if prefix is None:
        # random hex tag keeps concurrent runs from clobbering each other's files
        rand_prefix = hex(random.randint(0, 0xffffff))[2:] + '_'
        prefix = os.path.join(tempfile.gettempdir(), rand_prefix)

    self.prefix = prefix
    self.time_slices = time_slices
    self.lda_sequence_min_iter = int(lda_sequence_min_iter)
    self.lda_sequence_max_iter = int(lda_sequence_max_iter)
    self.lda_max_em_iter = int(lda_max_em_iter)
    self.alpha = alpha
    self.top_chain_var = top_chain_var
    self.rng_seed = rng_seed
    self.initialize_lda = str(initialize_lda).lower()

    # result holders, empty until training has run
    self.lambda_ = None
    self.obs_ = None
    self.lhood_ = None
    self.gamma_ = None
    self.init_alpha = None
    self.init_beta = None
    self.init_ss = None
    self.em_steps = []
    self.influences_time = []

    if corpus is not None:
        self.train(corpus, time_slices, mode, model)
def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None, doc2author=None,
             chunksize=2000, passes=1, iterations=50, decay=0.5, offset=1.0,
             alpha='symmetric', eta='symmetric', update_every=1, eval_every=10,
             gamma_threshold=0.001, serialized=False, serialization_path=None,
             minimum_probability=0.01, random_state=None):
    """Set up the author-topic model and optionally train it.

    Training starts immediately when `corpus` and at least one of
    `author2doc` / `doc2author` are supplied; otherwise the model is left
    untrained so that `update` can be called later.

    Parameters
    ----------
    corpus : iterable, optional
        Training documents.
    num_topics : int, optional
        Number of latent topics to extract from the training corpus.
    id2word : mapping of int to str, optional
        Word id to word. Determines the vocabulary size and is used for
        debugging and topic printing. Derived from `corpus` when omitted.
    author2doc : dict, optional
        Author name -> list of documents the author contributes to.
    doc2author : dict, optional
        Document id (index into the corpus) -> list of author names, i.e. the
        reverse of `author2doc`. Only one of the two has to be supplied.
    chunksize : int, optional
        Size of the mini-batches.
    passes : int, optional
        Number of passes over the entire training data.
    iterations : int, optional
        Maximum number of times the model loops over each document (M-step).
        The iterations stop when convergence is reached.
    decay, offset : float, optional
        Kappa and Tau_0 from Hoffman et al. `decay` controls how quickly old
        documents are forgotten; `offset` down-weights early iterations.
    alpha : optional
        Prior affecting sparsity of the author-topic (theta) distribution.
        Defaults to a symmetric 1.0/num_topics prior. May be an explicit
        array, 'asymmetric' (fixed normalized asymmetric 1.0/topicno prior)
        or 'auto' (asymmetric prior learned from the data).
    eta : optional
        Prior affecting sparsity of the topic-word (lambda) distribution.
        A scalar gives a symmetric prior; a vector of shape num_words imposes
        a user defined asymmetric prior over words; 'auto' learns an
        asymmetric prior over words from the data; a num_topics x num_words
        matrix imposes per-topic priors (cannot be learned from data).
        'asymmetric' is rejected.
    update_every : int, optional
        Stored as `self.update_every`.
    eval_every : int or None, optional
        Log a perplexity estimate from the latest mini-batch every
        `eval_every` model updates. None disables the estimate.
    gamma_threshold : float, optional
        Stored as `self.gamma_threshold`.
    serialized : bool, optional
        Whether the input corpora live in memory (False) or are saved to disk
        (True). Note that this differs from other Gensim models, and that
        updating a serialized model requires re-serializing existing data.
    serialization_path : str, optional
        Required when `serialized` is True, e.g. `/tmp/serialized_model.mm`.
        An existing file *cannot* be overwritten.
    minimum_probability : float, optional
        Controls filtering of the topics returned for a document (bow).
    random_state : int or numpy.random.RandomState, optional
        Seeds the model's random number generator for reproducibility.

    Raises
    ------
    ValueError
        If neither `corpus` nor `id2word` is given, if the vocabulary is
        empty, if `serialized` is set without a `serialization_path`, or if
        `eta` is 'asymmetric'.

    Example:

    >>> model = AuthorTopicModel(corpus, num_topics=100, author2doc=author2doc, id2word=id2word)  # train model
    >>> model.update(corpus2)  # update the author-topic model with additional documents

    >>> model = AuthorTopicModel(corpus, num_topics=50, author2doc=author2doc, id2word=id2word, alpha='auto', eval_every=5)  # train asymmetric alpha from data

    """
    # There is no distributed implementation of this model, so the flag is
    # pinned to False even though some scaffolding (AuthorTopicState) exists.
    distributed = False
    self.dispatcher = None
    self.numworkers = 1

    self.id2word = id2word
    if corpus is None and id2word is None:
        raise ValueError(
            'at least one of corpus/id2word must be specified, to establish input space dimensionality'
        )

    # Vocabulary size: inferred from the corpus, or one past the largest id.
    if id2word is None:
        logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
        self.id2word = utils.dict_from_corpus(corpus)
        self.num_terms = len(self.id2word)
    else:
        self.num_terms = (1 + max(id2word.keys())) if len(id2word) > 0 else 0

    if self.num_terms == 0:
        raise ValueError("cannot compute the author-topic model over an empty collection (no terms)")

    logger.info('Vocabulary consists of %d words.', self.num_terms)

    # Author/document bookkeeping starts out empty.
    self.author2doc = {}
    self.doc2author = {}

    # Plain hyper-parameters and counters.
    self.distributed = distributed
    self.num_topics = num_topics
    self.num_authors = 0
    self.chunksize = chunksize
    self.decay = decay
    self.offset = offset
    self.minimum_probability = minimum_probability
    self.num_updates = 0
    self.total_docs = 0

    self.passes = passes
    self.update_every = update_every
    self.eval_every = eval_every

    self.author2id = {}
    self.id2author = {}

    # On-disk corpora need a fresh target path.
    self.serialized = serialized
    if serialized:
        if not serialization_path:
            raise ValueError(
                "If serialized corpora are used, a the path to a folder where the corpus should be saved must be provided (serialized_path)."
            )
        assert not isfile(serialization_path), \
            "A file already exists at the serialization_path path; choose a different serialization_path, or delete the file."
    self.serialization_path = serialization_path

    # Start with an empty self.corpus.
    self.init_empty_corpus()

    self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha')
    assert self.alpha.shape == (self.num_topics,), \
        "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics)

    if isinstance(eta, six.string_types) and eta == 'asymmetric':
        raise ValueError("The 'asymmetric' option cannot be used for eta")

    self.eta, self.optimize_eta = self.init_dir_prior(eta, 'eta')

    self.random_state = utils.get_random_state(random_state)

    per_word_shape = (self.num_terms,)
    per_topic_shape = (self.num_topics, self.num_terms)
    assert self.eta.shape == per_word_shape or self.eta.shape == per_topic_shape, (
        "Invalid eta shape. Got shape %s, but expected (%d, 1) or (%d, %d)" %
        (str(self.eta.shape), self.num_terms, self.num_topics, self.num_terms)
    )

    # VB constants
    self.iterations = iterations
    self.gamma_threshold = gamma_threshold

    # Variational distributions q(beta|lambda) and q(theta|gamma).
    self.state = AuthorTopicState(
        self.eta,
        (self.num_topics, self.num_terms),
        (self.num_authors, self.num_topics),
    )
    self.state.sstats = self.random_state.gamma(100., 1. / 100., (self.num_topics, self.num_terms))
    self.expElogbeta = np.exp(dirichlet_expectation(self.state.sstats))

    # Train right away when both documents and authorship were supplied.
    has_authorship = author2doc is not None or doc2author is not None
    if corpus is not None and has_authorship:
        self.update(corpus, author2doc, doc2author, chunks_as_numpy=self.dispatcher is not None)
def __init__(
        self, dtm_path, corpus=None, time_slices=None, num_topics=100, id2word=None, prefix=None,
        lda_sequence_min_iter=6, lda_sequence_max_iter=20, lda_max_em_iter=10,
        alpha=0.01, top_chain_var=0.005, rng_seed=0, initialize_lda=False):
    """
    `dtm_path` is path to the dtm executable, e.g. `C:/dtm/dtm-win64.exe`.

    `corpus` is a gensim corpus, aka a stream of sparse document vectors. If
    given, it is validated against `time_slices` and training starts straight
    away; if omitted, the model is left untrained and `self.lencorpus` is not set.

    `time_slices` is a sequence of per-slice document counts; their sum must
    equal the number of documents in `corpus`.

    `id2word` is a mapping between tokens ids and token.

    `prefix` is the path prefix stored in `self.prefix`; a random prefix in the
    system temporary directory is generated when omitted.

    `lda_sequence_min_iter` min iteration of LDA.

    `lda_sequence_max_iter` max iteration of LDA.

    `lda_max_em_iter` max em optimization iterations in LDA.

    `alpha` is a hyperparameter that affects sparsity of the document-topics
    for the LDA models in each timeslice.

    `top_chain_var` is a hyperparameter; its effect is not documented here.

    `rng_seed` is the random seed.

    `initialize_lda` initialize DTM with LDA (stored as a lower-case string).

    Raises `ValueError` if neither `corpus` nor `id2word` is given, if the
    vocabulary or corpus is empty, or if `time_slices` does not add up to the
    corpus length.

    """
    if corpus is None and id2word is None:
        raise ValueError(
            'at least one of corpus/id2word must be specified, to establish input space dimensionality'
        )

    self.dtm_path = dtm_path
    self.id2word = id2word
    if self.id2word is None:
        logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
        self.id2word = utils.dict_from_corpus(corpus)
        self.num_terms = len(self.id2word)
    else:
        self.num_terms = 0 if not self.id2word else 1 + max(self.id2word.keys())
    if self.num_terms == 0:
        raise ValueError("cannot compute DTM over an empty collection (no terms)")
    self.num_topics = num_topics

    # Corpus checks only make sense when a corpus was actually passed in;
    # `corpus=None` is the "train later" path.
    if corpus is not None:
        try:
            lencorpus = len(corpus)
        except TypeError:
            # streamed corpora have no __len__; count them instead
            logger.warning("input corpus stream has no len(); counting documents")
            lencorpus = sum(1 for _ in corpus)
        if lencorpus == 0:
            raise ValueError("cannot compute DTM over an empty corpus")
        if lencorpus != sum(time_slices):
            raise ValueError("mismatched timeslices {slices} for corpus of len {clen}".format(
                slices=sum(time_slices), clen=lencorpus))
        self.lencorpus = lencorpus

    if prefix is None:
        # random hex tag keeps concurrent runs from clobbering each other's files
        rand_prefix = hex(random.randint(0, 0xffffff))[2:] + '_'
        prefix = os.path.join(tempfile.gettempdir(), rand_prefix)

    self.prefix = prefix
    self.time_slices = time_slices
    self.lda_sequence_min_iter = int(lda_sequence_min_iter)
    self.lda_sequence_max_iter = int(lda_sequence_max_iter)
    self.lda_max_em_iter = int(lda_max_em_iter)
    self.alpha = alpha
    self.top_chain_var = top_chain_var
    self.rng_seed = rng_seed
    self.initialize_lda = str(initialize_lda).lower()

    # result holders, empty until training has run
    self.lambda_ = None
    self.obs_ = None
    self.lhood_ = None
    self.gamma_ = None
    self.init_alpha = None
    self.init_beta = None
    self.init_ss = None
    self.em_steps = []
    self.influences_time = []

    if corpus is not None:
        self.train(corpus, time_slices)
def __init__(self, corpus=None, num_topics=100, id2word=None, distributed=False,
             chunksize=2000, passes=1, update_every=1, alpha='symmetric', eta=None,
             decay=0.5, offset=1.0, eval_every=10, iterations=50, gamma_threshold=0.001,
             minimum_probability=0.01, defined_kws=None, tfMod=None):
    """
    If given, start training from the iterable `corpus` straight away. If not given,
    the model is left untrained (presumably because you want to call `update()` manually).

    `num_topics` is the number of requested latent topics to be extracted from
    the training corpus.

    `id2word` is a mapping from word ids (integers) to words (strings). It is
    used to determine the vocabulary size, as well as for debugging and topic
    printing.

    `alpha` and `eta` are hyperparameters that affect sparsity of the document-topic
    (theta) and topic-word (lambda) distributions. Both default to a symmetric
    1.0/num_topics prior.

    `alpha` can be set to an explicit array = prior of your choice. It also
    supports special values of 'asymmetric' and 'auto': the former uses a fixed
    normalized asymmetric 1.0/topicno prior, the latter learns an asymmetric
    prior directly from your data.

    `eta` can be a scalar for a symmetric prior over topic/word
    distributions, or a matrix of shape num_topics x num_words, which can
    be used to impose asymmetric priors over the word distribution on a
    per-topic basis. This may be useful if you want to seed certain topics
    with particular words by boosting the priors for those words. It also
    supports the special value 'auto', which learns an asymmetric prior
    directly from your data.

    Turn on `distributed` to force distributed computing (see the `web tutorial
    <http://radimrehurek.com/gensim/distributed.html>`_ on how to set up
    a cluster of machines for gensim).

    Calculate and log perplexity estimate from the latest mini-batch every
    `eval_every` model updates (setting this to 1 slows down training ~2x;
    default is 10 for better performance). Set to None to disable perplexity estimation.

    `decay` and `offset` parameters are the same as Kappa and Tau_0 in
    Hoffman et al, respectively.

    `minimum_probability` controls filtering the topics returned for a document (bow).

    `defined_kws` maps a word to an iterable of topic indices. Words that are
    found in the reversed `id2word` mapping get their initial sufficient
    statistics boosted for those topics; unknown words are ignored. Defaults to
    no predefined words.

    `tfMod`, if given, must expose an `idfs` mapping of word id to weight; the
    weight scales the boost of single-topic predefined words.

    Example:

    >>> lda = LdaModel(corpus, num_topics=100)  # train model
    >>> print(lda[doc_bow])  # get topic probability distribution for a document
    >>> lda.update(corpus2)  # update the LDA model with additional documents
    >>> print(lda[doc_bow])

    >>> lda = LdaModel(corpus, num_topics=50, alpha='auto', eval_every=5)  # train asymmetric alpha from data

    """
    # A fresh dict per instance: a shared `{}` default would be stored on
    # `self.defined_kws` and leak between models.
    if defined_kws is None:
        defined_kws = {}

    # store user-supplied parameters
    self.id2word = id2word
    if corpus is None and self.id2word is None:
        raise ValueError(
            'at least one of corpus/id2word must be specified, to establish input space dimensionality'
        )

    if self.id2word is None:
        logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
        self.id2word = utils.dict_from_corpus(corpus)
        self.num_terms = len(self.id2word)
    elif len(self.id2word) > 0:
        self.num_terms = 1 + max(self.id2word.keys())
    else:
        self.num_terms = 0

    if self.num_terms == 0:
        raise ValueError("cannot compute LDA over an empty collection (no terms)")

    self.distributed = bool(distributed)
    self.num_topics = int(num_topics)
    self.chunksize = chunksize
    self.decay = decay
    self.offset = offset
    self.minimum_probability = minimum_probability
    self.num_updates = 0

    self.passes = passes
    self.update_every = update_every
    self.eval_every = eval_every

    self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha')

    assert self.alpha.shape == (num_topics,), \
        "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), num_topics)

    self.eta, self.optimize_eta = self.init_dir_prior(eta, 'eta')

    assert (self.eta.shape == (num_topics, 1) or self.eta.shape == (num_topics, self.num_terms)), (
        "Invalid eta shape. Got shape %s, but expected (%d, 1) or (%d, %d)" %
        (str(self.eta.shape), num_topics, num_topics, self.num_terms)
    )

    # VB constants
    self.iterations = iterations
    self.gamma_threshold = gamma_threshold

    # set up distributed environment if necessary
    if not distributed:
        logger.info("using serial LDA version on this node")
        self.dispatcher = None
        self.numworkers = 1
    else:
        if self.optimize_alpha:
            raise NotImplementedError("auto-optimizing alpha not implemented in distributed LDA")
        # set up distributed version
        try:
            import Pyro4
            dispatcher = Pyro4.Proxy('PYRONAME:gensim.lda_dispatcher')
            logger.debug("looking for dispatcher at %s", str(dispatcher._pyroUri))
            dispatcher.initialize(id2word=self.id2word, num_topics=num_topics,
                                  chunksize=chunksize, alpha=alpha, eta=eta, distributed=False)
            self.dispatcher = dispatcher
            self.numworkers = len(dispatcher.getworkers())
            logger.info("using distributed version with %i workers", self.numworkers)
        except Exception as err:
            logger.error("failed to initialize distributed LDA (%s)", err)
            raise RuntimeError("failed to initialize distributed LDA (%s)" % err)

    # Initialize the variational distribution q(beta|lambda)
    self.state = LdaState(self.eta, (self.num_topics, self.num_terms))
    self.state.sstats = numpy.random.gamma(100., 1. / 100., (self.num_topics, self.num_terms))

    # Re-seed the word/topic statistics for the predefined words.
    self.word2id = utils.revdict(self.id2word)
    sstats = self.state.sstats
    self.defined_kws = defined_kws
    self.defined_wordids = {}
    for word, topics in defined_kws.items():
        if word in self.word2id:
            self.defined_wordids[self.word2id[word]] = numpy.array(list(topics))

    for wid, topics in self.defined_wordids.items():
        # First redraw the whole column from gamma(0.1, 0.05) ...
        sstats[:, wid] = numpy.random.gamma(0.1, 0.05, (self.num_topics,))

        # ... then boost the requested topics.
        if tfMod is not None:
            score = self.num_topics * tfMod.idfs.get(wid, 1.0)
        else:
            score = self.num_topics
        # NOTE(review): indentation was lost in the source this was recovered
        # from; the multi-topic override is assumed to apply whether or not
        # `tfMod` is given -- confirm against the original file.
        if topics.shape[0] > 1:
            score = 0.4 * self.num_topics
        sstats[topics, wid] = score

    self.expElogbeta = numpy.exp(dirichlet_expectation(self.state.sstats))

    # if a training corpus was provided, start estimating the model right away
    if corpus is not None:
        use_numpy = self.dispatcher is not None
        self.update(corpus, chunks_as_numpy=use_numpy)
def __init__(self, corpus=None, num_topics=200, id2word=None, chunksize=20000,
             decay=1.0, distributed=False, onepass=True,
             power_iters=P2_EXTRA_ITERS, extra_samples=P2_EXTRA_DIMS, dtype=np.float64):
    """Construct an `LsiModel` object.

    Either `corpus` or `id2word` must be supplied in order to train the model.

    Parameters
    ----------
    corpus : {iterable of list of (int, float), scipy.sparse.csc}, optional
        Stream of document vectors or sparse matrix of shape (`num_terms`, `num_documents`).
        If given, the documents are added to the model straight away.
    num_topics : int, optional
        Number of requested factors (latent dimensions).
    id2word : dict of {int: str}, optional
        ID to word mapping. Derived from `corpus` when omitted.
    chunksize : int, optional
        Number of documents to be used in each training chunk.
    decay : float, optional
        Weight of existing observations relatively to new ones.
    distributed : bool, optional
        If True - distributed mode (parallel execution on several machines) will be used.
        Distributed mode always runs the one-pass algorithm.
    onepass : bool, optional
        Whether the one-pass algorithm should be used for training.
        Pass `False` to force a multi-pass stochastic algorithm (serial mode only).
    power_iters : int, optional
        Number of power iteration steps to be used.
        Increasing the number of power iterations improves accuracy, but lowers performance.
    extra_samples : int, optional
        Extra samples to be used besides the rank `k`. Can improve accuracy.
    dtype : type, optional
        Enforces a type for elements of the decomposed matrix.

    Raises
    ------
    ValueError
        If neither `corpus` nor `id2word` is given.
    RuntimeError
        If distributed mode was requested but the dispatcher could not be initialised.

    """
    self.id2word = id2word
    self.num_topics = int(num_topics)
    self.chunksize = int(chunksize)
    self.decay = float(decay)
    if distributed:
        if not onepass:
            logger.warning("forcing the one-pass algorithm for distributed LSA")
            onepass = True
    self.onepass = onepass
    self.extra_samples, self.power_iters = extra_samples, power_iters
    self.dtype = dtype

    if corpus is None and self.id2word is None:
        raise ValueError(
            'at least one of corpus/id2word must be specified, to establish input space dimensionality'
        )

    if self.id2word is None:
        logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
        self.id2word = utils.dict_from_corpus(corpus)
        self.num_terms = len(self.id2word)
    else:
        # ids need not be contiguous; an empty mapping yields zero terms
        self.num_terms = 1 + (max(self.id2word.keys()) if self.id2word else -1)

    self.docs_processed = 0
    self.projection = Projection(
        self.num_terms, self.num_topics, power_iters=self.power_iters,
        extra_dims=self.extra_samples, dtype=dtype
    )

    self.numworkers = 1
    if not distributed:
        logger.info("using serial LSI version on this node")
        self.dispatcher = None
    else:
        # `onepass` is already guaranteed to be True here (forced above), so
        # no further algorithm check is needed before contacting the dispatcher.
        try:
            import Pyro4
            dispatcher = Pyro4.Proxy('PYRONAME:gensim.lsi_dispatcher')
            logger.debug("looking for dispatcher at %s", str(dispatcher._pyroUri))
            dispatcher.initialize(
                id2word=self.id2word, num_topics=num_topics, chunksize=chunksize, decay=decay,
                power_iters=self.power_iters, extra_samples=self.extra_samples, distributed=False, onepass=onepass
            )
            self.dispatcher = dispatcher
            self.numworkers = len(dispatcher.getworkers())
            logger.info("using distributed version with %i workers", self.numworkers)
        except Exception as err:
            # distributed version was specifically requested, so this is an error state
            logger.error("failed to initialize distributed LSI (%s)", err)
            raise RuntimeError("failed to initialize distributed LSI (%s)" % err)

    if corpus is not None:
        self.add_documents(corpus)
def __init__(self, corpus=None, num_topics=100, id2word=None, distributed=False,
             chunksize=2000, passes=1, update_every=1, alpha=None, eta=None, decay=0.5):
    """
    `num_topics` is the number of requested latent topics to be extracted from
    the training corpus.

    `id2word` is a mapping from word ids (integers) to words (strings). It is
    used to determine the vocabulary size, as well as for debugging and topic
    printing. When omitted, it is derived from `corpus`.

    `alpha` and `eta` are hyperparameters that affect sparsity of the document-topic
    (theta) and topic-word (lambda) distributions. Both default to a symmetric
    1.0/num_topics (but can be set to a vector, for asymmetric priors).

    Turn on `distributed` to force distributed computing (see the web tutorial
    on how to set up a cluster of machines for gensim).

    `chunksize`, `passes`, `update_every` and `decay` are stored on the
    instance unchanged.

    Raises `ValueError` if neither `corpus` nor `id2word` is given or the
    vocabulary is empty, and `RuntimeError` if the distributed dispatcher
    cannot be initialised.

    Example:

    >>> lda = LdaModel(corpus, num_topics=100)
    >>> print(lda[doc_bow])  # get topic probability distribution for a document
    >>> lda.update(corpus2)  # update the LDA model with additional documents
    >>> print(lda[doc_bow])

    """
    # store user-supplied parameters
    self.id2word = id2word
    if corpus is None and self.id2word is None:
        raise ValueError(
            'at least one of corpus/id2word must be specified, to establish input space dimensionality'
        )

    if self.id2word is None:
        logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
        self.id2word = utils.dict_from_corpus(corpus)
        self.num_terms = len(self.id2word)
    else:
        # list(...) keeps this working where keys() returns a view rather
        # than a list; the -1 sentinel makes an empty mapping yield 0 terms.
        self.num_terms = 1 + max([-1] + list(self.id2word.keys()))

    if self.num_terms == 0:
        raise ValueError("cannot compute LDA over an empty collection (no terms)")

    self.distributed = bool(distributed)
    self.num_topics = int(num_topics)
    self.chunksize = chunksize
    self.decay = decay
    self.num_updates = 0

    self.passes = passes
    self.update_every = update_every

    # symmetric 1/num_topics priors unless the caller supplied their own
    if alpha is None:
        self.alpha = 1.0 / num_topics
    else:
        self.alpha = alpha
    if eta is None:
        self.eta = 1.0 / num_topics
    else:
        self.eta = eta

    # VB constants
    self.VAR_MAXITER = 50
    self.VAR_THRESH = 0.001

    # set up distributed environment if necessary
    if not distributed:
        logger.info("using serial LDA version on this node")
        self.dispatcher = None
        self.numworkers = 1
    else:
        # set up distributed version
        try:
            import Pyro4
            dispatcher = Pyro4.Proxy('PYRONAME:gensim.lda_dispatcher')
            dispatcher._pyroOneway.add("exit")
            logger.debug("looking for dispatcher at %s", str(dispatcher._pyroUri))
            dispatcher.initialize(id2word=self.id2word, num_topics=num_topics,
                                  chunksize=chunksize, alpha=alpha, eta=eta, distributed=False)
            self.dispatcher = dispatcher
            self.numworkers = len(dispatcher.getworkers())
            logger.info("using distributed version with %i workers", self.numworkers)
        except Exception as err:
            logger.error("failed to initialize distributed LDA (%s)", err)
            raise RuntimeError("failed to initialize distributed LDA (%s)" % err)
def __init__(self, vw_path, corpus=None, num_topics=100, id2word=None,
             chunksize=256, passes=1, alpha=0.1, eta=0.1, decay=0.5,
             offset=1, gamma_threshold=0.001, random_seed=None,
             cleanup_files=True, tmp_prefix='tmp'):
    """Set up the Vowpal Wabbit LDA wrapper and train it if a corpus is given.

    `vw_path` is the path to Vowpal Wabbit's 'vw' executable.

    `corpus` is an iterable training corpus. If given, training starts
    immediately; otherwise the model is left untrained (presumably because
    you want to call `update()` manually).

    `num_topics` is the number of requested latent topics
    (VW's '--lda <num_topics>').

    `id2word` maps word ids (integers) to words (strings). It determines the
    vocabulary size and is used for debugging and topic printing.

    `chunksize` is the number of documents examined in each batch
    (VW's '--minibatch <batch_size>').

    `passes` is the number of passes over the dataset
    (VW's '--passes <passes>').

    `alpha` is a float affecting sparsity of per-document topic weights,
    applied symmetrically; set it higher when documents are thought to look
    more similar (VW's '--lda_alpha <alpha>').

    `eta` is a float affecting sparsity of topic distributions, applied
    symmetrically; set it higher when topics are thought to look more
    similar (VW's '--lda_rho <rho>').

    `decay` is the learning rate decay; it affects how quickly learnt values
    are forgotten and should be between 0.5 and 1.0 to guarantee convergence
    (VW's '--power_t <tau>').

    `offset` is the integer learning offset; higher values slow down learning
    on early iterations (VW's '--initial_t <tau>').

    `gamma_threshold` affects when the learning loop is broken out of; higher
    values result in earlier loop completion (VW's '--epsilon <eps>').

    `random_seed` sets Vowpal Wabbit's random seed when learning
    (VW's '--random_seed <seed>').

    `cleanup_files` says whether to delete the temporary directory and files
    used by this wrapper. False is useful for debugging or for re-using the
    Vowpal Wabbit files elsewhere.

    `tmp_prefix` prefixes the temporary working directory name.

    Raises `ValueError` when neither `corpus` nor `id2word` is supplied, or
    when the vocabulary turns out to be empty.
    """
    # Defaults follow Vowpal Wabbit's own; names follow Gensim's LdaModel
    # wherever possible.
    self.vw_path = vw_path
    self.id2word = id2word

    if self.id2word is not None:
        # vocabulary size is one past the largest id; empty mapping -> 0
        has_entries = len(self.id2word) > 0
        self.num_terms = (1 + max(self.id2word.keys())) if has_entries else 0
    else:
        if corpus is None:
            raise ValueError(
                "at least one of corpus/id2word must be specified, to establish input space dimensionality"
            )
        logger.warning(
            "no word id mapping provided; initializing from corpus, assuming identity"
        )
        self.id2word = utils.dict_from_corpus(corpus)
        self.num_terms = len(self.id2word)

    if not self.num_terms:
        raise ValueError(
            "cannot compute LDA over an empty collection (no terms)")

    # LDA parameters
    self.num_topics = num_topics
    self.chunksize = chunksize
    self.passes = passes
    self.alpha, self.eta = alpha, eta
    self.gamma_threshold = gamma_threshold
    self.offset = self._initial_offset = offset
    self.decay = decay
    self.random_seed = random_seed

    # temporary files used for Vowpal Wabbit input/output
    self.tmp_dir = None
    self.tmp_prefix = tmp_prefix
    self.cleanup_files = cleanup_files
    self._init_temp_dir(tmp_prefix)

    # state used for saving/loading this model
    self._model_data = None
    self._topics_data = None

    # loaded topics are cached here as a numpy array
    self._topics = None

    if corpus is None:
        return
    self.train(corpus)
def __init__(self, corpus=None, num_topics=100, id2word=None,
             distributed=False, chunksize=2000, passes=1, update_every=1,
             alpha='symmetric', eta=None, decay=0.5, offset=1.0, eval_every=10,
             iterations=50, gamma_threshold=0.001, minimum_probability=0.01,
             random_state=None, ns_conf=None, minimum_phi_value=0.01,
             per_word_topics=False):
    """
    If given, start training from the iterable `corpus` straight away. If not
    given, the model is left untrained (presumably because you want to call
    `update()` manually).

    `num_topics` is the number of requested latent topics to be extracted from
    the training corpus.

    `id2word` is a mapping from word ids (integers) to words (strings). It is
    used to determine the vocabulary size, as well as for debugging and topic
    printing.

    `alpha` and `eta` are hyperparameters that affect sparsity of the
    document-topic (theta) and topic-word (lambda) distributions. Both default
    to a symmetric 1.0/num_topics prior.

    `alpha` can be set to an explicit array = prior of your choice. It also
    supports the special values 'asymmetric' and 'auto': the former uses a
    fixed normalized asymmetric 1.0/topicno prior, the latter learns an
    asymmetric prior directly from your data.

    `eta` can be a scalar for a symmetric prior over topic/word distributions,
    or a vector of shape num_words, which can be used to impose (user defined)
    asymmetric priors over the word distribution. It also supports the special
    value 'auto', which learns an asymmetric prior over words directly from
    your data. `eta` can also be a matrix of shape num_topics x num_words,
    which can be used to impose asymmetric priors over the word distribution
    on a per-topic basis (can not be learned from data). The string
    'asymmetric' is rejected for `eta`.

    Turn on `distributed` to force distributed computing (see the `web
    tutorial <http://radimrehurek.com/gensim/distributed.html>`_ on how to set
    up a cluster of machines for gensim). `ns_conf` is a dict of keyword
    arguments passed to `utils.getNS` when locating the dispatcher; `None`
    (the default) means no extra arguments.

    Calculate and log perplexity estimate from the latest mini-batch every
    `eval_every` model updates (setting this to 1 slows down training ~2x;
    default is 10 for better performance). Set to None to disable perplexity
    estimation.

    `decay` and `offset` parameters are the same as Kappa and Tau_0 in
    Hoffman et al, respectively.

    `minimum_probability` controls filtering the topics returned for a
    document (bow).

    `random_state` can be a np.random.RandomState object or the seed for one.

    `chunksize`, `passes`, `update_every`, `iterations`, `gamma_threshold`,
    `minimum_phi_value` and `per_word_topics` are stored on the instance
    unchanged by this constructor.

    Raises `ValueError` if neither `corpus` nor `id2word` is given, if the
    vocabulary is empty, or if `eta == 'asymmetric'`; `NotImplementedError`
    if alpha optimization is requested in distributed mode; `RuntimeError` if
    the distributed dispatcher cannot be initialized.

    Example:

    >>> lda = LdaModel(corpus, num_topics=100)  # train model
    >>> print(lda[doc_bow]) # get topic probability distribution for a document
    >>> lda.update(corpus2) # update the LDA model with additional documents
    >>> print(lda[doc_bow])

    >>> lda = LdaModel(corpus, num_topics=50, alpha='auto', eval_every=5)  # train asymmetric alpha from data

    """
    # A shared mutable default ({}) is avoided; None stands for "no options".
    if ns_conf is None:
        ns_conf = {}

    # store user-supplied parameters
    self.id2word = id2word
    if corpus is None and self.id2word is None:
        raise ValueError('at least one of corpus/id2word must be specified, to establish input space dimensionality')

    if self.id2word is None:
        logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
        self.id2word = utils.dict_from_corpus(corpus)
        self.num_terms = len(self.id2word)
    elif len(self.id2word) > 0:
        # vocabulary size is one past the largest word id
        self.num_terms = 1 + max(self.id2word.keys())
    else:
        self.num_terms = 0

    if self.num_terms == 0:
        raise ValueError("cannot compute LDA over an empty collection (no terms)")

    self.distributed = bool(distributed)
    self.num_topics = int(num_topics)
    self.chunksize = chunksize
    self.decay = decay
    self.offset = offset
    self.minimum_probability = minimum_probability
    self.num_updates = 0

    self.passes = passes
    self.update_every = update_every
    self.eval_every = eval_every
    self.minimum_phi_value = minimum_phi_value
    self.per_word_topics = per_word_topics

    self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha')

    assert self.alpha.shape == (self.num_topics,), \
        "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics)

    if isinstance(eta, six.string_types):
        if eta == 'asymmetric':
            raise ValueError("The 'asymmetric' option cannot be used for eta")

    self.eta, self.optimize_eta = self.init_dir_prior(eta, 'eta')

    self.random_state = utils.get_random_state(random_state)

    # The message now names the shapes that are actually accepted.
    assert self.eta.shape == (self.num_terms,) or self.eta.shape == (self.num_topics, self.num_terms), (
        "Invalid eta shape. Got shape %s, but expected (%d,) or (%d, %d)" %
        (str(self.eta.shape), self.num_terms, self.num_topics, self.num_terms))

    # VB constants
    self.iterations = iterations
    self.gamma_threshold = gamma_threshold

    # set up distributed environment if necessary
    if not distributed:
        logger.info("using serial LDA version on this node")
        self.dispatcher = None
        self.numworkers = 1
    else:
        if self.optimize_alpha:
            raise NotImplementedError("auto-optimizing alpha not implemented in distributed LDA")
        # set up distributed version
        try:
            import Pyro4
            with utils.getNS(**ns_conf) as ns:
                from gensim.models.lda_dispatcher import LDA_DISPATCHER_PREFIX
                self.dispatcher = Pyro4.Proxy(ns.list(prefix=LDA_DISPATCHER_PREFIX)[LDA_DISPATCHER_PREFIX])
                logger.debug("looking for dispatcher at %s", str(self.dispatcher._pyroUri))
                # the raw user-supplied alpha/eta are forwarded, as before
                self.dispatcher.initialize(
                    id2word=self.id2word, num_topics=self.num_topics, chunksize=chunksize,
                    alpha=alpha, eta=eta, distributed=False
                )
                self.numworkers = len(self.dispatcher.getworkers())
                logger.info("using distributed version with %i workers", self.numworkers)
        except Exception as err:
            logger.error("failed to initialize distributed LDA (%s)", err)
            raise RuntimeError("failed to initialize distributed LDA (%s)" % err)

    # Initialize the variational distribution q(beta|lambda)
    self.state = LdaState(self.eta, (self.num_topics, self.num_terms))
    self.state.sstats = self.random_state.gamma(100., 1. / 100., (self.num_topics, self.num_terms))
    self.expElogbeta = np.exp(dirichlet_expectation(self.state.sstats))

    # if a training corpus was provided, start estimating the model right away
    if corpus is not None:
        # chunks are passed as numpy arrays only when a dispatcher is in use
        use_numpy = self.dispatcher is not None
        self.update(corpus, chunks_as_numpy=use_numpy)
def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_topics=10,
             initialize='gensim', sstats=None, lda_model=None, obs_variance=0.5,
             chain_variance=0.005, passes=10, random_state=None,
             lda_inference_max_iter=25, em_min_iter=6, em_max_iter=20, chunksize=100):
    """
    Set up the sequential (dynamic) topic model and, when both `corpus` and
    `time_slice` are given, initialize and fit it.

    `corpus` is any iterable gensim corpus.

    `time_slice` is a list which contains the number of documents in each
    time-slice. Its length is used as the number of time slices.

    `id2word` is a mapping from word ids (integers) to words (strings). It is
    used to determine the vocabulary size and for printing topics.

    `alphas` is a prior of your choice and should be a double or float value.
    Default is 0.01. It is broadcast to an array of length `num_topics`.

    `num_topics` is the number of requested latent topics to be extracted from
    the training corpus.

    `initialize` lets the user decide how to initialise the DTM model:

    * 'gensim' (default) trains a gensim LdaModel on `corpus` and uses its
      sufficient statistics;
    * 'ldamodel' uses the sufficient statistics of the already trained model
      passed through `lda_model`;
    * 'own' uses the np matrix passed through `sstats`, whose shape is
      (vocab_len, num_topics).

    `obs_variance` and `chain_variance` are passed to every per-topic `sslm`
    object. `chain_variance` is a constant which dictates how the beta values
    evolve - it is a gaussian parameter defined in the beta distribution.

    `passes` is the number of passes of the initial LdaModel.

    `random_state` can be a np.random.RandomState object or the seed for one,
    for the LdaModel.

    `lda_inference_max_iter`, `em_min_iter`, `em_max_iter` and `chunksize` are
    forwarded to `fit_lda_seq`.

    Raises `ValueError` if neither `corpus` nor `id2word` is given. Raises
    `TypeError` if `time_slice` is None, because its length is required.
    """
    self.id2word = id2word
    if corpus is None and self.id2word is None:
        raise ValueError(
            'at least one of corpus/id2word must be specified, to establish input space dimensionality'
        )

    if self.id2word is None:
        logger.warning(
            "no word id mapping provided; initializing from corpus, assuming identity"
        )
        self.id2word = utils.dict_from_corpus(corpus)
    # In every case the vocabulary size is the number of entries in the mapping.
    self.vocab_len = len(self.id2word)

    if corpus is not None:
        try:
            self.corpus_len = len(corpus)
        except TypeError:
            # Only "object has no len()" is expected here; other errors propagate.
            logger.warning(
                "input corpus stream has no len(); counting documents")
            self.corpus_len = sum(1 for _ in corpus)

    self.time_slice = time_slice

    # Longest document (number of BoW entries); 0 when there are no documents.
    max_doc_len = 0
    if corpus is not None:
        for doc in corpus:
            max_doc_len = max(max_doc_len, len(doc))
    self.max_doc_len = max_doc_len

    self.num_topics = num_topics
    self.num_time_slices = len(time_slice)
    self.alphas = np.full(num_topics, alphas)

    # topic_chains contains for each topic a 'state space language model' object
    # which in turn has information about each topic.
    self.topic_chains = [
        sslm(
            num_time_slices=self.num_time_slices, vocab_len=self.vocab_len,
            num_topics=self.num_topics, chain_variance=chain_variance,
            obs_variance=obs_variance
        )
        for _ in range(num_topics)
    ]

    # the following are to be integrated during Document Influence Model
    self.top_doc_phis = None
    self.influence = None
    self.renormalized_influence = None
    self.influence_sum_lgl = None

    # if a corpus and time_slice is provided, depending on the user choice of
    # initializing LDA, we start DTM.
    if corpus is not None and time_slice is not None:
        if initialize == 'gensim':
            lda_model = ldamodel.LdaModel(
                corpus, id2word=self.id2word, num_topics=self.num_topics,
                passes=passes, alpha=self.alphas, random_state=random_state
            )
            # transposed so that rows index the vocabulary
            self.sstats = np.transpose(lda_model.state.sstats)
        elif initialize == 'ldamodel':
            self.sstats = np.transpose(lda_model.state.sstats)
        elif initialize == 'own':
            self.sstats = sstats

        # initialize model from sstats
        self.init_ldaseq_ss(chain_variance, obs_variance, self.alphas, self.sstats)

        # fit DTM
        self.fit_lda_seq(corpus, lda_inference_max_iter, em_min_iter, em_max_iter, chunksize)
def __init__(self, corpus=None, num_topics=100, id2word=None, distributed=False,
             chunksize=2000, passes=1, update_every=1, alpha='symmetric', eta=None,
             decay=0.5, offset=1.0, eval_every=10, iterations=50,
             gamma_threshold=0.001, minimum_probability=0.01):
    """
    If given, start training from the iterable `corpus` straight away. If not
    given, the model is left untrained (call `update()` manually).

    :param corpus: iterable training corpus; also used to build the word id
        mapping when `id2word` is not given.
    :param num_topics: the number of requested latent topics to be extracted
        from the training corpus.
    :param id2word: a mapping from word ids (integers) to words (strings). It
        is used to determine the vocabulary size, as well as for debugging and
        topic printing.
    :param distributed: stored on the instance as a bool. NOTE(review): this
        constructor does not set up a dispatcher, so the model always runs
        serially here - confirm whether distributed set-up is done elsewhere.
    :param chunksize: stored on the instance unchanged.
    :param passes: stored on the instance unchanged.
    :param update_every: stored on the instance unchanged.
    :param alpha: hyperparameter that affects sparsity of the document-topic
        (theta) distribution; defaults to a symmetric 1.0/num_topics prior.
        Can be set to an explicit array = prior of your choice. It also
        supports the special values 'asymmetric' and 'auto': the former uses a
        fixed normalized asymmetric 1.0/topicno prior, the latter learns an
        asymmetric prior directly from your data.
    :param eta: hyperparameter that affects sparsity of the topic-word
        (lambda) distribution; defaults to a symmetric 1.0/num_topics prior.
        Can be a scalar for a symmetric prior over topic/word distributions,
        or a matrix of shape num_topics x num_words, which can be used to
        impose asymmetric priors over the word distribution on a per-topic
        basis. This may be useful if you want to seed certain topics with
        particular words by boosting the priors for those words. It also
        supports the special value 'auto', which learns an asymmetric prior
        directly from your data.
    :param decay: same as Kappa in Hoffman et al.
    :param offset: same as Tau_0 in Hoffman et al.
    :param eval_every: calculate and log perplexity estimate from the latest
        mini-batch every `eval_every` model updates (setting this to 1 slows
        down training ~2x; default is 10 for better performance). Set to None
        to disable perplexity estimation.
    :param iterations: stored on the instance unchanged.
    :param gamma_threshold: stored on the instance unchanged.
    :param minimum_probability: controls filtering the topics returned for a
        document (bow).
    :raises ValueError: if neither `corpus` nor `id2word` is given, or if the
        vocabulary is empty.

    Example:

    >>> lda = LdaModel(corpus, num_topics=100)  # train model
    >>> print(lda[doc_bow]) # get topic probability distribution for a document
    >>> lda.update(corpus2) # update the LDA model with additional documents
    >>> print(lda[doc_bow])

    >>> lda = LdaModel(corpus, num_topics=50, alpha='auto', eval_every=5)  # train asymmetric alpha from data

    """
    # store user-supplied parameters
    self.id2word = id2word
    if corpus is None and self.id2word is None:
        raise ValueError(
            'at least one of corpus/id2word must be specified, to establish input space dimensionality'
        )

    if self.id2word is None:
        logger.warning(
            "no word id mapping provided; initializing from corpus, assuming identity"
        )
        self.id2word = utils.dict_from_corpus(corpus)
        self.num_terms = len(self.id2word)
    elif len(self.id2word) > 0:
        # vocabulary size is one past the largest word id
        self.num_terms = 1 + max(self.id2word.keys())
    else:
        self.num_terms = 0

    if self.num_terms == 0:
        raise ValueError(
            "cannot compute LDA over an empty collection (no terms)")

    self.distributed = bool(distributed)
    self.num_topics = int(num_topics)
    self.chunksize = chunksize
    self.decay = decay
    self.offset = offset
    self.minimum_probability = minimum_probability
    self.num_updates = 0
    self.iterations = iterations
    self.gamma_threshold = gamma_threshold

    self.passes = passes
    self.update_every = update_every
    self.eval_every = eval_every

    self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha')
    assert self.alpha.shape == (self.num_topics,), \
        "Invalid alpha shape, Got shape %s, but expected (%d)" % (str(self.alpha.shape), self.num_topics)

    self.eta, self.optimize_eta = self.init_dir_prior(eta, 'eta')
    # Each accepted shape is compared as a whole tuple. The earlier form
    # asserted a non-empty tuple and therefore could never fail.
    assert (self.eta.shape == (self.num_topics, 1)
            or self.eta.shape == (self.num_topics, self.num_terms)), (
        "Invalid eta shape. Got shape %s, but expected (%d, 1) or (%d, %d)" %
        (str(self.eta.shape), self.num_topics, self.num_topics, self.num_terms))

    # No dispatcher is created by this constructor, so run serially. The
    # attribute must exist because it is read below.
    logger.info("using serial LDA version on this node")
    self.dispatcher = None
    self.numworkers = 1

    # Initialize the variational distribution q(beta|lambda)
    self.state = LdaState(self.eta, (self.num_topics, self.num_terms))
    # gamma(shape, scale, size): one draw per (topic, term) cell
    self.state.sstats = numpy.random.gamma(100., 1. / 100., (self.num_topics, self.num_terms))
    self.expElogbeta = numpy.exp(dirichlet_expectation(self.state.sstats))

    # if a training corpus was provided, start estimating the model right away
    if corpus is not None:
        use_numpy = self.dispatcher is not None
        self.update(corpus, chunks_as_numpy=use_numpy)
def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None,
             doc2author=None, chunksize=2000, passes=1, iterations=50, decay=0.5,
             offset=1.0, alpha='symmetric', eta='symmetric', update_every=1,
             eval_every=10, gamma_threshold=0.001, serialized=False,
             serialization_path=None, minimum_probability=0.01, random_state=None):
    """
    Set up the author-topic model. Training starts straight away when the
    iterable `corpus` and one of the `author2doc`/`doc2author` dictionaries
    are given; otherwise the model is left untrained (presumably because you
    want to call the `update` method manually).

    `num_topics` is the number of requested latent topics to be extracted
    from the training corpus.

    `id2word` is a mapping from word ids (integers) to words (strings). It is
    used to determine the vocabulary size, as well as for debugging and topic
    printing.

    `author2doc` is a dictionary whose keys are author names and whose values
    are lists of the documents that author contributes to.

    `doc2author` is the reverse mapping: keys are document IDs (indexes into
    the corpus), values are lists of author names. Only one of `author2doc`
    and `doc2author` has to be supplied.

    `passes` is the number of times the model makes a pass over the entire
    training data.

    `iterations` is the maximum number of times the model loops over each
    document (M-step). The iterations stop when convergence is reached.

    `chunksize` controls the size of the mini-batches.

    `alpha` and `eta` are hyperparameters that affect sparsity of the
    author-topic (theta) and topic-word (lambda) distributions. Both default
    to a symmetric 1.0/num_topics prior.

    `alpha` can be an explicit array = prior of your choice, or one of the
    special values 'asymmetric' (fixed normalized asymmetric 1.0/topicno
    prior) and 'auto' (asymmetric prior learned from your data).

    `eta` can be a scalar (symmetric prior over topic/word distributions), a
    vector of shape num_words (user defined asymmetric prior over the word
    distribution), the special value 'auto' (asymmetric prior over words
    learned from your data), or a matrix of shape num_topics x num_words
    (per-topic asymmetric priors; can not be learned from data). The value
    'asymmetric' is rejected.

    Perplexity is estimated and logged from the latest mini-batch every
    `eval_every` model updates. Set to None to disable perplexity estimation.

    `decay` and `offset` are the same as Kappa and Tau_0 in Hoffman et al,
    respectively. `decay` controls how quickly old documents are forgotten,
    while `offset` down-weights early iterations.

    `minimum_probability` controls filtering the topics returned for a
    document (bow).

    `random_state` can be an integer or a numpy.random.RandomState object. It
    sets the state of the random number generator inside the author-topic
    model, e.g. to ensure reproducibility of your experiments.

    `serialized` indicates whether the input corpora to the model are simple
    in-memory lists (`serialized = False`) or saved to the hard-drive
    (`serialized = True`). This behaviour is quite different from other
    Gensim models. Use it if your data is too large to fit into memory. Note
    that calling `AuthorTopicModel.update` with new data may then be
    cumbersome, as it requires all the existing data to be re-serialized.

    `serialization_path` must be set to a filepath if `serialized = True`,
    e.g. `serialization_path = /tmp/serialized_model.mm`, or
    `serialization_path = serialized_model.mm` for the working directory. An
    existing file *cannot* be overwritten; either delete the old file or
    choose a different name.

    Example:

    >>> model = AuthorTopicModel(corpus, num_topics=100, author2doc=author2doc, id2word=id2word)  # train model
    >>> model.update(corpus2)  # update the author-topic model with additional documents

    >>> model = AuthorTopicModel(corpus, num_topics=50, author2doc=author2doc, id2word=id2word, alpha='auto', eval_every=5)  # train asymmetric alpha from data

    """
    # NOTE: no distributed version of this model exists, so the distributed
    # machinery is switched off. Parts of the required infrastructure, such as
    # AuthorTopicState, are already in place.
    self.dispatcher = None
    self.numworkers = 1

    self.id2word = id2word
    if self.id2word is None and corpus is None:
        raise ValueError(
            "at least one of corpus/id2word must be specified, to establish input space dimensionality"
        )

    if self.id2word is not None:
        # one past the largest word id, or zero for an empty mapping
        non_empty = len(self.id2word) > 0
        self.num_terms = (1 + max(self.id2word.keys())) if non_empty else 0
    else:
        logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
        self.id2word = utils.dict_from_corpus(corpus)
        self.num_terms = len(self.id2word)

    if not self.num_terms:
        raise ValueError("cannot compute the author-topic model over an empty collection (no terms)")

    logger.info('Vocabulary consists of %d words.', self.num_terms)

    # author/document bookkeeping starts out empty
    self.author2doc, self.doc2author = {}, {}
    self.author2id, self.id2author = {}, {}
    self.num_authors = 0
    self.total_docs = 0
    self.num_updates = 0

    # plain copies of the user-supplied settings
    self.distributed = False
    self.num_topics = num_topics
    self.chunksize = chunksize
    self.decay = decay
    self.offset = offset
    self.minimum_probability = minimum_probability
    self.passes = passes
    self.update_every = update_every
    self.eval_every = eval_every

    self.serialized = serialized
    if serialized:
        if not serialization_path:
            raise ValueError("If serialized corpora are used, a the path to a folder where the corpus should be saved must be provided (serialized_path).")
        assert not isfile(serialization_path), \
            "A file already exists at the serialization_path path; " \
            "choose a different serialization_path, or delete the file."
    self.serialization_path = serialization_path

    # Initialize an empty self.corpus.
    self.init_empty_corpus()

    self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha')
    alpha_shape = self.alpha.shape
    assert alpha_shape == (self.num_topics,), \
        "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics)

    if isinstance(eta, six.string_types) and eta == 'asymmetric':
        raise ValueError("The 'asymmetric' option cannot be used for eta")

    self.eta, self.optimize_eta = self.init_dir_prior(eta, 'eta')

    self.random_state = utils.get_random_state(random_state)

    topic_term_shape = (self.num_topics, self.num_terms)
    assert (self.eta.shape == (self.num_terms,) or self.eta.shape == topic_term_shape), (
        "Invalid eta shape. Got shape %s, but expected (%d, 1) or (%d, %d)" %
        (str(self.eta.shape), self.num_terms, self.num_topics, self.num_terms)
    )

    # VB constants
    self.iterations = iterations
    self.gamma_threshold = gamma_threshold

    # Initialize the variational distributions q(beta|lambda) and q(theta|gamma)
    self.state = AuthorTopicState(
        self.eta, (self.num_topics, self.num_terms), (self.num_authors, self.num_topics)
    )
    self.state.sstats = self.random_state.gamma(100., 1. / 100., (self.num_topics, self.num_terms))
    self.expElogbeta = np.exp(dirichlet_expectation(self.state.sstats))

    # train right away only when a corpus and an author mapping are both present
    have_author_mapping = author2doc is not None or doc2author is not None
    if corpus is not None and have_author_mapping:
        use_numpy = self.dispatcher is not None
        self.update(corpus, author2doc, doc2author, chunks_as_numpy=use_numpy)
def save_corpus(fname, corpus, id2word=None, metadata=False):
    """Write `corpus` to `fname` in the Mallet format.

    Warnings
    --------
    This function is called automatically by the corpus class's `serialize`
    method; call `serialize` rather than calling this function directly.

    Parameters
    ----------
    fname : str
        Path to output file.
    corpus : iterable of iterable of (int, int)
        Corpus in BoW format.
    id2word : {dict of (int, str), :class:`~gensim.corpora.dictionary.Dictionary`}, optional
        Mapping between word_ids (integers) and words (strings).
        If not provided, the mapping is constructed directly from `corpus`.
    metadata : bool, optional
        If True, every corpus item is read as a pair whose first element is the
        BoW document and whose second element unpacks to `(doc_id, doc_lang)`.
        If False, the document id is the position of the document in `corpus`
        and the language field is the string '__unknown__'.

    Return
    ------
    list of int
        For each document, the position reported by the output file just before
        the document's line is written.

    Notes
    -----
    Each word is written once per unit of its integer weight. Weights that are
    not (almost) integers are truncated, and a warning with their count is
    logged at the end.

    """
    if id2word is None:
        logger.info("no word id mapping provided; initializing from corpus")
        id2word = utils.dict_from_corpus(corpus)

    logger.info("storing corpus in Mallet format into %s", fname)

    truncated = 0
    offsets = []
    with utils.smart_open(fname, 'wb') as fout:
        for position, entry in enumerate(corpus):
            if not metadata:
                doc_id, doc_lang, bow = position, '__unknown__', entry
            else:
                doc_id, doc_lang = entry[1]
                bow = entry[0]

            tokens = []
            for wordid, weight in bow:
                repeats = int(weight)
                if abs(repeats - weight) > 1e-6:
                    truncated += 1
                tokens += [utils.to_unicode(id2word[wordid])] * repeats

            # record where this document starts before emitting it
            offsets.append(fout.tell())
            line = '%s %s %s\n' % (doc_id, doc_lang, ' '.join(tokens))
            fout.write(utils.to_utf8(line))

    if truncated:
        logger.warning(
            "Mallet format can only save vectors with integer elements; "
            "%i float entries were truncated to integer value", truncated)

    return offsets
def __init__(self, dtm_path, corpus=None, time_slices=None, mode='fit', model='dtm',
             num_topics=100, id2word=None, prefix=None, lda_sequence_min_iter=6,
             lda_sequence_max_iter=20, lda_max_em_iter=10, alpha=0.01,
             top_chain_var=0.005, rng_seed=0, initialize_lda=True):
    """Validate the inputs, store the DTM settings and train on `corpus`.

    Parameters
    ----------
    dtm_path : str
        Path to the dtm binary, e.g. `/home/username/dtm/dtm/main`.
    corpus : iterable of iterable of (int, int)
        Collection of texts in BoW format.
    time_slices : list of int
        Number of documents in each time slice; the values must sum to the
        number of documents in `corpus`.
    mode : {'fit', 'time'}, optional
        Controls the mode of the model: 'fit' is for training, 'time' for
        analyzing documents through time according to a DTM, basically a
        held out set.
    model : {'fixed', 'dtm'}, optional
        Controls which model is run: 'fixed' is for DIM and 'dtm' for DTM.
    num_topics : int, optional
        Number of topics.
    id2word : :class:`~gensim.corpora.dictionary.Dictionary`, optional
        Mapping between token ids and words from corpus; if not specified,
        it is inferred from `corpus`.
    prefix : str, optional
        Prefix for produced temporary files. When None, a random prefix inside
        the system temporary directory is generated.
    lda_sequence_min_iter : int, optional
        Min iteration of LDA.
    lda_sequence_max_iter : int, optional
        Max iteration of LDA.
    lda_max_em_iter : int, optional
        Max em optimization iterations in LDA.
    alpha : float, optional
        Hyperparameter that affects sparsity of the document-topics for the
        LDA models in each timeslice.
    top_chain_var : float, optional
        Hyperparameter stored on the instance unchanged.
        NOTE(review): its exact effect is not visible here - confirm against
        the DTM binary's documentation.
    rng_seed : int, optional
        Random seed.
    initialize_lda : bool, optional
        If True - initialize DTM with LDA. Stored as a lower-case string.

    Raises
    ------
    ValueError
        If `dtm_path` is not a file, the vocabulary or the corpus is empty,
        `model == "fixed"` and the corpus contains an empty text, or the time
        slices do not add up to the corpus length.

    """
    if not os.path.isfile(dtm_path):
        raise ValueError("dtm_path must point to the binary file, not to a folder")

    self.dtm_path = dtm_path
    self.id2word = id2word
    if self.id2word is None:
        logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
        self.id2word = utils.dict_from_corpus(corpus)
        self.num_terms = len(self.id2word)
    else:
        # one past the largest word id, or zero for an empty mapping
        self.num_terms = 0 if not self.id2word else 1 + max(self.id2word.keys())
    if self.num_terms == 0:
        raise ValueError("cannot compute DTM over an empty collection (no terms)")
    self.num_topics = num_topics

    try:
        lencorpus = len(corpus)
    except TypeError:
        logger.warning("input corpus stream has no len(); counting documents")
        lencorpus = sum(1 for _ in corpus)
    if lencorpus == 0:
        raise ValueError("cannot compute DTM over an empty corpus")
    if model == "fixed" and any(not text for text in corpus):
        raise ValueError("""There is a text without words in the input corpus. This breaks method='fixed' (The DIM model).""")
    if lencorpus != sum(time_slices):
        # A stray '%' before {slices} used to leak into this message.
        raise ValueError(
            "mismatched timeslices {slices} for corpus of len {clen}"
            .format(slices=sum(time_slices), clen=lencorpus)
        )
    self.lencorpus = lencorpus

    if prefix is None:
        # random hexadecimal tag so that concurrent runs do not share files
        rand_prefix = hex(random.randint(0, 0xffffff))[2:] + '_'
        prefix = os.path.join(tempfile.gettempdir(), rand_prefix)

    self.prefix = prefix
    self.time_slices = time_slices
    self.lda_sequence_min_iter = int(lda_sequence_min_iter)
    self.lda_sequence_max_iter = int(lda_sequence_max_iter)
    self.lda_max_em_iter = int(lda_max_em_iter)
    self.alpha = alpha
    self.top_chain_var = top_chain_var
    self.rng_seed = rng_seed
    self.initialize_lda = str(initialize_lda).lower()

    # results; all empty until training has run
    self.lambda_ = None
    self.obs_ = None
    self.lhood_ = None
    self.gamma_ = None
    self.init_alpha = None
    self.init_beta = None
    self.init_ss = None
    self.em_steps = []
    self.influences_time = []

    if corpus is not None:
        self.train(corpus, time_slices, mode, model)
def save_corpus(fname, corpus, id2word=None, metadata=False):
    """Serialize a BoW corpus into a Mallet-format file.

    Warnings
    --------
    The corpus class's `serialize` method calls this function automatically;
    use `serialize` instead of calling it directly.

    Parameters
    ----------
    fname : str
        Path to output file.
    corpus : iterable of iterable of (int, int)
        Corpus in BoW format.
    id2word : {dict of (int, str), :class:`~gensim.corpora.dictionary.Dictionary`}, optional
        Mapping between word_ids (integers) and words (strings).
        If not provided, the mapping is constructed directly from `corpus`.
    metadata : bool, optional
        When True, each corpus item is treated as `(document, (doc_id, doc_lang))`
        and the id and language written to the file come from the item. When
        False, ids are generated by enumerating the corpus and the language
        field is '__unknown__'.

    Return
    ------
    list of int
        One entry per document: the output file position taken right before
        that document is written.

    Notes
    -----
    A word with weight `n` is written `int(n)` times. Entries whose weight
    differs from its integer part by more than 1e-6 are counted, and a single
    warning reporting that count is logged after the file is written.

    """
    if id2word is None:
        logger.info("no word id mapping provided; initializing from corpus")
        id2word = utils.dict_from_corpus(corpus)

    logger.info("storing corpus in Mallet format into %s", fname)

    n_truncated = 0
    positions = []
    with utils.smart_open(fname, 'wb') as fout:
        for doc_id, item in enumerate(corpus):
            doc_lang = '__unknown__'
            bow = item
            if metadata:
                # the item carries its own id and language
                doc_id, doc_lang = item[1]
                bow = item[0]

            pieces = []
            for wordid, value in bow:
                if abs(int(value) - value) > 1e-6:
                    n_truncated += 1
                word = utils.to_unicode(id2word[wordid])
                pieces.extend(word for _ in range(int(value)))

            positions.append(fout.tell())
            text = ' '.join(pieces)
            fout.write(utils.to_utf8('%s %s %s\n' % (doc_id, doc_lang, text)))

    if n_truncated:
        logger.warning(
            "Mallet format can only save vectors with integer elements; "
            "%i float entries were truncated to integer value", n_truncated
        )

    return positions
def __init__(self, sstm_jar_path, model, corpus, id2word=None, vectors=None, num_topics=20, alpha=0.1, beta=0.01,
             iterations=2000, prefix='results/', name='model', twords=20, sstep=0):
    """

    Parameters
    ----------
    sstm_jar_path : str
        Path to the SSTM jar file.
    model : str
        Name of the model to run; it is upper-cased and must be one of `AVALIABLE_MODELS`.
    corpus : iterable of iterable of (int, int)
        Collection of texts in BoW format. If not None, `self.train(corpus)` is called at the end.
    id2word : :class:`~gensim.corpora.dictionary.Dictionary`
        Mapping between tokens ids and words from corpus. Although the parameter has a `None` default,
        a mapping must be supplied: `None` raises `ValueError`.
    vectors :
        Path to the word2vec file.
    num_topics : int, optional
        Number of topics.
    alpha : float, optional
        Alpha hyperparameter; stored as a list with one copy per topic.
    beta : float, optional
        Beta hyperparameter.
    iterations : int, optional
        Number of training iterations.
    prefix : str, optional
        Prefix for produced temporary files. If None, a random prefix inside the system temp directory is used.
    name : str, optional
        Name of topic model experiment.
    twords : int, optional
        Number of the most probable topical words.
    sstep : int, optional
        Step to save the sampling outputs.

    Raises
    ------
    ValueError
        If `model` is unknown, if `id2word` is None, or if `id2word` is empty.

    """
    self.avaliable_models = AVALIABLE_MODELS
    self.sstm_jar_path = sstm_jar_path
    self.model = model.upper()
    if self.model not in self.avaliable_models:
        raise ValueError("unknown model")

    self.id2word = id2word
    self.vectors = vectors
    # The mapping is mandatory here. The original fallback that built it from the corpus
    # sat after this raise and could never run, so it has been removed.
    if self.id2word is None:
        raise ValueError("no word id mapping provided")

    self.num_terms = 0 if not self.id2word else 1 + max(self.id2word.keys())
    if self.num_terms == 0:
        raise ValueError("empty collection (no terms)")

    self.num_topics = num_topics
    self.alpha = [alpha] * num_topics
    self.beta = beta

    if prefix is None:
        rand_prefix = hex(random.randint(0, 0xffffff))[2:] + '_'
        prefix = os.path.join(tempfile.gettempdir(), rand_prefix)
    self.prefix = prefix

    self.name = name
    self.twords = twords
    self.iterations = iterations
    self.sstep = sstep

    if corpus is not None:
        self.train(corpus)
def __init__(self, corpus=None, num_topics=100, id2word=None, distributed=False,
             chunksize=2000, passes=1, update_every=1, alpha=None, eta=None, decay=0.5):
    """
    `num_topics` is the number of requested latent topics to be extracted from
    the training corpus.

    `id2word` is a mapping from word ids (integers) to words (strings). It is
    used to determine the vocabulary size, as well as for debugging and topic
    printing. If it is not given, it is built from `corpus`; at least one of
    `corpus`/`id2word` must be supplied, otherwise ValueError is raised.

    `alpha` and `eta` are hyperparameters that affect sparsity of the document-topic
    (theta) and topic-word (lambda) distributions. Both default to a symmetric
    1.0/num_topics (but can be set to a vector, for asymmetric priors).

    Turn on `distributed` to force distributed computing (see the web tutorial
    on how to set up a cluster of machines for gensim). If the dispatcher cannot
    be set up, the error is logged and RuntimeError is raised.

    Example:

    >>> lda = LdaModel(corpus, num_topics=100)
    >>> print(lda[doc_bow]) # get topic probability distribution for a document
    >>> lda.update(corpus2) # update the LDA model with additional documents
    >>> print(lda[doc_bow])

    """
    # store user-supplied parameters
    self.id2word = id2word
    if corpus is None and self.id2word is None:
        raise ValueError('at least one of corpus/id2word must be specified, to establish input space dimensionality')

    if self.id2word is None:
        logger.info("no word id mapping provided; initializing from corpus, assuming identity")
        self.id2word = utils.dict_from_corpus(corpus)
        self.num_terms = len(self.id2word)
    else:
        # list() keeps the concatenation valid when keys() returns a view instead of a list;
        # the leading -1 makes an empty mapping yield num_terms == 0
        self.num_terms = 1 + max([-1] + list(self.id2word.keys()))

    if self.num_terms == 0:
        raise ValueError("cannot compute LDA over an empty collection (no terms)")

    self.distributed = bool(distributed)
    self.num_topics = int(num_topics)
    self.chunksize = chunksize
    self.decay = decay
    self.num_updates = 0

    self.passes = passes
    self.update_every = update_every

    if alpha is None:
        self.alpha = 1.0 / num_topics
    else:
        self.alpha = alpha
    if eta is None:
        self.eta = 1.0 / num_topics
    else:
        self.eta = eta

    # VB constants
    self.VAR_MAXITER = 50
    self.VAR_THRESH = 0.001

    # set up distributed environment if necessary
    if not distributed:
        logger.info("using serial LDA version on this node")
        self.dispatcher = None
        self.numworkers = 1
    else:
        # set up distributed version
        try:
            import Pyro4
            dispatcher = Pyro4.Proxy('PYRONAME:gensim.lda_dispatcher')
            dispatcher._pyroOneway.add("exit")
            logger.debug("looking for dispatcher at %s" % str(dispatcher._pyroUri))
            # the dispatcher receives the raw alpha/eta arguments, not the defaults computed above
            dispatcher.initialize(id2word=self.id2word, num_topics=num_topics,
                                  chunksize=chunksize, alpha=alpha, eta=eta, distributed=False)
            self.dispatcher = dispatcher
            self.numworkers = len(dispatcher.getworkers())
            logger.info("using distributed version with %i workers" % self.numworkers)
        except Exception as err:
            # distributed mode was explicitly requested, so failing to set it up is an error
            logger.error("failed to initialize distributed LDA (%s)" % err)
            raise RuntimeError("failed to initialize distributed LDA (%s)" % err)
def __init__(self, corpus=None, num_topics=200, id2word=None, chunksize=20000,
             decay=1.0, distributed=False, onepass=True,
             power_iters=P2_EXTRA_ITERS, extra_samples=P2_EXTRA_DIMS):
    """
    `num_topics` is the number of requested factors (latent dimensions).

    After the model has been trained, you can estimate topics for an
    arbitrary, unseen document, using the ``topics = self[document]`` dictionary
    notation. You can also add new training documents, with ``self.add_documents``,
    so that training can be stopped and resumed at any time, and the
    LSI transformation is available at any point.

    If you specify a `corpus`, it will be used to train the model. See the
    method `add_documents` for a description of the `chunksize` and `decay` parameters.

    `id2word` maps word ids to words; when omitted it is built from `corpus`.
    At least one of `corpus`/`id2word` must be given, otherwise ValueError is raised.

    Turn `onepass` off to force a multi-pass stochastic algorithm.

    `power_iters` and `extra_samples` affect the accuracy of the stochastic
    multi-pass algorithm, which is used either internally (`onepass=True`) or
    as the front-end algorithm (`onepass=False`). Increasing the number of
    power iterations improves accuracy, but lowers performance. See [3]_ for
    some hard numbers.

    Turn on `distributed` to enable distributed computing. The one-pass algorithm
    is then forced; a failure to reach the dispatcher raises RuntimeError.

    Example:

    >>> lsi = LsiModel(corpus, num_topics=10)
    >>> print(lsi[doc_tfidf]) # project some document into LSI space
    >>> lsi.add_documents(corpus2) # update LSI on additional documents
    >>> print(lsi[doc_tfidf])

    .. [3] http://nlp.fi.muni.cz/~xrehurek/nips/rehurek_nips.pdf

    """
    self.id2word = id2word
    self.num_topics = int(num_topics)
    self.chunksize = int(chunksize)
    self.decay = float(decay)
    if distributed:
        if not onepass:
            logger.warning("forcing the one-pass algorithm for distributed LSA")
            onepass = True
    self.onepass = onepass
    self.extra_samples, self.power_iters = extra_samples, power_iters

    if corpus is None and self.id2word is None:
        raise ValueError('at least one of corpus/id2word must be specified, to establish input space dimensionality')

    if self.id2word is None:
        logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
        self.id2word = utils.dict_from_corpus(corpus)
        self.num_terms = len(self.id2word)
    else:
        # list() keeps the concatenation valid when keys() returns a view instead of a list;
        # the leading -1 makes an empty mapping yield num_terms == 0
        self.num_terms = 1 + max([-1] + list(self.id2word.keys()))

    self.docs_processed = 0
    self.projection = Projection(self.num_terms, self.num_topics,
                                 power_iters=self.power_iters, extra_dims=self.extra_samples)

    self.numworkers = 1
    if not distributed:
        logger.info("using serial LSI version on this node")
        self.dispatcher = None
    else:
        # defensive only: onepass was already forced to True above whenever distributed is set
        if not onepass:
            raise NotImplementedError("distributed stochastic LSA not implemented yet; "
                                      "run either distributed one-pass, or serial randomized.")
        try:
            import Pyro4
            dispatcher = Pyro4.Proxy('PYRONAME:gensim.lsi_dispatcher')
            logger.debug("looking for dispatcher at %s" % str(dispatcher._pyroUri))
            dispatcher.initialize(id2word=self.id2word, num_topics=num_topics,
                                  chunksize=chunksize, decay=decay,
                                  power_iters=self.power_iters, extra_samples=self.extra_samples,
                                  distributed=False, onepass=onepass)
            self.dispatcher = dispatcher
            self.numworkers = len(dispatcher.getworkers())
            logger.info("using distributed version with %i workers" % self.numworkers)
        except Exception as err:
            # distributed version was specifically requested, so this is an error state
            logger.error("failed to initialize distributed LSI (%s)" % err)
            raise RuntimeError("failed to initialize distributed LSI (%s)" % err)

    if corpus is not None:
        self.add_documents(corpus)
def __init__(self, dtm_path, corpus=None, time_slices=None, mode='fit', model='dtm', num_topics=100,
             id2word=None, prefix=None, lda_sequence_min_iter=6, lda_sequence_max_iter=20, lda_max_em_iter=10,
             alpha=0.01, top_chain_var=0.005, rng_seed=0, initialize_lda=True):
    """

    Parameters
    ----------
    dtm_path : str
        Path to the dtm binary, e.g. `/home/username/dtm/dtm/main`.
    corpus : iterable of iterable of (int, int)
        Collection of texts in BoW format. Its length is checked unconditionally, so despite the `None`
        default a corpus has to be supplied.
    time_slices : list of int
        Sequence of timestamps. Its sum must equal the number of documents in `corpus`.
    mode : {'fit', 'time'}, optional
        Controls the mode of the mode: 'fit' is for training, 'time' for analyzing documents through time
        according to a DTM, basically a held out set.
    model : {'fixed', 'dtm'}, optional
        Control model that will be runned: 'fixed' is for DIM and 'dtm' for DTM.
    num_topics : int, optional
        Number of topics.
    id2word : :class:`~gensim.corpora.dictionary.Dictionary`, optional
        Mapping between tokens ids and words from corpus, if not specified - will be inferred from `corpus`.
    prefix : str, optional
        Prefix for produced temporary files. If None, a random prefix inside the system temp directory is used.
    lda_sequence_min_iter : int, optional
        Min iteration of LDA.
    lda_sequence_max_iter : int, optional
        Max iteration of LDA.
    lda_max_em_iter : int, optional
        Max em optimization iterations in LDA.
    alpha : int, optional
        Hyperparameter that affects sparsity of the document-topics for the LDA models in each timeslice.
    top_chain_var : int, optional
        Hyperparameter that affects.
    rng_seed : int, optional
         Random seed.
    initialize_lda : bool, optional
         If True - initialize DTM with LDA. Stored as the lower-cased string form of the value.

    Raises
    ------
    ValueError
        If `dtm_path` is not a file, the vocabulary or the corpus is empty, `model` is 'fixed' and some document
        is empty, or the time slices do not add up to the corpus length.

    """
    if not os.path.isfile(dtm_path):
        raise ValueError("dtm_path must point to the binary file, not to a folder")

    self.dtm_path = dtm_path
    self.id2word = id2word
    if self.id2word is None:
        logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
        self.id2word = utils.dict_from_corpus(corpus)
        self.num_terms = len(self.id2word)
    else:
        self.num_terms = 0 if not self.id2word else 1 + max(self.id2word.keys())
    if self.num_terms == 0:
        raise ValueError("cannot compute DTM over an empty collection (no terms)")
    self.num_topics = num_topics

    try:
        lencorpus = len(corpus)
    except TypeError:
        logger.warning("input corpus stream has no len(); counting documents")
        lencorpus = sum(1 for _ in corpus)
    if lencorpus == 0:
        raise ValueError("cannot compute DTM over an empty corpus")
    if model == "fixed" and any(not text for text in corpus):
        raise ValueError(
            """There is a text without words in the input corpus. This breaks method='fixed' (The DIM model).""")
    if lencorpus != sum(time_slices):
        # the message used to contain a stray '%' left over from %-style formatting
        raise ValueError(
            "mismatched timeslices {slices} for corpus of len {clen}"
            .format(slices=sum(time_slices), clen=lencorpus)
        )
    self.lencorpus = lencorpus

    if prefix is None:
        rand_prefix = hex(random.randint(0, 0xffffff))[2:] + '_'
        prefix = os.path.join(tempfile.gettempdir(), rand_prefix)

    self.prefix = prefix
    self.time_slices = time_slices
    self.lda_sequence_min_iter = int(lda_sequence_min_iter)
    self.lda_sequence_max_iter = int(lda_sequence_max_iter)
    self.lda_max_em_iter = int(lda_max_em_iter)
    self.alpha = alpha
    self.top_chain_var = top_chain_var
    self.rng_seed = rng_seed
    self.initialize_lda = str(initialize_lda).lower()

    # results, unset until the model has been trained
    self.lambda_ = None
    self.obs_ = None
    self.lhood_ = None
    self.gamma_ = None
    self.init_alpha = None
    self.init_beta = None
    self.init_ss = None
    self.em_steps = []
    self.influences_time = []

    if corpus is not None:
        self.train(corpus, time_slices, mode, model)
def __init__(self, vw_path, corpus=None, num_topics=100, id2word=None,
             chunksize=256, passes=1, alpha=0.1, eta=0.1, decay=0.5,
             offset=1, gamma_threshold=0.001, random_seed=None,
             cleanup_files=True, tmp_prefix='tmp'):
    """Set up the wrapper around Vowpal Wabbit's LDA and optionally train it.

    `vw_path` is the path to Vowpal Wabbit's 'vw' executable.

    `corpus` is an iterable training corpus. If given, training will
    start immediately, otherwise the model is left untrained (presumably
    because you want to call `update()` manually).

    `num_topics` is the number of requested latent topics to be extracted
    from the training corpus.
    Corresponds to VW's '--lda <num_topics>' argument.

    `id2word` is a mapping from word ids (integers) to words (strings).
    It is used to determine the vocabulary size, as well as for debugging
    and topic printing. When it is None it is derived from `corpus`;
    ValueError is raised if both are None or if the vocabulary is empty.

    `chunksize` is the number of documents examined in each batch.
    Corresponds to VW's '--minibatch <batch_size>' argument.

    `passes` is the number of passes over the dataset to use.
    Corresponds to VW's '--passes <passes>' argument.

    `alpha` is a float effecting sparsity of per-document topic weights.
    This is applied symmetrically, and should be set higher to when
    documents are thought to look more similar.
    Corresponds to VW's '--lda_alpha <alpha>' argument.

    `eta` is a float which affects the sparsity of topic distributions.
    This is applied symmetrically, and should be set higher when topics
    are thought to look more similar.
    Corresponds to VW's '--lda_rho <rho>' argument.

    `decay` learning rate decay, affects how quickly learnt values
    are forgotten. Should be set to a value between 0.5 and 1.0 to
    guarantee convergence.
    Corresponds to VW's '--power_t <tau>' argument.

    `offset` integer learning offset, set to higher values to slow down
    learning on early iterations of the algorithm.
    Corresponds to VW's '--initial_t <tau>' argument.

    `gamma_threshold` affects when learning loop will be broken out of,
    higher values will result in earlier loop completion.
    Corresponds to VW's '--epsilon <eps>' argument.

    `random_seed` sets Vowpal Wabbit's random seed when learning.
    Corresponds to VW's '--random_seed <seed>' argument.

    `cleanup_files` whether or not to delete temporary directory and files
    used by this wrapper. Setting to False can be useful for debugging,
    or for re-using Vowpal Wabbit files elsewhere.

    `tmp_prefix` used to prefix temporary working directory name.
    """
    # default parameters are taken from Vowpal Wabbit's defaults, and
    # parameter names changed to match Gensim's LdaModel where possible
    self.vw_path = vw_path
    self.id2word = id2word

    if self.id2word is not None:
        # vocabulary size is one past the largest known word id, or zero for an empty mapping
        self.num_terms = 1 + max(self.id2word.keys()) if len(self.id2word) > 0 else 0
    else:
        if corpus is None:
            raise ValueError(
                "at least one of corpus/id2word must be specified, to establish input space dimensionality"
            )
        logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
        self.id2word = utils.dict_from_corpus(corpus)
        self.num_terms = len(self.id2word)

    if self.num_terms == 0:
        raise ValueError("cannot compute LDA over an empty collection (no terms)")

    # LDA parameters
    self.num_topics = num_topics
    self.chunksize = chunksize
    self.passes = passes
    self.alpha = alpha
    self.eta = eta
    self.gamma_threshold = gamma_threshold
    self.offset = offset
    self.decay = decay
    self.random_seed = random_seed
    self._initial_offset = offset

    # temporary files used for Vowpal Wabbit input/output
    self.tmp_dir = None
    self.tmp_prefix = tmp_prefix
    self.cleanup_files = cleanup_files
    self._init_temp_dir(tmp_prefix)

    # used for saving/loading this model's state
    self._model_data = None
    self._topics_data = None

    # cache loaded topics as numpy array
    self._topics = None

    if corpus is None:
        return
    self.train(corpus)
def __init__(self, corpus=None, num_topics=100, id2word=None, distributed=False,
             chunksize=2000, passes=1, update_every=1, alpha='symmetric', eta=None, decay=0.5,
             offset=1.0, eval_every=10, iterations=50, gamma_threshold=0.001):
    """
    Set up an LDA model and, when `corpus` is given, start training on it straight away.
    Without a corpus the model is left untrained (presumably because you want to call
    `update()` manually).

    `num_topics` is the number of requested latent topics to be extracted from
    the training corpus.

    `id2word` is a mapping from word ids (integers) to words (strings). It is
    used to determine the vocabulary size, as well as for debugging and topic
    printing. At least one of `corpus`/`id2word` is required.

    `alpha` and `eta` are hyperparameters that affect sparsity of the document-topic
    (theta) and topic-word (lambda) distributions. Both default to a symmetric
    1.0/num_topics prior.

    `alpha` can be set to an explicit array = prior of your choice. It also
    support special values of 'asymmetric' and 'auto': the former uses a fixed
    normalized asymmetric 1.0/topicno prior, the latter learns an asymmetric
    prior directly from your data.

    `eta` can be a scalar for a symmetric prior over topic/word
    distributions, or a matrix of shape num_topics x num_words,
    which can be used to impose asymmetric priors over the word
    distribution on a per-topic basis. This may be useful if you
    want to seed certain topics with particular words by boosting
    the priors for those words.

    Turn on `distributed` to force distributed computing (see the `web tutorial
    <http://radimrehurek.com/gensim/distributed.html>`_ on how to set up
    a cluster of machines for gensim).

    Calculate and log perplexity estimate from the latest mini-batch every
    `eval_every` model updates (setting this to 1 slows down training ~2x;
    default is 10 for better performance). Set to None to disable perplexity estimation.

    `decay` and `offset` parameters are the same as Kappa and Tau_0 in
    Hoffman et al, respectively.

    Example:

    >>> lda = LdaModel(corpus, num_topics=100)  # train model
    >>> print(lda[doc_bow]) # get topic probability distribution for a document
    >>> lda.update(corpus2) # update the LDA model with additional documents
    >>> print(lda[doc_bow])

    >>> lda = LdaModel(corpus, num_topics=50, alpha='auto', eval_every=5)  # train asymmetric alpha from data

    """
    # store user-supplied parameters
    self.id2word = id2word
    if corpus is None and self.id2word is None:
        raise ValueError('at least one of corpus/id2word must be specified, to establish input space dimensionality')

    if self.id2word is not None:
        # vocabulary size is one past the largest known word id, or zero for an empty mapping
        self.num_terms = 1 + max(self.id2word.keys()) if len(self.id2word) > 0 else 0
    else:
        logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
        self.id2word = utils.dict_from_corpus(corpus)
        self.num_terms = len(self.id2word)

    if self.num_terms == 0:
        raise ValueError("cannot compute LDA over an empty collection (no terms)")

    self.distributed = bool(distributed)
    self.num_topics = int(num_topics)
    self.chunksize = chunksize
    self.decay = decay
    self.offset = offset
    self.num_updates = 0

    self.passes = passes
    self.update_every = update_every
    self.eval_every = eval_every

    # resolve the document-topic prior into an array with one entry per topic
    self.optimize_alpha = alpha == 'auto'
    if alpha == 'symmetric' or alpha is None:
        logger.info("using symmetric alpha at %s" % (1.0 / num_topics))
        self.alpha = numpy.asarray([1.0 / num_topics for _ in xrange(num_topics)])
    elif alpha == 'asymmetric':
        self.alpha = numpy.asarray([1.0 / (topic_no + numpy.sqrt(num_topics)) for topic_no in xrange(num_topics)])
        self.alpha /= self.alpha.sum()
        logger.info("using asymmetric alpha %s" % list(self.alpha))
    elif alpha == 'auto':
        self.alpha = numpy.asarray([1.0 / num_topics for _ in xrange(num_topics)])
        logger.info("using autotuned alpha, starting with %s" % list(self.alpha))
    else:
        # must be either float or an array of floats, of size num_topics
        if isinstance(alpha, numpy.ndarray):
            self.alpha = alpha
        else:
            self.alpha = numpy.asarray([alpha] * num_topics)
        if len(self.alpha) != num_topics:
            raise RuntimeError("invalid alpha shape (must match num_topics)")

    self.eta = 1.0 / num_topics if eta is None else eta

    # VB constants
    self.iterations = iterations
    self.gamma_threshold = gamma_threshold

    # set up distributed environment if necessary
    if distributed:
        if self.optimize_alpha:
            raise NotImplementedError("auto-optimizing alpha not implemented in distributed LDA")
        # set up distributed version
        try:
            import Pyro4
            proxy = Pyro4.Proxy('PYRONAME:gensim.lda_dispatcher')
            logger.debug("looking for dispatcher at %s" % str(proxy._pyroUri))
            # the dispatcher receives the raw alpha/eta arguments, not the resolved priors
            proxy.initialize(id2word=self.id2word, num_topics=num_topics,
                             chunksize=chunksize, alpha=alpha, eta=eta, distributed=False)
            self.dispatcher = proxy
            self.numworkers = len(proxy.getworkers())
            logger.info("using distributed version with %i workers" % self.numworkers)
        except Exception as err:
            failure = "failed to initialize distributed LDA (%s)" % err
            logger.error(failure)
            raise RuntimeError(failure)
    else:
        logger.info("using serial LDA version on this node")
        self.dispatcher = None
        self.numworkers = 1

    # Initialize the variational distribution q(beta|lambda)
    stats_shape = (self.num_topics, self.num_terms)
    self.state = LdaState(self.eta, stats_shape)
    self.state.sstats = numpy.random.gamma(100., 1. / 100., stats_shape)
    self.sync_state()

    # if a training corpus was provided, start estimating the model right away
    if corpus is not None:
        self.update(corpus)
def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_topics=10,
             initialize='gensim', sstats=None, lda_model=None, obs_variance=0.5, chain_variance=0.005, passes=10,
             random_state=None, lda_inference_max_iter=25, em_min_iter=6, em_max_iter=20, chunksize=100):
    """
    `corpus` is any iterable gensim corpus

    `time_slice` as described above is a list which contains the number of documents in each time-slice.
    Its length sizes the topic chains, so passing None raises TypeError.

    `id2word` is a mapping from word ids (integers) to words (strings). It is used to determine the vocabulary size
    and printing topics. If None, it is built from `corpus`; ValueError is raised when both are None.

    `alphas`  is a prior of your choice and should be a double or float value. default is 0.01

    `num_topics` is the number of requested latent topics to be extracted from the training corpus.

    `initalize` allows the user to decide how he wants to initialise the DTM model. Default is through gensim LDA.
    You can use your own sstats of an LDA model previously trained as well by specifying 'own' and passing a numpy
    matrix through sstats. If you wish to just pass a previously used LDA model, pass it through `lda_model`
    (with `initialize='ldamodel'`). Shape of sstats is (vocab_len, num_topics)

    `obs_variance` and `chain_variance` are handed to every topic chain and to `init_ldaseq_ss`.
    `chain_variance` is a constant which dictates how the beta values evolve - it is a gaussian parameter
    defined in the beta distribution.

    `passes` is the number of passes of the initial LdaModel.

    `random_state` can be a numpy.random.RandomState object or the seed for one, for the LdaModel.

    `lda_inference_max_iter`, `em_min_iter`, `em_max_iter` and `chunksize` are forwarded to `fit_lda_seq`,
    which runs only when both `corpus` and `time_slice` are given.
    """
    self.id2word = id2word
    if corpus is None and self.id2word is None:
        raise ValueError('at least one of corpus/id2word must be specified, to establish input space dimensionality')

    if self.id2word is None:
        logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
        self.id2word = utils.dict_from_corpus(corpus)
    # every original branch (inferred, non-empty, empty) amounted to the mapping's length
    self.vocab_len = len(self.id2word)

    max_doc_len = 0
    if corpus is not None:
        try:
            self.corpus_len = len(corpus)
        except Exception:
            # was a bare except, which also swallowed KeyboardInterrupt/SystemExit
            logger.warning("input corpus stream has no len(); counting documents")
            self.corpus_len = sum(1 for _ in corpus)

        # the longest document is only measurable when a corpus was actually supplied
        for doc in corpus:
            max_doc_len = max(max_doc_len, len(doc))
    self.max_doc_len = max_doc_len

    self.time_slice = time_slice
    self.num_topics = num_topics
    self.num_time_slices = len(time_slice)
    self.alphas = numpy.full(num_topics, alphas)

    # topic_chains contains for each topic a 'state space language model' object which in turn has information
    # about each topic. The sslm class contains information on topic-word probabilities and
    # doc-topic probabilities.
    self.topic_chains = [
        sslm(
            num_time_slices=self.num_time_slices, vocab_len=self.vocab_len, num_topics=self.num_topics,
            chain_variance=chain_variance, obs_variance=obs_variance
        )
        for _ in range(num_topics)
    ]

    # the following are class variables which are to be integrated during Document Influence Model
    self.top_doc_phis = None
    self.influence = None
    self.renormalized_influence = None
    self.influence_sum_lgl = None

    # if a corpus and time_slice is provided, depending on the user choice of initializing LDA, we start DTM.
    if corpus is not None and time_slice is not None:
        if initialize == 'gensim':
            lda_model = ldamodel.LdaModel(
                corpus, id2word=self.id2word, num_topics=self.num_topics,
                passes=passes, alpha=self.alphas, random_state=random_state
            )
            self.sstats = numpy.transpose(lda_model.state.sstats)
        if initialize == 'ldamodel':
            self.sstats = numpy.transpose(lda_model.state.sstats)
        if initialize == 'own':
            self.sstats = sstats

        # initialize model from sstats
        self.init_ldaseq_ss(chain_variance, obs_variance, self.alphas, self.sstats)

        # fit DTM
        self.fit_lda_seq(corpus, lda_inference_max_iter, em_min_iter, em_max_iter, chunksize)
def __init__(self, corpus=None, num_topics=200, id2word=None, chunksize=20000,
             decay=1.0, distributed=False, onepass=True,
             power_iters=P2_EXTRA_ITERS, extra_samples=P2_EXTRA_DIMS):
    """
    `num_topics` is the number of requested factors (latent dimensions).

    After the model has been trained, you can estimate topics for an
    arbitrary, unseen document, using the ``topics = self[document]`` dictionary
    notation. You can also add new training documents, with ``self.add_documents``,
    so that training can be stopped and resumed at any time, and the
    LSI transformation is available at any point.

    If you specify a `corpus`, it will be used to train the model. See the
    method `add_documents` for a description of the `chunksize` and `decay` parameters.

    `id2word` maps word ids to words; when omitted it is built from `corpus`.
    At least one of `corpus`/`id2word` must be given, otherwise ValueError is raised.

    Turn `onepass` off to force a multi-pass stochastic algorithm.

    `power_iters` and `extra_samples` affect the accuracy of the stochastic
    multi-pass algorithm, which is used either internally (`onepass=True`) or
    as the front-end algorithm (`onepass=False`). Increasing the number of
    power iterations improves accuracy, but lowers performance. See [3]_ for
    some hard numbers.

    Turn on `distributed` to enable distributed computing. The one-pass algorithm
    is then forced; a failure to reach the dispatcher raises RuntimeError.

    Example:

    >>> lsi = LsiModel(corpus, num_topics=10)
    >>> print(lsi[doc_tfidf]) # project some document into LSI space
    >>> lsi.add_documents(corpus2) # update LSI on additional documents
    >>> print(lsi[doc_tfidf])

    .. [3] http://nlp.fi.muni.cz/~xrehurek/nips/rehurek_nips.pdf

    """
    self.id2word = id2word
    self.num_topics = int(num_topics)
    self.chunksize = int(chunksize)
    self.decay = float(decay)
    if distributed:
        if not onepass:
            logger.warning("forcing the one-pass algorithm for distributed LSA")
            onepass = True
    self.onepass = onepass
    self.extra_samples, self.power_iters = extra_samples, power_iters

    if corpus is None and self.id2word is None:
        raise ValueError(
            'at least one of corpus/id2word must be specified, to establish input space dimensionality'
        )

    if self.id2word is None:
        logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
        self.id2word = utils.dict_from_corpus(corpus)
        self.num_terms = len(self.id2word)
    else:
        # list() keeps the concatenation valid when keys() returns a view instead of a list;
        # the leading -1 makes an empty mapping yield num_terms == 0
        self.num_terms = 1 + max([-1] + list(self.id2word.keys()))

    self.docs_processed = 0
    self.projection = Projection(self.num_terms, self.num_topics,
                                 power_iters=self.power_iters, extra_dims=self.extra_samples)

    self.numworkers = 1
    if not distributed:
        logger.info("using serial LSI version on this node")
        self.dispatcher = None
    else:
        # defensive only: onepass was already forced to True above whenever distributed is set
        if not onepass:
            raise NotImplementedError(
                "distributed stochastic LSA not implemented yet; "
                "run either distributed one-pass, or serial randomized.")
        try:
            import Pyro4
            dispatcher = Pyro4.Proxy('PYRONAME:gensim.lsi_dispatcher')
            logger.debug("looking for dispatcher at %s", str(dispatcher._pyroUri))
            dispatcher.initialize(id2word=self.id2word, num_topics=num_topics,
                                  chunksize=chunksize, decay=decay,
                                  power_iters=self.power_iters, extra_samples=self.extra_samples,
                                  distributed=False, onepass=onepass)
            self.dispatcher = dispatcher
            self.numworkers = len(dispatcher.getworkers())
            logger.info("using distributed version with %i workers", self.numworkers)
        except Exception as err:
            # distributed version was specifically requested, so this is an error state
            logger.error("failed to initialize distributed LSI (%s)", err)
            raise RuntimeError("failed to initialize distributed LSI (%s)" % err)

    if corpus is not None:
        self.add_documents(corpus)
def __init__(self, dtm_path, corpus=None, time_slices=None, mode='fit', model='dtm', num_topics=100,
             id2word=None, prefix=None, lda_sequence_min_iter=6, lda_sequence_max_iter=20, lda_max_em_iter=10,
             alpha=0.01, top_chain_var=0.005, rng_seed=0, initialize_lda=True):
    """
    `dtm_path` is path to the dtm executable, e.g. `C:/dtm/dtm-win64.exe`.

    `corpus` is a gensim corpus, aka a stream of sparse document vectors. Its length is
    checked unconditionally, so despite the `None` default a corpus has to be supplied.

    `time_slices` is summed and must equal the number of documents in `corpus`.

    `id2word` is a mapping between tokens ids and token; if None it is built from `corpus`.

    `mode` controls the mode of the mode: 'fit' is for training, 'time' for
    analyzing documents through time according to a DTM, basically a held out set.

    `model` controls the choice of model. 'fixed' is for DIM and 'dtm' for DTM.

    `prefix` is the prefix for produced files; if None a random prefix inside the
    system temp directory is used.

    `lda_sequence_min_iter` min iteration of LDA.

    `lda_sequence_max_iter` max iteration of LDA.

    `lda_max_em_iter` max em optimization iterations in LDA.

    `alpha` is a hyperparameter that affects sparsity of the document-topics for the LDA models in each timeslice.

    `top_chain_var` is a hyperparameter that affects.

    `rng_seed` is the random seed.

    `initialize_lda` initialize DTM with LDA. Stored as the lower-cased string form of the value.

    Raises ValueError if `dtm_path` is not a file, the vocabulary or the corpus is empty,
    `model` is 'fixed' and some document is empty, or the time slices do not add up to
    the corpus length.

    """
    if not os.path.isfile(dtm_path):
        raise ValueError("dtm_path must point to the binary file, not to a folder")

    self.dtm_path = dtm_path
    self.id2word = id2word
    if self.id2word is None:
        logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
        self.id2word = utils.dict_from_corpus(corpus)
        self.num_terms = len(self.id2word)
    else:
        self.num_terms = 0 if not self.id2word else 1 + max(self.id2word.keys())
    if self.num_terms == 0:
        raise ValueError("cannot compute DTM over an empty collection (no terms)")
    self.num_topics = num_topics

    try:
        lencorpus = len(corpus)
    except Exception:
        # was a bare except, which also swallowed KeyboardInterrupt/SystemExit
        logger.warning("input corpus stream has no len(); counting documents")
        lencorpus = sum(1 for _ in corpus)
    if lencorpus == 0:
        raise ValueError("cannot compute DTM over an empty corpus")
    if model == "fixed" and any(not text for text in corpus):
        raise ValueError(
            """There is a text without words in the input corpus.
This breaks method='fixed' (The DIM model).""")
    if lencorpus != sum(time_slices):
        # the message used to contain a stray '%' left over from %-style formatting
        raise ValueError(
            "mismatched timeslices {slices} for corpus of len {clen}"
            .format(slices=sum(time_slices), clen=lencorpus)
        )
    self.lencorpus = lencorpus

    if prefix is None:
        rand_prefix = hex(random.randint(0, 0xffffff))[2:] + '_'
        prefix = os.path.join(tempfile.gettempdir(), rand_prefix)

    self.prefix = prefix
    self.time_slices = time_slices
    self.lda_sequence_min_iter = int(lda_sequence_min_iter)
    self.lda_sequence_max_iter = int(lda_sequence_max_iter)
    self.lda_max_em_iter = int(lda_max_em_iter)
    self.alpha = alpha
    self.top_chain_var = top_chain_var
    self.rng_seed = rng_seed
    self.initialize_lda = str(initialize_lda).lower()

    # results, unset until the model has been trained
    self.lambda_ = None
    self.obs_ = None
    self.lhood_ = None
    self.gamma_ = None
    self.init_alpha = None
    self.init_beta = None
    self.init_ss = None
    self.em_steps = []
    self.influences_time = []

    if corpus is not None:
        self.train(corpus, time_slices, mode, model)
def __init__(self, corpus=None, num_topics=200, id2word=None, chunksize=20000,
             decay=1.0, distributed=False, onepass=True,
             power_iters=P2_EXTRA_ITERS, extra_samples=P2_EXTRA_DIMS, dtype=np.float64):
    """Build an `LsiModel`, training it right away when `corpus` is provided.

    At least one of `corpus` and `id2word` has to be given so that the dimensionality of the
    input space can be established.

    Parameters
    ----------
    corpus : {iterable of list of (int, float), scipy.sparse.csc}, optional
        Stream of document vectors or sparse matrix of shape (`num_documents`, `num_terms`).
    num_topics : int, optional
        Number of requested factors (latent dimensions).
    id2word : dict of {int: str}, optional
        ID to word mapping. Derived from `corpus` when omitted.
    chunksize : int, optional
        Number of documents to be used in each training chunk.
    decay : float, optional
        Weight of existing observations relatively to new ones.
    distributed : bool, optional
        If True, distributed mode (parallel execution on several machines) is used. This forces
        the one-pass algorithm.
    onepass : bool, optional
        Whether the one-pass algorithm should be used for training. Pass `False` to force a
        multi-pass stochastic algorithm.
    power_iters : int, optional
        Number of power iteration steps to be used. Increasing the number of power iterations
        improves accuracy, but lowers performance.
    extra_samples : int, optional
        Extra samples to be used besides the rank `k`. Can improve accuracy.
    dtype : type, optional
        Enforces a type for elements of the decomposed matrix.

    Raises
    ------
    ValueError
        If both `corpus` and `id2word` are None.
    RuntimeError
        If distributed mode was requested but the dispatcher could not be set up.

    """
    self.id2word = id2word
    self.num_topics = int(num_topics)
    self.chunksize = int(chunksize)
    self.decay = float(decay)

    # Distributed training supports only the single-pass algorithm, so override the request.
    if distributed and not onepass:
        logger.warning("forcing the one-pass algorithm for distributed LSA")
        onepass = True
    self.onepass = onepass

    self.extra_samples = extra_samples
    self.power_iters = power_iters
    self.dtype = dtype

    if self.id2word is None:
        if corpus is None:
            raise ValueError(
                'at least one of corpus/id2word must be specified, to establish input space dimensionality'
            )
        logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
        self.id2word = utils.dict_from_corpus(corpus)
        self.num_terms = len(self.id2word)
    elif self.id2word:
        # Ids need not be contiguous, so size the space by the largest id present.
        self.num_terms = 1 + max(self.id2word.keys())
    else:
        self.num_terms = 0

    self.docs_processed = 0
    self.projection = Projection(
        self.num_terms,
        self.num_topics,
        power_iters=self.power_iters,
        extra_dims=self.extra_samples,
        dtype=dtype
    )

    self.numworkers = 1
    if distributed:
        # Defensive check; `onepass` has already been forced to True above in distributed mode.
        if not onepass:
            raise NotImplementedError(
                "distributed stochastic LSA not implemented yet; "
                "run either distributed one-pass, or serial randomized."
            )
        try:
            import Pyro4

            proxy = Pyro4.Proxy('PYRONAME:gensim.lsi_dispatcher')
            logger.debug("looking for dispatcher at %s", str(proxy._pyroUri))
            proxy.initialize(
                id2word=self.id2word,
                num_topics=num_topics,
                chunksize=chunksize,
                decay=decay,
                power_iters=self.power_iters,
                extra_samples=self.extra_samples,
                distributed=False,
                onepass=onepass
            )
            self.dispatcher = proxy
            self.numworkers = len(proxy.getworkers())
            logger.info("using distributed version with %i workers", self.numworkers)
        except Exception as exc:
            # The caller explicitly asked for distributed mode, so failing to set it up is fatal.
            logger.error("failed to initialize distributed LSI (%s)", exc)
            raise RuntimeError("failed to initialize distributed LSI (%s)" % exc)
    else:
        logger.info("using serial LSI version on this node")
        self.dispatcher = None

    if corpus is not None:
        self.add_documents(corpus)
def __init__(self, vw_path, corpus=None, num_topics=100, id2word=None, chunksize=256,
             passes=1, alpha=0.1, eta=0.1, decay=0.5, offset=1, gamma_threshold=0.001,
             random_seed=None, cleanup_files=True, tmp_prefix='tmp'):
    """Set up the Vowpal Wabbit LDA wrapper and train straight away if `corpus` is given.

    Parameters
    ----------
    vw_path : str
        Path to Vowpal Wabbit's binary.
    corpus : iterable of list of (int, int), optional
        Collection of texts in BoW format. If given, training starts immediately; otherwise call
        :meth:`~gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit.train` or
        :meth:`~gensim.models.wrappers.ldavowpalwabbit.LdaVowpalWabbit.update` manually.
    num_topics : int, optional
        Number of requested latent topics to be extracted from the training corpus.
        Corresponds to VW's ``--lda <num_topics>`` argument.
    id2word : :class:`~gensim.corpora.dictionary.Dictionary`, optional
        Mapping from word ids (integers) to words (strings). Derived from `corpus` when omitted.
    chunksize : int, optional
        Number of documents examined in each batch.
        Corresponds to VW's ``--minibatch <batch_size>`` argument.
    passes : int, optional
        Number of passes over the dataset to use.
        Corresponds to VW's ``--passes <passes>`` argument.
    alpha : float, optional
        Affects the sparsity of per-document topic weights. Applied symmetrically; set it higher
        when documents are thought to look more similar.
        Corresponds to VW's ``--lda_alpha <alpha>`` argument.
    eta : float, optional
        Affects the sparsity of topic distributions. Applied symmetrically; set it higher when
        topics are thought to look more similar.
        Corresponds to VW's ``--lda_rho <rho>`` argument.
    decay : float, optional
        Learning rate decay, affects how quickly learnt values are forgotten. Should be set to a
        value between 0.5 and 1.0 to guarantee convergence.
        Corresponds to VW's ``--power_t <tau>`` argument.
    offset : int, optional
        Learning offset; higher values slow down learning on early iterations of the algorithm.
        Corresponds to VW's ``--initial_t <tau>`` argument.
    gamma_threshold : float, optional
        Affects when the learning loop is broken out of; higher values result in earlier loop
        completion. Corresponds to VW's ``--epsilon <eps>`` argument.
    random_seed : int, optional
        Sets the random seed when learning.
        Corresponds to VW's ``--random_seed <seed>`` argument.
    cleanup_files : bool, optional
        Whether to delete the temporary directory and files used by this wrapper. Setting it to
        False can be useful for debugging, or for re-using Vowpal Wabbit files elsewhere.
    tmp_prefix : str, optional
        Prefix for the temporary working directory name.

    Raises
    ------
    ValueError
        If both `corpus` and `id2word` are None, or if the resulting vocabulary is empty.

    """
    # Defaults follow Vowpal Wabbit's own; parameter names mirror Gensim's LdaModel where possible.
    self.vw_path = vw_path
    self.id2word = id2word

    if self.id2word is not None:
        # Ids need not be contiguous, so size the space by the largest id present.
        self.num_terms = 1 + max(self.id2word.keys()) if len(self.id2word) else 0
    else:
        if corpus is None:
            raise ValueError(
                "at least one of corpus/id2word must be specified, to establish input space dimensionality"
            )
        logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
        self.id2word = utils.dict_from_corpus(corpus)
        self.num_terms = len(self.id2word)

    if self.num_terms == 0:
        raise ValueError("cannot compute LDA over an empty collection (no terms)")

    # LDA hyperparameters.
    self.num_topics = num_topics
    self.chunksize = chunksize
    self.passes = passes
    self.alpha = alpha
    self.eta = eta
    self.gamma_threshold = gamma_threshold
    self.offset = offset
    self.decay = decay
    self.random_seed = random_seed
    # A second copy of the offset passed to the constructor, kept alongside `self.offset`.
    self._initial_offset = offset

    # Working directory bookkeeping for Vowpal Wabbit's input/output files.
    self.tmp_dir = None
    self.tmp_prefix = tmp_prefix
    self.cleanup_files = cleanup_files
    self._init_temp_dir(tmp_prefix)

    # State used when saving/loading this model.
    self._model_data = None
    self._topics_data = None

    # Cache for the loaded topics (numpy array).
    self._topics = None

    if corpus is not None:
        self.train(corpus)