def show_topics(self, topics=10, topn=10, log=False, formatted=True):
    shown = []
    if topics < 0:
        topics = len(self.data)

    topics = min(topics, len(self.data))

    for k in xrange(topics):
        lambdak = list(self.data[k, :])
        lambdak = lambdak / sum(lambdak)

        temp = zip(lambdak, xrange(len(lambdak)))
        temp = sorted(temp, key=lambda x: x[0], reverse=True)

        topic_terms = self.show_topic_terms(temp, topn)
        if formatted:
            topic = self.format_topic(k, topic_terms)

            # assuming we only output formatted topics
            if log:
                logger.info(topic)
        else:
            topic = [k, topic_terms]

        shown.append(topic)

    return shown
def __iter__(self):
    """
    Iteratively yield vectors from the underlying file, in the format (row_no, vector),
    where vector is a list of (col_no, value) 2-tuples.

    Note that the total number of vectors returned is always equal to the number of
    rows specified in the header; empty documents are inserted and yielded where
    appropriate, even if they are not explicitly stored in the Matrix Market file.
    """
    if isinstance(self.input, string_types):
        fin = open(self.input)
    else:
        fin = self.input
        fin.seek(0)
    self.skip_headers(fin)

    previd = -1
    for line in fin:
        docid, termid, val = line.split()
        if not self.transposed:
            termid, docid = docid, termid
        docid, termid, val = int(docid) - 1, int(termid) - 1, float(val)  # -1 because matrix market indexes are 1-based => convert to 0-based
        assert previd <= docid, "matrix columns must come in ascending order"
        if docid != previd:
            # change of document: return the document read so far (its id is prevId)
            if previd >= 0:
                yield previd, document

            # return implicit (empty) documents between previous id and new id
            # too, to keep consistent document numbering and corpus length
            for previd in xrange(previd + 1, docid):
                yield previd, []

            # from now on start adding fields to a new document, with a new id
            previd = docid
            document = []

        document.append((termid, val,))  # add another field to the current document

    # handle the last document, as a special case
    if previd >= 0:
        yield previd, document

    # return empty documents between the last explicit document and the number
    # of documents as specified in the header
    for previd in xrange(previd + 1, self.num_docs):
        yield previd, []
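# Usage sketch (not part of the class above): streaming documents out of a
# Matrix Market file via gensim's MmCorpus, which wraps this reader. The file
# name 'corpus.mm' is a hypothetical example; any corpus previously saved with
# MmCorpus.serialize() would do.
from gensim import corpora

mm = corpora.MmCorpus('corpus.mm')   # lazy reader, nothing loaded into RAM yet
for docno, doc in enumerate(mm):     # each doc is a list of (term_id, value) 2-tuples
    if docno < 3:
        print(docno, doc)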
def from_corpus(corpus):
    """
    Create Dictionary from an existing corpus. This can be useful if you only
    have a term-document BOW matrix (represented by `corpus`), but not the
    original text corpus.

    This will scan the term-document count matrix for all word ids that
    appear in it, then construct and return Dictionary which maps each
    `word_id -> str(word_id)`.
    """
    result = Dictionary()
    max_id = -1
    for docno, document in enumerate(corpus):
        if docno % 10000 == 0:
            logger.info("adding document #%i to %s" % (docno, result))
        result.num_docs += 1
        result.num_nnz += len(document)
        for wordid, word_freq in document:
            max_id = max(wordid, max_id)
            result.num_pos += word_freq
            result.dfs[wordid] = result.dfs.get(wordid, 0) + 1
    # now make sure length(result) == get_max_id(corpus) + 1
    for i in xrange(max_id + 1):
        result.token2id[str(i)] = i
    logger.info("built %s from %i documents (total %i corpus positions)" %
                (result, result.num_docs, result.num_pos))
    return result
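# Usage sketch: recovering a Dictionary when only a BOW corpus is available.
# The mapping is synthetic (word_id -> str(word_id)), which is enough for
# APIs that only need to know the vocabulary size. The toy corpus below is
# illustrative; any iterable of (word_id, count) documents works.
from gensim.corpora import Dictionary

bow_corpus = [[(0, 1), (3, 2)], [(1, 1), (3, 1)]]
recovered = Dictionary.from_corpus(bow_corpus)
print(len(recovered))   # 4 -- ids 0..3, even though id 2 never appears
print(recovered[2])     # '2'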
def show_topics(self, num_topics=-1, num_words=10, log=False, formatted=True):
    """
    Show `num_topics` most significant topics (show all by default).
    For each topic, show `num_words` most significant words (10 words by default).

    Return the shown topics as a list -- a list of strings if `formatted` is
    True, or a list of (value, word) 2-tuples if it's False.

    If `log` is True, also output this result to log.
    """
    shown = []
    if num_topics < 0:
        num_topics = self.num_topics
    for i in xrange(min(num_topics, self.num_topics)):
        if i < len(self.projection.s):
            if formatted:
                topic = self.print_topic(i, topn=num_words)
            else:
                topic = self.show_topic(i, topn=num_words)

            shown.append(topic)
            if log:
                logger.info("topic #%i(%.3f): %s" % (i, self.projection.s[i], topic))
    return shown
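# Usage sketch: training a small LSI model and inspecting its topics. The toy
# texts below are illustrative only; show_topics mirrors the parameters
# documented above (num_topics, num_words, formatted).
from gensim.corpora import Dictionary
from gensim.models import LsiModel

texts = [["human", "interface", "computer"],
         ["graph", "trees", "minors"],
         ["graph", "minors", "survey"]]
dictionary = Dictionary(texts)
bow_corpus = [dictionary.doc2bow(text) for text in texts]

lsi = LsiModel(corpus=bow_corpus, id2word=dictionary, num_topics=2)
for topic in lsi.show_topics(num_topics=2, num_words=3, formatted=True):
    print(topic)   # e.g. '0.577*"graph" + 0.577*"minors" + ...'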
def create_binary_tree(self):
    """
    Create a binary Huffman tree using stored vocabulary word counts. Frequent words
    will have shorter binary codes. Called internally from `build_vocab()`.
    """
    logger.info("constructing a huffman tree from %i words" % len(self.vocab))

    # build the huffman tree
    heap = self.vocab.values()
    heapq.heapify(heap)
    for i in xrange(len(self.vocab) - 1):
        min1, min2 = heapq.heappop(heap), heapq.heappop(heap)
        heapq.heappush(heap, Vocab(count=min1.count + min2.count, index=i + len(self.vocab), left=min1, right=min2))

    # recurse over the tree, assigning a binary code to each vocabulary word
    if heap:
        max_depth, stack = 0, [(heap[0], [], [])]
        while stack:
            node, codes, points = stack.pop()
            if node.index < len(self.vocab):
                # leaf node => store its path from the root
                node.code, node.point = codes, points
                max_depth = max(len(codes), max_depth)
            else:
                # inner node => continue recursion
                points = array(list(points) + [node.index - len(self.vocab)], dtype=uint32)
                stack.append((node.left, array(list(codes) + [0], dtype=uint8), points))
                stack.append((node.right, array(list(codes) + [1], dtype=uint8), points))

        logger.info("built huffman tree with maximum node depth %i" % max_depth)
def save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False):
    """
    Save a corpus in the UCI Bag-of-Words format.

    There are actually two files saved: `fname` and `fname.vocab`, where
    `fname.vocab` is the vocabulary file.

    This function is automatically called by `UciCorpus.serialize`; don't
    call it directly, call `serialize` instead.
    """
    if id2word is None:
        logger.info("no word id mapping provided; initializing from corpus")
        id2word = utils.dict_from_corpus(corpus)
        num_terms = len(id2word)
    else:
        num_terms = 1 + max([-1] + id2word.keys())

    # write out vocabulary
    fname_vocab = fname + '.vocab'
    logger.info("saving vocabulary of %i words to %s" % (num_terms, fname_vocab))
    with open(fname_vocab, 'wb') as fout:
        for featureid in xrange(num_terms):
            fout.write("%s\n" % utils.to_utf8(id2word.get(featureid, '---')))

    logger.info("storing corpus in UCI Bag-of-Words format: %s" % fname)

    return UciWriter.write_corpus(fname, corpus, index=True, progress_cnt=progress_cnt)
def save_corpus(fname, corpus, id2word=None, metadata=False):
    """
    Save a corpus in the LDA-C format.

    There are actually two files saved: `fname` and `fname.vocab`, where
    `fname.vocab` is the vocabulary file.

    This function is automatically called by `BleiCorpus.serialize`; don't
    call it directly, call `serialize` instead.
    """
    if id2word is None:
        logger.info("no word id mapping provided; initializing from corpus")
        id2word = utils.dict_from_corpus(corpus)
        num_terms = len(id2word)
    else:
        num_terms = 1 + max([-1] + id2word.keys())

    logger.info("storing corpus in Blei's LDA-C format into %s" % fname)
    with utils.smart_open(fname, 'w') as fout:
        offsets = []
        for doc in corpus:
            doc = list(doc)
            offsets.append(fout.tell())
            fout.write("%i %s\n" % (len(doc), ' '.join("%i:%s" % p for p in doc if abs(p[1]) > 1e-12)))

    # write out vocabulary, in a format compatible with Blei's topics.py script
    fname_vocab = fname + '.vocab'
    logger.info("saving vocabulary of %i words to %s" % (num_terms, fname_vocab))
    with open(fname_vocab, 'wb') as fout:
        for featureid in xrange(num_terms):
            fout.write("%s\n" % utils.to_utf8(id2word.get(featureid, '---')))

    return offsets
def reset_weights(self):
    """Reset all projection weights to an initial (untrained) state, but keep the existing vocabulary."""
    logger.info("resetting layer weights")
    random.seed(self.seed)
    self.syn0 = empty((len(self.vocab), self.layer1_size), dtype=REAL)
    # randomize weights vector by vector, rather than materializing a huge random matrix in RAM at once
    for i in xrange(len(self.vocab)):
        self.syn0[i] = (random.rand(self.layer1_size) - 0.5) / self.layer1_size
    self.syn1 = zeros((len(self.vocab), self.layer1_size), dtype=REAL)
    self.syn0norm = None
def update_expectations(self):
    """
    Since we're doing lazy updates on lambda, at any given moment
    the current state of lambda may not be accurate. This function
    updates all of the elements of lambda and Elogbeta
    so that if (for example) we want to print out the
    topics we've learned we'll get the correct behavior.
    """
    for w in xrange(self.m_W):
        self.m_lambda[:, w] *= np.exp(self.m_r[-1] - self.m_r[self.m_timestamp[w]])
    self.m_Elogbeta = sp.psi(self.m_eta + self.m_lambda) - \
        sp.psi(self.m_W * self.m_eta + self.m_lambda_sum[:, np.newaxis])

    self.m_timestamp[:] = self.m_updatect
    self.m_status_up_to_date = True
def __iter__(self):
    """
    For each index document, compute cosine similarity against all other
    documents in the index and yield the result.
    """
    # turn off query normalization (vectors in the index are assumed to be already normalized)
    norm = self.normalize
    self.normalize = False

    # Try to compute similarities in bigger chunks of documents (not
    # one query = a single document after another). The point is, a
    # bigger query of N documents is faster than N small queries of one
    # document.
    #
    # After computing similarities of the bigger query in `self[chunk]`,
    # yield the resulting similarities one after another, so that it looks
    # exactly the same as if they had been computed with many small queries.
    try:
        chunking = self.chunksize > 1
    except AttributeError:
        # chunking not supported; fall back to the (slower) mode of 1 query=1 document
        chunking = False
    if chunking:
        # assumes `self.corpus` holds the index as a 2-d numpy array.
        # this is true for MatrixSimilarity and SparseMatrixSimilarity, but
        # may not be true for other (future) classes..?
        for chunk_start in xrange(0, self.index.shape[0], self.chunksize):
            # scipy.sparse doesn't allow slicing beyond real size of the matrix
            # (unlike numpy). so, clip the end of the chunk explicitly to make
            # scipy.sparse happy
            chunk_end = min(self.index.shape[0], chunk_start + self.chunksize)
            chunk = self.index[chunk_start:chunk_end]
            if chunk.shape[0] > 1:
                for sim in self[chunk]:
                    yield sim
            else:
                yield self[chunk]
    else:
        for doc in self.index:
            yield self[doc]

    # restore old normalization value
    self.normalize = norm
def hdp_to_lda(self):
    """
    Compute the LDA model that is almost equivalent to this HDP model,
    returning its (alpha, beta) parameters.
    """
    # alpha
    sticks = self.m_var_sticks[0] / (self.m_var_sticks[0] + self.m_var_sticks[1])
    alpha = np.zeros(self.m_T)
    left = 1.0
    for i in xrange(0, self.m_T - 1):
        alpha[i] = sticks[i] * left
        left = left - alpha[i]
    alpha[self.m_T - 1] = left
    alpha = alpha * self.m_alpha

    # beta
    beta = (self.m_lambda + self.m_eta) / (self.m_W * self.m_eta +
                                           self.m_lambda_sum[:, np.newaxis])

    return (alpha, beta)
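# Usage sketch: approximating a trained HDP model with its closest LDA
# counterpart, as computed above. The toy texts are illustrative; alpha is a
# topic-level Dirichlet prior and beta the topic-word matrix, which can be fed
# into an LDA-style E step.
from gensim.corpora import Dictionary
from gensim.models import HdpModel

texts = [["human", "interface", "computer"],
         ["graph", "trees", "minors"],
         ["graph", "minors", "survey"]]
dictionary = Dictionary(texts)
bow_corpus = [dictionary.doc2bow(text) for text in texts]

hdp = HdpModel(bow_corpus, id2word=dictionary)
alpha, beta = hdp.hdp_to_lda()
print(alpha.shape)   # (T,)    -- one weight per truncated topic
print(beta.shape)    # (T, W)  -- topic-word distributions over the W vocabulary terms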
def compactify(self):
    """
    Assign new word ids to all words.

    This is done to make the ids more compact, e.g. after some tokens have
    been removed via :func:`filter_tokens` and there are gaps in the id series.
    Calling this method will remove the gaps.
    """
    logger.debug("rebuilding dictionary, shrinking gaps")

    # build mapping from old id -> new id
    idmap = dict(izip(itervalues(self.token2id), xrange(len(self.token2id))))

    # reassign mappings to new ids
    self.token2id = dict((token, idmap[tokenid]) for token, tokenid in iteritems(self.token2id))
    self.id2token = {}
    self.dfs = dict((idmap[tokenid], freq) for tokenid, freq in iteritems(self.dfs))
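# Usage sketch: compactify() after filter_tokens(), as the docstring above
# describes. Removing ids can leave gaps in the id sequence; compactify()
# renumbers the surviving tokens to a contiguous 0..len-1 range. (Depending on
# the gensim version, filter_* methods may already compactify for you; calling
# it explicitly is harmless either way.)
from gensim.corpora import Dictionary

dictionary = Dictionary([["a", "b", "c", "d"]])
dictionary.filter_tokens(bad_ids=[1, 2])      # drop two tokens
dictionary.compactify()                       # reassign contiguous ids
print(sorted(dictionary.token2id.values()))   # [0, 1]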
def __init__(self, fname, id2word=None, line2words=split_on_space):
    """
    Initialize the corpus from a file.

    `id2word` and `line2words` are optional parameters.

    If provided, `id2word` is a dictionary mapping between word_ids (integers)
    and words (strings). If not provided, the mapping is constructed from
    the documents.

    `line2words` is a function which converts lines into tokens. Defaults to
    simple splitting on spaces.
    """
    IndexedCorpus.__init__(self, fname)
    logger.info("loading corpus from %s" % fname)

    self.fname = fname  # input file, see class doc for format
    self.line2words = line2words  # how to translate lines into words (simply split on space by default)
    self.num_docs = self._calculate_num_docs()

    if not id2word:
        # build a list of all word types in the corpus (distinct words)
        logger.info("extracting vocabulary from the corpus")
        all_terms = set()
        self.use_wordids = False  # return documents as (word, wordCount) 2-tuples
        for doc in self:
            all_terms.update(word for word, wordCnt in doc)
        all_terms = sorted(all_terms)  # sort the list of all words; rank in that list = word's integer id
        self.id2word = dict(izip(xrange(len(all_terms)), all_terms))  # build a mapping of word id(int) -> word (string)
    else:
        logger.info("using provided word mapping (%i ids)" % len(id2word))
        self.id2word = id2word
    self.word2id = dict((v, k) for k, v in iteritems(self.id2word))
    self.num_terms = len(self.word2id)
    self.use_wordids = True  # return documents as (wordIndex, wordCount) 2-tuples

    logger.info("loaded corpus with %i documents and %i terms from %s" %
                (self.num_docs, self.num_terms, fname))
def lda_e_step(doc_word_ids, doc_word_counts, alpha, beta, max_iter=100):
    gamma = np.ones(len(alpha))
    expElogtheta = np.exp(dirichlet_expectation(gamma))
    betad = beta[:, doc_word_ids]
    phinorm = np.dot(expElogtheta, betad) + 1e-100
    counts = np.array(doc_word_counts)
    for _ in xrange(max_iter):
        lastgamma = gamma

        gamma = alpha + expElogtheta * np.dot(counts / phinorm, betad.T)
        Elogtheta = dirichlet_expectation(gamma)
        expElogtheta = np.exp(Elogtheta)
        phinorm = np.dot(expElogtheta, betad) + 1e-100
        meanchange = np.mean(abs(gamma - lastgamma))
        if (meanchange < meanchangethresh):
            break

    likelihood = np.sum(counts * np.log(phinorm))
    likelihood += np.sum((alpha - gamma) * Elogtheta)
    likelihood += np.sum(sp.gammaln(gamma) - sp.gammaln(alpha))
    likelihood += sp.gammaln(np.sum(alpha)) - sp.gammaln(np.sum(gamma))

    return (likelihood, gamma)
def iter_chunks(self, chunksize=None):
    """
    Iteratively yield the index as chunks of documents, each of size <= chunksize.

    The chunk is returned in its raw form (matrix or sparse matrix slice).
    The size of the chunk may be smaller than requested; it is up to the caller
    to check the result for real length, using `chunk.shape[0]`.
    """
    self.close_shard()

    if chunksize is None:
        # if not explicitly specified, use the chunksize from the constructor
        chunksize = self.chunksize

    for shard in self.shards:
        query = shard.get_index().index
        for chunk_start in xrange(0, query.shape[0], chunksize):
            # scipy.sparse doesn't allow slicing beyond real size of the matrix
            # (unlike numpy). so, clip the end of the chunk explicitly to make
            # scipy.sparse happy
            chunk_end = min(query.shape[0], chunk_start + chunksize)
            chunk = query[chunk_start:chunk_end]  # create a view
            yield chunk
def train(self, sentences, total_words=None, word_count=0, chunksize=100):
    """
    Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
    Each sentence must be a list of utf8 strings.
    """
    if FAST_VERSION < 0:
        import warnings
        warnings.warn("Cython compilation failed, training will be slow. Do you have Cython installed? `pip install cython`")
    logger.info("training model with %i workers on %i vocabulary and %i features" %
                (self.workers, len(self.vocab), self.layer1_size))

    if not self.vocab:
        raise RuntimeError("you must first build vocabulary before training the model")

    start, next_report = time.time(), [1.0]
    word_count, total_words = [word_count], total_words or sum(v.count for v in itervalues(self.vocab))
    jobs = Queue(maxsize=2 * self.workers)  # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :(
    lock = threading.Lock()  # for shared state (=number of words trained so far, log reports...)

    def worker_train():
        """Train the model, lifting lists of sentences from the jobs queue."""
        work = zeros(self.layer1_size, dtype=REAL)  # each thread must have its own work memory
        neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)

        while True:
            job = jobs.get()
            if job is None:  # data finished, exit
                break
            # update the learning rate before every job
            alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * word_count[0] / total_words))
            # how many words did we train on? out-of-vocabulary (unknown) words do not count
            if self.sg:
                job_words = sum(train_sentence_sg(self, sentence, alpha, work) for sentence in job)
            else:
                job_words = sum(train_sentence_cbow(self, sentence, alpha, work, neu1) for sentence in job)
            with lock:
                word_count[0] += job_words
                elapsed = time.time() - start
                if elapsed >= next_report[0]:
                    logger.info("PROGRESS: at %.2f%% words, alpha %.05f, %.0f words/s" %
                                (100.0 * word_count[0] / total_words, alpha, word_count[0] / elapsed if elapsed else 0.0))
                    next_report[0] = elapsed + 1.0  # don't flood the log, wait at least a second between progress reports

    workers = [threading.Thread(target=worker_train) for _ in xrange(self.workers)]
    for thread in workers:
        thread.daemon = True  # make interrupting the process with ctrl+c easier
        thread.start()

    # convert input strings to Vocab objects (or None for OOV words), and start filling the jobs queue
    no_oov = ([self.vocab.get(word, None) for word in sentence] for sentence in sentences)
    for job_no, job in enumerate(utils.grouper(no_oov, chunksize)):
        logger.debug("putting job #%i in the queue, qsize=%i" % (job_no, jobs.qsize()))
        jobs.put(job)
    logger.info("reached the end of input; waiting to finish %i outstanding jobs" % jobs.qsize())
    for _ in xrange(self.workers):
        jobs.put(None)  # give the workers heads up that they can finish -- no more work!

    for thread in workers:
        thread.join()

    elapsed = time.time() - start
    logger.info("training on %i words took %.1fs, %.0f words/s" %
                (word_count[0], elapsed, word_count[0] / elapsed if elapsed else 0.0))

    return word_count[0]
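# Usage sketch: the usual build_vocab() -> train() sequence that the method
# above expects. The sentences here are toy data; in practice `sentences`
# would be a restartable iterator over tokenized text.
from gensim.models import Word2Vec

sentences = [["human", "interface", "computer"],
             ["survey", "user", "computer", "system", "response", "time"],
             ["graph", "minors", "trees"]]

model = Word2Vec(size=20, min_count=1, workers=2)   # no corpus given => training deferred
model.build_vocab(sentences)                        # builds the vocabulary (and Huffman tree)
trained_words = model.train(sentences)              # returns the effective word count
print(trained_words)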
def stochastic_svd(corpus, rank, num_terms, chunksize=20000, extra_dims=None,
                   power_iters=0, dtype=numpy.float64, eps=1e-6):
    """
    Run truncated Singular Value Decomposition (SVD) on a sparse input.

    Return (U, S): the left singular vectors and the singular values of the input
    data stream `corpus` [4]_. The corpus may be larger than RAM (iterator of vectors).

    This may return less than the requested number of top `rank` factors, in case
    the input itself is of lower rank. The `extra_dims` (oversampling) and especially
    `power_iters` (power iterations) parameters affect accuracy of the decomposition.

    This algorithm uses `2+power_iters` passes over the input data. In case you can only
    afford a single pass, set `onepass=True` in :class:`LsiModel` and avoid using this
    function directly.

    The decomposition algorithm is based on
    **Halko, Martinsson, Tropp. Finding structure with randomness, 2009.**

    .. [4] If `corpus` is a scipy.sparse matrix instead, it is assumed the whole
       corpus fits into core memory and a different (more efficient) code path is chosen.
    """
    rank = int(rank)
    if extra_dims is None:
        samples = max(10, 2 * rank)  # use more samples than requested factors, to improve accuracy
    else:
        samples = rank + int(extra_dims)
    logger.info("using %i extra samples and %i power iterations" % (samples - rank, power_iters))

    num_terms = int(num_terms)

    # first phase: construct the orthonormal action matrix Q = orth(Y) = orth((A * A.T)^q * A * O)
    # build Y in blocks of `chunksize` documents (much faster than going one-by-one
    # and more memory friendly than processing all documents at once)
    y = numpy.zeros(dtype=dtype, shape=(num_terms, samples))
    logger.info("1st phase: constructing %s action matrix" % str(y.shape))

    if scipy.sparse.issparse(corpus):
        m, n = corpus.shape
        assert num_terms == m, "mismatch in number of features: %i in sparse matrix vs. %i parameter" % (m, num_terms)
        o = numpy.random.normal(0.0, 1.0, (n, samples)).astype(y.dtype)  # draw a random gaussian matrix
        sparsetools.csc_matvecs(m, n, samples, corpus.indptr, corpus.indices,
                                corpus.data, o.ravel(), y.ravel())  # y = corpus * o
        del o

        # unlike numpy, scipy.sparse `astype()` copies everything, even if there is no change to dtype!
        # so check for equal dtype explicitly, to avoid the extra memory footprint if possible
        if y.dtype != dtype:
            y = y.astype(dtype)

        logger.info("orthonormalizing %s action matrix" % str(y.shape))
        y = [y]
        q, _ = matutils.qr_destroy(y)  # orthonormalize the range

        logger.debug("running %i power iterations" % power_iters)
        for power_iter in xrange(power_iters):
            q = corpus.T * q
            q = [corpus * q]
            q, _ = matutils.qr_destroy(q)  # orthonormalize the range after each power iteration step
    else:
        num_docs = 0
        for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)):
            logger.info('PROGRESS: at document #%i' % (chunk_no * chunksize))
            # construct the chunk as a sparse matrix, to minimize memory overhead
            # definitely avoid materializing it as a dense (num_terms x chunksize) matrix!
            s = sum(len(doc) for doc in chunk)
            chunk = matutils.corpus2csc(chunk, num_terms=num_terms, dtype=dtype)  # documents = columns of sparse CSC
            m, n = chunk.shape
            assert m == num_terms
            assert n <= chunksize  # the very last chunk of A is allowed to be smaller in size
            num_docs += n
            logger.debug("multiplying chunk * gauss")
            o = numpy.random.normal(0.0, 1.0, (n, samples)).astype(dtype)  # draw a random gaussian matrix
            sparsetools.csc_matvecs(m, n, samples, chunk.indptr, chunk.indices,  # y = y + chunk * o
                                    chunk.data, o.ravel(), y.ravel())
            del chunk, o
        y = [y]
        q, _ = matutils.qr_destroy(y)  # orthonormalize the range

        for power_iter in xrange(power_iters):
            logger.info("running power iteration #%i" % (power_iter + 1))
            yold = q.copy()
            q[:] = 0.0
            for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)):
                logger.info('PROGRESS: at document #%i/%i' % (chunk_no * chunksize, num_docs))
                chunk = matutils.corpus2csc(chunk, num_terms=num_terms, dtype=dtype)  # documents = columns of sparse CSC
                tmp = chunk.T * yold
                tmp = chunk * tmp
                del chunk
                q += tmp
            del yold
            q = [q]
            q, _ = matutils.qr_destroy(q)  # orthonormalize the range

    qt = q[:, :samples].T.copy()
    del q

    if scipy.sparse.issparse(corpus):
        b = qt * corpus
        logger.info("2nd phase: running dense svd on %s matrix" % str(b.shape))
        u, s, vt = scipy.linalg.svd(b, full_matrices=False)
        del b, vt
    else:
        # second phase: construct the covariance matrix X = B * B.T, where B = Q.T * A
        # again, construct X incrementally, in chunks of `chunksize` documents from the streaming
        # input corpus A, to avoid using O(number of documents) memory
        x = numpy.zeros(shape=(qt.shape[0], qt.shape[0]), dtype=numpy.float64)
        logger.info("2nd phase: constructing %s covariance matrix" % str(x.shape))
        for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)):
            logger.info('PROGRESS: at document #%i/%i' % (chunk_no * chunksize, num_docs))
            chunk = matutils.corpus2csc(chunk, num_terms=num_terms, dtype=qt.dtype)
            b = qt * chunk  # dense * sparse matrix multiply
            del chunk
            x += numpy.dot(b, b.T)  # TODO should call the BLAS routine SYRK, but there is no SYRK wrapper in scipy :(
            del b

        # now we're ready to compute decomposition of the small matrix X
        logger.info("running dense decomposition on %s covariance matrix" % str(x.shape))
        u, s, vt = scipy.linalg.svd(x)  # could use linalg.eigh, but who cares... and svd returns the factors already sorted :)
        s = numpy.sqrt(s)  # sqrt to go back from singular values of X to singular values of B = singular values of the corpus
    q = qt.T.copy()
    del qt

    logger.info("computing the final decomposition")
    keep = clip_spectrum(s**2, rank, discard=eps)
    u = u[:, :keep].copy()
    s = s[:keep]
    u = numpy.dot(q, u)
    return u.astype(dtype), s.astype(dtype)
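# Usage sketch: calling stochastic_svd directly on an in-memory scipy.sparse
# CSC matrix (the faster code path mentioned in footnote [4] above). For a
# streamed corpus you would pass the corpus iterator instead and keep
# chunksize reasonable. The random matrix below is purely illustrative.
import numpy
import scipy.sparse
from gensim.models.lsimodel import stochastic_svd

a = scipy.sparse.rand(1000, 200, density=0.01, format='csc', dtype=numpy.float64)
u, s = stochastic_svd(a, rank=10, num_terms=1000, power_iters=2, extra_dims=10)
print(u.shape, s.shape)   # roughly (1000, 10) and (10,) -- fewer factors if the input is rank-deficient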
def merge(self, other, decay=1.0):
    """
    Merge this Projection with another.

    The content of `other` is destroyed in the process, so pass this function a
    copy of `other` if you need it further.
    """
    if other.u is None:
        # the other projection is empty => do nothing
        return
    if self.u is None:
        # we are empty => result of merge is the other projection, whatever it is
        self.u = other.u.copy()
        self.s = other.s.copy()
        return
    if self.m != other.m:
        raise ValueError("vector space mismatch: update is using %s features, expected %s" %
                         (other.m, self.m))
    logger.info("merging projections: %s + %s" % (str(self.u.shape), str(other.u.shape)))
    m, n1, n2 = self.u.shape[0], self.u.shape[1], other.u.shape[1]
    # TODO Maybe keep the bases as elementary reflectors, without
    # forming explicit matrices with ORGQR.
    # The only operation we ever need is basis^T*basis and basis*component.
    # But how to do that in scipy? And is it fast(er)?

    # find component of u2 orthogonal to u1
    logger.debug("constructing orthogonal component")
    self.u = asfarray(self.u, 'self.u')
    c = numpy.dot(self.u.T, other.u)
    self.u = ascarray(self.u, 'self.u')
    other.u -= numpy.dot(self.u, c)

    other.u = [other.u]  # do some reference magic and call qr_destroy, to save RAM
    q, r = matutils.qr_destroy(other.u)  # q, r = QR(component)
    assert not other.u

    # find the rotation that diagonalizes r
    k = numpy.bmat([[numpy.diag(decay * self.s), numpy.multiply(c, other.s)],
                    [matutils.pad(numpy.array([]).reshape(0, 0), min(m, n2), n1), numpy.multiply(r, other.s)]])
    logger.debug("computing SVD of %s dense matrix" % str(k.shape))
    try:
        # in numpy < 1.1.0, running SVD sometimes results in "LinAlgError: SVD did not converge'.
        # for these early versions of numpy, catch the error and try to compute
        # SVD again, but over k*k^T.
        # see http://www.mail-archive.com/[email protected]/msg07224.html and
        # bug ticket http://projects.scipy.org/numpy/ticket/706
        # sdoering: replaced numpy's linalg.svd with scipy's linalg.svd:
        u_k, s_k, _ = scipy.linalg.svd(k, full_matrices=False)  # TODO *ugly overkill*!! only need first self.k SVD factors... but there is no LAPACK wrapper for partial svd/eigendecomp in numpy :( //sdoering: maybe there is one in scipy?
    except scipy.linalg.LinAlgError:
        logger.error("SVD(A) failed; trying SVD(A * A^T)")
        u_k, s_k, _ = scipy.linalg.svd(numpy.dot(k, k.T), full_matrices=False)  # if this fails too, give up with an exception
        s_k = numpy.sqrt(s_k)  # go back from eigen values to singular values

    k = clip_spectrum(s_k**2, self.k)
    u1_k, u2_k, s_k = numpy.array(u_k[:n1, :k]), numpy.array(u_k[n1:, :k]), s_k[:k]

    # update & rotate current basis U = [U, U']*[U1_k, U2_k]
    logger.debug("updating orthonormal basis U")
    self.s = s_k
    self.u = ascarray(self.u, 'self.u')
    self.u = numpy.dot(self.u, u1_k)

    q = ascarray(q, 'q')
    q = numpy.dot(q, u2_k)
    self.u += q

    # make each column of U start with a non-negative number (to force canonical decomposition)
    if self.u.shape[0] > 0:
        for i in xrange(self.u.shape[1]):
            if self.u[0, i] < 0.0:
                self.u[:, i] *= -1.0
def _vowelinstem(self):
    """True <=> 0,...j contains a vowel"""
    return not all(self._cons(i) for i in xrange(self.j + 1))
def iteritems(self):
    for i in xrange(self.num_terms):
        yield i, str(i)
def inference(self, chunk, collect_sstats=False):
    """
    Given a chunk of sparse document vectors, estimate gamma (parameters
    controlling the topic weights) for each document in the chunk.

    This function does not modify the model (=is read-only aka const). The
    whole input chunk of document is assumed to fit in RAM; chunking of a
    large corpus must be done earlier in the pipeline.

    If `collect_sstats` is True, also collect sufficient statistics needed
    to update the model's topic-word distributions, and return a 2-tuple
    `(gamma, sstats)`. Otherwise, return `(gamma, None)`. `gamma` is of shape
    `len(chunk) x topics`.
    """
    try:
        _ = len(chunk)
    except:
        chunk = list(chunk)  # convert iterators/generators to plain list, so we have len() etc.
    if len(chunk) > 1:
        logger.debug("performing inference on a chunk of %i documents" % len(chunk))

    # Initialize the variational distribution q(theta|gamma) for the chunk
    gamma = numpy.random.gamma(100., 1. / 100., (len(chunk), self.num_topics))
    Elogtheta = dirichlet_expectation(gamma)
    expElogtheta = numpy.exp(Elogtheta)
    if collect_sstats:
        sstats = numpy.zeros_like(self.expElogbeta)
    else:
        sstats = None
    converged = 0

    # Now, for each document d update that document's gamma and phi
    # Inference code copied from Hoffman's `onlineldavb.py` (esp. the
    # Lee&Seung trick which speeds things up by an order of magnitude, compared
    # to Blei's original LDA-C code, cool!).
    for d, doc in enumerate(chunk):
        ids = [id for id, _ in doc]
        cts = numpy.array([cnt for _, cnt in doc])
        gammad = gamma[d, :]
        Elogthetad = Elogtheta[d, :]
        expElogthetad = expElogtheta[d, :]
        expElogbetad = self.expElogbeta[:, ids]

        # The optimal phi_{dwk} is proportional to expElogthetad_k * expElogbetad_w.
        # phinorm is the normalizer.
        phinorm = numpy.dot(expElogthetad, expElogbetad) + 1e-100  # TODO treat zeros explicitly, instead of adding eps?

        # Iterate between gamma and phi until convergence
        for _ in xrange(self.iterations):
            lastgamma = gammad
            # We represent phi implicitly to save memory and time.
            # Substituting the value of the optimal phi back into
            # the update for gamma gives this update. Cf. Lee&Seung 2001.
            gammad = self.alpha + expElogthetad * numpy.dot(cts / phinorm, expElogbetad.T)
            Elogthetad = dirichlet_expectation(gammad)
            expElogthetad = numpy.exp(Elogthetad)
            phinorm = numpy.dot(expElogthetad, expElogbetad) + 1e-100
            # If gamma hasn't changed much, we're done.
            meanchange = numpy.mean(abs(gammad - lastgamma))
            if (meanchange < self.gamma_threshold):
                converged += 1
                break
        gamma[d, :] = gammad
        if collect_sstats:
            # Contribution of document d to the expected sufficient
            # statistics for the M step.
            sstats[:, ids] += numpy.outer(expElogthetad.T, cts / phinorm)

    if len(chunk) > 1:
        logger.info("%i/%i documents converged within %i iterations" %
                    (converged, len(chunk), self.iterations))

    if collect_sstats:
        # This step finishes computing the sufficient statistics for the
        # M step, so that
        # sstats[k, w] = \sum_d n_{dw} * phi_{dwk}
        # = \sum_d n_{dw} * exp{Elogtheta_{dk} + Elogbeta_{kw}} / phinorm_{dw}.
        sstats *= self.expElogbeta
    return gamma, sstats
def load_word2vec_format(cls, fname, fvocab=None, binary=False, norm_only=True):
    """
    Load the input-hidden weight matrix from the original C word2vec-tool format.

    Note that the information stored in the file is incomplete (the binary tree is missing),
    so while you can query for word similarity etc., you cannot continue training
    with a model loaded this way.

    `binary` is a boolean indicating whether the data is in binary word2vec format.
    `norm_only` is a boolean indicating whether to only store normalised word2vec vectors in memory.
    Word counts are read from `fvocab` filename, if set (this is the file generated
    by `-save-vocab` flag of the original C tool).
    """
    counts = None
    if fvocab is not None:
        logger.info("loading word counts from %s" % (fvocab))
        counts = {}
        with utils.smart_open(fvocab) as fin:
            for line in fin:
                word, count = line.strip().split()
                counts[word] = int(count)

    logger.info("loading projection weights from %s" % (fname))
    with utils.smart_open(fname) as fin:
        header = fin.readline()
        vocab_size, layer1_size = map(int, header.split())  # throws for invalid file format
        result = Word2Vec(size=layer1_size)
        result.syn0 = zeros((vocab_size, layer1_size), dtype=REAL)
        if binary:
            binary_len = dtype(REAL).itemsize * layer1_size
            for line_no in xrange(vocab_size):
                # mixed text and binary: read text first, then binary
                word = []
                while True:
                    ch = fin.read(1)
                    if ch == ' ':
                        word = ''.join(word)
                        break
                    if ch != '\n':  # ignore newlines in front of words (some binary files have newline, some not)
                        word.append(ch)
                if counts is None:
                    result.vocab[word] = Vocab(index=line_no, count=vocab_size - line_no)
                elif counts.has_key(word):
                    result.vocab[word] = Vocab(index=line_no, count=counts[word])
                else:
                    logger.warning("vocabulary file is incomplete")
                    result.vocab[word] = Vocab(index=line_no, count=None)
                result.index2word.append(word)
                result.syn0[line_no] = fromstring(fin.read(binary_len), dtype=REAL)
        else:
            for line_no, line in enumerate(fin):
                parts = line.split()
                if len(parts) != layer1_size + 1:
                    raise ValueError("invalid vector on line %s (is this really the text format?)" % (line_no))
                word, weights = parts[0], map(REAL, parts[1:])
                if counts is None:
                    result.vocab[word] = Vocab(index=line_no, count=vocab_size - line_no)
                elif counts.has_key(word):
                    result.vocab[word] = Vocab(index=line_no, count=counts[word])
                else:
                    logger.warning("vocabulary file is incomplete")
                    result.vocab[word] = Vocab(index=line_no, count=None)
                result.index2word.append(word)
                result.syn0[line_no] = weights
    logger.info("loaded %s matrix from %s" % (result.syn0.shape, fname))
    result.init_sims(norm_only)
    return result
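# Usage sketch: loading vectors trained by the original C word2vec tool and
# querying them. 'vectors.bin' is a hypothetical path to a binary-format file;
# as noted in the docstring above, training cannot be continued on a model
# loaded this way.
from gensim.models import Word2Vec

model = Word2Vec.load_word2vec_format('vectors.bin', binary=True)
print(model.most_similar(positive=['woman', 'king'], negative=['man'], topn=3))
print(model.similarity('woman', 'man'))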
def update(self, corpus, chunksize=None, decay=None, passes=None,
           update_every=None, eval_every=None, iterations=None, gamma_threshold=None):
    """
    Train the model with new documents, by EM-iterating over `corpus` until
    the topics converge (or until the maximum number of allowed iterations
    is reached). `corpus` must be an iterable (repeatable stream of documents).

    In distributed mode, the E step is distributed over a cluster of machines.

    This update also supports updating an already trained model (`self`)
    with new documents from `corpus`; the two models are then merged in
    proportion to the number of old vs. new documents. This feature is still
    experimental for non-stationary input streams.

    For stationary input (no topic drift in new documents), on the other hand,
    this equals the online update of Hoffman et al. and is guaranteed to
    converge for any `decay` in (0.5, 1.0>.
    """
    # use parameters given in constructor, unless user explicitly overrode them
    if chunksize is None:
        chunksize = self.chunksize
    if decay is None:
        decay = self.decay
    if passes is None:
        passes = self.passes
    if update_every is None:
        update_every = self.update_every
    if eval_every is None:
        eval_every = self.eval_every
    if iterations is None:
        iterations = self.iterations
    if gamma_threshold is None:
        gamma_threshold = self.gamma_threshold

    # rho is the "speed" of updating; TODO try other fncs
    rho = lambda: pow(1.0 + self.num_updates, -decay)

    try:
        lencorpus = len(corpus)
    except:
        logger.warning("input corpus stream has no len(); counting documents")
        lencorpus = sum(1 for _ in corpus)
    if lencorpus == 0:
        logger.warning("LdaModel.update() called with an empty corpus")
        return
    self.state.numdocs += lencorpus

    if update_every:
        updatetype = "online"
        updateafter = min(lencorpus, update_every * self.numworkers * chunksize)
    else:
        updatetype = "batch"
        updateafter = lencorpus
    evalafter = min(lencorpus, (eval_every or 0) * self.numworkers * chunksize)

    updates_per_pass = max(1, lencorpus / updateafter)
    logger.info("running %s LDA training, %s topics, %i passes over "
                "the supplied corpus of %i documents, updating model once "
                "every %i documents, evaluating perplexity every %i documents, "
                "iterating %i with a convergence threshold of %i" %
                (updatetype, self.num_topics, passes, lencorpus, updateafter,
                 evalafter, iterations, gamma_threshold))

    if updates_per_pass * passes < 10:
        logger.warning("too few updates, training might not converge; consider "
                       "increasing the number of passes or iterations to improve accuracy")

    for pass_ in xrange(passes):
        if self.dispatcher:
            logger.info('initializing %s workers' % self.numworkers)
            self.dispatcher.reset(self.state)
        else:
            other = LdaState(self.eta, self.state.sstats.shape)
        dirty = False

        reallen = 0
        for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize, as_numpy=True)):
            reallen += len(chunk)  # keep track of how many documents we've processed so far

            if eval_every and ((reallen == lencorpus) or ((chunk_no + 1) % (eval_every * self.numworkers) == 0)):
                self.log_perplexity(chunk, total_docs=lencorpus)

            if self.dispatcher:
                # add the chunk to dispatcher's job queue, so workers can munch on it
                logger.info('PROGRESS: pass %i, dispatching documents up to #%i/%i' %
                            (pass_, chunk_no * chunksize + len(chunk), lencorpus))
                # this will eventually block until some jobs finish, because the queue has a small finite length
                self.dispatcher.putjob(chunk)
            else:
                logger.info('PROGRESS: pass %i, at document #%i/%i' %
                            (pass_, chunk_no * chunksize + len(chunk), lencorpus))
                gammat = self.do_estep(chunk, other)

                if self.optimize_alpha:
                    self.update_alpha(gammat, rho)

            dirty = True
            del chunk

            # perform an M step. determine when based on update_every, don't do this after every chunk
            if update_every and (chunk_no + 1) % (update_every * self.numworkers) == 0:
                if self.dispatcher:
                    # distributed mode: wait for all workers to finish
                    logger.info("reached the end of input; now waiting for all remaining jobs to finish")
                    other = self.dispatcher.getstate()
                self.do_mstep(rho(), other)
                del other  # free up some mem

                if self.dispatcher:
                    logger.info('initializing workers')
                    self.dispatcher.reset(self.state)
                else:
                    other = LdaState(self.eta, self.state.sstats.shape)
                dirty = False
        # endfor single corpus iteration
        if reallen != lencorpus:
            raise RuntimeError("input corpus size changed during training (don't use generators as input)")

        if dirty:
            # finish any remaining updates
            if self.dispatcher:
                # distributed mode: wait for all workers to finish
                logger.info("reached the end of input; now waiting for all remaining jobs to finish")
                other = self.dispatcher.getstate()
            self.do_mstep(rho(), other)
            del other
            dirty = False
def __init__(self, corpus=None, num_topics=100, id2word=None, distributed=False,
             chunksize=2000, passes=1, update_every=1, alpha='symmetric', eta=None,
             decay=0.5, eval_every=10, iterations=50, gamma_threshold=0.001):
    """
    If given, start training from the iterable `corpus` straight away. If not given,
    the model is left untrained (presumably because you want to call `update()` manually).

    `num_topics` is the number of requested latent topics to be extracted from
    the training corpus.

    `id2word` is a mapping from word ids (integers) to words (strings). It is
    used to determine the vocabulary size, as well as for debugging and topic
    printing.

    `alpha` and `eta` are hyperparameters that affect sparsity of the document-topic
    (theta) and topic-word (lambda) distributions. Both default to a symmetric
    1.0/num_topics prior.

    `alpha` can also be set to an explicit array = prior of your choice. It also
    supports special values of 'asymmetric' and 'auto': the former uses a fixed
    normalized asymmetric 1.0/topicno prior, the latter learns an asymmetric
    prior directly from your data.

    Turn on `distributed` to force distributed computing (see the `web tutorial <http://radimrehurek.com/gensim/distributed.html>`_
    on how to set up a cluster of machines for gensim).

    Calculate and log perplexity estimate from the latest mini-batch every
    `eval_every` model updates (setting this to 1 slows down training ~2x;
    default is 10 for better performance). Set to None to disable perplexity estimation.

    Example:

    >>> lda = LdaModel(corpus, num_topics=100)  # train model
    >>> print(lda[doc_bow])  # get topic probability distribution for a document
    >>> lda.update(corpus2)  # update the LDA model with additional documents
    >>> print(lda[doc_bow])

    >>> lda = LdaModel(corpus, num_topics=50, alpha='auto', eval_every=5)  # train asymmetric alpha from data

    """
    # store user-supplied parameters
    self.id2word = id2word
    if corpus is None and self.id2word is None:
        raise ValueError('at least one of corpus/id2word must be specified, to establish input space dimensionality')

    if self.id2word is None:
        logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
        self.id2word = utils.dict_from_corpus(corpus)
        self.num_terms = len(self.id2word)
    elif len(self.id2word) > 0:
        self.num_terms = 1 + max(self.id2word.keys())
    else:
        self.num_terms = 0

    if self.num_terms == 0:
        raise ValueError("cannot compute LDA over an empty collection (no terms)")

    self.distributed = bool(distributed)
    self.num_topics = int(num_topics)
    self.chunksize = chunksize
    self.decay = decay
    self.num_updates = 0

    self.passes = passes
    self.update_every = update_every
    self.eval_every = eval_every

    self.optimize_alpha = alpha == 'auto'
    if alpha == 'symmetric' or alpha is None:
        logger.info("using symmetric alpha at %s" % (1.0 / num_topics))
        self.alpha = numpy.asarray([1.0 / num_topics for i in xrange(num_topics)])
    elif alpha == 'asymmetric':
        self.alpha = numpy.asarray([1.0 / (i + numpy.sqrt(num_topics)) for i in xrange(num_topics)])
        self.alpha /= self.alpha.sum()
        logger.info("using asymmetric alpha %s" % list(self.alpha))
    elif alpha == 'auto':
        self.alpha = numpy.asarray([1.0 / num_topics for i in xrange(num_topics)])
        logger.info("using autotuned alpha, starting with %s" % list(self.alpha))
    else:
        # must be either float or an array of floats, of size num_topics
        self.alpha = alpha if isinstance(alpha, numpy.ndarray) else numpy.asarray([alpha] * num_topics)
        if len(self.alpha) != num_topics:
            raise RuntimeError("invalid alpha shape (must match num_topics)")

    if eta is None:
        self.eta = 1.0 / num_topics
    else:
        self.eta = eta

    # VB constants
    self.iterations = iterations
    self.gamma_threshold = gamma_threshold

    # set up distributed environment if necessary
    if not distributed:
        logger.info("using serial LDA version on this node")
        self.dispatcher = None
        self.numworkers = 1
    else:
        if self.optimize_alpha:
            raise NotImplementedError("auto-optimizing alpha not implemented in distributed LDA")
        # set up distributed version
        try:
            import Pyro4
            dispatcher = Pyro4.Proxy('PYRONAME:gensim.lda_dispatcher')
            dispatcher._pyroOneway.add("exit")
            logger.debug("looking for dispatcher at %s" % str(dispatcher._pyroUri))
            dispatcher.initialize(id2word=self.id2word, num_topics=num_topics,
                                  chunksize=chunksize, alpha=alpha, eta=eta, distributed=False)
            self.dispatcher = dispatcher
            self.numworkers = len(dispatcher.getworkers())
            logger.info("using distributed version with %i workers" % self.numworkers)
        except Exception as err:
            logger.error("failed to initialize distributed LDA (%s)" % err)
            raise RuntimeError("failed to initialize distributed LDA (%s)" % err)

    # Initialize the variational distribution q(beta|lambda)
    self.state = LdaState(self.eta, (self.num_topics, self.num_terms))
    self.state.sstats = numpy.random.gamma(100., 1. / 100., (self.num_topics, self.num_terms))
    self.sync_state()

    # if a training corpus was provided, start estimating the model right away
    if corpus is not None:
        self.update(corpus)