def load_corpus(self, corpus_dir):
    """Load corpus from a given directory, then initialize the documents
    and model.

    Line format:
        token1 \t token2 \t token3 \t ...
        ...
    """
    self.documents = []
    rand = random.Random()

    logging.info('Load corpus from %s.' % corpus_dir)
    for root, dirs, files in os.walk(corpus_dir):
        for f in files:
            filename = os.path.join(root, f)
            logging.info('Load filename %s.' % filename)
            with open(filename, 'r') as fp:
                for doc_str in fp:
                    doc_str = doc_str.decode('gbk')
                    doc_tokens = doc_str.strip().split('\t')
                    # Skip documents with fewer than two tokens.
                    if len(doc_tokens) < 2:
                        continue
                    document = Document(self.model.num_topics)
                    document.parse_from_tokens(
                        doc_tokens, rand, self.vocabulary)
                    # Skip documents with fewer than two in-vocabulary words.
                    if document.num_words() < 2:
                        continue
                    self.documents.append(document)
    logging.info('The document number is %d.' % len(self.documents))

    self._initialize_model()
    self._compute_smoothing_only_bucket()
    self._initialize_topic_word_coefficient()
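
# Hedged usage sketch (not part of the original file): load_corpus expects
# one document per line, tokens joined by '\t', stored in GBK-encoded files,
# matching the doc_str.decode('gbk') call above. build_toy_corpus writes such
# a file; the trainer construction is left as a comment because the owning
# class is not shown in this section.
import codecs


def build_toy_corpus(corpus_dir):
    """Write a two-document GBK corpus in the format load_corpus expects."""
    if not os.path.exists(corpus_dir):
        os.makedirs(corpus_dir)
    fp = codecs.open(os.path.join(corpus_dir, 'part-00000'), 'w',
                     encoding='gbk')
    fp.write(u'macbook\tipad\tmac os x\n')  # document 1
    fp.write(u'chrome\tandroid\tnexus\n')   # document 2
    fp.close()

# build_toy_corpus('/tmp/toy_corpus')
# trainer.load_corpus('/tmp/toy_corpus')  # 'trainer' is whatever object owns load_corpus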
def _inference_one_chain(self, doc_tokens, rand):
    """Infer topics with one Markov chain.

    Returns the sparse topic distribution p(z|d).
    """
    document = Document(self.model.num_topics)
    document.parse_from_tokens(doc_tokens, rand, self.vocabulary,
                               self.model)
    if document.num_words() == 0:
        return dict()

    accumulated_topic_hist = {}
    for i in xrange(self.total_iterations):
        # One Gibbs sampling iteration: resample the topic of every word.
        for word in document.words:
            # Remove the word's current topic assignment ...
            document.decrease_topic(word.topic, 1)
            new_topic = self._sample_word_topic(document, word.id, rand)
            assert new_topic is not None
            word.topic = new_topic
            # ... and add back the newly sampled one.
            document.increase_topic(new_topic, 1)
        # After burn-in, accumulate the per-topic counts of this sample.
        if i >= self.burn_in_iterations:
            for non_zero in document.doc_topic_hist.non_zeros:
                if non_zero.topic in accumulated_topic_hist:
                    accumulated_topic_hist[non_zero.topic] += non_zero.count
                else:
                    accumulated_topic_hist[non_zero.topic] = non_zero.count

    topic_dist = self._l1normalize_distribution(accumulated_topic_hist)
    return topic_dist
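
# The _l1normalize_distribution helper is referenced above but not defined in
# this section. A minimal sketch consistent with how it is used (mapping
# accumulated counts to a distribution summing to 1.0); the real
# implementation may differ.
def _l1normalize_distribution(self, topic_hist):
    """Sketch: L1-normalize a dict of topic id -> count into p(z|d)."""
    topic_dist = {}
    total = sum(topic_hist.itervalues())
    if total == 0:
        return topic_dist
    for topic, count in topic_hist.iteritems():
        topic_dist[topic] = float(count) / total
    return topic_dist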
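
# _inference_one_chain runs a single chain; callers typically average several
# independent chains to reduce sampling noise. A hedged sketch of such a
# wrapper follows: the method name 'inference' and the attribute
# 'total_num_markov_chains' are assumptions, not part of this section.
def inference(self, doc_tokens, rand):
    """Sketch: average p(z|d) over several independent Markov chains."""
    accumulated_dist = {}
    for _ in xrange(self.total_num_markov_chains):
        topic_dist = self._inference_one_chain(doc_tokens, rand)
        for topic, prob in topic_dist.iteritems():
            accumulated_dist[topic] = accumulated_dist.get(topic, 0.0) + prob
    # Re-normalizing the summed distributions yields their average.
    return self._l1normalize_distribution(accumulated_dist)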
def test_compute_loglikelihood(self):
    doc_tokens = ['macbook', 'ipad',  # exist in both vocabulary and model
                  'mac os x', 'chrome',  # exist only in vocabulary
                  'nokia', 'null']  # exist in neither
    document = Document(self.model.num_topics)
    rand = random.Random()
    rand.seed(0)
    document.parse_from_tokens(
        doc_tokens, rand, self.vocabulary, self.model)
    documents = [document, document]
    self.assertEqual(-14.113955684239654,
                     self.model_evaluator.compute_loglikelihood(documents))
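
# The expected value above comes from the standard LDA document likelihood:
# log p(d) = sum over modeled words w of log sum_z p(w|z) * p(z|d). A sketch
# of that computation follows; the function name and the dict shapes of its
# arguments are assumptions, and the real ModelEvaluator may differ in detail
# (e.g. smoothing priors).
import math


def doc_loglikelihood(word_topic_dists, doc_topic_dist, word_ids):
    """Sketch: log p(d) for one document.

    word_topic_dists: dict word_id -> {topic: p(w|z)}  (assumed shape)
    doc_topic_dist:   dict topic -> p(z|d)             (assumed shape)
    word_ids:         ids of the document's modeled words
    """
    loglikelihood = 0.0
    for word_id in word_ids:
        # Mixture probability of this word under the document's topics.
        word_prob = 0.0
        for topic, prob in doc_topic_dist.iteritems():
            word_prob += word_topic_dists.get(word_id, {}).get(topic, 0.0) * prob
        if word_prob > 0.0:
            loglikelihood += math.log(word_prob)
    return loglikelihood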