def test_serialize_and_parse(self):
    """Round-trip a histogram through serialize/parse and check equality."""
    serialized = self.ordered_sparse_topic_hist.serialize_to_string()

    restored = OrderedSparseTopicHistogram(self.num_topics)
    restored.parse_from_string(serialized)

    # The restored histogram must match the original in both size and
    # printable content.
    self.assertEqual(restored.size(), self.ordered_sparse_topic_hist.size())
    self.assertEqual(str(restored), str(self.ordered_sparse_topic_hist))
def parse_from_string(self, document_str):
    """Parse document from DocumentPB serialized string.

    Resets the word list and the per-document topic histogram, then
    rebuilds both from the protobuf payload.
    """
    # Start from a clean slate before repopulating from the blob.
    self.words = []
    self.doc_topic_hist = OrderedSparseTopicHistogram(self.num_topics)

    self.document_pb = DocumentPB()
    self.document_pb.ParseFromString(document_str)

    for pb in self.document_pb.words:
        self.words.append(Word(pb.id, pb.topic))
        self.increase_topic(pb.topic, 1)
def setUp(self):
    self.model = Model(20)
    # Populate self.model.global_topic_hist and self.model.word_topic_hist
    # with deterministic fixture data: word i covers topics 0..9+i, and
    # topic j contributes a count of j + 1.
    for word in xrange(10):
        hist = OrderedSparseTopicHistogram(20)
        for topic in xrange(10 + word):
            hist.increase_topic(topic, topic + 1)
            self.model.global_topic_hist[topic] += topic + 1
        self.model.word_topic_hist[word] = hist
def parse_from_tokens(self, doc_tokens, rand, vocabulary, model=None):
    """Parse the text document from tokens.

    Only tokens present in the vocabulary (and, when a model is given,
    known to the model) are kept. Each retained word is assigned a
    uniformly random initial topic.

    Args:
        doc_tokens: iterable of token strings.
        rand: random generator exposing randint(a, b) (inclusive bounds).
        vocabulary: object with word_index(token) -> int, -1 if absent.
        model: optional model; tokens whose index the model lacks are
            skipped when it is provided.
    """
    self.words = []
    self.doc_topic_hist = OrderedSparseTopicHistogram(self.num_topics)
    for token in doc_tokens:
        word_index = vocabulary.word_index(token)
        # Skip out-of-vocabulary tokens, and (when a model is supplied)
        # tokens the model has no statistics for.
        # NOTE: use 'is None' for identity comparison (was '== None').
        if (word_index != -1 and
                (model is None or model.has_word(word_index))):
            # Initialize a random topic for the current word.
            topic = rand.randint(0, self.num_topics - 1)
            self.words.append(Word(word_index, topic))
            self.doc_topic_hist.increase_topic(topic, 1)
def _load_word_topic_hist(self, filename):
    """Load the word-topic matrix N(w|z) from a record file.

    Each record is a serialized WordTopicHistogramPB; its embedded sparse
    histogram is re-serialized and parsed into an
    OrderedSparseTopicHistogram keyed by word id.

    Returns:
        True iff at least one word histogram was loaded.
    """
    logging.info('Loading word_topic_hist matrix N(w|z).')
    self.word_topic_hist.clear()
    fp = open(filename, "rb")
    # try/finally guarantees the file is closed even if protobuf
    # parsing raises (the original leaked fp on error).
    try:
        record_reader = RecordReader(fp)
        while True:
            blob = record_reader.read()
            if blob is None:  # end of records ('is None', was '== None')
                break
            word_topic_hist_pb = WordTopicHistogramPB()
            word_topic_hist_pb.ParseFromString(blob)
            ordered_sparse_topic_hist = \
                OrderedSparseTopicHistogram(self.num_topics)
            ordered_sparse_topic_hist.parse_from_string(
                word_topic_hist_pb.sparse_topic_hist.SerializeToString())
            self.word_topic_hist[word_topic_hist_pb.word] = \
                ordered_sparse_topic_hist
    finally:
        fp.close()
    return (len(self.word_topic_hist) > 0)
def setUp(self):
    self.num_topics = 20
    self.ordered_sparse_topic_hist = OrderedSparseTopicHistogram(
        self.num_topics)
    # Seed the first ten topics: topic i gets a count of i + 1.
    for topic in xrange(10):
        self.ordered_sparse_topic_hist.increase_topic(topic, topic + 1)
def __init__(self, num_topics):
    """Create an empty document over a vocabulary of num_topics topics."""
    self.num_topics = num_topics
    # Word occurrences of the document; each item is Word<id, topic>.
    self.words = []
    # Per-document topic histogram N(z|d).
    self.doc_topic_hist = OrderedSparseTopicHistogram(num_topics)