def test_serialize_and_parse(self):
    """Check that a serialize/parse round trip keeps size() and str()."""
    source_hist = self.ordered_sparse_topic_hist
    serialized = source_hist.serialize_to_string()

    # Parse the blob into a fresh histogram with the same topic count.
    restored_hist = OrderedSparseTopicHistogram(self.num_topics)
    restored_hist.parse_from_string(serialized)

    self.assertEqual(restored_hist.size(), source_hist.size())
    self.assertEqual(str(restored_hist), str(source_hist))
Esempio n. 2
0
    def test_serialize_and_parse(self):
        """Check that a serialize/parse round trip keeps size() and str()."""
        source_hist = self.ordered_sparse_topic_hist
        serialized = source_hist.serialize_to_string()

        # Parse the blob into a fresh histogram with the same topic count.
        restored_hist = OrderedSparseTopicHistogram(self.num_topics)
        restored_hist.parse_from_string(serialized)

        self.assertEqual(restored_hist.size(), source_hist.size())
        self.assertEqual(str(restored_hist), str(source_hist))
Esempio n. 3
0
 def parse_from_string(self, document_str):
     """Rebuild the document from a serialized DocumentPB string.

     Resets self.words and self.doc_topic_hist, keeps the parsed message
     in self.document_pb, then walks the words of the message: each one
     is appended to self.words as a Word and self.increase_topic() is
     called with its topic and a count of 1.
     """
     self.words = []
     self.doc_topic_hist = OrderedSparseTopicHistogram(self.num_topics)

     parsed_pb = DocumentPB()
     self.document_pb = parsed_pb
     parsed_pb.ParseFromString(document_str)

     for pb_word in parsed_pb.words:
         occurrence = Word(pb_word.id, pb_word.topic)
         self.words.append(occurrence)
         self.increase_topic(pb_word.topic, 1)
Esempio n. 4
0
    def setUp(self):
        """Build the Model(20) fixture shared by the tests.

        For every word i in [0, 10) a histogram is created in which topic j
        receives count j + 1 for each j in [0, 10 + i). The same increments
        are accumulated into self.model.global_topic_hist.
        """
        self.model = Model(20)

        # initialize self.model.global_topic_hist and
        # self.model.word_topic_hist
        # range() instead of xrange() so the fixture also runs on Python 3.
        for i in range(10):
            ordered_sparse_topic_hist = OrderedSparseTopicHistogram(20)
            for j in range(10 + i):
                ordered_sparse_topic_hist.increase_topic(j, j + 1)
                self.model.global_topic_hist[j] += j + 1
            self.model.word_topic_hist[i] = ordered_sparse_topic_hist
Esempio n. 5
0
    def setUp(self):
        """Build the Model(20) fixture shared by the tests.

        For every word i in [0, 10) a histogram is created in which topic j
        receives count j + 1 for each j in [0, 10 + i). The same increments
        are accumulated into self.model.global_topic_hist.
        """
        self.model = Model(20)

        # initialize self.model.global_topic_hist and
        # self.model.word_topic_hist
        # range() instead of xrange() so the fixture also runs on Python 3.
        for i in range(10):
            ordered_sparse_topic_hist = OrderedSparseTopicHistogram(20)
            for j in range(10 + i):
                ordered_sparse_topic_hist.increase_topic(j, j + 1)
                self.model.global_topic_hist[j] += j + 1
            self.model.word_topic_hist[i] = ordered_sparse_topic_hist
Esempio n. 6
0
    def parse_from_tokens(self, doc_tokens, rand, vocabulary, model=None):
        """Parse the text document from tokens.

        Resets self.words and self.doc_topic_hist, then adds one Word per
        accepted token, each with a topic drawn from rand.

        Args:
            doc_tokens: iterable of tokens making up the document.
            rand: random source; rand.randint(0, num_topics - 1) is used
                to pick the initial topic.
                NOTE(review): presumably a random.Random, whose randint is
                inclusive on both ends -- confirm against callers.
            vocabulary: object whose word_index(token) returns the word
                index; tokens for which it returns -1 are skipped.
            model: optional; when given, tokens for which
                model.has_word(word_index) is false are skipped as well.
        """
        self.words = []
        self.doc_topic_hist = OrderedSparseTopicHistogram(self.num_topics)

        for token in doc_tokens:
            word_index = vocabulary.word_index(token)
            if word_index == -1:
                continue
            # Identity test: "no model supplied" must not depend on how the
            # model object implements equality.
            if model is not None and not model.has_word(word_index):
                continue

            # initialize a random topic for current word
            topic = rand.randint(0, self.num_topics - 1)
            self.words.append(Word(word_index, topic))
            self.doc_topic_hist.increase_topic(topic, 1)
Esempio n. 7
0
 def parse_from_string(self, document_str):
     """Rebuild the document from a serialized DocumentPB string.

     Resets self.words and self.doc_topic_hist, keeps the parsed message
     in self.document_pb, then walks the words of the message: each one
     is appended to self.words as a Word and self.increase_topic() is
     called with its topic and a count of 1.
     """
     self.words = []
     self.doc_topic_hist = OrderedSparseTopicHistogram(self.num_topics)

     parsed_pb = DocumentPB()
     self.document_pb = parsed_pb
     parsed_pb.ParseFromString(document_str)

     for pb_word in parsed_pb.words:
         occurrence = Word(pb_word.id, pb_word.topic)
         self.words.append(occurrence)
         self.increase_topic(pb_word.topic, 1)
Esempio n. 8
0
    def _load_word_topic_hist(self, filename):
        """Load the word-topic histograms N(w|z) from a record file.

        Clears self.word_topic_hist, then reads records from `filename`
        until the reader returns None. Each record is parsed as a
        WordTopicHistogramPB and stored in self.word_topic_hist under the
        message's `word` field.

        Args:
            filename: path of the record file, opened in binary mode.

        Returns:
            True if self.word_topic_hist is non-empty after loading,
            False otherwise.
        """
        logging.info('Loading word_topic_hist matrix N(w|z).')
        self.word_topic_hist.clear()

        # The context manager closes the file even when reading or parsing
        # a record raises.
        with open(filename, "rb") as fp:
            record_reader = RecordReader(fp)
            while True:
                blob = record_reader.read()
                if blob is None:
                    break

                word_topic_hist_pb = WordTopicHistogramPB()
                word_topic_hist_pb.ParseFromString(blob)

                # parse_from_string() takes a serialized string, so the
                # embedded histogram message is serialized again first.
                ordered_sparse_topic_hist = \
                        OrderedSparseTopicHistogram(self.num_topics)
                ordered_sparse_topic_hist.parse_from_string(
                    word_topic_hist_pb.sparse_topic_hist.SerializeToString())
                self.word_topic_hist[word_topic_hist_pb.word] = \
                        ordered_sparse_topic_hist
        return len(self.word_topic_hist) > 0
Esempio n. 9
0
    def _load_word_topic_hist(self, filename):
        """Load the word-topic histograms N(w|z) from a record file.

        Clears self.word_topic_hist, then reads records from `filename`
        until the reader returns None. Each record is parsed as a
        WordTopicHistogramPB and stored in self.word_topic_hist under the
        message's `word` field.

        Args:
            filename: path of the record file, opened in binary mode.

        Returns:
            True if self.word_topic_hist is non-empty after loading,
            False otherwise.
        """
        logging.info('Loading word_topic_hist matrix N(w|z).')
        self.word_topic_hist.clear()

        # The context manager closes the file even when reading or parsing
        # a record raises.
        with open(filename, "rb") as fp:
            record_reader = RecordReader(fp)
            while True:
                blob = record_reader.read()
                if blob is None:
                    break

                word_topic_hist_pb = WordTopicHistogramPB()
                word_topic_hist_pb.ParseFromString(blob)

                # parse_from_string() takes a serialized string, so the
                # embedded histogram message is serialized again first.
                ordered_sparse_topic_hist = \
                        OrderedSparseTopicHistogram(self.num_topics)
                ordered_sparse_topic_hist.parse_from_string(
                        word_topic_hist_pb.sparse_topic_hist.SerializeToString())
                self.word_topic_hist[word_topic_hist_pb.word] = \
                        ordered_sparse_topic_hist
        return len(self.word_topic_hist) > 0
 def setUp(self):
     """Create a 20-topic histogram where topic i (0 <= i < 10) has count i + 1."""
     self.num_topics = 20
     self.ordered_sparse_topic_hist = \
             OrderedSparseTopicHistogram(self.num_topics)
     # range() instead of xrange() so the fixture also runs on Python 3.
     for i in range(10):
         self.ordered_sparse_topic_hist.increase_topic(i, i + 1)
class OrderedSparseTopicHistogramTest(unittest.TestCase):
    """Unit tests for OrderedSparseTopicHistogram.

    The fixture is a histogram built with num_topics = 20 in which topic i
    (0 <= i < 10) has been increased by i + 1; topics 10..19 are untouched.
    """

    def setUp(self):
        self.num_topics = 20
        self.ordered_sparse_topic_hist = \
                OrderedSparseTopicHistogram(self.num_topics)
        for i in range(10):
            self.ordered_sparse_topic_hist.increase_topic(i, i + 1)

    def _assert_sorted_by_count_desc(self):
        """Assert that non_zeros is in non-increasing order of count."""
        counts = [non_zero.count
                  for non_zero in self.ordered_sparse_topic_hist.non_zeros]
        for current, following in zip(counts, counts[1:]):
            self.assertGreaterEqual(current, following)

    def test_ordered_sparse_topic_hist(self):
        """non_zeros lists topics 9..0 with counts 10..1."""
        non_zeros = self.ordered_sparse_topic_hist.non_zeros
        self.assertEqual(10, len(non_zeros))
        for i, non_zero in enumerate(non_zeros):
            self.assertEqual(10 - i - 1, non_zero.topic)
            self.assertEqual(10 - i, non_zero.count)

    def test_num_topics(self):
        self.assertEqual(self.num_topics,
                self.ordered_sparse_topic_hist.num_topics)

    def test_size(self):
        self.assertEqual(10, self.ordered_sparse_topic_hist.size())

    def test_serialize_and_parse(self):
        """A serialize/parse round trip keeps size() and str()."""
        blob = self.ordered_sparse_topic_hist.serialize_to_string()

        sparse_topic_hist = OrderedSparseTopicHistogram(self.num_topics)
        sparse_topic_hist.parse_from_string(blob)

        self.assertEqual(sparse_topic_hist.size(),
                self.ordered_sparse_topic_hist.size())
        self.assertEqual(str(sparse_topic_hist),
                str(self.ordered_sparse_topic_hist))

    def test_count(self):
        for i in range(10):
            self.assertEqual(i + 1, self.ordered_sparse_topic_hist.count(i))
        for i in range(10, 20):
            self.assertEqual(0, self.ordered_sparse_topic_hist.count(i))

    def test_increase_topic(self):
        for i in range(20):
            # Topics 0..9 already hold i + 1 from setUp, so they double;
            # topics 10..19 start from zero.
            expected = 2 * (i + 1) if i < 10 else i + 1
            self.assertEqual(
                    expected,
                    self.ordered_sparse_topic_hist.increase_topic(i, i + 1))

            # The ordering must hold after every single update.
            self._assert_sorted_by_count_desc()

        self.assertEqual(2, self.ordered_sparse_topic_hist.count(0))
        self.assertEqual(12, self.ordered_sparse_topic_hist.count(5))
        self.assertEqual(11, self.ordered_sparse_topic_hist.count(10))
        self.assertEqual(16, self.ordered_sparse_topic_hist.count(15))
        self.assertEqual(20, self.ordered_sparse_topic_hist.increase_topic(15, 4))

    def test_decrease_topic(self):
        self.assertEqual(6, self.ordered_sparse_topic_hist.count(5))
        self.assertEqual(7, self.ordered_sparse_topic_hist.count(6))
        self.assertEqual(5, self.ordered_sparse_topic_hist.decrease_topic(5, 1))
        self.assertEqual(3, self.ordered_sparse_topic_hist.decrease_topic(6, 4))
        self.assertEqual(10, self.ordered_sparse_topic_hist.size())
        self.assertEqual(5, self.ordered_sparse_topic_hist.count(5))
        self.assertEqual(3, self.ordered_sparse_topic_hist.count(6))

        self._assert_sorted_by_count_desc()

        # Dropping topic 6 to zero is expected to shrink size() by one.
        self.assertEqual(0, self.ordered_sparse_topic_hist.decrease_topic(6, 3))
        self.assertEqual(9, self.ordered_sparse_topic_hist.size())
        self._assert_sorted_by_count_desc()
Esempio n. 12
0
class Document(object):
    """A document stored as a list of topic-tagged word occurrences.

    Attributes:
        num_topics: value passed to the constructor; used to size the
            topic histogram and to bound the random initial topics.
        words: list of Word<id, topic> items, or None until one of the
            parse_* methods has been called.
        doc_topic_hist: OrderedSparseTopicHistogram holding N(z|d), or
            None until one of the parse_* methods has been called.
        document_pb: the DocumentPB parsed by parse_from_string(); only
            set once that method has been called.
    """

    def __init__(self, num_topics):
        self.num_topics = num_topics
        self.words = None  # word occurrences of the document,
                           # item fmt: Word<id, topic>
        self.doc_topic_hist = None  # N(z|d)

    def parse_from_tokens(self, doc_tokens, rand, vocabulary, model=None):
        """Parse the text document from tokens.

        Resets self.words and self.doc_topic_hist, then adds one Word per
        accepted token, each with a topic drawn from rand.

        Args:
            doc_tokens: iterable of tokens making up the document.
            rand: random source; rand.randint(0, num_topics - 1) is used
                to pick the initial topic.
                NOTE(review): presumably a random.Random, whose randint is
                inclusive on both ends -- confirm against callers.
            vocabulary: object whose word_index(token) returns the word
                index; tokens for which it returns -1 are skipped.
            model: optional; when given, tokens for which
                model.has_word(word_index) is false are skipped as well.
        """
        self.words = []
        self.doc_topic_hist = OrderedSparseTopicHistogram(self.num_topics)

        for token in doc_tokens:
            word_index = vocabulary.word_index(token)
            if word_index == -1:
                continue
            # Identity test: "no model supplied" must not depend on how the
            # model object implements equality.
            if model is not None and not model.has_word(word_index):
                continue

            # initialize a random topic for current word
            topic = rand.randint(0, self.num_topics - 1)
            self.words.append(Word(word_index, topic))
            self.doc_topic_hist.increase_topic(topic, 1)

    def serialize_to_string(self):
        """Serialize document to DocumentPB string.

        Only the id and topic of every word are written; the topic
        histogram is not part of the message.
        """
        document_pb = DocumentPB()
        for word in self.words:
            word_pb = document_pb.words.add()
            word_pb.id = word.id
            word_pb.topic = word.topic
        return document_pb.SerializeToString()

    def parse_from_string(self, document_str):
        """Parse document from DocumentPB serialized string.

        Resets self.words and self.doc_topic_hist, keeps the parsed
        message in self.document_pb, and rebuilds the topic histogram by
        counting every word's topic once.
        """
        self.words = []
        self.doc_topic_hist = OrderedSparseTopicHistogram(self.num_topics)

        self.document_pb = DocumentPB()
        self.document_pb.ParseFromString(document_str)
        for word_pb in self.document_pb.words:
            self.words.append(Word(word_pb.id, word_pb.topic))
            self.increase_topic(word_pb.topic, 1)

    def num_words(self):
        """Returns the number of word occurrences in the document."""
        return len(self.words)

    def get_words(self):
        """Yields the word occurrences of the document in order."""
        for word in self.words:
            yield word

    def get_topic_count(self, topic):
        """Returns N(z|d).
        """
        return self.doc_topic_hist.count(topic)

    def increase_topic(self, topic, count=1):
        """Adds count to current topic, and returns the updated count.
        """
        return self.doc_topic_hist.increase_topic(topic, count)

    def decrease_topic(self, topic, count=1):
        """Subtracts count from current topic, and returns the updated count.
        """
        return self.doc_topic_hist.decrease_topic(topic, count)

    def __str__(self):
        """Outputs a human-readable representation of the document.

        One line per word, followed by the str() of the topic histogram.
        """
        document_str = [str(word) for word in self.words]
        document_str.append(str(self.doc_topic_hist))
        return '\n'.join(document_str)
Esempio n. 13
0
 def setUp(self):
     """Create a 20-topic histogram where topic i (0 <= i < 10) has count i + 1."""
     self.num_topics = 20
     self.ordered_sparse_topic_hist = \
             OrderedSparseTopicHistogram(self.num_topics)
     # range() instead of xrange() so the fixture also runs on Python 3.
     for i in range(10):
         self.ordered_sparse_topic_hist.increase_topic(i, i + 1)
Esempio n. 14
0
class OrderedSparseTopicHistogramTest(unittest.TestCase):
    """Unit tests for OrderedSparseTopicHistogram.

    The fixture is a histogram built with num_topics = 20 in which topic i
    (0 <= i < 10) has been increased by i + 1; topics 10..19 are untouched.
    """

    def setUp(self):
        self.num_topics = 20
        self.ordered_sparse_topic_hist = \
                OrderedSparseTopicHistogram(self.num_topics)
        for i in range(10):
            self.ordered_sparse_topic_hist.increase_topic(i, i + 1)

    def _assert_sorted_by_count_desc(self):
        """Assert that non_zeros is in non-increasing order of count."""
        counts = [non_zero.count
                  for non_zero in self.ordered_sparse_topic_hist.non_zeros]
        for current, following in zip(counts, counts[1:]):
            self.assertGreaterEqual(current, following)

    def test_ordered_sparse_topic_hist(self):
        """non_zeros lists topics 9..0 with counts 10..1."""
        non_zeros = self.ordered_sparse_topic_hist.non_zeros
        self.assertEqual(10, len(non_zeros))
        for i, non_zero in enumerate(non_zeros):
            self.assertEqual(10 - i - 1, non_zero.topic)
            self.assertEqual(10 - i, non_zero.count)

    def test_num_topics(self):
        self.assertEqual(self.num_topics,
                         self.ordered_sparse_topic_hist.num_topics)

    def test_size(self):
        self.assertEqual(10, self.ordered_sparse_topic_hist.size())

    def test_serialize_and_parse(self):
        """A serialize/parse round trip keeps size() and str()."""
        blob = self.ordered_sparse_topic_hist.serialize_to_string()

        sparse_topic_hist = OrderedSparseTopicHistogram(self.num_topics)
        sparse_topic_hist.parse_from_string(blob)

        self.assertEqual(sparse_topic_hist.size(),
                         self.ordered_sparse_topic_hist.size())
        self.assertEqual(str(sparse_topic_hist),
                         str(self.ordered_sparse_topic_hist))

    def test_count(self):
        for i in range(10):
            self.assertEqual(i + 1, self.ordered_sparse_topic_hist.count(i))
        for i in range(10, 20):
            self.assertEqual(0, self.ordered_sparse_topic_hist.count(i))

    def test_increase_topic(self):
        for i in range(20):
            # Topics 0..9 already hold i + 1 from setUp, so they double;
            # topics 10..19 start from zero.
            expected = 2 * (i + 1) if i < 10 else i + 1
            self.assertEqual(
                expected,
                self.ordered_sparse_topic_hist.increase_topic(i, i + 1))

            # The ordering must hold after every single update.
            self._assert_sorted_by_count_desc()

        self.assertEqual(2, self.ordered_sparse_topic_hist.count(0))
        self.assertEqual(12, self.ordered_sparse_topic_hist.count(5))
        self.assertEqual(11, self.ordered_sparse_topic_hist.count(10))
        self.assertEqual(16, self.ordered_sparse_topic_hist.count(15))
        self.assertEqual(20,
                         self.ordered_sparse_topic_hist.increase_topic(15, 4))

    def test_decrease_topic(self):
        self.assertEqual(6, self.ordered_sparse_topic_hist.count(5))
        self.assertEqual(7, self.ordered_sparse_topic_hist.count(6))
        self.assertEqual(5,
                         self.ordered_sparse_topic_hist.decrease_topic(5, 1))
        self.assertEqual(3,
                         self.ordered_sparse_topic_hist.decrease_topic(6, 4))
        self.assertEqual(10, self.ordered_sparse_topic_hist.size())
        self.assertEqual(5, self.ordered_sparse_topic_hist.count(5))
        self.assertEqual(3, self.ordered_sparse_topic_hist.count(6))

        self._assert_sorted_by_count_desc()

        # Dropping topic 6 to zero is expected to shrink size() by one.
        self.assertEqual(0,
                         self.ordered_sparse_topic_hist.decrease_topic(6, 3))
        self.assertEqual(9, self.ordered_sparse_topic_hist.size())
        self._assert_sorted_by_count_desc()
Esempio n. 15
0
 def __init__(self, num_topics):
     """Create a document with no words and a fresh topic histogram."""
     self.num_topics = num_topics

     # Word occurrences of the document; item fmt: Word<id, topic>.
     self.words = []

     # N(z|d)
     self.doc_topic_hist = OrderedSparseTopicHistogram(num_topics)
Esempio n. 16
0
 def __init__(self, num_topics):
     """Create a document with no words and a fresh topic histogram."""
     self.num_topics = num_topics

     # Word occurrences of the document; item fmt: Word<id, topic>.
     self.words = []

     # N(z|d)
     self.doc_topic_hist = OrderedSparseTopicHistogram(num_topics)