Example #1
def do_mstep_b(d):
    # M-step update of P(z | d) for one document: for each topic,
    # accumulate n(d, w) * P(z | d, w) over the vocabulary, then
    # renormalize. Relies on module-level globals (number_of_topics,
    # vocabulary_size, term_doc_matrix, topic_prob, normalize).
    result = np.zeros([number_of_topics])
    for z in range(number_of_topics):
        s = 0
        for w_index in range(vocabulary_size):
            count = term_doc_matrix[d][w_index]
            s = s + count * topic_prob[d, w_index, z]
        result[z] = s
    normalize(result)
    return result
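All of these snippets come from one PLSA module and rely on module-level state that the excerpts leave out: the imports, the counts (number_of_documents, number_of_topics, vocabulary_size), the arrays (term_doc_matrix, topic_prob, document_topic_prob, topic_word_prob) that the plsa() methods below populate, and a normalize helper that is called everywhere but never shown. A minimal preamble that would make the snippets runnable might look like this; the normalize implementation is an assumption (in-place L1 normalization matches how every caller discards the return value):

import time
import numpy as np
from multiprocessing import Pool


def normalize(vec):
    # Assumed helper, not shown in the source: rescale vec in place
    # so its entries sum to 1; leave an all-zero vector untouched.
    s = vec.sum()
    if s > 0:
        vec /= s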
Example #2
def do_estep(d):
    # E-step for one document: P(z | d, w) is proportional to
    # P(z | d) * P(w | z), normalized over topics for each word.
    # Relies on the module-level globals set up by plsa() below.
    result = np.zeros([vocabulary_size, number_of_topics])

    for w in range(vocabulary_size):
        prob = document_topic_prob[d, :] * topic_word_prob[:, w]
        if sum(prob) == 0.0:
            print('exit')  # degenerate word: every topic assigns zero mass
        else:
            normalize(prob)
        result[w] = prob
    return result
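For reference, the loop above implements the E-step update P(z | d, w) proportional to P(z | d) * P(w | z), normalized over topics separately for each word. A vectorized equivalent under the same assumed globals (a sketch, not part of the source) could be:

def do_estep_vectorized(d):
    # (vocabulary_size, number_of_topics) array of P(z|d) * P(w|z).
    prob = document_topic_prob[d, :] * topic_word_prob.T
    totals = prob.sum(axis=1, keepdims=True)
    # Normalize over topics per word; all-zero rows stay zero instead
    # of triggering the 'exit' branch above.
    return np.divide(prob, totals, out=np.zeros_like(prob),
                     where=totals != 0)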
Example #3
def do_mstep_a(t):
    # M-step update of P(w | z=t): for each word, accumulate
    # n(d, w) * P(z=t | d, w) over all documents, then renormalize.
    # Relies on the module-level globals set up by plsa() below.
    result = np.zeros([vocabulary_size])

    for w_index in range(vocabulary_size):
        s = 0
        for d_index in range(number_of_documents):
            count = term_doc_matrix[d_index][w_index]
            s = s + count * topic_prob[d_index, w_index, t]
        result[w_index] = s
    normalize(result)
    return result
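This is the other half of the M step: P(w | z=t) is proportional to the sum over documents of n(d, w) * P(z=t | d, w), renormalized at the end. The double loop can be collapsed to array operations; a sketch under the same assumed globals:

def do_mstep_a_vectorized(t):
    # Weight each (document, word) count by P(z=t | d, w), then sum
    # over documents to get one unnormalized value per word.
    s = (term_doc_matrix * topic_prob[:, :, t]).sum(axis=0)
    total = s.sum()
    return s / total if total > 0 else s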
Example #4
    def plsa(self, number_of_topics, max_iter):
        '''
        Model topics.
        '''
        print("EM iteration begins...")
        # Get vocabulary and number of documents.
        self.build_vocabulary()
        number_of_documents = len(self.documents)
        vocabulary_size = len(self.vocabulary)

        # Build the document-frequency map used for IDF (Inverse Document
        # Frequency): for each word, count the documents containing it at
        # least once. (Only used by the commented-out TF-IDF weighting below.)
        reverse_word_doc = {}
        for t_doc in self.documents:
            for word in set(t_doc.words):
                reverse_word_doc.setdefault(word, 0)
                reverse_word_doc[word] += 1

        # Build the term-doc matrix (count of each vocabulary word in each
        # document), optionally weighted by TF-IDF (Term Frequency-Inverse
        # Document Frequency), and dump it to disk.
        fp_term_doc_matrix = open('term_doc_matrix.csv', 'w')
        term_doc_matrix = np.zeros([number_of_documents, vocabulary_size],
                                   dtype=np.float64)
        for d_index, doc in enumerate(self.documents):
            term_count = np.zeros(vocabulary_size, dtype=int)
            for word in doc.words:
                # t_idf_item = math.log(1.0 * number_of_documents / (reverse_word_doc[word] + 1))
                if word in self.vocabulary:
                    w_index = self.vocabulary.index(word)
                    term_count[w_index] = term_count[w_index] + 1  # * t_idf_item

            count = 0
            number_of_is_not_zero = 0
            for i in range(vocabulary_size):
                fp_term_doc_matrix.write('%d ' % term_count[i])
                if term_count[i] != 0:
                    number_of_is_not_zero += 1
                count += term_count[i]
            print('index:%d total words in document[index]:%d '
                  'distinct vocabulary words in document[index]:%d'
                  % (d_index, count, number_of_is_not_zero))
            fp_term_doc_matrix.write('\n')
            term_doc_matrix[d_index] = term_count
            # Sanity check: warn if the stored row is all zeros, i.e. no
            # vocabulary word was found in this document.
            if not term_doc_matrix[d_index].any():
                print("bug!")
        fp_term_doc_matrix.close()

        # Create the probability arrays.
        self.document_topic_prob = np.zeros(
            [number_of_documents, number_of_topics],
            dtype=np.float64)  # P(z | d)  z: topic, d: document
        self.topic_word_prob = np.zeros(
            [number_of_topics, len(self.vocabulary)],
            dtype=np.float64)  # P(w | z)  w: word, z: topic
        self.topic_prob = np.zeros(
            [number_of_documents,
             len(self.vocabulary), number_of_topics],
            dtype=np.float64)  # P(z | d, w)

        # Initialize with random values.
        print("Initializing...")
        self.document_topic_prob = np.random.random(size=(number_of_documents,
                                                          number_of_topics))
        for d_index in range(len(self.documents)):
            normalize(self.document_topic_prob[d_index])  # normalize each document
        self.topic_word_prob = np.random.random(size=(number_of_topics,
                                                      len(self.vocabulary)))
        for z in range(number_of_topics):
            normalize(self.topic_word_prob[z])  # normalize each topic

        # Run the EM algorithm.
        for iteration in range(max_iter):
            print("Iteration #" + str(iteration + 1) + "...")
            print("E step:")
            for d_index, document in enumerate(self.documents):
                for w_index in range(vocabulary_size):
                    prob = self.document_topic_prob[
                        d_index, :] * self.topic_word_prob[:, w_index]
                    if sum(prob) == 0.0:
                        print("d_index = " + str(d_index) +
                              ",  w_index = " + str(w_index))
                        print("self.document_topic_prob[d_index, :] = " +
                              str(self.document_topic_prob[d_index, :]))
                        print("self.topic_word_prob[:, w_index] = " +
                              str(self.topic_word_prob[:, w_index]))
                        print("topic_prob[d_index][w_index] = " + str(prob))
                        exit(0)
                    else:
                        normalize(prob)
                    self.topic_prob[d_index][w_index] = prob
            print("M step:")
            # update P(w | z)
            for z in range(number_of_topics):
                for w_index in range(vocabulary_size):
                    s = 0
                    for d_index in range(len(self.documents)):
                        count = term_doc_matrix[d_index][w_index]
                        s = s + count * self.topic_prob[d_index, w_index, z]
                    self.topic_word_prob[z][w_index] = s
                normalize(self.topic_word_prob[z])

            # update P(z | d)
            for d_index in range(len(self.documents)):
                for z in range(number_of_topics):
                    s = 0
                    for w_index in range(vocabulary_size):
                        count = term_doc_matrix[d_index][w_index]
                        s = s + count * self.topic_prob[d_index, w_index, z]
                    self.document_topic_prob[d_index][z] = s
#                print(self.document_topic_prob[d_index])
#                assert(sum(self.document_topic_prob[d_index]) != 0)
                normalize(self.document_topic_prob[d_index])
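A hypothetical driver for this single-process version might look like the sketch below. The corpus construction (the Corpus and Document names and add_document) is an assumption for illustration; only plsa, build_vocabulary, self.documents, and self.vocabulary appear in the excerpt.

# Hypothetical usage; the Corpus/Document construction is assumed.
corpus = Corpus()
for path in ('doc1.txt', 'doc2.txt'):
    corpus.add_document(Document(path))
corpus.plsa(number_of_topics=5, max_iter=30)
print(corpus.document_topic_prob)  # P(z | d): one row per document
print(corpus.topic_word_prob)      # P(w | z): one row per topic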
Example #5
    def plsa(self, nt, max_iter, processes=4):
        '''
        Model topics using multiprocessing.

        Args:
            nt (int): number of topics
            max_iter (int): maximum number of iterations
            processes (int): maximum number of parallel processes (default=4)
        '''
        # Declare the shared state up front (a global statement must precede
        # any assignment to these names): the Pool workers do_estep,
        # do_mstep_a, and do_mstep_b read them at module level.
        global vocabulary_size, number_of_documents, number_of_topics, document_topic_prob, topic_word_prob, term_doc_matrix, topic_prob

        print("EM iteration begins. Num topics: " + str(nt) +
              "; Iterations: " + str(max_iter) +
              "; Processes: " + str(processes))

        # Get vocabulary and number of documents.
        self.build_vocabulary()
        number_of_documents = len(self.documents)
        vocabulary_size = len(self.vocabulary)
        number_of_topics = nt

        # build term-doc matrix (count of each vocabulary word per document)
        term_doc_matrix = np.zeros([number_of_documents, vocabulary_size], dtype=int)
        for d_index, doc in enumerate(self.documents):
            term_count = np.zeros(vocabulary_size, dtype=int)
            for word in doc.words:
                if word in self.vocabulary:
                    w_index = self.vocabulary.index(word)
                    term_count[w_index] = term_count[w_index] + 1
            term_doc_matrix[d_index] = term_count

        # Create the probability arrays.
        document_topic_prob = np.zeros([number_of_documents, number_of_topics], dtype=np.float64)  # P(z | d)
        topic_word_prob = np.zeros([number_of_topics, vocabulary_size], dtype=np.float64)  # P(w | z)
        topic_prob = np.zeros([number_of_documents, vocabulary_size, number_of_topics], dtype=np.float64)  # P(z | d, w)

        # Initialize
        print("Initializing...")

        # randomly assign values
        document_topic_prob = np.random.random(size=(number_of_documents, number_of_topics))
        for d_index in range(number_of_documents):
            normalize(document_topic_prob[d_index])  # normalize for each document
        topic_word_prob = np.random.random(size=(number_of_topics, vocabulary_size))
        for z in range(number_of_topics):
            normalize(topic_word_prob[z])  # normalize for each topic


        # Run the EM algorithm using multiprocessing
        for iteration in range(max_iter):
            start = time.time()

            # E step: one task per document
            topic_prob = []
            pool = Pool(processes)
            TASKS = list(range(number_of_documents))
            jobs = pool.imap(do_estep, TASKS)
            pool.close()
            pool.join()

            finished = False
            while not finished:
                try:
                    topic_prob.append(next(jobs))
                except StopIteration:
                    finished = True
            topic_prob = np.asarray(topic_prob)


            # M step, first part: update P(w | z), one task per topic
            pool = Pool(processes)
            topic_word_prob = []
            TASKS = list(range(number_of_topics))
            jobs = pool.imap(do_mstep_a, TASKS)
            pool.close()
            pool.join()

            finished = False
            while not finished:
                try:
                    topic_word_prob.append(next(jobs))
                except StopIteration:
                    finished = True
            topic_word_prob = np.asarray(topic_word_prob)


            # M step, second part: update P(z | d), one task per document
            pool = Pool(processes)
            document_topic_prob = []
            TASKS = list(range(number_of_documents))
            jobs = pool.imap(do_mstep_b, TASKS)
            pool.close()
            pool.join()

            finished = False
            while not finished:
                try:
                    document_topic_prob.append(next(jobs))
                except StopIteration:
                    finished = True
            document_topic_prob = np.asarray(document_topic_prob)

            print "iteration " + str(iteration) + " completed in " + str(time.time() - start) + " seconds."
            print "document probability variance: " + str( np.var(document_topic_prob))

        self.topic_word_prob = topic_word_prob
        self.document_topic_prob = document_topic_prob
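The worker functions (do_estep, do_mstep_a, do_mstep_b) reach the model arrays through module-level globals, which works with fork-based process start methods: each Pool is created after the globals are updated, so the forked workers inherit the current values. With a spawn start method (the default on Windows and recent macOS), the workers would see empty globals. A common portable alternative, sketched below under that assumption, is to hand the state to each worker through a Pool initializer (the _init_worker name is illustrative, not from the source):

def _init_worker(doc_topic, topic_word, td_matrix):
    # Runs once in every worker process: publish the read-only model
    # state under the global names the worker functions expect.
    global document_topic_prob, topic_word_prob, term_doc_matrix
    document_topic_prob = doc_topic
    topic_word_prob = topic_word
    term_doc_matrix = td_matrix

# pool = Pool(processes, initializer=_init_worker,
#             initargs=(document_topic_prob, topic_word_prob, term_doc_matrix))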