def do_mstep_b(d):
    # M step, part 2: recompute P(z | d) for document d from the posterior topic_prob.
    result = np.zeros([number_of_topics])
    for z in range(number_of_topics):
        s = 0
        for w_index in range(vocabulary_size):
            count = term_doc_matrix[d][w_index]
            s = s + count * topic_prob[d, w_index, z]
        result[z] = s
    normalize(result)
    return result
def do_estep(d):
    # E step: posterior P(z | d, w) for every word of document d.
    result = np.zeros([vocabulary_size, number_of_topics])
    for w in range(vocabulary_size):
        prob = document_topic_prob[d, :] * topic_word_prob[:, w]
        if sum(prob) == 0.0:
            print 'exit'  # degenerate row; left unnormalized (the serial version aborts here)
        else:
            normalize(prob)
        result[w] = prob
    return result
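do_estep loops over the vocabulary in Python for each document. The same computation can be written as one broadcasted NumPy operation; the sketch below is an assumed equivalent that is not part of the original code, and it omits the zero-row guard that do_estep prints 'exit' for.

def do_estep_vectorized(d):
    # P(z | d) * P(w | z) for every (word, topic) pair of document d
    # -> shape (vocabulary_size, number_of_topics)
    prob = document_topic_prob[d, :] * topic_word_prob.T
    # normalize over topics for each word, yielding P(z | d, w)
    prob /= prob.sum(axis=1, keepdims=True)
    return prob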
def do_mstep_a(t):
    # M step, part 1: recompute P(w | z) for topic t from the posterior topic_prob.
    result = np.zeros([vocabulary_size])
    for w_index in range(vocabulary_size):
        s = 0
        for d_index in range(number_of_documents):
            count = term_doc_matrix[d_index][w_index]
            s = s + count * topic_prob[d_index, w_index, t]
        result[w_index] = s
    normalize(result)
    return result
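These worker functions, and the plsa methods below, lean on scaffolding that is not shown: numpy, multiprocessing.Pool, time, and an in-place normalize() helper, plus module-level globals (term_doc_matrix, topic_prob, and the two probability matrices) that the multiprocessing plsa sets up. The author's actual normalize() is not included, so the version below is only a minimal sketch of what the callers appear to expect.

import time
from multiprocessing import Pool

import numpy as np

def normalize(vec):
    # assumed helper: rescale a 1-D float array in place so it sums to 1
    s = np.sum(vec)
    if s != 0:
        vec /= s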
def plsa(self, number_of_topics, max_iter):
    '''
    Model topics.
    '''
    print "EM iteration begins..."

    # Get vocabulary and number of documents.
    self.build_vocabulary()
    number_of_documents = len(self.documents)
    vocabulary_size = len(self.vocabulary)

    # build IDF (Inverse Document Frequency) counts: number of documents containing each word
    reverse_word_doc = {}
    for t_doc in self.documents:
        for word in set(t_doc.words):
            reverse_word_doc.setdefault(word, 0)
            reverse_word_doc[word] += 1

    # build term-doc matrix (number of occurrences of each word in each document);
    # the commented-out factor would turn it into a TF-IDF (Term Frequency-Inverse Document Frequency) matrix
    fp_term_doc_matrix = open('term_doc_matrix.csv', 'w')
    term_doc_matrix = np.zeros([number_of_documents, vocabulary_size], dtype=np.float64)
    for d_index, doc in enumerate(self.documents):
        term_count = np.zeros(vocabulary_size, dtype=np.int)
        for word in doc.words:
            # t_idf_item = math.log(1.0 * number_of_documents / (reverse_word_doc[word] + 1))
            if word in self.vocabulary:
                w_index = self.vocabulary.index(word)
                term_count[w_index] = (term_count[w_index] + 1)  # * t_idf_item
        count = 0
        number_of_is_not_zero = 0
        for i in xrange(vocabulary_size):
            fp_term_doc_matrix.write('%d ' % term_count[i])
            if term_count[i] != 0:
                number_of_is_not_zero += 1
            count += term_count[i]
        print 'index:%d number of words in document[index]:%d number of distinct words in document[index]:%d' % (
            d_index, count, number_of_is_not_zero)
        fp_term_doc_matrix.write('\n')
        term_doc_matrix[d_index] = term_count
        # write the row again from the matrix and check that it is not all zeros
        flag = True
        for i in xrange(vocabulary_size):
            fp_term_doc_matrix.write('%d ' % term_doc_matrix[d_index][i])
            if term_doc_matrix[d_index][i] != 0:
                flag = False
        if flag:
            print "bug!"
    fp_term_doc_matrix.close()

    # Create the counter arrays.
    self.document_topic_prob = np.zeros([number_of_documents, number_of_topics], dtype=np.float)  # P(z | d)  z: topic, d: document
    self.topic_word_prob = np.zeros([number_of_topics, len(self.vocabulary)], dtype=np.float)  # P(w | z)  w: word, z: topic
    self.topic_prob = np.zeros([number_of_documents, len(self.vocabulary), number_of_topics], dtype=np.float)  # P(z | d, w)

    # Initialize
    print "Initializing..."
    # randomly assign values
    self.document_topic_prob = np.random.random(size=(number_of_documents, number_of_topics))
    for d_index in range(len(self.documents)):
        normalize(self.document_topic_prob[d_index])  # normalize for each document
    self.topic_word_prob = np.random.random(size=(number_of_topics, len(self.vocabulary)))
    for z in range(number_of_topics):
        normalize(self.topic_word_prob[z])  # normalize for each topic

    # Run the EM algorithm
    for iteration in range(max_iter):
        print "Iteration #" + str(iteration + 1) + "..."
print "E step:" for d_index, document in enumerate(self.documents): for w_index in range(vocabulary_size): prob = self.document_topic_prob[ d_index, :] * self.topic_word_prob[:, w_index] if sum(prob) == 0.0: print "d_index = " + str( d_index) + ", w_index = " + str(w_index) print "self.document_topic_prob[d_index, :] = " + str( self.document_topic_prob[d_index, :]) print "self.topic_word_prob[:, w_index] = " + str( self.topic_word_prob[:, w_index]) print "topic_prob[d_index][w_index] = " + str(prob) exit(0) else: normalize(prob) self.topic_prob[d_index][w_index] = prob print "M step:" # update P(w | z) for z in range(number_of_topics): for w_index in range(vocabulary_size): s = 0 for d_index in range(len(self.documents)): count = term_doc_matrix[d_index][w_index] s = s + count * self.topic_prob[d_index, w_index, z] self.topic_word_prob[z][w_index] = s normalize(self.topic_word_prob[z]) # update P(z | d) for d_index in range(len(self.documents)): for z in range(number_of_topics): s = 0 for w_index in range(vocabulary_size): count = term_doc_matrix[d_index][w_index] s = s + count * self.topic_prob[d_index, w_index, z] self.document_topic_prob[d_index][z] = s # print self.document_topic_prob[d_index] # assert(sum(self.document_topic_prob[d_index]) != 0) normalize(self.document_topic_prob[d_index])
def plsa(self, nt, max_iter, processes=4):
    '''
    Model topics using multiprocessing.

    Args:
        nt (int): number of topics
        max_iter (int): maximum number of iterations
        processes (int): maximum number of parallel processes (default=4)
    '''
    # The worker functions read these names as module-level globals, so declare them
    # up front, before any assignment, rather than after the local assignments below.
    global vocabulary_size, number_of_documents, number_of_topics, \
        document_topic_prob, topic_word_prob, term_doc_matrix, topic_prob

    print "EM iteration begins. Num topics: " + str(nt) + "; Iterations: " + str(max_iter) + "; Processes: " + str(processes)

    # Get vocabulary and number of documents.
    self.build_vocabulary()
    number_of_documents = len(self.documents)
    vocabulary_size = len(self.vocabulary)
    number_of_topics = nt

    # build term-doc matrix
    term_doc_matrix = np.zeros([number_of_documents, vocabulary_size], dtype=np.int)
    for d_index, doc in enumerate(self.documents):
        term_count = np.zeros(vocabulary_size, dtype=np.int)
        for word in doc.words:
            if word in self.vocabulary:
                w_index = self.vocabulary.index(word)
                term_count[w_index] = term_count[w_index] + 1
        term_doc_matrix[d_index] = term_count

    # Create the counter arrays.
    document_topic_prob = np.zeros([number_of_documents, number_of_topics], dtype=np.float)  # P(z | d)
    topic_word_prob = np.zeros([number_of_topics, vocabulary_size], dtype=np.float)  # P(w | z)
    topic_prob = np.zeros([number_of_documents, vocabulary_size, number_of_topics], dtype=np.float)  # P(z | d, w)

    # Initialize
    print "Initializing..."
    # randomly assign values
    document_topic_prob = np.random.random(size=(number_of_documents, number_of_topics))
    for d_index in range(number_of_documents):
        normalize(document_topic_prob[d_index])  # normalize for each document
    topic_word_prob = np.random.random(size=(number_of_topics, vocabulary_size))
    for z in range(number_of_topics):
        normalize(topic_word_prob[z])  # normalize for each topic

    # Run the EM algorithm using multiprocessing.
    # Each Pool is created after the globals it needs are updated, so (on fork-based
    # platforms) the workers inherit the latest arrays.
    for iteration in range(max_iter):
        start = time.time()

        # E step: one task per document
        topic_prob = []
        pool = Pool(processes)
        TASKS = range(number_of_documents)
        jobs = pool.imap(do_estep, TASKS)
        pool.close()
        pool.join()
        finished = False
        while not finished:
            try:
                topic_prob.append(jobs.next())
            except StopIteration:
                finished = True
        topic_prob = np.asarray(topic_prob)

        # M step - first part: update P(w | z), one task per topic
        pool = Pool(processes)
        topic_word_prob = []
        TASKS = range(number_of_topics)
        jobs = pool.imap(do_mstep_a, TASKS)
        pool.close()
        pool.join()
        finished = False
        while not finished:
            try:
                topic_word_prob.append(jobs.next())
            except StopIteration:
                finished = True
        topic_word_prob = np.asarray(topic_word_prob)

        # M step - second part: update P(z | d), one task per document
        pool = Pool(processes)
        document_topic_prob = []
        TASKS = range(number_of_documents)
        jobs = pool.imap(do_mstep_b, TASKS)
        pool.close()
        pool.join()
        finished = False
        while not finished:
            try:
                document_topic_prob.append(jobs.next())
            except StopIteration:
                finished = True
        document_topic_prob = np.asarray(document_topic_prob)

        print "iteration " + str(iteration) + " completed in " + str(time.time() - start) + " seconds."
        print "document probability variance: " + str(np.var(document_topic_prob))

    self.topic_word_prob = topic_word_prob
    self.document_topic_prob = document_topic_prob
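Once either plsa() finishes, topic_word_prob together with the vocabulary is enough to inspect the topics. The snippet below is a hypothetical post-processing step, not part of the original: model stands for the object whose plsa() was just called, assumed to expose topic_word_prob (topics x vocabulary) and the vocabulary list built by build_vocabulary().

def print_top_words(model, n=10):
    # hypothetical helper: show the n most probable words of each topic
    for z, row in enumerate(model.topic_word_prob):
        top = np.argsort(row)[::-1][:n]
        print "topic " + str(z) + ": " + " ".join(model.vocabulary[i] for i in top)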