def topic_labels(data, model, N=50):
    # PLSA_PARAMETERS_PATH, file, dict_path and porter_dictionary are
    # module-level names defined elsewhere.
    file_txt = open(PLSA_PARAMETERS_PATH + file, "w")
    port_dict = porter_dictionary.porter_dictionary()
    port_dict.load_dict(dict_path)
    td, vocab = data
    plsa = plsa1.pLSA(model)
    inv_vocab = inverse_vocab(vocab)
    # Map each stemmed vocabulary entry back to a readable word where possible.
    dict_vocab = []
    for ind in inv_vocab:
        try:
            dict_vocab.append(port_dict.dictionary[inv_vocab[ind]][0])
        except KeyError:
            dict_vocab.append(inv_vocab[ind])
    # Write the top-N label words of each topic, one topic per line.
    for topic in plsa.topic_labels(dict_vocab, N):
        for word in topic:
            file_txt.write(word + ', ')
        file_txt.write('\n')
    file_txt.close()

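# topic_labels above (and the N=15 variant below) relies on an inverse_vocab
# helper that is not shown in this collection. A minimal sketch, assuming
# vocab maps word -> index as the loaders imply:
def inverse_vocab(vocab):
    # Invert word -> index into index -> word so topic label indices
    # can be mapped back to readable words.
    return {index: word for word, index in vocab.items()}
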
def open_model(self, z=2, modelFile=None, study=None, name=None, run='',
               useC=False, folder=MODELS_LOCATION, add_to_file=''):
    '''Opens the probs of a model previously computed and saved in a json file.'''
    if self.study is None:
        self.study = study
    if self.name is None:
        self.name = name
    if self.run == '':
        self.run = run
    if self.useC is None:
        self.useC = useC
    if modelFile is not None:
        self.modelFile = modelFile
    else:
        self.modelFile = self.get_result_filename(z, run, useC, folder,
                                                  add_to_file=add_to_file)
    print('Using the following result file:', self.modelFile)
    with open(self.modelFile, 'r') as f:
        data = json.load(f)
    p_z = np.array(data['p_z'])
    p_w_z = np.array(data['p_w_z'])
    p_d_z = np.array(data['p_d_z'])
    model = p_z, p_w_z, p_d_z
    plsa = pLSA()
    plsa.set_model(model)
    self.z = z
    self.model = plsa
    return self.modelFile

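# A hedged usage sketch for open_model; the attribute values are made-up
# placeholders, but the JSON layout (keys p_z, p_w_z, p_d_z) follows
# directly from the loading code above.
#
#   analysis = SomeAnalysisClass()            # hypothetical owner of open_model
#   path = analysis.open_model(z=5, study='demo', name='demo', run='1')
#
# The referenced result file is expected to contain:
#   {"p_z": [...], "p_w_z": [[...], ...], "p_d_z": [[...], ...]}
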
def main():
    # Setup logging -------------------------
    logging.basicConfig(filename='plsa.log', level=logging.INFO)
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    logging.getLogger('').addHandler(console)
    # Some basic configuration ---------------
    fname = './data.txt'
    fsw = './stopwords.txt'
    eps = 20.0
    key_word_size = 10
    # Preprocess -----------------------------
    pp = PP(fname, fsw)
    t_d = pp.get_t_d()
    V, D = t_d.shape
    logging.info("V = %d D = %d" % (V, D))
    # Train model and get result -------------
    pmodel = pLSA()
    for z in range(3, (D + 1), 10):
        t1 = time.perf_counter()
        (l, p_d_z, p_w_z, p_z) = pmodel.train(t_d, z, eps)
        t2 = time.perf_counter()
        logging.info('z = %d ll = %f time = %f' % (z, l, t2 - t1))
        for itz in range(z):
            logging.info('Topic %d' % itz)
            data = [(p_w_z[i][itz], i) for i in range(len(p_w_z[:, itz]))]
            data.sort(key=lambda tup: tup[0], reverse=True)
            for i in range(key_word_size):
                logging.info('%s : %.6f' % (pp.get_word(data[i][1]), data[i][0]))

def main():
    # Setup logging -------------------------
    logging.basicConfig(filename='plsa.log', level=logging.INFO)
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    logging.getLogger('').addHandler(console)
    # Some basic configuration ---------------
    fname = './dst.txt'
    fsw = './cnStopwords.txt'
    eps = 20.0
    key_word_size = 10
    # Preprocess -----------------------------
    pp = PP(fname, fsw)
    t_d = pp.get_t_d()
    V, D = t_d.shape
    logging.info("V = %d D = %d" % (V, D))
    # Train model and get result -------------
    pmodel = pLSA()
    # for z in range(3, (D + 1), 10):
    for z in range(10, 21, 10):
        t1 = time.perf_counter()
        (l, p_d_z, p_w_z, p_z) = pmodel.train(t_d, z, eps)
        t2 = time.perf_counter()
        logging.info('z = %d ll = %f time = %f' % (z, l, t2 - t1))
        for itz in range(z):
            logging.info('Topic %d' % itz)
            data = [(p_w_z[i][itz], i) for i in range(len(p_w_z[:, itz]))]
            data.sort(key=lambda tup: tup[0], reverse=True)
            for i in range(key_word_size):
                logging.info('%s : %.6f' % (pp.get_word(data[i][1]), data[i][0]))

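# Both main() variants assume a preprocessor class PP that exposes
# get_t_d() (the V x D term-document count matrix) and get_word(i).
# A minimal sketch of that interface; the real PP (and its tokenizer,
# which must differ for the Chinese corpus) is not shown here.
import numpy as np

class PP(object):
    def __init__(self, fname, fsw):
        stopwords = set(line.strip() for line in open(fsw))
        docs = [line.split() for line in open(fname)]
        self.words = sorted({w for doc in docs for w in doc} - stopwords)
        index = {w: i for i, w in enumerate(self.words)}
        # V x D matrix of raw term counts.
        self.t_d = np.zeros((len(self.words), len(docs)))
        for j, doc in enumerate(docs):
            for w in doc:
                if w in index:
                    self.t_d[index[w], j] += 1

    def get_t_d(self):
        return self.t_d

    def get_word(self, i):
        return self.words[i]
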
def run_plsa(self, Z, maxiter=MAX_ITER_PLSA, verbatim=True, useC=True):
    '''Runs pLSA on the sample data in filename.'''
    plsa = pLSA()
    plsa.debug = verbatim
    print("\n Running ...\n")
    plsa.train(self.datamatrix, Z, maxiter=maxiter, useC=useC)  # runs plsa!
    self.model = plsa
    return plsa

def global_weights(data, model):
    td, idf, vocab = data
    plsa = pLSA(model)
    print(plsa.global_weights(idf))

def document_cluster(model):
    plsa = pLSA(model)
    print(plsa.document_cluster())

def train(self):
    self.pLSAmodel = plsa.pLSA()
    self.pLSAmodel.train(self.UI.transpose(), self.k, self.maxiter, self.eps)

import csv
import re

from numpy import genfromtxt

# Load stopwords and training lines.
swords = [line.strip() for line in open("stopwords.txt", "r")]
trainlines = [line.strip() for line in open("coalscam_english.txt", "r")]
l = len(trainlines)
# Strip every stopword (on word boundaries) from every line.
for i in range(0, l):
    for s in swords:
        trainlines[i] = re.sub(r'\b' + s + r'\b', '', trainlines[i])
# Build the term-document matrix; tdm is assumed to be a term-document
# matrix builder created earlier (e.g. textmining.TermDocumentMatrix()).
for i in range(0, l):
    tdm.add_doc(trainlines[i])
tdm.write_csv("matrix.csv", cutoff=1)
# Train a 3-topic pLSA model on the matrix (header row dropped,
# transposed to terms x documents).
x = pLSA()
my_data = genfromtxt('matrix.csv', delimiter=',')
a = my_data[1:].transpose()
x.train(a, 3)
# Label the topics with the vocabulary from the CSV header row.
data = csv.reader(open('matrix.csv'))
fields = next(data)
cluster = x.topic_labels(fields)
for rows in cluster:
    print(rows)

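# The nested loop above rescans every line once per stopword. An
# equivalent, faster variant compiles a single alternation pattern
# (a sketch, assuming the same stopwords.txt and a non-empty list):
stop_re = re.compile(r'\b(?:' + '|'.join(map(re.escape, swords)) + r')\b')
trainlines = [stop_re.sub('', line.strip())
              for line in open("coalscam_english.txt", "r")]
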
def word_cluster(model):
    plsa = plsa1.pLSA(model)
    print(plsa.word_cluster())

def document_cluster(model):
    plsa = plsa1.pLSA(model)
    print(plsa.document_cluster())

def folding_in(data, model, maxiter=30, debug=True):
    td, idf, vocab = data
    d = td[:, -1]
    plsa = plsa1.pLSA(model)
    plsa.debug = debug
    print(plsa.folding_in(d, maxiter))

def folding_in(data, model, maxiter=50, debug=True):
    td, idf, vocab = data
    d = td[:, -1]
    plsa = pLSA(model)
    plsa.debug = debug
    print(plsa.folding_in(d, maxiter))

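# Both folding_in variants infer the topic mixture of a held-out document:
# the train() and average_train() helpers below drop the last column of td,
# and folding_in re-estimates only p(z|d) for that column while keeping the
# trained p(w|z) fixed. A hedged usage sketch, assuming the loaders above:
#
#   td, idf, vocab = data
#   model = train(data)          # fits all but the last document
#   folding_in(data, model)      # prints p(z|d_new) for td[:, -1]
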
def average_train(data, maxiter=500, debug=True):
    td, idf, vocab = data
    td = td[:, :-1]
    plsa = pLSA()
    plsa.debug = debug
    return plsa.average_train(5)(td, 10, maxiter)

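# The curried call plsa.average_train(5)(td, 10, maxiter) implies that
# average_train(n) returns a function with train's signature. A minimal
# sketch of one plausible reading (best model out of n restarts), assuming
# train returns the log-likelihood first, as the main() snippets unpack it;
# the real implementation may instead average the n models.
def average_train(self, n):
    def trainer(td, Z, maxiter):
        best = None
        for _ in range(n):
            result = self.train(td, Z, maxiter)
            if best is None or result[0] > best[0]:
                best = result
        return best
    return trainer
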
def topic_labels(data, model, N=15):
    td, idf, vocab = data
    plsa = pLSA(model)
    inv_vocab = inverse_vocab(vocab)
    print(plsa.topic_labels(inv_vocab, N))

def unigram_smoothing(model):
    plsa = pLSA(model)
    print(plsa.unigram_smoothing())

def word_cluster(model):
    plsa = pLSA(model)
    print(plsa.word_cluster())

def word_topics(model):
    plsa = pLSA(model)
    print(plsa.word_topics())

def train(data, empty_docs_list, json_files_list, maxiter=500, debug=True):
    """Train pLSA and write the topic-by-document matrix, the word-by-topic
    conditionals and the topic priors p(z) to CSV files.

    num_topics, maxiter2, cc, beta, min_iteration, logL_pic, PATH,
    PATH_word_by_topic_conditional, PLSA_PARAMETERS_PATH and file2 are
    module-level globals.
    """
    td, vocab = data
    plsa = plsa1.pLSA()
    plsa.debug = debug
    plsa.logL_pic = logL_pic
    model = plsa.train(td=td, Z=num_topics, maxiter=maxiter2, eps=cc,
                       beta=beta, min_iteration=min_iteration)
    p_z_d = plsa.topic_document()

    # Drop empty documents so the CSV columns line up with p_z_d.
    # Note: the deletions mutate the caller's list.
    file_list = json_files_list
    for edl in sorted(empty_docs_list, reverse=True):
        del file_list[edl]

    # Topic-by-document matrix: header of file names, one row per topic.
    topic_by_doc = open(PATH + '.csv', "w")
    for name in file_list:
        topic_by_doc.write(',')
        topic_by_doc.write(name)
    topic_by_doc.write('\n')
    for i in range(len(p_z_d)):
        topic_by_doc.write(str(i))
        for j in range(len(p_z_d[0])):
            topic_by_doc.write(',')
            topic_by_doc.write(str(p_z_d[i][j]))
        topic_by_doc.write('\n')
    topic_by_doc.close()

    # Word-by-topic conditionals, truncated to at most 300 topics and
    # written in descending order per row.
    global num_topics_generated
    if len(plsa.p_w_z) < 300:
        num_topics_generated = len(plsa.p_w_z)
    p_w_z_transposed_truncated = np.sort(
        plsa.p_w_z.transpose()[:, 0:num_topics_generated])
    word_by_topic_conditional = open(PATH_word_by_topic_conditional + '.csv', "w")
    for i in range(len(p_w_z_transposed_truncated)):
        for j in range(len(p_w_z_transposed_truncated[0])):
            word_by_topic_conditional.write(
                str(p_w_z_transposed_truncated[i][num_topics_generated - j - 1]))
            word_by_topic_conditional.write(',')
        word_by_topic_conditional.write('\n')
    word_by_topic_conditional.close()

    # Topic priors p(z), one value per line.
    pz = model[0]
    topic_prob_file = open(PLSA_PARAMETERS_PATH + file2, "w")
    for z in pz:
        topic_prob_file.write(str(z))
        topic_prob_file.write('\n')
    topic_prob_file.close()
    return model

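# The manual write(',') bookkeeping in train() is easy to get wrong; a
# hedged equivalent for the topic-by-document section using the standard
# csv module (same layout: a header row of file names, then one row per
# topic with its index and p(z|d) values):
import csv

def write_topic_by_doc(path, file_list, p_z_d):
    with open(path, 'w', newline='') as fh:
        writer = csv.writer(fh)
        writer.writerow([''] + list(file_list))
        for i, row in enumerate(p_z_d):
            writer.writerow([i] + list(row))
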
def average_train(data, maxiter=500, debug=True):
    td, idf, vocab = data
    td = td[:, :-1]
    plsa = plsa1.pLSA()
    plsa.debug = debug
    return plsa.average_train(10)(td, 10, maxiter)

def document_topics(model):
    plsa = pLSA(model)
    print(plsa.document_topics())

def document_topics(model):
    plsa = plsa1.pLSA(model)
    for i in plsa.document_topics():
        print(i)

def word_topics(model):
    plsa = plsa1.pLSA(model)
    for i in plsa.word_topics():
        print(i)

def train(data, maxiter=500, debug=True):
    td, idf, vocab = data
    td = td[:, :-1]
    plsa = pLSA()
    plsa.debug = debug
    return plsa.train(td, 10, maxiter)

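# A hedged end-to-end sketch tying these helpers together; load_data is
# a hypothetical loader returning the (td, idf, vocab) tuple that the
# functions above unpack.
#
#   data = load_data()
#   model = train(data)                 # fit 10 topics, last doc held out
#   document_topics(model)              # per-document topic mixtures
#   word_topics(model)                  # per-word topic mixtures
#   topic_labels(data, model, N=15)     # top-N label words per topic
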
def unigram_smoothing(model):
    plsa = plsa1.pLSA(model)
    print(plsa.unigram_smoothing())

def global_weights(data, model):
    td, idf, vocab = data
    plsa = plsa1.pLSA(model)
    print(plsa.global_weights(idf))

def __plsa(self, user_feature_matrix):
    item_user_matrix = user_feature_matrix.transpose()
    plsa = pLSA()
    plsa.train(item_user_matrix.todok(), 220)
    topic_user = plsa.document_topics()
    return topic_user.T

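# A brief note on __plsa: user_feature_matrix is assumed to be a scipy.sparse
# user x item matrix (todok() implies scipy). After the transpose, items play
# the role of words and users the role of documents, so document_topics()
# yields a topic-by-user matrix and the returned .T is user x topic, suitable
# as a dense user representation for the surrounding recommender.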