コード例 #1
0
def topic_labels(data, model, N=50):
    """Write the top-N label words of each pLSA topic to a text file.

    Each vocabulary word is mapped back through the Porter stemming
    dictionary when an entry exists; otherwise the raw vocab word is
    used as the label.

    data  : (term-document matrix, vocab) tuple
    model : pre-trained pLSA model parameters
    N     : number of label words written per topic
    """
    port_dict = porter_dictionary.porter_dictionary()
    port_dict.load_dict(dict_path)
    td, vocab = data
    plsa = plsa1.pLSA(model)
    inv_vocab = inverse_vocab(vocab)
    dict_vocab = []
    for ind in inv_vocab:
        try:
            # Prefer the un-stemmed surface form recorded in the dictionary.
            dict_vocab.append(port_dict.dictionary[inv_vocab[ind]][0])
        except KeyError:
            # BUG FIX: was a bare `except:` that swallowed every error;
            # only a missing dictionary entry should fall back to the raw word.
            dict_vocab.append(inv_vocab[ind])
    # BUG FIX: use a context manager so the output file is closed even if
    # topic_labels() raises (the original leaked the handle on error).
    with open(PLSA_PARAMETERS_PATH + file, "w") as file_txt:
        for labels in plsa.topic_labels(dict_vocab, N):
            for word in labels:
                file_txt.write(word + ', ')
            file_txt.write('\n')
コード例 #2
0
ファイル: microbplsa.py プロジェクト: sperez8/microbPLSA
 def open_model(self, z = 2, modelFile = None, study = None, name = None, run = '', useC = False, folder = MODELS_LOCATION, add_to_file = ''):
     ''' Opens the probs of a model previously computed and saved in a json file '''
     # Backfill instance attributes only when they are still unset;
     # state already present on self always wins over the arguments.
     if self.study is None:
         self.study = study
     if self.name is None:
         self.name = name
     if self.run == '':
         self.run = run
     if self.useC is None:
         self.useC = useC

     # An explicitly supplied modelFile overrides the derived filename.
     if modelFile is not None:
         self.modelFile = modelFile
     else:
         self.modelFile = self.get_result_filename(z, run, useC, folder, add_to_file = add_to_file)

     # NOTE(review): the file handle is never closed — consider `with open(...)`.
     f = open(self.modelFile,'r')
     print 'Using the following result file:', self.modelFile
     data = json.load(f)
     # Rebuild the (p_z, p_w_z, p_d_z) probability triple from JSON arrays.
     p_z = np.array(data['p_z'])
     p_w_z = np.array(data['p_w_z'])
     p_d_z = np.array(data['p_d_z'])
     model = p_z, p_w_z, p_d_z
     plsa = pLSA()
     plsa.set_model(model)
     self.z = z
     self.model = plsa
     return self.modelFile
コード例 #3
0
def main():
    """Train pLSA models over a range of topic counts and log, for each
    topic, its top key words with their conditional probabilities."""
    # Setup logging -------------------------
    logging.basicConfig(filename='plsa.log', level=logging.INFO)
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    logging.getLogger('').addHandler(console)

    # Some basic configuration ---------------
    fname = './data.txt'
    fsw = './stopwords.txt'
    eps = 20.0            # convergence threshold passed to train()
    key_word_size = 10    # number of top words reported per topic

    # Preprocess -----------------------------
    pp = PP(fname, fsw)
    t_d = pp.get_t_d()

    V, D = t_d.shape
    logging.info("V = %d  D = %d" % (V, D))

    # Train model and get result -------------
    pmodel = pLSA()
    for z in range(3, (D + 1), 10):
        t1 = time.clock()
        (l, p_d_z, p_w_z, p_z) = pmodel.train(t_d, z, eps)
        t2 = time.clock()
        # BUG FIX: the second value is the log-likelihood `l`, not `eps` —
        # label it `ll` (consistent with the sibling main() in this project).
        logging.info('z = %d ll = %f time = %f' % (z, l, t2 - t1))
        for itz in range(z):
            logging.info('Topic %d' % itz)
            # Pair each word index with its P(w|z) so we can rank them.
            data = [(p_w_z[i][itz], i) for i in range(len(p_w_z[:, itz]))]
            data.sort(key=lambda tup: tup[0], reverse=True)
            for i in range(key_word_size):
                logging.info('%s : %.6f ' %
                             (pp.get_word(data[i][1]), data[i][0]))
コード例 #4
0
ファイル: main.py プロジェクト: cheesezhe/pLSA_Topic_Model
def main():
  """Train pLSA models on the corpus in dst.txt and log each topic's
  top key words with their conditional probabilities."""
  # Setup logging -------------------------
  logging.basicConfig(filename='plsa.log', level=logging.INFO)
  console = logging.StreamHandler()
  console.setLevel(logging.INFO)
  logging.getLogger('').addHandler(console)

  # Some basic configuration ---------------
  fname = './dst.txt'
  fsw = './cnStopwords.txt'
  eps = 20.0            # convergence threshold passed to train()
  key_word_size = 10    # number of top words reported per topic

  # Preprocess -----------------------------
  pp = PP(fname, fsw)
  t_d = pp.get_t_d()

  V,D = t_d.shape
  logging.info("V = %d  D = %d" %(V,D))

  # Train model and get result -------------
  pmodel = pLSA()
  # for z in range(3,(D+1), 10):
  for z in range(10,21, 10):
    t1 = time.clock()
    (l, p_d_z, p_w_z, p_z)  = pmodel.train(t_d, z, eps)
    t2 = time.clock()
    logging.info('z = %d ll = %f time = %f' %(z, l, t2-t1))
    for itz in range(z):
      logging.info('Topic %d' %itz)
      # Pair each word index with its P(w|z) so the words can be ranked.
      data = [(p_w_z[i][itz], i)  for i in range(len(p_w_z[:,itz])) ]
      data.sort(key=lambda tup:tup[0], reverse=True)
      for i in range(key_word_size):
        logging.info('%s : %.6f ' %(pp.get_word(data[i][1]), data[i][0] ))
コード例 #5
0
ファイル: microbplsa.py プロジェクト: sperez8/microbPLSA
    def run_plsa(self, Z, maxiter=MAX_ITER_PLSA, verbatim = True, useC = True):
        '''runs plsa on sample data in filename'''
        # Z: number of latent topics. verbatim toggles the trainer's debug
        # output; useC selects the C-accelerated training implementation.
        plsa = pLSA()
        plsa.debug = verbatim
        print "\n Running ...\n"
        plsa.train(self.datamatrix, Z, maxiter = maxiter, useC = useC)   #runs plsa!
        # Keep the trained model on the instance and also return it.
        self.model = plsa
        return plsa
コード例 #6
0
    def run_plsa(self, Z, maxiter=MAX_ITER_PLSA, verbatim=True, useC=True):
        '''runs plsa on sample data in filename'''
        # Z: number of latent topics. verbatim toggles the trainer's debug
        # output; useC selects the C-accelerated training implementation.
        plsa = pLSA()
        plsa.debug = verbatim
        print "\n Running ...\n"
        plsa.train(self.datamatrix, Z, maxiter=maxiter, useC=useC)  #runs plsa!
        # Keep the trained model on the instance and also return it.
        self.model = plsa
        return plsa
コード例 #7
0
    def open_model(self,
                   z=2,
                   modelFile=None,
                   study=None,
                   name=None,
                   run='',
                   useC=False,
                   folder=MODELS_LOCATION,
                   add_to_file=''):
        ''' Opens the probs of a model previously computed and saved in a json file '''
        # Backfill instance attributes only when they are still unset;
        # state already present on self always wins over the arguments.
        if self.study is None:
            self.study = study
        if self.name is None:
            self.name = name
        if self.run == '':
            self.run = run
        if self.useC is None:
            self.useC = useC

        # An explicitly supplied modelFile overrides the derived filename.
        if modelFile is not None:
            self.modelFile = modelFile
        else:
            self.modelFile = self.get_result_filename(z,
                                                      run,
                                                      useC,
                                                      folder,
                                                      add_to_file=add_to_file)

        # NOTE(review): the file handle is never closed — consider `with open(...)`.
        f = open(self.modelFile, 'r')
        print 'Using the following result file:', self.modelFile
        data = json.load(f)
        # Rebuild the (p_z, p_w_z, p_d_z) probability triple from JSON arrays.
        p_z = np.array(data['p_z'])
        p_w_z = np.array(data['p_w_z'])
        p_d_z = np.array(data['p_d_z'])
        model = p_z, p_w_z, p_d_z
        plsa = pLSA()
        plsa.set_model(model)
        self.z = z
        self.model = plsa
        return self.modelFile
コード例 #8
0
ファイル: example_plsa.py プロジェクト: sperez8/microbPLSA
def global_weights(data, model):
    """Print the model's global_weights() output using the corpus idf values."""
    td, idf, vocab = data
    plsa = pLSA(model)
    print plsa.global_weights(idf)
コード例 #9
0
def document_cluster(model):
    """Print the document_cluster() result of a trained pLSA model."""
    plsa = pLSA(model)
    print plsa.document_cluster()
コード例 #10
0
	def train(self):
		# Fit a fresh pLSA model to the transposed user-item matrix using
		# the instance's topic count k, iteration cap, and tolerance eps.
		self.pLSAmodel = plsa.pLSA()
		self.pLSAmodel.train(self.UI.transpose(), self.k, self.maxiter, self.eps)
コード例 #11
0
ファイル: main.py プロジェクト: pranjals16/cs498
# Load the stopword list and the raw corpus, one document per line.
swords=[line.strip() for line in open("stopwords.txt", "r")]
trainlines=[line.strip() for line in open("coalscam_english.txt", "r")]
count=0

l=len(trainlines)
# Remove every stopword (whole-word regex match) from every document.
for i in range(0,l):
    for s in swords:
        trainlines[i]=re.sub(r'\b'+s+r'\b','',trainlines[i])

# Feed the cleaned documents into the term-document matrix builder.
for i in range(0,l):
    doc=trainlines[i]
    tdm.add_doc(doc)

# Persist the matrix; cutoff=1 drops terms below that document frequency.
tdm.write_csv("matrix.csv", cutoff=1)

x=pLSA()

# Reload the matrix and train a 3-topic model on the transposed
# data (rows 1: skips the CSV header row).
my_data=genfromtxt('matrix.csv', delimiter=',')
[i,j]=my_data.shape

a=(my_data[1:]).transpose()
x.train(a,3)

# Label each topic using the vocabulary from the CSV header row.
data=csv.reader(open('matrix.csv'))
fields=data.next()
cluster=x.topic_labels(fields)
for rows in cluster:
    print rows


p=count
コード例 #12
0
def word_cluster(model):
    """Print the word_cluster() result of a trained pLSA model."""
    print(plsa1.pLSA(model).word_cluster())
コード例 #13
0
def document_cluster(model):
    """Print the document_cluster() result of a trained pLSA model."""
    print(plsa1.pLSA(model).document_cluster())
コード例 #14
0
def folding_in(data, model, maxiter=30, debug=True):
    """Fold the corpus's last document column into an existing model
    and print the result of folding_in()."""
    td, idf, vocab = data
    held_out = td[:, -1]          # last column = held-out document
    engine = plsa1.pLSA(model)
    engine.debug = debug
    print(engine.folding_in(held_out, maxiter))
コード例 #15
0
ファイル: example_plsa.py プロジェクト: sperez8/microbPLSA
def folding_in(data, model, maxiter=50, debug=True):
    """Fold the corpus's last document column into an existing model
    and print the result of folding_in()."""
    td, idf, vocab = data
    d = td[:,-1]
    plsa = pLSA(model)
    plsa.debug = debug
    print plsa.folding_in(d, maxiter)
コード例 #16
0
ファイル: example_plsa.py プロジェクト: sperez8/microbPLSA
def average_train(data, maxiter=500, debug=True):
    """Train 5 pLSA models (10 topics each) on all but the last document
    column and return the averaged result of average_train()."""
    td, idf, vocab = data
    corpus = td[:, :-1]           # hold out the final document column
    engine = pLSA()
    engine.debug = debug
    trainer = engine.average_train(5)
    return trainer(corpus, 10, maxiter)
コード例 #17
0
 def train(self):
     # Fit a fresh pLSA model to the transposed user-item matrix using
     # the instance's topic count k, iteration cap, and tolerance eps.
     self.pLSAmodel = plsa.pLSA()
     self.pLSAmodel.train(self.UI.transpose(), self.k, self.maxiter,
                          self.eps)
コード例 #18
0
def topic_labels(data, model, N=15):
    """Print the top-N topic_labels() of the model, using the inverted
    vocabulary to map indices back to words."""
    td, idf, vocab = data
    plsa = pLSA(model)
    inv_vocab = inverse_vocab(vocab)
    print plsa.topic_labels(inv_vocab, N)
コード例 #19
0
def unigram_smoothing(model):
    """Print the unigram_smoothing() result of a trained pLSA model."""
    plsa = pLSA(model)
    print plsa.unigram_smoothing()
コード例 #20
0
def word_cluster(model):
    """Print the word_cluster() result of a trained pLSA model."""
    plsa = pLSA(model)
    print plsa.word_cluster()
コード例 #21
0
def word_topics(model):
    """Print the word_topics() result of a trained pLSA model."""
    plsa = pLSA(model)
    print plsa.word_topics()
コード例 #22
0
def train(data,empty_docs_list,json_files_list,maxiter=500, debug=True):
    """Train a pLSA model and dump its parameters to CSV/text files.

    data            : (term-document matrix, vocab) tuple
    empty_docs_list : indices of empty documents to drop from the file list
    json_files_list : document (file) names aligned with the td columns
    maxiter         : unused here — the call below uses module-level maxiter2
    debug           : forwarded to the trainer's debug flag

    Writes: PATH.csv (topic-by-document probabilities),
            PATH_word_by_topic_conditional.csv (sorted P(w|z) columns),
            PLSA_PARAMETERS_PATH + file2 (topic prior probabilities).
    Returns the trained model tuple.
    """
    # td, idf, vocab = data
    # s_file_list= sorted(file_list)
    # print('file_list:',file_list)
    # print s_file_list
    topic_list= range(0,num_topics)
    # print topic_list
    # file_list_2 = list(file_list)
    # empty_file_list()
    #Bug update
    # df= pd.DataFrame(0,index=topic_list,columns=file_list_2)
    # Bug update over



    td, vocab = data
    # td = td[:,:-1]
    plsa = plsa1.pLSA()
    plsa.debug = debug
    plsa.logL_pic = logL_pic
    # model=plsa.train(td, num_topics, maxiter)
    # NOTE(review): trains with module-level maxiter2/cc/beta/min_iteration,
    # not the `maxiter` parameter — confirm that is intentional.
    model=plsa.train(td=td,Z=num_topics,maxiter=maxiter2,eps=cc,beta=beta,min_iteration=min_iteration)
    p_z_d=plsa.topic_document()
    ii=0

    # print 'model2',model[0]
    # print 'p_z_D =',p_z_d[0]
    # print('row',len(p_z_d))
    # print('column',len(p_z_d[0]))
    # print(df.shape)

    # Bug update
    # for i in df.index:
    #     jj=0
    #     for j in df.columns:
    #         df.loc[i,j]= p_z_d[ii][jj]
    #         # print 'df loc', df.loc[i,j]
    #         # print'ii jj', p_z_d[ii][jj]
    #         jj=jj+1
    #     ii=ii+1
    # Bug update over
    # print('PATH =',PATH)
    # Bug update
    # df.to_csv(PATH+'.csv')
    # Bug update over


    file_list = json_files_list
    print('"""""""""""""""""""""""""""""')
    # print(file_list)

    # Drop empty documents, iterating in reverse so earlier indices
    # remain valid while deleting.
    print('>>>>>>> In method train:', empty_docs_list)
    for edl in sorted(empty_docs_list, reverse=True):
        # print(file_list[edl])
        del file_list[edl]

    print('Dimenstionssssssssssssssssss')
    print("topic_list_len =",topic_list.__len__())
    print("p_z_d_len =", p_z_d.__len__())
    print("file_list_len =",file_list.__len__())
    print("p_z_d[0] =", p_z_d[0].__len__())


    # Write the topic-by-document matrix: header row of file names,
    # then one row per topic with P(z|d) values.
    topic_by_doc = open(PATH+'.csv', "w")
    for i in range(file_list.__len__()):
        topic_by_doc.write(',')
        topic_by_doc.write(file_list[i])
    topic_by_doc.write('\n')

    for i in range(p_z_d.__len__()):
        topic_by_doc.write(str(i))
        for j in range(p_z_d[0].__len__()):
            topic_by_doc.write(',')
            topic_by_doc.write(str(p_z_d[i][j]))
        topic_by_doc.write('\n')
    topic_by_doc.close()

    print('////////////////////////////')
    print(p_z_d.__len__())
    print(p_z_d[0].__len__())


    word_by_topic_conditional = open(PATH_word_by_topic_conditional+'.csv', "w")

    # Cap the number of topic columns written at the actual model size.
    global num_topics_generated
    if len(plsa.p_w_z) < 300:
        num_topics_generated = len(plsa.p_w_z)


    # Sort each word's topic conditionals; columns are then emitted in
    # descending order via the reversed index below.
    p_w_z_transposed_truncated = np.sort(plsa.p_w_z.transpose()[:,0:num_topics_generated])

    for i in range(p_w_z_transposed_truncated.__len__()):
        for j in range(p_w_z_transposed_truncated[0].__len__()):
            word_by_topic_conditional.write(str(p_w_z_transposed_truncated[i][num_topics_generated-j-1]))
            word_by_topic_conditional.write(',')
        word_by_topic_conditional.write('\n')
    word_by_topic_conditional.close()





    # print('docs==========================')
    #
    # for i in file_list:
    #     print(i)
    # for i in p_z_d:
    #     print(i)

    # Dump the topic prior probabilities P(z), one per line.
    pz=model[0]
    topic_prob_file = open(PLSA_PARAMETERS_PATH + file2, "w")
    for z in pz:
        topic_prob_file.write(str(z))
        topic_prob_file.write('\n')
    topic_prob_file.close()
    return model
コード例 #23
0
def average_train(data, maxiter=500, debug=True):
    """Train 10 pLSA models (10 topics each) on all but the last document
    column and return the averaged result of average_train()."""
    td, idf, vocab = data
    corpus = td[:, :-1]           # hold out the final document column
    engine = plsa1.pLSA()
    engine.debug = debug
    trainer = engine.average_train(10)
    return trainer(corpus, 10, maxiter)
コード例 #24
0
ファイル: example_plsa.py プロジェクト: sperez8/microbPLSA
def document_topics(model):
    """Print the document_topics() result of a trained pLSA model."""
    plsa = pLSA(model)
    print plsa.document_topics()
コード例 #25
0
def document_topics(model):
    """Print each entry of the model's document_topics(), one per line."""
    engine = plsa1.pLSA(model)
    for entry in engine.document_topics():
        print(entry)
コード例 #26
0
ファイル: example_plsa.py プロジェクト: sperez8/microbPLSA
def document_cluster(model):
    """Print the document_cluster() result of a trained pLSA model."""
    plsa = pLSA(model)
    print plsa.document_cluster()
コード例 #27
0
def word_topics(model):
    """Print each entry of the model's word_topics(), one per line."""
    engine = plsa1.pLSA(model)
    for entry in engine.word_topics():
        print(entry)
コード例 #28
0
def train(data, maxiter=500, debug=True):
    """Train a fresh 10-topic pLSA model on all but the last document
    column and return the train() result."""
    td, idf, vocab = data
    corpus = td[:, :-1]           # hold out the final document column
    engine = pLSA()
    engine.debug = debug
    return engine.train(corpus, 10, maxiter)
コード例 #29
0
def unigram_smoothing(model):
    """Print the unigram_smoothing() result of a trained pLSA model."""
    print(plsa1.pLSA(model).unigram_smoothing())
コード例 #30
0
ファイル: example_plsa.py プロジェクト: sperez8/microbPLSA
def unigram_smoothing(model):
    """Print the unigram_smoothing() result of a trained pLSA model."""
    plsa = pLSA(model)
    print plsa.unigram_smoothing()
コード例 #31
0
def global_weights(data, model):
    """Print the model's global_weights() output using the corpus idf values."""
    td, idf, vocab = data
    print(plsa1.pLSA(model).global_weights(idf))
コード例 #32
0
ファイル: example_plsa.py プロジェクト: sperez8/microbPLSA
def topic_labels(data, model, N=15):
    """Print the top-N topic_labels() of the model, using the inverted
    vocabulary to map indices back to words."""
    td, idf, vocab = data
    plsa = pLSA(model)
    inv_vocab = inverse_vocab(vocab)
    print plsa.topic_labels(inv_vocab, N)
コード例 #33
0
ファイル: example_plsa.py プロジェクト: sperez8/microbPLSA
def word_topics(model):
    """Print the word_topics() result of a trained pLSA model."""
    plsa = pLSA(model)
    print plsa.word_topics()
コード例 #34
0
def document_topics(model):
    """Print the document_topics() result of a trained pLSA model."""
    plsa = pLSA(model)
    print plsa.document_topics()
コード例 #35
0
ファイル: example_plsa.py プロジェクト: sperez8/microbPLSA
def word_cluster(model):
    """Print the word_cluster() result of a trained pLSA model."""
    plsa = pLSA(model)
    print plsa.word_cluster()
コード例 #36
0
 def __plsa(self,user_feature_matrix):
     """Factor the user-feature matrix with a 220-topic pLSA model and
     return the transposed document_topics() result."""
     # Transpose so rows play the role of "documents" for pLSA.
     item_user_matrix = user_feature_matrix.transpose()
     plsa = pLSA()
     # todok(): converted to sparse dictionary-of-keys format for training.
     plsa.train(item_user_matrix.todok(),220)
     topic_user = plsa.document_topics()
     return topic_user.T