# Code example #1 (score: 0)
# File: test.py — Project: jeromedockes/project_pgm
def test_variational_inference(voc = None, docs = None,
                               max_files = None, doc_num = None, n_topics = 20,
                               dirich_param = .5,
                               log_word_proba_given_topic = None,
                               **kwargs):
    """Run LDA variational inference over a corpus and log the results.

    When ``voc``/``docs`` are not supplied, a vocabulary and document list
    are built from the first ten Reuters-21578 SGML archive files.

    Parameters
    ----------
    voc : dict or None
        Vocabulary mapping (built from the Reuters files when None).
    docs : sequence or None
        Corpus documents (built alongside ``voc`` when None).
    max_files : unused here; kept for interface compatibility.
    doc_num : int or None
        Index of a single document of interest; drawn at random when None.
    n_topics : int
        Number of latent topics.
    dirich_param : float
        Symmetric Dirichlet prior parameter.
    log_word_proba_given_topic : ndarray or None
        (n_topics, voc_size) log word probabilities; randomly initialized
        (row-stochastic before the log) when None.
    **kwargs : forwarded options (currently unused by the corpus-level run).

    Returns
    -------
    tuple
        (dirich_param, word_logproba_given_topic, corpus_log_likelihood)
        as produced by ``vi.latent_dirichlet_allocation``.
    """
    description = {'n_topics': n_topics,
                   'dirich_param': dirich_param}

    # Build vocabulary and documents from the first ten Reuters archives
    # when the caller did not provide them.  The file names follow the
    # fixed pattern reut2-NNN.sgm, so generate them instead of listing
    # ten near-identical literals.
    if voc is None or docs is None:
        files_list = [path.join(path_to_reuters, 'reut2-%03d.sgm' % i)
                      for i in range(10)]
        print(files_list)
        description['data_files_list'] = files_list
        voc, docs = dp.build_voc(files_list)
        # list(...) keeps this working on Python 3, where dict views
        # cannot be sliced.
        print(list(voc.keys())[:10])

    voc_size = len(voc)
    description['voc_size'] = voc_size

    # Pick a random document index if none was requested.
    if doc_num is None:
        doc_count = len(docs)
        print('doc_count: %d' % doc_count)
        doc_num = np.random.randint(doc_count)

    if log_word_proba_given_topic is None:
        # Random topic-word matrix, normalized so each row sums to 1
        # (a valid probability distribution per topic) before taking logs.
        word_proba_given_topic = np.random.rand(n_topics, voc_size)
        word_proba_given_topic /= np.sum(word_proba_given_topic,
                                         axis = 1).reshape((-1, 1))
        log_word_proba_given_topic = np.log(word_proba_given_topic)

    # Corpus-level variational inference, with results recorded by the
    # project's logger.
    logger = Dirich_features_logger(root_results_dir = results_dir(),
                                    description = description)

    (dirich_param, word_logproba_given_topic, corpus_log_likelihood) \
        = vi.latent_dirichlet_allocation(docs, n_topics, voc_size,
                                         max_iter = 200, var_inf_max_iter = 200,
                                         logger = logger)

    # Return the fitted model so callers can inspect it (the original
    # discarded these values).
    return dirich_param, word_logproba_given_topic, corpus_log_likelihood
# Code example #2 (score: 0)
def prepare_data(reuters_files, data_file_name, voc_file_name):
    """Convert Reuters files into an LDA-C-style data file plus a vocabulary file.

    Parameters
    ----------
    reuters_files : sequence of str
        Paths to the Reuters SGML archives to process.
    data_file_name : str
        Output path for the corpus file: one line per document, with the
        word count followed by space-separated ``id:count`` pairs.
    voc_file_name : str
        Output path for the vocabulary file: one word per line, ordered
        by integer id (so the line number matches the id).
    """
    # dp.build_voc returns a (vocabulary, documents) pair; unpack it
    # instead of indexing the tuple with opaque voc[0]/voc[1].
    voc, docs = dp.build_voc(reuters_files)

    with open(data_file_name, 'w') as data_file:
        for doc in docs:
            # Number of distinct entries in this document, then the
            # entries themselves — presumably (word_id, count) pairs.
            data_file.write('%d ' % np.size(doc, axis = 0))

            for word in doc:
                data_file.write('%d:%d ' % (word[0], word[1]))

            data_file.write('\n')

    # Sort vocabulary entries by their integer id (the dict value).
    sorted_voc = sorted(voc.items(), key = itemgetter(1))

    with open(voc_file_name, 'w') as voc_file:
        for item in sorted_voc:
            voc_file.write('%s\n' % item[0])