Esempio n. 1
0
def load_dataset(cfg=None):
    """Load or generate the dataset selected by ``cfg['load_data']``.

    Args:
        cfg: Configuration mapping. When omitted, a fresh
            ``config.default_config()`` is created on every call, so the
            in-place updates below never leak between calls.

    Supported values of ``cfg['load_data']``:
        'uci' or 1: load ``cfg['data_name']`` with ``data.load_uci`` and
            store the matrix shape in ``cfg['N']`` / ``cfg['M']``.
        2: generate the data with ``gen_real(cfg)``.
        3: load the UCI data, load the stored ``Phi_<data_name>`` and
            ``Theta_<data_name>`` objects and combine them with
            ``merge_halfmodel``; also stores ``cfg['N']`` / ``cfg['M']``.
        4: use identity matrices of size ``cfg['T']`` for F, Phi_r and
            Theta_r; also stores ``cfg['N']`` / ``cfg['M']``.
        5: like 2, but ``cfg['real_theta_sparsity']`` and
            ``cfg['real_phi_sparsity']`` are first set to 1.

    Returns:
        Tuple ``(F, vocab, N, M, Phi_r, Theta_r)``. ``vocab`` is None for
        the generated/synthetic modes (2, 4, 5); ``Phi_r`` and ``Theta_r``
        are None for mode 'uci'/1.

    Raises:
        ValueError: if ``cfg['load_data']`` is not one of the values above.
    """
    if cfg is None:
        # Built lazily: a default evaluated in the signature would be one
        # shared dict that this function mutates.
        cfg = config.default_config()

    mode = cfg['load_data']

    if mode == 'uci' or mode == 1:
        print("uci")
        F, vocab = data.load_uci(cfg['data_name'], cfg)
        N, M = F.shape
        cfg['N'], cfg['M'] = F.shape
        print('Dimensions of F:', N, M)
        # Largest column sum of F, printed for manual inspection.
        print('Checking assumption on F:', np.sum(F, axis=0).max())
        return F, vocab, N, M, None, None

    if mode == 2:
        F, Phi_r, Theta_r = gen_real(cfg)
        print(Phi_r)
        print('Checking assumption on F:', np.sum(F, axis=0).max())
        return F, None, F.shape[0], F.shape[1], Phi_r, Theta_r

    if mode == 3:
        print("uci halfmodel", cfg["alpha"])
        F, vocab = data.load_uci(cfg['data_name'], cfg)
        N, M = F.shape
        cfg['N'], cfg['M'] = F.shape
        Phi_r = load_obj('Phi_' + cfg['data_name'])
        Theta_r = load_obj('Theta_' + cfg['data_name'])
        F_merged = merge_halfmodel(F, Phi_r, Theta_r, cfg)
        # N and M describe the loaded F; the check is run on the merged matrix.
        print('Dimensions of F:', N, M)
        print('Checking assumption on F:', np.sum(F_merged, axis=0).max())
        return F_merged, vocab, N, M, Phi_r, Theta_r

    if mode == 4:
        F = np.eye(cfg['T'])
        cfg['N'], cfg['M'] = F.shape
        Phi_r = np.eye(cfg['T'])
        Theta_r = np.eye(cfg['T'])
        return F, None, cfg['T'], cfg['T'], Phi_r, Theta_r

    if mode == 5:
        cfg['real_theta_sparsity'] = 1.
        cfg['real_phi_sparsity'] = 1.
        F, Phi_r, Theta_r = gen_real(cfg)
        print('Checking assumption on F:', np.sum(F, axis=0).max())
        return F, None, F.shape[0], F.shape[1], Phi_r, Theta_r

    # Fail loudly instead of implicitly returning None, which callers
    # would only notice when unpacking the result.
    raise ValueError("Unsupported cfg['load_data'] value: %r" % (mode,))
Esempio n. 2
0
def save_topics(W, filename, vocab=None, topic_idxs=None):
    """Write the rows of each selected column of ``W`` to a text file.

    For every selected column the file gets a ``topic # <idx> :`` header
    followed by one ``  <label>:<value>`` line per row, ordered by
    decreasing value in that column.

    Args:
        W: 2-D NumPy array; columns are selected by ``topic_idxs`` and rows
            are labelled through ``vocab``.
        filename: Path of the output file (overwritten).
        vocab: Indexable row labels. None or empty means the row indices
            themselves are used as labels.
        topic_idxs: Iterable of column indices. None or empty means all
            columns of ``W``.
    """
    def _unset(value):
        # Explicit None/empty test: ``not value`` raises ValueError for
        # NumPy arrays with more than one element.
        if value is None:
            return True
        try:
            return len(value) == 0
        except TypeError:
            # Objects without a length (e.g. iterators) count as provided.
            return False

    if _unset(vocab):
        vocab = range(W.shape[0])
    if _unset(topic_idxs):
        topic_idxs = range(W.shape[1])

    with open(filename, 'w') as f:
        for topic_idx in topic_idxs:
            # Negate so that argsort yields rows in descending order of weight.
            words = np.argsort(-W[:, topic_idx])
            print('topic #', topic_idx, ':', file=f)
            str_words = ['  ' + str(vocab[i]) + ':' + str(W[i, topic_idx]) for i in words]
            print('\n'.join(str_words), file=f)

if __name__ == '__main__':
    # Run as a script: load the matrix saved as '<experiment>_W.csv' in the
    # result directory, plot it and store the figure as a PDF next to it.
    import os
    from os.path import join
    from data import load_uci

    # Reset the matplotlib settings to their defaults, then pick the fonts.
    ml.rcdefaults()
    ml.rcParams['font.family'] = 'fantasy'
    ml.rcParams['font.fantasy'] = (
        'Times New Roman', 'Ubuntu', 'Arial', 'Tahoma', 'Calibri'
    )

    cfg = config.load()
    _, vocab = load_uci(cfg['data_name'], cfg)
    W = np.loadtxt(join(cfg['result_dir'], cfg['experiment'] + '_W.csv'))
    res = plot_matrix(W, u'Распределение слов в темах', vocab=vocab)
    filename = join(cfg['result_dir'], cfg['experiment'] + '_W.pdf')
    plt.savefig(filename, format='pdf')
    plt.show()
else:
    # Imported as a module: only apply the matplotlib font configuration.
    ml.rcdefaults()  # reset the settings to their defaults
    ml.rcParams['font.family'] = 'fantasy'
    ml.rcParams['font.fantasy'] = (
        'Times New Roman', 'Ubuntu', 'Arial', 'Tahoma', 'Calibri'
    )
Esempio n. 3
0
        vocab = range(W.shape[0])
    if not topic_idxs:
        topic_idxs = range(W.shape[1])
    with open(filename, "w") as f:
        for topic_idx in topic_idxs:
            words = np.argsort(-W[:, topic_idx])
            print("topic #", topic_idx, ":", file=f)
            str_words = ["  " + str(vocab[i]) + ":" + str(W[i, topic_idx]) for i in words]
            print("\n".join(str_words), file=f)


if __name__ == "__main__":
    # Run as a script: load the matrix saved as "<experiment>_W.csv" in the
    # result directory, plot it and store the figure as a PDF next to it.
    import os
    from os.path import join
    from data import load_uci

    # Reset the matplotlib settings to their defaults, then pick the fonts.
    ml.rcdefaults()
    ml.rcParams["font.family"] = "fantasy"
    ml.rcParams["font.fantasy"] = (
        "Times New Roman", "Ubuntu", "Arial", "Tahoma", "Calibri"
    )

    cfg = config.load()
    _, vocab = load_uci(cfg["data_name"], cfg)
    W = np.loadtxt(join(cfg["result_dir"], cfg["experiment"] + "_W.csv"))
    res = plot_matrix(W, u"Распределение слов в темах", vocab=vocab)
    filename = join(cfg["result_dir"], cfg["experiment"] + "_W.pdf")
    plt.savefig(filename, format="pdf")
    plt.show()
else:
    # Imported as a module: only apply the matplotlib font configuration.
    ml.rcdefaults()  # reset the settings to their defaults
    ml.rcParams["font.family"] = "fantasy"
    ml.rcParams["font.fantasy"] = (
        "Times New Roman", "Ubuntu", "Arial", "Tahoma", "Calibri"
    )
Esempio n. 4
0
def save_topics(W, filename, vocab=None, topic_idxs=None):
    """Write the rows of each selected column of ``W`` to a text file.

    For every selected column the file gets a ``topic # <idx> :`` header
    followed by one ``  <label>:<value>`` line per row, ordered by
    decreasing value in that column.

    Args:
        W: 2-D NumPy array; columns are selected by ``topic_idxs`` and rows
            are labelled through ``vocab``.
        filename: Path of the output file (overwritten).
        vocab: Indexable row labels. None or empty means the row indices
            themselves are used as labels.
        topic_idxs: Iterable of column indices. None or empty means all
            columns of ``W``.
    """
    def _unset(value):
        # Explicit None/empty test: ``not value`` raises ValueError for
        # NumPy arrays with more than one element.
        if value is None:
            return True
        try:
            return len(value) == 0
        except TypeError:
            # Objects without a length (e.g. iterators) count as provided.
            return False

    if _unset(vocab):
        vocab = range(W.shape[0])
    if _unset(topic_idxs):
        topic_idxs = range(W.shape[1])

    with open(filename, 'w') as f:
        for topic_idx in topic_idxs:
            # Negate so that argsort yields rows in descending order of weight.
            words = np.argsort(-W[:, topic_idx])
            print('topic #', topic_idx, ':', file=f)
            str_words = ['  ' + str(vocab[i]) + ':' + str(W[i, topic_idx]) for i in words]
            print('\n'.join(str_words), file=f)

if __name__ == '__main__':
    # Run as a script: load the matrix saved as '<experiment>_W.csv' in the
    # result directory, plot it and store the figure as a PDF next to it.
    import os
    from os.path import join
    from data import load_uci

    # Reset the matplotlib settings to their defaults, then pick the fonts.
    ml.rcdefaults()
    ml.rcParams['font.family'] = 'fantasy'
    ml.rcParams['font.fantasy'] = (
        'Times New Roman', 'Ubuntu', 'Arial', 'Tahoma', 'Calibri'
    )

    cfg = config.load()
    _, vocab = load_uci(cfg['data_name'], cfg)
    W = np.loadtxt(join(cfg['result_dir'], cfg['experiment'] + '_W.csv'))
    res = plot_matrix(W, u'Распределение слов в темах', vocab=vocab)
    filename = join(cfg['result_dir'], cfg['experiment'] + '_W.pdf')
    plt.savefig(filename, format='pdf')
    plt.show()
else:
    # Imported as a module: only apply the matplotlib font configuration.
    ml.rcdefaults()  # reset the settings to their defaults
    ml.rcParams['font.family'] = 'fantasy'
    ml.rcParams['font.fantasy'] = (
        'Times New Roman', 'Ubuntu', 'Arial', 'Tahoma', 'Calibri'
    )