def analyze_glda_result():
    """Collect GaussianLDA table assignments and score them against the gold topics."""
    from collections import Counter
    import re
    import utils.array_utils as au

    glda_base = '/home/cdong/works/research/clu/baselinee/GaussianLDA'

    def fn(f):
        # strip the directory part, keep only the file name
        return f[f.rfind('/') + 1:]

    for out_dir in iu.list_children(glda_base, ctype=iu.DIR, pattern='^output', full_path=True):
        dname = re.findall('output_(.+)$', out_dir)[0]
        print(dname)
        topic_list = name2object[dname].get_topics()
        print(out_dir)
        for param in iu.list_children(out_dir, ctype=iu.DIR, full_path=True):
            print(fn(param))
            for assign in iu.list_children(param, pattern='table_assignments', full_path=True):
                lines = iu.read_lines(assign)
                if len(lines) != len(topic_list):
                    continue
                print(fn(assign), len(lines))
                print(lines[100])
                # each line holds the per-token table assignments of one document;
                # take the most frequent table as that document's cluster
                cluster_list = [
                    Counter(map(int, line.split())).most_common()[0][0]
                    for line in lines
                ]
                print(au.score(topic_list, cluster_list, 'nmi'))
                print(au.score(topic_list, cluster_list, 'ari'))
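# The helper `au.score(y_true, y_pred, name)` used throughout this file is defined
# elsewhere in the repo. A minimal sketch of what it is assumed to do, dispatching
# score names onto sklearn metrics; the dispatch dict and the function name
# `_score_sketch` are assumptions for illustration, not the actual
# utils.array_utils implementation:
def _score_sketch(y_true, y_pred, name):
    from sklearn import metrics
    funcs = {
        'nmi': metrics.normalized_mutual_info_score,
        'ari': metrics.adjusted_rand_score,
        'auc': metrics.roc_auc_score,
        'homo': metrics.homogeneity_score,
        'cmplt': metrics.completeness_score,
    }
    return funcs[name](y_true, y_pred)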
def test(test_file, model_file):
    # score the first 20 labeled lines of test_file with a pretrained model
    textarr, labelarr = list(), list()
    with open(test_file) as testfp:
        lines = testfp.readlines()[:20]
    for line in lines:
        label, text = line.strip().split(' ', 1)
        textarr.append(text)
        labelarr.append(label)
    pred_value_arr = predict(textarr, ftu.load_model(model_file))
    value_arr = [label2value[label] for label in labelarr]
    print(au.score(value_arr, pred_value_arr, 'auc'))
def LECM_twarr_with_label(twarr, tw_cluster_label):
    # grid-search LECM hyperparams; currently best: alpha=1, eta=0.1, beta=0.1, lambd=1
    tw_topic_arr = tw_cluster_pred = None
    nmi = 0
    for alpha in [1]:
        for eta in [0.1]:
            for beta in [0.1]:
                for lambd in [1]:
                    tw_topic_arr_, tw_cluster_pred_ = LECMClusterer.LECM_twarr(
                        twarr, alpha, eta, beta, lambd, 20, 70)
                    nmi_ = au.score(tw_cluster_pred_, tw_cluster_label, 'nmi')
                    print('alpha {:<5}, eta {:<5}, beta {:<5}, lambd {:<5}, NMI {:<8}'.format(
                        alpha, eta, beta, lambd, nmi_))
                    if nmi < nmi_:
                        # keep the best-scoring run seen so far
                        tw_topic_arr, tw_cluster_pred = tw_topic_arr_, tw_cluster_pred_
                        nmi = nmi_
    return tw_topic_arr, tw_cluster_pred
def test(test_file, model_file):
    textarr, labelarr = list(), list()
    with open(test_file) as testfp:
        lines = testfp.readlines()
    for line in lines:
        label, text = line.strip().split(' ', 1)
        textarr.append(text)
        labelarr.append(label)
    preds, scores = predict(textarr, threshold=0.2)
    assert len(preds) == len(textarr)
    # distribution of positive predictions under varying thresholds
    for thres in [i / 10 for i in range(2, 11)]:
        print(thres, Counter([1 if s > thres else 0 for s in scores]))
    value_arr = [label2value[label] for label in labelarr]
    print(au.score(value_arr, preds, 'auc'))
    # inspect a window of mispredicted samples
    for idx in range(1000, 1100):
        pred, lb, text = preds[idx], value_arr[idx], textarr[idx]
        if pred != lb:
            print(pred, lb, text)
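# `predict`, `label2value`, and `ftu.load_model` are defined elsewhere in the repo.
# A minimal sketch of the contract assumed by the test function above, written
# against the official fastText python API; the wrapper names, the
# '__label__0'/'__label__1' scheme, and the 0/1 mapping are assumptions for
# illustration only:
import fasttext

label2value_sketch = {'__label__0': 0, '__label__1': 1}

def predict_sketch(textarr, model, threshold=0.5):
    preds, scores = list(), list()
    for text in textarr:
        labels, probs = model.predict(text)  # top-1 label and its probability
        score = probs[0] if labels[0] == '__label__1' else 1.0 - probs[0]
        scores.append(score)
        preds.append(1 if score > threshold else 0)
    return preds, scores

# usage sketch: preds, scores = predict_sketch(textarr, fasttext.load_model(model_file))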
def analyze_refine_mean_and_stderr(result_file, mean_std_file):
    using_scores = ['nmi', 'homo', 'cmplt', 'ari']
    arg_tpc_clu_list = iu.load_array(result_file)
    rows = list()
    for kwargs, topics, clusters in arg_tpc_clu_list:
        scores = [au.score(topics, clusters, s) for s in using_scores]
        res_dict = Od(zip(using_scores, scores))
        row = Od(list(kwargs.items()) + list(res_dict.items()))
        rows.append(row)
    rows = sorted(rows, key=lambda item: item['nmi'], reverse=True)
    df = pd.DataFrame(data=rows)
    print(df)
    score_array = df[using_scores].values
    mean = np.mean(score_array, axis=0)
    std = np.std(score_array, axis=0, ddof=1)  # sample standard deviation
    lines = [
        '{}: {} ± {}'.format(name, round(m, 4), round(s, 4))
        for name, m, s in zip(using_scores, mean, std)
    ]
    iu.write_lines(mean_std_file, lines)
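# Why ddof=1 above: np.std defaults to the population formula (divide by n),
# while ddof=1 gives the sample standard deviation (divide by n - 1), the usual
# choice when averaging over a handful of runs. A small self-contained check
# with illustrative numbers:
import numpy as np

runs = np.array([[0.61, 0.58], [0.65, 0.60], [0.63, 0.59]])  # rows: runs, cols: metrics
print(np.std(runs, axis=0))          # population std, divides by n
print(np.std(runs, axis=0, ddof=1))  # sample std, divides by n - 1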
def input_twarr_with_label(twarr, label):
    alpha_range = beta_range = [i / 100 for i in range(1, 10, 4)] + \
                               [i / 10 for i in range(1, 10, 4)]
    K_range = [30, 40, 50]
    iter_num = 100
    process_num = 20
    # cluster with every hyperparam combination, process_num runs at a time
    hyperparams = [(a, b, K) for a in alpha_range for b in beta_range for K in K_range]
    res_list = list()
    for i in range(int(math.ceil(len(hyperparams) / process_num))):
        param_list = [
            (twarr, *param, iter_num)
            for param in hyperparams[i * process_num:(i + 1) * process_num]
        ]
        res_list += utils.multiprocess_utils.multi_process(GSDMM.GSDMM_twarr, param_list)
        print('{:<4} /'.format((i + 1) * process_num), len(hyperparams), 'params processed')
    # index the hyperparams by row so the runs can be grouped by (alpha, K)
    frame = pd.DataFrame(index=np.arange(0, len(hyperparams)), columns=['alpha', 'beta', 'K'])
    for i in range(len(hyperparams)):
        frame.loc[i] = hyperparams[i]
    print('\n', frame, '\n')
    # one figure per (alpha, K) pair, one NMI-vs-iteration curve per beta
    for (alpha, K), indices in frame.groupby(['alpha', 'K']).groups.items():
        fig = plt.figure()
        fig.set_figheight(8)
        fig.set_figwidth(8)
        all_nmi = list()
        for i in indices:
            beta = frame.loc[i]['beta']
            tw_cluster_pred_iter = res_list[i]
            iter_x = range(len(tw_cluster_pred_iter))
            nmi_y = [au.score(label, pred, 'nmi') for pred in tw_cluster_pred_iter]
            all_nmi.append(nmi_y)
            plt.plot(iter_x, nmi_y, '-', lw=1.5, label='beta={}'.format(round(beta, 2)))
        plt.xlabel('iteration')
        plt.ylabel('NMI')
        plt.ylim(0.0, 0.75)
        plt.title('K=' + str(K))
        plt.legend(loc='lower right')
        plt.grid(True, linestyle='-', color='#333333', lw=0.8)
        plt.text(iter_num - 40, 0.70,
                 'final nmi: ' + str(round(max([nmi[-1] for nmi in all_nmi]), 6)),
                 fontsize=14, verticalalignment='bottom', horizontalalignment='left')
        plt.savefig(getcfg().dc_test + 'GSDMM/' + 'alpha={},K={}.png'.format(round(alpha, 2), K))
        plt.close(fig)
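# `utils.multiprocess_utils.multi_process` is not shown in this file. A minimal
# sketch of the assumed behavior (apply `func` to each argument tuple in parallel
# and return the results in submission order), built on the standard library;
# the real helper may differ:
from multiprocessing import Pool

def multi_process_sketch(func, args_list):
    with Pool(processes=len(args_list)) as pool:
        async_results = [pool.apply_async(func, args) for args in args_list]
        return [r.get() for r in async_results]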
# train, then cluster the penultimate-layer representations
for i in range(3):
    hist = model.fit(X, B, validation_split=0.1, epochs=100,
                     batch_size=batch_size, verbose=0, shuffle=True)
# create a model that exposes the penultimate layer
inputs = model.layers[0].input
outputs = model.layers[-2].output
model_penultimate = Model(inputs, outputs)
# inference of the penultimate layer
H = model_penultimate.predict(X)
V = normalize(H, norm='l1')
print("Sample shape: {}".format(H.shape))
km = KMeans(n_clusters=clu_num, n_jobs=4, max_iter=200)
km.fit(V)
pred = km.labels_
d = dict([(s, au.score(y, pred, s)) for s in ['nmi', 'ari']])
print(d)
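# Compatibility note: KMeans' n_jobs argument was deprecated in scikit-learn 0.23
# and removed in 1.0, so the call above only runs on older versions. On current
# scikit-learn the equivalent call simply drops it (a sketch reusing clu_num and
# V from above):
# km = KMeans(n_clusters=clu_num, max_iter=200)
# pred = km.fit(V).labels_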