import numpy as np
import pandas as pd
from collections import defaultdict

import compressed_pickle


def best_test_perf_with_eval(filename, methods=None, org_names=None):
    """
    For each method and organism, pick the best parameter per split based on
    the validation (eval) performance, then report the mean eval and test
    performance over splits.
    """
    data = compressed_pickle.load(filename)

    if methods is None:
        methods = list(data.keys())
    if org_names is None:
        org_names = list(data[methods[0]][0].keys())

    best_eval_perf = np.zeros((len(methods), len(org_names)))
    best_test_perf = np.zeros((len(methods), len(org_names)))

    for m_idx, m in enumerate(methods):
        for n_idx, n in enumerate(org_names):
            assert data[m][0][n].shape == data[m][1][n].shape
            num_splits, num_params = data[m][0][n].shape

            # for each split, the parameter with the best eval performance
            best_eval_param_idx_for_each_split = np.argmax(data[m][0][n], axis=1)
            assert len(best_eval_param_idx_for_each_split) == num_splits

            tmp_eval = []
            tmp_test = []
            for s_idx, p_idx in enumerate(best_eval_param_idx_for_each_split):
                tmp_eval.append(data[m][0][n][s_idx, p_idx])
                tmp_test.append(data[m][1][n][s_idx, p_idx])

            assert len(tmp_eval) == len(tmp_test) == num_splits
            best_eval_perf[m_idx, n_idx] = np.mean(tmp_eval)
            best_test_perf[m_idx, n_idx] = np.mean(tmp_test)

        print(m, best_eval_perf[m_idx].mean(), best_test_perf[m_idx].mean())

    df_eval = pd.DataFrame(best_eval_perf, columns=org_names, index=methods)
    df_test = pd.DataFrame(best_test_perf, columns=org_names, index=methods)

    return df_eval, df_test
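# A minimal, hypothetical sketch of the per-split selection rule used above,
# on synthetic scores (not part of the original pipeline): for each split,
# take the argmax column of the eval matrix and read the test matrix at the
# same position.
def _demo_per_split_selection():
    rng = np.random.RandomState(0)
    eval_perf = rng.rand(5, 3)   # (num_splits, num_params) eval scores
    test_perf = rng.rand(5, 3)   # matching test scores

    best_idx = np.argmax(eval_perf, axis=1)           # best param per split
    split_rows = np.arange(eval_perf.shape[0])
    selected_test = test_perf[split_rows, best_idx]   # test score at that param

    print("mean eval:", eval_perf[split_rows, best_idx].mean())
    print("mean test:", selected_test.mean())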
def best_org_param_idx(filename, diff_methods=None, org_names=None):
    """
    For each method and organism, report the eval and test performance at
    the best parameter, chosen by the mean eval performance over splits.
    """
    data = compressed_pickle.load(filename)

    if diff_methods is None:
        diff_methods = list(data.keys())

    # pre-defined methods for the learning techniques
    methods = ['individual', 'union', 'mtl', 'mtmkl']
    assert set(methods) == set(diff_methods), \
        "methods from pickle file %s != %s" % (diff_methods, methods)

    if org_names is None:
        org_names = list(data[methods[0]][0].keys())

    best_param_method_org = defaultdict(dict)
    all_num_eval = np.zeros((len(methods), len(org_names)))
    all_num_test = np.zeros((len(methods), len(org_names)))

    for m_idx, m in enumerate(methods):
        for n_idx, n in enumerate(org_names):
            # argmax of the mean eval performance over splits
            best_param_idx = np.argmax(data[m][0][n].mean(axis=0))
            best_param_method_org[m][n] = best_param_idx

            all_num_eval[m_idx, n_idx] = data[m][0][n].mean(axis=0)[best_param_idx]
            all_num_test[m_idx, n_idx] = data[m][1][n].mean(axis=0)[best_param_idx]

        print(m, all_num_test[m_idx].mean(), all_num_eval[m_idx].mean())

    # create pandas frames
    df_eval = pd.DataFrame(all_num_eval, columns=org_names, index=methods)
    df_test = pd.DataFrame(all_num_test, columns=org_names, index=methods)

    # TODO plotting with pandas add-on
    # df_eval.plot(kind="bar")
    # TODO: unit test
    return df_eval, df_test
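# A small sketch (synthetic data, not from the pickle files) contrasting the
# per-org rule above with the per-split rule in best_test_perf_with_eval:
# here the eval scores are averaged over splits *first*, and a single
# parameter index is chosen per organism.
def _demo_per_org_selection():
    rng = np.random.RandomState(1)
    eval_perf = rng.rand(5, 3)   # (num_splits, num_params)
    test_perf = rng.rand(5, 3)

    best_param_idx = np.argmax(eval_perf.mean(axis=0))   # one index per org
    print("best param idx:", best_param_idx)
    print("eval at best:", eval_perf.mean(axis=0)[best_param_idx])
    print("test at best:", test_perf.mean(axis=0)[best_param_idx])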
def best_global_param_idx(filename, methods=None, org_names=None):
    """
    For each method, report the best parameter (averaged over organisms and
    splits) based on the eval data, together with the corresponding mean
    test performance.
    """
    data = compressed_pickle.load(filename)

    if methods is None:
        methods = list(data.keys())
    if org_names is None:
        org_names = list(data[methods[0]][0].keys())

    best_param_method = {}
    best_test_method = {}

    inner_shape = data[methods[0]][0][org_names[0]].shape
    assert inner_shape == data[methods[0]][1][org_names[0]].shape

    for m in methods:
        all_num = np.zeros((len(org_names), inner_shape[0], inner_shape[1]))
        all_num_test = np.zeros((len(org_names), inner_shape[0], inner_shape[1]))

        for i, n in enumerate(org_names):
            assert data[m][0][n].shape == inner_shape
            all_num[i] = data[m][0][n]
            all_num_test[i] = data[m][1][n]

        # average over orgs and splits
        mean_perf = all_num.mean(axis=1).mean(axis=0)
        assert len(mean_perf) == inner_shape[1]

        best_param_idx_eval = np.argmax(mean_perf)
        best_param_method[m] = best_param_idx_eval
        best_test_method[m] = all_num_test.mean(axis=1).mean(axis=0)[best_param_idx_eval]

    return best_param_method, best_test_method
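# A sketch of the global rule with made-up shapes: stack the per-org score
# matrices, average over splits and organisms, then take a single argmax so
# that every organism shares one parameter setting.
def _demo_global_selection():
    rng = np.random.RandomState(2)
    num_orgs, num_splits, num_params = 4, 5, 3
    eval_perf = rng.rand(num_orgs, num_splits, num_params)
    test_perf = rng.rand(num_orgs, num_splits, num_params)

    mean_perf = eval_perf.mean(axis=1).mean(axis=0)   # (num_params,)
    best_idx = np.argmax(mean_perf)
    print("global best param idx:", best_idx)
    print("test at global best:", test_perf.mean(axis=1).mean(axis=0)[best_idx])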
def highest_org_param(filename, diff_methods=None, org_names=None):
    """
    For each method and organism, report the highest mean performance over
    splits (max over parameters, e.g. the C values), selected independently
    on the eval and on the test data.
    """
    data = compressed_pickle.load(filename)

    if diff_methods is None:
        diff_methods = list(data.keys())

    # pre-defined methods for the learning techniques (mtmkl excluded here)
    methods = ['individual', 'union', 'mtl']
    assert set(methods) == set(diff_methods), \
        "methods from pickle file %s != %s" % (diff_methods, methods)

    if org_names is None:
        org_names = list(data[methods[0]][0].keys())

    all_num_eval = np.zeros((len(methods), len(org_names)))
    all_num_test = np.zeros((len(methods), len(org_names)))

    for m_idx, m in enumerate(methods):
        for n_idx, n in enumerate(org_names):
            # best achievable mean performance on each data set
            all_num_eval[m_idx, n_idx] = np.amax(data[m][0][n].mean(axis=0))
            all_num_test[m_idx, n_idx] = np.amax(data[m][1][n].mean(axis=0))

        print(m, all_num_test[m_idx].mean(), all_num_eval[m_idx].mean())

    # create pandas frames
    df_eval = pd.DataFrame(all_num_eval, columns=org_names, index=methods)
    df_test = pd.DataFrame(all_num_test, columns=org_names, index=methods)

    # TODO: unit test
    return df_eval, df_test
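# Expected layout of the pickle files consumed by the functions above,
# reconstructed from how they index `data` (organism names here are
# hypothetical):
#   data[method] = (eval_dict, test_dict)
#   eval_dict[org] / test_dict[org] = array of shape (num_splits, num_params)
# A builder for such a structure; whether compressed_pickle offers a save()
# counterpart to load() is an assumption, so only the dict is returned.
def _demo_build_result_dict():
    rng = np.random.RandomState(3)
    methods = ['individual', 'union', 'mtl']
    orgs = ['orgA', 'orgB']   # hypothetical organism names

    data = {}
    for m in methods:
        eval_dict = {n: rng.rand(5, 3) for n in orgs}   # (num_splits, num_params)
        test_dict = {n: rng.rand(5, 3) for n in orgs}
        data[m] = (eval_dict, test_dict)
    return data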