def main(tseries_fpath, predict_fpath, bestby_fpath):
    """Compare a global model against per-cluster models.

    For every reference time ``tr`` in ``1..tt`` (``tt`` being the number of
    time-series columns) one model is fit on all rows and one on the rows of
    each predicted cluster. Each row is then scored with the model whose
    reference time equals that row's entry in the ``bestby`` file, and the
    mean of the collected ``gcv_sqerrors`` is printed together with the value
    returned by ``hci(errors, .95)``.

    Parameters
    ----------
    tseries_fpath : str
        Table readable by ``np.genfromtxt``; the first column is dropped.
    predict_fpath : str
        Integer cluster label per row; ``-1`` marks rows that get no
        per-cluster model.
    bestby_fpath : str
        One value per row, compared against the reference times.
    """
    # A small constant is added to every value after dropping column 0.
    X = np.genfromtxt(tseries_fpath)[:, 1:] + 0.0001
    cls_pred = np.loadtxt(predict_fpath, dtype='i')
    bestby = np.genfromtxt(bestby_fpath)

    # Sorted once so model fitting and the final report share one
    # deterministic order.
    cls_labels = sorted(set(cls_pred[cls_pred != -1]))
    tt = X.shape[1]
    ref_time = np.arange(1, tt + 1)

    models = {}
    models_per_clust = {}
    for tr in ref_time:
        models[tr] = fit(X, tr, tt)
        for k in cls_labels:
            Xk = X[cls_pred == k]
            models_per_clust[tr, k] = fit(Xk, tr, tt)

    errors_all = []
    errors_cls = []
    errors_per_cls = defaultdict(list)
    for tr in ref_time:
        idx = bestby == tr
        errors_all.extend(models[tr].gcv_sqerrors[idx])

        for cls in set(cls_pred[idx]):
            if cls == -1:
                # No per-cluster model exists for the -1 label; looking it up
                # would raise KeyError, so these rows only count globally.
                continue

            # gcv_sqerrors of a cluster model is indexed over that cluster's
            # rows, so the bestby mask must be restricted the same way.
            idx_cls = bestby[cls_pred == cls] == tr
            cls_errors = models_per_clust[tr, cls].gcv_sqerrors[idx_cls]
            errors_cls.extend(cls_errors)
            errors_per_cls[cls].extend(cls_errors)

    print('Glob model:', np.mean(errors_all), '+-', hci(errors_all, .95))
    print('Spec model:', np.mean(errors_cls), '+-', hci(errors_cls, .95))
    print()
    print('Per class')
    for cls in cls_labels:
        err = errors_per_cls[cls]
        print('Cls = ', cls, np.mean(err), '+-', hci(err, .95))
def main(tseries_fpath, predict_fpath, bestby_fpath):
    """Compare a global model against per-cluster models.

    For every reference time ``tr`` in ``1..tt`` (``tt`` being the number of
    time-series columns) one model is fit on all rows and one on the rows of
    each predicted cluster. Each row is then scored with the model whose
    reference time equals that row's entry in the ``bestby`` file, and the
    mean of the collected ``gcv_sqerrors`` is printed together with the value
    returned by ``hci(errors, .95)``.

    Parameters
    ----------
    tseries_fpath : str
        Table readable by ``np.genfromtxt``; the first column is dropped.
    predict_fpath : str
        Integer cluster label per row; ``-1`` marks rows that get no
        per-cluster model.
    bestby_fpath : str
        One value per row, compared against the reference times.
    """
    # A small constant is added to every value after dropping column 0.
    X = np.genfromtxt(tseries_fpath)[:, 1:] + 0.0001
    cls_pred = np.loadtxt(predict_fpath, dtype='i')
    bestby = np.genfromtxt(bestby_fpath)

    # Sorted once so model fitting and the final report share one
    # deterministic order.
    cls_labels = sorted(set(cls_pred[cls_pred != -1]))
    tt = X.shape[1]
    ref_time = np.arange(1, tt + 1)

    models = {}
    models_per_clust = {}
    for tr in ref_time:
        models[tr] = fit(X, tr, tt)
        for k in cls_labels:
            Xk = X[cls_pred == k]
            models_per_clust[tr, k] = fit(Xk, tr, tt)

    errors_all = []
    errors_cls = []
    errors_per_cls = defaultdict(list)
    for tr in ref_time:
        idx = bestby == tr
        errors_all.extend(models[tr].gcv_sqerrors[idx])

        for cls in set(cls_pred[idx]):
            if cls == -1:
                # No per-cluster model exists for the -1 label; looking it up
                # would raise KeyError, so these rows only count globally.
                continue

            # gcv_sqerrors of a cluster model is indexed over that cluster's
            # rows, so the bestby mask must be restricted the same way.
            idx_cls = bestby[cls_pred == cls] == tr
            cls_errors = models_per_clust[tr, cls].gcv_sqerrors[idx_cls]
            errors_cls.extend(cls_errors)
            errors_per_cls[cls].extend(cls_errors)

    print('Glob model:', np.mean(errors_all), '+-', hci(errors_all, .95))
    print('Spec model:', np.mean(errors_cls), '+-', hci(errors_cls, .95))
    print()
    print('Per class')
    for cls in cls_labels:
        err = errors_per_cls[cls]
        print('Cls = ', cls, np.mean(err), '+-', hci(err, .95))
def main(features_fpath, tseries_fpath, tags_fpath, classes_fpath, clf_name):
    """Run a grid-searched classifier and print its averaged scores.

    The input table comes from ``create_input_table`` and the labels from
    ``classes_fpath``. ``run_classifier`` returns two stacks of matrices;
    each stack is averaged along axis 0 and paired with ``hci(..., .95,
    axis=0)``. The class metrics go through ``clf_summary``; the confusion
    matrix is printed cell by cell as ``mean +- ci``.
    """
    # Only the table is needed here; the second return value is unused.
    X, _ = create_input_table(features_fpath, tseries_fpath, tags_fpath)
    y = np.loadtxt(classes_fpath)

    grid_search = create_grid_search(clf_name)
    class_matrices, conf_matrices = run_classifier(grid_search, X, y)

    # Per-class metrics: mean and interval over the stacked results.
    summary = clf_summary(np.mean(class_matrices, axis=0),
                          hci(class_matrices, .95, axis=0))
    print(summary)
    print()

    # Confusion matrix: one printed line per row index.
    conf_means = np.mean(conf_matrices, axis=0)
    conf_ci = hci(conf_matrices, .95, axis=0)

    print("Average confusion matrix with .95 confidence interval")
    print(" \ttrue ")
    print("predic")

    row_count = conf_means.shape[0]
    for row in range(row_count):
        print(row, end="\t \t")
        for col in range(conf_means.shape[1]):
            cell = (conf_means[row, col], conf_ci[row, col])
            print('%.3f +- %.3f' % cell, end="\t")
        print()
def main(features_fpath, tseries_fpath, tags_fpath, classes_fpath, clf_name):
    """Run a grid-searched classifier and print its averaged scores.

    The input table comes from ``create_input_table`` and the labels from
    ``classes_fpath``. ``run_classifier`` returns two stacks of matrices;
    each stack is averaged along axis 0 and paired with ``hci(..., .95,
    axis=0)``. The class metrics go through ``clf_summary``; the confusion
    matrix is printed cell by cell as ``mean +- ci``.

    Parameters
    ----------
    features_fpath, tseries_fpath, tags_fpath : str
        Paths forwarded unchanged to ``create_input_table``.
    classes_fpath : str
        File with the class labels, read with ``np.loadtxt``.
    clf_name : str
        Classifier identifier forwarded to ``create_grid_search``.
    """
    # Only the table is needed here; the second return value is unused.
    X, _ = create_input_table(features_fpath, tseries_fpath, tags_fpath)
    y = np.loadtxt(classes_fpath)

    clf = create_grid_search(clf_name)
    class_matrices, conf_matrices = run_classifier(clf, X, y)

    metric_means = np.mean(class_matrices, axis=0)
    metric_ci = hci(class_matrices, .95, axis=0)
    print(clf_summary(metric_means, metric_ci))
    print()

    conf_means = np.mean(conf_matrices, axis=0)
    conf_ci = hci(conf_matrices, .95, axis=0)

    print("Average confusion matrix with .95 confidence interval")
    print(" \ttrue ")
    print("predic")
    # range (not xrange) keeps this loop working on Python 2 and Python 3.
    for i in range(conf_means.shape[0]):
        print(i, end="\t \t")
        for j in range(conf_means.shape[1]):
            print('%.3f +- %.3f' % (conf_means[i, j], conf_ci[i, j]),
                  end="\t")
        print()
def main(result_fpath):
    """Summarise the two groups of BIC scores stored in a results file.

    The file is read in groups of three lines: an object id, a dict-like
    parameter line (converted to JSON by stripping ``array(...)`` wrappers
    and swapping quote characters), and a whitespace-separated list of
    floats. From each float list, entry 0 is taken as the "phoenix" score
    and the minimum of entries 2, 4, 6 and 8 as the "kir" score (naming
    follows the original variable names -- TODO confirm the column layout
    against the code that writes the file).

    Printed, in order: the fraction of objects whose kir score exceeds the
    phoenix score, a literal ``&``, mean and half confidence interval of
    each score group, and the relative difference of the two means.

    Parameters
    ----------
    result_fpath : str
        Path of the results file.

    Raises
    ------
    ValueError
        If the file contains no complete record (previously this surfaced
        as a division by zero while printing).
    """
    # Project-local dependency, imported at function scope as before.
    from vod.stats.ci import half_confidence_interval_size as hci

    parameters = OrderedDict()
    errors = OrderedDict()
    with open(result_fpath) as results_file:
        obj_id = None
        for i, line in enumerate(results_file):
            field = i % 3
            if field == 0:
                obj_id = line.strip()
            elif field == 1:
                # Turn the repr-style dict into valid JSON before parsing.
                line = line.replace('array(', '')
                line = line.replace(', dtype=int32)', '')
                line = line.replace('\'', '"')
                parameters[obj_id] = json.loads(line.strip())
            else:
                errors[obj_id] = np.asarray([float(x) for x in line.split()])
                obj_id = None

    if not parameters:
        raise ValueError('no records found in %s' % result_fpath)

    bics_phx = []
    bics_kir = []
    for key in parameters:
        err = errors[key]
        bics_phx.append(err[0])
        bics_kir.append(err[[2, 4, 6, 8]].min())

    bics_phx = np.asarray(bics_phx)
    bics_kir = np.asarray(bics_kir)

    # np.mean over the boolean mask is always a float fraction, so the
    # result is not floored under integer-division semantics.
    wins = (bics_kir - bics_phx) > 0
    win_rate = np.mean(wins)

    mean_phx = np.mean(bics_phx)
    mean_kir = np.mean(bics_kir)
    print(win_rate, '&',
          mean_phx, hci(bics_phx, .95),
          mean_kir, hci(bics_kir, .95),
          (mean_kir - mean_phx) / mean_phx)
def print_results(clf_scores, micro, macro, r2_all, mse_all, mrse_all):
    """Print the classifier summary followed by one line per scalar metric.

    ``clf_scores`` is averaged along axis 0 and handed to ``clf_summary``
    together with ``hci(clf_scores, .95, axis=0)``. Every other argument is
    reported as ``mean +- hci(values, .95)``.
    """
    summary = clf_summary(np.mean(clf_scores, axis=0),
                          hci(clf_scores, .95, axis=0))
    print(summary)

    # (format string, samples) pairs, in the order they are reported.
    report = (
        ('Micro F1 - mean: %f +- %f', micro),
        ('Macro F1 - mean: %f +- %f', macro),
        ('R2 all - mean: %f +- %f', r2_all),
        ('MSE all - mean: %f +- %f', mse_all),
        ('MRSE all - mean: %f +- %f', mrse_all),
    )
    for template, samples in report:
        print(template % (np.mean(samples), hci(samples, .95)))
def main(tseries_fpath, plot_foldpath):
    """Plot clustering quality measures against the number of clusters.

    Five times, up to 200 rows are drawn at random from the time-series
    table and their distance matrix is computed with ``dist.dist_all``. On
    each sample ``run_clustering`` is executed five times for every ``k``
    in ``range(2, 16)``. The 25 resulting values per ``k`` are averaged and
    plotted with error bars from ``hci(values, 0.95)``.

    Two files are written into ``plot_foldpath``: ``bcv.pdf`` (inter
    cluster, BetaCV and intra cluster) and ``cost.pdf`` (cost).

    Parameters
    ----------
    tseries_fpath : str
        Table readable by ``np.genfromtxt``; the first column is dropped.
    plot_foldpath : str
        Existing directory that receives the PDF files.
    """
    assert os.path.isdir(plot_foldpath)
    initialize_matplotlib()

    X = np.genfromtxt(tseries_fpath)[:, 1:].copy()
    n_samples = X.shape[0]
    sample_rows = np.arange(n_samples)

    clust_range = range(2, 16)
    n_clustering_vals = len(clust_range)

    # Result arrays have one row per (subsample, repetition) pair.
    n_subsamples = 5
    n_repeats = 5
    n_runs = n_subsamples * n_repeats

    intra_array = np.zeros(shape=(n_runs, n_clustering_vals))
    inter_array = np.zeros(shape=(n_runs, n_clustering_vals))
    bcvs_array = np.zeros(shape=(n_runs, n_clustering_vals))
    costs_array = np.zeros(shape=(n_runs, n_clustering_vals))

    r = 0
    # range (not xrange) keeps these loops working on Python 2 and Python 3.
    for _ in range(n_subsamples):
        np.random.shuffle(sample_rows)
        rand_sample = sample_rows[:200]

        X_new = X[rand_sample]
        D_new = dist.dist_all(X_new, X_new, rolling=True)[0]

        for _ in range(n_repeats):
            # Column index comes from the position in clust_range instead
            # of assuming the range starts at 2.
            for col, k in enumerate(clust_range):
                intra, inter, bcv, cost = run_clustering(X_new, k, D_new)

                intra_array[r, col] = intra
                inter_array[r, col] = inter
                bcvs_array[r, col] = bcv
                costs_array[r, col] = cost

            r += 1
            print(r)  # progress: number of completed runs

    intra_err = np.zeros(n_clustering_vals)
    inter_err = np.zeros(n_clustering_vals)
    bcvs_err = np.zeros(n_clustering_vals)
    costs_err = np.zeros(n_clustering_vals)

    for j in range(n_clustering_vals):
        intra_err[j] = hci(intra_array[:, j], 0.95)
        inter_err[j] = hci(inter_array[:, j], 0.95)
        bcvs_err[j] = hci(bcvs_array[:, j], 0.95)
        costs_err[j] = hci(costs_array[:, j], 0.95)

    plt.errorbar(clust_range, np.mean(inter_array, axis=0), fmt="gD",
                 label="Inter Cluster", yerr=inter_err)
    plt.errorbar(clust_range, np.mean(bcvs_array, axis=0), fmt="bo",
                 label="BetaCV", yerr=bcvs_err)
    plt.errorbar(clust_range, np.mean(intra_array, axis=0), fmt="rs",
                 label="Intra Cluster", yerr=intra_err)
    plt.ylabel("Average Distance")
    plt.xlabel("Number of clusters")
    plt.xlim((0.0, 16))
    plt.ylim((0.0, 1.0))
    plt.legend(frameon=False, loc="lower left")
    plt.savefig(os.path.join(plot_foldpath, "bcv.pdf"))
    plt.close()

    plt.errorbar(clust_range, np.mean(costs_array, axis=0), fmt="bo",
                 label="Cost", yerr=costs_err)
    plt.ylabel("Cost (F)")
    plt.xlabel("Number of clusters")
    plt.xlim((0.0, 16))
    plt.ylim((0.0, 1.0))
    plt.legend(frameon=False, loc="lower left")
    plt.savefig(os.path.join(plot_foldpath, "cost.pdf"))
    plt.close()
def main(tseries_fpath, plot_foldpath):
    """Plot clustering quality measures against the number of clusters.

    Five times, up to 200 rows are drawn at random from the time-series
    table and their distance matrix is computed with ``dist.dist_all``. On
    each sample ``run_clustering`` is executed five times for every ``k``
    in ``range(2, 16)``. The 25 resulting values per ``k`` are averaged and
    plotted with error bars from ``hci(values, .95)``.

    Two files are written into ``plot_foldpath``: ``bcv.pdf`` (inter
    cluster, BetaCV and intra cluster) and ``cost.pdf`` (cost).

    Parameters
    ----------
    tseries_fpath : str
        Table readable by ``np.genfromtxt``; the first column is dropped.
    plot_foldpath : str
        Existing directory that receives the PDF files.
    """
    assert os.path.isdir(plot_foldpath)
    initialize_matplotlib()

    X = np.genfromtxt(tseries_fpath)[:, 1:].copy()
    n_samples = X.shape[0]
    sample_rows = np.arange(n_samples)

    clust_range = range(2, 16)
    n_clustering_vals = len(clust_range)

    # Result arrays have one row per (subsample, repetition) pair.
    n_subsamples = 5
    n_repeats = 5
    n_runs = n_subsamples * n_repeats

    intra_array = np.zeros(shape=(n_runs, n_clustering_vals))
    inter_array = np.zeros(shape=(n_runs, n_clustering_vals))
    bcvs_array = np.zeros(shape=(n_runs, n_clustering_vals))
    costs_array = np.zeros(shape=(n_runs, n_clustering_vals))

    r = 0
    # range (not xrange) keeps these loops working on Python 2 and Python 3.
    for _ in range(n_subsamples):
        np.random.shuffle(sample_rows)
        rand_sample = sample_rows[:200]

        X_new = X[rand_sample]
        D_new = dist.dist_all(X_new, X_new, rolling=True)[0]

        for _ in range(n_repeats):
            # Column index comes from the position in clust_range instead
            # of assuming the range starts at 2.
            for col, k in enumerate(clust_range):
                intra, inter, bcv, cost = run_clustering(X_new, k, D_new)

                intra_array[r, col] = intra
                inter_array[r, col] = inter
                bcvs_array[r, col] = bcv
                costs_array[r, col] = cost

            r += 1
            print(r)  # progress: number of completed runs

    intra_err = np.zeros(n_clustering_vals)
    inter_err = np.zeros(n_clustering_vals)
    bcvs_err = np.zeros(n_clustering_vals)
    costs_err = np.zeros(n_clustering_vals)

    for j in range(n_clustering_vals):
        intra_err[j] = hci(intra_array[:, j], .95)
        inter_err[j] = hci(inter_array[:, j], .95)
        bcvs_err[j] = hci(bcvs_array[:, j], .95)
        costs_err[j] = hci(costs_array[:, j], .95)

    plt.errorbar(clust_range, np.mean(inter_array, axis=0), fmt='gD',
                 label='Inter Cluster', yerr=inter_err)
    plt.errorbar(clust_range, np.mean(bcvs_array, axis=0), fmt='bo',
                 label='BetaCV', yerr=bcvs_err)
    plt.errorbar(clust_range, np.mean(intra_array, axis=0), fmt='rs',
                 label='Intra Cluster', yerr=intra_err)
    plt.ylabel('Average Distance')
    plt.xlabel('Number of clusters')
    plt.xlim((0., 16))
    plt.ylim((0., 1.))
    plt.legend(frameon=False, loc='lower left')
    plt.savefig(os.path.join(plot_foldpath, 'bcv.pdf'))
    plt.close()

    plt.errorbar(clust_range, np.mean(costs_array, axis=0), fmt='bo',
                 label='Cost', yerr=costs_err)
    plt.ylabel('Cost (F)')
    plt.xlabel('Number of clusters')
    plt.xlim((0., 16))
    plt.ylim((0., 1.))
    plt.legend(frameon=False, loc='lower left')
    plt.savefig(os.path.join(plot_foldpath, 'cost.pdf'))
    plt.close()