Beispiel #1
0
def main(tseries_fpath, predict_fpath, bestby_fpath):
    """Compare one global model against per-cluster models and print errors.

    For every reference time ``tr`` in ``1..tt`` (``tt`` being the number of
    time series columns) a model is fitted with ``fit`` on all rows and on
    the rows of each cluster.  Each row is then scored with the model whose
    reference time equals the row's entry in the best-by file, and the mean
    of the collected ``gcv_sqerrors`` plus a .95 half confidence interval is
    printed for the global model, for all cluster models pooled, and for
    each cluster separately.

    Args:
        tseries_fpath: Path of the time series table; its first column is
            dropped before use.
        predict_fpath: Path of the integer cluster label of each row.  Rows
            labelled ``-1`` get no cluster model.
        bestby_fpath: Path of the per-row reference time.
    """
    # A small constant is added to every value -- presumably so `fit` never
    # sees an exact zero; TODO confirm against `fit`.
    X = np.genfromtxt(tseries_fpath)[:, 1:] + 0.0001
    cls_pred = np.loadtxt(predict_fpath, dtype='i')
    bestby = np.genfromtxt(bestby_fpath)

    # -1 is excluded here, so no per-cluster model exists for it.
    cls_labels = set(cls_pred[cls_pred != -1])

    tt = X.shape[1]
    ref_time = np.arange(1, tt + 1)

    # NOTE: to evaluate a single reference time, restrict `ref_time` to that
    # value and overwrite `bestby` with it.

    models = {}
    models_per_clust = {}
    for tr in ref_time:
        models[tr] = fit(X, tr, tt)

        for k in sorted(cls_labels):
            models_per_clust[tr, k] = fit(X[cls_pred == k], tr, tt)

    errors_all = []
    errors_cls = []
    errors_per_cls = defaultdict(list)
    for tr in ref_time:
        idx = bestby == tr
        errors_all.extend(models[tr].gcv_sqerrors[idx])

        for cls in set(cls_pred[idx]):
            if cls not in cls_labels:
                # Rows labelled -1 have no entry in `models_per_clust`;
                # looking them up used to raise KeyError.
                continue

            # The cluster model was fitted on X[cls_pred == cls], so the mask
            # must be built over that same subset of rows.
            idx_cls = bestby[cls_pred == cls] == tr
            cls_errors = models_per_clust[tr, cls].gcv_sqerrors[idx_cls]

            errors_cls.extend(cls_errors)
            errors_per_cls[cls].extend(cls_errors)

    print('Glob model:', np.mean(errors_all), '+-', hci(errors_all, .95))
    print('Spec model:', np.mean(errors_cls), '+-', hci(errors_cls, .95))
    print()
    print('Per class')
    for cls in cls_labels:
        err = errors_per_cls[cls]
        print('Cls = ', cls, np.mean(err), '+-', hci(err, .95))
Beispiel #2
0
def main(tseries_fpath, predict_fpath, bestby_fpath):
    """Print the errors of a global model versus per-cluster models.

    One model per reference time ``tr`` (``1`` up to the number of time
    series columns) is fitted with ``fit`` on the full table, and one more
    per ``(tr, cluster)`` pair on the rows of that cluster.  Rows are scored
    by the model matching their value in the best-by file; the mean of the
    gathered ``gcv_sqerrors`` and a .95 half confidence interval are printed
    globally, pooled over clusters, and per cluster.

    Args:
        tseries_fpath: Path of the time series table (first column dropped).
        predict_fpath: Path of the integer cluster labels, one per row; the
            label ``-1`` is excluded from cluster modelling.
        bestby_fpath: Path of the reference time chosen for each row.
    """
    # The 0.0001 offset is applied to all values -- presumably to avoid exact
    # zeros inside `fit`; TODO confirm.
    X = np.genfromtxt(tseries_fpath)[:, 1:] + 0.0001
    cls_pred = np.loadtxt(predict_fpath, dtype='i')
    bestby = np.genfromtxt(bestby_fpath)

    cls_labels = set(cls_pred[cls_pred != -1])
    ordered_labels = sorted(cls_labels)

    tt = X.shape[1]
    ref_time = np.arange(1, tt + 1)

    # NOTE: a single reference time can be studied by shrinking `ref_time`
    # to one value and setting every `bestby` entry to it.

    models = {}
    models_per_clust = {}
    for tr in ref_time:
        models[tr] = fit(X, tr, tt)

        for k in ordered_labels:
            Xk = X[cls_pred == k]
            models_per_clust[tr, k] = fit(Xk, tr, tt)

    errors_all = []
    errors_cls = []
    errors_per_cls = defaultdict(list)
    for tr in ref_time:
        idx = bestby == tr
        errors_all.extend(models[tr].gcv_sqerrors[idx])

        # Only labels that actually have a cluster model are scored; the
        # excluded label -1 previously triggered a KeyError in the lookup.
        scored_labels = set(cls_pred[idx]) & cls_labels
        for cls in scored_labels:
            # Mask over the rows of this cluster only, matching the rows the
            # cluster model was fitted on.
            bestby_for_cls = bestby[cls_pred == cls]
            idx_cls = bestby_for_cls == tr

            cls_errors = models_per_clust[tr, cls].gcv_sqerrors[idx_cls]
            errors_cls.extend(cls_errors)
            errors_per_cls[cls].extend(cls_errors)

    print('Glob model:', np.mean(errors_all), '+-', hci(errors_all, .95))
    print('Spec model:', np.mean(errors_cls), '+-', hci(errors_cls, .95))
    print()
    print('Per class')
    for cls in cls_labels:
        err = errors_per_cls[cls]
        print('Cls = ', cls, np.mean(err), '+-', hci(err, .95))
Beispiel #3
0
def main(features_fpath, tseries_fpath, tags_fpath, classes_fpath, clf_name):
    """Evaluate a classifier and print its averaged metrics.

    The input table comes from ``create_input_table``, the labels from
    ``classes_fpath`` and the classifier from ``create_grid_search``.  The
    matrices returned by ``run_classifier`` are averaged over their first
    axis and printed together with their .95 half confidence intervals.

    Args:
        features_fpath: Path handed to ``create_input_table``.
        tseries_fpath: Path handed to ``create_input_table``.
        tags_fpath: Path handed to ``create_input_table``.
        classes_fpath: Path of the class labels, loaded with ``np.loadtxt``.
        clf_name: Classifier name handed to ``create_grid_search``.
    """
    # The second value returned by create_input_table is not needed here.
    X, _ = create_input_table(features_fpath, tseries_fpath, tags_fpath)
    y = np.loadtxt(classes_fpath)

    grid_search = create_grid_search(clf_name)
    class_matrices, conf_matrices = run_classifier(grid_search, X, y)

    # Per-metric summary.
    avg_metrics = np.mean(class_matrices, axis=0)
    ci_metrics = hci(class_matrices, .95, axis=0)
    print(clf_summary(avg_metrics, ci_metrics))
    print()

    # Confusion matrix, one printed line per row index.
    avg_conf = np.mean(conf_matrices, axis=0)
    ci_conf = hci(conf_matrices, .95, axis=0)
    print("Average confusion matrix with .95 confidence interval")
    print(" \ttrue ")
    print("predic")

    n_rows = avg_conf.shape[0]
    n_cols = avg_conf.shape[1]
    for row in range(n_rows):
        print(row, end="\t \t")
        for col in range(n_cols):
            cell = '%.3f +- %.3f' % (avg_conf[row, col], ci_conf[row, col])
            print(cell, end="\t")
        print()
Beispiel #4
0
def main(features_fpath, tseries_fpath, tags_fpath, classes_fpath, clf_name):
    """Run a classifier and print mean metrics and mean confusion matrix.

    ``create_input_table`` builds the input table, ``create_grid_search``
    builds the classifier and ``run_classifier`` returns the per-run metric
    and confusion matrices.  Both are averaged over their first axis and
    printed with .95 half confidence intervals.

    Args:
        features_fpath: Path passed to ``create_input_table``.
        tseries_fpath: Path passed to ``create_input_table``.
        tags_fpath: Path passed to ``create_input_table``.
        classes_fpath: Path of the class labels, read with ``np.loadtxt``.
        clf_name: Name passed to ``create_grid_search``.
    """
    X, params = create_input_table(features_fpath, tseries_fpath, tags_fpath)
    y = np.loadtxt(classes_fpath)

    clf = create_grid_search(clf_name)
    class_matrices, conf_matrices = run_classifier(clf, X, y)

    metric_means = np.mean(class_matrices, axis=0)
    metric_ci = hci(class_matrices, .95, axis=0)
    print(clf_summary(metric_means, metric_ci))
    print()

    conf_means = np.mean(conf_matrices, axis=0)
    conf_ci = hci(conf_matrices, .95, axis=0)
    print("Average confusion matrix with .95 confidence interval")
    print(" \ttrue ")
    print("predic")
    # `range` instead of `xrange`: the block already relies on the print
    # function, and `xrange` does not exist on Python 3.
    for i in range(conf_means.shape[0]):
        print(i, end="\t \t")
        for j in range(conf_means.shape[1]):
            print('%.3f +- %.3f' % (conf_means[i, j], conf_ci[i, j]), end="\t")
        print()
Beispiel #5
0
def main(result_fpath):
    """Summarise the scores stored in a three-lines-per-object results file.

    The file is read in groups of three lines: an object id, a dict-like
    parameter line (converted to JSON and parsed) and a whitespace separated
    line of floats.  For every object the first float is compared with the
    minimum of the floats at positions 2, 4, 6 and 8, and a single summary
    line is printed: fraction of objects where that minimum is larger, the
    mean and .95 half confidence interval of both series, and the relative
    difference of the two means.

    Args:
        result_fpath: Path of the results file.

    Raises:
        ValueError: If a parameter line is not valid JSON after conversion
            or a score cannot be parsed as a float.
        ZeroDivisionError: If the file contains no complete record.
    """
    parameters = OrderedDict()
    errors = OrderedDict()
    with open(result_fpath) as results_file:
        obj_id = None
        for i, line in enumerate(results_file):
            position = i % 3
            if position == 0:
                obj_id = line.strip()
            elif position == 1:
                # Turn the Python-repr style line into JSON: unwrap int32
                # numpy arrays and switch to double quotes.
                line = line.replace('array(', '')
                line = line.replace(', dtype=int32)', '')
                line = line.replace('\'', '"')
                parameters[obj_id] = json.loads(line.strip())
            else:
                errors[obj_id] = np.asarray([float(x) for x in line.split()])
                obj_id = None

    bics_phx = []
    bics_kir = []
    wins = []
    for key in parameters:
        err = errors[key]

        # NOTE(review): the odd positions (1 and 3, 5, 7, 9) appear to hold
        # an alternative score that can be swapped in here -- confirm.
        bic_phoenix = err[0]
        bic_kir = err[[2, 4, 6, 8]].min()

        bics_phx.append(bic_phoenix)
        bics_kir.append(bic_kir)
        wins.append(bic_kir - bic_phoenix > 0)

    bics_phx = np.asarray(bics_phx)
    bics_kir = np.asarray(bics_kir)

    from vod.stats.ci import half_confidence_interval_size as hci
    # Force true division so the fraction is not truncated to 0 when the
    # interpreter uses integer division for int / int.
    win_fraction = sum(wins) / float(bics_phx.shape[0])
    print(win_fraction, '&', np.mean(bics_phx), hci(bics_phx, .95), np.mean(bics_kir), hci(bics_kir, .95),
            (np.mean(bics_kir) - np.mean(bics_phx)) / np.mean(bics_phx))
Beispiel #6
0
def print_results(clf_scores, micro, macro, r2_all, mse_all, mrse_all):
    """Print the classifier summary followed by one line per score series.

    Args:
        clf_scores: Scores averaged over axis 0 and passed to ``clf_summary``
            together with their .95 half confidence interval.
        micro: Values reported on the "Micro F1" line.
        macro: Values reported on the "Macro F1" line.
        r2_all: Values reported on the "R2 all" line.
        mse_all: Values reported on the "MSE all" line.
        mrse_all: Values reported on the "MRSE all" line.
    """
    avg_scores = np.mean(clf_scores, axis=0)
    ci_scores = hci(clf_scores, .95, axis=0)
    print(clf_summary(avg_scores, ci_scores))

    # Each line shows the mean and the .95 half confidence interval.
    report = (
        ('Micro F1 - mean: %f +- %f', micro),
        ('Macro F1 - mean: %f +- %f', macro),
        ('R2 all   - mean: %f +- %f', r2_all),
        ('MSE all   - mean: %f +- %f', mse_all),
        ('MRSE all   - mean: %f +- %f', mrse_all),
    )
    for template, values in report:
        print(template % (np.mean(values), hci(values, .95)))
Beispiel #7
0
def print_results(clf_scores, micro, macro, r2_all, mse_all, mrse_all):
    """Report classification and regression scores on standard output.

    The first output is ``clf_summary`` of the axis-0 mean of ``clf_scores``
    and its .95 half confidence interval; it is followed by the mean and
    .95 half confidence interval of each remaining argument, in the order
    ``micro``, ``macro``, ``r2_all``, ``mse_all``, ``mrse_all``.
    """
    def _mean_and_ci(values):
        # Pair consumed by the two %f placeholders of every line below.
        return np.mean(values), hci(values, .95)

    summary_means = np.mean(clf_scores, axis=0)
    summary_ci = hci(clf_scores, .95, axis=0)
    print(clf_summary(summary_means, summary_ci))

    print('Micro F1 - mean: %f +- %f' % _mean_and_ci(micro))
    print('Macro F1 - mean: %f +- %f' % _mean_and_ci(macro))
    print('R2 all   - mean: %f +- %f' % _mean_and_ci(r2_all))
    print('MSE all   - mean: %f +- %f' % _mean_and_ci(mse_all))
    print('MRSE all   - mean: %f +- %f' % _mean_and_ci(mrse_all))
Beispiel #8
0
def main(tseries_fpath, plot_foldpath):
    """Plot clustering quality measures against the number of clusters.

    Five random subsamples of at most 200 rows are drawn from the time
    series table; on each, ``run_clustering`` is executed five times for
    every cluster count in ``range(2, 16)``.  The four values it returns
    (intra, inter, bcv, cost) are averaged over the 25 runs and drawn with
    .95 half confidence interval error bars into ``bcv.pdf`` and
    ``cost.pdf`` inside ``plot_foldpath``.

    Args:
        tseries_fpath: Path of the time series table (first column dropped).
        plot_foldpath: Existing directory receiving the two PDF files.

    Raises:
        AssertionError: If ``plot_foldpath`` is not a directory.
    """
    assert os.path.isdir(plot_foldpath)
    initialize_matplotlib()

    X = np.genfromtxt(tseries_fpath)[:, 1:].copy()

    n_samples = X.shape[0]
    sample_rows = np.arange(n_samples)

    clust_range = range(2, 16)
    n_clustering_vals = len(clust_range)

    # Experiment size; the result arrays are dimensioned from these so the
    # loops and the arrays cannot drift apart.
    n_subsamples = 5
    n_repeats = 5
    subsample_size = 200
    n_runs = n_subsamples * n_repeats

    intra_array = np.zeros(shape=(n_runs, n_clustering_vals))
    inter_array = np.zeros(shape=(n_runs, n_clustering_vals))
    bcvs_array = np.zeros(shape=(n_runs, n_clustering_vals))
    costs_array = np.zeros(shape=(n_runs, n_clustering_vals))

    r = 0
    # `range` replaces `xrange`, which does not exist on Python 3.
    for _ in range(n_subsamples):
        np.random.shuffle(sample_rows)
        rand_sample = sample_rows[:subsample_size]

        X_new = X[rand_sample]
        # First element of the dist_all result -- presumably the pairwise
        # distance matrix of the subsample; TODO confirm.
        D_new = dist.dist_all(X_new, X_new, rolling=True)[0]

        for _ in range(n_repeats):
            # Column index comes from the position in `clust_range`, not
            # from `k - 2`, so it stays valid if the range start changes.
            for col, k in enumerate(clust_range):
                intra, inter, bcv, cost = run_clustering(X_new, k, D_new)

                intra_array[r, col] = intra
                inter_array[r, col] = inter
                bcvs_array[r, col] = bcv
                costs_array[r, col] = cost

            r += 1
            print(r)

    intra_err = np.zeros(n_clustering_vals)
    inter_err = np.zeros(n_clustering_vals)
    bcvs_err = np.zeros(n_clustering_vals)
    costs_err = np.zeros(n_clustering_vals)

    for j in range(n_clustering_vals):
        intra_err[j] = hci(intra_array[:, j], 0.95)
        inter_err[j] = hci(inter_array[:, j], 0.95)
        bcvs_err[j] = hci(bcvs_array[:, j], 0.95)
        costs_err[j] = hci(costs_array[:, j], 0.95)

    plt.errorbar(clust_range, np.mean(inter_array, axis=0), fmt="gD", label="Inter Cluster", yerr=inter_err)
    plt.errorbar(clust_range, np.mean(bcvs_array, axis=0), fmt="bo", label="BetaCV", yerr=bcvs_err)
    plt.errorbar(clust_range, np.mean(intra_array, axis=0), fmt="rs", label="Intra Cluster", yerr=intra_err)
    plt.ylabel("Average Distance")
    plt.xlabel("Number of clusters")
    plt.xlim((0.0, 16))
    plt.ylim((0.0, 1.0))
    plt.legend(frameon=False, loc="lower left")
    plt.savefig(os.path.join(plot_foldpath, "bcv.pdf"))
    plt.close()

    plt.errorbar(clust_range, np.mean(costs_array, axis=0), fmt="bo", label="Cost", yerr=costs_err)
    plt.ylabel("Cost (F)")
    plt.xlabel("Number of clusters")
    plt.xlim((0.0, 16))
    plt.ylim((0.0, 1.0))
    plt.legend(frameon=False, loc="lower left")
    plt.savefig(os.path.join(plot_foldpath, "cost.pdf"))
    plt.close()
Beispiel #9
0
def main(tseries_fpath, plot_foldpath):
    """Evaluate clusterings for 2..15 clusters and save two error-bar plots.

    The time series table is subsampled five times (at most 200 rows each)
    and ``run_clustering`` is run five times per subsample for every cluster
    count.  Means over the 25 runs of the returned intra, inter, bcv and
    cost values are plotted with .95 half confidence interval error bars;
    the first three go to ``bcv.pdf``, the cost to ``cost.pdf``.

    Args:
        tseries_fpath: Path of the time series table (first column dropped).
        plot_foldpath: Existing directory in which the PDFs are written.

    Raises:
        AssertionError: If ``plot_foldpath`` is not a directory.
    """
    assert os.path.isdir(plot_foldpath)
    initialize_matplotlib()

    X = np.genfromtxt(tseries_fpath)[:, 1:].copy()

    n_samples = X.shape[0]
    sample_rows = np.arange(n_samples)

    clust_range = range(2, 16)
    n_clustering_vals = len(clust_range)

    # Named experiment dimensions; the number of result rows is derived from
    # them rather than hard-coded as 25.
    n_draws = 5
    n_runs_per_draw = 5
    draw_size = 200
    n_rows = n_draws * n_runs_per_draw

    intra_array = np.zeros(shape=(n_rows, n_clustering_vals))
    inter_array = np.zeros(shape=(n_rows, n_clustering_vals))
    bcvs_array = np.zeros(shape=(n_rows, n_clustering_vals))
    costs_array = np.zeros(shape=(n_rows, n_clustering_vals))

    r = 0
    # `xrange` is unavailable on Python 3; `range` works on both versions.
    for _ in range(n_draws):
        np.random.shuffle(sample_rows)
        rand_sample = sample_rows[:draw_size]

        X_new = X[rand_sample]
        # Only the first element of the dist_all result is used --
        # presumably the distance matrix; TODO confirm.
        D_new = dist.dist_all(X_new, X_new, rolling=True)[0]

        for _ in range(n_runs_per_draw):
            # enumerate() yields the column directly, removing the hidden
            # dependency on `clust_range` starting at 2.
            for col, k in enumerate(clust_range):
                intra, inter, bcv, cost = run_clustering(X_new, k, D_new)

                intra_array[r, col] = intra
                inter_array[r, col] = inter
                bcvs_array[r, col] = bcv
                costs_array[r, col] = cost

            r += 1
            print(r)

    intra_err = np.zeros(n_clustering_vals)
    inter_err = np.zeros(n_clustering_vals)
    bcvs_err = np.zeros(n_clustering_vals)
    costs_err = np.zeros(n_clustering_vals)

    for j in range(n_clustering_vals):
        intra_err[j] = hci(intra_array[:, j], .95)
        inter_err[j] = hci(inter_array[:, j], .95)
        bcvs_err[j] = hci(bcvs_array[:, j], .95)
        costs_err[j] = hci(costs_array[:, j], .95)

    plt.errorbar(clust_range,
                 np.mean(inter_array, axis=0),
                 fmt='gD',
                 label='Inter Cluster',
                 yerr=inter_err)
    plt.errorbar(clust_range,
                 np.mean(bcvs_array, axis=0),
                 fmt='bo',
                 label='BetaCV',
                 yerr=bcvs_err)
    plt.errorbar(clust_range,
                 np.mean(intra_array, axis=0),
                 fmt='rs',
                 label='Intra Cluster',
                 yerr=intra_err)
    plt.ylabel('Average Distance')
    plt.xlabel('Number of clusters')
    plt.xlim((0., 16))
    plt.ylim((0., 1.))
    plt.legend(frameon=False, loc='lower left')
    plt.savefig(os.path.join(plot_foldpath, 'bcv.pdf'))
    plt.close()

    plt.errorbar(clust_range,
                 np.mean(costs_array, axis=0),
                 fmt='bo',
                 label='Cost',
                 yerr=costs_err)
    plt.ylabel('Cost (F)')
    plt.xlabel('Number of clusters')
    plt.xlim((0., 16))
    plt.ylim((0., 1.))
    plt.legend(frameon=False, loc='lower left')
    plt.savefig(os.path.join(plot_foldpath, 'cost.pdf'))
    plt.close()