Exemple #1
0
def main(tseries_fpath, assign_fpath, centroids_fpath, plot_foldpath):
    """Plot each cluster centroid and up to ten randomly chosen members.

    The time series file is loaded with its first column dropped, the
    assignment file supplies the cluster id used to select the rows of each
    cluster, and the centroid file is indexed by cluster id. For every
    cluster id ``k`` a sub-folder named ``str(k)`` is created under
    ``plot_foldpath`` (``os.mkdir`` raises if it already exists) and the
    plots are written into it through ``plot_series``.
    """
    initialize_matplotlib()

    series = np.genfromtxt(tseries_fpath)[:, 1:].copy()
    assignments = np.genfromtxt(assign_fpath)
    centroids = np.genfromtxt(centroids_fpath)

    # The loop below treats cluster ids as 0 .. n_clusters - 1.
    n_clusters = len(set(assignments))

    for k in range(n_clusters):
        out_dir = os.path.join(plot_foldpath, str(k))
        os.mkdir(out_dir)

        plot_series(centroids[k], out_dir, 'centroid', True)

        # Shuffle the row indices of this cluster and keep the first ten.
        cluster_members = series[assignments == k]
        order = np.arange(cluster_members.shape[0])
        np.random.shuffle(order)

        for i, member in enumerate(cluster_members[order[:10]]):
            print(k, i)
            plot_series(member, out_dir, 'ex-%d' % i)
Exemple #2
0
def main(features_fpath,
         classes_fpath,
         out_fpath,
         trans_fpath,
         col_to_use=2,
         is_text_features=False):
    """Draw a stacked bar chart of feature values per class.

    Features are read with ``load_text_file`` (using ``col_to_use``) when
    ``is_text_features`` is truthy and with ``load_svm_file`` otherwise.
    ``trans_fpath`` is a whitespace separated file whose first two columns
    are parsed as an int -> int mapping. The chart is produced by
    ``stacked_bars`` and written to ``out_fpath``.
    """
    initialize_matplotlib()

    classes = np.loadtxt(classes_fpath)

    # The flag handed to stacked_bars is the opposite of the text switch.
    ref = not is_text_features
    if is_text_features:
        loaded = load_text_file(features_fpath, col_to_use, classes)
    else:
        loaded = load_svm_file(features_fpath, classes)
    to_plot, sum_classes, labels = loaded

    trans = {}
    with open(trans_fpath) as trans_file:
        for line in trans_file:
            fields = line.split()
            trans[int(fields[0])] = int(fields[1])

    data = generate_data_plot(to_plot, sum_classes, labels, classes)
    stacked_bars(labels, data, out_fpath, trans, ref)
Exemple #3
0
def main(tseries_fpath, assign_fpath, centroids_fpath, plot_foldpath):
    """Plot each cluster centroid and up to ten randomly chosen members.

    Parameters
    ----------
    tseries_fpath : path of the time series file; its first column is
        dropped after loading.
    assign_fpath : path of the file with the cluster id used to select the
        rows belonging to each cluster.
    centroids_fpath : path of the centroid file, indexed by cluster id.
    plot_foldpath : existing folder; one sub-folder per cluster id is
        created inside it.

    Raises
    ------
    OSError
        From ``os.mkdir`` when a cluster sub-folder already exists.
    """
    initialize_matplotlib()

    X = np.genfromtxt(tseries_fpath)[:, 1:].copy()
    y = np.genfromtxt(assign_fpath)
    centroids = np.genfromtxt(centroids_fpath)

    # The loop below treats cluster ids as 0 .. num_classes - 1.
    num_classes = len(set(y))

    # ``range`` instead of ``xrange``: works on both Python 2 and Python 3.
    for k in range(num_classes):
        centroid_plot_foldpath = os.path.join(plot_foldpath, str(k))
        os.mkdir(centroid_plot_foldpath)

        centroid = centroids[k]
        plot_series(centroid, centroid_plot_foldpath, 'centroid', True)

        # Shuffle the row indices of this cluster and keep the first ten.
        members = X[y == k]
        n_samples = members.shape[0]
        sample_rows = np.arange(n_samples)
        np.random.shuffle(sample_rows)

        members_to_plot = members[sample_rows[:10]]
        for i in range(members_to_plot.shape[0]):
            print(k, i)
            plot_series(members_to_plot[i], centroid_plot_foldpath,
                        'ex-%d' % i)
Exemple #4
0
def main(tseries_fpath, k, plot_foldpath):
    """Cluster the time series with ``ksc.inc_ksc`` and save the results.

    For every centroid three PDFs are written to ``plot_foldpath``: the
    centroid as returned, the centroid rolled so that its maximum sits at
    the middle position, and the centroid rolled so that its minimum comes
    first. The four arrays returned by ``ksc.inc_ksc`` are then saved as
    text files in the same folder.
    """
    import mkl
    mkl.set_num_threads(16)

    initialize_matplotlib()

    X = np.genfromtxt(tseries_fpath)[:, 1:]
    # Rows that sum to zero get a small offset before clustering.
    zero_rows = np.where(X.sum(axis=1) == 0)[0]
    X[zero_rows] += .001
    X = X.copy()

    cent, assign, shift, dists_cent = ksc.inc_ksc(X, k)

    def save_line_plot(series, file_name):
        # Black line, both axes hidden, one figure per file.
        plt.plot(series, '-k')
        plt.gca().get_xaxis().set_visible(False)
        plt.gca().get_yaxis().set_visible(False)
        plt.savefig(os.path.join(plot_foldpath, file_name))
        plt.close()

    for i in range(cent.shape[0]):
        centroid = cent[i]
        save_line_plot(centroid, '%d.pdf' % i)

        # Roll the series so that the peak lands on the middle index.
        middle = centroid.shape[0] // 2
        peak_centered = dist.shift(centroid, middle - np.argmax(centroid),
                                   rolling=True)
        save_line_plot(peak_centered, '%d-peak-center.pdf' % i)

        # Roll the series so that the minimum lands on index 0.
        min_first = dist.shift(centroid, 0 - np.argmin(centroid),
                               rolling=True)
        save_line_plot(min_first, '%d-min-first.pdf' % i)

    outputs = (
        ('cents.dat', cent, '%.5f'),
        ('assign.dat', assign, '%d'),
        ('shift.dat', shift, '%d'),
        ('dists_cent.dat', dists_cent, '%.5f'),
    )
    for file_name, values, fmt in outputs:
        np.savetxt(os.path.join(plot_foldpath, file_name), values, fmt=fmt)
Exemple #5
0
def main(features_fpath, classes_fpath, out_fpath, trans_fpath, col_to_use=2, is_text_features=False):
    """Draw a stacked bar chart of feature values per class.

    Features come from ``load_text_file`` (with ``col_to_use``) when
    ``is_text_features`` is truthy, otherwise from ``load_svm_file``. The
    first two whitespace separated columns of ``trans_fpath`` are parsed as
    an int -> int mapping that is forwarded to ``stacked_bars`` together
    with the output path ``out_fpath``.
    """
    initialize_matplotlib()

    classes = np.loadtxt(classes_fpath)

    # The flag handed to stacked_bars is the opposite of the text switch.
    ref = not is_text_features
    if ref:
        loaded = load_svm_file(features_fpath, classes)
    else:
        loaded = load_text_file(features_fpath, col_to_use, classes)
    to_plot, sum_classes, labels = loaded

    trans = {}
    with open(trans_fpath) as trans_file:
        for row in trans_file:
            columns = row.split()
            trans[int(columns[0])] = int(columns[1])

    data = generate_data_plot(to_plot, sum_classes, labels, classes)
    stacked_bars(labels, data, out_fpath, trans, ref)
def main(features_fpath):
    """Print summary statistics of a features file and save a histogram.

    The file is loaded with its first column dropped. For every entry of the
    module-level ``refs`` mapping (name -> column index) the mean and
    standard deviation of ``X[:, UP_DATE] - X[:, column]`` are printed,
    using only rows where that column is positive. Statistics for column
    ``-3`` and for ``(X[:, -4] - X[:, UP_DATE]) / 7`` follow, and finally a
    20-bin histogram of the ``UP_DATE`` column is written to ``hist.pdf`` in
    the current working directory.

    Parameters
    ----------
    features_fpath : path of the whitespace separated features file.
    """
    import time

    initialize_matplotlib()

    X = np.genfromtxt(features_fpath)[:,1:]

    for r, k in sorted(refs.items()):
        # Only rows with a positive value in the referrer column are used.
        idxs = X[:,k] > 0
        time_to_ref = (X[:,UP_DATE][idxs] - X[:,k][idxs])
        print(r, np.mean(time_to_ref), np.std(time_to_ref))

    print('peak_frac', np.mean(X[:,-3]), np.std(X[:,-3]))

    # NOTE(review): the division by 7 looks like a unit conversion; the
    # units of these columns are not visible here -- confirm.
    time_to_peak = (X[:,-4] - X[:,UP_DATE]) / 7
    print('peak_date', np.mean(time_to_peak), np.std(time_to_peak))

    plt.hist(X[:,UP_DATE], bins=20)
    ticks, labels = plt.xticks()
    # Tick positions are passed to time.localtime, i.e. read as epoch seconds.
    plt.xticks(ticks, [time.strftime('%m/%y', time.localtime(x)) for x in ticks])
    # Raw string: same text as before (backslash + '#') without relying on
    # the invalid escape sequence '\#'.
    plt.ylabel(r'\# Videos')
    plt.xlabel('Month/Year')
    plt.savefig('hist.pdf')
Exemple #7
0
def main(features_fpath):
    """Print summary statistics of a features file and save a histogram.

    The file is loaded with its first column dropped. For every entry of the
    module-level ``refs`` mapping (name -> column index) the mean and
    standard deviation of ``X[:, UP_DATE] - X[:, column]`` are printed,
    using only rows where that column is positive. Statistics for column
    ``-3`` and for ``(X[:, -4] - X[:, UP_DATE]) / 7`` follow, and finally a
    20-bin histogram of the ``UP_DATE`` column is written to ``hist.pdf`` in
    the current working directory.

    Parameters
    ----------
    features_fpath : path of the whitespace separated features file.
    """
    import time

    initialize_matplotlib()

    X = np.genfromtxt(features_fpath)[:, 1:]

    for r, k in sorted(refs.items()):
        # Only rows with a positive value in the referrer column are used.
        idxs = X[:, k] > 0
        time_to_ref = (X[:, UP_DATE][idxs] - X[:, k][idxs])
        print(r, np.mean(time_to_ref), np.std(time_to_ref))

    print('peak_frac', np.mean(X[:, -3]), np.std(X[:, -3]))

    # NOTE(review): the division by 7 looks like a unit conversion; the
    # units of these columns are not visible here -- confirm.
    time_to_peak = (X[:, -4] - X[:, UP_DATE]) / 7
    print('peak_date', np.mean(time_to_peak), np.std(time_to_peak))

    plt.hist(X[:, UP_DATE], bins=20)
    ticks, labels = plt.xticks()
    # Tick positions are passed to time.localtime, i.e. read as epoch seconds.
    plt.xticks(ticks,
               [time.strftime('%m/%y', time.localtime(x)) for x in ticks])
    # Raw string: same text as before (backslash + '#') without relying on
    # the invalid escape sequence '\#'.
    plt.ylabel(r'\# Videos')
    plt.xlabel('Month/Year')
    plt.savefig('hist.pdf')
Exemple #8
0
def main(features_fpath, classes_fpath, user_users=False):
    """Print a tab separated table of asymmetric Jaccard values per class pair.

    The first output line lists the class ids. Every following line starts
    with a row id ``j`` and holds, for each column id ``i``, the value of
    ``asym_jaccard(to_compare[i], to_compare[j])`` with three decimals.
    """
    initialize_matplotlib()

    classes = np.loadtxt(classes_fpath)
    class_ids = range(len(set(classes)))

    to_compare = load_text_file(features_fpath, classes, user_users)

    # Header: an empty corner cell followed by one cell per class id.
    print('\t' + ''.join('%d\t' % col for col in class_ids))

    for row in class_ids:
        print(row, end='\t')
        for col in class_ids:
            # Column set goes first, row set second.
            score = asym_jaccard(to_compare[col], to_compare[row])
            print('%.3f' % score, end='\t')
        print()
def main(features_fpath, classes_fpath, user_users=False):
    """Print a tab separated table of asymmetric Jaccard values per class pair.

    Parameters
    ----------
    features_fpath : path handed to ``load_text_file``.
    classes_fpath : path of the class assignment file read with
        ``np.loadtxt``; the number of distinct values sets the table size.
    user_users : forwarded unchanged to ``load_text_file``.

    The first output line lists the class ids. Every following line starts
    with a row id ``j`` and holds, for each column id ``i``, the value of
    ``asym_jaccard(to_compare[i], to_compare[j])`` with three decimals.
    """
    initialize_matplotlib()

    classes = np.loadtxt(classes_fpath)
    num_classes = len(set(classes))

    to_compare = load_text_file(features_fpath, classes, user_users)

    # ``range`` instead of ``xrange``: works on both Python 2 and Python 3.
    print(end='\t')
    for i in range(num_classes):
        print(i, end='\t')
    print()

    for j in range(num_classes):
        print(j, end='\t')
        for i in range(num_classes):
            # Column set goes first, row set second.
            first_set = to_compare[i]
            second_set = to_compare[j]

            asym_j = asym_jaccard(first_set, second_set)
            print('%.3f' % asym_j, end='\t')
        print()
Exemple #10
0
def main(tseries_fpath, plot_foldpath):
    """Evaluate cluster counts 2..15 on random subsamples and plot the results.

    Five random subsamples of at most 200 series are drawn; on each one
    ``run_clustering`` is executed five times for every cluster count,
    giving 25 measurements per count. The column means are plotted with
    error bars computed by ``hci(values, 0.95)`` and saved as ``bcv.pdf``
    and ``cost.pdf`` inside ``plot_foldpath``.

    Parameters
    ----------
    tseries_fpath : path of the time series file; its first column is
        dropped after loading.
    plot_foldpath : existing folder that receives the two PDFs.

    Raises
    ------
    AssertionError
        If ``plot_foldpath`` is not a directory (the check is skipped when
        Python runs with ``-O``).
    """
    assert os.path.isdir(plot_foldpath)
    initialize_matplotlib()

    X = np.genfromtxt(tseries_fpath)[:, 1:].copy()

    n_samples = X.shape[0]
    sample_rows = np.arange(n_samples)

    clust_range = range(2, 16)
    n_clustering_vals = len(clust_range)

    # One row per (subsample, repetition) pair: 5 * 5 = 25 rows.
    intra_array = np.zeros(shape=(25, n_clustering_vals))
    inter_array = np.zeros(shape=(25, n_clustering_vals))
    bcvs_array = np.zeros(shape=(25, n_clustering_vals))
    costs_array = np.zeros(shape=(25, n_clustering_vals))

    # ``range`` instead of ``xrange``: works on both Python 2 and Python 3.
    r = 0
    for _ in range(5):
        np.random.shuffle(sample_rows)
        rand_sample = sample_rows[:200]

        X_new = X[rand_sample]
        # Distances are computed once per subsample and reused by all runs.
        D_new = dist.dist_all(X_new, X_new, rolling=True)[0]

        for _ in range(5):
            for col, k in enumerate(clust_range):
                intra, inter, bcv, cost = run_clustering(X_new, k, D_new)

                intra_array[r, col] = intra
                inter_array[r, col] = inter
                bcvs_array[r, col] = bcv
                costs_array[r, col] = cost

            r += 1
            print(r)

    intra_err = np.zeros(n_clustering_vals)
    inter_err = np.zeros(n_clustering_vals)
    bcvs_err = np.zeros(n_clustering_vals)
    costs_err = np.zeros(n_clustering_vals)

    # NOTE(review): hci presumably returns a confidence-interval half width
    # at the given level -- confirm against its definition.
    for j in range(n_clustering_vals):
        intra_err[j] = hci(intra_array[:, j], 0.95)
        inter_err[j] = hci(inter_array[:, j], 0.95)
        bcvs_err[j] = hci(bcvs_array[:, j], 0.95)
        costs_err[j] = hci(costs_array[:, j], 0.95)

    plt.errorbar(clust_range, np.mean(inter_array, axis=0), fmt="gD", label="Inter Cluster", yerr=inter_err)
    plt.errorbar(clust_range, np.mean(bcvs_array, axis=0), fmt="bo", label="BetaCV", yerr=bcvs_err)
    plt.errorbar(clust_range, np.mean(intra_array, axis=0), fmt="rs", label="Intra Cluster", yerr=intra_err)
    plt.ylabel("Average Distance")
    plt.xlabel("Number of clusters")
    plt.xlim((0.0, 16))
    plt.ylim((0.0, 1.0))
    plt.legend(frameon=False, loc="lower left")
    plt.savefig(os.path.join(plot_foldpath, "bcv.pdf"))
    plt.close()

    plt.errorbar(clust_range, np.mean(costs_array, axis=0), fmt="bo", label="Cost", yerr=costs_err)
    plt.ylabel("Cost (F)")
    plt.xlabel("Number of clusters")
    plt.xlim((0.0, 16))
    plt.ylim((0.0, 1.0))
    plt.legend(frameon=False, loc="lower left")
    plt.savefig(os.path.join(plot_foldpath, "cost.pdf"))
    plt.close()
Exemple #11
0
def main(tseries_fpath, plot_foldpath):
    """Evaluate cluster counts 2..15 on random subsamples and plot the results.

    Five random subsamples of at most 200 series are drawn; on each one
    ``run_clustering`` is executed five times for every cluster count,
    giving 25 measurements per count. The column means are plotted with
    error bars computed by ``hci(values, .95)`` and saved as ``bcv.pdf``
    and ``cost.pdf`` inside ``plot_foldpath``.

    Parameters
    ----------
    tseries_fpath : path of the time series file; its first column is
        dropped after loading.
    plot_foldpath : existing folder that receives the two PDFs.

    Raises
    ------
    AssertionError
        If ``plot_foldpath`` is not a directory (the check is skipped when
        Python runs with ``-O``).
    """
    assert os.path.isdir(plot_foldpath)
    initialize_matplotlib()

    X = np.genfromtxt(tseries_fpath)[:, 1:].copy()

    n_samples = X.shape[0]
    sample_rows = np.arange(n_samples)

    clust_range = range(2, 16)
    n_clustering_vals = len(clust_range)

    # One row per (subsample, repetition) pair: 5 * 5 = 25 rows.
    intra_array = np.zeros(shape=(25, n_clustering_vals))
    inter_array = np.zeros(shape=(25, n_clustering_vals))
    bcvs_array = np.zeros(shape=(25, n_clustering_vals))
    costs_array = np.zeros(shape=(25, n_clustering_vals))

    # ``range`` instead of ``xrange``: works on both Python 2 and Python 3.
    r = 0
    for _ in range(5):
        np.random.shuffle(sample_rows)
        rand_sample = sample_rows[:200]

        X_new = X[rand_sample]
        # Distances are computed once per subsample and reused by all runs.
        D_new = dist.dist_all(X_new, X_new, rolling=True)[0]

        for _ in range(5):
            for col, k in enumerate(clust_range):
                intra, inter, bcv, cost = run_clustering(X_new, k, D_new)

                intra_array[r, col] = intra
                inter_array[r, col] = inter
                bcvs_array[r, col] = bcv
                costs_array[r, col] = cost

            r += 1
            print(r)

    intra_err = np.zeros(n_clustering_vals)
    inter_err = np.zeros(n_clustering_vals)
    bcvs_err = np.zeros(n_clustering_vals)
    costs_err = np.zeros(n_clustering_vals)

    # NOTE(review): hci presumably returns a confidence-interval half width
    # at the given level -- confirm against its definition.
    for j in range(n_clustering_vals):
        intra_err[j] = hci(intra_array[:, j], .95)
        inter_err[j] = hci(inter_array[:, j], .95)
        bcvs_err[j] = hci(bcvs_array[:, j], .95)
        costs_err[j] = hci(costs_array[:, j], .95)

    plt.errorbar(clust_range,
                 np.mean(inter_array, axis=0),
                 fmt='gD',
                 label='Inter Cluster',
                 yerr=inter_err)
    plt.errorbar(clust_range,
                 np.mean(bcvs_array, axis=0),
                 fmt='bo',
                 label='BetaCV',
                 yerr=bcvs_err)
    plt.errorbar(clust_range,
                 np.mean(intra_array, axis=0),
                 fmt='rs',
                 label='Intra Cluster',
                 yerr=intra_err)
    plt.ylabel('Average Distance')
    plt.xlabel('Number of clusters')
    plt.xlim((0., 16))
    plt.ylim((0., 1.))
    plt.legend(frameon=False, loc='lower left')
    plt.savefig(os.path.join(plot_foldpath, 'bcv.pdf'))
    plt.close()

    plt.errorbar(clust_range,
                 np.mean(costs_array, axis=0),
                 fmt='bo',
                 label='Cost',
                 yerr=costs_err)
    plt.ylabel('Cost (F)')
    plt.xlabel('Number of clusters')
    plt.xlim((0., 16))
    plt.ylim((0., 1.))
    plt.legend(frameon=False, loc='lower left')
    plt.savefig(os.path.join(plot_foldpath, 'cost.pdf'))
    plt.close()