Exemple #1
    def test_dist_all(self):
        """Exercise ``dist.dist_all`` on three small hand-computed inputs."""
        # Case 1: zero series against zero series. Both elements of the
        # returned pair (index 0 and index 1) must be all-zero 2x2 matrices.
        zeros_a = np.array([[0.0], [0.0]])
        zeros_b = np.array([[0.0], [0.0]])
        want = np.zeros((2, 2))
        assert_array_equal(want, dist.dist_all(zeros_a, zeros_b)[0])
        assert_array_equal(want, dist.dist_all(zeros_a, zeros_b)[1])

        # Case 2: constant-one series against zero series; every entry of
        # the first returned matrix must be exactly 1.
        ones = np.array([[1.0], [1.0]])
        zeros_b = np.array([[0.0], [0.0]])
        want = np.ones((2, 2))
        assert_array_equal(want, dist.dist_all(ones, zeros_b)[0])

        # Case 3: the same two length-3 series on both sides, with the third
        # positional argument set to True (presumably the `rolling` flag used
        # by keyword elsewhere -- confirm against dist_all's signature).
        rows = [[2.0, 3.0, 4.0], [3.0, 4.0, 0.0]]
        left = np.array(rows)
        right = np.array(rows)
        off_diag = 2 / sqrt(29)
        want = np.array([[0.0, off_diag], [off_diag, 0.0]])
        assert_array_almost_equal(want, dist.dist_all(left, right, True)[0])
Exemple #2
    def test_dist_all(self):
        """Check ``dist.dist_all`` against hand-computed expectations."""
        # All-zero inputs: index 0 and index 1 of the result are both
        # expected to be 2x2 matrices of zeros.
        first = np.array([[0.0], [0.0]])
        second = np.array([[0.0], [0.0]])
        target = np.zeros((2, 2))
        assert_array_equal(target, dist.dist_all(first, second)[0])
        assert_array_equal(target, dist.dist_all(first, second)[1])

        # Ones against zeros: the first returned matrix is all ones.
        first = np.array([[1.0], [1.0]])
        second = np.array([[0.0], [0.0]])
        target = np.ones((2, 2))
        assert_array_equal(target, dist.dist_all(first, second)[0])

        # Identical pairs of length-3 series, third positional argument True
        # (presumably `rolling` -- confirm): zero on the diagonal and
        # 2 / sqrt(29) off the diagonal, compared approximately.
        series = [[2.0, 3.0, 4.0], [3.0, 4.0, 0.0]]
        first = np.array(series)
        second = np.array(series)
        cross = 2 / sqrt(29)
        target = np.array([[0.0, cross], [cross, 0.0]])
        assert_array_almost_equal(target, dist.dist_all(first, second, True)[0])
Exemple #3
def main(tseries_fpath, test_fpath, cents_fpath):
    """Print, one per line, the index of the closest centroid of each series.

    tseries_fpath, test_fpath: paths passed unchanged to
        ``ioutil.load_series``, which returns the series to classify
    cents_fpath: text file with the centroids, read with ``np.loadtxt``
    """
    X = ioutil.load_series(tseries_fpath, test_fpath)
    C = np.loadtxt(cents_fpath)

    # Element 0 of the dist_all result is the distance matrix; the argmin
    # over axis 0 selects, for every column, the row holding the minimum.
    dist_cents = dist.dist_all(C, X, rolling=True)[0]
    y_true = dist_cents.argmin(axis=0)

    for t in y_true:
        # Call syntax prints the same on Python 2 and Python 3; the former
        # `print t` statement is a SyntaxError on Python 3.
        print(t)
Exemple #4
def cost(tseries, assign, centroids, dist_centroids=None):
    """Return the mean squared distance of each series to its own centroid.

    tseries: matrix with one time series per row
    assign: for each series, the row index of the centroid it belongs to
    centroids: matrix with one centroid per row; only used when
        `dist_centroids` is not supplied
    dist_centroids: optional precomputed matrix indexed as
        ``[centroid, series]``

    Raises ZeroDivisionError when `tseries` has no rows.
    """
    num_series = tseries.shape[0]
    if dist_centroids is None:
        # dist_all returns a pair; element 0 is the distance matrix. Without
        # the [0] the 2-d indexing below was applied to a tuple and raised
        # TypeError.
        # NOTE(review): other callers pass rolling=True -- confirm that the
        # default is really wanted here.
        dist_centroids = dist_all(centroids, tseries)[0]

    cost_f = 0.0
    for i in range(num_series):
        cost_f += dist_centroids[assign[i], i] ** 2
    return cost_f / num_series
Exemple #5
def cost(tseries, assign, centroids, dist_centroids=None):
    """Compute the average squared distance between series and centroids.

    tseries: matrix with one time series per row
    assign: for each series, the row index of its centroid
    centroids: matrix with one centroid per row; ignored when
        `dist_centroids` is given
    dist_centroids: optional precomputed matrix indexed as
        ``[centroid, series]``

    Raises ZeroDivisionError when `tseries` has no rows.
    """
    num_series = tseries.shape[0]
    if dist_centroids is None:
        # dist_all returns a pair whose first element is the distance
        # matrix; indexing the pair itself with [k, i] raised TypeError.
        # NOTE(review): other callers pass rolling=True -- confirm that the
        # default is really wanted here.
        dist_centroids = dist_all(centroids, tseries)[0]

    squared = (dist_centroids[assign[i], i] ** 2 for i in range(num_series))
    return sum(squared, 0.0) / num_series
Exemple #6
def avg_inter_dist(tseries, assign, dists_all_pairs=None):
    """Return (mean, std) of distances between series of different clusters.

    tseries: matrix with one time series per row
    assign: cluster id of each series
    dists_all_pairs: optional precomputed pairwise distance matrix; when
        omitted it is element 0 of ``dist_all(tseries, tseries,
        rolling=True)``

    Every ordered pair (i, j) with different cluster ids contributes
    ``dists_all_pairs[i, j]``, so each unordered pair is visited twice.
    With a single cluster there is nothing to average and NumPy yields nan.
    """
    num_series = tseries.shape[0]
    if dists_all_pairs is None:
        dists_all_pairs = dist_all(tseries, tseries, rolling=True)[0]

    # `!=` must broadcast element-wise, which requires an array.
    assign = np.asarray(assign)

    dists = []
    for i in range(num_series):
        non_members = assign != assign[i]
        # The accumulator used to stay empty, so the result was always nan.
        dists.extend(dists_all_pairs[i][non_members])
    return np.mean(dists), np.std(dists)
Exemple #7
def avg_inter_dist(tseries, assign, dists_all_pairs=None):
    """Compute mean and standard deviation of the inter-cluster distances.

    tseries: matrix with one time series per row
    assign: cluster id of each series
    dists_all_pairs: optional precomputed pairwise distance matrix; when
        omitted it is element 0 of ``dist_all(tseries, tseries,
        rolling=True)``

    For every series the distances to all series in *other* clusters are
    collected (each unordered pair is therefore seen twice). With a single
    cluster the collection is empty and NumPy yields nan.
    """
    num_series = tseries.shape[0]

    if dists_all_pairs is None:
        dists_all_pairs = dist_all(tseries, tseries, rolling=True)[0]

    # Element-wise comparison below needs an array, not a plain list.
    labels = np.asarray(assign)

    dists = []
    for i in range(num_series):
        other_cluster = labels != labels[i]
        # Previously nothing was ever appended, so the result was always nan.
        dists.extend(dists_all_pairs[i][other_cluster])

    return np.mean(dists), np.std(dists)
Exemple #8
def main(tseries_fpath, in_folder):
    """Print how often clusterings of different folds agree on membership.

    For every pair of ``fold-*/ksc`` folders under `in_folder` (a fold is
    also paired with itself), each cluster of the first fold is compared
    with the clusters of the second fold, ordered by centroid distance.
    Ids shared with the closest cluster count as agreement, ids shared with
    any other cluster count as disagreement.

    tseries_fpath: text file with one series per line
    in_folder: folder holding the ``fold-*`` sub-folders; each needs
        ``train.dat``, ``ksc/cents.dat`` and ``ksc/assign.dat``

    Raises ZeroDivisionError when no id is shared at all.
    """
    ids = []
    with open(tseries_fpath) as tseries_file:
        for l in tseries_file:
            # NOTE(review): the loop body was missing. This assumes the id
            # is the first whitespace-separated field of each line --
            # confirm against the file format.
            fields = l.split()
            if fields:
                ids.append(fields[0])

    ids = np.array(ids)
    folders = glob.glob(os.path.join(in_folder, 'fold-*/ksc'))
    num_folders = len(folders)

    agree = 0
    diff = 0
    for i in range(num_folders):
        base_i = os.path.dirname(folders[i])
        Ci = np.loadtxt(os.path.join(folders[i], 'cents.dat'))
        train_i = np.loadtxt(os.path.join(base_i, 'train.dat'), dtype='bool')
        assign_i = np.loadtxt(os.path.join(folders[i], 'assign.dat'))
        ids_i = ids[train_i]

        for j in range(i, num_folders):
            base_j = os.path.dirname(folders[j])
            Cj = np.loadtxt(os.path.join(folders[j], 'cents.dat'))
            dists = dist.dist_all(Ci, Cj, rolling=True)[0]
            # Row k lists the clusters of fold j from closest to farthest
            # with respect to centroid k of fold i.
            argsrt = dists.argsort(axis=1)
            train_j = np.loadtxt(os.path.join(base_j, 'train.dat'),
                                 dtype='bool')
            assign_j = np.loadtxt(os.path.join(folders[j], 'assign.dat'))
            ids_j = ids[train_j]

            for k in range(argsrt.shape[0]):
                # Does not depend on `o`, so build it once per cluster.
                ids_k = set(ids_i[assign_i == k])
                for rank, o in enumerate(argsrt[k]):
                    ids_o = set(ids_j[assign_j == o])
                    n_inter = len(ids_k.intersection(ids_o))

                    # Both counters used to be incremented for the closest
                    # cluster only, which made the two ratios identical.
                    if rank == 0:
                        agree += n_inter
                    else:
                        diff += n_inter

    # float() keeps the ratios fractional under Python 2 integer division.
    total = float(agree + diff)
    print('AgreedProb = ', agree / total)
    print('DisagreeProb = ', diff / total)
Exemple #9
def main(tseries_fpath, base_folder):
    """Print a classification report aggregated over all ``fold-*`` folders.

    The clusters of every fold are mapped onto the centroids of the first
    folder returned by ``glob`` (closest centroid wins), the true and
    predicted labels of each fold are translated through that mapping, and
    the pooled labels are reported. Predictions equal to -1 are left as they
    are and excluded from the report.

    tseries_fpath: unused; kept so existing callers keep working
    base_folder: folder holding the ``fold-*`` sub-folders
    """
    folders = glob.glob(os.path.join(base_folder, 'fold-*'))
    num_folders = len(folders)

    # cluster_mapping[i][k] is the base centroid closest to centroid k of
    # fold i. A dict per fold is used because the labels loaded below are
    # floats: 1.0 finds the key 1 in a dict but cannot index a list.
    cluster_mapping = []
    C_base = np.loadtxt(os.path.join(folders[0], 'ksc/cents.dat'))

    for i in range(num_folders):
        Ci = np.loadtxt(os.path.join(folders[i], 'ksc/cents.dat'))

        dists = dist.dist_all(Ci, C_base, rolling=True)[0]
        closest = dists.argmin(axis=1)

        # The per-fold container was never created, so the assignment
        # below raised IndexError on the empty list.
        cluster_mapping.append({})
        for k in range(Ci.shape[0]):
            cluster_mapping[i][k] = closest[k]

    y_true_all = []
    y_pred_all = []
    for i in range(num_folders):
        y_true = np.loadtxt(os.path.join(folders[i], 'ksc/test_assign.dat'))
        y_pred = np.loadtxt(os.path.join(folders[i],
                                         'cls-res-fitted-50/pred.dat'))

        for j in range(y_true.shape[0]):
            y_true[j] = cluster_mapping[i][y_true[j]]
            if y_pred[j] != -1:
                y_pred[j] = cluster_mapping[i][y_pred[j]]

        # Pool the translated labels; previously both lists stayed empty.
        y_true_all.extend(y_true)
        y_pred_all.extend(y_pred)

    y_pred_all = np.asarray(y_pred_all)
    y_true_all = np.asarray(y_true_all)

    valid = y_pred_all != -1
    print('Using the centroids from folder: ', folders[0])
    print('Micro Aggregation of Folds:')
    # float() keeps the fraction from truncating under Python 2.
    print('%.3f fract of videos were not classified' %
          (float(sum(~valid)) / y_pred_all.shape[0]))
    print(classification_report(y_true_all[valid], y_pred_all[valid]))
Exemple #10
def main(tseries_fpath, base_folder):
    """Print a classification report aggregated over all ``fold-*`` folders.

    The clusters of every fold are mapped onto the centroids of the first
    folder returned by ``glob`` (closest centroid wins), the true and
    predicted labels of each fold are translated through that mapping, and
    the pooled labels are reported. Predictions equal to -1 are left as they
    are and excluded from the report.

    tseries_fpath: unused; kept so existing callers keep working
    base_folder: folder holding the ``fold-*`` sub-folders
    """
    folders = glob.glob(os.path.join(base_folder, "fold-*"))
    num_folders = len(folders)

    # cluster_mapping[i][k] is the base centroid closest to centroid k of
    # fold i. A dict per fold is used because the labels loaded below are
    # floats: 1.0 finds the key 1 in a dict but cannot index a list.
    cluster_mapping = []
    C_base = np.loadtxt(os.path.join(folders[0], "ksc/cents.dat"))

    for i in range(num_folders):
        Ci = np.loadtxt(os.path.join(folders[i], "ksc/cents.dat"))

        dists = dist.dist_all(Ci, C_base, rolling=True)[0]
        closest = dists.argmin(axis=1)

        # The per-fold container was never created, so the assignment
        # below raised IndexError on the empty list.
        cluster_mapping.append({})
        for k in range(Ci.shape[0]):
            cluster_mapping[i][k] = closest[k]

    y_true_all = []
    y_pred_all = []
    for i in range(num_folders):
        y_true = np.loadtxt(os.path.join(folders[i], "ksc/test_assign.dat"))
        y_pred = np.loadtxt(os.path.join(folders[i], "cls-res-fitted-50/pred.dat"))

        for j in range(y_true.shape[0]):
            y_true[j] = cluster_mapping[i][y_true[j]]
            if y_pred[j] != -1:
                y_pred[j] = cluster_mapping[i][y_pred[j]]

        # Pool the translated labels; previously both lists stayed empty.
        y_true_all.extend(y_true)
        y_pred_all.extend(y_pred)

    y_pred_all = np.asarray(y_pred_all)
    y_true_all = np.asarray(y_true_all)

    valid = y_pred_all != -1
    print("Using the centroids from folder: ", folders[0])
    print("Micro Aggregation of Folds:")
    # float() keeps the fraction from truncating under Python 2.
    print("%.3f fract of videos were not classified" % (float(sum(~valid)) / y_pred_all.shape[0]))
    print(classification_report(y_true_all[valid], y_pred_all[valid]))
Exemple #11
def silhouette(tseries, assign, dists_all_pairs=None):
    """Return the mean silhouette value of a clustering.

    tseries: matrix with one time series per row
    assign: array with the cluster id of each series
    dists_all_pairs: optional precomputed pairwise distance matrix; when
        omitted it is element 0 of ``dist_all(tseries, tseries,
        rolling=True)``

    For each series, `intra` is the mean distance to the members of its own
    cluster (the series itself included) and `min_inter` is the smallest
    mean distance to the members of any other cluster. The per-series value
    is ``(min_inter - intra) / max(intra, min_inter)``, stored as float32.
    With a single cluster `min_inter` stays infinite and the result is nan.
    """
    if dists_all_pairs is None:
        dists_all_pairs = dist_all(tseries, tseries, rolling=True)[0]

    num_series = tseries.shape[0]
    sils = np.zeros(num_series, dtype='f')

    # The membership masks do not depend on the series being scored, so
    # they are built once instead of once per (series, cluster) pair.
    masks = dict((label, assign == label) for label in set(assign))

    # range() replaces xrange(), which does not exist on Python 3.
    for i in range(num_series):
        k = assign[i]
        dists_i = dists_all_pairs[i]
        intra = np.mean(dists_i[masks[k]])
        min_inter = float('inf')
        for o, mask in masks.items():
            if o != k:
                inter = np.mean(dists_i[mask])
                if inter < min_inter:
                    min_inter = inter
        sils[i] = (min_inter - intra) / max(intra, min_inter)
    return np.mean(sils)
Exemple #12
def silhouette(tseries, assign, dists_all_pairs=None):
    """Average silhouette value over all series of a clustering.

    tseries: matrix with one time series per row
    assign: array with the cluster id of each series
    dists_all_pairs: optional precomputed pairwise distance matrix; when
        omitted it is element 0 of ``dist_all(tseries, tseries,
        rolling=True)``

    Per series the value is ``(separation - cohesion) / max(cohesion,
    separation)``: cohesion is the mean distance to the series' own cluster
    (itself included) and separation is the smallest mean distance to any
    other cluster. Values are stored as float32 before averaging.
    """
    pairwise = dists_all_pairs
    if pairwise is None:
        pairwise = dist_all(tseries, tseries, rolling=True)[0]

    total = tseries.shape[0]
    scores = np.zeros(total, dtype='f')
    clusters = set(assign)

    for idx in range(total):
        own = assign[idx]
        row = pairwise[idx]
        cohesion = np.mean(row[assign == own])

        # Smallest mean distance to a cluster other than the series' own;
        # stays infinite when there is no other cluster.
        separation = float('inf')
        for other in clusters:
            if other == own:
                continue
            candidate = np.mean(row[assign == other])
            if candidate < separation:
                separation = candidate

        scores[idx] = (separation - cohesion) / max(cohesion, separation)

    return np.mean(scores)
Exemple #13
def _base_ksc(tseries, initial_centroids, n_iters=-1):
    This is the base of the KSC algorithm. It follows the same idea of a K-Means
    algorithm. Firstly, we assign time series to a new cluster based on the
    distance to the centroids. For each time series, it is computed the best
    shift to minimize the distance to the closest centroid.
    The assignment step is followed by an update step where new centroids are 
    computed based on the new clustering (based on the update step).
    Both steps above are repeated `n_iters` times. If this parameter is negative
    then the steps are repeated until convergence, that is, until no time series
    changes cluster between consecutive steps. 

    tseries: a matrix of shape (number of time series, size of each series)
        The time series to cluster
    initial_centroids: a matrix of shape (num. of clusters, size of time series)
        The initial centroid estimates
    n_iters: int
        The number of iterations which the algorithm will run

    centroids: a matrix of shape (num. of clusters, size of time series)
        The final centroids found by the algorithm
    assign: an array of num. series size
        The cluster id which each time series belongs to
    best_shift: an array of num. series size
        The amount shift amount performed for each time series
    cent_dists: a matrix of shape (num. centroids, num. series)
        The distance of each centroid to each time series

    ----------    References
    .. [1] J. Yang and J. Leskovec, 
       "Patterns of Temporal Variation in Online Media" - WSDM'11  
    .. [1] J. Yang and J. Leskovec, 
       "Patterns of Temporal Variation in Online Media" - WSDM'11  
    .. [2] Wikipedia, 
        "K-means clustering"  
    num_clusters = initial_centroids.shape[0]
    num_series = tseries.shape[0]

    centroids = initial_centroids

    #KSC algorithm
    cent_dists = None
    assign = None
    prev_assign = None
    best_shift = None

    iters = n_iters
    converged = False

    while iters != 0 and not converged:
        #assign elements to new clusters    References
        cent_dists, shifts = dist_all(centroids, tseries, rolling=True)
        assign = cent_dists.argmin(axis=0)
        best_shift = np.ndarray(num_series, dtype='i')
        for i in xrange(shifts.shape[1]):
            best_shift[i] = shifts[assign[i], i]
        #check if converged, if not compute new centroids
        if prev_assign is not None and not (prev_assign - assign).any():
            converged = True
            centroids = _compute_centroids(tseries, assign, num_clusters, 

        prev_assign = assign
        iters -= 1
    return centroids, assign, best_shift, cent_dists
Exemple #14
def main(tseries_fpath, plot_foldpath):
    """Plot clustering quality for 2 to 15 clusters into two PDF files.

    ``run_clustering`` is run 25 times (5 samples x 5 repetitions) for each
    number of clusters. The mean of every measure is plotted with an error
    bar obtained from ``hci(values, 0.95)``.

    tseries_fpath: text file read with ``np.genfromtxt``; the first column
        is dropped
    plot_foldpath: existing folder receiving ``bcv.pdf`` (intra, inter and
        BetaCV curves) and ``cost.pdf`` (cost curve)
    """
    assert os.path.isdir(plot_foldpath)

    X = np.genfromtxt(tseries_fpath)[:, 1:].copy()

    n_samples = X.shape[0]
    sample_rows = np.arange(n_samples)

    clust_range = list(range(2, 16))
    n_clustering_vals = len(clust_range)

    n_outer = 5
    n_inner = 5
    n_runs = n_outer * n_inner

    intra_array = np.zeros(shape=(n_runs, n_clustering_vals))
    inter_array = np.zeros(shape=(n_runs, n_clustering_vals))
    bcvs_array = np.zeros(shape=(n_runs, n_clustering_vals))
    costs_array = np.zeros(shape=(n_runs, n_clustering_vals))

    r = 0
    for _ in range(n_outer):
        # NOTE(review): despite its name this is always the first 200 rows;
        # no shuffle of `sample_rows` is visible -- confirm the intent.
        rand_sample = sample_rows[:200]

        X_new = X[rand_sample]
        D_new = dist.dist_all(X_new, X_new, rolling=True)[0]

        for _ in range(n_inner):
            for col, k in enumerate(clust_range):
                intra, inter, bcv, cost = run_clustering(X_new, k, D_new)

                intra_array[r, col] = intra
                inter_array[r, col] = inter
                bcvs_array[r, col] = bcv
                costs_array[r, col] = cost

            r += 1

    intra_err = np.zeros(n_clustering_vals)
    inter_err = np.zeros(n_clustering_vals)
    bcvs_err = np.zeros(n_clustering_vals)
    costs_err = np.zeros(n_clustering_vals)

    for col in range(n_clustering_vals):
        intra_err[col] = hci(intra_array[:, col], 0.95)
        inter_err[col] = hci(inter_array[:, col], 0.95)
        bcvs_err[col] = hci(bcvs_array[:, col], 0.95)
        costs_err[col] = hci(costs_array[:, col], 0.95)

    plt.errorbar(clust_range, np.mean(inter_array, axis=0), fmt="gD", label="Inter Cluster", yerr=inter_err)
    plt.errorbar(clust_range, np.mean(bcvs_array, axis=0), fmt="bo", label="BetaCV", yerr=bcvs_err)
    plt.errorbar(clust_range, np.mean(intra_array, axis=0), fmt="rs", label="Intra Cluster", yerr=intra_err)
    plt.ylabel("Average Distance")
    plt.xlabel("Number of clusters")
    plt.xlim((0.0, 16))
    plt.ylim((0.0, 1.0))
    plt.legend(frameon=False, loc="lower left")
    plt.savefig(os.path.join(plot_foldpath, "bcv.pdf"))
    # Start from an empty figure; otherwise the three curves above would
    # also be drawn into cost.pdf.
    plt.clf()

    plt.errorbar(clust_range, np.mean(costs_array, axis=0), fmt="bo", label="Cost", yerr=costs_err)
    plt.ylabel("Cost (F)")
    plt.xlabel("Number of clusters")
    plt.xlim((0.0, 16))
    plt.ylim((0.0, 1.0))
    plt.legend(frameon=False, loc="lower left")
    plt.savefig(os.path.join(plot_foldpath, "cost.pdf"))
    plt.clf()
Exemple #15
def main(tseries_fpath, plot_foldpath):
    """Plot clustering quality for 2 to 15 clusters into two PDF files.

    ``run_clustering`` is run 25 times (5 samples x 5 repetitions) for each
    number of clusters. The mean of every measure is plotted with an error
    bar obtained from ``hci(values, .95)``.

    tseries_fpath: text file read with ``np.genfromtxt``; the first column
        is dropped
    plot_foldpath: existing folder receiving ``bcv.pdf`` (intra, inter and
        BetaCV curves) and ``cost.pdf`` (cost curve)
    """
    assert os.path.isdir(plot_foldpath)

    X = np.genfromtxt(tseries_fpath)[:, 1:].copy()

    n_samples = X.shape[0]
    sample_rows = np.arange(n_samples)

    clust_range = list(range(2, 16))
    n_clustering_vals = len(clust_range)

    n_outer = 5
    n_inner = 5
    n_runs = n_outer * n_inner

    intra_array = np.zeros(shape=(n_runs, n_clustering_vals))
    inter_array = np.zeros(shape=(n_runs, n_clustering_vals))
    bcvs_array = np.zeros(shape=(n_runs, n_clustering_vals))
    costs_array = np.zeros(shape=(n_runs, n_clustering_vals))

    r = 0
    for _ in range(n_outer):
        # NOTE(review): despite its name this is always the first 200 rows;
        # no shuffle of `sample_rows` is visible -- confirm the intent.
        rand_sample = sample_rows[:200]

        X_new = X[rand_sample]
        D_new = dist.dist_all(X_new, X_new, rolling=True)[0]

        for _ in range(n_inner):
            for col, k in enumerate(clust_range):
                intra, inter, bcv, cost = run_clustering(X_new, k, D_new)

                intra_array[r, col] = intra
                inter_array[r, col] = inter
                bcvs_array[r, col] = bcv
                costs_array[r, col] = cost

            r += 1

    intra_err = np.zeros(n_clustering_vals)
    inter_err = np.zeros(n_clustering_vals)
    bcvs_err = np.zeros(n_clustering_vals)
    costs_err = np.zeros(n_clustering_vals)

    for col in range(n_clustering_vals):
        intra_err[col] = hci(intra_array[:, col], .95)
        inter_err[col] = hci(inter_array[:, col], .95)
        bcvs_err[col] = hci(bcvs_array[:, col], .95)
        costs_err[col] = hci(costs_array[:, col], .95)

    # The three errorbar calls had been reduced to loose argument lines;
    # formats, labels and error arrays follow the variables computed above.
    plt.errorbar(clust_range,
                 np.mean(inter_array, axis=0),
                 fmt='gD',
                 label='Inter Cluster',
                 yerr=inter_err)
    plt.errorbar(clust_range,
                 np.mean(bcvs_array, axis=0),
                 fmt='bo',
                 label='BetaCV',
                 yerr=bcvs_err)
    plt.errorbar(clust_range,
                 np.mean(intra_array, axis=0),
                 fmt='rs',
                 label='Intra Cluster',
                 yerr=intra_err)
    plt.ylabel('Average Distance')
    plt.xlabel('Number of clusters')
    plt.xlim((0., 16))
    plt.ylim((0., 1.))
    plt.legend(frameon=False, loc='lower left')
    plt.savefig(os.path.join(plot_foldpath, 'bcv.pdf'))
    # Start from an empty figure; otherwise the three curves above would
    # also be drawn into cost.pdf.
    plt.clf()

    plt.errorbar(clust_range,
                 np.mean(costs_array, axis=0),
                 fmt='bo',
                 label='Cost',
                 yerr=costs_err)
    plt.ylabel('Cost (F)')
    plt.xlabel('Number of clusters')
    plt.xlim((0., 16))
    plt.ylim((0., 1.))
    plt.legend(frameon=False, loc='lower left')
    plt.savefig(os.path.join(plot_foldpath, 'cost.pdf'))
    plt.clf()
Exemple #16
def _base_ksc(tseries, initial_centroids, n_iters=-1):
    This is the base of the KSC algorithm. It follows the same idea of a K-Means
    algorithm. Firstly, we assign time series to a new cluster based on the
    distance to the centroids. For each time series, it is computed the best
    shift to minimize the distance to the closest centroid.
    The assignment step is followed by an update step where new centroids are 
    computed based on the new clustering (based on the update step).
    Both steps above are repeated `n_iters` times. If this parameter is negative
    then the steps are repeated until convergence, that is, until no time series
    changes cluster between consecutive steps. 

    tseries: a matrix of shape (number of time series, size of each series)
        The time series to cluster
    initial_centroids: a matrix of shape (num. of clusters, size of time series)
        The initial centroid estimates
    n_iters: int
        The number of iterations which the algorithm will run

    centroids: a matrix of shape (num. of clusters, size of time series)
        The final centroids found by the algorithm
    assign: an array of num. series size
        The cluster id which each time series belongs to
    best_shift: an array of num. series size
        The amount shift amount performed for each time series
    cent_dists: a matrix of shape (num. centroids, num. series)
        The distance of each centroid to each time series

    ----------    References
    .. [1] J. Yang and J. Leskovec, 
       "Patterns of Temporal Variation in Online Media" - WSDM'11  
    .. [1] J. Yang and J. Leskovec, 
       "Patterns of Temporal Variation in Online Media" - WSDM'11  
    .. [2] Wikipedia, 
        "K-means clustering"  

    num_clusters = initial_centroids.shape[0]
    num_series = tseries.shape[0]

    centroids = initial_centroids

    #KSC algorithm
    cent_dists = None
    assign = None
    prev_assign = None
    best_shift = None

    iters = n_iters
    converged = False

    while iters != 0 and not converged:
        #assign elements to new clusters    References
        cent_dists, shifts = dist_all(centroids, tseries, rolling=True)

        assign = cent_dists.argmin(axis=0)
        best_shift = np.ndarray(num_series, dtype='i')
        for i in range(shifts.shape[1]):
            best_shift[i] = shifts[assign[i], i]

        #check if converged, if not compute new centroids
        if prev_assign is not None and not (prev_assign - assign).any():
            converged = True
            centroids = _compute_centroids(tseries, assign, num_clusters,

        prev_assign = assign
        iters -= 1

    return centroids, assign, best_shift, cent_dists
    Z = preprocessing.StandardScaler().fit_transform(T)
    km = cluster.MiniBatchKMeans(n_clusters=num_clusters)
    km = km.fit(Z)
    D = km.transform(Z)

    return D

if __name__ == '__main__':
    # Training features and response, as provided by the project's `myio`.
    X_train, T12_train, hosts_train = myio.read_features(test=False)
    Y_train = myio.read_response_train()
    k = 50

    # Variant 1: prepend the k columns returned by transform_km to the
    # features, fit an OLS model and print k with the square root of the
    # column-wise mean of `model.G`.
    km_columns = transform_km(T12_train, k)
    km_model = OLS()
    km_model.fit(np.hstack((km_columns, X_train)), Y_train)
    print(k, np.sqrt(km_model.G.mean(axis=0)))

    # Variant 2: same fit, but the extra columns are the distances to the
    # KSC centroids stored on disk. exp(x) - 1 presumably undoes a log(1 + x)
    # transform of T12_train -- confirm against myio.
    cents = np.genfromtxt('ksc-results/cents_visits_%d.dat' % k, dtype='d')
    raw_series = np.asarray(np.exp(T12_train) - 1, order='C')
    # Transposed so that each row of the result corresponds to one row of
    # raw_series, matching the rows of X_train for hstack.
    ksc_columns = dist_all(cents, raw_series, rolling=True)[0].T
    ksc_model = OLS()
    ksc_model.fit(np.hstack((ksc_columns, X_train)), Y_train)
    print(k, np.sqrt(ksc_model.G.mean(axis=0)))