コード例 #1
0
ファイル: test_dist.py プロジェクト: flaviovdf/pyksc
    def test_dist_all(self):
        """Exercise ``dist.dist_all`` on zero, constant and multi-point input."""
        # Case 1: two all-zero single-point series. Both elements of the
        # returned pair (index 0 and index 1) must be all-zero 2x2 matrices.
        zeros_a = np.array([[0.0], [0.0]])
        zeros_b = np.array([[0.0], [0.0]])
        all_zero = np.array([[0.0, 0.0], [0.0, 0.0]])
        assert_array_equal(all_zero, dist.dist_all(zeros_a, zeros_b)[0])
        assert_array_equal(all_zero, dist.dist_all(zeros_a, zeros_b)[1])

        # Case 2: ones against zeros -> every pairwise distance equals 1.
        ones = np.array([[1.0], [1.0]])
        zeros = np.array([[0.0], [0.0]])
        all_one = np.array([[1.0, 1.0], [1.0, 1.0]])
        assert_array_equal(all_one, dist.dist_all(ones, zeros)[0])

        # Case 3: the same two 3-point series on both sides, third positional
        # argument set to True (presumably the `rolling` flag -- confirm
        # against dist.dist_all's signature).
        left = np.array([[2.0, 3.0, 4.0], [3.0, 4.0, 0.0]])
        right = np.array([[2.0, 3.0, 4.0], [3.0, 4.0, 0.0]])
        off_diagonal = 2 / sqrt(29)
        wanted = np.array([[0.0, off_diagonal], [off_diagonal, 0.0]])
        assert_array_almost_equal(wanted, dist.dist_all(left, right, True)[0])
コード例 #2
0
    def test_dist_all(self):
        """Verify the matrices returned by ``dist.dist_all`` for small inputs."""
        # All-zero column vectors: element 0 and element 1 of the result are
        # both expected to be 2x2 zero matrices.
        first = np.zeros((2, 1))
        second = np.zeros((2, 1))
        target = np.zeros((2, 2))
        assert_array_equal(target, dist.dist_all(first, second)[0])
        assert_array_equal(target, dist.dist_all(first, second)[1])

        # Unit series versus zero series: all entries of element 0 equal 1.
        first = np.ones((2, 1))
        second = np.zeros((2, 1))
        target = np.ones((2, 2))
        assert_array_equal(target, dist.dist_all(first, second)[0])

        # Identical 3-point series on both sides with the third positional
        # argument True (presumably `rolling` -- TODO confirm): zeros on the
        # diagonal and 2 / sqrt(29) off the diagonal, up to rounding.
        series = [[2.0, 3.0, 4.0], [3.0, 4.0, 0.0]]
        first = np.array(series)
        second = np.array(series)
        cross = 2 / sqrt(29)
        target = np.array([[0.0, cross], [cross, 0.0]])
        assert_array_almost_equal(target, dist.dist_all(first, second, True)[0])
コード例 #3
0
def main(tseries_fpath, test_fpath, cents_fpath):
    """Print, one per line, the index of the closest centroid of each series.

    Arguments
    ---------
    tseries_fpath, test_fpath:
        Forwarded unchanged to ``ioutil.load_series``, which returns the
        series matrix used here.
    cents_fpath:
        Text file read with ``np.loadtxt`` holding the centroids.
    """
    X = ioutil.load_series(tseries_fpath, test_fpath)
    C = np.loadtxt(cents_fpath)

    # Element 0 of dist_all's result is the distance matrix. The argmin over
    # axis 0 picks, for every column, the row with the smallest distance --
    # with (C, X) as arguments that is presumably the closest centroid per
    # series.
    dist_cents = dist.dist_all(C, X, rolling=True)[0]
    y_true = dist_cents.argmin(axis=0)

    for t in y_true:
        # Single-argument parenthesised form prints the same thing as a
        # Python 2 statement and as a Python 3 function call.
        print(t)
コード例 #4
0
ファイル: metrics.py プロジェクト: antoine-tran/pyksc
def cost(tseries, assign, centroids, dist_centroids=None):
    """Return the mean squared distance of each series to its own centroid.

    Arguments
    ---------
    tseries: matrix with one time series per row
    assign: sequence where ``assign[i]`` is the centroid index of series ``i``
    centroids: matrix with one centroid per row; only used when
        `dist_centroids` is not given
    dist_centroids: optional matrix indexed as ``[centroid, series]``
        When None it is computed with ``dist_all(centroids, tseries)``.

    Returns
    -------
    The sum of squared assigned distances divided by the number of series.
    Raises ZeroDivisionError when `tseries` has no rows.
    """
    num_series = tseries.shape[0]
    if dist_centroids is None:
        # dist_all returns a (distances, shifts) pair; only the distance
        # matrix can be indexed with [k, i] below.
        # NOTE(review): other metrics call dist_all with rolling=True --
        # confirm whether the non-rolling distance is intended here.
        dist_centroids = dist_all(centroids, tseries)[0]

    cost_f = 0.0
    for i in range(num_series):
        k = assign[i]
        cost_f += dist_centroids[k, i] ** 2

    return cost_f / num_series
コード例 #5
0
ファイル: metrics.py プロジェクト: FlorentF9/pyksc
def cost(tseries, assign, centroids, dist_centroids=None):
    """Average, over all series, the squared distance to the assigned centroid.

    Arguments
    ---------
    tseries: matrix whose rows are the time series
    assign: ``assign[i]`` is the index of the centroid series ``i`` belongs to
    centroids: matrix whose rows are the centroids (ignored when
        `dist_centroids` is supplied)
    dist_centroids: optional precomputed matrix indexed ``[centroid, series]``

    Returns
    -------
    Sum of the squared assigned distances divided by the number of series.
    A `tseries` with zero rows raises ZeroDivisionError.
    """
    num_series = tseries.shape[0]
    if dist_centroids is None:
        # Keep only the distance matrix: dist_all yields (distances, shifts)
        # and a tuple cannot be indexed with [k, i].
        # NOTE(review): sibling metrics pass rolling=True -- verify that the
        # default (non-rolling) distance is what this cost should use.
        dist_centroids = dist_all(centroids, tseries)[0]

    # Start the sum at 0.0 so the accumulation matches a float running total.
    squared_total = sum(
        (dist_centroids[assign[i], i] ** 2 for i in range(num_series)), 0.0)

    return squared_total / num_series
コード例 #6
0
ファイル: metrics.py プロジェクト: antoine-tran/pyksc
def avg_inter_dist(tseries, assign, dists_all_pairs=None):
    """Mean and standard deviation of distances between different clusters.

    Arguments
    ---------
    tseries: matrix with one time series per row
    assign: numpy array; ``assign[i]`` is the cluster label of series ``i``
    dists_all_pairs: optional pairwise distance matrix between the series
        When None it is element 0 of
        ``dist_all(tseries, tseries, rolling=True)``.

    Returns
    -------
    (mean, std) over every ordered pair (i, j) whose labels differ. Each
    unordered pair is therefore counted twice.
    """
    num_series = tseries.shape[0]

    if dists_all_pairs is None:
        dists_all_pairs = dist_all(tseries, tseries, rolling=True)[0]

    dists = []
    # `range` instead of `xrange` so the function also runs on Python 3.
    for i in range(num_series):
        k = assign[i]
        non_members = assign != k  # boolean mask of series outside cluster k
        dists_i = dists_all_pairs[i]
        dists.extend(dists_i[non_members])

    return np.mean(dists), np.std(dists)
コード例 #7
0
ファイル: metrics.py プロジェクト: FlorentF9/pyksc
def avg_inter_dist(tseries, assign, dists_all_pairs=None):
    """Summarise the distances between series that sit in different clusters.

    Arguments
    ---------
    tseries: matrix with one time series per row
    assign: numpy array of cluster labels, one per series
    dists_all_pairs: optional pairwise distance matrix; computed with
        ``dist_all(tseries, tseries, rolling=True)[0]`` when omitted

    Returns
    -------
    A ``(mean, std)`` tuple over all ordered pairs of series with different
    labels.
    """
    n_rows = tseries.shape[0]

    pairwise = dists_all_pairs
    if pairwise is None:
        pairwise = dist_all(tseries, tseries, rolling=True)[0]

    between = []
    for row in range(n_rows):
        # Keep only the columns whose label differs from this row's label.
        outside = assign != assign[row]
        between.extend(pairwise[row][outside])

    return np.mean(between), np.std(between)
コード例 #8
0
def main(tseries_fpath, in_folder):
    """Report how consistently clusters from different folds share members.

    The first whitespace-separated token of every line of `tseries_fpath`
    is taken as a series id. For every pair of ``fold-*/ksc`` directories
    under `in_folder` (a fold is also paired with itself), each cluster k of
    the first fold is compared with every cluster o of the second fold,
    ordered by centroid distance. Ids shared with the closest cluster count
    as agreement; ids shared with any other cluster count as disagreement.

    Prints both counts as fractions of their sum. Raises ZeroDivisionError
    when no ids are shared at all (for instance when no fold is found).
    """
    with open(tseries_fpath) as tseries_file:
        ids = np.array([line.split()[0] for line in tseries_file])

    folders = glob.glob(os.path.join(in_folder, 'fold-*/ksc'))
    num_folders = len(folders)

    agree = 0
    diff = 0

    # `range` (not `xrange`) keeps the loops valid on Python 3 as well.
    for i in range(num_folders):

        base_i = os.path.dirname(folders[i])
        Ci = np.loadtxt(os.path.join(folders[i], 'cents.dat'))

        train_i = np.loadtxt(os.path.join(base_i, 'train.dat'), dtype='bool')
        assign_i = np.loadtxt(os.path.join(folders[i], 'assign.dat'))

        for j in range(i, num_folders):

            base_j = os.path.dirname(folders[j])
            Cj = np.loadtxt(os.path.join(folders[j], 'cents.dat'))

            # Row k of argsrt lists fold j's centroids from closest to
            # farthest with respect to centroid k of fold i.
            dists = dist.dist_all(Ci, Cj, rolling=True)[0]
            argsrt = dists.argsort(axis=1)

            train_j = np.loadtxt(os.path.join(base_j, 'train.dat'),
                                 dtype='bool')
            assign_j = np.loadtxt(os.path.join(folders[j], 'assign.dat'))

            for k in range(argsrt.shape[0]):
                # Members of cluster k do not depend on `o`: build the set
                # once per k instead of once per (k, o) pair.
                ids_k = set(ids[train_i][assign_i == k])

                for rank, o in enumerate(argsrt[k]):
                    ids_o = set(ids[train_j][assign_j == o])
                    n_inter = len(ids_k.intersection(ids_o))

                    if rank == 0:
                        agree += n_inter
                    else:
                        diff += n_inter

    # float() forces true division even on Python 2 without
    # `from __future__ import division`.
    total = float(agree + diff)
    print('AgreedProb = ', agree / total)
    print('DisagreeProb = ', diff / total)
コード例 #9
0
ファイル: summarize_results.py プロジェクト: FlorentF9/pyksc
def main(tseries_fpath, base_folder):
    """Print a classification report aggregated over all ``fold-*`` folders.

    The centroids of the first folder returned by ``glob`` act as the
    reference: every fold's cluster ids are renamed to the index of the
    closest reference centroid before the true and predicted labels of all
    folds are concatenated. Predictions equal to -1 are treated as "not
    classified"; their fraction is printed and they are left out of the
    report.

    Arguments
    ---------
    tseries_fpath: not used by this function
    base_folder: directory that contains the ``fold-*`` sub-folders

    Raises IndexError when `base_folder` has no ``fold-*`` entry.
    """
    folders = glob.glob(os.path.join(base_folder, 'fold-*'))
    num_folders = len(folders)

    cluster_mapping = []
    C_base = np.loadtxt(os.path.join(folders[0], 'ksc/cents.dat'))

    for i in range(num_folders):
        Ci = np.loadtxt(os.path.join(folders[i], 'ksc/cents.dat'))

        # closest[k] = index of the reference centroid nearest to centroid k.
        dists = dist.dist_all(Ci, C_base, rolling=True)[0]
        closest = dists.argmin(axis=1)

        cluster_mapping.append({k: closest[k] for k in range(Ci.shape[0])})

    y_true_all = []
    y_pred_all = []
    for i in range(num_folders):
        y_true = np.loadtxt(os.path.join(folders[i], 'ksc/test_assign.dat'))
        y_pred = np.loadtxt(os.path.join(folders[i],
                                         'cls-res-fitted-50/pred.dat'))

        for j in range(y_true.shape[0]):
            y_true[j] = cluster_mapping[i][y_true[j]]
            # -1 marks an unclassified item and has no cluster to map to.
            if y_pred[j] != -1:
                y_pred[j] = cluster_mapping[i][y_pred[j]]

        y_true_all.extend(y_true)
        y_pred_all.extend(y_pred)

    y_pred_all = np.asarray(y_pred_all)
    y_true_all = np.asarray(y_true_all)

    # The report is only computed on the classified items; the unused report
    # over all items (including the -1 label) has been dropped.
    valid = y_pred_all != -1
    print()
    print('Using the centroids from folder: ', folders[0])
    print('Micro Aggregation of Folds:')
    print('%.3f fract of videos were not classified' %
          (sum(~valid) / y_pred_all.shape[0]))
    print()
    print(classification_report(y_true_all[valid], y_pred_all[valid]))
コード例 #10
0
ファイル: summarize_results.py プロジェクト: flaviovdf/pyksc
def main(tseries_fpath, base_folder):
    """Aggregate per-fold predictions and print one classification report.

    Cluster ids of every ``fold-*`` folder are first translated to the ids
    of the closest centroid in the first folder found, so that labels from
    different folds are comparable. True and predicted labels of all folds
    are then pooled. Items predicted as -1 count as "not classified": their
    fraction is printed and they are excluded from the report.

    Arguments
    ---------
    tseries_fpath: not used by this function
    base_folder: directory that contains the ``fold-*`` sub-folders

    Raises IndexError when no ``fold-*`` folder exists under `base_folder`.
    """
    folders = glob.glob(os.path.join(base_folder, "fold-*"))
    num_folders = len(folders)

    cluster_mapping = []
    C_base = np.loadtxt(os.path.join(folders[0], "ksc/cents.dat"))

    # `range` instead of `xrange` so the script also runs on Python 3.
    for i in range(num_folders):
        Ci = np.loadtxt(os.path.join(folders[i], "ksc/cents.dat"))

        dists = dist.dist_all(Ci, C_base, rolling=True)[0]
        closest = dists.argmin(axis=1)

        # Map each local cluster id to the nearest reference centroid.
        cluster_mapping.append({})
        for k in range(Ci.shape[0]):
            cluster_mapping[i][k] = closest[k]

    y_true_all = []
    y_pred_all = []
    for i in range(num_folders):
        y_true = np.loadtxt(os.path.join(folders[i], "ksc/test_assign.dat"))
        y_pred = np.loadtxt(os.path.join(folders[i], "cls-res-fitted-50/pred.dat"))

        for j in range(y_true.shape[0]):
            y_true[j] = cluster_mapping[i][y_true[j]]
            # -1 means "no prediction"; there is nothing to translate.
            if y_pred[j] != -1:
                y_pred[j] = cluster_mapping[i][y_pred[j]]

        y_true_all.extend(y_true)
        y_pred_all.extend(y_pred)

    y_pred_all = np.asarray(y_pred_all)
    y_true_all = np.asarray(y_true_all)

    # The report over all items was computed and never used; only the
    # report restricted to classified items is produced.
    valid = y_pred_all != -1
    # float() forces true division even on Python 2 without
    # `from __future__ import division`.
    not_classified = sum(~valid) / float(y_pred_all.shape[0])
    print()
    print("Using the centroids from folder: ", folders[0])
    print("Micro Aggregation of Folds:")
    print("%.3f fract of videos were not classified" % not_classified)
    print()
    print(classification_report(y_true_all[valid], y_pred_all[valid]))
コード例 #11
0
ファイル: metrics.py プロジェクト: antoine-tran/pyksc
def silhouette(tseries, assign, dists_all_pairs=None):
    """Return the mean silhouette value of a clustering.

    For every series ``i`` with label ``k``:
    ``intra`` is the mean distance from ``i`` to all series labelled ``k``
    (``i`` itself included), ``min_inter`` is the smallest mean distance from
    ``i`` to the series of any other label, and the silhouette of ``i`` is
    ``(min_inter - intra) / max(intra, min_inter)``.

    Arguments
    ---------
    tseries: matrix with one time series per row
    assign: numpy array of cluster labels, one per series
    dists_all_pairs: optional pairwise distance matrix; computed with
        ``dist_all(tseries, tseries, rolling=True)[0]`` when None

    Returns
    -------
    The mean of the per-series silhouettes (stored as 32-bit floats). With a
    single label ``min_inter`` stays infinite and the result is NaN.
    """
    if dists_all_pairs is None:
        dists_all_pairs = dist_all(tseries, tseries, rolling=True)[0]

    num_series = tseries.shape[0]
    sils = np.zeros(num_series, dtype='f')
    labels = set(assign)

    # The membership mask of a label is the same for every series: build each
    # one once instead of once per (series, label) pair.
    members = dict((label, assign == label) for label in labels)

    # `range` instead of `xrange` so the function also runs on Python 3.
    for i in range(num_series):

        k = assign[i]
        dists_i = dists_all_pairs[i]
        intra = np.mean(dists_i[members[k]])

        min_inter = float('inf')
        for o in labels:
            if o != k:
                inter = np.mean(dists_i[members[o]])
                if inter < min_inter:
                    min_inter = inter

        sils[i] = (min_inter - intra) / max(intra, min_inter)

    return np.mean(sils)
コード例 #12
0
ファイル: metrics.py プロジェクト: FlorentF9/pyksc
def silhouette(tseries, assign, dists_all_pairs=None):
    """Compute the average silhouette of the clustering given by `assign`.

    Per series, the score is ``(b - a) / max(a, b)`` where ``a`` is the mean
    distance to the series sharing its label (itself included) and ``b`` is
    the lowest mean distance to the series of any other label.

    Arguments
    ---------
    tseries: matrix with one time series per row
    assign: numpy array of cluster labels, one per series
    dists_all_pairs: optional pairwise distance matrix; defaults to
        ``dist_all(tseries, tseries, rolling=True)[0]``

    Returns
    -------
    Mean of the per-series scores, which are kept in a float32 array.
    """
    pairwise = dists_all_pairs
    if pairwise is None:
        pairwise = dist_all(tseries, tseries, rolling=True)[0]

    n_rows = tseries.shape[0]
    scores = np.zeros(n_rows, dtype='f')
    distinct = set(assign)

    for row in range(n_rows):
        own = assign[row]
        row_dists = pairwise[row]
        within = np.mean(row_dists[assign == own])

        # Smallest mean distance to any foreign cluster; stays infinite when
        # there is no other label.
        nearest = float('inf')
        for other in distinct:
            if other == own:
                continue
            candidate = np.mean(row_dists[assign == other])
            if candidate < nearest:
                nearest = candidate

        scores[row] = (nearest - within) / max(within, nearest)

    return np.mean(scores)
コード例 #13
0
ファイル: ksc.py プロジェクト: antoine-tran/pyksc
def _base_ksc(tseries, initial_centroids, n_iters=-1):
    '''
    This is the base of the KSC algorithm. It follows the same idea of a K-Means
    algorithm. Firstly, we assign time series to a new cluster based on the
    distance to the centroids. For each time series, the best shift that
    minimizes the distance to the closest centroid is computed.

    The assignment step is followed by an update step where new centroids are
    computed based on the new clustering (based on the assignment step).

    Both steps above are repeated `n_iters` times. If this parameter is negative
    then the steps are repeated until convergence, that is, until no time series
    changes cluster between consecutive steps.

    Arguments
    ---------
    tseries: a matrix of shape (number of time series, size of each series)
        The time series to cluster
    initial_centroids: a matrix of shape (num. of clusters, size of time series)
        The initial centroid estimates
    n_iters: int
        The number of iterations which the algorithm will run

    Returns
    -------
    centroids: a matrix of shape (num. of clusters, size of time series)
        The final centroids found by the algorithm
    assign: an array of num. series size
        The cluster id which each time series belongs to
    best_shift: an array of num. series size
        The amount of shift performed for each time series
    cent_dists: a matrix of shape (num. centroids, num. series)
        The distance of each centroid to each time series

    When `n_iters` is 0 no step is run: the initial centroids are returned
    together with None for the three other values.

    References
    ----------
    .. [1] J. Yang and J. Leskovec,
       "Patterns of Temporal Variation in Online Media" - WSDM'11
       http://dl.acm.org/citation.cfm?id=1935863
    .. [2] Wikipedia,
        "K-means clustering"
        http://en.wikipedia.org/wiki/K-means_clustering
    '''

    num_clusters = initial_centroids.shape[0]
    num_series = tseries.shape[0]

    centroids = initial_centroids

    #KSC algorithm
    cent_dists = None
    assign = None
    prev_assign = None
    best_shift = None

    iters = n_iters
    converged = False

    # A negative n_iters never reaches 0, so only convergence stops the loop.
    while iters != 0 and not converged:
        #assign elements to new clusters
        cent_dists, shifts = dist_all(centroids, tseries, rolling=True)

        assign = cent_dists.argmin(axis=0)

        #keep, for each series, the shift relative to its assigned centroid
        best_shift = np.empty(num_series, dtype='i')
        for i in range(shifts.shape[1]):
            best_shift[i] = shifts[assign[i], i]

        #check if converged, if not compute new centroids
        if prev_assign is not None and not (prev_assign - assign).any():
            converged = True
        else:
            centroids = _compute_centroids(tseries, assign, num_clusters,
                                           best_shift)

        prev_assign = assign
        iters -= 1

    return centroids, assign, best_shift, cent_dists
コード例 #14
0
ファイル: plot_quality.py プロジェクト: flaviovdf/pyksc
def main(tseries_fpath, plot_foldpath):
    """Plot clustering quality metrics against the number of clusters.

    The series are read from `tseries_fpath` (the first column is dropped).
    Five random samples of at most 200 rows are drawn; on each sample
    ``run_clustering`` is executed five times for every k in 2..15, giving 25
    measurements per k of the four values it returns (intra, inter, bcv,
    cost). Their column means are plotted with error bars obtained from
    ``hci(values, 0.95)`` (presumably a confidence-interval half width --
    confirm against hci).

    Two files are written into `plot_foldpath`: "bcv.pdf" (inter, BetaCV and
    intra curves) and "cost.pdf" (cost curve).
    """
    assert os.path.isdir(plot_foldpath)
    initialize_matplotlib()

    X = np.genfromtxt(tseries_fpath)[:, 1:].copy()

    n_samples = X.shape[0]
    sample_rows = np.arange(n_samples)

    # list() so matplotlib receives a plain sequence on Python 3 as well.
    clust_range = list(range(2, 16))
    n_clustering_vals = len(clust_range)

    n_resamples = 5    # independent random samples of the data
    n_repeats = 5      # clustering runs per sample
    sample_size = 200  # rows kept from each shuffle
    n_runs = n_resamples * n_repeats

    intra_array = np.zeros(shape=(n_runs, n_clustering_vals))
    inter_array = np.zeros(shape=(n_runs, n_clustering_vals))
    bcvs_array = np.zeros(shape=(n_runs, n_clustering_vals))
    costs_array = np.zeros(shape=(n_runs, n_clustering_vals))

    r = 0
    # `range` instead of `xrange` so the script also runs on Python 3.
    for _ in range(n_resamples):
        np.random.shuffle(sample_rows)
        rand_sample = sample_rows[:sample_size]

        X_new = X[rand_sample]
        # Pairwise distances are shared by every run on this sample.
        D_new = dist.dist_all(X_new, X_new, rolling=True)[0]

        for _ in range(n_repeats):
            for col, k in enumerate(clust_range):
                intra, inter, bcv, cost = run_clustering(X_new, k, D_new)

                intra_array[r, col] = intra
                inter_array[r, col] = inter
                bcvs_array[r, col] = bcv
                costs_array[r, col] = cost

            r += 1
            print(r)  # progress: number of completed runs

    intra_err = np.zeros(n_clustering_vals)
    inter_err = np.zeros(n_clustering_vals)
    bcvs_err = np.zeros(n_clustering_vals)
    costs_err = np.zeros(n_clustering_vals)

    for j in range(n_clustering_vals):
        intra_err[j] = hci(intra_array[:, j], 0.95)
        inter_err[j] = hci(inter_array[:, j], 0.95)
        bcvs_err[j] = hci(bcvs_array[:, j], 0.95)
        costs_err[j] = hci(costs_array[:, j], 0.95)

    plt.errorbar(clust_range, np.mean(inter_array, axis=0), fmt="gD",
                 label="Inter Cluster", yerr=inter_err)
    plt.errorbar(clust_range, np.mean(bcvs_array, axis=0), fmt="bo",
                 label="BetaCV", yerr=bcvs_err)
    plt.errorbar(clust_range, np.mean(intra_array, axis=0), fmt="rs",
                 label="Intra Cluster", yerr=intra_err)
    plt.ylabel("Average Distance")
    plt.xlabel("Number of clusters")
    plt.xlim((0.0, 16))
    plt.ylim((0.0, 1.0))
    plt.legend(frameon=False, loc="lower left")
    plt.savefig(os.path.join(plot_foldpath, "bcv.pdf"))
    plt.close()

    plt.errorbar(clust_range, np.mean(costs_array, axis=0), fmt="bo",
                 label="Cost", yerr=costs_err)
    plt.ylabel("Cost (F)")
    plt.xlabel("Number of clusters")
    plt.xlim((0.0, 16))
    plt.ylim((0.0, 1.0))
    plt.legend(frameon=False, loc="lower left")
    plt.savefig(os.path.join(plot_foldpath, "cost.pdf"))
    plt.close()
コード例 #15
0
def main(tseries_fpath, plot_foldpath):
    """Produce the 'bcv.pdf' and 'cost.pdf' quality plots.

    Reads the series from `tseries_fpath` (dropping the first column), then
    repeats five times: shuffle the row indices, keep the first 200, compute
    their pairwise distances and call ``run_clustering`` five times for each
    number of clusters from 2 to 15. The four values it returns (intra,
    inter, bcv, cost) are stored per run, averaged per number of clusters and
    plotted with ``hci(values, .95)`` as the error bar (presumably the half
    width of a 95% confidence interval -- verify against hci).

    Both figures are saved inside `plot_foldpath`, which must be an existing
    directory.
    """
    assert os.path.isdir(plot_foldpath)
    initialize_matplotlib()

    X = np.genfromtxt(tseries_fpath)[:, 1:].copy()

    n_samples = X.shape[0]
    sample_rows = np.arange(n_samples)

    # Materialised as a list so the x values are a plain sequence on both
    # Python 2 and Python 3.
    clust_range = list(range(2, 16))
    n_clustering_vals = len(clust_range)

    intra_array = np.zeros(shape=(25, n_clustering_vals))
    inter_array = np.zeros(shape=(25, n_clustering_vals))
    bcvs_array = np.zeros(shape=(25, n_clustering_vals))
    costs_array = np.zeros(shape=(25, n_clustering_vals))

    r = 0
    # `range` replaces the Python-2-only `xrange`; 5 samples x 5 runs fill the
    # 25 rows allocated above.
    for i in range(5):
        np.random.shuffle(sample_rows)
        rand_sample = sample_rows[:200]

        X_new = X[rand_sample]
        D_new = dist.dist_all(X_new, X_new, rolling=True)[0]

        for j in range(5):
            for k in clust_range:
                intra, inter, bcv, cost = run_clustering(X_new, k, D_new)

                # Column k - 2 because the cluster counts start at 2.
                intra_array[r, k - 2] = intra
                inter_array[r, k - 2] = inter
                bcvs_array[r, k - 2] = bcv
                costs_array[r, k - 2] = cost

            r += 1
            print(r)  # progress: number of completed runs

    intra_err = np.zeros(n_clustering_vals)
    inter_err = np.zeros(n_clustering_vals)
    bcvs_err = np.zeros(n_clustering_vals)
    costs_err = np.zeros(n_clustering_vals)

    for k in clust_range:
        j = k - 2
        intra_err[j] = hci(intra_array[:, j], .95)
        inter_err[j] = hci(inter_array[:, j], .95)
        bcvs_err[j] = hci(bcvs_array[:, j], .95)
        costs_err[j] = hci(costs_array[:, j], .95)

    # (values, error bars, marker format, legend label), in drawing order.
    distance_curves = (
        (inter_array, inter_err, 'gD', 'Inter Cluster'),
        (bcvs_array, bcvs_err, 'bo', 'BetaCV'),
        (intra_array, intra_err, 'rs', 'Intra Cluster'),
    )
    for values, errors, marker, name in distance_curves:
        plt.errorbar(clust_range,
                     np.mean(values, axis=0),
                     fmt=marker,
                     label=name,
                     yerr=errors)
    plt.ylabel('Average Distance')
    plt.xlabel('Number of clusters')
    plt.xlim((0., 16))
    plt.ylim((0., 1.))
    plt.legend(frameon=False, loc='lower left')
    plt.savefig(os.path.join(plot_foldpath, 'bcv.pdf'))
    plt.close()

    plt.errorbar(clust_range,
                 np.mean(costs_array, axis=0),
                 fmt='bo',
                 label='Cost',
                 yerr=costs_err)
    plt.ylabel('Cost (F)')
    plt.xlabel('Number of clusters')
    plt.xlim((0., 16))
    plt.ylim((0., 1.))
    plt.legend(frameon=False, loc='lower left')
    plt.savefig(os.path.join(plot_foldpath, 'cost.pdf'))
    plt.close()
コード例 #16
0
def _base_ksc(tseries, initial_centroids, n_iters=-1):
    '''
    Core loop of the KSC algorithm, structured like K-Means.

    Each iteration first assigns every time series to the centroid with the
    smallest distance, remembering the shift associated with that centroid.
    Unless the assignment is identical to the previous one, new centroids
    are then computed from the assignment and the shifts.

    The two steps are repeated `n_iters` times. A negative `n_iters` repeats
    them until convergence, that is, until no time series changes cluster
    between consecutive iterations.

    Arguments
    ---------
    tseries: a matrix of shape (number of time series, size of each series)
        The time series to cluster
    initial_centroids: a matrix of shape (num. of clusters, size of time series)
        The initial centroid estimates
    n_iters: int
        The number of iterations which the algorithm will run

    Returns
    -------
    centroids: a matrix of shape (num. of clusters, size of time series)
        The final centroids found by the algorithm
    assign: an array of num. series size
        The cluster id which each time series belongs to
    best_shift: an array of num. series size
        The amount of shift performed for each time series
    cent_dists: a matrix of shape (num. centroids, num. series)
        The distance of each centroid to each time series

    With `n_iters` equal to 0 the loop never runs and the last three values
    are None.

    References
    ----------
    .. [1] J. Yang and J. Leskovec,
       "Patterns of Temporal Variation in Online Media" - WSDM'11
       http://dl.acm.org/citation.cfm?id=1935863
    .. [2] Wikipedia,
        "K-means clustering"
        http://en.wikipedia.org/wiki/K-means_clustering
    '''

    n_clusters = initial_centroids.shape[0]
    n_series = tseries.shape[0]

    centroids = initial_centroids
    cent_dists = assign = best_shift = None
    last_assign = None
    remaining = n_iters

    while remaining != 0:
        # Assignment step: closest centroid for every series.
        cent_dists, shifts = dist_all(centroids, tseries, rolling=True)
        assign = cent_dists.argmin(axis=0)

        # Shift of each series with respect to its assigned centroid.
        best_shift = np.ndarray(n_series, dtype='i')
        for col in range(shifts.shape[1]):
            best_shift[col] = shifts[assign[col], col]

        # Converged when no series moved since the previous iteration; the
        # centroids are then left as they are.
        unchanged = (last_assign is not None
                     and not (last_assign - assign).any())
        if unchanged:
            break

        # Update step.
        centroids = _compute_centroids(tseries, assign, n_clusters,
                                       best_shift)
        last_assign = assign
        remaining -= 1

    return centroids, assign, best_shift, cent_dists
コード例 #17
0
    Z = preprocessing.StandardScaler().fit_transform(T)
    km = cluster.MiniBatchKMeans(n_clusters=num_clusters)
    km = km.fit(Z)
    D = km.transform(Z)

    return D

if __name__ == '__main__':
    # Training features and response come from the project's `myio` helpers.
    X_train, T12_train, hosts_train = myio.read_features(test=False)
    Y_train = myio.read_response_train()
    k = 50  # number of clusters used by both feature transformations

    # --- Variant 1: prepend the K-means transform of T12 to the features ---
    print('K-means')
    km_features = np.hstack((transform_km(T12_train, k), X_train))

    km_model = OLS()
    km_model.fit(km_features, Y_train)
    print(k, np.sqrt(km_model.G.mean(axis=0)))

    # --- Variant 2: prepend the distances to the stored KSC centroids ---
    print('KSC')
    cents_path = 'ksc-results/cents_visits_%d.dat' % k
    C = np.genfromtxt(cents_path, dtype='d')
    # exp(x) - 1 applied to T12 (presumably undoing a log(1 + x) transform
    # -- confirm against myio), stored as a C-ordered array.
    T_nolog = np.asarray(np.exp(T12_train) - 1, order='C')
    # Transposed so that rows line up with the rows of X_train for hstack.
    ksc_dists = dist_all(C, T_nolog, rolling=True)[0].T
    ksc_features = np.hstack((ksc_dists, X_train))

    ksc_model = OLS()
    ksc_model.fit(ksc_features, Y_train)
    print(k, np.sqrt(ksc_model.G.mean(axis=0)))