def ksc(tseries, num_clusters, n_iters=-1, n_runs=10):
    '''
    Runs the KSC algorithm `n_runs` times and keeps the best clustering.

    This method will make `n_runs` calls to `_base_ksc`, returning the
    results from the run with the lowest over-all clustering cost. In each
    run, a random initialization of centroids is performed. This is done by
    assigning time series to clusters in a uniform random manner and then
    computing the centroid of each cluster.

    Please refer to the documentation of `_base_ksc` for a detailed summary
    of the KSC algorithm.

    Arguments
    ---------
    tseries: a matrix of shape (number of time series, size of each series)
        The time series to cluster
    num_clusters: int
        The number of clusters; random initial labels are drawn from
        `[0, num_clusters)`
    n_iters: int
        The number of iterations which the algorithm will run (forwarded
        unchanged to `_base_ksc`)
    n_runs: int
        The number of times to run the KSC algorithm

    Returns
    -------
    centroids: a matrix of shape (num. of clusters, size of time series)
        The final centroids found by the algorithm
    assign: an array of num. series size
        The cluster id which each time series belongs to
    best_shift: an array of num. series size
        The amount of shift performed for each time series
    cent_dists: a matrix of shape (num. centroids, num. series)
        The distance of each centroid to each time series

    If `n_runs` is not positive no run is performed and all four returned
    values are `None`.

    References
    ----------
    .. [1] J. Yang and J. Leskovec,
       "Patterns of Temporal Variation in Online Media" - WSDM'11
       http://dl.acm.org/citation.cfm?id=1935863
    '''
    min_cost = float('+inf')

    best_cents = None
    best_assign = None
    best_shift = None
    best_dist = None

    # `range` (not the Python 2-only `xrange`) keeps this runnable on both
    # Python 2 and Python 3; `n_runs` is small, so the list built on
    # Python 2 is negligible.
    for _ in range(n_runs):
        # Random initialization: uniformly assign each series to a cluster
        # and derive the starting centroids from that assignment.
        assign = np.random.randint(0, num_clusters, tseries.shape[0])
        cents = _compute_centroids(tseries, assign, num_clusters)

        cents, assign, series_shift, dists = _base_ksc(tseries, cents,
                                                       n_iters)
        clust_cost = cost(tseries, assign, cents, dists)

        # Strict comparison: on ties the earliest run is kept.
        if clust_cost < min_cost:
            min_cost = clust_cost
            best_cents = cents
            best_assign = assign
            best_shift = series_shift
            best_dist = dists

    return best_cents, best_assign, best_shift, best_dist
def ksc(tseries, num_clusters, n_iters=-1, n_runs=10):
    '''
    Multi-restart driver for the KSC clustering algorithm.

    `_base_ksc` is invoked `n_runs` times, each time starting from a fresh
    random initialization: every time series is assigned to a cluster
    uniformly at random and the centroids of that assignment are used as
    the starting point. The outcome of the run with the lowest over-all
    clustering cost is returned.

    See the documentation of `_base_ksc` for a detailed description of the
    KSC algorithm itself.

    Arguments
    ---------
    tseries: a matrix of shape (number of time series, size of each series)
        The time series to cluster
    num_clusters: int
        The number of clusters used for the random initial assignment
    n_iters: int
        The number of iterations which the algorithm will run
    n_runs: int
        The number of times to run the KSC algorithm

    Returns
    -------
    centroids: a matrix of shape (num. of clusters, size of time series)
        The final centroids found by the algorithm
    assign: an array of num. series size
        The cluster id which each time series belongs to
    best_shift: an array of num. series size
        The amount of shift performed for each time series
    cent_dists: a matrix of shape (num. centroids, num. series)
        The distance of each centroid to each time series

    References
    ----------
    .. [1] J. Yang and J. Leskovec,
       "Patterns of Temporal Variation in Online Media" - WSDM'11
       http://dl.acm.org/citation.cfm?id=1935863
    '''
    lowest_cost = float('+inf')
    # (centroids, assignment, shifts, distances) of the cheapest run so far.
    winner = (None, None, None, None)

    for _run in range(n_runs):
        # Uniform random labelling gives the starting centroids of this run.
        random_labels = np.random.randint(0, num_clusters, tseries.shape[0])
        seed_cents = _compute_centroids(tseries, random_labels, num_clusters)

        cents, assign, series_shift, dists = _base_ksc(tseries, seed_cents,
                                                       n_iters)
        run_cost = cost(tseries, assign, cents, dists)

        # Only a strictly cheaper run replaces the current winner.
        if not run_cost < lowest_cost:
            continue

        lowest_cost = run_cost
        winner = (cents, assign, series_shift, dists)

    return winner
def run_clustering(X, k, dists_all):
    '''
    Clusters `X` with `ksc.inc_ksc` and summarizes the clustering quality.

    Arguments
    ---------
    X: the data handed to `ksc.inc_ksc` and to every `metrics` helper
        (presumably a matrix with one time series per row -- confirm
        against callers)
    k: second argument of `ksc.inc_ksc` (presumably the number of
        clusters -- confirm)
    dists_all: forwarded to the intra/inter/beta-CV helpers (presumably
        precomputed distances between the series -- confirm)

    Returns
    -------
    A 4-tuple `(intra, inter, bcv, cost)` where `intra` and `inter` are the
    first elements of the results of `metrics.avg_intra_dist` and
    `metrics.avg_inter_dist`, `bcv` is the result of `metrics.beta_cv` and
    `cost` is the result of `metrics.cost`.
    '''
    # Only the assignment and the centroid distances are needed below; the
    # centroids and the shifts returned by the clustering are discarded.
    _, labels, _, cent_dists = ksc.inc_ksc(X, k)

    intra_dist = metrics.avg_intra_dist(X, labels, dists_all)[0]
    inter_dist = metrics.avg_inter_dist(X, labels, dists_all)[0]
    beta_cv = metrics.beta_cv(X, labels, dists_all)

    # No centroids are passed here (None); the centroid distances are
    # supplied instead.
    clust_cost = metrics.cost(X, labels, None, cent_dists)

    return intra_dist, inter_dist, beta_cv, clust_cost