def test_all(self):
    '''
    Checks `dhwt.transform` against hand-computed outputs and verifies that
    `dhwt.inverse` applied to the transform output rebuilds the input.

    Every case is a triple: (input signal, expected first output of
    `transform`, expected second output of `transform`). The cases cover an
    empty signal, even-length signals and odd-length signals.
    '''
    cases = (
        ([],
         [],
         []),
        ([1., 1],
         [1.],
         [0.]),
        ([1., 2, 3, 0],
         [1.5, 1.5],
         [-.5, 1.5]),
        ([1., 2, 3, 0, 7],
         [1.5, 1.5, 3.5],
         [-.5, 1.5, 3.5]),
        ([6., 12, 15, 15, 14, 12, 120, 116],
         [9., 15, 13, 118],
         [-3, 0, 1, 2]),
        ([6., 12, 15, 15, 14, 12, 120, 116, 2],
         [9., 15, 13, 118, 1],
         [-3, 0, 1, 2, 1]),
    )

    for signal, expected_first, expected_second in cases:
        x = np.array(signal)

        # Each output of the transform is compared on its own ...
        assert_array_equal(np.array(expected_first), dhwt.transform(x)[0])
        assert_array_equal(np.array(expected_second), dhwt.transform(x)[1])

        # ... and the inverse of the transform must give back the input.
        assert_array_equal(x, dhwt.inverse(*dhwt.transform(x)))
def inc_ksc(tseries, num_clusters, n_iters=-1, num_wavelets=2):
    '''
    Given the number `num_wavelets`, this method will compute subsequent
    Discrete Haar Wavelet Transforms of the time series to be clustered. At
    each transform the number of points of the time series is decreased, thus
    we say that we are viewing the time series at a coarser resolution.
    Clustering will begin at the coarsest resolution (last transform), and the
    results from the previous resolution are used to initialize the current
    one. Only the coarsest resolution is initialized randomly.

    This technique can improve the run-time of the KSC algorithm, since it is
    faster to cluster at coarser resolutions (less data points), and for each
    subsequent resolution the centroids from the previous resolution are
    already a close approximation of the actual centroids. See [1] for
    details.

    Please refer to the documentation of `_base_ksc` for a detailed summary
    of the KSC algorithm.

    Arguments
    ---------
    tseries: a matrix of shape (number of time series, size of each series)
        The time series to cluster
    num_clusters: int
        The number of clusters; the initial random assignment draws cluster
        ids from [0, num_clusters)
    n_iters: int
        The number of iterations which the algorithm will run. It is passed
        unchanged to `_base_ksc` at every resolution.
    num_wavelets: int
        The number of wavelets to use, i.e., how many successive transforms
        are computed. With 0 the clustering runs on `tseries` only.

    Returns
    -------
    centroids: a matrix of shape (num. of clusters, size of time series)
        The final centroids found by the algorithm
    assign: an array of num. series size
        The cluster id which each time series belongs to
    best_shift: an array of num. series size
        The amount of shift performed for each time series
    cent_dists: a matrix of shape (num. centroids, num. series)
        The distance of each centroid to each time series

    References
    ----------
    .. [1] J. Yang and J. Leskovec,
       "Patterns of Temporal Variation in Online Media" - WSDM'11
       http://dl.acm.org/citation.cfm?id=1935863
    '''
    num_series = tseries.shape[0]

    # dhw_series[0] is the original data; dhw_series[k] is the data after k
    # successive transforms. `range` is used instead of `xrange` so that the
    # function also runs under Python 3, where `xrange` does not exist.
    dhw_series = [tseries]
    previous = tseries
    for _ in range(num_wavelets):
        # Only the first output of `transform` is kept for the next level.
        previous = np.array([transform(previous[j])[0]
                             for j in range(num_series)])
        dhw_series.append(previous)

    # Random initial assignment, used only at the coarsest resolution.
    assign = np.random.randint(0, num_clusters, num_series)
    series_shift = None

    # From the coarsest to the original resolution: the assignment and the
    # shifts found at one level seed the centroids of the next one. The list
    # always holds at least `tseries`, so the loop body runs at least once
    # and every returned name is bound.
    for dhw in reversed(dhw_series):
        cents = _compute_centroids(dhw, assign, num_clusters, series_shift)
        cents, assign, series_shift, dists = _base_ksc(dhw, cents, n_iters)

    return cents, assign, series_shift, dists
def inc_ksc(tseries, num_clusters, n_iters=-1, num_wavelets=2):
    '''
    Incremental KSC: runs the KSC clustering over a sequence of
    representations of the time series, each one obtained by applying
    `transform` to the previous one (see [1]).

    `num_wavelets` successive transforms are computed and, at each of them,
    the number of points of every series is decreased. Clustering begins on
    the last transform, which is the only one initialized randomly; every
    other level (ending with the original series) is initialized with the
    assignments and shifts found at the level before it. Since the levels
    with less data points are faster to cluster and yield centroids that
    already approximate the ones of the following level, this can improve the
    run-time of the KSC algorithm.

    The KSC algorithm itself is documented in `_base_ksc`.

    Arguments
    ---------
    tseries: a matrix of shape (number of time series, size of each series)
        The time series to cluster
    num_clusters: int
        Upper bound (exclusive) of the cluster ids drawn for the random
        initial assignment; also given to `_compute_centroids`
    n_iters: int
        The number of iterations which the algorithm will run
    num_wavelets: int
        The number of wavelets to use

    Returns
    -------
    centroids: a matrix of shape (num. of clusters, size of time series)
        The final centroids found by the algorithm
    assign: an array of num. series size
        The cluster id which each time series belongs to
    best_shift: an array of num. series size
        The amount of shift performed for each time series
    cent_dists: a matrix of shape (num. centroids, num. series)
        The distance of each centroid to each time series

    References
    ----------
    .. [1] J. Yang and J. Leskovec,
       "Patterns of Temporal Variation in Online Media" - WSDM'11
       http://dl.acm.org/citation.cfm?id=1935863
    '''
    n_rows = tseries.shape[0]

    # levels[0] holds the input, levels[k] the result of k transforms.
    levels = [tseries]
    for _ in range(num_wavelets):
        finer = levels[-1]
        coarser = [transform(finer[row])[0] for row in range(n_rows)]
        levels.append(np.array(coarser))

    assign = np.random.randint(0, num_clusters, n_rows)
    series_shift = None

    # Walk the levels backwards, last transform first and input last.
    for level in levels[::-1]:
        seeds = _compute_centroids(level, assign, num_clusters, series_shift)
        cents, assign, series_shift, dists = _base_ksc(level, seeds, n_iters)

    return cents, assign, series_shift, dists