def kshape_grid_iter(X_partitioned: List[np.array], kshape_kwargs: dict) -> Tuple[KShape, int]:
    """Fit a KShape model seeded with one randomly chosen series per partition.

    Parameters
    ----------
    X_partitioned : list of np.ndarray
        One array per partition; a single row of each array is drawn at
        random and used as an initial centroid, so the model is fitted with
        ``len(X_partitioned)`` clusters.
        NOTE(review): the ``init`` construction below looks like it expects
        2-D partitions ``(n_series, series_length)`` -- confirm with callers.
    kshape_kwargs : dict
        Extra keyword arguments forwarded to ``KShape``.

    Returns
    -------
    kshape : KShape
        The fitted model, with ``init`` converted to a plain list.
    n_clusters_out : int
        Number of distinct labels predicted for the stacked input.
    """
    # ``randint`` excludes its upper bound, so pass the row count itself:
    # using ``shape[0] - 1`` would never pick the last row and would raise
    # for a partition holding a single series.
    seed_ixs = [np.random.randint(0, part.shape[0]) for part in X_partitioned]
    centroid_seeds = np.array(
        [part[seed] for part, seed in zip(X_partitioned, seed_ixs)])

    # Append a trailing axis of length one to the stacked seeds.
    init = np.swapaxes(np.array([centroid_seeds]).T, 0, 1)

    kshape = KShape(n_clusters=len(seed_ixs), init=init, verbose=True,
                    random_state=None, **kshape_kwargs)

    X = np.vstack(X_partitioned)

    print('** Fitting ks model **')
    kshape.fit(X)

    print('** Predicting **')
    n_clusters_out = np.unique(kshape.predict(X)).size

    # until the tslearn hyper-param json issue is released in the latest
    # pypi version
    kshape.init = kshape.init.tolist()

    return kshape, n_clusters_out
def test_kshape():
    """Check KShape labels against its own distances and predictions.

    After fitting on scaled random series, ``labels_`` must equal both the
    argmin of ``_cross_dists`` and the output of ``predict``. Asking for more
    clusters (101) than there are series (15) must leave ``_X_fit`` as None.
    """
    n_series, length, n_dims = 15, 10, 3
    rng = np.random.RandomState(0)

    raw_series = rng.randn(n_series, length, n_dims)
    scaled = TimeSeriesScalerMeanVariance().fit_transform(raw_series)

    model = KShape(
        n_clusters=3,
        n_init=1,
        verbose=False,
        random_state=rng,
    ).fit(scaled)

    cross_dists = model._cross_dists(scaled)
    np.testing.assert_allclose(model.labels_, cross_dists.argmin(axis=1))
    np.testing.assert_allclose(model.labels_, model.predict(scaled))

    # Shares the same RandomState instance as the model above.
    oversized = KShape(n_clusters=101, verbose=False, random_state=rng)
    assert oversized.fit(scaled)._X_fit is None
def run_single(X, train, params, workdir, out):
    """Fit KShape on ``train``, label ``X`` and persist the results.

    Parameters
    ----------
    X : array-like
        Data passed to ``predict`` after fitting.
    train : array-like
        Data the model is fitted on; also saved to ``train.npy``.
    params : dict
        Keyword arguments forwarded to ``KShape``.
    workdir : str
        Directory receiving ``ks.pickle``, ``y_pred.npy`` and ``train.npy``.
    out : str
        Path of a status file; ``'1'`` is written to it once everything
        has been saved.
    """
    ks = KShape(**params)
    ks.fit(train)

    print('**** Predicting ****')
    y_pred = ks.predict(X)

    # Use a context manager so the pickle file is flushed and closed
    # deterministically instead of relying on garbage collection.
    ks_path = os.path.join(workdir, 'ks.pickle')
    with open(ks_path, 'wb') as f:
        pickle.dump(ks, f)

    y_pred_path = os.path.join(workdir, 'y_pred.npy')
    np.save(y_pred_path, y_pred)

    train_path = os.path.join(workdir, 'train.npy')
    np.save(train_path, train)

    with open(out, 'w') as f:
        f.write('1')

    print('* Done! *')
def run(data_path: str, params_path: str):
    """Load data and parameters, fit KShape on a random subset, save results.

    Parameters
    ----------
    data_path : str
        Path of a ``.npy`` file holding the full data set.
    params_path : str
        Path of a pickled dict with the keys ``'workdir'`` (output
        directory) and ``'kwargs'`` (``KShape`` keyword arguments plus
        ``'train_percent'``, the percentage of rows used for fitting).

    Notes
    -----
    The status file ``<workdir>/out`` holds ``'0'`` while the job runs and
    ``'1'`` once ``ks.pickle``, ``y_pred.npy`` and ``train.npy`` are saved.
    """
    X = np.load(data_path)

    # NOTE: unpickling can execute arbitrary code -- only point this at
    # parameter files from a trusted source.
    with open(params_path, 'rb') as f:
        params = pickle.load(f)

    workdir = params['workdir']
    out = os.path.join(workdir, 'out')

    with open(out, 'w') as f:
        f.write('0')

    print(f'Using work dir: {workdir}')

    print('** Fitting training data **')

    # ``train_percent`` is popped because it is not a KShape argument.
    n_train = int((params['kwargs'].pop('train_percent') / 100) * X.shape[0])
    train = X[np.random.choice(X.shape[0], size=n_train, replace=False)]

    kwargs = params['kwargs']
    ks = KShape(**kwargs)
    ks.fit(train)

    print('**** Predicting ****')
    y_pred = ks.predict(X)

    # Context manager so the pickle file is closed deterministically.
    ks_path = os.path.join(workdir, 'ks.pickle')
    with open(ks_path, 'wb') as f:
        pickle.dump(ks, f)

    y_pred_path = os.path.join(workdir, 'y_pred.npy')
    np.save(y_pred_path, y_pred)

    train_path = os.path.join(workdir, 'train.npy')
    np.save(train_path, train)

    with open(out, 'w') as f:
        f.write('1')

    print('* Done! *')
class TimeSeriesKShapes(BaseClusterer):
    """Kshape algorithm: wrapper around tslearn's implementation.

    Parameters
    ----------
    n_clusters: int, defaults = 8
        The number of clusters to form as well as the number of
        centroids to generate.
    init_algorithm: str or np.ndarray, defaults = 'random'
        Method for initializing cluster centers. Any of the following are valid:
        ['random']. Or a np.ndarray of shape (n_clusters, ts_size, d) and gives the
        initial centers.
    n_init: int, defaults = 10
        Number of times the k-means algorithm will be run with different
        centroid seeds. The final result will be the best output of n_init
        consecutive runs in terms of inertia.
    max_iter: int, defaults = 300
        Maximum number of iterations of the k-means algorithm for a single
        run.
    tol: float, defaults = 1e-4
        Relative tolerance with regards to Frobenius norm of the difference
        in the cluster centers of two consecutive iterations to declare
        convergence.
    verbose: bool, defaults = False
        Verbosity mode.
    random_state: int or np.random.RandomState instance or None, defaults = None
        Determines random number generation for centroid initialization.

    Attributes
    ----------
    cluster_centers_: np.ndarray or None
        Cluster centers copied from the fitted tslearn model; None before
        fitting.
    labels_: np.ndarray (1d array of shape (n_instance,))
        Labels that is the index each time series belongs to.
    inertia_: float
        Sum of squared distances of samples to their closest cluster center,
        weighted by the sample weights if provided.
    n_iter_: int
        Number of iterations run.
    """

    _tags = {
        "capability:multivariate": True,
    }

    def __init__(
        self,
        n_clusters: int = 8,
        init_algorithm: Union[str, np.ndarray] = "random",
        n_init: int = 10,
        max_iter: int = 300,
        tol: float = 1e-4,
        verbose: bool = False,
        random_state: Union[int, RandomState] = None,
    ):
        _check_soft_dependencies("tslearn", severity="error", object=self)

        self.init_algorithm = init_algorithm
        self.n_init = n_init
        self.max_iter = max_iter
        self.tol = tol
        self.verbose = verbose
        self.random_state = random_state
        self.cluster_centers_ = None
        self.labels_ = None
        self.inertia_ = None
        self.n_iter_ = 0

        # Built lazily on the first call to ``_fit``.
        self._tslearn_k_shapes = None

        super().__init__(n_clusters=n_clusters)

    def _fit(self, X: TimeSeriesInstances, y=None) -> "TimeSeriesKShapes":
        """Fit time series clusterer to training data.

        Parameters
        ----------
        X : np.ndarray (2d or 3d array of shape (n_instances, series_length) or shape
            (n_instances, n_dimensions, series_length))
            Training time series instances to cluster.
        y: ignored, exists for API consistency reasons.

        Returns
        -------
        self:
            Fitted estimator.
        """
        from tslearn.clustering import KShape

        if self._tslearn_k_shapes is None:
            self._tslearn_k_shapes = KShape(
                # Honour the configured cluster count rather than a
                # hard-coded value.
                n_clusters=self.n_clusters,
                max_iter=self.max_iter,
                tol=self.tol,
                random_state=self.random_state,
                n_init=self.n_init,
                verbose=self.verbose,
                init=self.init_algorithm,
            )

        self._tslearn_k_shapes.fit(X)

        # Expose the centers under the public name initialised in
        # ``__init__``; the private alias is kept for existing readers.
        self.cluster_centers_ = self._tslearn_k_shapes.cluster_centers_
        self._cluster_centers = self.cluster_centers_
        self.labels_ = self._tslearn_k_shapes.labels_
        self.inertia_ = self._tslearn_k_shapes.inertia_
        self.n_iter_ = self._tslearn_k_shapes.n_iter_
        return self

    def _predict(self, X: TimeSeriesInstances, y=None) -> np.ndarray:
        """Predict the closest cluster each sample in X belongs to.

        Parameters
        ----------
        X : np.ndarray (2d or 3d array of shape (n_instances, series_length) or shape
            (n_instances, n_dimensions, series_length))
            Time series instances to predict their cluster indexes.
        y: ignored, exists for API consistency reasons.

        Returns
        -------
        np.ndarray (1d array of shape (n_instances,))
            Index of the cluster each time series in X belongs to.
        """
        return self._tslearn_k_shapes.predict(X)

    @classmethod
    def get_test_params(cls, parameter_set="default"):
        """Return testing parameter settings for the estimator.

        Parameters
        ----------
        parameter_set : str, default="default"
            Name of the set of test parameters to return, for use in tests. If no
            special parameters are defined for a value, will return `"default"` set.

        Returns
        -------
        params : dict or list of dict, default = {}
            Parameters to create testing instances of the class
            Each dict are parameters to construct an "interesting" test instance, i.e.,
            `MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance.
            `create_test_instance` uses the first (or only) dictionary in `params`
        """
        params = {
            "n_clusters": 2,
            "init_algorithm": "random",
            "n_init": 1,
            "max_iter": 1,
            "tol": 1e-4,
            "verbose": False,
            "random_state": 1,
        }
        return params

    def _score(self, X, y=None):
        """Return the absolute value of the fitted inertia; X and y are unused."""
        return np.abs(self.inertia_)
# Load the ECGFiveDays test split: column 0 holds the class label, the
# remaining columns hold the series values.
data_test = np.loadtxt(current_path + file + "ECGFiveDays\\ECGFiveDays_TEST.tsv")
X_test = to_time_series_dataset(data_test[:, 1:])
# ``np.int`` was only an alias of the builtin ``int`` and was removed in
# NumPy 1.24, so use the builtin directly.
y_test = data_test[:, 0].astype(int)

file = "教師なし教科書\\13章-時系列クラスタリング\\3_ECGFiveDays_k_shape\\result\\"

# Prepare the data - Scale
X_train = TimeSeriesScalerMeanVariance(mu=0., std=1.).fit_transform(X_train)
X_test = TimeSeriesScalerMeanVariance(mu=0., std=1.).fit_transform(X_test)

# k-Shape Algorithm
# Train using k-Shape
ks = KShape(n_clusters=2, max_iter=100, n_init=100, verbose=0)
ks.fit(X_train)

# Make predictions on train set and calculate adjusted Rand index
preds = ks.predict(X_train)
ars = adjusted_rand_score(data_train[:, 0], preds)
print("Adjusted Rand Index:", ars)

# Make predictions on test set and calculate adjusted Rand index
preds_test = ks.predict(X_test)
ars = adjusted_rand_score(data_test[:, 0], preds_test)
print("Adjusted Rand Index on Test Set:", ars)

# The results are poor because the training set is small: train 23, test 861
# Adjusted Rand Index: 0.668041237113402
# Adjusted Rand Index on Test Set: 0.012338817789874643
class Kshape:
    """Cluster fixed-length windows of a temperature series with k-Shape.

    Parameters
    ----------
    time_span : int, default=1
        Window length in minutes. The data is expected to hold one sample
        every 10 seconds, so one window covers ``time_span * 6`` rows
        (``time_span = 1`` means one series is one minute of data).
    batch : int, default=60
        Number of points every window is resampled to before clustering.
    data : pd.DataFrame, optional
        Frame with a ``TEMPERATURE`` column, indexed by ``DEVICE_DATETIME``
        and sorted by that index in ascending order.
    """

    def __init__(
        self,
        time_span=1,
        batch=60,
        data=None,
    ):
        self.time_span = time_span * 6
        self.data = data
        self.batch = batch
        self.km = KShape(n_clusters=2, max_iter=50, verbose=True, random_state=0)

    def _to_windows(self, frame):
        """Scale ``frame.TEMPERATURE`` and cut it into resampled windows.

        Returns ``(windows, n_windows, n_used)`` where ``n_used`` is the
        number of leading rows that fill complete windows.
        """
        n_windows = len(frame) // self.time_span
        n_used = self.time_span * n_windows
        if n_windows == 0:
            raise ValueError(
                f"need at least {self.time_span} rows to build one window, "
                f"got {len(frame)}"
            )

        # Positional slicing keeps exactly ``n_used`` rows even when the
        # index contains repeated timestamps.
        head = frame.iloc[:n_used]
        values = np.array(head.TEMPERATURE).reshape(1, -1)
        # The whole usable span is scaled as one series before being split.
        scaled = TimeSeriesScalerMeanVariance().fit_transform(values)
        windows = np.array(scaled).reshape(n_windows, -1)
        windows = TimeSeriesResampler(sz=self.batch).fit_transform(windows)
        return windows, n_windows, n_used

    def _label_tail(self, tail):
        """Resample the leftover rows into one series and predict its label.

        Returns ``(values, resampled, labels)`` with one label per leftover
        row.
        NOTE(review): unlike full windows, the leftover segment is not
        mean-variance scaled before prediction -- confirm this is intended.
        """
        values = np.array(tail.TEMPERATURE).reshape(1, -1)
        resampled = TimeSeriesResampler(sz=self.batch).fit_transform(values)
        # ``predict`` returns a one-element array; index it explicitly
        # because converting a 1-D array with ``int()`` is deprecated.
        label = int(self.km.predict(resampled)[0])
        return values, resampled, [label] * values.shape[1]

    def Preprocess(self, x=None):
        """Turn a frame into windows of ``batch`` points each.

        With ``x=None`` the instance data is processed and the result is
        stored in ``self.ts`` (``self.n_data`` and ``self.n_use`` are set,
        nothing is returned). Otherwise ``x`` is processed,
        ``self.x_data`` and ``self.x_use`` are set and the windows are
        returned.
        """
        if x is None:
            self.ts, self.n_data, self.n_use = self._to_windows(self.data)
            return None

        ts, self.x_data, self.x_use = self._to_windows(x)
        return ts

    def classification(self):
        """Cluster the instance data with KShape.

        Rows that do not fill a complete window are stretched with
        TimeSeriesResampler and predicted as one extra series. Afterwards
        a ``CLUSTER`` column is added to ``self.data``.
        """
        self.Preprocess()
        self.y_pred = self.km.fit_predict(self.ts)

        # Build the cluster column: every row inherits its window's label.
        self.cluster = [
            label for label in self.y_pred for _ in range(self.time_span)
        ]

        # When rows are left over, build a single series from them with
        # the resampler and predict it.
        if self.n_use != len(self.data):
            self.ts_c, self.ts_batch, self.y_pred_c = self._label_tail(
                self.data.iloc[self.n_use:]
            )
            self.cluster.extend(self.y_pred_c)

        self.data["CLUSTER"] = self.cluster

    def draw_graph(self, x=None):
        """Scatter-plot temperature over time coloured by cluster.

        Plots ``self.data`` when ``x`` is None, otherwise ``x``.
        """
        frame = self.data if x is None else x

        _fig, ax = plt.subplots()
        sns.scatterplot(data=frame,
                        x="DEVICE_DATETIME",
                        y="TEMPERATURE",
                        hue="CLUSTER")
        locator = mdates.AutoDateLocator(minticks=4, maxticks=10)
        formatter = mdates.ConciseDateFormatter(locator=locator)
        ax.xaxis.set_major_locator(locator)
        ax.xaxis.set_major_formatter(formatter)
        plt.show()

    def predict(self, x):
        """Label ``x`` with the fitted model, add ``CLUSTER`` to it and plot.

        ``x`` is modified in place; nothing is returned.
        """
        ts = self.Preprocess(x=x)
        pred = self.km.predict(ts)

        cluster = [label for label in pred for _ in range(self.time_span)]

        # When rows are left over, build a single series from them with
        # the resampler and predict it.
        if self.x_use != len(x):
            self.x_c, self.x_batch, y_pred_c = self._label_tail(
                x.iloc[self.x_use:]
            )
            cluster.extend(y_pred_c)

        x["CLUSTER"] = cluster
        self.draw_graph(x=x)