Example #1
0
def kshape_grid_iter(X_partitioned: List[np.ndarray],
                     kshape_kwargs: dict) -> Tuple[KShape, int]:
    """Fit one KShape model seeded with a random series from each partition.

    One row is drawn at random from every array in ``X_partitioned`` and used
    as the initial centroid of one cluster, so the model is built with as many
    clusters as there are partitions. The model is then fitted on all
    partitions stacked together.

    Parameters
    ----------
    X_partitioned : list of np.ndarray
        Partitions of the data set; each must have at least one row.
        (assumes every partition is 2-D ``(n_series, series_length)`` so that
        the seeds end up as ``(n_partitions, series_length, 1)`` -- TODO confirm)
    kshape_kwargs : dict
        Extra keyword arguments forwarded to ``KShape``.

    Returns
    -------
    kshape : KShape
        The fitted model, with ``init`` converted to a plain list.
    n_clusters_out : int
        Number of distinct labels predicted for the stacked data.
    """
    # randint's upper bound is exclusive: passing shape[0] (not shape[0] - 1)
    # lets the last row be selected and keeps one-row partitions working.
    seed_ixs = [np.random.randint(0, X.shape[0]) for X in X_partitioned]
    centroid_seeds = np.array(
        [part[seed] for part, seed in zip(X_partitioned, seed_ixs)])
    # Add a trailing axis of size one to the stacked seeds.
    init = np.swapaxes(np.array([centroid_seeds]).T, 0, 1)

    kshape = KShape(n_clusters=len(seed_ixs),
                    init=init,
                    verbose=True,
                    random_state=None,
                    **kshape_kwargs)

    X = np.vstack(X_partitioned)

    print('** Fitting ks model **')
    kshape.fit(X)

    print('** Predicting **')
    n_clusters_out = np.unique(kshape.predict(X)).size

    # until the tslearn hyper-param json issue is released in the latest pypi version
    kshape.init = kshape.init.tolist()

    return kshape, n_clusters_out
Example #2
0
def test_kshape():
    """Check KShape label consistency and the too-many-clusters case.

    After fitting on 15 scaled random series, ``labels_`` must agree both with
    the argmin of ``_cross_dists`` and with ``predict`` on the same data.
    Requesting 101 clusters for those 15 series must leave ``_X_fit`` as None.
    """
    n_series, length, n_dims = 15, 10, 3
    rng = np.random.RandomState(0)
    raw_series = rng.randn(n_series, length, n_dims)
    scaled = TimeSeriesScalerMeanVariance().fit_transform(raw_series)

    # The same RandomState instance is shared by both models below.
    model = KShape(n_clusters=3, n_init=1, verbose=False,
                   random_state=rng).fit(scaled)
    nearest_centroid = model._cross_dists(scaled).argmin(axis=1)
    np.testing.assert_allclose(model.labels_, nearest_centroid)
    predicted = model.predict(scaled)
    np.testing.assert_allclose(model.labels_, predicted)

    oversized = KShape(n_clusters=101, verbose=False,
                       random_state=rng).fit(scaled)
    assert oversized._X_fit is None
Example #3
0
def run_single(X, train, params, workdir, out):
    """Fit KShape on ``train``, label ``X`` and write the results to disk.

    Parameters
    ----------
    X : array-like
        Data passed to ``KShape.predict``.
    train : array-like
        Data passed to ``KShape.fit``; also saved as ``train.npy``.
    params : dict
        Keyword arguments used to construct ``KShape``.
    workdir : str
        Directory receiving ``ks.pickle``, ``y_pred.npy`` and ``train.npy``.
    out : str
        Path of a status file; ``'1'`` is written to it once everything
        has been saved.
    """
    ks = KShape(**params)

    ks.fit(train)

    print('**** Predicting ****')
    y_pred = ks.predict(X)

    # Use a context manager so the pickle file is flushed and closed
    # deterministically instead of relying on garbage collection.
    ks_path = os.path.join(workdir, 'ks.pickle')
    with open(ks_path, 'wb') as ks_file:
        pickle.dump(ks, ks_file)

    y_pred_path = os.path.join(workdir, 'y_pred.npy')
    np.save(y_pred_path, y_pred)

    train_path = os.path.join(workdir, 'train.npy')
    np.save(train_path, train)

    # Written last: signals that all artefacts above were saved.
    with open(out, 'w') as f:
        f.write('1')

    print('* Done! *')
Example #4
0
def run(data_path: str, params_path: str):
    """Fit KShape on a random subset of the data and label the whole set.

    Parameters
    ----------
    data_path : str
        Path of a ``.npy`` file loaded with ``np.load``.
    params_path : str
        Path of a pickled dict with the keys ``'workdir'`` (output directory)
        and ``'kwargs'`` (``KShape`` keyword arguments plus a
        ``'train_percent'`` entry giving the percentage of rows to train on).

    Notes
    -----
    A status file ``<workdir>/out`` is written with ``'0'`` at start and
    ``'1'`` once ``ks.pickle``, ``y_pred.npy`` and ``train.npy`` are saved.
    """
    X = np.load(data_path)

    # NOTE(review): pickle.load can execute arbitrary code; only point this
    # at parameter files from a trusted source.
    with open(params_path, 'rb') as params_file:
        params = pickle.load(params_file)
    workdir = params['workdir']

    out = os.path.join(workdir, 'out')
    with open(out, 'w') as f:
        f.write('0')

    print(f'Using work dir: {workdir}')

    print('** Fitting training data **')
    # 'train_percent' is removed here so it is not forwarded to KShape.
    n_train = int((params['kwargs'].pop('train_percent') / 100) * X.shape[0])
    train = X[np.random.choice(X.shape[0], size=n_train, replace=False)]

    kwargs = params['kwargs']
    ks = KShape(**kwargs)

    ks.fit(train)

    print('**** Predicting ****')
    y_pred = ks.predict(X)

    # Context manager guarantees the pickle is flushed and the handle closed.
    ks_path = os.path.join(workdir, 'ks.pickle')
    with open(ks_path, 'wb') as ks_file:
        pickle.dump(ks, ks_file)

    y_pred_path = os.path.join(workdir, 'y_pred.npy')
    np.save(y_pred_path, y_pred)

    train_path = os.path.join(workdir, 'train.npy')
    np.save(train_path, train)

    with open(out, 'w') as f:
        f.write('1')

    print('* Done! *')
Example #5
0
class TimeSeriesKShapes(BaseClusterer):
    """Kshape algorithm, a wrapper around tslearn's implementation.

    Parameters
    ----------
    n_clusters: int, defaults = 8
        The number of clusters to form as well as the number of
        centroids to generate.
    init_algorithm: str or np.ndarray, defaults = 'random'
        Method for initializing cluster centers. Any of the following are valid:
        ['random']. Or a np.ndarray of shape (n_clusters, ts_size, d) and gives the
        initial centers. Forwarded to tslearn's ``KShape`` as ``init``.
    n_init: int, defaults = 10
        Number of times the k-shape algorithm will be run with different
        centroid seeds. The final result will be the best output of n_init
        consecutive runs in terms of inertia.
    max_iter: int, defaults = 300
        Maximum number of iterations of the k-shape algorithm for a single
        run.
    tol: float, defaults = 1e-4
        Relative tolerance with regards to Frobenius norm of the difference
        in the cluster centers of two consecutive iterations to declare
        convergence.
    verbose: bool, defaults = False
        Verbosity mode.
    random_state: int or np.random.RandomState instance or None, defaults = None
        Determines random number generation for centroid initialization.

    Attributes
    ----------
    cluster_centers_: np.ndarray or None
        Cluster centers copied from the fitted tslearn estimator; None before
        fitting.
    labels_: np.ndarray (1d array of shape (n_instance,))
        Labels that is the index each time series belongs to.
    inertia_: float
        Sum of squared distances of samples to their closest cluster center, weighted by
        the sample weights if provided.
    n_iter_: int
        Number of iterations run.
    """

    _tags = {
        "capability:multivariate": True,
    }

    def __init__(
        self,
        n_clusters: int = 8,
        init_algorithm: Union[str, np.ndarray] = "random",
        n_init: int = 10,
        max_iter: int = 300,
        tol: float = 1e-4,
        verbose: bool = False,
        random_state: Union[int, RandomState, None] = None,
    ):
        _check_soft_dependencies("tslearn", severity="error", object=self)

        self.init_algorithm = init_algorithm
        self.n_init = n_init
        self.max_iter = max_iter
        self.tol = tol
        self.verbose = verbose
        self.random_state = random_state

        self.cluster_centers_ = None
        self.labels_ = None
        self.inertia_ = None
        self.n_iter_ = 0

        # Created lazily in _fit so tslearn is only imported when needed.
        self._tslearn_k_shapes = None

        super().__init__(n_clusters=n_clusters)

    def _fit(self, X: TimeSeriesInstances, y=None) -> None:
        """Fit time series clusterer to training data.

        Parameters
        ----------
        X : np.ndarray (2d or 3d array of shape (n_instances, series_length) or shape
            (n_instances, n_dimensions, series_length))
            Training time series instances to cluster.
        y: ignored, exists for API consistency reasons.

        Returns
        -------
        None
            The fitted results are stored on ``cluster_centers_``, ``labels_``,
            ``inertia_`` and ``n_iter_``.
        """
        from tslearn.clustering import KShape

        if self._tslearn_k_shapes is None:
            self._tslearn_k_shapes = KShape(
                # Honour the requested number of clusters rather than a
                # hard-coded value.
                n_clusters=self.n_clusters,
                max_iter=self.max_iter,
                tol=self.tol,
                random_state=self.random_state,
                n_init=self.n_init,
                verbose=self.verbose,
                init=self.init_algorithm,
            )

        self._tslearn_k_shapes.fit(X)
        # Populate the public attribute initialised in __init__; the private
        # alias is kept for code that already reads it.
        self.cluster_centers_ = self._tslearn_k_shapes.cluster_centers_
        self._cluster_centers = self.cluster_centers_
        self.labels_ = self._tslearn_k_shapes.labels_
        self.inertia_ = self._tslearn_k_shapes.inertia_
        self.n_iter_ = self._tslearn_k_shapes.n_iter_

    def _predict(self, X: TimeSeriesInstances, y=None) -> np.ndarray:
        """Predict the closest cluster each sample in X belongs to.

        Parameters
        ----------
        X : np.ndarray (2d or 3d array of shape (n_instances, series_length) or shape
            (n_instances, n_dimensions, series_length))
            Time series instances to predict their cluster indexes.
        y: ignored, exists for API consistency reasons.

        Returns
        -------
        np.ndarray (1d array of shape (n_instances,))
            Index of the cluster each time series in X belongs to.
        """
        return self._tslearn_k_shapes.predict(X)

    @classmethod
    def get_test_params(cls, parameter_set="default"):
        """Return testing parameter settings for the estimator.

        Parameters
        ----------
        parameter_set : str, default="default"
            Name of the set of test parameters to return, for use in tests. If no
            special parameters are defined for a value, will return `"default"` set.


        Returns
        -------
        params : dict or list of dict, default = {}
            Parameters to create testing instances of the class
            Each dict are parameters to construct an "interesting" test instance, i.e.,
            `MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance.
            `create_test_instance` uses the first (or only) dictionary in `params`
        """
        params = {
            "n_clusters": 2,
            "init_algorithm": "random",
            "n_init": 1,
            "max_iter": 1,
            "tol": 1e-4,
            "verbose": False,
            "random_state": 1,
        }
        return params

    def _score(self, X, y=None):
        """Return the absolute value of the fitted inertia; X and y are unused."""
        return np.abs(self.inertia_)
# Load the test split: column 0 holds the class label, the rest the series.
data_test = np.loadtxt(current_path + file +
                       "ECGFiveDays\\ECGFiveDays_TEST.tsv")
X_test = to_time_series_dataset(data_test[:, 1:])
# The np.int alias was removed in NumPy 1.24; the builtin int is equivalent.
y_test = data_test[:, 0].astype(int)
file = "教師なし教科書\\13章-時系列クラスタリング\\3_ECGFiveDays_k_shape\\result\\"

# Prepare the data - Scale
X_train = TimeSeriesScalerMeanVariance(mu=0., std=1.).fit_transform(X_train)
X_test = TimeSeriesScalerMeanVariance(mu=0., std=1.).fit_transform(X_test)


# k-Shape Algorithm
# Train using k-Shape
ks = KShape(n_clusters=2, max_iter=100, n_init=100, verbose=0)
ks.fit(X_train)

# Make predictions on train set and calculate adjusted Rand index
preds = ks.predict(X_train)
ars = adjusted_rand_score(data_train[:, 0], preds)
print("Adjusted Rand Index:", ars)

# Make predictions on test set and calculate adjusted Rand index
preds_test = ks.predict(X_test)
ars = adjusted_rand_score(data_test[:, 0], preds_test)
print("Adjusted Rand Index on Test Set:", ars)

# Results are poor because the training set is small (train 23, test 861).
# Adjusted Rand Index: 0.668041237113402
# Adjusted Rand Index on Test Set: 0.012338817789874643
Example #7
0
class Kshape():
    """Cluster fixed-length windows of a temperature series with k-Shape.

    Expected input (as stated by the original author):
    ``data`` is a pd.DataFrame with columns [DEVICE_DATETIME, TEMPERATURE],
    where DEVICE_DATETIME is the index, sorted ascending, with one sample
    every 10 seconds.

    Parameters
    ----------
    time_span : int, default 1
        Window length in minutes; each window holds ``time_span * 6`` samples.
    batch : int, default 60
        Number of points every window is resampled to before clustering.
    data : pd.DataFrame or None
        Series to cluster with ``classification``.
    """
    def __init__(
        self,
        time_span=1,
        batch=60,
        data=None,
    ):
        # Six samples per minute at the 10-second sampling rate.
        self.time_span = time_span * 6
        self.data = data
        self.batch = batch
        self.km = KShape(n_clusters=2,
                         max_iter=50,
                         verbose=True,
                         random_state=0)

    def _segment(self, frame):
        """Scale ``frame.TEMPERATURE`` and cut it into resampled windows.

        Returns ``(ts, n_windows, n_used)`` where ``n_used`` is the number of
        leading rows that fill complete windows.
        """
        n_windows = len(frame) // self.time_span
        n_used = self.time_span * n_windows
        used = frame.loc[:frame.index[n_used - 1]]
        values = np.array(used.TEMPERATURE).reshape(1, -1)
        # The whole used range is scaled as one series before being windowed.
        values = TimeSeriesScalerMeanVariance().fit_transform(values)
        values = np.array(values).reshape(n_windows, -1)
        ts = TimeSeriesResampler(sz=self.batch).fit_transform(values)
        return ts, n_windows, n_used

    def _repeat_labels(self, labels):
        """Repeat each window label ``self.time_span`` times, in order."""
        return [label for label in labels for _ in range(self.time_span)]

    def _label_leftover(self, frame, n_used):
        """Predict one label for the rows after the last complete window.

        The leftover rows are resampled to a single series of ``batch`` points.
        Returns ``(values, resampled, labels)`` with one label per leftover row.
        """
        leftover = frame.loc[frame.index[n_used]:]
        values = np.array(leftover.TEMPERATURE).reshape(1, -1)
        resampled = TimeSeriesResampler(sz=self.batch).fit_transform(values)
        # predict returns a one-element array here; index it explicitly since
        # int() on a non-0-d array is deprecated in recent NumPy.
        label = int(self.km.predict(resampled)[0])
        return values, resampled, [label] * values.shape[1]

    def Preprocess(self, x=None):
        """Reshape the data into one resampled series per window.

        With ``x`` omitted, ``self.data`` is processed and the result stored
        in ``self.ts`` (plus ``self.n_data`` / ``self.n_use``); nothing is
        returned. Otherwise ``x`` is processed, ``self.x_data`` /
        ``self.x_use`` are set and the windows are returned.
        """
        if x is None:
            self.ts, self.n_data, self.n_use = self._segment(self.data)
            return None
        ts, self.x_data, self.x_use = self._segment(x)
        return ts

    def classification(self):
        """Cluster ``self.data`` with KShape and add a CLUSTER column.

        Rows left over after the last complete window are resampled with
        TimeSeriesResampler into one extra series and labelled by prediction.
        """
        self.Preprocess()
        self.y_pred = self.km.fit_predict(self.ts)
        # Build the per-row cluster labels.
        self.cluster = self._repeat_labels(self.y_pred)
        # When rows are left over, build a single series from them with the
        # resampler and predict its label.
        if self.n_use != len(self.data):
            self.ts_c, self.ts_batch, self.y_pred_c = self._label_leftover(
                self.data, self.n_use)
            self.cluster.extend(self.y_pred_c)
        self.data["CLUSTER"] = self.cluster

    def draw_graph(self, x=None):
        """Scatter-plot TEMPERATURE against DEVICE_DATETIME coloured by CLUSTER.

        Plots ``self.data`` when ``x`` is omitted, otherwise ``x``.
        """
        frame = self.data if x is None else x
        _, ax = plt.subplots()
        sns.scatterplot(data=frame,
                        x="DEVICE_DATETIME",
                        y="TEMPERATURE",
                        hue="CLUSTER")
        locator = mdates.AutoDateLocator(minticks=4, maxticks=10)
        formatter = mdates.ConciseDateFormatter(locator=locator)
        ax.xaxis.set_major_locator(locator)
        ax.xaxis.set_major_formatter(formatter)
        plt.show()

    def predict(self, x):
        """Label ``x`` with the fitted model, add a CLUSTER column and plot it.

        ``x`` is modified in place; nothing is returned.
        """
        ts = self.Preprocess(x=x)
        cluster = self._repeat_labels(self.km.predict(ts))
        # When rows are left over, build a single series from them with the
        # resampler and predict its label.
        if self.x_use != len(x):
            self.x_c, self.x_batch, leftover_labels = self._label_leftover(
                x, self.x_use)
            cluster.extend(leftover_labels)
        x["CLUSTER"] = cluster
        self.draw_graph(x=x)