Ejemplo n.º 1
0
        def merge_crossover(ind1, ind2):
            """Merge shapelets from one set with shapelets from the other.

            Each shapelet in ``ind1`` is replaced by the NaN-stripped
            euclidean barycenter of itself and its most similar shapelet
            from ``ind2`` (and symmetrically for ``ind2``), where
            similarity is measured with the Global Alignment Kernel.
            Entries equal to 1.0 are treated as "identical" and excluded
            when picking the most similar partner.

            Returns the mutated ``(ind1, ind2)`` pair.
            """
            # Construct a pairwise similarity matrix using GAK; the
            # bandwidth is estimated from the union of both sets.
            _all = list(ind1) + list(ind2)
            similarity_matrix = cdist_gak(ind1, ind2, sigma=sigma_gak(_all))

            # Iterate over shapelets in `ind1` and merge them with shapelets
            # from `ind2`.
            for row_idx in range(similarity_matrix.shape[0]):
                # Remove all elements equal to 1.0
                mask = similarity_matrix[row_idx, :] != 1.0
                non_equals = similarity_matrix[row_idx, :][mask]
                if len(non_equals):
                    # BUG FIX: np.argmax(non_equals) indexes the *masked*
                    # array, not the original row, so whenever a 1.0 entry
                    # precedes the maximum the wrong shapelet was picked.
                    # Map the argmax back to the original column index.
                    max_col_idx = np.flatnonzero(mask)[np.argmax(non_equals)]
                    ts1 = list(ind1[row_idx])  # list() already copies
                    ts2 = list(ind2[max_col_idx])
                    # Merge them and remove NaN padding.
                    ind1[row_idx] = euclidean_barycenter([ts1, ts2])
                    ind1[row_idx] = ind1[row_idx][~np.isnan(ind1[row_idx])]

            # Apply the same for the elements in ind2
            for col_idx in range(similarity_matrix.shape[1]):
                mask = similarity_matrix[:, col_idx] != 1.0
                non_equals = similarity_matrix[:, col_idx][mask]
                if len(non_equals):
                    # Same masked-index fix, applied to the row dimension.
                    max_row_idx = np.flatnonzero(mask)[np.argmax(non_equals)]
                    ts1 = list(ind1[max_row_idx])
                    ts2 = list(ind2[col_idx])
                    ind2[col_idx] = euclidean_barycenter([ts1, ts2])
                    ind2[col_idx] = ind2[col_idx][~np.isnan(ind2[col_idx])]

            return ind1, ind2
Ejemplo n.º 2
0
    def fit(self, train):
        """Fit the configured k-means clustering model on training data.

        parameters:
            train                : training time series
        """
        shared_kwargs = dict(
            n_clusters=self.n_clusters,
            n_init=20,
            verbose=True,
            random_state=self.random_seed,
        )
        if self.algorithm == 'TimeSeriesKMeans':
            model = TimeSeriesKMeans(**shared_kwargs)
        else:
            # Kernel k-means with GAK needs a bandwidth estimated from the data.
            model = GlobalAlignmentKernelKMeans(sigma=sigma_gak(train),
                                                **shared_kwargs)
        self.km = model
        self.km.fit(train)
Ejemplo n.º 3
0
    def fit(self, train):
        """Fit a k-means clustering model on training data and return
        the cluster assignment of each training series.

        parameters:
            train                : training time series
        """
        if self.algorithm == "TimeSeriesKMeans":
            model = TimeSeriesKMeans(
                n_clusters=self.n_clusters,
                n_init=20,
                verbose=True,
                random_state=self.random_seed,
            )
        else:
            # GAK kernel k-means: estimate the kernel bandwidth from the data.
            model = GlobalAlignmentKernelKMeans(
                n_clusters=self.n_clusters,
                sigma=sigma_gak(train),
                n_init=20,
                verbose=True,
                random_state=self.random_seed,
            )
        self.km = model
        return model.fit_predict(train)
Ejemplo n.º 4
0
    def fit(self, X, y=None, sample_weight=None):
        """Compute kernel k-means clustering.

        Parameters
        ----------
        X : array-like of shape=(n_ts, sz, d)
            Time series dataset.

        y
            Ignored
        sample_weight : array-like of shape=(n_ts, ) or None (default: None)
            Weights to be given to time series in the learning process. By
            default, all time series weights are equal.
        """
        # `sigma` as a constructor parameter is deprecated; warn any caller
        # that changed it from its default of 1.0.
        if self.sigma != 1.:
            warnings.warn(
                "Setting `sigma` directly as a parameter for KernelKMeans "
                "and GlobalAlignmentKernelKMeans is deprecated in version "
                "0.4 and will be removed in 0.6. Use `kernel_params` "
                "instead.",
                DeprecationWarning,
                stacklevel=2)

        # Validate input; NaNs are allowed (force_all_finite=False),
        # presumably for variable-length series padding — check_dims then
        # normalizes to the (n_ts, sz, d) layout described in the docstring.
        X = check_array(X, allow_nd=True, force_all_finite=False)
        X = check_dims(X)

        sample_weight = _check_sample_weight(sample_weight=sample_weight, X=X)

        # Allow a few extra restarts so runs aborted by an empty cluster
        # (caught below) can be retried.
        max_attempts = max(self.n_init, 10)
        kernel_params = self._get_kernel_params()
        if self.kernel == "gak":
            # Resolve the GAK bandwidth: "auto" means estimate it from X.
            self.sigma_gak_ = kernel_params.get("sigma", 1.)
            if self.sigma_gak_ == "auto":
                self.sigma_gak_ = sigma_gak(X)
        else:
            self.sigma_gak_ = None

        # Reset fitted state before the restart loop below.
        self.labels_ = None
        self.inertia_ = None
        self.sample_weight_ = None
        self._X_fit = None
        # n_iter_ will contain the number of iterations the most
        # successful run required.
        self.n_iter_ = 0

        n_samples = X.shape[0]
        # Precompute the Gram matrix once; every restart reuses it.
        K = self._get_kernel(X)
        sw = (sample_weight
              if sample_weight is not None else numpy.ones(n_samples))
        self.sample_weight_ = sw
        rs = check_random_state(self.random_state)

        # Run up to `n_init` successful initializations (capped at
        # `max_attempts` total tries) and keep the one with lowest inertia.
        last_correct_labels = None
        min_inertia = numpy.inf
        n_attempts = 0
        n_successful = 0
        while n_successful < self.n_init and n_attempts < max_attempts:
            try:
                if self.verbose and self.n_init > 1:
                    print("Init %d" % (n_successful + 1))
                n_attempts += 1
                self._fit_one_init(K, rs)
                if self.inertia_ < min_inertia:
                    last_correct_labels = self.labels_
                    min_inertia = self.inertia_
                    self.n_iter_ = self._iter
                n_successful += 1
            except EmptyClusterError:
                # This attempt produced an empty cluster; it does not count
                # as successful — loop around and try a fresh init.
                if self.verbose:
                    print("Resumed because of empty cluster")
        if n_successful > 0:
            # Expose the best run's result and remember the fitted data.
            self.labels_ = last_correct_labels
            self.inertia_ = min_inertia
            self._X_fit = X
        return self
Ejemplo n.º 5
0
from tslearn.clustering import GlobalAlignmentKernelKMeans
from tslearn.metrics import sigma_gak, cdist_gak
from tslearn.datasets import CachedDatasets
from tslearn.preprocessing import TimeSeriesScalerMeanVariance

# tslearn: cluster the "Trace" dataset with GAK kernel k-means.
seed = 0
np.random.seed(seed)  # fixed seed so the shuffle below is reproducible
X_train, y_train, X_test, y_test = CachedDatasets().load_dataset("Trace")
X_train = X_train[y_train < 4]  # keep classes 1-3 only
np.random.shuffle(X_train)

# Standardize each series to zero mean / unit variance; keep 50 series.
X_train = TimeSeriesScalerMeanVariance().fit_transform(X_train[:50])
sz = X_train.shape[1]

# Kernel k-means with the Global Alignment Kernel; the kernel bandwidth
# (sigma) is estimated from the training data itself.
gak_km = GlobalAlignmentKernelKMeans(n_clusters=3,
                                     sigma=sigma_gak(X_train),
                                     n_init=20,
                                     verbose=True,
                                     random_state=seed)
y_pred = gak_km.fit_predict(X_train)

# One subplot per cluster, every member series drawn in black.
plt.figure()
for cluster in range(3):
    plt.subplot(3, 1, cluster + 1)
    for series in X_train[y_pred == cluster]:
        plt.plot(series.ravel(), "k-")
    plt.xlim(0, sz)
    plt.ylim(-4, 4)
    plt.title("Cluster %d" % (cluster + 1))

plt.tight_layout()

plt.show()
Ejemplo n.º 6
0
from tslearn.clustering import GlobalAlignmentKernelKMeans

# Load the HUM subsequences (CSV with a header row) and report the raw shape.
hum_sub = np.loadtxt('../../HUM_subs.csv', delimiter=',', skiprows=1)
print(hum_sub.shape)

# Convert to a tslearn dataset and standardize each series.
X = to_time_series_dataset(hum_sub)
print(X.shape)
X = TimeSeriesScalerMeanVariance().fit_transform(X)
sz = X.shape[1]

seed = 0
np.random.seed(seed)

# GAK kernel k-means with 4 clusters; bandwidth estimated from X.
nclust = 4
gak_km = GlobalAlignmentKernelKMeans(n_clusters=nclust,
                                     sigma=sigma_gak(X),
                                     n_init=20,
                                     verbose=True,
                                     random_state=seed)
y_pred = gak_km.fit_predict(X)

print(gak_km.inertia_)
print(y_pred + 1)  # report 1-based cluster labels

# One subplot per cluster, member series drawn in black.
plt.figure()
for cluster_id in range(nclust):
    plt.subplot(nclust, 1, cluster_id + 1)
    for series in X[y_pred == cluster_id]:
        plt.plot(series.ravel(), "k-")
    plt.xlim(0, sz)
    plt.ylim(-4, 4)
Ejemplo n.º 7
0
import numpy
import matplotlib.pyplot as plt

from tslearn.clustering import GlobalAlignmentKernelKMeans
from tslearn.metrics import sigma_gak, cdist_gak
from tslearn.datasets import CachedDatasets
from tslearn.preprocessing import TimeSeriesScalerMeanVariance

# Reproducible shuffle of the "Trace" dataset, restricted to 3 classes.
seed = 0
numpy.random.seed(seed)
X_train, y_train, X_test, y_test = CachedDatasets().load_dataset("Trace")
X_train = X_train[y_train < 4]  # Keep first 3 classes
numpy.random.shuffle(X_train)
# Standardize and keep only 50 time series.
X_train = TimeSeriesScalerMeanVariance().fit_transform(X_train[:50])
sz = X_train.shape[1]

# GAK kernel k-means; bandwidth estimated from the training set.
gak_km = GlobalAlignmentKernelKMeans(n_clusters=3,
                                     sigma=sigma_gak(X_train),
                                     n_init=20,
                                     verbose=True,
                                     random_state=seed)
y_pred = gak_km.fit_predict(X_train)

# Plot each cluster's members in its own subplot.
plt.figure()
for cluster in range(3):
    plt.subplot(3, 1, cluster + 1)
    for series in X_train[y_pred == cluster]:
        plt.plot(series.ravel(), "k-")
    plt.xlim(0, sz)
    plt.ylim(-4, 4)
    plt.title("Cluster %d" % (cluster + 1))

plt.tight_layout()
plt.show()
from tslearn.clustering import GlobalAlignmentKernelKMeans
from tslearn.metrics import sigma_gak, cdist_gak
from tslearn.datasets import CachedDatasets
from tslearn.preprocessing import TimeSeriesScalerMeanVariance

# Reproducible shuffle of the "Trace" dataset, restricted to 3 classes.
seed = 0
numpy.random.seed(seed)
X_train, y_train, X_test, y_test = CachedDatasets().load_dataset("Trace")
X_train = X_train[y_train < 4]  # Keep first 3 classes
numpy.random.shuffle(X_train)
# Standardize; keep only 50 time series.
X_train = TimeSeriesScalerMeanVariance().fit_transform(X_train[:50])
sz = X_train.shape[1]

# GAK kernel k-means; bandwidth estimated from the training set.
gak_km = GlobalAlignmentKernelKMeans(n_clusters=3,
                                     sigma=sigma_gak(X_train),
                                     n_init=20,
                                     verbose=True,
                                     random_state=seed)
y_pred = gak_km.fit_predict(X_train)

# Plot each cluster's members in its own subplot (figure is laid out but
# not shown here — no plt.show() in this snippet).
plt.figure()
for cluster in range(3):
    plt.subplot(3, 1, cluster + 1)
    for series in X_train[y_pred == cluster]:
        plt.plot(series.ravel(), "k-")
    plt.xlim(0, sz)
    plt.ylim(-4, 4)
    plt.title("Cluster %d" % (cluster + 1))

plt.tight_layout()