Ejemplo n.º 1
0
def test_kmeans():
    """Test implementation of Kmeans."""
    X_train, y_train = load_basic_motions(split="train")
    X_test, y_test = load_basic_motions(split="test")

    clusterer = TimeSeriesKMeans(
        averaging_method="mean",
        random_state=1,
        n_init=2,
        n_clusters=4,
        init_algorithm="kmeans++",
        metric="euclidean",
    )

    # Fit on the training split and score agreement with the true labels.
    train_preds = clusterer.fit_predict(X_train)
    train_score = metrics.rand_score(y_train, train_preds)

    # Score the held-out test split.
    test_preds = clusterer.predict(X_test)
    test_score = metrics.rand_score(y_test, test_preds)
    proba = clusterer.predict_proba(X_test)

    assert np.array_equal(test_preds, expected_results["mean"])
    assert test_score == expected_score["mean"]
    assert train_score == expected_train_result["mean"]
    assert clusterer.n_iter_ == expected_iters["mean"]
    assert np.array_equal(clusterer.labels_, expected_labels["mean"])
    assert isinstance(clusterer.cluster_centers_, np.ndarray)
    assert proba.shape == (40, 4)

    # Hard k-means assignment: each probability row is one-hot.
    for row in proba:
        assert np.count_nonzero(row == 1.0) == 1
Ejemplo n.º 2
0
def time_clusterers():
    """Time tests for clusterers."""
    # Configure k-means: 5 centers, forgy initialisation, at most 10
    # refinement iterations, DTW distance with mean averaging.
    clusterer = TimeSeriesKMeans(
        n_clusters=5,
        init_algorithm="forgy",
        max_iter=10,
        metric="dtw",
        averaging_method="mean",
        random_state=1,
    )
    X_train, y_train = load_arrow_head(split="train")
    X_test, y_test = load_arrow_head(split="test")
    clusterer.fit(X_train)
    plot_cluster_algorithm(clusterer, X_test, clusterer.n_clusters)
Ejemplo n.º 3
0
def test_run_clustering_experiment(tmp_path):
    """Test running and saving results for clustering.

    Currently it just checks the files have been created, then deletes them.
    """
    dataset = "UnitTest"
    train_X, train_Y = load_unit_test("TRAIN")
    test_X, test_Y = load_unit_test("TEST")
    run_clustering_experiment(
        train_X,
        TimeSeriesKMeans(n_clusters=2),
        results_path=tmp_path,
        trainY=train_Y,
        testX=test_X,
        testY=test_Y,
        cls_name="kmeans",
        dataset_name=dataset,
        resample_id=0,
    )
    # Results are written under <clusterer>/Predictions/<dataset>/.
    result_files = [
        tmp_path.joinpath(f"kmeans/Predictions/{dataset}/testResample0.csv"),
        tmp_path.joinpath(f"kmeans/Predictions/{dataset}/trainResample0.csv"),
    ]
    for path in result_files:
        assert path.is_file()
    # Remove the files so repeated runs start clean.
    for path in result_files:
        path.unlink()
Ejemplo n.º 4
0
def tune_window(metric: str, train_X):
    """Tune the distance window size using the Davies-Bouldin index.

    Fits a ``TimeSeriesKMeans`` for each window in [0, 1) with step 0.1 and
    keeps the window whose clustering scores best on the training data.

    Parameters
    ----------
    metric : str
        Distance metric name passed to ``TimeSeriesKMeans``.
    train_X
        Training data accepted by the clusterer and by
        ``davies_bouldin_score``.

    Returns
    -------
    float
        The window size achieving the best (lowest) Davies-Bouldin score.
    """
    best_w = 0
    # BUG FIX: the Davies-Bouldin index is *minimised* for better
    # clusterings (0 is the ideal value), so track the lowest score seen
    # instead of the highest — the original kept the *worst* window.
    best_score = float("inf")
    for w in np.arange(0, 1, 0.1):
        cls = TimeSeriesKMeans(metric=metric, distance_params={"window": w})
        cls.fit(train_X)
        preds = cls.predict(train_X)
        print(" Preds type = ", type(preds))
        score = davies_bouldin_score(train_X, preds)
        print(score)
        if score < best_score:
            best_score = score
            best_w = w
    print("best window =", best_w, " with score ", best_score)
    return best_w
Ejemplo n.º 5
0
def config_clusterer(clusterer: str, **kwargs):
    """Configure a clusterer by name.

    Parameters
    ----------
    clusterer : str
        Either ``"kmeans"`` or ``"kmedoids"``.
    **kwargs
        Keyword arguments forwarded to the clusterer constructor.

    Returns
    -------
    The configured clusterer instance.

    Raises
    ------
    ValueError
        If ``clusterer`` is not a recognised name.
    """
    if clusterer == "kmeans":
        return TimeSeriesKMeans(**kwargs)
    if clusterer == "kmedoids":
        return TimeSeriesKMedoids(**kwargs)
    # BUG FIX: an unknown name previously fell through to ``return cls``
    # and raised an opaque UnboundLocalError; fail explicitly instead.
    raise ValueError(f"Unknown clusterer: {clusterer}")
Ejemplo n.º 6
0
def debug_clusterers():
    """Debug clusterers."""
    X_train, y_train = load_basic_motions(split="train", return_type="numpy3d")
    #    X_train, y_train = load_unit_test(split="train", return_type="numpy3d")
    #   X_train2, y_train2 = load_unit_test(split="train", return_type="numpy2d")
    params = {"window": 1.0, "epsilon": 50.0, "g": 0.05, "c": 1.0}
    for metric_name in distances:
        model = TimeSeriesKMeans(
            averaging_method="mean",
            random_state=1,
            n_init=2,
            n_clusters=2,
            init_algorithm="kmeans++",
            metric=metric_name,
            distance_params=params,
        )
        model.fit(X_train)
        preds = model.predict(X_train)
        rand = metrics.rand_score(y_train, preds)
        # Emit a ready-to-paste dict entry: "<metric>": <score>,
        print('"' + metric_name + '": ' + str(rand) + ",")
Ejemplo n.º 7
0
def test_kmeans_dba():
    """Test implementation of Kmeans using dba."""
    X_train, y_train = load_basic_motions(split="train")
    X_test, y_test = load_basic_motions(split="test")

    n_values = 5

    clusterer = TimeSeriesKMeans(
        averaging_method="dba",
        random_state=1,
        n_init=2,
        n_clusters=4,
        init_algorithm="kmeans++",
        metric="dtw",
        distance_params={"window": 0.2},
        average_params={"window": 0.2},
    )

    # Fit and score on a small slice to keep DBA averaging fast.
    train_preds = clusterer.fit_predict(X_train.head(n_values))
    train_score = metrics.rand_score(y_train[:n_values], train_preds)

    test_preds = clusterer.predict(X_test.head(n_values))
    test_score = metrics.rand_score(y_test[:n_values], test_preds)
    proba = clusterer.predict_proba(X_test.head(n_values))

    assert np.array_equal(test_preds, expected_results["dba"])
    assert test_score == expected_score["dba"]
    assert train_score == expected_train_result["dba"]
    assert clusterer.n_iter_ == expected_iters["dba"]
    assert np.array_equal(clusterer.labels_, expected_labels["dba"])
    assert isinstance(clusterer.cluster_centers_, np.ndarray)
    assert proba.shape == (5, 4)

    # Hard assignment: exactly one probability of 1.0 per sample.
    for row in proba:
        assert np.count_nonzero(row == 1.0) == 1
Ejemplo n.º 8
0
def set_clusterer(cls, resample_id=None):
    """Construct a clusterer.

    Basic way of creating the clusterer to build using the default settings.
    This set up is to help with batch jobs for multiple problems to facilitate
    easy reproducibility through run_clustering_experiment. You can set up
    bespoke clusterers and pass them to run_clustering_experiment if you
    prefer. It also serves to illustrate the base clusterer parameters.

    Parameters
    ----------
    cls : str
        indicating which clusterer you want
    resample_id : int or None, default = None
        clusterer random seed

    Return
    ------
    A clusterer.
    """
    name = cls.lower()
    # Distance based
    if name in ("kmeans", "k-means"):
        return TimeSeriesKMeans(
            n_clusters=5,
            max_iter=50,
            averaging_algorithm="mean",
            random_state=resample_id,
        )
    if name in ("kmedoids", "k-medoids"):
        return TimeSeriesKMedoids(
            n_clusters=5,
            max_iter=50,
            averaging_algorithm="mean",
            random_state=resample_id,
        )
    raise Exception("UNKNOWN CLUSTERER")
Ejemplo n.º 9
0
# -*- coding: utf-8 -*-
"""Clustering usage tests and examples."""
import numpy as np

from sktime.clustering.k_means import TimeSeriesKMeans
from sktime.clustering.k_medoids import TimeSeriesKMedoids
from sktime.datasets import load_arrow_head


def form_cluster_list(clusters, n) -> np.ndarray:
    """Flatten cluster membership lists into a flat label vector.

    Parameters
    ----------
    clusters : sequence of sequences of int
        ``clusters[i]`` holds the indices of the series assigned to cluster i.
    n : int
        Total number of series (length of the returned vector).

    Returns
    -------
    np.ndarray
        Array of length ``n`` where entry j is the cluster index of series j
        (0.0 for any index not listed in ``clusters``).
    """
    # FIX: annotate the return as np.ndarray — ``np.array`` is a factory
    # function, not a type, so the old annotation was meaningless.
    preds = np.zeros(n)
    for cluster_idx, members in enumerate(clusters):
        # Fancy indexing assigns the whole cluster in one vectorised step.
        preds[list(members)] = cluster_idx
    return preds


if __name__ == "__main__":
    clusterer1 = TimeSeriesKMeans(n_clusters=5, max_iter=50, averaging_algorithm="mean")
    clusterer2 = TimeSeriesKMedoids()
    X, y = load_arrow_head(return_X_y=True)
    clusterer1.fit(X)
    c = clusterer1.predict(X)
    x = form_cluster_list(c, len(y))
    for i in range(len(x)):
        print(i, " is in cluster ", x[i])
Ejemplo n.º 10
0
 test_X = s.fit_transform(test_X.T)
 test_X = test_X.T
 if tune:
     window = tune_window(distance, train_X)
     name = clusterer + "-" + distance + "-tuned"
 else:
     name = clusterer + "-" + distance
 if (distance == "wdtw" or distance == "dwdtw" or distance == "dtw"
         or distance == "wdtw"):
     parameters = {"window": 0.2, "epsilon": 0.05, "g": 0.05, "c": 1}
 else:
     parameters = {"window": 1.0, "epsilon": 0.05, "g": 0.05, "c": 1}
 clst = TimeSeriesKMeans(
     averaging_method="dba",
     average_params={"averaging_distance_metric": distance},
     metric=distance,
     distance_params=parameters,
     n_clusters=len(set(train_Y)),
     random_state=resample + 1,
 )
 run_clustering_experiment(
     train_X,
     clst,
     results_path=results_dir,
     trainY=train_Y,
     testX=test_X,
     testY=test_Y,
     cls_name=name,
     dataset_name=dataset,
     resample_id=resample,
     overwrite=True,
 )