Exemple #1
0
    def _transform(self, X, X2=None):
        """Return the pairwise distance/kernel matrix between ``X`` and ``X2``.

        Thin wrapper around ``pairwise_distance``: the metric is taken from
        ``self.metric_key`` and every entry of ``self.kwargs`` is forwarded
        as an additional keyword argument.

        Parameters
        ----------
        X: 3D np.array of shape [num_instances, num_vars, num_time_points]
        X2: 3D np.array of shape [num_instances, num_vars, num_time_points], optional
            Forwarded unchanged, including the default ``None``.
            NOTE(review): presumably ``pairwise_distance`` then compares ``X``
            with itself -- confirm against its documentation.

        Returns
        -------
        distmat: np.array of shape [n, m]
            (i,j)-th entry contains distance/kernel between X[i] and X2[j]
        """
        return pairwise_distance(X, X2, metric=self.metric_key, **self.kwargs)
Exemple #2
0
    def _assign_clusters(
        self, X: np.ndarray, cluster_centres: np.ndarray
    ) -> Tuple[np.ndarray, float]:
        """Label every instance with the index of its nearest cluster centre.

        A distance matrix between the instances and the centres is computed
        with ``self.metric`` (extra arguments come from
        ``self._distance_params``); each instance is then given the index of
        the centre with the smallest distance in its row.

        Parameters
        ----------
        X : np.ndarray (3d array of shape (n_instances, n_dimensions, series_length))
            Time series instances to assign.
        cluster_centres: np.ndarray (3d array of shape
                                        (n_clusters, n_dimensions, series_length))
            Candidate cluster centres.

        Returns
        -------
        np.ndarray (1d array of shape (n_instance,))
            Index of the closest centre for each instance.
        float
            Inertia: the sum, over all instances, of the distance to the
            closest centre. Always returned.
        """
        distances = pairwise_distance(
            X, cluster_centres, metric=self.metric, **self._distance_params
        )
        # Row i holds the distances from instance i to every centre.
        labels = distances.argmin(axis=1)
        inertia = distances.min(axis=1).sum()
        return labels, inertia
Exemple #3
0
def medoids(
    X: np.ndarray,
    precomputed_pairwise_distance: np.ndarray = None,
    distance_metric: str = "dtw",
):
    """Compute the medoid of a panel of time series.

    The medoid is the series in ``X`` whose column of the pairwise distance
    matrix has the smallest sum (for a symmetric matrix this is the series
    with the smallest total distance to all the others).

    Parameters
    ----------
    X : np.ndarray (3d array of shape (n_instances, n_dimensions, series_length))
        Time series to compute the medoid from.
    precomputed_pairwise_distance: np.ndarray (2d array of shape
        (n_instances, n_instances)), defaults = None
        Precomputed pairwise distance between each time series in X. When
        None it is computed with ``pairwise_distance`` and ``distance_metric``.
        If the matrix is larger than (n_instances, n_instances) only its
        top-left (n_instances, n_instances) corner is used.
    distance_metric: str, defaults = 'dtw'
        String of distance metric to compute. Only used when no precomputed
        matrix is supplied.

    Returns
    -------
    np.ndarray (2d array of shape (n_dimensions, series_length))
        The time series that is the medoid. If ``X`` contains no instances,
        ``X`` itself is returned unchanged.

    Raises
    ------
    IndexError
        If the distance matrix is not 2d or is smaller than
        (n_instances, n_instances).
    """
    n_instances = X.shape[0]
    if n_instances < 1:
        return X

    if precomputed_pairwise_distance is None:
        precomputed_pairwise_distance = pairwise_distance(X, metric=distance_metric)

    # Restrict to the entries that belong to the instances of X; slicing
    # replaces an element-by-element Python copy of the same sub-matrix.
    distances = np.asarray(precomputed_pairwise_distance, dtype=float)
    distances = distances[:n_instances, :n_instances]
    if distances.shape != (n_instances, n_instances):
        raise IndexError(
            "precomputed_pairwise_distance must be a 2d array of at least shape "
            f"({n_instances}, {n_instances}), got shape "
            f"{np.shape(precomputed_pairwise_distance)}."
        )

    # Column sums: total distance accumulated against each candidate series.
    return X[np.argmin(distances.sum(axis=0))]
Exemple #4
0
    def _fit(self, X: np.ndarray, y=None) -> np.ndarray:
        """Fit the time series clusterer to the training data.

        When ``self.averaging_method`` equals ``"dba"`` the pairwise distance
        matrix of ``X`` is computed up front (metric
        ``self._dba_medoids_distance_metric``, extra arguments from
        ``self._average_params``) and stored on
        ``self._precomputed_pairwise``. Fitting itself is delegated to the
        parent class.

        Parameters
        ----------
        X : np.ndarray (2d or 3d array of shape (n_instances, series_length) or shape
            (n_instances, n_dimensions, series_length))
            Training time series instances to cluster.
        y: ignored, exists for API consistency reasons.

        Returns
        -------
        self:
            Whatever the parent ``_fit`` returns (documented as the fitted
            estimator).
        """
        uses_dba = self.averaging_method == "dba"
        if uses_dba:
            medoid_metric = self._dba_medoids_distance_metric
            self._precomputed_pairwise = pairwise_distance(
                X, metric=medoid_metric, **self._average_params
            )
        return super()._fit(X, y)
Exemple #5
0
def _kmeans_plus_plus(
    X: np.ndarray,
    n_clusters: int,
    random_state: np.random.RandomState,
    distance_metric: str = "euclidean",
    n_local_trials: int = None,
    distance_params: dict = None,
    **kwargs,
):
    """Compute initial centroids using kmeans++ method.

    This works by choosing one point at random. Next compute the distance between the
    center and each point. Sample these with a probability proportional to the square
    of the distance of the points from its nearest center.

    NOTE: This is adapted from sklearns implementation:
    https://
    github.com/scikit-learn/scikit-learn/blob/7e1e6d09b/sklearn/cluster/_kmeans.py

    Parameters
    ----------
    X : np.ndarray (3d array of shape (n_instances,n_dimensions,series_length))
        Time series instances to cluster.
    n_clusters: int
        The number of clusters to form as well as the number of
        centroids to generate.
    random_state: np.random.RandomState
        Determines random number generation for centroid initialization.
    distance_metric: str, defaults = 'euclidean'
        String that is the distance metric.
    n_local_trials: integer, optional
        The number of seeding trials for each center (except the first),
        of which the one reducing inertia the most is greedily chosen.
        Set to None to make the number of trials depend logarithmically
        on the number of seeds (2+log(k)); this is the default.
    distance_params: dict, defaults = None
        Dictionary containing distance parameter kwargs. Forwarded to every
        ``pairwise_distance`` call made during seeding; None means no extra
        arguments.
    **kwargs
        Accepted for call-signature compatibility; not used.

    Returns
    -------
    np.ndarray (3d array of shape (n_clusters, n_dimensions, series_length))
        The chosen cluster centers (copies of the selected series of X).
    """
    n_instances, n_dimensions, series_length = X.shape
    centers = np.empty((n_clusters, n_dimensions, series_length), dtype=X.dtype)

    if n_local_trials is None:
        n_local_trials = 2 + int(np.log(n_clusters))

    if distance_params is None:
        distance_params = {}

    # First center: one instance picked uniformly at random.
    center_id = random_state.randint(n_instances)
    centers[0] = X[center_id]
    closest_dist_sq = (
        pairwise_distance(
            centers[0, np.newaxis], X, metric=distance_metric, **distance_params
        )
        ** 2
    )
    current_pot = closest_dist_sq.sum()

    for c in range(1, n_clusters):
        # Draw candidates with probability proportional to the squared
        # distance to the closest center chosen so far.
        rand_vals = random_state.random_sample(n_local_trials) * current_pot
        candidate_ids = np.searchsorted(stable_cumsum(closest_dist_sq), rand_vals)
        # Guard against searchsorted returning one-past-the-end.
        np.clip(candidate_ids, None, closest_dist_sq.size - 1, out=candidate_ids)

        distance_to_candidates = (
            pairwise_distance(
                X[candidate_ids], X, metric=distance_metric, **distance_params
            )
            ** 2
        )

        # For each candidate: closest squared distance of every instance if
        # that candidate were added (written in place, one row per candidate).
        np.minimum(closest_dist_sq, distance_to_candidates, out=distance_to_candidates)
        candidates_pot = distance_to_candidates.sum(axis=1)

        # Keep the candidate that leaves the smallest total potential.
        best_candidate = np.argmin(candidates_pot)
        current_pot = candidates_pot[best_candidate]
        closest_dist_sq = distance_to_candidates[best_candidate]
        best_candidate = candidate_ids[best_candidate]

        centers[c] = X[best_candidate]

    return centers
Exemple #6
0
    def _fit_one_init(self, X) -> Tuple[np.ndarray, np.ndarray, float, int]:
        """Perform one pass of kmeans.

        The initial center assignment greatly affects the final result, so
        multiple passes of kmeans are run with different initial centers and
        the best result is kept going forward. This method runs a single
        such pass: initialise the centers, then alternate assignment and
        center update until a stopping rule fires or ``self.max_iter``
        iterations have run.

        Parameters
        ----------
        X : np.ndarray (2d or 3d array of shape (n_instances, series_length) or shape
            (n_instances, n_dimensions, series_length))
            Training time series instances to cluster.

        Returns
        -------
        np.ndarray (1d array of shape (n_instance,))
            Labels that is the index each time series belongs to.
        np.ndarray (3d array of shape (n_clusters, n_dimensions,
            series_length))
            Time series that represent each of the cluster centers.
        float
            Inertia of the final assignment, as returned by
            ``self._assign_clusters``.
        int
            Number of iterations that were started; 0 when ``self.max_iter``
            is not positive.
        """
        cluster_centers = self._init_algorithm(X, self.n_clusters,
                                               self._random_state)

        old_inertia = np.inf
        old_labels = None
        # Tracked explicitly so the return value is defined even when the
        # loop body never runs (max_iter <= 0).
        n_iter = 0
        for i in range(self.max_iter):
            n_iter = i + 1
            labels, inertia = self._assign_clusters(
                X,
                cluster_centers,
            )

            # Stop once the inertia no longer changes by at least tol.
            if np.abs(old_inertia - inertia) < self.tol:
                break
            old_inertia = inertia

            if np.array_equal(labels, old_labels):
                if self.verbose:
                    print(  # noqa: T001
                        f"Converged at iteration {i}: strict convergence.")
                break
            elif old_labels is not None:
                # No strict convergence, check for tol based convergence.
                # NOTE(review): this measures the distance between the two
                # label vectors, not between old and new centers -- confirm
                # that this is the intended convergence criterion.
                label_shift = pairwise_distance(
                    labels,
                    old_labels,
                    metric=self.metric,
                    **self._distance_params).sum()
                if label_shift <= self.tol:
                    if self.verbose:
                        print(  # noqa: T001
                            f"Converged at iteration {i}: inertia "
                            f"{inertia} within tolerance {self.tol}.")
                    break
            old_labels = labels

            cluster_centers = self._compute_new_cluster_centers(X, labels)

            if self.verbose:
                print(f"Iteration {i}, inertia {inertia}.")  # noqa: T001

        # Re-assign so the returned labels and inertia match the final centers.
        labels, inertia = self._assign_clusters(X, cluster_centers)

        return labels, cluster_centers, inertia, n_iter