def _transform(self, X, X2=None):
    """Compute the pairwise distance/kernel matrix between X and X2.

    The computation is delegated to ``pairwise_distance``, using the metric
    stored in ``self.metric_key`` and the extra keyword arguments stored in
    ``self.kwargs``. No filtering or conversion of the inputs is done in
    this method itself.

    Parameters
    ----------
    X : 3D np.array of shape [num_instances, num_vars, num_time_points]
        First collection of series.
    X2 : 3D np.array of shape [num_instances, num_vars, num_time_points],
        optional, default=None
        Second collection of series. It is forwarded unchanged, so ``None``
        is handled by ``pairwise_distance`` (presumably as ``X2 = X``).

    Returns
    -------
    distmat : np.array of shape [n, m]
        (i, j)-th entry contains the distance/kernel between X[i] and X2[j].
    """
    return pairwise_distance(X, X2, metric=self.metric_key, **self.kwargs)
def _assign_clusters(
    self, X: np.ndarray, cluster_centres: np.ndarray
) -> Tuple[np.ndarray, float]:
    """Assign every instance to its nearest cluster centre.

    The distance from each instance to each centre is computed with
    ``self.metric`` (plus ``self._distance_params``). Each instance is then
    labelled with the index of the centre at the smallest distance.

    Parameters
    ----------
    X : np.ndarray (3d array of shape (n_instances, n_dimensions,
        series_length))
        Time series instances to assign to clusters.
    cluster_centres : np.ndarray (3d array of shape (n_clusters,
        n_dimensions, series_length))
        Cluster centres to assign to.

    Returns
    -------
    np.ndarray (1d array of shape (n_instances,))
        Index of the closest cluster centre for each instance.
    float
        Inertia of the assignment: the sum over all instances of the
        distance to their closest centre.
    """
    distances_to_centres = pairwise_distance(
        X, cluster_centres, metric=self.metric, **self._distance_params
    )
    # Row i holds the distances from instance i to every centre.
    nearest_centre = distances_to_centres.argmin(axis=1)
    inertia = distances_to_centres.min(axis=1).sum()
    return nearest_centre, inertia
def medoids(
    X: np.ndarray,
    precomputed_pairwise_distance: np.ndarray = None,
    distance_metric: str = "dtw",
):
    """Compute the medoid from a panel of time series.

    The medoid is the series of ``X`` whose column of the pairwise distance
    matrix has the smallest sum, i.e. the series with the smallest total
    distance from all the series in the panel.

    Parameters
    ----------
    X : np.ndarray (3d array of shape (n_instances, n_dimensions,
        series_length))
        Time series to compute the medoid from.
    precomputed_pairwise_distance : np.ndarray (2d array of shape
        (n_instances, n_instances)), defaults = None
        Precomputed pairwise distance between each time series in X. When
        None it is computed with ``pairwise_distance`` and
        ``distance_metric``.
    distance_metric : str, defaults = 'dtw'
        String of distance metric to compute. Only used when no precomputed
        matrix is supplied.

    Returns
    -------
    np.ndarray (2d array of shape (n_dimensions, series_length))
        The time series that is the medoid. If ``X`` contains no instances,
        ``X`` itself is returned unchanged.
    """
    n_instances = X.shape[0]
    if n_instances < 1:
        return X

    if precomputed_pairwise_distance is None:
        precomputed_pairwise_distance = pairwise_distance(X, metric=distance_metric)

    # Only the leading (n_instances, n_instances) corner is relevant; summing
    # over axis 0 gives, for every candidate k, the total distance from all
    # series to series k (one vectorised pass instead of an element-wise copy).
    distance_matrix = np.asarray(precomputed_pairwise_distance, dtype=float)[
        :n_instances, :n_instances
    ]
    return X[np.argmin(distance_matrix.sum(axis=0))]
def _fit(self, X: np.ndarray, y=None) -> np.ndarray:
    """Fit the time series clusterer to training data.

    When the averaging method is ``"dba"``, the pairwise distance matrix of
    ``X`` is computed once up front and stored on
    ``self._precomputed_pairwise`` before the parent class fit is run.

    Parameters
    ----------
    X : np.ndarray (2d or 3d array of shape (n_instances, series_length) or
        shape (n_instances, n_dimensions, series_length))
        Training time series instances to cluster.
    y : ignored, exists for API consistency reasons.

    Returns
    -------
    self:
        Fitted estimator (whatever the parent ``_fit`` returns).
    """
    uses_dba = self.averaging_method == "dba"
    if uses_dba:
        medoid_metric = self._dba_medoids_distance_metric
        self._precomputed_pairwise = pairwise_distance(
            X, metric=medoid_metric, **self._average_params
        )
    return super()._fit(X, y)
def _kmeans_plus_plus(
    X: np.ndarray,
    n_clusters: int,
    random_state: np.random.RandomState,
    distance_metric: str = "euclidean",
    n_local_trials: int = None,
    distance_params: dict = None,
    **kwargs,
):
    """Compute initial centroids using the kmeans++ method.

    This works by choosing one point at random. Next compute the distance
    between the center and each point. Sample these with a probability
    proportional to the square of the distance of the points from its
    nearest center.

    NOTE: This is adapted from sklearns implementation:
    https://
    github.com/scikit-learn/scikit-learn/blob/7e1e6d09b/sklearn/cluster/_kmeans.py

    Parameters
    ----------
    X : np.ndarray (3d array of shape (n_instances, n_dimensions,
        series_length))
        Time series instances to cluster.
    n_clusters : int
        The number of clusters to form as well as the number of centroids
        to generate.
    random_state : np.random.RandomState
        Determines random number generation for centroid initialization.
    distance_metric : str, defaults = 'euclidean'
        String that is the distance metric.
    n_local_trials : integer, optional
        The number of seeding trials for each center (except the first), of
        which the one reducing inertia the most is greedily chosen. Set to
        None to make the number of trials depend logarithmically on the
        number of seeds (2+log(k)); this is the default.
    distance_params : dict, defaults = None
        Dictionary containing distance parameter kwargs. They are forwarded
        to every ``pairwise_distance`` call; None means no extra kwargs.
    **kwargs
        Accepted for signature compatibility; not used.

    Returns
    -------
    np.ndarray (3d array of shape (n_clusters, n_dimensions, series_length))
        The time series selected as initial cluster centers.
    """
    n_samples, n_dimensions, series_length = X.shape
    centers = np.empty((n_clusters, n_dimensions, series_length), dtype=X.dtype)

    if n_local_trials is None:
        n_local_trials = 2 + int(np.log(n_clusters))

    if distance_params is None:
        distance_params = {}

    # First center: picked uniformly at random.
    center_id = random_state.randint(n_samples)
    centers[0] = X[center_id]
    closest_dist_sq = (
        pairwise_distance(
            centers[0, np.newaxis], X, metric=distance_metric, **distance_params
        )
        ** 2
    )
    current_pot = closest_dist_sq.sum()

    for c in range(1, n_clusters):
        # Draw candidates with probability proportional to the squared
        # distance to the closest center chosen so far.
        rand_vals = random_state.random_sample(n_local_trials) * current_pot
        candidate_ids = np.searchsorted(stable_cumsum(closest_dist_sq), rand_vals)
        # Numerical imprecision can push an index one past the end.
        np.clip(candidate_ids, None, closest_dist_sq.size - 1, out=candidate_ids)

        distance_to_candidates = (
            pairwise_distance(
                X[candidate_ids], X, metric=distance_metric, **distance_params
            )
            ** 2
        )

        # For each candidate, the closest squared distance every sample would
        # have if that candidate were added as a center.
        np.minimum(closest_dist_sq, distance_to_candidates, out=distance_to_candidates)
        candidates_pot = distance_to_candidates.sum(axis=1)

        # Greedily keep the candidate that reduces the potential the most.
        best_candidate = np.argmin(candidates_pot)
        current_pot = candidates_pot[best_candidate]
        closest_dist_sq = distance_to_candidates[best_candidate]
        best_candidate = candidate_ids[best_candidate]

        centers[c] = X[best_candidate]

    return centers
def _fit_one_init(self, X) -> Tuple[np.ndarray, np.ndarray, float, int]:
    """Perform one pass of kmeans.

    This is done because the initial center assignment greatly affects the
    final result, so we perform multiple passes of kmeans with different
    initial center assignments and keep the best results going forward.

    Parameters
    ----------
    X : np.ndarray (2d or 3d array of shape (n_instances, series_length) or
        shape (n_instances, n_dimensions, series_length))
        Training time series instances to cluster.

    Returns
    -------
    np.ndarray (1d array of shape (n_instance,))
        Labels that is the index each time series belongs to.
    np.ndarray (3d array of shape (n_clusters, n_dimensions, series_length))
        Time series that represent each of the cluster centers. If the
        algorithm stops before fully converging these will not be
        consistent with labels.
    float
        Inertia of the final assignment, as returned by
        ``self._assign_clusters``.
    int
        Number of iterations that were run (0 when ``max_iter`` is not
        positive).
    """
    cluster_centers = self._init_algorithm(X, self.n_clusters, self._random_state)
    old_inertia = np.inf
    old_labels = None
    # Tracked explicitly so the return value is defined even when the loop
    # body never executes (max_iter <= 0).
    n_iter = 0
    for i in range(self.max_iter):
        n_iter = i + 1
        labels, inertia = self._assign_clusters(X, cluster_centers)

        # Inertia stopped changing by more than the tolerance.
        if np.abs(old_inertia - inertia) < self.tol:
            break
        old_inertia = inertia

        if np.array_equal(labels, old_labels):
            if self.verbose:
                print(  # noqa: T001
                    f"Converged at iteration {i}: strict convergence."
                )
            break
        elif old_labels is not None:
            # No strict convergence, check for tol based convergence.
            # NOTE(review): this measures the distance between the current
            # and previous *label* vectors, not between cluster centers as
            # the variable name suggests - confirm this is intended.
            center_shift = pairwise_distance(
                labels, old_labels, metric=self.metric, **self._distance_params
            ).sum()
            if center_shift <= self.tol:
                if self.verbose:
                    print(  # noqa: T001
                        f"Converged at iteration {i}: inertia "
                        f"{inertia} within tolerance {self.tol}."
                    )
                break
        old_labels = labels

        cluster_centers = self._compute_new_cluster_centers(X, labels)

        if self.verbose is True:
            print(f"Iteration {i}, inertia {inertia}.")  # noqa: T001

    # Re-assign so that labels and inertia match the centers being returned.
    labels, inertia = self._assign_clusters(X, cluster_centers)

    return labels, cluster_centers, inertia, n_iter