def test_check_array_multivariate():
    """Check that 3d input keeps its dtype when allowed and is rejected otherwise."""
    n_samples, n_dims, n_timestep = 10, 3, 10
    x = np.arange(n_samples * n_dims * n_timestep).reshape(
        n_samples, n_dims, n_timestep
    )

    accepted = check_array(x, allow_multivariate=True)
    assert accepted.dtype == x.dtype

    # The same array must be refused when multivariate input is disallowed.
    with pytest.raises(ValueError):
        check_array(x, allow_multivariate=False)
def test_check_array_contiguous():
    """Check the memory layout returned for Fortran-ordered input.

    By default the checked array is expected to be a C-array; with
    ``contiguous=False`` it is expected not to be.
    """
    fortran_ordered = np.arange(10 * 3 * 10).reshape(10, 3, 10, order="f")

    default_result = check_array(fortran_ordered, allow_multivariate=True)
    assert default_result.flags.carray

    untouched_result = check_array(
        fortran_ordered, allow_multivariate=True, contiguous=False
    )
    assert not untouched_result.flags.carray
def plot_time_domain(
    x,
    *,
    y=None,
    ax=None,
    alpha=0.5,
    linewidth=0.5,
    zorder=-1,
    cmap="Dark2",
):
    """Plot the samples in the time domain

    Parameters
    ----------
    x : array-like of shape (n_sample, n_timestep)
        The samples

    y : array-like of shape (n_samples, ), optional
        The labels, by default None

    ax : Axes, optional
        The matplotlib Axes-object, by default None. A new figure and axes
        are created if not given.

    alpha : float, optional
        The opacity of the lines, by default 0.5

    linewidth : float, optional
        The width of the lines, by default 0.5

    zorder : int, optional
        The drawing order of the lines, by default -1

    cmap : str, optional
        The colormap, by default "Dark2"

    Returns
    -------
    ax : Axes
        The axes the samples were drawn on.

    Raises
    ------
    ValueError
        If the number of labels does not match the number of samples.
    """
    if ax is None:
        _, ax = plt.subplots()

    x = check_array(np.atleast_2d(x), allow_multivariate=False, contiguous=False)
    if y is not None:
        y = check_array(y, ensure_2d=False, allow_nd=False, contiguous=False)
        if y.shape[0] != x.shape[0]:
            raise ValueError(
                "Number of labels={} does not match "
                "number of samples={}".format(y.shape[0], x.shape[0])
            )

        # idx: first occurrence of each label; inv: label index of each sample.
        label, idx, inv = np.unique(y, return_inverse=True, return_index=True)
        cmap = get_cmap(cmap, len(label))
    else:
        cmap = get_cmap(cmap, 1)
        inv = np.zeros(x.shape[0])

    x_axis = np.arange(x.shape[-1])
    collection = LineCollection(
        [list(zip(x_axis, x[i])) for i in range(x.shape[0])],
        colors=[cmap(inv[i]) for i in range(x.shape[0])],
        zorder=zorder,
        linewidth=linewidth,
        alpha=alpha,
    )
    ax.add_collection(collection)

    if y is not None:
        # One legend entry per unique label, colored like its samples.
        ax.legend([Line2D([0], [0], color=cmap(inv[i])) for i in idx], label)

    # add_collection does not autoscale, so the limits are set explicitly.
    ax.set_xlim([0, x.shape[-1] - 1])
    ax.set_ylim([np.min(x) - np.std(x), np.max(x) + np.std(x)])
    return ax
def fit(self, x, y, sample_weight=None):
    """Fit the embedding/estimator pipeline.

    Parameters
    ----------
    x : array-like
        The training samples; multivariate input is allowed.

    y : array-like
        The training targets.

    sample_weight : array-like, optional
        Forwarded to the pipeline's ``estimator`` step.

    Returns
    -------
    self : object
    """
    x = check_array(x, allow_multivariate=True)
    y = check_array(y, ensure_2d=False)
    rng = check_random_state(self.random_state)

    # Each step gets its own seed, drawn in pipeline order.
    embedding = self._get_embedding(rng.randint(2**31))
    estimator = self._get_estimator(rng.randint(2**31))

    self.pipe_ = Pipeline([("embedding", embedding), ("estimator", estimator)])
    self.pipe_.fit(x, y, estimator__sample_weight=sample_weight)
    return self
def fit(self, X, y, sample_weight=None, check_input=True):
    """Fit a shapelet tree regressor from the training set

    Parameters
    ----------
    X : array-like of shape (n_samples, n_timesteps)
        The training time series.

    y : array-like of shape (n_samples,)
        Target values as floating point values

    sample_weight : array-like of shape (n_samples,)
        If `None`, then samples are equally weighted.

    check_input : bool, optional
        If False, `X` and `y` are used as given without being passed
        through `check_array`. Don't use this parameter unless you know
        what you do.

    Returns
    -------
    self: object
    """
    if check_input:
        X = check_array(X, allow_multivariate=True, dtype=float)
        y = check_array(y, ensure_2d=False, dtype=float)

    n_samples = X.shape[0]
    if isinstance(self.force_dim, int):
        # Interpret each sample as `force_dim` dimensions.
        X = np.reshape(X, [n_samples, self.force_dim, -1])

    n_timesteps = X.shape[-1]
    n_dims = X.shape[1] if X.ndim > 2 else 1

    n_labels = len(y)
    if n_labels != n_samples:
        raise ValueError(
            "Number of labels={} does not match "
            "number of samples={}".format(n_labels, n_samples)
        )

    self.n_timestep_ = n_timesteps
    self.n_dims_ = n_dims

    rng = check_random_state(self.random_state)
    if sample_weight is not None:
        sample_weight = _check_sample_weight(sample_weight, X, dtype=float)

    self._fit(X, y, sample_weight, rng)
    return self
def fit(self, estimator):
    """Fit the counterfactual explainer to a fitted estimator.

    The background data (``self.train_x`` and ``self.train_y``) is partitioned
    by class and, for each class, a prototype sampler is constructed.

    Parameters
    ----------
    estimator : object
        A fitted estimator. A deep copy is stored as ``estimator_``.

    Returns
    -------
    self : object

    Raises
    ------
    ValueError
        If background data is missing, if the number of labels does not match
        the number of samples, or if ``metric``, ``method``, ``target`` or
        ``n_prototypes`` is not supported.
    """
    check_is_fitted(estimator)
    if self.train_x is None or self.train_y is None:
        raise ValueError("background data are required.")

    x = check_array(self.train_x)
    y = check_array(self.train_y, ensure_2d=False)
    if len(y) != x.shape[0]:
        raise ValueError(
            "Number of labels={} does not match "
            "number of samples={}".format(len(y), x.shape[0])
        )

    random_state = check_random_state(self.random_state)
    metric_params = self.metric_params or {}
    method_params = self.method_params or {}

    if self.metric in _METRIC_TRANSFORM:
        metric = _METRIC_TRANSFORM[self.metric](self.step_size, **metric_params)
    else:
        raise ValueError("metric (%s) is not supported" % self.metric)

    if self.method in _PROTOTYPE_SAMPLER:
        sampler = _PROTOTYPE_SAMPLER[self.method]
    else:
        raise ValueError("method (%s) is not supported" % self.method)

    self.estimator_ = deepcopy(estimator)
    self.classes_ = np.unique(self.train_y)

    if self.target == "auto":
        self.target_ = PredictEvaluator(self.estimator_)
    else:
        if not 0 < self.target <= 1.0:
            raise ValueError("target must be in (0, 1], got %r" % self.target)
        self.target_ = ProbabilityEvaluator(self.estimator_, self.target)

    self.partitions_ = {}
    for c in self.classes_:
        x_partition = x[y == c]
        partition_size = x_partition.shape[0]

        if self.n_prototypes == "auto":
            n_prototypes = partition_size
        elif isinstance(self.n_prototypes, numbers.Integral):
            # Absolute count, clipped to [1, partition_size].
            n_prototypes = max(1, min(self.n_prototypes, partition_size))
        elif isinstance(self.n_prototypes, numbers.Real):
            # Fraction of the partition.
            if not 0.0 < self.n_prototypes <= 1.0:
                raise ValueError(
                    "n_prototypes must be in (0, 1], got %r" % self.n_prototypes
                )
            n_prototypes = math.ceil(self.n_prototypes * partition_size)
        else:
            raise ValueError("n_prototypes (%r) not supported" % self.n_prototypes)

        self.partitions_[c] = sampler(
            x_partition, c, n_prototypes, metric, random_state, **method_params
        )

    # Return self for consistency with the other fit methods.
    return self
def transform(self, x, y):
    """Compute one counterfactual per sample.

    Parameters
    ----------
    x : array-like
        The samples to transform.

    y : array-like
        The desired label of each sample.

    Returns
    -------
    counterfactuals : ndarray
        An array with the same shape and dtype as the checked `x`, where row
        ``i`` is the result of ``self._transform_sample(x[i], y[i])``.

    Raises
    ------
    ValueError
        If the number of labels does not match the number of samples.
    """
    x = check_array(x)
    y = check_array(y, ensure_2d=False)

    n_labels, n_samples = len(y), x.shape[0]
    if n_labels != n_samples:
        raise ValueError(
            "Number of labels={} does not match "
            "number of samples={}".format(n_labels, n_samples)
        )

    counterfactuals = np.empty(x.shape, dtype=x.dtype)
    for i, (sample, label) in enumerate(zip(x, y)):
        counterfactuals[i] = self._transform_sample(sample, label)
    return counterfactuals
def fit_transform(self, x, y=None):
    """Fit the embedding and return the transform of x.

    Parameters
    ----------
    x : array-like of shape [n_samples, n_timestep] or
    [n_samples, n_dimensions, n_timestep]
        The time series dataset.

    y : None, optional
        For compatibility.

    Returns
    -------
    x_embedding : ndarray of shape [n_samples, n_outputs]
        The embedding.

    Raises
    ------
    ValueError
        If the checked input is neither 2- nor 3-dimensional.
    """
    x = check_array(x, allow_multivariate=True, dtype=np.double)
    if not 2 <= x.ndim <= 3:
        raise ValueError("illegal input dimensions")

    rng = check_random_state(self.random_state)
    self.n_timestep_ = x.shape[-1]
    self.embedding_, x_embedding = feature_embedding_fit_transform(
        self._get_feature_engineer(), x, rng, self.n_jobs
    )
    return x_embedding
def transform(self, x, y):
    """Generate a counterfactual for each sample.

    Parameters
    ----------
    x : array-like
        The samples; multivariate input is allowed.

    y : array-like
        The target label of each sample.

    Returns
    -------
    counterfactuals : ndarray
        An array with the same shape as the checked `x`. Row ``i`` holds the
        result of ``self.candidates(x[i], y[i])``, or the unchanged sample
        when no candidate is returned.

    Raises
    ------
    ValueError
        If the number of labels does not match the number of samples.
    """
    check_is_fitted(self, "paths_")
    x = check_array(x, allow_multivariate=True)
    y = check_array(y, ensure_2d=False)

    n_samples = x.shape[0]
    if len(y) != n_samples:
        raise ValueError(
            "Number of labels={} does not match "
            "number of samples={}".format(len(y), n_samples)
        )

    counterfactuals = np.empty(x.shape)
    for i, (sample, label) in enumerate(zip(x, y)):
        if self.verbose:
            print(
                f"Generating counterfactual for the {i}:th sample. "
                f"The target label is {label}."
            )

        candidate = self.candidates(sample, label)
        # Fall back to the original sample if no counterfactual was found.
        counterfactuals[i] = sample if candidate is None else candidate
    return counterfactuals
def _validate_x_predict(self, x, check_input):
    """Validate prediction input against the shape seen during fit.

    Parameters
    ----------
    x : array-like
        The samples to predict.

    check_input : bool
        If True, `x` is first passed through `check_array`.

    Returns
    -------
    x : ndarray
        The (possibly reshaped) input.

    Raises
    ------
    ValueError
        If the number of timesteps differs from ``n_timestep_`` or, for
        input with more than two dimensions, if the number of dimensions
        differs from ``n_dims_``.
    """
    if check_input:
        x = check_array(x, allow_multivariate=True)

    if isinstance(self.force_dim, int):
        # Interpret each sample as `force_dim` dimensions.
        x = np.reshape(x, [x.shape[0], self.force_dim, -1])

    if x.shape[-1] != self.n_timestep_:
        raise ValueError(
            "illegal input shape ({} != {})".format(x.shape[-1], self.n_timestep_)
        )

    if x.ndim > 2 and x.shape[1] != self.n_dims_:
        raise ValueError(
            "illegal input shape ({} != {})".format(x.shape[1], self.n_dims_)
        )

    return x
def transform(self, x):
    """Transform the dataset.

    Parameters
    ----------
    x : array-like of shape [n_samples, n_timestep] or
    [n_samples, n_dimensions, n_timestep]
        The time series dataset.

    Returns
    -------
    x_embedding : ndarray of shape [n_samples, n_outputs]
        The embedding.
    """
    check_is_fitted(self, attributes="embedding_")
    checked = check_array(x, allow_multivariate=True, dtype=np.double)
    x_embedding = feature_embedding_transform(self.embedding_, checked, self.n_jobs)
    return x_embedding
def histogram_mode(x, n_bins=5):
    """Compute the histogram mode

    Parameters
    ----------
    x : ndarray of shape (n_samples, n_timestep) or (n_timestep, )
        The input array

    n_bins : int, optional
        The number of bins

    Returns
    -------
    mode : array or float
        The histogram mode
    """
    checked = check_array(x, allow_multivariate=True)
    mode = _catch22.histogram_mode_(checked, n_bins)
    return mode
def fit(self, x, y=None):
    """Fit the embedding.

    Parameters
    ----------
    x : array-like of shape [n_samples, n_timestep] or
    [n_samples, n_dimensions, n_timestep]
        The time series dataset.

    y : None, optional
        For compatibility.

    Returns
    -------
    embedding : self
    """
    x = check_array(x, allow_multivariate=True, dtype=np.double)
    rng = check_random_state(self.random_state)

    self.n_timestep_ = x.shape[-1]
    feature_engineer = self._get_feature_engineer()
    self.embedding_ = feature_embedding_fit(feature_engineer, x, rng)
    return self
def regimes(
    x=None,
    mpi=None,
    *,
    n_regimes=1,
    window=5,
    exclude=0.2,
    boundry=1.0,
    return_arc_curve=False,
):
    """Find change regimes in a time series.

    Parameters
    ----------
    x : array-like of shape (n_samples, n_timestep) or (n_timestep, ), optional
        The time series. If x is given, the matrix profile of x is computed.

    mpi : array-like of shape (n_samples, profile_size) or (profile_size), optional
        The matrix profile index. Must be given unless x is given.

    n_regimes : int, optional
        The number of segmentations to identify

    window : int, optional
        The window size. Used to compute the matrix profile if `x` is given
        and, in both cases, to scale `boundry`.

    exclude : float, optional
        The self-join exclusion for the matrix profile. Ignored if `mpi` is given.

    boundry : float, optional
        The region around an identified segmentation that is ignored when
        searching for subsequent segmentations, as a fraction of `window`.

    return_arc_curve : bool, optional
        Return the arc curve.

    Returns
    -------
    segments : ndarray of shape (n_samples, n_regimes)
        The start index of a segment

    arc_curves : ndarray of shape (n_samples, profile_size)
        The arc curves. Only returned if `return_arc_curve` is True.

    Raises
    ------
    ValueError
        If neither or both of `x` and `mpi` are given.

    See also
    --------
    wildboar.distance.matrix_profile : compute the matrix profile

    References
    ----------
    Gharghabi, Shaghayegh, et al. (2017)
        Matrix profile VIII: domain agnostic online semantic segmentation at
        superhuman performance levels.
        In proceedings of International Conference on Data Mining
    """
    if x is None and mpi is None:
        raise ValueError("either x or mpi must be given")

    if x is not None:
        if mpi is not None:
            raise ValueError("both x and mpi cannot be given")

        x = check_array(np.atleast_2d(x), allow_multivariate=False)
        _, mpi = matrix_profile(x, window=window, exclude=exclude, return_index=True)
        mpi = np.atleast_2d(mpi)
    else:
        mpi = check_array(np.atleast_2d(mpi))

    # Convert the relative boundary to a number of profile positions.
    boundry = math.ceil(window * boundry)
    profile_size = mpi.shape[-1]
    segments = np.empty((mpi.shape[0], n_regimes), dtype=np.intp)
    if return_arc_curve:
        arc_curves = np.empty(mpi.shape, dtype=np.double)

    index = np.arange(profile_size)
    arc_curve_normalize = 2 * index * (profile_size - index) / profile_size
    # Avoid division by zero at the first position.
    arc_curve_normalize[0] = 10e-10

    for i in range(mpi.shape[0]):
        # Count arcs starting minus arcs ending at each position; the running
        # sum is the number of arcs crossing that position.
        arc_curve = np.minimum(
            np.cumsum(
                np.bincount(np.minimum(index, mpi[i]), minlength=profile_size)
                - np.bincount(np.maximum(index, mpi[i]), minlength=profile_size)
            )
            / arc_curve_normalize,
            1,
        )

        # Mask the edges. With a zero boundary there is nothing to mask, and
        # `arc_curve[-0:]` would otherwise overwrite the entire curve.
        if boundry > 0:
            arc_curve[:boundry] = 1.0
            arc_curve[-boundry:] = 1.0

        if return_arc_curve:
            # Stored before the masking performed while extracting regimes.
            arc_curves[i, :] = arc_curve

        for j in range(n_regimes):
            segments[i, j] = np.argmin(arc_curve)
            if arc_curve[segments[i, j]] == 1.0:
                warnings.warn(
                    f"no more regimes for sample={i} (regime={j}) "
                    "all remaining regimes are invalid and point to the first index.",
                    UserWarning,
                )

            # Mask the neighbourhood of the found regime.
            start = max(segments[i, j] - boundry, 0)
            end = min(segments[i, j] + boundry, arc_curve.shape[0])
            arc_curve[start:end] = 1.0

    if return_arc_curve:
        return segments, arc_curves
    else:
        return segments
def plot_frequency_domain(
    x,
    *,
    y=None,
    ax=None,
    jitter=False,
    sample_spacing=1,
    frequency=False,
    cmap="Dark2",
):
    """Plot the samples in the frequency domain

    Parameters
    ----------
    x : array-like of shape (n_sample, n_timestep)
        The samples

    y : array-like of shape (n_samples, ), optional
        The labels, by default None

    ax : Axes, optional
        The matplotlib Axes-object, by default None. A new figure and axes
        are created if not given.

    jitter : bool, optional
        Add jitter to the amplitude lines, by default False

    sample_spacing : int, optional
        The frequency domain sample spacing, by default 1

    frequency : bool, optional
        Show the frequency bins, by default False

    cmap : str, optional
        The colormap, by default "Dark2"

    Returns
    -------
    ax : Axes
        The axes the samples were drawn on.

    Raises
    ------
    ValueError
        If the number of labels does not match the number of samples.
    """
    if ax is None:
        _, ax = plt.subplots()

    x = check_array(x, allow_multivariate=False, contiguous=False)
    if y is not None:
        y = check_array(y, ensure_2d=False, allow_nd=False, contiguous=False)
        if y.shape[0] != x.shape[0]:
            raise ValueError(
                "Number of labels={} does not match "
                "number of samples={}".format(y.shape[0], x.shape[0])
            )

        # idx: first occurrence of each label; inv: label index of each sample.
        label, idx, inv = np.unique(y, return_inverse=True, return_index=True)
        cmap = get_cmap(cmap, len(label))
    else:
        cmap = get_cmap(cmap, 1)
        inv = np.zeros(x.shape[0])

    n_freqs = int(x.shape[-1] // 2)
    # Amplitudes of the first n_freqs non-zero frequency bins.
    x_freq = np.abs(np.fft.fft(x, axis=1)[:, 1:n_freqs + 1]) / n_freqs
    x_axis = np.arange(1, n_freqs + 1)
    max_freq = np.max(x_freq)

    if frequency:
        for i in x_axis:
            ax.axvspan(
                i - 0.5,
                i + 0.5,
                0,
                1,
                facecolor=None,
                edgecolor="gray",
                fill=False,
                alpha=0.05,
                zorder=-100,
            )

    ax.set_ylabel("Amplitude")
    for i in range(x.shape[0]):
        if jitter:
            x_axis_tmp = x_axis + np.random.normal(scale=0.5, size=n_freqs)
        else:
            x_axis_tmp = x_axis

        ax.vlines(
            x_axis_tmp,
            0,
            x_freq[i],
            # Color by the label index of the sample; `inv` is defined both
            # with and without labels.
            color=cmap(inv[i]),
            alpha=0.3,
            linewidth=1,
            zorder=-1,
        )

    if y is not None:
        # One legend entry per unique label, colored like its samples.
        ax.legend([Line2D([0], [0], color=cmap(inv[i])) for i in idx], label)

    ticks = ax.get_xticks().astype(int)[:len(x_axis)]
    ticks[0] = 1
    ax.set_xticks(ticks)
    x_label = np.fft.fftfreq(x.shape[-1], d=sample_spacing)[ticks]
    ax.set_xticklabels(["%.2f" % lab for lab in x_label])
    ax.set_xlim([0.5, n_freqs + 0.5])
    ax.set_ylim(0, max_freq)
    return ax
def decision_function(self, x):
    """Return the fitted pipeline's decision function for `x`.

    Parameters
    ----------
    x : array-like
        The samples; multivariate input is allowed.

    Returns
    -------
    The value returned by ``self.pipe_.decision_function``.
    """
    check_is_fitted(self)
    checked = check_array(x, allow_multivariate=True)
    decision = self.pipe_.decision_function(checked)
    return decision
def predict(self, x):
    """Return the fitted pipeline's prediction for `x`.

    Parameters
    ----------
    x : array-like
        The samples; multivariate input is allowed.

    Returns
    -------
    The value returned by ``self.pipe_.predict``.
    """
    check_is_fitted(self)
    checked = check_array(x, allow_multivariate=True)
    prediction = self.pipe_.predict(checked)
    return prediction
def fit(self, estimator, x, y=None, sample_weight=None):
    """Compute interval importances by randomizing intervals of the input.

    For each interval the input is randomized ``n_repeat`` times in the
    selected domain and the estimator is re-scored. The scores are combined
    with the baseline score on the unmodified input.

    Parameters
    ----------
    estimator : object
        The estimator to score.

    x : array-like
        The samples; multivariate input is not allowed.

    y : array-like
        The labels, one per sample.

    sample_weight : array-like, optional
        Forwarded to the scorer if given.

    Returns
    -------
    self : object

    Raises
    ------
    ValueError
        If the number of samples and labels differ, or if ``n_interval`` or
        ``domain`` is not supported.
    """
    x = check_array(x, allow_multivariate=False)
    y = check_array(y, ensure_2d=False)
    random_state = check_random_state(self.random_state)

    if x.shape[0] != y.shape[0]:
        raise ValueError(
            "expected the same number of samples (%d) and labels (%d)"
            % (x.shape[0], y.shape[0])
        )

    if self.n_interval == "sqrt":
        n_interval = math.ceil(math.sqrt(x.shape[-1]))
    elif self.n_interval == "log":
        n_interval = math.ceil(math.log2(x.shape[-1]))
    elif isinstance(self.n_interval, numbers.Integral):
        n_interval = self.n_interval
    elif isinstance(self.n_interval, numbers.Real):
        if not 0 < self.n_interval <= 1:
            raise ValueError(
                "n_interval (%r) not in range ]0, 1]" % self.n_interval
            )
        n_interval = math.floor(x.shape[-1] * self.n_interval)
    else:
        raise ValueError("unsupported n_interval, got %r" % self.n_interval)

    if callable(self.scoring):
        scoring = self.scoring
    elif self.scoring is None or isinstance(self.scoring, str):
        scoring = check_scoring(estimator, self.scoring)
    else:
        scoring_dict = _check_multimetric_scoring(estimator, self.scoring)
        scoring = _MultimetricScorer(**scoring_dict)

    if isinstance(self.domain, str):
        # Look up the factory before calling it, so that an unknown domain
        # raises the intended ValueError instead of a TypeError.
        domain_factory = _PERMUTATION_DOMAIN.get(self.domain, None)
        if domain_factory is None:
            raise ValueError("domain (%s) is not supported" % self.domain)
        self.domain_ = domain_factory()
    else:
        self.domain_ = self.domain

    x_transform = self.domain_.transform(x=x)
    self.intervals_ = list(
        self.domain_.intervals(x_transform.shape[-1], n_interval)
    )

    scores = []
    for iteration, (start, end) in enumerate(self.intervals_):
        if self.verbose:
            print(
                f"Running iteration {iteration + 1} of "
                f"{len(self.intervals_)}. {start}:{end}"
            )

        x_perm_transform = x_transform.copy()
        rep_scores = []
        for _ in range(self.n_repeat):
            self.domain_.randomize(
                x_perm_transform, start, end, random_state=random_state
            )
            x_perm_inverse = self.domain_.inverse_transform(x_perm_transform)
            if sample_weight is not None:
                score = scoring(
                    estimator, x_perm_inverse, y, sample_weight=sample_weight
                )
            else:
                score = scoring(estimator, x_perm_inverse, y)
            rep_scores.append(score)

        if isinstance(rep_scores[0], dict):
            scores.append(_aggregate_score_dicts(rep_scores))
        else:
            scores.append(rep_scores)

    if sample_weight is not None:
        self.baseline_score_ = scoring(estimator, x, y, sample_weight=sample_weight)
    else:
        self.baseline_score_ = scoring(estimator, x, y)

    if self.verbose:
        print(f"Baseline score is: {self.baseline_score_}")

    if isinstance(self.baseline_score_, dict):
        # Use every collected interval score, as the single-metric branch does.
        self.importances_ = {
            name: _unpack_scores(
                self.baseline_score_[name],
                np.array([interval_scores[name] for interval_scores in scores]),
            )
            for name in self.baseline_score_
        }
    else:
        self.importances_ = _unpack_scores(self.baseline_score_, np.array(scores))

    return self
def fit(self, x, y, sample_weight=None, check_input=True):
    """Fit a shapelet tree classifier from the training set

    Parameters
    ----------
    x : array-like of shape (n_samples, n_timesteps)
        The training time series.

    y : array-like of shape (n_samples,) or (n_samples, n_classes)
        The target values (class labels) as integers

    sample_weight : array-like of shape (n_samples,)
        If `None`, then samples are equally weighted. If the estimator has a
        `class_weight`, the resulting per-sample class weights are multiplied
        with `sample_weight`.

    check_input : bool, optional
        If False, `x` and `y` are used as given without being passed
        through `check_array`. Don't use this parameter unless you know
        what you do.

    Returns
    -------
    self: object
    """
    if check_input:
        x = check_array(x, allow_multivariate=True, dtype=float)
        y = check_array(y, ensure_2d=False)

    n_samples = x.shape[0]
    if isinstance(self.force_dim, int):
        # Interpret each sample as `force_dim` dimensions.
        x = np.reshape(x, [n_samples, self.force_dim, -1])

    n_timesteps = x.shape[-1]
    n_dims = x.shape[1] if x.ndim > 2 else 1

    # Class weights are computed from the labels as given, before encoding.
    configured_class_weight = getattr(self, "class_weight", None)
    if configured_class_weight is None:
        class_weight = None
    else:
        class_weight = compute_sample_weight(configured_class_weight, y)

    if y.ndim == 1:
        self.classes_, y = np.unique(y, return_inverse=True)
    else:
        # Indicator matrix: the column of each non-zero entry is the label.
        _, y = np.nonzero(y)
        if len(y) != n_samples:
            raise ValueError("Single label per sample expected.")
        self.classes_ = np.unique(y)

    if len(y) != n_samples:
        raise ValueError(
            "Number of labels={} does not match "
            "number of samples={}".format(len(y), n_samples)
        )

    self.n_classes_ = len(self.classes_)
    self.n_timestep_ = n_timesteps
    self.n_dims_ = n_dims

    rng = check_random_state(getattr(self, "random_state", None))

    if sample_weight is not None:
        sample_weight = _check_sample_weight(sample_weight, x, dtype=float)

    if class_weight is not None:
        if sample_weight is None:
            sample_weight = class_weight
        else:
            sample_weight = sample_weight * class_weight

    self._fit(x, y, sample_weight, rng)
    return self
def motifs(
    x,
    mp=None,
    window=None,
    exclude=0.2,
    max_distance="best",
    max_neighbours=10,
    min_neighbours=1,
    max_motif=1,
    return_distance=False,
):
    """Find motifs

    Parameters
    ----------
    x : array-like of shape (n_samples, n_timestep)
        The time series

    mp : ndarray or shape (n_samples, profile_size), optional
        The matrix profile. The matrix profile is computed if None.

    window : int, optional
        The window size of the matrix profile. Required if `mp` is None; if
        `mp` is given it is inferred and must match if set.

    exclude : float, optional
        The size of the exclusion zone. A float is interpreted as a fraction
        of `window`.

    max_distance : str, optional
        The maximum distance between motifs.

    max_neighbours : int, optional
        The maximum number of neighbours. If None, the number of timesteps
        is used.

    min_neighbours : int, optional
        The minimum number of neighbours

    max_motif : int, optional
        The maximum number of motifs to return.

    return_distance : bool, optional
        Return the distance from main to neighbours

    Returns
    -------
    motif_indicies : list
        List of arrays of motif neighbour indicies

    motif_distance : list, optional
        List of arrays of distance from motif to neighbours

    Raises
    ------
    ValueError
        If `window` is missing or inconsistent with `mp`, if `mp`,
        `max_distance` or `exclude` is invalid, or if `x` and `mp` have a
        different number of samples.

    See also
    --------
    wildboar.distance.subsequence_match : find subsequence matches
    wildboar.distance.matrix_profile : compute the matrix profile

    References
    ----------
    Yeh, C. C. M. et al. (2016).
        Matrix profile I: All pairs similarity joins for time series: a unifying
        view that includes motifs, discords and shapelets.
        In 2016 IEEE 16th international conference on data mining (ICDM)
    """
    if mp is None:
        if window is None:
            raise ValueError(
                "if the matrix profile is not given, window must be set")

        mp = matrix_profile(x, window=window, exclude=exclude, return_index=False)
        mp = np.atleast_2d(mp)
    elif isinstance(mp, np.ndarray) and np.issubdtype(mp.dtype, np.double):
        # np.shape also works when x has not yet been converted to an array.
        w = np.shape(x)[-1] - mp.shape[-1] + 1
        if window is None:
            window = w
        elif window != w:
            raise ValueError("given window parameter is invalid, set to None")

        # The profile is modified below; do not touch the caller's array.
        mp = np.atleast_2d(mp).copy()
    else:
        raise ValueError("unexpected matrix profile")

    if max_neighbours is None:
        max_neighbours = np.shape(x)[-1]

    if isinstance(max_distance, str):
        threshold_name = max_distance
        max_distance = _THRESHOLD.get(threshold_name, None)
        if max_distance is None:
            # Report the value given by the caller, not the failed lookup.
            raise ValueError("invalid max_distance (%r)" % threshold_name)

    cutoff = max_distance
    if isinstance(exclude, numbers.Integral):
        if exclude < 0:
            raise ValueError("invalid exclusion (%d < 0)" % exclude)
    elif isinstance(exclude, numbers.Real):
        exclude = math.ceil(window * exclude)
    elif exclude is not None:
        raise ValueError("invalid exclusion (%r)" % exclude)

    x = check_array(np.atleast_2d(x), dtype=np.double)
    if x.shape[0] != mp.shape[0]:
        raise ValueError("not the same number of samples")

    motif_distances = []
    motif_indicies = []
    for i in range(x.shape[0]):
        motif_distance = []
        motif_index = []
        if callable(max_distance):
            cutoff = max_distance(mp[i])

        for _ in range(max_motif):
            if len(motif_index) > max_motif:
                break

            candidate = np.argmin(mp[i])
            if mp[i, candidate] > cutoff:
                break

            if (isinstance(max_distance, numbers.Real)
                    and mp[i, candidate] > max_distance):
                break

            match_idx, match_dist = subsequence_match(
                x[i, candidate:candidate + window].reshape(1, -1),
                x[i].reshape(1, -1),
                threshold=max_distance,
                metric="scaled_euclidean",
                max_matches=max_neighbours,
                exclude=exclude,
                return_distance=True,
            )

            if match_idx.size > min_neighbours:
                motif_index.append(match_idx[:max_neighbours])
                motif_distance.append(match_dist[:max_neighbours])
                # The first match is always the same as candidate
                # so we can exclude all the matches from the matrix
                # profile
                for match in match_idx[:max_neighbours]:
                    start = max(0, match - exclude)
                    end = min(mp.shape[-1], match + exclude)
                    mp[i, start:end] = np.inf
            else:
                # Just exclude the candidate from the matrix profile
                start = max(0, candidate - exclude)
                end = min(mp.shape[-1], candidate + exclude)
                mp[i, start:end] = np.inf

        motif_distances.append(motif_distance)
        motif_indicies.append(motif_index)

    if return_distance:
        return motif_indicies, motif_distances
    else:
        return motif_indicies