def test_check_X_bad_input_args(X):
    """check_X and check_X_y must raise ValueError on bad input."""
    # X is supplied by pytest parametrization; y comes from module scope.
    with pytest.raises(ValueError):
        check_X(X)
    with pytest.raises(ValueError):
        check_X_y(X, y)
def test_check_X_enforce_univariate():
    """Multivariate input must be rejected when enforce_univariate is set."""
    X, y = make_classification_problem(n_columns=2)
    expected = r"univariate"
    with pytest.raises(ValueError, match=expected):
        check_X(X, enforce_univariate=True)
    with pytest.raises(ValueError, match=expected):
        check_X_y(X, y, enforce_univariate=True)
def test_check_X_enforce_min_columns():
    """Input with fewer columns than enforce_min_columns must be rejected."""
    X, y = make_classification_problem(n_columns=2)
    expected = r"columns"
    with pytest.raises(ValueError, match=expected):
        check_X(X, enforce_min_columns=3)
    with pytest.raises(ValueError, match=expected):
        check_X_y(X, y, enforce_min_columns=3)
def test_check_enforce_min_instances():
    """Input with fewer instances than enforce_min_instances must be rejected."""
    X, y = make_classification_problem(n_instances=3)
    expected = r"instance"
    with pytest.raises(ValueError, match=expected):
        check_X(X, enforce_min_instances=4)
    with pytest.raises(ValueError, match=expected):
        check_X_y(X, y, enforce_min_instances=4)
    with pytest.raises(ValueError, match=expected):
        check_y(y, enforce_min_instances=4)
def _set_oob_score(self, X, y):
    """Compute out-of-bag (OOB) score and decision function.

    For every fitted pipeline estimator, predicts on the instances that
    were NOT in its bootstrap sample and accumulates the class-probability
    votes, then normalises them per instance.

    Parameters
    ----------
    X : pd.DataFrame
        Univariate panel data (validated below).
    y : np.ndarray of shape [n_samples, n_outputs]
        Class labels, one column per output.
    """
    check_X_y(X, y)
    check_X(X, enforce_univariate=True)

    n_classes_ = self.n_classes_
    n_samples = y.shape[0]

    oob_decision_function = []
    oob_score = 0.0
    # One vote matrix per output dimension.
    predictions = [
        np.zeros((n_samples, n_classes_[k])) for k in range(self.n_outputs_)
    ]

    n_samples_bootstrap = _get_n_samples_bootstrap(n_samples,
                                                   self.max_samples)

    for estimator in self.estimators_:
        # Each estimator is a pipeline; the final step carries the random
        # state used to draw its bootstrap sample.
        final_estimator = estimator.steps[-1][1]
        unsampled_indices = _generate_unsampled_indices(
            final_estimator.random_state, n_samples, n_samples_bootstrap)
        p_estimator = estimator.predict_proba(X.iloc[unsampled_indices, :])

        if self.n_outputs_ == 1:
            p_estimator = [p_estimator]

        for k in range(self.n_outputs_):
            predictions[k][unsampled_indices, :] += p_estimator[k]

    for k in range(self.n_outputs_):
        # Instances never left out of a bootstrap get no votes at all.
        if (predictions[k].sum(axis=1) == 0).any():
            warn("Some inputs do not have OOB scores. "
                 "This probably means too few trees were used "
                 "to compute any reliable oob estimates.")

        # Normalise vote counts to probabilities per instance.
        decision = predictions[k] / predictions[k].sum(axis=1)[:, np.newaxis]
        oob_decision_function.append(decision)
        oob_score += np.mean(y[:, k] == np.argmax(predictions[k], axis=1),
                             axis=0)

    if self.n_outputs_ == 1:
        self.oob_decision_function_ = oob_decision_function[0]
    else:
        self.oob_decision_function_ = oob_decision_function

    # Average accuracy across outputs.
    self.oob_score_ = oob_score / self.n_outputs_
def fit(self, X, y):
    """Fit a single boss classifier on n_instances cases (X, y).

    Parameters
    ----------
    X : pd.DataFrame of shape [n_instances, 1]
        Nested dataframe with univariate time-series in cells.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    X, y = check_X_y(X, y, enforce_univariate=True, coerce_to_numpy=True)

    # fit_transform returns a list; element 0 holds the per-instance words.
    sfa_words = self.transformer.fit_transform(X)
    self.transformed_data = sfa_words[0]

    self.class_vals = y
    self.num_classes = len(np.unique(y))
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
    for idx, label in enumerate(self.classes_):
        self.class_dictionary[label] = idx

    self._is_fitted = True
    return self
def fit(self, X, y):
    """Fit an estimator using transformed data from the MatrixProfile
    transformer.

    Parameters
    ----------
    X : nested pandas DataFrame of shape [n_instances, 1]
        Nested dataframe with univariate time-series in cells.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    X, y = check_X_y(X, y, enforce_univariate=True)

    self.classes_ = np.unique(y)
    self.n_classes = self.classes_.shape[0]

    self._transformer = MatrixProfile(m=self.subsequence_length)
    self._estimator = _clone_estimator(
        KNeighborsClassifier(
            n_neighbors=1) if self.estimator is None else self.estimator,
        self.random_state,
    )

    # Propagate n_jobs to the wrapped estimator when it exposes it.
    # BUG FIX: `n_jobs` is an int (or None), never callable, so the old
    # `callable(m)` test was always False and n_jobs was never forwarded.
    m = getattr(self._estimator, "n_jobs", None)
    if m is not None:
        self._estimator.n_jobs = self.n_jobs

    X_t = self._transformer.fit_transform(X, y)
    self._estimator.fit(X_t, y)

    self._is_fitted = True
    return self
def fit(self, X, y):
    """Build a pipeline containing the ROCKET transformer and
    RidgeClassifierCV.

    Parameters
    ----------
    X : nested pandas DataFrame of shape [n_instances, 1]
        Nested dataframe with univariate time-series in cells.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    X, y = check_X_y(X, y)

    self.n_classes = len(np.unique(y))
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
    for idx, label in enumerate(self.classes_):
        self.class_dictionary[label] = idx

    # Random-convolution features followed by a ridge classifier with
    # built-in alpha selection.
    rocket_step = Rocket(
        num_kernels=self.num_kernels,
        random_state=self.random_state,
        n_jobs=self.n_jobs,
    )
    ridge_step = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10),
                                   normalize=True)
    self.classifier = make_pipeline(rocket_step, ridge_step)
    self.classifier.fit(X, y)

    self._is_fitted = True
    return self
def fit(self, X, y):
    """Build a forest of trees from the training set (X, y) using
    supervised intervals and summary features.

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_instances, series_length]
        or shape = [n_instances, n_columns]
        The training input samples. If a Pandas data frame is passed it
        must have a single column (i.e. univariate classification). STSF
        has no bespoke method for multivariate classification as yet.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    X, y = check_X_y(
        X,
        y,
        enforce_univariate=True,
        coerce_to_numpy=True,
    )
    # Drop the single channel axis: X becomes [n_instances, series_length].
    X = X.squeeze(1)
    n_instances, _ = X.shape

    rng = check_random_state(self.random_state)

    cls, class_counts = np.unique(y, return_counts=True)
    self.n_classes = class_counts.shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

    # Three interval lists per estimator (filled during estimator fitting).
    self.intervals_ = [[[] for _ in range(3)]
                       for _ in range(self.n_estimators)]

    # Frequency-domain and first-difference representations of the series.
    _, X_p = signal.periodogram(X)
    X_d = np.diff(X, 1)

    # Oversample minority classes: classes with fewer cases than the
    # per-class average get extra indices added to every bootstrap.
    balance_cases = np.zeros(0, dtype=np.int32)
    average = math.floor(n_instances / self.n_classes)
    for i, c in enumerate(cls):
        if class_counts[i] < average:
            cls_idx = np.where(y == c)[0]
            balance_cases = np.concatenate(
                (rng.choice(cls_idx, size=average - class_counts[i]),
                 balance_cases))

    self.estimators_ = Parallel(n_jobs=self.n_jobs)(
        delayed(self._fit_estimator)(
            X,
            X_p,
            X_d,
            y,
            # Bootstrap sample (with replacement) plus balancing extras.
            np.concatenate((rng.choice(n_instances, size=n_instances),
                            balance_cases)),
            i,
        ) for i in range(self.n_estimators))

    self._is_fitted = True
    return self
def fit(self, X, y):
    """Fit time series classifier to training data.

    Parameters
    ----------
    X : 3D np.array, array-like or sparse matrix
        of shape = [n_instances,n_dimensions,series_length]
        or shape = [n_instances,series_length]
        or single-column pd.DataFrame with pd.Series entries
    y : array-like, shape = [n_instances] - the class labels.

    Returns
    -------
    self : reference to self.

    State change
    ------------
    Creates fitted model (attributes ending in "_") and sets the
    is_fitted flag to true.
    """
    # Coerce input per the estimator's declared tag, then delegate.
    X, y = check_X_y(
        X, y, coerce_to_numpy=self._all_tags()["coerce-X-to-numpy"]
    )

    self._fit(X, y)

    # Set the fitted flag last so a failed _fit leaves it unset.
    self._is_fitted = True
    return self
def _get_train_probs(self, X, y):
    """Estimate class probabilities on the training data.

    Aggregates each member classifier's train predictions over its
    subsample, weighting votes by ``self.weights_``.

    Parameters
    ----------
    X : 3D np.ndarray [n_instances, n_dims, series_length]
        Must be the same data used in ``fit``.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    np.ndarray of shape [n_instances, n_classes_]
        Normalised probability estimates per training instance.
    """
    self.check_is_fitted()
    X, y = check_X_y(X, y, coerce_to_numpy=True, enforce_univariate=True)

    n_instances, _, series_length = X.shape

    # Train probabilities only make sense for the exact fit data.
    if n_instances != self.n_instances_ or series_length != self.series_length_:
        raise ValueError(
            "n_instances, series_length mismatch. X should be "
            "the same as the training data used in fit for generating train "
            "probabilities.")

    results = np.zeros((n_instances, self.n_classes_))
    divisors = np.zeros(n_instances)

    for i, clf in enumerate(self.estimators_):
        subsample = clf._subsample
        # Reuse saved train predictions when available, otherwise
        # recompute them per instance in parallel.
        preds = (clf._train_predictions
                 if self.save_train_predictions else
                 Parallel(n_jobs=self._threads_to_use)(
                     delayed(clf._train_predict)(i, )
                     for i in range(len(subsample))))

        for n, pred in enumerate(preds):
            results[subsample[n]][
                self._class_dictionary[pred]] += self.weights_[i]
            divisors[subsample[n]] += self.weights_[i]

    # Normalise; instances with no votes get a uniform distribution.
    for i in range(n_instances):
        results[i] = (np.ones(self.n_classes_) * (1 / self.n_classes_)
                      if divisors[i] == 0 else
                      results[i] / (np.ones(self.n_classes_) * divisors[i]))

    return results
def wrapper(self, data, labels=None, **kwargs):
    """Check/convert inputs, run ``func`` on [N, L, C] numpy data, and
    convert ndarray results back to pandas when the input was pandas."""
    # Remember pandas-ness so the output can be restored with the
    # caller's original index.
    is_pandas = isinstance(data, pd.DataFrame)
    pd_idx = data.index if is_pandas else None

    if check_fitted:
        self.check_is_fitted()

    # Normalise everything to nested pandas first.
    if labels is None:
        data = check_X(data, coerce_to_pandas=True)
    else:
        data, labels = check_X_y(data, labels, coerce_to_pandas=True)

    # sktime uses [N, C, L] whereas the signature code expects [N, L, C]
    # (C being channels), so swap the channel and length axes.
    data = np.transpose(from_nested_to_3d_numpy(data), [0, 2, 1])

    # Run the wrapped function on the transposed array.
    if labels is None:
        output = func(self, data, **kwargs)
    else:
        output = func(self, data, labels, **kwargs)

    # Restore a DataFrame when input was pandas, output is an ndarray,
    # and numpy output was not forced.
    if is_pandas and isinstance(output, np.ndarray) and not force_numpy:
        output = pd.DataFrame(index=pd_idx, data=output)

    return output
def wrapper(self, data, labels=None, **kwargs):
    """Validate inputs as nested pandas, run ``func``, convert result back.

    Unlike the numpy variant of this wrapper, no transpose or ndarray
    conversion happens here — ``func`` receives the pandas data directly.
    """
    # Check if pandas so we can convert back
    is_pandas = True if isinstance(data, pd.DataFrame) else False
    pd_idx = data.index if is_pandas else None

    # Fit checks
    if check_fitted:
        self.check_is_fitted()

    # First convert to pandas so everything is the same format
    if labels is None:
        data = check_X(data, coerce_to_pandas=True)
    else:
        data, labels = check_X_y(data, labels, coerce_to_pandas=True)

    # Apply the function to the pandas-format data.  (The previous comment
    # mentioned a "transposed array", but no transpose occurs here.)
    if labels is None:
        output = func(self, data, **kwargs)
    else:
        output = func(self, data, labels, **kwargs)

    # Convert back to a DataFrame (with the original index) when the input
    # was pandas, the output is an ndarray, and numpy is not forced.
    if all(
        [is_pandas, isinstance(output, np.ndarray), not force_numpy]):
        output = pd.DataFrame(index=pd_idx, data=output)

    return output
def fit(self, X, y):
    """Fit an estimator using transformed data from the Catch22 transformer.

    Parameters
    ----------
    X : nested pandas DataFrame of shape [n_instances, n_dims]
        Nested dataframe with univariate time-series in cells.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    X, y = check_X_y(X, y)

    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
    self.n_classes = np.unique(y).shape[0]

    self._transformer = Catch22(outlier_norm=self.outlier_norm)
    self._estimator = _clone_estimator(
        RandomForestClassifier(n_estimators=200)
        if self.estimator is None else self.estimator,
        self.random_state,
    )

    # Propagate n_jobs to the wrapped estimator when it exposes it.
    # BUG FIX: `n_jobs` is an int (or None), never callable — the previous
    # `callable(m)` check meant n_jobs was never actually propagated.
    m = getattr(self._estimator, "n_jobs", None)
    if m is not None:
        self._estimator.n_jobs = self.n_jobs

    X_t = self._transformer.fit_transform(X, y)
    # Catch22 features can contain nan/inf; zero them before fitting.
    X_t = np.nan_to_num(X_t, False, 0, 0, 0)
    self._estimator.fit(X_t, y)

    self._is_fitted = True
    return self
def fit(self, X, y):
    """
    Build the classifier on the training set (X, y).

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_instances, n_columns]
        The training input samples. If a Pandas data frame is passed,
        column 0 is extracted.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    X, y = check_X_y(X, y, enforce_univariate=True, coerce_to_pandas=True)
    # NOTE(review): presumably re-indexes the dataframe to non-negative
    # indices — confirm against dataset_properties.positive_dataframe_indices.
    self.X = dataset_properties.positive_dataframe_indices(X)
    self.random_state = check_random_state(self.random_state)
    # setup label encoding (only fit the encoder when none was provided)
    if self.label_encoder is None:
        self.label_encoder = LabelEncoder()
        y = self.label_encoder.fit_transform(y)
    self.y = y
    self.classes_ = self.label_encoder.classes_
    # Resolve the distance measure lazily: use the factory if supplied,
    # otherwise build a default factory first.
    if self.distance_measure is None:
        if self.get_distance_measure is None:
            self.get_distance_measure = self.setup_distance_measure(self)
        self.distance_measure = self.get_distance_measure(self)
    self.X_exemplar, self.y_exemplar = self.pick_exemplars(self)
    self._is_fitted = True
    return self
def fit(self, X, y, **kwargs):
    """Wrap BaseForest._fit.

    This is a temporary measure prior to the BaseRegressor refactor.
    """
    # Validate input, then delegate straight to the forest base class.
    X, y = check_X_y(X, y, enforce_univariate=True, coerce_to_numpy=True)
    return BaseTimeSeriesForest._fit(self, X, y, **kwargs)
def fit(self, X, y):
    """Fit time series classifier to training data.

    Parameters
    ----------
    X : 3D np.array, array-like or sparse matrix
        of shape = [n_instances,n_dimensions,series_length]
        or shape = [n_instances,series_length]
        or single-column pd.DataFrame with pd.Series entries
    y : array-like, shape = [n_instances] - the class labels.

    Returns
    -------
    self : Reference to self.

    Notes
    -----
    Changes state by creating a fitted model that updates attributes
    ending in "_" and sets is_fitted flag to True.
    """
    # Coerce input per the estimator's declared tag, then delegate.
    X, y = check_X_y(
        X, y, coerce_to_numpy=self.get_tag("coerce-X-to-numpy", False)
    )

    self._fit(X, y)

    # Set the fitted flag last so a failed _fit leaves it unset.
    self._is_fitted = True
    return self
def fit(self, X, y):
    """Build a forest of trees from the training set (X, y).

    Uses random intervals and catch22/tsf summary features.

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_instances, n_dimensions,
        series_length] or shape = [n_instances, series_length]
        The training input samples.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    X, y = check_X_y(X, y, coerce_to_numpy=True)

    self.n_instances, self.n_dims, self.series_length = X.shape

    self.n_classes = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

    # Resolve the base estimator: default/"DTC" -> entropy decision tree,
    # "CIT" -> continuous interval tree, or any supplied BaseEstimator.
    if self.base_estimator is None or self.base_estimator == "DTC":
        self.tree = DecisionTreeClassifier(criterion="entropy")
    elif self.base_estimator == "CIT":
        self.tree = ContinuousIntervalTree()
    elif isinstance(self.base_estimator, BaseEstimator):
        self.tree = self.base_estimator
    else:
        raise ValueError("DrCIF invalid base estimator given.")

    # NOTE(review): the name-mangled attributes below are only assigned
    # here when the public parameter is None — presumably __init__ seeds
    # them from the public parameters otherwise; confirm.
    if self.n_intervals is None:
        self.__n_intervals = int(
            math.sqrt(self.series_length) * math.sqrt(self.n_dims)
        )
    if self.__n_intervals <= 0:
        self.__n_intervals = 1
    # Clamp interval bounds to the series length.
    if self.series_length < self.min_interval:
        self.min_interval = self.series_length
    if self.max_interval is None:
        self.__max_interval = self.series_length / 2
    if self.__max_interval < self.min_interval:
        self.__max_interval = self.min_interval

    # Fit all ensemble members in parallel.
    fit = Parallel(n_jobs=self.n_jobs)(
        delayed(self._fit_estimator)(
            X,
            y,
            i,
        )
        for i in range(self.n_estimators)
    )

    self.classifiers, self.intervals, self.dims, self.atts = zip(*fit)

    self._is_fitted = True
    return self
def fit(self, X, y):
    """Fit a single TD classifier on n_instances cases (X, y).

    Parameters
    ----------
    X : pd.DataFrame of shape [n_instances, 1]
        Nested dataframe with univariate time-series in cells.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    X, y = check_X_y(X, y, coerce_to_numpy=True)

    self.n_instances, self.n_dims, self.series_length = X.shape
    self.class_vals = y
    self.num_classes = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
    for index, classVal in enumerate(self.classes_):
        self.class_dictionary[classVal] = index

    # select dimensions using accuracy estimate if multivariate
    if self.n_dims > 1:
        self.dims, self.transformers = self._select_dims(X, y)

        words = [defaultdict(int) for _ in range(self.n_instances)]

        for i, dim in enumerate(self.dims):
            X_dim = X[:, dim, :].reshape(self.n_instances, 1,
                                         self.series_length)
            dim_words = self.transformers[i].transform(X_dim, y)
            dim_words = dim_words[0]

            # FIX: the inner index previously shadowed the enumerate
            # index `i`; use a distinct name to remove the hazard
            # (behaviour unchanged, since `i` was not read afterwards).
            for n in range(self.n_instances):
                for word, count in dim_words[n].items():
                    # Tag each word with its dimension in the high bits.
                    words[n][word << self.highest_dim_bit | dim] = count

        self.transformed_data = words
    else:
        self.transformers.append(
            SFA(
                word_length=self.word_length,
                alphabet_size=self.alphabet_size,
                window_size=self.window_size,
                norm=self.norm,
                levels=self.levels,
                binning_method="information-gain"
                if self.igb else "equi-depth",
                bigrams=self.bigrams,
                remove_repeat_words=True,
                save_words=False,
                n_jobs=self.n_jobs,
            ))
        sfa = self.transformers[0].fit_transform(X, y)
        self.transformed_data = sfa[0]

    self._is_fitted = True
    return self
def check_and_clean_data(X, y=None, input_checks=True):
    """Perform basic sktime data checks and prepare the train data for
    input to Keras models.

    Parameters
    ----------
    X : nested pd.DataFrame, 2D pd.DataFrame, or np.ndarray
        The train data.
    y : array-like, optional
        The train labels; only used for the joint X/y check.
    input_checks : bool, default=True
        Whether to perform the basic sktime checks.

    Returns
    -------
    np.ndarray
        Data transposed to [instances][dimensions][timepoints].
    """
    if input_checks:
        if y is None:
            check_X(X)
        else:
            check_X_y(X, y)

    # want data in form: [instances = n][timepoints = m][dimensions = d]
    if isinstance(X, pd.DataFrame):
        if _is_nested_dataframe(X):
            if X.shape[1] > 1:
                # we have multiple columns, AND each cell contains a series,
                # so this is a multidimensional problem
                X = _multivariate_nested_df_to_array(X)
            else:
                # we have a single column containing a series, treat this as
                # a univariate problem
                X = _univariate_nested_df_to_array(X)
        else:
            # we have multiple columns each containing a primitive, treat as
            # univariate series
            X = _univariate_df_to_array(X)

    if len(X.shape) == 2:
        # add a dimension to make it multivariate with one dimension
        # NOTE(review): `.values` assumes X is still a DataFrame here —
        # confirm the conversion helpers above return a DataFrame in the
        # 2-D case, otherwise this raises on an ndarray.
        X = X.values.reshape(
            X.shape[0], X.shape[1], 1
        )  # go from [n][m] to [n][m][d=1]

    # return transposed data to conform with current model formats
    return X.transpose(0, 2, 1)
def fit(self, X, y):
    """Build the classifier on the training set (X, y).

    Finds the best proximity stump at this node, then recursively grows
    sub-trees on each non-leaf branch up to ``max_depth``.

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_instances, n_columns]
        The training input samples. If a Pandas data frame is passed,
        column 0 is extracted.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    X, y = check_X_y(X, y, enforce_univariate=True, coerce_to_pandas=True)
    self.X = positive_dataframe_indices(X)
    self.random_state = check_random_state(self.random_state)
    # Default stump search: best of n randomly generated stumps.
    if self.find_stump is None:
        self.find_stump = best_of_n_stumps(self.n_stump_evaluations)
    # setup label encoding (only fit the encoder when none was provided)
    if self.label_encoder is None:
        self.label_encoder = LabelEncoder()
        y = self.label_encoder.fit_transform(y)
    self.y = y
    self.classes_ = self.label_encoder.classes_
    # Resolve the distance measure lazily via the configured factories.
    if self.distance_measure is None:
        if self.get_distance_measure is None:
            self.get_distance_measure = self.setup_distance_measure(self)
        self.distance_measure = self.get_distance_measure(self)
    # Split this node.
    self.stump = self.find_stump(self)
    n_branches = len(self.stump.y_exemplar)
    self.branches = [None] * n_branches
    # Grow sub-trees on branches that are not pure enough to be leaves.
    if self.depth < self.max_depth:
        for index in range(n_branches):
            sub_y = self.stump.y_branches[index]
            if not self.is_leaf(sub_y):
                sub_tree = ProximityTree(
                    random_state=self.random_state,
                    get_exemplars=self.get_exemplars,
                    distance_measure=self.distance_measure,
                    setup_distance_measure=self.setup_distance_measure,
                    get_distance_measure=self.get_distance_measure,
                    get_gain=self.get_gain,
                    is_leaf=self.is_leaf,
                    verbosity=self.verbosity,
                    max_depth=self.max_depth,
                    n_jobs=self.n_jobs,
                )
                # Share the parent's label encoder and track depth.
                sub_tree.label_encoder = self.label_encoder
                sub_tree.depth = self.depth + 1
                self.branches[index] = sub_tree
                sub_X = self.stump.X_branches[index]
                sub_tree.fit(sub_X, sub_y)
    self._is_fitted = True
    return self
def fit(self, X, y):
    """Fit the component classifiers (STC, TSF, RISE, cBOSS) and weight
    each by its estimated train accuracy raised to the fourth power.

    Parameters
    ----------
    X : univariate panel input (validated and coerced to numpy below).
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    X, y = check_X_y(X, y, enforce_univariate=True, coerce_to_numpy=True)
    self.n_classes = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

    # Cap the CV folds at the smallest class count so every fold has at
    # least one example of each class.
    cv_size = 10
    _, counts = np.unique(y, return_counts=True)
    min_class = np.min(counts)
    if min_class < cv_size:
        cv_size = min_class

    self.stc = ShapeletTransformClassifier(
        random_state=self.random_state,
        time_contract_in_mins=60,
    )
    self.stc.fit(X, y)
    train_preds = cross_val_predict(
        ShapeletTransformClassifier(
            random_state=self.random_state,
            time_contract_in_mins=60,
        ),
        X=X,
        y=y,
        cv=cv_size,
    )
    self.stc_weight = accuracy_score(y, train_preds)**4

    self.tsf = TimeSeriesForest(random_state=self.random_state)
    self.tsf.fit(X, y)
    train_preds = cross_val_predict(
        TimeSeriesForest(random_state=self.random_state),
        X=X,
        y=y,
        cv=cv_size,
    )
    self.tsf_weight = accuracy_score(y, train_preds)**4

    self.rise = RandomIntervalSpectralForest(
        random_state=self.random_state)
    # BUG FIX: this previously called `self.fit(X, y)`, recursing into
    # this method forever instead of fitting the RISE component.
    self.rise.fit(X, y)
    train_preds = cross_val_predict(
        RandomIntervalSpectralForest(random_state=self.random_state),
        X=X,
        y=y,
        cv=cv_size,
    )
    self.rise_weight = accuracy_score(y, train_preds)**4

    self.cboss = ContractableBOSS(random_state=self.random_state)
    self.cboss.fit(X, y)
    # cBOSS exposes train probabilities directly; derive predictions.
    train_probs = self.cboss._get_train_probs(X)
    train_preds = self.cboss.classes_[np.argmax(train_probs, axis=1)]
    self.cboss_weight = accuracy_score(y, train_preds)**4

    # Mark fitted, consistent with the other fit methods in this module.
    self._is_fitted = True
    return self
def _get_train_probs(self, X, y, train_estimate_method="loocv") -> np.ndarray:
    """Estimate class probabilities on the training data.

    Parameters
    ----------
    X : 3D np.ndarray [n_instances, n_dims, series_length]
        Must be the same data used in ``fit``.
    y : array-like, shape = [n_instances]
        The class labels.
    train_estimate_method : str, default="loocv"
        "loocv" reuses each member's per-subsample train predictions;
        "oob" predicts on each member's out-of-bag instances.

    Returns
    -------
    np.ndarray of shape [n_instances, n_classes_]
        Normalised probability estimates per training instance.

    Raises
    ------
    ValueError
        If X does not match the fit data shape, or the method is unknown.
    """
    self.check_is_fitted()
    X, y = check_X_y(X, y, coerce_to_numpy=True)

    n_instances, n_dims, series_length = X.shape

    # Train probabilities only make sense for the exact fit data.
    if (n_instances != self.n_instances_ or n_dims != self.n_dims_
            or series_length != self.series_length_):
        raise ValueError(
            "n_instances, n_dims, series_length mismatch. X should be "
            "the same as the training data used in fit for generating train "
            "probabilities.")

    results = np.zeros((n_instances, self.n_classes_))
    divisors = np.zeros(n_instances)

    if train_estimate_method.lower() == "loocv":
        for i, clf in enumerate(self.estimators_):
            subsample = clf._subsample
            # Reuse saved train predictions when available, otherwise
            # recompute them per instance in parallel.
            preds = (clf._train_predictions
                     if self.save_train_predictions else
                     Parallel(n_jobs=self._threads_to_use)(
                         delayed(clf._train_predict)(i, )
                         for i in range(len(subsample))))

            for n, pred in enumerate(preds):
                results[subsample[n]][
                    self._class_dictionary[pred]] += self.weights_[i]
                divisors[subsample[n]] += self.weights_[i]
    elif train_estimate_method.lower() == "oob":
        indices = range(n_instances)
        for i, clf in enumerate(self.estimators_):
            # Out-of-bag instances: those not in this member's subsample.
            oob = [n for n in indices if n not in clf._subsample]

            if len(oob) == 0:
                continue

            preds = clf.predict(X[oob])

            for n, pred in enumerate(preds):
                results[oob[n]][
                    self._class_dictionary[pred]] += self.weights_[i]
                divisors[oob[n]] += self.weights_[i]
    else:
        raise ValueError(
            "Invalid train_estimate_method. Available options: loocv, oob")

    # Normalise; instances with no votes get a uniform distribution.
    for i in range(n_instances):
        results[i] = (np.ones(self.n_classes_) * (1 / self.n_classes_)
                      if divisors[i] == 0 else
                      results[i] / (np.ones(self.n_classes_) * divisors[i]))

    return results
def fit(self, X, y):
    """Build a forest of trees from the training set (X, y) using random
    intervals and summary features.

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_instances, series_length]
        or shape = [n_instances, n_columns]
        The training input samples. If a Pandas data frame is passed it
        must have a single column (i.e. univariate classification). TSF
        has no bespoke method for multivariate classification as yet.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    X, y = check_X_y(
        X,
        y,
        enforce_univariate=not TimeSeriesForest.capabilities["multivariate"],
        coerce_to_numpy=True,
    )
    # Drop the single channel axis: X becomes [n_instances, series_length].
    X = X.squeeze(1)

    n_instances, self.series_length = X.shape
    rng = check_random_state(self.random_state)
    self.n_classes = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

    # sqrt(series_length) intervals per tree, at least one.
    self.n_intervals = int(math.sqrt(self.series_length))
    if self.n_intervals == 0:
        self.n_intervals = 1
    # NOTE(review): mutates the min_interval hyperparameter in place.
    if self.series_length < self.min_interval:
        self.min_interval = self.series_length

    # Pre-draw each estimator's interval set.
    self.intervals_ = [
        _get_intervals(self.n_intervals, self.min_interval,
                       self.series_length, rng)
        for _ in range(self.n_estimators)
    ]

    # Fit all ensemble members in parallel.
    self.estimators_ = Parallel(n_jobs=self.n_jobs)(
        delayed(_fit_estimator)(
            X,
            y,
            self.base_estimator,
            self.intervals_[i],
            self.random_state,
        ) for i in range(self.n_estimators)
    )

    self._is_fitted = True
    return self
def fit(self, X, y):
    """Fit time series classifier to training data.

    Parameters
    ----------
    X : 2D np.array (univariate, equal length series)
        of shape = [n_instances, series_length]
        or 3D np.array (any number of dimensions, equal length series)
        of shape = [n_instances,n_dimensions,series_length]
        or pd.DataFrame with each column a dimension, each cell a
        pd.Series (any number of dimensions, equal or unequal length
        series)
    y : 1D np.array of shape = [n_instances] - the class labels.

    Returns
    -------
    self : Reference to self.

    Notes
    -----
    Changes state by creating a fitted model that updates attributes
    ending in "_" and sets is_fitted flag to True.
    """
    # Coerce/validate input per the estimator's declared capability tags.
    coerce_to_numpy = self.get_tag("coerce-X-to-numpy")
    coerce_to_pandas = self.get_tag("coerce-X-to-pandas")
    allow_multivariate = self.get_tag("capability:multivariate")
    X, y = check_X_y(
        X,
        y,
        coerce_to_numpy=coerce_to_numpy,
        coerce_to_pandas=coerce_to_pandas,
        enforce_univariate=not allow_multivariate,
    )

    multithread = self.get_tag("capability:multithreading")
    if multithread:
        # NOTE(review): a missing `self.n_jobs` attribute would raise
        # AttributeError, not NameError — confirm which failure mode this
        # guard was meant to catch.
        try:
            self._threads_to_use = check_n_jobs(self.n_jobs)
        except NameError:
            raise AttributeError(
                "self.n_jobs must be set if capability:multithreading is True"
            )

    self.classes_ = np.unique(y)
    self.n_classes_ = self.classes_.shape[0]
    for index, classVal in enumerate(self.classes_):
        self._class_dictionary[classVal] = index

    self._fit(X, y)

    # this should happen last
    self._is_fitted = True
    return self
def fit(self, X, y):
    """Build a forest of trees from the training set (X, y) using random
    intervals and catch22/tsf summary features.

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_instances, series_length]
        or shape = [n_instances, series_length]
        The training input samples. If a Pandas data frame is passed it
        must have a single column (i.e. univariate classification).
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    X, y = check_X_y(X, y, coerce_to_numpy=True)

    self.n_instances, self.n_dims, self.series_length = X.shape
    self.n_classes = np.unique(y).shape[0]
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

    # Default interval count scales with sqrt(series_length * n_dims).
    # NOTE(review): the name-mangled attribute is only assigned here when
    # n_intervals is None — presumably __init__ seeds it from the public
    # parameter otherwise; confirm.
    if self.n_intervals is None:
        self.__n_intervals = 4 + int(
            (math.sqrt(self.series_length) * math.sqrt(self.n_dims)) / 3)
    if self.__n_intervals <= 0:
        self.__n_intervals = 1
    # Clamp interval bounds to the series length.
    if self.series_length < self.min_interval:
        self.min_interval = self.series_length
    if self.max_interval is None:
        self.__max_interval = self.series_length / 2
    if self.__max_interval < self.min_interval:
        self.__max_interval = self.min_interval

    # Frequency-domain and first-difference representations of the series.
    _, X_p = signal.periodogram(X)
    X_d = np.diff(X, 1)

    # Intervals drawn across base, periodogram, and difference series.
    self.total_intervals = self.__n_intervals * 2 + int(
        self.__n_intervals / 2)

    # Fit all ensemble members in parallel.
    fit = Parallel(n_jobs=self.n_jobs)(delayed(self._fit_estimator)(
        X,
        X_p,
        X_d,
        y,
        i,
    ) for i in range(self.n_estimators))

    self.classifiers, self.intervals, self.dims, self.atts = zip(*fit)

    self._is_fitted = True
    return self
def fit(self, X, y):
    """Fit the classifier: transform X to SFA words and store train data.

    Parameters
    ----------
    X : univariate panel input (coerced to numpy).
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    X, y = check_X_y(X, y, enforce_univariate=True, coerce_to_numpy=True)

    # fit_transform returns a list; element 0 holds the per-instance words.
    sfa = self.transformer.fit_transform(X, y)
    self.transformed_data = sfa[0]

    self.class_vals = y
    self.num_classes = len(np.unique(y))
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
    for idx, label in enumerate(self.classes_):
        self.class_dictionary[label] = idx

    self._is_fitted = True
    return self
def fit(self, X, y):
    """Build a forest of trees from the training set (X, y).

    Parameters
    ----------
    X : np.ndarray or pd.DataFrame
        Panel training data.
    y : np.ndarray
        The class labels.

    Returns
    -------
    self : object
        A fitted instance of the classifier
    """
    X, y = check_X_y(
        X,
        y,
        enforce_univariate=not self.capabilities["multivariate"],
        coerce_to_numpy=True,
    )
    # Drop the single channel axis: X becomes [n_instances, series_length].
    X = X.squeeze(1)
    n_instances, self.series_length = X.shape

    n_jobs = check_n_jobs(self.n_jobs)

    rng = check_random_state(self.random_state)

    self.n_classes = np.unique(y).shape[0]

    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]

    # sqrt(series_length) intervals per estimator, at least one.
    self.n_intervals = int(math.sqrt(self.series_length))
    if self.n_intervals == 0:
        self.n_intervals = 1
    # NOTE(review): mutates the min_interval hyperparameter in place.
    if self.series_length < self.min_interval:
        self.min_interval = self.series_length

    # Pre-draw each estimator's interval set.
    self.intervals_ = [
        _get_intervals(self.n_intervals, self.min_interval,
                       self.series_length, rng)
        for _ in range(self.n_estimators)
    ]

    # Fit all ensemble members in parallel, each on a cloned estimator.
    self.estimators_ = Parallel(n_jobs=n_jobs)(
        delayed(_fit_estimator)(_clone_estimator(self.base_estimator,
                                                 rng), X, y,
                                self.intervals_[i])
        for i in range(self.n_estimators))

    self._is_fitted = True
    return self
def fit(self, X, y):
    """Perform a shapelet transform then build a random forest.

    Contract default for ST is 5 hours.

    Parameters
    ----------
    X : array-like or sparse matrix of shape = [n_instances, series_length]
        or shape = [n_instances, n_columns]
        The training input samples. If a Pandas data frame is passed it
        must have a single column (i.e. univariate classification).
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self : object
    """
    X, y = check_X_y(X, y, enforce_univariate=True)

    # if y is a pd.Series then convert to array.
    if isinstance(y, pd.Series):
        y = y.to_numpy()

    # Build the pipeline in fit (not __init__) so the random state is
    # propagated properly.
    st_step = ContractedShapeletTransform(
        time_contract_in_mins=self.transform_contract_in_mins,
        verbose=False,
        random_state=self.random_state,
    )
    rf_step = RandomForestClassifier(n_estimators=self.n_estimators,
                                     random_state=self.random_state)
    self.classifier_ = Pipeline([("st", st_step), ("rf", rf_step)])

    self.n_classes_ = len(np.unique(y))
    self.classes_ = class_distribution(np.asarray(y).reshape(-1, 1))[0][0]
    self.classifier_.fit(X, y)

    self._is_fitted = True
    return self
def fit(self, X, y=None):
    """Fit: extract tsfresh features, then fit the relevance selector.

    Parameters
    ----------
    X : pd.DataFrame
        nested pandas DataFrame of shape [n_samples, n_columns]
    y : pd.Series or np.array
        Target variable

    Returns
    -------
    self : an instance of self
    """
    # lazy import to avoid a hard dependency on tsfresh
    from tsfresh.transformers.feature_selector import FeatureSelector

    # Feature selection is supervised; y is mandatory despite the default.
    if y is None:
        raise ValueError(
            f"{self.__class__.__name__} requires `y` in `fit`.")
    X, y = check_X_y(X, y, coerce_to_pandas=True)

    self.extractor_ = TSFreshFeatureExtractor(
        default_fc_parameters=self.default_fc_parameters,
        kind_to_fc_parameters=self.kind_to_fc_parameters,
        chunksize=self.chunksize,
        n_jobs=self.n_jobs,
        show_warnings=self.show_warnings,
        disable_progressbar=self.disable_progressbar,
        profiling=self.profiling,
        profiling_filename=self.profiling_filename,
        profiling_sorting=self.profiling_sorting,
    )

    select_kwargs = self._get_selection_params()
    extract_kwargs = self._get_extraction_params()
    self.selector_ = FeatureSelector(
        n_jobs=extract_kwargs["n_jobs"],
        chunksize=extract_kwargs["chunksize"],
        ml_task=self.ml_task,
        **select_kwargs,
    )

    # Extract features, then keep only the relevant ones for y.
    extracted = self.extractor_.fit_transform(X)
    self.selector_.fit(extracted, y)

    self._is_fitted = True
    return self