def test_summary_transformer_incorrect_quantile_raises_error(quantile_arg):
    """Check that invalid `quantiles` inputs raise a ValueError with the expected message."""
    msg = """`quantiles` must be int, float or a list or tuple made up of int and float values that are between 0 and 1. """
    with pytest.raises(ValueError, match=msg):
        # Chain construction and fit_transform inside the context manager so the
        # error is caught wherever validation happens.
        SummaryTransformer(
            summary_function="mean", quantiles=quantile_arg
        ).fit_transform(data_to_test[0])
def test_summary_transformer_incorrect_summary_function_raises_error(summary_arg):
    """Check that invalid `summary_function` inputs raise a ValueError with the expected message."""
    msg = rf"""`summary_function` must be str or a list or tuple made up of {ALLOWED_SUM_FUNCS}. """
    with pytest.raises(ValueError, match=re.escape(msg)):
        # Chain construction and fit_transform inside the context manager so the
        # error is caught wherever validation happens.
        SummaryTransformer(
            summary_function=summary_arg, quantiles=None
        ).fit_transform(data_to_test[0])
def get_test_params(cls, parameter_set="default"):
    """Return testing parameter settings for the estimator.

    Parameters
    ----------
    parameter_set : str, default="default"
        Name of the set of test parameters to return, for use in tests. If no
        special parameters are defined for a value, will return `"default"` set.
        For classifiers, a "default" set of parameters should be provided for
        general testing, and a "results_comparison" set for comparing against
        previously recorded results if the general set does not produce suitable
        probabilities to compare against.

    Returns
    -------
    params : dict or list of dict, default={}
        Parameters to create testing instances of the class.
        Each dict are parameters to construct an "interesting" test instance,
        i.e., `MyClass(**params)` or `MyClass(**params[i])` creates a valid test
        instance. `create_test_instance` uses the first (or only) dictionary in
        `params`.
    """
    from sklearn.ensemble import RandomForestClassifier

    from sktime.transformations.series.summarize import SummaryTransformer

    if parameter_set == "results_comparison":
        # Larger configuration for reproducing recorded results.
        summary = SummaryTransformer(
            summary_function=("mean", "std", "min", "max"),
            quantiles=(0.25, 0.5, 0.75),
        )
        return {
            "n_intervals": 3,
            "estimator": RandomForestClassifier(n_estimators=10),
            "interval_transformers": summary,
        }

    # Minimal configuration for fast general testing.
    return {
        "n_intervals": 2,
        "estimator": RandomForestClassifier(n_estimators=2),
        "interval_transformers": SummaryTransformer(
            summary_function=("mean", "min", "max"),
        ),
    }
def _fit(self, X, y):
    """Fit a pipeline on cases (X,y), where y is the target variable.

    Parameters
    ----------
    X : 3D np.array of shape = [n_instances, n_dimensions, series_length]
        The training data.
    y : array-like, shape = [n_instances]
        The class labels.

    Returns
    -------
    self :
        Reference to self.

    Notes
    -----
    Changes state by creating a fitted model that updates attributes
    ending in "_" and sets is_fitted flag to True.
    """
    self._transformer = SummaryTransformer(
        summary_function=self.summary_functions,
        quantiles=self.summary_quantiles,
    )

    # Default to a 200-tree random forest when no estimator was supplied.
    base_estimator = self.estimator
    if base_estimator is None:
        base_estimator = RandomForestClassifier(n_estimators=200)
    self._estimator = _clone_estimator(base_estimator, self.random_state)

    # Propagate the configured thread count to estimators that support it.
    if getattr(self._estimator, "n_jobs", None) is not None:
        self._estimator.n_jobs = self._threads_to_use

    X_t = self._transformer.fit_transform(X, y)
    # More rows than labels means one row per (instance, dimension);
    # flatten to one feature row per instance.
    if X_t.shape[0] > len(y):
        X_t = X_t.to_numpy().reshape((len(y), -1))
    self._transform_atts = X_t.shape[1]

    self._estimator.fit(X_t, y)
    return self
def fit(self, X, y=None):
    """Fit the random interval transform.

    Selects `n_intervals` random (dimension, interval) pairs and prepares the
    per-interval transformers that `transform` will apply.

    Parameters
    ----------
    X : pandas DataFrame or 3d numpy array, input time series
    y : array_like, target values (optional, ignored)
    """
    X = check_X(X, coerce_to_numpy=True)
    _, n_dims, series_length = X.shape

    # Default transformer set: summary stats plus quartiles.
    if self.transformers is None:
        self._transformers = [
            SummaryTransformer(
                summary_function=("mean", "std", "min", "max"),
                quantiles=(0.25, 0.5, 0.75),
            )
        ]

    # NOTE(review): when `self.transformers` is not None, this relies on
    # `self._transformers` having been assigned elsewhere (e.g. __init__) —
    # confirm, otherwise this line raises AttributeError.
    if not isinstance(self._transformers, list):
        self._transformers = [self._transformers]

    # Clone each transformer (seeded) and propagate the thread count where supported.
    li = []
    for i in range(len(self._transformers)):
        li.append(
            _clone_estimator(
                self._transformers[i],
                self.random_state,
            ))

        m = getattr(li[i], "n_jobs", None)
        if m is not None:
            li[i].n_jobs = self.n_jobs
    self._transformers = li

    rng = check_random_state(self.random_state)

    # One random dimension per interval (with replacement).
    self._dims = rng.choice(n_dims, self.n_intervals, replace=True)
    # Each row is an (start, end) pair; intervals are at least 3 points long.
    self._intervals = np.zeros((self.n_intervals, 2), dtype=int)

    # The order of rng calls below determines reproducibility for a given seed —
    # do not reorder.
    for i in range(0, self.n_intervals):
        if rng.random() < 0.5:
            # Draw the start first, then a feasible length (minimum 3).
            self._intervals[i][0] = rng.randint(0, series_length - 3)
            length = (
                rng.randint(0, series_length - self._intervals[i][0] - 3) + 3
                if series_length - self._intervals[i][0] - 3 > 0
                else 3)
            self._intervals[i][1] = self._intervals[i][0] + length
        else:
            # Draw the end first, then a feasible length (minimum 3).
            self._intervals[i][1] = rng.randint(0, series_length - 3) + 3
            length = (rng.randint(0, self._intervals[i][1] - 3) + 3
                      if self._intervals[i][1] - 3 > 0
                      else 3)
            self._intervals[i][0] = self._intervals[i][1] - length

    self._is_fitted = True
    return self
def test_summary_transformer_output_type(y, summary_arg, quantile_arg):
    """Check that the transform output is a DataFrame with the expected shape."""
    transformer = SummaryTransformer(
        summary_function=summary_arg, quantiles=quantile_arg
    )
    transformer.fit(y)
    yt = transformer.transform(y)

    # One output row per input series.
    n_instances = 1 if isinstance(y, pd.Series) else y.shape[1]

    # One column per summary function requested.
    n_sum_features = 1 if isinstance(summary_arg, str) else len(summary_arg)

    # Plus one column per quantile requested (a scalar counts as one).
    if quantile_arg is None:
        n_q_features = 0
    elif isinstance(quantile_arg, (int, float)):
        n_q_features = 1
    else:
        n_q_features = len(quantile_arg)

    assert isinstance(yt, pd.DataFrame)
    assert yt.shape == (n_instances, n_sum_features + n_q_features)
def test_random_interval_classifier_on_basic_motions():
    """Test of RandomIntervalClassifier on basic motions."""
    # Load the basic motions data and pick a fixed 10-case subset for speed.
    X_train, y_train = load_basic_motions(split="train")
    X_test, y_test = load_basic_motions(split="test")
    subset = np.random.RandomState(4).choice(len(y_train), 10, replace=False)

    # Train a seeded random interval classifier on the subset.
    clf = RandomIntervalClassifier(
        random_state=0,
        n_intervals=5,
        interval_transformers=SummaryTransformer(
            summary_function=("mean", "std", "min", "max"),
            quantiles=(0.25, 0.5, 0.75),
        ),
        estimator=RandomForestClassifier(n_estimators=10),
    )
    clf.fit(X_train.iloc[subset], y_train[subset])

    # Probabilities must match the recorded reference values.
    probas = clf.predict_proba(X_test.iloc[subset])
    testing.assert_array_almost_equal(
        probas, random_interval_classifier_basic_motions_probas, decimal=2
    )
n_estimators=10, ) ), ) _print_array( "MatrixProfileClassifier - UnitTest", _reproduce_classification_unit_test(MatrixProfileClassifier(random_state=0)), ) _print_array( "RandomIntervalClassifier - UnitTest", _reproduce_classification_unit_test( RandomIntervalClassifier( random_state=0, n_intervals=5, interval_transformers=SummaryTransformer( summary_function=("mean", "std", "min", "max"), quantiles=(0.25, 0.5, 0.75), ), estimator=RandomForestClassifier(n_estimators=10), ) ), ) _print_array( "RandomIntervalClassifier - BasicMotions", _reproduce_classification_basic_motions( RandomIntervalClassifier( random_state=0, n_intervals=5, interval_transformers=SummaryTransformer( summary_function=("mean", "std", "min", "max"), quantiles=(0.25, 0.5, 0.75), ),
"default_fc_parameters": "minimal", }, FreshPRINCE: { "n_estimators": 3, "default_fc_parameters": "minimal", }, RandomIntervals: { "n_intervals": 3, }, RandomIntervalClassifier: { "n_intervals": 3, "estimator": RandomForestClassifier(n_estimators=3), "interval_transformers": SummaryTransformer(summary_function=("mean", "min", "max"), ), }, SummaryClassifier: { "estimator": RandomForestClassifier(n_estimators=3), "summary_functions": ("mean", "min", "max"), }, RocketClassifier: { "num_kernels": 100 }, Arsenal: { "num_kernels": 50, "n_estimators": 3 }, HIVECOTEV1: { "stc_params": { "estimator": RotationForest(n_estimators=2),
def set_classifier(cls, resample_id=None, train_file=False):
    """Construct a classifier, possibly seeded.

    Basic way of creating the classifier to build using the default settings. This
    set up is to help with batch jobs for multiple problems to facilitate easy
    reproducibility for use with load_and_run_classification_experiment. You can pass a
    classifier object instead to run_classification_experiment.

    Parameters
    ----------
    cls : str
        String indicating which classifier you want.
    resample_id : int or None, default=None
        Classifier random seed.
    train_file : bool, default=False
        Whether a train file is being produced.

    Return
    ------
    classifier : A BaseClassifier.
        The classifier matching the input classifier name.

    Raises
    ------
    ValueError
        If `cls` does not match any known classifier name.
    """
    name = cls.lower()
    # Dictionary based
    if name == "boss" or name == "bossensemble":
        return BOSSEnsemble(random_state=resample_id)
    elif name == "cboss" or name == "contractableboss":
        return ContractableBOSS(random_state=resample_id)
    elif name == "tde" or name == "temporaldictionaryensemble":
        return TemporalDictionaryEnsemble(
            random_state=resample_id, save_train_predictions=train_file
        )
    elif name == "weasel":
        return WEASEL(random_state=resample_id)
    elif name == "muse":
        return MUSE(random_state=resample_id)
    # Distance based
    elif name == "pf" or name == "proximityforest":
        return ProximityForest(random_state=resample_id)
    elif name == "pt" or name == "proximitytree":
        return ProximityTree(random_state=resample_id)
    # BUG FIX: was "proximityStump" — unreachable, since `name` is lowercased above.
    elif name == "ps" or name == "proximitystump":
        return ProximityStump(random_state=resample_id)
    elif name == "dtwcv" or name == "kneighborstimeseriesclassifier":
        return KNeighborsTimeSeriesClassifier(distance="dtwcv")
    elif name == "dtw" or name == "1nn-dtw":
        return KNeighborsTimeSeriesClassifier(distance="dtw")
    elif name == "msm" or name == "1nn-msm":
        return KNeighborsTimeSeriesClassifier(distance="msm")
    elif name == "ee" or name == "elasticensemble":
        return ElasticEnsemble(random_state=resample_id)
    elif name == "shapedtw":
        return ShapeDTW()
    # Feature based
    elif name == "summary":
        return SummaryClassifier(
            random_state=resample_id,
            estimator=RandomForestClassifier(n_estimators=500),
        )
    elif name == "summary-intervals":
        return RandomIntervalClassifier(
            random_state=resample_id,
            interval_transformers=SummaryTransformer(
                summary_function=("mean", "std", "min", "max"),
                quantiles=(0.25, 0.5, 0.75),
            ),
            estimator=RandomForestClassifier(n_estimators=500),
        )
    elif name == "summary-catch22":
        return RandomIntervalClassifier(
            random_state=resample_id,
            estimator=RandomForestClassifier(n_estimators=500),
        )
    elif name == "catch22":
        return Catch22Classifier(
            random_state=resample_id,
            estimator=RandomForestClassifier(n_estimators=500),
        )
    elif name == "matrixprofile":
        return MatrixProfileClassifier(random_state=resample_id)
    elif name == "signature":
        return SignatureClassifier(
            random_state=resample_id,
            estimator=RandomForestClassifier(n_estimators=500),
        )
    elif name == "tsfresh":
        return TSFreshClassifier(
            random_state=resample_id,
            estimator=RandomForestClassifier(n_estimators=500),
        )
    elif name == "tsfresh-r":
        return TSFreshClassifier(
            random_state=resample_id,
            estimator=RandomForestClassifier(n_estimators=500),
            relevant_feature_extractor=True,
        )
    elif name == "freshprince":
        return FreshPRINCE(random_state=resample_id, save_transformed_data=train_file)
    # Hybrid
    elif name == "hc1" or name == "hivecotev1":
        return HIVECOTEV1(random_state=resample_id)
    elif name == "hc2" or name == "hivecotev2":
        return HIVECOTEV2(random_state=resample_id)
    # Interval based
    elif name == "rise" or name == "randomintervalspectralforest":
        return RandomIntervalSpectralForest(
            random_state=resample_id, n_estimators=500
        )
    elif name == "tsf" or name == "timeseriesforestclassifier":
        return TimeSeriesForestClassifier(random_state=resample_id, n_estimators=500)
    elif name == "cif" or name == "canonicalintervalforest":
        return CanonicalIntervalForest(random_state=resample_id, n_estimators=500)
    elif name == "stsf" or name == "supervisedtimeseriesforest":
        return SupervisedTimeSeriesForest(random_state=resample_id, n_estimators=500)
    elif name == "drcif":
        return DrCIF(
            random_state=resample_id,
            n_estimators=500,
            save_transformed_data=train_file,
        )
    # Kernel based
    elif name == "rocket":
        return RocketClassifier(random_state=resample_id)
    elif name == "mini-rocket":
        return RocketClassifier(
            random_state=resample_id, rocket_transform="minirocket"
        )
    elif name == "multi-rocket":
        return RocketClassifier(
            random_state=resample_id, rocket_transform="multirocket"
        )
    elif name == "arsenal":
        return Arsenal(random_state=resample_id, save_transformed_data=train_file)
    elif name == "mini-arsenal":
        return Arsenal(
            random_state=resample_id,
            save_transformed_data=train_file,
            rocket_transform="minirocket",
        )
    elif name == "multi-arsenal":
        return Arsenal(
            random_state=resample_id,
            save_transformed_data=train_file,
            rocket_transform="multirocket",
        )
    # Shapelet based
    elif name == "stc" or name == "shapelettransformclassifier":
        return ShapeletTransformClassifier(
            transform_limit_in_minutes=120,
            random_state=resample_id,
            save_transformed_data=train_file,
        )
    else:
        # ValueError (a subclass of Exception, so existing handlers still match)
        # and include the unknown name for easier debugging of batch jobs.
        raise ValueError(f"UNKNOWN CLASSIFIER: {cls}")
class SummaryClassifier(BaseClassifier):
    """Summary statistic classifier.

    This classifier simply transforms the input data using the SummaryTransformer
    transformer and builds a provided estimator using the transformed data.

    Parameters
    ----------
    summary_functions : str, list, tuple, default=("mean", "std", "min", "max")
        Either a string, or list or tuple of strings indicating the pandas summary
        functions that are used to summarize each column of the dataset.
        Must be one of ("mean", "min", "max", "median", "sum", "skew", "kurt", "var",
        "std", "mad", "sem", "nunique", "count").
    summary_quantiles : str, list, tuple or None, default=(0.25, 0.5, 0.75)
        Optional list of series quantiles to calculate. If None, no quantiles
        are calculated.
    estimator : sklearn classifier, default=None
        An sklearn estimator to be built using the transformed data. Defaults to a
        Random Forest with 200 trees.
    n_jobs : int, default=1
        The number of jobs to run in parallel for both `fit` and `predict`.
        ``-1`` means using all processors.
    random_state : int or None, default=None
        Seed for random, integer.

    Attributes
    ----------
    n_classes_ : int
        Number of classes. Extracted from the data.
    classes_ : ndarray of shape (n_classes)
        Holds the label for each class.

    See Also
    --------
    SummaryTransformer

    Examples
    --------
    >>> from sktime.classification.feature_based import SummaryClassifier
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from sktime.datasets import load_unit_test
    >>> X_train, y_train = load_unit_test(split="train", return_X_y=True)
    >>> X_test, y_test = load_unit_test(split="test", return_X_y=True)
    >>> clf = SummaryClassifier(estimator=RandomForestClassifier(n_estimators=10))
    >>> clf.fit(X_train, y_train)
    SummaryClassifier(...)
    >>> y_pred = clf.predict(X_test)
    """

    _tags = {
        "capability:multivariate": True,
        "capability:multithreading": True,
    }

    def __init__(
        self,
        summary_functions=("mean", "std", "min", "max"),
        summary_quantiles=(0.25, 0.5, 0.75),
        estimator=None,
        n_jobs=1,
        random_state=None,
    ):
        self.summary_functions = summary_functions
        self.summary_quantiles = summary_quantiles
        self.estimator = estimator

        self.n_jobs = n_jobs
        self.random_state = random_state

        # Fitted components, populated in _fit.
        self._transformer = None
        self._estimator = None
        # Number of feature columns produced by the transformer during fit;
        # used in predict to restore the fitted feature shape.
        self._transform_atts = 0

        super(SummaryClassifier, self).__init__()

    def _fit(self, X, y):
        """Fit a pipeline on cases (X,y), where y is the target variable.

        Parameters
        ----------
        X : 3D np.array of shape = [n_instances, n_dimensions, series_length]
            The training data.
        y : array-like, shape = [n_instances]
            The class labels.

        Returns
        -------
        self :
            Reference to self.

        Notes
        -----
        Changes state by creating a fitted model that updates attributes
        ending in "_" and sets is_fitted flag to True.
        """
        self._transformer = SummaryTransformer(
            summary_function=self.summary_functions,
            quantiles=self.summary_quantiles,
        )

        # Default to a 200-tree random forest when no estimator was supplied;
        # clone so the user's estimator instance is never mutated.
        self._estimator = _clone_estimator(
            RandomForestClassifier(n_estimators=200)
            if self.estimator is None
            else self.estimator,
            self.random_state,
        )

        # Propagate the configured thread count to estimators that support it.
        m = getattr(self._estimator, "n_jobs", None)
        if m is not None:
            self._estimator.n_jobs = self._threads_to_use

        X_t = self._transformer.fit_transform(X, y)
        # More rows than labels: the transformer produced one row per
        # (instance, dimension); flatten to one feature row per instance.
        # NOTE(review): assumes rows are grouped/ordered by instance — confirm
        # against SummaryTransformer's multivariate output ordering.
        if X_t.shape[0] > len(y):
            X_t = X_t.to_numpy().reshape((len(y), -1))
        self._transform_atts = X_t.shape[1]

        self._estimator.fit(X_t, y)

        return self

    def _predict(self, X):
        """Predict class values of n instances in X.

        Parameters
        ----------
        X : 3D np.array of shape = [n_instances, n_dimensions, series_length]
            The data to make predictions for.

        Returns
        -------
        y : array-like, shape = [n_instances]
            Predicted class labels.
        """
        X_t = self._transformer.transform(X)
        # Fewer columns than seen in fit: re-flatten per-dimension rows to
        # match the fitted feature count (mirrors the reshape in _fit).
        if X_t.shape[1] < self._transform_atts:
            X_t = X_t.to_numpy().reshape((-1, self._transform_atts))
        return self._estimator.predict(X_t)

    def _predict_proba(self, X):
        """Predict class probabilities for n instances in X.

        Parameters
        ----------
        X : 3D np.array of shape = [n_instances, n_dimensions, series_length]
            The data to make predict probabilities for.

        Returns
        -------
        y : array-like, shape = [n_instances, n_classes_]
            Predicted probabilities using the ordering in classes_.
        """
        X_t = self._transformer.transform(X)
        # Same shape restoration as in _predict.
        if X_t.shape[1] < self._transform_atts:
            X_t = X_t.to_numpy().reshape((-1, self._transform_atts))

        m = getattr(self._estimator, "predict_proba", None)
        if callable(m):
            return self._estimator.predict_proba(X_t)
        else:
            # Estimator has no predict_proba: fall back to one-hot
            # probabilities from hard predictions, mapping labels to column
            # indices via _class_dictionary.
            dists = np.zeros((X.shape[0], self.n_classes_))
            preds = self._estimator.predict(X_t)
            for i in range(0, X.shape[0]):
                dists[i, self._class_dictionary[preds[i]]] = 1
            return dists
}, MatrixProfileClassifier: { "subsequence_length": 4, }, TSFreshClassifier: { "estimator": RandomForestClassifier(n_estimators=3), "default_fc_parameters": "minimal", }, RandomIntervals: { "n_intervals": 3, }, RandomIntervalClassifier: { "n_intervals": 3, "estimator": RandomForestClassifier(n_estimators=3), "interval_transformers": SummaryTransformer( summary_function=("mean", "min", "max"), ), }, SummaryClassifier: { "estimator": RandomForestClassifier(n_estimators=3), "summary_functions": ("mean", "min", "max"), }, RocketClassifier: {"num_kernels": 100}, Arsenal: {"num_kernels": 50, "n_estimators": 3}, HIVECOTEV1: { "stc_params": { "estimator": RotationForest(n_estimators=2), "max_shapelets": 5, "n_shapelet_samples": 20, "batch_size": 10, },