def _fit(self, X, y=None):
    """Fit transformer by generating random interval indices.

    Parameters
    ----------
    X : pandas DataFrame of shape [n_samples, n_features]
        Input data
    y : pandas Series, shape (n_samples, ...), optional
        Targets for supervised learning.

    Returns
    -------
    self : RandomIntervalSegmenter
        This estimator
    """
    # Composition rather than inheritance is used here because this
    # transformer has a different transform type (returns tabular) compared
    # to the RandomIntervalSegmenter (returns panel).
    segmenter = RandomIntervalSegmenter(
        self.n_intervals, self.min_length, self.max_length, self.random_state
    )
    segmenter.fit(X, y)
    # Expose the fitted segmenter's state on this estimator.
    self._interval_segmenter = segmenter
    self.intervals_ = segmenter.intervals_
    self.input_shape_ = segmenter.input_shape_
    self._time_index = segmenter._time_index
    return self
def test_different_implementations():
    """Chained segment+mean transforms should match the combined extractor."""
    seed = 1233
    X_train, y_train = make_classification_problem()

    # Route 1: segment into random intervals, then take the row-wise mean.
    segmenter = RandomIntervalSegmenter(n_intervals=1, random_state=seed)
    mean_extractor = SeriesToPrimitivesRowTransformer(
        FunctionTransformer(func=np.mean, validate=False),
        check_transformer=False,
    )
    chained = mean_extractor.fit_transform(segmenter.fit_transform(X_train))

    # Route 2: the single combined transformer with the same seed.
    combined = RandomIntervalFeatureExtractor(
        n_intervals=1, features=[np.mean], random_state=seed
    )
    direct = combined.fit_transform(X_train)

    np.testing.assert_array_almost_equal(chained, direct)
def tsf_benchmarking():
    """Run TSF benchmarking over ``benchmark_datasets``.

    For each dataset, runs two experiments via ``exp.run_experiment``:
    the interval-based ``TimeSeriesForest`` ("PythonTSF") and an
    equivalent composite pipeline ensemble ("PythonTSFComposite").
    Results are written under ``results_dir``; existing results are kept
    (``overwrite=False``).
    """
    # enumerate() replaces the C-style index loop over the dataset list.
    for i, dataset in enumerate(benchmark_datasets):
        print(f"{i} problem = {dataset}")
        tsf = ib.TimeSeriesForest(n_estimators=100)
        exp.run_experiment(
            overwrite=False,
            problem_path=data_dir,
            results_path=results_dir,
            cls_name="PythonTSF",
            classifier=tsf,
            dataset=dataset,
            train_file=False,
        )
        # Composite equivalent: random interval segmentation, then
        # mean/std/slope features per interval, then a decision tree,
        # ensembled over 100 trees.
        steps = [
            ("segment", RandomIntervalSegmenter(n_intervals="sqrt")),
            (
                "transform",
                FeatureUnion(
                    [
                        (
                            "mean",
                            make_row_transformer(
                                FunctionTransformer(func=np.mean, validate=False)
                            ),
                        ),
                        (
                            "std",
                            make_row_transformer(
                                FunctionTransformer(func=np.std, validate=False)
                            ),
                        ),
                        (
                            "slope",
                            make_row_transformer(
                                FunctionTransformer(func=_slope, validate=False)
                            ),
                        ),
                    ]
                ),
            ),
            ("clf", DecisionTreeClassifier()),
        ]
        base_estimator = Pipeline(steps)
        tsf = TimeSeriesForestClassifier(estimator=base_estimator, n_estimators=100)
        exp.run_experiment(
            overwrite=False,
            problem_path=data_dir,
            results_path=results_dir,
            cls_name="PythonTSFComposite",
            classifier=tsf,
            dataset=dataset,
            train_file=False,
        )
def test_output_format_dim(n_timepoints, n_instances, n_intervals):
    """Check output type, row count and column count of the segmenter.

    Parameters are presumably supplied by a pytest parametrize decorator
    outside this view — TODO confirm.

    BUG FIX: the original checked ``if n_intervals != "random"`` first, so
    the ``elif n_intervals == "sqrt"`` / ``"log"`` branches were unreachable
    ("sqrt" != "random" entered the first branch, where neither the float
    nor the int check fired, and nothing was asserted). The string cases
    are now tested before the numeric ones.
    """
    X = _make_nested_from_array(
        np.ones(n_timepoints), n_instances=n_instances, n_columns=1
    )
    trans = RandomIntervalSegmenter(n_intervals=n_intervals)
    Xt = trans.fit_transform(X)

    # Check number of rows and output type.
    assert isinstance(Xt, pd.DataFrame)
    assert Xt.shape[0] == X.shape[0]

    # Check number of generated intervals/columns for each n_intervals kind.
    if n_intervals == "sqrt":
        assert Xt.shape[1] == np.maximum(1, int(np.sqrt(n_timepoints)))
    elif n_intervals == "log":
        assert Xt.shape[1] == np.maximum(1, int(np.log(n_timepoints)))
    elif np.issubdtype(type(n_intervals), np.floating):
        assert Xt.shape[1] == np.maximum(1, int(n_timepoints * n_intervals))
    elif np.issubdtype(type(n_intervals), np.integer):
        assert Xt.shape[1] == n_intervals
    # "random" produces a data-dependent number of intervals: no column check.
def rise_benchmarking():
    """Run RISE benchmarking over ``benchmark_datasets``.

    For each dataset, runs two experiments via ``exp.run_experiment``:
    the spectral ``RandomIntervalSpectralForest`` ("PythonRISE") and an
    equivalent composite pipeline ensemble ("PythonRISEComposite").
    Existing results are replaced (``overwrite=True``).
    """
    # enumerate() replaces the C-style index loop over the dataset list.
    for i, dataset in enumerate(benchmark_datasets):
        print(f"{i} problem = {dataset}")
        rise = fb.RandomIntervalSpectralForest(n_estimators=100)
        exp.run_experiment(
            overwrite=True,
            problem_path=data_dir,
            results_path=results_dir,
            cls_name="PythonRISE",
            classifier=rise,
            dataset=dataset,
            train_file=False,
        )
        # Composite equivalent: one random interval (min length 5), then
        # ACF coefficients and power spectrum per interval, tabularised and
        # fed to a decision tree; ensembled over 100 trees.
        steps = [
            ("segment", RandomIntervalSegmenter(n_intervals=1, min_length=5)),
            (
                "transform",
                FeatureUnion(
                    [
                        (
                            "acf",
                            make_row_transformer(
                                FunctionTransformer(func=acf_coefs, validate=False)
                            ),
                        ),
                        (
                            "ps",
                            make_row_transformer(
                                FunctionTransformer(func=powerspectrum, validate=False)
                            ),
                        ),
                    ]
                ),
            ),
            ("tabularise", Tabularizer()),
            ("clf", DecisionTreeClassifier()),
        ]
        base_estimator = Pipeline(steps)
        rise = TimeSeriesForestClassifier(estimator=base_estimator, n_estimators=100)
        exp.run_experiment(
            overwrite=True,
            problem_path=data_dir,
            results_path=results_dir,
            cls_name="PythonRISEComposite",
            classifier=rise,
            dataset=dataset,
            train_file=False,
        )
def test_different_pipelines():
    """Compare with transformer pipeline using TSFeatureUnion."""
    seed = 1233
    X_train, _ = make_classification_problem()

    # Build the per-feature row transformers from a single (name, func) table
    # so the union and the extractor are guaranteed to use the same functions.
    feature_funcs = [("mean", np.mean), ("std", np.std), ("slope", _slope)]
    union = FeatureUnion(
        [
            (
                name,
                SeriesToPrimitivesRowTransformer(
                    FunctionTransformer(func=func, validate=False),
                    check_transformer=False,
                ),
            )
            for name, func in feature_funcs
        ]
    )
    pipe = Pipeline(
        [
            ("segment", RandomIntervalSegmenter(n_intervals=1, random_state=seed)),
            ("transform", union),
        ]
    )
    pipeline_out = pipe.fit_transform(X_train)

    extractor = RandomIntervalFeatureExtractor(
        n_intervals=1,
        features=[np.mean, np.std, _slope],
        random_state=seed,
    )
    extractor_out = extractor.fit_transform(X_train)

    np.testing.assert_array_equal(pipeline_out, extractor_out)
    # Both routes must also have drawn identical intervals.
    np.testing.assert_array_equal(pipe.steps[0][1].intervals_, extractor.intervals_)
def test_FeatureUnion_pipeline():
    """Pipeline with segmentation plus multiple feature extraction steps."""
    clf = Pipeline(
        [
            ("segment", RandomIntervalSegmenter(n_intervals=1)),
            (
                "transform",
                FeatureUnion(
                    [("mean", mean_transformer), ("std", std_transformer)]
                ),
            ),
            ("clf", DecisionTreeClassifier()),
        ]
    )
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # One prediction per test instance, drawn from the known label set.
    assert y_pred.shape[0] == y_test.shape[0]
    np.testing.assert_array_equal(np.unique(y_pred), np.unique(y_test))
def test_equivalent_model_specifications(n_intervals, n_estimators):
    """Test composable TSF vs an equivalent model.

    BUG FIX: the original ended with a bare ``np.array_equal(a, b)`` whose
    boolean result was discarded, so the test asserted nothing and always
    passed. It now uses ``np.testing.assert_array_equal``, which raises on
    mismatch. (``n_estimators`` is unused here — presumably kept for a
    shared parametrize decorator outside this view; TODO confirm.)
    """
    random_state = 1234
    X_train, y_train = load_unit_test(split="train")
    X_test, y_test = load_unit_test(split="test")

    # Due to tie-breaking/floating point rounding in the final decision tree
    # classifier, the results depend on the exact column order of the input
    # data. Compare pipeline predictions outside of ensemble.
    steps = [
        (
            "segment",
            RandomIntervalSegmenter(n_intervals=n_intervals, random_state=random_state),
        ),
        (
            "transform",
            FeatureUnion([("mean", mean_transformer), ("std", std_transformer)]),
        ),
        ("clf", DecisionTreeClassifier(random_state=random_state)),
    ]
    clf1 = Pipeline(steps)
    clf1.fit(X_train, y_train)
    a = clf1.predict(X_test)

    steps = [
        (
            "transform",
            RandomIntervalFeatureExtractor(
                n_intervals=n_intervals,
                features=[np.mean, np.std],
                random_state=random_state,
            ),
        ),
        ("clf", DecisionTreeClassifier(random_state=random_state)),
    ]
    clf2 = Pipeline(steps)
    clf2.fit(X_train, y_train)
    b = clf2.predict(X_test)

    np.testing.assert_array_equal(a, b)
def set_classifier(cls, resampleId):
    """
    Basic way of determining the classifier to build.

    To differentiate settings, just add another elif. So, for example, if you
    wanted tuned TSF, you just pass TuneTSF and set up the tuning mechanism in
    the elif. This may well get superseded; it is just how we have always done
    it.

    :param cls: String indicating which classifier you want
    :return: A classifier.
    """
    # Normalise once instead of calling cls.lower() in every branch.
    name = cls.lower()
    if name == "pf":
        return pf.ProximityForest(random_state=resampleId)
    elif name == "pt":
        return pf.ProximityTree(random_state=resampleId)
    elif name == "ps":
        return pf.ProximityStump(random_state=resampleId)
    elif name == "rise":
        return fb.RandomIntervalSpectralForest(random_state=resampleId)
    elif name == "tsf":
        return ib.TimeSeriesForest(random_state=resampleId)
    elif name == "boss":
        return db.BOSSEnsemble()
    elif name == "st":
        return st.ShapeletTransformClassifier(time_contract_in_mins=1500)
    elif name == "dtw":
        return nn.KNeighborsTimeSeriesClassifier(metric="dtw")
    elif name == "ee" or name == "elasticensemble":
        return dist.ElasticEnsemble()
    elif name == "shapedtw_raw":
        return ShapeDTW(
            subsequence_length=30,
            shape_descriptor_function="raw",
            metric_params=None,
        )
    elif name == "shapedtw_dwt":
        return ShapeDTW(
            subsequence_length=30,
            shape_descriptor_function="dwt",
            metric_params={"num_levels_dwt": 3},
        )
    elif name == "shapedtw_paa":
        return ShapeDTW(
            subsequence_length=30,
            shape_descriptor_function="paa",
            metric_params={"num_intervals_paa": 5},
        )
    elif name == "shapedtw_slope":
        return ShapeDTW(
            subsequence_length=30,
            shape_descriptor_function="slope",
            metric_params={"num_intervals_slope": 5},
        )
    elif name == "shapedtw_hog1d":
        return ShapeDTW(
            subsequence_length=30,
            shape_descriptor_function="hog1d",
            metric_params={
                "num_bins_hog1d": 8,
                "num_intervals_hog1d": 2,
                "scaling_factor_hog1d": 0.1,
            },
        )
    elif name == "tsfcomposite":
        # It defaults to TSF
        return ensemble.TimeSeriesForestClassifier()
    elif name == "risecomposite":
        # Segment into one random interval, extract ACF/power-spectrum
        # features, tabularise and fit a decision tree; ensembled.
        steps = [
            ("segment", RandomIntervalSegmenter(n_intervals=1, min_length=5)),
            (
                "transform",
                FeatureUnion(
                    [
                        (
                            "acf",
                            make_row_transformer(
                                FunctionTransformer(func=acf_coefs, validate=False)
                            ),
                        ),
                        (
                            "ps",
                            make_row_transformer(
                                FunctionTransformer(func=powerspectrum, validate=False)
                            ),
                        ),
                    ]
                ),
            ),
            ("tabularise", Tabularizer()),
            ("clf", DecisionTreeClassifier()),
        ]
        base_estimator = Pipeline(steps)
        return ensemble.TimeSeriesForestClassifier(
            estimator=base_estimator, n_estimators=100
        )
    else:
        raise Exception("UNKNOWN CLASSIFIER")
def test_bad_input_args(bad_interval):
    """RandomIntervalSegmenter.fit rejects an invalid n_intervals value."""
    X = _make_nested_from_array(np.ones(10), n_instances=10, n_columns=2)
    segmenter = RandomIntervalSegmenter(n_intervals=bad_interval)
    with pytest.raises(ValueError):
        segmenter.fit(X)
class RandomIntervalFeatureExtractor(BaseTransformer):
    """Random interval feature extractor transform.

    Transformer that segments time-series into random intervals and
    subsequently extracts series-to-primitives features from each interval.

    Parameters
    ----------
    n_intervals: str{'sqrt', 'log', 'random'}, int or float, optional (
    default='sqrt')
        Number of random intervals to generate, where m is length of time
        series:
        - If "log", log of m is used.
        - If "sqrt", sqrt of m is used.
        - If "random", random number of intervals is generated.
        - If int, n_intervals intervals are generated.
        - If float, int(n_intervals * m) is used with n_intervals giving the
          fraction of intervals of the time series length.
        For all arguments relative to the length of the time series, the
        generated number of intervals is always at least 1.
    features: list of functions, optional (default=None)
        Applies each function to random intervals to extract features.
        If None, the mean is extracted.
    random_state: : int, RandomState instance, optional (default=None)
        - If int, random_state is the seed used by the random number
          generator;
        - If RandomState instance, random_state is the random number
          generator;
        - If None, the random number generator is the RandomState instance
          used by `np.random`.
    """

    _tags = {
        "fit_is_empty": False,
        "univariate-only": True,
        # what is the scitype of X: Series, or Panel
        "scitype:transform-input": "Series",
        # what is the scitype of y: None (not needed), Primitives, Series, Panel
        "scitype:transform-output": "Primitives",
        "scitype:instancewise": True,  # is this an instance-wise transform?
        "X_inner_mtype": "nested_univ",  # which mtypes do _fit/_predict support for X?
        "y_inner_mtype": "pd_Series_Table",  # and for y?
    }

    def __init__(
        self,
        n_intervals="sqrt",
        min_length=None,
        max_length=None,
        features=None,
        random_state=None,
    ):
        # Parameters are stored unmodified; validation happens later in the
        # delegated RandomIntervalSegmenter / _check_features calls.
        self.n_intervals = n_intervals
        self.min_length = min_length
        self.max_length = max_length
        self.random_state = random_state
        self.features = features
        super(RandomIntervalFeatureExtractor, self).__init__()

    def _fit(self, X, y=None):
        """
        Fit transformer, generating random interval indices.

        Parameters
        ----------
        X : pandas DataFrame of shape [n_samples, n_features]
            Input data
        y : pandas Series, shape (n_samples, ...), optional
            Targets for supervised learning.

        Returns
        -------
        self : RandomIntervalSegmenter
            This estimator
        """
        # We use composition rather than inheritance here, because this
        # transformer has a different transform type (returns tabular)
        # compared to the RandomIntervalSegmenter (returns panel).
        self._interval_segmenter = RandomIntervalSegmenter(
            self.n_intervals, self.min_length, self.max_length, self.random_state
        )
        self._interval_segmenter.fit(X, y)
        # Mirror the fitted segmenter's state on this estimator so that
        # _transform (and callers) can read the drawn intervals directly.
        self.intervals_ = self._interval_segmenter.intervals_
        self.input_shape_ = self._interval_segmenter.input_shape_
        self._time_index = self._interval_segmenter._time_index
        return self

    def _transform(self, X, y=None):
        """Transform X.

        Transform X, segments time-series in each column into random
        intervals using interval indices generated during `fit` and extracts
        features from each interval.

        Parameters
        ----------
        X : nested pandas.DataFrame of shape [n_instances, n_features]
            Nested dataframe with time-series in cells.

        Returns
        -------
        Xt : pandas.DataFrame
            Transformed pandas DataFrame with n_instances rows and one
            column for each (interval, feature function) pair.
        """
        # Check input of feature calculators, i.e list of functions to be
        # applied to time-series
        features = _check_features(self.features)
        X = convert_to(X, "numpy3D")

        # Check that the input is of the same shape as the one passed
        # during fit.
        if X.shape[1] != self.input_shape_[1]:
            raise ValueError(
                "Number of columns of input is different from what was seen in `fit`"
            )
        # Input validation
        # if not all([np.array_equal(fit_idx, trans_idx) for trans_idx,
        # fit_idx in zip(check_equal_index(X),
        # raise ValueError('Indexes of input time-series are different
        # from what was seen in `fit`')

        n_instances, _, _ = X.shape
        n_features = len(features)
        intervals = self.intervals_
        n_intervals = len(intervals)

        # Compute features on intervals.
        # Allocate output array for transformed data: one column per
        # (feature function, interval) pair.
        Xt = np.zeros((n_instances, n_features * n_intervals))
        columns = []
        i = 0
        for func in features:
            # TODO generalise to series-to-series functions and function kwargs
            for start, end in intervals:
                interval = X[:, :, start:end]

                # Try to use optimised computations over axis if possible,
                # otherwise iterate over rows.
                try:
                    Xt[:, i] = func(interval, axis=-1).squeeze()
                except TypeError as e:
                    # Only fall back for the specific "no axis kwarg" error;
                    # any other TypeError is re-raised unchanged.
                    if (
                        str(e) == f"{func.__name__}() got an unexpected "
                        f"keyword argument 'axis'"
                    ):
                        Xt[:, i] = np.apply_along_axis(
                            func, axis=2, arr=interval
                        ).squeeze()
                    else:
                        raise
                i += 1
                # Column name encodes interval bounds and feature function.
                columns.append(f"{start}_{end}_{func.__name__}")

        Xt = pd.DataFrame(Xt)
        Xt.columns = columns
        return Xt
def set_classifier(cls, resampleId):
    """
    Basic way of determining the classifier to build.

    To differentiate settings, just add another elif. So, for example, if you
    wanted tuned TSF, you just pass TuneTSF and set up the tuning mechanism in
    the elif. This may well get superseded; it is just how we have always done
    it.

    :param cls: String indicating which classifier you want
    :return: A classifier.
    """
    # Normalise once instead of calling cls.lower() in every branch.
    name = cls.lower()
    if name == "pf":
        return pf.ProximityForest(random_state=resampleId)
    elif name == "pt":
        return pf.ProximityTree(random_state=resampleId)
    elif name == "ps":
        return pf.ProximityStump(random_state=resampleId)
    elif name == "rise":
        return fb.RandomIntervalSpectralForest(random_state=resampleId)
    elif name == "tsf":
        return ib.TimeSeriesForest(random_state=resampleId)
    elif name == "cif":
        return CanonicalIntervalForest(random_state=resampleId)
    elif name == "boss":
        return BOSSEnsemble(random_state=resampleId)
    elif name == "cboss":
        return ContractableBOSS(random_state=resampleId)
    elif name == "tde":
        return TemporalDictionaryEnsemble(random_state=resampleId)
    elif name == "st":
        return st.ShapeletTransformClassifier(time_contract_in_mins=1500)
    elif name == "dtwcv":
        return nn.KNeighborsTimeSeriesClassifier(metric="dtwcv")
    elif name == "ee" or name == "elasticensemble":
        return dist.ElasticEnsemble()
    elif name == "tsfcomposite":
        # It defaults to TSF
        return ensemble.TimeSeriesForestClassifier()
    elif name == "risecomposite":
        # Segment into one random interval, extract ACF/power-spectrum
        # features, tabularise and fit a decision tree; ensembled.
        steps = [
            ("segment", RandomIntervalSegmenter(n_intervals=1, min_length=5)),
            (
                "transform",
                FeatureUnion(
                    [
                        (
                            "acf",
                            make_row_transformer(
                                FunctionTransformer(func=acf_coefs, validate=False)
                            ),
                        ),
                        (
                            "ps",
                            make_row_transformer(
                                FunctionTransformer(func=powerspectrum, validate=False)
                            ),
                        ),
                    ]
                ),
            ),
            ("tabularise", Tabularizer()),
            ("clf", DecisionTreeClassifier()),
        ]
        base_estimator = Pipeline(steps)
        return ensemble.TimeSeriesForestClassifier(
            estimator=base_estimator, n_estimators=100
        )
    elif name == "rocket":
        rocket_pipeline = make_pipeline(
            Rocket(random_state=resampleId),
            RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), normalize=True),
        )
        return rocket_pipeline
    else:
        raise Exception("UNKNOWN CLASSIFIER")