def test_different_pipelines():
    """Compare a segment + FeatureUnion pipeline with the feature extractor.

    Both variants receive the same seed; the test asserts that the
    transformed outputs and the fitted intervals are equal.
    """
    seed = 1233
    X_train, _ = load_gunpoint(return_X_y=True)

    def rowwise(func):
        # Wrap a plain function so that it is applied through RowwiseTransformer.
        return RowwiseTransformer(
            FunctionTransformer(func=func, validate=False))

    # Modular variant: segment first, then compute each feature in a union.
    modular_steps = [
        ('segment', RandomIntervalSegmenter(n_intervals='sqrt')),
        ('transform', FeatureUnion([
            ('mean', rowwise(np.mean)),
            ('std', rowwise(np.std)),
            ('slope', rowwise(time_series_slope)),
        ])),
    ]
    modular = Pipeline(modular_steps, random_state=seed)
    modular_output = modular.fit_transform(X_train)

    # Single-transformer variant configured with the same features and seed.
    extractor = RandomIntervalFeatureExtractor(
        n_intervals='sqrt',
        features=[np.mean, np.std, time_series_slope],
        random_state=seed,
    )
    extractor_output = extractor.fit_transform(X_train)

    np.testing.assert_array_equal(modular_output, extractor_output)
    np.testing.assert_array_equal(
        modular.steps[0][1].intervals_, extractor.intervals_)
def test_output_format_dim(n_instances, len_series, n_intervals, features):
    """Check output type, row count and values for an all-ones input.

    The input is built from a series of ones; the test asserts that the
    result is a DataFrame with as many rows as the input and that every
    value in it equals one.
    """
    ones_series = np.ones(len_series)
    X = generate_df_from_array(ones_series, n_rows=n_instances, n_cols=1)
    expected_rows, _ = X.shape

    extractor = RandomIntervalFeatureExtractor(
        n_intervals=n_intervals, features=features)
    Xt = extractor.fit_transform(X)

    assert isinstance(Xt, pd.DataFrame)
    assert Xt.shape[0] == expected_rows
    assert np.array_equal(Xt.values, np.ones(Xt.shape))
def test_results(n_instances, len_series, n_intervals):
    """Compare extracted features with values computed directly per interval.

    For every fitted interval, the columns whose name contains the interval
    bounds and the feature name are compared against the feature function
    applied to the same slice of the generating series.
    """
    series = np.random.normal(size=len_series)
    X = generate_df_from_array(series, n_rows=n_instances, n_cols=1)

    extractor = RandomIntervalFeatureExtractor(
        n_intervals=n_intervals,
        features=[np.mean, np.std, time_series_slope],
    )
    Xt = extractor.fit_transform(X)

    # Column-name suffix paired with the function that produces the feature.
    checks = (
        ('mean', np.mean),
        ('std', np.std),
        ('time_series_slope', time_series_slope),
    )

    # Check results
    for start, end in extractor.intervals_:
        window = series[start:end]
        for suffix, func in checks:
            columns = Xt.filter(like=f'_{start}_{end}_{suffix}')
            assert np.all(columns == func(window))
def test_pipeline_predictions(n_intervals, n_estimators):
    """Check that modular and feature-extractor pipelines predict identically.

    Two pipelines ending in a DecisionTreeClassifier are fitted with the same
    random state: one built from a segmenter plus a FeatureUnion of row-wise
    transformers, the other from a single RandomIntervalFeatureExtractor.
    Their predictions on the test data must be equal.

    ``n_estimators`` is accepted but not used in the body.
    NOTE(review): ``X_train``, ``y_train`` and ``X_test`` are not defined in
    this function; presumably they are module-level data - confirm.
    """
    random_state = 1234

    # Due to tie-breaking/floating point rounding in the final decision tree
    # classifier, the results depend on the exact column order of the input
    # data.

    # Compare pipeline predictions outside of ensemble.
    steps = [
        ('segment', RandomIntervalSegmenter(n_intervals=n_intervals)),
        ('transform', FeatureUnion([
            ('mean', RowwiseTransformer(
                FunctionTransformer(func=np.mean, validate=False))),
            ('std', RowwiseTransformer(
                FunctionTransformer(func=np.std, validate=False))),
            ('slope', RowwiseTransformer(
                FunctionTransformer(func=time_series_slope, validate=False))),
        ])),
        ('clf', DecisionTreeClassifier()),
    ]
    clf1 = Pipeline(steps, random_state=random_state)
    clf1.fit(X_train, y_train)
    a = clf1.predict(X_test)

    steps = [
        ('transform', RandomIntervalFeatureExtractor(
            n_intervals=n_intervals,
            features=[np.mean, np.std, time_series_slope])),
        ('clf', DecisionTreeClassifier()),
    ]
    clf2 = Pipeline(steps, random_state=random_state)
    clf2.fit(X_train, y_train)
    b = clf2.predict(X_test)

    # Previously the result of np.array_equal(a, b) was discarded, so the
    # test could never fail; assert the equality explicitly.
    np.testing.assert_array_equal(a, b)
def test_TimeSeriesForest_predictions(n_estimators, n_intervals):
    """Compare an explicitly configured forest with the default forest.

    One classifier receives a hand-built base pipeline, the other relies on
    its default base estimator with only ``n_intervals`` overridden. Both use
    the same random state and their predicted probabilities must be equal.

    NOTE(review): ``X_train``, ``y_train`` and ``X_test`` are not defined in
    this function; presumably they are module-level data - confirm.
    """
    seed = 1234

    # A fully modular base estimator (RandomIntervalSegmenter followed by a
    # FeatureUnion of row-wise mean/std/slope transformers and a
    # DecisionTreeClassifier) is a possible alternative here, but it is not
    # exercised by this test.

    # Explicit base estimator built around RandomIntervalFeatureExtractor.
    extractor = RandomIntervalFeatureExtractor(
        n_intervals=n_intervals,
        features=[np.mean, np.std, time_series_slope],
    )
    base_estimator = Pipeline([
        ('transform', extractor),
        ('clf', DecisionTreeClassifier()),
    ])
    explicit_forest = TimeSeriesForestClassifier(
        base_estimator=base_estimator,
        random_state=seed,
        n_estimators=n_estimators,
    )
    explicit_forest.fit(X_train, y_train)
    explicit_proba = explicit_forest.predict_proba(X_test)

    # Default, semi-modular implementation using
    # RandomIntervalFeatureExtractor internally.
    default_forest = TimeSeriesForestClassifier(
        random_state=seed, n_estimators=n_estimators)
    default_forest.set_params(
        base_estimator__transform__n_intervals=n_intervals)
    default_forest.fit(X_train, y_train)
    default_proba = default_forest.predict_proba(X_test)

    np.testing.assert_array_equal(explicit_proba, default_proba)
def test_different_implementations():
    """Compare chained segment/mean transformers with the feature extractor.

    Both routes use the same seed and the mean as the only feature; the
    resulting outputs must be equal.
    """
    seed = 1233
    X_train, _ = load_gunpoint(return_X_y=True)

    # Compare with chained transformations.
    segmenter = RandomIntervalSegmenter(n_intervals='sqrt', random_state=seed)
    row_mean = RowwiseTransformer(
        FunctionTransformer(func=np.mean, validate=False))
    segments = segmenter.fit_transform(X_train)
    chained_output = row_mean.fit_transform(segments)

    extractor = RandomIntervalFeatureExtractor(
        n_intervals='sqrt', features=[np.mean], random_state=seed)
    extractor_output = extractor.fit_transform(X_train)

    np.testing.assert_array_equal(chained_output, extractor_output)
def test_random_state():
    """Check that a fixed random_state gives the same output on every run.

    A reference output is produced once; ten freshly constructed extractors
    with the same seed must each reproduce it.
    """
    n_repeats = 10
    X = generate_df_from_array(np.random.normal(size=20))
    seed = 1234

    def seeded_transform():
        # Build a new extractor each time so no fitted state is reused.
        extractor = RandomIntervalFeatureExtractor(
            n_intervals='random', random_state=seed)
        return extractor.fit_transform(X)

    reference = seeded_transform()
    for _ in range(n_repeats):
        assert reference.equals(seeded_transform())
def test_Pipeline_random_state():
    """Check random_state handling of Pipeline and of its steps.

    Verifies that nothing is seeded by default, that setting the seed on the
    pipeline is reflected on every step, and that repeated fits with the same
    seed give identical predictions.
    """
    pipe = Pipeline([
        ('transform', RandomIntervalFeatureExtractor(features=[np.mean])),
        ('clf', DecisionTreeClassifier()),
    ])

    # Check that pipe is initiated without random_state
    assert pipe.random_state is None
    assert pipe.get_params()['random_state'] is None

    # Check that all components are initiated without random_state
    for _, estimator in pipe.steps:
        assert estimator.random_state is None
        assert estimator.get_params()['random_state'] is None

    # Check that if random state is set, it's set to itself and all its
    # random components
    rs = 1234
    pipe.set_params(random_state=rs)
    assert pipe.random_state == rs
    assert pipe.get_params()['random_state'] == rs
    for _, estimator in pipe.steps:
        assert estimator.random_state == rs
        assert estimator.get_params()['random_state'] == rs

    # Check specific results
    X_train, y_train = load_gunpoint(return_X_y=True)
    X_test, _ = load_gunpoint("TEST", return_X_y=True)
    steps = [
        ('segment', RandomIntervalSegmenter(n_intervals=3)),
        ('extract', RowwiseTransformer(
            FunctionTransformer(func=np.mean, validate=False))),
        ('clf', DecisionTreeClassifier()),
    ]

    def fit_and_predict():
        # A new Pipeline is created around the same step objects each time.
        seeded = Pipeline(steps, random_state=rs)
        seeded.fit(X_train, y_train)
        return seeded.predict(X_test)

    y_pred_first = fit_and_predict()
    n_repeats = 10
    for _ in range(n_repeats):
        np.testing.assert_array_equal(y_pred_first, fit_and_predict())
def __init__(self,
             base_estimator=None,
             n_estimators=500,
             criterion='mse',
             max_depth=None,
             min_samples_split=2,
             min_samples_leaf=1,
             min_weight_fraction_leaf=0.,
             max_features=None,
             max_leaf_nodes=None,
             min_impurity_decrease=0.,
             min_impurity_split=None,
             bootstrap=False,
             oob_score=False,
             n_jobs=None,
             random_state=None,
             verbose=0,
             warm_start=False,
             check_input=True):
    """Set up the forest around a pipeline ending in a DecisionTreeRegressor.

    When ``base_estimator`` is None, a default pipeline is built from a
    RandomIntervalFeatureExtractor (``n_intervals='sqrt'``; mean, std and
    slope features) followed by a DecisionTreeRegressor. The tree parameters
    are stored on the instance twice: under their plain names and under
    names prefixed with the name of the final pipeline step.

    Note that ``set_params`` is called on ``base_estimator`` itself, so a
    pipeline passed in by the caller is modified in place.

    Raises
    ------
    ValueError
        If ``base_estimator`` is given but is not a Pipeline, or if its last
        step is not a DecisionTreeRegressor.
    """
    if base_estimator is None:
        default_extractor = RandomIntervalFeatureExtractor(
            n_intervals='sqrt',
            features=[np.mean, np.std, time_series_slope],
        )
        base_estimator = Pipeline([
            ('transform', default_extractor),
            ('clf', DecisionTreeRegressor()),
        ])
    else:
        if not isinstance(base_estimator, Pipeline):
            raise ValueError(
                'Base estimator must be pipeline with transforms.')
        if not isinstance(base_estimator.steps[-1][1], DecisionTreeRegressor):
            raise ValueError(
                'Last step in base estimator pipeline must be DecisionTreeRegressor.'
            )

    tree_params = {
        "criterion": criterion,
        "max_depth": max_depth,
        "min_samples_split": min_samples_split,
        "min_samples_leaf": min_samples_leaf,
        "min_weight_fraction_leaf": min_weight_fraction_leaf,
        "max_features": max_features,
        "max_leaf_nodes": max_leaf_nodes,
        "min_impurity_decrease": min_impurity_decrease,
        "min_impurity_split": min_impurity_split,
    }

    # Assign values, even though passed on to base estimator below,
    # necessary here for cloning.
    for pname, pval in tree_params.items():
        setattr(self, pname, pval)

    # Rename estimator params according to name in pipeline.
    final_step_name = base_estimator.steps[-1][0]
    prefixed_params = {
        f'{final_step_name}__{pname}': pval
        for pname, pval in tree_params.items()
    }

    # Pass on params.
    super(TimeSeriesForestRegressor, self).__init__(
        base_estimator=base_estimator,
        n_estimators=n_estimators,
        estimator_params=tuple(prefixed_params),
        bootstrap=bootstrap,
        oob_score=oob_score,
        n_jobs=n_jobs,
        random_state=random_state,
        verbose=verbose,
        warm_start=warm_start,
    )

    # Assign random state to pipeline.
    base_estimator.set_params(random_state=random_state, check_input=False)

    # Store renamed estimator params.
    for pname, pval in prefixed_params.items():
        setattr(self, pname, pval)

    self.check_input = check_input
def test_bad_features(bad_features):
    """Check that fitting with invalid ``features`` raises a ValueError.

    NOTE(review): assumes RandomIntervalFeatureExtractor validates
    ``features`` during ``fit`` and raises ValueError - confirm against the
    transformer's implementation.
    """
    X, _ = load_gunpoint(return_X_y=True)
    with pytest.raises(ValueError):
        # The bad value was previously passed as ``n_intervals``, so the
        # feature validation this test is named after was never exercised.
        RandomIntervalFeatureExtractor(features=bad_features).fit(X)