def test_RowwiseTransformer_pipeline():
    """Check that RowwiseTransformer(np.mean) matches a hand-written row-wise mean.

    Two pipelines are fitted that differ only in how the 'mean' feature of the
    'ts' column is computed: once with a plain FunctionTransformer wrapping a
    hand-written function, once with RowwiseTransformer wrapping np.mean.
    Their predictions must be identical.

    Reads X_train, y_train and X_test from the enclosing (module) scope.
    """

    def row_means(X):
        # One mean per entry of the selected column
        return pd.DataFrame([np.mean(row) for row in X])

    def first_values(X):
        # First element of each entry of the selected column
        return pd.DataFrame([row[0] for row in X])

    def fit_and_predict(mean_transformer):
        # Everything except the 'mean' transformer is shared between both runs
        features = ColumnTransformer([
            ('mean', mean_transformer, 'ts'),
            ('first', FunctionTransformer(func=first_values, validate=False), 'ts_copy'),
        ])
        forest = RandomForestClassifier(n_estimators=2, random_state=1)
        model = Pipeline(steps=[
            ('feature_extract', features),
            ('rfestimator', forest),
        ])
        model.fit(X_train, y_train)
        return model.predict(X_test)

    # using pure sklearn
    expected = fit_and_predict(FunctionTransformer(func=row_means, validate=False))

    # using sktime with sklearn pipeline
    got = fit_and_predict(
        RowwiseTransformer(FunctionTransformer(func=np.mean, validate=False)))

    np.testing.assert_array_equal(expected, got)
def _test_pipeline_predictions(n_intervals=None, random_state=None):
    """Assert that two pipeline formulations give identical predictions.

    The first pipeline chains a RandomIntervalSegmenter with a FeatureUnion of
    row-wise mean and std transformers; the second uses a single
    RandomIntervalFeatureExtractor with the same two feature functions. Both
    end in a DecisionTreeClassifier and receive the same ``random_state``.

    Reads X_train, y_train and X_test from the enclosing (module) scope.

    Parameters
    ----------
    n_intervals : optional
        Forwarded unchanged to RandomIntervalSegmenter and
        RandomIntervalFeatureExtractor.
    random_state : optional
        Forwarded unchanged to both Pipeline constructors.

    Raises
    ------
    AssertionError
        If the two prediction arrays differ.
    """
    steps = [
        ('segment', RandomIntervalSegmenter(n_intervals=n_intervals, check_input=False)),
        ('transform', FeatureUnion([
            ('mean', RowwiseTransformer(FunctionTransformer(func=np.mean, validate=False))),
            ('std', RowwiseTransformer(FunctionTransformer(func=np.std, validate=False))),
        ])),
        ('clf', DecisionTreeClassifier()),
    ]
    clf1 = Pipeline(steps, random_state=random_state)
    clf1.fit(X_train, y_train)
    a = clf1.predict(X_test)

    steps = [
        ('transform', RandomIntervalFeatureExtractor(n_intervals=n_intervals,
                                                     features=[np.mean, np.std])),
        ('clf', DecisionTreeClassifier()),
    ]
    clf2 = Pipeline(steps, random_state=random_state)
    clf2.fit(X_train, y_train)
    b = clf2.predict(X_test)

    # np.array_equal merely returns a bool; use the asserting variant so that
    # a mismatch actually fails the test.
    np.testing.assert_array_equal(a, b)
def test_pipeline_predictions(n_intervals, n_estimators):
    """Assert that two pipeline formulations give identical predictions.

    The first pipeline chains a RandomIntervalSegmenter with a FeatureUnion of
    row-wise mean, std and slope transformers; the second uses a single
    RandomIntervalFeatureExtractor with the same three feature functions.
    Both end in a DecisionTreeClassifier and share a fixed random state.

    Reads X_train, y_train and X_test from the enclosing (module) scope.

    Parameters
    ----------
    n_intervals
        Forwarded unchanged to RandomIntervalSegmenter and
        RandomIntervalFeatureExtractor.
    n_estimators
        Not used in the body; kept so the signature stays unchanged.

    Raises
    ------
    AssertionError
        If the two prediction arrays differ.
    """
    random_state = 1234

    # Due to tie-breaking/floating point rounding in the final decision tree
    # classifier, the results depend on the exact column order of the input data

    # Compare pipeline predictions outside of ensemble.
    steps = [
        ('segment', RandomIntervalSegmenter(n_intervals=n_intervals)),
        ('transform', FeatureUnion([
            ('mean', RowwiseTransformer(FunctionTransformer(func=np.mean, validate=False))),
            ('std', RowwiseTransformer(FunctionTransformer(func=np.std, validate=False))),
            ('slope', RowwiseTransformer(
                FunctionTransformer(func=time_series_slope, validate=False))),
        ])),
        ('clf', DecisionTreeClassifier()),
    ]
    clf1 = Pipeline(steps, random_state=random_state)
    clf1.fit(X_train, y_train)
    a = clf1.predict(X_test)

    steps = [
        ('transform', RandomIntervalFeatureExtractor(
            n_intervals=n_intervals,
            features=[np.mean, np.std, time_series_slope])),
        ('clf', DecisionTreeClassifier()),
    ]
    clf2 = Pipeline(steps, random_state=random_state)
    clf2.fit(X_train, y_train)
    b = clf2.predict(X_test)

    # np.array_equal merely returns a bool; use the asserting variant so that
    # a mismatch actually fails the test.
    np.testing.assert_array_equal(a, b)
def test_RowwiseTransformer_pipeline():
    """Check that RowwiseTransformer(np.mean) matches a hand-written row-wise mean.

    Two pipelines are fitted on the basic_motions data that differ only in how
    the 'mean' feature of column 'dim_0' is computed: once with a plain
    FunctionTransformer wrapping ``rowwise_mean``, once with RowwiseTransformer
    wrapping np.mean. Their predictions on the test split must be identical.
    """
    X_train, y_train = load_basic_motions("TRAIN", return_X_y=True)
    X_test, _ = load_basic_motions("TEST", return_X_y=True)

    def rowwise_mean(X):
        # Promote a lone Series to a one-column frame, then apply np.mean to
        # every cell, column by column
        frame = pd.DataFrame(X) if isinstance(X, pd.Series) else X
        per_column = []
        for _, column in frame.items():
            per_column.append(pd.Series(column.apply(np.mean)))
        return pd.concat(per_column, axis=1)

    def rowwise_first(X):
        # Promote a lone Series to a one-column frame, then keep the first
        # column of each tabularised input column
        frame = pd.DataFrame(X) if isinstance(X, pd.Series) else X
        per_column = [pd.Series(tabularise(column).iloc[:, 0])
                      for _, column in frame.items()]
        return pd.concat(per_column, axis=1)

    def fit_and_predict(mean_transformer):
        # specify column as a list, otherwise pandas Series are selected and
        # passed on to the transformers
        extractor = ColumnTransformer([
            ('mean', mean_transformer, ['dim_0']),
            ('first', FunctionTransformer(func=rowwise_first, validate=False), ['dim_1']),
        ])
        forest = RandomForestClassifier(n_estimators=2, random_state=1)
        model = Pipeline(steps=[
            ('extract', extractor),
            ('classify', forest),
        ])
        model.fit(X_train, y_train)
        return model.predict(X_test)

    # using pure sklearn
    expected = fit_and_predict(FunctionTransformer(func=rowwise_mean, validate=False))

    # using sktime with sklearn pipeline
    actual = fit_and_predict(
        RowwiseTransformer(FunctionTransformer(func=np.mean, validate=False)))

    np.testing.assert_array_equal(expected, actual)
def test_Pipeline_random_state():
    """Check random_state handling of Pipeline and reproducibility of results.

    Verifies that a freshly built pipeline and each of its steps report
    ``random_state is None``, that ``set_params(random_state=...)`` is reflected
    on the pipeline and on every step, and that repeatedly fitting a pipeline
    built with the same random state on the gunpoint data yields identical
    predictions.
    """
    pipe = Pipeline([
        ('transform', RandomIntervalFeatureExtractor(features=[np.mean])),
        ('clf', DecisionTreeClassifier()),
    ])

    # Check that pipe is initiated without random_state
    assert pipe.random_state is None
    assert pipe.get_params()['random_state'] is None

    # Check that all components are initiated without random_state
    for _, component in pipe.steps:
        assert component.random_state is None
        assert component.get_params()['random_state'] is None

    # Check that if random state is set, it's set to itself and all its
    # random components
    seed = 1234
    pipe.set_params(random_state=seed)
    assert pipe.random_state == seed
    assert pipe.get_params()['random_state'] == seed
    for _, component in pipe.steps:
        assert component.random_state == seed
        assert component.get_params()['random_state'] == seed

    # Check specific results
    X_train, y_train = load_gunpoint(return_X_y=True)
    X_test, _ = load_gunpoint("TEST", return_X_y=True)

    steps = [
        ('segment', RandomIntervalSegmenter(n_intervals=3)),
        ('extract', RowwiseTransformer(FunctionTransformer(func=np.mean, validate=False))),
        ('clf', DecisionTreeClassifier()),
    ]

    def fit_and_predict():
        # A new Pipeline is built around the same step objects on every call
        seeded = Pipeline(steps, random_state=seed)
        seeded.fit(X_train, y_train)
        return seeded.predict(X_test)

    reference = fit_and_predict()

    n_repeats = 10
    for _ in range(n_repeats):
        np.testing.assert_array_equal(reference, fit_and_predict())
def test_ColumnTransformer_pipeline():
    """Run a ColumnTransformer -> Tabulariser -> random forest pipeline end to end.

    Both columns ('ts' and 'ts_copy') pass through identity transformers. The
    test checks that one prediction is produced per test label and that the
    set of predicted labels equals the set of true labels.

    Reads X_train, y_train, X_test and y_test from the enclosing (module) scope.
    """

    # using Identity function transformers (transform series to series)
    def identity(X):
        return X

    passthrough = ColumnTransformer([
        ('ts', FunctionTransformer(func=identity, validate=False), 'ts'),
        ('ts_copy', FunctionTransformer(func=identity, validate=False), 'ts_copy'),
    ])
    model = Pipeline(steps=[
        ('feature_extract', passthrough),
        ('tabularise', Tabulariser()),
        ('rfestimator', RandomForestClassifier(n_estimators=2)),
    ])
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    assert predictions.shape[0] == y_test.shape[0]
    np.testing.assert_array_equal(np.unique(predictions), np.unique(y_test))
def test_FeatureUnion_pipeline():
    """Run a segmenter -> FeatureUnion -> decision tree pipeline end to end.

    The FeatureUnion combines row-wise mean and std transformers. The test
    checks that one prediction is produced per test label and that the set of
    predicted labels equals the set of true labels.

    Reads X_train, y_train, X_test and y_test from the enclosing (module) scope.
    """
    # pipeline with segmentation plus multiple feature extraction
    mean_step = RowwiseTransformer(FunctionTransformer(func=np.mean, validate=False))
    std_step = RowwiseTransformer(FunctionTransformer(func=np.std, validate=False))
    union = FeatureUnion([
        ('mean', mean_step),
        ('std', std_step),
    ])

    model = Pipeline([
        ('segment', RandomIntervalSegmenter(n_intervals=3, check_input=False)),
        ('transform', union),
        ('clf', DecisionTreeClassifier()),
    ])
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    assert predictions.shape[0] == y_test.shape[0]
    np.testing.assert_array_equal(np.unique(predictions), np.unique(y_test))
def test_ColumnTransformer_pipeline():
    """Run a ColumnTransformer -> Tabulariser -> random forest pipeline end to end.

    Columns 'dim_0' and 'dim_1' of the basic_motions data pass through
    identity transformers. The test checks that one prediction is produced per
    test label and that the set of predicted labels equals the set of true
    labels.
    """
    X_train, y_train = load_basic_motions("TRAIN", return_X_y=True)
    X_test, y_test = load_basic_motions("TEST", return_X_y=True)

    # using Identity function transformers (transform series to series)
    def identity(X):
        return X

    passthrough = ColumnTransformer([
        ('id0', FunctionTransformer(func=identity, validate=False), ['dim_0']),
        ('id1', FunctionTransformer(func=identity, validate=False), ['dim_1']),
    ])
    model = Pipeline(steps=[
        ('extract', passthrough),
        ('tabularise', Tabulariser()),
        ('classify', RandomForestClassifier(n_estimators=2)),
    ])
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    assert predictions.shape[0] == y_test.shape[0]
    np.testing.assert_array_equal(np.unique(predictions), np.unique(y_test))