def test_with_gridsearchcv3_auto(self):
    """Run sklearn's GridSearchCV on a Nystroem/LogisticRegression pipeline.

    The parameter grids come from ``get_grid_search_parameter_grids`` applied
    to the equivalent lale pipeline; two of them are sampled for the search.
    """
    from sklearn.datasets import load_iris
    from sklearn.metrics import accuracy_score, make_scorer
    from sklearn.model_selection import GridSearchCV
    from sklearn.pipeline import Pipeline

    lr = LogisticRegression()
    steps = [(Nystroem().name(), Nystroem()), (lr.name(), LogisticRegression())]
    scikit_pipeline = Pipeline(steps)
    all_parameters = get_grid_search_parameter_grids(Nystroem() >> lr, num_samples=1)
    # Search only two of the grids; otherwise the test takes too long.
    parameters = random.sample(all_parameters, 2)
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        scorer = make_scorer(accuracy_score)
        clf = GridSearchCV(scikit_pipeline, parameters, cv=2, scoring=scorer)
        iris = load_iris()
        clf.fit(iris.data, iris.target)
        _ = clf.predict(iris.data)
def test_import_from_sklearn_pipeline_nested_pipeline1(self):
    """Import an sklearn pipeline with a FeatureUnion nested in a FeatureUnion.

    Checks the number of edges of the imported lale pipeline, the
    implementation classes at both ends of every edge, and that the sklearn
    and lale pipelines yield equal predictions.
    """
    from sklearn.decomposition import PCA
    from sklearn.feature_selection import SelectKBest
    from sklearn.kernel_approximation import Nystroem
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.pipeline import FeatureUnion, make_pipeline

    inner_union = FeatureUnion(
        [
            ("pca", PCA(n_components=1)),
            ("nested_pipeline", make_pipeline(SelectKBest(k=2), Nystroem())),
        ]
    )
    union = FeatureUnion(
        [
            ("selectkbest_pca", make_pipeline(SelectKBest(k=3), inner_union)),
            ("nys", Nystroem(n_components=2, random_state=42)),
        ]
    )
    sklearn_pipeline = make_pipeline(union, KNeighborsClassifier())
    lale_pipeline = import_from_sklearn_pipeline(sklearn_pipeline)

    # Fetch the edge list once instead of recomputing it for every assertion.
    edges = lale_pipeline.edges()
    self.assertEqual(len(edges), 8)

    from lale.lib.lale.concat_features import ConcatFeaturesImpl
    from lale.lib.sklearn.k_neighbors_classifier import KNeighborsClassifierImpl
    from lale.lib.sklearn.nystroem import NystroemImpl
    from lale.lib.sklearn.pca import PCAImpl
    from lale.lib.sklearn.select_k_best import SelectKBestImpl

    # These expectations assume a topological sort, which may not be unique,
    # so the order below is brittle.
    expected_edges = [
        (SelectKBestImpl, PCAImpl),
        (SelectKBestImpl, SelectKBestImpl),
        (SelectKBestImpl, NystroemImpl),
        (PCAImpl, ConcatFeaturesImpl),
        (NystroemImpl, ConcatFeaturesImpl),
        (ConcatFeaturesImpl, ConcatFeaturesImpl),
        (NystroemImpl, ConcatFeaturesImpl),
        (ConcatFeaturesImpl, KNeighborsClassifierImpl),
    ]
    for index, (source_impl, sink_impl) in enumerate(expected_edges):
        edge = edges[index]
        self.assertEqual(
            edge[0]._impl_class(), source_impl, "source of edge %d" % index
        )
        self.assertEqual(
            edge[1]._impl_class(), sink_impl, "sink of edge %d" % index
        )
    self.assert_equal_predictions(sklearn_pipeline, lale_pipeline)
def test_export_to_sklearn_pipeline3(self):
    """Export a trained three-branch lale pipeline to sklearn and check it.

    Verifies the types of the exported ``featureunion``, ``selectkbest`` and
    ``logisticregression`` steps and that both pipelines predict the same.
    """
    from sklearn.feature_selection import SelectKBest
    from sklearn.linear_model import LogisticRegression as SklearnLR
    from sklearn.pipeline import FeatureUnion

    pca_branch = PCA() >> SelectKBest(k=2)
    nystroem_branch = Nystroem(random_state=42) >> SelectKBest(k=3)
    select_branch = SelectKBest(k=3)
    lale_pipeline = (
        (pca_branch & nystroem_branch & select_branch)
        >> ConcatFeatures()
        >> SelectKBest(k=2)
        >> LogisticRegression()
    )
    trained_lale_pipeline = lale_pipeline.fit(self.X_train, self.y_train)
    sklearn_pipeline = trained_lale_pipeline.export_to_sklearn_pipeline()

    named_steps = sklearn_pipeline.named_steps
    self.assertIsInstance(named_steps["featureunion"], FeatureUnion)
    self.assertIsInstance(named_steps["selectkbest"], SelectKBest)
    self.assertIsInstance(named_steps["logisticregression"], SklearnLR)
    self.assert_equal_predictions(sklearn_pipeline, trained_lale_pipeline)
def test_two_estimators_predict1(self):
    """Fit and predict with a pipeline that has an estimator among its parallel branches."""
    scaler = StandardScaler()
    branches = PCA() & Nystroem() & PassiveAggressiveClassifier()
    pipeline = (
        scaler
        >> branches
        >> ConcatFeatures()
        >> NoOp()
        >> PassiveAggressiveClassifier()
    )
    trained = pipeline.fit(self.X_train, self.y_train)
    trained.predict(self.X_test)
def test_import_from_sklearn_pipeline_nested_pipeline(self):
    """Import an sklearn pipeline whose FeatureUnion contains a nested pipeline.

    Checks the number of edges of the imported lale pipeline, the type of the
    ``_impl`` at both ends of every edge, and that the sklearn and lale
    pipelines yield equal predictions.
    """
    from sklearn.decomposition import PCA
    from sklearn.feature_selection import SelectKBest
    from sklearn.kernel_approximation import Nystroem
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.pipeline import FeatureUnion, make_pipeline

    union = FeatureUnion(
        [
            ("selectkbest_pca", make_pipeline(SelectKBest(k=3), PCA(n_components=1))),
            ("nys", Nystroem(n_components=2, random_state=42)),
        ]
    )
    sklearn_pipeline = make_pipeline(union, KNeighborsClassifier())
    lale_pipeline = import_from_sklearn_pipeline(sklearn_pipeline)

    # Fetch the edge list once instead of recomputing it for every assertion.
    edges = lale_pipeline.edges()
    self.assertEqual(len(edges), 4)

    from lale.lib.lale.concat_features import ConcatFeaturesImpl
    from lale.lib.sklearn.k_neighbors_classifier import KNeighborsClassifierImpl
    from lale.lib.sklearn.nystroem import NystroemImpl
    from lale.lib.sklearn.pca import PCAImpl

    # These expectations assume topological sort.
    expected_edges = [
        (SelectKBest, PCAImpl),
        (PCAImpl, ConcatFeaturesImpl),
        (NystroemImpl, ConcatFeaturesImpl),
        (ConcatFeaturesImpl, KNeighborsClassifierImpl),
    ]
    for index, (source_type, sink_type) in enumerate(expected_edges):
        edge = edges[index]
        self.assertIsInstance(edge[0]._impl, source_type, "source of edge %d" % index)
        self.assertIsInstance(edge[1]._impl, sink_type, "sink of edge %d" % index)
    self.assert_equal_predictions(sklearn_pipeline, lale_pipeline)
def test_import_from_sklearn_pipeline_feature_union(self):
    """Import an sklearn pipeline that starts with a two-way FeatureUnion.

    Checks the number of edges of the imported lale pipeline, the
    implementation classes at both ends of every edge, and that the sklearn
    and lale pipelines yield equal predictions.
    """
    from sklearn.decomposition import PCA
    from sklearn.kernel_approximation import Nystroem
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.pipeline import FeatureUnion, make_pipeline

    union = FeatureUnion(
        [
            ("pca", PCA(n_components=1)),
            ("nys", Nystroem(n_components=2, random_state=42)),
        ]
    )
    sklearn_pipeline = make_pipeline(union, KNeighborsClassifier())
    lale_pipeline = import_from_sklearn_pipeline(sklearn_pipeline)

    # Fetch the edge list once instead of recomputing it for every assertion.
    edges = lale_pipeline.edges()
    self.assertEqual(len(edges), 3)

    from lale.lib.lale.concat_features import ConcatFeaturesImpl
    from lale.lib.sklearn.k_neighbors_classifier import KNeighborsClassifierImpl
    from lale.lib.sklearn.nystroem import NystroemImpl
    from lale.lib.sklearn.pca import PCAImpl

    expected_edges = [
        (PCAImpl, ConcatFeaturesImpl),
        (NystroemImpl, ConcatFeaturesImpl),
        (ConcatFeaturesImpl, KNeighborsClassifierImpl),
    ]
    for index, (source_impl, sink_impl) in enumerate(expected_edges):
        edge = edges[index]
        self.assertEqual(
            edge[0]._impl_class(), source_impl, "source of edge %d" % index
        )
        self.assertEqual(
            edge[1]._impl_class(), sink_impl, "sink of edge %d" % index
        )
    self.assert_equal_predictions(sklearn_pipeline, lale_pipeline)
def test_export_to_sklearn_pipeline2(self):
    """Export a trained lale pipeline with nested concatenations to sklearn.

    Verifies the types of the exported ``featureunion`` and
    ``kneighborsclassifier`` steps and that both pipelines predict the same.
    """
    from sklearn.feature_selection import SelectKBest
    from sklearn.neighbors import KNeighborsClassifier as SklearnKNN
    from sklearn.pipeline import FeatureUnion

    pca_and_select = PCA(svd_solver="randomized", random_state=42) & SelectKBest(k=3)
    inner_concat = pca_and_select >> ConcatFeatures()
    lale_pipeline = (
        (inner_concat & Nystroem(random_state=42))
        >> ConcatFeatures()
        >> KNeighborsClassifier()
    )
    trained_lale_pipeline = lale_pipeline.fit(self.X_train, self.y_train)
    sklearn_pipeline = trained_lale_pipeline.export_to_sklearn_pipeline()

    named_steps = sklearn_pipeline.named_steps
    self.assertIsInstance(named_steps["featureunion"], FeatureUnion)
    self.assertIsInstance(named_steps["kneighborsclassifier"], SklearnKNN)
    self.assert_equal_predictions(sklearn_pipeline, trained_lale_pipeline)
def test_two_transformers(self):
    """Fit PCA >> Nystroem on the digits data and transform it."""
    first, second = PCA(), Nystroem()
    trainable = first >> second
    digits = sklearn.datasets.load_digits()
    features, labels = digits.data, digits.target
    trained = trainable.fit(features, labels)
    _ = trained.transform(features)
def test_with_concat_features2(self):
    """Run Hyperopt for one evaluation over a choice-containing pipeline on iris.

    The searched pipeline is either an imputer/PCA branch and a Nystroem
    branch concatenated into LogisticRegression, or a KNeighborsClassifier.
    The accuracy on the training data is printed.
    """
    import warnings

    # catch_warnings restores the previous filter state on exit, even if the
    # body raises; the former filterwarnings()/resetwarnings() pair skipped
    # the reset on failure and wiped every filter on success.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        from sklearn.datasets import load_iris
        from sklearn.metrics import accuracy_score

        from lale.lib.lale import Hyperopt
        from lale.operators import make_pipeline

        data = load_iris()
        X, y = data.data, data.target
        pca = PCA(n_components=3)
        nys = Nystroem(n_components=10)
        concat = ConcatFeatures()
        lr = LogisticRegression(random_state=42, C=0.1)
        pipeline = make_pipeline(
            ((((SimpleImputer() | NoOp()) >> pca) & nys) >> concat >> lr)
            | KNeighborsClassifier()
        )
        clf = Hyperopt(estimator=pipeline, max_evals=1, handle_cv_failure=True)
        trained = clf.fit(X, y)
        predictions = trained.predict(X)
        print(accuracy_score(y, predictions))
def test_two_estimators_predict_proba1(self):
    """Expect ValueError from predict_proba when the final step is PassiveAggressiveClassifier."""
    scaler = StandardScaler()
    branches = PCA() & Nystroem() & PassiveAggressiveClassifier()
    pipeline = (
        scaler
        >> branches
        >> ConcatFeatures()
        >> NoOp()
        >> PassiveAggressiveClassifier()
    )
    pipeline.fit(self.X_train, self.y_train)
    with self.assertRaises(ValueError):
        pipeline.predict_proba(self.X_test)
def test_comparison_with_scikit(self):
    """Compare cross-validation scores of a lale pipeline and its sklearn twin.

    Both pipelines combine PCA and Nystroem features and feed them to
    LogisticRegression with the same hyperparameters and random seeds. The
    scores are formatted to one decimal place of percent before comparison.
    """
    import warnings

    # catch_warnings restores the previous filter state on exit, even if the
    # body raises; the former filterwarnings()/resetwarnings() pair skipped
    # the reset on failure and wiped every filter on success.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        import sklearn.datasets
        import sklearn.utils  # used below for shuffle(); imported explicitly

        from lale.helpers import cross_val_score as lale_cross_val_score
        from lale.lib.sklearn import PCA

        pca = PCA(n_components=3, random_state=42, svd_solver='arpack')
        nys = Nystroem(n_components=10, random_state=42)
        concat = ConcatFeatures()
        lale_lr = LogisticRegression(random_state=42, C=0.1)
        trainable = (pca & nys) >> concat >> lale_lr
        digits = sklearn.datasets.load_digits()
        X, y = sklearn.utils.shuffle(digits.data, digits.target, random_state=42)
        cv_results = lale_cross_val_score(trainable, X, y)
        cv_results = ['{0:.1%}'.format(score) for score in cv_results]

        from sklearn.decomposition import PCA as SklearnPCA
        from sklearn.kernel_approximation import Nystroem as SklearnNystroem
        from sklearn.linear_model import LogisticRegression as SklearnLR
        from sklearn.model_selection import (
            cross_val_score as sklearn_cross_val_score,
        )
        from sklearn.pipeline import FeatureUnion, make_pipeline

        union = FeatureUnion(
            [
                ("pca", SklearnPCA(n_components=3, random_state=42, svd_solver='arpack')),
                ("nys", SklearnNystroem(n_components=10, random_state=42)),
            ]
        )
        sklearn_lr = SklearnLR(random_state=42, C=0.1)
        pipeline = make_pipeline(union, sklearn_lr)
        scikit_cv_results = sklearn_cross_val_score(pipeline, X, y, cv=5)
        scikit_cv_results = ['{0:.1%}'.format(score) for score in scikit_cv_results]
        self.assertEqual(cv_results, scikit_cv_results)
def test_remove_last4(self):
    """After remove_last(inplace=True), both the result and the original have six steps."""
    scaler = StandardScaler()
    branches = PCA() & Nystroem() & PassiveAggressiveClassifier()
    pipeline = (
        scaler
        >> branches
        >> ConcatFeatures()
        >> NoOp()
        >> PassiveAggressiveClassifier()
    )
    new_pipeline = pipeline.remove_last(inplace=True)
    for candidate in (new_pipeline, pipeline):
        self.assertEqual(len(candidate._steps), 6)
def test_remove_last2(self):
    """Expect ValueError from remove_last() when the pipeline ends in two parallel steps."""
    scaler = StandardScaler()
    branches = PCA() & Nystroem() & PassiveAggressiveClassifier()
    sinks = PassiveAggressiveClassifier() & LogisticRegression()
    pipeline = scaler >> branches >> ConcatFeatures() >> NoOp() >> sinks
    self.assertRaises(ValueError, pipeline.remove_last)
# Builds MinMaxScaler >> (PCA & Nystroem) >> ConcatFeatures >> LogisticRegression, renders it with
# lale.pretty_print.to_string(pipeline, astype="sklearn"), and hands the expected and printed
# text to self._roundtrip.
# NOTE(review): the triple-quoted `expected` text sits on a single line in this view; presumably
# its original line breaks were lost -- confirm against the upstream file before reformatting.
def test_astype_sklearn(self): from lale.lib.lale import ConcatFeatures from lale.lib.sklearn import PCA, LogisticRegression, MinMaxScaler, Nystroem pca = PCA(copy=False) logistic_regression = LogisticRegression(solver="saga", C=0.9) pipeline = ( MinMaxScaler() >> (pca & Nystroem()) >> ConcatFeatures >> logistic_regression ) expected = """from sklearn.preprocessing import MinMaxScaler from sklearn.decomposition import PCA from sklearn.kernel_approximation import Nystroem from sklearn.pipeline import make_union from sklearn.linear_model import LogisticRegression from sklearn.pipeline import make_pipeline pca = PCA(copy=False) union = make_union(pca, Nystroem()) logistic_regression = LogisticRegression(solver="saga", C=0.9) pipeline = make_pipeline(MinMaxScaler(), union, logistic_regression)""" printed = lale.pretty_print.to_string(pipeline, astype="sklearn") self._roundtrip(expected, printed)
def test_compose3(self):
    """Fit Nystroem >> PCA >> LogisticRegression on the digits data and predict."""
    trainable = (
        Nystroem(n_components=15)
        >> PCA(n_components=10)
        >> LogisticRegression(random_state=42)
    )
    digits = sklearn.datasets.load_digits()
    features, labels = digits.data, digits.target
    trained = trainable.fit(features, labels)
    _ = trained.predict(features)
def test_pca_nys_lr(self):
    """Fit make_union(Nystroem, PCA) >> LogisticRegression on the digits data and predict."""
    from lale.operators import make_union

    features_union = make_union(Nystroem(n_components=15), PCA(n_components=10))
    trainable = features_union >> LogisticRegression(random_state=42)
    digits = sklearn.datasets.load_digits()
    features, labels = digits.data, digits.target
    trained = trainable.fit(features, labels)
    _ = trained.predict(features)
def test_fit_args(self):
    """Fit a TopKVotingClassifier with k=2 over a planned pipeline and predict."""
    # The unused load_iris / accuracy_score imports were dropped; the test
    # only needs the operators below and the self.X_* / self.y_* fixtures.
    from lale.lib.lale import TopKVotingClassifier
    from lale.lib.sklearn import Nystroem

    ensemble = TopKVotingClassifier(
        estimator=(PCA() | Nystroem())
        >> (LogisticRegression() | KNeighborsClassifier()),
        k=2,
    )
    trained = ensemble.fit(self.X_train, self.y_train)
    trained.predict(self.X_test)
def test_remove_last5(self):
    """Call freeze_trainable() on the result of remove_last(inplace=True)."""
    scaler = StandardScaler()
    branches = PCA() & Nystroem() & PassiveAggressiveClassifier()
    pipeline = (
        scaler
        >> branches
        >> ConcatFeatures()
        >> NoOp()
        >> PassiveAggressiveClassifier()
    )
    shortened = pipeline.remove_last(inplace=True)
    shortened.freeze_trainable()
def test_two_estimators_predict_proba1(self):
    """Fit a pipeline with GaussianNB as a branch and as the final step, then call predict_proba."""
    scaler = StandardScaler()
    branches = PCA() & Nystroem() & GaussianNB()
    pipeline = scaler >> branches >> ConcatFeatures() >> NoOp() >> GaussianNB()
    pipeline.fit(self.X_train, self.y_train)
    pipeline.predict_proba(self.X_test)
def test_fit_smaller_trials(self):
    """With max_evals=3 and k=20, the final ensemble holds at most three estimators."""
    # The unused load_iris / accuracy_score imports were dropped; the test
    # only needs the operators below and the self.X_train / self.y_train fixtures.
    from lale.lib.lale import TopKVotingClassifier
    from lale.lib.sklearn import Nystroem

    ensemble = TopKVotingClassifier(
        estimator=(PCA() | Nystroem())
        >> (LogisticRegression() | KNeighborsClassifier()),
        args_to_optimizer={'max_evals': 3},
        k=20,
    )
    trained = ensemble.fit(self.X_train, self.y_train)
    final_ensemble = trained._impl._best_estimator
    self.assertLessEqual(len(final_ensemble._impl._wrapped_model.estimators), 3)
def test_two_estimators_predict_proba(self):
    """Fit a pipeline with LogisticRegression as a branch and as the final step, then call predict_proba."""
    scaler = StandardScaler()
    branches = PCA() & Nystroem() & LogisticRegression()
    pipeline = (
        scaler >> branches >> ConcatFeatures() >> NoOp() >> LogisticRegression()
    )
    trained = pipeline.fit(self.X_train, self.y_train)
    trained.predict_proba(self.X_test)
def test_make_choice_with_instance(self):
    """Build operator choices with ``|`` and ``make_choice``.

    Calling ``fit`` on a choice of instances is expected to raise
    AttributeError. The three planned pipelines are only constructed; the
    test passes if building them does not raise.
    """
    # Only make_choice is used; the unused make_union / make_pipeline imports
    # and the never-read `trained` binding were dropped.
    from sklearn.datasets import load_iris

    from lale.operators import make_choice

    iris = load_iris()
    X, y = iris.data, iris.target
    tfm = PCA() | Nystroem() | NoOp()
    with self.assertRaises(AttributeError):
        tfm.fit(X, y)
    planned_pipeline1 = (
        (OneHotEncoder | NoOp) >> tfm >> (LogisticRegression | KNeighborsClassifier)
    )
    planned_pipeline2 = (
        (OneHotEncoder | NoOp)
        >> (PCA | Nystroem)
        >> (LogisticRegression | KNeighborsClassifier)
    )
    planned_pipeline3 = (
        make_choice(OneHotEncoder, NoOp)
        >> make_choice(PCA, Nystroem)
        >> make_choice(LogisticRegression, KNeighborsClassifier)
    )
def test_higher_order_1(self):
    """Check the JSON form of a Both operator and that it round-trips through from_json.

    ``op1`` is a trainable PCA instance and ``op2`` is the planned Nystroem
    operator; both appear under ``steps`` and are referenced from
    ``hyperparams``.
    """
    from lale.json_operator import from_json
    from lale.lib.lale import Both
    from lale.lib.sklearn import PCA, Nystroem

    operator = Both(op1=PCA(n_components=2), op2=Nystroem)

    pca_json = {
        "class": PCA.class_name(),
        "state": "trainable",
        "operator": "PCA",
        "label": "PCA",
        "documentation_url": "https://lale.readthedocs.io/en/latest/modules/lale.lib.sklearn.pca.html",
        "hyperparams": {"n_components": 2},
        "is_frozen_trainable": False,
    }
    nystroem_json = {
        "class": Nystroem.class_name(),
        "state": "planned",
        "operator": "Nystroem",
        "label": "Nystroem",
        "documentation_url": "https://lale.readthedocs.io/en/latest/modules/lale.lib.sklearn.nystroem.html",
    }
    json_expected = {
        "class": Both.class_name(),
        "state": "trainable",
        "operator": "Both",
        "label": "Both",
        "documentation_url": "https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.both.html",
        "hyperparams": {
            "op1": {"$ref": "../steps/pca"},
            "op2": {"$ref": "../steps/nystroem"},
        },
        "steps": {"pca": pca_json, "nystroem": nystroem_json},
        "is_frozen_trainable": False,
    }

    json_actual = operator.to_json()
    self.assertEqual(json_actual, json_expected)
    # Rebuild the operator from its JSON and serialize it again.
    json_roundtrip = from_json(json_actual).to_json()
    self.assertEqual(json_actual, json_roundtrip)
def test_fit_args(self):
    """Fit a TopKVotingClassifier with k=2 over a planned pipeline and predict."""
    from lale.lib.lale import TopKVotingClassifier
    from lale.lib.sklearn import Nystroem

    transformer_choice = PCA() | Nystroem()
    classifier_choice = LogisticRegression() | KNeighborsClassifier()
    ensemble = TopKVotingClassifier(
        estimator=transformer_choice >> classifier_choice, k=2
    )
    trained = ensemble.fit(self.X_train, self.y_train)
    trained.predict(self.X_test)
def test_resampler(self):
    """Exercise the resampler operator named by ``res_name`` end to end.

    ``res_name`` is not defined in this function; it is read from the
    enclosing scope as a dotted ``module.ClassName`` path. The test covers
    construction without arguments (expected to raise ValueError), schema
    validation, fit/predict, Hyperopt, cross-validation and ``to_json``.
    """
    import importlib

    from lale.lib.lale import ConcatFeatures, NoOp
    from lale.lib.sklearn import (
        PCA,
        LogisticRegression,
        Nystroem,
        RandomForestClassifier,
    )

    X_train, y_train = self.X_train, self.y_train
    X_test, y_test = self.X_test, self.y_test

    # Split the dotted path at its last dot into module and class name.
    module_name, _, class_name = res_name.rpartition('.')
    module = importlib.import_module(module_name)
    class_ = getattr(module, class_name)

    self.assertRaises(ValueError, class_)

    # test_schemas_are_schemas
    for schema in (
        class_.input_schema_fit(),
        class_.input_schema_predict(),
        class_.output_schema_predict(),
        class_.hyperparam_schema(),
    ):
        lale.type_checking.validate_is_schema(schema)

    # test_init_fit_predict
    from lale.operators import make_pipeline

    pipeline1 = PCA() >> class_(operator=make_pipeline(LogisticRegression()))
    trained = pipeline1.fit(X_train, y_train)
    predictions = trained.predict(X_test)

    pipeline2 = class_(operator=make_pipeline(PCA(), LogisticRegression()))
    trained = pipeline2.fit(X_train, y_train)
    predictions = trained.predict(X_test)

    # test_with_hyperopt
    from lale.lib.lale import Hyperopt

    optimizer = Hyperopt(
        estimator=PCA >> class_(operator=make_pipeline(LogisticRegression())),
        max_evals=1,
        show_progressbar=False,
    )
    trained_optimizer = optimizer.fit(X_train, y_train)
    predictions = trained_optimizer.predict(X_test)

    pipeline3 = class_(
        operator=PCA() >> (Nystroem & NoOp) >> ConcatFeatures >> LogisticRegression()
    )
    optimizer = Hyperopt(estimator=pipeline3, max_evals=1, show_progressbar=False)
    trained_optimizer = optimizer.fit(X_train, y_train)
    predictions = trained_optimizer.predict(X_test)

    pipeline4 = (
        (
            PCA >> class_(operator=make_pipeline(Nystroem()))
            & class_(operator=make_pipeline(Nystroem()))
        )
        >> ConcatFeatures
        >> LogisticRegression()
    )
    optimizer = Hyperopt(
        estimator=pipeline4, max_evals=1, scoring='roc_auc', show_progressbar=False
    )
    trained_optimizer = optimizer.fit(X_train, y_train)
    predictions = trained_optimizer.predict(X_test)

    # test_cross_validation
    from lale.helpers import cross_val_score

    cv_results = cross_val_score(pipeline1, X_train, y_train, cv=2)
    self.assertEqual(len(cv_results), 2)

    # test_to_json
    pipeline1.to_json()
def test_concat_with_hyperopt(self):
    """Run Hyperopt for two evaluations over (PCA & Nystroem) >> ConcatFeatures >> LogisticRegression on iris."""
    from sklearn.datasets import load_iris

    from lale.lib.lale import Hyperopt

    branches = PCA(n_components=3) & Nystroem(n_components=10)
    trainable = (
        branches >> ConcatFeatures() >> LogisticRegression(random_state=42, C=0.1)
    )
    clf = Hyperopt(estimator=trainable, max_evals=2)
    iris_data = load_iris()
    features, labels = iris_data.data, iris_data.target
    clf.fit(features, labels)
    clf.predict(features)
def test_with_pandas(self):
    """Fit and predict (PCA & Nystroem) >> ConcatFeatures >> LogisticRegression on data from load_iris_df."""
    import warnings

    from lale.datasets import load_iris_df

    # Scope the "ignore" filter to this test; the original installed it
    # globally and never removed it, so it leaked into later tests.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")
        pca = PCA(n_components=3)
        nys = Nystroem(n_components=10)
        concat = ConcatFeatures()
        lr = LogisticRegression(random_state=42, C=0.1)
        trainable = (pca & nys) >> concat >> lr

        (X_train, y_train), (X_test, y_test) = load_iris_df()
        trained = trainable.fit(X_train, y_train)
        predicted = trained.predict(X_test)
def test_fit_smaller_trials(self):
    """With max_evals=3 and k=20, the final ensemble holds at most three estimators."""
    from lale.lib.lale import TopKVotingClassifier
    from lale.lib.sklearn import Nystroem

    transformer_choice = PCA() | Nystroem()
    classifier_choice = LogisticRegression() | KNeighborsClassifier()
    ensemble = TopKVotingClassifier(
        estimator=transformer_choice >> classifier_choice,
        args_to_optimizer={"max_evals": 3},
        k=20,
    )
    trained = ensemble.fit(self.X_train, self.y_train)
    final_ensemble = trained._impl._best_estimator
    estimator_count = len(final_ensemble._impl_instance().estimators)
    self.assertLessEqual(estimator_count, 3)
def test_compose4(self):
    """Compose three operator choices into a pipeline; nothing is fitted."""
    from lale.operators import make_choice

    digits = sklearn.datasets.load_digits()
    ohe = OneHotEncoder(handle_unknown=OneHotEncoder.handle_unknown.ignore)
    ohe.get_params()
    encode_or_skip = ohe | NoOp()
    pca_or_nystroem = PCA() | Nystroem()
    lr_or_knn = LogisticRegression() | KNeighborsClassifier()
    model_plan = encode_or_skip >> pca_or_nystroem >> lr_or_knn
def test_concat_with_hyperopt2(self):
    """Run Hyperopt for two evaluations over a pipeline built with make_pipeline and make_union on iris."""
    from sklearn.datasets import load_iris

    from lale.lib.lale import Hyperopt
    from lale.operators import make_pipeline, make_union

    features_union = make_union(PCA(n_components=3), Nystroem(n_components=10))
    trainable = make_pipeline(
        features_union, LogisticRegression(random_state=42, C=0.1)
    )
    clf = Hyperopt(estimator=trainable, max_evals=2)
    iris_data = load_iris()
    features, labels = iris_data.data, iris_data.target
    clf.fit(features, labels)
    clf.predict(features)