def test_trainable_pipe_left(self):
    """A pipeline composed with `>>` from configured operators is trainable.

    Removed the unused `from lale.lib.lale import NoOp` import.
    """
    from lale.lib.sklearn import LogisticRegression
    from sklearn.decomposition import PCA

    # NOTE(review): relies on a module-level `import sklearn` (or
    # `sklearn.datasets`) — not imported locally here; confirm at file top.
    iris = sklearn.datasets.load_iris()
    # `>>` pipes PCA's transformed output into LogisticRegression.
    pipeline = PCA() >> LogisticRegression(random_state=42)
    pipeline.fit(iris.data, iris.target)
def test_feature_preprocessor(self):
    """Smoke-test one feature preprocessor: schemas, fit/transform, JSON, pipeline, tuning.

    NOTE(review): `fproc_name` is not defined in this method; presumably it
    is injected from the enclosing scope when these tests are generated —
    verify against the test factory.
    """
    X_train, y_train = self.X_train, self.y_train
    import importlib

    # Resolve the operator class from its dotted path.
    dotted = fproc_name.split(".")
    fproc_module = importlib.import_module(".".join(dotted[0:-1]))
    fproc_class = getattr(fproc_module, dotted[-1])
    fproc = fproc_class()

    from lale.lib.sklearn.one_hot_encoder import OneHotEncoder

    if isinstance(fproc, OneHotEncoder):  # type: ignore
        # fproc = OneHotEncoder(handle_unknown = 'ignore')
        # remove the hack when this is fixed
        fproc = PCA()

    # test_schemas_are_schemas
    for schema in (
        fproc.input_schema_fit(),
        fproc.input_schema_transform(),
        fproc.output_schema_transform(),
        fproc.hyperparam_schema(),
    ):
        lale.type_checking.validate_is_schema(schema)

    # test_init_fit_transform
    fitted = fproc.fit(self.X_train, self.y_train)
    _ = fitted.transform(self.X_test)

    # test_predict_on_trainable
    fitted = fproc.fit(X_train, y_train)
    fproc.transform(X_train)

    # test_to_json
    fproc.to_json()

    # test_in_a_pipeline
    # This test assumes that the output of feature processing is compatible with LogisticRegression
    from lale.lib.sklearn import LogisticRegression

    pipe = fproc >> LogisticRegression()
    fitted = pipe.fit(self.X_train, self.y_train)
    _ = fitted.predict(self.X_test)

    # Tune the pipeline with LR using Hyperopt
    from lale.lib.lale import Hyperopt

    tuner = Hyperopt(estimator=pipe, max_evals=1, verbose=True, cv=3)
    fitted = tuner.fit(self.X_train, self.y_train)
    _ = fitted.predict(self.X_test)
def test_planned_pipeline_with_choice(self):
    """Calling fit on a planned pipeline (one containing an operator choice
    `|`) must raise AttributeError with an actionable error message."""
    planned = PCA() >> (LogisticRegression() | KNeighborsClassifier())
    try:
        planned.fit(self.X, self.y)
    except AttributeError as e:
        # Pin the exact user-facing message, including the suggested fixes.
        self.assertEqual(
            e.__str__(),
            """The pipeline is not trainable, which means you can not call fit on it. Suggested fixes: Fix [A]: You can make the following changes in the pipeline in order to make it trainable: [A.1] Please remove the operator choice `|` from `LogisticRegression | KNeighborsClassifier` and keep only one of those operators. Fix [B]: Alternatively, you could use `auto_configure(X, y, Hyperopt, max_evals=5)` on the pipeline to use Hyperopt for `max_evals` iterations for hyperparameter tuning. `Hyperopt` can be imported as `from lale.lib.lale import Hyperopt`.""",
        )
def test_fit3(self):
    """Batching wraps an inner pipeline and trains it in mini-batches of 10.

    Replaced the unused `predictions` local with `_`, matching the
    convention used by the other tests in this file.
    """
    from lale.lib.sklearn import MinMaxScaler, MLPClassifier, PCA

    # NOTE(review): `Batching` is not imported locally; presumably imported
    # at module level — confirm at file top.
    pipeline = PCA() >> Batching(
        operator=MinMaxScaler() >> MLPClassifier(random_state=42),
        batch_size=10,
    )
    trained = pipeline.fit(self.X_train, self.y_train)
    _ = trained.predict(self.X_test)
def test_export_to_sklearn_pipeline_with_noop_3(self):
    """Exporting a trained pipeline whose final step is NoOp must not raise.

    Removed the unused `make_pipeline` import and bound the unused export
    result to `_`.
    """
    from lale.lib.sklearn import PCA, KNeighborsClassifier
    from lale.lib.lale import NoOp

    # This test is probably unnecessary, but doesn't harm at this point
    lale_pipeline = PCA(n_components=3) >> KNeighborsClassifier() >> NoOp()
    trained_lale_pipeline = lale_pipeline.fit(self.X_train, self.y_train)
    _ = trained_lale_pipeline.export_to_sklearn_pipeline()
def test_export_to_sklearn_pipeline_with_noop_2(self):
    """A NoOp between PCA and KNN must survive export; predictions must agree.

    Removed the unused `make_pipeline` import.
    """
    from lale.lib.sklearn import PCA, KNeighborsClassifier
    from lale.lib.lale import NoOp

    lale_pipeline = PCA(n_components=3) >> NoOp() >> KNeighborsClassifier()
    trained_lale_pipeline = lale_pipeline.fit(self.X_train, self.y_train)
    sklearn_pipeline = trained_lale_pipeline.export_to_sklearn_pipeline()
    self.assert_equal_predictions(sklearn_pipeline, trained_lale_pipeline)
def test_feature_preprocessor(self):
    """Smoke-test one feature preprocessor (older lale API variant).

    NOTE(review): `fproc_name` is not defined here; presumably injected from
    the enclosing scope when these tests are generated — verify.
    """
    X_train, y_train = self.X_train, self.y_train
    X_test, y_test = self.X_test, self.y_test
    import importlib

    # Resolve the operator class from its dotted path.
    parts = fproc_name.split('.')
    fproc_module = importlib.import_module('.'.join(parts[0:-1]))
    fproc_class = getattr(fproc_module, parts[-1])
    fproc = fproc_class()

    from lale.lib.sklearn.one_hot_encoder import OneHotEncoderImpl

    if isinstance(fproc._impl, OneHotEncoderImpl):
        #fproc = OneHotEncoder(handle_unknown = 'ignore')
        #remove the hack when this is fixed
        fproc = PCA()

    #test_schemas_are_schemas
    from lale.helpers import validate_is_schema

    for schema in (
        fproc.input_schema_fit(),
        fproc.input_schema_transform(),
        fproc.output_schema(),
        fproc.hyperparam_schema(),
    ):
        validate_is_schema(schema)

    #test_init_fit_transform
    fitted = fproc.fit(self.X_train, self.y_train)
    predictions = fitted.transform(self.X_test)

    #test_predict_on_trainable
    fitted = fproc.fit(X_train, y_train)
    fproc.transform(X_train)

    #test_to_json
    fproc.to_json()

    #test_in_a_pipeline
    #This test assumes that the output of feature processing is compatible with LogisticRegression
    from lale.lib.sklearn import LogisticRegression

    pipe = fproc >> LogisticRegression()
    fitted = pipe.fit(self.X_train, self.y_train)
    predictions = fitted.predict(self.X_test)

    #Tune the pipeline with LR using HyperoptClassifier
    from lale.lib.lale import HyperoptClassifier

    tuner = HyperoptClassifier(model=pipe, max_evals=1)
    fitted = tuner.fit(self.X_train, self.y_train)
    predictions = fitted.predict(self.X_test)
def test_pipeline_create_trained(self):
    """Rebuilding a lale.lib.sklearn.Pipeline from already-trained steps
    yields a trained operator that can predict without refitting."""
    fitted = (PCA() >> LogisticRegression()).fit(self.X_train, self.y_train)
    self.assertIsInstance(fitted, lale.operators.TrainedPipeline)
    trained_pca, trained_lr = fitted.steps()
    rebuilt = lale.lib.sklearn.Pipeline(
        steps=[("pca1", trained_pca), ("lr1", trained_lr)]
    )
    # The rebuilt pipeline is trained — no fit call needed before predict.
    self.assertIsInstance(rebuilt, lale.operators.TrainedIndividualOp)
    accuracy_score(self.y_test, rebuilt.predict(self.X_test))
def test_export_to_sklearn_pipeline(self):
    """Each exported sklearn step must carry the same hyperparameters as the
    corresponding trained lale step, and predictions must agree."""
    planned = PCA(n_components=3) >> KNeighborsClassifier()
    trained = planned.fit(self.X_train, self.y_train)
    exported = trained.export_to_sklearn_pipeline()
    trained_steps = trained.steps()
    for idx, step_name in enumerate(exported.named_steps):
        exported_params = exported.named_steps[step_name].get_params()
        expected_params = self.get_sklearn_params(trained_steps[idx])
        self.assertEqual(exported_params, expected_params)
    self.assert_equal_predictions(exported, trained)
def test_make_choice_with_instance(self):
    """An operator choice built from instances is planned, hence not trainable.

    Removed unused `make_union`/`make_pipeline` imports and bound the
    never-read locals to `_`, matching the newer variant of this test.
    """
    from lale.operators import make_choice
    from sklearn.datasets import load_iris

    iris = load_iris()
    X, y = iris.data, iris.target
    tfm = PCA() | Nystroem() | NoOp()
    with self.assertRaises(AttributeError):
        # fit on a planned (choice) operator must fail at runtime
        _ = tfm.fit(X, y)
    # The pipelines below are only constructed, never trained: building
    # them must not raise.
    _ = (OneHotEncoder | NoOp) >> tfm >> (LogisticRegression | KNeighborsClassifier)
    _ = (
        (OneHotEncoder | NoOp)
        >> (PCA | Nystroem)
        >> (LogisticRegression | KNeighborsClassifier)
    )
    _ = (
        make_choice(OneHotEncoder, NoOp)
        >> make_choice(PCA, Nystroem)
        >> make_choice(LogisticRegression, KNeighborsClassifier)
    )
def test_resampler(self):
    """Smoke-test one resampler/higher-order operator: constructor contract,
    schemas, fit/predict in several pipeline shapes, Hyperopt tuning,
    cross-validation, and JSON round-trip.

    NOTE(review): `res_name` is not defined in this method; presumably it is
    injected from the enclosing scope when these tests are generated — verify.
    """
    from lale.lib.sklearn import PCA, Nystroem, LogisticRegression, RandomForestClassifier
    from lale.lib.lale import NoOp, ConcatFeatures

    X_train, y_train = self.X_train, self.y_train
    X_test, y_test = self.X_test, self.y_test
    import importlib

    # Resolve the operator class from its dotted path.
    module_name = ".".join(res_name.split('.')[0:-1])
    class_name = res_name.split('.')[-1]
    module = importlib.import_module(module_name)
    class_ = getattr(module, class_name)

    # Constructing without the required `operator` argument must fail.
    with self.assertRaises(ValueError):
        res = class_()

    #test_schemas_are_schemas
    lale.type_checking.validate_is_schema(class_.input_schema_fit())
    lale.type_checking.validate_is_schema(class_.input_schema_predict())
    lale.type_checking.validate_is_schema(class_.output_schema_predict())
    lale.type_checking.validate_is_schema(class_.hyperparam_schema())

    #test_init_fit_predict
    from lale.operators import make_pipeline

    # Resampler as the right-hand step of a pipeline.
    pipeline1 = PCA() >> class_(operator=make_pipeline(LogisticRegression()))
    trained = pipeline1.fit(X_train, y_train)
    predictions = trained.predict(X_test)
    # Resampler wrapping a whole pipeline.
    pipeline2 = class_(operator=make_pipeline(PCA(), LogisticRegression()))
    trained = pipeline2.fit(X_train, y_train)
    predictions = trained.predict(X_test)

    #test_with_hyperopt
    from lale.lib.lale import Hyperopt

    optimizer = Hyperopt(estimator=PCA >> class_(operator=make_pipeline(LogisticRegression())), max_evals = 1, show_progressbar=False)
    trained_optimizer = optimizer.fit(X_train, y_train)
    predictions = trained_optimizer.predict(X_test)
    # Resampler wrapping a diamond-shaped (`&` + ConcatFeatures) pipeline.
    pipeline3 = class_(operator= PCA() >> (Nystroem & NoOp) >> ConcatFeatures >> LogisticRegression())
    optimizer = Hyperopt(estimator=pipeline3, max_evals = 1, show_progressbar=False)
    trained_optimizer = optimizer.fit(X_train, y_train)
    predictions = trained_optimizer.predict(X_test)
    # Two resampler branches joined with `&` before concatenation.
    pipeline4 = (PCA >> class_(operator=make_pipeline(Nystroem())) & class_(operator=make_pipeline(Nystroem()))) >> ConcatFeatures >> LogisticRegression()
    optimizer = Hyperopt(estimator=pipeline4, max_evals = 1, scoring='roc_auc', show_progressbar=False)
    trained_optimizer = optimizer.fit(X_train, y_train)
    predictions = trained_optimizer.predict(X_test)

    #test_cross_validation
    from lale.helpers import cross_val_score

    cv_results = cross_val_score(pipeline1, X_train, y_train, cv = 2)
    self.assertEqual(len(cv_results), 2)

    #test_to_json
    pipeline1.to_json()
def test_export_to_sklearn_pipeline(self):
    """Each exported sklearn step must have the same params as the wrapped
    model inside the corresponding trained lale step.

    Removed the unused `make_pipeline` import.
    """
    from lale.lib.sklearn import PCA
    from lale.lib.sklearn import KNeighborsClassifier

    lale_pipeline = PCA(n_components=3) >> KNeighborsClassifier()
    trained_lale_pipeline = lale_pipeline.fit(self.X_train, self.y_train)
    sklearn_pipeline = trained_lale_pipeline.export_to_sklearn_pipeline()
    for i, pipeline_step in enumerate(sklearn_pipeline.named_steps):
        sklearn_step_params = sklearn_pipeline.named_steps[pipeline_step].get_params()
        # Compare against the underlying wrapped sklearn model's params.
        lale_sklearn_params = trained_lale_pipeline.steps()[i]._impl._wrapped_model.get_params()
        self.assertEqual(sklearn_step_params, lale_sklearn_params)
    self.assert_equal_predictions(sklearn_pipeline, trained_lale_pipeline)
def test_make_choice_with_instance(self):
    """An operator choice built from instances is planned, hence not trainable;
    choices can still be composed into larger planned pipelines."""
    from lale.operators import make_choice
    from sklearn.datasets import load_iris

    dataset = load_iris()
    X, y = dataset.data, dataset.target
    choice = PCA() | Nystroem() | NoOp()
    with self.assertRaises(AttributeError):
        # we are trying to trigger a runtime error here, so we ignore the static warning
        _ = choice.fit(X, y)  # type: ignore
    # Constructing these planned pipelines must not raise.
    _ = (OneHotEncoder | NoOp) >> choice >> (LogisticRegression | KNeighborsClassifier)
    _ = (
        (OneHotEncoder | NoOp)
        >> (PCA | Nystroem)
        >> (LogisticRegression | KNeighborsClassifier)
    )
    _ = (
        make_choice(OneHotEncoder, NoOp)
        >> make_choice(PCA, Nystroem)
        >> make_choice(LogisticRegression, KNeighborsClassifier)
    )
def test_disable_schema_validation_pipeline(self):
    """With LALE_DISABLE_SCHEMA_VALIDATION set, a fit schema that only admits
    string data does not block fitting on numeric data.

    Bug fix: the env-var reset now runs in a `finally` block, so a failing
    fit/predict no longer leaves schema validation disabled for the tests
    that run afterwards.
    """
    os.environ["LALE_DISABLE_SCHEMA_VALIDATION"] = 'True'
    try:
        from lale.lib.sklearn import PCA, LogisticRegression
        import lale.schemas as schemas

        # Strings-only input schema: would reject the numeric training data
        # if validation were active.
        lr_input = schemas.Object(
            required=['X', 'y'],
            X=schemas.AnyOf([
                schemas.Array(schemas.Array(schemas.String())),
                schemas.Array(schemas.String()),
            ]),
            y=schemas.Array(schemas.String()),
        )
        foo = LogisticRegression.customize_schema(input_fit=lr_input)
        abc = foo()
        pipeline = PCA() >> abc
        trained_pipeline = pipeline.fit(self.X_train, self.y_train)
        trained_pipeline.predict(self.X_test)
    finally:
        os.environ["LALE_DISABLE_SCHEMA_VALIDATION"] = 'False'
def test_enable_schema_validation_pipeline(self):
    """With schema validation enabled, a strings-only fit schema must reject
    the numeric training data with a ValueError."""
    with EnableSchemaValidation():
        import lale.schemas as schemas
        from lale.lib.sklearn import PCA, LogisticRegression

        # Input schema that only admits string features and labels.
        strings_only_input = schemas.Object(
            required=["X", "y"],
            X=schemas.AnyOf([
                schemas.Array(schemas.Array(schemas.String())),
                schemas.Array(schemas.String()),
            ]),
            y=schemas.Array(schemas.String()),
        )
        CustomLR = LogisticRegression.customize_schema(input_fit=strings_only_input)
        classifier = CustomLR()
        pipeline = PCA() >> classifier
        with self.assertRaises(ValueError):
            # fit raises, so predict is never reached.
            fitted = pipeline.fit(self.X_train, self.y_train)
            fitted.predict(self.X_test)
def test_disable_schema_validation_pipeline(self):
    """With data-schema validation disabled via the module flag, a fit schema
    that only admits string data does not block fitting on numeric data.

    Bug fix: the flag is now restored in a `finally` block, so a failing
    fit/predict no longer leaves validation disabled for subsequent tests.
    """
    existing_flag = disable_data_schema_validation
    set_disable_data_schema_validation(True)
    try:
        import lale.schemas as schemas
        from lale.lib.sklearn import PCA, LogisticRegression

        # Strings-only input schema: would reject the numeric training data
        # if validation were active.
        lr_input = schemas.Object(
            required=["X", "y"],
            X=schemas.AnyOf([
                schemas.Array(schemas.Array(schemas.String())),
                schemas.Array(schemas.String()),
            ]),
            y=schemas.Array(schemas.String()),
        )
        foo = LogisticRegression.customize_schema(input_fit=lr_input)
        abc = foo()
        pipeline = PCA() >> abc
        trained_pipeline = pipeline.fit(self.X_train, self.y_train)
        trained_pipeline.predict(self.X_test)
    finally:
        set_disable_data_schema_validation(existing_flag)
def test_export_to_sklearn_pipeline_with_noop_3(self):
    """Exporting a trained pipeline whose final step is NoOp must not raise."""
    # This test is probably unnecessary, but doesn't harm at this point
    fitted = (PCA(n_components=3) >> KNeighborsClassifier() >> NoOp()).fit(
        self.X_train, self.y_train
    )
    _ = fitted.export_to_sklearn_pipeline()
def test_export_to_sklearn_pipeline_with_noop_2(self):
    """A NoOp step between PCA and KNN must survive export to sklearn;
    both pipelines must produce identical predictions."""
    planned = PCA(n_components=3) >> NoOp() >> KNeighborsClassifier()
    fitted = planned.fit(self.X_train, self.y_train)
    exported = fitted.export_to_sklearn_pipeline()
    self.assert_equal_predictions(exported, fitted)
def test_PCA(self):
    """Trainable PCA can be fit when the target argument is an empty list."""
    transformer = PCA()
    transformer.fit(self.X, [])