def test_init_fit_predict_spark_pandas(self):
    """Concatenate a Spark-converted table with a plain pandas table.

    When ``spark_installed`` is false the method returns without asserting
    anything, so the test passes vacuously in that environment.
    """
    from lale.datasets import pandas2spark
    from lale.datasets.util import spark_installed

    if not spark_installed:
        return

    concat_op = ConcatFeatures()
    left = pd.DataFrame(
        [[11, 12, 13], [21, 22, 23], [31, 32, 33]], columns=["a", "b", "c"]
    )
    right = pd.DataFrame([[14, 15], [24, 25], [34, 35]], columns=["d", "e"])
    # Only the left table goes through pandas2spark; the right one stays pandas.
    left = pandas2spark(left, add_index=True)
    left = add_table_name(left, "A")
    right = add_table_name(right, "B")

    fitted = concat_op.fit(X=[left, right])
    transformed = fitted.transform([left, right])

    expected = pd.DataFrame(
        [
            [11, 12, 13, 14, 15],
            [21, 22, 23, 24, 25],
            [31, 32, 33, 34, 35],
        ],
        columns=["a", "b", "c", "d", "e"],
    )
    # Compare column by column as plain lists.
    for column in expected.columns:
        self.assertEqual(list(transformed[column]), list(expected[column]))
def test_planned_pipeline_3(self):
    """Run hyperopt on a planned pipeline with two concatenation stages."""
    plan = (
        (MinMaxScaler() & NoOp())
        >> ConcatFeatures()
        # StandardScaler is passed as a class here, not an instance.
        >> (StandardScaler & (NoOp() | MinMaxScaler()))
        >> ConcatFeatures()
        >> (LogisticRegression | KNeighborsClassifier)
    )
    run_hyperopt_on_planned_pipeline(plan)
def test_export_to_sklearn_pipeline2(self):
    """Export a nested-union lale pipeline and compare it with the original.

    Checks that the exported pipeline has a ``featureunion`` step of type
    ``FeatureUnion`` and a ``kneighborsclassifier`` step of the scikit-learn
    classifier type, then delegates to ``assert_equal_predictions``.
    """
    from sklearn.feature_selection import SelectKBest
    from sklearn.neighbors import KNeighborsClassifier as SklearnKNN
    from sklearn.pipeline import FeatureUnion

    inner_union = (
        PCA(svd_solver="randomized", random_state=42) & SelectKBest(k=3)
    ) >> ConcatFeatures()
    outer_union = inner_union & Nystroem(random_state=42)
    lale_pipeline = outer_union >> ConcatFeatures() >> KNeighborsClassifier()

    trained_lale_pipeline = lale_pipeline.fit(self.X_train, self.y_train)
    sklearn_pipeline = trained_lale_pipeline.export_to_sklearn_pipeline()

    exported_steps = sklearn_pipeline.named_steps
    self.assertIsInstance(exported_steps["featureunion"], FeatureUnion)
    self.assertIsInstance(exported_steps["kneighborsclassifier"], SklearnKNN)
    self.assert_equal_predictions(sklearn_pipeline, trained_lale_pipeline)
def test_multiple_estimators_predict_predict_proba(self):
    """Smoke-test predict_proba and predict on a pipeline with inner estimators.

    The calls are made on the pipeline object itself after ``fit``; the value
    returned by ``fit`` is not used.
    """
    pipeline = (
        StandardScaler()
        >> (LogisticRegression() & PCA())
        >> ConcatFeatures()
        >> (NoOp() & LinearSVC())
        >> ConcatFeatures()
        >> KNeighborsClassifier()
    )
    pipeline.fit(self.X_train, self.y_train)
    # Results are discarded; the test only requires that neither call raises.
    pipeline.predict_proba(self.X_test)
    pipeline.predict(self.X_test)
def test_init_fit_predict(self):
    """Concatenate two nested-list tables and compare the result cell by cell."""
    left = [[11, 12, 13], [21, 22, 23], [31, 32, 33]]
    right = [[14, 15], [24, 25], [34, 35]]
    expected = [
        [11, 12, 13, 14, 15],
        [21, 22, 23, 24, 25],
        [31, 32, 33, 34, 35],
    ]

    fitted = ConcatFeatures().fit(X=[left, right])
    transformed = fitted.transform([left, right])

    # Walk the actual output and look up the matching expected cell by index.
    for row_idx, row in enumerate(transformed):
        for col_idx, value in enumerate(row):
            self.assertEqual(value, expected[row_idx][col_idx])
def test_comparison_with_scikit(self):
    """Check that lale and scikit-learn give the same cross-validation scores.

    Builds ``(PCA & Nystroem) >> ConcatFeatures >> LogisticRegression`` in
    lale and the equivalent ``FeatureUnion`` + ``LogisticRegression`` pipeline
    in scikit-learn, scores both on the shuffled digits dataset, and compares
    the scores after formatting them as percentages with one decimal place.
    """
    import warnings

    # catch_warnings restores the previous filter state on exit, even when the
    # assertion below fails, instead of leaving "ignore" active or wiping
    # every globally configured filter with resetwarnings().
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")

        import sklearn.datasets
        import sklearn.utils  # explicit: shuffle() is used below
        from sklearn.decomposition import PCA as SklearnPCA
        from sklearn.kernel_approximation import Nystroem as SklearnNystroem
        from sklearn.linear_model import LogisticRegression as SklearnLR
        from sklearn.model_selection import (
            cross_val_score as sklearn_cross_val_score,
        )
        from sklearn.pipeline import FeatureUnion, make_pipeline

        from lale.helpers import cross_val_score as lale_cross_val_score
        from lale.lib.sklearn import PCA

        # lale side
        pca = PCA(n_components=3, random_state=42, svd_solver="arpack")
        nys = Nystroem(n_components=10, random_state=42)
        concat = ConcatFeatures()
        lale_lr = LogisticRegression(random_state=42, C=0.1)
        trainable = (pca & nys) >> concat >> lale_lr

        digits = sklearn.datasets.load_digits()
        X, y = sklearn.utils.shuffle(digits.data, digits.target, random_state=42)

        cv_results = lale_cross_val_score(trainable, X, y)
        cv_results = ["{0:.1%}".format(score) for score in cv_results]

        # scikit-learn side, same hyperparameters
        union = FeatureUnion(
            [
                (
                    "pca",
                    SklearnPCA(n_components=3, random_state=42, svd_solver="arpack"),
                ),
                ("nys", SklearnNystroem(n_components=10, random_state=42)),
            ]
        )
        sklearn_lr = SklearnLR(random_state=42, C=0.1)
        pipeline = make_pipeline(union, sklearn_lr)
        scikit_cv_results = sklearn_cross_val_score(pipeline, X, y, cv=5)
        scikit_cv_results = ["{0:.1%}".format(score) for score in scikit_cv_results]

        self.assertEqual(cv_results, scikit_cv_results)
def test_export_to_sklearn_pipeline3(self):
    """Export a three-branch union pipeline and compare it with the original.

    Checks the types of the exported ``featureunion``, ``selectkbest`` and
    ``logisticregression`` steps, then delegates to
    ``assert_equal_predictions``.
    """
    from sklearn.feature_selection import SelectKBest
    from sklearn.linear_model import LogisticRegression as SklearnLR
    from sklearn.pipeline import FeatureUnion

    branches = (
        (PCA() >> SelectKBest(k=2))
        & (Nystroem(random_state=42) >> SelectKBest(k=3))
        & (SelectKBest(k=3))
    )
    lale_pipeline = (
        branches >> ConcatFeatures() >> SelectKBest(k=2) >> LogisticRegression()
    )

    trained_lale_pipeline = lale_pipeline.fit(self.X_train, self.y_train)
    sklearn_pipeline = trained_lale_pipeline.export_to_sklearn_pipeline()

    exported_steps = sklearn_pipeline.named_steps
    self.assertIsInstance(exported_steps["featureunion"], FeatureUnion)
    self.assertIsInstance(exported_steps["selectkbest"], SelectKBest)
    self.assertIsInstance(exported_steps["logisticregression"], SklearnLR)
    self.assert_equal_predictions(sklearn_pipeline, trained_lale_pipeline)
def test_two_estimators_predict_proba1(self):
    """Expect ValueError from predict_proba with PassiveAggressiveClassifier last."""
    parallel_steps = PCA() & Nystroem() & PassiveAggressiveClassifier()
    pipeline = (
        StandardScaler()
        >> parallel_steps
        >> ConcatFeatures()
        >> NoOp()
        >> PassiveAggressiveClassifier()
    )
    pipeline.fit(self.X_train, self.y_train)
    with self.assertRaises(ValueError):
        pipeline.predict_proba(self.X_test)
def test_planned_pipeline_1(self):
    """Run hyperopt on a planned pipeline with operator choices at every stage."""
    scaler_choice = MinMaxScaler | Normalizer
    classifier_choice = LogisticRegression | KNeighborsClassifier
    plan = (
        (PCA & (MinMaxScaler | Normalizer))
        >> ConcatFeatures()
        >> scaler_choice
        >> classifier_choice
    )
    run_hyperopt_on_planned_pipeline(plan)
def test_remove_last4(self):
    """After remove_last(inplace=True), result and original both have 6 steps."""
    parallel_steps = PCA() & Nystroem() & PassiveAggressiveClassifier()
    pipeline = (
        StandardScaler()
        >> parallel_steps
        >> ConcatFeatures()
        >> NoOp()
        >> PassiveAggressiveClassifier()
    )
    shortened = pipeline.remove_last(inplace=True)
    # Both the returned pipeline and the original must report 6 steps.
    for candidate in (shortened, pipeline):
        self.assertEqual(len(candidate._steps), 6)
def test_two_estimators_predict1(self):
    """Smoke-test predict on a pipeline with an inner and a final classifier."""
    parallel_steps = PCA() & Nystroem() & PassiveAggressiveClassifier()
    pipeline = (
        StandardScaler()
        >> parallel_steps
        >> ConcatFeatures()
        >> NoOp()
        >> PassiveAggressiveClassifier()
    )
    # The prediction is discarded; the test only requires that nothing raises.
    pipeline.fit(self.X_train, self.y_train).predict(self.X_test)
def test_remove_last2(self):
    """Expect ValueError from remove_last when the pipeline ends in two steps."""
    parallel_steps = PCA() & Nystroem() & PassiveAggressiveClassifier()
    final_steps = PassiveAggressiveClassifier() & LogisticRegression()
    pipeline = (
        StandardScaler()
        >> parallel_steps
        >> ConcatFeatures()
        >> NoOp()
        >> final_steps
    )
    with self.assertRaises(ValueError):
        pipeline.remove_last()
def test_with_concat_features2(self):
    """Run Hyperopt over a choice between a concat pipeline and a classifier.

    The pipeline is built with ``lale.operators.make_pipeline`` and optimised
    with ``max_evals=1`` and ``handle_cv_failure=True`` on the iris data; the
    training-set accuracy of the result is printed, not asserted.
    """
    import warnings

    # catch_warnings restores the previous filter state on exit, even if the
    # body raises, instead of leaving "ignore" active or wiping every globally
    # configured filter with resetwarnings().
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")

        from sklearn.datasets import load_iris
        from sklearn.metrics import accuracy_score

        from lale.lib.lale import Hyperopt
        from lale.operators import make_pipeline

        data = load_iris()
        X, y = data.data, data.target

        pca = PCA(n_components=3)
        nys = Nystroem(n_components=10)
        concat = ConcatFeatures()
        lr = LogisticRegression(random_state=42, C=0.1)

        pipeline = make_pipeline(
            ((((SimpleImputer() | NoOp()) >> pca) & nys) >> concat >> lr)
            | KNeighborsClassifier()
        )
        clf = Hyperopt(estimator=pipeline, max_evals=1, handle_cv_failure=True)
        trained = clf.fit(X, y)
        predictions = trained.predict(X)
        print(accuracy_score(y, predictions))
def test_pipeline_1(self):
    """Serialise ``(PCA & NoOp) >> ConcatFeatures >> LR`` and round-trip it.

    The JSON from ``to_json`` must equal the expected document, and
    converting it back with ``from_json`` and serialising again must
    reproduce the same JSON.
    """
    self.maxDiff = None
    from lale.json_operator import from_json, to_json
    from lale.lib.lale import ConcatFeatures, NoOp
    from lale.lib.sklearn import PCA
    from lale.lib.sklearn import LogisticRegression as LR

    operator = (PCA & NoOp) >> ConcatFeatures >> LR

    # Expected per-step documents, in the order the steps appear.
    pca_step = {
        "class": PCA.class_name(),
        "state": "planned",
        "operator": "PCA",
        "label": "PCA",
        "documentation_url": "https://lale.readthedocs.io/en/latest/modules/lale.lib.sklearn.pca.html",
    }
    no_op_step = {
        "class": NoOp.class_name(),
        "state": "trained",
        "operator": "NoOp",
        "label": "NoOp",
        "documentation_url": "https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.no_op.html",
        "hyperparams": None,
        "coefs": None,
        "is_frozen_trainable": True,
        "is_frozen_trained": True,
    }
    concat_step = {
        "class": ConcatFeatures.class_name(),
        "state": "trained",
        "operator": "ConcatFeatures",
        "label": "ConcatFeatures",
        "documentation_url": "https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.concat_features.html",
        "hyperparams": None,
        "coefs": None,
        "is_frozen_trainable": True,
        "is_frozen_trained": True,
    }
    lr_step = {
        "class": LR.class_name(),
        "state": "planned",
        "operator": "LogisticRegression",
        "label": "LR",
        "documentation_url": "https://lale.readthedocs.io/en/latest/modules/lale.lib.sklearn.logistic_regression.html",
    }
    json_expected = {
        "class": "lale.operators.PlannedPipeline",
        "state": "planned",
        "edges": [
            ["pca", "concat_features"],
            ["no_op", "concat_features"],
            ["concat_features", "lr"],
        ],
        "steps": {
            "pca": pca_step,
            "no_op": no_op_step,
            "concat_features": concat_step,
            "lr": lr_step,
        },
    }

    serialized = to_json(operator)
    self.assertEqual(serialized, json_expected)

    # Round trip: JSON -> operator -> JSON must be stable.
    reserialized = to_json(from_json(serialized))
    self.assertEqual(serialized, reserialized)
def test_empty_schema(self):
    """Expect OperatorSchemaError when ``whiten`` has an empty schema."""
    from lale.search.schema2search_space import OperatorSchemaError

    customized_pca = PCA().customize_schema(whiten=schemas.Schema())
    scaler_choice = MinMaxScaler | Normalizer
    classifier_choice = LogisticRegression | KNeighborsClassifier
    plan = (
        (customized_pca & (MinMaxScaler | Normalizer))
        >> ConcatFeatures()
        >> scaler_choice
        >> classifier_choice
    )
    with self.assertRaises(OperatorSchemaError):
        run_hyperopt_on_planned_pipeline(plan)
def test_no_max_schema(self):
    """Expect SearchSpaceError when ``n_components`` is a float with no maximum."""
    from lale.search.search_space import SearchSpaceError

    customized_pca = PCA().customize_schema(n_components=schemas.Float(min=0.0))
    scaler_choice = MinMaxScaler | Normalizer
    classifier_choice = LogisticRegression | KNeighborsClassifier
    plan = (
        (customized_pca & (MinMaxScaler | Normalizer))
        >> ConcatFeatures()
        >> scaler_choice
        >> classifier_choice
    )
    with self.assertRaises(SearchSpaceError):
        run_hyperopt_on_planned_pipeline(plan)
def test_remove_last5(self):
    """Smoke-test freeze_trainable on the result of remove_last(inplace=True)."""
    parallel_steps = PCA() & Nystroem() & PassiveAggressiveClassifier()
    pipeline = (
        StandardScaler()
        >> parallel_steps
        >> ConcatFeatures()
        >> NoOp()
        >> PassiveAggressiveClassifier()
    )
    shortened = pipeline.remove_last(inplace=True)
    # No assertion: the test only requires that the call does not raise.
    shortened.freeze_trainable()
def test_init_fit_predict_pandas_series(self):
    """Concatenate a named DataFrame with a named Series and check each column."""
    concat_op = ConcatFeatures()
    frame = pd.DataFrame(
        [[11, 12, 13], [21, 22, 23], [31, 32, 33]], columns=["a", "b", "c"]
    )
    series = pd.Series([14, 24, 34], name="d")
    frame = add_table_name(frame, "A")
    series = add_table_name(series, "B")

    fitted = concat_op.fit(X=[frame, series])
    transformed = fitted.transform([frame, series])

    expected = pd.DataFrame(
        [
            [11, 12, 13, 14],
            [21, 22, 23, 24],
            [31, 32, 33, 34],
        ],
        columns=["a", "b", "c", "d"],
    )
    # Compare column by column as plain lists.
    for column in expected.columns:
        self.assertEqual(list(transformed[column]), list(expected[column]))
def test_two_estimators_predict_proba(self):
    """Smoke-test predict_proba with LogisticRegression inside and at the end."""
    parallel_steps = PCA() & Nystroem() & LogisticRegression()
    pipeline = (
        StandardScaler()
        >> parallel_steps
        >> ConcatFeatures()
        >> NoOp()
        >> LogisticRegression()
    )
    # The result is discarded; the test only requires that nothing raises.
    pipeline.fit(self.X_train, self.y_train).predict_proba(self.X_test)
def test_two_estimators_predict_proba1(self):
    """Smoke-test predict_proba with GaussianNB inside and at the end.

    ``predict_proba`` is called on the pipeline object itself after ``fit``;
    the value returned by ``fit`` is not used.
    """
    parallel_steps = PCA() & Nystroem() & GaussianNB()
    pipeline = (
        StandardScaler()
        >> parallel_steps
        >> ConcatFeatures()
        >> NoOp()
        >> GaussianNB()
    )
    pipeline.fit(self.X_train, self.y_train)
    pipeline.predict_proba(self.X_test)
def do1DTest(self, trainable, train_X, train_y, test_X, test_y):
    """Fit and predict using only column 0 of the feature arrays.

    The given operator is combined with ``NoOp`` and fed through
    ``ConcatFeatures``, ``float32_transform`` and ``LR``. The pipeline is
    exercised once directly and once through ``Hyperopt`` with
    ``max_evals=1``. ``test_y`` is accepted but not used.
    """
    # Test for 1-D array as input to the transformers
    column_train = train_X[:, 0]
    column_test = test_X[:, 0]

    trainable_pipeline = (
        (trainable & NoOp()) >> ConcatFeatures() >> float32_transform() >> LR()
    )

    # Direct fit / predict.
    trainable_pipeline.fit(column_train, train_y).predict(column_test)

    # Same pipeline via Hyperopt.
    optimizer = Hyperopt(estimator=trainable_pipeline, max_evals=1)
    optimizer.fit(column_train, train_y).predict(column_test)
def test_export_to_sklearn_pipeline2(self):
    """Export a nested-union lale pipeline and compare it with the original.

    Checks that the exported pipeline has a ``featureunion`` step of type
    ``FeatureUnion`` and a ``kneighborsclassifier`` step of the scikit-learn
    classifier type, then delegates to ``assert_equal_predictions``.
    """
    from lale.lib.lale import ConcatFeatures
    from lale.lib.sklearn import PCA
    from lale.lib.sklearn import KNeighborsClassifier
    from sklearn.feature_selection import SelectKBest
    from lale.lib.sklearn import Nystroem
    from sklearn.pipeline import FeatureUnion

    inner_union = (PCA() & SelectKBest(k=3)) >> ConcatFeatures()
    lale_pipeline = (
        (inner_union & Nystroem()) >> ConcatFeatures() >> KNeighborsClassifier()
    )
    trained_lale_pipeline = lale_pipeline.fit(self.X_train, self.y_train)
    sklearn_pipeline = trained_lale_pipeline.export_to_sklearn_pipeline()

    exported_steps = sklearn_pipeline.named_steps
    self.assertIsInstance(exported_steps['featureunion'], FeatureUnion)

    # Aliased so the scikit-learn class does not rebind the lale operator name.
    from sklearn.neighbors import KNeighborsClassifier as SklearnKNN

    self.assertIsInstance(exported_steps['kneighborsclassifier'], SklearnKNN)
    self.assert_equal_predictions(sklearn_pipeline, trained_lale_pipeline)
def test_transform_schema_Concat_irisArr(self):
    """Check ConcatFeatures.transform_schema on schemas of the iris arrays.

    Schema validation is switched on via
    ``set_disable_data_schema_validation(False)`` for the duration of the
    test. The value read from ``disable_data_schema_validation`` beforehand
    is written back in a ``finally`` clause so that a failing assertion
    cannot leave the setting changed for later tests.
    """
    from lale.datasets.data_schemas import to_schema

    existing_flag = disable_data_schema_validation
    set_disable_data_schema_validation(False)
    try:
        data_X, data_y = self._irisArr["X"], self._irisArr["y"]
        s_in_X, s_in_y = to_schema(data_X), to_schema(data_y)

        def check(s_actual, n_expected, s_expected):
            # The column count must be pinned exactly: minItems == maxItems.
            assert s_actual["items"]["minItems"] == n_expected, str(s_actual)
            assert s_actual["items"]["maxItems"] == n_expected, str(s_actual)
            assert s_actual["items"]["items"] == s_expected, str(s_actual)

        s_out_X = ConcatFeatures.transform_schema({"items": [s_in_X]})
        check(s_out_X, 4, {"type": "number"})
        s_out_y = ConcatFeatures.transform_schema({"items": [s_in_y]})
        check(s_out_y, 1, {"type": "integer"})
        s_out_XX = ConcatFeatures.transform_schema({"items": [s_in_X, s_in_X]})
        check(s_out_XX, 8, {"type": "number"})
        s_out_yy = ConcatFeatures.transform_schema({"items": [s_in_y, s_in_y]})
        check(s_out_yy, 2, {"type": "integer"})
        s_out_Xy = ConcatFeatures.transform_schema({"items": [s_in_X, s_in_y]})
        check(s_out_Xy, 5, {"type": "number"})
        s_out_XXX = ConcatFeatures.transform_schema(
            {"items": [s_in_X, s_in_X, s_in_X]}
        )
        check(s_out_XXX, 12, {"type": "number"})
    finally:
        # Restore the setting even when one of the checks above fails.
        set_disable_data_schema_validation(existing_flag)
def test_transform_schema_Concat_irisDf(self):
    """Check ConcatFeatures.transform_schema on schemas of the iris data frames.

    Runs inside ``EnableSchemaValidation()``. Each case lists the input
    schemas, the expected column count, and the expected element schema.
    """
    with EnableSchemaValidation():
        from lale.datasets.data_schemas import to_schema

        data_X, data_y = self._irisDf["X"], self._irisDf["y"]
        s_in_X, s_in_y = to_schema(data_X), to_schema(data_y)

        def check(s_actual, n_expected, s_expected):
            # The column count must be pinned exactly: minItems == maxItems.
            for bound in ("minItems", "maxItems"):
                assert s_actual["items"][bound] == n_expected, str(s_actual)
            assert s_actual["items"]["items"] == s_expected, str(s_actual)

        cases = [
            ([s_in_X], 4, {"type": "number"}),
            ([s_in_y], 1, {"description": "target", "type": "integer"}),
            ([s_in_X, s_in_X], 8, {"type": "number"}),
            ([s_in_y, s_in_y], 2, {"type": "integer"}),
            ([s_in_X, s_in_y], 5, {"type": "number"}),
            ([s_in_X, s_in_X, s_in_X], 12, {"type": "number"}),
        ]
        for inputs, n_expected, s_expected in cases:
            s_out = ConcatFeatures.transform_schema({"items": inputs})
            check(s_out, n_expected, s_expected)
def test_concat_with_hyperopt(self):
    """Smoke-test Hyperopt (max_evals=2) on a PCA/Nystroem concat pipeline.

    ``predict`` is called on the ``Hyperopt`` object itself after ``fit``;
    the value returned by ``fit`` is not used.
    """
    from lale.lib.lale import Hyperopt

    features = PCA(n_components=3) & Nystroem(n_components=10)
    concat = ConcatFeatures()
    classifier = LogisticRegression(random_state=42, C=0.1)
    trainable = features >> concat >> classifier
    clf = Hyperopt(estimator=trainable, max_evals=2)

    from sklearn.datasets import load_iris

    iris_data = load_iris()
    features_X, target_y = iris_data.data, iris_data.target
    clf.fit(features_X, target_y)
    clf.predict(features_X)
def test_with_pandas(self):
    """Smoke-test fit and predict of a concat pipeline on ``load_iris_df`` data.

    Warnings are suppressed only for the duration of the test. The earlier
    code installed a global "ignore" filter and never removed it, which hid
    warnings in every test that ran afterwards.
    """
    from lale.datasets import load_iris_df
    import warnings

    # catch_warnings restores the previous filter state on exit.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")

        pca = PCA(n_components=3)
        nys = Nystroem(n_components=10)
        concat = ConcatFeatures()
        lr = LogisticRegression(random_state=42, C=0.1)
        trainable = (pca & nys) >> concat >> lr

        (X_train, y_train), (X_test, _y_test) = load_iris_df()
        trained = trainable.fit(X_train, y_train)
        # The prediction is not inspected; the test only requires no exception.
        trained.predict(X_test)
def test_with_concat_features1(self):
    """Run Hyperopt over a choice between a concat pipeline and a classifier.

    The search uses ``max_evals=1`` on the iris data; the training-set
    accuracy of the result is printed, not asserted.
    """
    import warnings

    # catch_warnings restores the previous filter state on exit, even if the
    # body raises, instead of leaving "ignore" active or wiping every globally
    # configured filter with resetwarnings().
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore")

        from sklearn.datasets import load_iris
        from lale.lib.lale import Hyperopt
        from sklearn.metrics import accuracy_score

        data = load_iris()
        X, y = data.data, data.target

        pca = PCA(n_components=3)
        nys = Nystroem(n_components=10)
        concat = ConcatFeatures()
        lr = LogisticRegression(random_state=42, C=0.1)

        pipeline = ((pca & nys) >> concat >> lr) | KNeighborsClassifier()
        clf = Hyperopt(estimator=pipeline, max_evals=1)
        trained = clf.fit(X, y)
        predictions = trained.predict(X)
        print(accuracy_score(y, predictions))
def test_export_to_sklearn_pipeline3(self):
    """Export a three-branch union pipeline and compare it with the original.

    Checks the types of the exported ``featureunion``, ``selectkbest`` and
    ``logisticregression`` steps, then delegates to
    ``assert_equal_predictions``.
    """
    from lale.lib.lale import ConcatFeatures
    from lale.lib.sklearn import PCA
    from lale.lib.sklearn import KNeighborsClassifier, LogisticRegression, SVC
    from sklearn.feature_selection import SelectKBest
    from lale.lib.sklearn import Nystroem
    from sklearn.pipeline import FeatureUnion

    branches = (
        (PCA() >> SelectKBest(k=2))
        & (Nystroem(random_state=42) >> SelectKBest(k=3))
        & (SelectKBest(k=3))
    )
    lale_pipeline = (
        branches >> ConcatFeatures() >> SelectKBest(k=2) >> LogisticRegression()
    )
    trained_lale_pipeline = lale_pipeline.fit(self.X_train, self.y_train)
    sklearn_pipeline = trained_lale_pipeline.export_to_sklearn_pipeline()

    exported_steps = sklearn_pipeline.named_steps
    self.assertIsInstance(exported_steps['featureunion'], FeatureUnion)
    self.assertIsInstance(exported_steps['selectkbest'], SelectKBest)

    # Aliased so the scikit-learn class does not rebind the lale operator name.
    from sklearn.linear_model import LogisticRegression as SklearnLR

    self.assertIsInstance(exported_steps['logisticregression'], SklearnLR)
    self.assert_equal_predictions(sklearn_pipeline, trained_lale_pipeline)
def test_transform_schema_Concat_irisDf(self):
    """Check ConcatFeatures.transform_schema on schemas of the iris data frames.

    Each case lists the input schemas, the expected column count, and the
    expected element schema.
    """
    from lale.datasets.data_schemas import to_schema

    data_X, data_y = self._irisDf['X'], self._irisDf['y']
    s_in_X, s_in_y = to_schema(data_X), to_schema(data_y)

    def check(s_actual, n_expected, s_expected):
        # The column count must be pinned exactly: minItems == maxItems.
        for bound in ('minItems', 'maxItems'):
            assert s_actual['items'][bound] == n_expected, str(s_actual)
        assert s_actual['items']['items'] == s_expected, str(s_actual)

    cases = [
        ([s_in_X], 4, {'type': 'number'}),
        ([s_in_y], 1, {'description': 'target', 'type': 'integer'}),
        ([s_in_X, s_in_X], 8, {'type': 'number'}),
        ([s_in_y, s_in_y], 2, {'type': 'integer'}),
        ([s_in_X, s_in_y], 5, {'type': 'number'}),
        ([s_in_X, s_in_X, s_in_X], 12, {'type': 'number'}),
    ]
    for inputs, n_expected, s_expected in cases:
        s_out = ConcatFeatures.transform_schema({'items': inputs})
        check(s_out, n_expected, s_expected)
def test_hyperparam_defaults(self):
    """Construct ConcatFeatures with no arguments; passing means it did not raise."""
    _ = ConcatFeatures()