def test_empty_schema(self):
    """An operator whose hyperparameter schema has been emptied out must
    make Hyperopt search-space generation fail with OperatorSchemaError."""
    from lale.search.schema2search_space import OperatorSchemaError

    # PCA with its `whiten` hyperparameter schema replaced by an empty schema.
    degenerate_pca = PCA().customize_schema(whiten=schemas.Schema())
    planned = (
        (degenerate_pca & (MinMaxScaler | Normalizer))
        >> ConcatFeatures()
        >> (MinMaxScaler | Normalizer)
        >> (LogisticRegression | KNeighborsClassifier)
    )
    with self.assertRaises(OperatorSchemaError):
        run_hyperopt_on_planned_pipeline(planned)
def test_remove_last5(self):
    """remove_last(inplace=True) returns the mutated pipeline, so the
    result can be chained straight into freeze_trainable()."""
    pipe = (
        StandardScaler()
        >> (PCA() & Nystroem() & PassiveAggressiveClassifier())
        >> ConcatFeatures()
        >> NoOp()
        >> PassiveAggressiveClassifier()
    )
    pipe.remove_last(inplace=True).freeze_trainable()
def test_no_max_schema(self):
    """A continuous hyperparameter with a minimum but no maximum cannot be
    converted into a search space; expect SearchSpaceError."""
    from lale.search.search_space import SearchSpaceError

    # PCA whose n_components range is left unbounded above.
    unbounded_pca = PCA().customize_schema(n_components=schemas.Float(min=0.0))
    planned = (
        (unbounded_pca & (MinMaxScaler | Normalizer))
        >> ConcatFeatures()
        >> (MinMaxScaler | Normalizer)
        >> (LogisticRegression | KNeighborsClassifier)
    )
    with self.assertRaises(SearchSpaceError):
        run_hyperopt_on_planned_pipeline(planned)
def test_two_estimators_predict_proba(self):
    """A pipeline containing two LogisticRegression estimators must fit
    and support predict_proba on the trained result."""
    pipe = (
        StandardScaler()
        >> (PCA() & Nystroem() & LogisticRegression())
        >> ConcatFeatures()
        >> NoOp()
        >> LogisticRegression()
    )
    fitted = pipe.fit(self.X_train, self.y_train)
    fitted.predict_proba(self.X_test)
def test_remove_last2(self):
    """remove_last must raise ValueError when the pipeline ends in two
    parallel estimators (no single last step to remove)."""
    pipe = (
        StandardScaler()
        >> (PCA() & Nystroem() & PassiveAggressiveClassifier())
        >> ConcatFeatures()
        >> NoOp()
        >> (PassiveAggressiveClassifier() & LogisticRegression())
    )
    with self.assertRaises(ValueError):
        pipe.remove_last()
def test_two_estimators_predict_proba1(self):
    """A pipeline containing two GaussianNB estimators must fit and
    support predict_proba.

    Fix: call predict_proba on the trained pipeline returned by fit()
    rather than on the trainable, matching the pattern of the sibling
    tests (e.g. test_two_estimators_predict_proba) and avoiding the
    call-on-trainable fallback path.
    """
    pipeline = (
        StandardScaler()
        >> (PCA() & Nystroem() & GaussianNB())
        >> ConcatFeatures()
        >> NoOp()
        >> GaussianNB()
    )
    trained = pipeline.fit(self.X_train, self.y_train)
    trained.predict_proba(self.X_test)
def do1DTest(self, trainable, train_X, train_y, test_X, test_y):
    """Exercise `trainable` on 1-D input, both directly and under Hyperopt.

    The feature matrices are reduced to their first column so the
    transformers receive 1-D arrays.
    """
    train_X, test_X = train_X[:, 0], test_X[:, 0]
    pipe = (
        (trainable & NoOp()) >> ConcatFeatures() >> float32_transform() >> LR()
    )
    fitted = pipe.fit(train_X, train_y)
    fitted.predict(test_X)
    # Repeat the same fit/predict cycle through a one-trial Hyperopt run.
    optimizer = Hyperopt(estimator=pipe, max_evals=1)
    fitted_optimizer = optimizer.fit(train_X, train_y)
    fitted_optimizer.predict(test_X)
def test_two_estimators_predict1(self):
    """A pipeline containing two PassiveAggressiveClassifier estimators
    must fit and predict on the trained result."""
    pipe = (
        StandardScaler()
        >> (PCA() & Nystroem() & PassiveAggressiveClassifier())
        >> ConcatFeatures()
        >> NoOp()
        >> PassiveAggressiveClassifier()
    )
    fitted = pipe.fit(self.X_train, self.y_train)
    fitted.predict(self.X_test)
def test_export_to_sklearn_pipeline2(self):
    """A nested ConcatFeatures graph must export to a sklearn pipeline
    whose first step is a FeatureUnion."""
    from lale.lib.lale import ConcatFeatures
    from lale.lib.sklearn import PCA
    from lale.lib.sklearn import KNeighborsClassifier
    from sklearn.feature_selection import SelectKBest
    from lale.lib.sklearn import Nystroem
    from sklearn.pipeline import FeatureUnion

    inner_union = ((PCA() & SelectKBest(k=3)) >> ConcatFeatures()) & Nystroem()
    lale_pipeline = inner_union >> ConcatFeatures() >> KNeighborsClassifier()
    trained = lale_pipeline.fit(self.X_train, self.y_train)
    exported = trained.export_to_sklearn_pipeline()
    self.assertIsInstance(exported.named_steps['featureunion'], FeatureUnion)
    # From here on, KNeighborsClassifier deliberately refers to sklearn's
    # class (shadowing the lale operator) for the isinstance check.
    from sklearn.neighbors import KNeighborsClassifier
    self.assertIsInstance(
        exported.named_steps['kneighborsclassifier'], KNeighborsClassifier)
    self.assert_equal_predictions(exported, trained)
def test_remove_last4(self):
    """remove_last(inplace=True) mutates the pipeline and returns it, so
    both references see the shortened (6-step) pipeline."""
    pipe = (
        StandardScaler()
        >> (PCA() & Nystroem() & PassiveAggressiveClassifier())
        >> ConcatFeatures()
        >> NoOp()
        >> PassiveAggressiveClassifier()
    )
    shortened = pipe.remove_last(inplace=True)
    self.assertEqual(len(shortened._steps), 6)
    self.assertEqual(len(pipe._steps), 6)
def test_concat_with_hyperopt(self):
    """Hyperopt must optimize a (PCA & Nystroem) >> ConcatFeatures >> LR
    pipeline on the Iris dataset."""
    from lale.lib.lale import Hyperopt

    trainable = (
        (PCA(n_components=3) & Nystroem(n_components=10))
        >> ConcatFeatures()
        >> LogisticRegression(random_state=42, C=0.1)
    )
    optimizer = Hyperopt(estimator=trainable, max_evals=2)
    from sklearn.datasets import load_iris
    iris = load_iris()
    optimizer.fit(iris.data, iris.target)
    optimizer.predict(iris.data)
def test_init_fit_predict(self):
    """ConcatFeatures must horizontally stack the columns of its inputs."""
    cf = ConcatFeatures()
    left = [[11, 12, 13], [21, 22, 23], [31, 32, 33]]
    right = [[14, 15], [24, 25], [34, 35]]
    trained = cf.fit(X=[left, right])
    result = trained.transform([left, right])
    expected = [
        [11, 12, 13, 14, 15],
        [21, 22, 23, 24, 25],
        [31, 32, 33, 34, 35],
    ]
    # Compare element-wise so a failure pinpoints the offending cell.
    for got_row, want_row in zip(result, expected):
        for got, want in zip(got_row, want_row):
            self.assertEqual(got, want)
def test_with_pandas(self):
    """The ConcatFeatures pipeline must train and predict end to end on
    pandas DataFrame inputs."""
    import warnings
    warnings.filterwarnings("ignore")
    from lale.datasets import load_iris_df

    trainable = (
        (PCA(n_components=3) & Nystroem(n_components=10))
        >> ConcatFeatures()
        >> LogisticRegression(random_state=42, C=0.1)
    )
    (X_train, y_train), (X_test, y_test) = load_iris_df()
    trained = trainable.fit(X_train, y_train)
    trained.predict(X_test)
def test_comparison_with_scikit(self):
    """Cross-validation scores of the lale pipeline must equal those of
    the equivalent hand-built scikit-learn pipeline."""
    import warnings
    warnings.filterwarnings("ignore")
    import sklearn.datasets
    import sklearn.utils
    from lale.helpers import cross_val_score
    from lale.lib.sklearn import PCA
    # Deterministic components so both frameworks see identical models.
    pca = PCA(n_components=3, random_state=42, svd_solver="arpack")
    nys = Nystroem(n_components=10, random_state=42)
    concat = ConcatFeatures()
    lr = LogisticRegression(random_state=42, C=0.1)
    trainable = (pca & nys) >> concat >> lr
    digits = sklearn.datasets.load_digits()
    X, y = sklearn.utils.shuffle(digits.data, digits.target, random_state=42)
    cv_results = cross_val_score(trainable, X, y)
    # Round to one decimal percent so tiny float differences don't fail.
    cv_results = ["{0:.1%}".format(score) for score in cv_results]
    # Rebuild the same pipeline directly in scikit-learn.
    from sklearn.decomposition import PCA as SklearnPCA
    from sklearn.kernel_approximation import Nystroem as SklearnNystroem
    from sklearn.linear_model import LogisticRegression as SklearnLR
    # NOTE: this deliberately shadows lale.helpers.cross_val_score above.
    from sklearn.model_selection import cross_val_score
    from sklearn.pipeline import FeatureUnion, make_pipeline
    union = FeatureUnion([
        (
            "pca",
            SklearnPCA(n_components=3, random_state=42, svd_solver="arpack"),
        ),
        ("nys", SklearnNystroem(n_components=10, random_state=42)),
    ])
    lr = SklearnLR(random_state=42, C=0.1)  # rebinds `lr` to sklearn's LR
    pipeline = make_pipeline(union, lr)
    # NOTE(review): cv=5 here; presumably lale's cross_val_score also
    # defaults to 5 folds -- confirm, otherwise the comparison is skewed.
    scikit_cv_results = cross_val_score(pipeline, X, y, cv=5)
    scikit_cv_results = [
        "{0:.1%}".format(score) for score in scikit_cv_results
    ]
    self.assertEqual(cv_results, scikit_cv_results)
    warnings.resetwarnings()
def test_init_fit_predict_pandas_series(self):
    """ConcatFeatures must concatenate a named DataFrame with a named
    Series into a single DataFrame."""
    cf = ConcatFeatures()
    frame = pd.DataFrame(
        [[11, 12, 13], [21, 22, 23], [31, 32, 33]], columns=["a", "b", "c"])
    series = pd.Series([14, 24, 34], name="d")
    frame = add_table_name(frame, "A")
    series = add_table_name(series, "B")
    trained = cf.fit(X=[frame, series])
    result = trained.transform([frame, series])
    expected = pd.DataFrame(
        [
            [11, 12, 13, 14],
            [21, 22, 23, 24],
            [31, 32, 33, 34],
        ],
        columns=["a", "b", "c", "d"],
    )
    for col in expected.columns:
        self.assertEqual(list(result[col]), list(expected[col]))
def test_export_to_sklearn_pipeline3(self):
    """A three-branch concat pipeline must export to sklearn steps:
    FeatureUnion, SelectKBest, and LogisticRegression."""
    from sklearn.feature_selection import SelectKBest
    from sklearn.pipeline import FeatureUnion

    branches = (
        (PCA() >> SelectKBest(k=2))
        & (Nystroem(random_state=42) >> SelectKBest(k=3))
        & SelectKBest(k=3)
    )
    lale_pipeline = (
        branches >> ConcatFeatures() >> SelectKBest(k=2) >> LogisticRegression()
    )
    trained = lale_pipeline.fit(self.X_train, self.y_train)
    exported = trained.export_to_sklearn_pipeline()
    self.assertIsInstance(exported.named_steps["featureunion"], FeatureUnion)
    self.assertIsInstance(exported.named_steps["selectkbest"], SelectKBest)
    from sklearn.linear_model import LogisticRegression as SklearnLR
    self.assertIsInstance(
        exported.named_steps["logisticregression"], SklearnLR)
    self.assert_equal_predictions(exported, trained)
def test_with_concat_features1(self):
    """Hyperopt must search over a choice between a concat pipeline and a
    plain KNeighborsClassifier on the Iris dataset."""
    import warnings
    warnings.filterwarnings("ignore")
    from sklearn.datasets import load_iris
    from lale.lib.lale import Hyperopt
    from sklearn.metrics import accuracy_score

    iris = load_iris()
    X, y = iris.data, iris.target
    concat_pipe = (
        (PCA(n_components=3) & Nystroem(n_components=10))
        >> ConcatFeatures()
        >> LogisticRegression(random_state=42, C=0.1)
    )
    choice = concat_pipe | KNeighborsClassifier()
    trained = Hyperopt(estimator=choice, max_evals=1).fit(X, y)
    print(accuracy_score(y, trained.predict(X)))
    warnings.resetwarnings()
def test_export_to_sklearn_pipeline3(self):
    """A three-branch concat pipeline must export to sklearn steps:
    FeatureUnion, SelectKBest, and LogisticRegression.

    Cleanup: dropped the unused KNeighborsClassifier and SVC imports from
    lale.lib.sklearn; only LogisticRegression is used in the pipeline.
    """
    from lale.lib.lale import ConcatFeatures
    from lale.lib.sklearn import PCA
    from lale.lib.sklearn import LogisticRegression
    from sklearn.feature_selection import SelectKBest
    from lale.lib.sklearn import Nystroem
    from sklearn.pipeline import FeatureUnion

    lale_pipeline = (
        (PCA() >> SelectKBest(k=2))
        & (Nystroem(random_state=42) >> SelectKBest(k=3))
        & (SelectKBest(k=3))
    ) >> ConcatFeatures() >> SelectKBest(k=2) >> LogisticRegression()
    trained_lale_pipeline = lale_pipeline.fit(self.X_train, self.y_train)
    sklearn_pipeline = trained_lale_pipeline.export_to_sklearn_pipeline()
    self.assertIsInstance(sklearn_pipeline.named_steps['featureunion'],
                          FeatureUnion)
    self.assertIsInstance(sklearn_pipeline.named_steps['selectkbest'],
                          SelectKBest)
    # Shadow the lale operator with sklearn's class for the isinstance check.
    from sklearn.linear_model import LogisticRegression
    self.assertIsInstance(
        sklearn_pipeline.named_steps['logisticregression'],
        LogisticRegression)
    self.assert_equal_predictions(sklearn_pipeline, trained_lale_pipeline)
def test_hyperparam_defaults(self):
    """ConcatFeatures must be constructible with all-default hyperparameters."""
    ConcatFeatures()
# (tail of _combined_schemas -- the dict's opening lies outside this chunk)
'input_fit': _input_fit_schema,
    'input_predict': _input_predict_schema,
    'output': _output_predict_schema
    }
}

# Register the implementation class plus its schemas as a lale operator.
HyperoptRegressor = lale.operators.make_operator(HyperoptRegressorImpl, _combined_schemas)

if __name__ == '__main__':
    # Smoke test: tune a (PCA & Nystroem) >> ConcatFeatures >> RandomForest
    # regression pipeline on the diabetes dataset with 20 Hyperopt trials.
    from lale.lib.lale import ConcatFeatures
    from lale.lib.sklearn import Nystroem, PCA, RandomForestRegressor
    from sklearn.metrics import r2_score  # imported but not used below
    pca = PCA(n_components=3)
    nys = Nystroem(n_components=3)
    concat = ConcatFeatures()
    rf = RandomForestRegressor()
    trainable = (pca & nys) >> concat >> rf
    #trainable = nys >>rf
    import sklearn.datasets
    from lale.helpers import cross_val_score  # imported but not used below
    diabetes = sklearn.datasets.load_diabetes()
    # NOTE(review): sklearn.utils is never imported explicitly here; this
    # relies on `import sklearn.datasets` making the submodule reachable as
    # an attribute of the sklearn package -- confirm.
    X, y = sklearn.utils.shuffle(diabetes.data, diabetes.target, random_state=42)
    hp_n = HyperoptRegressor(estimator=trainable, max_evals=20)
    hp_n_trained = hp_n.fit(X, y)
    predictions = hp_n_trained.predict(X)
def test_planned_pipeline_3(self):
    """A planned pipeline with two ConcatFeatures stages and nested
    operator choices must be searchable by Hyperopt."""
    scaled_pair = (MinMaxScaler() & NoOp()) >> ConcatFeatures()
    rescale_choice = StandardScaler & (NoOp() | MinMaxScaler())
    classifier_choice = LogisticRegression | KNeighborsClassifier
    plan = (
        scaled_pair >> rescale_choice >> ConcatFeatures() >> classifier_choice
    )
    run_hyperopt_on_planned_pipeline(plan)
def test_planned_pipeline_1(self):
    """A planned pipeline with parallel feature branches, a scaler choice,
    and a classifier choice must be searchable by Hyperopt."""
    feature_branches = PCA & (MinMaxScaler | Normalizer)
    plan = (
        feature_branches
        >> ConcatFeatures()
        >> (MinMaxScaler | Normalizer)
        >> (LogisticRegression | KNeighborsClassifier)
    )
    run_hyperopt_on_planned_pipeline(plan)
def test_with_hyperopt2(self):
    """Tune a multi-table relational feature-engineering graph, wrapped in
    a Relational operator and followed by a classifier choice, with
    Hyperopt (2 evaluations) on the Iris dataset."""
    from lale.expressions import (
        count,
        it,
        max,
        mean,
        min,
        string_indexer,
        sum,
        variance,
    )

    wrap_imported_operators()
    # Source table scans.
    scan = Scan(table=it["main"])
    scan_0 = Scan(table=it["customers"])
    # main joined with customers on group_customer_id, then projected to
    # identity-mapped customer columns.
    join = Join(pred=[(it["main"]["group_customer_id"] == it["customers"]
                       ["group_customer_id"])])
    # NOTE: `map`, `sum`, `min`, `max` shadow builtins in this scope; they
    # are the lale expression helpers imported above.
    map = Map(
        columns={
            "[main](group_customer_id)[customers]|number_children|identity":
            it["number_children"],
            "[main](group_customer_id)[customers]|name|identity": it["name"],
            "[main](group_customer_id)[customers]|income|identity":
            it["income"],
            "[main](group_customer_id)[customers]|address|identity":
            it["address"],
            "[main](group_customer_id)[customers]|age|identity": it["age"],
        },
        remainder="drop",
    )
    pipeline_4 = join >> map
    scan_1 = Scan(table=it["purchase"])
    # main joined with purchase on group_id, limited to 50 joined rows.
    join_0 = Join(
        pred=[(it["main"]["group_id"] == it["purchase"]["group_id"])],
        join_limit=50.0,
    )
    # Per-row_id aggregate statistics over purchase price and time.
    aggregate = Aggregate(
        columns={
            "[main](group_id)[purchase]|price|variance": variance(it["price"]),
            "[main](group_id)[purchase]|time|sum": sum(it["time"]),
            "[main](group_id)[purchase]|time|mean": mean(it["time"]),
            "[main](group_id)[purchase]|time|min": min(it["time"]),
            "[main](group_id)[purchase]|price|sum": sum(it["price"]),
            "[main](group_id)[purchase]|price|count": count(it["price"]),
            "[main](group_id)[purchase]|price|mean": mean(it["price"]),
            "[main](group_id)[purchase]|price|min": min(it["price"]),
            "[main](group_id)[purchase]|price|max": max(it["price"]),
            "[main](group_id)[purchase]|time|max": max(it["time"]),
            "[main](group_id)[purchase]|time|variance": variance(it["time"]),
        },
        group_by=it["row_id"],
    )
    pipeline_5 = join_0 >> aggregate
    # Pass-through projection of the main table's own columns.
    map_0 = Map(
        columns={
            "[main]|group_customer_id|identity": it["group_customer_id"],
            "[main]|transaction_id|identity": it["transaction_id"],
            "[main]|group_id|identity": it["group_id"],
            "[main]|comments|identity": it["comments"],
            "[main]|id|identity": it["id"],
            "prefix_0_id": it["prefix_0_id"],
            "next_purchase": it["next_purchase"],
            "[main]|time|identity": it["time"],
        },
        remainder="drop",
    )
    scan_2 = Scan(table=it["transactions"])
    scan_3 = Scan(table=it["products"])
    # Two-hop join: main -> transactions -> products.
    join_1 = Join(pred=[
        (it["main"]["transaction_id"] == it["transactions"]
         ["transaction_id"]),
        (it["transactions"]["product_id"] == it["products"]["product_id"]),
    ])
    map_1 = Map(
        columns={
            "[main](transaction_id)[transactions](product_id)[products]|price|identity":
            it["price"],
            "[main](transaction_id)[transactions](product_id)[products]|type|identity":
            it["type"],
        },
        remainder="drop",
    )
    pipeline_6 = join_1 >> map_1
    # Single-hop join: main -> transactions.
    join_2 = Join(pred=[(it["main"]["transaction_id"] == it["transactions"]
                         ["transaction_id"])])
    map_2 = Map(
        columns={
            "[main](transaction_id)[transactions]|description|identity":
            it["description"],
            "[main](transaction_id)[transactions]|product_id|identity":
            it["product_id"],
        },
        remainder="drop",
    )
    pipeline_7 = join_2 >> map_2
    # String-index the textual columns of the concatenated feature table.
    map_3 = Map(columns=[
        string_indexer(it["[main]|comments|identity"]),
        string_indexer(
            it["[main](transaction_id)[transactions]|description|identity"]
        ),
        string_indexer(it[
            "[main](transaction_id)[transactions](product_id)[products]|type|identity"]
        ),
        string_indexer(
            it["[main](group_customer_id)[customers]|name|identity"]),
        string_indexer(
            it["[main](group_customer_id)[customers]|address|identity"]),
    ])
    pipeline_8 = ConcatFeatures() >> map_3
    # Assemble the dataflow DAG and wrap it as one Relational operator.
    relational = Relational(operator=make_pipeline_graph(
        steps=[
            scan,
            scan_0,
            pipeline_4,
            scan_1,
            pipeline_5,
            map_0,
            scan_2,
            scan_3,
            pipeline_6,
            pipeline_7,
            pipeline_8,
        ],
        edges=[
            (scan, pipeline_4),
            (scan, pipeline_5),
            (scan, map_0),
            (scan, pipeline_6),
            (scan, pipeline_7),
            (scan_0, pipeline_4),
            (pipeline_4, pipeline_8),
            (scan_1, pipeline_5),
            (pipeline_5, pipeline_8),
            (map_0, pipeline_8),
            (scan_2, pipeline_6),
            (scan_2, pipeline_7),
            (scan_3, pipeline_6),
            (pipeline_6, pipeline_8),
            (pipeline_7, pipeline_8),
        ],
    ))
    pipeline = relational >> (KNeighborsClassifier | LogisticRegression)
    from sklearn.datasets import load_iris
    X, y = load_iris(return_X_y=True)
    from lale.lib.lale import Hyperopt
    opt = Hyperopt(estimator=pipeline, max_evals=2)
    opt.fit(X, y)