Example #1
0
    def test_init_fit_predict_spark_pandas(self):
        """Concatenate one Spark table and one pandas table.

        Table ``A`` is converted with ``pandas2spark(..., add_index=True)``
        while table ``B`` stays a pandas dataframe. Every expected column of
        the transformed result is compared value by value.

        The test is skipped when ``spark_installed`` is false.
        """
        from lale.datasets import pandas2spark
        from lale.datasets.util import spark_installed

        if not spark_installed:
            # Report a skip instead of silently passing without checking
            # anything when Spark is unavailable.
            self.skipTest("Spark is not installed")

        trainable_cf = ConcatFeatures()
        A = [[11, 12, 13], [21, 22, 23], [31, 32, 33]]
        B = [[14, 15], [24, 25], [34, 35]]
        A = pd.DataFrame(A, columns=["a", "b", "c"])
        B = pd.DataFrame(B, columns=["d", "e"])
        A = pandas2spark(A, add_index=True)
        A = add_table_name(A, "A")
        B = add_table_name(B, "B")

        trained_cf = trainable_cf.fit(X=[A, B])
        transformed = trained_cf.transform([A, B])
        expected = [
            [11, 12, 13, 14, 15],
            [21, 22, 23, 24, 25],
            [31, 32, 33, 34, 35],
        ]
        expected = pd.DataFrame(expected,
                                columns=["a", "b", "c", "d", "e"])
        for c in expected.columns:
            self.assertEqual(list(transformed[c]), list(expected[c]))
Example #2
0
 def test_planned_pipeline_3(self):
     """Pass a planned pipeline with two concatenation stages to
     ``run_hyperopt_on_planned_pipeline``."""
     # First stage: two parallel branches merged by ConcatFeatures.
     plan = (MinMaxScaler() & NoOp()) >> ConcatFeatures()
     # Second stage: a planned scaler next to a choice, merged again.
     plan = plan >> (StandardScaler & (NoOp() | MinMaxScaler()))
     plan = plan >> ConcatFeatures()
     # Final step: a choice between two planned classifiers.
     plan = plan >> (LogisticRegression | KNeighborsClassifier)
     run_hyperopt_on_planned_pipeline(plan)
Example #3
0
    def test_export_to_sklearn_pipeline2(self):
        """Export a pipeline with nested concatenations to scikit-learn.

        Checks the types of the exported ``featureunion`` and
        ``kneighborsclassifier`` steps, then compares predictions of the
        exported and the lale pipeline via ``assert_equal_predictions``.
        """
        from sklearn.feature_selection import SelectKBest
        from sklearn.neighbors import KNeighborsClassifier as SklearnKNN
        from sklearn.pipeline import FeatureUnion

        # Inner union: PCA next to SelectKBest.
        inner_union = (
            PCA(svd_solver="randomized", random_state=42) & SelectKBest(k=3)
        ) >> ConcatFeatures()
        # Outer union: the inner union next to Nystroem.
        outer_union = (inner_union & Nystroem(random_state=42)) >> ConcatFeatures()
        lale_pipeline = outer_union >> KNeighborsClassifier()

        fitted = lale_pipeline.fit(self.X_train, self.y_train)
        exported = fitted.export_to_sklearn_pipeline()

        self.assertIsInstance(exported.named_steps["featureunion"], FeatureUnion)
        self.assertIsInstance(
            exported.named_steps["kneighborsclassifier"], SklearnKNN
        )
        self.assert_equal_predictions(exported, fitted)
Example #4
0
 def test_multiple_estimators_predict_predict_proba(self):
     """Fit a pipeline with estimators in intermediate positions, then call
     ``predict_proba`` and ``predict`` on the fitted result."""
     pipeline = (StandardScaler() >>
                 (LogisticRegression() & PCA()) >> ConcatFeatures() >>
                 (NoOp() & LinearSVC()) >> ConcatFeatures() >>
                 KNeighborsClassifier())
     # Predict with the pipeline returned by fit rather than with the
     # trainable one, matching the other tests in this file.
     trained = pipeline.fit(self.X_train, self.y_train)
     _ = trained.predict_proba(self.X_test)
     _ = trained.predict(self.X_test)
Example #5
0
    def test_init_fit_predict(self):
        """Concatenate two nested-list tables and compare every cell.

        The shape of the result is asserted explicitly, so a result with
        missing rows or columns fails instead of passing vacuously.
        """
        trainable_cf = ConcatFeatures()
        A = [[11, 12, 13], [21, 22, 23], [31, 32, 33]]
        B = [[14, 15], [24, 25], [34, 35]]

        trained_cf = trainable_cf.fit(X=[A, B])
        transformed = trained_cf.transform([A, B])
        expected = [[11, 12, 13, 14, 15], [21, 22, 23, 24, 25],
                    [31, 32, 33, 34, 35]]

        # Drive the loops from `expected`: iterating over `transformed`
        # alone would silently accept a truncated or empty result.
        self.assertEqual(len(transformed), len(expected))
        for i_sample, expected_row in enumerate(expected):
            self.assertEqual(len(transformed[i_sample]), len(expected_row))
            for i_feature, expected_value in enumerate(expected_row):
                self.assertEqual(transformed[i_sample][i_feature],
                                 expected_value)
Example #6
0
    def test_comparison_with_scikit(self):
        """Compare cross-validation scores of a lale pipeline and its
        scikit-learn counterpart.

        Both pipelines combine PCA and Nystroem features and feed them to a
        logistic regression with the same hyperparameters. Scores are
        formatted to one decimal of a percent and must match exactly.
        """
        import warnings

        # catch_warnings restores the previous filters on exit, even when an
        # assertion fails, and does not wipe filters installed elsewhere.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            import sklearn.datasets
            import sklearn.utils
            from sklearn.decomposition import PCA as SklearnPCA
            from sklearn.kernel_approximation import Nystroem as SklearnNystroem
            from sklearn.linear_model import LogisticRegression as SklearnLR
            from sklearn.model_selection import (
                cross_val_score as sklearn_cross_val_score,
            )
            from sklearn.pipeline import FeatureUnion, make_pipeline

            from lale.helpers import cross_val_score as lale_cross_val_score
            from lale.lib.sklearn import PCA

            digits = sklearn.datasets.load_digits()
            X, y = sklearn.utils.shuffle(
                digits.data, digits.target, random_state=42
            )

            # lale side
            pca = PCA(n_components=3, random_state=42, svd_solver='arpack')
            nys = Nystroem(n_components=10, random_state=42)
            concat = ConcatFeatures()
            lale_lr = LogisticRegression(random_state=42, C=0.1)
            trainable = (pca & nys) >> concat >> lale_lr
            cv_results = lale_cross_val_score(trainable, X, y)
            cv_results = ['{0:.1%}'.format(score) for score in cv_results]

            # scikit-learn side
            union = FeatureUnion([
                ("pca", SklearnPCA(n_components=3, random_state=42,
                                   svd_solver='arpack')),
                ("nys", SklearnNystroem(n_components=10, random_state=42)),
            ])
            sklearn_lr = SklearnLR(random_state=42, C=0.1)
            pipeline = make_pipeline(union, sklearn_lr)
            scikit_cv_results = sklearn_cross_val_score(pipeline, X, y, cv=5)
            scikit_cv_results = [
                '{0:.1%}'.format(score) for score in scikit_cv_results
            ]

            self.assertEqual(cv_results, scikit_cv_results)
Example #7
0
    def test_export_to_sklearn_pipeline3(self):
        """Export a three-branch union pipeline to scikit-learn.

        Checks the types of the exported ``featureunion``, ``selectkbest``
        and ``logisticregression`` steps, then compares predictions via
        ``assert_equal_predictions``.
        """
        from sklearn.feature_selection import SelectKBest
        from sklearn.linear_model import LogisticRegression as SklearnLR
        from sklearn.pipeline import FeatureUnion

        # Three parallel branches, combined left to right.
        branches = PCA() >> SelectKBest(k=2)
        branches = branches & (Nystroem(random_state=42) >> SelectKBest(k=3))
        branches = branches & SelectKBest(k=3)

        lale_pipeline = branches >> ConcatFeatures()
        lale_pipeline = lale_pipeline >> SelectKBest(k=2)
        lale_pipeline = lale_pipeline >> LogisticRegression()

        fitted = lale_pipeline.fit(self.X_train, self.y_train)
        exported = fitted.export_to_sklearn_pipeline()

        self.assertIsInstance(exported.named_steps["featureunion"], FeatureUnion)
        self.assertIsInstance(exported.named_steps["selectkbest"], SelectKBest)
        self.assertIsInstance(
            exported.named_steps["logisticregression"], SklearnLR
        )
        self.assert_equal_predictions(exported, fitted)
Example #8
0
 def test_two_estimators_predict_proba1(self):
     """Expect ``ValueError`` from ``predict_proba`` on a pipeline that ends
     in ``PassiveAggressiveClassifier``."""
     pipeline = StandardScaler()
     pipeline = pipeline >> (PCA() & Nystroem() & PassiveAggressiveClassifier())
     pipeline = pipeline >> ConcatFeatures()
     pipeline = pipeline >> NoOp()
     pipeline = pipeline >> PassiveAggressiveClassifier()

     pipeline.fit(self.X_train, self.y_train)
     # Only the predict_proba call is expected to raise.
     with self.assertRaises(ValueError):
         pipeline.predict_proba(self.X_test)
Example #9
0
 def test_planned_pipeline_1(self):
     """Pass a planned pipeline made of operator choices to
     ``run_hyperopt_on_planned_pipeline``."""
     # PCA next to a scaler choice, merged by ConcatFeatures.
     plan = (PCA & (MinMaxScaler | Normalizer)) >> ConcatFeatures()
     # Followed by another scaler choice and a classifier choice.
     plan = plan >> (MinMaxScaler | Normalizer)
     plan = plan >> (LogisticRegression | KNeighborsClassifier)
     run_hyperopt_on_planned_pipeline(plan)
Example #10
0
 def test_remove_last4(self):
     """After ``remove_last(inplace=True)`` both the returned pipeline and
     the original one must have six steps."""
     pipeline = StandardScaler()
     pipeline = pipeline >> (PCA() & Nystroem() & PassiveAggressiveClassifier())
     pipeline = pipeline >> ConcatFeatures()
     pipeline = pipeline >> NoOp()
     pipeline = pipeline >> PassiveAggressiveClassifier()

     shortened = pipeline.remove_last(inplace=True)
     for candidate in (shortened, pipeline):
         self.assertEqual(len(candidate._steps), 6)
Example #11
0
 def test_two_estimators_predict1(self):
     """Fit a pipeline containing two ``PassiveAggressiveClassifier`` steps
     and call ``predict`` on the fitted result."""
     pipeline = StandardScaler()
     pipeline = pipeline >> (PCA() & Nystroem() & PassiveAggressiveClassifier())
     pipeline = pipeline >> ConcatFeatures()
     pipeline = pipeline >> NoOp()
     pipeline = pipeline >> PassiveAggressiveClassifier()

     fitted = pipeline.fit(self.X_train, self.y_train)
     fitted.predict(self.X_test)
Example #12
0
 def test_remove_last2(self):
     """Expect ``ValueError`` from ``remove_last`` on a pipeline whose last
     stage is two parallel estimators."""
     pipeline = StandardScaler()
     pipeline = pipeline >> (PCA() & Nystroem() & PassiveAggressiveClassifier())
     pipeline = pipeline >> ConcatFeatures()
     pipeline = pipeline >> NoOp()
     # The pipeline ends in two parallel estimators rather than one step.
     pipeline = pipeline >> (PassiveAggressiveClassifier() & LogisticRegression())

     with self.assertRaises(ValueError):
         pipeline.remove_last()
Example #13
0
    def test_with_concat_features2(self):
        """Run ``Hyperopt`` on a choice between a union pipeline and
        ``KNeighborsClassifier`` on the iris data.

        The accuracy on the training data is printed, not asserted.
        """
        import warnings

        # catch_warnings restores the previous filters on exit, even when
        # the body raises, and does not wipe filters installed elsewhere.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            from sklearn.datasets import load_iris
            from sklearn.metrics import accuracy_score

            from lale.lib.lale import Hyperopt
            from lale.operators import make_pipeline

            data = load_iris()
            X, y = data.data, data.target
            pca = PCA(n_components=3)
            nys = Nystroem(n_components=10)
            concat = ConcatFeatures()
            lr = LogisticRegression(random_state=42, C=0.1)

            pipeline = make_pipeline(
                ((((SimpleImputer() | NoOp()) >> pca) & nys) >> concat >> lr)
                | KNeighborsClassifier()
            )
            clf = Hyperopt(estimator=pipeline, max_evals=1, handle_cv_failure=True)
            trained = clf.fit(X, y)
            predictions = trained.predict(X)
            print(accuracy_score(y, predictions))
Example #14
0
    def test_pipeline_1(self):
        """Serialize a planned pipeline to JSON and round-trip it.

        The output of ``to_json`` must equal the expected dictionary, and
        serializing the operator rebuilt by ``from_json`` must give the same
        JSON again.
        """
        self.maxDiff = None
        from lale.json_operator import from_json, to_json
        from lale.lib.lale import ConcatFeatures, NoOp
        from lale.lib.sklearn import PCA
        from lale.lib.sklearn import LogisticRegression as LR

        operator = (PCA & NoOp) >> ConcatFeatures >> LR

        expected_edges = [
            ["pca", "concat_features"],
            ["no_op", "concat_features"],
            ["concat_features", "lr"],
        ]
        pca_step = {
            "class": PCA.class_name(),
            "state": "planned",
            "operator": "PCA",
            "label": "PCA",
            "documentation_url": "https://lale.readthedocs.io/en/latest/modules/lale.lib.sklearn.pca.html",
        }
        # NoOp and ConcatFeatures are expected in "trained" state with the
        # frozen flags set.
        no_op_step = {
            "class": NoOp.class_name(),
            "state": "trained",
            "operator": "NoOp",
            "label": "NoOp",
            "documentation_url": "https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.no_op.html",
            "hyperparams": None,
            "coefs": None,
            "is_frozen_trainable": True,
            "is_frozen_trained": True,
        }
        concat_step = {
            "class": ConcatFeatures.class_name(),
            "state": "trained",
            "operator": "ConcatFeatures",
            "label": "ConcatFeatures",
            "documentation_url": "https://lale.readthedocs.io/en/latest/modules/lale.lib.lale.concat_features.html",
            "hyperparams": None,
            "coefs": None,
            "is_frozen_trainable": True,
            "is_frozen_trained": True,
        }
        lr_step = {
            "class": LR.class_name(),
            "state": "planned",
            "operator": "LogisticRegression",
            "label": "LR",
            "documentation_url": "https://lale.readthedocs.io/en/latest/modules/lale.lib.sklearn.logistic_regression.html",
        }
        json_expected = {
            "class": "lale.operators.PlannedPipeline",
            "state": "planned",
            "edges": expected_edges,
            "steps": {
                "pca": pca_step,
                "no_op": no_op_step,
                "concat_features": concat_step,
                "lr": lr_step,
            },
        }

        serialized = to_json(operator)
        self.assertEqual(serialized, json_expected)

        rebuilt = from_json(serialized)
        reserialized = to_json(rebuilt)
        self.assertEqual(serialized, reserialized)
Example #15
0
 def test_empty_schema(self):
     """Expect ``OperatorSchemaError`` when the ``whiten`` hyperparameter of
     PCA is customized with an empty ``schemas.Schema()``."""
     from lale.search.schema2search_space import OperatorSchemaError

     pca = PCA().customize_schema(whiten=schemas.Schema())
     plan = ((pca & (MinMaxScaler | Normalizer)) >> ConcatFeatures() >>
             (MinMaxScaler | Normalizer) >>
             (LogisticRegression | KNeighborsClassifier))
     # Only the raised type is checked, so the context object is not bound.
     with self.assertRaises(OperatorSchemaError):
         run_hyperopt_on_planned_pipeline(plan)
Example #16
0
 def test_no_max_schema(self):
     """Expect ``SearchSpaceError`` when ``n_components`` of PCA is
     customized to ``schemas.Float(min=0.0)``, which sets no maximum."""
     from lale.search.search_space import SearchSpaceError

     pca = PCA().customize_schema(n_components=schemas.Float(min=0.0))
     plan = ((pca & (MinMaxScaler | Normalizer)) >> ConcatFeatures() >>
             (MinMaxScaler | Normalizer) >>
             (LogisticRegression | KNeighborsClassifier))
     # Only the raised type is checked, so the context object is not bound.
     with self.assertRaises(SearchSpaceError):
         run_hyperopt_on_planned_pipeline(plan)
Example #17
0
 def test_remove_last5(self):
     """Call ``freeze_trainable`` on the pipeline returned by
     ``remove_last(inplace=True)``; the test passes if nothing raises."""
     pipeline = StandardScaler()
     pipeline = pipeline >> (PCA() & Nystroem() & PassiveAggressiveClassifier())
     pipeline = pipeline >> ConcatFeatures()
     pipeline = pipeline >> NoOp()
     pipeline = pipeline >> PassiveAggressiveClassifier()

     shortened = pipeline.remove_last(inplace=True)
     shortened.freeze_trainable()
Example #18
0
 def test_init_fit_predict_pandas_series(self):
     """Concatenate a pandas dataframe with a pandas series and compare
     each expected column with the transformed result."""
     frame = pd.DataFrame(
         [[11, 12, 13], [21, 22, 23], [31, 32, 33]], columns=["a", "b", "c"]
     )
     series = pd.Series([14, 24, 34], name="d")
     tables = [add_table_name(frame, "A"), add_table_name(series, "B")]

     concat = ConcatFeatures()
     fitted = concat.fit(X=tables)
     transformed = fitted.transform(tables)

     expected = pd.DataFrame(
         [
             [11, 12, 13, 14],
             [21, 22, 23, 24],
             [31, 32, 33, 34],
         ],
         columns=["a", "b", "c", "d"],
     )
     for column in expected.columns:
         self.assertEqual(list(transformed[column]), list(expected[column]))
Example #19
0
 def test_two_estimators_predict_proba(self):
     """Fit a pipeline containing two ``LogisticRegression`` steps and call
     ``predict_proba`` on the fitted result."""
     pipeline = StandardScaler()
     pipeline = pipeline >> (PCA() & Nystroem() & LogisticRegression())
     pipeline = pipeline >> ConcatFeatures()
     pipeline = pipeline >> NoOp()
     pipeline = pipeline >> LogisticRegression()

     fitted = pipeline.fit(self.X_train, self.y_train)
     fitted.predict_proba(self.X_test)
Example #20
0
 def test_two_estimators_predict_proba1(self):
     """Fit a pipeline containing two ``GaussianNB`` steps and call
     ``predict_proba`` on the fitted result."""
     pipeline = (
         StandardScaler()
         >> (PCA() & Nystroem() & GaussianNB())
         >> ConcatFeatures()
         >> NoOp()
         >> GaussianNB()
     )
     # Predict with the pipeline returned by fit rather than with the
     # trainable one, matching the other tests in this file.
     trained = pipeline.fit(self.X_train, self.y_train)
     trained.predict_proba(self.X_test)
Example #21
0
 def do1DTest(self, trainable, train_X, train_y, test_X, test_y):
     """Fit and predict with `trainable` on 1-D input.

     The operator is placed next to ``NoOp`` in a union that feeds
     ``float32_transform`` and ``LR``. The pipeline is exercised directly
     and again through ``Hyperopt`` with one evaluation. `test_y` is
     accepted but not used.
     """
     # Keep only the first column so the transformers receive a 1-D array.
     train_X, test_X = train_X[:, 0], test_X[:, 0]

     trainable_pipeline = (trainable & NoOp()) >> ConcatFeatures()
     trainable_pipeline = trainable_pipeline >> float32_transform()
     trainable_pipeline = trainable_pipeline >> LR()

     # Direct fit/predict.
     fitted_pipeline = trainable_pipeline.fit(train_X, train_y)
     fitted_pipeline.predict(test_X)

     # The same pipeline driven by Hyperopt.
     optimizer = Hyperopt(estimator=trainable_pipeline, max_evals=1)
     fitted_optimizer = optimizer.fit(train_X, train_y)
     fitted_optimizer.predict(test_X)
Example #22
0
    def test_export_to_sklearn_pipeline2(self):
        """Export a pipeline with nested concatenations to scikit-learn.

        Checks the types of the exported ``featureunion`` and
        ``kneighborsclassifier`` steps, then compares predictions via
        ``assert_equal_predictions``.
        """
        from lale.lib.lale import ConcatFeatures
        from lale.lib.sklearn import PCA
        from lale.lib.sklearn import KNeighborsClassifier
        from sklearn.feature_selection import SelectKBest
        from lale.lib.sklearn import Nystroem
        from sklearn.pipeline import FeatureUnion
        # Aliased so the scikit-learn class does not rebind the lale name.
        from sklearn.neighbors import KNeighborsClassifier as SklearnKNN

        inner_union = (PCA() & SelectKBest(k=3)) >> ConcatFeatures()
        outer_union = (inner_union & Nystroem()) >> ConcatFeatures()
        lale_pipeline = outer_union >> KNeighborsClassifier()

        fitted = lale_pipeline.fit(self.X_train, self.y_train)
        exported = fitted.export_to_sklearn_pipeline()

        self.assertIsInstance(exported.named_steps['featureunion'],
                              FeatureUnion)
        self.assertIsInstance(exported.named_steps['kneighborsclassifier'],
                              SklearnKNN)
        self.assert_equal_predictions(exported, fitted)
Example #23
0
    def test_transform_schema_Concat_irisArr(self):
        """Check ``ConcatFeatures.transform_schema`` on the iris array schemas.

        For several combinations of the X and y schemas, the output schema
        must report the expected column count (``minItems``/``maxItems``)
        and element schema. Data schema validation is switched on for the
        duration of the test and the previous setting is restored afterwards.
        """
        from lale.datasets.data_schemas import to_schema

        existing_flag = disable_data_schema_validation
        set_disable_data_schema_validation(False)
        try:
            data_X, data_y = self._irisArr["X"], self._irisArr["y"]
            s_in_X, s_in_y = to_schema(data_X), to_schema(data_y)

            def check(s_actual, n_expected, s_expected):
                assert s_actual["items"]["minItems"] == n_expected, str(s_actual)
                assert s_actual["items"]["maxItems"] == n_expected, str(s_actual)
                assert s_actual["items"]["items"] == s_expected, str(s_actual)

            s_out_X = ConcatFeatures.transform_schema({"items": [s_in_X]})
            check(s_out_X, 4, {"type": "number"})
            s_out_y = ConcatFeatures.transform_schema({"items": [s_in_y]})
            check(s_out_y, 1, {"type": "integer"})
            s_out_XX = ConcatFeatures.transform_schema({"items": [s_in_X, s_in_X]})
            check(s_out_XX, 8, {"type": "number"})
            s_out_yy = ConcatFeatures.transform_schema({"items": [s_in_y, s_in_y]})
            check(s_out_yy, 2, {"type": "integer"})
            s_out_Xy = ConcatFeatures.transform_schema({"items": [s_in_X, s_in_y]})
            check(s_out_Xy, 5, {"type": "number"})
            s_out_XXX = ConcatFeatures.transform_schema(
                {"items": [s_in_X, s_in_X, s_in_X]})
            check(s_out_XXX, 12, {"type": "number"})
        finally:
            # Restore the previous setting even when a check fails, so a
            # failure here cannot leak the changed flag into other tests.
            set_disable_data_schema_validation(existing_flag)
Example #24
0
    def test_transform_schema_Concat_irisDf(self):
        """Check ``ConcatFeatures.transform_schema`` on the iris dataframe
        schemas, inside ``EnableSchemaValidation``.

        Each case lists the input schemas, the expected column count
        (``minItems``/``maxItems``) and the expected element schema.
        """
        with EnableSchemaValidation():
            from lale.datasets.data_schemas import to_schema

            data_X, data_y = self._irisDf["X"], self._irisDf["y"]
            s_in_X, s_in_y = to_schema(data_X), to_schema(data_y)

            # (input schemas, expected column count, expected element schema)
            cases = [
                ([s_in_X], 4, {"type": "number"}),
                ([s_in_y], 1, {"description": "target", "type": "integer"}),
                ([s_in_X, s_in_X], 8, {"type": "number"}),
                ([s_in_y, s_in_y], 2, {"type": "integer"}),
                ([s_in_X, s_in_y], 5, {"type": "number"}),
                ([s_in_X, s_in_X, s_in_X], 12, {"type": "number"}),
            ]
            for inputs, n_expected, s_expected in cases:
                s_actual = ConcatFeatures.transform_schema({"items": inputs})
                assert s_actual["items"]["minItems"] == n_expected, str(
                    s_actual)
                assert s_actual["items"]["maxItems"] == n_expected, str(
                    s_actual)
                assert s_actual["items"]["items"] == s_expected, str(s_actual)
Example #25
0
    def test_concat_with_hyperopt(self):
        """Run ``Hyperopt`` with two evaluations on a PCA/Nystroem union
        pipeline over the iris data and predict with the fitted result."""
        from lale.lib.lale import Hyperopt
        pca = PCA(n_components=3)
        nys = Nystroem(n_components=10)
        concat = ConcatFeatures()
        lr = LogisticRegression(random_state=42, C=0.1)

        trainable = (pca & nys) >> concat >> lr
        clf = Hyperopt(estimator=trainable, max_evals=2)
        from sklearn.datasets import load_iris
        iris_data = load_iris()
        # Predict with the operator returned by fit rather than with the
        # trainable one, matching the other Hyperopt tests in this file.
        trained = clf.fit(iris_data.data, iris_data.target)
        trained.predict(iris_data.data)
Example #26
0
    def test_with_pandas(self):
        """Fit a PCA/Nystroem union pipeline on the data returned by
        ``load_iris_df`` and predict on its test split.

        One prediction per test row is expected.
        """
        from lale.datasets import load_iris_df
        import warnings

        # The original installed an "ignore" filter and never removed it;
        # catch_warnings keeps the suppression local to this test.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            pca = PCA(n_components=3)
            nys = Nystroem(n_components=10)
            concat = ConcatFeatures()
            lr = LogisticRegression(random_state=42, C=0.1)
            trainable = (pca & nys) >> concat >> lr

            (X_train, y_train), (X_test, _) = load_iris_df()
            trained = trainable.fit(X_train, y_train)
            predicted = trained.predict(X_test)

        self.assertEqual(len(predicted), len(X_test))
Example #27
0
    def test_with_concat_features1(self):
        """Run ``Hyperopt`` on a choice between a PCA/Nystroem union pipeline
        and ``KNeighborsClassifier`` on the iris data.

        The accuracy on the training data is printed, not asserted.
        """
        import warnings

        # catch_warnings restores the previous filters on exit, even when
        # the body raises, and does not wipe filters installed elsewhere.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            from sklearn.datasets import load_iris
            from lale.lib.lale import Hyperopt
            from sklearn.metrics import accuracy_score

            data = load_iris()
            X, y = data.data, data.target
            pca = PCA(n_components=3)
            nys = Nystroem(n_components=10)
            concat = ConcatFeatures()
            lr = LogisticRegression(random_state=42, C=0.1)
            pipeline = ((pca & nys) >> concat >> lr) | KNeighborsClassifier()
            clf = Hyperopt(estimator=pipeline, max_evals=1)
            trained = clf.fit(X, y)
            predictions = trained.predict(X)
            print(accuracy_score(y, predictions))
Example #28
0
    def test_export_to_sklearn_pipeline3(self):
        """Export a three-branch union pipeline to scikit-learn.

        Checks the types of the exported ``featureunion``, ``selectkbest``
        and ``logisticregression`` steps, then compares predictions via
        ``assert_equal_predictions``.
        """
        from lale.lib.lale import ConcatFeatures
        from lale.lib.sklearn import PCA
        from lale.lib.sklearn import LogisticRegression
        from sklearn.feature_selection import SelectKBest
        from lale.lib.sklearn import Nystroem
        from sklearn.pipeline import FeatureUnion
        # Aliased so the scikit-learn class does not rebind the lale
        # operator of the same name used to build the pipeline.
        from sklearn.linear_model import LogisticRegression as SklearnLR

        lale_pipeline = (
            (PCA() >> SelectKBest(k=2)) &
            (Nystroem(random_state=42) >> SelectKBest(k=3))
            & (SelectKBest(k=3))) >> ConcatFeatures() >> SelectKBest(
                k=2) >> LogisticRegression()
        trained_lale_pipeline = lale_pipeline.fit(self.X_train, self.y_train)
        sklearn_pipeline = trained_lale_pipeline.export_to_sklearn_pipeline()
        self.assertIsInstance(sklearn_pipeline.named_steps['featureunion'],
                              FeatureUnion)
        self.assertIsInstance(sklearn_pipeline.named_steps['selectkbest'],
                              SelectKBest)
        self.assertIsInstance(
            sklearn_pipeline.named_steps['logisticregression'],
            SklearnLR)
        self.assert_equal_predictions(sklearn_pipeline, trained_lale_pipeline)
Example #29
0
 def test_transform_schema_Concat_irisDf(self):
     """Check ``ConcatFeatures.transform_schema`` on the iris dataframe
     schemas: expected column count and element schema per combination."""
     from lale.datasets.data_schemas import to_schema

     data_X, data_y = self._irisDf['X'], self._irisDf['y']
     s_in_X, s_in_y = to_schema(data_X), to_schema(data_y)

     def expect(inputs, n_expected, s_expected):
         # Transform the given input schemas and verify the output schema.
         s_actual = ConcatFeatures.transform_schema({'items': inputs})
         assert s_actual['items']['minItems'] == n_expected, str(s_actual)
         assert s_actual['items']['maxItems'] == n_expected, str(s_actual)
         assert s_actual['items']['items'] == s_expected, str(s_actual)

     expect([s_in_X], 4, {'type': 'number'})
     expect([s_in_y], 1, {'description': 'target', 'type': 'integer'})
     expect([s_in_X, s_in_X], 8, {'type': 'number'})
     expect([s_in_y, s_in_y], 2, {'type': 'integer'})
     expect([s_in_X, s_in_y], 5, {'type': 'number'})
     expect([s_in_X, s_in_X, s_in_X], 12, {'type': 'number'})
Example #30
0
 def test_hyperparam_defaults(self):
     """Construct ``ConcatFeatures`` without passing any hyperparameters."""
     cf = ConcatFeatures()