Ejemplo n.º 1
0
 def test_import_from_sklearn_pipeline_nested_pipeline(self):
     """Import a sklearn pipeline whose FeatureUnion contains a nested pipeline.

     Checks the edge count and the impl type at each edge endpoint of the
     resulting lale pipeline, then verifies predictions match sklearn.
     Fix: removed a duplicate `from sklearn.pipeline import make_pipeline`
     (it was already imported on the first line of the body).
     """
     from sklearn.pipeline import FeatureUnion, make_pipeline
     from sklearn.decomposition import PCA
     from sklearn.kernel_approximation import Nystroem
     from sklearn.feature_selection import SelectKBest
     from sklearn.neighbors import KNeighborsClassifier
     union = FeatureUnion([("selectkbest_pca",
                            make_pipeline(SelectKBest(k=3),
                                          PCA(n_components=1))),
                           ("nys", Nystroem(n_components=2,
                                            random_state=42))])
     sklearn_pipeline = make_pipeline(union, KNeighborsClassifier())
     lale_pipeline = import_from_sklearn_pipeline(sklearn_pipeline)
     self.assertEqual(len(lale_pipeline.edges()), 4)
     from lale.lib.sklearn.pca import PCAImpl
     from lale.lib.sklearn.nystroem import NystroemImpl
     from lale.lib.lale.concat_features import ConcatFeaturesImpl
     from lale.lib.sklearn.k_neighbors_classifier import KNeighborsClassifierImpl
     # These assertions assume a particular topological sort of the edges.
     self.assertIsInstance(lale_pipeline.edges()[0][0]._impl, SelectKBest)
     self.assertIsInstance(lale_pipeline.edges()[0][1]._impl, PCAImpl)
     self.assertIsInstance(lale_pipeline.edges()[1][0]._impl, PCAImpl)
     self.assertIsInstance(lale_pipeline.edges()[1][1]._impl,
                           ConcatFeaturesImpl)
     self.assertIsInstance(lale_pipeline.edges()[2][0]._impl, NystroemImpl)
     self.assertIsInstance(lale_pipeline.edges()[2][1]._impl,
                           ConcatFeaturesImpl)
     self.assertIsInstance(lale_pipeline.edges()[3][0]._impl,
                           ConcatFeaturesImpl)
     self.assertIsInstance(lale_pipeline.edges()[3][1]._impl,
                           KNeighborsClassifierImpl)
     self.assert_equal_predictions(sklearn_pipeline, lale_pipeline)
Ejemplo n.º 2
0
 def test_import_from_sklearn_pipeline_nested_pipeline1(self):
     """Import a sklearn pipeline with a FeatureUnion nested two levels deep.

     Checks all eight edges of the resulting lale pipeline by impl class.
     Fix: removed a duplicate `from sklearn.pipeline import make_pipeline`
     (it was already imported on the first line of the body).
     """
     from sklearn.pipeline import FeatureUnion, make_pipeline
     from sklearn.decomposition import PCA
     from sklearn.kernel_approximation import Nystroem
     from sklearn.feature_selection import SelectKBest
     from sklearn.neighbors import KNeighborsClassifier
     union = FeatureUnion([
         ("selectkbest_pca",
          make_pipeline(
              SelectKBest(k=3),
              FeatureUnion([('pca', PCA(n_components=1)),
                            ('nested_pipeline',
                             make_pipeline(SelectKBest(k=2),
                                           Nystroem()))]))),
         ("nys", Nystroem(n_components=2, random_state=42))
     ])
     sklearn_pipeline = make_pipeline(union, KNeighborsClassifier())
     lale_pipeline = import_from_sklearn_pipeline(sklearn_pipeline)
     self.assertEqual(len(lale_pipeline.edges()), 8)
     #These assertions assume topological sort, which may not be unique. So the assertions are brittle.
     from lale.lib.sklearn.pca import PCAImpl
     from lale.lib.sklearn.nystroem import NystroemImpl
     from lale.lib.lale.concat_features import ConcatFeaturesImpl
     from lale.lib.sklearn.k_neighbors_classifier import KNeighborsClassifierImpl
     from lale.lib.sklearn.select_k_best import SelectKBestImpl
     self.assertEqual(lale_pipeline.edges()[0][0]._impl_class(),
                      SelectKBestImpl)
     self.assertEqual(lale_pipeline.edges()[0][1]._impl_class(), PCAImpl)
     self.assertEqual(lale_pipeline.edges()[1][0]._impl_class(),
                      SelectKBestImpl)
     self.assertEqual(lale_pipeline.edges()[1][1]._impl_class(),
                      SelectKBestImpl)
     self.assertEqual(lale_pipeline.edges()[2][0]._impl_class(),
                      SelectKBestImpl)
     self.assertEqual(lale_pipeline.edges()[2][1]._impl_class(),
                      NystroemImpl)
     self.assertEqual(lale_pipeline.edges()[3][0]._impl_class(), PCAImpl)
     self.assertEqual(lale_pipeline.edges()[3][1]._impl_class(),
                      ConcatFeaturesImpl)
     self.assertEqual(lale_pipeline.edges()[4][0]._impl_class(),
                      NystroemImpl)
     self.assertEqual(lale_pipeline.edges()[4][1]._impl_class(),
                      ConcatFeaturesImpl)
     self.assertEqual(lale_pipeline.edges()[5][0]._impl_class(),
                      ConcatFeaturesImpl)
     self.assertEqual(lale_pipeline.edges()[5][1]._impl_class(),
                      ConcatFeaturesImpl)
     self.assertEqual(lale_pipeline.edges()[6][0]._impl_class(),
                      NystroemImpl)
     self.assertEqual(lale_pipeline.edges()[6][1]._impl_class(),
                      ConcatFeaturesImpl)
     self.assertEqual(lale_pipeline.edges()[7][0]._impl_class(),
                      ConcatFeaturesImpl)
     self.assertEqual(lale_pipeline.edges()[7][1]._impl_class(),
                      KNeighborsClassifierImpl)
     self.assert_equal_predictions(sklearn_pipeline, lale_pipeline)
Ejemplo n.º 3
0
    def test_resampler(self):
        """Exercise a resampler class whose dotted name comes from `res_name`.

        NOTE(review): `res_name` is not defined in this function; it is
        presumably supplied by the enclosing scope (e.g. a test-generation
        loop over resampler names) -- confirm against the caller.
        """
        from lale.lib.sklearn import PCA, Nystroem, LogisticRegression, RandomForestClassifier
        from lale.lib.lale import NoOp, ConcatFeatures
        X_train, y_train = self.X_train, self.y_train
        X_test, y_test = self.X_test, self.y_test
        import importlib
        # Split the dotted path into its module and final class name.
        module_name = ".".join(res_name.split('.')[0:-1])
        class_name = res_name.split('.')[-1]
        module = importlib.import_module(module_name)

        class_ = getattr(module, class_name)
        # Constructing without the required `operator` argument must fail.
        with self.assertRaises(ValueError):
            res = class_()

        #test_schemas_are_schemas
        lale.type_checking.validate_is_schema(class_.input_schema_fit())
        lale.type_checking.validate_is_schema(class_.input_schema_predict())
        lale.type_checking.validate_is_schema(class_.output_schema_predict())
        lale.type_checking.validate_is_schema(class_.hyperparam_schema())

        #test_init_fit_predict
        from lale.operators import make_pipeline
        pipeline1 = PCA() >> class_(operator=make_pipeline(LogisticRegression()))
        trained = pipeline1.fit(X_train, y_train)
        predictions = trained.predict(X_test)

        pipeline2 = class_(operator=make_pipeline(PCA(), LogisticRegression()))
        trained = pipeline2.fit(X_train, y_train)
        predictions = trained.predict(X_test)

        #test_with_hyperopt
        from lale.lib.lale import Hyperopt
        optimizer = Hyperopt(estimator=PCA >> class_(operator=make_pipeline(LogisticRegression())), max_evals = 1, show_progressbar=False)
        trained_optimizer = optimizer.fit(X_train, y_train)
        predictions = trained_optimizer.predict(X_test)

        # A more complex planned pipeline nested inside the resampler.
        pipeline3 = class_(operator= PCA() >> (Nystroem & NoOp) >> ConcatFeatures >> LogisticRegression())
        optimizer = Hyperopt(estimator=pipeline3, max_evals = 1, show_progressbar=False)
        trained_optimizer = optimizer.fit(X_train, y_train)
        predictions = trained_optimizer.predict(X_test)

        # Two resampler branches joined by ConcatFeatures.
        pipeline4 = (PCA >> class_(operator=make_pipeline(Nystroem())) & class_(operator=make_pipeline(Nystroem()))) >> ConcatFeatures >> LogisticRegression()
        optimizer = Hyperopt(estimator=pipeline4, max_evals = 1, scoring='roc_auc', show_progressbar=False)
        trained_optimizer = optimizer.fit(X_train, y_train)
        predictions = trained_optimizer.predict(X_test)

        #test_cross_validation
        from lale.helpers import cross_val_score
        cv_results = cross_val_score(pipeline1, X_train, y_train, cv = 2)
        self.assertEqual(len(cv_results), 2)

        #test_to_json
        pipeline1.to_json()
Ejemplo n.º 4
0
 def test_import_from_sklearn_pipeline_feature_union(self):
     """Round-trip a FeatureUnion pipeline and check the lale edge topology."""
     from sklearn.pipeline import FeatureUnion
     from sklearn.decomposition import PCA
     from sklearn.kernel_approximation import Nystroem
     from sklearn.neighbors import KNeighborsClassifier
     from sklearn.pipeline import make_pipeline
     feature_union = FeatureUnion([("pca", PCA(n_components=1)),
                                   ("nys", Nystroem(n_components=2,
                                                    random_state=42))])
     sklearn_pipeline = make_pipeline(feature_union, KNeighborsClassifier())
     lale_pipeline = import_from_sklearn_pipeline(sklearn_pipeline)
     self.assertEqual(len(lale_pipeline.edges()), 3)
     from lale.lib.sklearn.pca import PCAImpl
     from lale.lib.sklearn.nystroem import NystroemImpl
     from lale.lib.lale.concat_features import ConcatFeaturesImpl
     from lale.lib.sklearn.k_neighbors_classifier import KNeighborsClassifierImpl
     # Expected (source, destination) impl classes per edge, in edge order.
     expected_edge_impls = [
         (PCAImpl, ConcatFeaturesImpl),
         (NystroemImpl, ConcatFeaturesImpl),
         (ConcatFeaturesImpl, KNeighborsClassifierImpl),
     ]
     for idx, (src_impl, dst_impl) in enumerate(expected_edge_impls):
         self.assertEqual(lale_pipeline.edges()[idx][0]._impl_class(),
                          src_impl)
         self.assertEqual(lale_pipeline.edges()[idx][1]._impl_class(),
                          dst_impl)
     self.assert_equal_predictions(sklearn_pipeline, lale_pipeline)
Ejemplo n.º 5
0
    def test_autoai_libs_tam_2(self):
        """Pretty-print round trip for a TAM(PCA) + LGBMClassifier pipeline.

        The expected text spells out all of PCA's hyperparameters --
        presumably pretty-print expands defaults here; confirm against
        lale.pretty_print if this assertion starts failing.
        """
        from lale.lib.autoai_libs import TAM
        import numpy as np
        from lightgbm import LGBMClassifier
        from sklearn.decomposition import PCA
        from lale.operators import make_pipeline
        pca = PCA(copy=False)
        tam = TAM(tans_class=pca,
                  name='pca',
                  col_names=['a', 'b', 'c'],
                  col_dtypes=[
                      np.dtype('float32'),
                      np.dtype('float32'),
                      np.dtype('float32')
                  ])
        lgbm_classifier = LGBMClassifier(class_weight='balanced',
                                         learning_rate=0.18)
        pipeline = make_pipeline(tam, lgbm_classifier)
        # The expected string must match the pretty-printer byte-for-byte,
        # including the internal spacing in the TAM(...) call.
        expected = \
"""from lale.lib.autoai_libs import TAM
import sklearn.decomposition.pca
import numpy as np
from lightgbm import LGBMClassifier
from lale.operators import make_pipeline

tam = TAM(tans_class=sklearn.decomposition.pca.PCA(copy=False, iterated_power='auto', n_components=None, random_state=None,   svd_solver='auto', tol=0.0, whiten=False), name='pca', col_names=['a', 'b', 'c'], col_dtypes=[np.dtype('float32'), np.dtype('float32'), np.dtype('float32')])
lgbm_classifier = LGBMClassifier(class_weight='balanced', learning_rate=0.18)
pipeline = make_pipeline(tam, lgbm_classifier)"""
        self._roundtrip(
            expected, lale.pretty_print.to_string(pipeline, combinators=False))
Ejemplo n.º 6
0
 def test_duplicate_instances(self):
     """Using the same operator instance twice in one pipeline must fail."""
     from lale.operators import make_pipeline
     transformer = PCA()
     classifier = LogisticRegression(
         LogisticRegression.solver.lbfgs,
         LogisticRegression.multi_class.auto)
     # The identical PCA instance appears twice, which lale rejects.
     with self.assertRaises(ValueError):
         make_pipeline(transformer, transformer, classifier)
Ejemplo n.º 7
0
    def dont_test_car_hyperopt(self):
        """Tune a classifier choice on the car dataset with HyperoptClassifier.

        Disabled test (the `dont_` prefix keeps unittest from collecting it)
        -- presumably because it needs the optional weka/R backends; confirm.
        """
        from lale.datasets.auto_weka import fetch_car
        from sklearn.metrics import accuracy_score, make_scorer
        from sklearn.preprocessing import LabelEncoder
        import pandas as pd
        from lale.lib.weka import J48
        from lalegpl.lib.r import ArulesCBAClassifier
        from lale.operators import make_pipeline
        from lale.lib.lale import HyperoptClassifier
        from lale.lib.sklearn import LogisticRegression, KNeighborsClassifier

        (X_train, y_train), (X_test, y_test) = fetch_car()
        # Encode string class labels as integers, then restore the original
        # target column name by rebuilding named Series.
        y_name = y_train.name
        le = LabelEncoder()
        y_train = le.fit_transform(y_train)
        y_test = le.transform(y_test)

        y_train = pd.Series(y_train, name=y_name)
        y_test = pd.Series(y_test, name=y_name)

        # Planned pipeline: one choice among three classifiers.
        planned_pipeline = make_pipeline(ArulesCBAClassifier() | LogisticRegression() | KNeighborsClassifier())

        clf = HyperoptClassifier(model = planned_pipeline, max_evals = 1)
        best_pipeline = clf.fit(X_train, y_train)
        print(accuracy_score(y_test, best_pipeline.predict(X_test)))
Ejemplo n.º 8
0
    def test_autoai_libs_tam_2(self):
        """Pretty-print round trip for TAM(PCA) + LGBMClassifier (black style).

        Unlike the single-line variant of this test, the expected text here
        only shows the non-default PCA argument (copy=False) and is formatted
        across multiple lines.
        """
        from lale.lib.autoai_libs import TAM
        import numpy as np
        from lightgbm import LGBMClassifier
        from sklearn.decomposition import PCA
        from lale.operators import make_pipeline
        pca = PCA(copy=False)
        tam = TAM(tans_class=pca, name='pca', col_names=['a', 'b', 'c'], col_dtypes=[np.dtype('float32'), np.dtype('float32'), np.dtype('float32')])
        lgbm_classifier = LGBMClassifier(class_weight='balanced', learning_rate=0.18)
        pipeline = make_pipeline(tam, lgbm_classifier)
        # Must match the pretty-printer output byte-for-byte.
        expected = """from autoai_libs.cognito.transforms.transform_utils import TAM
import sklearn.decomposition
import numpy as np
from lightgbm import LGBMClassifier
from lale.operators import make_pipeline

tam = TAM(
    tans_class=sklearn.decomposition.PCA(copy=False),
    name="pca",
    col_names=["a", "b", "c"],
    col_dtypes=[
        np.dtype("float32"),
        np.dtype("float32"),
        np.dtype("float32"),
    ],
)
lgbm_classifier = LGBMClassifier(class_weight="balanced", learning_rate=0.18)
pipeline = make_pipeline(tam, lgbm_classifier)"""
        self._roundtrip(expected, lale.pretty_print.to_string(pipeline, combinators=False))
Ejemplo n.º 9
0
    def test_with_concat_features2(self):
        """Hyperopt over a choice: concat-features pipeline vs. plain KNN."""
        import warnings

        warnings.filterwarnings("ignore")

        from sklearn.datasets import load_iris
        from sklearn.metrics import accuracy_score

        from lale.lib.lale import Hyperopt
        from lale.operators import make_pipeline

        iris = load_iris()
        features, labels = iris.data, iris.target
        pca_step = PCA(n_components=3)
        nystroem_step = Nystroem(n_components=10)
        concat_step = ConcatFeatures()
        logreg_step = LogisticRegression(random_state=42, C=0.1)

        # Branch A: (impute-or-noop >> PCA) joined with Nystroem, then
        # concatenated and classified.  Branch B: plain KNN.
        branch_a = ((((SimpleImputer() | NoOp()) >> pca_step) & nystroem_step)
                    >> concat_step >> logreg_step)
        planned = make_pipeline(branch_a | KNeighborsClassifier())

        optimizer = Hyperopt(estimator=planned, max_evals=1,
                             handle_cv_failure=True)
        best_found = optimizer.fit(features, labels)
        predictions = best_found.predict(features)
        print(accuracy_score(labels, predictions))
        warnings.resetwarnings()
Ejemplo n.º 10
0
def import_from_sklearn_pipeline(sklearn_pipeline):
    """Convert a scikit-learn Pipeline/FeatureUnion/estimator to a lale operator.

    Recurses into Pipeline steps and FeatureUnion transformers; leaf
    estimators are mapped to their lale wrapper when one exists.
    """
    #For all pipeline steps, identify equivalent lale wrappers if present,
    #if not, call make operator on sklearn classes and create a lale pipeline.

    def get_equivalent_lale_op(sklearn_obj):
        # Look up a lale wrapper with the same class name in lale.lib.sklearn;
        # fall back to wrapping the raw sklearn object via make_operator.
        module_name = "lale.lib.sklearn"
        from sklearn.base import clone
        from lale.operators import make_operator

        class_name = sklearn_obj.__class__.__name__
        module = importlib.import_module(module_name)
        try:
            class_ = getattr(module, class_name)
        except AttributeError:
            class_ = make_operator(sklearn_obj, name=class_name)
        # Re-instantiate with the sklearn object's hyperparameters and keep a
        # deep copy of the original (possibly fitted) sklearn model.
        class_ = class_(**sklearn_obj.get_params())
        class_._impl._sklearn_model =  copy.deepcopy(sklearn_obj)
        return class_

    from sklearn.pipeline import FeatureUnion, Pipeline
    from sklearn.base import BaseEstimator
    from lale.operators import make_pipeline, make_union

    if isinstance(sklearn_pipeline, Pipeline):
        # A Pipeline maps to a linear lale pipeline over its converted steps.
        nested_pipeline_steps = sklearn_pipeline.named_steps.values()
        nested_pipeline_lale_objects = [import_from_sklearn_pipeline(nested_pipeline_step) for nested_pipeline_step in nested_pipeline_steps]
        lale_op_obj = make_pipeline(*nested_pipeline_lale_objects)
    elif isinstance(sklearn_pipeline, FeatureUnion):
        # A FeatureUnion maps to parallel branches joined by ConcatFeatures.
        transformer_list = sklearn_pipeline.transformer_list
        concat_predecessors = [import_from_sklearn_pipeline(transformer[1]) for transformer in transformer_list]
        lale_op_obj = make_union(*concat_predecessors)
    else:
        lale_op_obj = get_equivalent_lale_op(sklearn_pipeline)
    return lale_op_obj
Ejemplo n.º 11
0
    def test_autoai_libs_t_no_op(self):
        """Pretty-print round trip for a TNoOp + LGBMClassifier pipeline.

        NOTE(review): the expected text includes n_estimators=100 even though
        it is not passed to LGBMClassifier below -- presumably the
        pretty-printer materializes that default; confirm against
        lale.pretty_print if the assertion breaks.
        """
        from lightgbm import LGBMClassifier

        from lale.lib.autoai_libs import TNoOp
        from lale.operators import make_pipeline

        t_no_op = TNoOp(
            fun="fun",
            name="no_action",
            datatypes="x",
            feat_constraints=[],
            tgraph="tgraph",
        )
        lgbm_classifier = LGBMClassifier(class_weight="balanced", learning_rate=0.18)
        pipeline = make_pipeline(t_no_op, lgbm_classifier)
        expected = """from autoai_libs.cognito.transforms.transform_utils import TNoOp
from lightgbm import LGBMClassifier
from lale.operators import make_pipeline

t_no_op = TNoOp(
    fun="fun",
    name="no_action",
    datatypes="x",
    feat_constraints=[],
    tgraph="tgraph",
)
lgbm_classifier = LGBMClassifier(
    class_weight="balanced", learning_rate=0.18, n_estimators=100
)
pipeline = make_pipeline(t_no_op, lgbm_classifier)"""
        self._roundtrip(
            expected, lale.pretty_print.to_string(pipeline, combinators=False)
        )
Ejemplo n.º 12
0
 def test_string_labels(self):
     """CondensedNearestNeighbour works with string class labels."""
     from lale.lib.imblearn import CondensedNearestNeighbour
     print(type(CondensedNearestNeighbour))
     from lale.operators import make_pipeline
     # Map the numeric labels onto two string classes.
     string_labels = ['high' if label != 0 else 'low'
                      for label in self.y_train]
     resampler = CondensedNearestNeighbour(
         operator=make_pipeline(PCA(), LogisticRegression()),
         sampling_strategy=['high'])
     fitted = resampler.fit(self.X_train, string_labels)
     fitted.predict(self.X_test)
Ejemplo n.º 13
0
    def test_export_to_pickle(self):
        """Both the trainable and the trained pipeline must be picklable."""
        from lale.lib.sklearn import LogisticRegression
        from lale.operators import make_pipeline

        trainable = make_pipeline(LogisticRegression())
        trained = trainable.fit(self.X_train, self.y_train)
        # Pickling must succeed before and after training.
        pickle.dumps(trainable)
        pickle.dumps(trained)
Ejemplo n.º 14
0
 def test_make_pipeline(self):
     """A two-step make_pipeline fits and predicts on the digits dataset."""
     from lale.operators import make_pipeline
     transformer = PCA(n_components=10)
     classifier = LogisticRegression(random_state=42)
     trainable = make_pipeline(transformer, classifier)
     digits = sklearn.datasets.load_digits()
     trained = trainable.fit(digits.data, digits.target)
     trained.predict(digits.data)
Ejemplo n.º 15
0
 def test_decision_function(self):
     """A SMOTE-wrapped random forest exposes no decision_function."""
     from lale.lib.imblearn import SMOTE
     from lale.lib.sklearn import RandomForestClassifier
     from lale.operators import make_pipeline
     resampler = SMOTE(operator=make_pipeline(RandomForestClassifier()))
     fitted = resampler.fit(self.X_train, self.y_train)
     fitted.predict(self.X_test)
     # Random forests provide predict_proba, not decision_function.
     with self.assertRaises(AttributeError):
         fitted.decision_function(self.X_test)
Ejemplo n.º 16
0
    def test_export_to_sklearn_pipeline4(self):
        """Exporting a one-step lale pipeline yields a real sklearn pipeline."""
        from lale.lib.sklearn import LogisticRegression
        from lale.operators import make_pipeline

        lale_pipeline = make_pipeline(LogisticRegression())
        trained_lale_pipeline = lale_pipeline.fit(self.X_train, self.y_train)
        sklearn_pipeline = trained_lale_pipeline.export_to_sklearn_pipeline()
        from sklearn.linear_model import LogisticRegression as SkLogisticRegression
        # The exported step must be the native sklearn estimator class.
        exported_step = sklearn_pipeline.named_steps['logisticregression']
        self.assertIsInstance(exported_step, SkLogisticRegression)
        self.assert_equal_predictions(sklearn_pipeline, trained_lale_pipeline)
Ejemplo n.º 17
0
def import_from_sklearn_pipeline(sklearn_pipeline, fitted=True):
    """Convert a scikit-learn Pipeline/FeatureUnion/estimator to a lale operator.

    Parameters
    ----------
    sklearn_pipeline : sklearn Pipeline, FeatureUnion, or estimator
        The object to convert; Pipelines and FeatureUnions are recursed into.
    fitted : bool, default True
        When True, leaf operators are wrapped as TrainedIndividualOp and a
        deep copy of the (possibly fitted) sklearn model is retained.
    """
    #For all pipeline steps, identify equivalent lale wrappers if present,
    #if not, call make operator on sklearn classes and create a lale pipeline.

    def get_equivalent_lale_op(sklearn_obj, fitted):
        # Search the known wrapper modules for a class whose name matches the
        # sklearn object's class name.
        module_names = ["lale.lib.sklearn", "lale.lib.autoai_libs"]
        from lale.operators import make_operator, TrainedIndividualOp

        lale_wrapper_found = False
        class_name = sklearn_obj.__class__.__name__
        for module_name in module_names:
            module = importlib.import_module(module_name)
            try:
                class_ = getattr(module, class_name)
                lale_wrapper_found = True
                break
            except AttributeError:
                continue
        else:
            # No wrapper in any module: wrap the raw sklearn object instead.
            class_ = make_operator(sklearn_obj, name=class_name)

        if not fitted:  #If fitted is False, we do not want to return a Trained operator.
            lale_op = class_
        else:
            # Promote to a trained operator so it can predict without refit.
            lale_op = TrainedIndividualOp(class_._name, class_._impl,
                                          class_._schemas)
        # Re-instantiate with the sklearn object's hyperparameters.
        class_ = lale_op(**sklearn_obj.get_params())
        if lale_wrapper_found:
            class_._impl_instance()._wrapped_model = copy.deepcopy(sklearn_obj)
        else:  # If there is no lale wrapper, there is no _wrapped_model
            class_._impl = copy.deepcopy(sklearn_obj)
        return class_

    from sklearn.pipeline import FeatureUnion, Pipeline
    from sklearn.base import BaseEstimator
    from lale.operators import make_pipeline, make_union
    if isinstance(sklearn_pipeline, Pipeline):
        # A Pipeline maps to a linear lale pipeline over its converted steps.
        nested_pipeline_steps = sklearn_pipeline.named_steps.values()
        nested_pipeline_lale_objects = [
            import_from_sklearn_pipeline(nested_pipeline_step, fitted=fitted)
            for nested_pipeline_step in nested_pipeline_steps
        ]
        lale_op_obj = make_pipeline(*nested_pipeline_lale_objects)
    elif isinstance(sklearn_pipeline, FeatureUnion):
        # A FeatureUnion maps to parallel branches joined by ConcatFeatures.
        transformer_list = sklearn_pipeline.transformer_list
        concat_predecessors = [
            import_from_sklearn_pipeline(transformer[1], fitted=fitted)
            for transformer in transformer_list
        ]
        lale_op_obj = make_union(*concat_predecessors)
    else:
        lale_op_obj = get_equivalent_lale_op(sklearn_pipeline, fitted=fitted)
    return lale_op_obj
Ejemplo n.º 18
0
 def test_import_from_sklearn_pipeline1(self):
     """Imported lale steps keep the original sklearn hyperparameters."""
     from sklearn.decomposition import PCA
     from sklearn.neighbors import KNeighborsClassifier
     from sklearn.pipeline import make_pipeline
     sklearn_pipeline = make_pipeline(PCA(n_components=3),
                                      KNeighborsClassifier())
     lale_pipeline = import_from_sklearn_pipeline(sklearn_pipeline)
     # named_steps preserves pipeline order, so zip pairs matching steps.
     paired_steps = zip(sklearn_pipeline.named_steps.values(),
                        lale_pipeline.steps())
     for sklearn_step, lale_step in paired_steps:
         self.assertEqual(sklearn_step.get_params(),
                          lale_step._impl._wrapped_model.get_params())
     self.assert_equal_predictions(sklearn_pipeline, lale_pipeline)
Ejemplo n.º 19
0
    def test_J48_for_car_dataset(self):
        """Tune a J48 pipeline with HyperoptClassifier on the car dataset."""
        from lalegpl.datasets.auto_weka import fetch_car
        (X_train, y_train), (X_test, y_test) = fetch_car()
        # Encode the string class labels as integers.
        from sklearn.preprocessing import LabelEncoder
        le = LabelEncoder()
        y_train = le.fit_transform(y_train)
        y_test = le.transform(y_test)

        # NOTE(review): this bare J48() is immediately overwritten below;
        # it looks like leftover code -- confirm before removing.
        clf = J48()
        from sklearn.metrics import accuracy_score
        from lale.lib.lale import NoOp, HyperoptClassifier
        from lale.operators import make_pipeline
        clf = HyperoptClassifier(make_pipeline(J48()), max_evals=1)
        trained_clf = clf.fit(X_train, y_train)
        print(accuracy_score(y_test, trained_clf.predict(X_test)))
Ejemplo n.º 20
0
    def test_autoai_libs_t_no_op(self):
        """Pretty-print round trip for a TNoOp + LGBMClassifier pipeline.

        The expected text must match the pretty-printer output byte-for-byte.
        """
        from lale.lib.autoai_libs import TNoOp
        from lightgbm import LGBMClassifier
        from lale.operators import make_pipeline
        t_no_op = TNoOp(name='no_action', datatypes='x', feat_constraints=[])
        lgbm_classifier = LGBMClassifier(class_weight='balanced', learning_rate=0.18)
        pipeline = make_pipeline(t_no_op, lgbm_classifier)
        expected = \
"""from lale.lib.autoai_libs import TNoOp
from lightgbm import LGBMClassifier
from lale.operators import make_pipeline

t_no_op = TNoOp(name='no_action', datatypes='x', feat_constraints=[])
lgbm_classifier = LGBMClassifier(class_weight='balanced', learning_rate=0.18)
pipeline = make_pipeline(t_no_op, lgbm_classifier)"""
        self._roundtrip(expected, lale.pretty_print.to_string(pipeline, combinators=False))
Ejemplo n.º 21
0
 def test_pipeline_2(self):
     """A planned pipeline with operator choices survives a JSON round trip."""
     from lale.lib.lale import NoOp
     from lale.lib.sklearn import Nystroem
     from lale.lib.sklearn import PCA
     from lale.lib.sklearn import LogisticRegression
     from lale.lib.sklearn import KNeighborsClassifier
     from lale.operators import make_choice, make_pipeline
     from lale.json_operator import to_json, from_json
     maybe_kernel = make_choice(NoOp, Nystroem)
     classifier_choice = make_choice(LogisticRegression, KNeighborsClassifier)
     planned = make_pipeline(maybe_kernel, PCA, classifier_choice)
     original_json = to_json(planned)
     round_tripped = from_json(original_json)
     # Serializing the deserialized operator must reproduce the same JSON.
     self.assertEqual(original_json, to_json(round_tripped))
Ejemplo n.º 22
0
    def sample(self, n: int) -> PlannedOperator:
        """
        Sample the grammar `g` starting from `g.start`, that is, choose one
        element at random for each possible choice.

        Parameters
        ----------
        n : int
            number of derivations

        Returns
        -------
        PlannedOperator
        """
        assert hasattr(self, "start"), "Rule start must be defined"
        sampled = self._sample(self.start, n)
        if sampled:
            return make_pipeline(sampled)
        return NoOp
Ejemplo n.º 23
0
    def unfold(self, n: int) -> PlannedOperator:
        """
        Explore the grammar starting from `self.start` and generate all
        possible choices after `n` derivations.

        Parameters
        ----------
        n : int
            number of derivations

        Returns
        -------
        PlannedOperator
        """
        assert hasattr(self, "start"), "Rule start must be defined"
        unfolded = self._unfold(self.start, n)
        if unfolded:
            return make_pipeline(unfolded)
        return NoOp
Ejemplo n.º 24
0
    def test_compare_with_sklearn(self):
        """Lale pipeline accuracy must equal the equivalent sklearn pipeline."""
        from lale.operators import make_pipeline
        transformer = PCA()
        classifier = LogisticRegression(LogisticRegression.solver.lbfgs,
                                        LogisticRegression.multi_class.auto)
        digits = sklearn.datasets.load_digits()
        trainable = make_pipeline(transformer, classifier)
        trained = trainable.fit(digits.data, digits.target)
        predicted = trained.predict(digits.data)

        # Build the same pipeline directly with scikit-learn.
        from sklearn.pipeline import make_pipeline as scikit_make_pipeline
        from sklearn.decomposition import PCA as SklearnPCA
        from sklearn.linear_model import LogisticRegression as SklearnLR
        sklearn_pipeline = scikit_make_pipeline(
            SklearnPCA(), SklearnLR(solver="lbfgs", multi_class="auto"))
        sklearn_pipeline.fit(digits.data, digits.target)
        predicted_sklearn = sklearn_pipeline.predict(digits.data)

        from sklearn.metrics import accuracy_score
        self.assertEqual(accuracy_score(digits.target, predicted),
                         accuracy_score(digits.target, predicted_sklearn))
Ejemplo n.º 25
0
def import_from_sklearn_pipeline(sklearn_pipeline, fitted=True):
    """Convert a scikit-learn Pipeline/FeatureUnion/estimator to a lale operator.

    Parameters
    ----------
    sklearn_pipeline : sklearn Pipeline, FeatureUnion, or estimator
        The object to convert; Pipelines and FeatureUnions are recursed into.
    fitted : bool, default True
        When True, leaf operators are wrapped as TrainedIndividualOp and a
        deep copy of the (possibly fitted) sklearn model is retained.

    Raises
    ------
    ValueError
        If a step lacks get_params, i.e. is not sklearn-compatible.
    """
    #For all pipeline steps, identify equivalent lale wrappers if present,
    #if not, call make operator on sklearn classes and create a lale pipeline.

    def get_equivalent_lale_op(sklearn_obj, fitted):
        #Validate that the sklearn_obj is a valid sklearn-compatible object
        if sklearn_obj is None or not hasattr(sklearn_obj, 'get_params'):
            raise ValueError(
                "The input pipeline has a step that is not scikit-learn compatible."
            )
        # Search the known wrapper modules for a class whose name matches the
        # sklearn object's class name.
        module_names = ["lale.lib.sklearn", "lale.lib.autoai_libs"]
        from lale.operators import TrainedIndividualOp, make_operator

        lale_wrapper_found = False
        class_name = sklearn_obj.__class__.__name__
        for module_name in module_names:
            module = importlib.import_module(module_name)
            try:
                class_ = getattr(module, class_name)
                lale_wrapper_found = True
                break
            except AttributeError:
                continue
        else:
            # No wrapper in any module: wrap the raw sklearn object instead.
            class_ = make_operator(sklearn_obj, name=class_name)

        if not fitted:  #If fitted is False, we do not want to return a Trained operator.
            lale_op = class_
        else:
            # Promote to a trained operator so it can predict without refit.
            lale_op = TrainedIndividualOp(class_._name, class_._impl,
                                          class_._schemas)

        # Higher-order case: if any hyperparameter is itself an estimator
        # (has get_params), convert it recursively before instantiating.
        orig_hyperparams = sklearn_obj.get_params()
        higher_order = False
        for hp_name, hp_val in orig_hyperparams.items():
            higher_order = higher_order or hasattr(hp_val, 'get_params')
        if higher_order:
            hyperparams = {}
            for hp_name, hp_val in orig_hyperparams.items():
                if hasattr(hp_val, 'get_params'):
                    nested_op = get_equivalent_lale_op(hp_val, fitted)
                    hyperparams[hp_name] = nested_op
                else:
                    hyperparams[hp_name] = hp_val
        else:
            hyperparams = orig_hyperparams

        class_ = lale_op(**hyperparams)
        if lale_wrapper_found:
            wrapped_model = copy.deepcopy(sklearn_obj)
            class_._impl_instance()._wrapped_model = wrapped_model
        else:  # If there is no lale wrapper, there is no _wrapped_model
            class_._impl = copy.deepcopy(sklearn_obj)
        return class_

    from sklearn.base import BaseEstimator
    from sklearn.pipeline import FeatureUnion, Pipeline

    from lale.operators import make_pipeline, make_union
    if isinstance(sklearn_pipeline, Pipeline):
        # A Pipeline maps to a linear lale pipeline over its converted steps.
        nested_pipeline_steps = sklearn_pipeline.named_steps.values()
        nested_pipeline_lale_objects = [
            import_from_sklearn_pipeline(nested_pipeline_step, fitted=fitted)
            for nested_pipeline_step in nested_pipeline_steps
        ]
        lale_op_obj = make_pipeline(*nested_pipeline_lale_objects)
    elif isinstance(sklearn_pipeline, FeatureUnion):
        # A FeatureUnion maps to parallel branches joined by ConcatFeatures.
        transformer_list = sklearn_pipeline.transformer_list
        concat_predecessors = [
            import_from_sklearn_pipeline(transformer[1], fitted=fitted)
            for transformer in transformer_list
        ]
        lale_op_obj = make_union(*concat_predecessors)
    else:
        lale_op_obj = get_equivalent_lale_op(sklearn_pipeline, fitted=fitted)
    return lale_op_obj
Ejemplo n.º 26
0
    def dont_test_car_smac(self):
        """Tune a classifier choice on the car dataset with SMAC.

        Disabled test (the `dont_` prefix keeps unittest from collecting it)
        -- presumably because it needs the optional SMAC/weka/R backends;
        confirm before re-enabling.
        """
        import numpy as np

        from lale.datasets.auto_weka import fetch_car
        from sklearn.metrics import accuracy_score, make_scorer
        from sklearn.preprocessing import LabelEncoder
        import pandas as pd
        from lale.lib.weka import J48
        from lalegpl.lib.r import ArulesCBAClassifier
        from lale.operators import make_pipeline
        from lale.lib.lale import HyperoptClassifier
        from lale.lib.sklearn import LogisticRegression, KNeighborsClassifier
        from smac.scenario.scenario import Scenario
        from smac.facade.smac_facade import SMAC
        from smac.configspace import ConfigurationSpace


        (X_train, y_train), (X_test, y_test) = fetch_car()
        # Encode string class labels as integers, then restore the original
        # target column name by rebuilding named Series.
        y_name = y_train.name
        le = LabelEncoder()
        y_train = le.fit_transform(y_train)
        y_test = le.transform(y_test)

        y_train = pd.Series(y_train, name=y_name)
        y_test = pd.Series(y_test, name=y_name)

#        planned_pipeline = make_pipeline(J48() | ArulesCBAClassifier() | LogisticRegression() | KNeighborsClassifier())
        planned_pipeline = make_pipeline(ArulesCBAClassifier() | KNeighborsClassifier() | LogisticRegression())

        # Derive the SMAC configuration space from the planned pipeline.
        cs:ConfigurationSpace = get_smac_space(planned_pipeline)
        print(cs)
#        X_train = X_train[0:20]
#        y_train = y_train[0:20]
        # Scenario object
        run_count_limit = 1
        scenario = Scenario({"run_obj": "quality",   # we optimize quality (alternatively runtime)
                            "runcount-limit": run_count_limit,  # maximum function evaluations
                            "cs": cs,               # configuration space
                            "deterministic": "true",
                            "abort_on_first_run_crash": False
                            })

        # Optimize, using a SMAC-object
        def f_min(op):
            # Objective: cross-validated score of the candidate operator.
            return test_f_min(op, X_train, y_train, num_folds=2)
        tae = lale_op_smac_tae(planned_pipeline, f_min)

        print("Optimizing! Depending on your machine, this might take a few minutes.")
        smac = SMAC(scenario=scenario, rng=np.random.RandomState(42),
                tae_runner=tae)

        incumbent = smac.optimize()
        # Train the best configuration found and evaluate on the test split.
        trainable_pipeline = lale_trainable_op_from_config(planned_pipeline, incumbent)
        trained_pipeline = trainable_pipeline.fit(X_train, y_train)
        pred = trained_pipeline.predict(X_test)
        accuracy = accuracy_score(y_test, pred)
        print("Accuracy: %.2f" % (accuracy))
        inc_value = tae(incumbent)

        print("Optimized Value: %.2f" % (inc_value))
        print(f"Run count limit: {run_count_limit}")
Ejemplo n.º 27
0
def fetch(dataset_name, task_type, verbose=False, preprocess=True):
    """Fetch a named dataset as (train, test) splits, optionally preprocessed.

    Looks up ``dataset_name`` in the module-level ``experiments_dict``,
    downloads and caches its ARFF file under ``download_data_dir`` on first
    use, and returns label-encoded train/test splits with schemas attached
    via ``add_schemas``.

    Args:
        dataset_name: Key into ``experiments_dict``.
        task_type: Expected task type; compared case-insensitively against
            the dataset's declared ``task_type``.
        verbose: If True, print progress and shape information.
        preprocess: If True, impute missing values and one-hot encode
            categorical columns; if False, return the raw named columns.

    Returns:
        ``((X_train, y_train), (X_test, y_test))`` — a 67/33 split with
        ``random_state=0``; ``y`` is label-encoded in both branches.

    Raises:
        KeyError: If ``dataset_name`` is not a supported dataset.
        ValueError: If ``task_type`` does not match the dataset's declared
            task type, or (when preprocessing) the declared target column is
            missing from the ARFF attributes.
    """
    if verbose:
        print('Loading dataset:', dataset_name)
    # Validate that the dataset exists and its declared task type matches.
    try:
        if experiments_dict[dataset_name]['task_type'] != task_type.lower():
            raise ValueError("The task type {} does not match with the given datasets task type {}"\
                .format(task_type, experiments_dict[dataset_name]['task_type']))
    except KeyError:
        # `from None`: the chained original KeyError adds nothing beyond this message.
        raise KeyError("Dataset name {} not found in the supported datasets".format(dataset_name)) from None
    data_file_name = os.path.join(download_data_dir, dataset_name+".arff")
    if verbose:
        print(data_file_name)
    if not os.path.exists(data_file_name):
        # Download and cache the ARFF file on first use.
        if not os.path.exists(download_data_dir):
            os.makedirs(download_data_dir)
            if verbose:
                print('created directory {}'.format(download_data_dir))
        urllib.request.urlretrieve(experiments_dict[dataset_name]['download_arff_url'], data_file_name)

    assert os.path.exists(data_file_name)
    # The `with` statement closes the file; the original's explicit
    # f.close() inside this block was redundant and has been dropped.
    with open(data_file_name) as f:
        dataDictionary = arff.load(f)

    from lale.datasets.data_schemas import liac_arff_to_schema
    schema_orig = liac_arff_to_schema(dataDictionary)
    target_col = experiments_dict[dataset_name]['target']
    if preprocess:
        arffData = pd.DataFrame(dataDictionary['data'])
        attributes = dataDictionary['attributes']

        if verbose:
            print(attributes)
        categorical_cols = []
        numeric_cols = []
        X_columns = []
        # Locate the target column, extract y, then drop the target from both
        # the data and the attribute list so the attribute indices in the next
        # loop line up with the remaining columns of X.
        target_indx = None
        for i, item in enumerate(attributes):
            if item[0].lower() == target_col:
                target_indx = i
                y = arffData.iloc[:, target_indx]
                del attributes[i]
                arffData = arffData.drop(i, axis=1)
                # Fix: the original kept iterating after `del`, which mutates
                # the list being enumerated and silently skips the element
                # right after the deletion point. There is a single target, so
                # stop here.
                break
        if target_indx is None:
            # Fix: the original would crash later with a NameError on `y`;
            # fail fast with an explicit message instead.
            raise ValueError("Target column {} not found in dataset {}".format(target_col, dataset_name))

        for i, item in enumerate(attributes):
            X_columns.append(i)
            if (((isinstance(item[1], str) and item[1].lower() not in numeric_data_types_list) \
                or isinstance(item[1], list)) and (item[0].lower() != 'class')):
                categorical_cols.append(i)
            elif (isinstance(item[1], str) and item[1].lower() in numeric_data_types_list) and (item[0].lower() != 'class'):
                numeric_cols.append(i)
        if verbose:
            print(f'categorical columns: {categorical_cols}')
            print(f'numeric columns:     {numeric_cols}')
        X = arffData.iloc[:,X_columns]

        # Sanity check on the extracted target (reported only when verbose).
        num_classes_from_last_row = len(list(set(y)))
        if verbose:
            print('num_classes_from_last_row', num_classes_from_last_row)

        # Stage 1: impute missing values — most-frequent for categoricals,
        # mean for numerics.
        transformers1 = [
            ( 'imputer_str',
              SimpleImputer(missing_values=None, strategy='most_frequent'),
              categorical_cols),
            ( 'imputer_num',
              SimpleImputer(strategy='mean'), numeric_cols)]
        txm1 = ColumnTransformer(transformers1, sparse_threshold=0.0)

        # Stage 2: one-hot encode the categorical columns and pass numerics
        # through. ColumnTransformer emits its groups in order, so after
        # stage 1 the categorical columns occupy the leading positions.
        transformers2 = [
            ( 'ohe', OneHotEncoder(sparse=False),
              list(range(len(categorical_cols)))),
            ( 'no_op', 'passthrough',
              list(range(len(categorical_cols),
                         len(categorical_cols) + len(numeric_cols))))]
        txm2 = ColumnTransformer(transformers2, sparse_threshold=0.0)
        if verbose:
            print("Shape of X before preprocessing", X.shape)
        from lale.operators import make_pipeline
        preprocessing = make_pipeline(txm1, txm2)

        X = preprocessing.fit(X).transform(X)
        if verbose:
            print("Shape of X after preprocessing", X.shape)

    else:
        # No preprocessing: keep the raw columns with their original names.
        col_names = [attr[0] for attr in dataDictionary['attributes']]
        df_all = pd.DataFrame(dataDictionary['data'], columns=col_names)
        y = df_all[target_col]
        y = y.squeeze()
        cols_X = [col for col in col_names if col != target_col]
        X = df_all[cols_X]

    labelencoder = LabelEncoder()
    y = labelencoder.fit_transform(y)

    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size = 0.33, random_state = 0)
    if verbose:
        print(f'training set shapes: X {X_train.shape}, y {y_train.shape}')
        print(f'test set shapes:     X {X_test.shape}, y {y_test.shape}')
    X_train, X_test, y_train, y_test = add_schemas( \
        schema_orig, target_col, X_train, X_test, y_train, y_test)
    return (X_train, y_train), (X_test, y_test)