Example #1
    def fit(self, X, y):
        """
        Generates and optimizes all legitimate pipelines. The best pipeline can be retrieved from `self.best_estimator_`.

        :param X: Training data
        :param y: Corresponding observations
        :return: `self`
        """
        _X, _y = X, y
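        # One-hot encode the declared categorical columns; categories unseen during fit are ignored at transform time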
        if self.cat_cols is not None:
            from category_encoders.one_hot import OneHotEncoder

            enc = OneHotEncoder(
                cols=self.cat_cols, return_df=False, handle_unknown="ignore"
            )
            enc.fit(X)
            _X = enc.transform(X)

        X_, y_ = _X, _y
        self.num_features = len(X_[0])  # number of columns in the (possibly encoded) training matrix
        # Generate and optimize candidate pipelines of every admissible length
        for l in range(1, self.length + 1):
            self._cast(l, X_, y_)
        # Take the (model, score) pair of the top-ranked sequence
        top = list(self.get_top(1).items())[0][1]
        self.best_estimator_ = top[0]
        self.best_estimator_score = top[1]
        return self
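A minimal usage sketch for the method above. The surrounding class is not part of this excerpt, so the constructor name `PipelineSearch` and its arguments are illustrative assumptions; only `fit`, `best_estimator_`, and `best_estimator_score` come from the code shown.

# Hypothetical usage; `PipelineSearch` stands in for the class that defines `fit` above.
searcher = PipelineSearch(length=3, cat_cols=None)   # assumed constructor
searcher.fit(X_train, y_train)                       # X_train: 2-D array-like, y_train: targets
print(searcher.best_estimator_score)                 # score of the winning pipeline
y_pred = searcher.best_estimator_.predict(X_test)    # best_estimator_ holds the best fitted pipeline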
Example #2
def enc(X):
    # `CEOneHotEncoder` is assumed to be category_encoders' OneHotEncoder imported under an alias
    from category_encoders import OneHotEncoder as CEOneHotEncoder

    e = CEOneHotEncoder(use_cat_names=True, handle_unknown='ignore').fit(X)
    return e.transform(X)
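A short, hedged demo of this helper, assuming `X` is a pandas DataFrame; with `cols` left unset, category_encoders encodes the object-typed columns and leaves numeric ones untouched.

import pandas as pd

X = pd.DataFrame({"color": ["red", "blue", "red"], "size": [1, 2, 3]})
X_enc = enc(X)
# `use_cat_names=True` yields readable indicator names such as color_red / color_blue,
# while the numeric column `size` passes through unchanged.
print(X_enc.columns.tolist())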
Example #3
    def eoa_fit(self, X, y, **kwargs):
        """
        Applies evolutionary optimization methods to find an optimal pipeline.

        :param X: Training data
        :param y: Corresponding observations
        :param kwargs: `EOA` parameters
        :return: `self`
        """
        from .structsearch import BoxSample, CompactSample
        from .eoa import EOA
        _X, _y = X, y
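        # One-hot encode the declared categorical columns before the evolutionary search, as in `fit`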
        if self.cat_cols is not None:
            from category_encoders.one_hot import OneHotEncoder
            enc = OneHotEncoder(cols=self.cat_cols, return_df=False, handle_unknown='ignore')
            enc.fit(X)
            _X = enc.transform(X)
        X_, y_ = _X, _y
        self.num_features = len(X_[0])
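        # Build the initial population: every candidate sequence of length 1..self.length that passes validation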
        Pop = []
        for l in range(1, self.length + 1):
            candidates = self.words.Generate(l)
            for cnddt in candidates:
                if self._validate_sequence(cnddt):
                    Pop.append(cnddt)

        def _eval(ppl):
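            # Fitness callback passed to EOA: optimizes each candidate sequence and returns its negated score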
            if self.couldBfirst == []:
                from sklearn.pipeline import Pipeline
            else:
                from imblearn.pipeline import Pipeline
            from sklearn.model_selection import RandomizedSearchCV
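            # Default surrogate models for the structured search when none were supplied:
            # a KernelRidge and a GaussianProcessRegressor, each tuned by a small RandomizedSearchCV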
            if self.surrogates is None:
                from numpy import logspace
                from sklearn.gaussian_process import GaussianProcessRegressor
                from sklearn.kernel_ridge import KernelRidge
                from sklearn.gaussian_process.kernels import Matern, Sum, ExpSineSquared, WhiteKernel
                param_grid_gpr = {"alpha": logspace(-8, 1, 20),
                                  "kernel": [Sum(Matern(length_scale=l_, nu=p), WhiteKernel(noise_level=q))
                                             for l_ in logspace(-3, 3, 20)
                                             for p in [0.5, 1.5, 2.5]
                                             for q in logspace(-3, 1.5, 20)]}
                GPR = RandomizedSearchCV(GaussianProcessRegressor(), param_distributions=param_grid_gpr, n_iter=20,
                                         cv=2)
                param_grid_krr = {"alpha": logspace(-4, 0, 10),
                                  "kernel": [Sum(Matern(), ExpSineSquared(l_, p))
                                             for l_ in logspace(-2, 2, 20)
                                             for p in logspace(0, 2, 20)]}
                KRR = RandomizedSearchCV(KernelRidge(), param_distributions=param_grid_krr, n_iter=30, cv=2)
                self.surrogates = [(KRR, 35, CompactSample, 'L-BFGS-B'), (GPR, 50, BoxSample, 'L-BFGS-B')]
                self.min_random_evals = 10
            from collections import OrderedDict
            fitted = OrderedDict([])
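            # Optimize each candidate sequence, cache the result in self.models, and record its negated score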
            for seq in ppl:
                best_mdl, best_scr = self.optimize_pipeline(seq, X_, y_)
                if seq not in self.models:
                    self.models[seq] = (best_mdl, best_scr)
                if self.verbose > 0:
                    print("score:%f" % best_scr)
                    print(best_mdl)
                fitted[seq] = -best_scr
            return fitted

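        # Configure and run the evolutionary search over the candidate population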
        num_parents = kwargs.pop('num_parents', 30)
        mutation_prob = kwargs.pop('mutation_prob', .1)
        _eoa = EOA(population=Pop, fitness=_eval, num_parents=num_parents, mutation_prob=mutation_prob,
                   term_genes=self.couldBlast, init_genes=self.couldBfirst, **kwargs)
        _eoa()
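        # Keep the best pipeline found during the evolutionary run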
        self.best_estimator_ = list(self.get_top(1).items())[0][1][0]
        return self
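As above, a hedged usage sketch rather than the library's documented API: the constructor is assumed, and only the keyword arguments that `eoa_fit` itself pops (`num_parents`, `mutation_prob`) are shown; any other keyword argument is forwarded verbatim to `EOA`.

# Hypothetical usage of the evolutionary pipeline search above.
searcher = PipelineSearch(length=3, cat_cols=["color", "brand"])   # assumed constructor; cat_cols are illustrative
searcher.eoa_fit(X_train, y_train,
                 num_parents=20,       # default is 30
                 mutation_prob=0.2)    # default is 0.1
best_pipeline = searcher.best_estimator_                           # best pipeline found by the evolutionary run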