Beispiel #1
0
    def fit(self, x, y):
        """Evolve a population of candidate features and keep the best model.

        The population starts as the raw columns of ``x`` (named ``x_0`` ...
        ``x_{p-1}``).  Each generation:

        1. proposes new features with ``mutate`` until the population holds
           ``pop_size = p * (mu + 1 + q)`` features or ``3 * pop_size``
           attempts have been made,
        2. re-computes feature importances with ``get_importance`` from a
           ``net`` of ``Lasso`` models fitted on all current features,
        3. discards the ``mu * p`` non-linear features with the lowest
           importance values,
        4. refits with ``_fit_model`` and remembers the best-scoring model.

        The loop ends after ``self.max_iter`` generations, or earlier once
        ``self.max_stall_iter`` generations pass without a better score.

        Parameters
        ----------
        x : array indexed as ``x[:, i]``
            Input data; must be 2-D (its shape is unpacked into two values).
        y : array-like
            Targets, passed unchanged to ``net``, ``score`` and ``_fit_model``.

        Returns
        -------
        self
            With the best model found stored in ``self.model``.

        Warns
        -----
        ConvergenceWarning
            If a generation exhausts its attempts without filling the
            population.
        """
        _, p = x.shape

        linear_names = ["x_{}".format(i) for i in range(p)]
        names = linear_names[:]
        data = [x[:, i] for i in range(p)]

        models = net(Lasso, x, y, max_coarsity=self.max_coarsity).values()
        scores = [model.score(x, y) for model in models]
        coefs = [model.coef_ for model in models]

        importance = get_importance(coefs, scores)

        stall_iter = 0

        # Baseline: the model built from the untouched input columns only.
        best_model, best_score = _fit_model(x,
                                            y,
                                            linear_names[:],
                                            self.operators,
                                            n_jobs=self.n_jobs)
        pop_size = p * (self.mu + 1 + self.q)
        for _ in range(self.max_iter):
            stall_iter += 1
            new_names = []
            new_data = []

            for _attempt in range(3 * pop_size):
                f, new_name, parents = mutate(names, importance, self.toursize,
                                              self.operators, self.rng)
                is_new = new_name not in new_names and new_name not in names
                if size(new_name) <= self.max_size and is_new:
                    # Evaluating arbitrary operator compositions may emit
                    # numerical warnings; invalid results are filtered below.
                    with warnings.catch_warnings():
                        warnings.simplefilter("ignore")
                        feature = f(*[data[j] for j in parents])
                        # Accept only finite features whose absolute
                        # correlation with every parent is at most self.t.
                        if np.all(np.isfinite(feature)) and all(
                                abs(np.corrcoef(feature, data[j]))[1,
                                                                   0] <= self.t
                                for j in parents):
                            new_names.append(new_name)
                            new_data.append(feature)
                # Stop as soon as the population is full.  The for/else
                # warning below must only fire when the attempts run out
                # before that happens.
                if len(new_names) + len(names) >= pop_size:
                    break
            else:
                warnings.warn(
                    "Failed to produce a new population given the tree-depth {} and correlation threshold {}."
                    .format(self.max_size, self.t), ConvergenceWarning)

            names.extend(new_names)
            data.extend(new_data)

            # Build the feature matrix once; it is used for fitting and scoring.
            features = np.array(data).T
            models = net(Lasso,
                         features,
                         y,
                         max_coarsity=self.max_coarsity).values()
            scores = [model.score(features, y) for model in models]
            coefs = [model.coef_ for model in models]
            importance = list(get_importance(coefs, scores))

            # Sort by descending importance and take the tail, i.e. the
            # mu * p lowest-valued non-linear features.  The original input
            # columns are never discarded.
            # NOTE(review): when self.mu * p == 0 the slice [-0:] selects
            # every non-linear name, so all of them are dropped -- confirm
            # this is intended.
            names_to_discard = [
                n for n in sorted(names,
                                  key=lambda name: importance[names.index(name)],
                                  reverse=True) if n not in linear_names
            ][-self.mu * p:]
            for n in names_to_discard:
                idx = names.index(n)
                names.pop(idx)
                data.pop(idx)
                importance.pop(idx)

            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                model, score = _fit_model(x,
                                          y,
                                          names,
                                          self.operators,
                                          n_jobs=self.n_jobs)

            if score > best_score:
                best_model = model
                best_score = score
                stall_iter = 0

            elif stall_iter >= self.max_stall_iter:
                break

        self.model = best_model
        return self
Beispiel #2
0
# Feature library configuration: a single exponent (1) and no operators.
exponents = [1]
operators = {}

sym = sf.SymbolicFeatures(exponents=exponents, operators=operators)
features = sym.fit_transform(x)

# Parallel lists: estimator class, the attribute name handed to `net`,
# and the legend label used for its curve.
ests = [Lasso, STRidge]
attrs = ["alpha", "threshold"]
names = ["Lasso", "STRidge"]

# Keyword arguments shared by every `net` call below.
net_kwargs = {"filter": True, "max_coarsity": 5, "r_max": 1e5}

for est, attr, name in zip(ests, attrs, names):
    models = net(est, features, y, attr, **net_kwargs)

    # The sorted keys of `models` form the x-axis; each keyed model is
    # scored on the full feature matrix for the y-axis.
    # presumably the keys are non-zero coefficient counts (see the x label)
    # -- verify against `net`.
    m = sorted(models)
    per_model_scores = [models[key].score(features, y) for key in m]
    scores = np.array(per_model_scores)

    plt.plot(m, scores, "o--", label=name)

plt.legend()
plt.xlabel("# coefficient")
plt.ylabel(r"$R^2$")

# Reverse the x-axis so that larger key values appear on the left.
axes = plt.gca()
axes.invert_xaxis()
plt.show()