def transform(self, X):
        """Apply every transformer to X in parallel and combine the outputs.

        Parameters
        ----------
        X : iterable or array-like, depending on transformers
            Input data to be transformed.

        Returns
        -------
        X_t : ndarray, sparse matrix, pandas object or list
            ``np.zeros((X.shape[0], 0))`` when the parallel run yields
            nothing; a CSR matrix built with ``sparse.vstack`` when any
            output is sparse; ``np.vstack`` of the outputs when the first
            output is an ndarray; ``pd.concat(..., axis=1)`` when the first
            output is a Series or DataFrame; otherwise the result of the
            parallel run, returned unchanged.
        """
        runner = Parallel(n_jobs=self.n_jobs)
        outputs = runner(
            delayed(_transform_one)(trans, X, None, weight)
            for name, trans, weight in self._iter())

        if not outputs:
            # Nothing was produced (e.g. all transformers are None).
            return np.zeros((X.shape[0], 0))

        # NOTE(review): sparse and ndarray outputs are stacked vertically,
        # whereas pandas outputs are concatenated column-wise (axis=1) --
        # confirm that the vertical stacking is intended.
        if any(sparse.issparse(piece) for piece in outputs):
            return sparse.vstack(outputs).tocsr()

        head = outputs[0]
        if isinstance(head, np.ndarray):
            return np.vstack(outputs)
        if isinstance(head, (pd.Series, pd.DataFrame)):
            return pd.concat(outputs, axis=1)
        return outputs
Exemple #2
0
def _resample_model(estimator_func,
                    X,
                    y,
                    scaling=.5,
                    n_resampling=200,
                    n_jobs=None,
                    verbose=False,
                    pre_dispatch='3*n_jobs',
                    random_state=None,
                    sample_fraction=.75,
                    **params):
    """Average the output of ``estimator_func`` over randomized resamplings.

    For each of the ``n_resampling`` runs, ``estimator_func`` is called with
    ``X``, ``y`` and:

    * ``weights``: ``(1 - scaling)`` times a random 0/1 vector of length
      ``n_features``;
    * ``mask``: a boolean vector of length ``n_samples`` that is True where
      a uniform draw falls below ``sample_fraction``;
    * ``verbose``: ``max(0, verbose - 1)``;
    * any extra ``**params``.

    Returns
    -------
    scores_ : the sum of all run results divided by ``n_resampling``.

    Raises
    ------
    ValueError
        If ``scaling`` is not strictly between 0 and 1.
    """
    random_state = check_random_state(random_state)
    n_samples, n_features = X.shape

    if not (0 < scaling < 1):
        raise ValueError(
            "'scaling' should be between 0 and 1. Got %r instead." % scaling)

    # The value handed to the estimator is 1 - scaling, not scaling itself.
    complement = 1. - scaling

    def _jobs():
        # Lazily yield one task per resampling; the random draws happen in
        # the order (feature weights, sample mask) for every task.
        for _ in range(n_resampling):
            feature_weights = complement * random_state.randint(
                0, 2, size=(n_features, ))
            sample_mask = random_state.rand(n_samples) < sample_fraction
            yield delayed(estimator_func)(
                X,
                y,
                weights=feature_weights,
                mask=sample_mask,
                verbose=max(0, verbose - 1),
                **params)

    runner = Parallel(n_jobs=n_jobs, verbose=verbose,
                      pre_dispatch=pre_dispatch)

    scores_ = 0.0
    for run_result in runner(_jobs()):
        scores_ += run_result

    scores_ /= n_resampling
    return scores_
Exemple #3
0
    def fit(self, X, y, categories="auto"):
        """Fit one clone of ``self.estimator`` per category except the last.

        A pandas Series exposing the ``cat`` accessor is replaced by its
        integer category codes. The number of classes is taken as
        ``max(y) + 1`` and stored in ``n_classes_``.

        Note that the ``categories`` argument is overwritten with
        ``range(n_classes_)`` and therefore has no effect on the result.

        Returns
        -------
        self
        """
        # Categorical targets are handled through their integer codes.
        is_categorical = isinstance(y, pd.Series) and hasattr(y, "cat")
        if is_categorical:
            y = y.cat.codes

        self.n_classes_ = np.max(y) + 1
        categories = list(range(self.n_classes_))
        all_but_last = categories[:-1]

        # Fitted estimators are stored in increasing category order.
        runner = Parallel(n_jobs=self.n_jobs)
        self.estimators_ = runner(
            delayed(_parallel_fit_estimator)(clone(self.estimator), X, y, cat)
            for cat in all_but_last)
        return self
Exemple #4
0
def validation_curve(estimator,
                     X,
                     y,
                     param_name,
                     param_range,
                     groups=None,
                     cv=None,
                     scoring=None,
                     n_jobs=None,
                     pre_dispatch="all",
                     verbose=0,
                     error_score=np.nan):
    """Fit and score ``estimator`` for each value of one parameter and CV split.

    A clone of ``estimator`` is fitted for every (split, value) pair, with
    ``param_name`` set to the value, iterating over the splits in the outer
    loop and over ``param_range`` in the inner loop.

    Returns
    -------
    tuple
        ``(estimators, scores_0, scores_1, fit_time, score_time)`` where
        ``estimators`` is column 4 of the collected results, ``scores_0``
        and ``scores_1`` are float64 arrays of shape
        ``(n_params, n_cv_folds)`` built from result columns 0 and 1
        (presumably train and test scores -- depends on the column order
        returned by ``_fit_and_score``), and ``fit_time`` / ``score_time``
        are float64 arrays taken from columns 2 and 3.
    """
    X, y, groups = indexable(X, y, groups)

    cv = check_cv(cv, y, classifier=is_classifier(estimator))
    scorer = check_scoring(estimator, scoring=scoring)

    runner = Parallel(n_jobs=n_jobs,
                      pre_dispatch=pre_dispatch,
                      verbose=verbose)
    fit_and_score = delayed(_fit_and_score)
    # NOTE do not change order of iteration to allow one time cv splitters
    records = runner(
        fit_and_score(clone(estimator),
                      X,
                      y,
                      scorer,
                      train,
                      test,
                      verbose,
                      parameters={param_name: value},
                      fit_params=None,
                      return_train_score=True,
                      error_score=error_score,
                      return_estimator=True,
                      return_times=True)
        for train, test in cv.split(X, y, groups)
        for value in param_range)

    table = np.asarray(records)
    score_columns = np.asarray(table[:, :2])

    n_params = len(param_range)
    n_cv_folds = score_columns.shape[0] // n_params
    # Rows are ordered split-major, so reshape to (fold, param, kind) and
    # flip the axes to obtain (kind, param, fold).
    by_kind = score_columns.reshape(n_cv_folds, n_params, 2).transpose(
        (2, 1, 0))

    return (table[:, 4],
            np.float64(by_kind[0]),
            np.float64(by_kind[1]),
            np.float64(table[:, 2]),
            np.float64(table[:, 3]))
Exemple #5
0
    def fit(self, X, y):
        """Fit one binary perceptron per class and store the stacked results.

        ``X`` is first passed through ``self._augment``. Labels are encoded
        with a ``LabelEncoder`` so that class indices start at zero; the
        original labels are kept in ``classes_``. The per-class results of
        ``_fit_binary_perceptron`` are stored as an array in ``weights_``.

        Returns
        -------
        self
        """
        X = self._augment(X)

        # Encode the labels so that class indices start from zero.
        encoder = LabelEncoder()
        encoded_y = encoder.fit_transform(y)
        self.classes_ = encoder.classes_
        class_indices = range(len(self.classes_))

        # One binary classifier per class, fitted in a thread-preferring pool.
        pool = Parallel(n_jobs=self.n_jobs, prefer='threads',
                        verbose=self.verbose)
        fit_one = delayed(_fit_binary_perceptron)
        per_class = pool(
            fit_one(X, encoded_y, c, self.eta0, self.decay,
                    self.max_iterations)
            for c in class_indices)

        # Keep the fitted results for prediction.
        self.weights_ = np.array(per_class)
        return self
Exemple #6
0
	def transform(self, X):
		"""Transform X with every transformer.

		When ``self.use_in_model`` is truthy the parent class ``transform``
		is used. Otherwise each transformer is applied in parallel and the
		list of outputs is handed to ``self.get_result_as_dictionary``.

		Parameters
		----------
		X : iterable or array-like, depending on transformers
		    Input data to be transformed.

		Returns
		-------
		The parent class result when ``use_in_model`` is truthy, else the
		value returned by ``get_result_as_dictionary``.
		"""
		if not self.use_in_model:
			runner = Parallel(n_jobs=self.n_jobs)
			outputs = runner(
					delayed(_transform_one)(trans, X, None, weight)
					for _, trans, weight in self._iter())
			return self.get_result_as_dictionary(outputs)

		return super(CustomFeatureUnion, self).transform(X)
    def fit(self, X, y=None):
        """Fit every transformer on X, in parallel.

        ``transformer_list`` is first materialised as a list and validated;
        afterwards it is updated with the fitted transformers.

        Parameters
        ----------
        X : iterable or array-like, depending on transformers
            Input data, used to fit transformers.

        y : array-like, shape (n_samples, ...), optional
            Targets for supervised learning.

        Returns
        -------
        self : FeatureUnion
            This estimator
        """
        self.transformer_list = list(self.transformer_list)
        self._validate_transformers()

        runner = Parallel(n_jobs=self.n_jobs)
        fit_one = delayed(_fit_one_transformer)
        fitted = runner(fit_one(trans, X, y)
                        for _, trans, _ in self._iter())

        self._update_transformer_list(fitted)
        return self
    def fit_transform(self, X, y=None, **fit_params):
        """Fit all transformers, transform the data and combine the outputs.

        Parameters
        ----------
        X : iterable or array-like, depending on transformers
            Input data to be transformed.

        y : array-like, shape (n_samples, ...), optional
            Targets for supervised learning.

        **fit_params
            Forwarded to ``_fit_transform_one`` for every transformer.

        Returns
        -------
        X_t : ndarray, sparse matrix, pandas object or tuple
            ``np.zeros((X.shape[0], 0))`` when the parallel run yields
            nothing; a CSR matrix built with ``sparse.vstack`` when any
            output is sparse; ``np.vstack`` of the outputs when the first
            output is an ndarray; ``pd.concat(..., axis=1)`` when the first
            output is a Series or DataFrame; otherwise the tuple of
            per-transformer outputs.
        """
        self._validate_transformers()

        runner = Parallel(n_jobs=self.n_jobs)
        fitted_pairs = runner(
            delayed(_fit_transform_one)(trans, X, y, weight, **fit_params)
            for name, trans, weight in self._iter())

        if not fitted_pairs:
            # Nothing was produced (e.g. all transformers are None).
            return np.zeros((X.shape[0], 0))

        # Each result is an (output, fitted transformer) pair.
        outputs, fitted = zip(*fitted_pairs)
        self._update_transformer_list(fitted)

        # NOTE(review): sparse and ndarray outputs are stacked vertically,
        # whereas pandas outputs are concatenated column-wise (axis=1) --
        # confirm that the vertical stacking is intended.
        if any(sparse.issparse(piece) for piece in outputs):
            return sparse.vstack(outputs).tocsr()

        head = outputs[0]
        if isinstance(head, np.ndarray):
            return np.vstack(outputs)
        if isinstance(head, (pd.Series, pd.DataFrame)):
            return pd.concat(outputs, axis=1)
        return outputs
        def fit(self, X, y, sample_weight=None):
            """Fit the estimators.

            Parameters
            ----------
            X : {array-like, sparse matrix}, shape = [n_samples, n_features]
                Training vectors, where n_samples is the number of samples and
                n_features is the number of features.
            y : array-like, shape = [n_samples]
                Target values.
            sample_weight : array-like, shape = [n_samples] or None
                Sample weights. If None, then samples are equally weighted.
                Note that this is supported only if all underlying estimators
                that are not None support sample weights.

            Returns
            -------
            self : object

            Raises
            ------
            NotImplementedError
                If ``y`` is a 2-D ndarray with more than one column.
            ValueError
                If ``voting`` is invalid, the number of weights differs from
                the number of estimators, an estimator does not accept
                ``sample_weight``, or every estimator is None.
            AttributeError
                If ``estimators`` is None or empty.
            """
            if isinstance(y, np.ndarray) and len(y.shape) > 1 and y.shape[1] > 1:
                raise NotImplementedError('Multilabel and multi-output'
                                          ' classification is not supported.')

            if self.voting not in ('soft', 'hard'):
                raise ValueError("Voting must be 'soft' or 'hard'; got (voting=%r)"
                                 % self.voting)

            if self.estimators is None or len(self.estimators) == 0:
                raise AttributeError('Invalid `estimators` attribute, `estimators`'
                                     ' should be a list of (string, estimator)'
                                     ' tuples')

            if (self.weights is not None and
                    len(self.weights) != len(self.estimators)):
                raise ValueError('Number of classifiers and weights must be equal'
                                 '; got %d weights, %d estimators'
                                 % (len(self.weights), len(self.estimators)))

            if sample_weight is not None:
                for name, step in self.estimators:
                    if step is None:
                        # Disabled estimators are never fitted, so they do
                        # not have to accept sample weights.
                        continue
                    if not has_fit_parameter(step, 'sample_weight'):
                        raise ValueError('Underlying estimator \'%s\' does not'
                                         ' support sample weights.' % name)
            names, clfs = zip(*self.estimators)
            self._validate_names(names)

            n_isnone = np.sum([clf is None for _, clf in self.estimators])
            if n_isnone == len(self.estimators):
                raise ValueError('All estimators are None. At least one is '
                                 'required to be a classifier!')

            self.le_ = LabelEncoder().fit(y)
            self.classes_ = self.le_.classes_
            self.estimators_ = []

            transformed_y = self.le_.transform(y)

            # Only the estimators that are not None are cloned and fitted.
            self.estimators_ = Parallel(n_jobs=self.n_jobs)(
                    delayed(_parallel_fit_estimator)(clone(clf), X, transformed_y,
                                                     sample_weight=sample_weight)
                    for clf in clfs if clf is not None)

            # Pair each fitted estimator with the name of the estimator it was
            # cloned from. ``estimators_`` skips None entries, so the names
            # must be filtered the same way to stay aligned.
            fitted_names = [name for name, clf in self.estimators
                            if clf is not None]
            self.named_estimators_ = Bunch()
            for name, fitted in zip(fitted_names, self.estimators_):
                self.named_estimators_[name] = fitted
            return self
Exemple #10
0
    def fit(self, X, y=None):
        """Fits the GraphLasso covariance model to X.

        The (eta, mu) pair is selected by cross-validation: ``flgl_path`` is
        run on every train/test split, the per-fold scores of each
        combination are averaged, and the combination with the highest mean
        score is used for the final fit on the empirical covariance of the
        whole data.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
            Data from which to compute the covariance estimate
        y : (ignored)

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If no (eta, mu) combination yields a comparable mean score.
        """
        # Covariance does not make sense for a single feature
        self.random_state = check_random_state(self.random_state)
        X = check_array(X, ensure_min_features=2,
                        ensure_min_samples=2, estimator=self)

        self.X_train = X
        if self.assume_centered:
            # NOTE(review): this has shape (n_samples, n_features) while the
            # other branch yields one value per feature -- confirm intent.
            self.location_ = np.zeros((X.shape[0], X.shape[1]))
        else:
            self.location_ = X.mean(0)

        emp_cov = empirical_covariance(
            X, assume_centered=self.assume_centered)

        cv = check_cv(self.cv, y, classifier=False)

        # ``etas`` / ``mus`` are either explicit sequences of values or the
        # number of points of a descending log-spaced grid spanning two
        # decades below ``par_max(emp_cov)``.
        n_etas = self.etas
        if isinstance(n_etas, Sequence):
            etas = self.etas
        else:
            eta_1 = par_max(emp_cov)
            eta_0 = 1e-2 * eta_1
            etas = np.logspace(np.log10(eta_0), np.log10(eta_1),
                               n_etas)[::-1]

        n_mus = self.mus
        if isinstance(n_mus, Sequence):
            mus = self.mus
        else:
            mu_1 = par_max(emp_cov)  # not sure is the best strategy
            mu_0 = 1e-2 * mu_1
            mus = np.logspace(np.log10(mu_0), np.log10(mu_1),
                              n_mus)[::-1]

        # The filter must stay active while the paths are computed, so the
        # parallel call lives inside the ``catch_warnings`` block.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', ConvergenceWarning)
            this_path = Parallel(
                n_jobs=self.n_jobs,
                verbose=self.verbose
            )(delayed(flgl_path)(X[train], links=self.links, etas=etas,
                                 mus=mus, X_test=X[test], tol=self.tol,
                                 max_iter=int(.1 * self.max_iter),
                                 update_rho=self.update_rho,
                                 verbose=0, random_state=self.random_state)
              for train, test in cv.split(X, y))

        # Little dance to transform the per-fold lists into per-combination
        # tuples. Each fold result unpacks as (covs, precs, hidds, scores);
        # only covs and scores are needed for the selection.
        covs, _, _, scores = zip(*this_path)
        covs = zip(*covs)
        scores = zip(*scores)
        combinations = list(product(etas, mus))
        path = sorted(zip(combinations, scores, covs),
                      key=operator.itemgetter(0), reverse=True)

        # Find the maximum (avoid using built in 'max' function to
        # have a fully-reproducible selection of the smallest parameters
        # in case of equality)
        best_score = -np.inf
        best_index = None
        for index, (_, fold_scores, _) in enumerate(path):
            this_score = np.mean(fold_scores)
            if this_score >= .1 / np.finfo(np.float64).eps:
                # Implausibly large scores are treated as invalid.
                this_score = np.nan
            if this_score >= best_score:
                best_score = this_score
                best_index = index

        if best_index is None:
            raise ValueError('Cross-validation did not produce any finite '
                             'score; cannot select eta and mu.')

        # ``best_index`` refers to the sorted path, so the winning pair has
        # to be read from the path and not from the unsorted product.
        best_eta, best_mu = path[best_index][0]
        self.eta_ = best_eta
        self.mu_ = best_mu
        self.cv_parameters_ = combinations

        # Finally fit the model with the selected parameters
        (self.covariance_, self.precision_, self.hidden_, self.R_,
         self.n_iter_) = two_layers_fixed_links_GL(
            emp_cov, self.links, eta=best_eta, mu=best_mu, tol=self.tol,
            max_iter=self.max_iter,
            verbose=self.verbose, random_state=self.random_state,
            compute_objective=True, return_n_iter=True)
        return self
Exemple #11
0
 def __call__(self, value):
     """Apply every step to ``value`` in parallel and aggregate the results.

     Each callable in ``self.steps`` receives ``value``; the list of their
     results is passed to ``self.aggregate``, whose return value is returned.
     """
     pool = Parallel(n_jobs=self.n_jobs)
     per_step = pool(delayed(step)(value) for step in self.steps)
     return self.aggregate(per_step)
Exemple #12
0
    def kneighbors(self, X, n_neighbors=None, return_distance=True):
        """Finds the K-neighbors of a point.
        Returns indices of and distances to the neighbors of each point.

        Parameters
        ----------
        X : sktime-format pandas dataframe with shape([n_cases,n_dimensions]),
        or numpy ndarray with shape([n_cases,n_readings,n_dimensions])
            The query points. When ``X`` is None the fitted data
            (``self._fit_X``) is queried against itself and each sample is
            excluded from its own neighbor list.

        n_neighbors : int
            Number of neighbors to get (default is the value
            passed to the constructor).

        return_distance : boolean, optional. Defaults to True.
            If False, distances will not be returned

        Returns
        -------
        dist : array
            Array representing the lengths to points, only present if
            return_distance=True

        ind : array
            Indices of the nearest points in the population matrix.

        Raises
        ------
        ValueError
            If ``n_neighbors`` is not positive or exceeds the number of
            fitted samples, if a tree-based method receives sparse input, or
            if ``self._fit_method`` is not recognized.
        TypeError
            If ``n_neighbors`` is not of an integer type.

        Examples
        --------
        In the following example, we construct a NeighborsClassifier
        class from an array representing our data set and ask who's
        the closest point to [1,1,1]

        >>> samples = [[0., 0., 0.], [0., .5, 0.], [1., 1., .5]]
        >>> from sklearn.neighbors import NearestNeighbors
        >>> neigh = NearestNeighbors(n_neighbors=1)
        >>> neigh.fit(samples) # doctest: +ELLIPSIS
        NearestNeighbors(algorithm='auto', leaf_size=30, ...)
        >>> print(neigh.kneighbors([[1., 1., 1.]])) # doctest: +ELLIPSIS
        (array([[0.5]]), array([[2]]))

        As you can see, it returns [[0.5]], and [[2]], which means that the
        element is at distance 0.5 and is the third element of samples
        (indexes start at 0). You can also query for multiple points:

        >>> X = [[0., 1., 0.], [1., 0., 1.]]
        >>> neigh.kneighbors(X, return_distance=False) # doctest: +ELLIPSIS
        array([[1],
               [2]]...)

        """
        # NOTE(review): this runs before the ``X is None`` branch below --
        # confirm that check_data_sktime_tsc accepts None.
        check_data_sktime_tsc(X)
        check_is_fitted(self, "_fit_method")

        # Resolve and validate the requested number of neighbors.
        if n_neighbors is None:
            n_neighbors = self.n_neighbors
        elif n_neighbors <= 0:
            raise ValueError("Expected n_neighbors > 0. Got %d" % n_neighbors)
        else:
            if not np.issubdtype(type(n_neighbors), np.integer):
                raise TypeError("n_neighbors does not take %s value, "
                                "enter integer value" % type(n_neighbors))

        if X is not None:
            query_is_train = False
            X = check_array(X, accept_sparse='csr', allow_nd=True)
        else:
            query_is_train = True
            X = self._fit_X
            # Include an extra neighbor to account for the sample itself being
            # returned, which is removed later
            n_neighbors += 1

        train_size = self._fit_X.shape[0]
        if n_neighbors > train_size:
            raise ValueError("Expected n_neighbors <= n_samples, "
                             " but n_samples = %d, n_neighbors = %d" %
                             (train_size, n_neighbors))
        n_samples = X.shape[0]
        # Column vector of query row indices, compared against the neighbor
        # indices further down to spot each sample in its own neighbor list.
        sample_range = np.arange(n_samples)[:, None]

        n_jobs = effective_n_jobs(self.n_jobs)
        if self._fit_method == 'brute':

            # Each chunk of pairwise distances is reduced by
            # ``_kneighbors_reduce_func`` with these fixed keyword arguments.
            reduce_func = partial(self._kneighbors_reduce_func,
                                  n_neighbors=n_neighbors,
                                  return_distance=return_distance)

            # for efficiency, use squared euclidean distances
            kwds = ({
                'squared': True
            } if self.effective_metric_ == 'euclidean' else
                    self.effective_metric_params_)

            result = pairwise_distances_chunked(X,
                                                self._fit_X,
                                                reduce_func=reduce_func,
                                                metric=self.effective_metric_,
                                                n_jobs=n_jobs,
                                                **kwds)

        elif self._fit_method in ['ball_tree', 'kd_tree']:
            if issparse(X):
                raise ValueError(
                    "%s does not work with sparse matrices. Densify the data, "
                    "or set algorithm='brute'" % self._fit_method)
            if LooseVersion(joblib_version) < LooseVersion('0.12'):
                # Deal with change of API in joblib
                delayed_query = delayed(self._tree.query, check_pickle=False)
                parallel_kwargs = {"backend": "threading"}
            else:
                delayed_query = delayed(self._tree.query)
                parallel_kwargs = {"prefer": "threads"}
            # Query the tree on even slices of X, one task per slice.
            result = Parallel(n_jobs, **parallel_kwargs)(
                delayed_query(X[s], n_neighbors, return_distance)
                for s in gen_even_slices(X.shape[0], n_jobs))
        else:
            raise ValueError("internal: _fit_method not recognized")

        # ``result`` is an iterable of per-chunk results; stack the chunks
        # back into full arrays.
        if return_distance:
            dist, neigh_ind = zip(*result)
            result = np.vstack(dist), np.vstack(neigh_ind)
        else:
            result = np.vstack(result)

        if not query_is_train:
            return result
        else:
            # If the query data is the same as the indexed data, we would like
            # to ignore the first nearest neighbor of every sample, i.e
            # the sample itself.
            if return_distance:
                dist, neigh_ind = result
            else:
                neigh_ind = result

            # True wherever a neighbor is not the query sample itself.
            sample_mask = neigh_ind != sample_range

            # Corner case: When the number of duplicates are more
            # than the number of neighbors, the first NN will not
            # be the sample, but a duplicate.
            # In that case mask the first duplicate.
            dup_gr_nbrs = np.all(sample_mask, axis=1)
            sample_mask[:, 0][dup_gr_nbrs] = False

            # Exactly one entry per row is masked out, which leaves
            # n_neighbors - 1 columns (the extra neighbor added above).
            neigh_ind = np.reshape(neigh_ind[sample_mask],
                                   (n_samples, n_neighbors - 1))

            if return_distance:
                dist = np.reshape(dist[sample_mask],
                                  (n_samples, n_neighbors - 1))
                return dist, neigh_ind
            return neigh_ind
    def fit(self, X, y=None, groups=None, **fit_params):
        """Run fit with all sets of parameters.

        Every estimator clone is given its own cache directory
        (``os.path.join(self.cachedir, '<fold>_fold ')``) through a
        ``cachedir`` attribute, set on the final step for a ``Pipeline`` and
        on the estimator itself otherwise; a warning is issued when that
        attribute is missing.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.
        y : array-like, shape = [n_samples] or [n_samples, n_output], optional
            Target relative to X for classification or regression;
            None for unsupervised learning.
        groups : array-like, with shape (n_samples,), optional
            Group labels for the samples used while splitting the dataset into
            train/test set.
        **fit_params : dict of string -> object
            Parameters passed to the ``fit`` method of the estimator

        Returns
        -------
        self : object
            The search object, with ``cv_results_``, ``scorer_`` and
            ``n_splits_`` set; ``best_index_``, ``best_params_`` and
            ``best_score_`` when a refit metric is available;
            ``best_estimator_`` and ``refit_time_`` when ``refit`` is truthy;
            and ``cv_estimators`` when ``return_estimator`` is truthy.

        Raises
        ------
        ValueError
            For multi-metric scoring when ``refit`` is neither False nor the
            name of one of the scorers.
        """

        if self.fit_params is not None:
            warnings.warn('"fit_params" as a constructor argument was '
                          'deprecated in version 0.19 and will be removed '
                          'in version 0.21. Pass fit parameters to the '
                          '"fit" method instead.', DeprecationWarning)
            if fit_params:
                warnings.warn('Ignoring fit_params passed as a constructor '
                              'argument in favor of keyword arguments to '
                              'the "fit" method.', RuntimeWarning)
            else:
                fit_params = self.fit_params
        estimator = self.estimator
        cv = check_cv(self.cv, y, classifier=is_classifier(estimator))

        scorers, self.multimetric_ = _check_multimetric_scoring(
            self.estimator, scoring=self.scoring)

        if self.multimetric_:
            if self.refit is not False and (
                    not isinstance(self.refit, six.string_types) or
                    # This will work for both dict / list (tuple)
                    self.refit not in scorers):
                raise ValueError("For multi-metric scoring, the parameter "
                                 "refit must be set to a scorer key "
                                 "to refit an estimator with the best "
                                 "parameter setting on the whole data and "
                                 "make the best_* attributes "
                                 "available for that metric. If this is not "
                                 "needed, refit should be set to False "
                                 "explicitly. %r was passed." % self.refit)
            else:
                refit_metric = self.refit
        else:
            refit_metric = 'score'

        X, y, groups = indexable(X, y, groups)
        n_splits = cv.get_n_splits(X, y, groups)

        base_estimator = clone(self.estimator)

        parallel = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                            pre_dispatch=self.pre_dispatch)

        fit_and_score_kwargs = dict(scorer=scorers,
                                    fit_params=fit_params,
                                    return_train_score=self.return_train_score,
                                    return_n_test_samples=True,
                                    return_times=True,
                                    return_parameters=False,
                                    return_estimator=self.return_estimator,
                                    error_score=self.error_score,
                                    verbose=self.verbose)
        results_container = [{}]
        with parallel:
            all_candidate_params = []
            all_out = []
            all_estimators = []

            def evaluate_candidates(candidate_params):
                candidate_params = list(candidate_params)
                n_candidates = len(candidate_params)

                def _fit_and_score_recv(i_fold, X, y, train, test, parameters):
                    # Build the delayed fit-and-score task for one
                    # (candidate, fold) pair, pointing the estimator clone at
                    # a per-fold cache directory first.
                    current_estimator = clone(base_estimator)
                    if isinstance(current_estimator, Pipeline):
                        if hasattr(current_estimator._final_estimator, 'cachedir'):
                            current_estimator._final_estimator.cachedir = os.path.join(
                                self.cachedir, '%i_fold ' % i_fold)
                        else:
                            warnings.warn('Final estimator does not have recovery'
                                          ' or saving capabilities')
                    elif hasattr(current_estimator, 'cachedir'):
                        current_estimator.cachedir = os.path.join(
                            self.cachedir, '%i_fold ' % i_fold)
                    else:
                        warnings.warn('Estimator does not have recovery '
                                      ' or saving capabilities')
                    # Trace of every scheduled fit. Written in call form so
                    # the module parses on Python 3; a single parenthesised
                    # argument prints identically on Python 2.
                    print(parameters)
                    print(i_fold)

                    return delayed(_fit_and_score)(current_estimator,
                                                   X, y,
                                                   train=train, test=test,
                                                   parameters=parameters,
                                                   **fit_and_score_kwargs)

                # Splits are materialised once so that every candidate is
                # evaluated on the same numbered folds.
                list_split = list(enumerate(cv.split(X, y, groups)))
                if self.verbose > 0:
                    print("Fitting {0} folds for each of {1} candidates,"
                          " totalling {2} fits".format(
                              n_splits, n_candidates,
                              n_candidates * n_splits))

                if self.client is None:
                    out = parallel(_fit_and_score_recv(i_fold,
                                                       X, y,
                                                       train, test,
                                                       parameters)
                                   for (parameters, (i_fold, (train, test)))
                                   in product(candidate_params,
                                              list_split))
                else:
                    # NOTE(review): this branch hands one list of 4-tuples to
                    # a four-argument lambda, and _fit_and_score_recv returns
                    # a delayed task rather than a score -- verify against
                    # the client's ``map`` semantics before relying on it.
                    self.client[:].use_dill()

                    dview = self.client[:]
                    out = dview.map(
                        lambda parameters, i_fold, train, test:
                        _fit_and_score_recv(i_fold, X, y, train, test,
                                            parameters),
                        [(parameters, i_fold, train, test)
                         for (parameters, (i_fold, (train, test)))
                         in product(candidate_params, list_split)])

                if self.return_estimator:
                    # The fitted estimator is the last item of each result.
                    all_estimators.extend([out_set[-1] for out_set in out])
                    out = [out_set[:-1] for out_set in out]
                all_candidate_params.extend(candidate_params)
                all_out.extend(out)

                # XXX: When we drop Python 2 support, we can use nonlocal
                # instead of results_container
                results_container[0] = self._format_results(
                    all_candidate_params, scorers, n_splits, all_out)
                return results_container[0]

            self._run_search(evaluate_candidates)

        results = results_container[0]

        # For multi-metric evaluation, store the best_index_, best_params_ and
        # best_score_ iff refit is one of the scorer names
        # In single metric evaluation, refit_metric is "score"
        if self.refit or not self.multimetric_:
            self.best_index_ = results["rank_test_%s" % refit_metric].argmin()
            self.best_params_ = results["params"][self.best_index_]
            self.best_score_ = results["mean_test_%s" % refit_metric][
                self.best_index_]

        if self.refit:
            self.best_estimator_ = clone(base_estimator).set_params(
                **self.best_params_)
            refit_start_time = time.time()
            if y is not None:
                self.best_estimator_.fit(X, y, **fit_params)
            else:
                self.best_estimator_.fit(X, **fit_params)
            refit_end_time = time.time()
            self.refit_time_ = refit_end_time - refit_start_time

        # Store the only scorer not as a dict for single metric evaluation
        self.scorer_ = scorers if self.multimetric_ else scorers['score']

        self.cv_results_ = results
        self.n_splits_ = n_splits

        if self.return_estimator:
            self.cv_estimators = all_estimators

        return self
Exemple #14
0
def _binarize_labels(y, positive):
    """Return a copy of ``y`` set to 1 where ``positive`` is True, else 0."""
    y_bin = y.copy()
    y_bin[positive] = 1
    y_bin[~positive] = 0
    return y_bin


def exp(solvers, penalties, single_target, n_samples=30000, max_iter=20,
        dataset='rcv1', n_jobs=1, skip_slow=False):
    """Fit every (solver, penalty) pair on a dataset and dump the results.

    Each pair is fitted through a disk-cached ``fit_single`` call (cache
    stored under ``~/cache``) and the collected results are written as JSON
    to ``bench_saga.json`` in the current working directory.

    Parameters
    ----------
    solvers : iterable of str
        Solver names forwarded to ``fit_single``.
    penalties : iterable of str
        Penalty names forwarded to ``fit_single``.
    single_target : bool
        If True, the labels of the 'rcv1', 'digits' and '20newspaper'
        datasets are collapsed into a binary 0/1 target. The 'iris' labels
        are left untouched. The flag is also forwarded to ``fit_single``.
    n_samples : int, default=30000
        Only the first ``n_samples`` rows of the dataset are used.
    max_iter : int, default=20
        Forwarded to ``fit_single``.
    dataset : {'rcv1', 'digits', 'iris', '20newspaper'}, default='rcv1'
        Name of the dataset to load.
    n_jobs : int, default=1
        Number of jobs passed to ``Parallel``.
    skip_slow : bool, default=False
        Forwarded to ``fit_single``. When True, the ('lightning', 'l1')
        combination is left out of the saved results.

    Raises
    ------
    ValueError
        If ``dataset`` is not one of the supported names.
    """
    known_datasets = ('rcv1', 'digits', 'iris', '20newspaper')
    if dataset not in known_datasets:
        # Fail early with a clear message instead of an unbound-local error
        # on ``X`` further down.
        raise ValueError("Unknown dataset %r, expected one of %r."
                         % (dataset, known_datasets))

    # NOTE(review): recent joblib releases renamed ``cachedir`` to
    # ``location`` -- confirm against the joblib version in use.
    mem = Memory(cachedir=expanduser('~/cache'), verbose=0)

    if dataset == 'rcv1':
        rcv1 = fetch_rcv1()

        lbin = LabelBinarizer()
        lbin.fit(rcv1.target_names)

        X = rcv1.data
        y = rcv1.target
        y = lbin.inverse_transform(y)
        le = LabelEncoder()
        y = le.fit_transform(y)
        if single_target:
            y = _binarize_labels(y, y > 16)

    elif dataset == 'digits':
        digits = load_digits()
        X, y = digits.data, digits.target
        if single_target:
            y = _binarize_labels(y, y < 5)
    elif dataset == 'iris':
        iris = load_iris()
        X, y = iris.data, iris.target
    else:  # dataset == '20newspaper'
        ng = fetch_20newsgroups_vectorized()
        X = ng.data
        y = ng.target
        if single_target:
            # The previous code set ``y > 4`` to 1 and then ``y <= 16`` to 0,
            # so classes 5..16 were overwritten back to 0. Use one threshold
            # so both masks partition the labels.
            y = _binarize_labels(y, y > 4)

    X = X[:n_samples]
    y = y[:n_samples]

    # Materialise the grid once so jobs and results are paired by position.
    penalties = list(penalties)
    combos = [(solver, penalty)
              for solver in solvers
              for penalty in penalties]

    cached_fit = mem.cache(fit_single)
    out = Parallel(n_jobs=n_jobs, mmap_mode=None)(
        delayed(cached_fit)(solver, X, y,
                            penalty=penalty, single_target=single_target,
                            C=1, max_iter=max_iter, skip_slow=skip_slow)
        for solver, penalty in combos)

    res = []
    for (solver, penalty), fit_out in zip(combos, out):
        if skip_slow and solver == 'lightning' and penalty == 'l1':
            # This combination is not reported when slow runs are skipped.
            continue
        # The first element (the fitted estimator) is not JSON-serialised.
        _, times, train_scores, test_scores, accuracies = fit_out
        res.append(dict(solver=solver, penalty=penalty,
                        single_target=single_target,
                        times=times, train_scores=train_scores,
                        test_scores=test_scores,
                        accuracies=accuracies))

    with open('bench_saga.json', 'w') as f:
        json.dump(res, f)
Exemple #15
0
def lasso_stability_path(X,
                         y,
                         scaling=0.5,
                         random_state=None,
                         n_resampling=200,
                         n_grid=100,
                         sample_fraction=0.75,
                         eps=4 * np.finfo(float).eps,
                         n_jobs=None,
                         verbose=False):
    """Stability path based on randomized Lasso estimates.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        training data.

    y : array-like, shape = [n_samples]
        target values.

    scaling : float, optional, default=0.5
        The alpha parameter in the stability selection article used to
        randomly scale the features. Should be between 0 and 1.

    random_state : int, RandomState instance or None, optional, default=None
        The generator used to randomize the design.  If int, random_state is
        the seed used by the random number generator; If RandomState instance,
        random_state is the random number generator; If None, the random number
        generator is the RandomState instance used by `np.random`.

    n_resampling : int, optional, default=200
        Number of randomized models.

    n_grid : int, optional, default=100
        Number of grid points. The path is linearly reinterpolated
        on a grid between 0 and 1 before computing the scores.

    sample_fraction : float, optional, default=0.75
        The fraction of samples to be used in each randomized design.
        Should be between 0 and 1. If 1, all samples are used.

    eps : float, optional
        Smallest value of alpha / alpha_max considered

    n_jobs : int or None, optional (default=None)
        Number of CPUs to use during the resampling.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    verbose : boolean or integer, optional
        Sets the verbosity amount

    Returns
    -------
    alphas_grid : array, shape ~ [n_grid]
        The grid points between 0 and 1: alpha/alpha_max

    scores_path : array, shape = [n_features, n_grid]
        The scores for each feature along the path.

    Raises
    ------
    ValueError
        If ``scaling`` is not strictly between 0 and 1.
    """
    X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'])
    rng = check_random_state(random_state)

    if not (0 < scaling < 1):
        raise ValueError("Parameter 'scaling' should be between 0 and 1."
                         " Got %r instead." % scaling)

    n_samples, n_features = X.shape

    # Each resampling draws a random row mask and random per-feature weights
    # (either 1 or 1 - scaling) from the shared random state.
    paths = Parallel(n_jobs=n_jobs, verbose=verbose)(
        delayed(_lasso_stability_path)(
            X,
            y,
            mask=rng.rand(n_samples) < sample_fraction,
            weights=1. - scaling * rng.randint(0, 2, size=(n_features, )),
            eps=eps) for _ in range(n_resampling))

    # Union of every alpha seen across the resampled paths, in sorted order.
    all_alphas = sorted(
        set(itertools.chain.from_iterable(p[0] for p in paths)))
    # Take approximately n_grid values
    stride = max(1, int(len(all_alphas) / float(n_grid)))
    all_alphas = all_alphas[::stride]
    if all_alphas[-1] != 1:
        all_alphas.append(1.)
    all_alphas = np.array(all_alphas)
    scores_path = np.zeros((n_features, len(all_alphas)))

    for alphas, coefs in paths:
        if alphas[0] != 0:
            # Pad on the left with alpha = 0 and non-zero coefficients.
            alphas = np.r_[0, alphas]
            coefs = np.c_[np.ones((n_features, 1)), coefs]
        if alphas[-1] != all_alphas[-1]:
            # Pad on the right with the last grid point and zero coefficients.
            alphas = np.r_[alphas, all_alphas[-1]]
            coefs = np.c_[coefs, np.zeros((n_features, 1))]
        # Count, per feature and grid point, whether the coefficient is
        # non-zero after nearest-neighbour interpolation onto the grid.
        scores_path += (interp1d(alphas,
                                 coefs,
                                 kind='nearest',
                                 bounds_error=False,
                                 fill_value=0,
                                 axis=-1)(all_alphas) != 0)

    # Turn the counts into a fraction of the resampled models.
    scores_path /= n_resampling
    return all_alphas, scores_path