Example #1
    def fit(self, X, y):
        self.clusterer_ = clone(self.clusterer)
        for i in range(10):
            clusters = self.clusterer_.fit_predict(X)
            cluster_ids = np.unique(clusters)

            if len(cluster_ids) == self.clusterer_.n_clusters:
                # Success!
                break
            else:
                assert i != 9, \
                    "MBC: clustering failed after 10 attempts - some clusters have no data.\n" + \
                    "    Probably too little data available: " + \
                    "Only {n} data points for {k} clusters.".format(
                        n=X.shape[0], k=self.clusterer_.n_clusters)
                logger.warning("MBC: Clustering failed, trying again")

        self.estimators_ = {}
        for c in cluster_ids:
            mask = clusters == c
            est = clone(self.estimator)
            est.fit(X[safe_mask(X, mask)], y[safe_mask(y, mask)])
            self.estimators_[c] = est

        return self
Example #2
def test_safe_mask():
    random_state = check_random_state(0)
    X = random_state.rand(5, 4)
    X_csr = sp.csr_matrix(X)
    mask = [False, False, True, True, True]

    mask = safe_mask(X, mask)
    assert_equal(X[mask].shape[0], 3)

    mask = safe_mask(X_csr, mask)
    assert_equal(X_csr[mask].shape[0], 3)
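For reference, here is a minimal sketch of what sklearn.utils.safe_mask itself does (assuming a recent scikit-learn): boolean masks pass through unchanged for dense arrays, but are converted to integer indices for sparse matrices, whose older scipy implementations do not accept boolean row masks.

import numpy as np
import scipy.sparse as sp
from sklearn.utils import safe_mask

X = np.arange(20).reshape(5, 4)
X_csr = sp.csr_matrix(X)
mask = np.array([False, False, True, True, True])

# Dense input: the boolean mask is returned as-is.
print(safe_mask(X, mask))      # [False False  True  True  True]
# Sparse input: the mask is converted to integer row indices.
print(safe_mask(X_csr, mask))  # [2 3 4]

assert X[safe_mask(X, mask)].shape[0] == 3
assert X_csr[safe_mask(X_csr, mask)].shape[0] == 3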
Example #4
 def fit(self, X, y):
     self.clusterer_ = clone(self.clusterer)
     clusters = self.clusterer_.fit_predict(X)
     n_clusters = len(np.unique(clusters))
     self.estimators_ = []
     for c in range(n_clusters):
         mask = clusters == c
         est = clone(self.estimator)
         est.fit(X[safe_mask(X, mask)], y[safe_mask(y, mask)])
         self.estimators_.append(est)
     return self
Example #5
def dci_skeletons_bootstrap_multiple(
        X1,
        X2,
        alpha_skeleton_grid: list = [0.1, 0.5],
        max_set_size: int = 3,
        difference_ug: list = None,
        nodes_cond_set: set = None,
        edge_threshold: float = 0.05,
        sample_fraction: float = 0.7,
        n_bootstrap_iterations: int = 50,
        alpha_ug: float = 1.,
        max_iter: int = 1000,
        n_jobs: int = 1,
        random_state: int = None,
        verbose: int = 0,
        lam: float = 0,
        true_diff: Optional[Set] = None
):
    if difference_ug is None or nodes_cond_set is None:
        difference_ug, nodes_cond_set = dci_undirected_graph(
            X1,
            X2,
            alpha=alpha_ug,
            max_iter=max_iter,
            edge_threshold=edge_threshold,
            verbose=verbose
        )
        if verbose > 0: print(f"{len(difference_ug)} edges in the difference UG, over {len(nodes_cond_set)} nodes")

    bootstrap_samples1 = bootstrap_generator(n_bootstrap_iterations, sample_fraction, X1, random_state=random_state)
    bootstrap_samples2 = bootstrap_generator(n_bootstrap_iterations, sample_fraction, X2, random_state=random_state)

    bootstrap_results = Parallel(n_jobs, verbose=verbose)(
        delayed(dci_skeleton_multiple)(
            X1[safe_mask(X1, subsample1), :],
            X2[safe_mask(X2, subsample2), :],
            alpha_skeleton_grid=alpha_skeleton_grid,
            max_set_size=max_set_size,
            difference_ug=difference_ug,
            nodes_cond_set=nodes_cond_set,
            verbose=verbose,
            lam=lam, true_diff=true_diff)
        for subsample1, subsample2 in zip(bootstrap_samples1, bootstrap_samples2))

    p = X1.shape[1]
    alpha2adjacency = {alpha: np.zeros([p, p]) for alpha in alpha_skeleton_grid}
    for res in bootstrap_results:
        for alpha in alpha_skeleton_grid:
            alpha2adjacency[alpha] += 1 / n_bootstrap_iterations * edges2adjacency(X1.shape[1], res[alpha],
                                                                                   undirected=True)

    return bootstrap_results, alpha2adjacency
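Examples #5, #20 and #37 all use the same idiom: draw integer subsample indices per bootstrap iteration, then row-index each data matrix through safe_mask so the code works for dense and sparse inputs alike. A minimal sketch of that idiom, with a hypothetical bootstrap_indices helper standing in for the bootstrap_generator used above:

import numpy as np
from sklearn.utils import safe_mask

def bootstrap_indices(n_iterations, sample_fraction, n_samples, random_state=0):
    # Hypothetical stand-in for bootstrap_generator: yields integer
    # index arrays, each selecting a random subsample of the rows.
    rng = np.random.RandomState(random_state)
    n_sub = int(sample_fraction * n_samples)
    for _ in range(n_iterations):
        yield rng.choice(n_samples, size=n_sub, replace=False)

X = np.random.rand(100, 5)
for subsample in bootstrap_indices(3, 0.7, X.shape[0]):
    X_sub = X[safe_mask(X, subsample), :]  # row-subsample, dense or sparse
    print(X_sub.shape)                     # (70, 5)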
Example #6
def test_select_percentile_classif_sparse():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple classification problem
    # with the percentile heuristic
    X, y = make_classification(n_samples=200,
                               n_features=20,
                               n_informative=3,
                               n_redundant=2,
                               n_repeated=0,
                               n_classes=8,
                               n_clusters_per_class=1,
                               flip_y=0.0,
                               class_sep=10,
                               shuffle=False,
                               random_state=0)
    X = sparse.csr_matrix(X)
    univariate_filter = SelectPercentile(f_classif, percentile=25)
    X_r = univariate_filter.fit(X, y).transform(X)
    X_r2 = GenericUnivariateSelect(f_classif, mode='percentile',
                                   param=25).fit(X, y).transform(X)
    assert_array_equal(X_r.toarray(), X_r2.toarray())
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)

    X_r2inv = univariate_filter.inverse_transform(X_r2)
    assert_true(sparse.issparse(X_r2inv))
    support_mask = safe_mask(X_r2inv, support)
    assert_equal(X_r2inv.shape, X.shape)
    assert_array_equal(X_r2inv[:, support_mask].toarray(), X_r.toarray())
    # Check other columns are empty
    assert_equal(X_r2inv.getnnz(), X_r.getnnz())
Example #7
    def transform(self, dataset):
        """
        Reimplemented here instead of importing SelectorMixin from
        sklearn.feature_selection.base, since SelectorMixin is now part of
        the private API there.

        Reduce dataset to the selected features.

        Parameters
        ----------
        dataset : array of shape [n_samples, n_features]
            The input samples.

        Returns
        -------
        dataset_r : array of shape [n_samples, n_selected_features]
            The input samples with only the selected features.
        """

        dataset = check_array(dataset,
                              dtype=None,
                              accept_sparse='csr',
                              force_all_finite=True)
        mask = self.get_support()
        if not mask.any():
            warn(
                "No features were selected: either the data is"
                " too noisy or the selection test too strict.", UserWarning)
            return np.empty(0).reshape((dataset.shape[0], 0))
        if len(mask) != dataset.shape[1]:
            raise ValueError(
                "dataset has a different shape than during fitting.")
        return dataset[:, safe_mask(dataset, mask)]
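The note in the docstring above is why transform is reimplemented by hand. As a hedged alternative, recent scikit-learn releases expose the mixin publicly, so the same transform behavior can be inherited instead; a sketch (the MaskSelector class is hypothetical, assuming scikit-learn >= 0.24 where SelectorMixin is public):

import numpy as np
from sklearn.base import BaseEstimator
from sklearn.feature_selection import SelectorMixin

class MaskSelector(SelectorMixin, BaseEstimator):
    """Toy selector that keeps a fixed boolean mask of features."""

    def __init__(self, mask):
        self.mask = mask

    def fit(self, X, y=None):
        # SelectorMixin derives transform/inverse_transform from this mask.
        self.mask_ = np.asarray(self.mask, dtype=bool)
        return self

    def _get_support_mask(self):
        return self.mask_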
Example #8
 def transform(self, X):
     """Transform a new matrix using the selected features"""
     mask = self.get_support()
     X = check_array(X)
     if len(mask) != X.shape[1]:
         raise ValueError("X has a different shape than during fitting.")
     return X[:, safe_mask(X, mask)]
Example #9
    def transform(self, X):
        """Reduce X to the selected features.

        Parameters
        ----------
        X : array of shape [n_samples, n_features]
            The input samples.

        Returns
        -------
        X_r : array of shape [n_samples, n_selected_features]
            The input samples with only the selected features.
        """
        masks = self.get_support()
        X_r = dict()
        for roi_id in self.roi_id_valid:
            mask = masks[roi_id]
            if len(mask) != X[roi_id].shape[1]:
                raise ValueError("Roi %g has a different shape than during fitting." % roi_id)
            if not mask.any():
                warn("No features were selected in roi %g: either the data is"
                     " too noisy or the selection test too strict." % roi_id,
                     UserWarning)
                X_r[roi_id] = np.empty(0).reshape((X[roi_id].shape[0], 0))
            else:
                X_r[roi_id] = X[roi_id][:, safe_mask(X[roi_id], mask)]
        return X_r
Example #10
def _randomized_logistic(X, y, weights, mask, C=1., verbose=False,
                         fit_intercept=True, tol=1e-3):
    X = X[safe_mask(X, mask)]
    y = y[mask]
    if issparse(X):
        size = len(weights)
        weight_dia = sparse.dia_matrix((1 - weights, 0), (size, size))
        X = X * weight_dia
    else:
        X *= (1 - weights)

    C = np.atleast_1d(np.asarray(C, dtype=np.float64))
    if C.ndim > 1:
        raise ValueError("C should be 1-dimensional array-like, "
                         "but got a {}-dimensional array-like instead: {}."
                         .format(C.ndim, C))

    scores = np.zeros((X.shape[1], len(C)), dtype=bool)

    for this_C, this_scores in zip(C, scores.T):
        # XXX : would be great to do it with a warm_start ...
        clf = LogisticRegression(C=this_C, tol=tol, penalty='l1', dual=False,
                                 fit_intercept=fit_intercept,
                                 solver='liblinear', multi_class='ovr')
        clf.fit(X, y)
        this_scores[:] = np.any(
            np.abs(clf.coef_) > 10 * np.finfo(np.float64).eps, axis=0)
    return scores
Example #12
def f_classifNumba(X, y):
    """Compute the ANOVA F-value for the provided sample.

    Read more in the :ref:`User Guide <univariate_feature_selection>`.

    Parameters
    ----------
    X : {array-like, sparse matrix} shape = [n_samples, n_features]
        The set of regressors that will be tested sequentially.

    y : array of shape (n_samples,)
        The target vector.

    Returns
    -------
    F : array, shape = [n_features,]
        The set of F values.

    pval : array, shape = [n_features,]
        The set of p-values.

    See also
    --------
    chi2: Chi-squared stats of non-negative features for classification tasks.
    f_regression: F-value between label/feature for regression tasks.
    """
    X, y = check_X_y(X, y, ['csr', 'csc', 'coo'])
    args = [X[safe_mask(X, y == k)] for k in np.unique(y)]
    return f_onewayNumba(*args)
Example #14
def _randomized_lasso(X, y, weights, mask, alpha=1., verbose=False,
                      precompute=False, eps=np.finfo(np.float64).eps,
                      max_iter=500):
    X = X[safe_mask(X, mask)]
    y = y[mask]

    # Center X and y to avoid fit the intercept
    X -= X.mean(axis=0)
    y -= y.mean()

    alpha = np.atleast_1d(np.asarray(alpha, dtype=np.float64))

    X = (1 - weights) * X
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', ConvergenceWarning)
        alphas_, _, coef_ = lars_path(X, y,
                                      Gram=precompute, copy_X=False,
                                      copy_Gram=False, alpha_min=np.min(alpha),
                                      method='lasso', verbose=verbose,
                                      max_iter=max_iter, eps=eps)

    if len(alpha) > 1:
        if len(alphas_) > 1:  # np.min(alpha) < alpha_min
            interpolator = interp1d(alphas_[::-1], coef_[:, ::-1],
                                    bounds_error=False, fill_value=0.)
            scores = (interpolator(alpha) != 0.0)
        else:
            scores = np.zeros((X.shape[1], len(alpha)), dtype=bool)
    else:
        scores = coef_[:, -1] != 0.0
    return scores
Example #15
def _lasso_stability_path(X, y, mask, weights, eps):
    "Inner loop of lasso_stability_path"
    X = X * weights[np.newaxis, :]
    X = X[safe_mask(X, mask), :]
    y = y[mask]

    alpha_max = np.max(np.abs(np.dot(X.T, y))) / X.shape[0]
    alpha_min = eps * alpha_max  # set for early stopping in path
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', ConvergenceWarning)
        alphas, _, coefs = lars_path(X, y, method='lasso', verbose=False,
                                     alpha_min=alpha_min)
    # Scale alpha by alpha_max
    alphas /= alphas[0]
    # Sort alphas in ascending order
    alphas = alphas[::-1]
    coefs = coefs[:, ::-1]
    # Get rid of the alphas that are too small
    mask = alphas >= eps
    # We also want to keep the first one: it should be close to the OLS
    # solution
    mask[0] = True
    alphas = alphas[mask]
    coefs = coefs[:, mask]
    return alphas, coefs
Example #16
    def transform(self, X, threshold=None):
        """Reduce X to the selected features.

        Parameters
        ----------
        X : array of shape [n_samples, n_features]
            The input samples.

        threshold : float
            Threshold defining the minimum cutoff value for the
            stability scores.

        Returns
        -------
        X_r : array of shape [n_samples, n_selected_features]
            The input samples with only the selected features.
        """
        check_is_fitted(self, 'stability_scores_')

        X = check_array(X, accept_sparse='csr')
        mask = self.get_support(threshold=threshold)

        if len(mask) != X.shape[1]:
            raise ValueError("X has a different shape than during fitting.")

        if not mask.any():
            warn(
                "No features were selected: either the data is"
                " too noisy or the selection test too strict.", UserWarning)
            return np.empty(0).reshape((X.shape[0], 0))

        return X[:, safe_mask(X, mask)]
Example #17
 def KruskalWallisAnalysis(self, array, label):
     args = [array[safe_mask(array, label == k)] for k in np.unique(label)]
     neg, pos = args[0], args[1]
     f_list, p_list = [], []
     for index in range(array.shape[1]):
         f, p = kruskal(neg[:, index], pos[:, index])
         f_list.append(f), p_list.append(p)
     return np.array(f_list), np.array(p_list)
Example #18
    def fit(self, X, y):
        n_samples = X.shape[0]
        rs = check_random_state(self.random_state)

        self.label_binarizer_ = LabelBinarizer(neg_label=-1, pos_label=1)
        Y = self.label_binarizer_.fit_transform(y)
        self.classes_ = self.label_binarizer_.classes_.astype(np.int32)
        n_vectors = Y.shape[1]

        dual_coef = np.zeros((n_vectors, n_samples), dtype=np.float64)

        warm_start = False
        if self.warm_start and self.dual_coef_ is not None:
            warm_start = True
            dual_coef[:, self.support_indices_] = self.dual_coef_
        else:
            self.intercept_ = np.zeros((n_vectors,), dtype=np.float64)

        self.dual_coef_ = dual_coef

        kernel = self._get_kernel()
        kcache = KernelCache(kernel, n_samples, self.cache_mb, 1, self.verbose)
        self.support_vectors_ = X

        for i in range(n_vectors):
            b = _lasvm(
                self,
                self.dual_coef_[i],
                X,
                Y[:, i],
                kcache,
                self.selection,
                self.search_size,
                self.termination,
                self.sv_upper_bound,
                self.tau,
                self.finish_step,
                self.C,
                self.max_iter,
                rs,
                self.callback,
                verbose=self.verbose,
                warm_start=warm_start,
            )
            self.intercept_[i] = b

        sv = np.sum(self.dual_coef_ != 0, axis=0, dtype=bool)
        self.support_indices_ = np.arange(n_samples)[sv]

        if self.kernel != "precomputed":
            self.dual_coef_ = np.ascontiguousarray(self.dual_coef_[:, sv])
            mask = safe_mask(X, sv)
            self.support_vectors_ = X[mask]

        if self.verbose >= 1:
            print "Number of support vectors:", np.sum(sv)

        return self
Example #19
    def fit(self, X, y):
        self.clusterer_ = clone(self.clusterer)
        clusters = self.clusterer_.fit_predict(X)
        cluster_ids = np.unique(clusters)

        assert (len(cluster_ids) == self.clusterer_.n_clusters), \
            "MBC: Some clusters have no data. Probably too little data available: " + \
            "Only {n} data points for {k} clusters.".format(
                n=X.shape[0], k=self.clusterer_.n_clusters)

        self.estimators_ = {}
        for c in cluster_ids:
            mask = clusters == c
            est = clone(self.estimator)
            est.fit(X[safe_mask(X, mask)], y[safe_mask(y, mask)])
            self.estimators_[c] = est

        return self
Example #20
    def fit(self, X, y):
        """Fit the stability selection model on the given data.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            The training input samples.

        y : array-like, shape = [n_samples]
            The target values.
        """

        self._validate_input()

        X, y = check_X_y(X, y, accept_sparse='csr')

        n_samples, n_variables = X.shape
        n_subsamples = np.floor(self.sample_fraction * n_samples).astype(int)
        n_lambdas = self.lambda_grid.shape[0]

        base_estimator = clone(self.base_estimator)
        random_state = check_random_state(self.random_state)
        stability_scores = np.zeros((n_variables, n_lambdas))

        for idx, lambda_value in enumerate(self.lambda_grid):
            if self.verbose > 0:
                print(
                    "Fitting estimator for lambda = %.5f (%d / %d) on %d bootstrap samples"
                    % (lambda_value, idx + 1, n_lambdas,
                       self.n_bootstrap_iterations))

            bootstrap_samples = _bootstrap_generator(
                self.n_bootstrap_iterations,
                self.bootstrap_func,
                y,
                n_subsamples,
                random_state=random_state)

            selected_variables = Parallel(
                n_jobs=self.n_jobs,
                verbose=self.verbose,
                pre_dispatch=self.pre_dispatch)(
                    delayed(_fit_bootstrap_sample)(
                        clone(base_estimator),
                        X=X[safe_mask(X, subsample), :],
                        y=y[subsample],
                        lambda_name=self.lambda_name,
                        lambda_value=lambda_value,
                        threshold=self.bootstrap_threshold)
                    for subsample in bootstrap_samples)

            stability_scores[:, idx] = np.vstack(selected_variables).mean(axis=0)

        self.stability_scores_ = stability_scores
        return self
Example #21
def _cross_val_score(estimator, X, y, scorer, train, test, verbose, fit_params):
    """Inner loop for cross validation"""
    n_samples = X.shape[0] if sp.issparse(X) else len(X)
    fit_params = dict(
        [
            (k, np.asarray(v)[train] if hasattr(v, "__len__") and len(v) == n_samples else v)
            for k, v in fit_params.items()
        ]
    )
    if not hasattr(X, "shape"):
        if getattr(estimator, "_pairwise", False):
            raise ValueError(
                "Precomputed kernels or affinity matrices have " "to be passed as arrays or sparse matrices."
            )
        X_train = [X[idx] for idx in train]
        X_test = [X[idx] for idx in test]
    else:
        if getattr(estimator, "_pairwise", False):
            # X is a precomputed square kernel matrix
            if X.shape[0] != X.shape[1]:
                raise ValueError("X should be a square kernel matrix")
            X_train = X[np.ix_(train, train)]
            X_test = X[np.ix_(test, train)]
        else:
            X_train = X[safe_mask(X, train)]
            X_test = X[safe_mask(X, test)]

    if y is None:
        y_train = None
        y_test = None
    else:
        y_train = y[train]
        y_test = y[test]
    estimator.fit(X_train, y_train, **fit_params)
    if scorer is None:
        score = estimator.score(X_test, y_test)
    else:
        score = scorer(estimator, X_test, y_test)
        if not isinstance(score, numbers.Number):
            raise ValueError("scoring must return a number, got %s (%s)" " instead." % (str(score), type(score)))
    if verbose > 1:
        print("score: %f" % score)
    return score
Example #22
def kruskal_classif(X, y):
    ret_k = []
    ret_p = []

    for column in X:
        args = [X[safe_mask(X, y == k)][column] for k in np.unique(y)]
        r = kruskal(*args)
        ret_k.append(abs(r[0]))
        ret_p.append(r[1])
    return np.asanyarray(ret_k), np.asanyarray(ret_p)
Example #23
    def fit(self, X, y):
        self.clusterer_ = clone(self.clusterer)

        min_cluster_size = 5 * X.shape[1]
        assert X.shape[0] >= min_cluster_size * self.clusterer_.n_clusters, \
            "MBC: clustering not possible, n_samples ({ns}) is less than " + \
            "5 * n_features * n_clusters (3 * {nf} * {nc} = {nmin})".format(
                ns=X.shape[0], nf=X.shape[1], nc=self.clusterer_.n_clusters,
                nmin=min_cluster_size * self.clusterer_.n_clusters)

        for i in range(10):
            # We try 10 times
            clusters = self.clusterer_.fit_predict(X)
            cluster_ids = np.unique(clusters)

            if len(cluster_ids) != self.clusterer_.n_clusters:
                logger.warning(
                    "MBC: Clustering failed - empty clusters, trying again")
            elif np.min(np.bincount(clusters)) < min_cluster_size:
                # Require a minimum number of samples per cluster
                logger.warning(
                    "MBC: Clustering failed - clusters too small, trying again"
                )
            else:
                # Success!
                break

            # Fail completely after 10 attempts
            assert i != 9, \
                "MBC: clustering failed after 10 attempts - some clusters have no data.\n" + \
                "    Probably too little data available: " + \
                "Only {n} data points for {k} clusters (abs min = {nmin}).".format(
                    n=X.shape[0], k=self.clusterer_.n_clusters,
                    nmin=min_cluster_size * self.clusterer_.n_clusters)

        self.estimators_ = {}
        for c in cluster_ids:
            mask = clusters == c
            est = clone(self.estimator)
            est.fit(X[safe_mask(X, mask)], y[safe_mask(y, mask)])
            self.estimators_[c] = est

        return self
Example #24
def ttest_ind_classif(X, y):
    ret_k = []
    ret_p = []

    for column in X:
        args = [X[safe_mask(X, y == k)][column] for k in np.unique(y)]
        r = ttest_ind(*args, equal_var=False)
        ret_k.append(abs(r[0]))
        ret_p.append(r[1])
    return np.asanyarray(ret_k), np.asanyarray(ret_p)
Example #25
def levene_median(X, y):
    ret_k = []
    ret_p = []

    for column in X:
        args = [X[safe_mask(X, y == k)][column] for k in np.unique(y)]
        r = levene(args[0], args[1], center='median')
        ret_k.append(abs(r[0]))
        ret_p.append(r[1])
    return np.asanyarray(ret_k), np.asanyarray(ret_p)
Example #26
def _check_prediction(estimator, X, y, ids, train, test):
    if not hasattr(X, "shape"):
        if getattr(estimator, "_pairwise", False):
            raise ValueError("Precomputed kernels or affinity matrices have "
                             "to be passed as arrays or sparse matrices.")
        X_train = [X[idx] for idx in train]
        X_test = [X[idx] for idx in test]
    else:
        if getattr(estimator, "_pairwise", False):
            # X is a precomputed square kernel matrix
            if X.shape[0] != X.shape[1]:
                raise ValueError("X should be a square kernel matrix")
            X_train = X[np.ix_(train, train)]
            X_test = X[np.ix_(test, train)]
        else:
            X_train = X[safe_mask(X, train)]
            X_test = X[safe_mask(X, test)]

    y_train = y[train]
    y_test = y[test]
    ids_test = [ids[idx] for idx in test]

    y_test_n = sum(y_test)
    print('ids_test', len(ids_test))
    print('y_test_n', y_test_n)

    estimator.fit(X_train, y_train)
    predicted_scores = estimator.predict_proba(X_test).T[1]

    errors = []
    processed = 0
    # for pred_score, id_test, y_test_i in sorted(zip(predicted_scores, ids_test, y_test), reverse=True)[:y_test_n]:
    for pred_score, id_test, y_test_i in zip(predicted_scores, ids_test, y_test):
        processed += 1
        errors.append((pred_score, id_test))
        #if y_test_i == 0:
        #    errors.append((pred_score, id_test))

    print('errors', len(errors))
    print('processed', processed)

    return errors
Example #27
    def _post_process(self, X):
        # We can't know the support vectors when using precomputed kernels.
        if self.kernel != "precomputed":
            sv = np.sum(self.coef_ != 0, axis=0, dtype=bool)
            self.coef_ = np.ascontiguousarray(self.coef_[:, sv])
            mask = safe_mask(X, sv)
            self.support_vectors_ = np.ascontiguousarray(X[mask])
            self.support_indices_ = np.arange(X.shape[0], dtype=np.int32)[sv]

            if self.verbose >= 1:
                print("Number of support vectors:", np.sum(sv))
Example #28
def meta_cross_val_score(estimator, Xs, y, scorer, train, test):
    Xs_train = []
    Xs_test = []
    for X in Xs:
        X_train = X[safe_mask(X, train)]
        X_test = X[safe_mask(X, test)]
        Xs_train.append(X_train)
        Xs_test.append(X_test)

    y_train = y[train]
    y_test = y[test]

    estimator.fit(Xs_train, y_train)

    if scorer is None:
        score = estimator.score(Xs_test, y_test)
    else:
        score = scorer(estimator, Xs_test, y_test)

    return score
Example #29
 def predict(self, X):
     clusters = self.clusterer_.predict(X)
     y_tmp = []
     idx = []
     for c, est in enumerate(self.estimators_):
         mask = clusters == c
         idx.append(np.flatnonzero(mask))
         y_tmp.append(est.predict(X[safe_mask(X, mask)]))
     y_tmp = np.concatenate(y_tmp)
     idx = np.concatenate(idx)
     y = np.empty_like(y_tmp)
     y[idx] = y_tmp
     return y
Example #30
    def transform(self, X, y=None):
        """Reduce X to the selected features.

        Parameters
        ----------
        X : ndarray of shape [n_samples, n_features]
            The input samples.
        y : ignored

        Returns
        -------
        X_r : ndarray
            The selected subset of the input.
        """

        if len(X.shape) == 1:
            X = X.reshape(-1, 1)

        mask = self.get_support()

        # note: we use _safe_tags instead of _get_tags because this is a
        # public Mixin.
        X = self._validate_data(
            X,
            dtype=None,
            accept_sparse="csr",
            force_all_finite=not _safe_tags(self, key="allow_nan"),
            reset=False,
            ensure_2d=self._axis,
        )

        if len(mask) != X.shape[self._axis]:
            raise ValueError("X has a different shape than during fitting.")

        if self._axis == 1:
            return X[:, safe_mask(X, mask)]
        else:
            return X[safe_mask(X, mask)]
Example #31
    def _post_process(self, X):
        # We can't know the support vectors when using precomputed kernels.
        if self.kernel != "precomputed":
            sv = np.sum(self.coef_ != 0, axis=0, dtype=bool)
            if np.sum(sv) > 0:
                self.coef_ = np.ascontiguousarray(self.coef_[:, sv])
                mask = safe_mask(X, sv)
                self.support_vectors_ = np.ascontiguousarray(X[mask])
                self.support_indices_ = np.arange(X.shape[0],
                                                  dtype=np.int32)[sv]
                self.n_samples_ = X.shape[0]

            if self.verbose >= 1:
                print "Number of support vectors:", np.sum(sv)
Example #32
 def predict(self, X):
     """Transform X separately by each transformer, concatenate results.
     Parameters
     ----------
     X : iterable or array-like, depending on transformers
         Input data to be transformed.
     Returns
     -------
     X_t : array-like or sparse matrix, shape (n_samples, sum_n_components)
         hstack of results of transformers. sum_n_components is the
         sum of n_components (output dimension) over transformers.
     """
     Xs = self.feature_union_apt.transform(X)
     Xp = Xs.loc[:, safe_mask(Xs, self.fmask_apt)]
     return self.estimator.predict(Xp)
Example #33
    def transform(self, X):

        tags = self._get_tags()
        X = check_array(X,
                        dtype=None,
                        accept_sparse='csr',
                        force_all_finite=not tags.get('allow_nan', True))
        mask = self.get_support()
        if not mask.any():
            warn(
                "No features were selected: either the data is"
                " too noisy or the selection test too strict.", UserWarning)
            return np.empty(0).reshape((X.shape[0], 0))
        if len(mask) != X.shape[1]:
            raise ValueError("X has a different shape than during fitting.")
        return X[:, safe_mask(X, mask)]
Example #34
    def predict(self, X):
        # This returns -1 if any of the squared values are too large;
        # models with numerical instability will fail.
        clusters = self.clusterer_.predict(X)

        y_tmp = []
        idx = []
        for c, est in self.estimators_.items():
            mask = clusters == c
            if mask.any():
                idx.append(np.flatnonzero(mask))
                y_tmp.append(est.predict(X[safe_mask(X, mask)]))

        y_tmp = np.concatenate(y_tmp)
        idx = np.concatenate(idx)
        y = np.full([X.shape[0], y_tmp.shape[1]], np.nan)
        y[idx] = y_tmp

        return y
Example #35
    def transform(self, X, exposure=None):
        """Reduce X to the selected features.

        Parameters
        ----------
        X : array of shape [n_samples, n_features]
            The input samples.

        Returns
        -------
        X_r : array of shape [n_samples, n_selected_features]
            The input samples with only the selected features.
        """
        mask = self.get_support()
        if not mask.any():
            return np.ones(shape=(X.shape[0], 1))
        if len(mask) != X.shape[1]:
            raise ValueError("X has a different shape than during fitting.")
        return X[:, safe_mask(X, mask)]
Example #36
	def transform(self, X):
		"""Reduce X to the selected features.

		Parameters
		----------
		X : array of shape [n_samples, n_features]
			The input samples.

		Returns
		-------
		X_r : array of shape [n_samples, n_selected_features]
			The input samples with only the selected features.
		"""
		X = check_array(X, dtype=None, accept_sparse='csr')
		mask = self.get_support()
		if not mask.any():
			return numpy.empty(0).reshape((X.shape[0], 0))
		if len(mask) != X.shape[1]:
			raise ValueError("X has a different shape than during fitting.")
		return X[:, safe_mask(X, mask)]
Example #37
    def wrapped_method(*args,
                       parameter_grid: list,
                       n_jobs: int = 1,
                       n_bootstrap_iterations: int = 50,
                       random_state: int = 0,
                       sample_fraction: float = 0.7,
                       verbose: bool = False,
                       bootstrap_threshold: float = 0.5,
                       **kwargs):
        parameters2results = dict()

        for parameters in parameter_grid:
            arg2subsamples_list = []
            for X in args:
                nsamples = X.shape[0]
                bootstrap_samples = list(
                    bootstrap_generator(n_bootstrap_iterations,
                                        sample_fraction,
                                        nsamples,
                                        random_state=random_state))
                arg2subsamples_list.append(bootstrap_samples)

            bootstrap_results = Parallel(n_jobs, verbose=verbose)(
                delayed(method)(*(
                    arg[safe_mask(arg, subsample), :]
                    for arg, subsample in zip(args, arg2subsamples)
                ), **kwargs, **parameters)
                for arg2subsamples in zip(*arg2subsamples_list))

            parameters2results[frozendict(parameters)] = bootstrap_results

        stable_results = set()
        for param, results in parameters2results.items():
            counter = Counter()
            for result in results:
                counter.update(result[0])
            for item, count in counter.items():
                if count >= bootstrap_threshold * n_bootstrap_iterations:
                    stable_results.add(item)

        return stable_results, parameters2results
Example #38
    def transform(self, X):
        """Reduce X to the selected features.

        Parameters
        ----------
        X : array of shape [n_samples, n_features]
            The input samples.

        Returns
        -------
        X_r : array of shape [n_samples, n_selected_features]
            The input samples with only the selected features.
        """
        masks = self.get_support()
        X_r = dict()
        for roi_id in self.variances_.keys():
            mask = masks[roi_id]
            if len(mask) != X[roi_id].shape[1]:
                raise ValueError("Roi %g has a different shape than during fitting." % roi_id)
            X_r[roi_id] = X[roi_id][:, safe_mask(X[roi_id], mask)]
        return X_r
Example #40
def _randomized_logistic(X, y, weights, mask, C=1., verbose=False,
                         fit_intercept=True, tol=1e-3):
    X = X[safe_mask(X, mask)]
    y = y[mask]
    if issparse(X):
        size = len(weights)
        weight_dia = sparse.dia_matrix((1 - weights, 0), (size, size))
        X = X * weight_dia
    else:
        X *= (1 - weights)

    C = np.atleast_1d(np.asarray(C, dtype=np.float64))
    scores = np.zeros((X.shape[1], len(C)), dtype=bool)

    for this_C, this_scores in zip(C, scores.T):
        # XXX : would be great to do it with a warm_start ...
        clf = LogisticRegression(C=this_C, tol=tol, penalty='l1', dual=False,
                                 fit_intercept=fit_intercept)
        clf.fit(X, y)
        this_scores[:] = np.any(
            np.abs(clf.coef_) > 10 * np.finfo(np.float64).eps, axis=0)
    return scores
Example #41
 def mapper(X, mask=support_mask):
     X = check_array(X, accept_sparse='csr')
     if len(mask.value) != X.shape[1]:
         raise ValueError("X has a different shape than during fitting.")
     return X[:, safe_mask(X, mask.value)]
Example #42
def create_components(X, y=None, n_components=None,
                      class_distrib="global", verbose=0, random_state=None):

    random_state = check_random_state(random_state)

    if n_components is None or n_components < 0:
        raise ValueError("n_components must be a positive number.")

    n_samples, n_features = X.shape

    if 0 < n_components <= 1:
        n_components = int(n_components * n_samples)

    if verbose: print("Creating components with K-means...")
    start = time.time()

    if class_distrib == "global":
        kmeans = KMeans(n_clusters=n_components, n_init=1, random_state=random_state)
        kmeans.fit(X)
        components = kmeans.cluster_centers_

    elif class_distrib == "balanced":
        classes = np.unique(y)
        n_classes = classes.shape[0]
        k = n_components // n_classes
        components = []

        for c in classes:
            mask = safe_mask(X, y == c)
            X_mask = X[mask]
            if k >= X_mask.shape[0]:
                components.append(X_mask)
            else:
                kmeans = KMeans(n_clusters=k, n_init=1, random_state=random_state)
                kmeans.fit(X_mask)
                components.append(kmeans.cluster_centers_)

        components = np.vstack(components)

    elif class_distrib == "stratified":
        classes = np.unique(y)
        components = []

        for c in classes:
            mask = y == c
            n_c = np.sum(mask)
            k = n_components * n_c // n_samples
            mask = safe_mask(X, mask)
            kmeans = KMeans(n_clusters=k, n_init=1, random_state=random_state)
            kmeans.fit(X[mask])
            components.append(kmeans.cluster_centers_)

        components = np.vstack(components)

    else:
        raise ValueError("No supported class_distrib value.")

    if verbose:
        print "Done in", time.time() - start, "seconds"

    return components
Example #43
    def fit(self, X, y, kcache=None):
        n_samples = X.shape[0]
        rs = check_random_state(self.random_state)

        self.label_binarizer_ = LabelBinarizer(neg_label=-1, pos_label=1)
        Y = self.label_binarizer_.fit_transform(y)
        self.classes_ = self.label_binarizer_.classes_.astype(np.int32)
        n_vectors = Y.shape[1]

        A = X
        C = self.C
        termination = self.termination

        if self.penalty == "l2" and self.components is not None:
            A = self.components

        if self.warm_start and self.coef_ is not None:
            coef = np.zeros((n_vectors, A.shape[0]), dtype=np.float64)
            coef[:, self.support_indices_] = self.coef_
            self.coef_ = coef
        else:
            self.coef_ = np.zeros((n_vectors, A.shape[0]), dtype=np.float64)
            self.errors_ = np.ones((n_vectors, n_samples), dtype=np.float64)

        indices = np.arange(A.shape[0], dtype=np.int32)

        if kcache is None:
            kernel = self._get_kernel()
            kcache = KernelCache(kernel, n_samples,
                                 self.cache_mb, 1, self.verbose)

        self.support_vectors_ = X
        self.intercept_ = np.zeros(n_vectors, dtype=np.float64)

        if self.penalty in ("l1", "l1l2"):
            for i in range(n_vectors):
                    _primal_cd_l2svm_l1r(self, self.coef_[i], self.errors_[i],
                                         X, Y[:, i], indices, kcache, False,
                                         self.selection, self.search_size,
                                         self.termination, self.sv_upper_bound,
                                         self.C, self.max_iter, rs, self.tol,
                                         self.callback, verbose=self.verbose)

        if self.penalty == "l1l2":
            sv = np.sum(self.coef_ != 0, axis=0, dtype=bool)
            self.support_indices_ = np.arange(n_samples, dtype=np.int32)[sv]
            indices = self.support_indices_.copy()
            A = X
            self.support_vectors_ = A
            if not self.warm_debiasing:
                self.coef_ = np.zeros((n_vectors, n_samples), dtype=np.float64)
                self.errors_ = np.ones((n_vectors, n_samples), dtype=np.float64)
            C = self.Cd
            termination = "convergence"

        if self.penalty in ("l2", "l1l2"):
            for i in range(n_vectors):
                _primal_cd_l2svm_l2r(self, self.coef_[i], self.errors_[i],
                                     X, A, Y[:, i], indices, kcache, False,
                                     termination, self.sv_upper_bound,
                                     C, self.max_iter, rs, self.tol,
                                     self.callback, verbose=self.verbose)

        sv = np.sum(self.coef_ != 0, axis=0, dtype=bool)
        self.support_indices_ = np.arange(A.shape[0], dtype=np.int32)[sv]

        if np.sum(sv) == 0:
            # Empty model...
            self.coef_ = None
            return self

        # We can't know the support vectors when using precomputed kernels.
        if self.kernel != "precomputed":
            self.coef_ = np.ascontiguousarray(self.coef_[:, sv])
            mask = safe_mask(X, sv)
            self.support_vectors_ = A[mask]

        if self.verbose >= 1:
            print "Number of support vectors:", np.sum(sv)

        return self
Example #44
def fit_grid_point_extended(X, y, base_estimator, parameters, train, test, scorer,
                   verbose, loss_func=None, extraOut="auto", **fit_params):
    """Run fit on one set of parameters.

    Parameters
    ----------
    X : array-like, sparse matrix or list
        Input data.

    y : array-like or None
        Targets for input data.

    base_estimator : estimator object
        This estimator will be cloned and then fitted.

    parameters : dict
        Parameters to be set on base_estimator clone for this grid point.

    train : ndarray, dtype int or bool
        Boolean mask or indices for training set.

    test : ndarray, dtype int or bool
        Boolean mask or indices for test set.

    scorer : callable or None
        If provided must be a scorer callable object / function with signature
        ``scorer(estimator, X, y)``.

    verbose : int
        Verbosity level.

    **fit_params : kwargs
        Additional parameters passed to the fit function of the estimator.

    Returns
    -------
    score : float
        Score of this parameter setting on given training / test split.

    parameters : dict
        The parameters that have been evaluated.

    n_samples_test : int
        Number of test samples in this split.
    """
    if verbose > 1:
        start_time = time.time()
        msg = '%s' % (', '.join('%s=%s' % (k, v)
                      for k, v in parameters.items()))
        print("[GridSearchCV] %s %s" % (msg, (64 - len(msg)) * '.'))

    # update parameters of the classifier after a copy of its base structure
    clf = clone(base_estimator)
    clf.set_params(**parameters)

    if hasattr(base_estimator, 'kernel') and callable(base_estimator.kernel):
        # cannot compute the kernel values with custom function
        raise ValueError("Cannot use a custom kernel function. "
                         "Precompute the kernel matrix instead.")

    if not hasattr(X, "shape"):
        if getattr(base_estimator, "_pairwise", False):
            raise ValueError("Precomputed kernels or affinity matrices have "
                             "to be passed as arrays or sparse matrices.")
        X_train = [X[idx] for idx in train]
        X_test = [X[idx] for idx in test]
    else:
        if getattr(base_estimator, "_pairwise", False):
            # X is a precomputed square kernel matrix
            if X.shape[0] != X.shape[1]:
                raise ValueError("X should be a square kernel matrix")
            X_train = X[np.ix_(train, train)]
            X_test = X[np.ix_(test, train)]
        else:
            X_train = X[safe_mask(X, train)]
            X_test = X[safe_mask(X, test)]

    if y is not None:
        y_test = y[safe_mask(y, test)]
        y_train = y[safe_mask(y, train)]
        clf.fit(X_train, y_train, **fit_params)

        if scorer is not None:
            this_score = scorer(clf, X_test, y_test)
        else:
            this_score = clf.score(X_test, y_test)
    else:
        clf.fit(X_train, **fit_params)
        if scorer is not None:
            this_score = scorer(clf, X_test)
        else:
            this_score = clf.score(X_test)

    if not isinstance(this_score, numbers.Number):
        raise ValueError("scoring must return a number, got %s (%s)"
                         " instead." % (str(this_score), type(this_score)))

    if verbose > 2:
        msg += ", score=%f" % this_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg,
                              logger.short_format_time(time.time() -
                                                       start_time))
        print("[GridSearchCV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))
    extraRVs = {}
    if extraOut is not None:
        if "estimator" in extraOut:
            extraRVs["estimator"] = clf
        if extraOut == "auto" or "predictions" in extraOut:
            predictions = clf.predict(X)
            predictionIndex = 0
            predictionByIndex = {}
            for exampleIndex in safe_mask(X, test):
                predictionByIndex[exampleIndex] = predictions[predictionIndex]
                predictionIndex += 1
            extraRVs["predictions"] = predictionByIndex
        if (extraOut == "auto" or "importances" in extraOut) and hasattr(clf, "feature_importances_"):
            extraRVs["importances"] = clf.feature_importances_
    rvs = [this_score, parameters, _num_samples(X_test), extraRVs]
    return rvs
Example #45
def fit_grid_point(X, y, sample_weight, base_clf,
                   clf_params, train, test, verbose,
                   **fit_params):
    """Run fit on one set of parameters

    Returns the score and the instance of the classifier
    """
    if verbose > 1:
        start_time = time.time()
        msg = '%s' % (', '.join('%s=%s' % (k, v)
                                for k, v in clf_params.items()))
        print("[BoostGridSearchCV] %s %s" % (msg, (64 - len(msg)) * '.'))

    X, y = check_arrays(X, y)
    # update parameters of the classifier after a copy of its base structure
    clf = clone(base_clf)
    clf.set_params(**clf_params)

    if hasattr(base_clf, 'kernel') and hasattr(base_clf.kernel, '__call__'):
        # cannot compute the kernel values with custom function
        raise ValueError(
            "Cannot use a custom kernel function. "
            "Precompute the kernel matrix instead.")

    if getattr(base_clf, "_pairwise", False):
        # X is a precomputed square kernel matrix
        if X.shape[0] != X.shape[1]:
            raise ValueError("X should be a square kernel matrix")
        X_train = X[np.ix_(train, train)]
        X_test = X[np.ix_(test, train)]
    else:
        X_train = X[safe_mask(X, train)]
        X_test = X[safe_mask(X, test)]

    if y is not None:
        y_test = y[safe_mask(y, test)]
        y_train = y[safe_mask(y, train)]
    else:
        y_test = None
        y_train = None

    if sample_weight is not None:
        sample_weight_test = sample_weight[safe_mask(sample_weight, test)]
        sample_weight_train = sample_weight[safe_mask(sample_weight, train)]
    else:
        sample_weight_test = None
        sample_weight_train = None

    if sample_weight is not None:
        clf.fit(X_train, y_train,
                sample_weight=sample_weight_train,
                **fit_params)
    else:
        clf.fit(X_train, y_train, **fit_params)

    if verbose > 1:
        end_msg = "%s -%s" % (msg,
                              logger.short_format_time(time.time() -
                                                       start_time))
        print "[BoostGridSearchCV] %s %s" % ((64 - len(end_msg)) * '.', end_msg)
    return clf, clf_params, train, test
Example #46
def score_each_boost(X, y, sample_weight,
                     clf, clf_params,
                     min_n_estimators,
                     train, test, loss_func,
                     score_func, verbose):
    """Run fit on one set of parameters

    Returns the score and the instance of the classifier
    """
    if hasattr(clf, 'kernel') and hasattr(clf.kernel, '__call__'):
        # cannot compute the kernel values with custom function
        raise ValueError(
            "Cannot use a custom kernel function. "
            "Precompute the kernel matrix instead.")

    X, y = check_arrays(X, y)

    if getattr(clf, "_pairwise", False):
        # X is a precomputed square kernel matrix
        if X.shape[0] != X.shape[1]:
            raise ValueError("X should be a square kernel matrix")
        X_train = X[np.ix_(train, train)]
        X_test = X[np.ix_(test, train)]
    else:
        X_train = X[safe_mask(X, train)]
        X_test = X[safe_mask(X, test)]

    if y is not None:
        y_test = y[safe_mask(y, test)]
        y_train = y[safe_mask(y, train)]
    else:
        y_test = None
        y_train = None

    if sample_weight is not None:
        sample_weight_test = sample_weight[safe_mask(sample_weight, test)]
        sample_weight_train = sample_weight[safe_mask(sample_weight, train)]
    else:
        sample_weight_test = None
        sample_weight_train = None

    if verbose > 1:
        start_time = time.time()
        msg = '%s' % (', '.join('%s=%s' % (k, v)
                                for k, v in clf_params.items()))
        print("[BoostGridSearchCV] %s %s" % (msg, (64 - len(msg)) * '.'))

    if y is not None:
        if hasattr(y, 'shape'):
            this_n_test_samples = y.shape[0]
        else:
            this_n_test_samples = len(y)
    else:
        if hasattr(X, 'shape'):
            this_n_test_samples = X.shape[0]
        else:
            this_n_test_samples = len(X)

    all_scores = []
    all_clf_params = []
    n_test_samples = []

    # TODO: include support for sample_weight in score functions
    if loss_func is not None or score_func is not None:
        for i, y_pred in enumerate(clf.staged_predict(X_test)):
            if i + 1 < min_n_estimators:
                continue
            if loss_func is not None:
                score = -loss_func(y_test, y_pred)
            elif score_func is not None:
                score = score_func(y_test, y_pred)
            all_scores.append(score)
            clf_para = copy(clf_params)
            clf_para['n_estimators'] = i + 1
            all_clf_params.append(clf_para)
            n_test_samples.append(this_n_test_samples)

    else:
        if sample_weight_test is not None:
            for i, score in enumerate(clf.staged_score(X_test, y_test,
                sample_weight=sample_weight_test)):
                if i + 1 < min_n_estimators:
                    continue
                all_scores.append(score)
                clf_para = copy(clf_params)
                clf_para['n_estimators'] = i + 1
                all_clf_params.append(clf_para)
                n_test_samples.append(this_n_test_samples)

        else:
            for i, score in enumerate(clf.staged_score(X_test, y_test)):
                if i + 1 < min_n_estimators:
                    continue
                all_scores.append(score)
                clf_para = copy(clf_params)
                clf_para['n_estimators'] = i + 1
                all_clf_params.append(clf_para)
                n_test_samples.append(this_n_test_samples)

    # boosting may have stopped early
    if len(all_scores) < clf.n_estimators - min_n_estimators + 1:
        last_score = all_scores[-1]
        last_clf_params = all_clf_params[-1]
        for i in range(len(all_scores),
                clf.n_estimators - min_n_estimators + 1):
            all_scores.append(last_score)
            clf_para = copy(last_clf_params)
            clf_para['n_estimators'] = i + 1
            all_clf_params.append(clf_para)
            n_test_samples.append(this_n_test_samples)

    if verbose > 1:
        end_msg = "%s -%s" % (msg,
                              logger.short_format_time(time.time() -
                                                       start_time))
        print "[BoostGridSearchCV] %s %s" % ((64 - len(end_msg)) * '.', end_msg)
    return all_scores, all_clf_params, n_test_samples
Example #47
def _cross_val_score(cv_number, estimator, X, y,
                     score_func, train, test, verbose, fit_params):
    """Inner loop for cross validation"""

    # set the cv_number on the estimator
    estimator.cv_number = cv_number
    # if the estimator is a pipeline, set cv_number on all of its components

    if hasattr(estimator, 'named_steps'):
        for _, est in estimator.named_steps.items():
            est.cv_number = cv_number

    n_samples = X.shape[0] if sp.issparse(X) else len(X)
    fit_params = dict([(k,
                        np.asarray(v)[train]
                        if hasattr(v, '__len__') and len(v) == n_samples
                        else v)
                       for k, v in fit_params.items()])
    if not hasattr(X, "shape"):
        if getattr(estimator, "_pairwise", False):
            raise ValueError("Precomputed kernels or affinity matrices have "
                             "to be passed as arrays or sparse matrices.")
        X_train = [X[idx] for idx in train]
        X_test = [X[idx] for idx in test]
    else:
        if getattr(estimator, "_pairwise", False):
            # X is a precomputed square kernel matrix
            if X.shape[0] != X.shape[1]:
                raise ValueError("X should be a square kernel matrix")
            X_train = X[np.ix_(train, train)]
            X_test = X[np.ix_(test, train)]
        else:
            X_train = X[safe_mask(X, train)]
            X_test = X[safe_mask(X, test)]

    if y is None:
        estimator.fit(X_train, **fit_params)
        if score_func is None:
            score = estimator.score(X_test)
        else:
            score = score_func(X_test)
    else:
        estimator = estimator.fit(X_train, y[train], **fit_params)

        if score_func is None:
            score = estimator.score(X_test, y[test])
        else:
            score = score_func(y[test], estimator.predict(X_test))
    if verbose > 1:
        print("score: %f" % score)
    return cv_number, score