Example #1
def compute_imp_score(model, metric, training_features, training_classes,
                      random_state):
    """compute importance scores for features.
    If coef_ or feature_importances_ attribute is available for the model,
    the the importance scores will be based on the attribute. If not,
    then permuation importance scores will be estimated
    Parameters
    ----------
    tmpdir: string
        Temporary directory for saving experiment results
    model:  scikit-learn Estimator
        A fitted scikit-learn model
    metric: str, callable
        The metric for evaluating the feature importance through
        permutation. By default, the strings 'accuracy' is
        recommended for classifiers and the string 'r2' is
        recommended for regressors. Optionally, a custom
        scoring function (e.g., `metric=scoring_func`) that
        accepts two arguments, y_true and y_pred, which have
        similar shape to the `y` array.
    training_features: np.darray/pd.DataFrame
        Features in training dataset
    training_classes: np.darray/pd.DataFrame
        Target in training dataset
    random_state: int
        Random seed for permuation importances

    Returns
    -------
    coefs: np.ndarray
        Feature importance scores
    imp_score_type: string
        Importance score type

    """
    # exporting/computing importance score
    if hasattr(model, 'coef_'):
        coefs = model.coef_
        if coefs.ndim > 1:
            coefs = safe_sqr(coefs).sum(axis=0)
            imp_score_type = "Sum of Squares of Coefficients"
        else:
            coefs = safe_sqr(coefs)
            imp_score_type = "Squares of Coefficients"
    else:
        coefs = getattr(model, 'feature_importances_', None)
        imp_score_type = "Gini Importance"
    if coefs is None or np.isnan(coefs).any():
        coefs, _ = feature_importance_permutation(
            predict_method=model.predict,
            X=training_features,
            y=training_classes,
            num_rounds=5,
            metric=metric,
            seed=random_state,
        )
        imp_score_type = "Permutation Feature Importance"

    return coefs, imp_score_type
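A minimal usage sketch for the snippet above, assuming the imports it relies on (numpy as np, safe_sqr from sklearn.utils, and feature_importance_permutation from mlxtend.evaluate) are in scope; the model below is chosen only to exercise the coef_ branch.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=100, n_features=5, random_state=0)
model = LogisticRegression().fit(X, y)

# A binary LogisticRegression exposes coef_ with shape (1, n_features),
# so the ndim > 1 branch applies.
scores, score_type = compute_imp_score(model, 'accuracy', X, y, random_state=0)
print(score_type)    # "Sum of Squares of Coefficients"
print(scores.shape)  # (5,)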
Example #2
def if_classif(X_y, n_features):
    """Compute the Anova F-value for the provided sample

    Parameters
    ----------
    X_y : Tuples of (X, y) with 
          X {array-like, sparse matrix} shape = [n_samples, n_features]
          The set of regressors that will tested sequentially
          y array of shape(n_samples)
          The data matrix

    Returns
    -------
    F : array, shape = [n_features,]
        The set of F values
    pval : array, shape = [n_features,]
        The set of p-values
    """
    
    n_samples = 0
    n_samples_per_class = defaultdict(lambda: 0)
    
    sums_args_d = defaultdict(lambda: np.zeros(n_features))
    ss_alldata = np.zeros(n_features)
    
    for X, y in X_y:
        if n_samples % 100 == 0:
            logger.info("Processing doc #%d..." % n_samples)
            
        n_samples += 1
        n_samples_per_class[y] += 1
        
        ss_alldata += X ** 2
        sums_args_d[y] += X
        
    n_classes = len(sums_args_d.keys())
    
    #Convert dictionary to numpy array
    sums_args = np.array(list(sums_args_d.values()))
    
    square_of_sums_alldata = safe_sqr(sums_args.sum(axis=0))
    square_of_sums_args = [safe_sqr(s) for s in sums_args]
    sstot = ss_alldata - square_of_sums_alldata / float(n_samples)
    ssbn = 0.
    for k, y in enumerate(n_samples_per_class.keys()):
        ssbn += square_of_sums_args[k] / n_samples_per_class[y]
    ssbn -= square_of_sums_alldata / float(n_samples)
    sswn = sstot - ssbn
    dfbn = n_classes - 1
    dfwn = n_samples - n_classes
    msb = ssbn / float(dfbn)
    msw = sswn / float(dfwn)
    f = msb / msw
    # flatten matrix to vector in sparse case
    f = np.asarray(f).ravel()
    prob = stats.f.sf(f, dfbn, dfwn)
    return f, prob
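Because X_y is consumed one (sample, label) pair at a time, a plain generator or zip works; a minimal sketch, assuming the module-level logger and the imports the snippet relies on (numpy as np, collections.defaultdict, scipy.stats, sklearn.utils.safe_sqr) are in place.

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(30, 4)              # 30 samples, 4 features
labels = rng.randint(0, 3, 30)   # 3 classes

F, pval = if_classif(zip(X, labels), n_features=4)
print(F.shape, pval.shape)       # (4,) (4,)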
Example #3
def f_regression_cov(X, y, C):
    """Univariate linear regression tests

    Quick linear model for testing the effect of a single regressor,
    sequentially for many regressors.

    This is done in 3 steps:
    1. the regressor of interest and the data are orthogonalized
    wrt constant regressors
    2. the cross correlation between data and regressors is computed
    3. it is converted to an F score then to a p-value

    Parameters
    ----------
    X : {array-like, sparse matrix}  shape = (n_samples, n_features)
        The set of regressors that will be tested sequentially.

    y : array of shape (n_samples,)
        The target vector

    C : {array-like, sparse matrix}  shape = (n_samples, n_covariates)
        The set of covariates.


    Returns
    -------
    F : array, shape=(n_features,)
        F values of features.

    pval : array, shape=(n_features,)
        p-values of F-scores.
    """

    X = check_array(X, dtype=np.float64)
    C = check_array(C, dtype=np.float64)
    y = check_array(y, dtype=np.float64, ensure_2d=False)
    y = y.ravel()

    assert C.shape[1] < C.shape[0]
    cpinv = np.linalg.pinv(C)
    X -= np.dot(C, (np.dot(cpinv, X)))
    y -= np.dot(C, (np.dot(cpinv, y)))

    # compute the correlation
    corr = np.dot(y, X)
    corr /= np.asarray(np.sqrt(safe_sqr(X).sum(axis=0))).ravel()
    corr /= np.asarray(np.sqrt(safe_sqr(y).sum())).ravel()

    # convert to p-value
    dof = (X.shape[0] - 1 - C.shape[1]) / (1)  #(df_fm / (df_rm - df_fm))
    F = corr**2 / (1 - corr**2) * dof
    pv = stats.f.sf(F, 1, dof)
    return F, pv
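A usage sketch under the same assumptions as the function (numpy as np, scipy.stats, and check_array/safe_sqr from sklearn.utils imported): regress the covariates out of X and y, then test each column of X against y.

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(50, 10)                      # candidate regressors
C = np.hstack([np.ones((50, 1)),           # constant covariate (intercept)
               rng.randn(50, 2)])          # two nuisance covariates
y = 2.0 * X[:, 0] + C[:, 1] + 0.5 * rng.randn(50)

F, pv = f_regression_cov(X, y, C)
print(np.argmin(pv))  # feature 0 should have the smallest p-value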
Example #4
def f_regression_cov(X, y, C):
    """Univariate linear regression tests

    Quick linear model for testing the effect of a single regressor,
    sequentially for many regressors.

    This is done in 3 steps:
    1. the regressor of interest and the data are orthogonalized
    wrt constant regressors
    2. the cross correlation between data and regressors is computed
    3. it is converted to an F score then to a p-value

    Parameters
    ----------
    X : {array-like, sparse matrix}  shape = (n_samples, n_features)
        The set of regressors that will be tested sequentially.

    y : array of shape (n_samples,)
        The target vector

    C : {array-like, sparse matrix}  shape = (n_samples, n_covariates)
        The set of covariates.


    Returns
    -------
    F : array, shape=(n_features,)
        F values of features.

    pval : array, shape=(n_features,)
        p-values of F-scores.
    """

    X = check_array(X, dtype=np.float64)
    C = check_array(C, dtype=np.float64)
    y = check_array(y, dtype=np.float64, ensure_2d=False)
    y = y.ravel()

    assert C.shape[1] < C.shape[0]
    cpinv = np.linalg.pinv(C)
    X -= np.dot(C, np.dot(cpinv, X))
    y -= np.dot(C, np.dot(cpinv, y))

    # compute the correlation
    corr = np.dot(y, X)
    corr /= np.asarray(np.sqrt(safe_sqr(X).sum(axis=0))).ravel()
    corr /= np.asarray(np.sqrt(safe_sqr(y).sum())).ravel()

    # convert to p-value
    dof = (X.shape[0] - 1 - C.shape[1]) / (1) #(df_fm / (df_rm - df_fm))
    F = corr ** 2 / (1 - corr ** 2) * dof
    pv = stats.f.sf(F, 1, dof)
    return F, pv
Example #5
def f_oneway(*args):

    n_classes = len(args)
    args = [as_float_array(a) for a in args]
    n_samples_per_class = np.array([a.shape[0] for a in args])
    n_samples = np.sum(n_samples_per_class)
    ss_alldata = sum(safe_sqr(a).sum(axis=0) for a in args)
    sums_args = [np.asarray(a.sum(axis=0)) for a in args]
    square_of_sums_alldata = sum(sums_args)**2
    square_of_sums_args = [s**2 for s in sums_args]
    sstot = ss_alldata - square_of_sums_alldata / float(n_samples)
    ssbn = 0.
    for k, _ in enumerate(args):
        ssbn += square_of_sums_args[k] / n_samples_per_class[k]
    ssbn -= square_of_sums_alldata / float(n_samples)
    sswn = sstot - ssbn
    dfbn = n_classes - 1
    dfwn = n_samples - n_classes
    msb = ssbn / float(dfbn)
    msw = sswn / float(dfwn)
    constant_features_idx = np.where(msw == 0.)[0]
    if (np.nonzero(msb)[0].size != msb.size and constant_features_idx.size):
        warnings.warn("Features %s are constant." % constant_features_idx,
                      UserWarning)
    f = msb / msw
    # flatten matrix to vector in sparse case
    f = np.asarray(f).ravel()
    prob = special.fdtrc(dfbn, dfwn, f)
    return f, prob
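A short sketch of calling this one-way ANOVA, assuming its imports (numpy as np, warnings, scipy.special, and as_float_array/safe_sqr from sklearn.utils) are present; each positional argument is one group of samples.

import numpy as np

rng = np.random.RandomState(0)
group_a = rng.randn(20, 3)
group_b = rng.randn(25, 3) + np.array([1.0, 0.0, 0.0])  # shift feature 0 only

f, prob = f_oneway(group_a, group_b)
print(f.shape)        # (3,) -- one F statistic per feature
print(prob.argmin())  # feature 0 should separate the groups best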
Example #6
def read_selected_features_from_pipeline(classification_pipeline, is_sorted=True):
    """
    Given a classification pipeline, sort all of the features
    from the 'selector', as well as return the selection
    of features

    Arguments:
        classification_pipeline
    """
    rfe_step = classification_pipeline.named_steps.selection.named_steps.rfe

    # Get selected features
    sorted_idxs = np.argsort(safe_sqr(rfe_step.estimator_.coef_).sum(axis=0))
    mask = rfe_step.support_
    selected_features = np.array(read_all_features_from_pipeline(classification_pipeline))[mask]

    if not is_sorted:
        return selected_features

    # Sort selected features from bottom (worst) to highest (best)
    return selected_features[sorted_idxs]
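The named_steps chain above only resolves for a specific nesting; a hypothetical sketch of the pipeline shape it expects (read_all_features_from_pipeline is an undefined helper here, so this only illustrates the structure, not a runnable end-to-end call).

from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# pipeline.named_steps.selection.named_steps.rfe must exist after fitting
classification_pipeline = Pipeline([
    ('selection', Pipeline([
        ('rfe', RFE(SVC(kernel='linear'), n_features_to_select=5)),
    ])),
    ('classifier', SVC(kernel='linear')),
])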
Example #7
    def _fit(self, X, y, features_names=None, preload_features=None,
             ranking_th=0.005):
        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
                         multi_output=True)
        # Initialization
        n_features = X.shape[1]
        features = np.arange(n_features)

        cv = self.cv
        cv = check_cv(cv, y, classifier=is_classifier(self.estimator))
        if sklearn.__version__ == '0.17':
            n_splits = cv.n_folds
        else:
            n_splits = cv.get_n_splits(X, y)

        if self.verbose > 0:
            print("Fitting {0} folds for each of iteration".format(n_splits))

        if 0.0 < self.n_features_step < 1.0:
            step = int(max(1, self.n_features_step * n_features))
        else:
            step = int(self.n_features_step)
        if step <= 0:
            raise ValueError("Step must be >0")

        if features_names is not None:
            features_names = np.array(features_names)
        else:
            if self.features_names is not None:
                features_names = self.features_names
            else:
                features_names = np.arange(n_features)  # use indices

        tentative_support_ = np.zeros(n_features, dtype=bool)
        current_support_ = np.zeros(n_features, dtype=bool)

        self.scores_ = []
        self.scores_confidences_ = []
        self.features_per_it_ = []

        if preload_features is not None:
            preload_features = np.unique(preload_features).astype('int')
            current_support_[preload_features] = True

            X_selected = X[:, features[current_support_]]
            y_hat, cv_scores = my_cross_val_predict(clone(self.estimator),
                                                    X_selected, y, cv=cv)
            target = y - y_hat
        else:
            target = y.copy()

        score, confidence_interval = -np.inf, 0
        proceed = np.sum(current_support_) < X.shape[1]
        while proceed:
            if self.verbose > 0:
                print('\nN-times variance of target: {}'.format(
                    target.var() * target.shape[0]))
            # update values
            old_confidence_interval = confidence_interval
            old_score = score

            if self.scale:
                target = StandardScaler().fit_transform(target.reshape(
                    -1, 1)).ravel()
                # target = MinMaxScaler().fit_transform(target.reshape(
                #     -1,1)).ravel()

            if self.verbose > 0:
                print()
                print('Feature ranking')
                print()
                print("target shape: {}".format(target.shape))
                print()

            # Rank the remaining features
            start_t = time.time()
            rank_estimator = clone(self.estimator)
            rank_estimator.fit(X, target)
            end_fit = time.time() - start_t

            # Get coefs
            start_t = time.time()
            if hasattr(rank_estimator, 'coef_'):
                coefs = rank_estimator.coef_
            elif hasattr(rank_estimator, 'feature_importances_'):
                coefs = rank_estimator.feature_importances_
            else:
                raise RuntimeError('The classifier does not expose '
                                   '"coef_" or "feature_importances_" '
                                   'attributes')
            end_rank = time.time() - start_t

            # Get ranks by ordering in ascending way
            if coefs.ndim > 1:
                ranks = np.argsort(safe_sqr(coefs).sum(axis=0))
                coefs = coefs.sum(axis=0)
            else:
                ranks = np.argsort(safe_sqr(coefs))

            # for sparse case ranks is matrix
            ranks = np.ravel(ranks)

            if self.verbose > 0:
                ranked_f = features[ranks]
                if features_names is not None:
                    ranked_n = features_names[ranks]
                else:
                    ranked_n = ['-'] * n_features
                print('{:6}\t{:6}\t{:8}\t{}'.format('Rank', 'Index', 'Score',
                                                    'Feature Name'))
                for i in range(n_features):
                    idx = n_features - i - 1
                    if (coefs[ranks[idx]] < ranking_th) and (i > 2):
                        print(' ...')
                        break
                    print('#{:6}\t{:6}\t{:6f}\t{}'.format(str(i),
                                                          str(ranked_f[idx]),
                                                          coefs[ranks[idx]],
                                                          ranked_n[idx]))
                print(
                    "\n Fit done in {} s and rank done in {} s".format(end_fit,
                                                                       end_rank))

            # if coefs[ranks][-1] < 1e-5:
            #     if self.verbose > 0:
            #         import warnings
            #         warnings.warn('scores are too small to be used, please standardize inputs')
            #     break

            # get the best features (ie, the latest one)
            # if the most ranked features is selected go on a select
            # other features accordingly to the ranking

            # threshold = step
            # step_features = features[ranks][-threshold:]

            ii = len(features_names) - 1
            step_features = features[ranks][ii]
            while np.all(current_support_[step_features]) and ii > 0:
                ii -= 1
                step_features = features[ranks][ii]

            if np.all(current_support_[step_features]):
                if self.verbose > 0:
                    print("Selected features: {} {}".format(
                        features_names[step_features], step_features))
                    # if features_names is not None:
                    #     print("Selected features: {} {}".format(features_names[ranks][-threshold:], step_features))
                    # else:
                    #     print("Selected features: {}".format(step_features))
                    print('Ended because selected features already selected')
                step_features = None
                break

            # update selected features
            tentative_support_[step_features] = True

            # get the selected features
            X_selected = X[:, features[tentative_support_]]

            start_t = time.time()
            # cross validates to obtain the scores
            y_hat, cv_scores = my_cross_val_predict(clone(self.estimator),
                                                    X_selected, y, cv=cv)
            # y_hat = cross_val_predict(clone(self.estimator), X_selected, y, cv=cv)

            # compute new target
            target = y - y_hat

            # compute score and confidence interval
            # score = r2_score(y_true=y, y_pred=y_hat, multioutput='uniform_average')  # np.mean(cv_scores)
            if self.verbose > 0:
                print('r2: {}'.format(np.mean(cv_scores, axis=0)))

            score = np.mean(cv_scores)
            if len(cv_scores.shape) > 1:
                cv_scores = np.mean(cv_scores, axis=1)
            m2 = np.mean(cv_scores * cv_scores)
            confidence_interval_or = np.sqrt(
                (m2 - score * score) / (n_splits - 1))

            end_t = time.time() - start_t

            if self.verbose > 0:
                # if features_names is not None:
                print("Selected features: {} {}".format(
                    features_names[step_features], step_features))
                print("Total features: {} {}".format(
                    features_names[tentative_support_],
                    features[tentative_support_]))
                # else:
                #     print("Selected features: {}".format(step_features))
                #     print("Total features: {}".format(features[tentative_support_]))
                print("R2= {} +- {}".format(score, confidence_interval_or))
                print("\nCrossvalidation done in {} s".format(end_t))

            confidence_interval = confidence_interval_or * self.significance  # do not trust confidence interval completely

            # check terminal condition
            proceed = score - old_score > old_confidence_interval + confidence_interval
            if self.verbose > 0:
                print("PROCEED: {}\n\t{} - {} > {} + {}\n\t{} > {} )".format(
                    proceed, score, old_score,
                    old_confidence_interval,
                    confidence_interval,
                    score - old_score,
                    old_confidence_interval + confidence_interval))

            if proceed or np.sum(current_support_) == 0:
                # last feature set proved to be informative
                # we need to take into account of the new features (update current support)
                current_support_[step_features] = True
                self.features_per_it_.append(features_names[step_features])
                self.scores_.append(score)
                self.scores_confidences_.append(confidence_interval)

                # all the features are selected, stop
                if np.sum(current_support_) == n_features:
                    if self.verbose > 0:
                        print("All the features has been selected.")
                    proceed = False
            else:
                # last feature set proved to be not informative
                # keep old support and delete the current one (it is no more necessary)
                del tentative_support_
                if self.verbose > 0:
                    print('Last feature {} not added to the set'.format(
                        features_names[step_features]))

        # Set final attributes
        self.estimator_ = clone(self.estimator)
        # self.estimator_.fit(Xns[:, current_support_], yns)
        self.estimator_.fit(X[:, current_support_], y)

        self.n_features_ = current_support_.sum()
        self.support_ = current_support_
        # self.ranking_ = ranking_

        return self
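The loop above interleaves ranking, selection, and residual fitting; a standalone sketch of just that core idea (not the class's API, and plain refitting stands in for the undefined my_cross_val_predict helper): rank features by squared coefficients, add the best unselected one, refit, and chase the residual.

import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.RandomState(0)
X = rng.randn(200, 6)
y = 3.0 * X[:, 2] + 1.5 * X[:, 4] + 0.1 * rng.randn(200)

selected = np.zeros(6, dtype=bool)
target = y.copy()
for _ in range(2):
    coefs = LinearRegression().fit(X, target).coef_
    ranks = np.argsort(coefs ** 2)  # ascending: best feature is last
    best = next(f for f in ranks[::-1] if not selected[f])
    selected[best] = True
    # refit on the selected set and move on to the residual target
    y_hat = LinearRegression().fit(X[:, selected], y).predict(X[:, selected])
    target = y - y_hat
print(np.flatnonzero(selected))  # expected: [2 4]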
Example #8
def f_regression_cov_alt(X, y, C):
    """
    Implementation as derived in tex document

    See pg 12 of following document for definition of F-statistic
    http://www-stat.stanford.edu/~jtaylo/courses/stats191/notes/simple_diagnostics.pdf

    Parameters
    ----------
    X : {array-like, sparse matrix}  shape = (n_samples, n_features)
        The set of regressors that will be tested sequentially.

    y : array of shape (n_samples,)
        The target vector

    C : {array-like, sparse matrix}  shape = (n_samples, n_covariates)
        The set of covariates.


    Returns
    -------
    F : array, shape=(n_features,)
        F values of features.

    pval : array, shape=(n_features,)
        p-values of F-scores.
    """
    # make sure we don't overwrite input data; keep references to the
    # originals so their writeable flags can be restored before returning
    X_in, C_in, y_in = X, C, y
    old_flag_X = X_in.flags.writeable
    old_flag_C = C_in.flags.writeable
    old_flag_y = y_in.flags.writeable
    X_in.flags.writeable = False
    C_in.flags.writeable = False
    y_in.flags.writeable = False


    #X, C, y = check_arrays(X, C, y, dtype=np.float)
    y = y.ravel()

    # make copy of input data
    X = X.copy(order="F")
    y = y.copy()

    assert C.shape[1] < C.shape[0]
    cpinv = np.linalg.pinv(C)
    X -= np.dot(C, np.dot(cpinv, X))  # most expensive line (runtime)
    y -= np.dot(C, np.dot(cpinv, y))

    yS = safe_sqr(y.T.dot(X)) # will create a copy

    # Note: (X*X).sum(0) = X.T.dot(X).diagonal(), computed efficiently
    # see e.g.: http://stackoverflow.com/questions/14758283/is-there-a-numpy-scipy-dot-product-calculating-only-the-diagonal-entries-of-the
    # TODO: make this smarter using either stride tricks or cython
    X *= X
    denom = X.sum(0) * y.T.dot(y) - yS
    F = yS / denom

    # degrees of freedom
    dof = (X.shape[0] - 1 - C.shape[1]) / (1) #(df_fm / (df_rm - df_fm))
    F *= dof

    # convert to p-values
    pv = stats.f.sf(F, 1, dof)

    # restore old state on the original input arrays
    X_in.flags.writeable = old_flag_X
    C_in.flags.writeable = old_flag_C
    y_in.flags.writeable = old_flag_y

    return F, pv
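The identity behind the in-place X *= X trick above is easy to check numerically; a two-line verification:

import numpy as np

X = np.random.randn(6, 4)
# (X * X).sum(0) equals the diagonal of X.T @ X without forming the product
assert np.allclose((X * X).sum(axis=0), np.diag(X.T @ X))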
Example #9
    def _fit(self, X, y, step_score=None):
        # Parameter step_score controls the calculation of self.scores_
        # step_score is not exposed to users
        # and is used when implementing RFACV
        # self.scores_ will not be calculated when calling _fit through fit

        X, y = check_X_y(X, y, "csc")
        # Initialization
        n_features = X.shape[1]
        if self.n_features_to_select is None:
            n_features_to_select = n_features // 2
        else:
            n_features_to_select = self.n_features_to_select

        if 0.0 < self.step < 1.0:
            step = int(max(1, self.step * n_features))
        else:
            step = int(self.step)
        if step <= 0:
            raise ValueError("Step must be >0")

        support_added_ = np.zeros(n_features, dtype=bool)
        support_ = np.ones(n_features, dtype=bool)
        ranking_ = np.ones(n_features, dtype=int)
        ranking_added_ = np.ones(n_features, dtype=int)

        if step_score:
            self.scores_ = []

        # Adding
        while np.sum(support_) > n_features_to_select:
            # Remaining features
            features = np.arange(n_features)[support_]

            # Added features
            features_added = np.arange(n_features)[support_added_]
            
            # Compute step score on the previous added features
            if step_score and np.sum(support_added_) > 0:
                estimator_added = clone(self.estimator)
                estimator_added.fit(X[:, features_added], y)
                self.scores_.append(step_score(estimator_added, features_added))

            # Rank the remaining features
            estimator = clone(self.estimator)
            if self.verbose > 0:
                print("Fitting estimator with %d features." % np.sum(support_))

            
            estimator.fit(X[:, features], y)           

            # Get coefs
            if hasattr(estimator, 'coef_'):
                coefs = estimator.coef_  
            else:
                coefs = getattr(estimator, 'feature_importances_', None)
            if coefs is None:
                raise RuntimeError('The classifier does not expose '
                                   '"coef_" or "feature_importances_" '
                                   'attributes')

            # Get ranks
            # ! For RFA, the ranking is inverted: np.argsort(list) is
            # replaced by np.argsort(-list)
            if coefs.ndim > 1:
                try:                
                    ranks = np.argsort(-safe_sqr(coefs).sum(axis=0))
                except ValueError:
                    coefs = np.nan_to_num(coefs)
                    ranks = np.argsort(-safe_sqr(coefs).sum(axis=0))
            else:
                try:                
                    ranks = np.argsort(-safe_sqr(coefs))
                except ValueError:
                    coefs = np.nan_to_num(coefs)
                    ranks = np.argsort(-safe_sqr(coefs))

            # for sparse case ranks is matrix
            ranks = np.ravel(ranks)

            # Add the best features
            threshold = min(step, np.sum(support_) - n_features_to_select)
            
            # remaining features to test
            support_[features[ranks][:threshold]] = False
            ranking_[np.logical_not(support_)] += 1

            # added/ranked features            
            support_added_[features[ranks][:threshold]] = True
            ranking_added_[np.logical_not(support_added_)] += 1
            

        # Set final attributes
        features_added = np.arange(n_features)[support_added_]
        self.estimator_ = clone(self.estimator)
        self.estimator_.fit(X[:, features_added], y)

        # Compute step score when only n_features_to_select features left
        if step_score:            
            self.scores_.append(step_score(self.estimator_, features_added))

        self.n_features_ = support_added_.sum()
        self.support_ = support_added_
        self.ranking_ = ranking_added_

        return self
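The sign flip in the argsort is what turns elimination into addition; a tiny demonstration of both orderings:

import numpy as np

coefs = np.array([0.1, -3.0, 2.0])
print(np.argsort(coefs ** 2))     # RFE order, worst first: [0 2 1]
print(np.argsort(-(coefs ** 2)))  # RFA order, best first:  [1 2 0]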
Example #10
    def _fit(self, X, y, step_score=None):
        X, y = check_X_y(X, y, "csc")
        # Initialization
        n_features = X.shape[1]
        if self.n_features_to_select is None:
            n_features_to_select = n_features // 2
        else:
            n_features_to_select = self.n_features_to_select

        if 0.0 < self.step < 1.0:
            if not self.stepwise_selection:
                step = int(max(1, self.step * n_features))
            else:
                step = self.step
        else:
            if self.stepwise_selection:
                warnings.warn(
                    "The parameter 'stepwise_selection' is true but "
                    "a fixed step size is given. The procedure will "
                    "continue as if 'stepwise_selection' were false.",
                    RuntimeWarning)
            step = int(self.step)
        if step <= 0:
            raise ValueError("Step must be >0")

        if self.estimator_params is not None:
            warnings.warn(
                "The parameter 'estimator_params' is deprecated as "
                "of version 0.16 and will be removed in 0.18. The "
                "parameter is no longer necessary because the value "
                "is set via the estimator initialisation or "
                "set_params method.", DeprecationWarning)

        support_ = np.ones(n_features, dtype=bool)
        ranking_ = np.ones(n_features, dtype=int)

        if step_score:
            self.scores_ = []

        # Elimination
        while np.sum(support_) > n_features_to_select:
            # Remaining features
            features = np.arange(n_features)[support_]

            # Rank the remaining features
            estimator = clone(self.estimator)
            if self.estimator_params:
                estimator.set_params(**self.estimator_params)
            if self.verbose > 0:
                print("Fitting estimator with %d features." % np.sum(support_))

            estimator.fit(X[:, features], y)

            # Get coefs
            if hasattr(estimator, 'coef_'):
                coefs = estimator.coef_
            elif hasattr(estimator, 'feature_importances_'):
                coefs = estimator.feature_importances_
            else:
                raise RuntimeError('The classifier does not expose '
                                   '"coef_" or "feature_importances_" '
                                   'attributes')

            # Get ranks
            if coefs.ndim > 1:
                ranks = np.argsort(safe_sqr(coefs).sum(axis=0))
            else:
                ranks = np.argsort(safe_sqr(coefs))

            # for sparse case ranks is matrix
            ranks = np.ravel(ranks)

            # Eliminate the worst features
            if self.stepwise_selection and 0.0 < step < 1.0:
                current_step_size = int(np.sum(support_) * step)
            else:
                current_step_size = step
            threshold = min(current_step_size,
                            np.sum(support_) - n_features_to_select)

            # Compute step score on the previous selection iteration
            # because 'estimator' must use features
            # that have not been eliminated yet
            if step_score:
                self.scores_.append(step_score(estimator, features))
            support_[features[ranks][:threshold]] = False
            ranking_[np.logical_not(support_)] += 1

        # Set final attributes
        features = np.arange(n_features)[support_]
        self.estimator_ = clone(self.estimator)
        if self.estimator_params:
            self.estimator_.set_params(**self.estimator_params)
        self.estimator_.fit(X[:, features], y)

        # Compute step score when only n_features_to_select features left
        if step_score:
            self.scores_.append(step_score(self.estimator_, features))
        self.n_features_ = support_.sum()
        self.support_ = support_
        self.ranking_ = ranking_

        return self
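With stepwise_selection enabled and a fractional step, the block eliminated per pass shrinks along with the surviving set; a quick trace of that schedule under those assumptions:

n, target, step = 100, 10, 0.5
sizes = [n]
while n > target:
    current = int(n * step)               # recomputed from the surviving set
    threshold = min(current, n - target)  # never overshoot the target size
    n -= threshold
    sizes.append(n)
print(sizes)  # [100, 50, 25, 13, 10]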
Example #11
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--results-path', default='./bucket/results/')
    parser.add_argument('--data')
    parser.add_argument('--tissue', type=lambda x: re.sub(r'[\"\']', '', x) if x is not None else None)
    parser.add_argument('--target')
    parser.add_argument('--data-path', default='./bucket/data/')
    parser.add_argument('--verbose', '-v', action='count')
    parser.add_argument('--test-size', type=float, default=0.1)
    parser.add_argument('--n-iter', type=int, default=1)
    parser.add_argument('--n-folds', default=10, type=n_folds_parser)
    parser.add_argument('--clf')
    args = parser.parse_args()

    result = {}
    result.update(args.__dict__)
    start_time = datetime.now().strftime("%d/%m/%y %H:%M:%S")
    result['start_time'] = start_time
    if args.verbose:
        for key in result:
            print('# {}: {}'.format(key, result[key]))
        print('# Running in: ' + gethostname())
        print('# Start: ' + start_time)

    experiment_id = hash(json.dumps(result) + str(np.random.rand(10, 1)))
    result_file = join(args.results_path, 'rfe_{}.json'.format(experiment_id))
    if args.verbose:
        print('Results will be saved to {}'.format(result_file))

    load_params = {}
    if args.data == 'epi_ad':
        load_params = {'read_original': True, 'skip_pickle': True}

    data, factors = load(args.data, data_path=args.data_path, log=result, **load_params)
    if args.tissue:
        data = data[factors['source tissue'] == args.tissue]
        factors = factors[factors['source tissue'] == args.tissue]
    target = factors[args.target]

    clf, param_grid = choose_classifier(args.clf, result, args.verbose)

    split = StratifiedShuffleSplit(target, n_iter=args.n_iter, test_size=args.test_size)
    n_features = data.shape[1]
    n_features_to_select = 9

    preprocess_steps = [('scaler', StandardScaler())]

    # RFE
    d0 = datetime.now()
    result['experiments'] = []
    for i, (train, test) in enumerate(split):
        if args.verbose:
            print('### ITERATION {}'.format(i))
        result['experiments'].append({
            'iteration': i,
            'train_samples': data.index[train].tolist(),
            'subsets': []
        })
        support_ = np.ones(n_features, dtype=bool)
        ranking_ = np.ones(n_features, dtype=int)
        for step in subset_sizes(n_features, n_features_to_select):
            if args.verbose:
                print('[{}] Evaluating with {} features and selecting {}.'
                      .format(datetime.now() - d0, np.sum(support_), np.sum(support_) - step))
            # Train with current subset
            pipeline = preprocess_steps + [('grid', GridWithCoef(clf, param_grid, cv=args.n_folds))]
            pipeline = Pipeline(pipeline)

            features = np.arange(n_features)[support_]
            pipeline.fit(data.iloc[train, features], target.iloc[train])

            # Save results for current set of features
            grid = pipeline.steps[-1][1]
            result['experiments'][-1]['subsets'].append({
                'n_features': int(np.sum(support_)),
                'features': data.columns[features].tolist(),
                'best_params': grid.best_params_,
                'train': {
                    'y_true': target.iloc[train].tolist(),
                    'y_pred': grid.predict(data.iloc[train, features]).tolist()
                },
                'test': {
                    'y_true': target.iloc[test].tolist(),
                    'y_pred': grid.predict(data.iloc[test, features]).tolist()
                }
            })

            # Select best subset
            coef_ = safe_sqr(grid.coef_)

            if coef_.ndim > 1:
                ranks = np.argsort(coef_.sum(axis=0))
            else:
                ranks = np.argsort(coef_)

            # for sparse case ranks is matrix
            ranks = np.ravel(ranks)

            # Eliminate the worst features
            support_[features[ranks][:step]] = False
            ranking_[np.logical_not(support_)] += 1

            # Store results
            with open(result_file, 'w') as f:
                json.dump(result, f, sort_keys=True, indent=2, separators=(',', ': '))

    if args.verbose:
        print('# OK')
Example #12
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--results-path', default='./bucket/results/')
    parser.add_argument('--data')
    parser.add_argument('--target')
    parser.add_argument('--data-path', default='./bucket/data/')
    parser.add_argument('--verbose', '-v', action='count')
    parser.add_argument('--test-size', type=float, default=0.1)
    parser.add_argument('--n-iter', type=int, default=1)
    parser.add_argument('--n-folds', default=10, type=n_folds_parser)
    parser.add_argument('--clf')
    args = parser.parse_args()

    result = {}
    result.update(args.__dict__)
    if args.verbose:
        for key in result:
            print('# {}: {}'.format(key, result[key]))
        print('# Start: ' + datetime.now().strftime("%d/%m/%y %H:%M:%S"))
    result['selections'] = []

    experiment_id = hash(json.dumps(result) + str(np.random.rand(10)))
    result_file = join(args.results_path, 'fs_{}.json'.format(experiment_id))
    if args.verbose:
        print('Results will be saved to {}'.format(result_file))

    data, factors = load(args.data, data_path=args.data_path, log=result)
    target = factors[args.target]

    clf, param_grid = choose_classifier(args.clf, result, args.verbose)

    feature_names = data.columns

    split = StratifiedShuffleSplit(target,
                                   n_iter=args.n_iter,
                                   test_size=args.test_size)
    n_features = data.shape[1]
    n_features_to_select = 1

    support_ = np.ones(n_features, dtype=bool)
    ranking_ = np.ones(n_features, dtype=int)
    # Elimination
    t0 = time()
    d0 = datetime.now()
    while np.sum(support_) > n_features_to_select:
        step = 10**int(np.log10(np.sum(support_) - 1))
        odd_step = np.sum(support_) - step * (np.sum(support_) // step)
        if odd_step > 0:
            step = odd_step

        if args.verbose:
            print('[{}] Selecting best {:d} features.'.format(
                datetime.now() - d0,
                np.sum(support_) - step))
        # Remaining features
        features = np.arange(n_features)[support_]

        coef_ = None
        test_scores = []
        for train, test in split:
            # Rank the remaining features
            if args.n_folds == 'loo':
                cv = LeaveOneOut(len(train))
            else:
                cv = args.n_folds
            estimator = GridWithCoef(clf, param_grid, cv=cv)

            estimator.fit(data.iloc[train, features], target.iloc[train])
            if coef_ is None:
                coef_ = safe_sqr(estimator.coef_)
            else:
                coef_ += safe_sqr(estimator.coef_)

            test_scores.append(
                estimator.score(data.iloc[test, features], target.iloc[test]))

        if coef_.ndim > 1:
            ranks = np.argsort(coef_.sum(axis=0))
        else:
            ranks = np.argsort(coef_)

        # for sparse case ranks is matrix
        ranks = np.ravel(ranks)

        # Eliminate the worst features
        threshold = min(step, np.sum(support_) - n_features_to_select)
        support_[features[ranks][:threshold]] = False
        ranking_[np.logical_not(support_)] += 1

        result['selections'].append({
            'scores': test_scores,
            'n_features': int(np.sum(support_)),
            'features': feature_names[support_].tolist()
        })

        with open(result_file, 'w') as f:
            json.dump(result,
                      f,
                      sort_keys=True,
                      indent=2,
                      separators=(',', ': '))

    if args.verbose:
        print('# OK')
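The step schedule in the loop above first eliminates down to the next power of ten, then sweeps through its multiples; a trace of the surviving-set sizes it produces:

import numpy as np

n, sizes = 5000, [5000]
while n > 1:
    step = 10 ** int(np.log10(n - 1))
    odd_step = n - step * (n // step)  # remainder down to a round multiple
    if odd_step > 0:
        step = odd_step
    n -= step
    sizes.append(n)
print(sizes[:8])  # [5000, 4000, 3000, 2000, 1000, 900, 800, 700]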
Example #13
    def _fit(self, X, y, step_score=None):
        #X, y = check_X_y(X, y, "csc")
        # Initialization
        n_features = X.shape[1]
        if self.n_features_to_select is None:
            n_features_to_select = n_features // 2
        else:
            n_features_to_select = self.n_features_to_select

        if 0.0 < self.step < 1.0:
            step = int(max(1, self.step * n_features))
        else:
            step = int(self.step)
        if step <= 0:
            raise ValueError("Step must be >0")

        if self.estimator_params is not None:
            warnings.warn("The parameter 'estimator_params' is deprecated as "
                          "of version 0.16 and will be removed in 0.18. The "
                          "parameter is no longer necessary because the value "
                          "is set via the estimator initialisation or "
                          "set_params method.", DeprecationWarning)

        support_ = np.zeros(n_features, dtype=bool)
        support_[0] = True
        ranking_ = np.zeros(n_features, dtype=int)
        ranking_[0] = 1

        if step_score:
            self.scores_ = []

        # Addition (this variant selects features rather than eliminating)
        while np.sum(support_) < n_features_to_select:
            # Remaining features
            features = np.arange(n_features)[support_]

            # Rank the remaining features
            estimator = clone(self.estimator)
            if self.estimator_params:
                estimator.set_params(**self.estimator_params)
            if self.verbose > 0:
                print("Fitting estimator with %d features." % np.sum(support_))

            estimator.fit(X[:, features], y)

            # Get coefs
            if hasattr(estimator, 'coef_'):
                coefs = estimator.coef_
            elif hasattr(estimator, 'feature_importances_'):
                coefs = estimator.feature_importances_
            else:
                raise RuntimeError('The classifier does not expose '
                                   '"coef_" or "feature_importances_" '
                                   'attributes')

            # Get ranks
            if coefs.ndim > 1:
                ranks = np.argsort(safe_sqr(coefs).sum(axis=0))[::-1]
            else:
                ranks = np.argsort(safe_sqr(coefs))[::-1]

            # for sparse case ranks is matrix
            ranks = np.ravel(ranks)

            # Add the best features
            threshold = min(step, n_features_to_select - np.sum(support_))

            # Compute step score on the previous selection iteration
            # because 'estimator' must use features
            # that have not been eliminated yet
            if step_score:
                self.scores_.append(step_score(estimator, features))
            support_[features[ranks][:threshold]] = True
            ranking_[np.logical_not(support_)] += 1

        # Set final attributes
        features = np.arange(n_features)[support_]
        self.estimator_ = clone(self.estimator)
        if self.estimator_params:
            self.estimator_.set_params(**self.estimator_params)
        self.estimator_.fit(X[:, features], y)

        # Compute step score when only n_features_to_select features left
        if step_score:
            self.scores_.append(step_score(self.estimator_, features))
        self.n_features_ = support_.sum()
        self.support_ = support_
        self.ranking_ = ranking_

        return self
Example #14
    def _fit(self, X, y, step_score=None):
        # Parameter step_score controls the calculation of self.scores_
        # step_score is not exposed to users
        # and is used when implementing RFECV
        # self.scores_ will not be calculated when calling _fit through fit

        X, y = check_X_y(X, y, "csc", ensure_min_features=2)
        # Initialization
        n_features = X.shape[1]
        if self.n_features_to_select is None:
            n_features_to_select = n_features // 2
        else:
            n_features_to_select = self.n_features_to_select


#         if 0.0 < self.step < 1.0:
#             step = int(max(1, self.step * n_features))
#         else:
#             step = int(self.step)
#         if step <= 0:
#             raise ValueError("Step must be >0")

        support_ = np.ones(n_features, dtype=bool)
        ranking_ = np.ones(n_features, dtype=int)

        if step_score:
            self.scores_ = []

        # Elimination
        while np.sum(support_) > n_features_to_select:
            if 0.0 < self.step < 1.0:
                step = int(max(1, self.step * np.sum(support_)))
            else:
                step = int(self.step)
            if step <= 0:
                raise ValueError("Step must be >0")
            # Remaining features
            features = np.arange(n_features)[support_]

            # Rank the remaining features
            estimator = clone(self.estimator)
            if self.verbose > 0:
                print("Fitting estimator with %d features." % np.sum(support_))

            estimator.fit(X[:, features], y)

            # Get coefs
            if hasattr(estimator, 'coef_'):
                coefs = estimator.coef_
            else:
                coefs = getattr(estimator, 'feature_importances_', None)
            if coefs is None:
                raise RuntimeError('The classifier does not expose '
                                   '"coef_" or "feature_importances_" '
                                   'attributes')
            # Get ranks
            if coefs.ndim > 1:
                ranks = np.argsort(safe_sqr(coefs).sum(axis=0))
            else:
                ranks = np.argsort(safe_sqr(coefs))

            # for sparse case ranks is matrix
            ranks = np.ravel(ranks)

            # Eliminate the worst features
            if np.sum(support_) <= 2 * n_features_to_select:
                threshold = 1
            else:
                threshold = min(step, np.sum(support_) - n_features_to_select)

            # Compute step score on the previous selection iteration
            # because 'estimator' must use features
            # that have not been eliminated yet
            if step_score:
                self.scores_.append(step_score(estimator, features))
            support_[features[ranks][:threshold]] = False
            ranking_[np.logical_not(support_)] += 1

        # Set final attributes
        features = np.arange(n_features)[support_]
        self.estimator_ = clone(self.estimator)
        self.estimator_.fit(X[:, features], y)

        # Compute step score when only n_features_to_select features left
        if step_score:
            self.scores_.append(step_score(self.estimator_, features))
        self.n_features_ = support_.sum()
        self.support_ = support_
        self.ranking_ = ranking_

        return self
Example #15
    def _fit(self, X, y, step_score=None):

        if type(self.step) is not list:
            return super(DyRFE, self)._fit(X, y, step_score)

        # dynamic step
        X, y = check_X_y(X, y, "csc")
        # Initialization
        n_features = X.shape[1]
        if self.n_features_to_select is None:
            n_features_to_select = n_features // 2
        else:
            n_features_to_select = self.n_features_to_select

        step = []
        for s in self.step:
            if 0.0 < s < 1.0:
                step.append(int(max(1, s * n_features)))
            else:
                step.append(int(s))
            if s <= 0:
                raise ValueError("Step must be >0")

        support_ = np.ones(n_features, dtype=bool)
        ranking_ = np.ones(n_features, dtype=int)

        if step_score:
            self.scores_ = []

        step_i = 0
        # Elimination
        while np.sum(support_) > n_features_to_select and step_i < len(step):

            # if the final step entry is nonzero, repeat it so the loop
            # can keep eliminating
            if step_i == len(step) - 1 and step[step_i] != 0:
                step.append(step[step_i])

            # Remaining features
            features = np.arange(n_features)[support_]

            # Rank the remaining features
            estimator = clone(self.estimator)
            if self.verbose > 0:
                print("Fitting estimator with %d features." % np.sum(support_))

            estimator.fit(X[:, features], y)

            # Get coefs
            if hasattr(estimator, 'coef_'):
                coefs = estimator.coef_
            else:
                coefs = getattr(estimator, 'feature_importances_', None)
            if coefs is None:
                raise RuntimeError('The classifier does not expose '
                                   '"coef_" or "feature_importances_" '
                                   'attributes')

            # Get ranks
            if coefs.ndim > 1:
                ranks = np.argsort(safe_sqr(coefs).sum(axis=0))
            else:
                ranks = np.argsort(safe_sqr(coefs))

            # for sparse case ranks is matrix
            ranks = np.ravel(ranks)

            # Eliminate the worst features
            threshold =\
                min(step[step_i], np.sum(support_) - n_features_to_select)

            # Compute step score on the previous selection iteration
            # because 'estimator' must use features
            # that have not been eliminated yet
            if step_score:
                self.scores_.append(step_score(estimator, features))
            support_[features[ranks][:threshold]] = False
            ranking_[np.logical_not(support_)] += 1

            step_i += 1

        # Set final attributes
        features = np.arange(n_features)[support_]
        self.estimator_ = clone(self.estimator)
        self.estimator_.fit(X[:, features], y)

        # Compute step score when only n_features_to_select features left
        if step_score:
            self.scores_.append(step_score(self.estimator_, features))
        self.n_features_ = support_.sum()
        self.support_ = support_
        self.ranking_ = ranking_

        return self
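A small sketch of how the dynamic step list is resolved, mirroring the conversion at the top of the method: fractions are scaled by n_features, absolute values pass through, and the last nonzero entry repeats until the target size is reached.

n_features = 100
steps = [0.5, 0.2, 5]  # fractions and absolute counts may be mixed
resolved = [int(max(1, s * n_features)) if 0.0 < s < 1.0 else int(s)
            for s in steps]
print(resolved)  # [50, 20, 5]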
Example #16
    def _fit(self, x_data, y_data, step_score=None, **fit_kwargs):
        """Expand :meth:`_fit` to accept kwargs."""
        # Parameter step_score controls the calculation of self.scores_
        # step_score is not exposed to users
        # and is used when implementing AdvancedRFECV
        # self.scores_ will not be calculated when calling _fit through fit
        tags = self._get_tags()
        x_data, y_data = check_X_y(
            x_data,
            y_data,
            "csc",
            ensure_min_features=2,
            force_all_finite=not tags.get('allow_nan', True))

        # Initialization
        n_features = x_data.shape[1]
        if self.n_features_to_select is None:
            n_features_to_select = n_features // 2
        else:
            n_features_to_select = self.n_features_to_select

        if 0.0 < self.step < 1.0:
            step = int(max(1, self.step * n_features))
        else:
            step = int(self.step)
        if step <= 0:
            raise ValueError("Step must be >0")

        support_ = np.ones(n_features, dtype=bool)
        ranking_ = np.ones(n_features, dtype=int)

        if step_score:
            self.scores_ = []

        # Elimination
        while np.sum(support_) > n_features_to_select:
            # Remaining features
            features = np.arange(n_features)[support_]

            # Rank the remaining features
            estimator = clone(self.estimator)
            if self.verbose > 0:
                print("Fitting estimator with %d features." % np.sum(support_))

            _update_transformers_param(estimator, support_)
            estimator.fit(x_data[:, features], y_data, **fit_kwargs)

            # Get coefs (hasattr(estimator, 'coef_') raises a KeyError for
            # XGBRegressor models)
            try:
                coefs = estimator.coef_
            except (AttributeError, KeyError):
                coefs = getattr(estimator, 'feature_importances_', None)
            if coefs is None:
                raise RuntimeError('The classifier does not expose '
                                   '"coef_" or "feature_importances_" '
                                   'attributes')

            # Get ranks
            if coefs.ndim > 1:
                ranks = np.argsort(safe_sqr(coefs).sum(axis=0))
            else:
                ranks = np.argsort(safe_sqr(coefs))

            # for sparse case ranks is matrix
            ranks = np.ravel(ranks)

            # Eliminate the worst features
            threshold = min(step, np.sum(support_) - n_features_to_select)

            # Compute step score on the previous selection iteration
            # because 'estimator' must use features
            # that have not been eliminated yet
            if step_score:
                self.scores_.append(step_score(estimator, features))
            support_[features[ranks][:threshold]] = False
            ranking_[np.logical_not(support_)] += 1

        # Set final attributes
        features = np.arange(n_features)[support_]
        self.estimator_ = clone(self.estimator)
        _update_transformers_param(self.estimator_, support_)
        self.estimator_.fit(x_data[:, features], y_data, **fit_kwargs)

        # Compute step score when only n_features_to_select features left
        if step_score:
            self.scores_.append(step_score(self.estimator_, features))
        self.n_features_ = support_.sum()
        self.support_ = support_
        self.ranking_ = ranking_

        return self
Example #17
    def _fit(self, X, y, step_score=None):
        # Parameter step_score controls the calculation of self.scores_
        # step_score is not exposed to users
        # and is used when implementing RFECV
        # self.scores_ will not be calculated when calling _fit through fit

        # X, y = check_X_y(X, y, "csc")
        X = pd.DataFrame(X)

        n_samples, n_features = X.shape
        if self.n_features_to_select is None:
            n_features_to_select = n_features // 2
        else:
            n_features_to_select = self.n_features_to_select

        support_ = np.ones(n_features, dtype=bool)
        ranking_ = np.ones(n_features, dtype=int)

        if step_score:
            self.scores_ = []

        # compute correlation matrix
        # and sort feature by highest mean squared correlation
        C = np.square(np.corrcoef(X.T) - np.diag(np.ones(X.shape[1])))
        C = np.nan_to_num(C, nan=1)
        coefs = C.mean(axis=1)

        # Get ranks
        ranks = np.argsort(-safe_sqr(coefs))
        worst_feature = 0

        # Recursive elimination
        i = 1
        while np.sum(support_) > n_features_to_select:

            if worst_feature == n_features:
                break

            support_[ranks[worst_feature]] = False
            X_worse = X.iloc[:, ranks[worst_feature]]

            correlation_to_worst_feature = -C[:, ranks[worst_feature]]
            correlation_to_worst_feature[~support_] = 0
            most_related_features = np.argsort(correlation_to_worst_feature)
            sorted_support = support_[most_related_features]
            if self.min_corr < np.max(-correlation_to_worst_feature):
                sorted_support = sorted_support & (
                    -correlation_to_worst_feature[most_related_features] >
                    self.min_corr)

            X_reduced = X.iloc[:, most_related_features[sorted_support]]

            if n_samples >= self.n_splits:  # enough samples to split
                skf = KFold(n_splits=self.n_splits,
                            shuffle=True,
                            random_state=self.random_state)
                train_index, val_index = [
                    split for split in skf.split(X_worse)
                ][0]
                X_train, X_val = X_reduced.iloc[train_index], X_reduced.iloc[
                    val_index]
                y_train = X_worse.iloc[train_index]
                y_val = X_worse.iloc[val_index]
            else:
                X_train, X_val = X_reduced, X_reduced
                y_train, y_val = X_worse, X_worse

            # Eliminate predictable features
            if self.verbose > 0:
                print("Fitting estimator with %d features (%d/%d)" %
                      (np.sum(support_), i, n_features))
                i += 1

            estimator = clone(self.estimator)
            estimator.fit(X_train, y_train)
            score = estimator.score(X_val, y_val)

            if score >= self.base_score:

                # the worst feature is predictable from the remaining ones:
                # keep it eliminated and bump the ranking of removed features
                ranking_[np.logical_not(support_)] += 1

            else:
                support_[ranks[worst_feature]] = True

            worst_feature += 1

        self.n_features_ = support_.sum()
        self.support_ = support_
        self.ranking_ = ranking_

        return self
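The correlation-based ranking above targets redundant features first; a small check that near-duplicate columns do receive the highest mean squared correlation:

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(50, 4)
X[:, 3] = X[:, 0] + 0.01 * rng.randn(50)  # feature 3 nearly duplicates 0

C = np.square(np.corrcoef(X.T) - np.diag(np.ones(4)))
print(np.sort(np.argsort(-C.mean(axis=1))[:2]))  # [0 3]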
Example #18
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--results-path', default='./bucket/results/')
    parser.add_argument('--data')
    parser.add_argument('--target')
    parser.add_argument('--data-path', default='./bucket/data/')
    parser.add_argument('--verbose', '-v', action='count')
    parser.add_argument('--test-size', type=float, default=0.1)
    parser.add_argument('--n-iter', type=int, default=1)
    parser.add_argument('--n-folds', default=10, type=n_folds_parser)
    parser.add_argument('--clf')
    args = parser.parse_args()

    result = {}
    result.update(args.__dict__)
    if args.verbose:
        for key in result:
            print('# {}: {}'.format(key, result[key]))
        print('# Start: ' + datetime.now().strftime("%d/%m/%y %H:%M:%S"))
    result['selections'] = []

    experiment_id = hash(json.dumps(result) + str(np.random.rand(10)))
    result_file = join(args.results_path, 'fs_{}.json'.format(experiment_id))
    if args.verbose:
        print('Results will be saved to {}'.format(result_file))

    data, factors = load(args.data, data_path=args.data_path, log=result)
    target = factors[args.target]

    clf, param_grid = choose_classifier(args.clf, result, args.verbose)

    feature_names = data.columns

    split = StratifiedShuffleSplit(target, n_iter=args.n_iter, test_size=args.test_size)
    n_features = data.shape[1]
    n_features_to_select = 1

    support_ = np.ones(n_features, dtype=bool)
    ranking_ = np.ones(n_features, dtype=int)
    # Elimination
    t0 = time()
    d0 = datetime.now()
    while np.sum(support_) > n_features_to_select:
        step = 10 ** int(np.log10(np.sum(support_) - 1))
        odd_step = np.sum(support_) - step * (np.sum(support_) // step)
        if odd_step > 0:
            step = odd_step

        if args.verbose:
            print('[{}] Selecting best {:d} features.'
                  .format(datetime.now() - d0, np.sum(support_) - step))
        # Remaining features
        features = np.arange(n_features)[support_]

        coef_ = None
        test_scores = []
        for train, test in split:
            # Rank the remaining features
            if args.n_folds == 'loo':
                cv = LeaveOneOut(len(train))
            else:
                cv = args.n_folds
            estimator = GridWithCoef(clf, param_grid, cv=cv)

            estimator.fit(data.iloc[train, features], target.iloc[train])
            if coef_ is None:
                coef_ = safe_sqr(estimator.coef_)
            else:
                coef_ += safe_sqr(estimator.coef_)

            test_scores.append(estimator.score(data.iloc[test, features], target.iloc[test]))

        if coef_.ndim > 1:
            ranks = np.argsort(coef_.sum(axis=0))
        else:
            ranks = np.argsort(coef_)

        # in the sparse case, ranks is a matrix
        ranks = np.ravel(ranks)

        # Eliminate the worst features
        threshold = min(step, np.sum(support_) - n_features_to_select)
        support_[features[ranks][:threshold]] = False
        ranking_[np.logical_not(support_)] += 1

        result['selections'].append({
            'scores': test_scores,
            'n_features': int(np.sum(support_)),  # json cannot serialize NumPy ints
            'features': feature_names[support_].tolist()
        })

        with open(result_file, 'w') as f:
            json.dump(result, f, sort_keys=True, indent=2, separators=(',', ': '))

    if args.verbose:
        print('# OK')
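The step logic above first trims the "odd" remainder so the surviving feature count lands on a round power of ten, then removes one order of magnitude at a time. A standalone trace of that schedule (illustrative only, not part of the original script):

import numpy as np

def step_schedule(n_features, n_features_to_select=1):
    """Trace the feature counts produced by the elimination loop above."""
    counts, remaining = [n_features], n_features
    while remaining > n_features_to_select:
        step = 10 ** int(np.log10(remaining - 1))
        odd_step = remaining - step * (remaining // step)
        if odd_step > 0:
            step = odd_step
        remaining -= min(step, remaining - n_features_to_select)
        counts.append(remaining)
    return counts

print(step_schedule(4321))
# [4321, 4000, 3000, 2000, 1000, 900, ..., 100, 90, ..., 10, 9, ..., 1]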
Exemple #19
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--results-path', default='./bucket/results/')
    parser.add_argument('--data')
    parser.add_argument('--tissue',
                        type=lambda x: re.sub(r'[\"\']', '', x)
                        if x is not None else None)
    parser.add_argument('--target')
    parser.add_argument('--data-path', default='./bucket/data/')
    parser.add_argument('--verbose', '-v', action='count')
    parser.add_argument('--test-size', type=float, default=0.1)
    parser.add_argument('--n-iter', type=int, default=1)
    parser.add_argument('--n-folds', default=10, type=n_folds_parser)
    parser.add_argument('--clf')
    args = parser.parse_args()

    result = {}
    result.update(args.__dict__)
    start_time = datetime.now().strftime("%d/%m/%y %H:%M:%S")
    result['start_time'] = start_time
    if args.verbose:
        for key in result:
            print('# {}: {}'.format(key, result[key]))
        print('# Running in: ' + gethostname())
        print('# Start: ' + start_time)

    experiment_id = hash(json.dumps(result) + str(np.random.rand(10, 1)))
    result_file = join(args.results_path, 'rfe_{}.json'.format(experiment_id))
    if args.verbose:
        print('Results will be saved to {}'.format(result_file))

    load_params = {}
    if args.data == 'epi_ad':
        load_params = {'read_original': True, 'skip_pickle': True}

    data, factors = load(args.data,
                         data_path=args.data_path,
                         log=result,
                         **load_params)
    if args.tissue:
        data = data[factors['source tissue'] == args.tissue]
        factors = factors[factors['source tissue'] == args.tissue]
    target = factors[args.target]

    clf, param_grid = choose_classifier(args.clf, result, args.verbose)

    split = StratifiedShuffleSplit(target,
                                   n_iter=args.n_iter,
                                   test_size=args.test_size)
    n_features = data.shape[1]
    n_features_to_select = 9

    preprocess_steps = [('scaler', StandardScaler())]

    # RFE
    d0 = datetime.now()
    result['experiments'] = []
    for i, (train, test) in enumerate(split):
        if args.verbose:
            print('### ITERATION {}'.format(i))
        result['experiments'].append({
            'iteration': i,
            'train_samples': data.index[train].tolist(),
            'subsets': []
        })
        support_ = np.ones(n_features, dtype=bool)
        ranking_ = np.ones(n_features, dtype=int)
        for step in subset_sizes(n_features, n_features_to_select):
            if args.verbose:
                print('[{}] Evaluating with {} features and selecting {}.'.
                      format(datetime.now() - d0, np.sum(support_),
                             np.sum(support_) - step))
            # Train with current subset
            pipeline = preprocess_steps + [
                ('grid', GridWithCoef(clf, param_grid, cv=args.n_folds))
            ]
            pipeline = Pipeline(pipeline)

            features = np.arange(n_features)[support_]
            pipeline.fit(data.iloc[train, features], target.iloc[train])

            # Save results for current set of features
            grid = pipeline.steps[-1][1]
            result['experiments'][-1]['subsets'].append({
                'n_features': int(np.sum(support_)),  # json cannot serialize NumPy ints
                'features': data.columns[features].tolist(),
                'best_params': grid.best_params_,
                'train': {
                    'y_true': target.iloc[train].tolist(),
                    'y_pred': grid.predict(data.iloc[train, features]).tolist()
                },
                'test': {
                    'y_true': target.iloc[test].tolist(),
                    'y_pred': grid.predict(data.iloc[test, features]).tolist()
                }
            })

            # Select best subset
            coef_ = safe_sqr(grid.coef_)

            if coef_.ndim > 1:
                ranks = np.argsort(coef_.sum(axis=0))
            else:
                ranks = np.argsort(coef_)

            # in the sparse case, ranks is a matrix
            ranks = np.ravel(ranks)

            # Eliminate the worst features
            support_[features[ranks][:step]] = False
            ranking_[np.logical_not(support_)] += 1

            # Store results
            with open(result_file, 'w') as f:
                json.dump(result,
                          f,
                          sort_keys=True,
                          indent=2,
                          separators=(',', ': '))

    if args.verbose:
        print('# OK')
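subset_sizes is a helper that is not shown in this listing. A plausible, purely hypothetical reconstruction, which halves the remaining subset each round while never overshooting n_features_to_select:

def subset_sizes(n_features, n_features_to_select):
    """Hypothetical stand-in for the undisplayed helper: yield how many
    features to drop at each RFE round, halving the subset each time."""
    remaining = n_features
    while remaining > n_features_to_select:
        step = min(max(1, remaining // 2),
                   remaining - n_features_to_select)
        yield step
        remaining -= step

print(list(subset_sizes(100, 9)))  # [50, 25, 12, 4]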
Exemple #20
0
    def _fit(self, X, y, step_score=None):
        tags = self._get_tags()
        X, y = self._validate_data(
            X,
            y,
            accept_sparse="csc",
            ensure_min_features=2,
            force_all_finite=not tags.get('allow_nan', True),
            multi_output=True)

        # Initialization
        n_features = X.shape[1]
        if self.n_features_to_select is None:
            n_features_to_select = n_features // 2
        else:
            n_features_to_select = self.n_features_to_select

        if 0.0 < self.step < 1.0:
            step = int(max(1, self.step * n_features))
        else:
            step = int(self.step)
        if step <= 0:
            raise ValueError("Step must be >0")

        support_ = np.ones(n_features, dtype=bool)
        ranking_ = np.ones(n_features, dtype=int)

        if step_score:
            self.scores_ = []

        # Elimination
        while np.sum(support_) > n_features_to_select:
            # Remaining features
            features = np.arange(n_features)[support_]

            # Rank the remaining features
            estimator = clone(self.estimator)
            if self.verbose > 0:
                print("Fitting estimator with %d features." % np.sum(support_))

            # Fit
            estimator.fit(X[:, features], y)

            # Get coefs
            if hasattr(estimator, 'coef_'):
                coefs = estimator.coef_
            else:
                coefs = getattr(estimator, 'feature_importances_', None)
            if coefs is None:
                raise RuntimeError('The classifier does not expose '
                                   '"coef_" or "feature_importances_" '
                                   'attributes')

            # Get ranks
            if coefs.ndim > 1:
                ranks = np.argsort(safe_sqr(coefs).sum(axis=0))
            else:
                ranks = np.argsort(safe_sqr(coefs))

            # In the sparse case, ranks is a matrix
            ranks = np.ravel(ranks)

            # Eliminate the worst features
            threshold = min(step, np.sum(support_) - n_features_to_select)

            # Save support of selected features
            self.supports.append(list(support_))

            # Compute step score on the previous selection iteration
            # because 'estimator' must use features that have not been
            # eliminated yet
            if step_score:
                self.scores_.append(step_score(estimator, features))
            support_[features[ranks][:threshold]] = False
            ranking_[np.logical_not(support_)] += 1

        # Set final attributes
        features = np.arange(n_features)[support_]
        self.estimator_ = clone(self.estimator)
        self.estimator_.fit(X[:, features], y)

        # Compute step score when only n_features_to_select features left
        if step_score:
            self.scores_.append(step_score(self.estimator_, features))
        self.n_features_ = support_.sum()
        self.support_ = support_
        self.ranking_ = ranking_

        # Save support of selected features
        self.supports.append(list(support_))

        return self
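This _fit mirrors scikit-learn's own RFE almost line for line, with the extra self.supports history. For comparison, the same elimination through the public API:

from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.svm import SVC

X, y = make_classification(n_samples=200, n_features=20, random_state=0)
selector = RFE(SVC(kernel='linear'), n_features_to_select=5, step=1)
selector.fit(X, y)
print(selector.support_)   # boolean mask of the 5 surviving features
print(selector.ranking_)   # 1 = selected; larger = eliminated earlier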
Exemple #21
0
def f_regression_cov_alt(X, y, C):
    """
    Implementation as derived in the accompanying TeX document.

    See p. 12 of the following document for the definition of the F-statistic:
    http://www-stat.stanford.edu/~jtaylo/courses/stats191/notes/simple_diagnostics.pdf

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = (n_samples, n_features)
        The set of regressors that will be tested sequentially.

    y : array, shape = (n_samples,)
        The target vector.

    C : {array-like, sparse matrix}, shape = (n_samples, n_covariates)
        The set of covariates.


    Returns
    -------
    F : array, shape=(n_features,)
        F values of features.

    pval : array, shape=(n_features,)
        p-values of F-scores.
    """
    # make sure we don't overwrite the input data; keep references to the
    # originals so their writeable flags can be restored at the end
    X_in, C_in, y_in = X, C, y
    old_flag_X = X.flags.writeable
    old_flag_C = C.flags.writeable
    old_flag_y = y.flags.writeable
    X.flags.writeable = False
    C.flags.writeable = False
    y.flags.writeable = False

    #X, C, y = check_array(X, C, y, dtype=np.float)
    y = y.ravel()

    # make copy of input data
    X = X.copy(order="F")
    y = y.copy()

    assert C.shape[1] < C.shape[0]
    cpinv = np.linalg.pinv(C)
    X -= np.dot(C, (np.dot(cpinv, X)))  # most expensive line (runtime)
    y -= np.dot(C, (np.dot(cpinv, y)))

    yS = safe_sqr(y.T.dot(X))  # will create a copy

    # Note: (X*X).sum(0) = X.T.dot(X).diagonal(), computed efficiently
    # see e.g.: http://stackoverflow.com/questions/14758283/is-there-a-numpy-scipy-dot-product-calculating-only-the-diagonal-entries-of-the
    # TODO: make this smarter using either stride tricks or cython
    X *= X
    denom = X.sum(0) * y.T.dot(y) - yS
    F = yS / denom

    # degrees of freedom
    dof = X.shape[0] - 1 - C.shape[1]  # (df_fm / (df_rm - df_fm))
    F *= dof

    # convert to p-values
    pv = stats.f.sf(F, 1, dof)

    # restore the writeable flags on the original (uncopied) arrays;
    # X and y were rebound to copies above, so restoring on them would
    # leave the caller's arrays read-only
    X_in.flags.writeable = old_flag_X
    C_in.flags.writeable = old_flag_C
    y_in.flags.writeable = old_flag_y

    return F, pv
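A small usage sketch for f_regression_cov_alt on synthetic data (it assumes the snippet's surrounding imports: numpy as np, scipy's stats, and scikit-learn's safe_sqr):

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(100, 5)                     # candidate regressors
C = rng.randn(100, 2)                     # covariates to regress out
y = X[:, 0] + C[:, 0] + 0.1 * rng.randn(100)

F, pval = f_regression_cov_alt(X, y, C)
print(F.argmax())   # expected: 0, the only feature informative beyond C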
Exemple #22
0
    def _fit(self, X, y, step_score=None):

        if type(self.step) is not list:
            return super(DyRFE, self)._fit(X, y, step_score)

        # dynamic step
        X, y = check_X_y(X, y, "csc")
        # Initialization
        n_features = X.shape[1]
        if self.n_features_to_select is None:
            n_features_to_select = n_features // 2
        else:
            n_features_to_select = self.n_features_to_select

        step = []
        for s in self.step:
            if 0.0 < s < 1.0:
                step.append(int(max(1, s * n_features)))
            else:
                step.append(int(s))
            if s <= 0:
                raise ValueError("Step must be >0")

        support_ = np.ones(n_features, dtype=bool)
        ranking_ = np.ones(n_features, dtype=int)

        if step_score:
            self.scores_ = []

        step_i = 0
        # Elimination
        while np.sum(support_) > n_features_to_select and step_i < len(step):

            # if we are on the last entry of the step list, repeat it so
            # the loop can keep shrinking the support
            if step_i == len(step) - 1 and step[step_i] != 0:
                step.append(step[step_i])

            # Remaining features
            features = np.arange(n_features)[support_]

            # Rank the remaining features
            estimator = clone(self.estimator)
            if self.verbose > 0:
                print("Fitting estimator with %d features." % np.sum(support_))

            estimator.fit(X[:, features], y)

            # Get coefs
            if hasattr(estimator, 'coef_'):
                coefs = estimator.coef_
            else:
                coefs = getattr(estimator, 'feature_importances_', None)
            if coefs is None:
                raise RuntimeError('The classifier does not expose '
                                   '"coef_" or "feature_importances_" '
                                   'attributes')

            # Get ranks
            if coefs.ndim > 1:
                ranks = np.argsort(safe_sqr(coefs).sum(axis=0))
            else:
                ranks = np.argsort(safe_sqr(coefs))

            # in the sparse case, ranks is a matrix
            ranks = np.ravel(ranks)

            # Eliminate the worst features
            threshold = min(step[step_i],
                            np.sum(support_) - n_features_to_select)

            # Compute step score on the previous selection iteration
            # because 'estimator' must use features
            # that have not been eliminated yet
            if step_score:
                self.scores_.append(step_score(estimator, features))
            support_[features[ranks][:threshold]] = False
            ranking_[np.logical_not(support_)] += 1

            step_i += 1

        # Set final attributes
        features = np.arange(n_features)[support_]
        self.estimator_ = clone(self.estimator)
        self.estimator_.fit(X[:, features], y)

        # Compute step score when only n_features_to_select features left
        if step_score:
            self.scores_.append(step_score(self.estimator_, features))
        self.n_features_ = support_.sum()
        self.support_ = support_
        self.ranking_ = ranking_

        return self
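With a dynamic step list, each round uses its own step, and once the list is exhausted the last entry is repeated until enough features are gone. A standalone trace of the resulting feature counts (hypothetical numbers):

def dyrfe_counts(n_features, n_features_to_select, steps):
    """Trace the feature counts produced by a DyRFE-style step list."""
    steps = list(steps)
    counts, remaining, step_i = [n_features], n_features, 0
    while remaining > n_features_to_select and step_i < len(steps):
        if step_i == len(steps) - 1:      # repeat the last step, as above
            steps.append(steps[step_i])
        threshold = min(steps[step_i], remaining - n_features_to_select)
        remaining -= threshold
        counts.append(remaining)
        step_i += 1
    return counts

print(dyrfe_counts(50, 5, [20, 10, 5, 1]))
# [50, 30, 20, 15, 14, 13, ..., 5]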
Exemple #23
0
    def _fit(self,
             X,
             y,
             features_names=None,
             preload_features=None,
             ranking_th=0.005):
        X, y = check_X_y(X,
                         y,
                         accept_sparse=['csr', 'csc', 'coo'],
                         multi_output=True)
        y = check_array(y, accept_sparse=['csr', 'csc', 'coo'])
        # Initialization
        n_features = X.shape[1]
        features = np.arange(n_features)

        cv = self.cv
        cv = check_cv(cv, y, classifier=is_classifier(self.estimator))
        if sklearn.__version__ == '0.17':
            n_splits = cv.n_folds
        else:
            n_splits = cv.get_n_splits(X, y)

        if self.verbose > 1:
            print("Fitting {0} folds for each of iteration".format(n_splits))

        if 0.0 < self.n_features_step < 1.0:
            step = int(max(1, self.n_features_step * n_features))
        else:
            step = int(self.n_features_step)
        if step <= 0:
            raise ValueError("Step must be >0")

        if features_names is not None:
            features_names = np.array(features_names)
        else:
            if self.features_names is not None:
                features_names = self.features_names
            else:
                features_names = np.arange(n_features)  # use indices

        tentative_support_ = np.zeros(n_features, dtype=bool)
        current_support_ = np.zeros(n_features, dtype=bool)

        self.scores_ = []
        self.scores_confidences_ = []
        self.features_per_it_ = []

        if preload_features is not None:
            preload_features = np.unique(preload_features).astype('int')
            current_support_[preload_features] = True
            tentative_support_[preload_features] = True

            X_selected = X[:, features[current_support_]]
            y_hat, cv_scores = my_cross_val_predict(clone(self.estimator),
                                                    X_selected,
                                                    y,
                                                    cv=cv)
            target = y - y_hat
        else:
            target = y.copy()

        score, confidence_interval = -np.inf, 0
        proceed = np.sum(current_support_) < X.shape[1]
        while proceed:
            if self.verbose > 1:
                print('\nN-times variance of target: {}'.format(
                    target.var() * target.shape[0]))
            # update values
            old_confidence_interval = confidence_interval
            old_score = score

            if self.scale:
                target = StandardScaler().fit_transform(
                    target
                )  # Removed ravel to deal with multi-dimensional target
                # target = StandardScaler().fit_transform(target.reshape(
                #     -1, 1)).ravel()
                # target = MinMaxScaler().fit_transform(target.reshape(
                #     -1,1)).ravel()

            if self.verbose > 1:
                print()
                print('Feature ranking')
                print()
                print("target shape: {}".format(target.shape))
                print()

            # Rank the remaining features
            start_t = time.time()
            rank_estimator = clone(self.estimator)
            rank_estimator.fit(X, target)
            end_fit = time.time() - start_t

            # Get coefs
            start_t = time.time()
            if hasattr(rank_estimator, 'coef_'):
                coefs = rank_estimator.coef_
            elif hasattr(rank_estimator, 'feature_importances_'):
                coefs = rank_estimator.feature_importances_
            else:
                raise RuntimeError('The classifier does not expose '
                                   '"coef_" or "feature_importances_" '
                                   'attributes')
            end_rank = time.time() - start_t

            # Get ranks by ordering in ascending way
            if coefs.ndim > 1:
                ranks = np.argsort(safe_sqr(coefs).sum(axis=0))
                coefs = coefs.sum(axis=0)
            else:
                ranks = np.argsort(safe_sqr(coefs))

            # in the sparse case, ranks is a matrix
            ranks = np.ravel(ranks)

            if self.verbose > 1:
                ranked_f = features[ranks]
                if features_names is not None:
                    ranked_n = features_names[ranks]
                else:
                    ranked_n = ['-'] * n_features
                print('{:6}\t{:6}\t{:8}\t{}'.format('Rank', 'Index', 'Score',
                                                    'Feature Name'))
                for i in range(n_features):
                    idx = n_features - i - 1
                    if (coefs[ranks[idx]] < ranking_th) and (i > 2):
                        print(' ...')
                        break
                    print('#{:6}\t{:6}\t{:6f}\t{}'.format(
                        str(i), str(ranked_f[idx]), coefs[ranks[idx]],
                        ranked_n[idx]))
                print("\n Fit done in {} s and rank done in {} s".format(
                    end_fit, end_rank))

            # if coefs[ranks][-1] < 1e-5:
            #     if self.verbose > 0:
            #         import warnings
            #         warnings.warn('scores are too small to be used, please standardize inputs')
            #     break

            # get the best feature (i.e., the last one in the ranking);
            # if the top-ranked feature is already selected, walk down
            # the ranking until an unselected feature is found

            # threshold = step
            # step_features = features[ranks][-threshold:]

            ii = len(features_names) - 1
            step_features = features[ranks][ii]
            while np.all(current_support_[step_features]) and ii > 0:
                ii -= 1
                step_features = features[ranks][ii]

            if np.all(current_support_[step_features]):
                if self.verbose > 0:
                    print("Selected features: {} {}".format(
                        features_names[step_features], step_features))
                    # if features_names is not None:
                    #     print("Selected features: {} {}".format(features_names[ranks][-threshold:], step_features))
                    # else:
                    #     print("Selected features: {}".format(step_features))
                    print('Ended because selected features already selected')
                step_features = None
                break

            # update selected features
            tentative_support_[step_features] = True

            # get the selected features
            X_selected = X[:, features[tentative_support_]]

            start_t = time.time()
            # cross validates to obtain the scores
            y_hat, cv_scores = my_cross_val_predict(clone(self.estimator),
                                                    X_selected,
                                                    y,
                                                    cv=cv)
            # y_hat = cross_val_predict(clone(self.estimator), X_selected, y, cv=cv)

            # compute new target
            target = y - y_hat

            # compute score and confidence interval
            # score = r2_score(y_true=y, y_pred=y_hat, multioutput='uniform_average')  # np.mean(cv_scores)
            if self.verbose > 1:
                print('r2: {}'.format(np.mean(cv_scores, axis=0)))

            score = np.mean(cv_scores)
            if len(cv_scores.shape) > 1:
                cv_scores = np.mean(cv_scores, axis=1)
            m2 = np.mean(cv_scores * cv_scores)
            confidence_interval_or = np.sqrt(
                (m2 - score * score) / (n_splits - 1))

            end_t = time.time() - start_t

            if self.verbose > 0:
                # if features_names is not None:
                print("Selected features: {} {}".format(
                    features_names[step_features], step_features))
                print("Total features: {} {}".format(
                    features_names[tentative_support_],
                    features[tentative_support_]))
                # else:
                #     print("Selected features: {}".format(step_features))
                #     print("Total features: {}".format(features[tentative_support_]))
                print("R2= {} +- {}".format(score, confidence_interval_or))
                print("\nCrossvalidation done in {} s".format(end_t))

            confidence_interval = confidence_interval_or * self.significance  # do not trust confidence interval completely

            # check terminal condition
            if score >= 0 and old_score >= 0:
                proceed = (score - old_score >
                           old_confidence_interval + confidence_interval)
            else:
                proceed = True
            if self.verbose > 1:
                print("PROCEED: {}\n\t{} - {} > {} + {}\n\t{} > {} )".format(
                    proceed, score, old_score, old_confidence_interval,
                    confidence_interval, score - old_score,
                    old_confidence_interval + confidence_interval))

            if proceed or np.sum(current_support_) == 0:
                # the last feature set proved informative: take the new
                # features into account (update the current support)
                current_support_[step_features] = True
                self.features_per_it_.append(features_names[step_features])
                self.scores_.append(score)
                self.scores_confidences_.append(confidence_interval)

                # all the features are selected, stop
                if np.sum(current_support_) == n_features:
                    if self.verbose > 0:
                        print("All the features has been selected.")
                    proceed = False
            else:
                # the last feature set proved uninformative: keep the old
                # support and drop the tentative one (no longer needed)
                del tentative_support_
                if self.verbose > 0:
                    print('Last feature {} not added to the set'.format(
                        features_names[step_features]))

        # Set final attributes
        self.estimator_ = clone(self.estimator)
        # self.estimator_.fit(Xns[:, current_support_], yns)
        self.estimator_.fit(X[:, current_support_], y)

        self.n_features_ = current_support_.sum()
        self.support_ = current_support_
        # self.ranking_ = ranking_

        return self
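At its core this is forward selection on residuals: after each accepted feature, the target becomes y - y_hat from cross-validated predictions, so the next ranking rewards features that explain what is still unexplained. A compact sketch of that idea using only public scikit-learn calls (single-output regression; my_cross_val_predict, which also returns per-fold scores, is replaced here by plain cross_val_predict):

import numpy as np
from sklearn.base import clone
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_predict

def residual_forward_selection(estimator, X, y, n_select, cv=5):
    selected = []
    target = y.copy()
    for _ in range(n_select):
        ranker = clone(estimator).fit(X, target)
        order = np.argsort(np.ravel(ranker.coef_) ** 2)
        best = next(f for f in order[::-1] if f not in selected)
        selected.append(best)
        y_hat = cross_val_predict(clone(estimator), X[:, selected], y, cv=cv)
        target = y - y_hat        # next round ranks against the residual
    return selected

# toy check: features 0 and 1 carry independent signal
rng = np.random.RandomState(0)
X = rng.randn(200, 10)
y = X[:, 0] + 0.5 * X[:, 1] + 0.1 * rng.randn(200)
print([int(f) for f in residual_forward_selection(Ridge(), X, y, 2)])
# likely [0, 1]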
Exemple #24
0
    def fit(self, X, y=None, groups=None, **fit_params):
        """ 
        Apply feature elimination routine, ultimately fitting 
        estimator on the best feature set.

        Args:
            X (array-like, shape = [n_samples, n_features]): input data
            y (array-like, shape = [n_samples, ], [n_samples, n_classes]): targets
            groups (array-like): group labels for the samples used while 
                splitting the dataset into train/test set
            **fit_params (dict of string -> object): parameters passed
                to the `fit` method of the estimator
        """
        X, y = check_X_y(X, y, "csr", ensure_min_features=2)
        cv = check_cv(self.cv, y, is_classifier(self.estimator))
        scorer = check_scoring(self.estimator, scoring=self.scoring)

        n_features = X.shape[1]
        if self.min_features_to_select is None:
            min_features_to_select = n_features // 2
        else:
            min_features_to_select = self.min_features_to_select

        if 0.0 < self.step < 1.0:
            step = int(max(1, self.step * n_features))
        else:
            step = int(self.step)
        if step <= 0:
            raise ValueError("Step must be >0")

        initial_estimator = _clone(self.estimator)
        initial_estimator.fit(X, y, **fit_params)
        if hasattr(initial_estimator, 'coef_'):
            coefs = initial_estimator.coef_
        else:
            coefs = getattr(initial_estimator, 'feature_importances_', None)
        if coefs is None:
            raise RuntimeError('The classifier does not expose '
                               '"coef_" or "feature_importances_" '
                               'attributes')
        if coefs.ndim > 1:
            ranks = np.argsort(safe_sqr(coefs).sum(axis=0))
        else:
            ranks = np.argsort(safe_sqr(coefs))
        ranks = np.ravel(ranks)[:(n_features - min_features_to_select)]

        this_step = 0
        features_to_remove = [np.array([])]
        while this_step < (n_features - min_features_to_select):
            this_step += step
            features_to_remove.append(ranks[:this_step])

        cv_splits_ = list(cv.split(X, y, groups))
        fit_sets = list(product(features_to_remove, cv_splits_))
        base_estimator = _clone(self.estimator)
        if not self.sc:
            parallel = Parallel(
                n_jobs=self.n_jobs, verbose=self.verbose, 
                pre_dispatch=self.pre_dispatch
                )
            scores = parallel(
                delayed(_fit_and_score_one)(
                    index, _clone(base_estimator), X, y, 
                    scorer, train, test, self.verbose,
                    fit_params)
                for index, (train, test) in fit_sets)
            score_sets = _divide_chunks(list(scores), len(cv_splits_))
        else:
            base_estimator_ = self.sc.broadcast(base_estimator)
            partitions = _parse_partitions(self.partitions, len(fit_sets))
            verbose = self.verbose
            scores = (
                self.sc.parallelize(fit_sets, numSlices=partitions)
                # use the broadcast estimator on the workers
                .map(lambda x: [x[0], _fit_and_score_one(
                    x[0], _clone(base_estimator_.value), X, y, scorer,
                    x[1][0], x[1][1], verbose, fit_params)]).collect()
                )
            score_sets = []
            for feat_set in features_to_remove:
                this_set = []
                for row in scores:
                    if (feat_set.shape == row[0].shape) and np.allclose(feat_set, row[0]):
                        this_set.append(row[1])
                score_sets.append(this_set)
            
        self.scores_ = []
        for score_set in score_sets:
            this_score = np.mean(score_set)
            self.scores_.append(this_score)
            
        best_set_ = np.argmax(self.scores_)
        self.best_score_ = self.scores_[best_set_]
        if len(features_to_remove[best_set_]) > 0:
            self.best_features_ = np.delete(
                range(n_features), features_to_remove[best_set_])
        else:
            self.best_features_ = range(n_features)
        self.best_estimator_ = _clone(self.estimator)
        self.best_estimator_.fit(X[:, self.best_features_], y, **fit_params)
        self.n_features_ = len(self.best_features_)
        
        del self.sc
        return self   
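Stripped of the Spark branch and result bookkeeping, the pattern is a flat parallel map over (feature subset x CV split) pairs followed by a per-subset mean. A reduced sketch with standard joblib and scikit-learn calls (the helper names here are illustrative, not the module's private _fit_and_score_one/_divide_chunks):

import numpy as np
from itertools import product
from joblib import Parallel, delayed
from sklearn.base import clone
from sklearn.model_selection import KFold

def score_one(estimator, X, y, keep, train, test):
    # fit on the training rows of the kept columns, score on the test rows
    estimator.fit(X[np.ix_(train, keep)], y[train])
    return estimator.score(X[np.ix_(test, keep)], y[test])

def evaluate_subsets(estimator, X, y, subsets, n_splits=5, n_jobs=-1):
    splits = list(KFold(n_splits).split(X, y))
    scores = Parallel(n_jobs=n_jobs)(
        delayed(score_one)(clone(estimator), X, y, keep, tr, te)
        for keep, (tr, te) in product(subsets, splits))
    # scores come back subset-major, so reshape and average per subset
    return np.asarray(scores).reshape(len(subsets), n_splits).mean(axis=1)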
Exemple #25
0
    def _fit(self, X, y, step_score=None, features_names=None):
        X, y = check_X_y(X,
                         y,
                         accept_sparse=['csr', 'csc', 'coo'],
                         multi_output=True)
        # Initialization
        n_features = X.shape[1]
        features = np.arange(n_features)

        cv = self.cv
        cv = check_cv(cv, y, classifier=is_classifier(self.estimator))
        n_splits = cv.get_n_splits(X, y)

        if self.verbose > 0:
            print("Fitting {0} folds for each of iteration".format(n_splits))

        if 0.0 < self.n_features_step < 1.0:
            step = int(max(1, self.n_features_step * n_features))
        else:
            step = int(self.n_features_step)
        if step <= 0:
            raise ValueError("Step must be >0")

        # if self.force_iterations is None:
        #     force_iteration = False
        # else:
        #     force_iteration = self.force_iterations

        # if step_score is None:
        #     step_score = r2_score

        if features_names is not None:
            features_names = np.array(features_names)
        else:
            if self.features_names is not None:
                features_names = self.features_names
            else:
                features_names = np.arange(n_features)  # use indices

        tentative_support_ = np.zeros(n_features, dtype=bool)
        current_support_ = np.zeros(n_features, dtype=bool)

        self.scores_ = []
        self.features_per_it_ = []

        target = y

        score, confidence_interval = -np.inf, 0
        proceed = True
        while proceed:
            if self.verbose > 0:
                print('\nN-times variance of target: {}'.format(
                    target.var() * target.shape[0]))
            # update values
            old_confidence_interval = confidence_interval
            old_score = score

            if self.verbose > 0:
                print()
                print('Feature ranking')
                print()

            if self.scale:
                target = StandardScaler().fit_transform(target)

            # Rank the remaining features
            rank_estimator = clone(self.estimator)
            rank_estimator.fit(X, target)

            # Get coefs
            if hasattr(rank_estimator, 'coef_'):
                coefs = rank_estimator.coef_
            elif hasattr(rank_estimator, 'feature_importances_'):
                coefs = rank_estimator.feature_importances_
            else:
                raise RuntimeError('The classifier does not expose '
                                   '"coef_" or "feature_importances_" '
                                   'attributes')

            # Get ranks by ordering in ascending way
            if coefs.ndim > 1:
                ranks = np.argsort(safe_sqr(coefs).sum(axis=0))
                coefs = coefs.sum(axis=0)
            else:
                ranks = np.argsort(safe_sqr(coefs))

            # in the sparse case, ranks is a matrix
            ranks = np.ravel(ranks)

            if self.verbose > 0:
                ranked_f = features[ranks]
                if features_names is not None:
                    ranked_n = features_names[ranks]
                else:
                    ranked_n = ['-'] * n_features
                print('{:6}\t{:6}\t{:8}\t{}'.format('Rank', 'Index', 'Score',
                                                    'Feature Name'))
                for i in range(n_features):
                    idx = n_features - i - 1
                    print('#{:6}\t{:6}\t{:6f}\t{}'.format(
                        str(i), str(ranked_f[idx]), coefs[ranks[idx]],
                        ranked_n[idx]))

            if coefs[ranks][-1] < 1e-5:
                if self.verbose > 0:
                    import warnings
                    warnings.warn(
                        'scores are too small to be used, please standardize inputs'
                    )
                break

            # get the best feature (i.e., the last one in the ranking);
            # if the top-ranked feature is already selected, walk down
            # the ranking until an unselected feature is found

            # threshold = step
            # step_features = features[ranks][-threshold:]

            ii = len(features_names) - 1
            step_features = features[ranks][ii]
            while np.all(current_support_[step_features]) and ii > 0:
                ii -= 1
                step_features = features[ranks][ii]

            if np.all(current_support_[step_features]):
                if self.verbose > 0:
                    print("Selected features: {} {}".format(
                        features_names[step_features], step_features))
                    # if features_names is not None:
                    #     print("Selected features: {} {}".format(features_names[ranks][-threshold:], step_features))
                    # else:
                    #     print("Selected features: {}".format(step_features))
                    print('Ended because selected features already selected')
                step_features = None
                break

            # update selected features
            tentative_support_[step_features] = True

            # get the selected features
            X_selected = X[:, features[tentative_support_]]

            # cross validates to obtain the scores
            # cv_scores = cross_val_score(clone(self.estimator), X_selected, y, cv=cv, scoring='r2')
            y_hat = cross_val_predict(clone(self.estimator),
                                      X_selected,
                                      y,
                                      cv=cv)

            # compute new target
            target = y - y_hat

            # compute score and confidence interval
            score = r2_score(
                y_true=y, y_pred=y_hat,
                multioutput='uniform_average')  # np.mean(cv_scores)
            if self.verbose > 0:
                print('r2: {}'.format(
                    r2_score(y_true=y, y_pred=y_hat,
                             multioutput='raw_values')))
            # m2 = np.mean(cv_scores * cv_scores)
            SIGNIFICANCE = 0.0
            confidence_interval = SIGNIFICANCE  # * np.sqrt((m2 - score * score) / (n_splits - 1))

            if self.verbose > 0:
                # if features_names is not None:
                print("Selected features: {} {}".format(
                    features_names[step_features], step_features))
                print("Total features: {} {}".format(
                    features_names[tentative_support_],
                    features[tentative_support_]))
                # else:
                #     print("Selected features: {}".format(step_features))
                #     print("Total features: {}".format(features[tentative_support_]))
                print("R2= {} +- {}".format(score, confidence_interval))

            self.scores_.append(score)
            self.features_per_it_.append(features_names[tentative_support_])

            # check terminal condition
            proceed = score - old_score > old_confidence_interval + confidence_interval

            if proceed or np.sum(current_support_) == 0:
                # the last feature set proved informative: take the new
                # features into account (update the current support)
                current_support_[step_features] = True

                # all the features are selected, stop
                if np.sum(current_support_) == n_features:
                    if self.verbose > 0:
                        print("All the features has been selected.")
                    proceed = False
            else:
                # the last feature set proved uninformative: keep the old
                # support and drop the tentative one (no longer needed)
                del tentative_support_
                if self.verbose > 0:
                    print('Last feature {} not added to the set'.format(
                        features_names[step_features]))

        # Set final attributes
        self.estimator_ = clone(self.estimator)
        # self.estimator_.fit(Xns[:, current_support_], yns)
        self.estimator_.fit(X[:, current_support_], y)

        self.n_features_ = current_support_.sum()
        self.support_ = current_support_
        # self.ranking_ = ranking_

        return self
def f_oneway(*args):
    """Performs a 1-way ANOVA.
    The one-way ANOVA tests the null hypothesis that 2 or more groups have
    the same population mean. The test is applied to samples from two or
    more groups, possibly with differing sizes.
    Read more in the :ref:`User Guide <univariate_feature_selection>`.
    Parameters
    ----------
    *args : array_like, sparse matrices
        sample1, sample2... The sample measurements should be given as
        arguments.
    Returns
    -------
    F-value : float
        The computed F-value of the test.
    p-value : float
        The associated p-value from the F-distribution.
    Notes
    -----
    The ANOVA test has important assumptions that must be satisfied in order
    for the associated p-value to be valid.
    1. The samples are independent
    2. Each sample is from a normally distributed population
    3. The population standard deviations of the groups are all equal. This
       property is known as homoscedasticity.
    If these assumptions are not true for a given set of data, it may still be
    possible to use the Kruskal-Wallis H-test (`scipy.stats.kruskal`_) although
    with some loss of power.
    The algorithm is from Heiman[2], pp.394-7.
    See ``scipy.stats.f_oneway`` that should give the same results while
    being less efficient.
    References
    ----------
    .. [1] Lowry, Richard.  "Concepts and Applications of Inferential
           Statistics". Chapter 14.
           http://faculty.vassar.edu/lowry/ch14pt1.html
    .. [2] Heiman, G.W.  Research Methods in Statistics. 2002.
    """
    n_classes = len(args)
    args = [as_float_array(a) for a in args]
    n_samples_per_class = np.array([a.shape[0] for a in args])
    n_samples = np.sum(n_samples_per_class)
    ss_alldata = sum(safe_sqr(a).sum(axis=0) for a in args)
    sums_args = [np.asarray(a.sum(axis=0)) for a in args]
    square_of_sums_alldata = sum(sums_args)**2
    square_of_sums_args = [s**2 for s in sums_args]
    sstot = ss_alldata - square_of_sums_alldata / float(n_samples)
    ssbn = 0.
    for k, _ in enumerate(args):
        ssbn += square_of_sums_args[k] / n_samples_per_class[k]
    ssbn -= square_of_sums_alldata / float(n_samples)
    sswn = sstot - ssbn
    dfbn = n_classes - 1
    dfwn = n_samples - n_classes
    msb = ssbn / float(dfbn)
    msw = sswn / float(dfwn)
    constant_features_idx = np.where(msw == 0.)[0]
    if (np.nonzero(msb)[0].size != msb.size and constant_features_idx.size):
        warnings.warn("Features %s are constant." % constant_features_idx,
                      UserWarning)
    f = msb / msw
    # flatten matrix to vector in sparse case
    f = np.asarray(f).ravel()
    prob = special.fdtrc(dfbn, dfwn, f)
    return f, prob
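A quick sanity check of this vectorized f_oneway against scipy's scalar version, one feature column at a time (assumes the snippet's surrounding imports: numpy, warnings, scipy's special, and scikit-learn's as_float_array and safe_sqr):

import numpy as np
from scipy import stats

rng = np.random.RandomState(0)
g1 = rng.randn(10, 3)            # three groups, three features each
g2 = rng.randn(12, 3) + 0.5
g3 = rng.randn(8, 3)

F, p = f_oneway(g1, g2, g3)      # vectorized over the 3 feature columns
F0, p0 = stats.f_oneway(g1[:, 0], g2[:, 0], g3[:, 0])
assert np.allclose(F[0], F0) and np.allclose(p[0], p0)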
    def svmFC(self, x, step):
        self.step = step
        self.X, self.y, self.X_val, self.y_val, self.featureNames = self.splitData(
            x=x)
        self.features = self.featureNames

        self.j = 0
        while self.X.shape[1] > 201:
            self.j += 1
            self.svc = SVC(kernel='linear')
            self.Cs = np.array([0.5, 1.0, 10, 100])

            # get the hyperparameters
            self.clf = GridSearchCV(estimator=self.svc,
                                    param_grid=dict(C=self.Cs),
                                    cv=5,
                                    return_train_score=True,
                                    n_jobs=20)
            self.clf.fit(self.X, self.y)

            # do 5-fold cross validation
            self.cv_test_error = []
            self.skf = StratifiedKFold(n_splits=5,
                                       random_state=self.j,
                                       shuffle=True)
            for trn, tst in self.skf.split(self.X, self.y):
                self.train_train, self.train_test = self.X[trn], self.X[tst]
                self.train_clstrs, self.test_clstrs = self.y[trn], self.y[tst]
                self.val_clf = SVC(C=list(self.clf.best_params_.values())[0],
                                   kernel="linear")
                self.val_clf.fit(self.train_train, self.train_clstrs)
                self.cv_test_error.append(
                    self.val_clf.score(self.train_test, self.test_clstrs))
            self.mean_cv_test_error = np.array(self.cv_test_error).mean()

            ## train the classifier used for RFE

            self.rfe_clf = SVC(C=list(self.clf.best_params_.values())[0],
                               kernel="linear")
            self.rfe_clf.fit(self.X, self.y)

            # get coeffs
            self.coefs = self.rfe_clf.coef_

            # get ranks
            if self.coefs.ndim > 1:
                self.ranks = np.argsort(safe_sqr(self.coefs).sum(axis=0))
            else:
                self.ranks = np.argsort(safe_sqr(self.coefs))

            # remove the `step` least important features from the array
            self.to_remove_index = []

            for r in range(self.step):
                self.to_remove_index.append(self.ranks[r])
            self.to_remove_index.sort(reverse=True)

            # remove from largest index to smallest
            for f in self.to_remove_index:
                self.X = np.delete(self.X, f, axis=1)
                self.X_val = np.delete(self.X_val, f, axis=1)
                del self.features[f]

        return self.X, self.y, self.X_val, self.y_val, self.features
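Finally, the ranking idiom that every snippet above repeats, shown in isolation (safe_sqr is scikit-learn's element-wise square, which also handles sparse matrices):

import numpy as np
from sklearn.utils import safe_sqr

coefs = np.array([[0.1, -2.0, 0.5],
                  [0.3,  1.0, -0.2]])    # e.g. one row per one-vs-rest class

# sum the squared weights over classes, then argsort ascending:
# the first index is the weakest feature, i.e. the next to eliminate
ranks = np.argsort(safe_sqr(coefs).sum(axis=0))
print(ranks)   # [0 2 1] -> feature 0 goes first, feature 1 survives longest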