def compute_imp_score(model, metric, training_features, training_classes, random_state):
    """Compute per-feature importance scores for a fitted model.

    The scores are taken from the first usable source:

    1. ``model.coef_`` -- squared coefficients, summed over axis 0 when
       ``coef_`` has more than one dimension.
    2. ``model.feature_importances_`` when the model has no ``coef_``.
    3. Permutation importance (5 rounds) when neither attribute is
       available, or when the scores obtained above contain NaN.

    Parameters
    ----------
    model: scikit-learn Estimator
        A fitted scikit-learn model
    metric: str, callable
        The metric handed to the permutation-importance routine, e.g.
        'accuracy' for classifiers or 'r2' for regressors, or a custom
        scoring function accepting ``y_true`` and ``y_pred``.
    training_features: np.darray/pd.DataFrame
        Features in training dataset
    training_classes: np.darray/pd.DataFrame
        Target in training dataset
    random_state: int
        Random seed for permutation importances

    Returns
    -------
    coefs: np.darray
        Feature importance scores
    imp_score_type: string
        Importance score type
    """
    if hasattr(model, 'coef_'):
        raw_coefs = model.coef_
        is_multi_row = raw_coefs.ndim > 1
        squared = safe_sqr(raw_coefs)
        if is_multi_row:
            coefs = squared.sum(axis=0)
            imp_score_type = "Sum of Squares of Coefficients"
        else:
            coefs = squared
            imp_score_type = "Squares of Coefficients"
    else:
        coefs = getattr(model, 'feature_importances_', None)
        imp_score_type = "Gini Importance"

    scores_usable = coefs is not None and not np.isnan(coefs).any()
    if scores_usable:
        return coefs, imp_score_type

    # No attribute-based scores (or they contain NaN): estimate by permutation.
    permutation_scores, _ = feature_importance_permutation(
        predict_method=model.predict,
        X=training_features,
        y=training_classes,
        num_rounds=5,
        metric=metric,
        seed=random_state,
    )
    return permutation_scores, "Permutation Feature Importance"
def if_classif(X_y, n_features):
    """Compute the ANOVA F-value per feature from a stream of samples.

    The samples are consumed one at a time, so only per-class running sums
    are kept in memory.

    Parameters
    ----------
    X_y : iterable of (X, y) tuples
        ``X`` is one sample's feature vector of length ``n_features`` and
        ``y`` is its (hashable) class label.
    n_features : int
        Number of features in every ``X``.

    Returns
    -------
    F : array, shape = [n_features,]
        The set of F values
    pval : array, shape = [n_features,]
        The set of p-values

    Raises
    ------
    ValueError
        If ``X_y`` yields no samples.
    """
    n_samples = 0
    n_samples_per_class = defaultdict(int)
    sums_args_d = defaultdict(lambda: np.zeros(shape=(n_features)))
    ss_alldata = np.zeros(shape=(n_features))
    for X, y in X_y:
        if n_samples % 100 == 0:
            logger.info("Processing doc #%d...", n_samples)
        n_samples += 1
        n_samples_per_class[y] += 1
        ss_alldata[:] += X[:]**2
        sums_args_d[y][:] += X[:]

    if n_samples == 0:
        raise ValueError("X_y must yield at least one (X, y) sample")

    n_classes = len(sums_args_d)
    # One row of per-feature sums for every class.
    sums_args = np.array(list(sums_args_d.values()))
    square_of_sums_alldata = safe_sqr(sums_args.sum(axis=0))
    sstot = ss_alldata - square_of_sums_alldata / float(n_samples)

    # Pair each class sum with its own sample count by label, rather than by
    # position in two separately iterated dictionaries.
    ssbn = 0.
    for label, class_sum in sums_args_d.items():
        ssbn += safe_sqr(class_sum) / n_samples_per_class[label]
    ssbn -= square_of_sums_alldata / float(n_samples)

    sswn = sstot - ssbn
    dfbn = n_classes - 1
    dfwn = n_samples - n_classes
    msb = ssbn / float(dfbn)
    msw = sswn / float(dfwn)
    f = msb / msw
    # flatten matrix to vector in sparse case
    f = np.asarray(f).ravel()
    # Survival function of the F distribution; replaces the removed
    # ``stats.fprob(dfbn, dfwn, f)``.
    prob = stats.f.sf(f, dfbn, dfwn)
    return f, prob
def f_regression_cov(X, y, C):
    """Univariate linear regression tests with covariates.

    Quick linear model for testing the effect of a single regressor,
    sequentially for many regressors.

    This is done in 3 steps:

    1. the regressors and the data are orthogonalized with respect to the
       covariates ``C``
    2. the cross correlation between data and regressors is computed
    3. it is converted to an F score then to a p-value

    The inputs are copied before being orthogonalized, so the caller's
    arrays are left untouched.

    Parameters
    ----------
    X : array-like, shape = (n_samples, n_features)
        The set of regressors that will be tested sequentially.
    y : array of shape (n_samples,) or (n_samples, 1)
        The data matrix
    C : array-like, shape = (n_samples, n_covariates)
        The set of covariates.

    Returns
    -------
    F : array, shape=(n_features,)
        F values of features.
    pval : array, shape=(n_features,)
        p-values of F-scores.
    """
    # copy=True: X and y are modified in place below.
    X = check_array(X, dtype=np.float64, copy=True)
    C = check_array(C, dtype=np.float64)
    # ensure_2d=False: accept the documented 1-D y as well as a column.
    y = check_array(y, dtype=np.float64, ensure_2d=False, copy=True)
    y = y.ravel()

    assert C.shape[1] < C.shape[0]
    cpinv = np.linalg.pinv(C)

    # Remove the part of X and y explained by the covariates.
    X -= np.dot(C, np.dot(cpinv, X))
    y -= np.dot(C, np.dot(cpinv, y))

    # compute the correlation
    corr = np.dot(y, X)
    corr /= np.asarray(np.sqrt(safe_sqr(X).sum(axis=0))).ravel()
    corr /= np.asarray(np.sqrt(safe_sqr(y).sum())).ravel()

    # convert to p-value
    dof = X.shape[0] - 1 - C.shape[1]
    F = corr**2 / (1 - corr**2) * dof
    pv = stats.f.sf(F, 1, dof)
    return F, pv
def f_regression_cov(X, y, C):
    """Univariate linear regression tests with covariates.

    Quick linear model for testing the effect of a single regressor,
    sequentially for many regressors.

    This is done in 3 steps:

    1. the regressors and the data are orthogonalized with respect to the
       covariates ``C``
    2. the cross correlation between data and regressors is computed
    3. it is converted to an F score then to a p-value

    Parameters
    ----------
    X : array-like, shape = (n_samples, n_features)
        The set of regressors that will be tested sequentially.
    y : array of shape (n_samples,)
        The data matrix
    C : array-like, shape = (n_samples, n_covariates)
        The set of covariates.

    Returns
    -------
    F : array, shape=(n_features,)
        F values of features.
    pval : array, shape=(n_features,)
        p-values of F-scores.
    """
    # check_arrays returns a *list* of validated arrays, so the result must be
    # unpacked. copy=True because X and y are modified in place below.
    X, C, y = check_arrays(X, C, y, dtype=np.float64, copy=True)
    y = y.ravel()

    assert C.shape[1] < C.shape[0]
    cpinv = np.linalg.pinv(C)

    # Remove the part of X and y explained by the covariates.
    X -= np.dot(C, np.dot(cpinv, X))
    y -= np.dot(C, np.dot(cpinv, y))

    # compute the correlation
    corr = np.dot(y, X)
    corr /= np.asarray(np.sqrt(safe_sqr(X).sum(axis=0))).ravel()
    corr /= np.asarray(np.sqrt(safe_sqr(y).sum())).ravel()

    # convert to p-value
    dof = X.shape[0] - 1 - C.shape[1]
    F = corr ** 2 / (1 - corr ** 2) * dof
    pv = stats.f.sf(F, 1, dof)
    return F, pv
def f_oneway(*args):
    """Perform a one-way ANOVA, feature by feature, across sample groups.

    Parameters
    ----------
    *args : array-like
        One array per group; the first axis of each array indexes samples.

    Returns
    -------
    f : array
        Flattened F statistic, one value per feature.
    prob : array
        Matching p-values from ``special.fdtrc``.

    Warns
    -----
    UserWarning
        When some between-group mean squares are zero and at least one
        feature has a zero within-group mean square.
    """
    n_classes = len(args)
    groups = [as_float_array(group) for group in args]
    n_samples_per_class = np.array([group.shape[0] for group in groups])
    n_samples = np.sum(n_samples_per_class)

    ss_alldata = sum(safe_sqr(group).sum(axis=0) for group in groups)
    sums_args = [np.asarray(group.sum(axis=0)) for group in groups]
    square_of_sums_alldata = sum(sums_args)**2
    # Shared correction term: (grand sum)^2 / N.
    correction = square_of_sums_alldata / float(n_samples)

    sstot = ss_alldata - correction
    ssbn = 0.
    for group_sum, group_size in zip(sums_args, n_samples_per_class):
        ssbn += group_sum**2 / group_size
    ssbn -= correction
    sswn = sstot - ssbn

    dfbn = n_classes - 1
    dfwn = n_samples - n_classes
    msb = ssbn / float(dfbn)
    msw = sswn / float(dfwn)

    constant_features_idx = np.where(msw == 0.)[0]
    some_msb_zero = np.nonzero(msb)[0].size != msb.size
    if some_msb_zero and constant_features_idx.size:
        warnings.warn("Features %s are constant." % constant_features_idx,
                      UserWarning)

    # flatten matrix to vector in sparse case
    f = np.asarray(msb / msw).ravel()
    return f, special.fdtrc(dfbn, dfwn, f)
def read_selected_features_from_pipeline(classification_pipeline, is_sorted=True):
    """Return the features kept by the pipeline's RFE step.

    The RFE step is read from
    ``classification_pipeline.named_steps.selection.named_steps.rfe``.

    Arguments:
        classification_pipeline: fitted pipeline containing the RFE step.
        is_sorted: when true (default), order the selected features by the
            squared coefficients of the RFE step's final estimator, from
            lowest (worst) to highest (best). When false, return them in
            their original order.

    Returns:
        Array of the selected feature names.
    """
    rfe_step = classification_pipeline.named_steps.selection.named_steps.rfe

    # Get selected features
    mask = rfe_step.support_
    selected_features = np.array(
        read_all_features_from_pipeline(classification_pipeline))[mask]
    if not is_sorted:
        return selected_features

    importances = safe_sqr(rfe_step.estimator_.coef_)
    # Only collapse the class axis when there is one; summing a 1-D coef_
    # over axis 0 would reduce it to a single scalar.
    if importances.ndim > 1:
        importances = importances.sum(axis=0)
    # for sparse case the argsort result is a matrix
    sorted_idxs = np.ravel(np.argsort(importances))

    # Sort selected features from bottom (worst) to highest (best)
    return selected_features[sorted_idxs]
def _fit(self, X, y, features_names=None, preload_features=None, ranking_th=0.005):
    """Iteratively select features by ranking them against the residual.

    Each iteration fits a clone of ``self.estimator`` on all features against
    the current target (initially ``y``, afterwards the cross-validated
    residual ``y - y_hat``), picks the top-ranked feature not selected yet,
    and cross-validates the estimator on the enlarged feature set. The
    feature is kept when the score improves by more than the sum of the old
    and new confidence intervals (scaled by ``self.significance``), or when
    nothing has been selected so far; otherwise the search stops.

    Parameters
    ----------
    X, y : arrays validated with ``check_X_y`` (sparse csr/csc/coo allowed,
        multi-output ``y`` allowed).
    features_names : sequence, optional
        Names used for reporting; falls back to ``self.features_names`` and
        then to the column indices.
    preload_features : sequence of int, optional
        Column indices considered selected before the search starts.
    ranking_th : float
        Only affects verbose output: after the first three rows, the ranking
        table stops at the first feature whose score is below this value.

    Returns
    -------
    self
        With ``scores_``, ``scores_confidences_``, ``features_per_it_``,
        ``estimator_``, ``n_features_`` and ``support_`` set.

    Raises
    ------
    ValueError
        If the configured step is not positive.
    RuntimeError
        If the estimator exposes neither ``coef_`` nor
        ``feature_importances_``.
    """
    X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
                     multi_output=True)

    # Initialization
    n_features = X.shape[1]
    features = np.arange(n_features)

    cv = check_cv(self.cv, y, classifier=is_classifier(self.estimator))
    # Older cross-validation objects expose ``n_folds`` instead of
    # ``get_n_splits``; detect the API rather than comparing version strings.
    if hasattr(cv, 'get_n_splits'):
        n_splits = cv.get_n_splits(X, y)
    else:
        n_splits = cv.n_folds

    if self.verbose > 0:
        print("Fitting {0} folds for each of iteration".format(n_splits))

    # The step is validated only; one feature is considered per iteration.
    if 0.0 < self.n_features_step < 1.0:
        step = int(max(1, self.n_features_step * n_features))
    else:
        step = int(self.n_features_step)
    if step <= 0:
        raise ValueError("Step must be >0")

    if features_names is not None:
        features_names = np.array(features_names)
    elif self.features_names is not None:
        # Convert so that fancy indexing below also works for plain lists.
        features_names = np.asarray(self.features_names)
    else:
        features_names = np.arange(n_features)  # use indices

    current_support_ = np.zeros(n_features, dtype=bool)

    self.scores_ = []
    self.scores_confidences_ = []
    self.features_per_it_ = []

    if preload_features is not None:
        preload_features = np.unique(preload_features).astype('int')
        current_support_[preload_features] = True

        X_selected = X[:, features[current_support_]]
        y_hat, cv_scores = my_cross_val_predict(clone(self.estimator),
                                                X_selected, y, cv=cv)
        target = y - y_hat
    else:
        target = y.copy()

    # The tentative set is the confirmed set plus the candidate under test,
    # so it must start from any preloaded features.
    tentative_support_ = current_support_.copy()

    score, confidence_interval = -np.inf, 0
    proceed = np.sum(current_support_) < X.shape[1]
    while proceed:
        if self.verbose > 0:
            print('\nN-times variance of target: {}'.format(
                target.var() * target.shape[0]))
        # update values
        old_confidence_interval = confidence_interval
        old_score = score

        if self.scale:
            target = StandardScaler().fit_transform(
                target.reshape(-1, 1)).ravel()

        if self.verbose > 0:
            print()
            print('Feature ranking')
            print()
            print("target shape: {}".format(target.shape))
            print()

        # Rank the remaining features
        start_t = time.time()
        rank_estimator = clone(self.estimator)
        rank_estimator.fit(X, target)
        end_fit = time.time() - start_t

        # Get coefs
        start_t = time.time()
        if hasattr(rank_estimator, 'coef_'):
            coefs = rank_estimator.coef_
        elif hasattr(rank_estimator, 'feature_importances_'):
            coefs = rank_estimator.feature_importances_
        else:
            raise RuntimeError('The classifier does not expose '
                               '"coef_" or "feature_importances_" '
                               'attributes')
        end_rank = time.time() - start_t

        # Get ranks by ordering in ascending way
        if coefs.ndim > 1:
            ranks = np.argsort(safe_sqr(coefs).sum(axis=0))
            coefs = coefs.sum(axis=0)
        else:
            ranks = np.argsort(safe_sqr(coefs))

        # for sparse case ranks is matrix
        ranks = np.ravel(ranks)

        if self.verbose > 0:
            ranked_f = features[ranks]
            ranked_n = features_names[ranks]
            print('{:6}\t{:6}\t{:8}\t{}'.format('Rank', 'Index', 'Score',
                                                'Feature Name'))
            for i in range(n_features):
                idx = n_features - i - 1
                if (coefs[ranks[idx]] < ranking_th) and (i > 2):
                    print(' ...')
                    break
                print('#{:6}\t{:6}\t{:6f}\t{}'.format(str(i),
                                                      str(ranked_f[idx]),
                                                      coefs[ranks[idx]],
                                                      ranked_n[idx]))
            print(
                "\n Fit done in {} s and rank done in {} s".format(end_fit,
                                                                   end_rank))

        # Walk down the ranking from the best feature until one is found
        # that has not been selected yet.
        ii = len(features_names) - 1
        step_features = features[ranks][ii]
        while np.all(current_support_[step_features]) and ii > 0:
            ii -= 1
            step_features = features[ranks][ii]

        if np.all(current_support_[step_features]):
            if self.verbose > 0:
                print("Selected features: {} {}".format(
                    features_names[step_features], step_features))
                print('Ended because selected features already selected')
            step_features = None
            break

        # update selected features
        tentative_support_[step_features] = True

        # get the selected features
        X_selected = X[:, features[tentative_support_]]

        start_t = time.time()
        # cross validates to obtain the scores
        y_hat, cv_scores = my_cross_val_predict(clone(self.estimator),
                                                X_selected, y, cv=cv)

        # compute new target
        target = y - y_hat

        # compute score and confidence interval
        if self.verbose > 0:
            print('r2: {}'.format(np.mean(cv_scores, axis=0)))

        score = np.mean(cv_scores)
        if len(cv_scores.shape) > 1:
            cv_scores = np.mean(cv_scores, axis=1)
        m2 = np.mean(cv_scores * cv_scores)
        confidence_interval_or = np.sqrt(
            (m2 - score * score) / (n_splits - 1))

        end_t = time.time() - start_t

        if self.verbose > 0:
            print("Selected features: {} {}".format(
                features_names[step_features], step_features))
            print("Total features: {} {}".format(
                features_names[tentative_support_],
                features[tentative_support_]))
            print("R2= {} +- {}".format(score, confidence_interval_or))
            print("\nCrossvalidation done in {} s".format(end_t))

        # do not trust confidence interval completely
        confidence_interval = confidence_interval_or * self.significance

        # check terminal condition
        proceed = (score - old_score
                   > old_confidence_interval + confidence_interval)
        if self.verbose > 0:
            print("PROCEED: {}\n\t{} - {} > {} + {}\n\t{} > {} )".format(
                proceed, score, old_score, old_confidence_interval,
                confidence_interval, score - old_score,
                old_confidence_interval + confidence_interval))

        if proceed or np.sum(current_support_) == 0:
            # last feature set proved to be informative
            # we need to take into account of the new features
            # (update current support)
            current_support_[step_features] = True
            self.features_per_it_.append(features_names[step_features])
            self.scores_.append(score)
            self.scores_confidences_.append(confidence_interval)

            # all the features are selected, stop
            if np.sum(current_support_) == n_features:
                if self.verbose > 0:
                    print("All the features has been selected.")
                proceed = False
        else:
            # last feature set proved to be not informative: keep the old
            # support. ``proceed`` is false here, so the loop ends and the
            # tentative support is not used again.
            if self.verbose > 0:
                print('Last feature {} not added to the set'.format(
                    features_names[step_features]))

    # Set final attributes
    self.estimator_ = clone(self.estimator)
    self.estimator_.fit(X[:, current_support_], y)

    self.n_features_ = current_support_.sum()
    self.support_ = current_support_

    return self
def f_regression_cov_alt(X, y, C):
    """Univariate F-test of each regressor after removing covariates.

    Implementation as derived in tex document.

    See pg 12 of following document for definition of F-statistic
    http://www-stat.stanford.edu/~jtaylo/courses/stats191/notes/simple_diagnostics.pdf

    The function works on private copies of ``X`` and ``y``; the caller's
    arrays are neither modified nor have their flags changed.

    Parameters
    ----------
    X : array-like, shape = (n_samples, n_features)
        The set of regressors that will be tested sequentially.
    y : array of shape (n_samples,)
        The data matrix
    C : array-like, shape = (n_samples, n_covariates)
        The set of covariates.

    Returns
    -------
    F : array, shape=(n_features,)
        F values of features.
    pval : array, shape=(n_features,)
        p-values of F-scores.
    """
    # np.array always copies here, so the in-place operations below cannot
    # reach the caller's data. C is only read.
    X = np.array(X, order="F")
    y = np.array(y).ravel()
    C = np.asarray(C)

    assert C.shape[1] < C.shape[0]
    cpinv = np.linalg.pinv(C)

    # Remove the part of X and y explained by the covariates.
    X -= np.dot(C, np.dot(cpinv, X))  # most expensive line (runtime)
    y -= np.dot(C, np.dot(cpinv, y))

    yS = safe_sqr(y.T.dot(X))  # will create a copy

    # Note: (X*X).sum(0) = X.T.dot(X).diagonal(), computed efficiently
    # see e.g.: http://stackoverflow.com/questions/14758283/is-there-a-numpy-scipy-dot-product-calculating-only-the-diagonal-entries-of-the
    # TODO: make this smarter using either stride tricks or cython
    X *= X
    denom = X.sum(0) * y.T.dot(y) - yS
    F = yS / denom

    # degrees of freedom
    dof = X.shape[0] - 1 - C.shape[1]
    F *= dof

    # convert to p-values
    pv = stats.f.sf(F, 1, dof)
    return F, pv
def _fit(self, X, y, step_score=None):
    """Fit the recursive feature addition (RFA) selector.

    While more than ``n_features_to_select`` features remain in the
    candidate pool, a clone of ``self.estimator`` is fitted on the pool and
    the highest-ranked candidates (largest squared coefficients or
    importances) are moved from the pool to the added set, ``step`` at a
    time. The final estimator is fitted on the added features.

    Parameters
    ----------
    X, y : training data, validated with ``check_X_y`` (sparse "csc"
        accepted).
    step_score : callable, optional
        Controls the calculation of ``self.scores_``. Not exposed to users;
        used when implementing RFACV. Called as
        ``step_score(estimator, features)``. ``self.scores_`` is not
        calculated when ``_fit`` is called through ``fit``.

    Returns
    -------
    self
        With ``estimator_``, ``n_features_``, ``support_`` and ``ranking_``
        set (and ``scores_`` when ``step_score`` is given).

    Raises
    ------
    ValueError
        If the step is not positive.
    RuntimeError
        If the estimator exposes neither ``coef_`` nor
        ``feature_importances_``.
    """
    X, y = check_X_y(X, y, "csc")

    # Initialization
    n_features = X.shape[1]
    if self.n_features_to_select is None:
        n_features_to_select = n_features // 2
    else:
        n_features_to_select = self.n_features_to_select

    if 0.0 < self.step < 1.0:
        step = int(max(1, self.step * n_features))
    else:
        step = int(self.step)
    if step <= 0:
        raise ValueError("Step must be >0")

    # Builtin bool/int: the np.bool / np.int aliases no longer exist in NumPy.
    support_added_ = np.zeros(n_features, dtype=bool)
    support_ = np.ones(n_features, dtype=bool)
    ranking_added_ = np.ones(n_features, dtype=int)

    if step_score:
        self.scores_ = []

    # Adding
    while np.sum(support_) > n_features_to_select:
        # Remaining features
        features = np.arange(n_features)[support_]
        # Added features
        features_added = np.arange(n_features)[support_added_]

        # Compute step score on the previous added features
        if step_score and np.sum(support_added_) > 0:
            estimator_added = clone(self.estimator)
            estimator_added.fit(X[:, features_added], y)
            self.scores_.append(step_score(estimator_added, features_added))

        # Rank the remaining features
        estimator = clone(self.estimator)
        if self.verbose > 0:
            print("Fitting estimator with %d features." % np.sum(support_))

        estimator.fit(X[:, features], y)

        # Get coefs
        if hasattr(estimator, 'coef_'):
            coefs = estimator.coef_
        else:
            coefs = getattr(estimator, 'feature_importances_', None)
        if coefs is None:
            raise RuntimeError('The classifier does not expose '
                               '"coef_" or "feature_importances_" '
                               'attributes')

        # Get ranks
        # For RFA the rank is inverted: argsort on the negated scores puts
        # the best features first.
        if coefs.ndim > 1:
            try:
                ranks = np.argsort(-safe_sqr(coefs).sum(axis=0))
            except ValueError:
                coefs = np.nan_to_num(coefs)
                ranks = np.argsort(-safe_sqr(coefs).sum(axis=0))
        else:
            try:
                ranks = np.argsort(-safe_sqr(coefs))
            except ValueError:
                coefs = np.nan_to_num(coefs)
                ranks = np.argsort(-safe_sqr(coefs))

        # for sparse case ranks is matrix
        ranks = np.ravel(ranks)

        # Add the best features
        threshold = min(step, np.sum(support_) - n_features_to_select)
        best = features[ranks][:threshold]

        # remaining features to test
        support_[best] = False
        # added/ranked features
        support_added_[best] = True
        ranking_added_[np.logical_not(support_added_)] += 1

    # Set final attributes
    features_added = np.arange(n_features)[support_added_]
    self.estimator_ = clone(self.estimator)
    self.estimator_.fit(X[:, features_added], y)

    # Compute step score for the final set of added features
    if step_score:
        self.scores_.append(step_score(self.estimator_, features_added))
    self.n_features_ = support_added_.sum()
    self.support_ = support_added_
    self.ranking_ = ranking_added_

    return self
def _fit(self, X, y, step_score=None):
    """Fit the recursive feature elimination selector.

    Repeatedly fits a clone of ``self.estimator`` on the surviving features
    and drops the lowest-ranked ones (smallest squared coefficients or
    importances) until ``n_features_to_select`` remain.

    When ``self.stepwise_selection`` is true and ``self.step`` is a fraction
    in (0, 1), the number of features dropped is recomputed each iteration
    as that fraction of the features still present (at least one).
    Otherwise a fixed step size is used.

    Parameters
    ----------
    X, y : training data, validated with ``check_X_y`` (sparse "csc"
        accepted).
    step_score : callable, optional
        When given, ``step_score(estimator, features)`` is appended to
        ``self.scores_`` for every iteration and for the final estimator.

    Returns
    -------
    self
        With ``estimator_``, ``n_features_``, ``support_`` and ``ranking_``
        set.

    Raises
    ------
    ValueError
        If the step is not positive.
    RuntimeError
        If the estimator exposes neither ``coef_`` nor
        ``feature_importances_``.
    """
    X, y = check_X_y(X, y, "csc")

    # Initialization
    n_features = X.shape[1]
    if self.n_features_to_select is None:
        n_features_to_select = n_features // 2
    else:
        n_features_to_select = self.n_features_to_select

    if 0.0 < self.step < 1.0:
        if not self.stepwise_selection:
            step = int(max(1, self.step * n_features))
        else:
            # Keep the fraction; the absolute size is derived per iteration.
            step = self.step
    else:
        if self.stepwise_selection:
            warnings.warn(
                "The parameter 'stepwise_selection' is true but "
                "a fixed step size is given. Procedure will "
                " continue as if 'stepwise_selection' is false",
                RuntimeWarning)
        step = int(self.step)
    if step <= 0:
        raise ValueError("Step must be >0")

    if self.estimator_params is not None:
        warnings.warn(
            "The parameter 'estimator_params' is deprecated as "
            "of version 0.16 and will be removed in 0.18. The "
            "parameter is no longer necessary because the value "
            "is set via the estimator initialisation or "
            "set_params method.", DeprecationWarning)

    # Builtin bool/int: the np.bool / np.int aliases no longer exist in NumPy.
    support_ = np.ones(n_features, dtype=bool)
    ranking_ = np.ones(n_features, dtype=int)

    if step_score:
        self.scores_ = []

    # Elimination
    while np.sum(support_) > n_features_to_select:
        # Remaining features
        features = np.arange(n_features)[support_]

        # Rank the remaining features
        estimator = clone(self.estimator)
        if self.estimator_params:
            estimator.set_params(**self.estimator_params)
        if self.verbose > 0:
            print("Fitting estimator with %d features." % np.sum(support_))

        estimator.fit(X[:, features], y)

        # Get coefs
        if hasattr(estimator, 'coef_'):
            coefs = estimator.coef_
        elif hasattr(estimator, 'feature_importances_'):
            coefs = estimator.feature_importances_
        else:
            raise RuntimeError('The classifier does not expose '
                               '"coef_" or "feature_importances_" '
                               'attributes')

        # Get ranks
        if coefs.ndim > 1:
            ranks = np.argsort(safe_sqr(coefs).sum(axis=0))
        else:
            ranks = np.argsort(safe_sqr(coefs))

        # for sparse case ranks is matrix
        ranks = np.ravel(ranks)

        # Eliminate the worse features
        if self.stepwise_selection and 0.0 < step < 1.0:
            # At least one feature must go each round: a fraction of a small
            # remaining set truncates to 0, which would never terminate.
            current_step_size = max(1, int(np.sum(support_) * step))
        else:
            current_step_size = step
        threshold = min(current_step_size,
                        np.sum(support_) - n_features_to_select)

        # Compute step score on the previous selection iteration
        # because 'estimator' must use features
        # that have not been eliminated yet
        if step_score:
            self.scores_.append(step_score(estimator, features))
        support_[features[ranks][:threshold]] = False
        ranking_[np.logical_not(support_)] += 1

    # Set final attributes
    features = np.arange(n_features)[support_]
    self.estimator_ = clone(self.estimator)
    if self.estimator_params:
        self.estimator_.set_params(**self.estimator_params)
    self.estimator_.fit(X[:, features], y)

    # Compute step score when only n_features_to_select features left
    if step_score:
        self.scores_.append(step_score(self.estimator_, features))
    self.n_features_ = support_.sum()
    self.support_ = support_
    self.ranking_ = ranking_

    return self
def main():
    """Run recursive feature elimination experiments and save them as JSON.

    For every train/test split, the feature set is shrunk step by step
    (sizes come from ``subset_sizes``). At each step a scaled, grid-searched
    classifier is fitted on the current subset, its train and test
    predictions are recorded, and the features with the smallest squared
    coefficients are eliminated. Results are written to
    ``<results-path>/rfe_<experiment_id>.json``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--results-path', default='./bucket/results/')
    parser.add_argument('--data')
    # Strip quote characters that may survive shell escaping.
    parser.add_argument('--tissue',
                        type=lambda x: re.sub(r'[\"\']', '', x) if x is not None else None)
    parser.add_argument('--target')
    parser.add_argument('--data-path', default='./bucket/data/')
    parser.add_argument('--verbose', '-v', action='count')
    parser.add_argument('--test-size', type=float, default=0.1)
    parser.add_argument('--n-iter', type=int, default=1)
    parser.add_argument('--n-folds', default=10, type=n_folds_parser)
    parser.add_argument('--clf')
    args = parser.parse_args()

    result = {}
    result.update(args.__dict__)
    start_time = datetime.now().strftime("%d/%m/%y %H:%M:%S")
    result['start_time'] = start_time
    if args.verbose:
        for key in result:
            print('# {}: {}'.format(key, result[key]))
        print('# Running in: ' + gethostname())
        print('# Start: ' + start_time)

    # Random suffix so repeated runs with equal arguments get distinct files.
    experiment_id = hash(json.dumps(result) + str(np.random.rand(10, 1)))
    result_file = join(args.results_path, 'rfe_{}.json'.format(experiment_id))
    if args.verbose:
        print('Results will be saved to {}'.format(result_file))

    load_params = {}
    if args.data == 'epi_ad':
        load_params = {'read_original': True, 'skip_pickle': True}

    data, factors = load(args.data, data_path=args.data_path, log=result,
                         **load_params)
    if args.tissue:
        data = data[factors['source tissue'] == args.tissue]
        factors = factors[factors['source tissue'] == args.tissue]
    target = factors[args.target]

    clf, param_grid = choose_classifier(args.clf, result, args.verbose)

    split = StratifiedShuffleSplit(target, n_iter=args.n_iter,
                                   test_size=args.test_size)
    n_features = data.shape[1]
    n_features_to_select = 9

    preprocess_steps = [('scaler', StandardScaler())]

    # RFE
    d0 = datetime.now()
    result['experiments'] = []
    for i, (train, test) in enumerate(split):
        if args.verbose:
            print('### ITERATION {}'.format(i))
        result['experiments'].append({
            'iteration': i,
            'train_samples': data.index[train].tolist(),
            'subsets': []
        })

        # Builtin bool: the np.bool alias no longer exists in NumPy.
        support_ = np.ones(n_features, dtype=bool)
        for step in subset_sizes(n_features, n_features_to_select):
            if args.verbose:
                print('[{}] Evaluating with {} features and selecting {}.'
                      .format(datetime.now() - d0, np.sum(support_),
                              np.sum(support_) - step))

            # Train with current subset
            pipeline = preprocess_steps + [('grid', GridWithCoef(clf, param_grid, cv=args.n_folds))]
            pipeline = Pipeline(pipeline)

            features = np.arange(n_features)[support_]
            pipeline.fit(data.iloc[train, features], target.iloc[train])

            # Save results for current set of features
            grid = pipeline.steps[-1][1]
            result['experiments'][-1]['subsets'].append({
                # Plain int: NumPy integers are not JSON serializable.
                'n_features': int(np.sum(support_)),
                'features': data.columns[features].tolist(),
                'best_params': grid.best_params_,
                'train': {
                    'y_true': target.iloc[train].tolist(),
                    'y_pred': grid.predict(data.iloc[train, features]).tolist()
                },
                'test': {
                    'y_true': target.iloc[test].tolist(),
                    'y_pred': grid.predict(data.iloc[test, features]).tolist()
                }
            })

            # Select best subset
            coef_ = safe_sqr(grid.coef_)
            if coef_.ndim > 1:
                ranks = np.argsort(coef_.sum(axis=0))
            else:
                ranks = np.argsort(coef_)

            # for sparse case ranks is matrix
            ranks = np.ravel(ranks)

            # Eliminate the worse features
            support_[features[ranks][:step]] = False

        # Store results after every split so partial runs are kept on disk.
        with open(result_file, 'w') as f:
            json.dump(result, f, sort_keys=True, indent=2,
                      separators=(',', ': '))

    if args.verbose:
        print('# OK')
def main():
    """Run a feature-selection experiment and save its trace as JSON.

    Features are eliminated in rounds until one remains. In each round a
    grid-searched classifier is fitted on every train/test split, the
    squared coefficients are accumulated over the splits, and the features
    with the smallest totals are dropped. The test scores and surviving
    features of every round are written to
    ``<results-path>/fs_<experiment_id>.json``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--results-path', default='./bucket/results/')
    parser.add_argument('--data')
    parser.add_argument('--target')
    parser.add_argument('--data-path', default='./bucket/data/')
    parser.add_argument('--verbose', '-v', action='count')
    parser.add_argument('--test-size', type=float, default=0.1)
    parser.add_argument('--n-iter', type=int, default=1)
    parser.add_argument('--n-folds', default=10, type=n_folds_parser)
    parser.add_argument('--clf')
    args = parser.parse_args()

    result = {}
    result.update(args.__dict__)
    if args.verbose:
        for key in result:
            print('# {}: {}'.format(key, result[key]))
        print('# Start: ' + datetime.now().strftime("%d/%m/%y %H:%M:%S"))
    result['selections'] = []

    # Random suffix so repeated runs with equal arguments get distinct files.
    experiment_id = hash(json.dumps(result) + str(np.random.rand(10)))
    result_file = join(args.results_path, 'fs_{}.json'.format(experiment_id))
    if args.verbose:
        print('Results will be saved to {}'.format(result_file))

    data, factors = load(args.data, data_path=args.data_path, log=result)
    target = factors[args.target]

    clf, param_grid = choose_classifier(args.clf, result, args.verbose)
    feature_names = data.columns

    split = StratifiedShuffleSplit(target, n_iter=args.n_iter,
                                   test_size=args.test_size)
    n_features = data.shape[1]
    n_features_to_select = 1

    # Builtin bool: the np.bool alias no longer exists in NumPy.
    support_ = np.ones(n_features, dtype=bool)

    # Elimination
    d0 = datetime.now()
    while np.sum(support_) > n_features_to_select:
        # Step is the largest power of ten below the remaining count; when
        # the count is not a multiple of it, drop the remainder first.
        step = 10**int(np.log10(np.sum(support_) - 1))
        odd_step = np.sum(support_) - step * (np.sum(support_) // step)
        if odd_step > 0:
            step = odd_step

        if args.verbose:
            print('[{}] Selecting best {:d} features.'.format(
                datetime.now() - d0, np.sum(support_) - step))

        # Remaining features
        features = np.arange(n_features)[support_]

        coef_ = None
        test_scores = []
        for train, test in split:
            # Rank the remaining features
            if args.n_folds == 'loo':
                cv = LeaveOneOut(len(train))
            else:
                cv = args.n_folds

            estimator = GridWithCoef(clf, param_grid, cv=cv)
            estimator.fit(data.iloc[train, features], target.iloc[train])

            # Accumulate squared coefficients over the splits.
            if coef_ is None:
                coef_ = safe_sqr(estimator.coef_)
            else:
                coef_ += safe_sqr(estimator.coef_)

            test_scores.append(
                estimator.score(data.iloc[test, features], target.iloc[test]))

        if coef_.ndim > 1:
            ranks = np.argsort(coef_.sum(axis=0))
        else:
            ranks = np.argsort(coef_)

        # for sparse case ranks is matrix
        ranks = np.ravel(ranks)

        # Eliminate the worse features
        threshold = min(step, np.sum(support_) - n_features_to_select)
        support_[features[ranks][:threshold]] = False

        result['selections'].append({
            'scores': test_scores,
            # Plain int: NumPy integers are not JSON serializable.
            'n_features': int(np.sum(support_)),
            'features': feature_names[support_].tolist()
        })

        # Store results after every round so partial runs are kept on disk.
        with open(result_file, 'w') as f:
            json.dump(result, f, sort_keys=True, indent=2,
                      separators=(',', ': '))

    if args.verbose:
        print('# OK')
def _fit(self, X, y, step_score=None):
    """Fit the selector by growing the support from the first feature.

    The support starts with feature 0 only. While fewer than
    ``n_features_to_select`` features are selected, a clone of
    ``self.estimator`` is fitted on the selected features, they are ranked
    by squared coefficient / importance (best first), and the top
    ``threshold`` of them are marked as selected.

    NOTE(review): the ranking only covers features that are already
    selected, so marking them cannot enlarge the support. Candidates
    presumably should come from the not-yet-selected features -- confirm the
    intended algorithm. Until then an explicit error is raised instead of
    looping forever.

    Parameters
    ----------
    X, y : training data (not validated here).
    step_score : callable, optional
        When given, ``step_score(estimator, features)`` is appended to
        ``self.scores_`` for every iteration and for the final estimator.

    Returns
    -------
    self
        With ``estimator_``, ``n_features_``, ``support_`` and ``ranking_``
        set.

    Raises
    ------
    ValueError
        If the step is not positive.
    RuntimeError
        If the estimator exposes neither ``coef_`` nor
        ``feature_importances_``, or if an iteration adds no feature.
    """
    # Initialization
    n_features = X.shape[1]
    if self.n_features_to_select is None:
        n_features_to_select = n_features // 2
    else:
        n_features_to_select = self.n_features_to_select

    if 0.0 < self.step < 1.0:
        step = int(max(1, self.step * n_features))
    else:
        step = int(self.step)
    if step <= 0:
        raise ValueError("Step must be >0")

    if self.estimator_params is not None:
        warnings.warn("The parameter 'estimator_params' is deprecated as "
                      "of version 0.16 and will be removed in 0.18. The "
                      "parameter is no longer necessary because the value "
                      "is set via the estimator initialisation or "
                      "set_params method.", DeprecationWarning)

    # Builtin bool/int: the np.bool / np.int aliases no longer exist in NumPy.
    support_ = np.zeros(n_features, dtype=bool)
    support_[0] = True
    ranking_ = np.zeros(n_features, dtype=int)
    ranking_[0] = 1

    if step_score:
        self.scores_ = []

    while np.sum(support_) < n_features_to_select:
        # Currently selected features
        features = np.arange(n_features)[support_]

        # Rank the selected features
        estimator = clone(self.estimator)
        if self.estimator_params:
            estimator.set_params(**self.estimator_params)
        if self.verbose > 0:
            print("Fitting estimator with %d features." % np.sum(support_))

        estimator.fit(X[:, features], y)

        # Get coefs
        if hasattr(estimator, 'coef_'):
            coefs = estimator.coef_
        elif hasattr(estimator, 'feature_importances_'):
            coefs = estimator.feature_importances_
        else:
            raise RuntimeError('The classifier does not expose '
                               '"coef_" or "feature_importances_" '
                               'attributes')

        # Get ranks, best first
        if coefs.ndim > 1:
            ranks = np.argsort(safe_sqr(coefs).sum(axis=0))[::-1]
        else:
            ranks = np.argsort(safe_sqr(coefs))[::-1]

        # for sparse case ranks is matrix
        ranks = np.ravel(ranks)

        threshold = min(step, n_features_to_select - np.sum(support_))

        # Compute step score on the previous selection iteration
        if step_score:
            self.scores_.append(step_score(estimator, features))

        n_selected_before = np.sum(support_)
        support_[features[ranks][:threshold]] = True
        if np.sum(support_) == n_selected_before:
            # Without progress the loop condition can never become false.
            raise RuntimeError(
                "Feature selection made no progress: no new feature was "
                "added to the %d already selected" % n_selected_before)
        ranking_[np.logical_not(support_)] += 1

    # Set final attributes
    features = np.arange(n_features)[support_]
    self.estimator_ = clone(self.estimator)
    if self.estimator_params:
        self.estimator_.set_params(**self.estimator_params)
    self.estimator_.fit(X[:, features], y)

    # Compute step score for the final selection
    if step_score:
        self.scores_.append(step_score(self.estimator_, features))
    self.n_features_ = support_.sum()
    self.support_ = support_
    self.ranking_ = ranking_

    return self
def _fit(self, X, y, step_score=None):
    """Recursive feature elimination with a step recomputed every iteration.

    Starting from all features, a clone of ``self.estimator`` is fitted on
    the remaining columns, the columns are ranked by squared ``coef_`` /
    ``feature_importances_`` and the lowest-ranked ones are dropped until
    ``n_features_to_select`` remain. A fractional ``self.step`` is applied
    to the number of features still remaining. Once at most
    ``2 * n_features_to_select`` features remain, exactly one feature is
    removed per iteration.

    Parameters
    ----------
    X, y
        Training data and targets, validated with ``check_X_y`` (at least
        two features required).
    step_score : callable or None
        Not meant for end users. When truthy, ``self.scores_`` is reset and
        ``step_score(estimator, features)`` is appended after every fit,
        including the final one.

    Returns
    -------
    self
        With ``estimator_``, ``n_features_``, ``support_`` and ``ranking_``
        set.

    Raises
    ------
    ValueError
        If the resolved step is not strictly positive.
    RuntimeError
        If the fitted estimator exposes neither ``coef_`` nor
        ``feature_importances_``.
    """
    X, y = check_X_y(X, y, "csc", ensure_min_features=2)

    # Initialization
    n_features = X.shape[1]
    if self.n_features_to_select is None:
        n_features_to_select = n_features // 2
    else:
        n_features_to_select = self.n_features_to_select

    # Builtin bool/int: the np.bool / np.int aliases no longer exist in
    # current NumPy releases.
    support_ = np.ones(n_features, dtype=bool)
    ranking_ = np.ones(n_features, dtype=int)

    if step_score:
        self.scores_ = []

    # Elimination
    while np.sum(support_) > n_features_to_select:
        # A fractional step is a share of the features still remaining.
        if 0.0 < self.step < 1.0:
            step = int(max(1, self.step * np.sum(support_)))
        else:
            step = int(self.step)
        if step <= 0:
            raise ValueError("Step must be >0")

        # Remaining features
        features = np.arange(n_features)[support_]

        # Rank the remaining features
        estimator = clone(self.estimator)
        if self.verbose > 0:
            print("Fitting estimator with %d features." % np.sum(support_))

        estimator.fit(X[:, features], y)

        # Get coefs
        if hasattr(estimator, 'coef_'):
            coefs = estimator.coef_
        else:
            coefs = getattr(estimator, 'feature_importances_', None)
        if coefs is None:
            raise RuntimeError('The classifier does not expose '
                               '"coef_" or "feature_importances_" '
                               'attributes')

        # Get ranks, least important first
        if coefs.ndim > 1:
            ranks = np.argsort(safe_sqr(coefs).sum(axis=0))
        else:
            ranks = np.argsort(safe_sqr(coefs))

        # for sparse case ranks is matrix
        ranks = np.ravel(ranks)

        # Close to the target size: remove one feature at a time
        if np.sum(support_) <= 2 * n_features_to_select:
            threshold = 1
        else:
            threshold = min(step, np.sum(support_) - n_features_to_select)

        # Compute step score on the previous selection iteration
        # because 'estimator' must use features
        # that have not been eliminated yet
        if step_score:
            self.scores_.append(step_score(estimator, features))

        # Progress output only when verbosity is requested
        if self.verbose > 0:
            print(threshold)
        support_[features[ranks][:threshold]] = False
        ranking_[np.logical_not(support_)] += 1

    # Set final attributes
    features = np.arange(n_features)[support_]
    self.estimator_ = clone(self.estimator)
    self.estimator_.fit(X[:, features], y)

    # Compute step score when only n_features_to_select features left
    if step_score:
        self.scores_.append(step_score(self.estimator_, features))
    self.n_features_ = support_.sum()
    self.support_ = support_
    self.ranking_ = ranking_

    return self
def _fit(self, X, y, step_score=None):
    """Recursive feature elimination driven by a list of per-iteration steps.

    When ``self.step`` is not a list the parent implementation is used.
    Otherwise entry ``k`` of the list gives how many features to drop in
    iteration ``k`` (a value in ``(0, 1)`` is a share of all features).
    After the list is exhausted its last entry is reused until
    ``n_features_to_select`` features remain.

    Parameters
    ----------
    X, y
        Training data and targets, validated with ``check_X_y``.
    step_score : callable or None
        When truthy, ``self.scores_`` is reset and
        ``step_score(estimator, features)`` is appended after every fit,
        including the final one.

    Returns
    -------
    self
        With ``estimator_``, ``n_features_``, ``support_`` and ``ranking_``
        set.

    Raises
    ------
    ValueError
        If any entry of ``self.step`` is not strictly positive.
    RuntimeError
        If the fitted estimator exposes neither ``coef_`` nor
        ``feature_importances_``.
    """
    if not isinstance(self.step, list):
        return super(DyRFE, self)._fit(X, y, step_score)

    # dynamic step
    X, y = check_X_y(X, y, "csc")

    # Initialization
    n_features = X.shape[1]
    if self.n_features_to_select is None:
        n_features_to_select = n_features // 2
    else:
        n_features_to_select = self.n_features_to_select

    # Resolve every entry to an integer number of features to drop
    step = []
    for s in self.step:
        if s <= 0:
            raise ValueError("Step must be >0")
        if 0.0 < s < 1.0:
            step.append(int(max(1, s * n_features)))
        else:
            step.append(int(s))

    # Builtin bool/int: the np.bool / np.int aliases no longer exist in
    # current NumPy releases.
    support_ = np.ones(n_features, dtype=bool)
    ranking_ = np.ones(n_features, dtype=int)

    if step_score:
        self.scores_ = []

    step_i = 0
    # Elimination
    while np.sum(support_) > n_features_to_select and step_i < len(step):
        # On the last schedule entry: repeat it so the loop keeps going
        # until the target size is reached
        if step_i == len(step) - 1 and step[step_i] != 0:
            step.append(step[step_i])

        # Remaining features
        features = np.arange(n_features)[support_]

        # Rank the remaining features
        estimator = clone(self.estimator)
        if self.verbose > 0:
            print("Fitting estimator with %d features." % np.sum(support_))

        estimator.fit(X[:, features], y)

        # Get coefs
        if hasattr(estimator, 'coef_'):
            coefs = estimator.coef_
        else:
            coefs = getattr(estimator, 'feature_importances_', None)
        if coefs is None:
            raise RuntimeError('The classifier does not expose '
                               '"coef_" or "feature_importances_" '
                               'attributes')

        # Get ranks, least important first
        if coefs.ndim > 1:
            ranks = np.argsort(safe_sqr(coefs).sum(axis=0))
        else:
            ranks = np.argsort(safe_sqr(coefs))

        # for sparse case ranks is matrix
        ranks = np.ravel(ranks)

        # Eliminate the worse features, never going below the target size
        threshold = min(step[step_i],
                        np.sum(support_) - n_features_to_select)

        # Compute step score on the previous selection iteration
        # because 'estimator' must use features
        # that have not been eliminated yet
        if step_score:
            self.scores_.append(step_score(estimator, features))
        support_[features[ranks][:threshold]] = False
        ranking_[np.logical_not(support_)] += 1

        step_i += 1

    # Set final attributes
    features = np.arange(n_features)[support_]
    self.estimator_ = clone(self.estimator)
    self.estimator_.fit(X[:, features], y)

    # Compute step score when only n_features_to_select features left
    if step_score:
        self.scores_.append(step_score(self.estimator_, features))
    self.n_features_ = support_.sum()
    self.support_ = support_
    self.ranking_ = ranking_

    return self
def _fit(self, x_data, y_data, step_score=None, **fit_kwargs):
    """Expand :meth:`_fit` to accept kwargs.

    Recursive feature elimination in which ``fit_kwargs`` are forwarded to
    every ``estimator.fit`` call and ``_update_transformers_param`` is
    invoked with the current support mask before each fit.

    Parameters
    ----------
    x_data, y_data
        Training data and targets, validated with ``check_X_y`` (at least
        two features; non-finite values are rejected unless the estimator
        tags report ``allow_nan``).
    step_score : callable or None
        Not exposed to users; used when implementing AdvancedRFECV. When
        truthy, ``self.scores_`` is reset and
        ``step_score(estimator, features)`` is appended after every fit,
        including the final one.
    **fit_kwargs
        Passed unchanged to ``estimator.fit``.

    Returns
    -------
    self
        With ``estimator_``, ``n_features_``, ``support_`` and ``ranking_``
        set.

    Raises
    ------
    ValueError
        If the resolved step is not strictly positive.
    RuntimeError
        If the fitted estimator exposes neither ``coef_`` nor
        ``feature_importances_``.
    """
    tags = self._get_tags()
    x_data, y_data = check_X_y(
        x_data, y_data, "csc", ensure_min_features=2,
        force_all_finite=not tags.get('allow_nan', True))

    # Initialization
    n_features = x_data.shape[1]
    if self.n_features_to_select is None:
        n_features_to_select = n_features // 2
    else:
        n_features_to_select = self.n_features_to_select

    # A fractional step is a share of all features, otherwise a count.
    if 0.0 < self.step < 1.0:
        step = int(max(1, self.step * n_features))
    else:
        step = int(self.step)
    if step <= 0:
        raise ValueError("Step must be >0")

    # Builtin bool/int: the np.bool / np.int aliases no longer exist in
    # current NumPy releases.
    support_ = np.ones(n_features, dtype=bool)
    ranking_ = np.ones(n_features, dtype=int)

    if step_score:
        self.scores_ = []

    # Elimination
    while np.sum(support_) > n_features_to_select:
        # Remaining features
        features = np.arange(n_features)[support_]

        # Rank the remaining features
        estimator = clone(self.estimator)
        if self.verbose > 0:
            print("Fitting estimator with %d features." % np.sum(support_))

        _update_transformers_param(estimator, support_)
        estimator.fit(x_data[:, features], y_data, **fit_kwargs)

        # Get coefs (hasattr(estimator, 'coef_') raises a KeyError for
        # XGBRegressor models
        try:
            coefs = estimator.coef_
        except (AttributeError, KeyError):
            coefs = getattr(estimator, 'feature_importances_', None)
        if coefs is None:
            raise RuntimeError('The classifier does not expose '
                               '"coef_" or "feature_importances_" '
                               'attributes')

        # Get ranks, least important first
        if coefs.ndim > 1:
            ranks = np.argsort(safe_sqr(coefs).sum(axis=0))
        else:
            ranks = np.argsort(safe_sqr(coefs))

        # for sparse case ranks is matrix
        ranks = np.ravel(ranks)

        # Eliminate the worse features, never going below the target size
        threshold = min(step, np.sum(support_) - n_features_to_select)

        # Compute step score on the previous selection iteration
        # because 'estimator' must use features
        # that have not been eliminated yet
        if step_score:
            self.scores_.append(step_score(estimator, features))
        support_[features[ranks][:threshold]] = False
        ranking_[np.logical_not(support_)] += 1

    # Set final attributes
    features = np.arange(n_features)[support_]
    self.estimator_ = clone(self.estimator)
    _update_transformers_param(self.estimator_, support_)
    self.estimator_.fit(x_data[:, features], y_data, **fit_kwargs)

    # Compute step score when only n_features_to_select features left
    if step_score:
        self.scores_.append(step_score(self.estimator_, features))
    self.n_features_ = support_.sum()
    self.support_ = support_
    self.ranking_ = ranking_

    return self
def _fit(self, X, y, step_score=None):
    """Eliminate features that the remaining, correlated features can predict.

    Features are visited in decreasing order of their mean squared
    correlation with all other features. Each candidate is tentatively
    removed and a clone of ``self.estimator`` is fitted to predict the
    candidate column from the still-supported columns (restricted to those
    whose squared correlation with the candidate exceeds ``self.min_corr``
    when at least one does). If ``estimator.score`` reaches
    ``self.base_score`` the candidate stays removed, otherwise it is
    restored. The loop ends when ``n_features_to_select`` features remain or
    every feature has been visited.

    Parameters
    ----------
    X : array-like
        Training data; wrapped in a ``pandas.DataFrame``.
    y : ignored
        Not used by this implementation.
    step_score : callable or None
        Only resets ``self.scores_`` to an empty list when truthy; no score
        is appended here.

    Returns
    -------
    self
        With ``n_features_``, ``support_`` and ``ranking_`` set.
    """
    # NOTE: X/y validation (check_X_y) is disabled in this variant.
    X = pd.DataFrame(X)
    n_samples, n_features = X.shape
    if self.n_features_to_select is None:
        n_features_to_select = n_features // 2
    else:
        n_features_to_select = self.n_features_to_select

    # Builtin bool/int: the np.bool / np.int aliases no longer exist in
    # current NumPy releases.
    support_ = np.ones(n_features, dtype=bool)
    ranking_ = np.ones(n_features, dtype=int)

    if step_score:
        self.scores_ = []

    # Squared correlation matrix with the diagonal zeroed out; NaN entries
    # are replaced by 1.
    C = np.square(np.corrcoef(X.T) - np.diag(np.ones(X.shape[1])))
    C = np.nan_to_num(C, nan=1)
    coefs = C.mean(axis=1)

    # Visit order: largest score first
    ranks = np.argsort(-safe_sqr(coefs))
    worst_feature = 0

    # Recursive elimination
    i = 1
    while np.sum(support_) > n_features_to_select:
        if worst_feature == n_features:
            break
        candidate = ranks[worst_feature]

        # Tentatively remove the candidate and use its column as the target
        support_[candidate] = False
        X_worse = X.iloc[:, candidate]

        # Negated so that argsort puts the most correlated features first;
        # the unary minus creates a new array, C itself is left untouched.
        correlation_to_worst_feature = -C[:, candidate]
        correlation_to_worst_feature[~support_] = 0
        most_related_features = np.argsort(correlation_to_worst_feature)
        sorted_support = support_[most_related_features]
        # Keep only sufficiently correlated predictors when any exist
        if self.min_corr < np.max(-correlation_to_worst_feature):
            sorted_support = sorted_support & (
                -correlation_to_worst_feature[most_related_features]
                > self.min_corr)
        X_reduced = X.iloc[:, most_related_features[sorted_support]]

        # NOTE(review): if KFold is scikit-learn's, it rejects
        # n_splits > n_samples, so this condition looks inverted -- confirm
        # the intended comparison before changing it.
        if self.n_splits > n_samples:
            skf = KFold(n_splits=self.n_splits, shuffle=True,
                        random_state=self.random_state)
            # Only the first fold is used as a hold-out split
            train_index, val_index = next(iter(skf.split(X_worse)))
            X_train = X_reduced.iloc[train_index]
            X_val = X_reduced.iloc[val_index]
            # Fold indices are positional, so index the Series with .iloc
            y_train = X_worse.iloc[train_index]
            y_val = X_worse.iloc[val_index]
        else:
            # No hold-out: fit and score on the same data
            X_train, X_val = X_reduced, X_reduced
            y_train, y_val = X_worse, X_worse

        # Eliminate predictable features
        if self.verbose > 0:
            print("Fitting estimator with %d features (%d/%d)" %
                  (np.sum(support_), i, n_features))
        i += 1

        estimator = clone(self.estimator)
        estimator.fit(X_train, y_train)
        score = estimator.score(X_val, y_val)
        if score >= self.base_score:
            # Candidate is predictable: keep it removed and push back the
            # rank of every removed feature
            ranking_[np.logical_not(support_)] += 1
        else:
            # Candidate carries its own information: restore it
            support_[candidate] = True
        worst_feature += 1

    self.n_features_ = support_.sum()
    self.support_ = support_
    self.ranking_ = ranking_

    return self
def main():
    """Command-line entry point: eliminate features down to one, logging scores.

    Parses the command line, loads the data set, and repeatedly fits
    ``GridWithCoef`` on every train split, sums the squared coefficients
    over the splits and drops the lowest-ranked features until a single
    feature remains. After each elimination round the accumulated results
    (test scores, remaining feature count and names) are written as JSON to
    ``<results-path>/fs_<experiment id>.json``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--results-path', default='./bucket/results/')
    parser.add_argument('--data')
    parser.add_argument('--target')
    parser.add_argument('--data-path', default='./bucket/data/')
    parser.add_argument('--verbose', '-v', action='count')
    parser.add_argument('--test-size', type=float, default=0.1)
    parser.add_argument('--n-iter', type=int, default=1)
    parser.add_argument('--n-folds', default=10, type=n_folds_parser)
    parser.add_argument('--clf')
    args = parser.parse_args()

    # The parsed arguments are stored alongside the results
    result = {}
    result.update(args.__dict__)
    if args.verbose:
        for key in result:
            print('# {}: {}'.format(key, result[key]))
        print('# Start: ' + datetime.now().strftime("%d/%m/%y %H:%M:%S"))
    result['selections'] = []

    # Random component so that repeated runs get distinct file names
    experiment_id = hash(json.dumps(result) + str(np.random.rand(10)))
    result_file = join(args.results_path, 'fs_{}.json'.format(experiment_id))
    if args.verbose:
        print('Results will be saved to {}'.format(result_file))

    data, factors = load(args.data, data_path=args.data_path, log=result)
    target = factors[args.target]

    clf, param_grid = choose_classifier(args.clf, result, args.verbose)
    feature_names = data.columns

    split = StratifiedShuffleSplit(target, n_iter=args.n_iter,
                                   test_size=args.test_size)
    n_features = data.shape[1]
    n_features_to_select = 1

    # Builtin bool/int: the np.bool / np.int aliases no longer exist in
    # current NumPy releases.
    support_ = np.ones(n_features, dtype=bool)
    ranking_ = np.ones(n_features, dtype=int)

    # Elimination
    d0 = datetime.now()
    while np.sum(support_) > n_features_to_select:
        # Step is the largest power of ten below the remaining count; when
        # the count is not a multiple of it, remove the remainder first.
        step = 10 ** int(np.log10(np.sum(support_) - 1))
        odd_step = np.sum(support_) - step * (np.sum(support_) // step)
        if odd_step > 0:
            step = odd_step
        if args.verbose:
            print('[{}] Selecting best {:d} features.'
                  .format(datetime.now() - d0, np.sum(support_) - step))

        # Remaining features
        features = np.arange(n_features)[support_]

        coef_ = None
        test_scores = []
        for train, test in split:
            # Rank the remaining features
            if args.n_folds == 'loo':
                cv = LeaveOneOut(len(train))
            else:
                cv = args.n_folds
            estimator = GridWithCoef(clf, param_grid, cv=cv)
            estimator.fit(data.iloc[train, features], target.iloc[train])

            # Accumulate squared coefficients over the splits
            if coef_ is None:
                coef_ = safe_sqr(estimator.coef_)
            else:
                coef_ += safe_sqr(estimator.coef_)
            test_scores.append(estimator.score(data.iloc[test, features],
                                               target.iloc[test]))

        if coef_.ndim > 1:
            ranks = np.argsort(coef_.sum(axis=0))
        else:
            ranks = np.argsort(coef_)

        # for sparse case ranks is matrix
        ranks = np.ravel(ranks)

        # Eliminate the worse features
        threshold = min(step, np.sum(support_) - n_features_to_select)
        support_[features[ranks][:threshold]] = False
        ranking_[np.logical_not(support_)] += 1

        result['selections'].append({
            'scores': test_scores,
            # Plain int: NumPy integers are not JSON serializable
            'n_features': int(np.sum(support_)),
            'features': feature_names[support_].tolist()
        })

        # Rewrite the whole result file after every round
        with open(result_file, 'w') as f:
            json.dump(result, f, sort_keys=True, indent=2,
                      separators=(',', ': '))

    if args.verbose:
        print('# OK')
def main():
    """Command-line entry point: RFE over shuffled splits, logging predictions.

    Parses the command line, loads (and optionally tissue-filters) the data
    set, then for every train/test split walks through the subset sizes
    yielded by ``subset_sizes``: a scaler + ``GridWithCoef`` pipeline is
    fitted on the current feature subset, its train/test predictions and
    best parameters are recorded, and the features with the smallest squared
    coefficients are dropped. The accumulated results are written as JSON to
    ``<results-path>/rfe_<experiment id>.json``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--results-path', default='./bucket/results/')
    parser.add_argument('--data')
    # Quotes are stripped from the tissue name
    parser.add_argument('--tissue',
                        type=lambda x: re.sub(r'[\"\']', '', x)
                        if x is not None else None)
    parser.add_argument('--target')
    parser.add_argument('--data-path', default='./bucket/data/')
    parser.add_argument('--verbose', '-v', action='count')
    parser.add_argument('--test-size', type=float, default=0.1)
    parser.add_argument('--n-iter', type=int, default=1)
    parser.add_argument('--n-folds', default=10, type=n_folds_parser)
    parser.add_argument('--clf')
    args = parser.parse_args()

    # The parsed arguments are stored alongside the results
    result = {}
    result.update(args.__dict__)
    start_time = datetime.now().strftime("%d/%m/%y %H:%M:%S")
    result['start_time'] = start_time
    if args.verbose:
        for key in result:
            print('# {}: {}'.format(key, result[key]))
        print('# Running in: ' + gethostname())
        print('# Start: ' + start_time)

    # Random component so that repeated runs get distinct file names
    experiment_id = hash(json.dumps(result) + str(np.random.rand(10, 1)))
    result_file = join(args.results_path,
                       'rfe_{}.json'.format(experiment_id))
    if args.verbose:
        print('Results will be saved to {}'.format(result_file))

    load_params = {}
    if args.data == 'epi_ad':
        load_params = {'read_original': True, 'skip_pickle': True}

    data, factors = load(args.data, data_path=args.data_path, log=result,
                         **load_params)
    if args.tissue:
        data = data[factors['source tissue'] == args.tissue]
        factors = factors[factors['source tissue'] == args.tissue]
    target = factors[args.target]

    clf, param_grid = choose_classifier(args.clf, result, args.verbose)

    split = StratifiedShuffleSplit(target, n_iter=args.n_iter,
                                   test_size=args.test_size)
    n_features = data.shape[1]
    n_features_to_select = 9

    preprocess_steps = [('scaler', StandardScaler())]

    # RFE
    d0 = datetime.now()
    result['experiments'] = []
    for i, (train, test) in enumerate(split):
        if args.verbose:
            print('### ITERATION {}'.format(i))

        result['experiments'].append({
            'iteration': i,
            'train_samples': data.index[train].tolist(),
            'subsets': []
        })

        # Builtin bool/int: the np.bool / np.int aliases no longer exist in
        # current NumPy releases.
        support_ = np.ones(n_features, dtype=bool)
        ranking_ = np.ones(n_features, dtype=int)

        for step in subset_sizes(n_features, n_features_to_select):
            if args.verbose:
                print('[{}] Evaluating with {} features and selecting {}.'.
                      format(datetime.now() - d0, np.sum(support_),
                             np.sum(support_) - step))

            # Train with current subset
            pipeline = preprocess_steps + [
                ('grid', GridWithCoef(clf, param_grid, cv=args.n_folds))
            ]
            pipeline = Pipeline(pipeline)

            features = np.arange(n_features)[support_]
            pipeline.fit(data.iloc[train, features], target.iloc[train])

            # Save results for current set of features
            grid = pipeline.steps[-1][1]
            result['experiments'][-1]['subsets'].append({
                # Plain int: NumPy integers are not JSON serializable
                'n_features': int(np.sum(support_)),
                'features': data.columns[features].tolist(),
                'best_params': grid.best_params_,
                'train': {
                    'y_true': target.iloc[train].tolist(),
                    'y_pred':
                        grid.predict(data.iloc[train, features]).tolist()
                },
                'test': {
                    'y_true': target.iloc[test].tolist(),
                    'y_pred':
                        grid.predict(data.iloc[test, features]).tolist()
                }
            })

            # Select best subset
            coef_ = safe_sqr(grid.coef_)
            if coef_.ndim > 1:
                ranks = np.argsort(coef_.sum(axis=0))
            else:
                ranks = np.argsort(coef_)

            # for sparse case ranks is matrix
            ranks = np.ravel(ranks)

            # Eliminate the worse features
            support_[features[ranks][:step]] = False
            ranking_[np.logical_not(support_)] += 1

            # Store results: rewrite the whole file after every subset
            with open(result_file, 'w') as f:
                json.dump(result, f, sort_keys=True, indent=2,
                          separators=(',', ': '))

    if args.verbose:
        print('# OK')
def _fit(self, X, y, step_score=None):
    """Recursive feature elimination that records every support mask.

    Standard elimination loop: a clone of ``self.estimator`` is fitted on
    the remaining columns, which are ranked by squared ``coef_`` /
    ``feature_importances_``; the lowest-ranked ones are dropped until
    ``n_features_to_select`` remain. Before every elimination, and once
    more at the end, the current mask is appended to ``self.supports`` as a
    list. ``self.supports`` is expected to exist already; it is not created
    here.

    Parameters
    ----------
    X, y
        Training data and targets, validated with ``self._validate_data``
        (sparse "csc" accepted, at least two features, multi-output
        targets allowed).
    step_score : callable or None
        When truthy, ``self.scores_`` is reset and
        ``step_score(estimator, features)`` is appended after every fit,
        including the final one.

    Returns
    -------
    self
        With ``estimator_``, ``n_features_``, ``support_`` and ``ranking_``
        set.

    Raises
    ------
    ValueError
        If the resolved step is not strictly positive.
    RuntimeError
        If the fitted estimator exposes neither ``coef_`` nor
        ``feature_importances_``.
    """
    tags = self._get_tags()
    X, y = self._validate_data(
        X, y, accept_sparse="csc", ensure_min_features=2,
        force_all_finite=not tags.get('allow_nan', True),
        multi_output=True)

    # Initialization
    n_features = X.shape[1]
    if self.n_features_to_select is None:
        n_features_to_select = n_features // 2
    else:
        n_features_to_select = self.n_features_to_select

    # A fractional step is a share of all features, otherwise a count.
    if 0.0 < self.step < 1.0:
        step = int(max(1, self.step * n_features))
    else:
        step = int(self.step)
    if step <= 0:
        raise ValueError("Step must be >0")

    # Builtin bool/int: the np.bool / np.int aliases no longer exist in
    # current NumPy releases.
    support_ = np.ones(n_features, dtype=bool)
    ranking_ = np.ones(n_features, dtype=int)

    if step_score:
        self.scores_ = []

    # Elimination
    while np.sum(support_) > n_features_to_select:
        # Remaining features
        features = np.arange(n_features)[support_]

        # Rank the remaining features
        estimator = clone(self.estimator)
        if self.verbose > 0:
            print("Fitting estimator with %d features." % np.sum(support_))

        # Fit
        estimator.fit(X[:, features], y)

        # Get coefs
        if hasattr(estimator, 'coef_'):
            coefs = estimator.coef_
        else:
            coefs = getattr(estimator, 'feature_importances_', None)
        if coefs is None:
            raise RuntimeError('The classifier does not expose '
                               '"coef_" or "feature_importances_" '
                               'attributes')

        # Get ranks, least important first
        if coefs.ndim > 1:
            ranks = np.argsort(safe_sqr(coefs).sum(axis=0))
        else:
            ranks = np.argsort(safe_sqr(coefs))

        # For sparse case ranks is matrix
        ranks = np.ravel(ranks)

        # Eliminate the worse features, never going below the target size
        threshold = min(step, np.sum(support_) - n_features_to_select)

        # Save support of selected features (snapshot before elimination)
        self.supports.append(list(support_))

        # Compute step score on the previous selection iteration because
        # 'estimator' must use features that have not been eliminated yet
        if step_score:
            self.scores_.append(step_score(estimator, features))
        support_[features[ranks][:threshold]] = False
        ranking_[np.logical_not(support_)] += 1

    # Set final attributes
    features = np.arange(n_features)[support_]
    self.estimator_ = clone(self.estimator)
    self.estimator_.fit(X[:, features], y)

    # Compute step score when only n_features_to_select features left
    if step_score:
        self.scores_.append(step_score(self.estimator_, features))
    self.n_features_ = support_.sum()
    self.support_ = support_
    self.ranking_ = ranking_

    # Save support of selected features (final mask)
    self.supports.append(list(support_))

    return self
def f_regression_cov_alt(X, y, C):
    """Univariate F-test of each feature against y after removing covariates.

    Implementation as derived in tex document

    See pg 12 of following document for definition of F-statistic
    http://www-stat.stanford.edu/~jtaylo/courses/stats191/notes/simple_diagnostics.pdf

    ``X`` and ``y`` are first residualised against the covariates ``C``
    (via the pseudo-inverse of ``C``); the F statistic of every residualised
    feature against the residualised target is then computed.

    The inputs are never modified: all in-place arithmetic happens on
    private copies, and the writeable flags of the caller's arrays are left
    untouched.

    Parameters
    ----------
    X : ndarray, shape = (n_samples, n_features)
        The set of regressors that will tested sequentially.
    y : ndarray of shape(n_samples).
        The data matrix
    C : ndarray, shape = (n_samples, n_covariates)
        The set of covariates. Must have fewer columns than rows.

    Returns
    -------
    F : array, shape=(n_features,)
        F values of features.
    pval : array, shape=(n_features,)
        p-values of F-scores.
    """
    # Private copies: everything below mutates X and y in place.
    y = y.ravel()
    X = X.copy(order="F")
    y = y.copy()

    assert C.shape[1] < C.shape[0]
    cpinv = np.linalg.pinv(C)

    # Remove the part of X and y explained by the covariates
    X -= np.dot(C, (np.dot(cpinv, X)))  # most expensive line (runtime)
    y -= np.dot(C, (np.dot(cpinv, y)))

    yS = safe_sqr(y.T.dot(X))  # will create a copy

    # Note: (X*X).sum(0) = X.T.dot(X).diagonal(), computed efficiently
    # see e.g.: http://stackoverflow.com/questions/14758283/is-there-a-numpy-scipy-dot-product-calculating-only-the-diagonal-entries-of-the
    # TODO: make this smarter using either stride tricks or cython
    X *= X
    denom = X.sum(0) * y.T.dot(y) - yS
    F = yS / denom

    # degrees of freedom
    dof = (X.shape[0] - 1 - C.shape[1]) / (1)  # (df_fm / (df_rm - df_fm))
    F *= dof

    # convert to p-values
    pv = stats.f.sf(F, 1, dof)

    return F, pv
def _fit(self, X, y, features_names=None, preload_features=None,
         ranking_th=0.005):
    """Iteratively add the best-ranked feature while the CV score improves.

    Each iteration fits a clone of ``self.estimator`` on all features
    against the current target (initially ``y``, afterwards the residual
    ``y - y_hat`` of the cross-validated prediction), ranks features by
    squared ``coef_`` / ``feature_importances_`` and tentatively adds the
    highest-ranked feature that is not selected yet. The tentative set is
    cross-validated with ``my_cross_val_predict``; the feature is kept when
    the score gain exceeds the sum of the previous and current confidence
    terms (or when either score is negative, or nothing is selected yet).
    Otherwise the loop stops.

    Parameters
    ----------
    X, y
        Training data and (possibly multi-output) targets, validated with
        ``check_X_y`` / ``check_array``.
    features_names : sequence or None
        Names used in the log output and in ``features_per_it_``. Falls
        back to ``self.features_names`` and then to the feature indices.
    preload_features : sequence of int or None
        Indices of features that are selected before the first iteration.
    ranking_th : float
        Only affects verbose output: the printed ranking is cut at the
        first score below this value (after the first three rows).

    Returns
    -------
    self
        With ``estimator_``, ``n_features_``, ``support_``, ``scores_``,
        ``scores_confidences_`` and ``features_per_it_`` set.

    Raises
    ------
    ValueError
        If the step resolved from ``self.n_features_step`` is not strictly
        positive.
    RuntimeError
        If the fitted estimator exposes neither ``coef_`` nor
        ``feature_importances_``.
    """
    X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
                     multi_output=True)
    y = check_array(y, accept_sparse=['csr', 'csc', 'coo'])

    # Initialization
    n_features = X.shape[1]
    features = np.arange(n_features)

    cv = self.cv
    cv = check_cv(cv, y, classifier=is_classifier(self.estimator))
    if sklearn.__version__ == '0.17':
        n_splits = cv.n_folds
    else:
        n_splits = cv.get_n_splits(X, y)

    if self.verbose > 1:
        print("Fitting {0} folds for each of iteration".format(n_splits))

    # The resolved step is only validated here; one feature is added per
    # iteration regardless of its value.
    if 0.0 < self.n_features_step < 1.0:
        step = int(max(1, self.n_features_step * n_features))
    else:
        step = int(self.n_features_step)
    if step <= 0:
        raise ValueError("Step must be >0")

    if features_names is not None:
        features_names = np.array(features_names)
    else:
        if self.features_names is not None:
            features_names = self.features_names
        else:
            features_names = np.arange(n_features)  # use indices

    # Builtin bool: the np.bool alias no longer exists in current NumPy
    # releases.
    tentative_support_ = np.zeros(n_features, dtype=bool)
    current_support_ = np.zeros(n_features, dtype=bool)

    self.scores_ = []
    self.scores_confidences_ = []
    self.features_per_it_ = []

    if preload_features is not None:
        # Start from the preloaded set and rank against its residual
        preload_features = np.unique(preload_features).astype('int')
        current_support_[preload_features] = True
        tentative_support_[preload_features] = True

        X_selected = X[:, features[current_support_]]
        y_hat, cv_scores = my_cross_val_predict(clone(self.estimator),
                                                X_selected, y, cv=cv)
        target = y - y_hat
    else:
        target = y.copy()

    score, confidence_interval = -np.inf, 0
    proceed = np.sum(current_support_) < X.shape[1]
    while proceed:
        if self.verbose > 1:
            print('\nN-times variance of target: {}'.format(
                target.var() * target.shape[0]))
        # update values
        old_confidence_interval = confidence_interval
        old_score = score

        if self.scale:
            # No ravel, so that multi-dimensional targets are supported
            target = StandardScaler().fit_transform(target)

        if self.verbose > 1:
            print()
            print('Feature ranking')
            print()
            print("target shape: {}".format(target.shape))
            print()

        # Rank the remaining features
        start_t = time.time()
        rank_estimator = clone(self.estimator)
        rank_estimator.fit(X, target)
        end_fit = time.time() - start_t

        # Get coefs
        start_t = time.time()
        if hasattr(rank_estimator, 'coef_'):
            coefs = rank_estimator.coef_
        elif hasattr(rank_estimator, 'feature_importances_'):
            coefs = rank_estimator.feature_importances_
        else:
            raise RuntimeError('The classifier does not expose '
                               '"coef_" or "feature_importances_" '
                               'attributes')
        end_rank = time.time() - start_t

        # Get ranks by ordering in ascending way
        if coefs.ndim > 1:
            ranks = np.argsort(safe_sqr(coefs).sum(axis=0))
            # Collapsed (unsquared) scores, used for the printout below
            coefs = coefs.sum(axis=0)
        else:
            ranks = np.argsort(safe_sqr(coefs))

        # for sparse case ranks is matrix
        ranks = np.ravel(ranks)

        if self.verbose > 1:
            ranked_f = features[ranks]
            if features_names is not None:
                ranked_n = features_names[ranks]
            else:
                ranked_n = ['-'] * n_features
            print('{:6}\t{:6}\t{:8}\t{}'.format('Rank', 'Index', 'Score',
                                                'Feature Name'))
            # Print best-ranked features first
            for i in range(n_features):
                idx = n_features - i - 1
                if (coefs[ranks[idx]] < ranking_th) and (i > 2):
                    print(' ...')
                    break
                print('#{:6}\t{:6}\t{:6f}\t{}'.format(
                    str(i), str(ranked_f[idx]), coefs[ranks[idx]],
                    ranked_n[idx]))
            print("\n Fit done in {} s and rank done in {} s".format(
                end_fit, end_rank))

        # Walk down the ranking from the best feature until one that is
        # not selected yet is found
        ii = len(features_names) - 1
        step_features = features[ranks][ii]
        while np.all(current_support_[step_features]) and ii > 0:
            ii -= 1
            step_features = features[ranks][ii]

        if np.all(current_support_[step_features]):
            # Every ranked feature is already selected: nothing to add
            if self.verbose > 0:
                print("Selected features: {} {}".format(
                    features_names[step_features], step_features))
                print('Ended because selected features already selected')
            step_features = None
            break

        # update selected features
        tentative_support_[step_features] = True

        # get the selected features
        X_selected = X[:, features[tentative_support_]]

        start_t = time.time()
        # cross validates to obtain the scores
        y_hat, cv_scores = my_cross_val_predict(clone(self.estimator),
                                                X_selected, y, cv=cv)

        # compute new target
        target = y - y_hat

        # compute score and confidence interval
        if self.verbose > 1:
            print('r2: {}'.format(np.mean(cv_scores, axis=0)))

        score = np.mean(cv_scores)
        if len(cv_scores.shape) > 1:
            cv_scores = np.mean(cv_scores, axis=1)
        m2 = np.mean(cv_scores * cv_scores)
        confidence_interval_or = np.sqrt(
            (m2 - score * score) / (n_splits - 1))

        end_t = time.time() - start_t

        if self.verbose > 0:
            print("Selected features: {} {}".format(
                features_names[step_features], step_features))
            print("Total features: {} {}".format(
                features_names[tentative_support_],
                features[tentative_support_]))
            print("R2= {} +- {}".format(score, confidence_interval_or))
            print("\nCrossvalidation done in {} s".format(end_t))

        # do not trust confidence interval completely
        confidence_interval = confidence_interval_or * self.significance

        # check terminal condition: negative scores always continue
        if score >= 0 and old_score >= 0:
            proceed = (score - old_score >
                       old_confidence_interval + confidence_interval)
        else:
            proceed = True
        if self.verbose > 1:
            print("PROCEED: {}\n\t{} - {} > {} + {}\n\t{} > {} )".format(
                proceed, score, old_score, old_confidence_interval,
                confidence_interval, score - old_score,
                old_confidence_interval + confidence_interval))

        if proceed or np.sum(current_support_) == 0:
            # last feature set proved to be informative
            # we need to take into account of the new features
            # (update current support)
            current_support_[step_features] = True
            self.features_per_it_.append(features_names[step_features])
            self.scores_.append(score)
            self.scores_confidences_.append(confidence_interval)

            # all the features are selected, stop
            if np.sum(current_support_) == n_features:
                if self.verbose > 0:
                    print("All the features has been selected.")
                proceed = False
        else:
            # last feature set proved to be not informative
            # keep old support and delete the current one
            # (it is no more necessary)
            del tentative_support_
            if self.verbose > 0:
                print('Last feature {} not added to the set'.format(
                    features_names[step_features]))

    # Set final attributes
    self.estimator_ = clone(self.estimator)
    self.estimator_.fit(X[:, current_support_], y)

    self.n_features_ = current_support_.sum()
    self.support_ = current_support_

    return self
def fit(self, X, y=None, groups=None, **fit_params):
    """
    Rank the features once, cross-validate nested candidate subsets and
    refit the estimator on the best-scoring subset.

    The estimator is fitted on all features to obtain a ranking from its
    squared `coef_` / `feature_importances_`. Candidate subsets are built
    by removing the `step`, `2 * step`, ... lowest-ranked features (plus
    the full feature set). Every (subset, cv split) pair is scored either
    with joblib or, when `self.sc` is set, through it.

    Args:
        X (array-like, shape = [n_samples, n_features]): input data
        y (array-like, shape = [n_samples, ], [n_samples, n_classes]):
            targets
        groups (array-like): group labels for the samples used while
            splitting the dataset into train/test set
        **fit_params (dict of string -> object): parameters passed to
            the `fit` method of the estimator

    Returns:
        self, with `scores_`, `best_score_`, `best_features_`,
        `best_estimator_` and `n_features_` set; `self.sc` is deleted.

    Raises:
        ValueError: if the resolved step is not strictly positive
        RuntimeError: if the estimator exposes neither `coef_` nor
            `feature_importances_`
    """
    X, y = check_X_y(X, y, "csr", ensure_min_features=2)
    cv = check_cv(self.cv, y, is_classifier(self.estimator))
    scorer = check_scoring(self.estimator, scoring=self.scoring)
    n_features = X.shape[1]

    min_features_to_select = (
        n_features // 2
        if self.min_features_to_select is None
        else self.min_features_to_select
    )
    n_removable = n_features - min_features_to_select

    step = (
        int(max(1, self.step * n_features))
        if 0.0 < self.step < 1.0
        else int(self.step)
    )
    if step <= 0:
        raise ValueError("Step must be >0")

    # One fit on all features provides the ranking for every subset
    ranking_estimator = _clone(self.estimator)
    ranking_estimator.fit(X, y, **fit_params)

    coefs = (
        ranking_estimator.coef_
        if hasattr(ranking_estimator, 'coef_')
        else getattr(ranking_estimator, 'feature_importances_', None)
    )
    if coefs is None:
        raise RuntimeError('The classifier does not expose '
                           '"coef_" or "feature_importances_" '
                           'attributes')

    squared = safe_sqr(coefs)
    importance = squared.sum(axis=0) if coefs.ndim > 1 else squared
    # Least important first, limited to what may be removed at all
    ranks = np.ravel(np.argsort(importance))[:n_removable]

    # Candidate removals: nothing, then growing prefixes of the ranking
    features_to_remove = [np.array([])] + [
        ranks[:n_dropped]
        for n_dropped in range(step, n_removable + step, step)
    ]

    cv_splits_ = list(cv.split(X,y,groups))
    fit_sets = list(product(features_to_remove, cv_splits_))
    base_estimator = _clone(self.estimator)

    if self.sc:
        base_estimator_ = self.sc.broadcast(base_estimator)
        partitions = _parse_partitions(self.partitions, len(fit_sets))
        verbose = self.verbose
        distributed = self.sc.parallelize(fit_sets, numSlices=partitions)
        scores = distributed.map(lambda x: [x[0], _fit_and_score_one(
            x[0], _clone(base_estimator), X, y, scorer,
            x[1][0], x[1][1], verbose, fit_params)]).collect()

        # Regroup the collected scores by the subset they belong to
        score_sets = [
            [
                row[1] for row in scores
                if (feat_set.shape == row[0].shape)
                and np.allclose(feat_set, row[0])
            ]
            for feat_set in features_to_remove
        ]
    else:
        parallel = Parallel(
            n_jobs=self.n_jobs, verbose=self.verbose,
            pre_dispatch=self.pre_dispatch
            )
        scores = parallel(
            delayed(_fit_and_score_one)(
                index, _clone(base_estimator), X, y,
                scorer, train, test, self.verbose, fit_params)
            for index, (train, test) in fit_sets)
        score_sets = _divide_chunks(list(scores), len(cv_splits_))

    # Mean cross-validation score per candidate subset
    self.scores_ = [np.mean(score_set) for score_set in score_sets]

    best_set_ = np.argmax(self.scores_)
    self.best_score_ = self.scores_[best_set_]

    removed = features_to_remove[best_set_]
    if len(removed) > 0:
        self.best_features_ = np.delete(range(n_features), removed)
    else:
        self.best_features_ = range(n_features)

    self.best_estimator_ = _clone(self.estimator)
    self.best_estimator_.fit(X[:, self.best_features_], y, **fit_params)
    self.n_features_ = len(self.best_features_)
    del self.sc
    return self
def _fit(self, X, y, step_score=None, features_names=None):
    """Greedily add features while the cross-validated R2 keeps improving.

    Each iteration fits a clone of ``self.estimator`` on all of ``X``
    against the current target (``y`` at first, afterwards the residual
    ``y - y_hat``), ranks the features by squared ``coef_`` /
    ``feature_importances_``, and picks the best-ranked feature that is
    not already selected. The candidate set is then evaluated with
    ``cross_val_predict`` and ``r2_score``; the loop continues only while
    the score strictly improves.

    Parameters
    ----------
    X : array-like or sparse matrix, shape = [n_samples, n_features]
        Input data, validated with ``check_X_y``.
    y : array-like
        Targets; multi-output targets are accepted.
    step_score : object, optional
        Unused by this implementation.
    features_names : sequence, optional
        Names used for reporting. Falls back to ``self.features_names``
        and then to the column indices.

    Returns
    -------
    self
        With ``scores_``, ``features_per_it_``, ``estimator_``,
        ``n_features_`` and ``support_`` set.

    Raises
    ------
    ValueError
        If the resolved ``n_features_step`` is not strictly positive.
    RuntimeError
        If the ranking estimator exposes neither ``coef_`` nor
        ``feature_importances_``.
    """
    X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
                     multi_output=True)
    # Initialization
    n_features = X.shape[1]
    features = np.arange(n_features)

    cv = self.cv
    cv = check_cv(cv, y, classifier=is_classifier(self.estimator))
    n_splits = cv.get_n_splits(X, y)

    if self.verbose > 0:
        print("Fitting {0} folds for each of iteration".format(n_splits))

    # `step` is only validated here; the loop below always adds a single
    # feature per iteration.
    if 0.0 < self.n_features_step < 1.0:
        step = int(max(1, self.n_features_step * n_features))
    else:
        step = int(self.n_features_step)
    if step <= 0:
        raise ValueError("Step must be >0")

    # NOTE(review): `self.features_names` is used as given; the fancy and
    # boolean indexing below presumably needs an ndarray -- confirm.
    if features_names is not None:
        features_names = np.array(features_names)
    else:
        if self.features_names is not None:
            features_names = self.features_names
        else:
            features_names = np.arange(n_features)  # use indices

    # `np.bool` was removed from NumPy (1.24); the builtin is equivalent.
    tentative_support_ = np.zeros(n_features, dtype=bool)
    current_support_ = np.zeros(n_features, dtype=bool)

    self.scores_ = []
    self.features_per_it_ = []

    target = y
    score, confidence_interval = -np.inf, 0
    proceed = True
    while proceed:
        if self.verbose > 0:
            print('\nN-times variance of target: {}'.format(
                target.var() * target.shape[0]))
        # Remember the previous iteration's values for the stop test.
        old_confidence_interval = confidence_interval
        old_score = score

        if self.verbose > 0:
            print()
            print('Feature ranking')
            print()

        if self.scale:
            target = StandardScaler().fit_transform(target)

        # Rank all features against the current target.
        rank_estimator = clone(self.estimator)
        rank_estimator.fit(X, target)
        if hasattr(rank_estimator, 'coef_'):
            coefs = rank_estimator.coef_
        elif hasattr(rank_estimator, 'feature_importances_'):
            coefs = rank_estimator.feature_importances_
        else:
            raise RuntimeError('The classifier does not expose '
                               '"coef_" or "feature_importances_" '
                               'attributes')

        # Ascending order of squared coefficient: the best feature is last.
        if coefs.ndim > 1:
            ranks = np.argsort(safe_sqr(coefs).sum(axis=0))
            coefs = coefs.sum(axis=0)
        else:
            ranks = np.argsort(safe_sqr(coefs))

        # for sparse case ranks is matrix
        ranks = np.ravel(ranks)

        if self.verbose > 0:
            ranked_f = features[ranks]
            # features_names is never None at this point.
            ranked_n = features_names[ranks]
            print('{:6}\t{:6}\t{:8}\t{}'.format(
                'Rank', 'Index', 'Score', 'Feature Name'))
            for i in range(n_features):
                idx = n_features - i - 1
                print('#{:6}\t{:6}\t{:6f}\t{}'.format(
                    str(i), str(ranked_f[idx]),
                    coefs[ranks[idx]], ranked_n[idx]))

        # NOTE(review): this compares the signed coefficient of the
        # top-ranked feature, so a large negative coefficient also stops
        # the search -- confirm whether a magnitude test was intended.
        if coefs[ranks][-1] < 1e-5:
            if self.verbose > 0:
                import warnings
                warnings.warn(
                    'scores are too small to be used, '
                    'please standardize inputs')
            break

        # Walk down the ranking until a not-yet-selected feature is found.
        ii = len(features_names) - 1
        step_features = features[ranks][ii]
        while np.all(current_support_[step_features]) and ii > 0:
            ii -= 1
            step_features = features[ranks][ii]

        if np.all(current_support_[step_features]):
            if self.verbose > 0:
                print("Selected features: {} {}".format(
                    features_names[step_features], step_features))
                print('Ended because selected features already selected')
            step_features = None
            break

        # Try the candidate on top of what has been accepted so far.
        tentative_support_[step_features] = True
        X_selected = X[:, features[tentative_support_]]

        y_hat = cross_val_predict(
            clone(self.estimator), X_selected, y, cv=cv)

        # The next ranking round targets what is still unexplained.
        target = y - y_hat

        score = r2_score(
            y_true=y, y_pred=y_hat, multioutput='uniform_average')
        print('r2: {}'.format(
            r2_score(y_true=y, y_pred=y_hat, multioutput='raw_values')))

        # The confidence interval is fixed at zero, so any strict
        # improvement of the score lets the search continue.
        SIGNIFICANCE = 0.0
        confidence_interval = SIGNIFICANCE

        if self.verbose > 0:
            print("Selected features: {} {}".format(
                features_names[step_features], step_features))
            print("Total features: {} {}".format(
                features_names[tentative_support_],
                features[tentative_support_]))
            print("R2= {} +- {}".format(score, confidence_interval))

        self.scores_.append(score)
        self.features_per_it_.append(features_names[tentative_support_])

        # check terminal condition
        proceed = (score - old_score
                   > old_confidence_interval + confidence_interval)
        if proceed or np.sum(current_support_) == 0:
            # The candidate was informative (or nothing has been accepted
            # yet): commit it to the accepted support.
            current_support_[step_features] = True
            # all the features are selected, stop
            if np.sum(current_support_) == n_features:
                if self.verbose > 0:
                    print("All the features has been selected.")
                proceed = False
        else:
            # The candidate was not informative: keep the old support.
            # The loop ends here, so the tentative mask is discarded.
            del tentative_support_
            if self.verbose > 0:
                print('Last feature {} not added to the set'.format(
                    features_names[step_features]))

    # Set final attributes
    self.estimator_ = clone(self.estimator)
    self.estimator_.fit(X[:, current_support_], y)
    self.n_features_ = current_support_.sum()
    self.support_ = current_support_
    return self
def f_oneway(*args):
    """Run a one-way ANOVA across the given groups.

    The null hypothesis of the one-way ANOVA is that two or more groups
    share the same population mean. Groups may have different sizes.

    Read more in the :ref:`User Guide <univariate_feature_selection>`.

    Parameters
    ----------
    *args : array_like, sparse matrices
        sample1, sample2... One argument per group, holding that
        group's sample measurements.

    Returns
    -------
    F-value : float
        The computed F-value of the test.
    p-value : float
        The associated p-value from the F-distribution.

    Notes
    -----
    The p-value is only valid when the assumptions of the ANOVA test
    hold:

    1. The samples are independent
    2. Each sample is from a normally distributed population
    3. The population standard deviations of the groups are all equal.
       This property is known as homoscedasticity.

    When they do not hold, the Kruskal-Wallis H-test
    (`scipy.stats.kruskal`_) may still be applicable, although with some
    loss of power.

    The algorithm is from Heiman[2], pp.394-7.

    See ``scipy.stats.f_oneway`` that should give the same results while
    being less efficient.

    References
    ----------

    .. [1] Lowry, Richard.  "Concepts and Applications of Inferential
           Statistics". Chapter 14.
           http://faculty.vassar.edu/lowry/ch14pt1.html

    .. [2] Heiman, G.W.  Research Methods in Statistics. 2002.

    """
    n_classes = len(args)
    groups = [as_float_array(sample) for sample in args]
    n_samples_per_class = np.array([group.shape[0] for group in groups])
    n_samples = np.sum(n_samples_per_class)

    # Per-feature sum of squares and per-group column sums.
    ss_alldata = sum(safe_sqr(group).sum(axis=0) for group in groups)
    sums_args = [np.asarray(group.sum(axis=0)) for group in groups]
    square_of_sums_alldata = sum(sums_args) ** 2
    square_of_sums_args = [col_sum ** 2 for col_sum in sums_args]

    sstot = ss_alldata - square_of_sums_alldata / float(n_samples)

    # Between-group sum of squares, accumulated in place.
    ssbn = 0.
    for group_sq_sum, group_size in zip(square_of_sums_args,
                                        n_samples_per_class):
        ssbn += group_sq_sum / group_size
    ssbn -= square_of_sums_alldata / float(n_samples)

    # Whatever is not explained between groups lies within them.
    sswn = sstot - ssbn

    dfbn = n_classes - 1
    dfwn = n_samples - n_classes
    msb = ssbn / float(dfbn)
    msw = sswn / float(dfwn)

    constant_features_idx = np.where(msw == 0.)[0]
    msb_has_zero = np.nonzero(msb)[0].size != msb.size
    if msb_has_zero and constant_features_idx.size:
        warnings.warn("Features %s are constant." % constant_features_idx,
                      UserWarning)

    # flatten matrix to vector in sparse case
    f = np.asarray(msb / msw).ravel()
    prob = special.fdtrc(dfbn, dfwn, f)
    return f, prob
def svmFC(self, x, step):
    """Eliminate features with a linear SVM until at most 201 remain.

    The data returned by ``self.splitData(x=x)`` is reduced in rounds.
    Every round tunes ``C`` with a 5-fold grid search, records the mean
    5-fold score of an SVM using that ``C``, refits on all of ``self.X``
    and drops the ``step`` columns whose summed squared coefficients are
    smallest. The same columns are dropped from ``self.X_val`` and the
    matching entries from ``self.features``.

    Intermediate objects of the last round (``clf``, ``rfe_clf``,
    ``coefs``, ``ranks``, ``cv_test_error``, ``mean_cv_test_error``, ...)
    stay available as attributes on the instance.

    Parameters
    ----------
    x : object
        Forwarded unchanged to ``self.splitData``.
    step : int
        Number of features removed per round.

    Returns
    -------
    tuple
        ``(self.X, self.y, self.X_val, self.y_val, self.features)`` after
        elimination.

    Raises
    ------
    ValueError
        If elimination is required (more than 201 columns) but ``step``
        is not strictly positive; the loop could then never terminate.
    """
    self.step = step
    self.X, self.y, self.X_val, self.y_val, self.featureNames = \
        self.splitData(x=x)
    # Same object as featureNames: deletions below affect both names.
    self.features = self.featureNames
    self.j = 0

    # A non-positive step removes nothing, so the loop would spin forever.
    if self.step <= 0 and self.X.shape[1] > 201:
        raise ValueError("Step must be >0")

    while self.X.shape[1] > 201:
        self.j += 1
        self.svc = SVC(kernel='linear')
        self.Cs = np.array([0.5, 1.0, 10, 100])

        # Tune the only hyperparameter, C, on the current feature set.
        self.clf = GridSearchCV(estimator=self.svc,
                                param_grid=dict(C=self.Cs),
                                cv=5,
                                return_train_score=True,
                                n_jobs=20)
        self.clf.fit(self.X, self.y)
        best_c = self.clf.best_params_["C"]

        # 5-fold cross validation with the tuned C. Despite the attribute
        # name, the stored values are what `SVC.score` returns.
        self.cv_test_error = []
        self.skf = StratifiedKFold(n_splits=5,
                                   random_state=self.j,
                                   shuffle=True)
        for trn, tst in self.skf.split(self.X, self.y):
            self.train_train, self.train_test = self.X[trn], self.X[tst]
            self.train_clstrs, self.test_clstrs = self.y[trn], self.y[tst]
            self.val_clf = SVC(C=best_c, kernel="linear")
            self.val_clf.fit(self.train_train, self.train_clstrs)
            self.cv_test_error.append(
                self.val_clf.score(self.train_test, self.test_clstrs))
        self.mean_cv_test_error = np.array(self.cv_test_error).mean()

        # Classifier whose coefficients drive the elimination.
        self.rfe_clf = SVC(C=best_c, kernel="linear")
        self.rfe_clf.fit(self.X, self.y)
        self.coefs = self.rfe_clf.coef_

        # Ascending order: least important features come first.
        if self.coefs.ndim > 1:
            self.ranks = np.argsort(safe_sqr(self.coefs).sum(axis=0))
        else:
            self.ranks = np.argsort(safe_sqr(self.coefs))

        # Descending order keeps the remaining list positions valid while
        # entries are deleted from `self.features` one by one.
        self.to_remove_index = sorted(
            (self.ranks[r] for r in range(self.step)), reverse=True)

        # One copy per matrix instead of one copy per removed column.
        self.X = np.delete(self.X, self.to_remove_index, axis=1)
        self.X_val = np.delete(self.X_val, self.to_remove_index, axis=1)
        for f in self.to_remove_index:
            del self.features[f]

    return self.X, self.y, self.X_val, self.y_val, self.features