def rfe_single(estimator, dat, n_jobs=3, xg=False, verbose=False):
    # Set up RFE: rank all features by eliminating one feature per step
    rfe = RFE(estimator=estimator, n_features_to_select=1, step=1, verbose=verbose)
    # Extract data: dat is (X_train, y_train, X_test, y_test)
    X_train = dat[0]
    y_train = dat[1]
    X_test = dat[2]
    y_test = dat[3]
    # Set up scorer: held-out accuracy on the currently retained features at each step
    if xg:
        temp_scorer = lambda est, features: accuracy_score(
            y_true=y_test, y_pred=est.predict(X_test.values[:, features]))
    else:
        temp_scorer = lambda est, features: accuracy_score(
            y_true=y_test, y_pred=est.predict(X_test.iloc[:, features]))
    # Fit RFE using the private _fit() so that a per-step score callback can be passed
    rfe._fit(X_train, y_train, temp_scorer)
    # Return per-step scores
    return rfe.scores_
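# --- Hypothetical usage sketch for rfe_single() (illustration only, not part of the pipeline) ---
# Assumes RFE and accuracy_score are imported at module level (as rfe_single() requires),
# that X is passed as a pandas DataFrame, and that sklearn's private RFE._fit() accepts a
# per-step score callback as in the versions this module targets. The helper name
# demo_rfe_single and the synthetic data are illustrative assumptions.
def demo_rfe_single():
    import pandas as pd
    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split

    # build a small synthetic dataset with named feature columns
    X, y = make_classification(n_samples=100, n_features=10, random_state=0)
    X = pd.DataFrame(X, columns=['f{}'.format(i) for i in range(X.shape[1])])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    # dat is expected as (X_train, y_train, X_test, y_test)
    scores = rfe_single(LogisticRegression(), (X_train, y_train, X_test, y_test))
    # scores[i] is the held-out accuracy recorded at the i-th elimination step
    return scores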
def evaluate(args):
    import os
    import pickle
    import numpy as np
    import pandas as pd
    import h5py
    from tqdm import tqdm
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.svm import LinearSVC
    from sklearn.metrics import roc_auc_score, accuracy_score
    from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler
    from sklearn.model_selection import GridSearchCV
    from sklearn.feature_selection import RFE, RFECV
    from sklearn.utils.class_weight import compute_sample_weight
    from sklearn.model_selection import KFold, StratifiedKFold, ShuffleSplit, LeaveOneOut, \
        RepeatedKFold, RepeatedStratifiedKFold, StratifiedShuffleSplit
    from estimators import RobustEstimator

    logger.info('read feature matrix: ' + args.matrix)
    m = pd.read_table(args.matrix, index_col=0, sep='\t')
    feature_names = m.columns.values
    logger.info('{} samples, {} features'.format(m.shape[0], m.shape[1]))
    logger.info('samples: {} ...'.format(str(m.index.values[:3])))
    logger.info('features: {} ...'.format(str(m.columns.values[:3])))

    logger.info('read sample classes: ' + args.sample_classes)
    sample_classes = pd.read_table(args.sample_classes, index_col=0, sep='\t')
    sample_classes = sample_classes.iloc[:, 0]
    sample_classes = sample_classes.loc[m.index.values]
    logger.info('sample_classes: {}'.format(sample_classes.shape[0]))

    # select positive and negative samples
    if (args.positive_class is not None) and (args.negative_class is not None):
        positive_class = args.positive_class.split(',')
        negative_class = args.negative_class.split(',')
    else:
        unique_classes = np.unique(sample_classes.values)
        if len(unique_classes) != 2:
            raise ValueError('expect 2 classes but {} classes found'.format(len(unique_classes)))
        positive_class, negative_class = unique_classes
    positive_class = np.atleast_1d(positive_class)
    negative_class = np.atleast_1d(negative_class)
    logger.info('positive class: {}, negative class: {}'.format(positive_class, negative_class))

    X_pos = m.loc[sample_classes[sample_classes.isin(positive_class)].index.values]
    X_neg = m.loc[sample_classes[sample_classes.isin(negative_class)].index.values]
    logger.info('number of positive samples: {}, negative samples: {}, class ratio: {}'.format(
        X_pos.shape[0], X_neg.shape[0], float(X_pos.shape[0]) / X_neg.shape[0]))
    X = pd.concat([X_pos, X_neg], axis=0)
    y = np.zeros(X.shape[0], dtype=np.int32)
    y[X_pos.shape[0]:] = 1
    del X_pos
    del X_neg
    n_samples, n_features = X.shape
    sample_ids = X.index.values

    if not os.path.isdir(args.output_dir):
        logger.info('create output directory: ' + args.output_dir)
        os.makedirs(args.output_dir)
    logger.info('save sample ids')
    X.index.to_series().to_csv(os.path.join(args.output_dir, 'samples.txt'),
                               sep='\t', header=False, index=False)
    logger.info('save sample classes')
    np.savetxt(os.path.join(args.output_dir, 'classes.txt'), y, fmt='%d')
    # get numpy array from DataFrame
    X = X.values
    # check NaN values
    if np.any(np.isnan(X)):
        logger.info('nan values found in features')

    estimator = None
    grid_search = None
    logger.info('use {} to select features'.format(args.method))
    if args.method == 'logistic_regression':
        estimator = LogisticRegression()
        grid_search = {'C': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e2, 1e3, 1e4, 1e5]}
    elif args.method == 'random_forest':
        estimator = RandomForestClassifier()
        grid_search = {'n_estimators': [25, 50, 75], 'max_depth': list(range(2, 8))}
    elif args.method == 'linear_svm':
        estimator = LinearSVC()
        grid_search = {'C': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e2, 1e3, 1e4, 1e5]}
    else:
        raise ValueError('unknown feature selection method: {}'.format(args.method))

    def get_splitter(splitter, n_splits=5, n_repeats=5, test_size=0.2):
        if splitter == 'kfold':
            return KFold(n_splits=n_splits)
        elif splitter == 'stratified_kfold':
            return StratifiedKFold(n_splits=n_splits)
        elif splitter == 'repeated_stratified_kfold':
            return RepeatedStratifiedKFold(n_splits=n_splits, n_repeats=n_repeats)
        elif splitter == 'shuffle_split':
            return ShuffleSplit(n_splits=n_splits, test_size=test_size)
        elif splitter == 'stratified_shuffle_split':
            return StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size)
        elif splitter == 'leave_one_out':
            return LeaveOneOut()
        else:
            raise ValueError('unknown splitter: {}'.format(splitter))

    def score_function(estimator):
        '''Get the method of an estimator that predicts a continuous score for each sample'''
        if hasattr(estimator, 'predict_proba'):
            return estimator.predict_proba
        elif hasattr(estimator, 'decision_function'):
            return estimator.decision_function
        else:
            raise ValueError(
                'the estimator should have either a predict_proba() or a decision_function() method')

    def feature_importances(estimator):
        '''Get the feature importance attribute of an estimator'''
        if hasattr(estimator, 'coef_'):
            return np.ravel(estimator.coef_)
        elif hasattr(estimator, 'feature_importances_'):
            return np.ravel(estimator.feature_importances_)
        else:
            raise ValueError(
                'the estimator should have either a coef_ or a feature_importances_ attribute')

    def get_scorer(scoring):
        if scoring == 'roc_auc':
            return roc_auc_score
        else:
            raise ValueError('unknown scoring: {}'.format(scoring))

    splitter = get_splitter(args.splitter, n_splits=args.n_splits, n_repeats=args.n_repeats)
    metrics = []
    predictions = np.full((splitter.get_n_splits(X), X.shape[0]), np.nan)
    predicted_labels = np.full((splitter.get_n_splits(X), X.shape[0]), np.nan)
    train_index_matrix = np.zeros((splitter.get_n_splits(X), X.shape[0]), dtype=bool)
    feature_selection_matrix = None
    if args.n_select is not None:
        feature_selection_matrix = np.zeros((splitter.get_n_splits(X), X.shape[1]), dtype=bool)
    if args.rfe:
        # interpret a fractional step as a fraction of the total number of features
        if 0.0 < args.rfe_step < 1.0:
            rfe_step = int(max(1, args.rfe_step * n_features))
        else:
            rfe_step = int(args.rfe_step)
        rfe_scores = None
    i_split = 0
    scorer = get_scorer(args.scorer)
    data_splits = list(splitter.split(X, y))
    # append an extra "split" that trains on all samples to fit the final model
    data_splits.append((np.arange(n_samples), None))
    for train_index, test_index in tqdm(data_splits,
                                        total=splitter.get_n_splits(X) + 1,
                                        unit='fold'):
        X_train, y_train = X[train_index], y[train_index]
        # test_index is None for the final full-data fit appended above
        if test_index is not None:
            X_test, y_test = X[test_index], y[test_index]
        # optimize hyper-parameters
        if grid_search is not None:
            cv = GridSearchCV(estimator, grid_search, cv=5)
            cv.fit(X[train_index], y[train_index])
            estimator = cv.best_estimator_
        sample_weight = np.ones(X_train.shape[0])
        if args.compute_sample_weight:
            sample_weight = compute_sample_weight('balanced', y_train)
        # feature selection
        if args.n_select is not None:
            if args.robust_select:
                resampler_args = {}
                if args.robust_resample_method == 'jackknife':
                    resampler_args = {
                        'max_runs': args.robust_max_runs,
                        'remove': args.robust_jackknife_remove
                    }
                elif args.robust_resample_method == 'bootstrap':
                    resampler_args = {'max_runs': args.robust_max_runs}
                robust_estimator = RobustEstimator(
                    estimator,
                    n_select=args.n_select,
                    resample_method=args.robust_resample_method,
                    rfe=args.rfe,
                    **resampler_args)
                robust_estimator.fit(X_train, y_train, sample_weight=sample_weight)
                estimator = robust_estimator.estimator_
                features = robust_estimator.features_
            # RFE feature selection
            elif args.rfe:
                rfe = RFE(estimator, n_features_to_select=args.n_select, step=rfe_step)
                if i_split < splitter.get_n_splits(X):
                    if args.splitter == 'leave_one_out':
                        # AUC is undefined for only one test sample
                        step_score = lambda estimator, features: np.nan
                    else:
                        step_score = lambda estimator, features: scorer(
                            y_test,
                            score_function(estimator)(X[test_index][:, features])[:, 1])
                else:
                    step_score = None
                rfe._fit(X_train, y_train, step_score=step_score)
                features = np.nonzero(rfe.ranking_ == 1)[0]
                if i_split < splitter.get_n_splits(X):
                    if rfe_scores is None:
                        rfe_n_steps = len(rfe.scores_)
                        rfe_n_features_step = np.maximum(
                            n_features - rfe_step * np.arange(rfe_n_steps), 1)
                        rfe_scores = np.zeros((splitter.get_n_splits(X), rfe_n_steps))
                    rfe_scores[i_split] = rfe.scores_
                estimator = rfe.estimator_
            # neither robust selection nor RFE: select top features by importance
            else:
                # train the model on all features
                estimator.fit(X[train_index], y[train_index], sample_weight=sample_weight)
                features = np.argsort(-feature_importances(estimator))[:args.n_select]
            if i_split < splitter.get_n_splits(X):
                feature_selection_matrix[i_split, features] = True
        else:
            # no feature selection
            features = np.arange(n_features, dtype=np.int64)

        # refit on the selected features only
        estimator.fit(X[train_index][:, features], y[train_index], sample_weight=sample_weight)
        if i_split != splitter.get_n_splits(X):
            predictions[i_split] = score_function(estimator)(X[:, features])[:, 1]
            predicted_labels[i_split] = estimator.predict(X[:, features])
            metric = {}
            metric['train_{}'.format(args.scorer)] = scorer(
                y_train, predictions[i_split, train_index])
            # AUC is undefined for only one test sample
            if args.splitter != 'leave_one_out':
                metric['test_{}'.format(args.scorer)] = scorer(
                    y_test, predictions[i_split, test_index])
            if args.splitter in ('repeated_kfold', 'repeated_stratified_kfold'):
                # n_splits consecutive folds belong to the same repeat
                metric['repeat'] = i_split // args.n_splits
                metric['split'] = i_split % args.n_splits
            else:
                metric['split'] = i_split
            metrics.append(metric)
            train_index_matrix[i_split, train_index] = True
        i_split += 1

    metrics = pd.DataFrame.from_records(metrics)
    if args.splitter == 'leave_one_out':
        # take the diagonal: each sample's prediction from the fold that left it out
        metrics['test_{}'.format(args.scorer)] = scorer(
            y, predictions[np.r_[:n_samples], np.r_[:n_samples]])

    logger.info('save best model')
    with open(os.path.join(args.output_dir, 'best_model.pkl'), 'wb') as f:
        pickle.dump(estimator, f)

    logger.info('save features')
    data = pd.Series(features, index=feature_names[features])
    data.to_csv(os.path.join(args.output_dir, 'features.txt'), sep='\t', header=False)

    logger.info('save feature importances')
    data = pd.Series(feature_importances(estimator), index=feature_names[features])
    data.to_csv(os.path.join(args.output_dir, 'feature_importances.txt'), sep='\t', header=False)

    logger.info('save evaluations')
    with h5py.File(os.path.join(args.output_dir,
                                'evaluation.{}.h5'.format(args.splitter)), 'w') as f:
        f.create_dataset('train_index', data=train_index_matrix)
        f.create_dataset('predictions', data=predictions)
        if feature_selection_matrix is not None:
            f.create_dataset('feature_selection', data=feature_selection_matrix)
        if args.rfe:
            f.create_dataset('rfe_n_features_step', data=rfe_n_features_step)
            f.create_dataset('rfe_scores', data=rfe_scores)
        f.create_dataset('labels', data=y)
        f.create_dataset('predicted_labels', data=predicted_labels)

    logger.info('save metrics')
    metrics.to_csv(os.path.join(args.output_dir, 'metrics.txt'),
                   sep='\t', header=True, index=False)
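# --- Hypothetical invocation sketch (illustration only, not part of the original module) ---
# evaluate() expects an argparse-style namespace. The attribute names below are the ones
# referenced inside evaluate(); the file names and default values shown here are
# illustrative assumptions, not the module's actual CLI defaults.
def _example_evaluate_call():
    from argparse import Namespace
    args = Namespace(
        matrix='feature_matrix.txt',          # tab-separated samples x features table (assumed path)
        sample_classes='sample_classes.txt',  # tab-separated sample -> class table (assumed path)
        positive_class=None,                  # comma-separated class names, or None to infer 2 classes
        negative_class=None,
        output_dir='output',
        method='logistic_regression',         # or 'random_forest', 'linear_svm'
        splitter='stratified_shuffle_split',
        n_splits=5,
        n_repeats=5,
        scorer='roc_auc',
        compute_sample_weight=True,
        n_select=10,                          # number of features to select; None disables selection
        rfe=True,
        rfe_step=0.1,                         # fraction (or count) of features removed per RFE step
        robust_select=False,
        robust_resample_method='bootstrap',
        robust_max_runs=10,
        robust_jackknife_remove=1)
    evaluate(args)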