# factors = factors[condition]

split = StratifiedShuffleSplit(target, n_iter=args.n_iter, test_size=args.test_size)
result['split'] = {
    'type': 'StratifiedShuffleSplit',
    'n_iter': args.n_iter,
    'test_size': args.test_size
}
result['cross_val'] = {'n_folds': args.n_folds}

steps = []
clf, param_grid = choose_classifier(args.clf)

result['results'] = {'accuracy': {'train': [], 'test': []}}

steps.append('clf')

if args.verbose:
    print('{:<7} {:<7} {}'.format('Train', 'Test', 'Time'))

t0 = time()
for train, test in split:
    if args.n_folds == 'loo':
        cv = LeaveOneOut(len(train))
    else:
        cv = args.n_folds

    grid = GridSearchCV(clf, param_grid=param_grid, cv=cv, n_jobs=-1)
    steps[-1] = ('clf', grid)
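# NOTE: `n_folds_parser` is used as the argparse `type` for --n-folds in the
# scripts below, but its definition is not part of this extract. A minimal
# sketch, assuming it keeps the special value 'loo' (leave-one-out, handled by
# the branch above) as a string and converts anything else to an integer fold
# count:
def n_folds_parser(value):
    """Parse --n-folds: the literal 'loo' or an integer number of folds."""
    if value == 'loo':
        return value
    return int(value)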
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--results-path', default='./bucket/results/')
    parser.add_argument('--data')
    parser.add_argument('--tissue',
                        type=lambda x: re.sub(r'[\"\']', '', x) if x is not None else None)
    parser.add_argument('--target')
    parser.add_argument('--data-path', default='./bucket/data/')
    parser.add_argument('--verbose', '-v', action='count')
    parser.add_argument('--test-size', type=float, default=0.1)
    parser.add_argument('--n-iter', type=int, default=1)
    parser.add_argument('--n-folds', default=10, type=n_folds_parser)
    parser.add_argument('--clf')
    args = parser.parse_args()

    result = {}
    result.update(args.__dict__)

    start_time = datetime.now().strftime("%d/%m/%y %H:%M:%S")
    result['start_time'] = start_time

    if args.verbose:
        for key in result:
            print('# {}: {}'.format(key, result[key]))
        print('# Running in: ' + gethostname())
        print('# Start: ' + start_time)

    experiment_id = hash(json.dumps(result) + str(np.random.rand(10, 1)))
    result_file = join(args.results_path, 'rfe_{}.json'.format(experiment_id))
    if args.verbose:
        print('Results will be saved to {}'.format(result_file))

    load_params = {}
    if args.data == 'epi_ad':
        load_params = {'read_original': True, 'skip_pickle': True}
    data, factors = load(args.data, data_path=args.data_path, log=result, **load_params)

    if args.tissue:
        data = data[factors['source tissue'] == args.tissue]
        factors = factors[factors['source tissue'] == args.tissue]

    target = factors[args.target]

    clf, param_grid = choose_classifier(args.clf, result, args.verbose)

    split = StratifiedShuffleSplit(target, n_iter=args.n_iter, test_size=args.test_size)

    n_features = data.shape[1]
    n_features_to_select = 9

    preprocess_steps = [('scaler', StandardScaler())]

    # RFE
    d0 = datetime.now()
    result['experiments'] = []
    for i, (train, test) in enumerate(split):
        if args.verbose:
            print('### ITERATION {}'.format(i))

        result['experiments'].append({
            'iteration': i,
            'train_samples': data.index[train].tolist(),
            'subsets': []
        })

        support_ = np.ones(n_features, dtype=np.bool)
        ranking_ = np.ones(n_features, dtype=np.int)

        for step in subset_sizes(n_features, n_features_to_select):
            if args.verbose:
                print('[{}] Evaluating with {} features and selecting {}.'
                      .format(datetime.now() - d0, np.sum(support_),
                              np.sum(support_) - step))

            # Train with current subset
            pipeline = preprocess_steps + [
                ('grid', GridWithCoef(clf, param_grid, cv=args.n_folds))
            ]
            pipeline = Pipeline(pipeline)

            features = np.arange(n_features)[support_]
            pipeline.fit(data.iloc[train, features], target.iloc[train])

            # Save results for current set of features
            grid = pipeline.steps[-1][1]
            result['experiments'][-1]['subsets'].append({
                'n_features': np.sum(support_),
                'features': data.columns[features].tolist(),
                'best_params': grid.best_params_,
                'train': {
                    'y_true': target.iloc[train].tolist(),
                    'y_pred': grid.predict(data.iloc[train, features]).tolist()
                },
                'test': {
                    'y_true': target.iloc[test].tolist(),
                    'y_pred': grid.predict(data.iloc[test, features]).tolist()
                }
            })

            # Select best subset
            coef_ = safe_sqr(grid.coef_)
            if coef_.ndim > 1:
                ranks = np.argsort(coef_.sum(axis=0))
            else:
                ranks = np.argsort(coef_)

            # for the sparse case, ranks is a matrix
            ranks = np.ravel(ranks)

            # Eliminate the worst features
            support_[features[ranks][:step]] = False
            ranking_[np.logical_not(support_)] += 1

            # Store results
            with open(result_file, 'w') as f:
                json.dump(result, f, sort_keys=True, indent=2,
                          separators=(',', ': '))

    if args.verbose:
        print('# OK')
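# NOTE: `subset_sizes` is not defined in this extract. A plausible sketch,
# assuming it yields, at each elimination round, how many features to drop:
# first trim the remaining count down to a multiple of the largest power of
# ten below it, then remove one power-of-ten block at a time until
# `n_features_to_select` remain. This mirrors the explicit while-loop step
# logic used in the elimination script further below (assumes numpy is
# imported as np, as in the rest of these scripts).
def subset_sizes(n_features, n_features_to_select):
    remaining = n_features
    while remaining > n_features_to_select:
        step = 10 ** int(np.log10(remaining - 1))
        odd_step = remaining - step * (remaining // step)
        if odd_step > 0:
            step = odd_step
        step = min(step, remaining - n_features_to_select)
        yield step
        remaining -= step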
print('# {}: {}'.format(key, result[key]))
print('# Running in: ' + gethostname())
print('# Start: ' + start_time)
sys.stdout.flush()

load_params = {}
if result['data'] == 'epi_ad':
    load_params = {'read_original': True, 'skip_pickle': True}
data, factors = load(result['data'], data_path=args.data_path, log=result, **load_params)

if result['tissue']:
    data = data[factors['source tissue'] == result['tissue']]
    factors = factors[factors['source tissue'] == result['tissue']]

target = factors[result['target']]

clf, param_grid = choose_classifier(args.clf, result, args.verbose)

train_samples = np.ones(data.shape[0], dtype=np.bool)
train_samples[result['test_samples']] = False

train_data = data.loc[train_samples, :]
train_target = target.loc[train_samples]
test_data = data.iloc[result['test_samples'], :]
test_target = target.iloc[result['test_samples']]

experiment_id = hash(json.dumps(result) + str(np.random.rand(10, 1)))
result_file = join(args.results_path, 'mrmr_{}.json'.format(experiment_id))
if args.verbose:
    print('Results will be saved to {}'.format(result_file))
    sys.stdout.flush()
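# NOTE: `choose_classifier` is not shown in this extract. A minimal sketch,
# assuming it maps a classifier name to an estimator plus a grid-search
# parameter grid and optionally records the choice in the `result` log; the
# 'svm' / 'logistic' names and grids here are illustrative placeholders, not
# the original configuration.
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

def choose_classifier(name, log=None, verbose=False):
    options = {
        'svm': (LinearSVC(), {'C': [0.01, 0.1, 1, 10]}),
        'logistic': (LogisticRegression(), {'C': [0.01, 0.1, 1, 10]}),
    }
    clf, param_grid = options[name]
    if log is not None:
        log['classifier'] = {'name': name, 'params': param_grid}
    if verbose:
        print('# Classifier: {}'.format(clf.__class__.__name__))
    return clf, param_grid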
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--results-path', default='./bucket/results/')
    parser.add_argument('--data')
    parser.add_argument('--target')
    parser.add_argument('--data-path', default='./bucket/data/')
    parser.add_argument('--verbose', '-v', action='count')
    parser.add_argument('--test-size', type=float, default=0.1)
    parser.add_argument('--n-iter', type=int, default=1)
    parser.add_argument('--n-folds', default=10, type=n_folds_parser)
    parser.add_argument('--clf')
    args = parser.parse_args()

    result = {}
    result.update(args.__dict__)

    if args.verbose:
        for key in result:
            print('# {}: {}'.format(key, result[key]))
        print('# Start: ' + datetime.now().strftime("%d/%m/%y %H:%M:%S"))

    result['selections'] = []

    experiment_id = hash(json.dumps(result) + str(np.random.rand(10)))
    result_file = join(args.results_path, 'fs_{}.json'.format(experiment_id))
    if args.verbose:
        print('Results will be saved to {}'.format(result_file))

    data, factors = load(args.data, data_path=args.data_path, log=result)
    target = factors[args.target]

    clf, param_grid = choose_classifier(args.clf, result, args.verbose)

    feature_names = data.columns
    split = StratifiedShuffleSplit(target, n_iter=args.n_iter, test_size=args.test_size)

    n_features = data.shape[1]
    n_features_to_select = 1

    support_ = np.ones(n_features, dtype=np.bool)
    ranking_ = np.ones(n_features, dtype=np.int)

    # Elimination
    t0 = time()
    d0 = datetime.now()
    while np.sum(support_) > n_features_to_select:
        # Drop down to the next "round" number of features: either the largest
        # power of ten below the current count, or the leftover above that
        # multiple.
        step = 10 ** int(np.log10(np.sum(support_) - 1))
        odd_step = np.sum(support_) - step * (np.sum(support_) // step)
        if odd_step > 0:
            step = odd_step

        if args.verbose:
            print('[{}] Selecting best {:d} features.'
                  .format(datetime.now() - d0, np.sum(support_) - step))

        # Remaining features
        features = np.arange(n_features)[support_]

        coef_ = None
        test_scores = []
        for train, test in split:
            # Rank the remaining features
            if args.n_folds == 'loo':
                cv = LeaveOneOut(len(train))
            else:
                cv = args.n_folds
            estimator = GridWithCoef(clf, param_grid, cv=cv)
            estimator.fit(data.iloc[train, features], target.iloc[train])

            if coef_ is None:
                coef_ = safe_sqr(estimator.coef_)
            else:
                coef_ += safe_sqr(estimator.coef_)

            test_scores.append(
                estimator.score(data.iloc[test, features], target.iloc[test]))

        if coef_.ndim > 1:
            ranks = np.argsort(coef_.sum(axis=0))
        else:
            ranks = np.argsort(coef_)

        # for the sparse case, ranks is a matrix
        ranks = np.ravel(ranks)

        # Eliminate the worst features
        threshold = min(step, np.sum(support_) - n_features_to_select)
        support_[features[ranks][:threshold]] = False
        ranking_[np.logical_not(support_)] += 1

        result['selections'].append({
            'scores': test_scores,
            'n_features': np.sum(support_),
            'features': feature_names[support_].tolist()
        })

        with open(result_file, 'w') as f:
            json.dump(result, f, sort_keys=True, indent=2,
                      separators=(',', ': '))

    if args.verbose:
        print('# OK')
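# NOTE: `GridWithCoef` is not defined in this extract. A plausible sketch,
# assuming it is a thin wrapper around GridSearchCV that exposes the refitted
# best estimator's `coef_`, so the elimination loops above and below can rank
# features after the grid search; the import path matches the pre-0.18
# scikit-learn API used by the rest of these scripts.
from sklearn.grid_search import GridSearchCV

class GridWithCoef(GridSearchCV):
    @property
    def coef_(self):
        # Available after fit(); GridSearchCV keeps the refitted best model.
        return self.best_estimator_.coef_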
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--results-path', default='./bucket/results/')
    parser.add_argument('--data')
    parser.add_argument('--target')
    parser.add_argument('--tissue',
                        type=lambda x: re.sub(r'[\"\']', '', x) if x is not None else None)
    parser.add_argument('--data-path', default='./bucket/data/')
    parser.add_argument('--verbose', '-v', action='count')
    parser.add_argument('--test-size', type=float, default=0.1)
    parser.add_argument('--n-iter', type=int, default=1)
    parser.add_argument('--n-folds', default=10, type=n_folds_parser)
    parser.add_argument('--clf')
    parser.add_argument('--filter', default='anova')
    args = parser.parse_args()

    result = {}
    result.update(args.__dict__)

    start_time = datetime.now().strftime("%d/%m/%y %H:%M:%S")
    result['start_time'] = start_time

    if args.verbose:
        for key in result:
            print('# {}: {}'.format(key, result[key]))
        print('# Running in: ' + gethostname())
        print('# Start: ' + start_time)
        sys.stdout.flush()

    load_params = {}
    if args.data == 'epi_ad':
        load_params = {'read_original': True, 'skip_pickle': True}
    data, factors = load(args.data, data_path=args.data_path, log=result, **load_params)

    if args.tissue:
        data = data[factors['source tissue'] == args.tissue]
        factors = factors[factors['source tissue'] == args.tissue]

    target = factors[args.target]
    target_num = LabelEncoder().fit_transform(target)

    clf, param_grid = choose_classifier(args.clf, result, args.verbose)

    # Select the univariate scoring function (and optional discretization)
    score_params = {}
    preprocessor = None
    if args.filter == 'anova':
        score_features = anova
    elif args.filter == 'infogain_10':
        score_features = relevance
        score_params = {'bins': 10}
    elif args.filter == 'infogain_exp':
        preprocessor = ExpressionDiscretizer()
        score_features = relevance
        score_params = {'bins': 3}
    elif args.filter == 'chi2':
        preprocessor = ExpressionDiscretizer()
        score_features = chi_squared
    else:
        raise ValueError('Filter {} unknown.'.format(args.filter))

    experiment_id = hash(json.dumps(result) + str(np.random.rand(10, 1)))
    result_file = join(args.results_path,
                       '{}_{}.json'.format(args.filter, experiment_id))
    if args.verbose:
        print('Results will be saved to {}'.format(result_file))
        sys.stdout.flush()

    split = StratifiedShuffleSplit(target, n_iter=args.n_iter, test_size=args.test_size)

    n_features = data.shape[1]
    n_features_to_select = 9

    d0 = datetime.now()
    result['experiments'] = []
    for i, (train, test) in enumerate(split):
        if args.verbose:
            print('### ITERATION {}'.format(i))

        if preprocessor:
            preprocessor.fit(data.iloc[train, :])
            train_data = preprocessor.transform(data.iloc[train, :])
            test_data = preprocessor.transform(data.iloc[test, :])
        else:
            train_data = data.iloc[train, :]
            test_data = data.iloc[test, :]

        scores_ = score_features(train_data, target_num[train], **score_params)

        result['experiments'].append({
            'iteration': i,
            'train_samples_label': data.index[train].tolist(),
            'train_samples_idx': train.tolist(),
            'scores': scores_.tolist()
        })

        if args.verbose:
            print('[{}] Features scored.'.format(datetime.now() - d0))
            sys.stdout.flush()

        result['experiments'][-1]['subsets'] = []
        current_size = n_features
        sorted_features = np.argsort(scores_)

        for step in subset_sizes(n_features, n_features_to_select):
            if args.verbose:
                print('[{}] Fitting with {} features.'.format(
                    datetime.now() - d0, current_size))
                sys.stdout.flush()

            # Keep the `current_size` best-scored features
            features = sorted_features[-current_size:]

            grid = GridWithCoef(clf, param_grid, cv=args.n_folds)
            grid.fit(train_data.iloc[:, features], target.iloc[train])

            # Save results for current set of features
            result['experiments'][-1]['subsets'].append({
                'n_features': current_size,
                'features': data.columns[features].tolist(),
                'best_params': grid.best_params_,
                'train': {
                    'y_true': target.iloc[train].tolist(),
                    'y_pred': grid.predict(train_data.iloc[:, features]).tolist()
                },
                'test': {
                    'y_true': target.iloc[test].tolist(),
                    'y_pred': grid.predict(test_data.iloc[:, features]).tolist()
                }
            })

            # Store results
            with open(result_file, 'w') as f:
                json.dump(result, f, sort_keys=True, indent=2,
                          separators=(',', ': '))

            current_size -= step

    if args.verbose:
        print('# OK')
        sys.stdout.flush()
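# NOTE: `anova` and `chi_squared` are not defined in this extract. Minimal
# sketches, assuming they wrap scikit-learn's univariate tests and return one
# score per feature; the discretization-based `relevance` scorer and
# `ExpressionDiscretizer` are project-specific and not sketched here.
from sklearn.feature_selection import f_classif, chi2

def anova(data, target):
    scores, _pvalues = f_classif(data, target)
    return scores

def chi_squared(data, target):
    scores, _pvalues = chi2(data, target)
    return scores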