import numpy as np
from pprint import pprint

# NOTE: these module paths match the pre-0.18 scikit-learn API that the
# StratifiedShuffleSplit(y, ...) call below assumes.
from sklearn.cross_validation import StratifiedShuffleSplit, cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# Project-local helpers used below (import paths not shown in this file):
# data.load_mdd_data, SelectStd, Timer, save_experiment


def main(argv):
    # rma, drug, stress = load_data()
    rma, label = data.load_mdd_data()

    # Hyper-parameter grids for the three supported classifiers.
    alpha_range = np.logspace(-2, 7, 10)
    l1_ratio_range = np.arange(0., 1., 0.1)
    en_param_grid = dict(alpha=alpha_range, l1_ratio=l1_ratio_range)

    c_range = np.logspace(-2, 7, 10)
    gamma_range = np.logspace(-6, 3, 10)
    svm_param_grid = dict(gamma=gamma_range, C=c_range)
    logit_param_grid = dict(C=c_range)

    # Positional command-line arguments (argv[6] is the results folder,
    # consumed by save_experiment at the end).
    test_size = float(argv[1])
    n_iter = int(argv[2])
    n_folds = int(argv[3])
    target = argv[4]
    classifier = argv[5]
    pca_components = int(argv[7])

    log = {
        'target': target,
        'std_select': {'n_feat': pca_components},
        'split': {
            'type': 'StratifiedShuffleSplit',
            'n_iter': n_iter,
            'test_size': test_size
        },
        'cross_val': {'n_folds': n_folds},
        'classifier': classifier
    }

    # if target == 'drug':
    #     target = drug
    # else:
    #     target = stress

    # pca = PCA(n_components=pca_components)
    pca = SelectStd(n_features=pca_components)

    if classifier == 'svm':
        clf = SVC()
        param_grid = svm_param_grid
        grid_search = True
    elif classifier == 'en':
        clf = SGDClassifier(loss='log', penalty='elasticnet', n_jobs=1)
        param_grid = en_param_grid
        grid_search = True
    elif classifier == 'logit':
        clf = LogisticRegression()
        param_grid = logit_param_grid
        grid_search = True
    else:
        # Fail early; otherwise clf and grid_search would be undefined below.
        raise ValueError('{} is not a valid classifier.'.format(classifier))

    timer = Timer()
    print('\nStarting... ' + ' '.join(argv))
    pprint(log)

    # Outer loop: repeated stratified train/test splits; inner loop:
    # grid search over the classifier's hyper-parameters.
    split = StratifiedShuffleSplit(label[target], n_iter=n_iter,
                                   test_size=test_size)
    if grid_search:
        clf = GridSearchCV(clf, param_grid=param_grid, cv=n_folds, n_jobs=1)
    pipeline = Pipeline([('pca', pca), ('clf', clf)])
    accuracy = cross_val_score(pipeline, rma, y=label[target],
                               scoring='accuracy', cv=split,
                               n_jobs=n_iter, verbose=1)

    print('\n{}: Accuracy: {:.2%} +/- {:.2%}'.format(
        timer.elapsed(), np.nanmean(accuracy), np.nanstd(accuracy)))

    log['results'] = {
        'accuracy': {
            'scores': accuracy.tolist(),
            'mean': accuracy.mean(),
            'std': accuracy.std()
        }
    }
    log['time'] = timer.elapsed()

    # results = [dict(log, accuracy=acc) for acc in accuracy]
    # log_results(results)
    # save_results(results, folder=argv[6], filename='results_new.json')
    save_experiment(log, folder=argv[6])
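# Sketch (an assumption, not the project's actual implementation): SelectStd
# is used as the first Pipeline step above, so it must follow the
# scikit-learn transformer API. A minimal stand-in that keeps the n_features
# columns with the highest standard deviation could look like this.
from sklearn.base import BaseEstimator, TransformerMixin


class SelectStdSketch(BaseEstimator, TransformerMixin):
    def __init__(self, n_features=100):
        self.n_features = n_features

    def fit(self, X, y=None):
        # Rank features by standard deviation and remember the top indices.
        std = np.asarray(X).std(axis=0)
        self.support_ = np.argsort(std)[::-1][:self.n_features]
        return self

    def transform(self, X):
        # Keep only the selected high-variance columns.
        return np.asarray(X)[:, self.support_]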
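# Example invocation (sketch; the script name 'run_mdd.py' is hypothetical).
# Positional arguments, in the order main() reads them:
#   argv[1]=test_size  argv[2]=n_iter   argv[3]=n_folds  argv[4]=target
#   argv[5]=classifier argv[6]=results folder  argv[7]=n selected features
#
#   python run_mdd.py 0.2 10 5 drug svm results/ 100
if __name__ == '__main__':
    import sys
    main(sys.argv)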
print(result)

# Load the requested dataset; record the error and bail out otherwise
# (without this exit, betas and factors would be undefined below).
if args.data == 'epi_ad':
    betas, factors = load_epi_ad_data(log=result, verbose=args.verbose)
elif args.data == 'mdd':
    betas, factors = load_mdd_data(log=result, verbose=args.verbose)
else:
    result['error'] = '{} is not a valid dataset.'.format(args.data)
    if args.verbose:
        print('{} is not a valid dataset.'.format(args.data))
    save_experiment(result, folder=args.results_path, filename=None,
                    error=True, verbose=args.verbose)
    sys.exit()

# Optionally restrict the samples to a single source tissue.
if args.tissue is not None:
    condition = factors['source tissue'] == args.tissue
    if condition.sum() == 0:
        result['error'] = '{} is not a valid tissue.'.format(args.tissue)
        save_experiment(result, folder=args.results_path, filename=None,
                        error=True, verbose=args.verbose)
        sys.exit()
    betas = betas[condition]
    factors = factors[condition]

target = factors[args.target]
split = StratifiedShuffleSplit(target, n_iter=args.n_iter,
                               test_size=args.test_size)
result['split'] = {
    'type': 'StratifiedShuffleSplit',
    'n_iter': args.n_iter,
    'test_size': args.test_size
}
result['cross_val'] = {'n_folds': args.n_folds}
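# Sketch (not in the original source): an argparse parser consistent with the
# attributes this block reads. Flag names and defaults are assumptions
# inferred from the args.* accesses above.
import argparse


def parse_args(argv=None):
    parser = argparse.ArgumentParser(
        description='Cross-validated classification experiment.')
    parser.add_argument('--data', choices=['epi_ad', 'mdd'], required=True)
    parser.add_argument('--tissue', default=None)
    parser.add_argument('--target', required=True)
    parser.add_argument('--n-iter', dest='n_iter', type=int, default=10)
    parser.add_argument('--n-folds', dest='n_folds', type=int, default=5)
    parser.add_argument('--test-size', dest='test_size', type=float,
                        default=0.2)
    parser.add_argument('--results-path', dest='results_path')
    parser.add_argument('--verbose', action='store_true')
    return parser.parse_args(argv)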