def get_all_metrics(results, learn_options_set=None, test_metrics=['spearmanr'], add_extras=False, force_by_gene=False):
    """
    'metrics' here are the metrics used to evaluate
    """
    all_results = dict([(k, {}) for k in results.keys()])
    genes = results[results.keys()[0]][1][0][0].keys()

    for metric in test_metrics:
        for method in all_results.keys():
            all_results[method][metric] = []

    non_binary_target_name = check_learn_options_set(learn_options_set)

    for method in results.keys():
        truth, predictions = results[method][1][0]
        test_indices = results[method][-1]
        tmp_genes = results[method][1][0][0].keys()
        # guard against the gene set changing between methods
        if len(tmp_genes) != len(genes) or np.any(np.array(tmp_genes) != np.array(genes)):
            raise Exception("genes have changed, need to modify code")
        all_truth_raw, all_truth_thrs, all_predictions = np.array([]), np.array([]), np.array([])

        fpr_gene = {}
        tpr_gene = {}

        y_truth_thresh_all = np.array([])
        y_pred_all = np.array([])

        for gene in genes:
            y_truth, y_pred = truth[gene], predictions[gene]
            all_truth_raw = np.append(all_truth_raw, y_truth[non_binary_target_name])
            all_truth_thrs = np.append(all_truth_thrs, y_truth['thrs'])
            all_predictions = np.append(all_predictions, y_pred)

            y_truth_thresh_all = np.append(y_truth_thresh_all, y_truth['thrs'])
            y_pred_all = np.append(y_pred_all, y_pred)

            if 'spearmanr' in test_metrics:
                spearmanr = util.spearmanr_nonan(y_truth[non_binary_target_name], y_pred)[0]
                all_results[method]['spearmanr'].append(spearmanr)

            if 'spearmanr>2.5' in test_metrics:
                selected = y_truth[non_binary_target_name] > 1.0
                # note: despite the variable name, this currently reports RMSE on the selected subset
                # spearmanr = sp.stats.spearmanr(y_truth[non_binary_target_name][selected], y_pred[selected])[0]
                spearmanr = np.sqrt(np.mean((y_truth[non_binary_target_name][selected] - y_pred[selected]) ** 2))
                all_results[method]['spearmanr>2.5'].append(spearmanr)

            if 'RMSE' in test_metrics:
                rmse = np.sqrt(np.mean((y_truth[non_binary_target_name] - y_pred) ** 2))
                all_results[method]['RMSE'].append(rmse)

            if 'NDCG@5' in test_metrics:
                ndcg = ranking_metrics.ndcg_at_k_ties(y_truth[non_binary_target_name], y_pred, 5)
                all_results[method]['NDCG@5'].append(ndcg)

            if 'NDCG@10' in test_metrics:
                ndcg = ranking_metrics.ndcg_at_k_ties(y_truth[non_binary_target_name], y_pred, 10)
                all_results[method]['NDCG@10'].append(ndcg)

            if 'NDCG@20' in test_metrics:
                ndcg = ranking_metrics.ndcg_at_k_ties(y_truth[non_binary_target_name], y_pred, 20)
                all_results[method]['NDCG@20'].append(ndcg)

            if 'NDCG@50' in test_metrics:
                ndcg = ranking_metrics.ndcg_at_k_ties(y_truth[non_binary_target_name], y_pred, 50)
                all_results[method]['NDCG@50'].append(ndcg)

            if 'precision@5' in test_metrics:
                # binarize: 1 for guides in the top 5 by truth/prediction, 0 otherwise
                y_top_truth = (y_truth[non_binary_target_name] >= np.sort(y_truth[non_binary_target_name])[::-1][:5][-1]) * 1
                y_top_pred = (y_pred >= np.sort(y_pred)[::-1][:5][-1]) * 1
                all_results[method]['precision@5'].append(sklearn.metrics.precision_score(y_top_truth, y_top_pred))

            if 'precision@10' in test_metrics:
                y_top_truth = (y_truth[non_binary_target_name] >= np.sort(y_truth[non_binary_target_name])[::-1][:10][-1]) * 1
                y_top_pred = (y_pred >= np.sort(y_pred)[::-1][:10][-1]) * 1
                all_results[method]['precision@10'].append(sklearn.metrics.precision_score(y_top_truth, y_top_pred))

            if 'precision@20' in test_metrics:
                y_top_truth = (y_truth[non_binary_target_name] >= np.sort(y_truth[non_binary_target_name])[::-1][:20][-1]) * 1
                y_top_pred = (y_pred >= np.sort(y_pred)[::-1][:20][-1]) * 1
                all_results[method]['precision@20'].append(sklearn.metrics.precision_score(y_top_truth, y_top_pred))

            if 'AUC' in test_metrics:
                fpr_gene[gene], tpr_gene[gene], _ = sklearn.metrics.roc_curve(y_truth['thrs'], y_pred)
                auc = sklearn.metrics.auc(fpr_gene[gene], tpr_gene[gene])
                all_results[method]['AUC'].append(auc)

    if add_extras:
        fpr_all, tpr_all, _ = sklearn.metrics.roc_curve(y_truth_thresh_all, y_pred_all)
        return all_results, genes, fpr_all, tpr_all, fpr_gene, tpr_gene
    else:
        return all_results, genes
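
# --- Illustrative usage sketch; not part of the original module. ---
# get_all_metrics consumes the `results` dict that model-comparison drivers
# assemble from cross_validate's return value: results[method][1][0] is the
# (truth, predictions) pair, keyed by gene. The gene names, the method name,
# and the 'score_drug_gene_rank' target key below are assumptions for
# illustration only (whatever check_learn_options_set returns must be a key
# of each truth[gene] dict).
def _demo_get_all_metrics():
    np.random.seed(0)
    genes = ['geneA', 'geneB']  # hypothetical gene names
    truth, predictions = {}, {}
    for gene in genes:
        n = 50
        scores = np.random.rand(n)
        truth[gene] = {'raw': scores,
                       'ranks': scores.argsort().argsort(),
                       'thrs': (scores > 0.5) * 1,
                       'score_drug_gene_rank': scores}  # assumed non-binary target key
        predictions[gene] = scores + 0.1 * np.random.randn(n)
    # mimic cross_validate's output nesting: (metrics, gene_pred, fold_labels, test_indices)
    results = {'linreg': (None, [(truth, predictions)], genes, None)}
    all_results, genes_out = get_all_metrics(results, learn_options_set=None,
                                             test_metrics=['spearmanr', 'RMSE', 'AUC'])
    return all_results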
def extract_spearman_for_fold(metrics, fold, i, predictions, truth, y_ground_truth, test, y_pred, learn_options):
    spearman = util.spearmanr_nonan(y_ground_truth[test].flatten(), y_pred.flatten())[0]
    assert not np.isnan(spearman), "found nan spearman"
    metrics.append(spearman)
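
# --- Minimal sketch; not part of the original module. ---
# extract_spearman_for_fold mutates `metrics` in place, appending one Spearman
# correlation per fold; the fold/prediction bookkeeping arguments it ignores
# can be None. The synthetic data here is purely illustrative.
def _demo_extract_spearman():
    np.random.seed(0)
    metrics = []
    y_ground_truth = np.random.rand(100, 1)
    test = np.arange(50)  # indices of the held-out fold
    y_pred = y_ground_truth[test] + 0.1 * np.random.randn(50, 1)
    extract_spearman_for_fold(metrics, 'fold1', 0, None, None,
                              y_ground_truth, test, y_pred, None)
    return metrics[-1]  # the fold's Spearman r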
def cross_validate(y_all, feature_sets, learn_options=None, TEST=False, train_genes=None, CV=True):
    """
    feature_sets is a dictionary of "set name" to pandas.DataFrame;
    e.g., one set might be single-nucleotide, position-independent features of order X.
    Method: "GPy" or "linreg"
    Metric: NDCG (learning-to-rank metric, Normalized Discounted Cumulative Gain); AUC
    Output: cv_score_median, gene_rocs
    """
    allowed_methods = ["GPy", "linreg", "AdaBoostRegressor", "DecisionTreeRegressor",
                       "RandomForestRegressor", "ARDRegression", "GPy_fs", "mean", "random",
                       "DNN", "lasso_ensemble", "doench", "logregL1", "sgrna_from_doench"]
    assert learn_options["method"] in allowed_methods, "invalid method: %s" % learn_options["method"]
    assert (learn_options["method"] == "linreg" and learn_options["penalty"] == "L2") or \
        learn_options["weighted"] is None, "weighted only works with linreg L2 right now"

    # construct filename from options
    filename = construct_filename(learn_options, TEST)

    print "Cross-validating genes..."
    t2 = time.time()

    y = np.array(y_all[learn_options["target_name"]].values[:, None], dtype=np.float64)

    # concatenate feature sets into one nparray, and get the dimension of each
    inputs, dim, dimsum, feature_names = util.concatenate_feature_sets(feature_sets)

    if not CV:
        assert learn_options["cv"] == "gene", \
            "Must use gene-CV when CV is False (I need to use all of the genes, and stratified complicates that)"

    # set-up for cross-validation
    # the outer loop, the one Doench et al. use genes for
    if learn_options["cv"] == "stratified":
        assert not learn_options["extra pairs"], \
            "can't use extra pairs with stratified CV; need to figure out how to properly account for genes affected by two drugs"
        label_encoder = sklearn.preprocessing.LabelEncoder()
        label_encoder.fit(y_all["Target gene"].values)
        gene_classes = label_encoder.transform(y_all["Target gene"].values)
        if learn_options["train_genes"] is not None and learn_options["test_genes"] is not None:
            n_folds = len(learn_options["test_genes"])
        else:
            n_folds = len(learn_options["all_genes"])
        cv = sklearn.cross_validation.StratifiedKFold(gene_classes, n_folds=n_folds, shuffle=True, indices=True)
        fold_labels = ["fold%d" % i for i in range(1, n_folds + 1)]
        if learn_options["num_genes_remove_train"] is not None:
            raise NotImplementedError()
    elif learn_options["cv"] == "gene":
        cv = []
        if not CV:
            train_test_tmp = get_train_test("dummy", y_all)  # get train/test split using a dummy gene
            train_tmp, test_tmp = train_test_tmp
            # not a typo: we use the training set to test on as well, just for this case;
            # the test set is not used for internal cross-val, etc. anyway
            train_test_tmp = (train_tmp, train_tmp)
            cv.append(train_test_tmp)
            fold_labels = learn_options["all_genes"]
        elif learn_options["train_genes"] is not None and learn_options["test_genes"] is not None:
            for i, gene in enumerate(learn_options["test_genes"]):
                cv.append(get_train_test(gene, y_all, learn_options["train_genes"]))
            fold_labels = learn_options["test_genes"]
            # if train and test genes are separate, there should be only one fold
            train_test_disjoint = set.isdisjoint(set(learn_options["train_genes"].tolist()),
                                                 set(learn_options["test_genes"].tolist()))
        else:
            for i, gene in enumerate(learn_options["all_genes"]):
                train_test_tmp = get_train_test(gene, y_all)
                cv.append(train_test_tmp)
            fold_labels = learn_options["all_genes"]

        if learn_options["num_genes_remove_train"] is not None:
            for i, (train, test) in enumerate(cv):
                unique_genes = np.random.permutation(np.unique(y_all["Target gene"][train]))
                genes_to_keep = unique_genes[0:len(unique_genes) - learn_options["num_genes_remove_train"]]
                filtered_train = []
                for j, gene in enumerate(y_all["Target gene"]):
                    if j in train and gene in genes_to_keep:
                        filtered_train.append(j)
                cv_i_orig = copy.deepcopy(cv[i])
                cv[i] = (filtered_train, test)
                if learn_options["num_genes_remove_train"] == 0:
                    assert np.all(cv_i_orig[0] == cv[i][0])
                    assert np.all(cv_i_orig[1] == cv[i][1])
                print "# train/train after/before is %s, %s" % (len(cv[i][0]), len(cv_i_orig[0]))
                print "# test/test after/before is %s, %s" % (len(cv[i][1]), len(cv_i_orig[1]))
    else:
        raise Exception("invalid cv options given: %s" % learn_options["cv"])

    cv = [c for c in cv]  # make a list from the generator, so it can be subset for the TEST case

    if TEST:
        ind_to_use = [0]  # [0, 1]
        cv = [cv[i] for i in ind_to_use]
        fold_labels = [fold_labels[i] for i in ind_to_use]

    truth = dict([(t, dict([(m, np.array([])) for m in ["raw", "ranks", "thrs"]])) for t in fold_labels])
    predictions = dict([(t, np.array([])) for t in fold_labels])

    m = {}
    metrics = []

    # do the cross-validation
    num_proc = learn_options["num_proc"]
    if num_proc > 1:
        num_proc = np.min([num_proc, len(cv)])
        print "using multiprocessing with %d procs--one for each fold" % num_proc
        jobs = []
        pool = multiprocessing.Pool(processes=num_proc)
        for i, fold in enumerate(cv):
            train, test = fold
            print "working on fold %d of %d, with %d train and %d test" % (i, len(cv), len(train), len(test))
            if learn_options["method"] == "GPy":
                job = pool.apply_async(models.GP.gp_on_fold,
                                       args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options))
            elif learn_options["method"] == "linreg":
                job = pool.apply_async(models.regression.linreg_on_fold,
                                       args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options))
            elif learn_options["method"] == "logregL1":
                job = pool.apply_async(models.regression.logreg_on_fold,
                                       args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options))
            elif learn_options["method"] == "AdaBoostRegressor":
                job = pool.apply_async(models.ensembles.adaboost_on_fold,
                                       args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options))
            elif learn_options["method"] == "DecisionTreeRegressor":
                job = pool.apply_async(models.ensembles.decisiontree_on_fold,
                                       args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options))
            elif learn_options["method"] == "RandomForestRegressor":
                job = pool.apply_async(models.ensembles.randomforest_on_fold,
                                       args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options))
            elif learn_options["method"] == "ARDRegression":
                job = pool.apply_async(models.regression.ARDRegression_on_fold,
                                       args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options))
            elif learn_options["method"] == "random":
                job = pool.apply_async(models.baselines.random_on_fold,
                                       args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options))
            elif learn_options["method"] == "mean":
                job = pool.apply_async(models.baselines.mean_on_fold,
                                       args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options))
            elif learn_options["method"] == "DNN":
                job = pool.apply_async(models.DNN.DNN_on_fold,
                                       args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options))
            elif learn_options["method"] == "lasso_ensemble":
                job = pool.apply_async(models.ensembles.LASSOs_ensemble_on_fold,
                                       args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options))
            elif learn_options["method"] == "doench":
                job = pool.apply_async(models.baselines.doench_on_fold,
                                       args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options))
            elif learn_options["method"] == "sgrna_from_doench":
                job = pool.apply_async(models.baselines.sgrna_from_doench_on_fold,
                                       args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options))
            else:
                raise Exception("did not find method=%s" % learn_options["method"])
            jobs.append(job)
        pool.close()
        pool.join()

        for i, fold in enumerate(cv):
            y_pred, m[i] = jobs[i].get()
            train, test = fold

            if learn_options["training_metric"] == "AUC":
                extract_fpr_tpr_for_fold(metrics, fold_labels[i], i, predictions, truth,
                                         y_all[learn_options["ground_truth_label"]].values, test, y_pred)
            elif learn_options["training_metric"] == "NDCG":
                extract_NDCG_for_fold(metrics, fold_labels[i], i, predictions, truth,
                                      y_all[learn_options["ground_truth_label"]].values, test, y_pred, learn_options)
            elif learn_options["training_metric"] == "spearmanr":
                extract_spearman_for_fold(metrics, fold_labels[i], i, predictions, truth,
                                          y_all[learn_options["ground_truth_label"]].values, test, y_pred, learn_options)
            else:
                raise Exception("invalid 'training_metric' in learn_options: %s" % learn_options["training_metric"])

            truth, predictions = fill_in_truth_and_predictions(truth, predictions, fold_labels[i],
                                                               y_all, y_pred, learn_options, test)
        pool.terminate()
    else:  # non-parallel version
        for i, fold in enumerate(cv):
            train, test = fold
            if learn_options["method"] == "GPy":
                y_pred, m[i] = models.GP.gp_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)
            elif learn_options["method"] == "linreg":
                y_pred, m[i] = models.regression.linreg_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)
            elif learn_options["method"] == "logregL1":
                y_pred, m[i] = models.regression.logreg_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)
            elif learn_options["method"] == "AdaBoostRegressor":
                y_pred, m[i] = models.ensembles.adaboost_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)
            elif learn_options["method"] == "DecisionTreeRegressor":
                y_pred, m[i] = models.ensembles.decisiontree_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)
            elif learn_options["method"] == "RandomForestRegressor":
                y_pred, m[i] = models.ensembles.randomforest_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)
            elif learn_options["method"] == "ARDRegression":
                y_pred, m[i] = models.regression.ARDRegression_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)
            elif learn_options["method"] == "GPy_fs":
                y_pred, m[i] = models.GP.gp_with_fs_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)
            elif learn_options["method"] == "random":
                y_pred, m[i] = models.baselines.random_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)
            elif learn_options["method"] == "mean":
                y_pred, m[i] = models.baselines.mean_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)
            elif learn_options["method"] == "DNN":
                y_pred, m[i] = models.DNN.DNN_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)
            elif learn_options["method"] == "lasso_ensemble":
                y_pred, m[i] = models.ensembles.LASSOs_ensemble_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)
            elif learn_options["method"] == "doench":
                y_pred, m[i] = models.baselines.doench_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)
            elif learn_options["method"] == "sgrna_from_doench":
                y_pred, m[i] = models.baselines.sgrna_from_doench_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)
            else:
                raise Exception("invalid method found: %s" % learn_options["method"])

            if learn_options["training_metric"] == "AUC":
                # fills in truth and predictions
                extract_fpr_tpr_for_fold(metrics, fold_labels[i], i, predictions, truth,
                                         y_all[learn_options["ground_truth_label"]].values, test, y_pred)
            elif learn_options["training_metric"] == "NDCG":
                extract_NDCG_for_fold(metrics, fold_labels[i], i, predictions, truth,
                                      y_all[learn_options["ground_truth_label"]].values, test, y_pred, learn_options)
            elif learn_options["training_metric"] == "spearmanr":
                extract_spearman_for_fold(metrics, fold_labels[i], i, predictions, truth,
                                          y_all[learn_options["ground_truth_label"]].values, test, y_pred, learn_options)

            truth, predictions = fill_in_truth_and_predictions(truth, predictions, fold_labels[i],
                                                               y_all, y_pred, learn_options, test)

            print "\t\tRMSE: ", np.sqrt(((y_pred - y[test]) ** 2).mean())
            print "\t\tSpearman correlation: ", util.spearmanr_nonan(y[test], y_pred)[0]
            print "\t\tfinished fold/gene %i of %i" % (i + 1, len(fold_labels))

    cv_median_metric = [np.median(metrics)]
    gene_pred = [(truth, predictions)]
    print "\t\tmedian %s across gene folds: %.3f" % (learn_options["training_metric"], cv_median_metric[-1])

    t3 = time.time()
    print "\t\tElapsed time for cv is %.2f seconds" % (t3 - t2)
    return metrics, gene_pred, fold_labels, m, dimsum, filename, feature_names
def linreg_on_fold(feature_sets, train, test, y, y_all, X, dim, dimsum, learn_options):
    '''
    linreg using scikit-learn: standard regression models with penalization,
    requiring nested cross-validation
    '''
    if learn_options["weighted"] is not None and (learn_options["penalty"] != "L2" or learn_options["method"] != "linreg"):
        raise NotImplementedError("weighted prediction not implemented for any methods but L2 at the moment")

    cv, n_folds = set_up_folds(learn_options, y_all.iloc[train])

    if learn_options['penalty'] == "L1":
        l1_ratio = [1.0]
    elif learn_options['penalty'] == "L2":
        l1_ratio = [0.0]
    elif learn_options['penalty'] == "EN":  # elastic net
        l1_ratio = np.linspace(0.0, 1.0, 20)

    performance = np.zeros((len(learn_options["alpha"]), len(l1_ratio)))
    degenerate_pred = np.zeros((len(learn_options["alpha"])))
    for train_inner, test_inner in cv:
        for i, alpha in enumerate(learn_options["alpha"]):
            for j, l1r in enumerate(l1_ratio):
                clf = train_linreg_model(alpha, l1r, learn_options, train_inner, X[train], y[train], y_all.iloc[train])
                if learn_options["feature_select"]:
                    clf, tmp_pred = feature_select(clf, learn_options, test_inner, train_inner, X[train], y[train])
                else:
                    tmp_pred = clf.predict(X[train][test_inner])

                if learn_options["training_metric"] == "AUC":
                    fpr, tpr, _ = roc_curve(y_all[learn_options["ground_truth_label"]][train][test_inner], tmp_pred)
                    assert ~np.any(np.isnan(fpr)), "found nan fpr"
                    assert ~np.any(np.isnan(tpr)), "found nan tpr"
                    tmp_auc = auc(fpr, tpr)
                    performance[i, j] += tmp_auc
                elif learn_options['training_metric'] == 'spearmanr':
                    spearman = util.spearmanr_nonan(y_all[learn_options['ground_truth_label']][train][test_inner],
                                                    tmp_pred.flatten())[0]
                    performance[i, j] += spearman
                elif learn_options['training_metric'] == 'score':
                    # inner-fold indices are relative to the training subset, so index into X[train]
                    performance[i, j] += clf.score(X[train][test_inner],
                                                   y_all[learn_options['ground_truth_label']][train][test_inner])
                elif learn_options["training_metric"] == "NDCG":
                    assert "thresh" not in learn_options["ground_truth_label"], \
                        "for NDCG must not use thresholded ranks, but pure ranks"
                    # sorted = tmp_pred[np.argsort(y_all[ground_truth_label].values[test_inner])[::-1]].flatten()
                    # sortedgt = np.sort(y_all[ground_truth_label].values[test_inner])[::-1].flatten()
                    # tmp_perf = ranking_metrics.ndcg_at_k_ties(sorted, learn_options["NDGC_k"], sortedgt)
                    tmp_truth = y_all[learn_options["ground_truth_label"]].values[train][test_inner].flatten()
                    tmp_perf = ranking_metrics.ndcg_at_k_ties(tmp_truth, tmp_pred.flatten(), learn_options["NDGC_k"])
                    performance[i, j] += tmp_perf

                    degenerate_pred_tmp = len(np.unique(tmp_pred)) < len(tmp_pred) / 2.0
                    degenerate_pred[i] += degenerate_pred_tmp

                    # sanity checking the metric wrt ties, etc.
                    # rmse = np.sqrt(np.mean((tmp_pred - tmp_truth)**2))
                    tmp_pred_r, tmp_truth_r = ranking_metrics.rank_data(tmp_pred, tmp_truth)
                    # rmse_r = np.sqrt(np.mean((tmp_pred_r - tmp_truth_r)**2))

    performance /= n_folds

    max_score_ind = np.where(performance == np.nanmax(performance))
    assert max_score_ind != len(performance), "enlarge alpha range as hitting max boundary"
    # assert degenerate_pred[max_score_ind[0][0]] == 0, "found degenerate predictions at max score"

    # in the unlikely event of tied scores, take the first one
    if len(max_score_ind[0]) > 1:
        max_score_ind = [max_score_ind[0][0], max_score_ind[1][0]]

    best_alpha, best_l1r = learn_options["alpha"][max_score_ind[0]], l1_ratio[max_score_ind[1]]

    print "\t\tbest alpha is %f from range=%s" % (best_alpha, learn_options["alpha"][[0, -1]])
    if learn_options['penalty'] == "EN":
        print "\t\tbest l1_ratio is %f from range=%s" % (best_l1r, l1_ratio[[0, -1]])
    max_perf = np.nanmax(performance)

    if max_perf < 0.0:
        raise Exception("performance is negative")

    print "\t\tbest performance is %f" % max_perf

    # refit on the full outer-training set with the selected hyperparameters
    clf = train_linreg_model(best_alpha, best_l1r, learn_options, train, X, y, y_all)
    if learn_options["feature_select"]:
        raise Exception("untested in a long time, should double check")
        clf, y_pred = feature_select(clf, learn_options, test, train, X, y)
    else:
        y_pred = clf.predict(X[test])

    if learn_options["penalty"] != "L2":
        y_pred = y_pred[:, None]

    return y_pred, clf
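
# --- Hedged hyperparameter sketch; not part of the original module. ---
# The grid linreg_on_fold sweeps: 'alpha' is supplied by the caller, while
# l1_ratio is derived from 'penalty' (L1 -> [1.0], L2 -> [0.0], EN -> 20
# points in [0, 1]). The values and the rank column name are illustrative.
linreg_learn_options_fragment = {
    'method': 'linreg',
    'penalty': 'EN',                        # L1, L2, or elastic net
    'alpha': np.logspace(-4, 2, 10),        # regularization strengths to grid over
    'feature_select': False,
    'weighted': None,
    'training_metric': 'spearmanr',
    'ground_truth_label': 'score_drug_gene_rank',  # assumed column name
    'NDGC_k': 10,                           # only read when training_metric == 'NDCG'
}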
def cross_validate(y_all, feature_sets, learn_options=None, TEST=False, train_genes=None, CV=True):
    '''
    feature_sets is a dictionary of "set name" to pandas.DataFrame;
    e.g., one set might be single-nucleotide, position-independent features of order X.
    Method: "GPy" or "linreg"
    Metric: NDCG (learning-to-rank metric, Normalized Discounted Cumulative Gain); AUC
    Output: cv_score_median, gene_rocs
    When CV=False, it trains on everything (and tests on everything, just to fit the code)
    '''
    print "range of y_all is [%f, %f]" % (np.min(y_all[learn_options['target_name']].values),
                                          np.max(y_all[learn_options['target_name']].values))

    allowed_methods = ["GPy", "linreg", "AdaBoostRegressor", "AdaBoostClassifier",
                       "DecisionTreeRegressor", "RandomForestRegressor", "ARDRegression",
                       "GPy_fs", "mean", "random", "DNN", "lasso_ensemble", "doench",
                       "logregL1", "sgrna_from_doench", "SVC", "xu_et_al"]
    assert learn_options["method"] in allowed_methods, "invalid method: %s" % learn_options["method"]
    assert (learn_options["method"] == "linreg" and learn_options["penalty"] == "L2") or \
        learn_options["weighted"] is None, "weighted only works with linreg L2 right now"

    # construct filename from options
    filename = construct_filename(learn_options, TEST)

    print "Cross-validating genes..."
    t2 = time.time()

    y = np.array(y_all[learn_options["target_name"]].values[:, None], dtype=np.float64)

    # concatenate feature sets into one nparray, and get the dimension of each
    inputs, dim, dimsum, feature_names = util.concatenate_feature_sets(feature_sets)

    # import pickle; pickle.dump([y, inputs, feature_names, learn_options], open("saved_models/inputs.p", "wb")); import ipdb; ipdb.set_trace()

    if not CV:
        assert learn_options['cv'] == 'gene', \
            'Must use gene-CV when CV is False (I need to use all of the genes, and stratified complicates that)'

    # set-up for cross-validation
    # the outer loop, the one Doench et al. use genes for
    if learn_options["cv"] == "stratified":
        assert not learn_options.has_key("extra pairs") or not learn_options["extra pairs"], \
            "can't use extra pairs with stratified CV; need to figure out how to properly account for genes affected by two drugs"
        label_encoder = sklearn.preprocessing.LabelEncoder()
        label_encoder.fit(y_all['Target gene'].values)
        gene_classes = label_encoder.transform(y_all['Target gene'].values)
        if 'n_folds' in learn_options.keys():
            n_splits = learn_options['n_folds']
        elif learn_options['train_genes'] is not None and learn_options["test_genes"] is not None:
            n_splits = len(learn_options["test_genes"])
        else:
            n_splits = len(learn_options['all_genes'])
        skf = sklearn.model_selection.StratifiedKFold(n_splits=n_splits, shuffle=True)
        cv = skf.split(np.zeros(len(gene_classes), dtype=np.bool), gene_classes)
        fold_labels = ["fold%d" % i for i in range(1, n_splits + 1)]
        if learn_options['num_genes_remove_train'] is not None:
            raise NotImplementedError()
    elif learn_options["cv"] == "gene":
        cv = []
        if not CV:
            train_test_tmp = get_train_test('dummy', y_all)  # get train/test split using a dummy gene
            # train_tmp, test_tmp = train_test_tmp
            # not a typo: would use the training set to test on as well, just for this case;
            # the test set is not used for internal cross-val, etc. anyway
            # train_test_tmp = (train_tmp, train_tmp)
            cv.append(train_test_tmp)
            fold_labels = ["dummy_for_no_cv"]  # learn_options['all_genes']
        elif learn_options['train_genes'] is not None and learn_options["test_genes"] is not None:
            for i, gene in enumerate(learn_options['test_genes']):
                cv.append(get_train_test(gene, y_all, learn_options['train_genes']))
            fold_labels = learn_options["test_genes"]
            # if train and test genes are separate, there should be only one fold
            train_test_disjoint = set.isdisjoint(set(learn_options["train_genes"].tolist()),
                                                 set(learn_options["test_genes"].tolist()))
        else:
            for i, gene in enumerate(learn_options['all_genes']):
                train_test_tmp = get_train_test(gene, y_all)
                cv.append(train_test_tmp)
            fold_labels = learn_options['all_genes']

        if learn_options['num_genes_remove_train'] is not None:
            for i, (train, test) in enumerate(cv):
                unique_genes = np.random.permutation(np.unique(y_all['Target gene'][train]))
                genes_to_keep = unique_genes[0:len(unique_genes) - learn_options['num_genes_remove_train']]
                filtered_train = []
                for j, gene in enumerate(y_all['Target gene']):
                    if j in train and gene in genes_to_keep:
                        filtered_train.append(j)
                cv_i_orig = copy.deepcopy(cv[i])
                cv[i] = (filtered_train, test)
                if learn_options['num_genes_remove_train'] == 0:
                    assert np.all(cv_i_orig[0] == cv[i][0])
                    assert np.all(cv_i_orig[1] == cv[i][1])
                print "# train/train after/before is %s, %s" % (len(cv[i][0]), len(cv_i_orig[0]))
                print "# test/test after/before is %s, %s" % (len(cv[i][1]), len(cv_i_orig[1]))
    else:
        raise Exception("invalid cv options given: %s" % learn_options["cv"])

    cv = [c for c in cv]  # make a list from the generator, so it can be subset for the TEST case

    if TEST:
        ind_to_use = [0]  # [0, 1]
        cv = [cv[i] for i in ind_to_use]
        fold_labels = [fold_labels[i] for i in ind_to_use]

    truth = dict([(t, dict([(m, np.array([])) for m in ['raw', 'ranks', 'thrs']])) for t in fold_labels])
    predictions = dict([(t, np.array([])) for t in fold_labels])

    m = {}
    metrics = []

    # do the cross-validation
    num_proc = learn_options["num_proc"]
    if num_proc > 1:
        num_proc = np.min([num_proc, len(cv)])
        print "using multiprocessing with %d procs--one for each fold" % num_proc
        jobs = []
        pool = multiprocessing.Pool(processes=num_proc)
        for i, fold in enumerate(cv):
            train, test = fold
            print "working on fold %d of %d, with %d train and %d test" % (i, len(cv), len(train), len(test))
            if learn_options["method"] == "GPy":
                job = pool.apply_async(azimuth.models.GP.gp_on_fold,
                                       args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options))
            elif learn_options["method"] == "linreg":
                job = pool.apply_async(azimuth.models.regression.linreg_on_fold,
                                       args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options))
            elif learn_options["method"] == "logregL1":
                job = pool.apply_async(azimuth.models.regression.logreg_on_fold,
                                       args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options))
            elif learn_options["method"] == "AdaBoostRegressor":
                job = pool.apply_async(azimuth.models.ensembles.adaboost_on_fold,
                                       args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options, False))
            elif learn_options["method"] == "AdaBoostClassifier":
                job = pool.apply_async(azimuth.models.ensembles.adaboost_on_fold,
                                       args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options, True))
            elif learn_options["method"] == "DecisionTreeRegressor":
                job = pool.apply_async(azimuth.models.ensembles.decisiontree_on_fold,
                                       args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options))
            elif learn_options["method"] == "RandomForestRegressor":
                job = pool.apply_async(azimuth.models.ensembles.randomforest_on_fold,
                                       args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options))
            elif learn_options["method"] == "ARDRegression":
                job = pool.apply_async(azimuth.models.regression.ARDRegression_on_fold,
                                       args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options))
            elif learn_options["method"] == "random":
                job = pool.apply_async(azimuth.models.baselines.random_on_fold,
                                       args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options))
            elif learn_options["method"] == "mean":
                job = pool.apply_async(azimuth.models.baselines.mean_on_fold,
                                       args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options))
            elif learn_options["method"] == "SVC":
                job = pool.apply_async(azimuth.models.baselines.SVC_on_fold,
                                       args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options))
            elif learn_options["method"] == "DNN":
                job = pool.apply_async(azimuth.models.DNN.DNN_on_fold,
                                       args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options))
            elif learn_options["method"] == "lasso_ensemble":
                job = pool.apply_async(azimuth.models.ensembles.LASSOs_ensemble_on_fold,
                                       args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options))
            elif learn_options["method"] == "doench":
                job = pool.apply_async(azimuth.models.baselines.doench_on_fold,
                                       args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options))
            elif learn_options["method"] == "sgrna_from_doench":
                job = pool.apply_async(azimuth.models.baselines.sgrna_from_doench_on_fold,
                                       args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options))
            elif learn_options["method"] == "xu_et_al":
                job = pool.apply_async(azimuth.models.baselines.xu_et_al_on_fold,
                                       args=(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options))
            else:
                raise Exception("did not find method=%s" % learn_options["method"])
            jobs.append(job)
        pool.close()
        pool.join()

        for i, fold in enumerate(cv):
            y_pred, m[i] = jobs[i].get()
            train, test = fold

            if learn_options["training_metric"] == "AUC":
                extract_fpr_tpr_for_fold(metrics, fold_labels[i], i, predictions, truth,
                                         y_all[learn_options["ground_truth_label"]].values, test, y_pred)
            elif learn_options["training_metric"] == "NDCG":
                extract_NDCG_for_fold(metrics, fold_labels[i], i, predictions, truth,
                                      y_all[learn_options["ground_truth_label"]].values, test, y_pred, learn_options)
            elif learn_options["training_metric"] == 'spearmanr':
                extract_spearman_for_fold(metrics, fold_labels[i], i, predictions, truth,
                                          y_all[learn_options["ground_truth_label"]].values, test, y_pred, learn_options)
            else:
                raise Exception("invalid 'training_metric' in learn_options: %s" % learn_options["training_metric"])

            truth, predictions = fill_in_truth_and_predictions(truth, predictions, fold_labels[i],
                                                               y_all, y_pred, learn_options, test)
        pool.terminate()
    else:  # non-parallel version
        for i, fold in enumerate(cv):
            train, test = fold
            if learn_options["method"] == "GPy":
                y_pred, m[i] = azimuth.models.GP.gp_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)
            elif learn_options["method"] == "linreg":
                y_pred, m[i] = azimuth.models.regression.linreg_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)
            elif learn_options["method"] == "logregL1":
                y_pred, m[i] = azimuth.models.regression.logreg_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)
            elif learn_options["method"] == "AdaBoostRegressor":
                y_pred, m[i] = azimuth.models.ensembles.adaboost_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options, classification=False)
            elif learn_options["method"] == "AdaBoostClassifier":
                y_pred, m[i] = azimuth.models.ensembles.adaboost_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options, classification=True)
            elif learn_options["method"] == "DecisionTreeRegressor":
                y_pred, m[i] = azimuth.models.ensembles.decisiontree_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)
            elif learn_options["method"] == "RandomForestRegressor":
                y_pred, m[i] = azimuth.models.ensembles.randomforest_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)
            elif learn_options["method"] == "ARDRegression":
                y_pred, m[i] = azimuth.models.regression.ARDRegression_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)
            elif learn_options["method"] == "GPy_fs":
                y_pred, m[i] = azimuth.models.GP.gp_with_fs_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)
            elif learn_options["method"] == "random":
                y_pred, m[i] = azimuth.models.baselines.random_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)
            elif learn_options["method"] == "mean":
                y_pred, m[i] = azimuth.models.baselines.mean_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)
            elif learn_options["method"] == "SVC":
                y_pred, m[i] = azimuth.models.baselines.SVC_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)
            elif learn_options["method"] == "DNN":
                y_pred, m[i] = azimuth.models.DNN.DNN_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)
            elif learn_options["method"] == "lasso_ensemble":
                y_pred, m[i] = azimuth.models.ensembles.LASSOs_ensemble_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)
            elif learn_options["method"] == "doench":
                y_pred, m[i] = azimuth.models.baselines.doench_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)
            elif learn_options["method"] == "sgrna_from_doench":
                y_pred, m[i] = azimuth.models.baselines.sgrna_from_doench_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)
            elif learn_options["method"] == "xu_et_al":
                y_pred, m[i] = azimuth.models.baselines.xu_et_al_on_fold(feature_sets, train, test, y, y_all, inputs, dim, dimsum, learn_options)
            else:
                raise Exception("invalid method found: %s" % learn_options["method"])

            if learn_options["training_metric"] == "AUC":
                # fills in truth and predictions
                extract_fpr_tpr_for_fold(metrics, fold_labels[i], i, predictions, truth,
                                         y_all[learn_options['ground_truth_label']].values, test, y_pred)
            elif learn_options["training_metric"] == "NDCG":
                extract_NDCG_for_fold(metrics, fold_labels[i], i, predictions, truth,
                                      y_all[learn_options["ground_truth_label"]].values, test, y_pred, learn_options)
            elif learn_options["training_metric"] == 'spearmanr':
                extract_spearman_for_fold(metrics, fold_labels[i], i, predictions, truth,
                                          y_all[learn_options["ground_truth_label"]].values, test, y_pred, learn_options)

            truth, predictions = fill_in_truth_and_predictions(truth, predictions, fold_labels[i],
                                                               y_all, y_pred, learn_options, test)

            print "\t\tRMSE: ", np.sqrt(((y_pred - y[test]) ** 2).mean())
            print "\t\tSpearman correlation: ", util.spearmanr_nonan(y[test], y_pred)[0]
            print "\t\tfinished fold/gene %i of %i" % (i + 1, len(fold_labels))

    cv_median_metric = [np.median(metrics)]
    gene_pred = [(truth, predictions)]
    print "\t\tmedian %s across gene folds: %.3f" % (learn_options["training_metric"], cv_median_metric[-1])

    t3 = time.time()
    print "\t\tElapsed time for cv is %.2f seconds" % (t3 - t2)
    return metrics, gene_pred, fold_labels, m, dimsum, filename, feature_names
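
# --- Hedged end-to-end sketch; not part of the original module. ---
# Driving this newer cross_validate with the _minimal_learn_options sketch
# defined after the first variant above; 'mean' is the cheapest baseline in
# allowed_methods. The optional 'n_folds' key (honored only by this variant,
# in the stratified branch) caps the number of StratifiedKFold splits.
def _demo_cross_validate(y_all, feature_sets, all_genes):
    learn_options = _minimal_learn_options(all_genes)
    learn_options['method'] = 'mean'
    metrics, gene_pred, fold_labels, m, dimsum, filename, feature_names = \
        cross_validate(y_all, feature_sets, learn_options=learn_options, CV=True)
    return metrics, fold_labels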