Example #1
def get_all_metrics(results,
                    learn_options_set=None,
                    test_metrics=['spearmanr'],
                    add_extras=False,
                    force_by_gene=False):
    """
    'metrics' here are the metrics used to evaluate
    """
    all_results = dict([(k, {}) for k in results.keys()])
    genes = results[results.keys()[0]][1][0][0].keys()

    for metric in test_metrics:
        for method in all_results.keys():
            all_results[method][metric] = []

    non_binary_target_name = check_learn_options_set(learn_options_set)

    for method in results.keys():
        truth, predictions = results[method][1][0]
        test_indices = results[method][-1]
        tmp_genes = results[method][1][0][0].keys()
        if len(tmp_genes) != len(genes) or np.any(np.array(tmp_genes) != np.array(genes)):
            raise Exception("genes have changed, need to modify code")
        all_truth_raw, all_truth_thrs, all_predictions = np.array(
            []), np.array([]), np.array([])

        fpr_gene = {}
        tpr_gene = {}
        y_truth_thresh_all = np.array([])
        y_pred_all = np.array([])

        for gene in genes:
            y_truth, y_pred = truth[gene], predictions[gene]
            all_truth_raw = np.append(all_truth_raw,
                                      y_truth[non_binary_target_name])
            all_truth_thrs = np.append(all_truth_thrs, y_truth['thrs'])
            all_predictions = np.append(all_predictions, y_pred)

            y_truth_thresh_all = np.append(y_truth_thresh_all, y_truth['thrs'])
            y_pred_all = np.append(y_pred_all, y_pred)

            if 'spearmanr' in test_metrics:
                spearmanr = util.spearmanr_nonan(
                    y_truth[non_binary_target_name], y_pred)[0]
                all_results[method]['spearmanr'].append(spearmanr)

            if 'spearmanr>2.5' in test_metrics:
                # NOTE: despite the name, this branch currently reports the RMSE over
                # examples whose target value exceeds 1.0; a Spearman version is
                # commented out below.
                selected = y_truth[non_binary_target_name] > 1.0
                #spearmanr = sp.stats.spearmanr(y_truth[non_binary_target_name][selected], y_pred[selected])[0]
                spearmanr = np.sqrt(
                    np.mean((y_truth[non_binary_target_name][selected] -
                             y_pred[selected])**2))
                all_results[method]['spearmanr>2.5'].append(spearmanr)

            if 'RMSE' in test_metrics:
                rmse = np.sqrt(
                    np.mean((y_truth[non_binary_target_name] - y_pred)**2))
                all_results[method]['RMSE'].append(rmse)

            if 'NDCG@5' in test_metrics:
                ndcg = ranking_metrics.ndcg_at_k_ties(
                    y_truth[non_binary_target_name], y_pred, 5)
                all_results[method]['NDCG@5'].append(ndcg)

            if 'NDCG@10' in test_metrics:
                ndcg = ranking_metrics.ndcg_at_k_ties(
                    y_truth[non_binary_target_name], y_pred, 10)
                all_results[method]['NDCG@10'].append(ndcg)

            if 'NDCG@20' in test_metrics:
                ndcg = ranking_metrics.ndcg_at_k_ties(
                    y_truth[non_binary_target_name], y_pred, 20)
                all_results[method]['NDCG@20'].append(ndcg)

            if 'NDCG@50' in test_metrics:
                ndcg = ranking_metrics.ndcg_at_k_ties(
                    y_truth[non_binary_target_name], y_pred, 50)
                all_results[method]['NDCG@50'].append(ndcg)

            if 'precision@5' in test_metrics:
                y_top_truth = (y_truth[non_binary_target_name] >= np.sort(
                    y_truth[non_binary_target_name])[::-1][:5][-1]) * 1
                y_top_pred = (y_pred >= np.sort(y_pred)[::-1][:5][-1]) * 1
                all_results[method]['precision@5'].append(
                    sklearn.metrics.precision_score(y_top_pred, y_top_truth))

            if 'precision@10' in test_metrics:
                y_top_truth = (y_truth[non_binary_target_name] >= np.sort(
                    y_truth[non_binary_target_name])[::-1][:10][-1]) * 1
                y_top_pred = (y_pred >= np.sort(y_pred)[::-1][:10][-1]) * 1
                all_results[method]['precision@10'].append(
                    sklearn.metrics.precision_score(y_top_pred, y_top_truth))

            if 'precision@20' in test_metrics:
                y_top_truth = (y_truth[non_binary_target_name] >= np.sort(
                    y_truth[non_binary_target_name])[::-1][:20][-1]) * 1
                y_top_pred = (y_pred >= np.sort(y_pred)[::-1][:20][-1]) * 1
                all_results[method]['precision@20'].append(
                    sklearn.metrics.precision_score(y_top_pred, y_top_truth))

            if 'AUC' in test_metrics:
                fpr_gene[gene], tpr_gene[gene], _ = sklearn.metrics.roc_curve(
                    y_truth['thrs'], y_pred)
                auc = sklearn.metrics.auc(fpr_gene[gene], tpr_gene[gene])
                all_results[method]['AUC'].append(auc)

    if add_extras:
        fpr_all, tpr_all, _ = sklearn.metrics.roc_curve(
            y_truth_thresh_all, y_pred_all)
        return all_results, genes, fpr_all, tpr_all, fpr_gene, tpr_gene
    else:
        return all_results, genes
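
For orientation, the per-gene metric computations inside the loop follow standard patterns. The minimal sketch below reproduces the Spearman, precision@k, and AUC calculations on hypothetical toy arrays; scipy.stats.spearmanr stands in for util.spearmanr_nonan (which additionally guards against NaNs), and the values are made up for illustration only.

import numpy as np
import scipy.stats as st
import sklearn.metrics

# Hypothetical per-gene truth and predictions (illustration only).
y_truth = np.array([0.1, 0.9, 0.4, 0.7, 0.2, 0.8])
y_pred = np.array([0.2, 0.8, 0.3, 0.6, 0.1, 0.9])

# 'spearmanr' branch: rank correlation between truth and predictions.
rho = st.spearmanr(y_truth, y_pred)[0]

# 'precision@k' branches: binarize both vectors at their k-th largest
# value, then score the overlap of the two top-k indicator vectors.
k = 3
y_top_truth = (y_truth >= np.sort(y_truth)[::-1][:k][-1]) * 1
y_top_pred = (y_pred >= np.sort(y_pred)[::-1][:k][-1]) * 1
prec_at_k = sklearn.metrics.precision_score(y_top_pred, y_top_truth)

# 'AUC' branch: ROC curve of the continuous predictions against a
# binarized target (y_truth['thrs'] in the function above).
fpr, tpr, _ = sklearn.metrics.roc_curve(y_top_truth, y_pred)
auc_val = sklearn.metrics.auc(fpr, tpr)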
Example #2
def get_all_metrics(results, learn_options_set=None, test_metrics=['spearmanr'], add_extras=False, force_by_gene=False):
    """
    'metrics' here are the metrics used to evaluate
    """
    all_results = dict([(k, {}) for k in results.keys()])
    genes = results[results.keys()[0]][1][0][0].keys()

    for metric in test_metrics:
        for method in all_results.keys():
            all_results[method][metric] = []

    non_binary_target_name = check_learn_options_set(learn_options_set)

    for method in results.keys():
        truth, predictions = results[method][1][0]
        test_indices = results[method][-1]
        tmp_genes = results[method][1][0][0].keys()
        if len(tmp_genes) != len(genes) or np.any(np.array(tmp_genes) != np.array(genes)):
            raise Exception("genes have changed, need to modify code")
        all_truth_raw, all_truth_thrs, all_predictions = np.array([]), np.array([]), np.array([])

        fpr_gene = {}
        tpr_gene = {}
        y_truth_thresh_all = np.array([])
        y_pred_all = np.array([])

        for gene in genes:
            y_truth, y_pred = truth[gene], predictions[gene]
            all_truth_raw = np.append(all_truth_raw, y_truth[non_binary_target_name])
            all_truth_thrs = np.append(all_truth_thrs, y_truth['thrs'])
            all_predictions = np.append(all_predictions, y_pred)

            y_truth_thresh_all = np.append(y_truth_thresh_all, y_truth['thrs'])
            y_pred_all = np.append(y_pred_all, y_pred)

            if 'spearmanr' in test_metrics:
                spearmanr = spearmanr_nonan(y_truth[non_binary_target_name], y_pred)[0]
                all_results[method]['spearmanr'].append(spearmanr)

            if 'spearmanr>2.5' in test_metrics:
                selected = y_truth[non_binary_target_name] > 1.0
                #spearmanr = sp.stats.spearmanr(y_truth[non_binary_target_name][selected], y_pred[selected])[0]
                spearmanr = np.sqrt(np.mean((y_truth[non_binary_target_name][selected] - y_pred[selected])**2))
                all_results[method]['spearmanr>2.5'].append(spearmanr)

            if 'RMSE' in test_metrics:
                rmse = np.sqrt(np.mean((y_truth[non_binary_target_name] - y_pred)**2))
                all_results[method]['RMSE'].append(rmse)

            if 'NDCG@5' in test_metrics:
                ndcg = ranking_metrics.ndcg_at_k_ties(y_truth[non_binary_target_name], y_pred, 5)
                all_results[method]['NDCG@5'].append(ndcg)

            if 'NDCG@10' in test_metrics:
                ndcg = ranking_metrics.ndcg_at_k_ties(y_truth[non_binary_target_name], y_pred, 10)
                all_results[method]['NDCG@10'].append(ndcg)

            if 'NDCG@20' in test_metrics:
                ndcg = ranking_metrics.ndcg_at_k_ties(y_truth[non_binary_target_name], y_pred, 20)
                all_results[method]['NDCG@20'].append(ndcg)

            if 'NDCG@50' in test_metrics:
                ndcg = ranking_metrics.ndcg_at_k_ties(y_truth[non_binary_target_name], y_pred, 50)
                all_results[method]['NDCG@50'].append(ndcg)

            if 'precision@5' in test_metrics:
                y_top_truth = (y_truth[non_binary_target_name] >= np.sort(y_truth[non_binary_target_name])[::-1][:5][-1]) * 1
                y_top_pred = (y_pred >= np.sort(y_pred)[::-1][:5][-1]) * 1
                all_results[method]['precision@5'].append(sklearn.metrics.precision_score(y_top_pred, y_top_truth))

            if 'precision@10' in test_metrics:
                y_top_truth = (y_truth[non_binary_target_name] >= np.sort(y_truth[non_binary_target_name])[::-1][:10][-1]) * 1
                y_top_pred = (y_pred >= np.sort(y_pred)[::-1][:10][-1]) * 1
                all_results[method]['precision@10'].append(sklearn.metrics.precision_score(y_top_pred, y_top_truth))

            if 'precision@20' in test_metrics:
                y_top_truth = (y_truth[non_binary_target_name] >= np.sort(y_truth[non_binary_target_name])[::-1][:20][-1]) * 1
                y_top_pred = (y_pred >= np.sort(y_pred)[::-1][:20][-1]) * 1
                all_results[method]['precision@20'].append(sklearn.metrics.precision_score(y_top_pred, y_top_truth))

            if 'AUC' in test_metrics:
                fpr_gene[gene], tpr_gene[gene], _ = sklearn.metrics.roc_curve(y_truth['thrs'], y_pred)
                auc = sklearn.metrics.auc(fpr_gene[gene], tpr_gene[gene])
                all_results[method]['AUC'].append(auc)

    if add_extras:
        fpr_all, tpr_all, _ = sklearn.metrics.roc_curve(y_truth_thresh_all, y_pred_all)
        return all_results, genes, fpr_all, tpr_all, fpr_gene, tpr_gene
    else:
        return all_results, genes
Example #3
def extract_NDCG_for_fold(metrics, fold, i, predictions, truth, y_ground_truth, test, y_pred, learn_options):
    NDCG_fold = ranking_metrics.ndcg_at_k_ties(
        y_ground_truth[test].flatten(), y_pred.flatten(), learn_options["NDGC_k"]
    )
    metrics.append(NDCG_fold)
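
ranking_metrics.ndcg_at_k_ties is defined elsewhere in the repository and handles tied predictions specially. As a rough sketch of what NDCG@k measures (without that tie handling), a plain implementation on toy data could look like the following; the function name and toy values here are hypothetical, not the repository's code.

import numpy as np

def ndcg_at_k_plain(y_true, y_score, k):
    # Rank items by predicted score, take the top k, and compare the
    # discounted cumulative gain of their true values against the ideal
    # (perfectly ordered) DCG.
    order = np.argsort(y_score)[::-1][:k]
    discounts = np.log2(np.arange(2, k + 2))
    dcg = np.sum(y_true[order] / discounts)
    idcg = np.sum(np.sort(y_true)[::-1][:k] / discounts)
    return dcg / idcg if idcg > 0 else 0.0

y_true = np.array([3.0, 1.0, 2.0, 0.0])
y_score = np.array([0.9, 0.3, 0.8, 0.1])
ndcg_at_k_plain(y_true, y_score, 3)  # 1.0, since the predicted ordering matches the truth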
Example #4
def linreg_on_fold(feature_sets, train, test, y, y_all, X, dim, dimsum, learn_options):
    '''
    Linear regression using scikit-learn: more standard regression models with
    penalization, which require nested cross-validation to select hyperparameters.
    '''

    if learn_options["weighted"] is not None and (learn_options["penalty"] != "L2" or learn_options["method"] != "linreg"):
        raise NotImplementedError("weighted prediction not implemented for any method but L2 at the moment")

    cv, n_folds = set_up_folds(learn_options, y_all.iloc[train])

    if learn_options['penalty'] == "L1":
        l1_ratio = [1.0]
    elif learn_options['penalty'] == "L2":
        l1_ratio = [0.0]
    elif learn_options['penalty'] == "EN":  # elastic net
        l1_ratio = np.linspace(0.0, 1.0, 20)

    performance = np.zeros((len(learn_options["alpha"]), len(l1_ratio)))
    degenerate_pred = np.zeros((len(learn_options["alpha"])))
    for train_inner, test_inner in cv:
        for i, alpha in enumerate(learn_options["alpha"]):
            for j, l1r in enumerate(l1_ratio):
                clf = train_linreg_model(alpha, l1r, learn_options, train_inner, X[train], y[train], y_all.iloc[train])
                if learn_options["feature_select"]:
                    clf, tmp_pred = feature_select(clf, learn_options, test_inner, train_inner, X[train], y[train])
                else:
                    tmp_pred = clf.predict(X[train][test_inner])

                if learn_options["training_metric"] == "AUC":
                    fpr, tpr, _ = roc_curve(y_all[learn_options["ground_truth_label"]][train][test_inner], tmp_pred)
                    assert ~np.any(np.isnan(fpr)), "found nan fpr"
                    assert ~np.any(np.isnan(tpr)), "found nan tpr"
                    tmp_auc = auc(fpr, tpr)
                    performance[i, j] += tmp_auc

                elif learn_options['training_metric'] == 'spearmanr':
                    spearman = util.spearmanr_nonan(y_all[learn_options['ground_truth_label']][train][test_inner], tmp_pred.flatten())[0]
                    performance[i, j] += spearman

                elif learn_options['training_metric'] == 'score':
                    performance[i, j] += clf.score(X[train][test_inner], y_all[learn_options['ground_truth_label']][train][test_inner])

                elif learn_options["training_metric"] == "NDCG":
                    assert "thresh" not in learn_options["ground_truth_label"], "for NDCG must not use thresholded ranks, but pure ranks"

                    # sorted = tmp_pred[np.argsort(y_all[ground_truth_label].values[test_inner])[::-1]].flatten()
                    # sortedgt = np.sort(y_all[ground_truth_label].values[test_inner])[::-1].flatten()
                    # tmp_perf = ranking_metrics.ndcg_at_k_ties(sorted, learn_options["NDGC_k"], sortedgt)
                    tmp_truth = y_all[learn_options["ground_truth_label"]].values[train][test_inner].flatten()
                    tmp_perf = ranking_metrics.ndcg_at_k_ties(tmp_truth, tmp_pred.flatten(), learn_options["NDGC_k"])
                    performance[i, j] += tmp_perf

                    degenerate_pred_tmp = len(np.unique(tmp_pred)) < len(tmp_pred)/2.0
                    degenerate_pred[i] += degenerate_pred_tmp

                    # sanity checking metric wrt ties, etc.
                    # rmse = np.sqrt(np.mean((tmp_pred - tmp_truth)**2))
                    tmp_pred_r, tmp_truth_r = ranking_metrics.rank_data(tmp_pred, tmp_truth)
                    # rmse_r = np.sqrt(np.mean((tmp_pred_r-tmp_truth_r)**2))

    performance /= n_folds

    max_score_ind = np.where(performance == np.nanmax(performance))
    assert max_score_ind[0][0] != len(learn_options["alpha"]) - 1, "enlarge alpha range as hitting max boundary"
    # assert degenerate_pred[max_score_ind[0][0]]==0, "found degenerate predictions at max score"

    # in the unlikely event of tied scores, take the first one.
    if len(max_score_ind[0]) > 1:
        max_score_ind = [max_score_ind[0][0], max_score_ind[1][0]]

    best_alpha, best_l1r = learn_options["alpha"][max_score_ind[0]], l1_ratio[max_score_ind[1]]

    print "\t\tbest alpha is %f from range=%s" % (best_alpha, learn_options["alpha"][[0, -1]])
    if learn_options['penalty'] == "EN":
        print "\t\tbest l1_ratio is %f from range=%s" % (best_l1r, l1_ratio[[0, -1]])
    max_perf = np.nanmax(performance)

    if max_perf < 0.0:
        raise Exception("performance is negative")

    print "\t\tbest performance is %f" % max_perf

    clf = train_linreg_model(best_alpha, best_l1r, learn_options, train, X, y, y_all)
    if learn_options["feature_select"]:
        raise Exception("untested in a long time, should double check")
        clf, y_pred = feature_select(clf, learn_options, test, train, X, y)
    else:
        y_pred = clf.predict(X[test])

    if learn_options["penalty"] != "L2":
        y_pred = y_pred[:, None]

    return y_pred, clf
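
The inner loop above is a hand-rolled grid search over (alpha, l1_ratio), scored on inner cross-validation folds and then refit with the winning pair. A simplified, self-contained sketch of that pattern is shown below; it calls scikit-learn's ElasticNet and KFold directly (the repository instead routes fitting through train_linreg_model and fold construction through set_up_folds), uses made-up data and grid values, and assumes a recent scikit-learn where KFold lives in sklearn.model_selection.

import numpy as np
import scipy.stats as st
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import KFold

# Hypothetical outer-fold training data (stand-in for X[train], y[train]).
rng = np.random.RandomState(0)
X_tr, y_tr = rng.randn(200, 10), rng.randn(200)

alphas = np.logspace(-3, 1, 5)
l1_ratios = np.linspace(0.1, 1.0, 4)
n_folds = 5
performance = np.zeros((len(alphas), len(l1_ratios)))

# Accumulate the inner-fold Spearman correlation for every (alpha, l1_ratio) cell.
for train_inner, test_inner in KFold(n_splits=n_folds).split(X_tr):
    for i, alpha in enumerate(alphas):
        for j, l1r in enumerate(l1_ratios):
            clf = ElasticNet(alpha=alpha, l1_ratio=l1r)
            clf.fit(X_tr[train_inner], y_tr[train_inner])
            pred = clf.predict(X_tr[test_inner])
            performance[i, j] += st.spearmanr(y_tr[test_inner], pred)[0]

performance /= n_folds

# Take the best cell (ignoring NaNs from degenerate fits) and refit on all training data.
best_i, best_j = np.unravel_index(np.nanargmax(performance), performance.shape)
best_clf = ElasticNet(alpha=alphas[best_i], l1_ratio=l1_ratios[best_j]).fit(X_tr, y_tr)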
Example #5
def extract_NDCG_for_fold(metrics, fold, i, predictions, truth, y_ground_truth,
                          test, y_pred, learn_options):
    NDCG_fold = ranking_metrics.ndcg_at_k_ties(y_ground_truth[test].flatten(),
                                               y_pred.flatten(),
                                               learn_options["NDGC_k"])
    metrics.append(NDCG_fold)
Example #6
def linreg_on_fold(feature_sets, train, test, y, y_all, X, dim, dimsum, learn_options):
    """
    linreg using scikitlearn, using more standard regression models with penalization requiring
    nested-cross-validation
    """

    if learn_options["weighted"] is not None and (
        learn_options["penalty"] != "L2" or learn_options["method"] != "linreg"
    ):
        raise NotImplementedError("weighted prediction not implemented for any method but L2 at the moment")

    cv, n_folds = set_up_folds(learn_options, y_all.iloc[train])

    if learn_options["penalty"] == "L1":
        l1_ratio = [1.0]
    elif learn_options["penalty"] == "L2":
        l1_ratio = [0.0]
    elif learn_options["penalty"] == "EN":  # elastic net
        l1_ratio = np.linspace(0.0, 1.0, 20)

    performance = np.zeros((len(learn_options["alpha"]), len(l1_ratio)))
    degenerate_pred = np.zeros((len(learn_options["alpha"])))
    for train_inner, test_inner in cv:
        for i, alpha in enumerate(learn_options["alpha"]):
            for j, l1r in enumerate(l1_ratio):
                clf = train_linreg_model(alpha, l1r, learn_options, train_inner, X[train], y[train], y_all.iloc[train])
                if learn_options["feature_select"]:
                    clf, tmp_pred = feature_select(clf, learn_options, test_inner, train_inner, X[train], y[train])
                else:
                    tmp_pred = clf.predict(X[train][test_inner])

                if learn_options["training_metric"] == "AUC":
                    fpr, tpr, _ = roc_curve(y_all[learn_options["ground_truth_label"]][train][test_inner], tmp_pred)
                    assert ~np.any(np.isnan(fpr)), "found nan fpr"
                    assert ~np.any(np.isnan(tpr)), "found nan tpr"
                    tmp_auc = auc(fpr, tpr)
                    performance[i, j] += tmp_auc

                elif learn_options["training_metric"] == "spearmanr":
                    spearman = util.spearmanr_nonan(
                        y_all[learn_options["ground_truth_label"]][train][test_inner], tmp_pred.flatten()
                    )[0]
                    performance[i, j] += spearman

                elif learn_options["training_metric"] == "score":
                    performance[i, j] += clf.score(
                        X[train][test_inner], y_all[learn_options["ground_truth_label"]][train][test_inner]
                    )

                elif learn_options["training_metric"] == "NDCG":
                    assert (
                        "thresh" not in learn_options["ground_truth_label"]
                    ), "for NDCG must not use thresholded ranks, but pure ranks"

                    # sorted = tmp_pred[np.argsort(y_all[ground_truth_label].values[test_inner])[::-1]].flatten()
                    # sortedgt = np.sort(y_all[ground_truth_label].values[test_inner])[::-1].flatten()
                    # tmp_perf = ranking_metrics.ndcg_at_k_ties(sorted, learn_options["NDGC_k"], sortedgt)
                    tmp_truth = y_all[learn_options["ground_truth_label"]].values[train][test_inner].flatten()
                    tmp_perf = ranking_metrics.ndcg_at_k_ties(tmp_truth, tmp_pred.flatten(), learn_options["NDGC_k"])
                    performance[i, j] += tmp_perf

                    degenerate_pred_tmp = len(np.unique(tmp_pred)) < len(tmp_pred) / 2.0
                    degenerate_pred[i] += degenerate_pred_tmp

                    # sanity checking metric wrt ties, etc.
                    # rmse = np.sqrt(np.mean((tmp_pred - tmp_truth)**2))
                    tmp_pred_r, tmp_truth_r = ranking_metrics.rank_data(tmp_pred, tmp_truth)
                    # rmse_r = np.sqrt(np.mean((tmp_pred_r-tmp_truth_r)**2))

    performance /= n_folds

    max_score_ind = np.where(performance == np.nanmax(performance))
    assert max_score_ind[0][0] != len(learn_options["alpha"]) - 1, "enlarge alpha range as hitting max boundary"
    # assert degenerate_pred[max_score_ind[0][0]]==0, "found degenerate predictions at max score"

    # in the unlikely event of tied scores, take the first one.
    if len(max_score_ind[0]) > 1:
        max_score_ind = [max_score_ind[0][0], max_score_ind[1][0]]

    best_alpha, best_l1r = learn_options["alpha"][max_score_ind[0]], l1_ratio[max_score_ind[1]]

    print "\t\tbest alpha is %f from range=%s" % (best_alpha, learn_options["alpha"][[0, -1]])
    if learn_options["penalty"] == "EN":
        print "\t\tbest l1_ratio is %f from range=%s" % (best_l1r, l1_ratio[[0, -1]])
    max_perf = np.nanmax(performance)

    if max_perf < 0.0:
        raise Exception("performance is negative")

    print "\t\tbest performance is %f" % max_perf

    clf = train_linreg_model(best_alpha, best_l1r, learn_options, train, X, y, y_all)
    if learn_options["feature_select"]:
        raise Exception("untested in a long time, should double check")
        clf, y_pred = feature_select(clf, learn_options, test, train, X, y)
    else:
        y_pred = clf.predict(X[test])

    if learn_options["penalty"] != "L2":
        y_pred = y_pred[:, None]

    return y_pred, clf