Code Example #1
File: compare_scores.py  Project: jwayne/conseval
import random

import matplotlib.pyplot as plt
# get_batchscores is a project-internal helper (import line omitted in this excerpt).

def compare_scores(dataset_name, *batchscore_ids):
    """
    Compare scores from 2 scoring methods in a scatter plot.
    """
    if len(batchscore_ids) != 2:
        raise Exception("Need 2 batchscore runs to compare")
    id1, id2 = batchscore_ids
    pos_pairs = []
    neg_pairs = []

    # Pair the two methods' scores site by site and split by testset label.
    for alignment, scores_cols in get_batchscores(dataset_name, batchscore_ids):
        ts = alignment.testset
        scores_pairs = zip(*scores_cols)
        pos_pairs += (scores_pairs[i] for i in xrange(len(scores_pairs)) if ts[i])
        neg_pairs += (scores_pairs[i] for i in xrange(len(scores_pairs)) if not ts[i])

    plt.figure()
    # Subsample so the scatter plot stays readable.
    x1, y1 = zip(*random.sample(neg_pairs, min(len(neg_pairs), 10000)))
    plt.scatter(x1, y1, color="red")
    x2, y2 = zip(*random.sample(pos_pairs, min(len(pos_pairs), 1000)))
    plt.scatter(x2, y2, color="green")
    plt.xlabel(id1)
    plt.ylabel(id2)
    plt.xlim([min(min(x1), min(x2)), max(max(x1), max(x2))])
    plt.ylim([min(min(y1), min(y2)), max(max(y1), max(y2))])
    plt.show()
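A minimal invocation sketch; the dataset name and the two run IDs below are hypothetical placeholders for whatever batchscore runs exist on disk:

# Hypothetical usage -- 'csa' and both run IDs are placeholders.
compare_scores('csa', 'js_divergence', 'rate4site_eb')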
Code Example #2
File: compute_variance.py  Project: jwayne/conseval
import numpy as np
import matplotlib.pyplot as plt
# get_batchscores is a project-internal helper (import line omitted in this excerpt).

def compute_variance(dataset_name, *batchscore_ids):
    assert len(batchscore_ids) == 1
    var_list = []
    for alignment, scores_cols in get_batchscores(dataset_name, batchscore_ids):
        # Variance of the scores across all sites of this alignment.
        var_list.append(np.var(zip(*scores_cols)))
    print "Mean variance: %f" % np.mean(var_list)
    plt.hist(var_list)
    plt.title('Per-sequence variances of rates r')
    plt.xlabel('Variance')
    plt.ylabel('Count')
    plt.show()
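A minimal invocation sketch; both arguments are hypothetical placeholders:

# Hypothetical usage -- exactly one run ID is expected.
compute_variance('csa', 'rate4site_eb')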
Code Example #3
File: compute_summary.py  Project: jwayne/conseval
import numpy as np
# get_batchscores is a project-internal helper (import line omitted in this excerpt).

def compute_summary(dataset_name):
    n_sites = []
    n_positives = []
    n_seqs = []
    n_seqs_orig = []
    for alignment, _ in get_batchscores(dataset_name):
        n_sites.append( len(alignment.msa[0]) )
        n_positives.append( alignment.testset.count(1) )
        n_seqs.append( len(alignment.msa) )
        n_seqs_orig.append( alignment.orig_num_sequences )

    print "Avg # seqs per alignment: %d" % np.mean(n_seqs)
    print "Avg # seqs per alignment before filtering: %d" % np.mean(n_seqs_orig)
    print "Avg # sites per alignment: %d" % np.mean(n_sites)
    print "Avg %% positives per alignment: %f" % np.mean(np.array(n_positives) / np.array(n_sites))
    print "%% positives total: %f" % ( np.sum(np.array(n_positives)) / np.sum(np.array(n_sites)) )
Code Example #4
File: pr_roc.py  Project: jwayne/conseval
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve, roc_curve
# get_batchscores, print_auc, plot_pr, and plot_roc are project-internal
# helpers (import lines omitted in this excerpt).

def pr_roc(dataset_name, *batchscore_ids):
    """
    Draw PR and ROC curves for each scorer.
    """
    allscores_cols = [[] for i in batchscore_ids]
    test_scores = []

    # Just aggregate all scores across all data files.  It isn't much memory anyway.
    for alignment, scores_cols in get_batchscores(dataset_name, batchscore_ids):
        ts = alignment.testset
        # Keep only sites that have a testset label (ts[i] is not None).
        for allscores_col, scores in zip(allscores_cols, scores_cols):
            allscores_col += [scores[i] for i in xrange(len(ts)) if ts[i] is not None]
        test_scores += [ts[i] for i in xrange(len(ts)) if ts[i] is not None]

    scorer_fprs = []
    scorer_tprs = []
    scorer_precisions = []
    scorer_recalls = []
    for allscores_col in allscores_cols:
        fprs, tprs, _ = roc_curve(test_scores, allscores_col, pos_label=1)
        scorer_fprs.append(fprs)
        scorer_tprs.append(tprs)
        precisions, recalls, _ = precision_recall_curve(test_scores, allscores_col, pos_label=1)
        scorer_precisions.append(precisions)
        scorer_recalls.append(recalls)

    print_auc("PR", batchscore_ids, scorer_recalls, scorer_precisions)
    print_auc("ROC", batchscore_ids, scorer_fprs, scorer_tprs)

    plot_pr(dataset_name, scorer_precisions, scorer_recalls, batchscore_ids)
    plot_roc(dataset_name, scorer_fprs, scorer_tprs, batchscore_ids, .5)
    plt.show(block=False)

    print """\n* To plot another PR curve:
plot_pr(dataset_name, scorer_precisions, scorer_recalls, batchscore_ids, legend='upper right')
plt.show()"""
    print """\n* To plot another ROC curve:
plot_roc(dataset_name, scorer_fprs, scorer_tprs, batchscore_ids, x_max=.5, legend='lower right')
plt.show()"""
    print ""
    import ipdb
    ipdb.set_trace()
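A minimal invocation sketch; the arguments are hypothetical placeholders:

# Hypothetical usage -- draws PR and ROC curves for two batchscore runs,
# then leaves you at an ipdb prompt for re-plotting.
pr_roc('csa', 'js_divergence', 'rate4site_eb')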
Code Example #5
File: hist_scores.py  Project: jwayne/conseval
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as ss
# get_batchscores is a project-internal helper (import line omitted in this excerpt).

def hist_scores(dataset_name, *batchscore_ids, **kwargs):
    """
    Histogram positive and negative scores for each scorer.  Plot F1 for corresponding
    thresholds alongside.
    """
    noblock = kwargs.get('noblock', False)
    fit_gamma = kwargs.get('fit_gamma', False)

    batchscore_ids = list(batchscore_ids)
    N = len(batchscore_ids)
    pos_cols = [[] for i in xrange(N)]
    neg_cols = [[] for i in xrange(N)]

    # Just aggregate all scores across all data files.  It isn't much memory anyway.
    for alignment, scores_cols in get_batchscores(dataset_name, batchscore_ids):
        ts = alignment.testset
        for pos_col, neg_col, scores_col in zip(pos_cols, neg_cols, scores_cols):
            pos_col += (scores_col[i] for i in xrange(len(scores_col)) if ts[i])
            neg_col += (scores_col[i] for i in xrange(len(scores_col)) if not ts[i])

    for i in xrange(len(pos_cols)):
        pos_col = pos_cols[i]
        if pos_col and isinstance(pos_col[0], tuple) and len(pos_col[0]) == 2:
            # We are analyzing r4s_func, whose scores are (rate, confidence)
            # pairs; split each column of pairs into two separate columns.
            pos_cols[i], pos_col_new = zip(*pos_col)
            pos_cols.append(pos_col_new)
            neg_cols[i], neg_col_new = zip(*neg_cols[i])
            neg_cols.append(neg_col_new)
            batchscore_ids.append(batchscore_ids[i] + '-c')

    figs = []
    for pos_col, neg_col, batchscore_id in zip(pos_cols, neg_cols, batchscore_ids):
        pos_col = np.array(pos_col)
        neg_col = np.array(neg_col)

        # Compute summary statistics; cast to float to avoid Python 2
        # integer division when taking the pos/neg fractions.
        tot = float(len(pos_col) + len(neg_col))
        pos_mean = np.mean(pos_col)
        pos_var = np.var(pos_col)
        neg_mean = np.mean(neg_col)
        neg_var = np.var(neg_col)
        print "%s:\n\tpos %f, neg %f\n\tpos mean %f, var %f\n\tneg mean %f, var %f" % (
            batchscore_id, len(pos_col)/tot, len(neg_col)/tot,
            pos_mean, pos_var, neg_mean, neg_var)

        # Compute plot statistics
        scores_min = np.min((np.min(pos_col), np.min(neg_col)))
        scores_max = np.max((np.max(pos_col), np.max(neg_col)))
        bins = np.linspace(scores_min, scores_max, 101)
        bin_width = bins[1] - bins[0]
        bin_lows = bins[:-1]
        bin_mids = bin_lows + bin_width / 2

        # Plot counts
        plt.ion()
        fig = plt.figure()
        plt.xlabel('Score')
        pos_counts, _ = np.histogram(pos_col, bins)
        neg_counts, _ = np.histogram(neg_col, bins)
        ax = plt.gca()
        ax.bar(bin_lows, neg_counts, bin_width, color='r', alpha=0.5)
        ax.bar(bin_lows, pos_counts, bin_width, color='g', alpha=0.8)
        ax.set_ylabel('Count')

        # Plot gamma fits if desired, using method-of-moments estimates
        # (shape a = mean^2/var, scale = var/mean)
        if fit_gamma:
            x = np.abs(bin_mids)
            plt.plot(bin_mids, bin_width * len(pos_col) * \
                ss.gamma.pdf(x, a=abs(pos_mean)**2/pos_var, scale=pos_var/abs(pos_mean)),
                'g')
            plt.plot(bin_mids, bin_width * len(neg_col) * \
                ss.gamma.pdf(x, a=abs(neg_mean)**2/neg_var, scale=neg_var/abs(neg_mean)),
                'r')

        # Plot corresponding F1: sweep the threshold from the highest bin
        # down; everything at or above the threshold is called positive.
        pos_tot = float(sum(pos_counts))
        pos_right = 0
        neg_right = 0
        f1 = []
        for pos_count, neg_count in reversed(zip(pos_counts, neg_counts)):
            pos_right += pos_count
            neg_right += neg_count
            recall = pos_right / pos_tot
            precision = pos_right / float(pos_right + neg_right)
            if precision or recall:
                f1.append( 2 * precision * recall / (precision + recall) )
            else:
                f1.append(0)
        f1 = np.array(list(reversed(f1)))
        ax = ax.twinx()
        ax.plot(bin_mids, f1, '--')
        ax.set_ylabel('F1')

        plt.title('Scores (%s, %s)' % (batchscore_id, dataset_name))
        figs.append(fig)

    if noblock:
        plt.show()
    else:
        # Keep the figures open and drop into a debugger for interactive inspection.
        plt.show(block=False)
        import ipdb
        ipdb.set_trace()
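A minimal invocation sketch; the arguments and flag value are hypothetical placeholders:

# Hypothetical usage -- histogram one run's scores with gamma fits overlaid.
hist_scores('csa', 'rate4site_eb', fit_gamma=True)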
Code Example #6
File: r4s_ppc.py  Project: jwayne/conseval
import os
import random

import matplotlib.pyplot as plt
import numpy as np
# The remaining names used below (DATASET_CONFIGS, get_batchscores,
# get_batchscore_dir, read_batchscores, Rate4siteEb, JsDivergence, Alignment,
# MockAlignment, precompute_tree_probs, weighted_choice, amino_acids, N_RUNS)
# are project-internal; their import lines are omitted in this excerpt.

def r4s_ppc(dataset_name, **jsd_params):
    afs = list(get_batchscores(dataset_name, align_files_only=True))

    dc = DATASET_CONFIGS[dataset_name]
    r4s_name = 'R4S_EB-vanilla'
    r4s_dir = os.path.join(get_batchscore_dir(dataset_name), r4s_name)

    r4s = Rate4siteEb()
    jsd = JsDivergence(**jsd_params)

    # Choose random alignment/scores pair
    align_file = random.choice(afs)
    test_file = dc.get_test_file(align_file)
    r4s_file = dc.get_out_file(align_file, r4s_dir)
    alignment = Alignment(align_file, test_file=test_file, parse_testset_fn=dc.parse_testset_fn)
    n_seqs = len(alignment.msa)
    n_sites = len(alignment.msa[0])

    fig = plt.figure()
    ax = plt.gca()
    inds = range(n_sites)

    rates = read_batchscores(r4s_file)
    tree = alignment.get_phylotree()
    root = tree.root
    names_map = dict((name,i) for i,name in enumerate(alignment.names))

    # Pre-compute substitution probabilities for every branch, at every site's rate
    P_cached = precompute_tree_probs(tree, rates, r4s.sub_model)
    for r in P_cached:
        for node in P_cached[r]:
            # We treat root separately
            if node != root:
                cum_probs = np.cumsum(P_cached[r][node],axis=1)
                if not np.all(np.abs(cum_probs[:,-1]-1) < 1e-4):
                    raise ValueError("Bad probability matrix")
                cum_probs[:,-1] = 1
                P_cached[r][node] = np.array(cum_probs)
    root_freqs = np.cumsum(r4s.sub_model.freqs)
    if not abs(root_freqs[-1]-1) < 1e-4:
        raise ValueError("Bad probability matrix")
    root_freqs[-1] = 1

    # Generate N_RUNS replicate alignments under the fitted rates
    jsd_rep_scores_all = []
    for n in xrange(N_RUNS):
        # For each site, generate amino acids for each sequence using that site's rate
        # DFS through tree, setting amino acids at each node
        msa = [[] for i in xrange(n_seqs)]
        for i in xrange(n_sites):
            r = rates[i]
            aa_ind = weighted_choice(root_freqs)
            # Breadth-first traversal: iterating over the worklist while
            # extending it is safe for Python lists.
            bfs = [(aa_ind, node) for node in root.clades]
            for aa_ind, node in bfs:
                aa_ind = weighted_choice(P_cached[r][node][aa_ind])
                if node.is_terminal():
                    msa[names_map[node.name]].append(amino_acids[aa_ind])
                else:
                    bfs += ((aa_ind,child) for child in node.clades)
        aln_rep = MockAlignment(alignment.names, msa, tree, alignment.get_seq_weights)
        jsd_rep_scores = jsd.score(aln_rep)
        jsd_rep_scores_all.append(jsd_rep_scores)
        ax.scatter(inds, jsd_rep_scores, color='k', alpha=0.2)

    jsd_orig_scores = jsd.score(alignment)
    ts = alignment.testset
    pos_inds = [i for i in inds if ts[i]]
    pos_orig_scores = [jsd_orig_scores[i] for i in pos_inds]
    neg_inds = [i for i in inds if not ts[i]]
    neg_orig_scores = [jsd_orig_scores[i] for i in neg_inds]
    ax.scatter(pos_inds, pos_orig_scores, color='g')
    ax.scatter(neg_inds, neg_orig_scores, color='r')
    ax.set_ylabel('JSD score')

    rep_scores_per_col = np.array(jsd_rep_scores_all).T
    means = np.mean(rep_scores_per_col, axis=1)
    stds = np.std(rep_scores_per_col, axis=1)
    zscores = (np.array(jsd_orig_scores) - means) / stds
    ax2 = ax.twinx()
    ax2.plot(inds, zscores, '--')
    ax2.set_ylabel('Deviations')

    plt.xlim(inds[0], inds[-1])
    plt.xlabel('Sites')

    plt.figure()
    plt.scatter(rates, zscores)
    plt.xlabel('Rates')
    plt.ylabel('Deviations')

    plt.show(block=False)
    # Drop into a debugger so the figures can be inspected interactively.
    import ipdb
    ipdb.set_trace()
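A minimal invocation sketch; the dataset name is a hypothetical placeholder, and any keyword arguments are passed through to the JsDivergence scorer:

# Hypothetical usage -- posterior predictive check of rate4site rates
# using JSD scores on replicate alignments.
r4s_ppc('csa')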