Exemple #1
0
def peruse_forward_scores():
    """ Read the forward log probs for each value in <n_set_list>, and plot their deviations.

    For each n_set, <outputdir>/<n_set>-forward.csv is read and three numbers are stored per reco id: the raw
    log prob, the log prob times 1/n_set, and the log prob times (1 - 0.24/n_set^0.9)/n_set. These are handed
    to get_deviations() with i_baseline = -1 (presumably meaning the last entry of <n_set_list> is the
    baseline -- that is also what the y axis label says), and the results are written to <baseplotdir> as the
    'forwards', 'corrected-forwards', and 'signed-corrected-forwards' plots.

    Raises an Exception if a reco id shows up more than once in one csv file.
    """
    _, reco_info = seqfileopener.get_seqfile_info(simfname, is_data=False)  #, n_max_queries=10000)

    logprobs = OrderedDict()  # raw values
    partialcorr_logprobs = OrderedDict()  # scaled by 1/n_set
    corr_logprobs = OrderedDict()  # scaled by the empirical factor
    for n_set in n_set_list:
        print(n_set)
        raw, partial, full = OrderedDict(), OrderedDict(), OrderedDict()
        logprobs[n_set], partialcorr_logprobs[n_set], corr_logprobs[n_set] = raw, partial, full
        with open(outputdir + '/' + str(n_set) + '-forward.csv') as csvfile:
            for line in csv.DictReader(csvfile):
                uidlist = line['unique_ids'].split(':')
                assert utils.from_same_event(reco_info, uidlist)
                reco_id = reco_info[uidlist[0]]['reco_id']
                if reco_id in raw:
                    raise Exception('already had %s' % reco_id)

                logprob = float(line['logprob'])  # only parse it once
                raw[reco_id] = logprob
                partial[reco_id] = (1. / n_set) * logprob
                # an alternative factor that was tried: 1. / (0.77547824*n_set + 0.20327936)
                full[reco_id] = ((1. - 0.24 / pow(float(n_set), 0.9)) / n_set) * logprob

    i_baseline = -1
    deviations = get_deviations(logprobs, i_baseline)
    partialcorr_deviations = get_deviations(partialcorr_logprobs, i_baseline)
    signed_partialcorr_deviations = get_deviations(partialcorr_logprobs, i_baseline, signed=True)
    corr_deviations = get_deviations(corr_logprobs, i_baseline)
    signed_corr_deviations = get_deviations(corr_logprobs, i_baseline, signed=True)

    import plotting

    def finish(axis, plotname):
        # all three plots share the same directory and axis labels
        plotting.mpl_finish(axis, baseplotdir, plotname, xlabel='N simultaneous seqs', ylabel='log prob deviation to ' + str(n_set_list[i_baseline]))

    def draw_zero_line(axis):
        # horizontal reference line at zero, spanning the whole x range
        axis.plot([n_set_list[0], n_set_list[-1]], [0, 0])

    # uncorrected deviations
    _, ax = plotting.mpl_init()
    ax.plot(n_set_list, deviations, marker='.')
    finish(ax, 'forwards')

    # both corrections, absolute and signed, on one set of axes
    _, ax = plotting.mpl_init()
    labeled_curves = [
        (partialcorr_deviations, '1/n (abs)'),
        (signed_partialcorr_deviations, '1/n'),
        (corr_deviations, '1/crap (abs)'),
        (signed_corr_deviations, '1/crap'),
    ]
    for yvals, label in labeled_curves:
        ax.plot(n_set_list, yvals, marker='.', label=label)
    draw_zero_line(ax)
    finish(ax, 'corrected-forwards')

    # the signed, fully corrected deviations on their own
    _, ax = plotting.mpl_init()
    ax.plot(n_set_list, signed_corr_deviations, marker='.')
    draw_zero_line(ax)
    finish(ax, 'signed-corrected-forwards')
Exemple #2
0
    def make_transition_plot(self, gene_name, model):
        """ Draw the outgoing transition probabilities of each state in <model> as one stacked column per state.

        The plot is written with plotting.mpl_finish() to <self.base_plotdir>/transitions, using <gene_name> as the plot name.
        NOTE shares a lot with make_mutefreq_plot() in python/paramutils.py
        """
        fig, ax = plotting.mpl_init()
        fig.set_size_inches(plotting.plot_ratios[utils.get_region(gene_name)])

        print(utils.color_gene(utils.unsanitize_name(gene_name)))

        line_alpha, line_width = 0.6, 3
        colors_in_legend = set()  # a color gets added here the first time it's plotted, so each one only appears once in the legend
        for ibin, state in enumerate(model.states):
            left_edge, right_edge = -0.5 + ibin, 0.5 + ibin

            # bin label
            ax.text(left_edge, -0.075, paramutils.simplify_state_name(state.name), rotation='vertical', size=8)

            # states whose name starts with IG or TR are sorted by the integer in their simplified name, everything else by the name itself
            sort_keys = {}
            for name in state.transitions.keys():
                if name.find('IG') == 0 or name.find('TR') == 0:
                    sort_keys[name] = int(paramutils.simplify_state_name(name))
                else:
                    sort_keys[name] = name
            ordered_to_states = sorted(sort_keys.items(), key=operator.itemgetter(1))

            bottom = 0.0  # sum of the probabilities that've been drawn so far in this column
            for to_state, simple_to_state in ordered_to_states:
                prob = state.transitions[to_state]

                simple_str = str(simple_to_state)
                if 'insert' in simple_str:
                    label, color = 'insert', '#3498db'  # blue
                elif simple_str == 'end':
                    label, color = 'end', 'red'
                else:  # regional/internal states
                    assert to_state.find('IG') == 0 or to_state.find('TR') == 0
                    label, color = 'internal', 'green'

                if color in colors_in_legend:
                    legend_label = None
                else:
                    legend_label = label
                    colors_in_legend.add(color)

                top = bottom + prob

                # horizontal line at the top of this chunk
                ax.plot([left_edge, right_edge], [top, top], color=color, linewidth=line_width, alpha=line_alpha, label=legend_label)

                # vertical line from (just above) the bottom of the chunk to its top
                ax.plot([ibin, ibin], [bottom + 0.01, top], color=color, alpha=line_alpha, linewidth=line_width)

                bottom = top

        ax.get_xaxis().set_visible(False)
        plotting.mpl_finish(ax, self.base_plotdir + '/transitions', gene_name, ybounds=(-0.01, 1.01), xbounds=(-3, len(model.states) + 3), leg_loc=(0.95, 0.1), adjust={'left' : 0.1, 'right' : 0.8}, leg_prop={'size' : 8})
Exemple #3
0
    def sample(self, n_vals, include_overflows=False, debug_plot=False):  # draw <n_vals> random numbers from the x axis, according to the probabilities given by the bin contents NOTE similarity to recombinator.choose_vdj_combo()
        """ Return a list of <n_vals> bin centers, each chosen with probability given by that bin's (normalized) contents.

        The histogram is normalized in place before drawing. <include_overflows> has to be False (this is asserted).
        If <debug_plot> is set, the original and a histogram of the sampled values are drawn on the same axes.
        """
        assert not include_overflows  # probably doesn't really make sense (since contents of overflows could've been from anywhere below/above, but we'd only return bin center), this is just a way to remind that it doesn't make sense
        self.normalize(include_overflows=include_overflows)  # if this is going to get called a lot with n_vals of 1, this would be slow, but otoh we *really* want to make sure things are normalized with include_overflows the same as it is here
        centers = self.get_bin_centers()
        pvals = numpy.random.uniform(0, 1, size=n_vals)

        # work out the slice [lower, upper) of the cumulative distribution that belongs to each bin
        cumulative_slices = []
        running_sum = 0.
        for ibin in self.ibiniter(include_overflows):
            lower = running_sum
            running_sum += self.bin_contents[ibin]
            cumulative_slices.append((lower, running_sum, ibin))

        # then give each random number the center of the bin whose slice it landed in
        return_vals = [None] * len(pvals)
        for iprob, pval in enumerate(pvals):
            for lower, upper, ibin in cumulative_slices:
                if lower <= pval < upper:
                    return_vals[iprob] = centers[ibin]
        assert None not in return_vals

        if debug_plot:
            import plotting
            fig, ax = plotting.mpl_init()
            self.mpl_plot(ax, label='original')
            sampled_hist = Hist(value_list=return_vals, init_int_bins=True)
            sampled_hist.normalize(include_overflows=False)
            sampled_hist.mpl_plot(ax, label='sampled', color='red')
            plotting.mpl_finish(ax, '', 'tmp')

        return return_vals
Exemple #4
0
    def make_transition_plot(self, gene_name, model):
        """ Plot, for each state in <model>, the probabilities of its outgoing transitions stacked in a single column.

        The result is handed to plotting.mpl_finish(), with <self.base_plotdir>/transitions as the directory and <gene_name> as the plot name.
        NOTE shares a lot with make_mutefreq_plot() in python/paramutils.py
        NOTE(review): only names starting with 'IG' are treated as germline states here, so any other non-insert, non-end state name trips the assert below
        """
        fig, ax = plotting.mpl_init()
        fig.set_size_inches(plotting.plot_ratios[utils.get_region(gene_name)])

        print(utils.color_gene(utils.unsanitize_name(gene_name)))

        # (label, color) for each kind of transition
        insert_style = ('insert', '#3498db')  # blue
        end_style = ('end', 'red')
        internal_style = ('internal', 'green')
        alpha, width = 0.6, 3

        seen_colors = set()  # colors that already have a legend entry
        for ibin, state in enumerate(model.states):
            xlo, xhi = -0.5 + ibin, 0.5 + ibin

            # bin label
            ax.text(xlo, -0.075, paramutils.simplify_state_name(state.name), rotation='vertical', size=8)

            # sort the IG states by the integer in their simplified name, and the others by their name
            keyed_names = {}
            for name in state.transitions.keys():
                is_germline = name.find('IG') == 0
                keyed_names[name] = int(paramutils.simplify_state_name(name)) if is_germline else name
            keyed_names = sorted(keyed_names.items(), key=operator.itemgetter(1))

            total = 0.0
            for to_state, simple_to_state in keyed_names:
                prob = state.transitions[to_state]

                if 'insert' in str(simple_to_state):
                    label, color = insert_style
                elif str(simple_to_state) == 'end':
                    label, color = end_style
                else:  # regional/internal states
                    assert to_state.find('IG') == 0
                    label, color = internal_style

                # only label a color the first time that it's drawn
                label_to_use = None if color in seen_colors else label
                seen_colors.add(color)

                height = total + prob

                # horizontal line at height total+prob
                ax.plot([xlo, xhi], [height, height], color=color, linewidth=width, alpha=alpha, label=label_to_use)

                # vertical line from total to total + prob
                ax.plot([ibin, ibin], [total + 0.01, height], color=color, alpha=alpha, linewidth=width)

                total = height

        ax.get_xaxis().set_visible(False)
        plotting.mpl_finish(ax, self.base_plotdir + '/transitions', gene_name, ybounds=(-0.01, 1.01), xbounds=(-3, len(model.states) + 3), leg_loc=(0.95, 0.1), adjust={'left' : 0.1, 'right' : 0.8}, leg_prop={'size' : 8})
Exemple #5
0
def peruse_naive_seqs():
    from hist import Hist
    # hall = Hist(n_set_list[-1], n_set_list[0] - 0.5, n_set_list[-1] + 0.5)
    means = []
    for n_set in n_set_list:
        plotdir = baseplotdir + '/' + str(n_set)
        hist = Hist(fname=plotdir + '/hmm/hamming_to_true_naive.csv')
        print '%2d   %.2f' % (n_set, hist.get_mean()),
        # hall.set_ibin(hall.find_bin(n_set), hist.get_mean())
        means.append(hist.get_mean())
    
    import plotting
    fig, ax = plotting.mpl_init()
    # hall.mpl_plot(ax)
    ax.plot(n_set_list, means, marker='.')
    plotting.mpl_finish(ax, baseplotdir, 'means', xlabel='N simultaneous seqs', ylabel='mean hamming to true naive', ybounds=(0, None))
Exemple #6
0
def plotheatmap(plotdir, plotname, difftype, genelist=None, genesets=None, title='', xtitle=''):
    """ Draw a square heatmap of <difftype> values, and save it as <plotdir>/plots/<plotname>.svg.

    At most one of <genelist> and <genesets> may be set (asserted):
      - <genelist>: the matrix comes from get_gene_pair_matrix(), and the ticks are labeled with the summarized gene names
      - <genesets>: the matrix comes from get_gene_set_mean_matrix(), and the ticks are labeled with the keys of <genesets>
    <title> is the plot title, and <xtitle> is the label on the color bar. The color scale runs from 0 to 0.5,
    with anything below that drawn in white.

    Raises an Exception if neither <genelist> nor <genesets> is specified.
    """
    import copy

    assert genelist is None or genesets is None
    if genelist is not None:
        smatrix = get_gene_pair_matrix(genelist, difftype)
        xticklabels = [utils.summarize_gene_name(g) for g in genelist]
    elif genesets is not None:
        smatrix = get_gene_set_mean_matrix(genesets, difftype)
        xticklabels = list(genesets.keys())
    else:
        raise Exception('no gene list specified')
    assert len(smatrix) == len(smatrix[0])  # uh, I think I need this to be true

    fig, ax = plotting.mpl_init()
    plt.tick_params(axis='both', which='major', labelsize=7)
    plt.gcf().subplots_adjust(bottom=0.14, left=0.18, right=0.95, top=0.92)
    ax.xaxis.set_ticks_position('bottom')
    ax.yaxis.set_ticks_position('left')
    data = numpy.array(smatrix)
    # work on a private copy: calling set_under() on plt.cm.Blues itself would change the shared colormap for every other plot in this process
    cmap = copy.deepcopy(plt.cm.Blues)  #cm.get_cmap('jet')
    cmap.set_under('w')
    heatmap = ax.pcolor(data, cmap=cmap, vmin=0., vmax=0.5)  #vmax=numpy.amax(smatrix))
    cbar = plt.colorbar(heatmap, shrink=0.9, pad=0.01)
    cbar.set_label(xtitle, rotation=270, labelpad=30)
    cbar.ax.tick_params(labelsize=10)

    # put each tick in the middle of its cell; the matrix is square, so both axes use the same labels
    ticks = [n - 0.5 for n in range(1, len(xticklabels) + 1, 1)]
    yticklabels = xticklabels
    plt.xticks(ticks, xticklabels, rotation=90)
    plt.yticks(ticks, yticklabels)

    plt.title(title)

    if not os.path.exists(plotdir + '/plots'):
        os.makedirs(plotdir + '/plots')
    plt.savefig(plotdir + '/plots/' + plotname + '.svg')
    plt.close()
Exemple #7
0
def plotheatmap(plotdir, plotname, difftype, genelist=None, genesets=None, title='', xtitle=''):
    """ Make a heatmap from a square matrix of <difftype> values, and write it to <plotdir>/plots/<plotname>.svg.

    The matrix is built either from <genelist> (with get_gene_pair_matrix(), ticks labeled by summarized gene
    name) or from <genesets> (with get_gene_set_mean_matrix(), ticks labeled by the keys of <genesets>). It is
    asserted that they are not both set. <title> goes on top of the plot and <xtitle> on the color bar. Colors
    span 0 to 0.5, and values under that range are white.

    Raises an Exception if both <genelist> and <genesets> are None.
    """
    import copy

    assert genelist is None or genesets is None
    if genelist is not None:
        smatrix = get_gene_pair_matrix(genelist, difftype)
        xticklabels = [utils.summarize_gene_name(g) for g in genelist]
    elif genesets is not None:
        smatrix = get_gene_set_mean_matrix(genesets, difftype)
        xticklabels = list(genesets.keys())
    else:
        raise Exception('no gene list specified')
    assert len(smatrix) == len(smatrix[0])  # uh, I think I need this to be true

    fig, ax = plotting.mpl_init()
    plt.tick_params(axis='both', which='major', labelsize=7)
    plt.gcf().subplots_adjust(bottom=0.14, left=0.18, right=0.95, top=0.92)
    ax.xaxis.set_ticks_position('bottom')
    ax.yaxis.set_ticks_position('left')
    data = numpy.array(smatrix)
    # plt.cm.Blues is a module-level object shared by everything that uses it, so copy it before changing its under-range color
    cmap = copy.deepcopy(plt.cm.Blues)  #cm.get_cmap('jet')
    cmap.set_under('w')
    heatmap = ax.pcolor(data, cmap=cmap, vmin=0., vmax=0.5)  #vmax=numpy.amax(smatrix))
    cbar = plt.colorbar(heatmap, shrink=0.9, pad=0.01)
    cbar.set_label(xtitle, rotation=270, labelpad=30)
    cbar.ax.tick_params(labelsize=10)

    # ticks sit at the cell centers, and (since the matrix is square) the y axis reuses the x axis labels
    ticks = [n - 0.5 for n in range(1, len(xticklabels) + 1, 1)]
    yticklabels = xticklabels
    plt.xticks(ticks, xticklabels, rotation=90)
    plt.yticks(ticks, yticklabels)

    plt.title(title)

    if not os.path.exists(plotdir + '/plots'):
        os.makedirs(plotdir + '/plots')
    plt.savefig(plotdir + '/plots/' + plotname + '.svg')
    plt.close()
Exemple #8
0
def make_mutefreq_plot(plotdir, gene_name, positions):
    """ Plot the nucleotide frequencies at each position of <gene_name> as stacked columns, and write the plot to <plotdir>.

    <positions> is a sequence of dicts, one per position, each with a 'name' (which, after simplify_state_name(),
    labels the column) and a 'nuke_freqs' dict from nucleotide (A, C, G, or T) to its frequency at that position.
    Within a column the nucleotides are stacked from most to least frequent.
    NOTE shares a lot with make_transition_plot() in bin/plot-hmms.py.
    """
    import plotting  # NOTE this has to come after the docstring, otherwise the string above isn't the docstring any more
    nuke_colors = {'A' : 'red', 'C' : 'blue', 'G' : 'orange', 'T' : 'green'}
    fig, ax = plotting.mpl_init()
    fig.set_size_inches(plotting.plot_ratios[utils.get_region(gene_name)])

    print(utils.color_gene(utils.unsanitize_name(gene_name)))
    alpha = 0.6
    legend_colors = set()  # add a color to this the first time it's plotted, so it only gets one legend entry
    for ibin, info in enumerate(positions):
        # make label below bin
        ax.text(-0.5 + ibin, -0.075, simplify_state_name(info['name']), rotation='vertical', size=8)

        total = 0.0  # sum of the frequencies drawn so far in this column
        for nuke, prob in sorted(info['nuke_freqs'].items(), key=operator.itemgetter(1), reverse=True):
            color = nuke_colors[nuke]

            label_to_use = None
            if color not in legend_colors:
                label_to_use = nuke
                legend_colors.add(color)

            # horizontal line at height total+prob
            ax.plot([-0.5 + ibin, 0.5 + ibin], [total + prob, total + prob], color=color, alpha=alpha, linewidth=3, label=label_to_use)

            # vertical line from total to total + prob
            ax.plot([ibin, ibin], [total + 0.01, total + prob], color=color, alpha=alpha, linewidth=3)

            total += prob

    ax.get_xaxis().set_visible(False)
    plotting.mpl_finish(ax, plotdir, gene_name, ybounds=(-0.01, 1.01), xbounds=(-3, len(positions) + 3), leg_loc=(0.95, 0.1), adjust={'left' : 0.1, 'right' : 0.8}, leg_prop={'size' : 8})
Exemple #9
0
def make_mutefreq_plot(plotdir, gene_name, positions):
    """ Draw one stacked column of nucleotide frequencies per entry in <positions>, and write the plot for <gene_name> to <plotdir>.

    Each entry in <positions> is a dict with a 'name' (simplified with simplify_state_name() for the label below
    the column) and a 'nuke_freqs' dict giving the frequency of each of A, C, G, and T. The most frequent
    nucleotide is at the bottom of the column.
    NOTE shares a lot with make_transition_plot() in bin/plot-hmms.py.
    """
    nuke_colors = {'A' : 'red', 'C' : 'blue', 'G' : 'orange', 'T' : 'green'}
    line_style = {'alpha' : 0.6, 'linewidth' : 3}  # shared by all the lines
    fig, ax = plotting.mpl_init()
    fig.set_size_inches(plotting.plot_ratios[utils.get_region(gene_name)])

    print(utils.color_gene(utils.unsanitize_name(gene_name)))
    colors_in_legend = set()
    for ibin, info in enumerate(positions):
        xlo, xhi = -0.5 + ibin, 0.5 + ibin

        # make label below bin
        ax.text(xlo, -0.075, simplify_state_name(info['name']), rotation='vertical', size=8)

        by_decreasing_freq = sorted(info['nuke_freqs'].items(), key=operator.itemgetter(1), reverse=True)
        bottom = 0.0
        for nuke, prob in by_decreasing_freq:
            color = nuke_colors[nuke]

            # each color only gets a legend entry the first time it's drawn
            if color in colors_in_legend:
                legend_label = None
            else:
                legend_label = nuke
                colors_in_legend.add(color)

            top = bottom + prob

            # horizontal line at the top of this nucleotide's chunk
            ax.plot([xlo, xhi], [top, top], color=color, label=legend_label, **line_style)

            # vertical line from (just above) the bottom of the chunk to its top
            ax.plot([ibin, ibin], [bottom + 0.01, top], color=color, **line_style)

            bottom = top

    ax.get_xaxis().set_visible(False)
    plotting.mpl_finish(ax, plotdir, gene_name, ybounds=(-0.01, 1.01), xbounds=(-3, len(positions) + 3), leg_loc=(0.95, 0.1), adjust={'left' : 0.1, 'right' : 0.8}, leg_prop={'size' : 8})
Exemple #10
0
def peruse_naive_seqs():
    from hist import Hist
    # hall = Hist(n_set_list[-1], n_set_list[0] - 0.5, n_set_list[-1] + 0.5)
    means = []
    for n_set in n_set_list:
        plotdir = baseplotdir + '/' + str(n_set)
        hist = Hist(fname=plotdir + '/hmm/hamming_to_true_naive.csv')
        print '%2d   %.2f' % (n_set, hist.get_mean()),
        # hall.set_ibin(hall.find_bin(n_set), hist.get_mean())
        means.append(hist.get_mean())

    import plotting
    fig, ax = plotting.mpl_init()
    # hall.mpl_plot(ax)
    ax.plot(n_set_list, means, marker='.')
    plotting.mpl_finish(ax,
                        baseplotdir,
                        'means',
                        xlabel='N simultaneous seqs',
                        ylabel='mean hamming to true naive',
                        ybounds=(0, None))
Exemple #11
0
    def make_single_size_vs_shm_plot(self,
                                     sorted_clusters,
                                     annotations,
                                     repertoire_size,
                                     base_plotdir,
                                     plotname,
                                     n_max_mutations=100,
                                     plot_high_mutation=False,
                                     title=None,
                                     debug=False):
        import plotting

        def gety(minval, maxval, xmax, x):
            slope = (maxval - minval) / xmax
            return slope * x + minval

        def getnmutelist(cluster):
            return annotations[':'.join(cluster)]['n_mutations']

        colors = ['#006600', '#3399ff', '#ffa500']
        # goldenrod '#daa520'
        # red '#cc0000',
        # dark red '#990012'
        # purple '#a821c7'
        # grey '#808080'

        dpi = 80
        xpixels = 450
        ypixels = max(400, 10 * len(sorted_clusters))
        fig, ax = plotting.mpl_init(figsize=(xpixels / dpi, ypixels / dpi))

        min_linewidth = 0.3
        max_linewidth = 12
        # min_alpha = 0.1
        # max_alpha = 1.
        # linewidth = 7
        alpha = 0.55

        ymin, ymax = 9999, 0
        iclust_global = 0
        yticks, yticklabels = [], []

        high_mutation_clusters = []
        biggest_n_mutations = None

        if debug:
            print '  %s   %d x %d' % (
                plotname, xpixels, ypixels
            )  #, utils.color('red', 'high mutation') if plot_high_mutation else '')
            print '      size   frac      yval    median   mean'

        for csize, cluster_group in itertools.groupby(sorted_clusters,
                                                      key=lambda c: len(c)):
            cluster_group = sorted(list(cluster_group),
                                   key=lambda c: numpy.median(getnmutelist(c)))
            n_clusters = len(cluster_group)
            repfracstr = self.get_repfracstr(csize, repertoire_size)
            for iclust in range(len(cluster_group)):
                cluster = cluster_group[iclust]
                nmutelist = sorted(getnmutelist(cluster))
                nmedian = numpy.median(nmutelist)
                nmean = numpy.mean(
                    nmutelist)  # maybe should use this instead of the median?
                if biggest_n_mutations is None or nmutelist[
                        -1] > biggest_n_mutations:
                    biggest_n_mutations = nmutelist[-1]

                yval = len(sorted_clusters) - iclust_global
                if yval < ymin:
                    ymin = yval
                if yval > ymax:
                    ymax = yval
                yticks.append(yval)
                yticklabels.append('%d' % csize)
                # yticklabels.append(repfracstr)

                base_color = colors[iclust_global % len(colors)]
                if self.args.queries_to_include is not None:
                    queries_to_include_in_this_cluster = set(cluster) & set(
                        self.args.queries_to_include)
                    if len(queries_to_include_in_this_cluster) > 0:
                        base_color = 'red'
                        if plot_high_mutation:
                            xtext = 1.1
                        elif float(nmedian) / n_max_mutations < 0.5:
                            xtext = 0.75
                        else:
                            xtext = 0.1
                        ax.text(xtext * n_max_mutations,
                                yval,
                                ' '.join(queries_to_include_in_this_cluster),
                                color='red',
                                fontsize=8)

                if debug:
                    print '     %5s  %-10s  %4.1f  %6.1f  %6.1f' % (
                        '%d' % csize if iclust == 0 else '', repfracstr
                        if iclust == 0 else '', yval, nmedian, nmean),

                if nmedian > n_max_mutations and not plot_high_mutation:
                    if debug:
                        print '%s' % utils.color('red', 'high mutation')
                    high_mutation_clusters.append(cluster)
                    continue

                if debug:
                    print ''

                nbins = nmutelist[-1] - nmutelist[0] + 1
                hist = Hist(nbins, nmutelist[0] - 0.5, nmutelist[-1] + 0.5)
                for nm in nmutelist:
                    hist.fill(nm)
                assert hist.overflow_contents() == 0.  # includes underflows
                xmax = max(hist.bin_contents)  # float(csize)
                for ibin in range(1, hist.n_bins + 1):
                    linewidth = gety(min_linewidth, max_linewidth, xmax,
                                     hist.bin_contents[ibin])
                    color = base_color
                    # alpha = gety(min_alpha, max_alpha, xmax, hist.bin_contents[ibin])
                    if hist.bin_contents[ibin] == 0.:
                        color = 'grey'
                        linewidth = min_linewidth
                        alpha = 0.4
                    ax.plot([hist.low_edges[ibin], hist.low_edges[ibin + 1]],
                            [yval, yval],
                            color=color,
                            linewidth=linewidth,
                            alpha=alpha,
                            solid_capstyle='butt')

                iclust_global += 1

        xbounds = [-0.2, n_max_mutations] if not plot_high_mutation else [
            n_max_mutations, biggest_n_mutations
        ]
        ybounds = [0.95 * ymin, 1.05 * ymax]
        n_ticks = 5
        if len(yticks) > n_ticks:
            yticks = [
                yticks[i] for i in range(0, len(yticks),
                                         int(len(yticks) / float(n_ticks - 1)))
            ]
            yticklabels = [
                yticklabels[i]
                for i in range(0, len(yticklabels),
                               int(len(yticklabels) / float(n_ticks - 1)))
            ]
        plotting.mpl_finish(ax,
                            base_plotdir + '/overall',
                            plotname,
                            xlabel='N mutations',
                            ylabel='clonal family size',
                            title=title,
                            xbounds=xbounds,
                            ybounds=ybounds,
                            yticks=yticks,
                            yticklabels=yticklabels,
                            adjust={'left': 0.18})

        return high_mutation_clusters
Exemple #12
0
 def fullplot(self, plotdir, plotname, **kwargs):  # i.e. full plotting process, not just the ax.plot type stuff above
     """ Plot this histogram on new axes, finish the plot with plotting.mpl_finish() (which gets <kwargs>), and write <plotdir>/<plotname>.csv. """
     import plotting
     _, axes = plotting.mpl_init()  # this'll need to be updated when i want to use a kwarg for this fcn
     self.mpl_plot(axes)
     plotting.mpl_finish(axes, plotdir, plotname, **kwargs)
     csv_path = '%s/%s.csv' % (plotdir, plotname)
     self.write(csv_path)
Exemple #13
0
def peruse_forward_scores():
    """ Plot how the forward log probs deviate from the baseline as the number of simultaneous sequences changes.

    Reads <outputdir>/<n_set>-forward.csv for each n_set in <n_set_list>, and keeps three values per reco id:
    the raw log prob, the log prob multiplied by 1/n_set, and the log prob multiplied by
    (1 - 0.24/n_set^0.9)/n_set. The deviations come from get_deviations(), which is given i_baseline = -1
    (presumably selecting the last entry of <n_set_list>, which is what the y axis label uses). Writes the
    'forwards', 'corrected-forwards', and 'signed-corrected-forwards' plots to <baseplotdir>.

    Raises an Exception if one csv file has the same reco id twice.
    """
    _, reco_info = seqfileopener.get_seqfile_info(
        simfname, is_data=False)  #, n_max_queries=10000)

    logprobs = OrderedDict()
    partialcorr_logprobs = OrderedDict()
    corr_logprobs = OrderedDict()
    for n_set in n_set_list:
        print(n_set)
        uncorrected = OrderedDict()  # raw values
        partly_corrected = OrderedDict()  # times 1/n_set
        corrected = OrderedDict()  # times the empirical factor
        logprobs[n_set] = uncorrected
        partialcorr_logprobs[n_set] = partly_corrected
        corr_logprobs[n_set] = corrected
        with open(outputdir + '/' + str(n_set) + '-forward.csv') as csvfile:
            for line in csv.DictReader(csvfile):
                uidlist = line['unique_ids'].split(':')
                assert utils.from_same_event(reco_info, uidlist)
                reco_id = reco_info[uidlist[0]]['reco_id']
                if reco_id in uncorrected:
                    raise Exception('already had %s' % reco_id)

                logprob = float(line['logprob'])  # parse it once, rather than once per dict
                uncorrected[reco_id] = logprob

                simple_factor = 1. / n_set
                partly_corrected[reco_id] = simple_factor * logprob

                # an alternative that was tried: 1. / (0.77547824*n_set + 0.20327936)
                empirical_factor = (1. - 0.24 / pow(float(n_set), 0.9)) / n_set
                corrected[reco_id] = empirical_factor * logprob

    i_baseline = -1
    deviations = get_deviations(logprobs, i_baseline)
    partialcorr_deviations = get_deviations(partialcorr_logprobs, i_baseline)
    signed_partialcorr_deviations = get_deviations(partialcorr_logprobs,
                                                   i_baseline,
                                                   signed=True)
    corr_deviations = get_deviations(corr_logprobs, i_baseline)
    signed_corr_deviations = get_deviations(corr_logprobs,
                                            i_baseline,
                                            signed=True)

    import plotting

    def make_plot(plotname, curves, zero_line):
        # draw each (y values, label) pair in <curves> against n_set_list, optionally add a line at zero, and write the plot
        _, ax = plotting.mpl_init()
        for yvals, label in curves:
            if label is None:
                ax.plot(n_set_list, yvals, marker='.')
            else:
                ax.plot(n_set_list, yvals, marker='.', label=label)
        if zero_line:
            ax.plot([n_set_list[0], n_set_list[-1]], [0, 0])
        plotting.mpl_finish(ax,
                            baseplotdir,
                            plotname,
                            xlabel='N simultaneous seqs',
                            ylabel='log prob deviation to ' +
                            str(n_set_list[i_baseline]))  #, ybounds=(-0.02, 0.02))

    make_plot('forwards', [(deviations, None)], zero_line=False)
    make_plot('corrected-forwards',
              [(partialcorr_deviations, '1/n (abs)'),
               (signed_partialcorr_deviations, '1/n'),
               (corr_deviations, '1/crap (abs)'),
               (signed_corr_deviations, '1/crap')],
              zero_line=True)
    make_plot('signed-corrected-forwards', [(signed_corr_deviations, None)], zero_line=True)
Exemple #14
0
# fraction of the entries in <chfo> that were counted (earlier on) as being above the chimeric cutoff
chimeric_fraction = n_above_cutoff / float(len(chfo))
print '  %d / %d = %.3f above chimeric cutoff' % (n_above_cutoff, len(chfo),
                                                  chimeric_fraction)

# histogram the 'max_abs_diff' and 'imax' values from <chfo>, for each uid in <annotations>
hmaxval = Hist(45, 0., 0.65)
for uid in annotations:
    hmaxval.fill(chfo[uid]['max_abs_diff'])
himax = Hist(75, 0., 400)
for uid in annotations:
    himax.fill(chfo[uid]['imax'])

# NOTE(review): presumably this sets up the plot dir and clears out old svg and csv files -- confirm in utils.prep_dir()
utils.prep_dir(args.plotdir, wildlings=['*.svg', '*.csv'])

import matplotlib
from matplotlib import pyplot as plt
fig, ax = plotting.mpl_init()
# scatter plot of 'max_abs_diff' vs 'imax' (NOTE this uses every entry in <chfo>, whereas the histograms above only use the uids in <annotations>)
xvals, yvals = zip(*[(v['imax'], v['max_abs_diff']) for v in chfo.values()])
plt.scatter(xvals, yvals, alpha=0.4)

print 'writing to %s' % args.plotdir
# NOTE the plot is called 'hexbin', but it's drawn with plt.scatter()
plotting.mpl_finish(ax,
                    args.plotdir,
                    'hexbin',
                    title=args.title,
                    xlabel='break point',
                    ylabel='abs mfreq diff')

plotting.draw_no_root(hmaxval,
                      plotdir=args.plotdir,
                      plotname='mfreq-diff',
                      shift_overflows=True,
    def plot(self, plotdir, only_csv=False):
        """Write this object's plots (and their csv files) to <plotdir>.

        Three groups of plots are made:
          - one per column in self.values: a right/wrong histogram for columns
            in bool_columns, a normalized histogram of counts for the others
            (the latter are skipped if self.only_correct_gene_fractions is set)
          - one per histogram in self.hists
          - for each region in utils.regions, the fraction of correct allele
            calls vs per-gene support (skipped if the region's "right"
            histogram is empty)

        Args:
            plotdir: output directory, prepared with utils.prep_dir() before
                anything is written.
            only_csv: passed on to plotting.draw_no_root(); if set, the html
                summary is not made either.
        """
        utils.prep_dir(plotdir, wildling=None, multilings=['*.csv', '*.svg', '*.root'])

        # Bound here rather than in the non-boolean branch below, since the loop over self.hists needs it
        # even when that branch never runs (e.g. if every column is boolean).
        log = ''
        for column in self.values:
            is_bool_column = column in bool_columns
            if self.only_correct_gene_fractions and not is_bool_column:
                continue
            if is_bool_column:
                right = self.values[column]['right']
                wrong = self.values[column]['wrong']
                errs = fraction_uncertainty.err(right, right + wrong)
                # single parenthesized argument, so this prints the same thing with or without print_function
                print('  %s\n    correct up to allele: %4d / %-4d = %4.4f (-%.3f, +%.3f)' % (column, right, right + wrong, float(right) / (right + wrong), errs[0], errs[1]))
                hist = plotting.make_bool_hist(right, wrong, self.name + '-' + column)
                plotting.draw_no_root(hist, plotname=column, plotdir=plotdir, write_csv=True, stats='0-bin', only_csv=only_csv)
            else:
                # TODO this is dumb... I should make the integer-valued ones histograms as well
                hist = plotting.make_hist_from_dict_of_counts(self.values[column], 'int', self.name + '-' + column, normalize=True)
                if 'hamming_to_true_naive' in column:  # TODO why doesn't this just use the config dicts in plotheaders or wherever?
                    hist.title = 'hamming distance'
                else:
                    hist.title = 'inferred - true'
                plotting.draw_no_root(hist, plotname=column, plotdir=plotdir, write_csv=True, log=log, only_csv=only_csv)

        for column in self.hists:
            plotting.draw_no_root(self.hists[column], plotname=column, plotdir=plotdir, write_csv=True, log=log, only_csv=only_csv)

        # fraction of correct allele calls as a function of per-gene support
        for region in utils.regions:
            right_hist = self.hists[region + '_allele_right_vs_per_gene_support']
            if right_hist.integral(include_overflows=True) == 0:
                continue
            wrong_hist = self.hists[region + '_allele_wrong_vs_per_gene_support']

            # Keep only the bins that have entries. New lists are built (instead of popping from the
            # histograms' own bin_contents) so the histograms are left intact, and the test is on the
            # bin total rather than on the fraction, so a bin in which every call was wrong stays in
            # the plot with a fraction of zero.
            filled_bins = [(x, r, w)
                           for x, r, w in zip(right_hist.get_bin_centers(), right_hist.bin_contents, wrong_hist.bin_contents)
                           if r + w > 0.]
            xvals = [x for x, _, _ in filled_bins]
            yvals = [float(r) / (r + w) for _, r, w in filled_bins]

            tmphilos = [fraction_uncertainty.err(r, r + w) for _, r, w in filled_bins]
            yerrs = [err[1] - err[0] for err in tmphilos]

            # NOTE a weighted straight-line fit to these points was tried, and wasn't particularly informative

            fig, ax = plotting.mpl_init()

            ax.errorbar(xvals, yvals, yerr=yerrs, markersize=10, linewidth=1, marker='.')
            ax.plot((0, 1), (0, 1), color='black', linestyle='--', linewidth=3)  # line with slope 1 and intercept 0

            plotting.mpl_finish(ax, plotdir, region + '_allele_fraction_correct_vs_per_gene_support', xlabel='support', ylabel='fraction correct', xbounds=(-0.1, 1.1), ybounds=(-0.1, 1.1))

        if not only_csv:
            plotting.make_html(plotdir)
Exemple #16
0
def make_mutefreq_plot(plotdir, gene_name, positions, debug=False):
    """Plot the nucleotide frequencies at each position of <gene_name>.

    NOTE shares a lot with make_transition_plot() in bin/plot-hmms.py.

    Each position gets one bin, in which the frequencies of its nucleotides
    are stacked on top of each other (germline nucleotide first, if there is
    one, then by decreasing frequency).

    Args:
        plotdir: directory passed to plotting.mpl_finish().
        gene_name: used as the plot name; its region (utils.get_region())
            selects the figure size from plotting.plot_ratios.
        positions: sequence of dicts, one per position, each with a 'name',
            a 'nuke_freqs' dict (nucleotide --> frequency, with keys among
            A, C, G and T), and optionally a 'gl_nuke'.
        debug: if set, print the gene name.
    """
    import plotting

    nuke_colors = {'A': 'red', 'C': 'blue', 'G': 'orange', 'T': 'green'}
    fig, ax = plotting.mpl_init()
    fig.set_size_inches(plotting.plot_ratios[utils.get_region(gene_name)])

    if debug:
        # single parenthesized argument, so this prints the same thing with or without print_function
        print('  %s' % utils.color_gene(utils.unsanitize_name(gene_name)))

    alpha = 0.6
    legend_colors = set()  # colors that already have a legend entry, so each one is only labeled once
    for ibin, info in enumerate(positions):
        # make label below bin for position and germline nuke
        ax.text(-0.5 + ibin,
                -0.075,
                simplify_state_name(info['name']),
                rotation='vertical',
                size=8)
        ax.text(-0.5 + ibin,
                -0.15,
                info.get('gl_nuke', '?'),
                fontsize=10,
                fontweight='bold')

        # most frequent first (sorting the keys directly also copes with an empty dict, which unzipping the sorted items doesn't)
        nuke_freqs = info['nuke_freqs']
        sorted_nukes = sorted(nuke_freqs, key=nuke_freqs.get, reverse=True)
        if 'gl_nuke' in info and info['gl_nuke'] in nuke_freqs:  # put the germline nuke first if we have it (second clause is for states with germline N)
            sorted_nukes = [info['gl_nuke']] + [n for n in sorted_nukes if n != info['gl_nuke']]

        total = 0.0  # summed frequency of the nukes that have been drawn so far in this bin
        for nuke in sorted_nukes:
            prob = nuke_freqs[nuke]
            color = nuke_colors[nuke]

            label_to_use = None
            if color not in legend_colors:
                label_to_use = nuke
                legend_colors.add(color)

            # horizontal line at height total+prob
            ax.plot([-0.5 + ibin, 0.5 + ibin], [total + prob, total + prob],
                    color=color,
                    alpha=alpha,
                    linewidth=3,
                    label=label_to_use)

            # vertical line from total to total + prob
            ax.plot([ibin, ibin], [total + 0.01, total + prob],
                    color=color,
                    alpha=alpha,
                    linewidth=3)

            total += prob

    ax.get_xaxis().set_visible(False)
    plotting.mpl_finish(ax,
                        plotdir,
                        gene_name,
                        ybounds=(-0.01, 1.01),
                        xbounds=(-3, len(positions) + 3),
                        leg_loc=(0.95, 0.1),
                        adjust={
                            'left': 0.1,
                            'right': 0.8
                        },
                        leg_prop={'size': 8})
Exemple #17
0
    def make_single_hexbin_size_vs_shm_plot(self,
                                            sorted_clusters,
                                            annotations,
                                            repertoire_size,
                                            base_plotdir,
                                            plotname,
                                            n_max_mutations=100,
                                            log_cluster_size=False,
                                            debug=False):
        """Make a hexbin plot of clonal family size vs mean number of mutations.

        Only clusters whose mean number of mutations is below <n_max_mutations>
        go into the hexbin; if there are none, a message is printed and
        nothing is plotted. Clusters that contain any of
        self.args.queries_to_include are marked in red and labeled with those
        queries.

        Args:
            sorted_clusters: list of clusters, each a list of uids.
            annotations: dict from ':'.join(cluster) to that cluster's
                annotation, from which the 'n_mutations' list is read.
            repertoire_size: not used in this method.
            base_plotdir: the plot is written to <base_plotdir>/overall.
            plotname: name of the plot ('-log' is appended if
                <log_cluster_size> is set).
            n_max_mutations: upper limit on a cluster's mean number of
                mutations; also the hexbin grid size and the upper x bound.
            log_cluster_size: if set, plot the log of the cluster size.
            debug: not used in this method.
        """
        def mean_n_mutations(cluster):
            return numpy.mean(annotations[':'.join(cluster)]['n_mutations'])

        # work out each cluster's mean once, then drop the clusters that are over the limit
        points = [(mean_n_mutations(cluster), len(cluster)) for cluster in sorted_clusters]
        points = [(xval, yval) for xval, yval in points if xval < n_max_mutations]
        if len(points) == 0:  # there'd be nothing to unzip (or plot) below
            print('  no clusters with mean N mutations less than %s, so not making %s' % (n_max_mutations, plotname))
            return

        # imported only once we know there's something to plot
        import plotting
        import matplotlib.pyplot as plt

        xvals, yvals = zip(*points)
        if log_cluster_size:
            yvals = [math.log(yv) for yv in yvals]

        fig, ax = plotting.mpl_init()
        ax.hexbin(xvals,
                  yvals,
                  gridsize=n_max_mutations,
                  cmap=plt.cm.Blues,
                  bins='log')

        # evenly spaced ticks between the first and last plotted clusters (presumably <sorted_clusters> is ordered by size, in which case these are the extremes -- TODO confirm)
        nticks = 5
        yticks = [
            yvals[0] + itick * (yvals[-1] - yvals[0]) / float(nticks - 1)
            for itick in range(nticks)
        ]
        if log_cluster_size:
            # label the ticks with actual (rather than log) sizes
            yticklabels = [math.exp(yt) for yt in yticks]
            yticklabels = [('%.0f' % yt) if yt > 5 else ('%.1f' % yt)
                           for yt in yticklabels]
        else:
            yticklabels = [int(yt) for yt in yticks]

        if self.args.queries_to_include is not None:
            queries_to_include = set(self.args.queries_to_include)  # only needs to be built once
            for cluster in sorted_clusters:
                queries_to_include_in_this_cluster = set(cluster) & queries_to_include
                if len(queries_to_include_in_this_cluster) == 0:
                    continue
                xval = mean_n_mutations(cluster)
                yval = len(cluster)
                if log_cluster_size:
                    yval = math.log(yval)
                ax.plot([xval], [yval], color='red', marker='.', markersize=10)
                ax.text(xval,
                        yval,
                        ' '.join(sorted(queries_to_include_in_this_cluster)),  # sorted so the label doesn't depend on set ordering
                        color='red',
                        fontsize=8)

        ylabel = 'clonal family size'
        if log_cluster_size:
            ylabel += ' (log)'
            plotname += '-log'
        plotting.mpl_finish(ax,
                            base_plotdir + '/overall',
                            plotname,
                            xlabel='mean N mutations',
                            ylabel=ylabel,
                            xbounds=[0, n_max_mutations],
                            yticks=yticks,
                            yticklabels=yticklabels)