def get_mutation_matrix(data):
    '''
    Calculate the mutation rate matrix from accumulation of
    intra patient diversity via linear regression. Uncertainty
    of the estimates is assessed via boot strapping over patients.

    Parameters:
        data: table providing at least the columns 'af', 'time_binc' and
              'mut' (selected with .loc and grouped by 'mut'/'time_binc').

    Returns:
        (mu, dmulog10): `mu` is a pandas Series of fitted rates keyed by
        mutation; `dmulog10` is a Series on the same keys holding the
        standard deviation of log10(rate) over the bootstrap replicates.
    '''
    def get_mu(data):
        # Mean 'af' for every (mutation, time bin) pair, reshaped so that
        # each row is one mutation and each column one 'time_binc' value.
        d = (data
             .loc[:, ['af', 'time_binc', 'mut']]
             .groupby(['mut', 'time_binc'])
             .mean()
             .unstack('time_binc')
             .loc[:, 'af'])

        rates = {}
        for mut, aft in d.iterrows():
            times = np.array(aft.index)
            aft = np.array(aft)
            # Least-squares slope of a line through the origin
            # (af = rate * t): sum(af * t) / sum(t * t).
            # NOTE(review): a missing (mut, time bin) combination shows up
            # as NaN after unstack and makes the rate NaN -- confirm that
            # this is intended.
            rates[mut] = np.inner(aft, times) / np.inner(times, times)

        mu = pd.Series(rates)
        mu.name = 'mutation rate from longitudinal data'

        return mu

    mu = get_mu(data)

    # Bootstrap
    # The copy only provides a Series with the right index (and name);
    # every value is overwritten below.
    dmulog10 = mu.copy()
    # presumably a sequence of get_mu() results, one per resampled patient
    # set -- it is iterated and indexed by mutation below.
    muBS = boot_strap_patients(data, get_mu, n_bootstrap=100)
    # Walk the index labels directly; Series.iteritems() no longer exists
    # in pandas >= 2.0 and the values were never used anyway.
    for key in dmulog10.index:
        dmulog10[key] = np.std([np.log10(tmp[key]) for tmp in muBS])

    return mu, dmulog10
def get_mutation_matrix(data):
    '''
    Calculate the mutation rate matrix from accumulation of
    intra patient diversity via linear regression. Uncertainty
    of the estimates is assessed via boot strapping over patients.

    Parameters:
        data: table providing at least the columns 'af', 'time_binc' and
              'mut' (selected with .loc and grouped by 'mut'/'time_binc').

    Returns:
        (mu, dmulog10): `mu` is a pandas Series of fitted rates keyed by
        mutation; `dmulog10` is a Series on the same keys holding the
        standard deviation of log10(rate) over the bootstrap replicates.
    '''
    def get_mu(data):
        # One row per mutation, one column per 'time_binc' value, each cell
        # the mean 'af' of that group.
        d = (data.loc[:, ['af', 'time_binc', 'mut']].groupby(
            ['mut', 'time_binc']).mean().unstack('time_binc').loc[:, 'af'])

        rates = {}
        for mut, aft in d.iterrows():
            times = np.array(aft.index)
            aft = np.array(aft)
            # Slope of the least-squares line forced through the origin:
            # sum(af * t) / sum(t * t).
            # NOTE(review): NaN cells (absent mutation/time-bin pairs)
            # propagate into the rate -- confirm this is intended.
            rates[mut] = np.inner(aft, times) / np.inner(times, times)

        mu = pd.Series(rates)
        mu.name = 'mutation rate from longitudinal data'

        return mu

    mu = get_mu(data)

    # Bootstrap
    # Copy of mu used as a container with matching index; all values are
    # replaced in the loop below.
    dmulog10 = mu.copy()
    # presumably one get_mu() result per bootstrap replicate; each is
    # indexed by mutation below.
    muBS = boot_strap_patients(data, get_mu, n_bootstrap=100)
    # Series.iteritems() was removed in pandas 2.0; only the labels are
    # needed, so iterate the index.
    for key in dmulog10.index:
        dmulog10[key] = np.std([np.log10(tmp[key]) for tmp in muBS])

    return mu, dmulog10
Example #3
0
    def bootstrap():
        '''Store the bootstrap spread of the fitted 's' values in s['ds'].

        Reads `s`, `data` and `mu` from the enclosing scope and writes the
        result back into `s`; nothing is returned.
        '''
        def prepare_and_fit(data):
            # Average the (resampled) data, fit it, and keep the 's' entry
            # of the second element of fit_data's result.
            data_to_fit = average_data(data)
            return fit_data(data_to_fit, mu=mu)[1]['s']

        # Copy of the point estimates; used as a container with the right
        # keys, every value is overwritten below.
        ds = s['s'].copy()
        # presumably one prepare_and_fit() result per replicate -- each is
        # indexed by key below.
        sBS = boot_strap_patients(data, prepare_and_fit, n_bootstrap=100)
        # keys() instead of iteritems(): the latter was removed from pandas
        # Series in pandas 2.0 (and from dict in Python 3), and the values
        # were not used.
        for key in ds.keys():
            ds[key] = np.std([tmp[key] for tmp in sBS])
        s['ds'] = ds
Example #4
0
def plot_to_away(data, fig_filename = None, figtypes=['.png', '.svg', '.pdf']):
    '''Plot divergence versus variability and print reversion statistics.

    Args:
        data (dict): indexed as data[subtype][...] for subtype in
            ('patient', 'any'); the entries 'minor_variants', 'to_away'
            and 'consensus_distance' are read.
        fig_filename (str or None): if given, the figure is saved as
            fig_filename + '_sfs' + ext for every ext in figtypes;
            otherwise the figure is shown.
        figtypes (list of str): file extensions used when saving.

    NOTE(review): Python 2 code (print statements, dict.values() passed to
    np.mean).
    '''
    ####### plotting ###########
    import seaborn as sns
    from matplotlib import pyplot as plt
    plt.ion()
    sns.set_style('darkgrid')
    figpath = 'figures/'
    # fig_fontsize and fig_width are not defined in this function
    fs=fig_fontsize
    fig_size = (fig_width, 0.8*fig_width)
    fig, axs = plt.subplots(1, 1, figsize=fig_size)

    ax=axs
    # bin edges and bin centers; the centers are the x positions of the curves
    Sbins = np.array([0,0.02, 0.08, 0.25, 2])
    Sbinc = 0.5*(Sbins[1:]+Sbins[:-1])
    def get_Sbin_mean(df):
        # mean of every column within each 'S_bin' group
        return df.groupby(by=['S_bin'], as_index=False).mean()
    for lblstr, subtype in [('subtype', 'patient'), ('group M', 'any')]:
        mv = data[subtype]['minor_variants']
        # subset to a specific time interval
        mv = mv.loc[(mv.loc[:,'time']>1500)&(mv.loc[:,'time']<2500),:]
        print "average time:", mv.loc[:,'time'].mean()/365
        # cast the frequency columns to float before averaging
        mv.loc[:,['af_away_minor', 'af_away_derived', 'af_to_minor', 'af_to_derived']] = \
            mv.loc[:,['af_away_minor', 'af_away_derived', 'af_to_minor', 'af_to_derived']].astype(float)
        mean_to_away =get_Sbin_mean(mv)
        # presumably one get_Sbin_mean() result per resampled patient set -- confirm in util
        bs = boot_strap_patients(mv, eval_func=get_Sbin_mean, 
                             columns=['af_away_minor', 'af_away_derived', 'af_to_minor', 'af_to_derived', 'S_bin'])

        print mean_to_away
        # the third errorbar argument is np.std applied to the replicates via replicate_func
        col = 'af_away_derived'
        ax.errorbar(Sbinc, mean_to_away.loc[:,col], 
                    replicate_func(bs, col, np.std, bin_index='S_bin'), 
                    lw = 3, label = 'founder = '+lblstr)
        col = 'af_to_derived'
        ax.errorbar(Sbinc, mean_to_away.loc[:,col], 
                    replicate_func(bs, col, np.std, bin_index='S_bin'),
                    lw = 3, label = u'founder \u2260 '+lblstr)
    ax.set_yscale('log')
    ax.set_xscale('log')
    ax.set_ylabel('Divergence from founder sequence', fontsize = fig_fontsize)
    ax.set_xlabel('Variability [bits]', fontsize = fig_fontsize)
    for item in ax.get_yticklabels()+ax.get_xticklabels():
        item.set_fontsize(fs)
    ax.set_xlim([0.005,2])
    ax.legend(loc = 'lower right', fontsize = fig_fontsize)
    plt.tight_layout(rect=(0.0, 0.02, 0.98, 0.98), pad=0.05, h_pad=0.5, w_pad=0.4)
    # save one file per requested extension, or show the figure if no name was given
    if fig_filename is not None:
        for ext in figtypes:
            fig.savefig(fig_filename+'_sfs'+ext)
    else:
        plt.ion()
        plt.show()



    # ---- reversion statistics: printed only, nothing below is plotted ----
    def get_time_bin_means(df):
        # mean 'divergence' and 'reversion' within each 'time_bin' group
        return df.loc[:,['divergence', 'reversion','time_bin']].groupby(by=['time_bin'], as_index=False).mean()
    for subtype in ['patient', 'any']:
        to_away = data[subtype]['to_away']
        time_bins = np.array([0,500,1000,1500, 2500, 3500])
        binc = 0.5*(time_bins[1:]+time_bins[:-1])
        # presumably adds the 'time_bin' column used below to to_away in place -- confirm
        add_binned_column(to_away, time_bins, 'time')
        to_away.loc[:,['reversion', 'divergence']] = \
                to_away.loc[:,['reversion', 'divergence']].astype(float)
        rev_div = get_time_bin_means(to_away)
        bs = boot_strap_patients(to_away, get_time_bin_means, columns = ['reversion','divergence','time_bin'])
        reversion_std = replicate_func(bs, 'reversion', np.std, bin_index='time_bin')
        total_div_std = replicate_func(bs, 'divergence', np.std, bin_index='time_bin')
        # fraction of the divergence in each time bin that is reversion
        fraction = rev_div.loc[:,'reversion']/rev_div.loc[:,'divergence']
        print "Comparison:", subtype
        print "Reversions:\n", rev_div.loc[:,'reversion']
        print "Divergence:\n", rev_div.loc[:,'divergence']
        print "Fraction:"
        for frac, total, num_std, denom_std in zip(fraction, rev_div.loc[:,'divergence'],reversion_std, total_div_std):
            # error of the ratio: the two bootstrap standard deviations combined in quadrature
            print frac, '+/-', np.sqrt(num_std**2/total**2 + denom_std**2*frac**2/total**2)
        #print reversion_std,total_div_std
        print "Consensus!=Founder:",np.mean(data[subtype]['consensus_distance'].values())
               .groupby(['syn', 'protein_secondary_structure'] + additional)
               .count()
               ['af'])
        dav['std'] = (data
               .loc[:, ['syn', 'protein_secondary_structure', 'af'] + additional]
               .groupby(['syn', 'protein_secondary_structure'] + additional)
               .std()
               ['af'])
        dav['sem'] = dav['std'] / dav['#']
        return dav

    bt = make_binary_table(data)


    from util import boot_strap_patients
    reps = pd.concat(boot_strap_patients(data, average_data, n_bootstrap=10),
                     axis=1)
    reps.columns = np.arange(reps.shape[1]) + 1

    dav = pd.concat([reps.mean(axis=1), reps.std(axis=1)], axis=1)
    dav.columns = ['mean', 'std']

    def plot_average_frequencies(dav):
        fig, ax = plt.subplots()
        fs = 16
        colors = {'B': 'darkorange', 'H': 'steelblue',
                  'T': 'seagreen', 'X': 'black', '-': 'grey'}
        lss = {True: '--', False: '-'}
        d = {True: 'syn', False: 'nonsyn'}
        labs = {'B': 'sheet', 'T': 'turn',
                'H': 'helix', 'X': 'unstructured'}
Example #6
0
def plot_to_away(data, fig_filename=None, figtypes=['.png', '.svg', '.pdf'],
                 sequence_type='nuc'):
    '''Makes a two panel figure summarizing the results on reversion

    Args:
        data (dict): data to be plotted (see below)
    '''

    import seaborn as sns
    from matplotlib import pyplot as plt

    plt.ion()
    sns.set_style('darkgrid')
    figpath = 'figures/'
    fs=fig_fontsize
    fig_size = (1.0*fig_width, 0.6*fig_width)
    fig, axs = plt.subplots(1, 2, figsize=fig_size)
    nbs=100 # number of bootstrap replicates

    # set the colors for the plots, both panels use the same color scheme
    cols = HIVEVO_colormap()
    colors = [cols(x) for x in [0.0, 0.33, 0.66, 0.99]]

    ####################################################################################
    # make panel divergence vs entropy
    ####################################################################################
    ax=axs[1]
    if sequence_type == 'nuc':
        Sbins = np.array([0, 0.02, 0.08, 0.25, 2])
    else:
        Sbins = np.array([0, 0.1, 0.3, 3])

    Sbinc = 0.5*(Sbins[1:]+Sbins[:-1])
    def get_Sbin_mean(df): # regroup and calculate mean in entropy bins
        return df.groupby(by=['S_bin'], as_index=False).mean()
    color_count = 0
    for lblstr, subtype, ls in [('subtype', 'patient', '--'), ('group M', 'any', '-')]:
        mv = data[subtype]['minor_variants']
        # subset to a specific time interval
        mv = mv.loc[(mv.loc[:,'time'] > 1500)&(mv.loc[:,'time'] < 2500),:]
        print "average time:", mv.loc[:,'time'].mean() / 365.25
        mv.loc[:,['af_away_minor', 'af_away_derived', 'af_to_minor', 'af_to_derived']] = \
            mv.loc[:,['af_away_minor', 'af_away_derived', 'af_to_minor', 'af_to_derived']].astype(float)
        mean_to_away =get_Sbin_mean(mv)
        bs = boot_strap_patients(mv,
                                 eval_func=get_Sbin_mean,
                                 n_bootstrap=nbs, 
                                 columns=['af_away_minor',
                                          'af_away_derived',
                                          'af_to_minor',
                                          'af_to_derived',
                                          'S_bin'])

        print mean_to_away
        col = 'af_away_derived'
        ax.errorbar(Sbinc,
                    mean_to_away.loc[:,col], 
                    replicate_func(bs, col, np.std, bin_index='S_bin'),
                    ls=ls, 
                    lw=3,
                    label='founder = '+lblstr,
                    c=colors[color_count])
        color_count+=1
        col = 'af_to_derived'
        ax.errorbar(Sbinc, mean_to_away.loc[:,col], 
                    replicate_func(bs, col, np.std, bin_index='S_bin'), ls=ls,
                    lw = 3, label = u'founder \u2260 '+lblstr, c=colors[color_count])
        color_count+=1
    ax.set_yscale('log')
    ax.set_xscale('log')
    ax.set_ylabel('Divergence from founder', fontsize = fig_fontsize)
    ax.set_xlabel('Variability [bits]', fontsize = fig_fontsize)
    add_panel_label(ax, 'B', x_offset=-0.32)
    for item in ax.get_yticklabels()+ax.get_xticklabels():
        item.set_fontsize(fs-2)
    ax.set_xlim([0.005, 2])

    ####################################################################################
    # print reversion statistics
    ####################################################################################
    def get_time_bin_means(df): # get mean of divergence, reversion divergence and time for each time bin
        return df.loc[:,['divergence', 'reversion','time_bin']].groupby(by=['time_bin'], as_index=False).mean()
    for subtype in ['patient', 'any']:
        to_away = data[subtype]['to_away']
        time_bins = np.array([0, 500, 1000, 1500, 2500, 3500])
        binc = 0.5*(time_bins[1:]+time_bins[:-1])
        add_binned_column(to_away, time_bins, 'time')
        to_away.loc[:,['reversion', 'divergence']] = \
                to_away.loc[:,['reversion', 'divergence']].astype(float)
        rev_div = get_time_bin_means(to_away)
        bs = boot_strap_patients(to_away, get_time_bin_means,  n_bootstrap = nbs, 
                                 columns = ['reversion','divergence','time_bin'])
        reversion_std = replicate_func(bs, 'reversion', np.std, bin_index='time_bin')
        total_div_std = replicate_func(bs, 'divergence', np.std, bin_index='time_bin')
        fraction = rev_div.loc[:,'reversion']/rev_div.loc[:,'divergence']
        print "Comparison:", subtype
        print "Reversions:\n", rev_div.loc[:,'reversion']
        print "Divergence:\n", rev_div.loc[:,'divergence']
        # print the fraction of divergence that is due to reversion at different times
        # gives errors as standard deviations over patient bootstraps
        print "Fraction:"
        for frac, total, num_std, denom_std in zip(fraction, rev_div.loc[:,'divergence'],reversion_std, total_div_std):
            print frac, '+/-', np.sqrt(num_std**2/total**2 + denom_std**2*frac**2/total**2)

        print "Consensus!=Founder:",np.mean(data[subtype]['consensus_distance'].values())

    ####################################################################################
    # make panel divergence vs time
    ####################################################################################
    to_histogram=data['to_histogram']
    away_histogram=data['away_histogram']
    time_bins=data['time_bins']
    af_bins=data['af_bins']
    af_binc=0.5*(af_bins[1:]+af_bins[:-1])

    def bin_time(freq_arrays, time_bins):  
        '''sum up allele frequency histgrams corresponding to the same time bin'''
        binned_hists = [np.zeros_like(af_binc) for ti in time_bins[1:]]
        for hists in freq_arrays.values():
            for t, y in hists.iteritems():
                ti = np.searchsorted(time_bins, t)
                if ti>0 and ti<len(time_bins):
                    binned_hists[ti-1]+=y

        return binned_hists

    def get_div(afhist, fixed=False):
        '''return the fraction of fixed alleles or the mean divergence'''
        if fixed:
            return afhist[0]/afhist.sum()
        else:
            return np.array(afhist[:-1]*(1-af_binc[:-1])).sum()/afhist.sum()

    from random import choice
    ax = axs[0]
    time_binc = 0.5*(time_bins[1:]+time_bins[:-1])
    sym='o'
    fs = fig_fontsize
    color_count=0
    for subtype, ls in [('patient', '--'), ('any','-')]:
        for toaway, H in [(u'founder = '+('group M' if subtype=='any' else 'subtype'),  away_histogram[subtype]), 
                          (u'founder \u2260 '+('group M' if subtype=='any' else 'subtype'), to_histogram[subtype])]:
            mean_hists = bin_time(H,time_bins)
            div = [get_div(mean_hists[ti]) for ti in range(len(time_bins)-1)]
            # make replicates and calculate bootstrap confidence intervals
            replicates = []
            all_keys = H.keys()
            for ri in xrange(nbs):
                bootstrap_keys = [all_keys[ii] for ii in np.random.randint(len(all_keys), size=len(all_keys))]
                tmp = bin_time({key:H[key] for key in bootstrap_keys}, time_bins)
                replicates.append([get_div(tmp[ti]) for ti in range(len(time_bins)-1)])
            std_dev = np.array(replicates).std(axis=0)
            ax.errorbar(time_binc/365.25, div, std_dev, ls = ls, lw=3, c=colors[color_count])
            ax.plot(time_binc/365.25, div, label = toaway, ls = ls, lw=3, c=colors[color_count]) # plot again with label to avoid error bars in legend
            color_count+=1

    if sequence_type == 'nuc':
        ax.set_ylim([0,0.16])
        ax.set_yticks([0, 0.04, 0.08, 0.12])
    else:
        ax.set_ylim([0,0.32])
        ax.set_yticks([0, 0.08, 0.16, 0.24])

    ax.set_xlabel('ETI [years]', fontsize=fs)
    ax.set_ylabel('Divergence from founder', fontsize=fs)
    ax.legend(loc=2, fontsize=fs-2, labelspacing=0)
    add_panel_label(ax, 'A', x_offset=-0.32)
    ax.tick_params(axis='both', labelsize=fs-2)
    plt.tight_layout(pad=0.3, h_pad=0.5) #rect=(0.0, 0.02, 0.98, 0.98), pad=0.05, h_pad=0.5, w_pad=0.4)
    for ext in figtypes:
        fig.savefig(fig_filename+ext)
Example #7
0
def plot_divdiv(data, fig_filename=None, figtypes=['.png', '.svg', '.pdf']):
    '''
    plot divergence and diversity of synonymous and nonsynonymous mutations
    includes:
        - panels A/B: divergence and diversity over time, per region and
          mutation class, with bootstrap error bars
        - panel C: a panel that compares syn diversity/nonsyn divergence
        - panel D: frequency spectra of syn and nonsyn SNPs

    Args:
        data (dict): the entries 'divdiv', 'divdiv_corr' and 'sfs' are read;
            data['sfs'] is indexed with 'bins', 'syn' and 'nonsyn'.
        fig_filename (str or None): file name stem. The plotted numbers are
            written to fig_filename + '_AB.tsv' / '_C.tsv' / '_D.tsv' and
            the figure to fig_filename + ext for every ext in figtypes.
            With None nothing is written and the figure is shown instead.
        figtypes (list of str): file extensions to save.
    '''
    n_bootstrap = 50
    ####### plotting ###########
    import seaborn as sns
    from matplotlib import pyplot as plt
    plt.ion()
    sns.set_style('darkgrid')
    fs = fig_fontsize
    fig_size = (fig_width, 1.0 * fig_width)
    cols = HIVEVO_colormap()

    def write_tsv(suffix, rows):
        # Dump rows as tab-separated lines. Skipped when no file name stem
        # was given (None + suffix would raise TypeError); the with-block
        # closes the file even if a write fails.
        if fig_filename is None:
            return
        with open(fig_filename + suffix, 'w') as tsv_out:
            for row in rows:
                tsv_out.write('\t'.join(map(str, row)) + '\n')

    fig, axs = plt.subplots(2, 2, figsize=fig_size)
    divdiv = data['divdiv']
    # in rough the order in which they most dominantly appear in the plot
    regions = ['envelope', 'accessory', 'structural', 'enzymes']
    time_bins = np.array([0, 200, 500, 1000, 1500, 2000, 4000])
    time_binc = 0.5 * (time_bins[:-1] + time_bins[1:])
    # presumably adds the 'time_bin' column used below to divdiv in place -- confirm
    add_binned_column(divdiv, time_bins, 'time')

    # map the regions to rough genomic order to match the genome color map in panel C
    region_colors = {
        reg: c
        for reg, c in zip(regions, [cols(x) for x in [0.66, 0.99, 0.01, 0.33]])
    }

    def get_time_bin_mean(df):
        # mean diversity and divergence within each 'time_bin' group
        return df.loc[:, ['time_bin', 'diversity', 'divergence']].groupby(
            by=['time_bin'], as_index=False).mean()

    def label_func(mutclass, region, divordiv):
        # assign labels to Panels A and B separately to make a combinatorial
        # legend (regions vs syn/nonsyn)
        if divordiv == 'divergence' and mutclass == 'nonsyn':
            return region
        elif divordiv == 'diversity' and region == 'accessory':
            return mutclass
        else:
            return None

    ########## panel A and B #####################
    rows_ab = []
    for ax, dtype in zip(axs[0, :], ['divergence', 'diversity']):
        add_panel_label(ax,
                        'A' if dtype == 'divergence' else 'B',
                        x_offset=-0.3)
        for mutclass in ['nonsyn', 'syn']:
            line_style = '-' if mutclass == 'nonsyn' else '--'
            for region in regions:
                ind = ((divdiv.loc[:, 'region'] == region)
                       & (divdiv.loc[:, 'mutclass'] == mutclass))
                tmp = divdiv.loc[
                    ind, ['time_bin', 'diversity', 'divergence', 'pcode']]
                avg_divdiv = get_time_bin_mean(tmp)
                # presumably one get_time_bin_mean() result per resampled patient set
                bs = boot_strap_patients(tmp,
                                         eval_func=get_time_bin_mean,
                                         n_bootstrap=n_bootstrap)
                # plot the same line with and without error bars, labels for legend without
                ax.plot(time_binc / 365.25,
                        avg_divdiv.loc[:, dtype],
                        ls=line_style,
                        c=region_colors[region],
                        lw=3,
                        label=label_func(mutclass, region, dtype))
                ax.errorbar(time_binc / 365.25,
                            avg_divdiv.loc[:, dtype],
                            replicate_func(bs,
                                           dtype,
                                           np.std,
                                           bin_index='time_bin'),
                            ls=line_style,
                            c=region_colors[region],
                            lw=3)
                rows_ab.append([dtype, mutclass, region] +
                               list(avg_divdiv.loc[:, dtype]))

        ax.legend(loc=2, fontsize=fs - 1, numpoints=2, labelspacing=0)
        ax.set_xticks([0, 2, 4, 6, 8])
        if dtype == 'divergence':
            ax.set_yticks([0, .02, .04])
            ax.set_ylim([0, .048])
        else:
            ax.set_yticks([0, .01, .02])
            ax.set_ylim([0, .028])
        ax.set_xlim([0, 8.5])
        ax.set_ylabel(dtype)
        ax.tick_params(labelsize=fs - 2)
        ax.set_xlabel('Years since EDI', fontsize=fs)
    write_tsv('_AB.tsv', rows_ab)

    ########## panel C: anti correlation of syn diversity and nonsyn divergence #############
    (avg_nonsyn_divg, avg_nonsyn_divs, avg_syn_divs) = data['divdiv_corr']
    ax = axs[1, 0]
    add_panel_label(ax, 'C', x_offset=-0.3)
    # only every 500th entry is plotted and exported
    x_data, y_data = avg_nonsyn_divg[::500], avg_syn_divs[::500]
    ax.scatter(x_data,
               y_data,
               c=[cols(p) for p in np.linspace(0, 1, len(x_data))],
               s=50)
    write_tsv('_C.tsv', [["nonsyn_divergence"] + list(x_data),
                         ["syn_diversity"] + list(y_data)])

    ax.set_xlabel('nonsyn divergence', fontsize=fig_fontsize)
    ax.set_ylabel('syn diversity', fontsize=fig_fontsize)
    ax.set_ylim([0, 0.028])
    ax.set_xlim([0, 0.012])
    ax.set_xticks([0, 0.005, 0.01])
    ax.set_yticks([0, 0.01, 0.02])
    ax.tick_params(labelsize=fig_fontsize - 2)

    ########## sfs in panel D ##############
    sfs = data['sfs']
    ax = axs[1, 1]
    add_panel_label(ax, 'D', x_offset=-0.3)
    sfs_colors = sns.color_palette(n_colors=2)
    binc = 0.5 * (sfs['bins'][1:] + sfs['bins'][:-1])
    # normalise each spectrum to sum to one
    syn_frac = sfs['syn'] / np.sum(sfs['syn'])
    nonsyn_frac = sfs['nonsyn'] / np.sum(sfs['nonsyn'])
    # the syn bars are shifted left so both classes sit side by side
    ax.bar(binc - 0.045,
           syn_frac,
           width=0.04,
           label='syn',
           color=sfs_colors[0])
    ax.bar(binc,
           nonsyn_frac,
           width=0.04,
           label='nonsyn',
           color=sfs_colors[1])
    write_tsv('_D.tsv', [["bin_centers"] + list(binc),
                         ["sfs_nonsyn"] + list(nonsyn_frac),
                         ["sfs_syn"] + list(syn_frac)])
    ax.set_ylim([0.005, 2.0])
    ax.set_yscale('log')
    ax.set_xlabel('Frequency', fontsize=fs)
    ax.set_ylabel('Fractions of SNPs', fontsize=fs)
    ax.legend(loc=1, fontsize=fs - 2)
    ax.tick_params(labelsize=fig_fontsize - 2)

    # finalize and save the figure
    plt.tight_layout(rect=(0.0, 0.02, 0.98, 0.98),
                     pad=0.05,
                     h_pad=0.5,
                     w_pad=0.4)
    if fig_filename is not None:
        for ext in figtypes:
            fig.savefig(fig_filename + ext)
    else:
        plt.ion()
        plt.show()
Example #8
0
                                          muNS=muNS,
                                          nu_sweep_norm=nu_sweep_norm)

    sys.exit()

    if True:
        def fit_fitness_cost_for_bootstrap(data):
            '''Prepare one data set, fit it and return the 's' entry of the fit.

            mu, muNS and nu_sweep_norm are taken from the enclosing scope.
            '''
            prepared = prepare_data_for_fit(data, plot=False)
            fit_result = fit_fitness_cost_interpmu(prepared,
                                                   mu=mu,
                                                   muNS=muNS,
                                                   nu_sweep_norm=nu_sweep_norm)
            return fit_result['s']

        ds = s['s'].copy()
        sBS = boot_strap_patients(data, fit_fitness_cost_for_bootstrap, n_bootstrap=100)
        for key, _ in ds.iteritems():
            ds[key] = np.std([tmp[key] for tmp in sBS])
        s.rename(columns={'ds': 'ds_fit'}, inplace=True)
        s['ds_bootstrap'] = ds
        s.sort_index(axis=1, ascending=False, inplace=True)

    fn_s = 'data/fitness_cost_result.pickle'
    s.to_pickle(fn_s)

    plot_fitness_cost_allmuts(sMu)

    for mut in ['A->G', 'G->A', 'C->T', 'T->C']:
        plot_fitness_cost(data_to_fit,
                          sMu.loc['s', mut], mu, ds=sMu.loc['ds', mut],
                          muNS=muNS,
        dav['#'] = (data.loc[:, ['syn', 'protein_secondary_structure', 'af'] +
                             additional].groupby(
                                 ['syn', 'protein_secondary_structure'] +
                                 additional).count()['af'])
        dav['std'] = (data.loc[:,
                               ['syn', 'protein_secondary_structure', 'af'] +
                               additional].groupby(
                                   ['syn', 'protein_secondary_structure'] +
                                   additional).std()['af'])
        dav['sem'] = dav['std'] / dav['#']
        return dav

    bt = make_binary_table(data)

    from util import boot_strap_patients
    reps = pd.concat(boot_strap_patients(data, average_data, n_bootstrap=10),
                     axis=1)
    reps.columns = np.arange(reps.shape[1]) + 1

    dav = pd.concat([reps.mean(axis=1), reps.std(axis=1)], axis=1)
    dav.columns = ['mean', 'std']

    def plot_average_frequencies(dav):
        fig, ax = plt.subplots()
        fs = 16
        colors = {
            'B': 'darkorange',
            'H': 'steelblue',
            'T': 'seagreen',
            'X': 'black',
            '-': 'grey'
def plot_divdiv(data, fig_filename=None, figtypes=['.png', '.svg', '.pdf']):
    '''
    plot divergence and diversity of synonymous and nonsynonymous mutations
    includes:
        - panels A/B: divergence and diversity over time, per region and
          mutation class, with bootstrap error bars
        - panel C: a panel that compares syn diversity/nonsyn divergence
        - panel D: frequency spectra of syn and nonsyn SNPs

    Args:
        data (dict): the entries 'divdiv', 'divdiv_corr' and 'sfs' are read;
            data['sfs'] is indexed with 'bins', 'syn' and 'nonsyn'.
        fig_filename (str or None): file name stem. The plotted numbers are
            written to fig_filename + '_AB.tsv' / '_C.tsv' / '_D.tsv' and
            the figure to fig_filename + ext for every ext in figtypes.
            With None nothing is written and the figure is shown instead.
        figtypes (list of str): file extensions to save.
    '''
    n_bootstrap=50
    ####### plotting ###########
    import seaborn as sns
    from matplotlib import pyplot as plt
    plt.ion()
    sns.set_style('darkgrid')
    fs=fig_fontsize
    fig_size = (fig_width, 1.0*fig_width)
    cols = HIVEVO_colormap()

    def write_tsv(suffix, rows):
        # Write rows as tab-separated lines. Without a file name stem the
        # export is skipped (None + suffix would raise TypeError); the
        # with-block guarantees the file is closed.
        if fig_filename is None:
            return
        with open(fig_filename+suffix, 'w') as tsv_out:
            for row in rows:
                tsv_out.write('\t'.join(map(str, row))+'\n')

    fig, axs = plt.subplots(2, 2,figsize=fig_size)
    divdiv = data['divdiv']
    # in rough the order in which they most dominantly appear in the plot
    regions = ['envelope', 'accessory', 'structural','enzymes']
    time_bins = np.array([0, 200, 500, 1000, 1500, 2000, 4000])
    time_binc = 0.5*(time_bins[:-1]+time_bins[1:])
    # presumably adds the 'time_bin' column used below to divdiv in place -- confirm
    add_binned_column(divdiv, time_bins, 'time')

    # map the regions to rough genomic order to match the genome color map in panel C
    region_colors = {reg:c for reg, c in zip(regions, [cols(x) for x in [0.66, 0.99, 0.01, 0.33]])}
    def get_time_bin_mean(df):
        # mean diversity and divergence within each 'time_bin' group
        return df.loc[:,['time_bin', 'diversity', 'divergence']].groupby(by=['time_bin'], as_index=False).mean()
    def label_func(mutclass, region, divordiv):  # assign labels to Panels A and B separately to make a combinatorial legend (regions vs syn/nonsyn)
        if divordiv=='divergence' and mutclass=='nonsyn':
            return region
        elif divordiv=='diversity' and region=='accessory':
            return mutclass
        else:
            return None

    ########## panel A and B #####################
    rows_ab = []
    for ax, dtype in zip(axs[0,:], ['divergence', 'diversity']):
        add_panel_label(ax, 'A' if dtype=='divergence' else 'B', x_offset = -0.3)
        for mutclass in ['nonsyn', 'syn']:
            line_style = '-' if mutclass=='nonsyn' else '--'
            for region in regions:
                ind = (divdiv.loc[:,'region']==region) & (divdiv.loc[:,'mutclass']==mutclass)
                tmp = divdiv.loc[ind,['time_bin', 'diversity', 'divergence', 'pcode']]
                avg_divdiv = get_time_bin_mean(tmp)
                # presumably one get_time_bin_mean() result per resampled patient set
                bs = boot_strap_patients(tmp, eval_func = get_time_bin_mean, n_bootstrap=n_bootstrap)
                # plot the same line with and without error bars, labels for legend without
                ax.plot(time_binc/365.25, avg_divdiv.loc[:,dtype], ls=line_style,
                            c=region_colors[region], lw=3, label=label_func(mutclass, region, dtype))
                ax.errorbar(time_binc/365.25, avg_divdiv.loc[:,dtype], replicate_func(bs, dtype, np.std, bin_index='time_bin'),
                            ls=line_style, c=region_colors[region], lw=3)
                rows_ab.append([dtype, mutclass, region]+list(avg_divdiv.loc[:,dtype]))

        ax.legend(loc=2, fontsize=fs-1, numpoints=2, labelspacing = 0)
        ax.set_xticks([0,2,4,6,8])
        if dtype=='divergence':
            ax.set_yticks([0,.02,.04])
            ax.set_ylim([0,.048])
        else:
            ax.set_yticks([0,.01,.02])
            ax.set_ylim([0,.028])
        ax.set_xlim([0,8.5])
        ax.set_ylabel(dtype)
        ax.tick_params(labelsize=fs-2)
        ax.set_xlabel('Years since EDI', fontsize=fs)
    write_tsv('_AB.tsv', rows_ab)

    ########## panel C: anti correlation of syn diversity and nonsyn divergence #############
    (avg_nonsyn_divg, avg_nonsyn_divs, avg_syn_divs) = data['divdiv_corr']
    ax = axs[1,0]
    add_panel_label(ax, 'C', x_offset = -0.3)
    # only every 500th entry is plotted and exported
    x_data, y_data = avg_nonsyn_divg[::500], avg_syn_divs[::500]
    ax.scatter(x_data, y_data, c=[cols(p) for p in np.linspace(0,1,len(x_data))], s=50)
    write_tsv('_C.tsv', [["nonsyn_divergence"]+list(x_data),
                         ["syn_diversity"]+list(y_data)])

    ax.set_xlabel('nonsyn divergence', fontsize = fig_fontsize)
    ax.set_ylabel('syn diversity', fontsize = fig_fontsize)
    ax.set_ylim([0,0.028])
    ax.set_xlim([0,0.012])
    ax.set_xticks([0, 0.005,0.01])
    ax.set_yticks([0, 0.01, 0.02])
    ax.tick_params(labelsize=fig_fontsize-2)


    ########## sfs in panel D ##############
    sfs=data['sfs']
    ax = axs[1,1]
    add_panel_label(ax, 'D', x_offset = -0.3)
    sfs_colors = sns.color_palette(n_colors=2)
    binc = 0.5*(sfs['bins'][1:]+sfs['bins'][:-1])
    # normalise each spectrum to sum to one
    syn_frac = sfs['syn']/np.sum(sfs['syn'])
    nonsyn_frac = sfs['nonsyn']/np.sum(sfs['nonsyn'])
    # the syn bars are shifted left so both classes sit side by side
    ax.bar(binc-0.045, syn_frac, width = 0.04, label='syn', color=sfs_colors[0])
    ax.bar(binc, nonsyn_frac, width = 0.04, label='nonsyn', color=sfs_colors[1])
    write_tsv('_D.tsv', [["bin_centers"]+list(binc),
                         ["sfs_nonsyn"]+list(nonsyn_frac),
                         ["sfs_syn"]+list(syn_frac)])
    ax.set_ylim([0.005,2.0])
    ax.set_yscale('log')
    ax.set_xlabel('Frequency',fontsize=fs)
    ax.set_ylabel('Fractions of SNPs',fontsize=fs)
    ax.legend(loc=1, fontsize=fs-2)
    ax.tick_params(labelsize=fig_fontsize-2)

    # finalize and save the figure
    plt.tight_layout(rect=(0.0, 0.02, 0.98, 0.98), pad=0.05, h_pad=0.5, w_pad=0.4)
    if fig_filename is not None:
        for ext in figtypes:
            fig.savefig(fig_filename+ext)
    else:
        plt.ion()
        plt.show()