def get_mutation_matrix(data):
    '''
    Calculate the mutation rate matrix from accumulation of intra patient
    diversity via linear regression. Uncertainty of the estimates is assessed
    via boot strapping over patients.

    Args:
        data: pandas DataFrame with (at least) the columns 'af', 'time_binc'
              and 'mut'; it is also passed on to boot_strap_patients.

    Returns:
        (mu, dmulog10): mu is a pandas Series of rates indexed by mutation;
        dmulog10 has the same index and holds the standard deviation of
        log10(rate) over 100 bootstrap replicates.
    '''
    def get_mu(data):
        # mean 'af' for every (mutation, time_binc) pair, reshaped such that
        # rows are mutations and columns are the 'time_binc' values
        d = (data
             .loc[:, ['af', 'time_binc', 'mut']]
             .groupby(['mut', 'time_binc'])
             .mean()
             .unstack('time_binc')
             .loc[:, 'af'])

        rates = {}
        for mut, aft in d.iterrows():
            times = np.array(aft.index)
            freqs = np.array(aft)
            # least-squares slope of a straight line through the origin
            rates[mut] = np.inner(freqs, times) / np.inner(times, times)

        mu = pd.Series(rates)
        mu.name = 'mutation rate from longitudinal data'
        return mu

    mu = get_mu(data)

    # Bootstrap
    dmulog10 = mu.copy()
    muBS = boot_strap_patients(data, get_mu, n_bootstrap=100)
    # iterate over the index rather than Series.iteritems(): the latter no
    # longer exists in pandas >= 2.0, the index works in every version
    for key in dmulog10.index:
        dmulog10[key] = np.std([np.log10(rep[key]) for rep in muBS])

    return mu, dmulog10
def get_mutation_matrix(data):
    '''
    Calculate the mutation rate matrix from accumulation of intra patient
    diversity via linear regression. Uncertainty of the estimates is assessed
    via boot strapping over patients.

    Args:
        data: pandas DataFrame with (at least) the columns 'af', 'time_binc'
              and 'mut'; it is also passed on to boot_strap_patients.

    Returns:
        (mu, dmulog10): mu is a pandas Series of rates indexed by mutation;
        dmulog10 has the same index and holds the standard deviation of
        log10(rate) over 100 bootstrap replicates.
    '''
    def get_mu(data):
        # average 'af' per (mutation, time_binc); after unstacking, each row
        # is one mutation and each column one 'time_binc' value
        averaged = data.loc[:, ['af', 'time_binc', 'mut']].groupby(
            ['mut', 'time_binc']).mean()
        d = averaged.unstack('time_binc').loc[:, 'af']

        rates = {}
        for mut, aft in d.iterrows():
            times = np.array(aft.index)
            freqs = np.array(aft)
            # slope of the least-squares line through the origin
            rates[mut] = np.inner(freqs, times) / np.inner(times, times)

        mu = pd.Series(rates)
        mu.name = 'mutation rate from longitudinal data'
        return mu

    mu = get_mu(data)

    # Bootstrap
    dmulog10 = mu.copy()
    muBS = boot_strap_patients(data, get_mu, n_bootstrap=100)
    # Series.iteritems() was removed in pandas 2.0; looping over the index
    # visits the same keys and works with old and new pandas alike
    for key in dmulog10.index:
        dmulog10[key] = np.std([np.log10(rep[key]) for rep in muBS])

    return mu, dmulog10
def bootstrap():
    '''
    Attach a bootstrap error estimate to the fitted 's' values.

    Uses `data`, `mu` and `s` from the enclosing scope. The averaging and fit
    are repeated on 100 replicates produced by boot_strap_patients; for every
    key of s['s'] the standard deviation across replicates is stored in
    s['ds']. Nothing is returned.
    '''
    def prepare_and_fit(data):
        # redo the averaging and the fit on a replicate; only the 's' entry of
        # the second element returned by fit_data is kept
        data_to_fit = average_data(data)
        return fit_data(data_to_fit, mu=mu)[1]['s']

    ds = s['s'].copy()
    sBS = boot_strap_patients(data, prepare_and_fit, n_bootstrap=100)
    # .keys() yields the same keys as the former .iteritems() loop, but still
    # exists in pandas >= 2.0 where iteritems() has been removed
    for key in ds.keys():
        ds[key] = np.std([rep[key] for rep in sBS])
    s['ds'] = ds
def plot_to_away(data, fig_filename = None, figtypes=['.png', '.svg', '.pdf']):
    '''
    Plot divergence from the founder sequence against variability and print
    reversion statistics.

    Args:
        data (dict): needs the entries 'patient' and 'any'; each of them
            provides the tables 'minor_variants' and 'to_away' and a mapping
            'consensus_distance'.
        fig_filename: if not None, the figure is saved as
            fig_filename + '_sfs' + ext for every ext in figtypes; otherwise
            the figure is shown interactively.
        figtypes: file extensions used when saving.

    Side effects:
        The 'to_away' tables are modified in place (add_binned_column is
        called on them and 'reversion'/'divergence' are converted to float).
        Summary statistics are printed to stdout.
    '''
    ####### plotting ###########
    import seaborn as sns
    from matplotlib import pyplot as plt
    plt.ion()
    sns.set_style('darkgrid')
    fs = fig_fontsize
    fig_size = (fig_width, 0.8*fig_width)
    fig, ax = plt.subplots(1, 1, figsize=fig_size)

    Sbins = np.array([0,0.02, 0.08, 0.25, 2])
    Sbinc = 0.5*(Sbins[1:]+Sbins[:-1])
    af_cols = ['af_away_minor', 'af_away_derived', 'af_to_minor', 'af_to_derived']

    def get_Sbin_mean(df):
        return df.groupby(by=['S_bin'], as_index=False).mean()

    for lblstr, subtype in [('subtype', 'patient'), ('group M', 'any')]:
        mv = data[subtype]['minor_variants']
        # subset to a specific time interval; explicit copy so that the
        # conversion below does not write into a slice of the caller's table
        in_window = (mv.loc[:,'time']>1500)&(mv.loc[:,'time']<2500)
        mv = mv.loc[in_window,:].copy()
        # single-argument print() gives the same output on Python 2 and 3
        print("average time: %s" % (mv.loc[:,'time'].mean()/365,))
        # plain column assignment replaces the columns, so the float dtype is
        # adopted; `.loc[:, cols] = ...` may keep the old dtype in recent pandas
        mv[af_cols] = mv[af_cols].astype(float)
        mean_to_away = get_Sbin_mean(mv)
        bs = boot_strap_patients(mv, eval_func=get_Sbin_mean,
                                 columns=af_cols + ['S_bin'])
        print(mean_to_away)
        for col, relation in [('af_away_derived', 'founder = '),
                              ('af_to_derived', u'founder \u2260 ')]:
            # error bars: standard deviation over the bootstrap replicates
            ax.errorbar(Sbinc, mean_to_away.loc[:,col],
                        replicate_func(bs, col, np.std, bin_index='S_bin'),
                        lw = 3, label = relation+lblstr)

    ax.set_yscale('log')
    ax.set_xscale('log')
    ax.set_ylabel('Divergence from founder sequence', fontsize = fig_fontsize)
    ax.set_xlabel('Variability [bits]', fontsize = fig_fontsize)
    for item in ax.get_yticklabels()+ax.get_xticklabels():
        item.set_fontsize(fs)
    ax.set_xlim([0.005,2])
    ax.legend(loc = 'lower right', fontsize = fig_fontsize)
    plt.tight_layout(rect=(0.0, 0.02, 0.98, 0.98), pad=0.05, h_pad=0.5, w_pad=0.4)
    if fig_filename is not None:
        for ext in figtypes:
            fig.savefig(fig_filename+'_sfs'+ext)
    else:
        plt.ion()
        plt.show()

    def get_time_bin_means(df):
        return df.loc[:,['divergence', 'reversion','time_bin']].groupby(by=['time_bin'], as_index=False).mean()

    rev_cols = ['reversion', 'divergence']
    for subtype in ['patient', 'any']:
        to_away = data[subtype]['to_away']
        time_bins = np.array([0,500,1000,1500, 2500, 3500])
        # presumably adds the 'time_bin' column used for grouping below
        add_binned_column(to_away, time_bins, 'time')
        to_away[rev_cols] = to_away[rev_cols].astype(float)
        rev_div = get_time_bin_means(to_away)
        bs = boot_strap_patients(to_away, get_time_bin_means,
                                 columns = ['reversion','divergence','time_bin'])
        reversion_std = replicate_func(bs, 'reversion', np.std, bin_index='time_bin')
        total_div_std = replicate_func(bs, 'divergence', np.std, bin_index='time_bin')
        fraction = rev_div.loc[:,'reversion']/rev_div.loc[:,'divergence']

        print("Comparison: %s" % (subtype,))
        print("Reversions:\n%s" % (rev_div.loc[:,'reversion'],))
        print("Divergence:\n%s" % (rev_div.loc[:,'divergence'],))
        print("Fraction:")
        for frac, total, num_std, denom_std in zip(fraction, rev_div.loc[:,'divergence'],
                                                   reversion_std, total_div_std):
            # error of the ratio reversion/divergence, combined from the
            # bootstrap standard deviations of numerator and denominator
            frac_err = np.sqrt(num_std**2/total**2 + denom_std**2*frac**2/total**2)
            print("%s +/- %s" % (frac, frac_err))

        # list(): a Python 3 dict view cannot be averaged by numpy directly
        print("Consensus!=Founder: %s"
              % (np.mean(list(data[subtype]['consensus_distance'].values())),))
.groupby(['syn', 'protein_secondary_structure'] + additional) .count() ['af']) dav['std'] = (data .loc[:, ['syn', 'protein_secondary_structure', 'af'] + additional] .groupby(['syn', 'protein_secondary_structure'] + additional) .std() ['af']) dav['sem'] = dav['std'] / dav['#'] return dav bt = make_binary_table(data) from util import boot_strap_patients reps = pd.concat(boot_strap_patients(data, average_data, n_bootstrap=10), axis=1) reps.columns = np.arange(reps.shape[1]) + 1 dav = pd.concat([reps.mean(axis=1), reps.std(axis=1)], axis=1) dav.columns = ['mean', 'std'] def plot_average_frequencies(dav): fig, ax = plt.subplots() fs = 16 colors = {'B': 'darkorange', 'H': 'steelblue', 'T': 'seagreen', 'X': 'black', '-': 'grey'} lss = {True: '--', False: '-'} d = {True: 'syn', False: 'nonsyn'} labs = {'B': 'sheet', 'T': 'turn', 'H': 'helix', 'X': 'unstructured'}
def plot_to_away(data, fig_filename=None, figtypes=['.png', '.svg', '.pdf'], sequence_type='nuc'):
    '''Makes a two panel figure summarizing the results on reversion

    The left panel (A) shows divergence from the founder against time, the
    right panel (B) divergence against variability. Reversion statistics are
    printed to stdout.

    Args:
        data (dict): data to be plotted. Entries read here:
            data['patient'], data['any']: each provides the tables
                'minor_variants' and 'to_away' and a mapping
                'consensus_distance'
            data['to_histogram'], data['away_histogram']: indexed by
                'patient'/'any'; each value maps a key (presumably a patient)
                to a mapping time -> allele frequency histogram
            data['time_bins'], data['af_bins']: arrays of bin boundaries
        fig_filename: if not None, the figure is saved as fig_filename + ext
            for every ext in figtypes; otherwise it is shown interactively
        figtypes: file extensions used when saving
        sequence_type: 'nuc' selects one set of variability bins and y-axis
            limits, any other value the alternative set

    Side effects:
        The 'to_away' tables are modified in place (add_binned_column is
        called on them and 'reversion'/'divergence' are converted to float).
    '''
    import seaborn as sns
    from matplotlib import pyplot as plt
    plt.ion()
    sns.set_style('darkgrid')
    fs = fig_fontsize
    fig_size = (1.0*fig_width, 0.6*fig_width)
    fig, axs = plt.subplots(1, 2, figsize=fig_size)

    nbs = 100 # number of bootstrap replicates
    # set the colors for the plots, both panels use the same color scheme
    cols = HIVEVO_colormap()
    colors = [cols(x) for x in [0.0, 0.33, 0.66, 0.99]]

    ####################################################################################
    # make panel divergence vs entropy
    ####################################################################################
    ax = axs[1]
    if sequence_type == 'nuc':
        Sbins = np.array([0, 0.02, 0.08, 0.25, 2])
    else:
        Sbins = np.array([0, 0.1, 0.3, 3])
    Sbinc = 0.5*(Sbins[1:]+Sbins[:-1])
    af_cols = ['af_away_minor', 'af_away_derived', 'af_to_minor', 'af_to_derived']

    def get_Sbin_mean(df):
        # regroup and calculate mean in entropy bins
        return df.groupby(by=['S_bin'], as_index=False).mean()

    color_count = 0
    for lblstr, subtype, ls in [('subtype', 'patient', '--'), ('group M', 'any', '-')]:
        mv = data[subtype]['minor_variants']
        # subset to a specific time interval; explicit copy so that the
        # conversion below does not write into a slice of the caller's table
        in_window = (mv.loc[:,'time'] > 1500)&(mv.loc[:,'time'] < 2500)
        mv = mv.loc[in_window,:].copy()
        # single-argument print() gives the same output on Python 2 and 3
        print("average time: %s" % (mv.loc[:,'time'].mean() / 365.25,))
        # plain column assignment replaces the columns, so the float dtype is
        # adopted; `.loc[:, cols] = ...` may keep the old dtype in recent pandas
        mv[af_cols] = mv[af_cols].astype(float)
        mean_to_away = get_Sbin_mean(mv)
        bs = boot_strap_patients(mv, eval_func=get_Sbin_mean, n_bootstrap=nbs,
                                 columns=af_cols + ['S_bin'])
        print(mean_to_away)
        for col, relation in [('af_away_derived', 'founder = '),
                              ('af_to_derived', u'founder \u2260 ')]:
            # error bars: standard deviation over the bootstrap replicates
            ax.errorbar(Sbinc, mean_to_away.loc[:,col],
                        replicate_func(bs, col, np.std, bin_index='S_bin'),
                        ls=ls, lw=3, label=relation+lblstr, c=colors[color_count])
            color_count += 1

    ax.set_yscale('log')
    ax.set_xscale('log')
    ax.set_ylabel('Divergence from founder', fontsize = fig_fontsize)
    ax.set_xlabel('Variability [bits]', fontsize = fig_fontsize)
    add_panel_label(ax, 'B', x_offset=-0.32)
    for item in ax.get_yticklabels()+ax.get_xticklabels():
        item.set_fontsize(fs-2)
    ax.set_xlim([0.005, 2])

    ####################################################################################
    # print reversion statistics
    ####################################################################################
    def get_time_bin_means(df):
        # get mean of divergence, reversion divergence and time for each time bin
        return df.loc[:,['divergence', 'reversion','time_bin']].groupby(by=['time_bin'], as_index=False).mean()

    rev_cols = ['reversion', 'divergence']
    for subtype in ['patient', 'any']:
        to_away = data[subtype]['to_away']
        time_bins = np.array([0, 500, 1000, 1500, 2500, 3500])
        # presumably adds the 'time_bin' column used for grouping below
        add_binned_column(to_away, time_bins, 'time')
        to_away[rev_cols] = to_away[rev_cols].astype(float)
        rev_div = get_time_bin_means(to_away)
        bs = boot_strap_patients(to_away, get_time_bin_means, n_bootstrap = nbs,
                                 columns = ['reversion','divergence','time_bin'])
        reversion_std = replicate_func(bs, 'reversion', np.std, bin_index='time_bin')
        total_div_std = replicate_func(bs, 'divergence', np.std, bin_index='time_bin')
        fraction = rev_div.loc[:,'reversion']/rev_div.loc[:,'divergence']

        print("Comparison: %s" % (subtype,))
        print("Reversions:\n%s" % (rev_div.loc[:,'reversion'],))
        print("Divergence:\n%s" % (rev_div.loc[:,'divergence'],))
        # print the fraction of divergence that is due to reversion at different times
        # gives errors as standard deviations over patient bootstraps
        print("Fraction:")
        for frac, total, num_std, denom_std in zip(fraction, rev_div.loc[:,'divergence'],
                                                   reversion_std, total_div_std):
            frac_err = np.sqrt(num_std**2/total**2 + denom_std**2*frac**2/total**2)
            print("%s +/- %s" % (frac, frac_err))

        # list(): a Python 3 dict view cannot be averaged by numpy directly
        print("Consensus!=Founder: %s"
              % (np.mean(list(data[subtype]['consensus_distance'].values())),))

    ####################################################################################
    # make panel divergence vs time
    ####################################################################################
    to_histogram = data['to_histogram']
    away_histogram = data['away_histogram']
    time_bins = data['time_bins']
    af_bins = data['af_bins']
    af_binc = 0.5*(af_bins[1:]+af_bins[:-1])

    def bin_time(freq_arrays, time_bins):
        '''sum up allele frequency histograms corresponding to the same time bin

        freq_arrays is an iterable of mappings time -> histogram; the same
        mapping may occur several times (bootstrap replicates).
        '''
        binned_hists = [np.zeros_like(af_binc) for ti in time_bins[1:]]
        for hists in freq_arrays:
            for t, y in hists.items():
                ti = np.searchsorted(time_bins, t)
                # times outside the outermost bin boundaries are ignored
                if ti>0 and ti<len(time_bins):
                    binned_hists[ti-1] += y
        return binned_hists

    def get_div(afhist, fixed=False):
        '''return the fraction of fixed alleles or the mean divergence'''
        if fixed:
            return afhist[0]/afhist.sum()
        else:
            return np.array(afhist[:-1]*(1-af_binc[:-1])).sum()/afhist.sum()

    ax = axs[0]
    time_binc = 0.5*(time_bins[1:]+time_bins[:-1])
    color_count = 0
    for subtype, ls in [('patient', '--'), ('any','-')]:
        reference = 'group M' if subtype=='any' else 'subtype'
        for toaway, H in [(u'founder = '+reference, away_histogram[subtype]),
                          (u'founder \u2260 '+reference, to_histogram[subtype])]:
            div = [get_div(hist) for hist in bin_time(H.values(), time_bins)]
            # make replicates and calculate bootstrap confidence intervals
            replicates = []
            all_keys = list(H.keys())
            for ri in range(nbs):
                picks = np.random.randint(len(all_keys), size=len(all_keys))
                # pass a list, not a dict: keys drawn more than once have to
                # be counted more than once, a dict would collapse them and
                # turn the bootstrap into a mere subsample
                resampled = bin_time([H[all_keys[ii]] for ii in picks], time_bins)
                replicates.append([get_div(hist) for hist in resampled])
            std_dev = np.array(replicates).std(axis=0)
            ax.errorbar(time_binc/365.25, div, std_dev, ls = ls, lw=3, c=colors[color_count])
            # plot again with label to avoid error bars in legend
            ax.plot(time_binc/365.25, div, label = toaway, ls = ls, lw=3, c=colors[color_count])
            color_count += 1

    if sequence_type == 'nuc':
        ax.set_ylim([0,0.16])
        ax.set_yticks([0, 0.04, 0.08, 0.12])
    else:
        ax.set_ylim([0,0.32])
        ax.set_yticks([0, 0.08, 0.16, 0.24])
    ax.set_xlabel('ETI [years]', fontsize=fs)
    ax.set_ylabel('Divergence from founder', fontsize=fs)
    ax.legend(loc=2, fontsize=fs-2, labelspacing=0)
    add_panel_label(ax, 'A', x_offset=-0.32)
    ax.tick_params(axis='both', labelsize=fs-2)

    plt.tight_layout(pad=0.3, h_pad=0.5)
    # the default fig_filename=None used to fail with a TypeError when
    # building the file name; show the figure instead in that case
    if fig_filename is not None:
        for ext in figtypes:
            fig.savefig(fig_filename+ext)
    else:
        plt.ion()
        plt.show()
def plot_divdiv(data, fig_filename=None, figtypes=['.png', '.svg', '.pdf']):
    '''
    plot divergence and diversity of synonymous and nonsynonymous mutations
    includes:
        - panels A/B: divergence and diversity over time for each region,
          separately for nonsyn and syn mutations
        - panel C: a panel that compares syn diversity/nonsyn divergence
        - panel D: frequency spectra of syn and nonsyn SNPs

    Args:
        data (dict): entries read here are
            'divdiv': table with the columns 'region', 'mutclass', 'diversity',
                'divergence' and 'pcode'; add_binned_column is called on it
                in place (presumably adding the 'time_bin' column)
            'divdiv_corr': tuple (avg_nonsyn_divg, avg_nonsyn_divs, avg_syn_divs)
            'sfs': mapping with the entries 'bins', 'syn' and 'nonsyn'
        fig_filename: if not None, the figure is saved as fig_filename + ext
            for every ext in figtypes and the plotted numbers are written to
            fig_filename + '_AB.tsv', '_C.tsv' and '_D.tsv'; otherwise the
            figure is only shown and no file is written
        figtypes: file extensions used when saving
    '''
    n_bootstrap = 50
    ####### plotting ###########
    import seaborn as sns
    from matplotlib import pyplot as plt
    plt.ion()
    sns.set_style('darkgrid')
    fs = fig_fontsize
    fig_size = (fig_width, 1.0 * fig_width)
    cols = HIVEVO_colormap()
    fig, axs = plt.subplots(2, 2, figsize=fig_size)

    divdiv = data['divdiv']
    # in rough the order in which they most dominantly appear in the plot
    regions = ['envelope', 'accessory', 'structural', 'enzymes']
    time_bins = np.array([0, 200, 500, 1000, 1500, 2000, 4000])
    time_binc = 0.5 * (time_bins[:-1] + time_bins[1:])
    add_binned_column(divdiv, time_bins, 'time')
    # map the regions to rough genomic order to match the genome color map in panel C
    colors = {
        reg: c
        for reg, c in zip(regions, [cols(x) for x in [0.66, 0.99, 0.01, 0.33]])
    }

    def get_time_bin_mean(df):
        return df.loc[:, ['time_bin', 'diversity', 'divergence']].groupby(
            by=['time_bin'], as_index=False).mean()

    def label_func(mutclass, region, divordiv):
        # assign labels to Panels A and B separately to make a combinatorial
        # legend (regions vs syn/nonsyn)
        if divordiv == 'divergence' and mutclass == 'nonsyn':
            return region
        elif divordiv == 'diversity' and region == 'accessory':
            return mutclass
        else:
            return None

    def write_tsv(suffix, rows):
        # one tab-separated line per row; skipped without a file name, since
        # fig_filename=None used to fail when the output path was built
        if fig_filename is None:
            return
        with open(fig_filename + suffix, 'w') as tsv_out:
            for row in rows:
                tsv_out.write('\t'.join(map(str, row)) + '\n')

    ########## panel A and B #####################
    rows_AB = []
    # builtin zip instead of itertools.izip, which only exists in Python 2
    for ax, dtype in zip(axs[0, :], ['divergence', 'diversity']):
        add_panel_label(ax, 'A' if dtype == 'divergence' else 'B', x_offset=-0.3)
        for mutclass in ['nonsyn', 'syn']:
            line_style = '-' if mutclass == 'nonsyn' else '--'
            for region in regions:
                ind = ((divdiv.loc[:, 'region'] == region)
                       & (divdiv.loc[:, 'mutclass'] == mutclass))
                tmp = divdiv.loc[ind, ['time_bin', 'diversity', 'divergence', 'pcode']]
                avg_divdiv = get_time_bin_mean(tmp)
                bs = boot_strap_patients(tmp, eval_func=get_time_bin_mean,
                                         n_bootstrap=n_bootstrap)
                # plot the same line with and without error bars, labels for legend without
                ax.plot(time_binc / 365.25, avg_divdiv.loc[:, dtype],
                        ls=line_style, c=colors[region], lw=3,
                        label=label_func(mutclass, region, dtype))
                ax.errorbar(time_binc / 365.25, avg_divdiv.loc[:, dtype],
                            replicate_func(bs, dtype, np.std, bin_index='time_bin'),
                            ls=line_style, c=colors[region], lw=3)
                rows_AB.append([dtype, mutclass, region] + list(avg_divdiv.loc[:, dtype]))

        ax.legend(loc=2, fontsize=fs - 1, numpoints=2, labelspacing=0)
        ax.set_xticks([0, 2, 4, 6, 8])
        if dtype == 'divergence':
            ax.set_yticks([0, .02, .04])
            ax.set_ylim([0, .048])
        else:
            ax.set_yticks([0, .01, .02])
            ax.set_ylim([0, .028])
        ax.set_xlim([0, 8.5])
        ax.set_ylabel(dtype)
        ax.tick_params(labelsize=fs - 2)
        ax.set_xlabel('Years since EDI', fontsize=fs)
    write_tsv('_AB.tsv', rows_AB)

    ########## panel C: anti correlation of syn diversity and nonsyn divergence #############
    (avg_nonsyn_divg, avg_nonsyn_divs, avg_syn_divs) = data['divdiv_corr']
    ax = axs[1, 0]
    add_panel_label(ax, 'C', x_offset=-0.3)
    # only every 500th entry is plotted
    x_data, y_data = avg_nonsyn_divg[::500], avg_syn_divs[::500]
    ax.scatter(x_data, y_data,
               c=[cols(p) for p in np.linspace(0, 1, len(x_data))], s=50)
    write_tsv('_C.tsv', [["nonsyn_divergence"] + list(x_data),
                         ["syn_diversity"] + list(y_data)])
    ax.set_xlabel('nonsyn divergence', fontsize=fig_fontsize)
    ax.set_ylabel('syn diversity', fontsize=fig_fontsize)
    ax.set_ylim([0, 0.028])
    ax.set_xlim([0, 0.012])
    ax.set_xticks([0, 0.005, 0.01])
    ax.set_yticks([0, 0.01, 0.02])
    ax.tick_params(labelsize=fig_fontsize - 2)

    ########## sfs in panel D ##############
    sfs = data['sfs']
    ax = axs[1, 1]
    add_panel_label(ax, 'D', x_offset=-0.3)
    bar_colors = sns.color_palette(n_colors=2)
    binc = 0.5 * (sfs['bins'][1:] + sfs['bins'][:-1])
    # both spectra are normalized to sum to one
    syn_frac = sfs['syn'] / np.sum(sfs['syn'])
    nonsyn_frac = sfs['nonsyn'] / np.sum(sfs['nonsyn'])
    ax.bar(binc - 0.045, syn_frac, width=0.04, label='syn', color=bar_colors[0])
    ax.bar(binc, nonsyn_frac, width=0.04, label='nonsyn', color=bar_colors[1])
    write_tsv('_D.tsv', [["bin_centers"] + list(binc),
                         ["sfs_nonsyn"] + list(nonsyn_frac),
                         ["sfs_syn"] + list(syn_frac)])
    ax.set_ylim([0.005, 2.0])
    ax.set_yscale('log')
    ax.set_xlabel('Frequency', fontsize=fs)
    ax.set_ylabel('Fractions of SNPs', fontsize=fs)
    ax.legend(loc=1, fontsize=fs - 2)
    ax.tick_params(labelsize=fig_fontsize - 2)

    # finalize and save the figure
    plt.tight_layout(rect=(0.0, 0.02, 0.98, 0.98), pad=0.05, h_pad=0.5, w_pad=0.4)
    if fig_filename is not None:
        for ext in figtypes:
            fig.savefig(fig_filename + ext)
    else:
        plt.ion()
        plt.show()
muNS=muNS, nu_sweep_norm=nu_sweep_norm) sys.exit() if True: def fit_fitness_cost_for_bootstrap(data): data_to_fit = prepare_data_for_fit(data, plot=False) s = fit_fitness_cost_interpmu(data_to_fit, mu=mu, muNS=muNS, nu_sweep_norm=nu_sweep_norm) return s['s'] ds = s['s'].copy() sBS = boot_strap_patients(data, fit_fitness_cost_for_bootstrap, n_bootstrap=100) for key, _ in ds.iteritems(): ds[key] = np.std([tmp[key] for tmp in sBS]) s.rename(columns={'ds': 'ds_fit'}, inplace=True) s['ds_bootstrap'] = ds s.sort_index(axis=1, ascending=False, inplace=True) fn_s = 'data/fitness_cost_result.pickle' s.to_pickle(fn_s) plot_fitness_cost_allmuts(sMu) for mut in ['A->G', 'G->A', 'C->T', 'T->C']: plot_fitness_cost(data_to_fit, sMu.loc['s', mut], mu, ds=sMu.loc['ds', mut], muNS=muNS,
dav['#'] = (data.loc[:, ['syn', 'protein_secondary_structure', 'af'] + additional].groupby( ['syn', 'protein_secondary_structure'] + additional).count()['af']) dav['std'] = (data.loc[:, ['syn', 'protein_secondary_structure', 'af'] + additional].groupby( ['syn', 'protein_secondary_structure'] + additional).std()['af']) dav['sem'] = dav['std'] / dav['#'] return dav bt = make_binary_table(data) from util import boot_strap_patients reps = pd.concat(boot_strap_patients(data, average_data, n_bootstrap=10), axis=1) reps.columns = np.arange(reps.shape[1]) + 1 dav = pd.concat([reps.mean(axis=1), reps.std(axis=1)], axis=1) dav.columns = ['mean', 'std'] def plot_average_frequencies(dav): fig, ax = plt.subplots() fs = 16 colors = { 'B': 'darkorange', 'H': 'steelblue', 'T': 'seagreen', 'X': 'black', '-': 'grey'
def plot_divdiv(data, fig_filename=None, figtypes=['.png', '.svg', '.pdf']):
    '''
    plot divergence and diversity of synonymous and nonsynonymous mutations
    includes:
        - panels A/B: divergence and diversity over time for each region,
          separately for nonsyn and syn mutations
        - panel C: a panel that compares syn diversity/nonsyn divergence
        - panel D: frequency spectra of syn and nonsyn SNPs

    Args:
        data (dict): entries read here are
            'divdiv': table with the columns 'region', 'mutclass', 'diversity',
                'divergence' and 'pcode'; add_binned_column is called on it
                in place (presumably adding the 'time_bin' column)
            'divdiv_corr': tuple (avg_nonsyn_divg, avg_nonsyn_divs, avg_syn_divs)
            'sfs': mapping with the entries 'bins', 'syn' and 'nonsyn'
        fig_filename: if not None, the figure is saved as fig_filename + ext
            for every ext in figtypes and the plotted numbers are written to
            fig_filename + '_AB.tsv', '_C.tsv' and '_D.tsv'; otherwise the
            figure is only shown and no file is written
        figtypes: file extensions used when saving
    '''
    n_bootstrap=50
    ####### plotting ###########
    import seaborn as sns
    from matplotlib import pyplot as plt
    plt.ion()
    sns.set_style('darkgrid')
    fs=fig_fontsize
    fig_size = (fig_width, 1.0*fig_width)
    cols = HIVEVO_colormap()
    fig, axs = plt.subplots(2, 2,figsize=fig_size)

    divdiv = data['divdiv']
    # in rough the order in which they most dominantly appear in the plot
    regions = ['envelope', 'accessory', 'structural','enzymes']
    time_bins = np.array([0, 200, 500, 1000, 1500, 2000, 4000])
    time_binc = 0.5*(time_bins[:-1]+time_bins[1:])
    add_binned_column(divdiv, time_bins, 'time')
    # map the regions to rough genomic order to match the genome color map in panel C
    colors = {reg:c for reg, c in zip(regions, [cols(x) for x in [0.66, 0.99, 0.01, 0.33]])}

    def get_time_bin_mean(df):
        return df.loc[:,['time_bin', 'diversity', 'divergence']].groupby(by=['time_bin'], as_index=False).mean()

    def label_func(mutclass, region, divordiv):
        # assign labels to Panels A and B separately to make a combinatorial legend (regions vs syn/nonsyn)
        if divordiv=='divergence' and mutclass=='nonsyn':
            return region
        elif divordiv=='diversity' and region=='accessory':
            return mutclass
        else:
            return None

    def write_tsv(suffix, rows):
        # writes one tab-separated line per row; does nothing without a file
        # name, because building the path from fig_filename=None used to fail
        if fig_filename is None:
            return
        with open(fig_filename+suffix, 'w') as tsv_out:
            for row in rows:
                tsv_out.write('\t'.join(map(str, row))+'\n')

    ########## panel A and B #####################
    rows_AB = []
    # builtin zip replaces itertools.izip, which is not available in Python 3
    for ax, dtype in zip(axs[0,:], ['divergence', 'diversity']):
        add_panel_label(ax, 'A' if dtype=='divergence' else 'B', x_offset = -0.3)
        for mutclass in ['nonsyn', 'syn']:
            line_style = '-' if mutclass=='nonsyn' else '--'
            for region in regions:
                ind = (divdiv.loc[:,'region']==region) & (divdiv.loc[:,'mutclass']==mutclass)
                tmp = divdiv.loc[ind,['time_bin', 'diversity', 'divergence', 'pcode']]
                avg_divdiv = get_time_bin_mean(tmp)
                bs = boot_strap_patients(tmp, eval_func = get_time_bin_mean, n_bootstrap=n_bootstrap)
                # plot the same line with and without error bars, labels for legend without
                ax.plot(time_binc/365.25, avg_divdiv.loc[:,dtype],
                        ls=line_style, c=colors[region], lw=3,
                        label=label_func(mutclass, region, dtype))
                ax.errorbar(time_binc/365.25, avg_divdiv.loc[:,dtype],
                            replicate_func(bs, dtype, np.std, bin_index='time_bin'),
                            ls=line_style, c=colors[region], lw=3)
                rows_AB.append([dtype, mutclass, region]+list(avg_divdiv.loc[:,dtype]))

        ax.legend(loc=2, fontsize=fs-1, numpoints=2, labelspacing = 0)
        ax.set_xticks([0,2,4,6,8])
        if dtype=='divergence':
            ax.set_yticks([0,.02,.04])
            ax.set_ylim([0,.048])
        else:
            ax.set_yticks([0,.01,.02])
            ax.set_ylim([0,.028])
        ax.set_xlim([0,8.5])
        ax.set_ylabel(dtype)
        ax.tick_params(labelsize=fs-2)
        ax.set_xlabel('Years since EDI', fontsize=fs)
    write_tsv('_AB.tsv', rows_AB)

    ########## panel C: anti correlation of syn diversity and nonsyn divergence #############
    (avg_nonsyn_divg, avg_nonsyn_divs, avg_syn_divs) = data['divdiv_corr']
    ax = axs[1,0]
    add_panel_label(ax, 'C', x_offset = -0.3)
    # only every 500th entry is plotted
    x_data, y_data = avg_nonsyn_divg[::500], avg_syn_divs[::500]
    ax.scatter(x_data, y_data, c=[cols(p) for p in np.linspace(0,1,len(x_data))], s=50)
    write_tsv('_C.tsv', [["nonsyn_divergence"]+list(x_data),
                         ["syn_diversity"]+list(y_data)])
    ax.set_xlabel('nonsyn divergence', fontsize = fig_fontsize)
    ax.set_ylabel('syn diversity', fontsize = fig_fontsize)
    ax.set_ylim([0,0.028])
    ax.set_xlim([0,0.012])
    ax.set_xticks([0, 0.005,0.01])
    ax.set_yticks([0, 0.01, 0.02])
    ax.tick_params(labelsize=fig_fontsize-2)

    ########## sfs in panel D ##############
    sfs=data['sfs']
    ax = axs[1,1]
    add_panel_label(ax, 'D', x_offset = -0.3)
    bar_colors = sns.color_palette(n_colors=2)
    binc = 0.5*(sfs['bins'][1:]+sfs['bins'][:-1])
    # both spectra are normalized to sum to one
    syn_frac = sfs['syn']/np.sum(sfs['syn'])
    nonsyn_frac = sfs['nonsyn']/np.sum(sfs['nonsyn'])
    ax.bar(binc-0.045, syn_frac, width = 0.04, label='syn', color=bar_colors[0])
    ax.bar(binc, nonsyn_frac, width = 0.04, label='nonsyn', color=bar_colors[1])
    write_tsv('_D.tsv', [["bin_centers"]+list(binc),
                         ["sfs_nonsyn"]+list(nonsyn_frac),
                         ["sfs_syn"]+list(syn_frac)])
    ax.set_ylim([0.005,2.0])
    ax.set_yscale('log')
    ax.set_xlabel('Frequency',fontsize=fs)
    ax.set_ylabel('Fractions of SNPs',fontsize=fs)
    ax.legend(loc=1, fontsize=fs-2)
    ax.tick_params(labelsize=fig_fontsize-2)

    # finalize and save the figure
    plt.tight_layout(rect=(0.0, 0.02, 0.98, 0.98), pad=0.05, h_pad=0.5, w_pad=0.4)
    if fig_filename is not None:
        for ext in figtypes:
            fig.savefig(fig_filename+ext)
    else:
        plt.ion()
        plt.show()