def fitnesscost_distribution(regions, minor_af, synnonsyn, synnonsyn_uc, mut_rates, fname=None, ref=None):
    '''
    Produce figure of distribution of selection coefficients separately for
    synonymous, nonsynonymous sites, and synonymous sites in reading frame
    overlaps. FIGURE 4 of the manuscript.

    The cost of a site is estimated as mut_rate / (minor_af + af_cutoff) and
    clipped to the interval [0.001, 0.1] before it is histogrammed.

    Parameters:
        regions:      region name, or list of region names. A single string
                      is wrapped into a one-element list.
        minor_af:     per-region arrays of minor allele frequencies, indexed
                      as minor_af[region]. NaN entries are excluded.
        synnonsyn:    per-region boolean masks. Sites set here form the
                      'synonymous' panel.
        synnonsyn_uc: per-region boolean masks. Sites set here but not in
                      synnonsyn form the 'syn-overlaps' panel, sites not set
                      here form the 'nonsyn' panel.
        mut_rates:    per-region arrays of mutation rates.
        fname:        if not None, the figure is saved as fname + '.png',
                      '.svg' and '.pdf'.
        ref:          optional reference object. If given, the clipped costs
                      are written into ref.fitness_cost (created with the
                      shape of ref.entropy if missing) at the positions
                      ref.annotation[region].location.start/end.
    '''
    from util import add_panel_label

    if ref is not None and not hasattr(ref, 'fitness_cost'):
        ref.fitness_cost = np.zeros_like(ref.entropy)

    fig, axs = plt.subplots(1, 3, sharey=True, figsize=(10, 6))

    if isinstance(regions, str):  # if only one region is passed as string
        regions = [regions]

    for ni, ax, label_str in ((0, axs[0], 'synonymous'),
                              (1, axs[1], 'syn-overlaps'),
                              (2, axs[2], 'nonsyn')):
        costs_by_region = []
        for region in regions:
            if label_str == 'synonymous':
                ind = synnonsyn[region]
            elif label_str == 'syn-overlaps':
                ind = synnonsyn_uc[region] & (~synnonsyn[region])
            else:
                ind = ~synnonsyn_uc[region]
            ind = ind & (~np.isnan(minor_af[region]))

            # Costs are computed and clipped per region: the mask `ind` only
            # matches the sites of this region, so the values written into
            # the reference must have exactly ind.sum() entries.
            s_region = np.array(mut_rates[region][ind] / (minor_af[region][ind] + af_cutoff))
            s_region[s_region >= 0.1] = 0.1
            s_region[s_region <= 0.001] = 0.001

            if ref is not None:
                bg = ref.annotation[region].location.start
                ed = ref.annotation[region].location.end
                # basic slicing returns a view, so this writes into ref.fitness_cost
                ref.fitness_cost[bg:ed][ind] = s_region

            costs_by_region.append(s_region)

        # pooled costs of all regions for this panel (empty if no regions)
        s = np.concatenate(costs_by_region) if costs_by_region else np.array([])

        if len(s):
            ax.hist(s, color=cols[ni],
                    weights=np.ones(len(s), dtype=float) / len(s),
                    bins=np.logspace(-3, -1, 11),
                    label=label_str + ', n=' + str(len(s)))
        ax.set_xscale('log')
        ax.tick_params(labelsize=fs * 0.8)
        ax.text(0.1, 0.8, 'position: ' + str(ni))
        if ni == 0:
            ax.set_ylabel('fraction of sites', fontsize=fs)
            ax.set_yscale('linear')
        ax.set_xlabel('fitness cost', fontsize=fs)
        # the outermost ticks stand for the clipped tails of the distribution
        ax.set_xticks([0.001, 0.01, 0.1])
        ax.set_xticklabels([r'$<10^{-3}$', r'$10^{-2}$', r'$>10^{-1}$'])
        ax.legend(loc=2, fontsize=fs * 0.8)
        add_panel_label(ax, ['A', 'B', 'C'][ni], x_offset=-0.2 - 0.1 * (ni == 0))

    plt.tight_layout()
    if fname is not None:
        for ext in ['png', 'svg', 'pdf']:
            plt.savefig(fname + '.' + ext)
def plot_figure_1(data, mu, dmulog10, muA, dmuAlog10,suffix=''):
    '''Plot figure 1 of the paper.

    Panels A and B (top row) are drawn by plot_mutation_increase, panel C
    (bottom row, full width) by plot_mutation_rate_graph. The figure is
    shown interactively and saved as ../figures/figure_1<suffix>.svg/.png/.pdf.
    '''
    print('Plot Figure 1')

    fig = plt.figure(figsize=(12, 11))
    grid_shape = (2, 2)
    ax_top_left = plt.subplot2grid(grid_shape, (0, 0))
    ax_top_right = plt.subplot2grid(grid_shape, (0, 1))
    ax_bottom = plt.subplot2grid(grid_shape, (1, 0), colspan=2)

    # linear regression panels
    plot_mutation_increase(data, mu=mu, axs=[ax_top_left, ax_top_right])

    # matrix of arrows, fed with all rate estimates in one table
    rate_columns = {
        'mu': mu,
        'muA': muA,
        'dmulog10': dmulog10,
        'dmuAlog10': dmuAlog10,
    }
    mu_all = pd.DataFrame(rate_columns)
    plot_mutation_rate_graph(mu_all, ax=ax_bottom)

    plt.tight_layout()

    # panel labels are added after the layout has been fixed
    from util import add_panel_label
    for axis, letter, shift in ((ax_top_left, 'A', -0.2),
                                (ax_top_right, 'B', -0.2),
                                (ax_bottom, 'C', -0.08)):
        add_panel_label(axis, letter, x_offset=shift)

    plt.ion()
    plt.show()

    for ext in ['svg', 'png', 'pdf']:
        fig.savefig('../figures/figure_1' + suffix + '.' + ext)
def plot_figure_1(data, mu, dmulog10, muA, dmuAlog10, suffix=''):
    '''Plot figure 1 of the paper.

    Draws the regression panels (A, B) and the mutation rate graph (C),
    shows the figure and writes it to ../figures/figure_1<suffix>.<ext>
    for the extensions svg, png and pdf.
    '''
    print('Plot Figure 1')

    fig = plt.figure(figsize=(12, 11))
    panels = {
        'A': plt.subplot2grid((2, 2), (0, 0)),
        'B': plt.subplot2grid((2, 2), (0, 1)),
    }
    panels['C'] = plt.subplot2grid((2, 2), (1, 0), colspan=2)

    # plot linear regression
    plot_mutation_increase(data, mu=mu, axs=[panels['A'], panels['B']])

    # plot matrix of arrows
    mu_all = pd.DataFrame({'mu': mu,
                           'muA': muA,
                           'dmulog10': dmulog10,
                           'dmuAlog10': dmuAlog10})
    plot_mutation_rate_graph(mu_all, ax=panels['C'])

    plt.tight_layout()

    # Add labels
    from util import add_panel_label
    label_offsets = [('A', -0.2), ('B', -0.2), ('C', -0.08)]
    for letter, x_offset in label_offsets:
        add_panel_label(panels[letter], letter, x_offset=x_offset)

    plt.ion()
    plt.show()

    file_stem = '../figures/figure_1' + suffix
    for ext in ['svg', 'png', 'pdf']:
        fig.savefig(file_stem + '.' + ext)
def plot_to_away(data, fig_filename=None, figtypes=['.png', '.svg', '.pdf'], sequence_type='nuc'):
    '''Makes a two panel figure summarizing the results on reversion

    Panel A (left) shows divergence from the founder sequence over time,
    panel B (right) divergence versus variability, each separately for sites
    where the founder agrees/disagrees with the subtype or group M consensus.
    Reversion statistics are printed to stdout.

    Args:
        data (dict): data to be plotted. Keys read here:
            data[subtype]['minor_variants'], data[subtype]['to_away'] and
            data[subtype]['consensus_distance'] for subtype in
            ('patient', 'any'), plus 'to_histogram', 'away_histogram',
            'time_bins' and 'af_bins'.
        fig_filename (str or None): file name without extension. If None,
            the figure is only shown and not saved.
        figtypes (list): extensions (including the dot) to save the figure as.
        sequence_type (str): 'nuc' selects the nucleotide entropy bins and
            y-limits, anything else the alternative set.
    '''
    import seaborn as sns
    from matplotlib import pyplot as plt
    plt.ion()
    sns.set_style('darkgrid')
    fs = fig_fontsize
    fig_size = (1.0*fig_width, 0.6*fig_width)
    fig, axs = plt.subplots(1, 2, figsize=fig_size)

    nbs = 100  # number of bootstrap replicates
    # set the colors for the plots, both panels use the same color scheme
    cols = HIVEVO_colormap()
    colors = [cols(x) for x in [0.0, 0.33, 0.66, 0.99]]

    ####################################################################################
    # make panel divergence vs entropy
    ####################################################################################
    ax = axs[1]
    if sequence_type == 'nuc':
        Sbins = np.array([0, 0.02, 0.08, 0.25, 2])
    else:
        Sbins = np.array([0, 0.1, 0.3, 3])
    Sbinc = 0.5*(Sbins[1:]+Sbins[:-1])

    def get_Sbin_mean(df):
        # regroup and calculate mean in entropy bins
        return df.groupby(by=['S_bin'], as_index=False).mean()

    af_columns = ['af_away_minor', 'af_away_derived', 'af_to_minor', 'af_to_derived']
    color_count = 0
    for lblstr, subtype, ls in [('subtype', 'patient', '--'), ('group M', 'any', '-')]:
        mv = data[subtype]['minor_variants']
        # subset to a specific time interval
        mv = mv.loc[(mv.loc[:, 'time'] > 1500) & (mv.loc[:, 'time'] < 2500), :]
        print("average time: %s" % (mv.loc[:, 'time'].mean() / 365.25,))
        mv.loc[:, af_columns] = mv.loc[:, af_columns].astype(float)
        mean_to_away = get_Sbin_mean(mv)
        bs = boot_strap_patients(mv, eval_func=get_Sbin_mean, n_bootstrap=nbs,
                                 columns=af_columns + ['S_bin'])
        print(mean_to_away)

        col = 'af_away_derived'
        ax.errorbar(Sbinc, mean_to_away.loc[:, col],
                    replicate_func(bs, col, np.std, bin_index='S_bin'),
                    ls=ls, lw=3, label='founder = '+lblstr, c=colors[color_count])
        color_count += 1

        col = 'af_to_derived'
        ax.errorbar(Sbinc, mean_to_away.loc[:, col],
                    replicate_func(bs, col, np.std, bin_index='S_bin'),
                    ls=ls, lw=3, label=u'founder \u2260 '+lblstr, c=colors[color_count])
        color_count += 1

    ax.set_yscale('log')
    ax.set_xscale('log')
    ax.set_ylabel('Divergence from founder', fontsize=fig_fontsize)
    ax.set_xlabel('Variability [bits]', fontsize=fig_fontsize)
    add_panel_label(ax, 'B', x_offset=-0.32)
    for item in ax.get_yticklabels()+ax.get_xticklabels():
        item.set_fontsize(fs-2)
    ax.set_xlim([0.005, 2])

    ####################################################################################
    # print reversion statistics
    ####################################################################################
    def get_time_bin_means(df):
        # get mean of divergence, reversion divergence and time for each time bin
        return df.loc[:, ['divergence', 'reversion', 'time_bin']].groupby(by=['time_bin'], as_index=False).mean()

    for subtype in ['patient', 'any']:
        to_away = data[subtype]['to_away']
        stat_time_bins = np.array([0, 500, 1000, 1500, 2500, 3500])
        add_binned_column(to_away, stat_time_bins, 'time')
        to_away.loc[:, ['reversion', 'divergence']] = \
            to_away.loc[:, ['reversion', 'divergence']].astype(float)
        rev_div = get_time_bin_means(to_away)
        bs = boot_strap_patients(to_away, get_time_bin_means, n_bootstrap=nbs,
                                 columns=['reversion', 'divergence', 'time_bin'])
        reversion_std = replicate_func(bs, 'reversion', np.std, bin_index='time_bin')
        total_div_std = replicate_func(bs, 'divergence', np.std, bin_index='time_bin')
        fraction = rev_div.loc[:, 'reversion']/rev_div.loc[:, 'divergence']

        print("Comparison: %s" % (subtype,))
        print("Reversions:\n%s" % (rev_div.loc[:, 'reversion'],))
        print("Divergence:\n%s" % (rev_div.loc[:, 'divergence'],))
        # print the fraction of divergence that is due to reversion at different times
        # gives errors as standard deviations over patient bootstraps
        print("Fraction:")
        for frac, total, num_std, denom_std in zip(fraction, rev_div.loc[:, 'divergence'],
                                                   reversion_std, total_div_std):
            print("%s +/- %s" % (frac, np.sqrt(num_std**2/total**2 + denom_std**2*frac**2/total**2)))

        print("Consensus!=Founder: %s"
              % (np.mean(list(data[subtype]['consensus_distance'].values())),))

    ####################################################################################
    # make panel divergence vs time
    ####################################################################################
    to_histogram = data['to_histogram']
    away_histogram = data['away_histogram']
    time_bins = data['time_bins']
    af_bins = data['af_bins']
    af_binc = 0.5*(af_bins[1:]+af_bins[:-1])

    def bin_time(freq_arrays, time_bins):
        '''sum up allele frequency histograms corresponding to the same time bin'''
        binned_hists = [np.zeros_like(af_binc) for ti in time_bins[1:]]
        for hists in freq_arrays.values():
            for t, y in hists.items():
                ti = np.searchsorted(time_bins, t)
                # times outside of the binned range are dropped
                if ti > 0 and ti < len(time_bins):
                    binned_hists[ti-1] += y
        return binned_hists

    def get_div(afhist, fixed=False):
        '''return the fraction of fixed alleles or the mean divergence'''
        if fixed:
            return afhist[0]/afhist.sum()
        else:
            return np.array(afhist[:-1]*(1-af_binc[:-1])).sum()/afhist.sum()

    ax = axs[0]
    time_binc = 0.5*(time_bins[1:]+time_bins[:-1])
    n_time_bins = len(time_bins)-1
    color_count = 0
    for subtype, ls in [('patient', '--'), ('any', '-')]:
        group_label = 'group M' if subtype == 'any' else 'subtype'
        for toaway, H in [(u'founder = '+group_label, away_histogram[subtype]),
                          (u'founder \u2260 '+group_label, to_histogram[subtype])]:
            mean_hists = bin_time(H, time_bins)
            div = [get_div(mean_hists[ti]) for ti in range(n_time_bins)]

            # make replicates and calculate bootstrap confidence intervals
            replicates = []
            all_keys = list(H.keys())
            for ri in range(nbs):
                bootstrap_keys = [all_keys[ii] for ii in
                                  np.random.randint(len(all_keys), size=len(all_keys))]
                # NOTE(review): keys drawn more than once collapse to a single
                # entry in this dict, so replicates are not weighted by draw count.
                tmp = bin_time({key: H[key] for key in bootstrap_keys}, time_bins)
                replicates.append([get_div(tmp[ti]) for ti in range(n_time_bins)])
            std_dev = np.array(replicates).std(axis=0)

            ax.errorbar(time_binc/365.25, div, std_dev, ls=ls, lw=3, c=colors[color_count])
            # plot again with label to avoid error bars in legend
            ax.plot(time_binc/365.25, div, label=toaway, ls=ls, lw=3, c=colors[color_count])
            color_count += 1

    if sequence_type == 'nuc':
        ax.set_ylim([0, 0.16])
        ax.set_yticks([0, 0.04, 0.08, 0.12])
    else:
        ax.set_ylim([0, 0.32])
        ax.set_yticks([0, 0.08, 0.16, 0.24])
    ax.set_xlabel('ETI [years]', fontsize=fs)
    ax.set_ylabel('Divergence from founder', fontsize=fs)
    ax.legend(loc=2, fontsize=fs-2, labelspacing=0)
    add_panel_label(ax, 'A', x_offset=-0.32)
    ax.tick_params(axis='both', labelsize=fs-2)

    plt.tight_layout(pad=0.3, h_pad=0.5)
    # fig_filename defaults to None: only save when a name was actually given
    if fig_filename is not None:
        for ext in figtypes:
            fig.savefig(fig_filename+ext)
    else:
        plt.ion()
        plt.show()
def plot_divdiv(data, fig_filename=None, figtypes=['.png', '.svg', '.pdf']):
    '''
    plot divergence and diversity of synonymous and nonsynonymous mutations
    includes:
     - panels A/B: divergence and diversity over time by region and mutation class
     - panel C: a panel that compares syn diversity/nonsyn divergence
     - panel D: site frequency spectra of syn and nonsyn SNPs

    Args:
        data (dict): keys read here are 'divdiv' (table with the columns
            region, mutclass, time, diversity, divergence, pcode),
            'divdiv_corr' (three sequences: nonsyn divergence, nonsyn
            diversity, syn diversity) and 'sfs' (with entries 'bins', 'syn',
            'nonsyn').
        fig_filename (str or None): file name without extension. The figure
            is saved with every extension in figtypes and the plotted numbers
            are written to <fig_filename>_AB.tsv, _C.tsv and _D.tsv. If None,
            nothing is written and the figure is shown instead.
        figtypes (list): extensions (including the dot) of the saved figure.
    '''
    n_bootstrap = 50

    def write_tsv(suffix, rows):
        # The tables accompany the saved figure; without a file name there is
        # nowhere to put them.
        if fig_filename is None:
            return
        with open(fig_filename + suffix, 'w') as tsv_out:
            for row in rows:
                tsv_out.write('\t'.join(map(str, row)) + '\n')

    ####### plotting ###########
    import seaborn as sns
    from matplotlib import pyplot as plt
    plt.ion()
    sns.set_style('darkgrid')
    fs = fig_fontsize
    fig_size = (fig_width, 1.0 * fig_width)
    cols = HIVEVO_colormap()
    fig, axs = plt.subplots(2, 2, figsize=fig_size)

    divdiv = data['divdiv']
    # in rough the order in which they most dominantly appear in the plot
    regions = ['envelope', 'accessory', 'structural', 'enzymes']
    time_bins = np.array([0, 200, 500, 1000, 1500, 2000, 4000])
    time_binc = 0.5 * (time_bins[:-1] + time_bins[1:])
    add_binned_column(divdiv, time_bins, 'time')

    # map the regions to rough genomic order to match the genome color map in panel C
    colors = {reg: c for reg, c in
              zip(regions, [cols(x) for x in [0.66, 0.99, 0.01, 0.33]])}

    def get_time_bin_mean(df):
        return df.loc[:, ['time_bin', 'diversity', 'divergence']].groupby(
            by=['time_bin'], as_index=False).mean()

    def label_func(mutclass, region, divordiv):
        # assign labels to Panels A and B separately to make a combinatorial
        # legend (regions vs syn/nonsyn)
        if divordiv == 'divergence' and mutclass == 'nonsyn':
            return region
        elif divordiv == 'diversity' and region == 'accessory':
            return mutclass
        else:
            return None

    ########## panel A and B #####################
    rows_ab = []
    for ax, dtype in zip(axs[0, :], ['divergence', 'diversity']):
        add_panel_label(ax, 'A' if dtype == 'divergence' else 'B', x_offset=-0.3)
        for mutclass in ['nonsyn', 'syn']:
            line_style = '-' if mutclass == 'nonsyn' else '--'
            for region in regions:
                ind = (divdiv.loc[:, 'region'] == region) & (divdiv.loc[:, 'mutclass'] == mutclass)
                tmp = divdiv.loc[ind, ['time_bin', 'diversity', 'divergence', 'pcode']]
                avg_divdiv = get_time_bin_mean(tmp)
                bs = boot_strap_patients(tmp, eval_func=get_time_bin_mean,
                                         n_bootstrap=n_bootstrap)
                # plot the same line with and without error bars, labels for legend without
                ax.plot(time_binc / 365.25, avg_divdiv.loc[:, dtype],
                        ls=line_style, c=colors[region], lw=3,
                        label=label_func(mutclass, region, dtype))
                ax.errorbar(time_binc / 365.25, avg_divdiv.loc[:, dtype],
                            replicate_func(bs, dtype, np.std, bin_index='time_bin'),
                            ls=line_style, c=colors[region], lw=3)
                rows_ab.append([dtype, mutclass, region] + list(avg_divdiv.loc[:, dtype]))

        ax.legend(loc=2, fontsize=fs - 1, numpoints=2, labelspacing=0)
        ax.set_xticks([0, 2, 4, 6, 8])
        if dtype == 'divergence':
            ax.set_yticks([0, .02, .04])
            ax.set_ylim([0, .048])
        else:
            ax.set_yticks([0, .01, .02])
            ax.set_ylim([0, .028])
        ax.set_xlim([0, 8.5])
        ax.set_ylabel(dtype)
        ax.tick_params(labelsize=fs - 2)
        ax.set_xlabel('Years since EDI', fontsize=fs)
    write_tsv('_AB.tsv', rows_ab)

    ########## panel C: anti correlation of syn diversity and nonsyn divergence #############
    (avg_nonsyn_divg, avg_nonsyn_divs, avg_syn_divs) = data['divdiv_corr']
    ax = axs[1, 0]
    add_panel_label(ax, 'C', x_offset=-0.3)
    # only every 500th entry is plotted and exported
    x_data, y_data = avg_nonsyn_divg[::500], avg_syn_divs[::500]
    ax.scatter(x_data, y_data,
               c=[cols(p) for p in np.linspace(0, 1, len(x_data))], s=50)
    write_tsv('_C.tsv', [["nonsyn_divergence"] + list(x_data),
                         ["syn_diversity"] + list(y_data)])
    ax.set_xlabel('nonsyn divergence', fontsize=fig_fontsize)
    ax.set_ylabel('syn diversity', fontsize=fig_fontsize)
    ax.set_ylim([0, 0.028])
    ax.set_xlim([0, 0.012])
    ax.set_xticks([0, 0.005, 0.01])
    ax.set_yticks([0, 0.01, 0.02])
    ax.tick_params(labelsize=fig_fontsize - 2)

    ########## sfs in panel D ##############
    sfs = data['sfs']
    ax = axs[1, 1]
    add_panel_label(ax, 'D', x_offset=-0.3)
    sfs_colors = sns.color_palette(n_colors=2)
    binc = 0.5 * (sfs['bins'][1:] + sfs['bins'][:-1])
    syn_fraction = sfs['syn'] / np.sum(sfs['syn'])
    nonsyn_fraction = sfs['nonsyn'] / np.sum(sfs['nonsyn'])
    # syn bars are shifted to the left so that both classes sit side by side
    ax.bar(binc - 0.045, syn_fraction, width=0.04, label='syn', color=sfs_colors[0])
    ax.bar(binc, nonsyn_fraction, width=0.04, label='nonsyn', color=sfs_colors[1])
    write_tsv('_D.tsv', [["bin_centers"] + list(binc),
                         ["sfs_nonsyn"] + list(nonsyn_fraction),
                         ["sfs_syn"] + list(syn_fraction)])
    ax.set_ylim([0.005, 2.0])
    ax.set_yscale('log')
    ax.set_xlabel('Frequency', fontsize=fs)
    ax.set_ylabel('Fractions of SNPs', fontsize=fs)
    ax.legend(loc=1, fontsize=fs - 2)
    ax.tick_params(labelsize=fig_fontsize - 2)

    # finalize and save the figure
    plt.tight_layout(rect=(0.0, 0.02, 0.98, 0.98), pad=0.05, h_pad=0.5, w_pad=0.4)
    if fig_filename is not None:
        for ext in figtypes:
            fig.savefig(fig_filename + ext)
    else:
        plt.ion()
        plt.show()
def plot_allele_frequency_overlap(data, title='', VERBOSE=0, use_logit=False, fig_filename=None, separate_axes=False):
    '''Plot allele frequency in the overlap regions

    Every entry of data contributes one scatter of its two frequency arrays
    (datum['af'][0] against datum['af'][1], both masked arrays), colored by
    datum['io']. Points that are masked in either array or lie outside of
    [xmin, 1 - xmin] are dropped. With separate_axes each entry gets its own
    panel of a 3 + 2 layout, otherwise all entries share one axes labelled 'C'.
    The figure is saved as fig_filename + .pdf/.svg/.png and closed, or shown
    if fig_filename is None.
    '''
    if VERBOSE >= 2:
        print('Plot allele frequency in overlaps')

    import matplotlib.pyplot as plt
    import seaborn as sns
    from util import add_panel_label

    sns.set_style('darkgrid')
    colors = sns.color_palette('Set1', 5)
    fs = fig_fontsize
    xmin = 1e-3

    if separate_axes:
        fig = plt.figure(figsize=(1.5 * fig_width, 1.5 * 0.8 * fig_width))
        lpad = 0.07
        hpad = 0.05
        vpad = 0.05
        width = (1.0 - lpad - 3 * hpad) / 3
        height = (1.0 - 4 * vpad) / 2
        # three panels in the upper row, two in the lower row
        rects = [
            [lpad, 0.5 + vpad, width, height],
            [lpad + hpad + width, 0.5 + vpad, width, height],
            [lpad + 2 * hpad + 2 * width, 0.5 + vpad, width, height],
            [0.5 * (1 - hpad) - width, vpad, width, height],
            [0.5 * (1 + hpad), vpad, width, height],
        ]
        axs = [fig.add_axes(rect) for rect in rects]
    else:
        fig, ax = plt.subplots(figsize=(fig_width, 0.8 * fig_width))
        axs = [ax]

    # NOTE (Fabio): the logit scale patch is in matplotlib master, but setting
    # the scale here has been seen to crash the figure while a manual
    # plt.yscale('logit') works. This is aesthetics only and does not affect
    # any data.
    if use_logit:
        for iax, ax in enumerate(axs):
            ax.set_xlim(xmin, 1 - xmin)
            ax.set_ylim(xmin, 1 - xmin)
            ax.set_xscale('logit')
            ax.set_yscale('logit')
            ax.xaxis.set_tick_params(labelsize=fs)
            ax.yaxis.set_tick_params(labelsize=fs)
            # only the leftmost panel of each row keeps its y tick labels
            if iax not in (0, 3):
                ax.set_yticklabels([])

    last_index = len(data) - 1
    for ida, datum in enumerate(data):
        if separate_axes:
            ax = axs[ida]

        afjoint = datum['af']
        color = colors[datum['io']]

        x = afjoint[0].ravel()
        y = afjoint[1].ravel()

        # first drop everything that is masked in either fragment ...
        unmasked = ~(x.mask | y.mask)
        x, y = x[unmasked], y[unmasked]

        # ... then everything outside of the plotted frequency window
        in_window = (x >= xmin) & (x <= 1 - xmin) & (y >= xmin) & (y <= 1 - xmin)
        x, y = x[in_window], y[in_window]

        ax.scatter(x, y, s=50, color=color, alpha=0.7, edgecolor='none')

        # diagonal: once per panel, or once after the last scatter on shared axes
        if separate_axes or (ida == last_index):
            ax.plot([xmin, 1 - xmin], [xmin, 1 - xmin], lw=2, color='k', alpha=0.5)

    if not separate_axes:
        add_panel_label(ax, 'C', x_offset=-0.22)

    if title:
        ax.set_title(title)

    if not separate_axes:
        plt.tight_layout()

    if fig_filename is None:
        plt.ion()
        plt.show()
    else:
        for ext in ['.pdf','.svg', '.png']:
            fig.savefig(fig_filename+ext)
        plt.close(fig)
def plot_allele_frequency_overlap(data, title='', VERBOSE=0, use_logit=False, fig_filename=None, separate_axes=False):
    '''Plot allele frequency in the overlap regions

    Scatters the first against the second frequency array of every entry in
    data (datum['af'], masked arrays), using the palette color with index
    datum['io'], and adds the diagonal as a guide. Either all entries go on
    one axes (panel label 'C') or, with separate_axes, on the panels of a
    five panel figure. With fig_filename the figure is written as pdf, svg
    and png and closed; without it the figure is shown.
    '''
    if VERBOSE >= 2:
        print('Plot allele frequency in overlaps')

    import matplotlib.pyplot as plt
    import seaborn as sns
    from util import add_panel_label

    sns.set_style('darkgrid')
    colors = sns.color_palette('Set1', 5)
    fs = fig_fontsize
    xmin = 1e-3

    def keep_plottable(x, y):
        # drop points masked in either array, then points outside the window
        ind = ~(x.mask | y.mask)
        x = x[ind]
        y = y[ind]
        ind = (x >= xmin) & (x <= 1 - xmin) & (y >= xmin) & (y <= 1 - xmin)
        return x[ind], y[ind]

    if not separate_axes:
        fig, ax = plt.subplots(figsize=(fig_width, 0.8 * fig_width))
        axs = [ax]
    else:
        fig = plt.figure(figsize=(1.5 * fig_width, 1.5 * 0.8 * fig_width))
        lpad = 0.07
        hpad = 0.05
        vpad = 0.05
        width = (1.0 - lpad - 3 * hpad) / 3
        height = (1.0 - 4 * vpad) / 2
        # (left, bottom) corners: upper row with three panels, lower row with two
        corners = [
            (lpad, 0.5 + vpad),
            (lpad + hpad + width, 0.5 + vpad),
            (lpad + 2 * hpad + 2 * width, 0.5 + vpad),
            (0.5 * (1 - hpad) - width, vpad),
            (0.5 * (1 + hpad), vpad),
        ]
        axs = []
        for left, bottom in corners:
            axs.append(fig.add_axes([left, bottom, width, height]))

    # NOTE (Fabio): my logit patch has made it to matplotlib master but there
    # is a weird bug here that crashes the figure, whereas calling
    # plt.yscale('logit') manually works. Anyway, this is only aesthetics and
    # does not affect any data.
    if use_logit:
        for iax, ax in enumerate(axs):
            ax.set_xlim(xmin, 1 - xmin)
            ax.set_ylim(xmin, 1 - xmin)
            ax.set_xscale('logit')
            ax.set_yscale('logit')
            ax.xaxis.set_tick_params(labelsize=fs)
            ax.yaxis.set_tick_params(labelsize=fs)
            if iax not in (0, 3):
                # inner panels do not repeat the y tick labels
                ax.set_yticklabels([])

    diagonal = [xmin, 1 - xmin]
    for ida, datum in enumerate(data):
        if separate_axes:
            ax = axs[ida]

        afjoint = datum['af']
        color = colors[datum['io']]
        x, y = keep_plottable(afjoint[0].ravel(), afjoint[1].ravel())
        ax.scatter(x, y, s=50, color=color, alpha=0.7, edgecolor='none')

        is_last = (ida == len(data) - 1)
        if separate_axes or is_last:
            ax.plot(diagonal, diagonal, lw=2, color='k', alpha=0.5)

    if not separate_axes:
        add_panel_label(ax, 'C', x_offset=-0.22)

    if title:
        ax.set_title(title)

    if not separate_axes:
        plt.tight_layout()

    if fig_filename is not None:
        for ext in ['.pdf', '.svg', '.png']:
            fig.savefig(fig_filename + ext)
        plt.close(fig)
    else:
        plt.ion()
        plt.show()
def plot_subtype_correlation(data, fig_filename=None, figtypes=['.png', '.svg', '.pdf']):
    '''Plot results

    Panel A: mean and standard deviation of 'rho' per patient and time point,
    taken from data['correlations'] (columns pcode, time, rho).
    Panel B: mean and standard deviation of the columns S1..S4 of
    data['diverse_fraction'] in 500 day time bins. A 'time_bin' column is
    added to that table in place.
    The figure is saved as fig_filename + ext for every ext in figtypes, or
    shown if fig_filename is None.
    '''
    import seaborn as sns
    from matplotlib import pyplot as plt
    plt.ion()
    sns.set_style('darkgrid')
    fs = fig_fontsize
    fig, axs = plt.subplots(1, 2, figsize=(fig_width, 0.5*fig_width))

    # ---- panel A: correlation with subtype diversity over time ----
    ax = axs[0]
    add_panel_label(ax, 'A', x_offset=-0.15)
    correlations = data['correlations']
    # patient codes are sorted by the number following their first character
    patients = sorted(correlations['pcode'].unique(), key=lambda x: int(x[1:]))
    colors = patient_colors

    # calculate mean and variance across regions for each time point and patient
    per_sample = correlations.groupby(by=['time', 'pcode'], as_index=False)
    mean_rho = per_sample.mean().groupby('pcode')
    var_rho = correlations.groupby(by=['time', 'pcode'], as_index=False).var().groupby('pcode')

    # loop over patients and plot the mean/std of the previously grouped data
    for pat in patients:
        pat_mean = mean_rho.get_group(pat)
        pat_var = var_rho.get_group(pat)
        ax.errorbar(np.array(pat_mean['time']/365.25),
                    np.array(pat_mean['rho']),
                    yerr=np.array(np.sqrt(pat_var['rho'])),
                    color=colors[pat], ls="none", markersize=8, marker='o', label=pat)

    ax.legend(loc=2, fontsize=fs-3, ncol=2, labelspacing=0.1, columnspacing=0.1)
    ax.set_yticks([0, 0.25, 0.5])
    ax.set_xticks([0, 2, 4, 6, 8])
    ax.set_xlim([-2, 8.5])
    ax.set_ylim([-0.1, 0.8])
    ax.set_xlabel('ETI [years]', fontsize=fs)
    ax.set_title(r"Spearman's $\rho$", fontsize=fs)
    for item in ax.get_xticklabels()+ax.get_yticklabels():
        item.set_fontsize(fs)

    # ---- panel B: fraction of variable sites by entropy bin ----
    ax = axs[1]
    add_panel_label(ax, 'B', x_offset=-0.15)
    div = data['diverse_fraction']
    colors = sns.color_palette(n_colors=4)

    # add a time bin column; times beyond the edges go into the outermost bins
    time_bins = np.arange(0, 4000, 500)
    binc = 0.5*(time_bins[1:] + time_bins[:-1])
    raw_bin = np.searchsorted(time_bins, div["time"])-1
    div.loc[:, 'time_bin'] = np.minimum(len(time_bins)-2, np.maximum(0, raw_bin))

    for i in range(4):
        ent = 'S'+str(i+1)
        div.loc[:, ent] = div.loc[:, ent].astype(float)
        # calculate mean and variance over regions and patients and samples within a time bin
        by_time_bin = div.loc[:, [ent, 'time_bin']]
        mean_div = by_time_bin.groupby(by=['time_bin'], as_index=False).mean()
        var_div = by_time_bin.groupby(by=['time_bin'], as_index=False).var()
        ax.errorbar(np.array(binc/365.25),
                    np.array(mean_div.loc[:, ent]),
                    yerr=np.array(np.sqrt(var_div.loc[:, ent])),
                    label='Q'+str(i+1), c=colors[i])

    ax.set_ylim([0, 0.35])
    ax.set_yticks([0, 0.1, 0.2, 0.3])
    ax.set_xticks([0, 2, 4, 6, 8])
    ax.set_title('Fraction of SNPs > 0.01')
    ax.set_xlabel('ETI [years]', fontsize=fs)
    for item in ax.get_xticklabels()+ax.get_yticklabels():
        item.set_fontsize(fs)
    ax.legend(loc=2, ncol=2, fontsize=fs-3, title='Conservation',
              labelspacing=0.1, columnspacing=0.5)

    # plot output
    plt.tight_layout(rect=(0.0, 0.02, 0.98, 0.98), pad=0.05, h_pad=0.5, w_pad=0.4)
    if fig_filename is None:
        plt.ion()
        plt.show()
    else:
        for ext in figtypes:
            fig.savefig(fig_filename+ext)
aa_mutation_rates, total_nonsyn_mutation_rates = calc_amino_acid_mutation_rates() selcoeff = {} for region in regions: s = fitness_costs_per_site(region, data, total_nonsyn_mutation_rates) s[s>1] = 1 selcoeff[region] = s aa_ref = 'NL4-3' global_ref = HIVreference(refname=aa_ref, subtype=args.subtype) ### FIGURE 5 fig,axs = plt.subplots(1,2, figsize=(10,5)) #fitness_costs_in_optimal_epis(['gag', 'nef'], selcoeff, ax=axs[0]) #add_panel_label(axs[0], 'A', x_offset=-0.15) plot_fraction_associated(regions, selcoeff, associations, axs=axs, slope=2.0) add_panel_label(axs[0], 'A', x_offset=-0.15) region='nef' reference = HIVreferenceAminoacid(region, refname=aa_ref, subtype = args.subtype) tmp, rho, pval = fitness_scatter(region, selcoeff, associations, reference, ax=axs[0]) add_panel_label(axs[1], 'B', x_offset=-0.15) axs[0].legend(loc=3, fontsize=fs) axs[0].set_ylim([0.03,3]) plt.tight_layout() for fmt in ['pdf', 'png', 'svg']: plt.savefig('../figures/figure_5_'+region+'_st_'+args.subtype+'.'+fmt) # calculate corrleations between fitness costs and phenotypes phenotype_correlations = {} erich = np.zeros((2,2,2)) for region in regions:
def plot_fit(data_sat, data_pooled):
    '''Plot the saturation fits and the resulting fitness cost estimates.

    Panel A: rows of data_sat['data_to_fit'] (one per group, columns are the
    x values) as points, with the curve mu / s * (1 - exp(-s * x)) for the
    fitted s of that group from data_sat['s'].
    Panel B: fitness cost per group from the saturation fit (columns 's' and
    'ds' of data_sat['s'], first group skipped) and from the pooled estimate
    (data_pooled['all'] and data_pooled['all_std'], last row skipped).
    '''
    from matplotlib import cm
    from util import add_panel_label

    palette = sns.color_palette('colorblind')
    fig_width = 5
    fs = 16

    fig, axs = plt.subplots(1, 2, figsize=(2 * fig_width, fig_width))

    data_to_fit = data_sat['data_to_fit']
    mu = data_sat['mu']
    s = data_sat['s']

    def saturation(x, cost):
        # frequency rises linearly with slope mu and saturates at mu / cost
        return mu / cost * (1.0 - np.exp(-cost * x))

    # PANEL A: data and fits
    ax = axs[0]
    n_groups = data_to_fit.shape[0]
    xfit = np.linspace(0, 3000)
    for iS, (S, datum) in enumerate(data_to_fit.iterrows()):
        color = cm.jet(1.0 * iS / n_groups)
        # Most conserved group is dashed
        ls = '--' if iS == 0 else '-'

        ax.scatter(np.array(datum.index), np.array(datum), s=70, color=color)
        ax.plot(xfit, saturation(xfit, s.loc[S, 's']), lw=2, color=color, ls=ls)

    ax.set_xlabel('days since EDI', fontsize=fs)
    ax.set_ylabel('average SNP frequency', fontsize=fs)
    ax.set_xlim(-200, 3200)
    ax.set_ylim(-0.0005, 0.025)
    # NOTE(review): these x ticks are replaced by the next call right away;
    # possibly meant for the y axis - confirm with the author.
    ax.set_xticks(np.linspace(0, 0.005, 5))
    ax.set_xticks([0, 1000, 2000, 3000])
    ax.xaxis.set_tick_params(labelsize=fs)
    ax.yaxis.set_tick_params(labelsize=fs)

    ax.text(0, 0.023, r'$\mu = 1.2 \cdot 10^{-5}$ per day', fontsize=16)
    # short line segment with slope mu as a guide to the eye
    guide_start, guide_end = 200, 1300
    ax.plot([guide_start, guide_end],
            [0.007, 0.007 + (guide_end - guide_start) * mu],
            lw=1.5, c='k')

    # PANEL B: costs
    ax = axs[1]

    # B1: Saturation fit, without the first group
    sat_x = np.array(s.index)[1:]
    sat_y = np.array(s['s'])[1:]
    sat_dy = np.array(s['ds'])[1:]
    ax.errorbar(sat_x, sat_y, yerr=sat_dy,
                ls='-', marker='o', lw=2,
                color=palette[0], label='Sat')

    # B2: pooled, without the last row
    pooled = data_pooled['all']
    ax.errorbar(pooled[:-1, 0], pooled[:-1, 1],
                yerr=data_pooled['all_std'][:-1, 1],
                ls='-', marker='o', lw=2,
                color=palette[2], label='Pooled')

    ax.legend(loc='upper right', fontsize=16)
    ax.set_xlabel('variability in group M [bits]', fontsize=fs)
    ax.set_ylabel('fitness cost', fontsize=fs)
    ax.set_xlim(0.9e-3, 2.5)
    ax.set_ylim(9e-5, 0.11)
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.xaxis.set_tick_params(labelsize=fs)
    ax.yaxis.set_tick_params(labelsize=fs)

    # Panel labels
    for panel, letter, x_offset in zip(axs, ('A', 'B'), (-0.27, -0.21)):
        add_panel_label(panel, letter, x_offset=x_offset)

    plt.tight_layout()
    plt.ion()
    plt.show()
from hivevo.hivevo.patients import Patient from hivevo.hivevo.samples import all_fragments from util import store_data, load_data, draw_genome, fig_width, fig_fontsize, add_panel_label # Script if __name__ == "__main__": patients = ['p' + str(i) for i in range(1, 12) if i not in [4, 7]] # make two figures, each showing one method of template quantification sns.set_style('darkgrid') fs = fig_fontsize fig1, ax1 = plt.subplots(figsize=(fig_width, 0.8 * fig_width)) fig2, ax2 = plt.subplots(figsize=(fig_width, 0.8 * fig_width)) add_panel_label(ax1, 'A', x_offset=-0.15) add_panel_label(ax2, 'B', x_offset=-0.15) # define colors for patients and fragments pat_colors = sns.color_palette( sns.color_palette([ '#a6cee3', '#1f78b4', '#b2df8a', '#33a02c', '#fb9a99', '#e31a1c', '#fdbf6f', '#ff7f00', '#cab2d6' ], n_colors=len(patients))) frag_colors = sns.color_palette(n_colors=6) depth_estimates = [] total_viral_load_dilutions_list = [] overlap_dilution_list = {i: [] for i in range(6)} for pi, pcode in enumerate(patients):
def plot_divdiv(data, fig_filename=None, figtypes=['.png', '.svg', '.pdf']):
    '''
    plot divergence and diversity of synonymous and nonsynonymous mutations
    includes:
     - panels A/B: divergence and diversity over time by region and mutation class
     - panel C: a panel that compares syn diversity/nonsyn divergence
     - panel D: site frequency spectra of syn and nonsyn SNPs

    Args:
        data (dict): keys read here are 'divdiv' (table with the columns
            region, mutclass, time, diversity, divergence, pcode),
            'divdiv_corr' (three sequences: nonsyn divergence, nonsyn
            diversity, syn diversity) and 'sfs' (with entries 'bins', 'syn',
            'nonsyn').
        fig_filename (str or None): file name without extension. The figure
            is saved with every extension in figtypes and the plotted numbers
            are written to <fig_filename>_AB.tsv, _C.tsv and _D.tsv. If None,
            nothing is written and the figure is shown instead.
        figtypes (list): extensions (including the dot) of the saved figure.
    '''
    n_bootstrap=50

    def write_tsv(suffix, rows):
        # The tables accompany the saved figure; without a file name there is
        # nowhere to put them.
        if fig_filename is None:
            return
        with open(fig_filename+suffix, 'w') as tsv_out:
            for row in rows:
                tsv_out.write('\t'.join(map(str, row))+'\n')

    ####### plotting ###########
    import seaborn as sns
    from matplotlib import pyplot as plt
    plt.ion()
    sns.set_style('darkgrid')
    fs=fig_fontsize
    fig_size = (fig_width, 1.0*fig_width)
    cols = HIVEVO_colormap()
    fig, axs = plt.subplots(2, 2, figsize=fig_size)

    divdiv = data['divdiv']
    # in rough the order in which they most dominantly appear in the plot
    regions = ['envelope', 'accessory', 'structural', 'enzymes']
    time_bins = np.array([0, 200, 500, 1000, 1500, 2000, 4000])
    time_binc = 0.5*(time_bins[:-1]+time_bins[1:])
    add_binned_column(divdiv, time_bins, 'time')

    # map the regions to rough genomic order to match the genome color map in panel C
    colors = {reg:c for reg, c in zip(regions, [cols(x) for x in [0.66, 0.99, 0.01, 0.33]])}

    def get_time_bin_mean(df):
        return df.loc[:,['time_bin', 'diversity', 'divergence']].groupby(by=['time_bin'], as_index=False).mean()

    def label_func(mutclass, region, divordiv):
        # assign labels to Panels A and B separately to make a combinatorial
        # legend (regions vs syn/nonsyn)
        if divordiv=='divergence' and mutclass=='nonsyn':
            return region
        elif divordiv=='diversity' and region=='accessory':
            return mutclass
        else:
            return None

    ########## panel A and B #####################
    rows_ab = []
    for ax, dtype in zip(axs[0,:], ['divergence', 'diversity']):
        add_panel_label(ax, 'A' if dtype=='divergence' else 'B', x_offset=-0.3)
        for mutclass in ['nonsyn', 'syn']:
            line_style = '-' if mutclass=='nonsyn' else '--'
            for region in regions:
                ind = (divdiv.loc[:,'region']==region) & (divdiv.loc[:,'mutclass']==mutclass)
                tmp = divdiv.loc[ind,['time_bin', 'diversity', 'divergence', 'pcode']]
                avg_divdiv = get_time_bin_mean(tmp)
                bs = boot_strap_patients(tmp, eval_func=get_time_bin_mean,
                                         n_bootstrap=n_bootstrap)
                # plot the same line with and without error bars, labels for legend without
                ax.plot(time_binc/365.25, avg_divdiv.loc[:,dtype],
                        ls=line_style, c=colors[region], lw=3,
                        label=label_func(mutclass, region, dtype))
                ax.errorbar(time_binc/365.25, avg_divdiv.loc[:,dtype],
                            replicate_func(bs, dtype, np.std, bin_index='time_bin'),
                            ls=line_style, c=colors[region], lw=3)
                rows_ab.append([dtype, mutclass, region]+list(avg_divdiv.loc[:,dtype]))

        ax.legend(loc=2, fontsize=fs-1, numpoints=2, labelspacing=0)
        ax.set_xticks([0,2,4,6,8])
        if dtype=='divergence':
            ax.set_yticks([0,.02,.04])
            ax.set_ylim([0,.048])
        else:
            ax.set_yticks([0,.01,.02])
            ax.set_ylim([0,.028])
        ax.set_xlim([0,8.5])
        ax.set_ylabel(dtype)
        ax.tick_params(labelsize=fs-2)
        ax.set_xlabel('Years since EDI', fontsize=fs)
    write_tsv('_AB.tsv', rows_ab)

    ########## panel C: anti correlation of syn diversity and nonsyn divergence #############
    (avg_nonsyn_divg, avg_nonsyn_divs, avg_syn_divs) = data['divdiv_corr']
    ax = axs[1,0]
    add_panel_label(ax, 'C', x_offset=-0.3)
    # only every 500th entry is plotted and exported
    x_data, y_data = avg_nonsyn_divg[::500], avg_syn_divs[::500]
    ax.scatter(x_data, y_data, c=[cols(p) for p in np.linspace(0,1,len(x_data))], s=50)
    write_tsv('_C.tsv', [["nonsyn_divergence"]+list(x_data),
                         ["syn_diversity"]+list(y_data)])
    ax.set_xlabel('nonsyn divergence', fontsize=fig_fontsize)
    ax.set_ylabel('syn diversity', fontsize=fig_fontsize)
    ax.set_ylim([0,0.028])
    ax.set_xlim([0,0.012])
    ax.set_xticks([0, 0.005,0.01])
    ax.set_yticks([0, 0.01, 0.02])
    ax.tick_params(labelsize=fig_fontsize-2)

    ########## sfs in panel D ##############
    sfs = data['sfs']
    ax = axs[1,1]
    add_panel_label(ax, 'D', x_offset=-0.3)
    sfs_colors = sns.color_palette(n_colors=2)
    binc = 0.5*(sfs['bins'][1:]+sfs['bins'][:-1])
    syn_fraction = sfs['syn']/np.sum(sfs['syn'])
    nonsyn_fraction = sfs['nonsyn']/np.sum(sfs['nonsyn'])
    # syn bars are shifted to the left so that both classes sit side by side
    ax.bar(binc-0.045, syn_fraction, width=0.04, label='syn', color=sfs_colors[0])
    ax.bar(binc, nonsyn_fraction, width=0.04, label='nonsyn', color=sfs_colors[1])
    write_tsv('_D.tsv', [["bin_centers"]+list(binc),
                         ["sfs_nonsyn"]+list(nonsyn_fraction),
                         ["sfs_syn"]+list(syn_fraction)])
    ax.set_ylim([0.005,2.0])
    ax.set_yscale('log')
    ax.set_xlabel('Frequency', fontsize=fs)
    ax.set_ylabel('Fractions of SNPs', fontsize=fs)
    ax.legend(loc=1, fontsize=fs-2)
    ax.tick_params(labelsize=fig_fontsize-2)

    # finalize and save the figure
    plt.tight_layout(rect=(0.0, 0.02, 0.98, 0.98), pad=0.05, h_pad=0.5, w_pad=0.4)
    if fig_filename is not None:
        for ext in figtypes:
            fig.savefig(fig_filename+ext)
    else:
        plt.ion()
        plt.show()
from util import store_data, load_data, draw_genome, fig_width, fig_fontsize, add_panel_label # Script if __name__=="__main__": patients = ['p'+str(i) for i in range(1,12) if i not in [4,7]] # make two figures, each showing one method of template quantification sns.set_style('darkgrid') fs=fig_fontsize fig1, ax1 = plt.subplots(figsize=(fig_width, 0.8*fig_width)) fig2, ax2 = plt.subplots(figsize=(fig_width, 0.8*fig_width)) add_panel_label(ax1, 'A', x_offset=-0.15) add_panel_label(ax2, 'B', x_offset=-0.15) # define colors for patients and fragments pat_colors = sns.color_palette(sns.color_palette(['#a6cee3', '#1f78b4', '#b2df8a', '#33a02c', '#fb9a99', '#e31a1c', '#fdbf6f', '#ff7f00', '#cab2d6'], n_colors=len(patients))) frag_colors = sns.color_palette(n_colors=6) depth_estimates = [] total_viral_load_dilutions_list = [] overlap_dilution_list = {i:[] for i in range(6)} for pi, pcode in enumerate(patients):
def plot_subtype_correlation(data, fig_filename=None, figtypes=['.png', '.svg', '.pdf']):
    '''Plot the per-patient correlation (panel A) and variable-site fractions (panel B).

    Panel A: for every patient code in data['correlations'], the mean of the
    'rho' column at each time point, with the square root of the variance as
    error bar. Panel B: for each of the columns 'S1'..'S4' of
    data['diverse_fraction'], mean and standard deviation within 500-day time
    bins (labelled 'Q1'..'Q4' in the legend).

    Parameters:
        data          -- mapping holding the data frames 'correlations'
                         (columns 'time', 'pcode', 'rho' are read) and
                         'diverse_fraction' (columns 'time', 'S1'..'S4')
        fig_filename  -- file name without extension; if None the figure is
                         shown interactively instead of being saved
        figtypes      -- extensions (with leading dot) to save; the default
                         list is only iterated, never modified

    The caller's data frames are left untouched: the time-bin column and the
    float casts are applied to a private copy.
    '''
    import seaborn as sns
    from matplotlib import pyplot as plt
    plt.ion()
    sns.set_style('darkgrid')
    fs = fig_fontsize
    fig_size = (fig_width, 0.5 * fig_width)
    fig, axs = plt.subplots(1, 2, figsize=fig_size)

    # ---- panel A: correlation over time, one series per patient ----
    ax = axs[0]
    add_panel_label(ax, 'A', x_offset=-0.15)
    corr = data['correlations']
    # patient codes look like 'p<number>'; sort them numerically
    patients = sorted(corr['pcode'].unique(), key=lambda x: int(x[1:]))
    colors = patient_colors

    # calculate mean and variance across regions for each time point and patient
    mean_rho = corr.groupby(by=['time', 'pcode'], as_index=False).mean().groupby('pcode')
    var_rho = corr.groupby(by=['time', 'pcode'], as_index=False).var().groupby('pcode')

    # loop over patients and plot the mean/std of the previously grouped data
    for pat in patients:
        pat_mean = mean_rho.get_group(pat)
        pat_var = var_rho.get_group(pat)
        ax.errorbar(np.array(pat_mean['time'] / 365.25),
                    np.array(pat_mean['rho']),
                    yerr=np.array(np.sqrt(pat_var['rho'])),
                    color=colors[pat], ls="none", markersize=8, marker='o',
                    label=pat)

    ax.legend(loc=2, fontsize=fs - 3, ncol=2,
              labelspacing=0.1, columnspacing=0.1)
    ax.set_yticks([0, 0.25, 0.5])
    ax.set_xticks([0, 2, 4, 6, 8])
    ax.set_xlim([-2, 8.5])
    ax.set_ylim([-0.1, 0.8])
    ax.set_xlabel('ETI [years]', fontsize=fs)
    ax.set_title(r"Spearman's $\rho$", fontsize=fs)
    for item in ax.get_xticklabels() + ax.get_yticklabels():
        item.set_fontsize(fs)

    # ---- panel B: fraction of variable sites by entropy bin ----
    ax = axs[1]
    add_panel_label(ax, 'B', x_offset=-0.15)
    # work on a copy so that the helper column and the dtype casts below do
    # not leak into the caller's data frame
    div = data['diverse_fraction'].copy()
    colors = sns.color_palette(n_colors=4)

    # add a time bin column; values outside the bin range are clamped into
    # the first/last bin
    time_bins = np.arange(0, 4000, 500)
    binc = 0.5 * (time_bins[1:] + time_bins[:-1])
    div['time_bin'] = np.minimum(
        len(time_bins) - 2,
        np.maximum(0, np.searchsorted(time_bins, div["time"]) - 1))

    for i in range(4):
        ent = 'S' + str(i + 1)
        div[ent] = div[ent].astype(float)
        # calculate mean and variance over regions and patients and samples
        # within a time bin
        by_bin = div.loc[:, [ent, 'time_bin']].groupby(by=['time_bin'], as_index=False)
        mean_div = by_bin.mean()
        var_div = by_bin.var()
        # groupby only returns bins that contain samples, so pick the
        # matching bin centres instead of assuming every bin is populated
        populated = np.array(mean_div['time_bin'], dtype=int)
        ax.errorbar(np.array(binc[populated] / 365.25),
                    np.array(mean_div.loc[:, ent]),
                    yerr=np.array(np.sqrt(var_div.loc[:, ent])),
                    label='Q' + str(i + 1), c=colors[i])

    ax.set_ylim([0, 0.35])
    ax.set_yticks([0, 0.1, 0.2, 0.3])
    ax.set_xticks([0, 2, 4, 6, 8])
    ax.set_title('Fraction of SNPs > 0.01')
    ax.set_xlabel('ETI [years]', fontsize=fs)
    for item in ax.get_xticklabels() + ax.get_yticklabels():
        item.set_fontsize(fs)
    ax.legend(loc=2, ncol=2, fontsize=fs - 3, title='Conservation',
              labelspacing=0.1, columnspacing=0.5)

    # plot output
    plt.tight_layout(rect=(0.0, 0.02, 0.98, 0.98), pad=0.05, h_pad=0.5, w_pad=0.4)
    if fig_filename is not None:
        for ext in figtypes:
            fig.savefig(fig_filename + ext)
    else:
        plt.ion()
        plt.show()
def plot_non_coding_figure(data, minor_af, synnonsyn, reference, fname=None):
    '''Plot fitness cost at noncoding features.

    Four panels sharing the y-axis are drawn with
    plot_fitness_costs_along_genome, one per window of the reference:
    500-900, 2050-2150 (frameshift), 4750-5000 (cPPT) and 9050-9150.
    In each panel black bars at the lower y-limit mark the flanking
    annotations. If fname is given the figure is saved as png, svg and pdf.
    '''
    from util import add_panel_label

    ymax = 0.25
    ymin = 0.0005
    y_second_gene = 1.18
    second_label_level = y_second_gene + 0.17
    annotation = reference.annotation

    def mark_gene(axis, span, name, anchor, align,
                  bar_level=None, label_level=1.17, width=10):
        '''Draw a bar over span just above the lower y-limit and label it.'''
        floor = axis.get_ylim()[0]
        if bar_level is None:
            heights = floor*np.ones(2)
        else:
            heights = bar_level*floor*np.ones(2)
        axis.plot(span, heights, lw=width, c='k', alpha=0.7)
        axis.text(anchor, axis.get_ylim()[0]*label_level, name,
                  fontsize=fs*0.8, horizontalalignment=align)

    fig, axs = plt.subplots(1, 4, sharey=True, figsize=(10, 5),
                            gridspec_kw={'width_ratios': [4, 1, 2.5, 1]})

    # ---- 5' region ----
    start, stop = 500, 900
    ax = plot_fitness_costs_along_genome(
        start, stop, ['polyA', 'U5', 'U5 stem', 'PBS', 'PSI SL1-4'],
        data, minor_af, reference, pheno=None,
        synnonsyn=synnonsyn['genomewide'], ws=8, ws_syn=4, ax=axs[0])
    # the left-most axis carries label and dimension, the others are tied to it
    ax.set_ylabel('fitness cost [1/day]', fontsize=fs)
    ax.set_ylim(ymin, ymax)
    add_panel_label(ax, 'B', x_offset=-0.15)
    mark_gene(ax, [start, annotation["LTR5'"].location.end], "LTR5'", start, 'left')
    mark_gene(ax, [annotation['gag'].location.start, stop], 'gag', stop, 'right')
    ax.set_ylim(ymin, ymax)

    # ---- frame shift region: no syn fitness cost, this is in an overlap ----
    start, stop = 2050, 2150
    ax = plot_fitness_costs_along_genome(
        start, stop, ['frameshift'],
        data, minor_af, reference, pheno=None,
        ws=8, ws_syn=4, ax=axs[1])
    mark_gene(ax, [start, annotation['gag'].location.end], 'gag', start, 'left')
    mark_gene(ax, [annotation['pol'].location.start, stop], 'pol', stop, 'right',
              bar_level=y_second_gene, label_level=second_label_level, width=5)
    ax.set_xticks([2050, 2150])
    ax.set_ylim(ymin, ymax)

    # ---- cPPT region ----
    start, stop = 4750, 5000
    ax = plot_fitness_costs_along_genome(
        start, stop, ['A1', 'D2', 'cPPT'],
        data, minor_af, reference, pheno=None,
        synnonsyn=synnonsyn['genomewide'], ws=8, ws_syn=4, ax=axs[2])
    ax.set_ylim(ymin, ymax)
    mark_gene(ax, [start, annotation["IN"].location.end], "IN", start, 'left')
    mark_gene(ax, [annotation['vif'].location.start, stop], 'vif', stop, 'right',
              bar_level=y_second_gene, label_level=second_label_level, width=10)
    ax.set_xticks([4800, 4900])
    ax.set_ylim(ymin, ymax)

    # ---- 3' region ----
    start, stop = 9050, 9150
    ax = plot_fitness_costs_along_genome(
        start, stop, ['PPT'],
        data, minor_af, reference, pheno=None,
        synnonsyn=synnonsyn['genomewide'], ws=8, ws_syn=4, ax=axs[3])
    mark_gene(ax, [start, annotation['nef'].location.end], 'nef', start, 'left')
    mark_gene(ax, [annotation["LTR3'"].location.start, stop], "LTR3'", stop, 'right',
              bar_level=y_second_gene, label_level=second_label_level, width=5)
    ax.set_xticks([9050, 9100, 9150])
    ax.set_ylim(ymin, ymax)

    # common x-axis label below all panels
    fig.text(0.5, 0.01, 'Position in HIV-1 reference (HXB2) [bp]',
             ha='center', fontsize=fs)
    plt.tight_layout(rect=(0, 0.04, 1, 1), w_pad=-1)

    if fname is None:
        return
    for ext in ['.png', '.svg', '.pdf']:
        plt.savefig(fname+ext)
def plot_fitness_cost_along_genome(regions, data, minor_af, synnonsyn, reference, ws=30):
    '''Plot smoothed fitness costs along the genome and their per-gene spread.

    The per-site cost is mut_rate/(af_cutoff + minor_af), clamped to the
    interval [0.001, 0.1]. Two figures are written to ../figures/ as png,
    svg and pdf:
      figure_3A_st_<subtype>  running_average (window ws) of the log cost,
                              exponentiated, for nonsynonymous (solid) and
                              synonymous (dashed) sites of every region,
                              above a genome annotation track
      figure_S6_st_<subtype>  split violin plots of log10 cost per gene;
                              vpr and vpu are left out
    '''
    from util import add_panel_label

    fig, axs = plt.subplots(2, 1, sharex=True,
                            gridspec_kw={'height_ratios':[6, 1]})
    cost_axis = axs[0]

    # Fitness costs along the genome. Per-site values for the violin plot
    # are gathered during the first (nonsynonymous) pass only, so every
    # position is recorded once.
    per_site = []
    for collect, label_str in ((1, 'nonsynonymous'), (0, 'synonymous')):
        syn_pass = (label_str == 'synonymous')
        if syn_pass:
            line_style = '--'
        else:
            line_style = '-'

        for ri, region in enumerate(regions):
            if syn_pass:
                site_mask = synnonsyn[region]
            else:
                site_mask = ~synnonsyn[region]
            site_mask = site_mask&(~np.isnan(minor_af[region]))

            cost = data['mut_rate'][region]/(af_cutoff+minor_af[region])
            cost[cost>0.1] = 0.1
            cost[cost<0.001] = 0.001

            positions = np.array(list(reference.annotation[region]))
            smooth_pos = running_average(positions[site_mask], ws)
            smooth_cost = np.exp(running_average(np.log(cost[site_mask]), ws))

            # only one region contributes legend entries
            if region == 'gag':
                legend_entry = label_str
            else:
                legend_entry = None
            cost_axis.plot(smooth_pos, smooth_cost,
                           c=cols[ri%len(cols)],
                           ls=line_style,
                           label=legend_entry)

            if collect and region not in ['vpr', 'vpu']:
                for pos in range(len(cost)):
                    per_site.append((region, pos, np.log10(cost[pos]),
                                     synnonsyn[region][pos]))

    cost_axis.legend(loc=1, fontsize=fs*0.8)
    cost_axis.set_yscale('log')
    cost_axis.set_ylabel('fitness cost [1/day]', fontsize=fs)
    cost_axis.set_ylim(0.002, 0.25)
    cost_axis.tick_params(labelsize=fs*0.8)

    # The genome annotations
    regs = ['p17', 'p6', 'p7', 'p24', 'PR', 'RT', 'IN', 'p15', 'nef',
            'gp120', 'gp41', 'vif', 'vpu', 'vpr', 'rev', 'tat',
            'V1', 'V2', 'V3', 'V5']
    annotations = {name: feature
                   for name, feature in reference.annotation.iteritems()
                   if name in regs}
    annotations = draw_genome(annotations, axs[1])
    axs[1].set_axis_off()

    # vertical guide lines at the boundaries of the major features
    feas = ['p17', 'p24', 'PR', 'RT', 'p15', 'IN', 'vif', 'gp120', 'gp41', 'nef']
    is_major = annotations['name'].isin(feas)
    vlines = np.unique(annotations.loc[is_major, ['x1', 'x2']])
    for boundary in vlines:
        cost_axis.axvline(boundary, lw=1, color='0.8')

    plt.tight_layout()
    add_panel_label(cost_axis, 'A', x_offset=-0.1)
    for ext in ['png', 'svg', 'pdf']:
        fig.savefig('../figures/figure_3A_st_' + reference.subtype + '.'+ext)

    # Violin plots of the fitness cost distributions for syn and nonsyn
    violin_data = pd.DataFrame(data=per_site,
                               columns=['gene', 'position', 'selection', 'synonymous'])
    # turn the boolean flag into the labels used as hue categories
    violin_data.loc[violin_data['synonymous'] == True, 'synonymous'] = 'synonymous'
    violin_data.loc[violin_data['synonymous'] == False, 'synonymous'] = 'nonsynonymous'

    fig = plt.figure()
    ax = sns.violinplot(x='gene', y='selection', hue='synonymous',
                        data=violin_data, inner='quartile',
                        split=True, cut=0, scale='area')
    exponents = [-3, -2, -1]
    ax.set_yticks(exponents)
    ax.set_yticklabels([r'$10^{'+str(i)+'}$' for i in exponents])
    ax.tick_params(labelsize=0.8*fs)
    ax.set_ylabel('fitness cost [1/day]', fontsize=fs)
    ax.set_xlabel('')
    ax.set_ylim(-3, -0.5)
    ax.legend(loc=1, fontsize=fs, title=None)
    plt.tight_layout()
    for ext in ['png', 'svg', 'pdf']:
        fig.savefig('../figures/figure_S6_st_' + reference.subtype +'.'+ext)
def plot_minor_allele_example(data, title='', VERBOSE=0, fig_filename=None):
    '''Plot minor allele frequencies of example samples.

    The left panel is a scatter plot of datum['freq_minor'] against its
    array index for every datum in data; the right panel shows histograms
    of the same frequencies in 26 log-spaced bins between 1e-4 and 1,
    drawn as horizontal bars on the shared (log) frequency axis.

    Parameters:
        data          -- sequence of at most two mappings with the key
                         'freq_minor' (marker, colour and alpha are defined
                         for two entries)
        title         -- optional figure title
        VERBOSE       -- if truthy, print a progress message
        fig_filename  -- file name without extension; if given the figure
                         is saved as pdf, svg and png and closed, otherwise
                         it is shown interactively

    Axis limits are derived from all datasets, and are left at their
    defaults when there is nothing to derive them from (no data, or no
    nonzero frequency / empty histogram).
    '''
    import matplotlib.pyplot as plt
    import seaborn as sns
    from util import add_panel_label

    plt.ioff()

    if VERBOSE:
        # parenthesised form prints the same text under Python 2 and 3
        print('Plot minor alleles of example sample')

    fig_size = (fig_width, 0.8*fig_width)
    fig, axs = plt.subplots(1, 2,
                            figsize=fig_size,
                            sharey=True,
                            gridspec_kw={'width_ratios': [3, 1]})
    # NOTE(review): the style is set after the axes exist, so it presumably
    # does not affect this figure; the grids are switched on explicitly below.
    sns.set_style('darkgrid')

    alphas = [0.6, 1]
    colors = [sns.color_palette()[i] for i in [2, 0]]
    shapes = ['s', 'o']
    bin_edges = np.logspace(-4, 0, 27)

    last_site = None    # largest index with nonzero frequency in any dataset
    max_count = 0       # tallest histogram bar in any dataset
    for idat, datum in enumerate(data):
        y = datum['freq_minor']
        x = np.arange(len(y))
        axs[0].scatter(x, y,
                       marker=shapes[idat],
                       lw=1.5, edgecolor='none',
                       facecolor=colors[idat],
                       zorder=idat+1)

        counts, edges = np.histogram(y, bins=bin_edges)
        # the bars are positioned by the lower bin edge, so anchor them
        # there explicitly (the default alignment became 'center' in
        # matplotlib 2.0)
        axs[1].barh(edges[:-1], counts, (edges[1:] - edges[:-1]),
                    align='edge',
                    color=colors[idat],
                    alpha=alphas[idat],
                    zorder=2 - idat)

        nonzero_sites = y.nonzero()[0]
        if len(nonzero_sites):
            if last_site is None or nonzero_sites[-1] > last_site:
                last_site = nonzero_sites[-1]
        max_count = max(max_count, counts.max())

    axs[0].set_xlabel('Position [bp]', fontsize=fig_fontsize)
    axs[0].set_ylabel('SNP frequency', fontsize=fig_fontsize)
    axs[0].set_yscale('log')
    axs[0].set_ylim(10**(-4), 1)
    if last_site is not None:
        axs[0].set_xlim(-20, last_site + 21)
    axs[0].grid(True)
    axs[0].tick_params(axis='both', labelsize=fig_fontsize)

    axs[1].set_xlabel('Number of positions', fontsize=fig_fontsize)
    axs[1].grid(True)
    axs[1].set_yscale('log')
    if max_count > 0:
        # a zero upper limit would be invalid on the log axis
        axs[1].set_xlim(0.8, 2 * max_count)
    axs[1].set_xscale('log')
    axs[1].tick_params(axis='x', labelsize=fig_fontsize)

    add_panel_label(axs[0], 'C', x_offset=-0.22)

    plt.tight_layout(pad=0.1, h_pad=0.001, w_pad=0.001)

    if title:
        fig.suptitle(title)

    if fig_filename is not None:
        for ext in ['.pdf','.svg', '.png']:
            fig.savefig(fig_filename+ext)
        plt.close(fig)
    else:
        plt.ion()
        plt.show()
def plot_fit(data_sat, data_pooled, bins_sat):
    '''Plot the saturation fits (panel A) and the fitness costs (panel B).

    Panel A shows, for every row of data_sat['data_to_fit'], the data points
    (row index values on the x-axis) together with the curve
    mu/s * (1 - exp(-s*x)) evaluated with that row's fitted 's'. The first
    group is drawn dashed.
    Panel B shows the fitted costs 's' +/- 'ds' of all but the first group
    ('Sat') and the pooled estimates ('Pooled') on log-log axes. Arrows in
    the panel A colours point at the 'Sat' points.

    Parameters:
        data_sat     -- mapping with 'data_to_fit' (data frame, one row per
                        group), 'mu' and 's' (data frame indexed like
                        'data_to_fit' with the columns 's' and 'ds')
        data_pooled  -- mapping with the 2D arrays 'all' and 'all_std';
                        columns 0, 1, 2 and -1 of 'all' are used as x, lower
                        bin edge, upper bin edge and cost, and the last row
                        is not plotted
        bins_sat     -- bin edges used for the horizontal error bars of the
                        saturation estimates

    The figure is shown interactively and not saved.
    '''
    from matplotlib import cm
    from util import add_panel_label

    palette = sns.color_palette('colorblind')

    fig_width = 5
    fs = 16

    fig, axs = plt.subplots(1, 2,
                            figsize=(2 * fig_width, fig_width))

    data_to_fit = data_sat['data_to_fit']
    mu = data_sat['mu']
    s = data_sat['s']
    n_groups = data_to_fit.shape[0]

    def saturation(x, cost):
        '''Saturation curve mu/cost * (1 - exp(-cost*x)).'''
        return mu / cost * (1.0 - np.exp(-cost * x))

    # PANEL A: data and fits
    ax = axs[0]
    for iS, (S, datum) in enumerate(data_to_fit.iterrows()):
        x = np.array(datum.index)
        y = np.array(datum)
        color = cm.jet(1.0 * iS / n_groups)

        # Most conserved group is dashed
        ls = '--' if iS == 0 else '-'

        ax.scatter(x, y, s=70, color=color)

        xfit = np.linspace(0, 3000)
        yfit = saturation(xfit, s.loc[S, 's'])
        ax.plot(xfit, yfit, lw=2, color=color, ls=ls)

    ax.set_xlabel('days since EDI', fontsize=fs)
    ax.set_ylabel('divergence', fontsize=fs)
    ax.set_xlim(-200, 3200)
    ax.set_ylim(-0.0005, 0.025)
    ax.set_xticks([0, 1000, 2000, 3000])
    ax.xaxis.set_tick_params(labelsize=fs)
    ax.yaxis.set_tick_params(labelsize=fs)

    # NOTE(review): the annotated rate is a fixed string and is not derived
    # from mu -- confirm it matches the data passed in.
    ax.text(0, 0.023,
            r'$\mu = 1.2 \cdot 10^{-5}$ per day',
            fontsize=fs)
    # reference line with slope mu
    ax.plot([200, 1300], [0.007, 0.007 + (1300 - 200) * mu], lw=1.5, c='k')

    # PANEL B: costs
    ax = axs[1]

    # B1: Saturation fit; the first group is dropped here
    x = np.array(s.index)[1:]
    y = np.array(s['s'])[1:]
    dy = np.array(s['ds'])[1:]
    dx = np.array((x - bins_sat[1:-1], bins_sat[2:] - x))
    ax.errorbar(x, y,
                yerr=dy, xerr=dx,
                ls='-',
                marker='o',
                lw=2,
                color=palette[0],
                label='Sat')

    # Annotate with colors from panel A: the arrow of the last group sits at
    # fixed heights (2e-4 -> 1e-4), all others run from y/3 up to 0.7*y
    for iS in range(1, n_groups):
        is_last = (iS == n_groups - 1)
        ax.annotate('',
                    xy=(x[iS - 1], 1e-4 if is_last else y[iS - 1] * 0.7),
                    xytext=(x[iS - 1], 2e-4 if is_last else y[iS - 1] * 1.0 / 3),
                    arrowprops={'facecolor': cm.jet(1.0 * iS / n_groups),
                                'edgecolor': 'none',
                                'shrink': 0.05},
                    )

    # B2: pooled
    pooled = data_pooled['all']
    x = pooled[:-1, 0]
    y = pooled[:-1, -1]
    dy = data_pooled['all_std'][:-1, -1]
    dx = np.array((x - pooled[:-1, 1], pooled[:-1, 2] - x))
    ax.errorbar(x, y,
                yerr=dy, xerr=dx,
                ls='-',
                marker='o',
                lw=2,
                color=palette[2],
                label='Pooled')

    ax.legend(loc='upper right', fontsize=fs)
    ax.set_xlabel('variability in group M [bits]', fontsize=fs)
    ax.set_ylabel('fitness cost [1/day]', fontsize=fs)
    ax.set_xlim(0.9e-3, 2.5)
    ax.set_ylim(9e-5, 0.11)
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.xaxis.set_tick_params(labelsize=fs)
    ax.yaxis.set_tick_params(labelsize=fs)

    # Panel labels
    add_panel_label(axs[0], 'A', x_offset=-0.27)
    add_panel_label(axs[1], 'B', x_offset=-0.21)

    plt.tight_layout()
    plt.ion()
    plt.show()
def plot_non_coding_figure(data, minor_af, synnonsyn, reference, fname=None):
    '''Plot fitness cost at noncoding features.

    Draws four panels with a shared y-axis, one per genome window
    (500-900, 2050-2150, 4750-5000, 9050-9150), each produced by
    plot_fitness_costs_along_genome. Neighbouring annotations are marked by
    black bars near the lower y-limit. With fname set, the figure is saved
    as fname + '.png' / '.svg' / '.pdf'.
    '''
    from util import add_panel_label

    ymax = 0.25
    ymin = 0.0005
    y_second_gene = 1.18

    # keyword sets shared by all panels
    smoothing = {'pheno': None, 'ws': 8, 'ws_syn': 4}
    bar_kw = {'c': 'k', 'alpha': 0.7}
    label_kw = {'fontsize': fs * 0.8}
    genomewide = synnonsyn['genomewide']
    annotation = reference.annotation

    def lower(axis):
        '''Current lower y-limit of axis.'''
        return axis.get_ylim()[0]

    fig, axs = plt.subplots(1, 4, sharey=True, figsize=(10, 5),
                            gridspec_kw={'width_ratios': [4, 1, 2.5, 1]})

    # panel 1: the 5' region
    left, right = 500, 900
    features = ['polyA', 'U5', 'U5 stem', 'PBS', 'PSI SL1-4']
    ax = plot_fitness_costs_along_genome(left, right, features,
                                         data, minor_af, reference,
                                         synnonsyn=genomewide,
                                         ax=axs[0], **smoothing)
    # label and dimension go on the left-most axis, the others follow it
    ax.set_ylabel('fitness cost [1/day]', fontsize=fs)
    ax.set_ylim(ymin, ymax)
    add_panel_label(ax, 'B', x_offset=-0.15)
    ax.plot([left, annotation["LTR5'"].location.end],
            lower(ax) * np.ones(2), lw=10, **bar_kw)
    ax.text(left, lower(ax) * 1.17, "LTR5'",
            horizontalalignment='left', **label_kw)
    ax.plot([annotation['gag'].location.start, right],
            lower(ax) * np.ones(2), lw=10, **bar_kw)
    ax.text(right, lower(ax) * 1.17, 'gag',
            horizontalalignment='right', **label_kw)
    ax.set_ylim(ymin, ymax)

    # panel 2: frame shift region -- no syn fitness cost here since this is
    # in an overlap
    left, right = 2050, 2150
    features = ['frameshift']
    ax = plot_fitness_costs_along_genome(left, right, features,
                                         data, minor_af, reference,
                                         ax=axs[1], **smoothing)
    ax.plot([left, annotation['gag'].location.end],
            lower(ax) * np.ones(2), lw=10, **bar_kw)
    ax.text(left, lower(ax) * 1.17, 'gag',
            horizontalalignment='left', **label_kw)
    ax.plot([annotation['pol'].location.start, right],
            y_second_gene * lower(ax) * np.ones(2), lw=5, **bar_kw)
    ax.text(right, lower(ax) * (y_second_gene + 0.17), 'pol',
            horizontalalignment='right', **label_kw)
    ax.set_xticks([2050, 2150])
    ax.set_ylim(ymin, ymax)

    # panel 3: the cPPT region
    left, right = 4750, 5000
    features = ['A1', 'D2', 'cPPT']
    ax = plot_fitness_costs_along_genome(left, right, features,
                                         data, minor_af, reference,
                                         synnonsyn=genomewide,
                                         ax=axs[2], **smoothing)
    ax.set_ylim(ymin, ymax)
    ax.plot([left, annotation["IN"].location.end],
            lower(ax) * np.ones(2), lw=10, **bar_kw)
    ax.text(left, lower(ax) * 1.17, "IN",
            horizontalalignment='left', **label_kw)
    ax.plot([annotation['vif'].location.start, right],
            y_second_gene * lower(ax) * np.ones(2), lw=10, **bar_kw)
    ax.text(right, lower(ax) * (y_second_gene + 0.17), 'vif',
            horizontalalignment='right', **label_kw)
    ax.set_xticks([4800, 4900])
    ax.set_ylim(ymin, ymax)

    # panel 4: the 3' region
    left, right = 9050, 9150
    features = ['PPT']
    ax = plot_fitness_costs_along_genome(left, right, features,
                                         data, minor_af, reference,
                                         synnonsyn=genomewide,
                                         ax=axs[3], **smoothing)
    ax.plot([left, annotation['nef'].location.end],
            lower(ax) * np.ones(2), lw=10, **bar_kw)
    ax.text(left, lower(ax) * 1.17, 'nef',
            horizontalalignment='left', **label_kw)
    ax.plot([annotation["LTR3'"].location.start, right],
            y_second_gene * lower(ax) * np.ones(2), lw=5, **bar_kw)
    ax.text(right, lower(ax) * (y_second_gene + 0.17), "LTR3'",
            horizontalalignment='right', **label_kw)
    ax.set_xticks([9050, 9100, 9150])
    ax.set_ylim(ymin, ymax)

    # one x-axis label for the whole row of panels
    fig.text(0.5, 0.01, 'Position in HIV-1 reference (HXB2) [bp]',
             ha='center', fontsize=fs)
    plt.tight_layout(rect=(0, 0.04, 1, 1), w_pad=-1)

    if fname is not None:
        for suffix in ['.png', '.svg', '.pdf']:
            plt.savefig(fname + suffix)
def plot_minor_allele_example(data, title='', VERBOSE=0, fig_filename=None):
    '''Plot minor allele frequencies of example samples.

    Left panel: scatter of datum['freq_minor'] versus array index for each
    datum in data. Right panel: histogram of the same frequencies (26
    log-spaced bins from 1e-4 to 1) as horizontal bars sharing the log
    frequency axis with the left panel.

    Parameters:
        data          -- sequence of at most two mappings providing
                         'freq_minor'; markers, colours and alphas exist
                         for two entries only
        title         -- figure title, skipped when empty
        VERBOSE       -- print a progress message when truthy
        fig_filename  -- path without extension; the figure is saved as
                         pdf/svg/png and closed when given, shown otherwise

    The x-limits of both panels cover all datasets. They are not set when
    they cannot be determined (empty data, no nonzero frequency, or no
    histogram counts).
    '''
    import matplotlib.pyplot as plt
    import seaborn as sns
    from util import add_panel_label

    plt.ioff()

    if VERBOSE:
        # single-argument call form works under both Python 2 and 3
        print('Plot minor alleles of example sample')

    fig_size = (fig_width, 0.8 * fig_width)
    fig, axs = plt.subplots(1, 2,
                            figsize=fig_size,
                            sharey=True,
                            gridspec_kw={'width_ratios': [3, 1]})
    scatter_ax, hist_ax = axs[0], axs[1]
    # NOTE(review): called after the axes were created, so presumably without
    # effect on this figure; grids are enabled explicitly further down.
    sns.set_style('darkgrid')

    alphas = [0.6, 1]
    colors = [sns.color_palette()[i] for i in [2, 0]]
    shapes = ['s', 'o']

    # collected over all datasets to size the axes afterwards
    last_nonzero = []
    bar_maxima = []
    for idat, datum in enumerate(data):
        y = datum['freq_minor']
        x = np.arange(len(y))
        scatter_ax.scatter(x, y,
                           marker=shapes[idat],
                           lw=1.5, edgecolor='none',
                           facecolor=colors[idat],
                           zorder=idat + 1)

        h = np.histogram(y, bins=np.logspace(-4, 0, 27))
        # bars start at the lower bin edge and span the bin width; request
        # edge alignment explicitly since matplotlib >= 2.0 centres bars
        # by default
        hist_ax.barh(h[1][:-1], h[0], (h[1][1:] - h[1][:-1]),
                     align='edge',
                     color=colors[idat],
                     alpha=alphas[idat],
                     zorder=2 - idat)

        occupied = y.nonzero()[0]
        if len(occupied):
            last_nonzero.append(occupied[-1])
        bar_maxima.append(h[0].max())

    scatter_ax.set_xlabel('Position [bp]', fontsize=fig_fontsize)
    scatter_ax.set_ylabel('SNP frequency', fontsize=fig_fontsize)
    scatter_ax.set_yscale('log')
    scatter_ax.set_ylim(10**(-4), 1)
    if last_nonzero:
        scatter_ax.set_xlim(-20, max(last_nonzero) + 21)
    scatter_ax.grid(True)
    scatter_ax.tick_params(axis='both', labelsize=fig_fontsize)

    hist_ax.set_xlabel('Number of positions', fontsize=fig_fontsize)
    hist_ax.grid(True)
    hist_ax.set_yscale('log')
    if bar_maxima and max(bar_maxima) > 0:
        # an upper limit of zero is not valid on a log axis
        hist_ax.set_xlim(0.8, 2 * max(bar_maxima))
    hist_ax.set_xscale('log')
    hist_ax.tick_params(axis='x', labelsize=fig_fontsize)

    add_panel_label(scatter_ax, 'C', x_offset=-0.22)

    plt.tight_layout(pad=0.1, h_pad=0.001, w_pad=0.001)

    if title:
        fig.suptitle(title)

    if fig_filename is not None:
        for ext in ['.pdf', '.svg', '.png']:
            fig.savefig(fig_filename + ext)
        plt.close(fig)
    else:
        plt.ion()
        plt.show()