def plot_distance_histogram(data_folder, adaID, fragment, counts, savefig=False):
    '''Plot the histogram of distance from consensus.

    Two figures are drawn: a linear histogram of `counts`, and the
    complementary cumulative distribution on a logarithmic y axis.

    Parameters:
       data_folder, adaID, fragment: forwarded to the figure filename getter;
           adaID and fragment (strings) also appear in the figure titles
       counts: numpy array used as a 1D histogram, counts[d] being plotted
           at Hamming distance d
       savefig: if True, both figures are saved to file and closed
    '''
    from hivwholeseq.sequencing.filenames import get_distance_from_consensus_figure_filename as gff
    import matplotlib.pyplot as plt

    # The interactive state is only modified (and hence only restored) when
    # saving. Keeping the flag bound in both cases avoids reading an undefined
    # name when savefig is False.
    was_interactive = False
    if savefig:
        was_interactive = plt.isinteractive()
        plt.ioff()

    try:
        title = 'adaID ' + adaID + ', ' + fragment
        distances = np.arange(len(counts))
        # Crop the x axis at the largest distance that was actually observed
        xlim = (-0.5, 0.5 + counts.nonzero()[0][-1])
        total = counts.sum()

        # Linear histogram
        fig, ax = plt.subplots(1, 1)
        ax.set_xlabel('Hamming distance')
        ax.set_ylabel('# read pairs')
        ax.set_title(title)
        ax.set_xlim(*xlim)
        ax.plot(distances, counts, 'b', lw=2)

        if savefig:
            outputfile = gff(data_folder, adaID, fragment)
            fig.savefig(outputfile)
            plt.close(fig)

        # Log cumulative histogram
        fig, ax = plt.subplots(1, 1)
        ax.set_xlabel('Hamming distance')
        ax.set_ylabel('# read pairs < x')
        ax.set_title(title)
        ax.set_xlim(*xlim)
        # Lower limit sits just below the weight of a single read pair
        ax.set_ylim(1.0 / total * 0.9, 1.1)
        ax.set_yscale('log')
        y = 1.0 - 1.0 * np.cumsum(counts) / total
        ax.plot(distances, y, 'b', lw=2)

        if savefig:
            outputfile = gff(data_folder, adaID, fragment, cumulative=True, yscale='log')
            fig.savefig(outputfile)
            plt.close(fig)

    finally:
        # Restore interactive mode even if plotting or saving failed
        if was_interactive:
            plt.ion()
def plot_coverage(data_folder, adaID, fragment, counts, VERBOSE=0, savefig=False):
    '''Draw the coverage along a fragment on a logarithmic y axis.

    Parameters:
       data_folder, adaID, fragment: forwarded to the figure filename getter;
           adaID and fragment (strings) also appear in the title
       counts: array that is summed over axis 1 and then over axis 0 to
           obtain the coverage curve
       VERBOSE: at 1 or above, a progress line is printed
       savefig: if True the figure is saved and closed, otherwise it is
           shown in interactive mode
    '''
    from hivwholeseq.sequencing.filenames import get_coverage_figure_filename as gff

    if VERBOSE >= 1:
        print('Plotting coverage: ' + adaID + ' ' + fragment)

    total_coverage = counts.sum(axis=1).sum(axis=0)

    import matplotlib.pyplot as plt
    figure, axis = plt.subplots(1, 1, figsize=(15, 8))
    # The 0.5 offset keeps zero-coverage positions drawable on a log scale
    axis.plot(total_coverage + 0.5)
    axis.set_yscale('log')
    axis.set_xlabel('Position')
    axis.set_ylabel('Coverage')
    axis.set_title('adaID ' + adaID + ', fragment ' + fragment)

    if not savefig:
        plt.ion()
        plt.show()
        return

    figure.savefig(gff(data_folder, adaID, fragment))
    plt.close(figure)
def plot_coverage(data_folder, adaID, fragment, counts, VERBOSE=0, savefig=False):
    '''Draw the coverage along a fragment on a logarithmic y axis.

    Parameters:
       data_folder, adaID, fragment: forwarded to the figure filename getter;
           adaID and fragment (strings) also appear in the title
       counts: array that is summed over axis 1 and then over axis 0 to
           obtain the coverage curve
       VERBOSE: at 1 or above, a progress line is printed
       savefig: if True the figure is saved and closed, otherwise it is
           shown in interactive mode
    '''
    from hivwholeseq.sequencing.filenames import get_coverage_figure_filename as gff

    if VERBOSE >= 1:
        print('Plotting coverage: '+adaID+' '+fragment)

    total_coverage = counts.sum(axis=1).sum(axis=0)

    import matplotlib.pyplot as plt
    figure, axis = plt.subplots(1, 1, figsize=(15, 8))
    # The 0.5 offset keeps zero-coverage positions drawable on a log scale
    axis.plot(total_coverage + 0.5)
    axis.set_yscale('log')
    axis.set_xlabel('Position')
    axis.set_ylabel('Coverage')
    axis.set_title('adaID '+adaID+', fragment '+fragment)

    if not savefig:
        plt.ion()
        plt.show()
        return

    figure.savefig(gff(data_folder, adaID, fragment))
    plt.close(figure)
def plot_minor_allele_frequency_filtered(data_folder, adaID, fragments, VERBOSE=0, savefig=False):
    '''Plot minor allele frequency along the genome.

    The merged allele frequencies are loaded from file and, for every
    position, the second-largest frequency is plotted on a log scale.

    Parameters:
       data_folder, adaID, fragments: forwarded to the filename getters;
           adaID (string) and fragments (sequence of strings) also appear
           in the title
       VERBOSE: accepted for signature compatibility, not used here
       savefig: if True the figure is saved and closed, otherwise it is
           shown in interactive mode
    '''
    nus = np.load(get_merged_allele_frequencies_filename(data_folder, adaID, fragments))

    # Positions that are masked or cannot be evaluated stay masked and are
    # therefore left out of the plot.
    nu_min = np.ma.masked_all(nus.shape[-1])
    for pos, nutmp in enumerate(nus.T):
        if np.ma.is_masked(nutmp):
            continue
        try:
            nu_min[pos] = np.sort(nutmp)[-2]
        except ValueError:
            # Report the offending position and carry on: a debugger
            # breakpoint here would stall non-interactive runs.
            print('{0} {1}'.format(pos, np.ma.is_masked(nutmp)))

    import matplotlib.pyplot as plt
    fig, ax = plt.subplots(1, 1, figsize=(15, 8))
    ax.plot(nu_min, lw=1.5, c='k')
    ax.scatter(np.arange(len(nu_min)), nu_min, s=30, c='k')
    ax.set_yscale('log')
    ax.set_xlabel('Position')
    ax.set_ylabel(r'$\nu$', fontsize=20)
    ax.set_title('adaID '+adaID+', '+'-'.join(fragments))
    ax.set_xlim(-100, len(nu_min) + 100)

    plt.tight_layout()

    if savefig:
        from hivwholeseq.sequencing.filenames import \
                get_minor_allele_frequency_merged_figure_filename as gff
        outputfile = gff(data_folder, adaID, fragments)
        fig.savefig(outputfile)
        plt.close(fig)
    else:
        plt.ion()
        plt.show()
def plot_distance_histogram(data_folder, adaID, fragment, counts, savefig=False):
    '''Plot the histogram of distance from consensus.

    Two figures are drawn: a linear histogram of `counts`, and the
    complementary cumulative distribution on a logarithmic y axis.

    Parameters:
       data_folder, adaID, fragment: forwarded to the figure filename getter;
           adaID and fragment (strings) also appear in the figure titles
       counts: numpy array used as a 1D histogram, counts[d] being plotted
           at Hamming distance d
       savefig: if True, both figures are saved to file and closed
    '''
    from hivwholeseq.sequencing.filenames import get_distance_from_consensus_figure_filename as gff
    import matplotlib.pyplot as plt

    # The interactive state is only modified (and hence only restored) when
    # saving. Keeping the flag bound in both cases avoids reading an undefined
    # name when savefig is False.
    was_interactive = False
    if savefig:
        was_interactive = plt.isinteractive()
        plt.ioff()

    try:
        title = 'adaID '+adaID+', '+fragment
        distances = np.arange(len(counts))
        # Crop the x axis at the largest distance that was actually observed
        xlim = (-0.5, 0.5 + counts.nonzero()[0][-1])
        total = counts.sum()

        # Linear histogram
        fig, ax = plt.subplots(1, 1)
        ax.set_xlabel('Hamming distance')
        ax.set_ylabel('# read pairs')
        ax.set_title(title)
        ax.set_xlim(*xlim)
        ax.plot(distances, counts, 'b', lw=2)

        if savefig:
            outputfile = gff(data_folder, adaID, fragment)
            fig.savefig(outputfile)
            plt.close(fig)

        # Log cumulative histogram
        fig, ax = plt.subplots(1, 1)
        ax.set_xlabel('Hamming distance')
        ax.set_ylabel('# read pairs < x')
        ax.set_title(title)
        ax.set_xlim(*xlim)
        # Lower limit sits just below the weight of a single read pair
        ax.set_ylim(1.0 / total * 0.9, 1.1)
        ax.set_yscale('log')
        y = 1.0 - 1.0 * np.cumsum(counts) / total
        ax.plot(distances, y, 'b', lw=2)

        if savefig:
            outputfile = gff(data_folder, adaID, fragment, cumulative=True, yscale='log')
            fig.savefig(outputfile)
            plt.close(fig)

    finally:
        # Restore interactive mode even if plotting or saving failed
        if was_interactive:
            plt.ion()
def plot_minor_allele_frequency_filtered(data_folder, adaID, fragments, VERBOSE=0, savefig=False):
    '''Plot minor allele frequency along the genome.

    The merged allele frequencies are loaded from file and, for every
    position, the second-largest frequency is plotted on a log scale.

    Parameters:
       data_folder, adaID, fragments: forwarded to the filename getters;
           adaID (string) and fragments (sequence of strings) also appear
           in the title
       VERBOSE: accepted for signature compatibility, not used here
       savefig: if True the figure is saved and closed, otherwise it is
           shown in interactive mode
    '''
    nus = np.load(
        get_merged_allele_frequencies_filename(data_folder, adaID, fragments))

    # Positions that are masked or cannot be evaluated stay masked and are
    # therefore left out of the plot.
    nu_min = np.ma.masked_all(nus.shape[-1])
    for pos, nutmp in enumerate(nus.T):
        if np.ma.is_masked(nutmp):
            continue
        try:
            nu_min[pos] = np.sort(nutmp)[-2]
        except ValueError:
            # Report the offending position and carry on: a debugger
            # breakpoint here would stall non-interactive runs.
            print('{0} {1}'.format(pos, np.ma.is_masked(nutmp)))

    import matplotlib.pyplot as plt
    fig, ax = plt.subplots(1, 1, figsize=(15, 8))
    ax.plot(nu_min, lw=1.5, c='k')
    ax.scatter(np.arange(len(nu_min)), nu_min, s=30, c='k')
    ax.set_yscale('log')
    ax.set_xlabel('Position')
    ax.set_ylabel(r'$\nu$', fontsize=20)
    ax.set_title('adaID ' + adaID + ', ' + '-'.join(fragments))
    ax.set_xlim(-100, len(nu_min) + 100)

    plt.tight_layout()

    if savefig:
        from hivwholeseq.sequencing.filenames import \
            get_minor_allele_frequency_merged_figure_filename as gff
        outputfile = gff(data_folder, adaID, fragments)
        fig.savefig(outputfile)
        plt.close(fig)
    else:
        plt.ion()
        plt.show()
def plot_SFS_folded(data_folder, adaID, fragment, nu_filtered, VERBOSE=0, savefig=False):
    '''Plot the site frequency spectrum (folded).

    For every position the two largest allele frequencies are taken; the
    major one is folded (1 - nu) and pooled with the minor one. Two figures
    are produced from the pooled values above 1e-5: a cumulative histogram
    and a density histogram, both on log-log axes.

    Parameters:
       data_folder, adaID, fragment: forwarded to the figure filename getter;
           adaID and fragment (strings) also appear in the titles
       nu_filtered: 2D array of allele frequencies, iterated position by
           position along its last axis; may be a masked array
       VERBOSE: at 1 or above a progress line is printed, at 3 or above
           skipped positions are reported
       savefig: if True both figures are saved and closed, otherwise they
           are shown in interactive mode
    '''
    if VERBOSE >= 1:
        print('Plotting folded SFS')

    from hivwholeseq.sequencing.filenames import get_SFS_figure_filename as gff
    import matplotlib.pyplot as plt
    import numpy as np

    # Collect the folded frequencies in a plain list: going through masked
    # arrays and np.concatenate/np.array would silently drop the mask and let
    # filler data of skipped positions into the spectrum.
    folded = []
    for pos, nus in enumerate(nu_filtered.T):
        # Comparing with == against np.ma.masked yields the (falsy) masked
        # constant, so masked positions must be detected via is_masked.
        if np.ma.is_masked(nus):
            continue
        nus = np.sort(nus)
        if nus[-1] < 0.5:
            if VERBOSE >= 3:
                print('{0} has 3+ alleles: {1} skipping.'.format(pos, nus))
            continue
        folded.append(1 - nus[-1])
        folded.append(nus[-2])

    nu_mm = np.array(folded, dtype=float)
    nu_mm = nu_mm[nu_mm > 1e-5]
    nu_mm.sort()

    title = 'adaID ' + adaID + ', ' + fragment
    # Left x limit: the decade just below the smallest frequency
    xmin = 10**(np.floor(np.log10(nu_mm[0] * 0.9)))

    # Cumulative histogram
    fig, ax = plt.subplots(1, 1)
    ax.set_xlabel(r'$\nu$', fontsize=20)
    ax.set_ylabel('# alleles < x folded')
    ax.set_title(title)
    ax.set_xlim(xmin, 0.6)
    ax.set_xscale('log')
    ax.set_ylim(1.0 / len(nu_mm) * 0.9, 1.1)
    ax.set_yscale('log')
    ax.plot(nu_mm, 1.0 - np.linspace(0, 1 - 1.0 / len(nu_mm), len(nu_mm)), lw=2, c='b')

    if savefig:
        outputfile = gff(data_folder, adaID, fragment, cumulative=True, yscale='log')
        fig.savefig(outputfile)
        plt.close(fig)

    # Histogram
    fig, ax = plt.subplots(1, 1)
    ax.set_xlabel(r'$\nu$', fontsize=20)
    ax.set_ylabel('SFS folded (density)')
    ax.set_title(title)
    ax.set_xlim(xmin, 0.6)
    ax.set_xscale('log')
    ax.set_yscale('log')

    bins = np.logspace(-4, np.log10(0.5), 50)
    density, edges = np.histogram(nu_mm, bins=bins, density=True)
    # Geometric bin centers, appropriate for logarithmic bins
    x = np.sqrt(edges[1:] * edges[:-1])
    ax.plot(x, density, lw=2, c='b')
    ax.scatter(x, density, s=50, edgecolor='none', facecolor='b')
    ax.grid()

    if savefig:
        outputfile = gff(data_folder, adaID, fragment, cumulative=False, yscale='log')
        fig.savefig(outputfile)
        plt.close(fig)
    else:
        plt.ion()
        plt.show()
def plot_SFS_folded(data_folder, adaID, fragment, nu_filtered, VERBOSE=0, savefig=False):
    '''Plot the site frequency spectrum (folded).

    For every position the two largest allele frequencies are taken; the
    major one is folded (1 - nu) and pooled with the minor one. Two figures
    are produced from the pooled values above 1e-5: a cumulative histogram
    and a density histogram, both on log-log axes.

    Parameters:
       data_folder, adaID, fragment: forwarded to the figure filename getter;
           adaID and fragment (strings) also appear in the titles
       nu_filtered: 2D array of allele frequencies, iterated position by
           position along its last axis; may be a masked array
       VERBOSE: at 1 or above a progress line is printed, at 3 or above
           skipped positions are reported
       savefig: if True both figures are saved and closed, otherwise they
           are shown in interactive mode
    '''
    if VERBOSE >= 1:
        print('Plotting folded SFS')

    from hivwholeseq.sequencing.filenames import get_SFS_figure_filename as gff
    import matplotlib.pyplot as plt
    import numpy as np

    # Collect the folded frequencies in a plain list: going through masked
    # arrays and np.concatenate/np.array would silently drop the mask and let
    # filler data of skipped positions into the spectrum.
    folded = []
    for pos, nus in enumerate(nu_filtered.T):
        # Comparing with == against np.ma.masked yields the (falsy) masked
        # constant, so masked positions must be detected via is_masked.
        if np.ma.is_masked(nus):
            continue
        nus = np.sort(nus)
        if nus[-1] < 0.5:
            if VERBOSE >= 3:
                print('{0} has 3+ alleles: {1} skipping.'.format(pos, nus))
            continue
        folded.append(1 - nus[-1])
        folded.append(nus[-2])

    nu_mm = np.array(folded, dtype=float)
    nu_mm = nu_mm[nu_mm > 1e-5]
    nu_mm.sort()

    title = 'adaID '+adaID+', '+fragment
    # Left x limit: the decade just below the smallest frequency
    xmin = 10**(np.floor(np.log10(nu_mm[0] * 0.9)))

    # Cumulative histogram
    fig, ax = plt.subplots(1, 1)
    ax.set_xlabel(r'$\nu$', fontsize=20)
    ax.set_ylabel('# alleles < x folded')
    ax.set_title(title)
    ax.set_xlim(xmin, 0.6)
    ax.set_xscale('log')
    ax.set_ylim(1.0 / len(nu_mm) * 0.9, 1.1)
    ax.set_yscale('log')
    ax.plot(nu_mm, 1.0 - np.linspace(0, 1 - 1.0 / len(nu_mm), len(nu_mm)), lw=2, c='b')

    if savefig:
        outputfile = gff(data_folder, adaID, fragment, cumulative=True, yscale='log')
        fig.savefig(outputfile)
        plt.close(fig)

    # Histogram
    fig, ax = plt.subplots(1, 1)
    ax.set_xlabel(r'$\nu$', fontsize=20)
    ax.set_ylabel('SFS folded (density)')
    ax.set_title(title)
    ax.set_xlim(xmin, 0.6)
    ax.set_xscale('log')
    ax.set_yscale('log')

    bins = np.logspace(-4, np.log10(0.5), 50)
    density, edges = np.histogram(nu_mm, bins=bins, density=True)
    # Geometric bin centers, appropriate for logarithmic bins
    x = np.sqrt(edges[1:] * edges[:-1])
    ax.plot(x, density, lw=2, c='b')
    ax.scatter(x, density, s=50, edgecolor='none', facecolor='b')
    ax.grid()

    if savefig:
        outputfile = gff(data_folder, adaID, fragment, cumulative=False, yscale='log')
        fig.savefig(outputfile)
        plt.close(fig)
    else:
        plt.ion()
        plt.show()
def plot_distance_histogram_sliding_window(data_folder, adaID, fragment, lref, counts, binsize=200, savefig=False):
    '''Plot the distance histogram along the genome.

    One curve is drawn per row of `counts`, each row being labelled with the
    window it covers (row i spans binsize * i to binsize * (i + 1), capped
    at lref). Two figures are produced: linear histograms and complementary
    cumulative distributions on a log y axis.

    Parameters:
       data_folder, adaID, fragment: forwarded to the figure filename getter;
           adaID and fragment (strings) also appear in the titles
       lref: upper bound used for the end of the last window label
       counts: 2D numpy array, one row per window and one column per
           Hamming distance
       binsize: width of a window, used for the legend labels
       savefig: if True, both figures are saved to file and closed
    '''
    from hivwholeseq.sequencing.filenames import get_distance_from_consensus_figure_filename as gff
    import matplotlib.pyplot as plt
    from matplotlib import cm

    # The interactive state is only modified (and hence only restored) when
    # saving. Keeping the flag bound in both cases avoids reading an undefined
    # name when savefig is False.
    was_interactive = False
    if savefig:
        was_interactive = plt.isinteractive()
        plt.ioff()

    try:
        n_windows = counts.shape[0]
        distances = np.arange(counts.shape[1])
        title = 'adaID ' + adaID + ', ' + fragment

        # Figure max x
        xmax = counts.nonzero()[1].max()

        # Color and legend label of every window, shared by both figures
        styles = []
        for i in range(n_windows):
            start = binsize * i
            end = min(binsize * (i + 1), lref)
            styles.append((cm.jet(int(255.0 * i / n_windows)),
                           str(start) + ' to ' + str(end)))

        # Linear histogram
        fig, ax = plt.subplots(1, 1)
        ax.set_xlabel('Hamming distance')
        ax.set_ylabel('# read pairs')
        ax.set_title(title)
        ax.set_xlim(-0.5, 0.5 + xmax)
        for count, (color, label) in zip(counts, styles):
            ax.plot(distances, count, lw=2, color=color, label=label)
        ax.legend(loc=1)

        if savefig:
            outputfile = gff(data_folder, adaID, fragment, sliding_window=True)
            fig.savefig(outputfile)
            plt.close(fig)

        # Log cumulative histogram
        fig, ax = plt.subplots(1, 1)
        ax.set_xlabel('Hamming distance')
        ax.set_ylabel('# read pairs')
        ax.set_title(title)
        ax.set_xlim(-0.5, 0.5 + xmax)
        # Lower limit sits just below one read pair in the fullest window
        ax.set_ylim(1.0 / counts.sum(axis=1).max() * 0.9, 1.1)
        ax.set_yscale('log')
        for count, (color, label) in zip(counts, styles):
            y = 1.0 - 1.0 * np.cumsum(count) / count.sum()
            ax.plot(distances, y, lw=2, color=color, label=label)
        ax.legend(loc=1)

        if savefig:
            outputfile = gff(data_folder, adaID, fragment, cumulative=True, sliding_window=True)
            fig.savefig(outputfile)
            plt.close(fig)

    finally:
        # Restore interactive mode even if plotting or saving failed
        if was_interactive:
            plt.ion()
def plot_distance_histogram_sliding_window(data_folder, adaID, fragment, lref, counts, binsize=200, savefig=False):
    '''Plot the distance histogram along the genome.

    One curve is drawn per row of `counts`, each row being labelled with the
    window it covers (row i spans binsize * i to binsize * (i + 1), capped
    at lref). Two figures are produced: linear histograms and complementary
    cumulative distributions on a log y axis.

    Parameters:
       data_folder, adaID, fragment: forwarded to the figure filename getter;
           adaID and fragment (strings) also appear in the titles
       lref: upper bound used for the end of the last window label
       counts: 2D numpy array, one row per window and one column per
           Hamming distance
       binsize: width of a window, used for the legend labels
       savefig: if True, both figures are saved to file and closed
    '''
    from hivwholeseq.sequencing.filenames import get_distance_from_consensus_figure_filename as gff
    import matplotlib.pyplot as plt
    from matplotlib import cm

    # The interactive state is only modified (and hence only restored) when
    # saving. Keeping the flag bound in both cases avoids reading an undefined
    # name when savefig is False.
    was_interactive = False
    if savefig:
        was_interactive = plt.isinteractive()
        plt.ioff()

    try:
        n_windows = counts.shape[0]
        distances = np.arange(counts.shape[1])
        title = 'adaID '+adaID+', '+fragment

        # Figure max x
        xmax = counts.nonzero()[1].max()

        # Color and legend label of every window, shared by both figures
        styles = []
        for i in range(n_windows):
            start = binsize * i
            end = min(binsize * (i+1), lref)
            styles.append((cm.jet(int(255.0 * i / n_windows)),
                           str(start)+' to '+str(end)))

        # Linear histogram
        fig, ax = plt.subplots(1, 1)
        ax.set_xlabel('Hamming distance')
        ax.set_ylabel('# read pairs')
        ax.set_title(title)
        ax.set_xlim(-0.5, 0.5 + xmax)
        for count, (color, label) in zip(counts, styles):
            ax.plot(distances, count, lw=2, color=color, label=label)
        ax.legend(loc=1)

        if savefig:
            outputfile = gff(data_folder, adaID, fragment, sliding_window=True)
            fig.savefig(outputfile)
            plt.close(fig)

        # Log cumulative histogram
        fig, ax = plt.subplots(1, 1)
        ax.set_xlabel('Hamming distance')
        ax.set_ylabel('# read pairs')
        ax.set_title(title)
        ax.set_xlim(-0.5, 0.5 + xmax)
        # Lower limit sits just below one read pair in the fullest window
        ax.set_ylim(1.0 / counts.sum(axis=1).max() * 0.9, 1.1)
        ax.set_yscale('log')
        for count, (color, label) in zip(counts, styles):
            y = 1.0 - 1.0 * np.cumsum(count) / count.sum()
            ax.plot(distances, y, lw=2, color=color, label=label)
        ax.legend(loc=1)

        if savefig:
            outputfile = gff(data_folder, adaID, fragment, cumulative=True, sliding_window=True)
            fig.savefig(outputfile)
            plt.close(fig)

    finally:
        # Restore interactive mode even if plotting or saving failed
        if was_interactive:
            plt.ion()
def plot_minor_allele_frequency(data_folder, adaID, fragments, VERBOSE=0, savefig=False):
    '''Plot minor allele frequency along the genome.

    One subplot is drawn per fragment, showing the minor allele frequency
    split by read type, the filtered minor allele frequency and the
    detection limit 1 / coverage.

    Parameters:
       data_folder, adaID, fragments: forwarded to the filename getters;
           adaID (string) also appears in the figure title and every
           fragment name titles its own subplot
       VERBOSE: accepted for signature compatibility, not used here
       savefig: if True the figure is saved and closed, otherwise it is
           shown in interactive mode

    Note: matplotlib.rcParams is updated globally as a side effect.
    '''
    from hivwholeseq.sequencing.filenames import get_minor_allele_frequency_figure_filename as gff
    import matplotlib
    # 'font.size' replaces the 'text.fontsize' key, which recent matplotlib
    # versions reject.
    params = {'axes.labelsize': 20,
              'font.size': 20,
              'legend.fontsize': 8,
              'xtick.labelsize': 16,
              'ytick.labelsize': 16,
              'text.usetex': False}
    matplotlib.rcParams.update(params)
    from matplotlib import cm
    import matplotlib.pyplot as plt

    # Subplot grid (rows, columns) for 1 to 6 fragments
    plot_grid = [(1, 1), (1, 2), (1, 3), (2, 2), (1, 5), (2, 3)]

    # Per-fragment data needed for plotting
    covs = {}
    nus_minor = {}
    nus_minor_filtered = {}
    for fragment in fragments:
        coverage = np.load(get_coverage_filename(data_folder, adaID, fragment))
        covs[fragment] = coverage

        counts = np.load(get_allele_counts_filename(data_folder, adaID, fragment))
        # Only the first minor allele (second returned item) is used here
        counts_minor = get_minor_allele_counts(counts, n_minor=2)[1]

        # Get minor allele frequencies; the small offset avoids 0 / 0
        nus_minor[fragment] = 1.0 * counts_minor[:, :, 1] / (coverage + 1e-6)

        # Filter the minor frequencies by comparing the read types
        try:
            nu_filtered = np.load(get_allele_frequencies_filename(data_folder, adaID, fragment))
        except IOError:
            # NOTE(review): another function in this file calls
            # filter_nus(counts) without coverage - confirm the signature.
            nu_filtered = filter_nus(counts, coverage)

        # Second-largest frequency at every position
        nut = np.zeros(nu_filtered.shape[-1])
        for pos, nupos in enumerate(nu_filtered.T):
            nut[pos] = np.sort(nupos)[-2]
        nus_minor_filtered[fragment] = nut

    # Plot them
    (n_plots_y, n_plots_x) = plot_grid[len(fragments) - 1]
    fig, axs = plt.subplots(n_plots_y, n_plots_x, figsize=(13, 8))
    if len(fragments) > 1:
        axs = axs.ravel()
    else:
        axs = [axs]
    fig.suptitle('adapterID '+adaID, fontsize=20)
    labss = {'read1 f': 'read1 fwd', 'read1 r': 'read1 rev',
             'read2 f': 'read2 fwd', 'read2 r': 'read2 rev'}
    for i, fragment in enumerate(fragments):
        ax = axs[i]
        ax.set_yscale('log')
        ax.set_title(fragment)
        if i in [0, 3]:
            ax.set_ylabel(r'$\nu$')
        if i > 2:
            ax.set_xlabel('Position')

        # Plot divided by readtype
        for js, nu_minorjs in enumerate(nus_minor[fragment]):
            color = cm.jet(int(255.0 * js / len(read_types)))
            ax.plot(nu_minorjs, lw=1.5, c=color, label=labss[read_types[js]])
            ax.scatter(np.arange(len(nu_minorjs)), nu_minorjs, lw=1.5,
                       color=color)

        # Plot filtered
        nu_filt = nus_minor_filtered[fragment]
        ax.plot(nu_filt, lw=1.5, c='k', alpha=0.5, label='Filtered')
        ax.scatter(np.arange(len(nu_filt)), nu_filt, lw=1.5, c='k', alpha=0.5)

        # Plot 1/max(coverage)
        cov_tot = covs[fragment].sum(axis=0)
        ax.plot(1.0 / cov_tot, lw=1.2, c='r', label='Detection limit')

        # Number of positions taken from the array itself, not from a
        # variable left over by the read type loop
        ax.set_xlim(-100, nus_minor[fragment].shape[-1] + 100)

    # pyplot-level calls: they act on the current axes only
    plt.grid()
    plt.legend(loc='upper right')
    plt.tight_layout(rect=(0, 0, 1, 0.95))

    if savefig:
        # NOTE(review): the filename is built from the last fragment of the
        # loop although the figure covers all of them - confirm intent.
        outputfile = gff(data_folder, adaID, fragment)
        fig.savefig(outputfile)
        plt.close(fig)
    else:
        plt.ion()
        plt.show()
def plot_minor_allele_frequency_filtered(data_folder, adaID, fragments, VERBOSE=0, savefig=False):
    '''Plot minor allele frequency along the genome.

    One subplot is drawn per fragment, showing only the filtered minor
    allele frequency (second-largest frequency at every position).

    Parameters:
       data_folder, adaID, fragments: forwarded to the filename getters;
           adaID (string) also appears in the figure title and every
           fragment name titles its own subplot
       VERBOSE: accepted for signature compatibility, not used here
       savefig: if True the figure is saved and closed, otherwise it is
           shown in interactive mode

    Note: matplotlib.rcParams is updated globally as a side effect.
    '''
    from hivwholeseq.sequencing.filenames import get_minor_allele_frequency_figure_filename as gff
    import matplotlib
    # 'font.size' replaces the 'text.fontsize' key, which recent matplotlib
    # versions reject.
    params = {'axes.labelsize': 20,
              'font.size': 20,
              'legend.fontsize': 8,
              'xtick.labelsize': 16,
              'ytick.labelsize': 16,
              'text.usetex': False}
    matplotlib.rcParams.update(params)
    from matplotlib import cm
    import matplotlib.pyplot as plt

    # Per-fragment filtered minor allele frequencies
    nus_minor_filtered = {}
    for fragment in fragments:
        # NOTE(review): the coverage is not used by this plot; the load is
        # kept so a missing coverage file still fails as before.
        coverage = np.load(get_coverage_filename(data_folder, adaID, fragment))

        try:
            nu_filtered = np.load(get_allele_frequencies_filename(data_folder, adaID, fragment))
        except IOError:
            counts = np.load(get_allele_counts_filename(data_folder, adaID, fragment))
            # NOTE(review): another function in this file calls
            # filter_nus(counts, coverage) - confirm the signature.
            nu_filtered = filter_nus(counts)

        # Second-largest frequency at every position
        nut = np.zeros(nu_filtered.shape[-1])
        for pos, nupos in enumerate(nu_filtered.T):
            nut[pos] = np.sort(nupos)[-2]
        nus_minor_filtered[fragment] = nut

    # Plot them; subplot grid (rows, columns) for 1 to 6 fragments
    plot_grid = [(1, 1), (1, 2), (1, 3), (2, 2), (1, 5), (2, 3)]
    (n_plots_y, n_plots_x) = plot_grid[len(fragments) - 1]
    fig, axs = plt.subplots(n_plots_y, n_plots_x, figsize=(13, 8))
    if len(fragments) > 1:
        axs = axs.ravel()
    else:
        axs = [axs]
    fig.suptitle('adapterID '+adaID, fontsize=20)
    for i, fragment in enumerate(fragments):
        ax = axs[i]
        ax.set_yscale('log')
        ax.set_title(fragment)
        if i in [0, 3]:
            ax.set_ylabel(r'$\nu$')
        if i > 2:
            ax.set_xlabel('Position')

        # Plot filtered
        nu_filt = nus_minor_filtered[fragment]
        ax.plot(nu_filt, lw=1.5, c='k', alpha=0.5, label='Filtered')
        ax.scatter(np.arange(len(nu_filt)), nu_filt, lw=1.5, c='k', alpha=0.5)

        ax.set_xlim(-100, len(nu_filt) + 100)

    plt.tight_layout(rect=(0, 0, 1, 0.95))

    if savefig:
        # NOTE(review): the filename is built from the last fragment of the
        # loop although the figure covers all of them - confirm intent.
        outputfile = gff(data_folder, adaID, fragment, only_filtered=True)
        fig.savefig(outputfile)
        plt.close(fig)
    else:
        plt.ion()
        plt.show()