def write_frequency_files(data_folder, adaID, fragment, nu_filtered, VERBOSE=0): '''Write the corrected allele frequencies to file''' if VERBOSE: print 'Storing allele frequencies to file:', adaID, fragment nu_filtered.dump( get_allele_frequencies_filename(data_folder, adaID, fragment))
def merge_allele_frequencies(data_folder, adaID, fragments, VERBOSE=0):
    '''Merge allele frequencies at overlapping pairs

    Consecutive fragments (in sorted order) that appear as a pair in the
    result of get_overlapping_fragments are glued into a single chunk;
    any other fragment starts a new chunk.

    Parameters:
       data_folder, adaID: forwarded to the filename and overlap helpers.
       fragments: iterable of fragment names (sorted internally before merging).
       VERBOSE: verbosity level forwarded to the overlap helpers.

    Returns:
       A list of chunks. Each chunk is a two-item list
       [fragment_names, frequencies], where fragment_names lists the merged
       fragments in order and frequencies is their arrays concatenated
       along axis 1.

    Warns:
       RuntimeWarning if check_overlap_consensus reports that two overlapping
       fragments have different consensi.
    '''
    import warnings
    import numpy as np

    # Number of columns inserted between two consecutive fragments whose
    # overlap is not available (overlap is None)
    spacer_width = 10

    # NOTE(review): consensi is never used below. It is kept because the reads
    # presumably fail early when a consensus file is missing -- confirm and
    # drop if that is not relied upon.
    consensi = {frag: SeqIO.read(get_consensus_filename(data_folder, adaID, frag,
                                                        trim_primers=True),
                                 'fasta')
                for frag in fragments}
    nus = {frag: np.load(get_allele_frequencies_filename(data_folder, adaID, frag))
           for frag in fragments}

    # Collect the overlap of every overlapping pair, warning on disagreement
    overlaps = {}
    for (frag1, frag2) in get_overlapping_fragments(fragments):
        overlap = get_overlap(data_folder, adaID, frag1, frag2, VERBOSE=VERBOSE)
        is_diff = check_overlap_consensus(data_folder, adaID, frag1, frag2,
                                          overlap, VERBOSE=VERBOSE)
        if is_diff:
            warnings.warn(frag1+' and '+frag2+' have different consensi.',
                          RuntimeWarning)
        overlaps[(frag1, frag2)] = overlap

    nu = []
    fragments = sorted(fragments)
    for i, frag in enumerate(fragments):
        pair = (fragments[i - 1], frag) if i > 0 else None

        # If the start is not an overlap, start a new chunk and copy all
        if (pair is None) or (pair not in overlaps):
            nu.append([[frag], nus[frag]])
            continue

        # else, extend the current chunk with this fragment
        # FIXME: we could average the consensus zone out of indels...
        chunk = nu[-1]
        chunk[0].append(frag)
        overlap = overlaps[pair]
        if overlap is not None:
            # Copy from the end of the overlap on: the second item of the
            # overlap tuple is the first column of this fragment to keep
            (_, start, _) = overlap
            # (repeated concatenation is not the most efficient but -- oh, well)
            chunk[1] = np.concatenate([chunk[1], nus[frag][:, start:]], axis=1)
        else:
            # No overlap coordinates: there is nothing to trim, so insert a
            # spacer and append the whole fragment. (The previous code sliced
            # with `start` here, which is unset -- or stale from an earlier
            # pair -- in this branch.)
            spacer = np.zeros((chunk[1].shape[0], spacer_width), float)
            # Put all the weight on the last row in the spacer columns
            # (presumably the ambiguous-nucleotide row -- TODO confirm)
            spacer[-1] = 1
            chunk[1] = np.concatenate([chunk[1], spacer, nus[frag]], axis=1)

    return nu
def plot_minor_allele_frequency(data_folder, adaID, fragments, VERBOSE=0, savefig=False):
    '''Plot minor allele frequency along the genome

    Draws one log-scale panel per fragment showing the minor allele frequency
    for each read type, the filtered minor allele frequency, and
    1 / (coverage summed over axis 0) as a detection limit.

    Parameters:
       data_folder, adaID: identify the sample; forwarded to the filename helpers.
       fragments: sequence of fragment names, one panel each. The panel layout
          table has entries for 1 to 6 fragments.
       VERBOSE: currently unused.
       savefig: if truthy, save the figure to the file named by
          get_minor_allele_frequency_figure_filename and close it; otherwise
          switch to interactive mode and show it.
    '''
    from hivwholeseq.sequencing.filenames import \
        get_minor_allele_frequency_figure_filename as gff
    import matplotlib
    # 'font.size' is used instead of the old 'text.fontsize' key, which
    # current matplotlib rejects as an invalid rc parameter
    params = {'axes.labelsize': 20,
              'font.size': 20,
              'legend.fontsize': 8,
              'xtick.labelsize': 16,
              'ytick.labelsize': 16,
              'text.usetex': False}
    matplotlib.rcParams.update(params)
    from matplotlib import cm
    import matplotlib.pyplot as plt

    # (rows, columns) of the subplot grid, indexed by number of fragments - 1
    plot_grid = [(1, 1), (1, 2), (1, 3), (2, 2), (1, 5), (2, 3)]

    # Per-fragment data, keyed by fragment name
    covs = {}
    nus_minor = {}
    nus_minor_filtered = {}
    for fragment in fragments:
        coverage = np.load(get_coverage_filename(data_folder, adaID, fragment))
        covs[fragment] = coverage

        counts = np.load(get_allele_counts_filename(data_folder, adaID, fragment))
        (_, counts_minor, _) = get_minor_allele_counts(counts, n_minor=2)

        # Minor allele frequencies; the small offset avoids dividing by zero
        # at uncovered positions
        nus_minor[fragment] = 1.0 * counts_minor[:, :, 1] / (coverage + 1e-6)

        # Filtered frequencies: use the stored file if present, else recompute
        try:
            nu_filtered = np.load(get_allele_frequencies_filename(data_folder, adaID, fragment))
        except IOError:
            nu_filtered = filter_nus(counts, coverage)

        # At each position (last axis) take the second-largest frequency,
        # i.e. the minor allele
        nut = np.zeros(nu_filtered.shape[-1])
        for pos, nupos in enumerate(nu_filtered.T):
            nut[pos] = np.sort(nupos)[-2]
        nus_minor_filtered[fragment] = nut

    # Plot them
    (n_plots_y, n_plots_x) = plot_grid[len(fragments) - 1]
    fig, axs = plt.subplots(n_plots_y, n_plots_x, figsize=(13, 8))
    if len(fragments) > 1:
        axs = axs.ravel()
    else:
        # A single subplot comes back as a bare Axes, not an array
        axs = [axs]
    fig.suptitle('adapterID '+adaID, fontsize=20)

    # Legend labels for the entries of read_types
    labss = {'read1 f': 'read1 fwd', 'read1 r': 'read1 rev',
             'read2 f': 'read2 fwd', 'read2 r': 'read2 rev'}
    for i, fragment in enumerate(fragments):
        ax = axs[i]
        ax.set_yscale('log')
        ax.set_title(fragment)
        if i in [0, 3]:
            ax.set_ylabel(r'$\nu$')
        if i > 2:
            ax.set_xlabel('Position')

        # Plot divided by read type (first axis of the frequency array)
        for js, nu_minorjs in enumerate(nus_minor[fragment]):
            color = cm.jet(int(255.0 * js / len(read_types)))
            ax.plot(nu_minorjs, lw=1.5, c=color, label=labss[read_types[js]])
            ax.scatter(np.arange(len(nu_minorjs)), nu_minorjs, lw=1.5, color=color)

        # Plot filtered
        nu_filtered_minor = nus_minor_filtered[fragment]
        ax.plot(nu_filtered_minor, lw=1.5, c='k', alpha=0.5, label='Filtered')
        ax.scatter(np.arange(len(nu_filtered_minor)), nu_filtered_minor,
                   lw=1.5, c='k', alpha=0.5)

        # Plot 1 / (coverage summed over axis 0) as the detection limit
        cov_tot = covs[fragment].sum(axis=0)
        ax.plot(1.0 / cov_tot, lw=1.2, c='r', label='Detection limit')

        # Pad the x range around the fragment. The length is taken from the
        # array itself rather than from the variable left over by the
        # read-type loop above
        ax.set_xlim(-100, nus_minor[fragment].shape[-1] + 100)

    # NOTE(review): these pyplot calls act on the current axes only, which is
    # presumably just the last panel -- confirm whether every panel should get
    # a grid and a legend.
    plt.grid()
    plt.legend(loc='upper right')
    plt.tight_layout(rect=(0, 0, 1, 0.95))

    if savefig:
        # NOTE(review): `fragment` is the value left by the plotting loop (the
        # last fragment); confirm whether the filename helper expects a single
        # fragment or the whole list of fragments.
        outputfile = gff(data_folder, adaID, fragment)
        fig.savefig(outputfile)
        plt.close(fig)
    else:
        plt.ion()
        plt.show()
def write_frequency_files(data_folder, adaID, fragment, nu_filtered, VERBOSE=0): '''Write the corrected allele frequencies to file''' if VERBOSE: print 'Storing allele frequencies to file:', adaID, fragment nu_filtered.dump(get_allele_frequencies_filename(data_folder, adaID, fragment))
def plot_minor_allele_frequency_filtered(data_folder, adaID, fragments, VERBOSE=0, savefig=False):
    '''Plot the filtered minor allele frequency along the genome

    Draws one log-scale panel per fragment showing only the filtered minor
    allele frequency (the second-largest frequency at each position).

    Parameters:
       data_folder, adaID: identify the sample; forwarded to the filename helpers.
       fragments: sequence of fragment names, one panel each. The panel layout
          table has entries for 1 to 6 fragments.
       VERBOSE: currently unused.
       savefig: if truthy, save the figure to the file named by
          get_minor_allele_frequency_figure_filename (only_filtered=True) and
          close it; otherwise switch to interactive mode and show it.
    '''
    from hivwholeseq.sequencing.filenames import \
        get_minor_allele_frequency_figure_filename as gff
    import matplotlib
    # 'font.size' is used instead of the old 'text.fontsize' key, which
    # current matplotlib rejects as an invalid rc parameter
    params = {'axes.labelsize': 20,
              'font.size': 20,
              'legend.fontsize': 8,
              'xtick.labelsize': 16,
              'ytick.labelsize': 16,
              'text.usetex': False}
    matplotlib.rcParams.update(params)
    import matplotlib.pyplot as plt

    # Per-fragment data, keyed by fragment name
    # NOTE(review): the coverage is loaded but not used by this plot; the load
    # is kept so that a missing coverage file still raises as before.
    covs = {}
    nus_minor_filtered = {}
    for fragment in fragments:
        coverage = np.load(get_coverage_filename(data_folder, adaID, fragment))
        covs[fragment] = coverage

        # Filtered frequencies: use the stored file if present, else recompute
        try:
            nu_filtered = np.load(get_allele_frequencies_filename(data_folder, adaID, fragment))
        except IOError:
            counts = np.load(get_allele_counts_filename(data_folder, adaID, fragment))
            # NOTE(review): plot_minor_allele_frequency calls
            # filter_nus(counts, coverage); confirm that omitting the coverage
            # here is intended.
            nu_filtered = filter_nus(counts)

        # At each position (last axis) take the second-largest frequency,
        # i.e. the minor allele
        nut = np.zeros(nu_filtered.shape[-1])
        for pos, nupos in enumerate(nu_filtered.T):
            nut[pos] = np.sort(nupos)[-2]
        nus_minor_filtered[fragment] = nut

    # Plot them
    # (rows, columns) of the subplot grid, indexed by number of fragments - 1
    plot_grid = [(1, 1), (1, 2), (1, 3), (2, 2), (1, 5), (2, 3)]
    (n_plots_y, n_plots_x) = plot_grid[len(fragments) - 1]
    fig, axs = plt.subplots(n_plots_y, n_plots_x, figsize=(13, 8))
    if len(fragments) > 1:
        axs = axs.ravel()
    else:
        # A single subplot comes back as a bare Axes, not an array
        axs = [axs]
    fig.suptitle('adapterID '+adaID, fontsize=20)

    for i, fragment in enumerate(fragments):
        ax = axs[i]
        ax.set_yscale('log')
        ax.set_title(fragment)
        if i in [0, 3]:
            ax.set_ylabel(r'$\nu$')
        if i > 2:
            ax.set_xlabel('Position')

        # Plot filtered
        nu_filtered_minor = nus_minor_filtered[fragment]
        ax.plot(nu_filtered_minor, lw=1.5, c='k', alpha=0.5, label='Filtered')
        ax.scatter(np.arange(len(nu_filtered_minor)), nu_filtered_minor,
                   lw=1.5, c='k', alpha=0.5)

        # Pad the x range around the fragment
        ax.set_xlim(-100, len(nu_filtered_minor) + 100)

    # No legend is drawn: each panel holds a single series
    plt.tight_layout(rect=(0, 0, 1, 0.95))

    if savefig:
        # NOTE(review): `fragment` is the value left by the plotting loop (the
        # last fragment); confirm whether the filename helper expects a single
        # fragment or the whole list of fragments.
        outputfile = gff(data_folder, adaID, fragment, only_filtered=True)
        fig.savefig(outputfile)
        plt.close(fig)
    else:
        plt.ion()
        plt.show()