def write_counts_files(data_folder, adaID, fragment, counts, inserts, coverage=None, VERBOSE=0):
    """Write allele counts, inserts, and coverage to file.

    Parameters:
        data_folder: forwarded unchanged to the ``get_*_filename`` helpers.
        adaID: adapter ID; concatenated into the log line, so it must be a
            string when ``VERBOSE >= 1``.
        fragment: fragment name; same remark as ``adaID``.
        counts: array-like object providing ``.sum(axis=...)`` and
            ``.dump(filename)``.
        inserts: mapping whose values are themselves mappings; every value is
            converted with ``dict()`` before pickling.
        coverage: object providing ``.dump(filename)``. If None, it is
            computed as ``counts.sum(axis=1)``.
        VERBOSE: verbosity level; at >= 1 a one-line message is printed.

    Returns:
        None. Three files are written, at the paths given by the
        ``get_allele_counts_filename``, ``get_coverage_filename`` and
        ``get_insert_counts_filename`` helpers.
    """
    if VERBOSE >= 1:
        # Single-argument parenthesised print: same output on Python 2 and 3
        print("Write to file: " + adaID + " " + fragment)

    if coverage is None:
        coverage = counts.sum(axis=1)

    # Save counts and coverage
    counts.dump(get_allele_counts_filename(data_folder, adaID, fragment))
    coverage.dump(get_coverage_filename(data_folder, adaID, fragment))

    # Convert inserts to normal nested dictionary for pickle
    inserts_dic = {k: dict(v) for (k, v) in inserts.items()}

    # protocol=-1 selects the highest (binary) pickle protocol, so the file
    # has to be opened in binary mode; text mode can corrupt the stream on
    # platforms that translate newlines.
    with open(get_insert_counts_filename(data_folder, adaID, fragment), "wb") as f:
        pickle.dump(inserts_dic, f, protocol=-1)
def write_counts_files(data_folder, adaID, fragment, counts, inserts, coverage=None, VERBOSE=0):
    '''Write allele counts, inserts, and coverage to file.

    Parameters:
        data_folder: forwarded unchanged to the get_*_filename helpers.
        adaID: adapter ID; concatenated into the log line, so it must be a
            string when VERBOSE >= 1.
        fragment: fragment name; same remark as adaID.
        counts: array-like object providing .sum(axis=...) and
            .dump(filename).
        inserts: mapping whose values are themselves mappings; every value is
            converted with dict() before pickling.
        coverage: object providing .dump(filename). If None, it is computed
            as counts.sum(axis=1).
        VERBOSE: verbosity level; at >= 1 a one-line message is printed.

    Returns:
        None. Three files are written, at the paths given by the
        get_allele_counts_filename, get_coverage_filename and
        get_insert_counts_filename helpers.
    '''
    if VERBOSE >= 1:
        # Single-argument parenthesised print: same output on Python 2 and 3
        print('Write to file: ' + adaID + ' ' + fragment)

    if coverage is None:
        coverage = counts.sum(axis=1)

    # Save counts and coverage
    counts.dump(get_allele_counts_filename(data_folder, adaID, fragment))
    coverage.dump(get_coverage_filename(data_folder, adaID, fragment))

    # Convert inserts to normal nested dictionary for pickle
    inserts_dic = {k: dict(v) for (k, v) in inserts.items()}

    # protocol=-1 selects the highest (binary) pickle protocol, so the file
    # has to be opened in binary mode; text mode can corrupt the stream on
    # platforms that translate newlines.
    with open(get_insert_counts_filename(data_folder, adaID, fragment), 'wb') as f:
        pickle.dump(inserts_dic, f, protocol=-1)
# NOTE(review): fragment of a larger routine whose header is outside this
# span. ali, refseq, consensusseq, name, frag_out, fragment,
# store_allele_counts and allele_counts are all defined before this point.
# The nesting below is reconstructed -- confirm against the full file.
if VERBOSE >= 2:
    # Show the first and last 30 alignment columns, the sequence lengths and
    # the fraction of columns at which rows 0 and 1 of the alignment differ
    print ali[:, :30]
    print ali[:, -30:]
    print 'Lenghts: ref', len(refseq), 'consensus', len(
        consensusseq)
    len_ali = ali.get_alignment_length()
    n_diff = sum(ali[0, i] != ali[1, i] for i in xrange(len_ali))
    print 'Differences from ref:', n_diff, '(' + '{:3.1f}'.format(
        100.0 * n_diff / len_ali) + '%)'

# Ungap consensus
# Row 1 of the alignment becomes the new consensus record
# (presumably row 0 is the reference -- TODO confirm)
consensusseq = SeqRecord(ali[1].seq, id=name, name=name)
if '-' in consensusseq:
    consensusseq.seq = consensusseq.seq.ungap('-')

# Write output
# NOTE(review): the consensus (and the allele counts below) are filed under
# frag_out, while the alignment is filed under fragment.
outfile = get_consensus_filename(data_folder, adaID, frag_out,
                                 trim_primers=True)
SeqIO.write(consensusseq, outfile, 'fasta')
AlignIO.write(
    ali,
    get_reference_consensus_ali_filename(data_folder, adaID, fragment),
    'fasta')

# Allele counts are only stored on request
if store_allele_counts:
    allele_counts.dump(
        get_allele_counts_filename(data_folder, adaID, frag_out))
# NOTE(review): fragment of a larger routine whose header is outside this
# span. refseq, consensusseq, name, frag_out, fragment, store_allele_counts
# and allele_counts are all defined before this point. The nesting below is
# reconstructed -- confirm against the full file.

# Align the reference and the consensus; row 0 is used as the reference and
# row 1 as the consensus below (presumably what sort=True ensures -- TODO
# confirm)
ali = align_muscle(refseq, consensusseq, sort=True)

# If row 0 ends with a gap, cut the alignment down to the columns between
# the leading and trailing gap runs of row 0. Leading gaps are only trimmed
# when a trailing gap is present.
if ali[0][-1] == '-':
    start_nongap = len(ali[0]) - len(ali[0].seq.lstrip('-'))
    end_nongap = len(ali[0].seq.rstrip('-'))
    ali = ali[:, start_nongap: end_nongap]

if VERBOSE >= 2:
    # Show the first and last 30 alignment columns, the sequence lengths and
    # the fraction of columns at which rows 0 and 1 differ
    print ali[:, :30]
    print ali[:, -30:]
    print 'Lenghts: ref', len(refseq), 'consensus', len(consensusseq)
    len_ali = ali.get_alignment_length()
    n_diff = sum(ali[0, i] != ali[1, i] for i in xrange(len_ali))
    print 'Differences from ref:', n_diff, '('+'{:3.1f}'.format(100.0 * n_diff / len_ali)+'%)'

# Ungap consensus
# Row 1 of the (possibly trimmed) alignment becomes the new consensus record
consensusseq = SeqRecord(ali[1].seq, id=name, name=name)
if '-' in consensusseq:
    consensusseq.seq = consensusseq.seq.ungap('-')

# Write output
# NOTE(review): the consensus (and the allele counts below) are filed under
# frag_out, while the alignment is filed under fragment.
outfile = get_consensus_filename(data_folder, adaID, frag_out, trim_primers=True)
SeqIO.write(consensusseq, outfile, 'fasta')
AlignIO.write(ali, get_reference_consensus_ali_filename(data_folder, adaID, fragment), 'fasta')

# Allele counts are only stored on request
if store_allele_counts:
    allele_counts.dump(get_allele_counts_filename(data_folder, adaID, frag_out))
def check_overlap_allele_frequencies(data_folder, adaID, frag1, frag2, overlap,
                                     VERBOSE=0, ax=None):
    '''Check biases in allele frequencies in the overlap of two fragments.

    Loads allele counts and coverage of both fragments, cuts them to the
    overlap, filters them with filter_nus, prints a 2x2 table of polymorphism
    calls and scatter-plots the frequencies of one fragment against the other
    on log-log axes.

    Parameters:
        data_folder: forwarded to the filename helpers.
        adaID: adapter ID; forwarded to the filename helpers, shown in titles.
        frag1, frag2: names (strings) of the two overlapping fragments.
        overlap: 3-tuple (start_s2, end_s1, ali). frag1 is cut from start_s2
            onwards, frag2 up to end_s1. The third item is not used here.
        VERBOSE: verbosity level; at >= 1 a skipped comparison is reported.
        ax: matplotlib axes to draw on. If None, a new figure is created and
            shown interactively.

    Returns:
        None. Returns early, without table or plot, if the two filtered
        frequency arrays have different shapes.
    '''
    (start_s2, end_s1, _ali) = overlap

    # Get allele counts and coverage
    cou1 = np.load(get_allele_counts_filename(data_folder, adaID, frag1))
    cov1 = np.load(get_coverage_filename(data_folder, adaID, frag1))
    cou2 = np.load(get_allele_counts_filename(data_folder, adaID, frag2))
    cov2 = np.load(get_coverage_filename(data_folder, adaID, frag2))

    # Cut the counts and coverage to the overlap region
    cou1 = cou1[:, :, start_s2:]
    cov1 = cov1[:, start_s2:]
    cou2 = cou2[:, :, :end_s1]
    cov2 = cov2[:, :end_s1]

    # Reduce the allele counts (fwd and rev have different status on most overlaps,
    # because of the uneven coverage)
    nu1 = filter_nus(cou1, cov1)
    nu2 = filter_nus(cou2, cov2)

    # FIXME: overlaps of different length cannot be compared position by position
    if nu1.shape != nu2.shape:
        if VERBOSE >= 1:
            print('Skip overlap ' + str(frag1) + ' - ' + str(frag2) +
                  ': allele frequency shapes differ')
        return

    # Calling thresholds: below nu_absent an allele counts as not seen (NO),
    # above nu_present as polymorphic (YES). The same pair is used for all
    # four cells; the YES/YES cell used to test nu1 against 3e-6.
    nu_absent = 1e-6
    nu_present = 3e-3
    no1, yes1 = nu1 < nu_absent, nu1 > nu_present
    no2, yes2 = nu2 < nu_absent, nu2 > nu_present

    # Print table of called polymorphisms (columns: frag1, rows: frag2)
    cell = '{:3d}'.format
    print(' '.join(['adaID', str(adaID), str(frag1), str(frag2),
                    'polymorphism matrix (NO | YES)']))
    print(' '.join([3 * ' ', '|', '{:^10s}'.format(frag1)]))
    print(15 * '-')
    print(' '.join([3 * ' ', '|',
                    cell((no1 & no2).sum()), '|',
                    cell((yes1 & no2).sum())]))
    print(' '.join(['{:3s}'.format(frag2), '+' + (5 * '-') + '+' + (4 * '-')]))
    print(' '.join([3 * ' ', '|',
                    cell((no1 & yes2).sum()), '|',
                    cell((yes1 & yes2).sum())]))

    # Plot scatter
    import matplotlib.pyplot as plt
    from matplotlib import cm

    if ax is None:
        show_plot = True
        fig, ax = plt.subplots(1, 1, figsize=(6, 6))
        ax.set_title('allele frequencies, adaID ' + str(adaID) + ', ' +
                     str(frag1) + ' - ' + str(frag2), fontsize=18)
    else:
        show_plot = False
        ax.set_title(str(frag1) + ' - ' + str(frag2), fontsize=18)

    # Shift by 1e-5 and take the absolute value so that zero frequencies
    # still land inside the logarithmic axes
    n_rows = len(nu1)
    colors = cm.jet([int(255.0 * i / n_rows) for i in range(n_rows)])
    ax.scatter(np.abs(nu1 - 1e-5), np.abs(nu2 - 1e-5), s=30, c=colors)

    # Plot diagonal
    ax.plot([1e-7, 2], [1e-7, 2], lw=1, c='k', ls='--')

    ax.set_xlabel(r'$\nu_1$', fontsize=20)
    ax.set_ylabel(r'$\nu_2$', fontsize=20)
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_xlim(0.7e-5, 1.2)
    ax.set_ylim(0.7e-5, 1.2)

    # Only a figure created here is laid out and shown here
    if show_plot:
        plt.tight_layout(w_pad=0.05)
        plt.ion()
        plt.show()
# Determine the overlap overlap = get_overlap(data_folder, adaID, frag1, frag2, VERBOSE=VERBOSE) # Check consensus is_diff = check_overlap_consensus(data_folder, adaID, frag1, frag2, overlap, VERBOSE=VERBOSE) if overlap is None: continue overlaps.append(((frag1, frag2), overlap)) # Check allele frequencies if present if not os.path.isfile(get_allele_counts_filename(data_folder, adaID, 'F1')): continue # Make unified figures import matplotlib.pyplot as plt fig, axs = plt.subplots(1, len(overlaps), figsize=(2+3.6*len(fragments), 6)) fig.suptitle('allele frequencies, adaID '+str(adaID), fontsize=18) if len(overlaps) == 1: axs = [axs] for (ax, ((frag1, frag2), overlap)) in izip(axs, overlaps): # Check allele frequencies check_overlap_allele_frequencies(data_folder, adaID, frag1, frag2, overlap, VERBOSE=VERBOSE, ax=ax) plt.tight_layout(rect=(0, 0, 1, 0.95))
# NOTE(review): script-level fragment, truncated at both ends. adaIDs,
# fragments, VERBOSE, data_folder and alpha are defined before this point.
# The nesting below is reconstructed, and the first print may belong to a
# conditional that is not visible -- confirm against the full file.
print 'adaIDs', adaIDs

# If the script is called with no fragment, iterate over all
if not fragments:
    fragments = ['F'+str(i) for i in xrange(1, 7)]
if VERBOSE >= 3:
    print 'fragments', fragments

# Iterate over samples and fragments
for adaID in adaIDs:
    for fragment in fragments:
        # Stored consensus, as an array with one entry per position
        consensus = SeqIO.read(get_consensus_filename(data_folder, adaID, fragment), 'fasta')
        cmat = np.array(consensus)

        # Filtered allele frequencies computed from the stored counts/coverage
        counts = np.load(get_allele_counts_filename(data_folder, adaID, fragment))
        coverage = np.load(get_coverage_filename(data_folder, adaID, fragment))
        nu = filter_nus(counts, coverage, VERBOSE=VERBOSE)

        # Note: not-covered positions are filtered, but argmax cannot work
        # with masked arrays
        # Majority allele per position; positions masked for every allele
        # are set to 'N'
        cmat_af = alpha[nu.argmax(axis=0)]
        if hasattr(nu, 'mask'):
            cmat_af[nu.mask.all(axis=0)] = 'N'

        # Check for consistency first
        # NOTE(review): the backslash inside the literal looks like a former
        # line continuation -- confirm the intended message text.
        if len(cmat) != len(cmat_af):
            print 'Consensus has a different length from allele frequency \ matrix... WTF?'

        # Do not actually align, it makes a huge mess (we miss mistakes)
def check_overlap_allele_frequencies(data_folder, adaID, frag1, frag2, overlap,
                                     VERBOSE=0, ax=None):
    """Check biases in allele frequencies in the overlap of two fragments.

    Loads allele counts and coverage of both fragments, cuts them to the
    overlap, filters them with ``filter_nus``, prints a 2x2 table of
    polymorphism calls and scatter-plots the frequencies of one fragment
    against the other on log-log axes.

    Parameters:
        data_folder: forwarded to the filename helpers.
        adaID: adapter ID; forwarded to the filename helpers, shown in titles.
        frag1, frag2: names (strings) of the two overlapping fragments.
        overlap: 3-tuple ``(start_s2, end_s1, ali)``. frag1 is cut from
            ``start_s2`` onwards, frag2 up to ``end_s1``. The third item is
            not used here.
        VERBOSE: verbosity level; at >= 1 a skipped comparison is reported.
        ax: matplotlib axes to draw on. If None, a new figure is created and
            shown interactively.

    Returns:
        None. Returns early, without table or plot, if the two filtered
        frequency arrays have different shapes.
    """
    (start_s2, end_s1, _ali) = overlap

    # Get allele counts and coverage
    cou1 = np.load(get_allele_counts_filename(data_folder, adaID, frag1))
    cov1 = np.load(get_coverage_filename(data_folder, adaID, frag1))
    cou2 = np.load(get_allele_counts_filename(data_folder, adaID, frag2))
    cov2 = np.load(get_coverage_filename(data_folder, adaID, frag2))

    # Cut the counts and coverage to the overlap region
    cou1 = cou1[:, :, start_s2:]
    cov1 = cov1[:, start_s2:]
    cou2 = cou2[:, :, :end_s1]
    cov2 = cov2[:, :end_s1]

    # Reduce the allele counts (fwd and rev have different status on most overlaps,
    # because of the uneven coverage)
    nu1 = filter_nus(cou1, cov1)
    nu2 = filter_nus(cou2, cov2)

    # FIXME: overlaps of different length cannot be compared position by position
    if nu1.shape != nu2.shape:
        if VERBOSE >= 1:
            print("Skip overlap " + str(frag1) + " - " + str(frag2) +
                  ": allele frequency shapes differ")
        return

    # Calling thresholds: below nu_absent an allele counts as not seen (NO),
    # above nu_present as polymorphic (YES). The same pair is used for all
    # four cells; the YES/YES cell used to test nu1 against 3e-6.
    nu_absent = 1e-6
    nu_present = 3e-3
    no1, yes1 = nu1 < nu_absent, nu1 > nu_present
    no2, yes2 = nu2 < nu_absent, nu2 > nu_present

    # Print table of called polymorphisms (columns: frag1, rows: frag2)
    cell = "{:3d}".format
    print(" ".join(["adaID", str(adaID), str(frag1), str(frag2),
                    "polymorphism matrix (NO | YES)"]))
    print(" ".join([3 * " ", "|", "{:^10s}".format(frag1)]))
    print(15 * "-")
    print(" ".join([3 * " ", "|",
                    cell((no1 & no2).sum()), "|",
                    cell((yes1 & no2).sum())]))
    print(" ".join(["{:3s}".format(frag2), "+" + (5 * "-") + "+" + (4 * "-")]))
    print(" ".join([3 * " ", "|",
                    cell((no1 & yes2).sum()), "|",
                    cell((yes1 & yes2).sum())]))

    # Plot scatter
    import matplotlib.pyplot as plt
    from matplotlib import cm

    if ax is None:
        show_plot = True
        fig, ax = plt.subplots(1, 1, figsize=(6, 6))
        ax.set_title("allele frequencies, adaID " + str(adaID) + ", " +
                     str(frag1) + " - " + str(frag2), fontsize=18)
    else:
        show_plot = False
        ax.set_title(str(frag1) + " - " + str(frag2), fontsize=18)

    # Shift by 1e-5 and take the absolute value so that zero frequencies
    # still land inside the logarithmic axes
    n_rows = len(nu1)
    colors = cm.jet([int(255.0 * i / n_rows) for i in range(n_rows)])
    ax.scatter(np.abs(nu1 - 1e-5), np.abs(nu2 - 1e-5), s=30, c=colors)

    # Plot diagonal
    ax.plot([1e-7, 2], [1e-7, 2], lw=1, c="k", ls="--")

    ax.set_xlabel(r"$\nu_1$", fontsize=20)
    ax.set_ylabel(r"$\nu_2$", fontsize=20)
    ax.set_xscale("log")
    ax.set_yscale("log")
    ax.set_xlim(0.7e-5, 1.2)
    ax.set_ylim(0.7e-5, 1.2)

    # Only a figure created here is laid out and shown here
    if show_plot:
        plt.tight_layout(w_pad=0.05)
        plt.ion()
        plt.show()
# NOTE(review): fragment. The `continue` after the pair loop targets an
# enclosing loop whose header is outside this span; pairs, fragments, adaID,
# data_folder and VERBOSE are defined before this point. The nesting below is
# reconstructed -- confirm against the full file.

# Collect ((frag1, frag2), overlap) for every pair that has an overlap
overlaps = []
for (frag1, frag2) in pairs:
    # Determine the overlap
    overlap = get_overlap(data_folder, adaID, frag1, frag2, VERBOSE=VERBOSE)

    # Check consensus
    # This runs before the None test, i.e. also when no overlap was found;
    # is_diff is not read in the visible code
    is_diff = check_overlap_consensus(data_folder, adaID, frag1, frag2, overlap, VERBOSE=VERBOSE)
    if overlap is None:
        continue
    overlaps.append(((frag1, frag2), overlap))

# Check allele frequencies if present
# The counts file of fragment "F1" alone decides whether anything is plotted
if not os.path.isfile(get_allele_counts_filename(data_folder, adaID, "F1")):
    continue

# Make unified figures
import matplotlib.pyplot as plt
# One panel per stored overlap.
# NOTE(review): the figure width scales with len(fragments), not len(overlaps).
fig, axs = plt.subplots(1, len(overlaps), figsize=(2 + 3.6 * len(fragments), 6))
fig.suptitle("allele frequencies, adaID " + str(adaID), fontsize=18)
# With a single panel plt.subplots returns a bare axes; wrap it for the loop
if len(overlaps) == 1:
    axs = [axs]
for (ax, ((frag1, frag2), overlap)) in izip(axs, overlaps):
    # Check allele frequencies
    check_overlap_allele_frequencies(data_folder, adaID, frag1, frag2, overlap, VERBOSE=VERBOSE, ax=ax)
# Leave the top 5% of the figure free for the suptitle
plt.tight_layout(rect=(0, 0, 1, 0.95))
def plot_minor_allele_frequency(data_folder, adaID, fragments, VERBOSE=0, savefig=False):
    '''Plot minor allele frequency along the genome.

    For every fragment one panel is drawn with the minor allele frequency of
    each read type, the filtered minor allele frequency and the curve
    1 / (total coverage), labelled "Detection limit".

    Parameters:
        data_folder: forwarded to the filename helpers.
        adaID: adapter ID (string; it is concatenated into the figure title).
        fragments: sequence of 1 to 6 fragment names; its length selects the
            subplot grid.
        VERBOSE: verbosity level (not used in this function).
        savefig: if True, save the figure and close it; otherwise show it
            interactively.

    Returns:
        None.
    '''
    from hivwholeseq.sequencing.filenames import get_minor_allele_frequency_figure_filename as gff
    import matplotlib
    # 'font.size' is the rcParam that replaced 'text.fontsize'; current
    # matplotlib rejects the old key.
    params = {'axes.labelsize': 20,
              'font.size': 20,
              'legend.fontsize': 8,
              'xtick.labelsize': 16,
              'ytick.labelsize': 16,
              'text.usetex': False}
    matplotlib.rcParams.update(params)
    from matplotlib import cm
    import matplotlib.pyplot as plt

    # (rows, columns) of the subplot grid, indexed by number of fragments - 1
    plot_grid = [(1, 1), (1, 2), (1, 3), (2, 2), (1, 5), (2, 3)]

    # Per-fragment data, keyed by fragment name
    covs = {}
    nus_minor = {}
    nus_minor_filtered = {}
    for fragment in fragments:
        coverage = np.load(get_coverage_filename(data_folder, adaID, fragment))
        covs[fragment] = coverage

        counts = np.load(get_allele_counts_filename(data_folder, adaID, fragment))
        (_, counts_minor, _) = get_minor_allele_counts(counts, n_minor=2)

        # Minor allele frequency per read type; the 1e-6 avoids a division
        # by zero at uncovered positions
        nus_minor[fragment] = 1.0 * counts_minor[:, :, 1] / (coverage + 1e-6)

        # Filter the minor frequencies by comparing the read types; use the
        # stored frequencies if present, otherwise compute them
        try:
            nu_filtered = np.load(get_allele_frequencies_filename(data_folder, adaID, fragment))
        except IOError:
            nu_filtered = filter_nus(counts, coverage)

        # Second-largest filtered frequency at every position
        nut = np.zeros(nu_filtered.shape[-1])
        for pos, nupos in enumerate(nu_filtered.T):
            nut[pos] = np.sort(nupos)[-2]
        nus_minor_filtered[fragment] = nut

    # Plot them
    (n_plots_y, n_plots_x) = plot_grid[len(fragments) - 1]
    fig, axs = plt.subplots(n_plots_y, n_plots_x, figsize=(13, 8))
    # With a single panel plt.subplots returns a bare axes
    if len(fragments) > 1:
        axs = axs.ravel()
    else:
        axs = [axs]
    fig.suptitle('adapterID '+adaID, fontsize=20)
    labss = {'read1 f': 'read1 fwd', 'read1 r': 'read1 rev',
             'read2 f': 'read2 fwd', 'read2 r': 'read2 rev'}
    for i, fragment in enumerate(fragments):
        ax = axs[i]
        ax.set_yscale('log')
        ax.set_title(fragment)
        if i in [0, 3]:
            ax.set_ylabel(r'$\nu$')
        if i > 2:
            ax.set_xlabel('Position')

        # Plot divided by readtype
        for js, nu_minorjs in enumerate(nus_minor[fragment]):
            color = cm.jet(int(255.0 * js / len(read_types)))
            ax.plot(nu_minorjs, lw=1.5, c=color, label=labss[read_types[js]])
            ax.scatter(np.arange(len(nu_minorjs)), nu_minorjs, lw=1.5,
                       color=color)

        # Plot filtered
        nu_filt = nus_minor_filtered[fragment]
        ax.plot(nu_filt, lw=1.5, c='k', alpha=0.5, label='Filtered')
        ax.scatter(np.arange(len(nu_filt)), nu_filt, lw=1.5, c='k', alpha=0.5)

        # Plot 1/max(coverage)
        cov_tot = covs[fragment].sum(axis=0)
        ax.plot(1.0 / cov_tot, lw=1.2, c='r', label='Detection limit')

        # Number of positions taken from the array itself rather than from
        # the variable left over by the read-type loop
        ax.set_xlim(-100, nus_minor[fragment].shape[-1] + 100)

    # NOTE(review): plt.grid / plt.legend act on the current axes only, and
    # their position relative to the loop is reconstructed -- confirm.
    plt.grid()
    plt.legend(loc='upper right')
    # Leave the top 5% of the figure free for the suptitle
    plt.tight_layout(rect=(0, 0, 1, 0.95))

    if savefig:
        # NOTE(review): the output name is built from the last fragment only
        outputfile = gff(data_folder, adaID, fragment)
        fig.savefig(outputfile)
        plt.close(fig)
    else:
        plt.ion()
        plt.show()
def plot_minor_allele_frequency_filtered(data_folder, adaID, fragments, VERBOSE=0, savefig=False):
    '''Plot the filtered minor allele frequency along the genome.

    For every fragment one panel is drawn with the second-largest filtered
    allele frequency at each position.

    Parameters:
        data_folder: forwarded to the filename helpers.
        adaID: adapter ID (string; it is concatenated into the figure title).
        fragments: sequence of 1 to 6 fragment names; its length selects the
            subplot grid.
        VERBOSE: verbosity level (not used in this function).
        savefig: if True, save the figure and close it; otherwise show it
            interactively.

    Returns:
        None.
    '''
    from hivwholeseq.sequencing.filenames import get_minor_allele_frequency_figure_filename as gff
    import matplotlib
    # 'font.size' is the rcParam that replaced 'text.fontsize'; current
    # matplotlib rejects the old key.
    params = {'axes.labelsize': 20,
              'font.size': 20,
              'legend.fontsize': 8,
              'xtick.labelsize': 16,
              'ytick.labelsize': 16,
              'text.usetex': False}
    matplotlib.rcParams.update(params)
    import matplotlib.pyplot as plt

    # Filtered minor allele frequencies, keyed by fragment name
    nus_minor_filtered = {}
    for fragment in fragments:
        coverage = np.load(get_coverage_filename(data_folder, adaID, fragment))

        # Use the stored frequencies if present, otherwise compute them
        try:
            nu_filtered = np.load(get_allele_frequencies_filename(data_folder, adaID, fragment))
        except IOError:
            counts = np.load(get_allele_counts_filename(data_folder, adaID, fragment))
            # Pass the coverage loaded above, as every other filter_nus call
            # in this file does
            nu_filtered = filter_nus(counts, coverage)

        # Second-largest filtered frequency at every position
        nut = np.zeros(nu_filtered.shape[-1])
        for pos, nupos in enumerate(nu_filtered.T):
            nut[pos] = np.sort(nupos)[-2]
        nus_minor_filtered[fragment] = nut

    # Plot them
    # (rows, columns) of the subplot grid, indexed by number of fragments - 1
    plot_grid = [(1, 1), (1, 2), (1, 3), (2, 2), (1, 5), (2, 3)]
    (n_plots_y, n_plots_x) = plot_grid[len(fragments) - 1]
    fig, axs = plt.subplots(n_plots_y, n_plots_x, figsize=(13, 8))
    # With a single panel plt.subplots returns a bare axes
    if len(fragments) > 1:
        axs = axs.ravel()
    else:
        axs = [axs]
    fig.suptitle('adapterID '+adaID, fontsize=20)
    for i, fragment in enumerate(fragments):
        ax = axs[i]
        ax.set_yscale('log')
        ax.set_title(fragment)
        if i in [0, 3]:
            ax.set_ylabel(r'$\nu$')
        if i > 2:
            ax.set_xlabel('Position')

        # Plot filtered
        nu_filt = nus_minor_filtered[fragment]
        ax.plot(nu_filt, lw=1.5, c='k', alpha=0.5, label='Filtered')
        ax.scatter(np.arange(len(nu_filt)), nu_filt, lw=1.5, c='k', alpha=0.5)

        ax.set_xlim(-100, len(nu_filt) + 100)

    # Leave the top 5% of the figure free for the suptitle
    plt.tight_layout(rect=(0, 0, 1, 0.95))

    if savefig:
        # NOTE(review): the output name is built from the last fragment only
        outputfile = gff(data_folder, adaID, fragment, only_filtered=True)
        fig.savefig(outputfile)
        plt.close(fig)
    else:
        plt.ion()
        plt.show()