fragments = ['F' + str(i) for i in xrange(1, 7)] if VERBOSE >= 3: print 'fragments', fragments # Iterate over samples and fragments for adaID in adaIDs: for fragment in fragments: consensus = SeqIO.read( get_consensus_filename(data_folder, adaID, fragment), 'fasta') cmat = np.array(consensus) counts = np.load( get_allele_counts_filename(data_folder, adaID, fragment)) coverage = np.load( get_coverage_filename(data_folder, adaID, fragment)) nu = filter_nus(counts, coverage, VERBOSE=VERBOSE) # Note: not-covered positions are filtered, but argmax cannot work # with masked arrays cmat_af = alpha[nu.argmax(axis=0)] if hasattr(nu, 'mask'): cmat_af[nu.mask.all(axis=0)] = 'N' # Check for consistency first if len(cmat) != len(cmat_af): print 'Consensus has a different length from allele frequency \ matrix... WTF?' # Do not actually align, it makes a huge mess (we miss mistakes) ali = [cmat, cmat_af] if (ali[0] != ali[1]).any():
# If the script is called with no fragment, iterate over all if not fragments: fragments = ['F'+str(i) for i in xrange(1, 7)] if VERBOSE >= 3: print 'fragments', fragments # Iterate over samples and fragments for adaID in adaIDs: for fragment in fragments: consensus = SeqIO.read(get_consensus_filename(data_folder, adaID, fragment), 'fasta') cmat = np.array(consensus) counts = np.load(get_allele_counts_filename(data_folder, adaID, fragment)) coverage = np.load(get_coverage_filename(data_folder, adaID, fragment)) nu = filter_nus(counts, coverage, VERBOSE=VERBOSE) # Note: not-covered positions are filtered, but argmax cannot work # with masked arrays cmat_af = alpha[nu.argmax(axis=0)] if hasattr(nu, 'mask'): cmat_af[nu.mask.all(axis=0)] = 'N' # Check for consistency first if len(cmat) != len(cmat_af): print 'Consensus has a different length from allele frequency \ matrix... WTF?' # Do not actually align, it makes a huge mess (we miss mistakes) ali = [cmat, cmat_af] if (ali[0] != ali[1]).any():
def check_overlap_allele_frequencies(data_folder, adaID, frag1, frag2, overlap,
                                     VERBOSE=0, ax=None):
    '''Check biases in allele frequencies in the overlap of two fragments.

    Loads the allele counts and coverage stored for frag1 and frag2, trims
    them to the overlap, reduces them with filter_nus, prints a 2x2 table of
    polymorphism calls and scatter-plots the values of one fragment against
    the other on log-log axes.

    Parameters:
       data_folder: passed through to the filename helpers
       adaID: sample identifier, passed through to the filename helpers and
              shown in the table header / plot title
       frag1, frag2: names of the two overlapping fragments; must be strings
                     because they are formatted with '{:^10s}' / '{:3s}'
       overlap: 3-tuple (start_s2, end_s1, ali). frag1 is kept from index
                start_s2 onwards and frag2 up to index end_s1 along the last
                axis. The third item is not used by this function.
       VERBOSE: verbosity level; at >= 1 a skipped overlap is reported
       ax: matplotlib axes to draw into. If None, a new figure is created,
           laid out and shown in interactive mode.

    Returns:
       None. If the two reduced arrays have different shapes the function
       returns early without printing the table or plotting.
    '''
    # Unpacking still enforces that overlap has exactly three items
    start_s2, end_s1, _ = overlap

    # Get allele counts and coverage
    cou1 = np.load(get_allele_counts_filename(data_folder, adaID, frag1))
    cov1 = np.load(get_coverage_filename(data_folder, adaID, frag1))
    cou2 = np.load(get_allele_counts_filename(data_folder, adaID, frag2))
    cov2 = np.load(get_coverage_filename(data_folder, adaID, frag2))

    # Cut the counts and coverage to the overlap region: the tail of frag1
    # and the head of frag2 (last axis)
    cou1 = cou1[:, :, start_s2:]
    cov1 = cov1[:, start_s2:]
    cou2 = cou2[:, :, :end_s1]
    cov2 = cov2[:, :end_s1]

    # Reduce the allele counts (fwd and rev have different status on most
    # overlaps, because of the uneven coverage)
    nu1 = filter_nus(cou1, cov1)
    nu2 = filter_nus(cou2, cov2)

    # FIXME: overlaps whose two sides differ in shape cannot be compared
    # element by element, so they are skipped
    if nu1.shape != nu2.shape:
        if VERBOSE >= 1:
            print(' '.join(['adaID', str(adaID), str(frag1), str(frag2),
                            'overlap shapes differ, skipping:',
                            str(nu1.shape), str(nu2.shape)]))
        return

    # Thresholds for the polymorphism table: below nu_absent counts as "NO",
    # above nu_called counts as "YES"; values in between are not counted.
    # The same pair is used for both fragments (the original YES/YES cell
    # compared nu1 against 3e-6, inconsistent with the other cells).
    nu_absent = 1e-6
    nu_called = 3e-3
    no1, yes1 = nu1 < nu_absent, nu1 > nu_called
    no2, yes2 = nu2 < nu_absent, nu2 > nu_called

    # Print table of called polymorphisms (columns: frag1, rows: frag2).
    # Each row is joined into one string so that the output is the same
    # whether print is a statement or a function.
    print(' '.join(['adaID', str(adaID), str(frag1), str(frag2),
                    'polymorphism matrix (NO | YES)']))
    print(' '.join([3 * ' ', '|', '{:^10s}'.format(frag1)]))
    print(15 * '-')
    print(' '.join([3 * ' ', '|',
                    '{:3d}'.format((no1 & no2).sum()), '|',
                    '{:3d}'.format((yes1 & no2).sum())]))
    print(' '.join(['{:3s}'.format(frag2), '+' + (5 * '-') + '+' + (4 * '-')]))
    print(' '.join([3 * ' ', '|',
                    '{:3d}'.format((no1 & yes2).sum()), '|',
                    '{:3d}'.format((yes1 & yes2).sum())]))

    # Plot scatter
    import matplotlib.pyplot as plt
    from matplotlib import cm

    show_plot = ax is None
    if show_plot:
        _, ax = plt.subplots(1, 1, figsize=(6, 6))
        ax.set_title('allele frequencies, adaID '+str(adaID)+', '+
                     str(frag1)+' - '+str(frag2), fontsize=18)
    else:
        ax.set_title(str(frag1)+' - '+str(frag2), fontsize=18)

    # One colormap entry per item along the first axis of nu1
    n_rows = len(nu1)
    colors = cm.jet([int(255.0 * i / n_rows) for i in range(n_rows)])

    # Offset by 1e-5 so that a value of exactly 0 lands at 1e-5, which is
    # inside the log-axis limits set below
    ax.scatter(np.abs(nu1 - 1e-5), np.abs(nu2 - 1e-5), s=30, c=colors)

    # Plot diagonal
    ax.plot([1e-7, 2], [1e-7, 2], lw=1, c='k', ls='--')

    ax.set_xlabel(r'$\nu_1$', fontsize=20)
    ax.set_ylabel(r'$\nu_2$', fontsize=20)
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_xlim(0.7e-5, 1.2)
    ax.set_ylim(0.7e-5, 1.2)

    # Only take over layout and display when the figure was created here
    if show_plot:
        plt.tight_layout(w_pad=0.05)
        plt.ion()
        plt.show()
def check_overlap_allele_frequencies(data_folder, adaID, frag1, frag2, overlap, VERBOSE=0, ax=None):
    """Check biases in allele frequencies in the overlap of two fragments.

    The allele counts and coverage saved for frag1 and frag2 are loaded,
    cut to the overlap and reduced with filter_nus. A 2x2 table of
    polymorphism calls is printed, then the values of the two fragments are
    scatter-plotted against each other on log-log axes.

    Parameters:
       data_folder: forwarded to the filename helpers
       adaID: sample identifier, forwarded to the filename helpers and shown
              in the table header / plot title
       frag1, frag2: names of the two overlapping fragments; they must be
                     strings, as they go through "{:^10s}" / "{:3s}"
       overlap: 3-tuple (start_s2, end_s1, ali). Along the last axis, frag1
                is kept from start_s2 onwards and frag2 up to end_s1. The
                third item is ignored here.
       VERBOSE: verbosity level; at >= 1 a skipped overlap is reported
       ax: matplotlib axes to draw into. With None, a new figure is created,
           laid out and shown in interactive mode.

    Returns:
       None. When the two reduced arrays differ in shape the function returns
       early, before the table and the plot.
    """
    # Keep the 3-item unpacking so a malformed overlap still fails loudly
    start_s2, end_s1, _ = overlap

    # Get allele counts and coverage
    cou1 = np.load(get_allele_counts_filename(data_folder, adaID, frag1))
    cov1 = np.load(get_coverage_filename(data_folder, adaID, frag1))
    cou2 = np.load(get_allele_counts_filename(data_folder, adaID, frag2))
    cov2 = np.load(get_coverage_filename(data_folder, adaID, frag2))

    # Cut the counts and coverage to the overlap region: end of frag1,
    # beginning of frag2 (last axis)
    cou1 = cou1[:, :, start_s2:]
    cov1 = cov1[:, start_s2:]
    cou2 = cou2[:, :, :end_s1]
    cov2 = cov2[:, :end_s1]

    # Reduce the allele counts (fwd and rev have different status on most overlaps,
    # because of the uneven coverage)
    nu1 = filter_nus(cou1, cov1)
    nu2 = filter_nus(cou2, cov2)

    # FIXME: the two sides must have the same shape to be compared element
    # by element; mismatching overlaps are skipped
    if nu1.shape != nu2.shape:
        if VERBOSE >= 1:
            print(
                " ".join(
                    [
                        "adaID",
                        str(adaID),
                        str(frag1),
                        str(frag2),
                        "overlap shapes differ, skipping:",
                        str(nu1.shape),
                        str(nu2.shape),
                    ]
                )
            )
        return

    # Table thresholds: "NO" means below nu_absent, "YES" means above
    # nu_called; anything in between is left out of the table. Both fragments
    # use the same pair (the YES/YES cell used to test nu1 against 3e-6,
    # unlike every other cell).
    nu_absent = 1e-6
    nu_called = 3e-3
    no1, yes1 = nu1 < nu_absent, nu1 > nu_called
    no2, yes2 = nu2 < nu_absent, nu2 > nu_called

    # Print table of called polymorphisms (columns: frag1, rows: frag2).
    # Rows are joined into a single string first, so the text printed does
    # not depend on print being a statement or a function.
    header = ["adaID", str(adaID), str(frag1), str(frag2), "polymorphism matrix (NO | YES)"]
    print(" ".join(header))
    print(" ".join([3 * " ", "|", "{:^10s}".format(frag1)]))
    print(15 * "-")
    print(" ".join([3 * " ", "|", "{:3d}".format((no1 & no2).sum()), "|", "{:3d}".format((yes1 & no2).sum())]))
    print(" ".join(["{:3s}".format(frag2), "+" + (5 * "-") + "+" + (4 * "-")]))
    print(" ".join([3 * " ", "|", "{:3d}".format((no1 & yes2).sum()), "|", "{:3d}".format((yes1 & yes2).sum())]))

    # Plot scatter
    import matplotlib.pyplot as plt
    from matplotlib import cm

    show_plot = ax is None
    if show_plot:
        _, ax = plt.subplots(1, 1, figsize=(6, 6))
        ax.set_title("allele frequencies, adaID " + str(adaID) + ", " + str(frag1) + " - " + str(frag2), fontsize=18)
    else:
        ax.set_title(str(frag1) + " - " + str(frag2), fontsize=18)

    # One colormap entry for each item along the first axis of nu1
    n_rows = len(nu1)
    colors = cm.jet([int(255.0 * i / n_rows) for i in range(n_rows)])

    # The 1e-5 offset puts a value of exactly 0 at 1e-5, i.e. within the
    # log-axis limits chosen below
    ax.scatter(np.abs(nu1 - 1e-5), np.abs(nu2 - 1e-5), s=30, c=colors)

    # Plot diagonal
    ax.plot([1e-7, 2], [1e-7, 2], lw=1, c="k", ls="--")

    ax.set_xlabel(r"$\nu_1$", fontsize=20)
    ax.set_ylabel(r"$\nu_2$", fontsize=20)
    ax.set_xscale("log")
    ax.set_yscale("log")
    ax.set_xlim(0.7e-5, 1.2)
    ax.set_ylim(0.7e-5, 1.2)

    # Layout and display are handled here only for a figure created here
    if show_plot:
        plt.tight_layout(w_pad=0.05)
        plt.ion()
        plt.show()