コード例 #1
0
        fragments = ['F' + str(i) for i in xrange(1, 7)]
    if VERBOSE >= 3:
        print 'fragments', fragments

    # Iterate over samples and fragments
    for adaID in adaIDs:
        for fragment in fragments:
            consensus = SeqIO.read(
                get_consensus_filename(data_folder, adaID, fragment), 'fasta')
            cmat = np.array(consensus)

            counts = np.load(
                get_allele_counts_filename(data_folder, adaID, fragment))
            coverage = np.load(
                get_coverage_filename(data_folder, adaID, fragment))
            nu = filter_nus(counts, coverage, VERBOSE=VERBOSE)

            # Note: not-covered positions are filtered, but argmax cannot work
            # with masked arrays
            cmat_af = alpha[nu.argmax(axis=0)]
            if hasattr(nu, 'mask'):
                cmat_af[nu.mask.all(axis=0)] = 'N'

            # Check for consistency first
            if len(cmat) != len(cmat_af):
                print 'Consensus has a different length from allele frequency \
                        matrix... WTF?'

            # Do not actually align, it makes a huge mess (we miss mistakes)
            ali = [cmat, cmat_af]
            if (ali[0] != ali[1]).any():
コード例 #2
0
    # If the script is called with no fragment, iterate over all
    if not fragments:
        fragments = ['F'+str(i) for i in xrange(1, 7)]
    if VERBOSE >= 3:
        print 'fragments', fragments

    # Iterate over samples and fragments
    for adaID in adaIDs:
        for fragment in fragments:
            consensus = SeqIO.read(get_consensus_filename(data_folder, adaID, fragment),
                                   'fasta')
            cmat = np.array(consensus)

            counts = np.load(get_allele_counts_filename(data_folder, adaID, fragment))
            coverage = np.load(get_coverage_filename(data_folder, adaID, fragment))
            nu = filter_nus(counts, coverage, VERBOSE=VERBOSE)

            # Note: not-covered positions are filtered, but argmax cannot work
            # with masked arrays
            cmat_af = alpha[nu.argmax(axis=0)]
            if hasattr(nu, 'mask'):
                cmat_af[nu.mask.all(axis=0)] = 'N'

            # Check for consistency first
            if len(cmat) != len(cmat_af):
                print 'Consensus has a different length from allele frequency \
                        matrix... WTF?'

            # Do not actually align, it makes a huge mess (we miss mistakes)
            ali = [cmat, cmat_af]
            if (ali[0] != ali[1]).any():
コード例 #3
0
def check_overlap_allele_frequencies(data_folder, adaID, frag1, frag2, overlap,
                                     VERBOSE=0, ax=None):
    '''Check biases in allele frequencies in the overlap of two fragments.

    Loads allele counts and coverage of both fragments, restricts them to the
    overlap, prints a 2x2 table counting the entries called polymorphic in
    neither, one, or both fragments, and scatter-plots the two sets of
    frequencies against each other on log-log axes.

    Parameters:
       data_folder, adaID: forwarded to the filename helpers to locate the
          allele count and coverage files.
       frag1, frag2: names of the two overlapping fragments; they must be
          strings, since they are formatted with an 's' format spec.
       overlap: 3-tuple (start_s2, end_s1, ali). frag1 is kept from index
          start_s2 onwards and frag2 up to index end_s1 along the last axis;
          the third item is not used here.
       VERBOSE: accepted for interface compatibility, not used here.
       ax: matplotlib axes to draw on. If None, a new figure is created and
          shown in interactive mode.

    Returns:
       None. Nothing is printed or plotted when the filtered frequencies of
       the two fragments have different shapes.
    '''
    (start_s2, end_s1, _) = overlap

    # Get allele counts and coverage
    cou1 = np.load(get_allele_counts_filename(data_folder, adaID, frag1))
    cov1 = np.load(get_coverage_filename(data_folder, adaID, frag1))
    cou2 = np.load(get_allele_counts_filename(data_folder, adaID, frag2))
    cov2 = np.load(get_coverage_filename(data_folder, adaID, frag2))

    # Cut the counts and coverage to the overlap region: the tail of frag1
    # and the head of frag2 (last axis)
    cou1 = cou1[:, :, start_s2:]
    cov1 = cov1[:, start_s2:]
    cou2 = cou2[:, :, :end_s1]
    cov2 = cov2[:, :end_s1]

    # Reduce the allele counts (fwd and rev have different status on most overlaps,
    # because of the uneven coverage)
    nu1 = filter_nus(cou1, cov1)
    nu2 = filter_nus(cou2, cov2)

    # FIXME: overlaps whose two sides end up with different shapes cannot be
    # compared entry by entry and are skipped silently
    if nu1.shape != nu2.shape:
        return

    # Frequencies below nu_absent count as "NO" (allele not seen), frequencies
    # above nu_called as "YES" (called polymorphism); values in between are
    # left out of the table. The same pair of thresholds is applied to both
    # fragments in every cell.
    nu_absent = 1e-6
    nu_called = 3e-3
    no1, yes1 = nu1 < nu_absent, nu1 > nu_called
    no2, yes2 = nu2 < nu_absent, nu2 > nu_called

    # Print table of called polymorphisms. Each line is assembled into a single
    # string first, so the output is the same with the Python 2 print
    # statement and the Python 3 print function.
    pad = 3 * ' '
    print(' '.join(['adaID', str(adaID), str(frag1), str(frag2),
                    'polymorphism matrix (NO | YES)']))
    print(' '.join([pad, '|', '{:^10s}'.format(frag1)]))
    print(15 * '-')
    print(' '.join([pad, '|',
                    '{:3d}'.format((no1 & no2).sum()), '|',
                    '{:3d}'.format((yes1 & no2).sum())]))
    print(' '.join(['{:3s}'.format(frag2), '+' + (5 * '-') + '+' + (4 * '-')]))
    print(' '.join([pad, '|',
                    '{:3d}'.format((no1 & yes2).sum()), '|',
                    '{:3d}'.format((yes1 & yes2).sum())]))

    # Plot scatter
    import matplotlib.pyplot as plt
    from matplotlib import cm
    show_plot = ax is None
    if show_plot:
        _, ax = plt.subplots(1, 1, figsize=(6, 6))
        ax.set_title('allele frequencies, adaID '+str(adaID)+', '+\
                     str(frag1)+' - '+str(frag2),
                     fontsize=18)
    else:
        ax.set_title(str(frag1)+' - '+str(frag2), fontsize=18)

    # |nu - 1e-5| puts zero frequencies at 1e-5, i.e. inside the log-scale
    # axis limits set below. Colors step through the jet colormap along the
    # first axis of nu1.
    n_colors = len(nu1)
    ax.scatter(np.abs(nu1 - 1e-5), np.abs(nu2 - 1e-5), s=30,
               c=cm.jet([int(255.0 * i / n_colors) for i in range(n_colors)]))
    # Plot diagonal
    ax.plot([1e-7, 2], [1e-7, 2], lw=1, c='k', ls='--')
    ax.set_xlabel(r'$\nu_1$', fontsize=20)
    ax.set_ylabel(r'$\nu_2$', fontsize=20)
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_xlim(0.7e-5, 1.2)
    ax.set_ylim(0.7e-5, 1.2)

    # Only show the figure when it was created here; a caller-supplied axes
    # is left for the caller to display
    if show_plot:
        plt.tight_layout(w_pad=0.05)
        plt.ion()
        plt.show()
コード例 #4
0
def check_overlap_allele_frequencies(data_folder, adaID, frag1, frag2, overlap, VERBOSE=0, ax=None):
    """Check biases in allele frequencies in the overlap of two fragments.

    Loads allele counts and coverage of both fragments, restricts them to the
    overlap, prints a 2x2 table counting the entries called polymorphic in
    neither, one, or both fragments, and scatter-plots the two sets of
    frequencies against each other on log-log axes.

    Parameters:
       data_folder, adaID: forwarded to the filename helpers to locate the
          allele count and coverage files.
       frag1, frag2: names of the two overlapping fragments; they must be
          strings, since they are formatted with an "s" format spec.
       overlap: 3-tuple (start_s2, end_s1, ali). frag1 is kept from index
          start_s2 onwards and frag2 up to index end_s1 along the last axis;
          the third item is not used here.
       VERBOSE: accepted for interface compatibility, not used here.
       ax: matplotlib axes to draw on. If None, a new figure is created and
          shown in interactive mode.

    Returns:
       None. Nothing is printed or plotted when the filtered frequencies of
       the two fragments have different shapes.
    """
    (start_s2, end_s1, _) = overlap

    # Get allele counts and coverage
    cou1 = np.load(get_allele_counts_filename(data_folder, adaID, frag1))
    cov1 = np.load(get_coverage_filename(data_folder, adaID, frag1))
    cou2 = np.load(get_allele_counts_filename(data_folder, adaID, frag2))
    cov2 = np.load(get_coverage_filename(data_folder, adaID, frag2))

    # Cut the counts and coverage to the overlap region: the tail of frag1
    # and the head of frag2 (last axis)
    cou1 = cou1[:, :, start_s2:]
    cov1 = cov1[:, start_s2:]
    cou2 = cou2[:, :, :end_s1]
    cov2 = cov2[:, :end_s1]

    # Reduce the allele counts (fwd and rev have different status on most overlaps,
    # because of the uneven coverage)
    nu1 = filter_nus(cou1, cov1)
    nu2 = filter_nus(cou2, cov2)

    # FIXME: overlaps whose two sides end up with different shapes cannot be
    # compared entry by entry and are skipped silently
    if nu1.shape != nu2.shape:
        return

    # Frequencies below nu_absent count as "NO" (allele not seen), frequencies
    # above nu_called as "YES" (called polymorphism); values in between are
    # left out of the table. The same pair of thresholds is applied to both
    # fragments in every cell.
    nu_absent = 1e-6
    nu_called = 3e-3
    no1, yes1 = nu1 < nu_absent, nu1 > nu_called
    no2, yes2 = nu2 < nu_absent, nu2 > nu_called

    # Print table of called polymorphisms. Each line is assembled into a single
    # string first, so the output is the same with the Python 2 print
    # statement and the Python 3 print function.
    pad = 3 * " "
    print(" ".join(["adaID", str(adaID), str(frag1), str(frag2), "polymorphism matrix (NO | YES)"]))
    print(" ".join([pad, "|", "{:^10s}".format(frag1)]))
    print(15 * "-")
    print(" ".join([pad, "|", "{:3d}".format((no1 & no2).sum()), "|", "{:3d}".format((yes1 & no2).sum())]))
    print(" ".join(["{:3s}".format(frag2), "+" + (5 * "-") + "+" + (4 * "-")]))
    print(" ".join([pad, "|", "{:3d}".format((no1 & yes2).sum()), "|", "{:3d}".format((yes1 & yes2).sum())]))

    # Plot scatter
    import matplotlib.pyplot as plt
    from matplotlib import cm

    show_plot = ax is None
    if show_plot:
        _, ax = plt.subplots(1, 1, figsize=(6, 6))
        ax.set_title("allele frequencies, adaID " + str(adaID) + ", " + str(frag1) + " - " + str(frag2), fontsize=18)
    else:
        ax.set_title(str(frag1) + " - " + str(frag2), fontsize=18)

    # |nu - 1e-5| puts zero frequencies at 1e-5, i.e. inside the log-scale
    # axis limits set below. Colors step through the jet colormap along the
    # first axis of nu1.
    n_colors = len(nu1)
    ax.scatter(
        np.abs(nu1 - 1e-5),
        np.abs(nu2 - 1e-5),
        s=30,
        c=cm.jet([int(255.0 * i / n_colors) for i in range(n_colors)]),
    )
    # Plot diagonal
    ax.plot([1e-7, 2], [1e-7, 2], lw=1, c="k", ls="--")
    ax.set_xlabel(r"$\nu_1$", fontsize=20)
    ax.set_ylabel(r"$\nu_2$", fontsize=20)
    ax.set_xscale("log")
    ax.set_yscale("log")
    ax.set_xlim(0.7e-5, 1.2)
    ax.set_ylim(0.7e-5, 1.2)

    # Only show the figure when it was created here; a caller-supplied axes
    # is left for the caller to display
    if show_plot:
        plt.tight_layout(w_pad=0.05)
        plt.ion()
        plt.show()