def write_counts_files(data_folder, adaID, fragment, counts, inserts, coverage=None, VERBOSE=0):
    """Write allele counts, inserts, and coverage to file"""
    if VERBOSE >= 1:
        print "Write to file: " + adaID + " " + fragment

    if coverage is None:
        coverage = counts.sum(axis=1)

    # Save counts and coverage
    counts.dump(get_allele_counts_filename(data_folder, adaID, fragment))
    coverage.dump(get_coverage_filename(data_folder, adaID, fragment))

    # Convert inserts to normal nested dictionary for pickle
    inserts_dic = {k: dict(v) for (k, v) in inserts.iteritems()}
    with open(get_insert_counts_filename(data_folder, adaID, fragment), "w") as f:
        pickle.dump(inserts_dic, f, protocol=-1)
def write_counts_files(data_folder,
                       adaID,
                       fragment,
                       counts,
                       inserts,
                       coverage=None,
                       VERBOSE=0):
    '''Write allele counts, inserts, and coverage to file'''
    if VERBOSE >= 1:
        print 'Write to file: ' + adaID + ' ' + fragment

    if coverage is None:
        coverage = counts.sum(axis=1)

    # Save counts and coverage
    counts.dump(get_allele_counts_filename(data_folder, adaID, fragment))
    coverage.dump(get_coverage_filename(data_folder, adaID, fragment))

    # Convert inserts to normal nested dictionary for pickle
    inserts_dic = {k: dict(v) for (k, v) in inserts.iteritems()}
    with open(get_insert_counts_filename(data_folder, adaID, fragment),
              'w') as f:
        pickle.dump(inserts_dic, f, protocol=-1)
    # NOTE(review): body fragment of a function whose header is not visible
    # here; `fragments`, `adaIDs`, `alpha`, `data_folder` and `VERBOSE` come
    # from that enclosing scope.
    # With no fragments given, fall back to the names F1 ... F6
    if not fragments:
        fragments = ['F' + str(i) for i in xrange(1, 7)]
    if VERBOSE >= 3:
        print 'fragments', fragments

    # Iterate over samples and fragments
    for adaID in adaIDs:
        for fragment in fragments:
            # Read the stored consensus and turn it into an array
            consensus = SeqIO.read(
                get_consensus_filename(data_folder, adaID, fragment), 'fasta')
            cmat = np.array(consensus)

            # Load counts and coverage and reduce them with filter_nus
            counts = np.load(
                get_allele_counts_filename(data_folder, adaID, fragment))
            coverage = np.load(
                get_coverage_filename(data_folder, adaID, fragment))
            nu = filter_nus(counts, coverage, VERBOSE=VERBOSE)

            # Note: not-covered positions are filtered, but argmax cannot work
            # with masked arrays
            # Pick, for every column of nu, the entry of `alpha` with the
            # largest value
            cmat_af = alpha[nu.argmax(axis=0)]
            # Columns that are masked for every row are set to 'N'
            if hasattr(nu, 'mask'):
                cmat_af[nu.mask.all(axis=0)] = 'N'

            # Check for consistency first
            if len(cmat) != len(cmat_af):
                print 'Consensus has a different length from allele frequency \
                        matrix... WTF?'

            # Do not actually align, it makes a huge mess (we miss mistakes)
            # The two sequences are simply paired up position by position
            ali = [cmat, cmat_af]
Exemple #4
0
def check_overlap_allele_frequencies(data_folder, adaID, frag1, frag2, overlap,
                                     VERBOSE=0, ax=None):
    '''Check biases in allele frequencies in the overlap'''
    (start_s2, end_s1, ali) = overlap

    # Get allele counts and coverage
    cou1 = np.load(get_allele_counts_filename(data_folder, adaID, frag1))
    cov1 = np.load(get_coverage_filename(data_folder, adaID, frag1))
    cou2 = np.load(get_allele_counts_filename(data_folder, adaID, frag2))
    cov2 = np.load(get_coverage_filename(data_folder, adaID, frag2))

    # Cut the counts and coverage to the overlap region
    cou1 = cou1[:, :, start_s2:]
    cov1 = cov1[:, start_s2:]
    cou2 = cou2[:, :, :end_s1]
    cov2 = cov2[:, :end_s1]

    # Reduce the allele counts (fwd and rev have different status on most overlaps,
    # because of the uneven coverage)
    nu1 = filter_nus(cou1, cov1)
    nu2 = filter_nus(cou2, cov2)

    # FIXME
    if nu1.shape != nu2.shape:
        return

    # Print table of called polymorphisms
    print 'adaID', adaID, frag1, frag2, 'polymorphism matrix (NO | YES)'
    print 3 * ' ', '|', '{:^10s}'.format(frag1)
    print 15 * '-'
    print 3 * ' ', '|', \
            '{:3d}'.format(((nu1 < 1e-6) & (nu2 < 1e-6)).sum()), '|', \
            '{:3d}'.format(((nu1 > 3e-3) & (nu2 < 1e-6)).sum())
    print '{:3s}'.format(frag2), '+'+(5*'-')+'+'+(4*'-')
    print 3 * ' ', '|', \
            '{:3d}'.format(((nu1 < 1e-6) & (nu2 > 3e-3)).sum()), '|', \
            '{:3d}'.format(((nu1 > 3e-6) & (nu2 > 3e-3)).sum())
         

    # Plot scatter
    import matplotlib.pyplot as plt
    from matplotlib import cm
    if ax is None:
        show_plot = True
        fig, ax = plt.subplots(1, 1, figsize=(6, 6))
        ax.set_title('allele frequencies, adaID '+str(adaID)+', '+\
                     str(frag1)+' - '+str(frag2),
                     fontsize=18)
    else:
        show_plot = False
        ax.set_title(str(frag1)+' - '+str(frag2), fontsize=18)
    ax.scatter(np.abs(nu1 - 1e-5), np.abs(nu2 - 1e-5), s=30,
               c=cm.jet([int(255.0 * i / len(nu1)) for i in xrange(len(nu1))]))
    # Plot diagonal
    ax.plot([1e-7, 2], [1e-7, 2], lw=1, c='k', ls='--')
    ax.set_xlabel(r'$\nu_1$', fontsize=20)
    ax.set_ylabel(r'$\nu_2$', fontsize=20)
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_xlim(0.7e-5, 1.2)
    ax.set_ylim(0.7e-5, 1.2)

    if show_plot:
        plt.tight_layout(w_pad=0.05)
        plt.ion()
        plt.show()
    # NOTE(review): body fragment of a function whose header is not visible
    # here; `fragments`, `adaIDs`, `alpha`, `data_folder` and `VERBOSE` come
    # from that enclosing scope.
    # If the script is called with no fragment, iterate over all
    # (the default names are F1 ... F6)
    if not fragments:
        fragments = ['F'+str(i) for i in xrange(1, 7)]
    if VERBOSE >= 3:
        print 'fragments', fragments

    # Iterate over samples and fragments
    for adaID in adaIDs:
        for fragment in fragments:
            # Read the stored consensus and turn it into an array
            consensus = SeqIO.read(get_consensus_filename(data_folder, adaID, fragment),
                                   'fasta')
            cmat = np.array(consensus)

            # Load counts and coverage and reduce them with filter_nus
            counts = np.load(get_allele_counts_filename(data_folder, adaID, fragment))
            coverage = np.load(get_coverage_filename(data_folder, adaID, fragment))
            nu = filter_nus(counts, coverage, VERBOSE=VERBOSE)

            # Note: not-covered positions are filtered, but argmax cannot work
            # with masked arrays
            # Pick, for every column of nu, the entry of `alpha` with the
            # largest value
            cmat_af = alpha[nu.argmax(axis=0)]
            # Columns that are masked for every row are set to 'N'
            if hasattr(nu, 'mask'):
                cmat_af[nu.mask.all(axis=0)] = 'N'

            # Check for consistency first
            if len(cmat) != len(cmat_af):
                print 'Consensus has a different length from allele frequency \
                        matrix... WTF?'

            # Do not actually align, it makes a huge mess (we miss mistakes)
            # The two sequences are simply paired up position by position
            ali = [cmat, cmat_af]
def check_overlap_allele_frequencies(data_folder, adaID, frag1, frag2, overlap, VERBOSE=0, ax=None):
    """Check biases in allele frequencies in the overlap"""
    (start_s2, end_s1, ali) = overlap

    # Get allele counts and coverage
    cou1 = np.load(get_allele_counts_filename(data_folder, adaID, frag1))
    cov1 = np.load(get_coverage_filename(data_folder, adaID, frag1))
    cou2 = np.load(get_allele_counts_filename(data_folder, adaID, frag2))
    cov2 = np.load(get_coverage_filename(data_folder, adaID, frag2))

    # Cut the counts and coverage to the overlap region
    cou1 = cou1[:, :, start_s2:]
    cov1 = cov1[:, start_s2:]
    cou2 = cou2[:, :, :end_s1]
    cov2 = cov2[:, :end_s1]

    # Reduce the allele counts (fwd and rev have different status on most overlaps,
    # because of the uneven coverage)
    nu1 = filter_nus(cou1, cov1)
    nu2 = filter_nus(cou2, cov2)

    # FIXME
    if nu1.shape != nu2.shape:
        return

    # Print table of called polymorphisms
    print "adaID", adaID, frag1, frag2, "polymorphism matrix (NO | YES)"
    print 3 * " ", "|", "{:^10s}".format(frag1)
    print 15 * "-"
    print 3 * " ", "|", "{:3d}".format(((nu1 < 1e-6) & (nu2 < 1e-6)).sum()), "|", "{:3d}".format(
        ((nu1 > 3e-3) & (nu2 < 1e-6)).sum()
    )
    print "{:3s}".format(frag2), "+" + (5 * "-") + "+" + (4 * "-")
    print 3 * " ", "|", "{:3d}".format(((nu1 < 1e-6) & (nu2 > 3e-3)).sum()), "|", "{:3d}".format(
        ((nu1 > 3e-6) & (nu2 > 3e-3)).sum()
    )

    # Plot scatter
    import matplotlib.pyplot as plt
    from matplotlib import cm

    if ax is None:
        show_plot = True
        fig, ax = plt.subplots(1, 1, figsize=(6, 6))
        ax.set_title("allele frequencies, adaID " + str(adaID) + ", " + str(frag1) + " - " + str(frag2), fontsize=18)
    else:
        show_plot = False
        ax.set_title(str(frag1) + " - " + str(frag2), fontsize=18)
    ax.scatter(
        np.abs(nu1 - 1e-5), np.abs(nu2 - 1e-5), s=30, c=cm.jet([int(255.0 * i / len(nu1)) for i in xrange(len(nu1))])
    )
    # Plot diagonal
    ax.plot([1e-7, 2], [1e-7, 2], lw=1, c="k", ls="--")
    ax.set_xlabel(r"$\nu_1$", fontsize=20)
    ax.set_ylabel(r"$\nu_2$", fontsize=20)
    ax.set_xscale("log")
    ax.set_yscale("log")
    ax.set_xlim(0.7e-5, 1.2)
    ax.set_ylim(0.7e-5, 1.2)

    if show_plot:
        plt.tight_layout(w_pad=0.05)
        plt.ion()
        plt.show()
            # NOTE(review): body fragment of a function whose header and
            # enclosing loops are not visible here.
            # Skip this group of reads when the first read's isize is below 300
            if reads[0].isize < 300:
                continue

            # Count the reads whose sequence contains each primer
            for read in reads:
                if read.seq.find(primer_new) != -1:
                    cov_new += 1
                if read.seq.find(primer_old) != -1:
                    cov_old += 1

        print 'old:', cov_old, 'new:', cov_new

        bamfile.close()

        # Get coverage and see
        covfn = get_coverage_filename(data_folder, adaID, fragment)
        cov = np.load(covfn)

        import matplotlib.pyplot as plt
        import matplotlib.cm as cm

        # One curve per read type, each with its own color from the jet colormap
        for js, read_type in enumerate(read_types):
            plt.plot(np.arange(cov.shape[1]), cov[js], lw=2,
                     c=cm.jet(int(255.0 * js / len(read_types))))

        plt.xlabel('Position [bases]')
        plt.title(str(adaID)+' '+fragment)
        plt.ylabel('Coverage')

        # Turn on interactive mode and display the figure
        plt.ion()
        plt.show()
Exemple #8
0
def plot_minor_allele_frequency(data_folder, adaID, fragments, VERBOSE=0,
                                savefig=False):
    '''Plot minor allele frequency along the genome

    For every fragment one panel is drawn with the minor allele frequency of
    each read type, the filtered minor allele frequency, and the detection
    limit 1 / (total coverage).

    Parameters:
       data_folder, adaID: forwarded to the filename helpers; adaID is also
                           concatenated into the figure title
       fragments: sequence of fragment names; the panel layout table covers
                  1 to 6 fragments
       VERBOSE: accepted but not used by this function
       savefig: if True the figure is saved and closed, otherwise it is shown
                in interactive mode

    Returns:
       None
    '''
    from hivwholeseq.sequencing.filenames import get_minor_allele_frequency_figure_filename as gff
    import matplotlib
    # 'font.size' is the current name of the former 'text.fontsize' setting,
    # which newer matplotlib versions no longer accept
    params = {'axes.labelsize': 20,
              'font.size': 20,
              'legend.fontsize': 8,
              'xtick.labelsize': 16,
              'ytick.labelsize': 16,
              'text.usetex': False}
    matplotlib.rcParams.update(params)
    from matplotlib import cm
    import matplotlib.pyplot as plt

    # Panel layout (rows, columns) indexed by number of fragments - 1
    plot_grid = [(1, 1), (1, 2), (1, 3), (2, 2), (1, 5), (2, 3)]

    # Per-fragment data needed by the plotting loop
    covs = {}
    nus_minor = {}
    nus_minor_filtered = {}

    for fragment in fragments:
        coverage = np.load(get_coverage_filename(data_folder, adaID, fragment))
        covs[fragment] = coverage

        counts = np.load(get_allele_counts_filename(data_folder, adaID, fragment))
        _, counts_minor, _ = get_minor_allele_counts(counts, n_minor=2)

        # Minor allele frequencies; the small offset avoids a division by
        # zero where the coverage is zero
        nus_minor[fragment] = 1.0 * counts_minor[:, :, 1] / (coverage + 1e-6)

        # Filter the minor frequencies by comparing the read types; use the
        # stored frequencies if present, otherwise recompute them
        try:
            nu_filtered = np.load(get_allele_frequencies_filename(data_folder, adaID, fragment))
        except IOError:
            nu_filtered = filter_nus(counts, coverage)

        # Second largest filtered frequency at each position
        nut = np.zeros(nu_filtered.shape[-1])
        for pos, nupos in enumerate(nu_filtered.T):
            nut[pos] = np.sort(nupos)[-2]
        nus_minor_filtered[fragment] = nut

    # Plot them
    (n_plots_y, n_plots_x) = plot_grid[len(fragments) - 1]
    fig, axs = plt.subplots(n_plots_y, n_plots_x, figsize=(13, 8))
    if len(fragments) > 1:
        axs = axs.ravel()
    else:
        axs = [axs]
    fig.suptitle('adapterID '+adaID, fontsize=20)
    labss = {'read1 f': 'read1 fwd', 'read1 r': 'read1 rev',
             'read2 f': 'read2 fwd', 'read2 r': 'read2 rev'}
    for i, fragment in enumerate(fragments):
        ax = axs[i]
        ax.set_yscale('log')
        ax.set_title(fragment)
        if i in [0, 3]:
            ax.set_ylabel(r'$\nu$')
        if i > 2:
            ax.set_xlabel('Position')

        # Plot divided by readtype
        nu_minor = nus_minor[fragment]
        for js, nu_minorjs in enumerate(nu_minor):
            color = cm.jet(int(255.0 * js / len(read_types)))
            ax.plot(nu_minorjs, lw=1.5, c=color, label=labss[read_types[js]])
            ax.scatter(np.arange(len(nu_minorjs)), nu_minorjs, lw=1.5,
                       color=color)

        # Plot filtered
        ax.plot(nus_minor_filtered[fragment], lw=1.5, c='k',
                alpha=0.5, label='Filtered')
        ax.scatter(np.arange(len(nus_minor_filtered[fragment])),
                   nus_minor_filtered[fragment], lw=1.5, c='k',
                   alpha=0.5)

        # Plot 1/max(coverage)
        coverage = covs[fragment]
        cov_tot = coverage.sum(axis=0)
        ax.plot(1.0 / cov_tot, lw=1.2, c='r', label='Detection limit')

        # Take the width from the array itself rather than from the variable
        # left over by the inner loop, which is undefined if that loop is empty
        ax.set_xlim(-100, nu_minor.shape[1] + 100)

    # These act on the current axes only
    plt.grid()
    plt.legend(loc='upper right')
    plt.tight_layout(rect=(0, 0, 1, 0.95))

    if savefig:
        # NOTE(review): `fragment` is the last fragment of the loop above;
        # confirm whether the filename helper expects the full `fragments`
        outputfile = gff(data_folder, adaID, fragment)
        fig.savefig(outputfile)
        plt.close(fig)
    else:
        plt.ion()
        plt.show()
Exemple #9
0
def plot_minor_allele_frequency_filtered(data_folder, adaID, fragments, VERBOSE=0,
                                         savefig=False):
    '''Plot the filtered minor allele frequency along the genome

    For every fragment one panel is drawn with the second largest filtered
    allele frequency at each position.

    Parameters:
       data_folder, adaID: forwarded to the filename helpers; adaID is also
                           concatenated into the figure title
       fragments: sequence of fragment names; the panel layout table covers
                  1 to 6 fragments
       VERBOSE: accepted but not used by this function
       savefig: if True the figure is saved and closed, otherwise it is shown
                in interactive mode

    Returns:
       None
    '''
    from hivwholeseq.sequencing.filenames import get_minor_allele_frequency_figure_filename as gff
    import matplotlib
    # 'font.size' is the current name of the former 'text.fontsize' setting,
    # which newer matplotlib versions no longer accept
    params = {'axes.labelsize': 20,
              'font.size': 20,
              'legend.fontsize': 8,
              'xtick.labelsize': 16,
              'ytick.labelsize': 16,
              'text.usetex': False}
    matplotlib.rcParams.update(params)
    from matplotlib import cm
    import matplotlib.pyplot as plt

    # Filtered minor allele frequency of every fragment
    nus_minor_filtered = {}
    for fragment in fragments:
        coverage = np.load(get_coverage_filename(data_folder, adaID, fragment))

        # Use the stored frequencies if present, otherwise recompute them
        # from the counts and the coverage loaded above, as the other callers
        # of filter_nus do
        try:
            nu_filtered = np.load(get_allele_frequencies_filename(data_folder,
                                                                  adaID, fragment))
        except IOError:
            counts = np.load(get_allele_counts_filename(data_folder, adaID, fragment))
            nu_filtered = filter_nus(counts, coverage)

        # Second largest filtered frequency at each position
        nut = np.zeros(nu_filtered.shape[-1])
        for pos, nupos in enumerate(nu_filtered.T):
            nut[pos] = np.sort(nupos)[-2]

        nus_minor_filtered[fragment] = nut

    # Plot them; the layout (rows, columns) is indexed by number of fragments - 1
    plot_grid = [(1, 1), (1, 2), (1, 3), (2, 2), (1, 5), (2, 3)]
    (n_plots_y, n_plots_x) = plot_grid[len(fragments) - 1]
    fig, axs = plt.subplots(n_plots_y, n_plots_x, figsize=(13, 8))
    if len(fragments) > 1:
        axs = axs.ravel()
    else:
        axs = [axs]
    fig.suptitle('adapterID '+adaID, fontsize=20)
    for i, fragment in enumerate(fragments):
        ax = axs[i]
        ax.set_yscale('log')
        ax.set_title(fragment)
        if i in [0, 3]:
            ax.set_ylabel(r'$\nu$')
        if i > 2:
            ax.set_xlabel('Position')

        # Plot filtered
        nut = nus_minor_filtered[fragment]
        ax.plot(nut, lw=1.5, c='k',
                alpha=0.5, label='Filtered')
        ax.scatter(np.arange(len(nut)),
                   nut, lw=1.5, c='k',
                   alpha=0.5)

        ax.set_xlim(-100, len(nut) + 100)

    plt.tight_layout(rect=(0, 0, 1, 0.95))

    if savefig:
        # NOTE(review): `fragment` is the last fragment of the loop above;
        # confirm whether the filename helper expects the full `fragments`
        outputfile = gff(data_folder, adaID, fragment, only_filtered=True)
        fig.savefig(outputfile)
        plt.close(fig)
    else:
        plt.ion()
        plt.show()