Beispiel #1
0
def get_allele_frequency_trajectories(pname,
                                      samples,
                                      fragment,
                                      qual_min=30,
                                      VERBOSE=0):
    '''Scan the reads of all samples and write to a single file'''
    if VERBOSE >= 1:
        print 'Getting allele frequency trajectories:', pname, fragment

    from hivwholeseq.patients.filenames import get_initial_reference_filename, \
            get_mapped_to_initial_filename, get_allele_frequency_trajectories_filename, \
            get_allele_count_trajectories_filename
    from hivwholeseq.utils.one_site_statistics import get_allele_counts_insertions_from_file, \
            get_allele_counts_insertions_from_file_unfiltered, \
            filter_nus

    refseq = SeqIO.read(get_initial_reference_filename(pname, fragment),
                        'fasta')

    # Prepare output data structures
    cos_traj = np.zeros((len(samples), len(alpha), len(refseq)), int)
    nus_traj = np.zeros((len(samples), len(alpha), len(refseq)))

    for it, sample in enumerate(samples):
        if VERBOSE >= 2:
            print pname, it, sample

        input_filename = get_mapped_to_initial_filename(pname,
                                                        sample,
                                                        fragment,
                                                        type='bam')
        (counts, inserts) = get_allele_counts_insertions_from_file_unfiltered(
            input_filename, len(refseq), qual_min=qual_min, VERBOSE=VERBOSE)
        # Take the total counts, blending in the read types
        cou = counts.sum(axis=0)
        cos_traj[it] = cou

        # Take the filtered frequencies, blending in the read types
        nu = filter_nus(counts)
        nus_traj[it] = nu

    #FIXME: test, etc.

    return (cos_traj, nus_traj)
Beispiel #2
0
                          summary=summary)
                continue

            # Get coverage and counts
            counts = np.load(
                get_allele_counts_filename(data_folder, adaID, fragment))
            if len(counts.shape) == 2:
                import warnings
                warnings.warn(
                    'Counts not divided by read type: will normalize instead of filter!'
                )
                nu_filtered = 1.0 * counts / counts.sum(axis=0)

            else:
                # Filter the minor frequencies by comparing the read types
                nu_filtered = filter_nus(counts)

            # Write output
            write_frequency_files(data_folder,
                                  adaID,
                                  fragment,
                                  nu_filtered,
                                  VERBOSE=VERBOSE)

            if summary:
                import matplotlib.pyplot as plt
                was_interactive = plt.isinteractive()
                plt.ioff()
                plot_SFS_folded(data_folder,
                                adaID,
                                fragment,
    # Iterate over all requested samples
    for samplename, sample in samples.iterrows():
        sample = SampleSeq(sample)
        adaID = sample.adapter

        if not fragments:
            fragments_sample = sample.regions_generic
        else:
            fragments_sample = sorted(set(fragments) & set(sample.regions_generic))

        for fragment in fragments_sample:

            # Submit to the cluster self if requested
            if submit:
                fork_self(seq_run, adaID, fragment, VERBOSE=VERBOSE)
                continue

            counts, inserts = get_allele_counts(data_folder, adaID, fragment, VERBOSE=VERBOSE)
            write_counts_files(data_folder, adaID, fragment, counts, inserts, VERBOSE=VERBOSE)

            if summary:
                plot_coverage(data_folder, adaID, fragment, counts, VERBOSE=VERBOSE, savefig=True)

            if write_frequencies:
                nu_filtered = filter_nus(counts)
                write_frequency_files(data_folder, adaID, fragment, nu_filtered, VERBOSE=VERBOSE)

                if summary:
                    plot_SFS_folded(data_folder, adaID, fragment, nu_filtered, VERBOSE=VERBOSE, savefig=True)
Beispiel #4
0
def plot_minor_allele_frequency(data_folder, adaID, fragments, VERBOSE=0,
                                savefig=False):
    '''Plot minor allele frequency along the genome'''
    from hivwholeseq.sequencing.filenames import get_minor_allele_frequency_figure_filename as gff
    import matplotlib
    params = {'axes.labelsize': 20, 
              'text.fontsize': 20,
              'legend.fontsize': 8,
              'xtick.labelsize': 16,
              'ytick.labelsize': 16,
              'text.usetex': False}
    matplotlib.rcParams.update(params)
    from matplotlib import cm
    import matplotlib.pyplot as plt

    plot_grid = [(1, 1), (1, 2), (1, 3), (2, 2), (1, 5), (2, 3)]

    # Store in globals structures
    covs = {}
    nus_minor = {}
    alls_minor = {}
    nus_filtered = {}
    nus_minor_filtered = {}

    for fragment in fragments:
        coverage = np.load(get_coverage_filename(data_folder, adaID, fragment))
        covs[fragment] = coverage
    
        counts = np.load(get_allele_counts_filename(data_folder, adaID, fragment))
        (counts_major,
         counts_minor,
         counts_minor2) = get_minor_allele_counts(counts, n_minor=2)
    
        # Get minor allele frequencies and identities
        nu_minor = 1.0 * counts_minor[:, :, 1] / (coverage + 1e-6)
        nus_minor[fragment] = nu_minor
        all_minor = counts_minor[:, :, 0]
        alls_minor[fragment] = all_minor
    
        # Filter the minor frequencies by comparing the read types
        try:
            nu_filtered = np.load(get_allele_frequencies_filename(data_folder, adaID, fragment))
        except IOError:
            nu_filtered = filter_nus(counts, coverage)
        nut = np.zeros(nu_filtered.shape[-1])
        for pos, nupos in enumerate(nu_filtered.T):
            nut[pos] = np.sort(nupos)[-2]
        
        nus_filtered[fragment] = nu_filtered
        nus_minor_filtered[fragment] = nut

    # Plot them
    (n_plots_y, n_plots_x) = plot_grid[len(fragments) - 1]
    fig, axs = plt.subplots(n_plots_y, n_plots_x, figsize=(13, 8))
    if len(fragments) > 1:
        axs = axs.ravel()
    else:
        axs = [axs]
    fig.suptitle('adapterID '+adaID, fontsize=20)
    labss = {'read1 f': 'read1 fwd', 'read1 r': 'read1 rev',
             'read2 f': 'read2 fwd', 'read2 r': 'read2 rev'}
    for i, fragment in enumerate(fragments):
        ax = axs[i]
        ax.set_yscale('log')
        ax.set_title(fragment)
        if i in [0, 3]:
            ax.set_ylabel(r'$\nu$')
        if i > 2:
            ax.set_xlabel('Position')
    
        # Plot divided by readtype
        for js, nu_minorjs in enumerate(nus_minor[fragment]):
            color = cm.jet(int(255.0 * js / len(read_types)))
            ax.plot(nu_minorjs, lw=1.5, c=color, label=labss[read_types[js]])
            ax.scatter(np.arange(len(nu_minorjs)), nu_minorjs, lw=1.5,
                       color=color)
        
        # Plot filtered
        ax.plot(nus_minor_filtered[fragment], lw=1.5, c='k',
                alpha=0.5, label='Filtered')
        ax.scatter(np.arange(len(nus_minor_filtered[fragment])),
                   nus_minor_filtered[fragment], lw=1.5, c='k',
                   alpha=0.5)

        # Plot 1/max(coverage)
        coverage = covs[fragment]
        cov_tot = coverage.sum(axis=0)
        ax.plot(1.0 / cov_tot, lw=1.2, c='r', label='Detection limit')

        ax.set_xlim(-100, len(nu_minorjs) + 100)
    
    plt.grid()
    plt.legend(loc='upper right')
    plt.tight_layout(rect=(0, 0, 1, 0.95))

    if savefig:
        outputfile = gff(data_folder, adaID, fragment)
        fig.savefig(outputfile)
        plt.close(fig)
    else:
        plt.ion()
        plt.show()
Beispiel #5
0
def plot_minor_allele_frequency_filtered(data_folder, adaID, fragments, VERBOSE=0,
                                         savefig=False):
    '''Plot minor allele frequency along the genome'''
    from hivwholeseq.sequencing.filenames import get_minor_allele_frequency_figure_filename as gff
    import matplotlib
    params = {'axes.labelsize': 20, 
              'text.fontsize': 20,
              'legend.fontsize': 8,
              'xtick.labelsize': 16,
              'ytick.labelsize': 16,
              'text.usetex': False}
    matplotlib.rcParams.update(params)
    from matplotlib import cm
    import matplotlib.pyplot as plt

    # Store in globals structures
    covs = {}
    nus_minor_filtered = {}
    for fragment in fragments:
        coverage = np.load(get_coverage_filename(data_folder, adaID, fragment))
        covs[fragment] = coverage

        try:
            nu_filtered = np.load(get_allele_frequencies_filename(data_folder,
                                                                  adaID, fragment))
        except IOError:
            counts = np.load(get_allele_counts_filename(data_folder, adaID, fragment))
            nu_filtered = filter_nus(counts)

        nut = np.zeros(nu_filtered.shape[-1])
        for pos, nupos in enumerate(nu_filtered.T):
            nut[pos] = np.sort(nupos)[-2]
        
        nus_minor_filtered[fragment] = nut
 
    # Plot them
    plot_grid = [(1, 1), (1, 2), (1, 3), (2, 2), (1, 5), (2, 3)]
    (n_plots_y, n_plots_x) = plot_grid[len(fragments) - 1]
    fig, axs = plt.subplots(n_plots_y, n_plots_x, figsize=(13, 8))
    if len(fragments) > 1:
        axs = axs.ravel()
    else:
        axs = [axs]
    fig.suptitle('adapterID '+adaID, fontsize=20)
    for i, fragment in enumerate(fragments):
        ax = axs[i]
        ax.set_yscale('log')
        ax.set_title(fragment)
        if i in [0, 3]:
            ax.set_ylabel(r'$\nu$')
        if i > 2:
            ax.set_xlabel('Position')
        
        # Plot filtered
        ax.plot(nus_minor_filtered[fragment], lw=1.5, c='k',
                alpha=0.5, label='Filtered')
        ax.scatter(np.arange(len(nus_minor_filtered[fragment])),
                   nus_minor_filtered[fragment], lw=1.5, c='k',
                   alpha=0.5)

        ax.set_xlim(-100, len(nus_minor_filtered[fragment]) + 100)
    
    #plt.legend(loc='upper right')
    plt.tight_layout(rect=(0, 0, 1, 0.95))

    if savefig:
        outputfile = gff(data_folder, adaID, fragment, only_filtered=True)
        fig.savefig(outputfile)
        plt.close(fig)
    else:
        plt.ion()
        plt.show()