Python get_allele_counts_filenameの例、hivwholeseq.sequencing.filenames.get_allele_counts_filename Pythonの例

コード例 #1

0

ファイルを表示

ファイル: get_allele_counts.py プロジェクト: iosonofabio/hivwholeseq

def write_counts_files(data_folder, adaID, fragment, counts, inserts, coverage=None, VERBOSE=0):
    """Write allele counts, inserts, and coverage to file.

    Parameters:
        data_folder, adaID, fragment: forwarded unchanged to the
            ``get_*_filename`` helpers to build the three output paths.
            ``adaID`` and ``fragment`` are concatenated into the verbose
            message, so they must be strings.
        counts: object exposing ``.sum(axis=1)`` and ``.dump(path)``
            (presumably a NumPy array -- confirm against the caller).
        inserts: mapping whose values are themselves mappings; it is
            converted to plain nested dicts before pickling.
        coverage: object exposing ``.dump(path)``; when None it is computed
            as ``counts.sum(axis=1)``.
        VERBOSE: a progress line is printed when >= 1.
    """
    if VERBOSE >= 1:
        # Single pre-built string: prints identically under Python 2 and 3
        print("Write to file: " + adaID + " " + fragment)

    if coverage is None:
        coverage = counts.sum(axis=1)

    # Save counts and coverage
    counts.dump(get_allele_counts_filename(data_folder, adaID, fragment))
    coverage.dump(get_coverage_filename(data_folder, adaID, fragment))

    # Convert inserts to normal nested dictionary for pickle
    inserts_dic = {k: dict(v) for (k, v) in inserts.items()}

    # protocol=-1 selects the highest (binary) pickle protocol, so the file
    # must be opened in binary mode; text mode corrupts the stream on
    # platforms that translate newlines and is rejected by Python 3.
    with open(get_insert_counts_filename(data_folder, adaID, fragment), "wb") as f:
        pickle.dump(inserts_dic, f, protocol=-1)

コード例 #2

0

ファイルを表示

ファイル: get_allele_counts.py プロジェクト: 5l1v3r1/hivwholeseq

def write_counts_files(data_folder,
                       adaID,
                       fragment,
                       counts,
                       inserts,
                       coverage=None,
                       VERBOSE=0):
    '''Write allele counts, inserts, and coverage to file.

    Parameters:
        data_folder, adaID, fragment: forwarded unchanged to the
            ``get_*_filename`` helpers to build the three output paths.
            ``adaID`` and ``fragment`` are concatenated into the verbose
            message, so they must be strings.
        counts: object exposing ``.sum(axis=1)`` and ``.dump(path)``
            (presumably a NumPy array -- confirm against the caller).
        inserts: mapping whose values are themselves mappings; it is
            converted to plain nested dicts before pickling.
        coverage: object exposing ``.dump(path)``; when None it is computed
            as ``counts.sum(axis=1)``.
        VERBOSE: a progress line is printed when >= 1.
    '''
    if VERBOSE >= 1:
        # Single pre-built string: prints identically under Python 2 and 3
        print('Write to file: ' + adaID + ' ' + fragment)

    if coverage is None:
        coverage = counts.sum(axis=1)

    # Save counts and coverage
    counts.dump(get_allele_counts_filename(data_folder, adaID, fragment))
    coverage.dump(get_coverage_filename(data_folder, adaID, fragment))

    # Convert inserts to normal nested dictionary for pickle
    inserts_dic = {k: dict(v) for (k, v) in inserts.items()}

    # protocol=-1 selects the highest (binary) pickle protocol, so the file
    # must be opened in binary mode; text mode corrupts the stream on
    # platforms that translate newlines and is rejected by Python 3.
    with open(get_insert_counts_filename(data_folder, adaID, fragment),
              'wb') as f:
        pickle.dump(inserts_dic, f, protocol=-1)

コード例 #3

0

ファイルを表示

ファイル: build_consensus.py プロジェクト: 5l1v3r1/hivwholeseq

            # Diagnostics at high verbosity: the first and last 30 alignment
            # columns, both sequence lengths, and the number (and percentage)
            # of columns where row 0 and row 1 of the alignment differ.
            # NOTE(review): 'Lenghts' in the message is a typo for 'Lengths';
            # left untouched here because it is runtime output.
            if VERBOSE >= 2:
                print ali[:, :30]
                print ali[:, -30:]
                print 'Lenghts: ref', len(refseq), 'consensus', len(
                    consensusseq)
                len_ali = ali.get_alignment_length()
                n_diff = sum(ali[0, i] != ali[1, i] for i in xrange(len_ali))
                print 'Differences from ref:', n_diff, '(' + '{:3.1f}'.format(
                    100.0 * n_diff / len_ali) + '%)'

            # Rebuild the consensus record from alignment row 1 and strip any
            # gap characters the alignment introduced.
            consensusseq = SeqRecord(ali[1].seq, id=name, name=name)
            if '-' in consensusseq:
                consensusseq.seq = consensusseq.seq.ungap('-')

            # Write the ungapped consensus as FASTA
            outfile = get_consensus_filename(data_folder,
                                             adaID,
                                             frag_out,
                                             trim_primers=True)
            SeqIO.write(consensusseq, outfile, 'fasta')

            # Write the alignment as FASTA.
            # NOTE(review): this path is keyed on `fragment`, whereas the
            # consensus and allele counts are keyed on `frag_out` -- confirm
            # that the difference is intended.
            AlignIO.write(
                ali,
                get_reference_consensus_ali_filename(data_folder, adaID,
                                                     fragment), 'fasta')

            # Optionally dump the allele counts next to the consensus
            if store_allele_counts:
                allele_counts.dump(
                    get_allele_counts_filename(data_folder, adaID, frag_out))

コード例 #4

0

ファイルを表示

ファイル: build_consensus.py プロジェクト: iosonofabio/hivwholeseq

            # Align the reference against the consensus
            ali = align_muscle(refseq, consensusseq, sort=True)

            # If row 0 ends in a gap, cut the alignment down to the columns
            # between the first and last non-gap character of row 0.
            # NOTE(review): the trimming (including leading gaps) only runs
            # when row 0 has a *trailing* gap; a leading-only gap is left in
            # place -- confirm this is intended.
            if ali[0][-1] == '-':
                start_nongap = len(ali[0]) - len(ali[0].seq.lstrip('-'))
                end_nongap = len(ali[0].seq.rstrip('-'))
                ali = ali[:, start_nongap: end_nongap]

            # Diagnostics at high verbosity: the first and last 30 alignment
            # columns, both sequence lengths, and the number (and percentage)
            # of columns where row 0 and row 1 of the alignment differ.
            # NOTE(review): 'Lenghts' in the message is a typo for 'Lengths';
            # left untouched here because it is runtime output.
            if VERBOSE >= 2:
                print ali[:, :30]
                print ali[:, -30:]
                print 'Lenghts: ref', len(refseq), 'consensus', len(consensusseq)
                len_ali = ali.get_alignment_length()
                n_diff = sum(ali[0, i] != ali[1, i] for i in xrange(len_ali))
                print 'Differences from ref:', n_diff, '('+'{:3.1f}'.format(100.0 * n_diff / len_ali)+'%)'

            # Rebuild the consensus record from alignment row 1 and strip any
            # gap characters the alignment introduced.
            consensusseq = SeqRecord(ali[1].seq, id=name, name=name)
            if '-' in consensusseq:
                consensusseq.seq = consensusseq.seq.ungap('-')

            # Write the ungapped consensus as FASTA
            outfile = get_consensus_filename(data_folder, adaID, frag_out, trim_primers=True)
            SeqIO.write(consensusseq, outfile, 'fasta')

            # Write the alignment as FASTA.
            # NOTE(review): this path is keyed on `fragment`, whereas the
            # consensus and allele counts are keyed on `frag_out` -- confirm
            # that the difference is intended.
            AlignIO.write(ali, get_reference_consensus_ali_filename(data_folder, adaID, fragment), 'fasta')

            # Optionally dump the allele counts next to the consensus
            if store_allele_counts:
                allele_counts.dump(get_allele_counts_filename(data_folder, adaID, frag_out))

コード例 #5

0

ファイルを表示

def check_overlap_allele_frequencies(data_folder, adaID, frag1, frag2, overlap,
                                     VERBOSE=0, ax=None):
    '''Check biases in allele frequencies in the overlap of two fragments.

    Loads allele counts and coverage of both fragments, restricts them to the
    overlap, filters them with ``filter_nus``, prints a 2x2 table of how many
    entries are called polymorphic in neither / one / both fragments, and
    scatter-plots the two sets of frequencies against each other.

    Parameters:
        data_folder, adaID: forwarded to the filename helpers.
        frag1, frag2: names of the two fragments being compared.
        overlap: 3-tuple ``(start_s2, end_s1, ali)``. ``frag1`` data are kept
            from index ``start_s2`` onwards and ``frag2`` data up to index
            ``end_s1`` (last axis). ``ali`` is not used here.
        VERBOSE: accepted for interface compatibility; not used here.
        ax: matplotlib axes to draw on. When None, a new figure is created
            and shown interactively.

    Returns:
        None. Returns early, printing and plotting nothing, when the two
        filtered frequency arrays have different shapes.
    '''
    (start_s2, end_s1, ali) = overlap

    # Get allele counts and coverage
    cou1 = np.load(get_allele_counts_filename(data_folder, adaID, frag1))
    cov1 = np.load(get_coverage_filename(data_folder, adaID, frag1))
    cou2 = np.load(get_allele_counts_filename(data_folder, adaID, frag2))
    cov2 = np.load(get_coverage_filename(data_folder, adaID, frag2))

    # Cut the counts and coverage to the overlap region
    cou1 = cou1[:, :, start_s2:]
    cov1 = cov1[:, start_s2:]
    cou2 = cou2[:, :, :end_s1]
    cov2 = cov2[:, :end_s1]

    # Reduce the allele counts (fwd and rev have different status on most overlaps,
    # because of the uneven coverage)
    nu1 = filter_nus(cou1, cov1)
    nu2 = filter_nus(cou2, cov2)

    # FIXME: overlaps whose filtered arrays disagree in shape are skipped
    # silently instead of being reconciled.
    if nu1.shape != nu2.shape:
        return

    # Thresholds for calling an entry absent ("NO") or polymorphic ("YES").
    # The YES/YES cell used to test nu1 against 3e-6 instead of 3e-3, which
    # disagreed with every other cell of the table.
    nu_absent = 1e-6
    nu_present = 3e-3
    no1, yes1 = (nu1 < nu_absent), (nu1 > nu_present)
    no2, yes2 = (nu2 < nu_absent), (nu2 > nu_present)

    # Print table of called polymorphisms (columns: frag1 NO | YES,
    # rows: frag2 NO / YES). Each line is built as one string so that the
    # output is the same under Python 2 and Python 3.
    print(' '.join(['adaID', str(adaID), str(frag1), str(frag2),
                    'polymorphism matrix (NO | YES)']))
    print(' '.join([3 * ' ', '|', '{:^10s}'.format(frag1)]))
    print(15 * '-')
    print(' '.join([3 * ' ', '|',
                    '{:3d}'.format((no1 & no2).sum()), '|',
                    '{:3d}'.format((yes1 & no2).sum())]))
    print(' '.join(['{:3s}'.format(frag2), '+' + (5 * '-') + '+' + (4 * '-')]))
    print(' '.join([3 * ' ', '|',
                    '{:3d}'.format((no1 & yes2).sum()), '|',
                    '{:3d}'.format((yes1 & yes2).sum())]))

    # Plot scatter
    import matplotlib.pyplot as plt
    from matplotlib import cm
    if ax is None:
        show_plot = True
        fig, ax = plt.subplots(1, 1, figsize=(6, 6))
        ax.set_title('allele frequencies, adaID ' + str(adaID) + ', ' +
                     str(frag1) + ' - ' + str(frag2),
                     fontsize=18)
    else:
        show_plot = False
        ax.set_title(str(frag1) + ' - ' + str(frag2), fontsize=18)

    # |nu - 1e-5| maps a frequency of exactly zero to 1e-5, so that such
    # entries remain visible on the logarithmic axes.
    n_colors = len(nu1)
    ax.scatter(np.abs(nu1 - 1e-5), np.abs(nu2 - 1e-5), s=30,
               c=cm.jet([int(255.0 * i / n_colors) for i in range(n_colors)]))
    # Plot diagonal
    ax.plot([1e-7, 2], [1e-7, 2], lw=1, c='k', ls='--')
    ax.set_xlabel(r'$\nu_1$', fontsize=20)
    ax.set_ylabel(r'$\nu_2$', fontsize=20)
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.set_xlim(0.7e-5, 1.2)
    ax.set_ylim(0.7e-5, 1.2)

    if show_plot:
        plt.tight_layout(w_pad=0.05)
        plt.ion()
        plt.show()

コード例 #6

0

ファイルを表示

            # Determine the overlap
            overlap = get_overlap(data_folder, adaID,
                                  frag1, frag2, VERBOSE=VERBOSE)

            # Check consensus
            # NOTE(review): this runs before the `overlap is None` test below,
            # so check_overlap_consensus may receive None; `is_diff` is not
            # used in the visible code.
            is_diff = check_overlap_consensus(data_folder, adaID, frag1, frag2, overlap,
                                              VERBOSE=VERBOSE)

            # Pairs without an overlap are not plotted
            if overlap is None:
                continue

            overlaps.append(((frag1, frag2), overlap))

        # Check allele frequencies if present
        # NOTE(review): only the 'F1' counts file is probed, whichever
        # fragments are actually being processed.
        if not os.path.isfile(get_allele_counts_filename(data_folder, adaID, 'F1')):
            continue

        # Make unified figures: one subplot per overlap found above.
        # NOTE(review): the figure width scales with len(fragments) while the
        # number of subplots is len(overlaps); an empty `overlaps` list is
        # not guarded against before plt.subplots -- confirm.
        import matplotlib.pyplot as plt
        fig, axs = plt.subplots(1, len(overlaps), figsize=(2+3.6*len(fragments), 6))
        fig.suptitle('allele frequencies, adaID '+str(adaID), fontsize=18)
        # With a single subplot plt.subplots returns a bare Axes; wrap it so
        # that it can be zipped with `overlaps` below.
        if len(overlaps) == 1:
            axs = [axs]
        for (ax, ((frag1, frag2), overlap)) in izip(axs, overlaps):

            # Check allele frequencies
            check_overlap_allele_frequencies(data_folder, adaID, frag1, frag2,
                                             overlap, VERBOSE=VERBOSE, ax=ax)

        # Leave the top 5% of the figure free for the suptitle
        plt.tight_layout(rect=(0, 0, 1, 0.95))

コード例 #7

0

ファイルを表示

ファイル: check_consensus.py プロジェクト: iosonofabio/hivwholeseq

        print 'adaIDs', adaIDs

    # If the script is called with no fragment, iterate over all
    # (the default list is 'F1' through 'F6')
    if not fragments:
        fragments = ['F'+str(i) for i in xrange(1, 7)]
    if VERBOSE >= 3:
        print 'fragments', fragments

    # Iterate over samples and fragments
    for adaID in adaIDs:
        for fragment in fragments:
            # Consensus sequence as stored on disk, as an array of characters
            consensus = SeqIO.read(get_consensus_filename(data_folder, adaID, fragment),
                                   'fasta')
            cmat = np.array(consensus)

            # Filtered allele frequencies computed from the stored counts
            counts = np.load(get_allele_counts_filename(data_folder, adaID, fragment))
            coverage = np.load(get_coverage_filename(data_folder, adaID, fragment))
            nu = filter_nus(counts, coverage, VERBOSE=VERBOSE)

            # Note: not-covered positions are filtered, but argmax cannot work
            # with masked arrays
            # The entry of `alpha` with the highest frequency is taken at each
            # position; positions masked along the whole first axis become 'N'.
            # `alpha` is defined outside this excerpt (presumably the allele
            # alphabet array -- confirm).
            cmat_af = alpha[nu.argmax(axis=0)]
            if hasattr(nu, 'mask'):
                cmat_af[nu.mask.all(axis=0)] = 'N'

            # Check for consistency first
            # NOTE(review): the backslash continuation sits inside the string
            # literal, so the leading spaces of the next line end up in the
            # printed message.
            if len(cmat) != len(cmat_af):
                print 'Consensus has a different length from allele frequency \
                        matrix... WTF?'

            # Do not actually align, it makes a huge mess (we miss mistakes)

コード例 #8

0

ファイルを表示

ファイル: check_overlaps.py プロジェクト: iosonofabio/hivwholeseq

def check_overlap_allele_frequencies(data_folder, adaID, frag1, frag2, overlap, VERBOSE=0, ax=None):
    """Check biases in allele frequencies in the overlap of two fragments.

    Loads allele counts and coverage of both fragments, restricts them to the
    overlap, filters them with ``filter_nus``, prints a 2x2 table of how many
    entries are called polymorphic in neither / one / both fragments, and
    scatter-plots the two sets of frequencies against each other.

    Parameters:
        data_folder, adaID: forwarded to the filename helpers.
        frag1, frag2: names of the two fragments being compared.
        overlap: 3-tuple ``(start_s2, end_s1, ali)``. ``frag1`` data are kept
            from index ``start_s2`` onwards and ``frag2`` data up to index
            ``end_s1`` (last axis). ``ali`` is not used here.
        VERBOSE: accepted for interface compatibility; not used here.
        ax: matplotlib axes to draw on. When None, a new figure is created
            and shown interactively.

    Returns:
        None. Returns early, printing and plotting nothing, when the two
        filtered frequency arrays have different shapes.
    """
    (start_s2, end_s1, ali) = overlap

    # Get allele counts and coverage
    cou1 = np.load(get_allele_counts_filename(data_folder, adaID, frag1))
    cov1 = np.load(get_coverage_filename(data_folder, adaID, frag1))
    cou2 = np.load(get_allele_counts_filename(data_folder, adaID, frag2))
    cov2 = np.load(get_coverage_filename(data_folder, adaID, frag2))

    # Cut the counts and coverage to the overlap region
    cou1 = cou1[:, :, start_s2:]
    cov1 = cov1[:, start_s2:]
    cou2 = cou2[:, :, :end_s1]
    cov2 = cov2[:, :end_s1]

    # Reduce the allele counts (fwd and rev have different status on most overlaps,
    # because of the uneven coverage)
    nu1 = filter_nus(cou1, cov1)
    nu2 = filter_nus(cou2, cov2)

    # FIXME: overlaps whose filtered arrays disagree in shape are skipped
    # silently instead of being reconciled.
    if nu1.shape != nu2.shape:
        return

    # Thresholds for calling an entry absent ("NO") or polymorphic ("YES").
    # The YES/YES cell used to test nu1 against 3e-6 instead of 3e-3, which
    # disagreed with every other cell of the table.
    nu_absent = 1e-6
    nu_present = 3e-3
    no1, yes1 = (nu1 < nu_absent), (nu1 > nu_present)
    no2, yes2 = (nu2 < nu_absent), (nu2 > nu_present)

    # Print table of called polymorphisms (columns: frag1 NO | YES,
    # rows: frag2 NO / YES). Each line is built as one string so that the
    # output is the same under Python 2 and Python 3.
    print(" ".join(["adaID", str(adaID), str(frag1), str(frag2), "polymorphism matrix (NO | YES)"]))
    print(" ".join([3 * " ", "|", "{:^10s}".format(frag1)]))
    print(15 * "-")
    print(" ".join([3 * " ", "|", "{:3d}".format((no1 & no2).sum()), "|", "{:3d}".format((yes1 & no2).sum())]))
    print(" ".join(["{:3s}".format(frag2), "+" + (5 * "-") + "+" + (4 * "-")]))
    print(" ".join([3 * " ", "|", "{:3d}".format((no1 & yes2).sum()), "|", "{:3d}".format((yes1 & yes2).sum())]))

    # Plot scatter
    import matplotlib.pyplot as plt
    from matplotlib import cm

    if ax is None:
        show_plot = True
        fig, ax = plt.subplots(1, 1, figsize=(6, 6))
        ax.set_title("allele frequencies, adaID " + str(adaID) + ", " + str(frag1) + " - " + str(frag2), fontsize=18)
    else:
        show_plot = False
        ax.set_title(str(frag1) + " - " + str(frag2), fontsize=18)

    # |nu - 1e-5| maps a frequency of exactly zero to 1e-5, so that such
    # entries remain visible on the logarithmic axes.
    n_colors = len(nu1)
    ax.scatter(
        np.abs(nu1 - 1e-5), np.abs(nu2 - 1e-5), s=30, c=cm.jet([int(255.0 * i / n_colors) for i in range(n_colors)])
    )
    # Plot diagonal
    ax.plot([1e-7, 2], [1e-7, 2], lw=1, c="k", ls="--")
    ax.set_xlabel(r"$\nu_1$", fontsize=20)
    ax.set_ylabel(r"$\nu_2$", fontsize=20)
    ax.set_xscale("log")
    ax.set_yscale("log")
    ax.set_xlim(0.7e-5, 1.2)
    ax.set_ylim(0.7e-5, 1.2)

    if show_plot:
        plt.tight_layout(w_pad=0.05)
        plt.ion()
        plt.show()

コード例 #9

0

ファイルを表示

ファイル: check_overlaps.py プロジェクト: iosonofabio/hivwholeseq

        # Collect every fragment pair for which an overlap was found
        overlaps = []
        for (frag1, frag2) in pairs:

            # Determine the overlap
            overlap = get_overlap(data_folder, adaID, frag1, frag2, VERBOSE=VERBOSE)

            # Check consensus
            # NOTE(review): this runs before the `overlap is None` test below,
            # so check_overlap_consensus may receive None; `is_diff` is not
            # used in the visible code.
            is_diff = check_overlap_consensus(data_folder, adaID, frag1, frag2, overlap, VERBOSE=VERBOSE)

            # Pairs without an overlap are not plotted
            if overlap is None:
                continue

            overlaps.append(((frag1, frag2), overlap))

        # Check allele frequencies if present
        # NOTE(review): only the "F1" counts file is probed, whichever
        # fragments are actually being processed.
        if not os.path.isfile(get_allele_counts_filename(data_folder, adaID, "F1")):
            continue

        # Make unified figures: one subplot per overlap found above.
        # NOTE(review): the figure width scales with len(fragments) while the
        # number of subplots is len(overlaps); an empty `overlaps` list is
        # not guarded against before plt.subplots -- confirm.
        import matplotlib.pyplot as plt

        fig, axs = plt.subplots(1, len(overlaps), figsize=(2 + 3.6 * len(fragments), 6))
        fig.suptitle("allele frequencies, adaID " + str(adaID), fontsize=18)
        # With a single subplot plt.subplots returns a bare Axes; wrap it so
        # that it can be zipped with `overlaps` below.
        if len(overlaps) == 1:
            axs = [axs]
        for (ax, ((frag1, frag2), overlap)) in izip(axs, overlaps):

            # Check allele frequencies
            check_overlap_allele_frequencies(data_folder, adaID, frag1, frag2, overlap, VERBOSE=VERBOSE, ax=ax)

        # Leave the top 5% of the figure free for the suptitle
        plt.tight_layout(rect=(0, 0, 1, 0.95))

コード例 #10

0

ファイルを表示

def plot_minor_allele_frequency(data_folder, adaID, fragments, VERBOSE=0,
                                savefig=False):
    '''Plot minor allele frequency along the genome.

    One subplot is drawn per fragment, showing the minor allele frequency of
    each read type, the filtered minor allele frequency, and the detection
    limit 1 / (total coverage).

    Parameters:
        data_folder, adaID: forwarded to the filename helpers; ``adaID`` is
            concatenated into the figure title, so it must be a string.
        fragments: sequence of one to six fragment names (the subplot grid
            is looked up by ``len(fragments)``).
        VERBOSE: accepted for interface compatibility; not used here.
        savefig: when true, save the figure to the path returned by
            ``get_minor_allele_frequency_figure_filename`` and close it;
            otherwise show it interactively.
    '''
    from hivwholeseq.sequencing.filenames import get_minor_allele_frequency_figure_filename as gff
    import matplotlib
    # 'font.size' is used instead of the former 'text.fontsize' key, which is
    # a deprecated alias of it and is rejected by newer matplotlib releases.
    params = {'axes.labelsize': 20,
              'font.size': 20,
              'legend.fontsize': 8,
              'xtick.labelsize': 16,
              'ytick.labelsize': 16,
              'text.usetex': False}
    matplotlib.rcParams.update(params)
    from matplotlib import cm
    import matplotlib.pyplot as plt

    # Subplot grid (rows, columns), indexed by number of fragments - 1
    plot_grid = [(1, 1), (1, 2), (1, 3), (2, 2), (1, 5), (2, 3)]

    # Per-fragment data, keyed by fragment name
    covs = {}
    nus_minor = {}
    nus_minor_filtered = {}

    for fragment in fragments:
        coverage = np.load(get_coverage_filename(data_folder, adaID, fragment))
        covs[fragment] = coverage

        counts = np.load(get_allele_counts_filename(data_folder, adaID, fragment))
        (counts_major,
         counts_minor,
         counts_minor2) = get_minor_allele_counts(counts, n_minor=2)

        # Minor allele frequency per read type; the small offset in the
        # denominator avoids dividing by zero at uncovered positions.
        nus_minor[fragment] = 1.0 * counts_minor[:, :, 1] / (coverage + 1e-6)

        # Filter the minor frequencies by comparing the read types; use the
        # stored frequencies when available, else compute them now.
        try:
            nu_filtered = np.load(get_allele_frequencies_filename(data_folder, adaID, fragment))
        except IOError:
            nu_filtered = filter_nus(counts, coverage)

        # The second-largest frequency at each position is the minor allele
        nut = np.zeros(nu_filtered.shape[-1])
        for pos, nupos in enumerate(nu_filtered.T):
            nut[pos] = np.sort(nupos)[-2]
        nus_minor_filtered[fragment] = nut

    # Plot them
    (n_plots_y, n_plots_x) = plot_grid[len(fragments) - 1]
    fig, axs = plt.subplots(n_plots_y, n_plots_x, figsize=(13, 8))
    # plt.subplots returns a bare Axes for a 1x1 grid, an array otherwise
    if len(fragments) > 1:
        axs = axs.ravel()
    else:
        axs = [axs]
    fig.suptitle('adapterID '+adaID, fontsize=20)
    labss = {'read1 f': 'read1 fwd', 'read1 r': 'read1 rev',
             'read2 f': 'read2 fwd', 'read2 r': 'read2 rev'}
    for i, fragment in enumerate(fragments):
        ax = axs[i]
        ax.set_yscale('log')
        ax.set_title(fragment)
        if i in [0, 3]:
            ax.set_ylabel(r'$\nu$')
        if i > 2:
            ax.set_xlabel('Position')

        # Plot divided by readtype (`read_types` is defined at module level)
        for js, nu_minorjs in enumerate(nus_minor[fragment]):
            color = cm.jet(int(255.0 * js / len(read_types)))
            ax.plot(nu_minorjs, lw=1.5, c=color, label=labss[read_types[js]])
            ax.scatter(np.arange(len(nu_minorjs)), nu_minorjs, lw=1.5,
                       color=color)

        # Plot filtered
        ax.plot(nus_minor_filtered[fragment], lw=1.5, c='k',
                alpha=0.5, label='Filtered')
        ax.scatter(np.arange(len(nus_minor_filtered[fragment])),
                   nus_minor_filtered[fragment], lw=1.5, c='k',
                   alpha=0.5)

        # Plot 1/max(coverage)
        coverage = covs[fragment]
        cov_tot = coverage.sum(axis=0)
        ax.plot(1.0 / cov_tot, lw=1.2, c='r', label='Detection limit')

        # The last read-type row of the loop above gives the x extent
        ax.set_xlim(-100, len(nu_minorjs) + 100)

    # NOTE(review): pyplot-level calls act on the current axes only, i.e. the
    # last subplot -- confirm whether a grid/legend per subplot was intended.
    plt.grid()
    plt.legend(loc='upper right')
    plt.tight_layout(rect=(0, 0, 1, 0.95))

    if savefig:
        # NOTE(review): `fragment` is the last fragment of the loop above, so
        # the figure for all fragments is saved under that name -- confirm.
        outputfile = gff(data_folder, adaID, fragment)
        fig.savefig(outputfile)
        plt.close(fig)
    else:
        plt.ion()
        plt.show()

コード例 #11

0

ファイルを表示

def plot_minor_allele_frequency_filtered(data_folder, adaID, fragments, VERBOSE=0,
                                         savefig=False):
    '''Plot the filtered minor allele frequency along the genome.

    One subplot is drawn per fragment, showing only the filtered minor allele
    frequency.

    Parameters:
        data_folder, adaID: forwarded to the filename helpers; ``adaID`` is
            concatenated into the figure title, so it must be a string.
        fragments: sequence of one to six fragment names (the subplot grid
            is looked up by ``len(fragments)``).
        VERBOSE: accepted for interface compatibility; not used here.
        savefig: when true, save the figure to the path returned by
            ``get_minor_allele_frequency_figure_filename`` (with
            ``only_filtered=True``) and close it; otherwise show it
            interactively.
    '''
    from hivwholeseq.sequencing.filenames import get_minor_allele_frequency_figure_filename as gff
    import matplotlib
    # 'font.size' is used instead of the former 'text.fontsize' key, which is
    # a deprecated alias of it and is rejected by newer matplotlib releases.
    params = {'axes.labelsize': 20,
              'font.size': 20,
              'legend.fontsize': 8,
              'xtick.labelsize': 16,
              'ytick.labelsize': 16,
              'text.usetex': False}
    matplotlib.rcParams.update(params)
    from matplotlib import cm
    import matplotlib.pyplot as plt

    # Filtered minor allele frequency, keyed by fragment name
    nus_minor_filtered = {}
    for fragment in fragments:
        coverage = np.load(get_coverage_filename(data_folder, adaID, fragment))

        # Use the stored frequencies when available, else compute them now
        try:
            nu_filtered = np.load(get_allele_frequencies_filename(data_folder,
                                                                  adaID, fragment))
        except IOError:
            counts = np.load(get_allele_counts_filename(data_folder, adaID, fragment))
            # Pass the coverage loaded above, as every other filter_nus call
            # in this code base does; it was previously loaded but not used.
            nu_filtered = filter_nus(counts, coverage)

        # The second-largest frequency at each position is the minor allele
        nut = np.zeros(nu_filtered.shape[-1])
        for pos, nupos in enumerate(nu_filtered.T):
            nut[pos] = np.sort(nupos)[-2]

        nus_minor_filtered[fragment] = nut

    # Plot them; the grid (rows, columns) is indexed by number of fragments - 1
    plot_grid = [(1, 1), (1, 2), (1, 3), (2, 2), (1, 5), (2, 3)]
    (n_plots_y, n_plots_x) = plot_grid[len(fragments) - 1]
    fig, axs = plt.subplots(n_plots_y, n_plots_x, figsize=(13, 8))
    # plt.subplots returns a bare Axes for a 1x1 grid, an array otherwise
    if len(fragments) > 1:
        axs = axs.ravel()
    else:
        axs = [axs]
    fig.suptitle('adapterID '+adaID, fontsize=20)
    for i, fragment in enumerate(fragments):
        ax = axs[i]
        ax.set_yscale('log')
        ax.set_title(fragment)
        if i in [0, 3]:
            ax.set_ylabel(r'$\nu$')
        if i > 2:
            ax.set_xlabel('Position')

        # Plot filtered
        ax.plot(nus_minor_filtered[fragment], lw=1.5, c='k',
                alpha=0.5, label='Filtered')
        ax.scatter(np.arange(len(nus_minor_filtered[fragment])),
                   nus_minor_filtered[fragment], lw=1.5, c='k',
                   alpha=0.5)

        ax.set_xlim(-100, len(nus_minor_filtered[fragment]) + 100)

    # Leave the top 5% of the figure free for the suptitle
    plt.tight_layout(rect=(0, 0, 1, 0.95))

    if savefig:
        # NOTE(review): `fragment` is the last fragment of the loop above, so
        # the figure for all fragments is saved under that name -- confirm.
        outputfile = gff(data_folder, adaID, fragment, only_filtered=True)
        fig.savefig(outputfile)
        plt.close(fig)
    else:
        plt.ion()
        plt.show()