def check_coverage(data_folder,
                   adaID,
                   fragment,
                   seq_run,
                   qual_min=35,
                   reference='HXB2',
                   maxreads=-1,
                   VERBOSE=0,
                   rescue=False,
                   minor_allele=False):
    '''Plot the coverage of the reads mapped onto a single fragment.

    The fragment consensus is read only to obtain its length; the allele
    counts are collected from the mapped BAM file and passed to plot_coverage.

    Parameters:
       data_folder, adaID, fragment: passed to the filename helpers to locate
          the consensus and the mapped reads
       seq_run: only used as a label in the figure title
       qual_min, reference: accepted but not used in the body of this function
       maxreads: forwarded to the allele counter and shown in the title
       VERBOSE: forwarded to the allele counter
       rescue: forwarded to get_mapped_filename
       minor_allele: forwarded to plot_coverage

    Returns:
       None
    '''
    consensus = SeqIO.read(
        get_consensus_filename(data_folder, adaID, fragment), 'fasta')

    bam_fn = get_mapped_filename(data_folder, adaID, fragment,
                                 type='bam', rescue=rescue)

    # Only the counts are plotted, the insertions are not used further
    counts, _inserts = get_allele_counts_insertions_from_file_unfiltered(
        bam_fn,
        len(consensus),
        maxreads=maxreads,
        VERBOSE=VERBOSE)

    # Figure title: "label value" pairs separated by commas
    labelled_values = (
        ('run', seq_run),
        ('adaID', adaID),
        ('fragment', fragment),
        ('maxreads', maxreads),
    )
    title = ', '.join([label + ' ' + str(value)
                       for (label, value) in labelled_values])

    plot_coverage(counts, suptitle=title, minor_allele=minor_allele)
def make_consensus(data_folder, adaID, fragment, n_iter, qual_min=20, VERBOSE=0,
                   coverage_min=10, summary=True):
    '''Make consensus sequence from the mapped reads'''
    if VERBOSE:
        print 'Build consensus: '+adaID+' '+fragment+' iteration '+str(n_iter)
    
    # Read reference
    reffilename = get_reference_filename(data_folder, adaID, fragment, n_iter)
    refseq = SeqIO.read(reffilename, 'fasta')
    ref = np.array(refseq)

    # Open BAM file
    bamfilename = get_mapped_filename(data_folder, adaID, fragment, n_iter)
    if not os.path.isfile(bamfilename):
        convert_sam_to_bam(bamfilename)

    (counts, inserts) = get_allele_counts_insertions_from_file_unfiltered(bamfilename,\
                                len(refseq), qual_min=qual_min,
                                match_len_min=match_len_min)

    consensus_final = build_consensus(counts, inserts,
                                      coverage_min=coverage_min,
                                      VERBOSE=VERBOSE)

    if summary:
        with open(get_summary_fn(data_folder, adaID, fragment), 'a') as f:
            f.write('Consensus built for iteration '+str(n_iter))
            f.write('\n')

    return refseq, consensus_final
# Example #3
# 0
def get_allele_frequency_trajectories(pname,
                                      samples,
                                      fragment,
                                      qual_min=30,
                                      VERBOSE=0):
    '''Scan the reads of all samples and write to a single file'''
    if VERBOSE >= 1:
        print 'Getting allele frequency trajectories:', pname, fragment

    from hivwholeseq.patients.filenames import get_initial_reference_filename, \
            get_mapped_to_initial_filename, get_allele_frequency_trajectories_filename, \
            get_allele_count_trajectories_filename
    from hivwholeseq.utils.one_site_statistics import get_allele_counts_insertions_from_file, \
            get_allele_counts_insertions_from_file_unfiltered, \
            filter_nus

    refseq = SeqIO.read(get_initial_reference_filename(pname, fragment),
                        'fasta')

    # Prepare output data structures
    cos_traj = np.zeros((len(samples), len(alpha), len(refseq)), int)
    nus_traj = np.zeros((len(samples), len(alpha), len(refseq)))

    for it, sample in enumerate(samples):
        if VERBOSE >= 2:
            print pname, it, sample

        input_filename = get_mapped_to_initial_filename(pname,
                                                        sample,
                                                        fragment,
                                                        type='bam')
        (counts, inserts) = get_allele_counts_insertions_from_file_unfiltered(
            input_filename, len(refseq), qual_min=qual_min, VERBOSE=VERBOSE)
        # Take the total counts, blending in the read types
        cou = counts.sum(axis=0)
        cos_traj[it] = cou

        # Take the filtered frequencies, blending in the read types
        nu = filter_nus(counts)
        nus_traj[it] = nu

    #FIXME: test, etc.

    return (cos_traj, nus_traj)
# Example #4
# 0
def make_consensus(data_folder,
                   adaID,
                   fragment,
                   n_iter,
                   qual_min=20,
                   VERBOSE=0,
                   coverage_min=10,
                   summary=True):
    '''Make consensus sequence from the mapped reads'''
    if VERBOSE:
        print 'Build consensus: ' + adaID + ' ' + fragment + ' iteration ' + str(
            n_iter)

    # Read reference
    reffilename = get_reference_filename(data_folder, adaID, fragment, n_iter)
    refseq = SeqIO.read(reffilename, 'fasta')
    ref = np.array(refseq)

    # Open BAM file
    bamfilename = get_mapped_filename(data_folder, adaID, fragment, n_iter)
    if not os.path.isfile(bamfilename):
        convert_sam_to_bam(bamfilename)

    (counts, inserts) = get_allele_counts_insertions_from_file_unfiltered(bamfilename,\
                                len(refseq), qual_min=qual_min,
                                match_len_min=match_len_min)

    consensus_final = build_consensus(counts,
                                      inserts,
                                      coverage_min=coverage_min,
                                      VERBOSE=VERBOSE)

    if summary:
        with open(get_summary_fn(data_folder, adaID, fragment), 'a') as f:
            f.write('Consensus built for iteration ' + str(n_iter))
            f.write('\n')

    return refseq, consensus_final
def check_coverage(data_folder, adaID, fragment, seq_run, qual_min=35,
                   reference='HXB2', maxreads=-1, VERBOSE=0,
                   rescue=False,
                   minor_allele=False):
    '''Plot the coverage of the reads mapped onto a single fragment.

    The fragment consensus is read only to obtain its length; the allele
    counts are collected from the mapped BAM file and passed to plot_coverage.

    Parameters:
       data_folder, adaID, fragment: passed to the filename helpers to locate
          the consensus and the mapped reads
       seq_run: only used as a label in the figure title
       qual_min, reference: accepted but not used in the body of this function
       maxreads: forwarded to the allele counter and shown in the title
       VERBOSE: forwarded to the allele counter
       rescue: forwarded to get_mapped_filename
       minor_allele: forwarded to plot_coverage

    Returns:
       None
    '''
    consensus_fn = get_consensus_filename(data_folder, adaID, fragment)
    n_sites = len(SeqIO.read(consensus_fn, 'fasta'))

    mapped_fn = get_mapped_filename(data_folder, adaID, fragment,
                                    type='bam', rescue=rescue)

    # Only the counts are plotted, the insertions are not used further
    counts, _inserts = get_allele_counts_insertions_from_file_unfiltered(
        mapped_fn, n_sites, maxreads=maxreads, VERBOSE=VERBOSE)

    # Figure title: "label value" pairs separated by commas
    title_parts = []
    for label, value in (('run', seq_run),
                         ('adaID', adaID),
                         ('fragment', fragment),
                         ('maxreads', maxreads)):
        title_parts.append(label + ' ' + str(value))
    title = ', '.join(title_parts)

    plot_coverage(counts, suptitle=title, minor_allele=minor_allele)
def check_premap(data_folder, adaID, fragments, seq_run, samplename,
                 qual_min=30, match_len_min=10,
                 maxreads=-1, VERBOSE=0,
                 savefig=True,
                 title=None):
    '''Check premap to reference: coverage, etc.'''
    refseq = SeqIO.read(get_reference_premap_filename(data_folder, adaID), 'fasta')

    # FIXME: do this possibly better than parsing the description!
    try:
        fields = refseq.description.split()
        refseq_start = int(fields[fields.index('(indices') - 3])
    except ValueError:
        refseq_start = 550

    fragpos_filename = get_fragment_positions_filename(data_folder, adaID)
    if os.path.isfile(fragpos_filename):
        # Load the fragment positions, considering mixed fragments (e.g. F5a+b)
        fragtmp = []
        postmp = []
        with open(fragpos_filename, 'r') as f:
            f.readline() #HEADER
            for line in f:
                fields = line[:-1].split('\t')
                fragtmp.append(fields[0])
                if 'inner' not in fields[1]:
                    postmp.append([fields[1], fields[4]])
                else:
                    start = int(fields[1].split(',')[1].split(': ')[1].rstrip('}'))
                    end = int(fields[4].split(',')[1].split(': ')[1].rstrip('}'))
                    postmp.append([start, end])

        postmp = np.array(postmp, int)
        # NOTE: In a lot of old files, it says F3o instead of F3ao
        if 'F3o' in fragtmp:
            fragtmp[fragtmp.index('F3o')] = 'F3ao'
        elif 'F3i' in fragtmp:
            fragtmp[fragtmp.index('F3i')] = 'F3ai'


        frags_pos = np.array([postmp[fragtmp.index(fr)] for fr in fragments], int).T

    else:
        frags_pos = None
    
    frags_pos_out = None

    # Open BAM and scan reads
    input_filename = get_premapped_filename(data_folder, adaID, type='bam')
    if not os.path.isfile(input_filename):
        if VERBOSE:
            print 'Premapped BAM file not found'
        return (None, None)

    # Count reads if requested
    n_reads = get_number_reads(input_filename)
    if VERBOSE:
        print 'N. of reads:', n_reads

    # Get counts
    counts, inserts = get_allele_counts_insertions_from_file_unfiltered(input_filename,
                                                             len(refseq),
                                                             qual_min=qual_min,
                                                             match_len_min=match_len_min,
                                                             maxreads=maxreads,
                                                             VERBOSE=VERBOSE)

    # Plot results
    if title is None:
        title=', '.join(['run '+seq_run+' '+adaID,
                         'sample '+samplename,
                         'reads '+str(min(maxreads, n_reads))+'/'+str(n_reads),
                        ])
    plot_coverage(counts,
                  offset_x=refseq_start,
                  frags_pos=frags_pos,
                  frags_pos_out=frags_pos_out,
                  title=title)

    if savefig:
        from hivwholeseq.sequencing.adapter_info import foldername_adapter
        plt.savefig(data_folder+foldername_adapter(adaID)+'figures/coverage_premapped_'+samplename+'.png')

    return (counts, inserts)
# Example #7
# 0
def check_premap(data_folder,
                 adaID,
                 fragments,
                 seq_run,
                 samplename,
                 qual_min=30,
                 match_len_min=10,
                 maxreads=-1,
                 VERBOSE=0,
                 savefig=True,
                 title=None):
    '''Check premap to reference: coverage, etc.'''
    refseq = SeqIO.read(get_reference_premap_filename(data_folder, adaID),
                        'fasta')

    # FIXME: do this possibly better than parsing the description!
    try:
        fields = refseq.description.split()
        refseq_start = int(fields[fields.index('(indices') - 3])
    except ValueError:
        refseq_start = 550

    fragpos_filename = get_fragment_positions_filename(data_folder, adaID)
    if os.path.isfile(fragpos_filename):
        # Load the fragment positions, considering mixed fragments (e.g. F5a+b)
        fragtmp = []
        postmp = []
        with open(fragpos_filename, 'r') as f:
            f.readline()  #HEADER
            for line in f:
                fields = line[:-1].split('\t')
                fragtmp.append(fields[0])
                if 'inner' not in fields[1]:
                    postmp.append([fields[1], fields[4]])
                else:
                    start = int(
                        fields[1].split(',')[1].split(': ')[1].rstrip('}'))
                    end = int(
                        fields[4].split(',')[1].split(': ')[1].rstrip('}'))
                    postmp.append([start, end])

        postmp = np.array(postmp, int)
        # NOTE: In a lot of old files, it says F3o instead of F3ao
        if 'F3o' in fragtmp:
            fragtmp[fragtmp.index('F3o')] = 'F3ao'
        elif 'F3i' in fragtmp:
            fragtmp[fragtmp.index('F3i')] = 'F3ai'

        frags_pos = np.array([postmp[fragtmp.index(fr)] for fr in fragments],
                             int).T

    else:
        frags_pos = None

    frags_pos_out = None

    # Open BAM and scan reads
    input_filename = get_premapped_filename(data_folder, adaID, type='bam')
    if not os.path.isfile(input_filename):
        if VERBOSE:
            print 'Premapped BAM file not found'
        return (None, None)

    # Count reads if requested
    n_reads = get_number_reads(input_filename)
    if VERBOSE:
        print 'N. of reads:', n_reads

    # Get counts
    counts, inserts = get_allele_counts_insertions_from_file_unfiltered(
        input_filename,
        len(refseq),
        qual_min=qual_min,
        match_len_min=match_len_min,
        maxreads=maxreads,
        VERBOSE=VERBOSE)

    # Plot results
    if title is None:
        title = ', '.join([
            'run ' + seq_run + ' ' + adaID,
            'sample ' + samplename,
            'reads ' + str(min(maxreads, n_reads)) + '/' + str(n_reads),
        ])
    plot_coverage(counts,
                  offset_x=refseq_start,
                  frags_pos=frags_pos,
                  frags_pos_out=frags_pos_out,
                  title=title)

    if savefig:
        from hivwholeseq.sequencing.adapter_info import foldername_adapter
        plt.savefig(data_folder + foldername_adapter(adaID) +
                    'figures/coverage_premapped_' + samplename + '.png')

    return (counts, inserts)