Ejemplo n.º 1
0
def filter_read_pair(reads,
                     ref,
                     hist_distance_from_consensus=None,
                     hist_dist_along=None,
                     binsize=None,
                     max_mismatches=100,
                     match_len_min=30,
                     trim_bad_cigars=3,
                     VERBOSE=0):
    '''Classify a read pair as usable or name the reason it is rejected.

    Parameters:
       reads: two-element sequence (read1, read2). Each read must expose
          qname, is_unmapped, is_proper_pair, is_reverse, pos, isize and seq
          (presumably pysam aligned reads -- confirm against the caller).
       ref: reference handed unchanged to get_distance_from_consensus.
       hist_distance_from_consensus: optional histogram, incremented in place
          at the index given by the pair's total number of mismatches.
       hist_dist_along: optional 2D histogram, incremented in place at
          [position bin, total mismatches]. Requires binsize.
       binsize: width of the position bins used for hist_dist_along.
       max_mismatches: pairs with more total mismatches are rejected.
       match_len_min: forwarded to trim_short_cigars_pair.
       trim_bad_cigars: forwarded to trim_short_cigars_pair as trim_pad.
       VERBOSE: with a value >= 2 the reason for a rejection is printed.

    Returns:
       One of the strings 'unmapped', 'unpaired', 'mutator', 'bad_cigar',
       'tiny' or 'good'.

    Raises:
       ValueError: if the two reads do not share the same name.
    '''
    (read1, read2) = reads

    # Check names to make sure we are looking at paired reads, this would
    # screw up the whole bamfile. The names themselves go into the message:
    # no pair index or error counter is available inside this function.
    if read1.qname != read2.qname:
        raise ValueError('Read pair ' + str(read1.qname) + ' / ' +
                         str(read2.qname) + ': reads have different names!')

    # Ignore unmapped reads
    if read1.is_unmapped or read2.is_unmapped:
        if VERBOSE >= 2:
            print('Read pair ' + read1.qname + ': unmapped')
        return 'unmapped'

    # Ignore not properly paired reads (this includes mates sitting on
    # different fragments)
    if (not read1.is_proper_pair) or (not read2.is_proper_pair):
        if VERBOSE >= 2:
            print('Read pair ' + read1.qname + ': not properly paired')
        return 'unpaired'

    # The flag doubles as an index: if the first read is the reverse one,
    # the forward read is reads[1]
    i_fwd = reads[0].is_reverse
    readf = reads[i_fwd]

    # Mismappings are often characterized by many mutations:
    # check the number of mismatches and skip reads with too many
    dc = get_distance_from_consensus(ref, reads, VERBOSE=VERBOSE)
    n_mismatches = dc.sum()

    if hist_distance_from_consensus is not None:
        hist_distance_from_consensus[n_mismatches] += 1

    if hist_dist_along is not None:
        # Bin by the midpoint of the insert; floor division keeps the bin an
        # integer index whatever the division semantics of the interpreter
        hbin = (readf.pos + readf.isize // 2) // binsize
        hist_dist_along[hbin, n_mismatches] += 1

    if n_mismatches > max_mismatches:
        if VERBOSE >= 2:
            print('Read pair ' + read1.qname + ': too many mismatches ' +
                  '(' + str(dc[0]) + ' + ' + str(dc[1]) + ')')
        return 'mutator'

    # Imported here, right before its only use, so that pairs rejected above
    # never need the mapping utilities
    from hivwholeseq.utils.mapping import trim_short_cigars_pair

    # Trim the bad CIGARs from the sides, if there are any good ones
    # FIXME: this must have a bug with leading insertions
    # NOTE: I rewrote the function, now simpler, it should work
    skip = trim_short_cigars_pair(reads,
                                  match_len_min=match_len_min,
                                  trim_pad=trim_bad_cigars,
                                  throw=False)
    if skip:
        return 'bad_cigar'

    # Check the reads are still long enough after trimming
    for read in (read1, read2):
        if len(read.seq) < 100:
            if VERBOSE >= 2:
                print('Read too short: ' + str(read.qname) + ' ' +
                      str(len(read.seq)))
            return 'tiny'

    # NOTE: cross-overhang and similar stuff should never happen, because we
    # filter only insert sizes > 400 after premapping. Nonetheless...
    if readf.isize < 300:
        if VERBOSE >= 2:
            print('Insert too small: ' + str(readf.isize))
        return 'tiny'

    return 'good'
Ejemplo n.º 2
0
def fish_distant_reads(bamfilename, ref,
                       min_mismatches=20, max_mismatches=30,
                       VERBOSE=0, maxseqs=-1):
    '''Fish distant reads from the trash.

    Scans the read pairs of a BAM file and collects those whose total
    distance from the reference lies within [min_mismatches, max_mismatches].

    Parameters:
       bamfilename: path of the BAM file, opened with pysam.Samfile.
       ref: reference; its length is used for the overhang check and it is
          passed to get_distance_from_consensus.
       min_mismatches: lower bound (inclusive) on the total mismatches.
       max_mismatches: upper bound (inclusive) on the total mismatches.
       VERBOSE: >= 2 prints progress every 10000 pairs and a notice when
          maxseqs is reached; >= 3 also prints the name of every hit.
       maxseqs: stop after collecting this many pairs. The default of -1 never
          equals a pair count, so by default the whole file is scanned.

    Returns:
       Tuple (distances, edges, seqs):
        - distances: integer numpy array built from the per-pair results of
          get_distance_from_consensus
        - edges: for each collected pair, a list of (start, end) tuples, one
          per read
        - seqs: list of the pairs produced by pair_generator from the
          collected reads converted with reads_to_seqrecord

    Raises:
       ValueError: if the two reads of a pair have different names.
    '''
    import numpy as np

    from hivwholeseq.utils.mapping import pair_generator, reads_to_seqrecord
    from hivwholeseq.sequencing.filter_mapped_reads import check_overhanging_reads, \
            get_distance_from_consensus

    distances = []
    seqs = []
    edges = []
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        for irp, reads in enumerate(pair_generator(bamfile)):
            if VERBOSE >= 2 and not ((irp + 1) % 10000):
                print(irp + 1)

            (read1, read2) = reads

            # Check a few things to make sure we are looking at paired reads
            if read1.qname != read2.qname:
                raise ValueError('Read pair '+str(irp)+': reads have different names!')

            # Ignore unmapped reads
            if read1.is_unmapped or read2.is_unmapped:
                continue

            # Ignore not properly paired reads (this includes mates sitting on
            # different fragments)
            if (not read1.is_proper_pair) or (not read2.is_proper_pair):
                continue

            # Check for overhangs beyond the edge
            if check_overhanging_reads(reads, len(ref)):
                continue

            # Fish out our reads
            dc = get_distance_from_consensus(ref, reads, VERBOSE=VERBOSE)
            if not (min_mismatches <= dc.sum() <= max_mismatches):
                continue

            if VERBOSE >= 3:
                print('Gotcha! ' + str(reads[0].qname))
            seqs.append(reads[0])
            seqs.append(reads[1])
            distances.append(dc)

            # End coordinate = start + summed lengths of CIGAR operations of
            # type 0 and 2 (presumably match and deletion in the BAM
            # encoding -- confirm against the pysam documentation)
            edge = [(read.pos,
                     read.pos + sum(bl for bt, bl in read.cigar if bt in (0, 2)))
                    for read in reads]
            edges.append(edge)

            # Two entries of seqs per pair
            if len(seqs) // 2 == maxseqs:
                if VERBOSE >= 2:
                    print('Max seqs reached: ' + str(maxseqs))
                break

        # Conversion is done while the BAM file is still open, as before
        seqs = list(pair_generator(reads_to_seqrecord(seqs)))

    distances = np.array(distances, int)
    return (distances, edges, seqs)
Ejemplo n.º 3
0
def filter_read_pair(reads,
                     ref,
                     hist_distance_from_consensus=None,
                     hist_dist_along=None,
                     binsize=None,
                     max_mismatches=100,
                     match_len_min=30,
                     trim_bad_cigars=3,
                     VERBOSE=0):
    '''Classify a read pair as usable or name the reason it is rejected.

    Parameters:
       reads: two-element sequence (read1, read2). Each read must expose
          qname, is_unmapped, is_proper_pair, is_reverse, pos, isize and seq
          (presumably pysam aligned reads -- confirm against the caller).
       ref: reference handed unchanged to get_distance_from_consensus.
       hist_distance_from_consensus: optional histogram, incremented in place
          at the index given by the pair's total number of mismatches.
       hist_dist_along: optional 2D histogram, incremented in place at
          [position bin, total mismatches]. Requires binsize.
       binsize: width of the position bins used for hist_dist_along.
       max_mismatches: pairs with more total mismatches are rejected.
       match_len_min: forwarded to trim_short_cigars_pair.
       trim_bad_cigars: forwarded to trim_short_cigars_pair as trim_pad.
       VERBOSE: with a value >= 2 the reason for a rejection is printed.

    Returns:
       One of the strings 'unmapped', 'unpaired', 'mutator', 'bad_cigar',
       'tiny' or 'good'.

    Raises:
       ValueError: if the two reads do not share the same name.
    '''
    (read1, read2) = reads

    # Check names to make sure we are looking at paired reads, this would
    # screw up the whole bamfile. The names themselves go into the message:
    # no pair index or error counter is available inside this function.
    if read1.qname != read2.qname:
        raise ValueError('Read pair ' + str(read1.qname) + ' / ' +
                         str(read2.qname) + ': reads have different names!')

    # Ignore unmapped reads
    if read1.is_unmapped or read2.is_unmapped:
        if VERBOSE >= 2:
            print('Read pair '+read1.qname+': unmapped')
        return 'unmapped'

    # Ignore not properly paired reads (this includes mates sitting on
    # different fragments)
    if (not read1.is_proper_pair) or (not read2.is_proper_pair):
        if VERBOSE >= 2:
            print('Read pair '+read1.qname+': not properly paired')
        return 'unpaired'

    # The flag doubles as an index: if the first read is the reverse one,
    # the forward read is reads[1]
    i_fwd = reads[0].is_reverse
    readf = reads[i_fwd]

    # Mismappings are often characterized by many mutations:
    # check the number of mismatches and skip reads with too many
    dc = get_distance_from_consensus(ref, reads, VERBOSE=VERBOSE)
    n_mismatches = dc.sum()

    if hist_distance_from_consensus is not None:
        hist_distance_from_consensus[n_mismatches] += 1

    if hist_dist_along is not None:
        # Bin by the midpoint of the insert; floor division keeps the bin an
        # integer index whatever the division semantics of the interpreter
        hbin = (readf.pos + readf.isize // 2) // binsize
        hist_dist_along[hbin, n_mismatches] += 1

    if n_mismatches > max_mismatches:
        if VERBOSE >= 2:
            print('Read pair '+read1.qname+': too many mismatches '+
                  '('+str(dc[0])+' + '+str(dc[1])+')')
        return 'mutator'

    # Imported here, right before its only use, so that pairs rejected above
    # never need the mapping utilities
    from hivwholeseq.utils.mapping import trim_short_cigars_pair

    # Trim the bad CIGARs from the sides, if there are any good ones
    # FIXME: this must have a bug with leading insertions
    # NOTE: I rewrote the function, now simpler, it should work
    skip = trim_short_cigars_pair(reads, match_len_min=match_len_min,
                                  trim_pad=trim_bad_cigars, throw=False)
    if skip:
        return 'bad_cigar'

    # Check the reads are still long enough after trimming
    for read in (read1, read2):
        if len(read.seq) < 100:
            if VERBOSE >= 2:
                print('Read too short: ' + str(read.qname) + ' ' +
                      str(len(read.seq)))
            return 'tiny'

    # NOTE: cross-overhang and similar stuff should never happen, because we
    # filter only insert sizes > 400 after premapping. Nonetheless...
    if readf.isize < 300:
        if VERBOSE >= 2:
            print('Insert too small: ' + str(readf.isize))
        return 'tiny'

    return 'good'