Exemple #1
0
def trim_nongenomic_polyA_from_end(mapping, region_fetcher):
    ''' If a mapping ends in a polyA stretch, soft clip from the first
    nongenomic A onward.

    mapping is modified in place (pos, cigar and tags may be rewritten) and
    is also returned. Unmapped mappings and mappings containing indels are
    returned completely untouched.

    Arguments:
        mapping: aligned read exposing is_unmapped, is_reverse, seq,
            aligned_pairs, tid, pos, cigar and tags (presumably a
            pysam.AlignedRead - confirm against callers).
        region_fetcher: callable invoked as region_fetcher(tid, start, end);
            its result for a one-base window is compared against 'A' / 'T'.

    Raises:
        ValueError: if no reference position survived as the new start of
            the alignment.
    '''
    if sam.contains_indel_pysam(mapping) or mapping.is_unmapped:
        return mapping

    bases_to_trim = 0
    first_ref_index = None
    if mapping.is_reverse:
        # For a reverse-strand mapping the polyA tail shows up as a run of
        # T's at the start of the stored sequence, so walk the alignment
        # backwards towards it.
        poly_T_end = find_poly_T(mapping.seq)
        for read_index, ref_index in mapping.aligned_pairs[::-1]:
            if read_index is None:
                # indels are filtered out above, so this can only be
                # a skip from splicing
                continue
            if read_index > poly_T_end:
                # Not part of the poly-T stretch: always kept.
                first_ref_index = ref_index
                continue
            ref_base = region_fetcher(mapping.tid, ref_index, ref_index + 1)
            if ref_base != 'T':
                # Everything from the start of the read up to and including
                # this base is nongenomic.
                bases_to_trim = read_index + 1
                break
            # first_ref_index needs to be set to the last position
            # that passed that 'are you genomic?' test
            first_ref_index = ref_index
    else:
        # Trimming the end of a forward mapping never moves its start.
        first_ref_index = mapping.pos
        poly_A_start = find_poly_A(mapping.seq)
        for read_index, ref_index in mapping.aligned_pairs:
            # Splicing skips have no read base. Python 2 happened to order
            # None below every int, which made the bare '<' skip them; be
            # explicit so the behaviour does not depend on that.
            if read_index is None or read_index < poly_A_start:
                continue
            ref_base = region_fetcher(mapping.tid, ref_index, ref_index + 1)
            if ref_base != 'A':
                bases_to_trim = len(mapping.seq) - read_index
                break

    if first_ref_index is None:
        # Single-argument form prints identically under Python 2 and 3.
        print(mapping)
        raise ValueError('first_ref_index not set')

    if bases_to_trim > 0:
        mapping.pos = first_ref_index

        trimmed_length = len(mapping.seq) - bases_to_trim
        soft_clipped_block = [(sam.BAM_CSOFT_CLIP, bases_to_trim)]
        if mapping.is_reverse:
            # Remove blocks from the beginning.
            trimmed_cigar = sam.truncate_cigar_blocks_from_beginning(
                mapping.cigar, trimmed_length)
            updated_cigar = soft_clipped_block + trimmed_cigar
        else:
            # Remove blocks from the end.
            trimmed_cigar = sam.truncate_cigar_blocks_up_to(
                mapping.cigar, trimmed_length)
            updated_cigar = trimmed_cigar + soft_clipped_block

        mapping.cigar = updated_cigar

    tags = mapping.tags
    if tags:
        # Clear the MD tag since the possible removal of bases to the
        # alignment may have made it inaccurate.
        # TODO: now have machinery to make it accurate.
        # A list comprehension (rather than filter) guarantees a real list
        # is assigned back regardless of Python version.
        mapping.tags = [t for t in tags if t[0] != 'MD']

    set_nongenomic_length(mapping, bases_to_trim)

    return mapping
Exemple #2
0
def trim_mismatches_from_start(mapping, region_fetcher, type_counts):
    ''' Remove all consecutive Q30+ mismatches from the beginning of alignments,
        under the assumption that these represent untemplated additions during
        reverse transcription.
        Characterize the mismatches into type_counts.

        Arguments:
            mapping: aligned read (a pysam.AlignedRead is built for the
                trimmed result, so presumably the input is one too).
            region_fetcher: callable invoked as
                region_fetcher(tid, start, end); its result for a one-base
                window is compared against the read base.
            type_counts: container incremented in place at the 5-tuple key
                (qlen, position from the read start, quality,
                ref base index, read base index) for every aligned base.

        Returns:
            mapping itself if it is unmapped, contains an indel, or nothing
            needed trimming; otherwise a new pysam.AlignedRead holding the
            trimmed alignment. Only qname, tid, pos, is_reverse,
            is_secondary, mapq, seq, qual and cigar are carried over to the
            new read; rnext and pnext are set to -1.

        Raises:
            ValueError: if every aligned base was a Q30+ mismatch, so no
                position is left to anchor the trimmed alignment.
    '''
    if sam.contains_indel_pysam(mapping) or mapping.is_unmapped:
        set_nongenomic_length(mapping, 0)
        return mapping

    is_reverse = mapping.is_reverse
    if is_reverse:
        # Walk from the start of the original read, which is the end of the
        # stored (reference-orientation) sequence.
        aligned_pairs = mapping.aligned_pairs[::-1]
        index_lookup = utilities.base_to_complement_index
    else:
        aligned_pairs = mapping.aligned_pairs
        index_lookup = utilities.base_to_index

    qual = mapping.qual
    decoded_qual = fastq.decode_sanger(qual)

    # Read these once instead of once per aligned base; nothing in the loop
    # modifies mapping.
    seq = mapping.seq
    qlen = mapping.qlen
    tid = mapping.tid

    bases_to_trim = 0
    found_trim_point = False
    first_ref_index = None
    for read_index, ref_index in aligned_pairs:
        if read_index is None:
            # This shouldn't be able to be triggered since alignments
            # containing indels are ruled out above.
            continue

        if is_reverse:
            corrected_read_index = qlen - 1 - read_index
        else:
            corrected_read_index = read_index

        ref_base = region_fetcher(tid, ref_index, ref_index + 1)
        read_base = seq[read_index]
        read_qual = decoded_qual[read_index]
        coords = (
            qlen,
            corrected_read_index,
            read_qual,
            index_lookup[ref_base],
            index_lookup[read_base],
        )
        # Every aligned base is tallied, not just the trimmed ones.
        type_counts[coords] += 1

        if not found_trim_point:
            if read_base != ref_base and read_qual >= 30:
                bases_to_trim += 1
            else:
                first_ref_index = ref_index
                found_trim_point = True

    if first_ref_index is None:
        raise ValueError('first_ref_index not set')

    if bases_to_trim == 0:
        trimmed_mapping = mapping
    else:
        trimmed_mapping = pysam.AlignedRead()
        trimmed_mapping.qname = mapping.qname
        trimmed_mapping.tid = tid

        # first_ref_index has been set above to be the index of the
        # reference base aligned to the first non-trimmed base in the
        # read. If the mapping is forward, this will be the new pos.
        # If the mapping is reverse, the pos won't change.
        if is_reverse:
            first_ref_index = mapping.pos
        trimmed_mapping.pos = first_ref_index

        trimmed_mapping.is_reverse = is_reverse
        trimmed_mapping.is_secondary = mapping.is_secondary
        trimmed_mapping.mapq = mapping.mapq

        if is_reverse:
            # bases_to_trim is never zero here, so there is no danger
            # of minus zero
            trimmed_slice = slice(None, -bases_to_trim)
        else:
            trimmed_slice = slice(bases_to_trim, None)

        trimmed_mapping.seq = seq[trimmed_slice]
        trimmed_mapping.qual = qual[trimmed_slice]
        trimmed_mapping.rnext = -1
        trimmed_mapping.pnext = -1

        trimmed_length = len(seq) - bases_to_trim
        if is_reverse:
            # Remove blocks from the end
            trimmed_cigar = sam.truncate_cigar_blocks_up_to(
                mapping.cigar, trimmed_length)
        else:
            # Remove blocks from the beginning
            trimmed_cigar = sam.truncate_cigar_blocks_from_beginning(
                mapping.cigar, trimmed_length)

        trimmed_mapping.cigar = trimmed_cigar

    return trimmed_mapping
Exemple #3
0
def trim_nongenomic_polyA_from_end(mapping, region_fetcher):
    ''' If a mapping ends in a polyA stretch, soft clip from the first
    nongenomic A onward.

    mapping is modified in place (pos, cigar and tags may be rewritten) and
    is also returned. Unmapped mappings and mappings containing indels are
    returned completely untouched.

    Arguments:
        mapping: aligned read exposing is_unmapped, is_reverse, seq,
            aligned_pairs, tid, pos, cigar and tags (presumably a
            pysam.AlignedRead - confirm against callers).
        region_fetcher: callable invoked as region_fetcher(tid, start, end);
            its result for a one-base window is compared against 'A' / 'T'.

    Raises:
        ValueError: if no reference position survived as the new start of
            the alignment.
    '''
    if sam.contains_indel_pysam(mapping) or mapping.is_unmapped:
        return mapping

    bases_to_trim = 0
    first_ref_index = None
    if mapping.is_reverse:
        # For a reverse-strand mapping the polyA tail shows up as a run of
        # T's at the start of the stored sequence, so walk the alignment
        # backwards towards it.
        poly_T_end = find_poly_T(mapping.seq)
        for read_index, ref_index in mapping.aligned_pairs[::-1]:
            if read_index is None:
                # indels are filtered out above, so this can only be
                # a skip from splicing
                continue
            if read_index > poly_T_end:
                # Not part of the poly-T stretch: always kept.
                first_ref_index = ref_index
                continue
            ref_base = region_fetcher(mapping.tid, ref_index, ref_index + 1)
            if ref_base != 'T':
                # Everything from the start of the read up to and including
                # this base is nongenomic.
                bases_to_trim = read_index + 1
                break
            # first_ref_index needs to be set to the last position
            # that passed that 'are you genomic?' test
            first_ref_index = ref_index
    else:
        # Trimming the end of a forward mapping never moves its start.
        first_ref_index = mapping.pos
        poly_A_start = find_poly_A(mapping.seq)
        for read_index, ref_index in mapping.aligned_pairs:
            # Splicing skips have no read base. Python 2 happened to order
            # None below every int, which made the bare '<' skip them; be
            # explicit so the behaviour does not depend on that.
            if read_index is None or read_index < poly_A_start:
                continue
            ref_base = region_fetcher(mapping.tid, ref_index, ref_index + 1)
            if ref_base != 'A':
                bases_to_trim = len(mapping.seq) - read_index
                break

    if first_ref_index is None:
        # Single-argument form prints identically under Python 2 and 3.
        print(mapping)
        raise ValueError('first_ref_index not set')

    if bases_to_trim > 0:
        mapping.pos = first_ref_index

        trimmed_length = len(mapping.seq) - bases_to_trim
        soft_clipped_block = [(sam.BAM_CSOFT_CLIP, bases_to_trim)]
        if mapping.is_reverse:
            # Remove blocks from the beginning.
            trimmed_cigar = sam.truncate_cigar_blocks_from_beginning(
                mapping.cigar, trimmed_length)
            updated_cigar = soft_clipped_block + trimmed_cigar
        else:
            # Remove blocks from the end.
            trimmed_cigar = sam.truncate_cigar_blocks_up_to(
                mapping.cigar, trimmed_length)
            updated_cigar = trimmed_cigar + soft_clipped_block

        mapping.cigar = updated_cigar

    tags = mapping.tags
    if tags:
        # Clear the MD tag since the possible removal of bases to the
        # alignment may have made it inaccurate.
        # TODO: now have machinery to make it accurate.
        # A list comprehension (rather than filter) guarantees a real list
        # is assigned back regardless of Python version.
        mapping.tags = [t for t in tags if t[0] != 'MD']

    set_nongenomic_length(mapping, bases_to_trim)

    return mapping
Exemple #4
0
def filter_mappings(
    mappings,
    minimum_mapq=42,
    max_insert_length=1000,
    counts_dict=None,
    verbose=False,
    unmapped_fns=None,
):
    ''' Filters out unmapped, nonuniquely mapped, or discordantly mapped
        reads.

        This is a generator: it yields (R1_aligned, R2_aligned) for every
        pair in which both reads are mapped, the pair is neither discordant
        nor disoriented, and both mapqs are at least minimum_mapq.

        Arguments:
            mappings: iterable of mappings, grouped into pairs by qname via
                utilities.group_by (presumably the two mappings of a pair
                must be adjacent - confirm against group_by).
            minimum_mapq: pairs with either mapq below this are counted as
                'nonunique' and dropped.
            max_insert_length: passed through to is_discordant.
            counts_dict: if given, updated with the per-category tallies
                once the generator has been fully consumed.
            verbose: if true, log unmapped and nonunique pairs.
            unmapped_fns: optional (R1 filename, R2 filename); reads from
                pairs with an unmapped mate are written to these files.

        Raises:
            ValueError: if a qname group does not hold exactly two mappings,
                or does not consist of one read1 and one read2.
    '''
    pair_counts = {
        'total': 0,
        'unmapped': 0,
        'indel': 0,
        'nonunique': 0,
        'discordant': 0,
        'disoriented': 0,
        'unique': Counter(),
        'mapqs': Counter(),
        'fragment_lengths': Counter(),
        'tids': Counter(),
    }

    R1_unmapped_fh = None
    R2_unmapped_fh = None
    try:
        if unmapped_fns:
            R1_unmapped_fn, R2_unmapped_fn = unmapped_fns
            R1_unmapped_fh = open(R1_unmapped_fn, 'w')
            R2_unmapped_fh = open(R2_unmapped_fn, 'w')

        for _, aligned_pair in utilities.group_by(mappings, key=lambda m: m.qname):
            if len(aligned_pair) != 2:
                raise ValueError(len(aligned_pair))

            pair_counts['total'] += 1

            R1_aligned, R2_aligned = aligned_pair
            # If R2 is mapped but R1 isn't, R2 gets reported first.
            if not R1_aligned.is_read1:
                R1_aligned, R2_aligned = R2_aligned, R1_aligned

            if (not R1_aligned.is_read1) or (not R2_aligned.is_read2):
                raise ValueError(R1_aligned, R2_aligned)

            pair_counts['mapqs'][R1_aligned.mapq] += 1
            pair_counts['mapqs'][R2_aligned.mapq] += 1

            if R1_aligned.is_unmapped or R2_aligned.is_unmapped:
                pair_counts['unmapped'] += 1

                if verbose:
                    logging.info('{0} was unmapped'.format(R1_aligned.qname))

                if unmapped_fns:
                    R1_read = sam.mapping_to_Read(R1_aligned)
                    R2_read = sam.mapping_to_Read(R2_aligned)
                    R1_unmapped_fh.write(str(R1_read))
                    R2_unmapped_fh.write(str(R2_read))

            elif is_discordant(R1_aligned, R2_aligned, max_insert_length):
                pair_counts['discordant'] += 1

            else:
                pair_counts['tids'][R1_aligned.tid] += 1

                if is_disoriented(R1_aligned, R2_aligned):
                    pair_counts['disoriented'] += 1
                elif R1_aligned.mapq < minimum_mapq or R2_aligned.mapq < minimum_mapq:
                    pair_counts['nonunique'] += 1
                    if verbose:
                        logging.info('{0} was nonunique, {1}, {2}'.format(
                            R1_aligned.qname, R1_aligned.mapq, R2_aligned.mapq))
                else:
                    pair_counts['unique'][R1_aligned.tid] += 1

                    fragment_length = abs(R1_aligned.tlen)
                    pair_counts['fragment_lengths'][fragment_length] += 1

                    # Pairs with indels are tallied but still yielded.
                    if (sam.contains_indel_pysam(R1_aligned)
                            or sam.contains_indel_pysam(R2_aligned)):
                        pair_counts['indel'] += 1

                    yield R1_aligned, R2_aligned
    finally:
        # Runs when the generator is exhausted, raises, or is closed early,
        # so the unmapped-read files are always flushed and released.
        for fh in (R1_unmapped_fh, R2_unmapped_fh):
            if fh is not None:
                fh.close()

    # Only reached when the generator has been run to completion.
    if counts_dict is not None:
        counts_dict.update(pair_counts)
Exemple #5
0
def trim_mismatches_from_start(mapping, region_fetcher, type_counts):
    ''' Remove all consecutive Q30+ mismatches from the beginning of alignments,
        under the assumption that these represent untemplated additions during
        reverse transcription.
        Characterize the mismatches into type_counts.

        Arguments:
            mapping: aligned read (a pysam.AlignedRead is built for the
                trimmed result, so presumably the input is one too).
            region_fetcher: callable invoked as
                region_fetcher(tid, start, end); its result for a one-base
                window is compared against the read base.
            type_counts: container incremented in place at the 5-tuple key
                (qlen, position from the read start, quality,
                ref base index, read base index) for every aligned base.

        Returns:
            mapping itself if it is unmapped, contains an indel, or nothing
            needed trimming; otherwise a new pysam.AlignedRead holding the
            trimmed alignment. Only qname, tid, pos, is_reverse,
            is_secondary, mapq, seq, qual and cigar are carried over to the
            new read; rnext and pnext are set to -1.

        Raises:
            ValueError: if every aligned base was a Q30+ mismatch, so no
                position is left to anchor the trimmed alignment.
    '''
    if sam.contains_indel_pysam(mapping) or mapping.is_unmapped:
        set_nongenomic_length(mapping, 0)
        return mapping

    is_reverse = mapping.is_reverse
    if is_reverse:
        # Walk from the start of the original read, which is the end of the
        # stored (reference-orientation) sequence.
        aligned_pairs = mapping.aligned_pairs[::-1]
        index_lookup = utilities.base_to_complement_index
    else:
        aligned_pairs = mapping.aligned_pairs
        index_lookup = utilities.base_to_index

    qual = mapping.qual
    decoded_qual = fastq.decode_sanger(qual)

    # Read these once instead of once per aligned base; nothing in the loop
    # modifies mapping.
    seq = mapping.seq
    qlen = mapping.qlen
    tid = mapping.tid

    bases_to_trim = 0
    found_trim_point = False
    first_ref_index = None
    for read_index, ref_index in aligned_pairs:
        if read_index is None:
            # This shouldn't be able to be triggered since alignments
            # containing indels are ruled out above.
            continue

        if is_reverse:
            corrected_read_index = qlen - 1 - read_index
        else:
            corrected_read_index = read_index

        ref_base = region_fetcher(tid, ref_index, ref_index + 1)
        read_base = seq[read_index]
        read_qual = decoded_qual[read_index]
        coords = (
            qlen,
            corrected_read_index,
            read_qual,
            index_lookup[ref_base],
            index_lookup[read_base],
        )
        # Every aligned base is tallied, not just the trimmed ones.
        type_counts[coords] += 1

        if not found_trim_point:
            if read_base != ref_base and read_qual >= 30:
                bases_to_trim += 1
            else:
                first_ref_index = ref_index
                found_trim_point = True

    if first_ref_index is None:
        raise ValueError('first_ref_index not set')

    if bases_to_trim == 0:
        trimmed_mapping = mapping
    else:
        trimmed_mapping = pysam.AlignedRead()
        trimmed_mapping.qname = mapping.qname
        trimmed_mapping.tid = tid

        # first_ref_index has been set above to be the index of the
        # reference base aligned to the first non-trimmed base in the
        # read. If the mapping is forward, this will be the new pos.
        # If the mapping is reverse, the pos won't change.
        if is_reverse:
            first_ref_index = mapping.pos
        trimmed_mapping.pos = first_ref_index

        trimmed_mapping.is_reverse = is_reverse
        trimmed_mapping.is_secondary = mapping.is_secondary
        trimmed_mapping.mapq = mapping.mapq

        if is_reverse:
            # bases_to_trim is never zero here, so there is no danger
            # of minus zero
            trimmed_slice = slice(None, -bases_to_trim)
        else:
            trimmed_slice = slice(bases_to_trim, None)

        trimmed_mapping.seq = seq[trimmed_slice]
        trimmed_mapping.qual = qual[trimmed_slice]
        trimmed_mapping.rnext = -1
        trimmed_mapping.pnext = -1

        trimmed_length = len(seq) - bases_to_trim
        if is_reverse:
            # Remove blocks from the end
            trimmed_cigar = sam.truncate_cigar_blocks_up_to(
                mapping.cigar, trimmed_length)
        else:
            # Remove blocks from the beginning
            trimmed_cigar = sam.truncate_cigar_blocks_from_beginning(
                mapping.cigar, trimmed_length)

        trimmed_mapping.cigar = trimmed_cigar

    return trimmed_mapping