def __init__(self, **kwargs):
        super(WilkeningRNASeqExperiment, self).__init__(**kwargs)

        full_adapter_in_R2 = utilities.reverse_complement(
            self.barcode) + utilities.reverse_complement(
                adapters.primers['PE']['R1'])
        self.adapter_in_R2 = full_adapter_in_R2[:19]
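
Every example on this page leans on utilities.reverse_complement. For reference, a minimal stand-in sketch of its assumed behavior (complement each base, then reverse; the real utility presumably also handles lowercase and ambiguity codes):

COMPLEMENT = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}

def reverse_complement(seq):
    # Complement each base, then reverse the whole string.
    return ''.join(COMPLEMENT[b] for b in reversed(seq.upper()))

print(reverse_complement('GATTACA'))  # prints TGTAATC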
Example #2
def unambiguously_trimmed(bam_fn, unambiguous_bam_fn, genome_dir):
    ''' Reads that have had poly-As trimmed may have had some real RPF A's
        trimmed as well. Retains only mapped reads for which the last aligned
        base and the following base in the reference are both non-A.
    '''
    genome = genomes.load_entire_genome(genome_dir)

    bamfile = pysam.Samfile(bam_fn)
    with pysam.Samfile(unambiguous_bam_fn, 'wb',
                       header=bamfile.header) as unambiguous_bam_fh:
        for read in bamfile:
            rname = bamfile.getrname(read.tid)

            if not read.is_reverse:
                if read.positions[-1] == bamfile.lengths[read.tid] - 1:
                    # There is no next base to get
                    continue
                last_position = read.positions[-1]
                last_base, next_base = genome[rname][
                    last_position:last_position + 2]
            else:
                if read.positions[0] == 0:
                    # There is no next base to get
                    continue
                last_position = read.positions[0]
                last_base, next_base = utilities.reverse_complement(
                    genome[rname][last_position - 1:last_position + 1])

            if last_base.upper() != 'A' and next_base.upper() != 'A':
                unambiguous_bam_fh.write(read)

    pysam.index(unambiguous_bam_fn)
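
A self-contained toy illustration of the acceptance rule above, independent of pysam (forward-strand case only; coordinates are 0-based):

reference = 'CCGTAAGGC'

def unambiguous_forward(last_aligned_position):
    # Keep the read only if the last aligned base and the next reference
    # base are both non-A, so trimming poly-A could not have removed real A's.
    last_base = reference[last_aligned_position]
    next_base = reference[last_aligned_position + 1]
    return last_base.upper() != 'A' and next_base.upper() != 'A'

print(unambiguous_forward(2))  # last='G', next='T' -> True, read is kept
print(unambiguous_forward(3))  # last='T', next='A' -> False, read is discarded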
Example #3
def unambiguously_trimmed(bam_fn, unambiguous_bam_fn, genome_dir):
    ''' Reads that have had poly-As trimmed may have had some real RPF A's
        trimmed as well. Retains only mapped reads for which the last aligned
        base and the following base in the reference are both non-A.
    '''
    genome = genomes.load_entire_genome(genome_dir)
    
    bamfile = pysam.Samfile(bam_fn)
    with pysam.Samfile(unambiguous_bam_fn, 'wb', header=bamfile.header) as unambiguous_bam_fh:
        for read in bamfile:
            rname = bamfile.getrname(read.tid)

            if not read.is_reverse:
                if read.positions[-1] == bamfile.lengths[read.tid] - 1:
                    # There is no next base to get
                    continue
                last_position = read.positions[-1]
                last_base, next_base = genome[rname][last_position:last_position + 2]
            else:
                if read.positions[0] == 0:
                    # There is no next base to get
                    continue
                last_position = read.positions[0]
                last_base, next_base = utilities.reverse_complement(genome[rname][last_position - 1:last_position + 1])

            if last_base.upper() != 'A' and next_base.upper() != 'A':
                unambiguous_bam_fh.write(read)

    pysam.index(unambiguous_bam_fn)
Example #4
def build_adapter_ranges(index_sequence, primer_type='tru_seq'):
    def make_ranges(construct, names):
        cumulative_lengths = list(np.cumsum(map(len, construct)))
        bounds = zip([0] + cumulative_lengths, cumulative_lengths)
        ranges = zip(names, bounds)
        return ranges

    primer_in_R1 = utilities.reverse_complement(primers[primer_type]['R2'])
    primer_in_R2 = utilities.reverse_complement(primers[primer_type]['R1'])

    R1_construct = [
        primer_in_R1,
        index_sequence,
        P7_rc,
        A_tail,
    ]
    R1_names = [
        'R2 primer',
        'I7',
        'P7',
        'A tail',
    ]

    chemistry_only_cycles = 7
    I5_length = 8
    R2_construct = [
        primer_in_R2[:-(I5_length + chemistry_only_cycles)],
        primer_in_R2[-(I5_length +
                       chemistry_only_cycles):-(chemistry_only_cycles)],
        primer_in_R2[-chemistry_only_cycles:],
        P5_rc,
        A_tail,
    ]
    R2_names = [
        'R1 primer',
        'I5',
        'Chemistry',
        'P5',
        'A tail',
    ]

    R1_ranges = make_ranges(R1_construct, R1_names)
    R2_ranges = make_ranges(R2_construct, R2_names)
    return R1_ranges, R2_ranges
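
The inner make_ranges helper just turns piece lengths into half-open (start, end) bounds via a cumulative sum. A self-contained sketch of the same idea on a toy construct (note that under Python 3, map would need to be materialized with list before np.cumsum):

import numpy as np

construct = ['ACGTACGT', 'CCCCCC', 'AAAA']   # toy adapter pieces
names = ['primer', 'index', 'A tail']

cumulative_lengths = [int(x) for x in np.cumsum([len(piece) for piece in construct])]
bounds = list(zip([0] + cumulative_lengths, cumulative_lengths))
ranges = list(zip(names, bounds))

print(ranges)
# [('primer', (0, 8)), ('index', (8, 14)), ('A tail', (14, 18))]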
Example #5
def build_before_adapters(I7_sequence='',
                          primer_type='tru_seq',
                          just_primers=False):
    if just_primers:
        before_R1 = primers[primer_type]['R1']
        before_R2 = primers[primer_type]['R2']
    else:
        before_R1 = flow_cell['P5'] + primers[primer_type]['R1']
        before_R2 = flow_cell['P7'] + utilities.reverse_complement(
            I7_sequence) + primers[primer_type]['R2']

    return before_R1, before_R2
Example #6
def get_seq_info_pairs(clean_bam_fn):
    clean_bam_file = pysam.Samfile(clean_bam_fn)
    
    for aligned_read in clean_bam_file:
        if aligned_read.is_unmapped or aligned_read.is_secondary:
            continue
        perfect_and_unique = dict(aligned_read.tags)['NM'] == 0 and aligned_read.mapq == 50
        if aligned_read.is_reverse:
            seq = utilities.reverse_complement(aligned_read.seq)
        else:
            seq = aligned_read.seq

        yield seq, perfect_and_unique
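
A rough modern-pysam equivalent of the same filter, shown as a sketch only ('example.bam' is a placeholder path, and MAPQ 50 is assumed to be the aligner's unique-mapping score, as in the snippet above):

import pysam

bam_path = 'example.bam'  # placeholder

with pysam.AlignmentFile(bam_path) as bam_file:
    for read in bam_file:
        if read.is_unmapped or read.is_secondary:
            continue
        perfect_and_unique = read.get_tag('NM') == 0 and read.mapping_quality == 50
        # get_forward_sequence() returns the read in its original (read-strand)
        # orientation, matching the manual reverse complement above.
        print(read.get_forward_sequence(), perfect_and_unique)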
Example #7
def build_adapter_ranges(index_sequence, primer_type='tru_seq'):
    def make_ranges(construct, names):
        cumulative_lengths = list(np.cumsum(map(len, construct)))
        bounds = zip([0] + cumulative_lengths, cumulative_lengths)
        ranges = zip(names, bounds)
        return ranges
    
    primer_in_R1 = utilities.reverse_complement(primers[primer_type]['R2'])
    primer_in_R2 = utilities.reverse_complement(primers[primer_type]['R1'])

    R1_construct = [primer_in_R1,
                    index_sequence,
                    P7_rc,
                    A_tail,
                   ]
    R1_names = ['R2 primer',
                'I7',
                'P7',
                'A tail',
               ]

    chemistry_only_cycles = 7
    I5_length = 8
    R2_construct = [primer_in_R2[:-(I5_length + chemistry_only_cycles)],
                    primer_in_R2[-(I5_length + chemistry_only_cycles):-(chemistry_only_cycles)],
                    primer_in_R2[-chemistry_only_cycles:],
                    P5_rc,
                    A_tail,
                   ]
    R2_names = ['R1 primer',
                'I5',
                'Chemistry',
                'P5',
                'A tail',
               ]
    
    R1_ranges = make_ranges(R1_construct, R1_names)
    R2_ranges = make_ranges(R2_construct, R2_names)
    return R1_ranges, R2_ranges
Example #8
def get_seq_info_pairs(clean_bam_fn):
    clean_bam_file = pysam.Samfile(clean_bam_fn)

    for aligned_read in clean_bam_file:
        if aligned_read.is_unmapped or aligned_read.is_secondary:
            continue
        perfect_and_unique = dict(
            aligned_read.tags)['NM'] == 0 and aligned_read.mapq == 50
        if aligned_read.is_reverse:
            seq = utilities.reverse_complement(aligned_read.seq)
        else:
            seq = aligned_read.seq

        yield seq, perfect_and_unique
Example #9
def print_diagnostic(R1, R2, before_R1, before_R2, alignment, fh=sys.stdout):
    extended_R1 = before_R1.lower() + R1.seq
    extended_R2 = utilities.reverse_complement(before_R2.lower() + R2.seq)
    fh.write(R1.name + '\n')
    fh.write(R1.qual + '\n')
    fh.write(R2.qual + '\n')
    fh.write('{0}\t{1}\t{2}\n'.format(alignment['score'], len(alignment['path']) * 2, alignment['score'] - len(alignment['path']) * 2))
    fh.write(str(alignment['path']) + '\n')
    print_local_alignment(extended_R1, extended_R2, alignment['path'], fh=fh)
    fh.write(str(alignment['insertions']) + '\n')
    fh.write(str(alignment['deletions']) + '\n')
    fh.write(str(sorted(alignment['mismatches'])) + '\n')
    for q, t in sorted(alignment['mismatches']):
        fh.write('\t{0}\t{1}\n'.format(extended_R1[q], extended_R2[t]))
Example #10
def print_diagnostic(R1, R2, before_R1, before_R2, alignment, fh=sys.stdout):
    extended_R1 = before_R1.lower() + R1.seq
    extended_R2 = utilities.reverse_complement(before_R2.lower() + R2.seq)
    fh.write(R1.name + '\n')
    fh.write(R1.qual + '\n')
    fh.write(R2.qual + '\n')
    fh.write('{0}\t{1}\t{2}\n'.format(
        alignment['score'],
        len(alignment['path']) * 2,
        alignment['score'] - len(alignment['path']) * 2))
    fh.write(str(alignment['path']) + '\n')
    print_local_alignment(extended_R1, extended_R2, alignment['path'], fh=fh)
    fh.write(str(alignment['insertions']) + '\n')
    fh.write(str(alignment['deletions']) + '\n')
    fh.write(str(sorted(alignment['mismatches'])) + '\n')
    for q, t in sorted(alignment['mismatches']):
        fh.write('\t{0}\t{1}\n'.format(extended_R1[q], extended_R2[t]))
def get_edge_alignments(read, targets):
    seq = read.seq
    seq_rc = utilities.reverse_complement(read.seq)
    all_alignments = []
    min_score = 12
    for target in targets:
        for query, is_reverse in [(seq, False), (seq_rc, True)]:
            alignments = sw.generate_alignments(query,
                                                target.seq,
                                                'unpaired_adapter',
                                                min_score=min_score,
                                               )
            for alignment in alignments:
                if alignment['score'] >= 2 * len(alignment['path']):
                    alignment['query'] = query
                    alignment['rname'] = target.name
                    alignment['is_reverse'] = is_reverse
                    all_alignments.append(alignment)

    return all_alignments
def get_local_alignments(read, targets):
    seq = read.seq
    seq_rc = utilities.reverse_complement(read.seq)
    all_alignments = []
    for target in targets:
        min_score = min(20, 2 * len(target.seq))
        for query, is_reverse in [(seq, False), (seq_rc, True)]:
            alignments = sw.generate_alignments(query,
                                                target.seq,
                                                'local',
                                                min_score=min_score,
                                                max_alignments=3,
                                               )
            for alignment in alignments:
                if alignment['score'] >= 0.5 * 2 * len(alignment['path']):
                    alignment['query'] = query
                    alignment['rname'] = target.name
                    alignment['is_reverse'] = is_reverse
                    all_alignments.append(alignment)

    return all_alignments
Example #13
    def get_extent_sequence(self, left_buffer=0, right_buffer=0):
        ''' Get the sequence of the extent. Useful for looking at genes with
        annotated frameshifts.
        '''
        sequence = self.region_fetcher(self.seqname,
                                       min(self.genomic_to_extent),
                                       max(self.genomic_to_extent) + 1,
                                      )
        if self.strand == '-':
            sequence = utilities.reverse_complement(sequence)

        sequence = np.asarray(sequence, dtype='c')

        extent_landmarks = {'start': 0,
                            'end': self.extent_length,
                           }
        return positions.PositionCounts(extent_landmarks,
                                        left_buffer,
                                        right_buffer,
                                        data=sequence,
                                       )
def get_edge_alignments(read, targets):
    seq = read.seq
    seq_rc = utilities.reverse_complement(read.seq)
    all_alignments = []
    min_score = 10
    for target in targets:
        for query, is_reverse in [(seq, False), (seq_rc, True)]:
            alignments = sw.generate_alignments(
                query,
                target.seq,
                'unpaired_adapter',
                min_score=min_score,
            )
            for alignment in alignments:
                if alignment['score'] >= 2 * len(alignment['path']):
                    alignment['query'] = query
                    alignment['rname'] = target.name
                    alignment['is_reverse'] = is_reverse
                    all_alignments.append(alignment)

    return all_alignments
Example #15
def get_amino_acid_locations(gene, genome):
    amino_acid_locations = defaultdict(list)
    if gene.seqname == 'MT':
        # Ignore these for now - different genetic code and tRNAs presumably mean
        # different translation dynamics
        return None
    try:
        if gene.strand == '+':
            # gene.end is the last base before the stop codon
            seq = genome[gene.seqname][gene.start:gene.end + 4]
            translation = Bio.Seq.translate(seq, cds=True)
        elif gene.strand == '-':
            # gene.start is the first base after the (rc of the) stop codon
            # gene.end is the last base of the (rc of the) start codon
            rc_seq = genome[gene.seqname][gene.start - 3:gene.end + 1]
            seq = utilities.reverse_complement(rc_seq)
            translation = Bio.Seq.translate(seq, cds=True)
    except Bio.Seq.CodonTable.TranslationError, err:
        print err
        print gene.source, gene.feature, gene.seqname
        print gene.attribute
        return None
def get_local_alignments(read, targets):
    seq = read.seq
    seq_rc = utilities.reverse_complement(read.seq)
    all_alignments = []
    for target in targets:
        min_score = min(20, 2 * len(target.seq))
        for query, is_reverse in [(seq, False), (seq_rc, True)]:
            alignments = sw.generate_alignments(
                query,
                target.seq,
                'local',
                min_score=min_score,
                max_alignments=3,
            )
            for alignment in alignments:
                if alignment['score'] >= 0.7 * 2 * len(alignment['path']):
                    alignment['query'] = query
                    alignment['rname'] = target.name
                    alignment['is_reverse'] = is_reverse
                    all_alignments.append(alignment)

    return all_alignments
Example #17
def get_amino_acid_locations(gene, genome):
    amino_acid_locations = defaultdict(list)
    if gene.seqname == 'MT':
        # Ignore these for now - different genetic code and tRNAs presumably mean
        # different translation dynamics
        return None
    try:
        if gene.strand == '+':
            # gene.end is the last base before the stop codon
            seq = genome[gene.seqname][gene.start:gene.end + 4]
            translation = Bio.Seq.translate(seq, cds=True)
        elif gene.strand == '-':
            # gene.start is the first base after the (rc of the) stop codon
            # gene.end is the last base of the (rc of the) start codon
            rc_seq = genome[gene.seqname][gene.start - 3:gene.end + 1]
            seq = utilities.reverse_complement(rc_seq)
            translation = Bio.Seq.translate(seq, cds=True)
    except Bio.Seq.CodonTable.TranslationError, err:
        print err
        print gene.source, gene.feature, gene.seqname
        print gene.attribute
        return None
Example #18
    def get_extent_sequence(self, left_buffer=0, right_buffer=0):
        ''' Get the sequence of the extent. Useful for looking at genes with
        annotated frameshifts.
        '''
        sequence = self.region_fetcher(
            self.seqname,
            min(self.genomic_to_extent),
            max(self.genomic_to_extent) + 1,
        )
        if self.strand == '-':
            sequence = utilities.reverse_complement(sequence)

        sequence = np.asarray(sequence, dtype='c')

        extent_landmarks = {
            'start': 0,
            'end': self.extent_length,
        }
        return positions.PositionCounts(
            extent_landmarks,
            left_buffer,
            right_buffer,
            data=sequence,
        )
Example #19
def infer_insert_length(R1, R2, before_R1, before_R2):
    ''' Infer the length of the insert represented by R1 and R2 by performing
        a semi-local alignment of R1 and the reverse complement of R2 with
        the expected adapter sequences prepended to each read.
    '''
    extended_R1 = before_R1 + R1.seq
    extended_R2 = utilities.reverse_complement(before_R2 + R2.seq)
    alignment,  = generate_alignments(extended_R1,
                                      extended_R2, 
                                      'overlap',
                                      2,
                                      -1,
                                      -5,
                                      1,
                                      0,
                                     )

    #print_diagnostic(R1, R2, before_R1, before_R2, alignment)
    
    R1_start = len(before_R1)
    R2_start = len(R2.seq) - 1
    R1_start_in_R2 = alignment['query_mappings'][len(before_R1)]
    R2_start_in_R1 = alignment['target_mappings'][len(R2.seq) - 1]
    
    # Since R1 is the query and R2 is the target, bases in R1 that aren't in
    # R2 are called insertions, and bases in R2 that aren't in R1 are called
    # deletions.
    # An indel in the insert is non-physical.
    if R2_start_in_R1 != SOFT_CLIPPED:
        illegal_insertion = any(R1_start <= i <= R2_start_in_R1 for i in alignment['insertions'])
    else:
        illegal_insertion = any(R1_start <= i for i in alignment['insertions'])

    if R1_start_in_R2 != SOFT_CLIPPED:
        illegal_deletion = any(R1_start_in_R2 <= d <= R2_start for d in alignment['deletions'])
    else:
        illegal_deletion = any(d <= R2_start for d in alignment['deletions'])
    
    if illegal_insertion or illegal_deletion:
        return 'illegal', 500, -1

    if R1_start_in_R2 != SOFT_CLIPPED and R2_start_in_R1 != SOFT_CLIPPED:
        length_from_R1 = R2_start_in_R1 - R1_start + 1
        length_from_R2 = R2_start - R1_start_in_R2 + 1
    else:
        # overlap alignment forces the alignment to start with either the
        # beginning of R1 or R2 and end with either the end of R1 or R2. 
        # Making it to this else brach means that either the first base of R1 or
        # the first base of R2 or both wasn't aligned. This either means that
        # the insert is longer than the read length or a pathological alignment
        # has been produced in which only adapter bases are involved in the 
        # alignment. Flag the second case as illegal.

        first_R1_index, first_R2_index = alignment['path'][0]
        length_from_R1 = (first_R1_index - R1_start + 1) + (len(R2.seq) - 1)

        last_R1_index, last_R2_index = alignment['path'][-1]
        length_from_R2 = (R2_start - last_R2_index + 1) + (len(R1.seq) - 1)
        
        if first_R1_index == 0 or last_R2_index == 0:
            return 'illegal', 500, -1 

    if length_from_R1 < -1 or length_from_R2 < -1:
        # Negative insert lengths are non-physical. Even though I don't
        # understand it, -1 is relatively common so is tolerated.
        return 'illegal', 500, -1

    insert_length = length_from_R1

    if 2 * len(alignment['path']) - alignment['score'] > .2 * len(alignment['path']):
        status = 'bad'
    else:
        status = 'good'
    
    if status == 'good' and (length_from_R1 != length_from_R2):
        print 'length from R1', length_from_R1
        print 'length from R2', length_from_R2
        print_diagnostic(R1, R2, before_R1, before_R2, alignment)
        # This shouldn't be possible without an illegal indel.
        raise ValueError

    return status, insert_length, alignment
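
A worked toy example of the coordinate arithmetic in the fully overlapping case (numbers are illustrative only; the two estimates must agree when there is no indel):

len_before_R1 = 10   # toy adapter prefix length
len_R1 = 50
len_R2 = 50
insert_length = 35   # true fragment length for this toy case

R1_start = len_before_R1    # index of R1's first base in extended_R1 -> 10
R2_start = len_R2 - 1       # index of R2's first base in rc(extended_R2) -> 49

# With a 35 bp insert, R2's first base lands at index 10 + 35 - 1 = 44 of
# extended_R1, and R1's first base lands at index 50 - 35 = 15 of rc(extended_R2).
R2_start_in_R1 = 44
R1_start_in_R2 = 15

length_from_R1 = R2_start_in_R1 - R1_start + 1   # 44 - 10 + 1 = 35
length_from_R2 = R2_start - R1_start_in_R2 + 1   # 49 - 15 + 1 = 35
assert length_from_R1 == length_from_R2 == insert_length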
Example #20
def extract_seqs_from_combined(
    combined_mapping,
    include_overlap=True,
    remove_soft_clipped=True,
    flip_if_reverse=True,
):
    ''' Separates out the R1 and R2 seq and quals that went into a
    combined_mapping.
    '''
    strand = sam.get_strand(combined_mapping)
    tags = dict(combined_mapping.tags)
    if 'Xs' not in tags:
        tags['Xs'] = ''
        tags['Xq'] = ''
        tags['Xw'] = 'left'

    skip_index = find_skip_index_in_combined(combined_mapping)

    left_cigar = combined_mapping.cigar[:skip_index]
    right_cigar = combined_mapping.cigar[skip_index + 1:]

    left_length = sam.total_read_nucs(left_cigar)

    left_seq = combined_mapping.seq[:left_length]
    left_qual = combined_mapping.qual[:left_length]

    right_seq = combined_mapping.seq[left_length:]
    right_qual = combined_mapping.qual[left_length:]

    if remove_soft_clipped:
        first_left_op, first_left_length = left_cigar[0]
        if first_left_op == sam.BAM_CSOFT_CLIP:
            left_seq = left_seq[first_left_length:]
            left_qual = left_qual[first_left_length:]

        last_right_op, last_right_length = right_cigar[-1]
        if last_right_op == sam.BAM_CSOFT_CLIP:
            right_seq = right_seq[:-last_right_length]
            right_qual = right_qual[:-last_right_length]

    if include_overlap:
        if tags['Xw'] == 'left':
            # Overlapping sequence in the combined read reflects that from the
            # left mapping, so the overlap from the right was stored in the Xs
            # and Xq tags.
            right_seq = tags['Xs'] + right_seq
            right_qual = tags['Xq'] + right_qual
        elif tags['Xw'] == 'right':
            # Overlapping sequence in the combined read reflects that from the
            # right mapping, so the overlap from the left was stored in the Xs
            # and Xq tags.
            left_seq = left_seq + tags['Xs']
            left_qual = left_qual + tags['Xq']

    if strand == '+':
        R1_seq, R1_qual = left_seq, left_qual
        R2_seq, R2_qual = right_seq, right_qual

        if flip_if_reverse:
            R2_seq = utilities.reverse_complement(R2_seq)
            R2_qual = R2_qual[::-1]

    elif strand == '-':
        R1_seq, R1_qual = right_seq, right_qual
        R2_seq, R2_qual = left_seq, left_qual

        if flip_if_reverse:
            R1_seq = utilities.reverse_complement(R1_seq)
            R1_qual = R1_qual[::-1]

    return R1_seq, R1_qual, R2_seq, R2_qual
def find_boundary_sequences(R1, R2, counters):
    # Find which read in the read pair is from the reverse strand by looking for
    # common_right_reverse.
    # First try to find a unique position entirely contained within R1 or R2
    # that is close to common_right_reverse.
    # Failing this, find the longest of (the longest suffix of R1 or R2 that
    # matches a prefix of common_right_reverse) or (the longest prefix of R1 or
    # R2 that matches a suffix of common_right_reverse).

    R1_contained, R1_prefix, R1_suffix = all_adapter_possibilites(R1.seq, common_right_reverse)
    R2_contained, R2_prefix, R2_suffix = all_adapter_possibilites(R2.seq, common_right_reverse)

    if len(R1_contained) + len(R2_contained) > 1:
        # Only one occurrence of common_right_reverse should exist between R1
        # and R2.
        return None, None
    elif len(R1_contained) + len(R2_contained) == 0:
        possiblities = [(len(common_right_reverse) - R1_prefix, 'R1_prefix'),
                        (len(common_right_reverse) - R2_prefix, 'R2_prefix'),
                        (len(common_right_reverse) - R1_suffix, 'R1_suffix'),
                        (len(common_right_reverse) - R2_suffix, 'R2_suffix'),
                       ]
        length, kind = max(possiblities)
        if length > 5:
            if 'R1' in kind:
                reverse_read = R1
                forward_read = R2
                polyA_read = 'R2_forward'
                polyT_read = 'R1_reverse'
            elif 'R2' in kind:
                reverse_read = R2
                forward_read = R1
                polyA_read = 'R1_forward'
                polyT_read = 'R2_reverse'
            if 'prefix' in kind:
                common_right_reverse_start = len(reverse_read.seq) - length
            elif 'suffix' in kind:
                common_right_reverse_start = -length
        else:
            return None, None

    elif len(R1_contained) == 1:
        reverse_read = R1
        forward_read = R2
        polyA_read = 'R2_forward'
        polyT_read = 'R1_reverse'
        common_right_reverse_start = R1_contained.pop()
    elif len(R2_contained) == 1:
        reverse_read = R2
        forward_read = R1
        polyA_read = 'R1_forward'
        polyT_read = 'R2_reverse'
        common_right_reverse_start = R2_contained.pop()

    # '*' means that there was no opportunity to see this id.
    # 'X' means that there was an opportunity and it was neither A nor B.
    right_id = '*'
    left_id = '*'

    five_payload_slice = slice(None, max(0, common_right_reverse_start))
    five_payload_seq = utilities.reverse_complement(reverse_read.seq[five_payload_slice])
    five_payload_qual = reverse_read.qual[five_payload_slice][::-1]

    current_p = common_right_reverse_start + len(common_right_reverse)
    if current_p < len(reverse_read.seq) - after_right_length:
        right_id_seq = reverse_read.seq[current_p:current_p + after_right_length]
        for key, prefix in after_right_prefix.items():
            if right_id_seq == prefix:
                right_id = key
        if right_id == '*':
            right_id = 'X'

        counters['right_ids'][right_id_seq] += 1

        if right_id != 'X':
            current_p += len(after_right[right_id])
            if current_p < len(reverse_read.seq) - 4:
                left_id_seq = reverse_read.seq[current_p:current_p + 4]
                for key, sequence in after_left.items():
                    if left_id_seq == sequence:
                        left_id = key
                if left_id == '*':
                    left_id = 'X'
            
                counters['left_ids'][left_id_seq] += 1

    polyA_start, polyA_length = find_polyA_cython.find_polyA(forward_read.seq, 15)
    polyA_slice = slice(polyA_start, polyA_start + polyA_length)
    polyA_seq = forward_read.seq[polyA_slice]
    polyA_qual = fastq.sanitize_qual(forward_read.qual[polyA_slice])
    three_payload_slice = slice(None, polyA_start)
    three_payload_seq = forward_read.seq[three_payload_slice]
    three_payload_qual = forward_read.qual[three_payload_slice]

    common_name, _ = R1.name.rsplit(':', 1)
    control_ids_string = '{0}-{1}'.format(left_id, right_id)
    five_annotation = trim.PayloadAnnotation(original_name=common_name,
                                             left_seq=control_ids_string,
                                             left_qual='',
                                             right_seq='',
                                             right_qual='',
                                            )
    three_annotation = trim.PayloadAnnotation(original_name=common_name,
                                              left_seq=control_ids_string,
                                              left_qual='',
                                              right_seq=polyA_seq,
                                              right_qual=polyA_qual,
                                             )
    five_payload_read = fastq.Read(five_annotation.identifier, five_payload_seq, five_payload_qual)
    three_payload_read = fastq.Read(three_annotation.identifier, three_payload_seq, three_payload_qual)

    counters['positions'][polyT_read][max(0, common_right_reverse_start)] += 1
    counters['positions'][polyA_read][polyA_start] += 1
    counters['joint_lengths'][max(0, common_right_reverse_start), polyA_start] += 1
    counters['polyA_lengths'][polyA_length] += 1
    counters['control_ids'][control_ids_string] += 1

    if polyA_length < 13:
        return None, None

    return five_payload_read, three_payload_read
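
all_adapter_possibilites is not shown on this page; a rough pure-Python stand-in for two of the searches described in the comment block (full containment, and longest read suffix matching an adapter prefix) could look like this, using a return convention of my own rather than the original's:

def contained_positions(read_seq, adapter):
    # Start positions of every full occurrence of adapter inside read_seq.
    positions = set()
    start = read_seq.find(adapter)
    while start != -1:
        positions.add(start)
        start = read_seq.find(adapter, start + 1)
    return positions

def longest_suffix_matching_prefix(read_seq, adapter):
    # Length of the longest suffix of read_seq that equals a prefix of adapter.
    for length in range(min(len(read_seq), len(adapter)), 0, -1):
        if read_seq[-length:] == adapter[:length]:
            return length
    return 0

read_seq = 'TTTTGACCTAGGCA'   # toy read ending in the start of a toy adapter
adapter = 'GGCATTTCC'
print(contained_positions(read_seq, adapter))             # set()
print(longest_suffix_matching_prefix(read_seq, adapter))  # 4 ('GGCA')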
Example #22
def build_adapters(index_sequence='', max_length=None, primer_type='tru_seq'):
    before_R1, before_R2 = build_before_adapters(index_sequence, primer_type)
    adapter_in_R1 = utilities.reverse_complement(before_R2) + A_tail
    adapter_in_R2 = utilities.reverse_complement(before_R1) + A_tail
    truncated_slice = slice(None, max_length)
    return adapter_in_R1[truncated_slice], adapter_in_R2[truncated_slice]
    def __init__(self, **kwargs):
        super(ThreeTFillExperiment, self).__init__(**kwargs)

        self.barcode = kwargs['barcode']
        full_adapter_in_R1 = utilities.reverse_complement(self.barcode) + utilities.reverse_complement(adapters.primers['PE']['R2']) 
        self.adapter_in_R1 = full_adapter_in_R1[:19]
Example #24
def align_reads(
    target_fasta_fn,
    reads,
    bam_fn,
    min_path_length=15,
    error_fn='/dev/null',
    alignment_type='overlap',
):
    ''' Aligns reads to targets in target_fasta_fn by Smith-Waterman, storing
    alignments in bam_fn and yielding unaligned reads.
    '''
    targets = {r.name: r.seq for r in fasta.reads(target_fasta_fn)}

    target_names = sorted(targets)
    target_lengths = [len(targets[n]) for n in target_names]
    alignment_sorter = sam.AlignmentSorter(
        target_names,
        target_lengths,
        bam_fn,
    )
    statistics = Counter()

    with alignment_sorter:
        for original_read in reads:
            statistics['input'] += 1

            alignments = []

            rc_read = fastq.Read(
                original_read.name,
                utilities.reverse_complement(original_read.seq),
                original_read.qual[::-1],
            )

            for read, is_reverse in ([original_read, False], [rc_read, True]):
                qual = fastq.decode_sanger(read.qual)
                for target_name, target_seq in targets.iteritems():
                    alignment = generate_alignments(read.seq, target_seq,
                                                    alignment_type)[0]
                    path = alignment['path']
                    if len(path) >= min_path_length and alignment['score'] / (
                            2. * len(path)) > 0.8:
                        aligned_segment = pysam.AlignedSegment()
                        aligned_segment.seq = read.seq
                        aligned_segment.query_qualities = qual
                        aligned_segment.is_reverse = is_reverse

                        char_pairs = make_char_pairs(path, read.seq,
                                                     target_seq)

                        cigar = sam.aligned_pairs_to_cigar(char_pairs)
                        clip_from_start = first_query_index(path)
                        if clip_from_start > 0:
                            cigar = [(sam.BAM_CSOFT_CLIP, clip_from_start)
                                     ] + cigar
                        clip_from_end = len(
                            read.seq) - 1 - last_query_index(path)
                        if clip_from_end > 0:
                            cigar = cigar + [
                                (sam.BAM_CSOFT_CLIP, clip_from_end)
                            ]
                        aligned_segment.cigar = cigar

                        read_aligned, ref_aligned = zip(*char_pairs)
                        md = sam.alignment_to_MD_string(
                            ref_aligned, read_aligned)
                        aligned_segment.set_tag('MD', md)

                        aligned_segment.set_tag('AS', alignment['score'])
                        aligned_segment.tid = alignment_sorter.get_tid(
                            target_name)
                        aligned_segment.query_name = read.name
                        aligned_segment.next_reference_id = -1
                        aligned_segment.reference_start = first_target_index(
                            path)

                        alignments.append(aligned_segment)

            if alignments:
                statistics['aligned'] += 1

                sorted_alignments = sorted(alignments,
                                           key=lambda m: m.get_tag('AS'),
                                           reverse=True)
                grouped = utilities.group_by(sorted_alignments,
                                             key=lambda m: m.get_tag('AS'))
                _, highest_group = grouped.next()
                primary_already_assigned = False
                for alignment in highest_group:
                    if len(highest_group) == 1:
                        alignment.mapping_quality = 2
                    else:
                        alignment.mapping_quality = 1

                    if not primary_already_assigned:
                        primary_already_assigned = True
                    else:
                        alignment.is_secondary = True

                    alignment_sorter.write(alignment)
            else:
                statistics['unaligned'] += 1

                yield original_read

        with open(error_fn, 'w') as error_fh:
            for key in ['input', 'aligned', 'unaligned']:
                error_fh.write('{0}: {1:,}\n'.format(key, statistics[key]))
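
The soft-clipping bookkeeping around the Smith-Waterman path reduces to index arithmetic over the query; a self-contained sketch using pysam-style numeric CIGAR codes (0 = M, 4 = S) and assuming an indel-free path:

BAM_CMATCH = 0      # M
BAM_CSOFT_CLIP = 4  # S

def clip_cigar(read_length, first_query_index, last_query_index):
    # Build a CIGAR that soft-clips everything outside the aligned span,
    # treating the aligned span itself as one M block (no indels).
    aligned_length = last_query_index - first_query_index + 1
    cigar = [(BAM_CMATCH, aligned_length)]
    if first_query_index > 0:
        cigar = [(BAM_CSOFT_CLIP, first_query_index)] + cigar
    clip_from_end = read_length - 1 - last_query_index
    if clip_from_end > 0:
        cigar = cigar + [(BAM_CSOFT_CLIP, clip_from_end)]
    return cigar

print(clip_cigar(50, 3, 44))  # [(4, 3), (0, 42), (4, 5)]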
Example #25
         open(args.bad_R2_fn, 'w') as bad_R2_fn:

        for R1, R2 in itertools.islice(read_pairs, 10000):
            if len(R1.seq) != len(R2.seq):
                bad_R1_fn.write(str(R1))
                bad_R2_fn.write(str(R2))
                continue

            status, insert_length, alignment = infer_insert_length(
                R1, R2, '', '')
            if status == 'bad':
                bad_R1_fn.write(str(R1))
                bad_R2_fn.write(str(R2))
                continue
            else:
                R2_rc_seq = utilities.reverse_complement(R2.seq)
                R2_rc_qual = R2.qual[::-1]

                just_R1_slice = slice(None, insert_length - len(R1.seq))
                just_R1_seq = R1.seq[just_R1_slice]
                just_R1_qual = R1.qual[just_R1_slice]

                overlap_R1_slice = slice(insert_length - len(R1.seq), None)
                overlap_R1_seq = R1.seq[overlap_R1_slice]
                overlap_R1_qual = R1.qual[overlap_R1_slice]

                overlap_R2_slice = slice(None, len(overlap_R1_seq))
                overlap_R2_seq = R2_rc_seq[overlap_R2_slice]
                overlap_R2_qual = R2_rc_qual[overlap_R2_slice]

                just_R2_slice = slice(len(overlap_R1_seq), None)
Example #26
def infer_insert_length(R1, R2, before_R1, before_R2, solid=False):
    ''' Infer the length of the insert represented by R1 and R2 by performing
        a semi-local alignment of R1 and the reverse complement of R2 with
        the expected adapter sequences prepended to each read.
    '''
    extended_R1 = before_R1 + R1.seq
    extended_R2 = utilities.reverse_complement(before_R2 + R2.seq)
    alignment, = generate_alignments(
        extended_R1,
        extended_R2,
        'overlap',
        2,
        -1,
        -5,
        1,
        0,
    )

    R1_start = len(before_R1)
    R2_start = len(R2.seq) - 1
    R1_start_in_R2 = alignment['query_mappings'][len(before_R1)]
    R2_start_in_R1 = alignment['target_mappings'][len(R2.seq) - 1]

    # Since R1 is the query and R2 is the target, bases in R1 that aren't in
    # R2 are called insertions, and bases in R2 that aren't in R1 are called
    # deletions.
    # An indel in the insert is non-physical.
    if R2_start_in_R1 != SOFT_CLIPPED:
        illegal_insertion = any(R1_start <= i <= R2_start_in_R1
                                for i in alignment['insertions'])
    else:
        illegal_insertion = any(R1_start <= i for i in alignment['insertions'])

    if R1_start_in_R2 != SOFT_CLIPPED:
        illegal_deletion = any(R1_start_in_R2 <= d <= R2_start
                               for d in alignment['deletions'])
    else:
        illegal_deletion = any(d <= R2_start for d in alignment['deletions'])

    if illegal_insertion or illegal_deletion:
        return 'illegal', 500, -1

    if len(alignment['path']) == 0:
        return 'illegal', 500, -1

    if R1_start_in_R2 != SOFT_CLIPPED and R2_start_in_R1 != SOFT_CLIPPED:
        length_from_R1 = R2_start_in_R1 - R1_start + 1
        length_from_R2 = R2_start - R1_start_in_R2 + 1
    else:
        # overlap alignment forces the alignment to start with either the
        # beginning of R1 or R2 and end with either the end of R1 or R2.
        # Making it to this else branch means that either the first base of R1 or
        # the first base of R2 or both wasn't aligned. This either means that
        # the insert is longer than the read length or a pathological alignment
        # has been produced in which only adapter bases are involved in the
        # alignment. Flag the second case as illegal.

        try:
            first_R1_index, first_R2_index = alignment['path'][0]
        except IndexError:
            print R1
            print R2
            print alignment
            raise
        length_from_R1 = (first_R1_index - R1_start + 1) + (len(R2.seq) - 1)

        last_R1_index, last_R2_index = alignment['path'][-1]
        length_from_R2 = (R2_start - last_R2_index + 1) + (len(R1.seq) - 1)

        if first_R1_index == 0 or last_R2_index == 0:
            return 'illegal', 500, -1

    if length_from_R1 < -1 or length_from_R2 < -1:
        # Negative insert lengths are non-physical. Even though I don't
        # understand it, -1 is relatively common so is tolerated.
        return 'illegal', 500, -1

    insert_length = length_from_R1

    if 2 * len(alignment['path']) - alignment['score'] > .2 * len(
            alignment['path']):
        status = 'bad'
    else:
        status = 'good'

    if status == 'good' and (length_from_R1 != length_from_R2):
        if solid and not (alignment['insertions'] or alignment['deletions']):
            pass
        else:
            # This shouldn't be possible without an illegal indel.
            #print 'length from R1', length_from_R1
            #print 'length from R2', length_from_R2
            #print_diagnostic(R1, R2, before_R1, before_R2, alignment)
            return 'illegal', 500, -1

    #print_diagnostic(R1, R2, before_R1, before_R2, alignment)

    return status, insert_length, alignment
    def get_R2_rc_reads():
        read_pairs = islice(get_read_pairs(), 100)
        return (fastq.Read(R2.name, utilities.reverse_complement(R2.seq),
                           R2.qual[::-1]) for R1, R2 in read_pairs)
def find_boundary_sequences(R1, R2, counters):
    # Find which read in the read pair is from the reverse strand by looking for
    # common_right_reverse.
    # First try to find a unique position entirely contained within R1 or R2
    # that is close to common_right_reverse.
    # Failing this, find the longest of (the longest suffix of R1 or R2 that
    # matches a prefix of common_right_reverse) or (the longest prefix of R1 or
    # R2 that matches a suffix of common_right_reverse).

    R1_contained, R1_prefix, R1_suffix = all_adapter_possibilites(
        R1.seq, common_right_reverse)
    R2_contained, R2_prefix, R2_suffix = all_adapter_possibilites(
        R2.seq, common_right_reverse)

    if len(R1_contained) + len(R2_contained) > 1:
        # Only one occurrence of common_right_reverse should exist between R1
        # and R2.
        return None, None
    elif len(R1_contained) + len(R2_contained) == 0:
        possiblities = [
            (len(common_right_reverse) - R1_prefix, 'R1_prefix'),
            (len(common_right_reverse) - R2_prefix, 'R2_prefix'),
            (len(common_right_reverse) - R1_suffix, 'R1_suffix'),
            (len(common_right_reverse) - R2_suffix, 'R2_suffix'),
        ]
        length, kind = max(possiblities)
        if length > 5:
            if 'R1' in kind:
                reverse_read = R1
                forward_read = R2
                polyA_read = 'R2_forward'
                polyT_read = 'R1_reverse'
            elif 'R2' in kind:
                reverse_read = R2
                forward_read = R1
                polyA_read = 'R1_forward'
                polyT_read = 'R2_reverse'
            if 'prefix' in kind:
                common_right_reverse_start = len(reverse_read.seq) - length
            elif 'suffix' in kind:
                common_right_reverse_start = -length
        else:
            return None, None

    elif len(R1_contained) == 1:
        reverse_read = R1
        forward_read = R2
        polyA_read = 'R2_forward'
        polyT_read = 'R1_reverse'
        common_right_reverse_start = R1_contained.pop()
    elif len(R2_contained) == 1:
        reverse_read = R2
        forward_read = R1
        polyA_read = 'R1_forward'
        polyT_read = 'R2_reverse'
        common_right_reverse_start = R2_contained.pop()

    # '*' means that there was no opportunity to see this id.
    # 'X' means that there was an opportunity and it was neither A nor B.
    right_id = '*'
    left_id = '*'

    five_payload_slice = slice(None, max(0, common_right_reverse_start))
    five_payload_seq = utilities.reverse_complement(
        reverse_read.seq[five_payload_slice])
    five_payload_qual = reverse_read.qual[five_payload_slice][::-1]

    current_p = common_right_reverse_start + len(common_right_reverse)
    if current_p < len(reverse_read.seq) - after_right_length:
        right_id_seq = reverse_read.seq[current_p:current_p +
                                        after_right_length]
        for key, prefix in after_right_prefix.items():
            if right_id_seq == prefix:
                right_id = key
        if right_id == '*':
            right_id = 'X'

        counters['right_ids'][right_id_seq] += 1

        if right_id != 'X':
            current_p += len(after_right[right_id])
            if current_p < len(reverse_read.seq) - 4:
                left_id_seq = reverse_read.seq[current_p:current_p + 4]
                for key, sequence in after_left.items():
                    if left_id_seq == sequence:
                        left_id = key
                if left_id == '*':
                    left_id = 'X'

                counters['left_ids'][left_id_seq] += 1

    polyA_start, polyA_length = find_polyA_cython.find_polyA(
        forward_read.seq, 15)
    polyA_slice = slice(polyA_start, polyA_start + polyA_length)
    polyA_seq = forward_read.seq[polyA_slice]
    polyA_qual = fastq.sanitize_qual(forward_read.qual[polyA_slice])
    three_payload_slice = slice(None, polyA_start)
    three_payload_seq = forward_read.seq[three_payload_slice]
    three_payload_qual = forward_read.qual[three_payload_slice]

    common_name, _ = R1.name.rsplit(':', 1)
    control_ids_string = '{0}-{1}'.format(left_id, right_id)
    five_annotation = trim.PayloadAnnotation(
        original_name=common_name,
        left_seq=control_ids_string,
        left_qual='',
        right_seq='',
        right_qual='',
    )
    three_annotation = trim.PayloadAnnotation(
        original_name=common_name,
        left_seq=control_ids_string,
        left_qual='',
        right_seq=polyA_seq,
        right_qual=polyA_qual,
    )
    five_payload_read = fastq.Read(five_annotation.identifier,
                                   five_payload_seq, five_payload_qual)
    three_payload_read = fastq.Read(three_annotation.identifier,
                                    three_payload_seq, three_payload_qual)

    counters['positions'][polyT_read][max(0, common_right_reverse_start)] += 1
    counters['positions'][polyA_read][polyA_start] += 1
    counters['joint_lengths'][max(0, common_right_reverse_start),
                              polyA_start] += 1
    counters['polyA_lengths'][polyA_length] += 1
    counters['control_ids'][control_ids_string] += 1

    if polyA_length < 13:
        return None, None

    return five_payload_read, three_payload_read
    def __init__(self, **kwargs):
        super(WilkeningRNASeqExperiment, self).__init__(**kwargs)

        full_adapter_in_R2 = utilities.reverse_complement(self.barcode) + utilities.reverse_complement(adapters.primers['PE']['R1'])
        self.adapter_in_R2 = full_adapter_in_R2[:19]
Example #30
def extend_polyA_end(mapping, region_fetcher, trimmed_twice=False):
    if mapping.is_unmapped:
        return mapping

    if trimmed_twice:
        # Trailing poly-As were removed by the second trimming step. 
        annotation = TrimmedTwiceAnnotation.from_identifier(mapping.qname)
        polyA_seq = annotation['retrimmed_right_seq']
        polyA_qual = annotation['retrimmed_right_qual']
        new_qname = PayloadAnnotation.from_prefix_identifier(mapping.qname).identifier
    else:
        annotation = PayloadAnnotation.from_identifier(mapping.qname)
        polyA_seq = annotation['right_seq']
        polyA_qual = annotation['right_qual']
        new_qname = '{0}_{1}_{2}'.format(annotation['original_name'],
                                         annotation['left_seq'],
                                         annotation['left_qual'],
                                        )

    num_trimmed = len(polyA_seq)
    
    if mapping.is_reverse:
        after = region_fetcher(mapping.tid, mapping.pos - num_trimmed, mapping.pos)
        after = utilities.reverse_complement(after)
    else:
        after = region_fetcher(mapping.tid, mapping.aend, mapping.aend + num_trimmed)

    extra_genomic_As = 0
    for b in after:
        if b == 'A':
            extra_genomic_As += 1
        else:
            break

    nongenomic_length = num_trimmed - extra_genomic_As

    if mapping.is_reverse:
        nongenomic_start = mapping.pos - 1 - extra_genomic_As
    else:
        # Note: 'aend points to one past the last aligned residue'
        nongenomic_start = mapping.aend + extra_genomic_As

    extra_genomic_seq = polyA_seq[:extra_genomic_As]
    soft_clipped_seq = polyA_seq[extra_genomic_As:]
    extra_genomic_qual = polyA_qual[:extra_genomic_As]
    soft_clipped_qual = polyA_qual[extra_genomic_As:]

    extra_seq = extra_genomic_seq + soft_clipped_seq
    extra_qual = extra_genomic_qual + soft_clipped_qual

    if mapping.is_reverse:
        final_cigar_block_index = 0
        extended_seq = utilities.reverse_complement(extra_seq) + mapping.seq
        extended_qual = extra_qual[::-1] + mapping.qual
        mapping.pos = mapping.pos - extra_genomic_As
    else:
        final_cigar_block_index = -1
        extended_seq = mapping.seq + extra_seq
        extended_qual = mapping.qual + extra_qual

    # Note: writing to mapping.seq destroys mapping.qual, so
    # mapping.qual needs to be retrieved above
    mapping.seq = extended_seq
    mapping.qual = extended_qual

    op, length = mapping.cigar[final_cigar_block_index]
    if op != 0:
        raise ValueError
    length += extra_genomic_As
    
    updated_cigar = mapping.cigar
    updated_cigar[final_cigar_block_index] = (op, length)
    if len(soft_clipped_seq) > 0:
        soft_clipped_block = [(sam.BAM_CSOFT_CLIP, len(soft_clipped_seq))]
        if final_cigar_block_index == 0:
            updated_cigar = soft_clipped_block + updated_cigar
        elif final_cigar_block_index == -1:
            updated_cigar = updated_cigar + soft_clipped_block

    mapping.cigar = updated_cigar

    if mapping.tags:
        # Clear the MD tag since the possible addition of bases to the
        # alignment may have made it inaccurate. 
        filtered_tags = filter(lambda t: t[0] != 'MD', mapping.tags)
        mapping.tags = filtered_tags

    set_nongenomic_length(mapping, nongenomic_length)

    mapping.qname = new_qname

    return mapping
Example #31
def build_before_adapters(index_sequence='', primer_type='tru_seq'):
    before_R1 = flow_cell['P5'] + primers[primer_type]['R1']
    before_R2 = flow_cell['P7'] + utilities.reverse_complement(index_sequence) + primers[primer_type]['R2']
    return before_R1, before_R2
Example #32
def build_adapters(index_sequence='', max_length=None, primer_type='tru_seq'):
    before_R1, before_R2 = build_before_adapters(index_sequence, primer_type)
    adapter_in_R1 = utilities.reverse_complement(before_R2) + A_tail
    adapter_in_R2 = utilities.reverse_complement(before_R1) + A_tail
    truncated_slice = slice(None, max_length)
    return adapter_in_R1[truncated_slice], adapter_in_R2[truncated_slice]
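
As an illustration of how Examples #31 and #32 compose, here is a toy run with made-up flow-cell and primer sequences (the real Illumina P5/P7 and TruSeq primer sequences are deliberately not reproduced):

COMPLEMENT = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}

def reverse_complement(seq):
    return ''.join(COMPLEMENT[b] for b in reversed(seq))

# Made-up stand-ins for the module-level constants used above.
flow_cell = {'P5': 'AATGAT', 'P7': 'CAAGCA'}
primers = {'tru_seq': {'R1': 'ACACTCTT', 'R2': 'GTGACTGG'}}
A_tail = 'A' * 10

index_sequence = 'CGATGT'
before_R1 = flow_cell['P5'] + primers['tru_seq']['R1']
before_R2 = (flow_cell['P7'] + reverse_complement(index_sequence)
             + primers['tru_seq']['R2'])

# The adapter read through at the 3' end of R1 is the reverse complement of
# everything sequenced before R2 (and vice versa), followed by the A tail.
adapter_in_R1 = reverse_complement(before_R2) + A_tail
adapter_in_R2 = reverse_complement(before_R1) + A_tail
print(adapter_in_R1[:19])
print(adapter_in_R2[:19])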
Example #33
def extend_polyA_end(mapping, region_fetcher, trimmed_twice=False):
    if mapping.is_unmapped:
        return mapping

    if trimmed_twice:
        # Trailing poly-As were removed by the second trimming step.
        annotation = TrimmedTwiceAnnotation.from_identifier(mapping.qname)
        polyA_seq = annotation['retrimmed_right_seq']
        polyA_qual = annotation['retrimmed_right_qual']
        new_qname = PayloadAnnotation.from_prefix_identifier(
            mapping.qname).identifier
    else:
        annotation = PayloadAnnotation.from_identifier(mapping.qname)
        polyA_seq = annotation['right_seq']
        polyA_qual = annotation['right_qual']
        new_qname = '{0}_{1}_{2}'.format(
            annotation['original_name'],
            annotation['left_seq'],
            annotation['left_qual'],
        )

    num_trimmed = len(polyA_seq)

    if mapping.is_reverse:
        after = region_fetcher(mapping.tid, mapping.pos - num_trimmed,
                               mapping.pos)
        after = utilities.reverse_complement(after)
    else:
        after = region_fetcher(mapping.tid, mapping.aend,
                               mapping.aend + num_trimmed)

    extra_genomic_As = 0
    for b in after:
        if b == 'A':
            extra_genomic_As += 1
        else:
            break

    nongenomic_length = num_trimmed - extra_genomic_As

    if mapping.is_reverse:
        nongenomic_start = mapping.pos - 1 - extra_genomic_As
    else:
        # Note: 'aend points to one past the last aligned residue'
        nongenomic_start = mapping.aend + extra_genomic_As

    extra_genomic_seq = polyA_seq[:extra_genomic_As]
    soft_clipped_seq = polyA_seq[extra_genomic_As:]
    extra_genomic_qual = polyA_qual[:extra_genomic_As]
    soft_clipped_qual = polyA_qual[extra_genomic_As:]

    extra_seq = extra_genomic_seq + soft_clipped_seq
    extra_qual = extra_genomic_qual + soft_clipped_qual

    if mapping.is_reverse:
        final_cigar_block_index = 0
        extended_seq = utilities.reverse_complement(extra_seq) + mapping.seq
        extended_qual = extra_qual[::-1] + mapping.qual
        mapping.pos = mapping.pos - extra_genomic_As
    else:
        final_cigar_block_index = -1
        extended_seq = mapping.seq + extra_seq
        extended_qual = mapping.qual + extra_qual

    # Note: writing to mapping.seq destroys mapping.qual, so
    # mapping.qual needs to be retrieved above
    mapping.seq = extended_seq
    mapping.qual = extended_qual

    op, length = mapping.cigar[final_cigar_block_index]
    if op != 0:
        raise ValueError
    length += extra_genomic_As

    updated_cigar = mapping.cigar
    updated_cigar[final_cigar_block_index] = (op, length)
    if len(soft_clipped_seq) > 0:
        soft_clipped_block = [(sam.BAM_CSOFT_CLIP, len(soft_clipped_seq))]
        if final_cigar_block_index == 0:
            updated_cigar = soft_clipped_block + updated_cigar
        elif final_cigar_block_index == -1:
            updated_cigar = updated_cigar + soft_clipped_block

    mapping.cigar = updated_cigar

    if mapping.tags:
        # Clear the MD tag since the possible addition of bases to the
        # alignment may have made it inaccurate.
        filtered_tags = filter(lambda t: t[0] != 'MD', mapping.tags)
        mapping.tags = filtered_tags

    set_nongenomic_length(mapping, nongenomic_length)

    mapping.qname = new_qname

    return mapping
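
The heart of the forward-strand case above is splitting the trimmed poly-A run into bases that are also present in the reference (genomic, extending the final M block) and bases that are not (non-genomic, re-attached as soft clipping). A toy sketch of just that split:

# Toy data: 6 A's were trimmed from the read; the reference immediately after
# the alignment happens to start with 2 A's.
polyA_seq = 'AAAAAA'
polyA_qual = 'IIIIII'
after = 'AAGTCA'   # reference bases just downstream of the alignment end

extra_genomic_As = 0
for b in after:
    if b == 'A':
        extra_genomic_As += 1
    else:
        break

nongenomic_length = len(polyA_seq) - extra_genomic_As   # 4

extra_genomic_seq = polyA_seq[:extra_genomic_As]    # 'AA'   -> extends the M block
soft_clipped_seq = polyA_seq[extra_genomic_As:]     # 'AAAA' -> becomes soft-clipped
extra_genomic_qual = polyA_qual[:extra_genomic_As]
soft_clipped_qual = polyA_qual[extra_genomic_As:]

print(nongenomic_length, extra_genomic_seq, soft_clipped_seq)  # 4 AA AAAA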