Esempio n. 1
0
def trim(reads, find_start=None, find_end=None, second_time=False):
    ''' Yield a trimmed copy of each read, recording what was removed.

        For every read, find_start(read.seq) and find_end(read.seq) give the
        bounds of the slice [start:end] that is kept. The sequence removed
        from each side, and the matching quality strings after being passed
        through fastq.sanitize_qual, are stored in an annotation object whose
        `identifier` becomes the name of the yielded read.

        Args:
            reads: iterable of objects exposing `name`, `seq` and `qual`.
            find_start: callable taking a sequence and returning the index
                trimming should start at. Defaults to always returning 0.
            find_end: callable taking a sequence and returning the index
                trimming should end at. Defaults to `len`.
            second_time: if true, read.name is parsed with
                PayloadAnnotation.from_identifier, and the newly removed
                flanks are stored in the retrimmed_* fields of a
                TrimmedTwiceAnnotation, with the parsed annotation unpacked
                as the remaining keyword arguments. Otherwise a fresh
                PayloadAnnotation is built with read.name as original_name.

        Yields:
            fastq.Read holding the annotation identifier, read.seq[start:end]
            and read.qual[start:end] (the kept qualities are not sanitized).

        This is a generator: no work is done until it is iterated.
    '''
    # Identity comparison is the correct test for "argument not supplied";
    # `== None` would defer to a custom __eq__ on the object passed in.
    if find_start is None:
        find_start = lambda seq: 0
    if find_end is None:
        find_end = len

    for read in reads:
        start = find_start(read.seq)
        end = find_end(read.seq)

        # Everything outside [start:end] is kept in the annotation rather
        # than discarded.
        left_seq = read.seq[:start]
        left_qual = fastq.sanitize_qual(read.qual[:start])
        right_seq = read.seq[end:]
        right_qual = fastq.sanitize_qual(read.qual[end:])
        if second_time:
            # The name already encodes an earlier trim; carry its fields
            # along and add the second round of removed flanks.
            payload_annotation = PayloadAnnotation.from_identifier(read.name)
            annotation = TrimmedTwiceAnnotation(
                retrimmed_left_seq=left_seq,
                retrimmed_left_qual=left_qual,
                retrimmed_right_seq=right_seq,
                retrimmed_right_qual=right_qual,
                **payload_annotation)
        else:
            annotation = PayloadAnnotation(
                original_name=read.name,
                left_seq=left_seq,
                left_qual=left_qual,
                right_seq=right_seq,
                right_qual=right_qual,
            )
        trimmed_read = fastq.Read(
            annotation.identifier,
            read.seq[start:end],
            read.qual[start:end],
        )
        yield trimmed_read
Esempio n. 2
0
def trim(reads, find_start=None, find_end=None, second_time=False):
    ''' Yield a trimmed copy of each read, recording what was removed.

        For every read, find_start(read.seq) and find_end(read.seq) give the
        bounds of the slice [start:end] that is kept. The sequence removed
        from each side, and the matching quality strings after being passed
        through fastq.sanitize_qual, are stored in an annotation object whose
        `identifier` becomes the name of the yielded read.

        Args:
            reads: iterable of objects exposing `name`, `seq` and `qual`.
            find_start: callable taking a sequence and returning the index
                trimming should start at. Defaults to always returning 0.
            find_end: callable taking a sequence and returning the index
                trimming should end at. Defaults to `len`.
            second_time: if true, read.name is parsed with
                PayloadAnnotation.from_identifier, and the newly removed
                flanks are stored in the retrimmed_* fields of a
                TrimmedTwiceAnnotation, with the parsed annotation unpacked
                as the remaining keyword arguments. Otherwise a fresh
                PayloadAnnotation is built with read.name as original_name.

        Yields:
            fastq.Read holding the annotation identifier, read.seq[start:end]
            and read.qual[start:end] (the kept qualities are not sanitized).

        This is a generator: no work is done until it is iterated.
    '''
    # Identity comparison is the correct test for "argument not supplied";
    # `== None` would defer to a custom __eq__ on the object passed in.
    if find_start is None:
        find_start = lambda seq: 0
    if find_end is None:
        find_end = len

    for read in reads:
        start = find_start(read.seq)
        end = find_end(read.seq)

        # Everything outside [start:end] is kept in the annotation rather
        # than discarded.
        left_seq = read.seq[:start]
        left_qual = fastq.sanitize_qual(read.qual[:start])
        right_seq = read.seq[end:]
        right_qual = fastq.sanitize_qual(read.qual[end:])
        if second_time:
            # The name already encodes an earlier trim; carry its fields
            # along and add the second round of removed flanks.
            payload_annotation = PayloadAnnotation.from_identifier(read.name)
            annotation = TrimmedTwiceAnnotation(retrimmed_left_seq=left_seq,
                                                retrimmed_left_qual=left_qual,
                                                retrimmed_right_seq=right_seq,
                                                retrimmed_right_qual=right_qual,
                                                **payload_annotation)
        else:
            annotation = PayloadAnnotation(original_name=read.name,
                                           left_seq=left_seq,
                                           left_qual=left_qual,
                                           right_seq=right_seq,
                                           right_qual=right_qual,
                                          )
        trimmed_read = fastq.Read(annotation.identifier,
                                  read.seq[start:end],
                                  read.qual[start:end],
                                 )
        yield trimmed_read
def find_boundary_sequences(R1, R2, counters):
    ''' Split a read pair into a five-prime and a three-prime payload read.

        The read in which common_right_reverse is located is treated as the
        reverse-strand read and its mate as the forward read. A single
        occurrence fully contained in R1 or R2 is preferred. Failing that,
        the best partial overlap at a read edge is used, as scored from the
        prefix/suffix values returned by all_adapter_possibilites, provided
        the overlap is longer than 5.

        The five-prime payload is the reverse complement of everything
        before common_right_reverse in the reverse read; the three-prime
        payload is everything before the polyA run in the forward read.

        Returns (None, None) if common_right_reverse is contained more than
        once across the pair, if no usable overlap is found, or if the polyA
        run is shorter than 13. In the last case the counters have already
        been updated. Otherwise returns
        (five_payload_read, three_payload_read).
    '''

    def last_matching_key(observed, table):
        # The last key whose value equals observed wins; no match at all
        # (or a key that is literally '*') is reported as 'X'.
        hits = [key for key, expected in table.items() if observed == expected]
        if not hits or hits[-1] == '*':
            return 'X'
        return hits[-1]

    # Which read is the reverse one fixes all four of these at once:
    # (reverse read, forward read, polyA counter key, polyT counter key)
    roles = {
        'R1': (R1, R2, 'R2_forward', 'R1_reverse'),
        'R2': (R2, R1, 'R1_forward', 'R2_reverse'),
    }

    R1_hits, R1_prefix, R1_suffix = all_adapter_possibilites(
        R1.seq, common_right_reverse)
    R2_hits, R2_prefix, R2_suffix = all_adapter_possibilites(
        R2.seq, common_right_reverse)

    total_hits = len(R1_hits) + len(R2_hits)
    if total_hits > 1:
        # At most one occurrence of common_right_reverse should exist
        # between R1 and R2.
        return None, None

    if total_hits == 0:
        # No full occurrence: rank the partial overlaps at the read edges.
        adapter_length = len(common_right_reverse)
        candidates = [
            (adapter_length - R1_prefix, 'R1_prefix'),
            (adapter_length - R2_prefix, 'R2_prefix'),
            (adapter_length - R1_suffix, 'R1_suffix'),
            (adapter_length - R2_suffix, 'R2_suffix'),
        ]
        length, kind = max(candidates)
        if not length > 5:
            return None, None

        reverse_read, forward_read, polyA_read, polyT_read = roles[kind[:2]]
        if kind.endswith('prefix'):
            adapter_start = len(reverse_read.seq) - length
        else:
            # May be negative; it is clamped to 0 wherever it is used as a
            # payload boundary or counter key.
            adapter_start = -length
    else:
        # Exactly one full occurrence, in exactly one of the two reads.
        if len(R1_hits) == 1:
            source, hits = 'R1', R1_hits
        else:
            source, hits = 'R2', R2_hits
        reverse_read, forward_read, polyA_read, polyT_read = roles[source]
        adapter_start = hits.pop()

    # '*' means that there was no opportunity to see this id.
    # 'X' means that there was an opportunity and it was neither A nor B.
    right_id = '*'
    left_id = '*'

    clamped_start = max(0, adapter_start)
    five_payload_seq = utilities.reverse_complement(
        reverse_read.seq[:clamped_start])
    five_payload_qual = reverse_read.qual[:clamped_start][::-1]

    reverse_length = len(reverse_read.seq)
    current_p = adapter_start + len(common_right_reverse)
    if current_p < reverse_length - after_right_length:
        right_id_seq = reverse_read.seq[current_p:current_p +
                                        after_right_length]
        right_id = last_matching_key(right_id_seq, after_right_prefix)
        counters['right_ids'][right_id_seq] += 1

        if right_id != 'X':
            # Skip past the recognised right id before looking for the
            # left id.
            current_p += len(after_right[right_id])
            if current_p < reverse_length - 4:
                left_id_seq = reverse_read.seq[current_p:current_p + 4]
                left_id = last_matching_key(left_id_seq, after_left)
                counters['left_ids'][left_id_seq] += 1

    polyA_start, polyA_length = find_polyA_cython.find_polyA(
        forward_read.seq, 15)
    polyA_end = polyA_start + polyA_length
    polyA_seq = forward_read.seq[polyA_start:polyA_end]
    polyA_qual = fastq.sanitize_qual(forward_read.qual[polyA_start:polyA_end])
    three_payload_seq = forward_read.seq[:polyA_start]
    three_payload_qual = forward_read.qual[:polyA_start]

    common_name, _ = R1.name.rsplit(':', 1)
    control_ids_string = '{0}-{1}'.format(left_id, right_id)

    def annotate(right_seq, right_qual):
        # Both payload reads share the name and control ids; only the
        # right-hand fields differ.
        return trim.PayloadAnnotation(
            original_name=common_name,
            left_seq=control_ids_string,
            left_qual='',
            right_seq=right_seq,
            right_qual=right_qual,
        )

    five_annotation = annotate('', '')
    three_annotation = annotate(polyA_seq, polyA_qual)
    five_payload_read = fastq.Read(five_annotation.identifier,
                                   five_payload_seq, five_payload_qual)
    three_payload_read = fastq.Read(three_annotation.identifier,
                                    three_payload_seq, three_payload_qual)

    # Statistics are recorded even for pairs rejected by the polyA check
    # below.
    counters['positions'][polyT_read][clamped_start] += 1
    counters['positions'][polyA_read][polyA_start] += 1
    counters['joint_lengths'][clamped_start, polyA_start] += 1
    counters['polyA_lengths'][polyA_length] += 1
    counters['control_ids'][control_ids_string] += 1

    if polyA_length < 13:
        return None, None

    return five_payload_read, three_payload_read
Esempio n. 4
0
def find_boundary_sequences(R1, R2, counters):
    ''' Split a read pair into a five-prime and a three-prime payload read.

        The read in which common_right_reverse is located is treated as the
        reverse-strand read and its mate as the forward read. A single
        occurrence fully contained in R1 or R2 is preferred. Failing that,
        the best partial overlap at a read edge is used, as scored from the
        prefix/suffix values returned by all_adapter_possibilites, provided
        the overlap is longer than 5.

        The five-prime payload is the reverse complement of everything
        before common_right_reverse in the reverse read; the three-prime
        payload is everything before the polyA run in the forward read.

        Returns (None, None) if common_right_reverse is contained more than
        once across the pair, if no usable overlap is found, or if the polyA
        run is shorter than 13. In the last case the counters have already
        been updated. Otherwise returns
        (five_payload_read, three_payload_read).
    '''

    def last_matching_key(observed, table):
        # The last key whose value equals observed wins; no match at all
        # (or a key that is literally '*') is reported as 'X'.
        hits = [key for key, expected in table.items() if observed == expected]
        if not hits or hits[-1] == '*':
            return 'X'
        return hits[-1]

    # Which read is the reverse one fixes all four of these at once:
    # (reverse read, forward read, polyA counter key, polyT counter key)
    roles = {
        'R1': (R1, R2, 'R2_forward', 'R1_reverse'),
        'R2': (R2, R1, 'R1_forward', 'R2_reverse'),
    }

    R1_hits, R1_prefix, R1_suffix = all_adapter_possibilites(R1.seq, common_right_reverse)
    R2_hits, R2_prefix, R2_suffix = all_adapter_possibilites(R2.seq, common_right_reverse)

    total_hits = len(R1_hits) + len(R2_hits)
    if total_hits > 1:
        # At most one occurrence of common_right_reverse should exist
        # between R1 and R2.
        return None, None

    if total_hits == 0:
        # No full occurrence: rank the partial overlaps at the read edges.
        adapter_length = len(common_right_reverse)
        candidates = [(adapter_length - R1_prefix, 'R1_prefix'),
                      (adapter_length - R2_prefix, 'R2_prefix'),
                      (adapter_length - R1_suffix, 'R1_suffix'),
                      (adapter_length - R2_suffix, 'R2_suffix'),
                     ]
        length, kind = max(candidates)
        if not length > 5:
            return None, None

        reverse_read, forward_read, polyA_read, polyT_read = roles[kind[:2]]
        if kind.endswith('prefix'):
            adapter_start = len(reverse_read.seq) - length
        else:
            # May be negative; it is clamped to 0 wherever it is used as a
            # payload boundary or counter key.
            adapter_start = -length
    else:
        # Exactly one full occurrence, in exactly one of the two reads.
        if len(R1_hits) == 1:
            source, hits = 'R1', R1_hits
        else:
            source, hits = 'R2', R2_hits
        reverse_read, forward_read, polyA_read, polyT_read = roles[source]
        adapter_start = hits.pop()

    # '*' means that there was no opportunity to see this id.
    # 'X' means that there was an opportunity and it was neither A nor B.
    right_id = '*'
    left_id = '*'

    clamped_start = max(0, adapter_start)
    five_payload_seq = utilities.reverse_complement(reverse_read.seq[:clamped_start])
    five_payload_qual = reverse_read.qual[:clamped_start][::-1]

    reverse_length = len(reverse_read.seq)
    current_p = adapter_start + len(common_right_reverse)
    if current_p < reverse_length - after_right_length:
        right_id_seq = reverse_read.seq[current_p:current_p + after_right_length]
        right_id = last_matching_key(right_id_seq, after_right_prefix)
        counters['right_ids'][right_id_seq] += 1

        if right_id != 'X':
            # Skip past the recognised right id before looking for the
            # left id.
            current_p += len(after_right[right_id])
            if current_p < reverse_length - 4:
                left_id_seq = reverse_read.seq[current_p:current_p + 4]
                left_id = last_matching_key(left_id_seq, after_left)
                counters['left_ids'][left_id_seq] += 1

    polyA_start, polyA_length = find_polyA_cython.find_polyA(forward_read.seq, 15)
    polyA_end = polyA_start + polyA_length
    polyA_seq = forward_read.seq[polyA_start:polyA_end]
    polyA_qual = fastq.sanitize_qual(forward_read.qual[polyA_start:polyA_end])
    three_payload_seq = forward_read.seq[:polyA_start]
    three_payload_qual = forward_read.qual[:polyA_start]

    common_name, _ = R1.name.rsplit(':', 1)
    control_ids_string = '{0}-{1}'.format(left_id, right_id)

    def annotate(right_seq, right_qual):
        # Both payload reads share the name and control ids; only the
        # right-hand fields differ.
        return trim.PayloadAnnotation(original_name=common_name,
                                      left_seq=control_ids_string,
                                      left_qual='',
                                      right_seq=right_seq,
                                      right_qual=right_qual,
                                     )

    five_annotation = annotate('', '')
    three_annotation = annotate(polyA_seq, polyA_qual)
    five_payload_read = fastq.Read(five_annotation.identifier, five_payload_seq, five_payload_qual)
    three_payload_read = fastq.Read(three_annotation.identifier, three_payload_seq, three_payload_qual)

    # Statistics are recorded even for pairs rejected by the polyA check
    # below.
    counters['positions'][polyT_read][clamped_start] += 1
    counters['positions'][polyA_read][polyA_start] += 1
    counters['joint_lengths'][clamped_start, polyA_start] += 1
    counters['polyA_lengths'][polyA_length] += 1
    counters['control_ids'][control_ids_string] += 1

    if polyA_length < 13:
        return None, None

    return five_payload_read, three_payload_read