예제 #1
0
def nextseq_trim_index(sequence, cutoff, base=33):
    """
    Find the index at which to quality-trim the end of a NextSeq read.

    With Illumina NextSeq, bases are encoded with two colors. 'No color' (a
    dark cycle) usually means that a 'G' was sequenced, but that also occurs
    when sequencing falls off the end of the fragment. The read then contains
    a run of high-quality G bases at the end.

    The read is scanned from its last position backwards while accumulating
    ``cutoff - quality``. Every 'G' base is counted as if its quality were
    ``cutoff - 1``, whatever quality was recorded for it. The scan stops as
    soon as the running sum becomes negative.

    Args:
        sequence: Object exposing ``sequence`` (the bases) and ``qualities``
            (the encoded quality characters), both indexable by position.
        cutoff: Quality cutoff that each position is compared against.
        base: Passed to ``qual2int`` when decoding each quality character.

    Returns:
        The index at which the running sum reached its maximum, or
        ``len(sequence.qualities)`` if the sum never became positive.
    """
    bases = sequence.sequence
    qualities = sequence.qualities
    s = 0
    max_qual = 0
    max_i = len(qualities)
    # `range` (not the Python-2-only `xrange`) iterates identically on both
    # Python 2 and Python 3.
    for i in reversed(range(max_i)):
        q = qual2int(qualities[i], base)
        if bases[i] == 'G':
            # Treat every G as just below the cutoff so that trailing
            # dark-cycle G runs always add to the sum.
            q = cutoff - 1
        s += cutoff - q
        if s < 0:
            break
        if s > max_qual:
            max_qual = s
            max_i = i
    return max_i
예제 #2
0
    def nextseq_trim_index(sequence, cutoff, base=33):
        """Locate the trim index at the end of a NextSeq read.

        With Illumina NextSeq, bases are encoded with two colors. 'No color'
        (a dark cycle) usually means that a 'G' was sequenced, but it also
        occurs when sequencing runs off the end of the fragment, leaving a run
        of high-quality G bases at the end of the read.

        The read is walked from its last position backwards while summing
        ``cutoff - quality``; any 'G' base is scored as if its quality were
        ``cutoff - 1``. The walk ends once the sum turns negative.

        Returns the index where the sum peaked, or the number of qualities
        when the sum never rose above zero.
        """
        read_bases = sequence.sequence
        read_quals = sequence.qualities
        n_quals = len(read_quals)
        trim_at = n_quals
        running_sum = 0
        best_sum = 0
        for pos in range(n_quals - 1, -1, -1):
            phred = qual2int(read_quals[pos], base)
            if read_bases[pos] == 'G':
                # G bases are always counted as one below the cutoff.
                phred = cutoff - 1
            running_sum += cutoff - phred
            if running_sum < 0:
                break
            if running_sum > best_sum:
                best_sum, trim_at = running_sum, pos
        return trim_at
예제 #3
0
 def summarize(self):
     """Flatten the nested per-position dicts into a table.

     Returns:
         A dict with three entries:
         - 'columns': the sorted union of the inner keys, each converted
           with ``qual2int(key, self.quality_base)`` when
           ``self.is_qualities`` is true.
         - 'columns2': the sorted union of the outer keys.
         - 'rows': an ``ordered_dict`` mapping the 1-based position to an
           ``ordered_dict`` of outer key -> tuple of values, one per inner
           key in sorted order (0 where an inner key is absent).
     """
     outer_keys = set()
     inner_keys = set()
     for position_dict in self.dicts:
         outer_keys.update(position_dict.keys())
         for nested in position_dict.values():
             inner_keys.update(nested.keys())
     outer_keys = tuple(sorted(outer_keys))
     inner_keys = tuple(sorted(inner_keys))

     columns = inner_keys
     if self.is_qualities:
         columns = tuple(
             qual2int(key, self.quality_base) for key in inner_keys)

     def row_for(position_dict):
         # NOTE(review): direct indexing presumes every per-position dict
         # holds (or can create) each outer key -- confirm with the caller.
         return ordered_dict(
             (outer,
              tuple(position_dict[outer].get(inner, 0)
                    for inner in inner_keys))
             for outer in outer_keys)

     rows = ordered_dict(
         (position, row_for(position_dict))
         for position, position_dict in enumerate(self.dicts, 1))
     return {'columns': columns, 'columns2': outer_keys, 'rows': rows}
예제 #4
0
    def quality_trim_index(qualities, cutoff_front, cutoff_back, base=33):
        """Find the positions at which to trim low-quality ends from a
        nucleotide sequence.

        Qualities are assumed to be ASCII-encoded as chr(qual + base).

        The algorithm is the same as the one used by BWA within the function
        'bwa_trim_read', applied independently to each end:
        - Subtract each quality from the cutoff for that end.
        - Accumulate these differences from the end inwards, stopping as soon
          as the running sum becomes negative.
        - Trim at the index at which the running sum was largest.

        Args:
            qualities: Indexable sequence of encoded quality characters.
            cutoff_front: Quality cutoff applied at the 5' end.
            cutoff_back: Quality cutoff applied at the 3' end.
            base: Passed to ``qual2int`` when decoding each quality character.

        Returns:
            A tuple ``(start, stop)`` of the slice bounds to keep. ``(0, 0)``
            is returned when the two trimmed regions meet or overlap.
        """
        start = 0
        stop = max_i = len(qualities)

        # find trim position for 5' end
        score = 0
        max_qual = 0
        for idx in range(max_i):
            # qual2int is already given `base`, so its result is used as-is
            # (assumes it returns the decoded quality, as the single-cutoff
            # trimming routines rely on). Subtracting `base` a second time
            # shifted every score by `base`.
            qual = qual2int(qualities[idx], base)
            score += cutoff_front - qual
            if score < 0:
                break
            if score > max_qual:
                max_qual = score
                start = idx + 1

        # same for 3' end
        max_qual = 0
        score = 0
        for idx in reversed(range(max_i)):
            qual = qual2int(qualities[idx], base)
            score += cutoff_back - qual
            if score < 0:
                break
            if score > max_qual:
                max_qual = score
                stop = idx

        # Nothing is left once the front cut reaches the back cut.
        if start >= stop:
            start, stop = 0, 0

        return (start, stop)
예제 #5
0
 def summarize(self):
     """Flatten into a table with one row per entry of ``self.dicts`` and
     one column per key.

     Returns:
         A dict with two entries:
         - 'columns': the column labels. When ``self.is_qualities`` is
           true these are the sorted keys converted with
           ``qual2int(key, self.quality_base)``; otherwise they are
           'A', 'C', 'G', 'T', then any other keys in sorted order, then
           'N'.
         - 'rows': an ``ordered_dict`` mapping the 1-based position to a
           tuple of values in column order (0 where a key is absent).
     """
     keys = set()
     for dict_item in self.dicts:
         keys.update(dict_item.keys())
     if self.is_qualities:
         keys = tuple(sorted(keys))
         columns = tuple(qual2int(k, self.quality_base) for k in keys)
     else:
         acgt = ('A', 'C', 'G', 'T')
         n_val = ('N', )
         # Sort the remaining symbols so the column order is reproducible
         # rather than dependent on set iteration order.
         extras = tuple(sorted(keys - set(acgt + n_val)))
         columns = keys = acgt + extras + n_val
     return dict(columns=columns,
                 rows=ordered_dict(
                     (idx, tuple(dict_item.get(key, 0) for key in keys))
                     for idx, dict_item in enumerate(self.dicts, 1)))
예제 #6
0
def quality_trim_index(qualities, cutoff, base=33):
    """
    Locate the index at which a low-quality end should be cut off.

    Qualities are assumed to be ASCII-encoded as chr(qual + base).

    This follows the approach of BWA's 'bwa_trim_read': walking backwards
    from the last position, ``cutoff - quality`` is accumulated; the walk ends
    once the running total drops below zero, and the index where the total
    peaked is returned. If the total never rises above zero, the number of
    qualities is returned.
    """
    trim_at = len(qualities)
    running = 0
    best = 0
    pos = trim_at
    while pos > 0:
        pos -= 1
        running += cutoff - qual2int(qualities[pos], base)
        if running < 0:
            break
        if running > best:
            best = running
            trim_at = pos
    return trim_at
예제 #7
0
def quality_trim_index(qualities, cutoff, base=33):
    """
    Return the index from which the low-quality tail begins.

    Qualities are assumed to be ASCII-encoded as chr(qual + base).

    Same scheme as BWA's 'bwa_trim_read':
    - Score each position as the cutoff minus its decoded quality.
    - Sum the scores from the last position towards the first, giving up as
      soon as the sum is negative.
    - Report the index at which the sum was highest (the number of qualities
      if it never exceeded zero).
    """
    n_quals = len(qualities)
    cut_index = n_quals
    excess = 0
    peak = 0
    for pos in range(n_quals - 1, -1, -1):
        decoded = qual2int(qualities[pos], base)
        excess += cutoff - decoded
        if excess < 0:
            break
        if excess > peak:
            peak, cut_index = excess, pos
    return cut_index