def nextseq_trim_index(sequence, cutoff, base=33):
    """Find the 3' quality-trimming position for a NextSeq read.

    With Illumina NextSeq, bases are encoded with two colors. 'No color' (a
    dark cycle) usually means that a 'G' was sequenced, but that also occurs
    when sequencing falls off the end of the fragment. The read then contains
    a run of high-quality G bases at its end.

    The read is scanned from its last position towards its first while a
    running sum of ``cutoff - quality`` is maintained. Every 'G' base is
    counted as having quality ``cutoff - 1`` whatever its reported quality,
    so it always adds 1 to the sum. The scan stops as soon as the sum drops
    below zero, and the result is the position at which the sum was largest.

    Args:
        sequence: Read object exposing ``sequence`` (the bases) and
            ``qualities`` (the encoded quality characters); both are indexed
            with the same positions.
        cutoff: Quality threshold.
        base: Encoding offset handed to ``qual2int`` when decoding a quality
            character.

    Returns:
        The index at which the read should be cut. This equals
        ``len(sequence.qualities)`` when no position improved the sum, i.e.
        nothing is trimmed.
    """
    bases = sequence.sequence
    qualities = sequence.qualities
    num_quals = len(qualities)
    score = 0
    max_score = 0
    trim_index = num_quals
    # ``range`` (not the Python 2-only ``xrange``) so this also runs on
    # Python 3.
    for i in reversed(range(num_quals)):
        qual = qual2int(qualities[i], base)
        if bases[i] == 'G':
            # A trailing G may be a dark cycle rather than a real base, so
            # its reported quality is not trusted.
            qual = cutoff - 1
        score += cutoff - qual
        if score < 0:
            break
        if score > max_score:
            max_score = score
            trim_index = i
    return trim_index
def nextseq_trim_index(sequence, cutoff, base=33):
    """Locate the 3' trimming point of a NextSeq read.

    NextSeq instruments encode bases with two colors, and a dark cycle ('no
    color') is reported as 'G'. Dark cycles also occur once sequencing has
    run off the end of the fragment, which leaves a run of high-quality 'G'
    bases at the end of the read.

    Walking from the last position to the first, a running total of
    ``cutoff - quality`` is kept, where any 'G' is treated as having quality
    ``cutoff - 1``. The walk ends when the total becomes negative; the
    position with the highest total seen is returned.

    Args:
        sequence: Object with ``sequence`` (bases) and ``qualities`` (encoded
            quality characters) attributes.
        cutoff: Quality threshold.
        base: Offset passed to ``qual2int`` for decoding.

    Returns:
        Index at which to cut the read, or ``len(sequence.qualities)`` if
        nothing should be removed.
    """
    read_bases = sequence.sequence
    read_quals = sequence.qualities
    length = len(read_quals)
    trim_at = length
    running = 0
    best = 0
    for pos in range(length - 1, -1, -1):
        decoded = qual2int(read_quals[pos], base)
        if read_bases[pos] == 'G':
            # Possible dark cycle: ignore the reported quality.
            decoded = cutoff - 1
        running += cutoff - decoded
        if running < 0:
            break
        if running > best:
            best, trim_at = running, pos
    return trim_at
def summarize(self):
    """Flatten the two-level dicts in ``self.dicts`` into a table.

    The outer keys and the inner keys are each collected as a union over all
    entries of ``self.dicts`` and sorted.

    Returns:
        A dict with three entries:

        * ``columns``: the sorted inner keys, converted with ``qual2int``
          (using ``self.quality_base``) when ``self.is_qualities`` is set.
        * ``columns2``: the sorted outer keys.
        * ``rows``: an ``ordered_dict`` keyed by the 1-based index of each
          entry in ``self.dicts``. Each value is an ``ordered_dict`` mapping
          every outer key to a tuple of counts, one per inner key, with 0
          where an inner key is absent.
    """
    outer_keys = set()
    inner_keys = set()
    for outer in self.dicts:
        outer_keys.update(outer.keys())
        for inner in outer.values():
            inner_keys.update(inner.keys())
    outer_keys = tuple(sorted(outer_keys))
    inner_keys = tuple(sorted(inner_keys))

    if self.is_qualities:
        columns = tuple(
            qual2int(key, self.quality_base) for key in inner_keys)
    else:
        columns = inner_keys

    def counts_for(inner):
        # One count per inner key, in column order; absent keys count as 0.
        return tuple(inner.get(key, 0) for key in inner_keys)

    def row_for(outer):
        # NOTE: plain indexing, so every entry of self.dicts has to resolve
        # every outer key found in any entry.
        return ordered_dict(
            (key, counts_for(outer[key])) for key in outer_keys)

    rows = ordered_dict(
        (position, row_for(outer))
        for position, outer in enumerate(self.dicts, 1))
    return dict(columns=columns, columns2=outer_keys, rows=rows)
def quality_trim_index(qualities, cutoff_front, cutoff_back, base=33):
    """Find the positions at which to trim low-quality ends from a read.

    Qualities are assumed to be ASCII-encoded as chr(qual + base).

    The algorithm is the same as the one used by BWA within the function
    'bwa_trim_read', applied once from each end:

    - Subtract each quality from the cutoff value.
    - Accumulate these differences starting at the end being trimmed and
      moving inwards, stopping as soon as the running sum becomes negative.
    - Trim at the position where the running sum was largest.

    Args:
        qualities: Sequence of encoded quality characters.
        cutoff_front: Quality threshold used for the 5' end.
        cutoff_back: Quality threshold used for the 3' end.
        base: Encoding offset handed to ``qual2int`` when decoding a quality
            character.

    Returns:
        A ``(start, stop)`` tuple such that ``qualities[start:stop]`` is the
        part to keep. ``(0, 0)`` is returned when the two trimmed regions
        meet or overlap, i.e. nothing is kept.
    """
    num_quals = len(qualities)
    start = 0
    stop = num_quals

    # Find the trim position for the 5' end.
    # ``qual2int(char, base)`` is used as the decoded quality score, so
    # ``base`` must not be subtracted a second time here.
    score = 0
    max_score = 0
    for idx in range(num_quals):
        score += cutoff_front - qual2int(qualities[idx], base)
        if score < 0:
            break
        if score > max_score:
            max_score = score
            # Keep everything after the position with the best score.
            start = idx + 1

    # Same scan for the 3' end, walking backwards.
    score = 0
    max_score = 0
    for idx in reversed(range(num_quals)):
        score += cutoff_back - qual2int(qualities[idx], base)
        if score < 0:
            break
        if score > max_score:
            max_score = score
            stop = idx

    if start >= stop:
        # The whole read is low quality.
        start, stop = 0, 0
    return (start, stop)
def summarize(self):
    """Flatten the dicts in ``self.dicts`` into a table of counts.

    There is one row per entry of ``self.dicts`` and one column per key
    found in any of those entries.

    Column order:

    * When ``self.is_qualities`` is set, keys are sorted and reported
      converted with ``qual2int`` (using ``self.quality_base``).
    * Otherwise the columns are 'A', 'C', 'G', 'T', then any other keys in
      sorted order, then 'N'. The five standard columns are always present,
      even when no entry contains them.

    Returns:
        A dict with two entries: ``columns`` (the column labels, a tuple)
        and ``rows`` (an ``ordered_dict`` mapping the 1-based index of each
        entry to a tuple of counts in column order, with 0 for keys the
        entry lacks).
    """
    keys = set()
    for dict_item in self.dicts:
        keys.update(dict_item.keys())
    if self.is_qualities:
        keys = tuple(sorted(keys))
        columns = tuple(qual2int(k, self.quality_base) for k in keys)
    else:
        acgt = ('A', 'C', 'G', 'T')
        n_val = ('N',)
        # Sort the non-standard keys: iterating the set directly would give
        # a column order that can differ from one run to the next.
        extras = tuple(sorted(keys - set(acgt + n_val)))
        columns = keys = acgt + extras + n_val
    return dict(
        columns=columns,
        rows=ordered_dict(
            (idx, tuple(dict_item.get(key, 0) for key in keys))
            for idx, dict_item in enumerate(self.dicts, 1)))
def quality_trim_index(qualities, cutoff, base=33):
    """Locate the 3' position at which a low-quality end should be cut off.

    Qualities are assumed to be ASCII-encoded as chr(qual + base).

    This follows the approach of BWA's 'bwa_trim_read': starting at the last
    position and moving towards the first, a running total of
    ``cutoff - quality`` is accumulated. The walk ends once the total turns
    negative, and the position with the highest total seen is returned.

    Args:
        qualities: Sequence of encoded quality characters.
        cutoff: Quality threshold.
        base: Offset passed to ``qual2int`` for decoding.

    Returns:
        Index at which to cut, or ``len(qualities)`` if nothing should be
        removed.
    """
    length = len(qualities)
    trim_at = length
    running = 0
    best = 0
    for pos in range(length - 1, -1, -1):
        running += cutoff - qual2int(qualities[pos], base)
        if running < 0:
            break
        if running > best:
            best, trim_at = running, pos
    return trim_at