def __iter__(self):
    """
    Iterate over the records of the FASTQ data in self.fp.

    Every four consecutive lines form one record. For each complete record,
    self.sequence_class(name, sequence, qualities) is yielded, where

    - name is the first line without the leading '@', passed through
      bytes_to_str,
    - sequence is the stripped second line,
    - qualities is the fourth line with only trailing line breaks removed,
      that is, the unmodified, still encoded qualities.

    Side effect: self.twoheaders is set to True when a '+' line carries a
    description of its own.

    Raises FormatError if a header line does not start with '@', if the
    third line of a record does not start with '+', or if a non-empty
    second description differs from the first one.
    """
    for i, line in enumerate(self.fp):
        if i % 4 == 0:
            if not line.startswith(b'@'):
                # The message previously named '+' here, which pointed
                # users at the wrong kind of line.
                raise FormatError(
                    "at line {0}, expected a line starting with '@'".format(i+1))
            name = line.strip()[1:]
        elif i % 4 == 1:
            sequence = line.strip()
        elif i % 4 == 2:
            line = line.strip()
            if not line.startswith(b'+'):
                raise FormatError(
                    "at line {0}, expected a line starting with '+'".format(i+1))
            if len(line) > 1:
                # The '+' line repeats (or claims to repeat) the description.
                self.twoheaders = True
                if line[1:] != name:
                    raise FormatError(
                        "At line {0}: Sequence descriptions in the FASTQ file do not match "
                        "({1!r} != {2!r}).\n"
                        "The second sequence description must be either empty "
                        "or equal to the first description.".format(
                            i+1, bytes_to_str(name), bytes_to_str(line.rstrip()[1:])))
        elif i % 4 == 3:
            # Only line breaks are removed so that quality characters which
            # happen to be whitespace survive.
            qualities = line.rstrip(b'\n\r')
            yield self.sequence_class(bytes_to_str(name), sequence, qualities)
def find_match(self, read):
    """
    Repeatedly look for the best adapter match in the given read.

    Up to self.times rounds are run. In each round self._best_match(read)
    is asked for a match; the search stops as soon as it returns None.
    After every round except the last possible one, the read is replaced by
    match.adapter.trimmed(match) before searching again.

    Returns the list of matches in the order they were found. The matches
    need to be applied consecutively to the read. The list is empty if
    nothing matched.

    NOTE(review): earlier documentation stated that the read is converted
    to uppercase before comparison; that does not happen in this method.
    """
    found = []
    for round_index in range(self.times):
        hit = self._best_match(read)
        if hit is None:
            # No adapter matches any more.
            break

        # FIXME move to cut() or somewhere else
        self._write_info(hit)

        # Sanity checks on what the matcher returned.
        assert hit.length > 0
        assert hit.errors / hit.length <= hit.adapter.max_error_rate
        assert hit.length - hit.errors > 0

        if self.wildcard_file:
            # FIXME move to cut() or somewhere else
            print(bytes_to_str(hit.wildcards()), read.name, file=self.wildcard_file)

        found.append(hit)

        # The trimmed read is only needed if another round follows.
        if round_index + 1 != self.times:
            read = hit.adapter.trimmed(hit)
    return found
def _write_info(self, match):
    """
    Write one tab-separated line describing `match` to the info file.

    Nothing is written if self.info_file is not set. Otherwise the line
    contains, in this order: read name, number of errors, match start and
    stop within the read, the read sequence before, inside and after the
    match (each passed through bytes_to_str), and the adapter name.

    `match` must not be None: the read is only reachable through
    match.read, so there is nothing to report without a match.
    """
    # TODO move to separate class
    if not self.info_file:
        return
    # A former "match is None" branch was removed here. It could never
    # execute because match.read was dereferenced before the check.
    read = match.read
    seq = read.sequence
    print(
        read.name,
        match.errors,
        match.rstart,
        match.rstop,
        bytes_to_str(seq[0:match.rstart]),
        bytes_to_str(seq[match.rstart:match.rstop]),
        bytes_to_str(seq[match.rstop:]),
        match.adapter.name,
        sep='\t', file=self.info_file)
def _write_info(self, match):
    """
    Write one tab-separated line describing `match` to the info file.

    Nothing is written if self.info_file is not set. Otherwise the line
    contains, in this order: read name, number of errors, match start and
    stop within the read, the read sequence before, inside and after the
    match (each passed through bytes_to_str), and the adapter name.

    `match` must not be None: the read is only reachable through
    match.read, so there is nothing to report without a match.
    """
    # TODO move to separate class
    if not self.info_file:
        return
    # A former "match is None" branch was removed here. It could never
    # execute because match.read was dereferenced before the check.
    read = match.read
    seq = read.sequence
    print(
        read.name,
        match.errors,
        match.rstart,
        match.rstop,
        bytes_to_str(seq[0:match.rstart]),
        bytes_to_str(seq[match.rstart:match.rstop]),
        bytes_to_str(seq[match.rstop:]),
        match.adapter.name,
        sep='\t', file=self.info_file)
def __iter__(self):
    """
    Iterate over the records of the FASTA data in self.fp, one at a time.

    A line starting with '>' begins a new record; its remainder, passed
    through bytes_to_str, is the record name. All following lines up to
    the next header are stripped and concatenated to form the sequence.
    Lines that appear before the first header are discarded.

    For every record, self.sequence_class(name, seq, None) is yielded.

    The sequence is collected as a list of chunks and joined once per
    record, so the cost is linear in the record length even for sequences
    spread over very many lines.
    """
    name = None
    chunks = []
    delim = b'\n'
    for line in self.fp:
        # strip() should also take care of DOS line breaks
        line = line.strip()
        # Comparing a one-element slice works for bytes on Python 3 and
        # for str on Python 2, and is simply False for an empty line.
        if line[:1] == b'>':
            if name is not None:
                seq = b''.join(chunks)
                assert seq.find(delim) == -1
                yield self.sequence_class(name, seq, None)
            name = bytes_to_str(line[1:])
            chunks = []
        else:
            chunks.append(line)

    # Emit the final record, which is not followed by another header.
    if name is not None:
        seq = b''.join(chunks)
        assert seq.find(delim) == -1
        yield self.sequence_class(name, seq, None)
def write(self, match):
    """
    Print what match.rest() returns, followed by the read name, to
    self.file.

    Nothing is printed when the rest is empty.
    """
    remainder = match.rest()
    if len(remainder) == 0:
        return
    print(bytes_to_str(remainder), match.read.name, file=self.file)
def print_statistics(
    adapters,
    time,
    n,
    total_bp,
    quality_trimmed,
    trim,
    reads_matched,
    error_rate,
    too_short,
    too_long,
    args,
    file=None
):
    """
    Print a summary of the run, followed by one section per adapter.

    adapters        -- adapter objects; their lengths_front, lengths_back,
                       errors_front, errors_back, where, name,
                       name_is_generated, sequence and max_error_rate
                       attributes are read
    time            -- total running time in seconds
    n               -- number of processed reads
    total_bp        -- number of processed bases
    quality_trimmed -- number of quality-trimmed bases; the corresponding
                       line is omitted if this is negative
    trim            -- selects the wording "Trimmed" (true) or "Matched"
                       (false) for the matched-reads line
    reads_matched   -- number of reads with at least one adapter match
    error_rate      -- maximum error rate, printed as a percentage
    too_short, too_long -- number of reads reported as too short/too long
    args            -- command-line parameters, joined with spaces
    file            -- if not None, sys.stdout is temporarily replaced by
                       this object so that the helper functions called
                       here print to it as well

    sys.stdout is restored on every exit path, including when an
    exception is raised while the report is being written.
    """
    old_stdout = sys.stdout
    if file is not None:
        sys.stdout = file
    try:
        print("cutadapt version", __version__)
        print("Command line parameters:", " ".join(args))
        print("Maximum error rate: {0:.2%}".format(error_rate))
        print(" No. of adapters:", len(adapters))
        print(" Processed reads: {0:12}".format(n))
        print(" Processed bases: {0:12} bp ({1:.1F} Mbp)".format(total_bp, total_bp / 1e6))

        # Total number of removed bases over all adapters: removed length
        # times the number of reads from which that length was removed.
        trimmed_bp = 0
        for adapter in adapters:
            for d in (adapter.lengths_front, adapter.lengths_back):
                trimmed_bp += sum(seqlen * count for (seqlen, count) in d.items())

        # All per-read percentages need n > 0 to avoid dividing by zero.
        if n > 0:
            operation = "Trimmed" if trim else "Matched"
            print(" {0} reads: {1:12} ({2:.1%})".format(operation, reads_matched, reads_matched / n))
            t = [("Quality-trimmed", quality_trimmed), (" Trimmed bases", trimmed_bp)]
            if quality_trimmed < 0:
                del t[0]
            for what, bp in t:
                s = " ({0:.2%} of total)".format(float(bp) / total_bp) if total_bp > 0 else ""
                print(" {0}: {1:12} bp ({2:.1F} Mbp){3}".format(what, bp, bp / 1e6, s))
            print(" Too short reads: {0:12} ({1:.1%} of processed reads)".format(too_short, too_short / n))
            print(" Too long reads: {0:12} ({1:.1%} of processed reads)".format(too_long, too_long / n))
        print(" Total time: {0:9.2F} s".format(time))
        if n > 0:
            print(" Time per read: {0:10.3F} ms".format(1000.0 * time / n))
        print()

        for index, adapter in enumerate(adapters):
            total_front = sum(adapter.lengths_front.values())
            total_back = sum(adapter.lengths_back.values())
            total = total_front + total_back
            where = adapter.where
            # An adapter anchored to one end must not have counts for the
            # other end.
            assert (
                where == ANYWHERE
                or (where == BACK and total_front == 0)
                or (where in (FRONT, PREFIX) and total_back == 0)
            )

            print("=" * 3, "Adapter", index + 1, "=" * 3)
            print()
            if not adapter.name_is_generated:
                name = "'{0}' ({1})".format(adapter.name, bytes_to_str(adapter.sequence))
            else:
                name = "'{0}'".format(bytes_to_str(adapter.sequence))
            print("Adapter {0}, length {1}, was trimmed {2} times.".format(name, len(adapter.sequence), total))
            if where == ANYWHERE:
                print(total_front, "times, it overlapped the 5' end of a read")
                print(total_back, "times, it overlapped the 3' end or was within the read")
                print()
                print_error_ranges(len(adapter), adapter.max_error_rate)
                print("Overview of removed sequences (5')")
                print_histogram(adapter.lengths_front, len(adapter), n, adapter.max_error_rate, adapter.errors_front)
                print()
                print("Overview of removed sequences (3' or within)")
                print_histogram(adapter.lengths_back, len(adapter), n, adapter.max_error_rate, adapter.errors_back)
            elif where in (FRONT, PREFIX):
                print()
                print_error_ranges(len(adapter), adapter.max_error_rate)
                print("Overview of removed sequences")
                print_histogram(adapter.lengths_front, len(adapter), n, adapter.max_error_rate, adapter.errors_front)
            else:
                assert where == BACK
                print()
                print_error_ranges(len(adapter), adapter.max_error_rate)
                print("Overview of removed sequences")
                print_histogram(adapter.lengths_back, len(adapter), n, adapter.max_error_rate, adapter.errors_back)

        if n == 0:
            print("No reads were read! Either your input file is empty or you used the wrong -f/--format parameter.")
    finally:
        # Always undo the redirection, even if reporting failed half-way.
        sys.stdout = old_stdout
def print_statistics(adapters, time, n, total_bp, quality_trimmed, trim,
                     reads_matched, error_rate, too_short, too_long, args, file=None):
    """
    Print a summary of the run, followed by one section per adapter.

    adapters        -- adapter objects; their lengths_front, lengths_back,
                       errors_front, errors_back, where, name,
                       name_is_generated, sequence and max_error_rate
                       attributes are read
    time            -- total running time in seconds
    n               -- number of processed reads
    total_bp        -- number of processed bases
    quality_trimmed -- number of quality-trimmed bases; the corresponding
                       line is omitted if this is negative
    trim            -- selects the wording "Trimmed" (true) or "Matched"
                       (false) for the matched-reads line
    reads_matched   -- number of reads with at least one adapter match
    error_rate      -- maximum error rate, printed as a percentage
    too_short, too_long -- number of reads reported as too short/too long
    args            -- command-line parameters, joined with spaces
    file            -- if not None, sys.stdout is temporarily replaced by
                       this object so that the helper functions called
                       here print to it as well

    sys.stdout is restored on every exit path, including when an
    exception is raised while the report is being written.
    """
    old_stdout = sys.stdout
    if file is not None:
        sys.stdout = file
    try:
        print("cutadapt version", __version__)
        print("Command line parameters:", " ".join(args))
        print("Maximum error rate: {0:.2%}".format(error_rate))
        print(" No. of adapters:", len(adapters))
        print(" Processed reads: {0:12}".format(n))
        print(" Processed bases: {0:12} bp ({1:.1F} Mbp)".format(
            total_bp, total_bp / 1E6))

        # Total number of removed bases over all adapters: removed length
        # times the number of reads from which that length was removed.
        trimmed_bp = 0
        for adapter in adapters:
            for d in (adapter.lengths_front, adapter.lengths_back):
                trimmed_bp += sum(seqlen * count for (seqlen, count) in d.items())

        # All per-read percentages need n > 0 to avoid dividing by zero.
        if n > 0:
            operation = "Trimmed" if trim else "Matched"
            print(" {0} reads: {1:12} ({2:.1%})".format(
                operation, reads_matched, reads_matched / n))
            t = [("Quality-trimmed", quality_trimmed), (" Trimmed bases", trimmed_bp)]
            if quality_trimmed < 0:
                del t[0]
            for what, bp in t:
                s = " ({0:.2%} of total)".format(float(bp) / total_bp) if total_bp > 0 else ''
                print(" {0}: {1:12} bp ({2:.1F} Mbp){3}".format(
                    what, bp, bp / 1E6, s))
            print(" Too short reads: {0:12} ({1:.1%} of processed reads)".format(
                too_short, too_short / n))
            print(" Too long reads: {0:12} ({1:.1%} of processed reads)".format(
                too_long, too_long / n))
        print(" Total time: {0:9.2F} s".format(time))
        if n > 0:
            print(" Time per read: {0:10.3F} ms".format(1000. * time / n))
        print()

        for index, adapter in enumerate(adapters):
            total_front = sum(adapter.lengths_front.values())
            total_back = sum(adapter.lengths_back.values())
            total = total_front + total_back
            where = adapter.where
            # An adapter anchored to one end must not have counts for the
            # other end.
            assert where == ANYWHERE or (where == BACK and total_front == 0) or (
                where in (FRONT, PREFIX) and total_back == 0)

            print("=" * 3, "Adapter", index + 1, "=" * 3)
            print()
            if not adapter.name_is_generated:
                name = "'{0}' ({1})".format(adapter.name, bytes_to_str(adapter.sequence))
            else:
                name = "'{0}'".format(bytes_to_str(adapter.sequence))
            print("Adapter {0}, length {1}, was trimmed {2} times.".format(
                name, len(adapter.sequence), total))
            if where == ANYWHERE:
                print(total_front, "times, it overlapped the 5' end of a read")
                print(total_back, "times, it overlapped the 3' end or was within the read")
                print()
                print_error_ranges(len(adapter), adapter.max_error_rate)
                print("Overview of removed sequences (5')")
                print_histogram(adapter.lengths_front, len(adapter), n,
                                adapter.max_error_rate, adapter.errors_front)
                print()
                print("Overview of removed sequences (3' or within)")
                print_histogram(adapter.lengths_back, len(adapter), n,
                                adapter.max_error_rate, adapter.errors_back)
            elif where in (FRONT, PREFIX):
                print()
                print_error_ranges(len(adapter), adapter.max_error_rate)
                print("Overview of removed sequences")
                print_histogram(adapter.lengths_front, len(adapter), n,
                                adapter.max_error_rate, adapter.errors_front)
            else:
                assert where == BACK
                print()
                print_error_ranges(len(adapter), adapter.max_error_rate)
                print("Overview of removed sequences")
                print_histogram(adapter.lengths_back, len(adapter), n,
                                adapter.max_error_rate, adapter.errors_back)

        if n == 0:
            print(
                "No reads were read! Either your input file is empty or you used the wrong -f/--format parameter."
            )
    finally:
        # Always undo the redirection, even if reporting failed half-way.
        sys.stdout = old_stdout