Beispiel #1
0
	def __iter__(self):
		"""
		Iterate over the FASTQ records read from self.fp.

		Lines are consumed in groups of four: header, sequence, separator and
		qualities. For every complete group one
		self.sequence_class(name, sequence, qualities) object is yielded.
		The name is the header without its leading '@', passed through
		bytes_to_str(). The sequence is the whitespace-stripped second line.
		The qualities are the fourth line with only trailing line breaks
		removed, i.e. still encoded.

		A trailing, incomplete group of lines is not reported.

		Side effect: self.twoheaders is set to True as soon as a separator
		line carries a description after the '+'.

		Raises FormatError if a header line does not start with '@', if a
		separator line does not start with '+', or if the description after
		the '+' is non-empty and differs from the header description.
		"""
		for i, line in enumerate(self.fp):
			position = i % 4
			if position == 0:
				# Header line of the record.
				if not line.startswith(b'@'):
					# The message used to name '+' here, which pointed users
					# at the wrong line type.
					raise FormatError("at line {0}, expected a line starting with '@'".format(i+1))
				name = line.strip()[1:]
			elif position == 1:
				sequence = line.strip()
			elif position == 2:
				# Separator line: '+', optionally followed by the name again.
				line = line.strip()
				if not line.startswith(b'+'):
					raise FormatError("at line {0}, expected a line starting with '+'".format(i+1))
				if len(line) > 1:
					self.twoheaders = True
					if not line[1:] == name:
						raise FormatError(
							"At line {0}: Sequence descriptions in the FASTQ file do not match "
							"({1!r} != {2!r}).\n"
							"The second sequence description must be either empty "
							"or equal to the first description.".format(
								i+1, bytes_to_str(name), bytes_to_str(line[1:])))
			else:
				# Only the line break is removed: other whitespace characters
				# are kept as part of the quality string.
				qualities = line.rstrip(b'\n\r')
				yield self.sequence_class(bytes_to_str(name), sequence, qualities)
Beispiel #2
0
	def __iter__(self):
		"""
		Yield one record object per four-line FASTQ entry found in self.fp.

		Each group of four lines is interpreted as header, sequence,
		separator and qualities, and produces
		self.sequence_class(name, sequence, qualities). The name is the
		header minus its leading '@', converted with bytes_to_str(). The
		sequence is the stripped second line. The qualities keep their
		encoding and only lose trailing line-break characters.

		If the input ends in the middle of a group, that partial group is
		dropped without an error.

		Side effect: self.twoheaders becomes True once a separator line has
		text after the '+'.

		Raises FormatError when a header does not begin with '@', when a
		separator does not begin with '+', or when the text after '+' is
		non-empty and not equal to the header description.
		"""
		for i, line in enumerate(self.fp):
			lineno = i + 1
			kind = i % 4
			if kind == 0:
				if not line.startswith(b'@'):
					# Previously this message wrongly asked for a '+'.
					raise FormatError("at line {0}, expected a line starting with '@'".format(lineno))
				name = line.strip()[1:]
			elif kind == 1:
				sequence = line.strip()
			elif kind == 2:
				line = line.strip()
				if not line.startswith(b'+'):
					raise FormatError("at line {0}, expected a line starting with '+'".format(lineno))
				if len(line) > 1:
					# The description is repeated after the '+'.
					self.twoheaders = True
					if not line[1:] == name:
						raise FormatError(
							"At line {0}: Sequence descriptions in the FASTQ file do not match "
							"({1!r} != {2!r}).\n"
							"The second sequence description must be either empty "
							"or equal to the first description.".format(
								lineno, bytes_to_str(name), bytes_to_str(line[1:])))
			else:
				# Strip only line breaks so the quality string is otherwise
				# left untouched.
				qualities = line.rstrip(b'\n\r')
				yield self.sequence_class(bytes_to_str(name), sequence, qualities)
Beispiel #3
0
    def find_match(self, read):
        """
        Repeatedly look up the best adapter match for a read.

        Up to self.times rounds are run. Each round asks
        self._best_match() for a match and stops at the first round that
        yields None. After every round but the last, the read is replaced
        by match.adapter.trimmed(match), so later rounds search the
        trimmed read.

        Every match is handed to self._write_info() and, if
        self.wildcard_file is set, its wildcards and the read name are
        printed to that file.

        Returns the list of matches in the order they were found; the
        list is empty if the first round finds nothing. The matches are
        meant to be applied consecutively to the read.

        NOTE(review): the earlier docstring said the read is converted to
        uppercase before comparison; no such conversion is visible in this
        method -- confirm where it happens.
        """
        found = []

        # at most self.times adapter removals are attempted
        for attempt in range(self.times):
            best = self._best_match(read)
            if best is None:
                # no further adapter in the (remaining) read
                break
            self._write_info(best)  # FIXME move to cut() or somewhere else
            assert best.length > 0
            assert best.errors / best.length <= best.adapter.max_error_rate
            assert best.length - best.errors > 0

            if self.wildcard_file:  # FIXME move to cut() or somewhere else
                wildcards = bytes_to_str(best.wildcards())
                print(wildcards, read.name, file=self.wildcard_file)

            found.append(best)
            is_final_attempt = attempt == self.times - 1
            if not is_final_attempt:
                # the next round works on what is left after trimming
                read = best.adapter.trimmed(best)
        return found
Beispiel #4
0
    def find_match(self, read):
        """
        Collect the successive best adapter matches for a read.

        The search runs for at most self.times rounds and ends early when
        self._best_match() returns None. Except after the final round, the
        read is swapped for match.adapter.trimmed(match) before searching
        again.

        Each match is passed to self._write_info(). When
        self.wildcard_file is set, the match's wildcards and the read name
        are also printed to it.

        Returns a list of the matches found, in order (possibly empty).
        They need to be applied to the read one after another.

        NOTE(review): an earlier version of this docstring claimed the
        read is upper-cased before comparison; this method does not do
        that itself -- verify against _best_match().
        """
        collected = []

        # no more than self.times adapters are removed from one read
        for round_no in range(self.times):
            hit = self._best_match(read)
            if hit is None:
                # search exhausted
                break
            self._write_info(hit)  # FIXME move to cut() or somewhere else
            assert hit.length > 0
            assert hit.errors / hit.length <= hit.adapter.max_error_rate
            assert hit.length - hit.errors > 0

            if self.wildcard_file:  # FIXME move to cut() or somewhere else
                print(bytes_to_str(hit.wildcards()), read.name, file=self.wildcard_file)

            collected.append(hit)
            if round_no == self.times - 1:
                # last round: trimming again would be wasted work
                continue
            read = hit.adapter.trimmed(hit)
        return collected
Beispiel #5
0
 def _write_info(self, match):
     """write one line to the info file"""
     # TODO move to separate class
     if not self.info_file:
         return
     seq = match.read.sequence
     if match is None:
         print(match.read.name, -1, seq, sep='\t', file=self.info_file)
     else:
         print(match.read.name,
               match.errors,
               match.rstart,
               match.rstop,
               bytes_to_str(seq[0:match.rstart]),
               bytes_to_str(seq[match.rstart:match.rstop]),
               bytes_to_str(seq[match.rstop:]),
               match.adapter.name,
               sep='\t',
               file=self.info_file)
Beispiel #6
0
	def _write_info(self, match):
		"""write one line to the info file"""
		# TODO move to separate class
		if not self.info_file:
			return
		seq = match.read.sequence
		if match is None:
			print(match.read.name, -1, seq, sep='\t', file=self.info_file)
		else:
			print(
				match.read.name,
				match.errors,
				match.rstart,
				match.rstop,
				bytes_to_str(seq[0:match.rstart]),
				bytes_to_str(seq[match.rstart:match.rstop]),
				bytes_to_str(seq[match.rstop:]),
				match.adapter.name,
				sep='\t', file=self.info_file)
Beispiel #7
0
	def __iter__(self):
		"""
		Iterate over the FASTA records in self.fp, one record at a time.

		A line whose first character is '>' starts a new record; the rest
		of that line, converted with bytes_to_str(), is the record name.
		All following lines up to the next header are stripped of
		surrounding whitespace and concatenated into the sequence.

		Yields self.sequence_class(name, sequence, None) per record. Lines
		that appear before the first header are discarded.
		"""
		name = None
		# Pieces are collected in a list and joined once per record, so
		# that long multi-line sequences are not re-copied for every line.
		parts = []
		delim = b'\n' if PY3 else '\n'
		# Indexing bytes gives an int on Python 3 and a str on Python 2.
		recordstart = ord('>') if PY3 else '>'
		for line in self.fp:
			# strip() should also take care of DOS line breaks
			line = line.strip()
			if line and line[0] == recordstart:
				if name is not None:
					seq = bytes().join(parts)
					assert seq.find(delim) == -1
					yield self.sequence_class(name, seq, None)
				name = bytes_to_str(line[1:])
				parts = []
			elif line:
				parts.append(line)
		if name is not None:
			# The last record has no following header to trigger its output.
			seq = bytes().join(parts)
			assert seq.find(delim) == -1
			yield self.sequence_class(name, seq, None)
Beispiel #8
0
	def __iter__(self):
		"""
		Yield the FASTA records found in self.fp, one at a time.

		Header lines start with '>'; the text after it becomes the record
		name (via bytes_to_str()). The whitespace-stripped lines between
		two headers are concatenated to form the sequence.

		Each record is yielded as self.sequence_class(name, sequence, None).
		Anything before the first header line is ignored.
		"""
		name = None
		# Collect the lines of a record and join them once. Extending a
		# bytes object with += copies it on every line.
		chunks = []
		delim = b'\n' if PY3 else '\n'
		# line[0] is an int on Python 3 but a one-character str on Python 2.
		recordstart = ord('>') if PY3 else '>'

		def finished_sequence():
			# Assemble the current record's sequence from its lines.
			joined = bytes().join(chunks)
			assert joined.find(delim) == -1
			return joined

		for line in self.fp:
			# strip() should also take care of DOS line breaks
			line = line.strip()
			if not line:
				# Blank lines contribute nothing to the sequence.
				continue
			if line[0] == recordstart:
				if name is not None:
					yield self.sequence_class(name, finished_sequence(), None)
				name = bytes_to_str(line[1:])
				chunks = []
			else:
				chunks.append(line)
		if name is not None:
			# Emit the final record, which is not followed by a header.
			yield self.sequence_class(name, finished_sequence(), None)
Beispiel #9
0
 def write(self, match):
     """Print match.rest() and the read name to self.file.

     The value of match.rest() is converted with bytes_to_str() first.
     Nothing is printed when match.rest() has length zero.
     """
     remainder = match.rest()
     if len(remainder) <= 0:
         return
     text = bytes_to_str(remainder)
     print(text, match.read.name, file=self.file)
Beispiel #10
0
def print_statistics(
    adapters, time, n, total_bp, quality_trimmed, trim, reads_matched, error_rate, too_short, too_long, args, file=None
):
    """Print a summary of a trimming run.

    Output goes to *file* if given, otherwise to the current sys.stdout.
    sys.stdout is temporarily replaced by *file* so that the helpers
    print_error_ranges() and print_histogram() write to the same place.
    The previous sys.stdout is restored even if an error occurs while
    printing.

    Parameters:
    adapters -- sized iterable of adapter objects; their lengths_front,
        lengths_back, errors_front, errors_back, where, name,
        name_is_generated, sequence and max_error_rate attributes are read
    time -- total run time, printed in seconds
    n -- number of processed reads
    total_bp -- number of processed bases
    quality_trimmed -- number of quality-trimmed bases; a negative value
        suppresses that line
    trim -- if true, matched reads are labelled "Trimmed", else "Matched"
    reads_matched -- number of reads with at least one match
    error_rate -- maximum error rate, printed as a percentage
    too_short, too_long -- number of reads reported as too short/too long
    args -- command line parameters, a sequence of strings
    file -- optional file-like object to print to

    Raises AssertionError if an adapter's counters are inconsistent with
    its `where` type.
    """
    old_stdout = sys.stdout
    if file is not None:
        sys.stdout = file
    try:
        print("cutadapt version", __version__)
        print("Command line parameters:", " ".join(args))
        print("Maximum error rate: {0:.2%}".format(error_rate))
        print("   No. of adapters:", len(adapters))
        print("   Processed reads: {0:12}".format(n))
        print("   Processed bases: {0:12} bp ({1:.1F} Mbp)".format(total_bp, total_bp / 1e6))
        # Bases removed by adapter trimming: removed length times how
        # often that length occurred, summed over all adapters.
        trimmed_bp = 0
        for adapter in adapters:
            for d in (adapter.lengths_front, adapter.lengths_back):
                trimmed_bp += sum(seqlen * count for (seqlen, count) in d.items())

        if n > 0:
            operation = "Trimmed" if trim else "Matched"
            print("     {0} reads: {1:12} ({2:.1%})".format(operation, reads_matched, reads_matched / n))
            t = [("Quality-trimmed", quality_trimmed), ("  Trimmed bases", trimmed_bp)]
            if quality_trimmed < 0:
                # A negative count means the quality-trimming line is omitted.
                del t[0]
            for what, bp in t:
                s = " ({0:.2%} of total)".format(float(bp) / total_bp) if total_bp > 0 else ""
                print("   {0}: {1:12} bp ({2:.1F} Mbp){3}".format(what, bp, bp / 1e6, s))
            print("   Too short reads: {0:12} ({1:.1%} of processed reads)".format(too_short, too_short / n))
            print("    Too long reads: {0:12} ({1:.1%} of processed reads)".format(too_long, too_long / n))
        print("        Total time: {0:9.2F} s".format(time))
        if n > 0:
            print("     Time per read: {0:10.3F} ms".format(1000.0 * time / n))
        print()
        for index, adapter in enumerate(adapters):
            total_front = sum(adapter.lengths_front.values())
            total_back = sum(adapter.lengths_back.values())
            total = total_front + total_back
            where = adapter.where
            # Unless the adapter may occur anywhere, only the counters for
            # its own side may be non-zero.
            assert (
                where == ANYWHERE
                or (where == BACK and total_front == 0)
                or (where in (FRONT, PREFIX) and total_back == 0)
            )

            print("=" * 3, "Adapter", index + 1, "=" * 3)
            print()
            if not adapter.name_is_generated:
                name = "'{0}' ({1})".format(adapter.name, bytes_to_str(adapter.sequence))
            else:
                name = "'{0}'".format(bytes_to_str(adapter.sequence))
            print("Adapter {0}, length {1}, was trimmed {2} times.".format(name, len(adapter.sequence), total))
            if where == ANYWHERE:
                print(total_front, "times, it overlapped the 5' end of a read")
                print(total_back, "times, it overlapped the 3' end or was within the read")
                print()
                print_error_ranges(len(adapter), adapter.max_error_rate)
                print("Overview of removed sequences (5')")
                print_histogram(adapter.lengths_front, len(adapter), n, adapter.max_error_rate, adapter.errors_front)
                print()
                print("Overview of removed sequences (3' or within)")
                print_histogram(adapter.lengths_back, len(adapter), n, adapter.max_error_rate, adapter.errors_back)
            elif where in (FRONT, PREFIX):
                print()
                print_error_ranges(len(adapter), adapter.max_error_rate)
                print("Overview of removed sequences")
                print_histogram(adapter.lengths_front, len(adapter), n, adapter.max_error_rate, adapter.errors_front)
            else:
                assert where == BACK
                print()
                print_error_ranges(len(adapter), adapter.max_error_rate)
                print("Overview of removed sequences")
                print_histogram(adapter.lengths_back, len(adapter), n, adapter.max_error_rate, adapter.errors_back)

        if n == 0:
            print("No reads were read! Either your input file is empty or you used the wrong -f/--format parameter.")
    finally:
        # Always undo the redirection, otherwise a failure above would
        # leave all later output of the process going to *file*.
        sys.stdout = old_stdout
Beispiel #11
0
 def write(self, match):
     """Write the result of match.rest() together with the read name.

     The output is one line printed to self.file, with match.rest()
     passed through bytes_to_str(). A zero-length match.rest() produces
     no output.
     """
     leftover = match.rest()
     has_content = len(leftover) > 0
     if not has_content:
         return
     print(bytes_to_str(leftover), match.read.name, file=self.file)
Beispiel #12
0
def print_statistics(adapters,
                     time,
                     n,
                     total_bp,
                     quality_trimmed,
                     trim,
                     reads_matched,
                     error_rate,
                     too_short,
                     too_long,
                     args,
                     file=None):
    """Print a summary of a trimming run.

    The summary is written to *file* if it is given and to the current
    sys.stdout otherwise. While printing, sys.stdout is pointed at *file*
    so that print_error_ranges() and print_histogram() end up in the same
    place. The original sys.stdout is put back afterwards, also when
    printing fails with an exception.

    Parameters:
    adapters -- sized iterable of adapter objects; the attributes
        lengths_front, lengths_back, errors_front, errors_back, where,
        name, name_is_generated, sequence and max_error_rate are used
    time -- total run time, printed in seconds
    n -- number of processed reads
    total_bp -- number of processed bases
    quality_trimmed -- number of quality-trimmed bases; if negative, the
        corresponding line is left out
    trim -- selects the label "Trimmed" (true) or "Matched" (false)
    reads_matched -- number of reads with at least one match
    error_rate -- maximum error rate, printed as a percentage
    too_short, too_long -- number of reads reported as too short/too long
    args -- command line parameters, a sequence of strings
    file -- optional file-like object that receives the output

    Raises AssertionError if the front/back counters of an adapter do not
    fit its `where` type.
    """
    old_stdout = sys.stdout
    if file is not None:
        sys.stdout = file
    try:
        print("cutadapt version", __version__)
        print("Command line parameters:", " ".join(args))
        print("Maximum error rate: {0:.2%}".format(error_rate))
        print("   No. of adapters:", len(adapters))
        print("   Processed reads: {0:12}".format(n))
        print("   Processed bases: {0:12} bp ({1:.1F} Mbp)".format(
            total_bp, total_bp / 1E6))
        # Sum of (removed length * number of occurrences) over all
        # adapters gives the number of bases removed as adapter sequence.
        trimmed_bp = 0
        for adapter in adapters:
            for d in (adapter.lengths_front, adapter.lengths_back):
                trimmed_bp += sum(seqlen * count
                                  for (seqlen, count) in d.items())

        if n > 0:
            operation = "Trimmed" if trim else "Matched"
            print("     {0} reads: {1:12} ({2:.1%})".format(
                operation, reads_matched, reads_matched / n))
            t = [("Quality-trimmed", quality_trimmed),
                 ("  Trimmed bases", trimmed_bp)]
            if quality_trimmed < 0:
                # Negative count: do not show the quality-trimming line.
                del t[0]
            for what, bp in t:
                if total_bp > 0:
                    s = " ({0:.2%} of total)".format(float(bp) / total_bp)
                else:
                    s = ''
                print("   {0}: {1:12} bp ({2:.1F} Mbp){3}".format(
                    what, bp, bp / 1E6, s))
            print("   Too short reads: {0:12} ({1:.1%} of processed reads)".format(
                too_short, too_short / n))
            print("    Too long reads: {0:12} ({1:.1%} of processed reads)".format(
                too_long, too_long / n))
        print("        Total time: {0:9.2F} s".format(time))
        if n > 0:
            print("     Time per read: {0:10.3F} ms".format(1000. * time / n))
        print()
        for index, adapter in enumerate(adapters):
            total_front = sum(adapter.lengths_front.values())
            total_back = sum(adapter.lengths_back.values())
            total = total_front + total_back
            where = adapter.where
            # Only ANYWHERE adapters may have counts on both sides.
            assert where == ANYWHERE or (where == BACK and total_front == 0) or (
                where in (FRONT, PREFIX) and total_back == 0)

            print("=" * 3, "Adapter", index + 1, "=" * 3)
            print()
            if not adapter.name_is_generated:
                name = "'{0}' ({1})".format(adapter.name,
                                            bytes_to_str(adapter.sequence))
            else:
                name = "'{0}'".format(bytes_to_str(adapter.sequence))
            print("Adapter {0}, length {1}, was trimmed {2} times.".format(
                name, len(adapter.sequence), total))
            if where == ANYWHERE:
                print(total_front, "times, it overlapped the 5' end of a read")
                print(total_back,
                      "times, it overlapped the 3' end or was within the read")
                print()
                print_error_ranges(len(adapter), adapter.max_error_rate)
                print("Overview of removed sequences (5')")
                print_histogram(adapter.lengths_front, len(adapter), n,
                                adapter.max_error_rate, adapter.errors_front)
                print()
                print("Overview of removed sequences (3' or within)")
                print_histogram(adapter.lengths_back, len(adapter), n,
                                adapter.max_error_rate, adapter.errors_back)
            elif where in (FRONT, PREFIX):
                print()
                print_error_ranges(len(adapter), adapter.max_error_rate)
                print("Overview of removed sequences")
                print_histogram(adapter.lengths_front, len(adapter), n,
                                adapter.max_error_rate, adapter.errors_front)
            else:
                assert where == BACK
                print()
                print_error_ranges(len(adapter), adapter.max_error_rate)
                print("Overview of removed sequences")
                print_histogram(adapter.lengths_back, len(adapter), n,
                                adapter.max_error_rate, adapter.errors_back)

        if n == 0:
            print(
                "No reads were read! Either your input file is empty or you used the wrong -f/--format parameter."
            )
    finally:
        # Restore stdout unconditionally; without this, an exception
        # raised above would leave sys.stdout redirected to *file*.
        sys.stdout = old_stdout