Esempio n. 1
0
		def collect(back, anywhere, front):
			"""
			Create one adapter object for every adapter specification.

			The three arguments are passed unchanged to gather_adapters(), which
			yields (name, sequence, where) triples. An empty sequence is reported
			through parser.error(). When options.debug is set, debugging is
			enabled on every created adapter.

			Returns the list of created adapter objects.
			"""
			result = []
			for adapter_name, sequence, position in gather_adapters(back, anywhere, front):
				if not sequence:
					parser.error("The adapter sequence is empty.")
				created = ADAPTER_CLASS(
					sequence,
					position,
					options.error_rate,
					options.overlap,
					options.match_read_wildcards,
					options.match_adapter_wildcards,
					name=adapter_name,
					indels=options.indels)
				if options.debug:
					created.enable_debug()
				result.append(created)
			return result
Esempio n. 2
0
		def collect(back, anywhere, front):
			"""
			Turn the adapter specifications into a list of adapter objects.

			gather_adapters(back, anywhere, front) supplies (name, sequence,
			where) triples. parser.error() is called for an empty sequence and
			also when indels are disabled (options.indels is false) for an
			adapter whose type is neither PREFIX nor SUFFIX.
			"""
			collected = []
			for adapter_name, sequence, position in gather_adapters(back, anywhere, front):
				if not sequence:
					parser.error("The adapter sequence is empty.")
				# Without indels, only the PREFIX and SUFFIX adapter types are accepted.
				if not (options.indels or position in (PREFIX, SUFFIX)):
					parser.error("Not allowing indels is currently supported only for anchored 5' and 3' adapters.")
				collected.append(ADAPTER_CLASS(
					sequence,
					position,
					options.error_rate,
					options.overlap,
					options.match_read_wildcards,
					options.match_adapter_wildcards,
					name=adapter_name,
					indels=options.indels))
			return collected
Esempio n. 3
0
 def collect(back, anywhere, front):
     """
     Build the adapter objects described by the three specification lists.

     Every (name, sequence, where) triple produced by gather_adapters() is
     converted into an ADAPTER_CLASS instance configured from ``options``.
     Empty sequences are reported via parser.error(); debugging is switched
     on for each adapter when options.debug is set.

     Returns the adapters as a list.
     """
     built = []
     specifications = gather_adapters(back, anywhere, front)
     for adapter_name, sequence, position in specifications:
         if not sequence:
             parser.error("The adapter sequence is empty.")
         positional = (
             sequence,
             position,
             options.error_rate,
             options.overlap,
             options.match_read_wildcards,
             options.match_adapter_wildcards,
         )
         new_adapter = ADAPTER_CLASS(*positional, name=adapter_name, indels=options.indels)
         if options.debug:
             new_adapter.enable_debug()
         built.append(new_adapter)
     return built
Esempio n. 4
0
 def __init__(self, queue=None, results=None, adapter=None, phred64=False):
     """
     Set up the worker and its list of read modifiers.

     queue and results are stored on the instance unchanged.

     adapter selects what is appended after the initial QualityTrimmer:

     * None or the string 'none': nothing is added and self.adapter is
       set to None.
     * a string starting with '+': it is converted with int() and handed
       to an UnconditionalCutter.
     * anything else: the string is split at commas, passed to
       gather_adapters() and one Adapter per result is created; all of
       them are wrapped in a single AdapterCutter.

     phred64 selects a quality base of 64 instead of 33.
     """
     super(Worker, self).__init__()
     self.queue = queue
     self.results = results
     self.phred = 64 if phred64 else 33
     # NOTE(review): arguments are presumably (front cutoff, back cutoff,
     # quality base) -- confirm against QualityTrimmer.
     self.modifiers = [QualityTrimmer(0, 10, self.phred)]
     self.adapters = []
     self.error_rate = 0.12
     self.min_length = 16
     if adapter is None or adapter == 'none':
         # The declared default (None) previously raised AttributeError on
         # adapter.startswith(); it is now handled like an explicit 'none'.
         self.adapter = None
     elif adapter.startswith('+'):
         self.modifiers.append(UnconditionalCutter(int(adapter)))
     else:
         for name, seq, where in gather_adapters(adapter.split(','), [], []):
             self.adapters.append(Adapter(seq, where, self.error_rate, name=name))
         self.modifiers.append(AdapterCutter(self.adapters))
def main(cmdlineargs=None, default_outfile=sys.stdout):
    """
    Main function that evaluates command-line parameters and iterates
    over all reads.

    cmdlineargs is the list of command-line arguments to parse; when it is
    None, ``sys.argv[1:]`` is used.

    default_outfile is the file to which trimmed reads are sent if the ``-o``
    parameter is not used.

    Invalid parameters are reported through ``parser.error()``. The process
    exits with status 1 when gathering the adapters raises an IOError with
    errno ENOENT, when processing is interrupted from the keyboard, on an
    IOError with errno EPIPE, or on a seqio.FormatError.
    """
    parser = get_option_parser()
    if cmdlineargs is None:
        cmdlineargs = sys.argv[1:]
    options, args = parser.parse_args(args=cmdlineargs)

    if len(args) == 0:
        parser.error("At least one parameter needed: name of a FASTA or FASTQ file.")
    elif len(args) > 2:
        parser.error("Too many parameters.")

    input_filename = args[0]

    # If a second file name was given, then we either have single-end reads
    # provided as a pair of .fasta/.qual files or we have paired-end reads.
    quality_filename = None
    input_paired_filename = None
    if len(args) == 2:
        if args[0].endswith(".qual"):
            parser.error("The QUAL file must be the second argument.")
        if args[1].endswith(".qual"):
            quality_filename = args[1]
        else:
            input_paired_filename = args[1]
            if not options.paired_output:
                parser.error("You must use --paired-output when trimming paired-end reads.")

    if len(args) == 1 and options.paired_output:
        parser.error("You specified a --paired-output file, but gave only one input file.")
    if options.paired_output and bool(options.untrimmed_output) != bool(options.untrimmed_paired_output):
        parser.error(
            "When trimming paired-end reads, you must use either none "
            "or both of the --untrimmed-output/--untrimmed-paired-output options."
        )
    if options.untrimmed_paired_output and not options.paired_output:
        parser.error(
            "Option --untrimmed-paired-output can only be used when "
            "trimming paired-end reads (with option --paired-output)."
        )
    if input_filename.endswith(".qual"):
        parser.error("Need a FASTA file in addition to the QUAL file.")

    if options.format is not None and options.format.lower() not in ["fasta", "fastq", "sra-fastq"]:
        parser.error(
            "The input file format must be either 'fasta', 'fastq' or "
            "'sra-fastq' (not '{0}').".format(options.format)
        )

    # TODO should this really be an error?
    if options.format is not None and quality_filename is not None:
        parser.error("If a pair of .fasta and .qual files is given, the -f/--format parameter cannot be used.")

    # Writers/filters that receive every processed read, in this order.
    writers = []
    too_short_outfile = None  # too short reads go here
    too_short_filter = None
    # TODO pass file name to TooShortReadFilter, add a .close() method?
    if options.minimum_length > 0:
        if options.too_short_output:
            too_short_outfile = xopen(options.too_short_output, "w")
        too_short_filter = TooShortReadFilter(options.minimum_length, too_short_outfile)
        writers.append(too_short_filter)
    too_long_outfile = None  # too long reads go here
    too_long_filter = None
    if options.maximum_length < sys.maxsize:
        if options.too_long_output is not None:
            too_long_outfile = xopen(options.too_long_output, "w")
        too_long_filter = TooLongReadFilter(options.maximum_length, too_long_outfile)
        writers.append(too_long_filter)

    # An output name containing "{name}" switches to demultiplexing; otherwise
    # the regular trimmed/untrimmed output files are opened.
    demultiplexer = None
    if options.output is not None and "{name}" in options.output:
        if options.discard_trimmed:
            parser.error("Do not use --discard-trimmed when demultiplexing.")
        if input_paired_filename:
            parser.error("Demultiplexing not supported for paired-end files, yet.")
        untrimmed = options.output.format(name="unknown")
        if options.untrimmed_output:
            untrimmed = options.untrimmed_output
        if options.discard_untrimmed:
            untrimmed = None
        demultiplexer = Demultiplexer(options.output, untrimmed)
        writers.append(demultiplexer)
        trimmed_outfile, untrimmed_outfile = None, None
        trimmed_paired_outfile, untrimmed_paired_outfile = None, None
    else:
        trimmed_outfile, untrimmed_outfile = trimmed_and_untrimmed_files(
            default_outfile,
            options.output,
            options.untrimmed_output,
            options.discard_trimmed,
            options.discard_untrimmed,
        )

        trimmed_paired_outfile, untrimmed_paired_outfile = trimmed_and_untrimmed_files(
            None,  # applies when not trimming paired-end data
            options.paired_output,
            options.untrimmed_paired_output,
            options.discard_trimmed,
            options.discard_untrimmed,
        )

        writer = ProcessedReadWriter(
            trimmed_outfile, trimmed_paired_outfile, untrimmed_outfile, untrimmed_paired_outfile
        )
        writers.append(writer)

    # --maq is a shortcut that switches on several colorspace-related options.
    if options.maq:
        options.colorspace = True
        options.double_encode = True
        options.trim_primer = True
        options.strip_suffix.append("_F3")
        options.suffix = "/1"
    if options.zero_cap is None:
        options.zero_cap = options.colorspace
    if options.trim_primer and not options.colorspace:
        parser.error("Trimming the primer makes only sense in colorspace.")
    if options.double_encode and not options.colorspace:
        parser.error("Double-encoding makes only sense in colorspace.")
    if options.anywhere and options.colorspace:
        parser.error(
            "Using --anywhere with colorspace reads is currently not supported (if you think this may be useful, contact the author)."
        )
    if not (0 <= options.error_rate <= 1.0):
        parser.error("The maximum error rate must be between 0 and 1.")
    if options.overlap < 1:
        parser.error("The overlap must be at least 1.")

    # The file-name options are replaced in place by the opened file objects.
    if options.rest_file is not None:
        options.rest_file = xopen(options.rest_file, "w")
        rest_writer = RestFileWriter(options.rest_file)
    else:
        rest_writer = None
    if options.info_file is not None:
        options.info_file = xopen(options.info_file, "w")
    if options.wildcard_file is not None:
        options.wildcard_file = xopen(options.wildcard_file, "w")

    if options.colorspace:
        if options.match_read_wildcards:
            parser.error("IUPAC wildcards not supported in colorspace")
        options.match_adapter_wildcards = False
    adapters = []
    ADAPTER_CLASS = ColorspaceAdapter if options.colorspace else Adapter

    try:
        for name, seq, where in gather_adapters(options.adapters, options.anywhere, options.front):
            if not seq:
                parser.error("The adapter sequence is empty")
            if not options.indels and where not in (PREFIX, SUFFIX):
                parser.error("Not allowing indels is currently supported only for anchored 5' and 3' adapters.")
            adapter = ADAPTER_CLASS(
                seq,
                where,
                options.error_rate,
                options.overlap,
                options.match_read_wildcards,
                options.match_adapter_wildcards,
                name=name,
                indels=options.indels,
            )
            adapters.append(adapter)
    except IOError as e:
        if e.errno == errno.ENOENT:
            print("Error:", e, file=sys.stderr)
            sys.exit(1)
        raise

    # options.cut is used as a sequence further down (len(), iteration), so it
    # is tested for emptiness here; comparing it with 0 could never be true
    # for an empty list and silently disabled this check.
    if (
        not adapters
        and options.quality_cutoff == 0
        and not options.cut
        and options.minimum_length == 0
        and options.maximum_length == sys.maxsize
    ):
        parser.error("You need to provide at least one adapter sequence.")

    if input_paired_filename:
        reader = seqio.PairedSequenceReader(
            input_filename, input_paired_filename, colorspace=options.colorspace, fileformat=options.format
        )
    else:
        reader = read_sequences(
            input_filename, quality_filename, colorspace=options.colorspace, fileformat=options.format
        )

    # Create the processing pipeline as a list of "modifiers".
    modifiers = []
    if options.cut:
        if len(options.cut) > 2:
            parser.error("You cannot remove bases from more than two ends.")
        # Two values with the same sign have a positive product.
        if len(options.cut) == 2 and options.cut[0] * options.cut[1] > 0:
            parser.error("You cannot remove bases from the same end twice.")
        for cut in options.cut:
            if cut != 0:
                modifiers.append(UnconditionalCutter(cut))

    if options.quality_cutoff > 0:
        modifiers.append(QualityTrimmer(options.quality_cutoff, options.quality_base))
    if adapters:
        adapter_cutter = AdapterCutter(
            adapters, options.times, options.wildcard_file, options.info_file, rest_writer, options.action
        )
        modifiers.append(adapter_cutter)
    else:
        adapter_cutter = None
    if options.length_tag:
        modifiers.append(LengthTagModifier(options.length_tag))
    if options.strip_f3:
        options.strip_suffix.append("_F3")
    for suffix in options.strip_suffix:
        modifiers.append(SuffixRemover(suffix))
    if options.prefix or options.suffix:
        modifiers.append(PrefixSuffixAdder(options.prefix, options.suffix))
    if options.double_encode:
        modifiers.append(DoubleEncoder())
    if options.zero_cap and reader.delivers_qualities:
        modifiers.append(ZeroCapper(quality_base=options.quality_base))
    if options.trim_primer:
        # NOTE(review): PrimerTrimmer is appended without being called, unlike
        # the other modifiers -- presumably it is a plain function; confirm.
        modifiers.append(PrimerTrimmer)

    # time.clock() no longer exists on Python 3.8+; keep using it where it is
    # available and fall back to time.perf_counter() otherwise.
    clock = getattr(time, "clock", None) or time.perf_counter
    start_time = clock()
    try:
        if input_paired_filename:
            stats = process_paired_reads(reader, modifiers, writers)
        else:
            stats = process_single_reads(reader, modifiers, writers)
    except KeyboardInterrupt:
        print("Interrupted", file=sys.stderr)
        sys.exit(1)
    except IOError as e:
        if e.errno == errno.EPIPE:
            sys.exit(1)
        raise
    except seqio.FormatError as e:
        print("Error:", e, file=sys.stderr)
        sys.exit(1)

    # close open files
    for f in [
        trimmed_outfile,
        untrimmed_outfile,
        trimmed_paired_outfile,
        untrimmed_paired_outfile,
        options.rest_file,
        options.wildcard_file,
        options.info_file,
        too_short_outfile,
        too_long_outfile,
        demultiplexer,
    ]:
        if f is not None and f is not sys.stdin and f is not sys.stdout:
            f.close()

    if not options.quiet:
        # send statistics to stderr if result was sent to stdout
        stat_file = sys.stderr if options.output is None else None
        print_statistics(
            adapters,
            clock() - start_time,
            stats,
            options.action,
            adapter_cutter.reads_matched if adapter_cutter else 0,
            options.error_rate,
            too_short_filter.too_short if too_short_filter else 0,
            too_long_filter.too_long if too_long_filter else 0,
            cmdlineargs,
            file=stat_file,
        )
def main(cmdlineargs=None, default_outfile=sys.stdout):
	"""
	Main function that evaluates command-line parameters and iterates
	over all reads.

	cmdlineargs is the list of command-line arguments to parse; when it is
	None, ``sys.argv[1:]`` is used.

	default_outfile is the file to which trimmed reads are sent if the ``-o``
	parameter is not used.

	Invalid parameters are reported through ``parser.error()``. The process
	exits with status 130 on a keyboard interrupt, with status 1 on an
	IOError with errno EPIPE, and with an error message on a
	seqio.FormatError or EOFError.
	"""
	logging.basicConfig(level=logging.INFO, format='%(message)s')  #  %(levelname)s
	parser = get_option_parser()
	if cmdlineargs is None:
		cmdlineargs = sys.argv[1:]
	options, args = parser.parse_args(args=cmdlineargs)

	if len(args) == 0:
		parser.error("At least one parameter needed: name of a FASTA or FASTQ file.")
	elif len(args) > 2:
		parser.error("Too many parameters.")
	input_filename = args[0]

	# Find out which 'mode' we need to use.
	# Default: single-read trimming (neither -p nor -A/-G/-B/-U given)
	paired = False
	if options.paired_output:
		# Modify first read only, keep second in sync (-p given, but not -A/-G/-B/-U).
		# This exists for backwards compatibility ('legacy mode').
		paired = 'first'
	if options.adapters2 or options.front2 or options.anywhere2 or options.cut2:
		# Full paired-end trimming when both -p and -A/-G/-B/-U given
		# Also the read modifications (such as quality trimming) are applied
		# to second read.
		paired = 'both'

	if paired and len(args) == 1:
		parser.error("When paired-end trimming is enabled via -A/-G/-B/-U or -p, "
			"two input files are required.")
	if paired:
		input_paired_filename = args[1]
		quality_filename = None
	else:
		input_paired_filename = None
		if len(args) == 2:
			if args[0].endswith('.qual'):
				parser.error("The QUAL file must be the second argument.")
			quality_filename = args[1]
		else:
			quality_filename = None

	if paired:
		if not options.paired_output:
			parser.error("When paired-end trimming is enabled via -A/-G/-B/-U, "
				"a second output file needs to be specified via -p (--paired-output).")
		if bool(options.untrimmed_output) != bool(options.untrimmed_paired_output):
			parser.error("When trimming paired-end reads, you must use either none "
				"or both of the --untrimmed-output/--untrimmed-paired-output options.")
	else:
		if options.untrimmed_paired_output:
			parser.error("Option --untrimmed-paired-output can only be used when "
				"trimming paired-end reads (with option -p).")
		if input_filename.endswith('.qual'):
			parser.error("Need a FASTA file in addition to the QUAL file.")
		if options.format is not None and quality_filename is not None:
			parser.error("If a pair of .fasta and .qual files is given, the -f/--format parameter cannot be used.")

	if options.format is not None and options.format.lower() not in ['fasta', 'fastq', 'sra-fastq']:
		parser.error("The input file format must be either 'fasta', 'fastq' or "
			"'sra-fastq' (not '{0}').".format(options.format))

	# The quality cutoff is given as "B" or "A,B"; a single value is stored
	# as [0, B].
	if options.quality_cutoff is not None:
		cutoffs = options.quality_cutoff.split(',')
		if len(cutoffs) == 1:
			try:
				cutoffs = [0, int(cutoffs[0])]
			except ValueError as e:
				parser.error("Quality cutoff value not recognized: {0}".format(e))
		elif len(cutoffs) == 2:
			try:
				cutoffs = [int(cutoffs[0]), int(cutoffs[1])]
			except ValueError as e:
				parser.error("Quality cutoff value not recognized: {0}".format(e))
		else:
			parser.error("Expected one value or two values separated by comma for the quality cutoff")
	else:
		cutoffs = None

	# Writers/filters that receive every processed read, in this order.
	writers = []
	too_short_outfile = None  # too short reads go here
	too_short_filter = None
	# TODO pass file name to TooShortReadFilter, add a .close() method?
	if options.minimum_length > 0:
		if options.too_short_output:
			too_short_outfile = xopen(options.too_short_output, 'w')
		too_short_filter = TooShortReadFilter(options.minimum_length,
			too_short_outfile, paired == 'both')
		writers.append(too_short_filter)
	too_long_outfile = None  # too long reads go here
	too_long_filter = None
	if options.maximum_length < sys.maxsize:
		if options.too_long_output is not None:
			too_long_outfile = xopen(options.too_long_output, 'w')
		too_long_filter = TooLongReadFilter(options.maximum_length,
			too_long_outfile, check_second=(paired == 'both'))
		writers.append(too_long_filter)

	if options.max_n != -1:
		writers.append(NContentTrimmer(options.max_n, check_second=(paired == 'both')))

	# An output name containing '{name}' switches to demultiplexing; otherwise
	# the regular trimmed/untrimmed output files are opened.
	demultiplexer = None
	if options.output is not None and '{name}' in options.output:
		if options.discard_trimmed:
			parser.error("Do not use --discard-trimmed when demultiplexing.")
		if paired:
			parser.error("Demultiplexing not supported for paired-end files, yet.")
		untrimmed = options.output.format(name='unknown')
		if options.untrimmed_output:
			untrimmed = options.untrimmed_output
		if options.discard_untrimmed:
			untrimmed = None
		demultiplexer = Demultiplexer(options.output, untrimmed)
		writers.append(demultiplexer)
		trimmed_outfile, untrimmed_outfile = None, None
		trimmed_paired_outfile, untrimmed_paired_outfile = None, None
	else:
		trimmed_outfile, untrimmed_outfile = trimmed_and_untrimmed_files(
			default_outfile,
			options.output,
			options.untrimmed_output,
			options.discard_trimmed,
			options.discard_untrimmed)

		trimmed_paired_outfile, untrimmed_paired_outfile = trimmed_and_untrimmed_files(
			None,  # applies when not trimming paired-end data
			options.paired_output,
			options.untrimmed_paired_output,
			options.discard_trimmed,
			options.discard_untrimmed)

		writer = ProcessedReadWriter(
			trimmed_outfile, trimmed_paired_outfile,
			untrimmed_outfile, untrimmed_paired_outfile,
			check_second=(paired == 'both')
		)
		writers.append(writer)

	# --maq is a shortcut that switches on several colorspace-related options.
	if options.maq:
		options.colorspace = True
		options.double_encode = True
		options.trim_primer = True
		options.strip_suffix.append('_F3')
		options.suffix = "/1"
	if options.zero_cap is None:
		options.zero_cap = options.colorspace
	if options.trim_primer and not options.colorspace:
		parser.error("Trimming the primer makes only sense in colorspace.")
	if options.double_encode and not options.colorspace:
		parser.error("Double-encoding makes only sense in colorspace.")
	if options.anywhere and options.colorspace:
		parser.error("Using --anywhere with colorspace reads is currently not supported (if you think this may be useful, contact the author).")
	if not (0 <= options.error_rate <= 1.):
		parser.error("The maximum error rate must be between 0 and 1.")
	if options.overlap < 1:
		parser.error("The overlap must be at least 1.")

	# The file-name options are replaced in place by the opened file objects.
	if options.rest_file is not None:
		options.rest_file = xopen(options.rest_file, 'w')
		rest_writer = RestFileWriter(options.rest_file)
	else:
		rest_writer = None
	if options.info_file is not None:
		options.info_file = xopen(options.info_file, 'w')
	if options.wildcard_file is not None:
		options.wildcard_file = xopen(options.wildcard_file, 'w')

	if options.colorspace:
		if options.match_read_wildcards:
			parser.error('IUPAC wildcards not supported in colorspace')
		options.match_adapter_wildcards = False

	ADAPTER_CLASS = ColorspaceAdapter if options.colorspace else Adapter

	def collect(back, anywhere, front):
		"""
		Create one adapter object per (name, sequence, where) triple
		returned by gather_adapters().

		Used for the adapters of both reads so that the same validation
		applies to each: previously the second-read copy of this loop
		rejected SUFFIX adapters when indels were disabled, contradicting
		both the first-read check and the error message.
		"""
		collected = []
		for name, seq, where in gather_adapters(back, anywhere, front):
			if not seq:
				parser.error("The adapter sequence is empty.")
			if not options.indels and where not in (PREFIX, SUFFIX):
				parser.error("Not allowing indels is currently supported only for anchored 5' and 3' adapters.")
			collected.append(ADAPTER_CLASS(seq, where, options.error_rate,
				options.overlap, options.match_read_wildcards,
				options.match_adapter_wildcards, name=name, indels=options.indels))
		return collected

	try:
		adapters = collect(options.adapters, options.anywhere, options.front)
		adapters2 = collect(options.adapters2, options.anywhere2, options.front2)
	except IOError as e:
		if e.errno == errno.ENOENT:
			parser.error(e)
		raise

	if not adapters and not adapters2 and not cutoffs and \
			options.cut == [] and options.cut2 == [] and \
			options.minimum_length == 0 and \
			options.maximum_length == sys.maxsize and \
			quality_filename is None and \
			options.max_n == -1:
		parser.error("You need to provide at least one adapter sequence.")

	try:
		reader = seqio.open(input_filename, file2=input_paired_filename,
				qualfile=quality_filename, colorspace=options.colorspace,
				fileformat=options.format)
	except (seqio.UnknownFileType, IOError) as e:
		parser.error(e)

	# Create the processing pipeline consisting of a list of "modifiers".
	modifiers = []
	if options.cut:
		if len(options.cut) > 2:
			parser.error("You cannot remove bases from more than two ends.")
		# Two values with the same sign have a positive product.
		if len(options.cut) == 2 and options.cut[0] * options.cut[1] > 0:
			parser.error("You cannot remove bases from the same end twice.")
		for cut in options.cut:
			if cut != 0:
				modifiers.append(UnconditionalCutter(cut))

	if cutoffs:
		modifiers.append(QualityTrimmer(cutoffs[0], cutoffs[1], options.quality_base))
	if adapters:
		modifiers.append(AdapterCutter(adapters, options.times,
				options.wildcard_file, options.info_file,
				rest_writer, options.action))

	# Modifiers that apply to both reads of paired-end reads
	modifiers_both = []
	if options.trim_n:
		modifiers_both.append(NEndTrimmer())
	if options.length_tag:
		modifiers_both.append(LengthTagModifier(options.length_tag))
	if options.strip_f3:
		options.strip_suffix.append('_F3')
	for suffix in options.strip_suffix:
		modifiers_both.append(SuffixRemover(suffix))
	if options.prefix or options.suffix:
		modifiers_both.append(PrefixSuffixAdder(options.prefix, options.suffix))
	if options.double_encode:
		modifiers_both.append(DoubleEncoder())
	if options.zero_cap and reader.delivers_qualities:
		modifiers_both.append(ZeroCapper(quality_base=options.quality_base))
	if options.trim_primer:
		# NOTE(review): PrimerTrimmer is appended without being called, unlike
		# the other modifiers -- presumably it is a plain function; confirm.
		modifiers_both.append(PrimerTrimmer)
	modifiers.extend(modifiers_both)

	# For paired-end data, create a second processing pipeline.
	# However, if no second-read adapters were given (via -A/-G/-B/-U), we need to
	# be backwards compatible and *no modifications* are done to the second read.
	modifiers2 = []
	if paired == 'both':
		if options.cut2:
			if len(options.cut2) > 2:
				parser.error("You cannot remove bases from more than two ends.")
			if len(options.cut2) == 2 and options.cut2[0] * options.cut2[1] > 0:
				parser.error("You cannot remove bases from the same end twice.")
			for cut in options.cut2:
				if cut != 0:
					modifiers2.append(UnconditionalCutter(cut))

		if cutoffs:
			modifiers2.append(QualityTrimmer(cutoffs[0], cutoffs[1], options.quality_base))
		if adapters2:
			modifiers2.append(AdapterCutter(adapters2, options.times,
					None, None, None, options.action))
		modifiers2.extend(modifiers_both)

	# Due to backwards compatibility, from here on logging output needs to be
	# sent to standard output instead of standard error if the -o option is used.
	if options.output:
		logger.root.handlers = []
		logging.basicConfig(level=logging.INFO, format='%(message)s', stream=sys.stdout)
	logger.info("This is cutadapt %s with Python %s", __version__, platform.python_version())
	logger.info("Command line parameters: %s", " ".join(cmdlineargs))
	logger.info("Trimming %s adapter%s with at most %.1f%% errors in %s mode ...",
		len(adapters) + len(adapters2), 's' if len(adapters) + len(adapters2) != 1 else '',
		options.error_rate * 100,
		{ False: 'single-end', 'first': 'paired-end legacy', 'both': 'paired-end' }[paired])

	# time.clock() no longer exists on Python 3.8+; keep using it where it is
	# available and fall back to time.perf_counter() otherwise.
	clock = getattr(time, 'clock', None) or time.perf_counter
	start_time = clock()
	try:
		if paired:
			stats = process_paired_reads(reader, modifiers, modifiers2, writers)
		else:
			stats = process_single_reads(reader, modifiers, writers)
	except KeyboardInterrupt:
		print("Interrupted", file=sys.stderr)
		sys.exit(130)
	except IOError as e:
		if e.errno == errno.EPIPE:
			sys.exit(1)
		raise
	except (seqio.FormatError, EOFError) as e:
		sys.exit("cutadapt: error: {0}".format(e))

	# close open files
	for f in [trimmed_outfile, untrimmed_outfile, trimmed_paired_outfile,
			untrimmed_paired_outfile, options.rest_file, options.wildcard_file,
			options.info_file, too_short_outfile, too_long_outfile,
			demultiplexer]:
		if f is not None and f is not sys.stdin and f is not sys.stdout:
			f.close()

	elapsed_time = clock() - start_time
	if not options.quiet:
		stats.collect((adapters, adapters2), elapsed_time,
			modifiers, modifiers2, writers)
		# send statistics to stderr if result was sent to stdout
		stat_file = sys.stderr if options.output is None else None
		with redirect_standard_output(stat_file):
			print_report(stats, (adapters, adapters2))
Esempio n. 7
0
 def parse_adapters(adapter, error_rate=None):
     """
     Convert a comma-separated adapter string into a list of Adapter objects.

     The pieces of ``adapter`` are handed to gather_adapters() as its first
     argument (the other two are empty lists); every (name, sequence, where)
     triple it yields becomes one Adapter created with ``error_rate``.
     """
     specifications = gather_adapters(adapter.split(','), [], [])
     return [
         Adapter(sequence, position, error_rate, name=adapter_name)
         for adapter_name, sequence, position in specifications
     ]
def main(cmdlineargs=None, default_outfile=sys.stdout):
    """
    Evaluate the command-line parameters and iterate over all reads.

    cmdlineargs -- list of command-line arguments; ``sys.argv[1:]`` is used
        when it is None.
    default_outfile -- file to which trimmed reads are sent if the ``-o``
        parameter is not used.

    Invalid parameters and option combinations are reported through
    ``parser.error()``. The function calls ``sys.exit(1)`` on a keyboard
    interrupt, on a broken pipe (EPIPE), on a seqio.FormatError, and when
    gathering the adapters raises an IOError with errno ENOENT. Any other
    IOError is re-raised.
    """
    parser = get_option_parser()
    if cmdlineargs is None:
        cmdlineargs = sys.argv[1:]
    options, args = parser.parse_args(args=cmdlineargs)

    if len(args) == 0:
        parser.error(
            "At least one parameter needed: name of a FASTA or FASTQ file.")
    elif len(args) > 2:
        parser.error("Too many parameters.")

    input_filename = args[0]

    # If a second file name was given, then we either have single-end reads
    # provided as a pair of .fasta/.qual files or we have paired-end reads.
    quality_filename = None
    input_paired_filename = None
    if len(args) == 2:
        if args[0].endswith('.qual'):
            parser.error("The QUAL file must be the second argument.")
        if args[1].endswith('.qual'):
            quality_filename = args[1]
        else:
            input_paired_filename = args[1]
            if not options.paired_output:
                parser.error(
                    'You must use --paired-output when trimming paired-end reads.'
                )

    if len(args) == 1 and options.paired_output:
        parser.error(
            "You specified a --paired-output file, but gave only one input file."
        )
    if options.paired_output and bool(options.untrimmed_output) != bool(
            options.untrimmed_paired_output):
        parser.error(
            "When trimming paired-end reads, you must use either none "
            "or both of the --untrimmed-output/--untrimmed-paired-output options."
        )
    if options.untrimmed_paired_output and not options.paired_output:
        parser.error(
            "Option --untrimmed-paired-output can only be used when "
            "trimming paired-end reads (with option --paired-output).")
    if input_filename.endswith('.qual'):
        parser.error("Need a FASTA file in addition to the QUAL file.")

    if options.format is not None and options.format.lower() not in [
            'fasta', 'fastq', 'sra-fastq'
    ]:
        parser.error(
            "The input file format must be either 'fasta', 'fastq' or "
            "'sra-fastq' (not '{0}').".format(options.format))

    # TODO should this really be an error?
    if options.format is not None and quality_filename is not None:
        parser.error(
            "If a pair of .fasta and .qual files is given, the -f/--format parameter cannot be used."
        )

    # Every object in ``writers`` is handed to the read-processing functions
    # below; length filters are registered before the regular output writer.
    writers = []
    too_short_outfile = None  # too short reads go here
    too_short_filter = None
    # TODO pass file name to TooShortReadFilter, add a .close() method?
    if options.minimum_length > 0:
        if options.too_short_output:
            too_short_outfile = xopen(options.too_short_output, 'w')
        too_short_filter = TooShortReadFilter(options.minimum_length,
                                              too_short_outfile)
        writers.append(too_short_filter)
    too_long_outfile = None  # too long reads go here
    too_long_filter = None
    if options.maximum_length < sys.maxsize:
        if options.too_long_output is not None:
            too_long_outfile = xopen(options.too_long_output, 'w')
        too_long_filter = TooLongReadFilter(options.maximum_length,
                                            too_long_outfile)
        writers.append(too_long_filter)

    demultiplexer = None
    if options.output is not None and '{name}' in options.output:
        # A '{name}' placeholder in the output path selects demultiplexing.
        if options.discard_trimmed:
            parser.error("Do not use --discard-trimmed when demultiplexing.")
        if input_paired_filename:
            parser.error(
                "Demultiplexing not supported for paired-end files, yet.")
        untrimmed = options.output.format(name='unknown')
        if options.untrimmed_output:
            untrimmed = options.untrimmed_output
        if options.discard_untrimmed:
            untrimmed = None
        demultiplexer = Demultiplexer(options.output, untrimmed)
        writers.append(demultiplexer)
        trimmed_outfile, untrimmed_outfile = None, None
        trimmed_paired_outfile, untrimmed_paired_outfile = None, None
    else:
        trimmed_outfile, untrimmed_outfile = trimmed_and_untrimmed_files(
            default_outfile, options.output, options.untrimmed_output,
            options.discard_trimmed, options.discard_untrimmed)

        trimmed_paired_outfile, untrimmed_paired_outfile = trimmed_and_untrimmed_files(
            None,  # applies when not trimming paired-end data
            options.paired_output,
            options.untrimmed_paired_output,
            options.discard_trimmed,
            options.discard_untrimmed)

        writer = ProcessedReadWriter(trimmed_outfile, trimmed_paired_outfile,
                                     untrimmed_outfile,
                                     untrimmed_paired_outfile)
        writers.append(writer)

    if options.maq:
        # --maq is a shortcut that switches on a fixed set of other options.
        options.colorspace = True
        options.double_encode = True
        options.trim_primer = True
        options.strip_suffix.append('_F3')
        options.suffix = "/1"
    if options.zero_cap is None:
        options.zero_cap = options.colorspace
    if options.trim_primer and not options.colorspace:
        parser.error("Trimming the primer makes only sense in colorspace.")
    if options.double_encode and not options.colorspace:
        parser.error("Double-encoding makes only sense in colorspace.")
    if options.anywhere and options.colorspace:
        parser.error(
            "Using --anywhere with colorspace reads is currently not supported (if you think this may be useful, contact the author)."
        )
    if not (0 <= options.error_rate <= 1.):
        parser.error("The maximum error rate must be between 0 and 1.")
    if options.overlap < 1:
        parser.error("The overlap must be at least 1.")

    # The file-name options are replaced in place by the opened file objects.
    if options.rest_file is not None:
        options.rest_file = xopen(options.rest_file, 'w')
        rest_writer = RestFileWriter(options.rest_file)
    else:
        rest_writer = None
    if options.info_file is not None:
        options.info_file = xopen(options.info_file, 'w')
    if options.wildcard_file is not None:
        options.wildcard_file = xopen(options.wildcard_file, 'w')

    if options.colorspace:
        if options.match_read_wildcards:
            parser.error('IUPAC wildcards not supported in colorspace')
        options.match_adapter_wildcards = False
    adapters = []
    ADAPTER_CLASS = ColorspaceAdapter if options.colorspace else Adapter

    try:
        for name, seq, where in gather_adapters(options.adapters,
                                                options.anywhere,
                                                options.front):
            if not seq:
                parser.error("The adapter sequence is empty")
            if not options.indels and where not in (PREFIX, SUFFIX):
                parser.error(
                    "Not allowing indels is currently supported only for anchored 5' and 3' adapters."
                )
            adapter = ADAPTER_CLASS(seq,
                                    where,
                                    options.error_rate,
                                    options.overlap,
                                    options.match_read_wildcards,
                                    options.match_adapter_wildcards,
                                    name=name,
                                    indels=options.indels)
            adapters.append(adapter)
    except IOError as e:
        if e.errno == errno.ENOENT:
            print("Error:", e, file=sys.stderr)
            sys.exit(1)
        raise

    # options.cut is used as a sequence below (len() and iteration), so it is
    # tested for emptiness; comparing it with 0 could never be true for a
    # list and silently disabled this check.
    nothing_to_do = (not adapters
                     and options.quality_cutoff == 0
                     and not options.cut
                     and options.minimum_length == 0
                     and options.maximum_length == sys.maxsize)
    if nothing_to_do:
        parser.error("You need to provide at least one adapter sequence.")

    if input_paired_filename:
        reader = seqio.PairedSequenceReader(input_filename,
                                            input_paired_filename,
                                            colorspace=options.colorspace,
                                            fileformat=options.format)
    else:
        reader = read_sequences(input_filename,
                                quality_filename,
                                colorspace=options.colorspace,
                                fileformat=options.format)

    # Create the processing pipeline as a list of "modifiers".
    modifiers = []
    if options.cut:
        if len(options.cut) > 2:
            parser.error("You cannot remove bases from more than two ends.")
        # Two values with the same sign have a positive product.
        if len(options.cut) == 2 and options.cut[0] * options.cut[1] > 0:
            parser.error("You cannot remove bases from the same end twice.")
        for cut in options.cut:
            if cut != 0:
                modifiers.append(UnconditionalCutter(cut))

    if options.quality_cutoff > 0:
        modifiers.append(
            QualityTrimmer(options.quality_cutoff, options.quality_base))
    if adapters:
        adapter_cutter = AdapterCutter(adapters, options.times,
                                       options.wildcard_file,
                                       options.info_file, rest_writer,
                                       options.action)
        modifiers.append(adapter_cutter)
    else:
        adapter_cutter = None
    if options.length_tag:
        modifiers.append(LengthTagModifier(options.length_tag))
    if options.strip_f3:
        options.strip_suffix.append('_F3')
    for suffix in options.strip_suffix:
        modifiers.append(SuffixRemover(suffix))
    if options.prefix or options.suffix:
        modifiers.append(PrefixSuffixAdder(options.prefix, options.suffix))
    if options.double_encode:
        modifiers.append(DoubleEncoder())
    if options.zero_cap and reader.delivers_qualities:
        modifiers.append(ZeroCapper(quality_base=options.quality_base))
    if options.trim_primer:
        modifiers.append(PrimerTrimmer)

    # time.clock() was removed in Python 3.8; use perf_counter() where it is
    # available and fall back to clock() on interpreters that lack it.
    clock = getattr(time, 'perf_counter', None) or time.clock
    start_time = clock()
    try:
        if input_paired_filename:
            stats = process_paired_reads(reader, modifiers, writers)
        else:
            stats = process_single_reads(reader, modifiers, writers)
    except KeyboardInterrupt:
        print("Interrupted", file=sys.stderr)
        sys.exit(1)
    except IOError as e:
        if e.errno == errno.EPIPE:
            sys.exit(1)
        raise
    except seqio.FormatError as e:
        print("Error:", e, file=sys.stderr)
        sys.exit(1)

    # close open files (each one listed once; the standard streams are never
    # closed here)
    for f in [
            trimmed_outfile, untrimmed_outfile, trimmed_paired_outfile,
            untrimmed_paired_outfile, options.rest_file, options.wildcard_file,
            options.info_file, too_short_outfile, too_long_outfile,
            demultiplexer
    ]:
        if f is not None and f is not sys.stdin and f is not sys.stdout:
            f.close()

    if not options.quiet:
        # send statistics to stderr if result was sent to stdout
        stat_file = sys.stderr if options.output is None else None
        print_statistics(adapters,
                         clock() - start_time,
                         stats,
                         options.action,
                         adapter_cutter.reads_matched if adapter_cutter else 0,
                         options.error_rate,
                         too_short_filter.too_short if too_short_filter else 0,
                         too_long_filter.too_long if too_long_filter else 0,
                         cmdlineargs,
                         file=stat_file)