コード例 #1
0
def test_quality_trimmer():
    """Run QualityTrimmer on one read with three cutoff combinations.

    Going by the expected values below, the first cutoff affects the start of
    the read and the second one affects its end; a cutoff of 0 leaves that
    side untouched.
    """
    read = Sequence('read1', 'ACGTTTACGTA', '##456789###')

    # (first cutoff, second cutoff, expected bases, expected qualities)
    cases = (
        (10, 10, 'GTTTAC', '456789'),
        (0, 10, 'ACGTTTAC', '##456789'),
        (10, 0, 'GTTTACGTA', '456789###'),
    )
    for first_cutoff, second_cutoff, bases, qualities in cases:
        trimmer = QualityTrimmer(first_cutoff, second_cutoff, 33)
        assert trimmer(read) == Sequence('read1', bases, qualities)
コード例 #2
0
ファイル: digest.py プロジェクト: wan230114/miRge3.0
def stipulate(args):
    """
    Assemble the list of read modifiers that the parsed arguments ask for.

    The original note on this function says it exists to create the callables
    that are run with Cutadapt 2.7 and that it is called only one time.

    The modifiers are appended in this order: NextSeq quality trimming,
    quality trimming, adapter cutting, N-end trimming and finally whatever
    add_unconditional_cutters() registers for ``args.cut``.

    The resulting list is printed to standard output and returned.
    """
    parser = AdapterParser(
        max_error_rate=args.error_rate,
        min_overlap=args.overlap,
        read_wildcards=args.match_read_wildcards,
        adapter_wildcards=args.match_adapter_wildcards,
        indels=args.indels,
    )
    adapters = parser.parse_multi(args.adapters)
    warn_duplicate_adapters(adapters)

    modifiers = []
    register = modifiers.append

    # Quality-based trimming comes before adapter removal.
    if args.nextseq_trim is not None:
        register(NextseqQualityTrimmer(args.nextseq_trim, args.phred64))
    if args.quality_cutoff is not None:
        cutoffs = parse_cutoffs(args.quality_cutoff)
        register(QualityTrimmer(cutoffs[0], cutoffs[1], args.phred64))

    if adapters:
        register(AdapterCutter(adapters, args.times, args.action))
    if args.trim_n:
        register(NEndTrimmer())
    # The helper receives the list's append method as its registration hook.
    add_unconditional_cutters(register, args.cut)

    print("modifiers (cutadapt):", modifiers)
    return modifiers
コード例 #3
0
 def __init__(self, queue=None, results=None, adapter=None, phred64=False):
     """Set up the worker and its chain of read modifiers.

     queue, results -- stored unchanged on the instance.
     adapter -- selects the trimming step that follows quality trimming:
         * None or 'none': no further modifier is added;
         * a string starting with '+': it is converted with int() and
           passed to UnconditionalCutter;
         * anything else: parsed with parse_adapters() and wrapped in an
           AdapterCutter.
     phred64 -- when truthy, 64 is passed to QualityTrimmer as its third
         argument, otherwise 33.
     """
     super(Worker, self).__init__()
     self.queue = queue
     self.results = results
     self.phred = 64 if phred64 else 33
     # Quality trimming is always the first modifier.
     self.modifiers = [QualityTrimmer(0, 10, self.phred)]
     self.adapters = []
     self.error_rate = 0.12
     self.min_length = 16
     # The default adapter=None used to fail with AttributeError on
     # adapter.startswith(); it is now handled like the explicit 'none'.
     if adapter is None or adapter == 'none':
         self.adapter = None
     elif adapter.startswith('+'):
         self.modifiers.append(UnconditionalCutter(int(adapter)))
     else:
         self.adapters = parse_adapters(adapter, error_rate=self.error_rate)
         adapter_cutter = AdapterCutter(self.adapters)
         self.modifiers.append(adapter_cutter)
コード例 #4
0
def main(cmdlineargs=None, default_outfile=sys.stdout):
    """
    Evaluate the command-line parameters and process all reads.

    cmdlineargs -- list of argument strings; when None, sys.argv[1:] is used.
    default_outfile -- file to which trimmed reads are sent if the ``-o``
        parameter is not used.

    Invalid options and option combinations are reported via parser.error().
    While reads are processed, a KeyboardInterrupt ends the program with
    sys.exit(130), a broken pipe (EPIPE) with sys.exit(1), and a
    seqio.FormatError or EOFError with sys.exit() and an error message.
    Unless options.quiet is set, a report is printed at the end.
    """
    logging.basicConfig(level=logging.INFO,
                        format='%(message)s')  #  %(levelname)s
    parser = get_option_parser()
    if cmdlineargs is None:
        cmdlineargs = sys.argv[1:]
    options, args = parser.parse_args(args=cmdlineargs)

    if len(args) == 0:
        parser.error(
            "At least one parameter needed: name of a FASTA or FASTQ file.")
    elif len(args) > 2:
        parser.error("Too many parameters.")
    input_filename = args[0]

    # Find out which 'mode' we need to use.
    # Default: single-read trimming (neither -p nor -A/-G/-B/-U given)
    paired = False
    if options.paired_output:
        # Modify first read only, keep second in sync (-p given, but not -A/-G/-B/-U).
        # This exists for backwards compatibility ('legacy mode').
        paired = 'first'
    if options.adapters2 or options.front2 or options.anywhere2 or options.cut2:
        # Full paired-end trimming when both -p and -A/-G/-B/-U given
        # Also the read modifications (such as quality trimming) are applied
        # to second read.
        paired = 'both'

    if paired and len(args) == 1:
        parser.error(
            "When paired-end trimming is enabled via -A/-G/-B/-U or -p, "
            "two input files are required.")
    if paired:
        # The second positional argument is the file with the second reads.
        input_paired_filename = args[1]
        quality_filename = None
    else:
        # For single-end data, a second positional argument is a QUAL file.
        input_paired_filename = None
        if len(args) == 2:
            if args[0].endswith('.qual'):
                parser.error("The QUAL file must be the second argument.")
            quality_filename = args[1]
        else:
            quality_filename = None

    if paired:
        if not options.paired_output:
            parser.error(
                "When paired-end trimming is enabled via -A/-G/-B/-U, "
                "a second output file needs to be specified via -p (--paired-output)."
            )
        if bool(options.untrimmed_output) != bool(
                options.untrimmed_paired_output):
            parser.error(
                "When trimming paired-end reads, you must use either none "
                "or both of the --untrimmed-output/--untrimmed-paired-output options."
            )
    else:
        if options.untrimmed_paired_output:
            parser.error(
                "Option --untrimmed-paired-output can only be used when "
                "trimming paired-end reads (with option -p).")
        if input_filename.endswith('.qual'):
            parser.error("Need a FASTA file in addition to the QUAL file.")
        if options.format is not None and quality_filename is not None:
            parser.error(
                "If a pair of .fasta and .qual files is given, the -f/--format parameter cannot be used."
            )

    if options.format is not None and options.format.lower() not in [
            'fasta', 'fastq', 'sra-fastq'
    ]:
        parser.error(
            "The input file format must be either 'fasta', 'fastq' or "
            "'sra-fastq' (not '{0}').".format(options.format))

    # Parse the quality cutoff: either a single value (paired with a first
    # cutoff of 0) or two comma-separated values.
    if options.quality_cutoff is not None:
        cutoffs = options.quality_cutoff.split(',')
        if len(cutoffs) == 1:
            try:
                cutoffs = [0, int(cutoffs[0])]
            except ValueError as e:
                parser.error(
                    "Quality cutoff value not recognized: {0}".format(e))
        elif len(cutoffs) == 2:
            try:
                cutoffs = [int(cutoffs[0]), int(cutoffs[1])]
            except ValueError as e:
                parser.error(
                    "Quality cutoff value not recognized: {0}".format(e))
        else:
            parser.error(
                "Expected one value or two values separated by comma for the quality cutoff"
            )
    else:
        cutoffs = None

    # Build the list of "writers": filters and output stages that are handed
    # to the read-processing functions below.
    writers = []
    too_short_outfile = None  # too short reads go here
    too_short_filter = None
    # TODO pass file name to TooShortReadFilter, add a .close() method?
    if options.minimum_length > 0:
        if options.too_short_output:
            too_short_outfile = xopen(options.too_short_output, 'w')
        else:
            too_short_outfile = None
        too_short_filter = TooShortReadFilter(options.minimum_length,
                                              too_short_outfile,
                                              paired == 'both')
        writers.append(too_short_filter)
    too_long_outfile = None  # too long reads go here
    too_long_filter = None
    if options.maximum_length < sys.maxsize:
        if options.too_long_output is not None:
            too_long_outfile = xopen(options.too_long_output, 'w')
        else:
            too_long_outfile = None
        too_long_filter = TooLongReadFilter(options.maximum_length,
                                            too_long_outfile,
                                            check_second=paired == 'both')
        writers.append(too_long_filter)

    if options.max_n != -1:
        writers.append(
            NContentFilter(options.max_n, check_second=paired == 'both'))

    demultiplexer = None
    if options.output is not None and '{name}' in options.output:
        # A '{name}' placeholder in the output path switches on demultiplexing.
        if options.discard_trimmed:
            parser.error("Do not use --discard-trimmed when demultiplexing.")
        if paired:
            parser.error(
                "Demultiplexing not supported for paired-end files, yet.")
        untrimmed = options.output.format(name='unknown')
        if options.untrimmed_output:
            untrimmed = options.untrimmed_output
        if options.discard_untrimmed:
            untrimmed = None
        demultiplexer = Demultiplexer(options.output, untrimmed)
        writers.append(demultiplexer)
        trimmed_outfile, untrimmed_outfile = None, None
        trimmed_paired_outfile, untrimmed_paired_outfile = None, None
    else:
        trimmed_outfile, untrimmed_outfile = trimmed_and_untrimmed_files(
            default_outfile, options.output, options.untrimmed_output,
            options.discard_trimmed, options.discard_untrimmed)

        trimmed_paired_outfile, untrimmed_paired_outfile = trimmed_and_untrimmed_files(
            None,  # applies when not trimming paired-end data
            options.paired_output,
            options.untrimmed_paired_output,
            options.discard_trimmed,
            options.discard_untrimmed)

        if untrimmed_outfile or untrimmed_paired_outfile:
            writers.append(
                DiscardUntrimmedFilter(untrimmed_outfile,
                                       untrimmed_paired_outfile,
                                       check_second=paired == 'both'))
        writer = DiscardTrimmedFilter(trimmed_outfile,
                                      trimmed_paired_outfile,
                                      check_second=paired == 'both')
        writers.append(writer)
        del writer

    # --maq is a shortcut that switches on several colorspace-related options.
    if options.maq:
        options.colorspace = True
        options.double_encode = True
        options.trim_primer = True
        options.strip_suffix.append('_F3')
        options.suffix = "/1"
    if options.zero_cap is None:
        options.zero_cap = options.colorspace
    if options.trim_primer and not options.colorspace:
        parser.error("Trimming the primer makes only sense in colorspace.")
    if options.double_encode and not options.colorspace:
        parser.error("Double-encoding makes only sense in colorspace.")
    if options.anywhere and options.colorspace:
        parser.error(
            "Using --anywhere with colorspace reads is currently not supported (if you think this may be useful, contact the author)."
        )
    if not (0 <= options.error_rate <= 1.):
        parser.error("The maximum error rate must be between 0 and 1.")
    if options.overlap < 1:
        parser.error("The overlap must be at least 1.")

    # The file-name options are replaced in place by the opened file objects.
    if options.rest_file is not None:
        options.rest_file = xopen(options.rest_file, 'w')
        rest_writer = RestFileWriter(options.rest_file)
    else:
        rest_writer = None
    if options.info_file is not None:
        options.info_file = xopen(options.info_file, 'w')
    if options.wildcard_file is not None:
        options.wildcard_file = xopen(options.wildcard_file, 'w')

    if options.colorspace:
        if options.match_read_wildcards:
            parser.error('IUPAC wildcards not supported in colorspace')
        options.match_adapter_wildcards = False

    ADAPTER_CLASS = ColorspaceAdapter if options.colorspace else Adapter
    try:
        # TODO refactor this a bit
        def collect(back, anywhere, front):
            """Create one adapter object per entry yielded by gather_adapters()."""
            adapters = []
            for name, seq, where in gather_adapters(back, anywhere, front):
                if not seq:
                    parser.error("The adapter sequence is empty.")
                adapter = ADAPTER_CLASS(seq,
                                        where,
                                        options.error_rate,
                                        options.overlap,
                                        options.match_read_wildcards,
                                        options.match_adapter_wildcards,
                                        name=name,
                                        indels=options.indels)
                if options.debug:
                    adapter.enable_debug()
                adapters.append(adapter)
            return adapters

        adapters = collect(options.adapters, options.anywhere, options.front)
        adapters2 = collect(options.adapters2, options.anywhere2,
                            options.front2)
    except IOError as e:
        # Only a missing file is reported as a usage error.
        if e.errno == errno.ENOENT:
            parser.error(e)
        raise

    # Refuse to run when nothing at all would be done to the reads.
    if not adapters and not adapters2 and not cutoffs and \
      options.cut == [] and options.cut2 == [] and \
      options.minimum_length == 0 and \
      options.maximum_length == sys.maxsize and \
      quality_filename is None and \
      options.max_n == -1:
        parser.error("You need to provide at least one adapter sequence.")

    try:
        reader = seqio.open(input_filename,
                            file2=input_paired_filename,
                            qualfile=quality_filename,
                            colorspace=options.colorspace,
                            fileformat=options.format)
    except (seqio.UnknownFileType, IOError) as e:
        parser.error(e)

    # Create the processing pipeline consisting of a list of "modifiers".
    modifiers = []
    if options.cut:
        if len(options.cut) > 2:
            parser.error("You cannot remove bases from more than two ends.")
        # Two values with the same sign would address the same end.
        if len(options.cut) == 2 and options.cut[0] * options.cut[1] > 0:
            parser.error("You cannot remove bases from the same end twice.")
        for cut in options.cut:
            if cut != 0:
                modifiers.append(UnconditionalCutter(cut))

    if cutoffs:
        modifiers.append(
            QualityTrimmer(cutoffs[0], cutoffs[1], options.quality_base))
    if adapters:
        adapter_cutter = AdapterCutter(adapters, options.times,
                                       options.wildcard_file,
                                       options.info_file, rest_writer,
                                       options.action)
        modifiers.append(adapter_cutter)
    else:
        adapter_cutter = None

    # Modifiers that apply to both reads of paired-end reads
    modifiers_both = []
    if options.trim_n:
        modifiers_both.append(NEndTrimmer())
    if options.length_tag:
        modifiers_both.append(LengthTagModifier(options.length_tag))
    if options.strip_f3:
        options.strip_suffix.append('_F3')
    for suffix in options.strip_suffix:
        modifiers_both.append(SuffixRemover(suffix))
    if options.prefix or options.suffix:
        modifiers_both.append(PrefixSuffixAdder(options.prefix,
                                                options.suffix))
    if options.double_encode:
        modifiers_both.append(DoubleEncoder())
    if options.zero_cap and reader.delivers_qualities:
        modifiers_both.append(ZeroCapper(quality_base=options.quality_base))
    if options.trim_primer:
        # NOTE(review): PrimerTrimmer is appended without being called, unlike
        # the other modifiers -- confirm that it is a plain function.
        modifiers_both.append(PrimerTrimmer)
    modifiers.extend(modifiers_both)

    # For paired-end data, create a second processing pipeline.
    # However, if no second-read adapters were given (via -A/-G/-B/-U), we need to
    # be backwards compatible and *no modifications* are done to the second read.
    modifiers2 = []
    if paired == 'both':
        if options.cut2:
            if len(options.cut2) > 2:
                parser.error(
                    "You cannot remove bases from more than two ends.")
            if len(options.cut2
                   ) == 2 and options.cut2[0] * options.cut2[1] > 0:
                parser.error(
                    "You cannot remove bases from the same end twice.")
            for cut in options.cut2:
                if cut != 0:
                    modifiers2.append(UnconditionalCutter(cut))

        if cutoffs:
            modifiers2.append(
                QualityTrimmer(cutoffs[0], cutoffs[1], options.quality_base))
        if adapters2:
            adapter_cutter2 = AdapterCutter(adapters2, options.times, None,
                                            None, None, options.action)
            modifiers2.append(adapter_cutter2)
        else:
            adapter_cutter2 = None
        modifiers2.extend(modifiers_both)

    # Due to backwards compatibility, from here on logging output needs to be
    # sent to standard output instead of standard error if the -o option is used.
    if options.output:
        logger.root.handlers = []
        logging.basicConfig(level=logging.INFO,
                            format='%(message)s',
                            stream=sys.stdout)
    logger.info("This is cutadapt %s with Python %s", __version__,
                platform.python_version())
    logger.info("Command line parameters: %s", " ".join(cmdlineargs))
    logger.info(
        "Trimming %s adapter%s with at most %.1f%% errors in %s mode ...",
        len(adapters) + len(adapters2),
        's' if len(adapters) + len(adapters2) != 1 else '',
        options.error_rate * 100, {
            False: 'single-end',
            'first': 'paired-end legacy',
            'both': 'paired-end'
        }[paired])

    # time.clock() was removed in Python 3.8. Use time.perf_counter() where it
    # exists and only fall back to time.clock() on interpreters that lack it.
    timer = getattr(time, 'perf_counter', None) or time.clock
    start_time = timer()
    try:
        if paired:
            stats = process_paired_reads(reader, modifiers, modifiers2,
                                         writers)
        else:
            stats = process_single_reads(reader, modifiers, writers)
    except KeyboardInterrupt as e:
        print("Interrupted", file=sys.stderr)
        sys.exit(130)
    except IOError as e:
        if e.errno == errno.EPIPE:
            sys.exit(1)
        raise
    except (seqio.FormatError, EOFError) as e:
        sys.exit("cutadapt: error: {0}".format(e))

    # close open files (each one exactly once; the standard streams stay open)
    for f in [
            trimmed_outfile, untrimmed_outfile, trimmed_paired_outfile,
            untrimmed_paired_outfile, options.rest_file, options.wildcard_file,
            options.info_file, too_short_outfile, too_long_outfile,
            demultiplexer
    ]:
        if f is not None and f is not sys.stdin and f is not sys.stdout:
            f.close()

    elapsed_time = timer() - start_time
    if not options.quiet:
        stats.collect((adapters, adapters2), elapsed_time, modifiers,
                      modifiers2, writers)
        # send statistics to stderr if result was sent to stdout
        stat_file = sys.stderr if options.output is None else None
        with redirect_standard_output(stat_file):
            print_report(stats, (adapters, adapters2))
コード例 #5
0
def pipeline_from_parsed_args(options, paired, pair_filter_mode, quality_filename, is_interleaved_output):
	"""
	Build the read-processing pipeline described by the parsed options.

	The option combinations are validated first; every problem found is
	reported by raising CommandLineError.

	options -- parsed command-line options; some attributes are normalised
		in place (for example the --maq shortcuts and action 'none').
	paired -- falsy for single-end data; the value 'first' makes the
		paired-end pipeline modify the first read only.
	pair_filter_mode -- handed to PairedEndPipeline.
	quality_filename -- only used to reject -f/--format when it is given.
	is_interleaved_output -- when true, the -p/-o checks are skipped.

	Returns a PairedEndPipeline if ``paired`` is truthy, otherwise a
	SingleEndPipeline.
	"""
	# --- Validation of output-related options -------------------------------
	if paired:
		if not is_interleaved_output:
			if not options.paired_output:
				raise CommandLineError("When paired-end trimming is enabled via -A/-G/-B/-U, "
					"a second output file needs to be specified via -p (--paired-output).")
			if not options.output:
				raise CommandLineError("When you use -p or --paired-output, you must also "
					"use the -o option.")

		if bool(options.untrimmed_output) != bool(options.untrimmed_paired_output):
			raise CommandLineError("When trimming paired-end reads, you must use either none "
				"or both of the --untrimmed-output/--untrimmed-paired-output options.")
		if options.too_short_output and not options.too_short_paired_output:
			raise CommandLineError("When using --too-short-output with paired-end "
				"reads, you also need to use --too-short-paired-output")
		if options.too_long_output and not options.too_long_paired_output:
			raise CommandLineError("When using --too-long-output with paired-end "
				"reads, you also need to use --too-long-paired-output")
	else:
		if options.untrimmed_paired_output:
			raise CommandLineError("Option --untrimmed-paired-output can only be used when "
				"trimming paired-end reads (with option -p).")
		if quality_filename is not None and options.format is not None:
			raise CommandLineError('If a pair of .fasta and .qual files is given, the -f/--format '
				'parameter cannot be used.')

	if options.format is not None and options.format.lower() not in ('fasta', 'fastq', 'sra-fastq'):
		raise CommandLineError("The input file format must be either 'fasta', 'fastq' or "
			"'sra-fastq' (not '{0}').".format(options.format))

	# --- Colorspace shortcuts and sanity checks -----------------------------
	if options.maq:
		# --maq switches on a bundle of colorspace-related options
		options.colorspace = True
		options.double_encode = True
		options.trim_primer = True
		options.strip_suffix.append('_F3')
		options.suffix = "/1"
	if options.zero_cap is None:
		options.zero_cap = options.colorspace
	if options.trim_primer and not options.colorspace:
		raise CommandLineError("Trimming the primer makes only sense in colorspace.")
	if options.double_encode and not options.colorspace:
		raise CommandLineError("Double-encoding makes only sense in colorspace.")
	if options.anywhere and options.colorspace:
		raise CommandLineError("Using --anywhere with colorspace reads is currently not supported "
			"(if you think this may be useful, contact the author).")
	if not (0 <= options.error_rate <= 1.):
		raise CommandLineError("The maximum error rate must be between 0 and 1.")
	if options.overlap < 1:
		raise CommandLineError("The overlap must be at least 1.")
	if not (0 <= options.gc_content <= 100):
		raise CommandLineError("GC content must be given as percentage between 0 and 100")
	if options.action == 'none':
		options.action = None

	if options.colorspace:
		if options.match_read_wildcards:
			raise CommandLineError('IUPAC wildcards not supported in colorspace')
		options.match_adapter_wildcards = False

	# --- Adapters -----------------------------------------------------------
	parser = AdapterParser(
		colorspace=options.colorspace,
		max_error_rate=options.error_rate,
		min_overlap=options.overlap,
		read_wildcards=options.match_read_wildcards,
		adapter_wildcards=options.match_adapter_wildcards,
		indels=options.indels)

	try:
		adapters = parser.parse_multi(options.adapters, options.anywhere, options.front)
		adapters2 = parser.parse_multi(options.adapters2, options.anywhere2, options.front2)
	except IOError as e:
		# only a missing file is turned into a command-line error
		if e.errno == errno.ENOENT:
			raise CommandLineError(e)
		raise
	except ValueError as e:
		raise CommandLineError(e)
	if options.debug:
		for adapter in adapters + adapters2:
			adapter.enable_debug()

	# --- Pipeline construction ----------------------------------------------
	# If no second-read adapters were given (via -A/-G/-B/-U), we need to
	# be backwards compatible and *no modifications* are done to the second read.
	if not paired:
		pipeline = SingleEndPipeline()
	else:
		pipeline = PairedEndPipeline(pair_filter_mode, modify_first_read_only=paired == 'first')

	def add_unconditional_cutters(lengths, adder_name):
		"""Validate one -u/-U value list and register a cutter per non-zero entry."""
		if not lengths:
			return
		if len(lengths) > 2:
			raise CommandLineError("You cannot remove bases from more than two ends.")
		# two values with the same sign refer to the same end of the read
		if len(lengths) == 2 and lengths[0] * lengths[1] > 0:
			raise CommandLineError("You cannot remove bases from the same end twice.")
		for length in lengths:
			if length != 0:
				getattr(pipeline, adder_name)(UnconditionalCutter(length))

	add_unconditional_cutters(options.cut, 'add1')
	add_unconditional_cutters(options.cut2, 'add2')

	if options.nextseq_trim is not None:
		pipeline.add(NextseqQualityTrimmer(options.nextseq_trim, options.quality_base))
	if options.quality_cutoff is not None:
		cutoffs = parse_cutoffs(options.quality_cutoff)
		pipeline.add(QualityTrimmer(cutoffs[0], cutoffs[1], options.quality_base))

	if adapters:
		cutter_r1 = AdapterCutter(adapters, options.times, options.action)
		pipeline.add1(cutter_r1)
	if adapters2:
		cutter_r2 = AdapterCutter(adapters2, options.times, options.action)
		pipeline.add2(cutter_r2)

	# Modifiers that apply to both reads of paired-end reads unless in legacy mode
	if options.length is not None:
		pipeline.add(Shortener(options.length))
	if options.trim_n:
		pipeline.add(NEndTrimmer())
	if options.length_tag:
		pipeline.add(LengthTagModifier(options.length_tag))
	if options.strip_f3:
		options.strip_suffix.append('_F3')
	for suffix in options.strip_suffix:
		pipeline.add(SuffixRemover(suffix))
	if options.prefix or options.suffix:
		pipeline.add(PrefixSuffixAdder(options.prefix, options.suffix))
	if options.double_encode:
		pipeline.add(DoubleEncoder())
	if options.zero_cap:
		pipeline.add(ZeroCapper(quality_base=options.quality_base))
	if options.trim_primer:
		pipeline.add(PrimerTrimmer())

	# --- Filtering parameters -----------------------------------------------
	# Minimum/maximum length
	for attr in 'minimum_length', 'maximum_length':
		raw_value = getattr(options, attr)
		if raw_value is None:
			continue
		lengths = parse_lengths(raw_value)
		if paired:
			# a single value applies to both reads of a pair
			if len(lengths) == 1:
				lengths = (lengths[0], lengths[0])
		elif len(lengths) == 2:
			raise CommandLineError('Two minimum or maximum lengths given for single-end data')
		setattr(pipeline, attr, lengths)
	pipeline.max_n = options.max_n
	pipeline.discard_casava = options.discard_casava
	pipeline.discard_trimmed = options.discard_trimmed
	pipeline.discard_untrimmed = options.discard_untrimmed

	return pipeline
コード例 #6
0
def pipeline_from_parsed_args(args, paired, file_opener, adapters,
                              adapters2) -> Pipeline:
    """
    Build the processing pipeline from parsed command-line arguments.

    ``adapters`` and ``adapters2`` are the already parsed adapters for the
    first and second read. ``file_opener`` is handed to the pipeline
    constructor.

    A CommandLineError is raised if two minimum or maximum lengths are given
    for single-end data.

    Returns a PairedEndPipeline when ``paired`` is truthy, otherwise a
    SingleEndPipeline.
    """
    if args.action == 'none':
        args.action = None

    # Create the processing pipeline
    if not paired:
        pipeline = SingleEndPipeline(file_opener)
    else:
        if args.pair_filter is None:
            pair_filter_mode = 'any'
        else:
            pair_filter_mode = args.pair_filter
        pipeline = PairedEndPipeline(pair_filter_mode,
                                     file_opener)  # type: Any

    # When adapters are being trimmed only in R1 or R2, override the pair filter mode
    # as using the default of 'any' would regard all read pairs as untrimmed.
    if isinstance(pipeline, PairedEndPipeline):
        if not (adapters2 and adapters):
            if (args.discard_untrimmed or args.untrimmed_output
                    or args.untrimmed_paired_output):
                pipeline.override_untrimmed_pair_filter = True

    add_unconditional_cutters(pipeline, args.cut, args.cut2, paired)

    # Everything below is registered for both reads when paired.
    if paired:
        register = pipeline.add_both
    else:
        register = pipeline.add

    if args.nextseq_trim is not None:
        register(NextseqQualityTrimmer(args.nextseq_trim, args.quality_base))
    if args.quality_cutoff is not None:
        cutoffs = parse_cutoffs(args.quality_cutoff)
        register(QualityTrimmer(cutoffs[0], cutoffs[1], args.quality_base))

    add_adapter_cutter(
        pipeline,
        adapters,
        adapters2,
        paired,
        args.pair_adapters,
        args.action,
        args.times,
        args.reverse_complement,
        args.index,
    )

    for modifier in modifiers_applying_to_both_ends_if_paired(args):
        register(modifier)

    # Set filtering parameters
    # Minimum/maximum length
    for attr in 'minimum_length', 'maximum_length':
        raw_value = getattr(args, attr)
        if raw_value is None:
            continue
        lengths = parse_lengths(raw_value)
        if paired:
            # A single value applies to both reads of a pair.
            if len(lengths) == 1:
                lengths = (lengths[0], lengths[0])
        elif len(lengths) == 2:
            raise CommandLineError(
                'Two minimum or maximum lengths given for single-end data')
        setattr(pipeline, attr, lengths)
    pipeline.max_n = args.max_n
    pipeline.max_expected_errors = args.max_expected_errors
    pipeline.discard_casava = args.discard_casava
    pipeline.discard_trimmed = args.discard_trimmed
    pipeline.discard_untrimmed = args.discard_untrimmed

    return pipeline
コード例 #7
0
def pipeline_from_parsed_args(args, paired, is_interleaved_output):
    """
    Build the processing pipeline from parsed command-line arguments.

    The arguments are validated first; every problem found is reported by
    raising CommandLineError. A deprecation warning is logged when --format
    is given.

    args -- parsed command-line arguments; ``args.action`` is normalised
        from 'none' to None in place.
    paired -- truthy when paired-end data is processed.
    is_interleaved_output -- when true, the -p/-o checks are skipped.

    Returns a PairedEndPipeline when ``paired`` is truthy, otherwise a
    SingleEndPipeline.
    """
    # ---- Validation of output-related options ----
    if paired:
        if not is_interleaved_output:
            if not args.paired_output:
                raise CommandLineError(
                    "When a paired-end trimming option such as -A/-G/-B/-U, "
                    "is used, a second output file needs to be specified via -p (--paired-output)."
                )
            if not args.output:
                raise CommandLineError(
                    "When you use -p or --paired-output, you must also "
                    "use the -o option.")

        if bool(args.untrimmed_output) != bool(args.untrimmed_paired_output):
            raise CommandLineError(
                "When trimming paired-end reads, you must use either none "
                "or both of the --untrimmed-output/--untrimmed-paired-output options."
            )
        if args.too_short_output and not args.too_short_paired_output:
            raise CommandLineError(
                "When using --too-short-output with paired-end "
                "reads, you also need to use --too-short-paired-output")
        if args.too_long_output and not args.too_long_paired_output:
            raise CommandLineError(
                "When using --too-long-output with paired-end "
                "reads, you also need to use --too-long-paired-output")
    elif args.untrimmed_paired_output:
        raise CommandLineError(
            "Option --untrimmed-paired-output can only be used when "
            "trimming paired-end reads (with option -p).")

    if args.format is not None:
        logger.warning(
            "Option --format is deprecated and ignored because the input file format is "
            "always auto-detected")

    # ---- Range checks ----
    if not (0 <= args.error_rate < 1.):
        raise CommandLineError(
            "The maximum error rate must be at least 0 and less than 1.")
    if args.overlap < 1:
        raise CommandLineError("The overlap must be at least 1.")
    if not (0 <= args.gc_content <= 100):
        raise CommandLineError(
            "GC content must be given as percentage between 0 and 100")
    if args.action == 'none':
        args.action = None

    # ---- Adapters ----
    parser = AdapterParser(
        max_error_rate=args.error_rate,
        min_overlap=args.overlap,
        read_wildcards=args.match_read_wildcards,
        adapter_wildcards=args.match_adapter_wildcards,
        indels=args.indels,
    )
    try:
        adapters = parser.parse_multi(args.adapters, args.anywhere,
                                      args.front)
        adapters2 = parser.parse_multi(args.adapters2, args.anywhere2,
                                       args.front2)
    except IOError as e:
        # Only a missing file is turned into a command-line error.
        if e.errno == errno.ENOENT:
            raise CommandLineError(e)
        raise
    except ValueError as e:
        raise CommandLineError(e)
    if args.debug:
        for adapter in adapters + adapters2:
            adapter.enable_debug()

    # ---- Pipeline construction ----
    if not paired:
        pipeline = SingleEndPipeline()
    else:
        if args.pair_filter is None:
            pair_filter_mode = 'any'
        else:
            pair_filter_mode = args.pair_filter
        pipeline = PairedEndPipeline(pair_filter_mode)

    # When adapters are being trimmed only in R1 or R2, override the pair filter mode
    # as using the default of 'any' would regard all read pairs as untrimmed.
    if paired and not (adapters2 and adapters):
        if (args.discard_untrimmed or args.untrimmed_output
                or args.untrimmed_paired_output):
            pipeline.override_untrimmed_pair_filter = True

    # Unconditional cutting: index 0 holds the values for R1, index 1 for R2.
    for read_index, lengths in enumerate((args.cut, args.cut2)):
        if not lengths:
            continue
        if len(lengths) > 2:
            raise CommandLineError(
                "You cannot remove bases from more than two ends.")
        # Two values with the same sign refer to the same end of the read.
        if len(lengths) == 2 and lengths[0] * lengths[1] > 0:
            raise CommandLineError(
                "You cannot remove bases from the same end twice.")
        for length in lengths:
            if length == 0:
                continue
            if read_index != 0:
                # R2
                assert isinstance(pipeline, PairedEndPipeline)
                pipeline.add(None, UnconditionalCutter(length))
            elif paired:
                pipeline.add(UnconditionalCutter(length), None)
            else:
                pipeline.add(UnconditionalCutter(length))

    # Everything registered through this callable applies to both reads
    # when processing paired-end data.
    if paired:
        register = pipeline.add_both
    else:
        register = pipeline.add

    if args.nextseq_trim is not None:
        register(NextseqQualityTrimmer(args.nextseq_trim, args.quality_base))
    if args.quality_cutoff is not None:
        cutoffs = parse_cutoffs(args.quality_cutoff)
        register(QualityTrimmer(cutoffs[0], cutoffs[1], args.quality_base))

    if not args.pair_adapters:
        # Independent adapter trimming of R1 and R2.
        cutter_r1 = AdapterCutter(adapters, args.times,
                                  args.action) if adapters else None
        cutter_r2 = AdapterCutter(adapters2, args.times,
                                  args.action) if adapters2 else None
        if not paired:
            if cutter_r1:
                pipeline.add(cutter_r1)
        elif cutter_r1 or cutter_r2:
            pipeline.add(cutter_r1, cutter_r2)
    else:
        if not paired:
            raise CommandLineError(
                "Option --pair-adapters can only be used when trimming "
                "paired-end reads")
        if args.times != 1:
            raise CommandLineError(
                "--pair-adapters cannot be used with --times")
        try:
            paired_cutter = PairedAdapterCutter(adapters, adapters2,
                                                args.action)
        except PairedAdapterCutterError as e:
            raise CommandLineError("--pair-adapters: " + str(e))
        pipeline.add_paired_modifier(paired_cutter)

    # Remaining modifiers that apply to both reads of paired-end reads
    if args.length is not None:
        register(Shortener(args.length))
    if args.trim_n:
        register(NEndTrimmer())
    if args.length_tag:
        register(LengthTagModifier(args.length_tag))
    for suffix in args.strip_suffix:
        register(SuffixRemover(suffix))
    if args.prefix or args.suffix:
        register(PrefixSuffixAdder(args.prefix, args.suffix))
    if args.zero_cap:
        register(ZeroCapper(quality_base=args.quality_base))

    # ---- Filtering parameters ----
    # Minimum/maximum length
    for attr in 'minimum_length', 'maximum_length':
        raw_value = getattr(args, attr)
        if raw_value is None:
            continue
        lengths = parse_lengths(raw_value)
        if paired:
            # A single value applies to both reads of a pair.
            if len(lengths) == 1:
                lengths = (lengths[0], lengths[0])
        elif len(lengths) == 2:
            raise CommandLineError(
                'Two minimum or maximum lengths given for single-end data')
        setattr(pipeline, attr, lengths)
    pipeline.max_n = args.max_n
    pipeline.discard_casava = args.discard_casava
    pipeline.discard_trimmed = args.discard_trimmed
    pipeline.discard_untrimmed = args.discard_untrimmed

    return pipeline
コード例 #8
0
ファイル: __main__.py プロジェクト: rhpvorderman/cutadapt
def pipeline_from_parsed_args(args, paired, is_interleaved_output):
    """
    Build the read-processing pipeline described by the parsed arguments.

    The arguments are checked first. Then the adapter specifications are
    parsed, the modifiers are added to the pipeline in the order in which
    they are listed below, and finally the filtering settings are copied
    onto the pipeline.

    A CommandLineError is raised if adapters cannot be parsed, if the
    adapters are unsuitable for --pair-adapters or if two lengths are given
    for single-end data.

    Return a PairedEndPipeline if paired is true, a SingleEndPipeline otherwise.
    """
    check_arguments(args, paired, is_interleaved_output)
    # The literal string 'none' is the command-line spelling of "no action"
    if args.action == 'none':
        args.action = None

    parser = AdapterParser(
        max_error_rate=args.error_rate,
        min_overlap=args.overlap,
        read_wildcards=args.match_read_wildcards,
        adapter_wildcards=args.match_adapter_wildcards,
        indels=args.indels,
    )
    try:
        adapters = parser.parse_multi(args.adapters)
        adapters2 = parser.parse_multi(args.adapters2)
    except (FileNotFoundError, ValueError) as err:
        raise CommandLineError(err)
    for adapter_list in (adapters, adapters2):
        warn_duplicate_adapters(adapter_list)
    if args.debug:
        for adapter in adapters + adapters2:
            adapter.enable_debug()

    # Create the processing pipeline
    if not paired:
        pipeline = SingleEndPipeline()
    else:
        filter_mode = args.pair_filter
        if filter_mode is None:
            filter_mode = 'any'
        pipeline = PairedEndPipeline(filter_mode)

    # With adapters given for only one of R1/R2, the default pair filter
    # mode 'any' would regard all read pairs as untrimmed, so override it.
    if paired and not (adapters2 and adapters):
        if (args.discard_untrimmed or args.untrimmed_output
                or args.untrimmed_paired_output):
            pipeline.override_untrimmed_pair_filter = True

    add_unconditional_cutters(pipeline, args.cut, args.cut2, paired)

    # Modifiers registered through this callable are applied to both reads
    # of a pair when the data is paired-end.
    if paired:
        add_modifier = pipeline.add_both
    else:
        add_modifier = pipeline.add

    if args.nextseq_trim is not None:
        nextseq_trimmer = NextseqQualityTrimmer(
            args.nextseq_trim, args.quality_base)
        add_modifier(nextseq_trimmer)
    if args.quality_cutoff is not None:
        cutoffs = parse_cutoffs(args.quality_cutoff)
        quality_trimmer = QualityTrimmer(
            cutoffs[0], cutoffs[1], args.quality_base)
        add_modifier(quality_trimmer)

    if args.pair_adapters:
        try:
            cutter = PairedAdapterCutter(adapters, adapters2, args.action)
        except PairedAdapterCutterError as err:
            raise CommandLineError("--pair-adapters: " + str(err))
        pipeline.add_paired_modifier(cutter)
    else:
        # A cutter is only created for a read that has adapters
        adapter_cutter = (
            AdapterCutter(adapters, args.times, args.action)
            if adapters else None)
        adapter_cutter2 = (
            AdapterCutter(adapters2, args.times, args.action)
            if adapters2 else None)
        if not paired:
            if adapter_cutter:
                pipeline.add(adapter_cutter)
        elif adapter_cutter or adapter_cutter2:
            pipeline.add(adapter_cutter, adapter_cutter2)

    for modifier in modifiers_applying_to_both_ends_if_paired(args):
        add_modifier(modifier)

    # Filtering parameters: minimum/maximum length first
    for attr in ('minimum_length', 'maximum_length'):
        raw_value = getattr(args, attr)
        if raw_value is None:
            continue
        lengths = parse_lengths(raw_value)
        n_lengths = len(lengths)
        if n_lengths == 2 and not paired:
            raise CommandLineError(
                'Two minimum or maximum lengths given for single-end data')
        if n_lengths == 1 and paired:
            # A single value applies to both reads of a pair
            lengths = (lengths[0], lengths[0])
        setattr(pipeline, attr, lengths)

    pipeline.max_n = args.max_n
    pipeline.discard_casava = args.discard_casava
    pipeline.discard_trimmed = args.discard_trimmed
    pipeline.discard_untrimmed = args.discard_untrimmed

    return pipeline
コード例 #9
0
def main(cmdlineargs=None, default_outfile=sys.stdout):
    """
    Evaluate the command-line parameters, process all reads and print a report.

    cmdlineargs -- list of command-line arguments. If it is None,
        ``sys.argv[1:]`` is used.
    default_outfile -- the file to which trimmed reads are sent if the ``-o``
        parameter is not used.

    Problems with the command line are reported through ``parser.error()``.
    While the pipeline is running, a KeyboardInterrupt ends the program with
    exit code 130, a broken pipe (EPIPE) with exit code 1, and a
    seqio.FormatError or EOFError with an error message.

    The report is printed unless ``--quiet`` was given. It goes to standard
    error when no output file was specified with ``-o``.
    """
    parser = get_option_parser()
    if cmdlineargs is None:
        cmdlineargs = sys.argv[1:]
    options, args = parser.parse_args(args=cmdlineargs)
    # Setup logging only if there are not already any handlers (can happen when
    # this function is being called externally such as from unit tests)
    if not logging.root.handlers:
        setup_logging(stdout=bool(options.output), quiet=options.quiet)

    if len(args) == 0:
        parser.error(
            "At least one parameter needed: name of a FASTA or FASTQ file.")
    elif len(args) > 2:
        parser.error("Too many parameters.")
    input_filename = args[0]
    if input_filename.endswith('.qual'):
        parser.error(
            "If a .qual file is given, it must be the second argument.")

    # Find out which 'mode' we need to use.
    # Default: single-read trimming (neither -p nor -A/-G/-B/-U/--interleaved given)
    paired = False
    if options.paired_output:
        # Modify first read only, keep second in sync (-p given, but not -A/-G/-B/-U).
        # This exists for backwards compatibility ('legacy mode').
        paired = 'first'
    # Any of these options switch off legacy mode
    if (options.adapters2 or options.front2 or options.anywhere2
            or options.cut2 or options.interleaved or options.pair_filter
            or options.too_short_paired_output
            or options.too_long_paired_output):
        # Full paired-end trimming when both -p and -A/-G/-B/-U given
        # Read modifications (such as quality trimming) are applied also to second read.
        paired = 'both'

    if paired and len(args) == 1 and not options.interleaved:
        parser.error("When paired-end trimming is enabled via -A/-G/-B/-U/"
                     "--interleaved or -p, two input files are required.")
    if not paired:
        if options.untrimmed_paired_output:
            parser.error(
                "Option --untrimmed-paired-output can only be used when "
                "trimming paired-end reads (with option -p).")

    interleaved_input = False
    interleaved_output = False
    if options.interleaved:
        # A single input file means the input is interleaved; a missing -p
        # means the output is interleaved.
        interleaved_input = len(args) == 1
        interleaved_output = not options.paired_output
        if not interleaved_input and not interleaved_output:
            parser.error(
                "When --interleaved is used, you cannot provide both two input files and two output files"
            )

    # Assign input_paired_filename and quality_filename
    input_paired_filename = None
    quality_filename = None
    if paired:
        if not interleaved_input:
            input_paired_filename = args[1]
        if not interleaved_output:
            if not options.paired_output:
                parser.error(
                    "When paired-end trimming is enabled via -A/-G/-B/-U, "
                    "a second output file needs to be specified via -p (--paired-output)."
                )
            if not options.output:
                parser.error(
                    "When you use -p or --paired-output, you must also "
                    "use the -o option.")

        if bool(options.untrimmed_output) != bool(
                options.untrimmed_paired_output):
            parser.error(
                "When trimming paired-end reads, you must use either none "
                "or both of the --untrimmed-output/--untrimmed-paired-output options."
            )
        if options.too_short_output and not options.too_short_paired_output:
            parser.error(
                "When using --too-short-output with paired-end "
                "reads, you also need to use --too-short-paired-output")
        if options.too_long_output and not options.too_long_paired_output:
            parser.error(
                "When using --too-long-output with paired-end "
                "reads, you also need to use --too-long-paired-output")
    elif len(args) == 2:
        # Single-end data with two file names: the second one is a .qual file
        quality_filename = args[1]
        if options.format is not None:
            parser.error(
                "If a pair of .fasta and .qual files is given, the -f/--format parameter cannot be used."
            )

    if options.format is not None and options.format.lower() not in [
            'fasta', 'fastq', 'sra-fastq'
    ]:
        parser.error(
            "The input file format must be either 'fasta', 'fastq' or "
            "'sra-fastq' (not '{0}').".format(options.format))

    # Open input file(s)
    try:
        reader = seqio.open(input_filename,
                            file2=input_paired_filename,
                            qualfile=quality_filename,
                            colorspace=options.colorspace,
                            fileformat=options.format,
                            interleaved=interleaved_input)
    except (seqio.UnknownFileType, IOError) as e:
        parser.error(e)

    # Parse the quality cutoff into a [first, second] pair. When only a
    # single value is given, it becomes the second cutoff and the first is 0.
    if options.quality_cutoff is not None:
        cutoffs = options.quality_cutoff.split(',')
        if len(cutoffs) == 1:
            try:
                cutoffs = [0, int(cutoffs[0])]
            except ValueError as e:
                parser.error(
                    "Quality cutoff value not recognized: {0}".format(e))
        elif len(cutoffs) == 2:
            try:
                cutoffs = [int(cutoffs[0]), int(cutoffs[1])]
            except ValueError as e:
                parser.error(
                    "Quality cutoff value not recognized: {0}".format(e))
        else:
            parser.error(
                "Expected one value or two values separated by comma for the quality cutoff"
            )
    else:
        cutoffs = None

    open_writer = functools.partial(seqio.open,
                                    mode='w',
                                    qualities=reader.delivers_qualities,
                                    colorspace=options.colorspace)

    if options.pair_filter is None:
        options.pair_filter = 'any'
    min_affected = 2 if options.pair_filter == 'both' else 1
    # Choose the wrapper that matches the trimming mode
    if not paired:
        filter_wrapper = Redirector
    elif paired == 'first':
        filter_wrapper = LegacyPairedRedirector
    elif paired == 'both':
        filter_wrapper = functools.partial(PairedRedirector,
                                           min_affected=min_affected)
    filters = []
    # TODO open_files = []
    too_short_writer = None  # too short reads go here
    # TODO pass file name to TooShortReadFilter, add a .close() method?
    if options.minimum_length > 0:
        if options.too_short_output:
            too_short_writer = open_writer(options.too_short_output,
                                           options.too_short_paired_output)
        filters.append(
            filter_wrapper(too_short_writer,
                           TooShortReadFilter(options.minimum_length)))
    too_long_writer = None  # too long reads go here
    if options.maximum_length < sys.maxsize:
        if options.too_long_output is not None:
            too_long_writer = open_writer(options.too_long_output,
                                          options.too_long_paired_output)
        filters.append(
            filter_wrapper(too_long_writer,
                           TooLongReadFilter(options.maximum_length)))

    if options.max_n != -1:
        filters.append(filter_wrapper(None, NContentFilter(options.max_n)))

    if int(options.discard_trimmed) + int(options.discard_untrimmed) + int(
            options.untrimmed_output is not None) > 1:
        parser.error(
            "Only one of the --discard-trimmed, --discard-untrimmed "
            "and --untrimmed-output options can be used at the same time.")
    demultiplexer = None
    untrimmed_writer = None
    writer = None
    # A '{name}' placeholder in the output file name requests demultiplexing
    if options.output is not None and '{name}' in options.output:
        if options.discard_trimmed:
            parser.error("Do not use --discard-trimmed when demultiplexing.")
        if paired:
            parser.error(
                "Demultiplexing not supported for paired-end files, yet.")
        untrimmed = options.output.replace('{name}', 'unknown')
        if options.untrimmed_output:
            untrimmed = options.untrimmed_output
        if options.discard_untrimmed:
            untrimmed = None
        demultiplexer = Demultiplexer(options.output,
                                      untrimmed,
                                      qualities=reader.delivers_qualities,
                                      colorspace=options.colorspace)
        filters.append(demultiplexer)
    else:
        # Set up the remaining filters to deal with --discard-trimmed,
        # --discard-untrimmed and --untrimmed-output. These options
        # are mutually exclusive in order to avoid brain damage.
        if options.discard_trimmed:
            filters.append(filter_wrapper(None, DiscardTrimmedFilter()))
        elif options.discard_untrimmed:
            filters.append(filter_wrapper(None, DiscardUntrimmedFilter()))
        elif options.untrimmed_output:
            untrimmed_writer = open_writer(options.untrimmed_output,
                                           options.untrimmed_paired_output)
            filters.append(
                filter_wrapper(untrimmed_writer, DiscardUntrimmedFilter()))

        # Finally, figure out where the reads that passed all the previous
        # filters should go.
        if options.output is not None:
            writer = open_writer(options.output,
                                 options.paired_output,
                                 interleaved=interleaved_output)
        else:
            writer = open_writer(default_outfile,
                                 interleaved=interleaved_output)
        if not paired:
            filters.append(NoFilter(writer))
        else:
            filters.append(PairedNoFilter(writer))

    # --maq is a shortcut that switches on several other options
    if options.maq:
        options.colorspace = True
        options.double_encode = True
        options.trim_primer = True
        options.strip_suffix.append('_F3')
        options.suffix = "/1"
    if options.zero_cap is None:
        options.zero_cap = options.colorspace
    if options.trim_primer and not options.colorspace:
        parser.error("Trimming the primer makes only sense in colorspace.")
    if options.double_encode and not options.colorspace:
        parser.error("Double-encoding makes only sense in colorspace.")
    if options.anywhere and options.colorspace:
        parser.error(
            "Using --anywhere with colorspace reads is currently not supported (if you think this may be useful, contact the author)."
        )
    if not (0 <= options.error_rate <= 1.):
        parser.error("The maximum error rate must be between 0 and 1.")
    if options.overlap < 1:
        parser.error("The overlap must be at least 1.")

    # The file name options are replaced by the opened files
    if options.rest_file is not None:
        options.rest_file = xopen(options.rest_file, 'w')
        rest_writer = RestFileWriter(options.rest_file)
    else:
        rest_writer = None
    if options.info_file is not None:
        options.info_file = xopen(options.info_file, 'w')
    if options.wildcard_file is not None:
        options.wildcard_file = xopen(options.wildcard_file, 'w')

    if options.colorspace:
        if options.match_read_wildcards:
            parser.error('IUPAC wildcards not supported in colorspace')
        options.match_adapter_wildcards = False

    adapter_parser = AdapterParser(
        colorspace=options.colorspace,
        max_error_rate=options.error_rate,
        min_overlap=options.overlap,
        read_wildcards=options.match_read_wildcards,
        adapter_wildcards=options.match_adapter_wildcards,
        indels=options.indels)

    try:
        adapters = adapter_parser.parse_multi(options.adapters,
                                              options.anywhere, options.front)
        adapters2 = adapter_parser.parse_multi(options.adapters2,
                                               options.anywhere2,
                                               options.front2)
    except IOError as e:
        # Only a missing file is reported as a command-line error
        if e.errno == errno.ENOENT:
            parser.error(e)
        raise
    except ValueError as e:
        parser.error(e)
    if options.debug:
        for adapter in adapters + adapters2:
            adapter.enable_debug()

    # Create the single-end processing pipeline (a list of "modifiers")
    modifiers = []
    if options.cut:
        if len(options.cut) > 2:
            parser.error("You cannot remove bases from more than two ends.")
        # Two values with the same sign would cut the same end twice
        if len(options.cut) == 2 and options.cut[0] * options.cut[1] > 0:
            parser.error("You cannot remove bases from the same end twice.")
        for cut in options.cut:
            if cut != 0:
                modifiers.append(UnconditionalCutter(cut))

    if options.nextseq_trim is not None:
        modifiers.append(
            NextseqQualityTrimmer(options.nextseq_trim, options.quality_base))

    if cutoffs:
        modifiers.append(
            QualityTrimmer(cutoffs[0], cutoffs[1], options.quality_base))
    if adapters:
        adapter_cutter = AdapterCutter(adapters, options.times,
                                       options.wildcard_file,
                                       options.info_file, rest_writer,
                                       options.action)
        modifiers.append(adapter_cutter)

    # Modifiers that apply to both reads of paired-end reads unless in legacy mode
    modifiers_both = []
    if options.length is not None:
        modifiers_both.append(Shortener(options.length))
    if options.trim_n:
        modifiers_both.append(NEndTrimmer())
    if options.length_tag:
        modifiers_both.append(LengthTagModifier(options.length_tag))
    if options.strip_f3:
        options.strip_suffix.append('_F3')
    for suffix in options.strip_suffix:
        modifiers_both.append(SuffixRemover(suffix))
    if options.prefix or options.suffix:
        modifiers_both.append(PrefixSuffixAdder(options.prefix,
                                                options.suffix))
    if options.double_encode:
        modifiers_both.append(DoubleEncoder())
    if options.zero_cap and reader.delivers_qualities:
        modifiers_both.append(ZeroCapper(quality_base=options.quality_base))
    if options.trim_primer:
        # NOTE(review): PrimerTrimmer is appended without being called,
        # unlike the other modifiers - confirm that it is a plain function.
        modifiers_both.append(PrimerTrimmer)
    modifiers.extend(modifiers_both)

    # For paired-end data, create a second processing pipeline.
    # However, if no second-read adapters were given (via -A/-G/-B/-U), we need to
    # be backwards compatible and *no modifications* are done to the second read.
    modifiers2 = []
    if paired == 'both':
        if options.cut2:
            if len(options.cut2) > 2:
                parser.error(
                    "You cannot remove bases from more than two ends.")
            if len(options.cut2
                   ) == 2 and options.cut2[0] * options.cut2[1] > 0:
                parser.error(
                    "You cannot remove bases from the same end twice.")
            for cut in options.cut2:
                if cut != 0:
                    modifiers2.append(UnconditionalCutter(cut))

        if cutoffs:
            modifiers2.append(
                QualityTrimmer(cutoffs[0], cutoffs[1], options.quality_base))
        if adapters2:
            # No wildcard, info or rest file is written for the second read
            adapter_cutter2 = AdapterCutter(adapters2, options.times, None,
                                            None, None, options.action)
            modifiers2.append(adapter_cutter2)
        else:
            adapter_cutter2 = None
        modifiers2.extend(modifiers_both)

    if paired:
        pipeline = PairedEndPipeline(reader, modifiers, modifiers2, filters)
    else:
        pipeline = SingleEndPipeline(reader, modifiers, filters)

    logger.info("This is cutadapt %s with Python %s", __version__,
                platform.python_version())
    logger.info("Command line parameters: %s", " ".join(cmdlineargs))
    logger.info(
        "Trimming %s adapter%s with at most %.1f%% errors in %s mode ...",
        len(adapters) + len(adapters2),
        's' if len(adapters) + len(adapters2) != 1 else '',
        options.error_rate * 100, {
            False: 'single-end',
            'first': 'paired-end legacy',
            'both': 'paired-end'
        }[paired])

    if paired == 'first' and (modifiers_both or cutoffs):
        logger.warning('\n'.join(
            textwrap.wrap(
                'WARNING: Requested read '
                'modifications are applied only to the first '
                'read since backwards compatibility mode is enabled. '
                'To modify both reads, also use any of the -A/-B/-G/-U options. '
                'Use a dummy adapter sequence when necessary: -A XXX')))

    # time.clock() was removed in Python 3.8. time.time() is available on
    # all Python versions and measures the wall-clock duration of the run.
    start_time = time.time()
    try:
        stats = pipeline.run()
    except KeyboardInterrupt:
        print("Interrupted", file=sys.stderr)
        sys.exit(130)
    except IOError as e:
        # A broken pipe (e.g. output piped into "head") is not reported
        if e.errno == errno.EPIPE:
            sys.exit(1)
        raise
    except (seqio.FormatError, EOFError) as e:
        sys.exit("cutadapt: error: {0}".format(e))

    # close open files (each one is listed exactly once)
    for f in [
            writer, untrimmed_writer, options.rest_file, options.wildcard_file,
            options.info_file, too_short_writer, too_long_writer,
            demultiplexer
    ]:
        if f is not None and f is not sys.stdin and f is not sys.stdout:
            f.close()

    elapsed_time = time.time() - start_time
    if not options.quiet:
        stats.collect((adapters, adapters2), elapsed_time, modifiers,
                      modifiers2, filters)
        # send statistics to stderr if result was sent to stdout
        stat_file = sys.stderr if options.output is None else None
        with redirect_standard_output(stat_file):
            print_report(stats, (adapters, adapters2))
コード例 #10
0
ファイル: cutadapt.py プロジェクト: rhshah/cutadapt
def pipeline_from_parsed_args(options, args, default_outfile):
	"""
	Setup a processing pipeline from parsed command-line options.

	options -- the parsed option values. Some of them are modified in place:
		pair_filter gets its default 'any', --maq switches on other options
		and the rest/info/wildcard file names are replaced by opened files.
	args -- the positional arguments: one or two input file names.
	default_outfile -- the file to which trimmed reads are sent if the ``-o``
		parameter is not used.

	If there are any problems parsing the arguments, a CommandlineError is raised.

	Return a SingleEndPipeline or a PairedEndPipeline. All output files that
	were opened here are registered on it with register_file_to_close().
	"""
	if len(args) == 0:
		raise CommandlineError("At least one parameter needed: name of a FASTA or FASTQ file.")
	elif len(args) > 2:
		raise CommandlineError("Too many parameters.")
	input_filename = args[0]
	if input_filename.endswith('.qual'):
		raise CommandlineError("If a .qual file is given, it must be the second argument.")

	# Find out which 'mode' we need to use.
	# Default: single-read trimming (neither -p nor -A/-G/-B/-U/--interleaved given)
	paired = False
	if options.paired_output:
		# Modify first read only, keep second in sync (-p given, but not -A/-G/-B/-U).
		# This exists for backwards compatibility ('legacy mode').
		paired = 'first'
	# Any of these options switch off legacy mode
	if (options.adapters2 or options.front2 or options.anywhere2 or
			options.cut2 or options.interleaved or options.pair_filter or
			options.too_short_paired_output or options.too_long_paired_output):
		# Full paired-end trimming when both -p and -A/-G/-B/-U given
		# Read modifications (such as quality trimming) are applied also to second read.
		paired = 'both'

	if paired and len(args) == 1 and not options.interleaved:
		raise CommandlineError("When paired-end trimming is enabled via -A/-G/-B/-U/"
			"--interleaved or -p, two input files are required.")
	if not paired:
		if options.untrimmed_paired_output:
			raise CommandlineError("Option --untrimmed-paired-output can only be used when "
				"trimming paired-end reads (with option -p).")

	interleaved_input = False
	interleaved_output = False
	if options.interleaved:
		# A single input file means the input is interleaved; a missing -p
		# means the output is interleaved.
		interleaved_input = len(args) == 1
		interleaved_output = not options.paired_output
		if not interleaved_input and not interleaved_output:
			raise CommandlineError("When --interleaved is used, you cannot provide both two input files and two output files")

	# Assign input_paired_filename and quality_filename
	input_paired_filename = None
	quality_filename = None
	if paired:
		if not interleaved_input:
			input_paired_filename = args[1]
		if not interleaved_output:
			if not options.paired_output:
				raise CommandlineError("When paired-end trimming is enabled via -A/-G/-B/-U, "
					"a second output file needs to be specified via -p (--paired-output).")
			if not options.output:
				raise CommandlineError("When you use -p or --paired-output, you must also "
					"use the -o option.")

		if bool(options.untrimmed_output) != bool(options.untrimmed_paired_output):
			raise CommandlineError("When trimming paired-end reads, you must use either none "
				"or both of the --untrimmed-output/--untrimmed-paired-output options.")
		if options.too_short_output and not options.too_short_paired_output:
			raise CommandlineError("When using --too-short-output with paired-end "
				"reads, you also need to use --too-short-paired-output")
		if options.too_long_output and not options.too_long_paired_output:
			raise CommandlineError("When using --too-long-output with paired-end "
				"reads, you also need to use --too-long-paired-output")
	elif len(args) == 2:
		# Single-end data with two file names: the second one is a .qual file
		quality_filename = args[1]
		if options.format is not None:
			raise CommandlineError("If a pair of .fasta and .qual files is given, the -f/--format parameter cannot be used.")

	if options.format is not None and options.format.lower() not in ['fasta', 'fastq', 'sra-fastq']:
		raise CommandlineError("The input file format must be either 'fasta', 'fastq' or "
			"'sra-fastq' (not '{0}').".format(options.format))

	# Open input file(s)
	try:
		reader = seqio.open(input_filename, file2=input_paired_filename,
				qualfile=quality_filename, colorspace=options.colorspace,
				fileformat=options.format, interleaved=interleaved_input)
	except (seqio.UnknownFileType, IOError) as e:
		raise CommandlineError(e)

	# Parse the quality cutoff into a [first, second] pair. When only a
	# single value is given, it becomes the second cutoff and the first is 0.
	if options.quality_cutoff is not None:
		cutoffs = options.quality_cutoff.split(',')
		if len(cutoffs) > 2:
			raise CommandlineError("Expected one value or two values separated by comma for the quality cutoff")
		try:
			cutoffs = [int(value) for value in cutoffs]
		except ValueError as e:
			raise CommandlineError("Quality cutoff value not recognized: {0}".format(e))
		if len(cutoffs) == 1:
			cutoffs = [0, cutoffs[0]]
	else:
		cutoffs = None

	open_writer = functools.partial(seqio.open, mode='w',
		qualities=reader.delivers_qualities, colorspace=options.colorspace)

	if options.pair_filter is None:
		options.pair_filter = 'any'
	min_affected = 2 if options.pair_filter == 'both' else 1
	# Choose the wrapper that matches the trimming mode
	if not paired:
		filter_wrapper = Redirector
	elif paired == 'first':
		filter_wrapper = LegacyPairedRedirector
	elif paired == 'both':
		filter_wrapper = functools.partial(PairedRedirector, min_affected=min_affected)
	filters = []
	# TODO open_files = []
	too_short_writer = None  # too short reads go here
	# TODO pass file name to TooShortReadFilter, add a .close() method?
	if options.minimum_length > 0:
		if options.too_short_output:
			too_short_writer = open_writer(options.too_short_output, options.too_short_paired_output)
		filters.append(filter_wrapper(too_short_writer, TooShortReadFilter(options.minimum_length)))
	too_long_writer = None  # too long reads go here
	if options.maximum_length < sys.maxsize:
		if options.too_long_output is not None:
			too_long_writer = open_writer(options.too_long_output, options.too_long_paired_output)
		filters.append(filter_wrapper(too_long_writer, TooLongReadFilter(options.maximum_length)))

	if options.max_n != -1:
		filters.append(filter_wrapper(None, NContentFilter(options.max_n)))

	if int(options.discard_trimmed) + int(options.discard_untrimmed) + int(options.untrimmed_output is not None) > 1:
		raise CommandlineError("Only one of the --discard-trimmed, --discard-untrimmed "
			"and --untrimmed-output options can be used at the same time.")
	demultiplexer = None
	untrimmed_writer = None
	writer = None
	# A '{name}' placeholder in the output file name requests demultiplexing
	if options.output is not None and '{name}' in options.output:
		if options.discard_trimmed:
			raise CommandlineError("Do not use --discard-trimmed when demultiplexing.")
		if paired:
			raise CommandlineError("Demultiplexing not supported for paired-end files, yet.")
		untrimmed = options.output.replace('{name}', 'unknown')
		if options.untrimmed_output:
			untrimmed = options.untrimmed_output
		if options.discard_untrimmed:
			untrimmed = None
		demultiplexer = Demultiplexer(options.output, untrimmed,
			qualities=reader.delivers_qualities, colorspace=options.colorspace)
		filters.append(demultiplexer)
	else:
		# Set up the remaining filters to deal with --discard-trimmed,
		# --discard-untrimmed and --untrimmed-output. These options
		# are mutually exclusive in order to avoid brain damage.
		if options.discard_trimmed:
			filters.append(filter_wrapper(None, DiscardTrimmedFilter()))
		elif options.discard_untrimmed:
			filters.append(filter_wrapper(None, DiscardUntrimmedFilter()))
		elif options.untrimmed_output:
			untrimmed_writer = open_writer(options.untrimmed_output,
				options.untrimmed_paired_output)
			filters.append(filter_wrapper(untrimmed_writer, DiscardUntrimmedFilter()))

		# Finally, figure out where the reads that passed all the previous
		# filters should go.
		if options.output is not None:
			writer = open_writer(options.output, options.paired_output, interleaved=interleaved_output)
		else:
			writer = open_writer(default_outfile, interleaved=interleaved_output)
		if not paired:
			filters.append(NoFilter(writer))
		else:
			filters.append(PairedNoFilter(writer))

	# --maq is a shortcut that switches on several other options
	if options.maq:
		options.colorspace = True
		options.double_encode = True
		options.trim_primer = True
		options.strip_suffix.append('_F3')
		options.suffix = "/1"
	if options.zero_cap is None:
		options.zero_cap = options.colorspace
	if options.trim_primer and not options.colorspace:
		raise CommandlineError("Trimming the primer makes only sense in colorspace.")
	if options.double_encode and not options.colorspace:
		raise CommandlineError("Double-encoding makes only sense in colorspace.")
	if options.anywhere and options.colorspace:
		raise CommandlineError("Using --anywhere with colorspace reads is currently not supported (if you think this may be useful, contact the author).")
	if not (0 <= options.error_rate <= 1.):
		raise CommandlineError("The maximum error rate must be between 0 and 1.")
	if options.overlap < 1:
		raise CommandlineError("The overlap must be at least 1.")

	# The file name options are replaced by the opened files
	if options.rest_file is not None:
		options.rest_file = xopen(options.rest_file, 'w')
		rest_writer = RestFileWriter(options.rest_file)
	else:
		rest_writer = None
	if options.info_file is not None:
		options.info_file = xopen(options.info_file, 'w')
	if options.wildcard_file is not None:
		options.wildcard_file = xopen(options.wildcard_file, 'w')

	if options.colorspace:
		if options.match_read_wildcards:
			raise CommandlineError('IUPAC wildcards not supported in colorspace')
		options.match_adapter_wildcards = False

	adapter_parser = AdapterParser(
		colorspace=options.colorspace,
		max_error_rate=options.error_rate,
		min_overlap=options.overlap,
		read_wildcards=options.match_read_wildcards,
		adapter_wildcards=options.match_adapter_wildcards,
		indels=options.indels)

	try:
		adapters = adapter_parser.parse_multi(options.adapters, options.anywhere, options.front)
		adapters2 = adapter_parser.parse_multi(options.adapters2, options.anywhere2, options.front2)
	except IOError as e:
		# Only a missing file is reported as a command-line error
		if e.errno == errno.ENOENT:
			raise CommandlineError(e)
		raise
	except ValueError as e:
		raise CommandlineError(e)
	if options.debug:
		for adapter in adapters + adapters2:
			adapter.enable_debug()

	# Create the single-end processing pipeline (a list of "modifiers")
	modifiers = []
	if options.cut:
		if len(options.cut) > 2:
			raise CommandlineError("You cannot remove bases from more than two ends.")
		# Two values with the same sign would cut the same end twice
		if len(options.cut) == 2 and options.cut[0] * options.cut[1] > 0:
			raise CommandlineError("You cannot remove bases from the same end twice.")
		for cut in options.cut:
			if cut != 0:
				modifiers.append(UnconditionalCutter(cut))

	if options.nextseq_trim is not None:
		modifiers.append(NextseqQualityTrimmer(options.nextseq_trim, options.quality_base))

	if cutoffs:
		modifiers.append(QualityTrimmer(cutoffs[0], cutoffs[1], options.quality_base))
	if adapters:
		adapter_cutter = AdapterCutter(adapters, options.times,
				options.wildcard_file, options.info_file,
				rest_writer, options.action)
		modifiers.append(adapter_cutter)

	# Modifiers that apply to both reads of paired-end reads unless in legacy mode
	modifiers_both = []
	if options.length is not None:
		modifiers_both.append(Shortener(options.length))
	if options.trim_n:
		modifiers_both.append(NEndTrimmer())
	if options.length_tag:
		modifiers_both.append(LengthTagModifier(options.length_tag))
	if options.strip_f3:
		options.strip_suffix.append('_F3')
	for suffix in options.strip_suffix:
		modifiers_both.append(SuffixRemover(suffix))
	if options.prefix or options.suffix:
		modifiers_both.append(PrefixSuffixAdder(options.prefix, options.suffix))
	if options.double_encode:
		modifiers_both.append(DoubleEncoder())
	if options.zero_cap and reader.delivers_qualities:
		modifiers_both.append(ZeroCapper(quality_base=options.quality_base))
	if options.trim_primer:
		# NOTE(review): PrimerTrimmer is appended without being called,
		# unlike the other modifiers - confirm that it is a plain function.
		modifiers_both.append(PrimerTrimmer)
	modifiers.extend(modifiers_both)

	# For paired-end data, create a second processing pipeline.
	# However, if no second-read adapters were given (via -A/-G/-B/-U), we need to
	# be backwards compatible and *no modifications* are done to the second read.
	modifiers2 = []
	if paired == 'both':
		if options.cut2:
			if len(options.cut2) > 2:
				raise CommandlineError("You cannot remove bases from more than two ends.")
			if len(options.cut2) == 2 and options.cut2[0] * options.cut2[1] > 0:
				raise CommandlineError("You cannot remove bases from the same end twice.")
			for cut in options.cut2:
				if cut != 0:
					modifiers2.append(UnconditionalCutter(cut))

		if cutoffs:
			modifiers2.append(QualityTrimmer(cutoffs[0], cutoffs[1], options.quality_base))
		if adapters2:
			# No wildcard, info or rest file is written for the second read
			adapter_cutter2 = AdapterCutter(adapters2, options.times,
					None, None, None, options.action)
			modifiers2.append(adapter_cutter2)
		modifiers2.extend(modifiers_both)

	if paired:
		pipeline = PairedEndPipeline(adapters, adapters2, reader, modifiers, modifiers2, filters)
	else:
		pipeline = SingleEndPipeline(adapters, adapters2, reader, modifiers, filters)

	# TODO the following should be done some other way
	pipeline.paired = paired
	pipeline.error_rate = options.error_rate
	pipeline.should_print_warning = paired == 'first' and (modifiers_both or cutoffs)
	# Each file is registered exactly once; entries may be None when the
	# corresponding output was not requested.
	for f in [writer, untrimmed_writer,
			options.rest_file, options.wildcard_file,
			options.info_file, too_short_writer, too_long_writer,
			demultiplexer]:
		pipeline.register_file_to_close(f)
	return pipeline
コード例 #11
0
def main(cmdlineargs=None, default_outfile=sys.stdout):
    """
    Main function that evaluates command-line parameters and iterates
    over all reads.

    cmdlineargs is the list of command-line arguments to parse. If it is
    None, ``sys.argv[1:]`` is used.

    default_outfile is the file to which trimmed reads are sent if the ``-o``
    parameter is not used.

    Invalid parameter combinations are reported through ``parser.error()``.
    An ENOENT I/O error while gathering the adapters, a keyboard interrupt,
    a broken pipe (EPIPE) or a ``seqio.FormatError`` while processing the
    reads terminates the program with ``sys.exit(1)``.
    """
    parser = get_option_parser()
    if cmdlineargs is None:
        cmdlineargs = sys.argv[1:]
    options, args = parser.parse_args(args=cmdlineargs)

    if len(args) == 0:
        parser.error(
            "At least one parameter needed: name of a FASTA or FASTQ file.")
    elif len(args) > 2:
        parser.error("Too many parameters.")

    input_filename = args[0]

    # If a second file name was given, then we either have single-end reads
    # provided as a pair of .fasta/.qual files or we have paired-end reads.
    quality_filename = None
    input_paired_filename = None
    if len(args) == 2:
        if args[0].endswith('.qual'):
            parser.error("The QUAL file must be the second argument.")
        if args[1].endswith('.qual'):
            quality_filename = args[1]
        else:
            input_paired_filename = args[1]
            if not options.paired_output:
                parser.error(
                    'You must use --paired-output when trimming paired-end reads.'
                )

    if len(args) == 1 and options.paired_output:
        parser.error(
            "You specified a --paired-output file, but gave only one input file."
        )
    if options.paired_output and bool(options.untrimmed_output) != bool(
            options.untrimmed_paired_output):
        parser.error(
            "When trimming paired-end reads, you must use either none "
            "or both of the --untrimmed-output/--untrimmed-paired-output options."
        )
    if options.untrimmed_paired_output and not options.paired_output:
        parser.error(
            "Option --untrimmed-paired-output can only be used when "
            "trimming paired-end reads (with option --paired-output).")
    if input_filename.endswith('.qual'):
        parser.error("Need a FASTA file in addition to the QUAL file.")

    if options.format is not None and options.format.lower() not in [
            'fasta', 'fastq', 'sra-fastq'
    ]:
        parser.error(
            "The input file format must be either 'fasta', 'fastq' or "
            "'sra-fastq' (not '{0}').".format(options.format))

    # TODO should this really be an error?
    if options.format is not None and quality_filename is not None:
        parser.error(
            "If a pair of .fasta and .qual files is given, the -f/--format parameter cannot be used."
        )

    # The writers (filters, demultiplexer, read writer) are handed to the
    # processing functions in the order in which they are appended here.
    writers = []
    too_short_outfile = None  # too short reads go here
    too_short_filter = None
    # TODO pass file name to TooShortReadFilter, add a .close() method?
    if options.minimum_length > 0:
        if options.too_short_output:
            too_short_outfile = xopen(options.too_short_output, 'w')
        else:
            too_short_outfile = None
        too_short_filter = TooShortReadFilter(options.minimum_length,
                                              too_short_outfile)
        writers.append(too_short_filter)
    too_long_outfile = None  # too long reads go here
    too_long_filter = None
    if options.maximum_length < sys.maxsize:
        if options.too_long_output is not None:
            too_long_outfile = xopen(options.too_long_output, 'w')
        else:
            too_long_outfile = None
        too_long_filter = TooLongReadFilter(options.maximum_length,
                                            too_long_outfile)
        writers.append(too_long_filter)

    demultiplexer = None
    if options.output is not None and '{name}' in options.output:
        # A '{name}' placeholder in the output file name requests
        # demultiplexing.
        if options.discard_trimmed:
            parser.error("Do not use --discard-trimmed when demultiplexing.")
        if input_paired_filename:
            parser.error(
                "Demultiplexing not supported for paired-end files, yet.")
        untrimmed = options.output.format(name='unknown')
        if options.untrimmed_output:
            untrimmed = options.untrimmed_output
        if options.discard_untrimmed:
            untrimmed = None
        demultiplexer = Demultiplexer(options.output, untrimmed)
        writers.append(demultiplexer)
        trimmed_outfile, untrimmed_outfile = None, None
        trimmed_paired_outfile, untrimmed_paired_outfile = None, None
    else:
        trimmed_outfile, untrimmed_outfile = trimmed_and_untrimmed_files(
            default_outfile, options.output, options.untrimmed_output,
            options.discard_trimmed, options.discard_untrimmed)

        trimmed_paired_outfile, untrimmed_paired_outfile = trimmed_and_untrimmed_files(
            None,  # applies when not trimming paired-end data
            options.paired_output,
            options.untrimmed_paired_output,
            options.discard_trimmed,
            options.discard_untrimmed)

        writer = ProcessedReadWriter(trimmed_outfile, trimmed_paired_outfile,
                                     untrimmed_outfile,
                                     untrimmed_paired_outfile)
        writers.append(writer)

    if options.maq:
        # --maq is a shortcut that switches on a fixed set of other options.
        options.colorspace = True
        options.double_encode = True
        options.trim_primer = True
        options.strip_suffix.append('_F3')
        options.suffix = "/1"
    if options.zero_cap is None:
        options.zero_cap = options.colorspace
    if options.trim_primer and not options.colorspace:
        parser.error("Trimming the primer makes only sense in colorspace.")
    if options.double_encode and not options.colorspace:
        parser.error("Double-encoding makes only sense in colorspace.")
    if options.anywhere and options.colorspace:
        parser.error(
            "Using --anywhere with colorspace reads is currently not supported (if you think this may be useful, contact the author)."
        )
    if not (0 <= options.error_rate <= 1.):
        parser.error("The maximum error rate must be between 0 and 1.")
    if options.overlap < 1:
        parser.error("The overlap must be at least 1.")

    # From here on, the file-name options are replaced by open file objects.
    if options.rest_file is not None:
        options.rest_file = xopen(options.rest_file, 'w')
        rest_writer = RestFileWriter(options.rest_file)
    else:
        rest_writer = None
    if options.info_file is not None:
        options.info_file = xopen(options.info_file, 'w')
    if options.wildcard_file is not None:
        options.wildcard_file = xopen(options.wildcard_file, 'w')

    if options.colorspace:
        if options.match_read_wildcards:
            parser.error('IUPAC wildcards not supported in colorspace')
        options.match_adapter_wildcards = False
    adapters = []
    ADAPTER_CLASS = ColorspaceAdapter if options.colorspace else Adapter

    try:
        for name, seq, where in gather_adapters(options.adapters,
                                                options.anywhere,
                                                options.front):
            if not seq:
                parser.error("The adapter sequence is empty")
            if not options.indels and where not in (PREFIX, SUFFIX):
                parser.error(
                    "Not allowing indels is currently supported only for anchored 5' and 3' adapters."
                )
            adapter = ADAPTER_CLASS(seq,
                                    where,
                                    options.error_rate,
                                    options.overlap,
                                    options.match_read_wildcards,
                                    options.match_adapter_wildcards,
                                    name=name,
                                    indels=options.indels)
            adapters.append(adapter)
    except IOError as e:
        if e.errno == errno.ENOENT:
            print("Error:", e, file=sys.stderr)
            sys.exit(1)
        raise

    # options.cut is a sequence of cut lengths (see below), so it has to be
    # tested for emptiness; comparing it to 0 would never be true.
    if not adapters and options.quality_cutoff == 0 and not options.cut and \
      options.minimum_length == 0 and options.maximum_length == sys.maxsize:
        parser.error("You need to provide at least one adapter sequence.")

    if input_paired_filename:
        reader = seqio.PairedSequenceReader(input_filename,
                                            input_paired_filename,
                                            colorspace=options.colorspace,
                                            fileformat=options.format)
    else:
        reader = read_sequences(input_filename,
                                quality_filename,
                                colorspace=options.colorspace,
                                fileformat=options.format)

    # Create the processing pipeline as a list of "modifiers".
    modifiers = []
    if options.cut:
        if len(options.cut) > 2:
            parser.error("You cannot remove bases from more than two ends.")
        # Two values with the same sign have a positive product.
        if len(options.cut) == 2 and options.cut[0] * options.cut[1] > 0:
            parser.error("You cannot remove bases from the same end twice.")
        for cut in options.cut:
            if cut != 0:
                modifiers.append(UnconditionalCutter(cut))

    if options.quality_cutoff > 0:
        modifiers.append(
            QualityTrimmer(options.quality_cutoff, options.quality_base))
    if adapters:
        adapter_cutter = AdapterCutter(adapters, options.times,
                                       options.wildcard_file,
                                       options.info_file, rest_writer,
                                       options.action)
        modifiers.append(adapter_cutter)
    else:
        adapter_cutter = None
    if options.length_tag:
        modifiers.append(LengthTagModifier(options.length_tag))
    if options.strip_f3:
        options.strip_suffix.append('_F3')
    for suffix in options.strip_suffix:
        modifiers.append(SuffixRemover(suffix))
    if options.prefix or options.suffix:
        modifiers.append(PrefixSuffixAdder(options.prefix, options.suffix))
    if options.double_encode:
        modifiers.append(DoubleEncoder())
    if options.zero_cap and reader.delivers_qualities:
        modifiers.append(ZeroCapper(quality_base=options.quality_base))
    if options.trim_primer:
        modifiers.append(PrimerTrimmer)

    # time.clock() was removed in Python 3.8; time.time() is available in
    # every Python version.
    start_time = time.time()
    try:
        if input_paired_filename:
            stats = process_paired_reads(reader, modifiers, writers)
        else:
            stats = process_single_reads(reader, modifiers, writers)
    except KeyboardInterrupt:
        print("Interrupted", file=sys.stderr)
        sys.exit(1)
    except IOError as e:
        if e.errno == errno.EPIPE:
            sys.exit(1)
        raise
    except seqio.FormatError as e:
        print("Error:", e, file=sys.stderr)
        sys.exit(1)

    # close open files (but never the standard streams)
    for f in [
            trimmed_outfile, untrimmed_outfile, trimmed_paired_outfile,
            untrimmed_paired_outfile, options.rest_file, options.wildcard_file,
            options.info_file, too_short_outfile, too_long_outfile,
            demultiplexer
    ]:
        if f is not None and f is not sys.stdin and f is not sys.stdout:
            f.close()

    if not options.quiet:
        # send statistics to stderr if result was sent to stdout
        stat_file = sys.stderr if options.output is None else None
        print_statistics(adapters,
                         time.time() - start_time,
                         stats,
                         options.action,
                         adapter_cutter.reads_matched if adapter_cutter else 0,
                         options.error_rate,
                         too_short_filter.too_short if too_short_filter else 0,
                         too_long_filter.too_long if too_long_filter else 0,
                         cmdlineargs,
                         file=stat_file)
コード例 #12
0
def pipeline_from_parsed_args(args, paired, is_interleaved_output):
    """
    Build the read-processing pipeline described by the parsed arguments.

    args -- the parsed command-line arguments
    paired -- true value if paired-end reads are processed
    is_interleaved_output -- only passed on to check_arguments()

    Problems with the arguments are reported by raising CommandLineError.

    Returns a SingleEndPipeline or, for paired-end data, a PairedEndPipeline.
    """
    check_arguments(args, paired, is_interleaved_output)
    if args.action == 'none':
        args.action = None

    # Parse the adapter specifications for R1 and R2
    parser = AdapterParser(
        max_error_rate=args.error_rate,
        min_overlap=args.overlap,
        read_wildcards=args.match_read_wildcards,
        adapter_wildcards=args.match_adapter_wildcards,
        indels=args.indels,
    )
    try:
        adapters = parser.parse_multi(args.adapters, args.anywhere, args.front)
        adapters2 = parser.parse_multi(args.adapters2, args.anywhere2, args.front2)
    except IOError as err:
        # Only a missing file is turned into a command-line error
        if err.errno != errno.ENOENT:
            raise
        raise CommandLineError(err)
    except ValueError as err:
        raise CommandLineError(err)
    if args.debug:
        for parsed_adapter in adapters + adapters2:
            parsed_adapter.enable_debug()

    # Instantiate the (still empty) pipeline
    if not paired:
        pipeline = SingleEndPipeline()
    else:
        filter_mode = args.pair_filter
        if filter_mode is None:
            filter_mode = 'any'
        pipeline = PairedEndPipeline(filter_mode)

    # With adapters for only one of R1/R2, the default pair filter mode 'any'
    # would consider every read pair untrimmed, so it must be overridden.
    if paired and not (adapters2 and adapters) and (
            args.discard_untrimmed or args.untrimmed_output or args.untrimmed_paired_output):
        pipeline.override_untrimmed_pair_filter = True

    # Unconditional cutting: args.cut applies to R1, args.cut2 to R2
    for for_second_read, cut_lengths in ((False, args.cut), (True, args.cut2)):
        if not cut_lengths:
            continue
        if len(cut_lengths) > 2:
            raise CommandLineError("You cannot remove bases from more than two ends.")
        if len(cut_lengths) == 2 and cut_lengths[0] * cut_lengths[1] > 0:
            raise CommandLineError("You cannot remove bases from the same end twice.")
        for length in cut_lengths:
            if length != 0:
                if for_second_read:
                    assert isinstance(pipeline, PairedEndPipeline)
                    pipeline.add(None, UnconditionalCutter(length))
                elif paired:
                    pipeline.add(UnconditionalCutter(length), None)
                else:
                    pipeline.add(UnconditionalCutter(length))

    # Adds a modifier to the single read or to both reads of a pair
    if paired:
        add_modifier = pipeline.add_both
    else:
        add_modifier = pipeline.add

    if args.nextseq_trim is not None:
        add_modifier(NextseqQualityTrimmer(args.nextseq_trim, args.quality_base))
    if args.quality_cutoff is not None:
        cutoffs = parse_cutoffs(args.quality_cutoff)
        add_modifier(QualityTrimmer(cutoffs[0], cutoffs[1], args.quality_base))

    # Adapter trimming
    if not args.pair_adapters:
        cutter1 = AdapterCutter(adapters, args.times, args.action) if adapters else None
        cutter2 = AdapterCutter(adapters2, args.times, args.action) if adapters2 else None
        if paired:
            if cutter1 or cutter2:
                pipeline.add(cutter1, cutter2)
        elif cutter1:
            pipeline.add(cutter1)
    else:
        if not paired:
            raise CommandLineError("Option --pair-adapters can only be used when trimming "
                "paired-end reads")
        if args.times != 1:
            raise CommandLineError("--pair-adapters cannot be used with --times")
        try:
            paired_cutter = PairedAdapterCutter(adapters, adapters2, args.action)
        except PairedAdapterCutterError as err:
            raise CommandLineError("--pair-adapters: " + str(err))
        pipeline.add_paired_modifier(paired_cutter)

    # The remaining modifiers are applied to both reads of paired-end data
    if args.length is not None:
        add_modifier(Shortener(args.length))
    if args.trim_n:
        add_modifier(NEndTrimmer())
    if args.length_tag:
        add_modifier(LengthTagModifier(args.length_tag))
    for suffix in args.strip_suffix:
        add_modifier(SuffixRemover(suffix))
    if args.prefix or args.suffix:
        add_modifier(PrefixSuffixAdder(args.prefix, args.suffix))
    if args.zero_cap:
        add_modifier(ZeroCapper(quality_base=args.quality_base))

    # Filtering parameters: minimum/maximum length first ...
    for attr_name in ('minimum_length', 'maximum_length'):
        raw_value = getattr(args, attr_name)
        if raw_value is None:
            continue
        lengths = parse_lengths(raw_value)
        if not paired and len(lengths) == 2:
            raise CommandLineError('Two minimum or maximum lengths given for single-end data')
        if paired and len(lengths) == 1:
            # A single value is used for both reads of a pair
            lengths = (lengths[0], lengths[0])
        setattr(pipeline, attr_name, lengths)
    # ... then the remaining filter settings
    pipeline.max_n = args.max_n
    pipeline.discard_casava = args.discard_casava
    pipeline.discard_trimmed = args.discard_trimmed
    pipeline.discard_untrimmed = args.discard_untrimmed

    return pipeline
コード例 #13
0
ファイル: cutadapt.py プロジェクト: vittoriozamboni/cutadapt
def main(cmdlineargs=None, trimmed_outfile=sys.stdout):
    """
    Main function that evaluates command-line parameters and iterates
    over all reads.

    cmdlineargs is the list of command-line arguments to parse. If it is
    None, ``sys.argv[1:]`` is used.

    trimmed_outfile is the default output file to which trimmed reads
    are sent. It can be overridden by using the '-o' parameter.

    Invalid parameter combinations are reported through ``parser.error()``.
    A broken pipe (EPIPE) or a ``seqio.FormatError`` while processing the
    reads terminates the program with ``sys.exit(1)``.
    """
    parser = get_option_parser()
    if cmdlineargs is None:
        cmdlineargs = sys.argv[1:]
    options, args = parser.parse_args(args=cmdlineargs)

    if len(args) == 0:
        parser.error(
            "At least one parameter needed: name of a FASTA or FASTQ file.")
    elif len(args) > 2:
        parser.error("Too many parameters.")

    input_filename = args[0]
    quality_filename = None
    pe_filename = None
    if input_filename.endswith('.qual'):
        # A QUAL file is only meaningful as the second argument. This is
        # checked first so that a swapped FASTA/QUAL pair is not mistaken
        # for paired-end input below.
        if len(args) == 2 and args[1].endswith('fasta'):
            parser.error(
                "FASTA and QUAL file given, but the FASTA file must be first.")
        if len(args) == 1 or not args[1].endswith('.qual'):
            parser.error("Need a FASTA file in addition to the QUAL file.")
    if len(args) == 2:
        if args[1].endswith('.qual'):
            quality_filename = args[1]
        else:
            pe_filename = args[1]
            if not options.paired_output:
                parser.error(
                    'you must use --paired-output when trimming paired-end reads'
                )

    if len(args) == 1 and options.paired_output:
        parser.error(
            "You specified a --paired-output file, but gave only one input file."
        )

    if options.format is not None and options.format.lower() not in [
            'fasta', 'fastq', 'sra-fastq'
    ]:
        parser.error(
            "The input file format must be either 'fasta', 'fastq' or 'sra-fastq' (not '{0}')."
            .format(options.format))

    # TODO should this really be an error?
    if options.format is not None and quality_filename is not None:
        parser.error(
            "If a pair of .fasta and .qual files is given, the -f/--format parameter cannot be used."
        )

    # default output files (overwritten below)
    too_short_outfile = None  # too short reads go here
    too_long_outfile = None  # too long reads go here
    pe_outfile = None
    # Read output files opened by this function. They are closed after
    # processing; a trimmed_outfile passed in by the caller is not.
    opened_outfiles = []
    if options.output is not None:
        trimmed_outfile = xopen(options.output, 'w')
        opened_outfiles.append(trimmed_outfile)
    untrimmed_outfile = trimmed_outfile  # reads without adapters go here
    if options.untrimmed_output is not None:
        untrimmed_outfile = xopen(options.untrimmed_output, 'w')
        opened_outfiles.append(untrimmed_outfile)
    if options.too_short_output is not None:
        too_short_outfile = xopen(options.too_short_output, 'w')
    if options.too_long_output is not None:
        too_long_outfile = xopen(options.too_long_output, 'w')
    if options.paired_output:
        pe_outfile = xopen(options.paired_output, 'w')
        opened_outfiles.append(pe_outfile)

    if options.maq:
        # --maq is a shortcut that switches on a fixed set of other options.
        options.colorspace = True
        options.double_encode = True
        options.trim_primer = True
        options.strip_suffix.append('_F3')
        options.suffix = "/1"
        options.zero_cap = True
    if options.trim_primer and not options.colorspace:
        parser.error("Trimming the primer makes only sense in color space.")
    if options.double_encode and not options.colorspace:
        parser.error("Double-encoding makes only sense in color space.")
    if options.anywhere and options.colorspace:
        parser.error(
            "Using --anywhere with color space reads is currently not supported (if you think this may be useful, contact the author)."
        )
    if not (0 <= options.error_rate <= 1.):
        parser.error("The maximum error rate must be between 0 and 1.")
    if options.overlap < 1:
        parser.error("The overlap must be at least 1.")

    # From here on, the file-name options are replaced by open file objects.
    if options.rest_file is not None:
        options.rest_file = xopen(options.rest_file, 'w')
        rest_writer = RestFileWriter(options.rest_file)
    else:
        rest_writer = None
    if options.info_file is not None:
        options.info_file = xopen(options.info_file, 'w')
    if options.wildcard_file is not None:
        options.wildcard_file = xopen(options.wildcard_file, 'w')

    adapters = []

    def parse_adapter_name(seq):
        """
        Parse an adapter given as 'name=adapt' into 'name' and 'adapt'.

        The name is None if no '=' is present.
        """
        fields = seq.split('=', 1)
        if len(fields) > 1:
            name, seq = fields
            name = name.strip()
        else:
            name = None
        seq = seq.strip()
        return name, seq

    ADAPTER_CLASS = ColorspaceAdapter if options.colorspace else Adapter

    def append_adapters(adapter_list, where):
        """
        Create an adapter of type `where` for each entry of adapter_list
        and append it to `adapters`.
        """
        for seq in adapter_list:
            name, seq = parse_adapter_name(seq)
            w = where
            if w == FRONT and seq.startswith('^'):
                # A leading '^' anchors a 5' adapter
                seq = seq[1:]
                w = PREFIX
            elif not options.indels:
                parser.error(
                    "Not allowing indels is currently supported only for anchored 5' adapters."
                )
            if not seq:
                parser.error("The adapter sequence is empty")
            adapter = ADAPTER_CLASS(seq,
                                    w,
                                    options.error_rate,
                                    options.overlap,
                                    options.match_read_wildcards,
                                    options.match_adapter_wildcards,
                                    name=name,
                                    indels=options.indels)
            adapters.append(adapter)

    append_adapters(options.adapters, BACK)
    append_adapters(options.anywhere, ANYWHERE)
    append_adapters(options.front, FRONT)

    # make sure these aren't used by accident
    del options.adapters
    del options.anywhere
    del options.front

    if not adapters and options.quality_cutoff == 0 and options.cut == 0:
        parser.error("You need to provide at least one adapter sequence.")

    # Create the processing pipeline as a list of "modifiers".
    modifiers = []
    if options.cut:
        modifiers.append(UnconditionalCutter(options.cut))
    if options.quality_cutoff > 0:
        modifiers.append(
            QualityTrimmer(options.quality_cutoff, options.quality_base))
    if adapters:
        adapter_cutter = RepeatedAdapterCutter(adapters, options.times,
                                               options.wildcard_file,
                                               options.info_file, options.trim,
                                               rest_writer,
                                               options.mask_adapter)
        modifiers.append(adapter_cutter)
    else:
        adapter_cutter = None
    if options.length_tag:
        modifiers.append(LengthTagModifier(options.length_tag))
    if options.strip_f3:
        options.strip_suffix.append('_F3')
    for suffix in options.strip_suffix:
        modifiers.append(SuffixRemover(suffix))
    if options.prefix or options.suffix:
        modifiers.append(PrefixSuffixAdder(options.prefix, options.suffix))
    if options.double_encode:
        modifiers.append(DoubleEncoder())
    if options.zero_cap:
        modifiers.append(ZeroCapper(quality_base=options.quality_base))
    if options.trim_primer:
        modifiers.append(PrimerTrimmer)

    readfilter = ReadFilter(options.minimum_length, options.maximum_length,
                            too_short_outfile, too_long_outfile,
                            options.discard_trimmed, options.discard_untrimmed)
    # time.clock() was removed in Python 3.8; time.time() is available in
    # every Python version.
    start_time = time.time()
    try:
        reader = read_sequences(input_filename,
                                quality_filename,
                                colorspace=options.colorspace,
                                fileformat=options.format)
        if pe_filename:
            pe_reader = read_sequences(pe_filename,
                                       None,
                                       colorspace=options.colorspace,
                                       fileformat=options.format)
        else:
            pe_reader = None
        stats = process_reads(reader, pe_reader, modifiers, readfilter,
                              trimmed_outfile, untrimmed_outfile, pe_outfile)
    except IOError as e:
        if e.errno == errno.EPIPE:
            sys.exit(1)
        raise
    except seqio.FormatError as e:
        print("Error:", e, file=sys.stderr)
        sys.exit(1)
    # close open files (but never the standard streams)
    for f in opened_outfiles + [
            options.rest_file, options.wildcard_file, options.info_file,
            too_short_outfile, too_long_outfile
    ]:
        if f is not None and f is not sys.stdin and f is not sys.stdout:
            f.close()
    # send statistics to stderr if result was sent to stdout
    stat_file = sys.stderr if options.output is None else None

    print_statistics(adapters,
                     time.time() - start_time,
                     stats,
                     options.trim,
                     adapter_cutter.reads_matched if adapter_cutter else 0,
                     options.error_rate,
                     readfilter.too_short,
                     readfilter.too_long,
                     cmdlineargs,
                     file=stat_file)