def main(cmdlineargs=None, default_outfile=sys.stdout):
    """
    Main function that evaluates command-line parameters and iterates
    over all reads.

    cmdlineargs -- list of command-line arguments; ``sys.argv[1:]`` is used
        when this is None.
    default_outfile -- the file to which trimmed reads are sent if the ``-o``
        parameter is not used.

    Invalid parameter combinations are reported through ``parser.error()``.
    The process exits with status 130 on KeyboardInterrupt, with status 1 on
    a broken pipe (EPIPE), and with an error message if the input raises
    ``seqio.FormatError`` or ``EOFError``.

    Note that several attributes of the parsed ``options`` object are
    modified in place (e.g. file names are replaced by open file objects).
    """
    logging.basicConfig(level=logging.INFO, format='%(message)s')  # %(levelname)s
    parser = get_option_parser()
    if cmdlineargs is None:
        cmdlineargs = sys.argv[1:]
    options, args = parser.parse_args(args=cmdlineargs)

    if len(args) == 0:
        parser.error(
            "At least one parameter needed: name of a FASTA or FASTQ file.")
    elif len(args) > 2:
        parser.error("Too many parameters.")
    input_filename = args[0]

    # Find out which 'mode' we need to use.
    # Default: single-read trimming (neither -p nor -A/-G/-B/-U given)
    paired = False
    if options.paired_output:
        # Modify first read only, keep second in sync (-p given, but not -A/-G/-B/-U).
        # This exists for backwards compatibility ('legacy mode').
        paired = 'first'
    if options.adapters2 or options.front2 or options.anywhere2 or options.cut2:
        # Full paired-end trimming when both -p and -A/-G/-B/-U given
        # Also the read modifications (such as quality trimming) are applied
        # to second read.
        paired = 'both'

    if paired and len(args) == 1:
        parser.error(
            "When paired-end trimming is enabled via -A/-G/-B/-U or -p, "
            "two input files are required.")

    # The second positional argument is either the second-read file (paired
    # mode) or a .qual file accompanying a FASTA file (single-end mode).
    if paired:
        input_paired_filename = args[1]
        quality_filename = None
    else:
        input_paired_filename = None
        if len(args) == 2:
            if args[0].endswith('.qual'):
                parser.error("The QUAL file must be the second argument.")
            quality_filename = args[1]
        else:
            quality_filename = None

    if paired:
        if not options.paired_output:
            parser.error(
                "When paired-end trimming is enabled via -A/-G/-B/-U, "
                "a second output file needs to be specified via -p (--paired-output)."
            )
        if bool(options.untrimmed_output) != bool(
                options.untrimmed_paired_output):
            parser.error(
                "When trimming paired-end reads, you must use either none "
                "or both of the --untrimmed-output/--untrimmed-paired-output options."
            )
    else:
        if options.untrimmed_paired_output:
            parser.error(
                "Option --untrimmed-paired-output can only be used when "
                "trimming paired-end reads (with option -p).")
    if input_filename.endswith('.qual'):
        parser.error("Need a FASTA file in addition to the QUAL file.")
    if options.format is not None and quality_filename is not None:
        parser.error(
            "If a pair of .fasta and .qual files is given, the -f/--format parameter cannot be used."
        )

    if options.format is not None and options.format.lower() not in [
            'fasta', 'fastq', 'sra-fastq'
    ]:
        parser.error(
            "The input file format must be either 'fasta', 'fastq' or "
            "'sra-fastq' (not '{0}').".format(options.format))

    # Quality cutoff is given either as "N" (stored as [0, N]) or as "M,N".
    if options.quality_cutoff is not None:
        cutoffs = options.quality_cutoff.split(',')
        if len(cutoffs) == 1:
            try:
                cutoffs = [0, int(cutoffs[0])]
            except ValueError as e:
                parser.error(
                    "Quality cutoff value not recognized: {0}".format(e))
        elif len(cutoffs) == 2:
            try:
                cutoffs = [int(cutoffs[0]), int(cutoffs[1])]
            except ValueError as e:
                parser.error(
                    "Quality cutoff value not recognized: {0}".format(e))
        else:
            parser.error(
                "Expected one value or two values separated by comma for the quality cutoff"
            )
    else:
        cutoffs = None

    # Build the list of filters/writers in the order they are consulted.
    writers = []
    too_short_outfile = None  # too short reads go here
    # TODO pass file name to TooShortReadFilter, add a .close() method?
    if options.minimum_length > 0:
        if options.too_short_output:
            too_short_outfile = xopen(options.too_short_output, 'w')
        writers.append(
            TooShortReadFilter(options.minimum_length, too_short_outfile,
                               paired == 'both'))
    too_long_outfile = None  # too long reads go here
    if options.maximum_length < sys.maxsize:
        if options.too_long_output is not None:
            too_long_outfile = xopen(options.too_long_output, 'w')
        writers.append(
            TooLongReadFilter(options.maximum_length, too_long_outfile,
                              check_second=paired == 'both'))

    if options.max_n != -1:
        writers.append(
            NContentFilter(options.max_n, check_second=paired == 'both'))

    demultiplexer = None
    if options.output is not None and '{name}' in options.output:
        # A '{name}' placeholder in the -o argument requests demultiplexing.
        if options.discard_trimmed:
            parser.error("Do not use --discard-trimmed when demultiplexing.")
        if paired:
            parser.error(
                "Demultiplexing not supported for paired-end files, yet.")
        untrimmed = options.output.format(name='unknown')
        if options.untrimmed_output:
            untrimmed = options.untrimmed_output
        if options.discard_untrimmed:
            untrimmed = None
        demultiplexer = Demultiplexer(options.output, untrimmed)
        writers.append(demultiplexer)
        trimmed_outfile, untrimmed_outfile = None, None
        trimmed_paired_outfile, untrimmed_paired_outfile = None, None
    else:
        trimmed_outfile, untrimmed_outfile = trimmed_and_untrimmed_files(
            default_outfile, options.output, options.untrimmed_output,
            options.discard_trimmed, options.discard_untrimmed)
        trimmed_paired_outfile, untrimmed_paired_outfile = trimmed_and_untrimmed_files(
            None,  # applies when not trimming paired-end data
            options.paired_output,
            options.untrimmed_paired_output,
            options.discard_trimmed,
            options.discard_untrimmed)
        if untrimmed_outfile or untrimmed_paired_outfile:
            writers.append(
                DiscardUntrimmedFilter(untrimmed_outfile,
                                       untrimmed_paired_outfile,
                                       check_second=paired == 'both'))
        writers.append(
            DiscardTrimmedFilter(trimmed_outfile, trimmed_paired_outfile,
                                 check_second=paired == 'both'))

    # --maq is a shortcut that switches on a group of colorspace options.
    if options.maq:
        options.colorspace = True
        options.double_encode = True
        options.trim_primer = True
        options.strip_suffix.append('_F3')
        options.suffix = "/1"
    if options.zero_cap is None:
        options.zero_cap = options.colorspace
    if options.trim_primer and not options.colorspace:
        parser.error("Trimming the primer makes only sense in colorspace.")
    if options.double_encode and not options.colorspace:
        parser.error("Double-encoding makes only sense in colorspace.")
    if options.anywhere and options.colorspace:
        parser.error(
            "Using --anywhere with colorspace reads is currently not supported (if you think this may be useful, contact the author)."
        )
    if not (0 <= options.error_rate <= 1.):
        parser.error("The maximum error rate must be between 0 and 1.")
    if options.overlap < 1:
        parser.error("The overlap must be at least 1.")

    # From here on, these option attributes hold open file objects instead
    # of file names.
    if options.rest_file is not None:
        options.rest_file = xopen(options.rest_file, 'w')
        rest_writer = RestFileWriter(options.rest_file)
    else:
        rest_writer = None
    if options.info_file is not None:
        options.info_file = xopen(options.info_file, 'w')
    if options.wildcard_file is not None:
        options.wildcard_file = xopen(options.wildcard_file, 'w')

    if options.colorspace:
        if options.match_read_wildcards:
            parser.error('IUPAC wildcards not supported in colorspace')
        options.match_adapter_wildcards = False

    ADAPTER_CLASS = ColorspaceAdapter if options.colorspace else Adapter
    try:
        # TODO refactor this a bit
        def collect(back, anywhere, front):
            """Create adapter objects for one read from the given option lists."""
            adapters = []
            for name, seq, where in gather_adapters(back, anywhere, front):
                if not seq:
                    parser.error("The adapter sequence is empty.")
                adapter = ADAPTER_CLASS(seq, where, options.error_rate,
                                        options.overlap,
                                        options.match_read_wildcards,
                                        options.match_adapter_wildcards,
                                        name=name, indels=options.indels)
                if options.debug:
                    adapter.enable_debug()
                adapters.append(adapter)
            return adapters

        adapters = collect(options.adapters, options.anywhere, options.front)
        adapters2 = collect(options.adapters2, options.anywhere2,
                            options.front2)
    except IOError as e:
        # Only a missing file is reported as a usage error.
        if e.errno == errno.ENOENT:
            parser.error(e)
        raise

    # Refuse to run when the user requested no operation at all.
    if not adapters and not adapters2 and not cutoffs and \
            options.cut == [] and options.cut2 == [] and \
            options.minimum_length == 0 and \
            options.maximum_length == sys.maxsize and \
            quality_filename is None and \
            options.max_n == -1:
        parser.error("You need to provide at least one adapter sequence.")

    try:
        reader = seqio.open(input_filename, file2=input_paired_filename,
                            qualfile=quality_filename,
                            colorspace=options.colorspace,
                            fileformat=options.format)
    except (seqio.UnknownFileType, IOError) as e:
        parser.error(e)

    # Create the processing pipeline consisting of a list of "modifiers".
    modifiers = []
    if options.cut:
        if len(options.cut) > 2:
            parser.error("You cannot remove bases from more than two ends.")
        # Two values with the same sign would both cut from the same end.
        if len(options.cut) == 2 and options.cut[0] * options.cut[1] > 0:
            parser.error("You cannot remove bases from the same end twice.")
        for cut in options.cut:
            if cut != 0:
                modifiers.append(UnconditionalCutter(cut))

    if cutoffs:
        modifiers.append(
            QualityTrimmer(cutoffs[0], cutoffs[1], options.quality_base))
    if adapters:
        modifiers.append(
            AdapterCutter(adapters, options.times, options.wildcard_file,
                          options.info_file, rest_writer, options.action))

    # Modifiers that apply to both reads of paired-end reads
    modifiers_both = []
    if options.trim_n:
        modifiers_both.append(NEndTrimmer())
    if options.length_tag:
        modifiers_both.append(LengthTagModifier(options.length_tag))
    if options.strip_f3:
        options.strip_suffix.append('_F3')
    for suffix in options.strip_suffix:
        modifiers_both.append(SuffixRemover(suffix))
    if options.prefix or options.suffix:
        modifiers_both.append(PrefixSuffixAdder(options.prefix, options.suffix))
    if options.double_encode:
        modifiers_both.append(DoubleEncoder())
    if options.zero_cap and reader.delivers_qualities:
        modifiers_both.append(ZeroCapper(quality_base=options.quality_base))
    if options.trim_primer:
        # NOTE(review): PrimerTrimmer is appended without being called, unlike
        # the other modifiers -- confirm it is a plain callable, not a class.
        modifiers_both.append(PrimerTrimmer)
    modifiers.extend(modifiers_both)

    # For paired-end data, create a second processing pipeline.
    # However, if no second-read adapters were given (via -A/-G/-B/-U), we need to
    # be backwards compatible and *no modifications* are done to the second read.
    modifiers2 = []
    if paired == 'both':
        if options.cut2:
            if len(options.cut2) > 2:
                parser.error(
                    "You cannot remove bases from more than two ends.")
            if len(options.cut2) == 2 and options.cut2[0] * options.cut2[1] > 0:
                parser.error(
                    "You cannot remove bases from the same end twice.")
            for cut in options.cut2:
                if cut != 0:
                    modifiers2.append(UnconditionalCutter(cut))

        if cutoffs:
            modifiers2.append(
                QualityTrimmer(cutoffs[0], cutoffs[1], options.quality_base))
        if adapters2:
            modifiers2.append(
                AdapterCutter(adapters2, options.times, None, None, None,
                              options.action))
        modifiers2.extend(modifiers_both)

    # Due to backwards compatibility, from here on logging output needs to be
    # sent to standard output instead of standard error if the -o option is used.
    if options.output:
        logger.root.handlers = []
        logging.basicConfig(level=logging.INFO, format='%(message)s',
                            stream=sys.stdout)
    logger.info("This is cutadapt %s with Python %s", __version__,
                platform.python_version())
    logger.info("Command line parameters: %s", " ".join(cmdlineargs))
    logger.info(
        "Trimming %s adapter%s with at most %.1f%% errors in %s mode ...",
        len(adapters) + len(adapters2),
        's' if len(adapters) + len(adapters2) != 1 else '',
        options.error_rate * 100, {
            False: 'single-end',
            'first': 'paired-end legacy',
            'both': 'paired-end'
        }[paired])

    # time.clock() no longer exists in Python 3.8+; prefer perf_counter and
    # fall back to time.time on interpreters that do not provide it.
    timer = getattr(time, 'perf_counter', time.time)
    start_time = timer()
    try:
        if paired:
            stats = process_paired_reads(reader, modifiers, modifiers2,
                                         writers)
        else:
            stats = process_single_reads(reader, modifiers, writers)
    except KeyboardInterrupt:
        print("Interrupted", file=sys.stderr)
        sys.exit(130)
    except IOError as e:
        if e.errno == errno.EPIPE:
            sys.exit(1)
        raise
    except (seqio.FormatError, EOFError) as e:
        sys.exit("cutadapt: error: {0}".format(e))

    # close open files (each one exactly once; never the standard streams)
    for f in [
            trimmed_outfile, untrimmed_outfile, trimmed_paired_outfile,
            untrimmed_paired_outfile, options.rest_file,
            options.wildcard_file, options.info_file, too_short_outfile,
            too_long_outfile, demultiplexer
    ]:
        if f is not None and f is not sys.stdin and f is not sys.stdout:
            f.close()

    elapsed_time = timer() - start_time
    if not options.quiet:
        stats.collect((adapters, adapters2), elapsed_time, modifiers,
                      modifiers2, writers)
        # send statistics to stderr if result was sent to stdout
        stat_file = sys.stderr if options.output is None else None
        with redirect_standard_output(stat_file):
            print_report(stats, (adapters, adapters2))
def pipeline_from_parsed_args(options, paired, pair_filter_mode, quality_filename, is_interleaved_output):
    """
    Setup a processing pipeline from parsed command-line options.

    options -- the parsed options object. Several of its attributes are
        modified in place (for example the colorspace-related flags when
        ``maq`` is set, ``zero_cap``, ``action`` and ``strip_suffix``).
    paired -- False for single-end data, otherwise 'first' or 'both'
        (the value 'first' turns on ``modify_first_read_only``).
    pair_filter_mode -- passed unchanged to PairedEndPipeline.
    quality_filename -- name of a separate quality file or None; only used
        here to reject a simultaneous -f/--format.
    is_interleaved_output -- when true, the checks that require -o and -p
        for paired-end data are skipped.

    If there are any problems parsing the arguments, a CommandLineError is thrown.

    Return an instance of Pipeline (SingleEndPipeline or PairedEndPipeline)
    """
    # --- Consistency checks between input mode and output options ---
    if not paired:
        if options.untrimmed_paired_output:
            raise CommandLineError("Option --untrimmed-paired-output can only be used when "
                "trimming paired-end reads (with option -p).")

    if paired:
        if not is_interleaved_output:
            if not options.paired_output:
                raise CommandLineError("When paired-end trimming is enabled via -A/-G/-B/-U, "
                    "a second output file needs to be specified via -p (--paired-output).")
            if not options.output:
                raise CommandLineError("When you use -p or --paired-output, you must also "
                    "use the -o option.")

        # Either both or none of the two untrimmed outputs must be given.
        if bool(options.untrimmed_output) != bool(options.untrimmed_paired_output):
            raise CommandLineError("When trimming paired-end reads, you must use either none "
                "or both of the --untrimmed-output/--untrimmed-paired-output options.")
        if options.too_short_output and not options.too_short_paired_output:
            raise CommandLineError("When using --too-short-output with paired-end "
                "reads, you also need to use --too-short-paired-output")
        if options.too_long_output and not options.too_long_paired_output:
            raise CommandLineError("When using --too-long-output with paired-end "
                "reads, you also need to use --too-long-paired-output")
    elif quality_filename is not None:
        if options.format is not None:
            raise CommandLineError('If a pair of .fasta and .qual files is given, the -f/--format '
                'parameter cannot be used.')

    if options.format is not None and options.format.lower() not in ['fasta', 'fastq', 'sra-fastq']:
        raise CommandLineError("The input file format must be either 'fasta', 'fastq' or "
            "'sra-fastq' (not '{0}').".format(options.format))

    # --- Colorspace-related options ---
    # --maq is a shortcut that switches on a group of colorspace options.
    if options.maq:
        options.colorspace = True
        options.double_encode = True
        options.trim_primer = True
        options.strip_suffix.append('_F3')
        options.suffix = "/1"
    # When not set explicitly, zero-capping follows the colorspace setting.
    if options.zero_cap is None:
        options.zero_cap = options.colorspace
    if options.trim_primer and not options.colorspace:
        raise CommandLineError("Trimming the primer makes only sense in colorspace.")
    if options.double_encode and not options.colorspace:
        raise CommandLineError("Double-encoding makes only sense in colorspace.")
    if options.anywhere and options.colorspace:
        raise CommandLineError("Using --anywhere with colorspace reads is currently not supported "
            "(if you think this may be useful, contact the author).")

    # --- Range checks on numeric options ---
    if not (0 <= options.error_rate <= 1.):
        raise CommandLineError("The maximum error rate must be between 0 and 1.")
    if options.overlap < 1:
        raise CommandLineError("The overlap must be at least 1.")
    if not (0 <= options.gc_content <= 100):
        raise CommandLineError("GC content must be given as percentage between 0 and 100")

    # The literal string 'none' on the command line stands for "no action".
    if options.action == 'none':
        options.action = None

    if options.colorspace:
        if options.match_read_wildcards:
            raise CommandLineError('IUPAC wildcards not supported in colorspace')
        options.match_adapter_wildcards = False

    # --- Parse adapter specifications for first and second read ---
    adapter_parser = AdapterParser(
        colorspace=options.colorspace,
        max_error_rate=options.error_rate,
        min_overlap=options.overlap,
        read_wildcards=options.match_read_wildcards,
        adapter_wildcards=options.match_adapter_wildcards,
        indels=options.indels)

    try:
        adapters = adapter_parser.parse_multi(options.adapters, options.anywhere, options.front)
        adapters2 = adapter_parser.parse_multi(options.adapters2, options.anywhere2, options.front2)
    except IOError as e:
        # Only a missing file is turned into a CommandLineError; any other
        # I/O error propagates unchanged.
        if e.errno == errno.ENOENT:
            raise CommandLineError(e)
        raise
    except ValueError as e:
        raise CommandLineError(e)
    if options.debug:
        for adapter in adapters + adapters2:
            adapter.enable_debug()

    # Create the processing pipeline.
    # If no second-read adapters were given (via -A/-G/-B/-U), we need to
    # be backwards compatible and *no modifications* are done to the second read.
    if paired:
        pipeline = PairedEndPipeline(pair_filter_mode, modify_first_read_only=paired == 'first')
    else:
        pipeline = SingleEndPipeline()

    # Modifiers are registered in the order below via add1() / add2() / add().
    # NOTE(review): presumably add1/add2 target the first/second read and
    # add() targets both -- confirm against the Pipeline classes.
    if options.cut:
        if len(options.cut) > 2:
            raise CommandLineError("You cannot remove bases from more than two ends.")
        # Two values with the same sign would both cut from the same end.
        if len(options.cut) == 2 and options.cut[0] * options.cut[1] > 0:
            raise CommandLineError("You cannot remove bases from the same end twice.")
        for cut in options.cut:
            if cut != 0:
                pipeline.add1(UnconditionalCutter(cut))

    if options.cut2:
        if len(options.cut2) > 2:
            raise CommandLineError("You cannot remove bases from more than two ends.")
        if len(options.cut2) == 2 and options.cut2[0] * options.cut2[1] > 0:
            raise CommandLineError("You cannot remove bases from the same end twice.")
        for cut in options.cut2:
            if cut != 0:
                pipeline.add2(UnconditionalCutter(cut))

    if options.nextseq_trim is not None:
        pipeline.add(NextseqQualityTrimmer(options.nextseq_trim, options.quality_base))
    if options.quality_cutoff is not None:
        cutoffs = parse_cutoffs(options.quality_cutoff)
        pipeline.add(QualityTrimmer(cutoffs[0], cutoffs[1], options.quality_base))

    if adapters:
        adapter_cutter = AdapterCutter(adapters, options.times, options.action)
        pipeline.add1(adapter_cutter)
    if adapters2:
        adapter_cutter2 = AdapterCutter(adapters2, options.times, options.action)
        pipeline.add2(adapter_cutter2)

    # Modifiers that apply to both reads of paired-end reads unless in legacy mode
    if options.length is not None:
        pipeline.add(Shortener(options.length))
    if options.trim_n:
        pipeline.add(NEndTrimmer())
    if options.length_tag:
        pipeline.add(LengthTagModifier(options.length_tag))
    if options.strip_f3:
        options.strip_suffix.append('_F3')
    for suffix in options.strip_suffix:
        pipeline.add(SuffixRemover(suffix))
    if options.prefix or options.suffix:
        pipeline.add(PrefixSuffixAdder(options.prefix, options.suffix))
    if options.double_encode:
        pipeline.add(DoubleEncoder())
    if options.zero_cap:
        pipeline.add(ZeroCapper(quality_base=options.quality_base))
    if options.trim_primer:
        pipeline.add(PrimerTrimmer())

    # Set filtering parameters
    # Minimum/maximum length
    for attr in 'minimum_length', 'maximum_length':
        param = getattr(options, attr)
        if param is not None:
            lengths = parse_lengths(param)
            if not paired and len(lengths) == 2:
                raise CommandLineError('Two minimum or maximum lengths given for single-end data')
            # A single value given for paired-end data is used for both reads.
            if paired and len(lengths) == 1:
                lengths = (lengths[0], lengths[0])
            setattr(pipeline, attr, lengths)
    # The remaining filter settings are copied onto the pipeline unchanged.
    pipeline.max_n = options.max_n
    pipeline.discard_casava = options.discard_casava
    pipeline.discard_trimmed = options.discard_trimmed
    pipeline.discard_untrimmed = options.discard_untrimmed

    return pipeline
def main(cmdlineargs=None, default_outfile=sys.stdout):
    """
    Main function that evaluates command-line parameters and iterates
    over all reads.

    cmdlineargs -- list of command-line arguments; ``sys.argv[1:]`` is used
        when this is None.
    default_outfile -- the file to which trimmed reads are sent if the ``-o``
        parameter is not used.

    Invalid parameter combinations are reported through ``parser.error()``.
    The process exits with status 130 on KeyboardInterrupt, with status 1 on
    a broken pipe (EPIPE), and with an error message if the input raises
    ``seqio.FormatError`` or ``EOFError``.

    Note that several attributes of the parsed ``options`` object are
    modified in place (e.g. file names are replaced by open file objects).
    """
    parser = get_option_parser()
    if cmdlineargs is None:
        cmdlineargs = sys.argv[1:]
    options, args = parser.parse_args(args=cmdlineargs)
    # Setup logging only if there are not already any handlers (can happen when
    # this function is being called externally such as from unit tests)
    if not logging.root.handlers:
        setup_logging(stdout=bool(options.output), quiet=options.quiet)

    if len(args) == 0:
        parser.error(
            "At least one parameter needed: name of a FASTA or FASTQ file.")
    elif len(args) > 2:
        parser.error("Too many parameters.")
    input_filename = args[0]
    if input_filename.endswith('.qual'):
        parser.error(
            "If a .qual file is given, it must be the second argument.")

    # Find out which 'mode' we need to use.
    # Default: single-read trimming (neither -p nor -A/-G/-B/-U/--interleaved given)
    paired = False
    if options.paired_output:
        # Modify first read only, keep second in sync (-p given, but not -A/-G/-B/-U).
        # This exists for backwards compatibility ('legacy mode').
        paired = 'first'
    # Any of these options switch off legacy mode
    if (options.adapters2 or options.front2 or options.anywhere2
            or options.cut2 or options.interleaved or options.pair_filter
            or options.too_short_paired_output
            or options.too_long_paired_output):
        # Full paired-end trimming when both -p and -A/-G/-B/-U given
        # Read modifications (such as quality trimming) are applied also to second read.
        paired = 'both'

    if paired and len(args) == 1 and not options.interleaved:
        parser.error("When paired-end trimming is enabled via -A/-G/-B/-U/"
                     "--interleaved or -p, two input files are required.")
    if not paired:
        if options.untrimmed_paired_output:
            parser.error(
                "Option --untrimmed-paired-output can only be used when "
                "trimming paired-end reads (with option -p).")

    # With --interleaved, input is interleaved when only one input file is
    # given and output is interleaved when no -p file is given.
    interleaved_input = False
    interleaved_output = False
    if options.interleaved:
        interleaved_input = len(args) == 1
        interleaved_output = not options.paired_output
        if not interleaved_input and not interleaved_output:
            parser.error(
                "When --interleaved is used, you cannot provide both two input files and two output files"
            )

    # Assign input_paired_filename and quality_filename
    input_paired_filename = None
    quality_filename = None
    if paired:
        if not interleaved_input:
            input_paired_filename = args[1]
        if not interleaved_output:
            if not options.paired_output:
                parser.error(
                    "When paired-end trimming is enabled via -A/-G/-B/-U, "
                    "a second output file needs to be specified via -p (--paired-output)."
                )
            if not options.output:
                parser.error(
                    "When you use -p or --paired-output, you must also "
                    "use the -o option.")

        if bool(options.untrimmed_output) != bool(
                options.untrimmed_paired_output):
            parser.error(
                "When trimming paired-end reads, you must use either none "
                "or both of the --untrimmed-output/--untrimmed-paired-output options."
            )
        if options.too_short_output and not options.too_short_paired_output:
            parser.error(
                "When using --too-short-output with paired-end "
                "reads, you also need to use --too-short-paired-output")
        if options.too_long_output and not options.too_long_paired_output:
            parser.error(
                "When using --too-long-output with paired-end "
                "reads, you also need to use --too-long-paired-output")
    elif len(args) == 2:
        # In single-end mode the second argument is a separate quality file.
        quality_filename = args[1]
        if options.format is not None:
            parser.error(
                "If a pair of .fasta and .qual files is given, the -f/--format parameter cannot be used."
            )

    if options.format is not None and options.format.lower() not in [
            'fasta', 'fastq', 'sra-fastq'
    ]:
        parser.error(
            "The input file format must be either 'fasta', 'fastq' or "
            "'sra-fastq' (not '{0}').".format(options.format))

    # --maq is a shortcut that switches on a group of colorspace options.
    # This must happen before the reader, the writer factory and the
    # demultiplexer are created, because all of them capture the value of
    # options.colorspace at construction time.
    if options.maq:
        options.colorspace = True
        options.double_encode = True
        options.trim_primer = True
        options.strip_suffix.append('_F3')
        options.suffix = "/1"

    # Open input file(s)
    try:
        reader = seqio.open(input_filename, file2=input_paired_filename,
                            qualfile=quality_filename,
                            colorspace=options.colorspace,
                            fileformat=options.format,
                            interleaved=interleaved_input)
    except (seqio.UnknownFileType, IOError) as e:
        parser.error(e)

    # Quality cutoff is given either as "N" (stored as [0, N]) or as "M,N".
    if options.quality_cutoff is not None:
        cutoffs = options.quality_cutoff.split(',')
        if len(cutoffs) == 1:
            try:
                cutoffs = [0, int(cutoffs[0])]
            except ValueError as e:
                parser.error(
                    "Quality cutoff value not recognized: {0}".format(e))
        elif len(cutoffs) == 2:
            try:
                cutoffs = [int(cutoffs[0]), int(cutoffs[1])]
            except ValueError as e:
                parser.error(
                    "Quality cutoff value not recognized: {0}".format(e))
        else:
            parser.error(
                "Expected one value or two values separated by comma for the quality cutoff"
            )
    else:
        cutoffs = None

    # Factory for output writers that match the properties of the input.
    open_writer = functools.partial(seqio.open, mode='w',
                                    qualities=reader.delivers_qualities,
                                    colorspace=options.colorspace)

    if options.pair_filter is None:
        options.pair_filter = 'any'
    min_affected = 2 if options.pair_filter == 'both' else 1
    # Choose the wrapper that combines a writer with a filter predicate.
    if not paired:
        filter_wrapper = Redirector
    elif paired == 'first':
        filter_wrapper = LegacyPairedRedirector
    elif paired == 'both':
        filter_wrapper = functools.partial(PairedRedirector,
                                           min_affected=min_affected)
    filters = []
    # TODO open_files = []
    too_short_writer = None  # too short reads go here
    # TODO pass file name to TooShortReadFilter, add a .close() method?
    if options.minimum_length > 0:
        if options.too_short_output:
            too_short_writer = open_writer(options.too_short_output,
                                           options.too_short_paired_output)
        filters.append(
            filter_wrapper(too_short_writer,
                           TooShortReadFilter(options.minimum_length)))
    too_long_writer = None  # too long reads go here
    if options.maximum_length < sys.maxsize:
        if options.too_long_output is not None:
            too_long_writer = open_writer(options.too_long_output,
                                          options.too_long_paired_output)
        filters.append(
            filter_wrapper(too_long_writer,
                           TooLongReadFilter(options.maximum_length)))

    if options.max_n != -1:
        filters.append(filter_wrapper(None, NContentFilter(options.max_n)))

    if int(options.discard_trimmed) + int(options.discard_untrimmed) + int(
            options.untrimmed_output is not None) > 1:
        parser.error(
            "Only one of the --discard-trimmed, --discard-untrimmed "
            "and --untrimmed-output options can be used at the same time.")
    demultiplexer = None
    untrimmed_writer = None
    writer = None
    if options.output is not None and '{name}' in options.output:
        # A '{name}' placeholder in the -o argument requests demultiplexing.
        if options.discard_trimmed:
            parser.error("Do not use --discard-trimmed when demultiplexing.")
        if paired:
            parser.error(
                "Demultiplexing not supported for paired-end files, yet.")
        untrimmed = options.output.replace('{name}', 'unknown')
        if options.untrimmed_output:
            untrimmed = options.untrimmed_output
        if options.discard_untrimmed:
            untrimmed = None
        demultiplexer = Demultiplexer(options.output, untrimmed,
                                      qualities=reader.delivers_qualities,
                                      colorspace=options.colorspace)
        filters.append(demultiplexer)
    else:
        # Set up the remaining filters to deal with --discard-trimmed,
        # --discard-untrimmed and --untrimmed-output. These options
        # are mutually exclusive in order to avoid brain damage.
        if options.discard_trimmed:
            filters.append(filter_wrapper(None, DiscardTrimmedFilter()))
        elif options.discard_untrimmed:
            filters.append(filter_wrapper(None, DiscardUntrimmedFilter()))
        elif options.untrimmed_output:
            untrimmed_writer = open_writer(options.untrimmed_output,
                                           options.untrimmed_paired_output)
            filters.append(
                filter_wrapper(untrimmed_writer, DiscardUntrimmedFilter()))

        # Finally, figure out where the reads that passed all the previous
        # filters should go.
        if options.output is not None:
            writer = open_writer(options.output, options.paired_output,
                                 interleaved=interleaved_output)
        else:
            writer = open_writer(default_outfile,
                                 interleaved=interleaved_output)
        if not paired:
            filters.append(NoFilter(writer))
        else:
            filters.append(PairedNoFilter(writer))

    # When not set explicitly, zero-capping follows the colorspace setting.
    if options.zero_cap is None:
        options.zero_cap = options.colorspace
    if options.trim_primer and not options.colorspace:
        parser.error("Trimming the primer makes only sense in colorspace.")
    if options.double_encode and not options.colorspace:
        parser.error("Double-encoding makes only sense in colorspace.")
    if options.anywhere and options.colorspace:
        parser.error(
            "Using --anywhere with colorspace reads is currently not supported (if you think this may be useful, contact the author)."
        )
    if not (0 <= options.error_rate <= 1.):
        parser.error("The maximum error rate must be between 0 and 1.")
    if options.overlap < 1:
        parser.error("The overlap must be at least 1.")

    # From here on, these option attributes hold open file objects instead
    # of file names.
    if options.rest_file is not None:
        options.rest_file = xopen(options.rest_file, 'w')
        rest_writer = RestFileWriter(options.rest_file)
    else:
        rest_writer = None
    if options.info_file is not None:
        options.info_file = xopen(options.info_file, 'w')
    if options.wildcard_file is not None:
        options.wildcard_file = xopen(options.wildcard_file, 'w')

    if options.colorspace:
        if options.match_read_wildcards:
            parser.error('IUPAC wildcards not supported in colorspace')
        options.match_adapter_wildcards = False

    adapter_parser = AdapterParser(
        colorspace=options.colorspace,
        max_error_rate=options.error_rate,
        min_overlap=options.overlap,
        read_wildcards=options.match_read_wildcards,
        adapter_wildcards=options.match_adapter_wildcards,
        indels=options.indels)

    try:
        adapters = adapter_parser.parse_multi(options.adapters,
                                              options.anywhere, options.front)
        adapters2 = adapter_parser.parse_multi(options.adapters2,
                                               options.anywhere2,
                                               options.front2)
    except IOError as e:
        # Only a missing file is reported as a usage error.
        if e.errno == errno.ENOENT:
            parser.error(e)
        raise
    except ValueError as e:
        parser.error(e)
    if options.debug:
        for adapter in adapters + adapters2:
            adapter.enable_debug()

    # Create the single-end processing pipeline (a list of "modifiers")
    modifiers = []
    if options.cut:
        if len(options.cut) > 2:
            parser.error("You cannot remove bases from more than two ends.")
        # Two values with the same sign would both cut from the same end.
        if len(options.cut) == 2 and options.cut[0] * options.cut[1] > 0:
            parser.error("You cannot remove bases from the same end twice.")
        for cut in options.cut:
            if cut != 0:
                modifiers.append(UnconditionalCutter(cut))

    if options.nextseq_trim is not None:
        modifiers.append(
            NextseqQualityTrimmer(options.nextseq_trim, options.quality_base))

    if cutoffs:
        modifiers.append(
            QualityTrimmer(cutoffs[0], cutoffs[1], options.quality_base))
    if adapters:
        modifiers.append(
            AdapterCutter(adapters, options.times, options.wildcard_file,
                          options.info_file, rest_writer, options.action))

    # Modifiers that apply to both reads of paired-end reads unless in legacy mode
    modifiers_both = []
    if options.length is not None:
        modifiers_both.append(Shortener(options.length))
    if options.trim_n:
        modifiers_both.append(NEndTrimmer())
    if options.length_tag:
        modifiers_both.append(LengthTagModifier(options.length_tag))
    if options.strip_f3:
        options.strip_suffix.append('_F3')
    for suffix in options.strip_suffix:
        modifiers_both.append(SuffixRemover(suffix))
    if options.prefix or options.suffix:
        modifiers_both.append(PrefixSuffixAdder(options.prefix, options.suffix))
    if options.double_encode:
        modifiers_both.append(DoubleEncoder())
    if options.zero_cap and reader.delivers_qualities:
        modifiers_both.append(ZeroCapper(quality_base=options.quality_base))
    if options.trim_primer:
        # NOTE(review): PrimerTrimmer is appended without being called, unlike
        # the other modifiers -- confirm it is a plain callable, not a class.
        modifiers_both.append(PrimerTrimmer)
    modifiers.extend(modifiers_both)

    # For paired-end data, create a second processing pipeline.
    # However, if no second-read adapters were given (via -A/-G/-B/-U), we need to
    # be backwards compatible and *no modifications* are done to the second read.
    modifiers2 = []
    if paired == 'both':
        if options.cut2:
            if len(options.cut2) > 2:
                parser.error(
                    "You cannot remove bases from more than two ends.")
            if len(options.cut2) == 2 and options.cut2[0] * options.cut2[1] > 0:
                parser.error(
                    "You cannot remove bases from the same end twice.")
            for cut in options.cut2:
                if cut != 0:
                    modifiers2.append(UnconditionalCutter(cut))

        if cutoffs:
            modifiers2.append(
                QualityTrimmer(cutoffs[0], cutoffs[1], options.quality_base))
        if adapters2:
            modifiers2.append(
                AdapterCutter(adapters2, options.times, None, None, None,
                              options.action))
        modifiers2.extend(modifiers_both)

    if paired:
        pipeline = PairedEndPipeline(reader, modifiers, modifiers2, filters)
    else:
        pipeline = SingleEndPipeline(reader, modifiers, filters)

    logger.info("This is cutadapt %s with Python %s", __version__,
                platform.python_version())
    logger.info("Command line parameters: %s", " ".join(cmdlineargs))
    logger.info(
        "Trimming %s adapter%s with at most %.1f%% errors in %s mode ...",
        len(adapters) + len(adapters2),
        's' if len(adapters) + len(adapters2) != 1 else '',
        options.error_rate * 100, {
            False: 'single-end',
            'first': 'paired-end legacy',
            'both': 'paired-end'
        }[paired])

    if paired == 'first' and (modifiers_both or cutoffs):
        logger.warning('\n'.join(
            textwrap.wrap(
                'WARNING: Requested read '
                'modifications are applied only to the first '
                'read since backwards compatibility mode is enabled. '
                'To modify both reads, also use any of the -A/-B/-G/-U options. '
                'Use a dummy adapter sequence when necessary: -A XXX')))

    # time.clock() no longer exists in Python 3.8+; prefer perf_counter and
    # fall back to time.time on interpreters that do not provide it.
    timer = getattr(time, 'perf_counter', time.time)
    start_time = timer()
    try:
        stats = pipeline.run()
    except KeyboardInterrupt:
        print("Interrupted", file=sys.stderr)
        sys.exit(130)
    except IOError as e:
        if e.errno == errno.EPIPE:
            sys.exit(1)
        raise
    except (seqio.FormatError, EOFError) as e:
        sys.exit("cutadapt: error: {0}".format(e))

    # close open files (each one exactly once; never the standard streams)
    for f in [
            writer, untrimmed_writer, options.rest_file,
            options.wildcard_file, options.info_file, too_short_writer,
            too_long_writer, demultiplexer
    ]:
        if f is not None and f is not sys.stdin and f is not sys.stdout:
            f.close()

    elapsed_time = timer() - start_time
    if not options.quiet:
        stats.collect((adapters, adapters2), elapsed_time, modifiers,
                      modifiers2, filters)
        # send statistics to stderr if result was sent to stdout
        stat_file = sys.stderr if options.output is None else None
        with redirect_standard_output(stat_file):
            print_report(stats, (adapters, adapters2))
def pipeline_from_parsed_args(options, args, default_outfile):
    """
    Setup a processing pipeline from parsed command-line options.

    options is the object holding the parsed option values (its attributes
    are read and, for a few implied settings, updated in place).
    args is the list of positional arguments (one or two file names).
    default_outfile is where reads that passed all filters are written if
    the ``-o`` option was not given.

    Returns a PairedEndPipeline when paired-end processing was requested
    and a SingleEndPipeline otherwise. All opened output files are
    registered on the pipeline via register_file_to_close().

    If there are any problems parsing the arguments, a CommandlineError is thrown.
    """
    if len(args) == 0:
        raise CommandlineError("At least one parameter needed: name of a FASTA or FASTQ file.")
    elif len(args) > 2:
        raise CommandlineError("Too many parameters.")
    input_filename = args[0]
    if input_filename.endswith('.qual'):
        raise CommandlineError("If a .qual file is given, it must be the second argument.")

    # Find out which 'mode' we need to use.
    # Default: single-read trimming (neither -p nor -A/-G/-B/-U/--interleaved given)
    paired = False
    if options.paired_output:
        # Modify first read only, keep second in sync (-p given, but not -A/-G/-B/-U).
        # This exists for backwards compatibility ('legacy mode').
        paired = 'first'
    # Any of these options switch off legacy mode
    if (options.adapters2 or options.front2 or options.anywhere2 or
            options.cut2 or options.interleaved or options.pair_filter or
            options.too_short_paired_output or options.too_long_paired_output):
        # Full paired-end trimming when both -p and -A/-G/-B/-U given
        # Read modifications (such as quality trimming) are applied also to second read.
        paired = 'both'

    if paired and len(args) == 1 and not options.interleaved:
        raise CommandlineError("When paired-end trimming is enabled via -A/-G/-B/-U/"
            "--interleaved or -p, two input files are required.")
    if not paired:
        if options.untrimmed_paired_output:
            raise CommandlineError("Option --untrimmed-paired-output can only be used when "
                "trimming paired-end reads (with option -p).")

    interleaved_input = False
    interleaved_output = False
    if options.interleaved:
        interleaved_input = len(args) == 1
        interleaved_output = not options.paired_output
        if not interleaved_input and not interleaved_output:
            raise CommandlineError("When --interleaved is used, you cannot provide both two input files and two output files")

    # Assign input_paired_filename and quality_filename
    input_paired_filename = None
    quality_filename = None
    if paired:
        if not interleaved_input:
            input_paired_filename = args[1]
        if not interleaved_output:
            if not options.paired_output:
                raise CommandlineError("When paired-end trimming is enabled via -A/-G/-B/-U, "
                    "a second output file needs to be specified via -p (--paired-output).")
            if not options.output:
                raise CommandlineError("When you use -p or --paired-output, you must also "
                    "use the -o option.")

        if bool(options.untrimmed_output) != bool(options.untrimmed_paired_output):
            raise CommandlineError("When trimming paired-end reads, you must use either none "
                "or both of the --untrimmed-output/--untrimmed-paired-output options.")
        if options.too_short_output and not options.too_short_paired_output:
            raise CommandlineError("When using --too-short-output with paired-end "
                "reads, you also need to use --too-short-paired-output")
        if options.too_long_output and not options.too_long_paired_output:
            raise CommandlineError("When using --too-long-output with paired-end "
                "reads, you also need to use --too-long-paired-output")
    elif len(args) == 2:
        quality_filename = args[1]
        if options.format is not None:
            raise CommandlineError("If a pair of .fasta and .qual files is given, the -f/--format parameter cannot be used.")

    if options.format is not None and options.format.lower() not in ['fasta', 'fastq', 'sra-fastq']:
        raise CommandlineError("The input file format must be either 'fasta', 'fastq' or "
            "'sra-fastq' (not '{0}').".format(options.format))

    # --maq implies colorspace and a few other settings. This must be applied
    # before any file is opened: the reader, the writers and the demultiplexer
    # below all capture the value of options.colorspace when they are created,
    # so changing it afterwards would leave them in non-colorspace mode.
    if options.maq:
        options.colorspace = True
        options.double_encode = True
        options.trim_primer = True
        options.strip_suffix.append('_F3')
        options.suffix = "/1"

    # Open input file(s)
    try:
        reader = seqio.open(input_filename, file2=input_paired_filename,
            qualfile=quality_filename, colorspace=options.colorspace,
            fileformat=options.format, interleaved=interleaved_input)
    except (seqio.UnknownFileType, IOError) as e:
        raise CommandlineError(e)

    # Parse the quality cutoff: either "CUTOFF" (3' end only) or "CUTOFF5,CUTOFF3"
    if options.quality_cutoff is not None:
        cutoffs = options.quality_cutoff.split(',')
        if len(cutoffs) == 1:
            try:
                cutoffs = [0, int(cutoffs[0])]
            except ValueError as e:
                raise CommandlineError("Quality cutoff value not recognized: {0}".format(e))
        elif len(cutoffs) == 2:
            try:
                cutoffs = [int(cutoffs[0]), int(cutoffs[1])]
            except ValueError as e:
                raise CommandlineError("Quality cutoff value not recognized: {0}".format(e))
        else:
            raise CommandlineError("Expected one value or two values separated by comma for the quality cutoff")
    else:
        cutoffs = None

    open_writer = functools.partial(seqio.open, mode='w',
        qualities=reader.delivers_qualities, colorspace=options.colorspace)

    if options.pair_filter is None:
        options.pair_filter = 'any'
    min_affected = 2 if options.pair_filter == 'both' else 1
    if not paired:
        filter_wrapper = Redirector
    elif paired == 'first':
        filter_wrapper = LegacyPairedRedirector
    elif paired == 'both':
        filter_wrapper = functools.partial(PairedRedirector, min_affected=min_affected)
    filters = []
    # TODO open_files = []
    too_short_writer = None  # too short reads go here
    # TODO pass file name to TooShortReadFilter, add a .close() method?
    if options.minimum_length > 0:
        if options.too_short_output:
            too_short_writer = open_writer(options.too_short_output, options.too_short_paired_output)
        filters.append(filter_wrapper(too_short_writer, TooShortReadFilter(options.minimum_length)))
    too_long_writer = None  # too long reads go here
    if options.maximum_length < sys.maxsize:
        if options.too_long_output is not None:
            too_long_writer = open_writer(options.too_long_output, options.too_long_paired_output)
        filters.append(filter_wrapper(too_long_writer, TooLongReadFilter(options.maximum_length)))

    if options.max_n != -1:
        filters.append(filter_wrapper(None, NContentFilter(options.max_n)))

    if int(options.discard_trimmed) + int(options.discard_untrimmed) + int(options.untrimmed_output is not None) > 1:
        raise CommandlineError("Only one of the --discard-trimmed, --discard-untrimmed "
            "and --untrimmed-output options can be used at the same time.")
    demultiplexer = None
    untrimmed_writer = None
    writer = None
    if options.output is not None and '{name}' in options.output:
        if options.discard_trimmed:
            raise CommandlineError("Do not use --discard-trimmed when demultiplexing.")
        if paired:
            raise CommandlineError("Demultiplexing not supported for paired-end files, yet.")
        untrimmed = options.output.replace('{name}', 'unknown')
        if options.untrimmed_output:
            untrimmed = options.untrimmed_output
        if options.discard_untrimmed:
            untrimmed = None
        demultiplexer = Demultiplexer(options.output, untrimmed,
            qualities=reader.delivers_qualities, colorspace=options.colorspace)
        filters.append(demultiplexer)
    else:
        # Set up the remaining filters to deal with --discard-trimmed,
        # --discard-untrimmed and --untrimmed-output. These options
        # are mutually exclusive in order to avoid brain damage.
        if options.discard_trimmed:
            filters.append(filter_wrapper(None, DiscardTrimmedFilter()))
        elif options.discard_untrimmed:
            filters.append(filter_wrapper(None, DiscardUntrimmedFilter()))
        elif options.untrimmed_output:
            untrimmed_writer = open_writer(options.untrimmed_output,
                options.untrimmed_paired_output)
            filters.append(filter_wrapper(untrimmed_writer, DiscardUntrimmedFilter()))

        # Finally, figure out where the reads that passed all the previous
        # filters should go.
        if options.output is not None:
            writer = open_writer(options.output, options.paired_output, interleaved=interleaved_output)
        else:
            writer = open_writer(default_outfile, interleaved=interleaved_output)
        if not paired:
            filters.append(NoFilter(writer))
        else:
            filters.append(PairedNoFilter(writer))

    if options.zero_cap is None:
        options.zero_cap = options.colorspace
    if options.trim_primer and not options.colorspace:
        raise CommandlineError("Trimming the primer makes only sense in colorspace.")
    if options.double_encode and not options.colorspace:
        raise CommandlineError("Double-encoding makes only sense in colorspace.")
    if options.anywhere and options.colorspace:
        raise CommandlineError("Using --anywhere with colorspace reads is currently not supported (if you think this may be useful, contact the author).")
    if not (0 <= options.error_rate <= 1.):
        raise CommandlineError("The maximum error rate must be between 0 and 1.")
    if options.overlap < 1:
        raise CommandlineError("The overlap must be at least 1.")

    # The file-name options are replaced by the opened file objects
    if options.rest_file is not None:
        options.rest_file = xopen(options.rest_file, 'w')
        rest_writer = RestFileWriter(options.rest_file)
    else:
        rest_writer = None
    if options.info_file is not None:
        options.info_file = xopen(options.info_file, 'w')
    if options.wildcard_file is not None:
        options.wildcard_file = xopen(options.wildcard_file, 'w')

    if options.colorspace:
        if options.match_read_wildcards:
            raise CommandlineError('IUPAC wildcards not supported in colorspace')
        options.match_adapter_wildcards = False

    adapter_parser = AdapterParser(
        colorspace=options.colorspace,
        max_error_rate=options.error_rate,
        min_overlap=options.overlap,
        read_wildcards=options.match_read_wildcards,
        adapter_wildcards=options.match_adapter_wildcards,
        indels=options.indels)

    try:
        adapters = adapter_parser.parse_multi(options.adapters, options.anywhere, options.front)
        adapters2 = adapter_parser.parse_multi(options.adapters2, options.anywhere2, options.front2)
    except IOError as e:
        if e.errno == errno.ENOENT:
            raise CommandlineError(e)
        raise
    except ValueError as e:
        raise CommandlineError(e)
    if options.debug:
        for adapter in adapters + adapters2:
            adapter.enable_debug()

    # Create the single-end processing pipeline (a list of "modifiers")
    modifiers = []
    if options.cut:
        if len(options.cut) > 2:
            raise CommandlineError("You cannot remove bases from more than two ends.")
        # Two values with the same sign would both cut from the same end
        if len(options.cut) == 2 and options.cut[0] * options.cut[1] > 0:
            raise CommandlineError("You cannot remove bases from the same end twice.")
        for cut in options.cut:
            if cut != 0:
                modifiers.append(UnconditionalCutter(cut))

    if options.nextseq_trim is not None:
        modifiers.append(NextseqQualityTrimmer(options.nextseq_trim, options.quality_base))

    if cutoffs:
        modifiers.append(QualityTrimmer(cutoffs[0], cutoffs[1], options.quality_base))
    if adapters:
        adapter_cutter = AdapterCutter(adapters, options.times,
            options.wildcard_file, options.info_file,
            rest_writer, options.action)
        modifiers.append(adapter_cutter)

    # Modifiers that apply to both reads of paired-end reads unless in legacy mode
    modifiers_both = []
    if options.length is not None:
        modifiers_both.append(Shortener(options.length))
    if options.trim_n:
        modifiers_both.append(NEndTrimmer())
    if options.length_tag:
        modifiers_both.append(LengthTagModifier(options.length_tag))
    if options.strip_f3:
        options.strip_suffix.append('_F3')
    for suffix in options.strip_suffix:
        modifiers_both.append(SuffixRemover(suffix))
    if options.prefix or options.suffix:
        modifiers_both.append(PrefixSuffixAdder(options.prefix, options.suffix))
    if options.double_encode:
        modifiers_both.append(DoubleEncoder())
    if options.zero_cap and reader.delivers_qualities:
        modifiers_both.append(ZeroCapper(quality_base=options.quality_base))
    if options.trim_primer:
        modifiers_both.append(PrimerTrimmer)
    modifiers.extend(modifiers_both)

    # For paired-end data, create a second processing pipeline.
    # However, if no second-read adapters were given (via -A/-G/-B/-U), we need to
    # be backwards compatible and *no modifications* are done to the second read.
    modifiers2 = []
    if paired == 'both':
        if options.cut2:
            if len(options.cut2) > 2:
                raise CommandlineError("You cannot remove bases from more than two ends.")
            if len(options.cut2) == 2 and options.cut2[0] * options.cut2[1] > 0:
                raise CommandlineError("You cannot remove bases from the same end twice.")
            for cut in options.cut2:
                if cut != 0:
                    modifiers2.append(UnconditionalCutter(cut))

        if cutoffs:
            modifiers2.append(QualityTrimmer(cutoffs[0], cutoffs[1], options.quality_base))
        if adapters2:
            adapter_cutter2 = AdapterCutter(adapters2, options.times,
                None, None, None, options.action)
            modifiers2.append(adapter_cutter2)
        modifiers2.extend(modifiers_both)

    if paired:
        pipeline = PairedEndPipeline(adapters, adapters2, reader, modifiers, modifiers2, filters)
    else:
        pipeline = SingleEndPipeline(adapters, adapters2, reader, modifiers, filters)

    # TODO the following should be done some other way
    pipeline.paired = paired
    pipeline.error_rate = options.error_rate
    pipeline.should_print_warning = paired == 'first' and (modifiers_both or cutoffs)
    # Each file is registered exactly once (entries may be None)
    for f in [writer, untrimmed_writer,
            options.rest_file, options.wildcard_file,
            options.info_file, too_short_writer, too_long_writer,
            demultiplexer]:
        pipeline.register_file_to_close(f)
    return pipeline
def main(cmdlineargs=None, default_outfile=sys.stdout):
    """
    Main function that evaluates command-line parameters and iterates
    over all reads.

    cmdlineargs is the list of command-line arguments; if it is None,
    sys.argv[1:] is used.

    default_outfile is the file to which trimmed reads are sent if the ``-o``
    parameter is not used.

    Invalid parameters are reported through parser.error(). An interrupted
    run, a broken pipe, a missing adapter file or a malformed input file
    terminate the program via sys.exit(1).
    """
    parser = get_option_parser()
    if cmdlineargs is None:
        cmdlineargs = sys.argv[1:]
    options, args = parser.parse_args(args=cmdlineargs)

    if len(args) == 0:
        parser.error("At least one parameter needed: name of a FASTA or FASTQ file.")
    elif len(args) > 2:
        parser.error("Too many parameters.")
    input_filename = args[0]

    # If a second file name was given, then we either have single-end reads
    # provided as a pair of .fasta/.qual files or we have paired-end reads.
    quality_filename = None
    input_paired_filename = None
    if len(args) == 2:
        if args[0].endswith('.qual'):
            parser.error("The QUAL file must be the second argument.")
        if args[1].endswith('.qual'):
            quality_filename = args[1]
        else:
            input_paired_filename = args[1]
            if not options.paired_output:
                parser.error('You must use --paired-output when trimming paired-end reads.')

    if len(args) == 1 and options.paired_output:
        parser.error("You specified a --paired-output file, but gave only one input file.")
    if options.paired_output and bool(options.untrimmed_output) != bool(options.untrimmed_paired_output):
        parser.error("When trimming paired-end reads, you must use either none "
            "or both of the --untrimmed-output/--untrimmed-paired-output options.")
    if options.untrimmed_paired_output and not options.paired_output:
        parser.error("Option --untrimmed-paired-output can only be used when "
            "trimming paired-end reads (with option --paired-output).")
    if input_filename.endswith('.qual'):
        parser.error("Need a FASTA file in addition to the QUAL file.")

    if options.format is not None and options.format.lower() not in ['fasta', 'fastq', 'sra-fastq']:
        parser.error("The input file format must be either 'fasta', 'fastq' or "
            "'sra-fastq' (not '{0}').".format(options.format))

    # TODO should this really be an error?
    if options.format is not None and quality_filename is not None:
        parser.error("If a pair of .fasta and .qual files is given, the -f/--format parameter cannot be used.")

    writers = []
    too_short_outfile = None  # too short reads go here
    too_short_filter = None
    # TODO pass file name to TooShortReadFilter, add a .close() method?
    if options.minimum_length > 0:
        if options.too_short_output:
            too_short_outfile = xopen(options.too_short_output, 'w')
        too_short_filter = TooShortReadFilter(options.minimum_length, too_short_outfile)
        writers.append(too_short_filter)
    too_long_outfile = None  # too long reads go here
    too_long_filter = None
    if options.maximum_length < sys.maxsize:
        if options.too_long_output is not None:
            too_long_outfile = xopen(options.too_long_output, 'w')
        too_long_filter = TooLongReadFilter(options.maximum_length, too_long_outfile)
        writers.append(too_long_filter)

    demultiplexer = None
    if options.output is not None and '{name}' in options.output:
        if options.discard_trimmed:
            parser.error("Do not use --discard-trimmed when demultiplexing.")
        if input_paired_filename:
            parser.error("Demultiplexing not supported for paired-end files, yet.")
        untrimmed = options.output.format(name='unknown')
        if options.untrimmed_output:
            untrimmed = options.untrimmed_output
        if options.discard_untrimmed:
            untrimmed = None
        demultiplexer = Demultiplexer(options.output, untrimmed)
        writers.append(demultiplexer)
        trimmed_outfile, untrimmed_outfile = None, None
        trimmed_paired_outfile, untrimmed_paired_outfile = None, None
    else:
        trimmed_outfile, untrimmed_outfile = trimmed_and_untrimmed_files(
            default_outfile,
            options.output,
            options.untrimmed_output,
            options.discard_trimmed,
            options.discard_untrimmed)

        trimmed_paired_outfile, untrimmed_paired_outfile = trimmed_and_untrimmed_files(
            None,  # applies when not trimming paired-end data
            options.paired_output,
            options.untrimmed_paired_output,
            options.discard_trimmed,
            options.discard_untrimmed)

        writer = ProcessedReadWriter(
            trimmed_outfile, trimmed_paired_outfile,
            untrimmed_outfile, untrimmed_paired_outfile)
        writers.append(writer)

    # --maq is a shortcut that switches on several colorspace-related options
    if options.maq:
        options.colorspace = True
        options.double_encode = True
        options.trim_primer = True
        options.strip_suffix.append('_F3')
        options.suffix = "/1"
    if options.zero_cap is None:
        options.zero_cap = options.colorspace
    if options.trim_primer and not options.colorspace:
        parser.error("Trimming the primer makes only sense in colorspace.")
    if options.double_encode and not options.colorspace:
        parser.error("Double-encoding makes only sense in colorspace.")
    if options.anywhere and options.colorspace:
        parser.error("Using --anywhere with colorspace reads is currently not supported (if you think this may be useful, contact the author).")
    if not (0 <= options.error_rate <= 1.):
        parser.error("The maximum error rate must be between 0 and 1.")
    if options.overlap < 1:
        parser.error("The overlap must be at least 1.")

    # The file-name options are replaced by the opened file objects
    if options.rest_file is not None:
        options.rest_file = xopen(options.rest_file, 'w')
        rest_writer = RestFileWriter(options.rest_file)
    else:
        rest_writer = None
    if options.info_file is not None:
        options.info_file = xopen(options.info_file, 'w')
    if options.wildcard_file is not None:
        options.wildcard_file = xopen(options.wildcard_file, 'w')

    if options.colorspace:
        if options.match_read_wildcards:
            parser.error('IUPAC wildcards not supported in colorspace')
        options.match_adapter_wildcards = False

    adapters = []
    ADAPTER_CLASS = ColorspaceAdapter if options.colorspace else Adapter
    try:
        for name, seq, where in gather_adapters(options.adapters, options.anywhere, options.front):
            if not seq:
                parser.error("The adapter sequence is empty")
            if not options.indels and where not in (PREFIX, SUFFIX):
                parser.error("Not allowing indels is currently supported only for anchored 5' and 3' adapters.")
            adapter = ADAPTER_CLASS(seq, where, options.error_rate,
                options.overlap, options.match_read_wildcards,
                options.match_adapter_wildcards, name=name, indels=options.indels)
            adapters.append(adapter)
    except IOError as e:
        if e.errno == errno.ENOENT:
            print("Error:", e, file=sys.stderr)
            sys.exit(1)
        raise

    # Refuse to run when there is nothing to do. options.cut is a list of
    # cut lengths, so it must be tested for emptiness; comparing it with 0
    # is never true and would disable this check.
    if not adapters and options.quality_cutoff == 0 and not options.cut and \
            options.minimum_length == 0 and options.maximum_length == sys.maxsize:
        parser.error("You need to provide at least one adapter sequence.")

    if input_paired_filename:
        reader = seqio.PairedSequenceReader(input_filename, input_paired_filename,
            colorspace=options.colorspace, fileformat=options.format)
    else:
        reader = read_sequences(input_filename, quality_filename,
            colorspace=options.colorspace, fileformat=options.format)

    # Create the processing pipeline as a list of "modifiers".
    modifiers = []
    if options.cut:
        if len(options.cut) > 2:
            parser.error("You cannot remove bases from more than two ends.")
        # Two values with the same sign would both cut from the same end
        if len(options.cut) == 2 and options.cut[0] * options.cut[1] > 0:
            parser.error("You cannot remove bases from the same end twice.")
        for cut in options.cut:
            if cut != 0:
                modifiers.append(UnconditionalCutter(cut))

    if options.quality_cutoff > 0:
        modifiers.append(QualityTrimmer(options.quality_cutoff, options.quality_base))
    if adapters:
        adapter_cutter = AdapterCutter(adapters, options.times,
            options.wildcard_file, options.info_file,
            rest_writer, options.action)
        modifiers.append(adapter_cutter)
    else:
        adapter_cutter = None
    if options.length_tag:
        modifiers.append(LengthTagModifier(options.length_tag))
    if options.strip_f3:
        options.strip_suffix.append('_F3')
    for suffix in options.strip_suffix:
        modifiers.append(SuffixRemover(suffix))
    if options.prefix or options.suffix:
        modifiers.append(PrefixSuffixAdder(options.prefix, options.suffix))
    if options.double_encode:
        modifiers.append(DoubleEncoder())
    if options.zero_cap and reader.delivers_qualities:
        modifiers.append(ZeroCapper(quality_base=options.quality_base))
    if options.trim_primer:
        modifiers.append(PrimerTrimmer)

    # time.clock() no longer exists on Python 3.8 and later; use
    # perf_counter() where available and fall back to clock() only on
    # interpreters that lack it.
    timer = getattr(time, 'perf_counter', None) or time.clock
    start_time = timer()
    try:
        if input_paired_filename:
            stats = process_paired_reads(reader, modifiers, writers)
        else:
            stats = process_single_reads(reader, modifiers, writers)
    except KeyboardInterrupt:
        print("Interrupted", file=sys.stderr)
        sys.exit(1)
    except IOError as e:
        # A closed pipe on the output side is not worth a traceback
        if e.errno == errno.EPIPE:
            sys.exit(1)
        raise
    except seqio.FormatError as e:
        print("Error:", e, file=sys.stderr)
        sys.exit(1)

    # close open files (but never the standard streams)
    for f in [trimmed_outfile, untrimmed_outfile, trimmed_paired_outfile,
            untrimmed_paired_outfile, options.rest_file, options.wildcard_file,
            options.info_file, too_short_outfile, too_long_outfile,
            demultiplexer]:
        if f is not None and f is not sys.stdin and f is not sys.stdout:
            f.close()

    if not options.quiet:
        # send statistics to stderr if result was sent to stdout
        stat_file = sys.stderr if options.output is None else None
        print_statistics(adapters, timer() - start_time, stats,
            options.action, adapter_cutter.reads_matched if adapter_cutter else 0,
            options.error_rate,
            too_short_filter.too_short if too_short_filter else 0,
            too_long_filter.too_long if too_long_filter else 0,
            cmdlineargs, file=stat_file)
def main(cmdlineargs=None, trimmed_outfile=sys.stdout):
    """
    Main function that evaluates command-line parameters and iterates
    over all reads.

    cmdlineargs is the list of command-line arguments; if it is None,
    sys.argv[1:] is used.

    trimmed_outfile is the default output file to which trimmed reads
    are sent. It can be overridden by using the '-o' parameter.

    Invalid parameters are reported through parser.error(). A broken pipe
    or a malformed input file terminates the program via sys.exit(1).
    """
    parser = get_option_parser()
    if cmdlineargs is None:
        cmdlineargs = sys.argv[1:]
    options, args = parser.parse_args(args=cmdlineargs)

    if len(args) == 0:
        parser.error("At least one parameter needed: name of a FASTA or FASTQ file.")
    elif len(args) > 2:
        parser.error("Too many parameters.")

    input_filename = args[0]
    quality_filename = None
    pe_filename = None
    if len(args) == 2:
        # Detect swapped FASTA/QUAL arguments here, using the raw second
        # argument: quality_filename is only ever set to a name ending in
        # '.qual' (and is None otherwise), so it cannot be used for this test.
        if input_filename.endswith('.qual') and args[1].endswith('fasta'):
            parser.error("FASTA and QUAL file given, but the FASTA file must be first.")
        if args[1].endswith('.qual'):
            quality_filename = args[1]
        else:
            pe_filename = args[1]
            if not options.paired_output:
                parser.error('you must use --paired-output when trimming paired-end reads')

    if len(args) == 1 and options.paired_output:
        parser.error("You specified a --paired-output file, but gave only one input file.")

    if options.format is not None and options.format.lower() not in ['fasta', 'fastq', 'sra-fastq']:
        parser.error("The input file format must be either 'fasta', 'fastq' or 'sra-fastq' (not '{0}').".format(options.format))

    # TODO should this really be an error?
    if options.format is not None and quality_filename is not None:
        parser.error("If a pair of .fasta and .qual files is given, the -f/--format parameter cannot be used.")

    # Read output files opened by this function; they are closed at the end.
    # A caller-supplied trimmed_outfile is never added here.
    opened_outfiles = []

    # default output files (overwritten below)
    too_short_outfile = None  # too short reads go here
    too_long_outfile = None  # too long reads go here
    pe_outfile = None
    if options.output is not None:
        trimmed_outfile = xopen(options.output, 'w')
        opened_outfiles.append(trimmed_outfile)
    untrimmed_outfile = trimmed_outfile  # reads without adapters go here
    if options.untrimmed_output is not None:
        untrimmed_outfile = xopen(options.untrimmed_output, 'w')
        opened_outfiles.append(untrimmed_outfile)
    if options.too_short_output is not None:
        too_short_outfile = xopen(options.too_short_output, 'w')
    if options.too_long_output is not None:
        too_long_outfile = xopen(options.too_long_output, 'w')
    if options.paired_output:
        pe_outfile = xopen(options.paired_output, 'w')
        opened_outfiles.append(pe_outfile)

    # --maq is a shortcut that switches on several color space options
    if options.maq:
        options.colorspace = True
        options.double_encode = True
        options.trim_primer = True
        options.strip_suffix.append('_F3')
        options.suffix = "/1"
        options.zero_cap = True
    if options.trim_primer and not options.colorspace:
        parser.error("Trimming the primer makes only sense in color space.")
    if options.double_encode and not options.colorspace:
        parser.error("Double-encoding makes only sense in color space.")
    if options.anywhere and options.colorspace:
        parser.error("Using --anywhere with color space reads is currently not supported (if you think this may be useful, contact the author).")
    if not (0 <= options.error_rate <= 1.):
        parser.error("The maximum error rate must be between 0 and 1.")
    if options.overlap < 1:
        parser.error("The overlap must be at least 1.")

    # The file-name options are replaced by the opened file objects
    if options.rest_file is not None:
        options.rest_file = xopen(options.rest_file, 'w')
        rest_writer = RestFileWriter(options.rest_file)
    else:
        rest_writer = None
    if options.info_file is not None:
        options.info_file = xopen(options.info_file, 'w')
    if options.wildcard_file is not None:
        options.wildcard_file = xopen(options.wildcard_file, 'w')

    adapters = []

    def parse_adapter_name(seq):
        """
        Parse an adapter given as 'name=adapt' into 'name' and 'adapt'.
        The name is None if no '=' is present.
        """
        fields = seq.split('=', 1)
        if len(fields) > 1:
            name, seq = fields
            name = name.strip()
        else:
            name = None
        seq = seq.strip()
        return name, seq

    ADAPTER_CLASS = ColorspaceAdapter if options.colorspace else Adapter

    def append_adapters(adapter_list, where):
        """
        Create an adapter of type 'where' for each entry of adapter_list
        and append it to the list of adapters.
        """
        for seq in adapter_list:
            name, seq = parse_adapter_name(seq)
            w = where
            # A leading '^' anchors a 5' adapter at the start of the read
            if w == FRONT and seq.startswith('^'):
                seq = seq[1:]
                w = PREFIX
            elif not options.indels:
                parser.error("Not allowing indels is currently supported only for anchored 5' adapters.")
            if not seq:
                parser.error("The adapter sequence is empty")
            adapter = ADAPTER_CLASS(seq, w, options.error_rate,
                options.overlap, options.match_read_wildcards,
                options.match_adapter_wildcards, name=name, indels=options.indels)
            adapters.append(adapter)

    append_adapters(options.adapters, BACK)
    append_adapters(options.anywhere, ANYWHERE)
    append_adapters(options.front, FRONT)

    # make sure these aren't used by accident
    del options.adapters
    del options.anywhere
    del options.front

    if not adapters and options.quality_cutoff == 0 and options.cut == 0:
        parser.error("You need to provide at least one adapter sequence.")

    # Create the processing pipeline as a list of "modifiers"
    modifiers = []
    if options.cut:
        modifiers.append(UnconditionalCutter(options.cut))
    if options.quality_cutoff > 0:
        modifiers.append(QualityTrimmer(options.quality_cutoff, options.quality_base))
    if adapters:
        adapter_cutter = RepeatedAdapterCutter(adapters, options.times,
            options.wildcard_file, options.info_file, options.trim,
            rest_writer, options.mask_adapter)
        modifiers.append(adapter_cutter)
    else:
        adapter_cutter = None
    if options.length_tag:
        modifiers.append(LengthTagModifier(options.length_tag))
    if options.strip_f3:
        options.strip_suffix.append('_F3')
    for suffix in options.strip_suffix:
        modifiers.append(SuffixRemover(suffix))
    if options.prefix or options.suffix:
        modifiers.append(PrefixSuffixAdder(options.prefix, options.suffix))
    if options.double_encode:
        modifiers.append(DoubleEncoder())
    if options.zero_cap:
        modifiers.append(ZeroCapper(quality_base=options.quality_base))
    if options.trim_primer:
        modifiers.append(PrimerTrimmer)

    readfilter = ReadFilter(options.minimum_length, options.maximum_length,
        too_short_outfile, too_long_outfile, options.discard_trimmed,
        options.discard_untrimmed)

    # time.clock() no longer exists on Python 3.8 and later; use
    # perf_counter() where available and fall back to clock() only on
    # interpreters that lack it.
    timer = getattr(time, 'perf_counter', None) or time.clock
    start_time = timer()
    try:
        reader = read_sequences(input_filename, quality_filename,
            colorspace=options.colorspace, fileformat=options.format)
        if pe_filename:
            pe_reader = read_sequences(pe_filename, None,
                colorspace=options.colorspace, fileformat=options.format)
        else:
            pe_reader = None
        stats = process_reads(reader, pe_reader, modifiers, readfilter,
            trimmed_outfile, untrimmed_outfile, pe_outfile)
    except IOError as e:
        # A closed pipe on the output side is not worth a traceback
        if e.errno == errno.EPIPE:
            sys.exit(1)
        raise
    except seqio.FormatError as e:
        print("Error:", e, file=sys.stderr)
        sys.exit(1)

    # close open files (but never the standard streams)
    for f in opened_outfiles + [options.rest_file, options.wildcard_file,
            options.info_file, too_short_outfile, too_long_outfile]:
        if f is not None and f is not sys.stdin and f is not sys.stdout:
            f.close()

    # send statistics to stderr if result was sent to stdout
    stat_file = sys.stderr if options.output is None else None
    print_statistics(adapters, timer() - start_time, stats,
        options.trim, adapter_cutter.reads_matched if adapter_cutter else 0,
        options.error_rate, readfilter.too_short, readfilter.too_long,
        cmdlineargs, file=stat_file)