def collect(back, anywhere, front):
    """Create one adapter object per adapter specification.

    The three arguments are forwarded unchanged to ``gather_adapters``, which
    yields ``(name, sequence, where)`` triples. Each triple is turned into an
    ``ADAPTER_CLASS`` instance configured from the surrounding ``options``.

    An empty sequence is reported through ``parser.error``. When
    ``options.debug`` is set, ``enable_debug()`` is called on every adapter
    that is created.

    Returns the list of created adapters, in the order they were gathered.
    """
    collected = []
    for label, sequence, position in gather_adapters(back, anywhere, front):
        if not sequence:
            parser.error("The adapter sequence is empty.")
        built = ADAPTER_CLASS(
            sequence,
            position,
            options.error_rate,
            options.overlap,
            options.match_read_wildcards,
            options.match_adapter_wildcards,
            name=label,
            indels=options.indels
        )
        if options.debug:
            built.enable_debug()
        collected += [built]
    return collected
def collect(back, anywhere, front):
    """Create one adapter object per adapter specification.

    The three arguments are forwarded unchanged to ``gather_adapters``, which
    yields ``(name, sequence, where)`` triples. Each triple is validated and
    then turned into an ``ADAPTER_CLASS`` instance configured from the
    surrounding ``options``.

    Two problems are reported through ``parser.error``:

    * an empty adapter sequence, and
    * indels being disabled (``options.indels`` false) for an adapter whose
      ``where`` is neither ``PREFIX`` nor ``SUFFIX``.

    Returns the list of created adapters, in the order they were gathered.
    """
    anchored_kinds = (PREFIX, SUFFIX)
    collected = []
    for label, sequence, position in gather_adapters(back, anywhere, front):
        if not sequence:
            parser.error("The adapter sequence is empty.")
        if position not in anchored_kinds and not options.indels:
            parser.error("Not allowing indels is currently supported only for anchored 5' and 3' adapters.")
        collected += [
            ADAPTER_CLASS(
                sequence,
                position,
                options.error_rate,
                options.overlap,
                options.match_read_wildcards,
                options.match_adapter_wildcards,
                name=label,
                indels=options.indels
            )
        ]
    return collected
def __init__(self, queue=None, results=None, adapter=None, phred64=False):
    """Set up the worker and its chain of read modifiers.

    queue, results -- stored on the instance unchanged.
    adapter -- selects what is appended after the initial QualityTrimmer:
        * ``None`` or ``'none'``: nothing is appended; ``self.adapter`` is
          set to None.
        * a string starting with ``'+'`` (e.g. ``'+5'``): it is converted
          with ``int()`` and handed to ``UnconditionalCutter``.
        * anything else: a comma-separated list of adapter specifications
          that is passed to ``gather_adapters``; one ``Adapter`` is created
          per result and all of them are wrapped in an ``AdapterCutter``.
    phred64 -- if true, ``self.phred`` is 64, otherwise 33.
    """
    super(Worker, self).__init__()
    self.queue = queue
    self.results = results
    self.phred = 64 if phred64 else 33
    # NOTE(review): the meaning of the literals 0 and 10 is defined by
    # QualityTrimmer, which is not visible here -- presumably quality cutoffs.
    self.modifiers = [QualityTrimmer(0, 10, self.phred)]
    self.adapters = []
    self.error_rate = 0.12
    self.min_length = 16

    # The default value None used to crash on ``adapter.startswith``;
    # it is now handled like the explicit 'none' setting.
    if adapter is None or adapter == 'none':
        self.adapter = None
    elif adapter.startswith('+'):
        self.modifiers.append(UnconditionalCutter(int(adapter)))
    else:
        for name, seq, where in gather_adapters(adapter.split(','), [], []):
            self.adapters.append(Adapter(seq, where, self.error_rate, name=name))
        adapter_cutter = AdapterCutter(self.adapters)
        self.modifiers.append(adapter_cutter)
def main(cmdlineargs=None, default_outfile=sys.stdout):
    """
    Main function that evaluates command-line parameters and iterates
    over all reads.

    cmdlineargs is the list of command-line arguments; if it is None,
    ``sys.argv[1:]`` is used.

    default_outfile is the file to which trimmed reads are sent if the ``-o``
    parameter is not used.

    Invalid parameter combinations are reported through ``parser.error``
    (assumed to terminate the program, as optparse/argparse parsers do).
    A missing adapter file, an interrupt, a broken pipe and a
    ``seqio.FormatError`` all end the program via ``sys.exit(1)``.
    """
    parser = get_option_parser()
    if cmdlineargs is None:
        cmdlineargs = sys.argv[1:]
    options, args = parser.parse_args(args=cmdlineargs)

    if len(args) == 0:
        parser.error("At least one parameter needed: name of a FASTA or FASTQ file.")
    elif len(args) > 2:
        parser.error("Too many parameters.")

    input_filename = args[0]

    # If a second file name was given, then we either have single-end reads
    # provided as a pair of .fasta/.qual files or we have paired-end reads.
    quality_filename = None
    input_paired_filename = None
    if len(args) == 2:
        if args[0].endswith(".qual"):
            parser.error("The QUAL file must be the second argument.")
        if args[1].endswith(".qual"):
            quality_filename = args[1]
        else:
            input_paired_filename = args[1]
            if not options.paired_output:
                parser.error("You must use --paired-output when trimming paired-end reads.")

    if len(args) == 1 and options.paired_output:
        parser.error("You specified a --paired-output file, but gave only one input file.")
    if options.paired_output and bool(options.untrimmed_output) != bool(options.untrimmed_paired_output):
        parser.error(
            "When trimming paired-end reads, you must use either none "
            "or both of the --untrimmed-output/--untrimmed-paired-output options."
        )
    if options.untrimmed_paired_output and not options.paired_output:
        parser.error(
            "Option --untrimmed-paired-output can only be used when "
            "trimming paired-end reads (with option --paired-output)."
        )
    if input_filename.endswith(".qual"):
        parser.error("Need a FASTA file in addition to the QUAL file.")
    if options.format is not None and options.format.lower() not in ["fasta", "fastq", "sra-fastq"]:
        parser.error(
            "The input file format must be either 'fasta', 'fastq' or "
            "'sra-fastq' (not '{0}').".format(options.format)
        )

    # TODO should this really be an error?
    if options.format is not None and quality_filename is not None:
        parser.error("If a pair of .fasta and .qual files is given, the -f/--format parameter cannot be used.")

    # "Writers" receive the processed reads; the length filters come first.
    writers = []
    too_short_outfile = None  # too short reads go here
    too_short_filter = None
    # TODO pass file name to TooShortReadFilter, add a .close() method?
    if options.minimum_length > 0:
        if options.too_short_output:
            too_short_outfile = xopen(options.too_short_output, "w")
        else:
            too_short_outfile = None
        too_short_filter = TooShortReadFilter(options.minimum_length, too_short_outfile)
        writers.append(too_short_filter)
    too_long_outfile = None  # too long reads go here
    too_long_filter = None
    if options.maximum_length < sys.maxsize:
        if options.too_long_output is not None:
            too_long_outfile = xopen(options.too_long_output, "w")
        else:
            too_long_outfile = None
        too_long_filter = TooLongReadFilter(options.maximum_length, too_long_outfile)
        writers.append(too_long_filter)

    # An output name containing "{name}" switches to demultiplexing.
    demultiplexer = None
    if options.output is not None and "{name}" in options.output:
        if options.discard_trimmed:
            parser.error("Do not use --discard-trimmed when demultiplexing.")
        if input_paired_filename:
            parser.error("Demultiplexing not supported for paired-end files, yet.")
        untrimmed = options.output.format(name="unknown")
        if options.untrimmed_output:
            untrimmed = options.untrimmed_output
        if options.discard_untrimmed:
            untrimmed = None
        demultiplexer = Demultiplexer(options.output, untrimmed)
        writers.append(demultiplexer)
        trimmed_outfile, untrimmed_outfile = None, None
        trimmed_paired_outfile, untrimmed_paired_outfile = None, None
    else:
        trimmed_outfile, untrimmed_outfile = trimmed_and_untrimmed_files(
            default_outfile,
            options.output,
            options.untrimmed_output,
            options.discard_trimmed,
            options.discard_untrimmed,
        )
        trimmed_paired_outfile, untrimmed_paired_outfile = trimmed_and_untrimmed_files(
            None,  # applies when not trimming paired-end data
            options.paired_output,
            options.untrimmed_paired_output,
            options.discard_trimmed,
            options.discard_untrimmed,
        )
        writer = ProcessedReadWriter(
            trimmed_outfile, trimmed_paired_outfile, untrimmed_outfile, untrimmed_paired_outfile
        )
        writers.append(writer)

    # --maq is a shortcut that switches on several colorspace-related options.
    if options.maq:
        options.colorspace = True
        options.double_encode = True
        options.trim_primer = True
        options.strip_suffix.append("_F3")
        options.suffix = "/1"
    if options.zero_cap is None:
        options.zero_cap = options.colorspace
    if options.trim_primer and not options.colorspace:
        parser.error("Trimming the primer makes only sense in colorspace.")
    if options.double_encode and not options.colorspace:
        parser.error("Double-encoding makes only sense in colorspace.")
    if options.anywhere and options.colorspace:
        parser.error(
            "Using --anywhere with colorspace reads is currently not supported (if you think this may be useful, contact the author)."
        )
    if not (0 <= options.error_rate <= 1.0):
        parser.error("The maximum error rate must be between 0 and 1.")
    if options.overlap < 1:
        parser.error("The overlap must be at least 1.")

    # The file-name options are replaced by the opened file objects so that
    # they can be passed on and closed at the end.
    if options.rest_file is not None:
        options.rest_file = xopen(options.rest_file, "w")
        rest_writer = RestFileWriter(options.rest_file)
    else:
        rest_writer = None
    if options.info_file is not None:
        options.info_file = xopen(options.info_file, "w")
    if options.wildcard_file is not None:
        options.wildcard_file = xopen(options.wildcard_file, "w")

    if options.colorspace:
        if options.match_read_wildcards:
            parser.error("IUPAC wildcards not supported in colorspace")
        options.match_adapter_wildcards = False

    adapters = []
    ADAPTER_CLASS = ColorspaceAdapter if options.colorspace else Adapter
    try:
        for name, seq, where in gather_adapters(options.adapters, options.anywhere, options.front):
            if not seq:
                parser.error("The adapter sequence is empty")
            if not options.indels and where not in (PREFIX, SUFFIX):
                parser.error("Not allowing indels is currently supported only for anchored 5' and 3' adapters.")
            adapter = ADAPTER_CLASS(
                seq,
                where,
                options.error_rate,
                options.overlap,
                options.match_read_wildcards,
                options.match_adapter_wildcards,
                name=name,
                indels=options.indels,
            )
            adapters.append(adapter)
    except IOError as e:
        if e.errno == errno.ENOENT:
            print("Error:", e, file=sys.stderr)
            sys.exit(1)
        raise

    # Refuse to run when there is nothing to do. options.cut is handled as a
    # list below (len(options.cut)), and a list never compares equal to 0, so
    # emptiness is tested with ``not`` (this also covers a plain 0).
    # A FASTA/QUAL pair is accepted without further options.
    if (
        not adapters
        and options.quality_cutoff == 0
        and not options.cut
        and options.minimum_length == 0
        and options.maximum_length == sys.maxsize
        and quality_filename is None
    ):
        parser.error("You need to provide at least one adapter sequence.")

    if input_paired_filename:
        reader = seqio.PairedSequenceReader(
            input_filename, input_paired_filename, colorspace=options.colorspace, fileformat=options.format
        )
    else:
        reader = read_sequences(
            input_filename, quality_filename, colorspace=options.colorspace, fileformat=options.format
        )

    # Create the processing pipeline as a list of "modifiers".
    modifiers = []
    if options.cut:
        if len(options.cut) > 2:
            parser.error("You cannot remove bases from more than two ends.")
        # Two values with the same sign have a positive product.
        if len(options.cut) == 2 and options.cut[0] * options.cut[1] > 0:
            parser.error("You cannot remove bases from the same end twice.")
        for cut in options.cut:
            if cut != 0:
                modifiers.append(UnconditionalCutter(cut))

    if options.quality_cutoff > 0:
        modifiers.append(QualityTrimmer(options.quality_cutoff, options.quality_base))
    if adapters:
        adapter_cutter = AdapterCutter(
            adapters, options.times, options.wildcard_file, options.info_file, rest_writer, options.action
        )
        modifiers.append(adapter_cutter)
    else:
        adapter_cutter = None
    if options.length_tag:
        modifiers.append(LengthTagModifier(options.length_tag))
    if options.strip_f3:
        options.strip_suffix.append("_F3")
    for suffix in options.strip_suffix:
        modifiers.append(SuffixRemover(suffix))
    if options.prefix or options.suffix:
        modifiers.append(PrefixSuffixAdder(options.prefix, options.suffix))
    if options.double_encode:
        modifiers.append(DoubleEncoder())
    if options.zero_cap and reader.delivers_qualities:
        modifiers.append(ZeroCapper(quality_base=options.quality_base))
    if options.trim_primer:
        # NOTE(review): PrimerTrimmer is appended without being called, unlike
        # the other modifiers -- confirm it is a plain function and not a class.
        modifiers.append(PrimerTrimmer)

    # time.clock() does not exist on Python >= 3.8; fall back to time.time().
    clock = getattr(time, "clock", time.time)
    start_time = clock()
    try:
        if input_paired_filename:
            stats = process_paired_reads(reader, modifiers, writers)
        else:
            stats = process_single_reads(reader, modifiers, writers)
    except KeyboardInterrupt:
        print("Interrupted", file=sys.stderr)
        sys.exit(1)
    except IOError as e:
        # A closed pipe on the output side is not worth a traceback.
        if e.errno == errno.EPIPE:
            sys.exit(1)
        raise
    except seqio.FormatError as e:
        print("Error:", e, file=sys.stderr)
        sys.exit(1)

    # close open files (each one listed once; the standard streams stay open)
    for f in [
        trimmed_outfile,
        untrimmed_outfile,
        trimmed_paired_outfile,
        untrimmed_paired_outfile,
        options.rest_file,
        options.wildcard_file,
        options.info_file,
        too_short_outfile,
        too_long_outfile,
        demultiplexer,
    ]:
        if f is not None and f is not sys.stdin and f is not sys.stdout:
            f.close()

    if not options.quiet:
        # send statistics to stderr if result was sent to stdout
        stat_file = sys.stderr if options.output is None else None
        print_statistics(
            adapters,
            clock() - start_time,
            stats,
            options.action,
            adapter_cutter.reads_matched if adapter_cutter else 0,
            options.error_rate,
            too_short_filter.too_short if too_short_filter else 0,
            too_long_filter.too_long if too_long_filter else 0,
            cmdlineargs,
            file=stat_file,
        )
def main(cmdlineargs=None, default_outfile=sys.stdout):
    """
    Main function that evaluates command-line parameters and iterates
    over all reads.

    cmdlineargs is the list of command-line arguments; if it is None,
    ``sys.argv[1:]`` is used.

    default_outfile is the file to which trimmed reads are sent if the ``-o``
    parameter is not used.

    Three modes are distinguished through the local variable ``paired``:
    False (single-end), 'first' (legacy paired-end: only the first read is
    modified) and 'both' (both reads are modified).

    Invalid parameter combinations are reported through ``parser.error``
    (assumed to terminate the program, as optparse/argparse parsers do).
    """
    logging.basicConfig(level=logging.INFO, format='%(message)s')  # %(levelname)s
    parser = get_option_parser()
    if cmdlineargs is None:
        cmdlineargs = sys.argv[1:]
    options, args = parser.parse_args(args=cmdlineargs)

    if len(args) == 0:
        parser.error("At least one parameter needed: name of a FASTA or FASTQ file.")
    elif len(args) > 2:
        parser.error("Too many parameters.")
    input_filename = args[0]

    # Find out which 'mode' we need to use.
    # Default: single-read trimming (neither -p nor -A/-G/-B/-U given)
    paired = False
    if options.paired_output:
        # Modify first read only, keep second in sync (-p given, but not -A/-G/-B/-U).
        # This exists for backwards compatibility ('legacy mode').
        paired = 'first'
    if options.adapters2 or options.front2 or options.anywhere2 or options.cut2:
        # Full paired-end trimming when both -p and -A/-G/-B/-U given
        # Also the read modifications (such as quality trimming) are applied
        # to second read.
        paired = 'both'

    if paired and len(args) == 1:
        parser.error("When paired-end trimming is enabled via -A/-G/-B/-U or -p, "
            "two input files are required.")

    # In paired mode the second argument is the second read file; otherwise
    # a second argument is taken to be the QUAL file.
    if paired:
        input_paired_filename = args[1]
        quality_filename = None
    else:
        input_paired_filename = None
        if len(args) == 2:
            if args[0].endswith('.qual'):
                parser.error("The QUAL file must be the second argument.")
            quality_filename = args[1]
        else:
            quality_filename = None

    if paired:
        if not options.paired_output:
            parser.error("When paired-end trimming is enabled via -A/-G/-B/-U, "
                "a second output file needs to be specified via -p (--paired-output).")
        if bool(options.untrimmed_output) != bool(options.untrimmed_paired_output):
            parser.error("When trimming paired-end reads, you must use either none "
                "or both of the --untrimmed-output/--untrimmed-paired-output options.")
    else:
        if options.untrimmed_paired_output:
            parser.error("Option --untrimmed-paired-output can only be used when "
                "trimming paired-end reads (with option -p).")
    if input_filename.endswith('.qual'):
        parser.error("Need a FASTA file in addition to the QUAL file.")
    if options.format is not None and quality_filename is not None:
        parser.error("If a pair of .fasta and .qual files is given, the -f/--format parameter cannot be used.")

    if options.format is not None and options.format.lower() not in ['fasta', 'fastq', 'sra-fastq']:
        parser.error("The input file format must be either 'fasta', 'fastq' or "
            "'sra-fastq' (not '{0}').".format(options.format))

    # The quality cutoff is given as "N" or as "M,N". A single value is
    # stored as [0, N], two values as [M, N].
    if options.quality_cutoff is not None:
        cutoffs = options.quality_cutoff.split(',')
        if len(cutoffs) == 1:
            try:
                cutoffs = [0, int(cutoffs[0])]
            except ValueError as e:
                parser.error("Quality cutoff value not recognized: {0}".format(e))
        elif len(cutoffs) == 2:
            try:
                cutoffs = [int(cutoffs[0]), int(cutoffs[1])]
            except ValueError as e:
                parser.error("Quality cutoff value not recognized: {0}".format(e))
        else:
            parser.error("Expected one value or two values separated by comma for the quality cutoff")
    else:
        cutoffs = None

    # "Writers" receive the processed reads; the filters come first.
    writers = []
    too_short_outfile = None  # too short reads go here
    too_short_filter = None
    # TODO pass file name to TooShortReadFilter, add a .close() method?
    if options.minimum_length > 0:
        if options.too_short_output:
            too_short_outfile = xopen(options.too_short_output, 'w')
        else:
            too_short_outfile = None
        too_short_filter = TooShortReadFilter(options.minimum_length,
            too_short_outfile, paired == 'both')
        writers.append(too_short_filter)
    too_long_outfile = None  # too long reads go here
    too_long_filter = None
    if options.maximum_length < sys.maxsize:
        if options.too_long_output is not None:
            too_long_outfile = xopen(options.too_long_output, 'w')
        else:
            too_long_outfile = None
        too_long_filter = TooLongReadFilter(options.maximum_length,
            too_long_outfile, check_second=paired == 'both')
        writers.append(too_long_filter)

    if options.max_n != -1:
        writers.append(NContentTrimmer(options.max_n, check_second=paired == 'both'))

    # An output name containing '{name}' switches to demultiplexing.
    demultiplexer = None
    if options.output is not None and '{name}' in options.output:
        if options.discard_trimmed:
            parser.error("Do not use --discard-trimmed when demultiplexing.")
        if paired:
            parser.error("Demultiplexing not supported for paired-end files, yet.")
        untrimmed = options.output.format(name='unknown')
        if options.untrimmed_output:
            untrimmed = options.untrimmed_output
        if options.discard_untrimmed:
            untrimmed = None
        demultiplexer = Demultiplexer(options.output, untrimmed)
        writers.append(demultiplexer)
        trimmed_outfile, untrimmed_outfile = None, None
        trimmed_paired_outfile, untrimmed_paired_outfile = None, None
    else:
        trimmed_outfile, untrimmed_outfile = trimmed_and_untrimmed_files(
            default_outfile,
            options.output,
            options.untrimmed_output,
            options.discard_trimmed,
            options.discard_untrimmed)
        trimmed_paired_outfile, untrimmed_paired_outfile = trimmed_and_untrimmed_files(
            None,  # applies when not trimming paired-end data
            options.paired_output,
            options.untrimmed_paired_output,
            options.discard_trimmed,
            options.discard_untrimmed)
        writer = ProcessedReadWriter(
            trimmed_outfile, trimmed_paired_outfile,
            untrimmed_outfile, untrimmed_paired_outfile,
            check_second=paired == 'both'
        )
        writers.append(writer)

    # --maq is a shortcut that switches on several colorspace-related options.
    if options.maq:
        options.colorspace = True
        options.double_encode = True
        options.trim_primer = True
        options.strip_suffix.append('_F3')
        options.suffix = "/1"
    if options.zero_cap is None:
        options.zero_cap = options.colorspace
    if options.trim_primer and not options.colorspace:
        parser.error("Trimming the primer makes only sense in colorspace.")
    if options.double_encode and not options.colorspace:
        parser.error("Double-encoding makes only sense in colorspace.")
    if options.anywhere and options.colorspace:
        parser.error("Using --anywhere with colorspace reads is currently not supported (if you think this may be useful, contact the author).")
    if not (0 <= options.error_rate <= 1.):
        parser.error("The maximum error rate must be between 0 and 1.")
    if options.overlap < 1:
        parser.error("The overlap must be at least 1.")

    # The file-name options are replaced by the opened file objects so that
    # they can be passed on and closed at the end.
    if options.rest_file is not None:
        options.rest_file = xopen(options.rest_file, 'w')
        rest_writer = RestFileWriter(options.rest_file)
    else:
        rest_writer = None
    if options.info_file is not None:
        options.info_file = xopen(options.info_file, 'w')
    if options.wildcard_file is not None:
        options.wildcard_file = xopen(options.wildcard_file, 'w')

    if options.colorspace:
        if options.match_read_wildcards:
            parser.error('IUPAC wildcards not supported in colorspace')
        options.match_adapter_wildcards = False

    ADAPTER_CLASS = ColorspaceAdapter if options.colorspace else Adapter

    def collect_adapters(back, anywhere, front):
        """Validate the given adapter specifications and build adapter objects."""
        collected = []
        for name, seq, where in gather_adapters(back, anywhere, front):
            if not seq:
                parser.error("The adapter sequence is empty.")
            # The same rule applies to both reads: without indels, only
            # PREFIX and SUFFIX adapters are accepted (the second-read loop
            # used to reject SUFFIX, contradicting its own error message).
            if not options.indels and where not in (PREFIX, SUFFIX):
                parser.error("Not allowing indels is currently supported only for anchored 5' and 3' adapters.")
            collected.append(ADAPTER_CLASS(seq, where, options.error_rate,
                options.overlap, options.match_read_wildcards,
                options.match_adapter_wildcards, name=name,
                indels=options.indels))
        return collected

    try:
        adapters = collect_adapters(options.adapters, options.anywhere, options.front)
        adapters2 = collect_adapters(options.adapters2, options.anywhere2, options.front2)
    except IOError as e:
        if e.errno == errno.ENOENT:
            parser.error(e)
        raise

    # Refuse to run when there is nothing to do.
    if (not adapters and not adapters2 and not cutoffs and
            options.cut == [] and options.cut2 == [] and
            options.minimum_length == 0 and
            options.maximum_length == sys.maxsize and
            quality_filename is None and
            options.max_n == -1):
        parser.error("You need to provide at least one adapter sequence.")

    try:
        reader = seqio.open(input_filename, file2=input_paired_filename,
                qualfile=quality_filename, colorspace=options.colorspace,
                fileformat=options.format)
    except (seqio.UnknownFileType, IOError) as e:
        parser.error(e)

    # Create the processing pipeline consisting of a list of "modifiers".
    modifiers = []
    if options.cut:
        if len(options.cut) > 2:
            parser.error("You cannot remove bases from more than two ends.")
        # Two values with the same sign have a positive product.
        if len(options.cut) == 2 and options.cut[0] * options.cut[1] > 0:
            parser.error("You cannot remove bases from the same end twice.")
        for cut in options.cut:
            if cut != 0:
                modifiers.append(UnconditionalCutter(cut))

    if cutoffs:
        modifiers.append(QualityTrimmer(cutoffs[0], cutoffs[1], options.quality_base))
    if adapters:
        adapter_cutter = AdapterCutter(adapters, options.times,
                options.wildcard_file, options.info_file,
                rest_writer, options.action)
        modifiers.append(adapter_cutter)
    else:
        adapter_cutter = None

    # Modifiers that apply to both reads of paired-end reads
    modifiers_both = []
    if options.trim_n:
        modifiers_both.append(NEndTrimmer())
    if options.length_tag:
        modifiers_both.append(LengthTagModifier(options.length_tag))
    if options.strip_f3:
        options.strip_suffix.append('_F3')
    for suffix in options.strip_suffix:
        modifiers_both.append(SuffixRemover(suffix))
    if options.prefix or options.suffix:
        modifiers_both.append(PrefixSuffixAdder(options.prefix, options.suffix))
    if options.double_encode:
        modifiers_both.append(DoubleEncoder())
    if options.zero_cap and reader.delivers_qualities:
        modifiers_both.append(ZeroCapper(quality_base=options.quality_base))
    if options.trim_primer:
        # NOTE(review): PrimerTrimmer is appended without being called, unlike
        # the other modifiers -- confirm it is a plain function and not a class.
        modifiers_both.append(PrimerTrimmer)
    modifiers.extend(modifiers_both)

    # For paired-end data, create a second processing pipeline.
    # However, if no second-read adapters were given (via -A/-G/-B/-U), we need to
    # be backwards compatible and *no modifications* are done to the second read.
    modifiers2 = []
    if paired == 'both':
        if options.cut2:
            if len(options.cut2) > 2:
                parser.error("You cannot remove bases from more than two ends.")
            if len(options.cut2) == 2 and options.cut2[0] * options.cut2[1] > 0:
                parser.error("You cannot remove bases from the same end twice.")
            for cut in options.cut2:
                if cut != 0:
                    modifiers2.append(UnconditionalCutter(cut))

        if cutoffs:
            modifiers2.append(QualityTrimmer(cutoffs[0], cutoffs[1], options.quality_base))
        if adapters2:
            adapter_cutter2 = AdapterCutter(adapters2, options.times,
                    None, None, None, options.action)
            modifiers2.append(adapter_cutter2)
        else:
            adapter_cutter2 = None
        modifiers2.extend(modifiers_both)

    # Due to backwards compatibility, from here on logging output needs to be
    # sent to standard output instead of standard error if the -o option is used.
    if options.output:
        logger.root.handlers = []
        logging.basicConfig(level=logging.INFO, format='%(message)s', stream=sys.stdout)
    logger.info("This is cutadapt %s with Python %s", __version__, platform.python_version())
    logger.info("Command line parameters: %s", " ".join(cmdlineargs))
    logger.info("Trimming %s adapter%s with at most %.1f%% errors in %s mode ...",
        len(adapters) + len(adapters2), 's' if len(adapters) + len(adapters2) != 1 else '',
        options.error_rate * 100,
        {False: 'single-end', 'first': 'paired-end legacy', 'both': 'paired-end'}[paired])

    # time.clock() does not exist on Python >= 3.8; fall back to time.time().
    clock = getattr(time, 'clock', time.time)
    start_time = clock()
    try:
        if paired:
            stats = process_paired_reads(reader, modifiers, modifiers2, writers)
        else:
            stats = process_single_reads(reader, modifiers, writers)
    except KeyboardInterrupt:
        print("Interrupted", file=sys.stderr)
        sys.exit(130)
    except IOError as e:
        # A closed pipe on the output side is not worth a traceback.
        if e.errno == errno.EPIPE:
            sys.exit(1)
        raise
    except (seqio.FormatError, EOFError) as e:
        sys.exit("cutadapt: error: {0}".format(e))

    # close open files (each one listed once; the standard streams stay open)
    for f in [trimmed_outfile, untrimmed_outfile, trimmed_paired_outfile,
            untrimmed_paired_outfile, options.rest_file, options.wildcard_file,
            options.info_file, too_short_outfile, too_long_outfile,
            demultiplexer]:
        if f is not None and f is not sys.stdin and f is not sys.stdout:
            f.close()

    elapsed_time = clock() - start_time
    if not options.quiet:
        stats.collect((adapters, adapters2), elapsed_time,
            modifiers, modifiers2, writers)
        # send statistics to stderr if result was sent to stdout
        stat_file = sys.stderr if options.output is None else None
        with redirect_standard_output(stat_file):
            print_report(stats, (adapters, adapters2))
def parse_adapters(adapter, error_rate=None):
    """Turn a comma-separated adapter string into a list of Adapter objects.

    adapter -- string that is split at commas; the pieces are passed to
        ``gather_adapters`` as its first argument (the other two arguments
        are empty lists).
    error_rate -- forwarded unchanged to every ``Adapter`` that is created.

    Returns one ``Adapter`` per ``(name, seq, where)`` triple produced by
    ``gather_adapters``, in the same order.
    """
    specifications = adapter.split(',')
    return [
        Adapter(sequence, position, error_rate, name=label)
        for label, sequence, position in gather_adapters(specifications, [], [])
    ]
def main(cmdlineargs=None, default_outfile=sys.stdout):
    """
    Main function that evaluates command-line parameters and iterates
    over all reads.

    cmdlineargs is the list of command-line arguments; if it is None,
    ``sys.argv[1:]`` is used.

    default_outfile is the file to which trimmed reads are sent if the ``-o``
    parameter is not used.

    Invalid parameter combinations are reported through ``parser.error``
    (assumed to terminate the program, as optparse/argparse parsers do).
    A missing adapter file, an interrupt, a broken pipe and a
    ``seqio.FormatError`` all end the program via ``sys.exit(1)``.
    """
    parser = get_option_parser()
    if cmdlineargs is None:
        cmdlineargs = sys.argv[1:]
    options, args = parser.parse_args(args=cmdlineargs)

    if len(args) == 0:
        parser.error(
            "At least one parameter needed: name of a FASTA or FASTQ file.")
    elif len(args) > 2:
        parser.error("Too many parameters.")

    input_filename = args[0]

    # If a second file name was given, then we either have single-end reads
    # provided as a pair of .fasta/.qual files or we have paired-end reads.
    quality_filename = None
    input_paired_filename = None
    if len(args) == 2:
        if args[0].endswith('.qual'):
            parser.error("The QUAL file must be the second argument.")
        if args[1].endswith('.qual'):
            quality_filename = args[1]
        else:
            input_paired_filename = args[1]
            if not options.paired_output:
                parser.error(
                    'You must use --paired-output when trimming paired-end reads.'
                )

    if len(args) == 1 and options.paired_output:
        parser.error(
            "You specified a --paired-output file, but gave only one input file."
        )
    if options.paired_output and bool(options.untrimmed_output) != bool(
            options.untrimmed_paired_output):
        parser.error(
            "When trimming paired-end reads, you must use either none "
            "or both of the --untrimmed-output/--untrimmed-paired-output options."
        )
    if options.untrimmed_paired_output and not options.paired_output:
        parser.error(
            "Option --untrimmed-paired-output can only be used when "
            "trimming paired-end reads (with option --paired-output).")
    if input_filename.endswith('.qual'):
        parser.error("Need a FASTA file in addition to the QUAL file.")
    if options.format is not None and options.format.lower() not in [
            'fasta', 'fastq', 'sra-fastq'
    ]:
        parser.error(
            "The input file format must be either 'fasta', 'fastq' or "
            "'sra-fastq' (not '{0}').".format(options.format))

    # TODO should this really be an error?
    if options.format is not None and quality_filename is not None:
        parser.error(
            "If a pair of .fasta and .qual files is given, the -f/--format parameter cannot be used."
        )

    # "Writers" receive the processed reads; the length filters come first.
    writers = []
    too_short_outfile = None  # too short reads go here
    too_short_filter = None
    # TODO pass file name to TooShortReadFilter, add a .close() method?
    if options.minimum_length > 0:
        if options.too_short_output:
            too_short_outfile = xopen(options.too_short_output, 'w')
        else:
            too_short_outfile = None
        too_short_filter = TooShortReadFilter(options.minimum_length,
                                              too_short_outfile)
        writers.append(too_short_filter)
    too_long_outfile = None  # too long reads go here
    too_long_filter = None
    if options.maximum_length < sys.maxsize:
        if options.too_long_output is not None:
            too_long_outfile = xopen(options.too_long_output, 'w')
        else:
            too_long_outfile = None
        too_long_filter = TooLongReadFilter(options.maximum_length,
                                            too_long_outfile)
        writers.append(too_long_filter)

    # An output name containing '{name}' switches to demultiplexing.
    demultiplexer = None
    if options.output is not None and '{name}' in options.output:
        if options.discard_trimmed:
            parser.error("Do not use --discard-trimmed when demultiplexing.")
        if input_paired_filename:
            parser.error(
                "Demultiplexing not supported for paired-end files, yet.")
        untrimmed = options.output.format(name='unknown')
        if options.untrimmed_output:
            untrimmed = options.untrimmed_output
        if options.discard_untrimmed:
            untrimmed = None
        demultiplexer = Demultiplexer(options.output, untrimmed)
        writers.append(demultiplexer)
        trimmed_outfile, untrimmed_outfile = None, None
        trimmed_paired_outfile, untrimmed_paired_outfile = None, None
    else:
        trimmed_outfile, untrimmed_outfile = trimmed_and_untrimmed_files(
            default_outfile, options.output, options.untrimmed_output,
            options.discard_trimmed, options.discard_untrimmed)
        trimmed_paired_outfile, untrimmed_paired_outfile = trimmed_and_untrimmed_files(
            None,  # applies when not trimming paired-end data
            options.paired_output,
            options.untrimmed_paired_output,
            options.discard_trimmed,
            options.discard_untrimmed)
        writer = ProcessedReadWriter(trimmed_outfile, trimmed_paired_outfile,
                                     untrimmed_outfile,
                                     untrimmed_paired_outfile)
        writers.append(writer)

    # --maq is a shortcut that switches on several colorspace-related options.
    if options.maq:
        options.colorspace = True
        options.double_encode = True
        options.trim_primer = True
        options.strip_suffix.append('_F3')
        options.suffix = "/1"
    if options.zero_cap is None:
        options.zero_cap = options.colorspace
    if options.trim_primer and not options.colorspace:
        parser.error("Trimming the primer makes only sense in colorspace.")
    if options.double_encode and not options.colorspace:
        parser.error("Double-encoding makes only sense in colorspace.")
    if options.anywhere and options.colorspace:
        parser.error(
            "Using --anywhere with colorspace reads is currently not supported (if you think this may be useful, contact the author)."
        )
    if not (0 <= options.error_rate <= 1.):
        parser.error("The maximum error rate must be between 0 and 1.")
    if options.overlap < 1:
        parser.error("The overlap must be at least 1.")

    # The file-name options are replaced by the opened file objects so that
    # they can be passed on and closed at the end.
    if options.rest_file is not None:
        options.rest_file = xopen(options.rest_file, 'w')
        rest_writer = RestFileWriter(options.rest_file)
    else:
        rest_writer = None
    if options.info_file is not None:
        options.info_file = xopen(options.info_file, 'w')
    if options.wildcard_file is not None:
        options.wildcard_file = xopen(options.wildcard_file, 'w')

    if options.colorspace:
        if options.match_read_wildcards:
            parser.error('IUPAC wildcards not supported in colorspace')
        options.match_adapter_wildcards = False

    adapters = []
    ADAPTER_CLASS = ColorspaceAdapter if options.colorspace else Adapter
    try:
        for name, seq, where in gather_adapters(options.adapters,
                                                options.anywhere,
                                                options.front):
            if not seq:
                parser.error("The adapter sequence is empty")
            if not options.indels and where not in (PREFIX, SUFFIX):
                parser.error(
                    "Not allowing indels is currently supported only for anchored 5' and 3' adapters."
                )
            adapter = ADAPTER_CLASS(seq,
                                    where,
                                    options.error_rate,
                                    options.overlap,
                                    options.match_read_wildcards,
                                    options.match_adapter_wildcards,
                                    name=name,
                                    indels=options.indels)
            adapters.append(adapter)
    except IOError as e:
        if e.errno == errno.ENOENT:
            print("Error:", e, file=sys.stderr)
            sys.exit(1)
        raise

    # Refuse to run when there is nothing to do. options.cut is handled as a
    # list below (len(options.cut)), and a list never compares equal to 0, so
    # emptiness is tested with ``not`` (this also covers a plain 0).
    # A FASTA/QUAL pair is accepted without further options.
    if (not adapters and options.quality_cutoff == 0 and not options.cut
            and options.minimum_length == 0
            and options.maximum_length == sys.maxsize
            and quality_filename is None):
        parser.error("You need to provide at least one adapter sequence.")

    if input_paired_filename:
        reader = seqio.PairedSequenceReader(input_filename,
                                            input_paired_filename,
                                            colorspace=options.colorspace,
                                            fileformat=options.format)
    else:
        reader = read_sequences(input_filename,
                                quality_filename,
                                colorspace=options.colorspace,
                                fileformat=options.format)

    # Create the processing pipeline as a list of "modifiers".
    modifiers = []
    if options.cut:
        if len(options.cut) > 2:
            parser.error("You cannot remove bases from more than two ends.")
        # Two values with the same sign have a positive product.
        if len(options.cut) == 2 and options.cut[0] * options.cut[1] > 0:
            parser.error("You cannot remove bases from the same end twice.")
        for cut in options.cut:
            if cut != 0:
                modifiers.append(UnconditionalCutter(cut))

    if options.quality_cutoff > 0:
        modifiers.append(
            QualityTrimmer(options.quality_cutoff, options.quality_base))
    if adapters:
        adapter_cutter = AdapterCutter(adapters, options.times,
                                       options.wildcard_file,
                                       options.info_file, rest_writer,
                                       options.action)
        modifiers.append(adapter_cutter)
    else:
        adapter_cutter = None
    if options.length_tag:
        modifiers.append(LengthTagModifier(options.length_tag))
    if options.strip_f3:
        options.strip_suffix.append('_F3')
    for suffix in options.strip_suffix:
        modifiers.append(SuffixRemover(suffix))
    if options.prefix or options.suffix:
        modifiers.append(PrefixSuffixAdder(options.prefix, options.suffix))
    if options.double_encode:
        modifiers.append(DoubleEncoder())
    if options.zero_cap and reader.delivers_qualities:
        modifiers.append(ZeroCapper(quality_base=options.quality_base))
    if options.trim_primer:
        # NOTE(review): PrimerTrimmer is appended without being called, unlike
        # the other modifiers -- confirm it is a plain function and not a class.
        modifiers.append(PrimerTrimmer)

    # time.clock() does not exist on Python >= 3.8; fall back to time.time().
    clock = getattr(time, 'clock', time.time)
    start_time = clock()
    try:
        if input_paired_filename:
            stats = process_paired_reads(reader, modifiers, writers)
        else:
            stats = process_single_reads(reader, modifiers, writers)
    except KeyboardInterrupt:
        print("Interrupted", file=sys.stderr)
        sys.exit(1)
    except IOError as e:
        # A closed pipe on the output side is not worth a traceback.
        if e.errno == errno.EPIPE:
            sys.exit(1)
        raise
    except seqio.FormatError as e:
        print("Error:", e, file=sys.stderr)
        sys.exit(1)

    # close open files (each one listed once; the standard streams stay open)
    for f in [
            trimmed_outfile, untrimmed_outfile, trimmed_paired_outfile,
            untrimmed_paired_outfile, options.rest_file,
            options.wildcard_file, options.info_file, too_short_outfile,
            too_long_outfile, demultiplexer
    ]:
        if f is not None and f is not sys.stdin and f is not sys.stdout:
            f.close()

    if not options.quiet:
        # send statistics to stderr if result was sent to stdout
        stat_file = sys.stderr if options.output is None else None
        print_statistics(
            adapters,
            clock() - start_time,
            stats,
            options.action,
            adapter_cutter.reads_matched if adapter_cutter else 0,
            options.error_rate,
            too_short_filter.too_short if too_short_filter else 0,
            too_long_filter.too_long if too_long_filter else 0,
            cmdlineargs,
            file=stat_file)