def pipeline_from_parsed_args(options, paired, pair_filter_mode, quality_filename, is_interleaved_output):
    """
    Validate the parsed command-line options and build the processing pipeline.

    Several attributes of *options* are normalized in place along the way:
    --maq switches on colorspace, double_encode, trim_primer and strip_f3 and
    sets suffix to "/1"; an unset zero_cap takes the value of colorspace; the
    action 'none' becomes None; match_adapter_wildcards is turned off for
    colorspace input; and strip_suffix is rebound to a new list that includes
    '_F3' when the F3 suffix is to be stripped.

    Parameters:
        options: the parsed command-line options.
        paired: falsy for single-end data. Any truthy value selects a
            PairedEndPipeline; the value 'first' is forwarded as
            modify_first_read_only=True.
        pair_filter_mode: forwarded unchanged to PairedEndPipeline.
        quality_filename: for single-end data, a value other than None makes
            the -f/--format option an error.
        is_interleaved_output: when true, the -o/-p output options are not
            required for paired-end data.

    Returns:
        A SingleEndPipeline or PairedEndPipeline with modifiers and filter
        settings applied.

    Raises:
        CommandLineError: if the options are inconsistent or out of range, or
            if an adapter specification cannot be parsed.
    """
    def check_cut_lengths(lengths):
        # Shared by the first-read and second-read cut options: at most two
        # values, and two values must not have the same sign.
        if len(lengths) > 2:
            raise CommandLineError("You cannot remove bases from more than two ends.")
        if len(lengths) == 2 and lengths[0] * lengths[1] > 0:
            raise CommandLineError("You cannot remove bases from the same end twice.")

    if paired:
        if not is_interleaved_output:
            if not options.paired_output:
                raise CommandLineError("When paired-end trimming is enabled via -A/-G/-B/-U, "
                    "a second output file needs to be specified via -p (--paired-output).")
            if not options.output:
                raise CommandLineError("When you use -p or --paired-output, you must also "
                    "use the -o option.")
        if bool(options.untrimmed_output) != bool(options.untrimmed_paired_output):
            raise CommandLineError("When trimming paired-end reads, you must use either none "
                "or both of the --untrimmed-output/--untrimmed-paired-output options.")
        if options.too_short_output and not options.too_short_paired_output:
            raise CommandLineError("When using --too-short-output with paired-end "
                "reads, you also need to use --too-short-paired-output")
        if options.too_long_output and not options.too_long_paired_output:
            raise CommandLineError("When using --too-long-output with paired-end "
                "reads, you also need to use --too-long-paired-output")
    else:
        if options.untrimmed_paired_output:
            raise CommandLineError("Option --untrimmed-paired-output can only be used when "
                "trimming paired-end reads (with option -p).")
        if quality_filename is not None and options.format is not None:
            raise CommandLineError('If a pair of .fasta and .qual files is given, the -f/--format '
                'parameter cannot be used.')

    if options.format is not None and options.format.lower() not in ['fasta', 'fastq', 'sra-fastq']:
        raise CommandLineError("The input file format must be either 'fasta', 'fastq' or "
            "'sra-fastq' (not '{0}').".format(options.format))

    if options.maq:
        options.colorspace = True
        options.double_encode = True
        options.trim_primer = True
        # Request F3 stripping through the flag instead of appending '_F3'
        # right here; otherwise combining --maq with --strip-f3 would install
        # two identical suffix removers.
        options.strip_f3 = True
        options.suffix = "/1"
    if options.zero_cap is None:
        options.zero_cap = options.colorspace
    if options.trim_primer and not options.colorspace:
        raise CommandLineError("Trimming the primer makes only sense in colorspace.")
    if options.double_encode and not options.colorspace:
        raise CommandLineError("Double-encoding makes only sense in colorspace.")
    if options.anywhere and options.colorspace:
        raise CommandLineError("Using --anywhere with colorspace reads is currently not supported "
            "(if you think this may be useful, contact the author).")
    if not (0 <= options.error_rate <= 1.):
        raise CommandLineError("The maximum error rate must be between 0 and 1.")
    if options.overlap < 1:
        raise CommandLineError("The overlap must be at least 1.")
    if not (0 <= options.gc_content <= 100):
        raise CommandLineError("GC content must be given as percentage between 0 and 100")
    if options.action == 'none':
        options.action = None

    if options.colorspace:
        if options.match_read_wildcards:
            raise CommandLineError('IUPAC wildcards not supported in colorspace')
        options.match_adapter_wildcards = False

    adapter_parser = AdapterParser(
        colorspace=options.colorspace,
        max_error_rate=options.error_rate,
        min_overlap=options.overlap,
        read_wildcards=options.match_read_wildcards,
        adapter_wildcards=options.match_adapter_wildcards,
        indels=options.indels)

    try:
        adapters = adapter_parser.parse_multi(options.adapters, options.anywhere, options.front)
        adapters2 = adapter_parser.parse_multi(options.adapters2, options.anywhere2, options.front2)
    except IOError as e:
        # Only a missing file is reported as a usage error; any other I/O
        # failure propagates unchanged.
        if e.errno == errno.ENOENT:
            raise CommandLineError(e)
        raise
    except ValueError as e:
        raise CommandLineError(e)
    if options.debug:
        for adapter in adapters + adapters2:
            adapter.enable_debug()

    # Create the processing pipeline.
    # If no second-read adapters were given (via -A/-G/-B/-U), we need to
    # be backwards compatible and *no modifications* are done to the second read.
    if paired:
        pipeline = PairedEndPipeline(pair_filter_mode, modify_first_read_only=paired == 'first')
    else:
        pipeline = SingleEndPipeline()

    if options.cut:
        check_cut_lengths(options.cut)
        for cut in options.cut:
            if cut != 0:
                pipeline.add1(UnconditionalCutter(cut))

    if options.cut2:
        check_cut_lengths(options.cut2)
        for cut in options.cut2:
            if cut != 0:
                pipeline.add2(UnconditionalCutter(cut))

    if options.nextseq_trim is not None:
        pipeline.add(NextseqQualityTrimmer(options.nextseq_trim, options.quality_base))
    if options.quality_cutoff is not None:
        cutoffs = parse_cutoffs(options.quality_cutoff)
        pipeline.add(QualityTrimmer(cutoffs[0], cutoffs[1], options.quality_base))
    if adapters:
        pipeline.add1(AdapterCutter(adapters, options.times, options.action))
    if adapters2:
        pipeline.add2(AdapterCutter(adapters2, options.times, options.action))

    # Modifiers that apply to both reads of paired-end reads unless in legacy mode
    if options.length is not None:
        pipeline.add(Shortener(options.length))
    if options.trim_n:
        pipeline.add(NEndTrimmer())
    if options.length_tag:
        pipeline.add(LengthTagModifier(options.length_tag))
    if options.strip_f3:
        # Rebind to a fresh list rather than appending in place, so that a
        # list object owned by the option parser is never modified.
        # NOTE(review): whether the parser shares one default list between
        # invocations is not visible from here.
        options.strip_suffix = list(options.strip_suffix) + ['_F3']
    for suffix in options.strip_suffix:
        pipeline.add(SuffixRemover(suffix))
    if options.prefix or options.suffix:
        pipeline.add(PrefixSuffixAdder(options.prefix, options.suffix))
    if options.double_encode:
        pipeline.add(DoubleEncoder())
    if options.zero_cap:
        pipeline.add(ZeroCapper(quality_base=options.quality_base))
    if options.trim_primer:
        pipeline.add(PrimerTrimmer())

    # Set filtering parameters
    # Minimum/maximum length
    for attr in 'minimum_length', 'maximum_length':
        param = getattr(options, attr)
        if param is not None:
            lengths = parse_lengths(param)
            if not paired and len(lengths) == 2:
                raise CommandLineError('Two minimum or maximum lengths given for single-end data')
            if paired and len(lengths) == 1:
                # A single value applies to both reads of a pair.
                lengths = (lengths[0], lengths[0])
            setattr(pipeline, attr, lengths)
    pipeline.max_n = options.max_n
    pipeline.discard_casava = options.discard_casava
    pipeline.discard_trimmed = options.discard_trimmed
    pipeline.discard_untrimmed = options.discard_untrimmed
    return pipeline
def pipeline_from_parsed_args(args, paired, file_opener, adapters, adapters2) -> Pipeline:
    """
    Build the processing pipeline described by the parsed arguments.

    Parameters:
        args: the parsed command-line arguments. The action 'none' is
            replaced by None in place.
        paired: truthy to build a PairedEndPipeline, falsy for a
            SingleEndPipeline.
        file_opener: handed to the pipeline constructor.
        adapters: adapters for the first read.
        adapters2: adapters for the second read.

    Returns:
        The configured SingleEndPipeline or PairedEndPipeline.

    Raises:
        CommandLineError: if two minimum or maximum lengths are given for
            single-end data.
    """
    if args.action == 'none':
        args.action = None

    # Instantiate the pipeline that matches the kind of input
    if not paired:
        pipeline = SingleEndPipeline(file_opener)  # type: Any
    else:
        filter_mode = args.pair_filter
        if filter_mode is None:
            filter_mode = 'any'
        pipeline = PairedEndPipeline(filter_mode, file_opener)

    # With adapters for only one of R1/R2, the default pair filter mode
    # 'any' would classify every pair as untrimmed, so it is overridden
    # whenever untrimmed pairs are discarded or redirected.
    if isinstance(pipeline, PairedEndPipeline):
        if not (adapters2 and adapters):
            if args.discard_untrimmed or args.untrimmed_output or args.untrimmed_paired_output:
                pipeline.override_untrimmed_pair_filter = True

    add_unconditional_cutters(pipeline, args.cut, args.cut2, paired)

    # Registers a modifier for every read of a record
    if paired:
        add_to_all_reads = pipeline.add_both
    else:
        add_to_all_reads = pipeline.add

    if args.nextseq_trim is not None:
        nextseq_trimmer = NextseqQualityTrimmer(args.nextseq_trim, args.quality_base)
        add_to_all_reads(nextseq_trimmer)
    if args.quality_cutoff is not None:
        quality_limits = parse_cutoffs(args.quality_cutoff)
        quality_trimmer = QualityTrimmer(
            quality_limits[0], quality_limits[1], args.quality_base)
        add_to_all_reads(quality_trimmer)

    add_adapter_cutter(
        pipeline, adapters, adapters2, paired, args.pair_adapters,
        args.action, args.times, args.reverse_complement, args.index)

    for common_modifier in modifiers_applying_to_both_ends_if_paired(args):
        add_to_all_reads(common_modifier)

    # Filtering: length limits first, then the remaining settings
    for limit_name in ('minimum_length', 'maximum_length'):
        raw_limit = getattr(args, limit_name)
        if raw_limit is None:
            continue
        limits = parse_lengths(raw_limit)
        if paired:
            if len(limits) == 1:
                # One value is used for both reads of a pair
                limits = (limits[0], limits[0])
        elif len(limits) == 2:
            raise CommandLineError(
                'Two minimum or maximum lengths given for single-end data')
        setattr(pipeline, limit_name, limits)

    pipeline.max_n = args.max_n
    pipeline.max_expected_errors = args.max_expected_errors
    pipeline.discard_casava = args.discard_casava
    pipeline.discard_trimmed = args.discard_trimmed
    pipeline.discard_untrimmed = args.discard_untrimmed
    return pipeline
def pipeline_from_parsed_args(args, paired, is_interleaved_output):
    """
    Turn parsed command-line arguments into a ready-to-use pipeline.

    check_arguments() is called first with the same three arguments. The
    adapter specifications are then parsed, and the pipeline is created and
    populated with modifiers and filter settings.

    Parameters:
        args: the parsed command-line arguments. The action 'none' is
            replaced by None in place.
        paired: truthy to build a PairedEndPipeline, falsy for a
            SingleEndPipeline.
        is_interleaved_output: only forwarded to check_arguments().

    Returns:
        The configured SingleEndPipeline or PairedEndPipeline.

    Raises:
        CommandLineError: if an adapter specification cannot be parsed or
            refers to a missing file, if the paired adapter cutter rejects
            its adapters, or if two minimum or maximum lengths are given for
            single-end data.
    """
    check_arguments(args, paired, is_interleaved_output)
    if args.action == 'none':
        args.action = None

    parser_settings = dict(
        max_error_rate=args.error_rate,
        min_overlap=args.overlap,
        read_wildcards=args.match_read_wildcards,
        adapter_wildcards=args.match_adapter_wildcards,
        indels=args.indels,
    )
    adapter_parser = AdapterParser(**parser_settings)
    try:
        adapters = adapter_parser.parse_multi(args.adapters)
        adapters2 = adapter_parser.parse_multi(args.adapters2)
    except (FileNotFoundError, ValueError) as e:
        raise CommandLineError(e)
    for parsed in (adapters, adapters2):
        warn_duplicate_adapters(parsed)
    if args.debug:
        for adapter in adapters + adapters2:
            adapter.enable_debug()

    # Instantiate the pipeline that matches the kind of input
    if not paired:
        pipeline = SingleEndPipeline()
    else:
        filter_mode = args.pair_filter
        if filter_mode is None:
            filter_mode = 'any'
        pipeline = PairedEndPipeline(filter_mode)

    # With adapters for only one of R1/R2, the default pair filter mode
    # 'any' would classify every pair as untrimmed, so it is overridden
    # whenever untrimmed pairs are discarded or redirected.
    if paired and not (adapters2 and adapters):
        if args.discard_untrimmed or args.untrimmed_output or args.untrimmed_paired_output:
            pipeline.override_untrimmed_pair_filter = True

    add_unconditional_cutters(pipeline, args.cut, args.cut2, paired)

    # Registers a modifier for every read of a record
    if paired:
        add_to_all_reads = pipeline.add_both
    else:
        add_to_all_reads = pipeline.add

    if args.nextseq_trim is not None:
        nextseq_trimmer = NextseqQualityTrimmer(args.nextseq_trim, args.quality_base)
        add_to_all_reads(nextseq_trimmer)
    if args.quality_cutoff is not None:
        quality_limits = parse_cutoffs(args.quality_cutoff)
        quality_trimmer = QualityTrimmer(
            quality_limits[0], quality_limits[1], args.quality_base)
        add_to_all_reads(quality_trimmer)

    if not args.pair_adapters:
        # Independent adapter trimming of R1 and R2
        cutter_r1 = AdapterCutter(adapters, args.times, args.action) if adapters else None
        cutter_r2 = AdapterCutter(adapters2, args.times, args.action) if adapters2 else None
        if paired:
            if cutter_r1 or cutter_r2:
                pipeline.add(cutter_r1, cutter_r2)
        elif cutter_r1:
            pipeline.add(cutter_r1)
    else:
        try:
            pair_cutter = PairedAdapterCutter(adapters, adapters2, args.action)
        except PairedAdapterCutterError as e:
            raise CommandLineError("--pair-adapters: " + str(e))
        pipeline.add_paired_modifier(pair_cutter)

    for common_modifier in modifiers_applying_to_both_ends_if_paired(args):
        add_to_all_reads(common_modifier)

    # Filtering: length limits first, then the remaining settings
    for limit_name in ('minimum_length', 'maximum_length'):
        raw_limit = getattr(args, limit_name)
        if raw_limit is None:
            continue
        limits = parse_lengths(raw_limit)
        if paired:
            if len(limits) == 1:
                # One value is used for both reads of a pair
                limits = (limits[0], limits[0])
        elif len(limits) == 2:
            raise CommandLineError(
                'Two minimum or maximum lengths given for single-end data')
        setattr(pipeline, limit_name, limits)

    pipeline.max_n = args.max_n
    pipeline.discard_casava = args.discard_casava
    pipeline.discard_trimmed = args.discard_trimmed
    pipeline.discard_untrimmed = args.discard_untrimmed
    return pipeline
def pipeline_from_parsed_args(args, paired, is_interleaved_output):
    """
    Validate the parsed command-line arguments and build the processing pipeline.

    Parameters:
        args: the parsed command-line arguments. The action 'none' is
            replaced by None in place.
        paired: truthy to build a PairedEndPipeline, falsy for a
            SingleEndPipeline.
        is_interleaved_output: when true, the -o/-p output options are not
            required for paired-end data.

    Returns:
        The configured SingleEndPipeline or PairedEndPipeline.

    Raises:
        CommandLineError: if the arguments are inconsistent or out of range,
            if an adapter specification cannot be parsed or refers to a
            missing file, or if bases are to be removed from the second read
            of single-end data.
    """
    if not paired:
        if args.untrimmed_paired_output:
            raise CommandLineError(
                "Option --untrimmed-paired-output can only be used when "
                "trimming paired-end reads (with option -p).")

    if paired:
        if not is_interleaved_output:
            if not args.paired_output:
                raise CommandLineError(
                    "When a paired-end trimming option such as -A/-G/-B/-U, "
                    "is used, a second output file needs to be specified via -p (--paired-output)."
                )
            if not args.output:
                raise CommandLineError(
                    "When you use -p or --paired-output, you must also "
                    "use the -o option.")

        if bool(args.untrimmed_output) != bool(args.untrimmed_paired_output):
            raise CommandLineError(
                "When trimming paired-end reads, you must use either none "
                "or both of the --untrimmed-output/--untrimmed-paired-output options."
            )
        if args.too_short_output and not args.too_short_paired_output:
            raise CommandLineError(
                "When using --too-short-output with paired-end "
                "reads, you also need to use --too-short-paired-output")
        if args.too_long_output and not args.too_long_paired_output:
            raise CommandLineError(
                "When using --too-long-output with paired-end "
                "reads, you also need to use --too-long-paired-output")

    if args.format is not None:
        logger.warning(
            "Option --format is deprecated and ignored because the input file format is "
            "always auto-detected")

    if not (0 <= args.error_rate < 1.):
        raise CommandLineError(
            "The maximum error rate must be at least 0 and less than 1.")
    if args.overlap < 1:
        raise CommandLineError("The overlap must be at least 1.")
    if not (0 <= args.gc_content <= 100):
        raise CommandLineError(
            "GC content must be given as percentage between 0 and 100")
    if args.action == 'none':
        args.action = None

    adapter_parser = AdapterParser(
        max_error_rate=args.error_rate,
        min_overlap=args.overlap,
        read_wildcards=args.match_read_wildcards,
        adapter_wildcards=args.match_adapter_wildcards,
        indels=args.indels,
    )
    try:
        adapters = adapter_parser.parse_multi(args.adapters, args.anywhere, args.front)
        adapters2 = adapter_parser.parse_multi(args.adapters2, args.anywhere2, args.front2)
    except IOError as e:
        # Only a missing file is reported as a usage error; any other I/O
        # failure propagates unchanged.
        if e.errno == errno.ENOENT:
            raise CommandLineError(e) from e
        raise
    except ValueError as e:
        raise CommandLineError(e) from e
    if args.debug:
        for adapter in adapters + adapters2:
            adapter.enable_debug()

    # Create the processing pipeline
    if paired:
        pair_filter_mode = 'any' if args.pair_filter is None else args.pair_filter
        pipeline = PairedEndPipeline(pair_filter_mode)
    else:
        pipeline = SingleEndPipeline()

    # When adapters are being trimmed only in R1 or R2, override the pair filter mode
    # as using the default of 'any' would regard all read pairs as untrimmed.
    if paired and (not adapters2 or not adapters) and (
            args.discard_untrimmed or args.untrimmed_output or args.untrimmed_paired_output):
        pipeline.override_untrimmed_pair_filter = True

    for i, cut_arg in enumerate([args.cut, args.cut2]):
        # cut_arg is a list
        if not cut_arg:
            continue
        if len(cut_arg) > 2:
            raise CommandLineError(
                "You cannot remove bases from more than two ends.")
        if len(cut_arg) == 2 and cut_arg[0] * cut_arg[1] > 0:
            raise CommandLineError(
                "You cannot remove bases from the same end twice.")
        for c in cut_arg:
            if c == 0:
                continue
            if i == 0:  # R1
                if paired:
                    pipeline.add(UnconditionalCutter(c), None)
                else:
                    pipeline.add(UnconditionalCutter(c))
            else:  # R2
                # This depends on user input, so it must be reported as a
                # regular usage error; an assert would raise AssertionError
                # and be skipped entirely when Python runs with -O.
                if not isinstance(pipeline, PairedEndPipeline):
                    raise CommandLineError(
                        "Bases can only be removed from the second read when "
                        "trimming paired-end reads")
                pipeline.add(None, UnconditionalCutter(c))

    pipeline_add = pipeline.add_both if paired else pipeline.add

    if args.nextseq_trim is not None:
        pipeline_add(
            NextseqQualityTrimmer(args.nextseq_trim, args.quality_base))
    if args.quality_cutoff is not None:
        cutoffs = parse_cutoffs(args.quality_cutoff)
        pipeline_add(QualityTrimmer(cutoffs[0], cutoffs[1], args.quality_base))

    if args.pair_adapters:
        if not paired:
            raise CommandLineError(
                "Option --pair-adapters can only be used when trimming "
                "paired-end reads")
        if args.times != 1:
            raise CommandLineError(
                "--pair-adapters cannot be used with --times")
        try:
            cutter = PairedAdapterCutter(adapters, adapters2, args.action)
        except PairedAdapterCutterError as e:
            raise CommandLineError("--pair-adapters: " + str(e)) from e
        pipeline.add_paired_modifier(cutter)
    else:
        adapter_cutter, adapter_cutter2 = None, None
        if adapters:
            adapter_cutter = AdapterCutter(adapters, args.times, args.action)
        if adapters2:
            adapter_cutter2 = AdapterCutter(adapters2, args.times, args.action)
        if paired:
            if adapter_cutter or adapter_cutter2:
                pipeline.add(adapter_cutter, adapter_cutter2)
        else:
            if adapter_cutter:
                pipeline.add(adapter_cutter)

    # Remaining modifiers that apply to both reads of paired-end reads
    if args.length is not None:
        pipeline_add(Shortener(args.length))
    if args.trim_n:
        pipeline_add(NEndTrimmer())
    if args.length_tag:
        pipeline_add(LengthTagModifier(args.length_tag))
    for suffix in args.strip_suffix:
        pipeline_add(SuffixRemover(suffix))
    if args.prefix or args.suffix:
        pipeline_add(PrefixSuffixAdder(args.prefix, args.suffix))
    if args.zero_cap:
        pipeline_add(ZeroCapper(quality_base=args.quality_base))

    # Set filtering parameters
    # Minimum/maximum length
    for attr in 'minimum_length', 'maximum_length':
        param = getattr(args, attr)
        if param is not None:
            lengths = parse_lengths(param)
            if not paired and len(lengths) == 2:
                raise CommandLineError(
                    'Two minimum or maximum lengths given for single-end data')
            if paired and len(lengths) == 1:
                # A single value applies to both reads of a pair.
                lengths = (lengths[0], lengths[0])
            setattr(pipeline, attr, lengths)
    pipeline.max_n = args.max_n
    pipeline.discard_casava = args.discard_casava
    pipeline.discard_trimmed = args.discard_trimmed
    pipeline.discard_untrimmed = args.discard_untrimmed
    return pipeline
def pipeline_from_parsed_args(args, paired, is_interleaved_output):
    """
    Build the processing pipeline described by the parsed arguments.

    check_arguments() is called first with the same three arguments. The
    adapter specifications are then parsed, and the pipeline is created and
    populated with modifiers and filter settings.

    Parameters:
        args: the parsed command-line arguments. The action 'none' is
            replaced by None in place.
        paired: truthy to build a PairedEndPipeline, falsy for a
            SingleEndPipeline.
        is_interleaved_output: only forwarded to check_arguments().

    Returns:
        The configured SingleEndPipeline or PairedEndPipeline.

    Raises:
        CommandLineError: if an adapter specification cannot be parsed or
            refers to a missing file, if the cut lengths or the
            --pair-adapters combination are invalid, if bases are to be
            removed from the second read of single-end data, or if two
            minimum or maximum lengths are given for single-end data.
    """
    check_arguments(args, paired, is_interleaved_output)
    if args.action == 'none':
        args.action = None

    adapter_parser = AdapterParser(
        max_error_rate=args.error_rate,
        min_overlap=args.overlap,
        read_wildcards=args.match_read_wildcards,
        adapter_wildcards=args.match_adapter_wildcards,
        indels=args.indels,
    )
    try:
        adapters = adapter_parser.parse_multi(args.adapters, args.anywhere, args.front)
        adapters2 = adapter_parser.parse_multi(args.adapters2, args.anywhere2, args.front2)
    except IOError as e:
        # A missing file is a usage error; other I/O errors are re-raised as is.
        if e.errno == errno.ENOENT:
            raise CommandLineError(e) from e
        raise
    except ValueError as e:
        raise CommandLineError(e) from e
    if args.debug:
        for adapter in adapters + adapters2:
            adapter.enable_debug()

    # Create the processing pipeline
    if paired:
        pair_filter_mode = 'any' if args.pair_filter is None else args.pair_filter
        pipeline = PairedEndPipeline(pair_filter_mode)
    else:
        pipeline = SingleEndPipeline()

    # When adapters are being trimmed only in R1 or R2, override the pair filter mode
    # as using the default of 'any' would regard all read pairs as untrimmed.
    if paired and (not adapters2 or not adapters) and (
            args.discard_untrimmed or args.untrimmed_output or args.untrimmed_paired_output):
        pipeline.override_untrimmed_pair_filter = True

    for i, cut_arg in enumerate([args.cut, args.cut2]):
        # cut_arg is a list
        if not cut_arg:
            continue
        if len(cut_arg) > 2:
            raise CommandLineError("You cannot remove bases from more than two ends.")
        if len(cut_arg) == 2 and cut_arg[0] * cut_arg[1] > 0:
            raise CommandLineError("You cannot remove bases from the same end twice.")
        for c in cut_arg:
            if c == 0:
                continue
            if i == 0:  # R1
                if paired:
                    pipeline.add(UnconditionalCutter(c), None)
                else:
                    pipeline.add(UnconditionalCutter(c))
            else:  # R2
                # Raise a proper usage error instead of asserting: the
                # condition depends on user input, and assert statements are
                # removed when Python runs with -O.
                if not isinstance(pipeline, PairedEndPipeline):
                    raise CommandLineError(
                        "Bases can only be removed from the second read when "
                        "trimming paired-end reads")
                pipeline.add(None, UnconditionalCutter(c))

    pipeline_add = pipeline.add_both if paired else pipeline.add

    if args.nextseq_trim is not None:
        pipeline_add(NextseqQualityTrimmer(args.nextseq_trim, args.quality_base))
    if args.quality_cutoff is not None:
        cutoffs = parse_cutoffs(args.quality_cutoff)
        pipeline_add(QualityTrimmer(cutoffs[0], cutoffs[1], args.quality_base))

    if args.pair_adapters:
        if not paired:
            raise CommandLineError("Option --pair-adapters can only be used when trimming "
                "paired-end reads")
        if args.times != 1:
            raise CommandLineError("--pair-adapters cannot be used with --times")
        try:
            cutter = PairedAdapterCutter(adapters, adapters2, args.action)
        except PairedAdapterCutterError as e:
            raise CommandLineError("--pair-adapters: " + str(e)) from e
        pipeline.add_paired_modifier(cutter)
    else:
        adapter_cutter, adapter_cutter2 = None, None
        if adapters:
            adapter_cutter = AdapterCutter(adapters, args.times, args.action)
        if adapters2:
            adapter_cutter2 = AdapterCutter(adapters2, args.times, args.action)
        if paired:
            if adapter_cutter or adapter_cutter2:
                pipeline.add(adapter_cutter, adapter_cutter2)
        else:
            if adapter_cutter:
                pipeline.add(adapter_cutter)

    # Remaining modifiers that apply to both reads of paired-end reads
    if args.length is not None:
        pipeline_add(Shortener(args.length))
    if args.trim_n:
        pipeline_add(NEndTrimmer())
    if args.length_tag:
        pipeline_add(LengthTagModifier(args.length_tag))
    for suffix in args.strip_suffix:
        pipeline_add(SuffixRemover(suffix))
    if args.prefix or args.suffix:
        pipeline_add(PrefixSuffixAdder(args.prefix, args.suffix))
    if args.zero_cap:
        pipeline_add(ZeroCapper(quality_base=args.quality_base))

    # Set filtering parameters
    # Minimum/maximum length
    for attr in 'minimum_length', 'maximum_length':
        param = getattr(args, attr)
        if param is not None:
            lengths = parse_lengths(param)
            if not paired and len(lengths) == 2:
                raise CommandLineError('Two minimum or maximum lengths given for single-end data')
            if paired and len(lengths) == 1:
                # A single value applies to both reads of a pair.
                lengths = (lengths[0], lengths[0])
            setattr(pipeline, attr, lengths)
    pipeline.max_n = args.max_n
    pipeline.discard_casava = args.discard_casava
    pipeline.discard_trimmed = args.discard_trimmed
    pipeline.discard_untrimmed = args.discard_untrimmed
    return pipeline