def __call__(self):
    """Configure the trimming pipeline from ``self.options`` and run it.

    The method proceeds in four stages:

    1. Parse the adapter sequences for read 1 and read 2, if any were given.
    2. Validate the option combination, then create the read modifiers:
       first those listed in ``options.op_order`` (in that order), then the
       remaining modifiers in the fixed order written below.
    3. Create the filters and the formatters that route reads to outputs.
    4. Wrap everything in a record handler and run it, serially when
       ``options.threads`` is None and via ``self.run_parallel`` otherwise.

    Returns:
        The value returned by ``run_interruptible`` (serial mode) or by
        ``self.run_parallel`` (parallel mode).

    Raises:
        ValueError: If none of the adapter/cutting/quality/length options
            checked below is set, or if the insert aligner is selected
            without a single 3' adapter for each read.
    """
    options = self.options
    match_probability = RandomMatchProbability()

    # Create Adapters

    has_adapters1 = options.adapters or options.anywhere or options.front
    has_adapters2 = options.adapters2 or options.anywhere2 or options.front2

    # Two distinct lists, so the per-read adapter lists never alias each other.
    adapters1 = []
    adapters2 = []
    if has_adapters1 or has_adapters2:
        adapter_cache = super().load_known_adapters()
        parser_args = dict(
            colorspace=options.colorspace,
            max_error_rate=options.error_rate,
            min_overlap=options.overlap,
            read_wildcards=options.match_read_wildcards,
            adapter_wildcards=options.match_adapter_wildcards,
            indels=options.indels,
            indel_cost=options.indel_cost,
            cache=adapter_cache,
            gc_content=options.gc_content,
            match_probability=match_probability,
            alphabet=options.alphabet)
        # Only pass max_rmp when it was set, so the parser's own default
        # applies otherwise.
        if options.adapter_max_rmp:
            parser_args['max_rmp'] = options.adapter_max_rmp
        adapter_parser = AdapterParser(**parser_args)
        if has_adapters1:
            adapters1 = adapter_parser.parse_multi(
                options.adapters, options.anywhere, options.front)
        if has_adapters2:
            adapters2 = adapter_parser.parse_multi(
                options.adapters2, options.anywhere2, options.front2)
        if options.cache_adapters:
            adapter_cache.save()

    # Create Modifiers

    # TODO: can this be replaced with an argparse required group?
    if (
            not adapters1 and not adapters2 and
            not options.quality_cutoff and
            options.nextseq_trim is None and
            options.cut == [] and options.cut2 == [] and
            options.cut_min == [] and options.cut_min2 == [] and
            (
                options.minimum_length is None or
                options.minimum_length <= 0) and
            options.maximum_length == sys.maxsize and
            not options.trim_n and
            not self.has_qualfile and
            options.max_n is None and
            (not options.paired or options.overwrite_low_quality is None)):
        raise ValueError(
            "You need to provide at least one adapter sequence.")

    # The insert aligner needs exactly one adapter per read, and it must be
    # a BACK (3') adapter.
    if (
            options.aligner == 'insert' and any(
                not a or len(a) != 1 or a[0].where != BACK
                for a in (adapters1, adapters2))):
        raise ValueError(
            "Insert aligner requires a single 3' adapter for each read")

    if options.debug:
        for adapter in adapters1 + adapters2:
            adapter.enable_debug()

    if options.paired:
        modifiers = PairedEndModifiers(options.paired)
    else:
        modifiers = SingleEndModifiers()

    # These modifiers are added in the user-controlled order.
    for oper in options.op_order:
        if oper == 'W' and options.overwrite_low_quality:
            lowq, highq, window = options.overwrite_low_quality
            modifiers.add_modifier(
                OverwriteRead,
                worse_read_min_quality=lowq,
                better_read_min_quality=highq,
                window_size=window,
                base=options.quality_base)

        elif oper == 'A' and (adapters1 or adapters2):
            # TODO: generalize this using some kind of factory class
            if options.aligner == 'insert':
                # Use different base probabilities if we're trimming
                # bisulfite data.
                # TODO: this doesn't seem to help things, so commenting it
                # out for now
                #if options.bisulfite:
                #   base_probs = dict(match_prob=0.33, mismatch_prob=0.67)
                # else:
                #   base_probs = dict(match_prob=0.25, mismatch_prob=0.75)
                modifiers.add_modifier(
                    InsertAdapterCutter,
                    adapter1=adapters1[0],
                    adapter2=adapters2[0],
                    action=options.action,
                    mismatch_action=options.correct_mismatches,
                    max_insert_mismatch_frac=options.insert_match_error_rate,
                    max_adapter_mismatch_frac=(
                        options.insert_match_adapter_error_rate),
                    match_probability=match_probability,
                    insert_max_rmp=options.insert_max_rmp,
                    read_wildcards=options.match_read_wildcards,
                    adapter_wildcards=options.match_adapter_wildcards)
            else:
                # None is passed for a read that has no adapters.
                a1_args = dict(
                    adapters=adapters1,
                    times=options.times,
                    action=options.action) if adapters1 else None
                a2_args = dict(
                    adapters=adapters2,
                    times=options.times,
                    action=options.action) if adapters2 else None
                modifiers.add_modifier_pair(AdapterCutter, a1_args, a2_args)

        elif oper == 'C' and (options.cut or options.cut2):
            modifiers.add_modifier_pair(
                UnconditionalCutter,
                dict(lengths=options.cut),
                dict(lengths=options.cut2))

        elif oper == 'G' and (options.nextseq_trim is not None):
            modifiers.add_modifier(
                NextseqQualityTrimmer,
                cutoff=options.nextseq_trim,
                base=options.quality_base)

        elif oper == 'Q' and options.quality_cutoff:
            modifiers.add_modifier(
                QualityTrimmer,
                cutoff_front=options.quality_cutoff[0],
                cutoff_back=options.quality_cutoff[1],
                base=options.quality_base)

    # options.bisulfite is either a protocol name (str) or an indexable
    # holding per-read keyword arguments for MinCutter.
    if options.bisulfite:
        if isinstance(options.bisulfite, str):
            if "non-directional" in options.bisulfite:
                modifiers.add_modifier(
                    NonDirectionalBisulfiteTrimmer,
                    rrbs=options.bisulfite == "non-directional-rrbs")
            elif options.bisulfite == "rrbs":
                modifiers.add_modifier(RRBSTrimmer)
            elif options.bisulfite in ("epignome", "truseq"):
                # Trimming leads to worse results
                #modifiers.add_modifier(TruSeqBisulfiteTrimmer)
                pass
            elif options.bisulfite == "swift":
                modifiers.add_modifier(SwiftBisulfiteTrimmer)
        else:
            if options.bisulfite[0]:
                modifiers.add_modifier(
                    MinCutter, read=1, **(options.bisulfite[0]))
            if len(options.bisulfite) > 1 and options.bisulfite[1]:
                modifiers.add_modifier(
                    MinCutter, read=2, **(options.bisulfite[1]))

    if options.trim_n:
        modifiers.add_modifier(NEndTrimmer)

    if options.cut_min or options.cut_min2:
        modifiers.add_modifier_pair(
            MinCutter,
            dict(lengths=options.cut_min),
            dict(lengths=options.cut_min2))

    if options.length_tag:
        modifiers.add_modifier(
            LengthTagModifier, length_tag=options.length_tag)

    if options.strip_suffix:
        modifiers.add_modifier(SuffixRemover, suffixes=options.strip_suffix)

    if options.prefix or options.suffix:
        modifiers.add_modifier(
            PrefixSuffixAdder, prefix=options.prefix, suffix=options.suffix)

    if options.double_encode:
        modifiers.add_modifier(DoubleEncoder)

    if options.zero_cap and self.delivers_qualities:
        modifiers.add_modifier(
            ZeroCapper, quality_base=options.quality_base)

    if options.trim_primer:
        modifiers.add_modifier(PrimerTrimmer)

    if options.merge_overlapping:
        modifiers.add_modifier(
            MergeOverlapping,
            min_overlap=options.merge_min_overlap,
            error_rate=options.merge_error_rate,
            mismatch_action=options.correct_mismatches)

    # Create Filters and Formatters

    min_affected = 2 if options.pair_filter == 'both' else 1
    filters = Filters(FilterFactory(options.paired, min_affected))

    output1 = output2 = None
    interleaved = False
    if options.interleaved_output:
        output1 = options.interleaved_output
        interleaved = True
    else:
        output1 = options.output
        output2 = options.paired_output

    seq_formatter_args = dict(
        qualities=self.delivers_qualities,
        colorspace=options.colorspace,
        interleaved=interleaved
    )
    formatters = Formatters(output1, seq_formatter_args)
    # Output paths handed to Writers() below.
    force_create = []

    if options.merge_overlapping:
        filters.add_filter(MergedReadFilter)
        if options.merged_output:
            formatters.add_seq_formatter(
                MergedReadFilter, options.merged_output)

    if options.minimum_length is not None and options.minimum_length > 0:
        filters.add_filter(TooShortReadFilter, options.minimum_length)
        if options.too_short_output:
            formatters.add_seq_formatter(
                TooShortReadFilter,
                options.too_short_output, options.too_short_paired_output)

    if options.maximum_length < sys.maxsize:
        filters.add_filter(TooLongReadFilter, options.maximum_length)
        if options.too_long_output is not None:
            formatters.add_seq_formatter(
                TooLongReadFilter,
                options.too_long_output, options.too_long_paired_output)

    if options.max_n is not None:
        filters.add_filter(NContentFilter, options.max_n)

    if options.discard_trimmed:
        filters.add_filter(TrimmedFilter)

    if not formatters.multiplexed:
        if output1 is not None:
            formatters.add_seq_formatter(NoFilter, output1, output2)
            if output1 != STDOUT and options.writer_process:
                force_create.append(output1)
                if output2 is not None:
                    force_create.append(output2)
        elif not (options.discard_trimmed and options.untrimmed_output):
            formatters.add_seq_formatter(NoFilter, options.default_outfile)
            if options.default_outfile != STDOUT and options.writer_process:
                force_create.append(options.default_outfile)

    if options.discard_untrimmed or options.untrimmed_output:
        filters.add_filter(UntrimmedFilter)

    if not options.discard_untrimmed:
        if formatters.multiplexed:
            # Without an explicit untrimmed output, fill the output name
            # template with the name 'unknown'.
            untrimmed = options.untrimmed_output or output1.format(
                name='unknown')
            formatters.add_seq_formatter(UntrimmedFilter, untrimmed)
            formatters.add_seq_formatter(NoFilter, untrimmed)
        elif options.untrimmed_output:
            formatters.add_seq_formatter(
                UntrimmedFilter,
                options.untrimmed_output, options.untrimmed_paired_output)

    if options.rest_file:
        formatters.add_info_formatter(RestFormatter(options.rest_file))
    if options.info_file:
        formatters.add_info_formatter(InfoFormatter(options.info_file))
    if options.wildcard_file:
        formatters.add_info_formatter(
            WildcardFormatter(options.wildcard_file))

    if options.paired:
        mixin_class = PairedEndPipelineMixin
    else:
        mixin_class = SingleEndPipelineMixin
    writers = Writers(force_create)
    record_handler = RecordHandler(modifiers, filters, formatters)
    if options.stats:
        record_handler = StatsRecordHandlerWrapper(
            record_handler, options.paired, options.stats,
            qualities=self.delivers_qualities,
            quality_base=self.quality_base)

    logger = logging.getLogger()
    num_adapters = sum(len(a) for a in modifiers.get_adapters())
    # Pluralize for every count except exactly one. Zero adapters is a valid
    # configuration (e.g. quality trimming only) and must read "0 adapters".
    logger.info(
        "Trimming %s adapter%s with at most %.1f%% errors in %s mode ...",
        num_adapters,
        '' if num_adapters == 1 else 's',
        options.error_rate * 100,
        {
            False: 'single-end',
            'first': 'paired-end legacy',
            'both': 'paired-end'
        }[options.paired])

    if (
            options.paired == 'first' and (
                len(record_handler.modifiers.get_modifiers(read=2)) > 0 or
                options.quality_cutoff)):
        logger.warning('\n'.join(textwrap.wrap(
            'Requested read modifications are applied only to the '
            'first read since backwards compatibility mode is enabled. '
            'To modify both reads, also use any of the -A/-B/-G/-U '
            'options. Use a dummy adapter sequence when necessary: '
            '-A XXX')))

    if options.threads is None:
        # Run single-threaded version
        result_handler = WorkerResultHandler(WriterResultHandler(writers))
        # Build the concrete pipeline class on the fly from the single- or
        # paired-end mixin and the generic TrimPipeline.
        pipeline_class = type(
            'TrimPipelineImpl', (mixin_class, TrimPipeline), {})
        pipeline = pipeline_class(record_handler, result_handler)
        self.summary.update(mode='serial', threads=1)
        return run_interruptible(pipeline, self, raise_on_error=True)
    else:
        # Run multiprocessing version
        self.summary.update(mode='parallel', threads=options.threads)
        return self.run_parallel(record_handler, writers, mixin_class)
def __call__(self):
    """Configure the trimming pipeline from ``self.options`` and run it.

    The method proceeds in four stages:

    1. Parse the adapter sequences for read 1 and read 2, if any were given.
    2. Validate the option combination, then create the read modifiers:
       first those listed in ``options.op_order`` (in that order), then the
       remaining modifiers in the fixed order written below.
    3. Create the filters and the formatters that route reads to outputs.
    4. Wrap everything in a record handler and run it, serially when
       ``options.threads`` is None and via ``self.run_parallel`` otherwise.

    Returns:
        The value returned by ``run_interruptible`` (serial mode) or by
        ``self.run_parallel`` (parallel mode).

    Raises:
        ValueError: If none of the adapter/cutting/quality/length options
            checked below is set, or if the insert aligner is selected
            without a single 3' adapter for each read.
    """
    options = self.options
    match_probability = RandomMatchProbability()

    # Create Adapters

    has_adapters1 = options.adapters or options.anywhere or options.front
    has_adapters2 = options.adapters2 or options.anywhere2 or options.front2

    # Two distinct lists, so the per-read adapter lists never alias each other.
    adapters1 = []
    adapters2 = []
    if has_adapters1 or has_adapters2:
        adapter_cache = super().load_known_adapters()
        parser_args = dict(
            colorspace=options.colorspace,
            max_error_rate=options.error_rate,
            min_overlap=options.overlap,
            read_wildcards=options.match_read_wildcards,
            adapter_wildcards=options.match_adapter_wildcards,
            indels=options.indels,
            indel_cost=options.indel_cost,
            cache=adapter_cache,
            gc_content=options.gc_content,
            match_probability=match_probability,
            alphabet=options.alphabet)
        # Only pass max_rmp when it was set, so the parser's own default
        # applies otherwise.
        if options.adapter_max_rmp:
            parser_args['max_rmp'] = options.adapter_max_rmp
        adapter_parser = AdapterParser(**parser_args)
        if has_adapters1:
            adapters1 = adapter_parser.parse_multi(
                options.adapters, options.anywhere, options.front)
        if has_adapters2:
            adapters2 = adapter_parser.parse_multi(
                options.adapters2, options.anywhere2, options.front2)
        if options.cache_adapters:
            adapter_cache.save()

    # Create Modifiers

    # TODO: can this be replaced with an argparse required group?
    if (not adapters1 and not adapters2 and
            not options.quality_cutoff and
            options.nextseq_trim is None and
            options.cut == [] and options.cut2 == [] and
            options.cut_min == [] and options.cut_min2 == [] and
            (options.minimum_length is None or
             options.minimum_length <= 0) and
            options.maximum_length == sys.maxsize and
            not options.trim_n and
            not self.has_qualfile and
            options.max_n is None and
            (not options.paired or options.overwrite_low_quality is None)):
        raise ValueError(
            "You need to provide at least one adapter sequence.")

    # The insert aligner needs exactly one adapter per read, and it must be
    # a BACK (3') adapter.
    if (options.aligner == 'insert' and
            any(not a or len(a) != 1 or a[0].where != BACK
                for a in (adapters1, adapters2))):
        raise ValueError(
            "Insert aligner requires a single 3' adapter for each read")

    if options.debug:
        for adapter in adapters1 + adapters2:
            adapter.enable_debug()

    if options.paired:
        modifiers = PairedEndModifiers(options.paired)
    else:
        modifiers = SingleEndModifiers()

    # These modifiers are added in the user-controlled order.
    for oper in options.op_order:
        if oper == 'W' and options.overwrite_low_quality:
            lowq, highq, window = options.overwrite_low_quality
            modifiers.add_modifier(OverwriteRead,
                                   worse_read_min_quality=lowq,
                                   better_read_min_quality=highq,
                                   window_size=window,
                                   base=options.quality_base)

        elif oper == 'A' and (adapters1 or adapters2):
            # TODO: generalize this using some kind of factory class
            if options.aligner == 'insert':
                # Use different base probabilities if we're trimming
                # bisulfite data.
                # TODO: this doesn't seem to help things, so commenting it
                # out for now
                #if options.bisulfite:
                #   base_probs = dict(match_prob=0.33, mismatch_prob=0.67)
                # else:
                #   base_probs = dict(match_prob=0.25, mismatch_prob=0.75)
                modifiers.add_modifier(
                    InsertAdapterCutter,
                    adapter1=adapters1[0],
                    adapter2=adapters2[0],
                    action=options.action,
                    mismatch_action=options.correct_mismatches,
                    max_insert_mismatch_frac=options.insert_match_error_rate,
                    max_adapter_mismatch_frac=(
                        options.insert_match_adapter_error_rate),
                    match_probability=match_probability,
                    insert_max_rmp=options.insert_max_rmp,
                    read_wildcards=options.match_read_wildcards,
                    adapter_wildcards=options.match_adapter_wildcards)
            else:
                # None is passed for a read that has no adapters.
                a1_args = dict(
                    adapters=adapters1,
                    times=options.times,
                    action=options.action) if adapters1 else None
                a2_args = dict(
                    adapters=adapters2,
                    times=options.times,
                    action=options.action) if adapters2 else None
                modifiers.add_modifier_pair(AdapterCutter, a1_args, a2_args)

        elif oper == 'C' and (options.cut or options.cut2):
            modifiers.add_modifier_pair(UnconditionalCutter,
                                        dict(lengths=options.cut),
                                        dict(lengths=options.cut2))

        elif oper == 'G' and (options.nextseq_trim is not None):
            modifiers.add_modifier(NextseqQualityTrimmer,
                                   cutoff=options.nextseq_trim,
                                   base=options.quality_base)

        elif oper == 'Q' and options.quality_cutoff:
            modifiers.add_modifier(QualityTrimmer,
                                   cutoff_front=options.quality_cutoff[0],
                                   cutoff_back=options.quality_cutoff[1],
                                   base=options.quality_base)

    # options.bisulfite is either a protocol name (str) or an indexable
    # holding per-read keyword arguments for MinCutter.
    if options.bisulfite:
        if isinstance(options.bisulfite, str):
            if "non-directional" in options.bisulfite:
                modifiers.add_modifier(
                    NonDirectionalBisulfiteTrimmer,
                    rrbs=options.bisulfite == "non-directional-rrbs")
            elif options.bisulfite == "rrbs":
                modifiers.add_modifier(RRBSTrimmer)
            elif options.bisulfite in ("epignome", "truseq"):
                # Trimming leads to worse results
                #modifiers.add_modifier(TruSeqBisulfiteTrimmer)
                pass
            elif options.bisulfite == "swift":
                modifiers.add_modifier(SwiftBisulfiteTrimmer)
        else:
            if options.bisulfite[0]:
                modifiers.add_modifier(MinCutter,
                                       read=1,
                                       **(options.bisulfite[0]))
            if len(options.bisulfite) > 1 and options.bisulfite[1]:
                modifiers.add_modifier(MinCutter,
                                       read=2,
                                       **(options.bisulfite[1]))

    if options.trim_n:
        modifiers.add_modifier(NEndTrimmer)

    if options.cut_min or options.cut_min2:
        modifiers.add_modifier_pair(MinCutter,
                                    dict(lengths=options.cut_min),
                                    dict(lengths=options.cut_min2))

    if options.length_tag:
        modifiers.add_modifier(LengthTagModifier,
                               length_tag=options.length_tag)

    if options.strip_suffix:
        modifiers.add_modifier(SuffixRemover, suffixes=options.strip_suffix)

    if options.prefix or options.suffix:
        modifiers.add_modifier(PrefixSuffixAdder,
                               prefix=options.prefix,
                               suffix=options.suffix)

    if options.double_encode:
        modifiers.add_modifier(DoubleEncoder)

    if options.zero_cap and self.delivers_qualities:
        modifiers.add_modifier(ZeroCapper,
                               quality_base=options.quality_base)

    if options.trim_primer:
        modifiers.add_modifier(PrimerTrimmer)

    if options.merge_overlapping:
        modifiers.add_modifier(MergeOverlapping,
                               min_overlap=options.merge_min_overlap,
                               error_rate=options.merge_error_rate,
                               mismatch_action=options.correct_mismatches)

    # Create Filters and Formatters

    min_affected = 2 if options.pair_filter == 'both' else 1
    filters = Filters(FilterFactory(options.paired, min_affected))

    output1 = output2 = None
    interleaved = False
    if options.interleaved_output:
        output1 = options.interleaved_output
        interleaved = True
    else:
        output1 = options.output
        output2 = options.paired_output

    seq_formatter_args = dict(qualities=self.delivers_qualities,
                              colorspace=options.colorspace,
                              interleaved=interleaved)
    formatters = Formatters(output1, seq_formatter_args)
    # Output paths handed to Writers() below.
    force_create = []

    if options.merge_overlapping:
        filters.add_filter(MergedReadFilter)
        if options.merged_output:
            formatters.add_seq_formatter(MergedReadFilter,
                                         options.merged_output)

    if options.minimum_length is not None and options.minimum_length > 0:
        filters.add_filter(TooShortReadFilter, options.minimum_length)
        if options.too_short_output:
            formatters.add_seq_formatter(TooShortReadFilter,
                                         options.too_short_output,
                                         options.too_short_paired_output)

    if options.maximum_length < sys.maxsize:
        filters.add_filter(TooLongReadFilter, options.maximum_length)
        if options.too_long_output is not None:
            formatters.add_seq_formatter(TooLongReadFilter,
                                         options.too_long_output,
                                         options.too_long_paired_output)

    if options.max_n is not None:
        filters.add_filter(NContentFilter, options.max_n)

    if options.discard_trimmed:
        filters.add_filter(TrimmedFilter)

    if not formatters.multiplexed:
        if output1 is not None:
            formatters.add_seq_formatter(NoFilter, output1, output2)
            if output1 != STDOUT and options.writer_process:
                force_create.append(output1)
                if output2 is not None:
                    force_create.append(output2)
        elif not (options.discard_trimmed and options.untrimmed_output):
            formatters.add_seq_formatter(NoFilter, options.default_outfile)
            if options.default_outfile != STDOUT and options.writer_process:
                force_create.append(options.default_outfile)

    if options.discard_untrimmed or options.untrimmed_output:
        filters.add_filter(UntrimmedFilter)

    if not options.discard_untrimmed:
        if formatters.multiplexed:
            # Without an explicit untrimmed output, fill the output name
            # template with the name 'unknown'.
            untrimmed = options.untrimmed_output or output1.format(
                name='unknown')
            formatters.add_seq_formatter(UntrimmedFilter, untrimmed)
            formatters.add_seq_formatter(NoFilter, untrimmed)
        elif options.untrimmed_output:
            formatters.add_seq_formatter(UntrimmedFilter,
                                         options.untrimmed_output,
                                         options.untrimmed_paired_output)

    if options.rest_file:
        formatters.add_info_formatter(RestFormatter(options.rest_file))
    if options.info_file:
        formatters.add_info_formatter(InfoFormatter(options.info_file))
    if options.wildcard_file:
        formatters.add_info_formatter(
            WildcardFormatter(options.wildcard_file))

    if options.paired:
        mixin_class = PairedEndPipelineMixin
    else:
        mixin_class = SingleEndPipelineMixin
    writers = Writers(force_create)
    record_handler = RecordHandler(modifiers, filters, formatters)
    if options.stats:
        record_handler = StatsRecordHandlerWrapper(
            record_handler,
            options.paired,
            options.stats,
            qualities=self.delivers_qualities,
            quality_base=self.quality_base)

    logger = logging.getLogger()
    num_adapters = sum(len(a) for a in modifiers.get_adapters())
    # Pluralize for every count except exactly one. Zero adapters is a valid
    # configuration (e.g. quality trimming only) and must read "0 adapters".
    logger.info(
        "Trimming %s adapter%s with at most %.1f%% errors in %s mode ...",
        num_adapters,
        '' if num_adapters == 1 else 's',
        options.error_rate * 100,
        {
            False: 'single-end',
            'first': 'paired-end legacy',
            'both': 'paired-end'
        }[options.paired])

    if (options.paired == 'first' and
            (len(record_handler.modifiers.get_modifiers(read=2)) > 0 or
             options.quality_cutoff)):
        logger.warning('\n'.join(
            textwrap.wrap(
                'Requested read modifications are applied only to the '
                'first read since backwards compatibility mode is enabled. '
                'To modify both reads, also use any of the -A/-B/-G/-U '
                'options. Use a dummy adapter sequence when necessary: '
                '-A XXX')))

    if options.threads is None:
        # Run single-threaded version
        result_handler = WorkerResultHandler(WriterResultHandler(writers))
        # Build the concrete pipeline class on the fly from the single- or
        # paired-end mixin and the generic TrimPipeline.
        pipeline_class = type('TrimPipelineImpl',
                              (mixin_class, TrimPipeline), {})
        pipeline = pipeline_class(record_handler, result_handler)
        self.summary.update(mode='serial', threads=1)
        return run_interruptible(pipeline, self, raise_on_error=True)
    else:
        # Run multiprocessing version
        self.summary.update(mode='parallel', threads=options.threads)
        return self.run_parallel(record_handler, writers, mixin_class)
def create_trim_params(options, parser, default_outfile):
    """Assemble the reader, pipeline, formatters and writers for trimming.

    Args:
        options: Parsed options object; its attributes drive every decision
            made here.
        parser: Object whose ``error()`` method is used to report invalid
            option combinations and adapter parsing failures (presumably an
            argparse parser -- confirm against the caller).
        default_outfile: Output destination used when no explicit output
            was requested.

    Returns:
        The tuple ``(reader, pipeline, formatters, writers)``.
    """
    from atropos.adapters import AdapterParser, BACK
    from atropos.modifiers import (
        Modifiers, AdapterCutter, InsertAdapterCutter, UnconditionalCutter,
        NextseqQualityTrimmer, QualityTrimmer, NonDirectionalBisulfiteTrimmer,
        RRBSTrimmer, SwiftBisulfiteTrimmer, MinCutter, NEndTrimmer,
        LengthTagModifier, SuffixRemover, PrefixSuffixAdder, DoubleEncoder,
        ZeroCapper, PrimerTrimmer, MergeOverlapping, OverwriteRead)
    from atropos.filters import (
        Filters, FilterFactory, TooShortReadFilter, TooLongReadFilter,
        NContentFilter, TrimmedFilter, UntrimmedFilter, NoFilter,
        MergedReadFilter)
    from atropos.trim import Pipeline, PipelineWithStats
    from atropos.seqio import (
        Formatters, RestFormatter, InfoFormatter, WildcardFormatter, Writers)
    from atropos.util import RandomMatchProbability

    reader, input_names, qualities, has_qual_file = create_reader(
        options, parser)

    # The match probability helper is only needed (and only referenced
    # below) under these two conditions.
    if options.adapter_max_rmp or options.aligner == 'insert':
        match_probability = RandomMatchProbability()

    # ---- Adapters ----

    has_adapters1 = options.adapters or options.anywhere or options.front
    has_adapters2 = options.adapters2 or options.anywhere2 or options.front2

    adapters1 = adapters2 = []
    if has_adapters1 or has_adapters2:
        adapter_cache = load_known_adapters(options)
        parser_args = {
            'colorspace': options.colorspace,
            'max_error_rate': options.error_rate,
            'min_overlap': options.overlap,
            'read_wildcards': options.match_read_wildcards,
            'adapter_wildcards': options.match_adapter_wildcards,
            'indels': options.indels,
            'indel_cost': options.indel_cost,
            'cache': adapter_cache,
        }
        if options.adapter_max_rmp:
            parser_args.update(
                match_probability=match_probability,
                max_rmp=options.adapter_max_rmp)
        adapter_parser = AdapterParser(**parser_args)

        try:
            if has_adapters1:
                adapters1 = adapter_parser.parse_multi(
                    options.adapters, options.anywhere, options.front)
            if has_adapters2:
                adapters2 = adapter_parser.parse_multi(
                    options.adapters2, options.anywhere2, options.front2)
        except IOError as err:
            # A missing file is reported through the parser; anything else
            # (or a parser.error() that returns) propagates.
            if err.errno == errno.ENOENT:
                parser.error(err)
            raise
        except ValueError as err:
            parser.error(err)

        if options.cache_adapters:
            adapter_cache.save()

    # ---- Modifiers ----

    # TODO: can this be replaced with an argparse required group?
    nothing_requested = (
        not adapters1 and not adapters2 and not options.quality_cutoff
        and options.nextseq_trim is None
        and options.cut == [] and options.cut2 == []
        and options.cut_min == [] and options.cut_min2 == []
        and (options.minimum_length is None or options.minimum_length <= 0)
        and options.maximum_length == sys.maxsize
        and not has_qual_file and options.max_n is None
        and not options.trim_n
        and (not options.paired or options.overwrite_low_quality is None))
    if nothing_requested:
        parser.error("You need to provide at least one adapter sequence.")

    # Each read must have exactly one adapter, and it must be a BACK one.
    if options.aligner == 'insert' and any(
            not group or len(group) != 1 or group[0].where != BACK
            for group in (adapters1, adapters2)):
        parser.error(
            "Insert aligner requires a single 3' adapter for each read")

    if options.debug:
        for adapter in adapters1 + adapters2:
            adapter.enable_debug()

    modifiers = Modifiers(options.paired)

    # Operations whose relative order is chosen via options.op_order.
    for step in options.op_order:
        if step == 'W' and options.overwrite_low_quality:
            lowq, highq, window = options.overwrite_low_quality
            modifiers.add_modifier(
                OverwriteRead,
                worse_read_min_quality=lowq,
                better_read_min_quality=highq,
                window_size=window,
                base=options.quality_base)

        elif step == 'A' and (adapters1 or adapters2):
            # TODO: generalize this using some kind of factory class
            if options.aligner == 'insert':
                # Use different base probabilities if we're trimming
                # bisulfite data.
                # TODO: this doesn't seem to help things, so commenting it
                # out for now
                #base_probs = dict(p1=0.33, p2=0.67) if options.bisulfite else dict(p1=0.25, p2=0.75)
                modifiers.add_modifier(
                    InsertAdapterCutter,
                    adapter1=adapters1[0],
                    adapter2=adapters2[0],
                    action=options.action,
                    mismatch_action=options.correct_mismatches,
                    max_insert_mismatch_frac=options.insert_match_error_rate,
                    max_adapter_mismatch_frac=(
                        options.insert_match_adapter_error_rate),
                    match_probability=match_probability,
                    insert_max_rmp=options.insert_max_rmp)
            else:
                # A read without adapters gets None instead of an args dict.
                a1_args = {
                    'adapters': adapters1,
                    'times': options.times,
                    'action': options.action,
                } if adapters1 else None
                a2_args = {
                    'adapters': adapters2,
                    'times': options.times,
                    'action': options.action,
                } if adapters2 else None
                modifiers.add_modifier_pair(AdapterCutter, a1_args, a2_args)

        elif step == 'C' and (options.cut or options.cut2):
            modifiers.add_modifier_pair(
                UnconditionalCutter,
                {'lengths': options.cut},
                {'lengths': options.cut2})

        elif step == 'G' and (options.nextseq_trim is not None):
            modifiers.add_modifier(
                NextseqQualityTrimmer,
                read=1,
                cutoff=options.nextseq_trim,
                base=options.quality_base)

        elif step == 'Q' and options.quality_cutoff:
            cutoff_front, cutoff_back = (
                options.quality_cutoff[0], options.quality_cutoff[1])
            modifiers.add_modifier(
                QualityTrimmer,
                cutoff_front=cutoff_front,
                cutoff_back=cutoff_back,
                base=options.quality_base)

    # Either a protocol name (str) or an indexable of per-read MinCutter
    # keyword arguments.
    bisulfite = options.bisulfite
    if bisulfite:
        if isinstance(bisulfite, str):
            if "non-directional" in bisulfite:
                modifiers.add_modifier(
                    NonDirectionalBisulfiteTrimmer,
                    rrbs=bisulfite == "non-directional-rrbs")
            elif bisulfite == "rrbs":
                modifiers.add_modifier(RRBSTrimmer)
            elif bisulfite in ("epignome", "truseq"):
                # Trimming leads to worse results
                #modifiers.add_modifier(TruSeqBisulfiteTrimmer)
                pass
            elif bisulfite == "swift":
                modifiers.add_modifier(SwiftBisulfiteTrimmer)
        else:
            if bisulfite[0]:
                modifiers.add_modifier(MinCutter, read=1, **(bisulfite[0]))
            if len(bisulfite) > 1 and bisulfite[1]:
                modifiers.add_modifier(MinCutter, read=2, **(bisulfite[1]))

    if options.trim_n:
        modifiers.add_modifier(NEndTrimmer)

    if options.cut_min or options.cut_min2:
        modifiers.add_modifier_pair(
            MinCutter,
            {'lengths': options.cut_min},
            {'lengths': options.cut_min2})

    if options.length_tag:
        modifiers.add_modifier(
            LengthTagModifier, length_tag=options.length_tag)

    if options.strip_suffix:
        modifiers.add_modifier(SuffixRemover, suffixes=options.strip_suffix)

    if options.prefix or options.suffix:
        modifiers.add_modifier(
            PrefixSuffixAdder, prefix=options.prefix, suffix=options.suffix)

    if options.double_encode:
        modifiers.add_modifier(DoubleEncoder)

    if options.zero_cap and qualities:
        modifiers.add_modifier(ZeroCapper, quality_base=options.quality_base)

    if options.trim_primer:
        modifiers.add_modifier(PrimerTrimmer)

    if options.merge_overlapping:
        modifiers.add_modifier(
            MergeOverlapping,
            min_overlap=options.merge_min_overlap,
            error_rate=options.merge_error_rate,
            mismatch_action=options.correct_mismatches)

    # ---- Filters and Formatters ----

    min_affected = 2 if options.pair_filter == 'both' else 1
    filters = Filters(FilterFactory(options.paired, min_affected))

    # An interleaved output takes precedence over separate per-read outputs.
    if options.interleaved_output:
        output1, output2, interleaved = options.interleaved_output, None, True
    else:
        output1, output2, interleaved = (
            options.output, options.paired_output, False)

    seq_formatter_args = {
        'qualities': qualities,
        'colorspace': options.colorspace,
        'interleaved': interleaved,
    }
    formatters = Formatters(output1, seq_formatter_args)
    # Output paths handed to Writers() below.
    force_create = []

    if options.merge_overlapping:
        filters.add_filter(MergedReadFilter)
        if options.merged_output:
            formatters.add_seq_formatter(
                MergedReadFilter, options.merged_output)

    if options.minimum_length is not None and options.minimum_length > 0:
        filters.add_filter(TooShortReadFilter, options.minimum_length)
        if options.too_short_output:
            formatters.add_seq_formatter(
                TooShortReadFilter,
                options.too_short_output,
                options.too_short_paired_output)

    if options.maximum_length < sys.maxsize:
        filters.add_filter(TooLongReadFilter, options.maximum_length)
        if options.too_long_output is not None:
            formatters.add_seq_formatter(
                TooLongReadFilter,
                options.too_long_output,
                options.too_long_paired_output)

    if options.max_n is not None:
        filters.add_filter(NContentFilter, options.max_n)

    if options.discard_trimmed:
        filters.add_filter(TrimmedFilter)

    if not formatters.multiplexed:
        if output1 is not None:
            formatters.add_seq_formatter(NoFilter, output1, output2)
            if output1 != STDOUT and options.writer_process:
                force_create.append(output1)
                if output2 is not None:
                    force_create.append(output2)
        elif not (options.discard_trimmed and options.untrimmed_output):
            formatters.add_seq_formatter(NoFilter, default_outfile)
            if default_outfile != STDOUT and options.writer_process:
                force_create.append(default_outfile)

    if options.discard_untrimmed or options.untrimmed_output:
        filters.add_filter(UntrimmedFilter)

    if not options.discard_untrimmed:
        if formatters.multiplexed:
            # Fall back to the output name template filled with 'unknown'.
            untrimmed = (
                options.untrimmed_output or output1.format(name='unknown'))
            formatters.add_seq_formatter(UntrimmedFilter, untrimmed)
            formatters.add_seq_formatter(NoFilter, untrimmed)
        elif options.untrimmed_output:
            formatters.add_seq_formatter(
                UntrimmedFilter,
                options.untrimmed_output,
                options.untrimmed_paired_output)

    if options.rest_file:
        formatters.add_info_formatter(RestFormatter(options.rest_file))
    if options.info_file:
        formatters.add_info_formatter(InfoFormatter(options.info_file))
    if options.wildcard_file:
        formatters.add_info_formatter(
            WildcardFormatter(options.wildcard_file))

    writers = Writers(force_create)

    # Collect read statistics only when requested.
    if options.stats:
        read_stats = ReadStatistics(
            options.stats, options.paired, qualities=qualities,
            tile_key_regexp=options.tile_key_regexp)
        pipeline = PipelineWithStats(modifiers, filters, read_stats)
    else:
        pipeline = Pipeline(modifiers, filters)

    return (reader, pipeline, formatters, writers)
from argparse import ArgumentParser
from atropos.modifiers import AdapterCutter, InsertAdapterCutter
from atropos.adapters import AdapterParser
from atropos.seqio import open_reader

# Adapters shared by the benchmark helpers below: A1 is applied to read 1
# and A2 to read 2.
adapter_parser = AdapterParser()
A1 = adapter_parser.parse('AGATCGGAAGAGCACACGTCTGAACTCCAGTCACACAGTGATCTCGTATGCCGTCTTCTGCTTG')
A2 = adapter_parser.parse('AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT')


def create_adapter():
    """Return a callable that trims a read pair with two AdapterCutters.

    The callable takes ``(read1, read2)`` and returns a tuple with the
    result of the A1 cutter on ``read1`` and the A2 cutter on ``read2``.
    """
    ac1 = AdapterCutter([A1])
    ac2 = AdapterCutter([A2])
    return lambda read1, read2: (ac1(read1), ac2(read2))


def create_insert():
    """Return an InsertAdapterCutter built from the A1/A2 adapters."""
    return InsertAdapterCutter(A1, A2)


def time_process(reader, aligner):
    """Run ``aligner`` over every pair in ``reader`` and time it.

    Prints the number of processed read pairs together with the elapsed
    time, and returns the elapsed time in seconds (float).
    """
    from datetime import datetime
    start = datetime.now()
    i = run_process(reader, aligner)
    stop = datetime.now()
    t = stop-start
    print("Time to process {} read pairs: {}".format(i, t))
    return t.total_seconds()


def run_process(reader, aligner):
    """Apply ``aligner`` to every ``(read1, read2)`` pair from ``reader``.

    Returns the number of pairs processed, which is 0 for an empty reader.
    """
    # Start at zero so an empty reader returns 0 instead of raising
    # UnboundLocalError on the final ``return``.
    i = 0
    for i, (read1, read2) in enumerate(reader, 1):
        # The trimmed reads are discarded; only the work itself matters here.
        aligner(read1, read2)
    return i
def create_atropos_params(options, parser, default_outfile):
    """Assemble the reader, modifiers, filters, formatters and writers for a run.

    Relies on names that are not defined in this function and are expected to
    be available at module level: ``create_reader``, ``load_known_adapters``,
    ``AtroposParams``, ``STDOUT``, ``sys`` and ``errno``.

    Args:
        options: Object carrying the run configuration as attributes (e.g.
            ``adapters``, ``aligner``, ``op_order``, ``output``); presumably
            the parsed command-line namespace -- confirm against the caller.
        parser: Object whose ``error()`` method is called to report invalid
            option combinations and adapter-parsing failures; presumably an
            ``argparse.ArgumentParser``, whose ``error()`` exits the program
            -- confirm against the caller.
        default_outfile: Destination used for the main output when neither
            ``options.interleaved_output`` nor ``options.output`` is set.

    Returns:
        ``AtroposParams(reader, modifiers, filters, formatters, writers)``.
    """
    from atropos.adapters import AdapterParser, BACK
    from atropos.modifiers import (
        Modifiers, AdapterCutter, InsertAdapterCutter, UnconditionalCutter,
        NextseqQualityTrimmer, QualityTrimmer, NonDirectionalBisulfiteTrimmer,
        RRBSTrimmer, SwiftBisulfiteTrimmer, MinCutter, NEndTrimmer,
        LengthTagModifier, SuffixRemover, PrefixSuffixAdder, DoubleEncoder,
        ZeroCapper, PrimerTrimmer, MergeOverlapping, OverwriteRead)
    from atropos.filters import (
        Filters, FilterFactory, TooShortReadFilter, TooLongReadFilter,
        NContentFilter, TrimmedFilter, UntrimmedFilter, NoFilter,
        MergedReadFilter)
    from atropos.seqio import Formatters, RestFormatter, InfoFormatter, WildcardFormatter, Writers
    from atropos.util import RandomMatchProbability

    # NOTE: input_names is unpacked but not used anywhere in this function.
    reader, input_names, qualities, has_qual_file = create_reader(options, parser)

    # match_probability is only bound under this condition; both later uses
    # (the max_rmp parser argument and the insert aligner) are guarded by the
    # same two options.
    if options.adapter_max_rmp or options.aligner == 'insert':
        match_probability = RandomMatchProbability()

    # Create Adapters

    has_adapters1 = options.adapters or options.anywhere or options.front
    has_adapters2 = options.adapters2 or options.anywhere2 or options.front2

    # Both names start out bound to the same empty list; they are only ever
    # rebound below, never mutated in place.
    adapters1 = adapters2 = []
    if has_adapters1 or has_adapters2:
        adapter_cache = load_known_adapters(options)
        parser_args = dict(
            colorspace=options.colorspace,
            max_error_rate=options.error_rate,
            min_overlap=options.overlap,
            read_wildcards=options.match_read_wildcards,
            adapter_wildcards=options.match_adapter_wildcards,
            indels=options.indels, indel_cost=options.indel_cost,
            cache=adapter_cache
        )
        if options.adapter_max_rmp:
            parser_args['match_probability'] = match_probability
            parser_args['max_rmp'] = options.adapter_max_rmp
        adapter_parser = AdapterParser(**parser_args)

        try:
            if has_adapters1:
                adapters1 = adapter_parser.parse_multi(
                    options.adapters, options.anywhere, options.front)
            if has_adapters2:
                adapters2 = adapter_parser.parse_multi(
                    options.adapters2, options.anywhere2, options.front2)
        except IOError as e:
            # A missing file is reported through the parser; any other I/O
            # error is re-raised unchanged.
            if e.errno == errno.ENOENT:
                parser.error(e)
            raise
        except ValueError as e:
            parser.error(e)

        if options.cache_adapters:
            adapter_cache.save()

    # Create Modifiers

    # Reject a run in which no adapter and none of the other options checked
    # below has been given.
    # TODO: can this be replaced with an argparse required group?
    if not adapters1 and not adapters2 and not options.quality_cutoff and \
            options.nextseq_trim is None and \
            options.cut == [] and options.cut2 == [] and \
            options.cut_min == [] and options.cut_min2 == [] and \
            (options.minimum_length is None or options.minimum_length <= 0) and \
            options.maximum_length == sys.maxsize and \
            not has_qual_file and options.max_n is None and not options.trim_n \
            and (not options.paired or options.overwrite_low_quality is None):
        parser.error("You need to provide at least one adapter sequence.")

    # The insert aligner needs exactly one adapter per read, and each must
    # have where == BACK.
    if options.aligner == 'insert':
        if not adapters1 or len(adapters1) != 1 or adapters1[0].where != BACK or \
                not adapters2 or len(adapters2) != 1 or adapters2[0].where != BACK:
            parser.error("Insert aligner requires a single 3' adapter for each read")

    if options.debug:
        for adapter in adapters1 + adapters2:
            adapter.enable_debug()

    modifiers = Modifiers(options.paired)

    # These operations are registered in the order given by options.op_order.
    # Codes handled: W = overwrite low-quality read, A = adapter trimming,
    # C = unconditional cut, G = NextSeq quality trimming, Q = quality
    # trimming. A code whose option is unset is skipped.
    for op in options.op_order:
        if op == 'W' and options.overwrite_low_quality:
            lowq, highq, window = options.overwrite_low_quality
            modifiers.add_modifier(OverwriteRead,
                worse_read_min_quality=lowq, better_read_min_quality=highq,
                window_size=window, base=options.quality_base)

        elif op == 'A' and (adapters1 or adapters2):
            # TODO: generalize this using some kind of factory class
            if options.aligner == 'insert':
                # Use different base probabilities if we're trimming bisulfite data.
                # TODO: this doesn't seem to help things, so commenting it out for now
                #base_probs = dict(p1=0.33, p2=0.67) if options.bisulfite else dict(p1=0.25, p2=0.75)
                modifiers.add_modifier(InsertAdapterCutter,
                    adapter1=adapters1[0], adapter2=adapters2[0], action=options.action,
                    mismatch_action=options.correct_mismatches,
                    max_insert_mismatch_frac=options.insert_match_error_rate,
                    max_adapter_mismatch_frac=options.insert_match_adapter_error_rate,
                    match_probability=match_probability,
                    insert_max_rmp=options.insert_max_rmp)
            else:
                # The argument dict stays None for a read that has no
                # adapters.
                a1_args = a2_args = None
                if adapters1:
                    a1_args = dict(adapters=adapters1, times=options.times, action=options.action)
                if adapters2:
                    a2_args = dict(adapters=adapters2, times=options.times, action=options.action)
                modifiers.add_modifier_pair(AdapterCutter, a1_args, a2_args)
        elif op == 'C' and (options.cut or options.cut2):
            modifiers.add_modifier_pair(UnconditionalCutter,
                dict(lengths=options.cut),
                dict(lengths=options.cut2)
            )
        elif op == 'G' and (options.nextseq_trim is not None):
            modifiers.add_modifier(NextseqQualityTrimmer,
                read=1, cutoff=options.nextseq_trim, base=options.quality_base)
        elif op == 'Q' and options.quality_cutoff:
            modifiers.add_modifier(QualityTrimmer,
                cutoff_front=options.quality_cutoff[0],
                cutoff_back=options.quality_cutoff[1],
                base=options.quality_base)

    # options.bisulfite is either a preset name (str) or an indexable whose
    # first (and optional second) element is a dict of MinCutter keyword
    # arguments for read 1 (and read 2).
    if options.bisulfite:
        if isinstance(options.bisulfite, str):
            if "non-directional" in options.bisulfite:
                modifiers.add_modifier(NonDirectionalBisulfiteTrimmer,
                    rrbs=options.bisulfite=="non-directional-rrbs")
            elif options.bisulfite == "rrbs":
                modifiers.add_modifier(RRBSTrimmer)
            elif options.bisulfite in ("epignome", "truseq"):
                # Trimming leads to worse results
                #modifiers.add_modifier(TruSeqBisulfiteTrimmer)
                pass
            elif options.bisulfite == "swift":
                modifiers.add_modifier(SwiftBisulfiteTrimmer)
        else:
            if options.bisulfite[0]:
                modifiers.add_modifier(MinCutter, read=1, **(options.bisulfite[0]))
            if len(options.bisulfite) > 1 and options.bisulfite[1]:
                modifiers.add_modifier(MinCutter, read=2, **(options.bisulfite[1]))

    # The remaining modifiers are always registered in this fixed order,
    # after the op_order-driven ones above.
    if options.trim_n:
        modifiers.add_modifier(NEndTrimmer)

    if options.cut_min or options.cut_min2:
        modifiers.add_modifier_pair(MinCutter,
            dict(lengths=options.cut_min),
            dict(lengths=options.cut_min2)
        )

    if options.length_tag:
        modifiers.add_modifier(LengthTagModifier, length_tag=options.length_tag)

    if options.strip_suffix:
        modifiers.add_modifier(SuffixRemover, suffixes=options.strip_suffix)

    if options.prefix or options.suffix:
        modifiers.add_modifier(PrefixSuffixAdder, prefix=options.prefix, suffix=options.suffix)

    if options.double_encode:
        modifiers.add_modifier(DoubleEncoder)

    if options.zero_cap and qualities:
        modifiers.add_modifier(ZeroCapper, quality_base=options.quality_base)

    if options.trim_primer:
        modifiers.add_modifier(PrimerTrimmer)

    if options.merge_overlapping:
        modifiers.add_modifier(MergeOverlapping,
            min_overlap=options.merge_min_overlap,
            error_rate=options.merge_error_rate,
            mismatch_action=options.correct_mismatches)

    # Create Filters and Formatters

    # NOTE(review): min_affected is presumably the number of reads in a pair
    # that must match a filter -- confirm against FilterFactory.
    min_affected = 2 if options.pair_filter == 'both' else 1
    filters = Filters(FilterFactory(options.paired, min_affected))

    # An interleaved output takes precedence over the separate
    # output / paired_output destinations.
    output1 = output2 = None
    interleaved = False
    if options.interleaved_output:
        output1 = options.interleaved_output
        interleaved = True
    else:
        output1 = options.output
        output2 = options.paired_output

    seq_formatter_args = dict(
        qualities=qualities,
        colorspace=options.colorspace,
        interleaved=interleaved
    )
    formatters = Formatters(output1, seq_formatter_args)
    # Output paths collected here are handed to Writers at the end.
    # NOTE(review): presumably files that must be created even if nothing is
    # written to them -- confirm against Writers.
    force_create = []

    if options.merge_overlapping:
        filters.add_filter(MergedReadFilter)
        if options.merged_output:
            formatters.add_seq_formatter(MergedReadFilter, options.merged_output)

    if options.minimum_length is not None and options.minimum_length > 0:
        filters.add_filter(TooShortReadFilter, options.minimum_length)
        if options.too_short_output:
            formatters.add_seq_formatter(TooShortReadFilter,
                options.too_short_output, options.too_short_paired_output)

    # sys.maxsize is the value that means "no maximum length" (see the
    # sanity check above).
    if options.maximum_length < sys.maxsize:
        filters.add_filter(TooLongReadFilter, options.maximum_length)
        if options.too_long_output is not None:
            formatters.add_seq_formatter(TooLongReadFilter,
                options.too_long_output, options.too_long_paired_output)

    if options.max_n is not None:
        filters.add_filter(NContentFilter, options.max_n)

    if options.discard_trimmed:
        filters.add_filter(TrimmedFilter)

    # NOTE(review): formatters.multiplexed is presumably true when output1 is
    # a template containing a {name} placeholder (see output1.format below)
    # -- confirm against Formatters.
    if not formatters.multiplexed:
        if output1 is not None:
            formatters.add_seq_formatter(NoFilter, output1, output2)
            if output1 != STDOUT and options.writer_process:
                force_create.append(output1)
                if output2 is not None:
                    force_create.append(output2)
        elif not (options.discard_trimmed and options.untrimmed_output):
            formatters.add_seq_formatter(NoFilter, default_outfile)
            if default_outfile != STDOUT and options.writer_process:
                force_create.append(default_outfile)

    if options.discard_untrimmed or options.untrimmed_output:
        filters.add_filter(UntrimmedFilter)

    if not options.discard_untrimmed:
        if formatters.multiplexed:
            # Untrimmed reads go to the explicit untrimmed output, or else to
            # the output template filled in with name='unknown'.
            untrimmed = options.untrimmed_output or output1.format(name='unknown')
            formatters.add_seq_formatter(UntrimmedFilter, untrimmed)
            formatters.add_seq_formatter(NoFilter, untrimmed)
        elif options.untrimmed_output:
            formatters.add_seq_formatter(UntrimmedFilter,
                options.untrimmed_output, options.untrimmed_paired_output)

    if options.rest_file:
        formatters.add_info_formatter(RestFormatter(options.rest_file))
    if options.info_file:
        formatters.add_info_formatter(InfoFormatter(options.info_file))
    if options.wildcard_file:
        formatters.add_info_formatter(WildcardFormatter(options.wildcard_file))

    writers = Writers(force_create)

    return AtroposParams(reader, modifiers, filters, formatters, writers)
from argparse import ArgumentParser
from atropos.modifiers import AdapterCutter, InsertAdapterCutter
from atropos.adapters import AdapterParser
from atropos.seqio import open_reader

# The two adapters used by every aligner built in this script.
adapter_parser = AdapterParser()
A1 = adapter_parser.parse(
    'AGATCGGAAGAGCACACGTCTGAACTCCAGTCACACAGTGATCTCGTATGCCGTCTTCTGCTTG')
A2 = adapter_parser.parse(
    'AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT')


def create_adapter():
    """Build a pairwise trimmer from two independent ``AdapterCutter`` objects.

    Returns:
        A callable taking ``(read1, read2)`` and returning the tuple
        ``(cutter_for_A1(read1), cutter_for_A2(read2))``.
    """
    cutter1 = AdapterCutter([A1])
    cutter2 = AdapterCutter([A2])

    def trim_pair(read1, read2):
        return (cutter1(read1), cutter2(read2))

    return trim_pair


def create_insert():
    """Build an ``InsertAdapterCutter`` over the ``A1``/``A2`` adapter pair."""
    insert_cutter = InsertAdapterCutter(A1, A2)
    return insert_cutter


def time_process(reader, aligner):
    """Time one full pass of ``aligner`` over ``reader``.

    Delegates the actual loop to ``run_process`` and prints the pair count it
    reports together with the elapsed wall-clock time.

    Args:
        reader: Source of read pairs, passed through to ``run_process``.
        aligner: Callable applied to each pair, passed through to
            ``run_process``.

    Returns:
        Elapsed time in seconds, as a float.
    """
    from datetime import datetime
    began = datetime.now()
    n_pairs = run_process(reader, aligner)
    elapsed = datetime.now() - began
    print("Time to process {} read pairs: {}".format(n_pairs, elapsed))
    return elapsed.total_seconds()