def __call__(self):
    kmer_size = self.kmer_size or 12
    n_reads = self.max_reads
    overrep_cutoff = 100
    include = self.include_contaminants or "all"
    known_contaminants = None
    if include != 'unknown':
        known_contaminants = self.load_known_adapters()
    detector = self.detector
    if not detector:
        if known_contaminants and include == 'known':
            detector = 'known'
        elif n_reads <= 50000:
            detector = 'heuristic'
        else:
            detector = 'khmer'
    detector_args = dict(known_contaminants=known_contaminants)
    if detector == 'known':
        logging.getLogger().debug(
            "Detecting contaminants using the known-only algorithm")
        detector_class = KnownContaminantDetector
        detector_args['min_kmer_match_frac'] = self.min_kmer_match_frac
    elif detector == 'heuristic':
        logging.getLogger().debug(
            "Detecting contaminants using the heuristic algorithm")
        detector_class = HeuristicDetector
        detector_args['min_frequency'] = self.min_frequency
        detector_args['min_contaminant_match_frac'] = \
            self.min_contaminant_match_frac
    elif detector == 'khmer':
        logging.getLogger().debug(
            "Detecting contaminants using the kmer-based algorithm")
        detector_class = KhmerDetector
    summary_args = dict(
        kmer_size=kmer_size, n_reads=n_reads, overrep_cutoff=overrep_cutoff,
        include=include, past_end_bases=self.past_end_bases)
    detector_args.update(summary_args)
    if self.paired:
        detector = PairedDetector(detector_class, **detector_args)
    else:
        detector = detector_class(**detector_args)
    self.summary['detect'] = summary_args
    if known_contaminants:
        self.summary['detect']['known_contaminants'] = \
            known_contaminants.summarize()
    logging.getLogger().info(
        "Detecting adapters and other potential contaminant "
        "sequences based on %d-mers in %d reads", kmer_size, n_reads)
    # Currently only single-threaded operation is supported.
    self.summary.update(mode='serial', threads=1)
    return run_interruptible(detector, self, raise_on_error=True)
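# The detector above is chosen by read count: the heuristic detector for
# small inputs, the khmer (k-mer counting) detector otherwise. As a rough
# illustration of the k-mer counting idea only, here is a self-contained
# sketch that is NOT the atropos detector API; plain-string `reads` and the
# uniform-background expectation are assumptions made for this example.
from collections import Counter

def _sketch_overrepresented_kmers(reads, kmer_size=12, overrep_cutoff=100):
    """Return k-mers whose count exceeds `overrep_cutoff` times the count
    expected under a uniform background distribution."""
    counts = Counter()
    total = 0
    for read in reads:
        for i in range(len(read) - kmer_size + 1):
            counts[read[i:i + kmer_size]] += 1
            total += 1
    if total == 0:
        return []
    expected = total / (4 ** kmer_size)  # uniform expectation per k-mer
    return [kmer for kmer, n in counts.items()
            if n > overrep_cutoff * expected]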
def run(self):
    """Run the pipeline.

    Returns:
        The return code.
    """
    retcode = run_interruptible(self)
    self.terminate(retcode)
    return retcode
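# `run_interruptible` is an atropos internal. As a hedged sketch of the
# general pattern such a wrapper implements (run a callable, map interrupts
# and failures to small integer return codes), one might write the
# following; the specific codes here are assumptions for illustration, not
# atropos' actual values.
import logging

def _sketch_run_interruptible(target, *args):
    """Run `target`, returning 0 on success, 1 if interrupted, 2 on error."""
    try:
        target(*args)
        return 0
    except KeyboardInterrupt:
        logging.getLogger().error("Interrupted")
        return 1
    except Exception:
        logging.getLogger().exception("Pipeline failed")
        return 2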
def __call__(self):
    if self.paired:
        pipeline_class = PairedEndQcPipeline
    else:
        pipeline_class = SingleEndQcPipeline
    pipeline_args = dict(
        qualities=self.delivers_qualities,
        quality_base=self.quality_base)
    if self.stats:
        pipeline_args.update(self.stats)
    if self.threads is None:
        self.summary.update(mode='serial', threads=1)
        pipeline = pipeline_class(**pipeline_args)
        return run_interruptible(pipeline, self)
    else:
        self.summary.update(mode='parallel', threads=self.threads)
        return self.run_parallel(pipeline_class, pipeline_args)
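# The QC pipelines accumulate per-read statistics. A minimal standalone
# sketch of one such statistic, mean Phred quality per position; plain
# integer-score lists and equal-length reads are assumptions made for this
# example, not the atropos pipeline interface.
def _sketch_mean_quality_by_position(reads_qualities):
    """Column-wise mean quality across equal-length quality-score lists."""
    if not reads_qualities:
        return []
    n_positions = len(reads_qualities[0])
    sums = [0] * n_positions
    for quals in reads_qualities:
        for i, qual in enumerate(quals):
            sums[i] += qual
    return [total / len(reads_qualities) for total in sums]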
def __call__(self):
    if not self.delivers_qualities:
        raise ValueError(
            "Cannot estimate error rate without base qualities")
    if self.algorithm == 'quality':
        estimator_class = BaseQualityErrorEstimator
    elif self.algorithm == 'shadow':
        estimator_class = ShadowRegressionErrorEstimator
    estimator_args = dict(max_read_len=self.max_bases)
    if self.paired:
        estimator = PairedErrorEstimator(
            estimator_class=estimator_class, **estimator_args)
    else:
        estimator = estimator_class(**estimator_args)
    self.summary['errorrate'] = estimator_args
    # Currently only single-threaded operation is supported.
    self.summary.update(mode='serial', threads=1)
    return run_interruptible(estimator, self, raise_on_error=True)
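# The quality-based estimator works from the Phred relationship: a base
# with quality Q has error probability 10 ** (-Q / 10). A minimal sketch of
# that calculation (integer Phred scores as input are an assumption for
# illustration; the real estimator is more involved):
def _sketch_error_rate_from_qualities(qualities):
    """Mean per-base error probability implied by Phred quality scores."""
    if not qualities:
        return 0.0
    return sum(10 ** (-qual / 10) for qual in qualities) / len(qualities)

# For example, _sketch_error_rate_from_qualities([30, 30, 20]) ~= 0.004.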
def run_parallel(reader, pipeline, formatters, writers, threads=2, timeout=30,
                 preserve_order=False, input_queue_size=0, result_queue_size=0,
                 use_writer_process=True, compression=None):
    """Execute atropos in parallel mode.

    reader :: iterator over batches of reads (most likely a BatchIterator)
    pipeline :: the pipeline to execute on each batch
    formatters :: formatters for output records
    writers :: writers for output files
    threads :: number of worker threads to use; additional threads are used
        for the main process and the writer process (if requested).
    timeout :: number of seconds after which waiting processes escalate their
        messages from DEBUG to ERROR.
    preserve_order :: whether to preserve the input order of reads when
        writing (only valid when `use_writer_process=True`)
    input_queue_size :: max number of items that can be in the input queue,
        or 0 for no limit (be warned that this could explode memory usage)
    result_queue_size :: max number of items that can be in the result queue,
        or 0 for no limit (be warned that this could explode memory usage)
    use_writer_process :: if True, a separate thread will be used to write
        results to disk. Otherwise, each worker thread will write its results
        to an output file with a '.N' extension, where N is the thread index.
        This is useful in cases where the I/O is the main bottleneck.
    compression :: if "writer", the writer process performs data compression;
        otherwise, the worker processes perform compression.
    """
    logging.getLogger().debug(
        "Starting atropos in parallel mode with threads={}, timeout={}".format(
            threads, timeout))

    if threads < 2:
        raise ValueError("'threads' must be >= 2")

    # Reserve a thread for the writer process if it will be doing the
    # compression and if one is available.
    if compression is None:
        compression = (
            "writer" if use_writer_process and can_use_system_compression()
            else "worker")
    if compression == "writer" and threads > 2:
        threads -= 1

    timeout = max(timeout, RETRY_INTERVAL)

    # Queue by which batches of reads are sent to worker processes
    input_queue = Queue(input_queue_size)
    # Queue by which results are sent from the worker processes to the
    # writer process
    result_queue = Queue(result_queue_size)
    # Queue for processes to send summary information back to the main
    # process
    summary_queue = Queue(threads)
    # Aggregate summary
    summary = Summary(trimmer_classes=pipeline.modifiers.get_trimmer_classes())

    if use_writer_process:
        worker_result_handler = QueueResultHandler(result_queue)
        if compression == "writer":
            worker_result_handler = WorkerResultHandler(worker_result_handler)
        else:
            worker_result_handler = CompressingWorkerResultHandler(
                worker_result_handler)
        # Shared variable for communicating with the writer thread
        writer_control = Control(CONTROL_ACTIVE)
        # Result handler
        if preserve_order:
            writer_result_handler = OrderPreservingWriterResultHandler(
                writers, compressed=compression == "worker")
        else:
            writer_result_handler = WriterResultHandler(
                writers, compressed=compression == "worker")
        # Writer process
        writer_process = ResultProcess(
            writer_result_handler, result_queue, writer_control, timeout)
        writer_process.start()
    else:
        worker_result_handler = WorkerResultHandler(
            WriterResultHandler(writers, use_suffix=True))

    # Start worker processes; reserve a thread for the reader process,
    # which we will get back after it completes.
    worker_args = (
        input_queue, summary_queue, timeout, worker_result_handler, pipeline,
        formatters)
    worker_processes = launch_workers(
        threads - 1, TrimWorkerProcess, worker_args)

    def ensure_alive():
        ensure_processes(worker_processes)
        if (use_writer_process and not (
                writer_process.is_alive() and
                writer_control.check_value(CONTROL_ACTIVE))):
            raise Exception("Writer process exited")

    def _run(worker_processes):
        # Add batches of reads to the input queue. Provide a timeout callback
        # to check that subprocesses are alive.
        num_batches = enqueue_all(
            enumerate(reader, 1), input_queue, timeout, ensure_alive)
        logging.getLogger().debug(
            "Main loop complete; saw {} batches".format(num_batches))

        # Tell the worker processes no more input is coming
        enqueue_all((None,) * threads, input_queue, timeout, ensure_alive)

        # Tell the writer thread the max number of batches to expect
        if use_writer_process:
            writer_control.set_value(num_batches)

        # Now that the reader process is done, it essentially frees up
        # another thread to use for a worker.
        worker_processes.extend(launch_workers(
            1, TrimWorkerProcess, worker_args, offset=threads - 1))

        # Wait for all summaries to be available on the queue
        def summary_timeout_callback():
            try:
                ensure_processes(
                    worker_processes,
                    "Workers are still alive and haven't returned "
                    "summaries: {}",
                    alive=False)
            except Exception as err:
                logging.getLogger().error(err)

        wait_on(
            lambda: summary_queue.full(),
            wait_message="Waiting on worker summaries {}",
            timeout=timeout,
            wait=True,
            timeout_callback=summary_timeout_callback)

        # Process summary information from worker processes
        logging.getLogger().debug(
            "Processing summary information from worker processes")
        seen_summaries = set()
        seen_batches = set()

        def summary_fail_callback():
            missing_summaries = set(range(1, threads)) - seen_summaries
            raise Exception("Missing summaries from processes {}".format(
                ",".join(str(s) for s in missing_summaries)))

        for i in range(1, threads + 1):
            batch = dequeue(summary_queue, fail_callback=summary_fail_callback)
            worker_index, worker_batches, process_stats, adapter_stats = batch
            if process_stats is None or adapter_stats is None:
                raise Exception(
                    "Worker process {} died unexpectedly".format(worker_index))
            else:
                logging.getLogger().debug(
                    "Processing summary for worker {}".format(worker_index))
            seen_summaries.add(worker_index)
            seen_batches |= worker_batches
            summary.add_process_stats(process_stats)
            summary.add_adapter_stats(adapter_stats)

        # Check if any batches were missed
        if num_batches > 0:
            missing_batches = set(range(1, num_batches + 1)) - seen_batches
            if len(missing_batches) > 0:
                raise Exception("Workers did not process batches {}".format(
                    ",".join(str(b) for b in missing_batches)))

        if use_writer_process:
            # Wait for the writer to complete
            wait_on_process(writer_process, timeout)

    try:
        rc = run_interruptible(_run, worker_processes)
    finally:
        # Notify all threads that they should stop
        logging.getLogger().debug("Exiting all processes")

        def kill(process):
            if rc <= 1:
                wait_on_process(process, timeout, terminate=True)
            elif process.is_alive():
                process.terminate()

        for process in worker_processes:
            kill(process)
        if use_writer_process:
            kill(writer_process)

    report = summary.finish() if rc == 0 else None
    details = dict(mode='parallel', threads=threads)
    return (rc, report, details)
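# run_parallel wires a reader -> workers -> writer topology over queues. A
# self-contained sketch of that topology using only the standard library;
# every name below is illustrative and none of it is the atropos API. The
# import alias avoids shadowing the Queue used above.
from multiprocessing import Process
from multiprocessing import Queue as MpQueue

def _sketch_worker(input_queue, result_queue):
    """Consume batches until a None sentinel arrives, then signal done."""
    while True:
        batch = input_queue.get()
        if batch is None:
            break
        result_queue.put([item.upper() for item in batch])
    result_queue.put(None)  # per-worker completion sentinel

def _sketch_parallel(batches, n_workers=2):
    """Fan batches out to workers and drain results in the main process."""
    input_queue, result_queue = MpQueue(), MpQueue()
    workers = [
        Process(target=_sketch_worker, args=(input_queue, result_queue))
        for _ in range(n_workers)]
    for worker in workers:
        worker.start()
    for batch in batches:
        input_queue.put(batch)
    for _ in workers:
        input_queue.put(None)  # one stop sentinel per worker
    results, done = [], 0
    while done < n_workers:  # the "writer" role: drain until all finish
        result = result_queue.get()
        if result is None:
            done += 1
        else:
            results.append(result)
    for worker in workers:
        worker.join()
    return results

# e.g. _sketch_parallel([["a", "b"], ["c"]]) -> [["A", "B"], ["C"]]
# (result order may vary across workers)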
def __call__(self):
    options = self.options
    match_probability = RandomMatchProbability()

    # Create Adapters
    has_adapters1 = options.adapters or options.anywhere or options.front
    has_adapters2 = options.adapters2 or options.anywhere2 or options.front2
    adapters1 = adapters2 = []
    if has_adapters1 or has_adapters2:
        adapter_cache = super().load_known_adapters()
        parser_args = dict(
            colorspace=options.colorspace,
            max_error_rate=options.error_rate,
            min_overlap=options.overlap,
            read_wildcards=options.match_read_wildcards,
            adapter_wildcards=options.match_adapter_wildcards,
            indels=options.indels,
            indel_cost=options.indel_cost,
            cache=adapter_cache,
            gc_content=options.gc_content,
            match_probability=match_probability,
            alphabet=options.alphabet)
        if options.adapter_max_rmp:
            parser_args['max_rmp'] = options.adapter_max_rmp
        adapter_parser = AdapterParser(**parser_args)
        if has_adapters1:
            adapters1 = adapter_parser.parse_multi(
                options.adapters, options.anywhere, options.front)
        if has_adapters2:
            adapters2 = adapter_parser.parse_multi(
                options.adapters2, options.anywhere2, options.front2)
        if options.cache_adapters:
            adapter_cache.save()

    # Create Modifiers
    # TODO: can this be replaced with an argparse required group?
    if (not adapters1 and not adapters2 and not options.quality_cutoff and
            options.nextseq_trim is None and
            options.cut == [] and options.cut2 == [] and
            options.cut_min == [] and options.cut_min2 == [] and
            (options.minimum_length is None or
             options.minimum_length <= 0) and
            options.maximum_length == sys.maxsize and
            not options.trim_n and not self.has_qualfile and
            options.max_n is None and
            (not options.paired or options.overwrite_low_quality is None)):
        raise ValueError(
            "You need to provide at least one adapter sequence.")

    if (options.aligner == 'insert' and any(
            not a or len(a) != 1 or a[0].where != BACK
            for a in (adapters1, adapters2))):
        raise ValueError(
            "Insert aligner requires a single 3' adapter for each read")

    if options.debug:
        for adapter in adapters1 + adapters2:
            adapter.enable_debug()

    if options.paired:
        modifiers = PairedEndModifiers(options.paired)
    else:
        modifiers = SingleEndModifiers()

    for oper in options.op_order:
        if oper == 'W' and options.overwrite_low_quality:
            lowq, highq, window = options.overwrite_low_quality
            modifiers.add_modifier(
                OverwriteRead,
                worse_read_min_quality=lowq,
                better_read_min_quality=highq,
                window_size=window,
                base=options.quality_base)
        elif oper == 'A' and (adapters1 or adapters2):
            # TODO: generalize this using some kind of factory class
            if options.aligner == 'insert':
                # Use different base probabilities if we're trimming
                # bisulfite data.
                # TODO: this doesn't seem to help things, so commenting it
                # out for now
                #if options.bisulfite:
                #    base_probs = dict(match_prob=0.33, mismatch_prob=0.67)
                #else:
                #    base_probs = dict(match_prob=0.25, mismatch_prob=0.75)
                modifiers.add_modifier(
                    InsertAdapterCutter,
                    adapter1=adapters1[0],
                    adapter2=adapters2[0],
                    action=options.action,
                    mismatch_action=options.correct_mismatches,
                    max_insert_mismatch_frac=options.insert_match_error_rate,
                    max_adapter_mismatch_frac=(
                        options.insert_match_adapter_error_rate),
                    match_probability=match_probability,
                    insert_max_rmp=options.insert_max_rmp,
                    read_wildcards=options.match_read_wildcards,
                    adapter_wildcards=options.match_adapter_wildcards)
            else:
                a1_args = dict(
                    adapters=adapters1,
                    times=options.times,
                    action=options.action) if adapters1 else None
                a2_args = dict(
                    adapters=adapters2,
                    times=options.times,
                    action=options.action) if adapters2 else None
                modifiers.add_modifier_pair(AdapterCutter, a1_args, a2_args)
        elif oper == 'C' and (options.cut or options.cut2):
            modifiers.add_modifier_pair(
                UnconditionalCutter,
                dict(lengths=options.cut),
                dict(lengths=options.cut2))
        elif oper == 'G' and (options.nextseq_trim is not None):
            modifiers.add_modifier(
                NextseqQualityTrimmer,
                cutoff=options.nextseq_trim,
                base=options.quality_base)
        elif oper == 'Q' and options.quality_cutoff:
            modifiers.add_modifier(
                QualityTrimmer,
                cutoff_front=options.quality_cutoff[0],
                cutoff_back=options.quality_cutoff[1],
                base=options.quality_base)

    if options.bisulfite:
        if isinstance(options.bisulfite, str):
            if "non-directional" in options.bisulfite:
                modifiers.add_modifier(
                    NonDirectionalBisulfiteTrimmer,
                    rrbs=options.bisulfite == "non-directional-rrbs")
            elif options.bisulfite == "rrbs":
                modifiers.add_modifier(RRBSTrimmer)
            elif options.bisulfite in ("epignome", "truseq"):
                # Trimming leads to worse results
                #modifiers.add_modifier(TruSeqBisulfiteTrimmer)
                pass
            elif options.bisulfite == "swift":
                modifiers.add_modifier(SwiftBisulfiteTrimmer)
        else:
            if options.bisulfite[0]:
                modifiers.add_modifier(
                    MinCutter, read=1, **(options.bisulfite[0]))
            if len(options.bisulfite) > 1 and options.bisulfite[1]:
                modifiers.add_modifier(
                    MinCutter, read=2, **(options.bisulfite[1]))

    if options.trim_n:
        modifiers.add_modifier(NEndTrimmer)
    if options.cut_min or options.cut_min2:
        modifiers.add_modifier_pair(
            MinCutter,
            dict(lengths=options.cut_min),
            dict(lengths=options.cut_min2))
    if options.length_tag:
        modifiers.add_modifier(
            LengthTagModifier, length_tag=options.length_tag)
    if options.strip_suffix:
        modifiers.add_modifier(SuffixRemover, suffixes=options.strip_suffix)
    if options.prefix or options.suffix:
        modifiers.add_modifier(
            PrefixSuffixAdder, prefix=options.prefix, suffix=options.suffix)
    if options.double_encode:
        modifiers.add_modifier(DoubleEncoder)
    if options.zero_cap and self.delivers_qualities:
        modifiers.add_modifier(ZeroCapper, quality_base=options.quality_base)
    if options.trim_primer:
        modifiers.add_modifier(PrimerTrimmer)
    if options.merge_overlapping:
        modifiers.add_modifier(
            MergeOverlapping,
            min_overlap=options.merge_min_overlap,
            error_rate=options.merge_error_rate,
            mismatch_action=options.correct_mismatches)

    # Create Filters and Formatters
    min_affected = 2 if options.pair_filter == 'both' else 1
    filters = Filters(FilterFactory(options.paired, min_affected))

    output1 = output2 = None
    interleaved = False
    if options.interleaved_output:
        output1 = options.interleaved_output
        interleaved = True
    else:
        output1 = options.output
        output2 = options.paired_output

    seq_formatter_args = dict(
        qualities=self.delivers_qualities,
        colorspace=options.colorspace,
        interleaved=interleaved)
    formatters = Formatters(output1, seq_formatter_args)
    force_create = []

    if options.merge_overlapping:
        filters.add_filter(MergedReadFilter)
        if options.merged_output:
            formatters.add_seq_formatter(
                MergedReadFilter, options.merged_output)

    if options.minimum_length is not None and options.minimum_length > 0:
        filters.add_filter(TooShortReadFilter, options.minimum_length)
        if options.too_short_output:
            formatters.add_seq_formatter(
                TooShortReadFilter,
                options.too_short_output, options.too_short_paired_output)

    if options.maximum_length < sys.maxsize:
        filters.add_filter(TooLongReadFilter, options.maximum_length)
        if options.too_long_output is not None:
            formatters.add_seq_formatter(
                TooLongReadFilter,
                options.too_long_output, options.too_long_paired_output)

    if options.max_n is not None:
        filters.add_filter(NContentFilter, options.max_n)

    if options.discard_trimmed:
        filters.add_filter(TrimmedFilter)

    if not formatters.multiplexed:
        if output1 is not None:
            formatters.add_seq_formatter(NoFilter, output1, output2)
            if output1 != STDOUT and options.writer_process:
                force_create.append(output1)
                if output2 is not None:
                    force_create.append(output2)
        elif not (options.discard_trimmed and options.untrimmed_output):
            formatters.add_seq_formatter(NoFilter, options.default_outfile)
            if options.default_outfile != STDOUT and options.writer_process:
                force_create.append(options.default_outfile)

    if options.discard_untrimmed or options.untrimmed_output:
        filters.add_filter(UntrimmedFilter)

    if not options.discard_untrimmed:
        if formatters.multiplexed:
            untrimmed = options.untrimmed_output or output1.format(
                name='unknown')
            formatters.add_seq_formatter(UntrimmedFilter, untrimmed)
            formatters.add_seq_formatter(NoFilter, untrimmed)
        elif options.untrimmed_output:
            formatters.add_seq_formatter(
                UntrimmedFilter,
                options.untrimmed_output, options.untrimmed_paired_output)

    if options.rest_file:
        formatters.add_info_formatter(RestFormatter(options.rest_file))
    if options.info_file:
        formatters.add_info_formatter(InfoFormatter(options.info_file))
    if options.wildcard_file:
        formatters.add_info_formatter(
            WildcardFormatter(options.wildcard_file))

    if options.paired:
        mixin_class = PairedEndPipelineMixin
    else:
        mixin_class = SingleEndPipelineMixin
    writers = Writers(force_create)
    record_handler = RecordHandler(modifiers, filters, formatters)
    if options.stats:
        record_handler = StatsRecordHandlerWrapper(
            record_handler, options.paired, options.stats,
            qualities=self.delivers_qualities,
            quality_base=self.quality_base)

    logger = logging.getLogger()
    num_adapters = sum(len(a) for a in modifiers.get_adapters())
    logger.info(
        "Trimming %s adapter%s with at most %.1f%% errors in %s mode ...",
        num_adapters, 's' if num_adapters > 1 else '',
        options.error_rate * 100,
        {
            False: 'single-end',
            'first': 'paired-end legacy',
            'both': 'paired-end'
        }[options.paired])
    if (options.paired == 'first' and (
            len(record_handler.modifiers.get_modifiers(read=2)) > 0 or
            options.quality_cutoff)):
        logger.warning('\n'.join(textwrap.wrap(
            'Requested read modifications are applied only to the '
            'first read since backwards compatibility mode is enabled. '
            'To modify both reads, also use any of the -A/-B/-G/-U '
            'options. Use a dummy adapter sequence when necessary: '
            '-A XXX')))

    if options.threads is None:
        # Run single-threaded version
        result_handler = WorkerResultHandler(WriterResultHandler(writers))
        pipeline_class = type(
            'TrimPipelineImpl', (mixin_class, TrimPipeline), {})
        pipeline = pipeline_class(record_handler, result_handler)
        self.summary.update(mode='serial', threads=1)
        return run_interruptible(pipeline, self, raise_on_error=True)
    else:
        # Run multiprocessing version
        self.summary.update(mode='parallel', threads=options.threads)
        return self.run_parallel(record_handler, writers, mixin_class)
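# The op_order loop above assembles an ordered chain of read modifiers. As
# a rough sketch of the chain-of-modifiers pattern itself, with plain
# functions over strings standing in for the real modifier classes (the
# adapter sequence and lambdas below are illustrative assumptions):
def _sketch_apply_modifiers(read, modifiers):
    """Apply each modifier in order; each takes a read and returns a read."""
    for modify in modifiers:
        read = modify(read)
    return read

# For example, trimming trailing Ns and then an exact-match 3' adapter:
# _sketch_apply_modifiers(
#     "ACGTAGATCGGAANN",
#     [lambda read: read.rstrip("N"),
#      lambda read: read.split("AGATCGGAA")[0]])  # -> "ACGT"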