Example #1
0
 def __call__(self):
     """Assemble the trimming pipeline from ``self.options`` and run it.

     The method works through these stages, in order:

     1. Parse the read-1 and read-2 adapters (if any were given).
     2. Validate that at least one trimming/filtering option was
        requested, and that the 'insert' aligner has usable adapters.
     3. Register read modifiers: first those driven by
        ``options.op_order``, then a fixed sequence of optional ones.
     4. Register filters and the formatters that route reads to outputs.
     5. Wrap everything in a ``RecordHandler`` and run it either
        serially or through ``self.run_parallel``.

     Returns:
         Whatever ``run_interruptible`` (when ``options.threads`` is
         None) or ``self.run_parallel`` (otherwise) returns.

     Raises:
         ValueError: If none of the checked trimming/filtering options
             is set, or if ``options.aligner == 'insert'`` and either
             read does not have exactly one adapter whose ``where`` is
             ``BACK``.
     """
     options = self.options
     match_probability = RandomMatchProbability()

     # Create Adapters

     has_adapters1 = options.adapters or options.anywhere or options.front
     has_adapters2 = options.adapters2 or options.anywhere2 or options.front2

     # Both names start out bound to the same empty list. That is safe
     # here because they are only ever rebound below, never mutated.
     adapters1 = adapters2 = []
     if has_adapters1 or has_adapters2:
         adapter_cache = super().load_known_adapters()
         parser_args = dict(
             colorspace=options.colorspace,
             max_error_rate=options.error_rate,
             min_overlap=options.overlap,
             read_wildcards=options.match_read_wildcards,
             adapter_wildcards=options.match_adapter_wildcards,
             indels=options.indels, indel_cost=options.indel_cost,
             cache=adapter_cache, gc_content=options.gc_content,
             match_probability=match_probability, alphabet=options.alphabet)
         # max_rmp is only passed to the parser when the option is truthy.
         if options.adapter_max_rmp:
             parser_args['max_rmp'] = options.adapter_max_rmp
         adapter_parser = AdapterParser(**parser_args)

         if has_adapters1:
             adapters1 = adapter_parser.parse_multi(
                 options.adapters, options.anywhere, options.front)
         if has_adapters2:
             adapters2 = adapter_parser.parse_multi(
                 options.adapters2, options.anywhere2, options.front2)

         # Saved after parsing, so the cache reflects this run's lookups
         # (presumably parse_multi may add entries -- TODO confirm).
         if options.cache_adapters:
             adapter_cache.save()

     # Create Modifiers

     # Refuse to run when nothing was requested: no adapters, no quality
     # or NextSeq trimming, no cuts, no length limits, no N handling, no
     # qualfile, and (for paired input) no low-quality overwrite. Note
     # that the error message only mentions adapters even though any of
     # these options would satisfy the check.
     # TODO: can this be replaced with an argparse required group?
     if (
             not adapters1 and not adapters2 and
             not options.quality_cutoff and
             options.nextseq_trim is None and
             options.cut == [] and options.cut2 == [] and
             options.cut_min == [] and options.cut_min2 == [] and
             (
                 options.minimum_length is None or
                 options.minimum_length <= 0) and
             options.maximum_length == sys.maxsize and not options.trim_n and
             not self.has_qualfile and options.max_n is None and
             (not options.paired or options.overwrite_low_quality is None)):
         raise ValueError(
             "You need to provide at least one adapter sequence.")

     # The insert aligner is only accepted when *each* read has exactly
     # one adapter and that adapter's ``where`` equals BACK.
     if (
             options.aligner == 'insert' and any(
                 not a or len(a) != 1 or a[0].where != BACK
                 for a in (adapters1, adapters2))):
         raise ValueError(
             "Insert aligner requires a single 3' adapter for each read")

     if options.debug:
         for adapter in adapters1 + adapters2:
             adapter.enable_debug()

     if options.paired:
         modifiers = PairedEndModifiers(options.paired)
     else:
         modifiers = SingleEndModifiers()

     # Modifiers controlled by options.op_order are registered in the
     # order the codes appear there. A code is silently ignored when its
     # corresponding option is not set. Codes handled:
     #   'W' OverwriteRead, 'A' adapter cutting, 'C' UnconditionalCutter,
     #   'G' NextseqQualityTrimmer, 'Q' QualityTrimmer.
     for oper in options.op_order:
         if oper == 'W' and options.overwrite_low_quality:
             lowq, highq, window = options.overwrite_low_quality
             modifiers.add_modifier(
                 OverwriteRead,
                 worse_read_min_quality=lowq, better_read_min_quality=highq,
                 window_size=window, base=options.quality_base)

         elif oper == 'A' and (adapters1 or adapters2):
             # TODO: generalize this using some kind of factory class
             if options.aligner == 'insert':
                 # Indexing [0] is safe: the validation above guarantees
                 # one adapter per read when the insert aligner is used.
                 # Use different base probabilities if we're trimming
                 # bisulfite data.
                 # TODO: this doesn't seem to help things, so commenting it
                 # out for now
                 #if options.bisulfite:
                 #   base_probs = dict(match_prob=0.33, mismatch_prob=0.67)
                 # else:
                 #   base_probs = dict(match_prob=0.25, mismatch_prob=0.75)
                 modifiers.add_modifier(
                     InsertAdapterCutter,
                     adapter1=adapters1[0], adapter2=adapters2[0],
                     action=options.action,
                     mismatch_action=options.correct_mismatches,
                     max_insert_mismatch_frac=\
                         options.insert_match_error_rate,
                     max_adapter_mismatch_frac=\
                         options.insert_match_adapter_error_rate,
                     match_probability=match_probability,
                     insert_max_rmp=options.insert_max_rmp,
                     read_wildcards=options.match_read_wildcards,
                     adapter_wildcards=options.match_adapter_wildcards)
             else:
                 # A read without adapters gets None instead of an
                 # argument dict.
                 a1_args = dict(
                     adapters=adapters1,
                     times=options.times,
                     action=options.action) if adapters1 else None
                 a2_args = dict(
                     adapters=adapters2,
                     times=options.times,
                     action=options.action) if adapters2 else None
                 modifiers.add_modifier_pair(AdapterCutter, a1_args, a2_args)
         elif oper == 'C' and (options.cut or options.cut2):
             modifiers.add_modifier_pair(
                 UnconditionalCutter,
                 dict(lengths=options.cut),
                 dict(lengths=options.cut2))
         elif oper == 'G' and (options.nextseq_trim is not None):
             modifiers.add_modifier(
                 NextseqQualityTrimmer,
                 cutoff=options.nextseq_trim,
                 base=options.quality_base)
         elif oper == 'Q' and options.quality_cutoff:
             modifiers.add_modifier(
                 QualityTrimmer,
                 cutoff_front=options.quality_cutoff[0],
                 cutoff_back=options.quality_cutoff[1],
                 base=options.quality_base)

     # The remaining modifiers are not governed by op_order; they are
     # always registered after the ones above, in the order written here.

     # options.bisulfite is either a string naming a protocol, or an
     # indexable whose entries are unpacked as keyword arguments for
     # MinCutter (entry 0 for read 1, entry 1 for read 2).
     if options.bisulfite:
         if isinstance(options.bisulfite, str):
             if "non-directional" in options.bisulfite:
                 modifiers.add_modifier(
                     NonDirectionalBisulfiteTrimmer,
                     rrbs=options.bisulfite=="non-directional-rrbs")
             elif options.bisulfite == "rrbs":
                 modifiers.add_modifier(RRBSTrimmer)
             elif options.bisulfite in ("epignome", "truseq"):
                 # Trimming leads to worse results
                 #modifiers.add_modifier(TruSeqBisulfiteTrimmer)
                 pass
             elif options.bisulfite == "swift":
                 modifiers.add_modifier(SwiftBisulfiteTrimmer)
         else:
             if options.bisulfite[0]:
                 modifiers.add_modifier(
                     MinCutter, read=1, **(options.bisulfite[0]))
             if len(options.bisulfite) > 1 and options.bisulfite[1]:
                 modifiers.add_modifier(
                     MinCutter, read=2, **(options.bisulfite[1]))

     if options.trim_n:
         modifiers.add_modifier(NEndTrimmer)

     if options.cut_min or options.cut_min2:
         modifiers.add_modifier_pair(
             MinCutter,
             dict(lengths=options.cut_min),
             dict(lengths=options.cut_min2))

     if options.length_tag:
         modifiers.add_modifier(
             LengthTagModifier, length_tag=options.length_tag)

     if options.strip_suffix:
         modifiers.add_modifier(SuffixRemover, suffixes=options.strip_suffix)

     if options.prefix or options.suffix:
         modifiers.add_modifier(
             PrefixSuffixAdder, prefix=options.prefix, suffix=options.suffix)

     if options.double_encode:
         modifiers.add_modifier(DoubleEncoder)

     # Zero-capping is skipped when the input delivers no qualities.
     if options.zero_cap and self.delivers_qualities:
         modifiers.add_modifier(
             ZeroCapper, quality_base=options.quality_base)

     if options.trim_primer:
         modifiers.add_modifier(PrimerTrimmer)

     if options.merge_overlapping:
         modifiers.add_modifier(
             MergeOverlapping,
             min_overlap=options.merge_min_overlap,
             error_rate=options.merge_error_rate,
             mismatch_action=options.correct_mismatches)

     # Create Filters and Formatters

     # FilterFactory receives 2 for pair_filter == 'both' and 1 otherwise
     # (presumably the number of reads in a pair that must match for the
     # pair to be filtered -- TODO confirm against FilterFactory).
     min_affected = 2 if options.pair_filter == 'both' else 1
     filters = Filters(FilterFactory(options.paired, min_affected))

     # An interleaved output takes precedence over the separate
     # output/paired_output options; in that case output2 stays None.
     output1 = output2 = None
     interleaved = False
     if options.interleaved_output:
         output1 = options.interleaved_output
         interleaved = True
     else:
         output1 = options.output
         output2 = options.paired_output

     seq_formatter_args = dict(
         qualities=self.delivers_qualities,
         colorspace=options.colorspace,
         interleaved=interleaved
     )
     formatters = Formatters(output1, seq_formatter_args)
     # Output paths collected here are handed to Writers further down.
     force_create = []

     if options.merge_overlapping:
         filters.add_filter(MergedReadFilter)
         if options.merged_output:
             formatters.add_seq_formatter(
                 MergedReadFilter, options.merged_output)

     if options.minimum_length is not None and options.minimum_length > 0:
         filters.add_filter(TooShortReadFilter, options.minimum_length)
         if options.too_short_output:
             formatters.add_seq_formatter(
                 TooShortReadFilter,
                 options.too_short_output, options.too_short_paired_output)

     # sys.maxsize is treated as "no maximum length configured".
     if options.maximum_length < sys.maxsize:
         filters.add_filter(TooLongReadFilter, options.maximum_length)
         if options.too_long_output is not None:
             formatters.add_seq_formatter(
                 TooLongReadFilter,
                 options.too_long_output, options.too_long_paired_output)

     if options.max_n is not None:
         filters.add_filter(NContentFilter, options.max_n)

     if options.discard_trimmed:
         filters.add_filter(TrimmedFilter)

     # Destination for reads that pass every filter (NoFilter). When no
     # explicit output was given, fall back to options.default_outfile,
     # unless trimmed reads are discarded and untrimmed reads already
     # have their own output. Non-STDOUT paths are added to force_create
     # only when options.writer_process is set.
     if not formatters.multiplexed:
         if output1 is not None:
             formatters.add_seq_formatter(NoFilter, output1, output2)
             if output1 != STDOUT and options.writer_process:
                 force_create.append(output1)
                 if output2 is not None:
                     force_create.append(output2)
         elif not (options.discard_trimmed and options.untrimmed_output):
             formatters.add_seq_formatter(NoFilter, options.default_outfile)
             if options.default_outfile != STDOUT and options.writer_process:
                 force_create.append(options.default_outfile)

     if options.discard_untrimmed or options.untrimmed_output:
         filters.add_filter(UntrimmedFilter)

     if not options.discard_untrimmed:
         if formatters.multiplexed:
             # output1 is used as a str.format template with a ``name``
             # field (presumably it contains a '{name}' placeholder).
             # The same path receives both UntrimmedFilter and NoFilter
             # reads.
             untrimmed = options.untrimmed_output or output1.format(
                 name='unknown')
             formatters.add_seq_formatter(UntrimmedFilter, untrimmed)
             formatters.add_seq_formatter(NoFilter, untrimmed)
         elif options.untrimmed_output:
             formatters.add_seq_formatter(
                 UntrimmedFilter,
                 options.untrimmed_output, options.untrimmed_paired_output)

     if options.rest_file:
         formatters.add_info_formatter(RestFormatter(options.rest_file))
     if options.info_file:
         formatters.add_info_formatter(InfoFormatter(options.info_file))
     if options.wildcard_file:
         formatters.add_info_formatter(
             WildcardFormatter(options.wildcard_file))

     if options.paired:
         mixin_class = PairedEndPipelineMixin
     else:
         mixin_class = SingleEndPipelineMixin
     writers = Writers(force_create)
     record_handler = RecordHandler(modifiers, filters, formatters)
     # When statistics are requested, the handler is wrapped rather than
     # replaced.
     # NOTE(review): this uses self.quality_base whereas the modifiers
     # above use options.quality_base -- confirm the two always agree.
     if options.stats:
         record_handler = StatsRecordHandlerWrapper(
             record_handler, options.paired, options.stats,
             qualities=self.delivers_qualities,
             quality_base=self.quality_base)

     logger = logging.getLogger()
     num_adapters = sum(len(a) for a in modifiers.get_adapters())
     # NOTE(review): the plural 's' is added only when num_adapters > 1,
     # so a count of 0 is logged as "0 adapter". The mode label lookup
     # raises KeyError if options.paired matches none of the dict keys.
     logger.info(
         "Trimming %s adapter%s with at most %.1f%% errors in %s mode ...",
         num_adapters,
         's' if num_adapters > 1 else '', options.error_rate * 100,
         {
             False: 'single-end',
             'first': 'paired-end legacy',
             'both': 'paired-end'
         }[options.paired])
     # In 'first' (legacy) mode, warn when read-2 modifiers exist or a
     # quality cutoff was requested.
     if (
             options.paired == 'first' and (
                 len(record_handler.modifiers.get_modifiers(read=2)) > 0 or
                 options.quality_cutoff)):
         logger.warning('\n'.join(textwrap.wrap(
             'Requested read modifications are applied only to the '
             'first read since backwards compatibility mode is enabled. '
             'To modify both reads, also use any of the -A/-B/-G/-U '
             'options. Use a dummy adapter sequence when necessary: '
             '-A XXX')))

     if options.threads is None:
         # Run single-threaded version
         result_handler = WorkerResultHandler(WriterResultHandler(writers))
         # Build the concrete pipeline class at runtime by combining the
         # single-/paired-end mixin with TrimPipeline.
         pipeline_class = type(
             'TrimPipelineImpl', (mixin_class, TrimPipeline), {})
         pipeline = pipeline_class(record_handler, result_handler)
         self.summary.update(mode='serial', threads=1)
         return run_interruptible(pipeline, self, raise_on_error=True)
     else:
         # Run multiprocessing version
         self.summary.update(mode='parallel', threads=options.threads)
         return self.run_parallel(record_handler, writers, mixin_class)
Example #2
0
def create_trim_params(options, parser, default_outfile):
    """Build the reader, pipeline, formatters and writers for trimming.

    The function opens the input via ``create_reader``, parses adapters,
    validates the option combination, registers read modifiers and
    filters, wires up the output formatters and finally wraps modifiers
    and filters in a pipeline object.

    Args:
        options: Parsed command-line options; read as attributes.
        parser: Object whose ``error()`` method is used to report invalid
            configuration (presumably an ``argparse.ArgumentParser`` --
            TODO confirm). It is also passed on to ``create_reader``.
        default_outfile: Output destination used when neither an
            interleaved nor a regular output option was given.

    Returns:
        A 4-tuple ``(reader, pipeline, formatters, writers)``. The
        pipeline is a ``PipelineWithStats`` when ``options.stats`` is
        truthy and a plain ``Pipeline`` otherwise.

    Note:
        ``sys``, ``errno``, ``STDOUT``, ``ReadStatistics``,
        ``create_reader`` and ``load_known_adapters`` are used without a
        local import, so they must be available at module level.
    """
    from atropos.adapters import AdapterParser, BACK
    from atropos.modifiers import (
        Modifiers, AdapterCutter, InsertAdapterCutter, UnconditionalCutter,
        NextseqQualityTrimmer, QualityTrimmer, NonDirectionalBisulfiteTrimmer,
        RRBSTrimmer, SwiftBisulfiteTrimmer, MinCutter, NEndTrimmer,
        LengthTagModifier, SuffixRemover, PrefixSuffixAdder, DoubleEncoder,
        ZeroCapper, PrimerTrimmer, MergeOverlapping, OverwriteRead)
    from atropos.filters import (
        Filters, FilterFactory, TooShortReadFilter, TooLongReadFilter,
        NContentFilter, TrimmedFilter, UntrimmedFilter, NoFilter,
        MergedReadFilter)
    from atropos.trim import Pipeline, PipelineWithStats
    from atropos.seqio import Formatters, RestFormatter, InfoFormatter, WildcardFormatter, Writers
    from atropos.util import RandomMatchProbability

    reader, input_names, qualities, has_qual_file = create_reader(options, parser)

    # match_probability is only bound when one of its two consumers is
    # active; both later uses are guarded by the same conditions
    # (adapter_max_rmp for the parser, aligner == 'insert' for the
    # InsertAdapterCutter).
    if options.adapter_max_rmp or options.aligner == 'insert':
        match_probability = RandomMatchProbability()

    # Create Adapters

    has_adapters1 = options.adapters or options.anywhere or options.front
    has_adapters2 = options.adapters2 or options.anywhere2 or options.front2

    # Both names start out bound to the same empty list. That is safe
    # here because they are only ever rebound below, never mutated.
    adapters1 = adapters2 = []
    if has_adapters1 or has_adapters2:
        adapter_cache = load_known_adapters(options)
        parser_args = dict(
            colorspace=options.colorspace,
            max_error_rate=options.error_rate,
            min_overlap=options.overlap,
            read_wildcards=options.match_read_wildcards,
            adapter_wildcards=options.match_adapter_wildcards,
            indels=options.indels, indel_cost=options.indel_cost,
            cache=adapter_cache
        )
        if options.adapter_max_rmp:
            parser_args['match_probability'] = match_probability
            parser_args['max_rmp'] = options.adapter_max_rmp
        adapter_parser = AdapterParser(**parser_args)

        # A missing file (ENOENT) and any ValueError are reported through
        # parser.error; every other IOError is re-raised. The bare
        # ``raise`` is also reached after parser.error(e) if that call
        # returns (argparse's implementation exits instead).
        try:
            if has_adapters1:
                adapters1 = adapter_parser.parse_multi(
                    options.adapters, options.anywhere, options.front)
            if has_adapters2:
                adapters2 = adapter_parser.parse_multi(
                    options.adapters2, options.anywhere2, options.front2)
        except IOError as e:
            if e.errno == errno.ENOENT:
                parser.error(e)
            raise
        except ValueError as e:
            parser.error(e)

        if options.cache_adapters:
            adapter_cache.save()

    # Create Modifiers

    # Report an error when nothing was requested: no adapters, no quality
    # or NextSeq trimming, no cuts, no length limits, no qualfile, no N
    # handling, and (for paired input) no low-quality overwrite. The
    # message only mentions adapters even though any of these options
    # would satisfy the check.
    # TODO: can this be replaced with an argparse required group?
    if not adapters1 and not adapters2 and not options.quality_cutoff and \
            options.nextseq_trim is None and \
            options.cut == [] and options.cut2 == [] and \
            options.cut_min == [] and options.cut_min2 == [] and \
            (options.minimum_length is None or options.minimum_length <= 0) and \
            options.maximum_length == sys.maxsize and \
            not has_qual_file and options.max_n is None and not options.trim_n \
            and (not options.paired or options.overwrite_low_quality is None):
        parser.error("You need to provide at least one adapter sequence.")

    # The insert aligner is only accepted when each read has exactly one
    # adapter and that adapter's ``where`` equals BACK.
    if options.aligner == 'insert':
        if not adapters1 or len(adapters1) != 1 or adapters1[0].where != BACK or \
                not adapters2 or len(adapters2) != 1 or adapters2[0].where != BACK:
            parser.error("Insert aligner requires a single 3' adapter for each read")

    if options.debug:
        for adapter in adapters1 + adapters2:
            adapter.enable_debug()

    modifiers = Modifiers(options.paired)

    # Modifiers controlled by options.op_order are registered in the
    # order the codes appear there. A code is silently ignored when its
    # corresponding option is not set. Codes handled:
    #   'W' OverwriteRead, 'A' adapter cutting, 'C' UnconditionalCutter,
    #   'G' NextseqQualityTrimmer, 'Q' QualityTrimmer.
    for op in options.op_order:
        if op == 'W' and options.overwrite_low_quality:
            lowq, highq, window = options.overwrite_low_quality
            modifiers.add_modifier(OverwriteRead,
                worse_read_min_quality=lowq, better_read_min_quality=highq,
                window_size=window, base=options.quality_base)

        elif op == 'A' and (adapters1 or adapters2):
            # TODO: generalize this using some kind of factory class
            if options.aligner == 'insert':
                # Indexing [0] relies on the insert-aligner validation
                # above having rejected any other adapter configuration.
                # Use different base probabilities if we're trimming bisulfite data.
                # TODO: this doesn't seem to help things, so commenting it out for now
                #base_probs = dict(p1=0.33, p2=0.67) if options.bisulfite else dict(p1=0.25, p2=0.75)
                modifiers.add_modifier(InsertAdapterCutter,
                    adapter1=adapters1[0], adapter2=adapters2[0], action=options.action,
                    mismatch_action=options.correct_mismatches,
                    max_insert_mismatch_frac=options.insert_match_error_rate,
                    max_adapter_mismatch_frac=options.insert_match_adapter_error_rate,
                    match_probability=match_probability,
                    insert_max_rmp=options.insert_max_rmp)
            else:
                # A read without adapters keeps None instead of an
                # argument dict.
                a1_args = a2_args = None
                if adapters1:
                    a1_args = dict(adapters=adapters1, times=options.times, action=options.action)
                if adapters2:
                    a2_args = dict(adapters=adapters2, times=options.times, action=options.action)
                modifiers.add_modifier_pair(AdapterCutter, a1_args, a2_args)
        elif op == 'C' and (options.cut or options.cut2):
            modifiers.add_modifier_pair(UnconditionalCutter,
                dict(lengths=options.cut),
                dict(lengths=options.cut2)
            )
        elif op == 'G' and (options.nextseq_trim is not None):
            # Registered with read=1 only.
            modifiers.add_modifier(NextseqQualityTrimmer,
                read=1, cutoff=options.nextseq_trim, base=options.quality_base)
        elif op == 'Q' and options.quality_cutoff:
            modifiers.add_modifier(QualityTrimmer,
                cutoff_front=options.quality_cutoff[0],
                cutoff_back=options.quality_cutoff[1],
                base=options.quality_base)

    # The remaining modifiers are not governed by op_order; they are
    # always registered after the ones above, in the order written here.

    # options.bisulfite is either a string naming a protocol, or an
    # indexable whose entries are unpacked as keyword arguments for
    # MinCutter (entry 0 for read 1, entry 1 for read 2).
    if options.bisulfite:
        if isinstance(options.bisulfite, str):
            if "non-directional" in options.bisulfite:
                modifiers.add_modifier(NonDirectionalBisulfiteTrimmer,
                    rrbs=options.bisulfite=="non-directional-rrbs")
            elif options.bisulfite == "rrbs":
                modifiers.add_modifier(RRBSTrimmer)
            elif options.bisulfite in ("epignome", "truseq"):
                # Trimming leads to worse results
                #modifiers.add_modifier(TruSeqBisulfiteTrimmer)
                pass
            elif options.bisulfite == "swift":
                modifiers.add_modifier(SwiftBisulfiteTrimmer)
        else:
            if options.bisulfite[0]:
                modifiers.add_modifier(MinCutter, read=1, **(options.bisulfite[0]))
            if len(options.bisulfite) > 1 and options.bisulfite[1]:
                modifiers.add_modifier(MinCutter, read=2, **(options.bisulfite[1]))

    if options.trim_n:
        modifiers.add_modifier(NEndTrimmer)

    if options.cut_min or options.cut_min2:
        modifiers.add_modifier_pair(MinCutter,
            dict(lengths=options.cut_min),
            dict(lengths=options.cut_min2)
        )

    if options.length_tag:
        modifiers.add_modifier(LengthTagModifier, length_tag=options.length_tag)

    if options.strip_suffix:
        modifiers.add_modifier(SuffixRemover, suffixes=options.strip_suffix)

    if options.prefix or options.suffix:
        modifiers.add_modifier(PrefixSuffixAdder, prefix=options.prefix, suffix=options.suffix)

    if options.double_encode:
        modifiers.add_modifier(DoubleEncoder)

    # Zero-capping is skipped when create_reader reported no qualities.
    if options.zero_cap and qualities:
        modifiers.add_modifier(ZeroCapper, quality_base=options.quality_base)

    if options.trim_primer:
        modifiers.add_modifier(PrimerTrimmer)

    if options.merge_overlapping:
        modifiers.add_modifier(MergeOverlapping,
            min_overlap=options.merge_min_overlap,
            error_rate=options.merge_error_rate,
            mismatch_action=options.correct_mismatches)

    # Create Filters and Formatters

    # FilterFactory receives 2 for pair_filter == 'both' and 1 otherwise
    # (presumably the number of reads in a pair that must match for the
    # pair to be filtered -- TODO confirm against FilterFactory).
    min_affected = 2 if options.pair_filter == 'both' else 1
    filters = Filters(FilterFactory(options.paired, min_affected))

    # An interleaved output takes precedence over the separate
    # output/paired_output options; in that case output2 stays None.
    output1 = output2 = None
    interleaved = False
    if options.interleaved_output:
        output1 = options.interleaved_output
        interleaved = True
    else:
        output1 = options.output
        output2 = options.paired_output

    seq_formatter_args = dict(
        qualities=qualities,
        colorspace=options.colorspace,
        interleaved=interleaved
    )
    formatters = Formatters(output1, seq_formatter_args)
    # Output paths collected here are handed to Writers further down.
    force_create = []

    if options.merge_overlapping:
        filters.add_filter(MergedReadFilter)
        if options.merged_output:
            formatters.add_seq_formatter(MergedReadFilter, options.merged_output)

    if options.minimum_length is not None and options.minimum_length > 0:
        filters.add_filter(TooShortReadFilter, options.minimum_length)
        if options.too_short_output:
            formatters.add_seq_formatter(TooShortReadFilter,
                options.too_short_output, options.too_short_paired_output)

    # sys.maxsize is treated as "no maximum length configured".
    if options.maximum_length < sys.maxsize:
        filters.add_filter(TooLongReadFilter, options.maximum_length)
        if options.too_long_output is not None:
            formatters.add_seq_formatter(TooLongReadFilter,
                options.too_long_output, options.too_long_paired_output)

    if options.max_n is not None:
        filters.add_filter(NContentFilter, options.max_n)

    if options.discard_trimmed:
        filters.add_filter(TrimmedFilter)

    # Destination for reads that pass every filter (NoFilter). When no
    # explicit output was given, fall back to default_outfile, unless
    # trimmed reads are discarded and untrimmed reads already have their
    # own output. Non-STDOUT paths are added to force_create only when
    # options.writer_process is set.
    if not formatters.multiplexed:
        if output1 is not None:
            formatters.add_seq_formatter(NoFilter, output1, output2)
            if output1 != STDOUT and options.writer_process:
                force_create.append(output1)
                if output2 is not None:
                    force_create.append(output2)
        elif not (options.discard_trimmed and options.untrimmed_output):
            formatters.add_seq_formatter(NoFilter, default_outfile)
            if default_outfile != STDOUT and options.writer_process:
                force_create.append(default_outfile)

    if options.discard_untrimmed or options.untrimmed_output:
        filters.add_filter(UntrimmedFilter)

    if not options.discard_untrimmed:
        if formatters.multiplexed:
            # output1 is used as a str.format template with a ``name``
            # field (presumably it contains a '{name}' placeholder). The
            # same path receives both UntrimmedFilter and NoFilter reads.
            untrimmed = options.untrimmed_output or output1.format(name='unknown')
            formatters.add_seq_formatter(UntrimmedFilter, untrimmed)
            formatters.add_seq_formatter(NoFilter, untrimmed)
        elif options.untrimmed_output:
            formatters.add_seq_formatter(UntrimmedFilter,
                options.untrimmed_output, options.untrimmed_paired_output)

    if options.rest_file:
        formatters.add_info_formatter(RestFormatter(options.rest_file))
    if options.info_file:
        formatters.add_info_formatter(InfoFormatter(options.info_file))
    if options.wildcard_file:
        formatters.add_info_formatter(WildcardFormatter(options.wildcard_file))

    writers = Writers(force_create)

    # Statistics collection selects a different pipeline class; the
    # formatters and writers are returned separately in both cases.
    if options.stats:
        read_stats = ReadStatistics(
            options.stats, options.paired, qualities=qualities,
            tile_key_regexp=options.tile_key_regexp)
        pipeline = PipelineWithStats(modifiers, filters, read_stats)
    else:
        pipeline = Pipeline(modifiers, filters)

    return (reader, pipeline, formatters, writers)
Example #3
0
    def __call__(self):
        options = self.options
        match_probability = RandomMatchProbability()

        # Create Adapters

        has_adapters1 = options.adapters or options.anywhere or options.front
        has_adapters2 = options.adapters2 or options.anywhere2 or options.front2

        adapters1 = adapters2 = []
        if has_adapters1 or has_adapters2:
            adapter_cache = super().load_known_adapters()
            parser_args = dict(
                colorspace=options.colorspace,
                max_error_rate=options.error_rate,
                min_overlap=options.overlap,
                read_wildcards=options.match_read_wildcards,
                adapter_wildcards=options.match_adapter_wildcards,
                indels=options.indels,
                indel_cost=options.indel_cost,
                cache=adapter_cache,
                gc_content=options.gc_content,
                match_probability=match_probability,
                alphabet=options.alphabet)
            if options.adapter_max_rmp:
                parser_args['max_rmp'] = options.adapter_max_rmp
            adapter_parser = AdapterParser(**parser_args)

            if has_adapters1:
                adapters1 = adapter_parser.parse_multi(options.adapters,
                                                       options.anywhere,
                                                       options.front)
            if has_adapters2:
                adapters2 = adapter_parser.parse_multi(options.adapters2,
                                                       options.anywhere2,
                                                       options.front2)

            if options.cache_adapters:
                adapter_cache.save()

        # Create Modifiers

        # TODO: can this be replaced with an argparse required group?
        if (not adapters1 and not adapters2 and not options.quality_cutoff
                and options.nextseq_trim is None and options.cut == []
                and options.cut2 == [] and options.cut_min == []
                and options.cut_min2 == [] and
            (options.minimum_length is None or options.minimum_length <= 0)
                and options.maximum_length == sys.maxsize
                and not options.trim_n and not self.has_qualfile
                and options.max_n is None and
            (not options.paired or options.overwrite_low_quality is None)):
            raise ValueError(
                "You need to provide at least one adapter sequence.")

        if (options.aligner == 'insert'
                and any(not a or len(a) != 1 or a[0].where != BACK
                        for a in (adapters1, adapters2))):
            raise ValueError(
                "Insert aligner requires a single 3' adapter for each read")

        if options.debug:
            for adapter in adapters1 + adapters2:
                adapter.enable_debug()

        if options.paired:
            modifiers = PairedEndModifiers(options.paired)
        else:
            modifiers = SingleEndModifiers()

        for oper in options.op_order:
            if oper == 'W' and options.overwrite_low_quality:
                lowq, highq, window = options.overwrite_low_quality
                modifiers.add_modifier(OverwriteRead,
                                       worse_read_min_quality=lowq,
                                       better_read_min_quality=highq,
                                       window_size=window,
                                       base=options.quality_base)

            elif oper == 'A' and (adapters1 or adapters2):
                # TODO: generalize this using some kind of factory class
                if options.aligner == 'insert':
                    # Use different base probabilities if we're trimming
                    # bisulfite data.
                    # TODO: this doesn't seem to help things, so commenting it
                    # out for now
                    #if options.bisulfite:
                    #   base_probs = dict(match_prob=0.33, mismatch_prob=0.67)
                    # else:
                    #   base_probs = dict(match_prob=0.25, mismatch_prob=0.75)
                    modifiers.add_modifier(
                        InsertAdapterCutter,
                        adapter1=adapters1[0], adapter2=adapters2[0],
                        action=options.action,
                        mismatch_action=options.correct_mismatches,
                        max_insert_mismatch_frac=\
                            options.insert_match_error_rate,
                        max_adapter_mismatch_frac=\
                            options.insert_match_adapter_error_rate,
                        match_probability=match_probability,
                        insert_max_rmp=options.insert_max_rmp,
                        read_wildcards=options.match_read_wildcards,
                        adapter_wildcards=options.match_adapter_wildcards)
                else:
                    a1_args = dict(
                        adapters=adapters1,
                        times=options.times,
                        action=options.action) if adapters1 else None
                    a2_args = dict(
                        adapters=adapters2,
                        times=options.times,
                        action=options.action) if adapters2 else None
                    modifiers.add_modifier_pair(AdapterCutter, a1_args,
                                                a2_args)
            elif oper == 'C' and (options.cut or options.cut2):
                modifiers.add_modifier_pair(UnconditionalCutter,
                                            dict(lengths=options.cut),
                                            dict(lengths=options.cut2))
            elif oper == 'G' and (options.nextseq_trim is not None):
                modifiers.add_modifier(NextseqQualityTrimmer,
                                       cutoff=options.nextseq_trim,
                                       base=options.quality_base)
            elif oper == 'Q' and options.quality_cutoff:
                modifiers.add_modifier(QualityTrimmer,
                                       cutoff_front=options.quality_cutoff[0],
                                       cutoff_back=options.quality_cutoff[1],
                                       base=options.quality_base)

        if options.bisulfite:
            if isinstance(options.bisulfite, str):
                if "non-directional" in options.bisulfite:
                    modifiers.add_modifier(
                        NonDirectionalBisulfiteTrimmer,
                        rrbs=options.bisulfite == "non-directional-rrbs")
                elif options.bisulfite == "rrbs":
                    modifiers.add_modifier(RRBSTrimmer)
                elif options.bisulfite in ("epignome", "truseq"):
                    # Trimming leads to worse results
                    #modifiers.add_modifier(TruSeqBisulfiteTrimmer)
                    pass
                elif options.bisulfite == "swift":
                    modifiers.add_modifier(SwiftBisulfiteTrimmer)
            else:
                if options.bisulfite[0]:
                    modifiers.add_modifier(MinCutter,
                                           read=1,
                                           **(options.bisulfite[0]))
                if len(options.bisulfite) > 1 and options.bisulfite[1]:
                    modifiers.add_modifier(MinCutter,
                                           read=2,
                                           **(options.bisulfite[1]))

        if options.trim_n:
            modifiers.add_modifier(NEndTrimmer)

        if options.cut_min or options.cut_min2:
            modifiers.add_modifier_pair(MinCutter,
                                        dict(lengths=options.cut_min),
                                        dict(lengths=options.cut_min2))

        if options.length_tag:
            modifiers.add_modifier(LengthTagModifier,
                                   length_tag=options.length_tag)

        if options.strip_suffix:
            modifiers.add_modifier(SuffixRemover,
                                   suffixes=options.strip_suffix)

        if options.prefix or options.suffix:
            modifiers.add_modifier(PrefixSuffixAdder,
                                   prefix=options.prefix,
                                   suffix=options.suffix)

        if options.double_encode:
            modifiers.add_modifier(DoubleEncoder)

        if options.zero_cap and self.delivers_qualities:
            modifiers.add_modifier(ZeroCapper,
                                   quality_base=options.quality_base)

        if options.trim_primer:
            modifiers.add_modifier(PrimerTrimmer)

        if options.merge_overlapping:
            modifiers.add_modifier(MergeOverlapping,
                                   min_overlap=options.merge_min_overlap,
                                   error_rate=options.merge_error_rate,
                                   mismatch_action=options.correct_mismatches)

        # Create Filters and Formatters

        min_affected = 2 if options.pair_filter == 'both' else 1
        filters = Filters(FilterFactory(options.paired, min_affected))

        output1 = output2 = None
        interleaved = False
        if options.interleaved_output:
            output1 = options.interleaved_output
            interleaved = True
        else:
            output1 = options.output
            output2 = options.paired_output

        seq_formatter_args = dict(qualities=self.delivers_qualities,
                                  colorspace=options.colorspace,
                                  interleaved=interleaved)
        formatters = Formatters(output1, seq_formatter_args)
        force_create = []

        if options.merge_overlapping:
            filters.add_filter(MergedReadFilter)
            if options.merged_output:
                formatters.add_seq_formatter(MergedReadFilter,
                                             options.merged_output)

        if options.minimum_length is not None and options.minimum_length > 0:
            filters.add_filter(TooShortReadFilter, options.minimum_length)
            if options.too_short_output:
                formatters.add_seq_formatter(TooShortReadFilter,
                                             options.too_short_output,
                                             options.too_short_paired_output)

        if options.maximum_length < sys.maxsize:
            filters.add_filter(TooLongReadFilter, options.maximum_length)
            if options.too_long_output is not None:
                formatters.add_seq_formatter(TooLongReadFilter,
                                             options.too_long_output,
                                             options.too_long_paired_output)

        if options.max_n is not None:
            filters.add_filter(NContentFilter, options.max_n)

        if options.discard_trimmed:
            filters.add_filter(TrimmedFilter)

        if not formatters.multiplexed:
            if output1 is not None:
                formatters.add_seq_formatter(NoFilter, output1, output2)
                if output1 != STDOUT and options.writer_process:
                    force_create.append(output1)
                    if output2 is not None:
                        force_create.append(output2)
            elif not (options.discard_trimmed and options.untrimmed_output):
                formatters.add_seq_formatter(NoFilter, options.default_outfile)
                if options.default_outfile != STDOUT and options.writer_process:
                    force_create.append(options.default_outfile)

        if options.discard_untrimmed or options.untrimmed_output:
            filters.add_filter(UntrimmedFilter)

        if not options.discard_untrimmed:
            if formatters.multiplexed:
                untrimmed = options.untrimmed_output or output1.format(
                    name='unknown')
                formatters.add_seq_formatter(UntrimmedFilter, untrimmed)
                formatters.add_seq_formatter(NoFilter, untrimmed)
            elif options.untrimmed_output:
                formatters.add_seq_formatter(UntrimmedFilter,
                                             options.untrimmed_output,
                                             options.untrimmed_paired_output)

        if options.rest_file:
            formatters.add_info_formatter(RestFormatter(options.rest_file))
        if options.info_file:
            formatters.add_info_formatter(InfoFormatter(options.info_file))
        if options.wildcard_file:
            formatters.add_info_formatter(
                WildcardFormatter(options.wildcard_file))

        if options.paired:
            mixin_class = PairedEndPipelineMixin
        else:
            mixin_class = SingleEndPipelineMixin
        writers = Writers(force_create)
        record_handler = RecordHandler(modifiers, filters, formatters)
        if options.stats:
            record_handler = StatsRecordHandlerWrapper(
                record_handler,
                options.paired,
                options.stats,
                qualities=self.delivers_qualities,
                quality_base=self.quality_base)

        logger = logging.getLogger()
        num_adapters = sum(len(a) for a in modifiers.get_adapters())
        logger.info(
            "Trimming %s adapter%s with at most %.1f%% errors in %s mode ...",
            num_adapters, 's' if num_adapters > 1 else '',
            options.error_rate * 100, {
                False: 'single-end',
                'first': 'paired-end legacy',
                'both': 'paired-end'
            }[options.paired])
        if (options.paired == 'first'
                and (len(record_handler.modifiers.get_modifiers(read=2)) > 0
                     or options.quality_cutoff)):
            logger.warning('\n'.join(
                textwrap.wrap(
                    'Requested read modifications are applied only to the '
                    'first read since backwards compatibility mode is enabled. '
                    'To modify both reads, also use any of the -A/-B/-G/-U '
                    'options. Use a dummy adapter sequence when necessary: '
                    '-A XXX')))

        if options.threads is None:
            # Run single-threaded version
            result_handler = WorkerResultHandler(WriterResultHandler(writers))
            pipeline_class = type('TrimPipelineImpl',
                                  (mixin_class, TrimPipeline), {})
            pipeline = pipeline_class(record_handler, result_handler)
            self.summary.update(mode='serial', threads=1)
            return run_interruptible(pipeline, self, raise_on_error=True)
        else:
            # Run multiprocessing version
            self.summary.update(mode='parallel', threads=options.threads)
            return self.run_parallel(record_handler, writers, mixin_class)
Exemple #4
0
def create_atropos_params(options, parser, default_outfile):
    """Build every component of a trimming run and bundle them together.

    The function creates the reader, parses the adapters, registers the read
    modifiers (in the order given by ``options.op_order`` followed by a fixed
    sequence of optional modifiers), registers the read filters and the
    output formatters, and finally creates the writers.

    Args:
        options: Object whose attributes configure the run (adapters,
            cutoffs, output destinations, ...).
        parser: Object exposing ``error(message)``; it is called for invalid
            option combinations, for adapter files that do not exist
            (``ENOENT``) and for ``ValueError`` raised while parsing
            adapters. Presumably an ``argparse`` parser -- confirm with the
            caller.
        default_outfile: Destination registered for ``NoFilter`` when no
            explicit primary output is configured, unless trimmed reads are
            discarded and an untrimmed output is set.

    Returns:
        ``AtroposParams(reader, modifiers, filters, formatters, writers)``.

    Raises:
        IOError: Re-raised when adapter parsing fails with an I/O error
            (after ``parser.error`` has been invoked for ``ENOENT``).
    """
    from atropos.adapters import AdapterParser, BACK
    from atropos.modifiers import (
        Modifiers, AdapterCutter, InsertAdapterCutter, UnconditionalCutter,
        NextseqQualityTrimmer, QualityTrimmer, NonDirectionalBisulfiteTrimmer,
        RRBSTrimmer, SwiftBisulfiteTrimmer, MinCutter, NEndTrimmer,
        LengthTagModifier, SuffixRemover, PrefixSuffixAdder, DoubleEncoder,
        ZeroCapper, PrimerTrimmer, MergeOverlapping, OverwriteRead)
    from atropos.filters import (
        Filters, FilterFactory, TooShortReadFilter, TooLongReadFilter,
        NContentFilter, TrimmedFilter, UntrimmedFilter, NoFilter,
        MergedReadFilter)
    from atropos.seqio import (
        Formatters, RestFormatter, InfoFormatter, WildcardFormatter, Writers)
    from atropos.util import RandomMatchProbability

    reader, _input_names, qualities, has_qual_file = create_reader(
        options, parser)

    # The random-match-probability calculator is only needed for the
    # adapter RMP threshold and for the insert aligner.
    needs_rmp = options.adapter_max_rmp or options.aligner == 'insert'
    if needs_rmp:
        match_probability = RandomMatchProbability()

    # ------------------------------------------------------------------
    # Adapters
    # ------------------------------------------------------------------

    wants_adapters1 = options.adapters or options.anywhere or options.front
    wants_adapters2 = (
        options.adapters2 or options.anywhere2 or options.front2)

    adapters1, adapters2 = [], []
    if wants_adapters1 or wants_adapters2:
        known_adapters = load_known_adapters(options)
        adapter_parser_kwargs = {
            'colorspace': options.colorspace,
            'max_error_rate': options.error_rate,
            'min_overlap': options.overlap,
            'read_wildcards': options.match_read_wildcards,
            'adapter_wildcards': options.match_adapter_wildcards,
            'indels': options.indels,
            'indel_cost': options.indel_cost,
            'cache': known_adapters,
        }
        if options.adapter_max_rmp:
            adapter_parser_kwargs['match_probability'] = match_probability
            adapter_parser_kwargs['max_rmp'] = options.adapter_max_rmp
        adapter_parser = AdapterParser(**adapter_parser_kwargs)

        try:
            if wants_adapters1:
                adapters1 = adapter_parser.parse_multi(
                    options.adapters, options.anywhere, options.front)
            if wants_adapters2:
                adapters2 = adapter_parser.parse_multi(
                    options.adapters2, options.anywhere2, options.front2)
        except IOError as err:
            # A missing adapter file is reported through the parser; any
            # other I/O failure propagates unchanged.
            if err.errno != errno.ENOENT:
                raise
            parser.error(err)
            raise
        except ValueError as err:
            parser.error(err)

        if options.cache_adapters:
            known_adapters.save()

    # ------------------------------------------------------------------
    # Modifiers
    # ------------------------------------------------------------------

    # TODO: can this be replaced with an argparse required group?
    nothing_requested = (
        not adapters1 and not adapters2
        and not options.quality_cutoff
        and options.nextseq_trim is None
        and options.cut == [] and options.cut2 == []
        and options.cut_min == [] and options.cut_min2 == []
        and (options.minimum_length is None or options.minimum_length <= 0)
        and options.maximum_length == sys.maxsize
        and not has_qual_file
        and options.max_n is None
        and not options.trim_n
        and (not options.paired or options.overwrite_low_quality is None)
    )
    if nothing_requested:
        parser.error("You need to provide at least one adapter sequence.")

    def _not_single_back_adapter(adapters):
        # True unless `adapters` holds exactly one adapter located at BACK.
        return (
            not adapters or len(adapters) != 1 or adapters[0].where != BACK)

    if options.aligner == 'insert':
        if (_not_single_back_adapter(adapters1)
                or _not_single_back_adapter(adapters2)):
            parser.error(
                "Insert aligner requires a single 3' adapter for each read")

    if options.debug:
        for adapter_group in (adapters1, adapters2):
            for adapter in adapter_group:
                adapter.enable_debug()

    def _cutter_args(adapters):
        # Keyword arguments for AdapterCutter on one read, or None when
        # that read has no adapters.
        if not adapters:
            return None
        return dict(
            adapters=adapters, times=options.times, action=options.action)

    modifiers = Modifiers(options.paired)

    # Operations configurable by the user are registered in the requested
    # order; each code is skipped when its option is not set.
    for step in options.op_order:
        if step == 'W':
            if options.overwrite_low_quality:
                lowq, highq, window = options.overwrite_low_quality
                modifiers.add_modifier(
                    OverwriteRead,
                    worse_read_min_quality=lowq,
                    better_read_min_quality=highq,
                    window_size=window,
                    base=options.quality_base)
        elif step == 'A':
            if adapters1 or adapters2:
                # TODO: generalize this using some kind of factory class
                if options.aligner == 'insert':
                    # TODO: bisulfite-specific base probabilities
                    # (p1=0.33, p2=0.67) did not seem to help, so the
                    # defaults are used.
                    modifiers.add_modifier(
                        InsertAdapterCutter,
                        adapter1=adapters1[0],
                        adapter2=adapters2[0],
                        action=options.action,
                        mismatch_action=options.correct_mismatches,
                        max_insert_mismatch_frac=(
                            options.insert_match_error_rate),
                        max_adapter_mismatch_frac=(
                            options.insert_match_adapter_error_rate),
                        match_probability=match_probability,
                        insert_max_rmp=options.insert_max_rmp)
                else:
                    read1_args = _cutter_args(adapters1)
                    read2_args = _cutter_args(adapters2)
                    modifiers.add_modifier_pair(
                        AdapterCutter, read1_args, read2_args)
        elif step == 'C':
            if options.cut or options.cut2:
                modifiers.add_modifier_pair(
                    UnconditionalCutter,
                    dict(lengths=options.cut),
                    dict(lengths=options.cut2))
        elif step == 'G':
            if options.nextseq_trim is not None:
                modifiers.add_modifier(
                    NextseqQualityTrimmer,
                    read=1,
                    cutoff=options.nextseq_trim,
                    base=options.quality_base)
        elif step == 'Q':
            if options.quality_cutoff:
                front_cutoff = options.quality_cutoff[0]
                back_cutoff = options.quality_cutoff[1]
                modifiers.add_modifier(
                    QualityTrimmer,
                    cutoff_front=front_cutoff,
                    cutoff_back=back_cutoff,
                    base=options.quality_base)

    bisulfite = options.bisulfite
    if bisulfite:
        if isinstance(bisulfite, str):
            # Named bisulfite protocol.
            if "non-directional" in bisulfite:
                modifiers.add_modifier(
                    NonDirectionalBisulfiteTrimmer,
                    rrbs=bisulfite == "non-directional-rrbs")
            elif bisulfite == "rrbs":
                modifiers.add_modifier(RRBSTrimmer)
            elif bisulfite in ("epignome", "truseq"):
                # Trimming leads to worse results, so no modifier
                # (TruSeqBisulfiteTrimmer) is registered for these.
                pass
            elif bisulfite == "swift":
                modifiers.add_modifier(SwiftBisulfiteTrimmer)
        else:
            # Per-read keyword arguments for MinCutter.
            if bisulfite[0]:
                modifiers.add_modifier(MinCutter, read=1, **(bisulfite[0]))
            if len(bisulfite) > 1 and bisulfite[1]:
                modifiers.add_modifier(MinCutter, read=2, **(bisulfite[1]))

    if options.trim_n:
        modifiers.add_modifier(NEndTrimmer)

    if options.cut_min or options.cut_min2:
        modifiers.add_modifier_pair(
            MinCutter,
            dict(lengths=options.cut_min),
            dict(lengths=options.cut_min2))

    if options.length_tag:
        modifiers.add_modifier(
            LengthTagModifier, length_tag=options.length_tag)

    if options.strip_suffix:
        modifiers.add_modifier(
            SuffixRemover, suffixes=options.strip_suffix)

    if options.prefix or options.suffix:
        modifiers.add_modifier(
            PrefixSuffixAdder,
            prefix=options.prefix,
            suffix=options.suffix)

    if options.double_encode:
        modifiers.add_modifier(DoubleEncoder)

    if options.zero_cap and qualities:
        modifiers.add_modifier(
            ZeroCapper, quality_base=options.quality_base)

    if options.trim_primer:
        modifiers.add_modifier(PrimerTrimmer)

    if options.merge_overlapping:
        modifiers.add_modifier(
            MergeOverlapping,
            min_overlap=options.merge_min_overlap,
            error_rate=options.merge_error_rate,
            mismatch_action=options.correct_mismatches)

    # ------------------------------------------------------------------
    # Filters and formatters
    # ------------------------------------------------------------------

    if options.pair_filter == 'both':
        min_affected = 2
    else:
        min_affected = 1
    filters = Filters(FilterFactory(options.paired, min_affected))

    # An interleaved destination replaces the separate read-1/read-2 ones.
    interleaved = bool(options.interleaved_output)
    if interleaved:
        output1, output2 = options.interleaved_output, None
    else:
        output1, output2 = options.output, options.paired_output

    seq_formatter_args = {
        'qualities': qualities,
        'colorspace': options.colorspace,
        'interleaved': interleaved,
    }
    formatters = Formatters(output1, seq_formatter_args)
    files_to_create = []

    if options.merge_overlapping:
        filters.add_filter(MergedReadFilter)
        if options.merged_output:
            formatters.add_seq_formatter(
                MergedReadFilter, options.merged_output)

    if options.minimum_length is not None and options.minimum_length > 0:
        filters.add_filter(TooShortReadFilter, options.minimum_length)
        if options.too_short_output:
            formatters.add_seq_formatter(
                TooShortReadFilter,
                options.too_short_output,
                options.too_short_paired_output)

    if options.maximum_length < sys.maxsize:
        filters.add_filter(TooLongReadFilter, options.maximum_length)
        if options.too_long_output is not None:
            formatters.add_seq_formatter(
                TooLongReadFilter,
                options.too_long_output,
                options.too_long_paired_output)

    if options.max_n is not None:
        filters.add_filter(NContentFilter, options.max_n)

    if options.discard_trimmed:
        filters.add_filter(TrimmedFilter)

    if not formatters.multiplexed:
        if output1 is not None:
            formatters.add_seq_formatter(NoFilter, output1, output2)
            if output1 != STDOUT and options.writer_process:
                files_to_create.append(output1)
                if output2 is not None:
                    files_to_create.append(output2)
        elif not (options.discard_trimmed and options.untrimmed_output):
            formatters.add_seq_formatter(NoFilter, default_outfile)
            if default_outfile != STDOUT and options.writer_process:
                files_to_create.append(default_outfile)

    if options.discard_untrimmed or options.untrimmed_output:
        filters.add_filter(UntrimmedFilter)

    if not options.discard_untrimmed:
        if formatters.multiplexed:
            # Without an explicit untrimmed destination, fill the output
            # template with the name 'unknown'.
            untrimmed_dest = (
                options.untrimmed_output or output1.format(name='unknown'))
            formatters.add_seq_formatter(UntrimmedFilter, untrimmed_dest)
            formatters.add_seq_formatter(NoFilter, untrimmed_dest)
        elif options.untrimmed_output:
            formatters.add_seq_formatter(
                UntrimmedFilter,
                options.untrimmed_output,
                options.untrimmed_paired_output)

    if options.rest_file:
        formatters.add_info_formatter(RestFormatter(options.rest_file))
    if options.info_file:
        formatters.add_info_formatter(InfoFormatter(options.info_file))
    if options.wildcard_file:
        formatters.add_info_formatter(
            WildcardFormatter(options.wildcard_file))

    writers = Writers(files_to_create)

    return AtroposParams(reader, modifiers, filters, formatters, writers)