Exemple #1
0
    def __init__(self,
                 input_file,
                 output_file,
                 algorithm="auto",
                 options={},
                 dependencies=()):
        """Set up a command node aligning `input_file` into `output_file`.

        The base call is taken from `_PRESETS`, keyed by the lower-cased
        `algorithm` name, with a placeholder for the input FASTA appended.
        `options` are applied to the command via `apply_options`, and
        `dependencies` are passed through to `CommandNode.__init__`.
        """
        call = _PRESETS[algorithm.lower()] + ["%(IN_FASTA)s"]
        builder = AtomicCmdBuilder(call,
                                   IN_FASTA=input_file,
                                   OUT_STDOUT=output_file,
                                   CHECK_VERSION=MAFFT_VERSION)
        apply_options(builder, options)

        # Kept on the instance; not otherwise used within this method
        self._output_file = output_file

        description = "<MAFFTNode (%s): '%s' -> '%s'>" \
            % (algorithm, input_file, output_file)

        CommandNode.__init__(self,
                             command=builder.finalize(),
                             description=description,
                             dependencies=dependencies)
Exemple #2
0
    def __init__(self,
                 infile,
                 bedfile,
                 outfile,
                 padding,
                 options={},
                 dependencies=()):
        """Set up a "vcf_to_fasta" command writing its STDOUT to `outfile`.

        `infile` is passed as the genotype source and `bedfile` as the
        intervals; `infile` + ".tbi" is registered as an additional input.
        `options` are applied to the command via `apply_options`.
        """
        builder = factory.new("vcf_to_fasta")
        fixed_options = (
            ("--padding", padding),
            ("--genotype", "%(IN_VCFFILE)s"),
            ("--intervals", "%(IN_INTERVALS)s"),
        )
        for key, value in fixed_options:
            builder.set_option(key, value)

        builder.set_kwargs(IN_VCFFILE=infile,
                           IN_TABIX=infile + ".tbi",
                           IN_INTERVALS=bedfile,
                           OUT_STDOUT=outfile)

        # User-supplied options are applied after the fixed ones above
        apply_options(builder, options)

        CommandNode.__init__(
            self,
            description="<BuildRegions: '%s' -> '%s'>" % (infile, outfile),
            command=builder.finalize(),
            dependencies=dependencies,
        )
Exemple #3
0
    def __init__(
            self,
            input_file_1,
            output_file,
            reference,
            prefix,
            input_file_2=None,
            threads=1,
            algorithm="mem",
            mapping_options={},
            cleanup_options={},
            dependencies=(),
    ):
        """Set up a `bwa mem` / `bwa bwasw` command piped into a cleanup command.

        Raises NotImplementedError for any `algorithm` other than "mem" or
        "bwasw". `input_file_2` is optional; when given it is appended as a
        second input to the mapping command. `mapping_options` are applied to
        the bwa command and `cleanup_options` to the cleanup command.
        """
        if algorithm not in ("mem", "bwasw"):
            raise NotImplementedError("BWA algorithm %r not implemented" %
                                      (algorithm, ))

        # The requested thread count may be adjusted based on the reference
        threads = _get_max_threads(reference, threads)

        mapper = _new_bwa_command(("bwa", algorithm, prefix, "%(IN_FILE_1)s"),
                                  prefix,
                                  IN_FILE_1=input_file_1,
                                  OUT_STDOUT=AtomicCmd.PIPE)

        if input_file_2:
            mapper.add_value("%(IN_FILE_2)s")
            mapper.set_kwargs(IN_FILE_2=input_file_2)

        mapper.set_option("-t", threads)
        # Alternative hits are marked as secondary, as needed by e.g. Picard
        mapper.set_option("-M")

        cleanup = _new_cleanup_command(
            mapper,
            output_file,
            reference,
            paired_end=input_file_1 and input_file_2,
        )

        apply_options(mapper, mapping_options)
        apply_options(cleanup, cleanup_options)

        layout = "_PE" if input_file_2 else "_SE"
        description = _get_node_description(name="BWA",
                                            algorithm=algorithm.upper() + layout,
                                            input_files_1=input_file_1,
                                            input_files_2=input_file_2,
                                            prefix=prefix)

        commands = [mapper.finalize(), cleanup.finalize()]
        CommandNode.__init__(self,
                             command=ParallelCmds(commands),
                             description=description,
                             threads=threads,
                             dependencies=dependencies)
Exemple #4
0
    def __init__(self,
                 input_file,
                 output_prefix,
                 threads=1,
                 options={},
                 dependencies=()):
        """Set up a single-end command whose outputs share `output_prefix`.

        The shared part of the command comes from `_get_common_parameters`.
        Registered outputs are "<prefix>.settings", "<prefix>.truncated.gz"
        and "<prefix>.discarded.gz". `options` are applied via `apply_options`.
        """
        builder = _get_common_parameters(threads=threads, options=options)

        # Only the basename of the prefix is passed as "--basename", so that
        # all output files end up in the temp folder
        builder.set_option("--basename", "%(TEMP_OUT_BASENAME)s")

        gz_template = output_prefix + ".%s.gz"
        builder.set_kwargs(TEMP_OUT_BASENAME=os.path.basename(output_prefix),
                           OUT_SETTINGS=output_prefix + ".settings",
                           OUT_MATE_1=gz_template % ("truncated", ),
                           OUT_DISCARDED=gz_template % ("discarded", ))

        builder.set_option("--file1", "%(IN_READS_1)s")
        builder.set_kwargs(IN_READS_1=input_file)

        apply_options(builder, options)

        command = builder.finalize()
        description = "<AdapterRM (SE): %s -> '%s.*'>" \
            % (fileutils.describe_files(input_file), output_prefix)

        CommandNode.__init__(self,
                             command=command,
                             threads=threads,
                             description=description,
                             dependencies=dependencies)
Exemple #5
0
    def _mapdamage_rescale(self, config, destination, prefix, files_and_nodes):
        """Build the plot -> model -> rescale -> validate chain of nodes.

        Returns a tuple of ({rescaled BAM filename: validation node},
        (model node,)). The rescaled BAM is named after `self.folder`.
        """
        output_filename = self.folder + ".rescaled.bam"

        # Step 1: basic plots / table files
        plot_node = self._mapdamage_plot(config=config,
                                         destination=destination,
                                         prefix=prefix,
                                         files_and_nodes=files_and_nodes)

        # Step 2: model of post-mortem DNA damage; depends on the plot node
        model_builder = MapDamageModelNode.customize(
            reference=prefix["Reference"],
            directory=destination,
            dependencies=plot_node,
        )
        apply_options(model_builder.command, self.options["mapDamage"])
        model_node = model_builder.build_node()

        # Step 3: rescaling of BAM quality scores; depends on the model node
        rescale_builder = MapDamageRescaleNode.customize(
            config=config,
            reference=prefix["Reference"],
            input_files=files_and_nodes.keys(),
            output_file=output_filename,
            directory=destination,
            dependencies=model_node,
        )
        apply_options(rescale_builder.command, self.options["mapDamage"])
        rescale_node = rescale_builder.build_node()

        # Step 4: indexing / validation; an index is requested when regions of
        # interest are set or when the "RealignedBAM" feature is enabled
        index_required = (bool(prefix.get("RegionsOfInterest"))
                          or self.options["Features"]["RealignedBAM"])
        validate_node = index_and_validate_bam(config, prefix, rescale_node,
                                               create_index=index_required)

        return {output_filename: validate_node}, (model_node,)
Exemple #6
0
def _apply_vcf_filter_options(vcffilter, genotyping, sample):
    """Apply the "VCF_Filter" settings to the "filter" command and build the node.

    The per-sample value under "MaxReadDepth", if truthy, is additionally set
    as "--max-read-depth" on the same command.
    """
    settings = genotyping["VCF_Filter"]
    filter_command = vcffilter.commands["filter"]
    apply_options(filter_command, settings)

    sample_max_depth = settings["MaxReadDepth"][sample]
    if sample_max_depth:
        filter_command.set_option("--max-read-depth", sample_max_depth)

    return vcffilter.build_node()
Exemple #7
0
    def _mapdamage_rescale(self, config, destination, prefix, files_and_nodes):
        """Build the rescale and validation nodes on top of the damage model.

        Returns a tuple of ({rescaled BAM filename: validation node},
        (model node,)). The model node comes from `self._mapdamage_model`.
        """
        model_node = self._mapdamage_model(config=config,
                                           destination=destination,
                                           prefix=prefix,
                                           files_and_nodes=files_and_nodes)

        bam_files = files_and_nodes.keys()
        rescaled_bam = self.folder + ".rescaled.bam"

        # BAM quality scores are rescaled using the model built above
        builder = MapDamageRescaleNode.customize(
            config=config,
            reference=prefix["Reference"],
            input_files=bam_files,
            output_file=rescaled_bam,
            directory=destination,
            dependencies=model_node,
        )
        apply_options(builder.command, self.options["mapDamage"])
        rescale_node = builder.build_node()

        # An index is requested when regions of interest are set or when the
        # "RealignedBAM" feature is enabled
        needs_index = (bool(prefix.get("RegionsOfInterest"))
                       or self.options["Features"]["RealignedBAM"])
        validate_node = index_and_validate_bam(config=config,
                                               prefix=prefix,
                                               node=rescale_node,
                                               create_index=needs_index)

        return {rescaled_bam: validate_node}, (model_node, )
Exemple #8
0
    def __init__(self, infile, outfile, regions, options, dependencies=()):
        """Set up a "vcf_filter" command piped through `bgzip` into `outfile`.

        One "--homozygous-chromosome" option is added per entry in
        regions["HomozygousContigs"]; `options` are then applied to the
        filter command via `apply_options`.
        """
        filter_cmd = factory.new("vcf_filter")
        filter_cmd.add_value("%(IN_VCF)s")

        for contig in regions["HomozygousContigs"]:
            filter_cmd.add_option("--homozygous-chromosome", contig)
        filter_cmd.set_kwargs(IN_VCF=infile, OUT_STDOUT=AtomicCmd.PIPE)

        apply_options(filter_cmd, options)

        # The filter's STDOUT is used as STDIN of bgzip
        compress_cmd = AtomicCmdBuilder(["bgzip"],
                                        IN_STDIN=filter_cmd,
                                        OUT_STDOUT=outfile)

        CommandNode.__init__(
            self,
            description="<VCFFilter: '%s' -> '%s'>" % (infile, outfile),
            command=ParallelCmds([filter_cmd.finalize(),
                                  compress_cmd.finalize()]),
            dependencies=dependencies,
        )
Exemple #9
0
    def __init__(
        self,
        reference,
        input_files,
        output_directory,
        title="mapDamage",
        options={},
        dependencies=(),
    ):
        """Set up a mapDamage run (with "--no-stats") on merged `input_files`.

        The BAM merge command from `merge_bam_files_command` is used as STDIN
        of mapDamage. All registered output files are located directly in
        `output_directory`; `options` are applied via `apply_options`.
        """
        merge = merge_bam_files_command(input_files)

        call = [
            "mapDamage",
            "--no-stats",
            # Prevent references with many contigs from using excessive
            # amounts of memory, at the cost of per-contig statistics:
            "--merge-reference-sequences",
            "-t", title,
            "-i", "-",
            "-d", "%(TEMP_DIR)s",
            "-r", "%(IN_REFERENCE)s",
        ]

        # Builder keyword -> filename inside `output_directory`
        output_names = {
            "OUT_FREQ_3p": "3pGtoA_freq.txt",
            "OUT_FREQ_5p": "5pCtoT_freq.txt",
            "OUT_COMP_USER": "dnacomp.txt",
            "OUT_PLOT_FRAG": "Fragmisincorporation_plot.pdf",
            "OUT_PLOT_LEN": "Length_plot.pdf",
            "OUT_LENGTH": "lgdistribution.txt",
            "OUT_MISINCORP": "misincorporation.txt",
            "OUT_LOG": "Runtime_log.txt",
        }
        outputs = {
            key: os.path.join(output_directory, name)
            for key, name in output_names.items()
        }

        builder = AtomicCmdBuilder(
            call,
            IN_STDIN=merge,
            IN_REFERENCE=reference,
            TEMP_OUT_STDOUT="pipe_mapDamage.stdout",
            TEMP_OUT_STDERR="pipe_mapDamage.stderr",
            CHECK_RSCRIPT=RSCRIPT_VERSION,
            CHECK_MAPDAMAGE=MAPDAMAGE_VERSION,
            **outputs
        )

        apply_options(builder, options)

        commands = ParallelCmds([merge, builder.finalize()])
        description = "<mapDamage (plots): %s -> '%s'>" % (
            describe_files(merge.input_files),
            output_directory,
        )

        CommandNode.__init__(
            self,
            command=commands,
            description=description,
            dependencies=dependencies,
        )
Exemple #10
0
    def __init__(
            self,
            reference,
            infile,
            bedfile,
            outfile,
            mpileup_options={},
            bcftools_options={},
            dependencies=(),
    ):
        """Set up `bcftools mpileup` piped into `bcftools call`.

        `mpileup_options` are applied to the pileup command and
        `bcftools_options` to the calling command. The "--regions-file"
        option is only set when `bedfile` is truthy.
        """
        pileup = AtomicCmdBuilder(("bcftools", "mpileup", "%(IN_BAMFILE)s"),
                                  IN_BAMFILE=infile,
                                  IN_INTERVALS=bedfile,
                                  OUT_STDOUT=AtomicCmd.PIPE,
                                  CHECK_VERSION=BCFTOOLS_VERSION)

        # Read-groups are ignored for the pileup
        pileup.add_option("--ignore-RG")
        # Reference sequence (FASTA)
        pileup.add_option("--fasta-ref", reference)
        # Output type "u"; per the bcftools manual this is uncompressed BCF
        pileup.add_option("--output-type", "u")

        if bedfile:
            pileup.set_option("--regions-file", "%(IN_INTERVALS)s")

        apply_options(pileup, mpileup_options)

        # The calling step reads the pileup from STDIN ("-")
        caller = AtomicCmdBuilder(("bcftools", "call", "-"),
                                  IN_STDIN=pileup,
                                  IN_BAMFILE=infile,
                                  OUT_STDOUT=outfile,
                                  CHECK_VERSION=BCFTOOLS_VERSION)

        # Output type "z"; per the bcftools manual this is compressed VCF
        caller.set_option("--output-type", "z")

        apply_options(caller, bcftools_options)

        CommandNode.__init__(
            self,
            description="<GenotypeRegions: '%s' -> '%s'>" % (infile, outfile),
            command=ParallelCmds([pileup.finalize(), caller.finalize()]),
            dependencies=dependencies,
        )
Exemple #11
0
    def _mapdamage_plot(self, config, destination, prefix, files_and_nodes):
        """Build and return the mapDamage plot node for this library.

        The keys of `files_and_nodes` are used as input files and its values
        as dependencies; the "mapDamage" options are applied to the command.
        """
        title = "mapDamage plot for library %r" % (self.name, )
        dependencies = files_and_nodes.values()

        builder = MapDamagePlotNode.customize(
            config=config,
            reference=prefix["Path"],
            input_files=files_and_nodes.keys(),
            output_directory=destination,
            title=title,
            dependencies=dependencies,
        )
        apply_options(builder.command, self.options["mapDamage"])

        return builder.build_node()
Exemple #12
0
    def _mapdamage_model(self, config, destination, prefix, files_and_nodes):
        """Build and return the mapDamage model node.

        The model node depends on the plot node created by
        `self._mapdamage_plot`; the "mapDamage" options are applied to it.
        """
        # Basic plots / table files are generated first
        plot_node = self._mapdamage_plot(config=config,
                                         destination=destination,
                                         prefix=prefix,
                                         files_and_nodes=files_and_nodes)

        # Model of post-mortem DNA damage
        builder = MapDamageModelNode.customize(
            reference=prefix["Reference"],
            directory=destination,
            dependencies=plot_node,
        )
        apply_options(builder.command, self.options["mapDamage"])

        return builder.build_node()
Exemple #13
0
    def _mapdamage_model(self, config, destination, prefix, files_and_nodes):
        """Return a node building the model of post-mortem DNA damage.

        A plot node is created via `self._mapdamage_plot` and passed as the
        dependency of the model node.
        """
        plot_arguments = {
            "config": config,
            "destination": destination,
            "prefix": prefix,
            "files_and_nodes": files_and_nodes,
        }
        # Generates basic plots / table files
        plot = self._mapdamage_plot(**plot_arguments)

        model = MapDamageModelNode.customize(reference=prefix["Reference"],
                                             directory=destination,
                                             dependencies=plot)
        # The "mapDamage" options are applied before the node is built
        apply_options(model.command, self.options["mapDamage"])
        model_node = model.build_node()

        return model_node
Exemple #14
0
    def _mapdamage_plot(self, config, destination, prefix, files_and_nodes):
        """Return a node generating mapDamage plots into `destination`.

        Input files are the keys of `files_and_nodes`; the corresponding
        values are passed as dependencies of the node.
        """
        plot_title = "mapDamage plot for library %r" % (self.name,)
        upstream_nodes = files_and_nodes.values()

        plot = MapDamagePlotNode.customize(config=config,
                                           reference=prefix["Path"],
                                           input_files=files_and_nodes.keys(),
                                           output_directory=destination,
                                           title=plot_title,
                                           dependencies=upstream_nodes)

        # The "mapDamage" options are applied before the node is built
        apply_options(plot.command, self.options["mapDamage"])
        plot_node = plot.build_node()

        return plot_node
Exemple #15
0
    def __init__(
            self,
            input_file_fq_1,
            input_file_fq_2,
            input_file_sai_1,
            input_file_sai_2,
            output_file,
            reference,
            prefix,
            mapping_options={},
            cleanup_options={},
            dependencies=(),
    ):
        """Set up `bwa sampe` piped into a cleanup command.

        The two SAI files and the two FASTQ files are passed, in that order,
        after `prefix`. `mapping_options` are applied to the sampe command and
        `cleanup_options` to the cleanup command.
        """
        call = ("bwa", "sampe", prefix,
                "%(IN_SAI_1)s", "%(IN_SAI_2)s",
                "%(IN_FQ_1)s", "%(IN_FQ_2)s")

        sampe_cmd = _new_bwa_command(call,
                                     prefix,
                                     IN_SAI_1=input_file_sai_1,
                                     IN_SAI_2=input_file_sai_2,
                                     IN_FQ_1=input_file_fq_1,
                                     IN_FQ_2=input_file_fq_2,
                                     OUT_STDOUT=AtomicCmd.PIPE)

        # This node always requests paired-end cleanup
        cleanup_cmd = _new_cleanup_command(sampe_cmd,
                                           output_file,
                                           reference,
                                           paired_end=True)

        apply_options(sampe_cmd, mapping_options)
        apply_options(cleanup_cmd, cleanup_options)

        commands = ParallelCmds([sampe_cmd.finalize(),
                                 cleanup_cmd.finalize()])
        description = _get_node_description(name="BWA Sampe",
                                            input_files_1=input_file_fq_1,
                                            input_files_2=input_file_fq_2,
                                            prefix=prefix)

        CommandNode.__init__(self,
                             command=commands,
                             description=description,
                             dependencies=dependencies)
Exemple #16
0
    def _build_bowtie2(self, config, prefix, record, parameters):
        """Build the Bowtie2 mapping node(s) for the given parameters.

        The "aln" command gets "--phred33" when the "QualityOffset" option is
        33 and "--phred64" otherwise, followed by the Bowtie2 aligner options.
        """
        self._set_pe_input_files(parameters)
        node = Bowtie2Node.customize(threads=config.bowtie2_max_threads,
                                     **parameters)

        aligner = node.commands["aln"]
        is_phred33 = self.options["QualityOffset"] == 33
        aligner.set_option("--phred33" if is_phred33 else "--phred64")

        # User options are applied after the quality-offset flag above
        apply_options(aligner, self.options["Aligners"]["Bowtie2"])

        return self._finalize_nodes(config, prefix, parameters, node)
Exemple #17
0
    def _build_bwa_algorithm(self, config, prefix, record, parameters):
        """Build the BWA mapping node(s) for the given parameters.

        Raises MakefileError if the "QualityOffset" option is anything other
        than 33. Otherwise the BWA aligner options are applied to the "aln"
        command and the result of `self._finalize_nodes` is returned.
        """
        if self.options["QualityOffset"] != 33:
            # NOTE(review): assumes the algorithm name is passed along as
            # parameters["algorithm"] -- confirm against the caller. The
            # fallback keeps the message readable if it is not.
            algorithm = parameters.get("algorithm", "selected")
            # The message previously contained an un-interpolated "%r"
            raise MakefileError("Mapping with BWA using the %r algorithm "
                                "currently does not support QualityOffsets "
                                "other than 33; please convert your FASTQ "
                                "if you wish to proceed." % (algorithm, ))

        self._set_pe_input_files(parameters)
        node = BWAAlgorithmNode.customize(**parameters)

        apply_options(node.commands["aln"],
                      self.options["Aligners"]["BWA"])

        return self._finalize_nodes(config, prefix, parameters, node)
Exemple #18
0
    def __init__(
            self,
            input_file_1,
            input_file_2,
            output_prefix,
            collapse=True,
            threads=1,
            options={},
            dependencies=(),
    ):
        """Set up a paired-end command whose outputs share `output_prefix`.

        The shared part of the command comes from `_get_common_parameters`.
        When `collapse` is truthy, "--collapse" is set and the two collapsed
        output files are registered as well. `options` are applied via
        `apply_options`.
        """
        builder = _get_common_parameters(threads=threads, options=options)

        # Only the basename of the prefix is passed as "--basename", to
        # ensure that all output files end up in the temp folder
        builder.set_option("--basename", "%(TEMP_OUT_BASENAME)s")

        gz_template = output_prefix + ".%s.gz"
        builder.set_kwargs(
            TEMP_OUT_BASENAME=os.path.basename(output_prefix),
            OUT_SETTINGS=output_prefix + ".settings",
            OUT_READS_1=gz_template % ("pair1.truncated", ),
            OUT_READS_2=gz_template % ("pair2.truncated", ),
            OUT_SINGLETON=gz_template % ("singleton.truncated", ),
            OUT_DISCARDED=gz_template % ("discarded", ),
        )

        if collapse:
            builder.set_option("--collapse")
            builder.set_kwargs(
                OUT_COLLAPSED=gz_template % ("collapsed", ),
                OUT_COLLAPSED_TRUNC=gz_template % ("collapsed.truncated", ),
            )

        for option, placeholder in (("--file1", "%(IN_READS_1)s"),
                                    ("--file2", "%(IN_READS_2)s")):
            builder.set_option(option, placeholder)
        builder.set_kwargs(IN_READS_1=input_file_1, IN_READS_2=input_file_2)

        apply_options(builder, options)

        command = builder.finalize()
        description = "<AdapterRM (PE): %s -> '%s.*'>" % (
            fileutils.describe_paired_files(input_file_1, input_file_2),
            output_prefix,
        )

        CommandNode.__init__(self,
                             command=command,
                             threads=threads,
                             description=description,
                             dependencies=dependencies)
Exemple #19
0
    def _init_raw_reads(self, config, record):
        """Set up the adapter-removal node for raw reads.

        Populates `self.files` with the expected output filenames, sets
        `self.stats` to the ".settings" file and `self.nodes` to a 1-tuple
        containing the built node.
        """
        ar_options = dict(record["Options"]["AdapterRemoval"])
        # "--collapse" is handled by the node itself, so it is removed from
        # the options; a value of None counts as enabled
        collapse_reads = ar_options.pop("--collapse")
        if not collapse_reads:
            collapse_reads = collapse_reads is None

        init_args = {
            "output_prefix": os.path.join(self.folder, "reads"),
            "output_format": record["Options"]["CompressionFormat"],
            "threads": config.adapterremoval_max_threads,
        }
        # Leaves a single "%s" slot for the read-type part of the filename
        output_tmpl = "{output_prefix}.%s.{output_format}".format(**init_args)

        data = record["Data"]
        if "SE" in data:
            self.files["Single"] = output_tmpl % ("truncated", )
            init_args["input_files"] = data["SE"]
            command = SE_AdapterRemovalNode.customize(**init_args)
        else:
            self.files["Singleton"] = output_tmpl % ("singleton.truncated", )
            self.files["Paired"] = output_tmpl % ("pair{Pair}.truncated", )

            if collapse_reads:
                self.files["Collapsed"] = output_tmpl % ("collapsed", )
                self.files["CollapsedTruncated"] = \
                    output_tmpl % ("collapsed.truncated", )

            init_args["collapse"] = collapse_reads
            init_args["input_files_1"] = data["PE_1"]
            init_args["input_files_2"] = data["PE_2"]
            command = PE_AdapterRemovalNode.customize(**init_args)

        builder = command.command

        # A user-specified list of adapters is registered as an input file
        if "--adapter-list" in ar_options:
            adapter_list = ar_options.pop("--adapter-list")
            builder.set_option("--adapter-list", "%(IN_ADAPTER_LIST)s")
            builder.set_kwargs(IN_ADAPTER_LIST=adapter_list)

        apply_options(builder, ar_options)

        # A "Solexa" input offset is written out as "64"
        if self.quality_offset == "Solexa":
            output_quality = "64"
        else:
            output_quality = self.quality_offset

        builder.set_option("--qualitybase", self.quality_offset)
        builder.set_option("--qualitybase-output", output_quality)

        self.stats = os.path.join(self.folder, "reads.settings")
        self.nodes = (command.build_node(), )
Exemple #20
0
def build_msa_nodes(options, settings, regions, filtering, dependencies):
    """Build the nodes collecting, aligning and optionally filtering sequences.

    Raises RuntimeError if settings["Program"] is not "mafft" (compared
    case-insensitively). If settings["Enabled"] is false, no alignment nodes
    are created and the collected FASTA files are used as is.

    Returns a list of nodes: the FilterSingletonsNode instances if any value
    in `filtering` is truthy, otherwise the alignment (or collection) nodes.
    """
    if settings["Program"].lower() != "mafft":
        raise RuntimeError("Only MAFFT support has been implemented!")

    sequencedir = os.path.join(options.destination, "alignments",
                               regions["Name"])
    # Run on full set of sequences
    sequences = regions["Sequences"][None]

    collect_node = CollectSequencesNode(fasta_files=regions["Genotypes"],
                                        destination=sequencedir,
                                        sequences=sequences,
                                        dependencies=dependencies)

    if settings["Enabled"]:
        fasta_files = {}
        algorithm = settings["MAFFT"]["Algorithm"]
        for sequence in sequences:
            input_file = os.path.join(sequencedir, sequence + ".fasta")
            output_file = os.path.join(sequencedir, sequence + ".afa")

            mafft = MAFFTNode.customize(input_file=input_file,
                                        output_file=output_file,
                                        algorithm=algorithm,
                                        dependencies=collect_node)
            apply_options(mafft.command, settings["MAFFT"])
            fasta_files[output_file] = mafft.build_node()
    else:
        fasta_files = dict((filename, collect_node)
                           for filename in collect_node.output_files)

    # values() / items() exist on both Python 2 and 3 dicts, unlike the
    # itervalues() / iteritems() methods; list() keeps the return type a list
    if not any(filtering.values()):
        return list(fasta_files.values())

    destination = sequencedir + ".filtered"
    filtering = dict(filtering)
    filtered_nodes = []

    for (filename, fasta_node) in fasta_files.items():
        output_filename = fileutils.reroot_path(destination, filename)
        filtered_node = FilterSingletonsNode(input_file=filename,
                                             output_file=output_filename,
                                             filter_by=filtering,
                                             dependencies=fasta_node)

        filtered_nodes.append(filtered_node)

    return filtered_nodes
Exemple #21
0
    def __init__(self, reference, directory, options={}, dependencies=()):
        """Set up a mapDamage run with "--stats-only".

        The MCMC result files and "dnacomp_genome.csv" are registered as
        outputs inside `directory`; the remaining files are registered as
        temporary outputs. `options` are applied via `apply_options`.
        """
        command = AtomicCmdBuilder(
            [
                "mapDamage",
                "--stats-only",
                "-r",
                "%(IN_REFERENCE)s",
                "-d",
                "%(TEMP_DIR)s",
            ],
            IN_REFERENCE=reference,
            TEMP_OUT_FREQ_3p="3pGtoA_freq.txt",
            TEMP_OUT_FREQ_5p="5pCtoT_freq.txt",
            # Was "******", which looks like a masked filename; the plots
            # node registers "dnacomp.txt" under the matching COMP_USER key
            TEMP_OUT_COMP_USER="dnacomp.txt",
            TEMP_OUT_MISINCORP="misincorporation.txt",
            TEMP_OUT_LOG="Runtime_log.txt",
            TEMP_OUT_STDOUT="pipe_mapDamage.stdout",
            TEMP_OUT_STDERR="pipe_mapDamage.stderr",
            OUT_COMP_GENOME=os.path.join(directory, "dnacomp_genome.csv"),
            OUT_MCMC_PROBS=os.path.join(directory, "Stats_out_MCMC_correct_prob.csv"),
            OUT_MCMC_HIST=os.path.join(directory, "Stats_out_MCMC_hist.pdf"),
            OUT_MCMC_ITER=os.path.join(directory, "Stats_out_MCMC_iter.csv"),
            OUT_MCMC_ITERSUM=os.path.join(
                directory, "Stats_out_MCMC_iter_summ_stat.csv"
            ),
            OUT_MCMC_POSTPRED=os.path.join(directory, "Stats_out_MCMC_post_pred.pdf"),
            OUT_MCMC_TRACE=os.path.join(directory, "Stats_out_MCMC_trace.pdf"),
            CHECK_RSCRIPT=RSCRIPT_VERSION,
            CHECK_MAPDAMAGE=MAPDAMAGE_VERSION,
            CHECK_R_INLINE=rtools.requirement("inline"),
            CHECK_R_GGPLOT2=rtools.requirement("ggplot2"),
            CHECK_R_RCPP=rtools.requirement("Rcpp"),
            CHECK_R_GAM=rtools.requirement("gam"),
            CHECK_R_RCPPGSL=rtools.requirement("RcppGSL"),
        )

        apply_options(command, options)

        # Kept on the instance; not otherwise used within this method
        self._directory = directory

        CommandNode.__init__(
            self,
            command=command.finalize(),
            description="<mapDamage (model): %r>" % (directory,),
            dependencies=dependencies,
        )
Exemple #22
0
def build_msa_nodes(options, settings, regions, filtering, dependencies):
    """Create nodes for sequence collection, alignment and singleton filtering.

    Raises RuntimeError unless settings["Program"] equals "mafft" (ignoring
    case). Alignment nodes are only created when settings["Enabled"] is true;
    otherwise the output files of the collection node are used directly.

    Returns a list of nodes: one FilterSingletonsNode per FASTA file if any
    value in `filtering` is truthy, else the alignment / collection nodes.
    """
    if settings["Program"].lower() != "mafft":
        raise RuntimeError("Only MAFFT support has been implemented!")

    sequencedir = os.path.join(options.destination, "alignments", regions["Name"])
    # Run on full set of sequences
    sequences = regions["Sequences"][None]

    collector = CollectSequencesNode(fasta_files=regions["Genotypes"],
                                     destination=sequencedir,
                                     sequences=sequences,
                                     dependencies=dependencies)

    if settings["Enabled"]:
        fasta_files = {}
        algorithm = settings["MAFFT"]["Algorithm"]
        for sequence in sequences:
            input_file = os.path.join(sequencedir, sequence + ".fasta")
            output_file = os.path.join(sequencedir, sequence + ".afa")

            mafft = MAFFTNode.customize(input_file=input_file,
                                        output_file=output_file,
                                        algorithm=algorithm,
                                        dependencies=collector)
            apply_options(mafft.command, settings["MAFFT"])
            fasta_files[output_file] = mafft.build_node()
    else:
        fasta_files = dict((filename, collector)
                           for filename in collector.output_files)

    # itervalues() / iteritems() are only available on Python 2 dicts;
    # values() / items() work on both major versions, and list() ensures
    # that a list is returned in either case
    if not any(filtering.values()):
        return list(fasta_files.values())

    destination = sequencedir + ".filtered"
    filtering = dict(filtering)
    filtered_nodes = []

    for (filename, source_node) in fasta_files.items():
        output_filename = fileutils.reroot_path(destination, filename)
        filtered_node = FilterSingletonsNode(input_file=filename,
                                             output_file=output_filename,
                                             filter_by=filtering,
                                             dependencies=source_node)

        filtered_nodes.append(filtered_node)

    return filtered_nodes
Exemple #23
0
    def _init_raw_reads(self, config, record):
        """Create the adapter-removal node and record its expected outputs.

        Fills in `self.files`, `self.stats` and `self.nodes`; the latter is a
        1-tuple holding the node built from the customized command.
        """
        options = record["Options"]
        ar_options = dict(options["AdapterRemoval"])
        # The node itself takes care of "--collapse"; None means enabled
        collapse_value = ar_options.pop("--collapse")
        collapse_reads = collapse_value or collapse_value is None

        init_args = {}
        init_args["output_prefix"] = os.path.join(self.folder, "reads")
        init_args["output_format"] = options["CompressionFormat"]
        init_args["threads"] = config.adapterremoval_max_threads
        # The "%s" slot is filled in per read type below
        output_tmpl = "{output_prefix}.%s.{output_format}".format(**init_args)

        if "SE" in record["Data"]:
            self.files["Single"] = output_tmpl % ("truncated",)
            init_args["input_files"] = record["Data"]["SE"]
            node_class = SE_AdapterRemovalNode
        else:
            self.files["Singleton"] = output_tmpl % ("singleton.truncated",)
            self.files["Paired"] = output_tmpl % ("pair{Pair}.truncated",)

            if collapse_reads:
                self.files["Collapsed"] = output_tmpl % ("collapsed",)
                self.files["CollapsedTruncated"] = output_tmpl % ("collapsed.truncated",)

            init_args["collapse"] = collapse_reads
            init_args["input_files_1"] = record["Data"]["PE_1"]
            init_args["input_files_2"] = record["Data"]["PE_2"]
            node_class = PE_AdapterRemovalNode

        command = node_class.customize(**init_args)

        # Any user-specified list of adapters is registered as an input file
        if "--adapter-list" in ar_options:
            adapter_list = ar_options.pop("--adapter-list")
            command.command.set_option("--adapter-list", "%(IN_ADAPTER_LIST)s")
            command.command.set_kwargs(IN_ADAPTER_LIST=adapter_list)

        apply_options(command.command, ar_options)

        # Output uses the input offset, except that "Solexa" becomes "64"
        input_quality = self.quality_offset
        output_quality = "64" if input_quality == "Solexa" else input_quality

        command.command.set_option("--qualitybase", self.quality_offset)
        command.command.set_option("--qualitybase-output", output_quality)

        self.stats = os.path.join(self.folder, "reads.settings")
        self.nodes = (command.build_node(),)
Exemple #24
0
    def __init__(
        self,
        reference,
        input_files,
        output_file,
        directory,
        options={},
        dependencies=(),
    ):
        """Set up a mapDamage run with "--rescale-only", writing `output_file`.

        The BAM merge command from `merge_bam_files_command` is used as STDIN
        of mapDamage. `options` are applied via `apply_options`; `directory`
        is stored on the instance.
        """
        merge = merge_bam_files_command(input_files)

        call = [
            "mapDamage",
            "--rescale-only",
            "-i", "-",
            "-d", "%(TEMP_DIR)s",
            "-r", "%(IN_REFERENCE)s",
            "--rescale-out", "%(OUT_BAM)s",
        ]
        builder = AtomicCmdBuilder(
            call,
            IN_STDIN=merge,
            IN_REFERENCE=reference,
            TEMP_OUT_LOG="Runtime_log.txt",
            TEMP_OUT_CSV="Stats_out_MCMC_correct_prob.csv",
            OUT_BAM=output_file,
            CHECK_VERSION=MAPDAMAGE_VERSION,
        )

        apply_options(builder, options)

        # Kept on the instance; not otherwise used within this method
        self._directory = directory

        commands = ParallelCmds([merge, builder.finalize()])
        description = "<mapDamage (rescale): %s -> %r>" % (
            describe_files(merge.input_files),
            output_file,
        )

        CommandNode.__init__(
            self,
            command=commands,
            description=description,
            dependencies=dependencies,
        )
Exemple #25
0
    def _build_bwa_backtrack_aln(self, parameters, input_file, output_file):
        """Build a BWABacktrack node mapping `input_file` into `output_file`.

        Threads, prefix, reference and dependencies are taken from
        `parameters`; the BWA aligner options are applied to the "aln" command.
        """
        node = BWABacktrack.customize(input_file=input_file,
                                      output_file=output_file,
                                      threads=parameters["threads"],
                                      prefix=parameters["prefix"],
                                      reference=parameters["reference"],
                                      dependencies=parameters["dependencies"])

        use_seed = self.options["Aligners"]["BWA"]["UseSeed"]
        if not use_seed:
            # NOTE(review): presumably a seed length this large effectively
            # disables seeding -- confirm against the bwa manual
            node.commands["aln"].set_option("-l", 2 ** 16 - 1)

        quality_offset = self.options["QualityOffset"]
        if quality_offset in (64, "Solexa"):
            node.commands["aln"].set_option("-I")

        # User options are applied last
        apply_options(node.commands["aln"], self.options["Aligners"]["BWA"])

        return node.build_node()
Exemple #26
0
    def __init__(
            self,
            input_file,
            output_file,
            reference,
            prefix,
            threads=1,
            mapping_options={},
            dependencies=(),
    ):
        """Set up a `bwa aln` command writing its STDOUT to `output_file`.

        `prefix` and the input file are added as positional values, "-t" is
        set to the thread count returned by `_get_max_threads`, and
        `mapping_options` are applied via `apply_options`.
        """
        threads = _get_max_threads(reference, threads)

        builder = _new_bwa_command(("bwa", "aln"),
                                   prefix,
                                   IN_FILE=input_file,
                                   OUT_STDOUT=output_file)
        for value in (prefix, "%(IN_FILE)s"):
            builder.add_value(value)
        builder.set_option("-t", threads)

        apply_options(builder, mapping_options)

        description = _get_node_description(name="BWA",
                                            algorithm="Backtrack",
                                            input_files_1=input_file,
                                            prefix=prefix,
                                            threads=threads)

        CommandNode.__init__(self,
                             command=builder.finalize(),
                             description=description,
                             threads=threads,
                             dependencies=dependencies)
Exemple #27
0
    def __init__(
            self,
            input_file_fq,
            input_file_sai,
            output_file,
            reference,
            prefix,
            mapping_options={},
            cleanup_options={},
            dependencies=(),
    ):
        """Set up `bwa samse` piped into a cleanup command.

        `prefix`, the SAI file and the FASTQ file are added, in that order,
        as positional values. `mapping_options` are applied to the samse
        command and `cleanup_options` to the cleanup command.
        """
        samse_cmd = _new_bwa_command(("bwa", "samse"),
                                     prefix,
                                     IN_FILE_SAI=input_file_sai,
                                     IN_FILE_FQ=input_file_fq,
                                     OUT_STDOUT=AtomicCmd.PIPE)
        for value in (prefix, "%(IN_FILE_SAI)s", "%(IN_FILE_FQ)s"):
            samse_cmd.add_value(value)

        cleanup_cmd = _new_cleanup_command(samse_cmd, output_file, reference)

        apply_options(samse_cmd, mapping_options)
        apply_options(cleanup_cmd, cleanup_options)

        commands = ParallelCmds([samse_cmd.finalize(),
                                 cleanup_cmd.finalize()])
        description = _get_node_description(name="BWA Samse",
                                            input_files_1=input_file_fq,
                                            prefix=prefix)

        CommandNode.__init__(self,
                             command=commands,
                             description=description,
                             dependencies=dependencies)
Exemple #28
0
def test_apply_options__single_option__boolean__set_when_value_is_none():
    """A value of None must set the option as a bare flag, exactly once."""
    mock = flexmock()
    # Without a call-count modifier flexmock does not verify that the
    # expectation was ever met, so the test would also pass if the option
    # were silently dropped; `.once()` makes the call mandatory.
    mock.should_receive('set_option').with_args('-v').once()
    apply_options(mock, {"-v": None})
Exemple #29
0
def test_apply_options__single_option__user_pred__ignore_when_pred_is_false():
    """Apply a key filtered by `_user_pred` to a mock with no expectations."""
    builder = flexmock()
    # No expectations are registered on the mock; per the test name, the
    # key is expected to be rejected by `_user_pred`.
    options = {"BAR_FOO": 17}
    apply_options(builder, options, _user_pred)
Exemple #30
0
def test_apply_options__single_option__user_pred__set_when_pred_is_true():
    """Expect one `set_option('FOO_BAR', 17)` call when using `_user_pred`."""
    builder = flexmock()
    expectation = builder.should_receive('set_option')
    expectation.with_args('FOO_BAR', 17).once()
    options = {"FOO_BAR": 17}
    apply_options(builder, options, _user_pred)
Exemple #31
0
def test_apply_options__single_option__default_pred__ignore_when_pred_is_false():
    """Apply the key "Other" with the default predicate to a bare mock."""
    builder = flexmock()
    # No expectations are registered; per the test name, the default
    # predicate is expected to reject this key.
    options = {"Other": None}
    apply_options(builder, options)
Exemple #32
0
def test_apply_options__single_option__default_pred__set_when_pred_is_true():
    """Expect one `set_option('--foo', 17)` call with the default predicate."""
    builder = flexmock()
    expectation = builder.should_receive('set_option')
    expectation.with_args('--foo', 17).once()
    options = {"--foo": 17}
    apply_options(builder, options)
Exemple #33
0
def test_apply_options__single_option__default_pred__set_when_pred_is_true():
    """Expect one `set_option('--foo', 17)` call with the default predicate."""
    builder = flexmock()
    expectation = builder.should_receive('set_option')
    expectation.with_args('--foo', 17).once()
    options = {"--foo": 17}
    apply_options(builder, options)
Exemple #34
0
def test_apply_options__single_option__user_pred__ignore_when_pred_is_false():
    """Apply a key filtered by `_user_pred` to a mock with no expectations."""
    builder = flexmock()
    # No expectations are registered on the mock; per the test name, the
    # key is expected to be rejected by `_user_pred`.
    options = {"BAR_FOO": 17}
    apply_options(builder, options, _user_pred)
Exemple #35
0
def test_apply_options__multiple_option():
    """Expect one `add_option('--foo', value)` call per value in the list."""
    builder = flexmock()
    values = [3, 17]
    for value in values:
        builder.should_receive('add_option').with_args('--foo', value).once()
    apply_options(builder, {"--foo": values})
Exemple #36
0
def test_apply_options__single_option__user_pred__set_when_pred_is_true():
    """Expect one `set_option('FOO_BAR', 17)` call when using `_user_pred`."""
    builder = flexmock()
    expectation = builder.should_receive('set_option')
    expectation.with_args('FOO_BAR', 17).once()
    options = {"FOO_BAR": 17}
    apply_options(builder, options, _user_pred)
Exemple #37
0
    def __init__(
            self,
            input_file_1,
            input_file_2,
            output_file,
            reference,
            prefix,
            threads=2,
            log_file=None,
            mapping_options={},
            cleanup_options={},
            dependencies=(),
    ):
        """Build a node running 'bowtie2' in parallel with a cleanup command.

        The aligner writes to a pipe (AtomicCmd.PIPE) and is handed to
        `_new_cleanup_command` together with `output_file` and `reference`.

        input_file_1 -- required; passed via '-U' when it is the only input,
                        or via '-1' when `input_file_2` is also given.
        input_file_2 -- optional second input, passed via '-2'.
        output_file -- forwarded to the cleanup command.
        reference -- forwarded to the cleanup command and used to resolve
                     the thread count via `_get_max_threads`.
        prefix -- value of the '-x' option.
        threads -- requested thread count; the value returned by
                   `_get_max_threads` is what is actually used.
        log_file -- if not None, bound to the aligner's OUT_STDERR.
        mapping_options -- options applied to the bowtie2 command.
        cleanup_options -- options applied to the cleanup command.
        dependencies -- forwarded unchanged to CommandNode.

        Raises NodeError if `input_file_1` is not specified.
        """
        aln = _bowtie2_template(
            ("bowtie2", ),
            prefix,
            OUT_STDOUT=AtomicCmd.PIPE,
            CHECK_VERSION=BOWTIE2_VERSION,
        )

        aln.set_option("-x", prefix)

        if log_file is not None:
            aln.set_kwargs(OUT_STDERR=log_file)

        if not input_file_1:
            raise NodeError("Input 1, OR both input 1 and input 2 must "
                            "be specified for Bowtie2 node")
        elif input_file_2:
            aln.add_option("-1", input_file_1)
            aln.add_option("-2", input_file_2)
        else:
            aln.add_option("-U", input_file_1)

        # Use the resolved thread count everywhere, so that the description
        # and the threads reported to CommandNode match what bowtie2 is
        # actually invoked with.
        threads = _get_max_threads(reference, threads)
        aln.set_option("--threads", threads)

        cleanup = _new_cleanup_command(aln,
                                       output_file,
                                       reference,
                                       paired_end=input_file_1
                                       and input_file_2)

        apply_options(aln, mapping_options)
        apply_options(cleanup, cleanup_options)

        algorithm = "PE" if input_file_2 else "SE"
        description = _get_node_description(
            name="Bowtie2",
            algorithm=algorithm,
            input_files_1=input_file_1,
            input_files_2=input_file_2,
            prefix=prefix,
            threads=threads,
        )

        CommandNode.__init__(
            self,
            command=ParallelCmds([aln.finalize(),
                                  cleanup.finalize()]),
            description=description,
            threads=threads,
            dependencies=dependencies,
        )
Exemple #38
0
def test_apply_options__single_option__boolean__pop_when_value_is_false():
    """A value of False must remove the option via `pop_option`, once."""
    mock = flexmock()
    # Without a call-count modifier flexmock does not verify that the
    # expectation was ever met, so the test would also pass if the option
    # were simply ignored; `.once()` makes the call mandatory.
    mock.should_receive('pop_option').with_args('-v').once()
    apply_options(mock, {"-v": False})
Exemple #39
0
def test_apply_options__single_option__boolean__pop_when_value_is_false():
    """A value of False must remove the option via `pop_option`, once."""
    mock = flexmock()
    # Without a call-count modifier flexmock does not verify that the
    # expectation was ever met, so the test would also pass if the option
    # were simply ignored; `.once()` makes the call mandatory.
    mock.should_receive('pop_option').with_args('-v').once()
    apply_options(mock, {"-v": False})
Exemple #40
0
def test_apply_options__single_option__default_pred__ignore_false_pred():
    """Apply the key "Other" with the default predicate to a bare mock."""
    builder = flexmock()
    # No expectations are registered; per the test name, the default
    # predicate is expected to reject this key.
    options = {"Other": None}
    apply_options(builder, options)
Exemple #41
0
def test_apply_options__multiple_option():
    """Expect one `add_option('--foo', value)` call per value in the list."""
    builder = flexmock()
    values = [3, 17]
    for value in values:
        builder.should_receive('add_option').with_args('--foo', value).once()
    apply_options(builder, {"--foo": values})
Exemple #42
0
def test_apply_options__single_option__boolean__set_when_value_is_none():
    """A value of None must set the option as a bare flag, exactly once."""
    mock = flexmock()
    # Without a call-count modifier flexmock does not verify that the
    # expectation was ever met, so the test would also pass if the option
    # were silently dropped; `.once()` makes the call mandatory.
    mock.should_receive('set_option').with_args('-v').once()
    apply_options(mock, {"-v": None})