Exemple #1
0
def test_builder__set_kwargs__after_finalize():
    """set_kwargs must raise after finalize() and leave earlier kwargs intact."""
    cmd_builder = AtomicCmdBuilder("echo")
    cmd_builder.set_kwargs(IN_PATH="/a/b/")
    cmd_builder.finalize()

    # Modifying a finalized builder is an error ...
    assert_raises(
        AtomicCmdBuilderError,
        cmd_builder.set_kwargs,
        OUT_PATH="/dst/file"
    )
    # ... and the rejected key must not have been recorded
    assert_equal(cmd_builder.kwargs, {"IN_PATH": "/a/b/"})
Exemple #2
0
def test_builder__set_kwargs__after_finalize():
    """A finalized builder rejects set_kwargs and still reports its old kwargs."""
    kwargs_before_finalize = {"IN_PATH": "/a/b/"}

    finalized = AtomicCmdBuilder("echo")
    finalized.set_kwargs(**kwargs_before_finalize)
    finalized.finalize()

    late_kwargs = {"OUT_PATH": "/dst/file"}
    assert_raises(AtomicCmdBuilderError, finalized.set_kwargs, **late_kwargs)
    assert_equal(finalized.kwargs, kwargs_before_finalize)
Exemple #3
0
    def __init__(
            self,
            reference,
            infile,
            bedfile,
            outfile,
            mpileup_options={},
            bcftools_options={},
            dependencies=(),
    ):
        """Set up a 'bcftools mpileup | bcftools call' pipeline for one BAM file.

        Arguments:
        reference         -- FASTA file passed to mpileup via --fasta-ref.
        infile            -- BAM file used as input for mpileup.
        bedfile           -- Intervals file; if truthy, mpileup is restricted to it
                             via --regions-file.
        outfile           -- File receiving the STDOUT of 'bcftools call'.
        mpileup_options   -- Extra options applied to the mpileup command.
        bcftools_options  -- Extra options applied to the call command.
        dependencies      -- Forwarded unchanged to CommandNode.
        """
        # NOTE(review): the dict defaults are shared between calls; this is only
        # safe as long as apply_options does not modify them -- confirm.
        pileup = AtomicCmdBuilder(
            ("bcftools", "mpileup", "%(IN_BAMFILE)s"),
            IN_BAMFILE=infile,
            IN_INTERVALS=bedfile,
            OUT_STDOUT=AtomicCmd.PIPE,
            CHECK_VERSION=BCFTOOLS_VERSION,
        )

        always_on = (
            # Ignore read-groups for pileup
            ("--ignore-RG",),
            # Reference sequence (FASTA)
            ("--fasta-ref", reference),
            # 'u' selects uncompressed BCF in bcftools; output is piped to 'call'
            ("--output-type", "u"),
        )
        for option in always_on:
            pileup.add_option(*option)

        if bedfile:
            pileup.set_option("--regions-file", "%(IN_INTERVALS)s")

        apply_options(pileup, mpileup_options)

        caller = AtomicCmdBuilder(
            ("bcftools", "call", "-"),
            IN_STDIN=pileup,
            IN_BAMFILE=infile,
            OUT_STDOUT=outfile,
            CHECK_VERSION=BCFTOOLS_VERSION,
        )
        # 'z' selects compressed VCF in bcftools
        caller.set_option("--output-type", "z")
        apply_options(caller, bcftools_options)

        description = "<GenotypeRegions: '%s' -> '%s'>" % (infile, outfile)
        pipeline = ParallelCmds([pileup.finalize(), caller.finalize()])

        CommandNode.__init__(
            self,
            description=description,
            command=pipeline,
            dependencies=dependencies,
        )
Exemple #4
0
    def __init__(self, infile, outfile, regions, options, dependencies=()):
        """Set up a 'vcf_filter | bgzip' pipeline.

        Arguments:
        infile        -- VCF file given to vcf_filter as its input value.
        outfile       -- File receiving the STDOUT of bgzip.
        regions       -- Mapping with a "HomozygousContigs" entry; one
                         --homozygous-chromosome option is added per contig.
        options       -- Extra options applied to the vcf_filter command.
        dependencies  -- Forwarded unchanged to CommandNode.
        """
        filter_cmd = factory.new("vcf_filter")
        filter_cmd.add_value("%(IN_VCF)s")

        homozygous_contigs = regions["HomozygousContigs"]
        for contig_name in homozygous_contigs:
            filter_cmd.add_option("--homozygous-chromosome", contig_name)

        filter_cmd.set_kwargs(IN_VCF=infile, OUT_STDOUT=AtomicCmd.PIPE)
        apply_options(filter_cmd, options)

        # Compress the filtered records read from the filter's STDOUT
        compress_cmd = AtomicCmdBuilder(
            ["bgzip"],
            IN_STDIN=filter_cmd,
            OUT_STDOUT=outfile,
        )

        description = "<VCFFilter: '%s' -> '%s'>" % (infile, outfile)
        pipeline = ParallelCmds([filter_cmd.finalize(), compress_cmd.finalize()])

        CommandNode.__init__(
            self,
            description=description,
            command=pipeline,
            dependencies=dependencies,
        )
Exemple #5
0
    def __init__(self,
                 input_file,
                 output_file,
                 algorithm="auto",
                 options={},
                 dependencies=()):
        """Set up a MAFFT command writing its STDOUT to output_file.

        Arguments:
        input_file    -- FASTA file appended to the command as its input.
        output_file   -- File receiving the STDOUT of the command.
        algorithm     -- Key (lower-cased before lookup) into _PRESETS, which
                         supplies the start of the command-line.
        options       -- Extra options applied to the command.
        dependencies  -- Forwarded unchanged to CommandNode.
        """
        # NOTE(review): 'options' defaults to a shared dict; safe only while
        # apply_options leaves it unmodified -- confirm.
        preset_call = _PRESETS[algorithm.lower()]
        builder = AtomicCmdBuilder(
            preset_call + ["%(IN_FASTA)s"],
            IN_FASTA=input_file,
            OUT_STDOUT=output_file,
            CHECK_VERSION=MAFFT_VERSION,
        )
        apply_options(builder, options)

        self._output_file = output_file

        description = "<MAFFTNode (%s): '%s' -> '%s'>" % (
            algorithm, input_file, output_file)

        CommandNode.__init__(
            self,
            command=builder.finalize(),
            description=description,
            dependencies=dependencies,
        )
Exemple #6
0
def merge_bam_files_command(input_files):
    """Return a finalized 'samtools merge' command for the given BAM files.

    The command is invoked as 'samtools merge -u -' followed by the input
    files, with its STDOUT set to AtomicCmd.PIPE.
    """
    builder = AtomicCmdBuilder(
        ["samtools", "merge", "-u", "-"],
        OUT_STDOUT=AtomicCmd.PIPE,
        CHECK_VERSION=SAMTOOLS_VERSION,
    )
    builder.add_multiple_values(input_files)
    finalized_command = builder.finalize()
    return finalized_command
Exemple #7
0
def test_builder__finalize__calls_atomiccmd():
    """finalize() must instantiate AtomicCmd with the built call and kwargs."""
    expected_call = ["echo", "-out", "%(OUT_FILE)s", "%(IN_FILE)s"]
    expected_kwargs = {
        "IN_FILE": "/in/file",
        "OUT_FILE": "/out/file",
        "set_cwd": True,
    }
    invocations = []

    class _AtomicCmdMock:
        # Stand-in for AtomicCmd that validates and records its construction
        def __init__(self, *args, **kwargs):
            assert_equal(args, (expected_call,))
            assert_equal(kwargs, expected_kwargs)
            invocations.append(True)

    with Monkeypatch("paleomix.atomiccmd.builder.AtomicCmd", _AtomicCmdMock):
        cmd_builder = AtomicCmdBuilder("echo", set_cwd=True)
        cmd_builder.add_option("-out", "%(OUT_FILE)s")
        cmd_builder.add_value("%(IN_FILE)s")
        cmd_builder.set_kwargs(OUT_FILE="/out/file", IN_FILE="/in/file")

        cmd_builder.finalize()
        assert invocations
Exemple #8
0
    def __init__(
        self,
        reference,
        input_files,
        output_directory,
        title="mapDamage",
        options={},
        dependencies=(),
    ):
        """Set up a 'samtools merge | mapDamage --no-stats' pipeline.

        Arguments:
        reference         -- Reference passed to mapDamage via -r.
        input_files       -- BAM files merged and piped to mapDamage on STDIN.
        output_directory  -- Directory in which the registered output files
                             (plots, frequency tables, log, ...) are expected.
        title             -- Value passed to mapDamage via -t.
        options           -- Extra options applied to the mapDamage command.
        dependencies      -- Forwarded unchanged to CommandNode.
        """
        # NOTE(review): 'options' defaults to a shared dict; safe only while
        # apply_options leaves it unmodified -- confirm.

        def in_output_dir(filename):
            return os.path.join(output_directory, filename)

        merge = merge_bam_files_command(input_files)

        call = ["mapDamage", "--no-stats"]
        # Prevent references with many contigs from using excessive amounts of
        # memory, at the cost of per-contig statistics:
        call.append("--merge-reference-sequences")
        call.extend(["-t", title])
        call.extend(["-i", "-"])
        call.extend(["-d", "%(TEMP_DIR)s"])
        call.extend(["-r", "%(IN_REFERENCE)s"])

        command = AtomicCmdBuilder(
            call,
            IN_STDIN=merge,
            IN_REFERENCE=reference,
            OUT_FREQ_3p=in_output_dir("3pGtoA_freq.txt"),
            OUT_FREQ_5p=in_output_dir("5pCtoT_freq.txt"),
            OUT_COMP_USER=in_output_dir("dnacomp.txt"),
            OUT_PLOT_FRAG=in_output_dir("Fragmisincorporation_plot.pdf"),
            OUT_PLOT_LEN=in_output_dir("Length_plot.pdf"),
            OUT_LENGTH=in_output_dir("lgdistribution.txt"),
            OUT_MISINCORP=in_output_dir("misincorporation.txt"),
            OUT_LOG=in_output_dir("Runtime_log.txt"),
            TEMP_OUT_STDOUT="pipe_mapDamage.stdout",
            TEMP_OUT_STDERR="pipe_mapDamage.stderr",
            CHECK_RSCRIPT=RSCRIPT_VERSION,
            CHECK_MAPDAMAGE=MAPDAMAGE_VERSION,
        )
        apply_options(command, options)

        pipeline = ParallelCmds([merge, command.finalize()])
        description = "<mapDamage (plots): %s -> '%s'>" % (
            describe_files(merge.input_files),
            output_directory,
        )

        CommandNode.__init__(
            self,
            command=pipeline,
            description=description,
            dependencies=dependencies,
        )
Exemple #9
0
def test_builder__finalize__calls_atomiccmd():
    """Check that finalize() hands the built call and kwargs to AtomicCmd."""
    wanted_args = (["echo", "-out", "%(OUT_FILE)s", "%(IN_FILE)s"],)
    wanted_kwargs = {"IN_FILE": "/in/file",
                     "OUT_FILE": "/out/file",
                     "set_cwd": True}
    calls_seen = []

    class _AtomicCmdMock(object):
        # Replacement for AtomicCmd; verifies and records how it was created
        def __init__(self, *args, **kwargs):
            assert_equal(args, wanted_args)
            assert_equal(kwargs, wanted_kwargs)
            calls_seen.append(True)

    with Monkeypatch("paleomix.atomiccmd.builder.AtomicCmd", _AtomicCmdMock):
        echo_builder = AtomicCmdBuilder("echo", set_cwd=True)
        echo_builder.add_option("-out", "%(OUT_FILE)s")
        echo_builder.add_value("%(IN_FILE)s")
        echo_builder.set_kwargs(
            OUT_FILE="/out/file",
            IN_FILE="/in/file"
        )

        echo_builder.finalize()
        assert calls_seen
Exemple #10
0
    def __init__(self, input_file, k_groups, output_root,
                 samples=None, dependencies=()):
        """Set up an 'admixture' run for k_groups ancestral groups.

        Arguments:
        input_file    -- Input .bed file; the matching .bim and .fam files are
                         located by swapping the extension.
        k_groups      -- Number of groups; must be 2 or 3.
        output_root   -- Directory in which the .P, .Q and .log files are placed.
        samples       -- Optional mapping of rows; if any row has a value other
                         than '-' under the key 'Group(<k_groups>)', admixture
                         is run with --supervised.
        dependencies  -- Forwarded unchanged to CommandNode.
        """
        self._samples = samples
        self._input_file = input_file
        self._k_groups = k_groups

        group_key = "Group(%i)" % (self._k_groups,)
        # values() works on both Python 2 and 3, unlike itervalues(); bool()
        # ensures the attribute is a real boolean even if samples is None/empty.
        self._supervised = bool(samples) and any(
            row[group_key] != '-' for row in samples.values()
        )

        assert k_groups in (2, 3), k_groups
        prefix = os.path.splitext(os.path.basename(input_file))[0]
        output_prefix = os.path.join(output_root,
                                     "%s.%i" % (prefix, k_groups))

        cmd = AtomicCmdBuilder("admixture",
                               IN_FILE_BED=input_file,
                               IN_FILE_BIM=fileutils.swap_ext(input_file,
                                                              ".bim"),
                               IN_FILE_FAM=fileutils.swap_ext(input_file,
                                                              ".fam"),

                               TEMP_OUT_FILE_BED=prefix + ".bed",
                               TEMP_OUT_FILE_BIM=prefix + ".bim",
                               TEMP_OUT_FILE_FAM=prefix + ".fam",
                               TEMP_OUT_FILE_POP=prefix + ".pop",

                               OUT_P=output_prefix + ".P",
                               OUT_Q=output_prefix + ".Q",
                               OUT_STDOUT=output_prefix + ".log",

                               CHECK_VERSION=ADMIXTURE_VERSION,
                               set_cwd=True)

        # Random seed for this run
        cmd.set_option("-s", random.randint(0, 2 ** 16 - 1))

        if self._supervised:
            cmd.set_option("--supervised")

        # Positional arguments: the (temporary) .bed file and the group count
        cmd.add_value("%(TEMP_OUT_FILE_BED)s")
        cmd.add_value(int(k_groups))

        CommandNode.__init__(self,
                             description="<Admixture -> '%s.*''>"
                             % (output_prefix,),
                             command=cmd.finalize(),
                             dependencies=dependencies)
Exemple #11
0
    def __init__(self, input_file, k_groups, output_root,
                 samples=None, dependencies=()):
        """Set up an 'admixture' run for k_groups ancestral groups.

        Arguments:
        input_file    -- Input .bed file; the matching .bim and .fam files are
                         located by swapping the extension.
        k_groups      -- Number of groups; must be 2 or 3.
        output_root   -- Directory in which the .P, .Q and .log files are placed.
        samples       -- Optional mapping of rows; if any row has a value other
                         than '-' under the key 'Group(<k_groups>)', admixture
                         is run with --supervised.
        dependencies  -- Forwarded unchanged to CommandNode.
        """
        self._samples = samples
        self._input_file = input_file
        self._k_groups = k_groups

        group_key = "Group(%i)" % (self._k_groups,)
        # values() works on both Python 2 and 3, unlike itervalues(); bool()
        # ensures the attribute is a real boolean even if samples is None/empty.
        self._supervised = bool(samples) and any(
            row[group_key] != '-' for row in samples.values()
        )

        assert k_groups in (2, 3), k_groups
        prefix = os.path.splitext(os.path.basename(input_file))[0]
        output_prefix = os.path.join(output_root,
                                     "%s.%i" % (prefix, k_groups))

        cmd = AtomicCmdBuilder("admixture",
                               IN_FILE_BED=input_file,
                               IN_FILE_BIM=fileutils.swap_ext(input_file,
                                                              ".bim"),
                               IN_FILE_FAM=fileutils.swap_ext(input_file,
                                                              ".fam"),

                               TEMP_OUT_FILE_BED=prefix + ".bed",
                               TEMP_OUT_FILE_BIM=prefix + ".bim",
                               TEMP_OUT_FILE_FAM=prefix + ".fam",
                               TEMP_OUT_FILE_POP=prefix + ".pop",

                               OUT_P=output_prefix + ".P",
                               OUT_Q=output_prefix + ".Q",
                               OUT_STDOUT=output_prefix + ".log",

                               CHECK_VERSION=ADMIXTURE_VERSION,
                               set_cwd=True)

        # Random seed for this run
        cmd.set_option("-s", random.randint(0, 2 ** 16 - 1))

        if self._supervised:
            cmd.set_option("--supervised")

        # Positional arguments: the (temporary) .bed file and the group count
        cmd.add_value("%(TEMP_OUT_FILE_BED)s")
        cmd.add_value(int(k_groups))

        CommandNode.__init__(self,
                             description="<Admixture -> '%s.*''>"
                             % (output_prefix,),
                             command=cmd.finalize(),
                             dependencies=dependencies)
Exemple #12
0
    def __init__(self, reference, directory, options={}, dependencies=()):
        """Set up a 'mapDamage --stats-only' command.

        Arguments:
        reference     -- Reference passed to mapDamage via -r.
        directory     -- Directory in which the registered Stats_out_MCMC_* and
                         dnacomp_genome.csv output files are expected.
        options       -- Extra options applied to the mapDamage command.
        dependencies  -- Forwarded unchanged to CommandNode.
        """
        # NOTE(review): 'options' defaults to a shared dict; safe only while
        # apply_options leaves it unmodified -- confirm.
        command = AtomicCmdBuilder(
            [
                "mapDamage",
                "--stats-only",
                "-r",
                "%(IN_REFERENCE)s",
                "-d",
                "%(TEMP_DIR)s",
            ],
            IN_REFERENCE=reference,
            TEMP_OUT_FREQ_3p="3pGtoA_freq.txt",
            TEMP_OUT_FREQ_5p="5pCtoT_freq.txt",
            # Was a masked "******" placeholder; the composition table is named
            # "dnacomp.txt", matching OUT_COMP_USER of the mapDamage plots node.
            TEMP_OUT_COMP_USER="dnacomp.txt",
            TEMP_OUT_MISINCORP="misincorporation.txt",
            TEMP_OUT_LOG="Runtime_log.txt",
            TEMP_OUT_STDOUT="pipe_mapDamage.stdout",
            TEMP_OUT_STDERR="pipe_mapDamage.stderr",
            OUT_COMP_GENOME=os.path.join(directory, "dnacomp_genome.csv"),
            OUT_MCMC_PROBS=os.path.join(directory, "Stats_out_MCMC_correct_prob.csv"),
            OUT_MCMC_HIST=os.path.join(directory, "Stats_out_MCMC_hist.pdf"),
            OUT_MCMC_ITER=os.path.join(directory, "Stats_out_MCMC_iter.csv"),
            OUT_MCMC_ITERSUM=os.path.join(
                directory, "Stats_out_MCMC_iter_summ_stat.csv"
            ),
            OUT_MCMC_POSTPRED=os.path.join(directory, "Stats_out_MCMC_post_pred.pdf"),
            OUT_MCMC_TRACE=os.path.join(directory, "Stats_out_MCMC_trace.pdf"),
            CHECK_RSCRIPT=RSCRIPT_VERSION,
            CHECK_MAPDAMAGE=MAPDAMAGE_VERSION,
            # R packages registered as requirements of this command
            CHECK_R_INLINE=rtools.requirement("inline"),
            CHECK_R_GGPLOT2=rtools.requirement("ggplot2"),
            CHECK_R_RCPP=rtools.requirement("Rcpp"),
            CHECK_R_GAM=rtools.requirement("gam"),
            CHECK_R_RCPPGSL=rtools.requirement("RcppGSL"),
        )

        apply_options(command, options)

        self._directory = directory

        CommandNode.__init__(
            self,
            command=command.finalize(),
            description="<mapDamage (model): %r>" % (directory,),
            dependencies=dependencies,
        )
Exemple #13
0
    def __init__(self,
                 input_alignment,
                 input_partition,
                 output_file,
                 dependencies=()):
        """Set up a 'parse-examl' command run in the temporary directory.

        Arguments:
        input_alignment  -- An alignment file in a format readable by RAxML.
        input_partition  -- A set of partitions in a format readable by RAxML.
        output_file      -- Filename for the output binary sequence.
        dependencies     -- Forwarded unchanged to CommandNode.
        """
        alignment_name = os.path.basename(input_alignment)
        partition_name = os.path.basename(input_partition)

        builder = AtomicCmdBuilder("parse-examl", set_cwd=True)

        builder.set_option("-s", "%(TEMP_OUT_ALN)s")
        builder.set_option("-q", "%(TEMP_OUT_PART)s")
        # Output file will be named output.binary, and placed in the CWD
        builder.set_option("-n", "output")
        # Substitution model
        builder.set_option("-m", "DNA", fixed=False)

        builder.set_kwargs(
            # Auto-delete: Symlinks
            TEMP_OUT_PART=partition_name,
            TEMP_OUT_ALN=alignment_name,
            # Input files are registered, but not placed on the command-line
            IN_ALIGNMENT=input_alignment,
            IN_PARTITION=input_partition,
            # Final output file is registered, but not written to directly
            OUT_BINARY=output_file,
            CHECK_EXAML=PARSER_VERSION,
        )

        description = "<ExaMLParser: '%s' -> '%s'>" % (
            input_alignment, output_file)

        CommandNode.__init__(
            self,
            command=builder.finalize(),
            description=description,
            dependencies=dependencies,
        )

        # Absolute paths of the two inputs, and the bare name of the output
        self._symlinks = [os.path.abspath(path)
                          for path in (input_alignment, input_partition)]
        self._output_file = os.path.basename(output_file)
Exemple #14
0
    def __init__(
        self,
        reference,
        input_files,
        output_file,
        directory,
        options={},
        dependencies=(),
    ):
        """Set up a 'samtools merge | mapDamage --rescale-only' pipeline.

        Arguments:
        reference     -- Reference passed to mapDamage via -r.
        input_files   -- BAM files merged and piped to mapDamage on STDIN.
        output_file   -- BAM file passed to mapDamage via --rescale-out.
        directory     -- Stored on the node as self._directory.
        options       -- Extra options applied to the mapDamage command.
        dependencies  -- Forwarded unchanged to CommandNode.
        """
        # NOTE(review): 'options' defaults to a shared dict; safe only while
        # apply_options leaves it unmodified -- confirm.
        merge = merge_bam_files_command(input_files)

        call = ["mapDamage", "--rescale-only"]
        call += ["-i", "-"]
        call += ["-d", "%(TEMP_DIR)s"]
        call += ["-r", "%(IN_REFERENCE)s"]
        call += ["--rescale-out", "%(OUT_BAM)s"]

        rescale = AtomicCmdBuilder(
            call,
            IN_STDIN=merge,
            IN_REFERENCE=reference,
            TEMP_OUT_LOG="Runtime_log.txt",
            TEMP_OUT_CSV="Stats_out_MCMC_correct_prob.csv",
            OUT_BAM=output_file,
            CHECK_VERSION=MAPDAMAGE_VERSION,
        )
        apply_options(rescale, options)

        self._directory = directory

        pipeline = ParallelCmds([merge, rescale.finalize()])
        description = "<mapDamage (rescale): %s -> %r>" % (
            describe_files(merge.input_files),
            output_file,
        )

        CommandNode.__init__(
            self,
            command=pipeline,
            description=description,
            dependencies=dependencies,
        )
Exemple #15
0
    def __init__(self, input_alignment, input_partitions, output_tree, dependencies=()):
        """Set up a 'raxmlHPC -y' command (randomized parsimony starting tree).

        Arguments:
        input_alignment   -- Alignment file registered as IN_ALIGNMENT.
        input_partitions  -- Partitions file registered as IN_PARTITION.
        output_tree       -- File registered as the OUT_TREE output.
        dependencies      -- Forwarded unchanged to CommandNode.
        """
        builder = AtomicCmdBuilder("raxmlHPC")

        fixed_options = (
            # Compute a randomized parsimony starting tree
            ("-y",),
            # Output files are saved with a .Pypeline postfix, and later renamed
            ("-n", "Pypeline"),
            # Model required, but not used
            ("-m", "GTRGAMMA"),
            # Ensures that output is saved to the temporary directory
            ("-w", "%(TEMP_DIR)s"),
        )
        for option in fixed_options:
            builder.set_option(*option)

        # Random seed; not fixed, so it may be overridden to allow replicability
        seed = int(random.random() * 2 ** 31 - 1)
        builder.set_option("-p", seed, fixed=False)

        # The alignment / partitions are read via files in the temp folder, to
        # prevent the creation of *.reduced files outside of that folder
        builder.set_option("-s", "%(TEMP_OUT_ALIGNMENT)s")
        builder.set_option("-q", "%(TEMP_OUT_PARTITION)s")

        builder.set_kwargs(
            IN_ALIGNMENT=input_alignment,
            IN_PARTITION=input_partitions,
            # TEMP_OUT_ is used to automatically remove these files
            TEMP_OUT_ALIGNMENT="RAxML_alignment",
            TEMP_OUT_PARTITION="RAxML_partitions",
            TEMP_OUT_INFO="RAxML_info.Pypeline",
            OUT_TREE=output_tree,
            CHECK_VERSION=RAXML_VERSION,
        )

        self._input_alignment = input_alignment
        self._input_partitions = input_partitions
        self._output_tree = output_tree

        description = "<RAxMLParsimonyTree: '%s' -> '%s'>" % (
            input_alignment, output_tree)

        CommandNode.__init__(
            self,
            command=builder.finalize(),
            description=description,
            dependencies=dependencies,
        )
Exemple #16
0
    def __init__(self,
                 input_file,
                 k_groups,
                 output_root,
                 groups,
                 dependencies=()):
        """Set up a supervised 'admixture' run for k_groups groups.

        Arguments:
        input_file    -- Input .bed file; the matching .bim and .fam files are
                         located by swapping the extension.
        k_groups      -- Number of groups, appended as the last argument.
        output_root   -- Directory in which the .P, .Q and .log files are placed.
        groups        -- Stored on the node as self._groups.
        dependencies  -- Forwarded unchanged to CommandNode.
        """
        self._groups = groups
        self._input_file = input_file

        basename = os.path.basename(input_file)
        prefix, _ = os.path.splitext(basename)
        output_prefix = os.path.join(output_root, "%s.%i" % (prefix, k_groups))

        builder = AtomicCmdBuilder(
            "admixture",
            IN_FILE_BED=input_file,
            IN_FILE_BIM=fileutils.swap_ext(input_file, ".bim"),
            IN_FILE_FAM=fileutils.swap_ext(input_file, ".fam"),
            TEMP_OUT_FILE_BED=prefix + ".bed",
            TEMP_OUT_FILE_BIM=prefix + ".bim",
            TEMP_OUT_FILE_FAM=prefix + ".fam",
            TEMP_OUT_FILE_POP=prefix + ".pop",
            OUT_P=output_prefix + ".P",
            OUT_Q=output_prefix + ".Q",
            OUT_STDOUT=output_prefix + ".log",
            CHECK_VERSION=ADMIXTURE_VERSION,
            set_cwd=True,
        )

        # Random seed for this run
        seed = random.randint(0, 2**16 - 1)
        builder.set_option("-s", seed)
        builder.set_option("--supervised")

        # Positional arguments: the (temporary) .bed file and the group count
        for value in ("%(TEMP_OUT_FILE_BED)s", int(k_groups)):
            builder.add_value(value)

        description = "<Admixture -> '%s.*''>" % (output_prefix, )

        CommandNode.__init__(
            self,
            description=description,
            command=builder.finalize(),
            dependencies=dependencies,
        )
Exemple #17
0
 def _do_test_builder__add_or_set_option__after_finalize(setter):
     """Check that 'setter' raises when applied to an already finalized builder."""
     option = ("-size", "1")

     finalized = AtomicCmdBuilder("find")
     finalized.finalize()

     assert_raises(AtomicCmdBuilderError, setter, finalized, *option)
Exemple #18
0
def test_builder__finalize__returns_singleton():
    """Repeated finalize() calls must return the very same object."""
    echo_builder = AtomicCmdBuilder("echo")
    first_result = echo_builder.finalize()
    second_result = echo_builder.finalize()
    assert first_result is second_result
Exemple #19
0
 def _do_test_builder__add_or_set_option__after_finalize(setter):
     """Verify that calling 'setter' on a finalized builder raises an error."""
     find_builder = AtomicCmdBuilder("find")
     find_builder.finalize()

     setter_args = (find_builder, "-size", "1")
     assert_raises(AtomicCmdBuilderError, setter, *setter_args)
Exemple #20
0
def test_builder__finalize__returns_singleton():
    """finalize() is expected to hand back one shared object on every call."""
    cmd_builder = AtomicCmdBuilder("echo")
    finalized_once = cmd_builder.finalize()
    finalized_again = cmd_builder.finalize()
    assert finalized_once is finalized_again
Exemple #21
0
    def __init__(
        self,
        input_alignment,
        output_template,
        input_partition=None,
        model="GTRGAMMAI",
        replicates="autoMRE",
        threads=1,
        dependencies=(),
    ):
        """Set up a RAxML rapid bootstrapping ('-f a') command.

        Arguments:
        input_alignment  -- An alignment file in a format readable by RAxML.
        output_template  -- A template string used to construct final filenames.
                            Should consist of a full path, including a single
                            '%s', which is replaced with the variable part of
                            RAxML output files (e.g. 'info', 'bestTree', ...).

                            Example destination: '/disk/project/SN013420.RAxML.%s'
                            Example output:      '/disk/project/SN013420.RAxML.bestTree'
        input_partition  -- Optional set of partitions in a format readable by
                            RAxML; only passed (via -q) if not None.
        model            -- Value for -m; may be overridden (not fixed).
        replicates       -- Value for -N; may be overridden (not fixed).
        threads          -- If greater than 1, 'raxmlHPC-PTHREADS' is run with
                            -T set to this value; otherwise 'raxmlHPC' is run.
        dependencies     -- Forwarded unchanged to CommandNode.
        """
        multithreaded = threads > 1
        if multithreaded:
            executable, version = "raxmlHPC-PTHREADS", RAXML_PTHREADS_VERSION
        else:
            executable, version = "raxmlHPC", RAXML_VERSION

        builder = AtomicCmdBuilder(executable)
        if multithreaded:
            builder.set_option("-T", threads)

        # Perform rapid bootstrapping
        builder.set_option("-f", "a")
        # Output files are saved with a .PALEOMIX postfix, and later renamed
        builder.set_option("-n", "PALEOMIX")
        # Ensures that output is saved to the temporary directory
        builder.set_option("-w", "%(TEMP_DIR)s")
        # The alignment / partitions are read via files in the temp folder, to
        # prevent the creation of *.reduced files outside of that folder. In
        # addition, it may be necessary to remove the .reduced files if created
        builder.set_option("-s", "%(TEMP_OUT_ALN)s")

        if input_partition is not None:
            partition_name = os.path.basename(input_partition)

            builder.set_option("-q", "%(TEMP_OUT_PART)s")
            builder.set_kwargs(
                IN_PARTITION=input_partition,
                TEMP_OUT_PART=partition_name,
                TEMP_OUT_PART_R=partition_name + ".reduced",
            )

        alignment_name = os.path.basename(input_alignment)
        builder.set_kwargs(
            # Auto-delete: Symlinks and .reduced files that RAxML may generate
            TEMP_OUT_ALN=alignment_name,
            TEMP_OUT_ALN_R=alignment_name + ".reduced",
            # Input file is registered, but not placed on the command-line
            IN_ALIGNMENT=input_alignment,
            # Final output files are registered, but not written to directly
            OUT_INFO=output_template % "info",
            OUT_BESTTREE=output_template % "bestTree",
            OUT_BOOTSTRAP=output_template % "bootstrap",
            OUT_BIPART=output_template % "bipartitions",
            OUT_BIPARTLABEL=output_template % "bipartitionsBranchLabels",
            CHECK_VERSION=version,
        )

        # Model of NT substitution
        builder.set_option("-m", model, fixed=False)
        # Enable rapid bootstrapping and set random seed. May be set to a fixed
        # value to allow replicability.
        bootstrap_seed = int(random.random() * 2 ** 31 - 1)
        builder.set_option("-x", bootstrap_seed, fixed=False)
        # Set random seed for parsimony inference. May be set to allow
        # replicability.
        parsimony_seed = int(random.random() * 2 ** 31 - 1)
        builder.set_option("-p", parsimony_seed, fixed=False)
        # Number of replicates, or a convergence criterion
        builder.set_option("-N", replicates, fixed=False)

        # NOTE(review): the second entry is None when no partition is given;
        # confirm that consumers of _symlinks handle that.
        self._symlinks = [input_alignment, input_partition]
        self._template = os.path.basename(output_template)

        description = "<RAxMLRapidBS: '%s' -> '%s'>" % (
            input_alignment,
            output_template % ("*",),
        )

        CommandNode.__init__(
            self,
            command=builder.finalize(),
            description=description,
            threads=threads,
            dependencies=dependencies,
        )