Esempi in Python per swap_ext, esempi in Python per paleomix.common.fileutils.swap_ext

Esempio n. 1

0

Mostra file

    def _build_bwa_backtrack_pe(self, config, prefix, record, parameters):
        """Build the per-mate alignment nodes and the BWASampe node joining them.

        "input_file" (a template formatted with Pair=1 / Pair=2) and
        "output_file" are popped from `parameters` before the remaining
        parameters are forwarded to the per-mate alignment builder.
        `parameters` must also provide "prefix" and "reference". The
        `config`, `prefix` and `record` arguments are not used here.
        """
        fastq_template = parameters.pop("input_file")
        bam_path = parameters.pop("output_file")

        sai_paths = []
        mate_nodes = []
        for mate in (1, 2):
            # One intermediate .sai file per mate, named after the output BAM
            sai_path = swap_ext(bam_path, "%i.sai" % (mate, ))
            mate_node = self._build_bwa_backtrack_aln(
                parameters=parameters,
                input_file=fastq_template.format(Pair=mate),
                output_file=sai_path,
            )

            sai_paths.append(sai_path)
            mate_nodes.append(mate_node)

        sai_first, sai_second = sai_paths
        return BWASampe(
            input_file_sai_1=sai_first,
            input_file_sai_2=sai_second,
            input_file_fq_1=fastq_template.format(Pair=1),
            input_file_fq_2=fastq_template.format(Pair=2),
            output_file=bam_path,
            prefix=parameters["prefix"],
            reference=parameters["reference"],
            mapping_options=self.options["Aligners"]["BWA"],
            cleanup_options=self._cleanup_options("BWA"),
            dependencies=tuple(mate_nodes),
        )

Esempio n. 2

0

Mostra file

File: phylo.py Progetto: muslih14/paleomix

def _build_examl_bootstraps(options, phylo, destination, input_alignment, input_partition, dependencies):
    """Build one ExaML run per bootstrap replicate and re-root the results.

    The number of replicates is read from phylo["ExaML"]["Bootstraps"], and
    the replicate alignments are named "bootstrap.NNNN.phy" inside the
    "bootstraps" sub-folder of `destination`.

    Returns the result of _build_rerooted_trees (rooted on
    phylo["RootTreesOn"]) when at least one replicate was built, and None
    otherwise.
    """
    bootstraps = []
    num_bootstraps = phylo["ExaML"]["Bootstraps"]
    bootstrap_destination = os.path.join(destination, "bootstraps")
    bootstrap_template = os.path.join(bootstrap_destination, "bootstrap.%04i.phy")

    # 'range' instead of 'xrange': the latter does not exist on Python 3,
    # while 'range' is available on both Python 2 and 3.
    for bootstrap_num in range(num_bootstraps):
        bootstrap_alignment = bootstrap_template % (bootstrap_num,)
        # Each replicate is resampled using its own random seed
        bootstrap = PHYLIPBootstrapNode(input_alignment=input_alignment,
                                        input_partition=input_partition,
                                        output_alignment=bootstrap_alignment,
                                        seed=random.randint(1, 2**32 - 1),
                                        dependencies=dependencies)

        bootstrap_binary = swap_ext(bootstrap_alignment, ".binary")
        # The "%s" placeholder is left in the name; presumably it is filled
        # in by _examl_nodes (not visible here) -- TODO confirm.
        bootstrap_final = swap_ext(bootstrap_alignment, ".%s")
        bs_binary = ExaMLParserNode(input_alignment=bootstrap_alignment,
                                    input_partition=input_partition,
                                    output_file=bootstrap_binary,
                                    dependencies=bootstrap)

        bootstraps.append(_examl_nodes(options=options,
                                       settings=phylo,
                                       input_alignment=bootstrap_alignment,
                                       input_partitions=input_partition,
                                       input_binary=bootstrap_binary,
                                       output_template=bootstrap_final,
                                       dependencies=bs_binary))

    if bootstraps:
        return _build_rerooted_trees(bootstraps, phylo["RootTreesOn"])

    return None

Esempio n. 3

0

Mostra file

File: nuclear.py Progetto: muslih14/paleomix

    def _setup(self, config, temp):
        """Link the input files into `temp`; write a .pop file if supervised.

        The input file and its ".bim" / ".fam" counterparts are symlinked
        into `temp` under their own basenames. When `self._supervised` is
        set, one line is written to the .pop file per line in the .fam file,
        holding the sample's "Group(K)" value or "-" if none is known.
        """
        CommandNode._setup(self, config, temp)

        sources = [self._input_file]
        sources.extend(fileutils.swap_ext(self._input_file, extension)
                       for extension in (".bim", ".fam"))

        for source in sources:
            link_name = os.path.basename(source)
            os.symlink(os.path.abspath(source), os.path.join(temp, link_name))

        if not self._supervised:
            return

        fam_path = fileutils.swap_ext(self._input_file, ".fam")
        # The .pop file is named after the .fam file, but placed in 'temp'
        pop_path = fileutils.reroot_path(temp,
                                         fileutils.swap_ext(fam_path, ".pop"))

        group_key = "Group(%i)" % (self._k_groups,)
        with open(fam_path) as fam_handle, open(pop_path, "w") as pop_handle:
            for fam_line in fam_handle:
                # The first whitespace-separated field is used as sample name
                sample, _ = fam_line.split(None, 1)
                sample_info = self._samples.get(sample, {})

                pop_handle.write("%s\n" % (sample_info.get(group_key, "-"),))

Esempio n. 4

0

Mostra file

    def _setup(self, config, temp):
        """Symlink inputs into `temp` and, if supervised, generate the .pop file.

        Three files are linked: `self._input_file` and the files obtained by
        swapping its extension for ".bim" and ".fam". For supervised runs the
        .fam file is read line by line, and the group recorded for each
        sample under "Group(K)" ("-" when missing) is written to the .pop
        file inside `temp`.
        """
        CommandNode._setup(self, config, temp)

        main_path = self._input_file
        bim_path = fileutils.swap_ext(self._input_file, ".bim")
        fam_path = fileutils.swap_ext(self._input_file, ".fam")

        for path in (main_path, bim_path, fam_path):
            name = os.path.basename(path)
            absolute_path = os.path.abspath(path)
            os.symlink(absolute_path, os.path.join(temp, name))

        if self._supervised:
            fam_source = fileutils.swap_ext(self._input_file, ".fam")

            # Same name as the .fam file, but rooted in the temporary folder
            pop_target = fileutils.swap_ext(fam_source, ".pop")
            pop_target = fileutils.reroot_path(temp, pop_target)

            lookup_key = "Group(%i)" % (self._k_groups, )
            with open(fam_source) as reader:
                with open(pop_target, "w") as writer:
                    for record in reader:
                        sample, _ = record.split(None, 1)
                        known = self._samples.get(sample, {})
                        group = known.get(lookup_key, "-")

                        writer.write("%s\n" % (group, ))

Esempio n. 5

0

Mostra file

    def _setup(self, config, temp):
        """Create the input pipe in `temp`: a FIFO for multiple BAMs, else a symlink.

        With a single input BAM the pipe name is symlinked to that file, and
        if `self._index_format` is set, a matching symlink is created for the
        index file as well.
        """
        CommandNode._setup(self, config, temp)

        pipe_path = os.path.join(temp, self.PIPE_FILE)
        if len(self._input_bams) > 1:
            os.mkfifo(pipe_path)
            return

        bam_path = os.path.abspath(self._input_bams[0])
        os.symlink(bam_path, pipe_path)

        if self._index_format:
            # The index is expected next to the BAM, and linked next to the pipe
            index_source = swap_ext(bam_path, self._index_format)
            index_link = swap_ext(pipe_path, self._index_format)
            os.symlink(index_source, index_link)

Esempio n. 6

0

Mostra file

File: picard.py Progetto: MikkelSchubert/paleomix

    def _setup(self, config, temp):
        """Prepare `self.PIPE_FILE` inside `temp` before the command is run.

        Multiple input BAMs: a named pipe (FIFO) is created.
        Otherwise: the first input BAM is symlinked in place of the pipe,
        together with its index if `self._index_format` is set.
        """
        CommandNode._setup(self, config, temp)

        destination = os.path.join(temp, self.PIPE_FILE)
        single_input = not (len(self._input_bams) > 1)

        if single_input:
            source = os.path.abspath(self._input_bams[0])
            os.symlink(source, destination)

            if self._index_format:
                os.symlink(
                    swap_ext(source, self._index_format),
                    swap_ext(destination, self._index_format),
                )
        else:
            os.mkfifo(destination)

Esempio n. 7

0

Mostra file

    def customize(cls,
                  config,
                  input_bams,
                  output_bam,
                  output_metrics=None,
                  keep_dupes=False,
                  dependencies=()):
        """Build the parameters for a Picard MarkDuplicates run.

        Metrics are written to `output_metrics`, which defaults to
        `output_bam` with its extension swapped for ".metrics". Unless
        `keep_dupes` is set, REMOVE_DUPLICATES is enabled.

        Returns a dict holding the command builder ("command") and the
        given `dependencies` ("dependencies").
        """
        builder = picard_command(config, "MarkDuplicates")
        _set_max_open_files(builder, "MAX_FILE_HANDLES")

        builder.set_option("OUTPUT", "%(OUT_BAM)s", sep="=")
        builder.set_option("METRICS_FILE", "%(OUT_METRICS)s", sep="=")
        # Validation is mostly left to manual ValidateSamFile runs; required
        # because .csi indexed BAM records can have "invalid" bins.
        builder.set_option("VALIDATION_STRINGENCY", "LENIENT", sep="=")
        builder.add_multiple_options("I", input_bams, sep="=")

        if not keep_dupes:
            # Remove duplicates from output by default to save disk-space
            builder.set_option("REMOVE_DUPLICATES", "True",
                               sep="=", fixed=False)

        if not output_metrics:
            output_metrics = swap_ext(output_bam, ".metrics")

        builder.set_kwargs(OUT_BAM=output_bam, OUT_METRICS=output_metrics)

        result = {"command": builder}
        result["dependencies"] = dependencies
        return result

Esempio n. 8

0

Mostra file

File: picard.py Progetto: muslih14/paleomix

    def __init__(self, config, input_bams, pipename="input.bam", indexed=True):
        """Prepare the commands / kwargs needed to read `input_bams` via `pipename`.

        With more than one input file, a Picard MergeSamFiles command is
        built that writes a coordinate-sorted, uncompressed (level 0) BAM
        to the pipe. With a single input file no command is built; instead
        the file (and its ".bai" index, if `indexed`) is recorded in
        `self.kwargs`.
        """
        self.pipe = pipename
        self.indexed = indexed
        self.files = safe_coerce_to_tuple(input_bams)

        self.commands = []
        self.kwargs = {"TEMP_IN_BAM": self.pipe}

        if not (len(self.files) > 1):
            # Ensure that the actual command depends on the input
            single_bam = self.files[0]
            self.kwargs["IN_FILE_00"] = single_bam

            if indexed:
                self.kwargs["IN_FILE_01"] = swap_ext(single_bam, ".bai")
            return

        merge = picard_command(config, "MergeSamFiles")

        merge.set_option("SO", "coordinate", sep="=", fixed=False)
        merge.set_option("CREATE_INDEX", "False", sep="=")
        merge.set_option("COMPRESSION_LEVEL", 0, sep="=")
        merge.set_option("OUTPUT", "%(TEMP_OUT_BAM)s", sep="=")
        merge.add_multiple_options("I", input_bams, sep="=")

        merge.set_kwargs(TEMP_OUT_BAM=self.pipe)

        self.commands = [merge.finalize()]

Esempio n. 9

0

Mostra file

    def __init__(
            self,
            config,
            input_bam,
            input_index=None,
            output_log=None,
            ignored_checks=(),
            big_genome_mode=False,
            dependencies=(),
    ):
        """Set up a Picard ValidateSamFile run for `input_bam`.

        `output_log` is registered as OUT_STDOUT and defaults to `input_bam`
        with its extension swapped for ".validated". Every entry in
        `ignored_checks` is passed to Picard as an IGNORE option.
        """
        cmd = picard_command(config, "ValidateSamFile")
        _set_max_open_files(cmd, "MAX_OPEN_TEMP_FILES")

        # NOTE(review): the big-genome configuration is always applied; the
        # `big_genome_mode` argument has no effect on this -- confirm that
        # this is intended.
        self._configure_for_big_genome(config, cmd)

        cmd.set_option("I", "%(IN_BAM)s", sep="=")
        for check_name in ignored_checks:
            cmd.add_option("IGNORE", check_name, sep="=")

        if not output_log:
            output_log = swap_ext(input_bam, ".validated")

        cmd.set_kwargs(
            IN_BAM=input_bam,
            IN_INDEX=input_index,
            OUT_STDOUT=output_log,
        )

        PicardNode.__init__(
            self,
            command=cmd.finalize(),
            description="<Validate BAM: '%s'>" % (input_bam, ),
            dependencies=dependencies,
        )

Esempio n. 10

0

Mostra file

File: nuclear.py Progetto: muslih14/paleomix

    def __init__(self, output_root, table, bamfile, downsample,
                 dependencies=()):
        """Set up the "build_tped" command writing its results to `output_root`.

        Four output files are registered in `output_root`: "common.tfam",
        "common.summary", "incl_ts.tped" and "excl_ts.tped". The ".bai"
        index of `bamfile` is only registered as an input when
        `downsample` is false.
        """
        builder = factory.new("build_tped")
        builder.set_option("--name", "Sample")
        builder.set_option("--downsample", downsample)
        for positional in ("%(TEMP_DIR)s", "%(IN_TABLE)s", "%(IN_BAM)s"):
            builder.add_value(positional)

        if not downsample:
            # Needed for random access (chromosomes are read 1 ... 31)
            bai_path = fileutils.swap_ext(bamfile, ".bai")
            builder.set_kwargs(IN_BAI=bai_path)

        tfam_path = os.path.join(output_root, "common.tfam")
        summary_path = os.path.join(output_root, "common.summary")
        incl_ts_path = os.path.join(output_root, "incl_ts.tped")
        excl_ts_path = os.path.join(output_root, "excl_ts.tped")
        builder.set_kwargs(OUT_TFAM=tfam_path,
                           OUT_SUMMARY=summary_path,
                           OUT_TPED_INCL_TS=incl_ts_path,
                           OUT_TPED_EXCL_TS=excl_ts_path,
                           IN_TABLE=table,
                           IN_BAM=bamfile)

        output_pattern = os.path.join(output_root, '*')
        description = "<BuildTPEDFiles -> %r>" % (output_pattern,)
        CommandNode.__init__(self,
                             description=description,
                             command=builder.finalize(),
                             dependencies=dependencies)

Esempio n. 11

0

Mostra file

File: gatk.py Progetto: jelber2/paleomix

    def __init__(self, config, reference, infiles, outfile,
                 threads=1, dependencies=()):
        """Set up a GATK RealignerTargetCreator run writing intervals to `outfile`.

        The requested `threads` value is passed through _get_max_threads
        before use. The ".dict" file next to `reference` is registered as
        an input alongside the reference itself.
        """
        threads = _get_max_threads(reference, threads)
        infiles = safe_coerce_to_tuple(infiles)

        gatk_jar = os.path.join(config.jar_root, "GenomeAnalysisTK.jar")
        builder = AtomicJavaCmdBuilder(gatk_jar,
                                       jre_options=config.jre_options)

        options = (
            ("-T", "RealignerTargetCreator"),
            ("-R", "%(IN_REFERENCE)s"),
            ("-o", "%(OUT_INTERVALS)s"),
            ("-nt", threads),
        )
        for key, value in options:
            builder.set_option(key, value)

        _set_input_files(builder, infiles)

        ref_dict = fileutils.swap_ext(reference, ".dict")
        builder.set_kwargs(IN_REFERENCE=reference,
                           IN_REF_DICT=ref_dict,
                           OUT_INTERVALS=outfile,
                           CHECK_GATK=_get_gatk_version_check(config))

        CommandNode.__init__(
            self,
            threads=threads,
            description="<GATK Indel Realigner (training): %s -> %r>"
            % (describe_files(infiles), outfile),
            command=builder.finalize(),
            dependencies=dependencies,
        )

Esempio n. 12

0

Mostra file

File: picard.py Progetto: MikkelSchubert/paleomix

    def _teardown(self, config, temp):
        """Delete the pipe (and its index, if any) from `temp`, then tear down."""
        pipe_name = self.PIPE_FILE
        os.remove(os.path.join(temp, pipe_name))

        if self._index_format:
            index_name = swap_ext(self.PIPE_FILE, self._index_format)
            os.remove(os.path.join(temp, index_name))

        CommandNode._teardown(self, config, temp)

Esempio n. 13

0

Mostra file

    def __init__(self,
                 output_root,
                 table,
                 bamfile,
                 downsample,
                 dependencies=()):
        """Configure the "zonkey_tped" command with outputs in `output_root`.

        Registers "common.tfam", "common.summary", "incl_ts.tped" and
        "excl_ts.tped" (all in `output_root`) as outputs, and `table` and
        `bamfile` as inputs. When `downsample` is false the ".bai" index of
        `bamfile` is registered as an additional input.
        """
        def _in_output_root(filename):
            return os.path.join(output_root, filename)

        command = factory.new("zonkey_tped")
        command.set_option("--name", "Sample")
        command.set_option("--downsample", downsample)
        command.add_value("%(TEMP_DIR)s")
        command.add_value("%(IN_TABLE)s")
        command.add_value("%(IN_BAM)s")

        if not downsample:
            # Needed for random access (chromosomes are read 1 ... 31)
            command.set_kwargs(IN_BAI=fileutils.swap_ext(bamfile, ".bai"))

        command.set_kwargs(
            OUT_TFAM=_in_output_root("common.tfam"),
            OUT_SUMMARY=_in_output_root("common.summary"),
            OUT_TPED_INCL_TS=_in_output_root("incl_ts.tped"),
            OUT_TPED_EXCL_TS=_in_output_root("excl_ts.tped"),
            IN_TABLE=table,
            IN_BAM=bamfile,
        )

        description = "<BuildTPEDFiles -> %r>" % (_in_output_root('*'), )
        CommandNode.__init__(
            self,
            description=description,
            command=command.finalize(),
            dependencies=dependencies,
        )

Esempio n. 14

0

Mostra file

File: lane.py Progetto: jelber2/paleomix

    def _build_bwa_backtrack_pe(self, config, prefix, record, parameters):
        """Build the paired-end backtrack nodes: two alignments plus 'sampe'.

        "input_file" (a template formatted with Pair=1 / Pair=2) and
        "output_file" are popped from `parameters`; the remaining parameters
        are forwarded to the per-mate alignment builder and must provide
        "prefix" and "reference". The customized BWASampe object is passed to
        self._finalize_nodes, the result of which is returned.
        """
        fastq_template = parameters.pop("input_file")
        bam_path = parameters.pop("output_file")

        sai_by_mate = {}
        mate_nodes = []
        for mate in (1, 2):
            mate_fastq = fastq_template.format(Pair=mate)
            mate_sai = swap_ext(bam_path, "%i.sai" % (mate,))

            mate_nodes.append(
                self._build_bwa_backtrack_aln(parameters=parameters,
                                              input_file=mate_fastq,
                                              output_file=mate_sai))
            sai_by_mate[mate] = mate_sai

        sampe = BWASampe.customize(
            input_file_sai_1=sai_by_mate[1],
            input_file_sai_2=sai_by_mate[2],
            input_file_fq_1=fastq_template.format(Pair=1),
            input_file_fq_2=fastq_template.format(Pair=2),
            output_file=bam_path,
            prefix=parameters['prefix'],
            reference=parameters["reference"],
            dependencies=mate_nodes,
        )

        return self._finalize_nodes(config, prefix, parameters, sampe)

Esempio n. 15

0

Mostra file

File: makefile.py Progetto: muslih14/paleomix

    def _collect_subsets(roi, subset, path):
        """Load the names listed in the subset file for `roi` and record them.

        The subset file is the region's "BED" path with its extension swapped
        for "<subset>.names"; blank lines and lines starting with "#" are
        skipped. On success the filename and the set of names are stored
        under `subset` in the region's "SubsetFiles" and "Sequences" tables.

        Raises MakefileError if the region is unknown, if the subset file
        does not exist, or if it lists names not among the region's known
        sequences. `path` is only used in the first of these messages.
        """
        if roi not in subsets_by_regions:
            raise MakefileError("Subset of unknown region (%r) requested at %r"
                                % (roi, path))

        region = subsets_by_regions[roi]
        roi_fname = swap_ext(region["BED"], subset + ".names")
        if not os.path.isfile(roi_fname):
            raise MakefileError("Subset file does not exist for Regions Of "
                                "Interest:\n  Region = %r\n  Subset = %r\n"
                                "  Path   = %r"
                                % (roi, subset, roi_fname))

        with open(roi_fname) as handle:
            stripped_lines = (raw_line.strip() for raw_line in handle)
            sequences = set(name for name in stripped_lines
                            if name and not name.startswith("#"))

        unknown_seqs = sequences - region["Sequences"][None]
        if unknown_seqs:
            header = ("Unknown sequences in subset file:\n"
                      "  File   = %r\n  Region = %r\n  Subset = %r\n"
                      "  Unknown sequence names =") \
                      % (roi_fname, roi, subset)

            # At most five names are listed, followed by an ellipsis
            listed = sorted(unknown_seqs)
            if len(listed) > 5:
                listed = listed[:5] + ["..."]

            raise MakefileError("\n    - ".join([header] + listed))

        region["SubsetFiles"][subset] = (roi_fname,)
        region["Sequences"][subset] = frozenset(sequences)

Esempio n. 16

0

Mostra file

File: samtools.py Progetto: jelber2/paleomix

    def __init__(self, infile, index_format='.bai', dependencies=()):
        """Set up 'samtools index' for `infile`, producing a .bai or .csi index.

        Three commands are run in sequence: the BAM is symlinked under its
        basename, the link is indexed, and the resulting index is moved to
        `infile` with its extension swapped for `index_format`.

        Raises ValueError if `index_format` is neither '.bai' nor '.csi'.
        """
        basename = os.path.basename(infile)

        if index_format not in ('.bai', '.csi'):
            raise ValueError("Unknown format type %r; expected .bai or .csi" %
                             (index_format, ))

        if index_format == '.bai':
            version_check = SAMTOOLS_VERSION
            index_call = ["samtools", "index", "%(TEMP_IN_BAM)s"]
        else:
            # .csi indexes are requested using the '-c' option
            version_check = SAMTOOLS_VERSION_1x
            index_call = ["samtools", "index", "-c", "%(TEMP_IN_BAM)s"]

        link_bam = AtomicCmd(["ln", "-s", "%(IN_BAM)s", "%(TEMP_OUT_BAM)s"],
                             IN_BAM=infile,
                             TEMP_OUT_BAM=basename,
                             set_cwd=True)

        build_index = AtomicCmd(index_call,
                                TEMP_IN_BAM=basename,
                                CHECK_SAM=version_check)

        move_index = AtomicCmd(["mv", "%(TEMP_IN_BAM)s", "%(OUT_BAM)s"],
                               TEMP_IN_BAM=basename + index_format,
                               OUT_BAM=swap_ext(infile, index_format))

        description = "<BAMIndex (%s): '%s'>" % (index_format[1:].upper(),
                                                 infile)
        CommandNode.__init__(
            self,
            description=description,
            command=SequentialCmds((link_bam, build_index, move_index)),
            dependencies=dependencies)

Esempio n. 17

0

Mostra file

File: samtools.py Progetto: MikkelSchubert/paleomix

    def __init__(self, infile, index_format='.bai', dependencies=()):
        """Build the command sequence that indexes `infile` using samtools.

        The sequence is: link the BAM under its basename, run
        'samtools index' on the link (with '-c' for '.csi'), then move the
        index file to sit next to `infile`.

        Raises ValueError for any `index_format` other than '.bai' / '.csi'.
        """
        bam_name = os.path.basename(infile)

        if index_format == '.bai':
            required_version, extra_args = SAMTOOLS_VERSION, []
        elif index_format == '.csi':
            required_version, extra_args = SAMTOOLS_VERSION_1x, ["-c"]
        else:
            raise ValueError("Unknown format type %r; expected .bai or .csi"
                             % (index_format,))

        index_call = ["samtools", "index"] + extra_args + ["%(TEMP_IN_BAM)s"]

        steps = []
        steps.append(AtomicCmd(["ln", "-s", "%(IN_BAM)s", "%(TEMP_OUT_BAM)s"],
                               IN_BAM=infile,
                               TEMP_OUT_BAM=bam_name,
                               set_cwd=True))

        steps.append(AtomicCmd(index_call,
                               TEMP_IN_BAM=bam_name,
                               CHECK_SAM=required_version))

        # The index is created next to the link, and is moved into place last
        steps.append(AtomicCmd(["mv", "%(TEMP_IN_BAM)s", "%(OUT_BAM)s"],
                               TEMP_IN_BAM=bam_name + index_format,
                               OUT_BAM=swap_ext(infile, index_format)))

        format_label = index_format[1:].upper()
        CommandNode.__init__(self,
                             description="<BAMIndex (%s): '%s'>"
                             % (format_label, infile),
                             command=SequentialCmds(tuple(steps)),
                             dependencies=dependencies)

Esempio n. 18

0

Mostra file

File: nodes.py Progetto: muslih14/paleomix

def index_and_validate_bam(config, prefix, node, log_file=None,
                           create_index=True):
    """Return a node validating the BAM produced by `node`.

    If the input has no index and `create_index` is set, a BAMIndexNode is
    inserted between `node` and the validation. The validation ignores the
    MATE_NOT_FOUND and INVALID_QUALITY_FORMAT checks. `prefix` is currently
    unused (see the FIXME below).
    """
    input_file, has_index = _get_input_file(node)

    needs_indexing = (not has_index) and create_index
    if needs_indexing:
        node = BAMIndexNode(infile=input_file, dependencies=node)

    validation = ValidateBAMNode.customize(config=config,
                                           input_bam=input_file,
                                           output_log=log_file,
                                           dependencies=node)

    # Ensure that the validation node is re-run if the index changes
    if has_index or create_index:
        validation.command.set_kwargs(IN_BAI=swap_ext(input_file, ".bai"))

    # Check MD tags against reference sequence
    # FIXME: Disabled due to issues with Picard/Samtools disagreeing,
    #   backwards compatibility. See the discussion at
    #     http://sourceforge.net/mailarchive/message.php?msg_id=31348639
    # validation_params.command.set_kwargs(IN_REF=prefix["Reference"])
    # validation_params.command.add_option("R", "%(IN_REF)s", sep="=")

    ignored_checks = (
        # Ignored since we may filter out misses and low-quality hits during
        # mapping, which leads to a large proportion of missing PE mates.
        "MATE_NOT_FOUND",
        # Ignored due to high rate of false positives for lanes with few
        # hits, where high-quality reads may cause mis-identification of
        # qualities
        "INVALID_QUALITY_FORMAT",
    )
    for check in ignored_checks:
        validation.command.add_option("IGNORE", check, sep="=")

    return validation.build_node()

Esempio n. 19

0

Mostra file

File: makefile.py Progetto: jelber2/paleomix

    def _collect_subsets(roi, subset, path):
        """Validate and register the subset file `subset` for the region `roi`.

        Names are read from "<BED path without extension>.<subset>.names",
        ignoring empty lines and "#" comments, and must all be found among
        the region's known sequences (stored under the key None).

        Raises MakefileError for unknown regions (reported together with
        `path`), missing subset files, and unknown sequence names.
        """
        if roi not in subsets_by_regions:
            raise MakefileError("Subset of unknown region (%r) requested at %r"
                                % (roi, path))

        bed_path = subsets_by_regions[roi]["BED"]
        roi_fname = swap_ext(bed_path, subset + ".names")
        if not os.path.isfile(roi_fname):
            raise MakefileError("Subset file does not exist for Regions Of "
                                "Interest:\n  Region = %r\n  Subset = %r\n"
                                "  Path   = %r"
                                % (roi, subset, roi_fname))

        names = set()
        with open(roi_fname) as handle:
            for raw_line in handle:
                name = raw_line.strip()
                if not name or name.startswith("#"):
                    continue

                names.add(name)

        unexpected = names - subsets_by_regions[roi]["Sequences"][None]
        if unexpected:
            lines = [("Unknown sequences in subset file:\n"
                      "  File   = %r\n  Region = %r\n  Subset = %r\n"
                      "  Unknown sequence names =")
                     % (roi_fname, roi, subset)]

            examples = sorted(unexpected)
            if len(examples) > 5:
                # Only the first five names are reported
                examples = examples[:5] + ["..."]
            lines.extend(examples)

            raise MakefileError("\n    - ".join(lines))

        subsets_by_regions[roi]["SubsetFiles"][subset] = (roi_fname,)
        subsets_by_regions[roi]["Sequences"][subset] = frozenset(names)

Esempio n. 20

0

Mostra file

File: gatk.py Progetto: jelber2/paleomix

    def __init__(self, config, reference, intervals, infiles, outfile,
                 dependencies=()):
        """Set up GATK IndelRealigner together with a 'samtools calmd' command.

        Both commands are wrapped in a single ParallelCmds object. GATK
        writes an uncompressed BAM without index to `outfile`, while calmd is
        given the basename of `outfile` as its input and "<basename>.calmd"
        as a temporary file for its stdout.
        """
        self._basename = os.path.basename(outfile)

        infiles = safe_coerce_to_tuple(infiles)
        gatk_jar = os.path.join(config.jar_root, "GenomeAnalysisTK.jar")
        realigner = AtomicJavaCmdBuilder(gatk_jar,
                                         jre_options=config.jre_options)

        for key, value in (("-T", "IndelRealigner"),
                           ("-R", "%(IN_REFERENCE)s"),
                           ("-targetIntervals", "%(IN_INTERVALS)s"),
                           ("-o", "%(OUT_BAMFILE)s"),
                           ("--bam_compression", 0)):
            realigner.set_option(key, value)
        realigner.set_option("--disable_bam_indexing")
        _set_input_files(realigner, infiles)

        reference_dict = fileutils.swap_ext(reference, ".dict")
        realigner.set_kwargs(IN_REFERENCE=reference,
                             IN_REF_DICT=reference_dict,
                             IN_INTERVALS=intervals,
                             OUT_BAMFILE=outfile,
                             CHECK_GATK=_get_gatk_version_check(config))

        calmd_call = ["samtools", "calmd", "-b",
                      "%(TEMP_IN_BAM)s", "%(IN_REF)s"]
        calmd = AtomicCmd(calmd_call,
                          TEMP_IN_BAM=self._basename,
                          IN_REF=reference,
                          TEMP_OUT_STDOUT=self._basename + ".calmd",
                          CHECK_VERSION=SAMTOOLS_VERSION)

        commands = ParallelCmds([realigner.finalize(), calmd])
        CommandNode.__init__(
            self,
            description="<GATK Indel Realigner (aligning): %s -> %r>"
            % (describe_files(infiles), outfile),
            command=commands,
            dependencies=dependencies)

Esempio n. 21

0

Mostra file

File: nuclear.py Progetto: muslih14/paleomix

    def __init__(self, input_file, k_groups, output_root,
                 samples=None, dependencies=()):
        """Set up an 'admixture' run on `input_file` with `k_groups` groups.

        `k_groups` must be 2 or 3. Outputs are written to `output_root` as
        "<prefix>.<K>.P", "<prefix>.<K>.Q" and "<prefix>.<K>.log", where
        <prefix> is the basename of `input_file` without its extension.

        The run is supervised if `samples` is non-empty and any of its rows
        has a value other than '-' under the key "Group(<K>)".
        """
        self._samples = samples
        self._input_file = input_file
        self._k_groups = k_groups

        group_key = "Group(%i)" % (self._k_groups,)
        # 'values' is used instead of the Python 2 only 'itervalues', so that
        # this check works on both Python 2 and 3.
        self._supervised = samples and any((row[group_key] != '-')
                                           for row in samples.values())

        assert k_groups in (2, 3), k_groups
        prefix = os.path.splitext(os.path.basename(input_file))[0]
        output_prefix = os.path.join(output_root,
                                     "%s.%i" % (prefix, k_groups))

        cmd = AtomicCmdBuilder("admixture",
                               IN_FILE_BED=input_file,
                               IN_FILE_BIM=fileutils.swap_ext(input_file,
                                                              ".bim"),
                               IN_FILE_FAM=fileutils.swap_ext(input_file,
                                                              ".fam"),

                               TEMP_OUT_FILE_BED=prefix + ".bed",
                               TEMP_OUT_FILE_BIM=prefix + ".bim",
                               TEMP_OUT_FILE_FAM=prefix + ".fam",
                               TEMP_OUT_FILE_POP=prefix + ".pop",

                               OUT_P=output_prefix + ".P",
                               OUT_Q=output_prefix + ".Q",
                               OUT_STDOUT=output_prefix + ".log",

                               CHECK_VERSION=ADMIXTURE_VERSION,
                               set_cwd=True)

        # A random value is passed via "-s" for every run
        cmd.set_option("-s", random.randint(0, 2 ** 16 - 1))

        if self._supervised:
            cmd.set_option("--supervised")

        cmd.add_value("%(TEMP_OUT_FILE_BED)s")
        cmd.add_value(int(k_groups))

        CommandNode.__init__(self,
                             description="<Admixture -> '%s.*''>"
                             % (output_prefix,),
                             command=cmd.finalize(),
                             dependencies=dependencies)

Esempio n. 22

0

Mostra file

File: nuclear.py Progetto: jelber2/paleomix

    def __init__(self, input_file, k_groups, output_root,
                 samples=None, dependencies=()):
        """Configure the 'admixture' command for `input_file` and K = `k_groups`.

        Only K = 2 and K = 3 are accepted (enforced by an assertion). The
        ".bim" and ".fam" files next to `input_file` are registered as
        inputs, and the ".P", ".Q" and ".log" files named
        "<output_root>/<prefix>.<K>" are registered as outputs.

        `self._supervised` is truthy when `samples` is non-empty and at
        least one row has a "Group(<K>)" value different from '-'; in that
        case "--supervised" is passed to admixture.
        """
        self._samples = samples
        self._input_file = input_file
        self._k_groups = k_groups

        group_key = "Group(%i)" % (self._k_groups,)
        # dict.itervalues() was removed in Python 3; values() is available
        # on both Python 2 and 3 and yields the same rows.
        self._supervised = samples and any((row[group_key] != '-')
                                           for row in samples.values())

        assert k_groups in (2, 3), k_groups
        prefix = os.path.splitext(os.path.basename(input_file))[0]
        output_prefix = os.path.join(output_root,
                                     "%s.%i" % (prefix, k_groups))

        cmd = AtomicCmdBuilder(
            "admixture",
            IN_FILE_BED=input_file,
            IN_FILE_BIM=fileutils.swap_ext(input_file, ".bim"),
            IN_FILE_FAM=fileutils.swap_ext(input_file, ".fam"),

            TEMP_OUT_FILE_BED=prefix + ".bed",
            TEMP_OUT_FILE_BIM=prefix + ".bim",
            TEMP_OUT_FILE_FAM=prefix + ".fam",
            TEMP_OUT_FILE_POP=prefix + ".pop",

            OUT_P=output_prefix + ".P",
            OUT_Q=output_prefix + ".Q",
            OUT_STDOUT=output_prefix + ".log",

            CHECK_VERSION=ADMIXTURE_VERSION,
            set_cwd=True)

        # The value given to "-s" is drawn at random for each node
        cmd.set_option("-s", random.randint(0, 2 ** 16 - 1))

        if self._supervised:
            cmd.set_option("--supervised")

        cmd.add_value("%(TEMP_OUT_FILE_BED)s")
        cmd.add_value(int(k_groups))

        CommandNode.__init__(self,
                             description="<Admixture -> '%s.*''>"
                             % (output_prefix,),
                             command=cmd.finalize(),
                             dependencies=dependencies)

Esempio n. 23

0

Mostra file

    def _teardown(self, config, temp):
        """Remove `self.PIPE_FILE` and its index (if any) from `temp`.

        The regular CommandNode teardown is run afterwards.
        """
        pipe_path = os.path.join(temp, self.PIPE_FILE)
        os.remove(pipe_path)

        index_format = self._index_format
        if index_format:
            index_path = os.path.join(temp,
                                      swap_ext(self.PIPE_FILE, index_format))
            os.remove(index_path)

        CommandNode._teardown(self, config, temp)

Esempio n. 24

0

Mostra file

File: gatk.py Progetto: jelber2/paleomix

def _set_input_files(command, input_files):
    """Add one "-I" option per input file and register the files on `command`.

    For the file at position N, the option value refers to the key
    "IN_BAMFILE_NN"; the file itself is registered under that key, and the
    same path with its extension swapped for ".bai" under "IN_BAIFILE_NN".
    All keys are passed to `command.set_kwargs` in a single call.
    """
    registered = {}
    for position, bam_path in enumerate(input_files):
        command.add_option("-I", "%%(IN_BAMFILE_%02i)s" % position)

        registered.update({
            "IN_BAMFILE_%02i" % position: bam_path,
            "IN_BAIFILE_%02i" % position: swap_ext(bam_path, ".bai"),
        })

    command.set_kwargs(**registered)

Esempio n. 25

0

Mostra file

def _build_examl_bootstraps(options, phylo, destination, input_alignment,
                            input_partition, dependencies):
    """Build the node chain for every ExaML bootstrap replicate.

    The number of replicates is read from ``phylo["ExaML"]["Bootstraps"]``.
    For each replicate three steps are chained: a PHYLIP bootstrap node
    (with a random seed), an ExaML parser node producing a ".binary" file,
    and the nodes returned by ``_examl_nodes``. Outputs are written to
    "<destination>/bootstraps/bootstrap.NNNN.*".

    Returns the result of ``_build_rerooted_trees`` over all replicates,
    or None when no replicates were requested.
    """
    replicate_count = phylo["ExaML"]["Bootstraps"]
    output_dir = os.path.join(destination, "bootstraps")
    alignment_template = os.path.join(output_dir, "bootstrap.%04i.phy")

    replicate_nodes = []
    for replicate in range(replicate_count):
        alignment = alignment_template % (replicate, )

        seed = random.randint(1, 2**32 - 1)
        resample_node = PHYLIPBootstrapNode(
            input_alignment=input_alignment,
            input_partition=input_partition,
            output_alignment=alignment,
            seed=seed,
            dependencies=dependencies,
        )

        binary_file = swap_ext(alignment, ".binary")
        # The "%s" placeholder is left in the name for _examl_nodes to fill.
        output_template = swap_ext(alignment, ".%s")

        parser_node = ExaMLParserNode(
            input_alignment=alignment,
            input_partition=input_partition,
            output_file=binary_file,
            dependencies=resample_node,
        )

        examl_node = _examl_nodes(
            options=options,
            settings=phylo,
            input_alignment=alignment,
            input_partitions=input_partition,
            input_binary=binary_file,
            output_template=output_template,
            dependencies=parser_node,
        )
        replicate_nodes.append(examl_node)

    if not replicate_nodes:
        return None

    return _build_rerooted_trees(replicate_nodes, phylo["RootTreesOn"])

Esempio n. 26

0

Mostra file

File: picard.py Progetto: muslih14/paleomix

    def customize(cls, config, input_bam, output_log=None, dependencies=()):
        """Prepare a Picard "ValidateSamFile" command for ``input_bam``.

        STDOUT is captured in ``output_log``; when that is not given, the
        name of the BAM with its extension swapped for ".validated" is used.

        Returns a dict with the command builder ("command") and the
        unmodified ``dependencies`` ("dependencies").
        """
        validate_cmd = picard_command(config, "ValidateSamFile")
        validate_cmd.set_option("I", "%(IN_BAM)s", sep="=")

        if not output_log:
            output_log = swap_ext(input_bam, ".validated")

        validate_cmd.set_kwargs(IN_BAM=input_bam, OUT_STDOUT=output_log)

        customized = {
            "command": validate_cmd,
            "dependencies": dependencies,
        }
        return customized

Esempio n. 27

0

Mostra file

File: picard.py Progetto: muslih14/paleomix

    def customize(cls, config, reference, dependencies=()):
        """Prepare a Picard "CreateSequenceDictionary" command.

        The command reads the reference through a temporary file named after
        the reference's basename and writes the dictionary to the reference
        path with its extension swapped for ".dict".

        Returns a dict with the command builder ("command") and the
        unmodified ``dependencies`` ("dependencies").
        """
        dict_cmd = picard_command(config, "CreateSequenceDictionary")

        dict_cmd.set_option("R", "%(TEMP_OUT_REF)s", sep="=")
        dict_cmd.set_option("O", "%(OUT_DICT)s", sep="=")

        temp_reference = os.path.basename(reference)
        dictionary = swap_ext(reference, ".dict")
        dict_cmd.set_kwargs(IN_REF=reference,
                            TEMP_OUT_REF=temp_reference,
                            OUT_DICT=dictionary)

        customized = {
            "command": dict_cmd,
            "dependencies": dependencies,
        }
        return customized

Esempio n. 28

0

Mostra file

File: picard.py Progetto: muslih14/paleomix

    def _setup(self, config, temp_root):
        """Create the BAM input pipe (or symlink) inside ``temp_root``.

        After the base ``CommandNode._setup`` has run:
          * with more than one input file, a FIFO is created at the pipe path;
          * otherwise the single input file is symlinked to the pipe path and,
            if the input is flagged as indexed, its ".bai" file is symlinked
            next to it as "<pipe>.bai".
        """
        CommandNode._setup(self, config, temp_root)

        pipe_path = os.path.join(temp_root, self._bam_input.pipe)
        if len(self._bam_input.files) > 1:
            os.mkfifo(pipe_path)
            return

        # Exactly one file is expected here; unpacking fails otherwise.
        (source_bam, ) = self._bam_input.files
        os.symlink(os.path.join(os.getcwd(), source_bam), pipe_path)

        if self._bam_input.indexed:
            source_index = os.path.join(os.getcwd(),
                                        swap_ext(source_bam, ".bai"))
            os.symlink(source_index, pipe_path + ".bai")

Esempio n. 29

0

Mostra file

    def __init__(self,
                 input_file,
                 k_groups,
                 output_root,
                 groups,
                 dependencies=()):
        """Set up a supervised "admixture" run on ``input_file``.

        The ".bim" and ".fam" companions of the input are located by swapping
        the extension of ``input_file``. Results (".P", ".Q" and ".log") are
        written to "<output_root>/<input basename>.<k_groups>.*". The
        command runs with a random seed and the "--supervised" flag; the
        ".pop" file is declared as a temporary output.
        """
        self._groups = groups
        self._input_file = input_file

        basename = os.path.basename(input_file)
        prefix = os.path.splitext(basename)[0]
        output_prefix = os.path.join(output_root,
                                     "%s.%i" % (prefix, k_groups))

        # Input, temporary and final files handled by the command.
        command_files = {
            "IN_FILE_BED": input_file,
            "IN_FILE_BIM": fileutils.swap_ext(input_file, ".bim"),
            "IN_FILE_FAM": fileutils.swap_ext(input_file, ".fam"),
            "TEMP_OUT_FILE_BED": prefix + ".bed",
            "TEMP_OUT_FILE_BIM": prefix + ".bim",
            "TEMP_OUT_FILE_FAM": prefix + ".fam",
            "TEMP_OUT_FILE_POP": prefix + ".pop",
            "OUT_P": output_prefix + ".P",
            "OUT_Q": output_prefix + ".Q",
            "OUT_STDOUT": output_prefix + ".log",
        }

        admixture = AtomicCmdBuilder("admixture",
                                     CHECK_VERSION=ADMIXTURE_VERSION,
                                     set_cwd=True,
                                     **command_files)

        seed = random.randint(0, 2**16 - 1)
        admixture.set_option("-s", seed)
        admixture.set_option("--supervised")

        admixture.add_value("%(TEMP_OUT_FILE_BED)s")
        admixture.add_value(int(k_groups))

        description = "<Admixture -> '%s.*''>" % (output_prefix, )
        CommandNode.__init__(self,
                             description=description,
                             command=admixture.finalize(),
                             dependencies=dependencies)

Esempio n. 30

0

Mostra file

File: picard.py Progetto: muslih14/paleomix

    def customize(cls, config, input_bams, output_bam, dependencies=()):
        """Prepare a Picard "MergeSamFiles" command for ``input_bams``.

        The merged BAM is written to ``output_bam`` with index creation
        enabled; the index is registered as the output path with its
        extension swapped for ".bai". The sort order defaults to
        "coordinate" and is set with ``fixed=False``.

        Returns a dict with the command builder ("command") and the
        unmodified ``dependencies`` ("dependencies").
        """
        merge_cmd = picard_command(config, "MergeSamFiles")

        merge_cmd.set_option("OUTPUT", "%(OUT_BAM)s", sep="=")
        merge_cmd.set_option("CREATE_INDEX", "True", sep="=")
        merge_cmd.set_option("SO", "coordinate", sep="=", fixed=False)
        merge_cmd.add_multiple_options("I", input_bams, sep="=")

        output_index = swap_ext(output_bam, ".bai")
        merge_cmd.set_kwargs(OUT_BAM=output_bam, OUT_BAI=output_index)

        customized = {
            "command": merge_cmd,
            "dependencies": dependencies,
        }
        return customized

Esempio n. 31

0

Mostra file

File: picard.py Progetto: muslih14/paleomix

    def customize(cls, config, input_bams, output_bam, output_metrics=None,
                  keep_dupes=False, dependencies=()):
        """Prepare a Picard "MarkDuplicates" command for ``input_bams``.

        A ".bai" index is created alongside ``output_bam``. Metrics go to
        ``output_metrics``, defaulting to the output BAM path with its
        extension swapped for ".metrics". Unless ``keep_dupes`` is true,
        duplicates are removed from the output.

        Returns a dict with the command builder ("command") and the
        unmodified ``dependencies`` ("dependencies").
        """
        markdup_cmd = picard_command(config, "MarkDuplicates")

        # Many downstream programs require a .bai index, so always build one
        markdup_cmd.set_option("CREATE_INDEX", "True", sep="=")

        markdup_cmd.set_option("OUTPUT", "%(OUT_BAM)s", sep="=")
        markdup_cmd.set_option("METRICS_FILE", "%(OUT_METRICS)s", sep="=")
        markdup_cmd.add_multiple_options("I", input_bams, sep="=")

        if not keep_dupes:
            # Dropping duplicates by default saves disk-space
            markdup_cmd.set_option("REMOVE_DUPLICATES", "True",
                                   sep="=", fixed=False)

        if not output_metrics:
            output_metrics = swap_ext(output_bam, ".metrics")

        output_index = swap_ext(output_bam, ".bai")
        markdup_cmd.set_kwargs(OUT_BAM=output_bam,
                               OUT_BAI=output_index,
                               OUT_METRICS=output_metrics)

        customized = {
            "command": markdup_cmd,
            "dependencies": dependencies,
        }
        return customized

Esempio n. 32

0

Mostra file

File: database.py Progetto: MikkelSchubert/paleomix

def _validate_mito_bam(data, handle, info):
    """Check a BAM against the mitochondrial sequences in the database.

    Only the first BAM contig that is also present in ``data.mitochondria``
    is examined, since every path through the loop body returns. For that
    contig:
      * the BAM contig length must equal the database sequence length with
        "-" characters excluded, otherwise an error is printed and False
        is returned;
      * the BAM is indexed via pysam if neither "<bam>.bai" nor the
        extension-swapped ".bai" exists;
      * if idxstats reports zero hits for the contig, a warning is printed
        and True is returned without updating ``info``;
      * otherwise ``info.mt_contig``, ``info.mt_length`` and
        ``info.mt_padding`` are set.

    Returns True in every case except the length mismatch.
    """
    mito_records = data.mitochondria
    if mito_records is None:
        # Without mitochondrial data there is no phylogeny to prepare
        return True

    references = handle.references
    min_length = min(len(record.sequence)
                     for record in mito_records.itervalues())

    for bam_contig, bam_length in zip(references, handle.lengths):
        if bam_contig not in mito_records:
            continue

        db_sequence = mito_records[bam_contig].sequence
        db_length = len(db_sequence) - db_sequence.count("-")

        if bam_length != db_length:
            message = ("ERROR: Length of mitochondrial contig %r (%i bp) "
                       "does not match the length of the corresponding "
                       "sequence in the database (%i bp)"
                       % (bam_contig, bam_length, db_length))
            print_err(message)
            return False

        # Either naming convention for the index is accepted
        has_index = (os.path.exists(handle.filename + '.bai')
                     or os.path.exists(swap_ext(handle.filename, '.bai')))
        if not has_index:
            print_info('    - Attempting to index BAM file %r!'
                       % (handle.filename,))
            pysam.index(handle.filename)

        # pysam < 0.9 returns a list here, pysam >= 0.9 returns a str
        idxstats = "".join(pysam.idxstats(handle.filename))
        for row in idxstats.split('\n'):
            row = row.strip()
            if not row:
                continue

            name, _, hits, _ = row.split('\t')
            if name == bam_contig and int(hits) == 0:
                print_err("WARNING: Mitochondrial BAM (%r) does not contain "
                          "any reads aligned to contig %r; inferring an "
                          "phylogeny is not possible."
                          % (handle.filename, name))
                return True

        info.mt_contig = bam_contig
        info.mt_length = bam_length
        info.mt_padding = len(db_sequence) - min_length

        return True

    return True

Esempio n. 33

0

Mostra file

File: database.py Progetto: jelber2/paleomix

def _validate_mito_bam(data, handle, info):
    """Validate a BAM file against the database's mitochondrial sequences.

    The first BAM contig found in ``data.mitochondria`` decides the outcome
    (each pass through the loop body ends in a return):
      * a mismatch between the BAM contig length and the database sequence
        length (not counting "-" characters) prints an error and returns
        False;
      * a missing ".bai" index (checked under both "<bam>.bai" and the
        extension-swapped name) triggers indexing through pysam;
      * zero idxstats hits for the contig prints a warning and returns True
        with ``info`` left untouched;
      * otherwise ``info.mt_contig``, ``info.mt_length`` and
        ``info.mt_padding`` are filled in.

    Returns True unless the lengths mismatch.
    """
    mitochondria = data.mitochondria
    if mitochondria is None:
        # Nothing to validate; the phylogeny step is skipped
        return True

    references = handle.references
    sequence_lengths = (len(record.sequence)
                        for record in mitochondria.itervalues())
    min_length = min(sequence_lengths)

    for bam_contig, bam_length in zip(references, handle.lengths):
        if bam_contig not in mitochondria:
            continue

        db_sequence = mitochondria[bam_contig].sequence
        db_length = len(db_sequence) - db_sequence.count("-")

        if bam_length != db_length:
            print_err("ERROR: Length of mitochondrial contig %r (%i bp) "
                      "does not match the length of the corresponding "
                      "sequence in the database (%i bp)"
                      % (bam_contig, bam_length, db_length))
            return False

        # Accept the index under either naming convention
        index_found = os.path.exists(handle.filename + '.bai') \
            or os.path.exists(swap_ext(handle.filename, '.bai'))
        if not index_found:
            print_info('    - Attempting to index BAM file %r!'
                       % (handle.filename, ))
            pysam.index(handle.filename)

        # pysam < 0.9 returns a list here, pysam >= 0.9 returns a str
        stats_text = "".join(pysam.idxstats(handle.filename))
        for entry in stats_text.split('\n'):
            entry = entry.strip()
            if not entry:
                continue

            name, _, hits, _ = entry.split('\t')
            if name == bam_contig and int(hits) == 0:
                print_err("WARNING: Mitochondrial BAM (%r) does not contain "
                          "any reads aligned to contig %r; inferring an "
                          "phylogeny is not possible."
                          % (handle.filename, name))
                return True

        info.mt_contig = bam_contig
        info.mt_length = bam_length
        info.mt_padding = len(db_sequence) - min_length

        return True

    return True

Esempio n. 34

0

Mostra file

File: lane.py Progetto: jelber2/paleomix

    def _build_bwa_backtrack_se(self, config, prefix, record, parameters):
        """Build the single-end BWA backtrack nodes (aln followed by samse).

        "input_file" and "output_file" are popped from ``parameters``; the
        intermediate ".sai" file is named after the output BAM with its
        extension swapped. The samse node depends on the aln node, and the
        result is passed through ``self._finalize_nodes``.
        """
        fastq = parameters.pop("input_file")
        bam = parameters.pop("output_file")
        sai = swap_ext(bam, ".sai")

        aln_node = self._build_bwa_backtrack_aln(
            parameters=parameters,
            input_file=fastq,
            output_file=sai,
        )

        samse_node = BWASamse.customize(
            input_file_fq=fastq,
            input_file_sai=sai,
            output_file=bam,
            prefix=parameters["prefix"],
            reference=parameters["reference"],
            dependencies=aln_node,
        )

        return self._finalize_nodes(config, prefix, parameters, samse_node)

Esempio n. 35

0

Mostra file

File: picard.py Progetto: MikkelSchubert/paleomix

    def __init__(self, config, reference, dependencies=()):
        """Set up a Picard "CreateSequenceDictionary" node for ``reference``.

        The absolute path of the reference is stored on the instance. The
        command reads from a temporary file named after the reference's
        basename and writes the dictionary to the reference path with its
        extension swapped for ".dict".
        """
        self._in_reference = os.path.abspath(reference)

        dict_cmd = picard_command(config, "CreateSequenceDictionary")
        dict_cmd.set_option("R", "%(TEMP_OUT_REF)s", sep="=")
        dict_cmd.set_option("O", "%(OUT_DICT)s", sep="=")

        temp_reference = os.path.basename(reference)
        dictionary = swap_ext(reference, ".dict")
        dict_cmd.set_kwargs(IN_REFERENCE=reference,
                            TEMP_OUT_REF=temp_reference,
                            OUT_DICT=dictionary)

        description = "<SequenceDictionary: '%s'>" % (reference,)
        command = dict_cmd.finalize()

        PicardNode.__init__(self,
                            command=command,
                            description=description,
                            dependencies=dependencies)

Esempio n. 36

0

Mostra file

    def __init__(self, config, reference, dependencies=()):
        """Create the node that builds a sequence dictionary for ``reference``.

        Stores the absolute reference path, then configures Picard's
        "CreateSequenceDictionary": "R" points at a temporary file named
        after the reference's basename, and "O" at the reference path with
        its extension swapped for ".dict".
        """
        self._in_reference = os.path.abspath(reference)

        picard = picard_command(config, "CreateSequenceDictionary")
        picard.set_option("R", "%(TEMP_OUT_REF)s", sep="=")
        picard.set_option("O", "%(OUT_DICT)s", sep="=")

        command_files = {
            "IN_REFERENCE": reference,
            "TEMP_OUT_REF": os.path.basename(reference),
            "OUT_DICT": swap_ext(reference, ".dict"),
        }
        picard.set_kwargs(**command_files)

        description = "<SequenceDictionary: '%s'>" % (reference, )
        command = picard.finalize()

        PicardNode.__init__(self,
                            command=command,
                            description=description,
                            dependencies=dependencies)

Esempio n. 37

0

Mostra file

    def _build_bwa_backtrack_se(self, config, prefix, record, parameters):
        """Build the single-end BWA backtrack nodes (aln followed by samse).

        "input_file" and "output_file" are popped from ``parameters``; the
        intermediate ".sai" file is named after the output BAM with its
        extension swapped. The returned ``BWASamse`` node depends on the aln
        node and is given the BWA aligner and cleanup options.
        """
        fastq = parameters.pop("input_file")
        bam = parameters.pop("output_file")
        sai = swap_ext(bam, ".sai")

        aln_node = self._build_bwa_backtrack_aln(
            parameters=parameters,
            input_file=fastq,
            output_file=sai,
        )

        samse_node = BWASamse(
            input_file_fq=fastq,
            input_file_sai=sai,
            output_file=bam,
            prefix=parameters["prefix"],
            reference=parameters["reference"],
            mapping_options=self.options["Aligners"]["BWA"],
            cleanup_options=self._cleanup_options("BWA"),
            dependencies=aln_node,
        )
        return samse_node

Esempio n. 38

0

Mostra file

File: picard.py Progetto: MikkelSchubert/paleomix

    def __init__(self, config, input_bam, input_index=None, output_log=None,
                 ignored_checks=(), dependencies=()):
        """Set up a Picard "ValidateSamFile" node for ``input_bam``.

        Each entry of ``ignored_checks`` is passed as an "IGNORE" option.
        STDOUT is captured in ``output_log``, which defaults to the BAM path
        with its extension swapped for ".validated". ``input_index`` is
        registered as an input of the command.
        """
        validate_cmd = picard_command(config, "ValidateSamFile")
        _set_max_open_files(validate_cmd, "MAX_OPEN_TEMP_FILES")

        validate_cmd.set_option("I", "%(IN_BAM)s", sep="=")
        for check_name in ignored_checks:
            validate_cmd.add_option("IGNORE", check_name, sep="=")

        if not output_log:
            output_log = swap_ext(input_bam, ".validated")

        validate_cmd.set_kwargs(IN_BAM=input_bam,
                                IN_INDEX=input_index,
                                OUT_STDOUT=output_log)

        description = "<Validate BAM: '%s'>" % (input_bam,)
        command = validate_cmd.finalize()

        PicardNode.__init__(self,
                            command=command,
                            description=description,
                            dependencies=dependencies)

Esempio n. 39

0

Mostra file

File: statistics.py Progetto: jelber2/paleomix

def _build_coverage_nodes_cached(files_and_nodes, target_name, roi_name,
                                 roi_filename, cache):
    """Return coverage nodes for every input file, reusing cached nodes.

    Output names are the input names with the extension swapped for
    ".coverage", or ".<roi_name>.coverage" when ``roi_name`` is set. Nodes
    are cached in ``cache`` under ``(roi_filename, input_filename)``; a new
    ``CoverageNode`` is only created when that key is missing.

    Returns a dict mapping each output filename to its node.
    """
    if roi_name:
        output_ext = ".%s.coverage" % roi_name
    else:
        output_ext = ".coverage"

    coverages = {}
    for (bam_filename, bam_node) in files_and_nodes.iteritems():
        coverage_filename = swap_ext(bam_filename, output_ext)
        cache_key = (roi_filename, bam_filename)

        if cache_key in cache:
            coverage_node = cache[cache_key]
        else:
            coverage_node = CoverageNode(input_file=bam_filename,
                                         output_file=coverage_filename,
                                         target_name=target_name,
                                         regions_file=roi_filename,
                                         dependencies=bam_node)
            cache[cache_key] = coverage_node

        coverages[coverage_filename] = coverage_node

    return coverages

Esempio n. 40

0

Mostra file

File: statistics.py Progetto: MikkelSchubert/paleomix

def _build_coverage_nodes_cached(files_and_nodes, target_name,
                                 roi_name, roi_filename, cache):
    """Build (or fetch from ``cache``) one coverage node per input file.

    The output file for each input is the input name with its extension
    swapped for ".coverage" (or ".<roi_name>.coverage" if ``roi_name`` is
    given). ``cache`` is keyed on ``(roi_filename, input_filename)`` and is
    updated in place whenever a new ``CoverageNode`` has to be created.

    Returns a dict mapping output filenames to coverage nodes.
    """
    output_ext = (".%s.coverage" % roi_name) if roi_name else ".coverage"

    result = {}
    for (source, dependency) in files_and_nodes.iteritems():
        destination = swap_ext(source, output_ext)
        key = (roi_filename, source)

        if key not in cache:
            node = CoverageNode(input_file=source,
                                output_file=destination,
                                target_name=target_name,
                                regions_file=roi_filename,
                                dependencies=dependency)
            cache[key] = node

        result[destination] = cache[key]

    return result

Esempio n. 41

0

Mostra file

File: samtools.py Progetto: muslih14/paleomix

    def __init__(self, infile, dependencies=()):
        """Set up a node that builds a ".bai" index for ``infile``.

        Three commands are run in sequence:
          1. "ln -s" links the BAM to a temporary file with the same
             basename (run with ``set_cwd=True``);
          2. "samtools index" is run on that temporary file;
          3. "mv" moves the temporary "<basename>.bai" to the input path
             with its extension swapped for ".bai".
        """
        link_name = os.path.basename(infile)
        index_name = link_name + ".bai"

        link_bam = AtomicCmd(["ln", "-s", "%(IN_BAM)s", "%(TEMP_OUT_BAM)s"],
                             IN_BAM=infile,
                             TEMP_OUT_BAM=link_name,
                             set_cwd=True)

        build_index = AtomicCmd(["samtools", "index", "%(TEMP_IN_BAM)s"],
                                TEMP_IN_BAM=link_name,
                                CHECK_SAM=SAMTOOLS_VERSION)

        move_index = AtomicCmd(["mv", "%(TEMP_IN_BAM)s", "%(OUT_BAM)s"],
                               TEMP_IN_BAM=index_name,
                               OUT_BAM=swap_ext(infile, ".bai"))

        steps = SequentialCmds((link_bam, build_index, move_index))
        description = "<BAMIndex: '%s'>" % (infile,)

        CommandNode.__init__(self,
                             description=description,
                             command=steps,
                             dependencies=dependencies)

Esempio n. 42

0

Mostra file

File: picard.py Progetto: MikkelSchubert/paleomix

    def customize(cls, config, input_bams, output_bam, output_metrics=None,
                  keep_dupes=False, dependencies=()):
        """Prepare a Picard "MarkDuplicates" command for ``input_bams``.

        Validation stringency is set to LENIENT. Metrics go to
        ``output_metrics``, defaulting to the output BAM path with its
        extension swapped for ".metrics". Unless ``keep_dupes`` is true,
        duplicates are removed from the output.

        Returns a dict with the command builder ("command") and the
        unmodified ``dependencies`` ("dependencies").
        """
        markdup_cmd = picard_command(config, "MarkDuplicates")
        _set_max_open_files(markdup_cmd, "MAX_FILE_HANDLES")

        markdup_cmd.set_option("OUTPUT", "%(OUT_BAM)s", sep="=")
        markdup_cmd.set_option("METRICS_FILE", "%(OUT_METRICS)s", sep="=")
        # Strict validation is left to manual ValidateSamFile runs, because
        # .csi indexed BAM records can have "invalid" bins.
        markdup_cmd.set_option("VALIDATION_STRINGENCY", "LENIENT", sep="=")
        markdup_cmd.add_multiple_options("I", input_bams, sep="=")

        if not keep_dupes:
            # Dropping duplicates by default saves disk-space
            markdup_cmd.set_option("REMOVE_DUPLICATES", "True",
                                   sep="=", fixed=False)

        if not output_metrics:
            output_metrics = swap_ext(output_bam, ".metrics")

        markdup_cmd.set_kwargs(OUT_BAM=output_bam,
                               OUT_METRICS=output_metrics)

        customized = {
            "command": markdup_cmd,
            "dependencies": dependencies,
        }
        return customized

Esempio n. 43

0

Mostra file

File: genotype.py Progetto: jelber2/paleomix

def build_sampling_nodes(options, genotyping, sample, regions, dependencies):
    """Build the node chain that samples a FASTA for ``sample`` from pileups.

    The chain is: pileup-only genotyping of the sample's BAM, tabix indexing
    of the pileup, ``SampleRegionsNode`` writing the FASTA, and FASTA
    indexing. The BAM is expected at
    "<options.samples_root>/<sample>.<regions["Prefix"]>.bam", with a
    ".realigned" postfix added when ``regions["Realigned"]`` is set.

    Returns a one-element tuple holding the final (FASTA index) node.
    """
    fasta_file = regions["Genotypes"][sample]
    pileup_file = swap_ext(fasta_file, ".pileup.bgz")

    slop, regions_nodes = build_regions_nodes(regions,
                                              genotyping["Padding"],
                                              dependencies)

    bam_name = "%s.%s.bam" % (sample, regions["Prefix"])
    bam_file = os.path.join(options.samples_root, bam_name)
    if regions["Realigned"]:
        bam_file = add_postfix(bam_file, ".realigned")
    bai_node = build_bam_index_node(bam_file)

    pileup_builder = GenotypeRegionsNode.customize(
        pileup_only=True,
        reference=regions["FASTA"],
        bedfile=slop,
        infile=bam_file,
        outfile=pileup_file,
        nbatches=options.samtools_max_threads,
        dependencies=regions_nodes + (bai_node,),
    )
    apply_samtools_options(pileup_builder.command, genotyping["MPileup"],
                           "--mpileup-argument")
    pileup_node = pileup_builder.build_node()

    tabix_node = TabixIndexNode(infile=pileup_file,
                                preset="pileup",
                                dependencies=pileup_node)

    sampling_node = SampleRegionsNode(infile=pileup_file,
                                      bedfile=regions["BED"],
                                      outfile=fasta_file,
                                      dependencies=tabix_node)

    faidx_node = FastaIndexNode(infile=fasta_file,
                                dependencies=sampling_node)

    return (faidx_node,)

Esempio n. 44

0

Mostra file

File: commands.py Progetto: jelber2/paleomix

    def __init__(self,
                 config,
                 target_name,
                 input_files,
                 output_file,
                 prefix,
                 regions_file=None,
                 dependencies=()):
        """Set up a "depths" command reading BAMs through the shared pipe.

        ``input_files`` is coerced to a tuple and registered on the command.
        When ``regions_file`` is given, it is passed via "--regions-file"
        and an index for the pipe file (using ``prefix['IndexFormat']``) is
        declared as a temporary input; without it no index format is used.
        """
        input_files = safe_coerce_to_tuple(input_files)
        # Mirrors `regions_file and prefix[...]`: falsy regions pass through
        index_format = prefix['IndexFormat'] if regions_file else regions_file

        depths = factory.new("depths")
        depths.add_value("%(TEMP_IN_BAM)s")
        depths.add_value("%(OUT_FILE)s")
        depths.set_option("--target-name", target_name)
        depths.set_kwargs(OUT_FILE=output_file,
                          TEMP_IN_BAM=MultiBAMInputNode.PIPE_FILE)
        depths.add_multiple_kwargs(input_files)

        if regions_file:
            pipe_index = swap_ext(MultiBAMInputNode.PIPE_FILE, index_format)

            depths.set_option('--regions-file', '%(IN_REGIONS)s')
            depths.set_kwargs(IN_REGIONS=regions_file,
                              TEMP_IN_INDEX=pipe_index)

        description = "<DepthHistogram: %s -> '%s'>" % (
            describe_files(input_files),
            output_file,
        )

        MultiBAMInputNode.__init__(self,
                                   config=config,
                                   input_bams=input_files,
                                   index_format=index_format,
                                   command=depths.finalize(),
                                   description=description,
                                   dependencies=dependencies)

Esempio n. 45

0

Mostra file

def test_swap_ext__empty_ext_vs_new_ext():
    """A name without extension gets the new one, with the dot added."""
    swapped = swap_ext("name", "bar")
    assert_equal(swapped, "name.bar")

Esempio n. 46

0

Mostra file

File: genotype.py Progetto: jelber2/paleomix

def build_genotyping_nodes_cached(options, genotyping, sample, regions,
                                  dependencies):
    """Genotype a sample, filter the calls and index the results.

    If the option 'GenotypeEntirePrefix' is enabled, the BAM is genotyped
    once, and each set of RegionsOfInterest simply extracts the relevant
    regions while the consensus sequence is built.

    Parameters:
        options: An options object (c.f. paleomix.tools.phylo_pipeline.config).
        genotyping: Genotyping options defined for a specific set of areas of
                    interest, corresponding to Genotyping:NAME in the makefile.
        sample: The name of the sample to be genotyped.
        regions: A dictionary for a 'RegionsOfInterest' from the makefile.
        dependencies: Dependencies that must be met before genotyping starts.

    Returns a tuple containing the filename of the filtered and tabix-indexed
    VCF file, and the top-level node generating this file. Results are cached
    on (BAM file, output prefix), so repeated calls for the same BAM and
    prefix return the same VCF and node; with 'GenotypeEntirePrefix' enabled
    this means one genotyping run per prefix, otherwise each ROI is genotyped
    individually.

    Output files are generated in ./results/PROJECT/genotyping. With
    'GenotypeEntirePrefix' enabled, the following files are generated:
        SAMPLE.PREFIX.vcf.bgz: Unfiltered calls for variant/non-variant sites.
        SAMPLE.PREFIX.vcf.pileup.bgz: Pileup of sites containing SNPs.
        SAMPLE.PREFIX.vcf.pileup.bgz.tbi: Tabix index of the pileup.
        SAMPLE.PREFIX.filtered.vcf.bgz: Variant calls filtered with vcf_filter.
        SAMPLE.PREFIX.filtered.vcf.bgz.tbi: Tabix index for the filtered VCF.

    Without 'GenotypeEntirePrefix', the equivalent files are generated per
    ROI (see descriptions above):
        SAMPLE.PREFIX.ROI.filtered.vcf.bgz
        SAMPLE.PREFIX.ROI.filtered.vcf.bgz.tbi
        SAMPLE.PREFIX.ROI.vcf.bgz
        SAMPLE.PREFIX.ROI.vcf.pileup.bgz
        SAMPLE.PREFIX.ROI.vcf.pileup.bgz.tbi

    In addition, the following files are generated for each set of
    RegionsOfInterest (ROI), regardless of the 'GenotypeEntirePrefix' option:
        SAMPLE.PREFIX.ROI.CDS.fasta: FASTA sequence of each feature in the ROI.
        SAMPLE.PREFIX.ROI.CDS.fasta.fai: FASTA index generated using SAMTools.

    """
    bedfile_nodes = build_genotyping_bedfile_nodes(options, genotyping,
                                                   sample, regions,
                                                   dependencies)
    output_prefix, bamfile, bedfile, dependencies = bedfile_nodes

    cache_key = (bamfile, output_prefix)
    if cache_key in _VCF_CACHE:
        return _VCF_CACHE[cache_key]

    calls = swap_ext(output_prefix, ".vcf.bgz")
    pileups = swap_ext(output_prefix, ".vcf.pileup.bgz")
    filtered = swap_ext(output_prefix, ".filtered.vcf.bgz")

    # Step 1: run samtools mpileup | bcftools view on the BAM
    genotype_builder = GenotypeRegionsNode.customize(
        reference=regions["FASTA"],
        bedfile=bedfile,
        infile=bamfile,
        outfile=calls,
        nbatches=options.samtools_max_threads,
        dependencies=dependencies,
    )
    apply_samtools_options(genotype_builder.command, genotyping["MPileup"],
                           "--mpileup-argument")
    apply_samtools_options(genotype_builder.command, genotyping["BCFTools"],
                           "--bcftools-argument")
    genotype_node = genotype_builder.build_node()

    # Step 2: gather pileups for sites with SNPs. The VCF only counts the
    #         major non-ref allele (c.f. field DP4), so the pileup is needed
    #         to filter properly on the frequency of the minor allele.
    pileup_builder = VCFPileupNode.customize(
        reference=regions["FASTA"],
        infile_bam=bamfile,
        infile_vcf=calls,
        outfile=pileups,
        dependencies=genotype_node,
    )
    apply_samtools_options(pileup_builder.command, genotyping["MPileup"],
                           "--mpileup-argument")
    pileup_node = pileup_builder.build_node()

    pileup_tabix = TabixIndexNode(infile=pileups,
                                  preset="pileup",
                                  dependencies=pileup_node)

    # Step 3: filter every site with the 'vcf_filter' command
    filter_builder = VCFFilterNode.customize(
        infile=calls,
        pileup=pileups,
        outfile=filtered,
        regions=regions,
        dependencies=pileup_tabix,
    )
    filter_node = _apply_vcf_filter_options(filter_builder, genotyping,
                                            sample)

    # Step 4: tabix-index the filtered VCF, giving random access to it when
    #         the consensus FASTA sequence is built later in the pipeline.
    tabix = TabixIndexNode(infile=filtered,
                           preset="vcf",
                           dependencies=filter_node)

    _VCF_CACHE[cache_key] = (filtered, tabix)
    return filtered, tabix

Esempio n. 47

0

Mostra file

File: pipeline.py Progetto: tmancill/paleomix

def finalize_run_config(parser, args):
    """Validate and complete the parsed arguments of a run/dryrun command.

    Parameters:
        parser: The argument parser; only used to print the usage message
                when the number of files is wrong.
        args: The parsed arguments. The attributes ``multisample``,
              ``destination`` and ``samples`` are set on it (the latter
              directly, or via ``_read_sample_table`` for sample tables),
              and ``files`` gains a default destination when only one
              filename was given.

    Returns ``args`` on success, or None if validation failed; in the latter
    case the problem has been logged or the usage has been printed.

    Raises RuntimeError if the number of files is outside 1..3 for a command
    other than "run" / "dryrun".
    """
    log = logging.getLogger(__name__)
    if args.command in ("run", "dryrun") and not (1 <= len(args.files) <= 3):
        parser.print_usage()
        return None

    args.multisample = False

    known_samples = set(args.database.samples) | set(("Sample", ))
    unknown_samples = set(args.treemix_outgroup) - known_samples
    if unknown_samples:
        # NOTE: The comma after the format string is essential; without it
        # the literal is implicitly joined to the following ", " and becomes
        # the receiver of .join(), which garbles the logged message.
        log.error(
            "Argument --treemix-outgroup includes unknown sample(s): %s; known "
            "samples are %s. Note that names are case-sensitive.",
            ", ".join(map(repr, sorted(unknown_samples))),
            ", ".join(map(repr, sorted(known_samples))),
        )
        return None

    if len(args.files) == 1:
        # Default destination: the input name with a ".zonkey" extension
        args.files.append(fileutils.swap_ext(args.files[0], ".zonkey"))

    if len(args.files) == 2:
        filename, args.destination = args.files

        if os.path.exists(
                args.destination) and not os.path.isdir(args.destination):
            log.error("Destination %r is not a directory", args.destination)
            return None
        elif not os.path.isfile(filename):
            log.error("Not a valid filename: %r", filename)
            return None
        elif _is_bamfile(filename):
            args.samples = {
                "-": {
                    "Root": args.destination,
                    "Files": [filename]
                }
            }
        else:
            # Anything that is not a BAM is treated as a table of samples
            args.multisample = True
            if not _read_sample_table(args, filename):
                return None
    elif len(args.files) == 3:
        filename_1, filename_2, args.destination = args.files

        args.samples = {
            "-": {
                "Root": args.destination,
                "Files": [filename_1, filename_2]
            }
        }
    else:
        raise RuntimeError("Unexpected number of arguments: %r" %
                           (args.files, ))

    # Identify (mito or nuc?) and validate BAM files provided by user
    if not _process_samples(args):
        return None

    return args

Esempio n. 48

0

Mostra file

def test_swap_ext__dot_ext_vs_new_dot_ext():
    """A new extension given with its dot is appended without doubling it."""
    swapped = swap_ext("name", ".bar")
    assert_equal(swapped, "name.bar")

Esempio n. 49

0

Mostra file

                  "names are case-sensitive." %
                  (", ".join(map(repr, sorted(unknown_samples))), ", ".join(
                      map(repr, sorted(known_samples)))))
        return

    if config.command in ("mito", "example"):
        if len(args) != 2:
            print_err("ERROR: Wrong number of arguments!")
            print_usage()
            return

        config.destination = args[1]
        config.samples = {}
    elif len(args) == 2:
        filename = args[1]
        config.destination = fileutils.swap_ext(filename, ".zonkey")

        if not os.path.isfile(filename):
            print_err("ERROR: Not a valid filename: %r" % (filename, ))
            return
        elif _is_bamfile(filename):
            # Called as either of
            #   zonkey run <SampleDB> <nuclear.bam>
            #   zonkey run <SampleDB> <mitochondrial.bam>
            config.samples = {
                "-": {
                    "Root": config.destination,
                    "Files": [filename]
                }
            }
        else:

Esempio n. 50

0

Mostra file

File: common.py Progetto: MikkelSchubert/paleomix

def parse_arguments(argv, ext):
    """Parse the command-line arguments of a 'paleomix <ext>' sub-command.

    Arguments:
      argv -- List of command-line arguments, excluding the program name.
      ext  -- Extension used for the default output filename; it is also
              used (stripped of dots) as the sub-command name in the usage
              and help texts. Presumably includes the leading dot (the
              usage line is built as "out%s" % ext) -- TODO confirm.

    Returns the argparse namespace, extended with:
      outfile            -- defaults to 'infile' with its extension swapped
                            for 'ext' when not given by the user.
      target_name        -- defaults to "<STDIN>" when 'infile' is '-', and
                            otherwise to the basename of 'infile'.
      get_readgroup_func -- _get_readgroup_ignored if --ignore-readgroups
                            was set, otherwise _get_readgroup.

    Terminates via parser.error if the output file already exists and
    --overwrite-output was not specified; this check is skipped when the
    output is '-' (STDOUT), since no file is written in that case.
    """
    prog = "paleomix %s" % (ext.strip("."),)
    usage = "%s [options] sorted.bam [out%s]" % (prog, ext)
    parser = argparse.ArgumentParser(prog=prog, usage=usage)

    parser.add_argument("infile", metavar="BAM",
                        help="Filename of a sorted BAM file. If set to '-' "
                             "the file is read from STDIN.")
    parser.add_argument("outfile", metavar="OUTPUT", nargs='?',
                        help="Filename of output table; defaults to name of "
                             "the input BAM with a '%s' extension. If "
                             "set to '-' the table is printed to STDOUT."
                             % (ext,))
    parser.add_argument("--target-name", default=None, metavar="NAME",
                        help="Name used for 'Target' column; defaults to the "
                             "filename of the BAM file.")
    parser.add_argument("--regions-file", default=None, dest="regions_fpath",
                        help="BED file containing regions of interest; %s "
                             "is calculated only for these grouping by the "
                             "name used in the BED file, or the contig name "
                             "if no name has been specified for a record."
                             % (ext.strip("."),))
    parser.add_argument('--max-contigs', default=100, type=int,
                        help="The maximum number of contigs allowed in a BAM "
                             "file. If this number is exceeded, the entire "
                             "set of contigs is aggregated into one pseudo-"
                             "contig named '<Genome>'. This is done to "
                             "limit table sizes [default: %(default)s]")
    parser.add_argument('--ignore-readgroups',
                        default=False, action="store_true",
                        help="Ignore readgroup information in reads, and only "
                             "provide aggregated statistics; this is required "
                             "if readgroup information is missing or partial "
                             "[default: %(default)s]")
    parser.add_argument('--overwrite-output',
                        default=False, action="store_true",
                        help="Overwrite output file if it exists; by "
                             "default, the script will terminate if the file "
                             "already exists.")

    args = parser.parse_args(argv)
    if not args.outfile:
        # No explicit output; derive it from the input filename
        args.outfile = swap_ext(args.infile, ext)

    if args.ignore_readgroups:
        args.get_readgroup_func = _get_readgroup_ignored
    else:
        args.get_readgroup_func = _get_readgroup

    if not args.target_name:
        if args.infile == "-":
            args.target_name = "<STDIN>"
        else:
            args.target_name = os.path.basename(args.infile)

    # '-' means that the table is printed to STDOUT, so a file that happens
    # to be named '-' in the current directory must not block the run.
    writes_to_file = args.outfile != "-"
    if writes_to_file and not args.overwrite_output \
            and os.path.exists(args.outfile):
        parser.error("Destination filename already exists (%r); use option "
                     "--overwrite-output to allow overwriting of this file."
                     % (args.outfile,))

    return args

Esempio n. 51

0

Mostra file

File: common.py Progetto: tmancill/paleomix

def parse_arguments(argv, ext):
    """Parse the command-line arguments of a 'paleomix <ext>' sub-command.

    Arguments:
      argv -- List of command-line arguments, excluding the program name.
      ext  -- Extension used for the default output filename; it is also
              used (stripped of dots) as the sub-command name in the usage
              and help texts. Presumably includes the leading dot (the
              usage line is built as "out%s" % ext) -- TODO confirm.

    Returns the argparse namespace, extended with:
      outfile            -- defaults to 'infile' with its extension swapped
                            for 'ext' when not given by the user.
      target_name        -- defaults to "<STDIN>" when 'infile' is '-', and
                            otherwise to the basename of 'infile'.
      get_readgroup_func -- _get_readgroup_ignored if --ignore-readgroups
                            was set, otherwise _get_readgroup.

    Terminates via parser.error if the output file already exists and
    --overwrite-output was not specified; this check is skipped when the
    output is '-' (STDOUT), since no file is written in that case.
    """
    prog = "paleomix %s" % (ext.strip("."), )
    usage = "%s [options] sorted.bam [out%s]" % (prog, ext)
    parser = argparse.ArgumentParser(prog=prog, usage=usage)

    parser.add_argument(
        "infile",
        metavar="BAM",
        help="Filename of a sorted BAM file. If set to '-' "
        "the file is read from STDIN.",
    )
    parser.add_argument(
        "outfile",
        metavar="OUTPUT",
        nargs="?",
        help="Filename of output table; defaults to name of "
        "the input BAM with a '%s' extension. If "
        "set to '-' the table is printed to STDOUT." % (ext, ),
    )
    parser.add_argument(
        "--target-name",
        default=None,
        metavar="NAME",
        help="Name used for 'Target' column; defaults to the "
        "filename of the BAM file.",
    )
    parser.add_argument(
        "--regions-file",
        default=None,
        dest="regions_fpath",
        help="BED file containing regions of interest; %s "
        "is calculated only for these grouping by the "
        "name used in the BED file, or the contig name "
        "if no name has been specified for a record." % (ext.strip("."), ),
    )
    parser.add_argument(
        "--max-contigs",
        default=100,
        type=int,
        help="The maximum number of contigs allowed in a BAM "
        "file. If this number is exceeded, the entire "
        "set of contigs is aggregated into one pseudo-"
        "contig named '<Genome>'. This is done to "
        "limit table sizes [default: %(default)s]",
    )
    parser.add_argument(
        "--ignore-readgroups",
        default=False,
        action="store_true",
        help="Ignore readgroup information in reads, and only "
        "provide aggregated statistics; this is required "
        "if readgroup information is missing or partial "
        "[default: %(default)s]",
    )
    parser.add_argument(
        "--overwrite-output",
        default=False,
        action="store_true",
        help="Overwrite output file if it exists; by "
        "default, the script will terminate if the file "
        "already exists.",
    )

    args = parser.parse_args(argv)
    if not args.outfile:
        # No explicit output; derive it from the input filename
        args.outfile = swap_ext(args.infile, ext)

    if args.ignore_readgroups:
        args.get_readgroup_func = _get_readgroup_ignored
    else:
        args.get_readgroup_func = _get_readgroup

    if not args.target_name:
        if args.infile == "-":
            args.target_name = "<STDIN>"
        else:
            args.target_name = os.path.basename(args.infile)

    # '-' means that the table is printed to STDOUT, so a file that happens
    # to be named '-' in the current directory must not block the run.
    writes_to_file = args.outfile != "-"
    if (writes_to_file and not args.overwrite_output
            and os.path.exists(args.outfile)):
        parser.error("Destination filename already exists (%r); use option "
                     "--overwrite-output to allow overwriting of this file." %
                     (args.outfile, ))

    return args

Esempio n. 52

0

Mostra file

def test_swap_ext__has_ext_vs_new_ext():
    # The input filename already has an extension (".foo"), and the new
    # extension is given without a leading dot; the old extension is expected
    # to be replaced and a single dot to separate stem and new extension.
    expected = "name.bar"
    observed = swap_ext("name.foo", "bar")
    assert_equal(observed, expected)

Esempio n. 53

0

Mostra file

File: config.py Progetto: MikkelSchubert/paleomix

                  "names are case-sensitive."
                  % (", ".join(map(repr, sorted(unknown_samples))),
                     ", ".join(map(repr, sorted(known_samples)))))
        return

    if config.command in ("mito", "example"):
        if len(args) != 2:
            print_err("ERROR: Wrong number of arguments!")
            print_usage()
            return

        config.destination = args[1]
        config.samples = {}
    elif len(args) == 2:
        filename = args[1]
        config.destination = fileutils.swap_ext(filename, ".zonkey")

        if not os.path.isfile(filename):
            print_err("ERROR: Not a valid filename: %r" % (filename,))
            return
        elif _is_bamfile(filename):
            # Called as either of
            #   zonkey run <SampleDB> <nuclear.bam>
            #   zonkey run <SampleDB> <mitochondrial.bam>
            config.samples = {"-": {"Root": config.destination,
                                    "Files": [filename]}}
        else:
            config.multisample = True
            if not _read_sample_table(config, filename):
                return
    elif 3 <= len(args) <= 4: