コード例 #1
0
    def _build_bwa_backtrack_pe(self, config, prefix, record, parameters):
        """Build the nodes for paired-end mapping with the BWA backtrack mode.

        One 'aln' node is built per mate via '_build_bwa_backtrack_aln', and
        a BWASampe node depending on both of these is returned.

        Note that 'parameters' is modified in place: the "input_file" and
        "output_file" keys are popped. The "input_file" value is a template
        in which the '{Pair}' field is replaced by the mate number (1 or 2).
        The 'config', 'prefix' and 'record' arguments are not used here.
        """
        fastq_template = parameters.pop("input_file")
        bam_path = parameters.pop("output_file")

        sai_paths = []
        aln_nodes = []
        for mate in (1, 2):
            sai_path = swap_ext(bam_path, "%i.sai" % (mate, ))
            sai_paths.append(sai_path)
            aln_nodes.append(
                self._build_bwa_backtrack_aln(
                    parameters=parameters,
                    input_file=fastq_template.format(Pair=mate),
                    output_file=sai_path,
                )
            )

        sai_1, sai_2 = sai_paths
        return BWASampe(
            input_file_sai_1=sai_1,
            input_file_sai_2=sai_2,
            input_file_fq_1=fastq_template.format(Pair=1),
            input_file_fq_2=fastq_template.format(Pair=2),
            output_file=bam_path,
            prefix=parameters["prefix"],
            reference=parameters["reference"],
            mapping_options=self.options["Aligners"]["BWA"],
            cleanup_options=self._cleanup_options("BWA"),
            dependencies=tuple(aln_nodes),
        )
コード例 #2
0
ファイル: phylo.py プロジェクト: muslih14/paleomix
def _build_examl_bootstraps(options, phylo, destination, input_alignment, input_partition, dependencies):
    """Build the nodes needed to generate ExaML bootstrap trees.

    For each of the phylo["ExaML"]["Bootstraps"] replicates, a chain of
    nodes is created: a PHYLIPBootstrapNode writing the alignment
    '<destination>/bootstraps/bootstrap.NNNN.phy' using a random seed, an
    ExaMLParserNode writing the corresponding '.binary' file, and finally
    the result of '_examl_nodes' for that replicate.

    Returns the result of '_build_rerooted_trees' for the collected
    replicates and phylo["RootTreesOn"], or None if no replicates were
    requested.
    """
    bootstraps = []
    num_bootstraps = phylo["ExaML"]["Bootstraps"]
    bootstrap_destination = os.path.join(destination, "bootstraps")
    bootstrap_template = os.path.join(bootstrap_destination, "bootstrap.%04i.phy")

    # 'range' rather than 'xrange'; the latter only exists on Python 2
    for bootstrap_num in range(num_bootstraps):
        bootstrap_alignment = bootstrap_template % (bootstrap_num,)
        bootstrap = PHYLIPBootstrapNode(input_alignment=input_alignment,
                                        input_partition=input_partition,
                                        output_alignment=bootstrap_alignment,
                                        seed=random.randint(1, 2**32 - 1),
                                        dependencies=dependencies)

        bootstrap_binary = swap_ext(bootstrap_alignment, ".binary")
        # The '%s' in the extension is left as a placeholder in the template
        bootstrap_final = swap_ext(bootstrap_alignment, ".%s")
        bs_binary = ExaMLParserNode(input_alignment=bootstrap_alignment,
                                    input_partition=input_partition,
                                    output_file=bootstrap_binary,
                                    dependencies=bootstrap)

        bootstraps.append(_examl_nodes(options=options,
                                       settings=phylo,
                                       input_alignment=bootstrap_alignment,
                                       input_partitions=input_partition,
                                       input_binary=bootstrap_binary,
                                       output_template=bootstrap_final,
                                       dependencies=bs_binary))

    if bootstraps:
        return _build_rerooted_trees(bootstraps, phylo["RootTreesOn"])

    return None
コード例 #3
0
ファイル: nuclear.py プロジェクト: muslih14/paleomix
    def _setup(self, config, temp):
        """Prepare the temporary folder before the command is run.

        The input file and the corresponding '.bim' and '.fam' files are
        symlinked into 'temp'. In supervised mode a '.pop' file is also
        written, containing one line per line in the '.fam' file: the
        "Group(K)" value recorded for that sample, or "-" if there is none.
        """
        CommandNode._setup(self, config, temp)

        bim_filename = fileutils.swap_ext(self._input_file, ".bim")
        fam_filename = fileutils.swap_ext(self._input_file, ".fam")

        for source in (self._input_file, bim_filename, fam_filename):
            link_name = os.path.basename(source)
            os.symlink(os.path.abspath(source), os.path.join(temp, link_name))

        if not self._supervised:
            return

        pop_filename = fileutils.reroot_path(
            temp, fileutils.swap_ext(fam_filename, ".pop"))

        group_key = "Group(%i)" % (self._k_groups,)
        with open(fam_filename) as fam_handle, \
                open(pop_filename, "w") as pop_handle:
            for row in fam_handle:
                # The sample name is the first whitespace separated column
                sample, _rest = row.split(None, 1)
                sample_info = self._samples.get(sample, {})

                pop_handle.write("%s\n" % (sample_info.get(group_key, "-"),))
コード例 #4
0
    def _setup(self, config, temp):
        """Symlink the input files into 'temp' and write the '.pop' file.

        The '.pop' file is only written in supervised mode; for every line
        in the '.fam' file it holds the "Group(K)" value of the sample named
        in the first column, with "-" used for samples lacking a value.
        """
        CommandNode._setup(self, config, temp)

        fam_path = fileutils.swap_ext(self._input_file, ".fam")
        to_link = (
            self._input_file,
            fileutils.swap_ext(self._input_file, ".bim"),
            fam_path,
        )

        for path in to_link:
            name = os.path.basename(path)
            os.symlink(os.path.abspath(path), os.path.join(temp, name))

        if self._supervised:
            pop_path = fileutils.swap_ext(fam_path, ".pop")
            pop_path = fileutils.reroot_path(temp, pop_path)

            column = "Group(%i)" % (self._k_groups, )
            with open(fam_path) as fam_handle, open(pop_path, "w") as pop_handle:
                for fam_line in fam_handle:
                    sample, _rest = fam_line.split(None, 1)
                    label = self._samples.get(sample, {}).get(column, "-")

                    pop_handle.write("%s\n" % (label, ))
コード例 #5
0
    def _setup(self, config, temp):
        """Create the PIPE_FILE entry in 'temp' before the command is run.

        With more than one input BAM, PIPE_FILE is created as a FIFO. With a
        single input BAM, PIPE_FILE is instead a symlink to that file, and,
        if an index format is set, the index is symlinked next to it.
        """
        CommandNode._setup(self, config, temp)

        pipe_path = os.path.join(temp, self.PIPE_FILE)
        if len(self._input_bams) > 1:
            os.mkfifo(pipe_path)
            return

        bam_path = os.path.abspath(self._input_bams[0])
        os.symlink(bam_path, pipe_path)

        if self._index_format:
            index_source = swap_ext(bam_path, self._index_format)
            index_link = swap_ext(pipe_path, self._index_format)
            os.symlink(index_source, index_link)
コード例 #6
0
ファイル: picard.py プロジェクト: MikkelSchubert/paleomix
    def _setup(self, config, temp):
        """Set up PIPE_FILE in 'temp' as either a FIFO or a symlink.

        A FIFO is created if there are multiple input BAMs. Otherwise the
        single input BAM is symlinked as PIPE_FILE, along with its index
        file when 'self._index_format' is set.
        """
        CommandNode._setup(self, config, temp)

        destination = os.path.join(temp, self.PIPE_FILE)
        single_input = not (len(self._input_bams) > 1)

        if single_input:
            source = os.path.abspath(self._input_bams[0])
            os.symlink(source, destination)

            if self._index_format:
                extension = self._index_format
                os.symlink(swap_ext(source, extension),
                           swap_ext(destination, extension))
        else:
            os.mkfifo(destination)
コード例 #7
0
    def customize(cls,
                  config,
                  input_bams,
                  output_bam,
                  output_metrics=None,
                  keep_dupes=False,
                  dependencies=()):
        """Build the Picard MarkDuplicates command for the given BAM files.

        If 'output_metrics' is not given, the metrics are written next to
        'output_bam' using the '.metrics' extension. Unless 'keep_dupes' is
        set, duplicates are removed from the output.

        Returns a dict with the command builder ("command") and the
        unmodified 'dependencies' ("dependencies").
        """
        command = picard_command(config, "MarkDuplicates")
        _set_max_open_files(command, "MAX_FILE_HANDLES")

        command.set_option("OUTPUT", "%(OUT_BAM)s", sep="=")
        command.set_option("METRICS_FILE", "%(OUT_METRICS)s", sep="=")
        # Validation is mostly left to manual ValidateSamFile runs; required
        # because .csi indexed BAM records can have "invalid" bins.
        command.set_option("VALIDATION_STRINGENCY", "LENIENT", sep="=")
        command.add_multiple_options("I", input_bams, sep="=")

        if not keep_dupes:
            # Remove duplicates from output by default to save disk-space
            command.set_option("REMOVE_DUPLICATES", "True",
                               sep="=", fixed=False)

        if not output_metrics:
            output_metrics = swap_ext(output_bam, ".metrics")
        command.set_kwargs(OUT_BAM=output_bam, OUT_METRICS=output_metrics)

        result = {"command": command, "dependencies": dependencies}
        return result
コード例 #8
0
ファイル: picard.py プロジェクト: muslih14/paleomix
    def __init__(self, config, input_bams, pipename="input.bam", indexed=True):
        """Set up the commands and kwargs for reading one or more BAM files.

        With more than one input file, 'self.commands' holds a finalized
        Picard MergeSamFiles command that writes an uncompressed, coordinate
        sorted BAM to 'pipename'. With a single input file no command is
        needed; the file (and its '.bai' index, if 'indexed' is set) is
        instead recorded in 'self.kwargs' as input files.

        In both cases 'self.kwargs' contains TEMP_IN_BAM set to 'pipename'.
        """
        self.pipe = pipename
        self.indexed = indexed
        self.files = safe_coerce_to_tuple(input_bams)

        self.commands = []
        self.kwargs = {"TEMP_IN_BAM": self.pipe}
        if len(self.files) > 1:
            params = picard_command(config, "MergeSamFiles")

            params.set_option("SO", "coordinate", sep="=", fixed=False)
            params.set_option("CREATE_INDEX", "False", sep="=")
            params.set_option("COMPRESSION_LEVEL", 0, sep="=")
            params.set_option("OUTPUT", "%(TEMP_OUT_BAM)s", sep="=")
            # Use the tuple created above rather than 'input_bams', so that a
            # one-shot iterable is not iterated over a second time.
            params.add_multiple_options("I", self.files, sep="=")

            params.set_kwargs(TEMP_OUT_BAM=self.pipe)

            self.commands = [params.finalize()]
        else:
            # Ensure that the actual command depends on the input
            self.kwargs["IN_FILE_00"] = self.files[0]

            if indexed:
                self.kwargs["IN_FILE_01"] = swap_ext(self.files[0], ".bai")
コード例 #9
0
    def __init__(
            self,
            config,
            input_bam,
            input_index=None,
            output_log=None,
            ignored_checks=(),
            big_genome_mode=False,
            dependencies=(),
    ):
        """Create a node running Picard ValidateSamFile on 'input_bam'.

        Each value in 'ignored_checks' is passed as an IGNORE option. The
        output is written to 'output_log', which defaults to 'input_bam'
        with the extension replaced by '.validated'.
        """
        command = picard_command(config, "ValidateSamFile")
        _set_max_open_files(command, "MAX_OPEN_TEMP_FILES")

        # NOTE(review): the leading 'True' means that the big genome
        # configuration is applied regardless of the value of
        # 'big_genome_mode'; confirm that this is intended.
        if True or big_genome_mode:
            self._configure_for_big_genome(config, command)

        command.set_option("I", "%(IN_BAM)s", sep="=")
        for check_name in ignored_checks:
            command.add_option("IGNORE", check_name, sep="=")

        if not output_log:
            output_log = swap_ext(input_bam, ".validated")

        command.set_kwargs(
            IN_BAM=input_bam,
            IN_INDEX=input_index,
            OUT_STDOUT=output_log,
        )

        PicardNode.__init__(
            self,
            description="<Validate BAM: '%s'>" % (input_bam, ),
            command=command.finalize(),
            dependencies=dependencies,
        )
コード例 #10
0
ファイル: nuclear.py プロジェクト: muslih14/paleomix
    def __init__(self, output_root, table, bamfile, downsample,
                 dependencies=()):
        """Create a node running the 'build_tped' command on 'bamfile'.

        The command writes 'common.tfam', 'common.summary', 'incl_ts.tped'
        and 'excl_ts.tped' in 'output_root'. The '.bai' index of 'bamfile'
        is only registered as an input when 'downsample' is not set.
        """
        builder = factory.new("build_tped")
        builder.set_option("--name", "Sample")
        builder.set_option("--downsample", downsample)
        for value in ("%(TEMP_DIR)s", "%(IN_TABLE)s", "%(IN_BAM)s"):
            builder.add_value(value)

        if not downsample:
            # Needed for random access (chromosomes are read 1 ... 31)
            index_file = fileutils.swap_ext(bamfile, ".bai")
            builder.set_kwargs(IN_BAI=index_file)

        def _in_output_root(filename):
            return os.path.join(output_root, filename)

        builder.set_kwargs(
            OUT_TFAM=_in_output_root("common.tfam"),
            OUT_SUMMARY=_in_output_root("common.summary"),
            OUT_TPED_INCL_TS=_in_output_root("incl_ts.tped"),
            OUT_TPED_EXCL_TS=_in_output_root("excl_ts.tped"),
            IN_TABLE=table,
            IN_BAM=bamfile,
        )

        description = "<BuildTPEDFiles -> %r>" % (_in_output_root('*'),)
        CommandNode.__init__(self,
                             description=description,
                             command=builder.finalize(),
                             dependencies=dependencies)
コード例 #11
0
ファイル: gatk.py プロジェクト: jelber2/paleomix
    def __init__(self, config, reference, infiles, outfile,
                 threads=1, dependencies=()):
        """Create a node running the GATK RealignerTargetCreator.

        The intervals are written to 'outfile'. The number of threads used
        is the value returned by '_get_max_threads' for 'reference' and the
        requested number of 'threads'.
        """
        threads = _get_max_threads(reference, threads)
        infiles = safe_coerce_to_tuple(infiles)

        builder = AtomicJavaCmdBuilder(
            os.path.join(config.jar_root, "GenomeAnalysisTK.jar"),
            jre_options=config.jre_options)

        options = (
            ("-T", "RealignerTargetCreator"),
            ("-R", "%(IN_REFERENCE)s"),
            ("-o", "%(OUT_INTERVALS)s"),
            ("-nt", threads),
        )
        for key, value in options:
            builder.set_option(key, value)

        _set_input_files(builder, infiles)
        builder.set_kwargs(
            IN_REFERENCE=reference,
            IN_REF_DICT=fileutils.swap_ext(reference, ".dict"),
            OUT_INTERVALS=outfile,
            CHECK_GATK=_get_gatk_version_check(config),
        )

        CommandNode.__init__(
            self,
            threads=threads,
            description="<GATK Indel Realigner (training): %s -> %r>"
            % (describe_files(infiles), outfile),
            command=builder.finalize(),
            dependencies=dependencies,
        )
コード例 #12
0
ファイル: picard.py プロジェクト: MikkelSchubert/paleomix
    def _teardown(self, config, temp):
        """Remove PIPE_FILE (and its index, if any) from 'temp'.

        The files are removed before the regular CommandNode teardown.
        """
        pipe_path = os.path.join(temp, self.PIPE_FILE)
        os.remove(pipe_path)

        if self._index_format:
            index_name = swap_ext(self.PIPE_FILE, self._index_format)
            os.remove(os.path.join(temp, index_name))

        CommandNode._teardown(self, config, temp)
コード例 #13
0
    def __init__(self,
                 output_root,
                 table,
                 bamfile,
                 downsample,
                 dependencies=()):
        """Create a node running the 'zonkey_tped' command on 'bamfile'.

        Four files are written to 'output_root': 'common.tfam',
        'common.summary', 'incl_ts.tped' and 'excl_ts.tped'. The '.bai'
        index of 'bamfile' is registered as an input unless 'downsample'
        is set.
        """
        command = factory.new("zonkey_tped")
        command.set_option("--name", "Sample")
        command.set_option("--downsample", downsample)
        for positional in ("%(TEMP_DIR)s", "%(IN_TABLE)s", "%(IN_BAM)s"):
            command.add_value(positional)

        if not downsample:
            # Needed for random access (chromosomes are read 1 ... 31)
            command.set_kwargs(IN_BAI=fileutils.swap_ext(bamfile, ".bai"))

        tfam_path = os.path.join(output_root, "common.tfam")
        summary_path = os.path.join(output_root, "common.summary")
        incl_ts_path = os.path.join(output_root, "incl_ts.tped")
        excl_ts_path = os.path.join(output_root, "excl_ts.tped")
        command.set_kwargs(OUT_TFAM=tfam_path,
                           OUT_SUMMARY=summary_path,
                           OUT_TPED_INCL_TS=incl_ts_path,
                           OUT_TPED_EXCL_TS=excl_ts_path,
                           IN_TABLE=table,
                           IN_BAM=bamfile)

        output_pattern = os.path.join(output_root, '*')
        CommandNode.__init__(self,
                             description="<BuildTPEDFiles -> %r>"
                             % (output_pattern, ),
                             command=command.finalize(),
                             dependencies=dependencies)
コード例 #14
0
ファイル: lane.py プロジェクト: jelber2/paleomix
    def _build_bwa_backtrack_pe(self, config, prefix, record, parameters):
        """Build the nodes for paired-end mapping with the BWA backtrack mode.

        An 'aln' node is built for each mate, followed by a customized
        BWASampe node depending on both; the result of passing the latter
        to '_finalize_nodes' is returned.

        Note that the "input_file" and "output_file" keys are popped from
        'parameters'. The "input_file" value is a template in which the
        '{Pair}' field is replaced by the mate number. 'record' is not used.
        """
        fastq_template = parameters.pop("input_file")
        bam_path = parameters.pop("output_file")

        def _align_mate(mate):
            # Builds the 'aln' node for one mate; returns (.sai path, node)
            fastq_path = fastq_template.format(Pair=mate)
            sai_path = swap_ext(bam_path, "%i.sai" % (mate,))
            node = self._build_bwa_backtrack_aln(parameters=parameters,
                                                 input_file=fastq_path,
                                                 output_file=sai_path)
            return sai_path, node

        alignments = [_align_mate(mate) for mate in (1, 2)]
        sai_1, sai_2 = [sai_path for sai_path, _ in alignments]
        aln_nodes = [node for _, node in alignments]

        sampe = BWASampe.customize(
            input_file_sai_1=sai_1,
            input_file_sai_2=sai_2,
            input_file_fq_1=fastq_template.format(Pair=1),
            input_file_fq_2=fastq_template.format(Pair=2),
            output_file=bam_path,
            prefix=parameters['prefix'],
            reference=parameters["reference"],
            dependencies=aln_nodes,
        )

        return self._finalize_nodes(config, prefix, parameters, sampe)
コード例 #15
0
ファイル: makefile.py プロジェクト: muslih14/paleomix
    def _collect_subsets(roi, subset, path):
        """Read the sequence names of a subset of a region of interest.

        The names are read from the BED file of the region, with the
        extension replaced by '<subset>.names'; blank lines and lines
        starting with '#' are ignored. On success, the filename and the
        names are recorded in 'subsets_by_regions'.

        Raises MakefileError if the region is unknown, if the file does not
        exist, or if it contains names that are not known for the region.
        """
        if roi not in subsets_by_regions:
            raise MakefileError("Subset of unknown region (%r) requested at %r"
                                % (roi, path))

        region = subsets_by_regions[roi]
        roi_fname = swap_ext(region["BED"], subset + ".names")
        if not os.path.isfile(roi_fname):
            raise MakefileError("Subset file does not exist for Regions Of "
                                "Interest:\n  Region = %r\n  Subset = %r\n"
                                "  Path   = %r"
                                % (roi, subset, roi_fname))

        with open(roi_fname) as handle:
            stripped = (line.strip() for line in handle)
            sequences = set(name for name in stripped
                            if name and not name.startswith("#"))

        unknown_seqs = sorted(sequences - region["Sequences"][None])
        if unknown_seqs:
            header = ("Unknown sequences in subset file:\n"
                      "  File   = %r\n  Region = %r\n  Subset = %r\n"
                      "  Unknown sequence names =") \
                      % (roi_fname, roi, subset)
            # Only the first few names are listed in the error message
            if len(unknown_seqs) > 5:
                unknown_seqs = unknown_seqs[:5] + ["..."]
            raise MakefileError("\n    - ".join([header] + unknown_seqs))

        region["SubsetFiles"][subset] = (roi_fname,)
        region["Sequences"][subset] = frozenset(sequences)
コード例 #16
0
ファイル: samtools.py プロジェクト: jelber2/paleomix
    def __init__(self, infile, index_format='.bai', dependencies=()):
        """Create a node indexing 'infile' using 'samtools index'.

        The BAM is symlinked under its own basename, indexed, and the
        resulting index is moved to 'infile' with the extension replaced by
        'index_format'. For '.csi' indexes the '-c' option is used.

        Raises ValueError if 'index_format' is neither '.bai' nor '.csi'.
        """
        basename = os.path.basename(infile)

        if index_format not in ('.bai', '.csi'):
            raise ValueError("Unknown format type %r; expected .bai or .csi" %
                             (index_format, ))

        samtools_call = ["samtools", "index"]
        if index_format == '.bai':
            samtools_version = SAMTOOLS_VERSION
        else:
            samtools_version = SAMTOOLS_VERSION_1x
            samtools_call.append("-c")
        samtools_call.append("%(TEMP_IN_BAM)s")

        cmd_link = AtomicCmd(["ln", "-s", "%(IN_BAM)s", "%(TEMP_OUT_BAM)s"],
                             IN_BAM=infile,
                             TEMP_OUT_BAM=basename,
                             set_cwd=True)

        cmd_index = AtomicCmd(samtools_call,
                              TEMP_IN_BAM=basename,
                              CHECK_SAM=samtools_version)

        cmd_rename = AtomicCmd(["mv", "%(TEMP_IN_BAM)s", "%(OUT_BAM)s"],
                               TEMP_IN_BAM=basename + index_format,
                               OUT_BAM=swap_ext(infile, index_format))

        pipeline = SequentialCmds((cmd_link, cmd_index, cmd_rename))
        format_name = index_format[1:].upper()

        CommandNode.__init__(self,
                             description="<BAMIndex (%s): '%s'>" %
                             (format_name, infile),
                             command=pipeline,
                             dependencies=dependencies)
コード例 #17
0
ファイル: samtools.py プロジェクト: MikkelSchubert/paleomix
    def __init__(self, infile, index_format='.bai', dependencies=()):
        """Create a node building a '.bai' or '.csi' index for 'infile'.

        Three commands are run in sequence: the BAM is symlinked under its
        own basename, 'samtools index' is run on the link (with '-c' for
        '.csi' indexes), and the index is moved to its final location, which
        is 'infile' with the extension replaced by 'index_format'.

        Raises ValueError for any other value of 'index_format'.
        """
        basename = os.path.basename(infile)

        is_bai = (index_format == '.bai')
        if not (is_bai or index_format == '.csi'):
            raise ValueError("Unknown format type %r; expected .bai or .csi"
                             % (index_format,))

        if is_bai:
            samtools_version = SAMTOOLS_VERSION
            samtools_options = []
        else:
            samtools_version = SAMTOOLS_VERSION_1x
            samtools_options = ["-c"]

        samtools_call = ["samtools", "index"]
        samtools_call.extend(samtools_options)
        samtools_call.append("%(TEMP_IN_BAM)s")

        commands = SequentialCmds((
            AtomicCmd(["ln", "-s", "%(IN_BAM)s", "%(TEMP_OUT_BAM)s"],
                      IN_BAM=infile,
                      TEMP_OUT_BAM=basename,
                      set_cwd=True),
            AtomicCmd(samtools_call,
                      TEMP_IN_BAM=basename,
                      CHECK_SAM=samtools_version),
            AtomicCmd(["mv", "%(TEMP_IN_BAM)s", "%(OUT_BAM)s"],
                      TEMP_IN_BAM=basename + index_format,
                      OUT_BAM=swap_ext(infile, index_format)),
        ))

        description = "<BAMIndex (%s): '%s'>" \
            % (index_format[1:].upper(), infile)
        CommandNode.__init__(self,
                             description=description,
                             command=commands,
                             dependencies=dependencies)
コード例 #18
0
ファイル: nodes.py プロジェクト: muslih14/paleomix
def index_and_validate_bam(config, prefix, node, log_file=None,
                           create_index=True):
    """Build a validation node for the BAM file produced by 'node'.

    If the file has no index and 'create_index' is set, a BAMIndexNode is
    inserted between 'node' and the validation node. The MATE_NOT_FOUND and
    INVALID_QUALITY_FORMAT checks are ignored during validation.

    Returns the node built from the customized ValidateBAMNode parameters.
    The 'prefix' argument is currently unused (see the FIXME below).
    """
    input_file, has_index = _get_input_file(node)

    upstream = node
    if create_index and not has_index:
        upstream = BAMIndexNode(infile=input_file,
                                dependencies=node)

    validation = ValidateBAMNode.customize(config=config,
                                           input_bam=input_file,
                                           output_log=log_file,
                                           dependencies=upstream)

    # Ensure that the validation node is re-run if the index changes
    if has_index or create_index:
        validation.command.set_kwargs(IN_BAI=swap_ext(input_file, ".bai"))

    # Check MD tags against reference sequence
    # FIXME: Disabled due to issues with Picard/Samtools disagreeing,
    #   backwards compatibility. See the discussion at
    #     http://sourceforge.net/mailarchive/message.php?msg_id=31348639
    # validation.command.set_kwargs(IN_REF=prefix["Reference"])
    # validation.command.add_option("R", "%(IN_REF)s", sep="=")

    # Ignored since we may filter out misses and low-quality hits during
    # mapping, which leads to a large proportion of missing PE mates.
    validation.command.add_option("IGNORE", "MATE_NOT_FOUND", sep="=")
    # Ignored due to high rate of false positives for lanes with few hits,
    # where high-quality reads may cause mis-identification of qualities
    validation.command.add_option("IGNORE", "INVALID_QUALITY_FORMAT",
                                  sep="=")

    return validation.build_node()
コード例 #19
0
ファイル: makefile.py プロジェクト: jelber2/paleomix
    def _collect_subsets(roi, subset, path):
        """Load and record the sequence names belonging to a subset.

        The subset file is located by replacing the extension of the BED
        file of region 'roi' with '<subset>.names'. Every non-empty line
        that does not start with '#' is taken to be a sequence name.

        Raises MakefileError if 'roi' is not a known region, if the subset
        file is missing, or if it names sequences unknown for the region.
        """
        if roi not in subsets_by_regions:
            raise MakefileError("Subset of unknown region (%r) requested at %r"
                                % (roi, path))

        regions = subsets_by_regions[roi]
        roi_fname = swap_ext(regions["BED"], subset + ".names")
        if not os.path.isfile(roi_fname):
            raise MakefileError("Subset file does not exist for Regions Of "
                                "Interest:\n  Region = %r\n  Subset = %r\n"
                                "  Path   = %r"
                                % (roi, subset, roi_fname))

        sequences = set()
        with open(roi_fname) as handle:
            for raw_line in handle:
                name = raw_line.strip()
                if not name or name.startswith("#"):
                    continue

                sequences.add(name)

        unrecognized = sequences - regions["Sequences"][None]
        if unrecognized:
            listed = sorted(unrecognized)
            # Limit the number of names included in the error message
            if len(listed) > 5:
                listed = listed[:5] + ["..."]

            summary = ("Unknown sequences in subset file:\n"
                       "  File   = %r\n  Region = %r\n  Subset = %r\n"
                       "  Unknown sequence names =") \
                       % (roi_fname, roi, subset)
            raise MakefileError("\n    - ".join([summary] + listed))

        regions["SubsetFiles"][subset] = (roi_fname,)
        regions["Sequences"][subset] = frozenset(sequences)
コード例 #20
0
ファイル: gatk.py プロジェクト: jelber2/paleomix
    def __init__(self, config, reference, intervals, infiles, outfile,
                 dependencies=()):
        """Create a node running the GATK IndelRealigner and samtools calmd.

        The two commands are run in parallel: GATK is set up to write an
        uncompressed, unindexed BAM to 'outfile', while 'samtools calmd'
        reads a temporary file named after the basename of 'outfile' and
        writes to a temporary file with '.calmd' appended to that name.
        """
        self._basename = os.path.basename(outfile)

        infiles = safe_coerce_to_tuple(infiles)
        realigner = AtomicJavaCmdBuilder(
            os.path.join(config.jar_root, "GenomeAnalysisTK.jar"),
            jre_options=config.jre_options)

        options = (
            ("-T", "IndelRealigner"),
            ("-R", "%(IN_REFERENCE)s"),
            ("-targetIntervals", "%(IN_INTERVALS)s"),
            ("-o", "%(OUT_BAMFILE)s"),
            ("--bam_compression", 0),
            ("--disable_bam_indexing",),
        )
        for option in options:
            realigner.set_option(*option)
        _set_input_files(realigner, infiles)

        realigner.set_kwargs(
            IN_REFERENCE=reference,
            IN_REF_DICT=fileutils.swap_ext(reference, ".dict"),
            IN_INTERVALS=intervals,
            OUT_BAMFILE=outfile,
            CHECK_GATK=_get_gatk_version_check(config),
        )

        calmd_call = ["samtools", "calmd", "-b",
                      "%(TEMP_IN_BAM)s", "%(IN_REF)s"]
        calmd = AtomicCmd(calmd_call,
                          TEMP_IN_BAM=self._basename,
                          IN_REF=reference,
                          TEMP_OUT_STDOUT=self._basename + ".calmd",
                          CHECK_VERSION=SAMTOOLS_VERSION)

        CommandNode.__init__(
            self,
            description="<GATK Indel Realigner (aligning): %s -> %r>"
            % (describe_files(infiles), outfile),
            command=ParallelCmds([realigner.finalize(), calmd]),
            dependencies=dependencies,
        )
コード例 #21
0
ファイル: nuclear.py プロジェクト: muslih14/paleomix
    def __init__(self, input_file, k_groups, output_root,
                 samples=None, dependencies=()):
        """Create a node running 'admixture' on 'input_file' for K groups.

        The '.bim' and '.fam' files are expected next to 'input_file'. The
        '.P', '.Q' and '.log' outputs are written to 'output_root', named
        '<basename of input_file without extension>.<k_groups>'.

        The '--supervised' option is used if 'samples' is given and at
        least one of its rows has a "Group(K)" value other than '-'.
        'k_groups' must be 2 or 3 (checked with an assertion).
        """
        self._samples = samples
        self._input_file = input_file
        self._k_groups = k_groups

        group_key = "Group(%i)" % (self._k_groups,)
        # 'values()' is used since 'itervalues()' only exists on Python 2;
        # the result is always a bool, also when 'samples' is None or empty.
        self._supervised = bool(samples) and any(
            row[group_key] != '-' for row in samples.values())

        assert k_groups in (2, 3), k_groups
        prefix = os.path.splitext(os.path.basename(input_file))[0]
        output_prefix = os.path.join(output_root,
                                     "%s.%i" % (prefix, k_groups))

        cmd = AtomicCmdBuilder("admixture",
                               IN_FILE_BED=input_file,
                               IN_FILE_BIM=fileutils.swap_ext(input_file,
                                                              ".bim"),
                               IN_FILE_FAM=fileutils.swap_ext(input_file,
                                                              ".fam"),

                               TEMP_OUT_FILE_BED=prefix + ".bed",
                               TEMP_OUT_FILE_BIM=prefix + ".bim",
                               TEMP_OUT_FILE_FAM=prefix + ".fam",
                               TEMP_OUT_FILE_POP=prefix + ".pop",

                               OUT_P=output_prefix + ".P",
                               OUT_Q=output_prefix + ".Q",
                               OUT_STDOUT=output_prefix + ".log",

                               CHECK_VERSION=ADMIXTURE_VERSION,
                               set_cwd=True)

        # A random value is passed via the '-s' option for every node
        cmd.set_option("-s", random.randint(0, 2 ** 16 - 1))

        if self._supervised:
            cmd.set_option("--supervised")

        cmd.add_value("%(TEMP_OUT_FILE_BED)s")
        cmd.add_value(int(k_groups))

        CommandNode.__init__(self,
                             description="<Admixture -> '%s.*''>"
                             % (output_prefix,),
                             command=cmd.finalize(),
                             dependencies=dependencies)
コード例 #22
0
ファイル: nuclear.py プロジェクト: jelber2/paleomix
    def __init__(self, input_file, k_groups, output_root,
                 samples=None, dependencies=()):
        """Create a node running 'admixture' on 'input_file' for K groups.

        The '.bim' and '.fam' files are expected next to 'input_file'. The
        '.P', '.Q' and '.log' outputs are written to 'output_root', named
        '<basename of input_file without extension>.<k_groups>'.

        The '--supervised' option is used if 'samples' is given and at
        least one of its rows has a "Group(K)" value other than '-'.
        'k_groups' must be 2 or 3 (checked with an assertion).
        """
        self._samples = samples
        self._input_file = input_file
        self._k_groups = k_groups

        group_key = "Group(%i)" % (self._k_groups,)
        # 'values()' is used since 'itervalues()' only exists on Python 2;
        # the result is always a bool, also when 'samples' is None or empty.
        self._supervised = bool(samples) and any(
            row[group_key] != '-' for row in samples.values())

        assert k_groups in (2, 3), k_groups
        prefix = os.path.splitext(os.path.basename(input_file))[0]
        output_prefix = os.path.join(output_root,
                                     "%s.%i" % (prefix, k_groups))

        cmd = AtomicCmdBuilder("admixture",
                               IN_FILE_BED=input_file,
                               IN_FILE_BIM=fileutils.swap_ext(input_file,
                                                              ".bim"),
                               IN_FILE_FAM=fileutils.swap_ext(input_file,
                                                              ".fam"),

                               TEMP_OUT_FILE_BED=prefix + ".bed",
                               TEMP_OUT_FILE_BIM=prefix + ".bim",
                               TEMP_OUT_FILE_FAM=prefix + ".fam",
                               TEMP_OUT_FILE_POP=prefix + ".pop",

                               OUT_P=output_prefix + ".P",
                               OUT_Q=output_prefix + ".Q",
                               OUT_STDOUT=output_prefix + ".log",

                               CHECK_VERSION=ADMIXTURE_VERSION,
                               set_cwd=True)

        # A random value is passed via the '-s' option for every node
        cmd.set_option("-s", random.randint(0, 2 ** 16 - 1))

        if self._supervised:
            cmd.set_option("--supervised")

        cmd.add_value("%(TEMP_OUT_FILE_BED)s")
        cmd.add_value(int(k_groups))

        CommandNode.__init__(self,
                             description="<Admixture -> '%s.*''>"
                             % (output_prefix,),
                             command=cmd.finalize(),
                             dependencies=dependencies)
コード例 #23
0
    def _teardown(self, config, temp):
        """Delete PIPE_FILE, and its index if one is used, from 'temp'.

        This is done before handing over to the CommandNode teardown.
        """
        def _remove_from_temp(filename):
            os.remove(os.path.join(temp, filename))

        _remove_from_temp(self.PIPE_FILE)
        if self._index_format:
            _remove_from_temp(swap_ext(self.PIPE_FILE, self._index_format))

        CommandNode._teardown(self, config, temp)
コード例 #24
0
ファイル: gatk.py プロジェクト: jelber2/paleomix
def _set_input_files(command, input_files):
    """Add each file in 'input_files' to 'command' as an '-I' option.

    For the file at position N, the option value is the placeholder
    '%(IN_BAMFILE_NN)s'; the filename is registered under that key and the
    filename with a '.bai' extension under the key 'IN_BAIFILE_NN'. All
    keys are passed to 'command.set_kwargs' in a single call.
    """
    file_kwargs = {}
    for index, bam_path in enumerate(input_files):
        bam_key = "IN_BAMFILE_%02i" % index
        bai_key = "IN_BAIFILE_%02i" % index

        command.add_option("-I", "%(" + bam_key + ")s")
        file_kwargs[bam_key] = bam_path
        file_kwargs[bai_key] = swap_ext(bam_path, ".bai")

    command.set_kwargs(**file_kwargs)
コード例 #25
0
def _build_examl_bootstraps(options, phylo, destination, input_alignment,
                            input_partition, dependencies):
    """Build the nodes for every ExaML bootstrap replicate.

    The number of replicates is read from phylo["ExaML"]["Bootstraps"]. Each
    replicate is written to DESTINATION/bootstraps/bootstrap.NNNN.phy by a
    PHYLIPBootstrapNode (seeded with a random integer), converted by an
    ExaMLParserNode, and finally handed to _examl_nodes.

    Returns the result of _build_rerooted_trees for the collected replicate
    nodes, or None if no replicates were requested.
    """
    replicate_count = phylo["ExaML"]["Bootstraps"]
    alignment_template = os.path.join(destination, "bootstraps",
                                      "bootstrap.%04i.phy")

    replicates = []
    for replicate_index in range(replicate_count):
        alignment = alignment_template % (replicate_index, )
        resample_node = PHYLIPBootstrapNode(
            input_alignment=input_alignment,
            input_partition=input_partition,
            output_alignment=alignment,
            seed=random.randint(1, 2**32 - 1),
            dependencies=dependencies,
        )

        binary = swap_ext(alignment, ".binary")
        # The ".%s" extension leaves a placeholder in the filename; it is
        # passed on as the output template for the ExaML nodes.
        output_template = swap_ext(alignment, ".%s")
        parser_node = ExaMLParserNode(
            input_alignment=alignment,
            input_partition=input_partition,
            output_file=binary,
            dependencies=resample_node,
        )

        examl_node = _examl_nodes(
            options=options,
            settings=phylo,
            input_alignment=alignment,
            input_partitions=input_partition,
            input_binary=binary,
            output_template=output_template,
            dependencies=parser_node,
        )
        replicates.append(examl_node)

    if not replicates:
        return None

    return _build_rerooted_trees(replicates, phylo["RootTreesOn"])
コード例 #26
0
ファイル: picard.py プロジェクト: muslih14/paleomix
    def customize(cls, config, input_bam, output_log=None, dependencies=()):
        """Prepare a Picard "ValidateSamFile" command for 'input_bam'.

        STDOUT is captured in 'output_log'; if no log is given, it defaults
        to the name of the input BAM with a ".validated" extension.

        Returns a dict with the command builder ("command") and the given
        dependencies ("dependencies").
        """
        builder = picard_command(config, "ValidateSamFile")
        builder.set_option("I", "%(IN_BAM)s", sep="=")

        if not output_log:
            output_log = swap_ext(input_bam, ".validated")

        builder.set_kwargs(IN_BAM=input_bam, OUT_STDOUT=output_log)

        return {
            "command": builder,
            "dependencies": dependencies,
        }
コード例 #27
0
ファイル: picard.py プロジェクト: muslih14/paleomix
    def customize(cls, config, reference, dependencies=()):
        """Prepare a Picard "CreateSequenceDictionary" command.

        The "R" option points at a temporary file named after the basename
        of 'reference', and the dictionary ("O") is written next to the
        reference, with the extension swapped for ".dict".

        Returns a dict with the command builder ("command") and the given
        dependencies ("dependencies").
        """
        builder = picard_command(config, "CreateSequenceDictionary")
        builder.set_option("R", "%(TEMP_OUT_REF)s", sep="=")
        builder.set_option("O", "%(OUT_DICT)s", sep="=")

        file_kwargs = {
            "IN_REF": reference,
            "TEMP_OUT_REF": os.path.basename(reference),
            "OUT_DICT": swap_ext(reference, ".dict"),
        }
        builder.set_kwargs(**file_kwargs)

        return {
            "command": builder,
            "dependencies": dependencies,
        }
コード例 #28
0
ファイル: picard.py プロジェクト: muslih14/paleomix
    def _setup(self, config, temp_root):
        """Run CommandNode._setup, then create the BAM input path inside
        'temp_root'.

        With more than one input file, a FIFO is created at the pipe path.
        With a single input file, the pipe path is a symlink to that file
        (resolved against the current working directory), and, if the input
        is flagged as indexed, a second symlink with a ".bai" suffix points
        at the corresponding ".bai" file.
        """
        CommandNode._setup(self, config, temp_root)

        bam_input = self._bam_input
        pipe_path = os.path.join(temp_root, bam_input.pipe)
        if len(bam_input.files) > 1:
            os.mkfifo(pipe_path)
            return

        (bam_fname, ) = bam_input.files
        os.symlink(os.path.join(os.getcwd(), bam_fname), pipe_path)

        if bam_input.indexed:
            bai_fname = os.path.join(os.getcwd(),
                                     swap_ext(bam_fname, ".bai"))
            os.symlink(bai_fname, pipe_path + ".bai")
コード例 #29
0
    def __init__(self,
                 input_file,
                 k_groups,
                 output_root,
                 groups,
                 dependencies=()):
        """Build a supervised 'admixture' command for 'input_file'.

        The ".bim" and ".fam" inputs are derived from 'input_file' by
        swapping its extension. Outputs are written to
        OUTPUT_ROOT/PREFIX.K.{P,Q,log}, where PREFIX is the basename of the
        input file without its extension and K is 'k_groups'. A random seed
        is passed via "-s".
        """
        self._groups = groups
        self._input_file = input_file

        basename = os.path.basename(input_file)
        prefix, _ = os.path.splitext(basename)
        output_prefix = os.path.join(output_root,
                                     "%s.%i" % (prefix, k_groups))

        builder = AtomicCmdBuilder(
            "admixture",
            # Input files
            IN_FILE_BED=input_file,
            IN_FILE_BIM=fileutils.swap_ext(input_file, ".bim"),
            IN_FILE_FAM=fileutils.swap_ext(input_file, ".fam"),
            # Temporary files, named after the input prefix
            TEMP_OUT_FILE_BED=prefix + ".bed",
            TEMP_OUT_FILE_BIM=prefix + ".bim",
            TEMP_OUT_FILE_FAM=prefix + ".fam",
            TEMP_OUT_FILE_POP=prefix + ".pop",
            # Final output files
            OUT_P=output_prefix + ".P",
            OUT_Q=output_prefix + ".Q",
            OUT_STDOUT=output_prefix + ".log",
            CHECK_VERSION=ADMIXTURE_VERSION,
            set_cwd=True,
        )

        builder.set_option("-s", random.randint(0, 2**16 - 1))
        builder.set_option("--supervised")

        builder.add_value("%(TEMP_OUT_FILE_BED)s")
        builder.add_value(int(k_groups))

        description = "<Admixture -> '%s.*''>" % (output_prefix, )
        CommandNode.__init__(self,
                             description=description,
                             command=builder.finalize(),
                             dependencies=dependencies)
コード例 #30
0
ファイル: picard.py プロジェクト: muslih14/paleomix
    def customize(cls, config, input_bams, output_bam, dependencies=()):
        """Prepare a Picard "MergeSamFiles" command merging 'input_bams'
        into 'output_bam'.

        Index creation is enabled, and the ".bai" file (the output filename
        with its extension swapped) is registered as an output. The sort
        order defaults to "coordinate" and is set with fixed=False.

        Returns a dict with the command builder ("command") and the given
        dependencies ("dependencies").
        """
        builder = picard_command(config, "MergeSamFiles")

        builder.set_option("OUTPUT", "%(OUT_BAM)s", sep="=")
        builder.set_option("CREATE_INDEX", "True", sep="=")
        builder.set_option("SO", "coordinate", sep="=", fixed=False)
        builder.add_multiple_options("I", input_bams, sep="=")

        output_bai = swap_ext(output_bam, ".bai")
        builder.set_kwargs(OUT_BAM=output_bam, OUT_BAI=output_bai)

        return {
            "command": builder,
            "dependencies": dependencies,
        }
コード例 #31
0
ファイル: picard.py プロジェクト: muslih14/paleomix
    def customize(cls, config, input_bams, output_bam, output_metrics=None,
                  keep_dupes=False, dependencies=()):
        """Prepare a Picard "MarkDuplicates" command for 'input_bams'.

        The metrics file defaults to 'output_bam' with a ".metrics"
        extension. Unless 'keep_dupes' is set, duplicates are removed from
        the output rather than merely flagged.

        Returns a dict with the command builder ("command") and the given
        dependencies ("dependencies").
        """
        builder = picard_command(config, "MarkDuplicates")

        # A .bai index is generated, as many downstream programs require it
        builder.set_option("CREATE_INDEX", "True", sep="=")

        builder.set_option("OUTPUT", "%(OUT_BAM)s", sep="=")
        builder.set_option("METRICS_FILE", "%(OUT_METRICS)s", sep="=")
        builder.add_multiple_options("I", input_bams, sep="=")

        if not keep_dupes:
            # Dropping duplicates from the output is the default, since it
            # saves disk-space
            builder.set_option("REMOVE_DUPLICATES", "True",
                               sep="=", fixed=False)

        if not output_metrics:
            output_metrics = swap_ext(output_bam, ".metrics")

        builder.set_kwargs(
            OUT_BAM=output_bam,
            OUT_BAI=swap_ext(output_bam, ".bai"),
            OUT_METRICS=output_metrics,
        )

        return {
            "command": builder,
            "dependencies": dependencies,
        }
コード例 #32
0
ファイル: database.py プロジェクト: MikkelSchubert/paleomix
def _validate_mito_bam(data, handle, info):
    """Check a BAM handle against the mitochondrial sequences in 'data'.

    Only the first BAM contig that is also found in 'data.mitochondria' is
    examined. For that contig:
      - the BAM contig length must equal the database sequence length
        excluding "-" characters, otherwise an error is printed and False
        is returned;
      - the BAM is indexed using pysam if no '.bai' file is found;
      - if idxstats reports no hits for the contig, a warning is printed
        and True is returned without updating 'info';
      - otherwise 'info.mt_contig', 'info.mt_length' and 'info.mt_padding'
        are set and True is returned.

    True is also returned if there is no mitochondrial data, or if none of
    the BAM contigs are found in the database.
    """
    mitochondria = data.mitochondria
    if mitochondria is None:
        # Without mitochondrial data there is no phylogeny to validate for
        return True

    references = handle.references
    min_length = min(len(record.sequence)
                     for record in mitochondria.itervalues())

    for bam_contig, bam_length in zip(references, handle.lengths):
        if bam_contig not in mitochondria:
            continue

        db_sequence = mitochondria[bam_contig].sequence
        # "-" characters in the database sequence are not counted
        db_length = len(db_sequence) - db_sequence.count("-")

        if bam_length != db_length:
            message = ("ERROR: Length of mitochondrial contig %r (%i bp) "
                       "does not match the length of the corresponding "
                       "sequence in the database (%i bp)")
            print_err(message % (bam_contig, bam_length, db_length))
            return False

        # The index may be named either 'x.bam.bai' or 'x.bai'
        has_index = os.path.exists(handle.filename + '.bai') \
            or os.path.exists(swap_ext(handle.filename, '.bai'))
        if not has_index:
            print_info('    - Attempting to index BAM file %r!'
                       % (handle.filename,))
            pysam.index(handle.filename)

        # pysam < 0.9 returns a list, pysam >= 0.9 returns a str; joining
        # handles both cases
        idxstats = "".join(pysam.idxstats(handle.filename))
        for raw_line in idxstats.split('\n'):
            row = raw_line.strip()
            if not row:
                continue

            name, _, hits, _ = row.split('\t')
            if (name == bam_contig) and not int(hits):
                message = ("WARNING: Mitochondrial BAM (%r) does not contain "
                           "any reads aligned to contig %r; inferring an "
                           "phylogeny is not possible.")
                print_err(message % (handle.filename, name))
                return True

        info.mt_contig = bam_contig
        info.mt_length = bam_length
        info.mt_padding = len(db_sequence) - min_length

        return True

    return True
コード例 #33
0
ファイル: database.py プロジェクト: jelber2/paleomix
def _validate_mito_bam(data, handle, info):
    """Validate a BAM against the database's mitochondrial sequences.

    Returns True when there is no mitochondrial data, when no BAM contig
    matches a database sequence, or when the first matching contig passes
    validation. Returns False if the length of that contig differs from the
    database sequence length (ignoring "-" characters).

    On success, the contig name, its length, and its padding relative to the
    shortest database sequence are stored on 'info'. If idxstats reports no
    hits for the contig, a warning is printed and 'info' is left untouched.
    """
    database = data.mitochondria
    if database is None:
        # Nothing to validate; phylogeny is skipped
        return True

    references = handle.references
    shortest = min(len(entry.sequence) for entry in database.itervalues())

    for contig, contig_length in zip(references, handle.lengths):
        if contig not in database:
            continue

        sequence = database[contig].sequence
        expected_length = len(sequence) - sequence.count("-")

        if contig_length != expected_length:
            print_err("ERROR: Length of mitochondrial contig %r (%i bp) "
                      "does not match the length of the corresponding "
                      "sequence in the database (%i bp)"
                      % (contig, contig_length, expected_length))
            return False

        # Index the BAM unless either 'x.bam.bai' or 'x.bai' exists
        if not (os.path.exists(handle.filename + '.bai')
                or os.path.exists(swap_ext(handle.filename, '.bai'))):
            print_info('    - Attempting to index BAM file %r!'
                       % (handle.filename, ))
            pysam.index(handle.filename)

        # Joining copes with pysam < 0.9 (list) and pysam >= 0.9 (str)
        stats_text = "".join(pysam.idxstats(handle.filename))
        for stats_line in stats_text.split('\n'):
            stats_line = stats_line.strip()
            if stats_line:
                name, _, hits, _ = stats_line.split('\t')
                if (name == contig) and not int(hits):
                    print_err("WARNING: Mitochondrial BAM (%r) does not "
                              "contain "
                              "any reads aligned to contig %r; inferring an "
                              "phylogeny is not possible."
                              % (handle.filename, name))
                    return True

        info.mt_contig = contig
        info.mt_length = contig_length
        info.mt_padding = len(sequence) - shortest

        return True

    return True
コード例 #34
0
ファイル: lane.py プロジェクト: jelber2/paleomix
    def _build_bwa_backtrack_se(self, config, prefix, record, parameters):
        """Build the BWA backtrack nodes for single-end reads.

        The "input_file" and "output_file" entries are removed from
        'parameters'; the intermediate .sai file is named after the output
        BAM. An alignment node is created first, a BWASamse node is then
        customized to depend on it, and the result is passed through
        self._finalize_nodes.
        """
        fastq = parameters.pop("input_file")
        bam = parameters.pop("output_file")
        sai = swap_ext(bam, ".sai")

        aln_node = self._build_bwa_backtrack_aln(
            parameters=parameters,
            input_file=fastq,
            output_file=sai,
        )

        samse_node = BWASamse.customize(
            input_file_fq=fastq,
            input_file_sai=sai,
            output_file=bam,
            prefix=parameters["prefix"],
            reference=parameters["reference"],
            dependencies=aln_node,
        )

        return self._finalize_nodes(config, prefix, parameters, samse_node)
コード例 #35
0
ファイル: picard.py プロジェクト: MikkelSchubert/paleomix
    def __init__(self, config, reference, dependencies=()):
        """Create a node running Picard "CreateSequenceDictionary" on
        'reference'; the dictionary is written next to the reference, with
        the extension swapped for ".dict"."""
        self._in_reference = os.path.abspath(reference)

        cmd = picard_command(config, "CreateSequenceDictionary")
        cmd.set_option("R", "%(TEMP_OUT_REF)s", sep="=")
        cmd.set_option("O", "%(OUT_DICT)s", sep="=")

        # The "R" option refers to a temporary file named after the
        # basename of the reference
        cmd.set_kwargs(
            IN_REFERENCE=reference,
            TEMP_OUT_REF=os.path.basename(reference),
            OUT_DICT=swap_ext(reference, ".dict"),
        )

        label = "<SequenceDictionary: '%s'>" % (reference,)
        PicardNode.__init__(
            self,
            command=cmd.finalize(),
            description=label,
            dependencies=dependencies,
        )
コード例 #36
0
    def __init__(self, config, reference, dependencies=()):
        """Set up a Picard "CreateSequenceDictionary" run for 'reference'.

        The absolute path of the reference is stored on the node; the
        output dictionary is named after the reference, with a ".dict"
        extension.
        """
        self._in_reference = os.path.abspath(reference)

        picard = picard_command(config, "CreateSequenceDictionary")
        picard.set_option("R", "%(TEMP_OUT_REF)s", sep="=")
        picard.set_option("O", "%(OUT_DICT)s", sep="=")

        file_kwargs = {
            "IN_REFERENCE": reference,
            "TEMP_OUT_REF": os.path.basename(reference),
            "OUT_DICT": swap_ext(reference, ".dict"),
        }
        picard.set_kwargs(**file_kwargs)

        description = "<SequenceDictionary: '%s'>" % (reference, )
        command = picard.finalize()

        PicardNode.__init__(self,
                            command=command,
                            description=description,
                            dependencies=dependencies)
コード例 #37
0
    def _build_bwa_backtrack_se(self, config, prefix, record, parameters):
        """Build the BWA backtrack nodes for single-end reads.

        The "input_file" and "output_file" entries are removed from
        'parameters'; the intermediate .sai file is named after the output
        BAM. Returns a BWASamse node depending on the alignment node.
        """
        fastq = parameters.pop("input_file")
        bam = parameters.pop("output_file")
        sai = swap_ext(bam, ".sai")

        aln_node = self._build_bwa_backtrack_aln(
            parameters=parameters,
            input_file=fastq,
            output_file=sai,
        )

        bwa_prefix = parameters["prefix"]
        reference = parameters["reference"]
        mapping_options = self.options["Aligners"]["BWA"]
        cleanup_options = self._cleanup_options("BWA")

        return BWASamse(
            input_file_fq=fastq,
            input_file_sai=sai,
            output_file=bam,
            prefix=bwa_prefix,
            reference=reference,
            mapping_options=mapping_options,
            cleanup_options=cleanup_options,
            dependencies=aln_node,
        )
コード例 #38
0
ファイル: picard.py プロジェクト: MikkelSchubert/paleomix
    def __init__(self, config, input_bam, input_index=None, output_log=None,
                 ignored_checks=(), dependencies=()):
        """Create a node running Picard "ValidateSamFile" on 'input_bam'.

        Each entry in 'ignored_checks' is passed as an "IGNORE" option.
        STDOUT is captured in 'output_log', which defaults to the input BAM
        filename with a ".validated" extension.
        """
        cmd = picard_command(config, "ValidateSamFile")
        _set_max_open_files(cmd, "MAX_OPEN_TEMP_FILES")

        cmd.set_option("I", "%(IN_BAM)s", sep="=")
        for ignored in ignored_checks:
            cmd.add_option("IGNORE", ignored, sep="=")

        if not output_log:
            output_log = swap_ext(input_bam, ".validated")

        cmd.set_kwargs(
            IN_BAM=input_bam,
            IN_INDEX=input_index,
            OUT_STDOUT=output_log,
        )

        label = "<Validate BAM: '%s'>" % (input_bam,)
        PicardNode.__init__(
            self,
            command=cmd.finalize(),
            description=label,
            dependencies=dependencies,
        )
コード例 #39
0
ファイル: statistics.py プロジェクト: jelber2/paleomix
def _build_coverage_nodes_cached(files_and_nodes, target_name, roi_name,
                                 roi_filename, cache):
    """Return a dict mapping coverage filenames to CoverageNodes.

    One node is used per (roi_filename, input filename) pair; nodes already
    present in 'cache' are reused, and newly created nodes are added to it.
    Output files are named after the input file, with the extension
    ".coverage", or ".ROI_NAME.coverage" if 'roi_name' is set.
    """
    if roi_name:
        extension = ".%s.coverage" % roi_name
    else:
        extension = ".coverage"

    result = {}
    for (bam_filename, bam_node) in files_and_nodes.iteritems():
        coverage_filename = swap_ext(bam_filename, extension)
        key = (roi_filename, bam_filename)

        if key in cache:
            coverage_node = cache[key]
        else:
            coverage_node = CoverageNode(input_file=bam_filename,
                                         output_file=coverage_filename,
                                         target_name=target_name,
                                         regions_file=roi_filename,
                                         dependencies=bam_node)
            cache[key] = coverage_node

        result[coverage_filename] = coverage_node

    return result
コード例 #40
0
ファイル: statistics.py プロジェクト: MikkelSchubert/paleomix
def _build_coverage_nodes_cached(files_and_nodes, target_name,
                                 roi_name, roi_filename, cache):
    """Build (or fetch from 'cache') one CoverageNode per input file.

    The cache is keyed on (roi_filename, input filename). The returned dict
    maps each output filename (the input filename with the extension swapped
    for ".coverage" or ".ROI_NAME.coverage") to its node.
    """
    output_ext = (".%s.coverage" % roi_name) if roi_name else ".coverage"

    nodes_by_output = {}
    for (source, dependency) in files_and_nodes.iteritems():
        destination = swap_ext(source, output_ext)
        cache_key = (roi_filename, source)

        if cache_key not in cache:
            node_kwargs = {
                "input_file": source,
                "output_file": destination,
                "target_name": target_name,
                "regions_file": roi_filename,
                "dependencies": dependency,
            }
            cache[cache_key] = CoverageNode(**node_kwargs)

        nodes_by_output[destination] = cache[cache_key]

    return nodes_by_output
コード例 #41
0
ファイル: samtools.py プロジェクト: muslih14/paleomix
    def __init__(self, infile, dependencies=()):
        """Create a node that indexes the BAM 'infile' using samtools.

        Three commands are run in sequence: 'ln -s' links the BAM under its
        basename, 'samtools index' is run on that link, and 'mv' moves the
        resulting BASENAME.bai file to the input filename with its extension
        swapped for ".bai".
        """
        bam_name = os.path.basename(infile)
        bai_name = bam_name + ".bai"

        link_bam = AtomicCmd(
            ["ln", "-s", "%(IN_BAM)s", "%(TEMP_OUT_BAM)s"],
            IN_BAM=infile,
            TEMP_OUT_BAM=bam_name,
            set_cwd=True,
        )

        index_bam = AtomicCmd(
            ["samtools", "index", "%(TEMP_IN_BAM)s"],
            TEMP_IN_BAM=bam_name,
            CHECK_SAM=SAMTOOLS_VERSION,
        )

        move_index = AtomicCmd(
            ["mv", "%(TEMP_IN_BAM)s", "%(OUT_BAM)s"],
            TEMP_IN_BAM=bai_name,
            OUT_BAM=swap_ext(infile, ".bai"),
        )

        pipeline = SequentialCmds((link_bam, index_bam, move_index))
        label = "<BAMIndex: '%s'>" % (infile,)

        CommandNode.__init__(self,
                             description=label,
                             command=pipeline,
                             dependencies=dependencies)
コード例 #42
0
ファイル: picard.py プロジェクト: MikkelSchubert/paleomix
    def customize(cls, config, input_bams, output_bam, output_metrics=None,
                  keep_dupes=False, dependencies=()):
        """Prepare a Picard "MarkDuplicates" command for 'input_bams'.

        The metrics file defaults to 'output_bam' with a ".metrics"
        extension. Unless 'keep_dupes' is set, duplicates are removed from
        the output rather than merely flagged.

        Returns a dict with the command builder ("command") and the given
        dependencies ("dependencies").
        """
        builder = picard_command(config, "MarkDuplicates")
        _set_max_open_files(builder, "MAX_FILE_HANDLES")

        builder.set_option("OUTPUT", "%(OUT_BAM)s", sep="=")
        builder.set_option("METRICS_FILE", "%(OUT_METRICS)s", sep="=")
        # Strict validation is left to explicit ValidateSamFile runs; lenient
        # validation is needed here since records in .csi indexed BAMs may
        # have "invalid" bins.
        builder.set_option("VALIDATION_STRINGENCY", "LENIENT", sep="=")
        builder.add_multiple_options("I", input_bams, sep="=")

        if not keep_dupes:
            # Dropping duplicates from the output is the default, since it
            # saves disk-space
            builder.set_option("REMOVE_DUPLICATES", "True",
                               sep="=", fixed=False)

        if not output_metrics:
            output_metrics = swap_ext(output_bam, ".metrics")

        builder.set_kwargs(OUT_BAM=output_bam, OUT_METRICS=output_metrics)

        return {
            "command": builder,
            "dependencies": dependencies,
        }
コード例 #43
0
ファイル: genotype.py プロジェクト: jelber2/paleomix
def build_sampling_nodes(options, genotyping, sample, regions, dependencies):
    """Build the nodes that produce a FASTA file for 'sample' from pileups.

    The chain of nodes is: pileup generation (GenotypeRegionsNode with
    pileup_only=True), tabix indexing of the pileup, SampleRegionsNode
    writing the FASTA file, and finally indexing of that FASTA file.

    Returns a 1-tuple containing the final (FASTA index) node.
    """
    fasta_file = regions["Genotypes"][sample]
    pileup_file = swap_ext(fasta_file, ".pileup.bgz")

    slop, regions_nodes = build_regions_nodes(regions, genotyping["Padding"],
                                              dependencies)

    # The BAM is expected at SAMPLES_ROOT/SAMPLE.PREFIX[.realigned].bam
    bam_name = "%s.%s.bam" % (sample, regions["Prefix"])
    bam_file = os.path.join(options.samples_root, bam_name)
    if regions["Realigned"]:
        bam_file = add_postfix(bam_file, ".realigned")
    bai_node = build_bam_index_node(bam_file)

    pileup_builder = GenotypeRegionsNode.customize(
        pileup_only=True,
        reference=regions["FASTA"],
        bedfile=slop,
        infile=bam_file,
        outfile=pileup_file,
        nbatches=options.samtools_max_threads,
        dependencies=regions_nodes + (bai_node,),
    )
    apply_samtools_options(pileup_builder.command, genotyping["MPileup"],
                           "--mpileup-argument")
    pileup_node = pileup_builder.build_node()

    tabix_node = TabixIndexNode(
        infile=pileup_file,
        preset="pileup",
        dependencies=pileup_node,
    )

    sampling_node = SampleRegionsNode(
        infile=pileup_file,
        bedfile=regions["BED"],
        outfile=fasta_file,
        dependencies=tabix_node,
    )

    faidx_node = FastaIndexNode(infile=fasta_file,
                                dependencies=sampling_node)

    return (faidx_node,)
コード例 #44
0
ファイル: commands.py プロジェクト: jelber2/paleomix
    def __init__(self,
                 config,
                 target_name,
                 input_files,
                 output_file,
                 prefix,
                 regions_file=None,
                 dependencies=()):
        """Create a node running the "depths" command on 'input_files'.

        The BAM data is read via MultiBAMInputNode.PIPE_FILE and the result
        is written to 'output_file'. If 'regions_file' is given, it is
        passed via "--regions-file", and an index for the pipe file, in the
        format given by prefix["IndexFormat"], is registered as an input.
        """
        input_files = safe_coerce_to_tuple(input_files)
        # Falsy when no regions file is given; otherwise the prefix's format
        index_format = regions_file and prefix["IndexFormat"]

        cmd = factory.new("depths")
        cmd.add_value("%(TEMP_IN_BAM)s")
        cmd.add_value("%(OUT_FILE)s")
        cmd.set_option("--target-name", target_name)
        cmd.set_kwargs(OUT_FILE=output_file,
                       TEMP_IN_BAM=MultiBAMInputNode.PIPE_FILE)
        cmd.add_multiple_kwargs(input_files)

        if regions_file:
            pipe_index = swap_ext(MultiBAMInputNode.PIPE_FILE, index_format)

            cmd.set_option("--regions-file", "%(IN_REGIONS)s")
            cmd.set_kwargs(IN_REGIONS=regions_file,
                           TEMP_IN_INDEX=pipe_index)

        label = "<DepthHistogram: %s -> '%s'>" % (
            describe_files(input_files),
            output_file,
        )

        MultiBAMInputNode.__init__(
            self,
            config=config,
            input_bams=input_files,
            index_format=index_format,
            command=cmd.finalize(),
            description=label,
            dependencies=dependencies,
        )
コード例 #45
0
def test_swap_ext__empty_ext_vs_new_ext():
    """A name without an extension gains the new extension, and a dot is
    inserted when the new extension lacks one."""
    observed = swap_ext("name", "bar")
    assert_equal(observed, "name.bar")
コード例 #46
0
ファイル: genotype.py プロジェクト: jelber2/paleomix
def build_genotyping_nodes_cached(options, genotyping, sample, regions,
                                  dependencies):
    """Build nodes for genotyping, filtering of calls, and indexing of files
    for a given sample and prefix. If the option 'GenotypeEntirePrefix' is
    enabled, the BAM is genotyped once, and each set of RegionsOfInterest
    simply extracts the relevant regions during construction of the
    consensus sequence.

    Parameters:
        options: An options object (c.f. paleomix.tools.phylo_pipeline.config).
        genotyping: Genotyping options defined for a specific set of areas of
                    interest, corresponding to Genotyping:NAME in the makefile.
        sample: The name of the sample to be genotyped.
        regions: A dictionary for a 'RegionsOfInterest' from the makefile.
        dependencies: Dependencies that must be met before genotyping starts.

    Returns a tuple containing the filename of the filtered and tabix-indexed
    VCF file, and the top-level node generating this file. Results are cached
    on the (BAM filename, output prefix) pair, so multiple calls for the same
    BAM and prefix return the same VCF and nodes if the option for
    'GenotypeEntirePrefix' is enabled; otherwise each ROI is genotyped
    individually.

    Output files are generated in ./results/PROJECT/genotyping. If the option
    for 'GenotypeEntirePrefix' is enabled, the following files are generated:
        SAMPLE.PREFIX.vcf.bgz: Unfiltered calls for variant/non-variant sites.
        SAMPLE.PREFIX.vcf.pileup.bgz: Pileup of sites containing SNPs.
        SAMPLE.PREFIX.vcf.pileup.bgz.tbi: Tabix index of the pileup.
        SAMPLE.PREFIX.filtered.vcf.bgz: Variant calls filtered with vcf_filter.
        SAMPLE.PREFIX.filtered.vcf.bgz.tbi: Tabix index for the filtered VCF.

    If 'GenotypeEntirePrefix' is not enabled for a given ROI, the following
    files are generated for that ROI (see descriptions above):
        SAMPLE.PREFIX.ROI.filtered.vcf.bgz
        SAMPLE.PREFIX.ROI.filtered.vcf.bgz.tbi
        SAMPLE.PREFIX.ROI.vcf.bgz
        SAMPLE.PREFIX.ROI.vcf.pileup.bgz
        SAMPLE.PREFIX.ROI.vcf.pileup.bgz.tbi

    In addition, the following files are generated for each set of
    RegionsOfInterest (ROI), regardless of the 'GenotypeEntirePrefix' option:
        SAMPLE.PREFIX.ROI.CDS.fasta: FASTA sequence of each feature in the ROI.
        SAMPLE.PREFIX.ROI.CDS.fasta.fai: FASTA index generated using SAMTools.

    """
    output_prefix, bamfile, bedfile, dependencies = \
        build_genotyping_bedfile_nodes(options, genotyping, sample, regions,
                                       dependencies)

    cache_key = (bamfile, output_prefix)
    if cache_key in _VCF_CACHE:
        return _VCF_CACHE[cache_key]

    raw_vcf = swap_ext(output_prefix, ".vcf.bgz")
    pileup_file = swap_ext(output_prefix, ".vcf.pileup.bgz")
    filtered_vcf = swap_ext(output_prefix, ".filtered.vcf.bgz")

    # 1. Call samtools mpileup | bcftools view on the bam
    genotype_builder = GenotypeRegionsNode.customize(
        reference=regions["FASTA"],
        bedfile=bedfile,
        infile=bamfile,
        outfile=raw_vcf,
        nbatches=options.samtools_max_threads,
        dependencies=dependencies,
    )

    apply_samtools_options(genotype_builder.command, genotyping["MPileup"],
                           "--mpileup-argument")
    apply_samtools_options(genotype_builder.command, genotyping["BCFTools"],
                           "--bcftools-argument")
    genotype_node = genotype_builder.build_node()

    # 2. Collect pileups of sites with SNPs, to allow proper filtering by
    #    frequency of the minor allele, as only the major non-ref allele is
    #    counted in the VCF (c.f. field DP4).
    pileup_builder = VCFPileupNode.customize(
        reference=regions["FASTA"],
        infile_bam=bamfile,
        infile_vcf=raw_vcf,
        outfile=pileup_file,
        dependencies=genotype_node,
    )
    apply_samtools_options(pileup_builder.command, genotyping["MPileup"],
                           "--mpileup-argument")
    pileup_node = pileup_builder.build_node()

    pileup_tabix_node = TabixIndexNode(
        infile=pileup_file,
        preset="pileup",
        dependencies=pileup_node,
    )

    # 3. Filter all sites using the 'vcf_filter' command
    filter_builder = VCFFilterNode.customize(
        infile=raw_vcf,
        pileup=pileup_file,
        outfile=filtered_vcf,
        regions=regions,
        dependencies=pileup_tabix_node,
    )
    filter_node = _apply_vcf_filter_options(filter_builder, genotyping,
                                            sample)

    # 4. Tabix index. This allows random-access to the VCF file when building
    #    the consensus FASTA sequence later in the pipeline.
    vcf_tabix_node = TabixIndexNode(
        infile=filtered_vcf,
        preset="vcf",
        dependencies=filter_node,
    )

    result = (filtered_vcf, vcf_tabix_node)
    _VCF_CACHE[cache_key] = result
    return result
コード例 #47
0
ファイル: pipeline.py プロジェクト: tmancill/paleomix
def finalize_run_config(parser, args):
    """Validate and finalize the parsed arguments for a (dry)run.

    Sets 'args.multisample', 'args.destination' and 'args.samples' based on
    the positional 'args.files':
      - one file: the destination defaults to the filename with its
        extension swapped for ".zonkey", then handled as two files;
      - two files: an input file and a destination; a BAM input yields a
        single sample named "-", anything else is read as a sample table
        and 'args.multisample' is enabled;
      - three files: two input files and a destination.

    Returns 'args' on success, or None if the usage was printed or an error
    was logged (wrong number of files, unknown --treemix-outgroup samples,
    invalid filename or destination, or failure to read or process the
    samples).

    Raises RuntimeError if the number of files is outside of 1-3 for a
    command other than "run" or "dryrun".
    """
    log = logging.getLogger(__name__)
    if args.command in ("run", "dryrun") and not (1 <= len(args.files) <= 3):
        parser.print_usage()
        return None

    args.multisample = False

    known_samples = set(args.database.samples) | set(("Sample", ))
    unknown_samples = set(args.treemix_outgroup) - known_samples
    if unknown_samples:
        # NOTE: The comma after the format string is required; without it
        # the adjacent literals are concatenated and '.join' is called on
        # the message itself, garbling the logged error.
        log.error(
            "Argument --treemix-outgroup includes unknown sample(s): %s; known "
            "samples are %s. Note that names are case-sensitive.",
            ", ".join(map(repr, sorted(unknown_samples))),
            ", ".join(map(repr, sorted(known_samples))),
        )
        return None

    if len(args.files) == 1:
        # Default destination is derived from the name of the input file
        args.files.append(fileutils.swap_ext(args.files[0], ".zonkey"))

    if len(args.files) == 2:
        filename, args.destination = args.files

        if os.path.exists(
                args.destination) and not os.path.isdir(args.destination):
            log.error("Destination %r is not a directory", args.destination)
            return None
        elif not os.path.isfile(filename):
            log.error("Not a valid filename: %r", filename)
            return None
        elif _is_bamfile(filename):
            args.samples = {
                "-": {
                    "Root": args.destination,
                    "Files": [filename]
                }
            }
        else:
            # Anything that is not a BAM file is read as a sample table
            args.multisample = True
            if not _read_sample_table(args, filename):
                return None
    elif len(args.files) == 3:
        filename_1, filename_2, args.destination = args.files

        args.samples = {
            "-": {
                "Root": args.destination,
                "Files": [filename_1, filename_2]
            }
        }
    else:
        raise RuntimeError("Unexpected number of arguments: %r" %
                           (args.files, ))

    # Identify (mito or nuc?) and validate BAM files provided by user
    if not _process_samples(args):
        return None

    return args
コード例 #48
0
def test_swap_ext__dot_ext_vs_new_dot_ext():
    """A name without an extension gains the new, dot-prefixed extension
    without the dot being doubled."""
    observed = swap_ext("name", ".bar")
    assert_equal(observed, "name.bar")
コード例 #49
0
                  "names are case-sensitive." %
                  (", ".join(map(repr, sorted(unknown_samples))), ", ".join(
                      map(repr, sorted(known_samples)))))
        return

    if config.command in ("mito", "example"):
        if len(args) != 2:
            print_err("ERROR: Wrong number of arguments!")
            print_usage()
            return

        config.destination = args[1]
        config.samples = {}
    elif len(args) == 2:
        filename = args[1]
        config.destination = fileutils.swap_ext(filename, ".zonkey")

        if not os.path.isfile(filename):
            print_err("ERROR: Not a valid filename: %r" % (filename, ))
            return
        elif _is_bamfile(filename):
            # Called as either of
            #   zonkey run <SampleDB> <nuclear.bam>
            #   zonkey run <SampleDB> <mitochondrial.bam>
            config.samples = {
                "-": {
                    "Root": config.destination,
                    "Files": [filename]
                }
            }
        else:
コード例 #50
0
ファイル: common.py プロジェクト: MikkelSchubert/paleomix
def parse_arguments(argv, ext):
    """Parse the command-line of a BAM statistics sub-command.

    Arguments:
      argv -- List of command-line arguments to be parsed.
      ext  -- Extension of the output table; it is appended to the input
              filename when no output filename is given, and (with dots
              stripped) used as the sub-command name in the usage text.
              Presumably includes the leading dot -- confirm with callers.

    Returns the argparse namespace, with the following values filled in:
      outfile            -- defaults to the input filename with 'ext'.
      target_name        -- defaults to "<STDIN>" or the input basename.
      get_readgroup_func -- function selected by --ignore-readgroups.

    Terminates via 'parser.error' (SystemExit) if the output file already
    exists and --overwrite-output was not specified.
    """
    prog = "paleomix %s" % (ext.strip("."),)
    usage = "%s [options] sorted.bam [out%s]" % (prog, ext)
    parser = argparse.ArgumentParser(prog=prog, usage=usage)

    parser.add_argument("infile", metavar="BAM",
                        help="Filename of a sorted BAM file. If set to '-' "
                             "the file is read from STDIN.")
    parser.add_argument("outfile", metavar="OUTPUT", nargs='?',
                        help="Filename of output table; defaults to name of "
                             "the input BAM with a '%s' extension. If "
                             "set to '-' the table is printed to STDOUT."
                             % (ext,))
    parser.add_argument("--target-name", default=None, metavar="NAME",
                        help="Name used for 'Target' column; defaults to the "
                             "filename of the BAM file.")
    parser.add_argument("--regions-file", default=None, dest="regions_fpath",
                        help="BED file containing regions of interest; %s "
                             "is calculated only for these grouping by the "
                             "name used in the BED file, or the contig name "
                             "if no name has been specified for a record."
                             % (ext.strip("."),))
    parser.add_argument('--max-contigs', default=100, type=int,
                        help="The maximum number of contigs allowed in a BAM "
                             "file. If this number is exceeded, the entire "
                             "set of contigs is aggregated into one pseudo-"
                             "contig named '<Genome>'. This is done to "
                             "limit table sizes [default: %(default)s]")
    parser.add_argument('--ignore-readgroups',
                        default=False, action="store_true",
                        help="Ignore readgroup information in reads, and only "
                             "provide aggregated statistics; this is required "
                             "if readgroup information is missing or partial "
                             "[default: %(default)s]")
    parser.add_argument('--overwrite-output',
                        default=False, action="store_true",
                        help="Overwrite output file if it exists; by "
                             "default, the script will terminate if the file "
                             "already exists.")

    args = parser.parse_args(argv)
    if not args.outfile:
        args.outfile = swap_ext(args.infile, ext)

    if args.ignore_readgroups:
        args.get_readgroup_func = _get_readgroup_ignored
    else:
        args.get_readgroup_func = _get_readgroup

    if not args.target_name:
        if args.infile == "-":
            args.target_name = "<STDIN>"
        else:
            args.target_name = os.path.basename(args.infile)

    # An outfile of '-' means STDOUT, so nothing on disk can be overwritten;
    # a file that merely happens to be named '-' must not abort the run.
    writes_to_file = args.outfile != "-"
    if writes_to_file and not args.overwrite_output \
            and os.path.exists(args.outfile):
        parser.error("Destination filename already exists (%r); use option "
                     "--overwrite-output to allow overwriting of this file."
                     % (args.outfile,))

    return args
コード例 #51
0
ファイル: common.py プロジェクト: tmancill/paleomix
def parse_arguments(argv, ext):
    """Parse the command-line of a BAM statistics sub-command.

    Arguments:
      argv -- List of command-line arguments to be parsed.
      ext  -- Extension of the output table; it is appended to the input
              filename when no output filename is given, and (with dots
              stripped) used as the sub-command name in the usage text.
              Presumably includes the leading dot -- confirm with callers.

    Returns the argparse namespace, with the following values filled in:
      outfile            -- defaults to the input filename with 'ext'.
      target_name        -- defaults to "<STDIN>" or the input basename.
      get_readgroup_func -- function selected by --ignore-readgroups.

    Terminates via 'parser.error' (SystemExit) if the output file already
    exists and --overwrite-output was not specified.
    """
    prog = "paleomix %s" % (ext.strip("."), )
    usage = "%s [options] sorted.bam [out%s]" % (prog, ext)
    parser = argparse.ArgumentParser(prog=prog, usage=usage)

    parser.add_argument(
        "infile",
        metavar="BAM",
        help="Filename of a sorted BAM file. If set to '-' "
        "the file is read from STDIN.",
    )
    parser.add_argument(
        "outfile",
        metavar="OUTPUT",
        nargs="?",
        help="Filename of output table; defaults to name of "
        "the input BAM with a '%s' extension. If "
        "set to '-' the table is printed to STDOUT." % (ext, ),
    )
    parser.add_argument(
        "--target-name",
        default=None,
        metavar="NAME",
        help="Name used for 'Target' column; defaults to the "
        "filename of the BAM file.",
    )
    parser.add_argument(
        "--regions-file",
        default=None,
        dest="regions_fpath",
        help="BED file containing regions of interest; %s "
        "is calculated only for these grouping by the "
        "name used in the BED file, or the contig name "
        "if no name has been specified for a record." % (ext.strip("."), ),
    )
    parser.add_argument(
        "--max-contigs",
        default=100,
        type=int,
        help="The maximum number of contigs allowed in a BAM "
        "file. If this number is exceeded, the entire "
        "set of contigs is aggregated into one pseudo-"
        "contig named '<Genome>'. This is done to "
        "limit table sizes [default: %(default)s]",
    )
    parser.add_argument(
        "--ignore-readgroups",
        default=False,
        action="store_true",
        help="Ignore readgroup information in reads, and only "
        "provide aggregated statistics; this is required "
        "if readgroup information is missing or partial "
        "[default: %(default)s]",
    )
    parser.add_argument(
        "--overwrite-output",
        default=False,
        action="store_true",
        help="Overwrite output file if it exists; by "
        "default, the script will terminate if the file "
        "already exists.",
    )

    args = parser.parse_args(argv)
    if not args.outfile:
        args.outfile = swap_ext(args.infile, ext)

    if args.ignore_readgroups:
        args.get_readgroup_func = _get_readgroup_ignored
    else:
        args.get_readgroup_func = _get_readgroup

    if not args.target_name:
        if args.infile == "-":
            args.target_name = "<STDIN>"
        else:
            args.target_name = os.path.basename(args.infile)

    # An outfile of '-' means STDOUT, so nothing on disk can be overwritten;
    # a file that merely happens to be named '-' must not abort the run.
    writes_to_file = args.outfile != "-"
    if (
        writes_to_file
        and not args.overwrite_output
        and os.path.exists(args.outfile)
    ):
        parser.error("Destination filename already exists (%r); use option "
                     "--overwrite-output to allow overwriting of this file." %
                     (args.outfile, ))

    return args
コード例 #52
0
def test_swap_ext__has_ext_vs_new_ext():
    # An existing extension is replaced, even if the new one lacks a dot
    expected = "name.bar"
    observed = swap_ext("name.foo", "bar")
    assert_equal(observed, expected)
コード例 #53
0
ファイル: config.py プロジェクト: MikkelSchubert/paleomix
                  "names are case-sensitive."
                  % (", ".join(map(repr, sorted(unknown_samples))),
                     ", ".join(map(repr, sorted(known_samples)))))
        return

    if config.command in ("mito", "example"):
        if len(args) != 2:
            print_err("ERROR: Wrong number of arguments!")
            print_usage()
            return

        config.destination = args[1]
        config.samples = {}
    elif len(args) == 2:
        filename = args[1]
        config.destination = fileutils.swap_ext(filename, ".zonkey")

        if not os.path.isfile(filename):
            print_err("ERROR: Not a valid filename: %r" % (filename,))
            return
        elif _is_bamfile(filename):
            # Called as either of
            #   zonkey run <SampleDB> <nuclear.bam>
            #   zonkey run <SampleDB> <mitochondrial.bam>
            config.samples = {"-": {"Root": config.destination,
                                    "Files": [filename]}}
        else:
            config.multisample = True
            if not _read_sample_table(config, filename):
                return
    elif 3 <= len(args) <= 4: