Example 1
 def cache_remote_file(url, md5_digest, casdir):
     """Fetch *url* into the CAS directory and return its local path.

     The ``cas`` tool downloads the object, verifies it against the
     given md5 digest, and prints the cached file's path on stdout.
     """
     cas_cmd = ["cas", "-put", url, "-get", "md5:" + md5_digest, casdir]
     result = bunnies.run_cmd(cas_cmd)
     return result.stdout.decode('utf-8').strip()
Example 2
    def run(self, resources=None, **params):
        """Genotype one BAM sample; this runs inside the batch image.

        Downloads the reference (via a shared CAS cache) and the sample
        BAM/BAI, runs the ``vc`` variant-calling wrapper, then verifies
        that the expected outputs exist under the S3 output prefix.

        :param resources: mapping with ``vcpus`` and ``memory`` (MiB),
            used to size worker threads and the Java heap.
        :param params: must contain ``workdir``, a local scratch directory.
        :returns: dict of logical output names -> blob metadata dicts
            (``size``/``url``/``etag``), or None for missing optional files.
        :raises Exception: if a required output file is missing.
        """
        # imports are local so the method is self-contained in the image
        import os
        import os.path
        import sys

        def cache_remote_file(url, md5_digest, casdir):
            # Download `url` into the content-addressed store, verifying
            # the md5; `cas` prints the local cached path on stdout.
            return bunnies.run_cmd(
                ["cas", "-put", url, "-get", "md5:" + md5_digest,
                 casdir]).stdout.decode('utf-8').strip()

        workdir = params['workdir']

        s3_output_prefix = self.output_prefix()

        local_input_dir = os.path.join(workdir, "input")
        local_output_dir = os.path.join(workdir, "output")

        os.makedirs(local_output_dir, exist_ok=True)
        os.makedirs(local_input_dir, exist_ok=True)

        # CAS lives on node-local scratch so it outlives this job and can
        # be reused by other jobs scheduled on the same instance.
        cas_dir = "/localscratch/cas"
        os.makedirs(cas_dir, exist_ok=True)

        #
        # download reference in scratch space shared with other jobs
        # in the same compute environment
        #
        ref_target = self.ref.ls()
        ref_idx_target = self.ref_idx.ls()
        ref_path = cache_remote_file(ref_target['url'],
                                     ref_target['digests']['md5'], cas_dir)
        # index is cached for side effect only; its path is not needed
        _ = cache_remote_file(ref_idx_target['url'],
                              ref_idx_target['digests']['md5'], cas_dir)
        bam_target = self.sample_bam.ls()

        log.info("genotyping BAM sample %s: bam=%s (size=%5.3fGiB)...",
                 self.params, bam_target['bam']['url'],
                 bam_target['bam']['size'] / (1024 * 1024 * 1024))

        # download the bai too
        bam_path = os.path.join(local_input_dir,
                                os.path.basename(bam_target['bam']['url']))
        bai_path = os.path.join(local_input_dir,
                                os.path.basename(bam_target['bai']['url']))
        bunnies.transfers.s3_download_file(bam_target['bam']['url'], bam_path)
        bunnies.transfers.s3_download_file(bam_target['bai']['url'], bai_path)

        num_threads = resources['vcpus']
        memory_mb = resources['memory']

        # reserve ~200 MiB for non-JVM overhead, split the rest per worker
        mb_per_worker = (memory_mb - 200) // num_threads
        java_heap = "-Xmx%dm" % (mb_per_worker, )
        vc_args = [
            "vc", "-o", s3_output_prefix, "-i", bam_path, "-w", workdir, "-r",
            ref_path, "-n",
            str(num_threads), "-minbp", "0", "-nsegments",
            str(num_threads * 5), "-bgzip", "-gatk4", "-javaoptions", java_heap
        ]
        if self.params['hc_options']:
            vc_args += ["-vcoptions", " ".join(self.params['hc_options'])]

        bunnies.run_cmd(vc_args,
                        stdout=sys.stdout,
                        stderr=sys.stderr,
                        cwd=workdir)
        # log the produced files for post-mortem debugging
        bunnies.run_cmd(["ls", "-lh", local_output_dir],
                        stdout=sys.stdout,
                        stderr=sys.stderr,
                        cwd=workdir)

        def _check_output_file(fname, is_optional=False):
            # Probe the expected S3 output blob; return its metadata,
            # None when optional and absent, raise when required and absent.
            try:
                output_url = os.path.join(s3_output_prefix, fname)
                meta = bunnies.get_blob_meta(output_url)
                return {
                    "size": meta['ContentLength'],
                    "url": output_url,
                    "etag": meta['ETag']
                }
            except FileNotFoundError:
                if is_optional:
                    return None
                raise Exception("missing file: " + output_url)

        pfx = self.sample_name
        output = {
            "gvcf": _check_output_file(pfx + ".g.vcf.gz", True),
            "gvcf_idx": _check_output_file(pfx + ".g.vcf.gz.tbi", True),
            "input_bed": _check_output_file(pfx + ".input.bed", False),
            "output_bed": _check_output_file(pfx + ".scatter.bed", True),
            "scatter_log": _check_output_file(pfx + ".scatter.log", True)
        }
        return output
Example 3
    def run(self, resources=None, **params):
        """Align one sample's reads; this runs inside the batch image.

        Caches the reference via CAS, writes a one-sample job file, runs
        the ``align`` wrapper (readgroup autodetection, duplicate
        marking), then verifies the expected outputs exist on S3.

        :param resources: mapping with ``vcpus``, used to size threads.
        :param params: must contain ``workdir``, a local scratch directory.
        :returns: dict of logical output names (bam, bai, bamstats, ...)
            -> {"size", "url"} dicts, or None for missing optional files.
        :raises Exception: if a required output file is missing.
        """
        # imports are local so the method is self-contained in the image
        import os
        import sys
        import tempfile
        import json

        def cache_remote_file(url, md5_digest, casdir):
            # Download `url` into the content-addressed store, verifying
            # the md5; `cas` prints the local cached path on stdout.
            return bunnies.run_cmd(
                ["cas", "-put", url, "-get", "md5:" + md5_digest,
                 casdir]).stdout.decode('utf-8').strip()

        workdir = params['workdir']
        s3_output_prefix = self.output_prefix()
        local_output_dir = os.path.join(workdir, "output")

        cas_dir = "/localscratch/cas"
        os.makedirs(cas_dir, exist_ok=True)
        os.makedirs(local_output_dir, exist_ok=True)

        #
        # download reference in /scratch
        # /scratch is shared with other jobs in the same compute environment
        #
        ref_target = self.ref.ls()
        ref_idx_target = self.ref_idx.ls()
        ref_path = cache_remote_file(ref_target['url'],
                                     ref_target['digests']['md5'], cas_dir)
        # index is cached for side effect only; its path is not needed
        _ = cache_remote_file(ref_idx_target['url'],
                              ref_idx_target['digests']['md5'], cas_dir)

        align_args = ["align", "-cas", cas_dir]
        if self.params['lossy']:
            align_args.append("-lossy")

        # R2 is optional: single-end samples have no mate file
        r1_target = self.r1.ls()
        r2_target = self.r2.ls() if self.r2 else None

        # write jobfile
        jobfile_doc = {
            self.params['sample_name']: {
                "name":
                self.params['sample_name'],
                "locations":
                [[r1_target['url'], "md5:" + r1_target['digests']['md5']],
                 [r2_target['url'], "md5:" +
                  r2_target['digests']['md5']] if r2_target else ["", ""]]
            }
        }
        log.info("align job: %s", repr(jobfile_doc))
        # delete=False: the file must survive the `with` so that `align`
        # can read it by name afterwards
        with tempfile.NamedTemporaryFile(suffix=".job.txt",
                                         mode="wt",
                                         prefix=self.params['sample_name'],
                                         dir=workdir,
                                         delete=False) as jobfile_fd:
            json.dump(jobfile_doc, jobfile_fd)

        num_threads = resources['vcpus']
        align_args += [
            "-r",
            ref_path,
            "-i",
            jobfile_fd.name,
            "-o",
            s3_output_prefix,
            "-w",
            workdir,
            "-m",  # autodetect readgroup info
            "-d",
            "1",  # mark duplicates
            "-n",
            str(num_threads),
            "-stats"
        ]

        bunnies.run_cmd(align_args,
                        stdout=sys.stdout,
                        stderr=sys.stderr,
                        cwd=workdir)

        def _check_output_file(field, url, is_optional=False):
            # Probe the expected S3 output blob; return its metadata,
            # None when optional and absent, raise when required and absent.
            try:
                meta = bunnies.utils.get_blob_meta(url)
                return {"size": meta['ContentLength'], "url": url}

            except bunnies.exc.NoSuchFile:
                if is_optional:
                    return None
                raise Exception("output %s missing: %s" % (field, url))

        sn = self.params['sample_name']

        def od(x):
            # join a file name onto the S3 output prefix
            return os.path.join(s3_output_prefix, x)

        output = {
            "bam":
            _check_output_file("bam", "%s.bam" % od(sn)),
            "bamstats":
            _check_output_file("bamstats", "%s.bamstats.txt" % od(sn)),
            "bai":
            _check_output_file("bai", "%s.bai" % od(sn)),
            "illuminametrics":
            _check_output_file("illuminametrics",
                               "%s.illuminametrics.txt" % od(sn)),
            "dupmetrics":
            _check_output_file("dupmetrics", "%s.dupmetrics.txt" % od(sn)),
            "bam_md5":
            _check_output_file("bam.md5", "%s.bam.md5" % od(sn))
        }

        return output
Example 4
    def run(self, **params):
        """Merge per-lane BAMs into one sample BAM; runs inside the image.

        Downloads every input BAM/BAI pair, runs ``lane_merger.sh``,
        writes a merge manifest listing the source BAMs, then uploads
        the merged outputs to the S3 output prefix.

        :param params: must contain ``workdir`` (local scratch directory)
            and ``scriptdir`` (root holding ``scripts/lane_merger.sh``).
        :returns: dict of logical output names (bam, bai, bam_md5, ...)
            -> {"size", "url"} dicts, or None for missing optional files.
        :raises Exception: if a required output file is missing.
        """
        # imports are local so the method is self-contained when it runs
        # inside the batch image (consistent with the other run() methods)
        import os
        import os.path
        import sys

        workdir = params['workdir']

        s3_output_prefix = self.output_prefix()

        local_output_dir = os.path.join(workdir, "output")
        local_input_dir = os.path.join(workdir, "input")

        # download input samples
        os.makedirs(local_output_dir, exist_ok=True)
        os.makedirs(local_input_dir, exist_ok=True)

        all_srcs = []
        all_dests = []
        for inputi, inputval in self.inputs.items():
            aligned_target = inputval.ls()
            bam_src, bam_dest = aligned_target['bam']['url'], os.path.join(
                local_input_dir, "input_%s.bam" % (inputi, ))
            bai_src, bai_dest = aligned_target['bai']['url'], os.path.join(
                local_input_dir, "input_%s.bai" % (inputi, ))
            bunnies.transfers.s3_download_file(bai_src, bai_dest)
            bunnies.transfers.s3_download_file(bam_src, bam_dest)
            all_srcs.append({"bam": bam_src, "bai": bai_src})
            all_dests += [bam_dest, bai_dest]

        merge_args = [
            os.path.join(params["scriptdir"], "scripts", "lane_merger.sh"),
            "--samtools",
            "/usr/bin/samtools",
            "--sambamba",
            "/usr/local/bin/sambamba_v0.6.6",
            "--samplename",
            self.sample_name,
            "--tmpdir",
            workdir,
            "--delete-old",
            os.path.join(local_output_dir, self.sample_name) +
            ".bam",  # output.bam
        ] + all_dests

        bunnies.run_cmd(merge_args,
                        stdout=sys.stdout,
                        stderr=sys.stderr,
                        cwd=workdir)

        # record which source BAMs went into the merged output
        with open(
                os.path.join(local_output_dir,
                             self.sample_name + ".bam.merged.txt"),
                "w") as merge_manifest:
            for src in all_srcs:
                merge_manifest.write("\t".join([
                    self.sample_name, src['bam'],
                    os.path.join(s3_output_prefix, self.sample_name + ".bam")
                ]) + "\n")

        # log the produced files for post-mortem debugging
        bunnies.run_cmd(["find", local_output_dir],
                        stdout=sys.stdout,
                        stderr=sys.stderr,
                        cwd=workdir)
        pfx = self.sample_name

        def _check_output_file(fname, is_optional=False):
            # Upload the local output file to S3; return its metadata,
            # None when optional and absent, raise when required and absent.
            try:
                inpath = os.path.join(local_output_dir, fname)
                output_url = os.path.join(s3_output_prefix, fname)
                st_size = os.stat(inpath).st_size
                bunnies.transfers.s3_upload_file(inpath, output_url)
                return {"size": st_size, "url": output_url}
            except FileNotFoundError:
                if is_optional:
                    return None
                raise Exception("missing file: " + inpath)

        output = {
            "bam": _check_output_file(pfx + ".bam", False),
            "bai": _check_output_file(pfx + ".bam.bai", False),
            "bam_md5": _check_output_file(pfx + ".bam.md5", False),
            "dupmetrics": _check_output_file(pfx + ".dupmetrics.txt", True),
            "bamstats": _check_output_file(pfx + ".bamstats.txt", False),
            "merge_manifest": _check_output_file(pfx + ".bam.merged.txt",
                                                 False)
        }
        return output