def run(self, resources=None, **params):
    """ this runs in the image """
    import os
    import os.path
    import sys

    def cache_remote_file(url, md5_digest, casdir):
        # fetch url into the shared content-addressed store, verifying its md5,
        # and return the local path of the cached copy
        return bunnies.run_cmd(
            ["cas", "-put", url, "-get", "md5:" + md5_digest, casdir]).stdout.decode('utf-8').strip()

    workdir = params['workdir']
    s3_output_prefix = self.output_prefix()
    local_input_dir = os.path.join(workdir, "input")
    local_output_dir = os.path.join(workdir, "output")
    os.makedirs(local_output_dir, exist_ok=True)
    os.makedirs(local_input_dir, exist_ok=True)

    cas_dir = "/localscratch/cas"
    os.makedirs(cas_dir, exist_ok=True)

    #
    # download reference in scratch space shared with other jobs
    # in the same compute environment
    #
    ref_target = self.ref.ls()
    ref_idx_target = self.ref_idx.ls()

    ref_path = cache_remote_file(ref_target['url'], ref_target['digests']['md5'], cas_dir)
    _ = cache_remote_file(ref_idx_target['url'], ref_idx_target['digests']['md5'], cas_dir)

    bam_target = self.sample_bam.ls()
    log.info("genotyping BAM sample %s: bam=%s (size=%5.3fGiB)...",
             self.params, bam_target['bam']['url'],
             bam_target['bam']['size'] / (1024 * 1024 * 1024))

    # download the bam, and the bai index too
    bam_path = os.path.join(local_input_dir, os.path.basename(bam_target['bam']['url']))
    bai_path = os.path.join(local_input_dir, os.path.basename(bam_target['bai']['url']))
    bunnies.transfers.s3_download_file(bam_target['bam']['url'], bam_path)
    bunnies.transfers.s3_download_file(bam_target['bai']['url'], bai_path)

    # reserve ~200MB for the OS and split the remainder evenly between workers
    num_threads = resources['vcpus']
    memory_mb = resources['memory']
    mb_per_worker = (memory_mb - 200) // num_threads
    java_heap = "-Xmx%dm" % (mb_per_worker,)

    vc_args = [
        "vc",
        "-o", s3_output_prefix,
        "-i", bam_path,
        "-w", workdir,
        "-r", ref_path,
        "-n", str(num_threads),
        "-minbp", "0",
        "-nsegments", str(num_threads * 5),
        "-bgzip",
        "-gatk4",
        "-javaoptions", java_heap
    ]

    if self.params['hc_options']:
        vc_args += ["-vcoptions", " ".join(self.params['hc_options'])]

    bunnies.run_cmd(vc_args, stdout=sys.stdout, stderr=sys.stderr, cwd=workdir)
    bunnies.run_cmd(["ls", "-lh", local_output_dir], stdout=sys.stdout, stderr=sys.stderr, cwd=workdir)

    def _check_output_file(fname, is_optional=False):
        # confirm the expected output object exists under the S3 prefix
        output_url = os.path.join(s3_output_prefix, fname)
        try:
            meta = bunnies.get_blob_meta(output_url)
            return {"size": meta['ContentLength'], "url": output_url, "etag": meta['ETag']}
        except FileNotFoundError:
            if is_optional:
                return None
            raise Exception("missing file: " + output_url)

    pfx = self.sample_name
    output = {
        "gvcf": _check_output_file(pfx + ".g.vcf.gz", True),
        "gvcf_idx": _check_output_file(pfx + ".g.vcf.gz.tbi", True),
        "input_bed": _check_output_file(pfx + ".input.bed", False),
        "output_bed": _check_output_file(pfx + ".scatter.bed", True),
        "scatter_log": _check_output_file(pfx + ".scatter.log", True)
    }
    return output
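
# The "cas -put URL -get md5:DIGEST CASDIR" call above is what lets every job
# scheduled on the same instance share a single copy of the reference. The
# sketch below illustrates that content-addressed caching pattern in plain
# Python. It is an illustration only, not the actual `cas` tool: the helper
# name and casdir layout are assumptions, and plain HTTP is used instead of
# s3:// to keep the sketch self-contained.
import hashlib
import shutil
import urllib.request


def cache_by_md5_sketch(url, md5_digest, casdir):
    """Hypothetical sketch: return a local path for url, keyed by its md5."""
    import os
    cached = os.path.join(casdir, md5_digest)
    if os.path.exists(cached):
        return cached  # someone (possibly another job) already fetched it

    tmp = "%s.tmp.%d" % (cached, os.getpid())
    with urllib.request.urlopen(url) as src, open(tmp, "wb") as dst:
        shutil.copyfileobj(src, dst)

    # verify the download before publishing it under its digest name
    digest = hashlib.md5()
    with open(tmp, "rb") as fd:
        for chunk in iter(lambda: fd.read(1 << 20), b""):
            digest.update(chunk)
    if digest.hexdigest() != md5_digest:
        os.unlink(tmp)
        raise ValueError("md5 mismatch for " + url)

    os.rename(tmp, cached)  # atomic on POSIX; concurrent fetchers converge
    return cached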
def run(self, resources=None, **params):
    """ this runs in the image """
    import os
    import sys
    import tempfile
    import json

    def cache_remote_file(url, md5_digest, casdir):
        # fetch url into the shared content-addressed store, verifying its md5,
        # and return the local path of the cached copy
        return bunnies.run_cmd(
            ["cas", "-put", url, "-get", "md5:" + md5_digest, casdir]).stdout.decode('utf-8').strip()

    workdir = params['workdir']
    s3_output_prefix = self.output_prefix()
    local_output_dir = os.path.join(workdir, "output")
    cas_dir = "/localscratch/cas"
    os.makedirs(cas_dir, exist_ok=True)
    os.makedirs(local_output_dir, exist_ok=True)

    #
    # download reference in /scratch
    # /scratch is shared with other jobs in the same compute environment
    #
    ref_target = self.ref.ls()
    ref_idx_target = self.ref_idx.ls()
    ref_path = cache_remote_file(ref_target['url'], ref_target['digests']['md5'], cas_dir)
    _ = cache_remote_file(ref_idx_target['url'], ref_idx_target['digests']['md5'], cas_dir)

    align_args = ["align", "-cas", cas_dir]
    if self.params['lossy']:
        align_args.append("-lossy")

    r1_target = self.r1.ls()
    r2_target = self.r2.ls() if self.r2 else None

    # write the jobfile describing the (possibly paired) input reads
    jobfile_doc = {
        self.params['sample_name']: {
            "name": self.params['sample_name'],
            "locations": [
                [r1_target['url'], "md5:" + r1_target['digests']['md5']],
                [r2_target['url'], "md5:" + r2_target['digests']['md5']] if r2_target else ["", ""]
            ]
        }
    }
    log.info("align job: %s", repr(jobfile_doc))

    # delete=False: the file must outlive the context manager so the aligner
    # can read it back by name
    with tempfile.NamedTemporaryFile(suffix=".job.txt", mode="wt",
                                     prefix=self.params['sample_name'],
                                     dir=workdir, delete=False) as jobfile_fd:
        json.dump(jobfile_doc, jobfile_fd)

    num_threads = resources['vcpus']
    align_args += [
        "-r", ref_path,
        "-i", jobfile_fd.name,
        "-o", s3_output_prefix,
        "-w", workdir,
        "-m",        # autodetect readgroup info
        "-d", "1",   # mark duplicates
        "-n", str(num_threads),
        "-stats"
    ]
    bunnies.run_cmd(align_args, stdout=sys.stdout, stderr=sys.stderr, cwd=workdir)

    def _check_output_file(field, url, is_optional=False):
        try:
            meta = bunnies.utils.get_blob_meta(url)
            return {"size": meta['ContentLength'], "url": url}
        except bunnies.exc.NoSuchFile:
            if is_optional:
                return None
            raise Exception("output %s missing: %s" % (field, url))

    sn = self.params['sample_name']

    def od(x):
        return os.path.join(s3_output_prefix, x)

    output = {
        "bam": _check_output_file("bam", "%s.bam" % od(sn)),
        "bamstats": _check_output_file("bamstats", "%s.bamstats.txt" % od(sn)),
        "bai": _check_output_file("bai", "%s.bai" % od(sn)),
        "illuminametrics": _check_output_file("illuminametrics", "%s.illuminametrics.txt" % od(sn)),
        "dupmetrics": _check_output_file("dupmetrics", "%s.dupmetrics.txt" % od(sn)),
        "bam_md5": _check_output_file("bam.md5", "%s.bam.md5" % od(sn))
    }
    return output
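
# For reference, the jobfile serialized above is a single-entry JSON document
# keyed by sample name. The helper below just pretty-prints what such a
# document would look like for a hypothetical paired-end sample; the bucket,
# file names, and digests are made up for illustration.
import json as _json


def example_align_jobfile():
    doc = {
        "sampleA": {
            "name": "sampleA",
            "locations": [
                ["s3://example-bucket/reads/sampleA_R1.fastq.gz",
                 "md5:0123456789abcdef0123456789abcdef"],
                ["s3://example-bucket/reads/sampleA_R2.fastq.gz",
                 "md5:fedcba9876543210fedcba9876543210"],
            ],
        }
    }
    # for a single-end sample (no r2), the second entry collapses to ["", ""]
    return _json.dumps(doc, indent=4)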
def run(self, **params):
    """ this runs in the image """
    import os
    import sys

    workdir = params['workdir']
    s3_output_prefix = self.output_prefix()
    local_output_dir = os.path.join(workdir, "output")
    local_input_dir = os.path.join(workdir, "input")

    # download input samples
    os.makedirs(local_output_dir, exist_ok=True)
    os.makedirs(local_input_dir, exist_ok=True)

    all_srcs = []
    all_dests = []
    for inputi, inputval in self.inputs.items():
        aligned_target = inputval.ls()
        bam_src, bam_dest = aligned_target['bam']['url'], os.path.join(
            local_input_dir, "input_%s.bam" % (inputi,))
        bai_src, bai_dest = aligned_target['bai']['url'], os.path.join(
            local_input_dir, "input_%s.bai" % (inputi,))
        bunnies.transfers.s3_download_file(bai_src, bai_dest)
        bunnies.transfers.s3_download_file(bam_src, bam_dest)
        all_srcs.append({"bam": bam_src, "bai": bai_src})
        all_dests += [bam_dest, bai_dest]

    merge_args = [
        os.path.join(params["scriptdir"], "scripts", "lane_merger.sh"),
        "--samtools", "/usr/bin/samtools",
        "--sambamba", "/usr/local/bin/sambamba_v0.6.6",
        "--samplename", self.sample_name,
        "--tmpdir", workdir,
        "--delete-old",
        os.path.join(local_output_dir, self.sample_name) + ".bam",  # output.bam
    ] + all_dests
    bunnies.run_cmd(merge_args, stdout=sys.stdout, stderr=sys.stderr, cwd=workdir)

    # record which source BAMs were merged into the final output
    with open(os.path.join(local_output_dir, self.sample_name + ".bam.merged.txt"), "w") as merge_manifest:
        for src in all_srcs:
            merge_manifest.write("\t".join([
                self.sample_name,
                src['bam'],
                os.path.join(s3_output_prefix, self.sample_name + ".bam")
            ]) + "\n")

    bunnies.run_cmd(["find", local_output_dir], stdout=sys.stdout, stderr=sys.stderr, cwd=workdir)

    pfx = self.sample_name

    def _check_output_file(fname, is_optional=False):
        # upload the local output to S3 and report its size and final url
        inpath = os.path.join(local_output_dir, fname)
        output_url = os.path.join(s3_output_prefix, fname)
        try:
            st_size = os.stat(inpath).st_size
            bunnies.transfers.s3_upload_file(inpath, output_url)
            return {"size": st_size, "url": output_url}
        except FileNotFoundError:
            if is_optional:
                return None
            raise Exception("missing file: " + inpath)

    output = {
        "bam": _check_output_file(pfx + ".bam", False),
        "bai": _check_output_file(pfx + ".bam.bai", False),
        "bam_md5": _check_output_file(pfx + ".bam.md5", False),
        "dupmetrics": _check_output_file(pfx + ".dupmetrics.txt", True),
        "bamstats": _check_output_file(pfx + ".bamstats.txt", False),
        "merge_manifest": _check_output_file(pfx + ".bam.merged.txt", False)
    }
    return output
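
# lane_merger.sh itself is not part of this module. The sketch below shows the
# kind of merge-and-index step it is assumed to perform, using samtools through
# subprocess; the binary path, thread count, and single-input handling are
# assumptions, not the script's actual contents.
import shutil as _shutil
import subprocess as _subprocess


def merge_lanes_sketch(output_bam, input_bams, samtools="/usr/bin/samtools", threads=4):
    """Hypothetical stand-in for lane_merger.sh: merge per-lane BAMs, then index."""
    if len(input_bams) == 1:
        # samtools merge wants two or more inputs; a single lane is just copied
        _shutil.copyfile(input_bams[0], output_bam)
    else:
        _subprocess.run([samtools, "merge", "-f", "-@", str(threads), output_bam] +
                        list(input_bams), check=True)
    _subprocess.run([samtools, "index", output_bam], check=True)
    return output_bam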