def write_filtered(vcffile, lhome, store=None):
    if vcffile.startswith("s3://"):
        vcffile = pull_from_s3(vcffile)

    filteredvcf = op.basename(vcffile).replace(".vcf", ".filtered.vcf")
    cmd = "python {}/scripts/lobSTR_filter_vcf.py".format(lhome)
    cmd += " --vcf {}".format(vcffile)
    cmd += " --loc-cov 5 --loc-log-score 0.8"
    #cmd += " --loc-call-rate 0.8 --loc-max-ref-length 80"
    #cmd += " --call-cov 5 --call-log-score 0.8 --call-dist-end 20"
    sh(cmd, outfile=filteredvcf)

    if store:
        push_to_s3(store, filteredvcf)

    return filteredvcf
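# A minimal usage sketch for write_filtered(): filter a per-chromosome lobSTR
# VCF and mirror the result to S3. The paths below are hypothetical; the call
# assumes lhome contains scripts/lobSTR_filter_vcf.py and that AWS credentials
# are configured for pull_from_s3()/push_to_s3().
def _example_write_filtered():
    filteredvcf = write_filtered("s3://mybucket/str-data/sample.chr1.vcf",
                                 lhome="/mnt/software/lobSTR",
                                 store="s3://mybucket/str-data")
    return filteredvcf  # e.g. sample.chr1.filtered.vcf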
def write_csv_ev(filename, filtered, cleanup, store=None, stutter=False):
    lv = LobSTRvcf()
    lv.parse(filename, filtered=filtered, cleanup=cleanup, stutter=stutter)
    csvfile = op.basename(filename) + ".csv"
    evfile = op.basename(filename) + ".ev"
    if stutter:
        fw = open(evfile, "w")
        print(lv.evline, file=fw)
        fw.close()
        return

    fw = open(csvfile, "w")
    print(lv.csvline, file=fw)
    fw.close()

    fw = open(evfile, "w")
    print(lv.evline, file=fw)
    fw.close()

    # Save to s3
    if store:
        push_to_s3(store, csvfile)
        push_to_s3(store, evfile)
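# A minimal usage sketch for write_csv_ev(), assuming a lobSTR VCF named
# sample.vcf.gz and a hypothetical S3 store. With stutter=True, only the .ev
# file is written and nothing is uploaded; otherwise both the .csv and .ev
# files are written and mirrored to the store when one is given.
def _example_write_csv_ev():
    # Standard run: writes sample.vcf.gz.csv and sample.vcf.gz.ev locally,
    # then pushes both to the (hypothetical) store.
    write_csv_ev("sample.vcf.gz", filtered=True, cleanup=True,
                 store="s3://mybucket/str-data")
    # Stutter mode: only sample.vcf.gz.ev is produced.
    write_csv_ev("sample.vcf.gz", filtered=False, cleanup=False, stutter=True)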
def lobstr(args):
    """
    %prog lobstr lobstr_index1 lobstr_index2 ...

    Run lobSTR on a big BAM file. There can be multiple lobSTR indices. In
    addition, bamfile can be S3 location and --lobstr_home can be S3 location
    (e.g. s3://hli-mv-data-science/htang/str-build/lobSTR/)
    """
    p = OptionParser(lobstr.__doc__)
    p.add_option("--haploid", default="chrY,chrM",
                 help="Use haploid model for these chromosomes")
    p.add_option("--chr", help="Run only this chromosome")
    p.add_option("--simulation", default=False, action="store_true",
                 help="Simulation mode")
    p.set_home("lobstr", default="s3://hli-mv-data-science/htang/str-build/lobSTR/")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)
    bamfile = opts.input_bam_path

    if len(args) < 1 or bamfile is None:
        sys.exit(not p.print_help())

    lbindices = args
    if opts.simulation:  # Simulation mode
        cmd, vcf_file = allelotype_on_chr(bamfile, "chr4",
                                          "/mnt/software/lobSTR/", "TREDs",
                                          haploid=opts.haploid)
        stats_file = vcf_file.rsplit(".", 1)[0] + ".allelotype.stats"
        results_dir = "lobstr_results"
        mkdir(results_dir)
        sh(cmd)
        sh("mv {} {}/ && rm {}".format(vcf_file, results_dir, stats_file))
        return

    s3mode = bamfile.startswith("s3")
    store = opts.output_path
    cleanup = not opts.nocleanup
    workdir = opts.workdir
    mkdir(workdir)
    os.chdir(workdir)

    lhome = opts.lobstr_home
    if lhome.startswith("s3://"):
        lhome = pull_from_s3(lhome, overwrite=False)

    exec_id, sample_id = opts.workflow_execution_id, opts.sample_id
    prefix = [x for x in (exec_id, sample_id) if x]
    if prefix:
        pf = "_".join(prefix)
    else:
        pf = bamfile.split("/")[-1].split(".")[0]

    if s3mode:
        gzfile = pf + ".{0}.vcf.gz".format(lbindices[-1])
        remotegzfile = "{0}/{1}".format(store, gzfile)
        if check_exists_s3(remotegzfile):
            logging.debug("Object `{0}` exists. Computation skipped."
                          .format(remotegzfile))
            return
        localbamfile = pf + ".bam"
        localbaifile = localbamfile + ".bai"
        if op.exists(localbamfile):
            logging.debug("BAM file already downloaded.")
        else:
            pull_from_s3(bamfile, localbamfile)
        if op.exists(localbaifile):
            logging.debug("BAM index file already downloaded.")
        else:
            remotebaifile = bamfile + ".bai"
            if check_exists_s3(remotebaifile):
                pull_from_s3(remotebaifile, localbaifile)
            else:
                remotebaifile = bamfile.rsplit(".")[0] + ".bai"
                if check_exists_s3(remotebaifile):
                    pull_from_s3(remotebaifile, localbaifile)
                else:
                    logging.debug("BAM index cannot be found in S3!")
                    sh("samtools index {0}".format(localbamfile))
        bamfile = localbamfile

    chrs = [opts.chr] if opts.chr else (list(range(1, 23)) + ["X", "Y"])
    for lbidx in lbindices:
        makefile = "makefile.{0}".format(lbidx)
        mm = MakeManager(filename=makefile)
        vcffiles = []
        for chr in chrs:
            cmd, vcffile = allelotype_on_chr(bamfile, chr, lhome, lbidx,
                                             haploid=opts.haploid)
            mm.add(bamfile, vcffile, cmd)
            filteredvcffile = vcffile.replace(".vcf", ".filtered.vcf")
            cmd = "python -m jcvi.variation.str filtervcf {}".format(vcffile)
            cmd += " --lobstr_home {}".format(lhome)
            mm.add(vcffile, filteredvcffile, cmd)
            vcffiles.append(filteredvcffile)

        gzfile = bamfile.split(".")[0] + ".{0}.vcf.gz".format(lbidx)
        cmd = "vcf-concat {0} | vcf-sort".format(" ".join(vcffiles))
        cmd += " | bgzip -c > {0}".format(gzfile)
        mm.add(vcffiles, gzfile, cmd)
        mm.run(cpus=opts.cpus)

        if s3mode:
            push_to_s3(store, gzfile)

        if cleanup:
            mm.clean()
            sh("rm -f {} {} *.bai *.stats".format(bamfile, mm.makefile))
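# lobstr() schedules its per-chromosome jobs through MakeManager, which turns
# each mm.add(source, target, cmd) call into a makefile rule and then runs the
# makefile with the requested parallelism. A minimal sketch of that pattern,
# using stand-in shell commands and hypothetical file names:
def _example_makemanager():
    from jcvi.apps.grid import MakeManager  # same helper used above

    mm = MakeManager(filename="makefile.demo")
    vcffiles = []
    for chr in ["chr21", "chr22"]:  # hypothetical chromosome subset
        vcffile = "sample.{}.vcf".format(chr)
        # Stand-in for the allelotype command returned by allelotype_on_chr()
        cmd = "echo allelotype {} > {}".format(chr, vcffile)
        mm.add("sample.bam", vcffile, cmd)
        vcffiles.append(vcffile)
    # The merge rule depends on all per-chromosome VCFs
    mm.add(vcffiles, "sample.merged.txt",
           "cat {} > sample.merged.txt".format(" ".join(vcffiles)))
    mm.run(cpus=2)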
def cn(args):
    """
    %prog cn workdir 102340_NA12878 \
        s3://hli-bix-us-west-2/kubernetes/wf-root-test/102340_NA12878/lpierce-ccn_gcn-v2/

    Download CCN output folder and convert cib to copy number per 1Kb.
    """
    p = OptionParser(cn.__doc__)
    p.add_option("--binsize", default=1000, type="int",
                 help="Window size along chromosome")
    p.add_option("--cleanup", default=False, action="store_true",
                 help="Clean up downloaded s3 folder")
    p.add_option("--hmm", default=False, action="store_true",
                 help="Run HMM caller after computing CN")
    p.add_option("--upload", default="s3://hli-mv-data-science/htang/ccn",
                 help="Upload cn and seg results to s3")
    p.add_option("--rebuildgc",
                 help="Rebuild GC directory rather than pulling from S3")
    opts, args = p.parse_args(args)

    if len(args) == 2:
        workdir, sample_key = args
        s3dir = None
    elif len(args) == 3:
        workdir, sample_key, s3dir = args
    else:
        sys.exit(not p.print_help())

    n = opts.binsize
    rebuildgc = opts.rebuildgc
    mkdir(workdir)
    sampledir = op.join(workdir, sample_key)
    if s3dir:
        sync_from_s3(s3dir, target_dir=sampledir)

    assert op.exists(sampledir), "Directory {} doesn't exist!".format(sampledir)
    cndir = op.join(workdir, sample_key + "-cn")
    if op.exists(cndir):
        logging.debug("Directory {} exists. Skipped.".format(cndir))
        return

    gcdir = "gc"
    if rebuildgc:
        build_gc_array(fastafile=rebuildgc, n=n, gcdir=gcdir)
    if not op.exists(gcdir):
        sync_from_s3("s3://hli-mv-data-science/htang/ccn/gc",
                     target_dir=gcdir)

    # Build GC correction table
    gc_bin = defaultdict(list)
    gc_med = {}
    coverage = []
    for seqid in allsomes:
        gcfile = op.join(gcdir, "{}.{}.gc".format(seqid, n))
        if not op.exists(gcfile):
            logging.error("File {} not found. Continue anyway.".format(gcfile))
            continue
        gc = np.fromfile(gcfile, dtype=np.uint8)
        cibfile = op.join(sampledir, "{}.{}.cib".format(sample_key, seqid))
        cib = load_cib(cibfile)
        print(seqid, gc.shape[0], cib.shape[0], file=sys.stderr)
        if seqid in autosomes:
            for gci, k in zip(gc, cib):
                gc_bin[gci].append(k)
        coverage.append((seqid, gc, cib))

    for gci, k in gc_bin.items():
        nonzero_k = [x for x in k if x]
        gc_med[gci] = med = np.median(nonzero_k) / 2
        print(gci, len(nonzero_k), med, file=sys.stderr)

    mkdir(cndir)
    apply_fun = np.vectorize(gc_med.get)
    # Apply the GC correction over coverage
    for seqid, gc, cib in coverage:
        nitems = cib.shape[0]
        beta = apply_fun(gc[:nitems])
        beta_cn = cib / beta
        cnfile = op.join(cndir, "{}.{}.cn".format(sample_key, seqid))
        beta_cn.tofile(cnfile)

    # Run HMM caller if asked
    segfile = hmm([workdir, sample_key]) if opts.hmm else None
    upload = opts.upload
    if upload:
        push_to_s3(upload, cndir)
        if segfile:
            push_to_s3(upload, segfile)

    if opts.cleanup:
        import shutil
        shutil.rmtree(sampledir)
        shutil.rmtree(cndir)
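# The GC correction in cn() divides each bin's raw coverage by half the median
# coverage of autosomal bins sharing the same GC value, so diploid regions
# land near copy number 2. A self-contained sketch on synthetic data (the real
# code reads per-chromosome .gc and .cib files instead):
def _example_gc_correction():
    import sys
    from collections import defaultdict
    import numpy as np

    rng = np.random.default_rng(0)
    gc = rng.integers(30, 60, size=1000).astype(np.uint8)  # synthetic GC bins
    cib = rng.poisson(30, size=1000).astype(float)         # synthetic coverage
    gc_bin = defaultdict(list)
    for gci, k in zip(gc, cib):
        gc_bin[gci].append(k)
    # Half-median per GC value: the denominator that maps diploid to CN=2
    gc_med = {gci: np.median([x for x in k if x]) / 2
              for gci, k in gc_bin.items()}
    beta = np.vectorize(gc_med.get)(gc)
    beta_cn = cib / beta
    print("mean CN ~", beta_cn.mean(), file=sys.stderr)  # expect ~2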
def run_mito(chrMfa, bamfile, opts,
             realignonly=False, svonly=False, store=None, cleanup=False):
    from jcvi.formats.sam import get_minibam

    region = "chrM"
    minibam = op.basename(bamfile).replace(".bam", ".{}.bam".format(region))
    if not op.exists(minibam):
        get_minibam(bamfile, region)
    else:
        logging.debug("{} found. Skipped.".format(minibam))

    speedseq_bin = op.join(opts.speedseq_home, "speedseq")

    realign = minibam.rsplit(".", 1)[0] + ".realign"
    realignbam = realign + ".bam"
    margs = " -v -t {} -o {}".format(opts.cpus, realign)
    if need_update(minibam, realign + ".bam"):
        cmd = speedseq_bin + " realign"
        cmd += margs
        cmd += " {} {}".format(chrMfa, minibam)
        sh(cmd)
    else:
        logging.debug("{} found. Skipped.".format(realignbam))

    if realignonly:
        return

    depthfile = realign + ".depth"
    if need_update(realignbam, depthfile):
        coverage([chrMfa, realignbam, "--nosort", "--format=coverage",
                  "--outfile={}".format(depthfile)])

    if store:
        push_to_s3(store, depthfile)

    vcffile = realign + ".sv.vcf.gz"
    if need_update(realignbam, vcffile):
        cmd = speedseq_bin + " sv"
        cmd += margs
        cmd += " -R {}".format(chrMfa)
        cmd += " -m {}".format(opts.support)
        cmd += " -B {} -D {} -S {}".format(realignbam,
                                           realign + ".discordants.bam",
                                           realign + ".splitters.bam")
        sh(cmd)
    else:
        logging.debug("{} found. Skipped.".format(vcffile))

    if store:
        push_to_s3(store, vcffile)

    if svonly:
        if cleanup:
            do_cleanup(minibam, realignbam)
        return

    piledriver = realign + ".piledriver"
    if need_update(realignbam, piledriver):
        cmd = "bamtools piledriver -fasta {}".format(chrMfa)
        cmd += " -in {}".format(realignbam)
        sh(cmd, outfile=piledriver)

    if store:
        push_to_s3(store, piledriver)

    if cleanup:
        do_cleanup(minibam, realignbam)
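# A minimal usage sketch for run_mito(). It expects an opts object carrying
# speedseq_home, cpus and support; the values and paths below are hypothetical
# placeholders.
def _example_run_mito():
    class _Opts(object):
        speedseq_home = "/mnt/software/speedseq"  # hypothetical install path
        cpus = 8
        support = 2  # minimum read support passed to `speedseq sv -m`

    # SV calling only on the realigned chrM mini-BAM, mirroring results to a
    # hypothetical S3 store and cleaning up intermediates afterwards.
    run_mito("chrM.fa", "sample.bam", _Opts(),
             svonly=True, store="s3://mybucket/mito", cleanup=True)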