def main(args):
    """Entry point: filter the input GAMs, validate their paths, and run
    variant calling as a Toil workflow.

    Raises RuntimeError when a gam path is not of the form
    .../<alg>/<reads>/<filename>.gam, and Exception when any Toil job fails.
    """
    options = parse_args(args)
    RealTimeLogger.start_master()

    # Drop any input gam whose path contains one of the (non-empty)
    # comma-separated skip words.
    skip_words = [w for w in options.skip.split(",") if len(w) > 0]
    options.in_gams = [gam for gam in options.in_gams
                       if not any(word in gam for word in skip_words)]

    # Every surviving input must look like .../<alg>/<reads>/<filename>.gam
    for gam in options.in_gams:
        if len(gam.split("/")) < 3 or os.path.splitext(gam)[1] != ".gam":
            raise RuntimeError("Input gam paths must be of the form "
                               ".../<alg>/<reads>/<filename>.gam")

    # Make a root job
    root_job = Job.wrapJobFn(call_variants, options,
                             cores=1, memory="2G", disk="2G")

    # Run it and see how many jobs fail
    failed_jobs = Job.Runner.startToil(root_job, options)

    if failed_jobs > 0:
        raise Exception("{} jobs failed!".format(failed_jobs))

    RealTimeLogger.stop_master()
def main(args):
    """Entry point: validate GAM paths, run the comparison pipeline through
    Toil, then build the summary tables from its JSON output.

    Raises RuntimeError when a gam path is not of the form
    .../<alg>/<reads>/<filename>.gam, and Exception when any Toil job fails.
    """
    options = parse_args(args)
    RealTimeLogger.start_master()

    # Every input must look like .../<alg>/<reads>/<filename>.gam
    for gam in options.in_gams:
        if len(gam.split("/")) < 3 or os.path.splitext(gam)[1] != ".gam":
            raise RuntimeError("Input gam paths must be of the form "
                               ".../<alg>/<reads>/<filename>.gam")

    robust_makedirs(json_out_path(options))
    robust_makedirs(compare_out_path(options))

    # Make a root job
    root_job = Job.wrapJobFn(compute_all_indexes, options,
                             cores=1, memory="2G", disk=0)

    # Run it and see how many jobs fail; in summary-only mode the
    # pipeline is skipped entirely.
    if options.only_summary:
        failed_jobs = 0
    else:
        failed_jobs = Job.Runner.startToil(root_job, options)

    if failed_jobs > 0:
        raise Exception("{} jobs failed!".format(failed_jobs))

    RealTimeLogger.stop_master()

    # make some tables from the json comparison output
    #dist_table(options)
    #acc_table(options)
    snp_count_table(options)
    graph_size_table(options)
def main(args):
    """Entry point: validate the .vg input graphs, run the kmer-index
    comparison through Toil, then draw the cluster comparisons.

    Raises RuntimeError when an input lacks the .vg extension, and
    Exception when any Toil job fails.
    """
    options = parse_args(args)
    RealTimeLogger.start_master()

    # All inputs must be vg graphs.
    for graph in options.graphs:
        if os.path.splitext(graph)[1] != ".vg":
            raise RuntimeError("Input graphs expected to have .vg extension")

    # Make a root job
    root_job = Job.wrapJobFn(compute_kmer_indexes, options,
                             cores=1, memory="2G", disk=0)

    # Run it and see how many jobs fail; summary-only mode skips the
    # pipeline entirely.
    failed_jobs = 0 if options.only_summary \
        else Job.Runner.startToil(root_job, options)

    if failed_jobs > 0:
        raise Exception("{} jobs failed!".format(failed_jobs))

    RealTimeLogger.stop_master()

    # Do the drawing outside toil to get around weird import problems
    cluster_comparisons(options)
def main(args):
    """Entry point: collect per-sample VCFs from the comparison directory,
    merge them into trio VCFs, and run a Mendelian-consistency check.

    Fix: this main started the RealTimeLogger master but, unlike every
    other main in this file, never stopped it; stop_master() is now
    called once the work is done.
    """
    options = parse_args(args)
    RealTimeLogger.start_master()

    robust_makedirs(options.out_dir)

    # Map the comparison directory's outputs to per-sample VCFs, merge
    # them into per-trio VCFs, then evaluate Mendelian consistency.
    vcfmap = munge_vcf_results(options.comp_dir)
    mergetable = make_trio_vcfs(vcfmap, options)
    do_mendel(mergetable, options)

    # Shut the logging master down, consistent with the other mains.
    RealTimeLogger.stop_master()
def main(args):
    """Entry point: run the evaluation root job through Toil and print
    whatever value it returns."""
    options = parse_args(args)
    RealTimeLogger.start_master()

    # Make a root job
    root_job = Job.wrapJobFn(run_and_evaluate, options,
                             cores=1, memory="2G", disk="2G")

    # Run it and get the return value
    answer = Job.Runner.startToil(root_job, options)

    RealTimeLogger.stop_master()

    print("Root return value:")
    print(answer)
def main(args): options = parse_args(args) RealTimeLogger.start_master() if options.classic: # expect call_dir/SAMPLE/region.vcf for sampledir in glob.glob(os.path.join(options.call_dir, "*")): if os.path.isdir(sampledir): sample = os.path.basename(sampledir) vcfs = [] outfile = os.path.join(sampledir, "TOTAL.vcf") for vcf in glob.glob(os.path.join(sampledir, "*.vcf")): if os.path.basename(vcf) in ["BRCA1.vcf", "BRCA2.vcf", "SMA.vcf", "LRC_KIR.vcf", "MHC.vcf"]: run("vcfsort {} > {}.sort".format(vcf, vcf), fail_hard = True) run("bgzip -c {}.sort > {}.gz".format(vcf, vcf), fail_hard = True) run("rm -f {}.sort".format(vcf)) run("tabix -f -p vcf {}.gz".format(vcf), fail_hard = True) vcfs.append("{}.gz".format(vcf)) if len(vcfs) > 0: run("vt cat {} > {}".format(" ".join(vcfs), outfile), fail_hard = True) run("vcfsort {} > {}.sort".format(outfile, outfile), fail_hard = True) run("mv {}.sort {}".format(outfile, outfile), fail_hard = True) run("bgzip -c {} > {}.gz".format(outfile, outfile), fail_hard = True) run("tabix -f -p vcf {}.gz".format(outfile), fail_hard = True) return 0 # expect call_dir/<REGION>/<GRAPH>/<SAMPLE>_sample.vcf # count up regions regions = set() for regiondir in glob.glob(os.path.join(options.call_dir, "*")): if os.path.isdir(regiondir): region = os.path.basename(regiondir) # avoid crufty directories (including outputs of previous runs of this script) if region in ["brca1", "brca2", "mhc", "lrc_kir", "sma"]: regions.add(region) print regions # count up graphs (that are present in every region) graphs = set() gcount = defaultdict(int) for region in regions: for graphdir in glob.glob(os.path.join(options.call_dir, region, "*")): if os.path.isdir(graphdir): graph = os.path.basename(graphdir) gcount[graph] = gcount[graph] + 1 for graph, count in gcount.items(): if count == len(regions): graphs.add(graph) print graphs # count up samples samples = set() scount = defaultdict(int) for region in regions: for graph in graphs: for vcf in 
glob.glob(os.path.join(options.call_dir, region, graph, "*_sample.vcf")): sample = os.path.basename(vcf).split("_")[0] scount[sample] = scount[sample] + 1 for sample, count in scount.items(): samples.add(sample) print samples # make our output directory out_dir = os.path.join(options.call_dir, options.name) robust_makedirs(out_dir) for graph in graphs: g_out_dir = os.path.join(out_dir, graph) for sample in samples: vcf_files = [] for region in regions: vcf = os.path.join(options.call_dir, region, graph, "{}_sample.vcf".format(sample)) if os.path.isfile(vcf): vcf_files.append((region, vcf)) # this sample doesn't span all regions, skip it if len(vcf_files) < len(regions): print "Skipping Sample {} for Graph {}".format(sample, graph) continue # output vcf merge_vcf_path = os.path.join(out_dir, graph, "{}_sample.vcf".format(sample)) # working directory for intermediates / debugging work_path = os.path.join(out_dir, graph, "input", sample) robust_makedirs(work_path) # preprocess all the vcfs and leave in input dir input_files = [] for region, vcf in vcf_files: outbase = os.path.join(work_path, region) run("vcfsort {} > {}.vcf".format(vcf, outbase), fail_hard = True) run("bgzip -f {}.vcf".format(outbase)) run("tabix -f -p vcf {}.vcf.gz".format(outbase)) input_files.append("{}.vcf.gz".format(outbase)) # run the merge run("vt cat {} > {}".format(" ".join(input_files), merge_vcf_path), fail_hard = True) # make an index just in case run("vcfsort {} > {}.sort".format(merge_vcf_path, merge_vcf_path), fail_hard = True) run("mv {}.sort {}".format(merge_vcf_path, merge_vcf_path), fail_hard = True) run("bgzip -c {} > {}.gz".format(merge_vcf_path, merge_vcf_path), fail_hard = True) run("tabix -f -p vcf {}.gz".format(merge_vcf_path, merge_vcf_path), fail_hard = True) return 0
def main(args): options = parse_args(args) RealTimeLogger.start_master() if options.classic: # expect call_dir/SAMPLE/region.vcf for sampledir in glob.glob(os.path.join(options.call_dir, "*")): if os.path.isdir(sampledir): sample = os.path.basename(sampledir) vcfs = [] outfile = os.path.join(sampledir, "TOTAL.vcf") for vcf in glob.glob(os.path.join(sampledir, "*.vcf")): if os.path.basename(vcf) in [ "BRCA1.vcf", "BRCA2.vcf", "SMA.vcf", "LRC_KIR.vcf", "MHC.vcf" ]: run("vcfsort {} > {}.sort".format(vcf, vcf), fail_hard=True) run("bgzip -c {}.sort > {}.gz".format(vcf, vcf), fail_hard=True) run("rm -f {}.sort".format(vcf)) run("tabix -f -p vcf {}.gz".format(vcf), fail_hard=True) vcfs.append("{}.gz".format(vcf)) if len(vcfs) > 0: run("vt cat {} > {}".format(" ".join(vcfs), outfile), fail_hard=True) run("vcfsort {} > {}.sort".format(outfile, outfile), fail_hard=True) run("mv {}.sort {}".format(outfile, outfile), fail_hard=True) run("bgzip -c {} > {}.gz".format(outfile, outfile), fail_hard=True) run("tabix -f -p vcf {}.gz".format(outfile), fail_hard=True) return 0 # expect call_dir/<REGION>/<GRAPH>/<SAMPLE>_sample.vcf # count up regions regions = set() for regiondir in glob.glob(os.path.join(options.call_dir, "*")): if os.path.isdir(regiondir): region = os.path.basename(regiondir) # avoid crufty directories (including outputs of previous runs of this script) if region in ["brca1", "brca2", "mhc", "lrc_kir", "sma"]: regions.add(region) print regions # count up graphs (that are present in every region) graphs = set() gcount = defaultdict(int) for region in regions: for graphdir in glob.glob(os.path.join(options.call_dir, region, "*")): if os.path.isdir(graphdir): graph = os.path.basename(graphdir) gcount[graph] = gcount[graph] + 1 for graph, count in gcount.items(): if count == len(regions): graphs.add(graph) print graphs # count up samples samples = set() scount = defaultdict(int) for region in regions: for graph in graphs: for vcf in glob.glob( 
os.path.join(options.call_dir, region, graph, "*_sample.vcf")): sample = os.path.basename(vcf).split("_")[0] scount[sample] = scount[sample] + 1 for sample, count in scount.items(): samples.add(sample) print samples # make our output directory out_dir = os.path.join(options.call_dir, options.name) robust_makedirs(out_dir) for graph in graphs: g_out_dir = os.path.join(out_dir, graph) for sample in samples: vcf_files = [] for region in regions: vcf = os.path.join(options.call_dir, region, graph, "{}_sample.vcf".format(sample)) if os.path.isfile(vcf): vcf_files.append((region, vcf)) # this sample doesn't span all regions, skip it if len(vcf_files) < len(regions): print "Skipping Sample {} for Graph {}".format(sample, graph) continue # output vcf merge_vcf_path = os.path.join(out_dir, graph, "{}_sample.vcf".format(sample)) # working directory for intermediates / debugging work_path = os.path.join(out_dir, graph, "input", sample) robust_makedirs(work_path) # preprocess all the vcfs and leave in input dir input_files = [] for region, vcf in vcf_files: outbase = os.path.join(work_path, region) run("vcfsort {} > {}.vcf".format(vcf, outbase), fail_hard=True) run("bgzip -f {}.vcf".format(outbase)) run("tabix -f -p vcf {}.vcf.gz".format(outbase)) input_files.append("{}.vcf.gz".format(outbase)) # run the merge run("vt cat {} > {}".format(" ".join(input_files), merge_vcf_path), fail_hard=True) # make an index just in case run("vcfsort {} > {}.sort".format(merge_vcf_path, merge_vcf_path), fail_hard=True) run("mv {}.sort {}".format(merge_vcf_path, merge_vcf_path), fail_hard=True) run("bgzip -c {} > {}.gz".format(merge_vcf_path, merge_vcf_path), fail_hard=True) run("tabix -f -p vcf {}.gz".format(merge_vcf_path, merge_vcf_path), fail_hard=True) return 0