def main(args):
    options = parse_args(args)

    RealTimeLogger.start_master()

    filtered_gams = []
    skip_words = options.skip.split(",")
    for gam in options.in_gams:
        skip_gam = False
        for word in skip_words:
            if len(word) > 0 and word in gam:
                skip_gam = True
        if not skip_gam:
            filtered_gams.append(gam)
    options.in_gams = filtered_gams

    for gam in options.in_gams:
        if len(gam.split("/")) < 3 or os.path.splitext(gam)[1] != ".gam":
            raise RuntimeError("Input gam paths must be of the form "
                               ".../<alg>/<reads>/<filename>.gam")

    # Make a root job
    root_job = Job.wrapJobFn(call_variants, options,
                             cores=1, memory="2G", disk="2G")

    # Run it and see how many jobs fail
    failed_jobs = Job.Runner.startToil(root_job, options)

    if failed_jobs > 0:
        raise Exception("{} jobs failed!".format(failed_jobs))

    RealTimeLogger.stop_master()
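# A minimal standalone sketch of the filtering/validation contract enforced by
# main() above, so it can be exercised outside Toil. The path layout
# .../<alg>/<reads>/<filename>.gam comes from the RuntimeError message; the
# helper name and the example paths below are hypothetical.
import os

def filter_gams(in_gams, skip):
    """ Drop any gam whose path contains a comma-separated skip word. """
    skip_words = [w for w in skip.split(",") if len(w) > 0]
    kept = [g for g in in_gams if not any(w in g for w in skip_words)]
    for gam in kept:
        if len(gam.split("/")) < 3 or os.path.splitext(gam)[1] != ".gam":
            raise RuntimeError("Input gam paths must be of the form "
                               ".../<alg>/<reads>/<filename>.gam")
    return kept

# Hypothetical inputs: the "debug" alignment is skipped, the rest validate.
assert filter_gams(["vg/sim/reads.gam", "debug/sim/reads.gam"],
                   "debug,") == ["vg/sim/reads.gam"]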
def main(args):
    options = parse_args(args)

    RealTimeLogger.start_master()

    for graph in options.graphs:
        if os.path.splitext(graph)[1] != ".vg":
            raise RuntimeError("Input graphs expected to have .vg extension")

    # Make a root job
    root_job = Job.wrapJobFn(compute_kmer_indexes, options,
                             cores=1, memory="2G", disk=0)

    # Run it and see how many jobs fail
    if not options.only_summary:
        failed_jobs = Job.Runner.startToil(root_job, options)
    else:
        failed_jobs = 0

    if failed_jobs > 0:
        raise Exception("{} jobs failed!".format(failed_jobs))

    RealTimeLogger.stop_master()

    # Do the drawing outside toil to get around weird import problems
    cluster_comparisons(options)
def compute_vg_variants(job, input_gam, options):
    """ run vg pileup and vg call on the input
    """
    input_graph_path = graph_path(input_gam, options)
    out_pileup_path = pileup_path(input_gam, options)
    out_sample_vg_path = sample_vg_path(input_gam, options)
    out_augmented_vg_path = augmented_vg_path(input_gam, options)

    do_pu = options.overwrite or not os.path.isfile(out_pileup_path)
    do_call = do_pu or not os.path.isfile(out_sample_vg_path)
    do_aug = do_pu or not os.path.isfile(out_augmented_vg_path)

    if do_pu:
        RealTimeLogger.get().info("Computing Variants for {} {}".format(
            input_graph_path, input_gam))
        robust_makedirs(os.path.dirname(out_pileup_path))
        run("vg pileup {} {} -t {} > {}".format(input_graph_path,
                                                input_gam,
                                                options.vg_cores,
                                                out_pileup_path))

    if do_call:
        robust_makedirs(os.path.dirname(out_sample_vg_path))
        run("vg call {} {} -r 0.001 -d 50 -e 150 -s 25 -t {} | vg ids -c - | vg ids -s - > {}".format(
            input_graph_path, out_pileup_path, options.vg_cores,
            out_sample_vg_path))

    if do_aug:
        robust_makedirs(os.path.dirname(out_augmented_vg_path))
        run("vg call {} {} -r 0.001 -d 50 -e 150 -s 25 -t {} -l | vg ids -c - | vg ids -s - > {}".format(
            input_graph_path, out_pileup_path, options.vg_cores,
            out_augmented_vg_path))
def run(cmd, stdout=sys.stdout, stderr=sys.stderr, timeout_sec=sys.maxint,
        timeout_dep=None, fail_hard=False):
    """ run command in shell and barf if it doesn't work or times out
    """
    RealTimeLogger.get().info("RUN: {}".format(cmd))

    proc = subprocess.Popen(cmd, shell=True, bufsize=-1,
                            stdout=stdout, stderr=stderr)

    def timeout_fail(proc, cmd):
        os.kill(proc.pid, signal.SIGKILL)
        proc.kill()
        # We often check to see if some output file exists before running.
        # If we time out, make sure the file exists so rerunning won't time
        # out again at the same place (unless overwrite explicitly desired).
        if timeout_dep is not None and os.path.exists(timeout_dep):
            os.system("rm -rf {}; echo timeout > {}".format(timeout_dep,
                                                            timeout_dep))
        if fail_hard is True:
            raise RuntimeError("Command: {} timed out".format(cmd))
        else:
            RealTimeLogger.get().warning("Command: {} timed out".format(cmd))

    # based on a comment in http://stackoverflow.com/questions/1191374/using-module-subprocess-with-timeout
    timer = Timer(timeout_sec, timeout_fail, [proc, cmd])
    try:
        timer.start()
        stdout, stderr = proc.communicate()
    finally:
        timer.cancel()

    sts = proc.wait()
    if sts != 0:
        if fail_hard is True:
            raise RuntimeError("Command: %s exited with non-zero status %i" % (cmd, sts))
        else:
            RealTimeLogger.get().warning("Command: {} exited with non-zero status {}".format(cmd, sts))
    return sts
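# Example use of run() above, with a hypothetical command and paths. The
# timeout_dep file is the output the command is expected to produce: if the
# command times out while that file exists, run() replaces it with a stub
# containing "timeout", so a rerun that only checks os.path.isfile() will
# skip the step instead of hanging again in the same place.
run("vg pileup graph.vg reads.gam > out.pileup",  # hypothetical command
    timeout_sec=3600,                             # give up after an hour
    timeout_dep="out.pileup",                     # stub this file on timeout
    fail_hard=False)                              # warn instead of raising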
def main(args):
    options = parse_args(args)

    RealTimeLogger.start_master()

    for gam in options.in_gams:
        if len(gam.split("/")) < 3 or os.path.splitext(gam)[1] != ".gam":
            raise RuntimeError("Input gam paths must be of the form "
                               ".../<alg>/<reads>/<filename>.gam")

    robust_makedirs(json_out_path(options))
    robust_makedirs(compare_out_path(options))

    # Make a root job
    root_job = Job.wrapJobFn(compute_all_indexes, options,
                             cores=1, memory="2G", disk=0)

    # Run it and see how many jobs fail
    if not options.only_summary:
        failed_jobs = Job.Runner.startToil(root_job, options)
    else:
        failed_jobs = 0

    if failed_jobs > 0:
        raise Exception("{} jobs failed!".format(failed_jobs))

    RealTimeLogger.stop_master()

    # make some tables from the json comparison output
    #dist_table(options)
    #acc_table(options)
    snp_count_table(options)
    graph_size_table(options)
def run(cmd, stdout=sys.stdout, stderr=sys.stderr):
    """ run command in shell and barf if it doesn't work
    (copied from system() in sonLib.bioio)
    """
    RealTimeLogger.get().info("RUN: {}".format(cmd))

    sts = subprocess.call(cmd, shell=True, bufsize=-1,
                          stdout=stdout, stderr=stderr)
    if sts != 0:
        raise RuntimeError("Command: %s exited with non-zero status %i" % (cmd, sts))
    return sts
def main(args):
    options = parse_args(args)

    RealTimeLogger.start_master()

    robust_makedirs(options.out_dir)

    vcfmap = munge_vcf_results(options.comp_dir)
    mergetable = make_trio_vcfs(vcfmap, options)
    do_mendel(mergetable, options)
def main(args):
    options = parse_args(args)

    RealTimeLogger.start_master()

    # Make a root job
    root_job = Job.wrapJobFn(run_and_evaluate, options,
                             cores=1, memory="2G", disk="2G")

    # Run it and get the return value
    answer = Job.Runner.startToil(root_job, options)

    RealTimeLogger.stop_master()

    print("Root return value:")
    print(answer)
def add_optional_followon(job, job_function, *args, **kwargs):
    """
    Given a job, and a Toil job function with pre-execution support, create a
    new follow-on of the given job that runs the given job function with the
    given args and kwargs (which are probably all Toil args like "cores" and
    "memory").

    The job function has to support "pre-execution": you can call it with None
    for the job argument, and it will return True if it thinks it really needs
    to be scheduled, and False otherwise. This lets us encapsulate the "is the
    job done?" code with the "what does the job do?" code, which might be a
    good design.

    Note that kwargs are discarded in pre-execution mode, because Toil doesn't
    offer a simple way to strip the ones it is going to consume, and the job
    functions shouldn't have to deal with extra ones.

    If the job needs to be scheduled, returns the newly created Toil job.
    Otherwise, returns the Toil job we were going to add a follow-on to.
    """

    # Run the job with None as a first argument and see what it returns. TODO:
    # filter out Toil kwargs and enable us to pass the remaining ones to the
    # job function.
    need_to_run = job_function(None, *args)

    if need_to_run is not False and need_to_run is not True:
        # It's not speaking the protocol
        raise RuntimeError("Job function {} does not have pre-execution support!".format(job_function))

    if need_to_run:
        # Actually schedule
        RealTimeLogger.get().debug("Running {}".format(job_function))
        return job.addFollowOnJobFn(job_function, *args, **kwargs)
    else:
        # Don't schedule, and give back the job we were adding onto instead.
        RealTimeLogger.get().debug("Skipping {}".format(job_function))
        return job
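# A sketch of a job function that speaks the "pre-execution" protocol that
# add_optional_followon() expects. The name, the output path, and the vg
# command are hypothetical; the shape (return True/False when job is None,
# do the real work otherwise) is what the protocol requires.
def make_index(job, graph_path, index_path, options):
    """ Build an index for graph_path at index_path, unless it exists. """
    if job is None:
        # Pre-execution mode: report whether we actually need to run.
        return options.overwrite or not os.path.isfile(index_path)
    # Normal execution mode: do the work.
    RealTimeLogger.get().info("Indexing {}".format(graph_path))
    run("vg index -x {} {}".format(index_path, graph_path))

# Then, from inside a running Toil job:
#   next_job = add_optional_followon(job, make_index,
#                                    graph_path, index_path, options,
#                                    cores=1, memory="2G", disk="2G")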
def main(args):
    options = parse_args(args)

    RealTimeLogger.start_master()

    if options.classic:
        # expect call_dir/SAMPLE/region.vcf
        for sampledir in glob.glob(os.path.join(options.call_dir, "*")):
            if os.path.isdir(sampledir):
                sample = os.path.basename(sampledir)
                vcfs = []
                outfile = os.path.join(sampledir, "TOTAL.vcf")
                for vcf in glob.glob(os.path.join(sampledir, "*.vcf")):
                    if os.path.basename(vcf) in ["BRCA1.vcf", "BRCA2.vcf", "SMA.vcf",
                                                 "LRC_KIR.vcf", "MHC.vcf"]:
                        run("vcfsort {} > {}.sort".format(vcf, vcf), fail_hard=True)
                        run("bgzip -c {}.sort > {}.gz".format(vcf, vcf), fail_hard=True)
                        run("rm -f {}.sort".format(vcf))
                        run("tabix -f -p vcf {}.gz".format(vcf), fail_hard=True)
                        vcfs.append("{}.gz".format(vcf))
                if len(vcfs) > 0:
                    run("vt cat {} > {}".format(" ".join(vcfs), outfile), fail_hard=True)
                    run("vcfsort {} > {}.sort".format(outfile, outfile), fail_hard=True)
                    run("mv {}.sort {}".format(outfile, outfile), fail_hard=True)
                    run("bgzip -c {} > {}.gz".format(outfile, outfile), fail_hard=True)
                    run("tabix -f -p vcf {}.gz".format(outfile), fail_hard=True)
        return 0

    # expect call_dir/<REGION>/<GRAPH>/<SAMPLE>_sample.vcf

    # count up regions
    regions = set()
    for regiondir in glob.glob(os.path.join(options.call_dir, "*")):
        if os.path.isdir(regiondir):
            region = os.path.basename(regiondir)
            # avoid crufty directories (including outputs of previous runs of this script)
            if region in ["brca1", "brca2", "mhc", "lrc_kir", "sma"]:
                regions.add(region)
    print regions

    # count up graphs (that are present in every region)
    graphs = set()
    gcount = defaultdict(int)
    for region in regions:
        for graphdir in glob.glob(os.path.join(options.call_dir, region, "*")):
            if os.path.isdir(graphdir):
                graph = os.path.basename(graphdir)
                gcount[graph] = gcount[graph] + 1
    for graph, count in gcount.items():
        if count == len(regions):
            graphs.add(graph)
    print graphs

    # count up samples
    samples = set()
    scount = defaultdict(int)
    for region in regions:
        for graph in graphs:
            for vcf in glob.glob(os.path.join(options.call_dir, region, graph,
                                              "*_sample.vcf")):
                sample = os.path.basename(vcf).split("_")[0]
                scount[sample] = scount[sample] + 1
    for sample, count in scount.items():
        samples.add(sample)
    print samples

    # make our output directory
    out_dir = os.path.join(options.call_dir, options.name)
    robust_makedirs(out_dir)

    for graph in graphs:
        g_out_dir = os.path.join(out_dir, graph)
        for sample in samples:
            vcf_files = []
            for region in regions:
                vcf = os.path.join(options.call_dir, region, graph,
                                   "{}_sample.vcf".format(sample))
                if os.path.isfile(vcf):
                    vcf_files.append((region, vcf))
            # this sample doesn't span all regions, skip it
            if len(vcf_files) < len(regions):
                print "Skipping Sample {} for Graph {}".format(sample, graph)
                continue
            # output vcf
            merge_vcf_path = os.path.join(out_dir, graph,
                                          "{}_sample.vcf".format(sample))
            # working directory for intermediates / debugging
            work_path = os.path.join(out_dir, graph, "input", sample)
            robust_makedirs(work_path)
            # preprocess all the vcfs and leave in input dir
            input_files = []
            for region, vcf in vcf_files:
                outbase = os.path.join(work_path, region)
                run("vcfsort {} > {}.vcf".format(vcf, outbase), fail_hard=True)
                run("bgzip -f {}.vcf".format(outbase))
                run("tabix -f -p vcf {}.vcf.gz".format(outbase))
                input_files.append("{}.vcf.gz".format(outbase))
            # run the merge
            run("vt cat {} > {}".format(" ".join(input_files), merge_vcf_path),
                fail_hard=True)
            # make an index just in case
            run("vcfsort {} > {}.sort".format(merge_vcf_path, merge_vcf_path),
                fail_hard=True)
            run("mv {}.sort {}".format(merge_vcf_path, merge_vcf_path),
                fail_hard=True)
            run("bgzip -c {} > {}.gz".format(merge_vcf_path, merge_vcf_path),
                fail_hard=True)
            run("tabix -f -p vcf {}.gz".format(merge_vcf_path), fail_hard=True)

    return 0
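# For reference, the directory layout main() above expects in non-classic
# mode, with hypothetical graph and sample names:
#
#   call_dir/
#     brca1/
#       snp1kg/
#         NA12878_sample.vcf
#     brca2/
#       snp1kg/
#         NA12878_sample.vcf
#     ...
#
# and the merged per-sample output it produces:
#
#   call_dir/<options.name>/snp1kg/NA12878_sample.vcf[.gz, .gz.tbi]
#
# with sorted, bgzipped per-region intermediates kept under
# call_dir/<options.name>/snp1kg/input/NA12878/ for debugging.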
def compute_vg_variants(job, input_gam, options):
    """ run vg pileup and vg call on the input
    """
    input_graph_path = graph_path(input_gam, options)
    out_pileup_path = pileup_path(input_gam, options)
    out_sample_vg_path = sample_vg_path(input_gam, options)
    out_sample_txt_path = sample_txt_path(input_gam, options)
    out_augmented_vg_path = augmented_vg_path(input_gam, options)

    do_pu = options.overwrite or not os.path.isfile(out_pileup_path)
    do_call = do_pu or not os.path.isfile(out_augmented_vg_path)
    do_sample = options.sample and (do_pu or not os.path.isfile(out_sample_vg_path))
    do_vcf = do_call or not os.path.isfile(out_sample_vg_path.replace(".vg", ".vcf"))

    if do_pu:
        RealTimeLogger.get().info("Computing Variants for {} {}".format(
            input_graph_path, input_gam))
        robust_makedirs(os.path.dirname(out_pileup_path))
        run("vg filter {} {} | vg pileup {} - {} -t {} > {}".format(
            input_gam, options.filter_opts, input_graph_path,
            options.pileup_opts, options.vg_cores, out_pileup_path),
            fail_hard=True)

    if do_call:
        robust_makedirs(os.path.dirname(out_augmented_vg_path))
        run("vg call {} {} {} -l -c {} -t {} > {}".format(
            input_graph_path, out_pileup_path, options.call_opts,
            out_sample_txt_path, options.vg_cores, out_augmented_vg_path),
            fail_hard=True)

    if do_vcf:
        region = alignment_region_tag(input_gam, options)
        g1kbed_path = os.path.join(options.g1kvcf_path, region.upper() + ".bed")
        with open(g1kbed_path) as f:
            contig, offset = f.readline().split()[0:2]

        # make the vcf
        # can only do this if there is a "ref" path in the vg graph
        ref = None
        res_path = temp_path(options)
        for ref_name in ["ref", contig]:
            run("scripts/vgHasPath.sh {} {} > {}".format(input_graph_path,
                                                         ref_name, res_path))
            with open(res_path) as res_file:
                if res_file.read()[0] == "1":
                    ref = ref_name
                    break
        run("rm {}".format(res_path))

        if ref is not None:
            robust_makedirs(os.path.dirname(out_sample_vg_path))
            run("glenn2vcf {} {} -o {} -r {} -c {} -s {} -d {} > {} 2> {}".format(
                out_augmented_vg_path, out_sample_txt_path, offset, ref, contig,
                alignment_sample_tag(input_gam, options), options.depth,
                out_sample_vg_path.replace(".vg", ".vcf"),
                out_sample_vg_path.replace(".vg", ".vcf.stderr")),
                fail_hard=True)

    if do_sample:
        robust_makedirs(os.path.dirname(out_sample_vg_path))
        run("vg call {} {} {} -t {} | vg ids -cs - > {}".format(
            input_graph_path, out_pileup_path, options.call_opts,
            options.vg_cores, out_sample_vg_path),
            fail_hard=True)
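# For reference, the BED parse above takes contig and offset from the first
# two columns of the region's BED file. A (hypothetical) g1kvcf/BRCA1.bed
# beginning with
#
#   17  43044293  43125482
#
# yields contig="17" and offset="43044293", which glenn2vcf is given so the
# region-local calls land back in reference coordinates.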
def run(cmds, stdout=sys.stdout, stderr=sys.stderr, timeout_sec=sys.maxint,
        timeout_dep=None, fail_hard=False):
    """
    Run commands in the given list in the shell, piping each into the next.
    Throw an exception if any of the commands fails or if the whole pipeline
    times out.
    """

    def timeout_fail(procs, cmds):
        """
        Called when the given processes, launched from the given commands,
        have run for too long. Kills the processes and logs an error.
        """
        for proc in procs:
            os.kill(proc.pid, signal.SIGKILL)
            proc.kill()
        # We often check to see if some output file exists before running.
        # If we time out, make sure the file exists so rerunning won't time
        # out again at the same place (unless overwrite explicitly desired).
        if timeout_dep is not None and os.path.exists(timeout_dep):
            os.system("rm -rf {}; echo timeout > {}".format(timeout_dep,
                                                            timeout_dep))
        if fail_hard is True:
            raise RuntimeError("Command: {} timed out".format(" | ".join(cmds)))
        else:
            RealTimeLogger.get().warning("Command: {} timed out".format(" | ".join(cmds)))

    RealTimeLogger.get().info("RUN: {}".format(" | ".join(cmds)))

    # We have a list of processes, one per command.
    procs = []
    # We remember the previous process's standard output
    last_stdout = None

    for cmd in cmds[:-1]:
        # All but the last command feed their standard output into pipes
        proc = subprocess.Popen(cmd, shell=True, bufsize=-1, stdin=last_stdout,
                                stdout=subprocess.PIPE, stderr=stderr)
        last_stdout = proc.stdout
        procs.append(proc)

    for cmd in cmds[-1:]:
        # The last command, if any, just dumps to our standard output
        proc = subprocess.Popen(cmd, shell=True, bufsize=-1, stdin=last_stdout,
                                stdout=stdout, stderr=stderr)
        procs.append(proc)

    # We collect the return codes
    statuses = []

    # based on a comment in http://stackoverflow.com/questions/1191374/using-module-subprocess-with-timeout
    timer = Timer(timeout_sec, timeout_fail, [procs, cmds])
    try:
        timer.start()
        for proc, cmd in itertools.izip(procs, cmds):
            sts = proc.wait()
            statuses.append(sts)
            if sts != 0:
                message = "Command: {} in pipeline {} exited with non-zero status {}".format(
                    cmd, " | ".join(cmds), sts)
                if fail_hard is True:
                    raise RuntimeError(message)
                else:
                    RealTimeLogger.get().warning(message)
    finally:
        timer.cancel()

    if len(statuses) > 0:
        # Return the max return code (0 if everything worked)
        return max(statuses)
    else:
        # Nothing bad happened because nothing happened
        return 0
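# Example use of the pipelined run() above, with hypothetical commands. Each
# list element becomes one shell process, and each process's stdout is fed
# into the next one's stdin, so this behaves like the shell pipeline
# "vg view -j graph.vg | jq -c .node[] | wc -l" with a ten-minute timeout.
with open("node_count.txt", "w") as out_file:
    run(["vg view -j graph.vg",  # hypothetical graph; emits JSON
         "jq -c .node[]",        # one line per node
         "wc -l"],               # count them
        stdout=out_file, timeout_sec=600, fail_hard=True)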
def run_experiment(job, options):
    """
    Toil job to run an experiment on a variety of conditions and compare the
    results.
    """

    # Make the IOStore we can search for GAMs
    gam_store = IOStore.get(options.in_gams)
    # And one so we can check if truth files exist
    truth_store = IOStore.get(options.truth)

    # This will hold best F score by region, graph, sample, and then
    # condition. We stick in dicts by condition.
    results = collections.defaultdict(lambda: collections.defaultdict(dict))

    # Make some experimental conditions with filter, pileup, call, and
    # glenn2vcf options.

    # First define the lists we want the product of for all the parameters
    grid = [{ # vg filter
        "-r": [0.97], # minimum score to keep primary alignment [default=0]
        "-d": [0],    # minimum (primary - secondary) score delta to keep secondary alignment
        "-e": [0],    # minimum (primary - secondary) score delta to keep primary alignment
        "-a": [""],   # use (secondary / primary) for delta comparisons
        "-f": [""],   # normalize score based on length
        "-u": [""],   # use substitution count instead of score
        "-s": [2],    # minimum score to keep secondary alignment [default=0]
        "-o": [0]     # filter reads whose alignments begin or end with an insert > N [default=99999]
    }, { # vg pileup
        "-w": [40],   # size of window to apply -m option (default=0)
        "-m": [2],    # ignore bases with > N mismatches within window centered on read (default=1)
        "-q": [10]    # ignore bases with PHRED quality < N (default=0)
    }, { # vg call
        "-r": [0.0001], # Prior for being heterozygous
        "-b": [1.0],    # Max strand bias
        "-f": [0.05],   # Min fraction of reads required to support a variant
        "-d": [4]       # Min pileup depth
    }, { # glenn2vcf
        "--depth": [10],          # search depth not read depth
        "--min_fraction": [0.15], # Min fraction of average coverage to call at
        "--min_count": [6],       # Min total supporting reads for an allele to have it
        "--max_het_bias": [4.2]   # Max bias towards one alt of a called het
    }, { # vcfeval
        "--all-records": [""],
        "--vcf-score-field": ["XAAD"]
    }]

    # Make the whole grid of conditions for the grid search
    conditions = [ExperimentCondition(*point) for point in make_grid(grid)]

    # Add a condition that opens everything way up so we can try and get
    # maximum recall.
    conditions.append(ExperimentCondition(
        { # vg filter
            "-r": 0,
            "-d": 0.05,
            "-e": 0.05,
            "-a": "",
            "-f": "",
            "-u": "",
            "-s": 10000,
            "-o": 99999
        },
        { # vg pileup
            "-w": 40,
            "-m": 10,
            "-q": 10
        },
        { # vg call
            "-r": 0.0001,
            "-b": 0.4,
            "-f": 0.25,
            "-d": 11
        },
        { # glenn2vcf
            "--depth": 10,
            "--min_fraction": 0, # Min fraction of average coverage to call at
            "--min_count": 1,    # Min total supporting reads for an allele to have it
            "--max_het_bias": 20 # Max bias towards one alt of a called het
        },
        { # vcfeval
            "--all-records": "",
            "--vcf-score-field": "XAAD"
        })
    )

    RealTimeLogger.get().info("Running {} conditions...".format(len(conditions)))

    for region_dir in gam_store.list_input_directory(""):
        # Within every region we have samples for, look through all the
        # different graphs.

        if (options.important_regions is not None and
            region_dir not in options.important_regions):
            # Skip it if it's unimportant
            continue

        for graph_dir in gam_store.list_input_directory(region_dir):
            # Within every graph for a region, we have a collection of
            # samples.

            if ("{}:{}".format(region_dir, graph_dir) in options.blacklist or
                region_dir in options.blacklist or
                graph_dir in options.blacklist):
                # We don't want to process this region/graph pair.
                RealTimeLogger.get().info("Skipping {} graph {}".format(
                    region_dir, graph_dir))
                continue

            if (options.important_graphs is not None and
                graph_dir not in options.important_graphs):
                # Skip it if it's unimportant
                continue

            for filename in gam_store.list_input_directory("{}/{}".format(
                region_dir, graph_dir)):
                # Look at each potential sample file

                # Is this file a sample?
                match = re.match("(.+)\\.gam$", filename)
                if not match:
                    # It's not a sample
                    continue

                if (options.important_samples is not None and
                    filename not in options.important_samples):
                    # Skip it if it's unimportant
                    continue

                # Otherwise, compose the full GAM key
                gam_key = "{}/{}/{}".format(region_dir, graph_dir, filename)

                if (not truth_store.exists(truth_compressed_key(gam_key)) or
                    not truth_store.exists(truth_index_key(gam_key))):
                    # We don't have a truth for this sample, so don't bother
                    # doing it.
                    RealTimeLogger.get().warning("Skipping missing truth for {}".format(gam_key))
                    continue

                # Kick off a pipeline to make the variant calls.
                # TODO: assumes all the extra directories we need to read
                # stuff from are set
                exp_job = job.addChildJobFn(run_conditions, gam_key,
                                            conditions, options,
                                            cores=1, memory="2G", disk="10G")

                # Save the best F score by condition under this region, graph,
                # and sample filename
                results[region_dir][graph_dir][filename] = exp_job.rv()

    # Give back the results
    # TODO: we run it through JSON to fix the pickle-ability.
    return de_defaultdict(results)
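# make_grid() is not shown in this file; a minimal sketch consistent with how
# run_experiment() above uses it (the cartesian product over each dict of
# option-value lists, yielding one tuple of concrete option dicts per grid
# point, suitable for ExperimentCondition(*point)) might look like this:
import itertools

def make_grid(grid):
    """ Expand a list of {option: [values]} dicts into all combinations. """
    per_tool = []
    for tool_opts in grid:
        # Enumerate every {option: value} assignment for this tool.
        keys = sorted(tool_opts.keys())
        combos = [dict(zip(keys, values)) for values
                  in itertools.product(*[tool_opts[k] for k in keys])]
        per_tool.append(combos)
    # A grid point is one assignment per tool, in order.
    return itertools.product(*per_tool)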