def run(cmd, stdout=sys.stdout, stderr=sys.stderr, timeout_sec=sys.maxint,
        timeout_dep=None, fail_hard=False):
    """
    Run a command in the shell and complain if it fails or times out.

    cmd: the shell command line (executed with shell=True).
    stdout, stderr: passed straight through to subprocess.Popen.
    timeout_sec: number of seconds after which the command is killed.
    timeout_dep: optional path. If the command times out and this path
        exists, it is replaced by a file containing the word "timeout".
    fail_hard: if True, a timeout or a non-zero exit status raises
        RuntimeError; otherwise only a warning is logged.

    Returns the exit status of the command.
    """

    RealTimeLogger.get().info("RUN: {}".format(cmd))

    proc = subprocess.Popen(cmd, shell=True, bufsize=-1,
                            stdout=stdout, stderr=stderr)

    # Set by the timer thread when it really had to kill the command. A list
    # is used because the nested function cannot rebind our locals.
    timed_out = []

    def timeout_fail(proc):
        """
        Runs in the Timer thread once the timeout has elapsed. It must not
        raise: an exception here would never reach the caller of run().
        """
        if proc.returncode is not None:
            # The main thread already collected the process; its pid may
            # have been recycled, so do not signal it.
            return
        try:
            # NOTE(review): with shell=True this only kills the shell itself;
            # commands it launched may keep running. Confirm whether a
            # process-group kill is wanted.
            proc.kill()
        except OSError:
            # The process went away on its own just before we got here.
            return
        timed_out.append(True)

        # we often check to see if some output file exists before running
        # if we timeout, make sure the file exists so rerunning wont timeout again
        # at same place (unless overwrite explicitly desired)
        if timeout_dep is not None and os.path.exists(timeout_dep):
            os.system("rm -rf {}; echo timeout > {}".format(timeout_dep, timeout_dep))

    # based on a comment in http://stackoverflow.com/questions/1191374/using-module-subprocess-with-timeout
    timer = Timer(timeout_sec, timeout_fail, [proc])
    timer.start()
    try:
        # The returned (stdout, stderr) data is deliberately discarded.
        proc.communicate()
    finally:
        timer.cancel()
        # Wait for a handler that is already running, so that timed_out and
        # the marker file are final before we look at them.
        timer.join()

    sts = proc.wait()

    if timed_out:
        message = "Command: {} timed out".format(cmd)
        if fail_hard is True:
            raise RuntimeError(message)
        RealTimeLogger.get().warning(message)

    if sts != 0:
        if fail_hard is True:
            raise RuntimeError("Command: %s exited with non-zero status %i" % (cmd, sts))
        else:
            RealTimeLogger.get().warning("Command: {} exited with non-zero status {}".format(cmd, sts))
    return sts
def compute_vg_variants(job, input_gam, options):
    """
    Produce the pileup, the sample graph and the augmented graph for one
    input GAM by running vg pileup followed by two vg call pipelines. Steps
    whose output file already exists are skipped unless options.overwrite is
    set or the pileup itself is being recomputed.
    """
    graph = graph_path(input_gam, options)
    pileup = pileup_path(input_gam, options)
    sample_vg = sample_vg_path(input_gam, options)
    augmented_vg = augmented_vg_path(input_gam, options)

    # All decisions are taken before anything runs, so they reflect the state
    # of the output files at entry.
    redo_pileup = options.overwrite or not os.path.isfile(pileup)
    redo_sample = redo_pileup or not os.path.isfile(sample_vg)
    redo_augmented = redo_pileup or not os.path.isfile(augmented_vg)

    if redo_pileup:
        RealTimeLogger.get().info("Computing Variants for {} {}".format(
            graph, input_gam))
        robust_makedirs(os.path.dirname(pileup))
        pileup_cmd = "vg pileup {} {} -t {} > {}".format(
            graph, input_gam, options.vg_cores, pileup)
        run(pileup_cmd)

    # The two calling steps only differ in their command template and in
    # where the result goes.
    call_steps = (
        (redo_sample,
         "vg call {} {} -r 0.001 -d 50 -e 150 -s 25 -t {} | vg ids -c - | vg ids -s - > {}",
         sample_vg),
        (redo_augmented,
         "vg call {} {} -r 0.001 -d 50 -e 150 -s 25 -t {} -l | vg ids -c - | vg ids -s - > {}",
         augmented_vg),
    )
    for wanted, template, destination in call_steps:
        if not wanted:
            continue
        robust_makedirs(os.path.dirname(destination))
        run(template.format(graph, pileup, options.vg_cores, destination))
def timeout_fail(proc, cmd):
    """
    Timeout handler: kill the process that was launched from cmd and report
    the timeout, either by raising or by logging a warning.

    Reads timeout_dep and fail_hard, which are not defined in this block and
    must be supplied by the surrounding scope.
    """
    pid = proc.pid
    os.kill(pid, signal.SIGKILL)
    proc.kill()

    # Callers often test for an output file before running a command. When we
    # time out, swap an existing output for a "timeout" marker so a rerun
    # does not time out at the same place (unless overwrite is requested).
    marker = timeout_dep
    if marker is not None:
        if os.path.exists(marker):
            os.system("rm -rf {}; echo timeout > {}".format(marker, marker))

    hard = fail_hard is True
    message = "Command: {} timed out".format(cmd)
    if hard:
        raise RuntimeError(message)
    RealTimeLogger.get().warning(message)
def timeout_fail(proc, cmd):
    """
    Called when proc, started from the command line cmd, has run for too
    long. Kills it, optionally leaves a "timeout" marker file behind, and
    then raises or warns depending on fail_hard.

    timeout_dep and fail_hard are free variables of this function; they have
    to come from the surrounding scope.
    """
    os.kill(proc.pid, signal.SIGKILL)
    proc.kill()

    # An existing output file is turned into a marker, so that code which
    # skips work when its output exists will not rerun the slow command
    # (unless an overwrite is explicitly requested).
    leave_marker = timeout_dep is not None and os.path.exists(timeout_dep)
    if leave_marker:
        os.system("rm -rf {}; echo timeout > {}".format(timeout_dep, timeout_dep))

    if fail_hard is not True:
        RealTimeLogger.get().warning("Command: {} timed out".format(cmd))
        return
    raise RuntimeError("Command: {} timed out".format(cmd))
def run(cmd, stdout = sys.stdout, stderr = sys.stderr):
    """
    Execute cmd through the shell, logging it first. A non-zero exit status
    raises RuntimeError; otherwise the (zero) status is returned.
    """
    RealTimeLogger.get().info("RUN: {}".format(cmd))

    exit_status = subprocess.call(cmd, shell=True, bufsize=-1,
                                  stdout=stdout, stderr=stderr)
    if exit_status == 0:
        return exit_status

    raise RuntimeError("Command: %s exited with non-zero status %i" % (cmd, exit_status))
def timeout_fail(procs, cmds):
    """
    Called when the given processes, launched from the given commands, have
    run for too long. Kills the processes and logs an error.

    procs: the subprocess.Popen objects of the pipeline.
    cmds: the command lines they were started from, used for the message.

    Reads timeout_dep and fail_hard, which are not defined in this block and
    must be supplied by the surrounding scope. Raises RuntimeError when
    fail_hard is True, otherwise logs a warning.
    """

    for proc in procs:
        if proc.returncode is not None:
            # Already collected; its pid may have been recycled, so it must
            # not be signalled.
            continue
        try:
            # On POSIX this sends SIGKILL, the same signal the previous
            # explicit os.kill() call sent.
            proc.kill()
        except OSError:
            # This process is already gone. Carry on, so that the remaining
            # processes still get killed.
            continue

    # we often check to see if some output file exists before running
    # if we timeout, make sure the file exists so rerunning wont timeout again
    # at same place (unless overwrite explicitly desired)
    if timeout_dep is not None and os.path.exists(timeout_dep):
        os.system("rm -rf {}; echo timeout > {}".format(timeout_dep, timeout_dep))

    if fail_hard is True:
        raise RuntimeError("Command: {} timed out".format(" | ".join(cmds)))
    else:
        RealTimeLogger.get().warning("Command: {} timed out".format(" | ".join(cmds)))
def add_optional_followon(job, job_function, *args, **kwargs):
    """
    Given a job, and a Toil job function with pre-execution support, create a
    new follow on of the given job that runs the given job function with the
    given args and kwargs (which are probably all Toil args like "cores" and
    "memory").

    The job function has to support "pre-execution": you can call it with None
    for the job argument, and it will return True if it thinks it really needs
    to be scheduled, and False otherwise. This lets us encapsulate the "is the
    job done?" code with the "what does the job do?" code.

    Note that kwargs are discarded in pre-execution mode, because Toil doesn't
    offer a simple way to strip the ones it is going to consume, and the job
    functions shouldn't have to deal with extra ones.

    If the job needs to be scheduled, returns the newly created Toil job.
    Otherwise, returns the Toil job we were going to add a followon to.

    Raises RuntimeError if the pre-execution call returns something other
    than True or False.
    """

    # Run the job with None as a first argument and see what it returns. TODO:
    # filter out Toil kwargs and enable us to pass the remaining ones to the job
    # function.
    need_to_run = job_function(None, *args)

    if need_to_run != False and need_to_run != True:
        # It's not speaking the protocol
        raise RuntimeError("Job function {} does not have pre-execution support!".format(job_function))

    if not need_to_run:
        # Don't schedule and give back the job we were adding onto instead.
        RealTimeLogger.get().debug("Skipping {}".format(job_function))
        return job

    # Actually schedule. addFollowOnJobFn is called as a bound method, so the
    # function to run is its first argument; passing the predecessor job
    # there as well would make Toil treat that job as the function.
    RealTimeLogger.get().debug("Running {}".format(job_function))
    return job.addFollowOnJobFn(job_function, *args, **kwargs)
def run(cmd, stdout = sys.stdout, stderr = sys.stderr, timeout_sec = sys.maxint,
        timeout_dep = None, fail_hard = False):
    """
    Run a command in the shell and complain if it fails or times out.

    cmd: the shell command line (executed with shell=True).
    stdout, stderr: passed straight through to subprocess.Popen.
    timeout_sec: number of seconds after which the command is killed.
    timeout_dep: optional path. If the command times out and this path
        exists, it is replaced by a file containing the word "timeout".
    fail_hard: if True, a timeout or a non-zero exit status raises
        RuntimeError; otherwise only a warning is logged.

    Returns the exit status of the command.
    """

    RealTimeLogger.get().info("RUN: {}".format(cmd))

    proc = subprocess.Popen(cmd, shell=True, bufsize=-1,
                            stdout=stdout, stderr=stderr)

    # Set by the timer thread when it really had to kill the command. A list
    # is used because the nested function cannot rebind our locals.
    timed_out = []

    def timeout_fail(proc):
        """
        Runs in the Timer thread once the timeout has elapsed. It must not
        raise: an exception here would never reach the caller of run().
        """
        if proc.returncode is not None:
            # The main thread already collected the process; its pid may
            # have been recycled, so do not signal it.
            return
        try:
            # NOTE(review): with shell=True this only kills the shell itself;
            # commands it launched may keep running. Confirm whether a
            # process-group kill is wanted.
            proc.kill()
        except OSError:
            # The process went away on its own just before we got here.
            return
        timed_out.append(True)

        # we often check to see if some output file exists before running
        # if we timeout, make sure the file exists so rerunning wont timeout again
        # at same place (unless overwrite explicitly desired)
        if timeout_dep is not None and os.path.exists(timeout_dep):
            os.system("rm -rf {}; echo timeout > {}".format(timeout_dep, timeout_dep))

    # based on a comment in http://stackoverflow.com/questions/1191374/using-module-subprocess-with-timeout
    timer = Timer(timeout_sec, timeout_fail, [proc])
    timer.start()
    try:
        # The returned (stdout, stderr) data is deliberately discarded.
        proc.communicate()
    finally:
        timer.cancel()
        # Wait for a handler that is already running, so that timed_out and
        # the marker file are final before we look at them.
        timer.join()

    sts = proc.wait()

    if timed_out:
        message = "Command: {} timed out".format(cmd)
        if fail_hard is True:
            raise RuntimeError(message)
        RealTimeLogger.get().warning(message)

    if sts != 0:
        if fail_hard is True:
            raise RuntimeError("Command: %s exited with non-zero status %i" % (cmd, sts))
        else:
            RealTimeLogger.get().warning("Command: {} exited with non-zero status {}".format(cmd, sts))
    return sts
def compute_vg_variants(job, input_gam, options):
    """
    Run vg filter + vg pileup and vg call on the input GAM, convert the
    augmented graph to VCF with glenn2vcf, and optionally produce the sample
    graph.

    Steps whose output already exists are skipped unless options.overwrite is
    set or an earlier step is being redone. The VCF is only made when the
    graph contains a reference path, named either "ref" or after the contig
    read from the region's BED file. All pipeline commands run with
    fail_hard = True, so a failing command raises RuntimeError.
    """
    input_graph_path = graph_path(input_gam, options)
    out_pileup_path = pileup_path(input_gam, options)
    out_sample_vg_path = sample_vg_path(input_gam, options)
    out_sample_txt_path = sample_txt_path(input_gam, options)
    out_augmented_vg_path = augmented_vg_path(input_gam, options)
    out_vcf_path = out_sample_vg_path.replace(".vg", ".vcf")
    out_vcf_stderr_path = out_sample_vg_path.replace(".vg", ".vcf.stderr")

    do_pu = options.overwrite or not os.path.isfile(out_pileup_path)
    do_call = do_pu or not os.path.isfile(out_augmented_vg_path)
    do_sample = options.sample and (do_pu or not os.path.isfile(out_sample_vg_path))
    do_vcf = do_call or not os.path.isfile(out_vcf_path)

    if do_pu:
        RealTimeLogger.get().info("Computing Variants for {} {}".format(
            input_graph_path, input_gam))
        robust_makedirs(os.path.dirname(out_pileup_path))
        run("vg filter {} {} | vg pileup {} - {} -t {} > {}".format(input_gam,
                                                                    options.filter_opts,
                                                                    input_graph_path,
                                                                    options.pileup_opts,
                                                                    options.vg_cores,
                                                                    out_pileup_path),
            fail_hard = True)

    if do_call:
        # Create the directories of the files this step actually writes.
        # Assumes robust_makedirs tolerates existing directories — TODO confirm.
        robust_makedirs(os.path.dirname(out_augmented_vg_path))
        robust_makedirs(os.path.dirname(out_sample_txt_path))
        run("vg call {} {} {} -l -c {} -t {} > {}".format(input_graph_path, out_pileup_path,
                                                         options.call_opts,
                                                         out_sample_txt_path,
                                                         options.vg_cores,
                                                         out_augmented_vg_path),
            fail_hard = True)

    if do_vcf:
        region = alignment_region_tag(input_gam, options)
        g1kbed_path = os.path.join(options.g1kvcf_path, region.upper() + ".bed")
        with open(g1kbed_path) as f:
            contig, offset = f.readline().split()[0:2]

        # make the vcf
        # can only do this if there is a "ref" path in the vg graph
        ref = None
        res_path = temp_path(options)
        try:
            for ref_name in ["ref", contig]:
                run("scripts/vgHasPath.sh {} {} > {}".format(input_graph_path, ref_name, res_path))
                with open(res_path) as res_file:
                    # Read one character only: an empty result file (the
                    # probe printed nothing) then counts as "no such path"
                    # instead of raising IndexError.
                    has_path = res_file.read(1) == "1"
                if has_path:
                    ref = ref_name
                    break
        finally:
            # Remove the scratch file however the probing ended.
            run("rm {}".format(res_path))

        if ref is not None:
            robust_makedirs(os.path.dirname(out_vcf_path))
            run("glenn2vcf {} {} -o {} -r {} -c {} -s {} -d {} > {} 2> {}".format(out_augmented_vg_path,
                                                                                  out_sample_txt_path,
                                                                                  offset,
                                                                                  ref,
                                                                                  contig,
                                                                                  alignment_sample_tag(input_gam, options),
                                                                                  options.depth,
                                                                                  out_vcf_path,
                                                                                  out_vcf_stderr_path),
                fail_hard = True)
        else:
            RealTimeLogger.get().warning("No reference path found in {}; not making {}".format(
                input_graph_path, out_vcf_path))

    if do_sample:
        robust_makedirs(os.path.dirname(out_sample_vg_path))
        run("vg call {} {} {} -t {} | vg ids -cs - > {}".format(input_graph_path, out_pileup_path,
                                                                options.call_opts,
                                                                options.vg_cores,
                                                                out_sample_vg_path),
            fail_hard = True)
def run_experiment(job, options):
    """
    Toil job that builds the list of experimental conditions and then starts
    one run_conditions child job for every sample GAM that passes the
    region/graph/sample filters and has truth files available.

    Returns the per-region, per-graph, per-sample promises of the child jobs,
    passed through de_defaultdict.
    """

    def unimportant(name, important):
        """True if a restriction list is given and name is not on it."""
        return important is not None and name not in important

    # The store we search for GAMs, and the one we check for truth files.
    gam_store = IOStore.get(options.in_gams)
    truth_store = IOStore.get(options.truth)

    # Best F score by region, graph and sample; each entry is the child
    # job's result, keyed by condition.
    results = collections.defaultdict(lambda: collections.defaultdict(dict))

    # Per-tool option lists whose product forms the grid search.
    filter_grid = {
        "-r": [0.97], # minimum score to keep primary alignment [default=0]
        "-d": [0], # minimum (primary - secondary) score delta to keep secondary alignment
        "-e": [0], # minimum (primary - secondary) score delta to keep primary alignment
        "-a": [""], # use (secondary / primary) for delta comparisons
        "-f": [""], # normalize score based on length
        "-u": [""], # use substitution count instead of score
        "-s": [2], # minimum score to keep secondary alignment [default=0]
        "-o": [0] # filter reads whose alignments begin or end with an insert > N [default=99999]
    }
    pileup_grid = {
        "-w": [40], # size of window to apply -m option (default=0)
        "-m": [2], # ignore bases with > N mismatches within window centered on read (default=1)
        "-q": [10] # ignore bases with PHRED quality < N (default=0)
    }
    call_grid = {
        "-r": [0.0001], # Prior for being heterozygous
        "-b": [1.0], # Max strand bias
        "-f": [0.05], # Min fraction of reads required to support a variant
        "-d": [4] # Min pileup depth
    }
    glenn2vcf_grid = {
        "--depth": [10], # search depth not read depth
        "--min_fraction": [0.15], # Min fraction of average coverage to call at
        "--min_count": [6], # Min total supporting reads for an allele to have it
        "--max_het_bias": [4.2] # Max bias towards one alt of a called het
    }
    vcfeval_grid = {
        "--all-records": [""],
        "--vcf-score-field": ["XAAD"]
    }
    grid = [filter_grid, pileup_grid, call_grid, glenn2vcf_grid, vcfeval_grid]

    # One condition per point of the grid.
    conditions = []
    for point in make_grid(grid):
        conditions.append(ExperimentCondition(*point))

    # Plus one condition that opens everything way up, to try and get
    # maximum recall.
    open_filter = {
        "-r": 0,
        "-d": 0.05,
        "-e": 0.05,
        "-a": "",
        "-f": "",
        "-u": "",
        "-s": 10000,
        "-o": 99999
    }
    open_pileup = {
        "-w": 40,
        "-m": 10,
        "-q": 10
    }
    open_call = {
        "-r": 0.0001,
        "-b": 0.4,
        "-f": 0.25,
        "-d": 11
    }
    open_glenn2vcf = {
        "--depth": 10,
        "--min_fraction": 0, # Min fraction of average coverage to call at
        "--min_count": 1, # Min total supporting reads for an allele to have it
        "--max_het_bias": 20 # Max bias towards one alt of a called het
    }
    open_vcfeval = {
        "--all-records": "",
        "--vcf-score-field": "XAAD"
    }
    conditions.append(ExperimentCondition(open_filter, open_pileup, open_call,
                                          open_glenn2vcf, open_vcfeval))

    RealTimeLogger.get().info("Running {} conditions...".format(len(conditions)))

    for region_dir in gam_store.list_input_directory(""):
        # Every region holds one directory per graph.
        if unimportant(region_dir, options.important_regions):
            continue

        for graph_dir in gam_store.list_input_directory(region_dir):
            # Every graph of a region holds a collection of samples.
            blacklist_keys = ("{}:{}".format(region_dir, graph_dir),
                              region_dir,
                              graph_dir)
            if any(key in options.blacklist for key in blacklist_keys):
                # This region/graph pair is not to be processed.
                RealTimeLogger.get().info("Skipping {} graph {}".format(
                    region_dir, graph_dir))
                continue

            if unimportant(graph_dir, options.important_graphs):
                continue

            for filename in gam_store.list_input_directory("{}/{}".format(
                    region_dir, graph_dir)):
                # Only files named like a GAM are samples.
                if re.match("(.+)\\.gam$", filename) is None:
                    continue

                if unimportant(filename, options.important_samples):
                    continue

                # The full key of this GAM in the store.
                gam_key = "{}/{}/{}".format(region_dir, graph_dir, filename)

                has_truth = (truth_store.exists(truth_compressed_key(gam_key)) and
                             truth_store.exists(truth_index_key(gam_key)))
                if not has_truth:
                    # Nothing to compare against, so don't bother.
                    RealTimeLogger.get().warning("Skipping missing truth for {}".format(gam_key))
                    continue

                # Kick off a pipeline to make the variant calls.
                # TODO: assumes all the extra directories we need to read stuff from are set
                child = job.addChildJobFn(run_conditions, gam_key, conditions, options,
                                          cores=1, memory="2G", disk="10G")

                # File the child's result under region, graph and sample filename.
                results[region_dir][graph_dir][filename] = child.rv()

    # TODO: the plain-dict conversion is there to fix the pickle-ability.
    return de_defaultdict(results)
def run(cmds, stdout = sys.stdout, stderr = sys.stderr, timeout_sec = sys.maxint,
        timeout_dep = None, fail_hard = False):
    """
    Run commands in the given list in the shell, piping each in the next.
    Complain if any of the commands fails or if the whole pipeline times out.

    cmds: list of shell command lines; the standard output of each is fed
        into the standard input of the next.
    stdout: where the last command writes its standard output.
    stderr: where every command writes its standard error.
    timeout_sec: number of seconds after which the whole pipeline is killed.
    timeout_dep: optional path. If the pipeline times out and this path
        exists, it is replaced by a file containing the word "timeout".
    fail_hard: if True, a timeout or a non-zero exit status raises
        RuntimeError; otherwise only a warning is logged.

    Returns 0 if every command succeeded (or there were no commands), and
    otherwise the exit status with the largest magnitude, so that a command
    killed by a signal (negative status) is not reported as success.
    """

    pipeline = " | ".join(cmds)

    # Set by the timer thread when it really had to kill something. A list is
    # used because the nested function cannot rebind our locals.
    timed_out = []

    def timeout_fail(procs):
        """
        Called when the given processes have run for too long. Kills the ones
        that are still around. Runs in the Timer thread, so it must not
        raise: an exception here would never reach the caller of run().
        """
        killed_any = False
        for proc in procs:
            if proc.returncode is not None:
                # Already collected by the main thread; its pid may have been
                # recycled, so it must not be signalled.
                continue
            try:
                proc.kill()
                killed_any = True
            except OSError:
                # Already gone; keep going so the other processes still die.
                continue
        if not killed_any:
            return
        timed_out.append(True)

        # we often check to see if some output file exists before running
        # if we timeout, make sure the file exists so rerunning wont timeout again
        # at same place (unless overwrite explicitly desired)
        if timeout_dep is not None and os.path.exists(timeout_dep):
            os.system("rm -rf {}; echo timeout > {}".format(timeout_dep, timeout_dep))

    RealTimeLogger.get().info("RUN: {}".format(pipeline))

    # We have a list of processes, one per command.
    procs = []
    # We remember the previous process's standard output
    last_stdout = None

    for index, cmd in enumerate(cmds):
        # All but the last command feed their standard output into pipes; the
        # last one just dumps to our standard output.
        is_last = index == len(cmds) - 1
        proc = subprocess.Popen(cmd, shell=True, bufsize=-1,
                                stdin=last_stdout,
                                stdout=stdout if is_last else subprocess.PIPE,
                                stderr=stderr)
        if last_stdout is not None:
            # Drop our copy of the pipe now that the reader has it, so the
            # writer gets SIGPIPE if the reader exits early.
            last_stdout.close()
        last_stdout = proc.stdout
        procs.append(proc)

    # based on a comment in http://stackoverflow.com/questions/1191374/using-module-subprocess-with-timeout
    timer = Timer(timeout_sec, timeout_fail, [procs])
    timer.start()
    try:
        # Collect every return code before reporting anything, so that no
        # process is left uncollected.
        statuses = [proc.wait() for proc in procs]
    finally:
        timer.cancel()
        # Wait for a handler that is already running, so that timed_out and
        # the marker file are final before we look at them.
        timer.join()

    if timed_out:
        message = "Command: {} timed out".format(pipeline)
        if fail_hard is True:
            raise RuntimeError(message)
        RealTimeLogger.get().warning(message)

    for cmd, sts in zip(cmds, statuses):
        if sts == 0:
            continue
        message = "Command: {} in pipeline {} exited with non-zero status {}".format(cmd, pipeline, sts)
        if fail_hard is True:
            raise RuntimeError(message)
        RealTimeLogger.get().warning(message)

    if not statuses:
        # Nothing bad happened because nothing happened
        return 0

    # Same as max() for ordinary exit codes, but a negative "killed by
    # signal" status is not hidden behind a 0 from another command.
    return max(statuses, key=abs)