import collections
import itertools
import os
import re
import signal
import subprocess
import sys
from threading import Timer

# RealTimeLogger, robust_makedirs, IOStore, ExperimentCondition, and the
# various *_path() helpers referenced below come from the surrounding project
# and are not shown in these snippets.

def run(cmd, stdout=sys.stdout, stderr=sys.stderr, timeout_sec=sys.maxint,
        timeout_dep=None, fail_hard=False):
    """ run command in shell and barf if it doesn't work or times out 
    """
    RealTimeLogger.get().info("RUN: {}".format(cmd))

    proc = subprocess.Popen(cmd, shell=True, bufsize=-1, stdout=stdout, stderr=stderr)

    def timeout_fail(proc, cmd):
        os.kill(proc.pid, signal.SIGKILL)
        proc.kill()
        # we often check to see if some output file exists before running
        # if we timeout, make sure the file exists so rerunning wont timeout again
        # at same place (unless overwrite explicitly desired)
        if timeout_dep is not None and os.path.exists(timeout_dep):
            os.system("rm -rf {}; echo timeout > {}".format(timeout_dep, timeout_dep))
        if fail_hard is True:
            raise RuntimeError("Command: {} timed out".format(cmd))
        else:
            RealTimeLogger.get().warning("Command: {} timed out".format(cmd))

    # based on a comment in http://stackoverflow.com/questions/1191374/using-module-subprocess-with-timeout
    timer = Timer(timeout_sec, timeout_fail, [proc, cmd])
    try:
        timer.start()
        stdout, stderr = proc.communicate()
    finally:
        timer.cancel()
    sts = proc.wait()

    if sts != 0:
        if fail_hard is True:
            raise RuntimeError("Command: %s exited with non-zero status %i" % (cmd, sts))
        else:
            RealTimeLogger.get().warning("Command: {} exited with non-zero status {}".format(cmd, sts))
    return sts
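
A usage sketch for the timeout-aware runner above (the command and file names
are made up for illustration):

# Give the command an hour; if it times out and the output file exists, the
# file is replaced with a "timeout" marker so a rerun won't hang at the same
# place.
status = run("vg stats graph.vg > graph.stats",
             timeout_sec=3600,
             timeout_dep="graph.stats",
             fail_hard=False)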
Example #2
def compute_vg_variants(job, input_gam, options):
    """ run vg pileup and vg call on the input
    """
    input_graph_path = graph_path(input_gam, options)
    out_pileup_path = pileup_path(input_gam, options)
    out_sample_vg_path = sample_vg_path(input_gam, options)
    out_augmented_vg_path = augmented_vg_path(input_gam, options)

    do_pu = options.overwrite or not os.path.isfile(out_pileup_path)
    do_call = do_pu or not os.path.isfile(out_sample_vg_path)
    do_aug = do_pu or not os.path.isfile(out_augmented_vg_path)

    if do_pu:
        RealTimeLogger.get().info("Computing Variants for {} {}".format(
            input_graph_path,
            input_gam))
        robust_makedirs(os.path.dirname(out_pileup_path))
        run("vg pileup {} {} -t {} > {}".format(input_graph_path,
                                                input_gam,
                                                options.vg_cores,
                                                out_pileup_path))

    if do_call:
        robust_makedirs(os.path.dirname(out_sample_vg_path))
        run("vg call {} {} -r 0.001 -d 50 -e 150 -s 25 -t {} | vg ids -c - | vg ids -s -  > {}".format(input_graph_path,
                                                                                                       out_pileup_path,
                                                                                                       options.vg_cores,
                                                                                                       out_sample_vg_path))

    if do_aug:
        robust_makedirs(os.path.dirname(out_augmented_vg_path))
        run("vg call {} {} -r 0.001 -d 50 -e 150 -s 25 -t {} -l | vg ids -c - | vg ids -s - > {}".format(input_graph_path,
                                                                                                         out_pileup_path,
                                                                                                         options.vg_cores,
                                                                                                         out_augmented_vg_path))
Example #5
def run(cmd, stdout = sys.stdout, stderr = sys.stderr):
    """ run command in shell and barf if it doesn't work
    (copied from system() in sonlib.bioio
    """
    RealTimeLogger.get().info("RUN: {}".format(cmd))

    sts = subprocess.call(cmd, shell=True, bufsize=-1,
                          stdout=stdout, stderr=stderr)
    if sts != 0:
        raise RuntimeError("Command: %s exited with non-zero status %i" %
                           (cmd, sts))
    return sts
Example #7
def add_optional_followon(job, job_function, *args, **kwargs):
    """
    Given a job, and a Toil job function with pre-execution support, create a
    new follow on of the given job that runs the given job function with the
    given args and kwargs (which are probably all Toil args like "cores" and
    "memory").
    
    The job function has to support "pre-execution": you can call it with None
    for the job argument, and it will return True if it thinks it really needs
    to be scheduled, and False otherwise. This lets us encapsulate the "is the
    job done?" code with the "what does the job do?" code, which might be a good
    design.
    
    Note that kwargs are discarded in pre-execution mode, because Toil doesn't
    offer a simple way to strip the ones it is going to consume, and the job
    functions shouldn't have to deal with extra ones.
    
    If the job needs to be scheduled, returns the newly created Toil job.
    Otherwise, returns the Toil job we were going to add a followon to.
    """
    
    # Run the job with None as a first argument and see what it returns. TODO:
    # filter out Toil kwargs and enable us to pass the remaining ones to the job
    # function.
    need_to_run = job_function(None, *args)
    
    if not isinstance(need_to_run, bool):
        # It's not speaking the protocol
        raise RuntimeError("Job function {} does not have pre-execution support!".format(job_function))
        
    if need_to_run:
        # Actually schedule
        RealTimeLogger.get().debug("Running {}".format(job_function))
        return job.addFollowOnJobFn(job_function, *args, **kwargs)
    else:
        # Don't schedule and give back the job we were adding onto instead.
        RealTimeLogger.get().debug("Skipping {}".format(job_function))
        return job
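
# A sketch of a job function speaking the "pre-execution" protocol described
# above (the function name and output path are hypothetical, not from the
# project): called with job=None, it only reports whether it still needs to
# run; called normally, it does the work.
def example_job_fn(job, output_path, options):
    need_to_run = options.overwrite or not os.path.isfile(output_path)
    if job is None:
        # Pre-execution mode: just say whether we need to be scheduled.
        return need_to_run
    if need_to_run:
        run("some_tool > {}".format(output_path), fail_hard=True)

# Usage: schedules example_job_fn only if its output is missing.
# next_job = add_optional_followon(job, example_job_fn, "out.txt", options)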
def run(cmd, stdout = sys.stdout, stderr = sys.stderr, timeout_sec = sys.maxint,
        timeout_dep = None, fail_hard = False):
    """ run command in shell and barf if it doesn't work or times out 
    """
    RealTimeLogger.get().info("RUN: {}".format(cmd))

    proc = subprocess.Popen(cmd, shell=True, bufsize=-1,
                            stdout=stdout, stderr=stderr)
    
    def timeout_fail(proc, cmd):
        os.kill(proc.pid, signal.SIGKILL)
        proc.kill()
        # we often check to see if some output file exists before running
        # if we timeout, make sure the file exists so rerunning wont timeout again
        # at same place (unless overwrite explicitly desired)
        if timeout_dep is not None and os.path.exists(timeout_dep):
            os.system("rm -rf {}; echo timeout > {}".format(timeout_dep, timeout_dep))
        if fail_hard is True:
            raise RuntimeError("Command: {} timed out".format(cmd))
        else:
            RealTimeLogger.get().warning("Command: {} timed out".format(cmd))

    # based on a comment in http://stackoverflow.com/questions/1191374/using-module-subprocess-with-timeout
    timer = Timer(timeout_sec, timeout_fail, [proc, cmd])
    try:
        timer.start()
        stdout, stderr = proc.communicate()
    finally:
        timer.cancel()
    sts = proc.wait()
    
    if sts != 0:
        if fail_hard is True:
            raise RuntimeError("Command: %s exited with non-zero status %i" %
                               (cmd, sts))
        else:
            RealTimeLogger.get().warning("Command: {} exited with non-zero status {}".format(cmd, sts))
    return sts
def compute_vg_variants(job, input_gam, options):
    """ run vg pileup and vg call on the input
    """
    input_graph_path = graph_path(input_gam, options)
    out_pileup_path = pileup_path(input_gam, options)
    out_sample_vg_path = sample_vg_path(input_gam, options)
    out_sample_txt_path = sample_txt_path(input_gam, options)    
    out_augmented_vg_path = augmented_vg_path(input_gam, options)
    do_pu = options.overwrite or not os.path.isfile(out_pileup_path)
    do_call = do_pu or not os.path.isfile(out_augmented_vg_path)
    do_sample = options.sample and (do_pu or not os.path.isfile(out_sample_vg_path))
    do_vcf = do_call or not os.path.isfile(out_sample_vg_path.replace(".vg", ".vcf"))

    if do_pu:
        RealTimeLogger.get().info("Computing Variants for {} {}".format(
            input_graph_path,
            input_gam))
        robust_makedirs(os.path.dirname(out_pileup_path))
        run("vg filter {} {} | vg pileup {} - {} -t {} > {}".format(input_gam,
                                                                    options.filter_opts,
                                                                    input_graph_path,
                                                                    options.pileup_opts,
                                                                    options.vg_cores,
                                                                    out_pileup_path),
            fail_hard = True)

    if do_call:
        robust_makedirs(os.path.dirname(out_sample_vg_path))
        robust_makedirs(os.path.dirname(out_augmented_vg_path))
        run("vg call {} {} {} -l -c {} -t {} > {}".format(input_graph_path,
                                                          out_pileup_path,
                                                          options.call_opts,
                                                          out_sample_txt_path,
                                                          options.vg_cores,
                                                          out_augmented_vg_path),
            fail_hard = True)

    if do_vcf:
        region = alignment_region_tag(input_gam, options)
        g1kbed_path = os.path.join(options.g1kvcf_path, region.upper() + ".bed")            
        with open(g1kbed_path) as f:
            contig, offset = f.readline().split()[0:2]
            
        # make the vcf
        # can only do this if there is a "ref" path in the vg graph
        ref = None
        res_path = temp_path(options)
        for ref_name in ["ref", contig]:
            run("scripts/vgHasPath.sh {} {} > {}".format(input_graph_path, ref_name, res_path))
            with open(res_path) as res_file:
                if res_file.read()[0] == "1":
                    ref = ref_name
                    break
        run("rm {}".format(res_path))
                
        if ref is not None:
            tasks = []
            run("glenn2vcf {} {} -o {} -r {} -c {} -s {} -d {} > {} 2> {}".format(out_augmented_vg_path,
                                                                                  out_sample_txt_path,
                                                                                  offset,
                                                                                  ref,
                                                                                  contig,
                                                                                  alignment_sample_tag(input_gam, options),
                                                                                  options.depth,
                                                                                  out_sample_vg_path.replace(".vg", ".vcf"),
                                                                                  out_sample_vg_path.replace(".vg", ".vcf.stderr")),
                fail_hard = True)

    if do_sample:
        robust_makedirs(os.path.dirname(out_sample_vg_path))
        run("vg call {} {} {} -t {} | vg ids -cs - > {}".format(input_graph_path,
                                                                out_pileup_path,
                                                                options.call_opts,
                                                                options.vg_cores,
                                                                out_sample_vg_path),
            fail_hard = True)
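
compute_vg_variants is written as a Toil job function; a plausible way to
schedule it from a parent job (hypothetical, with illustrative resource
figures) looks like this:

def call_all_samples(job, input_gams, options):
    # Hypothetical parent job: fan out one compute_vg_variants child per GAM.
    for input_gam in input_gams:
        job.addChildJobFn(compute_vg_variants, input_gam, options,
                          cores=options.vg_cores, memory="4G", disk="2G")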
Example #10
def run_experiment(job, options):
    """
    Toil job to run an experiment on a variety of conditions and compare the
    results.
    """
    
    # Make the IOStore we can search for GAMs
    gam_store = IOStore.get(options.in_gams)
    # And one so we can check if truth files exist
    truth_store = IOStore.get(options.truth)
    
    # This will hold best F score by region, graph, sample, and then condition.
    # We stick in dicts by condition.
    results = collections.defaultdict(lambda: collections.defaultdict(dict))
    
    # Make some experimental conditions with filter, pileup, call,
    # and glenn2vcf options. 
    
    # First define the lists we want the product of for all the parameters
    grid = [{ # vg filter 
            "-r": [0.97], # minimum score to keep primary alignment [default=0]
            "-d": [0], # mininum (primary - secondary) score delta to keep secondary alignment
            "-e": [0], # minimum (primary - secondary) score delta to keep primary alignment
            "-a": [""], # use (secondary / primary) for delta comparisons
            "-f": [""], # normalize score based on length
            "-u": [""], # use substitution count instead of score
            "-s": [2], # minimum score to keep secondary alignment [default=0]
            "-o": [0] #  filter reads whose alignments begin or end with an insert > N [default=99999]
        }, { # vg pileup
            "-w": [40], # size of window to apply -m option (default=0)
            "-m": [2], # ignore bases with > N mismatches within window centered on read (default=1)
            "-q": [10] # ignore bases with PHRED quality < N (default=0)
        }, { # vg call
            "-r": [0.0001], # Prior for being heterozygous
            "-b": [1.0], # Max strand bias
            "-f": [0.05], # Min fraction of reads required to support a variant
            "-d": [4] # Min pileup depth
        }, { # glenn2vcf
            "--depth": [10], # search depth not read depth
            "--min_fraction": [0.15], # Min fraction of average coverage to call at
            "--min_count": [6], # Min total supporting reads for an allele to have it
            "--max_het_bias": [4.2] # Max bias towards one alt of a called het
        }, { # vcfeval
            "--all-records": [""],
            "--vcf-score-field": ["XAAD"]
        }]
        
    # Make the whole grid of conditions for the grid search
    conditions = [ExperimentCondition(*point) for point in make_grid(grid)]
            
        
    # Add a condition that opens everything way up so we can try and
    # get maximum recall.
    conditions.append(ExperimentCondition(
        { # vg filter 
            "-r": 0,
            "-d": 0.05,
            "-e": 0.05,
            "-a": "",
            "-f": "",
            "-u": "",
            "-s": 10000,
            "-o": 99999
        }, { # vg pileup
            "-w": 40,
            "-m": 10,
            "-q": 10
        }, { # vg call
            "-r": 0.0001,
            "-b": 0.4,
            "-f": 0.25,
            "-d": 11
        }, { # glenn2vcf
            "--depth": 10,
            "--min_fraction": 0, # Min fraction of average coverage to call at
            "--min_count": 1, # Min total supporting reads for an allele to have it
            "--max_het_bias": 20 # Max bias towards one alt of a called het
        }, { # vcfeval
            "--all-records": "",
            "--vcf-score-field": "XAAD"
        })
    )
    
    RealTimeLogger.get().info("Running {} conditions...".format(len(conditions)))
    
    for region_dir in gam_store.list_input_directory(""):
        # Within every region we have samples for, look through all the
        # different graphs.
        
        if options.important_regions is not None and region_dir not in options.important_regions:
            # Skip it if it's unimportant
            continue
        
        for graph_dir in gam_store.list_input_directory(region_dir):
            # Within every graph for a region, we have a collection of samples.
            
            if ("{}:{}".format(region_dir, graph_dir) in options.blacklist or
                region_dir in options.blacklist or
                graph_dir in options.blacklist):
                # We don't want to process this region/graph pair.
                RealTimeLogger.get().info("Skipping {} graph {}".format(
                    region_dir, graph_dir))
                continue
                
            if options.important_graphs is not None and graph_dir not in options.important_graphs:
                # Skip it if it's unimportant
                continue
                
            for filename in gam_store.list_input_directory("{}/{}".format(
                region_dir, graph_dir)):
                # Look at each potential sample file
                
                # Is this file a sample?
                match = re.match("(.+)\\.gam$", filename)
                
                if not match:
                    # It's not a sample
                    continue
                    
                if options.important_samples is not None and filename not in options.important_samples:
                    # Skip it if it's unimportant
                    continue
                    
                # Otherwise, compose the full GAM key
                gam_key = "{}/{}/{}".format(region_dir, graph_dir, filename)
                
                if (not truth_store.exists(truth_compressed_key(gam_key)) or
                    not truth_store.exists(truth_index_key(gam_key))):
                    
                    # We don't have a truth for this sample, so don't bother doing it.
                    RealTimeLogger.get().warning("Skipping missing truth for {}".format(gam_key))
                    continue
                
                # Kick off a pipeline to make the variant calls.
                # TODO: assumes all the extra directories we need to read stuff from are set
                exp_job = job.addChildJobFn(run_conditions, gam_key, conditions, options,
                    cores=1, memory="2G", disk="10G")
                    
                # Save the best F score by condition under this region, graph, and sample filename
                results[region_dir][graph_dir][filename] = exp_job.rv()
                
    # Give back the results
    # TODO: we run it through JSON to fix the pickle-ability.
    return de_defaultdict(results)
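
make_grid is used above but not shown in this snippet; a minimal sketch
consistent with that usage (an assumption, not the project's actual
implementation) follows. It takes a list of dicts, one per tool, mapping
option flags to lists of candidate values, and yields every combination as a
tuple of flag-to-value dicts, ready for ExperimentCondition(*point):

def make_grid(grid):
    per_tool_choices = []
    for tool_opts in grid:
        # Enumerate every flag->value assignment for this tool.
        flags = sorted(tool_opts.keys())
        assignments = [dict(zip(flags, values))
                       for values in itertools.product(*[tool_opts[f] for f in flags])]
        per_tool_choices.append(assignments)
    # Cross the per-tool assignment lists to enumerate every condition.
    return itertools.product(*per_tool_choices)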
Example #11
def run(cmds, stdout = sys.stdout, stderr = sys.stderr, timeout_sec = sys.maxint,
        timeout_dep = None, fail_hard = False):
    """
    Run commands in the given list in the shell, piping each in the next. Throw
    an exception if any of the commands fails or if the whole pipeline times
    out.
    
    
    """
    
    
    def timeout_fail(procs, cmds):
        """
        Called when the given processes, launched from the given commands, have
        run for too long. Kills the processes and logs an error.
        
        """
        
        for proc in procs:
            os.kill(proc.pid, signal.SIGKILL)
            proc.kill()
            
        # we often check to see if some output file exists before running
        # if we timeout, make sure the file exists so rerunning wont timeout again
        # at same place (unless overwrite explicitly desired)
        if timeout_dep is not None and os.path.exists(timeout_dep):
            os.system("rm -rf {}; echo timeout > {}".format(timeout_dep, timeout_dep))
        if fail_hard is True:
            raise RuntimeError("Command: {} timed out".format(" | ".join(cmds)))
        else:
            RealTimeLogger.get().warning("Command: {} timed out".format(" | ".join(cmds)))
    
    RealTimeLogger.get().info("RUN: {}".format(" | ".join(cmds)))

    # We have a list of processes, one per command.
    procs = []
    # We remember the previous process's standard output
    last_stdout = None

    for cmd in cmds[:-1]:
        # All but the last command feed their standard output into pipes
        proc = subprocess.Popen(cmd, shell=True, bufsize=-1, stdin=last_stdout,
                                stdout=subprocess.PIPE, stderr=stderr)
        last_stdout = proc.stdout
        procs.append(proc)
        
    for cmd in cmds[-1:]:
        # The last command, if any, just dumps to our standard output
        proc = subprocess.Popen(cmd, shell=True, bufsize=-1, stdin=last_stdout,
                                stdout=stdout, stderr=stderr)
        procs.append(proc)
    
    
    # We collect the return codes
    statuses = []
        
    # based on a comment in http://stackoverflow.com/questions/1191374/using-module-subprocess-with-timeout
    timer = Timer(timeout_sec, timeout_fail, [procs, cmds])
    try:
        timer.start()
        
        for proc, cmd in itertools.izip(procs, cmds):
            sts = proc.wait()
            statuses.append(sts)
        
            if sts != 0:
                message = "Command: {} in pipeline {} exited with non-zero status {}".format(cmd, " | ".join(cmds), sts)
                if fail_hard is True:
                    raise RuntimeError(message)
                else:
                    RealTimeLogger.get().warning(message) 
        
    finally:
        timer.cancel()

    if len(statuses) > 0:
        # Return the max return code (0 if everything worked)
        return max(statuses)
    else:
        # Nothing bad happened because nothing happened
        return 0
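
A usage sketch for the pipelined runner (the commands and file names are
hypothetical): each command's standard output feeds the next one's standard
input, and the maximum exit status comes back.

status = run(["vg view -j graph.vg",
              "jq -c '.node[]'",
              "wc -l > node_count.txt"],
             timeout_sec=600,
             timeout_dep="node_count.txt",
             fail_hard=True)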