def main(args):
    
    options = parse_args(args) 
    
    RealTimeLogger.start_master()

    filtered_gams = []
    skip_words = options.skip.split(",")
    for gam in options.in_gams:
        skip_gam = False
        for word in skip_words:
            if len(word) > 0 and word in gam:
                skip_gam = True
        if not skip_gam:
            filtered_gams.append(gam)
    options.in_gams = filtered_gams

    for gam in options.in_gams:
        if len(gam.split("/")) < 3 or os.path.splitext(gam)[1] != ".gam":
            raise RuntimeError("Input gam paths must be of the form "
                               ".../<alg>/<reads>/<filename>.gam")

    # Make a root job
    root_job = Job.wrapJobFn(call_variants, options,
                             cores=1, memory="2G", disk="2G")
    
    # Run it and see how many jobs fail
    failed_jobs = Job.Runner.startToil(root_job,  options)
    
    if failed_jobs > 0:
        raise Exception("{} jobs failed!".format(failed_jobs))
                               
    RealTimeLogger.stop_master()
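# A typical entry-point sketch for a script whose main() looks like the one
# above; this is an assumption about how the module is invoked, not part of
# the original source.
if __name__ == "__main__":
    import sys
    sys.exit(main(sys.argv[1:]))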
Example no. 2
def main(args):
    
    options = parse_args(args) 
    
    RealTimeLogger.start_master()

    for graph in options.graphs:
        if os.path.splitext(graph)[1] != ".vg":
            raise RuntimeError("Input graphs expected to have .vg extension")

    # Make a root job
    root_job = Job.wrapJobFn(compute_kmer_indexes, options,
        cores=1, memory="2G", disk=0)
    
    # Run it and see how many jobs fail
    if not options.only_summary:
        failed_jobs = Job.Runner.startToil(root_job,  options)
    else:
        failed_jobs = 0
    
    if failed_jobs > 0:
        raise Exception("{} jobs failed!".format(failed_jobs))
                               
    RealTimeLogger.stop_master()

    # Do the drawing outside toil to get around weird import problems
    cluster_comparisons(options)
Example no. 3
def compute_vg_variants(job, input_gam, options):
    """ run vg pileup and vg call on the input
    """
    input_graph_path = graph_path(input_gam, options)
    out_pileup_path = pileup_path(input_gam, options)
    out_sample_vg_path = sample_vg_path(input_gam, options)
    out_augmented_vg_path = augmented_vg_path(input_gam, options)

    do_pu = options.overwrite or not os.path.isfile(out_pileup_path)
    do_call = do_pu or not os.path.isfile(out_sample_vg_path)
    do_aug = do_pu or not os.path.isfile(out_augmented_vg_path)

    if do_pu:
        RealTimeLogger.get().info("Computing Variants for {} {}".format(
            input_graph_path,
            input_gam))
        robust_makedirs(os.path.dirname(out_pileup_path))
        run("vg pileup {} {} -t {} > {}".format(input_graph_path,
                                                input_gam,
                                                options.vg_cores,
                                                out_pileup_path))

    if do_call:
        robust_makedirs(os.path.dirname(out_sample_vg_path))
        run("vg call {} {} -r 0.001 -d 50 -e 150 -s 25 -t {} | vg ids -c - | vg ids -s -  > {}".format(input_graph_path,
                                                                                                       out_pileup_path,
                                                                                                       options.vg_cores,
                                                                                                       out_sample_vg_path))

    if do_aug:
        robust_makedirs(os.path.dirname(out_augmented_vg_path))
        run("vg call {} {} -r 0.001 -d 50 -e 150 -s 25 -t {} -l | vg ids -c - | vg ids -s - > {}".format(input_graph_path,
                                                                                                         out_pileup_path,
                                                                                                         options.vg_cores,
                                                                                                         out_augmented_vg_path))
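# Hedged sketch (not part of the original source): in these Toil pipelines the
# per-GAM work above is typically fanned out from a root job function such as
# the call_variants referenced in main(); the resource figures here are
# illustrative only.
def call_variants_sketch(job, options):
    for input_gam in options.in_gams:
        job.addChildJobFn(compute_vg_variants, input_gam, options,
                          cores=options.vg_cores, memory="4G", disk="2G")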
def run(cmd, stdout=sys.stdout, stderr=sys.stderr, timeout_sec=sys.maxint, timeout_dep=None, fail_hard=False):
    """ run command in shell and barf if it doesn't work or times out 
    """
    RealTimeLogger.get().info("RUN: {}".format(cmd))

    proc = subprocess.Popen(cmd, shell=True, bufsize=-1, stdout=stdout, stderr=stderr)

    def timeout_fail(proc, cmd):
        os.kill(proc.pid, signal.SIGKILL)
        proc.kill()
        # we often check to see if some output file exists before running.
        # if we time out, make sure the file exists so rerunning won't time
        # out again at the same place (unless overwrite is explicitly desired)
        if timeout_dep is not None and os.path.exists(timeout_dep):
            os.system("rm -rf {}; echo timeout > {}".format(timeout_dep, timeout_dep))
        if fail_hard is True:
            raise RuntimeError("Command: {} timed out".format(cmd))
        else:
            RealTimeLogger.get().warning("Command: {} timed out".format(cmd))

    # based on a comment in http://stackoverflow.com/questions/1191374/using-module-subprocess-with-timeout
    timer = Timer(timeout_sec, timeout_fail, [proc, cmd])
    try:
        timer.start()
        stdout, stderr = proc.communicate()
    finally:
        timer.cancel()
    sts = proc.wait()

    if sts != 0:
        if fail_hard is True:
            raise RuntimeError("Command: %s exited with non-zero status %i" % (cmd, sts))
        else:
            RealTimeLogger.get().warning("Command: {} exited with non-zero status {}".format(cmd, sts))
    return sts
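# Minimal usage sketch for run() above; the command, output file, and timeout
# value are made up for illustration. timeout_dep names the file the command
# writes, so a timed-out attempt leaves a "timeout" marker behind.
status = run("sort reads.txt > reads.sorted.txt",
             timeout_sec=600,
             timeout_dep="reads.sorted.txt",
             fail_hard=False)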
Example no. 6
def main(args):
    
    options = parse_args(args) 
    
    RealTimeLogger.start_master()

    for gam in options.in_gams:
        if len(gam.split("/")) < 3 or os.path.splitext(gam)[1] != ".gam":
            raise RuntimeError("Input gam paths must be of the form "
                               ".../<alg>/<reads>/<filename>.gam")
    robust_makedirs(json_out_path(options))
    robust_makedirs(compare_out_path(options))
                    
    # Make a root job
    root_job = Job.wrapJobFn(compute_all_indexes, options,
        cores=1, memory="2G", disk=0)
    
    # Run it and see how many jobs fail
    if not options.only_summary:
        failed_jobs = Job.Runner.startToil(root_job,  options)
    else:
        failed_jobs = 0
    
    if failed_jobs > 0:
        raise Exception("{} jobs failed!".format(failed_jobs))
                               
    RealTimeLogger.stop_master()

    # make some tables from the json comparison output
    #dist_table(options)
    #acc_table(options)
    snp_count_table(options)
    graph_size_table(options)
def timeout_fail(proc, cmd):
    # Fragment of the nested helper used by run() above; timeout_dep and
    # fail_hard come from the enclosing scope.
    os.kill(proc.pid, signal.SIGKILL)
    proc.kill()
    # we often check to see if some output file exists before running.
    # if we time out, make sure the file exists so rerunning won't time out
    # again at the same place (unless overwrite is explicitly desired)
    if timeout_dep is not None and os.path.exists(timeout_dep):
        os.system("rm -rf {}; echo timeout > {}".format(timeout_dep, timeout_dep))
    if fail_hard is True:
        raise RuntimeError("Command: {} timed out".format(cmd))
    else:
        RealTimeLogger.get().warning("Command: {} timed out".format(cmd))
Example no. 9
def run(cmd, stdout = sys.stdout, stderr = sys.stderr):
    """ run command in shell and barf if it doesn't work
    (copied from system() in sonLib.bioio)
    """
    RealTimeLogger.get().info("RUN: {}".format(cmd))

    sts = subprocess.call(cmd, shell=True, bufsize=-1,
                          stdout=stdout, stderr=stderr)
    if sts != 0:
        raise RuntimeError("Command: %s exited with non-zero status %i" %
                           (cmd, sts))
    return sts
def main(args):

    options = parse_args(args)

    RealTimeLogger.start_master()

    robust_makedirs(options.out_dir)

    vcfmap = munge_vcf_results(options.comp_dir)

    mergetable = make_trio_vcfs(vcfmap, options)

    do_mendel(mergetable, options)
Example no. 12
def main(args):
    
    options = parse_args(args) 
    
    RealTimeLogger.start_master()

    # Make a root job
    root_job = Job.wrapJobFn(run_and_evaluate, options,
                             cores=1, memory="2G", disk="2G")
    
    # Run it and get the return value
    answer = Job.Runner.startToil(root_job,  options)

    RealTimeLogger.stop_master()
    
    print("Root return value:")
    print(answer)
Example no. 13
def timeout_fail(procs, cmds):
    """
    Called when the given processes, launched from the given commands, have
    run for too long. Kills the processes and logs an error.
    """
    # Fragment of the nested helper used by the pipelined run(); timeout_dep
    # and fail_hard come from the enclosing scope.
    for proc in procs:
        os.kill(proc.pid, signal.SIGKILL)
        proc.kill()

    # we often check to see if some output file exists before running.
    # if we time out, make sure the file exists so rerunning won't time out
    # again at the same place (unless overwrite is explicitly desired)
    if timeout_dep is not None and os.path.exists(timeout_dep):
        os.system("rm -rf {}; echo timeout > {}".format(timeout_dep, timeout_dep))
    if fail_hard is True:
        raise RuntimeError("Command: {} timed out".format(" | ".join(cmds)))
    else:
        RealTimeLogger.get().warning("Command: {} timed out".format(" | ".join(cmds)))
def run(cmd, stdout = sys.stdout, stderr = sys.stderr, timeout_sec = sys.maxint,
        timeout_dep = None, fail_hard = False):
    """ run command in shell and barf if it doesn't work or times out 
    """
    RealTimeLogger.get().info("RUN: {}".format(cmd))

    proc = subprocess.Popen(cmd, shell=True, bufsize=-1,
                            stdout=stdout, stderr=stderr)
    
    def timeout_fail(proc, cmd):
        os.kill(proc.pid, signal.SIGKILL)
        proc.kill()
        # we often check to see if some output file exists before running.
        # if we time out, make sure the file exists so rerunning won't time
        # out again at the same place (unless overwrite is explicitly desired)
        if timeout_dep is not None and os.path.exists(timeout_dep):
            os.system("rm -rf {}; echo timeout > {}".format(timeout_dep, timeout_dep))
        if fail_hard is True:
            raise RuntimeError("Command: {} timed out".format(cmd))
        else:
            RealTimeLogger.get().warning("Command: {} timed out".format(cmd))

    # based on a comment in http://stackoverflow.com/questions/1191374/using-module-subprocess-with-timeout
    timer = Timer(timeout_sec, timeout_fail, [proc, cmd])
    try:
        timer.start()
        stdout, stderr = proc.communicate()
    finally:
        timer.cancel()
    sts = proc.wait()
    
    if sts != 0:
        if fail_hard is True:
            raise RuntimeError("Command: %s exited with non-zero status %i" %
                               (cmd, sts))
        else:
            RealTimeLogger.get().warning("Command: {} exited with non-zero status {}".format(cmd, sts))
    return sts
Example no. 15
def add_optional_followon(job, job_function, *args, **kwargs):
    """
    Given a job, and a Toil job function with pre-execution support, create a
    new follow on of the given job that runs the given job function with the
    given args and kwargs (which are probably all Toil args like "cores" and
    "memory").
    
    The job function has to support "pre-execution": you can call it with None
    for the job argument, and it will return True if it thinks it really needs
    to be scheduled, and False otherwise. This lets us encapsulate the "is the
    job done?" code with the "what does the job do?" code, which might be a good
    design.
    
    Note that kwargs are discarded in pre-execution mode, because Toil doesn't
    offer a simple way to strip the ones it is going to consume, and the job
    functions shouldn't have to deal with extra ones.
    
    If the job needs to be scheduled, returns the newly created Toil job.
    Otherwise, returns the Toil job we were going to add a followon to.
    """
    
    # Run the job with None as a first argument and see what it returns. TODO:
    # filter out Toil kwargs and enable us to pass the remaining ones to the job
    # function.
    need_to_run = job_function(None, *args)
    
    if need_to_run not in (True, False):
        # It's not speaking the protocol
        raise RuntimeError("Job function {} does not have pre-execution support!".format(job_function))
        
    if need_to_run:
        # Actually schedule
        RealTimeLogger.get().debug("Running {}".format(job_function))
        return job.addFollowOnJobFn(job_function, *args, **kwargs)
    else:
        # Don't schedule and give back the job we were adding onto instead.
        RealTimeLogger.get().debug("Skipping {}".format(job_function))
        return job
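# Hedged sketch of a job function that speaks the "pre-execution" protocol
# described above: called with job=None it only reports whether it still needs
# to run. index_output_path() is a hypothetical helper, not part of the
# original source.
def index_graph_job(job, graph_key, options):
    done = os.path.isfile(index_output_path(graph_key, options))
    if job is None:
        # Pre-execution probe: True means "please schedule me".
        return not done
    # Normal execution path.
    RealTimeLogger.get().info("Indexing {}".format(graph_key))
    # ... actual indexing work would go here ...

# It would then be scheduled (or skipped) via something like:
#   add_optional_followon(prev_job, index_graph_job, graph_key, options, cores=1)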
def main(args):

    options = parse_args(args)

    RealTimeLogger.start_master()

    if options.classic:
        # expect call_dir/SAMPLE/region.vcf

        for sampledir in glob.glob(os.path.join(options.call_dir, "*")):
            if os.path.isdir(sampledir):
                sample = os.path.basename(sampledir)
                vcfs = []
                outfile = os.path.join(sampledir, "TOTAL.vcf")
                for vcf in glob.glob(os.path.join(sampledir, "*.vcf")):
                    if os.path.basename(vcf) in [
                            "BRCA1.vcf", "BRCA2.vcf", "SMA.vcf", "LRC_KIR.vcf",
                            "MHC.vcf"
                    ]:
                        run("vcfsort {} > {}.sort".format(vcf, vcf),
                            fail_hard=True)
                        run("bgzip -c {}.sort > {}.gz".format(vcf, vcf),
                            fail_hard=True)
                        run("rm -f {}.sort".format(vcf))
                        run("tabix -f -p vcf {}.gz".format(vcf),
                            fail_hard=True)
                        vcfs.append("{}.gz".format(vcf))
                if len(vcfs) > 0:
                    run("vt cat {} > {}".format(" ".join(vcfs), outfile),
                        fail_hard=True)
                    run("vcfsort {} > {}.sort".format(outfile, outfile),
                        fail_hard=True)
                    run("mv {}.sort {}".format(outfile, outfile),
                        fail_hard=True)
                    run("bgzip -c {} > {}.gz".format(outfile, outfile),
                        fail_hard=True)
                    run("tabix -f -p vcf {}.gz".format(outfile),
                        fail_hard=True)

        return 0

    # expect call_dir/<REGION>/<GRAPH>/<SAMPLE>_sample.vcf

    # count up regions
    regions = set()
    for regiondir in glob.glob(os.path.join(options.call_dir, "*")):
        if os.path.isdir(regiondir):
            region = os.path.basename(regiondir)
            # avoid crufty directories (including outputs of previous runs of this script)
            if region in ["brca1", "brca2", "mhc", "lrc_kir", "sma"]:
                regions.add(region)

    print regions

    # count up graphs (that are present in every region)
    graphs = set()
    gcount = defaultdict(int)
    for region in regions:
        for graphdir in glob.glob(os.path.join(options.call_dir, region, "*")):
            if os.path.isdir(graphdir):
                graph = os.path.basename(graphdir)
                gcount[graph] = gcount[graph] + 1

    for graph, count in gcount.items():
        if count == len(regions):
            graphs.add(graph)

    print graphs

    # count up samples
    samples = set()
    scount = defaultdict(int)
    for region in regions:
        for graph in graphs:
            for vcf in glob.glob(
                    os.path.join(options.call_dir, region, graph,
                                 "*_sample.vcf")):
                sample = os.path.basename(vcf).split("_")[0]
                scount[sample] = scount[sample] + 1

    for sample, count in scount.items():
        samples.add(sample)

    print samples

    # make our output directory
    out_dir = os.path.join(options.call_dir, options.name)
    robust_makedirs(out_dir)

    for graph in graphs:
        g_out_dir = os.path.join(out_dir, graph)

        for sample in samples:
            vcf_files = []

            for region in regions:
                vcf = os.path.join(options.call_dir, region, graph,
                                   "{}_sample.vcf".format(sample))
                if os.path.isfile(vcf):
                    vcf_files.append((region, vcf))

            # this sample doesn't span all regions, skip it
            if len(vcf_files) < len(regions):
                print "Skipping Sample {} for Graph {}".format(sample, graph)
                continue

            # output vcf
            merge_vcf_path = os.path.join(out_dir, graph,
                                          "{}_sample.vcf".format(sample))

            # working directory for intermediates / debugging
            work_path = os.path.join(out_dir, graph, "input", sample)
            robust_makedirs(work_path)

            # preprocess all the vcfs and leave in input dir
            input_files = []
            for region, vcf in vcf_files:
                outbase = os.path.join(work_path, region)
                run("vcfsort {} > {}.vcf".format(vcf, outbase), fail_hard=True)
                run("bgzip -f {}.vcf".format(outbase))
                run("tabix -f -p vcf {}.vcf.gz".format(outbase))
                input_files.append("{}.vcf.gz".format(outbase))

            # run the merge
            run("vt cat {} > {}".format(" ".join(input_files), merge_vcf_path),
                fail_hard=True)

            # make an index just in case
            run("vcfsort {} > {}.sort".format(merge_vcf_path, merge_vcf_path),
                fail_hard=True)
            run("mv {}.sort {}".format(merge_vcf_path, merge_vcf_path),
                fail_hard=True)
            run("bgzip -c {} > {}.gz".format(merge_vcf_path, merge_vcf_path),
                fail_hard=True)
            run("tabix -f -p vcf {}.gz".format(merge_vcf_path, merge_vcf_path),
                fail_hard=True)

    return 0
def compute_vg_variants(job, input_gam, options):
    """ run vg pileup and vg call on the input
    """
    input_graph_path = graph_path(input_gam, options)
    out_pileup_path = pileup_path(input_gam, options)
    out_sample_vg_path = sample_vg_path(input_gam, options)
    out_sample_txt_path = sample_txt_path(input_gam, options)    
    out_augmented_vg_path = augmented_vg_path(input_gam, options)
    do_pu = options.overwrite or not os.path.isfile(out_pileup_path)
    do_call = do_pu or not os.path.isfile(out_augmented_vg_path)
    do_sample = options.sample and (do_pu or not os.path.isfile(out_sample_vg_path))
    do_vcf = do_call or not os.path.isfile(out_sample_vg_path.replace(".vg", ".vcf"))

    if do_pu:
        RealTimeLogger.get().info("Computing Variants for {} {}".format(
            input_graph_path,
            input_gam))
        robust_makedirs(os.path.dirname(out_pileup_path))
        run("vg filter {} {} | vg pileup {} - {} -t {} > {}".format(input_gam,
                                                                    options.filter_opts,
                                                                    input_graph_path,
                                                                    options.pileup_opts,
                                                                    options.vg_cores,
                                                                    out_pileup_path),
            fail_hard = True)

    if do_call:
        robust_makedirs(os.path.dirname(out_sample_vg_path))
        run("vg call {} {} {} -l -c {} -t {} > {}".format(input_graph_path,
                                                          out_pileup_path,
                                                          options.call_opts,
                                                          out_sample_txt_path,
                                                          options.vg_cores,
                                                          out_augmented_vg_path),
            fail_hard = True)

    if do_vcf:
        region = alignment_region_tag(input_gam, options)
        g1kbed_path = os.path.join(options.g1kvcf_path, region.upper() + ".bed")            
        with open(g1kbed_path) as f:
            contig, offset = f.readline().split()[0:2]
            
        # make the vcf
        # can only do this if there is a "ref" path in the vg graph
        ref = None
        res_path = temp_path(options)
        for ref_name in ["ref", contig]:
            run("scripts/vgHasPath.sh {} {} > {}".format(input_graph_path, ref_name, res_path))
            with open(res_path) as res_file:
                if res_file.read()[0] == "1":
                    ref = ref_name
                    break
        run("rm {}".format(res_path))
                
        if ref is not None:
            tasks = []
            run("glenn2vcf {} {} -o {} -r {} -c {} -s {} -d {} > {} 2> {}".format(out_augmented_vg_path,
                                                                                  out_sample_txt_path,
                                                                                  offset,
                                                                                  ref,
                                                                                  contig,
                                                                                  alignment_sample_tag(input_gam, options),
                                                                                  options.depth,
                                                                                  out_sample_vg_path.replace(".vg", ".vcf"),
                                                                                  out_sample_vg_path.replace(".vg", ".vcf.stderr")),
                fail_hard = True)

    if do_sample:
        robust_makedirs(os.path.dirname(out_augmented_vg_path))
        run("vg call {} {} {} -t {} | vg ids -cs - > {}".format(input_graph_path,
                                                                out_pileup_path,
                                                                options.call_opts,
                                                                options.vg_cores,
                                                                out_sample_vg_path),
            fail_hard = True)
Example no. 19
def run(cmds, stdout = sys.stdout, stderr = sys.stderr, timeout_sec = sys.maxint,
        timeout_dep = None, fail_hard = False):
    """
    Run commands in the given list in the shell, piping each in the next. Throw
    an exception if any of the commands fails or if the whole pipeline times
    out.
    
    
    """
    
    
    def timeout_fail(procs, cmds):
        """
        Called when the given processes, launched from the given commands, have
        run for too long. Kills the processes and logs an error.
        
        """
        
        for proc in procs:
            os.kill(proc.pid, signal.SIGKILL)
            proc.kill()
            
        # we often check to see if some output file exists before running.
        # if we time out, make sure the file exists so rerunning won't time
        # out again at the same place (unless overwrite is explicitly desired)
        if timeout_dep is not None and os.path.exists(timeout_dep):
            os.system("rm -rf {}; echo timeout > {}".format(timeout_dep, timeout_dep))
        if fail_hard is True:
            raise RuntimeError("Command: {} timed out".format(" | ".join(cmds)))
        else:
            RealTimeLogger.get().warning("Command: {} timed out".format(" | ".join(cmds)))
    
    RealTimeLogger.get().info("RUN: {}".format(" | ".join(cmds)))

    # We have a list of processes, one per command.
    procs = []
    # We remember the previous process's standard output
    last_stdout = None

    for cmd in cmds[:-1]:
        # All but the last command feed their standard output into pipes
        proc = subprocess.Popen(cmd, shell=True, bufsize=-1, stdin=last_stdout,
                                stdout=subprocess.PIPE, stderr=stderr)
        last_stdout = proc.stdout
        procs.append(proc)
        
    for cmd in cmds[-1:]:
        # The last command, if any, just dumps to our standard output
        proc = subprocess.Popen(cmd, shell=True, bufsize=-1, stdin=last_stdout,
                                stdout=stdout, stderr=stderr)
        procs.append(proc)
    
    
    # We collect the return codes
    statuses = []
        
    # based on a comment in http://stackoverflow.com/questions/1191374/using-module-subprocess-with-timeout
    timer = Timer(timeout_sec, timeout_fail, [procs, cmds])
    try:
        timer.start()
        
        for proc, cmd in itertools.izip(procs, cmds):
            sts = proc.wait()
            statuses.append(sts)
        
            if sts != 0:
                message = "Command: {} in pipeline {} exited with non-zero status {}".format(cmd, " | ".join(cmds), sts)
                if fail_hard is True:
                    raise RuntimeError(message)
                else:
                    RealTimeLogger.get().warning(message) 
        
    finally:
        timer.cancel()

    if len(statuses) > 0:
        # Return the max return code (0 if everything worked)
        return max(statuses)
    else:
        # Nothing bad happened because nothing happened
        return 0
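# Hedged usage sketch for the pipelined run() above: each string is one stage,
# and stages are chained stdin-to-stdout in order; the commands and timeout
# here are illustrative only.
status = run(["cat graphs.txt", "sort", "uniq -c"],
             timeout_sec=300,
             fail_hard=False)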
Example no. 20
def run_experiment(job, options):
    """
    Toil job to run an experiment on a variety of conditions and compare the
    results.
    """
    
    # Make the IOStore we can search for GAMs
    gam_store = IOStore.get(options.in_gams)
    # And one so we can check if truth files exist
    truth_store = IOStore.get(options.truth)
    
    # This will hold best F score by region, graph, sample, and then condition.
    # We stick in dicts by condition.
    results = collections.defaultdict(lambda: collections.defaultdict(dict))
    
    # Make some experimental conditions with filter, pileup, call,
    # and glenn2vcf options. 
    
    # First define the lists we want the product of for all the parameters
    grid = [{ # vg filter 
            "-r": [0.97], # minimum score to keep primary alignment [default=0]
            "-d": [0], # mininum (primary - secondary) score delta to keep secondary alignment
            "-e": [0], # minimum (primary - secondary) score delta to keep primary alignment
            "-a": [""], # use (secondary / primary) for delta comparisons
            "-f": [""], # normalize score based on length
            "-u": [""], # use substitution count instead of score
            "-s": [2], # minimum score to keep secondary alignment [default=0]
            "-o": [0] #  filter reads whose alignments begin or end with an insert > N [default=99999]
        }, { # vg pileup
            "-w": [40], # size of window to apply -m option (default=0)
            "-m": [2], # ignore bases with > N mismatches within window centered on read (default=1)
            "-q": [10] # ignore bases with PHRED quality < N (default=0)
        }, { # vg call
            "-r": [0.0001], # Prior for being heterozygous
            "-b": [1.0], # Max strand bias
            "-f": [0.05], # Min fraction of reads required to support a variant
            "-d": [4] # Min pileup depth
        }, { # glenn2vcf
            "--depth": [10], # search depth not read depth
            "--min_fraction": [0.15], # Min fraction of average coverage to call at
            "--min_count": [6], # Min total supporting reads for an allele to have it
            "--max_het_bias": [4.2] # Max bias towards one alt of a called het
        }, { # vcfeval
            "--all-records": [""],
            "--vcf-score-field": ["XAAD"]
        }]
        
    # Make the whole grid of conditions for the grid search (a hedged sketch
    # of make_grid follows this function)
    conditions = [ExperimentCondition(*point) for point in make_grid(grid)]
            
        
    # Add a condition that opens everything way up so we can try and
    # get maximum recall.
    conditions.append(ExperimentCondition(
        { # vg filter 
            "-r": 0,
            "-d": 0.05,
            "-e": 0.05,
            "-a": "",
            "-f": "",
            "-u": "",
            "-s": 10000,
            "-o": 99999
        }, { # vg pileup
            "-w": 40,
            "-m": 10,
            "-q": 10
        }, { # vg call
            "-r": 0.0001,
            "-b": 0.4,
            "-f": 0.25,
            "-d": 11
        }, { # glenn2vcf
            "--depth": 10,
            "--min_fraction": 0, # Min fraction of average coverage to call at
            "--min_count": 1, # Min total supporting reads for an allele to have it
            "--max_het_bias": 20 # Max bias towards one alt of a called het
        }, { # vcfeval
            "--all-records": "",
            "--vcf-score-field": "XAAD"
        })
    )
    
    RealTimeLogger.get().info("Running {} conditions...".format(len(conditions)))
    
    for region_dir in gam_store.list_input_directory(""):
        # Within every region we have samples for, look through all the
        # different graphs.
        
        if options.important_regions is not None and region_dir not in options.important_regions:
            # Skip it if it's unimportant
            continue
        
        for graph_dir in gam_store.list_input_directory(region_dir):
            # Within every graph for a region, we have a collection of samples.
            
            if ("{}:{}".format(region_dir, graph_dir) in options.blacklist or
                region_dir in options.blacklist or
                graph_dir in options.blacklist):
                # We don't want to process this region/graph pair.
                RealTimeLogger.get().info("Skipping {} graph {}".format(
                    region_dir, graph_dir))
                continue
                
            if options.important_graphs is not None and graph_dir not in options.important_graphs:
                # Skip it if it's unimportant
                continue
                
            for filename in gam_store.list_input_directory("{}/{}".format(
                region_dir, graph_dir)):
                # Look at each potential sample file
                
                # Is this file a sample?
                match = re.match("(.+)\\.gam$", filename)
                
                if not match:
                    # It's not a sample
                    continue
                    
                if options.important_samples is not None and filename not in options.important_samples:
                    # Skip it if it's unimportant
                    continue
                    
                # Otherwise, compose the full GAM key
                gam_key = "{}/{}/{}".format(region_dir, graph_dir, filename)
                
                if (not truth_store.exists(truth_compressed_key(gam_key)) or
                    not truth_store.exists(truth_index_key(gam_key))):
                    
                    # We don't have a truth for this sample, so don't bother doing it.
                    RealTimeLogger.get().warning("Skipping missing truth for {}".format(gam_key))
                    continue
                
                # Kick off a pipeline to make the variant calls.
                # TODO: assumes all the extra directories we need to read stuff from are set
                exp_job = job.addChildJobFn(run_conditions, gam_key, conditions, options,
                    cores=1, memory="2G", disk="10G")
                    
                # Save the best F score by condition under this region, graph, and sample filename
                results[region_dir][graph_dir][filename] = exp_job.rv()
                
    # Give back the results
    # TODO: we run it through JSON to fix the pickle-ability.
    return de_defaultdict(results)
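# Hedged sketch (not from the original source) of what make_grid, used in
# run_experiment above, might look like: each tool's {flag: [values...]} dict
# is expanded into all single-valued dicts, and the Cartesian product across
# tools yields one tuple of per-tool dicts per grid point, suitable for
# ExperimentCondition(*point). The real implementation may differ.
import itertools

def make_grid_sketch(grid):
    per_tool_choices = []
    for tool_opts in grid:
        flags = sorted(tool_opts.keys())
        value_lists = [tool_opts[flag] for flag in flags]
        # Every combination of single values for this one tool.
        tool_points = [dict(zip(flags, values))
                       for values in itertools.product(*value_lists)]
        per_tool_choices.append(tool_points)
    # One grid point = one option dict per tool.
    return list(itertools.product(*per_tool_choices))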