# Imports assumed by the examples below; the original listing omits them.
# parse_args, run, robust_makedirs, RealTimeLogger, and the wrapped job
# functions (call_variants, compute_all_indexes, ...) come from the
# surrounding project and are not shown here.
import glob
import os
from collections import defaultdict

from toil.job import Job


def main(args):
    
    options = parse_args(args) 
    
    RealTimeLogger.start_master()

    # Drop any input GAM whose path contains one of the comma-separated
    # skip words.
    filtered_gams = []
    skip_words = options.skip.split(",")
    for gam in options.in_gams:
        skip_gam = False
        for word in skip_words:
            if len(word) > 0 and word in gam:
                skip_gam = True
                break
        if not skip_gam:
            filtered_gams.append(gam)
    options.in_gams = filtered_gams

    for gam in options.in_gams:
        if len(gam.split("/")) < 3 or os.path.splitext(gam)[1] != ".gam":
            raise RuntimeError("Input gam paths must be of the form "
                               ".../<alg>/<reads>/<filename>.gam")

    # Make a root job
    root_job = Job.wrapJobFn(call_variants, options,
                             cores=1, memory="2G", disk="2G")
    
    # Run it and see how many jobs fail
    failed_jobs = Job.Runner.startToil(root_job, options)
    
    if failed_jobs > 0:
        raise Exception("{} jobs failed!".format(failed_jobs))
                               
    RealTimeLogger.stop_master()
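
# The parse_args used by every example here is defined elsewhere in the
# script. Below is a minimal sketch of what it plausibly looks like for the
# first example: the option names (in_gams, skip) are inferred from how
# main() uses them, and Job.Runner.addToilOptions attaches Toil's standard
# options (job store, restart, ...) to an argparse parser. Treat this as an
# assumption, not the project's actual parser.
import argparse

def parse_args_sketch(args):
    parser = argparse.ArgumentParser(
        description="call variants from GAM alignments with Toil")
    parser.add_argument("in_gams", nargs="+",
                        help="input alignments, of the form "
                             ".../<alg>/<reads>/<filename>.gam")
    parser.add_argument("--skip", type=str, default="",
                        help="comma-separated words; any GAM whose path "
                             "contains one of them is skipped")
    Job.Runner.addToilOptions(parser)
    return parser.parse_args(args)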
def main(args):
    
    options = parse_args(args) 
    
    RealTimeLogger.start_master()

    for gam in options.in_gams:
        if len(gam.split("/")) < 3 or os.path.splitext(gam)[1] != ".gam":
            raise RuntimeError("Input gam paths must be of the form "
                               ".../<alg>/<reads>/<filename>.gam")
    robust_makedirs(json_out_path(options))
    robust_makedirs(compare_out_path(options))
                    
    # Make a root job
    root_job = Job.wrapJobFn(compute_all_indexes, options,
                             cores=1, memory="2G", disk=0)
    
    # Run it and see how many jobs fail
    if not options.only_summary:
        failed_jobs = Job.Runner.startToil(root_job, options)
    else:
        failed_jobs = 0
    
    if failed_jobs > 0:
        raise Exception("{} jobs failed!".format(failed_jobs))
                               
    RealTimeLogger.stop_master()

    # make some tables from the json comparison output
    #dist_table(options)
    #acc_table(options)
    snp_count_table(options)
    graph_size_table(options)
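
# robust_makedirs is another helper from the surrounding project. A minimal
# sketch, assuming it is the usual "mkdir -p" idiom that tolerates a
# directory that already exists:
import errno

def robust_makedirs_sketch(directory):
    try:
        os.makedirs(directory)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise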
def main(args):
    
    options = parse_args(args) 
    
    RealTimeLogger.start_master()

    for graph in options.graphs:
        if os.path.splitext(graph)[1] != ".vg":
            raise RuntimeError("Input graphs expected to have .vg extension")

    # Make a root job
    root_job = Job.wrapJobFn(compute_kmer_indexes, options,
                             cores=1, memory="2G", disk=0)
    
    # Run it and see how many jobs fail
    if not options.only_summary:
        failed_jobs = Job.Runner.startToil(root_job, options)
    else:
        failed_jobs = 0
    
    if failed_jobs > 0:
        raise Exception("{} jobs failed!".format(failed_jobs))
                               
    RealTimeLogger.stop_master()

    # Do the drawing outside Toil to get around weird import problems
    cluster_comparisons(options)
def main(args):

    options = parse_args(args)

    RealTimeLogger.start_master()

    robust_makedirs(options.out_dir)

    vcfmap = munge_vcf_results(options.comp_dir)

    mergetable = make_trio_vcfs(vcfmap, options)

    do_mendel(mergetable, options)
def main(args):
    
    options = parse_args(args) 
    
    RealTimeLogger.start_master()

    # Make a root job
    root_job = Job.wrapJobFn(run_and_evaluate, options,
                             cores=1, memory="2G", disk="2G")
    
    # Run it and get the return value
    answer = Job.Runner.startToil(root_job, options)

    RealTimeLogger.stop_master()
    
    print("Root return value:")
    print(answer)
def main(args):

    options = parse_args(args)

    RealTimeLogger.start_master()

    if options.classic:
        # expect call_dir/SAMPLE/region.vcf

        for sampledir in glob.glob(os.path.join(options.call_dir, "*")):
            if os.path.isdir(sampledir):
                sample = os.path.basename(sampledir)
                vcfs = []
                outfile = os.path.join(sampledir, "TOTAL.vcf")
                for vcf in glob.glob(os.path.join(sampledir, "*.vcf")):
                    if os.path.basename(vcf) in [
                            "BRCA1.vcf", "BRCA2.vcf", "SMA.vcf", "LRC_KIR.vcf",
                            "MHC.vcf"
                    ]:
                        run("vcfsort {} > {}.sort".format(vcf, vcf),
                            fail_hard=True)
                        run("bgzip -c {}.sort > {}.gz".format(vcf, vcf),
                            fail_hard=True)
                        run("rm -f {}.sort".format(vcf))
                        run("tabix -f -p vcf {}.gz".format(vcf),
                            fail_hard=True)
                        vcfs.append("{}.gz".format(vcf))
                if len(vcfs) > 0:
                    run("vt cat {} > {}".format(" ".join(vcfs), outfile),
                        fail_hard=True)
                    run("vcfsort {} > {}.sort".format(outfile, outfile),
                        fail_hard=True)
                    run("mv {}.sort {}".format(outfile, outfile),
                        fail_hard=True)
                    run("bgzip -c {} > {}.gz".format(outfile, outfile),
                        fail_hard=True)
                    run("tabix -f -p vcf {}.gz".format(outfile),
                        fail_hard=True)

        return 0

    # expect call_dir/<REGION>/<GRAPH>/<SAMPLE>_sample.vcf

    # count up regions
    regions = set()
    for regiondir in glob.glob(os.path.join(options.call_dir, "*")):
        if os.path.isdir(regiondir):
            region = os.path.basename(regiondir)
            # avoid crufty directories (including outputs of previous runs of this script)
            if region in ["brca1", "brca2", "mhc", "lrc_kir", "sma"]:
                regions.add(region)

    print(regions)

    # count up graphs (that are present in every region)
    graphs = set()
    gcount = defaultdict(int)
    for region in regions:
        for graphdir in glob.glob(os.path.join(options.call_dir, region, "*")):
            if os.path.isdir(graphdir):
                graph = os.path.basename(graphdir)
                gcount[graph] = gcount[graph] + 1

    for graph, count in gcount.items():
        if count == len(regions):
            graphs.add(graph)

    print(graphs)

    # count up samples
    samples = set()
    scount = defaultdict(int)
    for region in regions:
        for graph in graphs:
            for vcf in glob.glob(
                    os.path.join(options.call_dir, region, graph,
                                 "*_sample.vcf")):
                sample = os.path.basename(vcf).split("_")[0]
                scount[sample] = scount[sample] + 1

    for sample, count in scount.items():
        samples.add(sample)

    print(samples)

    # make our output directory
    out_dir = os.path.join(options.call_dir, options.name)
    robust_makedirs(out_dir)

    for graph in graphs:
        g_out_dir = os.path.join(out_dir, graph)

        for sample in samples:
            vcf_files = []

            for region in regions:
                vcf = os.path.join(options.call_dir, region, graph,
                                   "{}_sample.vcf".format(sample))
                if os.path.isfile(vcf):
                    vcf_files.append((region, vcf))

            # this sample doesn't span all regions, skip it
            if len(vcf_files) < len(regions):
                print "Skipping Sample {} for Graph {}".format(sample, graph)
                continue

            # output vcf
            merge_vcf_path = os.path.join(out_dir, graph,
                                          "{}_sample.vcf".format(sample))

            # working directory for intermediates / debugging
            work_path = os.path.join(out_dir, graph, "input", sample)
            robust_makedirs(work_path)

            # preprocess all the vcfs and leave in input dir
            input_files = []
            for region, vcf in vcf_files:
                outbase = os.path.join(work_path, region)
                run("vcfsort {} > {}.vcf".format(vcf, outbase), fail_hard=True)
                run("bgzip -f {}.vcf".format(outbase))
                run("tabix -f -p vcf {}.vcf.gz".format(outbase))
                input_files.append("{}.vcf.gz".format(outbase))

            # run the merge
            run("vt cat {} > {}".format(" ".join(input_files), merge_vcf_path),
                fail_hard=True)

            # make an index just in case
            run("vcfsort {} > {}.sort".format(merge_vcf_path, merge_vcf_path),
                fail_hard=True)
            run("mv {}.sort {}".format(merge_vcf_path, merge_vcf_path),
                fail_hard=True)
            run("bgzip -c {} > {}.gz".format(merge_vcf_path, merge_vcf_path),
                fail_hard=True)
            run("tabix -f -p vcf {}.gz".format(merge_vcf_path, merge_vcf_path),
                fail_hard=True)

    return 0
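
# The run helper used throughout the example above is also defined elsewhere.
# A minimal sketch, assuming it executes the command in a subshell (the
# callers rely on > redirection) and, when fail_hard=True, raises on a
# non-zero exit status:
import subprocess

def run_sketch(cmd, fail_hard=False):
    # shell=True because the callers build command strings with redirection
    ret = subprocess.call(cmd, shell=True)
    if fail_hard and ret != 0:
        raise RuntimeError("command failed with status {}: {}".format(ret, cmd))
    return ret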