def plot_heatmap(tsv, options):
    """Draw linear and log-scale heatmaps for the matrix stored in a TSV.

    Row and column names that appear in ``name_map()`` are swapped for their
    mapped names, the relabelled table is written next to the input with a
    ``_rename`` suffix, and ``scripts/plotHeatmap.py`` is run on it twice:
    once as-is and once with ``--log_scale``.  The PDFs are written to
    ``<options.comp_dir>/heatmaps``.

    Args:
        tsv: path of the input table.  A path containing ``_rename`` is
            taken to be the output of an earlier call and is skipped.
        options: object with ``comp_dir`` and ``skip`` attributes; ``skip``
            is forwarded as ``--skip`` unless it is None.
    """
    out_dir = os.path.join(options.comp_dir, "heatmaps")
    robust_makedirs(out_dir)

    # Bail out before parsing: nothing below applies to our own output.
    if "_rename" in tsv:
        return

    mat, col_names, row_names, row_label = read_tsv(tsv)
    names = name_map()
    col_names = [names[c] if c in names else c for c in col_names]
    row_names = [names[r] if r in names else r for r in row_names]

    # Swap only the real extension.  str.replace(".tsv", ...) would also
    # rewrite a ".tsv" inside a directory name, and would leave a path
    # without ".tsv" unchanged so that the input itself got overwritten.
    tsv_root, tsv_ext = os.path.splitext(tsv)
    fix_tsv = tsv_root + "_rename" + tsv_ext
    write_tsv(fix_tsv, mat, col_names, row_names, row_label)

    hm_root = os.path.join(out_dir,
                           os.path.splitext(os.path.basename(tsv))[0])
    ph_opts = "--skip {}".format(options.skip) if options.skip is not None else ""
    plots = ((hm_root + ".pdf", ""),
             (hm_root + "_log.pdf", " --log_scale"))
    for out_hm, scale_opt in plots:
        cmd = "scripts/plotHeatmap.py {} {} {}{}".format(
            fix_tsv, out_hm, ph_opts, scale_opt)
        # print() with a single argument prints the same on Python 2 and 3
        print(cmd)
        os.system(cmd)
def temp_path(options, prefix="tmp", ext="", length=6):
    """ build a randomly named path inside <options.out_dir>/temp

    The name is prefix + `length` random characters drawn from A-Z and 0-9
    + ext.  The temp directory is passed to robust_makedirs; the file
    itself is not created here.
    """
    alphabet = string.ascii_uppercase + string.digits
    scratch_dir = os.path.join(options.out_dir, "temp")
    robust_makedirs(scratch_dir)
    tag = ""
    for _ in range(length):
        tag += random.choice(alphabet)
    return os.path.join(scratch_dir, prefix + tag + ext)
def plot_kmer_comp(tsv_path, options):
    """ turn a kmer compare table into a jaccard boxplot and an accuracy
    scatter plot

    The header row is dropped.  Fields 1 and 2 feed scripts/boxplot.py;
    fields 1, 4 and 3 (in that order) feed scripts/scatter.py, whose axes
    are labelled Recall and Precision.  The intermediate tables and the
    PNGs are written to <options.comp_dir>/comp_plots under the input's
    base name.
    """
    plot_dir = os.path.join(options.comp_dir, "comp_plots")
    robust_makedirs(plot_dir)
    stem = os.path.basename(os.path.splitext(tsv_path)[0])
    prefix = os.path.join(plot_dir, stem)
    # whatever follows the last "-" in the file name is used in the titles
    region = stem.split("-")[-1].upper()
    extra_args = " ".join(PLOT_PARAMS)
    extract_cmd = "{} {} > {}"

    # jaccard boxplot
    jac_tsv = prefix + "_jac.tsv"
    jac_fields = '''awk '{if (NR!=1) print $1 "\t" $2}' '''
    run(extract_cmd.format(jac_fields, tsv_path, jac_tsv))
    jac_png = prefix + "_jac.png"
    boxplot_cmd = ('scripts/boxplot.py {} --save {} --title "{} KMER Set Jaccard" '
                   '--x_label "Graph" --y_label "Jaccard Index" --x_sideways {}')
    run(boxplot_cmd.format(jac_tsv, jac_png, region, extra_args))

    # precision recall scatter plot
    acc_tsv = prefix + "_acc.tsv"
    acc_fields = '''awk '{if (NR!=1) print $1 "\t" $4 "\t" $3}' '''
    run(extract_cmd.format(acc_fields, tsv_path, acc_tsv))
    acc_png = prefix + "_acc.png"
    scatter_cmd = ('scripts/scatter.py {} --save {} --title "{} KMER Set Accuracy" '
                   '--x_label "Recall" --y_label "Precision" '
                   '--width 12 --height 9 --lines {}')
    run(scatter_cmd.format(acc_tsv, acc_png, region, extra_args))
def plot_heatmap(tsv, options):
    """Plot a TSV matrix as two heatmap PDFs (linear and log scale).

    Names found in ``name_map()`` replace the matching row and column
    labels.  The relabelled table is saved beside the input as
    ``<name>_rename<ext>`` and handed to ``scripts/plotHeatmap.py``, first
    without and then with ``--log_scale``.  The PDFs are placed in
    ``<options.comp_dir>/heatmaps``.

    Args:
        tsv: input table path; any path containing ``_rename`` is assumed
            to be a previously generated table and is ignored.
        options: object exposing ``comp_dir`` and ``skip``; a non-None
            ``skip`` is passed on as ``--skip <value>``.
    """
    out_dir = os.path.join(options.comp_dir, "heatmaps")
    robust_makedirs(out_dir)

    # Checked before reading so that generated tables cost nothing to skip.
    if "_rename" in tsv:
        return

    mat, col_names, row_names, row_label = read_tsv(tsv)
    names = name_map()

    def display_name(label):
        # labels without a mapping are kept unchanged
        return names[label] if label in names else label

    col_names = [display_name(label) for label in col_names]
    row_names = [display_name(label) for label in row_names]

    # Derive names from the actual extension: replacing ".tsv" anywhere in
    # the string also hits directory names, and is a no-op for other
    # extensions, which made the renamed table overwrite its own input.
    root, ext = os.path.splitext(tsv)
    fix_tsv = root + "_rename" + ext
    write_tsv(fix_tsv, mat, col_names, row_names, row_label)

    pdf_root = os.path.join(out_dir,
                            os.path.splitext(os.path.basename(tsv))[0])
    ph_opts = ""
    if options.skip is not None:
        ph_opts = "--skip {}".format(options.skip)

    # single-argument print() prints identically on Python 2 and 3
    cmd = "scripts/plotHeatmap.py {} {} {}".format(
        fix_tsv, pdf_root + ".pdf", ph_opts)
    print(cmd)
    os.system(cmd)

    cmd = "scripts/plotHeatmap.py {} {} {} --log_scale".format(
        fix_tsv, pdf_root + "_log.pdf", ph_opts)
    print(cmd)
    os.system(cmd)
def make_best_calls(best_table, options):
    """Build a "<out_dir>.best" tree of soft links to the best-F1 call sets.

    The call directory of an entry is derived from its comparison table
    path: everything from "/comp_tables" onwards is cut off and the
    extension of what remains is dropped.  This relies on the call
    directories being obtainable from the comparison directory that way.

    Args:
        best_table: nested mapping region -> graph -> sequence whose first
            item is the path of the comparison tsv.
        options: object with an ``out_dir`` attribute.
    """
    # rstrip, not strip: only trailing slashes may go.  strip("/") also ate
    # the leading slash and turned an absolute out_dir into a relative one.
    best_dir = options.out_dir.rstrip("/") + ".best"
    for region in best_table:
        for graph in best_table[region]:
            comp_tsv_path = best_table[region][graph][0]
            comp_tsv_path = comp_tsv_path[:comp_tsv_path.find("/comp_tables")]
            call_base_path = os.path.splitext(comp_tsv_path)[0]
            call_path = os.path.join(call_base_path, region, graph)
            region_dir = os.path.join(best_dir, region)
            graph_dir = os.path.join(region_dir, graph)

            if graph in ["gatk3", "platypus"]:
                # gatk3 and platypus have no call directory: make a real
                # directory that will only hold the vcf links below
                robust_makedirs(graph_dir)
            else:
                robust_makedirs(region_dir)
                os.system("ln -fs {} {}".format(os.path.abspath(call_path),
                                                os.path.abspath(region_dir)))

            # link in the preprocessed vcf from the comp dir to the same directory
            comp_path = call_base_path + ".comp"
            vcf_dir = os.path.join(comp_path, "preprocessed_vcfs", region)
            for pvcf in glob.glob(os.path.join(vcf_dir, "*_{}.vcf".format(graph))):
                link_name = os.path.basename(pvcf).replace(graph, "sample_preprocessed")
                os.system("ln -fs {} {}".format(
                    os.path.abspath(pvcf),
                    os.path.abspath(os.path.join(graph_dir, link_name))))

            # link in the truth while we're at it
            for pvcf in glob.glob(os.path.join(vcf_dir, "*_platvcf*.vcf")):
                os.system("ln -fs {} {}".format(os.path.abspath(pvcf),
                                                os.path.abspath(graph_dir)))
Esempio n. 6
0
def main(args):
    """Validate the input gams, run the Toil job tree and write summary tables.

    Unless ``options.only_summary`` is set, ``compute_all_indexes`` is
    started as the root Toil job.  The SNP count and graph size tables are
    written afterwards.

    Raises:
        RuntimeError: an input path is not of the form
            ".../<alg>/<reads>/<filename>.gam".
        Exception: Toil reported one or more failed jobs.
    """
    options = parse_args(args)

    RealTimeLogger.start_master()
    # Stop the logging master in a finally clause: previously it was left
    # running whenever validation or a failed job raised.
    try:
        for gam in options.in_gams:
            if len(gam.split("/")) < 3 or os.path.splitext(gam)[1] != ".gam":
                raise RuntimeError("Input gam paths must be of the form "
                                   ".../<alg>/<reads>/<filename>.gam")
        robust_makedirs(json_out_path(options))
        robust_makedirs(compare_out_path(options))

        # Make a root job
        root_job = Job.wrapJobFn(compute_all_indexes, options,
                                 cores=1, memory="2G", disk=0)

        # Run it and see how many jobs fail
        if not options.only_summary:
            failed_jobs = Job.Runner.startToil(root_job, options)
        else:
            failed_jobs = 0

        if failed_jobs > 0:
            raise Exception("{} jobs failed!".format(failed_jobs))
    finally:
        RealTimeLogger.stop_master()

    # make some tables from the json comparison output
    #dist_table(options)
    #acc_table(options)
    snp_count_table(options)
    graph_size_table(options)
def plot_kmer_comp(tsv_path, options):
    """Plot a kmer compare table as a Jaccard boxplot and an accuracy scatter.

    The header row of the table is dropped.  Fields 1 and 2 go to
    ``scripts/boxplot.py``; fields 1, 4 and 3 (in that order) go to
    ``scripts/scatter.py``, whose axes are labelled Recall and Precision.
    Intermediate tables and PNGs are written to
    ``<options.comp_dir>/comp_plots`` under the input's base name.

    Args:
        tsv_path: path of the comparison table.  Its base name must contain
            at least one "-": the second-to-last dash-separated field is
            used, upper-cased, as the region in the plot titles.
        options: object with a ``comp_dir`` attribute.
    """
    out_dir = os.path.join(options.comp_dir, "comp_plots")
    robust_makedirs(out_dir)
    out_name = os.path.basename(os.path.splitext(tsv_path)[0])
    out_base_path = os.path.join(out_dir, out_name)
    # (the last field was also parsed into an unused `sample` local; removed)
    region = out_name.split("-")[-2].upper()

    params = " ".join(PLOT_PARAMS)
    # jaccard boxplot
    jac_tsv = out_base_path + "_jac.tsv"
    awkstr = '''awk '{if (NR!=1) print $1 "\t" $2}' '''
    run("{} {} > {}".format(awkstr, tsv_path, jac_tsv))
    jac_png = out_base_path + "_jac.png"
    run("scripts/boxplot.py {} --save {} --title \"{} KMER Set Jaccard\" --x_label \"Graph\" --y_label \"Jaccard Index\" --x_sideways {}".format(jac_tsv, jac_png, region, params))

    # precision recall scatter plot
    acc_tsv = out_base_path + "_acc.tsv"
    awkstr = '''awk '{if (NR!=1) print $1 "\t" $4 "\t" $3}' '''
    run("{} {} > {}".format(awkstr, tsv_path, acc_tsv))
    acc_png = out_base_path + "_acc.png"
    run("scripts/scatter.py {} --save {} --title \"{} KMER Set Accuracy\" --x_label \"Recall\" --y_label \"Precision\" --width 12 --height 9 --lines {}".format(acc_tsv, acc_png, region, params))
def temp_path(options, prefix="tmp", ext="", length=6):
    """ return a fresh random path under <options.out_dir>/temp

    `length` characters are picked from upper-case letters and digits and
    placed between prefix and ext.  Only the directory is handed to
    robust_makedirs; nothing is written at the returned path.
    """
    charset = string.ascii_uppercase + string.digits
    tempdir = os.path.join(options.out_dir, "temp")
    robust_makedirs(tempdir)
    picks = [random.choice(charset) for _ in range(length)]
    return os.path.join(tempdir, prefix + "".join(picks) + ext)
 def out_base_path(tag, label, extension):
     """ build an output path for one plot or table and set up its directory

     Files with a ".tsv" extension go to the "tsv" subdirectory of out_dir,
     everything else to a subdirectory named after tag.  The file name is
     out_name without its last dash-separated field, followed by
     "-<sample>-<tag>-<region>_<label><extension>".  out_dir, out_name,
     sample and region are taken from the enclosing scope.
     """
     subdir = "tsv" if extension == ".tsv" else tag
     stem = "-".join(out_name.split("-")[:-1])
     file_name = stem + "-{}-{}-".format(sample, tag) + region + "_" + label + extension
     full_path = os.path.join(out_dir, subdir, file_name)
     robust_makedirs(os.path.dirname(full_path))
     return full_path
def main(args):
    """ merge the per-sample vcfs found under options.comp_dir into trio
    vcfs and run the mendelian check on them
    """
    options = parse_args(args)
    RealTimeLogger.start_master()
    robust_makedirs(options.out_dir)

    # vcfs indexed by munge_vcf_results -> merged vcfs -> mendel tables
    found_vcfs = munge_vcf_results(options.comp_dir)
    merged = make_trio_vcfs(found_vcfs, options)
    do_mendel(merged, options)
def main(args):
    """ entry point: parse the arguments, start the logging master, then
    chain munge_vcf_results -> make_trio_vcfs -> do_mendel
    """
    options = parse_args(args)
    RealTimeLogger.start_master()
    robust_makedirs(options.out_dir)

    do_mendel(
        make_trio_vcfs(munge_vcf_results(options.comp_dir), options),
        options)
def main(args):
    """ write one call_stats_<region>_<sample>.tsv into options.out_dir for
    every region / sample pair in the table returned by do_all_counts
    """
    options = parse_args(args)
    robust_makedirs(options.out_dir)

    counts_table = do_all_counts(munge_vcfeval_results(options.comp_dir),
                                 options)

    for region, per_sample in counts_table.items():
        for sample, graph_table in per_sample.items():
            contents = counts_tsv(graph_table, options)
            stats_name = "call_stats_{}_{}.tsv".format(region, sample)
            stats_path = os.path.join(options.out_dir, stats_name)
            with open(stats_path, "w") as handle:
                handle.write(contents)
def make_trio_vcfs(vcfmap, options):
    """Merge the per-sample VCFs of each graph into one VCF using rtg.

    For every region and graph, each sample's VCF is filtered with bcftools
    into three flavours ("snp", "indel", "all"), normalised, bgzipped and
    tabix-indexed.  Graphs with at least three samples then get one
    ``rtg vcfmerge`` output per flavour.  ``options.ped`` is also written to
    ``<out_dir>/predigree.ped``.

    Args:
        vcfmap: nested mapping region -> sample -> graph -> vcf path.
        options: object with ``out_dir``, ``ped``, ``chrom_fa_path`` and
            ``clip`` (passed to ``bcftools view -R`` unless None).

    Returns:
        dict: region -> graph -> kind -> path of the merged ``.vcf.gz``.
    """
    robust_makedirs(options.out_dir)
    # The misspelt file name is kept on purpose: do_mendel() looks the
    # pedigree up under exactly this name.
    ped_file = os.path.join(options.out_dir, "predigree.ped")
    with open(ped_file, "w") as f:
        f.write(options.ped + "\n")

    # bcftools variant-type selection per flavour ("all" applies none).
    # Looked up by equality; the old `kind is "snp"` compared object
    # identity, which only works when the interpreter interns the strings.
    type_filters = {"snp": "-v snps,mnps", "indel": "-V snps,mnps"}

    mergetable = dict()

    for region, rd in vcfmap.items():
        mergetable[region] = dict()
        region_dir = os.path.join(options.out_dir, "trio_vcfs", region)
        robust_makedirs(region_dir)
        # round up all samples for graph
        bygraph = dict()
        for sample, sd in rd.items():
            for graph, pvcf in sd.items():
                bygraph.setdefault(graph, dict())[sample] = pvcf

        # make a merged vcf for each graph
        for graph, sd in bygraph.items():
            # every graph here has at least one sample, so setting the
            # directories up once per graph matches the old per-sample setup
            work_dir = os.path.join(region_dir, "input_vcf")
            merge_dir = os.path.join(region_dir, "merged_vcf")
            robust_makedirs(work_dir)
            robust_makedirs(merge_dir)

            input_vcfs = {"snp": [], "indel": [], "all": []}
            for sample, pvcf in sd.items():
                for kind in input_vcfs:
                    filter_vcf = os.path.join(
                        work_dir, "{}_{}_{}.vcf".format(graph, sample, kind))
                    vstr = type_filters.get(kind, "")
                    if options.clip is not None:
                        vstr += " -R {}".format(options.clip)
                    run("bcftools view {} -f PASS,. {} | bcftools norm - -f {} > {}"
                        .format(pvcf, vstr, options.chrom_fa_path, filter_vcf))
                    run("bgzip -f {}".format(filter_vcf))
                    run("tabix -f -p vcf {}.gz".format(filter_vcf))
                    input_vcfs[kind].append("{}.gz".format(filter_vcf))

            n_samples = len(sd)
            if n_samples >= 3 and \
               len(input_vcfs["all"]) == n_samples and \
               len(input_vcfs["snp"]) == n_samples and \
               len(input_vcfs["indel"]) == n_samples:

                mergetable[region][graph] = dict()
                for kind in input_vcfs:
                    out_vcf = os.path.join(
                        merge_dir, "{}_{}_merged.vcf.gz".format(graph, kind))
                    run("rm -f {}".format(out_vcf))
                    run("rtg vcfmerge {} -o {}".format(
                        " ".join(input_vcfs[kind]), out_vcf),
                        fail_hard=True)

                    mergetable[region][graph][kind] = out_vcf

    return mergetable
Esempio n. 14
0
def compute_kmer_comparison(job, graph1, graph2, options):
    """ run "vg compare" on a pair of graphs, redirecting its output to the
    path given by comp_path

    The index path of each graph must already exist (asserted).  An
    existing output file is left alone unless options.overwrite is set.
    """
    out_path = comp_path(graph1, graph2, options)
    for graph in (graph1, graph2):
        assert os.path.exists(index_path(graph, options))

    if not options.overwrite and os.path.exists(out_path):
        return

    robust_makedirs(os.path.dirname(out_path))
    # vg compare is given at most two threads
    threads = min(options.vg_cores, 2)
    os.system("vg compare {} {} -t {} > {}".format(graph1, graph2,
                                                   threads, out_path))
def make_trio_vcfs(vcfmap, options):
    """Merge samples of the same graph into one VCF with rtg vcfmerge.

    Each sample VCF is split by ``bcftools view`` into "snp", "indel" and
    "all" variants, normalised against ``options.chrom_fa_path``, then
    compressed and indexed.  A graph is merged only when it has at least
    three samples.  The pedigree text in ``options.ped`` is saved to
    ``<out_dir>/predigree.ped`` first.

    Args:
        vcfmap: nested mapping region -> sample -> graph -> vcf path.
        options: object with ``out_dir``, ``ped``, ``chrom_fa_path`` and
            ``clip`` (added as ``-R <clip>`` to bcftools view unless None).

    Returns:
        dict: index of merged files, region -> graph -> kind -> path.
    """
    robust_makedirs(options.out_dir)
    # File name spelling is intentional: do_mendel() reads the pedigree
    # back under this exact name.
    ped_file = os.path.join(options.out_dir, "predigree.ped")
    with open(ped_file, "w") as f:
        f.write(options.ped + "\n")

    mergetable = dict()

    for region, rd in vcfmap.items():
        mergetable[region] = dict()
        region_dir = os.path.join(options.out_dir, "trio_vcfs", region)
        robust_makedirs(region_dir)
        # round up all samples for graph
        bygraph = dict()
        for sample, sd in rd.items():
            for graph, pvcf in sd.items():
                if graph not in bygraph:
                    bygraph[graph] = dict()
                bygraph[graph][sample] = pvcf

        # make a merged vcf for each graph
        for graph, sd in bygraph.items():
            # a graph always has one sample or more, so doing this once per
            # graph creates the same directories as doing it per sample
            work_dir = os.path.join(region_dir, "input_vcf")
            merge_dir = os.path.join(region_dir, "merged_vcf")
            robust_makedirs(work_dir)
            robust_makedirs(merge_dir)

            input_vcfs = {"snp": [], "indel": [], "all": []}
            for sample, pvcf in sd.items():
                for kind in input_vcfs:
                    filter_vcf = os.path.join(work_dir, "{}_{}_{}.vcf".format(graph, sample, kind))
                    # Compare with ==, not `is`: identity of equal strings
                    # is an interpreter detail, not a guarantee.
                    if kind == "snp":
                        vstr = "-v snps,mnps"
                    elif kind == "indel":
                        vstr = "-V snps,mnps"
                    else:
                        vstr = ""
                    if options.clip is not None:
                        vstr += " -R {}".format(options.clip)
                    run("bcftools view {} -f PASS,. {} | bcftools norm - -f {} > {}".format(
                        pvcf, vstr, options.chrom_fa_path, filter_vcf))
                    run("bgzip -f {}".format(filter_vcf))
                    run("tabix -f -p vcf {}.gz".format(filter_vcf))
                    input_vcfs[kind].append("{}.gz".format(filter_vcf))

            complete = all(len(input_vcfs[kind]) == len(sd)
                           for kind in ("all", "snp", "indel"))
            if len(sd) >= 3 and complete:
                mergetable[region][graph] = dict()
                for kind in input_vcfs:
                    out_vcf = os.path.join(merge_dir, "{}_{}_merged.vcf.gz".format(graph, kind))
                    run("rm -f {}".format(out_vcf))
                    run("rtg vcfmerge {} -o {}".format(" ".join(input_vcfs[kind]), out_vcf),
                        fail_hard=True)

                    mergetable[region][graph][kind] = out_vcf

    return mergetable
Esempio n. 16
0
def main(args):
    """ count calls for everything munge_vcfeval_results finds under
    options.comp_dir and save one tsv per region and sample, named
    call_stats_<region>_<sample>.tsv, in options.out_dir
    """
    options = parse_args(args)
    robust_makedirs(options.out_dir)

    evalmap = munge_vcfeval_results(options.comp_dir)
    counts_table = do_all_counts(evalmap, options)

    name_template = "call_stats_{}_{}.tsv"
    for region, rd in counts_table.items():
        for sample, graph_table in rd.items():
            text = counts_tsv(graph_table, options)
            target = os.path.join(options.out_dir,
                                  name_template.format(region, sample))
            with open(target, "w") as out:
                out.write(text)
def do_mendel(mergetable, options):
    """ run rtg mendelian on every merged vcf and tabulate the results

    mergetable maps region -> graph -> kind -> merged vcf.  The rtg outputs
    go to <out_dir>/mendel/<region>/<graph>; the value scraped from each
    run's stdout becomes one cell of <out_dir>/mendel-<region>.tsv.  Rows
    containing a None value are left out of the tsv.
    """
    header = ["graph", "all", "snp", "indel"]
    mendel_cmd = ("rtg mendelian -l -i {} -t {} --pedigree {} --output {} "
                  "--output-consistent {} --output-inconsistent {} > {}")

    for region, gd in mergetable.items():
        table = []
        for graph, mergefiles in gd.items():
            annot_dir = os.path.join(options.out_dir, "mendel", region, graph)
            robust_makedirs(annot_dir)
            concordance = {}
            for kind, mergefile in mergefiles.items():
                out_vcf, con_vcf, incon_vcf, out_stdout = [
                    os.path.join(annot_dir, pattern.format(kind))
                    for pattern in ("mendel_{}.vcf.gz",
                                    "consistent_{}.vcf.gz",
                                    "inconsistent_{}.vcf.gz",
                                    "mendel_{}.stdout")]

                sdf_path = os.path.join(options.comp_dir, "chrom.sdf")
                ped_path = os.path.join(options.out_dir, "predigree.ped")
                run(mendel_cmd.format(mergefile, sdf_path, ped_path,
                                      out_vcf, con_vcf, incon_vcf,
                                      out_stdout))

                concordance[kind] = scrape_mendel(out_stdout)

            # cells follow the header order after the graph name
            table.append([graph] + [concordance[kind]
                                    for kind in ("all", "snp", "indel")])

        # write the tsv for this region
        tsv_path = os.path.join(options.out_dir,
                                "mendel-{}.tsv".format(region))
        with open(tsv_path, "w") as f:
            f.write("\t".join(header) + "\n")
            for row in table:
                if None in row:
                    continue
                f.write("\t".join([str(s) for s in row]) + "\n")
def compute_linear_variants(job, input_gam, options):
    """Project a gam onto the "ref" path, call variants, rebuild a vg graph.

    Steps, each skipped when its output already exists and nothing upstream
    was redone (``options.overwrite`` forces all of them):

    1. ``vg surject`` the gam to a sorted BAM,
    2. ``samtools mpileup | bcftools call`` into a bgzipped, indexed VCF,
    3. ``vg construct`` a graph from that VCF and the reference fasta.

    Nothing is done when ``scripts/vgHasPath.sh`` does not report a "ref"
    path in the input graph.

    Args:
        job: Toil job handle (not used in the body).
        input_gam: path of the alignment to process.
        options: object with ``overwrite``, ``vg_cores`` and ``timeout``,
            also consumed by the path helper functions.
    """
    input_graph_path = graph_path(input_gam, options)
    input_index_path = index_path(input_graph_path, options)

    # can only do this if there is a "ref" path in the vg graph
    res_path = temp_path(options)
    run("scripts/vgHasPath.sh {} {} > {}".format(input_graph_path, "ref", res_path))
    with open(res_path) as res_file:
        # startswith() instead of [0]: an empty result (e.g. the script
        # produced no output) now means "no ref path" rather than IndexError
        has_ref = res_file.read().startswith("1")
    run("rm {}".format(res_path))

    if has_ref:
        surject_path = projected_bam_path(input_gam, options)
        out_vcf_path = linear_vcf_path(input_gam, options)
        out_vg_path = linear_vg_path(input_gam, options)
        fasta_path = ref_path(input_gam, options)
        # each stage is redone whenever the stage before it was
        do_surject = options.overwrite or not os.path.isfile(surject_path)
        do_vcf = do_surject or not os.path.isfile(out_vcf_path + ".gz")
        do_vg = do_vcf or not os.path.isfile(out_vg_path)

        if do_surject:
            robust_makedirs(os.path.dirname(surject_path))
            # note: ".prefix" is passed as temp_path's `prefix` argument
            prefix_path = temp_path(options, ".prefix")
            # surject to reference path (name hardcoded to ref for now)
            run("vg surject -d {} -p {} -b {} -t {} | samtools sort -o - {}> {}".format(
                input_index_path,
                "ref",
                input_gam,
                options.vg_cores,
                prefix_path,
                surject_path),
                timeout_sec=options.timeout,
                timeout_dep=surject_path)
            run("rm -f {}".format(prefix_path))

        if do_vcf:
            # todo: we assume that all graphs have same reference fasta, here.
            # this is false for, ex, simons which uses grchg37 instead of 38.

            # create pileup in bcf using samtools
            # http://samtools.sourceforge.net/mpileup.shtml
            assert os.path.isfile(fasta_path)
            robust_makedirs(os.path.dirname(out_vcf_path))
            run("samtools mpileup -I -u -t DP -f {} {} | bcftools call -m -V indels - > {}".format(
                fasta_path,
                surject_path,
                out_vcf_path))

            # make compressed index
            run("bgzip -f {}".format(out_vcf_path))
            run("tabix -f -p vcf {}.gz".format(out_vcf_path))

        if do_vg:
            # and convert back to vg...
            robust_makedirs(os.path.dirname(out_vg_path))
            run("vg construct -v {}.gz -r {} -t {} > {}".format(out_vcf_path, fasta_path,
                                                                options.vg_cores, out_vg_path))
Esempio n. 19
0
def compute_vg_variants(job, input_gam, options):
    """ pile up the gam on its graph with vg pileup, then run vg call
    twice: once for the sample graph and once (with -l) for the augmented
    graph

    A step is run when its output file is missing or when the pileup is
    being recomputed; options.overwrite forces the pileup.
    """
    graph = graph_path(input_gam, options)
    pileup = pileup_path(input_gam, options)
    sample_vg = sample_vg_path(input_gam, options)
    augmented_vg = augmented_vg_path(input_gam, options)

    # all three decisions are taken before anything is run
    need_pileup = options.overwrite or not os.path.isfile(pileup)
    need_sample = need_pileup or not os.path.isfile(sample_vg)
    need_augmented = need_pileup or not os.path.isfile(augmented_vg)

    if need_pileup:
        RealTimeLogger.get().info(
            "Computing Variants for {} {}".format(graph, input_gam))
        robust_makedirs(os.path.dirname(pileup))
        pileup_cmd = "vg pileup {} {} -t {} > {}"
        run(pileup_cmd.format(graph, input_gam, options.vg_cores, pileup))

    call_steps = (
        (need_sample, sample_vg,
         "vg call {} {} -r 0.001 -d 50 -e 150 -s 25 -t {} | vg ids -c - | vg ids -s -  > {}"),
        (need_augmented, augmented_vg,
         "vg call {} {} -r 0.001 -d 50 -e 150 -s 25 -t {} -l | vg ids -c - | vg ids -s - > {}"),
    )
    for wanted, target, call_cmd in call_steps:
        if wanted:
            robust_makedirs(os.path.dirname(target))
            run(call_cmd.format(graph, pileup, options.vg_cores, target))
Esempio n. 20
0
def make_best_calls(best_table, options):
    """Link the best-F1 call set of every region/graph into "<out_dir>.best".

    For each entry the comparison tsv path is truncated at "/comp_tables"
    and stripped of its extension to obtain the call directory root; this
    depends on call directories being named like the comparison directory
    minus its extension.  Preprocessed sample and truth VCFs from the
    matching ".comp" directory are linked in as well.

    Args:
        best_table: nested mapping region -> graph -> sequence whose first
            item is the comparison tsv path.
        options: object with an ``out_dir`` attribute.
    """
    # Only trailing slashes are removed.  The previous strip("/") removed
    # the leading one too, relocating an absolute out_dir under the cwd.
    best_dir = options.out_dir.rstrip("/") + ".best"

    def soft_link(source, destination):
        # both ends are made absolute so the link works from any directory
        os.system("ln -fs {} {}".format(os.path.abspath(source),
                                        os.path.abspath(destination)))

    for region in best_table:
        for graph in best_table[region]:

            comp_tsv_path = best_table[region][graph][0]
            comp_tsv_path = comp_tsv_path[:comp_tsv_path.find("/comp_tables")]
            call_base_path = os.path.splitext(comp_tsv_path)[0]
            call_path = os.path.join(call_base_path, region, graph)
            # gatk3 and platypus: we just link in their vcf since they don't have call directory
            if graph in ["gatk3", "platypus"]:
                robust_makedirs(os.path.join(best_dir, region, graph))
            else:
                robust_makedirs(os.path.join(best_dir, region))
                soft_link(call_path, os.path.join(best_dir, region))

            # link in the preprocessed vcf from the comp dir to the same directory
            comp_path = call_base_path + ".comp"
            sample_pattern = os.path.join(comp_path, "preprocessed_vcfs", region,
                                          "*_{}.vcf".format(graph))
            for pvcf in glob.glob(sample_pattern):
                renamed = os.path.basename(pvcf).replace(
                    graph, "sample_preprocessed")
                soft_link(pvcf, os.path.join(best_dir, region, graph, renamed))

            # link in the truth while we're at it
            truth_pattern = os.path.join(comp_path, "preprocessed_vcfs", region,
                                         "*_platvcf*.vcf")
            for pvcf in glob.glob(truth_pattern):
                soft_link(pvcf, os.path.join(best_dir, region, graph))
def do_mendel(mergetable, options):
    """ run rtg mendelian on all merged vcfs; write one tsv per region

    For each region / graph / kind in mergetable the annotated, consistent
    and inconsistent vcfs plus the captured stdout are written under
    <out_dir>/mendel/<region>/<graph>.  The per-kind result scraped from
    stdout fills the graph's row in <out_dir>/mendel-<region>.tsv; rows
    with a None entry are skipped.
    """
    header = ["graph", "all", "snp", "indel"]
    for region, gd in mergetable.items():
        table = []
        for graph, mergefiles in gd.items():
            annot_dir = os.path.join(options.out_dir, "mendel", region, graph)
            robust_makedirs(annot_dir)

            def in_annot_dir(file_name):
                return os.path.join(annot_dir, file_name)

            concordance = dict()
            for kind, mergefile in mergefiles.items():
                out_stdout = in_annot_dir("mendel_{}.stdout".format(kind))
                cmd_args = [
                    mergefile,
                    os.path.join(options.comp_dir, "chrom.sdf"),
                    os.path.join(options.out_dir, "predigree.ped"),
                    in_annot_dir("mendel_{}.vcf.gz".format(kind)),
                    in_annot_dir("consistent_{}.vcf.gz".format(kind)),
                    in_annot_dir("inconsistent_{}.vcf.gz".format(kind)),
                    out_stdout,
                ]
                run("rtg mendelian -l -i {} -t {} --pedigree {} --output {} --output-consistent {} --output-inconsistent {} > {}".format(*cmd_args))

                concordance[kind] = scrape_mendel(out_stdout)

            row = [graph]
            for kind in ("all", "snp", "indel"):
                row.append(concordance[kind])
            table.append(row)

        # write the tsv for this region
        region_tsv = os.path.join(options.out_dir, "mendel-{}.tsv".format(region))
        with open(region_tsv, "w") as f:
            f.write("\t".join(header) + "\n")
            complete_rows = [row for row in table if None not in row]
            for row in complete_rows:
                f.write("\t".join(map(str, row)) + "\n")
def compute_snp1000g_baseline(job, input_gam, platinum, filter_indels, options):
    """ make 1000 genomes sample graph by filtering the vcf

    Filters the region's source vcf (1000 Genomes, or the per-sample
    platinum vcf when `platinum` is True) down to the gam's sample, sorts,
    compresses and indexes the result, then builds a vg graph from it over
    the interval on the first line of the region's bed file.

    Only runs for gams whose graph tag is "refonly", and in platinum mode
    only for samples listed in options.platinum_samples.  Both steps are
    skipped when the sample name does not occur in the source vcf.

    platinum and filter_indels are tested with `is True` / `is False`, so
    they are expected to be real booleans.
    """
    # there is only one g1vcf graph per region per sample
    # this function is also going to get called once for each graph type
    # so we hack here to only run on refonly graphs (arbitrary choice)
    if alignment_graph_tag(input_gam, options) != "refonly":
        return

    sample = alignment_sample_tag(input_gam, options)

    if platinum is True and sample not in options.platinum_samples.split(","):
        return
    
    region = alignment_region_tag(input_gam, options)
    if platinum is False:
        g1kvcf_path = os.path.join(options.g1kvcf_path, region.upper() + ".vcf")
    else:
        g1kvcf_path = os.path.join(options.platinum_path, sample, region.upper() + ".vcf")
    # NOTE(review): the bed file is taken from g1kvcf_path in platinum mode
    # too -- presumably the regions are shared; confirm
    g1kbed_path = os.path.join(options.g1kvcf_path, region.upper() + ".bed")
    filter_vcf_path = g1k_vcf_path(input_gam, platinum, filter_indels, options)
    filter_fa_path = g1k_fa_path(input_gam, platinum, filter_indels, options)
    filter_vg_path = g1k_vg_path(input_gam, platinum, filter_indels, options)
    fasta_path = options.chrom_fa_path

    # redo the filter when the compressed vcf is missing (or on overwrite);
    # redoing the filter always forces the graph to be rebuilt as well
    do_filter = options.overwrite or not os.path.isfile(filter_vcf_path + ".gz")
    do_construct = do_filter or not os.path.isfile(filter_vg_path)

    # make sure we're dealing with a sample that's in the vcf
    if do_filter or do_construct:
        # count the vcf lines that mention the sample name
        p = subprocess.Popen("grep {} {} | wc -l".format(sample, g1kvcf_path),
                             shell=True, stdout=subprocess.PIPE, stderr=sys.stderr, bufsize=-1)
        output, _ = p.communicate()
        assert p.wait() == 0
        if int(output) == 0:
            # sample absent from the vcf: nothing to filter or construct
            do_filter = False
            do_construct = False

    # make filtered compressed vcf for this sample
    if do_filter:            
        robust_makedirs(os.path.dirname(filter_vcf_path))
        if filter_indels is True:
            # run the indel filter script first, into a ".in" file that
            # then becomes the input of the sample filter
            filter_input_path = filter_vcf_path + ".in"
            run("scripts/vcfFilterIndels.py {} > {}".format(g1kvcf_path, filter_input_path),
                fail_hard = True)
        else:
            filter_input_path = g1kvcf_path
        run("scripts/vcfFilterSample.py {} {} {} {} {}".format(filter_input_path,
                                                              fasta_path,
                                                              sample,
                                                              filter_vcf_path,
                                                              filter_fa_path),
            fail_hard = True)
        # sort via a temporary ".sort" file that replaces the vcf; this
        # step is not run with fail_hard
        run("scripts/vcfsort {} > {}.sort ; mv {}.sort {}".format(filter_vcf_path,
                                                          filter_vcf_path,
                                                          filter_vcf_path,
                                                          filter_vcf_path))
        run("bgzip -f {}".format(filter_vcf_path), fail_hard = True)
        run("tabix -f -p vcf {}.gz".format(filter_vcf_path), fail_hard = True)

    # load it into a vg graph
    if do_construct:
        with open(g1kbed_path) as bed_file:
            # only the first line of the bed file is used
            coords = bed_file.readline().split()
            # convert from bed to vcf coordinates by adding one to start
            coords = (coords[0], int(coords[1]) + 1, int(coords[2]))
            run("vg construct -v {}.gz -r {} -t {} -R {}:{}-{} > {}".format(filter_vcf_path, filter_fa_path,
                                                                            options.vg_cores, 
                                                                            coords[0], coords[1], coords[2],
                                                                            filter_vg_path),
                fail_hard = True)    
def compute_vg_variants(job, input_gam, options):
    """ filter the input gam, then make a vcf from it with either vg genotype
    (options.genotype) or vg pileup + vg call, and optionally surject the
    filtered gam to a sorted, indexed bam (options.surject).

    Each step is skipped when its output already exists, unless
    options.overwrite is set or a step it depends on is being redone.
    Nothing is returned; results are written to paths derived from input_gam.

    job: Toil job, used only for its local temp directory (scratch xg index)
    input_gam: path of the gam to process
    options: parsed command line options (cwd, overwrite, genotype, surject,
             filter_opts, pileup_opts, call_opts, vg_cores, g1kvcf_path, ...)
    """

    # Move to the appropriate working directory from wherever Toil dropped us
    os.chdir(options.cwd)

    input_graph_path = graph_path(input_gam, options)
    out_pileup_path = pileup_path(input_gam, options)
    out_sample_vg_path = sample_vg_path(input_gam, options)
    out_sample_vcf_path = out_sample_vg_path.replace(".vg", ".vcf")
    out_augmented_vg_path = augmented_vg_path(input_gam, options)
    out_gam_filter_path = gam_filter_path(input_gam, options)
    out_gam_index_path = gam_index_path(input_gam, options)
    out_bam_path = out_sample_vg_path.replace(".vg", ".bam")

    # work out which steps need to run
    do_genotype = options.genotype and (options.overwrite or not os.path.isfile(out_sample_vcf_path))
    do_gam_filter = options.overwrite or not os.path.isfile(out_gam_filter_path)
    do_gam_index = do_genotype and (do_gam_filter or options.overwrite or not os.path.isdir(out_gam_index_path))
    do_pu = not options.genotype and (options.overwrite or not os.path.isfile(out_pileup_path))
    do_call = not options.genotype and (do_pu or not os.path.isfile(out_sample_vcf_path))
    do_surject = options.surject and (options.overwrite or do_gam_filter or not os.path.isfile(out_bam_path))

    # We need an XG here for the base graph, but I haven't got time to refactor
    # to make it in the right place. So just make it here.
    temp_xg_path = job.fileStore.getLocalTempDir() + "/filter.xg"

    if do_gam_filter or do_pu:
        # Make sure we have the xg index around, which filter may need.
        run("vg index -x {} {}".format(temp_xg_path, input_graph_path), fail_hard=True)

    if do_gam_filter:
        robust_makedirs(os.path.dirname(out_pileup_path))
        run("vg filter -x {} {} {} {} > {}".format(temp_xg_path, input_gam,
                                                   options.filter_opts, input_graph_path,
                                                   out_gam_filter_path),
            fail_hard=True)

    if do_gam_index:
        robust_makedirs(os.path.dirname(out_pileup_path))
        # the index is a directory: wipe any stale one before rebuilding
        run("rm -rf {} ; vg index {} -N -d {}".format(out_gam_index_path, out_gam_filter_path,
                                                      out_gam_index_path),
            fail_hard=True)

    if do_pu:
        robust_makedirs(os.path.dirname(out_pileup_path))
        run("vg pileup {} {} {} -t {} > {}".format(input_graph_path,
                                                   out_gam_filter_path,
                                                   options.pileup_opts,
                                                   options.vg_cores,
                                                   out_pileup_path),
            fail_hard=True)

    # name of the reference path in the graph; stays None when nothing below
    # needs it or when the graph has no usable reference path
    ref = None
    bedLength = -1
    if do_call or do_genotype or do_surject:
        robust_makedirs(os.path.dirname(out_sample_vcf_path))
        region = alignment_region_tag(input_gam, options)
        g1kbed_path = os.path.join(options.g1kvcf_path, region.upper() + ".bed")
        # first line of the region's bed file gives contig, start and end
        with open(g1kbed_path) as f:
            contig, offset, end = f.readline().split()[0:3]
            bedLength = int(end) - int(offset)

        # make the vcf
        # can only do this if there is a "ref" (or contig-named) path in the vg graph
        res_path = temp_path(options)
        for ref_name in ["ref", contig]:
            run("scripts/vgHasPath.sh {} {} > {}".format(input_graph_path, ref_name, res_path))
            with open(res_path) as res_file:
                # startswith() instead of [0]: this run is not fail_hard, so the
                # result file may be empty and indexing it would raise IndexError
                if res_file.read().startswith("1"):
                    ref = ref_name
                    break
        run("rm {}".format(res_path))

    if ref is not None:
        if do_genotype:
            run("vg genotype {} {} -S -pv -q -i -C -o {} -r {} -c {} -s {} -t {} > {} 2> {}".format(
                input_graph_path,
                out_gam_index_path,
                offset,
                ref,
                contig,
                alignment_sample_tag(input_gam, options),
                options.vg_cores,
                out_sample_vcf_path,
                out_sample_vcf_path.replace(".vcf", ".vcf.stderr")),
                fail_hard=True)
        if do_call:
            run("vg call {} {} {} -t {} -o {} -r {} -c {} -S {} -A {} > {} 2> {}".format(
                input_graph_path,
                out_pileup_path,
                options.call_opts,
                options.vg_cores,
                offset,
                ref,
                contig,
                alignment_sample_tag(input_gam, options),
                out_augmented_vg_path,
                out_sample_vg_path.replace(".vg", ".vcf"),
                out_sample_vg_path.replace(".vg", ".vcf.stderr")),
                fail_hard=True)
        if do_surject:
            # index used by surject lives next to the bam as graph.index
            surject_index_base = os.path.join(os.path.dirname(out_bam_path), "graph")
            run("vg index {} -k {} -e {} -s -d {}.index -t {}".format(input_graph_path, 20, 5,
                                                                      surject_index_base,
                                                                      options.vg_cores),
                fail_hard=True)
            run("vg surject {} -t {} -p {} -b -d {}.index > {}".format(out_gam_filter_path,
                                                                       options.vg_cores, ref,
                                                                       surject_index_base,
                                                                       out_bam_path),
                fail_hard=True)

            # fix up chromosome coordinates so we can display on browser
            if contig[0] != "c":
                contig = "chr{}".format(contig)
            # full chromosome lengths; only these contigs are supported here
            # (any other contig raises KeyError below)
            contigLength = {"chr5": 181538259, "chr6": 170805979, "chr13": 114364328,
                            "chr17": 83257441, "chr19": 58617616}
            # in header, change the path name to the contig name and the
            # region length to the whole-chromosome length
            run('samtools view -H {} | sed -e "s/{}/{}/" | sed -e "s/{}/{}/" > {}.sam'.format(
                out_bam_path, ref, contig, bedLength, contigLength[contig], out_bam_path),
                fail_hard=True)
            # in body, add offset and fix contig, leave in sam for now so we can debug
            run("samtools view -F 256 {} | awk -v OFS='\\t' '{{$3=\"{}\"; $4=$4+{}; $5=60; $8=$8+{}; print $0}}' >> {}.sam".format(
                out_bam_path, contig, offset, offset, out_bam_path),
                fail_hard=True)
            # back to bam
            run("samtools view {}.sam -b -F 4 | samtools sort - --threads {} -o {}".format(
                out_bam_path, options.vg_cores, out_bam_path),
                fail_hard=True)

            # and index
            run("samtools index -b {}".format(out_bam_path), fail_hard=True)
Esempio n. 24
0
def main(args):
    """ combine the comp_tables of several comparison directories (one per
    roc point) into options.out_dir, write the averaged accuracy table and
    the best-call links, and optionally sort + smooth the merged tables.
    """
    options = parse_args(args)

    table_dir = os.path.join(options.out_dir, "comp_tables")
    robust_makedirs(table_dir)

    # one avg_acc() row per table matching best_baseline / best_comp
    avg_table = []
    # [region][method] --> (path, f1)
    best_table = defaultdict(lambda: defaultdict(lambda: (None, -1)))

    # sort the directories, assuming their names give info on their order
    # in the roc, then trim the requested number off each end
    stop = len(options.comp_dirs) - options.skip_last
    selected_dirs = sorted(options.comp_dirs)[options.skip_first:stop]

    is_first = True
    for comp_dir in selected_dirs:
        # NOTE: single-argument print(...) prints exactly what the
        # python 2 print statement would
        print(comp_dir)
        # this can happen easily using wildcards in input
        if comp_dir == options.out_dir:
            continue
        # look through tsvs in comp_tables.
        for tsv in glob.glob(os.path.join(comp_dir, "comp_tables", "*.tsv")):
            table_name = os.path.basename(tsv)
            if is_first is True:
                # first directory: overwrite whatever is in the output
                prefix = "cp {} ".format(tsv)
            else:
                # later directories: strip header and append
                prefix = "tail -n +2 {} >> ".format(tsv)
            cmd = "{} {}".format(prefix, os.path.join(table_dir, table_name))
            os.system(cmd)
            print(cmd)

            name_toks = table_name.split("-")
            is_best_table = (len(name_toks) > 2
                             and name_toks[0] == options.best_baseline
                             and name_toks[1] == options.best_comp)
            if is_best_table:
                avg_table.append(avg_acc(tsv, options))
                update_best_table(best_table, tsv, options)

        is_first = False

    # make a call directory of links to the best in roc points for each graph
    make_best_calls(best_table, options)

    # write out our sompy vcf snp accuracy
    avg_name = "{}-{}-{}_{}-avg.tsv".format(options.best_baseline, options.best_comp,
                                            options.pcol, options.rcol)
    with open(os.path.join(options.out_dir, avg_name), "w") as f:
        for row in sorted(avg_table):
            # every token is followed by a tab, including the last one
            f.write("".join(str(tok) + "\t" for tok in row))
            f.write("\n")

    # let's sort the output to make it easier to remove dead points
    if options.smooth is True:

        def row_key(text_row):
            # order by first column, then precision column, then 1 - recall column
            toks = text_row.split()
            return (toks[0], float(toks[options.pcol]), 1 - float(toks[options.rcol]))

        for tsv in glob.glob(os.path.join(table_dir, "*.tsv")):
            print("smoothing {}".format(tsv))
            with open(tsv) as f:
                rows = f.readlines()
            # header stays on top, everything else gets sorted
            ordered = [rows[0]] + sorted(rows[1:], key=row_key)
            # precisions can be bumpy (need to change to sensitivity?)
            # use simple smoother in the meantime
            smoothed = smooth_table([row.split() for row in ordered], options)
            with open(tsv, "w") as f:
                for row in smoothed:
                    f.write(row)
def compute_vg_variants(job, input_gam, options):
    """ run vg filter + vg pileup and vg call on the input gam, convert the
    call output to vcf with glenn2vcf and, if options.sample is set, also
    write a sample graph (vg call piped through vg ids).

    Each step is skipped when its output already exists, unless
    options.overwrite is set or a step it depends on is being redone.
    Nothing is returned; results are written to paths derived from input_gam.

    job: not used by this function
    input_gam: path of the gam to process
    options: parsed command line options (overwrite, sample, filter_opts,
             pileup_opts, call_opts, vg_cores, depth, g1kvcf_path, ...)
    """
    input_graph_path = graph_path(input_gam, options)
    out_pileup_path = pileup_path(input_gam, options)
    out_sample_vg_path = sample_vg_path(input_gam, options)
    out_sample_txt_path = sample_txt_path(input_gam, options)
    out_augmented_vg_path = augmented_vg_path(input_gam, options)

    # work out which steps need to run
    do_pu = options.overwrite or not os.path.isfile(out_pileup_path)
    do_call = do_pu or not os.path.isfile(out_augmented_vg_path)
    do_sample = options.sample and (do_pu or not os.path.isfile(out_sample_vg_path))
    do_vcf = do_call or not os.path.isfile(out_sample_vg_path.replace(".vg", ".vcf"))

    if do_pu:
        RealTimeLogger.get().info("Computing Variants for {} {}".format(
            input_graph_path,
            input_gam))
        robust_makedirs(os.path.dirname(out_pileup_path))
        run("vg filter {} {} | vg pileup {} - {} -t {} > {}".format(input_gam,
                                                                    options.filter_opts,
                                                                    input_graph_path,
                                                                    options.pileup_opts,
                                                                    options.vg_cores,
                                                                    out_pileup_path),
            fail_hard=True)

    if do_call:
        robust_makedirs(os.path.dirname(out_sample_vg_path))
        # writes the augmented graph to stdout and the call text file via -c
        run("vg call {} {} {} -l -c {} -t {} > {}".format(input_graph_path,
                                                          out_pileup_path,
                                                          options.call_opts,
                                                          out_sample_txt_path,
                                                          options.vg_cores,
                                                          out_augmented_vg_path),
            fail_hard=True)

    if do_vcf:
        region = alignment_region_tag(input_gam, options)
        g1kbed_path = os.path.join(options.g1kvcf_path, region.upper() + ".bed")
        # first line of the region's bed file gives contig and start offset
        with open(g1kbed_path) as f:
            contig, offset = f.readline().split()[0:2]

        # make the vcf
        # can only do this if there is a "ref" (or contig-named) path in the vg graph
        ref = None
        res_path = temp_path(options)
        for ref_name in ["ref", contig]:
            run("scripts/vgHasPath.sh {} {} > {}".format(input_graph_path, ref_name, res_path))
            with open(res_path) as res_file:
                # startswith() instead of [0]: this run is not fail_hard, so the
                # result file may be empty and indexing it would raise IndexError
                if res_file.read().startswith("1"):
                    ref = ref_name
                    break
        run("rm {}".format(res_path))

        if ref is not None:
            run("glenn2vcf {} {} -o {} -r {} -c {} -s {} -d {} > {} 2> {}".format(
                out_augmented_vg_path,
                out_sample_txt_path,
                offset,
                ref,
                contig,
                alignment_sample_tag(input_gam, options),
                options.depth,
                out_sample_vg_path.replace(".vg", ".vcf"),
                out_sample_vg_path.replace(".vg", ".vcf.stderr")),
                fail_hard=True)

    if do_sample:
        robust_makedirs(os.path.dirname(out_augmented_vg_path))
        run("vg call {} {} {} -t {} | vg ids -cs - > {}".format(input_graph_path,
                                                                out_pileup_path,
                                                                options.call_opts,
                                                                options.vg_cores,
                                                                out_sample_vg_path),
            fail_hard=True)
def _vg_ref_path_name(input_graph_path, candidates, options):
    """ return the first name in candidates that scripts/vgHasPath.sh reports
    as a path of the graph, or None if it reports none of them
    """
    res_path = temp_path(options)
    found = None
    for ref_name in candidates:
        run("scripts/vgHasPath.sh {} {} > {}".format(input_graph_path, ref_name, res_path))
        with open(res_path) as res_file:
            # startswith() instead of [0]: this run is not fail_hard, so the
            # result file may be empty and indexing it would raise IndexError
            if res_file.read().startswith("1"):
                found = ref_name
                break
    run("rm {}".format(res_path))
    return found


def compute_vg_variants(job, input_gam, options):
    """ filter the input gam, then make a vcf from it with either vg genotype
    (options.genotype) or vg pileup + vg call, and optionally surject the
    filtered gam to a sorted, indexed bam (options.surject).

    Each step is skipped when its output already exists, unless
    options.overwrite is set or a step it depends on is being redone.
    Nothing is returned; results are written to paths derived from input_gam.

    job: Toil job, used only for its local temp directory (scratch xg index)
    input_gam: path of the gam to process
    options: parsed command line options (cwd, overwrite, genotype, surject,
             filter_opts, pileup_opts, call_opts, vg_cores, g1kvcf_path, ...)
    """

    # Move to the appropriate working directory from wherever Toil dropped us
    os.chdir(options.cwd)

    input_graph_path = graph_path(input_gam, options)
    out_pileup_path = pileup_path(input_gam, options)
    out_sample_vg_path = sample_vg_path(input_gam, options)
    out_sample_vcf_path = out_sample_vg_path.replace(".vg", ".vcf")
    out_augmented_vg_path = augmented_vg_path(input_gam, options)
    out_gam_filter_path = gam_filter_path(input_gam, options)
    out_gam_index_path = gam_index_path(input_gam, options)
    out_bam_path = out_sample_vg_path.replace(".vg", ".bam")

    # work out which steps need to run
    do_genotype = options.genotype and (options.overwrite or not os.path.isfile(out_sample_vcf_path))
    do_gam_filter = options.overwrite or not os.path.isfile(out_gam_filter_path)
    do_gam_index = do_genotype and (do_gam_filter or options.overwrite or not os.path.isdir(out_gam_index_path))
    do_pu = not options.genotype and (options.overwrite or not os.path.isfile(out_pileup_path))
    do_call = not options.genotype and (do_pu or not os.path.isfile(out_sample_vcf_path))
    do_surject = options.surject and (options.overwrite or do_gam_filter or not os.path.isfile(out_bam_path))

    # We need an XG here for the base graph, but I haven't got time to refactor
    # to make it in the right place. So just make it here.
    temp_xg_path = job.fileStore.getLocalTempDir() + "/filter.xg"

    if do_gam_filter or do_pu:
        # Make sure we have the xg index around, which filter may need.
        run("vg index -x {} {}".format(temp_xg_path, input_graph_path), fail_hard=True)

    if do_gam_filter:
        robust_makedirs(os.path.dirname(out_pileup_path))
        run(
            "vg filter -x {} {} {} {} > {}".format(
                temp_xg_path, input_gam, options.filter_opts, input_graph_path, out_gam_filter_path
            ),
            fail_hard=True,
        )

    if do_gam_index:
        robust_makedirs(os.path.dirname(out_pileup_path))
        # the index is a directory: wipe any stale one before rebuilding
        run(
            "rm -rf {} ; vg index {} -N -d {}".format(out_gam_index_path, out_gam_filter_path, out_gam_index_path),
            fail_hard=True,
        )

    if do_pu:
        robust_makedirs(os.path.dirname(out_pileup_path))
        run(
            "vg pileup {} {} {} -t {} > {}".format(
                input_graph_path, out_gam_filter_path, options.pileup_opts, options.vg_cores, out_pileup_path
            ),
            fail_hard=True,
        )

    # name of the reference path in the graph; stays None when nothing below
    # needs it or when the graph has no usable reference path
    ref = None
    bedLength = -1
    if do_call or do_genotype or do_surject:
        robust_makedirs(os.path.dirname(out_sample_vcf_path))
        region = alignment_region_tag(input_gam, options)
        g1kbed_path = os.path.join(options.g1kvcf_path, region.upper() + ".bed")
        # first line of the region's bed file gives contig, start and end
        with open(g1kbed_path) as f:
            contig, offset, end = f.readline().split()[0:3]
            bedLength = int(end) - int(offset)

        # make the vcf
        # can only do this if there is a "ref" (or contig-named) path in the vg graph
        ref = _vg_ref_path_name(input_graph_path, ["ref", contig], options)

    if ref is not None:
        if do_genotype:
            run(
                "vg genotype {} {} -S -pv -q -i -C -o {} -r {} -c {} -s {} -t {} > {} 2> {}".format(
                    input_graph_path,
                    out_gam_index_path,
                    offset,
                    ref,
                    contig,
                    alignment_sample_tag(input_gam, options),
                    options.vg_cores,
                    out_sample_vcf_path,
                    out_sample_vcf_path.replace(".vcf", ".vcf.stderr"),
                ),
                fail_hard=True,
            )
        if do_call:
            run(
                "vg call {} {} {} -t {} -o {} -r {} -c {} -S {} -A {} > {} 2> {}".format(
                    input_graph_path,
                    out_pileup_path,
                    options.call_opts,
                    options.vg_cores,
                    offset,
                    ref,
                    contig,
                    alignment_sample_tag(input_gam, options),
                    out_augmented_vg_path,
                    out_sample_vg_path.replace(".vg", ".vcf"),
                    out_sample_vg_path.replace(".vg", ".vcf.stderr"),
                ),
                fail_hard=True,
            )
        if do_surject:
            # index used by surject lives next to the bam as graph.index
            surject_index_base = os.path.join(os.path.dirname(out_bam_path), "graph")
            run(
                "vg index {} -k {} -e {} -s -d {}.index -t {}".format(
                    input_graph_path, 20, 5, surject_index_base, options.vg_cores
                ),
                fail_hard=True,
            )
            run(
                "vg surject {} -t {} -p {} -b -d {}.index > {}".format(
                    out_gam_filter_path,
                    options.vg_cores,
                    ref,
                    surject_index_base,
                    out_bam_path,
                ),
                fail_hard=True,
            )

            # fix up chromosome coordinates so we can display on browser
            if contig[0] != "c":
                contig = "chr{}".format(contig)
            # full chromosome lengths; only these contigs are supported here
            # (any other contig raises KeyError below)
            contigLength = {
                "chr5": 181538259,
                "chr6": 170805979,
                "chr13": 114364328,
                "chr17": 83257441,
                "chr19": 58617616,
            }
            # in header, change the path name to the contig name and the
            # region length to the whole-chromosome length
            run(
                'samtools view -H {} | sed -e "s/{}/{}/" | sed -e "s/{}/{}/" > {}.sam'.format(
                    out_bam_path, ref, contig, bedLength, contigLength[contig], out_bam_path
                ),
                fail_hard=True,
            )
            # in body, add offset and fix contig, leave in sam for now so we can debug
            run(
                "samtools view -F 256 {} | awk -v OFS='\\t' '{{$3=\"{}\"; $4=$4+{}; $5=60; $8=$8+{}; print $0}}' >> {}.sam".format(
                    out_bam_path, contig, offset, offset, out_bam_path
                ),
                fail_hard=True,
            )
            # back to bam
            run(
                "samtools view {}.sam -b -F 4 | samtools sort - --threads {} -o {}".format(
                    out_bam_path, options.vg_cores, out_bam_path
                ),
                fail_hard=True,
            )

            # and index
            run("samtools index -b {}".format(out_bam_path), fail_hard=True)
def main(args):
    """ merge the comp_tables found in each of options.comp_dirs into
    options.out_dir/comp_tables, record averaged accuracies and best calls
    for the best_baseline / best_comp tables, and optionally smooth the
    merged tables in place.
    """
    options = parse_args(args)

    merged_dir = os.path.join(options.out_dir, "comp_tables")
    robust_makedirs(merged_dir)

    def merged_path(tsv):
        # where a given input table ends up in the output directory
        return os.path.join(merged_dir, os.path.basename(tsv))

    # compute average score for each roc dir in this table
    avg_table = []
    # [region][method] --> (path, f1)
    best_table = defaultdict(lambda: defaultdict(lambda: (None, -1)))

    # sort the directories, assuming their names give info on their order
    # in the roc
    ordered_dirs = sorted(options.comp_dirs)
    last = len(options.comp_dirs) - options.skip_last
    first = True
    for comp_dir in ordered_dirs[options.skip_first:last]:
        # NOTE: single-argument print(...) prints exactly what the
        # python 2 print statement would
        print(comp_dir)
        # this can happen easily using wildcards in input
        if comp_dir == options.out_dir:
            continue
        # look through tsvs in comp_tables.
        pattern = os.path.join(comp_dir, "comp_tables", "*.tsv")
        for tsv in glob.glob(pattern):
            # overwrite if first
            # strip header and append if second
            if first is True:
                action = "cp {} ".format(tsv)
            else:
                action = "tail -n +2 {} >> ".format(tsv)
            # just cat into the output directory
            command = "{} {}".format(action, merged_path(tsv))
            os.system(command)
            print(command)

            pieces = os.path.basename(tsv).split("-")
            if len(pieces) <= 2:
                continue
            if pieces[0] == options.best_baseline and pieces[1] == options.best_comp:
                avg_table.append(avg_acc(tsv, options))
                update_best_table(best_table, tsv, options)

        first = False

    # make a call directory of links to the best in roc points for each graph
    make_best_calls(best_table, options)

    # write out our sompy vcf snp accuracy
    avg_path = os.path.join(options.out_dir,
                            "{}-{}-{}_{}-avg.tsv".format(options.best_baseline,
                                                         options.best_comp,
                                                         options.pcol,
                                                         options.rcol))
    with open(avg_path, "w") as f:
        for entry in sorted(avg_table):
            # tab after every token (the last one too), then end the row
            for tok in entry:
                f.write("{}\t".format(str(tok)))
            f.write("\n")

    # let's sort the output to make it easier to remove dead points
    if options.smooth is True:

        def sort_key(text_line):
            # first column, then precision column, then 1 - recall column
            cols = text_line.split()
            return (cols[0], float(cols[options.pcol]), 1 - float(cols[options.rcol]))

        for tsv in glob.glob(os.path.join(merged_dir, "*.tsv")):
            print("smoothing {}".format(tsv))
            with open(tsv) as f:
                text_lines = list(f)
            # keep the header first and sort the data rows below it
            text_lines = text_lines[:1] + sorted(text_lines[1:], key=sort_key) if text_lines else [text_lines[0]]
            # precisions can be bumpy (need to change to sensitivity?)
            # use simple smoother in the meantime
            out_lines = smooth_table([x.split() for x in text_lines], options)
            with open(tsv, "w") as f:
                for out_line in out_lines:
                    f.write(out_line)
def main(args):
    """ concatenate per-region vcfs into one sorted, bgzipped and tabix
    indexed vcf.

    With options.classic the layout is call_dir/SAMPLE/REGION.vcf and the
    result is call_dir/SAMPLE/TOTAL.vcf.  Otherwise the layout is
    call_dir/<REGION>/<GRAPH>/<SAMPLE>_sample.vcf and the result is
    call_dir/<options.name>/<GRAPH>/<SAMPLE>_sample.vcf.  Returns 0.
    """
    options = parse_args(args)

    RealTimeLogger.start_master()

    def sort_compress_index(vcf_path):
        # sort the vcf in place, then leave a bgzipped, tabix-indexed copy beside it
        run("vcfsort {} > {}.sort".format(vcf_path, vcf_path), fail_hard=True)
        run("mv {}.sort {}".format(vcf_path, vcf_path), fail_hard=True)
        run("bgzip -c {} > {}.gz".format(vcf_path, vcf_path), fail_hard=True)
        run("tabix -f -p vcf {}.gz".format(vcf_path), fail_hard=True)

    if options.classic:
        # expect call_dir/SAMPLE/region.vcf
        region_vcf_names = ("BRCA1.vcf", "BRCA2.vcf", "SMA.vcf", "LRC_KIR.vcf", "MHC.vcf")

        for sampledir in glob.glob(os.path.join(options.call_dir, "*")):
            if not os.path.isdir(sampledir):
                continue
            outfile = os.path.join(sampledir, "TOTAL.vcf")
            vcfs = []
            for vcf in glob.glob(os.path.join(sampledir, "*.vcf")):
                if os.path.basename(vcf) not in region_vcf_names:
                    continue
                # sorted + compressed + indexed copy of each region vcf
                run("vcfsort {} > {}.sort".format(vcf, vcf), fail_hard=True)
                run("bgzip -c {}.sort > {}.gz".format(vcf, vcf), fail_hard=True)
                run("rm -f {}.sort".format(vcf))
                run("tabix -f -p vcf {}.gz".format(vcf), fail_hard=True)
                vcfs.append("{}.gz".format(vcf))
            if len(vcfs) > 0:
                run("vt cat {} > {}".format(" ".join(vcfs), outfile), fail_hard=True)
                sort_compress_index(outfile)

        return 0

    # expect call_dir/<REGION>/<GRAPH>/<SAMPLE>_sample.vcf

    # count up regions
    # avoid crufty directories (including outputs of previous runs of this script)
    known_regions = ["brca1", "brca2", "mhc", "lrc_kir", "sma"]
    region_dirs = (d for d in glob.glob(os.path.join(options.call_dir, "*")) if os.path.isdir(d))
    regions = set(name for name in (os.path.basename(d) for d in region_dirs)
                  if name in known_regions)

    # NOTE: single-argument print(...) prints exactly what the
    # python 2 print statement would
    print(regions)

    # count up graphs (that are present in every region)
    gcount = defaultdict(int)
    for region in regions:
        for graphdir in glob.glob(os.path.join(options.call_dir, region, "*")):
            if os.path.isdir(graphdir):
                gcount[os.path.basename(graphdir)] += 1
    graphs = set(graph for graph, count in gcount.items() if count == len(regions))

    print(graphs)

    # count up samples
    scount = defaultdict(int)
    for region in regions:
        for graph in graphs:
            pattern = os.path.join(options.call_dir, region, graph, "*_sample.vcf")
            for vcf in glob.glob(pattern):
                # sample name is everything before the first underscore
                scount[os.path.basename(vcf).split("_")[0]] += 1
    samples = set(sample for sample, count in scount.items())

    print(samples)

    # make our output directory
    out_dir = os.path.join(options.call_dir, options.name)
    robust_makedirs(out_dir)

    for graph in graphs:
        for sample in samples:
            sample_vcf_name = "{}_sample.vcf".format(sample)

            # collect this sample's vcf from every region that has one
            vcf_files = []
            for region in regions:
                vcf = os.path.join(options.call_dir, region, graph, sample_vcf_name)
                if os.path.isfile(vcf):
                    vcf_files.append((region, vcf))

            # this sample doesn't span all regions, skip it
            if len(vcf_files) < len(regions):
                print("Skipping Sample {} for Graph {}".format(sample, graph))
                continue

            # output vcf
            merge_vcf_path = os.path.join(out_dir, graph, sample_vcf_name)

            # working directory for intermediates / debugging
            work_path = os.path.join(out_dir, graph, "input", sample)
            robust_makedirs(work_path)

            # preprocess all the vcfs and leave in input dir
            input_files = []
            for region, vcf in vcf_files:
                outbase = os.path.join(work_path, region)
                run("vcfsort {} > {}.vcf".format(vcf, outbase), fail_hard=True)
                run("bgzip -f {}.vcf".format(outbase))
                run("tabix -f -p vcf {}.vcf.gz".format(outbase))
                input_files.append("{}.vcf.gz".format(outbase))

            # run the merge
            run("vt cat {} > {}".format(" ".join(input_files), merge_vcf_path), fail_hard=True)

            # make an index just in case
            sort_compress_index(merge_vcf_path)

    return 0
Esempio n. 29
0
def compute_tree(options, mat, names):
    """ Cluster the distances in mat with upgma and write the tree three
    ways: newick at tree_path(options), plus a png and a graphviz dot file
    whose paths are obtained by replacing "newick" in that path.

    mat is read as mat[name_a][name_b] for every pair taken from names.
    """
    def hacked_dist(name_a, name_b):
        # tree constructor writes 0-distances as 1s for some reason,
        # so 0 and 1 are nudged to sentinel values that are undone below
        dist = float(mat[name_a][name_b])
        if dist == 0.:
            return 1e-10
        if dist == 1.:
            return 1.1
        return dist

    # biopython wants a lower-triangular matrix (diagonal included)
    lower_tri = [[hacked_dist(row_name, col_name) for col_name in names[:i + 1]]
                 for i, row_name in enumerate(names)]
    dm = _DistanceMatrix(names, lower_tri)

    # upgma tree, saved as newick
    tree = DistanceTreeConstructor().upgma(dm)
    newick_path = tree_path(options)
    robust_makedirs(os.path.dirname(newick_path))
    Phylo.write(tree, newick_path, "newick")

    # png tree -- note : doesn't work in toil
    def leaf_label(clade):
        # blank out the labels of the internal nodes
        return "" if "Inner" in str(clade) else clade
    Phylo.draw_graphviz(tree, label_func = leaf_label, node_size=1000, node_shape="s", font_size=10)
    pylab.savefig(newick_path.replace("newick", "png"))

    # graphviz: undirected networkx copy of the tree, with the original
    # node names pushed into a "label" attribute
    nxgraph = nx.Graph(Phylo.to_networkx(tree))
    nxgraph = nx.convert_node_labels_to_integers(nxgraph, label_attribute="label")
    for node_id in nxgraph.nodes():
        attrs = nxgraph.node[node_id]
        if "Inner" not in str(attrs["label"]):
            attrs["fontsize"] = 18
            continue
        # internal node: empty label and (almost) no size
        attrs["label"] = "\"\""
        attrs["width"] = 0.001
        attrs["height"] = 0.001
    for src, dst in nxgraph.edges():
        attrs = nxgraph.edge[src][dst]
        # in graphviz, weight means something else, so make it a label
        weight = float(attrs["weight"])
        # undo hack from above (the clamp to 1 is immediately zeroed by the
        # second test, exactly as before)
        if weight > 1:
            weight = 1.
        if weight <= 1e-10 or weight == 1.:
            weight = 0.
        attrs["weight"] = None
        attrs["label"] = "{0:.3g}".format(weight * 100.)
        attrs["fontsize"] = 14
        attrs["len"] = draw_len(weight)
    nx.write_dot(nxgraph, newick_path.replace("newick", "dot"))
def plot_vcf_comp(tsv_path, options):
    """ Plot precision / recall curves and max-F1 summaries for the big vcf
    compare table.

    The sample and region are the last two "-" separated tokens of the tsv
    file name.  For every TOT / SNP / INDEL precision-recall column pair in
    vcf_dist_header(), the columns are cut out of tsv_path with awk,
    flattened with make_max_f1_tsv() and drawn with scripts/scatter.py and
    scripts/barchart.py.  Everything is written below
    options.comp_dir/comp_plots; zoomed-in copies of the plots are added
    when options.top is True.
    """
    out_dir = os.path.join(options.comp_dir, "comp_plots")
    robust_makedirs(out_dir)
    out_name = os.path.basename(os.path.splitext(tsv_path)[0])
    name_toks = out_name.split("-")
    sample = name_toks[-1].upper()
    region = name_toks[-2].upper()
    name_stem = "-".join(name_toks[:-1])
    params = " ".join(PLOT_PARAMS)

    def plot_path(tag, label, extension):
        """ path of one output file; its directory is created on the way """
        # all tsv files share a "tsv" directory, plots get one directory per tag
        subdir = "tsv" if extension == ".tsv" else tag
        stem = "{}-{}-{}-{}".format(name_stem, sample, tag, region)
        path = os.path.join(out_dir, subdir, stem) + "_" + label + extension
        robust_makedirs(os.path.dirname(path))
        return path

    def launch(cmd):
        """ echo a plotting command, then run it in the shell """
        print(cmd)
        os.system(cmd)

    def scatter(tsv, png, title, lo=None, hi=None, width="18", height="9"):
        """ recall / precision scatter plot, optionally clipped to [lo, hi] on both axes """
        cmd = ('scripts/scatter.py {} --save {} --title "{}" --x_label "Recall" '
               '--y_label "Precision" --width {} --height {} {} --lines --no_n '
               '--line_width 1.5 --marker_size 5').format(tsv, png, title, width, height, params)
        if lo is not None:
            cmd += " --min_x {0} --max_x {1} --min_y {0} --max_y {1}".format(lo, hi)
        launch(cmd)

    def barchart(tsv, png, title, y_label, floor=None):
        """ per-graph bar chart, optionally starting the axis at floor """
        cmd = ('scripts/barchart.py {} --ascending --no_n --save {} --title "{}" '
               '--x_sideways --x_label "Graph" --y_label "{}" {}').format(tsv, png, title, y_label, params)
        if floor is not None:
            cmd += " --min {}".format(floor)
        launch(cmd)

    # precision recall scatter plot
    # the last header entry is the quality column: strip it, its index in
    # the unstripped header is the length of what remains
    header = vcf_dist_header(options)[:-1]
    qual_idx = len(header)
    for prec_idx in range(0, 2 * (len(header) // 2), 2):
        rec_idx = prec_idx + 1
        prec_name = header[prec_idx]
        rec_name = header[rec_idx]
        print("{} {} {} {}".format(prec_idx, prec_name, rec_idx, rec_name))
        ptoks = prec_name.split("-")
        rtoks = rec_name.split("-")
        assert ptoks[1] == "Precision"
        assert rtoks[1] == "Recall"
        assert ptoks[:1] == rtoks[:1]
        comp_cat = ptoks[0]
        if comp_cat not in ["TOT", "SNP", "INDEL"]:
            continue

        label = prec_name.replace("Precision", "acc")
        acc_tsv = plot_path("pr", label, ".tsv")
        print("Make {} tsv with cols {} {}".format(label, rec_idx, prec_idx))
        # +1 to convert to awk 1-base coordinates. +1 again since header doesnt include row_label col
        awk_body = 'if (NR!=1) print $1 "\t" ${} "\t" ${} "\t" ${}'.format(
            rec_idx + 2, prec_idx + 2, qual_idx + 2)
        awk_prog = "awk '{" + awk_body + "}'"
        run("{} {} > {}".format(awk_prog, tsv_path, acc_tsv))
        acc_png = plot_path("pr", label, ".png")

        cat_name = "Total" if comp_cat == "TOT" else comp_cat.title()
        where = "all regions" if region == "TOTAL" else region
        # (the two spaces after the sample name are what has always been emitted)
        title = "{}  {} Accuracy, {}".format(sample.upper(), cat_name, where)
        scatter(acc_tsv, acc_png, title, "-0.01", "1.01")

        # flatten to max f1 tsv and plot as bars
        f1_tsv = plot_path("f1bar", label, ".tsv")
        f1_png = plot_path("f1bar", label, ".png")
        f1_pr_tsv = plot_path("f1pr", label, ".tsv")
        f1_pr_png = plot_path("f1pr", label, ".png")
        f1_qual_tsv = plot_path("f1qual", label, ".tsv")
        f1_qual_png = plot_path("f1qual", label, ".png")

        make_max_f1_tsv(acc_tsv, f1_tsv, f1_pr_tsv, f1_qual_tsv, options)
        barchart(f1_tsv, f1_png, title, "Max F1")
        scatter(f1_pr_tsv, f1_pr_png, title)
        barchart(f1_qual_tsv, f1_qual_png, title, "Quality for Max F1")

        if options.top is True:
            # zoomed accuracy curves: top 20, top 20 (small inset), top 40
            scatter(acc_tsv, acc_png.replace(".png", "_top20.png"), title, "0.798", "1.002")
            scatter(acc_tsv, acc_png.replace(".png", "_top20_inset.png"), title, "0.796", "1.004",
                    width="11", height="5.5")
            scatter(acc_tsv, acc_png.replace(".png", "_top40.png"), title, "0.596", "1.004")
            # max f1 bars starting at .5, .6, .7 and .85
            for floor, suffix in (("0.5", "_top50.png"), ("0.6", "_top60.png"),
                                  ("0.7", "_top70.png"), ("0.85", "_top85.png")):
                barchart(f1_tsv, f1_png.replace(".png", suffix), title, "Max F1", floor)
            # f1pr scatters showing the top .25, .50 and .65
            for lo, suffix in (("0.746", "_top25.png"), ("0.496", "_top50.png"),
                               ("0.646", "_top65.png")):
                scatter(f1_pr_tsv, f1_pr_png.replace(".png", suffix), title, lo, "1.004")
 def out_base_path(tag, label, extension):
     """ Build the path of one output file and make sure its directory exists.

     Relies on out_dir, out_name, sample and region from the surrounding scope.
     """
     # every .tsv goes to a shared "tsv" directory, anything else to a
     # directory named after the tag
     subdir = "tsv" if extension == ".tsv" else tag
     stem = "-".join(out_name.split("-")[:-1]) + "-{}-{}-".format(sample, tag) + region
     path = os.path.join(out_dir, subdir, stem) + "_" + label + extension
     robust_makedirs(os.path.dirname(path))
     return path
def plot_vcf_comp(tsv_path, options):
    """ Turn the big vcf compare table into precision / recall plots.

    The tsv file name is expected to end in "-<region>-<sample>".  Each
    TOT / SNP / INDEL precision-recall column pair listed by
    vcf_dist_header() is extracted from tsv_path with awk, reduced by
    make_max_f1_tsv() and plotted with scripts/scatter.py and
    scripts/barchart.py under options.comp_dir/comp_plots.  Extra zoomed-in
    plots are produced when options.top is True.
    """
    out_dir = os.path.join(options.comp_dir, "comp_plots")
    robust_makedirs(out_dir)
    out_name = os.path.basename(os.path.splitext(tsv_path)[0])
    tokens = out_name.split("-")
    sample = tokens[-1].upper()
    region = tokens[-2].upper()
    prefix = "-".join(tokens[:-1])
    params = " ".join(PLOT_PARAMS)

    def out_file(tag, label, extension):
        """ output path for (tag, label, extension); creates the directory """
        folder = "tsv" if extension == ".tsv" else tag
        base = os.path.join(out_dir, folder, prefix + "-{}-{}-".format(sample, tag) + region)
        full = base + "_" + label + extension
        robust_makedirs(os.path.dirname(full))
        return full

    def scatter_cmd(tsv, png, title, limits=None, size=("18", "9")):
        """ scatter.py command line; limits is an optional (min, max) for both axes """
        cmd = ('scripts/scatter.py {} --save {} --title "{}" --x_label "Recall" '
               '--y_label "Precision" --width {} --height {} {} --lines --no_n '
               '--line_width 1.5 --marker_size 5').format(tsv, png, title, size[0], size[1], params)
        if limits is None:
            return cmd
        return cmd + " --min_x {0} --max_x {1} --min_y {0} --max_y {1}".format(*limits)

    def bar_cmd(tsv, png, title, y_label, minimum=None):
        """ barchart.py command line; minimum is an optional --min value """
        cmd = ('scripts/barchart.py {} --ascending --no_n --save {} --title "{}" '
               '--x_sideways --x_label "Graph" --y_label "{}" {}').format(tsv, png, title, y_label, params)
        if minimum is None:
            return cmd
        return cmd + " --min {}".format(minimum)

    def execute(cmd):
        """ print the command, then hand it to the shell """
        print(cmd)
        os.system(cmd)

    # precision recall scatter plot
    # strip qual (last column); it sits right after the stripped header
    header = vcf_dist_header(options)[:-1]
    qual_idx = len(header)
    for pair in range(len(header) // 2):
        prec_idx = 2 * pair
        rec_idx = prec_idx + 1
        print("{} {} {} {}".format(prec_idx, header[prec_idx], rec_idx, header[rec_idx]))
        ptoks = header[prec_idx].split("-")
        rtoks = header[rec_idx].split("-")
        assert ptoks[1] == "Precision"
        assert rtoks[1] == "Recall"
        assert ptoks[:1] == rtoks[:1]
        comp_cat = ptoks[0]
        if comp_cat not in ["TOT", "SNP", "INDEL"]:
            continue

        label = header[prec_idx].replace("Precision", "acc")
        acc_tsv = out_file("pr", label, ".tsv")
        print("Make {} tsv with cols {} {}".format(label, rec_idx, prec_idx))
        # +1 to convert to awk 1-base coordinates. +1 again since header doesnt include row_label col
        columns = (rec_idx + 2, prec_idx + 2, qual_idx + 2)
        awk_print = 'if (NR!=1) print $1 "\t" ${} "\t" ${} "\t" ${}'.format(*columns)
        run("{} {} > {}".format("awk '{" + awk_print + "}'", tsv_path, acc_tsv))
        acc_png = out_file("pr", label, ".png")

        # title keeps the historical double space after the sample name
        title = sample.upper() + " "
        title += " Total Accuracy" if comp_cat == "TOT" else " {} Accuracy".format(comp_cat.title())
        title += ", all regions" if region == "TOTAL" else ", {}".format(region)
        execute(scatter_cmd(acc_tsv, acc_png, title, ("-0.01", "1.01")))

        # flatten to max f1 tsv and plot as bars
        f1_tsv = out_file("f1bar", label, ".tsv")
        f1_png = out_file("f1bar", label, ".png")
        f1_pr_tsv = out_file("f1pr", label, ".tsv")
        f1_pr_png = out_file("f1pr", label, ".png")
        f1_qual_tsv = out_file("f1qual", label, ".tsv")
        f1_qual_png = out_file("f1qual", label, ".png")
        make_max_f1_tsv(acc_tsv, f1_tsv, f1_pr_tsv, f1_qual_tsv, options)

        commands = [
            bar_cmd(f1_tsv, f1_png, title, "Max F1"),
            scatter_cmd(f1_pr_tsv, f1_pr_png, title),
            bar_cmd(f1_qual_tsv, f1_qual_png, title, "Quality for Max F1"),
        ]
        if options.top is True:
            commands += [
                # top 20
                scatter_cmd(acc_tsv, acc_png.replace(".png", "_top20.png"), title, ("0.798", "1.002")),
                # top 20 (inset sized)
                scatter_cmd(acc_tsv, acc_png.replace(".png", "_top20_inset.png"), title,
                            ("0.796", "1.004"), size=("11", "5.5")),
                # top 40
                scatter_cmd(acc_tsv, acc_png.replace(".png", "_top40.png"), title, ("0.596", "1.004")),
                # top .5 / .6 / .7 / .85 bars
                bar_cmd(f1_tsv, f1_png.replace(".png", "_top50.png"), title, "Max F1", "0.5"),
                bar_cmd(f1_tsv, f1_png.replace(".png", "_top60.png"), title, "Max F1", "0.6"),
                bar_cmd(f1_tsv, f1_png.replace(".png", "_top70.png"), title, "Max F1", "0.7"),
                bar_cmd(f1_tsv, f1_png.replace(".png", "_top85.png"), title, "Max F1", "0.85"),
                # top .25 / .50 / .65 f1pr scatters
                scatter_cmd(f1_pr_tsv, f1_pr_png.replace(".png", "_top25.png"), title, ("0.746", "1.004")),
                scatter_cmd(f1_pr_tsv, f1_pr_png.replace(".png", "_top50.png"), title, ("0.496", "1.004")),
                scatter_cmd(f1_pr_tsv, f1_pr_png.replace(".png", "_top65.png"), title, ("0.646", "1.004")),
            ]
        for cmd in commands:
            execute(cmd)
def compute_linear_variants(job, input_gam, options):
    """ project to bam, then run samtools to call some variants

    The alignment in input_gam is surjected onto the "ref" path of its
    graph, variants are called on the resulting bam with samtools mpileup /
    bcftools call, and a vg graph is constructed from the compressed vcf.
    Nothing is done if scripts/vgHasPath.sh does not report a "ref" path.

    A stage is re-run when its output file is missing or when the stage
    before it was re-run; options.overwrite forces all of them.

    job is not used in this function.
    """

    # Move to the appropriate working directory from wherever Toil dropped us
    os.chdir(options.cwd)

    input_graph_path = graph_path(input_gam, options)
    input_index_path = index_path(input_graph_path, options)

    # can only do this if there is a "ref" path in the vg graph
    res_path = temp_path(options)
    run("scripts/vgHasPath.sh {} {} > {}".format(input_graph_path, "ref", res_path))
    with open(res_path) as res_file:
        # read(1) gives "" for an empty file, so a script that wrote nothing
        # counts as "no ref path" instead of raising IndexError
        has_ref = res_file.read(1) == "1"
    run("rm {}".format(res_path))

    if not has_ref:
        return

    surject_path = projected_bam_path(input_gam, options)
    out_vcf_path = linear_vcf_path(input_gam, options)
    out_vg_path = linear_vg_path(input_gam, options)
    fasta_path = ref_path(input_gam, options)
    do_surject = options.overwrite or not os.path.isfile(surject_path)
    do_vcf = do_surject or not os.path.isfile(out_vcf_path + ".gz")
    do_vg = do_vcf or not os.path.isfile(out_vg_path)

    if do_surject:
        robust_makedirs(os.path.dirname(surject_path))
        # scratch name handed to samtools sort (".prefix" lands in
        # temp_path's prefix argument)
        prefix_path = temp_path(options, ".prefix")
        # surject to reference path (name hardcoded to ref for now)
        run("vg surject -d {} -p {} -b {} -t {} | samtools sort -o - {}> {}".format(
            input_index_path,
            "ref",
            input_gam,
            options.vg_cores,
            prefix_path,
            surject_path),
            timeout_sec=options.timeout,
            timeout_dep=surject_path)
        run("rm -f {}".format(prefix_path))

    if do_vcf:
        # todo: we assume that all graphs have same reference fasta, here.
        # this is false for, ex, simons which uses grchg37 instead of 38.

        # create pileup in bcf using samtools
        # http://samtools.sourceforge.net/mpileup.shtml
        assert os.path.isfile(fasta_path)
        robust_makedirs(os.path.dirname(out_vcf_path))
        run("samtools mpileup -I -u -t DP -f {} {} | bcftools call -m -V indels - > {}".format(
            fasta_path,
            surject_path,
            out_vcf_path))

        # make compressed index
        run("bgzip -f {}".format(out_vcf_path))
        run("tabix -f -p vcf {}.gz".format(out_vcf_path))

    if do_vg:
        # and convert back to vg...
        robust_makedirs(os.path.dirname(out_vg_path))
        run("vg construct -v {}.gz -r {} -t {} > {}".format(out_vcf_path, fasta_path,
                                                            options.vg_cores, out_vg_path))
def plot_vcf_comp(tsv_path, options):
    """ Make precision / recall and max-F1 plots for the big vcf compare table.

    The region is the last "-" separated token of the tsv file name.  Each
    TOT / SNP / INDEL precision-recall column pair from vcf_dist_header()
    is cut out of tsv_path with awk, flattened by make_max_f1_tsv() and
    plotted with scripts/scatter.py and scripts/barchart.py.  All files are
    written directly into options.comp_dir/comp_plots; zoomed-in accuracy
    plots are added when options.top is True.
    """
    out_dir = os.path.join(options.comp_dir, "comp_plots")
    robust_makedirs(out_dir)
    out_name = os.path.basename(os.path.splitext(tsv_path)[0])
    name_toks = out_name.split("-")
    region = name_toks[-1].upper()

    # path stems; the label and extension are appended per category below
    acc_base = os.path.join(out_dir, out_name)
    f1_base = os.path.join(out_dir, "-".join(name_toks[:-1]) + "--f1-" + region)
    f1_pr_base = f1_base.replace("-f1-", "-f1--pr-")
    f1_qual_base = f1_base.replace("-f1-", "-f1-qual-")

    params = " ".join(PLOT_PARAMS)

    def launch(cmd):
        """ echo a plotting command, then run it in the shell """
        print(cmd)
        os.system(cmd)

    def scatter(tsv, png, title, lo, hi, width="18", height="9"):
        """ recall / precision scatter plot with both axes clipped to [lo, hi] """
        launch(('scripts/scatter.py {} --save {} --title "{}" --x_label "Recall" '
                '--y_label "Precision" --width {} --height {} {} --lines --no_n '
                '--line_width 1.5 --marker_size 5 '
                '--min_x {} --max_x {} --min_y {} --max_y {}').format(
                    tsv, png, title, width, height, params, lo, hi, lo, hi))

    def barchart(tsv, png, title, y_label, extra=""):
        """ per-graph bar chart; extra is appended verbatim to the command """
        launch(('scripts/barchart.py {} --save {} --title "{}" --x_sideways '
                '--x_label "Graph" --y_label "{}" {}').format(tsv, png, title, y_label, params) + extra)

    # precision recall scatter plot
    # strip qual (last column); it sits right after the stripped header
    header = vcf_dist_header(options)[:-1]
    qual_idx = len(header)
    for prec_idx in range(0, 2 * (len(header) // 2), 2):
        rec_idx = prec_idx + 1
        prec_name = header[prec_idx]
        rec_name = header[rec_idx]
        print("{} {} {} {}".format(prec_idx, prec_name, rec_idx, rec_name))
        ptoks = prec_name.split("-")
        rtoks = rec_name.split("-")
        assert ptoks[1] == "Precision"
        assert rtoks[1] == "Recall"
        assert ptoks[:1] == rtoks[:1]
        comp_cat = ptoks[0]
        if comp_cat not in ["TOT", "SNP", "INDEL"]:
            continue

        label = prec_name.replace("Precision", "acc")
        suffix = "_" + label
        acc_tsv = acc_base + suffix + ".tsv"
        print("Make {} tsv with cols {} {}".format(label, rec_idx, prec_idx))
        # +1 to convert to awk 1-base coordinates. +1 again since header doesnt include row_label col
        awk_body = 'if (NR!=1) print $1 "\t" ${} "\t" ${} "\t" ${}'.format(
            rec_idx + 2, prec_idx + 2, qual_idx + 2)
        awk_prog = "awk '{" + awk_body + "}'"
        run("{} {} > {}".format(awk_prog, tsv_path, acc_tsv))
        acc_png = acc_base + suffix + ".png"

        cat_name = "Total" if comp_cat == "TOT" else comp_cat
        title = "VCF {} Accuracy for {}".format(cat_name, region)
        scatter(acc_tsv, acc_png, title, "-0.01", "1.01")

        # flatten to max f1 tsv and plot as bars
        f1_tsv = f1_base + suffix + ".tsv"
        f1_png = f1_base + suffix + ".png"
        f1_pr_tsv = f1_pr_base + suffix + ".tsv"
        f1_pr_png = f1_pr_base + suffix + ".png"
        f1_qual_tsv = f1_qual_base + suffix + ".tsv"
        f1_qual_png = f1_qual_base + suffix + ".png"

        make_max_f1_tsv(acc_tsv, f1_tsv, f1_pr_tsv, f1_qual_tsv, options)
        barchart(f1_tsv, f1_png, title, "Max F1")
        scatter(f1_pr_tsv, f1_pr_png, title, "-0.01", "1.01")
        barchart(f1_qual_tsv, f1_qual_png, title, "Quality for Max F1", " --max 20")

        if options.top is True:
            # top 20
            scatter(acc_tsv, acc_png.replace(".png", "_top20.png"), title, "0.798", "1.002")
            # top 20 (inset sized)
            scatter(acc_tsv, acc_png.replace(".png", "_top20_inset.png"), title, "0.796", "1.004",
                    width="11", height="5.5")
            # top 40
            scatter(acc_tsv, acc_png.replace(".png", "_top40.png"), title, "0.596", "1.004")
def compute_snp1000g_baseline(job, input_gam, platinum, filter_indels, options):
    """ Build the sample graph for input_gam's sample and region by filtering
    the 1000 genomes vcf (or the per-sample platinum vcf when platinum is
    True) and constructing a vg graph from the result.

    Only alignments tagged "refonly" are processed, and in platinum mode only
    samples listed in options.platinum_samples.  With filter_indels True the
    vcf is first passed through scripts/vcfFilterIndels.py.  Existing outputs
    are reused unless options.overwrite is set.
    """

    # Move to the appropriate working directory from wherever Toil dropped us
    os.chdir(options.cwd)

    # there is only one g1vcf graph per region per sample
    # this function is also going to get called once for each graph type
    # so we hack here to only run on refonly graphs (arbitrary choice)
    if alignment_graph_tag(input_gam, options) != "refonly":
        return

    sample = alignment_sample_tag(input_gam, options)
    if platinum is True and sample not in options.platinum_samples.split(","):
        return

    region_name = alignment_region_tag(input_gam, options).upper()
    if platinum is not False:
        g1kvcf_path = os.path.join(options.platinum_path, sample, region_name + ".vcf")
    else:
        g1kvcf_path = os.path.join(options.g1kvcf_path, region_name + ".vcf")
    # the bed file always comes from the g1kvcf directory
    g1kbed_path = os.path.join(options.g1kvcf_path, region_name + ".bed")
    filter_vcf_path = g1k_vcf_path(input_gam, platinum, filter_indels, options)
    filter_fa_path = g1k_fa_path(input_gam, platinum, filter_indels, options)
    filter_vg_path = g1k_vg_path(input_gam, platinum, filter_indels, options)
    fasta_path = options.chrom_fa_path

    do_filter = options.overwrite or not os.path.isfile(filter_vcf_path + ".gz")
    do_construct = do_filter or not os.path.isfile(filter_vg_path)

    # make sure we're dealing with a sample that's in the vcf
    if do_filter or do_construct:
        counter = subprocess.Popen("grep {} {} | wc -l".format(sample, g1kvcf_path),
                                   shell=True, stdout=subprocess.PIPE, stderr=sys.stderr, bufsize=-1)
        hit_count = counter.communicate()[0]
        assert counter.wait() == 0
        if int(hit_count) == 0:
            # sample never appears in the vcf: nothing to build
            do_filter = do_construct = False

    # make filtered compressed vcf for this sample
    if do_filter:
        robust_makedirs(os.path.dirname(filter_vcf_path))
        filter_input_path = g1kvcf_path
        if filter_indels is True:
            # the sample filter then reads the indel-filtered copy instead
            filter_input_path = filter_vcf_path + ".in"
            run("scripts/vcfFilterIndels.py {} > {}".format(g1kvcf_path, filter_input_path),
                fail_hard=True)
        sample_filter_cmd = "scripts/vcfFilterSample.py {} {} {} {} {}".format(
            filter_input_path, fasta_path, sample, filter_vcf_path, filter_fa_path)
        run(sample_filter_cmd, fail_hard=True)
        # sort in place via a .sort scratch file (not run with fail_hard)
        run("scripts/vcfsort {0} > {0}.sort ; mv {0}.sort {0}".format(filter_vcf_path))
        run("bgzip -f {}".format(filter_vcf_path), fail_hard=True)
        run("tabix -f -p vcf {}.gz".format(filter_vcf_path), fail_hard=True)

    # load it into a vg graph
    if do_construct:
        with open(g1kbed_path) as bed_file:
            bed_fields = bed_file.readline().split()
            # convert from bed to vcf coordinates by adding one to start
            chrom, start, end = bed_fields[0], int(bed_fields[1]) + 1, int(bed_fields[2])
            construct_cmd = "vg construct -v {}.gz -r {} -t {} -R {}:{}-{} > {}".format(
                filter_vcf_path, filter_fa_path, options.vg_cores,
                chrom, start, end, filter_vg_path)
            run(construct_cmd, fail_hard=True)
def _merge_vcfs(input_paths, out_path):
    """ concatenate the bgzipped vcfs in input_paths into out_path with
    "vt cat", sort out_path in place with vcfsort, then write a bgzipped
    copy to out_path.gz and tabix-index that copy.  The uncompressed
    out_path is left on disk.  Every command is run with fail_hard = True.
    """
    run("vt cat {} > {}".format(" ".join(input_paths), out_path), fail_hard=True)
    # vcfsort writes to stdout, so sort into a side file and move it back
    run("vcfsort {} > {}.sort".format(out_path, out_path), fail_hard=True)
    run("mv {}.sort {}".format(out_path, out_path), fail_hard=True)
    run("bgzip -c {} > {}.gz".format(out_path, out_path), fail_hard=True)
    run("tabix -f -p vcf {}.gz".format(out_path), fail_hard=True)


def _merge_classic_calls(call_dir):
    """ merge calls laid out as call_dir/SAMPLE/REGION.vcf.  For every
    sample directory, each recognised region vcf is sorted, bgzipped (to
    REGION.vcf.gz, leaving REGION.vcf untouched) and indexed, and the
    results are merged into SAMPLE/TOTAL.vcf (+ TOTAL.vcf.gz and index).
    Sample directories without any recognised region vcf are left alone.
    """
    # only these file names are merged; this also keeps a TOTAL.vcf left
    # by a previous run out of the inputs
    region_vcf_names = ("BRCA1.vcf", "BRCA2.vcf", "SMA.vcf", "LRC_KIR.vcf", "MHC.vcf")

    for sampledir in glob.glob(os.path.join(call_dir, "*")):
        if not os.path.isdir(sampledir):
            continue
        vcfs = []
        for vcf in glob.glob(os.path.join(sampledir, "*.vcf")):
            if os.path.basename(vcf) not in region_vcf_names:
                continue
            run("vcfsort {} > {}.sort".format(vcf, vcf), fail_hard=True)
            run("bgzip -c {}.sort > {}.gz".format(vcf, vcf), fail_hard=True)
            # the sorted intermediate is only needed to produce the .gz
            run("rm -f {}.sort".format(vcf))
            run("tabix -f -p vcf {}.gz".format(vcf), fail_hard=True)
            vcfs.append("{}.gz".format(vcf))
        if vcfs:
            _merge_vcfs(vcfs, os.path.join(sampledir, "TOTAL.vcf"))


def main(args):
    """ merge per-region vcf calls into one vcf per sample.

    args is handed to parse_args(); the resulting options are read for
    classic, call_dir and name.

    If options.classic is set, the input layout is
    call_dir/SAMPLE/REGION.vcf and a TOTAL.vcf is written into each
    sample directory.

    Otherwise the layout is call_dir/<REGION>/<GRAPH>/<SAMPLE>_sample.vcf.
    Only graphs that have a directory in every region found are used, and
    only samples with a vcf in every region for a given graph are merged.
    Output goes to call_dir/<name>/<GRAPH>/<SAMPLE>_sample.vcf (plus .gz
    and tabix index), with the sorted and compressed per-region inputs
    kept under call_dir/<name>/<GRAPH>/input/<SAMPLE>/.

    Always returns 0.
    """
    options = parse_args(args)

    RealTimeLogger.start_master()

    if options.classic:
        # expect call_dir/SAMPLE/region.vcf
        _merge_classic_calls(options.call_dir)
        return 0

    # expect call_dir/<REGION>/<GRAPH>/<SAMPLE>_sample.vcf

    # count up regions
    # avoid crufty directories (including outputs of previous runs of this
    # script, which are written inside call_dir) by whitelisting names
    known_regions = ("brca1", "brca2", "mhc", "lrc_kir", "sma")
    regions = set()
    for regiondir in glob.glob(os.path.join(options.call_dir, "*")):
        region = os.path.basename(regiondir)
        if os.path.isdir(regiondir) and region in known_regions:
            regions.add(region)

    # single-argument print(...) prints the same under Python 2 as the
    # print statement does
    print(regions)

    # count up graphs (that are present in every region)
    gcount = defaultdict(int)
    for region in regions:
        for graphdir in glob.glob(os.path.join(options.call_dir, region, "*")):
            if os.path.isdir(graphdir):
                gcount[os.path.basename(graphdir)] += 1
    graphs = set(graph for graph, count in gcount.items() if count == len(regions))

    print(graphs)

    # count up samples: anything with a vcf in at least one region / graph
    samples = set()
    for region in regions:
        for graph in graphs:
            for vcf in glob.glob(os.path.join(options.call_dir, region, graph, "*_sample.vcf")):
                # sample name is the part of the file name before the first "_"
                samples.add(os.path.basename(vcf).split("_")[0])

    print(samples)

    # make our output directory
    out_dir = os.path.join(options.call_dir, options.name)
    robust_makedirs(out_dir)

    for graph in graphs:
        for sample in samples:
            sample_vcf_name = "{}_sample.vcf".format(sample)

            vcf_files = []
            for region in regions:
                vcf = os.path.join(options.call_dir, region, graph, sample_vcf_name)
                if os.path.isfile(vcf):
                    vcf_files.append((region, vcf))

            # this sample doesn't span all regions, skip it
            if len(vcf_files) < len(regions):
                print("Skipping Sample {} for Graph {}".format(sample, graph))
                continue

            # output vcf
            merge_vcf_path = os.path.join(out_dir, graph, sample_vcf_name)

            # working directory for intermediates / debugging
            work_path = os.path.join(out_dir, graph, "input", sample)
            robust_makedirs(work_path)

            # preprocess all the vcfs and leave in input dir
            input_files = []
            for region, vcf in vcf_files:
                outbase = os.path.join(work_path, region)
                run("vcfsort {} > {}.vcf".format(vcf, outbase), fail_hard=True)
                # the merge below reads the .gz files, so a failure here is
                # reported at its source rather than as a missing input later
                run("bgzip -f {}.vcf".format(outbase), fail_hard=True)
                run("tabix -f -p vcf {}.vcf.gz".format(outbase), fail_hard=True)
                input_files.append("{}.vcf.gz".format(outbase))

            # run the merge, then sort / compress / index the result
            _merge_vcfs(input_files, merge_vcf_path)

    return 0