def make_trio_vcfs(vcfmap, options):
    """Filter, normalise and merge per-sample vcfs into one vcf per graph.

    vcfmap is walked as region -> sample -> graph -> vcf path.  The text in
    options.ped is written to <options.out_dir>/predigree.ped.  For each kind
    of output ("snp", "indel", "all") every input vcf is passed through
    ``bcftools view -f PASS,.`` (with ``-R options.clip`` when clip is set)
    and ``bcftools norm -f options.chrom_fa_path``, then bgzipped and
    tabix-indexed.  Graphs with at least three samples are merged with
    ``rtg vcfmerge``; graphs with fewer samples are left out of the result.

    Returns a dict: region -> graph -> kind -> path of the merged vcf.
    """
    robust_makedirs(options.out_dir)
    ped_file = os.path.join(options.out_dir, "predigree.ped")
    with open(ped_file, "w") as f:
        f.write(options.ped + "\n")

    # bcftools view variant-type selection for each kind of output.  Looking
    # the flag up by value replaces the former `kind is "snp"` identity test,
    # which only works when both strings happen to be the same object.
    type_flags = {"snp": "-v snps,mnps", "indel": "-V snps,mnps", "all": ""}

    mergetable = dict()

    for region, rd in vcfmap.items():
        mergetable[region] = dict()
        region_dir = os.path.join(options.out_dir, "trio_vcfs", region)
        robust_makedirs(region_dir)

        # round up all samples for each graph
        bygraph = dict()
        for sample, sd in rd.items():
            for graph, pvcf in sd.items():
                bygraph.setdefault(graph, dict())[sample] = pvcf

        # make a merged vcf for each graph
        for graph, sd in bygraph.items():
            # these directories are shared by every sample of the graph, so
            # they only need to be set up once per graph
            work_dir = os.path.join(region_dir, "input_vcf")
            merge_dir = os.path.join(region_dir, "merged_vcf")
            robust_makedirs(work_dir)
            robust_makedirs(merge_dir)

            input_vcfs = {"snp": [], "indel": [], "all": []}
            for sample, pvcf in sd.items():
                for kind in input_vcfs:
                    filter_vcf = os.path.join(
                        work_dir, "{}_{}_{}.vcf".format(graph, sample, kind))
                    vstr = type_flags[kind]
                    if options.clip is not None:
                        vstr += " -R {}".format(options.clip)
                    run("bcftools view {} -f PASS,. {} | bcftools norm - -f {} > {}"
                        .format(pvcf, vstr, options.chrom_fa_path, filter_vcf))
                    run("bgzip -f {}".format(filter_vcf))
                    run("tabix -f -p vcf {}.gz".format(filter_vcf))
                    input_vcfs[kind].append("{}.gz".format(filter_vcf))

            # only merge when there are at least three samples and every kind
            # has exactly one filtered vcf per sample
            n_samples = len(sd)
            complete = all(len(paths) == n_samples
                           for paths in input_vcfs.values())
            if n_samples >= 3 and complete:
                mergetable[region][graph] = dict()
                for kind in input_vcfs:
                    out_vcf = os.path.join(
                        merge_dir, "{}_{}_merged.vcf.gz".format(graph, kind))
                    # remove any stale output before rtg writes the new one
                    run("rm -f {}".format(out_vcf))
                    run("rtg vcfmerge {} -o {}".format(
                        " ".join(input_vcfs[kind]), out_vcf),
                        fail_hard=True)

                    mergetable[region][graph][kind] = out_vcf

    return mergetable
def make_trio_vcfs(vcfmap, options):
    """Merge the samples of each graph into one vcf with rtg.

    vcfmap is walked as region -> sample -> graph -> vcf path.  The pedigree
    text options.ped is written to <options.out_dir>/predigree.ped.  Each
    input vcf is filtered three ways ("snp", "indel", "all") with bcftools
    view, normalised with bcftools norm against options.chrom_fa_path,
    bgzipped and tabix-indexed.  When a graph has at least three samples the
    filtered files of each kind are merged with ``rtg vcfmerge``.

    Returns an index of the merged files: region -> graph -> kind -> path.
    """
    robust_makedirs(options.out_dir)
    ped_file = os.path.join(options.out_dir, "predigree.ped")
    with open(ped_file, "w") as f:
        f.write(options.ped + "\n")

    mergetable = dict()

    for region, rd in vcfmap.items():
        mergetable[region] = dict()
        region_dir = os.path.join(options.out_dir, "trio_vcfs", region)
        robust_makedirs(region_dir)
        # round up all samples for each graph
        bygraph = dict()
        for sample, sd in rd.items():
            for graph, pvcf in sd.items():
                if graph not in bygraph:
                    bygraph[graph] = dict()
                bygraph[graph][sample] = pvcf

        # make a merged vcf for each graph
        for graph, sd in bygraph.items():
            input_vcfs = {"snp": [], "indel": [], "all": []}
            for sample, pvcf in sd.items():
                work_dir = os.path.join(region_dir, "input_vcf")
                merge_dir = os.path.join(region_dir, "merged_vcf")
                robust_makedirs(work_dir)
                robust_makedirs(merge_dir)
                for kind in input_vcfs:
                    filter_vcf = os.path.join(
                        work_dir, "{}_{}_{}.vcf".format(graph, sample, kind))
                    # strings must be compared with ==; `is` checks object
                    # identity and is not guaranteed to hold for equal strings
                    if kind == "snp":
                        vstr = "-v snps,mnps"
                    elif kind == "indel":
                        vstr = "-V snps,mnps"
                    else:
                        vstr = ""
                    if options.clip is not None:
                        vstr += " -R {}".format(options.clip)
                    run("bcftools view {} -f PASS,. {} | bcftools norm - -f {} > {}".format(
                        pvcf, vstr, options.chrom_fa_path, filter_vcf))
                    run("bgzip -f {}".format(filter_vcf))
                    run("tabix -f -p vcf {}.gz".format(filter_vcf))
                    input_vcfs[kind].append("{}.gz".format(filter_vcf))

            # need a full trio (or more), with one filtered vcf per sample
            # for every kind, before merging
            sample_count = len(sd)
            if sample_count >= 3 and \
               len(input_vcfs["all"]) == sample_count and \
               len(input_vcfs["snp"]) == sample_count and \
               len(input_vcfs["indel"]) == sample_count:

                mergetable[region][graph] = dict()
                for kind in input_vcfs:
                    out_vcf = os.path.join(
                        merge_dir, "{}_{}_merged.vcf.gz".format(graph, kind))
                    # clear any stale output before rtg writes the new one
                    run("rm -f {}".format(out_vcf))
                    run("rtg vcfmerge {} -o {}".format(
                        " ".join(input_vcfs[kind]), out_vcf), fail_hard=True)

                    mergetable[region][graph][kind] = out_vcf

    return mergetable
def trio_stats(sample_vcf, filter_xref, ignore_genotype, options):
    """Compute trio concordance statistics with scripts/trioConcordance.py.

    The trio is hardcoded: sample_vcf must belong to NA12878, and the other
    two vcf paths are derived by replacing "NA12878" in sample_vcf with
    "NA12879" (used as the child) and "NA12877" (used as the second parent).

    When filter_xref is True, lines containing XREF are first removed from
    each vcf that exists (into a temporary directory).  ignore_genotype adds
    the -g flag to the concordance script.

    Returns a dict with integer "GOOD" and "BAD" and float "RATIO", taken
    from the first three tokens of the script output.  If anything goes wrong
    the values are -1, -1 and -1.0.
    """
    # we are hardcoding trio information here
    # compare by value: `is` tests object identity, which is not guaranteed
    # to hold for two equal strings (e.g. one parsed from the command line)
    assert options.sample == "NA12878"
    child = sample_vcf.replace("NA12878", "NA12879")
    p1 = sample_vcf
    p2 = sample_vcf.replace("NA12878", "NA12877")

    out_base = tempfile.mkdtemp(prefix="callStats_", dir=".")

    try:
        if filter_xref is True:
            sys.stderr.write("Filtering {}\n".format(sample_vcf))
            filtered = []
            for src, name in ((child, "child_filter.vcf"),
                              (p1, "p1_filter.vcf"),
                              (p2, "p2_filter.vcf")):
                dest = os.path.join(out_base, name)
                if os.path.isfile(src):
                    os.system("grep -v XREF {} > {}".format(src, dest))
                # the filtered path is used even if the source was missing;
                # the concordance step below then fails and reports -1
                filtered.append(dest)
            child, p1, p2 = filtered

        trio_res = os.path.join(out_base, "ts.txt")

        try:
            ig = "-g" if ignore_genotype else ""
            sys.stderr.write("\nscripts/trioConcordance.py {} {} {} {} > {}\n".format(child, p1, p2, ig, trio_res))
            run("scripts/trioConcordance.py {} {} {} {} > {}".format(child, p1, p2, ig, trio_res))

            with open(trio_res) as f:
                toks = f.readline().split()
            res = toks[0:3]
            sys.stderr.write(" === {}\n".format(str(res)))

            ts = dict()
            ts["GOOD"] = int(res[0])
            ts["BAD"] = int(res[1])
            ts["RATIO"] = float(res[2])
        except Exception:
            # best effort: report sentinel values rather than aborting, but
            # let KeyboardInterrupt / SystemExit propagate
            sys.stderr.write("trio concordance failed for {} {} {}".format(child, p1, p2))
            ts = dict()
            ts["GOOD"] = -1
            ts["BAD"] = -1
            ts["RATIO"] = -1.0
    finally:
        # always remove the scratch directory, even when interrupted
        os.system("rm -rf {}".format(out_base))

    return ts
def plot_kmer_comp(tsv_path, options):
    """Plot a kmer comparison table.

    Two plots are written to <options.comp_dir>/comp_plots, named after the
    input table: a boxplot built from columns 1 and 2 (titled as the Jaccard
    index) and a scatter plot built from columns 1, 4 and 3 (recall on the x
    axis, precision on the y axis).  The region used in the titles is the
    second-to-last "-"-separated token of the table's base name.
    """
    plot_dir = os.path.join(options.comp_dir, "comp_plots")
    robust_makedirs(plot_dir)
    stem = os.path.basename(os.path.splitext(tsv_path)[0])
    prefix = os.path.join(plot_dir, stem)
    name_tokens = stem.split("-")
    region = name_tokens[-2].upper()

    extra_args = " ".join(PLOT_PARAMS)

    # jaccard boxplot: drop the header row, keep name + column 2
    jac_tsv = prefix + "_jac.tsv"
    jac_png = prefix + "_jac.png"
    jac_awk = '''awk '{if (NR!=1) print $1 "\t" $2}' '''
    run("{} {} > {}".format(jac_awk, tsv_path, jac_tsv))
    boxplot_cmd = (
        'scripts/boxplot.py {} --save {} --title "{} KMER Set Jaccard" '
        '--x_label "Graph" --y_label "Jaccard Index" --x_sideways {}'
    ).format(jac_tsv, jac_png, region, extra_args)
    run(boxplot_cmd)

    # precision recall scatter plot: name, column 4 as x, column 3 as y
    acc_tsv = prefix + "_acc.tsv"
    acc_png = prefix + "_acc.png"
    acc_awk = '''awk '{if (NR!=1) print $1 "\t" $4 "\t" $3}' '''
    run("{} {} > {}".format(acc_awk, tsv_path, acc_tsv))
    scatter_cmd = (
        'scripts/scatter.py {} --save {} --title "{} KMER Set Accuracy" '
        '--x_label "Recall" --y_label "Precision" --width 12 --height 9 --lines {}'
    ).format(acc_tsv, acc_png, region, extra_args)
    run(scatter_cmd)
def plot_kmer_comp(tsv_path, options):
    """Make the Jaccard boxplot and precision/recall scatter for a kmer table.

    Output goes to <options.comp_dir>/comp_plots and is named after the input
    table.  The boxplot uses columns 1 and 2 of the table; the scatter plot
    uses column 4 as x (labelled Recall) and column 3 as y (labelled
    Precision).  The region shown in the titles is the last "-"-separated
    token of the table's base name.
    """
    plot_dir = os.path.join(options.comp_dir, "comp_plots")
    robust_makedirs(plot_dir)
    table_name = os.path.basename(os.path.splitext(tsv_path)[0])
    plot_prefix = os.path.join(plot_dir, table_name)
    region = table_name.split("-")[-1].upper()

    plot_args = " ".join(PLOT_PARAMS)

    # --- jaccard boxplot ---
    jac_tsv = "{}_jac.tsv".format(plot_prefix)
    jac_png = "{}_jac.png".format(plot_prefix)
    # skip the header row and keep the name and second column
    extract_jac = '''awk '{if (NR!=1) print $1 "\t" $2}' '''
    run("{} {} > {}".format(extract_jac, tsv_path, jac_tsv))
    run('scripts/boxplot.py {} --save {} --title "{} KMER Set Jaccard" --x_label "Graph" --y_label "Jaccard Index" --x_sideways {}'.format(
        jac_tsv, jac_png, region, plot_args))

    # --- precision recall scatter plot ---
    acc_tsv = "{}_acc.tsv".format(plot_prefix)
    acc_png = "{}_acc.png".format(plot_prefix)
    # skip the header row; column 4 becomes x and column 3 becomes y
    extract_acc = '''awk '{if (NR!=1) print $1 "\t" $4 "\t" $3}' '''
    run("{} {} > {}".format(extract_acc, tsv_path, acc_tsv))
    run('scripts/scatter.py {} --save {} --title "{} KMER Set Accuracy" --x_label "Recall" --y_label "Precision" --width 12 --height 9 --lines {}'.format(
        acc_tsv, acc_png, region, plot_args))
def sompy_stats(sample_vcf, truth_vcf, filter_xref, options):
    """Run som.py on sample_vcf against truth_vcf and collect tp/fp counts.

    When filter_xref is True, lines containing XREF are removed from the
    sample vcf first.  som.py is run with options.chrom_fa_path as reference
    and its <prefix>.stats.csv output is parsed: the row whose second field
    is "type" supplies the positions of the "tp" and "fp" columns, and the
    "SNVs", "indels" and "records" rows supply the counts.  A row that is
    absent from the output counts as zero.

    Returns a dict with integer values for "SNP-TP", "SNP-FP", "INDEL-TP",
    "INDEL-FP", "TOTAL-TP" and "TOTAL-FP".

    Raises RuntimeError if the stats file has no header row.
    """
    out_base = tempfile.mkdtemp(prefix="callStats_", dir=".")

    tp_idx, fp_idx = None, None
    # stats rows of interest, keyed by the value of their second field
    rows = {"indels": None, "SNVs": None, "records": None}

    try:
        if filter_xref is True:
            filter_vcf = os.path.join(out_base, "filter.vcf")
            os.system("grep -v XREF {} > {}".format(sample_vcf, filter_vcf))
            sample_vcf = filter_vcf

        run("som.py {} {} -P -o {} -r {} > /dev/null".format(
            truth_vcf, sample_vcf, os.path.join(out_base, "sp_out"),
            options.chrom_fa_path), fail_hard=True)

        with open(os.path.join(out_base, "sp_out.stats.csv")) as sp_result:
            for line in sp_result:
                # strip the line ending so the last column name still matches
                toks = line.rstrip("\r\n").split(",")
                if len(toks) < 2:
                    continue
                if toks[1] == "type":
                    tp_idx = toks.index("tp")
                    fp_idx = toks.index("fp")
                elif toks[1] in rows:
                    rows[toks[1]] = toks
    finally:
        # remove the scratch directory even if som.py or the parsing failed
        os.system("rm -rf {}".format(out_base))

    if tp_idx is None or fp_idx is None:
        raise RuntimeError(
            "no header row found in som.py stats output for {}".format(sample_vcf))

    def count(row, idx):
        # rows are optional in the output: a missing row means zero calls
        return int(row[idx]) if row is not None else 0

    ret = dict()
    ret["SNP-TP"] = count(rows["SNVs"], tp_idx)
    ret["SNP-FP"] = count(rows["SNVs"], fp_idx)
    ret["INDEL-TP"] = count(rows["indels"], tp_idx)
    ret["INDEL-FP"] = count(rows["indels"], fp_idx)
    ret["TOTAL-TP"] = count(rows["records"], tp_idx)
    ret["TOTAL-FP"] = count(rows["records"], fp_idx)

    return ret
def do_mendel(mergetable, options):
    """Run ``rtg mendelian`` on every merged vcf and tabulate the results.

    mergetable is walked as region -> graph -> kind -> merged vcf path.  For
    each merged vcf the annotated, consistent and inconsistent vcfs plus the
    captured stdout are written under <out_dir>/mendel/<region>/<graph>, and
    the stdout is passed to scrape_mendel to get the concordance value.

    One table per region is written to <out_dir>/mendel-<region>.tsv with the
    columns graph, all, snp, indel.  Rows containing None are left out.
    """
    columns = ["graph", "all", "snp", "indel"]
    mendel_cmd = ("rtg mendelian -l -i {} -t {} --pedigree {} --output {} "
                  "--output-consistent {} --output-inconsistent {} > {}")

    for region, gd in mergetable.items():
        rows = []
        for graph, mergefiles in gd.items():
            annot_dir = os.path.join(options.out_dir, "mendel", region, graph)
            robust_makedirs(annot_dir)

            concordance = dict()
            for kind, mergefile in mergefiles.items():

                def in_annot_dir(pattern):
                    # path of an output file for this kind
                    return os.path.join(annot_dir, pattern.format(kind))

                out_stdout = in_annot_dir("mendel_{}.stdout")
                sdf_path = os.path.join(options.comp_dir, "chrom.sdf")
                ped_path = os.path.join(options.out_dir, "predigree.ped")

                run(mendel_cmd.format(
                    mergefile, sdf_path, ped_path,
                    in_annot_dir("mendel_{}.vcf.gz"),
                    in_annot_dir("consistent_{}.vcf.gz"),
                    in_annot_dir("inconsistent_{}.vcf.gz"),
                    out_stdout))

                concordance[kind] = scrape_mendel(out_stdout)

            # one row per graph, values in the same order as the columns
            rows.append([graph] + [concordance[kind] for kind in columns[1:]])

        # write the tsv for this region
        tsv_path = os.path.join(options.out_dir, "mendel-{}.tsv".format(region))
        with open(tsv_path, "w") as out:
            out.write("\t".join(columns) + "\n")
            for row in rows:
                if None in row:
                    continue
                out.write("\t".join(str(cell) for cell in row) + "\n")
def do_mendel(mergetable, options):
    """Check mendelian consistency of all merged vcfs with ``rtg mendelian``.

    mergetable maps region -> graph -> kind -> merged vcf path.  Outputs of
    each rtg run land in <out_dir>/mendel/<region>/<graph>; the captured
    stdout is handed to scrape_mendel, whose result is recorded per kind.

    For every region a tab-separated table <out_dir>/mendel-<region>.tsv is
    written with a header line and one line per graph (graph, all, snp,
    indel).  Lines that would contain None are skipped.
    """
    header = ["graph", "all", "snp", "indel"]

    for region in mergetable:
        table = []
        for graph, mergefiles in mergetable[region].items():
            annot_dir = os.path.join(options.out_dir, "mendel", region, graph)
            robust_makedirs(annot_dir)

            concordance = dict()
            for kind, mergefile in mergefiles.items():
                # every output of the run is named after the kind of variants
                outputs = [
                    os.path.join(annot_dir, template.format(kind))
                    for template in ("mendel_{}.vcf.gz",
                                     "consistent_{}.vcf.gz",
                                     "inconsistent_{}.vcf.gz",
                                     "mendel_{}.stdout")
                ]
                out_vcf, con_vcf, incon_vcf, out_stdout = outputs

                cmd_args = [
                    mergefile,
                    os.path.join(options.comp_dir, "chrom.sdf"),
                    os.path.join(options.out_dir, "predigree.ped"),
                    out_vcf,
                    con_vcf,
                    incon_vcf,
                    out_stdout,
                ]
                run("rtg mendelian -l -i {} -t {} --pedigree {} --output {} --output-consistent {} --output-inconsistent {} > {}".format(*cmd_args))

                concordance[kind] = scrape_mendel(out_stdout)

            all_c = concordance["all"]
            snp_c = concordance["snp"]
            indel_c = concordance["indel"]
            table.append([graph, all_c, snp_c, indel_c])

        # write the tsv for this region
        region_tsv = os.path.join(options.out_dir, "mendel-{}.tsv".format(region))
        with open(region_tsv, "w") as f:
            lines = ["\t".join(header)]
            lines.extend("\t".join(map(str, row)) for row in table if None not in row)
            for text in lines:
                f.write(text + "\n")
def plot_vcf_comp(tsv_path, options):
    """Make precision/recall and max-F1 plots from the big vcf compare table.

    The sample and region are the last and second-to-last "-"-separated
    tokens of the table's base name.  Column names come from
    vcf_dist_header(options); its last entry (the quality column) is split
    off and the rest is read as (Precision, Recall) pairs.  Only the TOT, SNP
    and INDEL categories are plotted.

    For each category a recall/precision/quality tsv is extracted with awk and
    plotted with scripts/scatter.py; make_max_f1_tsv then produces the tables
    for the max-F1 bar chart, the precision/recall-at-max-F1 scatter and the
    quality-at-max-F1 bar chart.  When options.top is True, zoomed versions of
    the plots are written as well.  Everything goes under
    <options.comp_dir>/comp_plots, with tsvs in "tsv" and images in a
    directory named after the plot tag.
    """
    out_dir = os.path.join(options.comp_dir, "comp_plots")
    robust_makedirs(out_dir)
    out_name = os.path.basename(os.path.splitext(tsv_path)[0])
    sample = out_name.split("-")[-1].upper()
    region = out_name.split("-")[-2].upper()

    def out_base_path(tag, label, extension):
        # build (and create the directory for) an output path of this table
        bd = tag if extension != ".tsv" else "tsv"
        ret = os.path.join(out_dir, bd, "-".join(out_name.split("-")[:-1]) + "-{}-{}-".format(sample, tag) + region) + "_" + label + extension
        robust_makedirs(os.path.dirname(ret))
        return ret

    params = " ".join(PLOT_PARAMS)

    def shell(cmd):
        # echo the command, then run it; the exit status is not checked
        print(cmd)
        os.system(cmd)

    def scatter(tsv, png, title, width="18", height="9", lo=None, hi=None):
        # precision/recall scatter; lo/hi (given as text so the command is
        # reproduced exactly) bound both axes when set
        cmd = ('scripts/scatter.py {} --save {} --title "{}" --x_label "Recall" '
               '--y_label "Precision" --width {} --height {} {} --lines --no_n '
               '--line_width 1.5 --marker_size 5').format(
                   tsv, png, title, width, height, params)
        if lo is not None:
            cmd += " --min_x {0} --max_x {1} --min_y {0} --max_y {1}".format(lo, hi)
        shell(cmd)

    def barchart(tsv, png, title, y_label, min_val=None):
        # ascending bar chart over graphs; min_val (text) clips the value axis
        cmd = ('scripts/barchart.py {} --ascending --no_n --save {} --title "{}" '
               '--x_sideways --x_label "Graph" --y_label "{}" {}').format(
                   tsv, png, title, y_label, params)
        if min_val is not None:
            cmd += " --min {}".format(min_val)
        shell(cmd)

    # precision recall scatter plot
    header = vcf_dist_header(options)
    # strip qual
    header = header[:-1]
    # the quality column sits right after the precision/recall pairs
    qual_idx = len(header)
    # floor division keeps the pair count an integer on any Python version
    for i in range(len(header) // 2):
        prec_idx = 2 * i
        rec_idx = prec_idx + 1
        print("{} {} {} {}".format(prec_idx, header[prec_idx], rec_idx, header[rec_idx]))
        ptoks = header[prec_idx].split("-")
        rtoks = header[rec_idx].split("-")
        assert ptoks[1] == "Precision"
        assert rtoks[1] == "Recall"
        assert ptoks[:1] == rtoks[:1]
        comp_cat = ptoks[0]
        if comp_cat not in ["TOT", "SNP", "INDEL"]:
            continue
        label = header[prec_idx].replace("Precision", "acc")
        acc_tsv = out_base_path("pr", label, ".tsv")
        print("Make {} tsv with cols {} {}".format(label, rec_idx, prec_idx))
        # +1 to convert to awk 1-base coordinates. +1 again since header doesnt include row_label col
        awkcmd = '''if (NR!=1) print $1 "\t" ${} "\t" ${} "\t" ${}'''.format(rec_idx + 2, prec_idx + 2, qual_idx + 2)
        awkstr = "awk '{" + awkcmd + "}'"
        run("{} {} > {}".format(awkstr, tsv_path, acc_tsv))
        acc_png = out_base_path("pr", label, ".png")

        title = sample.upper() + " "
        if comp_cat == "TOT":
            title += " Total Accuracy"
        else:
            title += " {} Accuracy".format(comp_cat.title())
        if region == "TOTAL":
            title += ", all regions"
        else:
            title += ", {}".format(region)

        scatter(acc_tsv, acc_png, title, lo="-0.01", hi="1.01")

        # flatten to max f1 tsv and plot as bars
        f1_tsv = out_base_path("f1bar", label, ".tsv")
        f1_png = out_base_path("f1bar", label, ".png")
        f1_pr_tsv = out_base_path("f1pr", label, ".tsv")
        f1_pr_png = out_base_path("f1pr", label, ".png")
        f1_qual_tsv = out_base_path("f1qual", label, ".tsv")
        f1_qual_png = out_base_path("f1qual", label, ".png")

        make_max_f1_tsv(acc_tsv, f1_tsv, f1_pr_tsv, f1_qual_tsv, options)
        barchart(f1_tsv, f1_png, title, "Max F1")
        scatter(f1_pr_tsv, f1_pr_png, title)
        barchart(f1_qual_tsv, f1_qual_png, title, "Quality for Max F1")

        if options.top is True:
            # top 20
            scatter(acc_tsv, acc_png.replace(".png", "_top20.png"), title,
                    lo="0.798", hi="1.002")
            # top 20, smaller figure for use as an inset
            scatter(acc_tsv, acc_png.replace(".png", "_top20_inset.png"), title,
                    width="11", height="5.5", lo="0.796", hi="1.004")
            # top 40
            scatter(acc_tsv, acc_png.replace(".png", "_top40.png"), title,
                    lo="0.596", hi="1.004")

            # max f1 bars starting at .5, .6, .7 and .85
            for suffix, min_val in (("_top50.png", "0.5"),
                                    ("_top60.png", "0.6"),
                                    ("_top70.png", "0.7"),
                                    ("_top85.png", "0.85")):
                barchart(f1_tsv, f1_png.replace(".png", suffix), title,
                         "Max F1", min_val=min_val)

            # top .25, .50 and .65 f1pr scatters
            for suffix, lo in (("_top25.png", "0.746"),
                               ("_top50.png", "0.496"),
                               ("_top65.png", "0.646")):
                scatter(f1_pr_tsv, f1_pr_png.replace(".png", suffix), title,
                        lo=lo, hi="1.004")
def main(args):
    """Merge per-region vcfs into whole-sample vcfs.

    In classic mode (options.classic) the layout call_dir/SAMPLE/REGION.vcf is
    expected: the known region vcfs of each sample directory are sorted,
    bgzipped and indexed, then concatenated into TOTAL.vcf (also sorted,
    bgzipped and indexed) in the same directory.

    Otherwise the layout call_dir/<REGION>/<GRAPH>/<SAMPLE>_sample.vcf is
    expected.  Only the known region directories are used, and only graphs
    present in every one of them.  For each graph and sample that has a vcf
    in every region, the region vcfs are sorted, compressed and indexed under
    call_dir/<options.name>/<GRAPH>/input/<SAMPLE>, then concatenated into
    call_dir/<options.name>/<GRAPH>/<SAMPLE>_sample.vcf.

    Returns 0.
    """
    options = parse_args(args)

    RealTimeLogger.start_master()

    def cat_sort_index(inputs, dest):
        # concatenate the inputs into dest, sort dest in place, then write a
        # bgzipped, tabix-indexed copy next to it
        run("vt cat {} > {}".format(" ".join(inputs), dest), fail_hard = True)
        run("vcfsort {} > {}.sort".format(dest, dest), fail_hard = True)
        run("mv {}.sort {}".format(dest, dest), fail_hard = True)
        run("bgzip -c {} > {}.gz".format(dest, dest), fail_hard = True)
        run("tabix -f -p vcf {}.gz".format(dest), fail_hard = True)

    if options.classic:
        # expect call_dir/SAMPLE/region.vcf
        classic_vcfs = ["BRCA1.vcf", "BRCA2.vcf", "SMA.vcf", "LRC_KIR.vcf", "MHC.vcf"]

        for sampledir in glob.glob(os.path.join(options.call_dir, "*")):
            if not os.path.isdir(sampledir):
                continue
            outfile = os.path.join(sampledir, "TOTAL.vcf")
            vcfs = []
            for vcf in glob.glob(os.path.join(sampledir, "*.vcf")):
                if os.path.basename(vcf) not in classic_vcfs:
                    continue
                gz = "{}.gz".format(vcf)
                run("vcfsort {} > {}.sort".format(vcf, vcf), fail_hard = True)
                run("bgzip -c {}.sort > {}".format(vcf, gz), fail_hard = True)
                run("rm -f {}.sort".format(vcf))
                run("tabix -f -p vcf {}".format(gz), fail_hard = True)
                vcfs.append(gz)
            if len(vcfs) > 0:
                cat_sort_index(vcfs, outfile)

        return 0

    # expect call_dir/<REGION>/<GRAPH>/<SAMPLE>_sample.vcf

    # count up regions
    # avoid crufty directories (including outputs of previous runs of this script)
    known_regions = ["brca1", "brca2", "mhc", "lrc_kir", "sma"]
    regions = set()
    for regiondir in glob.glob(os.path.join(options.call_dir, "*")):
        if not os.path.isdir(regiondir):
            continue
        name = os.path.basename(regiondir)
        if name in known_regions:
            regions.add(name)

    print(regions)

    # count up graphs (that are present in every region)
    gcount = defaultdict(int)
    for region in regions:
        for graphdir in glob.glob(os.path.join(options.call_dir, region, "*")):
            if os.path.isdir(graphdir):
                gcount[os.path.basename(graphdir)] += 1

    graphs = set()
    for graph in gcount:
        if gcount[graph] == len(regions):
            graphs.add(graph)

    print(graphs)

    # count up samples
    scount = defaultdict(int)
    for region in regions:
        for graph in graphs:
            pattern = os.path.join(options.call_dir, region, graph, "*_sample.vcf")
            for vcf in glob.glob(pattern):
                # the sample name is everything before the first underscore
                scount[os.path.basename(vcf).split("_")[0]] += 1

    samples = set()
    for sample in scount:
        samples.add(sample)

    print(samples)

    # make our output directory
    out_dir = os.path.join(options.call_dir, options.name)
    robust_makedirs(out_dir)

    for graph in graphs:
        for sample in samples:
            vcf_name = "{}_sample.vcf".format(sample)

            vcf_files = []
            for region in regions:
                candidate = os.path.join(options.call_dir, region, graph, vcf_name)
                if os.path.isfile(candidate):
                    vcf_files.append((region, candidate))

            # this sample doesn't span all regions, skip it
            if len(vcf_files) < len(regions):
                print("Skipping Sample {} for Graph {}".format(sample, graph))
                continue

            # output vcf
            merge_vcf_path = os.path.join(out_dir, graph, vcf_name)

            # working directory for intermediates / debugging
            work_path = os.path.join(out_dir, graph, "input", sample)
            robust_makedirs(work_path)

            # preprocess all the vcfs and leave in input dir
            input_files = []
            for region, vcf in vcf_files:
                outbase = os.path.join(work_path, region)
                run("vcfsort {} > {}.vcf".format(vcf, outbase), fail_hard = True)
                run("bgzip -f {}.vcf".format(outbase))
                run("tabix -f -p vcf {}.vcf.gz".format(outbase))
                input_files.append("{}.vcf.gz".format(outbase))

            # run the merge, then sort and index the result just in case
            cat_sort_index(input_files, merge_vcf_path)

    return 0
def plot_vcf_comp(tsv_path, options):
    """Make precision/recall and max-F1 plots from the big vcf compare table.

    The region is the last "-"-separated token of the table's base name.
    Column names come from vcf_dist_header(options); its last entry (the
    quality column) is split off and the rest is read as (Precision, Recall)
    pairs.  Only the TOT, SNP and INDEL categories are plotted.

    For each category a recall/precision/quality tsv is extracted with awk and
    plotted with scripts/scatter.py; make_max_f1_tsv then produces the tables
    for the max-F1 bar chart, the precision/recall-at-max-F1 scatter and the
    quality-at-max-F1 bar chart.  When options.top is True, zoomed versions of
    the accuracy scatter are written as well.  All files go to
    <options.comp_dir>/comp_plots.
    """
    out_dir = os.path.join(options.comp_dir, "comp_plots")
    robust_makedirs(out_dir)
    out_name = os.path.basename(os.path.splitext(tsv_path)[0])
    out_base_path = os.path.join(out_dir, out_name)
    region = out_name.split("-")[-1].upper()
    out_base_path_f1 = os.path.join(out_dir, "-".join(out_name.split("-")[:-1]) + "--f1-" + region)
    # the other max-f1 outputs are named by rewriting the "-f1-" tag
    out_base_path_f1_pr = out_base_path_f1.replace("-f1-", "-f1--pr-")
    out_base_path_f1_qual = out_base_path_f1.replace("-f1-", "-f1-qual-")

    params = " ".join(PLOT_PARAMS)

    def shell(cmd):
        # echo the command, then run it; the exit status is not checked
        print(cmd)
        os.system(cmd)

    def scatter(tsv, png, title, lo, hi, width="18", height="9"):
        # precision/recall scatter with both axes bounded by lo/hi (given as
        # text so the command is reproduced exactly)
        shell(('scripts/scatter.py {} --save {} --title "{}" --x_label "Recall" '
               '--y_label "Precision" --width {} --height {} {} --lines --no_n '
               '--line_width 1.5 --marker_size 5 '
               '--min_x {} --max_x {} --min_y {} --max_y {}').format(
                   tsv, png, title, width, height, params, lo, hi, lo, hi))

    def barchart(tsv, png, title, y_label, extra=""):
        # bar chart over graphs; extra is appended verbatim to the command
        shell(('scripts/barchart.py {} --save {} --title "{}" --x_sideways '
               '--x_label "Graph" --y_label "{}" {}').format(
                   tsv, png, title, y_label, params) + extra)

    # precision recall scatter plot
    header = vcf_dist_header(options)
    # strip qual
    header = header[:-1]
    # the quality column sits right after the precision/recall pairs
    qual_idx = len(header)
    # floor division keeps the pair count an integer on any Python version
    for i in range(len(header) // 2):
        prec_idx = 2 * i
        rec_idx = prec_idx + 1
        print("{} {} {} {}".format(prec_idx, header[prec_idx], rec_idx, header[rec_idx]))
        ptoks = header[prec_idx].split("-")
        rtoks = header[rec_idx].split("-")
        assert ptoks[1] == "Precision"
        assert rtoks[1] == "Recall"
        assert ptoks[:1] == rtoks[:1]
        comp_cat = ptoks[0]
        if comp_cat not in ["TOT", "SNP", "INDEL"]:
            continue
        label = header[prec_idx].replace("Precision", "acc")
        acc_tsv = out_base_path + "_" + label + ".tsv"
        print("Make {} tsv with cols {} {}".format(label, rec_idx, prec_idx))
        # +1 to convert to awk 1-base coordinates. +1 again since header doesnt include row_label col
        awkcmd = '''if (NR!=1) print $1 "\t" ${} "\t" ${} "\t" ${}'''.format(rec_idx + 2, prec_idx + 2, qual_idx + 2)
        awkstr = "awk '{" + awkcmd + "}'"
        run("{} {} > {}".format(awkstr, tsv_path, acc_tsv))
        acc_png = out_base_path + "_" + label + ".png"

        title = "VCF"
        if comp_cat == "TOT":
            title += " Total Accuracy"
        else:
            title += " {} Accuracy".format(comp_cat)
        title += " for {}".format(region)

        scatter(acc_tsv, acc_png, title, "-0.01", "1.01")

        # flatten to max f1 tsv and plot as bars
        f1_tsv = out_base_path_f1 + "_" + label + ".tsv"
        f1_png = out_base_path_f1 + "_" + label + ".png"
        f1_pr_tsv = out_base_path_f1_pr + "_" + label + ".tsv"
        f1_pr_png = out_base_path_f1_pr + "_" + label + ".png"
        f1_qual_tsv = out_base_path_f1_qual + "_" + label + ".tsv"
        f1_qual_png = out_base_path_f1_qual + "_" + label + ".png"

        make_max_f1_tsv(acc_tsv, f1_tsv, f1_pr_tsv, f1_qual_tsv, options)
        barchart(f1_tsv, f1_png, title, "Max F1")
        scatter(f1_pr_tsv, f1_pr_png, title, "-0.01", "1.01")
        barchart(f1_qual_tsv, f1_qual_png, title, "Quality for Max F1", extra=" --max 20")

        if options.top is True:
            # top 20
            scatter(acc_tsv, acc_png.replace(".png", "_top20.png"), title,
                    "0.798", "1.002")
            # top 20, smaller figure for use as an inset
            scatter(acc_tsv, acc_png.replace(".png", "_top20_inset.png"), title,
                    "0.796", "1.004", width="11", height="5.5")
            # top 40
            scatter(acc_tsv, acc_png.replace(".png", "_top40.png"), title,
                    "0.596", "1.004")
def plot_vcf_comp(tsv_path, options):
    """Make precision/recall and max-F1 plots from a vcf comparison table.

    The header returned by vcf_dist_header() is read as consecutive
    (Precision, Recall) column pairs followed by one trailing quality
    column.  For each pair whose category is TOT, SNP or INDEL, the recall,
    precision and quality columns are cut out of tsv_path with awk and
    plotted with scripts/scatter.py; make_max_f1_tsv() then derives three
    more tables which are plotted with scripts/barchart.py and
    scripts/scatter.py.  When ``options.top`` is True, zoomed-in variants of
    the plots are written as well.

    Everything is written below ``<options.comp_dir>/comp_plots``.

    Args:
        tsv_path: path of the comparison table.  Its base name (extension
            removed) is split on "-"; the last token is used as the sample
            and the second to last as the region.
        options: parsed options.  ``comp_dir`` and ``top`` are read here,
            and the object is passed on to vcf_dist_header() and
            make_max_f1_tsv().
    """
    out_dir = os.path.join(options.comp_dir, "comp_plots")
    robust_makedirs(out_dir)
    name_toks = os.path.basename(os.path.splitext(tsv_path)[0]).split("-")
    sample = name_toks[-1].upper()
    region = name_toks[-2].upper()
    name_prefix = "-".join(name_toks[:-1])

    def out_base_path(tag, label, extension):
        # every .tsv goes to a shared "tsv" directory; other outputs get a
        # directory named after their tag
        bd = tag if extension != ".tsv" else "tsv"
        ret = os.path.join(
            out_dir, bd,
            "{}-{}-{}-{}_{}{}".format(name_prefix, sample, tag, region,
                                      label, extension))
        robust_makedirs(os.path.dirname(ret))
        return ret

    params = " ".join(PLOT_PARAMS)

    # precision recall scatter plot
    # strip qual: the last header entry is the quality column
    header = vcf_dist_header(options)[:-1]
    # index the stripped quality column had in the full header
    qual_idx = len(header)
    # explicit floor division: header is walked as (precision, recall) pairs
    for i in range(len(header) // 2):
        prec_idx = 2 * i
        rec_idx = prec_idx + 1
        print("{} {} {} {}".format(prec_idx, header[prec_idx],
                                   rec_idx, header[rec_idx]))
        ptoks = header[prec_idx].split("-")
        rtoks = header[rec_idx].split("-")
        assert ptoks[1] == "Precision"
        assert rtoks[1] == "Recall"
        assert ptoks[:1] == rtoks[:1]
        comp_cat = ptoks[0]
        if comp_cat not in ["TOT", "SNP", "INDEL"]:
            continue
        label = header[prec_idx].replace("Precision", "acc")
        acc_tsv = out_base_path("pr", label, ".tsv")
        print("Make {} tsv with cols {} {}".format(label, rec_idx, prec_idx))
        # +1 to convert to awk 1-base coordinates. +1 again since header
        # doesnt include row_label col
        awkcmd = 'if (NR!=1) print $1 "\t" ${} "\t" ${} "\t" ${}'.format(
            rec_idx + 2, prec_idx + 2, qual_idx + 2)
        awkstr = "awk '{" + awkcmd + "}'"
        run("{} {} > {}".format(awkstr, tsv_path, acc_tsv))
        acc_png = out_base_path("pr", label, ".png")

        # NOTE: the two spaces between sample and category are kept so the
        # titles stay identical to previously generated plots
        if comp_cat == "TOT":
            title = sample + "  Total Accuracy"
        else:
            title = sample + "  {} Accuracy".format(comp_cat.title())
        if region == "TOTAL":
            title += ", all regions"
        else:
            title += ", {}".format(region)

        _echo_and_run(_pr_scatter_cmd(acc_tsv, acc_png, title, params,
                                      limits=("-0.01", "1.01")))

        # flatten to max f1 tsv and plot as bars
        f1_tsv = out_base_path("f1bar", label, ".tsv")
        f1_png = out_base_path("f1bar", label, ".png")
        f1_pr_tsv = out_base_path("f1pr", label, ".tsv")
        f1_pr_png = out_base_path("f1pr", label, ".png")
        f1_qual_tsv = out_base_path("f1qual", label, ".tsv")
        f1_qual_png = out_base_path("f1qual", label, ".png")

        make_max_f1_tsv(acc_tsv, f1_tsv, f1_pr_tsv, f1_qual_tsv, options)
        _echo_and_run(_f1_barchart_cmd(f1_tsv, f1_png, title, params,
                                       "Max F1"))
        # this scatter plot is drawn without explicit axis limits
        _echo_and_run(_pr_scatter_cmd(f1_pr_tsv, f1_pr_png, title, params))
        _echo_and_run(_f1_barchart_cmd(f1_qual_tsv, f1_qual_png, title,
                                       params, "Quality for Max F1"))

        if options.top is True:
            # zoomed accuracy curves: (file suffix, width, height, min, max)
            for suffix, width, height, lo, hi in [
                    ("_top20.png", "18", "9", "0.798", "1.002"),
                    ("_top20_inset.png", "11", "5.5", "0.796", "1.004"),
                    ("_top40.png", "18", "9", "0.596", "1.004")]:
                _echo_and_run(_pr_scatter_cmd(
                    acc_tsv, acc_png.replace(".png", suffix), title, params,
                    width=width, height=height, limits=(lo, hi)))

            # max f1 bars with a raised lower bound: (file suffix, --min)
            for suffix, min_f1 in [("_top50.png", "0.5"),
                                   ("_top60.png", "0.6"),
                                   ("_top70.png", "0.7"),
                                   ("_top85.png", "0.85")]:
                _echo_and_run(_f1_barchart_cmd(
                    f1_tsv, f1_png.replace(".png", suffix), title, params,
                    "Max F1", min_val=min_f1))

            # zoomed f1pr scatters: (file suffix, lower axis limit)
            for suffix, lo in [("_top25.png", "0.746"),
                               ("_top50.png", "0.496"),
                               ("_top65.png", "0.646")]:
                _echo_and_run(_pr_scatter_cmd(
                    f1_pr_tsv, f1_pr_png.replace(".png", suffix), title,
                    params, limits=(lo, "1.004")))


def _echo_and_run(cmd):
    """Print a shell command, then run it; its exit status is not checked."""
    print(cmd)
    os.system(cmd)


def _pr_scatter_cmd(in_tsv, out_png, title, params,
                    width="18", height="9", limits=None):
    """Build a scripts/scatter.py command line for a recall/precision plot.

    limits is an optional (min, max) pair applied to both axes; numeric
    options are passed as strings so they reach the command line verbatim.
    """
    cmd = ('scripts/scatter.py {} --save {} --title "{}" --x_label "Recall" '
           '--y_label "Precision" --width {} --height {} {} --lines --no_n '
           '--line_width 1.5 --marker_size 5').format(
               in_tsv, out_png, title, width, height, params)
    if limits is not None:
        lo, hi = limits
        cmd += " --min_x {} --max_x {} --min_y {} --max_y {}".format(
            lo, hi, lo, hi)
    return cmd


def _f1_barchart_cmd(in_tsv, out_png, title, params, y_label, min_val=None):
    """Build a scripts/barchart.py command line for a per-graph bar chart.

    min_val, when given, is appended as the --min option.
    """
    cmd = ('scripts/barchart.py {} --ascending --no_n --save {} --title "{}" '
           '--x_sideways --x_label "Graph" --y_label "{}" {}').format(
               in_tsv, out_png, title, y_label, params)
    if min_val is not None:
        cmd += " --min {}".format(min_val)
    return cmd
def main(args):
    """Merge per-region vcf calls into one sorted, indexed vcf per sample.

    Two input layouts are handled, chosen by ``options.classic``:

    * classic: ``call_dir/<SAMPLE>/<REGION>.vcf``.  The known region files
      of each sample directory are merged into ``<SAMPLE>/TOTAL.vcf``.
    * otherwise: ``call_dir/<REGION>/<GRAPH>/<SAMPLE>_sample.vcf``.  For each
      graph found in every region, each sample with a call in every region
      is merged into ``call_dir/<options.name>/<GRAPH>/<SAMPLE>_sample.vcf``.

    Args:
        args: argument list handed to parse_args().

    Returns:
        0 in both modes.
    """
    options = parse_args(args)

    RealTimeLogger.start_master()

    if options.classic:
        _merge_classic_calls(options)
    else:
        _merge_graph_calls(options)

    return 0


def _cat_sort_and_index(input_vcfs, out_vcf):
    """Concatenate input_vcfs into out_vcf, sort it, and write out_vcf.gz + index."""
    # NOTE(review): fail_hard=True presumably makes run() abort on a failing
    # command; run() is defined elsewhere in this file -- confirm
    run("vt cat {} > {}".format(" ".join(input_vcfs), out_vcf),
        fail_hard=True)
    # vcfsort cannot write in place, so sort to a temp file and move it back
    run("vcfsort {} > {}.sort".format(out_vcf, out_vcf), fail_hard=True)
    run("mv {}.sort {}".format(out_vcf, out_vcf), fail_hard=True)
    # -c keeps the uncompressed vcf next to the .gz
    run("bgzip -c {} > {}.gz".format(out_vcf, out_vcf), fail_hard=True)
    run("tabix -f -p vcf {}.gz".format(out_vcf), fail_hard=True)


def _merge_classic_calls(options):
    """Merge call_dir/SAMPLE/region.vcf files into call_dir/SAMPLE/TOTAL.vcf."""
    # only these files are merged; anything else in the sample directory
    # (including TOTAL.vcf from an earlier run) is ignored
    region_vcf_names = ["BRCA1.vcf", "BRCA2.vcf", "SMA.vcf", "LRC_KIR.vcf",
                        "MHC.vcf"]

    for sampledir in glob.glob(os.path.join(options.call_dir, "*")):
        if not os.path.isdir(sampledir):
            continue
        vcfs = []
        for vcf in glob.glob(os.path.join(sampledir, "*.vcf")):
            if os.path.basename(vcf) not in region_vcf_names:
                continue
            run("vcfsort {} > {}.sort".format(vcf, vcf), fail_hard=True)
            run("bgzip -c {}.sort > {}.gz".format(vcf, vcf), fail_hard=True)
            run("rm -f {}.sort".format(vcf))
            run("tabix -f -p vcf {}.gz".format(vcf), fail_hard=True)
            vcfs.append("{}.gz".format(vcf))
        if vcfs:
            _cat_sort_and_index(vcfs, os.path.join(sampledir, "TOTAL.vcf"))


def _merge_graph_calls(options):
    """Merge call_dir/<REGION>/<GRAPH>/<SAMPLE>_sample.vcf across regions."""
    sample_suffix = "_sample.vcf"

    # count up regions
    # avoid crufty directories (including outputs of previous runs of this
    # script) by only accepting known region names
    known_regions = ["brca1", "brca2", "mhc", "lrc_kir", "sma"]
    regions = set()
    for regiondir in glob.glob(os.path.join(options.call_dir, "*")):
        region = os.path.basename(regiondir)
        if os.path.isdir(regiondir) and region in known_regions:
            regions.add(region)

    print(regions)

    # count up graphs (that are present in every region)
    gcount = defaultdict(int)
    for region in regions:
        for graphdir in glob.glob(os.path.join(options.call_dir, region, "*")):
            if os.path.isdir(graphdir):
                gcount[os.path.basename(graphdir)] += 1
    graphs = set(graph for graph, count in gcount.items()
                 if count == len(regions))

    print(graphs)

    # collect every sample seen in any region/graph; samples that do not
    # span all regions are skipped per graph further down
    samples = set()
    for region in regions:
        for graph in graphs:
            pattern = os.path.join(options.call_dir, region, graph,
                                   "*" + sample_suffix)
            for vcf in glob.glob(pattern):
                # strip the suffix rather than splitting on "_", so a sample
                # id containing "_" maps back to the same file name below
                samples.add(os.path.basename(vcf)[:-len(sample_suffix)])

    print(samples)

    # make our output directory
    out_dir = os.path.join(options.call_dir, options.name)
    robust_makedirs(out_dir)

    for graph in graphs:
        for sample in samples:
            vcf_files = []
            for region in regions:
                vcf = os.path.join(options.call_dir, region, graph,
                                   sample + sample_suffix)
                if os.path.isfile(vcf):
                    vcf_files.append((region, vcf))

            # this sample doesn't span all regions, skip it
            if len(vcf_files) < len(regions):
                print("Skipping Sample {} for Graph {}".format(sample, graph))
                continue

            # output vcf
            merge_vcf_path = os.path.join(out_dir, graph,
                                          sample + sample_suffix)

            # working directory for intermediates / debugging
            # (creating it also creates out_dir/graph for the merged vcf)
            work_path = os.path.join(out_dir, graph, "input", sample)
            robust_makedirs(work_path)

            # preprocess all the vcfs and leave in input dir
            input_files = []
            for region, vcf in vcf_files:
                outbase = os.path.join(work_path, region)
                run("vcfsort {} > {}.vcf".format(vcf, outbase), fail_hard=True)
                run("bgzip -f {}.vcf".format(outbase))
                run("tabix -f -p vcf {}.vcf.gz".format(outbase))
                input_files.append("{}.vcf.gz".format(outbase))

            # run the merge, then sort and index the result
            _cat_sort_and_index(input_files, merge_vcf_path)