Exemple #1
def compute_all_indexes(job, options):
    """ run everything (root toil job)
    first all indexes are computed,
    then all comparisons (follow on)
    then summary (follow on of that)

    For every gam in options.in_gams a compute_kmer_index child job is
    scheduled for its baseline, graph, augmented and linear paths.  Each
    distinct path is scheduled only once, even when several gams share it.
    compute_all_comparisons is then added as a follow-on of this job.

    job: object providing addChildJobFn / addFollowOnJobFn (presumably the
         running Toil job -- confirm against the caller)
    options: in_gams and vg_cores are read here; the object is also
             forwarded to the path helpers and to the scheduled jobs

    Raises RuntimeError if the baseline file of any gam does not exist.
    """
    # paths that already have an index job, so shared baselines / graphs are
    # not indexed twice
    scheduled = set()

    # do all the indexes
    for gam in options.in_gams:
        baseline = baseline_path(gam, options)
        if not os.path.isfile(baseline):
            raise RuntimeError("baseline {} for gam {} not found".format(baseline, gam))

        # the baseline goes first: once it is in `scheduled`, any other path
        # equal to it is skipped automatically
        candidates = (baseline,
                      graph_path(gam, options),
                      augmented_vg_path(gam, options),
                      linear_vg_path(gam, options))
        for path in candidates:
            if path in scheduled:
                continue
            scheduled.add(path)
            job.addChildJobFn(compute_kmer_index, path, options,
                              cores=options.vg_cores)

    # do the comparisons
    job.addFollowOnJobFn(compute_all_comparisons, options, cores=1)
Exemple #2
def dist_table(options):
    """ make the jaccard distance table by scraping together all the comparison
    json files

    One row is written per gam in options.in_gams.  Each row holds the
    jaccard_dist of the graph, linear, augmented and sample comparisons
    against the baseline, followed by the difference between each of the
    last three and the graph distance.  The table is written to
    dist_tsv_path(options).

    options: in_gams and baseline are read here; the object is also
             forwarded to the path helpers
    """
    # tsv header (8 columns, matching the column-name line below)
    lines = [
        "#\t{}\t\t\t\t\t\t\n".format(options.baseline),
        "#graph\tgraph_dist\tlinear_dist\taugmented_dist\tsample_dist"
        "\tdelta_linear\tdelta_augmented\tdelta_sample\n",
    ]
    row_format = "{}\t{:.4}\t{:.4}\t{:.4}\t{:.4}\t{:.4}\t{:.4}\t{:.4}\n"

    for gam in options.in_gams:
        baseline = baseline_path(gam, options)
        graph = graph_path(gam, options)

        graph_dist = jaccard_dist(comp_path(baseline, graph, options))
        lin_dist = jaccard_dist(
            comp_path(baseline, linear_vg_path(gam, options), options))
        aug_dist = jaccard_dist(
            comp_path(baseline, augmented_vg_path(gam, options), options))
        sam_dist = jaccard_dist(
            comp_path(baseline, sample_vg_path(gam, options), options))

        # NOTE(review): the argument list of this format call was cut off in
        # the source; the values after the graph label are reconstructed from
        # the header column order -- confirm
        lines.append(row_format.format(
            os.path.splitext(os.path.basename(graph))[0],
            graph_dist,
            lin_dist,
            aug_dist,
            sam_dist,
            lin_dist - graph_dist,
            aug_dist - graph_dist,
            sam_dist - graph_dist))

    # NOTE(review): the body of this `with` was missing in the source; writing
    # the assembled table is the presumed intent -- confirm
    with open(dist_tsv_path(options), "w") as ofile:
        ofile.write("".join(lines))
Exemple #3
def snp_count_table(options):
    """ make a table of snp counts.  there are serious problems with this now:
    1) don't have snp count for baseline (as it's not gam or vcf)
    2) snps counted differently for gam/vcf (multiple alternates at same site
    counted in former but not latter)

    Counts are summed per graph_path(gam, options) and averaged over the
    number of gams that map to that graph.  One row per graph is written to
    count_tsv_path(options), in sorted graph-path order.

    options: in_gams and baseline are read here; the object is also
             forwarded to the path and counting helpers
    """
    # tsv header
    lines = [
        "#\t{}\t\n".format(options.baseline),
        "#graph\tlinear_snp_count\tsample_snp_count\taugmented_snp_count\n",
    ]

    # per graph: [linear vcf, sample, augmented] running totals
    totals = defaultdict(lambda: [0, 0, 0])
    # per graph: number of gams accumulated
    counts = defaultdict(int)

    for gam in options.in_gams:
        linear_vcf = linear_vcf_path(gam, options) + ".gz"
        found = (count_vcf_snps(linear_vcf, options),
                 count_vg_paths(sample_vg_path(gam, options), options),
                 count_vg_paths(augmented_vg_path(gam, options), options))

        name = graph_path(gam, options)
        totals[name] = [total + value
                        for total, value in zip(totals[name], found)]
        counts[name] += 1

    # sorted so the row order does not depend on set/dict iteration order
    for name in sorted(totals):
        n_gams = float(counts[name])
        avg_vcf = float(totals[name][0]) / n_gams
        avg_sam = float(totals[name][1]) / n_gams
        avg_aug = float(totals[name][2]) / n_gams
        # NOTE(review): the argument list of this format call was cut off in
        # the source; the label form and value order are reconstructed from
        # the header columns -- confirm
        lines.append("{}\t{}\t{}\t{}\n".format(
            os.path.splitext(os.path.basename(name))[0],
            avg_vcf,
            avg_sam,
            avg_aug))

    # NOTE(review): the body of this `with` was missing in the source; writing
    # the assembled table is the presumed intent -- confirm
    with open(count_tsv_path(options), "w") as ofile:
        ofile.write("".join(lines))
Exemple #4
def compute_all_comparisons(job, options):
    """ run vg compare in parallel on all the graphs,
    outputting a json file for each

    For every gam in options.in_gams a compute_comparison child job is
    scheduled that compares the gam's baseline against its graph, augmented
    and linear paths.

    job: object providing addChildJobFn (presumably the running Toil job --
         confirm against the caller)
    options: in_gams and vg_cores are read here; the object is also
             forwarded to the path helpers and to the scheduled jobs
    """
    # each comparison asks for at most two cores
    ncores = min(2, options.vg_cores)

    # NOTE(review): dist_table / acc_table in this file also read a
    # comparison against sample_vg_path, which is not scheduled here --
    # confirm it is produced elsewhere
    path_fns = (graph_path, augmented_vg_path, linear_vg_path)

    for gam in options.in_gams:
        baseline = baseline_path(gam, options)
        for path_fn in path_fns:
            job.addChildJobFn(compute_comparison, baseline,
                              path_fn(gam, options), options, cores=ncores)
Exemple #5
def graph_size_table(options):
    """ make a table of sequence lengths for the vg call outputs

    vg_length is summed per graph_path(gam, options) for the sample,
    augmented and original graphs, then averaged over the number of gams
    that map to that graph.  One row per graph is written to
    size_tsv_path(options), in sorted graph-path order.

    options: in_gams and baseline are read here; the object is also
             forwarded to the path helpers and to vg_length
    """
    # tsv header
    lines = [
        "#\t{}\t\n".format(options.baseline),
        "#graph\tsample_snp_length\taugmented_snp_length\toriginal_length\n",
    ]

    # per graph: [sample, augmented, original] running totals
    totals = defaultdict(lambda: [0, 0, 0])
    # per graph: number of gams accumulated
    counts = defaultdict(int)

    for gam in options.in_gams:
        name = graph_path(gam, options)
        lengths = (vg_length(sample_vg_path(gam, options), options),
                   vg_length(augmented_vg_path(gam, options), options),
                   vg_length(name, options))

        totals[name] = [total + value
                        for total, value in zip(totals[name], lengths)]
        counts[name] += 1

    # sorted so the row order does not depend on set/dict iteration order
    for name in sorted(totals):
        n_gams = float(counts[name])
        avg_sam = float(totals[name][0]) / n_gams
        avg_aug = float(totals[name][1]) / n_gams
        avg_ori = float(totals[name][2]) / n_gams
        # NOTE(review): the argument list of this format call was cut off in
        # the source; the label form and value order are reconstructed from
        # the header columns -- confirm
        lines.append("{}\t{}\t{}\t{}\n".format(
            os.path.splitext(os.path.basename(name))[0],
            avg_sam,
            avg_aug,
            avg_ori))

    # NOTE(review): the body of this `with` was missing in the source; writing
    # the assembled table is the presumed intent -- confirm
    with open(size_tsv_path(options), "w") as ofile:
        ofile.write("".join(lines))
Exemple #6
def acc_table(options):
    """ make the accuracy table by scraping together all the comparison
    json files

    For every gam in options.in_gams the first three entries of
    accuracy(...) are read for the graph, linear, augmented and sample
    comparisons against the baseline (presumably precision, recall and f1,
    going by the column names -- confirm against accuracy()).  Values are
    summed per graph_path(gam, options) and averaged over the number of gams
    that map to that graph.  One row per graph is written to
    acc_tsv_path(options), in sorted graph-path order.

    options: in_gams and baseline are read here; the object is also
             forwarded to the path helpers
    """
    # tsv header; the column-name line must end with a newline so the first
    # data row starts on its own line
    lines = [
        "#\t{}\t\t\t\t\t\t\t\t\t\n".format(options.baseline),
        "#graph\tgraph_prec\tgraph_rec\tgraph_f1"
        "\tlinear_prec\tlinear_rec\tlinear_f1"
        "\taugmented_prec\taugmented_rec\taugmented_f1"
        "\tsample_prec\tsample_rec\tsample_f1\n",
    ]

    # per graph: 12 running totals, ordered like the value columns above
    totals = defaultdict(lambda: [0.] * 12)
    # per graph: number of gams accumulated
    counts = defaultdict(int)

    for gam in options.in_gams:
        baseline = baseline_path(gam, options)
        name = graph_path(gam, options)

        # same order as the header: graph, linear, augmented, sample
        compared = (name,
                    linear_vg_path(gam, options),
                    augmented_vg_path(gam, options),
                    sample_vg_path(gam, options))
        scores = []
        for path in compared:
            acc = accuracy(comp_path(baseline, path, options))
            scores.extend((acc[0], acc[1], acc[2]))

        totals[name] = [total + score
                        for total, score in zip(totals[name], scores)]
        counts[name] += 1

    # sorted so the row order does not depend on set/dict iteration order
    for name in sorted(totals):
        n_gams = float(counts[name])
        # label each row with its own graph, not with whichever gam the loop
        # above happened to finish on
        cells = [os.path.splitext(os.path.basename(name))[0]]
        cells.extend("{:.4}".format(float(total) / n_gams)
                     for total in totals[name])
        lines.append("\t".join(cells) + "\n")

    # NOTE(review): the body of this `with` was missing in the source; writing
    # the assembled table is the presumed intent -- confirm
    with open(acc_tsv_path(options), "w") as ofile:
        ofile.write("".join(lines))