Esempio n. 1
0
def compute_all_indexes(job, options):
    """ schedule kmer indexing for every graph (root toil job)

    For each gam in options.in_gams, a compute_kmer_index child job is
    added for its baseline, original, augmented and linear graph.  Each
    distinct path is scheduled at most once, even when several gams map
    to the same file.  compute_all_comparisons is then added as a
    follow-on of this job.

    Raises RuntimeError if the baseline file of any gam does not exist.
    """

    # every path that already has an index job; several gams can resolve
    # to the same graph, and it must not be indexed by two jobs
    scheduled = set()

    for gam in options.in_gams:
        baseline = baseline_path(gam, options)
        if not os.path.isfile(baseline):
            raise RuntimeError("baseline {} for gam {} not found".format(baseline, gam))

        # baseline goes first so that any graph equal to it is skipped
        candidates = (baseline,
                      graph_path(gam, options),
                      augmented_vg_path(gam, options),
                      linear_vg_path(gam, options))
        for vg_path in candidates:
            if vg_path in scheduled:
                continue
            scheduled.add(vg_path)
            job.addChildJobFn(compute_kmer_index, vg_path, options,
                              cores=options.vg_cores)

    # do the comparisons
    job.addFollowOnJobFn(compute_all_comparisons, options, cores=1)
Esempio n. 2
0
def dist_table(options):
    """ make the jaccard distance table by scraping together all the comparison
    json files

    One row is written per gam in options.in_gams: the distance of the
    original, linear, augmented and sample graph to the baseline, followed
    by the difference of the last three with respect to the original
    graph.  The table is written to dist_tsv_path(options).
    """
    # tsv header: two rows of eight tab-separated columns, each newline
    # terminated so the second row starts on its own line
    rows = [
        "#\t{}\t\t\t\t\t\t\n".format(options.baseline),
        "#graph\tgraph_dist\tlinear_dist\taugmented_dist\tsample_dist"
        "\tdelta_linear\tdelta_augmented\tdelta_sample\n",
    ]

    for gam in options.in_gams:
        baseline = baseline_path(gam, options)
        graph_vg = graph_path(gam, options)

        graph_dist = jaccard_dist(comp_path(baseline, graph_vg, options))
        aug_graph_dist = jaccard_dist(
            comp_path(baseline, augmented_vg_path(gam, options), options))
        lin_graph_dist = jaccard_dist(
            comp_path(baseline, linear_vg_path(gam, options), options))
        sam_graph_dist = jaccard_dist(
            comp_path(baseline, sample_vg_path(gam, options), options))

        # row label is the graph file name without directory or extension
        label = os.path.splitext(os.path.basename(graph_vg))[0]

        rows.append("{}\t{:.4}\t{:.4}\t{:.4}\t{:.4}\t{:.4}\t{:.4}\t{:.4}\n".format(
            label,
            graph_dist,
            lin_graph_dist,
            aug_graph_dist,
            sam_graph_dist,
            lin_graph_dist - graph_dist,
            aug_graph_dist - graph_dist,
            sam_graph_dist - graph_dist))

    with open(dist_tsv_path(options), "w") as ofile:
        ofile.write("".join(rows))
Esempio n. 3
0
def compute_all_comparisons(job, options):
    """ add one compute_comparison child job per (gam, graph) pair

    Every gam in options.in_gams is compared against its baseline for the
    original, augmented and linear graph, in that order.  Each child job
    requests at most two cores.
    """
    cores_per_job = min(2, options.vg_cores)
    for gam in options.in_gams:
        baseline = baseline_path(gam, options)
        # resolve each graph path right before scheduling its job
        for path_of in (graph_path, augmented_vg_path, linear_vg_path):
            job.addChildJobFn(compute_comparison, baseline,
                              path_of(gam, options), options,
                              cores=cores_per_job)
Esempio n. 4
0
def acc_table(options):
    """ make the accuracy table by scraping together all the comparison
    json files

    The first three values returned by accuracy() (written out as the
    prec / rec / f1 columns) are collected for the original, linear,
    augmented and sample graph of every gam in options.in_gams.  Gams that
    share the same graph_path are averaged into a single row, labelled
    with that graph's file name.  Rows appear in the order their graph is
    first seen.  The table is written to acc_tsv_path(options).
    """
    # tsv header; the column row is newline terminated so that the first
    # data row starts on its own line
    header = "#\t{}\t\t\t\t\t\t\t\t\t\n".format(options.baseline)
    header += "#graph\tgraph_prec\tgraph_rec\tgraph_f1"
    header += "\tlinear_prec\tlinear_rec\tlinear_f1"
    header += "\taugmented_prec\taugmented_rec\taugmented_f1"
    header += "\tsample_prec\tsample_rec\tsample_f1\n"

    names = []   # graph paths in first-seen order (keeps row order stable)
    sums = {}    # graph path -> twelve running totals, in column order
    counts = {}  # graph path -> number of gams accumulated

    for gam in options.in_gams:
        baseline = baseline_path(gam, options)
        name = graph_path(gam, options)

        # same order as the header columns: graph, linear, augmented, sample
        vg_paths = (name,
                    linear_vg_path(gam, options),
                    augmented_vg_path(gam, options),
                    sample_vg_path(gam, options))
        values = []
        for vg_path in vg_paths:
            acc = accuracy(comp_path(baseline, vg_path, options))
            values.extend((acc[0], acc[1], acc[2]))

        if name not in sums:
            names.append(name)
            sums[name] = [0.] * len(values)
            counts[name] = 0
        sums[name] = [total + value for total, value in zip(sums[name], values)]
        counts[name] += 1

    rows = [header]
    for name in names:
        # label the row with the graph it aggregates, not with whatever gam
        # the loop above happened to end on
        label = os.path.splitext(os.path.basename(name))[0]
        means = [float(total) / float(counts[name]) for total in sums[name]]
        rows.append("\t".join([label] + ["{:.4}".format(mean) for mean in means]) + "\n")

    with open(acc_tsv_path(options), "w") as ofile:
        ofile.write("".join(rows))