def compute_all_indexes(job, options):
    """ root toil job: schedule a kmer index for every graph, then chain
    the comparisons as a follow-on

    For each gam in options.in_gams the baseline graph must already exist
    on disk (RuntimeError otherwise).  Each distinct baseline is indexed
    once; the gam's graph, augmented and linear graphs are indexed whenever
    their path differs from the baseline.  compute_all_comparisons is added
    as a follow-on of this job.
    """
    # baselines that already have an index job, so shared ones are queued once
    seen_baselines = set()

    for gam in options.in_gams:
        baseline = baseline_path(gam, options)
        if not os.path.isfile(baseline):
            raise RuntimeError("baseline {} for gam {} not found".format(baseline, gam))

        if baseline not in seen_baselines:
            job.addChildJobFn(compute_kmer_index, baseline, options,
                              cores=options.vg_cores)
            seen_baselines.add(baseline)

        # same order as before: graph, augmented, linear
        for path_fn in (graph_path, augmented_vg_path, linear_vg_path):
            vg_path = path_fn(gam, options)
            if vg_path != baseline:
                job.addChildJobFn(compute_kmer_index, vg_path, options,
                                  cores=options.vg_cores)

    # do the comparisons once the index jobs above are done
    job.addFollowOnJobFn(compute_all_comparisons, options, cores=1)
def dist_table(options):
    """ make the jaccard distance table by scraping together all the
    comparison json files

    Writes a TSV to dist_tsv_path(options).  The file starts with two
    '#'-prefixed header lines (the baseline name, then the column names)
    followed by one row per entry of options.in_gams.  Each row holds the
    graph's name (basename of its graph path without extension), the
    distances of the graph / linear / augmented / sample graphs to the
    baseline as returned by jaccard_dist, and the linear / augmented /
    sample distances minus the graph distance.
    """
    # tsv header: "#", the baseline, then empty cells padding the line out to
    # the 8 columns named on the second header line
    lines = [
        "#\t{}\t\t\t\t\t\t\n".format(options.baseline),
        "#graph\tgraph_dist\tlinear_dist\taugmented_dist\tsample_dist"
        "\tdelta_linear\tdelta_augmented\tdelta_sample\n",
    ]
    row_format = "{}\t{:.4}\t{:.4}\t{:.4}\t{:.4}\t{:.4}\t{:.4}\t{:.4}\n"

    for gam in options.in_gams:
        baseline = baseline_path(gam, options)
        graph_vg = graph_path(gam, options)

        graph_dist = jaccard_dist(comp_path(baseline, graph_vg, options))
        aug_dist = jaccard_dist(
            comp_path(baseline, augmented_vg_path(gam, options), options))
        lin_dist = jaccard_dist(
            comp_path(baseline, linear_vg_path(gam, options), options))
        sam_dist = jaccard_dist(
            comp_path(baseline, sample_vg_path(gam, options), options))

        # deltas are relative to the graph's own distance
        lines.append(row_format.format(
            os.path.splitext(os.path.basename(graph_vg))[0],
            graph_dist,
            lin_dist,
            aug_dist,
            sam_dist,
            lin_dist - graph_dist,
            aug_dist - graph_dist,
            sam_dist - graph_dist))

    with open(dist_tsv_path(options), "w") as ofile:
        ofile.write("".join(lines))
def compute_all_comparisons(job, options):
    """ schedule the graph comparisons as parallel child jobs

    For every gam in options.in_gams, three compute_comparison child jobs
    are added, pairing the gam's baseline with its graph, augmented and
    linear graphs.  Each child is given at most 2 cores.

    NOTE(review): dist_table and acc_table also read a comparison against
    sample_vg_path, which is not scheduled here -- confirm it is produced
    elsewhere.
    """
    cores_per_job = min(2, options.vg_cores)

    for gam in options.in_gams:
        baseline = baseline_path(gam, options)
        # same order as before: graph, augmented, linear
        for path_fn in (graph_path, augmented_vg_path, linear_vg_path):
            job.addChildJobFn(compute_comparison, baseline,
                              path_fn(gam, options), options,
                              cores=cores_per_job)
def acc_table(options):
    """ make the accuracy table by scraping together all the comparison
    json files

    Writes a TSV to acc_tsv_path(options).  The file starts with two
    '#'-prefixed header lines (the baseline name, then the column names)
    followed by one row per distinct graph path among options.in_gams, in
    order of first appearance.  Gams that share a graph path are averaged
    together.  Each row holds the graph's name (basename of its graph path
    without extension) and the mean of the first three values returned by
    accuracy() for the graph, linear, augmented and sample comparisons
    (presumably precision, recall and f1, going by the column names).
    """
    # tsv header (padding tabs on the first line are kept as they were)
    lines = [
        "#\t{}\t\t\t\t\t\t\t\t\t\n".format(options.baseline),
        "#graph\tgraph_prec\tgraph_rec\tgraph_f1"
        "\tlinear_prec\tlinear_rec\tlinear_f1"
        "\taugmented_prec\taugmented_rec\taugmented_f1"
        "\tsample_prec\tsample_rec\tsample_f1\n",
    ]

    sums = {}    # graph path -> 12 running totals, in column order
    counts = {}  # graph path -> number of gams accumulated
    names = []   # graph paths in first-seen order, for stable row order

    for gam in options.in_gams:
        baseline = baseline_path(gam, options)
        name = graph_path(gam, options)

        graph_acc = accuracy(comp_path(baseline, name, options))
        aug_acc = accuracy(
            comp_path(baseline, augmented_vg_path(gam, options), options))
        lin_acc = accuracy(
            comp_path(baseline, linear_vg_path(gam, options), options))
        sam_acc = accuracy(
            comp_path(baseline, sample_vg_path(gam, options), options))

        # column order is graph, linear, augmented, sample
        values = []
        for acc in (graph_acc, lin_acc, aug_acc, sam_acc):
            values.extend((acc[0], acc[1], acc[2]))

        if name not in sums:
            names.append(name)
            sums[name] = [0.] * len(values)
            counts[name] = 0
        sums[name] = [total + value
                      for total, value in zip(sums[name], values)]
        counts[name] += 1

    for name in names:
        num_gams = float(counts[name])
        # label the row with the graph being reported
        cells = [os.path.splitext(os.path.basename(name))[0]]
        cells.extend("{:.4}".format(float(total) / num_gams)
                     for total in sums[name])
        lines.append("\t".join(cells) + "\n")

    with open(acc_tsv_path(options), "w") as ofile:
        ofile.write("".join(lines))