def plot_heatmap(tsv, options):
    """ Make linear and log scale heatmaps for a tsv matrix.

    The table is read with read_tsv, its column and row names are replaced
    by their name_map() entry when they have one, and the result is written
    beside the input as *_rename.tsv.  scripts/plotHeatmap.py is then run
    twice on the renamed table, writing <comp_dir>/heatmaps/<name>.pdf and
    <name>_log.pdf.  Nothing is written for inputs whose path already
    contains "_rename".
    """
    heatmap_dir = os.path.join(options.comp_dir, "heatmaps")
    robust_makedirs(heatmap_dir)

    mat, col_names, row_names, row_label = read_tsv(tsv)
    names = name_map()
    # relabel in place: columns first, then rows
    for labels in (col_names, row_names):
        for idx, label in enumerate(labels):
            if label in names:
                labels[idx] = names[label]

    # inputs that already carry the _rename tag are not plotted
    if "_rename" in tsv:
        return

    renamed_tsv = tsv.replace(".tsv", "_rename.tsv")
    write_tsv(renamed_tsv, mat, col_names, row_names, row_label)

    pdf_path = os.path.join(heatmap_dir,
                            os.path.basename(tsv).replace(".tsv", ".pdf"))
    if options.skip is not None:
        skip_opts = "--skip {}".format(options.skip)
    else:
        skip_opts = ""

    plots = (
        ("scripts/plotHeatmap.py {} {} {}", pdf_path),
        ("scripts/plotHeatmap.py {} {} {} --log_scale",
         pdf_path.replace(".pdf", "_log.pdf")),
    )
    for template, target in plots:
        cmd = template.format(renamed_tsv, target, skip_opts)
        print(cmd)
        os.system(cmd)
def temp_path(options, prefix="tmp", ext="", length=6):
    """ Return a randomly named path under <out_dir>/temp.

    robust_makedirs is called on the temp directory first.  The file name
    is prefix, followed by `length` random uppercase letters / digits,
    followed by ext.  The file itself is not created here.
    """
    temp_dir = os.path.join(options.out_dir, "temp")
    robust_makedirs(temp_dir)
    alphabet = string.ascii_uppercase + string.digits
    tag = ""
    for _ in range(length):
        tag += random.choice(alphabet)
    return os.path.join(temp_dir, prefix + tag + ext)
def plot_kmer_comp(tsv_path, options):
    """ Plot a kmer comparison table.

    Two plots are written to <comp_dir>/comp_plots, named after the input
    table: a boxplot (scripts/boxplot.py) of column 2, labelled as the
    Jaccard index, and a scatter plot (scripts/scatter.py) of columns 4 and
    3, labelled as recall and precision.  awk drops the header row when
    extracting the columns.  Plot titles start with the upper-cased last
    "-"-separated token of the input file name.
    """
    plot_dir = os.path.join(options.comp_dir, "comp_plots")
    robust_makedirs(plot_dir)
    stem = os.path.basename(os.path.splitext(tsv_path)[0])
    base = os.path.join(plot_dir, stem)
    region = stem.split("-")[-1].upper()
    plot_args = " ".join(PLOT_PARAMS)

    # jaccard boxplot
    jac_tsv = base + "_jac.tsv"
    jac_png = base + "_jac.png"
    jac_awk = '''awk '{if (NR!=1) print $1 "\t" $2}' '''
    run("{} {} > {}".format(jac_awk, tsv_path, jac_tsv))
    jac_cmd = ("scripts/boxplot.py {} --save {} --title \"{} KMER Set Jaccard\""
               " --x_label \"Graph\" --y_label \"Jaccard Index\" --x_sideways {}")
    run(jac_cmd.format(jac_tsv, jac_png, region, plot_args))

    # precision recall scatter plot
    acc_tsv = base + "_acc.tsv"
    acc_png = base + "_acc.png"
    acc_awk = '''awk '{if (NR!=1) print $1 "\t" $4 "\t" $3}' '''
    run("{} {} > {}".format(acc_awk, tsv_path, acc_tsv))
    acc_cmd = ("scripts/scatter.py {} --save {} --title \"{} KMER Set Accuracy\""
               " --x_label \"Recall\" --y_label \"Precision\""
               " --width 12 --height 9 --lines {}")
    run(acc_cmd.format(acc_tsv, acc_png, region, plot_args))
def make_best_calls(best_table, options):
    """ using softlinks, make a call set with best f1s from the roc.

    The links are created under "<out_dir>.best".  This is dependent on the
    call directories being obtainable from the comparison directory by
    dropping extension.

    best_table: [region][graph] --> sequence whose first element is the path
                of a tsv inside some <comp dir>/comp_tables
    options: out_dir is read to name the output directory
    """
    # only trailing slashes are dropped: strip("/") would also eat the
    # leading slash of an absolute out_dir and put the links under the
    # current working directory instead
    best_dir = options.out_dir.rstrip("/") + ".best"

    for region, graph_table in best_table.items():
        region_dir = os.path.join(best_dir, region)
        for graph, best in graph_table.items():
            # comparison directory = everything before /comp_tables
            comp_tsv_path = best[0]
            comp_tsv_path = comp_tsv_path[:comp_tsv_path.find("/comp_tables")]
            call_base_path = os.path.splitext(comp_tsv_path)[0]
            call_path = os.path.join(call_base_path, region, graph)
            graph_dir = os.path.join(region_dir, graph)

            # gatk3 and platypus: we just link in their vcf since they
            # don't have call directory
            if graph in ["gatk3", "platypus"]:
                robust_makedirs(graph_dir)
            else:
                robust_makedirs(region_dir)
                os.system("ln -fs {} {}".format(
                    os.path.abspath(call_path),
                    os.path.abspath(region_dir)))

            # link in the preprocessed vcf from the comp dir to the same
            # directory
            comp_path = os.path.join(call_base_path + ".comp")
            vcf_dir = os.path.join(comp_path, "preprocessed_vcfs", region)
            for pvcf in glob.glob(
                    os.path.join(vcf_dir, "*_{}.vcf".format(graph))):
                link_name = os.path.basename(pvcf).replace(
                    graph, "sample_preprocessed")
                os.system("ln -fs {} {}".format(
                    os.path.abspath(pvcf),
                    os.path.abspath(os.path.join(graph_dir, link_name))))

            # link in the truth while we're at it
            for pvcf in glob.glob(os.path.join(vcf_dir, "*_platvcf*.vcf")):
                os.system("ln -fs {} {}".format(
                    os.path.abspath(pvcf), os.path.abspath(graph_dir)))
def main(args):
    """ Check the input gams, run the Toil workflow, then write tables.

    Every input gam path must have at least three "/"-separated components
    and end in ".gam".  Unless options.only_summary is set, the workflow
    rooted at compute_all_indexes is run and an Exception is raised when it
    reports failed jobs.  The snp count and graph size tables are written
    at the end.
    """
    options = parse_args(args)
    RealTimeLogger.start_master()

    for gam in options.in_gams:
        well_formed = (len(gam.split("/")) >= 3 and
                       os.path.splitext(gam)[1] == ".gam")
        if not well_formed:
            raise RuntimeError("Input gam paths must be of the form "
                               ".../<alg>/<reads>/<filename>.gam")

    robust_makedirs(json_out_path(options))
    robust_makedirs(compare_out_path(options))

    # Make a root job
    root_job = Job.wrapJobFn(compute_all_indexes, options,
                             cores=1, memory="2G", disk=0)

    # Run it (unless only the summary was asked for) and see how many jobs
    # fail
    failed_jobs = 0
    if not options.only_summary:
        failed_jobs = Job.Runner.startToil(root_job, options)
    if failed_jobs > 0:
        raise Exception("{} jobs failed!".format(failed_jobs))

    RealTimeLogger.stop_master()

    # make some tables from the json comparison output
    # (dist_table and acc_table are currently disabled)
    snp_count_table(options)
    graph_size_table(options)
def plot_kmer_comp(tsv_path, options):
    """ Plot a kmer comparison table.

    Two plots are written to <comp_dir>/comp_plots, named after the input
    table: a boxplot (scripts/boxplot.py) of column 2, labelled as the
    Jaccard index, and a scatter plot (scripts/scatter.py) of columns 4 and
    3, labelled as recall and precision.  awk drops the header row when
    extracting the columns.  The input file name is expected to end in
    ...-<region>-<sample>; plot titles start with the upper-cased region.
    """
    plot_dir = os.path.join(options.comp_dir, "comp_plots")
    robust_makedirs(plot_dir)
    stem = os.path.basename(os.path.splitext(tsv_path)[0])
    base = os.path.join(plot_dir, stem)
    name_tokens = stem.split("-")
    sample = name_tokens[-1].upper()
    region = name_tokens[-2].upper()
    plot_args = " ".join(PLOT_PARAMS)

    # jaccard boxplot
    jac_tsv = base + "_jac.tsv"
    jac_png = base + "_jac.png"
    jac_awk = '''awk '{if (NR!=1) print $1 "\t" $2}' '''
    run("{} {} > {}".format(jac_awk, tsv_path, jac_tsv))
    jac_cmd = ("scripts/boxplot.py {} --save {} --title \"{} KMER Set Jaccard\""
               " --x_label \"Graph\" --y_label \"Jaccard Index\" --x_sideways {}")
    run(jac_cmd.format(jac_tsv, jac_png, region, plot_args))

    # precision recall scatter plot
    acc_tsv = base + "_acc.tsv"
    acc_png = base + "_acc.png"
    acc_awk = '''awk '{if (NR!=1) print $1 "\t" $4 "\t" $3}' '''
    run("{} {} > {}".format(acc_awk, tsv_path, acc_tsv))
    acc_cmd = ("scripts/scatter.py {} --save {} --title \"{} KMER Set Accuracy\""
               " --x_label \"Recall\" --y_label \"Precision\""
               " --width 12 --height 9 --lines {}")
    run(acc_cmd.format(acc_tsv, acc_png, region, plot_args))
def temp_path(options, prefix="tmp", ext="", length=6):
    """ Return a randomly named path under <out_dir>/temp.

    robust_makedirs is called on the temp directory first.  The file name
    is prefix, followed by `length` random uppercase letters / digits,
    followed by ext.  The file itself is not created here.
    """
    temp_dir = os.path.join(options.out_dir, "temp")
    robust_makedirs(temp_dir)
    alphabet = string.ascii_uppercase + string.digits
    tag = "".join(random.choice(alphabet) for _ in range(length))
    file_name = prefix + tag + ext
    return os.path.join(temp_dir, file_name)
def out_base_path(tag, label, extension):
    """ Build (and make the directory for) an output path for one plot file.

    Uses out_dir, out_name, sample and region from the enclosing scope.
    .tsv files all go to the "tsv" subdirectory of out_dir, everything else
    to a subdirectory named after tag.  The file name is out_name with its
    last "-"-separated token replaced by <sample>-<tag>-<region>, followed
    by _<label><extension>.
    """
    subdir = "tsv" if extension == ".tsv" else tag
    name_head = "-".join(out_name.split("-")[:-1])
    file_stem = name_head + "-{}-{}-".format(sample, tag) + region
    path = os.path.join(out_dir, subdir, file_stem) + "_" + label + extension
    robust_makedirs(os.path.dirname(path))
    return path
def main(args):
    """ Merge the vcfs found in comp_dir into trio vcfs and run the
    mendelian checks on them. """
    options = parse_args(args)
    RealTimeLogger.start_master()
    robust_makedirs(options.out_dir)

    # index the comparison vcfs, merge them per graph, then check them
    vcf_index = munge_vcf_results(options.comp_dir)
    merged_index = make_trio_vcfs(vcf_index, options)
    do_mendel(merged_index, options)
def main(args):
    """ Write one call_stats_<region>_<sample>.tsv into out_dir for every
    region / sample found in the vcfeval results of comp_dir. """
    options = parse_args(args)
    robust_makedirs(options.out_dir)

    counts_table = do_all_counts(munge_vcfeval_results(options.comp_dir),
                                 options)

    for region, sample_table in counts_table.items():
        for sample, graph_table in sample_table.items():
            tsv = counts_tsv(graph_table, options)
            stats_name = "call_stats_{}_{}.tsv".format(region, sample)
            stats_path = os.path.join(options.out_dir, stats_name)
            with open(stats_path, "w") as stats_file:
                stats_file.write(tsv)
def make_trio_vcfs(vcfmap, options):
    """ merge up samples into same vcf using rtg
    return index of merged files

    For every region and graph, each sample's vcf is filtered with bcftools
    into snp, indel and all call sets (PASS or unfiltered records only,
    optionally restricted to options.clip, normalized against
    options.chrom_fa_path), compressed and indexed.  Graphs with at least
    three samples then get one merged vcf per kind from rtg vcfmerge.
    options.ped is also written to <out_dir>/predigree.ped.

    vcfmap: [region][sample][graph] --> vcf path
    returns: [region][graph][kind] --> merged vcf.gz path, where kind is
             "snp", "indel" or "all"
    """
    robust_makedirs(options.out_dir)
    ped_file = os.path.join(options.out_dir, "predigree.ped")
    with open(ped_file, "w") as f:
        f.write(options.ped + "\n")

    mergetable = dict()
    for region, rd in vcfmap.items():
        mergetable[region] = dict()
        region_dir = os.path.join(options.out_dir, "trio_vcfs", region)
        robust_makedirs(region_dir)
        work_dir = os.path.join(region_dir, "input_vcf")
        merge_dir = os.path.join(region_dir, "merged_vcf")

        # round up all samples for graph
        bygraph = dict()
        for sample, sd in rd.items():
            for graph, pvcf in sd.items():
                bygraph.setdefault(graph, dict())[sample] = pvcf

        # make a merged vcf for each graph
        for graph, sd in bygraph.items():
            input_vcfs = {"snp": [], "indel": [], "all": []}
            robust_makedirs(work_dir)
            robust_makedirs(merge_dir)
            for sample, pvcf in sd.items():
                for kind in input_vcfs.keys():
                    filter_vcf = os.path.join(
                        work_dir, "{}_{}_{}.vcf".format(graph, sample, kind))
                    # compare by value: "is" on strings only works when the
                    # interpreter happens to intern both operands
                    if kind == "snp":
                        vstr = "-v snps,mnps"
                    elif kind == "indel":
                        vstr = "-V snps,mnps"
                    else:
                        vstr = ""
                    if options.clip is not None:
                        vstr += " -R {}".format(options.clip)
                    run("bcftools view {} -f PASS,. {} | bcftools norm - -f {} > {}".format(
                        pvcf, vstr, options.chrom_fa_path, filter_vcf))
                    run("bgzip -f {}".format(filter_vcf))
                    run("tabix -f -p vcf {}.gz".format(filter_vcf))
                    input_vcfs[kind].append("{}.gz".format(filter_vcf))

            # only merge when there are at least three samples and every
            # kind has one input per sample
            n_samples = len(sd)
            complete = all(len(vcfs) == n_samples
                           for vcfs in input_vcfs.values())
            if n_samples >= 3 and complete:
                mergetable[region][graph] = dict()
                for kind in input_vcfs.keys():
                    out_vcf = os.path.join(
                        merge_dir, "{}_{}_merged.vcf.gz".format(graph, kind))
                    run("rm -f {}".format(out_vcf))
                    run("rtg vcfmerge {} -o {}".format(
                        " ".join(input_vcfs[kind]), out_vcf),
                        fail_hard=True)
                    mergetable[region][graph][kind] = out_vcf

    return mergetable
def compute_kmer_comparison(job, graph1, graph2, options):
    """ Run vg compare between two graphs.

    The index of both graphs must already exist.  The comparison is skipped
    when its output file exists, unless options.overwrite is set.
    """
    out_path = comp_path(graph1, graph2, options)
    for graph in (graph1, graph2):
        assert os.path.exists(index_path(graph, options))

    if not options.overwrite and os.path.exists(out_path):
        return

    robust_makedirs(os.path.dirname(out_path))
    threads = min(options.vg_cores, 2)
    os.system("vg compare {} {} -t {} > {}".format(
        graph1, graph2, threads, out_path))
def make_trio_vcfs(vcfmap, options):
    """ merge up samples into same vcf using rtg
    return index of merged files

    For every region and graph, each sample's vcf is filtered with bcftools
    into snp, indel and all call sets (PASS or unfiltered records only,
    optionally restricted to options.clip, normalized against
    options.chrom_fa_path), compressed and indexed.  Graphs with at least
    three samples then get one merged vcf per kind from rtg vcfmerge.
    options.ped is also written to <out_dir>/predigree.ped.

    vcfmap: [region][sample][graph] --> vcf path
    returns: [region][graph][kind] --> merged vcf.gz path, where kind is
             "snp", "indel" or "all"
    """
    robust_makedirs(options.out_dir)
    ped_file = os.path.join(options.out_dir, "predigree.ped")
    with open(ped_file, "w") as f:
        f.write(options.ped + "\n")

    mergetable = dict()
    for region, rd in vcfmap.items():
        mergetable[region] = dict()
        region_dir = os.path.join(options.out_dir, "trio_vcfs", region)
        robust_makedirs(region_dir)
        work_dir = os.path.join(region_dir, "input_vcf")
        merge_dir = os.path.join(region_dir, "merged_vcf")

        # round up all samples for graph
        bygraph = dict()
        for sample, sd in rd.items():
            for graph, pvcf in sd.items():
                bygraph.setdefault(graph, dict())[sample] = pvcf

        # make a merged vcf for each graph
        for graph, sd in bygraph.items():
            input_vcfs = {"snp": [], "indel": [], "all": []}
            robust_makedirs(work_dir)
            robust_makedirs(merge_dir)
            for sample, pvcf in sd.items():
                for kind in input_vcfs.keys():
                    filter_vcf = os.path.join(
                        work_dir, "{}_{}_{}.vcf".format(graph, sample, kind))
                    # compare by value: "is" on strings only works when the
                    # interpreter happens to intern both operands
                    if kind == "snp":
                        vstr = "-v snps,mnps"
                    elif kind == "indel":
                        vstr = "-V snps,mnps"
                    else:
                        vstr = ""
                    if options.clip is not None:
                        vstr += " -R {}".format(options.clip)
                    run("bcftools view {} -f PASS,. {} | bcftools norm - -f {} > {}".format(
                        pvcf, vstr, options.chrom_fa_path, filter_vcf))
                    run("bgzip -f {}".format(filter_vcf))
                    run("tabix -f -p vcf {}.gz".format(filter_vcf))
                    input_vcfs[kind].append("{}.gz".format(filter_vcf))

            # only merge when there are at least three samples and every
            # kind has one input per sample
            n_samples = len(sd)
            complete = all(len(vcfs) == n_samples
                           for vcfs in input_vcfs.values())
            if n_samples >= 3 and complete:
                mergetable[region][graph] = dict()
                for kind in input_vcfs.keys():
                    out_vcf = os.path.join(
                        merge_dir, "{}_{}_merged.vcf.gz".format(graph, kind))
                    run("rm -f {}".format(out_vcf))
                    run("rtg vcfmerge {} -o {}".format(
                        " ".join(input_vcfs[kind]), out_vcf),
                        fail_hard=True)
                    mergetable[region][graph][kind] = out_vcf

    return mergetable
def main(args):
    """ Write one call_stats_<region>_<sample>.tsv into out_dir for every
    region / sample found in the vcfeval results of comp_dir. """
    options = parse_args(args)
    robust_makedirs(options.out_dir)

    evalmap = munge_vcfeval_results(options.comp_dir)
    counts_table = do_all_counts(evalmap, options)

    for region, sample_table in counts_table.items():
        for sample, graph_table in sample_table.items():
            table_text = counts_tsv(graph_table, options)
            out_path = os.path.join(
                options.out_dir,
                "call_stats_{}_{}.tsv".format(region, sample))
            with open(out_path, "w") as out_file:
                out_file.write(table_text)
def do_mendel(mergetable, options):
    """ Run rtg mendelian on all our merged vcfs.

    mergetable: [region][graph][kind] --> merged vcf path
    For each merged vcf the annotated, consistent and inconsistent vcfs and
    the captured stdout go to <out_dir>/mendel/<region>/<graph>.  The value
    scrape_mendel extracts from that stdout is tabulated per graph in
    <out_dir>/mendel-<region>.tsv; rows containing a None are left out.
    """
    header = ["graph", "all", "snp", "indel"]
    sdf_path = os.path.join(options.comp_dir, "chrom.sdf")
    ped_path = os.path.join(options.out_dir, "predigree.ped")
    mendel_cmd = ("rtg mendelian -l -i {} -t {} --pedigree {} --output {}"
                  " --output-consistent {} --output-inconsistent {} > {}")

    for region, graph_table in mergetable.items():
        rows = []
        for graph, mergefiles in graph_table.items():
            annot_dir = os.path.join(options.out_dir, "mendel", region, graph)
            robust_makedirs(annot_dir)

            concordance = dict()
            for kind, mergefile in mergefiles.items():
                out_vcf = os.path.join(
                    annot_dir, "mendel_{}.vcf.gz".format(kind))
                con_vcf = os.path.join(
                    annot_dir, "consistent_{}.vcf.gz".format(kind))
                incon_vcf = os.path.join(
                    annot_dir, "inconsistent_{}.vcf.gz".format(kind))
                out_stdout = os.path.join(
                    annot_dir, "mendel_{}.stdout".format(kind))
                run(mendel_cmd.format(mergefile, sdf_path, ped_path, out_vcf,
                                      con_vcf, incon_vcf, out_stdout))
                concordance[kind] = scrape_mendel(out_stdout)

            # one row per graph, in header order
            rows.append([graph] + [concordance[kind] for kind in header[1:]])

        # write the tsv for this region
        tsv_path = os.path.join(options.out_dir,
                                "mendel-{}.tsv".format(region))
        with open(tsv_path, "w") as f:
            f.write("\t".join(header) + "\n")
            for row in rows:
                if None in row:
                    continue
                f.write("\t".join(str(s) for s in row) + "\n")
def compute_linear_variants(job, input_gam, options):
    """ Project to bam, then run samtools to call some variants.

    Nothing is done unless scripts/vgHasPath.sh reports a "ref" path in the
    graph of input_gam.  Otherwise the gam is surjected to a sorted bam,
    variants are called from it into a bgzipped, tabix-indexed vcf, and a
    vg graph is constructed from that vcf.  Each step is redone when the
    step before it was redone or when its own output is missing;
    options.overwrite forces all of them.
    """
    input_graph_path = graph_path(input_gam, options)
    input_index_path = index_path(input_graph_path, options)

    # can only do this if there is a "ref" path in the vg graph
    res_path = temp_path(options)
    run("scripts/vgHasPath.sh {} {} > {}".format(
        input_graph_path, "ref", res_path))
    with open(res_path) as res_file:
        has_ref = res_file.read()[0] == "1"
    run("rm {}".format(res_path))
    if not has_ref:
        return

    surject_path = projected_bam_path(input_gam, options)
    out_vcf_path = linear_vcf_path(input_gam, options)
    out_vg_path = linear_vg_path(input_gam, options)
    fasta_path = ref_path(input_gam, options)

    do_surject = options.overwrite or not os.path.isfile(surject_path)
    do_vcf = do_surject or not os.path.isfile(out_vcf_path + ".gz")
    do_vg = do_vcf or not os.path.isfile(out_vg_path)

    if do_surject:
        robust_makedirs(os.path.dirname(surject_path))
        prefix_path = temp_path(options, ".prefix")
        # surject to reference path (name hardcoded to ref for now)
        surject_cmd = "vg surject -d {} -p {} -b {} -t {} | samtools sort -o - {}> {}"
        run(surject_cmd.format(input_index_path, "ref", input_gam,
                               options.vg_cores, prefix_path, surject_path),
            timeout_sec=options.timeout,
            timeout_dep=surject_path)
        run("rm -f {}".format(prefix_path))

    if do_vcf:
        # todo: we assume that all graphs have same reference fasta, here.
        # this is false for, ex, simons which uses grchg37 instead of 38.
        # create pileup in bcf using samtools
        # http://samtools.sourceforge.net/mpileup.shtml
        assert os.path.isfile(fasta_path)
        robust_makedirs(os.path.dirname(out_vcf_path))
        call_cmd = "samtools mpileup -I -u -t DP -f {} {} | bcftools call -m -V indels - > {}"
        run(call_cmd.format(fasta_path, surject_path, out_vcf_path))
        # make compressed index
        run("bgzip -f {}".format(out_vcf_path))
        run("tabix -f -p vcf {}.gz".format(out_vcf_path))

    if do_vg:
        # and convert back to vg...
        robust_makedirs(os.path.dirname(out_vg_path))
        run("vg construct -v {}.gz -r {} -t {} > {}".format(
            out_vcf_path, fasta_path, options.vg_cores, out_vg_path))
def compute_vg_variants(job, input_gam, options):
    """ Run vg pileup and vg call on the input.

    The pileup is recomputed when options.overwrite is set or its file is
    missing.  The sample graph (vg call) and the augmented graph
    (vg call -l) are each recomputed when the pileup was, or when their own
    output file is missing.
    """
    input_graph_path = graph_path(input_gam, options)
    out_pileup_path = pileup_path(input_gam, options)
    out_sample_vg_path = sample_vg_path(input_gam, options)
    out_augmented_vg_path = augmented_vg_path(input_gam, options)

    do_pu = options.overwrite or not os.path.isfile(out_pileup_path)
    do_call = do_pu or not os.path.isfile(out_sample_vg_path)
    do_aug = do_pu or not os.path.isfile(out_augmented_vg_path)

    if do_pu:
        RealTimeLogger.get().info("Computing Variants for {} {}".format(
            input_graph_path, input_gam))
        robust_makedirs(os.path.dirname(out_pileup_path))
        run("vg pileup {} {} -t {} > {}".format(
            input_graph_path, input_gam, options.vg_cores, out_pileup_path))

    # sample graph first, then the augmented graph (-l)
    call_steps = (
        (do_call, out_sample_vg_path,
         "vg call {} {} -r 0.001 -d 50 -e 150 -s 25 -t {} | vg ids -c - | vg ids -s - > {}"),
        (do_aug, out_augmented_vg_path,
         "vg call {} {} -r 0.001 -d 50 -e 150 -s 25 -t {} -l | vg ids -c - | vg ids -s - > {}"),
    )
    for needed, out_path, template in call_steps:
        if not needed:
            continue
        robust_makedirs(os.path.dirname(out_path))
        run(template.format(input_graph_path, out_pileup_path,
                            options.vg_cores, out_path))
def make_best_calls(best_table, options):
    """ using softlinks, make a call set with best f1s from the roc.

    The links are created under "<out_dir>.best".  This is dependent on the
    call directories being obtainable from the comparison directory by
    dropping extension.

    best_table: [region][graph] --> sequence whose first element is the path
                of a tsv inside some <comp dir>/comp_tables
    options: out_dir is read to name the output directory
    """
    # only trailing slashes are dropped: strip("/") would also eat the
    # leading slash of an absolute out_dir and put the links under the
    # current working directory instead
    best_dir = options.out_dir.rstrip("/") + ".best"

    for region, graph_table in best_table.items():
        region_dir = os.path.join(best_dir, region)
        for graph, best in graph_table.items():
            # comparison directory = everything before /comp_tables
            comp_tsv_path = best[0]
            comp_tsv_path = comp_tsv_path[:comp_tsv_path.find("/comp_tables")]
            call_base_path = os.path.splitext(comp_tsv_path)[0]
            call_path = os.path.join(call_base_path, region, graph)
            graph_dir = os.path.join(region_dir, graph)

            # gatk3 and platypus: we just link in their vcf since they
            # don't have call directory
            if graph in ["gatk3", "platypus"]:
                robust_makedirs(graph_dir)
            else:
                robust_makedirs(region_dir)
                os.system("ln -fs {} {}".format(
                    os.path.abspath(call_path),
                    os.path.abspath(region_dir)))

            # link in the preprocessed vcf from the comp dir to the same
            # directory
            comp_path = os.path.join(call_base_path + ".comp")
            vcf_dir = os.path.join(comp_path, "preprocessed_vcfs", region)
            for pvcf in glob.glob(
                    os.path.join(vcf_dir, "*_{}.vcf".format(graph))):
                link_name = os.path.basename(pvcf).replace(
                    graph, "sample_preprocessed")
                os.system("ln -fs {} {}".format(
                    os.path.abspath(pvcf),
                    os.path.abspath(os.path.join(graph_dir, link_name))))

            # link in the truth while we're at it
            for pvcf in glob.glob(os.path.join(vcf_dir, "*_platvcf*.vcf")):
                os.system("ln -fs {} {}".format(
                    os.path.abspath(pvcf), os.path.abspath(graph_dir)))
def do_mendel(mergetable, options):
    """ Run rtg mendelian on all our merged vcfs.

    mergetable: [region][graph][kind] --> merged vcf path
    For each merged vcf the annotated, consistent and inconsistent vcfs and
    the captured stdout go to <out_dir>/mendel/<region>/<graph>.  The value
    scrape_mendel extracts from that stdout is tabulated per graph in
    <out_dir>/mendel-<region>.tsv; rows containing a None are left out.
    """
    header = ["graph", "all", "snp", "indel"]
    sdf_path = os.path.join(options.comp_dir, "chrom.sdf")
    ped_path = os.path.join(options.out_dir, "predigree.ped")
    mendel_cmd = ("rtg mendelian -l -i {} -t {} --pedigree {} --output {}"
                  " --output-consistent {} --output-inconsistent {} > {}")

    for region, graph_table in mergetable.items():
        rows = []
        for graph, mergefiles in graph_table.items():
            annot_dir = os.path.join(options.out_dir, "mendel", region, graph)
            robust_makedirs(annot_dir)

            concordance = dict()
            for kind, mergefile in mergefiles.items():
                out_vcf = os.path.join(
                    annot_dir, "mendel_{}.vcf.gz".format(kind))
                con_vcf = os.path.join(
                    annot_dir, "consistent_{}.vcf.gz".format(kind))
                incon_vcf = os.path.join(
                    annot_dir, "inconsistent_{}.vcf.gz".format(kind))
                out_stdout = os.path.join(
                    annot_dir, "mendel_{}.stdout".format(kind))
                run(mendel_cmd.format(mergefile, sdf_path, ped_path, out_vcf,
                                      con_vcf, incon_vcf, out_stdout))
                concordance[kind] = scrape_mendel(out_stdout)

            # one row per graph, in header order
            rows.append([graph] + [concordance[kind] for kind in header[1:]])

        # write the tsv for this region
        tsv_path = os.path.join(options.out_dir,
                                "mendel-{}.tsv".format(region))
        with open(tsv_path, "w") as f:
            f.write("\t".join(header) + "\n")
            for row in rows:
                if None in row:
                    continue
                f.write("\t".join(str(s) for s in row) + "\n")
def compute_snp1000g_baseline(job, input_gam, platinum, filter_indels, options):
    """ Make a 1000 genomes sample graph by filtering the vcf.

    The region vcf (from options.g1kvcf_path, or from options.platinum_path
    when platinum is not False) is filtered down to the sample of
    input_gam, optionally dropping indels first, then sorted, compressed
    and indexed.  A vg graph restricted to the region's bed interval is
    constructed from it.  Nothing happens for alignments to graphs other
    than refonly, for non-platinum samples when platinum is True, or when
    grep does not find the sample in the vcf.
    """
    # there is only one g1vcf graph per region per sample
    # this function is also going to get called once for each graph type
    # so we hack here to only run on refonly graphs (arbitrary choice)
    if alignment_graph_tag(input_gam, options) != "refonly":
        return
    sample = alignment_sample_tag(input_gam, options)
    if platinum is True and sample not in options.platinum_samples.split(","):
        return

    region_name = alignment_region_tag(input_gam, options).upper()
    if platinum is False:
        g1kvcf_path = os.path.join(options.g1kvcf_path, region_name + ".vcf")
    else:
        g1kvcf_path = os.path.join(options.platinum_path, sample,
                                   region_name + ".vcf")
    g1kbed_path = os.path.join(options.g1kvcf_path, region_name + ".bed")

    filter_vcf_path = g1k_vcf_path(input_gam, platinum, filter_indels, options)
    filter_fa_path = g1k_fa_path(input_gam, platinum, filter_indels, options)
    filter_vg_path = g1k_vg_path(input_gam, platinum, filter_indels, options)
    fasta_path = options.chrom_fa_path

    do_filter = options.overwrite or not os.path.isfile(filter_vcf_path + ".gz")
    do_construct = do_filter or not os.path.isfile(filter_vg_path)

    # make sure we're dealing with a sample that's in the vcf
    if do_filter or do_construct:
        grep_cmd = "grep {} {} | wc -l".format(sample, g1kvcf_path)
        p = subprocess.Popen(grep_cmd, shell=True, stdout=subprocess.PIPE,
                             stderr=sys.stderr, bufsize=-1)
        output, _ = p.communicate()
        assert p.wait() == 0
        if int(output) == 0:
            do_filter = False
            do_construct = False

    # make filtered compressed vcf for this sample
    if do_filter:
        robust_makedirs(os.path.dirname(filter_vcf_path))
        filter_input_path = g1kvcf_path
        if filter_indels is True:
            filter_input_path = filter_vcf_path + ".in"
            run("scripts/vcfFilterIndels.py {} > {}".format(
                g1kvcf_path, filter_input_path), fail_hard=True)

        run("scripts/vcfFilterSample.py {} {} {} {} {}".format(
            filter_input_path, fasta_path, sample, filter_vcf_path,
            filter_fa_path), fail_hard=True)
        run("scripts/vcfsort {} > {}.sort ; mv {}.sort {}".format(
            filter_vcf_path, filter_vcf_path, filter_vcf_path,
            filter_vcf_path))
        run("bgzip -f {}".format(filter_vcf_path), fail_hard=True)
        run("tabix -f -p vcf {}.gz".format(filter_vcf_path), fail_hard=True)

    # load it into a vg graph
    if do_construct:
        with open(g1kbed_path) as bed_file:
            bed_fields = bed_file.readline().split()
        # convert from bed to vcf coordinates by adding one to start
        contig = bed_fields[0]
        start = int(bed_fields[1]) + 1
        end = int(bed_fields[2])
        run("vg construct -v {}.gz -r {} -t {} -R {}:{}-{} > {}".format(
            filter_vcf_path, filter_fa_path, options.vg_cores,
            contig, start, end, filter_vg_path), fail_hard=True)
def compute_vg_variants(job, input_gam, options):
    """ run vg pileup and vg call on the input

    The gam is first filtered with vg filter.  With options.genotype set,
    the filtered gam is indexed and vg genotype writes the sample vcf;
    otherwise vg pileup and vg call write it (vg call also gets the
    augmented graph path via -A).  With options.surject set, the filtered
    gam is additionally surjected to a bam, whose header and coordinates
    are then rewritten to chromosome coordinates using the region's bed
    file.  Steps whose output already exists are skipped unless
    options.overwrite is set.  The vcf and bam are only produced when the
    graph has a path named "ref" or named after the bed contig.
    """
    # Move to the appropriate working directory from wherever Toil dropped us
    os.chdir(options.cwd)

    input_graph_path = graph_path(input_gam, options)
    out_pileup_path = pileup_path(input_gam, options)
    out_sample_vg_path = sample_vg_path(input_gam, options)
    out_sample_vcf_path = out_sample_vg_path.replace(".vg", ".vcf")
    out_sample_txt_path = sample_txt_path(input_gam, options)
    out_augmented_vg_path = augmented_vg_path(input_gam, options)
    out_gam_filter_path = gam_filter_path(input_gam, options)
    out_gam_index_path = gam_index_path(input_gam, options)
    out_bam_path = out_sample_vg_path.replace(".vg", ".bam")

    # work out which steps need to run: genotyping and pileup/call are
    # mutually exclusive, selected by options.genotype
    do_genotype = options.genotype and (options.overwrite or not os.path.isfile(out_sample_vcf_path))
    do_gam_filter= (options.overwrite or not os.path.isfile(out_gam_filter_path))
    do_gam_index = do_genotype and (do_gam_filter or options.overwrite or not os.path.isdir(out_gam_index_path))
    do_pu = not options.genotype and (options.overwrite or not os.path.isfile(out_pileup_path))
    do_call = not options.genotype and (do_pu or not os.path.isfile(out_sample_vcf_path))
    do_surject = options.surject and (options.overwrite or do_gam_filter or not os.path.isfile(out_bam_path))

    # We need an XG here for the base graph, but I haven't got time to refactor
    # to make it in the right place. So just make it here.
    temp_xg_path = job.fileStore.getLocalTempDir() + "/filter.xg"

    if do_gam_filter or do_pu:
        # Make sure we have the xg index around, which filter may need.
        run("vg index -x {} {}".format(temp_xg_path, input_graph_path), fail_hard = True)

    if do_gam_filter:
        robust_makedirs(os.path.dirname(out_pileup_path))
        run("vg filter -x {} {} {} {} > {}".format(temp_xg_path, input_gam, options.filter_opts, input_graph_path, out_gam_filter_path), fail_hard = True)

    if do_gam_index:
        robust_makedirs(os.path.dirname(out_pileup_path))
        # the index is a directory: remove any old one before rebuilding
        run("rm -rf {} ; vg index {} -N -d {}".format(out_gam_index_path, out_gam_filter_path, out_gam_index_path), fail_hard = True)

    if do_pu:
        robust_makedirs(os.path.dirname(out_pileup_path))
        run("vg pileup {} {} {} -t {} > {}".format(input_graph_path, out_gam_filter_path, options.pileup_opts, options.vg_cores, out_pileup_path), fail_hard = True)

    ref = None
    bedLength = -1
    if do_call or do_genotype or do_surject:
        robust_makedirs(os.path.dirname(out_sample_vcf_path))
        region = alignment_region_tag(input_gam, options)
        g1kbed_path = os.path.join(options.g1kvcf_path, region.upper() + ".bed")
        # first bed line gives the contig and interval of the region
        with open(g1kbed_path) as f:
            contig, offset, end = f.readline().split()[0:3]
            bedLength = int(end) - int(offset)

        # make the vcf
        # can only do this if there is a "ref" path in the vg graph
        # (a path named after the contig is accepted as well)
        ref = None
        res_path = temp_path(options)
        for ref_name in ["ref", contig]:
            run("scripts/vgHasPath.sh {} {} > {}".format(input_graph_path, ref_name, res_path))
            with open(res_path) as res_file:
                if res_file.read()[0] == "1":
                    ref = ref_name
                    break
        run("rm {}".format(res_path))

        if ref is not None:
            if do_genotype:
                run("vg genotype {} {} -S -pv -q -i -C -o {} -r {} -c {} -s {} -t {} > {} 2> {}".format(input_graph_path, out_gam_index_path, offset, ref, contig, alignment_sample_tag(input_gam, options), options.vg_cores, out_sample_vcf_path, out_sample_vcf_path.replace(".vcf", ".vcf.stderr")), fail_hard = True)

            if do_call:
                run("vg call {} {} {} -t {} -o {} -r {} -c {} -S {} -A {} > {} 2> {}".format(input_graph_path, out_pileup_path, options.call_opts, options.vg_cores, offset, ref, contig, alignment_sample_tag(input_gam, options), out_augmented_vg_path,
                    out_sample_vg_path.replace(".vg", ".vcf"), out_sample_vg_path.replace(".vg", ".vcf.stderr")), fail_hard = True)

            if do_surject:
                # build a graph index next to the bam, then surject the
                # filtered gam onto the reference path
                run("vg index {} -k {} -e {} -s -d {}.index -t {}".format(input_graph_path, 20, 5, os.path.join(os.path.dirname(out_bam_path), "graph"), options.vg_cores), fail_hard = True)
                run("vg surject {} -t {} -p {} -b -d {}.index > {}".format(out_gam_filter_path, options.vg_cores, ref, os.path.join(os.path.dirname(out_bam_path), "graph"), out_bam_path), fail_hard = True)
                # fix up chromosome coordinates so we can display on browser
                if contig[0] != "c":
                    contig = "chr{}".format(contig)
                # hardcoded chromosome lengths; contigs missing from this
                # table raise a KeyError below
                contigLength = {"chr5": 181538259, "chr6": 170805979, "chr13": 114364328, "chr17": 83257441, "chr19": 58617616}
                # in header, change up the contig name and replace the bed
                # interval length with the chromosome length
                run("samtools view -H {} | sed -e \"s/{}/{}/\" | sed -e \"s/{}/{}/\" > {}.sam".format(out_bam_path, ref, contig, bedLength, contigLength[contig], out_bam_path), fail_hard = True)
                # in body, add offset and fix contig, leave in sam for now so we can debug
                run("samtools view -F 256 {} | awk -v OFS=\'\\t\' \'{{$3=\"{}\"; $4=$4+{}; $5=60; $8=$8+{}; print $0}}\' >> {}.sam".format(out_bam_path, contig, offset, offset, out_bam_path), fail_hard = True)
                # back to bam
                run("samtools view {}.sam -b -F 4 | samtools sort - --threads {} -o {}".format(out_bam_path, options.vg_cores, out_bam_path), fail_hard = True)
                # and index
                run("samtools index -b {}".format(out_bam_path), fail_hard = True)
def main(args):
    """ Merge the comp_tables of several comparison directories into one
    roc table set under options.out_dir.

    Tables with the same file name are concatenated (header kept from the
    first directory only).  Tables of the options.best_baseline /
    options.best_comp comparison also feed the average accuracy table and
    the best-call table, from which a directory of links to the best calls
    is made.  With options.smooth the merged tables are finally sorted and
    passed through smooth_table.
    """
    options = parse_args(args)

    robust_makedirs(os.path.join(options.out_dir, "comp_tables"))

    # compute average score for each roc dir in this table
    avg_table = []

    # [region][method] --> (path, f1)
    best_table = defaultdict(lambda: defaultdict(lambda: (None, -1)))

    # True while processing the first directory: its tables are copied,
    # those of later directories are appended
    first = True
    # sort the directories, assuming their names give info on their order
    # in the roc
    for comp_dir in sorted(
            options.comp_dirs)[options.skip_first:len(options.comp_dirs) -
                               options.skip_last]:
        print comp_dir
        # this can happen easily using wildcards in input
        if comp_dir == options.out_dir:
            continue
        # look through tsvs in comp_tables.
        for tsv in glob.glob(os.path.join(comp_dir, "comp_tables", "*.tsv")):
            # overwrite if first
            # strip header and append if second
            c = "cp {} ".format(
                tsv) if first is True else "tail -n +2 {} >> ".format(tsv)
            # just cat into the output directory
            os.system("{} {}".format(
                c,
                os.path.join(options.out_dir, "comp_tables",
                             os.path.basename(tsv))))
            print "{} {}".format(
                c,
                os.path.join(options.out_dir, "comp_tables",
                             os.path.basename(tsv)))

            # file names are "-"-separated: <baseline>-<comparison>-...
            tb = os.path.basename(tsv).split("-")
            if len(tb) > 2 and tb[0] == options.best_baseline and tb[
                    1] == options.best_comp:
                avg_table.append(avg_acc(tsv, options))
                update_best_table(best_table, tsv, options)

        first = False

    # make a call directory of links to the best in roc points for each graph
    make_best_calls(best_table, options)

    # write out our sompy vcf snp accuracy
    with open(
            os.path.join(
                options.out_dir,
                "{}-{}-{}_{}-avg.tsv".format(options.best_baseline,
                                             options.best_comp, options.pcol,
                                             options.rcol)), "w") as f:
        lines = sorted(avg_table)
        for line in lines:
            for tok in line:
                f.write(str(tok) + "\t")
            f.write("\n")

    # let's sort the output to make it easier to remove dead points
    if options.smooth is True:
        for tsv in glob.glob(
                os.path.join(options.out_dir, "comp_tables", "*.tsv")):
            print "smoothing {}".format(tsv)
            with open(tsv) as f:
                lines = [line for line in f]
                # keep the header, order the rest by first column, then
                # column pcol ascending, then column rcol descending
                lines = [lines[0]] + sorted(
                    lines[1:],
                    key=lambda x:
                    (x.split()[0], float(x.split()[
                        options.pcol]), 1 - float(x.split()[options.rcol])))
                # precisions can be bumpy (need to change to sensitivity?)
                # use simple smoother in the meantime
                lines = smooth_table([x.split() for x in lines], options)
            with open(tsv, "w") as f:
                for line in lines:
                    f.write(line)
def compute_vg_variants(job, input_gam, options):
    """ Run vg pileup and vg call on the input.

    The filtered gam is piled up, vg call -l writes the augmented graph and
    its text output, and glenn2vcf converts those to a vcf next to the
    sample graph path (only when the graph has a path named "ref" or named
    after the region's bed contig).  With options.sample, a sample graph is
    also produced with vg call.  Steps whose output already exists are
    skipped unless options.overwrite is set or an earlier step was redone.
    """
    input_graph_path = graph_path(input_gam, options)
    out_pileup_path = pileup_path(input_gam, options)
    out_sample_vg_path = sample_vg_path(input_gam, options)
    out_sample_txt_path = sample_txt_path(input_gam, options)
    out_augmented_vg_path = augmented_vg_path(input_gam, options)

    do_pu = options.overwrite or not os.path.isfile(out_pileup_path)
    do_call = do_pu or not os.path.isfile(out_augmented_vg_path)
    do_sample = options.sample and (
        do_pu or not os.path.isfile(out_sample_vg_path))
    do_vcf = do_call or not os.path.isfile(
        out_sample_vg_path.replace(".vg", ".vcf"))

    if do_pu:
        RealTimeLogger.get().info("Computing Variants for {} {}".format(
            input_graph_path, input_gam))
        robust_makedirs(os.path.dirname(out_pileup_path))
        pileup_cmd = "vg filter {} {} | vg pileup {} - {} -t {} > {}"
        run(pileup_cmd.format(input_gam, options.filter_opts,
                              input_graph_path, options.pileup_opts,
                              options.vg_cores, out_pileup_path),
            fail_hard=True)

    if do_call:
        robust_makedirs(os.path.dirname(out_sample_vg_path))
        call_cmd = "vg call {} {} {} -l -c {} -t {} > {}"
        run(call_cmd.format(input_graph_path, out_pileup_path,
                            options.call_opts, out_sample_txt_path,
                            options.vg_cores, out_augmented_vg_path),
            fail_hard=True)

    if do_vcf:
        region = alignment_region_tag(input_gam, options)
        g1kbed_path = os.path.join(options.g1kvcf_path,
                                   region.upper() + ".bed")
        with open(g1kbed_path) as bed_file:
            contig, offset = bed_file.readline().split()[0:2]

        # make the vcf
        # can only do this if there is a "ref" path in the vg graph
        # (a path named after the contig is accepted as well)
        ref = None
        res_path = temp_path(options)
        for ref_name in ["ref", contig]:
            run("scripts/vgHasPath.sh {} {} > {}".format(
                input_graph_path, ref_name, res_path))
            with open(res_path) as res_file:
                found = res_file.read()[0] == "1"
            if found:
                ref = ref_name
                break
        run("rm {}".format(res_path))

        if ref is not None:
            vcf_cmd = "glenn2vcf {} {} -o {} -r {} -c {} -s {} -d {} > {} 2> {}"
            run(vcf_cmd.format(out_augmented_vg_path, out_sample_txt_path,
                               offset, ref, contig,
                               alignment_sample_tag(input_gam, options),
                               options.depth,
                               out_sample_vg_path.replace(".vg", ".vcf"),
                               out_sample_vg_path.replace(".vg", ".vcf.stderr")),
                fail_hard=True)

    if do_sample:
        robust_makedirs(os.path.dirname(out_augmented_vg_path))
        sample_cmd = "vg call {} {} {} -t {} | vg ids -cs - > {}"
        run(sample_cmd.format(input_graph_path, out_pileup_path,
                              options.call_opts, options.vg_cores,
                              out_sample_vg_path),
            fail_hard=True)
def compute_vg_variants(job, input_gam, options):
    """ run vg pileup and vg call (or vg genotype) on the input, and
    optionally surject the filtered reads to a browser-ready bam

    Stages are skipped when their outputs already exist, unless
    options.overwrite is set or an upstream stage is being rerun.

    job: used only to obtain a local temp directory for the xg index
    input_gam: alignment file to filter and call variants from
    options: parsed options; reads cwd, genotype, surject, overwrite,
             filter_opts, pileup_opts, call_opts, vg_cores and g1kvcf_path
    """
    # Move to the appropriate working directory from wherever Toil dropped us
    os.chdir(options.cwd)

    input_graph_path = graph_path(input_gam, options)
    out_pileup_path = pileup_path(input_gam, options)
    out_sample_vg_path = sample_vg_path(input_gam, options)
    out_sample_vcf_path = out_sample_vg_path.replace(".vg", ".vcf")
    # NOTE(review): out_sample_txt_path is not referenced again in this function
    out_sample_txt_path = sample_txt_path(input_gam, options)
    out_augmented_vg_path = augmented_vg_path(input_gam, options)
    out_gam_filter_path = gam_filter_path(input_gam, options)
    out_gam_index_path = gam_index_path(input_gam, options)
    out_bam_path = out_sample_vg_path.replace(".vg", ".bam")

    # genotype and pileup/call are mutually exclusive ways of producing the vcf
    do_genotype = options.genotype and (options.overwrite or not os.path.isfile(out_sample_vcf_path))
    do_gam_filter = options.overwrite or not os.path.isfile(out_gam_filter_path)
    do_gam_index = do_genotype and (do_gam_filter or options.overwrite or not os.path.isdir(out_gam_index_path))
    do_pu = not options.genotype and (options.overwrite or not os.path.isfile(out_pileup_path))
    do_call = not options.genotype and (do_pu or not os.path.isfile(out_sample_vcf_path))
    do_surject = options.surject and (options.overwrite or do_gam_filter or not os.path.isfile(out_bam_path))

    # We need an XG index of the input graph here, but I haven't got time to
    # refactor to make it in the right place. So just make it here.
    temp_xg_path = job.fileStore.getLocalTempDir() + "/filter.xg"

    if do_gam_filter or do_pu:
        # Make sure we have the xg index around, which filter may need.
        run("vg index -x {} {}".format(temp_xg_path, input_graph_path), fail_hard=True)

    if do_gam_filter:
        robust_makedirs(os.path.dirname(out_pileup_path))
        run(
            "vg filter -x {} {} {} {} > {}".format(
                temp_xg_path, input_gam, options.filter_opts, input_graph_path, out_gam_filter_path
            ),
            fail_hard=True,
        )

    if do_gam_index:
        robust_makedirs(os.path.dirname(out_pileup_path))
        # the index is a directory, so clear any stale copy before rebuilding
        run(
            "rm -rf {} ; vg index {} -N -d {}".format(out_gam_index_path, out_gam_filter_path, out_gam_index_path),
            fail_hard=True,
        )

    if do_pu:
        robust_makedirs(os.path.dirname(out_pileup_path))
        run(
            "vg pileup {} {} {} -t {} > {}".format(
                input_graph_path, out_gam_filter_path, options.pileup_opts, options.vg_cores, out_pileup_path
            ),
            fail_hard=True,
        )

    ref = None
    bedLength = -1
    if do_call or do_genotype or do_surject:
        robust_makedirs(os.path.dirname(out_sample_vcf_path))
        region = alignment_region_tag(input_gam, options)
        g1kbed_path = os.path.join(options.g1kvcf_path, region.upper() + ".bed")
        # first bed line gives the contig plus the region's start and end
        with open(g1kbed_path) as f:
            contig, offset, end = f.readline().split()[0:3]
            bedLength = int(end) - int(offset)

        # make the vcf
        # can only do this if there is a "ref" path in the vg graph
        # (a path named after the contig is accepted as a fallback)
        ref = None
        res_path = temp_path(options)
        for ref_name in ["ref", contig]:
            run("scripts/vgHasPath.sh {} {} > {}".format(input_graph_path, ref_name, res_path))
            with open(res_path) as res_file:
                if res_file.read()[0] == "1":
                    ref = ref_name
                    break
        run("rm {}".format(res_path))

    if ref is not None:
        if do_genotype:
            run(
                "vg genotype {} {} -S -pv -q -i -C -o {} -r {} -c {} -s {} -t {} > {} 2> {}".format(
                    input_graph_path,
                    out_gam_index_path,
                    offset,
                    ref,
                    contig,
                    alignment_sample_tag(input_gam, options),
                    options.vg_cores,
                    out_sample_vcf_path,
                    out_sample_vcf_path.replace(".vcf", ".vcf.stderr"),
                ),
                fail_hard=True,
            )

        if do_call:
            run(
                "vg call {} {} {} -t {} -o {} -r {} -c {} -S {} -A {} > {} 2> {}".format(
                    input_graph_path,
                    out_pileup_path,
                    options.call_opts,
                    options.vg_cores,
                    offset,
                    ref,
                    contig,
                    alignment_sample_tag(input_gam, options),
                    out_augmented_vg_path,
                    out_sample_vg_path.replace(".vg", ".vcf"),
                    out_sample_vg_path.replace(".vg", ".vcf.stderr"),
                ),
                fail_hard=True,
            )

        if do_surject:
            # build the "graph.index" that vg surject reads, next to the bam
            run(
                "vg index {} -k {} -e {} -s -d {}.index -t {}".format(
                    input_graph_path, 20, 5, os.path.join(os.path.dirname(out_bam_path), "graph"), options.vg_cores
                ),
                fail_hard=True,
            )
            run(
                "vg surject {} -t {} -p {} -b -d {}.index > {}".format(
                    out_gam_filter_path,
                    options.vg_cores,
                    ref,
                    os.path.join(os.path.dirname(out_bam_path), "graph"),
                    out_bam_path,
                ),
                fail_hard=True,
            )

            # fix up chromosome coordinates so we can display on browser
            if contig[0] != "c":
                contig = "chr{}".format(contig)
            # lengths substituted into the sam header in place of the region
            # length; a contig missing from this table raises KeyError below
            contigLength = {
                "chr5": 181538259,
                "chr6": 170805979,
                "chr13": 114364328,
                "chr17": 83257441,
                "chr19": 58617616,
            }

            # in header, change up the contig name and the sequence length
            run(
                'samtools view -H {} | sed -e "s/{}/{}/" | sed -e "s/{}/{}/" > {}.sam'.format(
                    out_bam_path, ref, contig, bedLength, contigLength[contig], out_bam_path
                ),
                fail_hard=True,
            )
            # in body, add offset and fix contig, leave in sam for now so we can debug
            run(
                "samtools view -F 256 {} | awk -v OFS='\\t' '{{$3=\"{}\"; $4=$4+{}; $5=60; $8=$8+{}; print $0}}' >> {}.sam".format(
                    out_bam_path, contig, offset, offset, out_bam_path
                ),
                fail_hard=True,
            )
            # back to bam
            run(
                "samtools view {}.sam -b -F 4 | samtools sort - --threads {} -o {}".format(
                    out_bam_path, options.vg_cores, out_bam_path
                ),
                fail_hard=True,
            )
            # and index
            run("samtools index -b {}".format(out_bam_path), fail_hard=True)
def main(args):
    """ merge the comp_tables of several comparison directories into
    options.out_dir, record the best calls, and write an average table

    The input directories are processed in sorted order, after dropping
    options.skip_first from the front and options.skip_last from the back.
    Tables from the first processed directory are copied; later ones are
    appended without their header line.  When options.smooth is set, each
    merged table is sorted and passed through smooth_table.
    """
    options = parse_args(args)
    table_dir = os.path.join(options.out_dir, "comp_tables")
    robust_makedirs(table_dir)

    # compute average score for each roc dir in this table
    avg_table = []
    # [region][method] --> (path, f1)
    best_table = defaultdict(lambda: defaultdict(lambda: (None, -1)))

    # sort the directories, assuming their names give info on their order
    # in the roc
    stop = len(options.comp_dirs) - options.skip_last
    selected_dirs = sorted(options.comp_dirs)[options.skip_first:stop]

    is_first = True
    for comp_dir in selected_dirs:
        print(comp_dir)
        # this can happen easily using wildcards in input
        if comp_dir == options.out_dir:
            continue
        # look through tsvs in comp_tables.
        for tsv in glob.glob(os.path.join(comp_dir, "comp_tables", "*.tsv")):
            tsv_name = os.path.basename(tsv)
            if is_first is True:
                # overwrite if first
                action = "cp {} ".format(tsv)
            else:
                # strip header and append if second
                action = "tail -n +2 {} >> ".format(tsv)
            # just cat into the output directory
            merge_cmd = "{} {}".format(action, os.path.join(table_dir, tsv_name))
            os.system(merge_cmd)
            print(merge_cmd)

            name_toks = tsv_name.split("-")
            is_best_table = (len(name_toks) > 2
                             and name_toks[0] == options.best_baseline
                             and name_toks[1] == options.best_comp)
            if is_best_table:
                avg_table.append(avg_acc(tsv, options))
                update_best_table(best_table, tsv, options)
        is_first = False

    # make a call directory of links to the best in roc points for each graph
    make_best_calls(best_table, options)

    # write out our sompy vcf snp accuracy
    avg_name = "{}-{}-{}_{}-avg.tsv".format(options.best_baseline, options.best_comp,
                                            options.pcol, options.rcol)
    with open(os.path.join(options.out_dir, avg_name), "w") as avg_file:
        for row in sorted(avg_table):
            avg_file.write("".join(str(tok) + "\t" for tok in row))
            avg_file.write("\n")

    # let's sort the output to make it easier to remove dead points
    if options.smooth is True:
        def row_key(row):
            toks = row.split()
            return (toks[0], float(toks[options.pcol]), 1 - float(toks[options.rcol]))

        for tsv in glob.glob(os.path.join(table_dir, "*.tsv")):
            print("smoothing {}".format(tsv))
            with open(tsv) as in_file:
                rows = in_file.readlines()
            # header stays on top, everything else gets sorted
            rows = [rows[0]] + sorted(rows[1:], key=row_key)
            # precisions can be bumpy (need to change to sensitivity?)
            # use simple smoother in the meantime
            rows = smooth_table([row.split() for row in rows], options)
            with open(tsv, "w") as out_file:
                for row in rows:
                    out_file.write(row)
def main(args):
    """ merge per-region vcfs into one sorted, bgzipped and tabix-indexed
    vcf per sample

    Classic mode (options.classic) expects call_dir/SAMPLE/REGION.vcf and
    writes call_dir/SAMPLE/TOTAL.vcf (plus .gz and index).

    Otherwise expects call_dir/<REGION>/<GRAPH>/<SAMPLE>_sample.vcf and
    writes call_dir/<options.name>/<GRAPH>/<SAMPLE>_sample.vcf (plus .gz
    and index), keeping the preprocessed inputs under .../input/<SAMPLE>.
    Only graphs present in every region are merged, and a sample is skipped
    for a graph when it lacks a vcf in any region.

    Returns 0.
    """
    options = parse_args(args)

    RealTimeLogger.start_master()

    def sort_compress_index(vcf_path):
        """ sort vcf_path in place, then write vcf_path.gz and its tabix index """
        run("vcfsort {} > {}.sort".format(vcf_path, vcf_path), fail_hard=True)
        run("mv {}.sort {}".format(vcf_path, vcf_path), fail_hard=True)
        run("bgzip -c {} > {}.gz".format(vcf_path, vcf_path), fail_hard=True)
        run("tabix -f -p vcf {}.gz".format(vcf_path), fail_hard=True)

    if options.classic:
        # expect call_dir/SAMPLE/region.vcf
        region_vcf_names = ["BRCA1.vcf", "BRCA2.vcf", "SMA.vcf", "LRC_KIR.vcf", "MHC.vcf"]
        for sampledir in glob.glob(os.path.join(options.call_dir, "*")):
            if not os.path.isdir(sampledir):
                continue
            vcfs = []
            outfile = os.path.join(sampledir, "TOTAL.vcf")
            for vcf in glob.glob(os.path.join(sampledir, "*.vcf")):
                if os.path.basename(vcf) in region_vcf_names:
                    # leave the input untouched: sort into a scratch file and
                    # compress that
                    run("vcfsort {} > {}.sort".format(vcf, vcf), fail_hard=True)
                    run("bgzip -c {}.sort > {}.gz".format(vcf, vcf), fail_hard=True)
                    run("rm -f {}.sort".format(vcf))
                    run("tabix -f -p vcf {}.gz".format(vcf), fail_hard=True)
                    vcfs.append("{}.gz".format(vcf))
            if vcfs:
                run("vt cat {} > {}".format(" ".join(vcfs), outfile), fail_hard=True)
                sort_compress_index(outfile)
        return 0

    # expect call_dir/<REGION>/<GRAPH>/<SAMPLE>_sample.vcf

    # count up regions
    regions = set()
    for regiondir in glob.glob(os.path.join(options.call_dir, "*")):
        if os.path.isdir(regiondir):
            region = os.path.basename(regiondir)
            # avoid crufty directories (including outputs of previous runs of this script)
            if region in ["brca1", "brca2", "mhc", "lrc_kir", "sma"]:
                regions.add(region)
    print(regions)

    # count up graphs (that are present in every region)
    graphs = set()
    gcount = defaultdict(int)
    for region in regions:
        for graphdir in glob.glob(os.path.join(options.call_dir, region, "*")):
            if os.path.isdir(graphdir):
                gcount[os.path.basename(graphdir)] += 1
    for graph, count in gcount.items():
        if count == len(regions):
            graphs.add(graph)
    print(graphs)

    # collect samples (any sample seen for any region/graph pair)
    samples = set()
    for region in regions:
        for graph in graphs:
            for vcf in glob.glob(os.path.join(options.call_dir, region, graph, "*_sample.vcf")):
                samples.add(os.path.basename(vcf).split("_")[0])
    print(samples)

    # make our output directory
    out_dir = os.path.join(options.call_dir, options.name)
    robust_makedirs(out_dir)

    for graph in graphs:
        for sample in samples:
            vcf_files = []
            for region in regions:
                vcf = os.path.join(options.call_dir, region, graph, "{}_sample.vcf".format(sample))
                if os.path.isfile(vcf):
                    vcf_files.append((region, vcf))

            # this sample doesn't span all regions, skip it
            if len(vcf_files) < len(regions):
                print("Skipping Sample {} for Graph {}".format(sample, graph))
                continue

            # output vcf
            merge_vcf_path = os.path.join(out_dir, graph, "{}_sample.vcf".format(sample))

            # working directory for intermediates / debugging
            work_path = os.path.join(out_dir, graph, "input", sample)
            robust_makedirs(work_path)

            # preprocess all the vcfs and leave in input dir
            input_files = []
            for region, vcf in vcf_files:
                outbase = os.path.join(work_path, region)
                run("vcfsort {} > {}.vcf".format(vcf, outbase), fail_hard=True)
                run("bgzip -f {}.vcf".format(outbase))
                run("tabix -f -p vcf {}.vcf.gz".format(outbase))
                input_files.append("{}.vcf.gz".format(outbase))

            # run the merge
            run("vt cat {} > {}".format(" ".join(input_files), merge_vcf_path), fail_hard=True)

            # sort the merged vcf and make an index just in case
            sort_compress_index(merge_vcf_path)

    return 0
def compute_tree(options, mat, names):
    """ make upgma hierarchical clustering and write it as newick, png and
    graphviz dot

    options: passed to tree_path to locate the newick output; the png and
             dot files use the same path with "newick" replaced
    mat: nested mapping of distances, read as mat[name_a][name_b]
    names: labels to cluster, in matrix order
    """
    def encode(val):
        # tree constructor writes 0-distances as 1s for some reason
        # so we hack around here
        if val == 0.:
            return 1e-10
        if val == 1.:
            return 1.1
        return val

    # oops, convert to biopython matrix (lower triangle, diagonal included)
    matrix = [[encode(float(mat[names[i]][names[j]])) for j in range(i + 1)]
              for i in range(len(names))]
    dm = _DistanceMatrix(names, matrix)

    # upgma tree
    constructor = DistanceTreeConstructor()
    tree = constructor.upgma(dm)
    newick_path = tree_path(options)
    robust_makedirs(os.path.dirname(newick_path))
    Phylo.write(tree, newick_path, "newick")

    # png tree -- note : doesn't work in toil
    def leaf_label(x):
        # hide the labels of internal nodes
        if "Inner" in str(x):
            return ""
        return x
    Phylo.draw_graphviz(tree, label_func=leaf_label, node_size=1000, node_shape="s", font_size=10)
    pylab.savefig(newick_path.replace("newick", "png"))

    # graphviz
    # get networkx graph
    nxgraph = Phylo.to_networkx(tree)
    # make undirected
    nxgraph = nx.Graph(nxgraph)
    # push names to name labels
    nxgraph = nx.convert_node_labels_to_integers(nxgraph, label_attribute="label")
    for node_id in nxgraph.nodes():
        node = nxgraph.node[node_id]
        if "Inner" in str(node["label"]):
            # shrink internal nodes down to (almost) nothing
            node["label"] = "\"\""
            node["width"] = 0.001
            node["height"] = 0.001
        else:
            node["fontsize"] = 18
    for edge_id in nxgraph.edges():
        edge = nxgraph.edge[edge_id[0]][edge_id[1]]
        # in graphviz, weight means something else, so make it a label
        weight = float(edge["weight"])
        # undo hack from above.  The branches are exclusive: a weight that
        # was clamped down to 1 must not then be mistaken for the "1 means
        # 0" case and zeroed.
        if weight > 1:
            weight = 1.
        elif weight <= 1e-10 or weight == 1.:
            weight = 0.
        edge["weight"] = None
        edge["label"] = "{0:.3g}".format(float(weight) * 100.)
        edge["fontsize"] = 14
        edge["len"] = draw_len(weight)
    nx.write_dot(nxgraph, newick_path.replace("newick", "dot"))
def plot_vcf_comp(tsv_path, options):
    """ take the big vcf compare table and make precision_recall plots for
    all the categories

    tsv_path: comparison table; its basename (minus extension) is split on
              "-", the last token being used as the sample and the one
              before it as the region
    options: reads comp_dir and top, and is passed on to vcf_dist_header
             and make_max_f1_tsv

    Only the TOT, SNP and INDEL categories are plotted.  Every plotting
    command is printed and then run with os.system (exit codes ignored).
    """
    out_dir = os.path.join(options.comp_dir, "comp_plots")
    robust_makedirs(out_dir)
    out_name = os.path.basename(os.path.splitext(tsv_path)[0])
    sample = out_name.split("-")[-1].upper()
    region = out_name.split("-")[-2].upper()

    def out_base_path(tag, label, extension):
        # tsvs are all kept together in "tsv", plots go in a directory per tag
        bd = tag if extension != ".tsv" else "tsv"
        ret = os.path.join(out_dir, bd, "-".join(out_name.split("-")[:-1]) + "-{}-{}-".format(sample, tag) + region) + "_" + label + extension
        robust_makedirs(os.path.dirname(ret))
        return ret

    params = " ".join(PLOT_PARAMS)

    def shell(cmd):
        # echo the command, then run it
        print(cmd)
        os.system(cmd)

    def scatter(in_tsv, out_png, title, lo=None, hi=None, width="18", height="9"):
        # precision/recall scatter plot; lo and hi bound both axes when given
        limits = ""
        if lo is not None:
            limits = " --min_x {0} --max_x {1} --min_y {0} --max_y {1}".format(lo, hi)
        shell("scripts/scatter.py {} --save {} --title \"{}\" --x_label \"Recall\" --y_label \"Precision\" --width {} --height {} {} --lines --no_n --line_width 1.5 --marker_size 5{}".format(
            in_tsv, out_png, title, width, height, params, limits))

    def barchart(in_tsv, out_png, title, y_label, extra=""):
        # per-graph bar chart; extra is appended verbatim to the command
        shell("scripts/barchart.py {} --ascending --no_n --save {} --title \"{}\" --x_sideways --x_label \"Graph\" --y_label \"{}\" {}{}".format(
            in_tsv, out_png, title, y_label, params, extra))

    # precision recall scatter plot
    header = vcf_dist_header(options)
    # strip qual
    header = header[:-1]
    # index the stripped qual column had in the full header
    qual_idx = len(header)
    # columns come in (precision, recall) pairs; floor division keeps the
    # count an int whichever division semantics are in effect
    for i in range(len(header) // 2):
        prec_idx = 2 * i
        rec_idx = prec_idx + 1
        print("{} {} {} {}".format(prec_idx, header[prec_idx], rec_idx, header[rec_idx]))
        ptoks = header[prec_idx].split("-")
        rtoks = header[rec_idx].split("-")
        assert ptoks[1] == "Precision"
        assert rtoks[1] == "Recall"
        assert ptoks[:1] == rtoks[:1]
        comp_cat = ptoks[0]
        if comp_cat not in ["TOT", "SNP", "INDEL"]:
            continue
        label = header[prec_idx].replace("Precision", "acc")
        acc_tsv = out_base_path("pr", label, ".tsv")
        print("Make {} tsv with cols {} {}".format(label, rec_idx, prec_idx))
        # +1 to convert to awk 1-base coordinates.  +1 again since header doesnt include row_label col
        awkcmd = '''if (NR!=1) print $1 "\t" ${} "\t" ${} "\t" ${}'''.format(rec_idx + 2, prec_idx + 2, qual_idx + 2)
        awkstr = "awk \'{" + awkcmd + "}\'"
        run("{} {} > {}".format(awkstr, tsv_path, acc_tsv))
        acc_png = out_base_path("pr", label, ".png")

        title = sample.upper() + " "
        if comp_cat == "TOT":
            title += " Total Accuracy"
        else:
            title += " {} Accuracy".format(comp_cat.title())
        if region == "TOTAL":
            title += ", all regions"
        else:
            title += ", {}".format(region)

        scatter(acc_tsv, acc_png, title, "-0.01", "1.01")

        # flatten to max f1 tsv and plot as bars
        f1_tsv = out_base_path("f1bar", label, ".tsv")
        f1_png = out_base_path("f1bar", label, ".png")
        f1_pr_tsv = out_base_path("f1pr", label, ".tsv")
        f1_pr_png = out_base_path("f1pr", label, ".png")
        f1_qual_tsv = out_base_path("f1qual", label, ".tsv")
        f1_qual_png = out_base_path("f1qual", label, ".png")
        make_max_f1_tsv(acc_tsv, f1_tsv, f1_pr_tsv, f1_qual_tsv, options)
        barchart(f1_tsv, f1_png, title, "Max F1")
        scatter(f1_pr_tsv, f1_pr_png, title)
        barchart(f1_qual_tsv, f1_qual_png, title, "Quality for Max F1")

        if options.top is True:
            # top 20
            scatter(acc_tsv, acc_png.replace(".png", "_top20.png"), title, "0.798", "1.002")
            # top 20 (inset sized)
            scatter(acc_tsv, acc_png.replace(".png", "_top20_inset.png"), title, "0.796", "1.004",
                    width="11", height="5.5")
            # top 40
            scatter(acc_tsv, acc_png.replace(".png", "_top40.png"), title, "0.596", "1.004")
            # top .5 / .6 / .7 / .85 bars
            for floor, suffix in (("0.5", "_top50.png"), ("0.6", "_top60.png"),
                                  ("0.7", "_top70.png"), ("0.85", "_top85.png")):
                barchart(f1_tsv, f1_png.replace(".png", suffix), title, "Max F1", " --min " + floor)
            # top .25 / .50 / .65 f1pr scatters
            for lo, suffix in (("0.746", "_top25.png"), ("0.496", "_top50.png"),
                               ("0.646", "_top65.png")):
                scatter(f1_pr_tsv, f1_pr_png.replace(".png", suffix), title, lo, "1.004")
def out_base_path(tag, label, extension):
    """ build the output path for one plot or table and make sure its
    directory exists

    tsv files all go in a "tsv" subdirectory; anything else goes in a
    subdirectory named after tag.  Relies on out_dir, out_name, sample and
    region being available from the surrounding scope.
    """
    subdir = "tsv" if extension == ".tsv" else tag
    # out_name minus its last "-" token, then sample, tag and region
    stem = "-".join(out_name.split("-")[:-1]) + "-{}-{}-".format(sample, tag) + region
    path = os.path.join(out_dir, subdir, stem) + "_" + label + extension
    robust_makedirs(os.path.dirname(path))
    return path
def plot_vcf_comp(tsv_path, options):
    """ take the big vcf compare table and make precision_recall plots for
    all the categories

    tsv_path: comparison table; its basename (minus extension) is split on
              "-", the last token being used as the sample and the one
              before it as the region
    options: reads comp_dir and top, and is passed on to vcf_dist_header
             and make_max_f1_tsv

    Only the TOT, SNP and INDEL categories are plotted.  Every plotting
    command is printed and then run with os.system (exit codes ignored).
    """
    out_dir = os.path.join(options.comp_dir, "comp_plots")
    robust_makedirs(out_dir)
    out_name = os.path.basename(os.path.splitext(tsv_path)[0])
    sample = out_name.split("-")[-1].upper()
    region = out_name.split("-")[-2].upper()

    def out_base_path(tag, label, extension):
        # tsvs are all kept together in "tsv", plots go in a directory per tag
        bd = tag if extension != ".tsv" else "tsv"
        ret = (
            os.path.join(out_dir, bd, "-".join(out_name.split("-")[:-1]) + "-{}-{}-".format(sample, tag) + region)
            + "_"
            + label
            + extension
        )
        robust_makedirs(os.path.dirname(ret))
        return ret

    params = " ".join(PLOT_PARAMS)

    def shell(cmd):
        # echo the command, then run it
        print(cmd)
        os.system(cmd)

    def scatter(in_tsv, out_png, title, lo=None, hi=None, width="18", height="9"):
        # precision/recall scatter plot; lo and hi bound both axes when given
        limits = ""
        if lo is not None:
            limits = " --min_x {0} --max_x {1} --min_y {0} --max_y {1}".format(lo, hi)
        shell(
            'scripts/scatter.py {} --save {} --title "{}" --x_label "Recall" --y_label "Precision" --width {} --height {} {} --lines --no_n --line_width 1.5 --marker_size 5{}'.format(
                in_tsv, out_png, title, width, height, params, limits
            )
        )

    def barchart(in_tsv, out_png, title, y_label, extra=""):
        # per-graph bar chart; extra is appended verbatim to the command
        shell(
            'scripts/barchart.py {} --ascending --no_n --save {} --title "{}" --x_sideways --x_label "Graph" --y_label "{}" {}{}'.format(
                in_tsv, out_png, title, y_label, params, extra
            )
        )

    # precision recall scatter plot
    header = vcf_dist_header(options)
    # strip qual
    header = header[:-1]
    # index the stripped qual column had in the full header
    qual_idx = len(header)
    # columns come in (precision, recall) pairs; floor division keeps the
    # count an int whichever division semantics are in effect
    for i in range(len(header) // 2):
        prec_idx = 2 * i
        rec_idx = prec_idx + 1
        print("{} {} {} {}".format(prec_idx, header[prec_idx], rec_idx, header[rec_idx]))
        ptoks = header[prec_idx].split("-")
        rtoks = header[rec_idx].split("-")
        assert ptoks[1] == "Precision"
        assert rtoks[1] == "Recall"
        assert ptoks[:1] == rtoks[:1]
        comp_cat = ptoks[0]
        if comp_cat not in ["TOT", "SNP", "INDEL"]:
            continue
        label = header[prec_idx].replace("Precision", "acc")
        acc_tsv = out_base_path("pr", label, ".tsv")
        print("Make {} tsv with cols {} {}".format(label, rec_idx, prec_idx))
        # +1 to convert to awk 1-base coordinates.  +1 again since header doesnt include row_label col
        awkcmd = """if (NR!=1) print $1 "\t" ${} "\t" ${} "\t" ${}""".format(rec_idx + 2, prec_idx + 2, qual_idx + 2)
        awkstr = "awk '{" + awkcmd + "}'"
        run("{} {} > {}".format(awkstr, tsv_path, acc_tsv))
        acc_png = out_base_path("pr", label, ".png")

        title = sample.upper() + " "
        if comp_cat == "TOT":
            title += " Total Accuracy"
        else:
            title += " {} Accuracy".format(comp_cat.title())
        if region == "TOTAL":
            title += ", all regions"
        else:
            title += ", {}".format(region)

        scatter(acc_tsv, acc_png, title, "-0.01", "1.01")

        # flatten to max f1 tsv and plot as bars
        f1_tsv = out_base_path("f1bar", label, ".tsv")
        f1_png = out_base_path("f1bar", label, ".png")
        f1_pr_tsv = out_base_path("f1pr", label, ".tsv")
        f1_pr_png = out_base_path("f1pr", label, ".png")
        f1_qual_tsv = out_base_path("f1qual", label, ".tsv")
        f1_qual_png = out_base_path("f1qual", label, ".png")
        make_max_f1_tsv(acc_tsv, f1_tsv, f1_pr_tsv, f1_qual_tsv, options)
        barchart(f1_tsv, f1_png, title, "Max F1")
        scatter(f1_pr_tsv, f1_pr_png, title)
        barchart(f1_qual_tsv, f1_qual_png, title, "Quality for Max F1")

        if options.top is True:
            # top 20
            scatter(acc_tsv, acc_png.replace(".png", "_top20.png"), title, "0.798", "1.002")
            # top 20 (inset sized)
            scatter(
                acc_tsv, acc_png.replace(".png", "_top20_inset.png"), title, "0.796", "1.004", width="11", height="5.5"
            )
            # top 40
            scatter(acc_tsv, acc_png.replace(".png", "_top40.png"), title, "0.596", "1.004")
            # top .5 / .6 / .7 / .85 bars
            bar_floors = (
                ("0.5", "_top50.png"),
                ("0.6", "_top60.png"),
                ("0.7", "_top70.png"),
                ("0.85", "_top85.png"),
            )
            for floor, suffix in bar_floors:
                barchart(f1_tsv, f1_png.replace(".png", suffix), title, "Max F1", " --min " + floor)
            # top .25 / .50 / .65 f1pr scatters
            pr_floors = (
                ("0.746", "_top25.png"),
                ("0.496", "_top50.png"),
                ("0.646", "_top65.png"),
            )
            for lo, suffix in pr_floors:
                scatter(f1_pr_tsv, f1_pr_png.replace(".png", suffix), title, lo, "1.004")
def compute_linear_variants(job, input_gam, options):
    """ project to bam, then run samtools to call some variants

    Does nothing beyond the path check unless the graph has a path named
    "ref".  Otherwise surjects the gam to a sorted bam, calls variants into
    a bgzipped + tabix-indexed vcf, and builds a vg graph from that vcf.
    Each stage is skipped when its output exists, unless options.overwrite
    is set or an earlier stage is being rerun.
    """
    # Move to the appropriate working directory from wherever Toil dropped us
    os.chdir(options.cwd)

    input_graph_path = graph_path(input_gam, options)
    input_index_path = index_path(input_graph_path, options)

    # can only do this if there is a "ref" path in the vg graph
    check_path = temp_path(options)
    run("scripts/vgHasPath.sh {} {} > {}".format(input_graph_path, "ref", check_path))
    with open(check_path) as check_file:
        has_ref = check_file.read()[0] == "1"
    run("rm {}".format(check_path))

    if not has_ref:
        return

    surject_path = projected_bam_path(input_gam, options)
    out_vcf_path = linear_vcf_path(input_gam, options)
    out_vg_path = linear_vg_path(input_gam, options)
    fasta_path = ref_path(input_gam, options)

    do_surject = options.overwrite or not os.path.isfile(surject_path)
    do_vcf = do_surject or not os.path.isfile(out_vcf_path + ".gz")
    do_vg = do_vcf or not os.path.isfile(out_vg_path)

    if do_surject:
        robust_makedirs(os.path.dirname(surject_path))
        # NOTE(review): ".prefix" is passed positionally, so it lands in
        # temp_path's second parameter
        prefix_path = temp_path(options, ".prefix")
        # surject to reference path (name hardcoded to ref for now)
        surject_cmd = "vg surject -d {} -p {} -b {} -t {} | samtools sort -o - {}> {}".format(
            input_index_path, "ref", input_gam, options.vg_cores, prefix_path, surject_path)
        run(surject_cmd, timeout_sec=options.timeout, timeout_dep=surject_path)
        run("rm -f {}".format(prefix_path))

    if do_vcf:
        # todo: we assume that all graphs have same reference fasta, here.
        # this is false for, ex, simons which uses grchg37 instead of 38.

        # create pileup in bcf using samtools
        # http://samtools.sourceforge.net/mpileup.shtml
        assert os.path.isfile(fasta_path)
        robust_makedirs(os.path.dirname(out_vcf_path))
        call_cmd = "samtools mpileup -I -u -t DP -f {} {} | bcftools call -m -V indels - > {}".format(
            fasta_path, surject_path, out_vcf_path)
        run(call_cmd)

        # make compressed index
        run("bgzip -f {}".format(out_vcf_path))
        run("tabix -f -p vcf {}.gz".format(out_vcf_path))

    if do_vg:
        # and convert back to vg...
        robust_makedirs(os.path.dirname(out_vg_path))
        construct_cmd = "vg construct -v {}.gz -r {} -t {} > {}".format(
            out_vcf_path, fasta_path, options.vg_cores, out_vg_path)
        run(construct_cmd)
def plot_vcf_comp(tsv_path, options):
    """ take the big vcf compare table and make precision_recall plots for
    all the categories

    tsv_path: comparison table; the last "-" token of its basename (minus
              extension) is used as the region
    options: reads comp_dir and top, and is passed on to vcf_dist_header
             and make_max_f1_tsv

    Only the TOT, SNP and INDEL categories are plotted.  Every plotting
    command is printed and then run with os.system (exit codes ignored).
    """
    out_dir = os.path.join(options.comp_dir, "comp_plots")
    robust_makedirs(out_dir)
    out_name = os.path.basename(os.path.splitext(tsv_path)[0])
    out_base_path = os.path.join(out_dir, out_name)
    region = out_name.split("-")[-1].upper()
    out_base_path_f1 = os.path.join(out_dir, "-".join(out_name.split("-")[:-1]) + "--f1-" + region)

    params = " ".join(PLOT_PARAMS)

    def shell(cmd):
        # echo the command, then run it
        print(cmd)
        os.system(cmd)

    def scatter(in_tsv, out_png, title, lo, hi, width="18", height="9"):
        # precision/recall scatter plot with both axes bounded by lo and hi
        shell("scripts/scatter.py {} --save {} --title \"{}\" --x_label \"Recall\" --y_label \"Precision\" --width {} --height {} {} --lines --no_n --line_width 1.5 --marker_size 5 --min_x {} --max_x {} --min_y {} --max_y {}".format(
            in_tsv, out_png, title, width, height, params, lo, hi, lo, hi))

    def barchart(in_tsv, out_png, title, y_label, extra=""):
        # per-graph bar chart; extra is appended verbatim to the command
        shell("scripts/barchart.py {} --save {} --title \"{}\" --x_sideways --x_label \"Graph\" --y_label \"{}\" {}{}".format(
            in_tsv, out_png, title, y_label, params, extra))

    # precision recall scatter plot
    header = vcf_dist_header(options)
    # strip qual
    header = header[:-1]
    # index the stripped qual column had in the full header
    qual_idx = len(header)
    # columns come in (precision, recall) pairs; floor division keeps the
    # count an int whichever division semantics are in effect
    for i in range(len(header) // 2):
        prec_idx = 2 * i
        rec_idx = prec_idx + 1
        print("{} {} {} {}".format(prec_idx, header[prec_idx], rec_idx, header[rec_idx]))
        ptoks = header[prec_idx].split("-")
        rtoks = header[rec_idx].split("-")
        assert ptoks[1] == "Precision"
        assert rtoks[1] == "Recall"
        assert ptoks[:1] == rtoks[:1]
        comp_cat = ptoks[0]
        if comp_cat not in ["TOT", "SNP", "INDEL"]:
            continue
        label = header[prec_idx].replace("Precision", "acc")
        acc_tsv = out_base_path + "_" + label + ".tsv"
        print("Make {} tsv with cols {} {}".format(label, rec_idx, prec_idx))
        # +1 to convert to awk 1-base coordinates.  +1 again since header doesnt include row_label col
        awkcmd = '''if (NR!=1) print $1 "\t" ${} "\t" ${} "\t" ${}'''.format(rec_idx + 2, prec_idx + 2, qual_idx + 2)
        awkstr = "awk \'{" + awkcmd + "}\'"
        run("{} {} > {}".format(awkstr, tsv_path, acc_tsv))
        acc_png = out_base_path + "_" + label + ".png"

        title = "VCF"
        if comp_cat == "TOT":
            title += " Total Accuracy"
        else:
            title += " {} Accuracy".format(comp_cat)
        title += " for {}".format(region)

        scatter(acc_tsv, acc_png, title, "-0.01", "1.01")

        # flatten to max f1 tsv and plot as bars
        f1_pr_base = out_base_path_f1.replace("-f1-", "-f1--pr-")
        f1_qual_base = out_base_path_f1.replace("-f1-", "-f1-qual-")
        f1_tsv = out_base_path_f1 + "_" + label + ".tsv"
        f1_png = out_base_path_f1 + "_" + label + ".png"
        f1_pr_tsv = f1_pr_base + "_" + label + ".tsv"
        f1_pr_png = f1_pr_base + "_" + label + ".png"
        f1_qual_tsv = f1_qual_base + "_" + label + ".tsv"
        f1_qual_png = f1_qual_base + "_" + label + ".png"
        make_max_f1_tsv(acc_tsv, f1_tsv, f1_pr_tsv, f1_qual_tsv, options)
        barchart(f1_tsv, f1_png, title, "Max F1")
        scatter(f1_pr_tsv, f1_pr_png, title, "-0.01", "1.01")
        barchart(f1_qual_tsv, f1_qual_png, title, "Quality for Max F1", " --max 20")

        if options.top is True:
            # top 20
            scatter(acc_tsv, acc_png.replace(".png", "_top20.png"), title, "0.798", "1.002")
            # top 20 (inset sized)
            scatter(acc_tsv, acc_png.replace(".png", "_top20_inset.png"), title, "0.796", "1.004",
                    width="11", height="5.5")
            # top 40
            scatter(acc_tsv, acc_png.replace(".png", "_top40.png"), title, "0.596", "1.004")
def compute_snp1000g_baseline(job, input_gam, platinum, filter_indels, options):
    """ Build the baseline sample graph for one alignment by filtering a VCF.

    The region VCF (taken from options.g1kvcf_path, or from
    options.platinum_path/<sample> unless platinum is False) is passed
    through scripts/vcfFilterSample.py for the alignment's sample, then
    sorted, bgzipped and tabix-indexed, and finally loaded into a vg graph
    with "vg construct" restricted to the interval on the first line of the
    region's .bed file.

    job           -- not referenced in this function
    input_gam     -- alignment path; graph / sample / region tags are derived
                     from it through the alignment_*_tag helpers
    platinum      -- selects the source VCF location; when True, samples not
                     listed in options.platinum_samples are skipped
    filter_indels -- when True, scripts/vcfFilterIndels.py is run on the
                     source VCF before the sample filter
    options       -- parsed options (cwd, g1kvcf_path, platinum_path,
                     platinum_samples, chrom_fa_path, overwrite, vg_cores)

    Returns None in every case.
    """
    # Move to the working directory first: the scripts/ paths below are relative
    os.chdir(options.cwd)

    # There is a single baseline graph per region per sample, yet this gets
    # called once per graph type; only act for refonly (arbitrary choice)
    if alignment_graph_tag(input_gam, options) != "refonly":
        return

    sample = alignment_sample_tag(input_gam, options)
    if platinum is True and sample not in options.platinum_samples.split(","):
        return

    region_name = alignment_region_tag(input_gam, options).upper()
    if platinum is False:
        source_vcf = os.path.join(options.g1kvcf_path, region_name + ".vcf")
    else:
        source_vcf = os.path.join(options.platinum_path, sample, region_name + ".vcf")
    region_bed = os.path.join(options.g1kvcf_path, region_name + ".bed")

    out_vcf = g1k_vcf_path(input_gam, platinum, filter_indels, options)
    out_fa = g1k_fa_path(input_gam, platinum, filter_indels, options)
    out_vg = g1k_vg_path(input_gam, platinum, filter_indels, options)
    reference_fa = options.chrom_fa_path

    # Redo the filtering when asked to overwrite or when the compressed VCF is
    # missing; rebuild the graph whenever we refilter or the graph is missing
    need_filter = options.overwrite or not os.path.isfile(out_vcf + ".gz")
    need_construct = need_filter or not os.path.isfile(out_vg)
    if not (need_filter or need_construct):
        return

    # Make sure the sample actually occurs in the source VCF
    count_cmd = "grep {} {} | wc -l".format(sample, source_vcf)
    counter = subprocess.Popen(count_cmd, shell=True, stdout=subprocess.PIPE,
                               stderr=sys.stderr, bufsize=-1)
    hit_count, _ = counter.communicate()
    assert counter.wait() == 0
    if int(hit_count) == 0:
        # nothing to filter or construct for this sample
        return

    # Make the filtered, compressed, indexed VCF for this sample
    if need_filter:
        robust_makedirs(os.path.dirname(out_vcf))

        filter_input = source_vcf
        if filter_indels is True:
            # strip indels into an intermediate file that feeds the sample filter
            filter_input = out_vcf + ".in"
            indel_cmd = "scripts/vcfFilterIndels.py {} > {}".format(source_vcf, filter_input)
            run(indel_cmd, fail_hard=True)

        sample_cmd = "scripts/vcfFilterSample.py {} {} {} {} {}".format(
            filter_input, reference_fa, sample, out_vcf, out_fa)
        run(sample_cmd, fail_hard=True)

        # NOTE(review): this step alone runs without fail_hard, and the ";"
        # lets mv proceed even if vcfsort fails -- confirm that is intended
        sort_cmd = "scripts/vcfsort {} > {}.sort ; mv {}.sort {}".format(
            out_vcf, out_vcf, out_vcf, out_vcf)
        run(sort_cmd)

        run("bgzip -f {}".format(out_vcf), fail_hard=True)
        run("tabix -f -p vcf {}.gz".format(out_vcf), fail_hard=True)

    # Load the filtered VCF into a vg graph
    if need_construct:
        with open(region_bed) as bed_file:
            bed_fields = bed_file.readline().split()
        # bed -> vcf coordinates: add one to the start
        chrom = bed_fields[0]
        first_base = int(bed_fields[1]) + 1
        last_base = int(bed_fields[2])
        construct_cmd = "vg construct -v {}.gz -r {} -t {} -R {}:{}-{} > {}".format(
            out_vcf, out_fa, options.vg_cores, chrom, first_base, last_base, out_vg)
        run(construct_cmd, fail_hard=True)
def main(args):
    """ Merge per-region VCFs into one sorted VCF with a .gz copy and index.

    Classic mode (options.classic): every directory call_dir/SAMPLE is
    scanned for the known region VCFs (BRCA1, BRCA2, SMA, LRC_KIR, MHC);
    each is sorted, bgzipped and indexed, and they are concatenated with
    "vt cat" into SAMPLE/TOTAL.vcf.

    Otherwise the layout call_dir/<REGION>/<GRAPH>/<SAMPLE>_sample.vcf is
    expected.  For every graph that is present in all regions, and every
    sample that has a VCF in all regions for that graph, the region VCFs are
    merged into call_dir/<options.name>/<GRAPH>/<SAMPLE>_sample.vcf, with
    intermediates kept under .../<GRAPH>/input/<SAMPLE>/.

    args -- command line arguments, handed to parse_args

    Returns 0.
    """
    def sort_compress_index(vcf_path):
        # sort the file in place, then write a bgzipped copy plus tabix index
        run("vcfsort {} > {}.sort".format(vcf_path, vcf_path), fail_hard=True)
        run("mv {}.sort {}".format(vcf_path, vcf_path), fail_hard=True)
        run("bgzip -c {} > {}.gz".format(vcf_path, vcf_path), fail_hard=True)
        run("tabix -f -p vcf {}.gz".format(vcf_path), fail_hard=True)

    options = parse_args(args)

    RealTimeLogger.start_master()

    if options.classic:
        # expect call_dir/SAMPLE/region.vcf
        classic_vcf_names = ["BRCA1.vcf", "BRCA2.vcf", "SMA.vcf", "LRC_KIR.vcf", "MHC.vcf"]
        for sample_dir in glob.glob(os.path.join(options.call_dir, "*")):
            if not os.path.isdir(sample_dir):
                continue
            total_vcf = os.path.join(sample_dir, "TOTAL.vcf")
            compressed = []
            for region_vcf in glob.glob(os.path.join(sample_dir, "*.vcf")):
                if os.path.basename(region_vcf) not in classic_vcf_names:
                    continue
                run("vcfsort {} > {}.sort".format(region_vcf, region_vcf), fail_hard=True)
                run("bgzip -c {}.sort > {}.gz".format(region_vcf, region_vcf), fail_hard=True)
                run("rm -f {}.sort".format(region_vcf))
                run("tabix -f -p vcf {}.gz".format(region_vcf), fail_hard=True)
                compressed.append("{}.gz".format(region_vcf))
            if compressed:
                run("vt cat {} > {}".format(" ".join(compressed), total_vcf), fail_hard=True)
                sort_compress_index(total_vcf)
        return 0

    # expect call_dir/<REGION>/<GRAPH>/<SAMPLE>_sample.vcf

    # collect regions, ignoring crufty directories (including outputs of
    # previous runs of this script)
    known_regions = ["brca1", "brca2", "mhc", "lrc_kir", "sma"]
    regions = set()
    for region_dir in glob.glob(os.path.join(options.call_dir, "*")):
        if not os.path.isdir(region_dir):
            continue
        region_name = os.path.basename(region_dir)
        if region_name in known_regions:
            regions.add(region_name)
    # single-argument print(...) prints the same under the Python 2 statement
    print(regions)

    # collect graphs, keeping only those that show up in every region
    graph_hits = defaultdict(int)
    for region_name in regions:
        for graph_dir in glob.glob(os.path.join(options.call_dir, region_name, "*")):
            if os.path.isdir(graph_dir):
                graph_hits[os.path.basename(graph_dir)] += 1
    graphs = set()
    for graph_name, hits in graph_hits.items():
        if hits == len(regions):
            graphs.add(graph_name)
    print(graphs)

    # collect every sample name seen under any kept region / graph
    sample_hits = defaultdict(int)
    for region_name in regions:
        for graph_name in graphs:
            pattern = os.path.join(options.call_dir, region_name, graph_name, "*_sample.vcf")
            for sample_vcf in glob.glob(pattern):
                sample_hits[os.path.basename(sample_vcf).split("_")[0]] += 1
    samples = set()
    for sample_name in sample_hits:
        samples.add(sample_name)
    print(samples)

    # make our output directory
    out_dir = os.path.join(options.call_dir, options.name)
    robust_makedirs(out_dir)

    for graph_name in graphs:
        graph_out_dir = os.path.join(out_dir, graph_name)
        for sample_name in samples:
            sample_file = "{}_sample.vcf".format(sample_name)
            candidates = [(reg, os.path.join(options.call_dir, reg, graph_name, sample_file))
                          for reg in regions]
            present = [(reg, path) for reg, path in candidates if os.path.isfile(path)]

            # this sample doesn't span all regions, skip it
            if len(present) < len(regions):
                print("Skipping Sample {} for Graph {}".format(sample_name, graph_name))
                continue

            # output vcf
            merged_vcf = os.path.join(graph_out_dir, sample_file)

            # working directory for intermediates / debugging
            work_dir = os.path.join(graph_out_dir, "input", sample_name)
            robust_makedirs(work_dir)

            # preprocess all the vcfs and leave them in the input dir
            merge_inputs = []
            for reg, path in present:
                staged = os.path.join(work_dir, reg)
                run("vcfsort {} > {}.vcf".format(path, staged), fail_hard=True)
                # NOTE(review): these two calls run without fail_hard, unlike
                # the surrounding steps -- confirm that is intended
                run("bgzip -f {}.vcf".format(staged))
                run("tabix -f -p vcf {}.vcf.gz".format(staged))
                merge_inputs.append("{}.vcf.gz".format(staged))

            # run the merge
            run("vt cat {} > {}".format(" ".join(merge_inputs), merged_vcf), fail_hard=True)

            # sort the merged file and make an index just in case
            sort_compress_index(merged_vcf)

    return 0