def summarize_vc(items):
    """CWL target: summarize variant calls and validation for multiple samples.
    """
    items = [utils.to_single_data(x) for x in validate.summarize_grading(items)]
    out = {"validate": items[0]["validate"],
           "variants": {"calls": []}}
    added = set([])
    for data in items:
        if data.get("vrn_file"):
            names = dd.get_batches(data)
            if not names:
                names = [dd.get_sample_name(data)]
            cur_name = "%s-%s" % (names[0], dd.get_variantcaller(data))
            if cur_name not in added:
                out_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data),
                                                                        "variants", "calls")),
                                        "%s.vcf.gz" % cur_name)
                added.add(cur_name)
                # Ideally could symlink here but doesn't appear to work with
                # Docker container runs on Toil where PATHs don't get remapped
                utils.copy_plus(os.path.realpath(data["vrn_file"]), out_file)
                vcfutils.bgzip_and_index(out_file, data["config"])
                out["variants"]["calls"].append(out_file)
    return [out]
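# Hedged aside: every summarization target in this collection relies on utils.copy_plus
# to move a VCF or BAM together with its sidecar indexes. The sketch below shows the
# general idea; the extension list and overwrite behaviour are assumptions, not the
# actual bcbio implementation.
def copy_plus_sketch(orig, new):
    """Copy a file plus common sidecar indexes (illustrative sketch only)."""
    import os
    import shutil
    for ext in ["", ".idx", ".gbi", ".tbi", ".bai"]:
        # copy the base file and any index that exists alongside it
        if os.path.exists(orig + ext) and not os.path.exists(new + ext):
            shutil.copyfile(orig + ext, new + ext)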
def main(cosmic_version, bcbio_genome_dir):
    work_dir = utils.safe_makedir(os.path.join(os.getcwd(), "cosmic-prep"))
    os.chdir(work_dir)
    for genome_build, bcbio_build, add_chr in [("GRCh37", "GRCh37", False), ("GRCh38", "hg38", True)]:
        bcbio_base = os.path.join(bcbio_genome_dir, "genomes", "Hsapiens", bcbio_build)
        if not os.path.exists(bcbio_base):
            continue
        bcbio_ref = os.path.join(bcbio_base, "seq", "%s.fa" % bcbio_build)
        sorted_inputs = []
        for fname in get_cosmic_files(genome_build, cosmic_version):
            sorted_inputs.append(sort_to_ref(fname, bcbio_ref, add_chr=add_chr))
        out_dir = utils.safe_makedir(os.path.join("v%s" % cosmic_version, "bcbio_ready", bcbio_build))
        out_file = os.path.join(out_dir, "cosmic.vcf.gz")
        ready_cosmic = combine_cosmic(sorted_inputs, bcbio_ref, out_file)
        variation_dir = utils.safe_makedir(os.path.join(bcbio_base, "variation"))
        utils.copy_plus(ready_cosmic, os.path.join(variation_dir, os.path.basename(ready_cosmic)))
        print("Created COSMIC v%s resource in %s" %
              (cosmic_version, os.path.join(variation_dir, os.path.basename(ready_cosmic))))
        if bcbio_build == "GRCh37":
            bcbio_base = os.path.join(bcbio_genome_dir, "genomes", "Hsapiens", "hg19")
            if not os.path.exists(bcbio_base):
                continue
            out_dir = utils.safe_makedir(os.path.join("v%s" % cosmic_version, "bcbio_ready", "hg19"))
            out_file = os.path.join(out_dir, "cosmic.vcf.gz")
            hg19_cosmic = map_coords_to_ucsc(ready_cosmic, bcbio_ref, out_file)
            variation_dir = utils.safe_makedir(os.path.join(bcbio_base, "variation"))
            utils.copy_plus(hg19_cosmic, os.path.join(variation_dir, os.path.basename(hg19_cosmic)))
            print("Created COSMIC v%s resource in %s" %
                  (cosmic_version, os.path.join(variation_dir, os.path.basename(hg19_cosmic))))
def summarize_vc(items):
    """CWL target: summarize variant calls and validation for multiple samples.
    """
    items = [utils.to_single_data(x) for x in validate.summarize_grading(items)]
    out = {"validate": items[0]["validate"],
           "variants": {"calls": [], "gvcf": []}}
    added = set([])
    for data in items:
        if data.get("vrn_file"):
            names = dd.get_batches(data)
            if not names:
                names = [dd.get_sample_name(data)]
            batch_name = names[0]
            if data.get("vrn_file_joint") is not None:
                to_add = [("vrn_file", "gvcf", dd.get_sample_name(data)),
                          ("vrn_file_joint", "calls", batch_name)]
            else:
                to_add = [("vrn_file", "calls", batch_name)]
            for vrn_key, out_key, name in to_add:
                cur_name = "%s-%s" % (name, dd.get_variantcaller(data))
                if cur_name not in added:
                    out_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data),
                                                                            "variants", out_key)),
                                            "%s.vcf.gz" % cur_name)
                    added.add(cur_name)
                    # Ideally could symlink here but doesn't appear to work with
                    # Docker container runs on Toil where PATHs don't get remapped
                    utils.copy_plus(os.path.realpath(data[vrn_key]), out_file)
                    vcfutils.bgzip_and_index(out_file, data["config"])
                    out["variants"][out_key].append(out_file)
    return [out]
def main(cosmic_version, bcbio_genome_dir, overwrite=False, clean=False):
    work_dir = utils.safe_makedir(os.path.join(os.getcwd(), "cosmic-prep"))
    os.chdir(work_dir)
    for genome_build, bcbio_build, add_chr in [("GRCh37", "GRCh37", False), ("GRCh38", "hg38", True)]:
        bcbio_base = os.path.join(bcbio_genome_dir, "genomes", "Hsapiens", bcbio_build)
        installed_file = os.path.join(bcbio_base, "variation", f"cosmic-v{cosmic_version}.vcf.gz")
        installed_link = os.path.join(bcbio_base, "variation", "cosmic.vcf.gz")
        logging.info(f"Beginning COSMIC v{cosmic_version} prep for {genome_build}.")
        if not os.path.exists(bcbio_base):
            continue
        if os.path.exists(installed_file):
            if not overwrite:
                logging.info(f"{installed_file} exists, please use the --overwrite flag to overwrite "
                             "the existing files if you want to reinstall.")
                continue
            else:
                logging.info(f"{installed_file} exists, removing.")
                remove_installed(installed_file, installed_link)
        bcbio_ref = os.path.join(bcbio_base, "seq", f"{bcbio_build}.fa")
        cosmic_vcf_files = get_cosmic_vcf_files(genome_build, cosmic_version, clean)
        sorted_inputs = []
        for fname in cosmic_vcf_files:
            sorted_inputs.append(sort_to_ref(fname, bcbio_ref, add_chr=add_chr))
        out_dir = utils.safe_makedir(os.path.join(f"v{cosmic_version}", "bcbio_ready", bcbio_build))
        out_file = os.path.join(out_dir, "cosmic.vcf.gz")
        ready_cosmic = combine_cosmic(sorted_inputs, bcbio_ref, out_file)
        variation_dir = utils.safe_makedir(os.path.join(bcbio_base, "variation"))
        utils.copy_plus(ready_cosmic, installed_file)
        logging.info(f"Created COSMIC v{cosmic_version} resource in {installed_file}.")
        logging.info(f"Linking {installed_file} as {installed_link}.")
        make_links(installed_file, installed_link)
        update_version_file(bcbio_base, cosmic_version)
        logging.info(f"Finished COSMIC v{cosmic_version} prep for {genome_build}.")
        # prepare hg19 from the GRCh37 file
        if bcbio_build == "GRCh37":
            genome_build = "hg19"
            logging.info(f"Prepping COSMIC v{cosmic_version} for {genome_build} from the GRCh37 preparation.")
            bcbio_base = os.path.join(bcbio_genome_dir, "genomes", "Hsapiens", genome_build)
            if not os.path.exists(bcbio_base):
                continue
            if os.path.exists(installed_file):
                installed_file = os.path.join(bcbio_base, "variation", f"cosmic-v{cosmic_version}.vcf.gz")
                installed_link = os.path.join(bcbio_base, "variation", "cosmic.vcf.gz")
                if not overwrite:
                    logging.info(f"{installed_file} exists, please use the --overwrite flag to overwrite "
                                 "the existing files if you want to reinstall.")
                    continue
                else:
                    logging.info(f"{installed_file} exists, removing.")
                    remove_installed(installed_file, installed_link)
            out_dir = utils.safe_makedir(os.path.join(f"v{cosmic_version}", "bcbio_ready", genome_build))
            out_file = os.path.join(out_dir, f"cosmic-v{cosmic_version}.vcf.gz")
            logging.info(f"Translating GRCh37 chromosome names to hg19 chromosome names.")
            hg19_cosmic = map_coords_to_ucsc(ready_cosmic, bcbio_ref, out_file)
            variation_dir = utils.safe_makedir(os.path.join(bcbio_base, "variation"))
            utils.copy_plus(hg19_cosmic, installed_file)
            logging.info(f"Created COSMIC v{cosmic_version} resource in {installed_file}.")
            logging.info(f"Linking {installed_file} as {installed_link}.")
            make_links(installed_file, installed_link)
            update_version_file(bcbio_base, cosmic_version)
            logging.info(f"Finished COSMIC v{cosmic_version} prep for {genome_build}.")
def _add_genes_to_bed(in_file, gene_file, fai_file, out_file, data, max_distance=10000):
    """Re-usable subcomponent that annotates BED file genes from another BED
    """
    try:
        input_rec = iter(pybedtools.BedTool(in_file)).next()
    except StopIteration:  # empty file
        utils.copy_plus(in_file, out_file)
        return
    # keep everything after standard chrom/start/end, 1-based
    extra_fields = range(4, len(input_rec.fields) + 1)
    # keep the new gene annotation
    gene_index = len(input_rec.fields) + 4
    extra_fields.append(gene_index)
    columns = ",".join([str(x) for x in extra_fields])
    max_column = max(extra_fields) + 1
    ops = ",".join(["distinct"] * len(extra_fields))
    # swap over gene name to '.' if beyond maximum distance
    # cut removes the last distance column which can cause issues
    # with bedtools merge: 'ERROR: illegal character '.' found in integer conversion of string'
    distance_filter = (r"""awk -F$'\t' -v OFS='\t' '{if ($NF > %s) $%s = "."} {print}'""" %
                       (max_distance, gene_index))
    sort_cmd = bedutils.get_sort_cmd()
    cat_cmd = "zcat" if in_file.endswith(".gz") else "cat"
    # Ensure gene transcripts match reference genome
    ready_gene_file = os.path.join(os.path.dirname(out_file),
                                   "%s-genomeonly.bed" % (utils.splitext_plus(os.path.basename(gene_file))[0]))
    ready_gene_file = bedutils.subset_to_genome(gene_file, ready_gene_file, data)
    cmd = ("{cat_cmd} {in_file} | grep -v ^track | grep -v ^browser | grep -v ^# | "
           "{sort_cmd} -k1,1 -k2,2n | "
           "bedtools closest -g <(cut -f1,2 {fai_file} | {sort_cmd} -k1,1 -k2,2n) "
           "-d -t all -a - -b <({sort_cmd} -k1,1 -k2,2n {ready_gene_file}) | "
           "{distance_filter} | cut -f 1-{max_column} | "
           "bedtools merge -i - -c {columns} -o {ops} -delim ',' -d -10 > {out_file}")
    do.run(cmd.format(**locals()), "Annotate BED file with gene info")
def coverage_region_detailed_stats(target_name, bed_file, data, out_dir,
                                   extra_cutoffs=None):
    """
    Calculate coverage at different completeness cutoff
    for region in coverage option.
    """
    if not bed_file or not utils.file_exists(bed_file):
        return []
    else:
        ready_depth = tz.get_in(["depth", target_name], data)
        if ready_depth:
            cov_file = ready_depth["regions"]
            dist_file = ready_depth["dist"]
        else:
            mosdepth_cov = run_mosdepth(data, target_name, bed_file)
            cov_file = mosdepth_cov.regions
            dist_file = mosdepth_cov.dist
        out_cov_file = os.path.join(out_dir, os.path.basename(cov_file))
        out_dist_file = os.path.join(out_dir, os.path.basename(dist_file))
        if not utils.file_uptodate(out_cov_file, cov_file):
            utils.copy_plus(cov_file, out_cov_file)
            utils.copy_plus(dist_file, out_dist_file)
        cutoffs = {1, 5, 10, 20, 50, 100, 250, 500, 1000, 5000, 10000, 50000}
        if extra_cutoffs:
            cutoffs = sorted(list(cutoffs | extra_cutoffs))
        out_files = _calculate_percentiles(out_dist_file, cutoffs, out_dir, data)
        return [os.path.abspath(x) for x in out_files]
def _make_examples(bam_file, data, ref_file, region, out_file, work_dir):
    """Create example pileup images to feed into variant calling.
    """
    region_bed = strelka2.get_region_bed(region, [data], out_file, want_gzip=False)
    log_dir = utils.safe_makedir(os.path.join(work_dir, "log"))
    example_dir = utils.safe_makedir(os.path.join(work_dir, "examples"))
    if len(glob.glob(os.path.join(example_dir, "%s.tfrecord*.gz" % dd.get_sample_name(data)))) == 0:
        with tx_tmpdir(data) as tx_example_dir:
            cmd = ["dv_make_examples.py", "--cores", dd.get_num_cores(data), "--ref", ref_file,
                   "--reads", bam_file, "--regions", region_bed, "--logdir", log_dir,
                   "--examples", tx_example_dir, "--sample", dd.get_sample_name(data)]
            do.run(cmd, "DeepVariant make_examples %s" % dd.get_sample_name(data))
            for fname in glob.glob(os.path.join(tx_example_dir, "%s.tfrecord*.gz" % dd.get_sample_name(data))):
                utils.copy_plus(fname, os.path.join(example_dir, os.path.basename(fname)))
    return example_dir
def _calculate_percentiles(cov_file, dist_file, cutoffs, out_dir, data):
    """Calculate percentage over specified cutoff range.

    XXX Does not calculate the per-bin coverage estimations which we had
    earlier with sambamba depth. Instead has a global metric of percent
    coverage which provides a more defined look at coverage changes by depth.
    """
    if not utils.file_exists(dist_file) or not utils.file_exists(cov_file):
        return []
    sample = dd.get_sample_name(data)
    out_total_file = append_stem(dist_file, "_total_summary")
    if not utils.file_exists(out_total_file):
        with file_transaction(data, out_total_file) as tx_file:
            with open(tx_file, 'w') as out_handle:
                writer = csv.writer(out_handle, dialect="excel-tab")
                writer.writerow(["cutoff_reads", "bases_pct", "sample"])
                with open(dist_file) as in_handle:
                    for line in in_handle:
                        count, pct = line.strip().split()
                        count = int(count)
                        pct = "%.1f" % (float(pct) * 100.0)
                        if count >= min(cutoffs) and count <= max(cutoffs):
                            writer.writerow(["percentage%s" % count, pct, sample])
                    if min(cutoffs) < count:
                        writer.writerow(["percentage%s" % min(cutoffs), pct, sample])
    # To move metrics to multiqc, will remove older files
    # when bcbreport accepts these one, to avoid errors
    # while porting everything to multiqc
    # These files will be copied to final
    out_total_fixed = os.path.join(os.path.dirname(out_total_file), "%s_bcbio_coverage_avg.txt" % sample)
    copy_plus(out_total_file, out_total_fixed)
    return [out_total_fixed]
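# Hedged illustration of the input _calculate_percentiles above parses: a two-column
# cumulative coverage distribution (depth cutoff, fraction of bases at or above that
# depth). The numbers and sample name here are invented for demonstration only.
def _example_dist_summary():
    """Show how a toy dist file maps to the written summary rows (illustration only)."""
    dist_lines = ["1 0.99", "10 0.95", "100 0.40"]
    cutoffs = {1, 5, 10, 20, 50, 100}
    rows = [["cutoff_reads", "bases_pct", "sample"]]
    for line in dist_lines:
        count, pct = line.strip().split()
        count = int(count)
        pct = "%.1f" % (float(pct) * 100.0)
        # same cutoff filter as the real function
        if min(cutoffs) <= count <= max(cutoffs):
            rows.append(["percentage%s" % count, pct, "sampleA"])
    # rows -> [['cutoff_reads', 'bases_pct', 'sample'],
    #          ['percentage1', '99.0', 'sampleA'],
    #          ['percentage10', '95.0', 'sampleA'],
    #          ['percentage100', '40.0', 'sampleA']]
    return rows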
def summarize_sv(items):
    """CWL target: summarize structural variants for multiple samples.

    XXX Need to support non-VCF output as tabix indexed output
    """
    items = [utils.to_single_data(x) for x in items]
    out = {"sv": {"calls": []}}
    added = set([])
    for data in items:
        if data.get("sv"):
            names = dd.get_batches(data)
            if not names:
                names = [dd.get_sample_name(data)]
            batch_name = names[0]
            cur_name = "%s-%s" % (batch_name, data["sv"]["variantcaller"])
            ext = utils.splitext_plus(data["sv"]["vrn_file"])[-1]
            if cur_name not in added and ext.startswith(".vcf"):
                added.add(cur_name)
                out_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data), "sv", "calls")),
                                        "%s%s" % (cur_name, ext))
                utils.copy_plus(data["sv"]["vrn_file"], out_file)
                out_file = vcfutils.bgzip_and_index(out_file, data["config"])
                out["sv"]["calls"].append(out_file)
    return [out]
def organize_noalign(data):
    """CWL target to skip alignment and organize input data.
    """
    data = utils.to_single_data(data[0])
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data)))
    work_bam = os.path.join(work_dir, "%s-input.bam" % dd.get_sample_name(data))
    utils.copy_plus(data["files"][0], work_bam)
    bam.index(work_bam, data["config"])
    data["align_bam"] = work_bam
    return data
def _run_germline(align_bams, items, ref_file, assoc_files, region, out_file, work_dir):
    if not utils.file_exists(out_file):
        with file_transaction(items[0], work_dir) as tx_work_dir:
            workflow_file = _configure_germline(align_bams, items, ref_file, region, out_file, tx_work_dir)
            _run_workflow(items[0], workflow_file, tx_work_dir)
        raw_file = os.path.join(work_dir, "results", "variants",
                                "genome.vcf.gz" if joint.want_gvcf(items) else "variants.vcf.gz")
        utils.copy_plus(raw_file, out_file)
        # Remove files with relative symlinks
        utils.remove_plus(os.path.join(work_dir, "results", "variants", "genome.vcf.gz"))
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
def _symlink_or_copy_grabix(in_file, out_file, data):
    """We cannot symlink in CWL, but may be able to use inputs or copy
    """
    if cwlutils.is_cwl_run(data):
        # Has grabix indexes, we're okay to go
        if utils.file_exists(in_file + ".gbi"):
            out_file = in_file
        else:
            utils.copy_plus(in_file, out_file)
    else:
        utils.symlink_plus(in_file, out_file)
    return out_file
def summarize_sv(items):
    """CWL target: summarize structural variants for multiple samples.

    XXX Need to support non-VCF output as tabix indexed output
    """
    items = [utils.to_single_data(x) for x in vcvalidate.summarize_grading(items, "svvalidate")]
    out = {"sv": {"calls": [],
                  "prioritize": {"tsv": [],
                                 "raw": []}},
           "svvalidate": vcvalidate.combine_validations(items, "svvalidate")}
    added = set([])
    # Standard callers
    for data in items:
        if data.get("sv"):
            names = dd.get_batches(data)
            if not names:
                names = [dd.get_sample_name(data)]
            batch_name = names[0]
            cur_name = "%s-%s" % (batch_name, data["sv"]["variantcaller"])
            if data["sv"].get("vrn_file"):
                ext = utils.splitext_plus(data["sv"]["vrn_file"])[-1]
                if cur_name not in added and ext.startswith(".vcf"):
                    added.add(cur_name)
                    out_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data), "sv", "calls")),
                                            "%s%s" % (cur_name, ext))
                    utils.copy_plus(data["sv"]["vrn_file"], out_file)
                    out_file = vcfutils.bgzip_and_index(out_file, data["config"])
                    out["sv"]["calls"].append(out_file)
    # prioritization
    for pdata in _group_by_sample(items):
        prioritysv = [x for x in prioritize.run([utils.deepish_copy(pdata)])[0].get("sv", [])
                      if x["variantcaller"] == "sv-prioritize"]
        if prioritysv:
            out["sv"]["prioritize"]["tsv"].append(prioritysv[0]["vrn_file"])
            out["sv"]["prioritize"]["raw"].extend(prioritysv[0]["raw_files"].values())
    return [out]
def _make_examples(bam_file, data, ref_file, region_bed, out_file, work_dir):
    """Create example pileup images to feed into variant calling.
    """
    log_dir = utils.safe_makedir(os.path.join(work_dir, "log"))
    example_dir = utils.safe_makedir(os.path.join(work_dir, "examples"))
    if len(glob.glob(os.path.join(example_dir, "%s.tfrecord*.gz" % dd.get_sample_name(data)))) == 0:
        with tx_tmpdir(data) as tx_example_dir:
            cmd = ["dv_make_examples.py", "--cores", dd.get_num_cores(data), "--ref", ref_file,
                   "--reads", bam_file, "--regions", region_bed, "--logdir", log_dir,
                   "--examples", tx_example_dir, "--sample", dd.get_sample_name(data)]
            do.run(cmd, "DeepVariant make_examples %s" % dd.get_sample_name(data))
            for fname in glob.glob(os.path.join(tx_example_dir, "%s.tfrecord*.gz" % dd.get_sample_name(data))):
                utils.copy_plus(fname, os.path.join(example_dir, os.path.basename(fname)))
    return example_dir
def _link_bam_file(in_file, new_dir, data):
    """Provide symlinks of BAM file and existing indexes if needed.
    """
    new_dir = utils.safe_makedir(new_dir)
    out_file = os.path.join(new_dir, os.path.basename(in_file))
    if data.get("cwl_keys"):
        # Has indexes, we're okay to go with the original file
        if utils.file_exists(in_file + ".bai"):
            out_file = in_file
        else:
            utils.copy_plus(in_file, out_file)
    else:
        utils.symlink_plus(in_file, out_file)
    return out_file
def _handle_precalled(data):
    """Copy in external pre-called variants fed into analysis.
    """
    if data.get("vrn_file"):
        vrn_file = data["vrn_file"]
        if isinstance(vrn_file, (list, tuple)):
            assert len(vrn_file) == 1
            vrn_file = vrn_file[0]
        precalled_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "precalled"))
        ext = utils.splitext_plus(vrn_file)[-1]
        orig_file = os.path.abspath(vrn_file)
        our_vrn_file = os.path.join(precalled_dir, "%s-precalled%s" % (dd.get_sample_name(data), ext))
        utils.copy_plus(orig_file, our_vrn_file)
        data["vrn_file"] = our_vrn_file
    return data
def _calculate_percentiles(in_file, sample, data=None, cutoffs=None):
    """
    Parse pct bases per region to summarize it in
    7 different pct of regions points with pct bases covered
    higher than a completeness cutoff (5, 10, 20, 50 ...)
    """
    has_data = False
    with open(in_file) as in_handle:
        for i, line in enumerate(in_handle):
            if i > 0:
                has_data = True
                break
    if not has_data:
        return []
    out_file = append_stem(in_file, "_summary")
    out_total_file = append_stem(in_file, "_total_summary")
    if not utils.file_exists(out_file) or not utils.file_exists(out_total_file):
        dt = pd.read_csv(in_file, sep="\t", index_col=False)
        pct = dict()
        pct_bases = dict()
        size = np.array(dt["chromEnd"]) - np.array(dt["chromStart"])
        for cutoff in [h for h in list(dt) if h.startswith("percentage")]:
            if cutoffs and int(cutoff.split("percentage")[1]) in cutoffs:
                a = np.array(dt[cutoff])
                for p_point in [0.01, 10, 25, 50, 75, 90, 99.9]:
                    q = np.percentile(a, p_point)
                    pct[(cutoff, p_point)] = q
                pct_bases[cutoff] = sum(size * a) / float(sum(size))
        with file_transaction(data, out_total_file) as tx_file:
            with open(tx_file, 'w') as out_handle:
                print >>out_handle, "cutoff_reads\tbases_pct\tsample"
                for k in pct_bases:
                    print >>out_handle, "\t".join(map(str, [k, pct_bases[k], sample]))
        with file_transaction(data, out_file) as tx_file:
            with open(tx_file, 'w') as out_handle:
                print >>out_handle, "cutoff_reads\tregion_pct\tbases_pct\tsample"
                for k in pct:
                    print >>out_handle, "\t".join(map(str, [k[0], k[1], pct[k], sample]))
    # To move metrics to multiqc, will remove older files
    # when bcbreport accepts these one, to avoid errors
    # while porting everything to multiqc
    # These files will be copied to final
    out_file_fixed = os.path.join(os.path.dirname(out_file), "%s_bcbio_coverage.txt" % sample)
    out_total_fixed = os.path.join(os.path.dirname(out_file), "%s_bcbio_coverage_avg.txt" % sample)
    copy_plus(out_file, out_file_fixed)
    copy_plus(out_total_file, out_total_fixed)
    return [out_file_fixed, out_total_fixed]
def _link_bam_file(in_file, new_dir, data):
    """Provide symlinks of BAM file and existing indexes if needed.
    """
    new_dir = utils.safe_makedir(new_dir)
    out_file = os.path.join(new_dir, os.path.basename(in_file))
    if not utils.file_exists(out_file):
        out_file = os.path.join(new_dir, "%s-prealign.bam" % dd.get_sample_name(data))
    if data.get("cwl_keys"):
        # Has indexes, we're okay to go with the original file
        if utils.file_exists(in_file + ".bai"):
            out_file = in_file
        else:
            utils.copy_plus(in_file, out_file)
    else:
        utils.symlink_plus(in_file, out_file)
    return out_file
def _bgzip_from_fastq(data):
    """Prepare a bgzipped file from a fastq input, potentially gzipped (or bgzipped already).
    """
    in_file = data["in_file"]
    needs_convert = dd.get_quality_format(data).lower() == "illumina"
    # special case, empty files that have been cleaned
    if not objectstore.is_remote(in_file) and os.path.getsize(in_file) == 0:
        needs_bgzip, needs_gunzip = False, False
    elif in_file.endswith(".gz") and not objectstore.is_remote(in_file):
        if needs_convert or dd.get_trim_ends(data):
            needs_bgzip, needs_gunzip = True, True
        else:
            needs_bgzip, needs_gunzip = _check_gzipped_input(in_file, data)
    elif in_file.endswith(".bz2"):
        needs_bgzip, needs_gunzip = True, True
    elif objectstore.is_remote(in_file) and not tz.get_in(["config", "algorithm", "align_split_size"], data):
        needs_bgzip, needs_gunzip = False, False
    else:
        needs_bgzip, needs_gunzip = True, False
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "align_prep"))
    if needs_bgzip or needs_gunzip or needs_convert or dd.get_trim_ends(data) or objectstore.is_remote(in_file):
        out_file = _bgzip_file(in_file, data["config"], work_dir,
                               needs_bgzip, needs_gunzip, needs_convert, data)
    else:
        out_file = os.path.join(work_dir, "%s_%s" % (dd.get_sample_name(data), os.path.basename(in_file)))
        # We cannot symlink in CWL, but may be able to use inputs or copy
        if data.get("is_cwl"):
            # Has grabix indexes, we're okay to go
            if utils.file_exists(in_file + ".gbi"):
                return in_file
            else:
                return utils.copy_plus(in_file, out_file)
        else:
            utils.symlink_plus(in_file, out_file)
    return out_file
def organize_noalign(data):
    """CWL target to skip alignment and organize input data.
    """
    data = utils.to_single_data(data[0])
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data)))
    work_bam = os.path.join(work_dir, "%s-input.bam" % dd.get_sample_name(data))
    if data.get("files"):
        if data["files"][0].endswith(".cram"):
            work_bam = cram.to_bam(data["files"][0], work_bam, data)
        else:
            assert data["files"][0].endswith(".bam"), data["files"][0]
            utils.copy_plus(data["files"][0], work_bam)
        bam.index(work_bam, data["config"])
    else:
        work_bam = None
    data["align_bam"] = work_bam
    return data
def summarize_vc(items):
    """CWL target: summarize variant calls and validation for multiple samples.
    """
    items = [utils.to_single_data(x) for x in utils.flatten(items)]
    items = [_normalize_vc_input(x) for x in items]
    items = validate.summarize_grading(items)
    items = [utils.to_single_data(x) for x in items]
    out = {"validate": validate.combine_validations(items),
           "variants": {"calls": [], "gvcf": [], "samples": []}}
    added = set([])
    variants_by_sample = collections.defaultdict(list)
    sample_order = []
    for data in items:
        batch_samples = data.get("batch_samples", [dd.get_sample_name(data)])
        for s in batch_samples:
            if s not in sample_order:
                sample_order.append(s)
        if data.get("vrn_file"):
            # Only get batches if we're actually doing variantcalling in bcbio
            # otherwise we'll be using the original files
            names = dd.get_batches(data) if dd.get_variantcaller(data) else None
            if not names:
                names = [dd.get_sample_name(data)]
            batch_name = names[0]
            if data.get("vrn_file_joint") is not None:
                to_add = [("vrn_file", "gvcf", dd.get_sample_name(data)),
                          ("vrn_file_joint", "calls", batch_name)]
            else:
                to_add = [("vrn_file", "calls", batch_name)]
            for vrn_key, out_key, name in to_add:
                cur_name = "%s-%s" % (name, dd.get_variantcaller(data))
                out_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data),
                                                                        "variants", out_key)),
                                        "%s.vcf.gz" % cur_name)
                for s in batch_samples:
                    variants_by_sample[s].append(out_file)
                if cur_name not in added:
                    added.add(cur_name)
                    # Ideally could symlink here but doesn't appear to work with
                    # Docker container runs on Toil where PATHs don't get remapped
                    utils.copy_plus(os.path.realpath(data[vrn_key]), out_file)
                    vcfutils.bgzip_and_index(out_file, data["config"])
                    out["variants"][out_key].append(out_file)
    for sample in sample_order:
        out["variants"]["samples"].append(variants_by_sample[sample])
    return [out]
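# For orientation, a hedged sketch of the record shape this CWL target returns for a
# single-sample batch without joint calling; the work-directory prefix is omitted and
# the file and sample names are hypothetical.
EXAMPLE_SUMMARIZE_VC_OUT = [{
    "validate": None,  # combined validation summary when grading was run
    "variants": {
        "calls": ["variants/calls/batch1-gatk-haplotype.vcf.gz"],
        "gvcf": [],
        # one row per sample, each listing the VCFs associated with that sample;
        # samples sharing a batch point at the same batch-level VCF
        "samples": [["variants/calls/batch1-gatk-haplotype.vcf.gz"]],
    },
}]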
def _handle_precalled(data):
    """Copy in external pre-called variants fed into analysis.

    Symlinks for non-CWL runs where we want to ensure VCF present
    in a local directory.
    """
    if data.get("vrn_file") and not cwlutils.is_cwl_run(data):
        vrn_file = data["vrn_file"]
        if isinstance(vrn_file, (list, tuple)):
            assert len(vrn_file) == 1
            vrn_file = vrn_file[0]
        precalled_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "precalled"))
        ext = utils.splitext_plus(vrn_file)[-1]
        orig_file = os.path.abspath(vrn_file)
        our_vrn_file = os.path.join(precalled_dir, "%s-precalled%s" % (dd.get_sample_name(data), ext))
        utils.copy_plus(orig_file, our_vrn_file)
        data["vrn_file"] = our_vrn_file
    return data
def _add_genes_to_bed(in_file, gene_file, fai_file, out_file, data, max_distance=10000):
    """Re-usable subcomponent that annotates BED file genes from another BED
    """
    try:
        input_rec = next(iter(pybedtools.BedTool(in_file)))
    except StopIteration:  # empty file
        utils.copy_plus(in_file, out_file)
        return
    # keep everything after standard chrom/start/end, 1-based
    extra_fields = list(range(4, len(input_rec.fields) + 1))
    # keep the new gene annotation
    gene_index = len(input_rec.fields) + 4
    extra_fields.append(gene_index)
    columns = ",".join([str(x) for x in extra_fields])
    max_column = max(extra_fields) + 1
    ops = ",".join(["distinct"] * len(extra_fields))
    # swap over gene name to '.' if beyond maximum distance
    # cut removes the last distance column which can cause issues
    # with bedtools merge: 'ERROR: illegal character '.' found in integer conversion of string'
    distance_filter = (r"""awk -F$'\t' -v OFS='\t' '{if ($NF > %s || $NF < -%s) $%s = "."} {print}'""" %
                       (max_distance, max_distance, gene_index))
    sort_cmd = bedutils.get_sort_cmd(os.path.dirname(out_file))
    cat_cmd = "zcat" if in_file.endswith(".gz") else "cat"
    # Ensure gene transcripts match reference genome
    ready_gene_file = os.path.join(os.path.dirname(out_file),
                                   "%s-genomeonly.bed" % (utils.splitext_plus(os.path.basename(gene_file))[0]))
    ready_gene_file = bedutils.subset_to_genome(gene_file, ready_gene_file, data)
    exports = "export TMPDIR=%s && %s" % (os.path.dirname(out_file), utils.local_path_export())
    bcbio_py = sys.executable
    gsort = config_utils.get_program("gsort", data)
    cmd = ("{exports}{cat_cmd} {in_file} | grep -v ^track | grep -v ^browser | grep -v ^# | "
           "{bcbio_py} -c 'from bcbio.variation import bedutils; bedutils.remove_bad()' | "
           "{gsort} - {fai_file} | "
           "bedtools closest -g {fai_file} "
           "-D ref -t first -a - -b <({gsort} {ready_gene_file} {fai_file}) | "
           "{distance_filter} | cut -f 1-{max_column} | "
           "bedtools merge -i - -c {columns} -o {ops} -delim ',' -d -10 > {out_file}")
    do.run(cmd.format(**locals()), "Annotate BED file with gene info")
def _run_germline(align_bams, items, ref_file, assoc_files, region, out_file, work_dir):
    if not utils.file_exists(out_file):
        with file_transaction(items[0], work_dir) as tx_work_dir:
            workflow_file = _configure_germline(align_bams, items, ref_file, region, out_file, tx_work_dir)
            if workflow_file:
                has_variants = True
                _run_workflow(items[0], workflow_file, tx_work_dir)
            else:
                has_variants = False
                vcfutils.write_empty_vcf(out_file, items[0]["config"],
                                         [dd.get_sample_name(d) for d in items])
        if has_variants:
            raw_file = os.path.join(work_dir, "results", "variants",
                                    "genome.vcf.gz" if joint.want_gvcf(items) else "variants.vcf.gz")
            utils.copy_plus(raw_file, out_file)
            # Remove files with relative symlinks
            utils.remove_plus(os.path.join(work_dir, "results", "variants", "genome.vcf.gz"))
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
def coverage_region_detailed_stats(bed_file, data, out_dir, extra_cutoffs=None):
    """
    Calculate coverage at different completeness cutoff
    for region in coverage option.
    """
    if not bed_file or not utils.file_exists(bed_file):
        return []
    else:
        cov_file, dist_file = _run_mosdepth(bed_file, data)
        out_cov_file = os.path.join(out_dir, os.path.basename(cov_file))
        out_dist_file = os.path.join(out_dir, os.path.basename(dist_file))
        if not utils.file_uptodate(out_cov_file, cov_file):
            utils.copy_plus(cov_file, out_cov_file)
            utils.copy_plus(dist_file, out_dist_file)
        cutoffs = {1, 5, 10, 20, 50, 100, 250, 500, 1000, 5000, 10000, 50000}
        if extra_cutoffs:
            cutoffs = sorted(list(cutoffs | extra_cutoffs))
        out_files = _calculate_percentiles(out_cov_file, out_dist_file, cutoffs, out_dir, data)
        return [os.path.abspath(x) for x in out_files]
def summarize_sv(items):
    """CWL target: summarize structural variants for multiple samples.

    XXX Need to support non-VCF output as tabix indexed output
    """
    items = [utils.to_single_data(x) for x in vcvalidate.summarize_grading(items, "svvalidate")]
    out = {"sv": {"calls": [],
                  "supplemental": [],
                  "prioritize": {"tsv": [],
                                 "raw": []}},
           "svvalidate": vcvalidate.combine_validations(items, "svvalidate")}
    added = set([])
    # Standard callers
    for data in items:
        if data.get("sv"):
            if data["sv"].get("vrn_file"):
                ext = utils.splitext_plus(data["sv"]["vrn_file"])[-1]
                cur_name = _useful_basename(data)
                if cur_name not in added and ext.startswith(".vcf"):
                    added.add(cur_name)
                    out_file = os.path.join(utils.safe_makedir(os.path.join(dd.get_work_dir(data), "sv", "calls")),
                                            "%s%s" % (cur_name, ext))
                    utils.copy_plus(data["sv"]["vrn_file"], out_file)
                    out_file = vcfutils.bgzip_and_index(out_file, data["config"])
                    out["sv"]["calls"].append(out_file)
            if data["sv"].get("supplemental"):
                out["sv"]["supplemental"].extend([x for x in data["sv"]["supplemental"] if x])
    # prioritization
    for pdata in _group_by_sample(items):
        prioritysv = [x for x in prioritize.run([utils.deepish_copy(pdata)])[0].get("sv", [])
                      if x["variantcaller"] == "sv-prioritize"]
        if prioritysv:
            out["sv"]["prioritize"]["tsv"].append(prioritysv[0]["vrn_file"])
            out["sv"]["prioritize"]["raw"].extend(prioritysv[0]["raw_files"].values())
    return [out]
def coverage_region_detailed_stats(target_name, bed_file, data, out_dir):
    """
    Calculate coverage at different completeness cutoff
    for region in coverage option.
    """
    if bed_file and utils.file_exists(bed_file):
        ready_depth = tz.get_in(["depth", target_name], data)
        if ready_depth:
            cov_file = ready_depth["regions"]
            dist_file = ready_depth["dist"]
            thresholds_file = ready_depth.get("thresholds")
            out_cov_file = os.path.join(out_dir, os.path.basename(cov_file))
            out_dist_file = os.path.join(out_dir, os.path.basename(dist_file))
            out_thresholds_file = os.path.join(out_dir, os.path.basename(thresholds_file)) \
                if thresholds_file and os.path.isfile(thresholds_file) else None
            if not utils.file_uptodate(out_cov_file, cov_file):
                utils.copy_plus(cov_file, out_cov_file)
                utils.copy_plus(dist_file, out_dist_file)
                utils.copy_plus(thresholds_file, out_thresholds_file) if out_thresholds_file else None
            return [out_cov_file, out_dist_file] + ([out_thresholds_file] if out_thresholds_file else [])
    return []
def _report_summary(samples, out_dir):
    """
    Run coverage report with bcbiocov package
    """
    try:
        import bcbreport.prepare as bcbreport
    except ImportError:
        logger.info("skipping report. No bcbreport installed.")
        return samples
    # samples = utils.unpack_worlds(samples)
    work_dir = dd.get_work_dir(samples[0])
    parent_dir = utils.safe_makedir(out_dir)
    with utils.chdir(parent_dir):
        logger.info("copy qsignature")
        qsignature_fn = os.path.join(work_dir, "qc", "qsignature", "qsignature.ma")
        if qsignature_fn:  # this need to be inside summary/qc dict
            if utils.file_exists(qsignature_fn) and not utils.file_exists("qsignature.ma"):
                shutil.copy(qsignature_fn, "bcbio_qsignature.ma")
        out_dir = utils.safe_makedir("fastqc")
        logger.info("summarize fastqc")
        with utils.chdir(out_dir):
            _merge_fastqc(samples)
        logger.info("summarize metrics")
        samples = _merge_metrics(samples)
        logger.info("summarize target information")
        samples = _merge_target_information(samples)
        out_dir = utils.safe_makedir("coverage")
        logger.info("summarize coverage")
        for data in samples:
            pfiles = tz.get_in(["summary", "qc", "coverage"], data, [])
            if isinstance(pfiles, dict):
                pfiles = [pfiles["base"]] + pfiles["secondary"]
            elif pfiles:
                pfiles = [pfiles]
            for fn in pfiles:
                if os.path.basename(fn).find("coverage_fixed") > -1:
                    utils.copy_plus(fn, os.path.join(out_dir, os.path.basename(fn)))
        out_dir = utils.safe_makedir("variants")
        logger.info("summarize variants")
        for data in samples:
            pfiles = tz.get_in(["summary", "qc", "variants"], data, [])
            if isinstance(pfiles, dict):
                pfiles = [pfiles["base"]] + pfiles["secondary"]
            elif pfiles:
                pfiles = [pfiles]
            for fn in pfiles:
                if os.path.basename(fn).find("gc-depth-parse.tsv") > -1:
                    utils.copy_plus(fn, os.path.join(out_dir, os.path.basename(fn)))
        bcbreport.report(parent_dir)
        out_report = os.path.join(parent_dir, "qc-coverage-report.html")
        if not utils.file_exists(out_report):
            rmd_file = os.path.join(parent_dir, "report-ready.Rmd")
            run_file = "%s-run.R" % (os.path.splitext(out_report)[0])
            with open(run_file, "w") as out_handle:
                out_handle.write("""library(rmarkdown)\nrender("%s")\n""" % rmd_file)
            cmd = "%s %s" % (utils.Rscript_cmd(), run_file)
            # Skip automated generation of coverage report to avoid error
            # messages. We need to generalize coverage reporting and re-include.
            # try:
            #     do.run(cmd, "Prepare coverage summary", log_error=False)
            # except subprocess.CalledProcessError as msg:
            #     logger.info("Skipping generation of coverage report: %s" % (str(msg)))
            if utils.file_exists("report-ready.html"):
                shutil.move("report-ready.html", out_report)
    return samples
def _report_summary(samples, out_dir):
    """
    Run coverage report with bcbiocov package
    """
    try:
        import bcbreport.prepare as bcbreport
    except ImportError:
        logger.info("skipping report. No bcbreport installed.")
        return samples
    # samples = utils.unpack_worlds(samples)
    work_dir = dd.get_work_dir(samples[0])
    parent_dir = utils.safe_makedir(out_dir)
    with utils.chdir(parent_dir):
        logger.info("copy qsignature")
        qsignature_fn = os.path.join(work_dir, "qc", "qsignature", "qsignature.ma")
        if qsignature_fn:  # this need to be inside summary/qc dict
            if utils.file_exists(qsignature_fn) and not utils.file_exists("qsignature.ma"):
                shutil.copy(qsignature_fn, "bcbio_qsignature.ma")
        out_dir = utils.safe_makedir("fastqc")
        logger.info("summarize fastqc")
        with utils.chdir(out_dir):
            _merge_fastqc(samples)
        logger.info("summarize metrics")
        samples = _merge_metrics(samples)
        out_dir = utils.safe_makedir("coverage")
        logger.info("summarize coverage")
        for data in samples:
            pfiles = tz.get_in(["summary", "qc", "coverage"], data, [])
            if isinstance(pfiles, dict):
                pfiles = [pfiles["base"]] + pfiles["secondary"]
            elif pfiles:
                pfiles = [pfiles]
            for fn in pfiles:
                if os.path.basename(fn).find("coverage_fixed") > -1:
                    utils.copy_plus(fn, os.path.join(out_dir, os.path.basename(fn)))
        out_dir = utils.safe_makedir("variants")
        logger.info("summarize variants")
        for data in samples:
            pfiles = tz.get_in(["summary", "qc", "variants"], data, [])
            if isinstance(pfiles, dict):
                pfiles = [pfiles["base"]] + pfiles["secondary"]
            elif pfiles:
                pfiles = [pfiles]
            for fn in pfiles:
                if os.path.basename(fn).find("gc-depth-parse.tsv") > -1:
                    utils.copy_plus(fn, os.path.join(out_dir, os.path.basename(fn)))
        bcbreport.report(parent_dir)
        out_report = os.path.join(parent_dir, "qc-coverage-report.html")
        if not utils.file_exists(out_report):
            rmd_file = os.path.join(parent_dir, "report-ready.Rmd")
            run_file = "%s-run.R" % (os.path.splitext(out_report)[0])
            with open(run_file, "w") as out_handle:
                out_handle.write("""library(rmarkdown)\nrender("%s")\n""" % rmd_file)
            cmd = "%s %s" % (utils.Rscript_cmd(), run_file)
            # Skip automated generation of coverage report to avoid error
            # messages. We need to generalize coverage reporting and re-include.
            # try:
            #     do.run(cmd, "Prepare coverage summary", log_error=False)
            # except subprocess.CalledProcessError, msg:
            #     logger.info("Skipping generation of coverage report: %s" % (str(msg)))
            if utils.file_exists("report-ready.html"):
                shutil.move("report-ready.html", out_report)
    return samples