def _piped_realign_gatk(data, region, cl, out_base_file, tmp_dir, prep_params):
    """Prepare an on-disk BAM and the GATK indel realignment command for a region.

    GATK requires its input to be written to disk and indexed before
    realignment, so the supplied command line ``cl`` is run with ``-o``
    directed at a ``-prealign`` BAM named after ``out_base_file`` (skipped when
    that file already exists). The BAM is then indexed, realignment targets
    are generated, and the realignment command line is built.

    ``prep_params`` is accepted but not used here.

    Returns a tuple ``(pa_bam, command)`` where ``command`` is the realignment
    command joined into a single space-separated string.
    """
    config = data["config"]
    runner = broad.runner_from_config(config)
    stem, ext = os.path.splitext(out_base_file)
    pa_bam = "%s-prealign%s" % (stem, ext)

    if not utils.file_exists(pa_bam):
        with file_transaction(data, pa_bam) as tx_out_file:
            write_cmd = "{cl} -o {tx_out_file}".format(cl=cl, tx_out_file=tx_out_file)
            do.run(write_cmd, "GATK pre-alignment {0}".format(region), data)
    bam.index(pa_bam, config)

    def _region_opts():
        # Evaluated once per GATK call; both calls take the same keyword options.
        return {"region": region_to_gatk(region),
                "known_vrns": dd.get_variation_resources(data)}

    targets = realign.gatk_realigner_targets(
        runner, pa_bam, data["sam_ref"], config, **_region_opts())
    realign_parts = realign.gatk_indel_realignment_cl(
        runner, pa_bam, data["sam_ref"], targets, tmp_dir, **_region_opts())
    return pa_bam, " ".join(realign_parts)
def _piped_realign_gatk(data, region, cl, out_base_file, tmp_dir, prep_params):
    """Write the pre-realignment BAM and return the GATK realignment command.

    The input command line ``cl`` is executed with ``-o`` pointing at a
    ``-prealign`` BAM derived from ``out_base_file`` unless that BAM already
    exists; GATK requires the file on disk and indexed before realignment.
    Realignment targets are then computed and used to build the indel
    realignment command.

    ``prep_params`` is part of the signature but is not read here.

    Returns ``(pa_bam, recal_cl)`` with ``recal_cl`` exactly as produced by
    ``realign.gatk_indel_realignment_cl``.
    """
    broad_runner = broad.runner_from_config(data["config"])
    pa_bam = "%s-prealign%s" % os.path.splitext(out_base_file)

    already_written = utils.file_exists(pa_bam)
    if not already_written:
        with file_transaction(data, pa_bam) as tx_out_file:
            do.run(
                "{cl} -o {tx_out_file}".format(cl=cl, tx_out_file=tx_out_file),
                "GATK re-alignment {0}".format(region),
                data,
            )
    bam.index(pa_bam, data["config"])

    recal_file = realign.gatk_realigner_targets(
        broad_runner,
        pa_bam,
        data["sam_ref"],
        data["config"],
        region=region_to_gatk(region),
        known_vrns=dd.get_variation_resources(data),
    )
    recal_cl = realign.gatk_indel_realignment_cl(
        broad_runner,
        pa_bam,
        data["sam_ref"],
        recal_file,
        tmp_dir,
        region=region_to_gatk(region),
        known_vrns=dd.get_variation_resources(data),
    )
    return pa_bam, recal_cl
def process_intervals(data):
    """Prepare a PureCN interval file from the sample's regions BED.

    The BED file comes from ``regions.get_sv_bed``; when that is empty the
    cleaned variant regions are used instead. PureCN's ``IntervalFile.R`` is
    then run (in the ``r36`` environment) to write ``<bed basename>.txt`` and
    an ``.optimized.bed`` export next to the BED file.

    Args:
        data: sample data dictionary.

    Returns:
        Path of the interval file, or None when no BED file is available.
        An existing interval file is reused without re-running PureCN. The
        path is returned even if the PureCN run fails, so the file may not
        exist in that case.

    Raises:
        KeyError: if the variation resources lack ``purecn_mappability``.
    """
    bed_file = regions.get_sv_bed(data)
    if not bed_file:
        bed_file = bedutils.clean_file(dd.get_variant_regions(data), data)
    if not bed_file:
        return None

    basename = os.path.splitext(bed_file)[0]
    ready_file = basename + ".txt"
    if os.path.exists(ready_file):
        return ready_file

    optimized_bed = basename + ".optimized.bed"
    rscript = utils.Rscript_cmd("r36")
    interval_file_r = utils.R_package_script("r36", "PureCN", "extdata/IntervalFile.R")
    ref_file = dd.get_ref_file(data)
    mappability_resource = dd.get_variation_resources(data)["purecn_mappability"]
    genome = dd.get_genome_build(data)
    cmd = [rscript, interval_file_r,
           "--infile", bed_file,
           "--fasta", ref_file,
           "--outfile", ready_file,
           "--offtarget",
           "--genome", genome,
           "--export", optimized_bed,
           "--mappability", mappability_resource]
    try:
        cmd_line = "export R_LIBS_USER=%s && %s && %s" % (
            utils.R_sitelib(env="r36"),
            utils.get_R_exports(env="r36"),
            " ".join([str(x) for x in cmd]))
        do.run(cmd_line, "PureCN intervals")
    except subprocess.CalledProcessError as exc:
        # Best effort: a failed run is reported but does not abort the pipeline.
        logger.info("PureCN failed to prepare intervals: %s", exc)
    else:
        # Only claim the file was saved when the run actually succeeded.
        logger.debug("Saved PureCN interval file into " + ready_file)
    return ready_file
def _run_cobalt(paired, work_dir):
    """Run COBALT to count read depth across genomic windows.

    PURPLE requires even 1000bp windows, so the integrated counting solution
    is used directly rather than converting from CNVkit calculations. If this
    approach is useful it should be moved upstream to be available to other
    tools as an input comparison.

    https://github.com/hartwigmedical/hmftools/tree/master/count-bam-lines

    COBALT writes into the transactional directory; every file found there
    other than the transactional output itself is moved into the ``cobalt``
    directory under ``work_dir``.

    Returns the path to the ``<tumor sample>.cobalt`` output file.
    """
    tumor = paired.tumor_data
    cobalt_dir = utils.safe_makedir(os.path.join(work_dir, "cobalt"))
    out_file = os.path.join(cobalt_dir, "%s.cobalt" % dd.get_sample_name(tumor))
    if utils.file_exists(out_file):
        return out_file

    with file_transaction(tumor, out_file) as tx_out_file:
        tx_dir, tx_name = os.path.split(tx_out_file)
        args = [
            "COBALT",
            "-reference", paired.normal_name,
            "-reference_bam", paired.normal_bam,
            "-tumor", paired.tumor_name,
            "-tumor_bam", paired.tumor_bam,
            "-threads", dd.get_num_cores(tumor),
            "-output_dir", tx_dir,
            "-gc_profile", dd.get_variation_resources(tumor)["gc_profile"],
        ]
        cobalt_cmd = "%s && %s" % (utils.get_R_exports(), " ".join(map(str, args)))
        do.run(cobalt_cmd, "PURPLE: COBALT read depth normalization")
        for fname in os.listdir(tx_dir):
            if fname == tx_name:
                continue
            shutil.move(os.path.join(tx_dir, fname), os.path.join(cobalt_dir, fname))
    return out_file
def annotate_gemini(data):
    """Report whether population call data is installed for annotation.

    Returns True when the variation resources define a non-empty ``exac``
    entry whose path exists on disk, otherwise False.
    """
    exac_path = dd.get_variation_resources(data).get("exac")
    return bool(exac_path and os.path.exists(exac_path))
def _run_cobalt(paired, work_dir):
    """Run COBALT for counting read depth across genomic windows.

    PURPLE requires even 1000bp windows, so the integrated counting solution
    is used directly rather than converting from CNVkit calculations. If this
    approach is useful it should be moved upstream to be available to other
    tools as an input comparison.

    https://github.com/hartwigmedical/hmftools/tree/master/count-bam-lines

    JVM options from ``_get_jvm_opts`` are placed directly after the COBALT
    executable. Files written to the transactional directory, apart from the
    transactional output itself, are moved into ``<work_dir>/cobalt``.

    Returns the path to the ``<tumor sample>.cobalt`` output file.
    """
    cobalt_dir = utils.safe_makedir(os.path.join(work_dir, "cobalt"))
    sample_name = dd.get_sample_name(paired.tumor_data)
    out_file = os.path.join(cobalt_dir, "%s.cobalt" % sample_name)
    if not utils.file_exists(out_file):
        with file_transaction(paired.tumor_data, out_file) as tx_out_file:
            tx_dir = os.path.dirname(tx_out_file)
            parts = ["COBALT"]
            parts.extend(_get_jvm_opts(tx_out_file, paired.tumor_data))
            parts.extend([
                "-reference", paired.normal_name,
                "-reference_bam", paired.normal_bam,
                "-tumor", paired.tumor_name,
                "-tumor_bam", paired.tumor_bam,
                "-threads", dd.get_num_cores(paired.tumor_data),
                "-output_dir", tx_dir,
                "-gc_profile", dd.get_variation_resources(paired.tumor_data)["gc_profile"],
            ])
            r_exports = utils.get_R_exports()
            full_cmd = "%s && %s" % (r_exports, " ".join(str(p) for p in parts))
            do.run(full_cmd, "PURPLE: COBALT read depth normalization")
            keep_name = os.path.basename(tx_out_file)
            for entry in os.listdir(tx_dir):
                if entry != keep_name:
                    shutil.move(os.path.join(tx_dir, entry),
                                os.path.join(cobalt_dir, entry))
    return out_file
def _run_purecn_dx(out, paired):
    """Extract signatures and mutational burdens from a PureCN rds file.

    When ``out`` has no ``rds`` entry there is no PureCN solution and ``out``
    is returned untouched. Otherwise PureCN's ``Dx.R`` is run unless the
    mutation burden CSV is already up to date relative to the rds file, and
    the files it produced are moved next to the final output base.

    Returns the (possibly refreshed) ``out`` dictionary.
    """
    # no solution - no signatures
    if "rds" not in out:
        return out

    tumor = paired.tumor_data
    rscript = utils.Rscript_cmd()
    dx_script = utils.R_package_script("PureCN", "extdata/Dx.R", env="base")
    repeats_bed = dd.get_variation_resources(tumor)["simple_repeat"]
    callable_bed = dd.get_sample_callable(tumor)
    out_base = utils.splitext_plus(out["rds"])[0]
    burden_csv = out_base + "_mutation_burden.csv"

    if not utils.file_uptodate(burden_csv, out["rds"]):
        # no signatures - so we generate them
        with file_transaction(tumor, out_base) as tx_out_base:
            tx_dir = os.path.dirname(tx_out_base)
            dx_cmd = [
                rscript, dx_script,
                "--rds", out["rds"],
                "--callable", callable_bed,
                "--signatures",
                "--exclude", repeats_bed,
                "--out", tx_out_base,
            ]
            do.run(dx_cmd, "PureCN Dx mutational burden and signatures")
            out_base, out, all_files = _get_purecn_dx_files(paired, out, require_exist=True)
            final_dir = os.path.dirname(out_base)
            # if a file was not generated it would not go to the upload
            for fname in all_files:
                produced = os.path.join(tx_dir, fname)
                if os.path.exists(produced):
                    shutil.move(produced, os.path.join(final_dir, fname))
    return out
def _run_purecn_normaldb(paired, out):
    """Run PureCN with a normal database and native segmentation.

    Args:
        paired: one tumor/normal pair, or a tumor-only pair. ``tumor_data``
            is always read; ``normal_data`` is used when present.
        out: accepted for interface compatibility; it is replaced by the
            result dictionary collected from the work directory.

    Returns:
        The output dictionary produced by ``_get_purecn_files`` for the
        sample's work directory.

    Raises:
        KeyError: if the variation resources lack ``simple_repeat``.
    """
    sample = utils.to_single_data(paired.tumor_data)
    sample_name = dd.get_sample_name(sample)
    work_dir = _sv_workdir(sample)
    rscript = utils.Rscript_cmd("r36")
    purecn_r = utils.R_package_script("r36", "PureCN", "extdata/PureCN.R")
    intervals = tz.get_in(["config", "algorithm", "purecn_bed_ready"], sample)
    bam_file = dd.get_align_bam(sample)
    # germline and somatic - just annotated and filters assigned
    variants_vcf = tz.get_in(["variants"], sample)[0].get("germline")
    # in a T/N case, there is no germline file - vrn file with all variants
    if not variants_vcf:
        variants_vcf = tz.get_in(["variants"], sample)[0].get("vrn_file")
    normaldb = tz.get_in(
        ["config", "algorithm", "background", "cnv_reference", "purecn_normaldb"], sample)
    mappingbiasfile = tz.get_in(
        ["config", "algorithm", "background", "cnv_reference", "purecn_mapping_bias"], sample)
    sample_coverage = tz.get_in(["depth", "bins", "purecn"], sample)
    simple_repeat_bed = dd.get_variation_resources(sample)["simple_repeat"]
    result_file = os.path.join(work_dir, sample_name + ".rds")
    genome = dd.get_genome_build(sample)
    cmd = [rscript, purecn_r,
           "--out", work_dir,
           "--tumor", sample_coverage,
           "--sampleid", sample_name,
           "--vcf", variants_vcf,
           "--normaldb", normaldb,
           "--mappingbiasfile", mappingbiasfile,
           "--intervals", intervals,
           "--snpblacklist", simple_repeat_bed,
           "--genome", genome,
           "--force",
           "--postoptimize",
           "--seed", "123",
           "--bootstrapn", "500",
           "--cores", dd.get_num_cores(sample)]
    resources = config_utils.get_resources("purecn", sample)
    if "options" in resources:
        cmd += [str(x) for x in resources.get("options", [])]
    # it is not recommended to use matched normal sample in PureCN analysis,
    # because then it skips PON coverage normalization and denoising steps!
    # but still, if it is supplied, we use it
    if paired.normal_data:
        normal_sample = utils.to_single_data(paired.normal_data)
        if normal_sample:
            normal_coverage = tz.get_in(["depth", "bins", "purecn"], normal_sample)
            cmd.extend(["--normal", normal_coverage])
    if not os.path.exists(result_file):
        try:
            cmd_line = "export R_LIBS_USER=%s && %s && %s" % (
                utils.R_sitelib(env="r36"),
                utils.get_R_exports(env="r36"),
                " ".join([str(x) for x in cmd]))
            do.run(cmd_line, "PureCN copy number calling")
            logger.debug("Saved PureCN output to " + work_dir)
        except subprocess.CalledProcessError as exc:
            # Best effort: report the failure cause and still collect whatever exists.
            logger.info("PureCN failed: %s", exc)
    _out_base, out, _all_files = _get_purecn_files(paired, work_dir, require_exist=True)
    return out
def annotate_gemini(data, retriever=None):
    """Report whether population call data is installed for annotation.

    Both the ``exac`` and ``gnomad_exome`` variation resources must be
    configured and reported as available by
    ``objectstore.file_exists_or_remote``. ``retriever`` is accepted but not
    used here.
    """
    resources = dd.get_variation_resources(data)
    checks = []
    for key in ("exac", "gnomad_exome"):
        location = resources.get(key)
        checks.append(location and objectstore.file_exists_or_remote(location))
    return all(checks)
def _run_purple(paired, het_file, depth_file, work_dir):
    """Run PURPLE with pre-calculated AMBER and COBALT compatible inputs.

    XXX Need to add output conversion into VCF for standard formats

    Args:
        paired: tumor/normal pair; ``tumor_data`` and ``normal_data`` are read.
        het_file: file passed as ``-baf``; its directory is passed as ``-amber``.
        depth_file: file whose directory is passed as ``-cobalt``.
        work_dir: run directory; outputs go to its ``purple`` subdirectory.

    Returns:
        dict with ``variantcaller``, ``call_file``, ``plot`` (plot PNGs that
        exist, keyed by name) and ``metrics`` (header/value pairs from the
        first two lines of the ``.purple.purity`` file, values converted to
        float where possible).
    """
    tumor_data = paired.tumor_data
    tumor_name = dd.get_sample_name(tumor_data)
    purple_dir = utils.safe_makedir(os.path.join(work_dir, "purple"))
    out_file = os.path.join(purple_dir, "%s.purple.cnv" % tumor_name)
    if not utils.file_exists(out_file):
        with file_transaction(tumor_data, out_file) as tx_out_file:
            tx_dir = os.path.dirname(tx_out_file)
            cmd = ["PURPLE"] + _get_jvm_opts(tx_out_file, tumor_data) + \
                  ["-amber", os.path.dirname(het_file), "-baf", het_file,
                   "-cobalt", os.path.dirname(depth_file),
                   "-gc_profile", dd.get_variation_resources(tumor_data)["gc_profile"],
                   "-output_dir", tx_dir,
                   "-ref_genome", "hg38" if dd.get_genome_build(tumor_data) == "hg38" else "hg19",
                   "-run_dir", work_dir,
                   "-threads", dd.get_num_cores(tumor_data),
                   "-tumor_sample", tumor_name,
                   "-ref_sample", dd.get_sample_name(paired.normal_data)]
            # Avoid X11 display errors when writing plots
            cmd = "unset DISPLAY && %s" % " ".join([str(x) for x in cmd])
            do.run(cmd, "PURPLE: purity and ploidy estimation")
            # Move everything PURPLE wrote except the transactional output itself.
            for f in os.listdir(tx_dir):
                if f != os.path.basename(tx_out_file):
                    shutil.move(os.path.join(tx_dir, f), os.path.join(purple_dir, f))
    out_file_export = os.path.join(purple_dir, "%s-purple-cnv.tsv" % tumor_name)
    if not utils.file_exists(out_file_export):
        utils.symlink_plus(out_file, out_file_export)
    out = {"variantcaller": "purple", "call_file": out_file_export,
           "plot": {}, "metrics": {}}
    for name, ext in [("copy_number", "copyNumber"),
                      ("minor_allele", "minor_allele"),
                      ("variant", "variant")]:
        plot_file = os.path.join(purple_dir, "plot", "%s.%s.png" % (tumor_name, ext))
        if os.path.exists(plot_file):
            out["plot"][name] = plot_file
    purity_file = os.path.join(purple_dir, "%s.purple.purity" % tumor_name)
    with open(purity_file) as in_handle:
        # readline() keeps the line ending; strip it so the last column's
        # name and value do not end in a newline.
        header = in_handle.readline().rstrip("\r\n").replace("#", "").split("\t")
        vals = in_handle.readline().rstrip("\r\n").split("\t")
    for h, v in zip(header, vals):
        try:
            v = float(v)
        except ValueError:
            # Non-numeric columns are kept as strings.
            pass
        out["metrics"][h] = v
    return out
def _annotate_somatic(data, retriever=None):
    """Report whether somatic calls can be annotated with COSMIC data.

    True only for human samples that are part of a pair and whose ``cosmic``
    variation resource is configured and reported as available by
    ``objectstore.file_exists_or_remote``. ``retriever`` is accepted but not
    used here.
    """
    if not is_human(data):
        return False
    if not vcfutils.get_paired([data]):
        return False
    cosmic = dd.get_variation_resources(data).get("cosmic")
    return bool(cosmic and objectstore.file_exists_or_remote(cosmic))
def _annotate_somatic(data):
    """Report whether somatic calls can be annotated with COSMIC data.

    True only for human samples that are part of a pair and whose ``cosmic``
    variation resource is configured and exists on disk.
    """
    if is_human(data) and vcfutils.get_paired([data]):
        resources = dd.get_variation_resources(data)
        cosmic_path = resources.get("cosmic")
        if cosmic_path:
            return os.path.exists(cosmic_path)
    return False
def _run_purple(paired, het_file, depth_file, vrn_files, work_dir):
    """Run PURPLE with pre-calculated AMBER and COBALT compatible inputs.

    Args:
        paired: tumor/normal pair; ``tumor_data`` and ``normal_data`` are read.
        het_file: file passed as ``-baf``; its directory is passed as ``-amber``.
        depth_file: file whose directory is passed as ``-cobalt``.
        vrn_files: optional list of variant call dicts; when non-empty the
            first entry's ``vrn_file`` is passed as ``-somatic_vcf``.
        work_dir: run directory; outputs go to its ``purple`` subdirectory.

    Returns:
        dict with ``variantcaller``, ``call_file``, ``vrn_file`` (result of
        ``titancna.to_vcf`` on the exported calls), ``plot`` (plot PNGs that
        exist, keyed by name) and ``metrics`` (header/value pairs from the
        first two lines of the ``.purple.purity`` file, values converted to
        float where possible).
    """
    tumor_data = paired.tumor_data
    tumor_name = dd.get_sample_name(tumor_data)
    purple_dir = utils.safe_makedir(os.path.join(work_dir, "purple"))
    out_file = os.path.join(purple_dir, "%s.purple.cnv" % tumor_name)
    if not utils.file_exists(out_file):
        with file_transaction(tumor_data, out_file) as tx_out_file:
            tx_dir = os.path.dirname(tx_out_file)
            cmd = ["PURPLE"] + _get_jvm_opts(tx_out_file, tumor_data) + \
                  ["-amber", os.path.dirname(het_file), "-baf", het_file,
                   "-cobalt", os.path.dirname(depth_file),
                   "-gc_profile", dd.get_variation_resources(tumor_data)["gc_profile"],
                   "-output_dir", tx_dir,
                   "-ref_genome", "hg38" if dd.get_genome_build(tumor_data) == "hg38" else "hg19",
                   "-run_dir", work_dir,
                   "-threads", dd.get_num_cores(tumor_data),
                   "-tumor_sample", tumor_name,
                   "-ref_sample", dd.get_sample_name(paired.normal_data)]
            if vrn_files:
                cmd += ["-somatic_vcf", vrn_files[0]["vrn_file"]]
            # Avoid X11 display errors when writing plots
            cmd = "unset DISPLAY && %s" % " ".join([str(x) for x in cmd])
            do.run(cmd, "PURPLE: purity and ploidy estimation")
            # Move everything PURPLE wrote except the transactional output itself.
            for f in os.listdir(tx_dir):
                if f != os.path.basename(tx_out_file):
                    shutil.move(os.path.join(tx_dir, f), os.path.join(purple_dir, f))
    out_file_export = os.path.join(purple_dir, "%s-purple-cnv.tsv" % tumor_name)
    if not utils.file_exists(out_file_export):
        utils.symlink_plus(out_file, out_file_export)
    out = {"variantcaller": "purple", "call_file": out_file_export,
           "vrn_file": titancna.to_vcf(out_file_export, "PURPLE", _get_header,
                                       _export_to_vcf, tumor_data),
           "plot": {}, "metrics": {}}
    for name, ext in [("copy_number", "copyNumber"),
                      ("minor_allele", "minor_allele"),
                      ("variant", "variant")]:
        plot_file = os.path.join(purple_dir, "plot", "%s.%s.png" % (tumor_name, ext))
        if os.path.exists(plot_file):
            out["plot"][name] = plot_file
    purity_file = os.path.join(purple_dir, "%s.purple.purity" % tumor_name)
    with open(purity_file) as in_handle:
        # readline() keeps the line ending; strip it so the last column's
        # name and value do not end in a newline.
        header = in_handle.readline().rstrip("\r\n").replace("#", "").split("\t")
        vals = in_handle.readline().rstrip("\r\n").split("\t")
    for h, v in zip(header, vals):
        try:
            v = float(v)
        except ValueError:
            # Non-numeric columns are kept as strings.
            pass
        out["metrics"][h] = v
    return out
def bqsr_table(data):
    """Generate the recalibration table used as input to BQSR.

    Runs Sentieon ``QualCal`` on the aligned BAM, passing ``-k`` with the
    ``dbsnp`` resource when the variation resources contain that key. The run
    is skipped when the table is already up to date relative to the BAM.

    Returns the path to the ``-recal-table.txt`` file.
    """
    in_file = dd.get_align_bam(data)
    out_file = "%s-recal-table.txt" % utils.splitext_plus(in_file)[0]
    if utils.file_uptodate(out_file, in_file):
        return out_file

    with file_transaction(data, out_file) as tx_out_file:
        resources = dd.get_variation_resources(data)
        known = ""
        if "dbsnp" in resources:
            known = "-k %s" % (resources.get("dbsnp"))
        template = ("{license}sentieon driver -t {cores} -r {ref_file} "
                    "-i {in_file} --algo QualCal {known} {tx_out_file}")
        sentieon_cmd = template.format(
            license=license_export(data),
            cores=dd.get_num_cores(data),
            ref_file=dd.get_ref_file(data),
            in_file=in_file,
            known=known,
            tx_out_file=tx_out_file,
        )
        do.run(sentieon_cmd, "Sentieon QualCal generate table")
    return out_file
def bqsr_table(data):
    """Build the Sentieon QualCal recalibration table for the aligned BAM.

    The table is written next to the input BAM as ``<base>-recal-table.txt``
    and is only regenerated when it is not up to date relative to the BAM.
    The ``dbsnp`` resource is supplied via ``-k`` when that key is present in
    the variation resources.

    Returns the path to the recalibration table.
    """
    in_file = dd.get_align_bam(data)
    base = utils.splitext_plus(in_file)[0]
    out_file = "%s-recal-table.txt" % base
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            assoc_files = dd.get_variation_resources(data)
            if "dbsnp" in assoc_files:
                known_arg = "-k %s" % (assoc_files.get("dbsnp"))
            else:
                known_arg = ""
            fields = {}
            fields["license"] = license_export(data)
            fields["cores"] = dd.get_num_cores(data)
            fields["ref_file"] = dd.get_ref_file(data)
            fields["in_file"] = in_file
            fields["known"] = known_arg
            fields["tx_out_file"] = tx_out_file
            cmd = ("{license}sentieon driver -t {cores} -r {ref_file} "
                   "-i {in_file} --algo QualCal {known} {tx_out_file}")
            do.run(cmd.format(**fields), "Sentieon QualCal generate table")
    return out_file
def apply_bqsr(data):
    """Apply recalibration, producing an updated BAM file.

    Runs Sentieon ``QualCal`` followed by ``ReadWriter`` on the aligned BAM,
    writing a post-recalibration table and a ``-recal.bam`` file. The run is
    skipped when the recalibrated BAM is up to date relative to the input.
    ``-k`` with the ``dbsnp`` resource is passed when that key is present in
    the variation resources.

    Returns the path to the recalibrated BAM.
    """
    in_file = dd.get_align_bam(data)
    out_table_file = "%s-recal-table-post.txt" % utils.splitext_plus(in_file)[0]
    out_file = "%s-recal.bam" % utils.splitext_plus(in_file)[0]
    if utils.file_uptodate(out_file, in_file):
        return out_file

    with file_transaction(data, out_file, out_table_file) as (tx_out_file, tx_table_file):
        resources = dd.get_variation_resources(data)
        known = ""
        if "dbsnp" in resources:
            known = "-k %s" % (resources.get("dbsnp"))
        template = ("{license}sentieon driver -t {cores} -r {ref_file} "
                    "-i {in_file} --algo QualCal {known} {tx_table_file} "
                    "--algo ReadWriter {tx_out_file}")
        sentieon_cmd = template.format(
            license=license_export(data),
            cores=dd.get_num_cores(data),
            ref_file=dd.get_ref_file(data),
            in_file=in_file,
            known=known,
            tx_table_file=tx_table_file,
            tx_out_file=tx_out_file,
        )
        do.run(sentieon_cmd, "Sentieon QualCal apply recalibration")
    return out_file
def _run_purecn_dx(out, paired):
    """Extract signatures and mutational burdens from a PureCN rds file.

    PureCN's ``Dx.R`` (``r36`` environment) is run unless the mutation burden
    output is already up to date relative to the rds file. Each expected file
    that was produced in the transactional directory is moved next to the
    final output base.

    Returns the ``out`` dictionary from ``_get_purecn_dx_files``.
    """
    out_base, out, all_files = _get_purecn_dx_files(paired, out)
    tumor = paired.tumor_data
    rscript = utils.Rscript_cmd("r36")
    dx_script = utils.R_package_script("r36", "PureCN", "extdata/Dx.R")
    repeats_bed = dd.get_variation_resources(tumor)["simple_repeat"]
    callable_bed = dd.get_sample_callable(tumor)
    if utils.file_uptodate(out["mutation_burden"], out["rds"]):
        return out

    final_dir = os.path.dirname(out_base)
    with file_transaction(tumor, out_base) as tx_out_base:
        tx_dir = os.path.dirname(tx_out_base)
        dx_cmd = [
            rscript, dx_script,
            "--rds", out["rds"],
            "--callable", callable_bed,
            "--signatures",
            "--exclude", repeats_bed,
            "--out", tx_out_base,
        ]
        do.run(dx_cmd, "PureCN Dx mutational burden and signatures")
        for fname in all_files:
            produced = os.path.join(tx_dir, fname)
            if os.path.exists(produced):
                shutil.move(produced, os.path.join(final_dir, fname))
    return out
def apply_bqsr(data):
    """Apply base quality recalibration and write an updated BAM file.

    Sentieon ``QualCal`` and ``ReadWriter`` are run in one driver call to
    produce ``<base>-recal-table-post.txt`` and ``<base>-recal.bam`` beside
    the input BAM. Nothing is run when the recalibrated BAM is up to date
    relative to the input. The ``dbsnp`` resource is supplied via ``-k`` when
    that key is present in the variation resources.

    Returns the path to the recalibrated BAM.
    """
    in_file = dd.get_align_bam(data)
    out_table_file = "%s-recal-table-post.txt" % utils.splitext_plus(in_file)[0]
    out_file = "%s-recal.bam" % utils.splitext_plus(in_file)[0]
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file, out_table_file) as tx_paths:
            tx_out_file, tx_table_file = tx_paths
            assoc_files = dd.get_variation_resources(data)
            if "dbsnp" in assoc_files:
                known_arg = "-k %s" % (assoc_files.get("dbsnp"))
            else:
                known_arg = ""
            fields = {}
            fields["license"] = license_export(data)
            fields["cores"] = dd.get_num_cores(data)
            fields["ref_file"] = dd.get_ref_file(data)
            fields["in_file"] = in_file
            fields["known"] = known_arg
            fields["tx_table_file"] = tx_table_file
            fields["tx_out_file"] = tx_out_file
            cmd = ("{license}sentieon driver -t {cores} -r {ref_file} "
                   "-i {in_file} --algo QualCal {known} {tx_table_file} "
                   "--algo ReadWriter {tx_out_file}")
            do.run(cmd.format(**fields), "Sentieon QualCal apply recalibration")
    return out_file
def annotate_gemini(data, retriever=None):
    """Report whether population call data is installed for annotation.

    Requires both the ``exac`` and ``gnomad_exome`` variation resources to be
    configured and reported as available by
    ``objectstore.file_exists_or_remote``. ``retriever`` is accepted but not
    used here.
    """
    resources = dd.get_variation_resources(data)

    def _available(key):
        # Falsy when the resource is not configured; otherwise the availability check.
        return resources.get(key) and objectstore.file_exists_or_remote(resources[key])

    return all([_available(key) for key in ("exac", "gnomad_exome")])
def annotate_gemini(data):
    """Report whether population call data is installed for annotation.

    Requires both the ``exac`` and ``gnomad_exome`` variation resources to be
    configured and to exist on disk.
    """
    resources = dd.get_variation_resources(data)
    found = []
    for key in ("exac", "gnomad_exome"):
        location = resources.get(key)
        found.append(location and os.path.exists(location))
    return all(found)