def _add_variantcalls_to_output(out, data): """Call ploidy and convert into VCF and BED representations. """ call_file = "%s-call%s" % os.path.splitext(out["cns"]) gender = population.get_gender(data) if not utils.file_exists(call_file): with file_transaction(data, call_file) as tx_call_file: cmd = [os.path.join(os.path.dirname(sys.executable), "cnvkit.py"), "call", "--ploidy", str(dd.get_ploidy(data)), "-o", tx_call_file, out["cns"]] if gender and gender.lower() != "unknown": cmd += ["--gender", gender] if gender.lower() == "male": cmd += ["--male-reference"] do.run(cmd, "CNVkit call ploidy") calls = {} for outformat in ["bed", "vcf"]: out_file = "%s.%s" % (os.path.splitext(call_file)[0], outformat) calls[outformat] = out_file if not utils.file_exists(out_file): with file_transaction(data, out_file) as tx_out_file: cmd = [os.path.join(os.path.dirname(sys.executable), "cnvkit.py"), "export", outformat, "--sample-id", dd.get_sample_name(data), "--ploidy", str(dd.get_ploidy(data)), "-o", tx_out_file, call_file] if gender and gender.lower() == "male": cmd += ["--male-reference"] do.run(cmd, "CNVkit export %s" % outformat) out["call_file"] = call_file out["vrn_bed"] = annotate.add_genes(calls["bed"], data) effects_vcf, _ = effects.add_to_vcf(calls["vcf"], data, "snpeff") out["vrn_file"] = effects_vcf or calls["vcf"] return out
def _get_batch_gender(items): """Retrieve gender for a batch of items if consistent. Better not to specify for mixed populations, CNVkit will work it out https://github.com/bcbio/bcbio-nextgen/commit/1a0e217c8a4d3cee10fa890fb3cfd4db5034281d#r26279752 """ genders = set([population.get_gender(x) for x in items]) if len(genders) == 1: gender = genders.pop() if gender != "unknown": return gender
def _add_diagram_plot(out, data): out_file = "%s-diagram.pdf" % os.path.splitext(out["cnr"])[0] cnr = _remove_haplotype_chroms(out["cnr"], data) cns = _remove_haplotype_chroms(out["cns"], data) if _cnx_is_empty(cnr) or _cnx_is_empty(cns): return None if not utils.file_exists(out_file): with file_transaction(data, out_file) as tx_out_file: cmd = [_get_cmd(), "diagram", "-s", cns, "-o", tx_out_file, cnr] gender = population.get_gender(data) if gender and gender.lower() == "male": cmd += ["--male-reference"] do.run(_prep_cmd(cmd, tx_out_file), "CNVkit diagram plot") return out_file
def _do_run(paired): """Perform Battenberg caling with the paired dataset. This purposely does not use a temporary directory for the output since Battenberg does smart restarts. """ work_dir = _sv_workdir(paired.tumor_data) out = _get_battenberg_out(paired, work_dir) ignore_file = os.path.join(work_dir, "ignore_chromosomes.txt") if len(_missing_files(out)) > 0: ref_file = dd.get_ref_file(paired.tumor_data) bat_datadir = os.path.normpath( os.path.join(os.path.dirname(ref_file), os.pardir, "battenberg")) ignore_file, gl_file = _make_ignore_file( work_dir, ref_file, ignore_file, os.path.join(bat_datadir, "impute", "impute_info.txt")) local_sitelib = os.path.join( install.get_defaults().get("tooldir", "/usr/local"), "lib", "R", "site-library") tumor_bam = paired.tumor_bam normal_bam = paired.normal_bam platform = dd.get_platform(paired.tumor_data) genome_build = paired.tumor_data["genome_build"] # scale cores to avoid over-using memory during imputation cores = max(1, int(dd.get_num_cores(paired.tumor_data) * 0.5)) gender = { "male": "XY", "female": "XX", "unknown": "L" }.get(population.get_gender(paired.tumor_data)) if gender == "L": gender_str = "-ge %s -gl %s" % (gender, gl_file) else: gender_str = "-ge %s" % (gender) r_export_cmd = "unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname( utils.Rscript_cmd()) cmd = ( "export R_LIBS_USER={local_sitelib} && {r_export_cmd}" "battenberg.pl -t {cores} -o {work_dir} -r {ref_file}.fai " "-tb {tumor_bam} -nb {normal_bam} -e {bat_datadir}/impute/impute_info.txt " "-u {bat_datadir}/1000genomesloci -c {bat_datadir}/probloci.txt " "-ig {ignore_file} {gender_str} " "-assembly {genome_build} -species Human -platform {platform}") do.run(cmd.format(**locals()), "Battenberg CNV calling") assert len(_missing_files( out)) == 0, "Missing Battenberg output: %s" % _missing_files(out) out["plot"] = _get_battenberg_out_plots(paired, work_dir) out["ignore"] = ignore_file return out
def _add_diagram_plot(out, data): out_file = "%s-diagram.pdf" % os.path.splitext(out["cnr"])[0] cnr = _remove_haplotype_chroms(out["cnr"], data) cns = _remove_haplotype_chroms(out["cns"], data) if _cnx_is_empty(cnr) or _cnx_is_empty(cns): return None if not utils.file_exists(out_file): with file_transaction(data, out_file) as tx_out_file: cmd = [_get_cmd(), "diagram", "-s", cns, "-o", tx_out_file, cnr] gender = population.get_gender(data) if gender and gender.lower() == "male": cmd += ["--male-reference"] do.run(cmd, "CNVkit diagram plot") return out_file
def _add_variantcalls_to_output(out, data, items, is_somatic=False): """Call ploidy and convert into VCF and BED representations. """ call_file = "%s-call%s" % os.path.splitext(out["cns"]) if not utils.file_exists(call_file): with file_transaction(data, call_file) as tx_call_file: filters = ["--filter", "cn"] cmd = [os.path.join(os.path.dirname(sys.executable), "cnvkit.py"), "call"] + \ filters + \ ["--ploidy", str(ploidy.get_ploidy([data])), "-o", tx_call_file, out["cns"]] small_vrn_files = _compatible_small_variants(data, items) if len(small_vrn_files) > 0 and _cna_has_values(out["cns"]): cmd += [ "--vcf", small_vrn_files[0].name, "--sample-id", small_vrn_files[0].sample ] if small_vrn_files[0].normal: cmd += ["--normal-id", small_vrn_files[0].normal] if not is_somatic: cmd += ["-m", "clonal"] gender = population.get_gender(data) if gender and gender.lower() != "unknown": cmd += ["--gender", gender] if gender.lower() == "male": cmd += ["--male-reference"] do.run(cmd, "CNVkit call ploidy") calls = {} for outformat in ["bed", "vcf"]: out_file = "%s.%s" % (os.path.splitext(call_file)[0], outformat) calls[outformat] = out_file if not os.path.exists(out_file): with file_transaction(data, out_file) as tx_out_file: cmd = [ os.path.join(os.path.dirname(sys.executable), "cnvkit.py"), "export", outformat, "--sample-id", dd.get_sample_name(data), "--ploidy", str(ploidy.get_ploidy([data])), "-o", tx_out_file, call_file ] if gender and gender.lower() == "male": cmd += ["--male-reference"] do.run(cmd, "CNVkit export %s" % outformat) out["call_file"] = call_file out["vrn_bed"] = annotate.add_genes(calls["bed"], data) effects_vcf, _ = effects.add_to_vcf(calls["vcf"], data, "snpeff") out["vrn_file"] = effects_vcf or calls["vcf"] return out
def _cnvkit_background(background_cnns, out_file, target_bed, antitarget_bed, data): """Calculate background reference, handling flat case with no normal sample. """ if not utils.file_exists(out_file): with file_transaction(data, out_file) as tx_out_file: cmd = [_get_cmd(), "reference", "-f", dd.get_ref_file(data), "-o", tx_out_file] gender = population.get_gender(data) if gender and gender.lower() != "unknown": cmd += ["--gender", gender] if gender.lower() == "male": cmd += ["--male-reference"] if len(background_cnns) == 0: cmd += ["-t", target_bed, "-a", antitarget_bed] else: cmd += background_cnns do.run(_prep_cmd(cmd, tx_out_file), "CNVkit background") return out_file
def cnvkit_background(background_cnns, out_file, items, target_bed=None, antitarget_bed=None): """Calculate background reference, handling flat case with no normal sample. """ if not utils.file_exists(out_file): with file_transaction(items[0], out_file) as tx_out_file: cmd = [_get_cmd(), "reference", "-f", dd.get_ref_file(items[0]), "-o", tx_out_file] genders = set([population.get_gender(x) for x in items]) genders.discard("unknown") if len(genders) == 1: gender = genders.pop() cmd += ["--gender", gender] if gender.lower() == "male": cmd += ["--male-reference"] if len(background_cnns) == 0: assert target_bed and antitarget_bed, "Missing CNNs and target BEDs for flat background" cmd += ["-t", target_bed, "-a", antitarget_bed] else: cmd += background_cnns do.run(_prep_cmd(cmd, tx_out_file), "CNVkit background") return out_file
def _do_run(paired): """Perform Battenberg caling with the paired dataset. This purposely does not use a temporary directory for the output since Battenberg does smart restarts. """ work_dir = _sv_workdir(paired.tumor_data) out = _get_battenberg_out(paired, work_dir) ignore_file = os.path.join(work_dir, "ignore_chromosomes.txt") if len(_missing_files(out)) > 0: ref_file = dd.get_ref_file(paired.tumor_data) bat_datadir = os.path.normpath(os.path.join(os.path.dirname(ref_file), os.pardir, "battenberg")) ignore_file, gl_file = _make_ignore_file(work_dir, ref_file, ignore_file, os.path.join(bat_datadir, "impute", "impute_info.txt")) local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"), "lib", "R", "site-library") tumor_bam = paired.tumor_bam normal_bam = paired.normal_bam platform = dd.get_platform(paired.tumor_data) genome_build = paired.tumor_data["genome_build"] # scale cores to avoid over-using memory during imputation cores = max(1, int(dd.get_num_cores(paired.tumor_data) * 0.5)) gender = {"male": "XY", "female": "XX", "unknown": "L"}.get(population.get_gender(paired.tumor_data)) if gender == "L": gender_str = "-ge %s -gl %s" % (gender, gl_file) else: gender_str = "-ge %s" % (gender) r_export_cmd = "unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname(utils.Rscript_cmd()) cmd = ("export R_LIBS_USER={local_sitelib} && {r_export_cmd}" "battenberg.pl -t {cores} -o {work_dir} -r {ref_file}.fai " "-tb {tumor_bam} -nb {normal_bam} -e {bat_datadir}/impute/impute_info.txt " "-u {bat_datadir}/1000genomesloci -c {bat_datadir}/probloci.txt " "-ig {ignore_file} {gender_str} " "-assembly {genome_build} -species Human -platform {platform}") do.run(cmd.format(**locals()), "Battenberg CNV calling") assert len(_missing_files(out)) == 0, "Missing Battenberg output: %s" % _missing_files(out) out["plot"] = _get_battenberg_out_plots(paired, work_dir) out["ignore"] = ignore_file return out