def _run_somatic(paired, ref_file, assoc_files, region, out_file, work_dir):
    """Run the somatic workflow for a tumor/normal pair and merge SNV and indel calls.

    Nothing is done when ``out_file`` already exists. When configuration yields
    no workflow file, an empty VCF listing the tumor and normal sample names is
    written to ``out_file`` instead of running the workflow.

    Returns ``out_file``.
    """
    if utils.file_exists(out_file):
        return out_file
    tumor = paired.tumor_data
    with file_transaction(tumor, work_dir) as tx_work_dir:
        workflow_file = _configure_somatic(paired, ref_file, region, out_file, tx_work_dir)
        has_variants = bool(workflow_file)
        if has_variants:
            _run_workflow(tumor, workflow_file, tx_work_dir)
        else:
            sample_names = [dd.get_sample_name(d) for d in [tumor, paired.normal_data]]
            vcfutils.write_empty_vcf(out_file, tumor["config"], sample_names)
    if has_variants:
        # Results are read from the final work directory, not the transactional one.
        var_dir = os.path.join(work_dir, "results", "variants")
        processed = []
        for vcf_name in ["somatic.snvs.vcf.gz", "somatic.indels.vcf.gz"]:
            processed.append(_postprocess_somatic(os.path.join(var_dir, vcf_name), paired))
        vcfutils.combine_variant_files(processed, out_file, ref_file,
                                       paired.tumor_data["config"], region=region)
    return out_file
def mutect_caller(align_bams, items, ref_file, assoc_files, region=None,
                  out_file=None):
    """Run the MuTect paired analysis algorithm.

    MuTect output is written to a ``-mutect`` sibling of ``out_file``. Indels
    are added from Scalpel when it is installed, otherwise the MuTect file is
    symlinked to ``out_file``. The SomaticIndelDetector branch is only reached
    when ``disable_SID`` is turned off and the runner reports an "appistry"
    version.

    When ``out_file`` is None it defaults to a ``-paired-variants.vcf.gz`` name
    derived from the first BAM.

    Returns the path of the final VCF. This includes the case where an empty
    VCF is written because a BAM has no aligned reads in ``region``.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        broad_runner = broad.runner_from_config(config, "mutect")
        out_file_mutect = (out_file.replace(".vcf", "-mutect.vcf")
                           if "vcf" in out_file else out_file + "-mutect.vcf")
        broad_runner, params = \
            _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                              region, out_file_mutect)
        if (not isinstance(region, (list, tuple)) and
                not all(has_aligned_reads(x, region) for x in align_bams)):
            vcfutils.write_empty_vcf(out_file)
            # Return the path of the empty VCF so every exit of this function
            # yields the output file rather than None.
            return out_file
        with file_transaction(out_file_mutect) as tx_out_file:
            # Rationale: MuTect writes another table to stdout, which we don't need
            params += ["--vcf", tx_out_file, "-o", os.devnull]
            broad_runner.run_mutect(params)
        _rename_allelic_fraction_field(out_file_mutect, config)
        disable_SID = True  # SID isn't great, so use Scalpel instead
        if "appistry" not in broad_runner.get_mutect_version() or disable_SID:
            # Scalpel InDels
            is_paired = "-I:normal" in params
            out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                               if "vcf" in out_file else out_file + "-somaticIndels.vcf")
            if scalpel.is_installed(items[0]["config"]):
                with file_transaction(out_file_indels) as tx_out_file2:
                    if not is_paired:
                        scalpel._run_scalpel_caller(align_bams, items, ref_file, assoc_files,
                                                    region=region, out_file=tx_out_file2)
                    else:
                        scalpel._run_scalpel_paired(align_bams, items, ref_file, assoc_files,
                                                    region=region, out_file=tx_out_file2)
                out_file = vcfutils.combine_variant_files(
                    orig_files=[out_file_mutect, out_file_indels],
                    out_file=out_file,
                    ref_file=items[0]["sam_ref"],
                    config=items[0]["config"],
                    region=region)
            else:
                utils.symlink_plus(out_file_mutect, out_file)
        else:
            # SomaticIndelDetector modifications
            out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                               if "vcf" in out_file else out_file + "-somaticIndels.vcf")
            params_indels = _SID_call_prep(align_bams, items, ref_file, assoc_files,
                                           region, out_file_indels)
            with file_transaction(out_file_indels) as tx_out_file:
                params_indels += ["-o", tx_out_file]
                broad_runner.run_mutect(params_indels)
            out_file = vcfutils.combine_variant_files(
                orig_files=[out_file_mutect, out_file_indels],
                out_file=out_file,
                ref_file=items[0]["sam_ref"],
                config=items[0]["config"],
                region=region)
    return out_file
def _run_somatic(paired, ref_file, assoc_files, region, out_file, work_dir):
    """Configure and run the somatic workflow, then merge its SNV and indel VCFs.

    Skips all work when ``out_file`` already exists. Returns ``out_file``.
    """
    if utils.file_exists(out_file):
        return out_file
    tumor = paired.tumor_data
    with file_transaction(tumor, work_dir) as tx_work_dir:
        workflow_file = _configure_somatic(paired, ref_file, region, out_file, tx_work_dir)
        _run_workflow(tumor, workflow_file, tx_work_dir)
    # Per-type calls are picked up from the final work directory.
    var_dir = os.path.join(work_dir, "results", "variants")
    processed = []
    for vcf_name in ["somatic.snvs.vcf.gz", "somatic.indels.vcf.gz"]:
        processed.append(_postprocess_somatic(os.path.join(var_dir, vcf_name), paired))
    vcfutils.combine_variant_files(processed, out_file, ref_file,
                                   paired.tumor_data["config"], region=region)
    return out_file
def _run_somatic(paired, ref_file, assoc_files, region, out_file, work_dir):
    """Run the somatic workflow for ``paired`` and combine SNV and indel results.

    When ``out_file`` is already present the function returns it untouched.
    Otherwise the workflow is configured and run inside a transactional work
    directory, and the post-processed per-type VCFs are merged into ``out_file``.
    """
    already_done = utils.file_exists(out_file)
    if not already_done:
        tumor_data = paired.tumor_data
        with file_transaction(tumor_data, work_dir) as tx_dir:
            wf_file = _configure_somatic(paired, ref_file, region, out_file, tx_dir)
            _run_workflow(tumor_data, wf_file, tx_dir)
        variants_dir = os.path.join(work_dir, "results", "variants")
        result_names = ["somatic.snvs.vcf.gz", "somatic.indels.vcf.gz"]
        to_merge = [_postprocess_somatic(os.path.join(variants_dir, name), paired)
                    for name in result_names]
        vcfutils.combine_variant_files(to_merge, out_file, ref_file,
                                       paired.tumor_data["config"], region=region)
    return out_file
def test_3_vcf_split_combine(self):
    """Split a VCF file into SNPs and indels, then combine back together.

    The split and merged files are removed afterwards.
    """
    with make_workdir() as workdir:
        yaml_file = get_post_process_yaml(self.data_dir, workdir)
        config = load_config(yaml_file)
        config["algorithm"] = {}
    ref_file = os.path.join(self.data_dir, "genomes", "hg19", "seq", "hg19.fa")
    fname = os.path.join(self.var_dir, "S1-variants.vcf")
    snp_file, indel_file = vcfutils.split_snps_indels(fname, ref_file, config)
    base, ext = os.path.splitext(fname)
    merge_file = "%s-merge%s.gz" % (base, ext)
    vcfutils.combine_variant_files([snp_file, indel_file], merge_file, ref_file, config)
    for created in [snp_file, indel_file, merge_file]:
        self._remove_vcf(created)
def test_3_vcf_split_combine(self):
    """Split a VCF file into SNPs and indels, then combine back together.

    Configuration is loaded from the automated test directory; the generated
    files are cleaned up at the end.
    """
    with make_workdir() as workdir:
        yaml_file = get_post_process_yaml(self.automated_dir, workdir)
        config = load_config(yaml_file)
        config["algorithm"] = {}
    ref_file = os.path.join(self.data_dir, "genomes", "hg19", "seq", "hg19.fa")
    fname = os.path.join(self.var_dir, "S1-variants.vcf")
    snp_file, indel_file = vcfutils.split_snps_indels(fname, ref_file, config)
    base, ext = os.path.splitext(fname)
    merge_file = "%s-merge%s.gz" % (base, ext)
    vcfutils.combine_variant_files([snp_file, indel_file], merge_file, ref_file, config)
    for created in [snp_file, indel_file, merge_file]:
        self._remove_vcf(created)
def test_3_vcf_split_combine(self, global_config):
    """Split a VCF file into SNPs and indels, then combine back together.

    ``global_config`` is passed to ``load_config``; the algorithm section is
    reset to an empty dict before use. Generated files are removed afterwards.
    """
    from bcbio.variation import vcfutils

    config = load_config(global_config)
    config["algorithm"] = {}
    fname = os.path.join(self.var_dir, "S1-variants.vcf")
    snp_file, indel_file = vcfutils.split_snps_indels(fname, self.ref_file, config)
    base, ext = os.path.splitext(fname)
    merge_file = "%s-merge%s.gz" % (base, ext)
    vcfutils.combine_variant_files([snp_file, indel_file], merge_file,
                                   self.ref_file, config)
    for created in [snp_file, indel_file, merge_file]:
        self._remove_vcf(created)
def variant_filtration(call_file, ref_file, vrn_files, config):
    """Filter variant calls using Variant Quality Score Recalibration.

    Newer GATK with Haplotype calling has combined SNP/indel filtering.
    Dispatch is on the configured ``variantcaller``; callers not handled
    specially get SNPs and indels split, filtered separately and recombined.
    """
    broad_runner = broad.runner_from_config(config)
    caller = config["algorithm"].get("variantcaller")
    if caller in ["gatk-haplotype"] and not _no_vqsr(config):
        return _variant_filtration_both(broad_runner, call_file, ref_file,
                                        vrn_files, config)
    if caller in ["freebayes"]:
        return filter_freebayes(broad_runner, call_file, ref_file, vrn_files, config)
    # no additional filtration for callers that filter as part of call process
    if caller in ["samtools", "varscan"]:
        return call_file
    snp_file, indel_file = vcfutils.split_snps_indels(broad_runner, call_file, ref_file)
    filtered_snps = _variant_filtration_snp(broad_runner, snp_file, ref_file,
                                            vrn_files, config)
    filtered_indels = _variant_filtration_indel(broad_runner, indel_file, ref_file,
                                                vrn_files, config)
    orig_files = [filtered_snps, filtered_indels]
    shared_prefix = os.path.commonprefix(orig_files)
    out_file = "{base}combined.vcf".format(base=shared_prefix)
    return vcfutils.combine_variant_files(orig_files, out_file, ref_file, config)
def run(call_file, ref_file, vrn_files, data):
    """Run filtering on the input call file, handling SNPs and indels separately.

    For VQSR the file is split into SNPs and indels, each is filtered, and the
    results are recombined. For hard filters the SNP and indel filters are
    chained on the original file.
    """
    config = data["config"]
    algs = [config["algorithm"]] * len(data.get("vrn_files", [1]))
    if not config_utils.use_vqsr(algs):
        snp_filter = vfilter.gatk_snp_hard(call_file, data)
        indel_filter = vfilter.gatk_indel_hard(snp_filter, data)
        if "gvcf" in dd.get_tools_on(data):
            return indel_filter
        return _filter_nonref(indel_filter, data)
    assert "gvcf" not in dd.get_tools_on(data), \
        ("Cannot force gVCF output and use VQSR. Try using hard filtering with tools_off: [vqsr]")
    snp_file, indel_file = vcfutils.split_snps_indels(call_file, ref_file, data["config"])
    filtered_snps = _variant_filtration(snp_file, ref_file, vrn_files, data,
                                        "SNP", vfilter.gatk_snp_hard)
    filtered_indels = _variant_filtration(indel_file, ref_file, vrn_files, data,
                                          "INDEL", vfilter.gatk_indel_hard)
    orig_files = [filtered_snps, filtered_indels]
    out_file = "%scombined.vcf.gz" % os.path.commonprefix(orig_files)
    combined_file = vcfutils.combine_variant_files(orig_files, out_file, ref_file,
                                                   data["config"])
    return _filter_nonref(combined_file, data)
def get_multisample_vcf(fnames, name, caller, data):
    """Retrieve a multiple sample VCF file in a standard location.

    The combined file is named ``<name>-<caller>.vcf`` inside the ``gemini``
    directory under the work directory.
    """
    work_dir = data["dirs"]["work"]
    out_dir = utils.safe_makedir(os.path.join(work_dir, "gemini"))
    vcf_name = "%s-%s.vcf" % (name, caller)
    gemini_vcf = os.path.join(out_dir, vcf_name)
    return vcfutils.combine_variant_files(fnames, gemini_vcf, data["sam_ref"],
                                          data["config"])
def prep_gemini_db(fnames, call_id, samples, data):
    """Prepare a gemini database from VCF inputs prepared with snpEff.

    With more than one input VCF the files are first combined into a single
    population VCF next to the database. The database is only built when
    ``_do_db_build(samples)`` allows it and the database file does not exist.

    Returns ``[[call_id, {"db": ..., "vcf": ...}]]`` where ``db`` is None when
    no database build was requested and ``vcf`` is None for single-file input.
    """
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    gemini_db = os.path.join(out_dir, "-".join(call_id) + ".db")
    use_gemini = _do_db_build(samples)
    is_population = len(fnames) > 1
    if is_population:
        gemini_vcf = "%s.vcf" % os.path.splitext(gemini_db)[0]
        gemini_vcf = vcfutils.combine_variant_files(fnames, gemini_vcf, data["sam_ref"],
                                                    data["config"])
    else:
        gemini_vcf = fnames[0]
    if use_gemini and not utils.file_exists(gemini_db):
        with file_transaction(gemini_db) as tx_gemini_db:
            gemini = config_utils.get_program("gemini", data["config"])
            num_cores = data["config"]["algorithm"].get("num_cores", 1)
            cmd = "{gemini} load -v {gemini_vcf} -t snpEff --cores {num_cores} {tx_gemini_db}"
            cmd = cmd.format(gemini=gemini, gemini_vcf=gemini_vcf,
                             num_cores=num_cores, tx_gemini_db=tx_gemini_db)
            # Run the load exactly once. The command used to be executed a
            # second time through subprocess.check_call right after do.run,
            # repeating the whole load into the same transactional database.
            do.run(cmd, "Create gemini database for %s" % str(call_id), data)
    return [[call_id, {"db": gemini_db if use_gemini else None,
                       "vcf": gemini_vcf if is_population else None}]]
def variant_filtration(call_file, ref_file, vrn_files, data):
    """Filter variant calls using Variant Quality Score Recalibration.

    Newer GATK with Haplotype calling has combined SNP/indel filtering.
    The call file is first passed through ``ploidy.filter_vcf_by_sex``; the
    remaining work depends on the configured ``variantcaller``.
    """
    caller = data["config"]["algorithm"].get("variantcaller")
    call_file = ploidy.filter_vcf_by_sex(call_file, data)
    if caller in ["freebayes"]:
        return vfilter.freebayes(call_file, ref_file, vrn_files, data)
    # no additional filtration for callers that filter as part of call process
    if caller in ["samtools", "varscan", "mutect"]:
        return call_file
    config = data["config"]
    snp_file, indel_file = vcfutils.split_snps_indels(call_file, ref_file, config)
    filtered_snps = _variant_filtration_snp(snp_file, ref_file, vrn_files, data)
    filtered_indels = _variant_filtration_indel(indel_file, ref_file, vrn_files, data)
    orig_files = [filtered_snps, filtered_indels]
    shared_prefix = os.path.commonprefix(orig_files)
    out_file = "{base}combined.vcf".format(base=shared_prefix)
    return vcfutils.combine_variant_files(orig_files, out_file, ref_file, config)
def run_qsnp(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Run qSNP calling on paired tumor/normal.

    Returns ``out_file`` immediately when it already exists. When cleaned
    regions are available, each one is called into its own file and the
    per-region files are combined into ``out_file``.

    Raises ValueError when the inputs do not include a normal BAM.
    """
    if utils.file_exists(out_file):
        return out_file
    paired = get_paired_bams(align_bams, items)
    if paired.normal_bam:
        region_files = []
        regions = _clean_regions(items, region)
        if regions:
            # NOTE(review): this loop rebinds the ``region`` parameter, so after
            # it finishes ``region`` holds the last entry of ``regions``. The
            # ``if not region`` check below depends on that rebinding; renaming
            # the loop variable would change which branch runs.
            for region in regions:
                # Per-region output name: insert the region string before ".vcf.gz".
                out_region_file = out_file.replace(".vcf.gz", _to_str(region) + ".vcf.gz")
                region_file = _run_qsnp_paired(align_bams, items, ref_file, assoc_files, region, out_region_file)
                region_files.append(region_file)
            out_file = combine_variant_files(region_files, out_file, ref_file, items[0]["config"])
        # Single unrestricted call when ``region`` is falsy at this point.
        if not region:
            out_file = _run_qsnp_paired(align_bams, items, ref_file, assoc_files, region, out_file)
        return out_file
    else:
        raise ValueError("qSNP only works on paired samples")
def test_3_vcf_split_combine(self):
    """Split a VCF file into SNPs and indels, then combine back together.

    Uses ``self.ref_file`` as the reference and removes the generated files
    once the merge has run.
    """
    from bcbio.variation import vcfutils

    with make_workdir() as workdir:
        yaml_file = get_post_process_yaml(self.automated_dir, workdir)
        config = load_config(yaml_file)
        config["algorithm"] = {}
    fname = os.path.join(self.var_dir, "S1-variants.vcf")
    snp_file, indel_file = vcfutils.split_snps_indels(fname, self.ref_file, config)
    base, ext = os.path.splitext(fname)
    merge_file = "%s-merge%s.gz" % (base, ext)
    vcfutils.combine_variant_files([snp_file, indel_file], merge_file,
                                   self.ref_file, config)
    for created in [snp_file, indel_file, merge_file]:
        self._remove_vcf(created)
def run(items):
    """Perform detection of structural variations with delly.

    Runs delly once per SV type through ``run_multicore``, combines the
    per-type VCFs and records the combined file under ``data["sv"]["delly"]``
    for every input item.
    """
    first = items[0]
    work_dir = utils.safe_makedir(os.path.join(first["dirs"]["work"], "structural",
                                               first["name"][-1], "delly"))
    work_bams = [data["align_bam"] for data in items]
    ref_file = utils.get_in(first, ("reference", "fasta", "base"))
    # Add core request for delly
    config = copy.deepcopy(first["config"])
    delly_config = utils.get_in(config, ("resources", "delly"), {})
    delly_config["cores"] = len(items)
    config["resources"]["delly"] = delly_config
    parallel = {"type": "local",
                "cores": config["algorithm"].get("num_cores", 1),
                "progs": ["delly"]}
    delly_args = [(work_bams, sv_type, ref_file, work_dir, items)
                  for sv_type in ["DEL", "DUP", "INV", "TRA"]]
    bytype_vcfs = run_multicore(_run_delly, delly_args, config, parallel)
    out_file = "%s.vcf.gz" % os.path.commonprefix(bytype_vcfs)
    delly_vcf = vcfutils.combine_variant_files(bytype_vcfs, out_file, ref_file,
                                               items[0]["config"])
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = {}
        data["sv"]["delly"] = delly_vcf
        out.append(data)
    return out
def run(call_file, ref_file, vrn_files, data):
    """Run filtering on the input call file, handling SNPs and indels separately.

    With VQSR enabled the file is split, each part is filtered and the parts
    are recombined; otherwise the SNP and indel cutoff filters are chained.

    Raises ValueError when gVCF output is requested without a joint caller
    while VQSR is in use.
    """
    algs = [data["config"]["algorithm"]] * len(data.get("vrn_files", [1]))
    if not config_utils.use_vqsr(algs):
        snp_filter = vfilter.gatk_snp_cutoff(call_file, data)
        return vfilter.gatk_indel_cutoff(snp_filter, data)
    if "gvcf" in dd.get_tools_on(data) and not dd.get_jointcaller(data):
        raise ValueError(
            "Cannot force gVCF output with joint calling using tools_on: [gvcf] and use VQSR. "
            "Try using cutoff-based soft filtering with tools_off: [vqsr]")
    snp_file, indel_file = vcfutils.split_snps_indels(call_file, ref_file, data["config"])
    filtered_snps = _variant_filtration(snp_file, ref_file, vrn_files, data,
                                        "SNP", vfilter.gatk_snp_cutoff)
    filtered_indels = _variant_filtration(indel_file, ref_file, vrn_files, data,
                                          "INDEL", vfilter.gatk_indel_cutoff)
    orig_files = [filtered_snps, filtered_indels]
    out_file = "%scombined.vcf.gz" % os.path.commonprefix(orig_files)
    return vcfutils.combine_variant_files(orig_files, out_file, ref_file, data["config"])
def run(call_file, ref_file, vrn_files, data):
    """Run filtering on the input call file, handling SNPs and indels separately.

    Variants with missing alts are removed first when ``includes_missingalt``
    reports them. Filtering then uses the CNN filter when ``gatkcnn`` is in
    tools_on, VQSR when enabled, and the chained cutoff filters otherwise.

    Raises ValueError when VQSR is selected for a gVCF input.
    """
    algs = [data["config"]["algorithm"]] * len(data.get("vrn_files", [1]))
    if includes_missingalt(data):
        logger.info("Removing variants with missing alts from %s." % call_file)
        call_file = gatk_remove_missingalt(call_file, data)

    if "gatkcnn" in dd.get_tools_on(data):
        return _cnn_filter(call_file, vrn_files, data)

    if not config_utils.use_vqsr(algs, call_file):
        snp_filter = vfilter.gatk_snp_cutoff(call_file, data)
        return vfilter.gatk_indel_cutoff(snp_filter, data)

    if vcfutils.is_gvcf_file(call_file):
        raise ValueError("Cannot force gVCF output with joint calling using tools_on: [gvcf] and use VQSR. "
                         "Try using cutoff-based soft filtering with tools_off: [vqsr]")
    snp_file, indel_file = vcfutils.split_snps_indels(call_file, ref_file, data["config"])
    filtered_snps = _variant_filtration(snp_file, ref_file, vrn_files, data,
                                        "SNP", vfilter.gatk_snp_cutoff)
    filtered_indels = _variant_filtration(indel_file, ref_file, vrn_files, data,
                                          "INDEL", vfilter.gatk_indel_cutoff)
    orig_files = [filtered_snps, filtered_indels]
    out_file = "%scombined.vcf.gz" % os.path.commonprefix(orig_files)
    return vcfutils.combine_variant_files(orig_files, out_file, ref_file, data["config"])
def run(items):
    """Perform detection of structural variations with delly.

    Delly is run for every combination of reference sequence (taken from the
    first BAM) and SV type; the resulting VCFs are combined and the combined
    file is stored under ``data["sv"]["delly"]`` for each item.
    """
    first = items[0]
    work_dir = utils.safe_makedir(os.path.join(first["dirs"]["work"], "structural",
                                               first["name"][-1], "delly"))
    work_bams = [data["align_bam"] for data in items]
    ref_file = utils.get_in(first, ("reference", "fasta", "base"))
    # Add core request for delly
    config = copy.deepcopy(first["config"])
    delly_config = utils.get_in(config, ("resources", "delly"), {})
    delly_config["cores"] = len(items)
    config["resources"]["delly"] = delly_config
    parallel = {"type": "local",
                "cores": config["algorithm"].get("num_cores", 1),
                "progs": ["delly"]}
    # "TRA" has invalid VCF END specifications that GATK doesn't like
    sv_types = ["DEL", "DUP", "INV"]
    with closing(pysam.Samfile(work_bams[0], "rb")) as pysam_work_bam:
        delly_args = [(work_bams, chrom, sv_type, ref_file, work_dir, items)
                      for (chrom, sv_type)
                      in itertools.product(pysam_work_bam.references, sv_types)]
        bytype_vcfs = run_multicore(_run_delly, delly_args, config, parallel)
    out_file = "%s.vcf.gz" % os.path.commonprefix(bytype_vcfs)
    delly_vcf = vcfutils.combine_variant_files(bytype_vcfs, out_file, ref_file,
                                               items[0]["config"])
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = {}
        data["sv"]["delly"] = delly_vcf
        out.append(data)
    return out
def run(call_file, ref_file, vrn_files, data):
    """Run filtering on the input call file, handling SNPs and indels separately.

    For VQSR, need to split the file to apply. For hard filters the SNP filter
    output is fed directly into the indel filter without splitting.
    """
    algorithm = data["config"]["algorithm"]
    algs = [algorithm] * len(data.get("vrn_files", [1]))
    use_vqsr = config_utils.use_vqsr(algs)
    if use_vqsr:
        assert "gvcf" not in dd.get_tools_on(data), \
            ("Cannot force gVCF output and use VQSR. Try using hard filtering with tools_off: [vqsr]")
        snp_file, indel_file = vcfutils.split_snps_indels(call_file, ref_file,
                                                          data["config"])
        orig_files = [
            _variant_filtration(snp_file, ref_file, vrn_files, data,
                                "SNP", vfilter.gatk_snp_hard),
            _variant_filtration(indel_file, ref_file, vrn_files, data,
                                "INDEL", vfilter.gatk_indel_hard),
        ]
        out_file = "%scombined.vcf.gz" % os.path.commonprefix(orig_files)
        combined_file = vcfutils.combine_variant_files(orig_files, out_file, ref_file,
                                                       data["config"])
        return _filter_nonref(combined_file, data)
    snp_filter = vfilter.gatk_snp_hard(call_file, data)
    indel_filter = vfilter.gatk_indel_hard(snp_filter, data)
    if "gvcf" not in dd.get_tools_on(data):
        return _filter_nonref(indel_filter, data)
    return indel_filter
def run(items):
    """Perform detection of structural variations with delly.

    Runs delly once per SV type, combines the per-type VCFs and stores the
    combined file under ``data["sv"]["delly"]`` for each item.
    """
    first = items[0]
    work_dir = utils.safe_makedir(os.path.join(first["dirs"]["work"], "structural",
                                               first["name"][-1], "delly"))
    work_bams = [data["align_bam"] for data in items]
    ref_file = utils.get_in(first, ("reference", "fasta", "base"))
    # Add core request for delly
    config = copy.deepcopy(first["config"])
    delly_config = utils.get_in(config, ("resources", "delly"), {})
    delly_config["cores"] = len(items)
    config["resources"]["delly"] = delly_config
    parallel = {"type": "local",
                "cores": config["algorithm"].get("num_cores", 1),
                "progs": ["delly"]}
    # "TRA" has invalid VCF END specifications that GATK doesn't like
    sv_types = ["DEL", "DUP", "INV"]
    delly_args = [(work_bams, sv_type, ref_file, work_dir, items)
                  for sv_type in sv_types]
    bytype_vcfs = run_multicore(_run_delly, delly_args, config, parallel)
    out_file = "%s.vcf.gz" % os.path.commonprefix(bytype_vcfs)
    delly_vcf = vcfutils.combine_variant_files(bytype_vcfs, out_file, ref_file,
                                               items[0]["config"])
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = {}
        data["sv"]["delly"] = delly_vcf
        out.append(data)
    return out
def get_multisample_vcf(fnames, name, caller, data):
    """Retrieve a multiple sample VCF file in a standard location.

    Combines ``fnames`` into ``gemini/<name>-<caller>.vcf`` under the work
    directory and returns the result of the combine step.
    """
    gemini_dir = os.path.join(data["dirs"]["work"], "gemini")
    out_dir = utils.safe_makedir(gemini_dir)
    gemini_vcf = os.path.join(out_dir, "%s-%s.vcf" % (name, caller))
    combined = vcfutils.combine_variant_files(fnames, gemini_vcf, data["sam_ref"],
                                              data["config"])
    return combined
def run(items):
    """Perform detection of structural variations with delly.

    Performs post-call filtering with a custom filter tuned based on NA12878
    Moleculo and PacBio data, using calls prepared by @ryanlayer and @cc2qe

    Filters using the high quality variant pairs (DV) compared with
    high quality reference pairs (DR).

    Delly runs for every (reference sequence, SV type) combination. The
    per-run VCFs are combined, genotype-filtered, and then split per sample;
    each item gets a ``{"variantcaller": "delly", "vrn_file": ...}`` entry
    appended to its ``sv`` list.

    Returns the list of updated items.
    """
    work_dir = utils.safe_makedir(os.path.join(items[0]["dirs"]["work"], "structural",
                                               items[0]["name"][-1], "delly"))
    work_bams = [data["align_bam"] for data in items]
    ref_file = utils.get_in(items[0], ("reference", "fasta", "base"))
    # Add core request for delly
    config = copy.deepcopy(items[0]["config"])
    delly_config = utils.get_in(config, ("resources", "delly"), {})
    delly_config["cores"] = len(items)
    config["resources"]["delly"] = delly_config
    parallel = {"type": "local",
                "cores": config["algorithm"].get("num_cores", 1),
                "progs": ["delly"]}
    # "TRA" has invalid VCF END specifications that GATK doesn't like
    sv_types = ["DEL", "DUP", "INV"]
    with closing(pysam.Samfile(work_bams[0], "rb")) as pysam_work_bam:
        bytype_vcfs = run_multicore(
            _run_delly,
            [(work_bams, chrom, sv_type, ref_file, work_dir, items)
             for (chrom, sv_type)
             in itertools.product(pysam_work_bam.references, sv_types)],
            config, parallel)
    out_file = "%s.vcf.gz" % os.path.commonprefix(bytype_vcfs)
    combo_vcf = vcfutils.combine_variant_files(bytype_vcfs, out_file, ref_file,
                                               items[0]["config"])
    # The filter needs a sample dictionary before the per-item loop below has
    # bound ``data``. The old code only worked through the Python 2
    # comprehension-variable leak, which left ``data`` as the last item, and
    # fails on Python 3. Pass that same item explicitly.
    delly_vcf = vfilter.genotype_filter(combo_vcf, 'DV / (DV + DR) > 0.35 && DV > 4',
                                        items[-1], "DVSupport")
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        base, ext = utils.splitext_plus(delly_vcf)
        sample = tz.get_in(["rgnames", "sample"], data)
        delly_sample_vcf = "%s-%s%s" % (base, sample, ext)
        data["sv"].append({"variantcaller": "delly",
                           "vrn_file": vcfutils.select_sample(delly_vcf, sample,
                                                              delly_sample_vcf,
                                                              data["config"])})
        out.append(data)
    return out
def run(items):
    """Perform detection of structural variations with delly.

    Performs post-call filtering with a custom filter tuned based on NA12878
    Moleculo and PacBio data, using calls prepared by @ryanlayer and @cc2qe

    Filters using the high quality variant pairs (DV) compared with
    high quality reference pairs (DR).

    Subsampled BAMs are prepared first, delly runs per (chromosome, SV type),
    and each item receives a per-sample, evidence-filtered VCF in its ``sv`` list.
    """
    first = items[0]
    work_dir = utils.safe_makedir(os.path.join(first["dirs"]["work"], "structural",
                                               first["name"][-1], "delly"))
    # Add core request for delly
    config = copy.deepcopy(first["config"])
    delly_config = utils.get_in(config, ("resources", "delly"), {})
    delly_config["cores"] = 1
    config["resources"]["delly"] = delly_config
    parallel = {"type": "local",
                "cores": config["algorithm"].get("num_cores", 1),
                "progs": ["delly"]}
    prep_args = [(data, work_dir) for data in items]
    work_bams = run_multicore(_prep_subsampled_bams, prep_args, config, parallel)
    ref_file = utils.get_in(items[0], ("reference", "fasta", "base"))
    # "TRA" has invalid VCF END specifications that GATK doesn't like, "INV" very slow
    sv_types = ["DEL", "DUP"]
    exclude_file = _get_full_exclude_file(items, work_dir)
    delly_args = [(work_bams, chrom, sv_type, ref_file, work_dir, items)
                  for (chrom, sv_type)
                  in itertools.product(sshared.get_sv_chroms(items, exclude_file), sv_types)]
    bytype_vcfs = run_multicore(_run_delly, delly_args, config, parallel)
    out_file = "%s.vcf.gz" % sshared.outname_from_inputs(bytype_vcfs)
    combo_vcf = vcfutils.combine_variant_files(bytype_vcfs, out_file, ref_file, config)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        base, ext = utils.splitext_plus(combo_vcf)
        sample = tz.get_in(["rgnames", "sample"], data)
        sample_out = "%s-%s%s" % (base, sample, ext)
        delly_sample_vcf = vcfutils.select_sample(combo_vcf, sample, sample_out,
                                                  data["config"])
        delly_vcf = _delly_count_evidence_filter(delly_sample_vcf, data)
        data["sv"].append({"variantcaller": "delly",
                           "vrn_file": delly_vcf,
                           "exclude": exclude_file})
        out.append(data)
    return out
def run(items):
    """Perform detection of structural variations with lumpy, using bwa-mem alignment.

    Lumpy is run once over all samples. Each sample is then selected from the
    joint VCF, split into standard calls and breakends, genotyped with svtyper
    on the standard calls, recombined and filtered by support. For paired
    inputs with a normal sample, calls are additionally filtered against the
    background. Every item gets a lumpy entry appended to its ``sv`` list.

    Raises ValueError when any item is configured with an aligner other than
    bwa (or an unset/False aligner).
    """
    allowed_aligners = ["bwa", False, None]
    if any(utils.get_in(data, ("config", "algorithm", "aligner")) not in allowed_aligners
           for data in items):
        raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    work_dir = _sv_workdir(paired.tumor_data if paired and paired.tumor_data else items[0])
    full_bams = []
    sr_bams = []
    disc_bams = []
    for data in items:
        dedup_bam, sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir)
        full_bams.append(dedup_bam)
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, work_dir, items)
    gt_vcfs = {}
    for data in items:
        sample = dd.get_sample_name(data)
        dedup_bam, sr_bam, _ = sshared.get_split_discordants(data, work_dir)
        sample_out = utils.append_stem(lumpy_vcf, "-%s" % sample)
        sample_vcf = vcfutils.select_sample(lumpy_vcf, sample, sample_out, data["config"])
        std_vcf, bnd_vcf = _split_breakends(sample_vcf, data)
        std_gt_vcf = _run_svtyper(std_vcf, dedup_bam, sr_bam, exclude_file, data)
        combined_out = "%s-combined.vcf.gz" % utils.splitext_plus(std_gt_vcf)[0]
        gt_vcf = vcfutils.combine_variant_files(orig_files=[std_gt_vcf, bnd_vcf],
                                                out_file=combined_out,
                                                ref_file=dd.get_ref_file(data),
                                                config=data["config"])
        gt_vcfs[dd.get_sample_name(data)] = _filter_by_support(gt_vcf, data)
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background([paired.tumor_name], [paired.normal_name],
                                        gt_vcfs, paired.tumor_data)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        vcf_file = gt_vcfs[dd.get_sample_name(data)]
        effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
        data["sv"].append({"variantcaller": "lumpy",
                           "vrn_file": effects_vcf or vcf_file,
                           "exclude_file": exclude_file})
        out.append(data)
    return out
def run(call_file, ref_file, vrn_files, data):
    """Run filtering on the input call file, handling SNPs and indels separately.

    The call file is split into SNP and indel files, each is filtered with its
    hard-filter fallback, and the two results are combined into a single VCF
    named from their common path prefix.
    """
    config = data["config"]
    snp_file, indel_file = vcfutils.split_snps_indels(call_file, ref_file, config)
    filtered_snps = _variant_filtration(snp_file, ref_file, vrn_files, data,
                                        "SNP", vfilter.gatk_snp_hard)
    filtered_indels = _variant_filtration(indel_file, ref_file, vrn_files, data,
                                          "INDEL", vfilter.gatk_indel_hard)
    orig_files = [filtered_snps, filtered_indels]
    shared_prefix = os.path.commonprefix(orig_files)
    out_file = "%scombined.vcf.gz" % shared_prefix
    return vcfutils.combine_variant_files(orig_files, out_file, ref_file,
                                          data["config"])
def run(call_file, ref_file, vrn_files, data):
    """Run filtering on the input call file, handling SNPs and indels separately.

    SNPs and indels are split, filtered independently and recombined; the
    combined file is then passed through ``_filter_nonref``.
    """
    config = data["config"]
    snp_file, indel_file = vcfutils.split_snps_indels(call_file, ref_file, config)
    filtered_snps = _variant_filtration(snp_file, ref_file, vrn_files, data,
                                        "SNP", vfilter.gatk_snp_hard)
    filtered_indels = _variant_filtration(indel_file, ref_file, vrn_files, data,
                                          "INDEL", vfilter.gatk_indel_hard)
    orig_files = [filtered_snps, filtered_indels]
    shared_prefix = os.path.commonprefix(orig_files)
    out_file = "%scombined.vcf.gz" % shared_prefix
    combined_file = vcfutils.combine_variant_files(orig_files, out_file, ref_file,
                                                   data["config"])
    return _filter_nonref(combined_file, data)
def run(items):
    """Perform detection of structural variations with delly.

    Performs post-call filtering with a custom filter tuned based on NA12878
    Moleculo and PacBio data, using calls prepared by @ryanlayer and @cc2qe

    Filters using the high quality variant pairs (DV) compared with
    high quality reference pairs (DR).

    Delly runs per chromosome; the combined VCF is finalized per item and,
    when a finalized file is produced, evidence-filtered and appended to the
    item's ``sv`` list. Only the first item sharing a finalized file is
    flagged for upload.
    """
    first = items[0]
    work_dir = utils.safe_makedir(os.path.join(first["dirs"]["work"], "structural",
                                               dd.get_sample_name(first), "delly"))
    # Add core request for delly
    config = copy.deepcopy(first["config"])
    delly_config = utils.get_in(config, ("resources", "delly"), {})
    delly_config["cores"] = 1
    config["resources"]["delly"] = delly_config
    parallel = {"type": "local",
                "cores": config["algorithm"].get("num_cores", 1),
                "progs": ["delly"]}
    work_bams = [dd.get_align_bam(d) for d in items]
    ref_file = dd.get_ref_file(items[0])
    exclude_file = _get_full_exclude_file(items, work_bams, work_dir)
    delly_args = [(work_bams, chrom, ref_file, work_dir, items)
                  for chrom in sshared.get_sv_chroms(items, exclude_file)]
    bytype_vcfs = run_multicore(_run_delly, delly_args, config, parallel)
    out_file = "%s.vcf.gz" % sshared.outname_from_inputs(bytype_vcfs)
    combo_vcf = vcfutils.combine_variant_files(bytype_vcfs, out_file, ref_file, config)
    out = []
    upload_counts = collections.defaultdict(int)
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        base, ext = utils.splitext_plus(combo_vcf)
        final_vcf = sshared.finalize_sv(combo_vcf, data, items)
        if final_vcf:
            delly_vcf = _delly_count_evidence_filter(final_vcf, data)
            # only upload a single file per batch
            is_first_upload = upload_counts[final_vcf] == 0
            data["sv"].append({"variantcaller": "delly",
                               "vrn_file": delly_vcf,
                               "do_upload": is_first_upload,
                               "exclude": exclude_file})
            upload_counts[final_vcf] += 1
        out.append(data)
    return out
def mutect_caller(align_bams, items, ref_file, assoc_files, region=None,
                  out_file=None):
    """Run the MuTect paired analysis algorithm.

    When the runner reports an "appistry" MuTect version, SNVs are written to
    a ``-mutect`` sibling file, indels are called into a ``-somaticIndels``
    file, and both are combined into ``out_file``. Otherwise MuTect writes
    directly to ``out_file``.

    When ``out_file`` is None it defaults to a ``-paired-variants.vcf.gz`` name
    derived from the first BAM.

    Returns the path of the final VCF. This includes the case where an empty
    VCF is written because a BAM has no aligned reads in ``region``.
    """
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        base_config = items[0]["config"]
        broad_runner = broad.runner_from_config(base_config, "mutect")
        if "appistry" in broad_runner.get_mutect_version():
            out_file_mutect = (out_file.replace(".vcf", "-mutect.vcf")
                               if "vcf" in out_file else out_file + "-mutect.vcf")
        else:
            out_file_mutect = out_file
        broad_runner, params = \
            _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                              region, out_file_mutect)
        if (not isinstance(region, (list, tuple)) and
                not all(has_aligned_reads(x, region) for x in align_bams)):
            vcfutils.write_empty_vcf(out_file)
            # Return the path of the empty VCF so every exit of this function
            # yields the output file rather than None.
            return out_file
        with file_transaction(out_file_mutect) as tx_out_file:
            # Rationale: MuTect writes another table to stdout, which we don't need
            params += ["--vcf", tx_out_file, "-o", os.devnull]
            broad_runner.run_mutect(params)
        if "appistry" in broad_runner.get_mutect_version():
            # SomaticIndelDetector modifications
            out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                               if "vcf" in out_file else out_file + "-somaticIndels.vcf")
            params_indels = _SID_call_prep(align_bams, items, ref_file, assoc_files,
                                           region, out_file_indels)
            with file_transaction(out_file_indels) as tx_out_file:
                params_indels += ["-o", tx_out_file]
                broad_runner.run_mutect(params_indels)
            out_file = vcfutils.combine_variant_files(
                orig_files=[out_file_mutect, out_file_indels],
                out_file=out_file,
                ref_file=items[0]["sam_ref"],
                config=items[0]["config"],
                region=region)
    return out_file
def run(items):
    """Perform detection of structural variations with delly.

    Delly is called in turn for each SV type on the items' ``work_bam`` files;
    the per-type VCFs are combined and the combined file is stored under
    ``data["sv"]["delly"]`` for every item.
    """
    first = items[0]
    work_dir = utils.safe_makedir(os.path.join(first["dirs"]["work"], "structural",
                                               first["name"][-1], "delly"))
    work_bams = [data["work_bam"] for data in items]
    ref_file = utils.get_in(first, ("reference", "fasta", "base"))
    bytype_vcfs = []
    for sv_type in ["DEL", "DUP", "INV", "TRA"]:
        bytype_vcfs.append(_run_delly(work_bams, sv_type, ref_file, work_dir))
    out_file = "%s.vcf.gz" % os.path.commonprefix(bytype_vcfs)
    delly_vcf = vcfutils.combine_variant_files(bytype_vcfs, out_file, ref_file,
                                               items[0]["config"])
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = {}
        data["sv"]["delly"] = delly_vcf
        out.append(data)
    return out
def run(items):
    """Perform detection of structural variations with delly.

    Performs post-call filtering with a custom filter tuned based on NA12878
    Moleculo and PacBio data, using calls prepared by @ryanlayer and @cc2qe

    Filters using the high quality variant pairs (DV) compared with
    high quality reference pairs (DR).

    Works on subsampled BAMs, runs delly per (chromosome, SV type), combines
    the results and appends a per-sample filtered VCF to each item's ``sv`` list.
    """
    lead = items[0]
    delly_dir = os.path.join(lead["dirs"]["work"], "structural", lead["name"][-1], "delly")
    work_dir = utils.safe_makedir(delly_dir)
    # Add core request for delly
    config = copy.deepcopy(lead["config"])
    delly_config = utils.get_in(config, ("resources", "delly"), {})
    delly_config["cores"] = 1
    config["resources"]["delly"] = delly_config
    num_cores = config["algorithm"].get("num_cores", 1)
    parallel = {"type": "local", "cores": num_cores, "progs": ["delly"]}
    work_bams = run_multicore(_prep_subsampled_bams,
                              [(data, work_dir) for data in items],
                              config, parallel)
    ref_file = utils.get_in(items[0], ("reference", "fasta", "base"))
    # "TRA" has invalid VCF END specifications that GATK doesn't like, "INV" very slow
    sv_types = ["DEL", "DUP"]
    exclude_file = _get_full_exclude_file(items, work_dir)
    chrom_by_type = itertools.product(sshared.get_sv_chroms(items, exclude_file), sv_types)
    bytype_vcfs = run_multicore(_run_delly,
                                [(work_bams, chrom, sv_type, ref_file, work_dir, items)
                                 for (chrom, sv_type) in chrom_by_type],
                                config, parallel)
    out_file = "%s.vcf.gz" % sshared.outname_from_inputs(bytype_vcfs)
    combo_vcf = vcfutils.combine_variant_files(bytype_vcfs, out_file, ref_file, config)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        base, ext = utils.splitext_plus(combo_vcf)
        sample = tz.get_in(["rgnames", "sample"], data)
        delly_sample_vcf = vcfutils.select_sample(combo_vcf, sample,
                                                  "%s-%s%s" % (base, sample, ext),
                                                  data["config"])
        delly_vcf = _delly_count_evidence_filter(delly_sample_vcf, data)
        sv_entry = {"variantcaller": "delly",
                    "vrn_file": delly_vcf,
                    "exclude": exclude_file}
        data["sv"].append(sv_entry)
        out.append(data)
    return out
def variant_filtration(call_file, ref_file, vrn_files, config):
    """Filter variant calls, dispatching on the configured variant caller.

    - ``freebayes``: handed to ``vfilter.freebayes``.
    - ``samtools`` / ``varscan``: returned untouched, since these callers
      filter as part of the calling process.
    - anything else: SNPs and indels are split, filtered separately and
      recombined into a single ``...combined.vcf`` file.

    Returns the path to the filtered call file.
    """
    caller = config["algorithm"].get("variantcaller")
    if caller == "freebayes":
        return vfilter.freebayes(call_file, ref_file, vrn_files, config)
    if caller in ("samtools", "varscan"):
        # no additional filtration for callers that filter as part of call process
        return call_file

    snp_file, indel_file = vcfutils.split_snps_indels(call_file, ref_file, config)
    filtered = [
        _variant_filtration_snp(snp_file, ref_file, vrn_files, config),
        _variant_filtration_indel(indel_file, ref_file, vrn_files, config),
    ]
    combined_name = "{base}combined.vcf".format(base=os.path.commonprefix(filtered))
    return vcfutils.combine_variant_files(filtered, combined_name, ref_file, config)
def run(items):
    """Perform detection of structural variations with delly.

    Performs post-call filtering with a custom filter tuned based on NA12878
    Moleculo and PacBio data, using calls prepared by @ryanlayer and @cc2qe

    Filters using the high quality variant pairs (DV) compared with high quality
    reference pairs (DR).

    Returns the list of items, each with a delly entry appended to ``data["sv"]``.
    """
    work_dir = utils.safe_makedir(os.path.join(items[0]["dirs"]["work"], "structural",
                                               items[0]["name"][-1], "delly"))
    work_bams = [data["align_bam"] for data in items]
    ref_file = utils.get_in(items[0], ("reference", "fasta", "base"))
    # Add core request for delly. Work on a copy so the caller's config is untouched.
    config = copy.deepcopy(items[0]["config"])
    delly_config = utils.get_in(config, ("resources", "delly"), {})
    delly_config["cores"] = len(items)
    # The lookup above tolerates a missing "resources" section; so must the write.
    config.setdefault("resources", {})["delly"] = delly_config
    parallel = {"type": "local", "cores": config["algorithm"].get("num_cores", 1),
                "progs": ["delly"]}
    sv_types = ["DEL", "DUP", "INV"]  # "TRA" has invalid VCF END specifications that GATK doesn't like
    # Chromosome names are taken from the header of the first BAM.
    with closing(pysam.Samfile(work_bams[0], "rb")) as pysam_work_bam:
        bytype_vcfs = run_multicore(_run_delly,
                                    [(work_bams, chrom, sv_type, ref_file, work_dir, items)
                                     for (chrom, sv_type)
                                     in itertools.product(pysam_work_bam.references, sv_types)],
                                    config, parallel)
    out_file = "%s.vcf.gz" % os.path.commonprefix(bytype_vcfs)
    combo_vcf = vcfutils.combine_variant_files(bytype_vcfs, out_file, ref_file, items[0]["config"])
    # The combined file name does not depend on the sample: split it once.
    base, ext = utils.splitext_plus(combo_vcf)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        sample = tz.get_in(["rgnames", "sample"], data)
        delly_sample_vcf = vcfutils.select_sample(combo_vcf, sample,
                                                  "%s-%s%s" % (base, sample, ext),
                                                  data["config"])
        # Flag calls with fewer than 4 variant pairs or a variant pair fraction below 0.2.
        delly_vcf = vfilter.hard_w_expression(delly_sample_vcf,
                                              "FMT/DV < 4 || (FMT/DV / (FMT/DV + FMT/DR)) < 0.2",
                                              data, name="DVSupport")
        data["sv"].append({"variantcaller": "delly", "vrn_file": delly_vcf})
        out.append(data)
    return out
def run(items):
    """Perform detection of structural variations with delly.

    Performs post-call filtering with a custom filter tuned based on NA12878
    Moleculo and PacBio data, using calls prepared by @ryanlayer and @cc2qe

    Filters using the high quality variant pairs (DV) compared with high quality
    reference pairs (DR).

    Returns the list of items; items for which ``sshared.finalize_sv`` yields a
    file get a delly entry appended to ``data["sv"]``.
    """
    work_dir = utils.safe_makedir(os.path.join(items[0]["dirs"]["work"], "structural",
                                               dd.get_sample_name(items[0]), "delly"))
    # Add core request for delly. Work on a copy so the caller's config is untouched.
    config = copy.deepcopy(items[0]["config"])
    delly_config = utils.get_in(config, ("resources", "delly"), {})
    delly_config["cores"] = 1
    # The lookup above tolerates a missing "resources" section; so must the write.
    config.setdefault("resources", {})["delly"] = delly_config
    parallel = {"type": "local", "cores": config["algorithm"].get("num_cores", 1),
                "progs": ["delly"]}
    work_bams = [dd.get_align_bam(d) for d in items]
    ref_file = dd.get_ref_file(items[0])
    exclude_file = _get_full_exclude_file(items, work_bams, work_dir)
    # One delly job per chromosome.
    bytype_vcfs = run_multicore(_run_delly,
                                [(work_bams, chrom, ref_file, work_dir, items)
                                 for chrom in sshared.get_sv_chroms(items, exclude_file)],
                                config, parallel)
    out_file = "%s.vcf.gz" % sshared.outname_from_inputs(bytype_vcfs)
    combo_vcf = vcfutils.combine_variant_files(bytype_vcfs, out_file, ref_file, config)
    out = []
    # Number of entries already created for each finalized VCF.
    upload_counts = collections.defaultdict(int)
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        final_vcf = sshared.finalize_sv(combo_vcf, data, items)
        if final_vcf:
            delly_vcf = _delly_count_evidence_filter(final_vcf, data)
            data["sv"].append({"variantcaller": "delly", "vrn_file": delly_vcf,
                               # only upload a single file per batch
                               "do_upload": upload_counts[final_vcf] == 0,
                               "exclude": exclude_file})
            upload_counts[final_vcf] += 1
        out.append(data)
    return out
def run_qsnp(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Run qSNP calling on paired tumor/normal.

    When ``_clean_regions`` yields regions, qSNP runs once per region into a
    region-specific file and the results are combined into ``out_file``.
    Otherwise, if no ``region`` was requested, qSNP runs once for the whole
    input.

    Returns the path to the output file (returned immediately if it already
    exists).

    Raises:
        ValueError: if no normal BAM is available for the pairing.
    """
    if utils.file_exists(out_file):
        return out_file
    paired = get_paired_bams(align_bams, items)
    if not paired.normal_bam:
        raise ValueError("qSNP only works on paired samples")

    regions = _clean_regions(items, region)
    if regions:
        region_files = []
        # Use a dedicated loop name: rebinding `region` here used to leak into
        # the whole-input check below.
        for cur_region in regions:
            out_region_file = out_file.replace(".vcf.gz", _to_str(cur_region) + ".vcf.gz")
            region_files.append(_run_qsnp_paired(align_bams, items, ref_file,
                                                 assoc_files, cur_region, out_region_file))
        out_file = combine_variant_files(region_files, out_file, ref_file, items[0]["config"])
    elif not region:
        out_file = _run_qsnp_paired(align_bams, items, ref_file,
                                    assoc_files, region, out_file)
    return out_file
def run(items):
    """Perform detection of structural variations with lumpy, using bwa-mem alignment.

    Collects full, split-read and discordant BAMs for every item, runs lumpy
    jointly, then per sample: selects the sample's calls, genotypes the
    non-breakend calls with svtyper, recombines them with the breakends and
    filters by support. With a tumor/normal pair the calls are additionally
    filtered against the normal. Each item gets a lumpy entry appended to
    ``data["sv"]``.

    Raises:
        ValueError: if any item is configured with an aligner other than bwa.
    """
    allowed_aligners = ["bwa", False, None]
    if any(utils.get_in(item, ("config", "algorithm", "aligner")) not in allowed_aligners
           for item in items):
        raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")

    paired = vcfutils.get_paired_bams([item["align_bam"] for item in items], items)
    if paired and paired.tumor_data:
        work_dir = _sv_workdir(paired.tumor_data)
    else:
        work_dir = _sv_workdir(items[0])

    full_bams = []
    sr_bams = []
    disc_bams = []
    for item in items:
        full_bam, split_bam, discordant_bam = sshared.get_split_discordants(item, work_dir)
        full_bams.append(full_bam)
        sr_bams.append(split_bam)
        disc_bams.append(discordant_bam)
    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, work_dir, items)

    gt_vcfs = {}
    for item in items:
        sample = dd.get_sample_name(item)
        full_bam, split_bam, _ = sshared.get_split_discordants(item, work_dir)
        sample_name_vcf = utils.append_stem(lumpy_vcf, "-%s" % sample)
        sample_vcf = vcfutils.select_sample(lumpy_vcf, sample, sample_name_vcf, item["config"])
        # Breakends are held aside while the remaining calls are genotyped.
        std_vcf, bnd_vcf = _split_breakends(sample_vcf, item)
        std_gt_vcf = _run_svtyper(std_vcf, full_bam, split_bam, exclude_file, item)
        combined_name = "%s-combined.vcf.gz" % utils.splitext_plus(std_gt_vcf)[0]
        gt_vcf = vcfutils.combine_variant_files(orig_files=[std_gt_vcf, bnd_vcf],
                                                out_file=combined_name,
                                                ref_file=dd.get_ref_file(item),
                                                config=item["config"])
        gt_vcfs[dd.get_sample_name(item)] = _filter_by_support(gt_vcf, item)

    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background([paired.tumor_name], [paired.normal_name],
                                        gt_vcfs, paired.tumor_data)

    out = []
    for item in items:
        if "sv" not in item:
            item["sv"] = []
        vcf_file = gt_vcfs[dd.get_sample_name(item)]
        effects_vcf, _ = effects.add_to_vcf(vcf_file, item, "snpeff")
        # Fall back to the un-annotated file when no effects file was produced.
        entry = {"variantcaller": "lumpy",
                 "vrn_file": effects_vcf or vcf_file,
                 "exclude_file": exclude_file}
        item["sv"].append(entry)
        out.append(item)
    return out
def mutect_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Run the MuTect paired analysis algorithm.

    With an "appistry" MuTect build, SNVs are written to a separate
    ``-mutect`` file, indels are called with SomaticIndelDetector and both are
    combined into ``out_file``. Otherwise MuTect writes ``out_file`` directly.

    Returns the path to the output VCF on every path, including when an empty
    VCF is written because a BAM has no aligned reads in ``region``.
    """
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        base_config = items[0]["config"]
        broad_runner = broad.runner_from_config(base_config, "mutect")
        if "appistry" in broad_runner.get_mutect_version():
            out_file_mutect = (out_file.replace(".vcf", "-mutect.vcf")
                               if "vcf" in out_file else out_file + "-mutect.vcf")
        else:
            out_file_mutect = out_file
        broad_runner, params = \
            _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                              region, out_file_mutect)
        if (not isinstance(region, (list, tuple)) and
                not all(has_aligned_reads(x, region) for x in align_bams)):
            vcfutils.write_empty_vcf(out_file)
            # Return the path like every other exit, rather than None.
            return out_file
        with file_transaction(out_file_mutect) as tx_out_file:
            # Rationale: MuTect writes another table to stdout, which we don't need
            params += ["--vcf", tx_out_file, "-o", os.devnull]
            broad_runner.run_mutect(params)
        if "appistry" in broad_runner.get_mutect_version():
            # SomaticIndelDetector modifications
            out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                               if "vcf" in out_file else out_file + "-somaticIndels.vcf")
            params_indels = _SID_call_prep(align_bams, items, ref_file, assoc_files,
                                           region, out_file_indels)
            with file_transaction(out_file_indels) as tx_out_file:
                params_indels += ["-o", tx_out_file]
                broad_runner.run_mutect(params_indels)
            out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels],
                                                      out_file=out_file,
                                                      ref_file=items[0]["sam_ref"],
                                                      config=items[0]["config"],
                                                      region=region)
    return out_file
def prep_gemini_db(fnames, call_id, samples, data):
    """Prepare a gemini database from VCF inputs prepared with snpEff.

    Multiple input files are first combined into a single population VCF
    named after the database. The database is loaded only when
    ``_do_db_build(samples)`` allows it and it does not exist yet.

    Returns ``[[call_id, {"db": ..., "vcf": ...}]]`` where ``db`` is ``None``
    when no database is built and ``vcf`` is ``None`` for a single input file.
    """
    config = data["config"]
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    gemini_db = os.path.join(out_dir, "-".join(call_id) + ".db")
    use_gemini = _do_db_build(samples)
    is_population = len(fnames) > 1

    gemini_vcf = fnames[0]
    if is_population:
        combined_name = "%s.vcf" % os.path.splitext(gemini_db)[0]
        gemini_vcf = vcfutils.combine_variant_files(fnames, combined_name,
                                                    data["sam_ref"], config)

    needs_load = use_gemini and not utils.file_exists(gemini_db)
    if needs_load:
        with file_transaction(gemini_db) as tx_gemini_db:
            load_cmd = "{gemini} load -v {gemini_vcf} -t snpEff --cores {num_cores} {tx_gemini_db}".format(
                gemini=config_utils.get_program("gemini", config),
                gemini_vcf=gemini_vcf,
                num_cores=config["algorithm"].get("num_cores", 1),
                tx_gemini_db=tx_gemini_db)
            do.run(load_cmd, "Create gemini database for %s" % str(call_id), data)

    summary = {"db": gemini_db if use_gemini else None,
               "vcf": gemini_vcf if is_population else None}
    return [[call_id, summary]]
def combine_variant_files(*args, **kwargs):
    """Forward to ``vcfutils.combine_variant_files``.

    Both positional and keyword arguments are passed through unchanged, so
    callers may use keywords such as ``region=...``.
    """
    return vcfutils.combine_variant_files(*args, **kwargs)
def combine_variant_files(*args, **kwargs):
    """Forward to ``vcfutils.combine_variant_files``.

    Both positional and keyword arguments are passed through unchanged, so
    callers may use keywords such as ``region=...``.
    """
    return vcfutils.combine_variant_files(*args, **kwargs)
def _varscan_paired(align_bams, ref_file, items, target_regions, out_file):
    """Run a paired VarScan analysis, also known as "somatic".

    Writes one mpileup per sample (normal, tumor), runs ``VarScan somatic`` on
    them, cleans the SNP and indel VCFs and merges the ones that were produced
    into ``out_file``. Intermediate files are removed afterwards. An empty VCF
    is written when an mpileup is empty or no VarScan output is found.

    Raises:
        IOError: if the installed VarScan is older than 2.3.6.
        ValueError: if the batch has no normal BAM.
    """
    max_read_depth = "1000"
    config = items[0]["config"]

    version = programs.jar_versioner("varscan", "VarScan")(config)
    if LooseVersion(version) < LooseVersion("v2.3.6"):
        raise IOError(
            "Please install version 2.3.6 or better of VarScan with support "
            "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan",
        config_utils.get_program("varscan", config, "dir"))

    paired = get_paired_bams(align_bams, items)
    if not paired.normal_bam:
        affected_batch = items[0]["metadata"]["batch"]
        message = ("Batch {} requires both tumor and normal BAM files for"
                   " VarScan cancer calling").format(affected_batch)
        raise ValueError(message)

    if not file_exists(out_file):
        # Work on an uncompressed VCF; compress at the end if requested.
        orig_out_file = out_file
        out_file = orig_out_file.replace(".vcf.gz", ".vcf")
        base, ext = utils.splitext_plus(out_file)
        cleanup_files = []
        for fname, mpext in [(paired.normal_bam, "normal"),
                             (paired.tumor_bam, "tumor")]:
            mpfile = "%s-%s.mpileup" % (base, mpext)
            cleanup_files.append(mpfile)
            with file_transaction(config, mpfile) as mpfile_tx:
                mpileup = samtools.prep_mpileup([fname], ref_file,
                                                config, max_read_depth,
                                                target_regions=target_regions,
                                                want_bcf=False)
                # Command templates are filled from local variable names.
                cmd = "{mpileup} > {mpfile_tx}"
                cmd = cmd.format(**locals())
                do.run(cmd, "samtools mpileup", None,
                       [do.file_exists(mpfile_tx)])

        # Sometimes mpileup writes an empty file: in this case we
        # just skip the rest of the analysis (VarScan will hang otherwise)
        if any(os.stat(filename).st_size == 0 for filename in cleanup_files):
            write_empty_vcf(orig_out_file, config)
            return

        # First index is normal, second is tumor
        normal_tmp_mpileup = cleanup_files[0]
        tumor_tmp_mpileup = cleanup_files[1]

        indel_file = base + ".indel.vcf"
        snp_file = base + ".snp.vcf"
        cleanup_files.append(indel_file)
        cleanup_files.append(snp_file)
        with file_transaction(config, indel_file, snp_file) as (tx_indel, tx_snp):
            with tx_tmpdir(items[0]) as tmp_dir:
                jvm_opts = _get_varscan_opts(config, tmp_dir)
                fix_ambig = vcfutils.fix_ambiguous_cl()
                # VarScan writes "<prefix>.vcf"; the fix step below turns that
                # into the transactional output files.
                tx_snp_in = "%s-orig" % os.path.splitext(tx_snp)[0]
                tx_indel_in = "%s-orig" % os.path.splitext(tx_indel)[0]
                varscan_cmd = (
                    "java {jvm_opts} -jar {varscan_jar} somatic"
                    " {normal_tmp_mpileup} {tumor_tmp_mpileup} "
                    "--output-snp {tx_snp_in} --output-indel {tx_indel_in} "
                    " --output-vcf --min-coverage 5 --p-value 0.98 "
                    "--strand-filter 1 ")
                # add minimum AF (configured as a percentage, default 10)
                if "--min-var-freq" not in varscan_cmd:
                    min_af = float(
                        utils.get_in(paired.tumor_config,
                                     ("algorithm", "min_allele_fraction"),
                                     10)) / 100.0
                    varscan_cmd += "--min-var-freq {min_af} "
                do.run(varscan_cmd.format(**locals()), "Varscan", None, None)
                for orig_fname, fname in [(tx_snp_in, tx_snp),
                                          (tx_indel_in, tx_indel)]:
                    cmd = "vcfuniqalleles {orig_fname}.vcf | {fix_ambig} > {fname}"
                    do.run(cmd.format(**locals()), "Varscan paired fix")

        # VarScan files need to be corrected to match the VCF specification
        # We do this before combining them otherwise merging may fail
        # if there are invalid records
        # NOTE(review): do.file_exists is also passed above as a deferred check
        # to do.run; confirm it returns a plain boolean when used here.
        to_combine = []
        for vcf_file in [snp_file, indel_file]:
            if do.file_exists(vcf_file):
                to_combine.append(vcf_file)
                _fix_varscan_vcf(vcf_file, paired.normal_name,
                                 paired.tumor_name, config)

        if not to_combine:
            write_empty_vcf(orig_out_file, config)
            return

        # Merge only the files that were actually found.
        out_file = combine_variant_files(to_combine, out_file, ref_file,
                                         config, region=target_regions)

        # Remove cleanup files
        for extra_file in cleanup_files:
            for suffix in ["", ".gz", ".gz.tbi"]:
                if os.path.exists(extra_file + suffix):
                    os.remove(extra_file + suffix)

        if os.path.getsize(out_file) == 0:
            write_empty_vcf(out_file)

        if orig_out_file.endswith(".gz"):
            out_file = bgzip_and_index(out_file, config)

        _add_reject_flag(out_file, config)
def mutect_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Run the MuTect paired analysis algorithm.

    MuTect output is written to an ``-orig`` file, fixed up into the
    ``-mutect`` file and then, depending on the configured indel caller
    (scalpel, pindel or SomaticIndelDetector), combined with indel calls into
    ``out_file``. When no indel caller applies, ``out_file`` is a link to the
    MuTect calls.

    Returns the path to the output VCF on every path, including when an empty
    VCF is written because a BAM has no aligned reads in ``region``.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        broad_runner = broad.runner_from_config(config, "mutect")
        out_file_mutect = (out_file.replace(".vcf", "-mutect.vcf")
                           if "vcf" in out_file else out_file + "-mutect.vcf")
        broad_runner, params = \
            _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                              region, out_file_mutect)
        if (not isinstance(region, (list, tuple)) and
                not all(has_aligned_reads(x, region) for x in align_bams)):
            vcfutils.write_empty_vcf(out_file)
            # Return the path like every other exit, rather than None.
            return out_file
        out_file_orig = "%s-orig%s" % utils.splitext_plus(out_file_mutect)
        if not file_exists(out_file_orig):
            with file_transaction(config, out_file_orig) as tx_out_file:
                # Rationale: MuTect writes another table to stdout, which we don't need
                params += ["--vcf", tx_out_file, "-o", os.devnull]
                broad_runner.run_mutect(params)
        is_paired = "-I:normal" in params
        if not utils.file_uptodate(out_file_mutect, out_file_orig):
            out_file_mutect = _fix_mutect_output(out_file_orig, config, out_file_mutect, is_paired)
        indelcaller = vcfutils.get_indelcaller(config).lower()
        # Every indel caller writes to the same companion file name.
        out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                           if "vcf" in out_file else out_file + "-somaticIndels.vcf")
        if ("scalpel" in indelcaller and region and isinstance(region, (tuple, list))
                and chromhacks.is_autosomal_or_sex(region[0])):
            # Scalpel InDels
            if scalpel.is_installed(config):
                if not is_paired:
                    vcfutils.check_paired_problems(items)
                    scalpel._run_scalpel_caller(align_bams, items, ref_file, assoc_files,
                                                region=region, out_file=out_file_indels)
                else:
                    scalpel._run_scalpel_paired(align_bams, items, ref_file, assoc_files,
                                                region=region, out_file=out_file_indels)
                out_file = vcfutils.combine_variant_files(
                    orig_files=[out_file_mutect, out_file_indels],
                    out_file=out_file,
                    ref_file=items[0]["sam_ref"],
                    config=config,
                    region=region)
            else:
                utils.symlink_plus(out_file_mutect, out_file)
        elif "pindel" in indelcaller:
            from bcbio.structural import pindel
            if pindel.is_installed(config):
                pindel._run_tumor_pindel_caller(align_bams, items, ref_file, assoc_files,
                                                region=region, out_file=out_file_indels)
                out_file = vcfutils.combine_variant_files(
                    orig_files=[out_file_mutect, out_file_indels],
                    out_file=out_file,
                    ref_file=ref_file,
                    config=config,
                    region=region)
            else:
                utils.symlink_plus(out_file_mutect, out_file)
        elif (("somaticindeldetector" in indelcaller or "sid" in indelcaller)
              and "appistry" in broad_runner.get_mutect_version()):
            # SomaticIndelDetector InDels
            params_indels = _SID_call_prep(align_bams, items, ref_file, assoc_files,
                                           region, out_file_indels)
            with file_transaction(config, out_file_indels) as tx_out_file:
                params_indels += ["-o", tx_out_file]
                broad_runner.run_mutect(params_indels)
            out_file = vcfutils.combine_variant_files(
                orig_files=[out_file_mutect, out_file_indels],
                out_file=out_file,
                ref_file=items[0]["sam_ref"],
                config=config,
                region=region)
        else:
            utils.symlink_plus(out_file_mutect, out_file)
    return out_file
def _varscan_paired(align_bams, ref_file, items, target_regions, out_file):
    """Run a paired VarScan analysis, also known as "somatic".

    Writes one mpileup per sample (normal, tumor), runs ``VarScan somatic`` on
    them, corrects the SNP and indel VCFs and merges the ones that were
    produced into ``out_file``. Intermediate files are removed afterwards. An
    empty VCF is written when an mpileup is empty or no VarScan output is
    found.

    Raises:
        IOError: if the installed VarScan is older than 2.3.6.
    """
    max_read_depth = "1000"
    config = items[0]["config"]

    version = programs.jar_versioner("varscan", "VarScan")(config)
    if LooseVersion(version) < LooseVersion("v2.3.6"):
        raise IOError(
            "Please install version 2.3.6 or better of VarScan with support "
            "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan",
        config_utils.get_program("varscan", config, "dir"))

    tumor_bam, tumor_name, normal_bam, normal_name = get_paired_bams(
        align_bams, items)

    if not file_exists(out_file):
        base, ext = os.path.splitext(out_file)
        cleanup_files = []
        for fname, mpext in [(normal_bam, "normal"), (tumor_bam, "tumor")]:
            mpfile = "%s-%s.mpileup" % (base, mpext)
            cleanup_files.append(mpfile)
            with file_transaction(mpfile) as mpfile_tx:
                mpileup = samtools.prep_mpileup([fname], ref_file,
                                                max_read_depth, config,
                                                target_regions=target_regions,
                                                want_bcf=False)
                # Command templates are filled from local variable names.
                cmd = "{mpileup} > {mpfile_tx}"
                cmd = cmd.format(**locals())
                do.run(cmd, "samtools mpileup", None,
                       [do.file_exists(mpfile_tx)])

        # Sometimes mpileup writes an empty file: in this case we
        # just skip the rest of the analysis (VarScan will hang otherwise)
        if any(os.stat(filename).st_size == 0 for filename in cleanup_files):
            write_empty_vcf(out_file)
            return

        # First index is normal, second is tumor
        normal_tmp_mpileup = cleanup_files[0]
        tumor_tmp_mpileup = cleanup_files[1]

        jvm_opts = _get_varscan_opts(config)
        varscan_cmd = ("java {jvm_opts} -jar {varscan_jar} somatic"
                       " {normal_tmp_mpileup} {tumor_tmp_mpileup} {base}"
                       " --output-vcf --min-coverage 5 --p-value 0.98")

        indel_file = base + ".indel.vcf"
        snp_file = base + ".snp.vcf"
        cleanup_files.append(indel_file)
        cleanup_files.append(snp_file)

        with file_transaction(indel_file, snp_file) as (tx_indel, tx_snp):
            varscan_cmd = varscan_cmd.format(**locals())
            do.run(varscan_cmd, "Varscan", None, None)

        # VarScan files need to be corrected to match the VCF specification
        # We do this before combining them otherwise merging may fail
        # if there are invalid records
        # NOTE(review): do.file_exists is also passed above as a deferred check
        # to do.run; confirm it returns a plain boolean when used here.
        to_combine = []
        for vcf_file in [snp_file, indel_file]:
            if do.file_exists(vcf_file):
                to_combine.append(vcf_file)
                _fix_varscan_vcf(vcf_file, normal_name, tumor_name)

        if not to_combine:
            write_empty_vcf(out_file)
            return

        # Merge only the files that were actually found.
        out_file = combine_variant_files(to_combine, out_file, ref_file,
                                         config, region=target_regions)

        # Remove cleanup files; an intermediate may legitimately be absent.
        for extra_file in cleanup_files:
            if os.path.exists(extra_file):
                os.remove(extra_file)

        if os.path.getsize(out_file) == 0:
            write_empty_vcf(out_file)
def _varscan_paired(align_bams, ref_file, items, target_regions, out_file):
    """Run a paired VarScan analysis, also known as "somatic".

    Writes one mpileup per sample (normal, tumor), runs ``VarScan somatic`` on
    them, corrects the SNP and indel VCFs and merges the ones that were
    produced into ``out_file``. Intermediate files are removed afterwards. An
    empty VCF is written when an mpileup is empty or no VarScan output is
    found.

    Raises:
        IOError: if the installed VarScan is older than 2.3.6.
        ValueError: if no normal BAM is available.
    """
    max_read_depth = "1000"
    config = items[0]["config"]

    version = programs.jar_versioner("varscan", "VarScan")(config)
    if LooseVersion(version) < LooseVersion("v2.3.6"):
        raise IOError(
            "Please install version 2.3.6 or better of VarScan with support "
            "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan",
        config_utils.get_program("varscan", config, "dir"))

    paired = get_paired_bams(align_bams, items)
    if not paired.normal_bam:
        raise ValueError(
            "Require both tumor and normal BAM files for VarScan cancer calling"
        )

    if not file_exists(out_file):
        base, ext = os.path.splitext(out_file)
        cleanup_files = []
        for fname, mpext in [(paired.normal_bam, "normal"),
                             (paired.tumor_bam, "tumor")]:
            mpfile = "%s-%s.mpileup" % (base, mpext)
            cleanup_files.append(mpfile)
            with file_transaction(mpfile) as mpfile_tx:
                mpileup = samtools.prep_mpileup([fname], ref_file,
                                                max_read_depth, config,
                                                target_regions=target_regions,
                                                want_bcf=False)
                # Command templates are filled from local variable names.
                cmd = "{mpileup} > {mpfile_tx}"
                cmd = cmd.format(**locals())
                do.run(cmd, "samtools mpileup", None,
                       [do.file_exists(mpfile_tx)])

        # Sometimes mpileup writes an empty file: in this case we
        # just skip the rest of the analysis (VarScan will hang otherwise)
        if any(os.stat(filename).st_size == 0 for filename in cleanup_files):
            write_empty_vcf(out_file)
            return

        # First index is normal, second is tumor
        normal_tmp_mpileup = cleanup_files[0]
        tumor_tmp_mpileup = cleanup_files[1]

        jvm_opts = _get_varscan_opts(config)
        varscan_cmd = ("java {jvm_opts} -jar {varscan_jar} somatic"
                       " {normal_tmp_mpileup} {tumor_tmp_mpileup} {base}"
                       " --output-vcf --min-coverage 5 --p-value 0.98 "
                       "--strand-filter 1 ")

        indel_file = base + ".indel.vcf"
        snp_file = base + ".snp.vcf"
        cleanup_files.append(indel_file)
        cleanup_files.append(snp_file)

        with file_transaction(indel_file, snp_file) as (tx_indel, tx_snp):
            varscan_cmd = varscan_cmd.format(**locals())
            do.run(varscan_cmd, "Varscan", None, None)

        # VarScan files need to be corrected to match the VCF specification
        # We do this before combining them otherwise merging may fail
        # if there are invalid records
        # NOTE(review): do.file_exists is also passed above as a deferred check
        # to do.run; confirm it returns a plain boolean when used here.
        to_combine = []
        for vcf_file in [snp_file, indel_file]:
            if do.file_exists(vcf_file):
                to_combine.append(vcf_file)
                _fix_varscan_vcf(vcf_file, paired.normal_name,
                                 paired.tumor_name)

        if not to_combine:
            write_empty_vcf(out_file)
            return

        # Merge only the files that were actually found.
        out_file = combine_variant_files(to_combine, out_file, ref_file,
                                         config, region=target_regions)

        # Remove cleanup files; an intermediate may legitimately be absent.
        for extra_file in cleanup_files:
            if os.path.exists(extra_file):
                os.remove(extra_file)

        if os.path.getsize(out_file) == 0:
            write_empty_vcf(out_file)
def mutect_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Run the MuTect paired analysis algorithm.

    MuTect output is written to an ``-orig`` file, fixed up into the
    ``-mutect`` file and then, depending on the configured indel caller
    (scalpel, pindel or SomaticIndelDetector), combined with indel calls into
    ``out_file``. When no indel caller applies, ``out_file`` is a link to the
    MuTect calls.

    Returns the path to the output VCF on every path, including when an empty
    VCF is written because a BAM has no aligned reads in ``region``.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        broad_runner = broad.runner_from_config(config, "mutect")
        out_file_mutect = (out_file.replace(".vcf", "-mutect.vcf")
                           if "vcf" in out_file else out_file + "-mutect.vcf")
        broad_runner, params = \
            _mutect_call_prep(align_bams, items, ref_file, assoc_files,
                              region, out_file_mutect)
        if (not isinstance(region, (list, tuple)) and
                not all(has_aligned_reads(x, region) for x in align_bams)):
            vcfutils.write_empty_vcf(out_file)
            # Return the path like every other exit, rather than None.
            return out_file
        out_file_orig = "%s-orig%s" % utils.splitext_plus(out_file_mutect)
        with file_transaction(config, out_file_orig) as tx_out_file:
            # Rationale: MuTect writes another table to stdout, which we don't need
            params += ["--vcf", tx_out_file, "-o", os.devnull]
            broad_runner.run_mutect(params)
        is_paired = "-I:normal" in params
        out_file_mutect = _fix_mutect_output(out_file_orig, config, out_file_mutect, is_paired)
        indelcaller = vcfutils.get_indelcaller(config).lower()
        # Every indel caller writes to the same companion file name.
        out_file_indels = (out_file.replace(".vcf", "-somaticIndels.vcf")
                           if "vcf" in out_file else out_file + "-somaticIndels.vcf")
        if "scalpel" in indelcaller:
            # Scalpel InDels
            if scalpel.is_installed(config):
                with file_transaction(config, out_file_indels) as tx_out_file2:
                    if not is_paired:
                        vcfutils.check_paired_problems(items)
                        scalpel._run_scalpel_caller(align_bams, items, ref_file, assoc_files,
                                                    region=region, out_file=tx_out_file2)
                    else:
                        scalpel._run_scalpel_paired(align_bams, items, ref_file, assoc_files,
                                                    region=region, out_file=tx_out_file2)
                out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels],
                                                          out_file=out_file,
                                                          ref_file=items[0]["sam_ref"],
                                                          config=config,
                                                          region=region)
            else:
                utils.symlink_plus(out_file_mutect, out_file)
        elif "pindel" in indelcaller:
            if pindel.is_installed(config):
                pindel._run_tumor_pindel_caller(align_bams, items, ref_file, assoc_files,
                                                region=region, out_file=out_file_indels)
                out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels],
                                                          out_file=out_file,
                                                          ref_file=ref_file,
                                                          config=config,
                                                          region=region)
            else:
                utils.symlink_plus(out_file_mutect, out_file)
        elif (("somaticindeldetector" in indelcaller or "sid" in indelcaller)
              and "appistry" in broad_runner.get_mutect_version()):
            # SomaticIndelDetector InDels
            params_indels = _SID_call_prep(align_bams, items, ref_file, assoc_files,
                                           region, out_file_indels)
            with file_transaction(config, out_file_indels) as tx_out_file:
                params_indels += ["-o", tx_out_file]
                broad_runner.run_mutect(params_indels)
            out_file = vcfutils.combine_variant_files(orig_files=[out_file_mutect, out_file_indels],
                                                      out_file=out_file,
                                                      ref_file=items[0]["sam_ref"],
                                                      config=config,
                                                      region=region)
        else:
            utils.symlink_plus(out_file_mutect, out_file)
    return out_file
def _varscan_paired(align_bams, ref_file, items, target_regions, out_file):
    """Run a paired VarScan analysis, also known as "somatic".

    Writes one mpileup per sample (normal, tumor), runs ``VarScan somatic`` on
    them, corrects the SNP and indel VCFs and merges the ones that were
    produced into ``out_file``. Intermediate files are removed afterwards. An
    empty VCF is written when an mpileup is empty or no VarScan output is
    found.

    Raises:
        IOError: if the installed VarScan is older than 2.3.6.
        ValueError: if no normal BAM is available.
    """
    max_read_depth = "1000"
    config = items[0]["config"]

    version = programs.jar_versioner("varscan", "VarScan")(config)
    if LooseVersion(version) < LooseVersion("v2.3.6"):
        raise IOError(
            "Please install version 2.3.6 or better of VarScan with support "
            "for multisample calling and indels in VCF format.")
    varscan_jar = config_utils.get_jar(
        "VarScan",
        config_utils.get_program("varscan", config, "dir"))

    paired = get_paired_bams(align_bams, items)
    if not paired.normal_bam:
        raise ValueError("Require both tumor and normal BAM files for VarScan cancer calling")

    if not file_exists(out_file):
        # Work on an uncompressed VCF; compress at the end if requested.
        orig_out_file = out_file
        out_file = orig_out_file.replace(".vcf.gz", ".vcf")
        base, ext = utils.splitext_plus(out_file)
        cleanup_files = []
        for fname, mpext in [(paired.normal_bam, "normal"),
                             (paired.tumor_bam, "tumor")]:
            mpfile = "%s-%s.mpileup" % (base, mpext)
            cleanup_files.append(mpfile)
            with file_transaction(mpfile) as mpfile_tx:
                mpileup = samtools.prep_mpileup([fname], ref_file,
                                                max_read_depth, config,
                                                target_regions=target_regions,
                                                want_bcf=False)
                # Command templates are filled from local variable names.
                cmd = "{mpileup} > {mpfile_tx}"
                cmd = cmd.format(**locals())
                do.run(cmd, "samtools mpileup", None,
                       [do.file_exists(mpfile_tx)])

        # Sometimes mpileup writes an empty file: in this case we
        # just skip the rest of the analysis (VarScan will hang otherwise)
        if any(os.stat(filename).st_size == 0 for filename in cleanup_files):
            write_empty_vcf(orig_out_file, config)
            return

        # First index is normal, second is tumor
        normal_tmp_mpileup = cleanup_files[0]
        tumor_tmp_mpileup = cleanup_files[1]

        indel_file = base + ".indel.vcf"
        snp_file = base + ".snp.vcf"
        cleanup_files.append(indel_file)
        cleanup_files.append(snp_file)
        with file_transaction(indel_file, snp_file) as (tx_indel, tx_snp):
            with utils.curdir_tmpdir(items[0]) as tmp_dir:
                jvm_opts = _get_varscan_opts(config, tmp_dir)
                varscan_cmd = ("java {jvm_opts} -jar {varscan_jar} somatic"
                               " {normal_tmp_mpileup} {tumor_tmp_mpileup} {base}"
                               " --output-vcf --min-coverage 5 --p-value 0.98 "
                               "--strand-filter 1 ")
                # add minimum AF (configured as a percentage, default 10)
                if "--min-var-freq" not in varscan_cmd:
                    min_af = float(utils.get_in(paired.tumor_config,
                                                ("algorithm", "min_allele_fraction"),
                                                10)) / 100.0
                    varscan_cmd += "--min-var-freq {min_af} "
                do.run(varscan_cmd.format(**locals()), "Varscan", None, None)

        # VarScan files need to be corrected to match the VCF specification
        # We do this before combining them otherwise merging may fail
        # if there are invalid records
        # NOTE(review): do.file_exists is also passed above as a deferred check
        # to do.run; confirm it returns a plain boolean when used here.
        to_combine = []
        for vcf_file in [snp_file, indel_file]:
            if do.file_exists(vcf_file):
                to_combine.append(vcf_file)
                _fix_varscan_vcf(vcf_file, paired.normal_name,
                                 paired.tumor_name)

        if not to_combine:
            write_empty_vcf(orig_out_file, config)
            return

        # Merge only the files that were actually found.
        out_file = combine_variant_files(to_combine, out_file, ref_file,
                                         config, region=target_regions)

        # Remove cleanup files
        for extra_file in cleanup_files:
            for suffix in ["", ".gz", ".gz.tbi"]:
                if os.path.exists(extra_file + suffix):
                    os.remove(extra_file + suffix)

        if os.path.getsize(out_file) == 0:
            write_empty_vcf(out_file)

        if orig_out_file.endswith(".gz"):
            out_file = bgzip_and_index(out_file, config)

        _add_reject_flag(out_file, config)
def _varscan_paired(align_bams, ref_file, items, target_regions, out_file):
    """Call variants on a tumor/normal pair with VarScan "somatic".

    The normal and tumor pileup commands are fed to VarScan through shell
    process substitution. Each SNP/indel VCF that VarScan leaves behind is
    run through a fixing pipeline into a ``-fix.vcf.gz`` file, and the fixed
    files are merged into ``out_file``. When ``out_file`` already exists the
    function does nothing.

    Raises:
        ValueError: If the pair has no normal BAM.
    """
    max_read_depth = "1000"
    config = items[0]["config"]
    paired = get_paired_bams(align_bams, items)
    if not paired.normal_bam:
        raise ValueError(
            ("Batch {} requires both tumor and normal BAM files for"
             " VarScan cancer calling").format(items[0]["metadata"]["batch"]))

    # Guard clause: a finished output means there is nothing left to do.
    if utils.file_exists(out_file):
        return

    assert out_file.endswith(".vcf.gz"), "Expect bgzipped output to VarScan"

    # Pileup command lines, normal first and tumor second.
    normal_cl, tumor_cl = [
        samtools.prep_mpileup([bam], ref_file, config, max_read_depth,
                              target_regions=target_regions, want_bcf=False)
        for bam in (paired.normal_bam, paired.tumor_bam)]

    stem = utils.splitext_plus(out_file)[0]
    indel_file = stem + "-indel.vcf"
    snp_file = stem + "-snp.vcf"

    with file_transaction(config, indel_file, snp_file) as (tx_indel, tx_snp):
        with tx_tmpdir(items[0]) as tmp_dir:
            varscan_args = {
                "jvm_opts": _get_varscan_opts(config, tmp_dir),
                "remove_zerocoverage": r"ifne grep -v -P '\t0\t\t$'",
                "normal_mpileup_cl": normal_cl,
                "tumor_mpileup_cl": tumor_cl,
                "tx_snp": tx_snp,
                "tx_indel": tx_indel,
            }
            varscan_tmpl = ("varscan {jvm_opts} somatic "
                            " <({normal_mpileup_cl} | {remove_zerocoverage}) "
                            "<({tumor_mpileup_cl} | {remove_zerocoverage}) "
                            "--output-snp {tx_snp} --output-indel {tx_indel} "
                            " --output-vcf --min-coverage 5 --p-value 0.98 "
                            "--strand-filter 1 ")
            # add minimum AF
            if "--min-var-freq" not in varscan_tmpl:
                raw_af = utils.get_in(paired.tumor_config,
                                      ("algorithm", "min_allele_fraction"), 10)
                varscan_args["min_af"] = float(raw_af) / 100.0
                varscan_tmpl += "--min-var-freq {min_af} "
            do.run(varscan_tmpl.format(**varscan_args), "Varscan", None, None)

    # Fix up whichever of the two VarScan outputs exist, SNPs first.
    to_combine = []
    for raw_vcf in (snp_file, indel_file):
        if not utils.file_exists(raw_vcf):
            continue
        fix_file = "%s-fix.vcf.gz" % (utils.splitext_plus(raw_vcf)[0])
        with file_transaction(config, fix_file) as tx_fix_file:
            fix_args = {
                "fname": raw_vcf,
                "fix_ambig_ref": vcfutils.fix_ambiguous_cl(),
                "fix_ambig_alt": vcfutils.fix_ambiguous_cl(5),
                "py_cl": os.path.join(os.path.dirname(sys.executable), "py"),
                "normal_name": paired.normal_name,
                "tumor_name": paired.tumor_name,
                "tx_fix_file": tx_fix_file,
            }
            fix_tmpl = ("cat {fname} | "
                        "{py_cl} -x 'bcbio.variation.varscan.fix_varscan_output(x,"
                        """ "{normal_name}", "{tumor_name}")' | """
                        "{fix_ambig_ref} | {fix_ambig_alt} | ifne vcfuniqalleles | "
                        """bcftools filter -m + -s REJECT -e "SS != '.' && SS != '2'" 2> /dev/null | """
                        "{py_cl} -x 'bcbio.variation.varscan.spv_freq_filter(x, 1)' | "
                        "bgzip -c > {tx_fix_file}")
            do.run(fix_tmpl.format(**fix_args), "Varscan paired fix")
        to_combine.append(fix_file)

    if to_combine:
        out_file = combine_variant_files(to_combine,
                                         out_file, ref_file, config,
                                         region=target_regions)
    else:
        out_file = write_empty_vcf(out_file, config)

    if os.path.getsize(out_file) == 0:
        write_empty_vcf(out_file)
    if out_file.endswith(".gz"):
        bgzip_and_index(out_file, config)