def _get_build_type(fnames, samples, caller):
    """Decide which gemini database flavours should be built.

    A build is only considered when at least one file in ``fnames`` contains
    variants and ``caller`` is not listed in NO_DB_CALLERS. Each sample must
    then have one of the gemini-related entries in tools_on and pass the
    ``vcfanno.annotate_gemini`` check. Every reason for skipping is logged.

    Returns a set containing "gemini" and/or "gemini_orig"; the set is empty
    when no database should be built.
    """
    gemini_tools = ["gemini", "gemini_orig", "gemini_allvariants", "vcf2db_expand"]
    build_type = set()
    if not (any(vcfutils.vcf_has_variants(f) for f in fnames) and caller not in NO_DB_CALLERS):
        logger.info("Not running gemini, no samples with variants found: %s" %
                    (", ".join([dd.get_sample_name(d) for d in samples])))
        return build_type
    for data in samples:
        if not any([x in dd.get_tools_on(data) for x in gemini_tools]):
            logger.info("Not running gemini, not configured in tools_on: %s" % dd.get_sample_name(data))
            continue
        if not vcfanno.annotate_gemini(data):
            logger.info("Not running gemini, input data not found: %s" % dd.get_sample_name(data))
            continue
        build_type.add("gemini_orig" if "gemini_orig" in dd.get_tools_on(data) else "gemini")
    return build_type
def _create_config_file(out_dir, samples):
    """Write the MultiQC configuration file and return its path.

    The configuration hides a fixed set of SnpEff and samtools columns and
    sets a fixed module order. When any sample enables qualimap or
    qualimap_full in tools_on, the bcbio average insert size column is
    hidden as well so it is not reported twice.
    """
    hidden_columns = {
        "SnpEff": {
            "Change_rate": False,
            "Ts_Tv_ratio": False,
            "Number_of_variants_before_filter": False,
        },
        "samtools": {
            "error_rate": False,
        },
    }
    module_order = ["bcbio", "samtools", "goleft_indexcov", "bcftools",
                    "picard", "qualimap", "snpeff", "fastqc"]
    out = {"table_columns_visible": hidden_columns, "module_order": module_order}
    # Avoid duplicated bcbio columns with qualimap
    uses_qualimap = any(("qualimap" in dd.get_tools_on(d) or "qualimap_full" in dd.get_tools_on(d))
                        for d in samples)
    if uses_qualimap:
        hidden_columns["bcbio"] = {"Average_insert_size": False}
    out_file = os.path.join(out_dir, "multiqc_config.yaml")
    with open(out_file, "w") as out_handle:
        yaml.safe_dump(out, out_handle, default_flow_style=False, allow_unicode=False)
    return out_file
def run(call_file, ref_file, vrn_files, data):
    """Filter a GATK call file, treating SNPs and indels separately.

    Without VQSR the hard filters are applied in sequence on the original
    file (SNPs first, then indels). With VQSR the file is split into SNP and
    indel files, each is passed through ``_variant_filtration`` and the
    results are recombined. Non-reference filtering is applied to the result
    unless gVCF output is requested in tools_on.

    Returns the path of the filtered VCF.
    """
    algs = [data["config"]["algorithm"]] * len(data.get("vrn_files", [1]))
    if not config_utils.use_vqsr(algs):
        snp_filter = vfilter.gatk_snp_hard(call_file, data)
        indel_filter = vfilter.gatk_indel_hard(snp_filter, data)
        if "gvcf" in dd.get_tools_on(data):
            return indel_filter
        return _filter_nonref(indel_filter, data)

    # VQSR requires splitting the calls; it cannot be combined with gVCF output
    assert "gvcf" not in dd.get_tools_on(data), \
        "Cannot force gVCF output and use VQSR. Try using hard filtering with tools_off: [vqsr]"
    snp_file, indel_file = vcfutils.split_snps_indels(call_file, ref_file, data["config"])
    snp_filter_file = _variant_filtration(snp_file, ref_file, vrn_files, data, "SNP",
                                          vfilter.gatk_snp_hard)
    indel_filter_file = _variant_filtration(indel_file, ref_file, vrn_files, data, "INDEL",
                                            vfilter.gatk_indel_hard)
    orig_files = [snp_filter_file, indel_filter_file]
    out_file = "%scombined.vcf.gz" % os.path.commonprefix(orig_files)
    combined_file = vcfutils.combine_variant_files(orig_files, out_file, ref_file, data["config"])
    return _filter_nonref(combined_file, data)
def _create_config_file(out_dir, samples):
    """Write the MultiQC configuration file and return its path.

    Hides bcbio and FastQC columns that duplicate Qualimap when any sample
    enables qualimap or qualimap_full, defines the module order (splitting
    bcftools into somatic and germline sub-modules when germline calls are
    present) and adds preseq settings for samples configured to run preseq.
    """
    visible_columns = dict()
    # Avoid duplicated bcbio columns with qualimap
    if any(("qualimap" in dd.get_tools_on(d) or "qualimap_full" in dd.get_tools_on(d)) for d in samples):
        visible_columns["bcbio"] = {"Average_insert_size": False}
        visible_columns["FastQC"] = {"percent_gc": False}
    out = {"table_columns_visible": visible_columns}
    out['bcftools'] = {'write_separate_table': True}

    # Germline calls are present for tumor-only somatic runs with germline
    # extraction, or paired somatic runs with germline calling for the normal.
    has_germline = any("germline" in (get_active_vcinfo(s) or {})
                       or dd.get_phenotype(s) == "germline"
                       for s in samples)
    if has_germline:
        # Split somatic and germline variant stats into separate multiqc submodules,
        # with somatic going into General Stats, and germline going into a separate table.
        bcftools_modules = [
            {'bcftools': {
                'name': 'Bcftools (somatic)',
                'info': 'Bcftools stats for somatic variant calls only.',
                'path_filters': ['*_bcftools_stats.txt'],
                'write_general_stats': True,
            }},
            {'bcftools': {
                'name': 'Bcftools (germline)',
                'info': 'Bcftools stats for germline variant calls only.',
                'path_filters': ['*_bcftools_stats_germline.txt'],
                'write_general_stats': False,
            }},
        ]
    else:
        bcftools_modules = ["bcftools"]
    out["module_order"] = (["bcbio", "samtools", "goleft_indexcov", "peddy"]
                           + bcftools_modules
                           + ["picard", "qualimap", "snpeff", "fastqc", "preseq"])

    preseq_samples = [s for s in samples if tz.get_in(["config", "algorithm", "preseq"], s)]
    if preseq_samples:
        out["preseq"] = _make_preseq_multiqc_config(preseq_samples)

    out_file = os.path.join(out_dir, "multiqc_config.yaml")
    with open(out_file, "w") as out_handle:
        yaml.safe_dump(out, out_handle, default_flow_style=False, allow_unicode=False)
    return out_file
def run(call_file, ref_file, vrn_files, data):
    """Filter a GATK call file, treating SNPs and indels separately.

    With VQSR enabled the calls are split by variant type, each part goes
    through ``_variant_filtration`` with the matching cutoff-based fallback,
    and the parts are recombined. Otherwise the cutoff-based filters are run
    one after another on the full file. Non-reference filtering is skipped
    only when gVCF output is requested without VQSR.

    Returns the path of the filtered VCF.
    """
    algs = [data["config"]["algorithm"]] * len(data.get("vrn_files", [1]))
    if config_utils.use_vqsr(algs):
        assert "gvcf" not in dd.get_tools_on(data), \
            "Cannot force gVCF output and use VQSR. Try using cutoff-based soft filtering with tools_off: [vqsr]"
        snp_file, indel_file = vcfutils.split_snps_indels(call_file, ref_file, data["config"])
        per_type = [(snp_file, "SNP", vfilter.gatk_snp_cutoff),
                    (indel_file, "INDEL", vfilter.gatk_indel_cutoff)]
        orig_files = [_variant_filtration(split_file, ref_file, vrn_files, data, vtype, cutoff_fn)
                      for split_file, vtype, cutoff_fn in per_type]
        out_file = "%scombined.vcf.gz" % os.path.commonprefix(orig_files)
        combined_file = vcfutils.combine_variant_files(orig_files, out_file, ref_file, data["config"])
        return _filter_nonref(combined_file, data)

    indel_filter = vfilter.gatk_indel_cutoff(vfilter.gatk_snp_cutoff(call_file, data), data)
    if "gvcf" in dd.get_tools_on(data):
        return indel_filter
    return _filter_nonref(indel_filter, data)
def _create_config_file(out_dir, samples):
    """Write the MultiQC configuration file and return its path.

    The file hides bcbio/FastQC columns duplicated by Qualimap (when any
    sample enables qualimap or qualimap_full), orders the report modules,
    reports bcftools somatic and germline stats separately when germline
    calls exist, and includes preseq settings for samples that request it.
    """
    def _bcftools_submodule(name, info, path_filter, general_stats):
        """Build one bcftools entry for the MultiQC module order."""
        return {'bcftools': {
            'name': name,
            'info': info,
            'path_filters': [path_filter],
            'write_general_stats': general_stats,
        }}

    out_file = os.path.join(out_dir, "multiqc_config.yaml")
    out = {"table_columns_visible": dict()}

    # Avoid duplicated bcbio columns with qualimap
    qualimap_on = any(("qualimap" in dd.get_tools_on(d) or "qualimap_full" in dd.get_tools_on(d))
                      for d in samples)
    if qualimap_on:
        out["table_columns_visible"]["bcbio"] = {"Average_insert_size": False}
        out["table_columns_visible"]["FastQC"] = {"percent_gc": False}

    # Setting the module order
    module_order = ["bcbio", "samtools", "goleft_indexcov"]
    out['bcftools'] = {'write_separate_table': True}
    # Germline calling was performed for tumor-only somatic runs with germline
    # extraction, or paired somatic runs with germline calling for the normal.
    germline_called = any("germline" in (get_active_vcinfo(s) or {})
                          or dd.get_phenotype(s) == "germline"
                          for s in samples)
    if germline_called:
        # Somatic stats go into General Stats, germline stats into their own table.
        module_order.append(_bcftools_submodule(
            'Bcftools (somatic)', 'Bcftools stats for somatic variant calls only.',
            '*_bcftools_stats.txt', True))
        module_order.append(_bcftools_submodule(
            'Bcftools (germline)', 'Bcftools stats for germline variant calls only.',
            '*_bcftools_stats_germline.txt', False))
    else:
        module_order.append("bcftools")
    module_order += ["picard", "qualimap", "snpeff", "fastqc", "preseq"]
    out["module_order"] = module_order

    preseq_samples = [s for s in samples if tz.get_in(["config", "algorithm", "preseq"], s)]
    if preseq_samples:
        out["preseq"] = _make_preseq_multiqc_config(preseq_samples)

    with open(out_file, "w") as out_handle:
        yaml.safe_dump(out, out_handle, default_flow_style=False, allow_unicode=False)
    return out_file
def _get_build_type(fnames, samples, caller): """Confirm we should build a gemini database: need gemini in tools_on. Checks for valid conditions for running a database and gemini or gemini_orig configured in tools on. """ build_type = set() if any(vcfutils.vcf_has_variants(f) for f in fnames) and caller not in NO_DB_CALLERS: for data in samples: if any([x in dd.get_tools_on(data) for x in ["gemini", "gemini_orig", "gemini_allvariants", "vcf2db_expand"]]): if vcfanno.annoated_gemini(data): build_type.add("gemini_orig" if "gemini_orig" in dd.get_tools_on(data) else "gemini") return build_type
def gatk_snp_cutoff(in_file, data):
    """Apply cutoff-based soft filters to GATK SNP calls.

    The mapping quality cutoff is more lenient than the GATK recommendation:
    MQ < 40 is too stringent, so 30 is used instead: http://imgur.com/a/oHRVB

    QD and FS are left out for gVCF output because they are not calculated
    there:
    https://github.com/broadgsa/gatk-protected/blob/e91472ddc7d58ace52db0cab4d70a072a918d64c/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java#L300

    The GATK best practice SOR filter (SOR > 3.0) is not used since it hurts
    sensitivity relative to precision:
    https://github.com/bcbio/bcbio_validations/tree/master/gatk4#na12878-hg38

    The extra sed command strips escaped quotes from the VCF output, which
    pyVCF fails on.
    """
    expressions = ["MQ < 30.0", "MQRankSum < -12.5", "ReadPosRankSum < -8.0"]
    if "gvcf" not in dd.get_tools_on(data):
        expressions.extend(["QD < 2.0", "FS > 60.0"])
    # GATK Haplotype caller (v2.2) appears to have much larger HaplotypeScores
    # resulting in excessive filtering, so avoid this metric
    caller = utils.get_in(data, ("config", "algorithm", "variantcaller"))
    if caller not in ["gatk-haplotype", "haplotyper"]:
        expressions.append("HaplotypeScore > 13.0")
    snp_expression = 'TYPE="snp" && (%s)' % " || ".join(expressions)
    return cutoff_w_expression(in_file, snp_expression, data, "GATKCutoffSNP", "SNP",
                               extra_cmd=r"""| sed 's/\\"//g'""")
def _bcftools_stats(data, out_dir, vcf_file_key=None, germline=False):
    """Run bcftools stats on the active variant calls of a sample.

    The VCF is taken from the active variant call information, using
    ``vcf_file_key`` when given and "vrn_file" otherwise. Calls are limited
    to PASS/unfiltered records unless joint calling or gVCF output is used.
    The last column of the "ID" line in the raw output is replaced with the
    sample name before the final file is written.

    Returns the path of the stats file, or None when the sample has no
    active variant calls.
    """
    vcinfo = get_active_vcinfo(data)
    if not vcinfo:
        return None
    out_dir = utils.safe_makedir(out_dir)
    vcf_file = vcinfo[vcf_file_key or "vrn_file"]
    keep_all = dd.get_jointcaller(data) or "gvcf" in dd.get_tools_on(data)
    opts = "" if keep_all else "-f PASS,."
    name = dd.get_sample_name(data)
    suffix = "_germline" if germline else ""
    out_file = os.path.join(out_dir, "%s_bcftools_stats%s.txt" % (name, suffix))
    bcftools = config_utils.get_program("bcftools", data["config"])
    if utils.file_exists(out_file):
        return out_file
    with file_transaction(data, out_file) as tx_out_file:
        raw_file = os.path.join(os.path.dirname(tx_out_file),
                                "orig_%s" % os.path.basename(tx_out_file))
        cmd = "{bcftools} stats -s {name} {opts} {vcf_file} > {orig_out_file}"
        do.run(cmd.format(bcftools=bcftools, name=name, opts=opts, vcf_file=vcf_file,
                          orig_out_file=raw_file),
               "bcftools stats %s" % name)
        with open(raw_file) as in_handle, open(tx_out_file, "w") as out_handle:
            for line in in_handle:
                if line.startswith("ID\t"):
                    # Report the sample name instead of the original last field
                    fields = line.split("\t")
                    fields[-1] = "%s\n" % name
                    line = "\t".join(fields)
                out_handle.write(line)
    return out_file
def create_combined_tx2gene(data):
    """Build a single tx2gene CSV covering every genome in a sample.

    The sample is split with ``disambiguate.split`` and a per-genome
    ``<build>-tx2gene.csv`` file is created from the transcriptome GTF (or
    the standard GTF when no transcriptome GTF is configured). Existing
    files are reused. Gene versions are kept when "keep_gene_version" is in
    tools_on. The per-genome files are concatenated into ``tx2gene.csv``.

    Returns the path of the combined file.
    """
    out_dir = os.path.join(dd.get_work_dir(data), "inputs", "transcriptome")
    tx2gene_files = []
    for item in disambiguate.split([data]):
        odata = item[0]
        gtf_file = dd.get_transcriptome_gtf(odata) or dd.get_gtf_file(odata)
        out_file = os.path.join(out_dir, dd.get_genome_build(odata) + "-tx2gene.csv")
        tools_on = dd.get_tools_on(odata)
        k_version = bool(tools_on and "keep_gene_version" in tools_on)
        if not file_exists(out_file):
            out_file = gtf.tx2genefile(gtf_file, out_file, tsv=False, keep_version=k_version)
        tx2gene_files.append(out_file)
    combined_file = os.path.join(out_dir, "tx2gene.csv")
    if file_exists(combined_file):
        return combined_file
    cmd = "cat {tx2gene_file_string} > {tx_out_file}"
    with file_transaction(data, combined_file) as tx_out_file:
        do.run(cmd.format(tx2gene_file_string=" ".join(tx2gene_files), tx_out_file=tx_out_file),
               "Combining tx2gene CSV files.")
    return combined_file
def sample_callable_bed(bam_file, ref_file, data):
    """Retrieve callable regions for a sample, subset by defined analysis regions.

    Coverage is calculated for the BAM file and regions named "CALLABLE"
    are kept, optionally dropping alternative contigs when "noalt_calling"
    is in tools_on or "altcontigs" is in the excluded regions. When variant
    regions are configured the callable regions are intersected with them.

    Returns a CovInfo namedtuple (callable, raw_callable, depth_files).
    """
    from bcbio.heterogeneity import chromhacks
    CovInfo = collections.namedtuple("CovInfo", "callable, raw_callable, depth_files")
    noalt_calling = "noalt_calling" in dd.get_tools_on(data) or "altcontigs" in dd.get_exclude_regions(data)

    def callable_chrom_filter(r):
        """Filter to callable region, potentially limiting by chromosomes.
        """
        if r.name != "CALLABLE":
            return False
        return not noalt_calling or chromhacks.is_nonalt(r.chrom)

    out_file = "%s-callable_sample.bed" % os.path.splitext(bam_file)[0]
    with shared.bedtools_tmpdir(data):
        sv_bed = regions.get_sv_bed(data)
        callable_bed, depth_files = coverage.calculate(bam_file, data, sv_bed)
        input_regions_bed = dd.get_variant_regions(data)
        if not utils.file_uptodate(out_file, callable_bed):
            with file_transaction(data, out_file) as tx_out_file:
                kept = pybedtools.BedTool(callable_bed).filter(callable_chrom_filter)
                if not input_regions_bed:
                    kept.saveas(tx_out_file)
                elif not utils.file_uptodate(out_file, input_regions_bed):
                    targets = pybedtools.BedTool(input_regions_bed)
                    kept.intersect(targets, nonamecheck=True).saveas(tx_out_file)
    return CovInfo(out_file, callable_bed, depth_files)
def _add_config_regions(nblock_regions, ref_regions, data):
    """Extend the nblock regions with regions the configuration excludes.

    Regions of the reference outside the configured variant regions are
    added to the nblock regions, as are alternative contigs when
    "noalt_calling" is in tools_on or "altcontigs" is in the excluded
    regions.

    Returns the merged set of intervals that should not be analyzed.
    Raises ValueError when subtracting the variant regions leaves the
    reference regions unchanged.
    """
    input_regions_bed = dd.get_variant_regions(data)
    if not input_regions_bed:
        all_intervals = nblock_regions
    else:
        input_regions = pybedtools.BedTool(input_regions_bed)
        # work around problem with single region not subtracted correctly.
        if len(input_regions) == 1:
            single = str(input_regions[0]).strip()
            input_regions = pybedtools.BedTool("%s\n%s" % (single, single), from_string=True)
        input_nblock = ref_regions.subtract(input_regions, nonamecheck=True)
        if input_nblock == ref_regions:
            raise ValueError("Input variant_region file (%s) "
                             "excludes all genomic regions. Do the chromosome names "
                             "in the BED file match your genome (chr1 vs 1)?" % input_regions_bed)
        all_intervals = _combine_regions([input_nblock, nblock_regions], ref_regions)

    skip_alts = "noalt_calling" in dd.get_tools_on(data) or "altcontigs" in dd.get_exclude_regions(data)
    if skip_alts:
        from bcbio.heterogeneity import chromhacks
        alt_intervals = ref_regions.filter(lambda r: not chromhacks.is_nonalt(r.chrom))
        all_intervals = _combine_regions([all_intervals, alt_intervals], ref_regions)
    return all_intervals.merge()
def run(bam_file, data, out_dir):
    """Generate Picard metrics for a BAM file when picard is enabled.

    Nothing is run unless "picard" is in tools_on. Metrics are only
    generated when neither the hs_metrics nor the insert_metrics file exists
    in ``out_dir``. After generation, the BAM-derived suffix (the BAM base
    name with the sample name removed, plus ".bam") is stripped from each
    metrics file that was produced.

    Returns an empty dict when picard is not enabled, otherwise the path of
    the hs_metrics file (which is not guaranteed to exist).
    """
    if "picard" not in dd.get_tools_on(data):
        return {}
    ref_file = dd.get_ref_file(data)
    sample = dd.get_sample_name(data)
    target_file = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    broad_runner = broad.PicardCmdRunner("picard", data["config"])
    bam_fname = os.path.abspath(bam_file)
    utils.safe_makedir(out_dir)
    out_base = utils.splitext_plus(os.path.basename(bam_fname))[0]
    hsmetric_file = os.path.join(out_dir, "%s.hs_metrics" % out_base)
    hsinsert_file = os.path.join(out_dir, "%s.insert_metrics" % out_base)
    if not utils.file_exists(hsmetric_file) and not utils.file_exists(hsinsert_file):
        with utils.chdir(out_dir):
            with tx_tmpdir() as tmp_dir:
                # Picard is run against a symlink to the BAM inside out_dir
                cur_bam = os.path.basename(bam_fname)
                if not os.path.exists(cur_bam):
                    os.symlink(bam_fname, cur_bam)
                gen_metrics = PicardMetrics(broad_runner, tmp_dir)
                gen_metrics.report(cur_bam, ref_file, bam.is_paired(bam_fname),
                                   target_file, target_file, None, data["config"])
        # Only clean up metrics files that were actually written
        strip_text = out_base.replace(sample, "")
        for metrics_file in (hsmetric_file, hsinsert_file):
            if utils.file_exists(metrics_file):
                do.run("sed -i 's/%s.bam//g' %s" % (strip_text, metrics_file), "")
    return hsmetric_file
def _associate_cnvkit_out(ckouts, items, is_somatic=False): """Associate cnvkit output with individual items. """ assert len(ckouts) == len(items) out = [] upload_counts = collections.defaultdict(int) for ckout, data in zip(ckouts, items): ckout = copy.deepcopy(ckout) ckout["variantcaller"] = "cnvkit" if utils.file_exists(ckout["cns"]) and _cna_has_values(ckout["cns"]): ckout = _add_seg_to_output(ckout, data) ckout = _add_gainloss_to_output(ckout, data) ckout = _add_segmetrics_to_output(ckout, data) ckout = _add_variantcalls_to_output(ckout, data, items, is_somatic) # ckout = _add_coverage_bedgraph_to_output(ckout, data) ckout = _add_cnr_bedgraph_and_bed_to_output(ckout, data) if "svplots" in dd.get_tools_on(data): ckout = _add_plots_to_output(ckout, data) ckout["do_upload"] = upload_counts[ckout.get("vrn_file")] == 0 if "sv" not in data: data["sv"] = [] data["sv"].append(ckout) if ckout.get("vrn_file"): upload_counts[ckout["vrn_file"]] += 1 out.append(data) return out
def run(bam_file, data, out_dir):
    """Run Picard metrics on a BAM file if picard is enabled in tools_on.

    Existing hs_metrics or insert_metrics files in ``out_dir`` short-circuit
    the run. Otherwise the metrics are generated against a symlink to the
    BAM in ``out_dir`` and the BAM-derived suffix (the BAM base name with
    the sample name removed, plus ".bam") is removed from each metrics file
    that exists afterwards.

    Returns an empty dict when picard is not enabled, otherwise the path of
    the hs_metrics file (which is not guaranteed to exist).
    """
    if "picard" not in dd.get_tools_on(data):
        return {}
    ref_file = dd.get_ref_file(data)
    sample = dd.get_sample_name(data)
    target_file = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    broad_runner = broad.PicardCmdRunner("picard", data["config"])
    bam_fname = os.path.abspath(bam_file)
    utils.safe_makedir(out_dir)
    out_base = utils.splitext_plus(os.path.basename(bam_fname))[0]
    hsmetric_file = os.path.join(out_dir, "%s.hs_metrics" % out_base)
    hsinsert_file = os.path.join(out_dir, "%s.insert_metrics" % out_base)
    # Previous output of either kind means the metrics were already produced
    if utils.file_exists(hsmetric_file) or utils.file_exists(hsinsert_file):
        return hsmetric_file

    with utils.chdir(out_dir):
        with tx_tmpdir() as tmp_dir:
            cur_bam = os.path.basename(bam_fname)
            if not os.path.exists(cur_bam):
                os.symlink(bam_fname, cur_bam)
            gen_metrics = PicardMetrics(broad_runner, tmp_dir)
            gen_metrics.report(cur_bam, ref_file, bam.is_paired(bam_fname),
                               target_file, target_file, None, data["config"])
    if utils.file_exists(hsmetric_file):
        do.run("sed -i 's/%s.bam//g' %s" % (out_base.replace(sample, ""), hsmetric_file), "")
    if utils.file_exists(hsinsert_file):
        do.run("sed -i 's/%s.bam//g' %s" % (out_base.replace(sample, ""), hsinsert_file), "")
    return hsmetric_file
def run(bam_file, data, out_dir):
    """Run Picard metrics on a BAM file if picard is enabled in tools_on.

    An existing ``<sample>-sort.hs_metrics`` file in ``out_dir`` is reused.
    Otherwise metrics are generated against a symlink to the BAM in
    ``out_dir`` and the "-sort.bam" text is stripped from each metrics file
    that exists afterwards.

    Returns an empty dict when picard is not enabled, otherwise the path of
    the hs_metrics file (which is not guaranteed to exist).
    """
    config = data["config"]
    if "picard" not in dd.get_tools_on(data):
        return {}
    ref_file = dd.get_ref_file(data)
    sample = dd.get_sample_name(data)
    target_file = dd.get_variant_regions(data)
    broad_runner = broad.PicardCmdRunner("picard", config)
    bam_fname = os.path.abspath(bam_file)
    utils.safe_makedir(out_dir)
    hsmetric_file = os.path.join(out_dir, "%s-sort.hs_metrics" % sample)
    hsinsert_file = os.path.join(out_dir, "%s-sort.insert_metrics" % sample)
    if utils.file_exists(hsmetric_file):
        return hsmetric_file
    with utils.chdir(out_dir):
        with tx_tmpdir() as tmp_dir:
            cur_bam = os.path.basename(bam_fname)
            if not os.path.exists(cur_bam):
                os.symlink(bam_fname, cur_bam)
            gen_metrics = PicardMetrics(broad_runner, tmp_dir)
            gen_metrics.report(cur_bam, ref_file, bam.is_paired(bam_fname),
                               target_file, target_file, None, config)
    # `sed -i` exits with an error on a missing file, so only touch metrics
    # files that were actually produced by the report.
    for metrics_file in (hsmetric_file, hsinsert_file):
        if utils.file_exists(metrics_file):
            do.run("sed -i 's/-sort.bam//g' %s" % metrics_file, "")
    return hsmetric_file
def make_bcbiornaseq_object(data):
    """Load the initial bcb.rda objects using bcbioRNASeq.

    Does nothing unless "bcbiornaseq" is in tools_on. Otherwise an R load
    script is written and run with Rscript inside the ``bcbioRNASeq`` upload
    directory, first at gene level and then at transcript level, with counts
    written after each load. A quality report is produced at the end.

    Returns ``data``.
    """
    if "bcbiornaseq" not in dd.get_tools_on(data):
        return data
    upload_dir = tz.get_in(("upload", "dir"), data)
    report_dir = os.path.join(upload_dir, "bcbioRNASeq")
    safe_makedir(report_dir)
    organism = dd.get_bcbiornaseq(data).get("organism", None)
    groups = dd.get_bcbiornaseq(data).get("interesting_groups", None)
    # (level, load script name, output subdirectory, log message)
    load_steps = [
        ("gene", "load_bcbioRNAseq.R", "data",
         "Loading bcbioRNASeq object."),
        ("transcript", "load_transcript_bcbioRNAseq.R", "data-transcript",
         "Loading transcript-level bcbioRNASeq object."),
    ]
    for level, script_name, rda_dir, message in load_steps:
        loadstring = create_load_string(upload_dir, groups, organism, level)
        r_file = os.path.join(report_dir, script_name)
        with file_transaction(r_file) as tmp_file:
            memoize_write_file(loadstring, tmp_file)
        rcmd = Rscript_cmd()
        with chdir(report_dir):
            do.run([rcmd, "--vanilla", r_file], message)
            write_counts(os.path.join(report_dir, rda_dir, "bcb.rda"), level)
    make_quality_report(data)
    return data
def _bedpes_from_cnv_caller(data, work_dir):
    """Retrieve BEDPE deletions and duplications from CNV callers.

    Currently integrates with CNVkit. The first structural variant entry
    from a supported caller that has a "cns" file is used, and only when
    "lumpy_usecnv" is in tools_on.

    Returns a tuple (deletions BEDPE, duplications BEDPE), or (None, None)
    when no usable CNV output is found or the conversion script is not
    available.
    """
    supported = set(["cnvkit"])
    cns_file = next((sv["cns"] for sv in data.get("sv", [])
                     if sv["variantcaller"] in supported and "cns" in sv
                     and "lumpy_usecnv" in dd.get_tools_on(data)),
                    None)
    if not cns_file:
        return None, None
    out_base = os.path.join(work_dir, utils.splitext_plus(os.path.basename(cns_file))[0])
    out_dels = out_base + "-dels.bedpe"
    out_dups = out_base + "-dups.bedpe"
    if os.path.exists(out_dels) and os.path.exists(out_dups):
        return out_dels, out_dups
    with file_transaction(data, out_dels, out_dups) as (tx_out_dels, tx_out_dups):
        try:
            cnvanator_path = config_utils.get_program("cnvanator_to_bedpes.py", data)
        except config_utils.CmdNotFound:
            return None, None
        # XXX Uses default piece size for CNVkit. Right approach?
        cmd = [cnvanator_path, "-c", cns_file, "--cnvkit",
               "--del_o=%s" % tx_out_dels, "--dup_o=%s" % tx_out_dups,
               "-b", "250"]
        do.run(cmd, "Prepare CNVkit as input for lumpy", data)
    return out_dels, out_dups
def _apply_priority_filter(in_file, priority_file, data):
    """Annotate variants with priority information and use it to filter.

    The EPR INFO field is added from ``priority_file`` with bcftools
    annotate. When "tumoronly_germline_filter" is in tools_on, records whose
    first EPR value is not 'pass' are additionally soft-filtered as
    'LowPriority'. The output is bgzipped and indexed.

    Returns the path of the annotated file.
    """
    out_file = "%s-priority%s" % utils.splitext_plus(in_file)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            header_file = "%s-repeatheader.txt" % utils.splitext_plus(tx_out_file)[0]
            with open(header_file, "w") as out_handle:
                out_handle.write('##INFO=<ID=EPR,Number=.,Type=String,'
                                 'Description="Somatic prioritization based on external annotations, '
                                 'identify as likely germline">')
            filter_cmd = ""
            if "tumoronly_germline_filter" in dd.get_tools_on(data):
                filter_cmd = ("bcftools filter -m '+' -s 'LowPriority' "
                              """-e "EPR[0] != 'pass'" |""")
            cmd = ("bcftools annotate -a {priority_file} -h {header_file} "
                   "-c CHROM,FROM,TO,REF,ALT,INFO/EPR {in_file} | "
                   "{filter_cmd} bgzip -c > {tx_out_file}")
            do.run(cmd.format(priority_file=priority_file, header_file=header_file,
                              in_file=in_file, filter_cmd=filter_cmd, tx_out_file=tx_out_file),
                   "Run external annotation based prioritization filtering")
    vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
def haplotype_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Call variation with GATK's HaplotypeCaller.

    This requires the full non open-source version of GATK.

    When ``out_file`` is not given it is derived from the first BAM in
    ``align_bams``. The caller is only run when ``out_file`` does not exist;
    the result is always bgzipped and indexed.

    ``assoc_files`` is not used in this function body.

    Returns the result of ``vcfutils.bgzip_and_index`` on ``out_file``.
    Raises AssertionError when the available GATK is neither the restricted
    version nor GATK4.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        num_cores = dd.get_num_cores(items[0])
        broad_runner, params = \
            _shared_gatk_call_prep(align_bams, items, ref_file, region, out_file, num_cores)
        gatk_type = broad_runner.gatk_type()
        assert gatk_type in ["restricted", "gatk4"], \
            "Require full version of GATK 2.4+, or GATK4 for haplotype calling"
        with file_transaction(items[0], out_file) as tx_out_file:
            # Multicore GATK4 runs use the Spark caller with a local master,
            # keeping Spark scratch files next to the transactional output.
            if num_cores > 1 and gatk_type == "gatk4":
                params += ["-T", "HaplotypeCallerSpark", "--spark-master", "local[%s]" % num_cores,
                           "--conf", "spark.local.dir=%s" % os.path.dirname(tx_out_file)]
            else:
                params += ["-T", "HaplotypeCaller"]
            params += ["--annotation", "ClippingRankSumTest",
                       "--annotation", "DepthPerSampleHC"]
            # The output flag differs between GATK4 and earlier versions
            if gatk_type == "gatk4":
                params += ["--output", tx_out_file]
            else:
                params += ["-o", tx_out_file]
            # Enable hardware based optimizations in GATK 3.1+
            if LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.1"):
                # GATK4 selects the right HMM optimization automatically with FASTEST_AVAILABLE
                if not gatk_type == "gatk4" and _supports_avx():
                    params += ["--pair_hmm_implementation", "VECTOR_LOGLESS_CACHING"]
            # Prepare gVCFs if doing joint calling
            is_joint = False
            if _joint_calling(items) or any("gvcf" in dd.get_tools_on(d) for d in items):
                is_joint = True
                if gatk_type == "gatk4":
                    params += ["--emit-ref-confidence", "GVCF"]
                else:
                    params += ["--emitRefConfidence", "GVCF"]
                    params += ["--variant_index_type", "LINEAR", "--variant_index_parameter", "128000"]
                # Set GQ banding to not be single GQ resolution
                # No recommended default but try to balance resolution and size
                # http://gatkforums.broadinstitute.org/gatk/discussion/7051/recommendation-best-practices-gvcf-gq-bands
                for boundary in [10, 20, 30, 40, 60, 80]:
                    params += ["-GQB", str(boundary)]
            # Enable non-diploid calling in GATK 3.3+
            if LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.3"):
                # GenomicsDB does not support non-diploid samples in GATK4 joint calling
                # https://gatkforums.broadinstitute.org/gatk/discussion/10061/using-genomicsdbimport-to-prepare-gvcfs-for-input-to-genotypegvcfs-in-gatk4
                # NOTE(review): this parses as `(not is_joint) and gatk_type == "gatk4"`,
                # so -ploidy is only passed for non-joint GATK4 runs. The comments
                # above suggest `not (is_joint and gatk_type == "gatk4")` may have
                # been intended -- confirm before changing.
                if not is_joint and gatk_type == "gatk4":
                    params += ["-ploidy", str(ploidy.get_ploidy(items, region))]
            # User supplied options from the gatk-haplotype resources are appended last
            resources = config_utils.get_resources("gatk-haplotype", items[0]["config"])
            if "options" in resources:
                params += [str(x) for x in resources.get("options", [])]
            broad_runner.new_resources("gatk-haplotype")
            # Memory scaling is only requested for multicore runs
            memscale = {"magnitude": 0.9 * num_cores, "direction": "increase"} if num_cores > 1 else None
            broad_runner.run_gatk(params, os.path.dirname(tx_out_file), memscale=memscale,
                                  parallel_gc=(num_cores > 1 and gatk_type == "gatk4"))
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted output BAM.

    The output BAM is named from ``names["lane"]`` when such a file already
    exists, otherwise from the sample name. Alignment is skipped when the
    output (or the final combined file for split alignments) already exists.

    Returns ``data`` with "work_bam" set to the output BAM path.
    """
    # An empty string stands in for a missing second read file
    pair_file = pair_file if pair_file else ""
    # back compatible -- older files were named with lane information, use sample name now
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    if not utils.file_exists(out_file):
        out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    qual_format = data["config"]["algorithm"].get("quality_format", "").lower()
    min_size = None
    if data.get("align_split") or fastq_file.endswith(".sdf"):
        # Split or SDF inputs are read through named pipes and written to an
        # intermediate file that is later combined into final_file.
        if fastq_file.endswith(".sdf"):
            min_size = rtg.min_read_size(fastq_file)
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
    else:
        final_file = None
        # Inputs flagged with "illumina" quality format go through a conversion pipe
        if qual_format == "illumina":
            fastq_file = alignprep.fastq_convert_pipe_cl(fastq_file, data)
            if pair_file:
                pair_file = alignprep.fastq_convert_pipe_cl(pair_file, data)
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file) and (final_file is None or not utils.file_exists(final_file)):
        # If we cannot do piping, use older bwa aln approach
        if ("bwa-mem" not in dd.get_tools_on(data) and
              ("bwa-mem" in dd.get_tools_off(data) or not _can_use_mem(fastq_file, data, min_size))):
            out_file = _align_backtrack(fastq_file, pair_file, ref_file, out_file,
                                        names, rg_info, data)
        else:
            out_file = _align_mem(fastq_file, pair_file, ref_file, out_file,
                                  names, rg_info, data)
    data["work_bam"] = out_file
    return data
def run(call_file, ref_file, vrn_files, data):
    """Filter a GATK call file, treating SNPs and indels separately.

    Variants with missing alts are removed first when the sample requires
    it. Filtering then uses, in order of preference: the CNN filter when
    "gatkcnn" is in tools_on, VQSR when enabled for the call file, or the
    cutoff-based SNP and indel filters.

    Returns the path of the filtered VCF.
    Raises ValueError when VQSR is requested for a gVCF call file.
    """
    algs = [data["config"]["algorithm"]] * len(data.get("vrn_files", [1]))
    if includes_missingalt(data):
        logger.info("Removing variants with missing alts from %s." % call_file)
        call_file = gatk_remove_missingalt(call_file, data)

    if "gatkcnn" in dd.get_tools_on(data):
        return _cnn_filter(call_file, vrn_files, data)

    if not config_utils.use_vqsr(algs, call_file):
        snp_filter = vfilter.gatk_snp_cutoff(call_file, data)
        return vfilter.gatk_indel_cutoff(snp_filter, data)

    if vcfutils.is_gvcf_file(call_file):
        raise ValueError("Cannot force gVCF output with joint calling using tools_on: [gvcf] and use VQSR. "
                         "Try using cutoff-based soft filtering with tools_off: [vqsr]")
    snp_file, indel_file = vcfutils.split_snps_indels(call_file, ref_file, data["config"])
    snp_filter_file = _variant_filtration(snp_file, ref_file, vrn_files, data, "SNP",
                                          vfilter.gatk_snp_cutoff)
    indel_filter_file = _variant_filtration(indel_file, ref_file, vrn_files, data, "INDEL",
                                            vfilter.gatk_indel_cutoff)
    orig_files = [snp_filter_file, indel_filter_file]
    out_file = "%scombined.vcf.gz" % os.path.commonprefix(orig_files)
    return vcfutils.combine_variant_files(orig_files, out_file, ref_file, data["config"])
def create_gemini_db(gemini_vcf, data, gemini_db=None, ped_file=None):
    """Generalized vcfanno/vcf2db workflow for loading variants into a GEMINI database.

    The database path defaults to the VCF path with a ``.db`` extension. The
    VCF is annotated with vcfanno (using the configured vcfanno files, or
    "gemini" when none are set) and loaded with vcf2db.py. Genotype fields
    are expanded when "vcf2db_expand" is in tools_on.

    Returns the database path, or None when the VCF has no variants.
    """
    gemini_db = gemini_db or "%s.db" % utils.splitext_plus(gemini_vcf)[0]
    if not vcfutils.vcf_has_variants(gemini_vcf):
        return None
    if utils.file_exists(gemini_db):
        return gemini_db

    data_basepath = install.get_gemini_dir(data) if support_gemini_orig(data) else None
    conf_files = dd.get_vcfanno(data) or ["gemini"]
    ann_file = vcfanno.run_vcfanno(gemini_vcf, conf_files, data, data_basepath)
    with file_transaction(data, gemini_db) as tx_gemini_db:
        vcf2db = config_utils.get_program("vcf2db.py", data)
        vcf2db_args = []
        if "vcf2db_expand" in dd.get_tools_on(data):
            for field in ["gt_types", "gt_ref_depths", "gt_alt_depths"]:
                vcf2db_args += ["--expand", field]
        cmd = [vcf2db, ann_file, ped_file, tx_gemini_db] + vcf2db_args
        do.run(cmd, "GEMINI: create database with vcf2db")
    return gemini_db
def _freebayes_options_from_config(items, config, out_file, region=None):
    """Prepare standard FreeBayes options from configuration input.

    Input BED target files are merged to avoid overlapping regions which
    cause FreeBayes to call multiple times. For whole genome inputs, high
    depth regions are removed from file-based targets; if that leaves an
    empty target file the caller should skip the FreeBayes run.

    Returns a tuple (list of command line options, flag that is True when no
    target regions remain).
    """
    def _is_genome(x):
        """Check if a sample is configured with genome coverage interval."""
        return tz.get_in(["config", "algorithm", "coverage_interval"], x, "").lower() == "genome"

    opts = ["--genotype-qualities", "--strict-vcf",
            "--ploidy", str(ploidy.get_ploidy(items, region))]
    variant_regions = bedutils.merge_overlaps(bedutils.population_variant_regions(items), items[0])
    # Produce gVCF output
    if any("gvcf" in dd.get_tools_on(d) for d in items):
        opts.extend(["--gvcf", "--gvcf-chunk", "50000"])

    no_target_regions = False
    target = shared.subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        target_is_file = isinstance(target, basestring) and os.path.isfile(target)
        if not target_is_file:
            opts.extend(["--region", region_to_freebayes(target)])
        else:
            if any(_is_genome(x) for x in items):
                target = shared.remove_highdepth_regions(target, items)
                no_target_regions = os.path.getsize(target) == 0
            opts.extend(["--targets", target])

    resources = config_utils.get_resources("freebayes", config)
    if resources.get("options"):
        opts += resources["options"]
    return opts, no_target_regions
def run(call_file, ref_file, vrn_files, data):
    """Run filtering on the input call file, handling SNPs and indels separately.

    Records with missing alts are stripped when the sample needs it. The
    CNN filter takes priority when "gatkcnn" is in tools_on; otherwise VQSR
    is used when enabled for the call file, falling back to the cutoff-based
    SNP and indel filters.

    Returns the path of the filtered VCF.
    Raises ValueError when VQSR is requested for a gVCF call file.
    """
    algs = [data["config"]["algorithm"]] * len(data.get("vrn_files", [1]))
    if includes_missingalt(data):
        logger.info("Removing variants with missing alts from %s." % call_file)
        call_file = gatk_remove_missingalt(call_file, data)
    if "gatkcnn" in dd.get_tools_on(data):
        filtered_file = _cnn_filter(call_file, vrn_files, data)
    elif config_utils.use_vqsr(algs, call_file):
        if vcfutils.is_gvcf_file(call_file):
            raise ValueError(
                "Cannot force gVCF output with joint calling using tools_on: [gvcf] and use VQSR. "
                "Try using cutoff-based soft filtering with tools_off: [vqsr]")
        snp_file, indel_file = vcfutils.split_snps_indels(call_file, ref_file, data["config"])
        # Each variant type is filtered with its matching cutoff fallback
        per_type = [(snp_file, "SNP", vfilter.gatk_snp_cutoff),
                    (indel_file, "INDEL", vfilter.gatk_indel_cutoff)]
        orig_files = [_variant_filtration(split_file, ref_file, vrn_files, data, vtype, cutoff_fn)
                      for split_file, vtype, cutoff_fn in per_type]
        out_file = "%scombined.vcf.gz" % os.path.commonprefix(orig_files)
        filtered_file = vcfutils.combine_variant_files(orig_files, out_file, ref_file,
                                                       data["config"])
    else:
        filtered_file = vfilter.gatk_indel_cutoff(vfilter.gatk_snp_cutoff(call_file, data), data)
    return filtered_file
def sample_callable_bed(bam_file, ref_file, data):
    """Retrieve callable regions for a sample subset by defined analysis regions.

    Calculates coverage for the BAM file, keeps regions named CALLABLE
    (optionally restricted to non-alt chromosomes) and intersects them with the
    configured variant regions when present.

    Returns a CovInfo namedtuple of (callable, raw_callable, depth_files) where
    callable is the sample specific BED file written next to the BAM file.
    ref_file is accepted for interface compatibility and not used here.
    """
    from bcbio.heterogeneity import chromhacks
    CovInfo = collections.namedtuple("CovInfo", "callable, raw_callable, depth_files")
    noalt_calling = ("noalt_calling" in dd.get_tools_on(data)
                     or "altcontigs" in dd.get_exclude_regions(data))

    def callable_chrom_filter(r):
        """Filter to callable region, potentially limiting by chromosomes.
        """
        return r.name == "CALLABLE" and (not noalt_calling or chromhacks.is_nonalt(r.chrom))

    out_file = "%s-callable_sample.bed" % os.path.splitext(bam_file)[0]
    with shared.bedtools_tmpdir(data):
        sv_bed = regions.get_sv_bed(data)
        callable_bed, depth_files = coverage.calculate(bam_file, data, sv_bed)
        input_regions_bed = dd.get_variant_regions(data)
        if not utils.file_uptodate(out_file, callable_bed):
            with file_transaction(data, out_file) as tx_out_file:
                callable_regions = pybedtools.BedTool(callable_bed)
                filter_regions = callable_regions.filter(callable_chrom_filter)
                # We only get here when out_file needs regenerating, so always
                # write it. Skipping the write when out_file is newer than the
                # variant regions would silently leave a stale file in place.
                if input_regions_bed:
                    input_regions = pybedtools.BedTool(input_regions_bed)
                    filter_regions.intersect(input_regions, nonamecheck=True).saveas(tx_out_file)
                else:
                    filter_regions.saveas(tx_out_file)
    return CovInfo(out_file, callable_bed, depth_files)
def get_qc_tools(data):
    """Retrieve a list of QC tools to use based on configuration and analysis type.

    An explicit QC list in the algorithm configuration is returned unchanged;
    otherwise the list is assembled from the analysis type, tools_on and
    tools_off.
    """
    configured = dd.get_algorithm_qc(data)
    if configured:
        return configured
    analysis = data["analysis"].lower()
    to_run = []
    if "fastqc" not in dd.get_tools_off(data):
        to_run.append("fastqc")
    if any(tool in dd.get_tools_on(data) for tool in ("qualimap", "qualimap_full")):
        to_run.append("qualimap")
    if analysis.startswith("rna-seq"):
        if gtf.is_qualimap_compatible(dd.get_gtf_file(data)):
            to_run.append("qualimap_rnaseq")
        else:
            logger.debug("GTF not compatible with Qualimap, skipping.")
    if analysis.startswith("smallrna-seq"):
        to_run.append("small-rna")
    else:
        to_run.extend(["samtools", "gemini"])
        if tz.get_in(["config", "algorithm", "kraken"], data):
            to_run.append("kraken")
    if analysis.startswith(("standard", "variant", "variant2")):
        to_run.extend(["qsignature", "coverage", "variants", "picard"])
    return to_run
def _add_config_regions(nblock_regions, ref_regions, data):
    """Add additional nblock regions based on configured regions to call.

    Regions of the reference outside the configured variant regions are
    combined with the existing nblock regions. Alt contigs are also added when
    noalt_calling is in tools_on or altcontigs is an excluded region. Returns
    the merged intervals.
    """
    input_regions_bed = dd.get_variant_regions(data)
    if not input_regions_bed:
        all_intervals = nblock_regions
    else:
        input_regions = pybedtools.BedTool(input_regions_bed)
        # work around problem with single region not subtracted correctly.
        if len(input_regions) == 1:
            single = str(input_regions[0]).strip()
            input_regions = pybedtools.BedTool("%s\n%s" % (single, single), from_string=True)
        input_nblock = ref_regions.subtract(input_regions, nonamecheck=True)
        # Nothing subtracted means the BED file did not overlap the reference at all
        if input_nblock == ref_regions:
            raise ValueError("Input variant_region file (%s) "
                             "excludes all genomic regions. Do the chromosome names "
                             "in the BED file match your genome (chr1 vs 1)?" % input_regions_bed)
        all_intervals = _combine_regions([input_nblock, nblock_regions], ref_regions)
    skip_alts = ("noalt_calling" in dd.get_tools_on(data)
                 or "altcontigs" in dd.get_exclude_regions(data))
    if skip_alts:
        from bcbio.heterogeneity import chromhacks
        remove_intervals = ref_regions.filter(lambda r: not chromhacks.is_nonalt(r.chrom))
        all_intervals = _combine_regions([all_intervals, remove_intervals], ref_regions)
    return all_intervals.merge()
def run(items):
    """Perform detection of structural variations with Manta.

    Runs the Manta workflow once for the batch if the output is missing, then
    attaches the finalized VCF to the "sv" list of every sample. Returns the
    updated samples.
    """
    paired = vcfutils.get_paired(items)
    data = paired.tumor_data if paired else items[0]
    work_dir = _sv_workdir(data)
    variant_file = _get_out_file(work_dir, paired)
    if not utils.file_exists(variant_file):
        with file_transaction(data, work_dir) as tx_work_dir:
            utils.safe_makedir(tx_work_dir)
            tx_workflow_file = _prep_config(items, paired, tx_work_dir)
            _run_workflow(items, paired, tx_workflow_file, tx_work_dir)
    assert utils.file_exists(variant_file), "Manta finished without output file %s" % variant_file
    out = []
    for sample in items:
        use_bpi = (paired and paired.normal_bam
                   and "break-point-inspector" in dd.get_tools_on(sample))
        if use_bpi:
            variant_file = _run_break_point_inspector(sample, variant_file, paired)
        sample.setdefault("sv", [])
        final_vcf = shared.finalize_sv(variant_file, sample, items)
        sample["sv"].append({"variantcaller": "manta", "vrn_file": final_vcf})
        out.append(sample)
    return out
def haplotype_caller(align_bams, items, ref_file, assoc_files,
                     region=None, out_file=None):
    """Call variation with GATK's HaplotypeCaller.

    This requires the full non open-source version of GATK.

    The output defaults to a -variants.vcf.gz file next to the first BAM and
    is not regenerated if it already exists. Returns the output file name.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if utils.file_exists(out_file):
        return out_file
    broad_runner, params = _shared_gatk_call_prep(align_bams, items, ref_file,
                                                  assoc_files.get("dbsnp"), region, out_file)
    gatk_type = broad_runner.gatk_type()
    assert gatk_type in ["restricted", "gatk4"], \
        "Require full version of GATK 2.4+, or GATK4 for haplotype calling"
    is_gatk4 = gatk_type == "gatk4"
    with file_transaction(items[0], out_file) as tx_out_file:
        params += ["-T", "HaplotypeCaller",
                   "--annotation", "ClippingRankSumTest",
                   "--annotation", "DepthPerSampleHC"]
        params += ["--output" if is_gatk4 else "-o", tx_out_file]
        major_version = LooseVersion(broad_runner.gatk_major_version())
        # Enable hardware based optimizations in GATK 3.1+
        # GATK4 selects the right HMM optimization automatically with FASTEST_AVAILABLE
        if major_version >= LooseVersion("3.1") and not is_gatk4:
            params += ["--pair_hmm_implementation", "VECTOR_LOGLESS_CACHING"]
        # Enable non-diploid calling in GATK 3.3+
        if major_version >= LooseVersion("3.3"):
            params += ["-ploidy", str(ploidy.get_ploidy(items, region))]
        # Prepare gVCFs if doing joint calling
        if _joint_calling(items) or any("gvcf" in dd.get_tools_on(d) for d in items):
            params += ["--emitRefConfidence", "GVCF",
                       "--variant_index_type", "LINEAR",
                       "--variant_index_parameter", "128000"]
            # Set GQ banding to not be single GQ resolution
            # No recommended default but try to balance resolution and size
            # http://gatkforums.broadinstitute.org/gatk/discussion/7051/recommendation-best-practices-gvcf-gq-bands
            for boundary in [10, 20, 30, 40, 60, 80]:
                params += ["-GQB", str(boundary)]
        resources = config_utils.get_resources("gatk-haplotype", items[0]["config"])
        if "options" in resources:
            params += [str(x) for x in resources.get("options", [])]
        broad_runner.new_resources("gatk-haplotype")
        broad_runner.run_gatk(params)
    return out_file
def get_qc_tools(data):
    """Retrieve a list of QC tools to use based on configuration and analysis type.

    Uses the configured QC list if one was previously set. Otherwise builds
    the defaults from the analysis name together with tools_on and tools_off.
    """
    preset = dd.get_algorithm_qc(data)
    if preset:
        return preset

    analysis = data["analysis"].lower()
    is_smallrna = analysis.startswith("smallrna-seq")
    to_run = []

    if "fastqc" not in dd.get_tools_off(data):
        to_run.append("fastqc")
    if any(tool in dd.get_tools_on(data) for tool in ["qualimap", "qualimap_full"]):
        to_run.append("qualimap")

    if analysis.startswith("rna-seq"):
        if gtf.is_qualimap_compatible(dd.get_gtf_file(data)):
            to_run.append("qualimap_rnaseq")
        else:
            logger.debug("GTF not compatible with Qualimap, skipping.")

    if is_smallrna:
        to_run.append("small-rna")
    if not is_smallrna:
        to_run += ["samtools", "gemini"]
        if tz.get_in(["config", "algorithm", "kraken"], data):
            to_run.append("kraken")

    if analysis.startswith(("standard", "variant", "variant2")):
        to_run += ["qsignature", "coverage", "variants", "picard"]
    return to_run
def _bedpes_from_cnv_caller(data, work_dir):
    """Retrieve BEDPEs deletion and duplications from CNV callers.

    Currently integrates with CNVkit. Returns a tuple of (deletions BEDPE,
    duplications BEDPE), or (None, None) when there is no usable CNVkit
    segment file, lumpy_usecnv is not in tools_on, or the conversion script
    is not found.
    """
    supported = set(["cnvkit"])
    # First supported call with a segment file, only when lumpy_usecnv is on
    cns_file = next((sv["cns"] for sv in data.get("sv", [])
                     if sv["variantcaller"] in supported and "cns" in sv
                     and "lumpy_usecnv" in dd.get_tools_on(data)),
                    None)
    if not cns_file:
        return None, None
    out_base = os.path.join(work_dir, utils.splitext_plus(os.path.basename(cns_file))[0])
    out_dels = out_base + "-dels.bedpe"
    out_dups = out_base + "-dups.bedpe"
    have_outputs = os.path.exists(out_dels) and os.path.exists(out_dups)
    if not have_outputs:
        with file_transaction(data, out_dels, out_dups) as (tx_out_dels, tx_out_dups):
            try:
                cnvanator_path = config_utils.get_program("cnvanator_to_bedpes.py", data)
            except config_utils.CmdNotFound:
                return None, None
            cmd = [cnvanator_path, "-c", cns_file, "--cnvkit",
                   "--del_o=%s" % tx_out_dels, "--dup_o=%s" % tx_out_dups,
                   "-b", "250"]  # XXX Uses default piece size for CNVkit. Right approach?
            do.run(cmd, "Prepare CNVkit as input for lumpy", data)
    return out_dels, out_dups
def get_analysis_intervals(data, vrn_file, base_dir):
    """Retrieve analysis regions for the current variant calling pipeline.

    Sources are tried in order: callable regions derived from a gVCF, the
    ensemble BED, precomputed callable regions, callable regions calculated
    from an available BAM file and finally configured callable or variant
    regions. Returns None when nothing is available.
    """
    if vrn_file and "gvcf" in dd.get_tools_on(data):
        callable_bed = _callable_from_gvcf(data, vrn_file, base_dir)
        if callable_bed:
            return callable_bed
    if data.get("ensemble_bed"):
        return data["ensemble_bed"]
    callable_regions = dd.get_callable_regions(data)
    if callable_regions:
        return callable_regions
    for bam_key in ("align_bam", "work_bam", "work_bam_callable"):
        if data.get(bam_key):
            return callable.sample_callable_bed(data[bam_key], dd.get_ref_file(data), data)[0]
    for region_key in ("callable_regions", "variant_regions"):
        configured = tz.get_in(["config", "algorithm", region_key], data)
        if configured:
            return configured
    return None
def run(items):
    """Perform detection of structural variations with Manta.

    Runs the Manta workflow once for the batch if needed, annotates the calls
    with depth and attaches the finalized VCF to each sample. Only the first
    sample referencing a given final VCF is flagged for upload.
    """
    paired = vcfutils.get_paired(items)
    data = paired.tumor_data if paired else items[0]
    work_dir = _sv_workdir(data)
    variant_file = _get_out_file(work_dir, paired)
    if not utils.file_exists(variant_file):
        with file_transaction(data, work_dir) as tx_work_dir:
            utils.safe_makedir(tx_work_dir)
            tx_workflow_file = _prep_config(items, paired, tx_work_dir)
            _run_workflow(items, paired, tx_workflow_file, tx_work_dir)
    assert utils.file_exists(variant_file), "Manta finished without output file %s" % variant_file
    variant_file = shared.annotate_with_depth(variant_file, items)
    out = []
    uploaded = set()
    for sample in items:
        run_bpi = ("break-point-inspector" in dd.get_tools_on(sample)
                   and paired and paired.normal_bam
                   and paired.tumor_name == dd.get_sample_name(sample))
        if run_bpi:
            variant_file = _run_break_point_inspector(sample, variant_file, paired, work_dir)
        sample.setdefault("sv", [])
        final_vcf = shared.finalize_sv(variant_file, sample, items)
        vc = {"variantcaller": "manta",
              "do_upload": final_vcf not in uploaded,  # only upload a single file per batch
              "vrn_file": final_vcf}
        evidence_bam = _get_evidence_bam(work_dir, sample)
        if evidence_bam:
            vc["read_evidence"] = evidence_bam
        sample["sv"].append(vc)
        uploaded.add(final_vcf)
        out.append(sample)
    return out
def gatk_indel_hard(in_file, data):
    """Perform hard filtering on GATK indels using best-practice recommendations.

    The QD and FS filters are only applied when gvcf is not in tools_on.
    """
    gvcf_output = "gvcf" in dd.get_tools_on(data)
    filters = ["ReadPosRankSum < -20.0"]
    if not gvcf_output:
        filters.extend(["QD < 2.0", "FS > 200.0"])
    expression = 'TYPE="indel" && (%s)' % " || ".join(filters)
    return hard_w_expression(in_file, expression, data, "GATKHardIndel", "INDEL")
def _bcftools_stats(data, out_dir, vcf_file_key=None, germline=False):
    """Run bcftools stats.

    Writes <sample>_bcftools_stats[_germline].txt in out_dir for the active
    variant calls. Statistics are restricted to PASS and unfiltered calls
    unless the sample uses joint calling or gVCF output. In the copied output,
    the last column of the line starting with "ID" is replaced with the sample
    name.
    """
    vcinfo = get_active_vcinfo(data)
    if not vcinfo:
        return None
    out_dir = utils.safe_makedir(out_dir)
    vcf_file = vcinfo[vcf_file_key or "vrn_file"]
    use_all_calls = dd.get_jointcaller(data) or "gvcf" in dd.get_tools_on(data)
    opts = "" if use_all_calls else "-f PASS,."
    name = dd.get_sample_name(data)
    suffix = "_germline" if germline else ""
    out_file = os.path.join(out_dir, "%s_bcftools_stats%s.txt" % (name, suffix))
    bcftools = config_utils.get_program("bcftools", data["config"])
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            tx_dir, tx_base = os.path.split(tx_out_file)
            orig_out_file = os.path.join(tx_dir, "orig_%s" % tx_base)
            cmd = "{bcftools} stats -s {name} {opts} {vcf_file} > {orig_out_file}".format(
                bcftools=bcftools, name=name, opts=opts, vcf_file=vcf_file,
                orig_out_file=orig_out_file)
            do.run(cmd, "bcftools stats %s" % name)
            with open(orig_out_file) as in_handle:
                with open(tx_out_file, "w") as out_handle:
                    for line in in_handle:
                        if line.startswith("ID\t"):
                            parts = line.split("\t")
                            parts[-1] = "%s\n" % name
                            line = "\t".join(parts)
                        out_handle.write(line)
    return out_file
def get_gatk_version(self):
    """Retrieve GATK version, handling locally and config cached versions.

    Calling version can be expensive due to all the startup and shutdown
    of JVMs, so we prefer cached version information.
    """
    if self._gatk_version is None:
        self._set_default_versions(self._config)

    if "gatk4" in dd.get_tools_on({"config": self._config}):
        # In cases where we don't have manifest versions. Not possible to get
        # version from commandline with GATK4 alpha version
        if self._gatk4_version is None:
            self._gatk4_version = "4.0"
        return self._gatk4_version

    if self._gatk_version is not None:
        return self._gatk_version

    # No cached version available: query it, using the jar unless a conda wrapper exists
    gatk_jar = None
    if not self._has_gatk_conda_wrapper():
        gatk_jar = self._get_jar("GenomeAnalysisTK", ["GenomeAnalysisTKLite"], allow_missing=True)
    self._gatk_version = get_gatk_version(gatk_jar, config=self._config)
    return self._gatk_version
def gatk_rnaseq_calling(data):
    """Use GATK to perform gVCF variant calling on RNA-seq data

    Works on a copy of the sample with gvcf added to tools_on and joint
    callers set from the variant callers. HaplotypeCaller runs per split
    region and the region calls are concatenated into one VCF, which is set
    as the variant file of the returned sample.
    """
    from bcbio.bam import callable
    data = utils.deepish_copy(data)
    tools_on = dd.get_tools_on(data) or []
    tools_on.append("gvcf")
    data = dd.set_tools_on(data, tools_on)
    joint_callers = ["%s-joint" % v for v in dd.get_variantcaller(data)]
    data = dd.set_jointcaller(data, joint_callers)
    out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data),
                                              "variation", "rnaseq", "gatk-haplotype"))
    data = _setup_variant_regions(data, out_dir)
    out_file = os.path.join(out_dir, "%s-gatk-haplotype.vcf.gz" % dd.get_sample_name(data))
    if utils.file_exists(out_file):
        return dd.set_vrn_file(data, out_file)

    region_files = []
    regions = []
    for cur_region in callable.get_split_regions(dd.get_variant_regions(data), data):
        str_region = "_".join(str(x) for x in cur_region)
        region_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "variation", "rnaseq",
                                                     "gatk-haplotype", "regions"))
        region_file = os.path.join(region_dir, "%s-%s-gatk-haplotype.vcf.gz"
                                   % (dd.get_sample_name(data), str_region))
        region_file = gatk.haplotype_caller([dd.get_split_bam(data)], [data],
                                            dd.get_ref_file(data), {},
                                            region=cur_region, out_file=region_file)
        region_files.append(region_file)
        regions.append(cur_region)
    out_file = vcfutils.concat_variant_files(region_files, out_file, regions,
                                             dd.get_ref_file(data), data["config"])
    return dd.set_vrn_file(data, out_file)
def gatk_snp_hard(in_file, data):
    """Perform hard filtering on GATK SNPs using best-practice recommendations.

    We have a more lenient mapping quality (MQ) filter compared to GATK defaults.
    The recommended filter (MQ < 40) is too stringent, so we adjust to 30:
    http://imgur.com/a/oHRVB

    QD and FS are not calculated when generating gVCF output:
    https://github.com/broadgsa/gatk-protected/blob/e91472ddc7d58ace52db0cab4d70a072a918d64c/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/haplotypecaller/HaplotypeCaller.java#L300

    The extra command removes escaped quotes in the VCF output which
    pyVCF fails on.
    """
    filters = ["MQ < 30.0", "MQRankSum < -12.5", "ReadPosRankSum < -8.0"]
    gvcf_output = "gvcf" in dd.get_tools_on(data)
    if not gvcf_output:
        filters.extend(["QD < 2.0", "FS > 60.0"])
    # GATK Haplotype caller (v2.2) appears to have much larger HaplotypeScores
    # resulting in excessive filtering, so avoid this metric
    variantcaller = utils.get_in(data, ("config", "algorithm", "variantcaller"))
    if variantcaller not in ["gatk-haplotype"]:
        filters.append("HaplotypeScore > 13.0")
    expression = 'TYPE="snp" && (%s)' % " || ".join(filters)
    return hard_w_expression(in_file, expression, data, "GATKHardSNP", "SNP",
                             extra_cmd=r"""| sed 's/\\"//g'""")
def prep_gemini_db(fnames, call_info, samples, extras):
    """Prepare a gemini database from VCF inputs prepared with snpEff.

    The multisample VCF is normalized only when a database build is
    configured, then annotated with vcfanno. Returns a single item list of
    [(name, caller), outputs] where outputs holds the database (None if it
    was not created), the VCF to pass downstream and whether that VCF was
    decomposed.
    """
    data = samples[0]
    name, caller, is_batch = call_info
    build_type = _get_build_type(fnames, samples, caller)
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    gemini_vcf = get_multisample_vcf(fnames, name, caller, data)
    # If we're building a gemini database, normalize the inputs
    decomposed = bool(build_type)
    if decomposed:
        passonly = not any("gemini_allvariants" in dd.get_tools_on(d) for d in samples)
        gemini_vcf = normalize.normalize(gemini_vcf, data, passonly=passonly)
    ann_vcf = run_vcfanno(gemini_vcf, data, decomposed)
    gemini_db = os.path.join(out_dir, "%s-%s.db" % (name, caller))
    if ann_vcf and build_type and not utils.file_exists(gemini_db):
        ped_file = create_ped_file(samples + extras, gemini_vcf)
        # Original approach for hg19/GRCh37
        if vcfanno.is_human(data, builds=["37"]) and "gemini_orig" in build_type:
            gemini_db = create_gemini_db_orig(gemini_vcf, data, gemini_db, ped_file)
        else:
            gemini_db = create_gemini_db(ann_vcf, data, gemini_db, ped_file)
    # only pass along gemini_vcf_downstream if uniquely created here
    if os.path.islink(gemini_vcf):
        gemini_vcf = None
    outputs = {"db": gemini_db if utils.file_exists(gemini_db) else None,
               "vcf": ann_vcf or gemini_vcf,
               "decomposed": decomposed}
    return [[(name, caller), outputs]]
def run(bam_file, data, out_dir):
    """Generate Picard metrics for a BAM file when picard is in tools_on.

    Returns an empty dictionary when picard is not enabled, otherwise the
    path to the <sample>-sort.hs_metrics file in out_dir. An existing metrics
    file is reused without rerunning.
    """
    config = data["config"]
    if "picard" not in dd.get_tools_on(data):
        return {}
    ref_file = dd.get_ref_file(data)
    sample = dd.get_sample_name(data)
    target_file = dd.get_variant_regions(data)
    broad_runner = broad.PicardCmdRunner("picard", config)
    bam_fname = os.path.abspath(bam_file)
    utils.safe_makedir(out_dir)
    hsmetric_file = os.path.join(out_dir, "%s-sort.hs_metrics" % sample)
    if utils.file_exists(hsmetric_file):
        return hsmetric_file
    with utils.chdir(out_dir):
        with tx_tmpdir() as tmp_dir:
            # Work on a symlink inside the output directory
            cur_bam = os.path.basename(bam_fname)
            if not os.path.exists(cur_bam):
                os.symlink(bam_fname, cur_bam)
            gen_metrics = PicardMetrics(broad_runner, tmp_dir)
            gen_metrics.report(cur_bam, ref_file, bam.is_paired(bam_fname),
                               target_file, target_file, None, config)
    # Strip the BAM suffix from the metrics file contents
    do.run("sed -i 's/-sort.bam//g' %s" % hsmetric_file, "")
    return hsmetric_file
def _freebayes_options_from_config(items, config, out_file, region=None):
    """Prepare standard options from configuration input.

    Input BED target files are merged to avoid overlapping regions which
    cause FreeBayes to call multiple times.

    Checks for empty sets of target regions after filtering for high depth,
    in which case we should skip the FreeBayes run. Returns the options along
    with a flag marking that empty case.
    """
    opts = ["--genotype-qualities"]
    opts.extend(["--ploidy", str(ploidy.get_ploidy(items, region))])
    configured_regions = utils.get_in(config, ("algorithm", "variant_regions"))
    variant_regions = bedutils.merge_overlaps(configured_regions, items[0])
    # Produce gVCF output
    gvcf_requested = any("gvcf" in dd.get_tools_on(d) for d in items)
    if gvcf_requested:
        opts.extend(["--gvcf", "--gvcf-chunk", "50000"])
    no_target_regions = False
    target = shared.subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            genome_coverage = any(
                tz.get_in(["config", "algorithm", "coverage_interval"], x, "").lower() == "genome"
                for x in items)
            if genome_coverage:
                target = shared.remove_highdepth_regions(target, items)
                no_target_regions = os.path.getsize(target) == 0
            opts.extend(["--targets", target])
        else:
            opts.extend(["--region", region_to_freebayes(target)])
    resources = config_utils.get_resources("freebayes", config)
    extra_opts = resources.get("options")
    if extra_opts:
        opts += extra_opts
    return opts, no_target_regions
def prep_gemini_db(fnames, call_info, samples, extras):
    """Prepare a gemini database from VCF inputs prepared with snpEff.

    The database is built only when configured for the samples, at least one
    input has variants and it does not already exist. Returns a single item
    list of [(name, caller), outputs] with the database (None if missing) and
    the multisample VCF, which is only passed along for batches.
    """
    data = samples[0]
    use_gemini = do_db_build(samples) and any(vcfutils.vcf_has_variants(f) for f in fnames)
    name, caller, is_batch = call_info
    out_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "gemini"))
    multisample_vcf = get_multisample_vcf(fnames, name, caller, data)
    gemini_db = os.path.join(out_dir, "%s-%s.db" % (name, caller))
    if not utils.file_exists(gemini_db) and use_gemini:
        passonly = not any("gemini_allvariants" in dd.get_tools_on(d) for d in samples)
        gemini_vcf = multiallelic.to_single(multisample_vcf, data, passonly=passonly)
        ped_file = create_ped_file(samples + extras, gemini_vcf)
        # Use original approach for hg19/GRCh37 pending additional testing
        use_orig = support_gemini_orig(data) and not any(dd.get_vcfanno(d) for d in samples)
        build_db = create_gemini_db_orig if use_orig else create_gemini_db
        gemini_db = build_db(gemini_vcf, data, gemini_db, ped_file)
    outputs = {"db": gemini_db if utils.file_exists(gemini_db) else None,
               "vcf": multisample_vcf if is_batch else None}
    return [[(name, caller), outputs]]
def run_vep(in_file, data):
    """Annotate input VCF file with Ensembl variant effect predictor.

    Returns None when the input has no variants. Otherwise runs VEP when the
    annotated file is missing and a VEP cache is available, and returns the
    bgzipped and indexed output file once it exists.
    """
    if not vcfutils.vcf_has_variants(in_file):
        return None
    out_file = utils.append_stem(in_file, "-vepeffects")
    assert in_file.endswith(".gz") and out_file.endswith(".gz")
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            vep_dir, ensembl_name = prep_vep_cache(data["genome_build"],
                                                   tz.get_in(["reference", "fasta", "base"], data))
            if vep_dir:
                cores = tz.get_in(("config", "algorithm", "num_cores"), data, 1)
                fork_args = ["--fork", str(cores)] if cores > 1 else []
                vep = config_utils.get_program("vep", data["config"])
                is_human = tz.get_in(["genome_resources", "aliases", "human"], data, False)
                # HGVS requires a bgzip compressed, faidx indexed input file or is unusable slow
                compressed_ref = dd.get_ref_file_compressed(data)
                hgvs_compatible = bool(compressed_ref)
                if hgvs_compatible:
                    config_args = ["--fasta", dd.get_ref_file_compressed(data)]
                else:
                    config_args = ["--fasta", dd.get_ref_file(data)]
                if is_human:
                    plugin_fns = {"loftee": _get_loftee, "maxentscan": _get_maxentscan,
                                  "genesplicer": _get_genesplicer, "spliceregion": _get_spliceregion}
                    plugins = ["loftee"]
                    if "vep_splicesite_annotations" in dd.get_tools_on(data):
                        # "genesplicer" too unstable so currently removed
                        plugins.extend(["maxentscan", "spliceregion"])
                    for plugin in plugins:
                        config_args += plugin_fns[plugin](data)
                    config_args.extend(["--sift", "b", "--polyphen", "b"])
                    if hgvs_compatible:
                        config_args.extend(["--hgvs", "--shift_hgvs", "1"])
                pick_allele = (dd.get_effects_transcripts(data).startswith("canonical")
                               or tz.get_in(("config", "algorithm", "clinical_reporting"), data))
                if pick_allele:
                    config_args.append("--pick_allele")
                if ensembl_name.endswith("_merged"):
                    config_args.append("--merged")
                    ensembl_name = ensembl_name.replace("_merged", "")
                resources = config_utils.get_resources("vep", data["config"])
                extra_args = [str(x) for x in resources.get("options", [])]
                annotation_args = ["--species", ensembl_name,
                                   "--no_stats", "--cache", "--offline", "--dir", vep_dir,
                                   "--symbol", "--numbers", "--biotype", "--total_length",
                                   "--canonical", "--gene_phenotype", "--ccds", "--uniprot",
                                   "--domains", "--regulatory", "--protein", "--tsl", "--appris",
                                   "--af", "--max_af", "--af_1kg", "--af_esp", "--af_gnomad",
                                   "--pubmed", "--variant_class", "--allele_number"]
                cmd = ([vep, "--vcf", "-o", "stdout", "-i", in_file]
                       + fork_args + extra_args + annotation_args + config_args)
                perl_exports = utils.get_perl_exports()
                # Remove empty fields (';;') which can cause parsing errors downstream
                cmd = "%s && %s | sed '/^#/! s/;;/;/g' | bgzip -c > %s" % (perl_exports, " ".join(cmd),
                                                                          tx_out_file)
                do.run(cmd, "Ensembl variant effect predictor", data)
    if utils.file_exists(out_file):
        vcfutils.bgzip_and_index(out_file, data["config"])
        return out_file
def gatk_indel_cutoff(in_file, data):
    """Perform cutoff-based soft filtering on GATK indels using best-practice recommendations.

    The QD, FS and SOR cutoffs are skipped when gvcf is in tools_on. The extra
    sed command strips escaped quotes from the output.
    """
    filters = ["ReadPosRankSum < -20.0"]
    gvcf_output = "gvcf" in dd.get_tools_on(data)
    if not gvcf_output:
        filters.extend(["QD < 2.0", "FS > 200.0", "SOR > 10.0"])
    expression = 'TYPE="indel" && (%s)' % " || ".join(filters)
    return cutoff_w_expression(in_file, expression, data, "GATKCutoffIndel", "INDEL",
                               extra_cmd=r"""| sed 's/\\"//g'""")
def gatk_indel_cutoff(in_file, data):
    """Perform cutoff-based soft filtering on GATK indels using best-practice recommendations.

    The QD and FS cutoffs are only added when gvcf is not in tools_on. The
    extra sed command strips escaped quotes from the output.
    """
    extra_filters = [] if "gvcf" in dd.get_tools_on(data) else ["QD < 2.0", "FS > 200.0"]
    filters = ["ReadPosRankSum < -20.0"] + extra_filters
    expression = 'TYPE="indel" && (%s)' % " || ".join(filters)
    return cutoff_w_expression(in_file, expression, data, "GATKCutoffIndel", "INDEL",
                               extra_cmd=r"""| sed 's/\\"//g'""")
def haplotype_caller(align_bams, items, ref_file, assoc_files,
                     region=None, out_file=None):
    """Call variation with GATK's HaplotypeCaller.

    This requires the full non open-source version of GATK.

    align_bams -- BAM files to call on; the first names the default output.
    items -- samples being called; the first supplies configuration and cores.
    region -- optional region to restrict calling to.
    out_file -- output VCF, defaulting to <first BAM>-variants.vcf.gz.

    Uses HaplotypeCallerSpark for multicore GATK4 runs and emits gVCF output
    for joint calling or when gvcf is in tools_on. Returns the result of
    bgzipping and indexing the output file.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        num_cores = dd.get_num_cores(items[0])
        broad_runner, params = \
            _shared_gatk_call_prep(align_bams, items, ref_file, region, out_file, num_cores)
        gatk_type = broad_runner.gatk_type()
        assert gatk_type in ["restricted", "gatk4"], \
            "Require full version of GATK 2.4+, or GATK4 for haplotype calling"
        with file_transaction(items[0], out_file) as tx_out_file:
            if num_cores > 1 and gatk_type == "gatk4":
                params += ["-T", "HaplotypeCallerSpark", "--sparkMaster", "local[%s]" % num_cores,
                           "--conf", "spark.local.dir=%s" % os.path.dirname(tx_out_file)]
            else:
                params += ["-T", "HaplotypeCaller"]
            params += ["--annotation", "ClippingRankSumTest",
                       "--annotation", "DepthPerSampleHC"]
            if gatk_type == "gatk4":
                params += ["--output", tx_out_file]
            else:
                params += ["-o", tx_out_file]
            # Enable hardware based optimizations in GATK 3.1+
            if LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.1"):
                # GATK4 selects the right HMM optimization automatically with FASTEST_AVAILABLE
                if not gatk_type == "gatk4" and _supports_avx():
                    params += ["--pair_hmm_implementation", "VECTOR_LOGLESS_CACHING"]
            # Prepare gVCFs if doing joint calling
            is_joint = False
            if _joint_calling(items) or any("gvcf" in dd.get_tools_on(d) for d in items):
                is_joint = True
                params += ["--emitRefConfidence", "GVCF"]
                if not gatk_type == "gatk4":
                    params += ["--variant_index_type", "LINEAR", "--variant_index_parameter", "128000"]
                # Set GQ banding to not be single GQ resolution
                # No recommended default but try to balance resolution and size
                # http://gatkforums.broadinstitute.org/gatk/discussion/7051/recommendation-best-practices-gvcf-gq-bands
                for boundary in [10, 20, 30, 40, 60, 80]:
                    params += ["-GQB", str(boundary)]
            # Enable non-diploid calling in GATK 3.3+
            if LooseVersion(broad_runner.gatk_major_version()) >= LooseVersion("3.3"):
                # GenomicsDB does not support non-diploid samples in GATK4 joint calling
                # https://gatkforums.broadinstitute.org/gatk/discussion/10061/using-genomicsdbimport-to-prepare-gvcfs-for-input-to-genotypegvcfs-in-gatk4
                # Only the GATK4 joint calling combination is excluded. Without
                # the parentheses `not` binds only to is_joint, which dropped
                # ploidy for every GATK3 run.
                if not (is_joint and gatk_type == "gatk4"):
                    params += ["-ploidy", str(ploidy.get_ploidy(items, region))]
            resources = config_utils.get_resources("gatk-haplotype", items[0]["config"])
            if "options" in resources:
                params += [str(x) for x in resources.get("options", [])]
            broad_runner.new_resources("gatk-haplotype")
            # Scale memory with the number of cores for multicore runs
            memscale = {"magnitude": 0.9 * num_cores, "direction": "increase"} if num_cores > 1 else None
            broad_runner.run_gatk(params, os.path.dirname(tx_out_file), memscale=memscale,
                                  parallel_gc=(num_cores > 1 and gatk_type == "gatk4"))
    return vcfutils.bgzip_and_index(out_file, items[0]["config"])
def _variant_checkpoints(samples):
    """Check sample configuration to identify required steps in analysis.

    Returns a dictionary of booleans keyed by step: vc (variant calling),
    sv (structural variants), jointvc (joint calling or gVCF output) and
    hla (HLA calling). A step is on when any sample configures it.
    """
    return {
        "vc": any(dd.get_variantcaller(d) for d in samples),
        "sv": any(dd.get_svcaller(d) for d in samples),
        "jointvc": any(dd.get_jointcaller(d) or ("gvcf" in dd.get_tools_on(d)) for d in samples),
        "hla": any(dd.get_hlacaller(d) for d in samples),
    }
def cl_gatk(self, params, tmp_dir, memscale=None, parallel_gc=False):
    """Prepare a GATK command line string for the given parameters.

    params must name the tool with -T or --analysis_type. For GATK4 the tool
    name is moved to the front of the arguments. Multicore flags, lenient VCF
    processing and read filters are added for older GATK versions, and JVM
    options are scaled by memscale when given.

    Raises ValueError when neither a GATK jar nor a gatk wrapper command is
    available.
    """
    missing_msg = ("GATK processing requested but gatk or older jar install not found: "
                   "http://bcbio-nextgen.readthedocs.io/en/latest/contents/"
                   "installation.html#gatk-and-mutect-mutect2")
    support_nt = set()
    support_nct = set(["BaseRecalibrator"])
    if self._has_gatk_conda_wrapper():
        gatk_jar = None
    else:
        gatk_jar = self._get_jar("GenomeAnalysisTK", ["GenomeAnalysisTKLite"], allow_missing=True)
        if not gatk_jar:
            raise ValueError(missing_msg)
    is_gatk4 = "gatk4" in dd.get_tools_on({"config": self._config})
    cores = self._config["algorithm"].get("num_cores", 1)
    config = self._config
    if "-T" in params:
        atype_index = params.index("-T")
    else:
        atype_index = params.index("--analysis_type")
    prog = params[atype_index + 1]
    # For GATK4 specify command first, so swap params to accomplish
    if is_gatk4:
        params = [prog] + params[:atype_index] + params[atype_index + 2:]
    if cores and int(cores) > 1:
        if prog in support_nt:
            params.extend(["-nt", str(cores)])
        elif prog in support_nct:
            params.extend(["-nct", str(cores)])
            # Also stored on the configuration as the memory adjustment
            memscale = config["algorithm"]["memory_adjust"] = {"direction": "increase",
                                                               "magnitude": max(1, int(cores) // 2)}
    # Filters and unsafe specifications not in GATK4
    if LooseVersion(self.gatk_major_version()) > LooseVersion("1.9") and not is_gatk4:
        unsafe_flags = [x for x in params if x.startswith(("-U", "--unsafe"))]
        if not unsafe_flags:
            params.extend(["-U", "LENIENT_VCF_PROCESSING"])
        params.extend(["--read_filter", "BadCigar", "--read_filter", "NotPrimaryAlignment"])
    if memscale:
        jvm_opts = get_gatk_opts(config, tmp_dir=tmp_dir, memscale=memscale, include_gatk=False,
                                 parallel_gc=parallel_gc)
    else:
        # Decrease memory slightly from configuration to avoid memory allocation errors
        decrease = {"algorithm": {"memory_adjust": {"magnitude": 1.1, "direction": "decrease"}}}
        jvm_opts = config_utils.adjust_opts(self._jvm_opts, decrease)
        jvm_opts += get_default_jvm_opts(tmp_dir, parallel_gc=parallel_gc)
    if "keyfile" in self._gatk_resources:
        params = ["-et", "NO_ET", "-K", self._gatk_resources["keyfile"]] + params
    if gatk_jar:
        return " ".join(["java"] + jvm_opts + ["-jar", gatk_jar] + [str(x) for x in params])
    cmd = gatk_cmd("gatk", jvm_opts, params, config=self._config)
    if not cmd:
        raise ValueError(missing_msg)
    return cmd
def run(items):
    """Detect structural variants with lumpy for a group of samples.

    Requires bwa-mem style alignments (or no aligner specified). Runs lumpy
    jointly over all samples, genotypes each sample with svtyper, filters by
    support and, for tumor/normal pairs, by the normal background.

    Returns the input items, each with a lumpy entry appended to its "sv" list.
    """
    supported_aligners = ["bwa", "sentieon-bwa", False, None]
    for data in items:
        if utils.get_in(data, ("config", "algorithm", "aligner")) not in supported_aligners:
            raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items)
    work_dir = _sv_workdir(paired.tumor_data if paired and paired.tumor_data else items[0])

    # Collect per-sample alignment inputs and any prior CNV evidence
    previous_evidence = {}
    full_bams = []
    sr_bams = []
    disc_bams = []
    for data in items:
        sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir)
        full_bams.append(dd.get_align_bam(data))
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
        cur_dels, cur_dups = _bedpes_from_cnv_caller(data, work_dir)
        evidence = {}
        previous_evidence[dd.get_sample_name(data)] = evidence
        if cur_dels and utils.file_exists(cur_dels):
            evidence["dels"] = cur_dels
        if cur_dups and utils.file_exists(cur_dups):
            evidence["dups"] = cur_dups

    lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, previous_evidence,
                                         work_dir, items)

    # Genotype each sample; breakends are only genotyped when requested
    gt_vcfs = {}
    for data in items:
        sample = dd.get_sample_name(data)
        sample_vcf = vcfutils.select_sample(lumpy_vcf, sample,
                                            utils.append_stem(lumpy_vcf, "-%s" % sample),
                                            data["config"])
        if "bnd-genotype" not in dd.get_tools_on(data):
            std_vcf, bnd_vcf = _split_breakends(sample_vcf, data)
            std_gt_vcf = _run_svtyper(std_vcf, dd.get_align_bam(data), exclude_file, data)
            combined_name = "%s-combined.vcf.gz" % utils.splitext_plus(std_gt_vcf)[0]
            gt_vcf = vcfutils.concat_variant_files_bcftools(orig_files=[std_gt_vcf, bnd_vcf],
                                                            out_file=combined_name,
                                                            config=data["config"])
        else:
            gt_vcf = _run_svtyper(sample_vcf, dd.get_align_bam(data), exclude_file, data)
        gt_vcfs[dd.get_sample_name(data)] = _filter_by_support(gt_vcf, data)
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background([paired.tumor_name], [paired.normal_name], gt_vcfs,
                                        paired.tumor_data)

    # Attach final calls, with effects annotation when prioritization is on
    out = []
    for data in items:
        data.setdefault("sv", [])
        vcf_file = gt_vcfs[dd.get_sample_name(data)]
        vrn_file = vcf_file
        if dd.get_svprioritize(data):
            effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
            vrn_file = effects_vcf or vcf_file
        data["sv"].append({"variantcaller": "lumpy",
                           "vrn_file": vrn_file,
                           "exclude_file": exclude_file})
        out.append(data)
    return out
def _has_gatk_conda_wrapper(self):
    """Check whether a usable ``gatk`` wrapper command is available.

    Returns False when ``gatk_cmd`` yields no command. With "gatk4" in
    tools_on the presence of the command is sufficient. Otherwise the command
    is run with ``--version`` and must both succeed and not report a missing
    GATK jar.
    """
    cmd = gatk_cmd("gatk", [], ["--version"], config=self._config)
    if not cmd:
        return False
    if "gatk4" in dd.get_tools_on({"config": self._config}):
        return True
    try:
        # universal_newlines gives text output on Python 2 and 3, so the
        # substring check below does not mix bytes and str.
        stdout = subprocess.check_output(cmd, stderr=subprocess.STDOUT, shell=True,
                                         universal_newlines=True)
    except subprocess.CalledProcessError:
        return False
    return "GATK jar file not found" not in stdout
def make_quality_report(data):
    """Draft the bcbioRNASeq quality control report for a sample.

    Returns ``data`` unchanged when "bcbiornaseq" is not in the sample's
    tools_on. Otherwise creates a ``bcbioRNASeq`` directory under the configured
    upload directory and drafts ``quality_control.Rmd`` there.
    """
    if "bcbiornaseq" not in dd.get_tools_on(data):
        return data
    upload_dir = tz.get_in(("upload", "dir"), data)
    report_dir = os.path.join(upload_dir, "bcbioRNASeq")
    safe_makedir(report_dir)
    quality_rmd = os.path.join(report_dir, "quality_control.Rmd")
    # NOTE(review): quality_html is computed but never used below, and this path
    # ends without returning data (unlike the early exit above) -- the rendering
    # step and final return look like they are missing; confirm.
    quality_html = os.path.join(report_dir, "quality_control.html")
    # Presumably "quality_control" is the template and "bcbioRNASeq" the R
    # package providing it -- verify against rmarkdown_draft.
    quality_rmd = rmarkdown_draft(quality_rmd, "quality_control", "bcbioRNASeq")
def haplotype_caller(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Run GATK HaplotypeCaller on the input BAMs, returning the output VCF path.

    Needs the full (restricted) GATK distribution. An existing output file is
    reused without re-running. gVCF output is produced for joint calling or
    when "gvcf" is in tools_on for any sample.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if utils.file_exists(out_file):
        return out_file

    broad_runner, params = _shared_gatk_call_prep(align_bams, items, ref_file,
                                                  assoc_files.get("dbsnp"), region, out_file)
    assert broad_runner.gatk_type() == "restricted", \
        "Require full version of GATK 2.4+ for haplotype calling"
    with file_transaction(items[0], out_file) as tx_out_file:
        params += ["-T", "HaplotypeCaller",
                   "-o", tx_out_file,
                   "--annotation", "ClippingRankSumTest",
                   "--annotation", "DepthPerSampleHC"]
        gatk_version = LooseVersion(broad_runner.gatk_major_version())
        # Enable hardware based optimizations in GATK 3.1+
        if gatk_version >= LooseVersion("3.1"):
            params += ["--pair_hmm_implementation", "VECTOR_LOGLESS_CACHING"]
        # Enable non-diploid calling in GATK 3.3+
        if gatk_version >= LooseVersion("3.3"):
            params += ["-ploidy", str(ploidy.get_ploidy(items, region))]
        # Prepare gVCFs if doing joint calling
        want_gvcf = _joint_calling(items) or any("gvcf" in dd.get_tools_on(d) for d in items)
        if want_gvcf:
            params += ["--emitRefConfidence", "GVCF",
                       "--variant_index_type", "LINEAR",
                       "--variant_index_parameter", "128000"]
            # Set GQ banding to not be single GQ resolution
            # No recommended default but try to balance resolution and size
            # http://gatkforums.broadinstitute.org/gatk/discussion/7051/recommendation-best-practices-gvcf-gq-bands
            gq_boundaries = (10, 20, 30, 40, 60, 80)
            params += [arg for boundary in gq_boundaries for arg in ("-GQB", str(boundary))]
        resources = config_utils.get_resources("gatk-haplotype", items[0]["config"])
        if "options" in resources:
            params += [str(opt) for opt in resources["options"]]
        broad_runner.new_resources("gatk-haplotype")
        broad_runner.run_gatk(params)
    return out_file
def platypus(in_file, data):
    """Apply depth and quality based filtering to Platypus calls.

    Replaces the built-in Q20 hard filter with combined depth/quality cutoffs.
    Platypus names differ from standard VCF: TC == DP, FR == AF.

    Platypus gVCF output appears to have an 0/1 index problem, placing
    reference blocks 1 base outside regions of interest, so regions are not
    limited during filtering when gVCF output is enabled.
    """
    filter_terms = ["(FR[0] <= 0.5 && TC < 4 && %QUAL < 20)",
                    "(TC < 13 && %QUAL < 10)",
                    "(FR[0] > 0.5 && TC < 4 && %QUAL < 50)"]
    filters = " || ".join(filter_terms)
    if "gvcf" in dd.get_tools_on(data):
        limit_regions = None
    else:
        limit_regions = "variant_regions"
    return hard_w_expression(in_file, filters, data, name="PlatQualDepth",
                             extra_cmd="| sed 's/\\tQ20\\t/\\tPASS\\t/'",
                             limit_regions=limit_regions)
def finalize_sv(samples, config, initial_only=False):
    """Merge per-caller structural variant results into one "sv" entry per sample.

    Samples are grouped by alignment BAM and their calls combined in the
    configured svcaller order. Unless ``initial_only`` is set, ensemble
    callers, summarization and validation are applied. Plots are added for
    batches requesting "svplots". Each sample is emitted once, as a
    single-item list, under its lead batch.
    """
    def _active_caller_position(sample):
        active = sample["config"]["algorithm"]["svcaller_active"]
        return _get_svcallers(sample).index(active)

    by_bam = collections.OrderedDict()
    for sample in samples:
        by_bam.setdefault(sample["align_bam"], []).append(sample)

    by_batch = collections.OrderedDict()
    lead_batches = {}
    for grouped_calls in by_bam.values():
        with_sv = [sample for sample in grouped_calls if "sv" in sample]
        sorted_svcalls = sorted(with_sv, key=_active_caller_position)
        final = grouped_calls[0]
        if sorted_svcalls:
            final_calls = reduce(operator.add, [sample["sv"] for sample in sorted_svcalls])
            if not initial_only:
                for caller in _get_svcallers(final):
                    if caller in _ENSEMBLE_CALLERS:
                        final_calls = _ENSEMBLE_CALLERS[caller](final_calls, final)
                final_calls = ensemble.summarize(final_calls, final, grouped_calls)
                final_calls = validate.evaluate(final, final_calls)
            final["sv"] = final_calls
        del final["config"]["algorithm"]["svcaller_active"]
        batch_info = dd.get_batch(final) or dd.get_sample_name(final)
        if isinstance(batch_info, (list, tuple)):
            batches = batch_info
        else:
            batches = [batch_info]
        # The first batch is the one the sample gets reported under
        lead_batches[dd.get_sample_name(final)] = batches[0]
        for batch_name in batches:
            by_batch.setdefault(batch_name, []).append(final)

    out = []
    for batch_name, items in by_batch.items():
        wants_plots = any("svplots" in dd.get_tools_on(d) for d in items)
        plot_items = plot.by_regions(items) if wants_plots else items
        out.extend([data] for data in plot_items
                   if lead_batches[dd.get_sample_name(data)] == batch_name)
    return out
def create_gemini_db(gemini_vcf, data, gemini_db=None, ped_file=None):
    """Load an annotated VCF into a GEMINI database using vcf2db.

    The database path defaults to the VCF base name with a ``.db`` extension.
    Returns None when the VCF has no variants, otherwise the database path.
    An existing database is reused.
    """
    if not gemini_db:
        gemini_db = "%s.db" % utils.splitext_plus(gemini_vcf)[0]
    if not vcfutils.vcf_has_variants(gemini_vcf):
        return None
    if utils.file_exists(gemini_db):
        return gemini_db

    with file_transaction(data, gemini_db) as tx_gemini_db:
        vcf2db = config_utils.get_program("vcf2db.py", data)
        expand_fields = []
        if "vcf2db_expand" in dd.get_tools_on(data):
            expand_fields = ["gt_types", "gt_ref_depths", "gt_alt_depths"]
        cmd = [vcf2db, gemini_vcf, ped_file, tx_gemini_db]
        for field in expand_fields:
            cmd += ["--expand", field]
        do.run(cmd, "GEMINI: create database with vcf2db")
    return gemini_db
def get_qc_tools(data):
    """Determine the QC tools to run for a sample.

    An explicit QC list in the algorithm configuration takes precedence.
    Otherwise tools are chosen from the analysis type, tools_on and other
    configuration, anything in tools_off is dropped, and the result is sorted.
    """
    configured = dd.get_algorithm_qc(data)
    if configured:
        return configured

    analysis = data["analysis"].lower()
    tools_on = dd.get_tools_on(data)
    tools_off = dd.get_tools_off(data)
    coverage_enabled = "coverage_qc" not in tools_off
    to_run = []

    if tz.get_in(["config", "algorithm", "kraken"], data):
        to_run.append("kraken")
    if "fastqc" not in tools_off:
        to_run.append("fastqc")
    if "qualimap" in tools_on or "qualimap_full" in tools_on:
        to_run.append("qualimap")
    is_rna = analysis.startswith("rna-seq") or analysis == "smallrna-seq"
    if is_rna and "qualimap" not in tools_off:
        if gtf.is_qualimap_compatible(dd.get_gtf_file(data)):
            to_run.append("qualimap_rnaseq")
        else:
            logger.debug("GTF not compatible with Qualimap, skipping.")
    if analysis.startswith("chip-seq"):
        to_run.append("chipqc")
    if analysis.startswith("smallrna-seq"):
        to_run.extend(["small-rna", "atropos"])
    if coverage_enabled:
        to_run.append("samtools")
    if analysis.startswith(("standard", "variant", "variant2")):
        if coverage_enabled:
            to_run.extend(["coverage", "picard"])
        to_run.extend(["qsignature", "variants"])
        if vcfanno.is_human(data):
            to_run.extend(["contamination", "peddy"])
        if vcfutils.get_paired_phenotype(data):
            to_run.append("viral")
        if damage.should_filter([data]):
            to_run.append("damage")
    if dd.get_umi_consensus(data):
        to_run.append("umi")
    if tz.get_in(["config", "algorithm", "preseq"], data):
        to_run.append("preseq")

    # Explicit tools_off always wins over anything selected above
    return sorted(tool for tool in to_run if tool not in tools_off)