def clean_inputs(data):
    """Clean BED input files to avoid overlapping segments that cause downstream issues.

    Pre-merges inputs to avoid needing to call multiple times during later parallel steps.
    """
    if not utils.get_in(data, ("config", "algorithm", "variant_regions_orig")):
        data["config"]["algorithm"]["variant_regions_orig"] = dd.get_variant_regions(data)
    clean_vr = clean_file(dd.get_variant_regions(data), data)
    merged_vr = merge_overlaps(clean_vr, data)
    data["config"]["algorithm"]["variant_regions"] = clean_vr
    data["config"]["algorithm"]["variant_regions_merged"] = merged_vr
    if dd.get_coverage(data):
        if not utils.get_in(data, ("config", "algorithm", "coverage_orig")):
            data["config"]["algorithm"]["coverage_orig"] = dd.get_coverage(data)
        clean_cov_bed = clean_file(dd.get_coverage(data), data, prefix="cov-", simple=True)
        merged_cov_bed = merge_overlaps(clean_cov_bed, data)
        data["config"]["algorithm"]["coverage"] = clean_cov_bed
        data["config"]["algorithm"]["coverage_merged"] = merged_cov_bed
    if 'seq2c' in get_svcallers(data):
        seq2c_ready_bed = prep_seq2c_bed(data)
        if not seq2c_ready_bed:
            logger.warning("Can't run Seq2C without a svregions or variant_regions BED file")
        else:
            data["config"]["algorithm"]["seq2c_bed_ready"] = seq2c_ready_bed
    return data

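# For reference: a minimal sketch of the sort-and-merge step that a helper like
# merge_overlaps() performs, assuming pybedtools is available. The real bcbio
# helper additionally handles file transactions, output naming and empty
# inputs; this hypothetical version only shows the core idea of collapsing
# overlapping BED intervals so downstream callers see each region exactly once.
import pybedtools

def _merge_overlaps_sketch(in_bed, out_bed):
    """Sort a BED file and merge overlapping intervals into out_bed."""
    pybedtools.BedTool(in_bed).sort().merge().saveas(out_bed)
    return out_bed
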
def _get_region_bed(region, items, out_file):
    """Retrieve BED file of regions to analyze, either single or multi-region.
    """
    variant_regions = bedutils.merge_overlaps(bedutils.population_variant_regions(items), items[0])
    target = shared.subset_variant_regions(variant_regions, region, out_file, items)
    if not target:
        raise ValueError("Need BED input for strelka2 regions: %s %s" % (region, target))
    if not isinstance(target, basestring) or not os.path.isfile(target):
        chrom, start, end = target
        target = "%s-regions.bed" % utils.splitext_plus(out_file)[0]
        with file_transaction(items[0], target) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                out_handle.write("%s\t%s\t%s\n" % (chrom, start, end))
    return bedutils.merge_overlaps(target, items[0], out_dir=os.path.dirname(out_file)) + ".gz"

def _get_target_access_files(cov_interval, data, work_dir):
    """Retrieve target and access files based on the type of data to process.

    pick targets, anti-targets and access files based on analysis type
    http://cnvkit.readthedocs.org/en/latest/nonhybrid.html
    """
    base_regions = regions.get_sv_bed(data)
    # if we don't have a configured BED or regions to use for SV calling
    if not base_regions:
        # For genome calls, subset to regions within 10kb of genes
        if cov_interval == "genome":
            base_regions = regions.get_sv_bed(data, "transcripts1e4", work_dir)
            if base_regions:
                base_regions = shared.remove_exclude_regions(base_regions, base_regions, [data])
        # Finally, default to the defined variant regions
        if not base_regions:
            base_regions = dd.get_variant_regions(data)
    target_bed = bedutils.merge_overlaps(base_regions, data, out_dir=work_dir)
    if cov_interval == "amplicon":
        return target_bed, target_bed
    elif cov_interval == "genome":
        return target_bed, target_bed
    else:
        access_file = _create_access_file(dd.get_ref_file(data), _sv_workdir(data), data)
        return target_bed, access_file

def run_haplotyper(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Call variants with Sentieon's haplotyper (GATK HaplotypeCaller like).
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        variant_regions = bedutils.merge_overlaps(dd.get_variant_regions(items[0]), items[0])
        target = shared.subset_variant_regions(variant_regions, region, out_file, items)
        interval = "--interval %s" % (target) if target else ""
        with file_transaction(items[0], out_file) as tx_out_file:
            dbsnp = "--dbsnp %s" % (assoc_files.get("dbsnp")) if "dbsnp" in assoc_files else ""
            bams = " ".join(["-i %s" % x for x in align_bams])
            license = _license_export(items[0])
            cmd = ("{license} sentieon driver -t 1 -r {ref_file} "
                   "{bams} {interval} --algo Haplotyper {dbsnp} {tx_out_file}")
            do.run(cmd.format(**locals()), "Sentieon Haplotyper")
    return out_file

def _regions_for_coverage(data, region, ref_file, out_file):
    """Retrieve BED file of regions we need to calculate coverage in.

    Checks for variant region specifications that do not overlap contigs
    (in which case we do not calculate coverage) and regions smaller than
    callable_min_size (in which case we assign everything as callable).
    callable_min_size avoids calculations for small chromosomes we won't
    split on later, saving computation and disk IO.
    """
    variant_regions = bedutils.merge_overlaps(dd.get_variant_regions(data), data)
    ready_region = shared.subset_variant_regions(variant_regions, region, out_file)
    custom_file = "%s-coverageregions.bed" % utils.splitext_plus(out_file)[0]
    region_size = _get_region_size(ref_file, data, region)
    if variant_regions is None and region_size is not None and region_size < dd.get_callable_min_size(data):
        coverage_str = "CALLABLE" if realign.has_aligned_reads(dd.get_work_bam(data), region) else "NO_COVERAGE"
        custom_file = _write_all_chrom_file(coverage_str, custom_file, ref_file, region, data)
        return custom_file, False
    elif not ready_region:
        get_ref_bedtool(ref_file, data["config"]).saveas(custom_file)
        return custom_file, True
    elif os.path.isfile(ready_region):
        return ready_region, True
    elif isinstance(ready_region, (list, tuple)):
        c, s, e = ready_region
        pybedtools.BedTool("%s\t%s\t%s\n" % (c, s, e), from_string=True).saveas(custom_file)
        return custom_file, True
    else:
        custom_file = _write_all_chrom_file("NO_COVERAGE", custom_file, ref_file, region, data)
        return custom_file, variant_regions is None

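# _write_all_chrom_file() is referenced above but not defined in this section.
# A plausible minimal sketch, assuming the same get_ref_bedtool() helper used
# by the other _regions_for_coverage variants below: label every reference
# interval in the region with a fixed callability status. The real helper is
# likely transaction-aware; this hypothetical reconstruction is illustrative.
def _write_all_chrom_file_sketch(coverage_str, custom_file, ref_file, region, data):
    with open(custom_file, "w") as out_handle:
        for feat in get_ref_bedtool(ref_file, data["config"], region):
            out_handle.write("%s\t%s\t%s\t%s\n" % (feat.chrom, feat.start, feat.end, coverage_str))
    return custom_file
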
def run(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.
    """
    # Qualimap results should be saved to a directory named after sample.
    # MultiQC (for parsing additional data) picks the sample name after the dir as follows:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(resources.get("options", ""))
    report_file = os.path.join(results_dir, "qualimapReport.html")
    utils.safe_makedir(results_dir)
    pdf_file = "qualimapReport.pdf"
    if not utils.file_exists(report_file) and not utils.file_exists(os.path.join(results_dir, pdf_file)):
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []):
            logger.info("Full qualimap analysis for %s may be slow." % bam_file)
            ds_bam = bam_file
        else:
            ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
            bam_file = ds_bam if ds_bam else bam_file
        if options.find("PDF") > -1:
            options = "%s -outfile %s" % (options, pdf_file)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), num_cores)
        # Fixing the file name: MultiQC picks sample name from BAM file name.
        fixed_bam_fname = os.path.join(out_dir, dd.get_sample_name(data) + ".bam")
        if not os.path.islink(fixed_bam_fname):
            os.symlink(bam_file, fixed_bam_fname)
        export = utils.local_path_export()
        cmd = ("unset DISPLAY && {export} {qualimap} bamqc -bam {fixed_bam_fname} -outdir {results_dir} "
               "--skip-duplicated --skip-dup-mode 0 "
               "-nt {num_cores} --java-mem-size={max_mem} {options}")
        species = None
        if tz.get_in(("genome_resources", "aliases", "human"), data, ""):
            species = "HUMAN"
        elif any(tz.get_in(["genome_build"], data, "").startswith(k) for k in ["mm", "GRCm"]):
            species = "MOUSE"
        if species in ["HUMAN", "MOUSE"]:
            cmd += " -gd {species}"
        regions = bedutils.merge_overlaps(dd.get_coverage(data), data) or dd.get_variant_regions_merged(data)
        if regions:
            bed6_regions = _bed_to_bed6(regions, out_dir)
            cmd += " -gff {bed6_regions}"
        do.run(cmd.format(**locals()), "Qualimap: %s" % dd.get_sample_name(data))
    # return _parse_qualimap_metrics(report_file, data)
    return dict()

def run(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.
    """
    resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(resources.get("options", ""))
    report_file = os.path.join(out_dir, "qualimapReport.html")
    pdf_file = "qualimapReport.pdf"
    if not utils.file_exists(report_file) and not utils.file_exists(os.path.join(out_dir, pdf_file)):
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []):
            logger.info("Full qualimap analysis for %s may be slow." % bam_file)
            ds_bam = bam_file
        else:
            ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
            bam_file = ds_bam if ds_bam else bam_file
        if options.find("PDF") > -1:
            options = "%s -outfile %s" % (options, pdf_file)
        utils.safe_makedir(out_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), num_cores)
        export = utils.local_path_export()
        cmd = ("unset DISPLAY && {export} {qualimap} bamqc -bam {bam_file} -outdir {out_dir} "
               "-nt {num_cores} --java-mem-size={max_mem} {options}")
        species = tz.get_in(("genome_resources", "aliases", "ensembl"), data, "")
        if species in ["HUMAN", "MOUSE"]:
            cmd += " -gd {species}"
        regions = bedutils.merge_overlaps(dd.get_variant_regions(data), data)
        if regions:
            bed6_regions = _bed_to_bed6(regions, out_dir)
            cmd += " -gff {bed6_regions}"
        do.run(cmd.format(**locals()), "Qualimap: %s" % dd.get_sample_name(data))
    return _parse_qualimap_metrics(report_file)

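# _bed_to_bed6() is used by the Qualimap runners but not shown in this section.
# Qualimap's -gff option expects six-column BED input, so a plausible sketch
# pads three-column regions with name/score/strand placeholders, assuming
# pybedtools; the helper name and output naming here are hypothetical.
import os
import pybedtools

def _bed_to_bed6_sketch(bed_file, out_dir):
    """Pad a BED file to six columns (name, score, strand) for Qualimap."""
    out_file = os.path.join(out_dir, "%s-bed6.bed" % os.path.splitext(os.path.basename(bed_file))[0])
    with open(out_file, "w") as out_handle:
        for i, region in enumerate(pybedtools.BedTool(bed_file)):
            fields = [region.chrom, str(region.start), str(region.end), str(i), "0", "+"]
            out_handle.write("\t".join(fields) + "\n")
    return out_file
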
def run_tnhaplotyper(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Call variants with Sentieon's TNhaplotyper (MuTect2 like).
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        variant_regions = bedutils.merge_overlaps(dd.get_variant_regions(items[0]), items[0])
        interval = _get_interval(variant_regions, region, out_file, items)
        with file_transaction(items[0], out_file) as tx_out_file:
            paired = vcfutils.get_paired_bams(align_bams, items)
            assert paired.normal_bam, "Require normal BAM for Sentieon TNhaplotyper"
            dbsnp = "--dbsnp %s" % (assoc_files.get("dbsnp")) if "dbsnp" in assoc_files else ""
            cosmic = "--cosmic %s" % (assoc_files.get("cosmic")) if "cosmic" in assoc_files else ""
            license = license_export(items[0])
            tx_orig_file = "%s-orig%s" % utils.splitext_plus(tx_out_file)
            cores = dd.get_num_cores(items[0])
            cmd = ("{license}sentieon driver -t {cores} -r {ref_file} "
                   "-i {paired.tumor_bam} -i {paired.normal_bam} {interval} "
                   "--algo TNhaplotyper "
                   "--tumor_sample {paired.tumor_name} --normal_sample {paired.normal_name} "
                   "{dbsnp} {cosmic} {tx_orig_file}")
            do.run(cmd.format(**locals()), "Sentieon TNhaplotyper")
            cmd = ("gunzip -c {tx_orig_file} | "
                   "sed 's/ID=ECNT,Number=1,Type=Integer/ID=ECNT,Number=1,Type=String/' | "
                   "sed 's/ID=HCNT,Number=1,Type=Integer/ID=HCNT,Number=1,Type=String/' | "
                   "sed 's/ID=NLOD,Number=1,Type=Float/ID=NLOD,Number=1,Type=String/' | "
                   "sed 's/ID=TLOD,Number=1,Type=Float/ID=TLOD,Number=1,Type=String/' | "
                   "sed 's/ID=PON,Number=1,Type=Integer/ID=PON,Number=1,Type=String/' | "
                   "bgzip -c > {tx_out_file}")
            do.run(cmd.format(**locals()), "Sentieon TNhaplotyper: make headers GATK compatible")
            vcfutils.bgzip_and_index(tx_out_file, items[0]["config"])
    return out_file

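# The sed pipeline above rewrites INFO header Types (Integer/Float -> String)
# because TNhaplotyper can emit comma-separated values in fields declared as
# Number=1, which strict GATK-style parsers reject. A pure-Python equivalent
# of that header rewrite, as a hedged sketch: the real pipeline bgzips the
# output for tabix indexing, which plain file output here does not provide.
import gzip

def _relax_header_types_sketch(in_vcf_gz, out_vcf):
    retype = ["ECNT", "HCNT", "NLOD", "TLOD", "PON"]
    with gzip.open(in_vcf_gz, "rt") as in_handle:
        with open(out_vcf, "w") as out_handle:
            for line in in_handle:
                if line.startswith("##INFO") and any("ID=%s," % f in line for f in retype):
                    line = line.replace("Type=Integer", "Type=String").replace("Type=Float", "Type=String")
                out_handle.write(line)
    return out_vcf
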
def run(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.
    """
    # Qualimap results should be saved to a directory named after sample.
    # MultiQC (for parsing additional data) picks the sample name after the dir as follows:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(resources.get("options", ""))
    results_file = os.path.join(results_dir, "genome_results.txt")
    report_file = os.path.join(results_dir, "qualimapReport.html")
    utils.safe_makedir(results_dir)
    pdf_file = "qualimapReport.pdf"
    if not utils.file_exists(results_file) and not utils.file_exists(os.path.join(results_dir, pdf_file)):
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []):
            logger.info("Full qualimap analysis for %s may be slow." % bam_file)
            ds_bam = bam_file
        else:
            ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
            bam_file = ds_bam if ds_bam else bam_file
        if options.find("PDF") > -1:
            options = "%s -outfile %s" % (options, pdf_file)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), num_cores)
        with file_transaction(data, results_dir) as tx_results_dir:
            utils.safe_makedir(tx_results_dir)
            export = "%s%s export JAVA_OPTS='-Xms32m -Xmx%s -Djava.io.tmpdir=%s' && " % (
                utils.java_freetype_fix(), utils.local_path_export(), max_mem, tx_results_dir)
            cmd = ("unset DISPLAY && {export} {qualimap} bamqc -bam {bam_file} -outdir {tx_results_dir} "
                   "--skip-duplicated --skip-dup-mode 0 "
                   "-nt {num_cores} {options}")
            species = None
            if (tz.get_in(("genome_resources", "aliases", "human"), data, "")
                  or dd.get_genome_build(data).startswith(("hg", "GRCh"))):
                species = "HUMAN"
            elif dd.get_genome_build(data).startswith(("mm", "GRCm")):
                species = "MOUSE"
            if species in ["HUMAN", "MOUSE"]:
                cmd += " -gd {species}"
            regions = (dd.get_coverage(data) if dd.get_coverage(data) not in [None, False, "None"]
                       else dd.get_variant_regions_merged(data))
            if regions:
                regions = bedutils.merge_overlaps(bedutils.clean_file(regions, data), data)
                bed6_regions = _bed_to_bed6(regions, out_dir)
                cmd += " -gff {bed6_regions}"
            bcbio_env = utils.get_bcbio_env()
            do.run(cmd.format(**locals()), "Qualimap: %s" % dd.get_sample_name(data), env=bcbio_env)
            tx_results_file = os.path.join(tx_results_dir, "genome_results.txt")
            cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (dd.get_sample_name(data), tx_results_file)
            do.run(cmd, "Fix Name Qualimap for {}".format(dd.get_sample_name(data)))
    # Qualimap output folder (results_dir) needs to be named after the sample (see comments above).
    # However, in order to keep its name after upload, we need to put the base QC file (results_file)
    # into the root directory (out_dir):
    base_results_file = os.path.join(out_dir, os.path.basename(results_file))
    shutil.copyfile(results_file, base_results_file)
    return {"base": base_results_file,
            "secondary": _find_qualimap_secondary_files(results_dir, base_results_file)}

def _freebayes_options_from_config(items, config, out_file, region=None):
    """Prepare standard options from configuration input.

    Input BED target files are merged to avoid overlapping regions which
    cause FreeBayes to call multiple times.

    Checks for empty sets of target regions after filtering for high depth,
    in which case we should skip the FreeBayes run.
    """
    opts = ["--genotype-qualities", "--strict-vcf"]
    opts += ["--ploidy", str(ploidy.get_ploidy(items, region))]
    variant_regions = bedutils.merge_overlaps(bedutils.population_variant_regions(items), items[0])
    # Produce gVCF output
    if any("gvcf" in dd.get_tools_on(d) for d in items):
        opts += ["--gvcf", "--gvcf-chunk", "50000"]
    no_target_regions = False
    target = shared.subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            if any(tz.get_in(["config", "algorithm", "coverage_interval"], x, "").lower() == "genome"
                   for x in items):
                target = shared.remove_highdepth_regions(target, items)
                if os.path.getsize(target) == 0:
                    no_target_regions = True
            opts += ["--targets", target]
        else:
            opts += ["--region", region_to_freebayes(target)]
    resources = config_utils.get_resources("freebayes", config)
    if resources.get("options"):
        opts += resources["options"]
    return opts, no_target_regions

def shared_variantcall(call_fn, name, align_bams, ref_file, items, assoc_files,
                       region=None, out_file=None):
    """Provide base functionality for prepping and indexing for variant calling.
    """
    config = items[0]["config"]
    if out_file is None:
        if vcfutils.is_paired_analysis(align_bams, items):
            out_file = "%s-paired-variants.vcf.gz" % config["metadata"]["batch"]
        else:
            out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        logger.debug("Genotyping with {name}: {region} {fname}".format(
            name=name, region=region, fname=os.path.basename(align_bams[0])))
        variant_regions = bedutils.merge_overlaps(bedutils.population_variant_regions(items), items[0])
        target_regions = subset_variant_regions(variant_regions, region, out_file)
        if (variant_regions is not None and isinstance(target_regions, basestring)
              and not os.path.isfile(target_regions)):
            vcfutils.write_empty_vcf(out_file, config)
        else:
            with file_transaction(config, out_file) as tx_out_file:
                call_fn(align_bams, ref_file, items, target_regions, tx_out_file)
    if out_file.endswith(".gz"):
        out_file = vcfutils.bgzip_and_index(out_file, config)
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams, assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file

def shared_variantcall(call_fn, name, align_bams, ref_file, items, assoc_files,
                       region=None, out_file=None):
    """Provide base functionality for prepping and indexing for variant calling.
    """
    config = items[0]["config"]
    if out_file is None:
        if vcfutils.is_paired_analysis(align_bams, items):
            out_file = "%s-paired-variants.vcf.gz" % config["metadata"]["batch"]
        else:
            out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not file_exists(out_file):
        logger.debug("Genotyping with {name}: {region} {fname}".format(
            name=name, region=region, fname=os.path.basename(align_bams[0])))
        variant_regions = bedutils.merge_overlaps(bedutils.population_variant_regions(items), items[0])
        target_regions = subset_variant_regions(variant_regions, region, out_file, items=items)
        if (variant_regions is not None and isinstance(target_regions, basestring)
              and not os.path.isfile(target_regions)):
            vcfutils.write_empty_vcf(out_file, config)
        else:
            with file_transaction(config, out_file) as tx_out_file:
                call_fn(align_bams, ref_file, items, target_regions, tx_out_file)
    if out_file.endswith(".gz"):
        out_file = vcfutils.bgzip_and_index(out_file, config)
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams, assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file

def _run_qualimap(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.
    """
    report_file = os.path.join(out_dir, "qualimapReport.html")
    if not os.path.exists(report_file):
        ds_bam = bam.downsample(bam_file, data, 1e7)
        bam_file = ds_bam if ds_bam else bam_file
        utils.safe_makedir(out_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        resources = config_utils.get_resources("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), num_cores)
        cmd = ("unset DISPLAY && {qualimap} bamqc -bam {bam_file} -outdir {out_dir} "
               "-nt {num_cores} --java-mem-size={max_mem}")
        species = tz.get_in(("genome_resources", "aliases", "ensembl"), data, "")
        if species in ["HUMAN", "MOUSE"]:
            cmd += " -gd {species}"
        regions = bedutils.merge_overlaps(dd.get_variant_regions(data), data)
        if regions:
            bed6_regions = _bed_to_bed6(regions, out_dir)
            cmd += " -gff {bed6_regions}"
        do.run(cmd.format(**locals()), "Qualimap: %s" % data["name"][-1])
    return _parse_qualimap_metrics(report_file)

def _freebayes_options_from_config(items, config, out_file, region=None):
    """Prepare standard options from configuration input.

    Input BED target files are merged to avoid overlapping regions which
    cause FreeBayes to call multiple times.
    """
    opts = []
    opts += ["--ploidy", str(ploidy.get_ploidy(items, region))]
    variant_regions = bedutils.merge_overlaps(utils.get_in(config, ("algorithm", "variant_regions")),
                                              items[0])
    target = subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            opts += ["--targets", target]
        else:
            opts += ["--region", region_to_freebayes(target)]
    resources = config_utils.get_resources("freebayes", config)
    if resources.get("options"):
        opts += resources["options"]
    if "--min-alternate-fraction" not in " ".join(opts) and "-F" not in " ".join(opts):
        # add minimum reportable allele frequency; bcbio's min_allele_fraction
        # is a percentage, defaulting to 20
        min_af = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 20)) / 100.0
        opts += ["--min-alternate-fraction", str(min_af)]
    return opts

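# Worked example of the min_allele_fraction conversion above: bcbio stores the
# threshold as a percentage (default 20), while FreeBayes expects a fraction,
# so the option builder divides by 100 before emitting the flag.
min_af_example = float(20) / 100.0
assert min_af_example == 0.2  # becomes: --min-alternate-fraction 0.2
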
def _regions_for_coverage(data, region, ref_file, out_file):
    """Retrieve BED file of regions we need to calculate coverage in.
    """
    import pybedtools
    variant_regions = bedutils.merge_overlaps(
        utils.get_in(data, ("config", "algorithm", "variant_regions")), data)
    ready_region = shared.subset_variant_regions(variant_regions, region, out_file)
    custom_file = "%s-coverageregions.bed" % utils.splitext_plus(out_file)[0]
    if not ready_region:
        get_ref_bedtool(ref_file, data["config"]).saveas(custom_file)
        return custom_file, True
    elif os.path.isfile(ready_region):
        return ready_region, True
    elif isinstance(ready_region, (list, tuple)):
        c, s, e = ready_region
        pybedtools.BedTool("%s\t%s\t%s\n" % (c, s, e), from_string=True).saveas(custom_file)
        return custom_file, True
    else:
        with file_transaction(data, custom_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                for feat in get_ref_bedtool(ref_file, data["config"], region):
                    out_handle.write("%s\t%s\t%s\t%s\n" % (feat.chrom, feat.start, feat.end, "NO_COVERAGE"))
        return custom_file, variant_regions is None

def run_haplotyper(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Call variants with Sentieon's haplotyper (GATK HaplotypeCaller like).
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        variant_regions = bedutils.merge_overlaps(dd.get_variant_regions(items[0]), items[0])
        interval = _get_interval(variant_regions, region, out_file, items)
        with file_transaction(items[0], out_file) as tx_out_file:
            dbsnp = "--dbsnp %s" % (assoc_files.get("dbsnp")) if "dbsnp" in assoc_files else ""
            bams = " ".join(["-i %s" % x for x in align_bams])
            license = license_export(items[0])
            cores = dd.get_num_cores(items[0])
            out_mode = "--emit_mode gvcf" if joint.want_gvcf(items) else ""
            cmd = ("{license}sentieon driver -t {cores} -r {ref_file} "
                   "{bams} {interval} --algo Haplotyper {out_mode} {dbsnp} {tx_out_file}")
            do.run(cmd.format(**locals()), "Sentieon Haplotyper")
    return out_file

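# joint.want_gvcf() toggles gVCF emission above. A plausible sketch, mirroring
# the "gvcf" tools_on check used verbatim by the FreeBayes option builders
# elsewhere in this section; hypothetical reconstruction of the real helper.
def want_gvcf_sketch(items):
    return any("gvcf" in dd.get_tools_on(d) for d in items)
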
def run_tnscope(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Call variants with Sentieon's TNscope somatic caller.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        variant_regions = bedutils.merge_overlaps(dd.get_variant_regions(items[0]), items[0])
        interval = _get_interval(variant_regions, region, out_file, items)
        with file_transaction(items[0], out_file) as tx_out_file:
            paired = vcfutils.get_paired_bams(align_bams, items)
            assert paired and paired.normal_bam, "Require normal BAM for Sentieon TNscope"
            dbsnp = "--dbsnp %s" % (assoc_files.get("dbsnp")) if "dbsnp" in assoc_files else ""
            license = license_export(items[0])
            cmd = ("{license}sentieon driver -t 1 -r {ref_file} "
                   "-i {paired.tumor_bam} -i {paired.normal_bam} {interval} "
                   "--algo TNscope "
                   "--tumor_sample {paired.tumor_name} --normal_sample {paired.normal_name} "
                   "{dbsnp} {tx_out_file}")
            do.run(cmd.format(**locals()), "Sentieon TNscope")
    return out_file

def _freebayes_options_from_config(items, config, out_file, region=None):
    """Prepare standard options from configuration input.

    Input BED target files are merged to avoid overlapping regions which
    cause FreeBayes to call multiple times.

    Checks for empty sets of target regions after filtering for high depth,
    in which case we should skip the FreeBayes run.
    """
    opts = ["--genotype-qualities"]
    opts += ["--ploidy", str(ploidy.get_ploidy(items, region))]
    variant_regions = bedutils.merge_overlaps(utils.get_in(config, ("algorithm", "variant_regions")),
                                              items[0])
    # Produce gVCF output
    if any("gvcf" in dd.get_tools_on(d) for d in items):
        opts += ["--gvcf", "--gvcf-chunk", "50000"]
    no_target_regions = False
    target = shared.subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            if any(tz.get_in(["config", "algorithm", "coverage_interval"], x, "").lower() == "genome"
                   for x in items):
                target = shared.remove_highdepth_regions(target, items)
                if os.path.getsize(target) == 0:
                    no_target_regions = True
            opts += ["--targets", target]
        else:
            opts += ["--region", region_to_freebayes(target)]
    resources = config_utils.get_resources("freebayes", config)
    if resources.get("options"):
        opts += resources["options"]
    return opts, no_target_regions

def run(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.
    """
    resources = config_utils.get_resources("qualimap", data["config"])
    options = " ".join(resources.get("options", ""))
    report_file = os.path.join(out_dir, "qualimapReport.html")
    pdf_file = "qualimapReport.pdf"
    if not utils.file_exists(report_file) and not utils.file_exists(os.path.join(out_dir, pdf_file)):
        if "qualimap_full" in tz.get_in(("config", "algorithm", "tools_on"), data, []):
            logger.info("Full qualimap analysis for %s may be slow." % bam_file)
            ds_bam = bam_file
        else:
            ds_bam = bam.downsample(bam_file, data, 1e7, work_dir=out_dir)
            bam_file = ds_bam if ds_bam else bam_file
        if options.find("PDF") > -1:
            options = "%s -outfile %s" % (options, pdf_file)
        utils.safe_makedir(out_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), num_cores)
        cmd = ("unset DISPLAY && {qualimap} bamqc -bam {bam_file} -outdir {out_dir} "
               "-nt {num_cores} --java-mem-size={max_mem} {options}")
        species = tz.get_in(("genome_resources", "aliases", "ensembl"), data, "")
        if species in ["HUMAN", "MOUSE"]:
            cmd += " -gd {species}"
        regions = bedutils.merge_overlaps(dd.get_variant_regions(data), data)
        if regions:
            bed6_regions = _bed_to_bed6(regions, out_dir)
            cmd += " -gff {bed6_regions}"
        do.run(cmd.format(**locals()), "Qualimap: %s" % dd.get_sample_name(data))
    return _parse_qualimap_metrics(report_file)

def _run_cnvkit_shared(data, test_bams, background_bams, access_file, work_dir,
                       background_name=None):
    """Shared functionality to run CNVkit.
    """
    ref_file = dd.get_ref_file(data)
    raw_work_dir = os.path.join(work_dir, "raw")
    out_base = os.path.splitext(os.path.basename(test_bams[0]))[0].split(".")[0]
    background_cnn = "%s_background.cnn" % (background_name if background_name else "flat")
    files = {"cnr": os.path.join(raw_work_dir, "%s.cnr" % out_base),
             "cns": os.path.join(raw_work_dir, "%s.cns" % out_base),
             "back_cnn": os.path.join(raw_work_dir, background_cnn)}
    if not utils.file_exists(files["cnr"]):
        if os.path.exists(raw_work_dir):
            shutil.rmtree(raw_work_dir)
        with tx_tmpdir(data, work_dir) as tx_work_dir:
            # pick targets, anti-targets and access files based on analysis type
            # http://cnvkit.readthedocs.org/en/latest/nonhybrid.html
            cov_interval = dd.get_coverage_interval(data)
            base_regions = dd.get_variant_regions(data)
            # For genome calls, subset to regions within 10kb of genes
            if cov_interval == "genome":
                base_regions = annotate.subset_by_genes(base_regions, data, work_dir, pad=1e4)
            raw_target_bed = bedutils.merge_overlaps(base_regions, data, out_dir=work_dir)
            target_bed = annotate.add_genes(raw_target_bed, data)
            # bail out if we ended up with no regions
            if not utils.file_exists(target_bed):
                return {}
            if cov_interval == "amplicon":
                target_opts = ["--targets", target_bed, "--access", target_bed]
            elif cov_interval == "genome":
                target_opts = ["--targets", target_bed, "--access", dd.get_variant_regions(data)]
            else:
                target_opts = ["--targets", target_bed, "--access", access_file]
            cores = min(tz.get_in(["config", "algorithm", "num_cores"], data, 1),
                        len(test_bams) + len(background_bams))
            cmd = [_get_cmd(), "batch"] + \
                  test_bams + ["-n"] + background_bams + ["-f", ref_file] + \
                  target_opts + \
                  ["-d", tx_work_dir, "--split", "-p", str(cores),
                   "--output-reference", os.path.join(tx_work_dir, background_cnn)]
            at_avg, at_min, t_avg = _get_antitarget_size(access_file, target_bed)
            if at_avg:
                cmd += ["--antitarget-avg-size", str(at_avg), "--antitarget-min-size", str(at_min),
                        "--target-avg-size", str(t_avg)]
            local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                         "lib", "R", "site-library")
            cmd += ["--rlibpath", local_sitelib]
            do.run(cmd, "CNVkit batch")
            shutil.move(tx_work_dir, raw_work_dir)
    for ftype in ["cnr", "cns"]:
        if not os.path.exists(files[ftype]):
            raise IOError("Missing CNVkit %s file: %s" % (ftype, files[ftype]))
    return files

def _get_regional_bed_file(data):
    """If we are running a non-genome analysis, pull the regional file for analysis.
    """
    variant_regions = bedutils.merge_overlaps(tz.get_in(["config", "algorithm", "variant_regions"], data),
                                              data)
    is_genome = data["config"]["algorithm"].get("coverage_interval", "exome").lower() in ["genome"]
    if variant_regions and utils.file_exists(variant_regions) and not is_genome:
        return variant_regions

def _subset_regions(region, base_file, items):
    """Subset to a BED file (or genomic region) for calling.
    """
    variant_regions = bedutils.merge_overlaps(bedutils.population_variant_regions(items), items[0])
    target = pshared.subset_variant_regions(variant_regions, region, base_file, items)
    if isinstance(target, basestring) and os.path.isfile(target):
        return target
    else:
        return bamprep.region_to_gatk(target)

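# bamprep.region_to_gatk() is called above but not defined in this section. A
# plausible sketch, assuming GATK-style 1-based inclusive intervals built from
# the 0-based half-open (chrom, start, end) tuples used throughout this code;
# hypothetical reconstruction of the real helper.
def region_to_gatk_sketch(region):
    if isinstance(region, (list, tuple)):
        chrom, start, end = region
        return "%s:%s-%s" % (chrom, start + 1, end)
    else:
        return region
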
def _clean_regions(items, region):
    """Intersect region with target file if it exists"""
    variant_regions = bedutils.merge_overlaps(bedutils.population_variant_regions(items), items[0])
    with utils.tmpfile() as tx_out_file:
        target = subset_variant_regions(variant_regions, region, tx_out_file, items)
        if target:
            if isinstance(target, basestring) and os.path.isfile(target):
                target = _load_regions(target)
            else:
                target = [target]
            return target

def _clean_regions(items, region):
    """Intersect region with target file if it exists"""
    config = items[0]["config"]
    variant_regions = bedutils.merge_overlaps(utils.get_in(config, ("algorithm", "variant_regions")),
                                              items[0])
    with utils.tmpfile() as tx_out_file:
        target = subset_variant_regions(variant_regions, region, tx_out_file, items)
        if target:
            if isinstance(target, basestring) and os.path.isfile(target):
                target = _load_regions(target)
            else:
                target = [target]
            return target

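# _load_regions() is referenced by the _clean_regions variants above but not
# shown. A minimal sketch that loads a BED file into (chrom, start, end)
# tuples, matching how the tuple branch is handled in those functions;
# hypothetical reconstruction assuming pybedtools.
import pybedtools

def _load_regions_sketch(bed_file):
    return [(r.chrom, r.start, r.end) for r in pybedtools.BedTool(bed_file)]
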
def _run_coverage_qc(bam_file, data, out_dir):
    """Run coverage QC analysis"""
    out = dict()
    total_reads = sambamba.number_of_reads(data, bam_file)
    out['Total_reads'] = total_reads
    mapped = sambamba.number_of_mapped_reads(data, bam_file)
    out['Mapped_reads'] = mapped
    if total_reads:
        out['Mapped_reads_pct'] = 100.0 * mapped / total_reads
    if mapped:
        mapped_unique = sambamba.number_of_mapped_reads(data, bam_file, keep_dups=False)
        out['Mapped_unique_reads'] = mapped_unique
        mapped_dups = mapped - mapped_unique
        out['Duplicates'] = mapped_dups
        out['Duplicates_pct'] = 100.0 * mapped_dups / mapped
        if dd.get_coverage(data):
            cov_bed_file = clean_file(dd.get_coverage(data), data, prefix="cov-", simple=True)
            merged_bed_file = bedutils.merge_overlaps(cov_bed_file, data)
            target_name = "coverage"
        else:
            merged_bed_file = dd.get_variant_regions_merged(data)
            target_name = "variant_regions"
        ontarget = sambamba.number_mapped_reads_on_target(
            data, merged_bed_file, bam_file, keep_dups=False, target_name=target_name)
        if mapped_unique:
            out["Ontarget_unique_reads"] = ontarget
            out["Ontarget_pct"] = 100.0 * ontarget / mapped_unique
            out['Offtarget_pct'] = 100.0 * (mapped_unique - ontarget) / mapped_unique
            padded_bed_file = bedutils.get_padded_bed_file(merged_bed_file, 200, data)
            ontarget_padded = sambamba.number_mapped_reads_on_target(
                data, padded_bed_file, bam_file, keep_dups=False, target_name=target_name + "_padded")
            out["Ontarget_padded_pct"] = 100.0 * ontarget_padded / mapped_unique
        if total_reads:
            out['Usable_pct'] = 100.0 * ontarget / total_reads
        avg_coverage = get_average_coverage(data, bam_file, merged_bed_file, target_name)
        out['Avg_coverage'] = avg_coverage
    priority = cov.priority_coverage(data, out_dir)
    cov.priority_total_coverage(data, out_dir)
    region_coverage_file = cov.coverage_region_detailed_stats(data, out_dir)
    # Re-enable with annotations from internally installed
    # problem region directory
    # if priority:
    #     annotated = cov.decorate_problem_regions(priority, problem_regions)
    return out

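# bedutils.get_padded_bed_file() widens each target (by 200bp above) before
# recounting on-target reads. A minimal sketch of such padding, assuming
# pybedtools' slop with a chromosome-sizes ("genome") file to clamp intervals
# at contig ends; the re-merge collapses targets that touch after padding.
# File naming and the genome-file argument here are hypothetical.
import pybedtools

def get_padded_bed_sketch(bed_file, padding, genome_file, out_file):
    pybedtools.BedTool(bed_file).slop(b=padding, g=genome_file).sort().merge().saveas(out_file)
    return out_file
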
def _get_target_access_files(cov_interval, data, work_dir):
    """Retrieve target and access files based on the type of data to process.

    pick targets, anti-targets and access files based on analysis type
    http://cnvkit.readthedocs.org/en/latest/nonhybrid.html
    """
    base_regions = shared.get_base_cnv_regions(data, work_dir)
    target_bed = bedutils.merge_overlaps(base_regions, data, out_dir=work_dir)
    if cov_interval == "amplicon":
        return target_bed, target_bed
    elif cov_interval == "genome":
        return target_bed, target_bed
    else:
        access_file = _create_access_file(dd.get_ref_file(data), _sv_workdir(data), data)
        return target_bed, access_file

def _bed_to_platypusin(region, base_file, items):
    """Convert BED file regions into Platypus custom inputs.
    """
    variant_regions = bedutils.merge_overlaps(
        tz.get_in(["config", "algorithm", "variant_regions"], items[0]), items[0])
    target = pshared.subset_variant_regions(variant_regions, region, base_file, items)
    if isinstance(target, basestring) and os.path.isfile(target):
        out_file = "%s-platypusregion.list" % utils.splitext_plus(base_file)[0]
        if not utils.file_exists(out_file):
            with file_transaction(items[0], out_file) as tx_out_file:
                with open(tx_out_file, "w") as out_handle:
                    for region in pybedtools.BedTool(target):
                        out_handle.write("%s:%s-%s\n" % (region.chrom, region.start, region.stop))
        return out_file
    else:
        return bamprep.region_to_gatk(target)

def run_haplotyper(align_bams, items, ref_file, assoc_files, region=None, out_file=None):
    """Call variants with Sentieon's haplotyper (GATK HaplotypeCaller like).
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if not utils.file_exists(out_file):
        variant_regions = bedutils.merge_overlaps(dd.get_variant_regions(items[0]), items[0])
        interval = _get_interval(variant_regions, region, out_file, items)
        with file_transaction(items[0], out_file) as tx_out_file:
            dbsnp = "--dbsnp %s" % (assoc_files.get("dbsnp")) if "dbsnp" in assoc_files else ""
            bams = " ".join(["-i %s" % x for x in align_bams])
            license = license_export(items[0])
            cmd = ("{license}sentieon driver -t 1 -r {ref_file} "
                   "{bams} {interval} --algo Haplotyper {dbsnp} {tx_out_file}")
            do.run(cmd.format(**locals()), "Sentieon Haplotyper")
    return out_file

def _run_coverage_qc(bam_file, data, out_dir):
    """Run coverage QC analysis"""
    out = dict()
    if dd.get_coverage(data):
        bed_file = bedutils.merge_overlaps(dd.get_coverage(data), data)
        target_name = "coverage"
    elif dd.get_variant_regions_merged(data):
        bed_file = dd.get_variant_regions_merged(data)
        target_name = "variant_regions"
    else:
        bed_file = None
        target_name = "wgs"
    bed_file = clean_file(bed_file, data, prefix="cov-", simple=True)
    offtarget_stats_file = calculate_offtarget_stats(bam_file, data, bed_file, target_name)
    if offtarget_stats_file and utils.file_exists(offtarget_stats_file):
        with open(offtarget_stats_file) as in_handle:
            stats = yaml.safe_load(in_handle)
        offtarget = stats.get('offtarget')
        mapped_unique = stats['mapped_unique']
        if offtarget and mapped_unique:
            out['offtarget_rate'] = 1.0 * offtarget / mapped_unique
        mapped = stats['mapped']
        if mapped:
            out['Duplicates'] = mapped - mapped_unique
            out['Duplicates_pct'] = 1.0 * (mapped - mapped_unique) / mapped
        total_reads = stats['total_reads']
        if total_reads:
            out['usable_rate'] = 1.0 * (mapped_unique - offtarget) / total_reads
    avg_coverage = get_average_coverage(data, bam_file, bed_file, target_name)
    out['avg_coverage'] = avg_coverage
    priority = cov.priority_coverage(data, out_dir)
    cov.priority_total_coverage(data, out_dir)
    region_coverage_file = cov.coverage_region_detailed_stats(data, out_dir)
    # Re-enable with annotations from internally installed
    # problem region directory
    # if priority:
    #     annotated = cov.decorate_problem_regions(priority, problem_regions)
    return out

def _freebayes_options_from_config(items, config, out_file, region=None):
    """Prepare standard options from configuration input.

    Input BED target files are merged to avoid overlapping regions which
    cause FreeBayes to call multiple times.

    Checks for empty sets of target regions after filtering for high depth,
    in which case we should skip the FreeBayes run.
    """
    opts = ["--genotype-qualities", "--strict-vcf"]
    cur_ploidy = ploidy.get_ploidy(items, region)
    base_ploidy = ploidy.get_ploidy(items)
    opts += ["--ploidy", str(cur_ploidy)]
    # Adjust min fraction when trying to call more sensitively in certain
    # regions. This is primarily meant for pooled mitochondrial calling.
    if (isinstance(region, (list, tuple)) and chromhacks.is_mitochondrial(region[0])
          and cur_ploidy >= base_ploidy and "--min-alternate-fraction" not in opts
          and "-F" not in opts):
        opts += ["--min-alternate-fraction", "0.01"]
    variant_regions = bedutils.merge_overlaps(bedutils.population_variant_regions(items), items[0])
    # Produce gVCF output
    if any("gvcf" in dd.get_tools_on(d) for d in items):
        opts += ["--gvcf", "--gvcf-chunk", "50000"]
    no_target_regions = False
    target = shared.subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            if any(tz.get_in(["config", "algorithm", "coverage_interval"], x, "").lower() == "genome"
                   for x in items):
                target = shared.remove_highdepth_regions(target, items)
                if os.path.getsize(target) == 0:
                    no_target_regions = True
            opts += ["--targets", target]
        else:
            opts += ["--region", region_to_freebayes(target)]
    resources = config_utils.get_resources("freebayes", config)
    if resources.get("options"):
        opts += resources["options"]
    return opts, no_target_regions

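# chromhacks.is_mitochondrial() gates the sensitive mitochondrial branch
# above. A plausible sketch assuming common mitochondrial contig names; the
# real helper may recognize additional aliases, so treat this as hypothetical.
def is_mitochondrial_sketch(chrom):
    return chrom.lower() in ["mt", "chrm", "chrmt", "m"]
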
def _freebayes_options_from_config(items, config, out_file, region=None):
    """Prepare standard options from configuration input.

    Input BED target files are merged to avoid overlapping regions which
    cause FreeBayes to call multiple times.
    """
    opts = []
    opts += ["--ploidy", str(ploidy.get_ploidy(items, region))]
    variant_regions = bedutils.merge_overlaps(utils.get_in(config, ("algorithm", "variant_regions")),
                                              items[0])
    target = subset_variant_regions(variant_regions, region, out_file, items)
    if target:
        if isinstance(target, basestring) and os.path.isfile(target):
            opts += ["--targets", target]
        else:
            opts += ["--region", region_to_freebayes(target)]
    resources = config_utils.get_resources("freebayes", config)
    if resources.get("options"):
        opts += resources["options"]
    return opts

def _regions_for_coverage(data, region, ref_file, out_file):
    """Retrieve BED file of regions we need to calculate coverage in.
    """
    variant_regions = bedutils.merge_overlaps(dd.get_variant_regions(data), data)
    ready_region = shared.subset_variant_regions(variant_regions, region, out_file)
    custom_file = "%s-coverageregions.bed" % utils.splitext_plus(out_file)[0]
    if not ready_region:
        get_ref_bedtool(ref_file, data["config"]).saveas(custom_file)
        return custom_file, True
    elif os.path.isfile(ready_region):
        return ready_region, True
    elif isinstance(ready_region, (list, tuple)):
        c, s, e = ready_region
        pybedtools.BedTool("%s\t%s\t%s\n" % (c, s, e), from_string=True).saveas(custom_file)
        return custom_file, True
    else:
        with file_transaction(data, custom_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                for feat in get_ref_bedtool(ref_file, data["config"], region):
                    out_handle.write("%s\t%s\t%s\t%s\n" % (feat.chrom, feat.start, feat.end, "NO_COVERAGE"))
        return custom_file, variant_regions is None

def _run_qualimap(bam_file, data, out_dir):
    """Run qualimap to assess alignment quality metrics.
    """
    report_file = os.path.join(out_dir, "qualimapReport.html")
    if not os.path.exists(report_file):
        ds_bam = bam.downsample(bam_file, data, 1e7)
        bam_file = ds_bam if ds_bam else bam_file
        utils.safe_makedir(out_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        qualimap = config_utils.get_program("qualimap", data["config"])
        resources = config_utils.get_resources("qualimap", data["config"])
        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), num_cores)
        cmd = ("unset DISPLAY && {qualimap} bamqc -bam {bam_file} -outdir {out_dir} "
               "-nt {num_cores} --java-mem-size={max_mem}")
        species = data["genome_resources"]["aliases"].get("ensembl", "").upper()
        if species in ["HUMAN", "MOUSE"]:
            cmd += " -gd {species}"
        regions = bedutils.merge_overlaps(dd.get_variant_regions(data), data)
        if regions:
            bed6_regions = _bed_to_bed6(regions, out_dir)
            cmd += " -gff {bed6_regions}"
        do.run(cmd.format(**locals()), "Qualimap: %s" % data["name"][-1])
    return _parse_qualimap_metrics(report_file)

def _run_coverage_qc(bam_file, data, out_dir):
    """Run coverage QC analysis"""
    out = dict()
    samtools_stats_dir = os.path.join(out_dir, os.path.pardir, out_dir)
    from bcbio.qc import samtools
    samtools_stats = samtools.run(bam_file, data, samtools_stats_dir)

    if "Total_reads" not in samtools_stats:
        return
    out["Total_reads"] = total_reads = int(samtools_stats["Total_reads"])
    if not total_reads:
        return
    if "Mapped_reads_raw" not in samtools_stats or "Mapped_reads" not in samtools_stats:
        return
    out["Mapped_reads"] = mapped = int(samtools_stats["Mapped_reads"])
    out["Mapped_reads_pct"] = 100.0 * mapped / total_reads
    if not mapped:
        return out
    if "Duplicates" in samtools_stats:
        out['Duplicates'] = dups = int(samtools_stats["Duplicates"])
        out['Duplicates_pct'] = 100.0 * dups / int(samtools_stats["Mapped_reads_raw"])
    else:
        dups = 0

    if dd.get_coverage(data) and dd.get_coverage(data) not in ["None"]:
        cov_bed_file = bedutils.clean_file(dd.get_coverage(data), data, prefix="cov-", simple=True)
        merged_bed_file = bedutils.merge_overlaps(cov_bed_file, data)
        target_name = "coverage"
    elif dd.get_coverage_interval(data) != "genome":
        merged_bed_file = dd.get_variant_regions_merged(data)
        target_name = "variant_regions"
    else:
        merged_bed_file = None
        target_name = "genome"

    # Whole genome runs do not need detailed on-target calculations, use total unique mapped
    if dd.get_coverage_interval(data) == "genome":
        mapped_unique = mapped - dups
    else:
        out['Mapped_unique_reads'] = mapped_unique = sambamba.number_of_mapped_reads(
            data, bam_file, keep_dups=False)

    if merged_bed_file:
        ontarget = sambamba.number_of_mapped_reads(
            data, bam_file, keep_dups=False, bed_file=merged_bed_file, target_name=target_name)
        if mapped_unique:
            out["Ontarget_unique_reads"] = ontarget
            out["Ontarget_pct"] = 100.0 * ontarget / mapped_unique
            out['Offtarget_pct'] = 100.0 * (mapped_unique - ontarget) / mapped_unique
            if dd.get_coverage_interval(data) != "genome":
                # Skip padded calculation for WGS even if the "coverage" file is specified;
                # the padded statistic only makes sense for exomes and panels
                padded_bed_file = bedutils.get_padded_bed_file(merged_bed_file, 200, data)
                ontarget_padded = sambamba.number_of_mapped_reads(
                    data, bam_file, keep_dups=False, bed_file=padded_bed_file,
                    target_name=target_name + "_padded")
                out["Ontarget_padded_pct"] = 100.0 * ontarget_padded / mapped_unique
        if total_reads:
            out['Usable_pct'] = 100.0 * ontarget / total_reads

    avg_depth = cov.get_average_coverage(data, bam_file, merged_bed_file, target_name)
    out['Avg_coverage'] = avg_depth

    region_coverage_file = cov.coverage_region_detailed_stats(
        data, out_dir, extra_cutoffs=set([max(1, int(avg_depth * 0.8))]))
    return out

def _run_coverage_qc(bam_file, data, out_dir):
    """Run coverage QC analysis"""
    out = dict()
    samtools_stats_dir = os.path.join(out_dir, os.path.pardir, out_dir)
    from bcbio.qc import samtools
    samtools_stats = samtools.run(bam_file, data, samtools_stats_dir)

    if "Total_reads" not in samtools_stats:
        return
    out["Total_reads"] = total_reads = int(samtools_stats["Total_reads"])
    if not total_reads:
        return
    if "Mapped_reads_raw" not in samtools_stats or "Mapped_reads" not in samtools_stats:
        return
    out["Mapped_reads"] = mapped = int(samtools_stats["Mapped_reads"])
    out["Mapped_reads_pct"] = 100.0 * mapped / total_reads
    if not mapped:
        return out
    if "Duplicates" in samtools_stats:
        out['Duplicates'] = dups = int(samtools_stats["Duplicates"])
        out['Duplicates_pct'] = 100.0 * dups / int(samtools_stats["Mapped_reads_raw"])
    else:
        dups = 0

    if dd.get_coverage(data):
        cov_bed_file = bedutils.clean_file(dd.get_coverage(data), data, prefix="cov-", simple=True)
        merged_bed_file = bedutils.merge_overlaps(cov_bed_file, data)
        target_name = "coverage"
    elif dd.get_coverage_interval(data) != "genome":
        merged_bed_file = dd.get_variant_regions_merged(data)
        target_name = "variant_regions"
    else:
        merged_bed_file = None
        target_name = "genome"

    # Whole genome runs do not need detailed on-target calculations, use total unique mapped
    if dd.get_coverage_interval(data) == "genome":
        mapped_unique = mapped - dups
    else:
        out['Mapped_unique_reads'] = mapped_unique = sambamba.number_of_mapped_reads(
            data, bam_file, keep_dups=False)

    if merged_bed_file:
        ontarget = sambamba.number_of_mapped_reads(
            data, bam_file, keep_dups=False, bed_file=merged_bed_file, target_name=target_name)
        if mapped_unique:
            out["Ontarget_unique_reads"] = ontarget
            out["Ontarget_pct"] = 100.0 * ontarget / mapped_unique
            out['Offtarget_pct'] = 100.0 * (mapped_unique - ontarget) / mapped_unique
            if dd.get_coverage_interval(data) != "genome":
                # Skip padded calculation for WGS even if the "coverage" file is specified;
                # the padded statistic only makes sense for exomes and panels
                padded_bed_file = bedutils.get_padded_bed_file(merged_bed_file, 200, data)
                ontarget_padded = sambamba.number_of_mapped_reads(
                    data, bam_file, keep_dups=False, bed_file=padded_bed_file,
                    target_name=target_name + "_padded")
                out["Ontarget_padded_pct"] = 100.0 * ontarget_padded / mapped_unique
        if total_reads:
            out['Usable_pct'] = 100.0 * ontarget / total_reads

    avg_depth = cov.get_average_coverage(data, bam_file, merged_bed_file, target_name)
    out['Avg_coverage'] = avg_depth

    region_coverage_file = cov.coverage_region_detailed_stats(
        data, out_dir, extra_cutoffs=set([max(1, int(avg_depth * 0.8))]))
    return out
