def annotate_nongatk_vcf(orig_file, bam_files, dbsnp_file, ref_file, data, out_file=None):
    """Annotate a VCF file with dbSNP and standard GATK called annotations.

    The input VCF is bgzipped and indexed first.  When no GATK runner is
    usable (no runner, runner without GATK, or a runner whose type is
    "gatk4"), only ``add_dbsnp`` is applied, or the bgzipped input is
    returned as-is if there is no dbSNP file.  Otherwise GATK
    VariantAnnotator is run into ``out_file`` (defaulting to
    ``"%s-gatkann%s" % utils.splitext_plus(orig_file)``) unless that file
    already exists, and ``out_file`` is bgzipped/indexed and returned.
    """
    config = data["config"]
    orig_file = vcfutils.bgzip_and_index(orig_file, config)
    runner = broad.runner_from_config_safe(config)
    can_annotate = runner and runner.has_gatk() and runner.gatk_type() != "gatk4"
    if not can_annotate:
        return add_dbsnp(orig_file, dbsnp_file, data, out_file) if dbsnp_file else orig_file

    if out_file is None:
        out_file = "%s-gatkann%s" % utils.splitext_plus(orig_file)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            # Avoid issues with incorrectly created empty GATK index files.
            # Occurs when GATK cannot lock shared dbSNP database on previous run
            stale_idx = orig_file + ".idx"
            if os.path.exists(stale_idx) and not utils.file_exists(stale_idx):
                os.remove(stale_idx)
            annotations = get_gatk_annotations(config, include_depth=False)
            params = [
                "-T", "VariantAnnotator",
                "-R", ref_file,
                "--variant", orig_file,
                "--out", tx_out_file,
                "-L", orig_file,
            ]
            if dbsnp_file:
                params.extend(["--dbsnp", dbsnp_file])
            for bam_file in bam_files:
                params.extend(["-I", bam_file])
            for annotation in annotations:
                params.extend(["-A", annotation])
            misencoded_flags = ("--allow_potentially_misencoded_quality_scores",
                                "-allowPotentiallyMisencodedQuals")
            if not any(flag in params for flag in misencoded_flags):
                params.append("--allow_potentially_misencoded_quality_scores")
            # Be less stringent about BAM and VCF files (esp. N in CIGAR for RNA-seq).
            # Start by removing existing -U or --unsafe opts
            # (if another option is added to GATK that starts with -U... this may create a bug).
            for opt in [p for p in params if p.startswith(("-U", "--unsafe"))]:
                position = params.index(opt)
                # A bare flag carries its value in the following element;
                # otherwise flag and value are one string.
                if opt.strip() in ("-U", "--unsafe"):
                    params.pop(position + 1)
                params.pop(position)
            params += ["-U", "ALL"]
            broad.runner_from_config(config).run_gatk(params)
    vcfutils.bgzip_and_index(out_file, config)
    return out_file
def variants(data, out_dir):
    """Variants QC metrics.

    Dumps the result of ``_run_bcftools`` as YAML into
    ``<sample>_bcbio_variants_stats.txt`` inside ``out_dir``.  When the
    sample also has a ``vrn_file``, a coverage file, a BAM file and a GATK
    runner, the variants are annotated with GC content and depth, exported to
    ``<sample>_gc-depth-parse.tsv`` and summarized into
    ``<sample>_bcbio_variants.txt`` by ``_summary_variants``.

    Returns the QC file name (relative to ``out_dir``) when it already exists
    from a previous run; otherwise None.
    NOTE(review): a fresh run that has just written the QC file falls through
    and also returns None -- confirm callers expect this.
    """
    if "variants" not in data:
        return None
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    bcfstats = _run_bcftools(data, work_dir)
    bed_file = dd.get_coverage(data)
    # Bare file names: everything below runs with work_dir as the cwd.
    bcf_out = os.path.join(sample + "_bcbio_variants_stats.txt")
    cg_file = os.path.join(sample + "_with-gc.vcf.gz")
    parse_file = os.path.join(sample + "_gc-depth-parse.tsv")
    qc_file = os.path.join(sample + "_bcbio_variants.txt")
    with chdir(work_dir):
        if not file_exists(bcf_out):
            with open(bcf_out, "w") as out_handle:
                yaml.safe_dump(bcfstats, out_handle, default_flow_style=False, allow_unicode=False)
        if "vrn_file" not in data or not bed_file:
            return None
        in_vcf = data['vrn_file']
        cleaned_bed = clean_file(bed_file, data)
        if file_exists(qc_file):
            return qc_file
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        ref_file = dd.get_ref_file(data)
        assert ref_file, "Need the reference genome fasta file."
        # NOTE(review): GATK is restricted to the cleaned coverage file while
        # bcftools query uses the variant regions -- confirm this is intended.
        regions_bed = dd.get_variant_regions(data)
        broad_runner = broad.runner_from_config_safe(data["config"])
        if in_bam and broad_runner and broad_runner.has_gatk():
            if not file_exists(parse_file):
                with file_transaction(cg_file) as tx_out:
                    params = ["-T", "VariantAnnotator",
                              "-R", ref_file,
                              "-L", cleaned_bed,
                              "-I", in_bam,
                              "-A", "GCContent",
                              "-A", "Coverage",
                              "--variant", in_vcf,
                              "--out", tx_out]
                    broad_runner.run_gatk(params)
                cg_file = vcfutils.bgzip_and_index(cg_file, data["config"])
            if not file_exists(parse_file):
                with file_transaction(parse_file) as out_tx:
                    with open(out_tx, 'w') as out_handle:
                        # write() works on Python 2 and 3; the former
                        # "print >>handle" form fails at runtime on Python 3.
                        out_handle.write("CG\tdepth\tsample\n")
                    cmd = ("bcftools query -s {sample} -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R "
                           "{bed_file} {cg_file} >> {out_tx}")
                    # Explicit keywords instead of **locals() so the command
                    # does not silently depend on local variable names.
                    do.run(cmd.format(sample=sample, bed_file=regions_bed,
                                      cg_file=cg_file, out_tx=out_tx),
                           "Calculating GC content and depth for %s" % in_vcf)
                    logger.debug('parsing coverage: %s' % sample)
            if not file_exists(qc_file):
                # This file will be copied to final
                _summary_variants(parse_file, qc_file)
            if file_exists(qc_file) and file_exists(parse_file):
                # The annotated VCF is only an intermediate; drop it once
                # both derived files are in place.
                remove_plus(cg_file)
def variants(data, out_dir):
    """Variants QC metrics.

    Writes the ``_run_bcftools`` result as YAML to
    ``<sample>_bcbio_variants_stats.txt`` in ``out_dir``.  If the sample has
    a ``vrn_file``, a coverage file, a BAM file and a GATK runner, GATK
    VariantAnnotator adds GC content and coverage annotations, bcftools
    exports them to ``<sample>_gc-depth-parse.tsv`` and
    ``_summary_variants`` turns that table into
    ``<sample>_bcbio_variants.txt``.

    Returns the QC file name (relative to ``out_dir``) if it was already
    present; otherwise None.
    NOTE(review): when the QC file is created during this call the function
    still returns None -- verify against callers.
    """
    if "variants" not in data:
        return None
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    bcfstats = _run_bcftools(data, work_dir)
    bed_file = dd.get_coverage(data)
    # Names are relative because the work happens inside chdir(work_dir).
    bcf_out = os.path.join(sample + "_bcbio_variants_stats.txt")
    cg_file = os.path.join(sample + "_with-gc.vcf.gz")
    parse_file = os.path.join(sample + "_gc-depth-parse.tsv")
    qc_file = os.path.join(sample + "_bcbio_variants.txt")
    with chdir(work_dir):
        if not file_exists(bcf_out):
            with open(bcf_out, "w") as out_handle:
                yaml.safe_dump(bcfstats, out_handle, default_flow_style=False, allow_unicode=False)
        if "vrn_file" not in data or not bed_file:
            return None
        in_vcf = data['vrn_file']
        cleaned_bed = clean_file(bed_file, data)
        if file_exists(qc_file):
            return qc_file
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        ref_file = dd.get_ref_file(data)
        assert ref_file, "Need the reference genome fasta file."
        # NOTE(review): bcftools is given the variant regions, not the
        # cleaned coverage file passed to GATK -- confirm this is intended.
        variant_regions = dd.get_variant_regions(data)
        broad_runner = broad.runner_from_config_safe(data["config"])
        if in_bam and broad_runner and broad_runner.has_gatk():
            if not file_exists(parse_file):
                with file_transaction(cg_file) as tx_out:
                    params = ["-T", "VariantAnnotator",
                              "-R", ref_file,
                              "-L", cleaned_bed,
                              "-I", in_bam,
                              "-A", "GCContent",
                              "-A", "Coverage",
                              "--variant", in_vcf,
                              "--out", tx_out]
                    broad_runner.run_gatk(params)
                cg_file = vcfutils.bgzip_and_index(cg_file, data["config"])
            if not file_exists(parse_file):
                with file_transaction(parse_file) as out_tx:
                    with open(out_tx, 'w') as out_handle:
                        # Portable header write: "print >>handle" is
                        # Python 2-only and raises TypeError on Python 3.
                        out_handle.write("CG\tdepth\tsample\n")
                    cmd = ("bcftools query -s {sample} -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R "
                           "{bed_file} {cg_file} >> {out_tx}")
                    # Pass the placeholders explicitly rather than via locals().
                    do.run(cmd.format(sample=sample, bed_file=variant_regions,
                                      cg_file=cg_file, out_tx=out_tx),
                           "Calculating GC content and depth for %s" % in_vcf)
                    logger.debug('parsing coverage: %s' % sample)
            if not file_exists(qc_file):
                # This file will be copied to final
                _summary_variants(parse_file, qc_file)
            if file_exists(qc_file) and file_exists(parse_file):
                # Intermediate annotated VCF is no longer needed.
                remove_plus(cg_file)
def variants(data):
    """Annotate the sample's variants with GC content and depth.

    Works inside ``<work_dir>/report/variants``.  When the sample has a
    ``vrn_file``, coverage configured, a BAM file and a GATK runner, GATK
    VariantAnnotator writes ``<sample>_with-gc.vcf.gz`` and bcftools exports
    its GC/DP/sample columns to ``<sample>_gc-depth-parse.tsv``.  Existing
    output files are not regenerated.

    Always returns ``data`` unchanged.
    """
    if "vrn_file" not in data:
        return data
    if not dd.get_coverage(data):
        return data
    in_vcf = data["vrn_file"]
    work_dir = os.path.join(dd.get_work_dir(data), "report", "variants")
    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        ref_file = dd.get_ref_file(data)
        assert ref_file, "Need the reference genome fasta file."
        bed_file = dd.get_variant_regions(data)
        sample = dd.get_sample_name(data)
        # Bare file names: resolved against work_dir, the current directory.
        cg_file = os.path.join(sample + "_with-gc.vcf.gz")
        parse_file = os.path.join(sample + "_gc-depth-parse.tsv")
        broad_runner = broad.runner_from_config_safe(data["config"])
        if in_bam and broad_runner and broad_runner.has_gatk():
            if not file_exists(cg_file):
                with file_transaction(cg_file) as tx_out:
                    params = [
                        "-T", "VariantAnnotator",
                        "-R", ref_file,
                        "-L", bed_file,
                        "-I", in_bam,
                        "-A", "GCContent",
                        "-A", "Coverage",
                        "--variant", in_vcf,
                        "--out", tx_out,
                    ]
                    broad_runner.run_gatk(params)
                cg_file = vcfutils.bgzip_and_index(cg_file, data["config"])
            if not file_exists(parse_file):
                with file_transaction(parse_file) as out_tx:
                    with open(out_tx, "w") as out_handle:
                        # write() works on Python 2 and 3; the former
                        # "print >> handle" form fails at runtime on Python 3.
                        out_handle.write("CG\tdepth\tsample\n")
                    cmd = (
                        "bcftools query -s {sample} -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R "
                        "{bed_file} {cg_file} >> {out_tx}"
                    )
                    # Explicit keywords instead of **locals() so the command
                    # does not silently depend on local variable names.
                    do.run(
                        cmd.format(sample=sample, bed_file=bed_file,
                                   cg_file=cg_file, out_tx=out_tx),
                        "Calculating GC content and depth for %s" % in_vcf,
                    )
                    logger.debug("parsing coverage: %s" % sample)
    return data
def annotate_nongatk_vcf(orig_file, bam_files, dbsnp_file, ref_file, data, out_file=None):
    """Annotate a VCF file with dbSNP and standard GATK called annotations.

    The input is bgzipped and indexed before anything else.  If there is no
    runner, the runner lacks GATK, or its type is "gatk4", the function falls
    back to ``add_dbsnp`` (or returns the bgzipped input when no dbSNP file
    is given).  Otherwise it runs GATK VariantAnnotator, passing the input VCF
    as both ``--variant`` and ``-L``, and returns the bgzipped/indexed
    ``out_file``.  An existing ``out_file`` is reused without re-running GATK.
    """
    config = data["config"]
    orig_file = vcfutils.bgzip_and_index(orig_file, config)
    gatk_runner = broad.runner_from_config_safe(config)
    gatk_ok = gatk_runner and gatk_runner.has_gatk() and gatk_runner.gatk_type() != "gatk4"
    if not gatk_ok:
        if not dbsnp_file:
            return orig_file
        return add_dbsnp(orig_file, dbsnp_file, data, out_file)

    if out_file is None:
        out_file = "%s-gatkann%s" % utils.splitext_plus(orig_file)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            # Avoid issues with incorrectly created empty GATK index files.
            # Occurs when GATK cannot lock shared dbSNP database on previous run
            index_path = orig_file + ".idx"
            if os.path.exists(index_path) and not utils.file_exists(index_path):
                os.remove(index_path)
            annotations = get_gatk_annotations(config, include_depth=False)
            params = [
                "-T", "VariantAnnotator",
                "-R", ref_file,
                "--variant", orig_file,
                "--out", tx_out_file,
                "-L", orig_file,
            ]
            if dbsnp_file:
                params += ["--dbsnp", dbsnp_file]
            for bam_path in bam_files:
                params += ["-I", bam_path]
            for name in annotations:
                params += ["-A", name]
            long_flag = "--allow_potentially_misencoded_quality_scores"
            if long_flag not in params and "-allowPotentiallyMisencodedQuals" not in params:
                params.append(long_flag)
            # Be less stringent about BAM and VCF files (esp. N in CIGAR for RNA-seq).
            # Start by removing existing -U or --unsafe opts
            # (if another option is added to GATK that starts with -U... this may create a bug).
            existing_unsafe = [p for p in params if p.startswith(("-U", "--unsafe"))]
            for unsafe_opt in existing_unsafe:
                where = params.index(unsafe_opt)
                # Flag given on its own: its value is the next element and
                # must be dropped too.
                if unsafe_opt.strip() in ("-U", "--unsafe"):
                    params.pop(where + 1)
                params.pop(where)
            params.extend(["-U", "ALL"])
            broad.runner_from_config(config).run_gatk(params)
    vcfutils.bgzip_and_index(out_file, config)
    return out_file
def variants(data):
    """Build the GC-content/depth QC summary for the sample's variants.

    Works inside ``<work_dir>/report/variants``.  When the sample has a
    ``vrn_file``, coverage configured, a BAM file and a GATK runner, GATK
    VariantAnnotator writes ``<sample>_with-gc.vcf.gz``, bcftools exports its
    GC/DP/sample columns to ``<sample>_gc-depth-parse.tsv`` and
    ``_summary_variants`` turns that table into
    ``<sample>_bcbio_variants.txt``.  Nothing is recomputed when the QC file
    already exists.

    Always returns ``data`` unchanged.
    """
    if "vrn_file" not in data:
        return data
    if not dd.get_coverage(data):
        return data
    in_vcf = data['vrn_file']
    sample = dd.get_sample_name(data)
    # Bare file names: resolved against work_dir once inside chdir().
    cg_file = os.path.join(sample + "_with-gc.vcf.gz")
    parse_file = os.path.join(sample + "_gc-depth-parse.tsv")
    qc_file = os.path.join(sample + "_bcbio_variants.txt")
    work_dir = os.path.join(dd.get_work_dir(data), "report", "variants")
    with chdir(work_dir):
        if file_exists(qc_file):
            return data
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        ref_file = dd.get_ref_file(data)
        assert ref_file, "Need the reference genome fasta file."
        bed_file = dd.get_variant_regions(data)
        broad_runner = broad.runner_from_config_safe(data["config"])
        if in_bam and broad_runner and broad_runner.has_gatk():
            if not file_exists(cg_file):
                with file_transaction(cg_file) as tx_out:
                    params = ["-T", "VariantAnnotator",
                              "-R", ref_file,
                              "-L", bed_file,
                              "-I", in_bam,
                              "-A", "GCContent",
                              "-A", "Coverage",
                              "--variant", in_vcf,
                              "--out", tx_out]
                    broad_runner.run_gatk(params)
                cg_file = vcfutils.bgzip_and_index(cg_file, data["config"])
            if not file_exists(parse_file):
                with file_transaction(parse_file) as out_tx:
                    with open(out_tx, 'w') as out_handle:
                        # write() works on Python 2 and 3; the former
                        # "print >> handle" form fails at runtime on Python 3.
                        out_handle.write("CG\tdepth\tsample\n")
                    cmd = ("bcftools query -s {sample} -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R "
                           "{bed_file} {cg_file} >> {out_tx}")
                    # Explicit keywords instead of **locals() so the command
                    # does not silently depend on local variable names.
                    do.run(cmd.format(sample=sample, bed_file=bed_file,
                                      cg_file=cg_file, out_tx=out_tx),
                           "Calculating GC content and depth for %s" % in_vcf)
                    logger.debug('parsing coverage: %s' % sample)
            if not file_exists(qc_file):
                # This file will be copied to final
                _summary_variants(parse_file, qc_file)
            if file_exists(qc_file) and file_exists(parse_file):
                # The annotated VCF is only an intermediate; delete it once
                # both derived files exist.
                os.remove(cg_file)
    return data