def run(call_file, ref_file, vrn_files, data):
    """Filter variant calls, via VQSR when configured and cutoffs otherwise.

    With VQSR enabled, the calls are split into SNPs and indels, each subset
    goes through ``_variant_filtration`` and the two results are combined
    into ``<common prefix>combined.vcf.gz``. Without VQSR, the GATK SNP and
    indel cutoff filters are applied one after the other to the call file.

    Raises:
        ValueError: VQSR is requested while ``gvcf`` is in tools_on and no
            joint caller is configured.
    """
    # One copy of the algorithm settings per entry in data["vrn_files"]
    # (a single copy when that key is missing).
    algorithms = [data["config"]["algorithm"]] * len(data.get("vrn_files", [1]))

    if not config_utils.use_vqsr(algorithms):
        # Cutoff-based filtering: the indel filter runs on the SNP filter output.
        after_snps = vfilter.gatk_snp_cutoff(call_file, data)
        return vfilter.gatk_indel_cutoff(after_snps, data)

    if "gvcf" in dd.get_tools_on(data) and not dd.get_jointcaller(data):
        raise ValueError(
            "Cannot force gVCF output with joint calling using tools_on: [gvcf] and use VQSR. "
            "Try using cutoff-based soft filtering with tools_off: [vqsr]")

    snp_calls, indel_calls = vcfutils.split_snps_indels(call_file, ref_file, data["config"])
    filtered = [
        _variant_filtration(snp_calls, ref_file, vrn_files, data, "SNP",
                            vfilter.gatk_snp_cutoff),
        _variant_filtration(indel_calls, ref_file, vrn_files, data, "INDEL",
                            vfilter.gatk_indel_cutoff),
    ]
    merged_name = "%scombined.vcf.gz" % os.path.commonprefix(filtered)
    return vcfutils.combine_variant_files(filtered, merged_name, ref_file, data["config"])
def run_jointvc(items):
    """Joint call one region across the supplied per-sample inputs.

    The first item is used as the representative sample. When it has no
    joint caller configured, ``<variantcaller>-joint`` is written into its
    algorithm configuration. The squared-off VCF for the region is stored
    under ``vrn_file_region`` on that sample, which is returned.
    """
    items = [utils.to_single_data(item) for item in items]
    data = items[0]
    if not dd.get_jointcaller(data):
        algorithm = data["config"]["algorithm"]
        algorithm["jointcaller"] = "%s-joint" % dd.get_variantcaller(data)

    # GenomicsDBImport uses 1-based coordinates, so shift the region start.
    chrom, span = data["region"].split(":")
    start, end = span.split("-")
    ready_region = "{}:{}-{}".format(chrom, int(start) + 1, end)
    str_region = ready_region.replace(":", "_")

    # Name the output after the first batch, or the sample when unbatched.
    batches = dd.get_batches(data) or dd.get_sample_name(data)
    if not isinstance(batches, (list, tuple)):
        batches = [batches]

    region_dir = utils.safe_makedir(
        os.path.join(dd.get_work_dir(data), "joint", dd.get_variantcaller(data), str_region))
    vcf_name = "%s-%s-%s.vcf.gz" % (batches[0], dd.get_variantcaller(data), str_region)
    out_file = os.path.join(region_dir, vcf_name)

    input_vcfs = [item["vrn_file"] for item in items]
    squared = square_batch_region(data, ready_region, [], input_vcfs, out_file)
    data["vrn_file_region"] = squared[0]["vrn_file"]
    return data
def _bcftools_stats(data, out_dir, vcf_file_key=None, germline=False):
    """Run ``bcftools stats`` on the sample's active variant calls.

    The VCF is read from the active variant call information under
    ``vcf_file_key`` (``"vrn_file"`` when not given). Output goes to
    ``<out_dir>/<sample>_bcftools_stats[_germline].txt``; the last column of
    every ``ID`` line is replaced with the sample name. An existing output
    file is reused.

    Returns the stats file path, or None when there is no active variant
    call information.
    """
    vcinfo = get_active_vcinfo(data)
    if not vcinfo:
        return None

    out_dir = utils.safe_makedir(out_dir)
    vcf_file = vcinfo[vcf_file_key or "vrn_file"]
    # Only restrict to PASS/unfiltered records outside joint/gVCF setups.
    unfiltered = dd.get_jointcaller(data) or "gvcf" in dd.get_tools_on(data)
    opts = "" if unfiltered else "-f PASS,."
    name = dd.get_sample_name(data)
    suffix = "_germline" if germline else ""
    out_file = os.path.join(out_dir, "%s_bcftools_stats%s.txt" % (name, suffix))
    bcftools = config_utils.get_program("bcftools", data["config"])

    if utils.file_exists(out_file):
        return out_file

    with file_transaction(data, out_file) as tx_out_file:
        # Raw bcftools output lands next to the transactional file.
        raw_file = os.path.join(os.path.dirname(tx_out_file),
                                "orig_%s" % os.path.basename(tx_out_file))
        cmd = "{bcftools} stats -s {name} {opts} {vcf_file} > {orig_out_file}".format(
            bcftools=bcftools, name=name, opts=opts, vcf_file=vcf_file,
            orig_out_file=raw_file)
        do.run(cmd, "bcftools stats %s" % name)
        with open(raw_file) as in_handle, open(tx_out_file, "w") as out_handle:
            for line in in_handle:
                if line.startswith("ID\t"):
                    fields = line.split("\t")
                    fields[-1] = "%s\n" % name
                    line = "\t".join(fields)
                out_handle.write(line)
    return out_file
def rnaseq_variant_calling(samples, run_parallel):
    """Run RNA-seq variant calling, with optional joint calling and filtering.

    Calling is always dispatched through ``run_parallel``. When the first
    sample's joint caller includes ``gatk-haplotype-joint``, calls are
    squared off and split back per sample. Annotation/filtering runs when a
    variant caller or joint caller is configured. For joint runs, each
    sample then gets a ``variants`` entry holding its original VCF and the
    population VCF, and ``vrn_file`` is reset to the original file.
    """
    samples = run_parallel("run_rnaseq_variant_calling", samples)
    variantcaller = dd.get_variantcaller(to_single_data(samples[0]))
    jointcaller = dd.get_jointcaller(to_single_data(samples[0]))
    is_gatk_joint = bool(jointcaller and 'gatk-haplotype-joint' in jointcaller)

    if is_gatk_joint:
        per_sample = []
        for squared in joint.square_off(samples, run_parallel):
            for split in multi.split_variants_by_sample(to_single_data(squared)):
                per_sample.append([to_single_data(split)])
        samples = per_sample

    if variantcaller or jointcaller:
        samples = run_parallel("run_rnaseq_ann_filter", samples)

    if is_gatk_joint:
        finalized = []
        for wrapped in samples:
            data = to_single_data(wrapped)
            data.setdefault("variants", []).append({
                "variantcaller": "gatk-haplotype",
                "vcf": data["vrn_file_orig"],
                "population": {"vcf": data["vrn_file"]},
            })
            # Restore the per-sample file as the primary variant file.
            data["vrn_file"] = data.pop("vrn_file_orig")
            finalized.append([data])
        samples = finalized
    return samples
def _bcftools_stats(data, out_dir, vcf_file_key=None, germline=False):
    """Produce a ``bcftools stats`` report for the sample's active calls.

    Reads the VCF stored under ``vcf_file_key`` (default ``"vrn_file"``) in
    the active variant call information and writes
    ``<out_dir>/<sample>_bcftools_stats[_germline].txt``. The final column
    of each ``ID`` line in the report is overwritten with the sample name.
    Nothing is re-run when the report already exists.

    Returns the report path, or None when no active variant call
    information is available.
    """
    vcinfo = get_active_vcinfo(data)
    if not vcinfo:
        return None

    out_dir = utils.safe_makedir(out_dir)
    vcf_file = vcinfo[vcf_file_key or "vrn_file"]
    if dd.get_jointcaller(data) or "gvcf" in dd.get_tools_on(data):
        filter_opts = ""
    else:
        # Keep only PASS and unfiltered records.
        filter_opts = "-f PASS,."
    sample_name = dd.get_sample_name(data)
    report_name = "%s_bcftools_stats%s.txt" % (sample_name, ("_germline" if germline else ""))
    out_file = os.path.join(out_dir, report_name)
    bcftools = config_utils.get_program("bcftools", data["config"])

    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            tx_dir, tx_base = os.path.split(tx_out_file)
            orig_out_file = os.path.join(os.path.dirname(tx_out_file), "orig_%s" % tx_base)
            template = "{bcftools} stats -s {name} {opts} {vcf_file} > {orig_out_file}"
            do.run(template.format(bcftools=bcftools, name=sample_name, opts=filter_opts,
                                   vcf_file=vcf_file, orig_out_file=orig_out_file),
                   "bcftools stats %s" % sample_name)
            with open(orig_out_file) as in_handle:
                with open(tx_out_file, "w") as out_handle:
                    for line in in_handle:
                        if line.startswith("ID\t"):
                            # Swap the trailing column for the sample name.
                            leading = line.split("\t")[:-1]
                            line = "\t".join(leading + ["%s\n" % sample_name])
                        out_handle.write(line)
    return out_file
def _variant_checkpoints(samples):
    """Work out which analysis steps the sample configurations call for.

    Returns a dict of booleans; a step is enabled when at least one sample
    requests it:

    - ``vc``: a variant caller is configured
    - ``sv``: a structural variant caller is configured
    - ``jointvc``: a joint caller is configured or ``gvcf`` is in tools_on
    - ``hla``: an HLA caller is configured
    """
    def _joint_requested(sample):
        return dd.get_jointcaller(sample) or ("gvcf" in dd.get_tools_on(sample))

    return {
        "vc": any([dd.get_variantcaller(s) for s in samples]),
        "sv": any([dd.get_svcaller(s) for s in samples]),
        "jointvc": any([_joint_requested(s) for s in samples]),
        "hla": any([dd.get_hlacaller(s) for s in samples]),
    }
def _variant_checkpoints(samples):
    """Work out which analysis steps the sample configurations call for.

    Returns a dict of booleans keyed by step name:

    - ``vc`` / ``sv`` / ``hla``: some sample configures that caller
    - ``jointvc``: some sample has a batch and either a joint caller or
      ``gvcf`` in tools_on
    - ``align``: some sample configures an aligner or BAM cleaning
    - ``align_split``: some sample has an aligner and an alignment split
      size that is not ``False``
    """
    def _any_sample(check):
        # Evaluate the check on every sample before reducing.
        return any([check(sample) for sample in samples])

    def _joint_in_batch(sample):
        joint = dd.get_jointcaller(sample) or ("gvcf" in dd.get_tools_on(sample))
        return joint and dd.get_batch(sample)

    def _splits_alignment(sample):
        return dd.get_align_split_size(sample) is not False and dd.get_aligner(sample)

    checkpoints = {}
    checkpoints["vc"] = _any_sample(dd.get_variantcaller)
    checkpoints["sv"] = _any_sample(dd.get_svcaller)
    checkpoints["jointvc"] = _any_sample(_joint_in_batch)
    checkpoints["hla"] = _any_sample(dd.get_hlacaller)
    checkpoints["align"] = _any_sample(lambda s: dd.get_aligner(s) or dd.get_bam_clean(s))
    checkpoints["align_split"] = _any_sample(_splits_alignment)
    return checkpoints
def run_jointvc(items):
    """Joint call one region across the supplied per-sample inputs.

    The first item acts as the representative sample. When it has no joint
    caller configured, ``<variantcaller>-joint`` is written into its
    algorithm configuration.

    Args:
        items: per-sample inputs; each is passed through
            ``utils.to_single_data`` and must provide ``vrn_file``. The
            first must also provide ``region`` as ``chrom:start-end``.

    Returns:
        The first sample, with ``vrn_file_region`` set to the VCF produced
        by ``square_batch_region`` for the region.
    """
    items = [utils.to_single_data(x) for x in items]
    data = items[0]
    if not dd.get_jointcaller(data):
        data["config"]["algorithm"]["jointcaller"] = "%s-joint" % dd.get_variantcaller(data)
    # GenomicsDBImport uses 1-based coordinates. That's unexpected, convert over to these.
    chrom, coords = data["region"].split(":")
    start, end = coords.split("-")
    ready_region = "%s:%s-%s" % (chrom, int(start) + 1, end)
    str_region = ready_region.replace(":", "_")
    # Samples without a batch fall back to the sample name, and a single
    # batch value is wrapped, so indexing below is always safe.
    batches = dd.get_batches(data) or dd.get_sample_name(data)
    if not isinstance(batches, (list, tuple)):
        batches = [batches]
    variantcaller = dd.get_variantcaller(data)
    out_dir = utils.safe_makedir(
        os.path.join(dd.get_work_dir(data), "joint", variantcaller, str_region))
    out_file = os.path.join(out_dir, "%s-%s-%s.vcf.gz" % (batches[0], variantcaller, str_region))
    joint_out = square_batch_region(data, ready_region, [],
                                    [d["vrn_file"] for d in items], out_file)[0]
    data["vrn_file_region"] = joint_out["vrn_file"]
    return data
def _variant_checkpoints(samples):
    """Work out which analysis steps the sample configurations call for.

    Returns a dict of booleans keyed by step name:

    - ``vc``: some sample configures a variant caller or carries ``vrn_file``
    - ``sv`` / ``hla``: some sample configures that caller
    - ``jointvc``: some sample has a joint caller or ``gvcf`` in tools_on
    - ``align``: some sample configures an aligner or BAM cleaning
    - ``align_split``: some sample has an aligner and an alignment split
      size that is not ``False``
    - ``umi`` / ``ensemble``: some sample configures UMI consensus /
      ensemble calling
    - ``cancer``: some sample has the ``tumor`` phenotype
    """
    def _flag(check):
        # The check runs for every sample before the results are reduced.
        return any(list(map(check, samples)))

    return {
        "vc": _flag(lambda s: dd.get_variantcaller(s) or s.get("vrn_file")),
        "sv": _flag(dd.get_svcaller),
        "jointvc": _flag(lambda s: dd.get_jointcaller(s) or "gvcf" in dd.get_tools_on(s)),
        "hla": _flag(dd.get_hlacaller),
        "align": _flag(lambda s: dd.get_aligner(s) or dd.get_bam_clean(s)),
        "align_split": not all(list(map(
            lambda s: dd.get_align_split_size(s) is False or not dd.get_aligner(s),
            samples))),
        "umi": _flag(dd.get_umi_consensus),
        "ensemble": _flag(dd.get_ensemble),
        # Stops at the first tumor sample.
        "cancer": any(dd.get_phenotype(s) in ["tumor"] for s in samples),
    }
def _variant_checkpoints(samples):
    """Work out which analysis steps the sample configurations call for.

    Returns a dict of booleans keyed by step name:

    - ``vc``: some sample configures a variant caller or carries ``vrn_file``
    - ``sv`` / ``hla``: some sample configures that caller
    - ``jointvc``: some sample has a joint caller or ``gvcf`` in tools_on
    - ``align``: some sample configures an aligner or BAM cleaning
    - ``align_split``: some sample has an aligner and an alignment split
      size that is not ``False``
    - ``archive`` / ``umi`` / ``ensemble``: some sample configures
      archiving / UMI consensus / ensemble calling
    - ``cancer``: some sample has the ``tumor`` phenotype
    """
    def _has_calls(sample):
        return dd.get_variantcaller(sample) or sample.get("vrn_file")

    def _wants_joint(sample):
        return dd.get_jointcaller(sample) or "gvcf" in dd.get_tools_on(sample)

    def _wants_alignment(sample):
        return dd.get_aligner(sample) or dd.get_bam_clean(sample)

    def _no_align_split(sample):
        return dd.get_align_split_size(sample) is False or not dd.get_aligner(sample)

    # Checks that are evaluated on every sample and enabled if any is truthy.
    any_checks = [
        ("vc", _has_calls),
        ("sv", dd.get_svcaller),
        ("jointvc", _wants_joint),
        ("hla", dd.get_hlacaller),
        ("align", _wants_alignment),
    ]
    checkpoints = {}
    for step, check in any_checks:
        checkpoints[step] = any([check(sample) for sample in samples])
    checkpoints["align_split"] = not all([_no_align_split(sample) for sample in samples])
    for step, check in [("archive", dd.get_archive),
                        ("umi", dd.get_umi_consensus),
                        ("ensemble", dd.get_ensemble)]:
        checkpoints[step] = any([check(sample) for sample in samples])
    # Stops at the first tumor sample.
    checkpoints["cancer"] = any(dd.get_phenotype(sample) in ["tumor"] for sample in samples)
    return checkpoints
def run_rnaseq_ann_filter(data):
    """Annotate and filter the RNA-seq variant calls of one sample.

    When a variant file is present it is passed through effect annotation
    and vcfanno, adopting each result that is produced. Samples whose joint
    caller includes ``gatk-haplotype-joint`` then get the GATK RNA-seq
    filter. Finally variants close to splice junctions are removed.

    Returns the updated sample wrapped as ``[[data]]``.
    """
    sample = to_single_data(data)

    if dd.get_vrn_file(sample):
        effects_vcf = effects.add_to_vcf(dd.get_vrn_file(sample), sample)[0]
        if effects_vcf:
            sample = dd.set_vrn_file(sample, effects_vcf)
        annotated_vcf = population.run_vcfanno(dd.get_vrn_file(sample), sample)
        if annotated_vcf:
            sample = dd.set_vrn_file(sample, annotated_vcf)

    jointcaller = dd.get_jointcaller(sample)
    if jointcaller and 'gatk-haplotype-joint' in jointcaller:
        gatk_filtered = variation.gatk_filter_rnaseq(dd.get_vrn_file(sample), sample)
        sample = dd.set_vrn_file(sample, gatk_filtered)

    # remove variants close to splice junctions
    junction_filtered = variation.filter_junction_variants(dd.get_vrn_file(sample), sample)
    sample = dd.set_vrn_file(sample, junction_filtered)
    return [[sample]]
def is_joint(data):
    """Report whether the sample is set up for joint calling.

    Returns True when ``gvcf`` is in tools_on; otherwise returns the
    configured joint caller value, which is falsy when none is set.
    """
    if "gvcf" in dd.get_tools_on(data):
        return True
    return dd.get_jointcaller(data)
def _is_tolerated_peddy_failure(stderr_lines):
    """Decide whether captured peddy stderr describes a failure to skip over.

    True when any line matches one of the known error signatures, or when
    every line is a "no intervals found for" message (which also holds for
    empty output).
    """
    def allowed_error(line):
        return (("IndexError" in line and "is out of bounds for axis" in line)
                or ("n_components=" in line and "must be between 1 and n_features=" in line)
                or ("n_components=" in line and "must be between 1 and min" in line)
                or "Input contains NaN, infinity or a value too large for dtype" in line
                or "peddy: no hets found for sample" in line
                or "ValueError: need at least one array to concatenate" in line)

    return (any(allowed_error(line) for line in stderr_lines)
            or all("no intervals found for" in line for line in stderr_lines))


def run_peddy(samples, out_dir=None):
    """Run peddy QC on a group of samples and attach the resulting files.

    A VCF is picked from the samples' variant call information: it must
    exist, contain the sample and have non-filtered variants. The run is
    skipped, with the reason written to ``<prefix>-failed.log``, when peddy
    is not installed, ``peddy`` is in tools_off, the sample is not human or
    no suitable VCF is found. It is also skipped when a failure log from an
    earlier attempt exists. Failures whose stderr matches known, tolerated
    peddy errors are recorded in the failure log instead of being raised.

    Args:
        samples: sample data objects; the first supplies batch, work
            directory, genome build and core settings.
        out_dir: optional output directory; defaults to
            ``<work_dir>/qc/<batch>/peddy``.

    Returns:
        ``samples`` unchanged when peddy is skipped, otherwise the result
        of ``dd.set_in_samples`` with the expected peddy files attached as
        summary QC.
    """
    data = samples[0]
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    if isinstance(batch, (list, tuple)):
        batch = batch[0]
    if out_dir:
        peddy_dir = safe_makedir(out_dir)
    else:
        peddy_dir = safe_makedir(os.path.join(dd.get_work_dir(data), "qc", batch, "peddy"))
    peddy_prefix = os.path.join(peddy_dir, batch)
    peddy_report = peddy_prefix + ".html"
    failed_log = peddy_prefix + "-failed.log"

    vcf_file = None
    for d in samples:
        vcinfo = None
        if dd.get_jointcaller(d):
            vcinfo = variant.extract_population_vcinfo(d)
        elif dd.get_phenotype(d) not in ["tumor"]:
            vcinfo = variant.get_active_vcinfo(d, use_ensemble=False)
        if not vcinfo and dd.get_phenotype(d) in ["tumor"]:
            vcinfo = variant.extract_germline_vcinfo(d, peddy_dir)
        if vcinfo:
            for key in ["germline", "vrn_file"]:
                candidate = vcinfo.get(key)
                if (candidate and utils.file_exists(candidate)
                        and dd.get_sample_name(d) in vcfutils.get_samples(candidate)
                        and vcfutils.vcf_has_nonfiltered_variants(candidate)):
                    vcf_file = candidate
                    # Ends the key search for this sample only; a later
                    # sample with a usable VCF still replaces this choice.
                    break

    peddy = (config_utils.get_program("peddy", data)
             if config_utils.program_installed("peddy", data) else None)
    config_skips = any(["peddy" in dd.get_tools_off(d) for d in samples])
    if not peddy or not vcf_file or not vcfanno.is_human(data) or config_skips:
        if not peddy:
            reason = "peddy executable not found"
        elif config_skips:
            reason = "peddy in tools_off configuration"
        elif not vcfanno.is_human(data):
            reason = "sample is not human"
        else:
            assert not vcf_file
            reason = "no suitable VCF files found with the sample and non-filtered variants"
        msg = "Skipping peddy QC, %s: %s" % (reason, [dd.get_sample_name(d) for d in samples])
        with open(failed_log, "w") as out_handle:
            out_handle.write(msg)
        logger.info(msg)
        return samples
    if file_exists(failed_log):
        return samples
    if not file_exists(peddy_report):
        ped_file = create_ped_file(samples, vcf_file, out_dir=out_dir)
        num_cores = dd.get_num_cores(data)
        with tx_tmpdir(data) as tx_dir:
            peddy_prefix_tx = os.path.join(tx_dir, os.path.basename(peddy_prefix))
            # Redirects stderr because incredibly noisy with no intervals found messages from cyvcf2
            stderr_log = os.path.join(tx_dir, "run-stderr.log")
            sites_str = "--sites hg38" if dd.get_genome_build(data) == "hg38" else ""
            locale = utils.locale_export()
            cmd = ("{locale} {peddy} -p {num_cores} {sites_str} --plot --prefix {peddy_prefix_tx} "
                   "{vcf_file} {ped_file} 2> {stderr_log}").format(
                       locale=locale, peddy=peddy, num_cores=num_cores, sites_str=sites_str,
                       peddy_prefix_tx=peddy_prefix_tx, vcf_file=vcf_file, ped_file=ped_file,
                       stderr_log=stderr_log)
            message = "Running peddy on {vcf_file} against {ped_file}.".format(
                vcf_file=vcf_file, ped_file=ped_file)
            try:
                do.run(cmd, message)
            except Exception:
                # Only ordinary errors are inspected; interrupts and
                # interpreter exit propagate untouched.
                with open(stderr_log) as in_handle:
                    # Keep just the tail of the log for classification/reporting.
                    to_show = collections.deque(in_handle, maxlen=100)
                if _is_tolerated_peddy_failure(to_show):
                    logger.info("Skipping peddy because no variants overlap with checks: %s" % batch)
                    with open(failed_log, "w") as out_handle:
                        out_handle.write("peddy did not find overlaps with 1kg sites in VCF, skipping")
                    return samples
                logger.warning("".join(to_show))
                raise
            # Move whichever outputs peddy produced out of the temporary directory.
            for ext in PEDDY_OUT_EXTENSIONS:
                if os.path.exists(peddy_prefix_tx + ext):
                    shutil.move(peddy_prefix_tx + ext, peddy_prefix + ext)
    peddyfiles = expected_peddy_files(peddy_report, batch)
    return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)