def run(calls, data): """Run MetaSV if we have enough supported callers, adding output to the set of calls. """ work_dir = _sv_workdir(data) out_file = os.path.join(work_dir, "variants.vcf.gz") cmd = _get_cmd() + [ "--sample", dd.get_sample_name(data), "--reference", dd.get_ref_file(data), "--bam", dd.get_align_bam(data), "--outdir", work_dir, ] available_callers = 0 for call in calls: if call["variantcaller"] in SUPPORTED: available_callers += 1 cmd += ["--%s_vcf" % call["variantcaller"], call.get("vcf_file", call["vrn_file"])] if available_callers >= MIN_CALLERS: if not utils.file_exists(out_file): tx_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw")) ins_stats = shared.calc_paired_insert_stats_save( dd.get_align_bam(data), os.path.join(tx_work_dir, "insert-stats.yaml") ) cmd += ["--workdir", tx_work_dir, "--num_threads", str(dd.get_num_cores(data))] cmd += ["--spades", utils.which("spades.py"), "--age", utils.which("age_align")] cmd += ["--boost_ins", "--isize_mean", ins_stats["mean"], "--isize_sd", ins_stats["std"]] do.run(cmd, "Combine variant calls with MetaSV") calls.append({"variantcaller": "metasv", "vrn_file": out_file}) return calls
def _run_amber(paired, work_dir, lenient=False): """AMBER: calculate allele frequencies at likely heterozygous sites. lenient flag allows amber runs on small test sets. """ amber_dir = utils.safe_makedir(os.path.join(work_dir, "amber")) out_file = os.path.join(amber_dir, "%s.amber.baf" % dd.get_sample_name(paired.tumor_data)) if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".pcf"): with file_transaction(paired.tumor_data, out_file) as tx_out_file: key = "germline_het_pon" het_bed = tz.get_in(["genome_resources", "variation", key], paired.tumor_data) cmd = ["AMBER"] + _get_jvm_opts(tx_out_file, paired.tumor_data) + \ ["-threads", dd.get_num_cores(paired.tumor_data), "-tumor", dd.get_sample_name(paired.tumor_data), "-tumor_bam", dd.get_align_bam(paired.tumor_data), "-reference", dd.get_sample_name(paired.normal_data), "-reference_bam", dd.get_align_bam(paired.normal_data), "-ref_genome", dd.get_ref_file(paired.tumor_data), "-bed", het_bed, "-output_dir", os.path.dirname(tx_out_file)] if lenient: cmd += ["-max_het_af_percent", "1.0"] try: do.run(cmd, "PURPLE: AMBER baf generation") except subprocess.CalledProcessError as msg: if not lenient and _amber_allowed_errors(str(msg)): return _run_amber(paired, work_dir, True) for f in os.listdir(os.path.dirname(tx_out_file)): if f != os.path.basename(tx_out_file): shutil.move(os.path.join(os.path.dirname(tx_out_file), f), os.path.join(amber_dir, f)) return out_file
def run(calls, data): """Run MetaSV if we have enough supported callers, adding output to the set of calls. """ work_dir = _sv_workdir(data) out_file = os.path.join(work_dir, "variants.vcf.gz") cmd = _get_cmd() + ["--sample", dd.get_sample_name(data), "--reference", dd.get_ref_file(data), "--bam", dd.get_align_bam(data), "--outdir", work_dir] available_callers = 0 for call in calls: if call["variantcaller"] in SUPPORTED: available_callers += 1 cmd += ["--%s_vcf" % call["variantcaller"], call.get("vcf_file", call["vrn_file"])] if available_callers >= MIN_CALLERS: if not utils.file_exists(out_file): tx_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw")) ins_stats = shared.calc_paired_insert_stats_save(dd.get_align_bam(data), os.path.join(tx_work_dir, "insert-stats.yaml")) cmd += ["--workdir", tx_work_dir, "--num_threads", str(dd.get_num_cores(data))] cmd += ["--spades", utils.which("spades.py"), "--age", utils.which("age_align")] cmd += ["--assembly_max_tools=1", "--assembly_pad=500"] cmd += ["--boost_ins", "--isize_mean", ins_stats["mean"], "--isize_sd", ins_stats["std"]] do.run(cmd, "Combine variant calls with MetaSV") filters = ("(NUM_SVTOOLS = 1 && ABS(SVLEN)>10000) || " "(NUM_SVTOOLS = 1 && ABS(SVLEN)<4000 && BA_FLANK_PERCENT>20) || " "(NUM_SVTOOLS = 1 && ABS(SVLEN)<4000 && BA_NUM_GOOD_REC=0) || " "(ABS(SVLEN)<4000 && BA_NUM_GOOD_REC>1)") filter_file = vfilter.hard_w_expression(out_file, filters, data, name="ReassemblyStats", limit_regions=None) calls.append({"variantcaller": "metasv", "vrn_file": filter_file}) return calls
def run(items): """Run MetaSV if we have enough supported callers, adding output to the set of calls. """ assert len(items) == 1, "Expect one input to MetaSV ensemble calling" data = items[0] work_dir = _sv_workdir(data) out_file = os.path.join(work_dir, "variants.vcf.gz") cmd = _get_cmd() + ["--sample", dd.get_sample_name(data), "--reference", dd.get_ref_file(data), "--bam", dd.get_align_bam(data), "--outdir", work_dir] methods = [] for call in data.get("sv", []): if call["variantcaller"] in SUPPORTED and call["variantcaller"] not in methods: methods.append(call["variantcaller"]) cmd += ["--%s_vcf" % call["variantcaller"], call.get("vcf_file", call["vrn_file"])] if len(methods) >= MIN_CALLERS: if not utils.file_exists(out_file): tx_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw")) ins_stats = shared.calc_paired_insert_stats_save(dd.get_align_bam(data), os.path.join(tx_work_dir, "insert-stats.yaml")) cmd += ["--workdir", tx_work_dir, "--num_threads", str(dd.get_num_cores(data))] cmd += ["--spades", utils.which("spades.py"), "--age", utils.which("age_align")] cmd += ["--assembly_max_tools=1", "--assembly_pad=500"] cmd += ["--boost_sc", "--isize_mean", ins_stats["mean"], "--isize_sd", ins_stats["std"]] do.run(cmd, "Combine variant calls with MetaSV") filters = ("(NUM_SVTOOLS = 1 && ABS(SVLEN)>50000) || " "(NUM_SVTOOLS = 1 && ABS(SVLEN)<4000 && BA_FLANK_PERCENT>80) || " "(NUM_SVTOOLS = 1 && ABS(SVLEN)<4000 && BA_NUM_GOOD_REC=0) || " "(ABS(SVLEN)<4000 && BA_NUM_GOOD_REC>2)") filter_file = vfilter.hard_w_expression(out_file, filters, data, name="ReassemblyStats", limit_regions=None) effects_vcf, _ = effects.add_to_vcf(filter_file, data, "snpeff") data["sv"].append({"variantcaller": "metasv", "vrn_file": effects_vcf or filter_file}) return [data]
def variants(data, out_dir): """Variants QC metrics""" if not "variants" in data: return None work_dir = safe_makedir(out_dir) sample = dd.get_sample_name(data) bcfstats = _run_bcftools(data, work_dir) bed_file = dd.get_coverage(data) bcf_out = os.path.join(sample + "_bcbio_variants_stats.txt") cg_file = os.path.join(sample + "_with-gc.vcf.gz") parse_file = os.path.join(sample + "_gc-depth-parse.tsv") qc_file = os.path.join(sample + "_bcbio_variants.txt") with chdir(work_dir): if not file_exists(bcf_out): with open(bcf_out, "w") as out_handle: yaml.safe_dump(bcfstats, out_handle, default_flow_style=False, allow_unicode=False) if "vrn_file" not in data or not bed_file: return None in_vcf = data['vrn_file'] cleaned_bed = clean_file(bed_file, data) if file_exists(qc_file): return qc_file in_bam = dd.get_align_bam(data) or dd.get_work_bam(data) ref_file = dd.get_ref_file(data) assert ref_file, "Need the reference genome fasta file." bed_file = dd.get_variant_regions(data) in_bam = dd.get_align_bam(data) or dd.get_work_bam(data) num_cores = dd.get_num_cores(data) broad_runner = broad.runner_from_config_safe(data["config"]) if in_bam and broad_runner and broad_runner.has_gatk(): if not file_exists(parse_file): with file_transaction(cg_file) as tx_out: params = ["-T", "VariantAnnotator", "-R", ref_file, "-L", cleaned_bed, "-I", in_bam, "-A", "GCContent", "-A", "Coverage", "--variant", in_vcf, "--out", tx_out] broad_runner.run_gatk(params) cg_file = vcfutils.bgzip_and_index(cg_file, data["config"]) if not file_exists(parse_file): with file_transaction(parse_file) as out_tx: with open(out_tx, 'w') as out_handle: print >>out_handle, "CG\tdepth\tsample" cmd = ("bcftools query -s {sample} -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R " "{bed_file} {cg_file} >> {out_tx}") do.run(cmd.format(**locals()), "Calculating GC content and depth for %s" % in_vcf) logger.debug('parsing coverage: %s' % sample) if not file_exists(qc_file): # This files will be copied to final _summary_variants(parse_file, qc_file) if file_exists(qc_file) and file_exists(parse_file): remove_plus(cg_file)
def run(items): """Perform detection of structural variations with lumpy, using bwa-mem alignment. """ if not all(utils.get_in(data, ("config", "algorithm", "aligner")) in ["bwa", "sentieon-bwa", False, None] for data in items): raise ValueError("Require bwa-mem alignment input for lumpy structural variation detection") paired = vcfutils.get_paired_bams([x["align_bam"] for x in items], items) work_dir = _sv_workdir(paired.tumor_data if paired and paired.tumor_data else items[0]) previous_evidence = {} full_bams, sr_bams, disc_bams = [], [], [] for data in items: sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir) full_bams.append(dd.get_align_bam(data)) sr_bams.append(sr_bam) disc_bams.append(disc_bam) cur_dels, cur_dups = _bedpes_from_cnv_caller(data, work_dir) previous_evidence[dd.get_sample_name(data)] = {} if cur_dels and utils.file_exists(cur_dels): previous_evidence[dd.get_sample_name(data)]["dels"] = cur_dels if cur_dups and utils.file_exists(cur_dups): previous_evidence[dd.get_sample_name(data)]["dups"] = cur_dups lumpy_vcf, exclude_file = _run_lumpy(full_bams, sr_bams, disc_bams, previous_evidence, work_dir, items) gt_vcfs = {} for data in items: sample = dd.get_sample_name(data) sample_vcf = vcfutils.select_sample(lumpy_vcf, sample, utils.append_stem(lumpy_vcf, "-%s" % sample), data["config"]) if "bnd-genotype" in dd.get_tools_on(data): gt_vcf = _run_svtyper(sample_vcf, dd.get_align_bam(data), exclude_file, data) else: std_vcf, bnd_vcf = _split_breakends(sample_vcf, data) std_gt_vcf = _run_svtyper(std_vcf, dd.get_align_bam(data), exclude_file, data) gt_vcf = vcfutils.concat_variant_files_bcftools( orig_files=[std_gt_vcf, bnd_vcf], out_file="%s-combined.vcf.gz" % utils.splitext_plus(std_gt_vcf)[0], config=data["config"]) gt_vcfs[dd.get_sample_name(data)] = _filter_by_support(gt_vcf, data) if paired and paired.normal_name: gt_vcfs = _filter_by_background([paired.tumor_name], [paired.normal_name], gt_vcfs, paired.tumor_data) out = [] for data in items: if "sv" not in data: data["sv"] = [] vcf_file = gt_vcfs[dd.get_sample_name(data)] if dd.get_svprioritize(data): effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff") else: effects_vcf = None data["sv"].append({"variantcaller": "lumpy", "vrn_file": effects_vcf or vcf_file, "exclude_file": exclude_file}) out.append(data) return out
def variants(data): if "vrn_file" not in data: return data if not dd.get_coverage(data): return data in_vcf = data["vrn_file"] work_dir = os.path.join(dd.get_work_dir(data), "report", "variants") with chdir(work_dir): in_bam = dd.get_align_bam(data) or dd.get_work_bam(data) ref_file = dd.get_ref_file(data) assert ref_file, "Need the reference genome fasta file." bed_file = dd.get_variant_regions(data) sample = dd.get_sample_name(data) in_bam = dd.get_align_bam(data) or dd.get_work_bam(data) cg_file = os.path.join(sample + "_with-gc.vcf.gz") parse_file = os.path.join(sample + "_gc-depth-parse.tsv") num_cores = dd.get_num_cores(data) broad_runner = broad.runner_from_config_safe(data["config"]) if in_bam and broad_runner and broad_runner.has_gatk(): if not file_exists(cg_file): with file_transaction(cg_file) as tx_out: params = [ "-T", "VariantAnnotator", "-R", ref_file, "-L", bed_file, "-I", in_bam, "-A", "GCContent", "-A", "Coverage", "--variant", in_vcf, "--out", tx_out, ] broad_runner.run_gatk(params) cg_file = vcfutils.bgzip_and_index(cg_file, data["config"]) if not file_exists(parse_file): with file_transaction(parse_file) as out_tx: with open(out_tx, "w") as out_handle: print >> out_handle, "CG\tdepth\tsample" cmd = ( "bcftools query -s {sample} -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R " "{bed_file} {cg_file} >> {out_tx}" ) do.run(cmd.format(**locals()), "Calculating GC content and depth for %s" % in_vcf) logger.debug("parsing coverage: %s" % sample) return data
def run(_, data, out_dir): stats_file = os.path.join(utils.safe_makedir(out_dir), "%s_umi_stats.yaml" % dd.get_sample_name(data)) if not utils.file_uptodate(stats_file, dd.get_align_bam(data)): out = {} total = 0 mapped = 0 duplicates = 0 umi_reductions = [] umi_counts = collections.defaultdict(int) with pysam.AlignmentFile(data["umi_bam"], "rb", check_sq=False) as bam_iter: cur_counts = collections.defaultdict(int) cur_key = None for rec in bam_iter: total += 1 umi = _get_umi_tag(rec) if umi and not rec.is_unmapped: mapped += 1 if rec.is_duplicate: duplicates += 1 chrom = bam_iter.getrname(rec.reference_id) pos = rec.reference_start key = (chrom, pos) if key != cur_key: # update counts if cur_counts: for c in cur_counts.values(): umi_counts[c] += 1 total_seqs = sum(cur_counts.values()) umi_count = len(cur_counts) umi_reductions.append(float(total_seqs) / umi_count) # update current keys cur_key = key cur_counts = collections.defaultdict(int) cur_counts[umi] += 1 if cur_counts: for c in cur_counts.values(): umi_counts[c] += 1 total_seqs = sum(cur_counts.values()) umi_count = len(cur_counts) umi_reductions.append(float(total_seqs) / umi_count) consensus_count = sum([x.aligned for x in bam.idxstats(dd.get_align_bam(data), data)]) out["umi_baseline_all"] = total out["umi_baseline_mapped"] = mapped out["umi_baseline_duplicate_pct"] = float(duplicates) / float(mapped) * 100.0 out["umi_consensus_mapped"] = consensus_count out["umi_consensus_pct"] = (100.0 - float(consensus_count) / float(mapped) * 100.0) out["umi_reduction_median"] = int(math.ceil(np.median(umi_reductions))) out["umi_reduction_max"] = int(max(umi_reductions)) out["umi_counts"] = dict(umi_counts) out["umi_raw_avg_cov"] = data["config"]["algorithm"].get("rawumi_avg_cov", 0) with open(stats_file, "w") as out_handle: yaml.safe_dump({dd.get_sample_name(data): out}, out_handle, default_flow_style=False, allow_unicode=False) return stats_file
def postprocess_variants(items): """Provide post-processing of variant calls: filtering and effects annotation. """ vrn_key = "vrn_file" if not isinstance(items, dict): items = [utils.to_single_data(x) for x in items] if "vrn_file_joint" in items[0]: vrn_key = "vrn_file_joint" data, items = _get_batch_representative(items, vrn_key) items = cwlutils.unpack_tarballs(items, data) data = cwlutils.unpack_tarballs(data, data) cur_name = "%s, %s" % (dd.get_sample_name(data), get_variantcaller(data, require_bam=False)) logger.info("Finalizing variant calls: %s" % cur_name) orig_vrn_file = data.get(vrn_key) data = _symlink_to_workdir(data, [vrn_key]) data = _symlink_to_workdir(data, ["config", "algorithm", "variant_regions"]) if data.get(vrn_key): logger.info("Calculating variation effects for %s" % cur_name) ann_vrn_file, vrn_stats = effects.add_to_vcf(data[vrn_key], data) if ann_vrn_file: data[vrn_key] = ann_vrn_file if vrn_stats: data["vrn_stats"] = vrn_stats orig_items = _get_orig_items(items) logger.info("Annotate VCF file: %s" % cur_name) data[vrn_key] = annotation.finalize_vcf(data[vrn_key], get_variantcaller(data, require_bam=False), orig_items) if cwlutils.is_cwl_run(data): logger.info("Annotate with population level variation data") ann_file = population.run_vcfanno(data[vrn_key], data) if ann_file: data[vrn_key] = ann_file logger.info("Filtering for %s" % cur_name) data[vrn_key] = variant_filtration(data[vrn_key], dd.get_ref_file(data), tz.get_in(("genome_resources", "variation"), data, {}), data, orig_items) logger.info("Prioritization for %s" % cur_name) prio_vrn_file = prioritize.handle_vcf_calls(data[vrn_key], data, orig_items) if prio_vrn_file != data[vrn_key]: data[vrn_key] = prio_vrn_file logger.info("Germline extraction for %s" % cur_name) data = germline.extract(data, orig_items) if dd.get_align_bam(data): data = damage.run_filter(data[vrn_key], dd.get_align_bam(data), dd.get_ref_file(data), data, orig_items) if orig_vrn_file and os.path.samefile(data[vrn_key], orig_vrn_file): data[vrn_key] = orig_vrn_file return [[data]]
def assign_interval(data): """Identify coverage based on percent of genome covered and relation to targets. Classifies coverage into 3 categories: - genome: Full genome coverage - regional: Regional coverage, like exome capture, with off-target reads - amplicon: Amplication based regional coverage without off-target reads """ if not dd.get_coverage_interval(data): vrs = dd.get_variant_regions_merged(data) callable_file = dd.get_sample_callable(data) if vrs: callable_size = pybedtools.BedTool(vrs).total_coverage() else: callable_size = pybedtools.BedTool(callable_file).total_coverage() total_size = sum([c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"])]) genome_cov_pct = callable_size / float(total_size) if genome_cov_pct > GENOME_COV_THRESH: cov_interval = "genome" offtarget_pct = 0.0 elif not vrs: cov_interval = "regional" offtarget_pct = 0.0 else: offtarget_pct = _count_offtarget(data, dd.get_align_bam(data) or dd.get_work_bam(data), vrs or callable_file, "variant_regions") if offtarget_pct > OFFTARGET_THRESH: cov_interval = "regional" else: cov_interval = "amplicon" logger.info("%s: Assigned coverage as '%s' with %.1f%% genome coverage and %.1f%% offtarget coverage" % (dd.get_sample_name(data), cov_interval, genome_cov_pct * 100.0, offtarget_pct * 100.0)) data["config"]["algorithm"]["coverage_interval"] = cov_interval return data
def _group_by_sample_and_batch(samples): """Group samples split by QC method back one per sample-batch. """ out = collections.defaultdict(list) for data in samples: out[(dd.get_sample_name(data), dd.get_align_bam(data), tuple(_get_batches(data)))].append(data) return [xs[0] for xs in out.values()]
def run(data): """HLA typing with OptiType, parsing output from called genotype files. """ align_file = dd.get_align_bam(data) hla_dir = os.path.join(os.path.dirname(align_file), "hla") hla_base = os.path.join(hla_dir, os.path.basename(align_file) + ".hla") hlas = [] for hla_fq in glob.glob(hla_base + ".*.fq"): hla_type = os.path.splitext(os.path.splitext(os.path.basename(hla_fq))[0])[1].replace(".", "") if hla_type in SUPPORTED_HLAS: hlas.append((hla_type, hla_fq)) if len(hlas) > 0: hla_calls = [] for hla_type, hla_fq in hlas: out_dir = os.path.join(hla_dir, "OptiType-%s" % hla_type) out_file = glob.glob(os.path.join(out_dir, "*", "*_result.tsv")) if len(out_file) > 0: out_file = out_file[0] else: out_file = _call_hla(hla_fq, out_dir, data) hla_calls.append((hla_type.replace("HLA-", ""), out_file)) out_file = _combine_calls(hla_calls, hla_dir, data) data["hla"] = {"call_file": out_file, "hlacaller": "optitype"} return data
def _organize_calls(out_file, hla_base, data): """Prepare genotype calls, reporting best call along with quality metrics. """ hla_truth = get_hla_truthset(data) align_file = dd.get_align_bam(data) sample = dd.get_sample_name(data) with file_transaction(data, out_file) as tx_out_file: with open(tx_out_file, "w") as out_handle: writer = csv.writer(out_handle) writer.writerow(["sample", "locus", "mismatches", "options", "alleles", "p-groups", "expected", "validates"]) for genotype_file in glob.glob("%s.HLA-*.gt" % (hla_base)): hla_locus = os.path.basename(genotype_file).replace( "%s.hla.HLA-" % os.path.basename(align_file), "").replace(".gt", "") with open(genotype_file) as in_handle: total_options = set([]) for i, line in enumerate(in_handle): _, aone, atwo, m = line.split("\t")[:4] pgroups = (hla_groups.hla_protein(aone, data), hla_groups.hla_protein(atwo, data)) if i == 0: call_alleles = [aone, atwo] call_pgroups = pgroups mismatches = m total_options.add(pgroups) if len(total_options) > 0: truth_alleles = tz.get_in([sample, hla_locus], hla_truth, []) writer.writerow([sample, hla_locus, mismatches, len(total_options), ";".join(call_alleles), ";".join(call_pgroups), ";".join(truth_alleles), _matches_truth(call_alleles, truth_alleles, data)]) return out_file
def precall(items): """Perform initial pre-calling steps -- coverage calcuation by sample. Use sambamba to call average region coverage in regions, and convert into a correct format. """ items = [utils.to_single_data(x) for x in items] assert len(items) == 1, "Expect one item to Seq2C coverage calculation" data = utils.to_single_data(items) # sv_bed could specify a smaller region than variant coverage, so avoid # this sanity check # assert dd.get_coverage_interval(data) != "genome", "Seq2C only for amplicon and exome sequencing" assert "seq2c_bed_ready" in data["config"]["algorithm"], "Error: svregions or variant_regions BED file required for Seq2C" bed_file = data["config"]["algorithm"]["seq2c_bed_ready"] bam_file = dd.get_align_bam(data) sample_name = dd.get_sample_name(data) work_dir = _sv_workdir(data) cov_file = _calculate_coverage(data, work_dir, bed_file, bam_file, sample_name) if "sv" not in data: data["sv"] = [] data["sv"].append({"variantcaller": "seq2c", "coverage": cov_file}) return [data]
def priority_total_coverage(data): """ calculate coverage at 10 depth intervals in the priority regions """ bed_file = dd.get_svprioritize(data) if not bed_file and not file_exists(bed_file): return data work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage") sample = dd.get_sample_name(data) out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed") if file_exists(out_file): data['priority_total_coverage'] = os.path.abspath(out_file) return data nthreads = dd.get_num_cores(data) in_bam = dd.get_align_bam(data) or dd.get_work_bam(data) sambamba = config_utils.get_program("sambamba", data, default="sambamba") with tx_tmpdir(data, work_dir) as tmp_dir: cleaned_bed = os.path.join(tmp_dir, os.path.basename(bed_file)) cleaned_bed = bed.decomment(bed_file, cleaned_bed) with file_transaction(out_file) as tx_out_file: cmd = ("{sambamba} depth region -t {nthreads} -L {cleaned_bed} " "-F \"not unmapped\" " "-T 10 -T 20 -T 30 -T 40 -T 50 -T 60 -T 70 -T 80 -T 90 -T 100 " "{in_bam} -o {tx_out_file}") message = "Calculating coverage of {bed_file} regions in {in_bam}" do.run(cmd.format(**locals()), message.format(**locals())) data['priority_total_coverage'] = os.path.abspath(out_file) return data
def _cnvkit_coverage_bin_estimate(raw_target_bed, access_bed, cov_interval, work_dir, data): """Estimate good coverage bin sizes for target regions based on coverage. """ out_file = os.path.join(work_dir, "%s-bin_estimate.txt" % os.path.splitext(os.path.basename(raw_target_bed))[0]) method_map = {"genome": "wgs", "regional": "hybrid", "amplicon": "amplicon"} if not os.path.exists(out_file): with file_transaction(data, out_file) as tx_out_file: cmd = [_get_cmd("coverage_bin_size.py"), dd.get_align_bam(data), "-m", method_map[cov_interval], "-t", raw_target_bed, "-g", access_bed] cmd = " ".join(cmd) + " > " + tx_out_file do.run(_prep_cmd(cmd, tx_out_file), "CNVkit coverage bin estimation") avg_bin_sizes = {} estimate_map = {"On-target": "target", "Off-target": "antitarget", "Genome": "target", "Targets (sampling)": "target"} range_map = {("genome", "target"): (500, 1000), ("regional", "target"): (50, 267), ("regional", "antitarget"): (20000, 200000), ("amplicon", "target"): (50, 267)} with open(out_file) as in_handle: for line in in_handle: if line.startswith(tuple(estimate_map.keys())): name, depth, bin_size = line.strip().split("\t") name = estimate_map[name.replace(":", "").strip()] try: bin_size = int(bin_size) except ValueError: bin_size = None if bin_size and bin_size > 0: cur_min, cur_max = range_map[(cov_interval, name)] avg_bin_sizes[name] = max(min(bin_size, cur_max), cur_min) return avg_bin_sizes
def priority_total_coverage(data, out_dir): """ calculate coverage at 10 depth intervals in the priority regions """ from bcbio.structural import prioritize bed_file = dd.get_svprioritize(data) if not bed_file and not file_exists(bed_file) or prioritize.is_gene_list(bed_file): return {} work_dir = safe_makedir(out_dir) sample = dd.get_sample_name(data) out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed") if file_exists(out_file): # data['priority_total_coverage'] = os.path.abspath(out_file) return out_file nthreads = dd.get_num_cores(data) in_bam = dd.get_align_bam(data) or dd.get_work_bam(data) sambamba = config_utils.get_program("sambamba", data, default="sambamba") with tx_tmpdir(data, work_dir) as tmp_dir: cleaned_bed = clean_file(bed_file, data) with file_transaction(out_file) as tx_out_file: cmd = ("{sambamba} depth region -t {nthreads} -L {cleaned_bed} " "-F \"not unmapped\" " "-T 10 -T 20 -T 30 -T 40 -T 50 -T 60 -T 70 -T 80 -T 90 -T 100 " "{in_bam} -o {tx_out_file}") message = "Calculating coverage of {bed_file} regions in {in_bam}" do.run(cmd.format(**locals()), message.format(**locals())) # data['priority_total_coverage'] = os.path.abspath(out_file) return out_file
def _cnvkit_coverage(data, bed_info, input_type): """Calculate coverage in a BED file for CNVkit. """ bam_file = dd.get_align_bam(data) work_dir = utils.safe_makedir(os.path.join(_sv_workdir(data), "raw")) bed_file = bed_info["file"] exts = {".target.bed": ("target", "targetcoverage.cnn"), ".antitarget.bed": ("antitarget", "antitargetcoverage.cnn")} cnntype = None for orig, (cur_cnntype, ext) in exts.items(): if bed_file.endswith(orig): cnntype = cur_cnntype break if cnntype is None: assert bed_file.endswith(".bed"), "Unexpected BED file extension for coverage %s" % bed_file cnntype = "" base = _bam_to_outbase(bam_file, work_dir) merged_out_file = "%s.%s" % (base, ext) out_file = "%s-%s.%s" % (base, bed_info["i"], ext) if "i" in bed_info else merged_out_file if not utils.file_exists(out_file): with file_transaction(data, out_file) as tx_out_file: cmd = [_get_cmd(), "coverage", bam_file, bed_file, "-o", tx_out_file] do.run(cmd, "CNVkit coverage") return [{"itype": input_type, "file": out_file, "bam": bam_file, "cnntype": cnntype, "final_out": merged_out_file, "bed_i": bed_info.get("i"), "bed_orig": bed_info["orig"]}]
def _run_gridss(inputs, background, work_dir): out_file = os.path.join(work_dir, "%s-gridss.sv.vcf" % (dd.get_batch(inputs[0]) or dd.get_sample_name(inputs[0]))) if not utils.file_exists(out_file) and not utils.file_exists(out_file + ".gz"): with file_transaction(inputs[0], out_file) as tx_out_file: htsjdk_opts = ["-Dsamjdk.create_index=true", "-Dsamjdk.use_async_io_read_samtools=true", "-Dsamjdk.use_async_io_write_samtools=true", "-Dsamjdk.use_async_io_write_tribble=true"] cores = dd.get_cores(inputs[0]) resources = config_utils.get_resources("gridss", inputs[0]["config"]) jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx4g"]) jvm_opts = config_utils.adjust_opts(jvm_opts, {"algorithm": {"memory_adjust": {"direction": "increase", "magnitude": cores}}}) jvm_opts = _finalize_memory(jvm_opts) tx_ref_file = _setup_reference_files(inputs[0], os.path.dirname(tx_out_file)) blacklist_bed = sshared.prepare_exclude_file(inputs + background, out_file) cmd = ["gridss"] + jvm_opts + htsjdk_opts + ["gridss.CallVariants"] + \ ["THREADS=%s" % cores, "TMP_DIR=%s" % os.path.dirname(tx_out_file), "WORKING_DIR=%s" % os.path.dirname(tx_out_file), "OUTPUT=%s" % tx_out_file, "ASSEMBLY=%s" % tx_out_file.replace(".sv.vcf", ".gridss.assembly.bam"), "REFERENCE_SEQUENCE=%s" % tx_ref_file, "BLACKLIST=%s" % blacklist_bed] for data in inputs + background: cmd += ["INPUT=%s" % dd.get_align_bam(data), "INPUT_LABEL=%s" % dd.get_sample_name(data)] exports = utils.local_path_export() cmd = exports + " ".join(cmd) do.run(cmd, "GRIDSS SV analysis") return vcfutils.bgzip_and_index(out_file, inputs[0]["config"])
def coverage(data): """ Calculate coverage at different completeness cutoff for region in coverage option. """ bed_file = dd.get_coverage(data) sambamba = config_utils.get_program("sambamba", data["config"]) work_dir = safe_makedir(os.path.join(dd.get_work_dir(data), "report", "coverage")) if not bed_file: return data cleaned_bed = os.path.join(work_dir, os.path.splitext(os.path.basename(bed_file))[0] + ".cleaned.bed") cleaned_bed = bed.decomment(bed_file, cleaned_bed) with chdir(work_dir): in_bam = dd.get_align_bam(data) or dd.get_work_bam(data) sample = dd.get_sample_name(data) logger.debug("doing coverage for %s" % sample) parse_file = os.path.join(sample + "_coverage.bed") parse_total_file = os.path.join(sample + "_cov_total.tsv") cores = dd.get_num_cores(data) if not file_exists(parse_file): with tx_tmpdir(data, work_dir) as tmp_dir: with file_transaction(parse_file) as out_tx: cmd = ("{sambamba} depth region -F \"not unmapped\" -t {cores} " "%s -T 1 -T 5 -T 10 -T 20 -T 40 -T 50 -T 60 -T 70 " "-T 80 -T 100 -L {cleaned_bed} {in_bam} | sed 's/# " "chrom/chrom/' > {out_tx}") do.run(cmd.format(**locals()) % "-C 1000", "Run coverage for {}".format(sample)) parse_file = _add_high_covered_regions(parse_file, cleaned_bed, sample) _calculate_percentiles(os.path.abspath(parse_file), sample) data['coverage'] = os.path.abspath(parse_file) return data
def coverage_region_detailed_stats(data, out_dir, extra_cutoffs=None): """ Calculate coverage at different completeness cutoff for region in coverage option. """ bed_file = dd.get_coverage(data) if not bed_file or not utils.file_exists(bed_file): return [] work_dir = safe_makedir(out_dir) cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True) cutoffs = {1, 5, 10, 20, 50, 100, 250, 500, 1000, 5000, 10000, 50000} with chdir(work_dir): in_bam = dd.get_align_bam(data) or dd.get_work_bam(data) sample = dd.get_sample_name(data) logger.debug("doing coverage for %s" % sample) parse_file = os.path.join(sample + "_coverage.bed") if utils.file_uptodate(parse_file, cleaned_bed) and utils.file_uptodate(parse_file, in_bam): pass else: with file_transaction(data, parse_file) as out_tx: depth_thresholds = sorted(list(cutoffs | extra_cutoffs)) cmdl = sambamba.make_command(data, "depth region", in_bam, cleaned_bed, depth_thresholds=depth_thresholds) cmdl += " | sed 's/# chrom/chrom/' > " + out_tx do.run(cmdl, "Run coverage regional analysis for {}".format(sample)) out_files = _calculate_percentiles(os.path.abspath(parse_file), sample, data=data, cutoffs=cutoffs) return [os.path.abspath(x) for x in out_files]
def _prep_subsampled_bams(data, work_dir): """Prepare a subsampled BAM file with discordants from samblaster and minimal correct pairs. This attempts to minimize run times by pre-extracting useful reads mixed with subsampled normal pairs to estimate paired end distributions: https://groups.google.com/d/msg/delly-users/xmia4lwOd1Q/uaajoBkahAIJ Subsamples correctly aligned reads to 100 million based on speedseq defaults and evaluations on NA12878 whole genome data: https://github.com/cc2qe/speedseq/blob/ca624ba9affb0bd0fb88834ca896e9122639ec94/bin/speedseq#L1102 XXX Currently not used as new versions of delly do not get good sensitivity with downsampled BAMs. """ sr_bam, disc_bam = sshared.get_split_discordants(data, work_dir) ds_bam = bam.downsample(dd.get_align_bam(data), data, 1e8, read_filter="-F 'not secondary_alignment and proper_pair'", always_run=True, work_dir=work_dir) out_bam = "%s-final%s" % utils.splitext_plus(ds_bam) if not utils.file_exists(out_bam): bam.merge([ds_bam, sr_bam, disc_bam], out_bam, data["config"]) bam.index(out_bam, data["config"]) return [out_bam]
def _combine_qc_samples(samples): """Combine split QC analyses into single samples based on BAM files. """ by_bam = collections.defaultdict(list) for data in [utils.to_single_data(x) for x in samples]: batch = dd.get_batch(data) or dd.get_sample_name(data) if not isinstance(batch, (list, tuple)): batch = [batch] batch = tuple(batch) by_bam[(dd.get_align_bam(data), batch)].append(data) out = [] for data_group in by_bam.values(): data = data_group[0] alg_qc = [] qc = {} metrics = {} for d in data_group: qc.update(dd.get_summary_qc(d)) metrics.update(dd.get_summary_metrics(d)) alg_qc.extend(dd.get_algorithm_qc(d)) data["config"]["algorithm"]["qc"] = alg_qc data["summary"]["qc"] = qc data["summary"]["metrics"] = metrics out.append([data]) return out
def _prep_config(items, paired, work_dir): """Run initial configuration, generating a run directory for Manta. """ assert utils.which("configManta.py"), "Could not find installed configManta.py" out_file = os.path.join(work_dir, "runWorkflow.py") if not utils.file_exists(out_file) or _out_of_date(out_file): config_script = os.path.realpath(utils.which("configManta.py")) cmd = [utils.get_program_python("configManta.py"), config_script] if paired: if paired.normal_bam: cmd += ["--normalBam=%s" % paired.normal_bam, "--tumorBam=%s" % paired.tumor_bam] else: cmd += ["--tumorBam=%s" % paired.tumor_bam] else: cmd += ["--bam=%s" % dd.get_align_bam(data) for data in items] data = paired.tumor_data if paired else items[0] cmd += ["--referenceFasta=%s" % dd.get_ref_file(data), "--runDir=%s" % work_dir] if dd.get_coverage_interval(data) not in ["genome"]: cmd += ["--exome"] for region in _maybe_limit_chromosomes(data): cmd += ["--region", region] resources = config_utils.get_resources("manta", data["config"]) if resources.get("options"): cmd += [str(x) for x in resources["options"]] # If we are removing polyX, avoid calling on small indels which require # excessively long runtimes on noisy WGS runs if "polyx" in dd.get_exclude_regions(data): cmd += ["--config", _prep_streamlined_config(config_script, work_dir)] do.run(cmd, "Configure manta SV analysis") return out_file
def estimate(items, batch, config): """Estimate heterogeneity for a pair of tumor/normal samples. Run in parallel. """ hetcallers = {"theta": theta.run, "phylowgs": phylowgs.run, "bubbletree": bubbletree.run} paired = vcfutils.get_paired_bams([dd.get_align_bam(d) for d in items], items) calls = _get_calls(paired.tumor_data) variants = _get_variants(paired.tumor_data) het_info = [] for hetcaller in _get_hetcallers(items): try: hetfn = hetcallers[hetcaller] except KeyError: hetfn = None print "%s not yet implemented" % hetcaller if hetfn: hetout = hetfn(variants[0], calls, paired) if hetout: het_info.append(hetout) out = [] for data in items: if batch == _get_batches(data)[0]: if dd.get_sample_name(data) == paired.tumor_name: if het_info: data["heterogeneity"] = het_info out.append([data]) return out
def annotate_with_depth(in_file, items): """Annotate called VCF file with depth using duphold (https://github.com/brentp/duphold) Currently annotates single sample and tumor samples in somatic analysis. """ bam_file = None if len(items) == 1: bam_file = dd.get_align_bam(items[0]) else: paired = vcfutils.get_paired(items) if paired: bam_file = paired.tumor_bam if bam_file: out_file = "%s-duphold.vcf.gz" % utils.splitext_plus(in_file)[0] if not utils.file_exists(out_file): with file_transaction(items[0], out_file) as tx_out_file: if not in_file.endswith(".gz"): in_file = vcfutils.bgzip_and_index(in_file, remove_orig=False, out_dir=os.path.dirname(tx_out_file)) ref_file = dd.get_ref_file(items[0]) # cores for BAM reader thread, so max out at 4 based on recommendations cores = min([dd.get_num_cores(items[0]), 4]) cmd = ("duphold --threads {cores} --vcf {in_file} --bam {bam_file} --fasta {ref_file} " "-o {tx_out_file}") do.run(cmd.format(**locals()), "Annotate SV depth with duphold") vcfutils.bgzip_and_index(out_file) return out_file else: return in_file
def calculate_sv_coverage(data): """Calculate coverage within bins for downstream CNV calling. Creates corrected cnr files with log2 ratios and depths. """ from bcbio.variation import coverage from bcbio.structural import annotate, cnvkit data = utils.to_single_data(data) if not cnvkit.use_general_sv_bins(data): return [[data]] work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural", dd.get_sample_name(data), "bins")) out_target_file = os.path.join(work_dir, "%s-target-coverage.cnn" % dd.get_sample_name(data)) out_anti_file = os.path.join(work_dir, "%s-antitarget-coverage.cnn" % dd.get_sample_name(data)) if ((not utils.file_exists(out_target_file) or not utils.file_exists(out_anti_file)) and (dd.get_align_bam(data) or dd.get_work_bam(data))): # mosdepth target_cov = coverage.run_mosdepth(data, "target", tz.get_in(["regions", "bins", "target"], data)) anti_cov = coverage.run_mosdepth(data, "antitarget", tz.get_in(["regions", "bins", "antitarget"], data)) target_cov_genes = annotate.add_genes(target_cov.regions, data, max_distance=0) anti_cov_genes = annotate.add_genes(anti_cov.regions, data, max_distance=0) out_target_file = _add_log2_depth(target_cov_genes, out_target_file, data) out_anti_file = _add_log2_depth(anti_cov_genes, out_anti_file, data) # TODO: Correct for GC bias if os.path.exists(out_target_file): data["depth"]["bins"] = {"target": out_target_file, "antitarget": out_anti_file} return [[data]]
def _get_files(data): work_bam = dd.get_align_bam(data) or dd.get_work_bam(data) out_dir = utils.safe_makedir(os.path.join(tz.get_in(["dirs", "work"], data), "align", dd.get_sample_name(data))) out_file = "%s-highdepth.bed" % os.path.join(out_dir, utils.splitext_plus(os.path.basename(work_bam))[0]) stats_file = "%s-stats.yaml" % utils.splitext_plus(out_file)[0] return work_bam, out_file, stats_file
def _cnvkit_coverage(data, bed_file, input_type): """Calculate coverage in a BED file for CNVkit. """ bam_file = dd.get_align_bam(data) work_dir = utils.safe_makedir(os.path.join(_sv_workdir(data), "raw")) exts = {".target.bed": ("target", "targetcoverage.cnn"), ".antitarget.bed": ("antitarget", "antitargetcoverage.cnn")} cnntype = None for orig, (cur_cnntype, ext) in exts.items(): if bed_file.endswith(orig): cnntype = cur_cnntype break if cnntype is None: assert bed_file.endswith(".bed"), "Unexpected BED file extension for coverage %s" % bed_file cnntype = "" base, base_old = _bam_to_outbase(bam_file, work_dir, data) out_file = "%s.%s" % (base, ext) out_file_old = "%s.%s" % (base_old, ext) # back compatible with previous runs to avoid re-calculating if utils.file_exists(out_file_old): out_file = out_file_old if not utils.file_exists(out_file): with file_transaction(data, out_file) as tx_out_file: cmd = [_get_cmd(), "coverage", "-p", str(dd.get_cores(data)), bam_file, bed_file, "-o", tx_out_file] do.run(_prep_cmd(cmd, tx_out_file), "CNVkit coverage") return {"itype": input_type, "file": out_file, "bam": bam_file, "cnntype": cnntype, "sample": dd.get_sample_name(data)}
def postprocess_variants(items): """Provide post-processing of variant calls: filtering and effects annotation. """ data, items = _get_batch_representative(items, "vrn_file") cur_name = "%s, %s" % (dd.get_sample_name(data), get_variantcaller(data)) logger.info("Finalizing variant calls: %s" % cur_name) orig_vrn_file = data.get("vrn_file") data = _symlink_to_workdir(data, ["vrn_file"]) data = _symlink_to_workdir(data, ["config", "algorithm", "variant_regions"]) if data.get("align_bam") and data.get("vrn_file"): logger.info("Calculating variation effects for %s" % cur_name) ann_vrn_file, vrn_stats = effects.add_to_vcf(data["vrn_file"], data) if ann_vrn_file: data["vrn_file"] = ann_vrn_file if vrn_stats: data["vrn_stats"] = vrn_stats orig_items = _get_orig_items(items) logger.info("Annotate VCF file: %s" % cur_name) data["vrn_file"] = annotation.finalize_vcf(data["vrn_file"], get_variantcaller(data), orig_items) logger.info("Filtering for %s" % cur_name) data["vrn_file"] = variant_filtration(data["vrn_file"], dd.get_ref_file(data), tz.get_in(("genome_resources", "variation"), data, {}), data, orig_items) logger.info("Prioritization for %s" % cur_name) data["vrn_file"] = prioritize.handle_vcf_calls(data["vrn_file"], data, orig_items) logger.info("Germline extraction for %s" % cur_name) data = germline.extract(data, orig_items) data = damage.run_filter(data["vrn_file"], dd.get_align_bam(data), dd.get_ref_file(data), data, orig_items) if orig_vrn_file and os.path.samefile(data["vrn_file"], orig_vrn_file): data["vrn_file"] = orig_vrn_file return [[data]]
def _get_files(data): work_bam = dd.get_align_bam(data) or dd.get_work_bam(data) out_dir = utils.safe_makedir( os.path.join(tz.get_in(["dirs", "work"], data), "align", dd.get_sample_name(data))) out_file = "%s-highdepth.bed" % os.path.join( out_dir, utils.splitext_plus(os.path.basename(work_bam))[0]) stats_file = "%s-stats.yaml" % utils.splitext_plus(out_file)[0] return work_bam, out_file, stats_file
def _add_hla_files(data): """Add extracted fastq files of HLA alleles for typing. """ if "hla" not in data: data["hla"] = {} align_file = dd.get_align_bam(data) hla_dir = os.path.join(os.path.dirname(align_file), "hla") hla_base = os.path.join(hla_dir, os.path.basename(align_file) + ".hla") data["hla"]["fastq"] = sorted(list(glob.glob(hla_base + ".*.fq"))) return data
def _get_orig_items(data): """Retrieve original items in a batch, handling CWL and standard cases. """ if isinstance(data, dict): if dd.get_align_bam(data) and tz.get_in(["metadata", "batch"], data): return vmulti.get_orig_items(data) else: return [data] else: return data
def postprocess_variants(items): """Provide post-processing of variant calls: filtering and effects annotation. """ if not isinstance(items, dict): items = [utils.to_single_data(x) for x in items] vrn_key = "vrn_file_joint" if "vrn_file_joint" in items[0] else "vrn_file" else: vrn_key = "vrn_file" data, items = _get_batch_representative(items, vrn_key) items = cwlutils.unpack_tarballs(items, data) data = cwlutils.unpack_tarballs(data, data) cur_name = "%s, %s" % (dd.get_sample_name(data), get_variantcaller(data)) logger.info("Finalizing variant calls: %s" % cur_name) orig_vrn_file = data.get(vrn_key) data = _symlink_to_workdir(data, [vrn_key]) data = _symlink_to_workdir(data, ["config", "algorithm", "variant_regions"]) if data.get(vrn_key): logger.info("Calculating variation effects for %s" % cur_name) ann_vrn_file, vrn_stats = effects.add_to_vcf(data[vrn_key], data) if ann_vrn_file: data[vrn_key] = ann_vrn_file if vrn_stats: data["vrn_stats"] = vrn_stats orig_items = _get_orig_items(items) logger.info("Annotate VCF file: %s" % cur_name) data[vrn_key] = annotation.finalize_vcf(data[vrn_key], get_variantcaller(data), orig_items) logger.info("Filtering for %s" % cur_name) data[vrn_key] = variant_filtration(data[vrn_key], dd.get_ref_file(data), tz.get_in(("genome_resources", "variation"), data, {}), data, orig_items) logger.info("Prioritization for %s" % cur_name) prio_vrn_file = prioritize.handle_vcf_calls(data[vrn_key], data, orig_items) if prio_vrn_file != data[vrn_key]: data[vrn_key] = prio_vrn_file logger.info("Germline extraction for %s" % cur_name) data = germline.extract(data, orig_items) if dd.get_align_bam(data): data = damage.run_filter(data[vrn_key], dd.get_align_bam(data), dd.get_ref_file(data), data, orig_items) if orig_vrn_file and os.path.samefile(data[vrn_key], orig_vrn_file): data[vrn_key] = orig_vrn_file return [[data]]
def run(items): """Perform detection of structural variations with lumpy. """ paired = vcfutils.get_paired(items) work_dir = _sv_workdir( paired.tumor_data if paired and paired.tumor_data else items[0]) previous_evidence = {} full_bams, sr_bams, disc_bams = [], [], [] for data in items: full_bams.append(dd.get_align_bam(data)) sr_bam, disc_bam = sshared.find_existing_split_discordants(data) sr_bams.append(sr_bam) disc_bams.append(disc_bam) cur_dels, cur_dups = _bedpes_from_cnv_caller(data, work_dir) previous_evidence[dd.get_sample_name(data)] = {} if cur_dels and utils.file_exists(cur_dels): previous_evidence[dd.get_sample_name(data)]["dels"] = cur_dels if cur_dups and utils.file_exists(cur_dups): previous_evidence[dd.get_sample_name(data)]["dups"] = cur_dups lumpy_vcf, exclude_file = _run_smoove(full_bams, sr_bams, disc_bams, work_dir, items) lumpy_vcf = sshared.annotate_with_depth(lumpy_vcf, items) gt_vcfs = {} # Retain paired samples with tumor/normal genotyped in one file if paired and paired.normal_name: batches = [[paired.tumor_data, paired.normal_data]] else: batches = [[x] for x in items] for batch_items in batches: for data in batch_items: gt_vcfs[dd.get_sample_name(data)] = _filter_by_support( lumpy_vcf, data) if paired and paired.normal_name: gt_vcfs = _filter_by_background(paired.tumor_name, [paired.normal_name], gt_vcfs, paired.tumor_data) out = [] upload_counts = collections.defaultdict(int) for data in items: if "sv" not in data: data["sv"] = [] vcf_file = gt_vcfs.get(dd.get_sample_name(data)) if vcf_file: effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff") data["sv"].append({ "variantcaller": "lumpy", "vrn_file": effects_vcf or vcf_file, "do_upload": upload_counts[vcf_file] == 0, # only upload a single file per batch "exclude_file": exclude_file }) upload_counts[vcf_file] += 1 out.append(data) return out
def _gatk_apply_bqsr(data): """Parallel BQSR support for GATK4. Normalized qualities to 3 bin outputs at 10, 20 and 30 based on pipeline standard recommendations, which will help with output file sizes: https://github.com/CCDG/Pipeline-Standardization/blob/master/PipelineStandard.md#base-quality-score-binning-scheme https://github.com/gatk-workflows/broad-prod-wgs-germline-snps-indels/blob/5585cdf7877104f2c61b2720ddfe7235f2fad577/PairedEndSingleSampleWf.gatk4.0.wdl#L1081 spark host and timeout settings help deal with runs on restricted systems where we encounter network and timeout errors """ in_file = dd.get_align_bam(data) or dd.get_work_bam(data) out_file = os.path.join( dd.get_work_dir(data), "align", dd.get_sample_name(data), "%s-recal.bam" % utils.splitext_plus(os.path.basename(in_file))[0]) if not utils.file_uptodate(out_file, in_file): with file_transaction(data, out_file) as tx_out_file: broad_runner = broad.runner_from_config(data["config"]) gatk_type = broad_runner.gatk_type() cores = dd.get_num_cores(data) if gatk_type == "gatk4": params = [ "-T", "ApplyBQSRSpark", "--spark-master", "local[%s]" % cores, "--input", in_file, "--output", tx_out_file, "--bqsr-recal-file", data["prep_recal"], "--conf", "spark.local.dir=%s" % os.path.dirname(tx_out_file), "--conf", "spark.driver.host=localhost", "--conf", "spark.network.timeout=800", "--static-quantized-quals", "10", "--static-quantized-quals", "20", "--static-quantized-quals", "30" ] else: params = [ "-T", "PrintReads", "-R", dd.get_ref_file(data), "-I", in_file, "-BQSR", data["prep_recal"], "-o", tx_out_file ] # Avoid problems with intel deflater for GATK 3.8 and GATK4 # https://github.com/bcbio/bcbio-nextgen/issues/2145#issuecomment-343095357 if gatk_type == "gatk4": params += ["--jdk-deflater", "--jdk-inflater"] elif LooseVersion( broad_runner.gatk_major_version()) > LooseVersion("3.7"): params += ["-jdk_deflater", "-jdk_inflater"] memscale = { "magnitude": 0.9 * cores, "direction": "increase" } if cores > 1 else None broad_runner.run_gatk(params, os.path.dirname(tx_out_file), memscale=memscale, parallel_gc=True) bam.index(out_file, data["config"]) return out_file
def run(items): """Perform detection of structural variations with delly. Performs post-call filtering with a custom filter tuned based on NA12878 Moleculo and PacBio data, using calls prepared by @ryanlayer and @cc2qe Filters using the high quality variant pairs (DV) compared with high quality reference pairs (DR). """ work_dir = utils.safe_makedir( os.path.join(items[0]["dirs"]["work"], "structural", items[0]["name"][-1], "delly")) # Add core request for delly config = copy.deepcopy(items[0]["config"]) delly_config = utils.get_in(config, ("resources", "delly"), {}) delly_config["cores"] = 1 config["resources"]["delly"] = delly_config parallel = { "type": "local", "cores": config["algorithm"].get("num_cores", 1), "progs": ["delly"] } work_bams = [dd.get_align_bam(d) for d in items] ref_file = dd.get_ref_file(items[0]) sv_types = [ "DEL", "DUP" ] # "TRA" has invalid VCF END specifications that GATK doesn't like, "INV" very slow exclude_file = _get_full_exclude_file(items, work_dir) bytype_vcfs = run_multicore( _run_delly, [(work_bams, chrom, sv_type, ref_file, work_dir, items) for (chrom, sv_type) in itertools.product( sshared.get_sv_chroms(items, exclude_file), sv_types)], config, parallel) out_file = "%s.vcf.gz" % sshared.outname_from_inputs(bytype_vcfs) combo_vcf = vcfutils.combine_variant_files(bytype_vcfs, out_file, ref_file, config) out = [] for data in items: if "sv" not in data: data["sv"] = [] base, ext = utils.splitext_plus(combo_vcf) sample = tz.get_in(["rgnames", "sample"], data) delly_sample_vcf = vcfutils.select_sample( combo_vcf, sample, "%s-%s%s" % (base, sample, ext), data["config"]) delly_vcf = _delly_count_evidence_filter(delly_sample_vcf, data) effects_vcf, _ = effects.add_to_vcf(delly_vcf, data, "snpeff") data["sv"].append({ "variantcaller": "delly", "vrn_file": effects_vcf, "exclude": exclude_file }) out.append(data) return out
def count(data): """ count reads mapping to genes using featureCounts http://subread.sourceforge.net """ in_bam = dd.get_work_bam(data) or dd.get_align_bam(data) out_dir = os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data)) if dd.get_aligner(data) == "star": out_dir = os.path.join( out_dir, "%s_%s" % (dd.get_sample_name(data), dd.get_aligner(data))) sorted_bam = bam.sort(in_bam, dd.get_config(data), order="queryname", out_dir=safe_makedir(out_dir)) gtf_file = dd.get_transcriptome_gtf(data, default=dd.get_gtf_file(data)) work_dir = dd.get_work_dir(data) out_dir = os.path.join(work_dir, "htseq-count") safe_makedir(out_dir) count_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts" summary_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts.summary" if file_exists(count_file) and _is_fixed_count_file(count_file): return count_file featureCounts = config_utils.get_program("featureCounts", dd.get_config(data)) paired_flag = _paired_flag(in_bam) strand_flag = _strand_flag(data) filtered_bam = bam.filter_primary(sorted_bam, data) cmd = ("{featureCounts} -a {gtf_file} -o {tx_count_file} -s {strand_flag} " "{paired_flag} {filtered_bam}") resources = config_utils.get_resources("featureCounts", data["config"]) if resources: options = resources.get("options") if options: cmd += " %s" % " ".join([str(x) for x in options]) message = ("Count reads in {tx_count_file} mapping to {gtf_file} using " "featureCounts") with file_transaction(data, [count_file, summary_file]) as tx_files: tx_count_file, tx_summary_file = tx_files do.run(cmd.format(**locals()), message.format(**locals())) fixed_count_file = _format_count_file(count_file, data) fixed_summary_file = _change_sample_name(summary_file, dd.get_sample_name(data), data=data) shutil.move(fixed_count_file, count_file) shutil.move(fixed_summary_file, summary_file) return count_file
def _group_by_sample_and_batch(samples): """Group samples split by heterogeneity method back one per sample-batch. Groups potentially multiple shared samples (multi batch normals) into single items per group. """ out = collections.defaultdict(list) for data in [utils.to_single_data(x) for x in samples]: out[(dd.get_sample_name(data), dd.get_align_bam(data), tuple(_get_batches(data)))].append(data) return [[xs[0]] for xs in out.values()]
def _group_by_samplename(samples): """Group samples split by QC method back into a single sample. """ out = collections.defaultdict(list) for data in samples: batch = dd.get_batch(data) or dd.get_sample_name(data) if not isinstance(batch, (list, tuple)): batch = [batch] batch = tuple(batch) out[(dd.get_sample_name(data), dd.get_align_bam(data), batch)].append(data) return [xs[0] for xs in out.values()]
def _ready_for_het_analysis(items): """Check if a sample has input information for heterogeneity analysis. We currently require a tumor/normal sample containing both CNV and variant calls. """ paired = vcfutils.get_paired_bams([dd.get_align_bam(d) for d in items], items) has_het = any(dd.get_hetcaller(d) for d in items) if has_het and paired: return get_variants(paired.tumor_data) and _get_calls( paired.tumor_data, cnv_only=True)
def _add_hla_files(data): """Add extracted fastq files of HLA alleles for typing. """ if "hla" not in data: data["hla"] = {} align_file = dd.get_align_bam(data) hla_dir = os.path.join(os.path.dirname(align_file), "hla") if not os.path.exists(hla_dir): hla_dir = None data["hla"]["fastq"] = hla_dir return data
def _run_collect_allelic_counts(pos_file, pos_name, work_dir, data): """Counts by alleles for a specific sample and set of positions. """ out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural", "counts")) out_file = os.path.join(out_dir, "%s-%s-counts.tsv" % (dd.get_sample_name(data), pos_name)) if not utils.file_exists(out_file): with file_transaction(data, out_file) as tx_out_file: params = ["-T", "CollectAllelicCounts", "-L", pos_file, "-I", dd.get_align_bam(data), "-R", dd.get_ref_file(data), "-O", tx_out_file] _run_with_memory_scaling(params, tx_out_file, data) return out_file
def run(calls, data): """Run MetaSV if we have enough supported callers, adding output to the set of calls. """ work_dir = _sv_workdir(data) out_file = os.path.join(work_dir, "variants.vcf.gz") cmd = _get_cmd() + [ "--sample", dd.get_sample_name(data), "--reference", dd.get_ref_file(data), "--bam", dd.get_align_bam(data), "--outdir", work_dir ] available_callers = 0 for call in calls: if call["variantcaller"] in SUPPORTED: available_callers += 1 cmd += [ "--%s_vcf" % call["variantcaller"], call.get("vcf_file", call["vrn_file"]) ] if available_callers >= MIN_CALLERS: if not utils.file_exists(out_file): tx_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw")) ins_stats = shared.calc_paired_insert_stats_save( dd.get_align_bam(data), os.path.join(tx_work_dir, "insert-stats.yaml")) cmd += [ "--workdir", tx_work_dir, "--num_threads", str(dd.get_num_cores(data)) ] cmd += [ "--spades", utils.which("spades.py"), "--age", utils.which("age_align") ] cmd += [ "--boost_ins", "--isize_mean", ins_stats["mean"], "--isize_sd", ins_stats["std"] ] do.run(cmd, "Combine variant calls with MetaSV") calls.append({"variantcaller": "metasv", "vrn_file": out_file}) return calls
def variants(data): if "vrn_file" not in data: return data if not dd.get_coverage(data): return data in_vcf = data['vrn_file'] work_dir = os.path.join(dd.get_work_dir(data), "report", "variants") with chdir(work_dir): in_bam = dd.get_align_bam(data) or dd.get_work_bam(data) ref_file = dd.get_ref_file(data) assert ref_file, "Need the reference genome fasta file." bed_file = dd.get_variant_regions(data) sample = dd.get_sample_name(data) in_bam = dd.get_align_bam(data) or dd.get_work_bam(data) cg_file = os.path.join(sample + "_with-gc.vcf.gz") parse_file = os.path.join(sample + "_gc-depth-parse.tsv") num_cores = dd.get_num_cores(data) broad_runner = broad.runner_from_config_safe(data["config"]) if in_bam and broad_runner and broad_runner.has_gatk(): if not file_exists(cg_file): with file_transaction(cg_file) as tx_out: params = [ "-T", "VariantAnnotator", "-R", ref_file, "-L", bed_file, "-I", in_bam, "-A", "GCContent", "-A", "Coverage", "--variant", in_vcf, "--out", tx_out ] broad_runner.run_gatk(params) cg_file = vcfutils.bgzip_and_index(cg_file, data["config"]) if not file_exists(parse_file): with file_transaction(parse_file) as out_tx: with open(out_tx, 'w') as out_handle: print >> out_handle, "CG\tdepth\tsample" cmd = ( "bcftools query -s {sample} -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R " "{bed_file} {cg_file} >> {out_tx}") do.run(cmd.format(**locals()), "Calculating GC content and depth for %s" % in_vcf) logger.debug('parsing coverage: %s' % sample) return data
def get_split_discordants(data, work_dir): """Retrieve split and discordant reads, potentially calculating with extract_sv_reads as needed. """ align_bam = dd.get_align_bam(data) sr_bam, disc_bam = _find_existing_inputs(data) if not sr_bam: work_dir = (work_dir if not os.access(os.path.dirname(align_bam), os.W_OK | os.X_OK) else os.path.dirname(align_bam)) sr_bam, disc_bam = _extract_split_and_discordants( align_bam, work_dir, data) return sr_bam, disc_bam
def _run_cnvkit_shared(inputs, backgrounds): """Shared functionality to run CNVkit, parallelizing over multiple BAM files. """ work_dir = _sv_workdir(inputs[0]) raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw")) background_name = dd.get_sample_name(backgrounds[0]) if backgrounds else "flat" background_cnn = os.path.join(raw_work_dir, "%s_background.cnn" % (background_name)) ckouts = [] for cur_input in inputs: cur_raw_work_dir = utils.safe_makedir(os.path.join(_sv_workdir(cur_input), "raw")) out_base, out_base_old = _bam_to_outbase(dd.get_align_bam(cur_input), cur_raw_work_dir, cur_input) if utils.file_exists(out_base_old + ".cns"): out_base = out_base_old ckouts.append({"cnr": "%s.cnr" % out_base, "cns": "%s.cns" % out_base, "back_cnn": background_cnn}) if not utils.file_exists(ckouts[0]["cns"]): cov_interval = dd.get_coverage_interval(inputs[0]) samples_to_run = zip(["background"] * len(backgrounds), backgrounds) + \ zip(["evaluate"] * len(inputs), inputs) # New style shared SV bins if tz.get_in(["depth", "bins", "target"], inputs[0]): target_bed = tz.get_in(["depth", "bins", "target"], inputs[0]) antitarget_bed = tz.get_in(["depth", "bins", "antitarget"], inputs[0]) raw_coverage_cnns = reduce(operator.add, [_get_general_coverage(cdata, itype) for itype, cdata in samples_to_run]) # Back compatible with pre-existing runs else: target_bed, antitarget_bed = _get_original_targets(inputs[0]) raw_coverage_cnns = reduce(operator.add, [_get_original_coverage(cdata, itype) for itype, cdata in samples_to_run]) # Currently metrics not calculated due to speed and needing re-evaluation # We could re-enable with larger truth sets to evaluate background noise # But want to reimplement in a more general fashion as part of normalization if False: coverage_cnns = reduce(operator.add, [_cnvkit_metrics(cnns, target_bed, antitarget_bed, cov_interval, inputs + backgrounds) for cnns in tz.groupby("bam", raw_coverage_cnns).values()]) background_cnn = _cnvkit_background(_select_background_cnns(coverage_cnns), background_cnn, target_bed, antitarget_bed, inputs[0]) else: coverage_cnns = raw_coverage_cnns background_cnn = _cnvkit_background([x["file"] for x in coverage_cnns if x["itype"] == "background"], background_cnn, target_bed, antitarget_bed, inputs[0]) parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["cnvkit"]} fixed_cnrs = run_multicore(_cnvkit_fix, [(cnns, background_cnn, inputs, ckouts) for cnns in tz.groupby("bam", [x for x in coverage_cnns if x["itype"] == "evaluate"]).values()], inputs[0]["config"], parallel) [_cnvkit_segment(cnr, cov_interval, data) for cnr, data in fixed_cnrs] return ckouts
def _prep_data(data, items):
    """Merge per-sample regions and input files into the shared batch data dictionary.
    """
    for r in ["callable_regions", "variant_regions"]:
        data[r] = list(set(filter(lambda x: x is not None,
                                  [tz.get_in(("config", "algorithm", r), d) for d in items])))
    data["work_bams"] = [dd.get_align_bam(x) or dd.get_work_bam(x) for x in items]
    data["vrn_files"] = [x["vrn_file"] for x in items]
    return data
def apply_recal(data):
    """Apply recalibration tables to the sorted aligned BAM, producing recalibrated BAM.
    """
    orig_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    had_work_bam = "work_bam" in data
    if dd.get_recalibrate(data) in [True, "gatk"]:
        if data.get("prep_recal"):
            logger.info("Applying BQSR recalibration with GATK: %s " % str(dd.get_sample_name(data)))
            data["work_bam"] = _gatk_apply_bqsr(data)
    elif dd.get_recalibrate(data) == "sentieon":
        if data.get("prep_recal"):
            logger.info("Applying BQSR recalibration with sentieon: %s " % str(dd.get_sample_name(data)))
            data["work_bam"] = sentieon.apply_bqsr(data)
    elif dd.get_recalibrate(data):
        raise NotImplementedError("Unsupported recalibration type: %s" % (dd.get_recalibrate(data)))
    # CWL does not have work/alignment BAM separation
    if not had_work_bam and dd.get_work_bam(data):
        data["align_bam"] = dd.get_work_bam(data)
    if orig_bam != dd.get_work_bam(data) and orig_bam != dd.get_align_bam(data):
        utils.save_diskspace(orig_bam, "BAM recalibrated to %s" % dd.get_work_bam(data), data["config"])
    return data
def collect_read_counts(data, work_dir):
    """Count reads in defined bins using CollectReadCounts.
    """
    out_file = os.path.join(work_dir, "%s-target-coverage.hdf5" % dd.get_sample_name(data))
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            params = ["-T", "CollectReadCounts",
                      "-I", dd.get_align_bam(data),
                      "-L", tz.get_in(["regions", "bins", "target"], data),
                      "--interval-merging-rule", "OVERLAPPING_ONLY",
                      "-O", tx_out_file,
                      "--format", "HDF5"]
            _run_with_memory_scaling(params, tx_out_file, data)
    return out_file
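# Illustrative only: a quick way to inspect the dataset layout of the HDF5 counts
# file produced above, assuming h5py is available. The path is a placeholder and
# the dataset names depend on the GATK version.
import h5py

def list_hdf5_datasets(path):
    """Return the names of all groups/datasets in an HDF5 file."""
    names = []
    with h5py.File(path, "r") as h5:
        h5.visit(names.append)
    return names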
def run(items):
    """Perform detection of structural variations with delly.

    Performs post-call filtering with a custom filter tuned based on
    NA12878 Moleculo and PacBio data, using calls prepared by
    @ryanlayer and @cc2qe.

    Filters using the high quality variant pairs (DV) compared with
    high quality reference pairs (DR).
    """
    work_dir = utils.safe_makedir(os.path.join(items[0]["dirs"]["work"], "structural",
                                               dd.get_sample_name(items[0]), "delly"))
    # Add core request for delly
    config = copy.deepcopy(items[0]["config"])
    delly_config = utils.get_in(config, ("resources", "delly"), {})
    delly_config["cores"] = 1
    config["resources"]["delly"] = delly_config
    parallel = {"type": "local", "cores": config["algorithm"].get("num_cores", 1),
                "progs": ["delly"]}
    work_bams = [dd.get_align_bam(d) for d in items]
    ref_file = dd.get_ref_file(items[0])
    exclude_file = _get_full_exclude_file(items, work_bams, work_dir)
    bytype_vcfs = run_multicore(_run_delly,
                                [(work_bams, chrom, ref_file, work_dir, items)
                                 for chrom in sshared.get_sv_chroms(items, exclude_file)],
                                config, parallel)
    out_file = "%s.vcf.gz" % sshared.outname_from_inputs(bytype_vcfs)
    combo_vcf = vcfutils.combine_variant_files(bytype_vcfs, out_file, ref_file, config)
    out = []
    upload_counts = collections.defaultdict(int)
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        base, ext = utils.splitext_plus(combo_vcf)
        final_vcf = sshared.finalize_sv(combo_vcf, data, items)
        if final_vcf:
            delly_vcf = _delly_count_evidence_filter(final_vcf, data)
            data["sv"].append({"variantcaller": "delly", "vrn_file": delly_vcf,
                               "do_upload": upload_counts[final_vcf] == 0,  # only upload a single file per batch
                               "exclude": exclude_file})
            upload_counts[final_vcf] += 1
        out.append(data)
    return out
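# Illustrative only: one way to express a DV (variant pairs) versus DR (reference
# pairs) support filter with bcftools, in the spirit of the evidence filter named
# above. The 20% cutoff, filter name and file paths are assumptions, not the tuned
# values used by _delly_count_evidence_filter.
import subprocess

def filter_low_dv_support(in_vcf, out_vcf):
    """Soft-filter calls whose variant-pair support is under 20% of total pairs."""
    expr = "FORMAT/DV < (FORMAT/DV + FORMAT/DR) * 0.2"
    subprocess.check_call(["bcftools", "filter", "-e", expr, "-s", "LowDVSupport",
                           "-O", "z", "-o", out_vcf, in_vcf])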
def _pick_lead_item(items):
    """Pick single representative sample for batch calling to attach calls to.

    For cancer samples, attach to tumor.
    """
    if vcfutils.is_paired_analysis([dd.get_align_bam(x) for x in items], items):
        for data in items:
            if vcfutils.get_paired_phenotype(data) == "tumor":
                return data
        raise ValueError("Did not find tumor sample in paired tumor/normal calling: %s" %
                         [dd.get_sample_name(x) for x in items])
    else:
        return items[0]
def _add_hla_files(data):
    """Add extracted fastq files of HLA alleles for typing.
    """
    if "hla" not in data:
        data["hla"] = {}
    align_file = dd.get_align_bam(data)
    hla_dir = os.path.join(os.path.dirname(align_file), "hla")
    if not os.path.exists(hla_dir):
        hla_files = None
    else:
        hla_files = sorted(list(glob.glob(os.path.join(hla_dir, "%s.*.fq" % os.path.basename(align_file)))))
    data["hla"]["fastq"] = hla_files
    return data
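# Illustrative only: the directory layout the glob above expects -- HLA fastqs named
# "<align_bam basename>.*.fq" inside an "hla" directory next to the alignment. The
# path below is a placeholder.
import glob
import os

align_file = "/work/align/NA12878/NA12878-sort.bam"
pattern = os.path.join(os.path.dirname(align_file), "hla",
                       "%s.*.fq" % os.path.basename(align_file))
hla_fastqs = sorted(glob.glob(pattern))  # [] when no HLA extraction was run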
def pipeline_summary(data):
    """Provide summary information on processing sample.

    Handles standard and CWL (single QC output) cases.
    """
    data = utils.to_single_data(data)
    if data["analysis"].startswith("wgbs-seq"):
        bismark_bam = dd.get_align_bam(data)
        sorted_bam = bam.sort(bismark_bam, data["config"])
        data = dd.set_align_bam(data, sorted_bam)
        data = dd.set_work_bam(data, bismark_bam)
    work_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    if not work_bam or not work_bam.endswith(".bam"):
        work_bam = None
    if dd.get_ref_file(data):
        if work_bam or (tz.get_in(["config", "algorithm", "kraken"], data)):  # kraken doesn't need bam
            logger.info("QC: %s %s" % (dd.get_sample_name(data), ", ".join(dd.get_algorithm_qc(data))))
            work_data = cwlutils.unpack_tarballs(utils.deepish_copy(data), data)
            data["summary"] = _run_qc_tools(work_bam, work_data)
            if (len(dd.get_algorithm_qc(data)) == 1 and "output_cwl_keys" in data):
                data["summary"]["qc"] = data["summary"]["qc"].get(dd.get_algorithm_qc(data)[0])
    return [[data]]
def run_mosdepth(data, target_name, bed_file, per_base=False, quantize=None):
    """Run mosdepth generating distribution, region depth and per-base depth.
    """
    MosdepthCov = collections.namedtuple("MosdepthCov", ("dist", "per_base", "regions", "quantize"))
    bam_file = dd.get_align_bam(data) or dd.get_work_bam(data)
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "coverage",
                                               dd.get_sample_name(data)))
    prefix = os.path.join(work_dir, "%s-%s" % (dd.get_sample_name(data), target_name))
    out = MosdepthCov("%s.mosdepth.dist.txt" % prefix,
                      ("%s.per-base.bed.gz" % prefix) if per_base else None,
                      ("%s.regions.bed.gz" % prefix) if bed_file else None,
                      ("%s.quantized.bed.gz" % prefix) if quantize else None)
    if not utils.file_uptodate(out.dist, bam_file):
        with file_transaction(data, out.dist) as tx_out_file:
            tx_prefix = os.path.join(os.path.dirname(tx_out_file), os.path.basename(prefix))
            num_cores = dd.get_cores(data)
            bed_arg = ("--by %s" % bed_file) if bed_file else ""
            perbase_arg = "" if per_base else "--no-per-base"
            mapq_arg = "-Q 1" if (per_base or quantize) else ""
            if quantize:
                quant_arg = "--quantize %s" % quantize[0]
                quant_export = " && ".join(["export MOSDEPTH_Q%s=%s" % (i, x)
                                            for (i, x) in enumerate(quantize[1])])
                quant_export += " && "
            else:
                quant_arg, quant_export = "", ""
            cmd = ("{quant_export}mosdepth -t {num_cores} -F 1804 {mapq_arg} {perbase_arg} {bed_arg} {quant_arg} "
                   "{tx_prefix} {bam_file}")
            message = "Calculating coverage: %s %s" % (dd.get_sample_name(data), target_name)
            do.run(cmd.format(**locals()), message.format(**locals()))
            if out.per_base:
                shutil.move(os.path.join(os.path.dirname(tx_out_file), os.path.basename(out.per_base)),
                            out.per_base)
            if out.regions:
                shutil.move(os.path.join(os.path.dirname(tx_out_file), os.path.basename(out.regions)),
                            out.regions)
            if out.quantize:
                shutil.move(os.path.join(os.path.dirname(tx_out_file), os.path.basename(out.quantize)),
                            out.quantize)
    return out
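# Illustrative only: the expected shape of the `quantize` argument above. The
# cutoffs and labels are example values, not pipeline defaults.
quantize = ("0:1:4:100:", ["NO_COVERAGE", "LOW_COVERAGE", "CALLABLE", "HIGH_COVERAGE"])
quant_export = " && ".join("export MOSDEPTH_Q%s=%s" % (i, label)
                           for i, label in enumerate(quantize[1]))
# -> "export MOSDEPTH_Q0=NO_COVERAGE && export MOSDEPTH_Q1=LOW_COVERAGE && ...";
# mosdepth then labels each region of the quantized BED with the matching name.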
def _cnn_score_variants(in_file, tensor_type, data):
    """Score variants with pre-trained CNN models.
    """
    out_file = "%s-cnnscore.vcf.gz" % utils.splitext_plus(in_file)[0]
    if not utils.file_uptodate(out_file, in_file):
        runner = broad.runner_from_config(data["config"])
        gatk_type = runner.gatk_type()
        assert gatk_type == "gatk4", "CNN filtering requires GATK4"
        with file_transaction(data, out_file) as tx_out_file:
            params = ["-T", "CNNScoreVariants",
                      "--variant", in_file,
                      "--reference", dd.get_ref_file(data),
                      "--output", tx_out_file,
                      "--input", dd.get_align_bam(data)]
            params += ["--tensor-type", tensor_type]
            runner.run_gatk(params)
    return vcfutils.bgzip_and_index(out_file, data["config"])
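# Illustrative only: the underlying GATK4 invocation the broad runner wraps. File
# paths are placeholders; "read_tensor" selects the 2D model that also uses read
# evidence from the BAM, while "reference" uses the 1D reference-context model.
import subprocess

subprocess.check_call(["gatk", "CNNScoreVariants",
                       "--variant", "sample.vcf.gz",
                       "--reference", "genome.fa",
                       "--input", "sample.bam",
                       "--tensor-type", "read_tensor",
                       "--output", "sample-cnnscore.vcf.gz"])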
def _run_amber(paired, work_dir, lenient=False):
    """AMBER: calculate allele frequencies at likely heterozygous sites.

    lenient flag allows amber runs on small test sets.
    """
    amber_dir = utils.safe_makedir(os.path.join(work_dir, "amber"))
    out_file = os.path.join(amber_dir, "%s.amber.baf" % dd.get_sample_name(paired.tumor_data))
    if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".pcf"):
        with file_transaction(paired.tumor_data, out_file) as tx_out_file:
            key = "germline_het_pon"
            het_bed = tz.get_in(["genome_resources", "variation", key], paired.tumor_data)
            cmd = ["AMBER",
                   "-threads", dd.get_num_cores(paired.tumor_data),
                   "-tumor", dd.get_sample_name(paired.tumor_data),
                   "-tumor_bam", dd.get_align_bam(paired.tumor_data),
                   "-reference", dd.get_sample_name(paired.normal_data),
                   "-reference_bam", dd.get_align_bam(paired.normal_data),
                   "-ref_genome", dd.get_ref_file(paired.tumor_data),
                   "-bed", het_bed,
                   "-output_dir", os.path.dirname(tx_out_file)]
            if lenient:
                cmd += ["-max_het_af_percent", "1.0"]
            try:
                do.run(cmd, "PURPLE: AMBER baf generation")
            except subprocess.CalledProcessError as msg:
                if not lenient and _amber_allowed_errors(str(msg)):
                    return _run_amber(paired, work_dir, True)
            for f in os.listdir(os.path.dirname(tx_out_file)):
                if f != os.path.basename(tx_out_file):
                    shutil.move(os.path.join(os.path.dirname(tx_out_file), f),
                                os.path.join(amber_dir, f))
    return out_file
def run(items):
    """Perform detection of structural variations with lumpy.
    """
    if not all(utils.get_in(data, ("config", "algorithm", "aligner"))
               in ["bwa", "sentieon-bwa", "minimap2", False, None] for data in items):
        raise ValueError("Require bwa or minimap2 alignment input for lumpy structural variation detection")
    paired = vcfutils.get_paired(items)
    work_dir = _sv_workdir(paired.tumor_data if paired and paired.tumor_data else items[0])
    previous_evidence = {}
    full_bams, sr_bams, disc_bams = [], [], []
    for data in items:
        full_bams.append(dd.get_align_bam(data))
        sr_bam, disc_bam = sshared.find_existing_split_discordants(data)
        sr_bams.append(sr_bam)
        disc_bams.append(disc_bam)
        cur_dels, cur_dups = _bedpes_from_cnv_caller(data, work_dir)
        previous_evidence[dd.get_sample_name(data)] = {}
        if cur_dels and utils.file_exists(cur_dels):
            previous_evidence[dd.get_sample_name(data)]["dels"] = cur_dels
        if cur_dups and utils.file_exists(cur_dups):
            previous_evidence[dd.get_sample_name(data)]["dups"] = cur_dups
    lumpy_vcf, exclude_file = _run_smoove(full_bams, sr_bams, disc_bams, work_dir, items)
    gt_vcfs = {}
    # Retain paired samples with tumor/normal genotyped in one file
    if paired and paired.normal_name:
        batches = [[paired.tumor_data, paired.normal_data]]
    else:
        batches = [[x] for x in items]
    for batch_items in batches:
        for data in batch_items:
            gt_vcfs[dd.get_sample_name(data)] = _filter_by_support(lumpy_vcf, data)
    if paired and paired.normal_name:
        gt_vcfs = _filter_by_background(paired.tumor_name, [paired.normal_name], gt_vcfs, paired.tumor_data)
    out = []
    for data in items:
        if "sv" not in data:
            data["sv"] = []
        vcf_file = gt_vcfs.get(dd.get_sample_name(data))
        if vcf_file:
            if dd.get_svprioritize(data):
                effects_vcf, _ = effects.add_to_vcf(vcf_file, data, "snpeff")
            else:
                effects_vcf = None
            data["sv"].append({"variantcaller": "lumpy",
                               "vrn_file": effects_vcf or vcf_file,
                               "exclude_file": exclude_file})
        out.append(data)
    return out
def _get_original_coverage(data, itype="target"):
    """Back compatible: get existing coverage files if they exist.
    """
    work_dir = os.path.join(_sv_workdir(data), "raw")
    work_bam = dd.get_work_bam(data) or dd.get_align_bam(data)
    out = []
    base, _ = _bam_to_outbase(work_bam, work_dir, data)
    target_cnn = "%s.targetcoverage.cnn" % base
    anti_cnn = "%s.antitargetcoverage.cnn" % base
    if os.path.exists(target_cnn) and os.path.exists(anti_cnn):
        out.append({"bam": work_bam, "file": target_cnn, "cnntype": "target",
                    "itype": itype, "sample": dd.get_sample_name(data)})
        out.append({"bam": work_bam, "file": anti_cnn, "cnntype": "antitarget",
                    "itype": itype, "sample": dd.get_sample_name(data)})
    return out
def _split_samples_by_qc(samples):
    """Split data into individual quality control steps for a run.
    """
    to_process = []
    extras = []
    for data in [utils.to_single_data(x) for x in samples]:
        qcs = dd.get_algorithm_qc(data)
        if not dd.get_align_bam(data) or not qcs:
            extras.append([data])
        else:
            for qc in qcs:
                add = copy.deepcopy(data)
                add["config"]["algorithm"]["qc"] = [qc]
                to_process.append([add])
    return to_process, extras
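# Illustrative only: a standalone sketch of the per-QC-tool fan-out performed in the
# loop above. Sample and tool names are made up.
import copy

sample = {"description": "NA12878",
          "config": {"algorithm": {"qc": ["samtools", "fastqc", "qualimap"]}}}
fanned_out = []
for qc in sample["config"]["algorithm"]["qc"]:
    add = copy.deepcopy(sample)
    add["config"]["algorithm"]["qc"] = [qc]
    fanned_out.append([add])
# len(fanned_out) == 3; each entry carries a single QC tool to run independently.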