def remove_highdepth_regions(in_file, items):
    """Remove high depth regions from a BED file for analyzing a set of calls.

    Tries to avoid spurious errors and slow run times in collapsed repeat regions.

    Also adds ENCODE blacklist regions which capture additional collapsed repeats
    around centromeres.

    Returns the path to the output BED file, named from in_file with a
    "-glimit" suffix inserted before the extension.
    """
    from bcbio.variation import bedutils
    # Materialize as a list: under Python 3 filter() returns a lazy iterator that
    # supports neither the append() nor the length check used below.
    highdepth_beds = [bed for bed in
                      set(tz.get_in(["config", "algorithm", "highdepth_regions"], x) for x in items)
                      if bed is not None]
    encode_bed = tz.get_in(["genome_resources", "variation", "encode_blacklist"], items[0])
    if encode_bed and os.path.exists(encode_bed):
        highdepth_beds.append(encode_bed)
    out_file = "%s-glimit%s" % utils.splitext_plus(in_file)
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            with bedtools_tmpdir(items[0]):
                all_file = "%s-all.bed" % utils.splitext_plus(tx_out_file)[0]
                if highdepth_beds:
                    with open(all_file, "w") as out_handle:
                        for line in fileinput.input(highdepth_beds):
                            # Keep at most the first four columns of each region
                            parts = line.split("\t")
                            out_handle.write("\t".join(parts[:4]).rstrip() + "\n")
                if utils.file_exists(all_file):
                    to_remove = bedutils.sort_merge(all_file, items[0])
                    cmd = "bedtools subtract -nonamecheck -a {in_file} -b {to_remove} > {tx_out_file}"
                    do.run(cmd.format(in_file=in_file, to_remove=to_remove, tx_out_file=tx_out_file),
                           "Remove high depth regions")
                else:
                    # Nothing to subtract: expose the input under the output name
                    utils.symlink_plus(in_file, out_file)
    return out_file
def _combine_sample_regions_batch(batch, items):
    """Combine sample regions within a group of batched samples.

    Returns a tuple of (analysis BED, no-analysis BED) paths in the "regions"
    work directory, or (None, None) when the files need building but no sample
    in items carries a "regions" entry.
    """
    first = items[0]
    config = first["config"]
    work_dir = utils.safe_makedir(os.path.join(first["dirs"]["work"], "regions"))
    analysis_file = os.path.join(work_dir, "%s-analysis_blocks.bed" % batch)
    no_analysis_file = os.path.join(work_dir, "%s-noanalysis_blocks.bed" % batch)
    if utils.file_exists(analysis_file) and not _needs_region_update(analysis_file, items):
        return analysis_file, no_analysis_file

    # Combine all nblocks into a final set of intersecting regions
    # without callable bases. HT @brentp for intersection approach
    # https://groups.google.com/forum/?fromgroups#!topic/bedtools-discuss/qA9wK4zN8do
    sample_nblocks = [pybedtools.BedTool(x["regions"]["nblock"]) for x in items if "regions" in x]
    if not sample_nblocks:
        return None, None

    with file_transaction(first, analysis_file, no_analysis_file) as (tx_afile, tx_noafile):
        # Fold the per-sample BedTools together with "+", left to right
        combined = sample_nblocks[0]
        for sample_bed in sample_nblocks[1:]:
            combined = combined + sample_bed
        nblock_regions = combined.saveas("%s-nblock%s" % utils.splitext_plus(tx_afile))
        ref_file = tz.get_in(["reference", "fasta", "base"], first)
        ref_regions = get_ref_bedtool(ref_file, config)
        min_n_size = int(config["algorithm"].get("nomap_split_size", 100))
        ec_regions = _combine_excessive_coverage(items, ref_regions, min_n_size, tx_afile)
        if len(ec_regions) > 0:
            nblock_regions = nblock_regions.cat(ec_regions, d=min_n_size)
        block_filter = NBlockRegionPicker(ref_regions, config)
        picked = nblock_regions.filter(block_filter.include_block)
        expanded = picked.each(block_filter.expand_block)
        final_nblock_regions = expanded.saveas("%s-nblockfinal%s" % utils.splitext_plus(tx_afile))
        final_regions = ref_regions.subtract(final_nblock_regions).merge(d=min_n_size)
        _write_bed_regions(first, final_regions, tx_afile, tx_noafile)
    return analysis_file, no_analysis_file
def run_tnhaplotyper(align_bams, items, ref_file, assoc_files,
                     region=None, out_file=None):
    """Call variants with Sentieon's TNhaplotyper (MuTect2 like).

    Requires a paired normal BAM. When out_file is not given it is named from
    the first BAM in align_bams with a "-variants.vcf.gz" suffix. Returns the
    output file path; an existing output is reused without re-running.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    if utils.file_exists(out_file):
        return out_file

    sample = items[0]
    variant_regions = bedutils.merge_overlaps(dd.get_variant_regions(sample), sample)
    interval = _get_interval(variant_regions, region, out_file, items)
    with file_transaction(sample, out_file) as tx_out_file:
        paired = vcfutils.get_paired_bams(align_bams, items)
        assert paired.normal_bam, "Require normal BAM for Sentieon TNhaplotyper"
        if "dbsnp" in assoc_files:
            dbsnp_opt = "--dbsnp %s" % (assoc_files.get("dbsnp"))
        else:
            dbsnp_opt = ""
        if "cosmic" in assoc_files:
            cosmic_opt = "--cosmic %s" % (assoc_files.get("cosmic"))
        else:
            cosmic_opt = ""
        license_prefix = license_export(sample)
        tx_orig_file = "%s-orig%s" % utils.splitext_plus(tx_out_file)
        num_cores = dd.get_num_cores(sample)
        call_cmd = ("{license}sentieon driver -t {cores} -r {ref_file} "
                    "-i {paired.tumor_bam} -i {paired.normal_bam} {interval} "
                    "--algo TNhaplotyper "
                    "--tumor_sample {paired.tumor_name} --normal_sample {paired.normal_name} "
                    "{dbsnp} {cosmic} {tx_orig_file}")
        do.run(call_cmd.format(license=license_prefix, cores=num_cores, ref_file=ref_file,
                               paired=paired, interval=interval, dbsnp=dbsnp_opt,
                               cosmic=cosmic_opt, tx_orig_file=tx_orig_file),
               "Sentieon TNhaplotyper")
        # Rewrite selected header Type declarations to String
        header_cmd = ("gunzip -c {tx_orig_file} | "
                      "sed 's/ID=ECNT,Number=1,Type=Integer/ID=ECNT,Number=1,Type=String/' | "
                      "sed 's/ID=HCNT,Number=1,Type=Integer/ID=HCNT,Number=1,Type=String/' | "
                      "sed 's/ID=NLOD,Number=1,Type=Float/ID=NLOD,Number=1,Type=String/' | "
                      "sed 's/ID=TLOD,Number=1,Type=Float/ID=TLOD,Number=1,Type=String/' | "
                      "sed 's/ID=PON,Number=1,Type=Integer/ID=PON,Number=1,Type=String/' | "
                      "bgzip -c > {tx_out_file}")
        do.run(header_cmd.format(tx_orig_file=tx_orig_file, tx_out_file=tx_out_file),
               "Sentieon TNhaplotyper: make headers GATK compatible")
        vcfutils.bgzip_and_index(tx_out_file, sample["config"])
    return out_file
def _evaluate_multi(callers, truth_svtypes, ensemble, call_beds, data):
    """Evaluate callers against truth sets by SV type and event size.

    Writes a summary CSV and a long-format ("df") CSV next to the ensemble
    file, one evaluation per (svtype, size, caller). Returns the two CSV paths.
    """
    base = utils.splitext_plus(ensemble)[0]
    out_file = "%s-validate.csv" % base
    df_file = "%s-validate-df.csv" % base
    is_current = utils.file_uptodate(out_file, ensemble) and utils.file_uptodate(df_file, ensemble)
    if is_current:
        return out_file, df_file

    with open(out_file, "w") as out_handle, open(df_file, "w") as df_out_handle:
        writer = csv.writer(out_handle)
        dfwriter = csv.writer(df_out_handle)
        total_callers = callers_by_event(ensemble, data)
        writer.writerow(["svtype", "size", "caller", "sensitivity", "precision"])
        dfwriter.writerow(["svtype", "size", "caller", "metric", "value", "label"])
        for svtype, truth in truth_svtypes.items():
            for size in EVENT_SIZES:
                str_size = "%s-%s" % size
                for caller in callers:
                    # Only callers reporting this event type, plus the ensemble itself
                    if caller not in total_callers[svtype] and caller != "sv-ensemble":
                        continue
                    if caller in call_beds:
                        call_bed = call_beds[caller]
                    else:
                        assert caller == "sv-ensemble", caller
                        call_bed = ensemble
                    evalout = _evaluate_one(caller, svtype, size, call_bed, truth, data)
                    writer.writerow([svtype, str_size, caller,
                                     evalout["sensitivity"]["label"], evalout["precision"]["label"]])
                    for metric in ("sensitivity", "precision"):
                        dfwriter.writerow([svtype, str_size, caller, metric,
                                           evalout[metric]["val"], evalout[metric]["label"]])
    return out_file, df_file
def _fix_gatk_header(exist_files, out_file, config):
    """Ensure consistent headers for VCF concatenation.

    Fixes problems for genomes that start with chrM by reheadering the first file.
    These files do haploid variant calling which lack the PID phasing key/value
    pair in FORMAT, so initial chrM samples cause errors during concatenation
    due to the lack of header merging. This fixes this by updating the first header.

    exist_files is a list of (contig, file) pairs. Returns the list of files
    with the first one replaced by its reheadered copy.
    """
    from bcbio.variation import ploidy
    first_contig, base_file = exist_files[0]
    items = [{"config": config}]
    # Default to the first file's own header; prefer the first later file with ploidy > 1
    replace_file = base_file
    if ploidy.get_ploidy(items, region=(first_contig, 1, 2)) == 1:
        for contig, fname in exist_files[1:]:
            if ploidy.get_ploidy(items, (contig, 1, 2)) > 1:
                replace_file = fname
                break
    fix_name = "%s-fixheader%s" % utils.splitext_plus(os.path.basename(base_file))
    base_fix_file = os.path.join(os.path.dirname(out_file), fix_name)
    with file_transaction(config, base_fix_file) as tx_out_file:
        header_file = "%s-header.vcf" % utils.splitext_plus(tx_out_file)[0]
        header_cmd = "zgrep ^# %s > %s" % (replace_file, header_file)
        do.run(header_cmd, "Prepare header file for merging")
        resources = config_utils.get_resources("picard", config)
        ropts = [str(x) for x in resources.get("options", [])] if "options" in resources else []
        picard_cmd = ("%s && picard FixVcfHeader HEADER=%s INPUT=%s OUTPUT=%s %s" %
                      (utils.get_java_clprep(), header_file, base_file, base_fix_file, " ".join(ropts)))
        do.run(picard_cmd, "Reheader initial VCF file in merge")
    bgzip_and_index(base_fix_file, config)
    remaining = [fname for (_, fname) in exist_files[1:]]
    return [base_fix_file] + remaining
def add_genome_context(orig_file, data):
    """Annotate a file with annotations of genome context using vcfanno.

    Builds a vcfanno TOML configuration from the genome context BED files that
    have at least four columns, then combines all of them into a single
    "genome_context" post-annotation. Returns the bgzipped and indexed output.
    """
    out_file = "%s-context.vcf.gz" % utils.splitext_plus(orig_file)[0]
    if not utils.file_uptodate(out_file, orig_file):
        with file_transaction(data, out_file) as tx_out_file:
            config_file = "%s.toml" % (utils.splitext_plus(tx_out_file)[0])
            with open(config_file, "w") as out_handle:
                all_names = []
                for fname in dd.get_genome_context_files(data):
                    if pybedtools.BedTool(fname).field_count() < 4:
                        continue
                    # Annotation name: <parent directory>_<file base name>
                    parent_dir, base = os.path.split(fname)
                    prefix = os.path.split(parent_dir)[1]
                    name = "%s_%s" % (prefix, utils.splitext_plus(base)[0])
                    out_handle.write("".join(["[[annotation]]\n",
                                              'file = "%s"\n' % fname,
                                              "columns = [4]\n",
                                              'names = ["%s"]\n' % name,
                                              'ops = ["uniq"]\n']))
                    all_names.append(name)
                quoted_names = ", ".join(['"%s"' % n for n in all_names])
                out_handle.write("".join(["[[postannotation]]\n",
                                          "fields = [%s]\n" % (quoted_names),
                                          'name = "genome_context"\n',
                                          'op = "concat"\n',
                                          'type = "String"\n']))
            cmd = "vcfanno {config_file} {orig_file} | bgzip -c > {tx_out_file}"
            do.run(cmd.format(config_file=config_file, orig_file=orig_file, tx_out_file=tx_out_file),
                   "Annotate with problem annotations", data)
    return vcfutils.bgzip_and_index(out_file, data["config"])
def _fastp_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with fastp (https://github.com/OpenGene/fastp)

    fastq_files holds one or two input fastq files; the first is passed with
    -i/-o and any further one with -I/-O. adapters is a list of adapter
    sequences; when empty, adapter trimming is disabled.

    Returns a tuple of (list of trimmed fastq paths, JSON report path), all in
    out_dir. Trimming is skipped when the first trimmed file already exists.
    """
    report_file = os.path.join(out_dir, "%s-report.json" %
                               utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [os.path.join(out_dir, "%s-trimmed.fq.gz" % utils.splitext_plus(os.path.basename(x))[0])
                 for x in fastq_files]
    if not utils.file_exists(out_files[0]):
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report = tx_out[0]
            tx_out_files = tx_out[1:]
            cmd = ["fastp", "--thread", str(dd.get_num_cores(data))]
            if dd.get_quality_format(data).lower() == "illumina":
                cmd += ["--phred64"]
            for i, (inf, outf) in enumerate(zip(fastq_files, tx_out_files)):
                if i == 0:
                    cmd += ["-i", inf, "-o", outf]
                else:
                    cmd += ["-I", inf, "-O", outf]
            cmd += ["--cut_by_quality3", "--cut_mean_quality", "5",
                    "--length_required", str(dd.get_min_read_length(data)),
                    "--disable_quality_filtering"]
            requested = dd.get_adapters(data)
            if "polyx" in requested:
                cmd += ["--trim_poly_x", "--poly_x_min_len", "8"]
            # polyG trimming is enabled for both the polyx and polyg settings
            if "polyx" in requested or "polyg" in requested:
                cmd += ["--trim_poly_g", "--poly_g_min_len", "8"]
            for a in adapters:
                cmd += ["--adapter_sequence", a]
            if not adapters:
                cmd += ["--disable_adapter_trimming"]
            # Write the report to its transactional path so it only reaches
            # report_file together with the trimmed outputs on success.
            cmd += ["--json", tx_report, "--report_title", dd.get_sample_name(data)]
            do.run(cmd, "Trimming with fastp: %s" % dd.get_sample_name(data))
    return out_files, report_file
def _run_snpeff(snp_in, out_format, data):
    """Run effects prediction with snpEff, requiring the snpEff database.

    Asserts that the snpEff data directory and genome database are available.
    Output keeps the input extension for "vcf" output and uses ".tsv"
    otherwise. Returns the path to the effects file.
    """
    snpeff_db, datadir = get_db(data)
    assert datadir is not None, \
        "Did not find snpEff resources in genome configuration: %s" % data["genome_resources"]
    assert os.path.exists(os.path.join(datadir, snpeff_db)), \
        "Did not find %s snpEff genome data in %s" % (snpeff_db, datadir)
    snpeff_cmd = get_cmd("eff", datadir, data["config"])
    in_base, in_ext = utils.splitext_plus(snp_in)
    ext = in_ext if out_format == "vcf" else ".tsv"
    is_gzipped = ext.endswith(".gz")
    out_file = "%s-effects%s" % (in_base, ext)
    if not utils.file_exists(out_file):
        config_args = " ".join(_snpeff_args_from_config(data))
        # Compress on the fly when the target file is gzipped
        bgzip_cmd = "| %s -c" % tools.get_bgzip_cmd(data["config"]) if is_gzipped else ""
        with file_transaction(out_file) as tx_out_file:
            cmd = ("{snpeff_cmd} {config_args} -noLog -1 -i vcf -o {out_format} "
                   "{snpeff_db} {snp_in} {bgzip_cmd} > {tx_out_file}")
            do.run(cmd.format(snpeff_cmd=snpeff_cmd, config_args=config_args, out_format=out_format,
                              snpeff_db=snpeff_db, snp_in=snp_in, bgzip_cmd=bgzip_cmd,
                              tx_out_file=tx_out_file),
                   "snpEff effects", data)
    if is_gzipped:
        out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
def _run_snpeff(snp_in, out_format, data):
    """Run effects prediction with snpEff, skipping if snpEff database not present.

    Returns None when no snpEff database is configured, otherwise the path to
    the effects file (bgzipped and indexed when the output ends in ".gz").
    """
    snpeff_db, datadir = get_db(data)
    if not snpeff_db:
        return None

    assert os.path.exists(os.path.join(datadir, snpeff_db)), \
        "Did not find %s snpEff genome data in %s" % (snpeff_db, datadir)
    snpeff_cmd = get_cmd("eff", datadir, data["config"])
    in_base, in_ext = utils.splitext_plus(snp_in)
    ext = in_ext if out_format == "vcf" else ".tsv"
    is_gzipped = ext.endswith(".gz")
    out_file = "%s-effects%s" % (in_base, ext)
    if not utils.file_exists(out_file):
        config_args = " ".join(_snpeff_args_from_config(data))
        # Compress on the fly when the target file is gzipped
        bgzip_cmd = "| %s -c" % tools.get_bgzip_cmd(data["config"]) if is_gzipped else ""
        with file_transaction(data, out_file) as tx_out_file:
            cmd = ("{snpeff_cmd} {config_args} -noLog -i vcf -o {out_format} "
                   "{snpeff_db} {snp_in} {bgzip_cmd} > {tx_out_file}")
            do.run(cmd.format(snpeff_cmd=snpeff_cmd, config_args=config_args, out_format=out_format,
                              snpeff_db=snpeff_db, snp_in=snp_in, bgzip_cmd=bgzip_cmd,
                              tx_out_file=tx_out_file),
                   "snpEff effects", data)
    if is_gzipped:
        out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
def run_filter(vrn_file, align_bam, ref_file, data, items):
    """Filter and annotate somatic VCFs with damage/bias artifacts on low frequency variants.

    Moves damage estimation to INFO field, instead of leaving in FILTER.

    Returns data unchanged when filtering does not apply or the VCF has no
    variants; otherwise updates data["vrn_file"] and data["damage_plots"].
    """
    if not should_filter(items) or not vcfutils.vcf_has_variants(vrn_file):
        return data

    sample = items[0]
    raw_file = "%s-damage.vcf" % utils.splitext_plus(vrn_file)[0]
    plot_base = utils.splitext_plus(raw_file)[0]
    out_plot_files = ["%s%s" % (plot_base, ext)
                      for ext in ["_seq_bias_simplified.pdf", "_pcr_bias_simplified.pdf"]]
    # Run only when neither the plain nor the gzipped output is current
    have_output = utils.file_uptodate(raw_file, vrn_file) or utils.file_uptodate(raw_file + ".gz", vrn_file)
    if not have_output:
        with file_transaction(sample, raw_file) as tx_out_file:
            # Does not apply --qcSummary plotting due to slow runtimes
            cmd = ["dkfzbiasfilter.py", "--filterCycles", "1", "--passOnly",
                   "--tempFolder", os.path.dirname(tx_out_file),
                   vrn_file, align_bam, ref_file, tx_out_file]
            do.run(cmd, "Filter low frequency variants for DNA damage and strand bias")
            tx_plot_dir = os.path.join("%s_qcSummary" % utils.splitext_plus(tx_out_file)[0], "plots")
            for out_plot in out_plot_files:
                tx_plot_file = os.path.join(tx_plot_dir, os.path.basename(out_plot))
                if utils.file_exists(tx_plot_file):
                    shutil.move(tx_plot_file, out_plot)
    raw_file = vcfutils.bgzip_and_index(raw_file, sample["config"])
    data["vrn_file"] = _filter_to_info(raw_file, sample)
    data["damage_plots"] = [x for x in out_plot_files if utils.file_exists(x)]
    return data
def select_regions(args):
    """Select CpG calls overlapping target regions and write them in BED format.

    Reads args.files (input VCFs), args.region (regions to intersect with) and
    args.out (used to name the outputs inside the "select" directory). For
    each input the matching SNP file is passed to get_het, and every
    intersecting record accepted by is_good_cpg is written as a BED line of
    chrom, start, end, tag, "." and the first INFO value.
    """
    assert args.files, "Need a set of fastq files"
    assert args.out, "Need --out"
    region = os.path.abspath(args.region)
    workdir = 'select'
    safe_makedir(workdir)
    out_base = splitext_plus(args.out)[0]
    out_file = os.path.join(workdir, out_base + "_cpg.bed")
    out_snp_file = os.path.join(workdir, out_base + '_snp.bed')
    if not file_exists(out_file):
        with file_transaction(out_file) as tx_out:
            with open(tx_out, 'w') as out_handle:
                for in_vcf in args.files:
                    snp_file = in_vcf.replace("rawcpg", "rawsnp")
                    sample = splitext_plus(os.path.basename(in_vcf))[0].split("_")[0]
                    get_het(snp_file, region, sample, out_snp_file)
                    res = pybedtools.BedTool(in_vcf).intersect(b=region, wo=True)
                    for record in res:
                        gene = record[-2]
                        chrom, pos, info, header, frmt = (record[0], int(record[1]), record[7],
                                                          record[8], record[9])
                        cs = info.split(';')[0].split('=')[1]
                        # Pair FORMAT keys with this record's sample values
                        frmt = dict(zip(header.split(":"), frmt.split(':')))
                        if is_good_cpg(frmt):
                            tag = "%s-%s-%s-%s" % (frmt['CU'], frmt['CM'], gene, sample)
                            # write() rather than the Python 2-only "print >>" statement,
                            # so this works on both Python 2 and Python 3
                            out_handle.write("%s\t%s\t%s\t%s\t.\t%s\n" % (chrom, pos, pos + 1, tag, cs))
def _cnvkit_metrics(cnns, target_bed, antitarget_bed, cov_interval, items):
    """Estimate noise of a sample using a flat background.

    Only used for panel/targeted data due to memory issues with whole genome
    samples.

    Returns cnns unchanged for "genome" coverage; otherwise returns the cnn
    dictionaries, each updated with a "metrics" entry.
    """
    if cov_interval == "genome":
        return cnns

    target_cnn = [x["file"] for x in cnns if x["cnntype"] == "target"][0]
    target_base = utils.splitext_plus(target_cnn)[0]
    # Empty list of inputs to the background step, giving the flat background
    background_file = cnvkit_background([], "%s-flatbackground.cnn" % target_base,
                                        items, target_bed, antitarget_bed)
    cnr_file, data = _cnvkit_fix_base(cnns, background_file, items, "-flatbackground")
    cns_file = _cnvkit_segment(cnr_file, cov_interval, data)
    metrics_file = "%s-metrics.txt" % target_base
    if not utils.file_exists(metrics_file):
        with file_transaction(data, metrics_file) as tx_metrics_file:
            metrics_cmd = [_get_cmd(), "metrics", "-o", tx_metrics_file, "-s", cns_file, "--", cnr_file]
            do.run(_prep_cmd(metrics_cmd, tx_metrics_file), "CNVkit metrics")
    metrics = _read_metrics_file(metrics_file)
    annotated = []
    for cnn in cnns:
        cnn["metrics"] = metrics
        annotated.append(cnn)
    return annotated
def _get_files(data):
    """Retrieve the input BAM plus the high depth BED and stats YAML paths.

    Outputs live in the sample's "align" work directory, which is created if
    needed. Returns a tuple of (work_bam, out_file, stats_file).
    """
    work_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    sample_dir = os.path.join(tz.get_in(["dirs", "work"], data), "align", dd.get_sample_name(data))
    out_dir = utils.safe_makedir(sample_dir)
    bam_base = utils.splitext_plus(os.path.basename(work_bam))[0]
    out_file = "%s-highdepth.bed" % os.path.join(out_dir, bam_base)
    stats_file = "%s-stats.yaml" % utils.splitext_plus(out_file)[0]
    return work_bam, out_file, stats_file
def umi_consensus(data):
    """Convert UMI grouped reads into fastq pair for re-alignment.

    Pipes fgbio GroupReadsByUmi, consensus calling and FilterConsensusReads
    into bamtofastq. Returns the two consensus fastq paths, named from the
    work BAM with "-cumi-1.fq.gz" and "-cumi-2.fq.gz" suffixes.
    """
    align_bam = dd.get_work_bam(data)
    umi_method, umi_tag = _check_umi_type(align_bam)
    bam_base = utils.splitext_plus(align_bam)[0]
    f1_out = "%s-cumi-1.fq.gz" % bam_base
    f2_out = "%s-cumi-2.fq.gz" % bam_base
    if utils.file_uptodate(f1_out, align_bam):
        return f1_out, f2_out

    with file_transaction(data, f1_out, f2_out) as (tx_f1_out, tx_f2_out):
        jvm_opts = _get_fgbio_jvm_opts(data, os.path.dirname(tx_f1_out), 2)
        # Improve speeds by avoiding compression read/write bottlenecks
        io_opts = "--async-io=true --compression=0"
        group_opts, cons_opts, filter_opts = _get_fgbio_options(data, umi_method)
        if umi_method == "paired":
            cons_method = "CallDuplexConsensusReads"
        else:
            cons_method = "CallMolecularConsensusReads"
        collate_tmp = "%s-bamtofastq-tmp" % utils.splitext_plus(f1_out)[0]
        ref_file = dd.get_ref_file(data)
        cmd = ("unset JAVA_HOME && "
               "fgbio {jvm_opts} {io_opts} GroupReadsByUmi {group_opts} -t {umi_tag} -s {umi_method} "
               "-i {align_bam} | "
               "fgbio {jvm_opts} {io_opts} {cons_method} {cons_opts} --sort-order=:none: "
               "-i /dev/stdin -o /dev/stdout | "
               "fgbio {jvm_opts} {io_opts} FilterConsensusReads {filter_opts} -r {ref_file} "
               "-i /dev/stdin -o /dev/stdout | "
               "bamtofastq collate=1 T={tempfile} F={tx_f1_out} F2={tx_f2_out} tags=cD,cM,cE gz=1")
        do.run(cmd.format(jvm_opts=jvm_opts, io_opts=io_opts, group_opts=group_opts,
                          umi_tag=umi_tag, umi_method=umi_method, align_bam=align_bam,
                          cons_method=cons_method, cons_opts=cons_opts, filter_opts=filter_opts,
                          ref_file=ref_file, tempfile=collate_tmp,
                          tx_f1_out=tx_f1_out, tx_f2_out=tx_f2_out),
               "UMI consensus fastq generation")
    return f1_out, f2_out
def run(vcf, conf_fns, lua_fns, data, basepath=None, decomposed=False):
    """Annotate a VCF file using vcfanno (https://github.com/brentp/vcfanno)

    decomposed -- if set to true we'll convert allele based output into single values
      to match alleles and make compatible with vcf2db
      (https://github.com/quinlan-lab/vcf2db/issues/14)

    Note that conf_fns and lua_fns are sorted in place by file base name.
    Returns the bgzipped and indexed annotated VCF.
    """
    def _by_basename(fname):
        return os.path.basename(fname) if fname else ""

    conf_fns.sort(key=_by_basename)
    lua_fns.sort(key=_by_basename)
    ext = "-annotated-%s" % utils.splitext_plus(os.path.basename(conf_fns[0]))[0]
    # Reuse the input name when it already carries this annotation suffix
    out_file = vcf if vcf.find(ext) > 0 else "%s%s.vcf.gz" % (utils.splitext_plus(vcf)[0], ext)
    if not utils.file_exists(out_file):
        vcfanno = config_utils.get_program("vcfanno", data)
        with file_transaction(out_file) as tx_out_file:
            conffn = _combine_files(conf_fns, out_file, data, basepath is None)
            luafn = _combine_files(lua_fns, out_file, data, False)
            if luafn and utils.file_exists(luafn):
                luaflag = "-lua {0}".format(luafn)
            else:
                luaflag = ""
            basepathflag = "-base-path {0}".format(basepath) if basepath else ""
            cores = dd.get_num_cores(data)
            post_ann = "sed -e 's/Number=A/Number=1/g' |" if decomposed else ""
            cmd = ("{vcfanno} -p {cores} {luaflag} {basepathflag} {conffn} {vcf} "
                   "| {post_ann} bgzip -c > {tx_out_file}")
            message = "Annotating {vcf} with vcfanno, using {conffn}".format(vcf=vcf, conffn=conffn)
            do.run(cmd.format(vcfanno=vcfanno, cores=cores, luaflag=luaflag,
                              basepathflag=basepathflag, conffn=conffn, vcf=vcf,
                              post_ann=post_ann, tx_out_file=tx_out_file),
                   message)
    return vcfutils.bgzip_and_index(out_file, data["config"])
def _prioritize_vcf(caller, vcf_file, prioritize_by, post_prior_fn, work_dir, data):
    """Provide prioritized tab delimited output for a single caller.

    Runs three steps: select calls in known regions, produce simplified
    annotations, then convert passing calls to a tab delimited file.
    post_prior_fn, when supplied, is applied to the prioritized VCF before
    simplification. Returns the tab delimited output path.
    """
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, "%s-%s-prioritize.tsv" % (sample, caller))
    if utils.file_exists(out_file):
        return out_file

    priority_vcf = "%s.vcf.gz" % utils.splitext_plus(out_file)[0]
    if not utils.file_exists(priority_vcf):
        with file_transaction(data, priority_vcf) as tx_out_file:
            known_cmd = ("bcbio-prioritize known -i {vcf_file} -o {tx_out_file} -k {prioritize_by}")
            do.run(known_cmd.format(vcf_file=vcf_file, tx_out_file=tx_out_file,
                                    prioritize_by=prioritize_by),
                   "Prioritize: select in known regions of interest")
    if post_prior_fn:
        priority_vcf = post_prior_fn(priority_vcf, work_dir, data)

    simple_vcf = "%s-simple.vcf.gz" % utils.splitext_plus(priority_vcf)[0]
    if not utils.file_exists(simple_vcf):
        with file_transaction(data, simple_vcf) as tx_out_file:
            transcript_file = regions.get_sv_bed(data, "transcripts1000", work_dir)
            ann_opt = ""
            if transcript_file:
                transcript_file = vcfutils.bgzip_and_index(transcript_file, data["config"])
                ann_opt = "--gene_bed %s" % transcript_file
            simple_cmd = "simple_sv_annotation.py {ann_opt} -o - {priority_vcf} | bgzip -c > {tx_out_file}"
            do.run(simple_cmd.format(ann_opt=ann_opt, priority_vcf=priority_vcf,
                                     tx_out_file=tx_out_file),
                   "Prioritize: simplified annotation output")
    sorted_vcf = vcfutils.sort_by_ref(simple_vcf, data)
    simple_vcf = vcfutils.bgzip_and_index(sorted_vcf, data["config"])

    with file_transaction(data, out_file) as tx_out_file:
        # Keep PASS or unfiltered calls where the sample genotype is not 0/0
        tab_cmd = ("zcat {simple_vcf} | vawk -v SNAME={sample} -v CALLER={caller} "
                   """'{{if (($7 == "PASS" || $7 == ".") && (S${sample}$GT != "0/0")) """
                   "print CALLER,SNAME,$1,$2,I$END,"
                   """I$SVTYPE=="BND" ? I$SVTYPE":"$3":"I$MATEID : I$SVTYPE,"""
                   "I$KNOWN,I$END_GENE,I$LOF,I$SIMPLE_ANN,"
                   "S${sample}$SR,S${sample}$PE}}' > {tx_out_file}")
        do.run(tab_cmd.format(simple_vcf=simple_vcf, sample=sample, caller=caller,
                              tx_out_file=tx_out_file),
               "Prioritize: convert to tab delimited")
    return out_file
def run(bam_file, data, out_dir):
    """Run viral QC analysis.

    For samples with a paired phenotype, aligns unmapped reads against the
    "gdc-viral" reference and writes per-contig aligned counts. Returns a
    dictionary with the counts file under "base", or an empty dictionary when
    the analysis does not apply or the reference is unavailable.
    """
    viral_target = "gdc-viral"
    out = {}
    if not vcfutils.get_paired_phenotype(data):
        return out
    viral_refs = [x for x in dd.get_viral_files(data) if os.path.basename(x) == "%s.fa" % viral_target]
    if not viral_refs or not utils.file_exists(viral_refs[0]):
        return out

    viral_ref = viral_refs[0]
    bam_name = "%s-%s.bam" % (dd.get_sample_name(data),
                              utils.splitext_plus(os.path.basename(viral_ref))[0])
    viral_bam = os.path.join(utils.safe_makedir(out_dir), bam_name)
    out_file = "%s-counts.txt" % utils.splitext_plus(viral_bam)[0]
    if not utils.file_uptodate(out_file, bam_file):
        if not utils.file_uptodate(viral_bam, bam_file):
            with file_transaction(data, viral_bam) as tx_out_file:
                cores = dd.get_num_cores(data)
                sort_tmp = "%s-tmp" % utils.splitext_plus(tx_out_file)[0]
                # -f 4 selects the unmapped reads for alignment to the viral genome
                cmd = ("samtools view -u -f 4 {bam_file} | "
                       "bamtofastq collate=0 | "
                       "bwa mem -t {cores} {viral_ref} - | "
                       "bamsort tmpfile={tmpfile} inputthreads={cores} outputthreads={cores} "
                       "inputformat=sam index=1 indexfilename={tx_out_file}.bai O={tx_out_file}")
                do.run(cmd.format(bam_file=bam_file, cores=cores, viral_ref=viral_ref,
                                  tmpfile=sort_tmp, tx_out_file=tx_out_file),
                       "Compare unmapped reads to viral genome")
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                out_handle.write("# sample\t%s\n" % dd.get_sample_name(data))
                for info in bam.idxstats(viral_bam, data):
                    # Only report contigs with aligned reads
                    if info.aligned > 0:
                        out_handle.write("%s\t%s\n" % (info.contig, info.aligned))
    out["base"] = out_file
    return out
def _shared_variant_filtration(filter_type, snp_file, ref_file, vrn_files, variantcaller):
    """Share functionality for filtering variants.

    Builds the VariantRecalibrator argument list for the given mode ("SNP",
    "INDEL" or "BOTH"). Returns a tuple of (params, recal_file, tranches_file),
    with both file names derived from snp_file.
    """
    snp_base = utils.splitext_plus(snp_file)[0]
    recal_file = "{base}.recal".format(base=snp_base)
    tranches_file = "{base}.tranches".format(base=snp_base)
    params = ["-T", "VariantRecalibrator",
              "-R", ref_file,
              "--input", snp_file,
              "--mode", filter_type]
    for annotation in ["DP", "FS", "ReadPosRankSum", "MQRankSum"]:
        params += ["-an", annotation]
    if filter_type in ["SNP", "BOTH"]:
        # Haplotype Score no longer calculated for indels as of GATK 2.4
        # and only used for GATK Unified Genotyper calls
        if variantcaller == "gatk":
            params += ["-an", "HaplotypeScore"]
        snp_resources = [("train_hapmap", "known=false,training=true,truth=true,prior=15.0"),
                         ("train_1000g_omni", "known=false,training=true,truth=false,prior=12.0"),
                         ("dbsnp", "known=true,training=false,truth=false,prior=8.0")]
        for name, train_info in snp_resources:
            if name not in vrn_files:
                continue
            resource_flag = "-resource:%s,VCF,%s" % (name.replace("train_", ""), train_info)
            params += [resource_flag, vrn_files[name]]
    if filter_type in ["INDEL", "BOTH"]:
        params += ["-resource:mills,VCF,known=true,training=true,truth=true,prior=12.0",
                   vrn_files["train_indels"]]
    return params, recal_file, tranches_file
def _bedpe_to_vcf(bedpe_file, sconfig_file, items):
    """Convert BEDPE output into a VCF file.

    Returns the bgzipped and indexed VCF, or None when the bedpeToVcf script
    cannot be found.
    """
    tovcf_script = do.find_cmd("bedpeToVcf")
    if not tovcf_script:
        return None

    sample = items[0]
    bedpe_base = utils.splitext_plus(bedpe_file)[0]
    out_file = "%s.vcf.gz" % bedpe_base
    out_nogzip = out_file.replace(".vcf.gz", ".vcf")
    raw_file = "%s-raw.vcf" % bedpe_base
    if not utils.file_exists(out_file):
        if not utils.file_exists(raw_file):
            with file_transaction(raw_file) as tx_raw_file:
                ref_file = tz.get_in(["reference", "fasta", "base"], sample)
                cmd = [sys.executable, tovcf_script,
                       "-c", sconfig_file,
                       "-f", ref_file,
                       "-b", bedpe_file,
                       "-o", tx_raw_file]
                do.run(cmd, "Convert lumpy bedpe output to VCF")
        prep_file = vcfutils.sort_by_ref(raw_file, sample)
        if not utils.file_exists(out_nogzip):
            utils.symlink_plus(prep_file, out_nogzip)
    return vcfutils.bgzip_and_index(out_nogzip, sample["config"])
def _filter_by_normal(tumor_counts, normal_counts, data):
    """Filter count files based on normal frequency and median depth, avoiding high depth regions.

    For frequency, restricts normal positions to those between 0.4 and 0.65

    For depth, matches approach used in AMBER to try and avoid problematic genomic regions
    with high count in the normal:
    https://github.com/hartwigmedical/hmftools/tree/master/amber#usage

    Tumor and normal files are read in lockstep. Returns the filtered
    (tumor, normal) count file paths.
    """
    from bcbio.heterogeneity import bubbletree
    fparams = bubbletree.NORMAL_FILTER_PARAMS
    tumor_out = "%s-normfilter%s" % utils.splitext_plus(tumor_counts)
    normal_out = "%s-normfilter%s" % utils.splitext_plus(normal_counts)
    if utils.file_uptodate(tumor_out, tumor_counts):
        return tumor_out, normal_out

    with file_transaction(data, tumor_out, normal_out) as (tx_tumor_out, tx_normal_out):
        median_depth = _get_normal_median_depth(normal_counts)
        min_normal_depth = median_depth * fparams["min_depth_percent"]
        max_normal_depth = median_depth * fparams["max_depth_percent"]
        with open(tumor_counts) as tumor_handle, open(normal_counts) as normal_handle:
            with open(tx_tumor_out, "w") as tumor_out_handle, open(tx_normal_out, "w") as normal_out_handle:
                header = None
                for tumor_line, normal_line in zip(tumor_handle, normal_handle):
                    if header is None:
                        # Pass through "@" lines and the column header unchanged; the
                        # first normal line not starting with "@" is the column header
                        if not normal_line.startswith("@"):
                            header = normal_line.strip().split()
                        keep = True
                    else:
                        keep = (_normal_passes_depth(header, normal_line, min_normal_depth, max_normal_depth)
                                and _normal_passes_freq(header, normal_line, fparams))
                    if keep:
                        tumor_out_handle.write(tumor_line)
                        normal_out_handle.write(normal_line)
    return tumor_out, normal_out
def subset_by_supported(input_file, get_coords, calls_by_name, work_dir, data,
                        headers=("#",)):
    """Limit CNVkit input to calls with support from another caller.

    get_coords is a function that return chrom, start, end from a line of the
    input_file, allowing handling of multiple input file types.

    Returns input_file unchanged when no supporting caller has variants,
    otherwise the path to the subset file written in work_dir.
    """
    candidates = [(c, tz.get_in([c, "vrn_file"], calls_by_name))
                  for c in ensemble.SUBSET_BY_SUPPORT["cnvkit"]]
    support_files = [(c, f) for (c, f) in candidates if f and vcfutils.vcf_has_variants(f)]
    if not support_files:
        return input_file

    out_name = "%s-havesupport%s" % utils.splitext_plus(os.path.basename(input_file))
    out_file = os.path.join(work_dir, out_name)
    if utils.file_uptodate(out_file, input_file):
        return out_file

    input_bed = _input_to_bed(input_file, work_dir, get_coords, headers)
    pass_coords = set([])
    with file_transaction(data, out_file) as tx_out_file:
        support_beds = " ".join([_sv_vcf_to_bed(f, c, out_file) for c, f in support_files])
        tmp_cmp_bed = "%s-intersectwith.bed" % utils.splitext_plus(tx_out_file)[0]
        cmd = "bedtools intersect -wa -f 0.5 -r -a {input_bed} -b {support_beds} > {tmp_cmp_bed}"
        do.run(cmd.format(input_bed=input_bed, support_beds=support_beds, tmp_cmp_bed=tmp_cmp_bed),
               "Intersect CNVs with support files")
        for r in pybedtools.BedTool(tmp_cmp_bed):
            pass_coords.add((str(r.chrom), str(r.start), str(r.stop)))
        with open(input_file) as in_handle:
            with open(tx_out_file, "w") as out_handle:
                for line in in_handle:
                    # Header lines always pass; others need supported coordinates
                    if line.startswith(headers) or get_coords(line) in pass_coords:
                        out_handle.write(line)
    return out_file
def _mirtop(input_fn, sps, db, out_dir, config):
    """
    Convert to GFF3 standard format

    Runs mirtop in a temporary directory and moves the resulting GTF (or GFF)
    file into out_dir. Returns the path to that file, or None when the
    hairpin/miRBase database files are missing or no output was produced.
    """
    hairpin = os.path.join(db, "hairpin.fa")
    gtf = os.path.join(db, "mirbase.gff3")
    if not file_exists(hairpin) or not file_exists(gtf):
        logger.warning("%s or %s are not installed. Skipping." % (hairpin, gtf))
        return None
    in_base = utils.splitext_plus(os.path.basename(input_fn))[0]
    out_gtf_fn = "%s.gtf" % in_base
    out_gff_fn = "%s.gff" % in_base
    export = _get_env()
    cmd = ("{export} mirtop gff --sps {sps} --hairpin {hairpin} "
           "--gtf {gtf} --format seqbuster -o {out_tx} {input_fn}")
    if not file_exists(os.path.join(out_dir, out_gtf_fn)) and \
       not file_exists(os.path.join(out_dir, out_gff_fn)):
        with tx_tmpdir() as out_tx:
            do.run(cmd.format(export=export, sps=sps, hairpin=hairpin, gtf=gtf,
                              out_tx=out_tx, input_fn=input_fn),
                   "Do miRNA annotation for %s" % input_fn)
            with utils.chdir(out_tx):
                out_fn = out_gtf_fn if utils.file_exists(out_gtf_fn) else out_gff_fn
                if utils.file_exists(out_fn):
                    shutil.move(os.path.join(out_tx, out_fn),
                                os.path.join(out_dir, out_fn))
    # Both candidates are plain file names joined to out_dir exactly once. The
    # previous GFF fallback joined out_dir twice, which only resolved correctly
    # when out_dir was an absolute path.
    out_fn = out_gtf_fn if utils.file_exists(os.path.join(out_dir, out_gtf_fn)) else out_gff_fn
    final_file = os.path.join(out_dir, out_fn)
    if utils.file_exists(final_file):
        return final_file
    return None
def _cram_to_fastq_regions(regions, cram_file, dirs, data):
    """Convert CRAM files to fastq, potentially within sub regions.

    Returns multiple fastq files that can be merged back together.

    Each returned entry is a tuple: (pair1, pair2) for paired output or
    (single,) for single end output. Once one region is found to be paired,
    all following regions are reported as paired.
    """
    base_name = utils.splitext_plus(os.path.basename(cram_file))[0]
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep",
                                               "%s-parts" % base_name))
    ref_file = tz.get_in(["reference", "fasta", "base"], data)
    resources = config_utils.get_resources("bamtofastq", data["config"])
    cores = tz.get_in(["config", "algorithm", "num_cores"], data, 1)
    max_mem = int(resources.get("memory", "1073741824")) * cores  # 1Gb/core default
    fnames = []
    is_paired = False
    for region in regions:
        # NOTE: without a region the suffix is "full" with no leading dash; kept
        # as-is so previously generated file names continue to be recognized.
        rext = "-%s" % region.replace(":", "_").replace("-", "_") if region else "full"
        out_s, out_p1, out_p2 = [os.path.join(work_dir, "%s%s-%s.fq.gz" % (base_name, rext, fext))
                                 for fext in ["s1", "p1", "p2"]]
        if not utils.file_exists(out_p1):
            with file_transaction(out_s, out_p1, out_p2) as (tx_out_s, tx_out_p1, tx_out_p2):
                sortprefix = "%s-sort" % utils.splitext_plus(tx_out_s)[0]
                cmd = ("bamtofastq filename={cram_file} inputformat=cram T={sortprefix} "
                       "gz=1 collate=1 colsbs={max_mem} "
                       "F={tx_out_p1} F2={tx_out_p2} S={tx_out_s} O=/dev/null O2=/dev/null "
                       "reference={ref_file}")
                if region:
                    cmd += " ranges='{region}'"
                # Parenthesize the conditional: "%" binds tighter than if/else, so the
                # unparenthesized form produced an empty description without a region.
                description = "CRAM to fastq %s" % (region if region else "")
                do.run(cmd.format(cram_file=cram_file, sortprefix=sortprefix, max_mem=max_mem,
                                  tx_out_p1=tx_out_p1, tx_out_p2=tx_out_p2, tx_out_s=tx_out_s,
                                  ref_file=ref_file, region=region),
                       description)
        if is_paired or not _is_gzip_empty(out_p1):
            fnames.append((out_p1, out_p2))
            is_paired = True
        else:
            fnames.append((out_s,))
    return fnames
def _prioritize_vcf(caller, vcf_file, prioritize_by, work_dir, data):
    """Provide prioritized tab delimited output for a single caller.

    Selects calls in known regions, produces simplified annotations and then
    converts PASS or unfiltered calls to a tab delimited file. Returns the tab
    delimited output path.
    """
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, "%s-%s-prioritize.tsv" % (sample, caller))
    if utils.file_exists(out_file):
        return out_file

    priority_vcf = "%s.vcf.gz" % utils.splitext_plus(out_file)[0]
    if not utils.file_exists(priority_vcf):
        with file_transaction(data, priority_vcf) as tx_out_file:
            known_cmd = "bcbio-prioritize known -i {vcf_file} -o {tx_out_file} -k {prioritize_by}"
            do.run(known_cmd.format(vcf_file=vcf_file, tx_out_file=tx_out_file,
                                    prioritize_by=prioritize_by),
                   "Prioritize: select in known regions of interest")

    simple_vcf = "%s-simple.vcf.gz" % utils.splitext_plus(priority_vcf)[0]
    if not utils.file_exists(simple_vcf):
        with file_transaction(data, simple_vcf) as tx_out_file:
            simple_cmd = "simple_sv_annotation.py -o - {priority_vcf} | bgzip -c > {tx_out_file}"
            do.run(simple_cmd.format(priority_vcf=priority_vcf, tx_out_file=tx_out_file),
                   "Prioritize: simplified annotation output")
    simple_vcf = vcfutils.bgzip_and_index(simple_vcf, data["config"])

    with file_transaction(data, out_file) as tx_out_file:
        tab_cmd = ("zcat {simple_vcf} | vawk -v SNAME={sample} -v CALLER={caller} "
                   """'{{if (($7 == "PASS" || $7 == ".")) """
                   "print CALLER,SNAME,$1,$2,I$END,I$SVTYPE,I$KNOWN,I$LOF,I$SIMPLE_ANN,"
                   "S${sample}$SR,S${sample}$PE}}' > {tx_out_file}")
        do.run(tab_cmd.format(simple_vcf=simple_vcf, sample=sample, caller=caller,
                              tx_out_file=tx_out_file),
               "Prioritize: convert to tab delimited")
    return out_file
def normalize(in_file, data, passonly=False, normalize_indels=True, split_biallelic=True,
              rerun_effects=True, remove_oldeffects=False, nonrefonly=False, work_dir=None):
    """Normalizes variants and reruns SnpEFF for resulting VCF.

    The output is named after `in_file` with a ``-nomultiallelic`` (or
    ``-noeff-nomultiallelic`` when `remove_oldeffects` is set) suffix and is
    placed in `work_dir` when given. Inputs without variants are symlinked
    unchanged. Returns the bgzipped and indexed output path.
    """
    suffix = "-noeff-nomultiallelic" if remove_oldeffects else "-nomultiallelic"
    base, ext = utils.splitext_plus(in_file)
    out_file = "%s%s%s" % (base, suffix, ext)
    if work_dir:
        out_file = os.path.join(work_dir, os.path.basename(out_file))
    if not utils.file_exists(out_file):
        if not vcfutils.vcf_has_variants(in_file):
            utils.symlink_plus(in_file, out_file)
        else:
            final_file = _normalize(in_file, data, passonly=passonly,
                                    normalize_indels=normalize_indels,
                                    split_biallelic=split_biallelic,
                                    remove_oldeffects=remove_oldeffects,
                                    nonrefonly=nonrefonly,
                                    work_dir=work_dir)
            if rerun_effects:
                annotated_file, _ = effects.add_to_vcf(final_file, data)
                # Keep the normalized file if effects annotation produced nothing
                if annotated_file:
                    final_file = annotated_file
            utils.symlink_plus(final_file, out_file)
    return vcfutils.bgzip_and_index(out_file, data["config"])
def _run_svtyper(in_file, full_bam, exclude_file, data):
    """Genotype structural variant calls with SVtyper.

    Removes calls in high depth regions to avoid slow runtimes:

    https://github.com/hall-lab/svtyper/issues/16

    Inputs without variants are copied through unchanged. Returns the
    result of sorting the genotyped VCF by reference order.
    """
    out_file = "%s-wgts.vcf.gz" % utils.splitext_plus(in_file)[0]
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            if not vcfutils.vcf_has_variants(in_file):
                # Copy into the transactional file, not the final path, so an
                # interrupted copy cannot leave a partial final output behind.
                shutil.copy(in_file, tx_out_file)
            else:
                python = sys.executable
                svtyper = os.path.join(os.path.dirname(sys.executable), "svtyper")
                if exclude_file and utils.file_exists(exclude_file):
                    regions_to_rm = "-T ^%s" % (exclude_file)
                else:
                    regions_to_rm = ""
                # add FILTER headers, which are lost during svtyping
                header_file = "%s-header.txt" % utils.splitext_plus(tx_out_file)[0]
                with open(header_file, "w") as out_handle:
                    with utils.open_gzipsafe(in_file) as in_handle:
                        for line in in_handle:
                            # Header lines come first; stop at the first record
                            if not line.startswith("#"):
                                break
                            if line.startswith("##FILTER"):
                                out_handle.write(line)
                    for region in ref.file_contigs(dd.get_ref_file(data), data["config"]):
                        out_handle.write("##contig=<ID=%s,length=%s>\n" % (region.name, region.size))
                cmd = ("bcftools view {in_file} {regions_to_rm} | "
                       "{python} {svtyper} --max_reads 1000 -B {full_bam} | "
                       "bcftools annotate -h {header_file} | "
                       "bgzip -c > {tx_out_file}")
                do.run(cmd.format(in_file=in_file, regions_to_rm=regions_to_rm, python=python,
                                  svtyper=svtyper, full_bam=full_bam, header_file=header_file,
                                  tx_out_file=tx_out_file),
                       "SV genotyping with svtyper")
    return vcfutils.sort_by_ref(out_file, data)
def combine_pairs(input_files):
    """Group input files into read pairs or singles.

    Two files are paired when their cleaned base names (directory and
    extension removed) have the same length and differ at exactly one
    position, where both carry a pair identifier that is preceded by
    ``R`` or a separator and, unless it is the final character, followed
    by a separator.

    Returns a list of lists: ``[first, second]`` for pairs, ordered by the
    identifier, and ``[single]`` for unmatched files. Logs an error and
    exits when two different files share an identical stem.

    From bipy.utils (https://github.com/roryk/bipy/blob/master/bipy/utils.py)
    Adjusted to allow different input paths or extensions for matching files.
    """
    PAIR_FILE_IDENTIFIERS = set(["1", "2", "3"])
    separators = ("_", "-", ".")

    def _stem(path):
        # Base name without directory or extension, cleaned by rstrip_extra
        return rstrip_extra(utils.splitext_plus(os.path.basename(path))[0])

    pairs = []
    used = set([])
    for in_file in input_files:
        if in_file in used:
            continue
        for comp_file in input_files:
            if comp_file in used or comp_file == in_file:
                continue
            a = _stem(in_file)
            b = _stem(comp_file)
            if len(a) != len(b):
                continue
            diffs = dif(a, b)
            # no differences, then its the same file stem
            if len(diffs) == 0:
                logger.error("%s and %s have the same stem, so we don't know "
                             "how to assign it to the sample data in the CSV. To "
                             "get around this you can rename one of the files. "
                             "If they are meant to be the same sample run in two "
                             "lanes, combine them first with the "
                             "bcbio_prepare_samples.py script."
                             "(http://bcbio-nextgen.readthedocs.io/en/latest/contents/configuration.html#multiple-files-per-sample)"
                             % (in_file, comp_file))
                sys.exit(1)
            # only stems with a single difference are pair candidates
            if len(diffs) > 1:
                continue
            pos = diffs[0]
            if not (a[pos] in PAIR_FILE_IDENTIFIERS and b[pos] in PAIR_FILE_IDENTIFIERS):
                continue
            # if the 1/2 isn't the last digit before a separator, skip
            # this skips stuff like 2P 2A, often denoting replicates, not
            # read pairings
            if len(b) > (pos + 1) and b[pos + 1] not in separators:
                continue
            # if the 1/2 is not a separator or prefaced with R, skip
            if b[pos - 1] not in ("R",) + separators:
                continue
            used.add(in_file)
            used.add(comp_file)
            if b[pos] > a[pos]:
                pairs.append([in_file, comp_file])
            else:
                pairs.append([comp_file, in_file])
            break
        if in_file not in used:
            pairs.append([in_file])
            used.add(in_file)
    return pairs
def _collapse(in_file):
    """Run ``seqcluster collapse`` on a fastq file.

    The command writes into a temporary directory; the resulting
    ``<basename>_trimmed.fastq`` is then moved next to `in_file`.
    Returns the output path, reusing it when it already exists.
    """
    out_file = splitext_plus(in_file)[0] + "_trimmed.fastq"
    if utils.file_exists(out_file):
        return out_file
    stem = splitext_plus(op.basename(in_file))[0]
    with tx_tmpdir() as out_dir:
        tmp_result = op.join(out_dir, stem + "_trimmed.fastq")
        collapse_cmd = "seqcluster collapse -f {in_file} -o {out_dir}".format(
            in_file=in_file, out_dir=out_dir)
        do.run(collapse_cmd, "collapse")
        shutil.move(tmp_result, out_file)
    return out_file
def _gene_closest(orig_bed, gene_gtf):
    """Calculate the closest transcript to events in the input BED file.

    Sorts the GTF (read through ``zcat``) into ``<gtf base>-sort.gtf`` if
    that file is not already present, then runs ``bedtools closest`` against
    it. Returns the annotated output path, ``<bed base>-ann<ext>``.
    """
    sorted_gtf = "%s-sort.gtf" % utils.splitext_plus(gene_gtf)[0]
    if not utils.file_exists(sorted_gtf):
        cmd = ("zcat {gene_gtf} | grep -v ^# "
               "| sort -k1,1 -k4,4n -k 5,5n > {sorted_gtf}")
        do.run(cmd.format(gene_gtf=gene_gtf, sorted_gtf=sorted_gtf), "Sort input GTF file")
    out_file = "%s-ann%s" % utils.splitext_plus(orig_bed)
    cmd = "bedtools closest -d -t first -a {orig_bed} -b {sorted_gtf} > {out_file}"
    do.run(cmd.format(orig_bed=orig_bed, sorted_gtf=sorted_gtf, out_file=out_file),
           "Identify closest gene")
    # Hand the result back so callers do not need to re-derive the file name
    return out_file
def _combine_excessive_coverage(samples, ref_regions, min_n_size, tmp_outfile):
    """Provide a global set of regions with excessive coverage to avoid.

    Collects the features named ``EXCESSIVE_COVERAGE`` from each sample's
    callable regions, combines them, and when any are found merges those
    within `min_n_size` of each other, keeping only merged regions longer
    than `min_n_size`. Intermediate files are saved next to `tmp_outfile`.
    """
    flag = "EXCESSIVE_COVERAGE"
    base, ext = utils.splitext_plus(tmp_outfile)

    def _is_excessive(feat):
        return feat.name == flag

    def _is_long_enough(feat):
        return feat.stop - feat.start > min_n_size

    # Lazily evaluated: one filtered BedTool per sample with region information
    per_sample = (pybedtools.BedTool(sample["regions"]["callable"]).filter(_is_excessive)
                  for sample in samples if "regions" in sample)
    combined = _combine_regions(per_sample, ref_regions).saveas("%s-ecmergeorig%s" % (base, ext))
    if len(combined) > 0:
        merged = combined.merge(d=min_n_size).filter(_is_long_enough)
        return merged.saveas("%s-ecmerge%s" % (base, ext))
    return combined
def _flatten_samples(samples, base_file):
    """Create a flattened JSON representation of data from the bcbio world map.

    Each sample is flattened into key/value pairs for a fixed set of key
    paths. The output maps every flattened key to a list with one entry per
    sample (None where a sample lacks the key) and is written as JSON to
    ``<base_file base>-samples.json``.

    Returns a tuple of the JSON file path and the result of converting the
    flattened dictionary with `_samplejson_to_inputs`.
    """
    out_file = "%s-samples.json" % utils.splitext_plus(base_file)[0]
    key_paths = [["analysis"], ["description"], ["rgnames"], ["config", "algorithm"],
                 ["metadata"], ["genome_build"], ["files"], ["reference"],
                 ["genome_resources"], ["vrn_file"]]
    flat_data = []
    for data in samples:
        cur_flat = {}
        for key_path in key_paths:
            cur_key = "__".join(key_path)
            for flat_key, flat_val in _to_cwldata(cur_key, tz.get_in(key_path, data)):
                cur_flat[flat_key] = flat_val
        flat_data.append(cur_flat)
    # Set union handles an empty sample list and does not rely on
    # concatenating the results of dict.keys().
    all_keys = sorted(set().union(*flat_data))
    out = {}
    for key in all_keys:
        out[key] = [cur_flat.get(key) for cur_flat in flat_data]
    with open(out_file, "w") as out_handle:
        json.dump(out, out_handle, sort_keys=True, indent=4, separators=(',', ': '))
    return out_file, _samplejson_to_inputs(out)
def find_annotations(data):
    """Find annotation configuration files for vcfanno, using pre-installed inputs.

    Creates absolute paths for user specified inputs and finds locally
    installed defaults.

    Default annotations:
      - gemini for variant pipelines
      - somatic for variant tumor pipelines
      - rnaedit for RNA-seq variant calling

    Returns a list of configuration file paths, each followed by its
    matching ``.lua`` file when one exists. Configurations that cannot be
    found are skipped with a warning.
    """
    requested = dd.get_vcfanno(data)
    if not isinstance(requested, (list, tuple)):
        requested = [requested]
    # Work on a fresh list: avoids modifying the configured value in place,
    # supports tuples, and drops empty entries (e.g. nothing configured).
    conf_files = [c for c in requested if c]
    for c in _default_conf_files(data):
        if c not in conf_files:
            conf_files.append(c)
    out = []
    # Installed configurations live in ../config/vcfanno relative to the reference directory
    annodir = os.path.normpath(
        os.path.abspath(
            os.path.join(os.path.dirname(dd.get_ref_file(data)), os.pardir, "config", "vcfanno")))
    for conf_file in conf_files:
        if utils.file_exists(conf_file) and os.path.isfile(conf_file):
            conffn = conf_file
        else:
            conffn = os.path.join(annodir, conf_file + ".conf")
        if not utils.file_exists(conffn):
            build = dd.get_genome_build(data)
            CONF_NOT_FOUND = (
                "The vcfanno configuration {conffn} was not found for {build}, skipping."
            )
            logger.warn(CONF_NOT_FOUND.format(conffn=conffn, build=build))
        else:
            out.append(conffn)
            luafn = "%s.lua" % utils.splitext_plus(conffn)[0]
            if os.path.exists(luafn):
                out.append(luafn)
    return out
def _regions_for_coverage(data, region, ref_file, out_file):
    """Retrieve BED file of regions we need to calculate coverage in.

    Returns a tuple of (BED file path, flag). The flag is True whenever a
    usable region could be prepared; in the fallback case, where a
    ``NO_COVERAGE`` placeholder file is written, it is True only when no
    variant regions are configured.
    """
    variant_regions = bedutils.merge_overlaps(
        utils.get_in(data, ("config", "algorithm", "variant_regions")), data)
    ready_region = shared.subset_variant_regions(variant_regions, region, out_file)
    custom_file = "%s-coverageregions.bed" % utils.splitext_plus(out_file)[0]
    if not ready_region:
        get_ref_bedtool(ref_file, data["config"]).saveas(custom_file)
        return custom_file, True
    # Check for coordinates before touching the filesystem: os.path.isfile
    # raises TypeError when handed a list or tuple.
    if isinstance(ready_region, (list, tuple)):
        c, s, e = ready_region
        pybedtools.BedTool("%s\t%s\t%s\n" % (c, s, e), from_string=True).saveas(custom_file)
        return custom_file, True
    if os.path.isfile(ready_region):
        return ready_region, True
    with file_transaction(data, custom_file) as tx_out_file:
        with open(tx_out_file, "w") as out_handle:
            for feat in get_ref_bedtool(ref_file, data["config"], region):
                out_handle.write("%s\t%s\t%s\t%s\n" % (feat.chrom, feat.start, feat.end, "NO_COVERAGE"))
    return custom_file, variant_regions is None
def pon_to_bed(pon_file, out_dir, data):
    """Extract BED intervals from a GATK4 hdf5 panel of normal file.

    Writes ``<pon base>-intervals.bed`` into `out_dir`, one line per
    interval with the start coordinate shifted down by one, and returns
    its path. The file is regenerated only when out of date relative to
    `pon_file`.
    """
    out_file = os.path.join(
        out_dir, "%s-intervals.bed" % (utils.splitext_plus(os.path.basename(pon_file))[0]))
    if not utils.file_uptodate(out_file, pon_file):
        import h5py
        with file_transaction(data, out_file) as tx_out_file:
            with h5py.File(pon_file, "r") as f:
                with open(tx_out_file, "w") as out_handle:
                    intervals = f["original_data"]["intervals"]
                    contig_names = intervals["indexed_contig_names"]
                    index_start_end = intervals["transposed_index_start_end"]
                    # Read each row once instead of re-reading it for every interval.
                    # Rows are, in order: contig index, start, end.
                    contig_idxs = index_start_end[0]
                    starts = index_start_end[1]
                    ends = index_start_end[2]
                    for contig_idx, start, end in zip(contig_idxs, starts, ends):
                        chrom = contig_names[contig_idx]
                        out_handle.write("%s\t%s\t%s\n" % (chrom, int(start) - 1, int(end)))
    return out_file
def create_gemini_db(gemini_vcf, data, gemini_db=None, ped_file=None):
    """Load a VCF into a GEMINI database, optionally amending with a PED file.

    `gemini_db` defaults to the VCF base name with a ``.db`` extension.
    Returns the database path, or None when the database does not exist
    yet and the input VCF has no variants.
    """
    if not gemini_db:
        gemini_db = "%s.db" % utils.splitext_plus(gemini_vcf)[0]
    if utils.file_exists(gemini_db):
        return gemini_db
    if not vcfutils.vcf_has_variants(gemini_vcf):
        return None
    with file_transaction(data, gemini_db) as tx_gemini_db:
        gemini = config_utils.get_program("gemini", data["config"])
        opts = []
        if "gemini_allvariants" not in dd.get_tools_on(data):
            opts.append("--passonly")
        # For small test files, skip gene table loading which takes a long time
        if _is_small_vcf(gemini_vcf):
            opts.append("--skip-gene-tables")
        if "/test_automated_output/" in gemini_vcf:
            opts.append("--test-mode")
        # Skip CADD if its data file is not installed
        gemini_dir = install.get_gemini_dir(data)
        if not os.path.exists(os.path.join(gemini_dir, "whole_genome_SNVs.tsv.compressed.gz")):
            opts.append("--skip-cadd")
        # skip gerp-bp which slows down loading
        opts.append("--skip-gerp-bp")
        # Each option carries a leading space, with a trailing space at the end
        load_opts = "".join(" %s" % opt for opt in opts) + " "
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        tmpdir = os.path.dirname(tx_gemini_db)
        eanns = _get_effects_flag(data)
        # Apply custom resource specifications, allowing use of alternative annotation_dir
        resources = config_utils.get_resources("gemini", data["config"])
        if resources.get("options"):
            gemini_opts = " ".join([str(x) for x in resources["options"]])
        else:
            gemini_opts = ""
        exports = utils.local_path_export()
        load_cmd = ("{exports} {gemini} {gemini_opts} load {load_opts} "
                    "-v {gemini_vcf} {eanns} --cores {num_cores} "
                    "--tempdir {tmpdir} {tx_gemini_db}")
        load_cmd = load_cmd.format(exports=exports, gemini=gemini, gemini_opts=gemini_opts,
                                   load_opts=load_opts, gemini_vcf=gemini_vcf, eanns=eanns,
                                   num_cores=num_cores, tmpdir=tmpdir,
                                   tx_gemini_db=tx_gemini_db)
        do.run(load_cmd, "Create gemini database for %s" % gemini_vcf, data)
        if ped_file:
            amend_cmd = [gemini, "amend", "--sample", ped_file, tx_gemini_db]
            do.run(amend_cmd, "Add PED file to gemini database", data)
    return gemini_db
def prep_seq2c_bed(data):
    """Selecting the bed file, cleaning, and properly annotating for Seq2C.

    Uses the seq2c background CNV reference when configured, otherwise the
    SV regions, falling back to the variant regions. Inputs with fewer than
    four columns are annotated with gene names. Returns the cleaned BED
    path, or None when no BED file is available.

    Raises ValueError when a 3-column BED cannot be annotated with genes.
    """
    background = dd.get_background_cnv_reference(data, "seq2c")
    if background:
        bed_file = _background_to_bed(dd.get_background_cnv_reference(data, "seq2c"), data)
    else:
        bed_file = regions.get_sv_bed(data)

    if bed_file:
        bed_file = bedutils.clean_file(bed_file, data, prefix="svregions-")
    else:
        bed_file = bedutils.clean_file(dd.get_variant_regions(data), data)
    if not bed_file:
        return None

    col_num = bt.BedTool(bed_file).field_count()
    annotated_file = bed_file
    if col_num < 4:
        annotated_file = annotate.add_genes(bed_file, data, max_distance=0)
        # add_genes handing back its input means no annotation was possible
        if annotated_file == bed_file:
            raise ValueError(
                "BED file for Seq2C must be annotated with gene names, "
                "however the input BED is 3-columns and we have no transcript "
                "data to annotate with " + bed_file)
        annotated_file = annotate.gene_one_per_line(annotated_file, data)

    ready_file = "%s-seq2cclean.bed" % (utils.splitext_plus(annotated_file)[0])
    if utils.file_uptodate(ready_file, annotated_file):
        return ready_file

    bed = bt.BedTool(annotated_file)
    if col_num > 4 and col_num != 8:
        bed = bed.cut(range(4))
    # Drop features without a usable name
    bed = bed.filter(lambda feat: feat.name not in ["", ".", "-"])
    with file_transaction(data, ready_file) as tx_out_file:
        bed.saveas(tx_out_file)
    logger.debug("Saved Seq2C clean annotated ready input BED into " + ready_file)
    return ready_file
def get_refs(genome_build, aligner, galaxy_base):
    """Retrieve the reference genome file location from galaxy configuration.

    Returns a dictionary keyed by resource name (the aligner, plus
    ``fasta`` for the samtools entry). Each value holds the matching
    ``indexes`` and, when the reference resolves to a regular file, its
    ``base`` path. An empty dictionary is returned without a genome build.
    """
    out = {}
    if not genome_build:
        return out
    name_remap = {"samtools": "fasta"}
    galaxy_config = _get_galaxy_tool_info(galaxy_base)
    tool_data_path = galaxy_config["tool_data_path"]
    wanted = [x for x in (aligner, "samtools") if x]
    for name in wanted:
        galaxy_dt = _get_galaxy_data_table(name, galaxy_config["tool_data_table_config_path"])
        loc_file, need_remap = _get_galaxy_loc_file(name, galaxy_dt, tool_data_path,
                                                    galaxy_base)
        cur_ref = _get_ref_from_galaxy_loc(name, genome_build, loc_file, galaxy_dt,
                                           need_remap, galaxy_config)
        base = os.path.normpath(utils.add_full_path(cur_ref, tool_data_path))
        # Directories hold the indexes directly; otherwise match files sharing the base name
        if os.path.isdir(base):
            pattern = os.path.join(base, "*")
        else:
            pattern = "%s*" % utils.splitext_plus(base)[0]
        entry = {"indexes": glob.glob(pattern)}
        out_key = name_remap.get(name, name)
        out[out_key] = entry
        if os.path.exists(base) and os.path.isfile(base):
            entry["base"] = base
    return out
def prepare_exclude_file(items, base_file, chrom=None):
    """Prepare a BED file for exclusion, incorporating variant regions and chromosome.

    Excludes locally repetitive regions (if `remove_lcr` is set) and
    centromere regions, both of which contribute to long run times and
    false positive structural variant calls.

    The exclusion file is the full reference minus the regions we want to
    keep. Returns its path; nothing is regenerated when either the BED file
    or a ``.gz`` version of it already exists.
    """
    chrom_ext = "-%s" % chrom if chrom else ""
    out_file = "%s-exclude%s.bed" % (utils.splitext_plus(base_file)[0], chrom_ext)
    if utils.file_exists(out_file) or utils.file_exists(out_file + ".gz"):
        return out_file

    first = items[0]
    ref_fasta_keys = ["reference", "fasta", "base"]
    with shared.bedtools_tmpdir(first):
        # Get a bedtool for the full region if no variant regions
        want_bedtool = callable.get_ref_bedtool(tz.get_in(ref_fasta_keys, first),
                                                first["config"], chrom)
        if chrom:
            chrom_bed = shared.subset_bed_by_chrom(want_bedtool.saveas().fn, chrom, first)
            want_bedtool = pybedtools.BedTool(chrom_bed)
        lcr_bed = shared.get_lcr_bed(items)
        if lcr_bed:
            want_bedtool = want_bedtool.subtract(pybedtools.BedTool(lcr_bed))
        sv_exclude_bed = _get_sv_exclude_file(items)
        if sv_exclude_bed and len(want_bedtool) > 0:
            want_bedtool = want_bedtool.subtract(sv_exclude_bed).saveas()
        no_highdepth = shared.remove_highdepth_regions(want_bedtool.saveas().fn, items)
        want_bedtool = pybedtools.BedTool(no_highdepth)
        with file_transaction(first, out_file) as tx_out_file:
            full_bedtool = callable.get_ref_bedtool(tz.get_in(ref_fasta_keys, first),
                                                    first["config"])
            # With nothing left to keep, the whole reference is excluded
            if len(want_bedtool) > 0:
                full_bedtool.subtract(want_bedtool).saveas(tx_out_file)
            else:
                full_bedtool.saveas(tx_out_file)
    return out_file
def _enforce_max_region_size(in_file, data):
    """Ensure we don't have any chunks in the region greater than 1Mb.

    Larger sections have high memory usage on VarDictJava and failures on
    VarDict. This creates minimum windows from the input BED file to avoid
    these issues. Downstream VarDict merging sorts out any variants across
    windows.

    Returns the ``-regionlimit`` output path: windowed regions when any
    input region exceeds the maximum size, otherwise a link to `in_file`.
    """
    max_size = 1e6
    overlap_size = 250
    out_file = "%s-regionlimit%s" % utils.splitext_plus(in_file)
    if utils.file_exists(out_file):
        return out_file

    needs_windows = any(feat.stop - feat.start > max_size
                        for feat in pybedtools.BedTool(in_file))
    if not needs_windows:
        utils.symlink_plus(in_file, out_file)
        return out_file

    with file_transaction(data, out_file) as tx_out_file:
        # Step is smaller than the window so neighbouring windows overlap
        windows = pybedtools.BedTool().window_maker(w=max_size,
                                                    s=max_size - overlap_size,
                                                    b=pybedtools.BedTool(in_file))
        windows.saveas(tx_out_file)
    return out_file
def unified_genotyper(align_bams, items, ref_file, assoc_files,
                      region=None, out_file=None):
    """Perform SNP genotyping on the given alignment file.

    `out_file` defaults to ``<first BAM base>-variants.vcf.gz``. Ploidy is
    looked up from the samples only for the restricted GATK type and is
    otherwise fixed at 2. Returns the bgzipped and indexed VCF path.
    """
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % utils.splitext_plus(align_bams[0])[0]
    config = items[0]["config"]
    if not utils.file_exists(out_file):
        broad_runner, params = _shared_gatk_call_prep(
            align_bams, items, ref_file, assoc_files.get("dbsnp"), region, out_file)
        with file_transaction(items[0], out_file) as tx_out_file:
            if broad_runner.gatk_type() == "restricted":
                ploidy_arg = str(ploidy.get_ploidy(items, region))
            else:
                ploidy_arg = "2"
            params.extend(["-T", "UnifiedGenotyper",
                           "-o", tx_out_file,
                           "-ploidy", ploidy_arg,
                           "--genotype_likelihoods_model", "BOTH"])
            # User supplied GATK options from the resources configuration
            resources = config_utils.get_resources("gatk", config)
            if "options" in resources:
                params.extend(str(opt) for opt in resources.get("options", []))
            broad_runner.run_gatk(params)
    return vcfutils.bgzip_and_index(out_file, config)
def _prep_items_from_base(base, in_files, force_single=False):
    """Prepare a set of configuration items for input files.

    Directories and wildcards are expanded, then consecutive files of the
    same known type are handled together: BAM files individually and
    fastq/fasta files after grouping into pairs. Other types are reported
    and ignored. Returns the list of prepared items.
    """
    def _known_type(fname):
        return KNOWN_EXTS.get(utils.splitext_plus(fname)[-1].lower())

    expanded = _expand_wildcards(_expand_dirs(in_files, KNOWN_EXTS))
    details = []
    # groupby only groups neighbouring files, so the group index follows input order
    for i, (ext, files) in enumerate(itertools.groupby(expanded, _known_type)):
        if ext == "bam":
            details.extend(_prep_bam_input(f, i, base) for f in files)
        elif ext in ["fastq", "fq", "fasta"]:
            grouped = fastq.combine_pairs(list(files), force_single)
            details.extend(_prep_fastq_input(fs, base) for fs in grouped)
        else:
            print("Ignoring unexpected input file types %s: %s" % (ext, list(files)))
    return details
def _prepare_inputs(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Prepare input VCF and BED files for validation.

    Ensures the reference material VCF is bgzipped and indexed, extracts
    the current sample from multi-sample call files (or re-prepares single
    sample files) and computes the merged evaluation intervals.

    Returns a tuple of (call VCF, reference VCF, interval BED).
    """
    rm_is_ready = rm_file.endswith(".vcf.gz") and os.path.exists(rm_file + ".tbi")
    if not rm_is_ready:
        rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)
    if len(vcfutils.get_samples(vrn_file)) > 1:
        base = utils.splitext_plus(os.path.basename(vrn_file))[0]
        sample_file = os.path.join(base_dir, "%s-%s.vcf.gz" % (base, dd.get_sample_name(data)))
        vrn_file = vcfutils.select_sample(vrn_file, dd.get_sample_name(data),
                                          sample_file, data["config"])
    else:
        # rtg fails on bgzipped VCFs produced by GatherVcfs so we re-prep them
        vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)
    interval_bed = _get_merged_intervals(rm_interval_file, vrn_file, base_dir, data)
    return vrn_file, rm_file, interval_bed
def _extract_germline(in_file, data):
    """Extract germline calls non-somatic, non-filtered calls.

    Writes an uncompressed ``<base>-germline.vcf`` with a ``Somatic``
    FILTER added to the header and every record passed through
    `_update_germline_filters`. Nothing is written when either that file or
    a ``.gz`` version of it is up to date. Returns the uncompressed path.
    """
    out_file = "%s-germline.vcf" % utils.splitext_plus(in_file)[0]
    if not utils.file_uptodate(out_file, in_file) and not utils.file_uptodate(out_file + ".gz", in_file):
        with file_transaction(data, out_file) as tx_out_file:
            reader = cyvcf2.VCF(str(in_file))
            try:
                reader.add_filter_to_header({'ID': 'Somatic',
                                             'Description': 'Variant called as Somatic'})
                with open(tx_out_file, "w") as out_handle:
                    out_handle.write(reader.raw_header)
                    for rec in reader:
                        rec = _update_germline_filters(rec)
                        out_handle.write(str(rec))
            finally:
                # Release the underlying file handle even if writing fails
                reader.close()
    return out_file
def unpack_tarballs(xs, data, use_subdir=True):
    """Unpack workflow tarballs into ready to use directories.

    Walks dictionaries (updated in place), lists and tuples (returned as a
    new list) recursively. A string naming an existing ``-wf.tar.gz`` file
    is extracted into the work directory (under ``wf-inputs`` when
    `use_subdir` is set) and replaced by the extracted directory, by a
    directory target inside it, or by the list of files sharing a basename
    target. Any other value is returned unchanged.
    """
    if isinstance(xs, dict):
        for key, val in xs.items():
            xs[key] = unpack_tarballs(val, data, use_subdir)
        return xs
    if isinstance(xs, (list, tuple)):
        return [unpack_tarballs(x, data, use_subdir) for x in xs]
    if not isinstance(xs, basestring):
        return xs
    is_wf_tarball = os.path.isfile(xs.encode("utf-8", "ignore")) and xs.endswith("-wf.tar.gz")
    if not is_wf_tarball:
        return xs

    if use_subdir:
        tarball_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "wf-inputs"))
    else:
        tarball_dir = dd.get_work_dir(data)
    # Tarball names encode nested directories using "--"
    rel_dir = os.path.basename(xs).replace("-wf.tar.gz", "").replace("--", os.path.sep)
    out_dir = os.path.join(tarball_dir, rel_dir)
    if not os.path.exists(out_dir):
        with utils.chdir(tarball_dir):
            with tarfile.open(xs, "r:gz") as tar:
                tar.extractall()
    assert os.path.exists(out_dir), out_dir
    # Look for aligner indices
    for fname in os.listdir(out_dir):
        if fname.endswith(DIR_TARGETS):
            return os.path.join(out_dir, fname)
        if fname.endswith(BASENAME_TARGETS):
            base = os.path.join(out_dir, utils.splitext_plus(os.path.basename(fname))[0])
            return glob.glob("%s*" % base)
    # Default to representing output directory
    return out_dir
def _merge_and_bgzip(orig_files, out_file, base_file, ext=""):
    """Merge a group of gzipped input files into a final bgzipped output.

    Also handles providing unique names for each input file to avoid
    collisions on multi-region output. Handles renaming with awk magic from:
    https://www.biostars.org/p/68477/

    Writes a bash script named after `base_file` that appends each renamed
    input to the uncompressed output and then bgzips it. `out_file` must end
    in ``.gz``; the merged output is asserted to exist and be non-empty.
    """
    assert out_file.endswith(".gz")
    # Strip only the trailing extension; str.replace would also remove
    # ".gz" appearing elsewhere in the path.
    full_file = out_file[:-len(".gz")]
    run_file = "%s-merge.bash" % utils.splitext_plus(base_file)[0]
    cmds = ["set -e\n"]
    for i, fname in enumerate(orig_files):
        # Every 4th line starting at the first is a read name: replace it with
        # a unique "@<file index>_<read count><ext>" identifier.
        cmd = ("""zcat %s | awk '{print (NR%%4 == 1) ? "@%s_" ++i "%s" : $0}' >> %s\n"""
               % (fname, i, ext, full_file))
        cmds.append(cmd)
    cmds.append("bgzip -f %s\n" % full_file)
    with open(run_file, "w") as out_handle:
        out_handle.write("".join(cmds))
    do.run([do.find_bash(), run_file], "Rename, merge and bgzip CRAM fastq output")
    assert os.path.exists(out_file) and not _is_gzip_empty(out_file)
def _prepare_inputs(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Prepare input VCF and BED files for validation.

    Ensures both the reference material and the call VCF are bgzipped and
    indexed, extracting the current sample from multi-sample call files
    first, and computes the merged evaluation intervals.

    Returns a tuple of (call VCF, reference VCF, interval BED).
    """
    def _is_indexed_gz(fname):
        return fname.endswith(".vcf.gz") and os.path.exists(fname + ".tbi")

    if not _is_indexed_gz(rm_file):
        rm_file = vcfutils.bgzip_and_index(rm_file, data["config"], out_dir=base_dir)
    if len(vcfutils.get_samples(vrn_file)) > 1:
        base, ext = utils.splitext_plus(os.path.basename(vrn_file))
        sample_file = os.path.join(base_dir, "%s-%s%s" % (base, dd.get_sample_name(data), ext))
        vrn_file = vcfutils.select_sample(vrn_file, dd.get_sample_name(data),
                                          sample_file, data["config"])
    if not _is_indexed_gz(vrn_file):
        vrn_file = vcfutils.bgzip_and_index(vrn_file, data["config"], out_dir=base_dir)
    interval_bed = _get_merged_intervals(rm_interval_file, vrn_file, base_dir, data)
    return vrn_file, rm_file, interval_bed
def _get_coverage_file(in_bam, ref_file, region, region_file, depth, base_file, data):
    """Retrieve summary of coverage in a region.

    Requires positive non-zero mapping quality at a position, matching
    GATK's CallableLoci defaults.

    Returns ``<base_file base>-genomecov.bed``. When the coverage run leaves
    no output, the file is filled with zero-coverage lines for the
    reference features in `region`.
    """
    out_file = "%s-genomecov.bed" % utils.splitext_plus(base_file)[0]
    config = data["config"]
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            fai_file = ref.fasta_idx(ref_file, config)
            sambamba = config_utils.get_program("sambamba", config)
            bedtools = config_utils.get_program("bedtools", config)
            cov_cmd = ("{sambamba} view -F 'mapping_quality > 0' -L {region_file} -f bam -l 1 {in_bam} | "
                       "{bedtools} genomecov -split -ibam stdin -bga -g {fai_file} "
                       "> {tx_out_file}")
            cov_cmd = cov_cmd.format(sambamba=sambamba, region_file=region_file, in_bam=in_bam,
                                     bedtools=bedtools, fai_file=fai_file,
                                     tx_out_file=tx_out_file)
            do.run(cov_cmd, "bedtools genomecov: %s" % (str(region)), data)
    # Empty output file, no coverage for the whole contig
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                for feat in get_ref_bedtool(ref_file, config, region):
                    zero_line = "%s\t%s\t%s\t%s\n" % (feat.chrom, feat.start, feat.end, 0)
                    out_handle.write(zero_line)
    return out_file
def _callable_from_gvcf(data, vrn_file, out_dir):
    """Retrieve callable regions based on ref call regions in gVCF.

    Uses https://github.com/lijiayong/gvcf_regions

    Returns the merged BED file of callable regions, or None when the
    configured variant caller has no supported gVCF type.
    """
    methods = {"freebayes": "freebayes", "platypus": "platypus", "gatk-haplotype": "gatk"}
    gvcf_type = methods.get(dd.get_variantcaller(data))
    if not gvcf_type:
        return None
    vrn_base = utils.splitext_plus(os.path.basename(vrn_file))[0]
    out_file = os.path.join(out_dir, "%s-gcvf-coverage.bed" % vrn_base)
    if not utils.file_uptodate(out_file, vrn_file):
        with file_transaction(data, out_file) as tx_out_file:
            convert_cmd = ("gvcf_regions.py --gvcf_type {gvcf_type} {vrn_file} "
                           "| bedtools merge > {tx_out_file}")
            do.run(convert_cmd.format(gvcf_type=gvcf_type, vrn_file=vrn_file,
                                      tx_out_file=tx_out_file),
                   "Convert gVCF to BED file of callable regions")
    return out_file
def merge_overlaps(in_file, data, out_dir=None):
    """Merge bed file intervals to avoid overlapping regions.

    Overlapping regions (1:1-100, 1:90-100) cause issues with callers like
    FreeBayes that don't collapse BEDs prior to using them.

    The merged file is written to `out_dir` when given, otherwise to the
    ``bedprep`` directory inside the work directory, falling back to the
    directory of `in_file`. A bgzipped and indexed copy is prepared
    alongside it.

    Returns the merged (uncompressed) BED path, or None without an input.
    """
    if not in_file:
        return None
    bedtools = config_utils.get_program("bedtools", data["config"])
    work_dir = tz.get_in(["dirs", "work"], data)
    if out_dir:
        # Explicit output location requested by the caller
        bedprep_dir = utils.safe_makedir(out_dir)
    elif work_dir:
        bedprep_dir = utils.safe_makedir(os.path.join(work_dir, "bedprep"))
    else:
        bedprep_dir = os.path.dirname(in_file)
    out_file = os.path.join(bedprep_dir, "%s-merged.bed" % (utils.splitext_plus(os.path.basename(in_file))[0]))
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            cmd = "{bedtools} merge -i {in_file} > {tx_out_file}"
            do.run(cmd.format(bedtools=bedtools, in_file=in_file, tx_out_file=tx_out_file),
                   "Prepare merged BED file", data)
    vcfutils.bgzip_and_index(out_file, data["config"], remove_orig=False)
    return out_file
def get_region_bed(region, items, out_file, want_gzip=True):
    """Retrieve BED file of regions to analyze, either single or multi-region.

    A target that is not an existing file is treated as a
    (chrom, start, end) triple and written to ``<out_file base>-regions.bed``.
    The target is then merged into the directory of `out_file`.

    Returns the merged BED path, with ``.gz`` appended when `want_gzip`.
    Raises ValueError when no target regions are available.
    """
    first = items[0]
    variant_regions = bedutils.merge_overlaps(bedutils.population_variant_regions(items), first)
    target = shared.subset_variant_regions(variant_regions, region, out_file, items)
    if not target:
        raise ValueError("Need BED input for strelka2 regions: %s %s" % (region, target))
    is_bed_file = isinstance(target, basestring) and os.path.isfile(target)
    if not is_bed_file:
        chrom, start, end = target
        target = "%s-regions.bed" % utils.splitext_plus(out_file)[0]
        with file_transaction(first, target) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                out_handle.write("%s\t%s\t%s\n" % (chrom, start, end))
    merged_file = bedutils.merge_overlaps(target, first, out_dir=os.path.dirname(out_file))
    return merged_file + ".gz" if want_gzip else merged_file
def summarize(calls, data):
    """Summarize results from multiple callers into a single flattened BED file.

    Concatenates the BED files created for each call, then sorts and merges
    them into ``<sample>-ensemble.bed``. When that file exists, an
    ``ensemble`` entry pointing to it is appended to `calls`.

    Returns `calls`, which is modified in place.
    """
    sample = tz.get_in(["rgnames", "sample"], data)
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "structural",
                                               sample, "ensemble"))
    out_file = os.path.join(work_dir, "%s-ensemble.bed" % sample)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            with shared.bedtools_tmpdir(data):
                # Build a real list so the length check works even where
                # filter() returns a lazy iterator.
                candidate_beds = [_create_bed(c, out_file) for c in calls]
                input_beds = [bed for bed in candidate_beds if bed is not None]
                if len(input_beds) > 0:
                    all_file = "%s-all.bed" % utils.splitext_plus(tx_out_file)[0]
                    with open(all_file, "w") as out_handle:
                        # Explicit opens avoid leaving the module-level
                        # fileinput state active after the loop.
                        for input_bed in input_beds:
                            with open(input_bed) as in_handle:
                                for line in in_handle:
                                    out_handle.write(line)
                    pybedtools.BedTool(all_file).sort(stream=True).merge(nms=True).saveas(tx_out_file)
    if utils.file_exists(out_file):
        calls.append({"variantcaller": "ensemble",
                      "vrn_file": out_file})
    return calls
def _prep_cnv_file(in_file, svcaller, work_dir, data):
    """Create a CSV file of CNV calls with log2 and number of marks.

    Reads the tab delimited input, skipping its header line, and writes
    autosomal chromosome rows with UCSC-style chromosome names as
    ``chrom, start, end, num.mark, seg.mean``. Returns the CSV path.
    """
    out_file = os.path.join(work_dir, "%s-%s-prep.csv" % (utils.splitext_plus(os.path.basename(in_file))[0],
                                                          svcaller))
    autosomal_chroms = _get_autosomal_chroms()
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            with open(in_file) as in_handle:
                with open(tx_out_file, "w") as out_handle:
                    reader = csv.reader(in_handle, dialect="excel-tab")
                    writer = csv.writer(out_handle)
                    writer.writerow(["chrom", "start", "end", "num.mark", "seg.mean"])
                    # Skip the header; the builtin works across Python
                    # versions and tolerates an empty input file.
                    next(reader, None)
                    for chrom, start, end, _, log2, probes in reader:
                        if chrom in autosomal_chroms:
                            writer.writerow([_to_ucsc_style(chrom), start, end, probes, log2])
    return out_file
def _cnn_tranch_filtering(in_file, vrn_files, tensor_type, data):
    """Filter CNN scored VCFs in tranches using standard SNP and Indel truth sets.

    `tensor_type` selects the INFO key used for filtering: ``reference``
    uses CNN_1D and ``read_tensor`` uses CNN_2D. Returns the bgzipped and
    indexed ``-filter.vcf.gz`` output.

    Raises ValueError when the HapMap or indel training inputs are missing
    from `vrn_files`.
    """
    out_file = "%s-filter.vcf.gz" % utils.splitext_plus(in_file)[0]
    if not utils.file_uptodate(out_file, in_file):
        runner = broad.runner_from_config(data["config"])
        gatk_type = runner.gatk_type()
        assert gatk_type == "gatk4", "CNN filtering requires GATK4"
        if "train_hapmap" not in vrn_files:
            raise ValueError("CNN filtering requires HapMap training inputs: %s" % vrn_files)
        # The indel truth set is required as well; fail with a clear message
        # rather than a KeyError while building the command line.
        if "train_indels" not in vrn_files:
            raise ValueError("CNN filtering requires indel training inputs: %s" % vrn_files)
        with file_transaction(data, out_file) as tx_out_file:
            params = ["-T", "FilterVariantTranches", "--variant", in_file,
                      "--output", tx_out_file,
                      "--snp-truth-vcf", vrn_files["train_hapmap"],
                      "--indel-truth-vcf", vrn_files["train_indels"]]
            if tensor_type == "reference":
                params += ["--info-key", "CNN_1D", "--tranche", "99"]
            else:
                assert tensor_type == "read_tensor"
                params += ["--info-key", "CNN_2D", "--tranche", "99"]
            runner.run_gatk(params)
    return vcfutils.bgzip_and_index(out_file, data["config"])
def _freebayes_cutoff(in_file, data):
    """Perform filtering of FreeBayes results, flagging low confidence calls.

    Filters using cutoffs on low depth based on Meynert et al's work modeling
    sensitivity of homozygote and heterozygote calling on depth:

    http://www.ncbi.nlm.nih.gov/pubmed/23773188

    and high depth heterozygote SNP filtering based on Heng Li's work
    evaluating variant calling artifacts:

    http://arxiv.org/abs/1404.0929

    Tuned based on NA12878 call comparisons to Genome in a Bottle reference
    genome.

    Inputs without variants are copied to the ``-filter`` output unchanged.
    """
    if not vcfutils.vcf_has_variants(in_file):
        base, ext = utils.splitext_plus(in_file)
        out_file = "{base}-filter{ext}".format(base=base, ext=ext)
        if not utils.file_exists(out_file):
            shutil.copy(in_file, out_file)
        if out_file.endswith(".vcf.gz"):
            out_file = vcfutils.bgzip_and_index(out_file, data["config"])
        return out_file

    # High depth thresholds stay unset unless the filter is enabled and
    # an average depth is available.
    depth_thresh = None
    qual_thresh = None
    if _do_high_depth_filter(data):
        stats = _calc_vcf_stats(in_file)
        if stats["avg_depth"] > 0:
            depth_cutoff = stats["avg_depth"] + 3 * math.pow(stats["avg_depth"], 0.5)
            depth_thresh = int(math.ceil(depth_cutoff))
            qual_thresh = depth_thresh * 2.0  # Multiplier from default GATK QD cutoff filter
    filters = ('(AF[0] <= 0.5 && (max(FORMAT/DP) < 4 || (max(FORMAT/DP) < 13 && %QUAL < 10))) || '
               '(AF[0] > 0.5 && (max(FORMAT/DP) < 4 && %QUAL < 50))')
    if depth_thresh:
        high_depth = ' || (%QUAL < {qual_thresh} && max(FORMAT/DP) > {depth_thresh} && AF[0] <= 0.5)'
        filters += high_depth.format(qual_thresh=qual_thresh, depth_thresh=depth_thresh)
    return cutoff_w_expression(in_file, filters, data, name="FBQualDepth")
def _filter_paired(tumor, normal, out_file, reference, data):
    """Filter a paired VCF file with the GATK framework SomaticPindelFilter.

    The existing `out_file` is moved to ``<base>-tmp.vcf`` and used as the
    filter input; the filtered calls are written back to `out_file`.

    :param    tumor: (str) sample name for tumor
    :param    normal: (str) sample name for normal
    :param    out_file: (str) final vcf file
    :param    reference: (str) genome in fasta format
    :param    data: (dict) information from yaml file(items[0])
    :returns: (str) name of final vcf file
    """
    unfiltered_file = utils.splitext_plus(out_file)[0] + "-tmp.vcf"
    shutil.move(out_file, unfiltered_file)
    config = data["config"]
    with file_transaction(data, out_file) as tx_out_file:
        filter_args = ["-T", "SomaticPindelFilter",
                       "-V", unfiltered_file,
                       "-o", tx_out_file,
                       "-TID", tumor,
                       "-NID", normal,
                       "-R", reference]
        jvm_opts = broad.get_gatk_framework_opts(config)
        gatk_framework = config_utils.get_program("gatk-framework", config)
        do.run([gatk_framework] + jvm_opts + filter_args, "Filter pindel variants")
    return out_file
def hard_w_expression(vcf_file, expression, data, name="+", filterext="", extra_cmd=""):
    """Perform hard filtering using bcftools expressions like %QUAL < 20 || DP < 4.

    Records matching `expression` are marked through ``bcftools filter``
    using `name` as the filter label; `extra_cmd` is appended to the
    command before optional bgzip compression. Filtering is restricted to
    the configured variant regions when present. Inputs without variants
    are copied through unchanged.

    Returns ``<base>-filter<filterext><ext>``, bgzipped and indexed when it
    ends in ``.vcf.gz``.
    """
    base, ext = utils.splitext_plus(vcf_file)
    out_file = "{base}-filter{filterext}{ext}".format(base=base, filterext=filterext, ext=ext)
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            if vcfutils.vcf_has_variants(vcf_file):
                bcftools = config_utils.get_program("bcftools", data["config"])
                bgzip_cmd = "| bgzip -c" if out_file.endswith(".gz") else ""
                variant_regions = utils.get_in(data, ("config", "algorithm", "variant_regions"))
                intervals = ("-T %s" % vcfutils.bgzip_and_index(variant_regions, data["config"])
                             if variant_regions else "")
                cmd = ("{bcftools} filter -O v {intervals} --soft-filter '{name}' "
                       "-e '{expression}' -m '+' {vcf_file} {extra_cmd} {bgzip_cmd} > {tx_out_file}")
                do.run(cmd.format(bcftools=bcftools, intervals=intervals, name=name,
                                  expression=expression, vcf_file=vcf_file,
                                  extra_cmd=extra_cmd, bgzip_cmd=bgzip_cmd,
                                  tx_out_file=tx_out_file),
                       "Hard filtering %s with %s" % (vcf_file, expression), data)
            else:
                # Copy into the transactional file, not the final path, so an
                # interrupted copy cannot leave a partial final output behind.
                shutil.copy(vcf_file, tx_out_file)
    if out_file.endswith(".vcf.gz"):
        out_file = vcfutils.bgzip_and_index(out_file, data["config"])
    return out_file
def _setup_variant_regions(data, out_dir):
    """Ensure variant regions are available for calling, cleaned of extra contigs.

    Uses the configured variant regions, falling back to the "transcripts" BED
    from ``regions.get_sv_bed`` when none are set. The cleaned BED keeps only
    intervals on contigs found in the reference file that also pass
    ``chromhacks.is_nonalt``.
    NOTE(review): no ``noalt_calling`` setting is consulted in this function;
    the non-alt restriction is applied unconditionally -- confirm intent.

    :param data: (dict) sample information
    :param out_dir: (str) directory handed to ``regions.get_sv_bed`` for the fallback
    :returns: (dict) data with variant regions pointing at the cleaned BED file
    """
    vr_file = dd.get_variant_regions(data) or regions.get_sv_bed(data, "transcripts", out_dir=out_dir)
    contigs = {contig.name for contig in ref.file_contigs(dd.get_ref_file(data))}
    bedprep_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "bedprep"))
    vr_base = utils.splitext_plus(os.path.basename(vr_file))[0]
    out_file = os.path.join(bedprep_dir, "%s-rnaseq_clean.bed" % vr_base)
    if not utils.file_uptodate(out_file, vr_file):
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                with shared.bedtools_tmpdir(data):
                    for interval in pybedtools.BedTool(vr_file):
                        chrom = interval.chrom
                        if chrom in contigs and chromhacks.is_nonalt(chrom):
                            out_handle.write(str(interval))
    return dd.set_variant_regions(data, out_file)
def fix_somatic_calls(in_file, config):
    """Standardize somatic variant output to use the SOMATIC INFO flag.

    Records carrying a ``VT`` INFO entry lose it, and gain ``SOMATIC`` when
    its value was "somatic". The rewritten VCF replaces ``in_file``; the
    original is kept under ``in_file`` + ".orig". Nothing is done when
    ``in_file`` does not exist.

    :param in_file: (str) VCF file to fix in place
    :param config: (dict) configuration passed to ``bgzip_and_index``
    :raises ImportError: when PyVCF is not available
    """
    if vcf is None:
        raise ImportError("Require PyVCF for manipulating cancer VCFs")

    # HACK: Needed to replicate the structure used by PyVCF
    Info = namedtuple('Info', ['id', 'num', 'type', 'desc'])
    somatic_info = Info(id='SOMATIC', num=0, type='Flag', desc='Somatic event')

    # NOTE: PyVCF will write an uncompressed VCF
    base = utils.splitext_plus(in_file)[0]
    out_file = "{0}-{1}{2}".format(base, "somaticfix", ".vcf")

    if not utils.file_exists(in_file):
        return

    reader = vcf.VCFReader(filename=in_file)
    # Register the flag in the header so the writer template declares it
    reader.infos["SOMATIC"] = somatic_info

    with file_transaction(out_file) as tx_out_file:
        with open(tx_out_file, "wb") as handle:
            writer = vcf.VCFWriter(handle, template=reader)
            for record in reader:
                info = record.INFO
                # Handle FreeBayes
                if "VT" in info:
                    if info["VT"] == "somatic":
                        record.add_info("SOMATIC", True)
                    # Discard old record
                    del info["VT"]
                writer.write_record(record)

    # Re-compress the file
    out_file = bgzip_and_index(out_file, config)
    # Keep the original alongside, then swap the fixed file into its place
    _move_vcf(in_file, "{0}.orig".format(in_file))
    _move_vcf(out_file, in_file)
    # Leave a text marker at the intermediate path pointing to the final file
    with open(out_file, "w") as out_handle:
        out_handle.write("Moved to {0}".format(in_file))
def dedup_bam(in_bam, data):
    """Mark duplicates in a BAM file, non-streaming, using biobambam.

    :param in_bam: (str) input BAM file
    :param data: (dict) sample information with a ``config`` entry
    :returns: (str) the ``-dedup`` BAM file, or ``in_bam`` unchanged when
        ``_check_dedup`` reports no deduplication for this sample
    """
    if not _check_dedup(data):
        return in_bam

    out_file = "%s-dedup%s" % utils.splitext_plus(in_bam)
    if not utils.file_exists(out_file):
        with tx_tmpdir(data) as tmpdir:
            with file_transaction(data, out_file) as tx_out_file:
                bammarkduplicates = config_utils.get_program("bammarkduplicates", data["config"])
                # Temporary file prefix named after the transactional output
                tx_stem = os.path.splitext(os.path.basename(tx_out_file))[0]
                base_tmp = os.path.join(tmpdir, tx_stem)
                # Only the core count is used in the command line
                cores, _mem = _get_cores_memory(data, downscale=3)
                cmd = ("{bammarkduplicates} tmpfile={base_tmp}-markdup "
                       "markthreads={cores} I={in_bam} O={tx_out_file}")
                do.run(cmd.format(bammarkduplicates=bammarkduplicates, base_tmp=base_tmp,
                                  cores=cores, in_bam=in_bam, tx_out_file=tx_out_file),
                       "De-duplication with biobambam")
    bam.index(out_file, data["config"])
    return out_file
def _prioritize_plot_regions(region_bt, data, out_dir=None):
    """Limit the regions to plot, avoiding speed issues with large inputs.

    XXX For now this only drops regions of 100kb or larger and keeps the
    first 1000 of the remainder. Longer term we'll insert biology-based
    prioritization.

    :param region_bt: BedTool-like object; iterated for regions, ``fn`` names its file
    :param data: (dict) sample information, passed to ``file_transaction``
    :param out_dir: (str) optional directory for the output file
    :returns: (str) name of the three column BED file of prioritized regions
    """
    max_plots = 1000
    max_size = 100 * 1000  # 100kb
    out_file = "%s-priority%s" % utils.splitext_plus(region_bt.fn)
    if out_dir:
        out_file = os.path.join(out_dir, os.path.basename(out_file))
    if not utils.file_uptodate(out_file, region_bt.fn):
        written = 0
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                for region in region_bt:
                    # Skip oversized regions and anything past the plot cap
                    if region.stop - region.start >= max_size or written >= max_plots:
                        continue
                    written += 1
                    out_handle.write("%s\t%s\t%s\n" % (region.chrom, region.start, region.stop))
    return out_file