def _salmon_quant_reads(fq1, fq2, salmon_dir, index, data):
    samplename = dd.get_sample_name(data)
    quant_dir = os.path.join(salmon_dir, "quant")
    safe_makedir(salmon_dir)
    out_file = os.path.join(quant_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    num_cores = dd.get_num_cores(data)
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    cmd = ("{salmon} quant -l A -i {index} -p {num_cores} "
           "-o {tx_out_dir} ")
    fq1_cmd = "<(cat {fq1})" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "<(cat {fq2})" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = ("Quantifying transcripts in %s and %s with Salmon."
                   % (fq1, fq2))
        do.run(cmd.format(**locals()), message, None)
    return out_file
def salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, ref_file, data):
    samplename = dd.get_sample_name(data)
    quant_dir = os.path.join(salmon_dir, "quant")
    safe_makedir(salmon_dir)
    out_file = os.path.join(quant_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    libtype = sailfish._libtype_string(fq1, fq2, strandedness)
    index = salmon_index(gtf_file, ref_file, data, salmon_dir)
    cmd = ("{salmon} quant {libtype} -i {index} -p {num_cores} "
           "-o {tx_out_dir} ")
    fq1_cmd = "{fq1}" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "{fq2}" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    # skip --useVBOpt for now, it can cause segfaults
    cmd += "--numBootstraps 30 "
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = ("Quantifying transcripts in %s and %s with Salmon."
                   % (fq1, fq2))
        do.run(cmd.format(**locals()), message, None)
    return out_file
def trim_srna_sample(data):
    """Remove 3' adapters for smallRNA-seq.

    Uses atropos (a cutadapt fork) with different parameters than for other pipelines.
    """
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    if utils.file_exists(out_file):
        data["clean_fastq"] = out_file
        data["collapse"] = _collapse(data["clean_fastq"])
        data["size_stats"] = _summary(data['collapse'])
        return [[data]]
    adapter = dd.get_adapters(data)
    if trim_reads and not adapter and error_dnapi:
        raise ValueError(error_dnapi)
    adapters = adapter if adapter else _dnapi_prediction(in_file, out_dir)
    times = "" if len(adapters) == 1 else "--times %s" % len(adapters)
    if trim_reads and adapters:
        adapter_cmd = " ".join(map(lambda x: "-a " + x, adapters))
        out_noadapter_file = replace_directory(append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"), out_dir)
        log_out = os.path.join(out_dir, "%s.log" % names)
        atropos = _get_atropos()
        options = " ".join(data.get('resources', {}).get('atropos', {}).get("options", ""))
        cores = ("--threads %s" % dd.get_num_cores(data) if dd.get_num_cores(data) > 1 else "")
        if " ".join(data.get('resources', {}).get('cutadapt', {}).get("options", "")):
            raise ValueError("Atropos is now used, but cutadapt options were found in the YAML file. "
                             "See https://atropos.readthedocs.io/en/latest/")
        cmd = _cmd_atropos()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter for %s" % names)
                if utils.file_exists(log_out):
                    content = open(log_out).read().replace(out_short_file, names)
                    open(log_out, 'w').write(content)
                if options:
                    in_file = append_stem(tx_out_file, ".tmp")
                    utils.move_safe(tx_out_file, in_file)
                    cmd = "{atropos} {cores} {options} -se {in_file} -o {tx_out_file} -m 17"
                    do.run(cmd.format(**locals()),
                           "atropos with options %s for %s" % (options, names))
    else:
        if not trim_reads:
            logger.debug("Skip trimming for: %s" % names)
        elif not adapters:
            logger.info("No adapter found in %s; this usually indicates a lack"
                        " of small RNA enrichment in the sample." % names)
        symlink_plus(in_file, out_file)
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
def _atropos_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with atropos.
    """
    report_file = os.path.join(out_dir, "%s-report.json" %
                               utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [os.path.join(out_dir, "%s-trimmed.fq.gz" %
                              utils.splitext_plus(os.path.basename(x))[0]) for x in fastq_files]
    if not utils.file_exists(out_files[0]):
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report_file, tx_out1 = tx_out[:2]
            if len(tx_out) > 2:
                tx_out2 = tx_out[2]
            # polyX trimming, anchored to the 3' ends of reads
            if "polyx" in dd.get_adapters(data):
                adapters += ["A{200}", "C{200}", "G{200}", "T{200}"]
            adapters_args = " ".join(["-a '%s'" % a for a in adapters])
            adapters_args += " --overlap 8"  # Avoid very short internal matches (default is 3)
            adapters_args += " --no-default-adapters --no-cache-adapters"  # Prevent GitHub queries and saving pickles
            aligner_args = "--aligner adapter"
            if len(fastq_files) == 1:
                cores = dd.get_num_cores(data)
                input_args = "-se %s" % objectstore.cl_input(fastq_files[0])
                output_args = "-o >(bgzip --threads {cores} -c > {tx_out1})".format(**locals())
            else:
                assert len(fastq_files) == 2, fastq_files
                cores = max(1, dd.get_num_cores(data) // 2)
                adapters_args = adapters_args + " " + " ".join(["-A '%s'" % a for a in adapters])
                input_args = "-pe1 %s -pe2 %s" % tuple([objectstore.cl_input(x) for x in fastq_files])
                output_args = ("-o >(bgzip --threads {cores} -c > {tx_out1}) "
                               "-p >(bgzip --threads {cores} -c > {tx_out2})").format(**locals())
            quality_base = "64" if dd.get_quality_format(data).lower() == "illumina" else "33"
            sample_name = dd.get_sample_name(data)
            report_args = "--report-file %s --report-formats json --sample-id %s" % (
                tx_report_file, dd.get_sample_name(data))
            ropts = " ".join(str(x) for x in
                             config_utils.get_resources("atropos", data["config"]).get("options", []))
            extra_opts = []
            for k, alt_ks, v, want in [("--quality-cutoff", ["-q "], "5", True),
                                       ("--minimum-length", ["-m "], str(dd.get_min_read_length(data)), True),
                                       ("--nextseq-trim", [], "25",
                                        ("polyx" in dd.get_adapters(data) or "polyg" in dd.get_adapters(data)))]:
                if k not in ropts and not any(alt_k in ropts for alt_k in alt_ks):
                    if want:
                        extra_opts.append("%s=%s" % (k, v))
            extra_opts = " ".join(extra_opts)
            thread_args = ("--threads %s" % cores if cores > 1 else "")
            cmd = ("atropos trim {ropts} {thread_args} --quality-base {quality_base} --format fastq "
                   "{adapters_args} {input_args} {output_args} {report_args} {extra_opts}")
            do.run(cmd.format(**locals()), "Trimming with atropos: %s" % dd.get_sample_name(data))
    return out_files, report_file
def calculate(bam_file, data):
    """Calculate coverage in parallel using samtools depth through goleft.

    samtools depth removes duplicates and secondary reads from the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;
    """
    params = {"window_size": 5000, "parallel_window_size": 1e5, "min": dd.get_coverage_depth_min(data),
              "high_multiplier": 20}
    prefix = os.path.join(
        utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data))),
        "%s-coverage" % (dd.get_sample_name(data)))
    depth_file = prefix + ".depth.bed"
    callable_file = prefix + ".callable.bed"
    variant_regions = dd.get_variant_regions_merged(data)
    variant_regions_avg_cov = get_average_coverage(data, bam_file, variant_regions, "variant_regions")
    if not utils.file_uptodate(callable_file, bam_file):
        cmd = ["goleft", "depth", "--q", "1", "--mincov", str(params["min"]),
               "--processes", str(dd.get_num_cores(data)), "--ordered"]
        max_depth = _get_max_depth(variant_regions_avg_cov, params, data)
        if max_depth:
            cmd += ["--maxmeandepth", str(int(max_depth))]
        with file_transaction(data, depth_file) as tx_depth_file:
            with utils.chdir(os.path.dirname(tx_depth_file)):
                tx_callable_file = tx_depth_file.replace(".depth.bed", ".callable.bed")
                prefix = tx_depth_file.replace(".depth.bed", "")
                bam_ref_file = "%s-bamref.fa" % utils.splitext_plus(bam_file)[0]
                bam.fai_from_bam(dd.get_ref_file(data), bam_file, bam_ref_file + ".fai", data)
                cmd += ["--reference", bam_ref_file]
                cmd += ["--prefix", prefix, bam_file]
                bcbio_env = utils.get_bcbio_env()
                msg = "Calculate coverage: %s" % dd.get_sample_name(data)
                do.run(cmd, msg, env=bcbio_env)
                shutil.move(tx_callable_file, callable_file)
    final_callable = _subset_to_variant_regions(callable_file, variant_regions, data)
    return depth_file, final_callable, _extract_highdepth(final_callable, data), variant_regions_avg_cov
def run(calls, data):
    """Run MetaSV if we have enough supported callers, adding output to the set of calls.
    """
    work_dir = _sv_workdir(data)
    out_file = os.path.join(work_dir, "variants.vcf.gz")
    cmd = _get_cmd() + ["--sample", dd.get_sample_name(data), "--reference", dd.get_ref_file(data),
                        "--bam", dd.get_align_bam(data), "--outdir", work_dir]
    available_callers = 0
    for call in calls:
        if call["variantcaller"] in SUPPORTED:
            available_callers += 1
            cmd += ["--%s_vcf" % call["variantcaller"], call.get("vcf_file", call["vrn_file"])]
    if available_callers >= MIN_CALLERS:
        if not utils.file_exists(out_file):
            tx_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
            ins_stats = shared.calc_paired_insert_stats_save(dd.get_align_bam(data),
                                                             os.path.join(tx_work_dir, "insert-stats.yaml"))
            cmd += ["--workdir", tx_work_dir, "--num_threads", str(dd.get_num_cores(data))]
            cmd += ["--spades", utils.which("spades.py"), "--age", utils.which("age_align")]
            cmd += ["--boost_ins", "--isize_mean", ins_stats["mean"], "--isize_sd", ins_stats["std"]]
            do.run(cmd, "Combine variant calls with MetaSV")
        calls.append({"variantcaller": "metasv", "vrn_file": out_file})
    return calls
def priority_total_coverage(data, out_dir):
    """Calculate coverage at 10 depth intervals in the priority regions.
    """
    from bcbio.structural import prioritize
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file) or prioritize.is_gene_list(bed_file):
        return {}
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if file_exists(out_file):
        # data['priority_total_coverage'] = os.path.abspath(out_file)
        return out_file
    nthreads = dd.get_num_cores(data)
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    with tx_tmpdir(data, work_dir) as tmp_dir:
        cleaned_bed = clean_file(bed_file, data)
        with file_transaction(out_file) as tx_out_file:
            cmd = ("{sambamba} depth region -t {nthreads} -L {cleaned_bed} "
                   "-F \"not unmapped\" "
                   "-T 10 -T 20 -T 30 -T 40 -T 50 -T 60 -T 70 -T 80 -T 90 -T 100 "
                   "{in_bam} -o {tx_out_file}")
            message = "Calculating coverage of {bed_file} regions in {in_bam}"
            do.run(cmd.format(**locals()), message.format(**locals()))
    # data['priority_total_coverage'] = os.path.abspath(out_file)
    return out_file
def _run_cobalt(paired, work_dir):
    """Run Cobalt for counting read depth across genomic windows.

    PURPLE requires even 1000bp windows so use integrated counting solution
    directly rather than converting from CNVkit calculations. If this approach
    is useful should be moved upstream to be available to other tools as
    an input comparison.

    https://github.com/hartwigmedical/hmftools/tree/master/count-bam-lines
    """
    cobalt_dir = utils.safe_makedir(os.path.join(work_dir, "cobalt"))
    out_file = os.path.join(cobalt_dir, "%s.cobalt" % dd.get_sample_name(paired.tumor_data))
    if not utils.file_exists(out_file):
        with file_transaction(paired.tumor_data, out_file) as tx_out_file:
            cmd = ["COBALT"] + _get_jvm_opts(tx_out_file, paired.tumor_data) + \
                  ["-reference", paired.normal_name, "-reference_bam", paired.normal_bam,
                   "-tumor", paired.tumor_name, "-tumor_bam", paired.tumor_bam,
                   "-threads", dd.get_num_cores(paired.tumor_data),
                   "-output_dir", os.path.dirname(tx_out_file),
                   "-gc_profile", dd.get_variation_resources(paired.tumor_data)["gc_profile"]]
            cmd = "%s && %s" % (utils.get_R_exports(), " ".join([str(x) for x in cmd]))
            do.run(cmd, "PURPLE: COBALT read depth normalization")
            for f in os.listdir(os.path.dirname(tx_out_file)):
                if f != os.path.basename(tx_out_file):
                    shutil.move(os.path.join(os.path.dirname(tx_out_file), f),
                                os.path.join(cobalt_dir, f))
    return out_file
def priority_total_coverage(data):
    """Calculate coverage at 10 depth intervals in the priority regions.
    """
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file):
        return data
    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if file_exists(out_file):
        data['priority_total_coverage'] = os.path.abspath(out_file)
        return data
    nthreads = dd.get_num_cores(data)
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    with tx_tmpdir(data, work_dir) as tmp_dir:
        cleaned_bed = os.path.join(tmp_dir, os.path.basename(bed_file))
        cleaned_bed = bed.decomment(bed_file, cleaned_bed)
        with file_transaction(out_file) as tx_out_file:
            cmd = ("{sambamba} depth region -t {nthreads} -L {cleaned_bed} "
                   "-F \"not unmapped\" "
                   "-T 10 -T 20 -T 30 -T 40 -T 50 -T 60 -T 70 -T 80 -T 90 -T 100 "
                   "{in_bam} -o {tx_out_file}")
            message = "Calculating coverage of {bed_file} regions in {in_bam}"
            do.run(cmd.format(**locals()), message.format(**locals()))
    data['priority_total_coverage'] = os.path.abspath(out_file)
    return data
def combine_variant_files(orig_files, out_file, ref_file, config,
                          quiet_out=True, region=None):
    """Combine VCF files from the same sample into a single output file.

    Handles cases where we split files into SNPs/Indels for processing then
    need to merge back into a final file.
    """
    in_pipeline = False
    if isinstance(orig_files, dict):
        file_key = config["file_key"]
        in_pipeline = True
        orig_files = orig_files[file_key]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            exist_files = [x for x in orig_files if os.path.exists(x)]
            ready_files = run_multicore(p_bgzip_and_index, [[x, config] for x in exist_files], config)
            dict_file = "%s.dict" % utils.splitext_plus(ref_file)[0]
            cores = dd.get_num_cores({"config": config})
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
            cmd = ["picard"] + broad.get_picard_opts(config, memscale) + \
                  ["MergeVcfs", "D=%s" % dict_file, "O=%s" % tx_out_file] + \
                  ["I=%s" % f for f in ready_files]
            cmd = "%s && %s" % (utils.get_java_clprep(), " ".join(cmd))
            do.run(cmd, "Combine variant files")
    if out_file.endswith(".gz"):
        bgzip_and_index(out_file, config)
    if in_pipeline:
        return [{file_key: out_file, "region": region, "sam_ref": ref_file, "config": config}]
    else:
        return out_file
def kallisto_rnaseq(fq1, fq2, kallisto_dir, gtf_file, fasta_file, data):
    samplename = dd.get_sample_name(data)
    quant_dir = os.path.join(kallisto_dir, "quant")
    safe_makedir(kallisto_dir)
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    kallisto = config_utils.get_program("kallisto", dd.get_config(data))
    index = kallisto_index(gtf_file, fasta_file, data, os.path.dirname(kallisto_dir))
    fusion_flag = "--fusion" if dd.get_fusion_mode(data) or dd.get_fusion_caller(data) else ""
    single_flag = "--single" if not fq2 else ""
    fraglength_flag = "--fragment-length=200" if not fq2 else ""
    sd_flag = "--sd=25" if not fq2 else ""
    bootstrap_flag = "--bootstrap-samples=30"
    fq2 = "" if not fq2 else fq2
    if not fq2:
        logger.warning("kallisto was run on single-end data and we set the "
                       "estimated fragment length to 200 and the standard "
                       "deviation to 25, if these don't reflect your data then "
                       "the results may be inaccurate. Use with caution. See "
                       "https://groups.google.com/forum/#!topic/kallisto-sleuth-users/h5LeAlWS33w "
                       "for details.")
    cmd = ("{kallisto} quant {fusion_flag} -t {num_cores} {single_flag} "
           "{fraglength_flag} {sd_flag} {bootstrap_flag} "
           "-o {tx_out_dir} -i {index} {fq1} {fq2}")
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = ("Quantifying transcripts with kallisto.")
        do.run(cmd.format(**locals()), message, None)
    return quant_dir
def coverage(data):
    """Calculate coverage at different completeness cutoffs for each region
    in the coverage option.
    """
    bed_file = dd.get_coverage(data)
    sambamba = config_utils.get_program("sambamba", data["config"])
    work_dir = safe_makedir(os.path.join(dd.get_work_dir(data), "report", "coverage"))
    if not bed_file:
        return data
    cleaned_bed = os.path.join(work_dir, os.path.splitext(os.path.basename(bed_file))[0] + ".cleaned.bed")
    cleaned_bed = bed.decomment(bed_file, cleaned_bed)
    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        parse_file = os.path.join(sample + "_coverage.bed")
        parse_total_file = os.path.join(sample + "_cov_total.tsv")
        cores = dd.get_num_cores(data)
        if not file_exists(parse_file):
            with tx_tmpdir(data, work_dir) as tmp_dir:
                with file_transaction(parse_file) as out_tx:
                    cmd = ("{sambamba} depth region -F \"not unmapped\" -t {cores} "
                           "%s -T 1 -T 5 -T 10 -T 20 -T 40 -T 50 -T 60 -T 70 "
                           "-T 80 -T 100 -L {cleaned_bed} {in_bam} | sed 's/# "
                           "chrom/chrom/' > {out_tx}")
                    do.run(cmd.format(**locals()) % "-C 1000", "Run coverage for {}".format(sample))
        parse_file = _add_high_covered_regions(parse_file, cleaned_bed, sample)
        _calculate_percentiles(os.path.abspath(parse_file), sample)
        data['coverage'] = os.path.abspath(parse_file)
    return data
def filter_multimappers(align_file, data):
    """Filter out multimapped reads from a bowtie2 alignment.

    bowtie2 does not have a corollary to the -m 1 flag in bowtie; there are
    some options that are close but do not do the same thing. bowtie2 sets the
    XS tag for reads mapping in more than one place, so we can filter on that.
    This will not work for other aligners.
    """
    config = dd.get_config(data)
    type_flag = "" if bam.is_bam(align_file) else "S"
    base, ext = os.path.splitext(align_file)
    out_file = base + ".unique" + ext
    if file_exists(out_file):
        return out_file
    base_filter = '-F "[XS] == null and not unmapped {paired_filter}"'
    if bam.is_paired(align_file):
        paired_filter = "and paired and proper_pair"
    else:
        paired_filter = ""
    filter_string = base_filter.format(paired_filter=paired_filter)
    sambamba = config_utils.get_program("sambamba", config)
    num_cores = dd.get_num_cores(data)
    with file_transaction(out_file) as tx_out_file:
        cmd = ('{sambamba} view -h{type_flag} '
               '--nthreads {num_cores} '
               '-f bam '
               '{filter_string} '
               '{align_file} '
               '> {tx_out_file}')
        message = "Removing multimapped reads from %s." % align_file
        do.run(cmd.format(**locals()), message)
    return out_file
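
# Illustration only (not called by the pipeline): the sambamba filter expression
# that filter_multimappers builds for a paired-end bowtie2 BAM, keeping reads
# without an XS (secondary hit) tag.
_example_filter = '-F "[XS] == null and not unmapped {paired_filter}"'.format(
    paired_filter="and paired and proper_pair")
assert _example_filter == '-F "[XS] == null and not unmapped and paired and proper_pair"'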
def _run_somatic(paired, ref_file, target, out_file):
    """Run somatic calling with octopus, handling both paired and tumor-only cases.

    Tweaks for low frequency, tumor only and UMI calling documented in:
    https://github.com/luntergroup/octopus/blob/develop/configs/UMI.config
    """
    align_bams = paired.tumor_bam
    if paired.normal_bam:
        align_bams += " %s --normal-sample %s" % (paired.normal_bam, paired.normal_name)
    cores = dd.get_num_cores(paired.tumor_data)
    # Do not try to search below 0.4% currently as leads to long runtimes
    # https://github.com/luntergroup/octopus/issues/29#issuecomment-428167979
    min_af = max([float(dd.get_min_allele_fraction(paired.tumor_data)) / 100.0, 0.004])
    min_af_floor = min_af / 4.0
    cmd = ("octopus --threads {cores} --reference {ref_file} --reads {align_bams} "
           "--regions-file {target} "
           "--min-credible-somatic-frequency {min_af_floor} --min-expected-somatic-frequency {min_af} "
           "--downsample-above 4000 --downsample-target 4000 --min-kmer-prune 5 --min-bubble-score 20 "
           "--max-haplotypes 200 --somatic-snv-mutation-rate '5e-4' --somatic-indel-mutation-rate '1e-05' "
           "--target-working-memory 5G --target-read-buffer-footprint 5G --max-somatic-haplotypes 3 "
           "--caller cancer "
           "--working-directory {tmp_dir} "
           "-o {tx_out_file} --legacy")
    if not paired.normal_bam:
        cmd += (" --tumour-germline-concentration 5")
    if dd.get_umi_type(paired.tumor_data) or _is_umi_consensus_bam(paired.tumor_bam):
        cmd += (" --allow-octopus-duplicates --overlap-masking 0 "
                "--somatic-filter-expression 'GQ < 200 | MQ < 30 | SB > 0.2 | SD[.25] > 0.1 "
                "| BQ < 40 | DP < 100 | MF > 0.1 | AD < 5 | CC > 1.1 | GQD > 2'")
    with file_transaction(paired.tumor_data, out_file) as tx_out_file:
        tmp_dir = os.path.dirname(tx_out_file)
        do.run(cmd.format(**locals()), "Octopus somatic calling")
        _produce_compatible_vcf(tx_out_file, paired.tumor_data, is_somatic=True)
    return out_file
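
# Worked example of the octopus frequency cutoffs above (illustration only),
# assuming a sample configured with min_allele_fraction: 1 (i.e. 1%).
_example_min_af = max([1.0 / 100.0, 0.004])      # 0.01; values below 0.4% clamp to 0.004
_example_min_af_floor = _example_min_af / 4.0    # 0.0025 for --min-credible-somatic-frequency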
def filter_multimappers(align_file, data):
    """Filtering a BWA alignment file for uniquely mapped reads, from here:

    https://bioinformatics.stackexchange.com/questions/508/obtaining-uniquely-mapped-reads-from-bwa-mem-alignment
    """
    config = dd.get_config(data)
    type_flag = "" if bam.is_bam(align_file) else "S"
    base, ext = os.path.splitext(align_file)
    out_file = base + ".unique" + ext
    bed_file = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    bed_cmd = '-L {0}'.format(bed_file) if bed_file else " "
    if utils.file_exists(out_file):
        return out_file
    base_filter = ('-F "not unmapped {paired_filter} and not duplicate and '
                   '[XA] == null and [SA] == null and not supplementary " ')
    if bam.is_paired(align_file):
        paired_filter = "and paired and proper_pair"
    else:
        paired_filter = ""
    filter_string = base_filter.format(paired_filter=paired_filter)
    sambamba = config_utils.get_program("sambamba", config)
    num_cores = dd.get_num_cores(data)
    with file_transaction(out_file) as tx_out_file:
        cmd = ('{sambamba} view -h{type_flag} '
               '--nthreads {num_cores} '
               '-f bam {bed_cmd} '
               '{filter_string} '
               '{align_file} '
               '> {tx_out_file}')
        message = "Removing multimapped reads from %s." % align_file
        do.run(cmd.format(**locals()), message)
    bam.index(out_file, config)
    return out_file
def annotate_with_depth(in_file, items):
    """Annotate called VCF file with depth using duphold (https://github.com/brentp/duphold)

    Currently annotates single sample and tumor samples in somatic analysis.
    """
    bam_file = None
    if len(items) == 1:
        bam_file = dd.get_align_bam(items[0])
    else:
        paired = vcfutils.get_paired(items)
        if paired:
            bam_file = paired.tumor_bam
    if bam_file:
        out_file = "%s-duphold.vcf.gz" % utils.splitext_plus(in_file)[0]
        if not utils.file_exists(out_file):
            with file_transaction(items[0], out_file) as tx_out_file:
                if not in_file.endswith(".gz"):
                    in_file = vcfutils.bgzip_and_index(in_file, remove_orig=False,
                                                       out_dir=os.path.dirname(tx_out_file))
                ref_file = dd.get_ref_file(items[0])
                # cores for BAM reader thread, so max out at 4 based on recommendations
                cores = min([dd.get_num_cores(items[0]), 4])
                cmd = ("duphold --threads {cores} --vcf {in_file} --bam {bam_file} --fasta {ref_file} "
                       "-o {tx_out_file}")
                do.run(cmd.format(**locals()), "Annotate SV depth with duphold")
        vcfutils.bgzip_and_index(out_file)
        return out_file
    else:
        return in_file
def run(bam_file, data, out_dir):
    """Run viral QC analysis.
    """
    viral_target = "gdc-viral"
    out = {}
    if vcfutils.get_paired_phenotype(data):
        viral_refs = [x for x in dd.get_viral_files(data)
                      if os.path.basename(x) == "%s.fa" % viral_target]
        if viral_refs and utils.file_exists(viral_refs[0]):
            viral_ref = viral_refs[0]
            viral_bam = os.path.join(utils.safe_makedir(out_dir),
                                     "%s-%s.bam" % (dd.get_sample_name(data),
                                                    utils.splitext_plus(os.path.basename(viral_ref))[0]))
            out_file = "%s-counts.txt" % utils.splitext_plus(viral_bam)[0]
            if not utils.file_uptodate(out_file, bam_file):
                if not utils.file_uptodate(viral_bam, bam_file):
                    with file_transaction(data, viral_bam) as tx_out_file:
                        cores = dd.get_num_cores(data)
                        tmpfile = "%s-tmp" % utils.splitext_plus(tx_out_file)[0]
                        cmd = ("samtools view -u -f 4 {bam_file} | "
                               "bamtofastq collate=0 | "
                               "bwa mem -t {cores} {viral_ref} - | "
                               "bamsort tmpfile={tmpfile} inputthreads={cores} outputthreads={cores} "
                               "inputformat=sam index=1 indexfilename={tx_out_file}.bai O={tx_out_file}")
                        do.run(cmd.format(**locals()), "Compare unmapped reads to viral genome")
                with file_transaction(data, out_file) as tx_out_file:
                    with open(tx_out_file, "w") as out_handle:
                        out_handle.write("# sample\t%s\n" % dd.get_sample_name(data))
                        for info in bam.idxstats(viral_bam, data):
                            if info.aligned > 0:
                                out_handle.write("%s\t%s\n" % (info.contig, info.aligned))
            out["base"] = out_file
    return out
def sailfish(fq1, fq2, sailfish_dir, gtf_file, ref_file, strandedness, data):
    safe_makedir(sailfish_dir)
    samplename = dd.get_sample_name(data)
    out_file = os.path.join(sailfish_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    sailfish_idx = sailfish_index(gtf_file, ref_file, data, sailfish_dir)
    num_cores = dd.get_num_cores(data)
    sailfish = config_utils.get_program("sailfish", data["config"])
    cmd = "{sailfish} quant -i {sailfish_idx} -p {num_cores} "
    cmd += _libtype_string(fq1, fq2, strandedness)
    fq1_cmd = "{fq1}" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "{fq2}" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    cmd += "--useVBOpt --numBootstraps 30 "
    cmd += "-o {tx_out_dir}"
    message = "Quantifying transcripts in {fq1} and {fq2}."
    with file_transaction(data, sailfish_dir) as tx_out_dir:
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_file
def rapmap_align(fq1, fq2, rapmap_dir, gtf_file, ref_file, algorithm, data):
    valid_algorithms = ["pseudo", "quasi"]
    assert algorithm in valid_algorithms, \
        "RapMap algorithm needs to be one of %s." % valid_algorithms
    safe_makedir(rapmap_dir)
    samplename = dd.get_sample_name(data)
    out_file = os.path.join(rapmap_dir, samplename + ".bam")
    if file_exists(out_file):
        return out_file
    rapmap_index_loc = rapmap_index(gtf_file, ref_file, algorithm, data, rapmap_dir)
    num_cores = dd.get_num_cores(data)
    algorithm_subcommand = algorithm + "map"
    rapmap = config_utils.get_program("rapmap", dd.get_config(data))
    cmd = "{rapmap} {algorithm_subcommand} -t {num_cores} -i {rapmap_index_loc} "
    fq1_cmd = "{fq1} " if not is_gzipped(fq1) else "<(gzip -cd {fq1}) "
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += "-r {fq1_cmd} "
    else:
        fq2_cmd = "{fq2} " if not is_gzipped(fq2) else "<(gzip -cd {fq2}) "
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += "-1 {fq1_cmd} -2 {fq2_cmd} "
    with file_transaction(out_file) as tx_out_file:
        cmd += "| " + postalign.sam_to_sortbam_cl(data, tx_out_file)
        run_message = ("%smapping %s and %s to %s with Rapmap. "
                       % (algorithm, fq1, fq2, rapmap_index_loc))
        do.run(cmd.format(**locals()), run_message, None)
    return out_file
def _run_amber(paired, work_dir, lenient=False):
    """AMBER: calculate allele frequencies at likely heterozygous sites.

    lenient flag allows amber runs on small test sets.
    """
    amber_dir = utils.safe_makedir(os.path.join(work_dir, "amber"))
    out_file = os.path.join(amber_dir, "%s.amber.baf" % dd.get_sample_name(paired.tumor_data))
    if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".pcf"):
        with file_transaction(paired.tumor_data, out_file) as tx_out_file:
            key = "germline_het_pon"
            het_bed = tz.get_in(["genome_resources", "variation", key], paired.tumor_data)
            cmd = ["AMBER"] + _get_jvm_opts(tx_out_file, paired.tumor_data) + \
                  ["-threads", dd.get_num_cores(paired.tumor_data),
                   "-tumor", dd.get_sample_name(paired.tumor_data),
                   "-tumor_bam", dd.get_align_bam(paired.tumor_data),
                   "-reference", dd.get_sample_name(paired.normal_data),
                   "-reference_bam", dd.get_align_bam(paired.normal_data),
                   "-ref_genome", dd.get_ref_file(paired.tumor_data),
                   "-bed", het_bed,
                   "-output_dir", os.path.dirname(tx_out_file)]
            if lenient:
                cmd += ["-max_het_af_percent", "1.0"]
            try:
                do.run(cmd, "PURPLE: AMBER baf generation")
            except subprocess.CalledProcessError as msg:
                if not lenient and _amber_allowed_errors(str(msg)):
                    return _run_amber(paired, work_dir, True)
            for f in os.listdir(os.path.dirname(tx_out_file)):
                if f != os.path.basename(tx_out_file):
                    shutil.move(os.path.join(os.path.dirname(tx_out_file), f),
                                os.path.join(amber_dir, f))
    return out_file
def gatk_rnaseq_calling(data):
    """use GATK to perform variant calling on RNA-seq data
    """
    broad_runner = broad.runner_from_config(dd.get_config(data))
    ref_file = dd.get_ref_file(data)
    split_bam = dd.get_split_bam(data)
    out_file = os.path.splitext(split_bam)[0] + ".gvcf"
    num_cores = dd.get_num_cores(data)
    if file_exists(out_file):
        data = dd.set_vrn_file(data, out_file)
        return data
    with file_transaction(out_file) as tx_out_file:
        params = ["-T", "HaplotypeCaller",
                  "-R", ref_file,
                  "-I", split_bam,
                  "-o", tx_out_file,
                  "-nct", str(num_cores),
                  "--emitRefConfidence", "GVCF",
                  "--variant_index_type", "LINEAR",
                  "--variant_index_parameter", "128000",
                  "-dontUseSoftClippedBases",
                  "-stand_call_conf", "20.0",
                  "-stand_emit_conf", "20.0"]
        broad_runner.run_gatk(params)
    data = dd.set_vrn_file(data, out_file)
    return data
def run(vcf, conf_fns, lua_fns, data, basepath=None, decomposed=False):
    """Annotate a VCF file using vcfanno (https://github.com/brentp/vcfanno)

    decomposed -- if set to true we'll convert allele based output into single values
      to match alleles and make compatible with vcf2db
      (https://github.com/quinlan-lab/vcf2db/issues/14)
    """
    conf_fns.sort(key=lambda x: os.path.basename(x) if x else "")
    lua_fns.sort(key=lambda x: os.path.basename(x) if x else "")
    ext = "-annotated-%s" % utils.splitext_plus(os.path.basename(conf_fns[0]))[0]
    if vcf.find(ext) > 0:
        out_file = vcf
    else:
        out_file = "%s%s.vcf.gz" % (utils.splitext_plus(vcf)[0], ext)
    if not utils.file_exists(out_file):
        vcfanno = config_utils.get_program("vcfanno", data)
        with file_transaction(out_file) as tx_out_file:
            conffn = _combine_files(conf_fns, out_file, data, basepath is None)
            luafn = _combine_files(lua_fns, out_file, data, False)
            luaflag = "-lua {0}".format(luafn) if luafn and utils.file_exists(luafn) else ""
            basepathflag = "-base-path {0}".format(basepath) if basepath else ""
            cores = dd.get_num_cores(data)
            post_ann = "sed -e 's/Number=A/Number=1/g' |" if decomposed else ""
            cmd = ("{vcfanno} -p {cores} {luaflag} {basepathflag} {conffn} {vcf} "
                   "| {post_ann} bgzip -c > {tx_out_file}")
            message = "Annotating {vcf} with vcfanno, using {conffn}".format(**locals())
            do.run(cmd.format(**locals()), message)
    return vcfutils.bgzip_and_index(out_file, data["config"])
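
# Illustration only: effect of the decomposed post-processing step above on an
# allele-based INFO definition before loading into vcf2db.
_example_info = '##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency">'
assert _example_info.replace("Number=A", "Number=1") == \
    '##INFO=<ID=AF,Number=1,Type=Float,Description="Allele Frequency">'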
def salmon_index(gtf_file, ref_file, data, out_dir):
    out_dir = os.path.join(out_dir, "index", sailfish.get_build_string(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data, out_dir)
    assert file_exists(gtf_fa), "%s was not found, exiting." % gtf_fa
    tmpdir = dd.get_tmp_dir(data)
    out_file = os.path.join(out_dir, "versionInfo.json")
    if file_exists(out_file):
        return out_dir
    files = dd.get_input_sequence_files(data)
    readlength = bam.fastq.estimate_read_length(files[0])
    if readlength % 2 == 0:
        readlength -= 1
    kmersize = min(readlength, 31)
    with file_transaction(data, out_dir) as tx_out_dir:
        cmd = "{salmon} index -k {kmersize} -p {num_cores} -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating Salmon index for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
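
# Worked example of the k-mer size selection in salmon_index (illustration only):
# the estimated read length is made odd, then capped at 31.
def _example_kmersize(readlength):
    if readlength % 2 == 0:
        readlength -= 1
    return min(readlength, 31)

assert _example_kmersize(76) == 31
assert _example_kmersize(25) == 25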
def _vardict_options_from_config(items, config, out_file, target=None, is_rnaseq=False):
    var2vcf_opts = []
    opts = ["-c 1", "-S 2", "-E 3", "-g 4"]
    # ["-z", "-F", "-c", "1", "-S", "2", "-E", "3", "-g", "4", "-x", "0",
    #  "-k", "3", "-r", "4", "-m", "8"]
    cores = dd.get_num_cores(items[0])
    if cores and cores > 1:
        opts += ["-th", str(cores)]
    # Disable SV calling for vardict, causes issues with regional analysis
    # by detecting SVs outside of target regions, which messes up merging
    # SV calling will be worked on as a separate step
    vardict_cl = get_vardict_command(items[0])
    version = programs.get_version_manifest(vardict_cl)
    if (vardict_cl and version and
          ((vardict_cl == "vardict-java" and LooseVersion(version) >= LooseVersion("1.5.5")) or
           (vardict_cl == "vardict" and LooseVersion(version) >= LooseVersion("2018.07.25")))):
        opts += ["--nosv"]
    if (vardict_cl and version and
          (vardict_cl == "vardict-java" and LooseVersion(version) >= LooseVersion("1.5.6"))):
        opts += ["--deldupvar"]
    # remove low mapping quality reads
    if not is_rnaseq:
        opts += ["-Q", "10"]
    # Remove QCfail reads, avoiding high depth repetitive regions
    opts += ["-F", "0x700"]
    resources = config_utils.get_resources("vardict", config)
    if resources.get("options"):
        opts += [str(x) for x in resources["options"]]
    resources = config_utils.get_resources("var2vcf", config)
    if resources.get("options"):
        var2vcf_opts += [str(x) for x in resources["options"]]
    if target and _is_bed_file(target):
        target = _enforce_max_region_size(target, items[0])
        opts += [target]  # this must be the last option
    return " ".join(opts), " ".join(var2vcf_opts)
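
# Illustration only of the version gating above: LooseVersion orders
# vardict-java style version strings as expected.
from distutils.version import LooseVersion
assert LooseVersion("1.5.6") >= LooseVersion("1.5.5")        # --nosv and --deldupvar added
assert not (LooseVersion("1.5.4") >= LooseVersion("1.5.5"))  # older versions keep SV calling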
def _fastp_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with fastp (https://github.com/OpenGene/fastp)
    """
    report_file = os.path.join(out_dir, "%s-report.json" %
                               utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [os.path.join(out_dir, "%s-trimmed.fq.gz" %
                              utils.splitext_plus(os.path.basename(x))[0]) for x in fastq_files]
    if not utils.file_exists(out_files[0]):
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report = tx_out[0]
            tx_out_files = tx_out[1:]
            cmd = ["fastp", "--thread", dd.get_num_cores(data)]
            if dd.get_quality_format(data).lower() == "illumina":
                cmd += ["--phred64"]
            for i, (inf, outf) in enumerate(zip(fastq_files, tx_out_files)):
                if i == 0:
                    cmd += ["-i", inf, "-o", outf]
                else:
                    cmd += ["-I", inf, "-O", outf]
            cmd += ["--cut_by_quality3", "--cut_mean_quality", "5",
                    "--length_required", str(dd.get_min_read_length(data)),
                    "--disable_quality_filtering"]
            if "polyx" in dd.get_adapters(data):
                cmd += ["--trim_poly_x", "--poly_x_min_len", "8"]
            if "polyx" in dd.get_adapters(data) or "polyg" in dd.get_adapters(data):
                cmd += ["--trim_poly_g", "--poly_g_min_len", "8"]
            for a in adapters:
                cmd += ["--adapter_sequence", a]
            if not adapters:
                cmd += ["--disable_adapter_trimming"]
            # write the JSON report into the transactional path so it is moved
            # to report_file when the transaction completes
            cmd += ["--json", tx_report, "--report_title", dd.get_sample_name(data)]
            do.run(cmd, "Trimming with fastp: %s" % dd.get_sample_name(data))
    return out_files, report_file
def priority_total_coverage(data):
    """calculate coverage at depth 20 in the priority regions
    """
    bed_file = dd.get_priority_regions(data)
    if not bed_file:
        return data
    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if file_exists(out_file):
        data['priority_total_coverage'] = os.path.abspath(out_file)
        return data
    nthreads = dd.get_num_cores(data)
    in_bam = dd.get_work_bam(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    with file_transaction(out_file) as tx_out_file:
        cmd = ("{sambamba} depth region -t {nthreads} -L {bed_file} "
               "-F \"not unmapped\" "
               "-T 20 {in_bam} -o {tx_out_file}")
        message = "Calculating coverage of {bed_file} regions in {in_bam}"
        do.run(cmd.format(**locals()), message.format(**locals()))
    data['priority_total_coverage'] = os.path.abspath(out_file)
    return data
def trim_adapters(data):
    fq1, fq2 = dd.get_input_sequence_files(data)
    skewer = config_utils.get_program("skewer", data, default="skewer")
    nthreads = dd.get_num_cores(data)
    samplename = dd.get_sample_name(data)
    out_dir = os.path.join(dd.get_work_dir(data), "trimmed", samplename)
    of1 = os.path.join(out_dir, samplename + "-trimmed-pair1.fastq.gz")
    of2 = os.path.join(out_dir, samplename + "-trimmed-pair2.fastq.gz")
    of2 = of2 if fq2 else None
    if fq1 and fq2:
        if file_exists(of1) and file_exists(of2):
            return of1, of2
    else:
        if file_exists(of1):
            return of1, None
    safe_makedir(out_dir)
    file_string = "{fq1} {fq2} " if fq2 else "{fq1} "
    fw_cmd = _fw_command(data)
    rv_cmd = _rv_command(data)
    mode = "tail" if not fq2 else "pe"
    cmd = ("{skewer} --min 25 --threads {nthreads} -q 5 "
           "{fw_cmd} "
           "{rv_cmd} "
           "-m {mode} "
           "--compress --output {out_stem} ") + file_string
    with file_transaction(out_dir) as tx_out_dir:
        safe_makedir(tx_out_dir)
        out_stem = os.path.join(tx_out_dir, samplename)
        message = "Trimming {fq1}, {fq2} with skewer.".format(**locals())
        do.run(cmd.format(**locals()), message)
    return of1, of2
def coverage(data):
    """Calculate coverage at different completeness cutoffs for each region
    in the coverage option.
    """
    bed_file = dd.get_coverage(data)
    if not bed_file:
        return data
    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    with chdir(work_dir):
        in_bam = data['work_bam']
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        parse_file = os.path.join(sample + "_coverage.bed")
        parse_total_file = os.path.join(sample + "_cov_total.tsv")
        cores = dd.get_num_cores(data)
        if not file_exists(parse_file):
            with file_transaction(parse_file) as out_tx:
                cmd = ("sambamba depth region -F \"not unmapped\" -t {cores} -C 1000 "
                       "-T 1 -T 5 -T 10 -T 20 -T 40 -T 50 -T 60 -T 70 -T 80 -T 100 "
                       "-L {bed_file} {in_bam} | sed 's/# chrom/chrom/' > {out_tx}")
                do.run(cmd.format(**locals()), "Run coverage for {}".format(sample))
        parse_file = _add_high_covered_regions(parse_file, bed_file, sample)
        _calculate_percentiles(parse_file, sample)
        data['coverage'] = os.path.abspath(parse_file)
    return data
def run(items):
    """Run MetaSV if we have enough supported callers, adding output to the set of calls.
    """
    assert len(items) == 1, "Expect one input to MetaSV ensemble calling"
    data = items[0]
    work_dir = _sv_workdir(data)
    out_file = os.path.join(work_dir, "variants.vcf.gz")
    cmd = _get_cmd() + ["--sample", dd.get_sample_name(data), "--reference", dd.get_ref_file(data),
                        "--bam", dd.get_align_bam(data), "--outdir", work_dir]
    methods = []
    for call in data.get("sv", []):
        if call["variantcaller"] in SUPPORTED and call["variantcaller"] not in methods:
            methods.append(call["variantcaller"])
            cmd += ["--%s_vcf" % call["variantcaller"], call.get("vcf_file", call["vrn_file"])]
    if len(methods) >= MIN_CALLERS:
        if not utils.file_exists(out_file):
            tx_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
            ins_stats = shared.calc_paired_insert_stats_save(dd.get_align_bam(data),
                                                             os.path.join(tx_work_dir, "insert-stats.yaml"))
            cmd += ["--workdir", tx_work_dir, "--num_threads", str(dd.get_num_cores(data))]
            cmd += ["--spades", utils.which("spades.py"), "--age", utils.which("age_align")]
            cmd += ["--assembly_max_tools=1", "--assembly_pad=500"]
            cmd += ["--boost_sc", "--isize_mean", ins_stats["mean"], "--isize_sd", ins_stats["std"]]
            do.run(cmd, "Combine variant calls with MetaSV")
        filters = ("(NUM_SVTOOLS = 1 && ABS(SVLEN)>50000) || "
                   "(NUM_SVTOOLS = 1 && ABS(SVLEN)<4000 && BA_FLANK_PERCENT>80) || "
                   "(NUM_SVTOOLS = 1 && ABS(SVLEN)<4000 && BA_NUM_GOOD_REC=0) || "
                   "(ABS(SVLEN)<4000 && BA_NUM_GOOD_REC>2)")
        filter_file = vfilter.hard_w_expression(out_file, filters, data, name="ReassemblyStats",
                                                limit_regions=None)
        effects_vcf, _ = effects.add_to_vcf(filter_file, data, "snpeff")
        data["sv"].append({"variantcaller": "metasv",
                           "vrn_file": effects_vcf or filter_file})
    return [data]
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    paired = True if pair_file else False
    hisat2 = config_utils.get_program("hisat2", data)
    num_cores = dd.get_num_cores(data)
    quality_flag = _get_quality_flag(data)
    stranded_flag = _get_stranded_flag(data, paired)
    rg_flags = _get_rg_flags(names)
    out_file = os.path.join(align_dir, dd.get_lane(data)) + ".bam"
    if file_exists(out_file):
        data = dd.set_work_bam(data, out_file)
        return data
    cmd = ("{hisat2} -x {ref_file} -p {num_cores} {quality_flag} {stranded_flag} "
           "{rg_flags} ")
    if paired:
        cmd += "-1 {fastq_file} -2 {pair_file} "
    else:
        cmd += "-U {fastq_file} "
    if dd.get_analysis(data).lower() == "smallrna-seq":
        cmd += "-k 1000 "
    # if assembling transcripts, set flags that cufflinks can use
    if dd.get_assemble_transcripts(data):
        cmd += "--dta-cufflinks "
    if dd.get_analysis(data).lower() == "rna-seq":
        gtf_file = dd.get_gtf_file(data)
        splicesites = os.path.join(os.path.dirname(gtf_file),
                                   "ref-transcripts-splicesites.txt")
        cmd += "--known-splicesite-infile {splicesites} "
    message = "Aligning %s and %s with hisat2." % (fastq_file, pair_file)
    with file_transaction(out_file) as tx_out_file:
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_out_file)
        do.run(cmd.format(**locals()), message)
    data = dd.set_work_bam(data, out_file)
    return data
def variants(data):
    if "vrn_file" not in data:
        return data
    if not dd.get_coverage(data):
        return data
    in_vcf = data['vrn_file']
    sample = dd.get_sample_name(data)
    cg_file = os.path.join(sample + "_with-gc.vcf.gz")
    parse_file = os.path.join(sample + "_gc-depth-parse.tsv")
    qc_file = os.path.join(sample + "_bcbio_variants.txt")
    work_dir = os.path.join(dd.get_work_dir(data), "report", "variants")
    with chdir(work_dir):
        if file_exists(qc_file):
            return data
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        ref_file = dd.get_ref_file(data)
        assert ref_file, "Need the reference genome fasta file."
        bed_file = dd.get_variant_regions(data)
        num_cores = dd.get_num_cores(data)
        broad_runner = broad.runner_from_config_safe(data["config"])
        if in_bam and broad_runner and broad_runner.has_gatk():
            if not file_exists(cg_file):
                with file_transaction(cg_file) as tx_out:
                    params = ["-T", "VariantAnnotator",
                              "-R", ref_file,
                              "-L", bed_file,
                              "-I", in_bam,
                              "-A", "GCContent",
                              "-A", "Coverage",
                              "--variant", in_vcf,
                              "--out", tx_out]
                    broad_runner.run_gatk(params)
            cg_file = vcfutils.bgzip_and_index(cg_file, data["config"])
            if not file_exists(parse_file):
                with file_transaction(parse_file) as out_tx:
                    with open(out_tx, 'w') as out_handle:
                        out_handle.write("CG\tdepth\tsample\n")
                    cmd = ("bcftools query -s {sample} -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R "
                           "{bed_file} {cg_file} >> {out_tx}")
                    do.run(cmd.format(**locals()),
                           "Calculating GC content and depth for %s" % in_vcf)
                    logger.debug('parsing coverage: %s' % sample)
            if not file_exists(qc_file):
                # This file will be copied to the final directory
                _summary_variants(parse_file, qc_file)
            if file_exists(qc_file) and file_exists(parse_file):
                os.remove(cg_file)
    return data
def merge_bam_files(bam_files, work_dir, data, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.
    """
    out_file = _merge_outfile_fname(out_file, bam_files, work_dir, batch)
    if not utils.file_exists(out_file):
        if len(bam_files) == 1 and bam.bam_already_sorted(bam_files[0], data["config"], "coordinate"):
            with file_transaction(data, out_file) as tx_out_file:
                _create_merge_filelist(bam_files, tx_out_file, data["config"])
            out_file = bam_files[0]
            samtools = config_utils.get_program("samtools", data["config"])
            do.run('{} quickcheck -v {}'.format(samtools, out_file),
                   "Check for valid merged BAM after transfer")
        else:
            with tx_tmpdir(data) as tmpdir:
                with utils.chdir(tmpdir):
                    with file_transaction(data, out_file) as tx_out_file:
                        tx_bam_file_list = _create_merge_filelist(bam_files, tx_out_file, data["config"])
                        samtools = config_utils.get_program("samtools", data["config"])
                        resources = config_utils.get_resources("samtools", data["config"])
                        num_cores = dd.get_num_cores(data)
                        # Aim for 3.5Gb/core memory for BAM merging
                        num_cores = config_utils.adjust_cores_to_mb_target(
                            3500, resources.get("memory", "2G"), num_cores)
                        max_mem = config_utils.adjust_memory(resources.get("memory", "1G"),
                                                             2, "decrease").upper()
                        if dd.get_mark_duplicates(data):
                            cmd = _biobambam_merge_dedup_maxcov(data)
                        else:
                            cmd = _biobambam_merge_maxcov(data)
                        do.run(cmd.format(**locals()),
                               "Merge bam files to %s" % os.path.basename(out_file), None)
                        do.run('{} quickcheck -v {}'.format(samtools, tx_out_file),
                               "Check for valid merged BAM")
            do.run('{} quickcheck -v {}'.format(samtools, out_file),
                   "Check for valid merged BAM after transfer")
            _finalize_merge(out_file, bam_files, data["config"])
        bam.index(out_file, data["config"])
    return out_file
def filter_barcodes(data):
    fq1 = dd.get_input_sequence_files(data)[0]
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    correction = dd.get_cellular_barcode_correction(data)
    bc = get_cellular_barcodes(data)
    if not bc:
        return [[data]]
    bc1 = None
    bc2 = None
    bc3 = None
    if isinstance(bc, basestring):
        bc1 = bc
    if len(bc) == 1:
        bc1 = bc[0]
    if len(bc) > 1:
        bc1 = bc[0]
        bc2 = bc[1]
    if len(bc) == 3:
        bc3 = bc[2]
    out_base = dd.get_sample_name(data) + ".filtered.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]
    ncores = dd.get_num_cores(data)
    cmd = "{umis} cb_filter --cores {ncores} "
    if bc1:
        cmd += "--bc1 {bc1} "
        if correction:
            cmd += "--nedit {correction} "
    if bc2:
        cmd += "--bc2 {bc2} "
    if bc3:
        cmd += "--bc3 {bc3} "
    fq1_cmd = "{fq1} " if not is_gzipped(fq1) else "<(gzip -cd {fq1}) "
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    cmd += "{fq1_cmd} | gzip > {tx_out_file}"
    sample_dir = os.path.join(umi_dir, dd.get_sample_name(data))
    safe_makedir(sample_dir)
    umis = config_utils.get_program("umis", data, default="umis")
    with file_transaction(out_file) as tx_out_file:
        message = "Filtering by cellular barcode."
        do.run(cmd.format(**locals()), message)
    data["files"] = [out_file]
    return [[data]]
def _gatk_apply_bqsr(data):
    """Parallel BQSR support for GATK4.

    Normalized qualities to 3 bin outputs at 10, 20 and 30 based on pipeline standard
    recommendations, which will help with output file sizes:
    https://github.com/CCDG/Pipeline-Standardization/blob/master/PipelineStandard.md#base-quality-score-binning-scheme
    https://github.com/gatk-workflows/broad-prod-wgs-germline-snps-indels/blob/5585cdf7877104f2c61b2720ddfe7235f2fad577/PairedEndSingleSampleWf.gatk4.0.wdl#L1081

    spark host and timeout settings help deal with runs on restricted systems
    where we encounter network and timeout errors
    """
    in_file = dd.get_align_bam(data) or dd.get_work_bam(data)
    out_file = os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data),
                            "%s-recal.bam" % utils.splitext_plus(os.path.basename(in_file))[0])
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            broad_runner = broad.runner_from_config(data["config"])
            gatk_type = broad_runner.gatk_type()
            cores = dd.get_num_cores(data)
            if gatk_type == "gatk4":
                resources = config_utils.get_resources("gatk-spark", data["config"])
                spark_opts = [str(x) for x in resources.get("options", [])]
                params = ["-T", "ApplyBQSRSpark",
                          "--input", in_file, "--output", tx_out_file,
                          "--bqsr-recal-file", data["prep_recal"],
                          "--static-quantized-quals", "10",
                          "--static-quantized-quals", "20",
                          "--static-quantized-quals", "30"]
                if spark_opts:
                    params += spark_opts
                else:
                    params += ["--spark-master", "local[%s]" % cores,
                               "--conf", "spark.local.dir=%s" % os.path.dirname(tx_out_file),
                               "--conf", "spark.driver.host=localhost",
                               "--conf", "spark.network.timeout=800"]
                # Avoid problems with StreamClosedErrors on GATK 4.1+
                # https://github.com/bcbio/bcbio-nextgen/issues/2806#issuecomment-492504497
                params += ["--create-output-bam-index", "false"]
            else:
                params = ["-T", "PrintReads", "-R", dd.get_ref_file(data), "-I", in_file,
                          "-BQSR", data["prep_recal"], "-o", tx_out_file]
            # Avoid problems with intel deflater for GATK 3.8 and GATK4
            # https://github.com/bcbio/bcbio-nextgen/issues/2145#issuecomment-343095357
            if gatk_type == "gatk4":
                params += ["--jdk-deflater", "--jdk-inflater"]
            elif LooseVersion(broad_runner.gatk_major_version()) > LooseVersion("3.7"):
                params += ["-jdk_deflater", "-jdk_inflater"]
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
            broad_runner.run_gatk(params, os.path.dirname(tx_out_file), memscale=memscale,
                                  parallel_gc=True)
    bam.index(out_file, data["config"])
    return out_file
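
# Hypothetical resources entry (illustration only; the key names here are an
# assumption) showing the shape config_utils.get_resources("gatk-spark",
# data["config"]) reads from: when "options" is present it replaces the
# local[] spark defaults built above.
_example_config = {"resources": {"gatk-spark": {"options": ["--spark-master", "spark://scheduler:7077"]}}}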
def _rnaseq_qualimap_cmd(data, bam_file, out_dir, gtf_file=None, single_end=None,
                         library="non-strand-specific"):
    """Create command lines for qualimap.
    """
    config = data["config"]
    qualimap = config_utils.get_program("qualimap", config)
    resources = config_utils.get_resources("qualimap", config)
    num_cores = resources.get("cores", dd.get_num_cores(data))
    max_mem = config_utils.adjust_memory(resources.get("memory", "2G"), num_cores)
    export = "%s%s" % (utils.java_freetype_fix(), utils.local_path_export())
    cmd = ("unset DISPLAY && {export} {qualimap} rnaseq -outdir {out_dir} "
           "-a proportional -bam {bam_file} -p {library} "
           "-gtf {gtf_file} --java-mem-size={max_mem}").format(**locals())
    return cmd
def cufflinks_merge(*samples):
    to_merge = filter_missing([dd.get_assembled_gtf(data) for data in
                               dd.sample_data_iterator(samples)])
    data = samples[0][0]
    bam_file = dd.get_work_bam(data)
    ref_file = dd.get_sam_ref(data)
    gtf_file = dd.get_gtf_file(data)
    out_dir = os.path.join(dd.get_work_dir(data), "assembly")
    num_cores = dd.get_num_cores(data)
    merged_gtf = cufflinks.merge(to_merge, ref_file, gtf_file, num_cores, samples[0][0])
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_assembled_gtf(data, merged_gtf)
        updated_samples.append([data])
    return updated_samples
def salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, ref_file, data):
    samplename = dd.get_sample_name(data)
    quant_dir = os.path.join(salmon_dir, "quant")
    safe_makedir(salmon_dir)
    out_file = os.path.join(quant_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    libtype = sailfish._libtype_string(fq1, fq2, strandedness)
    index = salmon_index(gtf_file, ref_file, data, os.path.dirname(salmon_dir))
    resources = config_utils.get_resources("salmon", dd.get_config(data))
    params = ""
    if resources.get("options") is not None:
        params = " ".join([str(x) for x in resources.get("options", [])])
    cmd = ("{salmon} quant {libtype} -i {index} -p {num_cores} "
           "--gcBias "
           "-o {tx_out_dir} {params} ")
    fq1_cmd = "<(cat {fq1})" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "<(cat {fq2})" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    # skip --useVBOpt for now, it can cause segfaults
    cmd += "--numBootstraps 30 "
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = ("Quantifying transcripts in %s and %s with Salmon."
                   % (fq1, fq2))
        do.run(cmd.format(**locals()), message, None)
        sailfish.sleuthify_sailfish(tx_out_dir)
    return out_file
def _call_variants(example_dir, data, out_file):
    """Call variants from prepared pileup examples, creating tensorflow record file.
    """
    tf_out_file = "%s-tfrecord.gz" % utils.splitext_plus(out_file)[0]
    if not utils.file_exists(tf_out_file):
        with file_transaction(data, tf_out_file) as tx_out_file:
            cmd = ["dv_call_variants.py", "--cores", dd.get_num_cores(data),
                   "--outfile", tx_out_file, "--examples", example_dir,
                   "--sample", dd.get_sample_name(data)]
            do.run(cmd, "DeepVariant call_variants %s" % dd.get_sample_name(data))
    return tf_out_file
def _run_with_memory_scaling(params, tx_out_file, data, ld_preload=False):
    num_cores = dd.get_num_cores(data)
    memscale = {"magnitude": 0.9 * num_cores, "direction": "increase"} if num_cores > 1 else None
    # Ignore tools_off: [gatk4], since it doesn't apply to GATK CNV calling
    config = utils.deepish_copy(data["config"])
    if "gatk4" in dd.get_tools_off({"config": config}):
        config["algorithm"]["tools_off"].remove("gatk4")
    broad_runner = broad.runner_from_config(config)
    broad_runner.run_gatk(params, os.path.dirname(tx_out_file), memscale=memscale,
                          ld_preload=ld_preload)
def sailfish_index(gtf_file, ref_file, data, out_dir):
    out_dir = os.path.join(out_dir, "index", dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    sailfish = config_utils.get_program("sailfish", data["config"])
    num_cores = dd.get_num_cores(data)
    gtf_fa = _create_combined_fasta(data, out_dir)
    tmpdir = dd.get_tmp_dir(data)
    if file_exists(os.path.join(out_dir, "versionInfo.json")):
        return out_dir
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{sailfish} index -p {num_cores} -t {gtf_fa} -o {tx_out_dir} -k 25"
        message = "Creating sailfish index for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
def _do_run(paired):
    """Perform Battenberg calling with the paired dataset.

    This purposely does not use a temporary directory for the output
    since Battenberg does smart restarts.
    """
    work_dir = _sv_workdir(paired.tumor_data)
    out = _get_battenberg_out(paired, work_dir)
    ignore_file = os.path.join(work_dir, "ignore_chromosomes.txt")
    if len(_missing_files(out)) > 0:
        ref_file = dd.get_ref_file(paired.tumor_data)
        bat_datadir = os.path.normpath(os.path.join(os.path.dirname(ref_file), os.pardir, "battenberg"))
        ignore_file, gl_file = _make_ignore_file(work_dir, ref_file, ignore_file,
                                                 os.path.join(bat_datadir, "impute", "impute_info.txt"))
        local_sitelib = os.path.join(install.get_defaults().get("tooldir", "/usr/local"),
                                     "lib", "R", "site-library")
        tumor_bam = paired.tumor_bam
        normal_bam = paired.normal_bam
        platform = dd.get_platform(paired.tumor_data)
        genome_build = paired.tumor_data["genome_build"]
        # scale cores to avoid over-using memory during imputation
        cores = max(1, int(dd.get_num_cores(paired.tumor_data) * 0.5))
        gender = {"male": "XY", "female": "XX", "unknown": "L"}.get(
            population.get_gender(paired.tumor_data))
        if gender == "L":
            gender_str = "-ge %s -gl %s" % (gender, gl_file)
        else:
            gender_str = "-ge %s" % (gender)
        r_export_cmd = utils.get_R_exports()
        cmd = ("export R_LIBS_USER={local_sitelib} && {r_export_cmd} && "
               "battenberg.pl -t {cores} -o {work_dir} -r {ref_file}.fai "
               "-tb {tumor_bam} -nb {normal_bam} -e {bat_datadir}/impute/impute_info.txt "
               "-u {bat_datadir}/1000genomesloci -c {bat_datadir}/probloci.txt "
               "-ig {ignore_file} {gender_str} "
               "-assembly {genome_build} -species Human -platform {platform}")
        do.run(cmd.format(**locals()), "Battenberg CNV calling")
    assert len(_missing_files(out)) == 0, "Missing Battenberg output: %s" % _missing_files(out)
    out["plot"] = _get_battenberg_out_plots(paired, work_dir)
    out["ignore"] = ignore_file
    return out
def sailfish_index(gtf_file, ref_file, data, build):
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "sailfish", "index", build)
    sailfish = config_utils.get_program("sailfish", data["config"])
    num_cores = dd.get_num_cores(data)
    gtf_fa = create_combined_fasta(data)
    if file_exists(os.path.join(out_dir, "versionInfo.json")):
        return out_dir
    with file_transaction(data, out_dir) as tx_out_dir:
        fq1, _ = dd.get_input_sequence_files(data)
        kmersize = pick_kmersize(fq1)
        cmd = ("{sailfish} index -p {num_cores} -t {gtf_fa} -o {tx_out_dir} "
               "-k {kmersize}")
        message = "Creating sailfish index for {gtf_fa} with {kmersize} bp kmers."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
def salmon_index(gtf_file, ref_file, data, out_dir):
    out_dir = os.path.join(out_dir, "index", dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    gtf_fa = sailfish.create_combined_fasta(data, out_dir)
    tmpdir = dd.get_tmp_dir(data)
    out_file = os.path.join(out_dir, "versionInfo.json")
    if file_exists(out_file):
        return out_dir
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{salmon} index -k 31 -p {num_cores} -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating Salmon index for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
def filter_primary(bam_file, data):
    """Filter reads to primary only BAM.

    Removes:
      - not primary alignment (0x100) 256
      - supplementary alignment (0x800) 2048
    """
    stem, ext = os.path.splitext(bam_file)
    out_file = stem + ".primary" + ext
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            cores = dd.get_num_cores(data)
            cmd = ("samtools view -@ {cores} -F 2304 -b {bam_file} > {tx_out_file}")
            do.run(cmd.format(**locals()),
                   ("Filtering primary alignments in %s." % os.path.basename(bam_file)))
    return out_file
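
# Sanity check (illustration only, not called by the pipeline): the -F 2304
# filter above is the bitwise OR of the two flags listed in the docstring.
_NOT_PRIMARY = 0x100      # 256
_SUPPLEMENTARY = 0x800    # 2048
assert _NOT_PRIMARY | _SUPPLEMENTARY == 2304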
def stringtie_merge(*samples):
    to_merge = filter_missing(flatten([dd.get_assembled_gtf(data)
                                       for data in dd.sample_data_iterator(samples)]))
    data = samples[0][0]
    ref_file = dd.get_sam_ref(data)
    gtf_file = dd.get_gtf_file(data)
    num_cores = dd.get_num_cores(data)
    merged_gtf = stringtie.merge(to_merge, ref_file, gtf_file, num_cores, data)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_merged_gtf(data, merged_gtf)
        updated_samples.append([data])
    return updated_samples

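# stringtie.merge() is defined elsewhere in bcbio; as a rough illustration of the
# underlying tool call, a standalone `stringtie --merge` run looks like the sketch
# below. File names and the thread count are hypothetical placeholders.
import subprocess

def example_stringtie_merge(gtf_files, ref_gtf="ref-transcripts.gtf",
                            out_gtf="merged.gtf", cores=4):
    """Merge per-sample assembled GTFs into one annotation (illustrative only)."""
    cmd = ["stringtie", "--merge", "-o", out_gtf, "-G", ref_gtf,
           "-p", str(cores)] + list(gtf_files)
    subprocess.check_call(cmd)
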
def calculate(bam_file, data):
    """Calculate coverage in parallel using samtools depth through goleft.

    samtools depth removes duplicates and secondary reads from the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;
    """
    params = {"window_size": 5000, "parallel_window_size": 1e5,
              "min": dd.get_coverage_depth_min(data), "high_multiplier": 20}
    prefix = os.path.join(
        utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data))),
        "%s-coverage" % (dd.get_sample_name(data)))
    depth_file = prefix + ".depth.bed"
    callable_file = prefix + ".callable.bed"
    variant_regions = dd.get_variant_regions_merged(data)
    variant_regions_avg_cov = get_average_coverage(data, bam_file, variant_regions, "variant_regions")
    if not utils.file_uptodate(callable_file, bam_file):
        ref_file = dd.get_ref_file(data)
        cmd = ["goleft", "depth", "--q", "1", "--mincov", str(params["min"]),
               "--reference", ref_file, "--processes", str(dd.get_num_cores(data)), "--ordered"]
        max_depth = _get_max_depth(variant_regions_avg_cov, params, data)
        if max_depth:
            cmd += ["--maxmeandepth", str(int(max_depth))]
        with file_transaction(data, depth_file) as tx_depth_file:
            with utils.chdir(os.path.dirname(tx_depth_file)):
                tx_callable_file = tx_depth_file.replace(".depth.bed", ".callable.bed")
                prefix = tx_depth_file.replace(".depth.bed", "")
                cmd += ["--prefix", prefix, bam_file]
                bcbio_env = utils.get_bcbio_env()
                msg = "Calculate coverage: %s" % dd.get_sample_name(data)
                do.run(cmd, msg, env=bcbio_env)
                shutil.move(tx_callable_file, callable_file)
    final_callable = _subset_to_variant_regions(callable_file, variant_regions, data)
    return depth_file, final_callable, _extract_highdepth(final_callable, data), variant_regions_avg_cov

def _run_titancna(cn_file, het_file, ploidy, num_clusters, work_dir, data):
    """Run titanCNA wrapper script on given ploidy and clusters.
    """
    sample = dd.get_sample_name(data)
    cores = dd.get_num_cores(data)
    export_cmd = utils.get_R_exports()
    ploidy_dir = utils.safe_makedir(os.path.join(work_dir, "run_ploidy%s" % ploidy))
    cluster_dir = "%s_cluster%02d" % (sample, num_clusters)
    out_dir = os.path.join(ploidy_dir, cluster_dir)
    if not utils.file_uptodate(out_dir + ".titan.txt", cn_file):
        with tx_tmpdir(data) as tmp_dir:
            with utils.chdir(tmp_dir):
                cmd = ("{export_cmd} && titanCNA.R --id {sample} --hetFile {het_file} --cnFile {cn_file} "
                       "--numClusters {num_clusters} --ploidy {ploidy} --numCores {cores} --outDir {tmp_dir} "
                       "--libdir None")
                chroms = ["'%s'" % c.name.replace("chr", "")
                          for c in ref.file_contigs(dd.get_ref_file(data))
                          if chromhacks.is_autosomal_or_x(c.name)]
                if "'X'" not in chroms:
                    chroms += ["'X'"]
                # Use UCSC style naming for human builds to support BSgenome
                genome_build = ("hg19" if dd.get_genome_build(data) in ["GRCh37", "hg19"]
                                else dd.get_genome_build(data))
                cmd += """ --chrs "c(%s)" """ % ",".join(chroms)
                cmd += " --genomeBuild {genome_build}"
                if data["genome_build"] in ("hg19", "hg38"):
                    cmd += " --genomeStyle UCSC"
                if data["genome_build"] in ["hg38"]:
                    data_dir = os.path.normpath(os.path.join(
                        os.path.dirname(os.path.realpath(os.path.join(
                            os.path.dirname(utils.Rscript_cmd()), "titanCNA.R"))),
                        os.pardir, os.pardir, "data"))
                    cytoband_file = os.path.join(data_dir, "cytoBand_hg38.txt")
                    assert os.path.exists(cytoband_file), cytoband_file
                    cmd += " --cytobandFile %s" % cytoband_file
                # TitanCNA's model is influenced by the variance in read coverage data
                # and data type: set reasonable defaults for non-WGS runs
                # (see https://github.com/gavinha/TitanCNA/tree/master/scripts/R_scripts)
                if dd.get_coverage_interval(data) != "genome":
                    cmd += " --alphaK=2500 --alphaKHigh=2500"
                do.run(cmd.format(**locals()),
                       "TitanCNA CNV detection: ploidy %s, cluster %s" % (ploidy, num_clusters))
            for fname in glob.glob(os.path.join(tmp_dir, cluster_dir + "*")):
                shutil.move(fname, ploidy_dir)
            if os.path.exists(os.path.join(tmp_dir, "Rplots.pdf")):
                shutil.move(os.path.join(tmp_dir, "Rplots.pdf"),
                            os.path.join(ploidy_dir, "%s.Rplots.pdf" % cluster_dir))
    return ploidy_dir

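# The titanCNA.R wrapper above passes chromosomes as an R vector literal. A small
# standalone sketch of that string construction, using a hypothetical contig list:
def example_titan_chrs(contigs=("1", "2", "3", "X")):
    """Build the --chrs argument passed to titanCNA.R (illustrative only)."""
    chroms = ["'%s'" % c.replace("chr", "") for c in contigs]
    if "'X'" not in chroms:
        chroms += ["'X'"]
    return ' --chrs "c(%s)" ' % ",".join(chroms)

# example_titan_chrs() produces: --chrs "c('1','2','3','X')"
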
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Run evaluation of a caller against the truth set using rtg vcfeval.
    """
    out_dir = os.path.join(base_dir, "rtg")
    if not utils.file_exists(os.path.join(out_dir, "done")):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        vrn_file, rm_file, interval_bed = _prepare_inputs(vrn_file, rm_file, rm_interval_file,
                                                          base_dir, data)
        rtg_ref = tz.get_in(["reference", "rtg"], data)
        assert rtg_ref and os.path.exists(rtg_ref), \
            ("Did not find rtg indexed reference file for validation:\n%s\n"
             "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        # handle CWL where we have a reference to a single file in the RTG directory
        if os.path.isfile(rtg_ref):
            rtg_ref = os.path.dirname(rtg_ref)
        # get core and memory usage from standard configuration
        threads = min(dd.get_num_cores(data), 6)
        resources = config_utils.get_resources("rtg", data["config"])
        memory = config_utils.adjust_opts(resources.get("jvm_opts", ["-Xms500m", "-Xmx1500m"]),
                                          {"algorithm": {"memory_adjust": {"magnitude": threads,
                                                                           "direction": "increase"}}})
        jvm_stack = [x for x in memory if x.startswith("-Xms")]
        jvm_mem = [x for x in memory if x.startswith("-Xmx")]
        jvm_stack = jvm_stack[0] if len(jvm_stack) > 0 else "-Xms500m"
        jvm_mem = jvm_mem[0].replace("-Xmx", "") if len(jvm_mem) > 0 else "3g"
        cmd = ["rtg", "vcfeval", "--threads", str(threads),
               "-b", rm_file, "--bed-regions", interval_bed,
               "-c", vrn_file, "-t", rtg_ref, "-o", out_dir]
        rm_samples = vcfutils.get_samples(rm_file)
        if len(rm_samples) > 1 and dd.get_sample_name(data) in rm_samples:
            cmd += ["--sample=%s" % dd.get_sample_name(data)]
        cmd += ["--vcf-score-field='%s'" % (_pick_best_quality_score(vrn_file))]
        mem_export = "%s export RTG_JAVA_OPTS='%s' && export RTG_MEM=%s" % (
            utils.local_path_export(), jvm_stack, jvm_mem)
        cmd = mem_export + " && " + " ".join(cmd)
        do.run(cmd, "Validate calls using rtg vcfeval", data)
    out = {"fp": os.path.join(out_dir, "fp.vcf.gz"),
           "fn": os.path.join(out_dir, "fn.vcf.gz")}
    tp_calls = os.path.join(out_dir, "tp.vcf.gz")
    tp_baseline = os.path.join(out_dir, "tp-baseline.vcf.gz")
    if os.path.exists(tp_baseline):
        out["tp"] = tp_baseline
        out["tp-calls"] = tp_calls
    else:
        out["tp"] = tp_calls
    return out

def _vardict_options_from_config(items, config, out_file, target=None, is_rnaseq=False):
    var2vcf_opts = []
    opts = ["-c 1", "-S 2", "-E 3", "-g 4"]
    # ["-z", "-F", "-c", "1", "-S", "2", "-E", "3", "-g", "4", "-x", "0",
    #  "-k", "3", "-r", "4", "-m", "8"]
    cores = dd.get_num_cores(items[0])
    if cores and cores > 1:
        opts += ["-th", str(cores)]
    # Disable SV calling for vardict, causes issues with regional analysis
    # by detecting SVs outside of target regions, which messes up merging
    # SV calling will be worked on as a separate step
    # use tools_on: vardict_sv to turn sv calling in vardict on (experimental)
    tools_on = dd.get_in_samples(items, dd.get_tools_on)
    vardict_sv_on = tools_on and "vardict_sv" in tools_on
    vardict_cl = get_vardict_command(items[0])
    version = programs.get_version_manifest(vardict_cl)
    # turn off structural variants
    if ((vardict_cl and version and
         ((vardict_cl == "vardict-java" and LooseVersion(version) >= LooseVersion("1.5.5")) or
          (vardict_cl == "vardict"))) and not vardict_sv_on):
        opts += ["--nosv"]
    if (vardict_cl and version and
            (vardict_cl == "vardict-java" and LooseVersion(version) >= LooseVersion("1.5.6"))):
        opts += ["--deldupvar"]
    # remove low mapping quality reads
    if not is_rnaseq:
        opts += ["-Q", "10"]
    # Remove QCfail reads, avoiding high depth repetitive regions
    opts += ["-F", "0x700"]
    resources = config_utils.get_resources("vardict", config)
    if resources.get("options"):
        opts += [str(x) for x in resources["options"]]
    resources = config_utils.get_resources("var2vcf", config)
    if resources.get("options"):
        var2vcf_opts += [str(x) for x in resources["options"]]
    if target and _is_bed_file(target):
        target = _enforce_max_region_size(target, items[0])
        opts += [target]  # this must be the last option
    _add_freq_options(config, opts, var2vcf_opts)
    return " ".join(opts), " ".join(var2vcf_opts)

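# The -F 0x700 read filter above combines three SAM flags; verifying the mask:
SECONDARY = 0x100   # 256: not primary alignment
QCFAIL = 0x200      # 512: read fails platform/vendor quality checks
DUPLICATE = 0x400   # 1024: PCR or optical duplicate
assert SECONDARY | QCFAIL | DUPLICATE == 0x700
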
def identify(data):
    """Identify high depth regions in the alignment file for potential filtering.
    """
    high_multiplier = 20
    sample_size = int(1e6)
    high_percentage = 25.0
    min_coverage = 10
    window_size = 250
    work_bam, out_file, stats_file = _get_files(data)
    if not os.path.exists(out_file):
        cores = dd.get_num_cores(data)
        with file_transaction(data, out_file) as tx_out_file:
            tx_raw_file = "%s-raw%s" % utils.splitext_plus(tx_out_file)
            py_cl = os.path.join(os.path.dirname(sys.executable), "py")
            cmd = ("sambamba depth window -t {cores} -c {min_coverage} "
                   "--window-size {window_size} {work_bam} "
                   "| head -n {sample_size} "
                   """| cut -f 5 | {py_cl} -l 'numpy.median([float(x) for x in l if not x.startswith("mean")])'""")
            median_cov = float(subprocess.check_output(cmd.format(**locals()), shell=True))
            if not numpy.isnan(median_cov):
                high_thresh = int(high_multiplier * median_cov)
                cmd = ("sambamba depth window -t {cores} -c {median_cov} "
                       "--window-size {window_size} -T {high_thresh} {work_bam} "
                       "| {py_cl} -fx 'float(x.split()[5]) >= {high_percentage} "
                       """if not x.startswith("#") else None' """
                       "| cut -f 1-3,7 > {tx_raw_file} ")
                do.run(cmd.format(**locals()), "Identify high coverage regions")
                with open(stats_file, "w") as out_handle:
                    yaml.safe_dump({"median_cov": median_cov}, out_handle,
                                   allow_unicode=False, default_flow_style=False)
            else:
                with open(tx_raw_file, "w") as out_handle:
                    out_handle.write("")
            if utils.file_exists(tx_raw_file):
                cmd = "bedtools merge -i {tx_raw_file} -c 4 -o distinct > {tx_out_file}"
                do.run(cmd.format(**locals()), "Clean up raw coverage file")
            else:
                shutil.move(tx_raw_file, tx_out_file)
    return out_file if os.path.exists(out_file) else None

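# The shell pipeline above estimates a median window coverage, then flags windows
# whose depth is far above that median. A pure-Python sketch of the thresholding
# logic, using made-up window coverages:
import numpy

def example_high_depth_windows(window_covs, high_multiplier=20):
    """Return indices of windows whose coverage exceeds high_multiplier * median."""
    median_cov = numpy.median(window_covs)
    high_thresh = high_multiplier * median_cov
    return [i for i, cov in enumerate(window_covs) if cov >= high_thresh]

# example_high_depth_windows([10, 12, 11, 500]) -> [3]
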
def remove_duplicates(in_bam, data):
    """Remove duplicates from a duplicate-marked BAM file."""
    base, ext = os.path.splitext(in_bam)
    out_bam = base + "-noduplicates" + ext
    if utils.file_exists(out_bam):
        return out_bam
    num_cores = dd.get_num_cores(data)
    sambamba = config_utils.get_program("sambamba", data)
    with file_transaction(out_bam) as tx_out_bam:
        cmd = (f'{sambamba} view -h --nthreads {num_cores} -f bam -F "not duplicate" '
               f'{in_bam} > {tx_out_bam}')
        message = f"Removing duplicates from {in_bam}, saving as {out_bam}."
        do.run(cmd, message)
    index(out_bam, dd.get_config(data))
    return out_bam

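# A standalone sketch of the sambamba filter used above, stripping duplicate-marked
# reads from a BAM file. Paths and the thread count are hypothetical placeholders.
import subprocess

def example_remove_duplicates(in_bam="sample.bam", out_bam="sample-noduplicates.bam", cores=4):
    """Drop reads flagged as duplicates (illustrative only)."""
    cmd = (f'sambamba view -h --nthreads {cores} -f bam -F "not duplicate" '
           f'{in_bam} > {out_bam}')
    subprocess.check_call(cmd, shell=True)
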
def has_nalignments(in_bam, n, data, filter=None):
    """Does a BAM file have at least n alignments?"""
    sambamba = config_utils.get_program("sambamba", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    if not filter:
        filter_string = ""
        message = f"Counting alignments in {in_bam}."
    else:
        filter_string = f"--filter {filter}"
        message = f"Counting alignments in {in_bam} matching {filter}."
    cmd = f"{sambamba} view -f sam {filter_string} {in_bam} | head -{n} | wc -l"
    logger.info(message)
    result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    return int(result.stdout.decode().strip()) >= n

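# A hypothetical usage sketch for has_nalignments: the optional `filter` argument
# takes a sambamba filter expression, and piping through `head -n` keeps the check
# cheap on large BAMs. The `data` object here stands in for a bcbio sample dictionary
# available in the calling pipeline.
def example_bam_is_usable(in_bam, data):
    """Require at least 1,000 mapped alignments before running downstream steps."""
    return has_nalignments(in_bam, 1000, data, filter="not unmapped")
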
def bqsr_table(data):
    """Generate recalibration tables as inputs to BQSR.
    """
    in_file = dd.get_align_bam(data)
    out_file = "%s-recal-table.txt" % utils.splitext_plus(in_file)[0]
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            assoc_files = dd.get_variation_resources(data)
            known = "-k %s" % (assoc_files.get("dbsnp")) if "dbsnp" in assoc_files else ""
            license = license_export(data)
            cores = dd.get_num_cores(data)
            ref_file = dd.get_ref_file(data)
            cmd = ("{license}sentieon driver -t {cores} -r {ref_file} "
                   "-i {in_file} --algo QualCal {known} {tx_out_file}")
            do.run(cmd.format(**locals()), "Sentieon QualCal generate table")
    return out_file

def _run_break_point_inspector(data, variant_file, paired):
    output_vcf = "%s-%s.vcf.gz" % (utils.splitext_plus(variant_file)[0], "bpi")
    if not utils.file_exists(output_vcf):
        with file_transaction(data, output_vcf) as tx_output_vcf:
            cores = dd.get_num_cores(data)
            resources = config_utils.get_resources("break-point-inspector", data["config"])
            memory = config_utils.adjust_opts(resources.get("jvm_opts", ["-Xms1000m", "-Xmx2000m"]),
                                              {"algorithm": {"memory_adjust": {"magnitude": cores,
                                                                               "direction": "increase"}}})
            cmd = ["break-point-inspector"]
            cmd += memory
            cmd += ["-vcf", variant_file]
            if paired:
                cmd += ["-ref", paired.normal_bam, "-tumor", paired.tumor_bam]
            cmd += ["-output_vcf", tx_output_vcf]
            do.run(cmd, "Running Break Point Inspector for Manta SV calls")
    return output_vcf

def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    paired = True if pair_file else False
    hisat2 = config_utils.get_program("hisat2", data)
    num_cores = dd.get_num_cores(data)
    quality_flag = _get_quality_flag(data)
    stranded_flag = _get_stranded_flag(data, paired)
    rg_flags = _get_rg_flags(names)
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
    else:
        final_file = None
    if not file_exists(out_file) and (final_file is None or not file_exists(final_file)):
        cmd = ("{hisat2} --new-summary -x {ref_file} -p {num_cores} {quality_flag} {stranded_flag} "
               "{rg_flags} ")
        if paired:
            cmd += "-1 {fastq_file} -2 {pair_file} "
        else:
            cmd += "-U {fastq_file} "
        if dd.get_analysis(data).lower() == "smallrna-seq":
            cmd += "-k 1000 "
        # if assembling transcripts, set flags that cufflinks/stringtie can use
        if dd.get_transcript_assembler(data):
            cmd += "--dta-cufflinks "
        if dd.get_analysis(data).lower() == "rna-seq":
            gtf_file = dd.get_gtf_file(data)
            splicesites = os.path.join(os.path.dirname(gtf_file), "ref-transcripts-splicesites.txt")
            if not file_exists(splicesites):
                splicesites = create_splicesites_file(gtf_file, align_dir, data)
            # an empty splicesites file means there is no splicing for this organism,
            # so skip this option
            if file_exists(splicesites):
                cmd += "--known-splicesite-infile {splicesites} "
        # apply additional hisat2 options
        cmd += " ".join(_get_options_from_config(data))
        message = "Aligning %s and %s with hisat2." % (fastq_file, pair_file)
        with postalign.tobam_cl(data, out_file, pair_file is not None) as (tobam_cl, tx_out_file):
            cmd += " | " + tobam_cl
            do.run(cmd.format(**locals()), message)
    data = dd.set_work_bam(data, out_file)
    return data

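# A minimal standalone sketch of the hisat2 command assembled above for paired-end
# reads, piping straight into samtools for a sorted BAM instead of bcbio's
# postalign.tobam_cl machinery. Paths and the core count are hypothetical.
import subprocess

def example_hisat2_align(index_base="genome", fq1="sample_1.fq.gz", fq2="sample_2.fq.gz",
                         out_bam="sample-sort.bam", cores=4):
    """Align paired-end reads with hisat2 and coordinate-sort the output (illustrative only)."""
    cmd = (f"hisat2 --new-summary -x {index_base} -p {cores} -1 {fq1} -2 {fq2} "
           f"| samtools sort -@ {cores} -o {out_bam} -")
    subprocess.check_call(cmd, shell=True)
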
def _fastp_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with fastp (https://github.com/OpenGene/fastp)
    """
    report_file = os.path.join(out_dir,
                               "%s-report.json" % utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [os.path.join(out_dir, "%s-trimmed.fq.gz" % utils.splitext_plus(os.path.basename(x))[0])
                 for x in fastq_files]
    if not utils.file_exists(out_files[0]):
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report = tx_out[0]
            tx_out_files = tx_out[1:]
            cmd = ["fastp", "--thread", str(dd.get_num_cores(data))]
            if dd.get_quality_format(data).lower() == "illumina":
                cmd += ["--phred64"]
            for i, (inf, outf) in enumerate(zip(fastq_files, tx_out_files)):
                if i == 0:
                    cmd += ["-i", inf, "-o", outf]
                else:
                    cmd += ["-I", inf, "-O", outf]
            cmd += ["--cut_by_quality3", "--cut_mean_quality", "5",
                    "--length_required", str(dd.get_min_read_length(data)),
                    "--disable_quality_filtering"]
            if "polyx" in dd.get_adapters(data):
                cmd += ["--trim_poly_x", "--poly_x_min_len", "8"]
            if "polyx" in dd.get_adapters(data) or "polyg" in dd.get_adapters(data):
                cmd += ["--trim_poly_g", "--poly_g_min_len", "8"]
            for a in adapters:
                cmd += ["--adapter_sequence", a]
            if not adapters:
                cmd += ["--disable_adapter_trimming"]
            cmd += ["--json", report_file, "--report_title", dd.get_sample_name(data)]
            do.run(cmd, "Trimming with fastp: %s" % dd.get_sample_name(data))
    return out_files, report_file

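# A trimmed-down sketch of the fastp invocation built above, for one paired-end
# sample with a single known adapter. File names, the adapter sequence, and the
# core count are hypothetical placeholders.
import subprocess

def example_fastp_trim(fq1="sample_1.fq.gz", fq2="sample_2.fq.gz",
                       adapter="AGATCGGAAGAGC", cores=4):
    """Adapter trimming with fastp for a paired-end sample (illustrative only)."""
    cmd = ["fastp", "--thread", str(cores),
           "-i", fq1, "-o", "sample_1-trimmed.fq.gz",
           "-I", fq2, "-O", "sample_2-trimmed.fq.gz",
           "--adapter_sequence", adapter,
           "--disable_quality_filtering",
           "--json", "sample-report.json"]
    subprocess.check_call(cmd)
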
def calculate(bam_file, data):
    """Calculate coverage in parallel using samtools depth through goleft.

    samtools depth removes duplicates and secondary reads from the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;
    """
    params = {"window_size": 5000, "parallel_window_size": 1e5,
              "min": dd.get_coverage_depth_min(data), "high_multiplier": 20}
    prefix = os.path.join(
        utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data))),
        "%s-coverage" % (dd.get_sample_name(data)))
    out_file = prefix + ".depth.bed"
    callable_file = prefix + ".callable.bed"
    variant_regions = dd.get_variant_regions_merged(data)
    variant_regions_avg_cov = get_average_coverage(data, bam_file, variant_regions,
                                                   "variant_regions", file_prefix=prefix)
    if not utils.file_uptodate(out_file, bam_file):
        ref_file = dd.get_ref_file(data)
        cmd = ["goleft", "depth", "--windowsize", str(params["window_size"]), "--q", "1",
               "--mincov", str(params["min"]), "--reference", ref_file,
               "--processes", str(dd.get_num_cores(data)), "--stats", "--ordered"]
        window_file = "%s-tocalculate-windows.bed" % utils.splitext_plus(out_file)[0]
        if not utils.file_uptodate(window_file, bam_file):
            with file_transaction(data, window_file) as tx_out_file:
                if not variant_regions:
                    variant_regions = "%s-genome.bed" % utils.splitext_plus(tx_out_file)[0]
                    with open(variant_regions, "w") as out_handle:
                        for c in shared.get_noalt_contigs(data):
                            out_handle.write("%s\t%s\t%s\n" % (c.name, 0, c.size))
                pybedtools.BedTool().window_maker(w=params["parallel_window_size"],
                                                  b=pybedtools.BedTool(variant_regions)).saveas(tx_out_file)
        cmd += ["--bed", window_file]
        max_depth = _get_max_depth(variant_regions_avg_cov, params, data)
        if max_depth:
            cmd += ["--maxmeandepth", str(int(max_depth))]
        with file_transaction(data, out_file) as tx_out_file:
            with utils.chdir(os.path.dirname(tx_out_file)):
                tx_callable_file = tx_out_file.replace(".depth.bed", ".callable.bed")
                prefix = tx_out_file.replace(".depth.bed", "")
                cmd += ["--prefix", prefix, bam_file]
                bcbio_env = utils.get_bcbio_env()
                msg = "Calculate coverage: %s" % dd.get_sample_name(data)
                do.run(cmd, msg, env=bcbio_env)
                shutil.move(tx_callable_file, callable_file)
    return out_file, callable_file, _extract_highdepth(callable_file, data), variant_regions_avg_cov

def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    assert data["analysis"].lower().startswith("wgbs-seq"), "No compatible alignment."
    config = data["config"]
    sample = dd.get_sample_name(data)
    out_prefix = os.path.join(align_dir, dd.get_lane(data))
    out_dir = os.path.join(align_dir, "%s_bismark" % dd.get_lane(data))
    if not ref_file:
        logger.error("bismark index not found. We don't provide the bismark indexes "
                     "by default because they are very large. You can install "
                     "the index for your genome with: bcbio_nextgen.py upgrade "
                     "--aligners bismark --genomes genome-build-name --data")
        sys.exit(1)
    final_out = os.path.join(align_dir, "{0}.bam".format(sample))
    if file_exists(final_out):
        data = dd.set_work_bam(data, final_out)
        data["bam_report"] = glob.glob(os.path.join(out_dir, "*report.txt"))[0]
        return data
    bismark = config_utils.get_program("bismark", config)
    fastq_files = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    num_cores = dd.get_num_cores(data)
    n = 1 if num_cores < 5 else 2
    safe_makedir(align_dir)
    cmd = ("{bismark} --bowtie2 --temp_dir {tx_out_dir} --gzip --multicore {n} "
           "-o {tx_out_dir} --unmapped {ref_file} {fastq_file}")
    if pair_file:
        fastq_file = "-1 %s -2 %s" % (fastq_file, pair_file)
    raw_bam = glob.glob(out_dir + "/*bismark*bt2*bam")
    if not raw_bam:
        with tx_tmpdir() as tx_out_dir:
            run_message = "Running Bismark aligner on %s and %s" % (fastq_file, ref_file)
            do.run(cmd.format(**locals()), run_message, None)
            shutil.move(tx_out_dir, out_dir)
        raw_bam = glob.glob(out_dir + "/*bismark*bt2*bam")
    process_bam = _process_bam(raw_bam[0], fastq_files, sample, dd.get_sam_ref(data), config)
    utils.symlink_plus(process_bam, final_out)
    data = dd.set_work_bam(data, final_out)
    data["bam_report"] = glob.glob(os.path.join(out_dir, "*report.txt"))[0]
    return data

def _bam_coverage(name, bam_input, data):
    """Run bamCoverage from deeptools"""
    cmd = ("{bam_coverage} -b {bam_input} -o {out_tx} "
           "--binSize 20 --effectiveGenomeSize {size} "
           "--smoothLength 60 --extendReads 150 --centerReads -p {cores}")
    size = int(get_genome(dd.get_genome_build(data)))
    cores = dd.get_num_cores(data)
    try:
        bam_coverage = config_utils.get_program("bamCoverage", data)
    except config_utils.CmdNotFound:
        logger.info("No bamCoverage found, skipping bamCoverage.")
        return None
    bw_output = os.path.join(os.path.dirname(bam_input), "%s.bw" % name)
    if utils.file_exists(bw_output):
        return bw_output
    with file_transaction(bw_output) as out_tx:
        do.run(cmd.format(**locals()), "Run bamCoverage in %s" % name)
    return bw_output

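# A standalone sketch of the deeptools bamCoverage call above. The BAM path, output
# name, effective genome size, and core count are hypothetical placeholders (in the
# function above the size comes from get_genome() on the configured build).
import subprocess

def example_bam_coverage(bam_input="sample.bam", bw_output="sample.bw",
                         effective_genome_size=2913022398, cores=4):
    """Generate a bigWig coverage track with bamCoverage (illustrative only)."""
    cmd = ["bamCoverage", "-b", bam_input, "-o", bw_output,
           "--binSize", "20", "--effectiveGenomeSize", str(effective_genome_size),
           "--smoothLength", "60", "--extendReads", "150", "--centerReads",
           "-p", str(cores)]
    subprocess.check_call(cmd)
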
def downsample(in_bam, data, target_counts, work_dir=None):
    """Downsample a BAM file to the specified number of target counts.
    """
    index(in_bam, data["config"], check_timestamp=False)
    ds_pct = get_downsample_pct(in_bam, target_counts, data)
    if ds_pct:
        out_file = "%s-downsample%s" % os.path.splitext(in_bam)
        if work_dir:
            out_file = os.path.join(work_dir, os.path.basename(out_file))
        if not utils.file_exists(out_file):
            with file_transaction(data, out_file) as tx_out_file:
                samtools = config_utils.get_program("samtools", data["config"])
                num_cores = dd.get_num_cores(data)
                ds_pct = "42." + "{ds_pct:.3}".format(ds_pct=ds_pct).replace("0.", "")
                cmd = ("{samtools} view -O BAM -@ {num_cores} -o {tx_out_file} "
                       "-s {ds_pct} {in_bam}")
                do.run(cmd.format(**locals()),
                       "Downsample BAM file: %s" % os.path.basename(in_bam))
        return out_file

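# samtools view -s takes SEED.FRACTION, so the "42." prefix above fixes the random
# seed at 42 while the digits after the decimal point carry the subsampling fraction.
# A small sketch of that encoding:
def example_subsample_arg(ds_pct, seed=42):
    """Encode a subsampling fraction as a samtools -s argument, e.g. 0.125 -> '42.125'."""
    return "%s." % seed + "{ds_pct:.3}".format(ds_pct=ds_pct).replace("0.", "")

# example_subsample_arg(0.125) -> '42.125'
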