def clean_chipseq_alignment(data):
    # lcr_bed = utils.get_in(data, ("genome_resources", "variation", "lcr"))
    method = dd.get_chip_method(data)
    if method == "atac":
        data = clean_ATAC(data)
    # for ATAC-seq, this will be the NF BAM
    work_bam = dd.get_work_bam(data)
    work_bam = bam.sort(work_bam, dd.get_config(data))
    bam.index(work_bam, dd.get_config(data))
    clean_bam = remove_nonassembled_chrom(work_bam, data)
    clean_bam = remove_mitochondrial_reads(clean_bam, data)
    data = atac.calculate_complexity_metrics(clean_bam, data)
    if not dd.get_keep_multimapped(data):
        clean_bam = remove_multimappers(clean_bam, data)
    if not dd.get_keep_duplicates(data):
        clean_bam = bam.remove_duplicates(clean_bam, data)
    data["work_bam"] = clean_bam
    encode_bed = tz.get_in(["genome_resources", "variation", "encode_blacklist"], data)
    if encode_bed:
        data["work_bam"] = remove_blacklist_regions(dd.get_work_bam(data), encode_bed, data['config'])
        bam.index(data["work_bam"], data['config'])
    try:
        data["bigwig"] = _normalized_bam_coverage(dd.get_sample_name(data),
                                                  dd.get_work_bam(data), data)
    except subprocess.CalledProcessError:
        logger.warning(f"{dd.get_work_bam(data)} was too sparse to normalize, "
                       f"falling back to non-normalized coverage.")
        data["bigwig"] = _bam_coverage(dd.get_sample_name(data),
                                       dd.get_work_bam(data), data)
    return [[data]]
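# Hedged sketch (an assumption, not the bcbio implementation): the helpers
# remove_nonassembled_chrom() and remove_mitochondrial_reads() used above are
# defined elsewhere in the module. A minimal standalone equivalent with pysam,
# treating "_" contigs as unassembled and "chrM" as an assumed mito name:
import pysam

def keep_assembled_contigs_sketch(in_bam, out_bam, mito_name="chrM"):
    """Write reads on assembled, non-mitochondrial contigs to out_bam.

    Requires an indexed input BAM so that per-contig fetch() works.
    """
    with pysam.AlignmentFile(in_bam, "rb") as in_handle:
        keep = [c for c in in_handle.references if "_" not in c and c != mito_name]
        with pysam.AlignmentFile(out_bam, "wb", template=in_handle) as out_handle:
            for contig in keep:
                for read in in_handle.fetch(contig):
                    out_handle.write(read)
    return out_bam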
def clean_chipseq_alignment(data):
    aligner = dd.get_aligner(data)
    data["align_bam"] = dd.get_work_bam(data)
    if dd.get_mark_duplicates(data):
        if aligner:
            if aligner == "bowtie2":
                filterer = bowtie2.filter_multimappers
            elif aligner == "bwa":
                filterer = bwa.filter_multimappers
            else:
                logger.error("ChIP-seq only supported for bowtie2 and bwa.")
                sys.exit(-1)
            unique_bam = filterer(dd.get_work_bam(data), data)
            data["work_bam"] = unique_bam
        else:
            logger.info("Warning: When a BAM file is given as input, bcbio skips multimapper removal. "
                        "If the BAM is not cleaned for peak calling, it can result in downstream errors.")
    # lcr_bed = utils.get_in(data, ("genome_resources", "variation", "lcr"))
    data["work_bam"] = _keep_assembled_chrom(dd.get_work_bam(data), dd.get_ref_file(data),
                                             data["config"])
    encode_bed = tz.get_in(["genome_resources", "variation", "encode_blacklist"], data)
    if encode_bed:
        data["work_bam"] = _prepare_bam(dd.get_work_bam(data), encode_bed, data['config'])
        bam.index(data["work_bam"], data['config'])
    data["bigwig"] = _bam_coverage(dd.get_sample_name(data), dd.get_work_bam(data), data)
    return [[data]]
def apply_recal(data):
    """Apply recalibration tables to the sorted aligned BAM, producing recalibrated BAM.
    """
    orig_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    had_work_bam = "work_bam" in data
    if dd.get_recalibrate(data) in [True, "gatk"]:
        logger.info("Applying BQSR recalibration with GATK: %s " % str(dd.get_sample_name(data)))
        data["work_bam"] = _gatk_apply_bqsr(data)
    elif dd.get_recalibrate(data) == "sentieon":
        logger.info("Applying BQSR recalibration with sentieon: %s " % str(dd.get_sample_name(data)))
        data["work_bam"] = sentieon.apply_bqsr(data)
    elif dd.get_recalibrate(data):
        raise NotImplementedError("Unsupported recalibration type: %s" % dd.get_recalibrate(data))
    # CWL does not have work/alignment BAM separation
    if not had_work_bam and dd.get_work_bam(data):
        data["align_bam"] = dd.get_work_bam(data)
    if orig_bam != dd.get_work_bam(data) and orig_bam != dd.get_align_bam(data):
        utils.save_diskspace(orig_bam, "BAM recalibrated to %s" % dd.get_work_bam(data), data["config"])
    return data
def variants(data, out_dir):
    """Variants QC metrics"""
    if "variants" not in data:
        return None
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    bcfstats = _run_bcftools(data, work_dir)
    bed_file = dd.get_coverage(data)
    bcf_out = os.path.join(sample + "_bcbio_variants_stats.txt")
    cg_file = os.path.join(sample + "_with-gc.vcf.gz")
    parse_file = os.path.join(sample + "_gc-depth-parse.tsv")
    qc_file = os.path.join(sample + "_bcbio_variants.txt")
    with chdir(work_dir):
        if not file_exists(bcf_out):
            with open(bcf_out, "w") as out_handle:
                yaml.safe_dump(bcfstats, out_handle, default_flow_style=False,
                               allow_unicode=False)
        if "vrn_file" not in data or not bed_file:
            return None
        in_vcf = data['vrn_file']
        cleaned_bed = clean_file(bed_file, data)
        if file_exists(qc_file):
            return qc_file
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        ref_file = dd.get_ref_file(data)
        assert ref_file, "Need the reference genome fasta file."
        bed_file = dd.get_variant_regions(data)
        num_cores = dd.get_num_cores(data)
        broad_runner = broad.runner_from_config_safe(data["config"])
        if in_bam and broad_runner and broad_runner.has_gatk():
            if not file_exists(parse_file):
                with file_transaction(cg_file) as tx_out:
                    params = ["-T", "VariantAnnotator",
                              "-R", ref_file,
                              "-L", cleaned_bed,
                              "-I", in_bam,
                              "-A", "GCContent",
                              "-A", "Coverage",
                              "--variant", in_vcf,
                              "--out", tx_out]
                    broad_runner.run_gatk(params)
                cg_file = vcfutils.bgzip_and_index(cg_file, data["config"])
            if not file_exists(parse_file):
                with file_transaction(parse_file) as out_tx:
                    with open(out_tx, 'w') as out_handle:
                        print("CG\tdepth\tsample", file=out_handle)
                    cmd = ("bcftools query -s {sample} -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R "
                           "{bed_file} {cg_file} >> {out_tx}")
                    do.run(cmd.format(**locals()),
                           "Calculating GC content and depth for %s" % in_vcf)
            logger.debug('parsing coverage: %s' % sample)
        if not file_exists(qc_file):
            # this file will be copied to final
            _summary_variants(parse_file, qc_file)
        if file_exists(qc_file) and file_exists(parse_file):
            remove_plus(cg_file)
        return qc_file
def clean_chipseq_alignment(data):
    aligner = dd.get_aligner(data)
    assert aligner == "bowtie2", "ChIP-seq only supported for bowtie2."
    if aligner == "bowtie2":
        data["raw_bam"] = dd.get_work_bam(data)
        unique_bam = filter_multimappers(dd.get_work_bam(data), data)
        data["work_bam"] = unique_bam
        return [[data]]
    return [[data]]
def variants(data):
    if "vrn_file" not in data:
        return data
    if not dd.get_coverage(data):
        return data
    in_vcf = data["vrn_file"]
    work_dir = os.path.join(dd.get_work_dir(data), "report", "variants")
    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        ref_file = dd.get_ref_file(data)
        assert ref_file, "Need the reference genome fasta file."
        bed_file = dd.get_variant_regions(data)
        sample = dd.get_sample_name(data)
        cg_file = os.path.join(sample + "_with-gc.vcf.gz")
        parse_file = os.path.join(sample + "_gc-depth-parse.tsv")
        num_cores = dd.get_num_cores(data)
        broad_runner = broad.runner_from_config_safe(data["config"])
        if in_bam and broad_runner and broad_runner.has_gatk():
            if not file_exists(cg_file):
                with file_transaction(cg_file) as tx_out:
                    params = ["-T", "VariantAnnotator",
                              "-R", ref_file,
                              "-L", bed_file,
                              "-I", in_bam,
                              "-A", "GCContent",
                              "-A", "Coverage",
                              "--variant", in_vcf,
                              "--out", tx_out]
                    broad_runner.run_gatk(params)
                cg_file = vcfutils.bgzip_and_index(cg_file, data["config"])
            if not file_exists(parse_file):
                with file_transaction(parse_file) as out_tx:
                    with open(out_tx, "w") as out_handle:
                        print("CG\tdepth\tsample", file=out_handle)
                    cmd = ("bcftools query -s {sample} -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R "
                           "{bed_file} {cg_file} >> {out_tx}")
                    do.run(cmd.format(**locals()),
                           "Calculating GC content and depth for %s" % in_vcf)
            logger.debug("parsing coverage: %s" % sample)
    return data
def clean_chipseq_alignment(data):
    aligner = dd.get_aligner(data)
    data["raw_bam"] = dd.get_work_bam(data)
    if aligner:
        assert aligner == "bowtie2", "ChIP-seq only supported for bowtie2."
        unique_bam = filter_multimappers(dd.get_work_bam(data), data)
        data["work_bam"] = unique_bam
    else:
        logger.info("Warning: When a BAM file is given as input, bcbio skips multimapper removal. "
                    "If the BAM is not cleaned for peak calling, it can result in downstream errors.")
    return [[data]]
def _get_replicate_samples(sample, data):
    """Get replicate samples for each chip bam file."""
    dd.get_phenotype(sample)
    rep_bam = ""
    for origin in data:
        if (dd.get_batch(sample) in dd.get_batch(origin[0])
                and dd.get_phenotype(sample) in dd.get_phenotype(origin[0])
                and dd.get_work_bam(sample) != dd.get_work_bam(origin[0])
                and dd.get_phenotype(origin[0]) != "control"):
            if rep_bam != "":
                rep_bam = rep_bam + "," + dd.get_work_bam(origin[0])
            else:
                rep_bam = dd.get_work_bam(origin[0])
    # assign the accumulated comma-separated list, not just the last match
    sample["work_bam_rep"] = rep_bam
    return [sample]
def _get_paired_samples(sample, data):
    """Get input sample for each chip bam file."""
    dd.get_phenotype(sample)
    input_bam = ""
    for origin in data:
        if dd.get_phenotype(origin[0]) == "control":
            if input_bam != "":
                input_bam = input_bam + "," + dd.get_work_bam(origin[0])
            else:
                input_bam = dd.get_work_bam(origin[0])
    sample["work_bam_input"] = input_bam
    return [sample]
def variants(data):
    if "vrn_file" not in data:
        return data
    if not dd.get_coverage(data):
        return data
    in_vcf = data['vrn_file']
    sample = dd.get_sample_name(data)
    cg_file = os.path.join(sample + "_with-gc.vcf.gz")
    parse_file = os.path.join(sample + "_gc-depth-parse.tsv")
    qc_file = os.path.join(sample + "_bcbio_variants.txt")
    work_dir = os.path.join(dd.get_work_dir(data), "report", "variants")
    with chdir(work_dir):
        if file_exists(qc_file):
            return data
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        ref_file = dd.get_ref_file(data)
        assert ref_file, "Need the reference genome fasta file."
        bed_file = dd.get_variant_regions(data)
        num_cores = dd.get_num_cores(data)
        broad_runner = broad.runner_from_config_safe(data["config"])
        if in_bam and broad_runner and broad_runner.has_gatk():
            if not file_exists(cg_file):
                with file_transaction(cg_file) as tx_out:
                    params = ["-T", "VariantAnnotator",
                              "-R", ref_file,
                              "-L", bed_file,
                              "-I", in_bam,
                              "-A", "GCContent",
                              "-A", "Coverage",
                              "--variant", in_vcf,
                              "--out", tx_out]
                    broad_runner.run_gatk(params)
                cg_file = vcfutils.bgzip_and_index(cg_file, data["config"])
            if not file_exists(parse_file):
                with file_transaction(parse_file) as out_tx:
                    with open(out_tx, 'w') as out_handle:
                        print("CG\tdepth\tsample", file=out_handle)
                    cmd = ("bcftools query -s {sample} -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R "
                           "{bed_file} {cg_file} >> {out_tx}")
                    do.run(cmd.format(**locals()),
                           "Calculating GC content and depth for %s" % in_vcf)
            logger.debug('parsing coverage: %s' % sample)
        if not file_exists(qc_file):
            # this file will be copied to final
            _summary_variants(parse_file, qc_file)
        if file_exists(qc_file) and file_exists(parse_file):
            os.remove(cg_file)
    return data
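# Hedged sketch: the parse_file written by the variants() functions above is a
# three-column TSV with header "CG\tdepth\tsample". A minimal reader for
# downstream summarization, assuming that layout (bcftools can emit "." for
# missing annotations, so non-numeric rows are skipped):
import csv

def read_gc_depth_sketch(parse_file):
    """Yield (gc_content, depth, sample) tuples from a *_gc-depth-parse.tsv."""
    with open(parse_file) as in_handle:
        reader = csv.DictReader(in_handle, delimiter="\t")
        for row in reader:
            try:
                yield float(row["CG"]), int(row["depth"]), row["sample"]
            except ValueError:
                continue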
def umi_consensus(data):
    """Convert UMI grouped reads into fastq pair for re-alignment.
    """
    align_bam = dd.get_work_bam(data)
    umi_method, umi_tag = _check_umi_type(align_bam)
    f1_out = "%s-cumi-1.fq.gz" % utils.splitext_plus(align_bam)[0]
    f2_out = "%s-cumi-2.fq.gz" % utils.splitext_plus(align_bam)[0]
    if not utils.file_uptodate(f1_out, align_bam):
        with file_transaction(data, f1_out, f2_out) as (tx_f1_out, tx_f2_out):
            jvm_opts = _get_fgbio_jvm_opts(data, os.path.dirname(tx_f1_out), 2)
            # Improve speeds by avoiding compression read/write bottlenecks
            io_opts = "--async-io=true --compression=0"
            group_opts, cons_opts, filter_opts = _get_fgbio_options(data, umi_method)
            cons_method = "CallDuplexConsensusReads" if umi_method == "paired" else "CallMolecularConsensusReads"
            tempfile = "%s-bamtofastq-tmp" % utils.splitext_plus(f1_out)[0]
            ref_file = dd.get_ref_file(data)
            cmd = ("unset JAVA_HOME && "
                   "fgbio {jvm_opts} {io_opts} GroupReadsByUmi {group_opts} -t {umi_tag} -s {umi_method} "
                   "-i {align_bam} | "
                   "fgbio {jvm_opts} {io_opts} {cons_method} {cons_opts} --sort-order=:none: "
                   "-i /dev/stdin -o /dev/stdout | "
                   "fgbio {jvm_opts} {io_opts} FilterConsensusReads {filter_opts} -r {ref_file} "
                   "-i /dev/stdin -o /dev/stdout | "
                   "bamtofastq collate=1 T={tempfile} F={tx_f1_out} F2={tx_f2_out} tags=cD,cM,cE gz=1")
            do.run(cmd.format(**locals()), "UMI consensus fastq generation")
    return f1_out, f2_out
def _calculate_sv_coverage_cnvkit(data, work_dir):
    """Calculate coverage in a CNVkit ready format using mosdepth.
    """
    from bcbio.variation import coverage
    from bcbio.structural import annotate
    out_target_file = os.path.join(work_dir, "%s-target-coverage.cnn" % dd.get_sample_name(data))
    out_anti_file = os.path.join(work_dir, "%s-antitarget-coverage.cnn" % dd.get_sample_name(data))
    if ((not utils.file_exists(out_target_file) or not utils.file_exists(out_anti_file))
            and (dd.get_align_bam(data) or dd.get_work_bam(data))):
        target_cov = coverage.run_mosdepth(data, "target", tz.get_in(["regions", "bins", "target"], data))
        anti_cov = coverage.run_mosdepth(data, "antitarget", tz.get_in(["regions", "bins", "antitarget"], data))
        target_cov_genes = annotate.add_genes(target_cov.regions, data, max_distance=0)
        anti_cov_genes = annotate.add_genes(anti_cov.regions, data, max_distance=0)
        out_target_file = _add_log2_depth(target_cov_genes, out_target_file, data)
        out_anti_file = _add_log2_depth(anti_cov_genes, out_anti_file, data)
    return out_target_file, out_anti_file
def dedup_bismark(data):
    """Remove alignments to the same position in the genome from the Bismark
    mapping output using deduplicate_bismark
    """
    config = data["config"]
    input_file = datadict.get_work_bam(data)
    # don't sort even by read names
    # input_file = bam.sort(input_file, config, order="queryname")
    sample_name = datadict.get_sample_name(data)
    output_dir = os.path.join(datadict.get_work_dir(data), 'dedup', sample_name)
    output_dir = utils.safe_makedir(output_dir)
    input_file_name, input_file_extension = os.path.splitext(os.path.basename(input_file))
    output_file = os.path.join(output_dir, f'{input_file_name}.deduplicated{input_file_extension}')
    if utils.file_exists(output_file):
        data = datadict.set_work_bam(data, output_file)
        return [[data]]
    deduplicate_bismark = config_utils.get_program('deduplicate_bismark', config)
    command = f'{deduplicate_bismark} --output_dir {output_dir} {input_file}'
    with transaction.file_transaction(output_dir):
        do.run(command, 'remove duplicate alignments')
    data = datadict.set_work_bam(data, output_file)
    data["deduplication_report"] = output_file.replace("deduplicated.bam", "deduplication_report.txt")
    return [[data]]
def coverage_region_detailed_stats(data, out_dir):
    """
    Calculate coverage at different completeness cutoffs
    for regions in the coverage option.
    """
    bed_file = dd.get_coverage(data)
    if not bed_file:
        return None
    work_dir = safe_makedir(out_dir)
    cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True)
    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        parse_total_file = os.path.join(sample + "_cov_total.tsv")
        parse_file = os.path.join(sample + "_coverage.bed")
        if not (utils.file_uptodate(parse_file, cleaned_bed) and
                utils.file_uptodate(parse_file, in_bam)):
            with file_transaction(parse_file) as out_tx:
                cmdl = sambamba.make_command(data, "depth region", in_bam, cleaned_bed,
                                             depth_thresholds=[1, 5, 10, 20, 40, 50, 60, 70, 80, 100],
                                             max_cov=1000)
                cmdl += " | sed 's/# chrom/chrom/' > " + out_tx
                do.run(cmdl, "Run coverage regional analysis for {}".format(sample))
        parse_file = _add_high_covered_regions(parse_file, cleaned_bed, sample)
        parse_file = _calculate_percentiles(os.path.abspath(parse_file), sample)
    return os.path.abspath(parse_file)
def priority_total_coverage(data, out_dir):
    """
    calculate coverage at 10 depth intervals in the priority regions
    """
    from bcbio.structural import prioritize
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file) or prioritize.is_gene_list(bed_file):
        return {}
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if file_exists(out_file):
        # data['priority_total_coverage'] = os.path.abspath(out_file)
        return out_file
    nthreads = dd.get_num_cores(data)
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    with tx_tmpdir(data, work_dir) as tmp_dir:
        cleaned_bed = clean_file(bed_file, data)
        with file_transaction(out_file) as tx_out_file:
            cmd = ("{sambamba} depth region -t {nthreads} -L {cleaned_bed} "
                   "-F \"not unmapped\" "
                   "-T 10 -T 20 -T 30 -T 40 -T 50 -T 60 -T 70 -T 80 -T 90 -T 100 "
                   "{in_bam} -o {tx_out_file}")
            message = "Calculating coverage of {bed_file} regions in {in_bam}"
            do.run(cmd.format(**locals()), message.format(**locals()))
    # data['priority_total_coverage'] = os.path.abspath(out_file)
    return out_file
def align_transcriptome(fastq_file, pair_file, ref_file, data):
    """
    bwa mem with settings for aligning to the transcriptome for eXpress/RSEM/etc
    """
    work_bam = dd.get_work_bam(data)
    base, ext = os.path.splitext(work_bam)
    out_file = base + ".transcriptome" + ext
    if utils.file_exists(out_file):
        data = dd.set_transcriptome_bam(data, out_file)
        return data
    # bwa mem needs phred+33 quality, so convert if it is Illumina
    if dd.get_quality_format(data).lower() == "illumina":
        logger.info("bwa mem does not support the phred+64 quality format, "
                    "converting %s and %s to phred+33." % (fastq_file, pair_file))
        fastq_file = fastq.groom(fastq_file, in_qual="fastq-illumina", data=data)
        if pair_file:
            pair_file = fastq.groom(pair_file, in_qual="fastq-illumina", data=data)
    bwa = config_utils.get_program("bwa", data["config"])
    gtf_file = dd.get_gtf_file(data)
    gtf_fasta = index_transcriptome(gtf_file, ref_file, data)
    args = " ".join(_bwa_args_from_config(data["config"]))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    cmd = ("{bwa} mem {args} -a -t {num_cores} {gtf_fasta} {fastq_file} "
           "{pair_file} | samtools view -bhS - > {tx_out_file}")
    with file_transaction(out_file) as tx_out_file:
        message = "Aligning %s and %s to the transcriptome." % (fastq_file, pair_file)
        do.run(cmd.format(**locals()), message)
    data = dd.set_transcriptome_bam(data, out_file)
    return data
def extract_NF_regions(data):
    """
    extract the nucleosome free regions from the work_bam. These regions will
    be < 100 bases
    """
    MAX_FRAG_LENGTH = 100
    sieve = config_utils.get_program("alignmentSieve", data)
    work_bam = dd.get_work_bam(data)
    num_cores = dd.get_num_cores(data)
    out_file = os.path.splitext(work_bam)[0] + "-NF.bam"
    log_file = os.path.splitext(work_bam)[0] + "-NF.log"
    if file_exists(out_file):
        data["NF_bam"] = out_file
        return data

    with file_transaction(out_file) as tx_out_file, \
         file_transaction(log_file) as tx_log_file:
        tx_unsorted_bam = tx_out_file + ".unsorted"
        cmd = (
            f"{sieve} --bam {work_bam} --outFile {tx_unsorted_bam} --ATACshift "
            f"--numberOfProcessors {num_cores} --maxFragmentLength {MAX_FRAG_LENGTH} "
            f"--minMappingQuality 10 "
            f"--filterMetrics {tx_log_file} ")
        do.run(cmd, f"Extract NF regions from {work_bam} to {tx_unsorted_bam}.")
        # sort the sieved BAM and move it into the transactional output
        sorted_bam = bam.sort(tx_unsorted_bam, dd.get_config(data))
        shutil.move(sorted_bam, tx_out_file)
    data["NF_bam"] = out_file
    return data
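# Hedged sketch of the same selection without deepTools: alignmentSieve's
# --maxFragmentLength filter above keeps fragments below MAX_FRAG_LENGTH; an
# equivalent pysam-only version (paired-end data assumed; this sketch does not
# reproduce the Tn5 --ATACshift coordinate adjustment) would be:
import pysam

def nucleosome_free_sketch(in_bam, out_bam, max_frag_length=100):
    """Keep proper pairs whose absolute template length is under the cutoff."""
    with pysam.AlignmentFile(in_bam, "rb") as in_handle, \
         pysam.AlignmentFile(out_bam, "wb", template=in_handle) as out_handle:
        for read in in_handle:
            if read.is_proper_pair and 0 < abs(read.template_length) < max_frag_length:
                out_handle.write(read)
    return out_bam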
def _gatk_apply_bqsr(data):
    """Parallel BQSR support for GATK4.
    """
    in_file = dd.get_align_bam(data) or dd.get_work_bam(data)
    out_file = os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data),
                            "%s-recal.bam" % utils.splitext_plus(os.path.basename(in_file))[0])
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            broad_runner = broad.runner_from_config(data["config"])
            gatk_type = broad_runner.gatk_type()
            if gatk_type == "gatk4":
                params = ["-T", "ApplyBQSRSpark",
                          "--sparkMaster", "local[%s]" % dd.get_num_cores(data),
                          "--input", in_file, "--output", tx_out_file,
                          "--bqsr_recal_file", data["prep_recal"]]
            else:
                params = ["-T", "PrintReads",
                          "-R", dd.get_ref_file(data),
                          "-I", in_file,
                          "-BQSR", data["prep_recal"],
                          "-o", tx_out_file]
            broad_runner.run_gatk(params, os.path.dirname(tx_out_file))
    bam.index(out_file, data["config"])
    return out_file
def count(data):
    """
    count reads mapping to genes using featureCounts
    http://subread.sourceforge.net
    """
    in_bam = dd.get_work_bam(data)
    gtf_file = dd.get_gtf_file(data)
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "htseq-count")
    safe_makedir(out_dir)
    count_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts"
    if file_exists(count_file):
        return count_file
    featureCounts = config_utils.get_program("featureCounts", dd.get_config(data))
    paired_flag = _paired_flag(in_bam)
    strand_flag = _strand_flag(data)
    cmd = ("{featureCounts} -a {gtf_file} -o {tx_count_file} -s {strand_flag} "
           "{paired_flag} {in_bam}")
    message = ("Count reads in {tx_count_file} mapping to {gtf_file} using "
               "featureCounts")
    with file_transaction(data, count_file) as tx_count_file:
        do.run(cmd.format(**locals()), message.format(**locals()))
    fixed_count_file = _format_count_file(count_file, data)
    os.rename(fixed_count_file, count_file)
    return count_file
def _prep_data(data, items):
    for r in ["callable_regions", "variant_regions"]:
        data[r] = list(set(filter(lambda x: x is not None,
                                  [tz.get_in(("config", "algorithm", r), d) for d in items])))
    data["work_bams"] = [dd.get_align_bam(x) or dd.get_work_bam(x) for x in items]
    data["vrn_files"] = [x["vrn_file"] for x in items]
    return data
def priority_total_coverage(data):
    """
    calculate coverage at depth 20 in the priority regions
    """
    bed_file = dd.get_priority_regions(data)
    if not bed_file:
        return data
    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if file_exists(out_file):
        data['priority_total_coverage'] = os.path.abspath(out_file)
        return data
    nthreads = dd.get_num_cores(data)
    in_bam = dd.get_work_bam(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    with file_transaction(out_file) as tx_out_file:
        cmd = ("{sambamba} depth region -t {nthreads} -L {bed_file} "
               "-F \"not unmapped\" "
               "-T 20 {in_bam} -o {tx_out_file}")
        message = "Calculating coverage of {bed_file} regions in {in_bam}"
        do.run(cmd.format(**locals()), message.format(**locals()))
    data['priority_total_coverage'] = os.path.abspath(out_file)
    return data
def coverage_region_detailed_stats(data, out_dir, extra_cutoffs=None):
    """
    Calculate coverage at different completeness cutoffs
    for regions in the coverage option.
    """
    bed_file = dd.get_coverage(data)
    if not bed_file or not utils.file_exists(bed_file):
        return []
    work_dir = safe_makedir(out_dir)
    cleaned_bed = clean_file(bed_file, data, prefix="cov-", simple=True)
    cutoffs = {1, 5, 10, 20, 50, 100, 250, 500, 1000, 5000, 10000, 50000}
    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        parse_file = os.path.join(sample + "_coverage.bed")
        if not (utils.file_uptodate(parse_file, cleaned_bed) and
                utils.file_uptodate(parse_file, in_bam)):
            with file_transaction(data, parse_file) as out_tx:
                # guard against the default of no extra cutoffs
                depth_thresholds = sorted(cutoffs | (extra_cutoffs or set()))
                cmdl = sambamba.make_command(data, "depth region", in_bam, cleaned_bed,
                                             depth_thresholds=depth_thresholds)
                cmdl += " | sed 's/# chrom/chrom/' > " + out_tx
                do.run(cmdl, "Run coverage regional analysis for {}".format(sample))
        out_files = _calculate_percentiles(os.path.abspath(parse_file), sample,
                                           data=data, cutoffs=cutoffs)
    return [os.path.abspath(x) for x in out_files]
def assign_interval(data):
    """Identify coverage based on percent of genome covered and relation to targets.

    Classifies coverage into 3 categories:
      - genome: Full genome coverage
      - regional: Regional coverage, like exome capture, with off-target reads
      - amplicon: Amplification based regional coverage without off-target reads
    """
    if not dd.get_coverage_interval(data):
        vrs = dd.get_variant_regions_merged(data)
        callable_file = dd.get_sample_callable(data)
        if vrs:
            callable_size = pybedtools.BedTool(vrs).total_coverage()
        else:
            callable_size = pybedtools.BedTool(callable_file).total_coverage()
        total_size = sum([c.size for c in ref.file_contigs(dd.get_ref_file(data), data["config"])])
        genome_cov_pct = callable_size / float(total_size)
        if genome_cov_pct > GENOME_COV_THRESH:
            cov_interval = "genome"
            offtarget_pct = 0.0
        elif not vrs:
            cov_interval = "regional"
            offtarget_pct = 0.0
        else:
            offtarget_pct = _count_offtarget(data, dd.get_align_bam(data) or dd.get_work_bam(data),
                                             vrs or callable_file, "variant_regions")
            if offtarget_pct > OFFTARGET_THRESH:
                cov_interval = "regional"
            else:
                cov_interval = "amplicon"
        logger.info("%s: Assigned coverage as '%s' with %.1f%% genome coverage and %.1f%% offtarget coverage"
                    % (dd.get_sample_name(data), cov_interval, genome_cov_pct * 100.0, offtarget_pct * 100.0))
        data["config"]["algorithm"]["coverage_interval"] = cov_interval
    return data
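# Standalone restatement of the branching in assign_interval(), with the
# module constants assumed to be GENOME_COV_THRESH = 0.40 and
# OFFTARGET_THRESH = 0.10 (an assumption; check the module header for the
# actual values):
def classify_coverage_sketch(genome_cov_pct, offtarget_pct, has_vrs,
                             genome_thresh=0.40, offtarget_thresh=0.10):
    """Mirror assign_interval's decision rule on precomputed percentages."""
    if genome_cov_pct > genome_thresh:
        return "genome"
    if not has_vrs:
        return "regional"
    return "regional" if offtarget_pct > offtarget_thresh else "amplicon"

# e.g. a 96Mb callable footprint on a 3.2Gb genome (3% coverage) with 15%
# offtarget reads: classify_coverage_sketch(0.03, 0.15, True) -> "regional";
# the same footprint with 2% offtarget reads -> "amplicon".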
def _run_variantcall_batch_multicore(items, regions, final_file):
    """Run variant calling on a batch of items using multiple cores.
    """
    batch_name = _get_batch_name(items)
    variantcaller = _get_batch_variantcaller(items)
    work_bams = [dd.get_work_bam(d) or dd.get_align_bam(d) for d in items]

    def split_fn(data):
        out = []
        for region in regions:
            region = _region_to_coords(region)
            chrom, start, end = region
            region_str = "_".join(str(x) for x in region)
            out_file = os.path.join(dd.get_work_dir(items[0]), variantcaller, chrom,
                                    "%s-%s.vcf.gz" % (batch_name, region_str))
            out.append((region, work_bams, out_file))
        return final_file, out

    parallel = {"type": "local", "num_jobs": dd.get_num_cores(items[0]), "cores_per_job": 1}
    run_parallel = dmulti.runner(parallel, items[0]["config"])
    to_run = copy.deepcopy(items[0])
    to_run["sam_ref"] = dd.get_ref_file(to_run)
    to_run["group_orig"] = items
    parallel_split_combine([[to_run]], split_fn, run_parallel,
                           "variantcall_sample", "concat_variant_files",
                           "vrn_file", ["region", "sam_ref", "config"])
    return final_file
def align_transcriptome(fastq_file, pair_file, ref_file, data):
    """
    bowtie2 with settings for aligning to the transcriptome for eXpress/RSEM/etc
    """
    work_bam = dd.get_work_bam(data)
    base, ext = os.path.splitext(work_bam)
    out_file = base + ".transcriptome" + ext
    if utils.file_exists(out_file):
        data = dd.set_transcriptome_bam(data, out_file)
        return data
    bowtie2 = config_utils.get_program("bowtie2", data["config"])
    gtf_file = dd.get_gtf_file(data)
    gtf_index = index_transcriptome(gtf_file, ref_file, data)
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    fastq_cmd = "-1 %s" % fastq_file if pair_file else "-U %s" % fastq_file
    pair_cmd = "-2 %s " % pair_file if pair_file else ""
    cmd = ("{bowtie2} -p {num_cores} -a -X 600 --rdg 6,5 --rfg 6,5 --score-min L,-.6,-.4 "
           "--no-discordant --no-mixed -x {gtf_index} {fastq_cmd} {pair_cmd} ")
    with file_transaction(data, out_file) as tx_out_file:
        message = "Aligning %s and %s to the transcriptome." % (fastq_file, pair_file)
        cmd += "| " + postalign.sam_to_sortbam_cl(data, tx_out_file, name_sort=True)
        do.run(cmd.format(**locals()), message)
    data = dd.set_transcriptome_bam(data, out_file)
    return data
def calling(data):
    """Main function to parallelize peak calling."""
    chip_bam = dd.get_work_bam(data)
    if data["work_bam_rep"] != "":
        rep_bam = data["work_bam_rep"]
    else:
        rep_bam = ""
    input_bam = data["work_bam_input"]
    caller_fn = get_callers()[data["rmats_fn"]]
    name = dd.get_sample_name(data)
    fastq_file = fastq.get_fastq_files(data)
    read_len = bam.fastq.estimate_read_length(fastq_file[0])
    # bucket the estimated read length onto the supported values
    if read_len < 50:
        read_len = 50
    elif read_len > 50 and read_len < 75:
        read_len = 75
    else:
        read_len = 100
    if len(fastq_file) > 1:
        read_pair = "paired"
    else:
        read_pair = "single"
    out_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), data["rmats_fn"], name))
    out_file = caller_fn(name, chip_bam, rep_bam, input_bam, dd.get_gtf_file(data),
                         out_dir, read_len, read_pair, data["config"])
    data["rmats_file"] = out_file
    return [[data]]
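# The read-length bucketing in calling() collapses estimated lengths onto the
# values the downstream caller accepts; a standalone restatement for clarity
# (note that an estimate of exactly 50 falls through to 100 in the original
# branching, which may be unintended):
def bucket_read_length_sketch(read_len):
    """Map an estimated read length onto 50/75/100 as in calling() above."""
    if read_len < 50:
        return 50
    if 50 < read_len < 75:
        return 75
    return 100

# bucket_read_length_sketch(36) -> 50; bucket_read_length_sketch(63) -> 75
# bucket_read_length_sketch(50) -> 100 (the edge case noted above)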
def priority_total_coverage(data):
    """
    calculate coverage at 10 depth intervals in the priority regions
    """
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file):
        return data
    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if file_exists(out_file):
        data['priority_total_coverage'] = os.path.abspath(out_file)
        return data
    nthreads = dd.get_num_cores(data)
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    with tx_tmpdir(data, work_dir) as tmp_dir:
        cleaned_bed = os.path.join(tmp_dir, os.path.basename(bed_file))
        cleaned_bed = bed.decomment(bed_file, cleaned_bed)
        with file_transaction(out_file) as tx_out_file:
            cmd = ("{sambamba} depth region -t {nthreads} -L {cleaned_bed} "
                   "-F \"not unmapped\" "
                   "-T 10 -T 20 -T 30 -T 40 -T 50 -T 60 -T 70 -T 80 -T 90 -T 100 "
                   "{in_bam} -o {tx_out_file}")
            message = "Calculating coverage of {bed_file} regions in {in_bam}"
            do.run(cmd.format(**locals()), message.format(**locals()))
    data['priority_total_coverage'] = os.path.abspath(out_file)
    return data
def run(items, config):
    """Run third party disambiguation script, resolving into single set of calls.
    """
    assert len(items) == 2, "Can only resolve two organism disambiguation"
    # check aligner, handling tophat/tophat2 distinctions
    aligner = config["algorithm"].get("aligner")
    aligner = "tophat" if aligner.startswith("tophat") else aligner
    assert aligner in ["bwa", "hisat2", "tophat", "star"], \
        "Disambiguation only supported for bwa, hisat2, star and tophat alignments."
    if items[0]["disambiguate"].get("base"):
        data_a, data_b = items
    else:
        data_b, data_a = items
    work_bam_a = bam.sort(data_a["work_bam"], config, "queryname")
    work_bam_b = bam.sort(data_b["work_bam"], config, "queryname")
    if data_a.get("align_split"):
        base_dir = utils.safe_makedir(os.path.normpath(os.path.join(os.path.dirname(work_bam_a),
                                                                    os.pardir, os.pardir,
                                                                    "disambiguate_%s" % aligner)))
        out_dir = os.path.join(base_dir, "_".join([str(x) for x in data_a["align_split"].split("-")]))
    else:
        out_dir = os.path.normpath(os.path.join(os.path.dirname(work_bam_a),
                                                os.pardir, "disambiguate_%s" % aligner))
    base_name = os.path.join(out_dir, os.path.splitext(os.path.basename(work_bam_a))[0])
    summary_file = "%s_summary.txt" % base_name
    if not utils.file_exists(summary_file):
        with file_transaction(items[0], out_dir) as tx_out_dir:
            _run_cplusplus(work_bam_a, work_bam_b, tx_out_dir, aligner,
                           os.path.basename(base_name), items)
    data_a["disambiguate"] = {
        data_b["genome_build"]: bam.sort("%s.disambiguatedSpeciesB.bam" % base_name, config),
        "%s-ambiguous" % data_a["genome_build"]: bam.sort("%s.ambiguousSpeciesA.bam" % base_name, config),
        "%s-ambiguous" % data_b["genome_build"]: bam.sort("%s.ambiguousSpeciesB.bam" % base_name, config),
        "summary": summary_file}
    data_a["work_bam"] = bam.sort("%s.disambiguatedSpeciesA.bam" % base_name, config)
    bam.index(dd.get_work_bam(data_a), data_a["config"])
    return [[data_a]]
def run_stringtie_expression(data):
    """
    estimate expression from StringTie, using the bcbio datadict
    does not do transcriptome assembly
    """
    bam = dd.get_work_bam(data)
    gtf_file = dd.get_gtf_file(data)
    num_cores = dd.get_num_cores(data)
    sample_name = dd.get_sample_name(data)
    out_dir = os.path.join("stringtie", sample_name)
    isoform_fpkm = os.path.join(out_dir, sample_name + ".isoform.fpkm")
    gene_fpkm = os.path.join(out_dir, sample_name + ".fpkm")
    if file_exists(isoform_fpkm) and file_exists(gene_fpkm):
        data = dd.set_cufflinks_dir(data, out_dir)
        data = dd.set_fpkm(data, gene_fpkm)
        data = dd.set_fpkm_isoform(data, isoform_fpkm)
        return data
    with file_transaction(data, out_dir) as tx_out_dir:
        exon_file, transcript_file = _stringtie_expression(bam, gtf_file, num_cores, tx_out_dir)
        df = _parse_ballgown(transcript_file)
        _write_fpkms(df, tx_out_dir, sample_name)
    data = dd.set_cufflinks_dir(data, out_dir)
    data = dd.set_fpkm(data, gene_fpkm)
    data = dd.set_fpkm_isoform(data, isoform_fpkm)
    return data
def run_stringtie_expression(data):
    """
    estimate expression from StringTie, using the bcbio datadict
    does not do transcriptome assembly
    """
    bam = dd.get_work_bam(data)
    sample_name = dd.get_sample_name(data)
    out_dir = os.path.join("stringtie", sample_name)
    isoform_fpkm = os.path.join(out_dir, sample_name + ".isoform.fpkm")
    gene_fpkm = os.path.join(out_dir, sample_name + ".fpkm")
    assembly = os.path.abspath(os.path.join(out_dir, "stringtie-assembly.gtf"))
    if file_exists(isoform_fpkm) and file_exists(gene_fpkm):
        data = dd.set_stringtie_dir(data, out_dir)
        data = dd.set_fpkm(data, gene_fpkm)
        data = dd.set_fpkm_isoform(data, isoform_fpkm)
        if "stringtie" in dd.get_transcript_assembler(data):
            assembled_gtfs = dd.get_assembled_gtf(data)
            assembled_gtfs.append(assembly)
            data = dd.set_assembled_gtf(data, assembled_gtfs)
        return data
    with file_transaction(data, out_dir) as tx_out_dir:
        transcript_file = _stringtie_expression(bam, data, tx_out_dir)
        df = _parse_ballgown(transcript_file)
        _write_fpkms(df, tx_out_dir, sample_name)
    data = dd.set_stringtie_dir(data, out_dir)
    data = dd.set_fpkm(data, gene_fpkm)
    data = dd.set_fpkm_isoform(data, isoform_fpkm)
    if "stringtie" in dd.get_transcript_assembler(data):
        assembled_gtfs = dd.get_assembled_gtf(data)
        assembled_gtfs.append(assembly)
        data = dd.set_assembled_gtf(data, assembled_gtfs)
    return data
def _combine_qc_samples(samples):
    """Combine split QC analyses into single samples based on BAM files.
    """
    by_bam = collections.defaultdict(list)
    for data in [utils.to_single_data(x) for x in samples]:
        batch = dd.get_batch(data) or dd.get_sample_name(data)
        if not isinstance(batch, (list, tuple)):
            batch = [batch]
        batch = tuple(batch)
        by_bam[(dd.get_align_bam(data) or dd.get_work_bam(data), batch)].append(data)
    out = []
    for data_group in by_bam.values():
        data = data_group[0]
        alg_qc = []
        qc = {}
        metrics = {}
        for d in data_group:
            qc.update(dd.get_summary_qc(d))
            metrics.update(dd.get_summary_metrics(d))
            alg_qc.extend(dd.get_algorithm_qc(d))
        data["config"]["algorithm"]["qc"] = alg_qc
        data["summary"]["qc"] = qc
        data["summary"]["metrics"] = metrics
        out.append([data])
    return out
def calculate_sv_coverage(data):
    """Calculate coverage within bins for downstream CNV calling.

    Creates corrected cnr files with log2 ratios and depths.
    """
    from bcbio.variation import coverage
    from bcbio.structural import annotate, cnvkit
    data = utils.to_single_data(data)
    if not cnvkit.use_general_sv_bins(data):
        return [[data]]
    work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "structural",
                                               dd.get_sample_name(data), "bins"))
    out_target_file = os.path.join(work_dir, "%s-target-coverage.cnn" % dd.get_sample_name(data))
    out_anti_file = os.path.join(work_dir, "%s-antitarget-coverage.cnn" % dd.get_sample_name(data))
    if ((not utils.file_exists(out_target_file) or not utils.file_exists(out_anti_file))
            and (dd.get_align_bam(data) or dd.get_work_bam(data))):
        # mosdepth
        target_cov = coverage.run_mosdepth(data, "target", tz.get_in(["regions", "bins", "target"], data))
        anti_cov = coverage.run_mosdepth(data, "antitarget", tz.get_in(["regions", "bins", "antitarget"], data))
        target_cov_genes = annotate.add_genes(target_cov.regions, data, max_distance=0)
        anti_cov_genes = annotate.add_genes(anti_cov.regions, data, max_distance=0)
        out_target_file = _add_log2_depth(target_cov_genes, out_target_file, data)
        out_anti_file = _add_log2_depth(anti_cov_genes, out_anti_file, data)
        # TODO: Correct for GC bias
    if os.path.exists(out_target_file):
        data["depth"]["bins"] = {"target": out_target_file, "antitarget": out_anti_file}
    return [[data]]
def priority_total_coverage(data, out_dir):
    """
    calculate coverage at 10 depth intervals in the priority regions
    """
    from bcbio.structural import prioritize
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file) or prioritize.is_gene_list(bed_file):
        return {}
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    cleaned_bed = clean_file(bed_file, data, prefix="svprioritize-")
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if utils.file_uptodate(out_file, cleaned_bed) and utils.file_uptodate(out_file, in_bam):
        return out_file
    cmdl = sambamba.make_command(data, "depth region", in_bam, cleaned_bed,
                                 depth_thresholds=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
    with file_transaction(out_file) as tx_out_file:
        message = "Calculating region coverage of {bed_file} in {in_bam}"
        do.run(cmdl + " -o " + tx_out_file, message.format(**locals()))
    logger.debug("Saved svprioritize coverage into " + out_file)
    return out_file
def coverage(data):
    """
    Calculate coverage at different completeness cutoffs
    for regions in the coverage option.
    """
    bed_file = dd.get_coverage(data)
    sambamba = config_utils.get_program("sambamba", data["config"])
    work_dir = safe_makedir(os.path.join(dd.get_work_dir(data), "report", "coverage"))
    if not bed_file:
        return data
    cleaned_bed = os.path.join(work_dir, os.path.splitext(os.path.basename(bed_file))[0] + ".cleaned.bed")
    cleaned_bed = bed.decomment(bed_file, cleaned_bed)
    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        parse_file = os.path.join(sample + "_coverage.bed")
        parse_total_file = os.path.join(sample + "_cov_total.tsv")
        cores = dd.get_num_cores(data)
        if not file_exists(parse_file):
            with tx_tmpdir(data, work_dir) as tmp_dir:
                with file_transaction(parse_file) as out_tx:
                    cmd = ("{sambamba} depth region -F \"not unmapped\" -t {cores} "
                           "%s -T 1 -T 5 -T 10 -T 20 -T 40 -T 50 -T 60 -T 70 "
                           "-T 80 -T 100 -L {cleaned_bed} {in_bam} | sed 's/# "
                           "chrom/chrom/' > {out_tx}")
                    do.run(cmd.format(**locals()) % "-C 1000", "Run coverage for {}".format(sample))
        parse_file = _add_high_covered_regions(parse_file, cleaned_bed, sample)
        _calculate_percentiles(os.path.abspath(parse_file), sample)
        data['coverage'] = os.path.abspath(parse_file)
    return data
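# The command in coverage() is filled in two passes: str.format(**locals())
# expands the {brace} placeholders, then the surviving %s receives the
# max-coverage flag. A self-contained demonstration of that pattern
# (placeholder names here are illustrative only):
template = "tool depth -t {cores} %s -L {bed} {bam}"
partially_filled = template.format(cores=4, bed="regions.bed", bam="sample.bam")
final_cmd = partially_filled % "-C 1000"
assert final_cmd == "tool depth -t 4 -C 1000 -L regions.bed sample.bam"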
def rnaseq_vardict_variant_calling(data):
    sample = dd.get_sample_name(data)
    variation_dir = os.path.join(dd.get_work_dir(data), "variation")
    safe_makedir(variation_dir)
    out_file = os.path.join(variation_dir, sample + "-vardict.vcf.gz")
    if file_exists(out_file):
        data = dd.set_vrn_file(data, out_file)
        return data
    vardict_cmd = vardict.get_vardict_command(data)
    strandbias = "teststrandbias.R"
    var2vcf = "var2vcf_valid.pl"
    vcfstreamsort = config_utils.get_program("vcfstreamsort", data)
    compress_cmd = "| bgzip -c"
    freq = float(dd.get_min_allele_fraction(data, 20) / 100.0)
    var2vcf_opts = "-v 50"
    fix_ambig = vcfutils.fix_ambiguous_cl()
    remove_dup = vcfutils.remove_dup_cl()
    r_setup = ("unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname(Rscript_cmd()))
    ref_file = dd.get_ref_file(data)
    bamfile = dd.get_work_bam(data)
    bed_file = gtf.gtf_to_bed(dd.get_gtf_file(data))
    opts = " -c 1 -S 2 -E 3 -g 4 "
    with file_transaction(out_file) as tx_out_file:
        jvm_opts = vardict._get_jvm_opts(data, tx_out_file)
        cmd = ("{r_setup}{jvm_opts}{vardict_cmd} -G {ref_file} -f {freq} "
               "-N {sample} -b {bamfile} {opts} {bed_file} "
               "| {strandbias} "
               "| {var2vcf} -N {sample} -E -f {freq} {var2vcf_opts} "
               "| {fix_ambig} | {remove_dup} | {vcfstreamsort} {compress_cmd} "
               "> {tx_out_file}")
        message = "Calling RNA-seq variants with VarDict"
        do.run(cmd.format(**locals()), message)
    data = dd.set_vrn_file(data, out_file)
    return data
def split_ATAC(data, bam_file=None):
    """
    splits a BAM into nucleosome-free (NF) and mono/di/tri nucleosome BAMs based
    on the estimated insert sizes
    uses the current working BAM file if no BAM file is supplied
    """
    sambamba = config_utils.get_program("sambamba", data)
    num_cores = dd.get_num_cores(data)
    base_cmd = f'{sambamba} view --format bam --nthreads {num_cores} '
    bam_file = bam_file if bam_file else dd.get_work_bam(data)
    out_stem = os.path.splitext(bam_file)[0]
    split_files = {}
    # we can only split these fractions from paired runs
    if not bam.is_paired(bam_file):
        split_files["full"] = bam_file
        data = tz.assoc_in(data, ['atac', 'align'], split_files)
        return data
    for arange in ATACRanges.values():
        out_file = f"{out_stem}-{arange.label}.bam"
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                cmd = base_cmd + \
                    f'-F "template_length > {arange.min} and template_length < {arange.max}" ' + \
                    f'{bam_file} > {tx_out_file}'
                message = f'Splitting {arange.label} regions from {bam_file}.'
                do.run(cmd, message)
            bam.index(out_file, dd.get_config(data))
        split_files[arange.label] = out_file
    split_files["full"] = bam_file
    data = tz.assoc_in(data, ['atac', 'align'], split_files)
    return data
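# ATACRanges used by split_ATAC() is defined elsewhere in the module; a
# plausible shape (the fragment-length boundaries here are assumptions based
# on common nucleosome periodicity, not necessarily the module's values):
from collections import namedtuple

ATACRange = namedtuple("ATACRange", ["label", "min", "max"])
ATAC_RANGES_SKETCH = {
    "NF": ATACRange("NF", -2147483648, 100),  # nucleosome-free fragments
    "MN": ATACRange("MN", 180, 247),          # mononucleosome
    "DN": ATACRange("DN", 315, 473),          # dinucleosome
    "TN": ATACRange("TN", 558, 615),          # trinucleosome
}
# A very low NF minimum lets reverse-strand mates, whose template_length is
# negative, pass the "template_length > min" half of the sambamba filter.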
def correct_umis(data):
    """Correct umis against the whitelist in correct_umi_file

    http://fulcrumgenomics.github.io/fgbio/tools/latest/CorrectUmis.html
    """
    input_bam = dd.get_work_bam(data)
    output_bam = os.path.join(utils.safe_makedir(os.path.join(os.getcwd(), "align", dd.get_sample_name(data))),
                              "%s-umis_corrected%s" % utils.splitext_plus(os.path.basename(input_bam)))
    jvm_opts = _get_fgbio_jvm_opts(data, os.path.dirname(output_bam), 2)
    # Improve speeds by avoiding compression read/write bottlenecks
    io_opts = "--async-io=true --compression=0"
    umis_whitelist = tz.get_in(["config", "algorithm", "correct_umis"], data)
    fgbio = config_utils.get_program("fgbio", data["config"])
    samtools = config_utils.get_program("samtools", data["config"])
    if not utils.file_exists(output_bam):
        umi_method, umi_tag = _check_umi_type(input_bam)
        cmd = ("unset JAVA_HOME && "
               "{fgbio} {jvm_opts} {io_opts} CorrectUmis "
               "-t {umi_tag} -m 3 -d 1 -x "
               "-U {umis_whitelist} "
               "-i {input_bam} -o /dev/stdout | {samtools} view -bh > {output_bam}")
        do.run(cmd.format(**locals()), "Correcting UMIs")
    bam.index(output_bam, data["config"])
    return output_bam
def count(data):
    """
    count reads mapping to genes using featureCounts
    http://subread.sourceforge.net
    """
    in_bam = dd.get_work_bam(data)
    sorted_bam = bam.sort(in_bam, dd.get_config(data), order="queryname")
    gtf_file = dd.get_gtf_file(data)
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "htseq-count")
    safe_makedir(out_dir)
    count_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts"
    summary_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts.summary"
    if file_exists(count_file):
        return count_file
    featureCounts = config_utils.get_program("featureCounts", dd.get_config(data))
    paired_flag = _paired_flag(in_bam)
    strand_flag = _strand_flag(data)
    filtered_bam = bam.filter_primary(sorted_bam, data)
    cmd = ("{featureCounts} -a {gtf_file} -o {tx_count_file} -s {strand_flag} "
           "{paired_flag} {filtered_bam}")
    message = ("Count reads in {tx_count_file} mapping to {gtf_file} using "
               "featureCounts")
    with file_transaction(data, [count_file, summary_file]) as tx_files:
        tx_count_file, tx_summary_file = tx_files
        do.run(cmd.format(**locals()), message.format(**locals()))
    fixed_count_file = _format_count_file(count_file, data)
    fixed_summary_file = _change_sample_name(summary_file, dd.get_sample_name(data), data=data)
    shutil.move(fixed_count_file, count_file)
    shutil.move(fixed_summary_file, summary_file)
    return count_file
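# _paired_flag() and _strand_flag() come from elsewhere in the module. A
# hedged sketch of what _paired_flag plausibly does: featureCounts needs -p
# (plus -B/-C to require consistently mapped mates) for paired-end BAMs, so
# the helper can probe the first reads with pysam:
import pysam

def paired_flag_sketch(bam_file, n_reads=1000):
    """Return featureCounts pairing flags if the first reads look paired."""
    with pysam.AlignmentFile(bam_file, "rb") as in_handle:
        for i, read in enumerate(in_handle):
            if read.is_paired:
                return "-p -B -C"
            if i >= n_reads:
                break
    return ""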
def run_stringtie_expression(data):
    """
    estimate expression from StringTie, using the bcbio datadict
    does not do transcriptome assembly
    """
    bam = dd.get_work_bam(data)
    gtf_file = dd.get_gtf_file(data)
    num_cores = dd.get_num_cores(data)
    sample_name = dd.get_sample_name(data)
    out_dir = os.path.join("stringtie", sample_name)
    isoform_fpkm = os.path.join(out_dir, sample_name + ".isoform.fpkm")
    gene_fpkm = os.path.join(out_dir, sample_name + ".fpkm")
    assembly = os.path.abspath(os.path.join(out_dir, "stringtie-assembly.gtf"))
    if file_exists(isoform_fpkm) and file_exists(gene_fpkm):
        data = dd.set_cufflinks_dir(data, out_dir)
        data = dd.set_fpkm(data, gene_fpkm)
        data = dd.set_fpkm_isoform(data, isoform_fpkm)
        if "stringtie" in dd.get_transcript_assembler(data):
            dd.get_assembled_gtf(data).append(assembly)
        return data
    with file_transaction(data, out_dir) as tx_out_dir:
        transcript_file = _stringtie_expression(bam, gtf_file, num_cores, tx_out_dir)
        df = _parse_ballgown(transcript_file)
        _write_fpkms(df, tx_out_dir, sample_name)
    data = dd.set_cufflinks_dir(data, out_dir)
    data = dd.set_fpkm(data, gene_fpkm)
    data = dd.set_fpkm_isoform(data, isoform_fpkm)
    if "stringtie" in dd.get_transcript_assembler(data):
        dd.get_assembled_gtf(data).append(assembly)
    return data
def _get_paired_samples(sample, data):
    """Get input sample for each chip bam file."""
    dd.get_phenotype(sample)
    for origin in data:
        if (dd.get_batch(sample) in dd.get_batch(origin[0])
                and dd.get_phenotype(origin[0]) == "input"):
            sample["work_bam_input"] = dd.get_work_bam(origin[0])
    return [sample]
def _regions_for_coverage(data, region, ref_file, out_file):
    """Retrieve BED file of regions we need to calculate coverage in.

    Checks for variant region specifications that do not overlap contigs
    (in which case we do not calculate coverage) and regions smaller than
    callable_min_size (in which case we assign everything as callable).
    callable_min_size avoids calculations for small chromosomes we won't
    split on later, saving computation and disk IO.
    """
    variant_regions = bedutils.merge_overlaps(dd.get_variant_regions(data), data)
    ready_region = shared.subset_variant_regions(variant_regions, region, out_file)
    custom_file = "%s-coverageregions.bed" % utils.splitext_plus(out_file)[0]
    region_size = _get_region_size(ref_file, data, region)
    if variant_regions is None and region_size is not None and region_size < dd.get_callable_min_size(data):
        coverage_str = "CALLABLE" if realign.has_aligned_reads(dd.get_work_bam(data), region) else "NO_COVERAGE"
        custom_file = _write_all_chrom_file(coverage_str, custom_file, ref_file, region, data)
        return custom_file, False
    elif not ready_region:
        get_ref_bedtool(ref_file, data["config"]).saveas(custom_file)
        return custom_file, True
    elif os.path.isfile(ready_region):
        return ready_region, True
    elif isinstance(ready_region, (list, tuple)):
        c, s, e = ready_region
        pybedtools.BedTool("%s\t%s\t%s\n" % (c, s, e), from_string=True).saveas(custom_file)
        return custom_file, True
    else:
        custom_file = _write_all_chrom_file("NO_COVERAGE", custom_file, ref_file, region, data)
        return custom_file, variant_regions is None
def parallel_calling(data, run_parallel):
    """This is needed only if running methylated versus hydroxy-methylated"""
    out = []
    for sample in data:
        work_bam = dd.get_work_bam(sample[0])
        with closing(pysam.Samfile(work_bam, "rb")) as pysam_work_bam:
            chroms = pysam_work_bam.references
            for chrom in chroms:
                new_sample = copy.deepcopy(sample)
                if chrom.find("_") > -1:
                    continue
                new_sample[0]['chr_to_run'] = chrom
                out.append(new_sample)
    out = run_parallel("cpg_calling", out)
    for sample in out:
        phenotype = dd.get_phenotype(sample[0])
        batch = dd.get_batch(sample[0])
        if phenotype == "mC":
            for sample2 in out:
                if batch in dd.get_batch(sample2[0]) and dd.get_phenotype(sample2[0]) == "hmC":
                    if sample[0]["chr_to_run"] == sample2[0]["chr_to_run"]:
                        sample[0]["control"] = sample2[0]["cpg_file"]
                        break
    out = run_parallel("cpg_processing", out)
    for sample in data:
        sample[0]["cpg_split"] = []
        sample[0]["hmc_split"] = []
        name = dd.get_sample_name(sample[0])
        for chunk in out:
            if name == dd.get_sample_name(chunk[0]):
                sample[0]["cpg_split"].append(chunk[0]["cpg_file"])
                if "hmc_file" in chunk[0]:
                    sample[0]["hmc_split"].append(chunk[0]["hmc_file"])
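# The per-chromosome fan-out in parallel_calling() comes straight from the BAM
# header; a minimal self-contained version of that enumeration (skipping
# alt/random contigs whose names contain "_", as above) would be:
import pysam

def primary_chromosomes_sketch(bam_file):
    """List primary contig names from a BAM header."""
    with pysam.AlignmentFile(bam_file, "rb") as in_handle:
        return [c for c in in_handle.references if "_" not in c]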
def _get_original_coverage(data, itype="target"): """Back compatible: get existing coverage files if they exist """ work_dir = os.path.join(_sv_workdir(data), "raw") work_bam = dd.get_work_bam(data) or dd.get_align_bam(data) out = [] base, _ = _bam_to_outbase(work_bam, work_dir, data) target_cnn = "%s.targetcoverage.cnn" % base anti_cnn = "%s.antitargetcoverage.cnn" % base if os.path.exists(target_cnn) and os.path.exists(anti_cnn): out.append({ "bam": work_bam, "file": target_cnn, "cnntype": "target", "itype": itype, "sample": dd.get_sample_name(data) }) out.append({ "bam": work_bam, "file": anti_cnn, "cnntype": "antitarget", "itype": itype, "sample": dd.get_sample_name(data) }) return out
def pipeline_summary(data):
    """Provide summary information on processing sample.

    Handles standard and CWL (single QC output) cases.
    """
    data = utils.to_single_data(data)
    if data["analysis"].startswith("wgbs-seq"):
        bismark_bam = dd.get_align_bam(data)
        sorted_bam = bam.sort(bismark_bam, data["config"])
        data = dd.set_align_bam(data, sorted_bam)
        data = dd.set_work_bam(data, bismark_bam)
    work_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    if not work_bam or not work_bam.endswith(".bam"):
        work_bam = None
    if dd.get_ref_file(data):
        # kraken doesn't need a BAM
        if work_bam or (tz.get_in(["config", "algorithm", "kraken"], data)):
            logger.info("QC: %s %s" % (dd.get_sample_name(data), ", ".join(dd.get_algorithm_qc(data))))
            work_data = cwlutils.unpack_tarballs(utils.deepish_copy(data), data)
            data["summary"] = _run_qc_tools(work_bam, work_data)
            if (len(dd.get_algorithm_qc(data)) == 1 and "output_cwl_keys" in data):
                data["summary"]["qc"] = data["summary"]["qc"].get(dd.get_algorithm_qc(data)[0])
    return [[data]]
def _get_files(data):
    work_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    out_dir = utils.safe_makedir(os.path.join(tz.get_in(["dirs", "work"], data),
                                              "align", dd.get_sample_name(data)))
    out_file = "%s-highdepth.bed" % os.path.join(out_dir,
                                                 utils.splitext_plus(os.path.basename(work_bam))[0])
    stats_file = "%s-stats.yaml" % utils.splitext_plus(out_file)[0]
    return work_bam, out_file, stats_file
def create_peaktable(samples):
    """create a table of peak counts per sample to use with differential peak calling
    """
    data = dd.get_data_from_sample(samples[0])
    peakcounts = []
    out_dir = os.path.join(dd.get_work_dir(data), "consensus")
    out_file = os.path.join(out_dir, "consensus-counts.tsv")
    if dd.get_chip_method(data) == "chip":
        for data in dd.sample_data_iterator(samples):
            peakcounts.append(tz.get_in(("peak_counts",), data))
    elif dd.get_chip_method(data) == "atac":
        for data in dd.sample_data_iterator(samples):
            if bam.is_paired(dd.get_work_bam(data)):
                peakcounts.append(tz.get_in(("peak_counts", "NF"), data))
            else:
                logger.info(f"Creating peak table from full BAM file because "
                            f"{dd.get_work_bam(data)} is single-ended.")
                peakcounts.append(tz.get_in(("peak_counts", "full"), data))
    combined_peaks = count.combine_count_files(peakcounts, out_file, ext=".counts")
    new_data = []
    for data in dd.sample_data_iterator(samples):
        data = tz.assoc_in(data, ("peak_counts", "peaktable"), combined_peaks)
        new_data.append(data)
    new_samples = dd.get_samples_from_datalist(new_data)
    return new_samples
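# count.combine_count_files() is a bcbio helper; a hedged sketch of the merge
# it plausibly performs, assuming per-sample two-column files
# (<feature_id>\t<count>) named <sample>.counts, joined on feature id:
import os
import pandas as pd

def combine_count_files_sketch(count_files, out_file, ext=".counts"):
    """Join per-sample count files column-wise on their feature ids."""
    frames = []
    for fname in count_files:
        sample = os.path.basename(fname).replace(ext, "")
        frames.append(pd.read_csv(fname, sep="\t", header=None,
                                  names=["id", sample], index_col="id"))
    pd.concat(frames, axis=1).to_csv(out_file, sep="\t")
    return out_file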
def align_transcriptome(fastq_file, pair_file, ref_file, data):
    """
    bwa mem with settings for aligning to the transcriptome for eXpress/RSEM/etc
    """
    work_bam = dd.get_work_bam(data)
    base, ext = os.path.splitext(work_bam)
    out_file = base + ".transcriptome" + ext
    if utils.file_exists(out_file):
        data = dd.set_transcriptome_bam(data, out_file)
        return data
    # bwa mem needs phred+33 quality, so convert if it is Illumina
    if dd.get_quality_format(data).lower() == "illumina":
        logger.info("bwa mem does not support the phred+64 quality format, "
                    "converting %s and %s to phred+33." % (fastq_file, pair_file))
        fastq_file = fastq.groom(fastq_file, data, in_qual="fastq-illumina")
        if pair_file:
            pair_file = fastq.groom(pair_file, data, in_qual="fastq-illumina")
    bwa = config_utils.get_program("bwa", data["config"])
    gtf_file = dd.get_gtf_file(data)
    gtf_fasta = index_transcriptome(gtf_file, ref_file, data)
    args = " ".join(_bwa_args_from_config(data["config"]))
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    samtools = config_utils.get_program("samtools", data["config"])
    cmd = ("{bwa} mem {args} -a -t {num_cores} {gtf_fasta} {fastq_file} "
           "{pair_file} | {samtools} view -bhS - > {tx_out_file}")
    with file_transaction(data, out_file) as tx_out_file:
        message = "Aligning %s and %s to the transcriptome." % (fastq_file, pair_file)
        do.run(cmd.format(**locals()), message)
    data = dd.set_transcriptome_bam(data, out_file)
    return data
def cufflinks_assemble(data):
    bam_file = dd.get_work_bam(data)
    ref_file = dd.get_sam_ref(data)
    out_dir = os.path.join(dd.get_work_dir(data), "assembly")
    num_cores = dd.get_num_cores(data)
    assembled_gtf = cufflinks.assemble(bam_file, ref_file, num_cores, out_dir, data)
    data = dd.set_assembled_gtf(data, assembled_gtf)
    return [[data]]