def count(data):
    """
    count reads mapping to genes using featureCounts
    http://subread.sourceforge.net
    """
    in_bam = dd.get_work_bam(data) or dd.get_align_bam(data)
    out_dir = os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data))
    if dd.get_aligner(data) == "star":
        out_dir = os.path.join(out_dir, "%s_%s" % (dd.get_sample_name(data), dd.get_aligner(data)))
    sorted_bam = bam.sort(in_bam, dd.get_config(data), order="queryname",
                          out_dir=safe_makedir(out_dir))
    gtf_file = dd.get_transcriptome_gtf(data, default=dd.get_gtf_file(data))
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "htseq-count")
    safe_makedir(out_dir)
    count_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts"
    summary_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts.summary"
    if file_exists(count_file) and _is_fixed_count_file(count_file):
        return count_file
    featureCounts = config_utils.get_program("featureCounts", dd.get_config(data))
    paired_flag = _paired_flag(in_bam)
    strand_flag = _strand_flag(data)
    filtered_bam = bam.filter_primary(sorted_bam, data)
    cmd = ("{featureCounts} -a {gtf_file} -o {tx_count_file} -s {strand_flag} "
           "{paired_flag} {filtered_bam}")
    resources = config_utils.get_resources("featureCounts", data["config"])
    if resources:
        options = resources.get("options")
        if options:
            cmd += " %s" % " ".join([str(x) for x in options])
    message = ("Count reads in {tx_count_file} mapping to {gtf_file} using "
               "featureCounts")
    with file_transaction(data, [count_file, summary_file]) as tx_files:
        tx_count_file, tx_summary_file = tx_files
        do.run(cmd.format(**locals()), message.format(**locals()))
    fixed_count_file = _format_count_file(count_file, data)
    fixed_summary_file = _change_sample_name(summary_file, dd.get_sample_name(data), data=data)
    shutil.move(fixed_count_file, count_file)
    shutil.move(fixed_summary_file, summary_file)
    return count_file

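
# The _paired_flag and _strand_flag helpers referenced above are not part of this
# excerpt. Below is a minimal, hypothetical sketch of what they might look like,
# assuming featureCounts' standard flags (-p for paired-end counting, -s 0/1/2 for
# strandedness) and a dd.get_strandedness accessor; the actual bcbio implementations
# may differ.
def _paired_flag_example(bam_file):
    # featureCounts expects -p when counting paired-end fragments.
    return "-p" if bam.is_paired(bam_file) else ""

def _strand_flag_example(data):
    # Map bcbio strandedness settings onto featureCounts -s values
    # (unstranded -> 0, dUTP/firststrand -> 2, secondstrand -> 1).
    strand_map = {"unstranded": "0", "firststrand": "2", "secondstrand": "1"}
    return strand_map.get(dd.get_strandedness(data), "0")
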
def _variant_checkpoints(samples):
    """Check sample configuration to identify required steps in analysis.
    """
    checkpoints = {}
    checkpoints["vc"] = any([dd.get_variantcaller(d) for d in samples])
    checkpoints["sv"] = any([dd.get_svcaller(d) for d in samples])
    checkpoints["jointvc"] = any([(dd.get_jointcaller(d) or ("gvcf" in dd.get_tools_on(d)))
                                  and dd.get_batch(d) for d in samples])
    checkpoints["hla"] = any([dd.get_hlacaller(d) for d in samples])
    checkpoints["align"] = any([(dd.get_aligner(d) or dd.get_bam_clean(d)) for d in samples])
    checkpoints["align_split"] = not all([(dd.get_align_split_size(d) is False
                                           or not dd.get_aligner(d))
                                          for d in samples])
    return checkpoints

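
# A minimal, hypothetical sketch of how the checkpoint flags computed above might be
# consumed to decide which pipeline stages to schedule. The stage names and the
# run_stage helper are illustrative, not bcbio's actual pipeline wiring.
def _plan_stages_example(samples, run_stage):
    checkpoints = _variant_checkpoints([utils.to_single_data(x) for x in samples])
    stages = []
    if checkpoints["align"]:
        stages.append("alignment")
        if checkpoints["align_split"]:
            stages.append("alignment_split_merge")
    if checkpoints["vc"]:
        stages.append("variant_calling")
    if checkpoints["jointvc"]:
        stages.append("joint_genotyping")
    if checkpoints["sv"]:
        stages.append("structural_variants")
    if checkpoints["hla"]:
        stages.append("hla_typing")
    for stage in stages:
        samples = run_stage(stage, samples)
    return samples
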
def clean_chipseq_alignment(data):
    aligner = dd.get_aligner(data)
    data["align_bam"] = dd.get_work_bam(data)
    if dd.get_mark_duplicates(data):
        if aligner:
            if aligner == "bowtie2":
                filterer = bowtie2.filter_multimappers
            elif aligner == "bwa":
                filterer = bwa.filter_multimappers
            else:
                logger.error("ChIP-seq only supported for bowtie2 and bwa.")
                sys.exit(-1)
            unique_bam = filterer(dd.get_work_bam(data), data)
            data["work_bam"] = unique_bam
        else:
            logger.info("Warning: When a BAM file is given as input, bcbio skips multimapper removal. "
                        "If the BAM is not cleaned for peak calling, this can result in downstream errors.")
    # lcr_bed = utils.get_in(data, ("genome_resources", "variation", "lcr"))
    data["work_bam"] = _keep_assembled_chrom(dd.get_work_bam(data), dd.get_ref_file(data),
                                             data["config"])
    encode_bed = tz.get_in(["genome_resources", "variation", "encode_blacklist"], data)
    if encode_bed:
        data["work_bam"] = _prepare_bam(dd.get_work_bam(data), encode_bed, data['config'])
    bam.index(data["work_bam"], data['config'])
    data["bigwig"] = _bam_coverage(dd.get_sample_name(data), dd.get_work_bam(data), data)
    return [[data]]

def _variant_checkpoints(samples):
    """Check sample configuration to identify required steps in analysis.
    """
    checkpoints = {}
    checkpoints["vc"] = any([dd.get_variantcaller(d) or d.get("vrn_file") for d in samples])
    checkpoints["sv"] = any([dd.get_svcaller(d) for d in samples])
    checkpoints["jointvc"] = any([(dd.get_jointcaller(d) or "gvcf" in dd.get_tools_on(d))
                                  for d in samples])
    checkpoints["hla"] = any([dd.get_hlacaller(d) for d in samples])
    checkpoints["align"] = any([(dd.get_aligner(d) or dd.get_bam_clean(d)) for d in samples])
    checkpoints["align_split"] = not all([(dd.get_align_split_size(d) is False
                                           or not dd.get_aligner(d))
                                          for d in samples])
    checkpoints["umi"] = any([dd.get_umi_consensus(d) for d in samples])
    checkpoints["ensemble"] = any([dd.get_ensemble(d) for d in samples])
    checkpoints["cancer"] = any(dd.get_phenotype(d) in ["tumor"] for d in samples)
    return checkpoints

def clean_chipseq_alignment(data):
    aligner = dd.get_aligner(data)
    assert aligner == "bowtie2", "ChIP-seq only supported for bowtie2."
    if aligner == "bowtie2":
        unique_bam = filter_multimappers(dd.get_work_bam(data), data)
        data["work_bam"] = unique_bam
        return [[data]]
    return [[data]]

def _nosort_tobam_cmd(data):
    """Handle converting to BAM for queryname sorted inputs, correcting HD headers.
    """
    if dd.get_aligner(data).startswith("bwa"):
        # SAM header fields are tab-delimited
        fix_hd = "(echo '@HD\tVN:1.3\tSO:queryname' && cat) | "
    else:
        fix_hd = "sed 's/SO:unsorted/SO:queryname/g' | "
    return fix_hd + "{samtools} view -b - -o {out_file}"

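
# Hypothetical usage sketch: the returned string is a template with {samtools} and
# {out_file} placeholders, so callers are assumed to fill it with str.format (the
# cmd.format(**locals()) pattern used elsewhere in this excerpt) and append it to the
# aligner's SAM output pipe. The variable names below are illustrative.
def _nosort_tobam_example(data, aligner_sam_cmd, out_file):
    samtools = config_utils.get_program("samtools", data["config"])
    tobam_cmd = _nosort_tobam_cmd(data).format(samtools=samtools, out_file=out_file)
    return aligner_sam_cmd + " | " + tobam_cmd
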
def _variant_checkpoints(samples):
    """Check sample configuration to identify required steps in analysis.
    """
    checkpoints = {}
    checkpoints["vc"] = any([dd.get_variantcaller(d) or d.get("vrn_file") for d in samples])
    checkpoints["sv"] = any([dd.get_svcaller(d) for d in samples])
    checkpoints["jointvc"] = any([(dd.get_jointcaller(d) or "gvcf" in dd.get_tools_on(d))
                                  for d in samples])
    checkpoints["hla"] = any([dd.get_hlacaller(d) for d in samples])
    checkpoints["align"] = any([(dd.get_aligner(d) or dd.get_bam_clean(d)) for d in samples])
    checkpoints["align_split"] = not all([(dd.get_align_split_size(d) is False
                                           or not dd.get_aligner(d))
                                          for d in samples])
    checkpoints["archive"] = any([dd.get_archive(d) for d in samples])
    checkpoints["umi"] = any([dd.get_umi_consensus(d) for d in samples])
    checkpoints["ensemble"] = any([dd.get_ensemble(d) for d in samples])
    checkpoints["cancer"] = any(dd.get_phenotype(d) in ["tumor"] for d in samples)
    return checkpoints

def clean_chipseq_alignment(data):
    aligner = dd.get_aligner(data)
    data["raw_bam"] = dd.get_work_bam(data)
    if aligner:
        assert aligner == "bowtie2", "ChIP-seq only supported for bowtie2."
        unique_bam = filter_multimappers(dd.get_work_bam(data), data)
        data["work_bam"] = unique_bam
    else:
        logger.info("Warning: When a BAM file is given as input, bcbio skips multimapper removal. "
                    "If the BAM is not cleaned for peak calling, this can result in downstream errors.")
    return [[data]]

def _setup_reference_files(data, tx_out_dir):
    """Create a reference directory with fasta and bwa indices.

    GRIDSS requires all files in a single directory, so setup with symlinks.
    """
    aligner = dd.get_aligner(data)
    out_dir = utils.safe_makedir(os.path.join(tx_out_dir, aligner))
    ref_fasta = dd.get_ref_file(data)
    ref_files = ["%s%s" % (utils.splitext_plus(ref_fasta)[0], ext)
                 for ext in [".fa", ".fa.fai", ".dict"]]
    for orig_file in ref_files + tz.get_in(("reference", aligner, "indexes"), data):
        utils.symlink_plus(orig_file, os.path.join(out_dir, os.path.basename(orig_file)))
    return os.path.join(out_dir, os.path.basename(ref_fasta))

def count(data):
    """
    count reads mapping to genes using featureCounts
    http://subread.sourceforge.net
    """
    in_bam = dd.get_work_bam(data) or dd.get_align_bam(data)
    out_dir = os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data))
    if dd.get_aligner(data) == "star":
        out_dir = os.path.join(out_dir, "%s_%s" % (dd.get_sample_name(data), dd.get_aligner(data)))
    sorted_bam = bam.sort(in_bam, dd.get_config(data), order="queryname",
                          out_dir=safe_makedir(out_dir))
    gtf_file = dd.get_gtf_file(data)
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "htseq-count")
    safe_makedir(out_dir)
    count_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts"
    summary_file = os.path.join(out_dir, dd.get_sample_name(data)) + ".counts.summary"
    if file_exists(count_file) and _is_fixed_count_file(count_file):
        return count_file
    featureCounts = config_utils.get_program("featureCounts", dd.get_config(data))
    paired_flag = _paired_flag(in_bam)
    strand_flag = _strand_flag(data)
    filtered_bam = bam.filter_primary(sorted_bam, data)
    cmd = ("{featureCounts} -a {gtf_file} -o {tx_count_file} -s {strand_flag} "
           "{paired_flag} {filtered_bam}")
    message = ("Count reads in {tx_count_file} mapping to {gtf_file} using "
               "featureCounts")
    with file_transaction(data, [count_file, summary_file]) as tx_files:
        tx_count_file, tx_summary_file = tx_files
        do.run(cmd.format(**locals()), message.format(**locals()))
    fixed_count_file = _format_count_file(count_file, data)
    fixed_summary_file = _change_sample_name(summary_file, dd.get_sample_name(data), data=data)
    shutil.move(fixed_count_file, count_file)
    shutil.move(fixed_summary_file, summary_file)
    return count_file

def _get_bwa_mem_cmd(data, out_file, ref_file, fastq1, fastq2="", with_hla=False):
    """Perform piped bwa mem mapping potentially with alternative alleles in GRCh38 + HLA typing.

    Commands for HLA post-processing:
       base=TEST
       run-HLA $base.hla > $base.hla.top
       cat $base.hla.HLA*.gt | grep ^GT | cut -f2- > $base.hla.all
       rm -f $base.hla.HLA*gt
       rm -f $base.hla.HLA*gz
    """
    alt_file = ref_file + ".alt"
    if with_hla:
        bwakit_dir = os.path.dirname(os.path.realpath(utils.which("run-bwamem")))
        hla_base = os.path.join(utils.safe_makedir(os.path.join(os.path.dirname(out_file), "hla")),
                                os.path.basename(out_file) + ".hla")
        alt_cmd = (" | {bwakit_dir}/k8 {bwakit_dir}/bwa-postalt.js -p {hla_base} {alt_file}")
    else:
        alt_cmd = ""
    if dd.get_aligner(data) == "sentieon-bwa":
        bwa_exe = "sentieon-bwa"
        exports = sentieon.license_export(data)
    else:
        bwa_exe = "bwa"
        exports = ""
    bwa = config_utils.get_program(bwa_exe, data["config"])
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    bwa_resources = config_utils.get_resources("bwa", data["config"])
    bwa_params = (" ".join([str(x) for x in bwa_resources.get("options", [])])
                  if "options" in bwa_resources else "")
    rg_info = novoalign.get_rg_info(data["rgnames"])
    # For UMI runs, pass along consensus tags
    c_tags = "-C" if "umi_bam" in data else ""
    pairing = "-p" if not fastq2 else ""
    # Restrict seed occurrences to 1/2 of default, manage memory usage for centromere repeats in hg38
    # https://sourceforge.net/p/bio-bwa/mailman/message/31514937/
    # http://ehc.ac/p/bio-bwa/mailman/message/32268544/
    mem_usage = "-c 250"
    bwa_cmd = ("{exports}{bwa} mem {pairing} {c_tags} {mem_usage} -M -t {num_cores} {bwa_params} -R '{rg_info}' "
               "-v 1 {ref_file} {fastq1} {fastq2} ")
    return (bwa_cmd + alt_cmd).format(**locals())

def generate_transcript_counts(data):
    """Generate counts per transcript and per exon from an alignment"""
    data["count_file"] = featureCounts.count(data)
    if dd.get_fusion_mode(data, False):
        oncofuse_file = oncofuse.run(data)
        if oncofuse_file:
            data["oncofuse_file"] = oncofuse_file
    if dd.get_dexseq_gff(data, None):
        data = dd.set_dexseq_counts(data, dexseq.bcbio_run(data))
    # if RSEM was run, stick the transcriptome BAM file into the datadict
    if dd.get_aligner(data).lower() == "star" and dd.get_rsem(data):
        base, ext = os.path.splitext(dd.get_work_bam(data))
        data = dd.set_transcriptome_bam(data, base + ".transcriptome" + ext)
    return [[data]]

def quantitate(data):
    """CWL target for quantitation.

    XXX Needs to be split and parallelized by expression caller, with merging
    of multiple calls.
    """
    data = to_single_data(to_single_data(data))
    data = generate_transcript_counts(data)[0][0]
    data["quant"] = {}
    if "sailfish" in dd.get_expression_caller(data):
        data = to_single_data(sailfish.run_sailfish(data)[0])
        data["quant"]["tsv"] = data["sailfish"]
        data["quant"]["hdf5"] = os.path.join(os.path.dirname(data["sailfish"]), "abundance.h5")
    if ("kallisto" in dd.get_expression_caller(data)
          or "pizzly" in dd.get_fusion_caller(data, [])):
        data = to_single_data(kallisto.run_kallisto_rnaseq(data)[0])
        data["quant"]["tsv"] = os.path.join(data["kallisto_quant"], "abundance.tsv")
        data["quant"]["hdf5"] = os.path.join(data["kallisto_quant"], "abundance.h5")
        if os.path.exists(os.path.join(data["kallisto_quant"], "fusion.txt")):
            data["quant"]["fusion"] = os.path.join(data["kallisto_quant"], "fusion.txt")
        else:
            data["quant"]["fusion"] = None
    if "salmon" in dd.get_expression_caller(data):
        if dd.get_quantify_genome_alignments(data):
            if dd.get_aligner(data).lower() != "star":
                if dd.get_genome_build(data) == "hg38":
                    logger.warning("Whole genome alignment-based Salmon quantification is "
                                   "only supported for the STAR aligner. Since this is hg38 we will fall "
                                   "back to the decoy method")
                    data = to_single_data(salmon.run_salmon_decoy(data)[0])
                else:
                    logger.warning("Whole genome alignment-based Salmon quantification is "
                                   "only supported for the STAR aligner. Falling back to the "
                                   "transcriptome-only method.")
                    data = to_single_data(salmon.run_salmon_reads(data)[0])
            else:
                data = to_single_data(salmon.run_salmon_bam(data)[0])
        else:
            data = to_single_data(salmon.run_salmon_reads(data)[0])
        data["quant"]["tsv"] = data["salmon"]
        data["quant"]["hdf5"] = os.path.join(os.path.dirname(data["salmon"]), "abundance.h5")
    return [[data]]

def run_count(bam_file, dexseq_gff, stranded, out_file, data):
    """
    run dexseq_count on a BAM file
    """
    assert file_exists(bam_file), "%s does not exist." % bam_file
    sort_order = bam._get_sort_order(bam_file, {})
    assert sort_order, "Cannot determine sort order of %s." % bam_file
    strand_flag = _strand_flag(stranded)
    assert strand_flag, "%s is not a valid strandedness value." % stranded
    if not dexseq_gff:
        logger.info("No DEXSeq GFF file was found, skipping exon-level counting.")
        return None
    elif not file_exists(dexseq_gff):
        logger.info("%s was not found, so exon-level counting is being "
                    "skipped." % dexseq_gff)
        return None
    dexseq_count = _dexseq_count_path()
    if not dexseq_count:
        logger.info("DEXseq is not installed, skipping exon-level counting.")
        return None
    if dd.get_aligner(data) == "bwa":
        logger.info("Can't use DEXSeq with bwa alignments, skipping exon-level counting.")
        return None
    sort_flag = "name" if sort_order == "queryname" else "pos"
    is_paired = bam.is_paired(bam_file)
    paired_flag = "yes" if is_paired else "no"
    anaconda = os.path.dirname(os.path.realpath(sys.executable))
    r36_python = os.path.join(anaconda, "..", "envs", "r36", "bin", "python")
    if file_exists(out_file):
        return out_file
    cmd = ("{r36_python} {dexseq_count} -f bam -r {sort_flag} -p {paired_flag} "
           "-s {strand_flag} {dexseq_gff} {bam_file} {tx_out_file}")
    message = "Counting exon-level counts with %s and %s." % (bam_file, dexseq_gff)
    with file_transaction(data, out_file) as tx_out_file:
        do.run(cmd.format(**locals()), message)
    return out_file

def remove_multimappers(bam_file, data):
    aligner = dd.get_aligner(data)
    if aligner:
        if aligner == "bowtie2":
            filterer = bowtie2.filter_multimappers
        elif aligner == "bwa":
            filterer = bwa.filter_multimappers
        else:
            logger.error("ChIP-seq only supported for bowtie2 and bwa.")
            sys.exit(-1)
        unique_bam = filterer(bam_file, data)
    else:
        unique_bam = bam_file
        logger.warn("When a BAM file is given as input, bcbio skips removal of "
                    "multimappers.")
    return unique_bam

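
# A hypothetical sketch of how the ChIP-seq cleanup step earlier in this excerpt
# (clean_chipseq_alignment) could delegate to remove_multimappers instead of
# repeating the aligner dispatch. _keep_assembled_chrom and the dd accessors come
# from this excerpt; the wiring itself is illustrative.
def clean_chipseq_alignment_example(data):
    data["align_bam"] = dd.get_work_bam(data)
    data["work_bam"] = remove_multimappers(dd.get_work_bam(data), data)
    data["work_bam"] = _keep_assembled_chrom(data["work_bam"], dd.get_ref_file(data),
                                             data["config"])
    return [[data]]
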
def _check_dedup(data):
    """Check configuration for de-duplication.

    Defaults to no de-duplication for RNA-seq and small RNA, the back compatible
    default. Allow overwriting with an explicit `mark_duplicates: true` setting.
    Also defaults to false for no alignment inputs.
    """
    if dd.get_analysis(data).lower() in ["rna-seq", "smallrna-seq"] or not dd.get_aligner(data):
        dup_param = utils.get_in(data, ("config", "algorithm", "mark_duplicates"), False)
    else:
        dup_param = utils.get_in(data, ("config", "algorithm", "mark_duplicates"), True)
    if dup_param and isinstance(dup_param, six.string_types):
        logger.info("Warning: bcbio no longer supports explicit setting of the mark_duplicates "
                    "algorithm. Using best-practice choice based on input data.")
        dup_param = True
    return dup_param

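
# A minimal illustration (hypothetical YAML) of the user-facing setting this reads:
# `mark_duplicates` lives under the sample's algorithm block, and the default flips
# depending on the analysis type and whether an aligner is configured.
#
#   details:
#     - analysis: RNA-seq
#       algorithm:
#         aligner: star
#         mark_duplicates: true   # override the RNA-seq default of false
#
# With `mark_duplicates` omitted, RNA-seq and smallRNA-seq samples (or samples
# without an aligner) default to False; other analyses default to True.
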
def quantitate_expression_parallel(samples, run_parallel):
    """
    quantitate expression, all programs run here should be multithreaded to
    take advantage of the threaded run_parallel environment
    """
    data = samples[0][0]
    to_index = determine_indexes_to_make(samples)
    samples = run_parallel("generate_transcript_counts", samples)
    if "cufflinks" in dd.get_expression_caller(data):
        samples = run_parallel("run_cufflinks", samples)
    if "stringtie" in dd.get_expression_caller(data):
        samples = run_parallel("run_stringtie_expression", samples)
    if ("kallisto" in dd.get_expression_caller(data) or dd.get_fusion_mode(data)
          or "pizzly" in dd.get_fusion_caller(data, [])):
        run_parallel("run_kallisto_index", [to_index])
        samples = run_parallel("run_kallisto_rnaseq", samples)
    if "sailfish" in dd.get_expression_caller(data):
        run_parallel("run_sailfish_index", [to_index])
        samples = run_parallel("run_sailfish", samples)
    # always run salmon
    run_parallel("run_salmon_index", [to_index])
    if dd.get_quantify_genome_alignments(data):
        if dd.get_aligner(data).lower() != "star":
            if dd.get_genome_build(data) == "hg38":
                logger.warning("Whole genome alignment-based Salmon quantification is "
                               "only supported for the STAR aligner. Since this is hg38 we will fall "
                               "back to the decoy method")
                samples = run_parallel("run_salmon_decoy", samples)
            else:
                logger.warning("Whole genome alignment-based Salmon quantification is "
                               "only supported for the STAR aligner. Falling back to the "
                               "transcriptome-only method.")
                samples = run_parallel("run_salmon_reads", samples)
        else:
            samples = run_parallel("run_salmon_bam", samples)
    else:
        samples = run_parallel("run_salmon_reads", samples)
    samples = run_parallel("detect_fusions", samples)
    return samples

def run(data):
    if not aligner_supports_fusion(data):
        aligner = dd.get_aligner(data)
        logger.warning("Oncofuse is not supported for the %s aligner, "
                       "skipping." % aligner)
        return None
    config = data["config"]
    genome_build = data.get("genome_build", "")
    input_type, input_dir, input_file = _get_input_para(data)
    if genome_build == "GRCh37":  # assume genome_build is hg19 otherwise
        if config["algorithm"].get("aligner") in ["star"]:
            if file_exists(input_file):
                input_file = _fix_star_junction_output(input_file)
        if config["algorithm"].get("aligner") in ["tophat", "tophat2"]:
            if file_exists(input_file):
                input_file = _fix_tophat_junction_output(input_file)
    elif "hg19" not in genome_build:
        return None
    # handle cases when fusion file doesn't exist
    if not file_exists(input_file):
        return None
    out_file = os.path.join(input_dir, "oncofuse_out.txt")
    if file_exists(out_file):
        return out_file
    oncofuse = config_utils.get_program("oncofuse", config)
    tissue_type = _oncofuse_tissue_arg_from_config(data)
    resources = config_utils.get_resources("oncofuse", config)
    if not file_exists(out_file):
        cl = [oncofuse]
        cl += resources.get("jvm_opts", ["-Xms750m", "-Xmx5g"])
        with file_transaction(data, out_file) as tx_out_file:
            cl += [input_file, input_type, tissue_type, tx_out_file]
            cmd = " ".join(cl)
            try:
                do.run(cmd, "oncofuse fusion detection", data)
            except:
                do.run("touch %s && echo '# failed' >> %s" % (tx_out_file, tx_out_file),
                       "oncofuse failed", data)
    return out_file

def _get_bwa_mem_cmd(data, out_file, ref_file, fastq1, fastq2=""):
    """Perform piped bwa mem mapping potentially with alternative alleles in GRCh38 + HLA typing.

    Commands for HLA post-processing:
       base=TEST
       run-HLA $base.hla > $base.hla.top
       cat $base.hla.HLA*.gt | grep ^GT | cut -f2- > $base.hla.all
       rm -f $base.hla.HLA*gt
       rm -f $base.hla.HLA*gz
    """
    alt_file = ref_file + ".alt"
    if utils.file_exists(alt_file) and dd.get_hlacaller(data):
        bwakit_dir = os.path.dirname(os.path.realpath(utils.which("run-bwamem")))
        hla_base = os.path.join(utils.safe_makedir(os.path.join(os.path.dirname(out_file), "hla")),
                                os.path.basename(out_file) + ".hla")
        alt_cmd = (" | {bwakit_dir}/k8 {bwakit_dir}/bwa-postalt.js -p {hla_base} {alt_file}")
    else:
        alt_cmd = ""
    if dd.get_aligner(data) == "sentieon-bwa":
        bwa_exe = "sentieon-bwa"
        exports = sentieon.license_export(data)
    else:
        bwa_exe = "bwa"
        exports = ""
    bwa = config_utils.get_program(bwa_exe, data["config"])
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    bwa_resources = config_utils.get_resources("bwa", data["config"])
    bwa_params = (" ".join([str(x) for x in bwa_resources.get("options", [])])
                  if "options" in bwa_resources else "")
    rg_info = novoalign.get_rg_info(data["rgnames"])
    # For UMI runs, pass along consensus tags
    c_tags = "-C" if "umi_bam" in data else ""
    pairing = "-p" if not fastq2 else ""
    # Restrict seed occurrences to 1/2 of default, manage memory usage for centromere repeats in hg38
    # https://sourceforge.net/p/bio-bwa/mailman/message/31514937/
    # http://ehc.ac/p/bio-bwa/mailman/message/32268544/
    mem_usage = "-c 250"
    bwa_cmd = ("{exports}{bwa} mem {pairing} {c_tags} {mem_usage} -M -t {num_cores} {bwa_params} -R '{rg_info}' "
               "-v 1 {ref_file} {fastq1} {fastq2} ")
    return (bwa_cmd + alt_cmd).format(**locals())

def get_maxcov_downsample_cl(data, in_pipe=None):
    """Retrieve command line for max coverage downsampling, fitting into bamsormadup output.
    """
    max_cov = (_get_maxcov_downsample(data)
               if dd.get_aligner(data) not in ["snap"] else None)
    if max_cov:
        if in_pipe == "bamsormadup":
            prefix = "level=0"
        elif in_pipe == "samtools":
            prefix = "-l 0"
        else:
            prefix = ""
        # Swap over to multiple cores until after testing
        # core_arg = "-t %s" % dd.get_num_cores(data)
        core_arg = ""
        return ("%s | variant - -b %s --mark-as-qc-fail --max-coverage %s"
                % (prefix, core_arg, max_cov))
    else:
        if in_pipe == "bamsormadup":
            prefix = "indexfilename={tx_out_file}.bai"
        else:
            prefix = ""
        return prefix

def run_count(bam_file, dexseq_gff, stranded, out_file, data):
    """
    run dexseq_count on a BAM file
    """
    assert file_exists(bam_file), "%s does not exist." % bam_file
    sort_order = bam._get_sort_order(bam_file, {})
    assert sort_order, "Cannot determine sort order of %s." % bam_file
    strand_flag = _strand_flag(stranded)
    assert strand_flag, "%s is not a valid strandedness value." % stranded
    if not dexseq_gff:
        logger.info("No DEXSeq GFF file was found, skipping exon-level counting.")
        return None
    elif not file_exists(dexseq_gff):
        logger.info("%s was not found, so exon-level counting is being "
                    "skipped." % dexseq_gff)
        return None
    dexseq_count = _dexseq_count_path()
    if not dexseq_count:
        logger.info("DEXseq is not installed, skipping exon-level counting.")
        return None
    if dd.get_aligner(data) == "bwa":
        logger.info("Can't use DEXSeq with bwa alignments, skipping exon-level counting.")
        return None
    sort_flag = "name" if sort_order == "queryname" else "pos"
    is_paired = bam.is_paired(bam_file)
    paired_flag = "yes" if is_paired else "no"
    bcbio_python = sys.executable
    if file_exists(out_file):
        return out_file
    cmd = ("{bcbio_python} {dexseq_count} -f bam -r {sort_flag} -p {paired_flag} "
           "-s {strand_flag} {dexseq_gff} {bam_file} {tx_out_file}")
    message = "Counting exon-level counts with %s and %s." % (bam_file, dexseq_gff)
    with file_transaction(data, out_file) as tx_out_file:
        do.run(cmd.format(**locals()), message)
    return out_file

def aligner_supports_fusion(data):
    SUPPORTED_ALIGNERS = ["tophat2", "tophat", "star"]
    aligner = dd.get_aligner(data)
    return aligner and aligner.lower() in SUPPORTED_ALIGNERS

def _run_vardict_paired(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Detect variants with Vardict.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            target = shared.subset_variant_regions(dd.get_variant_regions(items[0]),
                                                   region, out_file, do_merge=True)
            paired = vcfutils.get_paired_bams(align_bams, items)
            if not _is_bed_file(target):
                vcfutils.write_empty_vcf(tx_out_file, config,
                                         samples=[x for x in [paired.tumor_name, paired.normal_name] if x])
            else:
                if not paired.normal_bam:
                    ann_file = _run_vardict_caller(align_bams, items, ref_file,
                                                   assoc_files, region, out_file)
                    return ann_file
                vardict = get_vardict_command(items[0])
                vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
                strandbias = "testsomatic.R"
                var2vcf = "var2vcf_paired.pl"
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
                # merge bed file regions as amplicon VarDict is only supported in single sample mode
                opts = " ".join(_vardict_options_from_config(items, config, out_file, target))
                coverage_interval = utils.get_in(config, ("algorithm", "coverage_interval"), "exome")
                # for deep targeted panels, require 50 worth of coverage
                var2vcf_opts = " -v 50 " if highdepth.get_median_coverage(items[0]) > 5000 else ""
                fix_ambig_ref = vcfutils.fix_ambiguous_cl()
                fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
                remove_dup = vcfutils.remove_dup_cl()
                if any("vardict_somatic_filter" in tz.get_in(("config", "algorithm", "tools_off"), data, [])
                       for data in items):
                    somatic_filter = ""
                    freq_filter = ""
                else:
                    var2vcf_opts += " -M "  # this makes VarDict soft filter non-differential variants
                    somatic_filter = ("| sed 's/\\\\.*Somatic\\\\/Somatic/' "
                                      "| sed 's/REJECT,Description=\".*\">/REJECT,Description=\"Not Somatic via VarDict\">/' "
                                      "| %s -x 'bcbio.variation.freebayes.call_somatic(x)'"
                                      % os.path.join(os.path.dirname(sys.executable), "py"))
                    freq_filter = ("| bcftools filter -m '+' -s 'REJECT' -e 'STATUS !~ \".*Somatic\"' 2> /dev/null "
                                   "| %s -x 'bcbio.variation.vardict.depth_freq_filter(x, %s, \"%s\")'"
                                   % (os.path.join(os.path.dirname(sys.executable), "py"),
                                      0, dd.get_aligner(paired.tumor_data)))
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                r_setup = "unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname(utils.Rscript_cmd())
                cmd = ("{r_setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                       "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} "
                       "| {strandbias} "
                       "| {var2vcf} -P 0.9 -m 4.25 -f {freq} {var2vcf_opts} "
                       "-N \"{paired.tumor_name}|{paired.normal_name}\" "
                       "{freq_filter} "
                       "{somatic_filter} | {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} "
                       "{compress_cmd} > {tx_out_file}")
                do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
    out_file = (annotation.add_dbsnp(out_file, assoc_files["dbsnp"], config)
                if assoc_files.get("dbsnp") else out_file)
    return out_file

def process_alignment(data, alt_input=None):
    """Do an alignment of fastq files, preparing a sorted BAM output file.
    """
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    data = cwlutils.unpack_tarballs(data, data)
    fastq1, fastq2 = dd.get_input_sequence_files(data)
    if alt_input:
        fastq1, fastq2 = alt_input
    config = data["config"]
    aligner = config["algorithm"].get("aligner", None)
    if fastq1 and objectstore.file_exists_or_remote(fastq1) and aligner:
        logger.info("Aligning lane %s with %s aligner" % (data["rgnames"]["lane"], aligner))
        data = align_to_sort_bam(fastq1, fastq2, aligner, data)
        if dd.get_umi_consensus(data):
            data["umi_bam"] = dd.get_work_bam(data)
            if fastq2:
                f1, f2, avg_cov = postalign.umi_consensus(data)
                data["config"]["algorithm"]["rawumi_avg_cov"] = avg_cov
                del data["config"]["algorithm"]["umi_type"]
                data["config"]["algorithm"]["mark_duplicates"] = False
                data = align_to_sort_bam(f1, f2, aligner, data)
            else:
                raise ValueError("Single fastq input for UMI processing; fgbio needs paired reads: %s"
                                 % dd.get_sample_name(data))
        data = _add_supplemental_bams(data)
    elif fastq1 and objectstore.file_exists_or_remote(fastq1) and fastq1.endswith(".bam"):
        sort_method = config["algorithm"].get("bam_sort")
        bamclean = config["algorithm"].get("bam_clean")
        if bamclean is True or bamclean == "picard":
            if sort_method and sort_method != "coordinate":
                raise ValueError("Cannot specify `bam_clean: picard` with `bam_sort` other than coordinate: %s"
                                 % sort_method)
            out_bam = cleanbam.picard_prep(fastq1, data["rgnames"], dd.get_ref_file(data),
                                           data["dirs"], data)
        elif bamclean == "fixrg":
            out_bam = cleanbam.fixrg(fastq1, data["rgnames"], dd.get_ref_file(data),
                                     data["dirs"], data)
        elif bamclean == "remove_extracontigs":
            out_bam = cleanbam.remove_extracontigs(fastq1, data)
        elif sort_method:
            runner = broad.runner_from_path("picard", config)
            out_file = os.path.join(data["dirs"]["work"], "{}-sort.bam".format(
                os.path.splitext(os.path.basename(fastq1))[0]))
            if not utils.file_exists(out_file):
                work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "bamclean",
                                                           dd.get_sample_name(data)))
                out_file = os.path.join(work_dir, "{}-sort.bam".format(dd.get_sample_name(data)))
            out_bam = runner.run_fn("picard_sort", fastq1, sort_method, out_file)
        else:
            out_bam = _link_bam_file(fastq1, os.path.join(dd.get_work_dir(data), "prealign",
                                                          dd.get_sample_name(data)), data)
        bam.index(out_bam, data["config"])
        bam.check_header(out_bam, data["rgnames"], dd.get_ref_file(data), data["config"])
        dedup_bam = postalign.dedup_bam(out_bam, data)
        bam.index(dedup_bam, data["config"])
        data["work_bam"] = dedup_bam
    elif fastq1 and objectstore.file_exists_or_remote(fastq1) and fastq1.endswith(".cram"):
        data["work_bam"] = fastq1
    elif fastq1 is None and not dd.get_aligner(data):
        data["config"]["algorithm"]["variantcaller"] = False
        data["work_bam"] = None
    elif not fastq1:
        raise ValueError("No 'files' specified for input sample: %s" % dd.get_sample_name(data))
    elif "kraken" in config["algorithm"]:
        # kraken doesn't need a BAM
        pass
    else:
        raise ValueError("Could not process input file from sample configuration. \n" +
                         fastq1 +
                         "\nIs the path to the file correct or is it empty?\n" +
                         "If it is a fastq file (not pre-aligned BAM or CRAM), "
                         "is an aligner specified in the input configuration?")
    if data.get("work_bam"):
        # Add stable 'align_bam' target to use for retrieving raw alignment
        data["align_bam"] = data["work_bam"]
        data = _add_hla_files(data)
    return [[data]]

def _skip_duplicates(data):
    return (dd.get_coverage_interval(data) == "amplicon"
            or (dd.get_aligner(data) and not dd.get_mark_duplicates(data)))

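
# A hypothetical sketch of how _skip_duplicates might gate duplicate marking after
# alignment. postalign.dedup_bam appears elsewhere in this excerpt, but this wiring
# is illustrative, not the pipeline's actual control flow.
def _maybe_dedup_example(out_bam, data):
    if _skip_duplicates(data):
        # amplicon data, or marking explicitly disabled for this aligner
        return out_bam
    return postalign.dedup_bam(out_bam, data)
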
def process_alignment(data, alt_input=None):
    """Do an alignment of fastq files, preparing a sorted BAM output file.
    """
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    data = cwlutils.unpack_tarballs(data, data)
    fastq1, fastq2 = dd.get_input_sequence_files(data)
    if alt_input:
        fastq1, fastq2 = alt_input
    config = data["config"]
    aligner = config["algorithm"].get("aligner", None)
    if fastq1 and objectstore.file_exists_or_remote(fastq1) and aligner:
        logger.info("Aligning lane %s with %s aligner" % (data["rgnames"]["lane"], aligner))
        data = align_to_sort_bam(fastq1, fastq2, aligner, data)
        if dd.get_correct_umis(data):
            data["work_bam"] = postalign.correct_umis(data)
        if dd.get_umi_consensus(data):
            data["umi_bam"] = dd.get_work_bam(data)
            if fastq2:
                f1, f2, avg_cov = postalign.umi_consensus(data)
                data["config"]["algorithm"]["rawumi_avg_cov"] = avg_cov
                del data["config"]["algorithm"]["umi_type"]
                data["config"]["algorithm"]["mark_duplicates"] = False
                data = align_to_sort_bam(f1, f2, aligner, data)
            else:
                raise ValueError("Single fastq input for UMI processing; fgbio needs paired reads: %s"
                                 % dd.get_sample_name(data))
        data = _add_supplemental_bams(data)
    elif fastq1 and objectstore.file_exists_or_remote(fastq1) and fastq1.endswith(".bam"):
        sort_method = config["algorithm"].get("bam_sort")
        bamclean = config["algorithm"].get("bam_clean")
        if bamclean is True or bamclean == "picard":
            if sort_method and sort_method != "coordinate":
                raise ValueError("Cannot specify `bam_clean: picard` with `bam_sort` other than coordinate: %s"
                                 % sort_method)
            ref_file = dd.get_ref_file(data)
            out_bam = cleanbam.picard_prep(fastq1, data["rgnames"], ref_file, data["dirs"], data)
        elif bamclean == "fixrg":
            out_bam = cleanbam.fixrg(fastq1, data["rgnames"], dd.get_ref_file(data),
                                     data["dirs"], data)
        elif bamclean == "remove_extracontigs":
            out_bam = cleanbam.remove_extracontigs(fastq1, data)
        elif sort_method:
            runner = broad.runner_from_path("picard", config)
            out_file = os.path.join(data["dirs"]["work"], "{}-sort.bam".format(
                os.path.splitext(os.path.basename(fastq1))[0]))
            if not utils.file_exists(out_file):
                work_dir = utils.safe_makedir(os.path.join(dd.get_work_dir(data), "bamclean",
                                                           dd.get_sample_name(data)))
                out_file = os.path.join(work_dir, "{}-sort.bam".format(dd.get_sample_name(data)))
            out_bam = runner.run_fn("picard_sort", fastq1, sort_method, out_file)
        else:
            out_bam = _link_bam_file(fastq1, os.path.join(dd.get_work_dir(data), "prealign",
                                                          dd.get_sample_name(data)), data)
        bam.index(out_bam, data["config"])
        bam.check_header(out_bam, data["rgnames"], dd.get_ref_file(data), data["config"])
        dedup_bam = postalign.dedup_bam(out_bam, data)
        bam.index(dedup_bam, data["config"])
        data["work_bam"] = dedup_bam
    elif fastq1 and objectstore.file_exists_or_remote(fastq1) and fastq1.endswith(".cram"):
        data["work_bam"] = fastq1
    elif fastq1 is None and not dd.get_aligner(data):
        data["config"]["algorithm"]["variantcaller"] = False
        data["work_bam"] = None
    elif not fastq1:
        raise ValueError("No 'files' specified for input sample: %s" % dd.get_sample_name(data))
    elif "kraken" in config["algorithm"]:
        # kraken doesn't need a BAM
        pass
    else:
        raise ValueError("Could not process input file from sample configuration. \n" +
                         fastq1 +
                         "\nIs the path to the file correct or is it empty?\n" +
                         "If it is a fastq file (not pre-aligned BAM or CRAM), "
                         "is an aligner specified in the input configuration?")
    if data.get("work_bam"):
        # Add stable 'align_bam' target to use for retrieving raw alignment
        data["align_bam"] = data["work_bam"]
        data = _add_hla_files(data)
    return [[data]]

def _run_vardict_paired(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Detect variants with Vardict.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            target = shared.subset_variant_regions(dd.get_variant_regions(items[0]),
                                                   region, out_file, do_merge=True)
            paired = vcfutils.get_paired_bams(align_bams, items)
            if not _is_bed_file(target):
                vcfutils.write_empty_vcf(tx_out_file, config,
                                         samples=[x for x in [paired.tumor_name, paired.normal_name] if x])
            else:
                if not paired.normal_bam:
                    ann_file = _run_vardict_caller(align_bams, items, ref_file,
                                                   assoc_files, region, out_file)
                    return ann_file
                vcffilter = config_utils.get_program("vcffilter", config)
                vardict = get_vardict_command(items[0])
                vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
                strandbias = "testsomatic.R"
                var2vcf = "var2vcf_paired.pl"
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
                # merge bed file regions as amplicon VarDict is only supported in single sample mode
                opts = " ".join(_vardict_options_from_config(items, config, out_file, target))
                coverage_interval = utils.get_in(config, ("algorithm", "coverage_interval"), "exome")
                # for deep targeted panels, require 50 worth of coverage
                var2vcf_opts = " -v 50 " if highdepth.get_median_coverage(items[0]) > 5000 else ""
                fix_ambig = vcfutils.fix_ambiguous_cl()
                remove_dup = vcfutils.remove_dup_cl()
                if any("vardict_somatic_filter" in tz.get_in(("config", "algorithm", "tools_off"), data, [])
                       for data in items):
                    somatic_filter = ""
                    freq_filter = ""
                else:
                    var2vcf_opts += " -M "  # this makes VarDict soft filter non-differential variants
                    somatic_filter = ("| sed 's/\\\\.*Somatic\\\\/Somatic/' "
                                      "| sed 's/REJECT,Description=\".*\">/REJECT,Description=\"Not Somatic via VarDict\">/' "
                                      "| %s -x 'bcbio.variation.freebayes.call_somatic(x)'"
                                      % os.path.join(os.path.dirname(sys.executable), "py"))
                    freq_filter = ("| bcftools filter -m '+' -s 'REJECT' -e 'STATUS !~ \".*Somatic\"' 2> /dev/null "
                                   "| %s -x 'bcbio.variation.vardict.depth_freq_filter(x, %s, \"%s\")'"
                                   % (os.path.join(os.path.dirname(sys.executable), "py"),
                                      0, dd.get_aligner(paired.tumor_data)))
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                r_setup = "unset R_HOME && export PATH=%s:$PATH && " % os.path.dirname(utils.Rscript_cmd())
                cmd = ("{r_setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                       "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} "
                       "| {strandbias} "
                       "| {var2vcf} -P 0.9 -m 4.25 -f {freq} {var2vcf_opts} "
                       "-N \"{paired.tumor_name}|{paired.normal_name}\" "
                       "{freq_filter} "
                       "{somatic_filter} | {fix_ambig} | {remove_dup} | {vcfstreamsort} "
                       "{compress_cmd} > {tx_out_file}")
                do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
    out_file = (annotation.add_dbsnp(out_file, assoc_files["dbsnp"], config)
                if assoc_files.get("dbsnp") else out_file)
    return out_file

def _run_vardict_paired(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Detect variants with Vardict.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(items[0], out_file) as tx_out_file:
            vrs = bedutils.population_variant_regions(items)
            target = shared.subset_variant_regions(vrs, region, out_file,
                                                   items=items, do_merge=True)
            paired = vcfutils.get_paired_bams(align_bams, items)
            if not _is_bed_file(target):
                vcfutils.write_empty_vcf(tx_out_file, config,
                                         samples=[x for x in [paired.tumor_name, paired.normal_name] if x])
            else:
                if not paired.normal_bam:
                    ann_file = _run_vardict_caller(align_bams, items, ref_file,
                                                   assoc_files, region, out_file)
                    return ann_file
                vardict = get_vardict_command(items[0])
                vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
                compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
                freq = float(utils.get_in(config, ("algorithm", "min_allele_fraction"), 10)) / 100.0
                # merge bed file regions as amplicon VarDict is only supported in single sample mode
                opts, var2vcf_opts = _vardict_options_from_config(items, config, out_file, target)
                fix_ambig_ref = vcfutils.fix_ambiguous_cl()
                fix_ambig_alt = vcfutils.fix_ambiguous_cl(5)
                remove_dup = vcfutils.remove_dup_cl()
                if any("vardict_somatic_filter" in tz.get_in(("config", "algorithm", "tools_off"), data, [])
                       for data in items):
                    somatic_filter = ""
                    freq_filter = ""
                else:
                    var2vcf_opts += " -M "  # this makes VarDict soft filter non-differential variants
                    somatic_filter = ("| sed 's/\\\\.*Somatic\\\\/Somatic/' "
                                      "| sed 's/REJECT,Description=\".*\">/REJECT,Description=\"Not Somatic via VarDict\">/' "
                                      """| %s -c 'from bcbio.variation import freebayes; """
                                      """freebayes.call_somatic("%s", "%s")' """
                                      % (sys.executable, paired.tumor_name, paired.normal_name))
                    freq_filter = ("| bcftools filter -m '+' -s 'REJECT' -e 'STATUS !~ \".*Somatic\"' 2> /dev/null "
                                   "| %s -x 'bcbio.variation.vardict.add_db_germline_flag(x)' "
                                   "| %s "
                                   "| %s -x 'bcbio.variation.vardict.depth_freq_filter(x, %s, \"%s\")'"
                                   % (os.path.join(os.path.dirname(sys.executable), "py"),
                                      _lowfreq_linear_filter(0, True),
                                      os.path.join(os.path.dirname(sys.executable), "py"),
                                      0, dd.get_aligner(paired.tumor_data)))
                jvm_opts = _get_jvm_opts(items[0], tx_out_file)
                py_cl = os.path.join(utils.get_bcbio_bin(), "py")
                setup = ("%s && unset JAVA_HOME &&" % utils.get_R_exports())
                contig_cl = vcfutils.add_contig_to_header_cl(ref_file, tx_out_file)
                cmd = ("{setup}{jvm_opts}{vardict} -G {ref_file} -f {freq} "
                       "-N {paired.tumor_name} -b \"{paired.tumor_bam}|{paired.normal_bam}\" {opts} "
                       "| awk 'NF>=48' | testsomatic.R "
                       "| var2vcf_paired.pl -P 0.9 -m 4.25 -f {freq} {var2vcf_opts} "
                       "-N \"{paired.tumor_name}|{paired.normal_name}\" "
                       "| {contig_cl} {freq_filter} "
                       "| bcftools filter -i 'QUAL >= 0' "
                       "{somatic_filter} | {fix_ambig_ref} | {fix_ambig_alt} | {remove_dup} | {vcfstreamsort} "
                       "{compress_cmd} > {tx_out_file}")
                do.run(cmd.format(**locals()), "Genotyping with VarDict: Inference", {})
    return out_file

def aligner_supports_fusion(data):
    SUPPORTED_ALIGNERS = ["tophat2", "tophat", "star"]
    aligner = dd.get_aligner(data).lower()
    return aligner in SUPPORTED_ALIGNERS

def calling(data):
    if dd.get_aligner(data) == "bismark":
        data = _bismark_calling(data)
    if dd.get_aligner(data) == "bsmap":
        data = _bsmap_calling(data)
    return [[data]]

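
# Minimal illustration (hypothetical YAML) of how the methylation caller is selected
# purely by the aligner choice that `calling` dispatches on; the analysis name below
# is an assumption about the surrounding configuration, not taken from this code.
#
#   details:
#     - analysis: wgbs-seq
#       algorithm:
#         aligner: bismark   # or bsmap, to route through _bsmap_calling
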