def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Align reads with bismark (bowtie2 mode) and record the outputs on ``data``.

    If ``<align_dir>/<sample>.bam`` already exists the alignment is skipped and
    only the bookkeeping at the end is performed.

    Args:
        fastq_file: first (or only) reads file.
        pair_file: second reads file; falsy for single-end input.
        ref_file: bismark index passed on the command line; the process exits
            with status 1 when it is falsy.
        names: not used by this function (kept for the shared aligner signature).
        align_dir: directory receiving the final BAM and the
            ``<lane>_bismark`` output directory.
        data: sample dictionary; ``data["analysis"]`` must start with
            ``wgbs-seq`` (case-insensitive).

    Returns:
        ``data`` with the work BAM set, ``data["bam_report"]`` pointing at the
        first ``*report.txt`` in the bismark output directory, and that report
        registered under the ``bismark`` summary QC.

    Raises:
        AssertionError: the analysis type is not ``wgbs-seq``.
        IndexError: no BAM or no report file was found after the run.
    """
    if not data["analysis"].lower().startswith("wgbs-seq"):
        # Explicit raise (same exception type as the former ``assert``) so the
        # check is not stripped when Python runs with -O.
        raise AssertionError("No comparible alignment.")
    config = data["config"]
    sample = dd.get_sample_name(data)
    out_dir = os.path.join(align_dir, "%s_bismark" % dd.get_lane(data))

    if not ref_file:
        logger.error("bismark index not found. You can install "
                     "the index for your genome with: bcbio_nextgen.py upgrade "
                     "--aligners bismark --genomes genome-build-name --data")
        sys.exit(1)

    final_out = os.path.join(align_dir, "{0}.bam".format(sample))
    if not file_exists(final_out):
        bismark = config_utils.get_program("bismark", config)

        # bismark uses 5 threads/sample and ~12GB RAM/sample (hg38)
        resources = config_utils.get_resources("bismark", data["config"])
        max_cores = dd.get_num_cores(data)
        max_mem = config_utils.convert_to_bytes(resources.get("memory", "1G")) / (1024.0 * 1024.0)
        instances = calculate_bismark_instances(max_cores, max_mem * max_cores)
        # override instances if specified in the config
        if resources and resources.get("bismark_threads"):
            instances = resources.get("bismark_threads")
            logger.info(f"Using {instances} bismark instances - overriden by resources")
        bowtie_threads = 1
        if resources and resources.get("bowtie_threads"):
            bowtie_threads = resources.get("bowtie_threads")
        logger.info(f"Using {bowtie_threads} bowtie threads per bismark instance")

        kit = kits.KITS.get(dd.get_kit(data), None)
        directional = "--non_directional" if kit and not kit.is_directional else ""

        other_opts = " ".join([str(x) for x in resources.get("options", [])]).strip()

        safe_makedir(align_dir)
        cmd = ("{bismark} {other_opts} {directional} --bowtie2 --temp_dir {tx_out_dir} "
               "--gzip --parallel {instances} -p {bowtie_threads} -o {tx_out_dir} "
               "--unmapped {ref_file} {fastq_file} ")
        # Paired input is passed as "-1 <r1> -2 <r2>", single-end as the bare file.
        reads_arg = "-1 %s -2 %s" % (fastq_file, pair_file) if pair_file else fastq_file

        raw_bam = glob.glob(out_dir + "/*bismark*bt2*bam")
        if not raw_bam:
            with tx_tmpdir() as tx_out_dir:
                run_message = "Running Bismark aligner on %s and %s" % (reads_arg, ref_file)
                # Explicit keywords instead of **locals(): the template no longer
                # depends on the names of local variables.
                do.run(cmd.format(bismark=bismark,
                                  other_opts=other_opts,
                                  directional=directional,
                                  tx_out_dir=tx_out_dir,
                                  instances=instances,
                                  bowtie_threads=bowtie_threads,
                                  ref_file=ref_file,
                                  fastq_file=reads_arg),
                       run_message, None)
                shutil.move(tx_out_dir, out_dir)
            raw_bam = glob.glob(out_dir + "/*bismark*bt2*bam")
        # don't process bam in the bismark pipeline!
        utils.symlink_plus(raw_bam[0], final_out)

    # Shared bookkeeping for both the fresh run and the already-aligned case.
    data = dd.set_work_bam(data, final_out)
    data["bam_report"] = glob.glob(os.path.join(out_dir, "*report.txt"))[0]
    data = dd.update_summary_qc(data, "bismark", base=data["bam_report"])
    return data
def _bismark_calling(data):
    """Run the bismark methylation extractor and report step for one sample.

    Works in ``<work_dir>/cpg``. Stores the value returned by ``_run_report``
    in ``data['bismark_report']`` and registers the M-bias file, the alignment
    report and the splitting report under the ``bismark`` summary QC.

    Returns:
        The updated ``data`` dictionary.
    """
    cpg_dir = safe_makedir(os.path.join(dd.get_work_dir(data), "cpg"))
    cfg = data["config"]
    sample_name = dd.get_sample_name(data)
    bismark_index = get_aligner_index('bismark', data)

    mbias_file = _run_meth_extractor(
        data["work_bam"], sample_name, cpg_dir, bismark_index, cfg)
    data['bismark_report'] = _run_report(
        data["work_bam"], data["bam_report"], sample_name, mbias_file,
        cpg_dir, cfg)

    # The splitting report path is derived from the M-bias file name.
    split_report = mbias_file.replace(".M-bias", "_splitting_report")

    data = dd.update_summary_qc(data, "bismark", base=mbias_file)
    data = dd.update_summary_qc(data, "bismark", base=data["bam_report"])
    data = dd.update_summary_qc(data, "bismark", base=split_report)
    return data
def postprocess_alignment(data):
    """Perform post-processing steps required on full BAM files.

    Prepares list of callable genome regions allowing subsequent
    parallelization.

    Steps, in order:
      * normalise the input to a single sample dictionary and unpack tarballs;
      * register artifact and OxoG metrics under the ``picard`` summary QC
        when the respective collectors return something;
      * if the BAM needs processing: link it into
        ``<work_dir>/align/<sample>``, index it, compute callable regions and
        depth, then run interval assignment, samtools and recalibration;
      * otherwise, if variant regions are configured, derive the region blocks
        from those regions directly.

    Returns:
        ``[[data]]`` -- the updated sample wrapped in two lists.
    """
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    data = cwlutils.unpack_tarballs(data, data)
    bam_file = data.get("align_bam") or data.get("work_bam")
    ref_file = dd.get_ref_file(data)

    artifacts = gatk.collect_artifact_metrics(data)
    if artifacts:
        # pop() removes the last entry to use as base; the rest are secondary.
        data = dd.update_summary_qc(data, "picard", artifacts.pop(), artifacts)
    oxog = gatk.collect_oxog_metrics(data)
    # Guarded like the artifact metrics above: calling .pop() on an empty or
    # missing result would raise.
    # NOTE(review): assumes the collector may return nothing -- confirm.
    if oxog:
        data = dd.update_summary_qc(data, "picard", oxog.pop(), oxog)

    if vmulti.bam_needs_processing(data) and bam_file and bam_file.endswith(".bam"):
        out_dir = utils.safe_makedir(
            os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data)))
        bam_file_ready = os.path.join(out_dir, os.path.basename(bam_file))
        if not utils.file_exists(bam_file_ready):
            utils.symlink_plus(bam_file, bam_file_ready)
        bam.index(bam_file_ready, data["config"])
        covinfo = callable.sample_callable_bed(bam_file_ready, ref_file, data)
        # Only the nblock BED is used; the first returned value is discarded.
        _, nblock_bed = callable.block_regions(
            covinfo.raw_callable, bam_file_ready, ref_file, data)
        data["regions"] = {
            "nblock": nblock_bed,
            "callable": covinfo.raw_callable,
            "sample_callable": covinfo.callable,
            "mapped_stats": readstats.get_cache_file(data),
        }
        data["depth"] = covinfo.depth_files
        data = coverage.assign_interval(data)
        data = samtools.run_and_save(data)
        data = recalibrate.prep_recal(data)
        data = recalibrate.apply_recal(data)
    else:
        # Look the variant regions up once instead of on every use.
        variant_regions = dd.get_variant_regions(data)
        if variant_regions:
            _, nblock_bed = callable.block_regions(
                variant_regions, bam_file, ref_file, data)
            data["regions"] = {
                "nblock": nblock_bed,
                "callable": variant_regions,
                "sample_callable": variant_regions,
            }
    return [[data]]
def run_salmon_bam(data):
    """Quantify a sample with salmon from its transcriptome BAM.

    Output goes to ``<work_dir>/salmon/<sample>``. The quantification output,
    the salmon directory and the fragment length file are stored on ``data``;
    the fragment length file is also registered under the ``salmon`` summary
    QC.

    Returns:
        ``[[data]]`` -- the updated sample wrapped in two lists.
    """
    sample = dd.get_sample_name(data)
    quant_dir = os.path.join(dd.get_work_dir(data), "salmon", sample)
    annotation = dd.get_gtf_file(data)
    transcriptome_bam = dd.get_transcriptome_bam(data)

    quant_out = salmon_quant_bam(transcriptome_bam, quant_dir, annotation, data)

    data = dd.set_salmon(data, quant_out)
    data = dd.set_salmon_dir(data, quant_dir)
    data = dd.set_salmon_fraglen_file(data, _get_fraglen_file(quant_dir))
    data = dd.update_summary_qc(
        data, "salmon", base=dd.get_salmon_fraglen_file(data))
    return [[data]]
def dedup_bismark(data):
    """Remove alignments to the same position in the genome from the Bismark
    mapping output using deduplicate_bismark.

    Output is written to ``<work_dir>/dedup/<sample>``. When the deduplicated
    file already exists the tool is not run again; the bookkeeping below is
    applied either way.

    Returns:
        ``[[data]]`` where ``data`` has its work BAM set to the deduplicated
        file, ``data["deduplication_report"]`` set to the report path, and the
        report registered under the ``bismark`` summary QC.
    """
    config = data["config"]
    # The input is passed through as-is: deliberately not sorted, not even by
    # read name.
    input_file = datadict.get_work_bam(data)
    sample_name = datadict.get_sample_name(data)
    output_dir = utils.safe_makedir(
        os.path.join(datadict.get_work_dir(data), 'dedup', sample_name))

    input_stem, input_ext = os.path.splitext(os.path.basename(input_file))
    output_file = os.path.join(
        output_dir, f'{input_stem}.deduplicated{input_ext}')
    # Build the report path from the input stem. The former
    # str.replace("deduplicated.bam", ...) on the whole output path gave the
    # same result for .bam inputs, but left the path unchanged for any other
    # extension and could rewrite a matching directory component.
    report_file = os.path.join(
        output_dir, f'{input_stem}.deduplication_report.txt')

    if not utils.file_exists(output_file):
        deduplicate_bismark = config_utils.get_program('deduplicate_bismark',
                                                       config)
        command = f'{deduplicate_bismark} --output_dir {output_dir} {input_file}'
        with transaction.file_transaction(output_dir):
            do.run(command, 'remove deduplicate alignments')

    data = datadict.set_work_bam(data, output_file)
    data["deduplication_report"] = report_file
    data = dd.update_summary_qc(data, "bismark",
                                base=data["deduplication_report"])
    return [[data]]
def run_salmon_bam(data):
    """Quantify a sample with salmon from its transcriptome BAM.

    Output goes to ``<work_dir>/salmon/<sample>``. The GTF and the reference
    FASTA are both handed to ``salmon_quant_bam``. The quantification output,
    the salmon directory and the fragment length file are stored on ``data``;
    the fragment length file is also registered under the ``salmon`` summary
    QC.

    Returns:
        ``[[data]]`` -- the updated sample wrapped in two lists.
    """
    sample = dd.get_sample_name(data)
    quant_dir = os.path.join(dd.get_work_dir(data), "salmon", sample)
    annotation = dd.get_gtf_file(data)
    transcriptome_bam = dd.get_transcriptome_bam(data)
    genome_fasta = dd.get_ref_file(data)

    quant_out = salmon_quant_bam(
        transcriptome_bam, quant_dir, annotation, genome_fasta, data)

    data = dd.set_salmon(data, quant_out)
    data = dd.set_salmon_dir(data, quant_dir)
    data = dd.set_salmon_fraglen_file(data, _get_fraglen_file(quant_dir))
    data = dd.update_summary_qc(
        data, "salmon", base=dd.get_salmon_fraglen_file(data))
    return [[data]]
def run_salmon_reads(data):
    """Quantify a sample with salmon directly from its input reads.

    If the first input file is a BAM it is converted to FASTQ first. Exactly
    two input files are treated as a pair; otherwise only the first file is
    used. Output goes to ``<work_dir>/salmon/<sample>``; the quantification
    output, the salmon directory and the fragment length file are stored on
    ``data``, and the fragment length file is registered under the ``salmon``
    summary QC.

    Returns:
        ``[[data]]`` -- the updated sample wrapped in two lists.
    """
    data = utils.to_single_data(data)
    reads = dd.get_input_sequence_files(data)
    if bam.is_bam(reads[0]):
        dirs = data["dirs"]
        reads = fastq.convert_bam_to_fastq(
            reads[0], dirs["work"], data, dirs, data["config"])

    sample = dd.get_sample_name(data)
    quant_dir = os.path.join(dd.get_work_dir(data), "salmon", sample)
    annotation = dd.get_gtf_file(data)
    # Paired input only when there are exactly two files.
    first, second = reads if len(reads) == 2 else (reads[0], None)
    genome_fasta = dd.get_ref_file(data)

    quant_out = salmon_quant_reads(
        first, second, quant_dir, annotation, genome_fasta, data)

    data = dd.set_salmon(data, quant_out)
    data = dd.set_salmon_dir(data, quant_dir)
    data = dd.set_salmon_fraglen_file(data, _get_fraglen_file(quant_dir))
    data = dd.update_summary_qc(
        data, "salmon", base=dd.get_salmon_fraglen_file(data))
    return [[data]]
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Run STAR twice (1st pass, then 2nd pass) and record the result on ``data``.

    The first pass writes into ``<tx_align_dir>1pass``; a splice junction file
    looked up in that pass's output directory is handed to the second pass via
    ``--sjdbFileChrStartEnd`` when one is found. Both commands pipe STAR's
    unsorted BAM output into the command line returned by
    ``postalign.sam_to_sortbam_cl``.

    If ``star_dirs.final_out`` already exists, no alignment is run and only
    the bookkeeping (``_update_data`` plus the ``star`` summary QC entry for
    ``<lane>Log.final.out``) is performed.

    Args:
        fastq_file: first (or only) reads file.
        pair_file: second reads file; falsy for single-end input.
        ref_file: STAR index; a path ending in ``chrLength`` is replaced by
            its directory. The process exits with status 1 when it is falsy
            or when ``index_has_alts`` reports ALTs.
        names: passed to ``_get_star_dirnames``, ``_read_group_option`` and
            ``_update_data``.
        align_dir: alignment output directory (wrapped in a file transaction).
        data: sample dictionary; an ``analysis`` value starting with
            ``smallrna-seq`` switches on the small RNA settings.

    Returns:
        The updated ``data`` dictionary.
    """
    if not ref_file:
        logger.error(
            "STAR index not found. We don't provide the STAR indexes "
            "by default because they are very large. You can install "
            "the index for your genome with: bcbio_nextgen.py upgrade "
            "--aligners star --genomes genome-build-name --data")
        sys.exit(1)
    max_hits = 10
    srna = True if data["analysis"].lower().startswith(
        "smallrna-seq") else False
    srna_opts = ""
    # Small RNA runs raise the multimapper limit and add --alignIntronMax 1.
    if srna:
        max_hits = 1000
        srna_opts = "--alignIntronMax 1"
    config = data["config"]
    star_dirs = _get_star_dirnames(align_dir, data, names)
    # Already aligned: skip STAR and only register the outputs.
    if file_exists(star_dirs.final_out):
        data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data)
        out_log_file = os.path.join(align_dir, dd.get_lane(data) + "Log.final.out")
        data = dd.update_summary_qc(data, "star", base=out_log_file)
        return data
    star_path = config_utils.get_program("STAR", config)

    def _unpack_fastq(f):
        """Use process substitution instead of readFilesCommand for gzipped inputs.

        Prevents issues on shared filesystems that don't support FIFO:
        https://github.com/alexdobin/STAR/issues/143
        """
        if f and is_gzipped(f):
            return "<(gunzip -c %s)" % f
        else:
            return f
    fastq_files = (" ".join([
        _unpack_fastq(fastq_file), _unpack_fastq(pair_file)
    ]) if pair_file else _unpack_fastq(fastq_file))
    num_cores = dd.get_num_cores(data)
    # Prefer the transcriptome GTF; fall back to the general GTF file.
    gtf_file = dd.get_transcriptome_gtf(data)
    if not gtf_file:
        gtf_file = dd.get_gtf_file(data)
    if ref_file.endswith("chrLength"):
        ref_file = os.path.dirname(ref_file)
    if index_has_alts(ref_file):
        logger.error(
            "STAR is being run on an index with ALTs which STAR is not "
            "designed for. Please remake your STAR index or use an ALT-aware "
            "aligner like hisat2")
        sys.exit(1)
    # NOTE: both command templates below are filled in with
    # cmd.format(**locals()), so the local variable names (star_path,
    # ref_file, fastq_files, num_cores, tx_out_prefix, max_hits, srna_opts,
    # sjflag, tx_final_out) are part of the command and must not be renamed.
    with file_transaction(data, align_dir) as tx_align_dir:
        # ---- 1st pass: runs in a sibling "<tx_align_dir>1pass" directory ----
        tx_1pass_dir = tx_align_dir + "1pass"
        tx_star_dirnames = _get_star_dirnames(tx_1pass_dir, data, names)
        tx_out_dir, tx_out_file, tx_out_prefix, tx_final_out = tx_star_dirnames
        safe_makedir(tx_1pass_dir)
        safe_makedir(tx_out_dir)
        # The %-interpolation fills in the SAM attribute tags now; the {…}
        # placeholders are left for the later .format() call.
        cmd = (
            "{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} "
            "--runThreadN {num_cores} --outFileNamePrefix {tx_out_prefix} "
            "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} "
            "--outStd BAM_Unsorted {srna_opts} "
            "--limitOutSJcollapsed 2000000 "
            "--outSAMtype BAM Unsorted "
            "--outSAMmapqUnique 60 "
            "--outSAMunmapped Within --outSAMattributes %s " % " ".join(ALIGN_TAGS))
        cmd += _add_sj_index_commands(fastq_file, ref_file, gtf_file) if not srna else ""
        cmd += _read_group_option(names)
        if dd.get_fusion_caller(data):
            if "arriba" in dd.get_fusion_caller(data):
                # NOTE(review): this fragment has no leading space, so it
                # relies on the preceding piece ending in whitespace -- confirm.
                cmd += (
                    "--chimSegmentMin 10 --chimOutType WithinBAM "
                    "--chimJunctionOverhangMin 10 --chimScoreMin 1 --chimScoreDropMax 30 "
                    "--chimScoreJunctionNonGTAG 0 --chimScoreSeparation 1 "
                    "--alignSJstitchMismatchNmax 5 -1 5 5 "
                    "--chimSegmentReadGapMax 3 "
                    "--peOverlapNbasesMin 10 "
                    "--alignSplicedMateMapLminOverLmate 0.5 ")
            else:
                cmd += (" --chimSegmentMin 12 --chimJunctionOverhangMin 12 "
                        "--chimScoreDropMax 30 --chimSegmentReadGapMax 5 "
                        "--chimScoreSeparation 5 ")
                if "oncofuse" in dd.get_fusion_caller(data):
                    cmd += "--chimOutType Junctions "
                else:
                    cmd += "--chimOutType WithinBAM "
        strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"),
                                    "unstranded").lower()
        if strandedness == "unstranded" and not srna:
            cmd += " --outSAMstrandField intronMotif "
        if not srna:
            cmd += " --quantMode TranscriptomeSAM "
        resources = config_utils.get_resources("star", data["config"])
        # NOTE(review): user options are appended before .format() is applied,
        # so a literal "{" or "}" in them would be read as a placeholder.
        if resources.get("options", []):
            cmd += " " + " ".join(
                [str(x) for x in resources.get("options", [])])
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out)
        cmd += " > {tx_final_out} "
        run_message = "Running 1st pass of STAR aligner on %s and %s" % (
            fastq_file, ref_file)
        do.run(cmd.format(**locals()), run_message, None)

        # ---- 2nd pass: reuses 1st-pass splice junctions when available ----
        sjfile = get_splicejunction_file(tx_out_dir, data)
        sjflag = f"--sjdbFileChrStartEnd {sjfile}" if sjfile else ""
        # Rebind the tx_* names to the real transactional directory so the
        # final output lands in tx_align_dir.
        tx_star_dirnames = _get_star_dirnames(tx_align_dir, data, names)
        tx_out_dir, tx_out_file, tx_out_prefix, tx_final_out = tx_star_dirnames
        safe_makedir(tx_align_dir)
        safe_makedir(tx_out_dir)
        # Same base template as the 1st pass plus the {sjflag} placeholder.
        cmd = (
            "{star_path} --genomeDir {ref_file} --readFilesIn {fastq_files} "
            "--runThreadN {num_cores} --outFileNamePrefix {tx_out_prefix} "
            "--outReadsUnmapped Fastx --outFilterMultimapNmax {max_hits} "
            "--outStd BAM_Unsorted {srna_opts} "
            "--limitOutSJcollapsed 2000000 "
            "{sjflag} "
            "--outSAMtype BAM Unsorted "
            "--outSAMmapqUnique 60 "
            "--outSAMunmapped Within --outSAMattributes %s " % " ".join(ALIGN_TAGS))
        cmd += _add_sj_index_commands(fastq_file, ref_file, gtf_file) if not srna else ""
        cmd += _read_group_option(names)
        if dd.get_fusion_caller(data):
            if "arriba" in dd.get_fusion_caller(data):
                # Differs from the 1st pass: --chimOutType also lists SoftClip
                # and Junctions, and the --peOverlapNbasesMin /
                # --alignSplicedMateMapLminOverLmate options are absent.
                cmd += (
                    "--chimSegmentMin 10 --chimOutType WithinBAM SoftClip Junctions "
                    "--chimJunctionOverhangMin 10 --chimScoreMin 1 --chimScoreDropMax 30 "
                    "--chimScoreJunctionNonGTAG 0 --chimScoreSeparation 1 "
                    "--alignSJstitchMismatchNmax 5 -1 5 5 "
                    "--chimSegmentReadGapMax 3 ")
            else:
                cmd += (" --chimSegmentMin 12 --chimJunctionOverhangMin 12 "
                        "--chimScoreDropMax 30 --chimSegmentReadGapMax 5 "
                        "--chimScoreSeparation 5 ")
                if "oncofuse" in dd.get_fusion_caller(data):
                    cmd += "--chimOutType Junctions "
                else:
                    cmd += "--chimOutType WithinBAM "
        strandedness = utils.get_in(data, ("config", "algorithm", "strandedness"),
                                    "unstranded").lower()
        if strandedness == "unstranded" and not srna:
            cmd += " --outSAMstrandField intronMotif "
        if not srna:
            cmd += " --quantMode TranscriptomeSAM "
        resources = config_utils.get_resources("star", data["config"])
        if resources.get("options", []):
            cmd += " " + " ".join(
                [str(x) for x in resources.get("options", [])])
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_final_out)
        cmd += " > {tx_final_out} "
        run_message = "Running 2nd pass of STAR aligner on %s and %s" % (
            fastq_file, ref_file)
        do.run(cmd.format(**locals()), run_message, None)

    # Register the final (non-transactional) outputs and the STAR log for QC.
    data = _update_data(star_dirs.final_out, star_dirs.out_dir, names, data)
    out_log_file = os.path.join(align_dir, dd.get_lane(data) + "Log.final.out")
    data = dd.update_summary_qc(data, "star", base=out_log_file)
    return data