def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    paired = True if pair_file else False
    hisat2 = config_utils.get_program("hisat2", data)
    num_cores = dd.get_num_cores(data)
    quality_flag = _get_quality_flag(data)
    stranded_flag = _get_stranded_flag(data, paired)
    rg_flags = _get_rg_flags(names)
    out_file = os.path.join(align_dir, dd.get_lane(data)) + ".bam"
    if file_exists(out_file):
        data = dd.set_work_bam(data, out_file)
        return data
    cmd = ("{hisat2} -x {ref_file} -p {num_cores} {quality_flag} {stranded_flag} "
           "{rg_flags} ")
    if paired:
        cmd += "-1 {fastq_file} -2 {pair_file} "
    else:
        cmd += "-U {fastq_file} "
    if dd.get_analysis(data).lower() == "smallrna-seq":
        cmd += "-k 1000 "
    # if assembling transcripts, set flags that cufflinks/stringtie can use
    if dd.get_transcript_assembler(data):
        cmd += "--dta-cufflinks "
    if dd.get_analysis(data).lower() == "rna-seq":
        gtf_file = dd.get_gtf_file(data)
        splicesites = os.path.join(os.path.dirname(gtf_file),
                                   "ref-transcripts-splicesites.txt")
        cmd += "--known-splicesite-infile {splicesites} "
    message = "Aligning %s and %s with hisat2." % (fastq_file, pair_file)
    with file_transaction(out_file) as tx_out_file:
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_out_file)
        do.run(cmd.format(**locals()), message)
    data = dd.set_work_bam(data, out_file)
    return data

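# _get_quality_flag, _get_stranded_flag and _get_rg_flags are helpers used by the
# hisat2 snippets but not defined here. As one example, a minimal, hypothetical
# sketch of _get_stranded_flag, assuming it maps the sample's strandedness setting
# onto hisat2's --rna-strandness values (RF/R for firststrand, FR/F for
# secondstrand, nothing for unstranded); the real helper may differ.
def _get_stranded_flag(data, paired):
    """Sketch (assumption): translate strandedness into a hisat2 --rna-strandness flag."""
    strandedness = dd.get_strandedness(data)  # accessor assumed from the dd.set_strandedness usage below
    flags = {"firststrand": "RF" if paired else "R",
             "secondstrand": "FR" if paired else "F"}
    flag = flags.get(strandedness)
    return "--rna-strandness %s" % flag if flag else ""
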
def dedup_bismark(data):
    """Remove alignments to the same position in the genome from the Bismark
    mapping output using deduplicate_bismark
    """
    config = data["config"]
    input_file = datadict.get_work_bam(data)
    # don't sort even by read names
    # input_file = bam.sort(input_file, config, order="queryname")
    sample_name = datadict.get_sample_name(data)
    output_dir = os.path.join(datadict.get_work_dir(data), 'dedup', sample_name)
    output_dir = utils.safe_makedir(output_dir)
    input_file_name, input_file_extension = os.path.splitext(os.path.basename(input_file))
    output_file = os.path.join(output_dir,
                               f'{input_file_name}.deduplicated{input_file_extension}')
    if utils.file_exists(output_file):
        data = datadict.set_work_bam(data, output_file)
        return [[data]]
    deduplicate_bismark = config_utils.get_program('deduplicate_bismark', config)
    command = f'{deduplicate_bismark} --output_dir {output_dir} {input_file}'
    with transaction.file_transaction(output_dir):
        do.run(command, 'Remove duplicate alignments')
    data = datadict.set_work_bam(data, output_file)
    data["deduplication_report"] = output_file.replace("deduplicated.bam",
                                                       "deduplication_report.txt")
    return [[data]]

def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    assert data["analysis"].lower().startswith("wgbs-seq"), "No comparable alignment"
    config = data["config"]
    sample = dd.get_sample_name(data)
    out_prefix = os.path.join(align_dir, dd.get_lane(data))
    ref_file = dd.get_sam_ref(data)
    final_out = os.path.join(align_dir, "{0}.bam".format(sample))
    if file_exists(final_out):
        data = dd.set_work_bam(data, final_out)
        return data
    bsmap = config_utils.get_program("bsmap", config)
    fastq_files = "-a %s" % fastq_file
    num_cores = dd.get_num_cores(data)
    num_cores = "-p %d" % num_cores
    safe_makedir(align_dir)
    cmd = "{bsmap} {num_cores} -w 100 -v 0.07 -m 10 -x 300 -o {tx_out_bam} -d {ref_file} {fastq_files}"
    if pair_file:
        fastq_files = "-a %s -b %s" % (fastq_file, pair_file)
    if not file_exists(final_out):
        with file_transaction(final_out) as tx_out_bam:
            run_message = "Running BSMAP aligner on %s and %s" % (fastq_file, ref_file)
            do.run(cmd.format(**locals()), run_message, None)
    data = dd.set_work_bam(data, final_out)
    return data

def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    paired = True if pair_file else False
    hisat2 = config_utils.get_program("hisat2", data)
    num_cores = dd.get_num_cores(data)
    quality_flag = _get_quality_flag(data)
    stranded_flag = _get_stranded_flag(data, paired)
    rg_flags = _get_rg_flags(names)
    out_file = os.path.join(align_dir, dd.get_lane(data)) + ".bam"
    if file_exists(out_file):
        data = dd.set_work_bam(data, out_file)
        return data
    cmd = ("{hisat2} -x {ref_file} -p {num_cores} {quality_flag} {stranded_flag} "
           "{rg_flags} ")
    if paired:
        cmd += "-1 {fastq_file} -2 {pair_file} "
    else:
        cmd += "-U {fastq_file} "
    if dd.get_analysis(data).lower() == "smallrna-seq":
        cmd += "-k 1000 "
    # if assembling transcripts, set flags that cufflinks can use
    if dd.get_assemble_transcripts(data):
        cmd += "--dta-cufflinks "
    if dd.get_analysis(data).lower() == "rna-seq":
        gtf_file = dd.get_gtf_file(data)
        splicesites = os.path.join(os.path.dirname(gtf_file),
                                   "ref-transcripts-splicesites.txt")
        cmd += "--known-splicesite-infile {splicesites} "
    message = "Aligning %s and %s with hisat2." % (fastq_file, pair_file)
    with file_transaction(out_file) as tx_out_file:
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_out_file)
        do.run(cmd.format(**locals()), message)
    data = dd.set_work_bam(data, out_file)
    return data

def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    assert data["analysis"].lower().startswith("wgbs-seq"), "No comparable alignment."
    config = data["config"]
    sample = dd.get_sample_name(data)
    out_prefix = os.path.join(align_dir, dd.get_lane(data))
    out_dir = os.path.join(align_dir, "%s_bismark" % dd.get_lane(data))
    if not ref_file:
        logger.error("bismark index not found. You can install "
                     "the index for your genome with: bcbio_nextgen.py upgrade "
                     "--aligners bismark --genomes genome-build-name --data")
        sys.exit(1)
    final_out = os.path.join(align_dir, "{0}.bam".format(sample))
    if file_exists(final_out):
        data = dd.set_work_bam(data, final_out)
        data["bam_report"] = glob.glob(os.path.join(out_dir, "*report.txt"))[0]
        data = dd.update_summary_qc(data, "bismark", base=data["bam_report"])
        return data
    bismark = config_utils.get_program("bismark", config)
    # bismark uses 5 threads/sample and ~12GB RAM/sample (hg38)
    resources = config_utils.get_resources("bismark", data["config"])
    max_cores = dd.get_num_cores(data)
    max_mem = config_utils.convert_to_bytes(resources.get("memory", "1G")) / (1024.0 * 1024.0)
    instances = calculate_bismark_instances(max_cores, max_mem * max_cores)
    # override instances if specified in the config
    if resources and resources.get("bismark_threads"):
        instances = resources.get("bismark_threads")
        logger.info(f"Using {instances} bismark instances - overridden by resources")
    bowtie_threads = 1
    if resources and resources.get("bowtie_threads"):
        bowtie_threads = resources.get("bowtie_threads")
        logger.info(f"Using {bowtie_threads} bowtie threads per bismark instance")
    kit = kits.KITS.get(dd.get_kit(data), None)
    directional = "--non_directional" if kit and not kit.is_directional else ""
    other_opts = resources.get("options", [])
    other_opts = " ".join([str(x) for x in other_opts]).strip()
    fastq_files = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    safe_makedir(align_dir)
    cmd = "{bismark} {other_opts} {directional} --bowtie2 --temp_dir {tx_out_dir} --gzip --parallel {instances} -p {bowtie_threads} -o {tx_out_dir} --unmapped {ref_file} {fastq_file} "
    if pair_file:
        fastq_file = "-1 %s -2 %s" % (fastq_file, pair_file)
    raw_bam = glob.glob(out_dir + "/*bismark*bt2*bam")
    if not raw_bam:
        with tx_tmpdir() as tx_out_dir:
            run_message = "Running Bismark aligner on %s and %s" % (fastq_file, ref_file)
            do.run(cmd.format(**locals()), run_message, None)
            shutil.move(tx_out_dir, out_dir)
        raw_bam = glob.glob(out_dir + "/*bismark*bt2*bam")
    # don't process bam in the bismark pipeline!
    utils.symlink_plus(raw_bam[0], final_out)
    data = dd.set_work_bam(data, final_out)
    data["bam_report"] = glob.glob(os.path.join(out_dir, "*report.txt"))[0]
    data = dd.update_summary_qc(data, "bismark", base=data["bam_report"])
    return data

def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    assert data["analysis"].lower().startswith("wgbs-seq"), "No comparable alignment."
    config = data["config"]
    sample = dd.get_sample_name(data)
    out_prefix = os.path.join(align_dir, dd.get_lane(data))
    out_dir = os.path.join(align_dir, "%s_bismark" % dd.get_lane(data))
    if not ref_file:
        logger.error("bismark index not found. We don't provide the bismark indexes "
                     "by default because they are very large. You can install "
                     "the index for your genome with: bcbio_nextgen.py upgrade "
                     "--aligners bismark --genomes genome-build-name --data")
        sys.exit(1)
    final_out = os.path.join(align_dir, "{0}.bam".format(sample))
    if file_exists(final_out):
        data = dd.set_work_bam(data, final_out)
        data["bam_report"] = glob.glob(os.path.join(out_dir, "*report.txt"))[0]
        return data
    bismark = config_utils.get_program("bismark", config)
    # bismark uses 5 threads/sample and ~12GB RAM/sample (hg38)
    resources = config_utils.get_resources("bismark", data["config"])
    max_cores = resources.get("cores", 1)
    max_mem = config_utils.convert_to_bytes(resources.get("memory", "1G"))
    n = min(max(int(max_cores / 5), 1),
            max(int(max_mem / config_utils.convert_to_bytes("12G")), 1))
    kit = kits.KITS.get(dd.get_kit(data), None)
    directional = "--non_directional" if kit and not kit.is_directional else ""
    other_opts = resources.get("options", [])
    other_opts = " ".join([str(x) for x in other_opts]).strip()
    fastq_files = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    safe_makedir(align_dir)
    cmd = "{bismark} {other_opts} {directional} --bowtie2 --temp_dir {tx_out_dir} --gzip --multicore {n} -o {tx_out_dir} --unmapped {ref_file} {fastq_file}"
    if pair_file:
        fastq_file = "-1 %s -2 %s" % (fastq_file, pair_file)
    raw_bam = glob.glob(out_dir + "/*bismark*bt2*bam")
    if not raw_bam:
        with tx_tmpdir() as tx_out_dir:
            run_message = "Running Bismark aligner on %s and %s" % (fastq_file, ref_file)
            do.run(cmd.format(**locals()), run_message, None)
            shutil.move(tx_out_dir, out_dir)
        raw_bam = glob.glob(out_dir + "/*bismark*bt2*bam")
    process_bam = _process_bam(raw_bam[0], fastq_files, sample, dd.get_sam_ref(data), config)
    utils.symlink_plus(process_bam, final_out)
    data = dd.set_work_bam(data, final_out)
    data["bam_report"] = glob.glob(os.path.join(out_dir, "*report.txt"))[0]
    return data

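# calculate_bismark_instances() is called in the newer bismark snippet above but
# is not defined in these snippets. A minimal, hypothetical sketch, assuming it
# applies the same heuristic the older snippet inlines directly: roughly 5 cores
# and ~12GB of memory per Bismark instance (hg38), never dropping below one
# instance. Units and the exact behavior of the real helper may differ.
def calculate_bismark_instances(cores, memory_gb):
    """Sketch (assumption): number of parallel bismark instances fitting the core/memory budget."""
    CORES_PER_INSTANCE = 5    # bismark/bowtie2/samtools threads per instance
    MEM_GB_PER_INSTANCE = 12  # approximate memory per instance on hg38
    by_cores = max(int(cores / CORES_PER_INSTANCE), 1)
    by_memory = max(int(memory_gb / MEM_GB_PER_INSTANCE), 1)
    return min(by_cores, by_memory)
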
def _update_data(align_file, out_dir, names, data):
    data = dd.set_work_bam(data, align_file)
    data = dd.set_align_bam(data, align_file)
    if dd.get_transcriptome_align(data) and not is_transcriptome_broken():
        transcriptome_file = _move_transcriptome_file(out_dir, names)
        data = dd.set_transcriptome_bam(data, transcriptome_file)
    return data

def _update_data(align_file, out_dir, names, data):
    data = dd.set_work_bam(data, align_file)
    data = dd.set_align_bam(data, align_file)
    if dd.get_transcriptome_align(data) and not is_transcriptome_broken(data):
        transcriptome_file = _move_transcriptome_file(out_dir, names)
        data = dd.set_transcriptome_bam(data, transcriptome_file)
    return data

def pipeline_summary(data):
    """Provide summary information on processing sample.

    Handles standard and CWL (single QC output) cases.
    """
    data = utils.to_single_data(data)
    if data["analysis"].startswith("wgbs-seq"):
        bismark_bam = dd.get_align_bam(data)
        sorted_bam = bam.sort(bismark_bam, data["config"])
        data = dd.set_align_bam(data, sorted_bam)
        data = dd.set_work_bam(data, bismark_bam)
    work_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    if not work_bam or not work_bam.endswith(".bam"):
        work_bam = None
    if dd.get_ref_file(data):
        # kraken doesn't need a BAM
        if work_bam or (tz.get_in(["config", "algorithm", "kraken"], data)):
            logger.info("QC: %s %s" % (dd.get_sample_name(data),
                                       ", ".join(dd.get_algorithm_qc(data))))
            work_data = cwlutils.unpack_tarballs(utils.deepish_copy(data), data)
            data["summary"] = _run_qc_tools(work_bam, work_data)
            if (len(dd.get_algorithm_qc(data)) == 1 and "output_cwl_keys" in data):
                data["summary"]["qc"] = data["summary"]["qc"].get(dd.get_algorithm_qc(data)[0])
    return [[data]]

def test_bcbio_dexseq(self):
    data = dd.set_sample_name({}, "test")
    data = dd.set_work_bam(data, test_data.BAM_FILE)
    data = dd.set_work_dir(data, self.out_dir)
    data = dd.set_dexseq_gff(data, test_data.DEXSEQ_GFF)
    data = dd.set_strandedness(data, "unstranded")
    out_file = dexseq.bcbio_run(data)
    self.assertTrue(file_exists(out_file))

def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    assert data["analysis"].lower().startswith("wgbs-seq"), "No comparable alignment."
    config = data["config"]
    sample = dd.get_sample_name(data)
    out_prefix = os.path.join(align_dir, dd.get_lane(data))
    out_dir = os.path.join(align_dir, "%s_bismark" % dd.get_lane(data))
    if not ref_file:
        logger.error("bismark index not found. We don't provide the bismark indexes "
                     "by default because they are very large. You can install "
                     "the index for your genome with: bcbio_nextgen.py upgrade "
                     "--aligners bismark --genomes genome-build-name --data")
        sys.exit(1)
    final_out = os.path.join(align_dir, "{0}.bam".format(sample))
    if file_exists(final_out):
        data = dd.set_work_bam(data, final_out)
        data["bam_report"] = glob.glob(os.path.join(out_dir, "*report.txt"))[0]
        return data
    bismark = config_utils.get_program("bismark", config)
    fastq_files = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    num_cores = dd.get_num_cores(data)
    n = 1 if num_cores < 5 else 2
    safe_makedir(align_dir)
    cmd = "{bismark} --bowtie2 --temp_dir {tx_out_dir} --gzip --multicore {n} -o {tx_out_dir} --unmapped {ref_file} {fastq_file}"
    if pair_file:
        fastq_file = "-1 %s -2 %s" % (fastq_file, pair_file)
    raw_bam = glob.glob(out_dir + "/*bismark*bt2*bam")
    if not raw_bam:
        with tx_tmpdir() as tx_out_dir:
            run_message = "Running Bismark aligner on %s and %s" % (fastq_file, ref_file)
            do.run(cmd.format(**locals()), run_message, None)
            shutil.move(tx_out_dir, out_dir)
        raw_bam = glob.glob(out_dir + "/*bismark*bt2*bam")
    process_bam = _process_bam(raw_bam[0], fastq_files, sample, dd.get_sam_ref(data), config)
    utils.symlink_plus(process_bam, final_out)
    data = dd.set_work_bam(data, final_out)
    data["bam_report"] = glob.glob(os.path.join(out_dir, "*report.txt"))[0]
    return data

def _update_data(align_file, out_dir, names, data):
    data = dd.set_work_bam(data, align_file)
    data = dd.set_align_bam(data, align_file)
    transcriptome_file = _move_transcriptome_file(out_dir, names)
    data = dd.set_transcriptome_bam(data, transcriptome_file)
    sjfile = get_splicejunction_file(out_dir, data)
    sjbed = junction2bed(sjfile)
    data = dd.set_junction_bed(data, sjbed)
    return data

def _update_data(align_file, out_dir, names, data):
    data = dd.set_work_bam(data, align_file)
    data = dd.set_align_bam(data, align_file)
    transcriptome_file = _move_transcriptome_file(out_dir, names)
    data = dd.set_transcriptome_bam(data, transcriptome_file)
    sjfile = get_splicejunction_file(out_dir, data)
    if sjfile:
        sjbed = junction2bed(sjfile)
        data = dd.set_junction_bed(data, sjbed)
    return data

def fix_umi_dragen_bam(data, bam=None):
    """Fix the UMI BAM from DRAGEN.

    Accepts a pre-UMI-collapsed BAM file and adds several missing tags
    needed to use fgbio's UMI tools.
    """
    if not bam:
        bam = dd.get_work_bam(data)
    base, ext = os.path.splitext(bam)
    sample_name = dd.get_sample_name(data)
    out_bam = os.path.join(dd.get_work_dir(data), "align", sample_name,
                           sample_name + "-fgbio" + ext)
    out_bam = add_fgbio_tags(bam, out_bam, data)
    data = dd.set_work_bam(data, out_bam)
    return data

def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    paired = True if pair_file else False
    hisat2 = config_utils.get_program("hisat2", data)
    num_cores = dd.get_num_cores(data)
    quality_flag = _get_quality_flag(data)
    stranded_flag = _get_stranded_flag(data, paired)
    rg_flags = _get_rg_flags(names)
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
    else:
        final_file = None
    if not file_exists(out_file) and (final_file is None or not file_exists(final_file)):
        cmd = ("{hisat2} --new-summary -x {ref_file} -p {num_cores} {quality_flag} {stranded_flag} "
               "{rg_flags} ")
        if paired:
            cmd += "-1 {fastq_file} -2 {pair_file} "
        else:
            cmd += "-U {fastq_file} "
        if dd.get_analysis(data).lower() == "smallrna-seq":
            cmd += "-k 1000 "
        # if assembling transcripts, set flags that cufflinks/stringtie can use
        if dd.get_transcript_assembler(data):
            cmd += "--dta-cufflinks "
        if dd.get_analysis(data).lower() == "rna-seq":
            splicesites = get_known_splicesites_file(align_dir, data)
            if file_exists(splicesites):
                cmd += "--known-splicesite-infile {splicesites} "
        novel_splicesite_file = os.path.join(align_dir,
                                             "{0}-novelsplicesites.bed".format(dd.get_sample_name(data)))
        cmd += "--novel-splicesite-outfile {novel_splicesite_file} "
        # apply additional hisat2 options
        cmd += " ".join(_get_options_from_config(data))
        message = "Aligning %s and %s with hisat2." % (fastq_file, pair_file)
        with postalign.tobam_cl(data, out_file, pair_file is not None) as (tobam_cl, tx_out_file):
            cmd += " | " + tobam_cl
            do.run(cmd.format(**locals()), message)
    data = dd.set_work_bam(data, out_file)
    junctionbed = get_splicejunction_file(align_dir, data)
    data = dd.set_junction_bed(data, junctionbed)
    return data

def run_rapmap_pseudoalign(data):
    samplename = dd.get_sample_name(data)
    files = dd.get_input_sequence_files(data)
    work_dir = dd.get_work_dir(data)
    if len(files) == 2:
        fq1, fq2 = files
    else:
        fq1, fq2 = files[0], None
    rapmap_dir = os.path.join(work_dir, "rapmap", samplename)
    gtf_file = dd.get_gtf_file(data)
    assert file_exists(gtf_file), "%s was not found, exiting." % gtf_file
    fasta_file = dd.get_ref_file(data)
    assert file_exists(fasta_file), "%s was not found, exiting." % fasta_file
    out_file = rapmap_pseudoalign(fq1, fq2, rapmap_dir, gtf_file, fasta_file, data)
    data = dd.set_work_bam(data, out_file)
    data = dd.set_transcriptome_bam(data, out_file)
    return data

def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    paired = True if pair_file else False
    hisat2 = config_utils.get_program("hisat2", data)
    num_cores = dd.get_num_cores(data)
    quality_flag = _get_quality_flag(data)
    stranded_flag = _get_stranded_flag(data, paired)
    rg_flags = _get_rg_flags(names)
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
    else:
        final_file = None
    if not file_exists(out_file) and (final_file is None or not file_exists(final_file)):
        cmd = ("{hisat2} --new-summary -x {ref_file} -p {num_cores} {quality_flag} {stranded_flag} "
               "{rg_flags} ")
        if paired:
            cmd += "-1 {fastq_file} -2 {pair_file} "
        else:
            cmd += "-U {fastq_file} "
        if dd.get_analysis(data).lower() == "smallrna-seq":
            cmd += "-k 1000 "
        # if assembling transcripts, set flags that cufflinks/stringtie can use
        if dd.get_transcript_assembler(data):
            cmd += "--dta-cufflinks "
        if dd.get_analysis(data).lower() == "rna-seq":
            splicesites = get_known_splicesites_file(align_dir, data)
            if file_exists(splicesites):
                cmd += "--known-splicesite-infile {splicesites} "
        novel_splicesite_file = os.path.join(align_dir,
                                             "{0}-novelsplicesites.bed".format(dd.get_sample_name(data)))
        cmd += "--novel-splicesite-outfile {novel_splicesite_file} "
        # apply additional hisat2 options
        cmd += " ".join(_get_options_from_config(data))
        message = "Aligning %s and %s with hisat2." % (fastq_file, pair_file)
        with postalign.tobam_cl(data, out_file, pair_file is not None) as (tobam_cl, tx_out_file):
            cmd += " | " + tobam_cl
            do.run(cmd.format(**locals()), message)
    data = dd.set_work_bam(data, out_file)
    junctionbed = get_splicejunction_file(align_dir, data)
    data = dd.set_junction_bed(data, junctionbed)
    return data

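# _get_options_from_config() is used in the two hisat2 snippets above to append
# extra command-line arguments, but its body is not shown. A hypothetical sketch,
# assuming it simply returns any user-supplied "options" list from the hisat2
# resources block (the same pattern the bismark snippets use); the real helper
# may behave differently.
def _get_options_from_config(data):
    """Sketch (assumption): pull extra hisat2 command-line options from resources."""
    resources = config_utils.get_resources("hisat2", data["config"])
    return [str(x) for x in resources.get("options", [])]
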
def _update_data(align_file, out_dir, names, data):
    data = dd.set_work_bam(data, align_file)
    data = dd.set_align_bam(data, align_file)
    transcriptome_file = _move_transcriptome_file(out_dir, names)
    data = dd.set_transcriptome_bam(data, transcriptome_file)
    return data