def _first_bismark_output(pattern, description):
    """Return the first path matching glob *pattern*.

    Raises IndexError -- the same type the former bare ``[0]`` lookup
    raised -- but with a message that names the missing output.
    """
    matches = glob.glob(pattern)
    if not matches:
        raise IndexError("bismark %s not found: no file matches %s"
                         % (description, pattern))
    return matches[0]


def _register_bismark_outputs(data, final_out, out_dir):
    """Record the final BAM, the bismark report and the QC summary on *data*."""
    data = dd.set_work_bam(data, final_out)
    data["bam_report"] = _first_bismark_output(
        os.path.join(out_dir, "*report.txt"), "report")
    data = dd.update_summary_qc(data, "bismark", base=data["bam_report"])
    return data


def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Align reads with bismark (bowtie2 mode) and register the result on *data*.

    Args:
        fastq_file: first (or only) FASTQ file.
        pair_file: second FASTQ file for paired data, falsy otherwise.
        ref_file: bismark index location; when falsy an error is logged and
            the process exits with status 1.
        names: unused here; kept for the aligner call signature.
        align_dir: directory receiving ``<sample>.bam`` and ``<lane>_bismark``.
        data: sample dictionary; ``data["analysis"]`` must start with
            "wgbs-seq" (case-insensitive).

    Returns:
        *data* with the work BAM, ``data["bam_report"]`` and the bismark QC
        summary set.

    Raises:
        AssertionError: for a non WGBS analysis.
        IndexError: when no raw BAM or no report can be found in the bismark
            output directory.
    """
    assert data["analysis"].lower().startswith("wgbs-seq"), "No comparible alignment."
    config = data["config"]
    sample = dd.get_sample_name(data)
    out_dir = os.path.join(align_dir, "%s_bismark" % dd.get_lane(data))

    if not ref_file:
        logger.error("bismark index not found. You can install "
                     "the index for your genome with: bcbio_nextgen.py upgrade "
                     "--aligners bismark --genomes genome-build-name --data")
        sys.exit(1)

    final_out = os.path.join(align_dir, "{0}.bam".format(sample))
    if file_exists(final_out):
        # Alignment already done: only re-attach the outputs.
        return _register_bismark_outputs(data, final_out, out_dir)

    bismark = config_utils.get_program("bismark", config)

    # bismark uses 5 threads/sample and ~12GB RAM/sample (hg38)
    resources = config_utils.get_resources("bismark", data["config"])
    max_cores = dd.get_num_cores(data)
    max_mem = config_utils.convert_to_bytes(resources.get("memory", "1G")) / (1024.0 * 1024.0)
    instances = calculate_bismark_instances(max_cores, max_mem * max_cores)
    # override instances if specified in the config
    if resources and resources.get("bismark_threads"):
        instances = resources.get("bismark_threads")
        logger.info(f"Using {instances} bismark instances - overriden by resources")
    bowtie_threads = 1
    if resources and resources.get("bowtie_threads"):
        bowtie_threads = resources.get("bowtie_threads")
    logger.info(f"Using {bowtie_threads} bowtie threads per bismark instance")

    kit = kits.KITS.get(dd.get_kit(data), None)
    directional = "--non_directional" if kit and not kit.is_directional else ""

    other_opts = " ".join([str(x) for x in resources.get("options", [])]).strip()

    safe_makedir(align_dir)
    cmd = ("{bismark} {other_opts} {directional} --bowtie2 --temp_dir {tx_out_dir} "
           "--gzip --parallel {instances} -p {bowtie_threads} -o {tx_out_dir} "
           "--unmapped {ref_file} {fastq_file} ")
    # Paired input is handed to bismark as "-1 <R1> -2 <R2>".
    reads_arg = "-1 %s -2 %s" % (fastq_file, pair_file) if pair_file else fastq_file

    raw_bam_pattern = out_dir + "/*bismark*bt2*bam"
    raw_bam = glob.glob(raw_bam_pattern)
    if raw_bam:
        raw_bam_file = raw_bam[0]
    else:
        with tx_tmpdir() as tx_out_dir:
            run_message = "Running Bismark aligner on %s and %s" % (reads_arg, ref_file)
            # Explicit arguments instead of **locals(): renaming a local can
            # no longer silently break the command line.
            do.run(cmd.format(bismark=bismark, other_opts=other_opts,
                              directional=directional, tx_out_dir=tx_out_dir,
                              instances=instances, bowtie_threads=bowtie_threads,
                              ref_file=ref_file, fastq_file=reads_arg),
                   run_message, None)
            # NOTE(review): shutil.move puts tx_out_dir *inside* out_dir when
            # out_dir already exists; the lookup below then reports the
            # missing BAM -- confirm whether stale directories can occur.
            shutil.move(tx_out_dir, out_dir)
        raw_bam_file = _first_bismark_output(raw_bam_pattern, "alignment BAM")
    # don't process bam in the bismark pipeline!
    utils.symlink_plus(raw_bam_file, final_out)
    return _register_bismark_outputs(data, final_out, out_dir)
def _require_bismark_output(pattern, description):
    """Return the first path matching glob *pattern*.

    Raises IndexError -- the same type the former bare ``[0]`` lookup
    raised -- but with a message that names the missing output.
    """
    matches = glob.glob(pattern)
    if not matches:
        raise IndexError("bismark %s not found: no file matches %s"
                         % (description, pattern))
    return matches[0]


def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Align reads with bismark (bowtie2 mode), post-process and register the BAM.

    Args:
        fastq_file: first (or only) FASTQ file.
        pair_file: second FASTQ file for paired data, falsy otherwise.
        ref_file: bismark index location; when falsy an error is logged and
            the process exits with status 1.
        names: unused here; kept for the aligner call signature.
        align_dir: directory receiving ``<sample>.bam`` and ``<lane>_bismark``.
        data: sample dictionary; ``data["analysis"]`` must start with
            "wgbs-seq" (case-insensitive).

    Returns:
        *data* with the work BAM and ``data["bam_report"]`` set.

    Raises:
        AssertionError: for a non WGBS analysis.
        IndexError: when no raw BAM or no report can be found in the bismark
            output directory.
    """
    assert data["analysis"].lower().startswith("wgbs-seq"), "No comparible alignment."
    config = data["config"]
    sample = dd.get_sample_name(data)
    out_dir = os.path.join(align_dir, "%s_bismark" % dd.get_lane(data))
    report_pattern = os.path.join(out_dir, "*report.txt")

    if not ref_file:
        # The message previously talked about STAR indexes, which has nothing
        # to do with this aligner.
        logger.error("bismark index not found. You can install "
                     "the index for your genome with: bcbio_nextgen.py upgrade "
                     "--aligners bismark --genomes genome-build-name --data")
        sys.exit(1)

    final_out = os.path.join(align_dir, "{0}.bam".format(sample))
    if file_exists(final_out):
        # Alignment already done: only re-attach the outputs.
        data = dd.set_work_bam(data, final_out)
        data["bam_report"] = _require_bismark_output(report_pattern, "report")
        return data

    bismark = config_utils.get_program("bismark", config)

    # bismark uses 5 threads/sample and ~12GB RAM/sample (hg38)
    resources = config_utils.get_resources("bismark", data["config"])
    max_cores = resources.get("cores", 1)
    max_mem = config_utils.convert_to_bytes(resources.get("memory", "1G"))
    by_cores = max(int(max_cores / 5), 1)
    by_memory = max(int(max_mem / config_utils.convert_to_bytes("12G")), 1)
    n = min(by_cores, by_memory)

    kit = kits.KITS.get(dd.get_kit(data), None)
    directional = "--non_directional" if kit and not kit.is_directional else ""

    other_opts = " ".join([str(x) for x in resources.get("options", [])]).strip()

    # Plain space-separated inputs, passed on to _process_bam below.
    fastq_files = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    safe_makedir(align_dir)
    cmd = ("{bismark} {other_opts} {directional} --bowtie2 --temp_dir {tx_out_dir} "
           "--gzip --multicore {n} -o {tx_out_dir} --unmapped {ref_file} {fastq_file}")
    # Paired input is handed to bismark as "-1 <R1> -2 <R2>".
    reads_arg = "-1 %s -2 %s" % (fastq_file, pair_file) if pair_file else fastq_file

    raw_bam_pattern = out_dir + "/*bismark*bt2*bam"
    raw_bam = glob.glob(raw_bam_pattern)
    if raw_bam:
        raw_bam_file = raw_bam[0]
    else:
        with tx_tmpdir() as tx_out_dir:
            run_message = "Running Bismark aligner on %s and %s" % (reads_arg, ref_file)
            # Explicit arguments instead of **locals(): renaming a local can
            # no longer silently break the command line.
            do.run(cmd.format(bismark=bismark, other_opts=other_opts,
                              directional=directional, tx_out_dir=tx_out_dir,
                              n=n, ref_file=ref_file, fastq_file=reads_arg),
                   run_message, None)
            # NOTE(review): shutil.move puts tx_out_dir *inside* out_dir when
            # out_dir already exists; the lookup below then reports the
            # missing BAM -- confirm whether stale directories can occur.
            shutil.move(tx_out_dir, out_dir)
        raw_bam_file = _require_bismark_output(raw_bam_pattern, "alignment BAM")

    process_bam = _process_bam(raw_bam_file, fastq_files, sample,
                               dd.get_sam_ref(data), config)
    utils.symlink_plus(process_bam, final_out)
    data = dd.set_work_bam(data, final_out)
    data["bam_report"] = _require_bismark_output(report_pattern, "report")
    return data
def trim(data):
    """Remove adapter for bisulphite conversion sequencing data.

    Runs trim_galore with ``--paired`` on the first two entries of
    ``data["files"]`` and writes into ``<work_dir>/trimmed/<sample>``.
    The run is skipped when the first expected output already exists.

    Args:
        data: sample dictionary; ``data["files"]`` must hold two input files
            and ``data["config"]`` the program/resource configuration.

    Returns:
        ``[[data]]`` with ``data["files"]`` replaced by the two expected
        trimmed file paths (``*_val_1.fq.gz`` and ``*_val_2.fq.gz``).
    """
    in_files = data["files"]
    names = dd.get_sample_name(data)
    out_dir = utils.safe_makedir(
        os.path.join(dd.get_work_dir(data), "trimmed", names))
    out_files = [
        os.path.join(out_dir,
                     utils.splitext_plus(os.path.basename(in_files[0]))[0] + '_val_1.fq.gz'),
        os.path.join(out_dir,
                     utils.splitext_plus(os.path.basename(in_files[1]))[0] + '_val_2.fq.gz'),
    ]
    if utils.file_exists(out_files[0]):
        # Already trimmed: reuse the existing outputs.
        data["files"] = out_files
        return [[data]]

    kit = kits.KITS.get(dd.get_kit(data), None)
    if kit:
        logger.info(f"{kit.name} specified, using clip settings: R1 5'-{kit.clip_r1_5}nt/--/{kit.clip_r1_3}nt-3', R2 5'-{kit.clip_r2_5}nt/--/{kit.clip_r2_3}nt-3'")
        clipsettings = _get_clip_settings(kit)
    else:
        logger.info("No kit specified, using default clip settings")
        clipsettings = ""

    trim_galore = config_utils.get_program("trim_galore", data["config"])
    # trim_galore actual cores used = 3x + 3 where x = value of the parameter (according to manual)
    tg_cores = max(int((dd.get_num_cores(data) - 3) / 3), 1)
    extra_opts = config_utils.get_resources("trim_galore", data["config"]).get("options", [])
    other_opts = " ".join([str(x) for x in extra_opts]).strip()
    cmd = ("{trim_galore} {other_opts} {clipsettings} --cores {tg_cores} "
           "--length 30 --quality 30 --fastqc --paired -o {tx_out_dir} {files}")

    # The early return above already handled existing outputs, so no second
    # existence check is needed before running.
    with file_transaction(out_dir) as tx_out_dir:
        files = "%s %s" % (in_files[0], in_files[1])
        # Explicit arguments instead of **locals(): renaming a local can no
        # longer silently break the command line.
        do.run(cmd.format(trim_galore=trim_galore, other_opts=other_opts,
                          clipsettings=clipsettings, tg_cores=tg_cores,
                          tx_out_dir=tx_out_dir, files=files),
               "remove adapters with trimgalore")
    data["files"] = out_files
    return [[data]]