def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Align WGBS reads with BSMAP and record the resulting BAM on ``data``.

    Args:
        fastq_file: path to the first (or only) FASTQ file.
        pair_file: path to the mate FASTQ file, or a falsy value for
            single-end input.
        ref_file: ignored; the reference is re-read from ``data`` through
            ``dd.get_sam_ref``.
        names: unused by this aligner.
        align_dir: directory that receives ``<sample>.bam``.
        data: sample dictionary; must contain ``"analysis"`` and ``"config"``.

    Returns:
        The sample dictionary returned by ``dd.set_work_bam`` pointing at
        ``<align_dir>/<sample>.bam``.

    Raises:
        AssertionError: if the analysis type does not start with "wgbs-seq".
    """
    assert data["analysis"].lower().startswith("wgbs-seq"), "No comparible alignment"
    config = data["config"]
    sample = dd.get_sample_name(data)
    # The reference recorded on the sample takes precedence over the argument.
    ref_file = dd.get_sam_ref(data)

    final_out = os.path.join(align_dir, "{0}.bam".format(sample))
    if file_exists(final_out):
        # Nothing to do: a previous run already produced the alignment.
        return dd.set_work_bam(data, final_out)

    bsmap = config_utils.get_program("bsmap", config)
    if pair_file:
        fastq_files = "-a %s -b %s" % (fastq_file, pair_file)
    else:
        fastq_files = " -a %s" % fastq_file
    num_cores = "-p %d" % dd.get_num_cores(data)
    safe_makedir(align_dir)
    cmd = "{bsmap} {num_cores} -w 100 -v 0.07 -m 10 -x 300 -o {tx_out_bam} -d {ref_file} {fastq_files}"

    # The previous guard here was ``if not final_out``, which is never true for
    # a non-empty path, so BSMAP was silently skipped.  The existence check
    # above already handled the "output present" case, so run unconditionally.
    with file_transaction(final_out) as tx_out_bam:
        run_message = "Running BSMAP aligner on %s and %s" % (fastq_file, ref_file)
        do.run(
            cmd.format(
                bsmap=bsmap,
                num_cores=num_cores,
                tx_out_bam=tx_out_bam,
                ref_file=ref_file,
                fastq_files=fastq_files,
            ),
            run_message,
            None,
        )
    return dd.set_work_bam(data, final_out)
def cufflinks_assemble(data):
    """Run Cufflinks assembly for one sample and store the GTF on it.

    The assembly is written under ``<work_dir>/assembly``.  The sample
    returned by ``dd.set_assembled_gtf`` is wrapped as ``[[sample]]``.
    """
    aligned_bam = dd.get_work_bam(data)
    reference = dd.get_sam_ref(data)
    assembly_dir = os.path.join(dd.get_work_dir(data), "assembly")
    gtf_path = cufflinks.assemble(
        aligned_bam, reference, dd.get_num_cores(data), assembly_dir, data
    )
    return [[dd.set_assembled_gtf(data, gtf_path)]]
def cufflinks_assemble(data):
    """Run Cufflinks assembly for one sample and append the GTF to its list.

    The assembly is written under ``<work_dir>/assembly``.  The new GTF is
    appended in place to whatever ``dd.get_assembled_gtf`` returns, and the
    same ``data`` object is returned wrapped as ``[[data]]``.
    """
    aligned_bam = dd.get_work_bam(data)
    reference = dd.get_sam_ref(data)
    assembly_dir = os.path.join(dd.get_work_dir(data), "assembly")
    gtf_path = cufflinks.assemble(
        aligned_bam, reference, dd.get_num_cores(data), assembly_dir, data
    )
    # NOTE(review): presumably the getter hands back the list stored inside
    # ``data`` so this mutation is visible to callers -- confirm.
    collected = dd.get_assembled_gtf(data)
    collected.append(gtf_path)
    return [[data]]
def run_cufflinks(data):
    """Quantitate transcript expression with Cufflinks.

    Stores the output directory and the two FPKM files returned by
    ``cufflinks.run`` on the sample, and returns it wrapped as ``[[sample]]``.
    """
    results = cufflinks.run(dd.get_work_bam(data), dd.get_sam_ref(data), data)
    out_dir, fpkm_file, fpkm_isoform_file = results
    updates = (
        (dd.set_cufflinks_dir, out_dir),
        (dd.set_fpkm, fpkm_file),
        (dd.set_fpkm_isoform, fpkm_isoform_file),
    )
    for setter, value in updates:
        data = setter(data, value)
    return [[data]]
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Align WGBS reads with Bismark and record the BAM and report on ``data``.

    Bismark writes into a temporary directory that is then moved to
    ``<align_dir>/<lane>_bismark``.  The first BAM found there is passed to
    ``_process_bam`` and the result is linked to ``<align_dir>/<sample>.bam``
    via ``utils.symlink_plus``.

    Args:
        fastq_file: path to the first (or only) FASTQ file.
        pair_file: path to the mate FASTQ file, or a falsy value for
            single-end input.
        ref_file: bismark index location; a falsy value aborts the process.
        names: unused by this aligner.
        align_dir: directory that receives the outputs.
        data: sample dictionary; must contain ``"analysis"`` and ``"config"``.

    Returns:
        The sample dictionary with the work BAM set and ``"bam_report"``
        pointing at the first ``*report.txt`` file in the bismark directory.

    Raises:
        AssertionError: if the analysis type does not start with "wgbs-seq".
        SystemExit: if ``ref_file`` is falsy.
        IndexError: if no bismark BAM or no report file is found.
    """
    assert data["analysis"].lower().startswith(
        "wgbs-seq"), "No comparible alignment."
    config = data["config"]
    sample = dd.get_sample_name(data)
    out_dir = os.path.join(align_dir, "%s_bismark" % dd.get_lane(data))

    if not ref_file:
        # The message previously referred to "STAR indexes" (copy-paste from
        # another aligner); it now names the index that is actually missing.
        logger.error(
            "bismark index not found. We don't provide the bismark indexes "
            "by default because they are very large. You can install "
            "the index for your genome with: bcbio_nextgen.py upgrade "
            "--aligners bismark --genomes genome-build-name --data")
        sys.exit(1)

    final_out = os.path.join(align_dir, "{0}.bam".format(sample))
    if file_exists(final_out):
        data = dd.set_work_bam(data, final_out)
        data["bam_report"] = glob.glob(os.path.join(out_dir, "*report.txt"))[0]
        return data

    bismark = config_utils.get_program("bismark", config)

    # bismark uses 5 threads/sample and ~12GB RAM/sample (hg38)
    resources = config_utils.get_resources("bismark", data["config"])
    max_cores = resources.get("cores", 1)
    max_mem = config_utils.convert_to_bytes(resources.get("memory", "1G"))
    # Number of parallel bismark instances: limited by both cores and memory,
    # never below one.
    n = min(max(int(max_cores / 5), 1),
            max(int(max_mem / config_utils.convert_to_bytes("12G")), 1))

    kit = kits.KITS.get(dd.get_kit(data), None)
    directional = "--non_directional" if kit and not kit.is_directional else ""

    other_opts = resources.get("options", [])
    other_opts = " ".join([str(x) for x in other_opts]).strip()

    # Plain space-separated inputs, captured before ``fastq_file`` is rewritten
    # into bismark's "-1/-2" syntax below; handed to ``_process_bam``.
    fastq_files = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    safe_makedir(align_dir)
    cmd = "{bismark} {other_opts} {directional} --bowtie2 --temp_dir {tx_out_dir} --gzip --multicore {n} -o {tx_out_dir} --unmapped {ref_file} {fastq_file}"
    if pair_file:
        fastq_file = "-1 %s -2 %s" % (fastq_file, pair_file)

    raw_bam = glob.glob(out_dir + "/*bismark*bt2*bam")
    if not raw_bam:
        with tx_tmpdir() as tx_out_dir:
            run_message = "Running Bismark aligner on %s and %s" % (fastq_file, ref_file)
            do.run(
                cmd.format(
                    bismark=bismark,
                    other_opts=other_opts,
                    directional=directional,
                    tx_out_dir=tx_out_dir,
                    n=n,
                    ref_file=ref_file,
                    fastq_file=fastq_file,
                ),
                run_message,
                None,
            )
            # Move the results while the temporary directory still exists.
            shutil.move(tx_out_dir, out_dir)
        raw_bam = glob.glob(out_dir + "/*bismark*bt2*bam")

    process_bam = _process_bam(raw_bam[0], fastq_files, sample, dd.get_sam_ref(data), config)
    utils.symlink_plus(process_bam, final_out)
    data = dd.set_work_bam(data, final_out)
    data["bam_report"] = glob.glob(os.path.join(out_dir, "*report.txt"))[0]
    return data
def stringtie_merge(*samples):
    """Merge every sample's assembled GTFs with StringTie.

    The per-sample GTF collections are flattened and filtered with
    ``filter_missing`` before merging.  Settings (reference, annotation GTF,
    core count) are taken from the first sample.  Returns a list of
    ``[sample]`` entries, each carrying the merged GTF.
    """
    per_sample = [dd.get_assembled_gtf(item)
                  for item in dd.sample_data_iterator(samples)]
    to_merge = filter_missing(flatten(per_sample))

    first = samples[0][0]
    merged_gtf = stringtie.merge(
        to_merge,
        dd.get_sam_ref(first),
        dd.get_gtf_file(first),
        dd.get_num_cores(first),
        first,
    )
    return [[dd.set_merged_gtf(item, merged_gtf)]
            for item in dd.sample_data_iterator(samples)]
def cufflinks_merge(*samples):
    """Merge the per-sample Cufflinks assemblies and store the result on each sample.

    Settings (reference, annotation GTF, core count) are taken from the first
    sample.  Returns a list of ``[sample]`` entries whose assembled GTF is the
    merged file.
    """
    to_merge = filter_missing([dd.get_assembled_gtf(data) for data in
                               dd.sample_data_iterator(samples)])
    data = samples[0][0]
    ref_file = dd.get_sam_ref(data)
    gtf_file = dd.get_gtf_file(data)
    num_cores = dd.get_num_cores(data)
    merged_gtf = cufflinks.merge(to_merge, ref_file, gtf_file, num_cores, data)

    # The setter's return value must be kept: elsewhere in this file the
    # ``dd.set_*`` helpers are always used as ``data = dd.set_*(data, ...)``,
    # so discarding it risks returning samples without the merged GTF.
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_assembled_gtf(data, merged_gtf)
        updated_samples.append([data])
    return updated_samples
def cufflinks_merge(*samples):
    """Merge the per-sample Cufflinks assemblies and store the result on each sample.

    Settings (reference, annotation GTF, core count) are taken from the first
    sample.  Returns a list of ``[sample]`` entries, each being the value
    returned by ``dd.set_assembled_gtf`` for the merged GTF.
    """
    to_merge = filter_missing([dd.get_assembled_gtf(item)
                               for item in dd.sample_data_iterator(samples)])
    first = samples[0][0]
    merged_gtf = cufflinks.merge(
        to_merge,
        dd.get_sam_ref(first),
        dd.get_gtf_file(first),
        dd.get_num_cores(first),
        first,
    )
    return [[dd.set_assembled_gtf(item, merged_gtf)]
            for item in dd.sample_data_iterator(samples)]
def _bsmap_calling(data):
    """Run methratio.py for one chromosome of a sample's work BAM.

    The chromosome is read from ``data["chr_to_run"]``.  Output goes to
    ``<work_dir>/cpg_split/<sample>/methyratios_<chrom>.txt`` and is only
    produced when that file does not already exist.  The path is stored in
    ``data["cpg_file"]`` and ``data`` is returned.
    """
    sample_name = dd.get_sample_name(data)
    split_dir = safe_makedir(
        os.path.join(dd.get_work_dir(data), "cpg_split", sample_name))
    config = data["config"]
    reference = dd.get_sam_ref(data)
    bam_path = dd.get_work_bam(data)
    # Use the interpreter that sits next to the one running this process.
    interpreter = os.path.join(os.path.dirname(sys.executable), "python")
    methratio_script = config_utils.get_program("methratio.py", config)
    template = ("{python} {methratio} -g -n -u -p -r -m 5 --chr={chrom} "
                "--ref={ref} {work_bam} >> {out_tx}")
    chromosome = data["chr_to_run"]
    out_file = os.path.join(split_dir, "methyratios_%s.txt" % chromosome)

    if not file_exists(out_file):
        with file_transaction(out_file) as out_tx:
            command = template.format(
                python=interpreter,
                methratio=methratio_script,
                chrom=chromosome,
                ref=reference,
                work_bam=bam_path,
                out_tx=out_tx,
            )
            do.run(command, "Extract methylation for: %s" % sample_name)

    data["cpg_file"] = out_file
    return data
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Align WGBS reads with Bismark and record the BAM and report on ``data``.

    Bismark writes into a temporary directory that is then moved to
    ``<align_dir>/<lane>_bismark``.  The first BAM found there is passed to
    ``_process_bam`` and the result is linked to ``<align_dir>/<sample>.bam``
    via ``utils.symlink_plus``.

    Args:
        fastq_file: path to the first (or only) FASTQ file.
        pair_file: path to the mate FASTQ file, or a falsy value for
            single-end input.
        ref_file: bismark index location; a falsy value aborts the process.
        names: unused by this aligner.
        align_dir: directory that receives the outputs.
        data: sample dictionary; must contain ``"analysis"`` and ``"config"``.

    Returns:
        The sample dictionary with the work BAM set and ``"bam_report"``
        pointing at the first ``*report.txt`` file in the bismark directory.

    Raises:
        AssertionError: if the analysis type does not start with "wgbs-seq".
        SystemExit: if ``ref_file`` is falsy.
        IndexError: if no bismark BAM or no report file is found.
    """
    assert data["analysis"].lower().startswith(
        "wgbs-seq"), "No comparible alignment."
    config = data["config"]
    sample = dd.get_sample_name(data)
    out_dir = os.path.join(align_dir, "%s_bismark" % dd.get_lane(data))

    if not ref_file:
        # The message previously referred to "STAR indexes" (copy-paste from
        # another aligner); it now names the index that is actually missing.
        logger.error(
            "bismark index not found. We don't provide the bismark indexes "
            "by default because they are very large. You can install "
            "the index for your genome with: bcbio_nextgen.py upgrade "
            "--aligners bismark --genomes genome-build-name --data")
        sys.exit(1)

    final_out = os.path.join(align_dir, "{0}.bam".format(sample))
    if file_exists(final_out):
        data = dd.set_work_bam(data, final_out)
        data["bam_report"] = glob.glob(os.path.join(out_dir, "*report.txt"))[0]
        return data

    bismark = config_utils.get_program("bismark", config)
    # Plain space-separated inputs, captured before ``fastq_file`` is rewritten
    # into bismark's "-1/-2" syntax below; handed to ``_process_bam``.
    fastq_files = " ".join([fastq_file, pair_file]) if pair_file else fastq_file
    num_cores = dd.get_num_cores(data)
    # Value passed to bismark's --multicore: 2 once at least 5 cores are
    # available, otherwise 1.
    n = 1 if num_cores < 5 else 2
    safe_makedir(align_dir)
    cmd = "{bismark} --bowtie2 --temp_dir {tx_out_dir} --gzip --multicore {n} -o {tx_out_dir} --unmapped {ref_file} {fastq_file}"
    if pair_file:
        fastq_file = "-1 %s -2 %s" % (fastq_file, pair_file)

    raw_bam = glob.glob(out_dir + "/*bismark*bt2*bam")
    if not raw_bam:
        with tx_tmpdir() as tx_out_dir:
            run_message = "Running Bismark aligner on %s and %s" % (fastq_file, ref_file)
            do.run(
                cmd.format(
                    bismark=bismark,
                    tx_out_dir=tx_out_dir,
                    n=n,
                    ref_file=ref_file,
                    fastq_file=fastq_file,
                ),
                run_message,
                None,
            )
            # Move the results while the temporary directory still exists.
            shutil.move(tx_out_dir, out_dir)
        raw_bam = glob.glob(out_dir + "/*bismark*bt2*bam")

    process_bam = _process_bam(raw_bam[0], fastq_files, sample, dd.get_sam_ref(data), config)
    utils.symlink_plus(process_bam, final_out)
    data = dd.set_work_bam(data, final_out)
    data["bam_report"] = glob.glob(os.path.join(out_dir, "*report.txt"))[0]
    return data