def _bgzip_from_bam(bam_file, dirs, config):
    """Create bgzipped fastq files from an input BAM file.
    """
    # tools
    bamtofastq = config_utils.get_program("bamtofastq", config)
    resources = config_utils.get_resources("bamtofastq", config)
    cores = config["algorithm"].get("num_cores", 1)
    max_mem = int(resources.get("memory", "1073741824")) * cores  # 1Gb/core default
    bgzip = _get_bgzip_cmd(config)
    # files
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file_1 = os.path.join(work_dir, "%s-1.fq.gz" % os.path.splitext(os.path.basename(bam_file))[0])
    if bam.is_paired(bam_file):
        out_file_2 = out_file_1.replace("-1.fq.gz", "-2.fq.gz")
    else:
        out_file_2 = None
    if not utils.file_exists(out_file_1):
        with file_transaction(out_file_1) as tx_out_file:
            fq1_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, tx_out_file)
            sortprefix = "%s-sort" % os.path.splitext(tx_out_file)[0]
            if bam.is_paired(bam_file):
                fq2_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, out_file_2)
                out_str = ("F=>({fq1_bgzip_cmd}) F2=>({fq2_bgzip_cmd}) S=/dev/null O=/dev/null "
                           "O2=/dev/null collate=1 colsbs={max_mem}")
            else:
                out_str = "S=>({fq1_bgzip_cmd})"
            cmd = "{bamtofastq} filename={bam_file} T={sortprefix} " + out_str
            do.run(cmd.format(**locals()), "BAM to bgzipped fastq",
                   checks=[do.file_reasonable_size(tx_out_file, bam_file)])
    return [x for x in [out_file_1, out_file_2] if x is not None]
def _bgzip_from_bam(bam_file, dirs, config, is_retry=False, output_infix=''):
    """Create bgzipped fastq files from an input BAM file.
    """
    # tools
    bamtofastq = config_utils.get_program("bamtofastq", config)
    resources = config_utils.get_resources("bamtofastq", config)
    cores = config["algorithm"].get("num_cores", 1)
    max_mem = config_utils.convert_to_bytes(resources.get("memory", "1G")) * cores
    bgzip = tools.get_bgzip_cmd(config, is_retry)
    # files
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file_1 = os.path.join(work_dir, "%s%s-1.fq.gz" %
                              (os.path.splitext(os.path.basename(bam_file))[0], output_infix))
    out_file_2 = out_file_1.replace("-1.fq.gz", "-2.fq.gz")
    needs_retry = False
    if is_retry or not utils.file_exists(out_file_1):
        if not bam.is_paired(bam_file):
            out_file_2 = None
        with file_transaction(config, out_file_1) as tx_out_file:
            for f in [tx_out_file, out_file_1, out_file_2]:
                if f and os.path.exists(f):
                    os.remove(f)
            fq1_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, tx_out_file)
            sortprefix = "%s-sort" % os.path.splitext(tx_out_file)[0]
            if bam.is_paired(bam_file):
                fq2_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, out_file_2)
                out_str = ("F=>({fq1_bgzip_cmd}) F2=>({fq2_bgzip_cmd}) S=/dev/null O=/dev/null "
                           "O2=/dev/null collate=1 colsbs={max_mem}")
            else:
                out_str = "S=>({fq1_bgzip_cmd})"
            bam_file = objectstore.cl_input(bam_file)
            cmd = "{bamtofastq} filename={bam_file} T={sortprefix} " + out_str
            try:
                do.run(cmd.format(**locals()), "BAM to bgzipped fastq",
                       checks=[do.file_reasonable_size(tx_out_file, bam_file)],
                       log_error=False)
            except subprocess.CalledProcessError as msg:
                if not is_retry and "deflate failed" in str(msg):
                    logger.info("bamtofastq deflate IO failure preparing %s. Retrying with single core."
                                % (bam_file))
                    needs_retry = True
                else:
                    logger.exception()
                    raise
    if needs_retry:
        return _bgzip_from_bam(bam_file, dirs, config, is_retry=True)
    else:
        return [x for x in [out_file_1, out_file_2]
                if x is not None and utils.file_exists(x)]
def _bgzip_from_bam(bam_file, dirs, data, is_retry=False, output_infix=''):
    """Create bgzipped fastq files from an input BAM file.
    """
    # tools
    config = data["config"]
    bamtofastq = config_utils.get_program("bamtofastq", config)
    resources = config_utils.get_resources("bamtofastq", config)
    cores = config["algorithm"].get("num_cores", 1)
    max_mem = config_utils.convert_to_bytes(resources.get("memory", "1G")) * cores
    bgzip = tools.get_bgzip_cmd(config, is_retry)
    # files
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file_1 = os.path.join(work_dir, "%s%s-1.fq.gz" %
                              (os.path.splitext(os.path.basename(bam_file))[0], output_infix))
    out_file_2 = out_file_1.replace("-1.fq.gz", "-2.fq.gz")
    needs_retry = False
    if is_retry or not utils.file_exists(out_file_1):
        if not bam.is_paired(bam_file):
            out_file_2 = None
        with file_transaction(config, out_file_1) as tx_out_file:
            for f in [tx_out_file, out_file_1, out_file_2]:
                if f and os.path.exists(f):
                    os.remove(f)
            fq1_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, tx_out_file)
            prep_cmd = _seqtk_fastq_prep_cl(data, read_num=0)
            if prep_cmd:
                fq1_bgzip_cmd = prep_cmd + " | " + fq1_bgzip_cmd
            sortprefix = "%s-sort" % os.path.splitext(tx_out_file)[0]
            if bam.is_paired(bam_file):
                prep_cmd = _seqtk_fastq_prep_cl(data, read_num=1)
                fq2_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, out_file_2)
                if prep_cmd:
                    fq2_bgzip_cmd = prep_cmd + " | " + fq2_bgzip_cmd
                out_str = ("F=>({fq1_bgzip_cmd}) F2=>({fq2_bgzip_cmd}) S=/dev/null O=/dev/null "
                           "O2=/dev/null collate=1 colsbs={max_mem}")
            else:
                out_str = "S=>({fq1_bgzip_cmd})"
            bam_file = objectstore.cl_input(bam_file)
            extra_opts = " ".join([str(x) for x in resources.get("options", [])])
            cmd = "{bamtofastq} filename={bam_file} T={sortprefix} {extra_opts} " + out_str
            try:
                do.run(cmd.format(**locals()), "BAM to bgzipped fastq",
                       checks=[do.file_reasonable_size(tx_out_file, bam_file)],
                       log_error=False)
            except subprocess.CalledProcessError as msg:
                if not is_retry and "deflate failed" in str(msg):
                    logger.info("bamtofastq deflate IO failure preparing %s. Retrying with single core."
                                % (bam_file))
                    needs_retry = True
                else:
                    logger.exception()
                    raise
    if needs_retry:
        return _bgzip_from_bam(bam_file, dirs, data, is_retry=True)
    else:
        return [x for x in [out_file_1, out_file_2]
                if x is not None and utils.file_exists(x)]
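# A hedged usage sketch for _bgzip_from_bam above. The dirs layout, config
# keys and paths are illustrative assumptions, not part of the function's
# contract beyond what the code actually reads:
#
#   example_data = {"config": {"algorithm": {"num_cores": 4},
#                              "resources": {"bamtofastq": {"memory": "1G"}}}}
#   example_dirs = {"work": "/path/to/work"}  # hypothetical work directory
#   # writes <work>/align_prep/<sample>-1.fq.gz (and -2.fq.gz when paired)
#   fastqs = _bgzip_from_bam("/path/to/sample.bam", example_dirs, example_data)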
def run(bam_file, data, out_dir):
    config = data["config"]
    if "picard" not in dd.get_tools_on(data):
        return {}
    ref_file = dd.get_ref_file(data)
    sample = dd.get_sample_name(data)
    target_file = dd.get_variant_regions(data)
    broad_runner = broad.PicardCmdRunner("picard", config)
    bam_fname = os.path.abspath(bam_file)
    path = os.path.dirname(bam_fname)
    utils.safe_makedir(out_dir)
    hsmetric_file = os.path.join(out_dir, "%s-sort.hs_metrics" % sample)
    hsinsert_file = os.path.join(out_dir, "%s-sort.insert_metrics" % sample)
    if utils.file_exists(hsmetric_file):
        return hsmetric_file
    with utils.chdir(out_dir):
        with tx_tmpdir() as tmp_dir:
            cur_bam = os.path.basename(bam_fname)
            if not os.path.exists(cur_bam):
                os.symlink(bam_fname, cur_bam)
            gen_metrics = PicardMetrics(broad_runner, tmp_dir)
            gen_metrics.report(cur_bam, ref_file,
                               bam.is_paired(bam_fname),
                               target_file, target_file, None, config)
    do.run("sed -i 's/-sort.bam//g' %s" % hsmetric_file, "")
    do.run("sed -i 's/-sort.bam//g' %s" % hsinsert_file, "")
    return hsmetric_file
def run(name, chip_bam, input_bam, genome_build, out_dir, method, resources, data):
    """
    Run macs2 for chip and input samples avoiding errors due to samples.
    """
    # the output file name needs to include the caller name
    config = dd.get_config(data)
    out_file = os.path.join(out_dir, name + "_peaks_macs2.xls")
    macs2_file = os.path.join(out_dir, name + "_peaks.xls")
    if utils.file_exists(out_file):
        _compres_bdg_files(out_dir)
        return _get_output_files(out_dir)
    macs2 = config_utils.get_program("macs2", config)
    options = " ".join(resources.get("macs2", {}).get("options", ""))
    genome_size = bam.fasta.total_sequence_length(dd.get_ref_file(data))
    genome_size = "" if options.find("-g") > -1 else "-g %s" % genome_size
    paired = "-f BAMPE" if bam.is_paired(chip_bam) else ""
    with utils.chdir(out_dir):
        cmd = _macs2_cmd(method)
        try:
            do.run(cmd.format(**locals()), "macs2 for %s" % name)
            utils.move_safe(macs2_file, out_file)
        except subprocess.CalledProcessError:
            raise RuntimeWarning("macs2 terminated with an error.\n"
                                 "Please, check the message and report "
                                 "error if it is related to bcbio.\n"
                                 "You can add specific options for the sample "
                                 "setting resources as explained in docs: "
                                 "https://bcbio-nextgen.readthedocs.org/en/latest/contents/configuration.html#sample-specific-resources")
    _compres_bdg_files(out_dir)
    return _get_output_files(out_dir)
def run(bam_file, data, out_dir):
    if "picard" not in dd.get_tools_on(data):
        return {}
    ref_file = dd.get_ref_file(data)
    sample = dd.get_sample_name(data)
    target_file = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    broad_runner = broad.PicardCmdRunner("picard", data["config"])
    bam_fname = os.path.abspath(bam_file)
    path = os.path.dirname(bam_fname)
    utils.safe_makedir(out_dir)
    out_base = utils.splitext_plus(os.path.basename(bam_fname))[0]
    hsmetric_file = os.path.join(out_dir, "%s.hs_metrics" % out_base)
    hsinsert_file = os.path.join(out_dir, "%s.insert_metrics" % out_base)
    if not utils.file_exists(hsmetric_file) and not utils.file_exists(hsinsert_file):
        with utils.chdir(out_dir):
            with tx_tmpdir() as tmp_dir:
                cur_bam = os.path.basename(bam_fname)
                if not os.path.exists(cur_bam):
                    os.symlink(bam_fname, cur_bam)
                gen_metrics = PicardMetrics(broad_runner, tmp_dir)
                gen_metrics.report(cur_bam, ref_file,
                                   bam.is_paired(bam_fname),
                                   target_file, target_file, None, data["config"])
        if utils.file_exists(hsmetric_file):
            do.run("sed -i 's/%s.bam//g' %s" % (out_base.replace(sample, ""), hsmetric_file), "")
        if utils.file_exists(hsinsert_file):
            do.run("sed -i 's/%s.bam//g' %s" % (out_base.replace(sample, ""), hsinsert_file), "")
    return hsmetric_file
def to_sdf(files, data):
    """Convert a fastq or BAM input into a SDF indexed file.
    """
    # BAM
    if len(files) == 1 and files[0].endswith(".bam"):
        qual = []
        format = ["-f", "sam-pe" if bam.is_paired(files[0]) else "sam-se"]
        inputs = [files[0]]
    # fastq
    else:
        qual = ["-q", "illumina" if dd.get_quality_format(data).lower() == "illumina" else "sanger"]
        format = ["-f", "fastq"]
        if len(files) == 2:
            inputs = ["-l", files[0], "-r", files[1]]
        else:
            assert len(files) == 1
            inputs = [files[0]]
    work_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "align_prep"))
    out_file = os.path.join(work_dir, "%s.sdf" %
                            utils.splitext_plus(os.path.basename(os.path.commonprefix(files)))[0])
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            cmd = _rtg_cmd(["rtg", "format", "-o", tx_out_file] + format + qual + inputs)
            do.run(cmd, "Format inputs to indexed SDF")
    return out_file
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted, deduplicated BAM.

    TODO: Use streaming with new development version of SNAP to feed into
    structural variation preparation de-duplication.
    """
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    assert not data.get("align_split"), "Split alignments not supported with SNAP"
    snap = config_utils.get_program("snap", data["config"])
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    resources = config_utils.get_resources("snap", data["config"])
    rg_info = novoalign.get_rg_info(names)
    is_paired = bam.is_paired(fastq_file) if fastq_file.endswith(".bam") else pair_file
    if not utils.file_exists(out_file):
        with postalign.tobam_cl(data, out_file, is_paired) as (tobam_cl, tx_out_file):
            cmd_name = "paired" if is_paired else "single"
            cmd = ("{snap} {cmd_name} {index_dir} {fastq_file} {pair_file} "
                   "-R '{rg_info}' -t {num_cores} -M -o -sam - | ")
            do.run(cmd.format(**locals()) + tobam_cl,
                   "SNAP alignment: %s" % names["sample"])
    data["work_bam"] = out_file
    return data
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted, deduplicated BAM.

    TODO: Use streaming with new development version of SNAP to feed into
    structural variation preparation de-duplication.
    """
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    assert not data.get("align_split"), "Split alignments not supported with SNAP"
    snap = config_utils.get_program("snap", data["config"])
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    resources = config_utils.get_resources("snap", data["config"])
    max_mem = resources.get("memory", "1G")
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            with utils.curdir_tmpdir(data) as work_dir:
                if fastq_file.endswith(".bam"):
                    cmd_name = "paired" if bam.is_paired(fastq_file) else "single"
                else:
                    cmd_name = "single" if not pair_file else "paired"
                cmd = ("{snap} {cmd_name} {index_dir} {fastq_file} {pair_file} "
                       "-rg '{rg_info}' -t {num_cores} -sa -so -sm {max_mem} -o {tx_out_file}")
                do.run(cmd.format(**locals()), "SNAP alignment: %s" % names["sample"])
    data["work_bam"] = out_file
    return data
def run_rnaseq(bam_file, data, out_dir):
    """
    Run qualimap for a rnaseq bam file and parse results
    """
    strandedness = {"firststrand": "strand-specific-reverse",
                    "secondstrand": "strand-specific-forward",
                    "unstranded": "non-strand-specific"}
    report_file = os.path.join(out_dir, "qualimapReport.html")
    raw_file = os.path.join(out_dir, "rnaseq_qc_results.txt")
    config = data["config"]
    gtf_file = dd.get_gtf_file(data)
    single_end = not bam.is_paired(bam_file)
    library = strandedness[dd.get_strandedness(data)]
    if not utils.file_exists(report_file):
        utils.safe_makedir(out_dir)
        bam.index(bam_file, config)
        cmd = _rnaseq_qualimap_cmd(data, bam_file, out_dir, gtf_file, single_end, library)
        do.run(cmd, "Qualimap for {}".format(dd.get_sample_name(data)))
        cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (dd.get_sample_name(data), raw_file)
        do.run(cmd, "Fix Name Qualimap for {}".format(dd.get_sample_name(data)))
    metrics = _parse_rnaseq_qualimap_metrics(report_file)
    metrics.update(_detect_duplicates(bam_file, out_dir, data))
    metrics.update(_detect_rRNA(data))
    metrics.update({"Average insert size": bam.estimate_fragment_size(bam_file)})
    metrics = _parse_metrics(metrics)
    return metrics
def run(name, chip_bam, input_bam, genome_build, out_dir, method, config):
    """
    Run macs2 for chip and input samples avoiding errors due to samples.
    """
    # the output file name needs to include the caller name
    out_file = os.path.join(out_dir, name + "_peaks_macs2.xls")
    macs2_file = os.path.join(out_dir, name + "_peaks.xls")
    if utils.file_exists(out_file):
        return out_file
    macs2 = config_utils.get_program("macs2", config)
    options = " ".join(config_utils.get_resources("macs2", config).get("options", ""))
    if genome_build not in HS and options.find("-g") == -1:
        raise ValueError("The %s genome doesn't have a pre-set value. "
                         "You can add specific values using resources "
                         "option for macs2 in the YAML file (-g genome_size). "
                         "Check ChIP-seq configuration in "
                         "bcbio-nextgen documentation." % genome_build)
    genome_size = "" if options.find("-g") > -1 else "-g %s" % HS[genome_build]
    paired = "-f BAMPE" if bam.is_paired(chip_bam) else ""
    with utils.chdir(out_dir):
        cmd = _macs2_cmd(method)
        try:
            do.run(cmd.format(**locals()), "macs2 for %s" % name)
            utils.move_safe(macs2_file, out_file)
        except subprocess.CalledProcessError:
            raise RuntimeWarning("macs2 terminated with an error.\n"
                                 "Please, check the message and report "
                                 "error if it is related to bcbio.\n"
                                 "You can add specific options for the sample "
                                 "setting resources as explained in docs: "
                                 "https://bcbio-nextgen.readthedocs.org/en/latest/contents/configuration.html#sample-specific-resources")
    return out_file
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Perform direct alignment of an input BAM file with BWA using pipes.

    This avoids disk IO by piping between processes:
     - samtools sort of input BAM to queryname
     - bedtools conversion to interleaved FASTQ
     - bwa-mem alignment
     - samtools conversion to BAM
     - samtools sort to coordinate
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bedtools = config_utils.get_program("bedtools", config)
    bwa = config_utils.get_program("bwa", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used for input and output
    max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), 3, "decrease")
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file):
        with utils.curdir_tmpdir() as work_dir:
            with postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                cmd = ("{samtools} sort -n -o -l 0 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                       "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                       "| {bwa} mem -p -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} - | ")
                cmd = cmd.format(**locals()) + tobam_cl
                do.run(cmd, "bwa mem alignment from BAM: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, in_bam)])
    return out_file
def rsem_calculate_expression(bam_file, rsem_genome_dir, samplename, build, out_dir, cores=1):
    """
    works only in unstranded mode for now (--forward-prob 0.5)
    """
    if not which("rsem-calculate-expression"):
        logger.info("Skipping RSEM because rsem-calculate-expression could "
                    "not be found.")
        return None
    sentinel_file = os.path.join(out_dir, samplename + "Test.genes.results")
    if file_exists(sentinel_file):
        return out_dir
    paired_flag = "--paired" if bam.is_paired(bam_file) else ""
    core_flag = "-p {cores}".format(cores=cores)
    cmd = ("rsem-calculate-expression --bam {core_flag} {paired_flag} --no-bam-output "
           "--forward-prob 0.5 --estimate-rspd {bam_file} {rsem_genome_dir}/{build} "
           "{samplename}")
    message = "Calculating transcript expression of {bam_file} using RSEM."
    with file_transaction(out_dir) as tx_out_dir:
        safe_makedir(tx_out_dir)
        with chdir(tx_out_dir):
            do.run(cmd.format(**locals()), message.format(**locals()))
    return out_dir
def create_peaktable(samples):
    """create a table of peak counts per sample to use with differential peak calling
    """
    data = dd.get_data_from_sample(samples[0])
    peakcounts = []
    out_dir = os.path.join(dd.get_work_dir(data), "consensus")
    out_file = os.path.join(out_dir, "consensus-counts.tsv")
    if dd.get_chip_method(data) == "chip":
        for data in dd.sample_data_iterator(samples):
            peakcounts.append(tz.get_in(("peak_counts",), data))
    elif dd.get_chip_method(data) == "atac":
        for data in dd.sample_data_iterator(samples):
            if bam.is_paired(dd.get_work_bam(data)):
                peakcounts.append(tz.get_in(("peak_counts", "NF"), data))
            else:
                logger.info(f"Creating peak table from full BAM file because "
                            f"{dd.get_work_bam(data)} is single-ended.")
                peakcounts.append(tz.get_in(("peak_counts", "full"), data))
    combined_peaks = count.combine_count_files(peakcounts, out_file, ext=".counts")
    new_data = []
    for data in dd.sample_data_iterator(samples):
        data = tz.assoc_in(data, ("peak_counts", "peaktable"), combined_peaks)
        new_data.append(data)
    new_samples = dd.get_samples_from_datalist(new_data)
    return new_samples
def rsem_calculate_expression(bam_file, rsem_genome_dir, samplename, build, out_dir, cores=1):
    """
    works only in unstranded mode for now (--forward-prob 0.5)
    """
    if not utils.which("rsem-calculate-expression"):
        logger.info("Skipping RSEM because rsem-calculate-expression could "
                    "not be found.")
        return None
    sentinel_file = os.path.join(out_dir, samplename + "Test.genes.results")
    if utils.file_exists(sentinel_file):
        return out_dir
    paired_flag = "--paired" if bam.is_paired(bam_file) else ""
    core_flag = "-p {cores}".format(cores=cores)
    command = CALCULATE_EXP.format(core_flag=core_flag, paired_flag=paired_flag,
                                   bam_file=bam_file, rsem_genome_dir=rsem_genome_dir,
                                   build=build, samplename=samplename)
    message = "Calculating transcript expression of {bam_file} using RSEM."
    with transaction.file_transaction(out_dir) as tx_out_dir:
        utils.safe_makedir(tx_out_dir)
        with utils.chdir(tx_out_dir):
            do.run(command, message.format(bam_file=bam_file))
    return out_dir
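# CALCULATE_EXP is referenced above but not defined in this section. A
# minimal sketch, reconstructed from the inline rsem-calculate-expression
# command in the earlier variant of this function above (unstranded mode,
# --forward-prob 0.5); treat the exact option set as an assumption:
CALCULATE_EXP = (
    "rsem-calculate-expression --bam {core_flag} {paired_flag} --no-bam-output "
    "--forward-prob 0.5 --estimate-rspd {bam_file} {rsem_genome_dir}/{build} "
    "{samplename}")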
def filter_multimappers(align_file, data):
    """
    It does not seem like bowtie2 has a corollary to the -m 1 flag in bowtie,
    there are some options that are close but don't do the same thing. Bowtie2
    sets the XS flag for reads mapping in more than one place, so we can just
    filter on that. This will not work for other aligners.
    """
    config = dd.get_config(data)
    type_flag = "" if bam.is_bam(align_file) else "S"
    base, ext = os.path.splitext(align_file)
    out_file = base + ".unique" + ext
    bed_file = dd.get_variant_regions(data)
    bed_cmd = '-L {0}'.format(bed_file) if bed_file else " "
    if utils.file_exists(out_file):
        return out_file
    base_filter = '-F "[XS] == null and not unmapped {paired_filter} and not duplicate" '
    if bam.is_paired(align_file):
        paired_filter = "and paired and proper_pair"
    else:
        paired_filter = ""
    filter_string = base_filter.format(paired_filter=paired_filter)
    sambamba = config_utils.get_program("sambamba", config)
    num_cores = dd.get_num_cores(data)
    with file_transaction(out_file) as tx_out_file:
        cmd = ('{sambamba} view -h{type_flag} '
               '--nthreads {num_cores} '
               '-f bam {bed_cmd} '
               '{filter_string} '
               '{align_file} '
               '> {tx_out_file}')
        message = "Removing multimapped reads from %s." % align_file
        do.run(cmd.format(**locals()), message)
    bam.index(out_file, config)
    return out_file
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Perform realignment of input BAM file; uses unix pipes to avoid IO.
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    novoalign = config_utils.get_program("novoalign", config)
    samtools = config_utils.get_program("samtools", config)
    resources = config_utils.get_resources("novoalign", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "4G").upper()
    extra_novo_args = " ".join(_novoalign_args_from_config(config, False))
    if not file_exists(out_file):
        with utils.curdir_tmpdir(data, base_dir=align_dir) as work_dir:
            with postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
                rg_info = get_rg_info(names)
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                cmd = ("{samtools} sort -n -o -l 0 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                       "| {novoalign} -o SAM '{rg_info}' -d {ref_file} -f /dev/stdin "
                       " -F BAMPE -c {num_cores} {extra_novo_args} | ")
                cmd = cmd.format(**locals()) + tobam_cl
                do.run(cmd, "Novoalign: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, in_bam)])
    return out_file
def _convert_bam_to_fastq(in_file, work_dir, data, dirs, config):
    """Convert BAM input file into FASTQ files.
    """
    out_dir = safe_makedir(os.path.join(work_dir, "fastq_convert"))
    qual_bin_method = config["algorithm"].get("quality_bin")
    if (qual_bin_method == "prealignment" or
            (isinstance(qual_bin_method, list) and "prealignment" in qual_bin_method)):
        out_bindir = safe_makedir(os.path.join(out_dir, "qualbin"))
        in_file = cram.illumina_qual_bin(in_file, data["sam_ref"], out_bindir, config)
    out_files = [os.path.join(out_dir, "{0}_{1}.fastq".format(
        os.path.splitext(os.path.basename(in_file))[0], x))
        for x in ["1", "2"]]
    if bam.is_paired(in_file):
        out1, out2 = out_files
    else:
        out1 = out_files[0]
        out2 = None
    if not file_exists(out1):
        broad_runner = broad.runner_from_path("picard", config)
        broad_runner.run_fn("picard_bam_to_fastq", in_file, out1, out2)
    if out2 and os.path.getsize(out2) == 0:
        out2 = None
    return [out1, out2]
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Perform direct alignment of an input BAM file with BWA using pipes.

    This avoids disk IO by piping between processes:
     - samtools sort of input BAM to queryname
     - bedtools conversion to interleaved FASTQ
     - bwa-mem alignment
     - samtools conversion to BAM
     - samtools sort to coordinate
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bedtools = config_utils.get_program("bedtools", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used for input and output
    max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), 3, "decrease").upper()
    if not utils.file_exists(out_file):
        with tx_tmpdir(data) as work_dir:
            with postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
                bwa_cmd = _get_bwa_mem_cmd(data, out_file, ref_file, "-")
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                cmd = ("{samtools} sort -n -o -l 1 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                       "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                       "| {bwa_cmd} | ")
                cmd = cmd.format(**locals()) + tobam_cl
                do.run(cmd, "bwa mem alignment from BAM: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, in_bam)])
    return out_file
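# _get_bwa_mem_cmd is referenced above but defined elsewhere. A minimal
# sketch consistent with the inline bwa invocation in the older variant of
# align_bam above (interleaved input via -p, extra mappings marked as
# secondary via -M); the real helper's signature and any extra options are
# assumptions:
def _get_bwa_mem_cmd_sketch(data, out_file, ref_file, fastq_file, rg_info=""):
    bwa = config_utils.get_program("bwa", data["config"])
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    return ("{bwa} mem -p -M -t {num_cores} -R '{rg_info}' -v 1 "
            "{ref_file} {fastq_file}").format(**locals())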
def run(name, chip_bam, input_bam, genome_build, out_dir, method, resources, data):
    """
    Run macs2 for chip and input samples avoiding errors due to samples.
    """
    # the output file name needs to include the caller name
    config = dd.get_config(data)
    out_file = os.path.join(out_dir, name + "_peaks_macs2.xls")
    macs2_file = os.path.join(out_dir, name + "_peaks.xls")
    if utils.file_exists(out_file):
        _compres_bdg_files(out_dir)
        return _get_output_files(out_dir)
    macs2 = config_utils.get_program("macs2", config)
    options = " ".join(resources.get("macs2", {}).get("options", ""))
    genome_size = HS.get(genome_build, bam.fasta.total_sequence_length(dd.get_ref_file(data)))
    genome_size = "" if options.find("-g") > -1 else "-g %s" % genome_size
    paired = "-f BAMPE" if bam.is_paired(chip_bam) else ""
    with utils.chdir(out_dir):
        cmd = _macs2_cmd(method)
        try:
            do.run(cmd.format(**locals()), "macs2 for %s" % name)
            utils.move_safe(macs2_file, out_file)
        except subprocess.CalledProcessError:
            raise RuntimeWarning("macs2 terminated with an error.\n"
                                 "Please, check the message and report "
                                 "error if it is related to bcbio.\n"
                                 "You can add specific options for the sample "
                                 "setting resources as explained in docs: "
                                 "https://bcbio-nextgen.readthedocs.org/en/latest/contents/configuration.html#sample-specific-resources")
    _compres_bdg_files(out_dir)
    return _get_output_files(out_dir)
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Perform realignment of input BAM file; uses unix pipes to avoid IO.
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    novoalign = config_utils.get_program("novoalign", config)
    samtools = config_utils.get_program("samtools", config)
    resources = config_utils.get_resources("novoalign", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "4G").upper()
    extra_novo_args = " ".join(_novoalign_args_from_config(config, False))
    if not file_exists(out_file):
        with tx_tmpdir(data, base_dir=align_dir) as work_dir:
            with postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
                rg_info = get_rg_info(names)
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                cmd = ("{samtools} sort -n -o -l 1 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                       "| {novoalign} -o SAM '{rg_info}' -d {ref_file} -f /dev/stdin "
                       " -F BAMPE -c {num_cores} {extra_novo_args} | ")
                cmd = (cmd + tobam_cl).format(**locals())
                do.run(cmd, "Novoalign: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file), do.file_reasonable_size(tx_out_file, in_bam)])
    return out_file
def filter_multimappers(align_file, data):
    """
    Filtering a BWA alignment file for uniquely mapped reads, from here:
    https://bioinformatics.stackexchange.com/questions/508/obtaining-uniquely-mapped-reads-from-bwa-mem-alignment
    """
    config = dd.get_config(data)
    type_flag = "" if bam.is_bam(align_file) else "S"
    base, ext = os.path.splitext(align_file)
    out_file = base + ".unique" + ext
    bed_file = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    bed_cmd = '-L {0}'.format(bed_file) if bed_file else " "
    if utils.file_exists(out_file):
        return out_file
    base_filter = '-F "not unmapped {paired_filter} and not duplicate and [XA] == null and [SA] == null and not supplementary " '
    if bam.is_paired(align_file):
        paired_filter = "and paired and proper_pair"
    else:
        paired_filter = ""
    filter_string = base_filter.format(paired_filter=paired_filter)
    sambamba = config_utils.get_program("sambamba", config)
    num_cores = dd.get_num_cores(data)
    with file_transaction(out_file) as tx_out_file:
        cmd = ('{sambamba} view -h{type_flag} '
               '--nthreads {num_cores} '
               '-f bam {bed_cmd} '
               '{filter_string} '
               '{align_file} '
               '> {tx_out_file}')
        message = "Removing multimapped reads from %s." % align_file
        do.run(cmd.format(**locals()), message)
    bam.index(out_file, config)
    return out_file
def _convert_bam_to_fastq(in_file, work_dir, data, dirs, config):
    """Convert BAM input file into FASTQ files.
    """
    out_dir = safe_makedir(os.path.join(work_dir, "fastq_convert"))
    qual_bin_method = config["algorithm"].get("quality_bin")
    if (qual_bin_method == "prealignment" or
            (isinstance(qual_bin_method, list) and "prealignment" in qual_bin_method)):
        out_bindir = safe_makedir(os.path.join(out_dir, "qualbin"))
        in_file = cram.illumina_qual_bin(in_file, data["sam_ref"], out_bindir, config)
    out_files = [os.path.join(out_dir, "{0}_{1}.fastq".format(
        os.path.splitext(os.path.basename(in_file))[0], x))
        for x in ["1", "2"]]
    if bam.is_paired(in_file):
        out1, out2 = out_files
    else:
        out1 = out_files[0]
        out2 = None
    if not file_exists(out1):
        broad_runner = broad.runner_from_config(config)
        broad_runner.run_fn("picard_bam_to_fastq", in_file, out1, out2)
    if out2 and os.path.getsize(out2) == 0:
        out2 = None
    return [out1, out2]
def calculate_complexity_metrics(work_bam, data):
    """
    the work_bam should have duplicates marked but not removed
    mitochondrial reads should be removed
    """
    bedtools = config_utils.get_program("bedtools", dd.get_config(data))
    work_dir = dd.get_work_dir(data)
    metrics_dir = os.path.join(work_dir, "metrics", "atac")
    utils.safe_makedir(metrics_dir)
    metrics_file = os.path.join(metrics_dir, f"{dd.get_sample_name(data)}-atac-metrics.csv")
    # complexity metrics only make sense for paired-end reads
    if not bam.is_paired(work_bam):
        return data
    if utils.file_exists(metrics_file):
        data = tz.assoc_in(data, ['atac', 'complexity_metrics_file'], metrics_file)
        return data
    # BAM file must be sorted by read name
    work_bam = bam.sort(work_bam, dd.get_config(data), order="queryname")
    with file_transaction(metrics_file) as tx_metrics_file:
        with open(tx_metrics_file, "w") as out_handle:
            out_handle.write("mt,m0,m1,m2\n")
        cmd = (f"{bedtools} bamtobed -bedpe -i {work_bam} | "
               "awk 'BEGIN{OFS=\"\\t\"}{print $1,$2,$4,$6,$9,$10}' | "
               "sort | "
               "uniq -c | "
               "awk 'BEGIN{mt=0;m0=0;m1=0;m2=0}($1==1){m1=m1+1} "
               "($1==2){m2=m2+1}{m0=m0+1}{mt=mt+$1}END{printf \"%d,%d,%d,%d\\n\", mt,m0,m1,m2}' >> "
               f"{tx_metrics_file}")
        message = f"Calculating ATAC-seq complexity metrics on {work_bam}, saving as {metrics_file}."
        do.run(cmd, message)
    data = tz.assoc_in(data, ['atac', 'complexity_metrics_file'], metrics_file)
    return data
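# The mt,m0,m1,m2 counts written above map onto the ENCODE library
# complexity metrics: mt = total read pairs, m0 = distinct pair positions,
# m1/m2 = positions seen exactly once/twice. A small helper (hypothetical
# name, not part of the pipeline) to turn the CSV into NRF/PBC1/PBC2:
import csv

def _complexity_metrics_from_csv(metrics_file):
    """Compute NRF = m0/mt, PBC1 = m1/m0 and PBC2 = m1/m2 from the CSV above."""
    with open(metrics_file) as in_handle:
        row = next(csv.DictReader(in_handle))
    mt, m0, m1, m2 = (int(row[x]) for x in ("mt", "m0", "m1", "m2"))
    return {"NRF": m0 / mt if mt else 0,
            "PBC1": m1 / m0 if m0 else 0,
            "PBC2": m1 / m2 if m2 else 0}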
def run(bam_file, data, out_dir):
    config = data["config"]
    if "picard" not in dd.get_tools_on(data):
        return {}
    ref_file = dd.get_ref_file(data)
    sample = dd.get_sample_name(data)
    target_file = dd.get_variant_regions(data)
    broad_runner = broad.PicardCmdRunner("picard", config)
    bam_fname = os.path.abspath(bam_file)
    path = os.path.dirname(bam_fname)
    utils.safe_makedir(out_dir)
    hsmetric_file = os.path.join(out_dir, "%s-sort.hs_metrics" % sample)
    if utils.file_exists(hsmetric_file):
        return hsmetric_file
    with utils.chdir(out_dir):
        with tx_tmpdir() as tmp_dir:
            cur_bam = os.path.basename(bam_fname)
            if not os.path.exists(cur_bam):
                os.symlink(bam_fname, cur_bam)
            gen_metrics = PicardMetrics(broad_runner, tmp_dir)
            gen_metrics.report(cur_bam, ref_file,
                               bam.is_paired(bam_fname),
                               target_file, target_file, None, config)
    do.run("sed -i 's/-sort.bam//g' %s" % hsmetric_file, "")
    return hsmetric_file
def run_count(bam_file, dexseq_gff, stranded, out_file, data):
    """
    run dexseq_count on a BAM file
    """
    assert file_exists(bam_file), "%s does not exist." % bam_file
    sort_order = bam._get_sort_order(bam_file, {})
    assert sort_order, "Cannot determine sort order of %s." % bam_file
    strand_flag = _strand_flag(stranded)
    assert strand_flag, "%s is not a valid strandedness value." % stranded
    if not file_exists(dexseq_gff):
        logger.info("%s was not found, so exon-level counting is being "
                    "skipped." % dexseq_gff)
        return None
    dexseq_count = _dexseq_count_path()
    if not dexseq_count:
        logger.info("DEXseq is not installed, skipping exon-level counting.")
        return None
    sort_flag = "name" if sort_order == "queryname" else "pos"
    is_paired = bam.is_paired(bam_file)
    paired_flag = "yes" if is_paired else "no"
    bcbio_python = sys.executable
    if file_exists(out_file):
        return out_file
    cmd = ("{bcbio_python} {dexseq_count} -f bam -r {sort_flag} -p {paired_flag} "
           "-s {strand_flag} {dexseq_gff} {bam_file} {tx_out_file}")
    message = "Counting exon-level counts with %s and %s." % (bam_file, dexseq_gff)
    with file_transaction(data, out_file) as tx_out_file:
        do.run(cmd.format(**locals()), message)
    return out_file
def run_count(bam_file, dexseq_gff, stranded, out_file, data):
    """
    run dexseq_count on a BAM file
    """
    assert file_exists(bam_file), "%s does not exist." % bam_file
    sort_order = bam._get_sort_order(bam_file, {})
    assert sort_order, "Cannot determine sort order of %s." % bam_file
    strand_flag = _strand_flag(stranded)
    assert strand_flag, "%s is not a valid strandedness value." % stranded
    if not dexseq_gff:
        logger.info("No DEXSeq GFF file was found, skipping exon-level counting.")
        return None
    elif not file_exists(dexseq_gff):
        logger.info("%s was not found, so exon-level counting is being "
                    "skipped." % dexseq_gff)
        return None
    dexseq_count = _dexseq_count_path()
    if not dexseq_count:
        logger.info("DEXseq is not installed, skipping exon-level counting.")
        return None
    sort_flag = "name" if sort_order == "queryname" else "pos"
    is_paired = bam.is_paired(bam_file)
    paired_flag = "yes" if is_paired else "no"
    bcbio_python = sys.executable
    if file_exists(out_file):
        return out_file
    cmd = ("{bcbio_python} {dexseq_count} -f bam -r {sort_flag} -p {paired_flag} "
           "-s {strand_flag} {dexseq_gff} {bam_file} {tx_out_file}")
    message = "Counting exon-level counts with %s and %s." % (bam_file, dexseq_gff)
    with file_transaction(data, out_file) as tx_out_file:
        do.run(cmd.format(**locals()), message)
    return out_file
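# _strand_flag is referenced above but not shown in this section. The
# dexseq_count.py script takes -s yes/no/reverse, so a minimal sketch
# (the mapping from bcbio's strandedness vocabulary is an assumption,
# and the falsy None return matches the assert usage above) would be:
def _strand_flag_sketch(stranded):
    return {"unstranded": "no",
            "firststrand": "reverse",
            "secondstrand": "yes"}.get(stranded)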
def filter_multimappers(align_file, data):
    """
    It does not seem like bowtie2 has a corollary to the -m 1 flag in bowtie,
    there are some options that are close but don't do the same thing. Bowtie2
    sets the XS flag for reads mapping in more than one place, so we can just
    filter on that. This will not work for other aligners.
    """
    config = dd.get_config(data)
    type_flag = "" if bam.is_bam(align_file) else "S"
    base, ext = os.path.splitext(align_file)
    out_file = base + ".unique" + ext
    if file_exists(out_file):
        return out_file
    base_filter = '-F "[XS] == null and not unmapped {paired_filter}"'
    if bam.is_paired(align_file):
        paired_filter = "and paired and proper_pair"
    else:
        paired_filter = ""
    filter_string = base_filter.format(paired_filter=paired_filter)
    sambamba = config_utils.get_program("sambamba", config)
    num_cores = dd.get_num_cores(data)
    with file_transaction(out_file) as tx_out_file:
        cmd = ('{sambamba} view -h{type_flag} '
               '--nthreads {num_cores} '
               '-f bam '
               '{filter_string} '
               '{align_file} '
               '> {tx_out_file}')
        message = "Removing multimapped reads from %s." % align_file
        do.run(cmd.format(**locals()), message)
    return out_file
def filter_multimappers(align_file, data):
    """
    Filtering a BWA alignment file for uniquely mapped reads, from here:
    https://bioinformatics.stackexchange.com/questions/508/obtaining-uniquely-mapped-reads-from-bwa-mem-alignment
    """
    config = dd.get_config(data)
    type_flag = "" if bam.is_bam(align_file) else "S"
    base, ext = os.path.splitext(align_file)
    out_file = base + ".unique" + ext
    bed_file = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    bed_cmd = '-L {0}'.format(bed_file) if bed_file else " "
    if utils.file_exists(out_file):
        return out_file
    base_filter = '-F "not unmapped {paired_filter} and [XA] == null and [SA] == null and not supplementary " '
    if bam.is_paired(align_file):
        paired_filter = "and paired and proper_pair"
    else:
        paired_filter = ""
    filter_string = base_filter.format(paired_filter=paired_filter)
    sambamba = config_utils.get_program("sambamba", config)
    num_cores = dd.get_num_cores(data)
    with file_transaction(out_file) as tx_out_file:
        cmd = ('{sambamba} view -h{type_flag} '
               '--nthreads {num_cores} '
               '-f bam {bed_cmd} '
               '{filter_string} '
               '{align_file} '
               '> {tx_out_file}')
        message = "Removing multimapped reads from %s." % align_file
        do.run(cmd.format(**locals()), message)
    bam.index(out_file, config)
    return out_file
def run_rnaseq(bam_file, data, out_dir):
    """
    Run qualimap for a rnaseq bam file and parse results
    """
    strandedness = {"firststrand": "strand-specific-reverse",
                    "secondstrand": "strand-specific-forward",
                    "unstranded": "non-strand-specific"}
    # Qualimap results should be saved to a directory named after sample.
    # MultiQC (for parsing additional data) picks the sample name after the dir as follows:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    report_file = os.path.join(results_dir, "qualimapReport.html")
    config = data["config"]
    gtf_file = dd.get_gtf_file(data)
    single_end = not bam.is_paired(bam_file)
    library = strandedness[dd.get_strandedness(data)]
    if not utils.file_exists(report_file):
        with file_transaction(data, results_dir) as tx_out_dir:
            utils.safe_makedir(tx_out_dir)
            raw_file = os.path.join(tx_out_dir, "rnaseq_qc_results.txt")
            bam.index(bam_file, config)
            cmd = _rnaseq_qualimap_cmd(data, bam_file, tx_out_dir, gtf_file, single_end, library)
            do.run(cmd, "Qualimap for {}".format(dd.get_sample_name(data)))
            cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (dd.get_sample_name(data), raw_file)
            do.run(cmd, "Fix Name Qualimap for {}".format(dd.get_sample_name(data)))
    metrics = _parse_rnaseq_qualimap_metrics(report_file)
    metrics.update(_detect_duplicates(bam_file, results_dir, data))
    metrics.update(_detect_rRNA(data))
    metrics.update({"Average_insert_size": bam.estimate_fragment_size(bam_file)})
    metrics = _parse_metrics(metrics)
    return metrics
def split_ATAC(data, bam_file=None):
    """
    splits a BAM into nucleosome-free (NF) and mono/di/tri nucleosome BAMs
    based on the estimated insert sizes
    uses the current working BAM file if no BAM file is supplied
    """
    sambamba = config_utils.get_program("sambamba", data)
    num_cores = dd.get_num_cores(data)
    base_cmd = f'{sambamba} view --format bam --nthreads {num_cores} '
    bam_file = bam_file if bam_file else dd.get_work_bam(data)
    out_stem = os.path.splitext(bam_file)[0]
    split_files = {}
    # we can only split these fractions from paired runs
    if not bam.is_paired(bam_file):
        split_files["full"] = bam_file
        data = tz.assoc_in(data, ['atac', 'align'], split_files)
        return data
    for arange in ATACRanges.values():
        out_file = f"{out_stem}-{arange.label}.bam"
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                cmd = base_cmd + \
                    f'-F "template_length > {arange.min} and template_length < {arange.max}" ' + \
                    f'{bam_file} > {tx_out_file}'
                message = f'Splitting {arange.label} regions from {bam_file}.'
                do.run(cmd, message)
            bam.index(out_file, dd.get_config(data))
        split_files[arange.label] = out_file
    split_files["full"] = bam_file
    data = tz.assoc_in(data, ['atac', 'align'], split_files)
    return data
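# ATACRanges is used above but defined elsewhere; the loop only relies on
# each entry having label/min/max attributes. A minimal sketch with
# illustrative insert-size cutoffs for the nucleosome fractions (the exact
# boundary values are assumptions):
import collections

ATACRange = collections.namedtuple("ATACRange", ["label", "min", "max"])
ATACRanges_sketch = {
    "NF": ATACRange("NF", 0, 100),    # nucleosome-free
    "MN": ATACRange("MN", 180, 247),  # mono-nucleosome
    "DN": ATACRange("DN", 315, 473),  # di-nucleosome
    "TN": ATACRange("TN", 558, 615),  # tri-nucleosome
}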
def _rnaseq_qualimap_cmd(data, bam_file, out_dir, gtf_file=None, library="non-strand-specific"):
    """
    Create command lines for qualimap
    """
    config = data["config"]
    qualimap = config_utils.get_program("qualimap", config)
    resources = config_utils.get_resources("qualimap", config)
    num_cores = resources.get("cores", dd.get_num_cores(data))
    max_mem = config_utils.adjust_memory(resources.get("memory", "2G"), num_cores)
    export = "%s%s export JAVA_OPTS='-Xms32m -Xmx%s -Djava.io.tmpdir=%s' && " % (
        utils.java_freetype_fix(), utils.local_path_export(), max_mem, out_dir)
    if library != "non-strand-specific":
        logger.info("Qualimap can get the orientation wrong for stranded reads, so we "
                    "run it in unstranded mode. This gives comparable results to "
                    "unstranded for RNA-seq data "
                    "(see https://groups.google.com/forum/#!topic/qualimap/ZGo-k8LGmHQ "
                    "for a further explanation).")
        library = "non-strand-specific"
    paired = " --paired" if bam.is_paired(bam_file) else ""
    cmd = ("unset DISPLAY && {export} {qualimap} rnaseq -outdir {out_dir} "
           "-a proportional -bam {bam_file} -p {library}{paired} "
           "-gtf {gtf_file}").format(**locals())
    return cmd
def _prep_load_script(work_bams, names, chrom, items):
    pairmode = "paired" if bam.is_paired(work_bams[0]) else "unpaired"
    print(len(items), items[0].get("metadata"))
    if len(items) == 2 and vcfutils.get_paired_phenotype(items[0]):
        load_script = _paired_load_script
    else:
        load_script = _population_load_script
    return load_script(work_bams, names, chrom, pairmode, items)
def _paired_flag(bam_file):
    """
    sets flags to handle paired-end BAM files
    """
    if is_paired(bam_file):
        return "-p -B -C"
    else:
        return ""
def _libtype_string(bam_file, strandedness):
    # auto by default
    libtype = "-l "
    strand = "A"
    if strandedness != "auto":
        libtype = "-l I" if bam.is_paired(bam_file) else "-l "
        strand = sailfish._sailfish_strand_string(strandedness)
    return libtype + strand
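# Worked examples of the strings _libtype_string builds, assuming
# sailfish._sailfish_strand_string maps firststrand -> "SR" and
# secondstrand -> "SF" (the salmon/sailfish library-type codes):
#   paired BAM, firststrand  -> "-l ISR"
#   paired BAM, secondstrand -> "-l ISF"
#   single-end, firststrand  -> "-l SR"
#   any input,  auto         -> "-l A"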
def _prep_load_script(work_bams, names, chrom, items):
    pairmode = "paired" if bam.is_paired(work_bams[0]) else "unpaired"
    print(len(items), items[0].get("metadata"))
    if len(items) == 2 and items[0].get("metadata", {}).get("phenotype") in ["tumor", "normal"]:
        load_script = _paired_load_script
    else:
        load_script = _population_load_script
    return load_script(work_bams, names, chrom, pairmode, items)
def _prep_load_script(work_bams, names, chrom, items):
    if not chrom:
        chrom = ""
    pairmode = "paired" if bam.is_paired(work_bams[0]) else "unpaired"
    if len(items) == 2 and vcfutils.get_paired_phenotype(items[0]):
        load_script = _paired_load_script
    else:
        load_script = _population_load_script
    return load_script(work_bams, names, chrom, pairmode, items)
def _bgzip_from_bam(bam_file, dirs, config, is_retry=False):
    """Create bgzipped fastq files from an input BAM file.
    """
    # tools
    bamtofastq = config_utils.get_program("bamtofastq", config)
    resources = config_utils.get_resources("bamtofastq", config)
    cores = config["algorithm"].get("num_cores", 1)
    max_mem = int(resources.get("memory", "1073741824")) * cores  # 1Gb/core default
    bgzip = tools.get_bgzip_cmd(config, is_retry)
    # files
    work_dir = utils.safe_makedir(os.path.join(dirs["work"], "align_prep"))
    out_file_1 = os.path.join(work_dir, "%s-1.fq.gz" % os.path.splitext(os.path.basename(bam_file))[0])
    if bam.is_paired(bam_file):
        out_file_2 = out_file_1.replace("-1.fq.gz", "-2.fq.gz")
    else:
        out_file_2 = None
    needs_retry = False
    if is_retry or not utils.file_exists(out_file_1):
        with file_transaction(config, out_file_1) as tx_out_file:
            for f in [tx_out_file, out_file_1, out_file_2]:
                if f and os.path.exists(f):
                    os.remove(f)
            fq1_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, tx_out_file)
            sortprefix = "%s-sort" % os.path.splitext(tx_out_file)[0]
            if bam.is_paired(bam_file):
                fq2_bgzip_cmd = "%s -c /dev/stdin > %s" % (bgzip, out_file_2)
                out_str = ("F=>({fq1_bgzip_cmd}) F2=>({fq2_bgzip_cmd}) S=/dev/null O=/dev/null "
                           "O2=/dev/null collate=1 colsbs={max_mem}")
            else:
                out_str = "S=>({fq1_bgzip_cmd})"
            bam_file = objectstore.cl_input(bam_file)
            cmd = "{bamtofastq} filename={bam_file} T={sortprefix} " + out_str
            try:
                do.run(cmd.format(**locals()), "BAM to bgzipped fastq",
                       checks=[do.file_reasonable_size(tx_out_file, bam_file)],
                       log_error=False)
            except subprocess.CalledProcessError as msg:
                if not is_retry and "deflate failed" in str(msg):
                    logger.info("bamtofastq deflate IO failure preparing %s. Retrying with single core."
                                % (bam_file))
                    needs_retry = True
                else:
                    logger.exception()
                    raise
    if needs_retry:
        return _bgzip_from_bam(bam_file, dirs, config, is_retry=True)
    else:
        return [x for x in [out_file_1, out_file_2]
                if x is not None and utils.file_exists(x)]
def run(name, chip_bam, input_bam, genome_build, out_dir, method, resources, data):
    """
    Run macs2 for chip and input samples avoiding errors due to samples.
    """
    # the output file name needs to include the caller name
    config = dd.get_config(data)
    out_file = os.path.join(out_dir, name + "_peaks_macs2.xls")
    macs2_file = os.path.join(out_dir, name + "_peaks.xls")
    if utils.file_exists(out_file):
        _compress_and_sort_bdg_files(out_dir, data)
        return _get_output_files(out_dir)
    macs2 = config_utils.get_program("macs2", config)
    antibody = dd.get_antibody(data)
    if antibody:
        antibody = antibody.lower()
        if antibody not in antibodies.SUPPORTED_ANTIBODIES:
            logger.error(f"{antibody} specified, but not listed as a supported antibody. "
                         f"Valid antibodies are {antibodies.SUPPORTED_ANTIBODIES}. "
                         f"If you know your antibody should be called with narrow or broad "
                         f"peaks, supply 'narrow' or 'broad' as the antibody. "
                         f"It will run 'narrow' if the antibody is not supported.")
            antibody = 'narrow'
        antibody = antibodies.ANTIBODIES[antibody]
        logger.info(f"{antibody.name} specified, using {antibody.peaktype} peak settings.")
        peaksettings = select_peak_parameters(antibody)
    elif method == "atac":
        logger.info("ATAC-seq specified, using narrow peak settings.")
        peaksettings = " "
    else:
        peaksettings = " "
    options = " ".join(resources.get("macs2", {}).get("options", ""))
    genome_size = bam.fasta.total_sequence_length(dd.get_ref_file(data))
    genome_size = "" if options.find("-g") > -1 else "-g %s" % genome_size
    paired = "-f BAMPE" if bam.is_paired(chip_bam) else ""
    with utils.chdir(out_dir):
        cmd = _macs2_cmd(data)
        cmd += peaksettings
        try:
            do.run(cmd.format(**locals()), "macs2 for %s" % name)
            utils.move_safe(macs2_file, out_file)
        except subprocess.CalledProcessError:
            raise RuntimeWarning("macs2 terminated with an error. "
                                 "Please, check the message and report "
                                 "error if it is related to bcbio. "
                                 "You can add specific options for the sample "
                                 "setting resources as explained in docs: "
                                 "https://bcbio-nextgen.readthedocs.org/en/latest/contents/configuration.html#sample-specific-resources")
    _compress_and_sort_bdg_files(out_dir, data)
    return _get_output_files(out_dir)
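# _macs2_cmd is referenced above but defined elsewhere; run() formats its
# result with locals(), so it is expected to return a template using names
# like macs2, chip_bam, input_bam, paired, genome_size, options and name.
# A minimal sketch of a standard macs2 callpeak line under those
# assumptions (the ATAC shift/extsize values are illustrative):
def _macs2_cmd_sketch(data):
    cmd = ("{macs2} callpeak -t {chip_bam} -c {input_bam} {paired} "
           "{genome_size} -n {name} --bdg {options} ")
    if dd.get_chip_method(data) == "atac":
        # shifted, model-free calling commonly used for ATAC-seq
        cmd += "--nomodel --shift -100 --extsize 200 "
    return cmd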
def _run_bamtools_stats(bam_file, data, out_dir):
    """Run bamtools stats with reports on mapped reads, duplicates and insert sizes.
    """
    stats_file = os.path.join(out_dir, "bamtools_stats.txt")
    if not utils.file_exists(stats_file):
        utils.safe_makedir(out_dir)
        bamtools = config_utils.get_program("bamtools", data["config"])
        with file_transaction(stats_file) as tx_out_file:
            cmd = "{bamtools} stats -in {bam_file}"
            if bam.is_paired(bam_file):
                cmd += " -insert"
            cmd += " > {tx_out_file}"
            do.run(cmd.format(**locals()), "bamtools stats", data)
    return _parse_bamtools_stats(stats_file)
def _set_stranded_flag(bam_file, data):
    strand_flag = {"unstranded": "",
                   "firststrand": "--rf-stranded",
                   "secondstrand": "--fr-stranded",
                   "firststrand-s": "--r-stranded",
                   "secondstrand-s": "--f-stranded"}
    stranded = dd.get_strandedness(data)
    assert stranded in strand_flag, ("%s is not a valid strandedness value. "
                                     "Valid values are 'firststrand', "
                                     "'secondstrand' and 'unstranded'" % (stranded))
    if stranded != "unstranded" and not is_paired(bam_file):
        stranded += "-s"
    flag = strand_flag[stranded]
    return flag
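# Worked examples for _set_stranded_flag: single-end data falls back to the
# "-s" (single-read) variants of the flags:
#   paired BAM,  firststrand  -> "--rf-stranded"
#   single-end,  firststrand  -> "--r-stranded"
#   single-end,  secondstrand -> "--f-stranded"
#   any input,   unstranded   -> ""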
def run_rnaseq(bam_file, data, out_dir):
    """
    Run qualimap for a rnaseq bam file and parse results
    """
    strandedness = {"firststrand": "strand-specific-reverse",
                    "secondstrand": "strand-specific-forward",
                    "unstranded": "non-strand-specific"}
    # Qualimap results should be saved to a directory named after sample.
    # MultiQC (for parsing additional data) picks the sample name after the dir as follows:
    #   <sample name>/raw_data_qualimapReport/insert_size_histogram.txt
    results_dir = os.path.join(out_dir, dd.get_sample_name(data))
    results_file = os.path.join(results_dir, "rnaseq_qc_results.txt")
    report_file = os.path.join(results_dir, "qualimapReport.html")
    config = data["config"]
    gtf_file = dd.get_gtf_file(data)
    single_end = not bam.is_paired(bam_file)
    library = strandedness[dd.get_strandedness(data)]
    if not utils.file_exists(results_file):
        with file_transaction(data, results_dir) as tx_results_dir:
            utils.safe_makedir(tx_results_dir)
            bam.index(bam_file, config)
            cmd = _rnaseq_qualimap_cmd(data, bam_file, tx_results_dir, gtf_file, single_end, library)
            do.run(cmd, "Qualimap for {}".format(dd.get_sample_name(data)))
            tx_results_file = os.path.join(tx_results_dir, "rnaseq_qc_results.txt")
            cmd = "sed -i 's/bam file = .*/bam file = %s.bam/' %s" % (
                dd.get_sample_name(data), tx_results_file)
            do.run(cmd, "Fix Name Qualimap for {}".format(dd.get_sample_name(data)))
    metrics = _parse_rnaseq_qualimap_metrics(report_file)
    metrics.update(_detect_duplicates(bam_file, results_dir, data))
    metrics.update(_detect_rRNA(data))
    metrics.update({"Average_insert_size": bam.estimate_fragment_size(bam_file)})
    metrics = _parse_metrics(metrics)
    # Qualimap output folder (results_dir) needs to be named after the sample (see comments
    # above). However, in order to keep its name after upload, we need to put the base QC
    # file (results_file) into the root directory (out_dir):
    base_results_file = os.path.join(out_dir, os.path.basename(results_file))
    shutil.copyfile(results_file, base_results_file)
    return {"base": base_results_file,
            "secondary": _find_qualimap_secondary_files(results_dir, base_results_file),
            "metrics": metrics}
def run_count(bam_file, dexseq_gff, stranded, out_file, data):
    """
    run dexseq_count on a BAM file
    """
    assert file_exists(bam_file), "%s does not exist." % bam_file
    sort_order = bam._get_sort_order(bam_file, {})
    assert sort_order, "Cannot determine sort order of %s." % bam_file
    strand_flag = _strand_flag(stranded)
    assert strand_flag, "%s is not a valid strandedness value." % stranded
    if not dexseq_gff:
        logger.info("No DEXSeq GFF file was found, skipping exon-level counting.")
        return None
    elif not file_exists(dexseq_gff):
        logger.info("%s was not found, so exon-level counting is being "
                    "skipped." % dexseq_gff)
        return None
    dexseq_count = _dexseq_count_path()
    if not dexseq_count:
        logger.info("DEXseq is not installed, skipping exon-level counting.")
        return None
    if dd.get_aligner(data) == "bwa":
        logger.info("Can't use DEXSeq with bwa alignments, skipping exon-level counting.")
        return None
    sort_flag = "name" if sort_order == "queryname" else "pos"
    is_paired = bam.is_paired(bam_file)
    paired_flag = "yes" if is_paired else "no"
    anaconda = os.path.dirname(os.path.realpath(sys.executable))
    r36_python = os.path.join(anaconda, "..", "envs", "r36", "bin", "python")
    if file_exists(out_file):
        return out_file
    cmd = ("{r36_python} {dexseq_count} -f bam -r {sort_flag} -p {paired_flag} "
           "-s {strand_flag} {dexseq_gff} {bam_file} {tx_out_file}")
    message = "Counting exon-level counts with %s and %s." % (bam_file, dexseq_gff)
    with file_transaction(data, out_file) as tx_out_file:
        do.run(cmd.format(**locals()), message)
    return out_file
def call_consensus(samples):
    """
    call consensus peaks on the narrowPeak files from a set of ChIP/ATAC samples
    """
    data = samples[0][0]
    new_samples = []
    consensusdir = os.path.join(dd.get_work_dir(data), "consensus")
    utils.safe_makedir(consensusdir)
    peakfiles = []
    for data in dd.sample_data_iterator(samples):
        if dd.get_chip_method(data) == "chip":
            for fn in tz.get_in(("peaks_files", "macs2"), data, []):
                if "narrowPeak" in fn:
                    peakfiles.append(fn)
                elif "broadPeak" in fn:
                    peakfiles.append(fn)
        elif dd.get_chip_method(data) == "atac":
            if bam.is_paired(dd.get_work_bam(data)):
                for fn in tz.get_in(("peaks_files", "NF", "macs2"), data, []):
                    if "narrowPeak" in fn:
                        peakfiles.append(fn)
            else:
                logger.info(f"Using peaks from full fraction since "
                            f"{dd.get_work_bam(data)} is single-ended.")
                for fn in tz.get_in(("peaks_files", "full", "macs2"), data, []):
                    if "narrowPeak" in fn:
                        peakfiles.append(fn)
    consensusfile = os.path.join(consensusdir, "consensus.bed")
    if not peakfiles:
        logger.info("No suitable peak files found, skipping consensus peak calling.")
        return samples
    consensusfile = consensus(peakfiles, consensusfile, data)
    if not utils.file_exists(consensusfile):
        logger.warning("No consensus peaks found.")
        return samples
    saffile = consensus_to_saf(consensusfile,
                               os.path.splitext(consensusfile)[0] + ".saf")
    for data in dd.sample_data_iterator(samples):
        data = tz.assoc_in(data, ("peaks_files", "consensus"), {"main": consensusfile})
        new_samples.append([data])
    return new_samples
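# consensus_to_saf is referenced above but defined elsewhere. featureCounts'
# SAF format is GeneID/Chr/Start/End/Strand with 1-based inclusive
# coordinates, so a minimal BED -> SAF sketch (the helper name and the
# peak-naming scheme are assumptions) would be:
def consensus_to_saf_sketch(bed_file, saf_file):
    with open(bed_file) as in_handle, open(saf_file, "w") as out_handle:
        out_handle.write("GeneID\tChr\tStart\tEnd\tStrand\n")
        for i, line in enumerate(in_handle):
            chrom, start, end = line.split()[:3]
            # BED is 0-based half-open; SAF is 1-based inclusive
            out_handle.write("peak_%d\t%s\t%d\t%s\t.\n"
                             % (i, chrom, int(start) + 1, end))
    return saf_file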
def sample_summary(bam_file, data, out_dir):
    """Run RNA-SeQC on a single RNAseq sample, writing to specified output directory.
    """
    metrics_file = os.path.join(out_dir, "metrics.tsv")
    if not file_exists(metrics_file):
        config = data["config"]
        ref_file = data["sam_ref"]
        genome_dir = os.path.dirname(os.path.dirname(ref_file))
        gtf_file = config_utils.get_transcript_gtf(genome_dir)
        rna_file = config_utils.get_rRNA_sequence(genome_dir)
        sample_file = os.path.join(safe_makedir(out_dir), "sample_file.txt")
        _write_sample_id_file(data, bam_file, sample_file)
        runner = rnaseqc_runner_from_config(config)
        bam.index(bam_file, config)
        single_end = not bam.is_paired(bam_file)
        runner.run(sample_file, ref_file, rna_file, gtf_file, out_dir, single_end)
    return _parse_rnaseqc_metrics(metrics_file, data["name"][-1])
def _rnaseq_qualimap_cmd(data, bam_file, out_dir, gtf_file=None, library="non-strand-specific"):
    """
    Create command lines for qualimap
    """
    config = data["config"]
    qualimap = config_utils.get_program("qualimap", config)
    resources = config_utils.get_resources("qualimap", config)
    num_cores = resources.get("cores", dd.get_num_cores(data))
    max_mem = config_utils.adjust_memory(resources.get("memory", "2G"), num_cores)
    export = "%s%s export JAVA_OPTS='-Xms32m -Xmx%s -Djava.io.tmpdir=%s' && " % (
        utils.java_freetype_fix(), utils.local_path_export(), max_mem, out_dir)
    paired = " --paired" if bam.is_paired(bam_file) else ""
    cmd = ("unset DISPLAY && {export} {qualimap} rnaseq -outdir {out_dir} "
           "-a proportional -bam {bam_file} -p {library}{paired} "
           "-gtf {gtf_file}").format(**locals())
    return cmd
def _convert_bam_to_fastq(in_file, work_dir, data, dirs, config):
    """Convert BAM input file into FASTQ files.
    """
    out_dir = safe_makedir(os.path.join(work_dir, "fastq_convert"))
    out_files = [os.path.join(out_dir, "{0}_{1}.fastq".format(
        os.path.splitext(os.path.basename(in_file))[0], x))
        for x in ["1", "2"]]
    if bam.is_paired(in_file):
        out1, out2 = out_files
    else:
        out1 = out_files[0]
        out2 = None
    if not file_exists(out1):
        broad_runner = broad.runner_from_path("picard", config)
        broad_runner.run_fn("picard_bam_to_fastq", in_file, out1, out2)
    if out2 and os.path.getsize(out2) == 0:
        out2 = None
    return [out1, out2]
def sample_summary(bam_file, data, out_dir):
    """Run RNA-SeQC on a single RNAseq sample, writing to specified output directory.
    """
    metrics_file = os.path.join(out_dir, "metrics.tsv")
    if not file_exists(metrics_file):
        with file_transaction(out_dir) as tx_out_dir:
            config = data["config"]
            ref_file = data["sam_ref"]
            genome_dir = os.path.dirname(os.path.dirname(ref_file))
            gtf_file = config_utils.get_transcript_gtf(genome_dir)
            rna_file = config_utils.get_rRNA_sequence(genome_dir)
            sample_file = os.path.join(safe_makedir(tx_out_dir), "sample_file.txt")
            _write_sample_id_file(data, bam_file, sample_file)
            runner = rnaseqc_runner_from_config(config)
            bam.index(bam_file, config)
            single_end = not bam.is_paired(bam_file)
            runner.run(sample_file, ref_file, rna_file, gtf_file, tx_out_dir, single_end)
            # we don't need this large directory for just the report
            shutil.rmtree(os.path.join(tx_out_dir, data["description"]))
    return _parse_rnaseqc_metrics(metrics_file, data["name"][-1])
def _rnaseq_qualimap(bam_file, data, out_dir):
    """
    Run qualimap for a rnaseq bam file and parse results
    """
    report_file = os.path.join(out_dir, "qualimapReport.html")
    config = data["config"]
    gtf_file = dd.get_gtf_file(data)
    ref_file = dd.get_ref_file(data)
    single_end = not bam.is_paired(bam_file)
    if not utils.file_exists(report_file):
        utils.safe_makedir(out_dir)
        bam.index(bam_file, config)
        cmd = _rnaseq_qualimap_cmd(config, bam_file, out_dir, gtf_file, single_end)
        do.run(cmd, "Qualimap for {}".format(data["name"][-1]))
    metrics = _parse_rnaseq_qualimap_metrics(report_file)
    metrics.update(_detect_duplicates(bam_file, out_dir, config))
    metrics.update(_detect_rRNA(config, bam_file, gtf_file, ref_file, out_dir, single_end))
    metrics.update({"Fragment Length Mean": bam.estimate_fragment_size(bam_file)})
    metrics = _parse_metrics(metrics)
    return metrics