def main(config_file, align_sam, ref_file, fastq_one, fastq_pair=None,
         sample_name="", rg_name="", pu_name=""):
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    picard = BroadRunner(config["program"]["picard"],
                         max_memory=config["algorithm"].get("java_memory", ""))
    platform = config["algorithm"]["platform"]
    if platform.lower() == "illumina":
        qual_format = "Illumina"
    else:
        raise ValueError("Need to specify quality format for %s" % platform)
    index_ref_file(picard, ref_file)
    base_dir = os.path.split(align_sam)[0]
    with curdir_tmpdir() as tmp_dir:
        out_fastq_bam = picard_fastq_to_bam(picard, fastq_one, fastq_pair,
                                            base_dir, platform, qual_format,
                                            sample_name, rg_name, pu_name, tmp_dir)
        out_bam = picard_merge_bam(picard, align_sam, out_fastq_bam, ref_file,
                                   tmp_dir, fastq_pair is not None)
        sort_bam = picard_sort(picard, out_bam, tmp_dir)
        save_diskspace(out_fastq_bam, "Combined into output BAM %s" % out_bam, config)
        save_diskspace(out_bam, "Sorted to %s" % sort_bam, config)
def align(fastq_file, pair_file, index_dir, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted, deduplicated BAM.

    TODO: Use streaming with new development version of SNAP to feed into
    structural variation preparation de-duplication.
    """
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    assert not data.get("align_split"), "Split alignments not supported with SNAP"
    snap = config_utils.get_program("snap", data["config"])
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    resources = config_utils.get_resources("snap", data["config"])
    max_mem = resources.get("memory", "1G")
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            with utils.curdir_tmpdir(data) as work_dir:
                if fastq_file.endswith(".bam"):
                    cmd_name = "paired" if bam.is_paired(fastq_file) else "single"
                else:
                    cmd_name = "single" if not pair_file else "paired"
                cmd = ("{snap} {cmd_name} {index_dir} {fastq_file} {pair_file} "
                       "-rg '{rg_info}' -t {num_cores} -sa -so -sm {max_mem} -o {tx_out_file}")
                do.run(cmd.format(**locals()), "SNAP alignment: %s" % names["sample"])
    data["work_bam"] = out_file
    return data
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, data):
    """Perform piped alignment of fastq input files, generating sorted output BAM.
    """
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file = alignprep.split_namedpipe_cl(fastq_file, data)
        if pair_file:
            pair_file = alignprep.split_namedpipe_cl(pair_file, data)
    else:
        final_file = None
    samtools = config_utils.get_program("samtools", data["config"])
    novoalign = config_utils.get_program("novoalign", data["config"])
    resources = config_utils.get_resources("novoalign", data["config"])
    num_cores = data["config"]["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "1G")
    extra_novo_args = " ".join(_novoalign_args_from_config(data["config"]))
    rg_info = get_rg_info(names)
    if not utils.file_exists(out_file) and (final_file is None
                                            or not utils.file_exists(final_file)):
        with utils.curdir_tmpdir(data) as work_dir:
            with postalign.tobam_cl(data, out_file, pair_file != "") as (tobam_cl, tx_out_file):
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                cmd = ("{novoalign} -o SAM '{rg_info}' -d {ref_file} -f {fastq_file} {pair_file} "
                       " -c {num_cores} {extra_novo_args} | ")
                cmd = cmd.format(**locals()) + tobam_cl
                do.run(cmd, "Novoalign: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file),
                        do.file_reasonable_size(tx_out_file, fastq_file)])
    data["work_bam"] = out_file
    return data
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Perform direct alignment of an input BAM file with BWA using pipes.

    This avoids disk IO by piping between processes:
     - samtools sort of input BAM to queryname
     - bedtools conversion to interleaved FASTQ
     - bwa-mem alignment
     - samtools conversion to BAM
     - samtools sort to coordinate
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bedtools = config_utils.get_program("bedtools", config)
    bwa = config_utils.get_program("bwa", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used for input and output
    max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), 3, "decrease")
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file):
        with utils.curdir_tmpdir() as work_dir:
            with postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                cmd = ("{samtools} sort -n -o -l 0 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                       "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                       "| {bwa} mem -p -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} - | ")
                cmd = cmd.format(**locals()) + tobam_cl
                do.run(cmd, "bwa mem alignment from BAM: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file),
                        do.file_reasonable_size(tx_out_file, in_bam)])
    return out_file
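# A minimal sketch (not from the source) of how the piped command above expands
# for hypothetical inputs; the trailing tobam_cl fragment supplied by
# postalign.tobam_cl is assumed to consume SAM on stdin and write the sorted BAM.
def example_bwa_bam_pipeline():
    samtools, bedtools, bwa = "samtools", "bedtools", "bwa"  # assumed on PATH
    num_cores, max_mem = 4, "1G"
    in_bam, ref_file = "input.bam", "GRCh37.fa"  # hypothetical paths
    rg_info = r"@RG\tID:lane1\tPL:illumina\tPU:unit1\tSM:sample1"
    prefix1 = "tx-out-in1"
    return ("{samtools} sort -n -o -l 0 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
            "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
            "| {bwa} mem -p -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} - | ").format(**locals())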
def _extract_split_and_discordants(in_bam, work_dir, data):
    """Retrieve split-read alignments from input BAM file.
    """
    sr_file = os.path.join(work_dir, "%s-sr.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    disc_file = os.path.join(work_dir, "%s-disc.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    samblaster = config_utils.get_program("samblaster", data["config"])
    sambamba = config_utils.get_program("sambamba", data["config"])
    cores = utils.get_in(data, ("config", "algorithm", "num_cores"), 1)
    resources = config_utils.get_resources("sambamba", data["config"])
    mem = config_utils.adjust_memory(resources.get("memory", "2G"), 3, "decrease")
    if not utils.file_exists(sr_file) or not utils.file_exists(disc_file):
        with file_transaction(sr_file) as tx_sr_file:
            with file_transaction(disc_file) as tx_disc_file:
                with utils.curdir_tmpdir() as tmpdir:
                    tobam_cmd = ("{sambamba} view -S -f bam -l 0 /dev/stdin | "
                                 "{sambamba} sort -t {cores} -m {mem} --tmpdir {tmpdir} "
                                 "-o {out_file} /dev/stdin")
                    splitter_cmd = tobam_cmd.format(out_file=tx_sr_file, **locals())
                    discordant_cmd = tobam_cmd.format(out_file=tx_disc_file, **locals())
                    cmd = ("{sambamba} sort -t {cores} -m {mem} --tmpdir={tmpdir} "
                           "-n -o /dev/stdout -l 0 {in_bam} | "
                           "{sambamba} view -h /dev/stdin | "
                           "{samblaster} --splitterFile >({splitter_cmd}) --discordantFile >({discordant_cmd}) "
                           "-o /dev/null")
                    do.run(cmd.format(**locals()), "samblaster: split and discordant reads", data)
    return sr_file, disc_file
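# Note, as an assumption rather than something shown in the source: the samblaster
# command above relies on bash process substitution (">(...)"), so it must execute
# under bash rather than POSIX sh. A minimal sketch of forcing that for a pipeline:
import subprocess

def run_with_bash(cmd):
    # executable="/bin/bash" makes ">(...)" valid; plain shell=True may use /bin/sh
    subprocess.check_call(cmd, shell=True, executable="/bin/bash")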
def _run_recal_bam(dup_align_bam, recal_file, region, ref_file, out_file, config):
    """Run BAM recalibration with the given input.
    """
    if not file_exists(out_file):
        if _recal_available(recal_file):
            broad_runner = broad.runner_from_config(config)
            intervals = config["algorithm"].get("variant_regions", None)
            with curdir_tmpdir() as tmp_dir:
                with file_transaction(out_file) as tx_out_file:
                    params = ["-T", "PrintReads",
                              "-BQSR", recal_file,
                              "-R", ref_file,
                              "-I", dup_align_bam,
                              "--out", tx_out_file,
                              ]
                    if region:
                        params += ["-L", region]
                    if intervals:
                        params += ["-L", intervals]
                    # only intersect when both a region and variant regions restrict the run
                    if region and intervals:
                        params += ["--interval_set_rule", "INTERSECTION"]
                    broad_runner.run_gatk(params, tmp_dir)
        elif region:
            subset_bam_by_region(dup_align_bam, region, out_file)
        else:
            shutil.copy(dup_align_bam, out_file)
    return out_file
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, config):
    """Perform piped alignment of fastq input files, generating sorted output BAM.
    """
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    novoalign = config_utils.get_program("novoalign", config)
    resources = config_utils.get_resources("novoalign", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "1G")
    qual_format = config["algorithm"].get("quality_format", "").lower()
    qual_flag = "ILMFQ" if qual_format == "illumina" else "STDFQ"
    rg_info = get_rg_info(names)
    if not utils.file_exists(out_file):
        with utils.curdir_tmpdir() as work_dir:
            with file_transaction(out_file) as tx_out_file:
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                cmd = ("{novoalign} -o SAM '{rg_info}' -d {ref_file} -f {fastq_file} {pair_file} "
                       " -F {qual_flag} -c {num_cores} "
                       "| {samtools} view -b -S -u - "
                       "| {samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                cmd = cmd.format(**locals())
                log_cmd("Novoalign: %s" % names["sample"], None, cmd)
                subprocess.check_call(cmd, shell=True)
    return out_file
def _run_recal_bam(dup_align_bam, recal_file, region, ref_file, out_file, config):
    """Run BAM recalibration with the given input.
    """
    if not file_exists(out_file):
        if _recal_available(recal_file):
            broad_runner = broad.runner_from_config(config)
            with curdir_tmpdir() as tmp_dir:
                with file_transaction(out_file) as tx_out_file:
                    params = ["-T", "PrintReads",
                              "-BQSR", recal_file,
                              "-R", ref_file,
                              "-I", dup_align_bam,
                              "--out", tx_out_file,
                              ]
                    base_bed = config["algorithm"].get("variant_regions", None)
                    region_bed = subset_variant_regions(base_bed, region, tx_out_file)
                    if region_bed:
                        params += ["-L", region_bed, "--interval_set_rule", "INTERSECTION"]
                    elif region:
                        params += ["-L", region, "--interval_set_rule", "INTERSECTION"]
                    broad_runner.run_gatk(params, tmp_dir)
        elif region:
            subset_bam_by_region(dup_align_bam, region, out_file)
        else:
            shutil.copy(dup_align_bam, out_file)
    return out_file
def run_gatk(self, params, tmp_dir=None):
    #support_nt = set(["UnifiedGenotyper", "VariantEval"])
    support_nt = set()
    support_nct = set(["BaseRecalibrator"])
    gatk_jar = self._get_jar("GenomeAnalysisTK", ["GenomeAnalysisTKLite"])
    local_args = []
    cores = self._resources.get("cores", None)
    if cores and cores > 1:
        atype_index = params.index("-T") if params.count("-T") > 0 \
                      else params.index("--analysis_type")
        prog = params[atype_index + 1]
        if prog in support_nt:
            params.extend(["-nt", str(cores)])
        elif prog in support_nct:
            params.extend(["-nct", str(cores)])
    if len([x for x in params if x.startswith(("-U", "--unsafe"))]) == 0:
        params.extend(["-U", "LENIENT_VCF_PROCESSING"])
    with curdir_tmpdir() as local_tmp_dir:
        if tmp_dir is None:
            tmp_dir = local_tmp_dir
        local_args.append("-Djava.io.tmpdir=%s" % tmp_dir)
        cl = ["java"] + self._jvm_opts + local_args + \
             ["-jar", gatk_jar] + [str(x) for x in params]
        #print " ".join(cl)
        subprocess.check_call(cl)
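# Hedged illustration (the helper name is hypothetical, not from the source) of the
# "-T"/"--analysis_type" lookup above: the GATK walker name is simply the token
# following whichever flag is present in the parameter list.
def _gatk_analysis_type(params):
    idx = params.index("-T") if params.count("-T") > 0 else params.index("--analysis_type")
    return params[idx + 1]

assert _gatk_analysis_type(["-T", "BaseRecalibrator", "-I", "in.bam"]) == "BaseRecalibrator"
assert _gatk_analysis_type(["--analysis_type", "PrintReads"]) == "PrintReads"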
def align_bam(in_bam, ref_file, names, align_dir, config):
    """Perform realignment of input BAM file, handling sorting of input/output with novosort.

    Uses unix pipes to avoid IO writing between steps:
     - novosort of input BAM by read name
     - alignment with novoalign
     - conversion to BAM with samtools
     - coordinate sorting with novosort
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    novosort = config_utils.get_program("novosort", config)
    novoalign = config_utils.get_program("novoalign", config)
    samtools = config_utils.get_program("samtools", config)
    resources = config_utils.get_resources("novoalign", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "4G")
    extra_novo_args = " ".join(_novoalign_args_from_config(config, False))
    if not file_exists(out_file):
        with curdir_tmpdir(base_dir=align_dir) as work_dir:
            with file_transaction(out_file) as tx_out_file:
                rg_info = get_rg_info(names)
                cmd = ("{novosort} -c {num_cores} -m {max_mem} --compression 0 "
                       " -n -t {work_dir} {in_bam} "
                       "| {novoalign} -o SAM '{rg_info}' -d {ref_file} -f /dev/stdin "
                       " -F BAMPE -c {num_cores} {extra_novo_args} "
                       "| {samtools} view -b -S -u - "
                       "| {novosort} -c {num_cores} -m {max_mem} -t {work_dir} "
                       " -o {tx_out_file} /dev/stdin")
                cmd = cmd.format(**locals())
                do.run(cmd, "Novoalign: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file)])
    return out_file
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, config):
    """Perform piped alignment of fastq input files, generating sorted output BAM.
    """
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    novoalign = config_utils.get_program("novoalign", config)
    resources = config_utils.get_resources("novoalign", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "1G")
    extra_novo_args = " ".join(_novoalign_args_from_config(config, False))
    rg_info = get_rg_info(names)
    if not utils.file_exists(out_file):
        with utils.curdir_tmpdir() as work_dir:
            with file_transaction(out_file) as tx_out_file:
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                cmd = ("{novoalign} -o SAM '{rg_info}' -d {ref_file} -f {fastq_file} {pair_file} "
                       " -c {num_cores} {extra_novo_args} "
                       "| {samtools} view -b -S -u - "
                       "| {samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                cmd = cmd.format(**locals())
                do.run(cmd, "Novoalign: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file)])
    return out_file
def _gatk_count_covariates(broad_runner, dup_align_bam, ref_file, platform,
                           snp_file, intervals):
    """Step 1 of GATK recalibration process -- counting covariates.
    """
    out_file = "%s.recal" % os.path.splitext(dup_align_bam)[0]
    if not file_exists(out_file):
        if has_aligned_reads(dup_align_bam):
            with curdir_tmpdir() as tmp_dir:
                with file_transaction(out_file) as tx_out_file:
                    params = ["-T", "CountCovariates",
                              "-cov", "ReadGroupCovariate",
                              "-cov", "QualityScoreCovariate",
                              "-cov", "CycleCovariate",
                              "-cov", "DinucCovariate",
                              "-recalFile", tx_out_file,
                              "-I", dup_align_bam,
                              "-R", ref_file,
                              "-l", "INFO",
                              "-U",
                              "-OQ",
                              "--default_platform", platform,
                              ]
                    if snp_file:
                        params += ["--knownSites", snp_file]
                    if intervals:
                        params += ["-L", intervals, "--interval_set_rule", "INTERSECTION"]
                    broad_runner.run_gatk(params, tmp_dir)
        else:
            with open(out_file, "w") as out_handle:
                out_handle.write("# No aligned reads")
    return out_file
def _gatk_table_recalibrate(broad_runner, dup_align_bam, ref_file, recal_file,
                            platform, intervals):
    """Step 2 of GATK recalibration -- use covariates to re-write output file.
    """
    out_file = "%s-gatkrecal.bam" % os.path.splitext(dup_align_bam)[0]
    if not file_exists(out_file):
        if _recal_available(recal_file):
            with curdir_tmpdir() as tmp_dir:
                with file_transaction(out_file) as tx_out_file:
                    params = ["-T", "TableRecalibration",
                              "-recalFile", recal_file,
                              "-R", ref_file,
                              "-I", dup_align_bam,
                              "--out", tx_out_file,
                              "-baq", "RECALCULATE",
                              "-l", "INFO",
                              "-U",
                              "-OQ",
                              "--default_platform", platform,
                              ]
                    if intervals:
                        params += ["-L", intervals, "--interval_set_rule", "INTERSECTION"]
                    broad_runner.run_gatk(params, tmp_dir)
        else:
            shutil.copy(dup_align_bam, out_file)
    return out_file
def _run_toplevel(config, config_file, work_dir, parallel,
                  fc_dir=None, run_info_yaml=None):
    """Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    dirs = setup_directories(work_dir, fc_dir, config, config_file)
    config_file = os.path.join(dirs["config"], os.path.basename(config_file))
    samples = run_info.organize(dirs, config, run_info_yaml)
    pipelines = _pair_lanes_with_pipelines(samples)
    final = []
    with utils.curdir_tmpdir({"config": config}) as tmpdir:
        tempfile.tempdir = tmpdir
        for pipeline, pipeline_items in pipelines.items():
            pipeline_items = _add_provenance(pipeline_items, dirs, parallel, config)
            versioncheck.testall(pipeline_items)
            for xs in pipeline.run(config, config_file, parallel, dirs, pipeline_items):
                if len(xs) == 1:
                    upload.from_sample(xs[0])
                    final.append(xs[0])
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Perform realignment of input BAM file; uses unix pipes to avoid IO.
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    novoalign = config_utils.get_program("novoalign", config)
    samtools = config_utils.get_program("samtools", config)
    resources = config_utils.get_resources("novoalign", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "4G").upper()
    extra_novo_args = " ".join(_novoalign_args_from_config(config, False))
    if not file_exists(out_file):
        with utils.curdir_tmpdir(data, base_dir=align_dir) as work_dir:
            with postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
                rg_info = get_rg_info(names)
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                cmd = ("{samtools} sort -n -o -l 0 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                       "| {novoalign} -o SAM '{rg_info}' -d {ref_file} -f /dev/stdin "
                       " -F BAMPE -c {num_cores} {extra_novo_args} | ")
                cmd = cmd.format(**locals()) + tobam_cl
                do.run(cmd, "Novoalign: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file),
                        do.file_reasonable_size(tx_out_file, in_bam)])
    return out_file
def run_gatk(self, params, tmp_dir=None, log_error=True,
             memory_retry=False, data=None, region=None):
    with curdir_tmpdir() as local_tmp_dir:
        if tmp_dir is None:
            tmp_dir = local_tmp_dir
        cl = self.cl_gatk(params, tmp_dir)
        atype_index = cl.index("-T") if cl.count("-T") > 0 \
                      else cl.index("--analysis_type")
        prog = cl[atype_index + 1]
        if memory_retry:
            do.run_memory_retry(cl, "GATK: {0}".format(prog), data, region=region)
        else:
            do.run(cl, "GATK: {0}".format(prog), data, region=region,
                   log_error=log_error)
def run_mutect(self, params, tmp_dir=None):
    with curdir_tmpdir() as local_tmp_dir:
        if tmp_dir is None:
            tmp_dir = local_tmp_dir
        cl = self.cl_mutect(params, tmp_dir)
        prog = "MuTect"
        do.run(cl, "MuTect: {0}".format(prog), None)
def _run_fastqc(bam_file, data, fastqc_out):
    """Run fastqc, generating report in specified directory and parsing metrics.

    Downsamples to 10 million reads to avoid excessive processing times with
    large files, unless we're running a Standard/QC pipeline.
    """
    sentry_file = os.path.join(fastqc_out, "fastqc_report.html")
    if not os.path.exists(sentry_file):
        work_dir = os.path.dirname(fastqc_out)
        utils.safe_makedir(work_dir)
        ds_bam = (bam.downsample(bam_file, data, 1e7)
                  if data.get("analysis", "").lower() not in ["standard"] else None)
        bam_file = ds_bam if ds_bam else bam_file
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        with utils.curdir_tmpdir(data, work_dir) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                cl = [config_utils.get_program("fastqc", data["config"]),
                      "-t", str(num_cores), "-o", tx_tmp_dir, "-f", "bam", bam_file]
                do.run(cl, "FastQC: %s" % data["name"][-1])
                fastqc_outdir = os.path.join(tx_tmp_dir, "%s_fastqc" %
                                             os.path.splitext(os.path.basename(bam_file))[0])
                if os.path.exists("%s.zip" % fastqc_outdir):
                    os.remove("%s.zip" % fastqc_outdir)
                if not os.path.exists(sentry_file):
                    if os.path.exists(fastqc_out):
                        shutil.rmtree(fastqc_out)
                    shutil.move(fastqc_outdir, fastqc_out)
        if ds_bam and os.path.exists(ds_bam):
            os.remove(ds_bam)
    parser = FastQCParser(fastqc_out)
    stats = parser.get_fastqc_summary()
    return stats
def align_bam(in_bam, ref_file, names, align_dir, data):
    """Perform realignment of input BAM file; uses unix pipes to avoid IO.
    """
    config = data["config"]
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    novoalign = config_utils.get_program("novoalign", config)
    samtools = config_utils.get_program("samtools", config)
    resources = config_utils.get_resources("novoalign", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "4G")
    extra_novo_args = " ".join(_novoalign_args_from_config(config, False))
    if not file_exists(out_file):
        with utils.curdir_tmpdir(data, base_dir=align_dir) as work_dir:
            with postalign.tobam_cl(data, out_file, bam.is_paired(in_bam)) as (tobam_cl, tx_out_file):
                rg_info = get_rg_info(names)
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                cmd = ("{samtools} sort -n -o -l 0 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                       "| {novoalign} -o SAM '{rg_info}' -d {ref_file} -f /dev/stdin "
                       " -F BAMPE -c {num_cores} {extra_novo_args} | ")
                cmd = cmd.format(**locals()) + tobam_cl
                do.run(cmd, "Novoalign: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file),
                        do.file_reasonable_size(tx_out_file, in_bam)])
    return out_file
def _run_toplevel(config, config_file, work_dir, parallel,
                  fc_dir=None, run_info_yaml=None):
    """Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir)
                                                        if fc_dir else None,
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    run_items = run_info.organize(dirs, config, run_info_yaml)
    run_parallel = parallel_runner(parallel, dirs, config, config_file)
    # process each flowcell lane
    lane_items = lane.process_all_lanes(run_items, run_parallel)
    pipelines = _pair_lanes_with_pipelines(lane_items)
    final = []
    with utils.curdir_tmpdir() as tmpdir:
        tempfile.tempdir = tmpdir
        for pipeline, pipeline_items in pipelines.items():
            pipeline_items = _add_provenance(pipeline_items, dirs, run_parallel,
                                             parallel, config)
            versioncheck.testall(pipeline_items)
            for xs in pipeline.run(config, config_file, run_parallel, parallel,
                                   dirs, pipeline_items):
                if len(xs) == 1:
                    upload.from_sample(xs[0])
                    final.append(xs[0])
def align_bam(in_bam, ref_file, names, align_dir, config):
    """Perform direct alignment of an input BAM file with BWA using pipes.

    This avoids disk IO by piping between processes:
     - samtools sort of input BAM to queryname
     - bedtools conversion to interleaved FASTQ
     - bwa-mem alignment
     - samtools conversion to BAM
     - samtools sort to coordinate
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bedtools = config_utils.get_program("bedtools", config)
    bwa = config_utils.get_program("bwa", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "768M")
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file):
        _check_samtools_version()
        with utils.curdir_tmpdir() as work_dir:
            with file_transaction(out_file) as tx_out_file:
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                cmd = ("{samtools} sort -n -o -l 0 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                       "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                       "| {bwa} mem -p -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} - "
                       "| {samtools} view -b -S -u - "
                       "| {samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                logger.info(cmd.format(**locals()))
                subprocess.check_call(cmd.format(**locals()), shell=True)
    return out_file
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, config):
    """Perform piped alignment of fastq input files, generating sorted output BAM.
    """
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bwa = config_utils.get_program("bwa", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used alongside alignment
    max_mem = config_utils.adjust_memory(resources.get("memory", "2G"), 3, "decrease")
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file):
        novoalign.check_samtools_version(config)
        with utils.curdir_tmpdir() as work_dir:
            with file_transaction(out_file) as tx_out_file:
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                cmd = ("{bwa} mem -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} "
                       "{fastq_file} {pair_file} "
                       "| {samtools} view -b -S -u - "
                       "| {samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                cmd = cmd.format(**locals())
                do.run(cmd, "bwa mem alignment from fastq: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file)])
    return out_file
def align_bam(in_bam, ref_file, names, align_dir, config):
    """Perform realignment of input BAM file, handling sorting of input/output with novosort.

    Uses unix pipes to avoid IO writing between steps:
     - novosort of input BAM by read name
     - alignment with novoalign
     - conversion to BAM with samtools
     - coordinate sorting with novosort
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    if not file_exists(out_file):
        with curdir_tmpdir(base_dir=align_dir) as work_dir:
            with file_transaction(out_file) as tx_out_file:
                resources = config_utils.get_resources("novoalign", config)
                num_cores = resources["cores"]
                max_mem = resources.get("memory", "4G")
                read_sort = sh.novosort.bake(in_bam, c=num_cores, m=max_mem,
                                             compression=0, n=True, t=work_dir,
                                             _piped=True)
                rg_info = r"SAM '@RG\tID:{rg}\tPL:{pl}\tPU:{pu}\tSM:{sample}'".format(**names)
                align = sh.novoalign.bake(o=rg_info, d=ref_file, f="/dev/stdin",
                                          F="BAMPE", c=num_cores, _piped=True)
                to_bam = sh.samtools.view.bake(b=True, S=True, u=True, _piped=True).bake("-")
                coord_sort = sh.novosort.bake("/dev/stdin", c=num_cores, m=max_mem,
                                              o=tx_out_file, t=work_dir)
                subprocess.check_call("%s | %s | %s | %s" % (read_sort, align, to_bam, coord_sort),
                                      shell=True)
    return out_file
def split_bam_file(bam_file, split_size, out_dir, config):
    """Split a BAM file into paired end fastq splits based on split size.

    XXX Need to generalize for non-paired end inputs.
    """
    existing = _find_current_bam_split(bam_file, out_dir)
    if len(existing) > 0:
        return existing
    pipe = True
    utils.safe_makedir(out_dir)
    broad_runner = broad.runner_from_config(config)
    out_files = []

    def new_handle(num):
        out = []
        for pair in [1, 2]:
            fname = os.path.join(out_dir, "{base}_{pair}_{num}.fastq".format(
                base=os.path.splitext(os.path.basename(bam_file))[0],
                pair=pair, num=num))
            out += [fname, open(fname, "w")]
        return out

    with utils.curdir_tmpdir(base_dir=config_utils.get_resources("tmp", config).get("dir")) as tmp_dir:
        if pipe:
            sort_file = os.path.join(tmp_dir, "%s-sort.bam" %
                                     os.path.splitext(os.path.basename(bam_file))[0])
            os.mkfifo(sort_file)
            broad_runner.run_fn("picard_sort", bam_file, "queryname",
                                sort_file, compression_level=0, pipe=True)
        else:
            sort_file = os.path.join(out_dir, "%s-sort.bam" %
                                     os.path.splitext(os.path.basename(bam_file))[0])
            broad_runner.run_fn("picard_sort", bam_file, "queryname", sort_file)
        samfile = pysam.Samfile(sort_file, "rb")
        i = 0
        num = 0
        f1, out_handle1, f2, out_handle2 = new_handle(num)
        out_files.append([f1, f2, None])
        for x1, x2 in utils.partition_all(2, samfile):
            x1_seq, x1_qual = _get_seq_qual(x1)
            out_handle1.write("@%s/1\n%s\n+\n%s\n" % (i, x1_seq, x1_qual))
            x2_seq, x2_qual = _get_seq_qual(x2)
            out_handle2.write("@%s/2\n%s\n+\n%s\n" % (i, x2_seq, x2_qual))
            i += 1
            if i % split_size == 0:
                num += 1
                out_handle1.close()
                out_handle2.close()
                f1, out_handle1, f2, out_handle2 = new_handle(num)
                out_files.append([f1, f2, num])
        out_handle1.close()
        out_handle2.close()
        samfile.close()
        if pipe:
            os.unlink(sort_file)
        else:
            utils.save_diskspace(sort_file, "Split to {}".format(out_files[0][0]), config)
    return out_files
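# Minimal sketch (paths hypothetical, not from the source) of the FIFO pattern
# split_bam_file relies on: the queryname sort writes into a named pipe while
# pysam reads from it, so the full sorted BAM never lands on disk. This assumes
# the writer runs concurrently with the reader.
import os
import tempfile

def make_sort_fifo(base_name="sort.bam"):
    tmp_dir = tempfile.mkdtemp()
    fifo = os.path.join(tmp_dir, base_name)
    os.mkfifo(fifo)  # opening for read blocks until a writer attaches
    return fifo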
def gatk_indel_realignment(runner, align_bam, ref_file, intervals,
                           region=None, out_file=None, deep_coverage=False):
    """Perform realignment of BAM file in specified regions.
    """
    if out_file is None:
        out_file = "%s-realign.bam" % os.path.splitext(align_bam)[0]
    if not file_exists(out_file):
        with curdir_tmpdir() as tmp_dir:
            with file_transaction(out_file) as tx_out_file:
                logger.info("GATK IndelRealigner: %s %s" % (os.path.basename(align_bam), region))
                params = ["-T", "IndelRealigner",
                          "-I", align_bam,
                          "-R", ref_file,
                          "-targetIntervals", intervals,
                          "-o", tx_out_file,
                          "-l", "INFO",
                          ]
                if region:
                    params += ["-L", region]
                if deep_coverage:
                    params += ["--maxReadsInMemory", "300000",
                               "--maxReadsForRealignment", str(int(5e5)),
                               "--maxReadsForConsensuses", "500",
                               "--maxConsensuses", "100"]
                try:
                    runner.run_gatk(params, tmp_dir)
                except:
                    logger.exception("Running GATK IndelRealigner failed: {} {}".format(
                        os.path.basename(align_bam), region))
                    raise
    return out_file
def _gatk_base_recalibrator(broad_runner, dup_align_bam, ref_file, platform,
                            dbsnp_file, intervals):
    """Step 1 of GATK recalibration process, producing table of covariates.
    """
    out_file = "%s.grp" % os.path.splitext(dup_align_bam)[0]
    plot_file = "%s-plots.pdf" % os.path.splitext(dup_align_bam)[0]
    if not file_exists(out_file):
        if has_aligned_reads(dup_align_bam, intervals):
            with curdir_tmpdir() as tmp_dir:
                with file_transaction(out_file) as tx_out_file:
                    params = ["-T", "BaseRecalibrator",
                              "-o", tx_out_file,
                              "--plot_pdf_file", plot_file,
                              "-I", dup_align_bam,
                              "-R", ref_file,
                              ]
                    downsample_pct = _get_downsample_pct(broad_runner, dup_align_bam)
                    if downsample_pct:
                        params += ["--downsample_to_fraction", str(downsample_pct),
                                   "--downsampling_type", "ALL_READS"]
                    # GATK-lite does not have support for
                    # insertion/deletion quality modeling
                    if broad_runner.gatk_type() == "lite":
                        params += ["--disable_indel_quals"]
                    if dbsnp_file:
                        params += ["--knownSites", dbsnp_file]
                    if intervals:
                        params += ["-L", intervals, "--interval_set_rule", "INTERSECTION"]
                    broad_runner.run_gatk(params, tmp_dir)
        else:
            with open(out_file, "w") as out_handle:
                out_handle.write("# No aligned reads")
    return out_file
def _gatk_recalibrate(broad_runner, dup_align_bam, ref_file, recal_file,
                      platform, intervals):
    """Step 2 of GATK recalibration -- use covariates to re-write output file.
    """
    out_file = "%s-gatkrecal.bam" % os.path.splitext(dup_align_bam)[0]
    if not file_exists(out_file):
        if _recal_available(recal_file):
            with curdir_tmpdir() as tmp_dir:
                with file_transaction(out_file) as tx_out_file:
                    params = ["-T", "PrintReads",
                              "-BQSR", recal_file,
                              "-R", ref_file,
                              "-I", dup_align_bam,
                              "--out", tx_out_file,
                              ]
                    if intervals:
                        params += ["-L", intervals, "--interval_set_rule", "INTERSECTION"]
                    broad_runner.run_gatk(params, tmp_dir)
        else:
            shutil.copy(dup_align_bam, out_file)
    return out_file
def _extract_split_and_discordants(in_bam, work_dir, data):
    """Retrieve split-read alignments from input BAM file.
    """
    dedup_file = os.path.join(work_dir, "%s-dedup.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    sr_file = os.path.join(work_dir, "%s-sr.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    disc_file = os.path.join(work_dir, "%s-disc.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    samtools = config_utils.get_program("samtools", data["config"])
    cores = utils.get_in(data, ("config", "algorithm", "num_cores"), 1)
    resources = config_utils.get_resources("sambamba", data["config"])
    mem = config_utils.adjust_memory(resources.get("memory", "2G"), 3, "decrease").upper()
    # regenerate when any of the three outputs is missing
    if (not utils.file_exists(sr_file) or not utils.file_exists(disc_file)
            or not utils.file_exists(dedup_file)):
        with utils.curdir_tmpdir(data) as tmpdir:
            with file_transaction(sr_file) as tx_sr_file:
                with file_transaction(disc_file) as tx_disc_file:
                    with file_transaction(dedup_file) as tx_dedup_file:
                        samblaster_cl = postalign.samblaster_dedup_sort(data, tmpdir, tx_dedup_file,
                                                                        tx_sr_file, tx_disc_file)
                        out_base = os.path.join(tmpdir, "%s-namesort" % os.path.splitext(in_bam)[0])
                        cmd = ("{samtools} sort -n -o -@ {cores} -m {mem} {in_bam} {out_base} | "
                               "{samtools} view -h - | ")
                        cmd = cmd.format(**locals()) + samblaster_cl
                        do.run(cmd, "samblaster: split and discordant reads", data)
    for fname in [sr_file, disc_file, dedup_file]:
        bam.index(fname, data["config"])
    return dedup_file, sr_file, disc_file
def _run_toplevel(config, config_file, work_dir, parallel,
                  fc_dir=None, run_info_yaml=None):
    """Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir)
                                                        if fc_dir else None,
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    samples = run_info.organize(dirs, config, run_info_yaml)
    pipelines = _pair_lanes_with_pipelines(samples)
    final = []
    with utils.curdir_tmpdir() as tmpdir:
        tempfile.tempdir = tmpdir
        for pipeline, pipeline_items in pipelines.items():
            pipeline_items = _add_provenance(pipeline_items, dirs, parallel, config)
            versioncheck.testall(pipeline_items)
            for xs in pipeline.run(config, config_file, parallel, dirs, pipeline_items):
                if len(xs) == 1:
                    upload.from_sample(xs[0])
                    final.append(xs[0])
def align_pipe(fastq_file, pair_file, ref_file, names, align_dir, config):
    """Perform piped alignment of fastq input files, generating sorted output BAM.
    """
    pair_file = pair_file if pair_file else ""
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    novoalign = config_utils.get_program("novoalign", config)
    resources = config_utils.get_resources("novoalign", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "1G")
    extra_novo_args = " ".join(_novoalign_args_from_config(config, False))
    rg_info = get_rg_info(names)
    if not utils.file_exists(out_file):
        check_samtools_version()
        with utils.curdir_tmpdir() as work_dir:
            with file_transaction(out_file) as tx_out_file:
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                cmd = ("{novoalign} -o SAM '{rg_info}' -d {ref_file} -f {fastq_file} {pair_file} "
                       " -c {num_cores} {extra_novo_args} "
                       "| {samtools} view -b -S -u - "
                       "| {samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                cmd = cmd.format(**locals())
                do.run(cmd, "Novoalign: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file)])
    return out_file
def _gatk_count_covariates(picard, dup_align_bam, ref_file, platform, snp_file):
    """Step 1 of GATK recalibration process -- counting covariates.
    """
    out_file = "%s.recal" % os.path.splitext(dup_align_bam)[0]
    params = ["-T", "CountCovariates",
              "-cov", "ReadGroupCovariate",
              "-cov", "QualityScoreCovariate",
              "-cov", "CycleCovariate",
              "-cov", "DinucCovariate",
              "-cov", "TileCovariate",
              "-recalFile", out_file,
              "-I", dup_align_bam,
              "-R", ref_file,
              "-l", "INFO",
              "-U",
              "-OQ",
              "--default_platform", platform,
              ]
    if snp_file:
        params += ["-B:dbsnp,VCF", snp_file]
    if not os.path.exists(out_file):
        with curdir_tmpdir() as tmp_dir:
            with file_transaction(out_file):
                picard.run_gatk(params, tmp_dir)
    return out_file
def align_bam(in_bam, ref_file, names, align_dir, config):
    """Perform realignment of input BAM file, handling sorting of input/output with novosort.

    Uses unix pipes to avoid IO writing between steps:
     - novosort of input BAM by read name
     - alignment with novoalign
     - conversion to BAM with samtools
     - coordinate sorting with novosort
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    novosort = config_utils.get_program("novosort", config)
    novoalign = config_utils.get_program("novoalign", config)
    samtools = config_utils.get_program("samtools", config)
    resources = config_utils.get_resources("novoalign", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "4G")
    if not file_exists(out_file):
        with curdir_tmpdir(base_dir=align_dir) as work_dir:
            with file_transaction(out_file) as tx_out_file:
                rg_info = r"@RG\tID:{rg}\tPL:{pl}\tPU:{pu}\tSM:{sample}".format(**names)
                cmd = ("{novosort} -c {num_cores} -m {max_mem} --compression 0 "
                       " -n -t {work_dir} {in_bam} "
                       "| {novoalign} -o SAM '{rg_info}' -d {ref_file} -f /dev/stdin "
                       " -F BAMPE -c {num_cores} "
                       "| {samtools} view -b -S -u - "
                       "| {novosort} -c {num_cores} -m {max_mem} -t {work_dir} "
                       " -o {tx_out_file} /dev/stdin")
                subprocess.check_call(cmd.format(**locals()), shell=True)
    return out_file
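# Illustration (with hypothetical values, not from the source) of how the raw
# read group template above expands; novoalign receives it via -o SAM '...' and
# emits it as the @RG header line in its SAM output.
def example_rg_info():
    names = {"rg": "lane1", "pl": "illumina", "pu": "unit1", "sample": "sample1"}
    return r"@RG\tID:{rg}\tPL:{pl}\tPU:{pu}\tSM:{sample}".format(**names)

assert example_rg_info() == r"@RG\tID:lane1\tPL:illumina\tPU:unit1\tSM:sample1"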
def _run_kraken(data, ratio):
    """Run kraken, generating report in specified directory and parsing metrics.

    Uses only the first read of each pair.
    """
    logger.info("Fraction of aligned reads < 0.60 in %s: %s" % (str(data["name"]), ratio))
    logger.info("Running kraken to determine contaminant: %s" % str(data["name"]))
    qc_dir = utils.safe_makedir(os.path.join(data["dirs"]["work"], "qc", data["description"]))
    kraken_out = os.path.join(qc_dir, "kraken")
    stats = out = out_stats = None
    db = data["config"]["algorithm"]["kraken"]
    if db == "minikraken":
        db = os.path.join(_get_data_dir(), "genome", "kraken", "minikraken")
    else:
        if not os.path.exists(db):
            logger.info("kraken: no database found %s, skipping" % db)
            return {"kraken_report": "null"}
    if not os.path.exists(os.path.join(kraken_out, "kraken_out")):
        work_dir = os.path.dirname(kraken_out)
        utils.safe_makedir(work_dir)
        num_cores = data["config"]["algorithm"].get("num_cores", 1)
        files = data["files"]
        with utils.curdir_tmpdir(data, work_dir) as tx_tmp_dir:
            with utils.chdir(tx_tmp_dir):
                out = os.path.join(tx_tmp_dir, "kraken_out")
                out_stats = os.path.join(tx_tmp_dir, "kraken_stats")
                cl = " ".join([config_utils.get_program("kraken", data["config"]),
                               "--db", db, "--quick", "--preload",
                               "--min-hits", "2", "--threads", str(num_cores),
                               "--out", out, files[0], "2>", out_stats])
                do.run(cl, "kraken: %s" % data["name"][-1])
                if os.path.exists(kraken_out):
                    shutil.rmtree(kraken_out)
                shutil.move(tx_tmp_dir, kraken_out)
    metrics = _parse_kraken_output(kraken_out, db, data)
    return metrics
def _extract_split_and_discordants(in_bam, work_dir, data):
    """Retrieve split-read alignments from input BAM file.
    """
    dedup_file = os.path.join(work_dir, "%s-dedup.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    sr_file = os.path.join(work_dir, "%s-sr.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    disc_file = os.path.join(work_dir, "%s-disc.bam" % os.path.splitext(os.path.basename(in_bam))[0])
    samtools = config_utils.get_program("samtools", data["config"])
    cores = utils.get_in(data, ("config", "algorithm", "num_cores"), 1)
    resources = config_utils.get_resources("sambamba", data["config"])
    mem = config_utils.adjust_memory(resources.get("memory", "2G"), 3, "decrease")
    # regenerate when any of the three outputs is missing
    if (not utils.file_exists(sr_file) or not utils.file_exists(disc_file)
            or not utils.file_exists(dedup_file)):
        with utils.curdir_tmpdir() as tmpdir:
            with file_transaction(sr_file) as tx_sr_file:
                with file_transaction(disc_file) as tx_disc_file:
                    with file_transaction(dedup_file) as tx_dedup_file:
                        samblaster_cl = postalign.samblaster_dedup_sort(data, tmpdir, tx_dedup_file,
                                                                        tx_sr_file, tx_disc_file)
                        out_base = os.path.join(tmpdir, "%s-namesort" % os.path.splitext(in_bam)[0])
                        cmd = ("{samtools} sort -n -o -@ {cores} -m {mem} {in_bam} {out_base} | "
                               "{samtools} view -h - | ")
                        cmd = cmd.format(**locals()) + samblaster_cl
                        do.run(cmd, "samblaster: split and discordant reads", data)
    for fname in [sr_file, disc_file, dedup_file]:
        bam.index(fname, data["config"])
    return dedup_file, sr_file, disc_file
def align_bam(in_bam, ref_file, names, align_dir, config):
    """Perform direct alignment of an input BAM file with BWA using pipes.

    This avoids disk IO by piping between processes:
     - samtools sort of input BAM to queryname
     - bedtools conversion to interleaved FASTQ
     - bwa-mem alignment
     - samtools conversion to BAM
     - samtools sort to coordinate
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bedtools = config_utils.get_program("bedtools", config)
    bwa = config_utils.get_program("bwa", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    # adjust memory for samtools since used for input and output
    max_mem = config_utils.adjust_memory(resources.get("memory", "1G"), 3, "decrease")
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file):
        novoalign.check_samtools_version(config)
        with utils.curdir_tmpdir() as work_dir:
            with file_transaction(out_file) as tx_out_file:
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                cmd = ("{samtools} sort -n -o -l 0 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                       "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                       "| {bwa} mem -p -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} - "
                       "| {samtools} view -b -S -u - "
                       "| {samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                cmd = cmd.format(**locals())
                do.run(cmd, "bwa mem alignment from BAM: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file)])
    return out_file
def align_bam(in_bam, ref_file, names, align_dir, config):
    """Perform direct alignment of an input BAM file with BWA using pipes.

    This avoids disk IO by piping between processes:
     - samtools sort of input BAM to queryname
     - bedtools conversion to interleaved FASTQ
     - bwa-mem alignment
     - samtools conversion to BAM
     - samtools sort to coordinate
    """
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(names["lane"]))
    samtools = config_utils.get_program("samtools", config)
    bedtools = config_utils.get_program("bedtools", config)
    bwa = config_utils.get_program("bwa", config)
    resources = config_utils.get_resources("samtools", config)
    num_cores = config["algorithm"].get("num_cores", 1)
    max_mem = resources.get("memory", "768M")
    rg_info = novoalign.get_rg_info(names)
    if not utils.file_exists(out_file):
        _check_samtools_version()
        with utils.curdir_tmpdir() as work_dir:
            with file_transaction(out_file) as tx_out_file:
                tx_out_prefix = os.path.splitext(tx_out_file)[0]
                prefix1 = "%s-in1" % tx_out_prefix
                cmd = ("{samtools} sort -n -o -l 0 -@ {num_cores} -m {max_mem} {in_bam} {prefix1} "
                       "| {bedtools} bamtofastq -i /dev/stdin -fq /dev/stdout -fq2 /dev/stdout "
                       "| {bwa} mem -p -M -t {num_cores} -R '{rg_info}' -v 1 {ref_file} - "
                       "| {samtools} view -b -S -u - "
                       "| {samtools} sort -@ {num_cores} -m {max_mem} - {tx_out_prefix}")
                cmd = cmd.format(**locals())
                do.run(cmd, "bwa mem alignment from BAM: %s" % names["sample"], None,
                       [do.file_nonempty(tx_out_file)])
    return out_file
def gatk_recalibrate(picard, dup_align_bam, ref_file, recal_file, platform):
    """Step 2 of GATK recalibration -- use covariates to re-write output file.
    """
    out_file = "%s-gatkrecal.bam" % os.path.splitext(dup_align_bam)[0]
    params = ["-T", "TableRecalibration",
              "-recalFile", recal_file,
              "-R", ref_file,
              "-I", dup_align_bam,
              "--out", out_file,
              "-baq", "RECALCULATE",
              "-l", "INFO",
              "-U",
              "-OQ",
              "--default_platform", platform,
              ]
    if not os.path.exists(out_file):
        if _recal_available(recal_file):
            with curdir_tmpdir() as tmp_dir:
                picard.run_gatk(params, tmp_dir)
        else:
            shutil.copy(dup_align_bam, out_file)
    return out_file
def gatk_indel_realignment(runner, align_bam, ref_file, intervals,
                           deep_coverage=False):
    """Perform realignment of BAM file in specified regions.
    """
    out_file = "%s-realign.bam" % os.path.splitext(align_bam)[0]
    params = ["-T", "IndelRealigner",
              "-I", align_bam,
              "-R", ref_file,
              "-targetIntervals", intervals,
              "-o", out_file,
              "-l", "INFO",
              ]
    if deep_coverage:
        params += ["--maxReadsInMemory", "300000",
                   "--maxReadsForRealignment", str(int(5e5)),
                   "--maxReadsForConsensuses", "500",
                   "--maxConsensuses", "100"]
    if not (os.path.exists(out_file) and os.path.getsize(out_file) > 0):
        with curdir_tmpdir() as tmp_dir:
            with file_transaction(out_file):
                runner.run_gatk(params, tmp_dir)
    return out_file
def generate_align_summary(bam_file, is_paired, sam_ref, sample_name, config, dirs):
    """Run alignment summarizing script to produce a pdf with align details.
    """
    with utils.chdir(dirs["work"]):
        with utils.curdir_tmpdir() as tmp_dir:
            graphs, summary, overrep = _graphs_and_summary(bam_file, sam_ref,
                                                           is_paired, tmp_dir, config)
            return _generate_pdf(graphs, summary, overrep, bam_file,
                                 sample_name, dirs, config)
def _gatk_base_recalibrator(broad_runner, dup_align_bam, ref_file, platform,
                            snp_file, intervals):
    """Step 1 of GATK recalibration process, producing table of covariates.
    """
    out_file = "%s.grp" % os.path.splitext(dup_align_bam)[0]
    if not file_exists(out_file):
        if has_aligned_reads(dup_align_bam):
            with curdir_tmpdir() as tmp_dir:
                with file_transaction(out_file) as tx_out_file:
                    params = ["-T", "BaseRecalibrator",
                              "-o", tx_out_file,
                              "-I", dup_align_bam,
                              "-R", ref_file,
                              ]
                    # GATK-lite does not have support for
                    # insertion/deletion quality modeling
                    if not broad_runner.has_gatk_full():
                        params += ["--disable_indel_quals"]
                    if snp_file:
                        params += ["--knownSites", snp_file]
                    if intervals:
                        params += ["-L", intervals, "--interval_set_rule", "INTERSECTION"]
                    broad_runner.run_gatk(params, tmp_dir)
        else:
            with open(out_file, "w") as out_handle:
                out_handle.write("# No aligned reads")
    return out_file
def picard_fastq_to_bam(picard, fastq_one, fastq_two, out_dir, platform,
                        sample_name="", rg_name="", pu_name="", qual_format=None):
    """Convert fastq file(s) to BAM, adding sample, run group and platform information.
    """
    qual_formats = {"illumina": "Illumina"}
    if qual_format is None:
        try:
            qual_format = qual_formats[platform.lower()]
        except KeyError:
            raise ValueError("Need to specify quality format for %s" % platform)
    out_bam = os.path.join(out_dir, "%s-fastq.bam" %
                           os.path.splitext(os.path.basename(fastq_one))[0])
    if not file_exists(out_bam):
        with curdir_tmpdir() as tmp_dir:
            with file_transaction(out_bam) as tx_out_bam:
                opts = [("FASTQ", fastq_one),
                        ("QUALITY_FORMAT", qual_format),
                        ("READ_GROUP_NAME", rg_name),
                        ("SAMPLE_NAME", sample_name),
                        ("PLATFORM_UNIT", pu_name),
                        ("PLATFORM", platform),
                        ("TMP_DIR", tmp_dir),
                        ("OUTPUT", tx_out_bam)]
                if fastq_two:
                    opts.append(("FASTQ2", fastq_two))
                picard.run("FastqToSam", opts)
    return out_bam
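# Hypothetical usage (paths and the picard runner are placeholders, not from the
# source); fastq_two may be None for single-end input:
# out_bam = picard_fastq_to_bam(picard, "lane1_1.fastq", "lane1_2.fastq",
#                               "/work/align", "illumina",
#                               sample_name="sample1", rg_name="rg1", pu_name="unit1")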
def _gatk_base_recalibrator(broad_runner, dup_align_bam, ref_file, platform,
                            dbsnp_file, intervals):
    """Step 1 of GATK recalibration process, producing table of covariates.

    Large whole genome BAM files take an excessively long time to recalibrate
    and the extra inputs don't help much beyond a certain point. See the
    'Downsampling analysis' plots in the GATK documentation:
    http://gatkforums.broadinstitute.org/discussion/44/base-quality-score-recalibrator#latest

    This identifies large files and calculates the fraction to downsample to.

    TODO: Use new GATK 2.6+ AnalyzeCovariates tool to plot recalibration results.
    """
    target_counts = 1e8  # 100 million reads per read group, 20x the plotted max
    out_file = "%s.grp" % os.path.splitext(dup_align_bam)[0]
    if not file_exists(out_file):
        if has_aligned_reads(dup_align_bam, intervals):
            with curdir_tmpdir() as tmp_dir:
                with file_transaction(out_file) as tx_out_file:
                    params = ["-T", "BaseRecalibrator",
                              "-o", tx_out_file,
                              "-I", dup_align_bam,
                              "-R", ref_file,
                              ]
                    downsample_pct = bam.get_downsample_pct(broad_runner, dup_align_bam,
                                                            target_counts)
                    if downsample_pct:
                        params += ["--downsample_to_fraction", str(downsample_pct),
                                   "--downsampling_type", "ALL_READS"]
                    if platform.lower() == "solid":
                        params += ["--solid_nocall_strategy", "PURGE_READ",
                                   "--solid_recal_mode", "SET_Q_ZERO_BASE_N"]
                    # GATK-lite does not have support for
                    # insertion/deletion quality modeling
                    if broad_runner.gatk_type() == "lite":
                        params += ["--disable_indel_quals"]
                    if dbsnp_file:
                        params += ["--knownSites", dbsnp_file]
                    if intervals:
                        params += ["-L", intervals, "--interval_set_rule", "INTERSECTION"]
                    broad_runner.run_gatk(params, tmp_dir)
        else:
            with open(out_file, "w") as out_handle:
                out_handle.write("# No aligned reads")
    return out_file
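# Hedged sketch (the function name here is hypothetical; bam.get_downsample_pct
# is the real lookup above) of the downsampling arithmetic implied by
# target_counts: keep everything for small files, otherwise keep target / total.
def example_downsample_pct(total_reads, target_counts=1e8):
    if total_reads <= target_counts:
        return None  # small enough; skip --downsample_to_fraction
    return target_counts / float(total_reads)

assert example_downsample_pct(5e7) is None
assert abs(example_downsample_pct(4e8) - 0.25) < 1e-9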
def main(config_file):
    task_module = "bcbio.distributed.tasks"
    with open(config_file) as in_handle:
        config = yaml.load(in_handle)
    with utils.curdir_tmpdir() as work_dir:
        dirs = {"work": work_dir,
                "config": os.path.dirname(config_file)}
        with create_celeryconfig(task_module, dirs, config):
            run_celeryd(work_dir)
def picard_sort(picard, align_bam):
    base, ext = os.path.splitext(align_bam)
    out_file = "%s-sort%s" % (base, ext)
    if not os.path.exists(out_file):
        with curdir_tmpdir() as tmp_dir:
            opts = [("INPUT", align_bam),
                    ("OUTPUT", out_file),
                    ("TMP_DIR", tmp_dir),
                    ("SORT_ORDER", "coordinate")]
            picard.run("SortSam", opts)
    return out_file
def run_gatk(self, params, tmp_dir=None):
    with curdir_tmpdir() as local_tmp_dir:
        if tmp_dir is None:
            tmp_dir = local_tmp_dir
        cl = self.cl_gatk(params, tmp_dir)
        atype_index = cl.index("-T") if cl.count("-T") > 0 \
                      else cl.index("--analysis_type")
        prog = cl[atype_index + 1]
        do.run(cl, "GATK: {0}".format(prog), None)
# decorator needed so the generator can be used in a "with" statement
@contextlib.contextmanager
def bedtools_tmpdir(data):
    with utils.curdir_tmpdir(data) as tmpdir:
        orig_tmpdir = tempfile.gettempdir()
        pybedtools.set_tempdir(tmpdir)
        yield
    if orig_tmpdir and os.path.exists(orig_tmpdir):
        pybedtools.set_tempdir(orig_tmpdir)
    else:
        tempfile.tempdir = None
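# Hypothetical usage of the context manager above: any pybedtools temporary
# files created inside the block land in the per-run tmpdir and are cleaned up
# when it exits (the data dict and BED path are placeholders).
# with bedtools_tmpdir({"config": config}):
#     merged = pybedtools.BedTool("regions.bed").merge()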
def generate_align_summary(bam_file, sam_ref, sample_name, config, dirs):
    """Run alignment summarizing script to produce a pdf with align details.
    """
    with utils.chdir(dirs["work"]):
        with utils.curdir_tmpdir() as tmp_dir:
            graphs, summary, overrep = _graphs_and_summary(bam_file, sam_ref,
                                                           tmp_dir, config)
            return {"pdf": _generate_pdf(graphs, summary, overrep, bam_file,
                                         sample_name, dirs, config)}
def picard_formatconverter(picard, align_sam):
    """Convert aligned SAM file to BAM format.
    """
    out_bam = "%s.bam" % os.path.splitext(align_sam)[0]
    if not file_exists(out_bam):
        with curdir_tmpdir() as tmp_dir:
            with file_transaction(out_bam) as tx_out_bam:
                opts = [("INPUT", align_sam),
                        ("OUTPUT", tx_out_bam)]
                picard.run("SamFormatConverter", opts)
    return out_bam
def picard_downsample(picard, in_bam, ds_pct, random_seed=None):
    out_file = "%s-downsample%s" % os.path.splitext(in_bam)
    if not file_exists(out_file):
        with curdir_tmpdir() as tmp_dir:
            with file_transaction(out_file) as tx_out_file:
                opts = [("INPUT", in_bam),
                        ("OUTPUT", tx_out_file),
                        ("PROBABILITY", "%.3f" % ds_pct),
                        ("TMP_DIR", tmp_dir)]
                if random_seed:
                    opts += [("RANDOM_SEED", str(random_seed))]
                picard.run("DownsampleSam", opts)
    return out_file
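# Hypothetical usage (the picard runner and path are placeholders): keep roughly
# 10% of reads with a fixed seed so repeated runs are reproducible.
# ds_bam = picard_downsample(picard, "sample1.bam", 0.1, random_seed=42)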
def mark_duplicates(picard, align_bam):
    base, ext = os.path.splitext(align_bam)
    base = base.replace(".", "-")
    dup_bam = "%s-dup%s" % (base, ext)
    dup_metrics = "%s-dup.dup_metrics" % base
    if not os.path.exists(dup_bam):
        with curdir_tmpdir() as tmp_dir:
            opts = [("INPUT", align_bam),
                    ("OUTPUT", dup_bam),
                    ("TMP_DIR", tmp_dir),
                    ("METRICS_FILE", dup_metrics)]
            picard.run("MarkDuplicates", opts)
    return dup_bam