def assemble_transcripts(align_file, ref_file, config):
    """Create transcript assemblies using Cufflinks.

    Runs cufflinks on the alignment file against the reference, writing
    results into a ``<base>-cufflinks`` directory beside the alignments and
    returning the path to the produced ``transcripts.gtf``. Skips the run
    when that output already exists.
    """
    work_dir, fname = os.path.split(align_file)
    out_dir = os.path.join(work_dir,
                           "{base}-cufflinks".format(base=os.path.splitext(fname)[0]))
    cmd = [config_utils.get_program("cufflinks", config), align_file,
           "-o", out_dir, "-b", ref_file, "-u"]
    num_cores = config["algorithm"].get("num_cores", 1)
    if num_cores > 1:
        cmd += ["-p", str(num_cores)]
    # Optional configured annotation files: -g guides assembly with known
    # transcripts, -M masks transcripts from the mask file.
    tx_file = configured_ref_file("transcripts", config, ref_file)
    if tx_file:
        cmd += ["-g", tx_file]
    tx_mask_file = configured_ref_file("transcripts_mask", config, ref_file)
    if tx_mask_file:
        cmd += ["-M", tx_mask_file]
    out_tx_file = os.path.join(out_dir, "transcripts.gtf")
    if not os.path.exists(out_tx_file):
        subprocess.check_call(cmd)
    assert os.path.exists(out_tx_file)
    return out_tx_file
def variantcall_sample(data, region=None, out_file=None):
    """Parallel entry point for doing genotyping of a region of a sample.

    Dispatches to the configured variant caller, writes raw calls next to
    the final output, optionally applies GATK read-backed phasing, then
    symlinks raw calls (and index) to the final location.
    """
    from bcbio.variation import freebayes, cortex, samtools, varscan
    safe_makedir(os.path.dirname(out_file))
    callers = {"gatk": unified_genotyper,
               "gatk-haplotype": haplotype_caller,
               "freebayes": freebayes.run_freebayes,
               "cortex": cortex.run_cortex,
               "samtools": samtools.run_samtools,
               "varscan": varscan.run_varscan}
    sam_ref = data["sam_ref"]
    config = data["config"]
    caller_fn = callers[config["algorithm"].get("variantcaller", "gatk")]
    work_bam = data["work_bam"]
    # Normalize single BAM path versus pre-grouped list of BAM paths.
    align_bams = [work_bam] if isinstance(work_bam, basestring) else work_bam
    stem, suffix = os.path.splitext(out_file)
    call_file = "%s-raw%s" % (stem, suffix)
    caller_fn(align_bams, sam_ref, config,
              configured_ref_file("dbsnp", config, sam_ref),
              region, call_file)
    if data["config"]["algorithm"].get("phasing", False) == "gatk":
        call_file = phasing.read_backed_phasing(call_file, align_bams, sam_ref,
                                                region, config)
    if not os.path.exists(out_file):
        # Link both the call file and its index, when present.
        for ext in ["", ".idx"]:
            if os.path.exists(call_file + ext):
                os.symlink(call_file + ext, out_file + ext)
    data["vrn_file"] = out_file
    return [data]
def prep_recal(data):
    """Perform a GATK recalibration of the sorted aligned BAM, producing recalibrated BAM.

    Optionally removes duplicates first, indexes inputs with Picard, and
    stores the base-recalibration result under ``data["prep_recal"]``.
    """
    if data["config"]["algorithm"].get("recalibrate", True):
        logger.info("Recalibrating %s with GATK" % str(data["name"]))
        config = data["config"]
        ref_file = data["sam_ref"]
        dbsnp_file = configured_ref_file("dbsnp", config, ref_file)
        broad_runner = broad.runner_from_config(config)
        platform = config["algorithm"]["platform"]
        broad_runner.run_fn("picard_index_ref", ref_file)
        if config["algorithm"].get("mark_duplicates", True):
            # Duplicates are removed outright here (remove_dups=True),
            # not merely flagged.
            dup_align_bam, _ = broad_runner.run_fn("picard_mark_duplicates",
                                                   data["work_bam"],
                                                   remove_dups=True)
        else:
            dup_align_bam = data["work_bam"]
        broad_runner.run_fn("picard_index", dup_align_bam)
        intervals = config["algorithm"].get("variant_regions", None)
        data["work_bam"] = dup_align_bam
        data["prep_recal"] = _gatk_base_recalibrator(broad_runner, dup_align_bam,
                                                     ref_file, platform,
                                                     dbsnp_file, intervals)
    return [[data]]
def variantcall_sample(data, region=None, out_file=None):
    """Parallel entry point for doing genotyping of a region of a sample.

    Looks up the configured caller, runs it on raw intermediate output,
    optionally phases with GATK, and symlinks results to the final file.
    """
    from bcbio.variation import freebayes, cortex, samtools, varscan
    safe_makedir(os.path.dirname(out_file))
    config = data["config"]
    sam_ref = data["sam_ref"]
    caller_name = config["algorithm"].get("variantcaller", "gatk")
    caller_fn = {"gatk": unified_genotyper,
                 "gatk-haplotype": haplotype_caller,
                 "freebayes": freebayes.run_freebayes,
                 "cortex": cortex.run_cortex,
                 "samtools": samtools.run_samtools,
                 "varscan": varscan.run_varscan}[caller_name]
    if isinstance(data["work_bam"], basestring):
        align_bams = [data["work_bam"]]
    else:
        align_bams = data["work_bam"]
    # Raw calls go to a "-raw" sibling of the requested output file.
    call_file = "%s-raw%s" % os.path.splitext(out_file)
    caller_fn(align_bams, sam_ref, config,
              configured_ref_file("dbsnp", config, sam_ref), region, call_file)
    if config["algorithm"].get("phasing", False) == "gatk":
        call_file = phasing.read_backed_phasing(call_file, align_bams, sam_ref,
                                                region, config)
    if not os.path.exists(out_file):
        for ext in ["", ".idx"]:
            if os.path.exists(call_file + ext):
                os.symlink(call_file + ext, out_file + ext)
    data["vrn_file"] = out_file
    return [data]
def _piped_realign_gatk(data, region, cl, out_base_file, tmp_dir):
    """Perform realignment with GATK, using input commandline.

    GATK requires writing to disk and indexing before realignment: the
    piped command output is first written to a "-prealign" BAM, indexed,
    then used to build realignment targets and the realignment command.
    """
    runner = broad.runner_from_config(data["config"])
    pa_bam = "%s-prealign%s" % os.path.splitext(out_base_file)
    if not utils.file_exists(pa_bam):
        # Write the piped command's output through a file transaction.
        with file_transaction(pa_bam) as tx_out_file:
            subprocess.check_call("{cl} > {tx_out_file}".format(**locals()),
                                  shell=True)
    runner.run_fn("picard_index", pa_bam)
    dbsnp_vcf = shared.configured_ref_file("dbsnp", data["config"],
                                           data["sam_ref"])
    recal_file = realign.gatk_realigner_targets(runner, pa_bam,
                                                data["sam_ref"],
                                                dbsnp=dbsnp_vcf,
                                                region=region_to_gatk(region))
    recal_cl = realign.gatk_indel_realignment_cl(runner, pa_bam,
                                                 data["sam_ref"], recal_file,
                                                 tmp_dir,
                                                 region=region_to_gatk(region))
    return pa_bam, " ".join(recal_cl)
def realign_sample(data, region=None, out_file=None):
    """Realign sample BAM file at indels.

    Resolves the configured realignment approach (defaulting to "gatk"
    when set to True) and applies it, special-casing the "nochr" region
    which is handled by write_nochr_reads.
    """
    realigner = data["config"]["algorithm"].get("realign", True)
    if realigner is True:
        realigner = "gatk"
    realign_fn = _realign_approaches[realigner] if realigner else None
    if data["config"]["algorithm"]["snpcall"] and realign_fn:
        logger.info("Realigning %s with %s: %s %s"
                    % (data["name"], realigner,
                       os.path.basename(data["work_bam"]), region))
        config = data["config"]
        sam_ref = data["sam_ref"]
        if region == "nochr":
            realign_bam = write_nochr_reads(data["work_bam"], out_file)
        else:
            realign_bam = realign_fn(data["work_bam"], sam_ref, config,
                                     configured_ref_file("dbsnp", config, sam_ref),
                                     region, out_file)
        if region is None:
            # Only reclaim space when processing the whole file, not a region.
            save_diskspace(data["work_bam"], "Realigned to %s" % realign_bam,
                           config)
        data["work_bam"] = realign_bam
    return [data]
def recalibrate_quality(sort_bam_file, fastq1, fastq2, sam_ref, dirs, config):
    """Recalibrate alignments with GATK and provide pdf summary.

    Returns the recalibrated file; summary plot generation is opt-in via
    the ``recalibration_plots`` algorithm option.
    """
    dbsnp = configured_ref_file("dbsnp", config, sam_ref)
    recal_file = gatk_recalibrate(sort_bam_file, sam_ref, config, dbsnp)
    if config["algorithm"].get("recalibration_plots", False):
        _analyze_recalibration(recal_file, fastq1, fastq2, dirs, config)
    return recal_file
def unified_genotyper_sample(data, region=None, out_file=None):
    """Parallel entry point for doing genotyping of a region of a sample.

    Runs the GATK UnifiedGenotyper when SNP calling is enabled, storing
    the resulting variant file under ``data["vrn_file"]``.
    """
    if data["config"]["algorithm"]["snpcall"]:
        config = data["config"]
        sam_ref = data["sam_ref"]
        dbsnp = configured_ref_file("dbsnp", config, sam_ref)
        data["vrn_file"] = unified_genotyper(data["work_bam"], sam_ref,
                                             config, dbsnp, region, out_file)
    return [data]
def realign_sample(data, region=None, out_file=None):
    """Realign sample BAM file at indels.

    Applies GATK realignment when SNP calling is enabled, updating
    ``data["work_bam"]`` in place.
    """
    log.info("Realigning %s with GATK" % str(data["name"]))
    if data["config"]["algorithm"]["snpcall"]:
        config = data["config"]
        sam_ref = data["sam_ref"]
        data["work_bam"] = gatk_realigner(
            data["work_bam"], sam_ref, config,
            configured_ref_file("dbsnp", config, sam_ref),
            region, out_file)
    return [data]
def bamutil_dedup_recal_cl(in_file, out_file, data, do_recal):
    """Prepare commandline for running deduplication and recalibration with bamutil.

    http://genome.sph.umich.edu/wiki/BamUtil:_dedup
    """
    # Deliberately disabled: always raises before any work is done.
    raise NotImplementedError("Not functional for piped BAM analysis")
    # NOTE(review): the code below is unreachable until the raise above is
    # removed; it is kept as the template for the eventual implementation.
    config = data["config"]
    bam_cmd = config_utils.get_program("bam", config)
    ref_file = data["sam_ref"]
    dbsnp_file = configured_ref_file("dbsnp", config, ref_file)
    cmd = "{bam_cmd} dedup --in {in_file} --out {out_file} --oneChrom"
    if do_recal:
        cmd += " --recab --refFile {ref_file} --dbsnp {dbsnp_file}"
    return cmd.format(**locals())
def variantcall_sample(data, region=None, out_file=None):
    """Parallel entry point for doing genotyping of a region of a sample.

    Chooses between the GATK UnifiedGenotyper and FreeBayes based on the
    ``variantcaller`` algorithm option (default "gatk").
    """
    from bcbio.variation import freebayes
    caller_fns = {"gatk": unified_genotyper,
                  "freebayes": freebayes.run_freebayes}
    if data["config"]["algorithm"]["snpcall"]:
        config = data["config"]
        sam_ref = data["sam_ref"]
        caller_fn = caller_fns[config["algorithm"].get("variantcaller", "gatk")]
        data["vrn_file"] = caller_fn(data["work_bam"], sam_ref, config,
                                     configured_ref_file("dbsnp", config, sam_ref),
                                     region, out_file)
    return [data]
def assemble_transcripts(align_file, ref_file, config):
    """Create transcript assemblies using Cufflinks.

    Output goes to a ``<base>-cufflinks`` directory alongside the input
    alignments; returns the ``transcripts.gtf`` path, reusing an existing
    result when present.
    """
    work_dir, fname = os.path.split(align_file)
    base = os.path.splitext(fname)[0]
    out_dir = os.path.join(work_dir, "{base}-cufflinks".format(base=base))
    cl = [config_utils.get_program("cufflinks", config), align_file,
          "-o", out_dir, "-b", ref_file, "-u"]
    num_cores = config["algorithm"].get("num_cores", 1)
    if num_cores > 1:
        cl.extend(["-p", str(num_cores)])
    # Add configured annotation inputs when available:
    #   -g guide transcripts, -M transcripts to mask.
    for flag, ref_name in [("-g", "transcripts"), ("-M", "transcripts_mask")]:
        extra_file = configured_ref_file(ref_name, config, ref_file)
        if extra_file:
            cl.extend([flag, extra_file])
    out_tx_file = os.path.join(out_dir, "transcripts.gtf")
    if not os.path.exists(out_tx_file):
        subprocess.check_call(cl)
    assert os.path.exists(out_tx_file)
    return out_tx_file
def _piped_realign_gatk(data, region, cl, out_base_file, tmp_dir):
    """Perform realignment with GATK, using input commandline.

    GATK requires writing to disk and indexing before realignment: the
    piped command is flushed to a "-prealign" BAM and indexed before
    target creation and realignment command preparation.
    """
    broad_runner = broad.runner_from_config(data["config"])
    pa_bam = "%s-prealign%s" % os.path.splitext(out_base_file)
    if not utils.file_exists(pa_bam):
        with file_transaction(pa_bam) as tx_out_file:
            subprocess.check_call("{cl} > {tx_out_file}".format(**locals()),
                                  shell=True)
    broad_runner.run_fn("picard_index", pa_bam)
    dbsnp = shared.configured_ref_file("dbsnp", data["config"],
                                       data["sam_ref"])
    recal_file = realign.gatk_realigner_targets(broad_runner, pa_bam,
                                                data["sam_ref"], dbsnp=dbsnp,
                                                region=_region_to_gatk(region))
    recal_cl = realign.gatk_indel_realignment_cl(broad_runner, pa_bam,
                                                 data["sam_ref"], recal_file,
                                                 tmp_dir,
                                                 region=_region_to_gatk(region))
    return pa_bam, " ".join(recal_cl)
def realign_sample(data, region=None, out_file=None):
    """Realign sample BAM file at indels.

    Uses GATK realignment when SNP calling is enabled; the special
    "nochr" region is delegated to write_nochr_reads instead.
    """
    logger.info("Realigning %s with GATK: %s %s"
                % (data["name"], os.path.basename(data["work_bam"]), region))
    if data["config"]["algorithm"]["snpcall"]:
        config = data["config"]
        sam_ref = data["sam_ref"]
        if region == "nochr":
            realign_bam = write_nochr_reads(data["work_bam"], out_file)
        else:
            dbsnp = configured_ref_file("dbsnp", config, sam_ref)
            realign_bam = gatk_realigner(data["work_bam"], sam_ref, config,
                                         dbsnp, region, out_file)
        if region is None:
            # Whole-file run: the pre-realignment BAM can be reclaimed.
            save_diskspace(data["work_bam"], "Realigned to %s" % realign_bam,
                           config)
        data["work_bam"] = realign_bam
    return [data]
def _piped_realign_gatk(data, region, cl, out_base_file, tmp_dir, prep_params):
    """Perform realignment with GATK, using input commandline.

    GATK requires writing to disk and indexing before realignment. The
    input command is run to produce a "-prealign" BAM, which is indexed
    and used for realignment target and command preparation.
    """
    broad_runner = broad.runner_from_config(data["config"])
    pa_bam = "%s-prealign%s" % os.path.splitext(out_base_file)
    if not utils.file_exists(pa_bam):
        with file_transaction(pa_bam) as tx_out_file:
            # prep_params["dup"] selects shell redirection (">") versus
            # passing the output with "-o" to the command.
            pipe = ">" if prep_params["dup"] else "-o"
            cmd = "{cl} {pipe} {tx_out_file}".format(**locals())
            do.run(cmd, "GATK pre-alignment {0}".format(region), data)
    broad_runner.run_fn("picard_index", pa_bam)
    dbsnp_vcf = shared.configured_ref_file("dbsnp", data["config"],
                                           data["sam_ref"])
    recal_file = realign.gatk_realigner_targets(broad_runner, pa_bam,
                                                data["sam_ref"],
                                                dbsnp=dbsnp_vcf,
                                                region=region_to_gatk(region))
    recal_cl = realign.gatk_indel_realignment_cl(broad_runner, pa_bam,
                                                 data["sam_ref"], recal_file,
                                                 tmp_dir,
                                                 region=region_to_gatk(region))
    return pa_bam, " ".join(recal_cl)
def realign_sample(data, region=None, out_file=None):
    """Realign sample BAM file at indels.

    GATK realignment runs when SNP calling is enabled; the "nochr"
    pseudo-region is routed to write_nochr_reads instead.
    """
    logger.info("Realigning %s with GATK: %s %s"
                % (data["name"], os.path.basename(data["work_bam"]), region))
    if data["config"]["algorithm"]["snpcall"]:
        sam_ref, config = data["sam_ref"], data["config"]
        realign_bam = (write_nochr_reads(data["work_bam"], out_file)
                       if region == "nochr"
                       else gatk_realigner(data["work_bam"], sam_ref, config,
                                           configured_ref_file("dbsnp", config,
                                                               sam_ref),
                                           region, out_file))
        if region is None:
            save_diskspace(data["work_bam"], "Realigned to %s" % realign_bam,
                           config)
        data["work_bam"] = realign_bam
    return [data]
def prep_recal(data):
    """Perform a GATK recalibration of the sorted aligned BAM, producing recalibrated BAM.

    Runs when ``recalibrate`` is True (the default) or the string "gatk".
    Duplicates are marked (not removed) before base recalibration.
    """
    if data["config"]["algorithm"].get("recalibrate", True) in [True, "gatk"]:
        logger.info("Recalibrating %s with GATK" % str(data["name"]))
        config = data["config"]
        ref_file = data["sam_ref"]
        dbsnp_file = configured_ref_file("dbsnp", config, ref_file)
        broad_runner = broad.runner_from_config(config)
        platform = config["algorithm"]["platform"]
        broad_runner.run_fn("picard_index_ref", ref_file)
        if config["algorithm"].get("mark_duplicates", True):
            dup_align_bam, _ = broad_runner.run_fn("picard_mark_duplicates",
                                                   data["work_bam"])
        else:
            dup_align_bam = data["work_bam"]
        broad_runner.run_fn("picard_index", dup_align_bam)
        intervals = config["algorithm"].get("variant_regions", None)
        data["work_bam"] = dup_align_bam
        data["prep_recal"] = _gatk_base_recalibrator(broad_runner, dup_align_bam,
                                                     ref_file, platform,
                                                     dbsnp_file, intervals)
    return [[data]]
def variantcall_sample(data, region=None, out_file=None):
    """Parallel entry point for doing genotyping of a region of a sample.

    Dispatches to the configured caller when SNP calling is enabled,
    writing the result to ``data["vrn_file"]``.
    """
    from bcbio.variation import freebayes, cortex, samtools, varscan
    caller_fns = {"gatk": unified_genotyper,
                  "gatk-haplotype": haplotype_caller,
                  "freebayes": freebayes.run_freebayes,
                  "cortex": cortex.run_cortex,
                  "samtools": samtools.run_samtools,
                  "varscan": varscan.run_varscan}
    if data["config"]["algorithm"]["snpcall"]:
        sam_ref = data["sam_ref"]
        config = data["config"]
        caller_fn = caller_fns[config["algorithm"].get("variantcaller", "gatk")]
        bams = data["work_bam"]
        # Normalize to a list: a single BAM path or an existing list of paths.
        align_bams = [bams] if isinstance(bams, basestring) else bams
        data["vrn_file"] = caller_fn(align_bams, sam_ref, config,
                                     configured_ref_file("dbsnp", config, sam_ref),
                                     region, out_file)
    return [data]
def realign_sample(data, region=None, out_file=None):
    """Realign sample BAM file at indels.

    Resolves the configured realignment approach ("gatk" when the option
    is simply True) and applies it; returns the sample unchanged when
    realignment or SNP calling is disabled.
    """
    realigner = data["config"]["algorithm"].get("realign", True)
    realigner = "gatk" if realigner is True else realigner
    realign_fn = _realign_approaches[realigner] if realigner else None
    # Guard clause: nothing to do without SNP calling and a realigner.
    if not (data["config"]["algorithm"]["snpcall"] and realign_fn):
        return [data]
    logger.info("Realigning %s with %s: %s %s"
                % (data["name"], realigner,
                   os.path.basename(data["work_bam"]), region))
    config = data["config"]
    sam_ref = data["sam_ref"]
    if region == "nochr":
        realign_bam = write_nochr_reads(data["work_bam"], out_file)
    else:
        realign_bam = realign_fn(data["work_bam"], sam_ref, config,
                                 configured_ref_file("dbsnp", config, sam_ref),
                                 region, out_file)
    if region is None:
        save_diskspace(data["work_bam"], "Realigned to %s" % realign_bam,
                       config)
    data["work_bam"] = realign_bam
    return [data]
def _get_gtf_file(data):
    """Return the configured transcripts file for the sample's reference."""
    return configured_ref_file("transcripts", data["config"], data["sam_ref"])