def _prep_config(items, paired, work_dir):
    """Create the Manta run directory by invoking configManta.py.

    Configuration is skipped when ``runWorkflow.py`` already exists in
    ``work_dir`` and is not reported as out of date. Returns the path to
    ``runWorkflow.py`` in either case.
    """
    assert utils.which("configManta.py"), "Could not find installed configManta.py"
    out_file = os.path.join(work_dir, "runWorkflow.py")
    needs_config = not utils.file_exists(out_file) or _out_of_date(out_file)
    if not needs_config:
        return out_file

    config_script = os.path.realpath(utils.which("configManta.py"))
    cmd = [utils.get_program_python("configManta.py"), config_script]

    # Input BAMs: tumor/normal, tumor-only, or one --bam per sample.
    if not paired:
        cmd.extend(["--bam=%s" % dd.get_align_bam(sample) for sample in items])
    elif paired.normal_bam:
        cmd.append("--normalBam=%s" % paired.normal_bam)
        cmd.append("--tumorBam=%s" % paired.tumor_bam)
    else:
        cmd.append("--tumorBam=%s" % paired.tumor_bam)

    data = paired.tumor_data if paired else items[0]
    cmd.append("--referenceFasta=%s" % dd.get_ref_file(data))
    cmd.append("--runDir=%s" % work_dir)
    if dd.get_coverage_interval(data) != "genome":
        cmd.append("--exome")
    for region in _maybe_limit_chromosomes(data):
        cmd.extend(["--region", region])

    # User supplied extra options from the manta resources section.
    resources = config_utils.get_resources("manta", data["config"])
    extra_opts = resources.get("options")
    if extra_opts:
        cmd.extend([str(opt) for opt in extra_opts])

    # If we are removing polyX, avoid calling on small indels which require
    # excessively long runtimes on noisy WGS runs
    if "polyx" in dd.get_exclude_regions(data):
        cmd.extend(["--config", _prep_streamlined_config(config_script, work_dir)])

    do.run(cmd, "Configure manta SV analysis")
    return out_file
def _run_workflow(data, workflow_file, work_dir):
    """Execute a prepared Strelka2 workflow script locally.

    The ``workspace`` directory under ``work_dir`` is removed both before
    and after the run.
    """
    workspace_dir = os.path.join(work_dir, "workspace")
    utils.remove_safe(workspace_dir)
    python_exe = utils.get_program_python("configureStrelkaGermlineWorkflow.py")
    cmd = [python_exe, workflow_file]
    cmd += ["-m", "local"]
    cmd += ["-j", dd.get_num_cores(data)]
    cmd += ["--quiet"]
    do.run(cmd, "Run Strelka2: %s" % dd.get_sample_name(data))
    utils.remove_safe(workspace_dir)
def _run_workflow(items, paired, workflow_file, work_dir):
    """Execute a prepared manta workflow script locally.

    Core count is taken from the tumor sample for paired analyses and from
    the first item otherwise. The ``workspace`` directory under ``work_dir``
    is removed both before and after the run.
    """
    workspace_dir = os.path.join(work_dir, "workspace")
    utils.remove_safe(workspace_dir)
    if paired:
        data = paired.tumor_data
    else:
        data = items[0]
    python_exe = utils.get_program_python("configManta.py")
    cmd = [python_exe, workflow_file]
    cmd += ["-m", "local"]
    cmd += ["-j", dd.get_num_cores(data)]
    do.run(cmd, "Run manta SV analysis")
    utils.remove_safe(workspace_dir)
def _tophat_major_version(config):
    """Return the major version number of the configured tophat as an int.

    Runs ``tophat --version`` with the python interpreter associated with
    tophat and parses the second whitespace-separated token of the output.
    """
    cmd = [
        utils.get_program_python("tophat"),
        config_utils.get_program("tophat", config, default="tophat"),
        "--version"
    ]
    version_output = subprocess.check_output(cmd)
    # check_output returns bytes on Python 3. Decode instead of calling str(),
    # which would produce the "b'...'" repr and only parse by accident.
    if isinstance(version_output, bytes):
        version_output = version_output.decode("utf-8", "replace")

    # tophat --version returns strings like this: Tophat v2.0.4
    version_string = version_output.strip().split()[1]
    # Drop the leading "v" from the first dotted component.
    major_version = int(version_string.split(".")[0][1:])
    return major_version
def prepare_dexseq(gtf):
    """Produce a DEXSeq-flattened GFF3 next to the input GTF.

    Returns the existing ``<gtf base>.dexseq.gff3`` when it is already
    present, ``None`` when no DEXSeq preparation script path is available,
    and otherwise the newly written output path.
    """
    gtf_base = os.path.splitext(gtf)[0]
    out_file = gtf_base + ".dexseq.gff3"
    if file_exists(out_file):
        return out_file

    dexseq_path = _dexseq_preparation_path()
    if not dexseq_path:
        return None

    # Run the preparation script with the python tied to htseq-count.
    executable = get_program_python("htseq-count")
    shell_cmd = "{executable} {dexseq_path} {gtf} {out_file}".format(
        executable=executable,
        dexseq_path=dexseq_path,
        gtf=gtf,
        out_file=out_file)
    subprocess.check_call(shell_cmd, shell=True)
    return out_file
def prepare_dexseq(gtf):
    """Produce a DEXSeq-flattened GFF3 next to the input GTF.

    Returns the existing ``<gtf base>.dexseq.gff3`` when it is already
    present, ``None`` when no DEXSeq preparation script path is available,
    and otherwise the newly written output path.
    """
    gtf_base = os.path.splitext(gtf)[0]
    out_file = gtf_base + ".dexseq.gff3"
    if file_exists(out_file):
        return out_file

    dexseq_path = _dexseq_preparation_path()
    if not dexseq_path:
        return None

    # Run the preparation script with the python tied to htseq-count.
    executable = get_program_python("htseq-count")
    shell_cmd = "{executable} {dexseq_path} {gtf} {out_file}".format(
        executable=executable,
        dexseq_path=dexseq_path,
        gtf=gtf,
        out_file=out_file)
    subprocess.check_call(shell_cmd, shell=True)
    return out_file
def _run_workflow(items, paired, workflow_file, work_dir):
    """Execute a prepared manta workflow script locally.

    Core count is taken from the tumor sample for paired analyses and from
    the first item otherwise. The ``workspace`` directory under ``work_dir``
    is removed both before and after the run.
    """
    workspace_dir = os.path.join(work_dir, "workspace")
    utils.remove_safe(workspace_dir)
    if paired:
        data = paired.tumor_data
    else:
        data = items[0]
    python_exe = utils.get_program_python("configManta.py")
    cmd = [python_exe, workflow_file]
    cmd += ["-m", "local"]
    cmd += ["-j", dd.get_num_cores(data)]
    do.run(cmd, "Run manta SV analysis")
    utils.remove_safe(workspace_dir)
def _configure_somatic(paired, ref_file, region, out_file, tx_work_dir):
    """Configure a Strelka2 somatic (tumor/normal) run directory.

    Creates ``tx_work_dir``, runs configureStrelkaSomaticWorkflow.py against
    the tumor and normal BAMs restricted to the BED for ``region``, and
    returns the path to the generated ``runWorkflow.py``.
    """
    utils.safe_makedir(tx_work_dir)
    cmd = [utils.get_program_python("configureStrelkaSomaticWorkflow.py"),
           os.path.realpath(utils.which("configureStrelkaSomaticWorkflow.py"))]
    cur_bed = get_region_bed(region, [paired.tumor_data, paired.normal_data], out_file)
    cmd += ["--referenceFasta=%s" % ref_file,
            "--callRegions=%s" % cur_bed,
            "--runDir=%s" % tx_work_dir,
            "--normalBam=%s" % paired.normal_bam,
            "--tumorBam=%s" % paired.tumor_bam]
    if _is_targeted_region(cur_bed, paired.tumor_data):
        cmd += ["--targeted"]
    # The message previously said "germline" (copied from the germline
    # configuration); this function configures the somatic workflow.
    do.run(cmd, "Configure Strelka2 somatic calling: %s" % paired.tumor_name)
    return os.path.join(tx_work_dir, "runWorkflow.py")
def make_hisat2_splicesites(gtf_file):
    """Extract known splice sites from a GTF for use with hisat2.

    Writes ``<gtf base>-splicesites.txt`` using
    ``hisat2_extract_splice_sites.py`` located next to the python
    interpreter associated with hisat2. Returns the output path, or ``None``
    when the extraction script is not present.
    """
    out_file = os.path.splitext(gtf_file)[0] + "-splicesites.txt"
    executable = get_program_python("hisat2")
    script_dir = os.path.dirname(executable)
    hisat2_script = os.path.join(script_dir, "hisat2_extract_splice_sites.py")

    # Reuse a previous result before checking for the script.
    if file_exists(out_file):
        return out_file
    if not file_exists(hisat2_script):
        return None

    shell_cmd = "{executable} {hisat2_script} {gtf_file} > {out_file}".format(
        executable=executable,
        hisat2_script=hisat2_script,
        gtf_file=gtf_file,
        out_file=out_file)
    subprocess.check_call(shell_cmd, shell=True)
    return out_file
def make_hisat2_splicesites(gtf_file):
    """Extract known splice sites from a GTF for use with hisat2.

    Writes ``<gtf base>-splicesites.txt`` using
    ``hisat2_extract_splice_sites.py`` located next to the python
    interpreter associated with hisat2. Returns the output path, or ``None``
    when the extraction script is not present.
    """
    out_file = os.path.splitext(gtf_file)[0] + "-splicesites.txt"
    executable = get_program_python("hisat2")
    script_dir = os.path.dirname(executable)
    hisat2_script = os.path.join(script_dir, "hisat2_extract_splice_sites.py")

    # Reuse a previous result before checking for the script.
    if file_exists(out_file):
        return out_file
    if not file_exists(hisat2_script):
        return None

    shell_cmd = "{executable} {hisat2_script} {gtf_file} > {out_file}".format(
        executable=executable,
        hisat2_script=hisat2_script,
        gtf_file=gtf_file,
        out_file=out_file)
    subprocess.check_call(shell_cmd, shell=True)
    return out_file
def _configure_germline(align_bams, items, ref_file, region, out_file, tx_work_dir):
    """Configure a Strelka2 germline run directory.

    Creates ``tx_work_dir``, runs configureStrelkaGermlineWorkflow.py with
    one ``--bam`` per entry in ``align_bams``, and returns the path to the
    generated ``runWorkflow.py``.
    """
    utils.safe_makedir(tx_work_dir)
    script_name = "configureStrelkaGermlineWorkflow.py"
    python_exe = utils.get_program_python(script_name)
    script_path = os.path.realpath(utils.which(script_name))
    cmd = [python_exe, script_path]

    cur_bed = get_region_bed(region, items, out_file)
    ploidy = _get_ploidy(shared.to_multiregion(region), items, out_file)
    cmd.append("--referenceFasta=%s" % ref_file)
    cmd.append("--callRegions=%s" % cur_bed)
    cmd.append("--ploidy=%s" % ploidy)
    cmd.append("--runDir=%s" % tx_work_dir)
    cmd.extend(["--bam=%s" % bam_file for bam_file in align_bams])
    if _is_targeted_region(cur_bed, items[0]):
        cmd.append("--targeted")

    sample_names = ", ".join([dd.get_sample_name(d) for d in items])
    do.run(cmd, "Configure Strelka2 germline calling: %s" % sample_names)
    return os.path.join(tx_work_dir, "runWorkflow.py")
def _sam_to_grouped_umi_cl(data, umi_consensus, tx_out_file):
    """Build the shell pipeline that marks duplicates and tags reads with UMIs.

    The returned command string sorts and duplicate-marks SAM input with
    bamsormadup, then either annotates from a separate UMI file (when
    ``umi_consensus`` is an existing file) or tags UMIs taken from the read
    names via ``umis bamtag``.
    """
    tmp_file = "%s-sorttmp" % utils.splitext_plus(tx_out_file)[0]
    jvm_opts = _get_fgbio_jvm_opts(data, os.path.dirname(tmp_file), 1)
    cores, _mem = _get_cores_memory(data)
    bamsormadup = config_utils.get_program("bamsormadup", data)

    sort_markdup = ("{bamsormadup} tmpfile={tmp_file}-markdup inputformat=sam threads={cores} outputformat=bam "
                    "level=0 SO=coordinate | ")
    has_umi_file = os.path.exists(umi_consensus) and os.path.isfile(umi_consensus)
    if has_umi_file:
        # UMIs in a separate file
        add_umis = "fgbio {jvm_opts} AnnotateBamWithUmis -i /dev/stdin -f {umi_consensus} -o {tx_out_file}"
    else:
        # UMIs embedded in read name
        umis_python = utils.get_program_python("umis")
        umis_prog = config_utils.get_program("umis", data["config"])
        add_umis = "%s %s bamtag - | samtools view -b > {tx_out_file}" % (umis_python, umis_prog)

    return (sort_markdup + add_umis).format(
        bamsormadup=bamsormadup,
        tmp_file=tmp_file,
        cores=cores,
        jvm_opts=jvm_opts,
        umi_consensus=umi_consensus,
        tx_out_file=tx_out_file)
def _prep_config(items, paired, work_dir):
    """Create the Manta run directory by invoking configManta.py.

    Configuration is skipped when ``runWorkflow.py`` already exists in
    ``work_dir`` and is not reported as out of date. Returns the path to
    ``runWorkflow.py`` in either case.
    """
    assert utils.which(
        "configManta.py"), "Could not find installed configManta.py"
    out_file = os.path.join(work_dir, "runWorkflow.py")
    needs_config = not utils.file_exists(out_file) or _out_of_date(out_file)
    if not needs_config:
        return out_file

    config_script = os.path.realpath(utils.which("configManta.py"))
    cmd = [utils.get_program_python("configManta.py"), config_script]

    # Input BAMs: tumor/normal, tumor-only, or one --bam per sample.
    if not paired:
        cmd.extend(["--bam=%s" % dd.get_align_bam(sample) for sample in items])
    elif paired.normal_bam:
        cmd.append("--normalBam=%s" % paired.normal_bam)
        cmd.append("--tumorBam=%s" % paired.tumor_bam)
    else:
        cmd.append("--tumorBam=%s" % paired.tumor_bam)

    data = paired.tumor_data if paired else items[0]
    cmd.append("--referenceFasta=%s" % dd.get_ref_file(data))
    cmd.append("--runDir=%s" % work_dir)
    if dd.get_coverage_interval(data) != "genome":
        cmd.append("--exome")
    for region in _maybe_limit_chromosomes(data):
        cmd.extend(["--region", region])

    # User supplied extra options from the manta resources section.
    resources = config_utils.get_resources("manta", data["config"])
    extra_opts = resources.get("options")
    if extra_opts:
        cmd.extend([str(opt) for opt in extra_opts])

    # If we are removing polyX, avoid calling on small indels which require
    # excessively long runtimes on noisy WGS runs
    if "polyx" in dd.get_exclude_regions(data):
        cmd.extend(["--config", _prep_streamlined_config(config_script, work_dir)])

    do.run(cmd, "Configure manta SV analysis")
    return out_file
def _get_cmd():
    """Return the base command for MetaSV: matching python, then the script."""
    script_name = "run_metasv.py"
    python_exe = utils.get_program_python(script_name)
    script_path = utils.which(script_name)
    return [python_exe, script_path]
def _umis_cmd(data):
    """Build the umis command prefix: locale export, python, then umis.

    The three pieces are joined with single spaces into one shell string.
    """
    locale_prefix = utils.locale_export()
    umis_python = utils.get_program_python("umis")
    umis_prog = config_utils.get_program("umis", data["config"], default="umis")
    parts = (locale_prefix, umis_python, umis_prog)
    return "%s %s %s" % parts
def tophat_align(fastq_file, pair_file, ref_file, out_base, align_dir, data,
                 names=None):
    """Run alignment using Tophat v2 and return the final BAM path.

    fastq_file -- first (or only) read file passed to tophat.
    pair_file -- second read file for paired-end data; falsy for single-end.
    ref_file -- reference index; may be swapped by
        ``_determine_aligner_and_reference`` and for fusion search.
    out_base -- prefix used for the ``<out_base>_tophat`` output directory.
    align_dir -- parent directory for the output directory.
    data -- sample dictionary; ``data["config"]`` and ``data["sam_ref"]``
        are read here.
    names -- read group names; ``names["sample"]`` is required to name the
        final output even though the parameter defaults to None.

    Raises NotImplementedError when the installed Tophat major version is 1.
    """
    config = data["config"]
    options = get_in(config, ("resources", "tophat", "options"), {})
    options = _set_fusion_mode(options, data)
    options = _set_quality_flag(options, data)
    options = _set_transcriptome_option(options, data, ref_file)
    options = _set_cores(options, config)
    options = _set_rg_options(options, names)
    options = _set_stranded_flag(options, config)

    ref_file, _ = _determine_aligner_and_reference(ref_file, data)

    # fusion search does not work properly with Bowtie2
    if options.get("fusion-search", False):
        ref_file = ref_file.replace("/bowtie2", "/bowtie")

    if _tophat_major_version(config) == 1:
        raise NotImplementedError("Tophat versions < 2.0 are not supported, please "
                                  "download the newest version of Tophat here: "
                                  "http://tophat.cbcb.umd.edu")

    if _ref_version(ref_file) == 1 or options.get("fusion-search", False):
        options["bowtie1"] = True

    out_dir = os.path.join(align_dir, "%s_tophat" % out_base)
    final_out = os.path.join(out_dir, "{0}.bam".format(names["sample"]))
    if file_exists(final_out):
        return final_out

    out_file = os.path.join(out_dir, "accepted_hits.bam")
    unmapped = os.path.join(out_dir, "unmapped.bam")
    files = [ref_file, fastq_file]
    if not file_exists(out_file):
        with file_transaction(config, out_dir) as tx_out_dir:
            safe_makedir(tx_out_dir)
            # Only estimate the inner distance when the user has not set it.
            if pair_file and not options.get("mate-inner-dist", None):
                d, d_stdev = _estimate_paired_innerdist(fastq_file, pair_file,
                                                        ref_file, out_base,
                                                        tx_out_dir, data)
                options["mate-inner-dist"] = d
                options["mate-std-dev"] = d_stdev
            # Always hand the mate file to tophat for paired input, including
            # when mate-inner-dist was supplied in the options; otherwise the
            # run would silently be aligned as single-end.
            if pair_file:
                files.append(pair_file)
            options["output-dir"] = tx_out_dir
            options["no-coverage-search"] = True
            options["no-mixed"] = True
            cmd = [utils.get_program_python("tophat"), config_utils.get_program("tophat", config)]
            for k, v in options.items():
                if v is True:
                    cmd.append("--%s" % k)
                else:
                    # False flags are not expected here; only valued options.
                    assert not isinstance(v, bool)
                    cmd.append("--%s=%s" % (k, v))
            # tophat requires options before arguments, otherwise it silently ignores them
            cmd += files
            do.run(cmd, "Running Tophat on %s and %s." % (fastq_file, pair_file))

    if pair_file and _has_alignments(out_file):
        fixed = _fix_mates(out_file, os.path.join(out_dir, "%s-align.bam" % out_base),
                           ref_file, config)
    else:
        fixed = out_file
    fixed_unmapped = _fix_unmapped(fixed, unmapped, data)
    fixed = merge_unmapped(fixed, fixed_unmapped, config)
    fixed = _add_rg(fixed, config, names)
    fixed = bam.sort(fixed, config)
    picard = broad.runner_from_path("picard", config)
    # set the contig order to match the reference file so GATK works
    fixed = picard.run_fn("picard_reorder", fixed, data["sam_ref"],
                          os.path.splitext(fixed)[0] + ".picard.bam")
    fixed = fix_insert_size(fixed, config)
    if not file_exists(final_out):
        symlink_plus(fixed, final_out)
    return final_out