def index(in_bam, config):
    """Index a BAM file, skipping if index present.

    Centralizes BAM indexing providing ability to switch indexing approaches.
    """
    assert is_bam(in_bam), "%s is not a BAM file" % in_bam
    index_file = "%s.bai" % in_bam
    alt_index_file = "%s.bai" % os.path.splitext(in_bam)[0]
    if (not utils.file_uptodate(index_file, in_bam) and
          not utils.file_uptodate(alt_index_file, in_bam)):
        # Remove old index files and re-run to prevent linking into tx directory
        for fname in [index_file, alt_index_file]:
            utils.remove_safe(fname)
        sambamba = _get_sambamba(config)
        samtools = config_utils.get_program("samtools", config)
        num_cores = config["algorithm"].get("num_cores", 1)
        with file_transaction(config, index_file) as tx_index_file:
            assert tx_index_file.find(".bam.bai") > 0
            tx_bam_file = tx_index_file.replace(".bam.bai", ".bam")
            utils.symlink_plus(in_bam, tx_bam_file)
            if sambamba:
                cmd = "{sambamba} index -t {num_cores} {tx_bam_file}"
            else:
                cmd = "{samtools} index {tx_bam_file}"
            do.run(cmd.format(**locals()), "Index BAM file: %s" % os.path.basename(in_bam))
    return index_file if utils.file_uptodate(index_file, in_bam) else alt_index_file
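# Usage sketch, assuming the surrounding bcbio module imports (os, utils,
# config_utils, file_transaction, do) are in scope and "sample.bam" is an
# existing coordinate-sorted BAM; only algorithm/num_cores is read from config:
# config = {"algorithm": {"num_cores": 4}}
# bai = index("sample.bam", config)
# Returns "sample.bam.bai", or "sample.bai" when that alternate index is newer.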
def _run_scalpel_paired(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Detect indels with Scalpel.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                ann_file = _run_scalpel_caller(align_bams, items, ref_file,
                                               assoc_files, region, out_file)
                return ann_file
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            perl_exports = utils.get_perl_exports(os.path.dirname(tx_out_file))
            tmp_path = "%s-scalpel-work" % utils.splitext_plus(out_file)[0]
            db_file = os.path.join(tmp_path, "main", "somatic.db")
            if not os.path.exists(db_file + ".dir"):
                if os.path.exists(tmp_path):
                    utils.remove_safe(tmp_path)
                opts = " ".join(_scalpel_options_from_config(items, config, out_file,
                                                             region, tmp_path))
                opts += " --ref {}".format(ref_file)
                opts += " --dir %s" % tmp_path
                # calling
                cl = ("{perl_exports} && "
                      "scalpel-discovery --somatic {opts} --tumor {paired.tumor_bam} "
                      "--normal {paired.normal_bam}")
                do.run(cl.format(**locals()), "Genotyping paired variants with Scalpel", {})
            # filtering to adjust input parameters
            bed_opts = " ".join(_scalpel_bed_file_opts(items, config, out_file,
                                                       region, tmp_path))
            use_defaults = True
            if use_defaults:
                scalpel_tmp_file = os.path.join(tmp_path, "main/somatic.indel.vcf")
            # Uses default filters but can tweak min-alt-count-tumor and min-phred-fisher
            # to swap precision for sensitivity
            else:
                scalpel_tmp_file = os.path.join(tmp_path, "main/somatic-indel-filter.vcf.gz")
                with file_transaction(config, scalpel_tmp_file) as tx_indel_file:
                    cmd = ("{perl_exports} && "
                           "scalpel-export --somatic {bed_opts} --ref {ref_file} --db {db_file} "
                           "--min-alt-count-tumor 5 --min-phred-fisher 10 --min-vaf-tumor 0.1 "
                           "| bgzip -c > {tx_indel_file}")
                    do.run(cmd.format(**locals()), "Scalpel somatic indel filter", {})
            scalpel_tmp_file = bgzip_and_index(scalpel_tmp_file, config)
            scalpel_tmp_file_common = bgzip_and_index(
                os.path.join(tmp_path, "main/common.indel.vcf"), config)
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            bcftools_cmd_chi2 = get_scalpel_bcftools_filter_expression("chi2", config)
            bcftools_cmd_common = get_scalpel_bcftools_filter_expression("reject", config)
            fix_ambig = vcfutils.fix_ambiguous_cl()
            cl2 = ("vcfcat <({bcftools_cmd_chi2} {scalpel_tmp_file}) "
                   "<({bcftools_cmd_common} {scalpel_tmp_file_common}) | "
                   " {fix_ambig} | {vcfstreamsort} {compress_cmd} > {tx_out_file}")
            do.run(cl2.format(**locals()), "Finalising Scalpel variants", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"), ref_file, config)
    return ann_file
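# Hypothetical top-level call for a tumor/normal pair; the file names and the
# region tuple are placeholders, and `items` is bcbio's per-sample metadata
# that get_paired_bams() uses to identify tumor vs. normal:
# vcf = _run_scalpel_paired(["tumor.bam", "normal.bam"], items, "GRCh37.fa",
#                           {"dbsnp": "dbsnp.vcf.gz"}, region=("7", 140000, 160000))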
def index(in_bam, config, check_timestamp=True):
    """Index a BAM file, skipping if index present.

    Centralizes BAM indexing providing ability to switch indexing approaches.
    """
    assert is_bam(in_bam), "%s is not a BAM file" % in_bam
    index_file = "%s.bai" % in_bam
    alt_index_file = "%s.bai" % os.path.splitext(in_bam)[0]
    if check_timestamp:
        bai_exists = utils.file_uptodate(index_file, in_bam) or utils.file_uptodate(alt_index_file, in_bam)
    else:
        bai_exists = utils.file_exists(index_file) or utils.file_exists(alt_index_file)
    if not bai_exists:
        # Remove old index files and re-run to prevent linking into tx directory
        for fname in [index_file, alt_index_file]:
            utils.remove_safe(fname)
        sambamba = _get_sambamba(config)
        assert sambamba, "Did not find sambamba for indexing"
        samtools = config_utils.get_program("samtools", config)
        num_cores = config["algorithm"].get("num_cores", 1)
        with file_transaction(config, index_file) as tx_index_file:
            assert tx_index_file.find(".bam.bai") > 0
            tx_bam_file = tx_index_file.replace(".bam.bai", ".bam")
            utils.symlink_plus(in_bam, tx_bam_file)
            try:
                cmd = "{sambamba} index -t {num_cores} {tx_bam_file}"
                do.run(cmd.format(**locals()),
                       "Index BAM file with sambamba: %s" % os.path.basename(in_bam))
            except subprocess.CalledProcessError:
                cmd = "{samtools} index {in_bam} {tx_index_file}"
                do.run(cmd.format(**locals()),
                       "Backup single thread index of BAM file with samtools: %s"
                       % os.path.basename(in_bam))
    return index_file if utils.file_exists(index_file) else alt_index_file
def _move_file_with_sizecheck(tx_file, final_file):
    """Move transaction file to final location, with size checks avoiding failed transfers.

    Creates an empty file with '.bcbiotmp' extension in the destination
    location, which serves as a flag. If a file like that is present, it
    means that the transaction didn't finish successfully.
    """
    logger.debug("Moving %s to %s" % (tx_file, final_file))
    tmp_file = final_file + ".bcbiotmp"
    open(tmp_file, 'wb').close()
    want_size = utils.get_size(tx_file)
    shutil.move(tx_file, final_file)
    transfer_size = utils.get_size(final_file)
    assert want_size == transfer_size, (
        'distributed.transaction.file_transaction: File copy error: '
        'file or directory on temporary storage ({}) size {} bytes '
        'does not equal size of file or directory after transfer to '
        'shared storage ({}) size {} bytes'.format(tx_file, want_size,
                                                   final_file, transfer_size))
    utils.remove_safe(tmp_file)
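# A self-contained sketch of the same size-checked move, with the bcbio helpers
# (utils.get_size, utils.remove_safe) swapped for stdlib calls; this variant
# handles plain files only, while bcbio's get_size also sums directories.
import os
import shutil

def move_with_sizecheck(src, dst):
    """Move src to dst, verifying the byte size survives the transfer."""
    flag = dst + ".bcbiotmp"  # marker: if left behind, the move did not finish
    open(flag, "wb").close()
    want = os.path.getsize(src)
    shutil.move(src, dst)
    got = os.path.getsize(dst)
    assert want == got, "size mismatch: %s (%d) vs %s (%d)" % (src, want, dst, got)
    os.remove(flag)  # clean exit: remove the marker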
def _install_kraken_db(datadir, args):
    """Install kraken minimal DB in genome folder.
    """
    kraken = os.path.join(datadir, "genomes/kraken")
    url = "https://ccb.jhu.edu/software/kraken/dl/minikraken.tgz"
    compress = os.path.join(kraken, os.path.basename(url))
    base, ext = utils.splitext_plus(os.path.basename(url))
    db = os.path.join(kraken, base)
    tooldir = args.tooldir or get_defaults()["tooldir"]
    if os.path.exists(os.path.join(tooldir, "bin", "kraken")):
        if not os.path.exists(kraken):
            utils.safe_makedir(kraken)
        if not os.path.exists(db):
            if not os.path.exists(compress):
                subprocess.check_call(["wget", "-O", compress, url, "--no-check-certificate"])
            cmd = ["tar", "-xzvf", compress, "-C", kraken]
            subprocess.check_call(cmd)
            shutil.move(os.path.join(kraken, "minikraken_20140330"),
                        os.path.join(kraken, "minikraken"))
            utils.remove_safe(compress)
    else:
        raise argparse.ArgumentTypeError("kraken not installed in tooldir %s." %
                                         os.path.join(tooldir, "bin", "kraken"))
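# The download-and-unpack idiom used above, as a self-contained sketch; the URL
# and destination directory are placeholders, and wget/tar must be on PATH.
import os
import subprocess

url = "https://example.org/minikraken.tgz"  # hypothetical stand-in URL
dest_dir = "/tmp/kraken"
archive = os.path.join(dest_dir, os.path.basename(url))
os.makedirs(dest_dir, exist_ok=True)
if not os.path.exists(archive):
    subprocess.check_call(["wget", "-O", archive, url, "--no-check-certificate"])
subprocess.check_call(["tar", "-xzvf", archive, "-C", dest_dir])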
def _move_file_with_sizecheck(tx_file, final_file):
    """Move transaction file to final location, with size checks avoiding failed transfers.

    Creates an empty file with '.bcbiotmp' extension in the destination
    location, which serves as a flag. If a file like that is present, it
    means that the transaction didn't finish successfully.
    """
    # logger.debug("Moving %s to %s" % (tx_file, final_file))
    tmp_file = final_file + ".bcbiotmp"
    open(tmp_file, 'wb').close()
    want_size = utils.get_size(tx_file)
    shutil.move(tx_file, final_file)
    transfer_size = utils.get_size(final_file)
    assert want_size == transfer_size, (
        'distributed.transaction.file_transaction: File copy error: '
        'file or directory on temporary storage ({}) size {} bytes '
        'does not equal size of file or directory after transfer to '
        'shared storage ({}) size {} bytes'.format(
            tx_file, want_size, final_file, transfer_size)
    )
    utils.remove_safe(tmp_file)
def _run_workflow(data, workflow_file, work_dir):
    """Run Strelka2 analysis inside prepared workflow directory.
    """
    utils.remove_safe(os.path.join(work_dir, "workspace"))
    cmd = [sys.executable, workflow_file, "-m", "local", "-j", dd.get_num_cores(data),
           "--quiet"]
    do.run(cmd, "Run Strelka2: %s" % dd.get_sample_name(data))
    utils.remove_safe(os.path.join(work_dir, "workspace"))
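# Hypothetical invocation, assuming the Strelka2 configure step has already
# written runWorkflow.py into the working directory (paths are placeholders):
# _run_workflow(data, "strelka_work/runWorkflow.py", "strelka_work")
# Deleting "workspace" before and after the run keeps Strelka2 scratch files
# from interfering with restarted analyses.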
def _install_kraken_db(datadir, args):
    """Install kraken minimal DB in genome folder.
    """
    kraken = os.path.join(datadir, "genomes/kraken")
    url = "https://ccb.jhu.edu/software/kraken/dl/minikraken.tgz"
    compress = os.path.join(kraken, os.path.basename(url))
    base, ext = utils.splitext_plus(os.path.basename(url))
    db = os.path.join(kraken, base)
    tooldir = args.tooldir or get_defaults()["tooldir"]
    requests.packages.urllib3.disable_warnings()
    last_mod = urllib.urlopen(url).info().getheader("Last-Modified")
    last_mod = dateutil.parser.parse(last_mod).astimezone(dateutil.tz.tzutc())
    if os.path.exists(os.path.join(tooldir, "bin", "kraken")):
        if not os.path.exists(db):
            is_new_version = True
        else:
            cur_file = glob.glob(os.path.join(kraken, "minikraken_*"))[0]
            cur_version = datetime.datetime.utcfromtimestamp(os.path.getmtime(cur_file))
            is_new_version = last_mod.date() > cur_version.date()
            if is_new_version:
                shutil.move(cur_file, cur_file.replace("minikraken", "old"))
        if not os.path.exists(kraken):
            utils.safe_makedir(kraken)
        if is_new_version:
            if not os.path.exists(compress):
                subprocess.check_call(["wget", "-O", compress, url, "--no-check-certificate"])
            cmd = ["tar", "-xzvf", compress, "-C", kraken]
            subprocess.check_call(cmd)
            last_version = glob.glob(os.path.join(kraken, "minikraken_*"))
            utils.symlink_plus(os.path.join(kraken, last_version[0]),
                               os.path.join(kraken, "minikraken"))
            utils.remove_safe(compress)
        else:
            print "You have the latest version %s." % last_mod
    else:
        raise argparse.ArgumentTypeError("kraken not installed in tooldir %s." %
                                         os.path.join(tooldir, "bin", "kraken"))
def index(in_bam, config, check_timestamp=True):
    """Index a BAM file, skipping if index present.

    Centralizes BAM indexing providing ability to switch indexing approaches.
    """
    assert is_bam(in_bam), "%s is not a BAM file" % in_bam
    index_file = "%s.bai" % in_bam
    alt_index_file = "%s.bai" % os.path.splitext(in_bam)[0]
    if check_timestamp:
        bai_exists = utils.file_uptodate(index_file, in_bam) or utils.file_uptodate(alt_index_file, in_bam)
    else:
        bai_exists = utils.file_exists(index_file) or utils.file_exists(alt_index_file)
    if not bai_exists:
        # Remove old index files and re-run to prevent linking into tx directory
        for fname in [index_file, alt_index_file]:
            utils.remove_safe(fname)
        samtools = config_utils.get_program("samtools", config)
        num_cores = config["algorithm"].get("num_cores", 1)
        with file_transaction(config, index_file) as tx_index_file:
            cmd = "{samtools} index -@ {num_cores} {in_bam} {tx_index_file}"
            do.run(cmd.format(**locals()), "Index BAM file: %s" % os.path.basename(in_bam))
    return index_file if utils.file_exists(index_file) else alt_index_file
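# The command this version builds, e.g. with num_cores=4:
#   samtools index -@ 4 input.bam /tx/dir/input.bam.bai
# Recent samtools releases accept an explicit output index path as the second
# positional argument, which is what lets the index land in the tx directory.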
def _run_workflow(data, workflow_file, work_dir):
    """Run Strelka2 analysis inside prepared workflow directory.
    """
    utils.remove_safe(os.path.join(work_dir, "workspace"))
    cmd = [utils.get_program_python("configureStrelkaGermlineWorkflow.py"),
           workflow_file, "-m", "local", "-j", dd.get_num_cores(data), "--quiet"]
    do.run(cmd, "Run Strelka2: %s" % dd.get_sample_name(data))
    utils.remove_safe(os.path.join(work_dir, "workspace"))
def _run_workflow(items, paired, workflow_file, work_dir):
    """Run manta analysis inside prepared workflow directory.
    """
    utils.remove_safe(os.path.join(work_dir, "workspace"))
    data = paired.tumor_data if paired else items[0]
    cmd = [sys.executable, workflow_file, "-m", "local", "-j", dd.get_num_cores(data)]
    do.run(cmd, "Run manta SV analysis")
    utils.remove_safe(os.path.join(work_dir, "workspace"))
def remove(in_bam):
    """Remove a BAM file and its index if they exist.
    """
    if utils.file_exists(in_bam):
        utils.remove_safe(in_bam)
    if utils.file_exists(in_bam + ".bai"):
        utils.remove_safe(in_bam + ".bai")
def _run_scalpel_caller(align_bams, items, ref_file, assoc_files,
                        region=None, out_file=None):
    """Detect indels with Scalpel.

    Single sample mode.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            if len(align_bams) > 1:
                message = ("Scalpel does not currently support batch calling!")
                raise ValueError(message)
            input_bams = " ".join("%s" % x for x in align_bams)
            tmp_path = "%s-scalpel-work" % utils.splitext_plus(out_file)[0]
            tx_tmp_path = "%s-scalpel-work" % utils.splitext_plus(tx_out_file)[0]
            if os.path.exists(tmp_path):
                utils.remove_safe(tmp_path)
            opts = " ".join(_scalpel_options_from_config(items, config, out_file,
                                                         region, tmp_path))
            opts += " --dir %s" % tx_tmp_path
            min_cov = "3"  # minimum coverage
            opts += " --mincov %s" % min_cov
            perl_exports = utils.get_perl_exports(os.path.dirname(tx_out_file))
            cmd = ("{perl_exports} && "
                   "scalpel-discovery --single {opts} --ref {ref_file} --bam {input_bams} ")
            do.run(cmd.format(**locals()), "Genotyping with Scalpel", {})
            shutil.move(tx_tmp_path, tmp_path)
            # parse produced variant file further
            scalpel_tmp_file = bgzip_and_index(os.path.join(tmp_path,
                                                            "variants.indel.vcf"), config)
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            bcftools_cmd_chi2 = get_scalpel_bcftools_filter_expression("chi2", config)
            sample_name_str = items[0]["name"][1]
            fix_ambig = vcfutils.fix_ambiguous_cl()
            cl2 = ("{bcftools_cmd_chi2} {scalpel_tmp_file} | "
                   r"sed 's/FORMAT\tsample\(_name\)\{{0,1\}}/FORMAT\t{sample_name_str}/g' "
                   "| {fix_ambig} | vcfallelicprimitives -t DECOMPOSED --keep-geno | "
                   "vcffixup - | vcfstreamsort {compress_cmd} > {tx_out_file}")
            do.run(cl2.format(**locals()), "Finalising Scalpel variants", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"), ref_file, config)
    return ann_file
def _run_workflow(items, paired, workflow_file, work_dir):
    """Run manta analysis inside prepared workflow directory.
    """
    data = paired.tumor_data if paired else items[0]
    out_file = os.path.join(work_dir, "results", "variants",
                            "somaticSV.vcf.gz" if paired and paired.normal_bam
                            else "diploidSV.vcf.gz")
    if not utils.file_exists(out_file):
        utils.remove_safe(os.path.join(work_dir, "workspace"))
        cmd = [sys.executable, workflow_file, "-m", "local", "-j", dd.get_num_cores(data),
               "--quiet"]
        do.run(cmd, "Run manta SV analysis")
        utils.remove_safe(os.path.join(work_dir, "workspace"))
    return out_file
def _grabix_index(data):
    """Create grabix index of bgzip input file.

    grabix does not allow specification of output file, so symlink the
    original file into a transactional directory.
    """
    in_file = data["bgzip_file"]
    config = data["config"]
    grabix = config_utils.get_program("grabix", config)
    gbi_file = in_file + ".gbi"
    if tz.get_in(["algorithm", "align_split_size"], config) is not False:
        if not utils.file_exists(gbi_file) or _is_partial_index(gbi_file):
            utils.remove_safe(gbi_file)
            with file_transaction(data, gbi_file) as tx_gbi_file:
                tx_in_file = os.path.splitext(tx_gbi_file)[0]
                utils.symlink_plus(in_file, tx_in_file)
                do.run([grabix, "index", tx_in_file],
                       "Index input with grabix: %s" % os.path.basename(in_file))
        assert utils.file_exists(gbi_file)
        return [gbi_file]
def tabix_index(in_file, config, preset=None, tabix_args=None):
    """Index a file using tabix.
    """
    in_file = os.path.abspath(in_file)
    out_file = in_file + ".tbi"
    if not utils.file_exists(out_file) or not utils.file_uptodate(out_file, in_file):
        # Remove old index files to prevent linking into tx directory
        utils.remove_safe(out_file)
        with file_transaction(config, out_file) as tx_out_file:
            tabix = tools.get_tabix_cmd(config)
            tx_in_file = os.path.splitext(tx_out_file)[0]
            utils.symlink_plus(in_file, tx_in_file)
            if tabix_args:
                cmd = "{tabix} -f {tabix_args} {tx_in_file}"
            else:
                preset = _guess_preset(in_file) if preset is None else preset
                cmd = "{tabix} -f -p {preset} {tx_in_file}"
            do.run(cmd.format(**locals()), "tabix index %s" % os.path.basename(in_file))
    return out_file
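# Hypothetical calls, assuming a bgzipped, coordinate-sorted input and a
# config that lets tools.get_tabix_cmd() resolve the tabix binary:
# tabix_index("calls.vcf.gz", config)            # preset guessed from extension
# tabix_index("regions.bed.gz", config, "bed")   # explicit preset
# tabix_index("custom.tsv.gz", config, tabix_args="-s 1 -b 2 -e 3")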
def _run_workflow(items, paired, workflow_file, work_dir):
    """Run manta analysis inside prepared workflow directory.
    """
    data = paired.tumor_data if paired else items[0]
    if paired:
        if paired.normal_bam:
            base_file = "somaticSV.vcf.gz"
        else:
            base_file = "tumorSV.vcf.gz"
    else:
        base_file = "diploidSV.vcf.gz"
    out_file = os.path.join(work_dir, "results", "variants", base_file)
    if not utils.file_exists(out_file):
        utils.remove_safe(os.path.join(work_dir, "workspace"))
        cmd = [sys.executable, workflow_file, "-m", "local", "-j", dd.get_num_cores(data)]
        do.run(cmd, "Run manta SV analysis")
        utils.remove_safe(os.path.join(work_dir, "workspace"))
    return out_file
def _grabix_index(data):
    """Create grabix index of bgzip input file.

    grabix does not allow specification of output file, so symlink the
    original file into a transactional directory.
    """
    in_file = data["bgzip_file"]
    config = data["config"]
    grabix = config_utils.get_program("grabix", config)
    gbi_file = in_file + ".gbi"
    # We always build grabix input so we can use it for counting reads and doing downsampling
    if not utils.file_exists(gbi_file) or _is_partial_index(gbi_file):
        utils.remove_safe(gbi_file)
        with file_transaction(data, gbi_file) as tx_gbi_file:
            tx_in_file = os.path.splitext(tx_gbi_file)[0]
            utils.symlink_plus(in_file, tx_in_file)
            do.run([grabix, "index", tx_in_file],
                   "Index input with grabix: %s" % os.path.basename(in_file))
    assert utils.file_exists(gbi_file)
    return [gbi_file]
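# The symlink-into-transaction trick above works around tools that always
# write the index next to their input: link the input into the tx directory,
# index the link, and the .gbi lands in the tx directory for an atomic move.
# Shell equivalent of what the function runs (paths are placeholders):
#   ln -s /data/input.fastq.gz /tx/dir/input.fastq.gz
#   grabix index /tx/dir/input.fastq.gz   # writes /tx/dir/input.fastq.gz.gbi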
def _rename_allelic_fraction_field(orig_file, config):
    """Rename allelic fraction field in mutect output
    from FA to FREQ to standardize with other tools.
    """
    tmp_file = orig_file.replace(".vcf.gz", "-fix.vcf")
    with file_transaction(tmp_file) as tx_in_file:
        with open(tx_in_file, 'w') as out_handle:
            with open_gzipsafe(orig_file) as handle:
                for line in handle:
                    if line.startswith("##FORMAT=<ID=FA"):
                        line = line.replace("=FA", "=FREQ")
                    if not line.startswith("#"):
                        line = line.replace("FA", "FREQ")
                    out_handle.write(line)
    out_file = orig_file.replace(".gz", "")
    remove_safe(orig_file)
    shutil.move(tmp_file, out_file)
    with open(tmp_file, "w") as out_handle:
        out_handle.write("Moved to {0}".format(out_file))
    out_file = bgzip_and_index(out_file, config)
def _rename_allelic_fraction_field(orig_file, config):
    """Rename allelic fraction field in mutect output
    from FA to FREQ to standardize with other tools.
    """
    tmp_file = orig_file.replace(".vcf.gz", "-fix.vcf")
    with file_transaction(config, tmp_file) as tx_in_file:
        with open(tx_in_file, 'w') as out_handle:
            with open_gzipsafe(orig_file) as handle:
                for line in handle:
                    if line.startswith("##FORMAT=<ID=FA"):
                        line = line.replace("=FA", "=FREQ")
                    if not line.startswith("#"):
                        line = line.replace("FA", "FREQ")
                    out_handle.write(line)
    out_file = orig_file.replace(".gz", "")
    remove_safe(orig_file)
    shutil.move(tmp_file, out_file)
    with open(tmp_file, "w") as out_handle:
        out_handle.write("Moved to {0}".format(out_file))
    out_file = bgzip_and_index(out_file, config)
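# A minimal, self-contained illustration of the FA -> FREQ rewrite applied to
# header and body lines; the real function streams a gzipped VCF through the
# same logic. Note the body rewrite is a plain substring replace, so it
# assumes "FA" only occurs in the FORMAT/sample columns.
lines = ['##FORMAT=<ID=FA,Number=A,Type=Float,Description="Allele fraction">\n',
         "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tsample\n",
         "1\t100\t.\tA\tT\t.\tPASS\t.\tGT:FA\t0/1:0.25\n"]
fixed = []
for line in lines:
    if line.startswith("##FORMAT=<ID=FA"):
        line = line.replace("=FA", "=FREQ")
    if not line.startswith("#"):
        line = line.replace("FA", "FREQ")
    fixed.append(line)
assert "ID=FREQ" in fixed[0] and "GT:FREQ" in fixed[-1]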
def _install_kraken_db(datadir, args):
    """Install kraken minimal DB in genome folder.
    """
    import requests
    kraken = os.path.join(datadir, "genomes/kraken")
    url = "https://ccb.jhu.edu/software/kraken/dl/minikraken.tgz"
    compress = os.path.join(kraken, os.path.basename(url))
    base, ext = utils.splitext_plus(os.path.basename(url))
    db = os.path.join(kraken, base)
    tooldir = args.tooldir or get_defaults()["tooldir"]
    requests.packages.urllib3.disable_warnings()
    last_mod = urllib.request.urlopen(url).info().get('Last-Modified')
    last_mod = dateutil.parser.parse(last_mod).astimezone(dateutil.tz.tzutc())
    if os.path.exists(os.path.join(tooldir, "bin", "kraken")):
        if not os.path.exists(db):
            is_new_version = True
        else:
            cur_file = glob.glob(os.path.join(kraken, "minikraken_*"))[0]
            cur_version = datetime.datetime.utcfromtimestamp(os.path.getmtime(cur_file))
            is_new_version = last_mod.date() > cur_version.date()
            if is_new_version:
                shutil.move(cur_file, cur_file.replace('minikraken', 'old'))
        if not os.path.exists(kraken):
            utils.safe_makedir(kraken)
        if is_new_version:
            if not os.path.exists(compress):
                subprocess.check_call(["wget", "-O", compress, url, "--no-check-certificate"])
            cmd = ["tar", "-xzvf", compress, "-C", kraken]
            subprocess.check_call(cmd)
            last_version = glob.glob(os.path.join(kraken, "minikraken_*"))
            utils.symlink_plus(os.path.join(kraken, last_version[0]),
                               os.path.join(kraken, "minikraken"))
            utils.remove_safe(compress)
        else:
            print("You have the latest version %s." % last_mod)
    else:
        raise argparse.ArgumentTypeError("kraken not installed in tooldir %s." %
                                         os.path.join(tooldir, "bin", "kraken"))
@contextlib.contextmanager
def tx_tmpdir(data=None, base_dir=None, remove=True):
    """Context manager to create and remove a transactional temporary directory.

    Handles creating a transactional directory for running commands in. Will
    use either the current directory or /tmp/bcbiotx.

    Creates an intermediary location and time specific directory for global
    temporary directories to prevent collisions.

    data can be the full world information object being processed or a
    configuration dictionary.
    """
    base_dir = base_dir or os.getcwd()
    tmpdir_base = utils.get_abspath(_get_base_tmpdir(data, base_dir))
    utils.safe_makedir(tmpdir_base)
    tmp_dir = tempfile.mkdtemp(dir=tmpdir_base)
    logger.debug("Created tmp dir %s " % tmp_dir)
    try:
        yield tmp_dir
    finally:
        if remove:
            utils.remove_safe(tmp_dir)
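# A stripped-down, self-contained version of the same pattern using only the
# stdlib; bcbio's version additionally derives the base directory from its
# config/world object via _get_base_tmpdir().
import contextlib
import shutil
import tempfile

@contextlib.contextmanager
def simple_tx_tmpdir(base_dir=None, remove=True):
    """Yield a fresh temporary directory, cleaning it up unless remove=False."""
    tmp_dir = tempfile.mkdtemp(dir=base_dir)
    try:
        yield tmp_dir
    finally:
        if remove:
            shutil.rmtree(tmp_dir, ignore_errors=True)

# with simple_tx_tmpdir() as work_dir:
#     ...  # run commands that write into work_dir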
def _move_tmp_files(safe, orig):
    exts = {".vcf": ".idx", ".bam": ".bai", ".vcf.gz": ".tbi", ".bed.gz": ".tbi"}
    utils.safe_makedir(os.path.dirname(orig))
    # If we are rolling back a directory and it already exists
    # this will avoid making a nested set of directories
    if os.path.isdir(orig) and os.path.isdir(safe):
        utils.remove_safe(orig)
    _move_file_with_sizecheck(safe, orig)
    # Move additional, associated files in the same manner
    for check_ext, check_idx in exts.iteritems():
        if not safe.endswith(check_ext):
            continue
        safe_idx = safe + check_idx
        if os.path.exists(safe_idx):
            _move_file_with_sizecheck(safe_idx, orig + check_idx)
@contextlib.contextmanager
def tx_tmpdir(data=None, base_dir=None, remove=True):
    """Context manager to create and remove a transactional temporary directory.

    Handles creating a transactional directory for running commands in. Will
    use either the current directory or /tmp/bcbiotx.

    Creates an intermediary location and time specific directory for global
    temporary directories to prevent collisions.

    data can be the full world information object being processed or a
    configuration dictionary.
    """
    base_dir = base_dir or os.getcwd()
    tmpdir_base = utils.get_abspath(_get_base_tmpdir(data, base_dir))
    utils.safe_makedir(tmpdir_base)
    tmp_dir = tempfile.mkdtemp(dir=tmpdir_base)
    # logger.debug("Created tmp dir %s " % tmp_dir)
    try:
        yield tmp_dir
    finally:
        if remove:
            utils.remove_safe(tmp_dir)
def _move_tmp_files(safe, orig):
    exts = {".vcf": ".idx", ".bam": ".bai", ".vcf.gz": ".tbi", ".bed.gz": ".tbi"}
    utils.safe_makedir(os.path.dirname(orig))
    # If we are rolling back a directory and it already exists
    # this will avoid making a nested set of directories
    if os.path.isdir(orig) and os.path.isdir(safe):
        utils.remove_safe(orig)
    _move_file_with_sizecheck(safe, orig)
    # Move additional, associated files in the same manner
    for check_ext, check_idx in exts.items():
        if not safe.endswith(check_ext):
            continue
        safe_idx = safe + check_idx
        if os.path.exists(safe_idx):
            _move_file_with_sizecheck(safe_idx, orig + check_idx)
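# e.g. moving "out.vcf.gz" out of a transaction directory also carries the
# matching "out.vcf.gz.tbi" along when one was produced (paths are placeholders):
# _move_tmp_files("/tx/abc/out.vcf.gz", "/final/out.vcf.gz")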
def run(name, chip_bam, rep_bam, input_bam, gtf_file, out_dir, rlength, rpair, config):
    """Run rMATS for mutant and control samples, avoiding sample-specific errors.
    """
    # output file name needs to have the caller name
    MATS_output = os.path.join(out_dir, name + "_MATS_output")
    MATS_dir = os.path.join(out_dir, "MATS_output")
    rmats_file = os.path.join(out_dir, "summary.txt")
    out_file = os.path.join(out_dir, name + "_summary.txt")
    # Previously read length was detected from the first alignment:
    #   myCmd = 'samtools view ' + chip_bam + ' | head -n 1'
    #   status, output = commands.getstatusoutput(myCmd)
    #   rlength = len(output.strip().split('\t')[9])
    libType = _get_stranded_flag(config)
    if rep_bam != "":
        chip_bam = chip_bam + "," + rep_bam
    if utils.file_exists(out_file):
        return out_file
    rmats = config_utils.get_program("rmats", config)
    options = " ".join(config_utils.get_resources("rmats", config).get("options", ""))
    with utils.chdir(out_dir):
        cmd = _rmats_cmd()
        try:
            do.run(cmd.format(**locals()), "rmats for %s" % name)
            utils.move_safe(rmats_file, out_file)
            utils.move_safe(MATS_dir, MATS_output)
            repdir_dir = os.path.join(out_dir, "SAMPLE_1")
            utils.remove_safe(repdir_dir)
            repdir_dir = os.path.join(out_dir, "SAMPLE_2")
            utils.remove_safe(repdir_dir)
        except subprocess.CalledProcessError:
            raise RuntimeWarning("rMATS terminated with an error.\n"
                                 "Please, check the message and report "
                                 "error if it is related to bcbio.\n"
                                 "You can add specific options for the sample "
                                 "setting resources as explained in docs: "
                                 "https://bcbio-nextgen.readthedocs.org/en/latest/contents/configuration.html#sample-specific-resources")
    return out_file