def _download_broad_bundle(gid, bundle_version, name, ext):
    """Download a VCF resource from the GATK Broad bundle, bgzipped and tabix indexed.

    :param gid: genome build identifier ("hg19", "GRCh37")
    :param bundle_version: Broad bundle release to fetch from
    :param name: base resource name (e.g. "dbsnp_138")
    :param ext: extension appended after ".vcf" in the remote file name
    :returns: name of the local bgzipped VCF file
    """
    # Broad bundle directories have uneven use of ".sites" in VCF files
    # only present in hg19 for non-dbSNP resources
    sites = ".sites" if gid == "hg19" and not name.startswith("dbsnp") else ""
    broad_fname = "{name}.{gid}{sites}.vcf{ext}".format(gid=gid, name=name, sites=sites, ext=ext)
    # local name drops the genome build and ".sites" pieces and is bgzipped
    fname = broad_fname.replace(".{0}".format(gid), "").replace(".sites", "") + ".gz"
    base_url = "ftp://gsapubftp-anonymous:@ftp.broadinstitute.org/bundle/" + \
               "{bundle}/{gid}/{fname}.gz".format(
                   bundle=bundle_version, fname=broad_fname, gid=gid)
    # compress and prepare existing uncompressed versions
    if env.safe_exists(fname.replace(".vcf.gz", ".vcf")):
        env.safe_run("bgzip %s" % fname.replace(".vcf.gz", ".vcf"))
        env.safe_run("tabix -f -p vcf %s" % fname)
    # otherwise, download and bgzip and tabix index
    if not env.safe_exists(fname):
        out_file = shared._remote_fetch(env, base_url)
        env.safe_run("gunzip -c %s | bgzip -c > %s" % (out_file, fname))
        env.safe_run("tabix -f -p vcf %s" % fname)
        env.safe_run("rm -f %s" % out_file)
    # clean up old files; distinct loop variable so we do not shadow
    # the `ext` parameter (the original reused the name `ext` here)
    for old_ext in [".vcf", ".vcf.idx"]:
        if env.safe_exists(fname.replace(".vcf.gz", old_ext)):
            env.safe_run("rm -f %s" % (fname.replace(".vcf.gz", old_ext)))
    return fname
def _determine_distribution(env):
    """Guess the operating system distribution of the target machine.

    Handles Ubuntu, CentOS/Red Hat/Fedora/Amazon, SUSE, Scientific Linux,
    Debian, Arch and Mac OS X.
    """
    with quiet():
        release_text = env.safe_run_output("cat /etc/*release").lower()
    # Substring markers are tried in order; the first hit decides.
    markers = [
        ("distrib_id=ubuntu", "ubuntu"),
        ("centos release", "centos"),
        ("centos linux release", "centos"),
        ("red hat", "centos"),
        ("fedora release", "centos"),
        ("amzn", "centos"),  # Amazon AMIs are Red-Hat based
        ("suse linux", "suse"),
        ("opensuse", "suse"),
        ("scientific linux", "scientificlinux"),
    ]
    for token, distro in markers:
        if token in release_text:
            return distro
    if env.safe_exists("/etc/debian_version"):
        return "debian"
    if "id=arch" in release_text:
        return "arch"
    # check for file used by Python's platform.mac_ver
    if env.safe_exists("/System/Library/CoreServices/SystemVersion.plist"):
        return "macosx"
    raise Exception("Attempt to automatically determine Linux distribution of target machine failed, please manually specify distribution in fabricrc.txt")
def _download_dbnsfp(env, gid, gconfig):
    """Download and prepare dbNSFP functional prediction resources if configured.

    Feeds into VEP for annotating VCF files:
    https://sites.google.com/site/jpopgen/dbNSFP
    https://github.com/ensembl-variation/VEP_plugins/blob/master/dbNSFP.pm
    """
    version = "2.5"
    url = "http://dbnsfp.houstonbioinformatics.org/dbNSFPzip/dbNSFPv%s.zip" % version
    # only act when the genome configuration opts in to dbNSFP
    if gconfig.get("dbnsfp"):
        outfile = "dbNSFP_v%s.gz" % (version)
        if gid == "GRCh37":
            # download and prepare bgzipped output file
            if not env.safe_exists(outfile):
                zipfile = shared._remote_fetch(env, url, samedir=True)
                outdir = "dbNSFPv%s" % version
                env.safe_run("mkdir -p %s" % outdir)
                env.safe_run("unzip %s -d %s" % (zipfile, outdir))
                # concatenate per-chromosome variant files into one bgzipped file
                env.safe_run("cat %s/dbNSFP*_variant.chr* | bgzip -c > %s" % (outdir, outfile))
                env.safe_run("rm -f %s/* && rmdir %s" % (outdir, outdir))
                env.safe_run("rm -f %s" % (zipfile))
            if not env.safe_exists(outfile + ".tbi"):
                # tabix index on chromosome (col 1) / position (col 2)
                env.safe_run("tabix -s 1 -b 2 -e 2 -c '#' %s" % outfile)
        elif gid == "hg19":
            # symlink to GRCh37 download (assumes sibling directory layout)
            if not env.safe_exists(outfile):
                env.safe_run("ln -sf ../../GRCh37/variation/%s %s" % (outfile, outfile))
            if not env.safe_exists(outfile + ".tbi"):
                env.safe_run("ln -sf ../../GRCh37/variation/%s.tbi %s.tbi" % (outfile, outfile))
def _download_qsignature(env, gid, gconfig):
    """Download qsignature position file to detect samples problems

    :param env
    :param gid: str genome id
    :param gconfig:
    :returns: NULL
    """
    base_url = "http://downloads.sourceforge.net/project/adamajava/qsignature.tar.bz2"
    if gid == "GRCh37":
        outfile = "qsignature.vcf"
        if not env.safe_exists(outfile):
            zipfile = shared._remote_fetch(env, base_url, samedir=True)
            outdir = "qsignature"
            env.safe_run("mkdir -p %s" % outdir)
            env.safe_run("tar -jxf %s -C %s" % (zipfile, outdir))
            # keep only the positions file, renamed to the canonical output
            env.safe_run("mv %s/qsignature_positions.txt %s" % (outdir, outfile))
            env.safe_run("rm -rf %s" % outdir)
            env.safe_run("rm -rf %s" % zipfile)
    elif gid == "hg19":
        # symlink to GRCh37 download; must use the name the GRCh37 branch
        # creates ("qsignature.vcf"), not the tarball basename -- the
        # original linked to "qsignature.tar.bz2" which never exists there
        outfile = "qsignature.vcf"
        if not env.safe_exists(outfile):
            env.safe_run("ln -sf ../../GRCh37/variation/%s %s" % (outfile, outfile))
def _download_dbnsfp(env, gid, gconfig):
    """Download and prepare dbNSFP functional prediction resources if configured.

    Feeds into VEP for annotating VCF files:
    https://sites.google.com/site/jpopgen/dbNSFP
    https://github.com/ensembl-variation/VEP_plugins/blob/master/dbNSFP.pm
    """
    version = "2.8"
    # OneDrive distribution link for this release; does not encode the
    # filename, so an explicit out_file is passed to the fetcher below
    url = "https://onedrive.live.com/download?cid=0D359D171E382137&resid=D359D171E382137%2154761&authkey=AFm7prRqSLLLC9g"
    dl_file = "dbNSFPv%s.zip" % version
    if gconfig.get("dbnsfp"):
        outfile = "dbNSFP_v%s.gz" % (version)
        # prepare in place for GRCh37, or for hg19 when no GRCh37 sibling
        # directory exists to symlink against
        if gid == "GRCh37" or (gid == "hg19" and not env.safe_exists("../../GRCh37")):
            if not env.safe_exists(outfile):
                zipfile = shared._remote_fetch(env, url, out_file=dl_file, samedir=True)
                outdir = "dbNSFPv%s" % version
                env.safe_run("mkdir -p %s" % outdir)
                env.safe_run("unzip %s -d %s" % (zipfile, outdir))
                # concatenate per-chromosome variant files into one bgzipped file
                env.safe_run("cat %s/dbNSFP*_variant.chr* | bgzip -c > %s" % (outdir, outfile))
                env.safe_run("rm -f %s/* && rmdir %s" % (outdir, outdir))
                env.safe_run("rm -f %s" % (zipfile))
            if not env.safe_exists(outfile + ".tbi"):
                # tabix index on chromosome (col 1) / position (col 2)
                env.safe_run("tabix -s 1 -b 2 -e 2 -c '#' %s" % outfile)
        elif gid == "hg19":
            # symlink to GRCh37 download
            if not env.safe_exists(outfile):
                env.safe_run("ln -sf ../../GRCh37/variation/%s %s" % (outfile, outfile))
            if not env.safe_exists(outfile + ".tbi"):
                env.safe_run("ln -sf ../../GRCh37/variation/%s.tbi %s.tbi" % (outfile, outfile))
def _download_dbnsfp(env, gid, gconfig):
    """Download and prepare dbNSFP functional prediction resources if configured.

    Feeds into VEP for annotating VCF files:
    https://sites.google.com/site/jpopgen/dbNSFP
    https://github.com/ensembl-variation/VEP_plugins/blob/master/dbNSFP.pm
    """
    version = "2.6"
    url = "http://dbnsfp.houstonbioinformatics.org/dbNSFPzip/dbNSFPv%s.zip" % version
    # only act when the genome configuration opts in to dbNSFP
    if gconfig.get("dbnsfp"):
        outfile = "dbNSFP_v%s.gz" % (version)
        if gid == "GRCh37":
            # download and prepare bgzipped output file
            if not env.safe_exists(outfile):
                zipfile = shared._remote_fetch(env, url, samedir=True)
                outdir = "dbNSFPv%s" % version
                env.safe_run("mkdir -p %s" % outdir)
                env.safe_run("unzip %s -d %s" % (zipfile, outdir))
                # concatenate per-chromosome variant files into one bgzipped file
                env.safe_run("cat %s/dbNSFP*_variant.chr* | bgzip -c > %s" % (outdir, outfile))
                env.safe_run("rm -f %s/* && rmdir %s" % (outdir, outdir))
                env.safe_run("rm -f %s" % (zipfile))
            if not env.safe_exists(outfile + ".tbi"):
                # tabix index on chromosome (col 1) / position (col 2)
                env.safe_run("tabix -s 1 -b 2 -e 2 -c '#' %s" % outfile)
        elif gid == "hg19":
            # symlink to GRCh37 download (assumes sibling directory layout)
            if not env.safe_exists(outfile):
                env.safe_run("ln -sf ../../GRCh37/variation/%s %s" % (outfile, outfile))
            if not env.safe_exists(outfile + ".tbi"):
                env.safe_run("ln -sf ../../GRCh37/variation/%s.tbi %s.tbi" % (outfile, outfile))
def _determine_distribution(env):
    """Identify the distribution of the remote host from its release files.

    Recognizes Ubuntu, CentOS/RHEL, Debian, Scientific Linux and Mac OS X.
    """
    with quiet():
        release_info = env.safe_run_output("cat /etc/*release").lower()
    # ordered substring checks; first match determines the answer
    checks = (
        ("distrib_id=ubuntu", "ubuntu"),
        ("centos release", "centos"),
        ("red hat enterprise linux server release", "centos"),
        ("scientific linux release", "scientificlinux"),
    )
    for marker, distro in checks:
        if marker in release_info:
            return distro
    if env.safe_exists("/etc/debian_version"):
        return "debian"
    # check for file used by Python's platform.mac_ver
    if env.safe_exists("/System/Library/CoreServices/SystemVersion.plist"):
        return "macosx"
    raise Exception(
        "Attempt to automatically determine Linux distribution of target machine failed, please manually specify distribution in fabricrc.txt"
    )
def _download_broad_bundle(gid, bundle_version, name, ext):
    """Download a VCF resource from the GATK Broad bundle, bgzipped and tabix indexed.

    Missing remote resources are tolerated (allow_fail) with a warning.
    :returns: name of the local bgzipped VCF file
    """
    broad_fname = "{name}.{gid}.vcf{ext}".format(gid=gid, name=name, ext=ext)
    # local name drops the genome build and ".sites" pieces and is bgzipped
    fname = broad_fname.replace(".{0}".format(gid), "").replace(".sites", "") + ".gz"
    base_url = "ftp://gsapubftp-anonymous:@ftp.broadinstitute.org/bundle/" + \
               "{bundle}/{gid}/{fname}.gz".format(
                   bundle=bundle_version, fname=broad_fname, gid=gid)
    # compress and prepare existing uncompressed versions
    if env.safe_exists(fname.replace(".vcf.gz", ".vcf")):
        env.safe_run("bgzip %s" % fname.replace(".vcf.gz", ".vcf"))
        env.safe_run("tabix -f -p vcf %s" % fname)
    # otherwise, download and bgzip and tabix index
    if not env.safe_exists(fname):
        out_file = shared._remote_fetch(env, base_url, allow_fail=True)
        if out_file:
            env.safe_run("gunzip -c %s | bgzip -c > %s" % (out_file, fname))
            env.safe_run("tabix -f -p vcf %s" % fname)
            env.safe_run("rm -f %s" % out_file)
        else:
            env.logger.warn("dbSNP resources not available for %s" % gid)
    # clean up old files; distinct loop variable so we do not shadow
    # the `ext` parameter (the original reused the name `ext` here)
    for old_ext in [".vcf", ".vcf.idx"]:
        if env.safe_exists(fname.replace(".vcf.gz", old_ext)):
            env.safe_run("rm -f %s" % (fname.replace(".vcf.gz", old_ext)))
    return fname
def _download_cosmic(gid):
    """Prepared versions of COSMIC, pre-sorted and indexed.

    utils/prepare_cosmic.py handles the work of creating the VCFs from standard
    COSMIC resources.
    """
    base_url = "https://s3.amazonaws.com/biodata/variants"
    version = "v68"
    if gid not in ["hg19", "GRCh37"]:
        return
    url = "%s/cosmic-%s-%s.vcf.gz" % (base_url, version, gid)
    local_name = os.path.basename(url)
    # fetch the bgzipped VCF and its tabix index, skipping existing files
    for suffix in ("", ".tbi"):
        if not env.safe_exists(local_name + suffix):
            shared._remote_fetch(env, url + suffix)
def _download_ancestral(env, gid, gconfig):
    """Download ancestral genome sequence for loss of function evaluation.

    Used by LOFTEE VEP plugin: https://github.com/konradjk/loftee
    """
    base_url = "http://www.broadinstitute.org/~konradk/loftee/human_ancestor.fa.rz"
    if gid not in ("GRCh37", "hg19"):
        return
    # sequence file plus its .fai index
    for suffix in ("", ".fai"):
        target = os.path.basename(base_url) + suffix
        if env.safe_exists(target):
            continue
        if gid == "GRCh37":
            shared._remote_fetch(env, base_url + suffix, samedir=True)
        else:
            # hg19 reuses the GRCh37 download via a relative symlink
            env.safe_run("ln -sf ../../GRCh37/variation/%s %s" % (target, target))
def _connect_native_packages(env, pkg_install, lib_install):
    """Connect native installed packages to local versions.

    This helps setup a non-sudo environment to handle software
    that needs a local version in our non-root directory tree.
    """
    bin_dir = os.path.join(env.system_install, "bin")
    path = env.safe_run_output("echo $PATH")
    # marker comment so repeated runs do not duplicate the block
    comment_line = "# CloudBioLinux PATH updates"
    if not env.safe_contains(env.shell_config, comment_line):
        env.safe_append(env.shell_config, "\n" + comment_line)
    # only append exports when bin_dir is not already on PATH and the
    # shell configuration file exists
    if bin_dir not in path and env.safe_exists(env.shell_config):
        add_path = "export PATH=%s:$PATH" % bin_dir
        if not env.safe_contains(env.shell_config, add_path):
            env.safe_append(env.shell_config, add_path)
        ldlib_path = os.path.join(env.system_install, "lib")
        add_ldlibrary = "export LD_LIBRARY_PATH=%s:$LD_LIBRARY_PATH" % ldlib_path
        if not env.safe_contains(env.shell_config, add_ldlibrary):
            env.safe_append(env.shell_config, add_ldlibrary)
        perl_export = (
            "export PERL5LIB=%s/lib/perl5:%s/lib/perl5/site_perl:${PERL5LIB}"
            % (env.system_install, env.system_install))
        if not env.safe_contains(env.shell_config, perl_export):
            env.safe_append(env.shell_config, perl_export)
    # local virtualenv only when python is both a package and library target
    if "python" in pkg_install and "python" in lib_install:
        _create_local_virtualenv(env.system_install)
def _connect_native_packages(env, pkg_install, lib_install):
    """Connect native installed packages to local versions.

    This helps setup a non-sudo environment to handle software
    that needs a local version in our non-root directory tree.
    """
    bin_dir = os.path.join(env.system_install, "bin")
    path = env.safe_run_output("echo $PATH")
    # marker comment keeps repeat runs from duplicating the export block
    comment_line = "# CloudBioLinux PATH updates"
    if not env.safe_contains(env.shell_config, comment_line):
        env.safe_append(env.shell_config, "\n" + comment_line)
    # append PATH / LD_LIBRARY_PATH / PERL5LIB exports, each guarded
    # against duplication, when bin_dir is not already on PATH
    if bin_dir not in path and env.safe_exists(env.shell_config):
        add_path = "export PATH=%s:$PATH" % bin_dir
        if not env.safe_contains(env.shell_config, add_path):
            env.safe_append(env.shell_config, add_path)
        ldlib_path = os.path.join(env.system_install, "lib")
        add_ldlibrary = "export LD_LIBRARY_PATH=%s:$LD_LIBRARY_PATH" % ldlib_path
        if not env.safe_contains(env.shell_config, add_ldlibrary):
            env.safe_append(env.shell_config, add_ldlibrary)
        perl_export = ("export PERL5LIB=%s/lib/perl5:%s/lib/perl5/site_perl:${PERL5LIB}"
                       % (env.system_install, env.system_install))
        if not env.safe_contains(env.shell_config, perl_export):
            env.safe_append(env.shell_config, perl_export)
    # local virtualenv only when python is both a package and library target
    if "python" in pkg_install and "python" in lib_install:
        _create_local_virtualenv(env.system_install)
def _download_background_vcf(gid):
    """Download background file of variant to use in calling.
    """
    base_url = "https://s3.amazonaws.com/biodata/variants"
    base_name = "background-diversity-1000g.vcf"
    if gid not in ["GRCh37"]:
        return
    if env.safe_exists("{0}.gz".format(base_name)):
        return
    # bgzipped VCF plus its tabix index
    for suffix in ("gz", "gz.tbi"):
        shared._remote_fetch(env, "{0}/{1}.{2}".format(base_url, base_name, suffix))
def _download_cosmic(gid):
    """Prepared versions of COSMIC, pre-sorted and indexed.

    utils/prepare_cosmic.py handles the work of creating the VCFs from standard
    COSMIC resources.
    """
    base_url = "https://s3.amazonaws.com/biodata/variants"
    version = "v67_20131024"
    supported = ["hg19", "GRCh37"]
    if gid in supported:
        url = "%s/cosmic-%s-%s.vcf.gz" % (base_url, version, gid)
        gzip_fname = os.path.basename(url)
        fname = os.path.splitext(gzip_fname)[0]
        if not env.safe_exists(fname):
            if not env.safe_exists(gzip_fname):
                shared._remote_fetch(env, url)
            # gunzip the downloaded .gz file explicitly; the original passed
            # the uncompressed name and only worked because gunzip appends
            # ".gz" when the argument lacks a known suffix
            env.safe_run("gunzip %s" % gzip_fname)
        if not env.safe_exists(fname + ".idx"):
            shared._remote_fetch(env, url.replace(".gz", ".idx"))
def _dbsnp_custom(env, gid):
    """Retrieve resources for dbsnp builds from custom S3 biodata bucket.
    """
    remote_dir = "https://s3.amazonaws.com/biodata/variants/"
    files = {"mm10": ["mm10-dbSNP-2013-09-12.vcf.gz"],
             "canFam3": ["canFam3-dbSNP-2014-04-10.vcf.gz"]}
    # fetch each VCF plus its tabix index if not already present locally
    for base in files[gid]:
        for suffix in ("", ".tbi"):
            target = base + suffix
            if not env.safe_exists(target):
                shared._remote_fetch(env, "%s%s" % (remote_dir, target))
def r_library_installer(config): """Install R libraries using CRAN and Bioconductor. """ # Create an Rscript file with install details. out_file = "install_packages.R" if env.safe_exists(out_file): env.safe_run("rm -f %s" % out_file) env.safe_run("touch %s" % out_file) lib_loc = os.path.join(env.system_install, "lib", "R", "site-library") env.safe_sudo("mkdir -p %s" % lib_loc) repo_info = """ .libPaths(c("%s")) library(methods) cran.repos <- getOption("repos") cran.repos["CRAN" ] <- "%s" options(repos=cran.repos) source("%s") """ % (lib_loc, config["cranrepo"], config["biocrepo"]) env.safe_append(out_file, repo_info) install_fn = """ repo.installer <- function(repos, install.fn) { %s maybe.install <- function(pname) { if (!(pname %%in%% installed.packages())) install.fn(pname) } } """ if config.get("update_packages", True): update_str = """ update.packages(lib.loc="%s", repos=repos, ask=FALSE) """ % lib_loc else: update_str = "\n" env.safe_append(out_file, install_fn % update_str) std_install = """ std.pkgs <- c(%s) std.installer = repo.installer(cran.repos, install.packages) lapply(std.pkgs, std.installer) """ % (", ".join('"%s"' % p for p in config['cran'])) env.safe_append(out_file, std_install) if len(config.get("bioc", [])) > 0: bioc_install = """ bioc.pkgs <- c(%s) bioc.installer = repo.installer(biocinstallRepos(), biocLite) lapply(bioc.pkgs, bioc.installer) """ % (", ".join('"%s"' % p for p in config['bioc'])) env.safe_append(out_file, bioc_install) # run the script and then get rid of it rscript = fabutils.find_cmd(env, "Rscript", "--version") if rscript: env.safe_sudo("%s %s" % (rscript, out_file)) else: env.logger.warn("Rscript not found; skipping install of R libraries.") env.safe_run("rm -f %s" % out_file)
def _dbsnp_custom(env, gid):
    """Retrieve resources for dbsnp builds from custom S3 biodata bucket.
    """
    remote_dir = "https://s3.amazonaws.com/biodata/variants/"
    # per-build lists of pre-prepared bgzipped dbSNP VCFs
    files = {"mm10": ["mm10-dbSNP-2013-09-12.vcf.gz"],
             "canFam3": ["canFam3-dbSNP-2014-05-10.vcf.gz"]}
    for f in files[gid]:
        # fetch the VCF itself and its tabix index
        for ext in ["", ".tbi"]:
            fname = f + ext
            if not env.safe_exists(fname):
                shared._remote_fetch(env, "%s%s" % (remote_dir, fname))
def _dbsnp_mouse(env, gid):
    """Retrieve resources for mouse variant analysis from custom S3 biodata bucket.
    """
    remote_dir = "https://s3.amazonaws.com/biodata/variants/"
    files = {"mm10": ["mm10-dbSNP-2013-09-12.vcf"]}
    for base in files[gid]:
        # fetch the VCF and its GATK-style .idx companion
        for suffix in ("", ".idx"):
            target = base + suffix
            if env.safe_exists(target):
                continue
            # stored gzipped remotely: download then uncompress in place
            fetched = shared._remote_fetch(env, "%s%s.gz" % (remote_dir, target))
            env.safe_run("gunzip %s" % fetched)
def _dbsnp_mouse(env, gid):
    """Retrieve resources for mouse variant analysis from custom S3 biodata bucket.
    """
    remote_dir = "https://s3.amazonaws.com/biodata/variants/"
    files = {"mm10": ["mm10-dbSNP-2013-09-12.vcf"]}
    for base in files[gid]:
        # fetch the VCF and its GATK-style .idx companion
        for suffix in ("", ".idx"):
            target = base + suffix
            if env.safe_exists(target):
                continue
            url = "%s%s.gz" % (remote_dir, target)
            local_gz = os.path.basename(url)
            # resumable download of the gzipped resource, then unpack
            env.safe_run("wget -O %s -c %s" % (local_gz, url))
            env.safe_run("gunzip %s" % local_gz)
def _dbsnp_mouse(env, gid):
    """Retrieve resources for mouse variant analysis from custom S3 biodata bucket.
    """
    remote_dir = "https://s3.amazonaws.com/biodata/variants/"
    files = {"mm10": ["mm10-dbSNP-2013-09-12.vcf"]}
    for f in files[gid]:
        # fetch the VCF and its GATK-style .idx companion
        for ext in ["", ".idx"]:
            fname = f + ext
            if not env.safe_exists(fname):
                # remote copies are gzipped; download then uncompress in place
                out_file = shared._remote_fetch(
                    env, "%s%s.gz" % (remote_dir, fname))
                env.safe_run("gunzip %s" % out_file)
def _determine_distribution(env):
    """
    Attempt to automatically determine the distribution of the target machine.

    Currently works for Ubuntu, CentOS, Debian, Scientific Linux and Mac OS X.
    """
    with quiet():
        output = env.safe_run_output("cat /etc/*release").lower()
    # Ordered checks: more specific markers first; several Red-Hat
    # derivatives are all treated as "centos" for package purposes.
    if output.find("id=ubuntu") >= 0:
        return "ubuntu"
    elif output.find("centos release") >= 0:
        return "centos"
    elif output.find("centos linux release") >= 0:
        return "centos"
    elif output.find("red hat enterprise linux") >= 0:
        return "centos"
    elif output.find("fedora") >= 0:
        return "centos"
    # Amazon AMIs are Red-Hat based
    elif output.find("amzn") >= 0 or output.find("amazon") >= 0:
        return "centos"
    elif output.find("suse linux") >= 0:
        return "suse"
    elif output.find("opensuse") >= 0:
        return "suse"
    elif output.find("scientific linux") >= 0:
        return "scientificlinux"
    elif env.safe_exists("/etc/debian_version"):
        return "debian"
    # Arch and Arch-derived distributions (e.g. Antergos)
    elif output.find("id=arch") >= 0 or output.find('id_like="arch"') >= 0:
        return "arch"
    elif output.find("antergos") >= 0:
        return "arch"
    # check for file used by Python's platform.mac_ver
    elif env.safe_exists("/System/Library/CoreServices/SystemVersion.plist"):
        return "macosx"
    else:
        # include the release text to aid manual diagnosis
        raise Exception(
            "Attempt to automatically determine Linux distribution of target machine failed:\n%s" % output)
def _make_install_script(out_file, config):
    """Write an Rscript installer for CRAN/Bioconductor packages to out_file.

    config keys used: cranrepo, biocrepo, cran, bioc, cran-after-bioc,
    update_packages.
    """
    # start from a fresh, empty script file
    if env.safe_exists(out_file):
        env.safe_run("rm -f %s" % out_file)
    env.safe_run("touch %s" % out_file)
    lib_loc = os.path.join(env.system_install, "lib", "R", "site-library")
    env.safe_sudo("mkdir -p %s" % lib_loc)
    # best effort: make the library writable by the deploy user
    with settings(warn_only=True):
        env.safe_sudo("chown -R %s %s" % (env.user, lib_loc))
    repo_info = """
    .libPaths(c("%s"))
    library(methods)
    cran.repos <- getOption("repos")
    cran.repos["CRAN" ] <- "%s"
    options(repos=cran.repos)
    source("%s")
    """ % (lib_loc, config["cranrepo"], config["biocrepo"])
    env.safe_append(out_file, repo_info)
    # repo.installer factory: %s is replaced with an optional
    # update.packages call; maybe.install skips installed packages
    install_fn = """
    repo.installer <- function(repos, install.fn) {
      %s
      maybe.install <- function(pname) {
        if (!(pname %%in%% installed.packages()))
          install.fn(pname)
      }
    }
    """
    if config.get("update_packages", True):
        update_str = """
        update.packages(lib.loc="%s", repos=repos, ask=FALSE)
        """ % lib_loc
    else:
        update_str = "\n"
    env.safe_append(out_file, install_fn % update_str)
    std_install = """
    std.pkgs <- c(%s)
    std.installer = repo.installer(cran.repos, install.packages)
    lapply(std.pkgs, std.installer)
    """ % (", ".join('"%s"' % p for p in config['cran']))
    env.safe_append(out_file, std_install)
    if len(config.get("bioc", [])) > 0:
        bioc_install = """
        bioc.pkgs <- c(%s)
        bioc.installer = repo.installer(biocinstallRepos(), biocLite)
        lapply(bioc.pkgs, bioc.installer)
        """ % (", ".join('"%s"' % p for p in config['bioc']))
        env.safe_append(out_file, bioc_install)
    # CRAN packages that depend on Bioconductor packages go last
    if config.get("cran-after-bioc"):
        std2_install = """
        std2.pkgs <- c(%s)
        lapply(std2.pkgs, std.installer)
        """ % (", ".join('"%s"' % p for p in config['cran-after-bioc']))
        env.safe_append(out_file, std2_install)
def download_dbnsfp(genomes):
    """Back compatible download target for dbNSFP, to be moved to GGD recipes.
    """
    folder_name = "variation"
    genome_dir = os.path.join(env.data_files, "genomes")
    wanted = set(["hg19", "GRCh37"])
    for orgname, gid, manager in genomes:
        # only human builds that opted in to dbNSFP
        if gid not in wanted or not manager.config.get("dbnsfp"):
            continue
        vrn_dir = os.path.join(genome_dir, orgname, gid, folder_name)
        if not env.safe_exists(vrn_dir):
            env.safe_run('mkdir -p %s' % vrn_dir)
        with cd(vrn_dir):
            _download_dbnsfp(env, gid, manager.config)
def local_append(filename, text, use_sudo=False, partial=False, escape=True, shell=False): func = use_sudo and env.safe_sudo or env.safe_run # Normalize non-list input to be a list if isinstance(text, basestring): text = [text] for line in text: regex = '^' + _escape_for_regex(line) + ('' if partial else '$') if (env.safe_exists(filename, use_sudo=use_sudo) and line and env.safe_contains(filename, regex, use_sudo=use_sudo, escape=False, shell=shell)): continue line = line.replace("'", r"'\\''") if escape else line func("echo '%s' >> %s" % (line, _expand_path(filename)))
def _ensembl_vcf(env, gid, manager):
    """Fetch ensemble vcf file (available from release 71) and do tabix indexing
    """
    fname = "%s.vcf.gz" % (manager._organism)
    download_url = manager._base_url
    section = "variation/"
    # use equality, not identity: `is` on string literals depends on
    # interning and is not a reliable comparison
    if manager._section != "standard":
        section = ""
        fname = fname.lower()
    download_url += "release-%s/%svcf/%s/%s" % (manager._release_number, section,
                                                manager._organism.lower(), fname)
    if not env.safe_exists(fname):
        shared._remote_fetch(env, download_url)
        env.safe_run("tabix -f -p vcf %s" % fname)
def _download_broad_bundle(gid, bundle_version, name, ext):
    """Fetch an uncompressed VCF resource from the GATK Broad bundle.

    Returns the local file name; missing remote files produce a warning.
    """
    broad_fname = "{name}.{gid}.vcf{ext}".format(gid=gid, name=name, ext=ext)
    fname = broad_fname.replace(".{0}".format(gid), "").replace(".sites", "")
    base_url = "ftp://gsapubftp-anonymous:@ftp.broadinstitute.org/bundle/" + \
               "{bundle}/{gid}/{fname}.gz".format(
                   bundle=bundle_version, fname=broad_fname, gid=gid)
    if env.safe_exists(fname):
        return fname
    fetched = shared._remote_fetch(env, base_url, allow_fail=True)
    if fetched:
        env.safe_run("gunzip %s" % fetched)
        env.safe_run("mv %s %s" % (broad_fname, fname))
    else:
        env.logger.warn("dbSNP resources not available for %s" % gid)
    return fname
def _download_lcrs_custom(env, gid):
    """Retrieve low complexity regions from other sources.

    mm10 from Brent Pedersen: http://figshare.com/articles/LCR_mm10_bed_gz/1180124
    """
    urls = {"mm10": "http://files.figshare.com/1688228/LCR_mm10.bed.gz"}
    out_file = "LCR.bed.gz"
    cur_url = urls.get(gid)
    if not cur_url or env.safe_exists(out_file):
        return

    def _bgzip_file(env, orig_file):
        # re-compress with bgzip so the result can be tabix indexed
        env.safe_run("zcat %s | bgzip -c > %s" % (orig_file, out_file))
        return out_file

    shared._remote_fetch(env, cur_url, fix_fn=_bgzip_file)
    env.safe_run("tabix -p vcf -f %s" % out_file)
def _download_qsignature(env, gid, gconfig):
    """Download qsignature position file to detect samples problems

    :param env
    :param gid: str genome id
    :param gconfig:
    :returns: NULL
    """
    base_url = "http://downloads.sourceforge.net/project/adamajava/qsignature.tar.bz2"
    outfile = "qsignature.vcf"
    # prepare in place for GRCh37, or for hg19 when no GRCh37 sibling
    # directory exists to symlink against
    if gid == "GRCh37" or (gid == "hg19" and not env.safe_exists("../../GRCh37")):
        if not env.safe_exists(outfile):
            zipfile = shared._remote_fetch(env, base_url, samedir=True)
            outdir = "qsignature"
            env.safe_run("mkdir -p %s" % outdir)
            env.safe_run("tar -jxf %s -C %s" % (zipfile, outdir))
            # keep only the positions file, renamed to the canonical output
            env.safe_run("mv %s/qsignature_positions.txt %s" % (outdir, outfile))
            env.safe_run("rm -rf %s" % outdir)
            env.safe_run("rm -rf %s" % zipfile)
    elif gid == "hg19":
        # symlink to GRCh37 download
        if not env.safe_exists(outfile):
            env.safe_run("ln -sf ../../GRCh37/variation/%s %s" % (outfile, outfile))
def download_dbsnp(genomes, bundle_version, dbsnp_version):
    """Download and install dbSNP variation data for supplied genomes.
    """
    folder_name = "variation"
    genome_dir = os.path.join(env.data_files, "genomes")
    for orgname, gid, manager in genomes:
        # only genomes whose configuration requests dbSNP data
        if not manager.config.get("dbsnp", False):
            continue
        vrn_dir = os.path.join(genome_dir, orgname, gid, folder_name)
        if not env.safe_exists(vrn_dir):
            env.safe_run("mkdir -p %s" % vrn_dir)
        with cd(vrn_dir):
            if gid in ["GRCh37", "hg19"]:
                _dbsnp_human(env, gid, manager, bundle_version, dbsnp_version)
            elif gid in ["mm10", "canFam3"]:
                _dbsnp_custom(env, gid)
def _make_install_script(out_file, config):
    """Write an Rscript installer for CRAN/Bioconductor packages to out_file.

    config keys used: cranrepo, biocrepo, cran, bioc, cran-after-bioc,
    update_packages.
    """
    # start from a fresh, empty script file
    if env.safe_exists(out_file):
        env.safe_run("rm -f %s" % out_file)
    env.safe_run("touch %s" % out_file)
    lib_loc = os.path.join(env.system_install, "lib", "R", "site-library")
    env.safe_sudo("mkdir -p %s" % lib_loc)
    repo_info = """
    .libPaths(c("%s"))
    library(methods)
    cran.repos <- getOption("repos")
    cran.repos["CRAN" ] <- "%s"
    options(repos=cran.repos)
    source("%s")
    """ % (lib_loc, config["cranrepo"], config["biocrepo"])
    env.safe_append(out_file, repo_info)
    # repo.installer factory: %s is replaced with an optional
    # update.packages call; maybe.install skips installed packages
    install_fn = """
    repo.installer <- function(repos, install.fn) {
      %s
      maybe.install <- function(pname) {
        if (!(pname %%in%% installed.packages()))
          install.fn(pname)
      }
    }
    """
    if config.get("update_packages", True):
        update_str = """
        update.packages(lib.loc="%s", repos=repos, ask=FALSE)
        """ % lib_loc
    else:
        update_str = "\n"
    env.safe_append(out_file, install_fn % update_str)
    std_install = """
    std.pkgs <- c(%s)
    std.installer = repo.installer(cran.repos, install.packages)
    lapply(std.pkgs, std.installer)
    """ % (", ".join('"%s"' % p for p in config['cran']))
    env.safe_append(out_file, std_install)
    if len(config.get("bioc", [])) > 0:
        bioc_install = """
        bioc.pkgs <- c(%s)
        bioc.installer = repo.installer(biocinstallRepos(), biocLite)
        lapply(bioc.pkgs, bioc.installer)
        """ % (", ".join('"%s"' % p for p in config['bioc']))
        env.safe_append(out_file, bioc_install)
    # CRAN packages that depend on Bioconductor packages go last
    if config.get("cran-after-bioc"):
        std2_install = """
        std2.pkgs <- c(%s)
        lapply(std2.pkgs, std.installer)
        """ % (", ".join('"%s"' % p for p in config['cran-after-bioc']))
        env.safe_append(out_file, std2_install)
def _download_sv_repeats(gid):
    """Retrieve telomere and centromere exclusion regions for structural variant calling.
    From Delly: https://github.com/tobiasrausch/delly
    """
    mere_url = "https://raw.githubusercontent.com/chapmanb/delly/master/human.hg19.excl.tsv"
    out_file = "sv_repeat_telomere_centromere.bed"
    if env.safe_exists(out_file):
        return

    def _select_by_gid(env, fetched):
        # hg19 uses chr-prefixed contig names; GRCh37 the bare ones
        if gid == "hg19":
            env.safe_run("grep ^chr %s > %s" % (fetched, out_file))
        else:
            assert gid == "GRCh37"
            env.safe_run("grep -v ^chr %s > %s" % (fetched, out_file))
        return out_file

    shared._remote_fetch(env, mere_url, fix_fn=_select_by_gid)
def download_dbsnp(genomes, bundle_version, dbsnp_version):
    """Download and install dbSNP variation data for supplied genomes.
    """
    folder_name = "variation"
    genome_dir = os.path.join(env.data_files, "genomes")
    # restrict to genomes whose configuration requests dbSNP data
    for (orgname, gid, manager) in ((o, g, m) for (o, g, m) in genomes
                                    if m.config.get("dbsnp", False)):
        vrn_dir = os.path.join(genome_dir, orgname, gid, folder_name)
        if not env.safe_exists(vrn_dir):
            env.safe_run('mkdir -p %s' % vrn_dir)
        with cd(vrn_dir):
            # human builds use the Broad bundle; mouse/dog use custom S3 data
            if gid in ["GRCh37", "hg19"]:
                _dbsnp_human(env, gid, manager, bundle_version, dbsnp_version)
            elif gid in ["mm10", "canFam3"]:
                _dbsnp_custom(env, gid)
def _download_broad_bundle(gid, bundle_version, name, ext):
    """Fetch an uncompressed VCF resource from the GATK Broad bundle via wget.

    Returns the local file name; missing remote files produce a warning.
    """
    broad_fname = "{name}.{gid}.vcf{ext}".format(gid=gid, name=name, ext=ext)
    # local name drops the genome build and ".sites" pieces
    fname = broad_fname.replace(".{0}".format(gid), "").replace(".sites", "")
    base_url = "ftp://gsapubftp-anonymous:@ftp.broadinstitute.org/bundle/" + \
               "{bundle}/{gid}/{fname}.gz".format(
                   bundle=bundle_version, fname=broad_fname, gid=gid)
    if not env.safe_exists(fname):
        # warn_only: a failed download (resource absent for this build)
        # should not abort the whole deployment
        with warn_only():
            dl = env.safe_run("wget -c %s" % base_url)
        if dl.succeeded:
            env.safe_run("gunzip %s" % os.path.basename(base_url))
            env.safe_run("mv %s %s" % (broad_fname, fname))
        else:
            env.logger.warn("dbSNP resources not available for %s" % gid)
    return fname
def r_library_installer(config): """Install R libraries using CRAN and Bioconductor. """ # Create an Rscript file with install details. out_file = "install_packages.R" if env.safe_exists(out_file): env.safe_run("rm -f %s" % out_file) env.safe_run("touch %s" % out_file) repo_info = """ cran.repos <- getOption("repos") cran.repos["CRAN" ] <- "%s" options(repos=cran.repos) source("%s") """ % (config["cranrepo"], config["biocrepo"]) env.safe_append(out_file, repo_info) install_fn = """ repo.installer <- function(repos, install.fn) { update.or.install <- function(pname) { if (pname %in% installed.packages()) update.packages(lib.loc=c(pname), repos=repos, ask=FALSE) else install.fn(pname) } } """ env.safe_append(out_file, install_fn) std_install = """ std.pkgs <- c(%s) std.installer = repo.installer(cran.repos, install.packages) lapply(std.pkgs, std.installer) """ % (", ".join('"%s"' % p for p in config['cran'])) env.safe_append(out_file, std_install) if len(config.get("bioc", [])) > 0: bioc_install = """ bioc.pkgs <- c(%s) bioc.installer = repo.installer(biocinstallRepos(), biocLite) lapply(bioc.pkgs, bioc.installer) """ % (", ".join('"%s"' % p for p in config['bioc'])) env.safe_append(out_file, bioc_install) if config.get("update_packages", True): final_update = """ update.packages(repos=biocinstallRepos(), ask=FALSE) update.packages(ask=FALSE) """ env.safe_append(out_file, final_update) # run the script and then get rid of it env.safe_sudo("Rscript %s" % out_file) env.safe_run("rm -f %s" % out_file)
def _download_lcrs(gid):
    """Retrieve low complexity regions from Heng Li's variant analysis paper.
    """
    lcr_url = "https://github.com/lh3/varcmp/raw/master/scripts/LCR-hs37d5.bed.gz"
    out_file = "LCR.bed.gz"
    if not env.safe_exists(out_file):
        def _fix_chrom_names(env, orig_file):
            # source file uses GRCh37 contig names; translate for hg19 by
            # dropping non-primary contigs and prefixing "chr"
            if gid == "hg19":
                convert_cmd = "| grep -v ^GL | grep -v ^NC | grep -v ^hs | sed 's/^/chr/'"
            else:
                assert gid == "GRCh37"
                convert_cmd = ""
            # re-compress with bgzip so the result can be tabix indexed
            env.safe_run("zcat %s %s | bgzip -c > %s" % (orig_file, convert_cmd, out_file))
            return out_file
        shared._remote_fetch(env, lcr_url, fix_fn=_fix_chrom_names)
        env.safe_run("tabix -p vcf -f %s" % out_file)
def _connect_native_packages(env, pkg_install, lib_install):
    """Connect native installed packages to local versions.

    This helps setup a non-sudo environment to handle software
    that needs a local version in our non-root directory tree.
    """
    bin_dir = os.path.join(env.system_install, "bin")
    exports = _get_shell_exports(env)
    current_path = env.safe_run_output("echo $PATH")
    # marker comment keeps repeat runs from duplicating the export block
    comment_line = "# CloudBioLinux PATH updates"
    if not env.safe_contains(env.shell_config, comment_line):
        env.safe_append(env.shell_config, "\n" + comment_line)
    needs_path = bin_dir not in current_path
    if (needs_path and env.safe_exists(env.shell_config)
            and not env.safe_contains(env.shell_config, exports["path"])):
        env.safe_append(env.shell_config, exports["path"])
    # local virtualenv only when python is both a package and library target
    if "python" in pkg_install and "python" in lib_install:
        _create_local_virtualenv(env.system_install)
def _make_install_script(out_file, config):
    """Write an Rscript to ``out_file`` installing CRAN/Bioconductor/GitHub packages.

    config keys used: cranrepo, biocrepo, cran, bioc, cran-after-bioc,
    github, update_packages.
    NOTE(review): a second, extended definition of this function appears
    later in the file and will shadow this one at import time — confirm
    whether this earlier copy is dead code.
    """
    # Start from a clean script file.
    if env.safe_exists(out_file):
        env.safe_run("rm -f %s" % out_file)
    env.safe_run("touch %s" % out_file)
    # Install into a site-library the deploying user can write to.
    lib_loc = os.path.join(env.system_install, "lib", "R", "site-library")
    env.safe_sudo("mkdir -p %s" % lib_loc)
    with settings(warn_only=True):
        # Best-effort chown; may fail on shared systems, hence warn_only.
        env.safe_sudo("chown -R %s %s" % (env.user, lib_loc))
    # Point library path and CRAN mirror, then bootstrap Bioconductor.
    repo_info = """
    .libPaths(c("%s"))
    library(methods)
    cran.repos <- getOption("repos")
    cran.repos["CRAN" ] <- "%s"
    options(repos=cran.repos)
    source("%s")
    """ % (lib_loc, config["cranrepo"], config["biocrepo"])
    env.safe_append(out_file, repo_info)
    # Installer closure; the bare %s below is filled in with an optional
    # update.packages() step via `install_fn % update_str`.
    install_fn = """
    repo.installer <- function(repos, install.fn, pkg_name_fn) {
      %s
      maybe.install <- function(pname) {
        check_name <- ifelse(is.null(pkg_name_fn), pname, pkg_name_fn(pname))
        if (!(is.element(check_name, installed.packages()[,1])))
          install.fn(pname)
      }
    }
    """
    if config.get("update_packages", True):
        update_str = """
        update.packages(lib.loc="%s", repos=repos, ask=FALSE)
        """ % lib_loc
    else:
        update_str = "\n"
    env.safe_append(out_file, install_fn % update_str)
    # Standard CRAN packages.
    std_install = """
    std.pkgs <- c(%s)
    std.installer = repo.installer(cran.repos, install.packages, NULL)
    lapply(std.pkgs, std.installer)
    """ % (", ".join('"%s"' % p for p in config['cran']))
    env.safe_append(out_file, std_install)
    # Bioconductor packages, if any are configured.
    if len(config.get("bioc", [])) > 0:
        bioc_install = """
    bioc.pkgs <- c(%s)
    bioc.installer = repo.installer(biocinstallRepos(), biocLite, NULL)
    lapply(bioc.pkgs, bioc.installer)
    """ % (", ".join('"%s"' % p for p in config['bioc']))
        env.safe_append(out_file, bioc_install)
    # CRAN packages that must install after Bioconductor dependencies.
    if config.get("cran-after-bioc"):
        std2_install = """
    std2.pkgs <- c(%s)
    lapply(std2.pkgs, std.installer)
    """ % (", ".join('"%s"' % p for p in config['cran-after-bioc']))
        env.safe_append(out_file, std2_install)
    # GitHub packages via devtools; get_pkg_name extracts the bare package
    # name from a "user/repo@ref" specification for the installed check.
    if config.get("github"):
        dev_install = """
    library(devtools)
    github.pkgs <- c(%s)
    get_pkg_name <- function(orig) {
      unlist(strsplit(unlist(strsplit(orig, "/"))[2], "@"))[1]
    }
    github_installer = repo.installer(NULL, install_github, get_pkg_name)
    lapply(github.pkgs, github_installer)
    """ % (", ".join('"%s"' % p for p in config['github']))
        env.safe_append(out_file, dev_install)
def _download_cosmic(gid):
    """Retrieve COSMIC variants prepared for MuTect (b37/GRCh37 only).

    Skips the download when the genome id is not GRCh37 or the file is
    already present.
    """
    base_url = "http://www.broadinstitute.org/cancer/cga/sites/default/files/data/tools/mutect/"
    base_name = "b37_cosmic_v54_120711.vcf"
    if gid in ["GRCh37"] and not env.safe_exists(base_name):
        # base_url already ends with "/": join without inserting another
        # slash (the old "{0}/{1}" format produced "mutect//b37_...").
        # Use the shared fetch helper for consistency with the other
        # download functions instead of shelling out to wget directly.
        shared._remote_fetch(env, "{0}{1}".format(base_url, base_name))
def _make_install_script(out_file, config):
    """Write an Rscript to ``out_file`` installing CRAN/Bioconductor/GitHub packages.

    Extended version supporting version-pinned GitHub installs
    ("user/repo@ref;version" specifications) and an optional biocrepo.
    config keys used: cranrepo, biocrepo, cran, bioc, cran-after-bioc,
    github, update_packages.
    """
    # Start from a clean script file.
    if env.safe_exists(out_file):
        env.safe_run("rm -f %s" % out_file)
    env.safe_run("touch %s" % out_file)
    # Install into a site-library the deploying user can write to.
    lib_loc = os.path.join(env.system_install, "lib", "R", "site-library")
    env.safe_sudo("mkdir -p %s" % lib_loc)
    with settings(warn_only=True):
        # Best-effort chown; may fail on shared systems, hence warn_only.
        env.safe_sudo("chown -R %s %s" % (env.user, lib_loc))
    repo_info = """
    .libPaths(c("%s"))
    library(methods)
    cran.repos <- getOption("repos")
    cran.repos["CRAN" ] <- "%s"
    options(repos=cran.repos)
    """ % (lib_loc, config["cranrepo"])
    # Bioconductor bootstrap is optional in this version.
    if config.get("biocrepo"):
        repo_info += """\nsource("%s")\n""" % config["biocrepo"]
    env.safe_append(out_file, repo_info)
    # Installer closure; the bare %s below is filled in with an optional
    # update.packages() step via `install_fn % update_str`. When pkg_name_fn
    # is supplied it returns c(pkg=, version=, pname=) and the install runs
    # only when the package is missing or its installed version differs.
    install_fn = """
    repo.installer <- function(repos, install.fn, pkg_name_fn) {
      %s
      maybe.install <- function(pname) {
        if (!is.null(pkg_name_fn)) {
          pinfo <- pkg_name_fn(pname)
          ipkgs <- installed.packages()[,3][pinfo["pkg"]]
          if (is.na(ipkgs[pinfo["pkg"]]) || pinfo["version"] != ipkgs[pinfo["pkg"]])
            try(install.fn(pinfo["pname"]))
        }
        else if (!(is.element(pname, installed.packages()[,1])))
          install.fn(pname)
      }
    }
    """
    if config.get("update_packages", True):
        update_str = """
        update.packages(lib.loc="%s", repos=repos, ask=FALSE)
        """ % lib_loc
    else:
        update_str = "\n"
    env.safe_append(out_file, install_fn % update_str)
    # Standard CRAN packages; `or []` guards against an explicit None value.
    if len(config.get("cran") or []) > 0:
        std_install = """
    std.pkgs <- c(%s)
    std.installer = repo.installer(cran.repos, install.packages, NULL)
    lapply(std.pkgs, std.installer)
    """ % (", ".join('"%s"' % p for p in config['cran']))
        env.safe_append(out_file, std_install)
    # Bioconductor packages.
    if len(config.get("bioc") or []) > 0:
        bioc_install = """
    bioc.pkgs <- c(%s)
    bioc.installer = repo.installer(biocinstallRepos(), biocLite, NULL)
    lapply(bioc.pkgs, bioc.installer)
    """ % (", ".join('"%s"' % p for p in config['bioc']))
        env.safe_append(out_file, bioc_install)
    # CRAN packages that must install after Bioconductor dependencies.
    if config.get("cran-after-bioc"):
        std2_install = """
    std2.pkgs <- c(%s)
    lapply(std2.pkgs, std.installer)
    """ % (", ".join('"%s"' % p for p in config['cran-after-bioc']))
        env.safe_append(out_file, std2_install)
    # GitHub packages via devtools. Specs look like "user/repo@ref;version";
    # get_pkg_name splits out the package name, pinned version and the raw
    # install target so maybe.install can do version-aware reinstalls.
    if config.get("github"):
        dev_install = """
    library(devtools)
    github.pkgs <- c(%s)
    get_pkg_name <- function(orig) {
      c(pkg=unlist(strsplit(unlist(strsplit(orig, "/"))[2], "@"))[1],
        version=unlist(strsplit(orig, ";"))[2],
        pname=unlist(strsplit(orig, ";"))[1])
    }
    gh_install <- function(name) {
      install_github(name, upgrade_dependencies=FALSE)
    }
    github_installer = repo.installer(NULL, gh_install, get_pkg_name)
    lapply(github.pkgs, github_installer)
    """ % (", ".join('"%s"' % p for p in config['github']))
        env.safe_append(out_file, dev_install)
def _download_cosmic(gid):
    """Retrieve COSMIC variants prepared for MuTect (b37/GRCh37 only).

    Skips the download when the genome id is not GRCh37 or the file is
    already present.
    """
    base_url = "http://www.broadinstitute.org/cancer/cga/sites/default/files/data/tools/mutect/"
    base_name = "b37_cosmic_v54_120711.vcf"
    if gid in ["GRCh37"] and not env.safe_exists(base_name):
        # base_url already ends with "/": join without inserting another
        # slash (the old "{0}/{1}" format produced "mutect//b37_...").
        shared._remote_fetch(env, "{0}{1}".format(base_url, base_name))