def download(self, seq_dir):
    """Fetch the compressed genome and unpack it next to the archive.

    The archive named by ``self._get_file`` is fetched only when
    ``self._exists`` does not find it; the ``<name>.fa`` FASTA is produced
    with ``gunzip -c`` (the archive is kept) only when it is missing.

    Returns a tuple ``(fasta_name, [archive_name])``.
    """
    archive = self._get_file
    fasta_name = "%s.fa" % self._name
    archive_present = self._exists(archive, seq_dir)
    if not archive_present:
        shared._remote_fetch(None, "%s%s" % (self._url, archive))
    fasta_present = self._exists(fasta_name, seq_dir)
    if not fasta_present:
        unpack_cmd = "gunzip -c %s > %s" % (archive, fasta_name)
        subprocess.check_call(unpack_cmd, shell=True)
    return fasta_name, [archive]
def _data_uniref():
    """Retrieve and index UniRef databases for protein searches.

    http://www.ebi.ac.uk/uniref/

    For uniref50, uniref90 and uniref100 in turn: make sure a working
    directory exists below ``env.data_files/uniref``, download and gunzip
    the FASTA file together with its release note when the unpacked FASTA is
    not present, then pass the directory and FASTA name to
    ``_index_blast_db`` with type ``"prot"``.

    Open questions carried over from the original notes: are other indexes
    desired, should this be organized by program like the genome data, and
    the release note should eventually be checked so older versions are
    replaced automatically.
    """
    site = "ftp://ftp.uniprot.org"
    url_template = site + "/pub/databases/uniprot/" \
                          "current_release/uniref/%s/%s"
    for db_name in ("uniref50", "uniref90", "uniref100"):
        db_dir = os.path.join(env.data_files, "uniref", db_name)
        if not env.safe_exists(db_dir):
            env.safe_run("mkdir -p %s" % db_dir)
        db_url = url_template % (db_name, db_name)
        gz_url = db_url + ".fasta.gz"
        # Strip the trailing ".gz" to get the name of the unpacked FASTA.
        fasta_name, _ = os.path.splitext(os.path.basename(gz_url))
        with cd(db_dir):
            if not env.safe_exists(fasta_name):
                fetched = shared._remote_fetch(env, gz_url)
                env.safe_run("gunzip %s" % fetched)
                shared._remote_fetch(env, db_url + ".release_note")
        _index_blast_db(db_dir, fasta_name, "prot")
def download(self, seq_dir):
    """Download, gunzip and rename the target file to ``<name>.fa``.

    Nothing is done when ``self._exists`` already finds the FASTA file.

    Returns a tuple ``(fasta_name, [])``.
    """
    fasta_name = "%s.fa" % self._name
    if self._exists(fasta_name, seq_dir):
        return fasta_name, []
    shared._remote_fetch(None, "%s%s.gz" % (self._ftp_url, self._target))
    # Unpack in place, then move the unpacked target to its final name.
    for shell_cmd in ("gunzip %s.gz" % self._target,
                      "mv %s %s" % (self._target, fasta_name)):
        subprocess.check_call(shell_cmd, shell=True)
    return fasta_name, []
def download(self, seq_dir):
    """Fetch the compressed genome and unpack it next to the archive.

    The archive named by ``self._get_file`` is fetched only when
    ``self._exists`` does not find it; the ``<name>.fa`` FASTA is produced
    with ``gunzip -c`` (the archive is kept) only when it is missing.

    Returns a tuple ``(fasta_name, [archive_name])``.
    """
    archive = self._get_file
    fasta_name = "%s.fa" % self._name
    archive_present = self._exists(archive, seq_dir)
    if not archive_present:
        shared._remote_fetch(env, "%s%s" % (self._url, archive))
    fasta_present = self._exists(fasta_name, seq_dir)
    if not fasta_present:
        env.safe_run("gunzip -c %s > %s" % (archive, fasta_name))
    return fasta_name, [archive]
def download(self, seq_dir):
    """Download, gunzip and rename the target file to ``<name>.fa``.

    Nothing is done when ``self._exists`` already finds the FASTA file.

    Returns a tuple ``(fasta_name, [])``.
    """
    fasta_name = "%s.fa" % self._name
    if self._exists(fasta_name, seq_dir):
        return fasta_name, []
    shared._remote_fetch(env, "%s%s.gz" % (self._ftp_url, self._target))
    # Unpack in place, then move the unpacked target to its final name.
    for shell_cmd in ("gunzip %s.gz" % self._target,
                      "mv %s %s" % (self._target, fasta_name)):
        env.safe_run(shell_cmd)
    return fasta_name, []
def _download_background_vcf(gid): """Download background file of variant to use in calling. """ base_url = "https://s3.amazonaws.com/biodata/variants" base_name = "background-diversity-1000g.vcf" if gid in ["GRCh37"] and not env.safe_exists("{0}.gz".format(base_name)): for ext in ["gz", "gz.tbi"]: shared._remote_fetch(env, "{0}/{1}.{2}".format(base_url, base_name, ext))
def download(self, seq_dir):
    """Fetch every file in ``self._to_get`` and append it to ``<name>.fa``.

    A file that ``self._exists`` already finds is skipped entirely (neither
    downloaded nor appended again).

    Returns a tuple ``(fasta_name, [])``.
    """
    fasta_name = "%s.fa" % self._name
    for part in self._to_get:
        part_url = self._base_url + part
        if self._exists(part, seq_dir):
            continue
        shared._remote_fetch(env, part_url)
        # ">>" appends, so the pieces are concatenated in _to_get order.
        env.safe_run("gunzip -c %s >> %s" % (part, fasta_name))
    return fasta_name, []
def download(self, seq_dir):
    """Fetch every file in ``self._to_get`` and append it to ``<name>.fa``.

    A file that ``self._exists`` already finds is skipped entirely (neither
    downloaded nor appended again).

    Returns a tuple ``(fasta_name, [])``.
    """
    fasta_name = "%s.fa" % self._name
    for part in self._to_get:
        part_url = self._base_url + part
        if self._exists(part, seq_dir):
            continue
        shared._remote_fetch(None, part_url)
        # ">>" appends, so the pieces are concatenated in _to_get order.
        append_cmd = "gunzip -c %s >> %s" % (part, fasta_name)
        subprocess.check_call(append_cmd, shell=True)
    return fasta_name, []
def _download_ensembl_gtf(env, manager): """Fetch ensembl gtf file for coresponding genome - release """ fname = "%s.%s.%s.gtf" % (manager._organism, manager._name, manager._release_number) download_url = manager._base_url download_url += "release-%s/gtf/%s/%s" % (manager._release_number, manager._organism.lower(), fname) if not env.safe_exists(fname): shared._remote_fetch(env, download_url + ".gz") env.safe_run("gunzip %s.gz" % fname)
def _download_executables(env, base_url, tools):
    """Download pre-built executables and copy them into the bin directory.

    Working inside a temporary directory, each tool is fetched from
    ``base_url + tool`` and copied (with sudo) into the directory returned
    by ``shared._get_bin_dir``. A tool is skipped when it is already in that
    directory or when ``shared._executable_not_on_path`` reports it as
    available.
    """
    bin_dir = shared._get_bin_dir(env)
    with _make_tmp_dir() as tmp_dir:
        with cd(tmp_dir):
            for tool in tools:
                installed_path = os.path.join(bin_dir, tool)
                if env.safe_exists(installed_path):
                    continue
                if not shared._executable_not_on_path(tool):
                    continue
                shared._remote_fetch(env, "%s%s" % (base_url, tool))
                env.safe_sudo("cp -f %s %s" % (tool, bin_dir))
def download(self, seq_dir):
    """Fetch the compressed genome and unpack it into ``<name>.fa``.

    The archive named by ``self._get_file`` is fetched only when
    ``self._exists`` does not find it, and the FASTA is produced with
    ``gunzip -c`` (keeping the archive) only when it is missing.

    Returns a tuple ``(fasta_name, [archive_name])``.

    Raises:
        NotImplementedError: when ``self._convert_to_ucsc`` is set, because
            renaming chromosomes to UCSC style is not implemented. This is
            checked first so no download or decompression work is done for
            a request that cannot succeed.
    """
    if self._convert_to_ucsc:
        raise NotImplementedError("Replace with chr")
    genome_file = "%s.fa" % self._name
    if not self._exists(self._get_file, seq_dir):
        shared._remote_fetch(env, "%s%s" % (self._url, self._get_file))
    if not self._exists(genome_file, seq_dir):
        env.safe_run("gunzip -c %s > %s" % (self._get_file, genome_file))
    return genome_file, [self._get_file]
def _dbsnp_custom(env, gid): """Retrieve resources for dbsnp builds from custom S3 biodata bucket. """ remote_dir = "https://s3.amazonaws.com/biodata/variants/" files = {"mm10": ["mm10-dbSNP-2013-09-12.vcf.gz"], "canFam3": ["canFam3-dbSNP-2014-04-10.vcf.gz"]} for f in files[gid]: for ext in ["", ".tbi"]: fname = f + ext if not env.safe_exists(fname): shared._remote_fetch(env, "%s%s" % (remote_dir, fname))
def install_leiningen(env):
    """Clojure tool for project configuration and automation.

    http://github.com/technomancy/leiningen

    Downloads the stable ``lein`` script into a temporary directory, marks
    it executable, moves it (with sudo) into ``<system_install>/bin`` and
    then runs it once from there.
    """
    lein_url = "https://raw.github.com/technomancy/leiningen/stable/bin/lein"
    target_bin = os.path.join(env.system_install, "bin")
    with _make_tmp_dir() as tmp_dir:
        with cd(tmp_dir):
            shared._remote_fetch(env, lein_url)
            env.safe_run("chmod a+rwx lein")
            env.safe_sudo("mv lein %s" % target_bin)
            env.safe_run("%s/lein" % target_bin)
def download(self, seq_dir):
    """Build ``<name>.fa`` from the individual records in ``self._refs``.

    When the FASTA is missing, each reference is fetched, its header line is
    rewritten with ``sed`` to ``>ref``, and all ``*.fasta`` files are then
    concatenated through a temporary ``.txt`` file before the intermediate
    ``*.fasta`` / ``*.bak`` files are removed.

    Returns a tuple ``(fasta_name, [])``.
    """
    fasta_name = "%s.fa" % self._name
    if self._exists(fasta_name, seq_dir):
        return fasta_name, []

    def _sh(command):
        subprocess.check_call(command, shell=True)

    for ref in self._refs:
        shared._remote_fetch(None, self._base_url % ref)
        _sh("ls -l")
        _sh(r"sed -i 's/^>.*$/>%s/' %s.fasta" % (ref, ref))
    # Concatenate under a non-".fasta" name so the glob does not match the output.
    staging = fasta_name.replace(".fa", ".txt")
    _sh("cat *.fasta > %s" % staging)
    _sh("rm -f *.fasta")
    _sh("rm -f *.bak")
    _sh("mv %s %s" % (staging, fasta_name))
    return fasta_name, []
def download(self, seq_dir):
    """Build ``<name>.fa`` from the individual records in ``self._refs``.

    When the FASTA is missing, each reference is fetched, its header line is
    rewritten through ``env.safe_sed`` to ``>ref``, and all ``*.fasta``
    files are then concatenated through a temporary ``.txt`` file before
    the intermediate ``*.fasta`` / ``*.bak`` files are removed.

    Returns a tuple ``(fasta_name, [])``.
    """
    fasta_name = "%s.fa" % self._name
    if self._exists(fasta_name, seq_dir):
        return fasta_name, []
    for ref in self._refs:
        shared._remote_fetch(env, self._base_url % ref)
        env.safe_run("ls -l")
        env.safe_sed('%s.fasta' % ref, '^>.*$', '>%s' % ref, '1')
    # Concatenate under a non-".fasta" name so the glob does not match the output.
    staging = fasta_name.replace(".fa", ".txt")
    for shell_cmd in ("cat *.fasta > %s" % staging,
                      "rm -f *.fasta",
                      "rm -f *.bak",
                      "mv %s %s" % (staging, fasta_name)):
        env.safe_run(shell_cmd)
    return fasta_name, []
def _ensembl_vcf(env, gid, manager): """Fetch ensemble vcf file (available from release 71) and do tabix indexing """ fname = "%s.vcf.gz" % (manager._organism) download_url = manager._base_url section = "variation/" if not manager._section is "standard": section = "" fname = fname.lower() download_url += "release-%s/%svcf/%s/%s" % (manager._release_number, section, manager._organism.lower(), fname) if not env.safe_exists(fname): shared._remote_fetch(env, download_url) env.safe_run("tabix -f -p vcf %s" % fname)
def _download_lcrs_custom(env, gid): """Retrieve low complexity regions from other sources. mm10 from Brent Pedersen: http://figshare.com/articles/LCR_mm10_bed_gz/1180124 """ urls = {"mm10": "http://files.figshare.com/1688228/LCR_mm10.bed.gz"} out_file = "LCR.bed.gz" cur_url = urls.get(gid) if cur_url and not env.safe_exists(out_file): def _bgzip_file(env, orig_file): env.safe_run("zcat %s | bgzip -c > %s" % (orig_file, out_file)) return out_file shared._remote_fetch(env, cur_url, fix_fn=_bgzip_file) env.safe_run("tabix -p vcf -f %s" % out_file)
def _download_sv_repeats(gid):
    """Retrieve telomere and centromere exclusion regions for structural variant calling.

    From Delly: https://github.com/tobiasrausch/delly

    The downloaded table is filtered through the ``fix_fn`` hook of
    ``shared._remote_fetch``: lines starting with ``chr`` are kept for hg19,
    and the remaining lines are kept for GRCh37 (any other ``gid`` trips an
    assertion inside the hook).
    """
    mere_url = "https://raw.githubusercontent.com/chapmanb/delly/master/human.hg19.excl.tsv"
    out_file = "sv_repeat_telomere_centromere.bed"

    def _select_by_gid(env, orig_file):
        if gid == "hg19":
            grep_cmd = "grep ^chr %s > %s"
        else:
            assert gid == "GRCh37"
            grep_cmd = "grep -v ^chr %s > %s"
        env.safe_run(grep_cmd % (orig_file, out_file))
        return out_file

    if env.safe_exists(out_file):
        return
    shared._remote_fetch(env, mere_url, fix_fn=_select_by_gid)
def _download_cosmic(gid): """Prepared versions of COSMIC, pre-sorted and indexed. utils/prepare_cosmic.py handles the work of creating the VCFs from standard COSMIC resources. """ base_url = "https://s3.amazonaws.com/biodata/variants" version = "v68" supported = ["hg19", "GRCh37"] if gid in supported: url = "%s/cosmic-%s-%s.vcf.gz" % (base_url, version, gid) fname = os.path.basename(url) if not env.safe_exists(fname): shared._remote_fetch(env, url) if not env.safe_exists(fname + ".tbi"): shared._remote_fetch(env, url + ".tbi")
def _download_dbnsfp(env, gid, gconfig): """Download and prepare dbNSFP functional prediction resources if configured. Feeds into VEP for annotating VCF files: https://sites.google.com/site/jpopgen/dbNSFP https://github.com/ensembl-variation/VEP_plugins/blob/master/dbNSFP.pm """ version = "2.5" url = "http://dbnsfp.houstonbioinformatics.org/dbNSFPzip/dbNSFPv%s.zip" % version if gconfig.get("dbnsfp"): outfile = "dbNSFP_v%s.gz" % (version) if gid == "GRCh37": # download and prepare bgzipped output file if not env.safe_exists(outfile): zipfile = shared._remote_fetch(env, url, samedir=True) outdir = "dbNSFPv%s" % version env.safe_run("mkdir -p %s" % outdir) env.safe_run("unzip %s -d %s" % (zipfile, outdir)) env.safe_run("cat %s/dbNSFP*_variant.chr* | bgzip -c > %s" % (outdir, outfile)) env.safe_run("rm -f %s/* && rmdir %s" % (outdir, outdir)) env.safe_run("rm -f %s" % (zipfile)) if not env.safe_exists(outfile + ".tbi"): env.safe_run("tabix -s 1 -b 2 -e 2 -c '#' %s" % outfile) elif gid == "hg19": # symlink to GRCh37 download if not env.safe_exists(outfile): env.safe_run("ln -sf ../../GRCh37/variation/%s %s" % (outfile, outfile)) if not env.safe_exists(outfile + ".tbi"): env.safe_run("ln -sf ../../GRCh37/variation/%s.tbi %s.tbi" % (outfile, outfile))
def _download_ancestral(env, gid, gconfig): """Download ancestral genome sequence for loss of function evaluation. Used by LOFTEE VEP plugin: https://github.com/konradjk/loftee """ base_url = "http://www.broadinstitute.org/~konradk/loftee/human_ancestor.fa.rz" if gid == "GRCh37": for ext in ["", ".fai"]: outfile = os.path.basename(base_url) + ext if not env.safe_exists(outfile): shared._remote_fetch(env, base_url + ext, samedir=True) elif gid == "hg19": # symlink to GRCh37 download for ext in ["", ".fai"]: outfile = os.path.basename(base_url) + ext if not env.safe_exists(outfile): env.safe_run("ln -sf ../../GRCh37/variation/%s %s" % (outfile, outfile))
def _download_lcrs(gid):
    """Retrieve low complexity regions from Heng Li's variant analysis paper.

    When ``LCR.bed.gz`` is not present, the hs37d5 BED file is fetched and
    rewritten through the ``fix_fn`` hook of ``shared._remote_fetch``: for
    hg19 the GL/NC/hs contigs are dropped and ``chr`` is prefixed to every
    line; for GRCh37 the content is only re-compressed with bgzip (any other
    ``gid`` trips an assertion inside the hook). The result is then tabix
    indexed.
    """
    lcr_url = "https://github.com/lh3/varcmp/raw/master/scripts/LCR-hs37d5.bed.gz"
    out_file = "LCR.bed.gz"
    if not env.safe_exists(out_file):
        def _fix_chrom_names(env, orig_file):
            if gid == "hg19":
                convert_cmd = "| grep -v ^GL | grep -v ^NC | grep -v ^hs | sed 's/^/chr/'"
            else:
                assert gid == "GRCh37"
                convert_cmd = ""
            env.safe_run("zcat %s %s | bgzip -c > %s" % (orig_file, convert_cmd, out_file))
            return out_file
        shared._remote_fetch(env, lcr_url, fix_fn=_fix_chrom_names)
        # The file is BED, so index it with the BED preset (0-based start and
        # end columns) rather than the VCF preset used previously.
        env.safe_run("tabix -p bed -f %s" % out_file)
def _data_liftover(lift_over_genomes):
    """Download chain files for running liftOver.

    Does not install liftOver binaries automatically.

    Every ordered pair of distinct genomes is tried. A chain file that is
    not yet unpacked in the liftOver directory is fetched with
    ``allow_fail=True``; when the fetch yields a file it is gunzipped and
    registered in ``liftOver.loc``. Pairs whose chain file already exists
    are left untouched and not re-registered.
    """
    lo_dir = os.path.join(env.data_files, "liftOver")
    if not env.safe_exists(lo_dir):
        env.safe_run("mkdir %s" % lo_dir)
    url_template = "ftp://hgdownload.cse.ucsc.edu/goldenPath/%s/liftOver/%s"
    name_template = "%sTo%s.over.chain.gz"
    for source in lift_over_genomes:
        targets = [g for g in lift_over_genomes if g != source]
        for target in targets:
            # UCSC capitalizes the first letter of the target genome name.
            target_cap = target[0].upper() + target[1:]
            chain_gz = name_template % (source, target_cap)
            chain_name = os.path.splitext(chain_gz)[0]
            fetched = None
            with cd(lo_dir):
                if not env.safe_exists(chain_name):
                    fetched = shared._remote_fetch(
                        env, "%s" % (url_template % (source, chain_gz)),
                        allow_fail=True)
                    # Lift over back and forths don't always exist
                    # Only move forward if we found the file
                    if fetched:
                        env.safe_run("gunzip %s" % fetched)
            if fetched:
                loc_entry = [source, target, os.path.join(lo_dir, chain_name)]
                galaxy.update_loc_file("liftOver.loc", loc_entry)
def download(self, seq_dir):
    """Build ``<name>.fa`` from the individual records in ``self._refs``.

    When the FASTA is missing, each reference is fetched, its header line is
    rewritten with ``sed`` to ``>ref``, and all ``*.fasta`` files are then
    concatenated through a temporary ``.txt`` file before the intermediate
    ``*.fasta`` / ``*.bak`` files are removed.

    Returns a tuple ``(fasta_name, [])``.
    """
    fasta_name = "%s.fa" % self._name
    if self._exists(fasta_name, seq_dir):
        return fasta_name, []

    def _sh(command):
        subprocess.check_call(command, shell=True)

    for ref in self._refs:
        shared._remote_fetch(None, self._base_url % ref)
        _sh("ls -l")
        _sh(r"sed -i 's/^>.*$/>%s/' %s.fasta" % (ref, ref))
    # Concatenate under a non-".fasta" name so the glob does not match the output.
    staging = fasta_name.replace(".fa", ".txt")
    _sh("cat *.fasta > %s" % staging)
    _sh("rm -f *.fasta")
    _sh("rm -f *.bak")
    _sh("mv %s %s" % (staging, fasta_name))
    return fasta_name, []
def install_anaconda(env):
    """Pre-packaged Anaconda Python installed from Continuum.

    http://docs.continuum.io/anaconda/index.html

    Picks the Linux or MacOSX installer from ``env.distribution`` and, when
    ``<system_install>/anaconda`` does not exist, downloads the installer,
    feeds it the answers non-interactively, hands ownership of the install
    to ``env.user``, adds the install's ``bin`` to the shell config once,
    and finally swaps the bundled curl for pip via conda.

    Raises:
        ValueError: for a distribution that is neither a known Linux flavor
            nor macosx.
    """
    version = "2.0.0"
    linux_flavors = ["ubuntu", "centos", "scientificlinux", "debian", "arch", "suse"]
    outdir = os.path.join(env.system_install, "anaconda")
    distribution = env.distribution
    if distribution in linux_flavors:
        platform = "Linux"
    elif distribution in ["macosx"]:
        platform = "MacOSX"
    else:
        raise ValueError("Unexpected distribution: %s" % env.distribution)
    url = "http://09c8d0b2229f813c1b93-c95ac804525aac4b6dba79b00b39d1d3.r79.cf1.rackcdn.com/" \
          "Anaconda-%s-%s-x86_64.sh" % (version, platform)
    if env.safe_exists(outdir):
        return
    with _make_tmp_dir() as tmp_dir:
        with cd(tmp_dir):
            installer = shared._remote_fetch(env, url)
            # Replace the pager so the license text does not block the run.
            env.safe_sed(os.path.basename(url), "more <<EOF", "cat <<EOF")
            env.safe_sudo("echo -e '\nyes\n%s\nyes\n' | bash %s" % (outdir, installer))
            env.safe_sudo("chown -R %s %s" % (env.user, outdir))
            marker = "# added by Ananconda %s installer" % version
            if not env.safe_contains(env.shell_config, marker):
                env.safe_append(env.shell_config, marker)
                env.safe_append(env.shell_config, "export PATH=%s/bin:$PATH" % outdir)
            # remove curl library with broken certificates
            env.safe_run("%s/bin/conda remove --yes curl" % outdir)
            env.safe_run("%s/bin/conda install --yes pip" % outdir)
def _download_dbnsfp(env, gid, gconfig): """Download and prepare dbNSFP functional prediction resources if configured. Feeds into VEP for annotating VCF files: https://sites.google.com/site/jpopgen/dbNSFP https://github.com/ensembl-variation/VEP_plugins/blob/master/dbNSFP.pm """ version = "2.8" url = "https://onedrive.live.com/download?cid=0D359D171E382137&resid=D359D171E382137%2154761&authkey=AFm7prRqSLLLC9g" dl_file = "dbNSFPv%s.zip" % version if gconfig.get("dbnsfp"): outfile = "dbNSFP_v%s.gz" % (version) if gid == "GRCh37" or (gid == "hg19" and not env.safe_exists("../../GRCh37")): if not env.safe_exists(outfile): zipfile = shared._remote_fetch(env, url, out_file=dl_file, samedir=True) outdir = "dbNSFPv%s" % version env.safe_run("mkdir -p %s" % outdir) env.safe_run("unzip %s -d %s" % (zipfile, outdir)) env.safe_run("cat %s/dbNSFP*_variant.chr* | bgzip -c > %s" % (outdir, outfile)) env.safe_run("rm -f %s/* && rmdir %s" % (outdir, outdir)) env.safe_run("rm -f %s" % (zipfile)) if not env.safe_exists(outfile + ".tbi"): env.safe_run("tabix -s 1 -b 2 -e 2 -c '#' %s" % outfile) elif gid == "hg19": # symlink to GRCh37 download if not env.safe_exists(outfile): env.safe_run("ln -sf ../../GRCh37/variation/%s %s" % (outfile, outfile)) if not env.safe_exists(outfile + ".tbi"): env.safe_run("ln -sf ../../GRCh37/variation/%s.tbi %s.tbi" % (outfile, outfile))
def _download_dbnsfp(env, gid, gconfig): """Download and prepare dbNSFP functional prediction resources if configured. Feeds into VEP for annotating VCF files: https://sites.google.com/site/jpopgen/dbNSFP https://github.com/ensembl-variation/VEP_plugins/blob/master/dbNSFP.pm """ version = "2.5" url = "http://dbnsfp.houstonbioinformatics.org/dbNSFPzip/dbNSFPv%s.zip" % version if gconfig.get("dbnsfp"): outfile = "dbNSFP_v%s.gz" % (version) if gid == "GRCh37": # download and prepare bgzipped output file if not env.safe_exists(outfile): zipfile = shared._remote_fetch(env, url, samedir=True) outdir = "dbNSFPv%s" % version env.safe_run("mkdir -p %s" % outdir) env.safe_run("unzip %s -d %s" % (zipfile, outdir)) env.safe_run("cat %s/dbNSFP*_variant.chr* | bgzip -c > %s" % (outdir, outfile)) env.safe_run("rm -f %s/* && rmdir %s" % (outdir, outdir)) env.safe_run("rm -f %s" % (zipfile)) if not env.safe_exists(outfile + ".tbi"): env.safe_run("tabix -s 1 -b 2 -e 2 -c '#' %s" % outfile) elif gid == "hg19": # symlink to GRCh37 download if not env.safe_exists(outfile): env.safe_run("ln -sf ../../GRCh37/variation/%s %s" % (outfile, outfile)) if not env.safe_exists(outfile + ".tbi"): env.safe_run("ln -sf ../../GRCh37/variation/%s.tbi %s.tbi" % (outfile, outfile))
def _remove_werror_get_boost(env):
    """Strip ``-Werror`` from ``configure`` and unpack Boost 1.47.0 headers.

    After editing ``configure`` in the current directory, the Boost tarball
    is fetched and extracted, and a ``boost`` symlink is created pointing at
    the extracted ``boost_1_47_0/boost`` directory.
    """
    # http://osdir.com/ml/abyss-users-science/2011-10/msg00108.html
    boost_url = "http://downloads.sourceforge.net/project/boost/boost/1.47.0/boost_1_47_0.tar.bz2"
    env.safe_sed("configure", " -Werror", "")
    tarball = shared._remote_fetch(env, boost_url)
    for shell_cmd in ("tar jxf %s" % tarball,
                      "ln -s boost_1_47_0/boost boost"):
        env.safe_run(shell_cmd)
def _download_ancestral(env, gid, gconfig): """Download ancestral genome sequence for loss of function evaluation. Used by LOFTEE VEP plugin: https://github.com/konradjk/loftee """ base_url = "http://www.broadinstitute.org/~konradk/loftee/human_ancestor.fa.rz" if gid == "GRCh37": for ext in ["", ".fai"]: outfile = os.path.basename(base_url) + ext if not env.safe_exists(outfile): shared._remote_fetch(env, base_url + ext, samedir=True) elif gid == "hg19": # symlink to GRCh37 download for ext in ["", ".fai"]: outfile = os.path.basename(base_url) + ext if not env.safe_exists(outfile): env.safe_run("ln -sf ../../GRCh37/variation/%s %s" % (outfile, outfile))
def _data_liftover(lift_over_genomes):
    """Download chain files for running liftOver.

    Does not install liftOver binaries automatically.

    Every ordered pair of distinct genomes is tried. A chain file that is
    not yet unpacked in the liftOver directory is fetched with
    ``allow_fail=True``; when the fetch yields a file it is gunzipped and
    registered in ``liftOver.loc``. Pairs whose chain file already exists
    are left untouched and not re-registered.
    """
    lo_dir = os.path.join(env.data_files, "liftOver")
    if not env.safe_exists(lo_dir):
        env.safe_run("mkdir %s" % lo_dir)
    url_template = "ftp://hgdownload.cse.ucsc.edu/goldenPath/%s/liftOver/%s"
    name_template = "%sTo%s.over.chain.gz"
    for source in lift_over_genomes:
        targets = [g for g in lift_over_genomes if g != source]
        for target in targets:
            # UCSC capitalizes the first letter of the target genome name.
            target_cap = target[0].upper() + target[1:]
            chain_gz = name_template % (source, target_cap)
            chain_name = os.path.splitext(chain_gz)[0]
            fetched = None
            with cd(lo_dir):
                if not env.safe_exists(chain_name):
                    fetched = shared._remote_fetch(
                        env, "%s" % (url_template % (source, chain_gz)),
                        allow_fail=True)
                    # Lift over back and forths don't always exist
                    # Only move forward if we found the file
                    if fetched:
                        env.safe_run("gunzip %s" % fetched)
            if fetched:
                loc_entry = [source, target, os.path.join(lo_dir, chain_name)]
                galaxy.update_loc_file("liftOver.loc", loc_entry)
def install_tassel(env):
    """TASSEL: evaluate traits associations, evolutionary patterns, and linkage disequilibrium.

    http://www.maizegenetics.net/index.php?option=com_content&task=view&id=89&/Itemid=119

    When ``_symlinked_java_version_dir`` returns an install directory, the
    standalone zip for the pinned build is downloaded and unpacked in a
    temporary directory. The two Perl launchers are patched so ``$top`` is
    resolved from the script's real location, everything is moved into the
    install directory, and the launchers are symlinked into
    ``<system_install>/bin``.
    """
    version = "5"
    build_id = "1140d3fceb75"
    url = "https://bitbucket.org/tasseladmin/tassel-{0}-standalone/get/{1}.zip".format(
        version, build_id)
    executables = ["start_tassel.pl", "run_pipeline.pl"]
    install_dir = _symlinked_java_version_dir("tassel", version, env)
    if install_dir:
        with _make_tmp_dir() as work_dir:
            with cd(work_dir):
                dl_file = shared._remote_fetch(env, url)
                env.safe_run("unzip %s" % dl_file)
                with cd("tasseladmin-tassel-{0}-standalone-{1}".format(
                        version, build_id)):
                    for x in executables:
                        # Raw string: "\$" is not a valid Python escape, so a
                        # plain literal triggers an invalid-escape warning.
                        # The pattern passed on is unchanged.
                        env.safe_sed(
                            x, r"^my \$top.*;",
                            "use FindBin qw($RealBin); my $top = $RealBin;")
                        env.safe_sudo("chmod a+rwx %s" % x)
                    env.safe_sudo("mv * %s" % install_dir)
                for x in executables:
                    env.safe_sudo("ln -s %s/%s %s/bin/%s" % (install_dir, x,
                                                             env.system_install, x))
def install_gemini(env):
    """A lightweight db framework for disease and population genetics.

    https://github.com/arq5x/gemini

    Three outcomes: return immediately when the pinned version is already
    installed; run ``gemini update`` when some gemini is on the path; or
    otherwise download and run the upstream installer script, passing the
    system install directory and a ``share/gemini`` data directory.
    """
    version = "0.6.4"
    if versioncheck.up_to_date(env, "gemini -v", version, stdout_flag="gemini"):
        return
    if not shared._executable_not_on_path("gemini -v"):
        env.safe_run("gemini update")
        return
    iurl = "https://raw.github.com/arq5x/gemini/master/gemini/scripts/gemini_install.py"
    script_name = os.path.basename(iurl)
    # Add a "local" component only when the install root does not have one.
    local_part = "" if "/local" in env.system_install else "local"
    data_dir = os.path.join(env.system_install, local_part, "share", "gemini")
    with _make_tmp_dir(ext="-gemini") as tmp_dir:
        with cd(tmp_dir):
            if env.safe_exists(script_name):
                env.safe_run("rm -f %s" % script_name)
            installer = shared._remote_fetch(env, iurl)
            sudo_flag = "" if env.use_sudo else "--nosudo"
            env.safe_run("%s %s %s %s %s" % (_python_cmd(env), installer, sudo_flag,
                                             env.system_install, data_dir))
            env.safe_run("rm -f gemini_install.py")
def _remove_werror_get_boost(env):
    """Strip ``-Werror`` from ``configure`` and unpack Boost 1.47.0 headers.

    After editing ``configure`` in the current directory, the Boost tarball
    is fetched and extracted, and a ``boost`` symlink is created pointing at
    the extracted ``boost_1_47_0/boost`` directory.
    """
    # http://osdir.com/ml/abyss-users-science/2011-10/msg00108.html
    boost_url = "http://downloads.sourceforge.net/project/boost/boost/1.47.0/boost_1_47_0.tar.bz2"
    env.safe_sed("configure", " -Werror", "")
    tarball = shared._remote_fetch(env, boost_url)
    for shell_cmd in ("tar jxf %s" % tarball,
                      "ln -s boost_1_47_0/boost boost"):
        env.safe_run(shell_cmd)
def _install_bottle(env, brew_cmd, pkg, ipkgs): """Install Linux bottles for brew packages that can be tricky to build. """ if env.distribution == "macosx": # Only Linux bottles, build away on Mac return False pkg_version, is_linked = _latest_pkg_version(env, brew_cmd, pkg) install_version = ipkgs["current"].get(pkg) if pkg_version == install_version: # Up to date if not is_linked: env.safe_run("%s link --overwrite %s" % (brew_cmd, pkg)) return True elif install_version or pkg in ipkgs["outdated"]: env.safe_run("{brew_cmd} remove --force {pkg}".format(**locals())) url = BOTTLE_URL.format(pkg=pkg, version=pkg_version) brew_cachedir = env.safe_run_output("%s --cache" % brew_cmd) brew_cellar = os.path.join(env.safe_run_output("%s --prefix" % brew_cmd), "Cellar") with quiet(): env.safe_run("mkdir -p %s" % brew_cellar) out_file = os.path.join(brew_cachedir, os.path.basename(url)) if env.safe_exists(out_file): env.safe_run("rm -f %s" % out_file) bottle_file = shared._remote_fetch(env, url, out_file=out_file, allow_fail=True, samedir=True) if bottle_file: with cd(brew_cellar): env.safe_run("tar -xf %s" % bottle_file) env.safe_run("%s link --overwrite %s" % (brew_cmd, pkg)) return True else: return False
def _download_lcrs(gid):
    """Retrieve low complexity regions from Heng Li's variant analysis paper.

    When ``LCR.bed.gz`` is not present, the hs37d5 BED file is fetched and
    rewritten through the ``fix_fn`` hook of ``shared._remote_fetch``: for
    hg19 the GL/NC/hs contigs are dropped and ``chr`` is prefixed to every
    line; for GRCh37 the content is only re-compressed with bgzip (any other
    ``gid`` trips an assertion inside the hook). The result is then tabix
    indexed.
    """
    lcr_url = "https://github.com/lh3/varcmp/raw/master/scripts/LCR-hs37d5.bed.gz"
    out_file = "LCR.bed.gz"
    if not env.safe_exists(out_file):
        def _fix_chrom_names(env, orig_file):
            if gid == "hg19":
                convert_cmd = "| grep -v ^GL | grep -v ^NC | grep -v ^hs | sed 's/^/chr/'"
            else:
                assert gid == "GRCh37"
                convert_cmd = ""
            env.safe_run("zcat %s %s | bgzip -c > %s" % (orig_file, convert_cmd, out_file))
            return out_file
        shared._remote_fetch(env, lcr_url, fix_fn=_fix_chrom_names)
        # The file is BED, so index it with the BED preset (0-based start and
        # end columns) rather than the VCF preset used previously.
        env.safe_run("tabix -p bed -f %s" % out_file)
def _install_bottle(env, brew_cmd, pkg, ipkgs): """Install Linux bottles for brew packages that can be tricky to build. """ if env.distribution == "macosx": # Only Linux bottles, build away on Mac return False pkg_version, is_linked = _latest_pkg_version(env, brew_cmd, pkg) install_version = ipkgs["current"].get(pkg) if pkg_version == install_version: # Up to date if not is_linked: env.safe_run("%s link --overwrite %s" % (brew_cmd, pkg)) return True elif install_version or pkg in ipkgs["outdated"]: env.safe_run("{brew_cmd} remove --force {pkg}".format(**locals())) url = BOTTLE_URL.format(pkg=pkg, version=pkg_version) brew_cachedir = env.safe_run_output("%s --cache" % brew_cmd) brew_cellar = os.path.join(env.safe_run_output("%s --prefix" % brew_cmd), "Cellar") with quiet(): env.safe_run("mkdir -p %s" % brew_cellar) out_file = os.path.join(brew_cachedir, os.path.basename(url)) if env.safe_exists(out_file): env.safe_run("rm -f %s" % out_file) bottle_file = shared._remote_fetch(env, url, out_file=out_file, allow_fail=True, samedir=True) if bottle_file: with cd(brew_cellar): env.safe_run("tar -xf %s" % bottle_file) env.safe_run("%s link --overwrite %s" % (brew_cmd, pkg)) return True else: return False
def _download_broad_bundle(gid, bundle_version, name, ext):
    """Prepare a bgzipped, tabix-indexed VCF from the Broad resource bundle.

    The local name drops the genome id and any ``.sites`` component from the
    bundle file name. An existing uncompressed copy is bgzipped and indexed
    in place; otherwise the bundle file is fetched (failure allowed),
    converted from gzip to bgzip and indexed. Leftover ``.vcf`` and
    ``.vcf.idx`` files are removed afterwards.

    Returns the local ``.gz`` file name, whether or not it could be created.
    """
    broad_fname = "{name}.{gid}.vcf{ext}".format(gid=gid, name=name, ext=ext)
    fname = broad_fname.replace(".{0}".format(gid), "").replace(".sites", "") + ".gz"
    base_url = "ftp://gsapubftp-anonymous:@ftp.broadinstitute.org/bundle/" + \
               "{bundle}/{gid}/{fname}.gz".format(
                   bundle=bundle_version, fname=broad_fname, gid=gid)
    index_cmd = "tabix -f -p vcf %s" % fname
    plain_vcf = fname.replace(".vcf.gz", ".vcf")
    # compress and prepare existing uncompressed versions
    if env.safe_exists(plain_vcf):
        env.safe_run("bgzip %s" % plain_vcf)
        env.safe_run(index_cmd)
    # otherwise, download and bgzip and tabix index
    if not env.safe_exists(fname):
        fetched = shared._remote_fetch(env, base_url, allow_fail=True)
        if not fetched:
            env.logger.warn("dbSNP resources not available for %s" % gid)
        else:
            env.safe_run("gunzip -c %s | bgzip -c > %s" % (fetched, fname))
            env.safe_run(index_cmd)
            env.safe_run("rm -f %s" % fetched)
    # clean up old files
    for old_suffix in (".vcf", ".vcf.idx"):
        stale = fname.replace(".vcf.gz", old_suffix)
        if env.safe_exists(stale):
            env.safe_run("rm -f %s" % (stale))
    return fname
def install_anaconda(env):
    """Pre-packaged Anaconda Python installed from Continuum.

    http://docs.continuum.io/anaconda/index.html

    Picks the Linux or MacOSX installer from ``env.distribution`` and, when
    ``<system_install>/anaconda`` does not exist, downloads the installer,
    feeds it the answers non-interactively, hands ownership of the install
    to ``env.user``, adds the install's ``bin`` to the shell config once,
    and finally swaps the bundled curl for pip via conda.

    Raises:
        ValueError: for a distribution that is neither a known Linux flavor
            nor macosx.
    """
    version = "2.0.0"
    linux_flavors = ["ubuntu", "centos", "scientificlinux", "debian", "arch", "suse"]
    outdir = os.path.join(env.system_install, "anaconda")
    distribution = env.distribution
    if distribution in linux_flavors:
        platform = "Linux"
    elif distribution in ["macosx"]:
        platform = "MacOSX"
    else:
        raise ValueError("Unexpected distribution: %s" % env.distribution)
    url = "http://09c8d0b2229f813c1b93-c95ac804525aac4b6dba79b00b39d1d3.r79.cf1.rackcdn.com/" \
          "Anaconda-%s-%s-x86_64.sh" % (version, platform)
    if env.safe_exists(outdir):
        return
    with _make_tmp_dir() as tmp_dir:
        with cd(tmp_dir):
            installer = shared._remote_fetch(env, url)
            # Replace the pager so the license text does not block the run.
            env.safe_sed(os.path.basename(url), "more <<EOF", "cat <<EOF")
            env.safe_sudo("echo -e '\nyes\n%s\nyes\n' | bash %s" % (outdir, installer))
            env.safe_sudo("chown -R %s %s" % (env.user, outdir))
            marker = "# added by Ananconda %s installer" % version
            if not env.safe_contains(env.shell_config, marker):
                env.safe_append(env.shell_config, marker)
                env.safe_append(env.shell_config, "export PATH=%s/bin:$PATH" % outdir)
            # remove curl library with broken certificates
            env.safe_run("%s/bin/conda remove --yes curl" % outdir)
            env.safe_run("%s/bin/conda install --yes pip" % outdir)
def download(self, seq_dir):
    """Create a masked FASTA for a custom genome from its base genome.

    The mask BED named in ``self._custom["mask"]`` is fetched when missing,
    complemented against the base genome's ``.fai`` index with
    ``bedtools complement``, and the complement is applied to the base FASTA
    with ``bedtools maskfasta``. Each step is skipped when its output
    already exists.

    Returns a tuple ``(out_fasta, [mask_file, ready_mask])``.

    Raises:
        AssertionError: when the base genome FASTA is not found at the
            expected relative path.
    """
    base_seq = os.path.join(os.pardir, self._custom["base"], "seq",
                            "{0}.fa".format(self._custom["base"]))
    assert env.safe_exists(base_seq)
    mask_file = os.path.basename(self._custom["mask"])
    # Argument unpacking replaces the apply() builtin, which is not
    # available on Python 3; the resulting name is the same.
    ready_mask = "{0}-complement{1}".format(*os.path.splitext(mask_file))
    out_fasta = "{0}.fa".format(self._custom["dbkey"])
    if not env.safe_exists(os.path.join(seq_dir, out_fasta)):
        if not env.safe_exists(mask_file):
            shared._remote_fetch(env, self._custom["mask"])
        if not env.safe_exists(ready_mask):
            env.safe_run("bedtools complement -i {i} -g {g}.fai > {o}".format(
                i=mask_file, g=base_seq, o=ready_mask))
        if not env.safe_exists(out_fasta):
            env.safe_run("bedtools maskfasta -fi {fi} -bed {bed} -fo {fo}".format(
                fi=base_seq, bed=ready_mask, fo=out_fasta))
    return out_fasta, [mask_file, ready_mask]
def _download_sv_repeats(gid):
    """Retrieve telomere and centromere exclusion regions for structural variant calling.

    From Delly: https://github.com/tobiasrausch/delly

    The downloaded table is filtered through the ``fix_fn`` hook of
    ``shared._remote_fetch``: lines starting with ``chr`` are kept for hg19,
    and the remaining lines are kept for GRCh37 (any other ``gid`` trips an
    assertion inside the hook).
    """
    mere_url = "https://raw.githubusercontent.com/chapmanb/delly/master/human.hg19.excl.tsv"
    out_file = "sv_repeat_telomere_centromere.bed"

    def _select_by_gid(env, orig_file):
        if gid == "hg19":
            grep_cmd = "grep ^chr %s > %s"
        else:
            assert gid == "GRCh37"
            grep_cmd = "grep -v ^chr %s > %s"
        env.safe_run(grep_cmd % (orig_file, out_file))
        return out_file

    if env.safe_exists(out_file):
        return
    shared._remote_fetch(env, mere_url, fix_fn=_select_by_gid)
def download(self, seq_dir):
    """Create a masked FASTA for a custom genome from its base genome.

    The mask BED named in ``self._custom["mask"]`` is fetched when missing,
    complemented against the base genome's ``.fai`` index with
    ``bedtools complement``, and the complement is applied to the base FASTA
    with ``bedtools maskfasta``. Each step is skipped when its output
    already exists.

    Returns a tuple ``(out_fasta, [mask_file, ready_mask])``.

    Raises:
        AssertionError: when the base genome FASTA is not found at the
            expected relative path.
    """
    base_seq = os.path.join(os.pardir, self._custom["base"], "seq",
                            "{0}.fa".format(self._custom["base"]))
    assert env.safe_exists(base_seq)
    mask_file = os.path.basename(self._custom["mask"])
    # Argument unpacking replaces the apply() builtin, which is not
    # available on Python 3; the resulting name is the same.
    ready_mask = "{0}-complement{1}".format(*os.path.splitext(mask_file))
    out_fasta = "{0}.fa".format(self._custom["dbkey"])
    if not env.safe_exists(os.path.join(seq_dir, out_fasta)):
        if not env.safe_exists(mask_file):
            shared._remote_fetch(env, self._custom["mask"])
        if not env.safe_exists(ready_mask):
            env.safe_run("bedtools complement -i {i} -g {g}.fai > {o}".format(
                i=mask_file, g=base_seq, o=ready_mask))
        if not env.safe_exists(out_fasta):
            env.safe_run("bedtools maskfasta -fi {fi} -bed {bed} -fo {fo}".format(
                fi=base_seq, bed=ready_mask, fo=out_fasta))
    return out_fasta, [mask_file, ready_mask]
def _download_s3_index(env, manager, gid, idx): print("Downloading genome from s3: {0} {1}".format(gid, idx)) url = "https://s3.amazonaws.com/biodata/genomes/%s-%s.tar.xz" % (gid, idx) if gid in ["GRCh37", "hg19", "mm10"] and idx in ["bowtie2", "bwa", "novoalign"]: out_file = shared._remote_fetch(env, url, samedir=True) subprocess.check_call("xz -dc %s | tar -xvpf -" % out_file, shell=True) subprocess.check_call("rm -f %s" % out_file, shell=True) else: raise NotImplementedError("No pre-computed indices for %s %s" % (gid, idx))
def _download_cosmic(gid): """Prepared versions of COSMIC, pre-sorted and indexed. utils/prepare_cosmic.py handles the work of creating the VCFs from standard COSMIC resources. """ base_url = "https://s3.amazonaws.com/biodata/variants" version = "v67_20131024" supported = ["hg19", "GRCh37"] if gid in supported: url = "%s/cosmic-%s-%s.vcf.gz" % (base_url, version, gid) gzip_fname = os.path.basename(url) fname = os.path.splitext(gzip_fname)[0] if not env.safe_exists(fname): if not env.safe_exists(gzip_fname): shared._remote_fetch(env, url) env.safe_run("gunzip %s" % fname) if not env.safe_exists(fname + ".idx"): shared._remote_fetch(env, url.replace(".gz", ".idx"))
def _download_s3_index(env, manager, gid, idx): print("Downloading genome from s3: {0} {1}".format(gid, idx)) url = "https://s3.amazonaws.com/biodata/genomes/%s-%s.tar.xz" % (gid, idx) if gid in ["GRCh37", "hg19", "mm10"] and idx in ["bowtie2", "bwa", "novoalign"]: out_file = shared._remote_fetch(env, url, samedir=True) subprocess.check_call("xz -dc %s | tar -xvpf -" % out_file, shell=True) subprocess.check_call("rm -f %s" % out_file, shell=True) else: raise NotImplementedError("No pre-computed indices for %s %s" % (gid, idx))
def _download_zip(self, seq_dir): for zipped_file in ["chromFa.tar.gz", "%s.fa.gz" % self._name, "chromFa.zip"]: if not self._exists(zipped_file, seq_dir): result = shared._remote_fetch(env, "%s/%s" % (self._url, zipped_file), allow_fail=True) if result: break else: break return zipped_file
def install_mutect(env):
    """Install the muTect binary release into a versioned Java directory.

    When ``_symlinked_java_version_dir`` returns a directory, the release
    zip is downloaded and unpacked in a temporary directory and the jar,
    ``version.txt`` and license files are moved into it with sudo.
    """
    version = "1.1.5"
    url = "https://github.com/broadinstitute/mutect/releases/download/" \
          "%s/muTect-%s-bin.zip" % (version, version)
    target_dir = _symlinked_java_version_dir("mutect", version, env)
    if not target_dir:
        return
    with _make_tmp_dir() as tmp_dir:
        with cd(tmp_dir):
            archive = shared._remote_fetch(env, url)
            env.safe_run("unzip %s" % archive)
            env.safe_sudo("mv *.jar version.txt LICENSE* %s" % target_dir)
def _dbsnp_mouse(env, gid): """Retrieve resources for mouse variant analysis from custom S3 biodata bucket. """ remote_dir = "https://s3.amazonaws.com/biodata/variants/" files = {"mm10": ["mm10-dbSNP-2013-09-12.vcf"]} for f in files[gid]: for ext in ["", ".idx"]: fname = f + ext if not env.safe_exists(fname): out_file = shared._remote_fetch( env, "%s%s.gz" % (remote_dir, fname)) env.safe_run("gunzip %s" % out_file)
def _download_annotation_bundle(env, url, gid):
    """Download bundle of RNA-seq data from S3 biodata/annotation.

    The fetch is made with ``allow_fail=True``. When it yields a file that
    exists, the xz-compressed tarball is extracted and removed.

    Returns True after a successful extraction; otherwise logs a warning
    mentioning ``gid`` and returns False.
    """
    tarball = shared._remote_fetch(env, url, allow_fail=True)
    if not (tarball and env.safe_exists(tarball)):
        env.logger.warn("RNA-seq transcripts not available for %s" % gid)
        return False
    env.logger.info("Extracting RNA-seq references: %s" % tarball)
    env.safe_run("xz -dc %s | tar -xpf -" % tarball)
    env.safe_run("rm -f %s" % tarball)
    return True
def install_varscan(env):
    """Variant detection in massively parallel sequencing data.

    http://varscan.sourceforge.net/

    Downloads the VarScan jar into a temporary directory and moves it into the
    versioned install directory. Does nothing when
    ``_symlinked_java_version_dir`` returns a false value.
    """
    version = "2.3.6"
    url = "http://downloads.sourceforge.net/project/varscan/VarScan.v%s.jar" % version
    install_dir = _symlinked_java_version_dir("varscan", version, env)
    if not install_dir:
        return
    with _make_tmp_dir() as work_dir, cd(work_dir):
        jar_path = shared._remote_fetch(env, url)
        env.safe_sudo("mv %s %s" % (jar_path, install_dir))
def install_cram(env):
    """Highly efficient and tunable reference-based compression of sequence data.

    http://www.ebi.ac.uk/ena/about/cram_toolkit/

    Downloads the cramtools jar into a temporary directory and moves it into
    the versioned install directory. Does nothing when
    ``_symlinked_java_version_dir`` returns a false value.
    """
    version = "2.0"
    url = ("https://github.com/vadimzalunin/crammer/raw/master/"
           "cramtools-%s.jar" % version)
    install_dir = _symlinked_java_version_dir("cram", version, env)
    if not install_dir:
        return
    with _make_tmp_dir() as work_dir, cd(work_dir):
        jar_path = shared._remote_fetch(env, url)
        env.safe_sudo("mv %s %s" % (jar_path, install_dir))
def install_rnaseqc(env):
    """Quality control metrics for RNA-seq data.

    https://www.broadinstitute.org/cancer/cga/rna-seqc

    Downloads the RNA-SeQC jar from its GitHub release into a temporary
    directory and moves it into the versioned install directory. Does nothing
    when ``_symlinked_java_version_dir`` returns a false value.
    """
    version = "1.1.7"
    url = ("https://github.com/chapmanb/RNA-SeQC/releases/download/"
           "v%s/RNA-SeQC_v%s.jar" % (version, version))
    install_dir = _symlinked_java_version_dir("RNA-SeQC", version, env)
    if not install_dir:
        return
    with _make_tmp_dir() as work_dir, cd(work_dir):
        jar_path = shared._remote_fetch(env, url)
        env.safe_sudo("mv %s %s" % (jar_path, install_dir))
def install_bcbio_variation(env):
    """Toolkit to analyze genomic variation data with comparison and ensemble approaches.

    https://github.com/chapmanb/bcbio.variation

    Downloads the standalone jar from its GitHub release into a temporary
    directory and moves it into the versioned install directory. Does nothing
    when ``_symlinked_java_version_dir`` returns a false value.
    """
    version = "0.1.6"
    url = ("https://github.com/chapmanb/bcbio.variation/releases/download/"
           "v%s/bcbio.variation-%s-standalone.jar" % (version, version))
    install_dir = _symlinked_java_version_dir("bcbio_variation", version, env)
    if not install_dir:
        return
    with _make_tmp_dir() as work_dir, cd(work_dir):
        downloaded = shared._remote_fetch(env, url)
        env.safe_sudo("mv %s %s" % (downloaded, install_dir))
def install_rnaseqc(env):
    """Quality control metrics for RNA-seq data.

    https://www.broadinstitute.org/cancer/cga/rna-seqc

    Downloads the RNA-SeQC jar from the Broad Institute site into a temporary
    directory and moves it into the versioned install directory. Does nothing
    when ``_symlinked_java_version_dir`` returns a false value.
    """
    version = "1.1.7"
    url = ("http://www.broadinstitute.org/cancer/cga/sites/default/files/"
           "data/tools/rnaseqc/RNA-SeQC_v%s.jar" % version)
    install_dir = _symlinked_java_version_dir("RNA-SeQC", version, env)
    if not install_dir:
        return
    with _make_tmp_dir() as work_dir, cd(work_dir):
        jar_path = shared._remote_fetch(env, url)
        env.safe_sudo("mv %s %s" % (jar_path, install_dir))
def _download_broad_bundle(gid, bundle_version, name, ext):
    """Download one VCF resource from the Broad bundle FTP site.

    Args:
        gid: genome identifier; part of both the remote directory and the
            remote file name.
        bundle_version: bundle directory on the FTP server.
        name: resource name forming the start of the file name.
        ext: suffix appended after ``.vcf`` (may be empty).

    Returns:
        The local file name, which is the Broad file name with the genome id
        and any ``.sites`` component removed. The name is returned even when
        the download was not available.
    """
    broad_fname = "{name}.{gid}.vcf{ext}".format(gid=gid, name=name, ext=ext)
    fname = broad_fname.replace(".{0}".format(gid), "").replace(".sites", "")
    base_url = "ftp://gsapubftp-anonymous:@ftp.broadinstitute.org/bundle/" + \
               "{bundle}/{gid}/{fname}.gz".format(
                   bundle=bundle_version, fname=broad_fname, gid=gid)
    if env.safe_exists(fname):
        return fname
    out_file = shared._remote_fetch(env, base_url, allow_fail=True)
    if out_file:
        env.safe_run("gunzip %s" % out_file)
        # Rename from the Broad naming scheme to the local one.
        env.safe_run("mv %s %s" % (broad_fname, fname))
    else:
        # ``Logger.warn`` is a deprecated alias that was removed in
        # Python 3.13; ``warning`` is the supported spelling.
        # NOTE(review): assumes env.logger is a standard logging.Logger.
        env.logger.warning("dbSNP resources not available for %s" % gid)
    return fname
def install_fastq_screen(env):
    """A screening application for high throughput sequence data.

    http://www.bioinformatics.babraham.ac.uk/projects/fastq_screen/

    Downloads and unpacks the release tarball in a temporary directory, moves
    its contents into the shared install directory and symlinks the executable
    into ``<system_install>/bin``. Does nothing when
    ``shared._symlinked_shared_dir`` returns a false value.
    """
    version = "0.4"
    url = ("http://www.bioinformatics.babraham.ac.uk/projects/fastq_screen/"
           "fastq_screen_v%s.tar.gz" % version)
    install_dir = shared._symlinked_shared_dir("fastqc_screen", version, env)
    executable = "fastq_screen"
    if not install_dir:
        return
    with _make_tmp_dir() as work_dir, cd(work_dir):
        tarball = shared._remote_fetch(env, url)
        env.safe_run("tar -xzvpf %s" % tarball)
        with cd("fastq_screen_v%s" % version):
            env.safe_sudo("mv * %s" % install_dir)
        link_args = (install_dir, executable, env.system_install, executable)
        env.safe_sudo("ln -s %s/%s %s/bin/%s" % link_args)
def install_fastqc(env):
    """A quality control tool for high throughput sequence data.

    http://www.bioinformatics.babraham.ac.uk/projects/fastqc/

    Downloads and unzips the release in a temporary directory, marks the
    launcher executable, moves the unpacked files into the versioned install
    directory and symlinks the launcher into ``<system_install>/bin``. Does
    nothing when ``_symlinked_java_version_dir`` returns a false value.
    """
    version = "0.10.1"
    url = ("http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/"
           "fastqc_v%s.zip" % version)
    executable = "fastqc"
    install_dir = _symlinked_java_version_dir("fastqc", version, env)
    if not install_dir:
        return
    with _make_tmp_dir() as work_dir, cd(work_dir):
        archive = shared._remote_fetch(env, url)
        env.safe_run("unzip %s" % archive)
        with cd("FastQC"):
            env.safe_sudo("chmod a+rwx %s" % executable)
            env.safe_sudo("mv * %s" % install_dir)
        link_args = (install_dir, executable, env.system_install, executable)
        env.safe_sudo("ln -s %s/%s %s/bin/%s" % link_args)
def install_shrec(env):
    """Shrec is a bioinformatics tool for error correction of HTS read data.

    http://sourceforge.net/projects/shrec-ec/

    Downloads and unzips the class files, moves them into the versioned
    install directory, writes a launcher script there from the non-blank lines
    of ``_shrec_run`` and symlinks it into ``<system_install>/bin``. Does
    nothing when ``_symlinked_java_version_dir`` returns a false value.
    """
    version = "2.2"
    url = "http://downloads.sourceforge.net/project/shrec-ec/SHREC%%20%s/bin.zip" % version
    install_dir = _symlinked_java_version_dir("shrec", version, env)
    if not install_dir:
        return
    shrec_script = "%s/shrec" % install_dir
    with _make_tmp_dir() as work_dir, cd(work_dir):
        archive = shared._remote_fetch(env, url)
        env.safe_run("unzip %s" % archive)
        env.safe_sudo("mv *.class %s" % install_dir)
        # Build the launcher line by line, skipping blank lines.
        script_lines = [text for text in _shrec_run.split("\n") if text.strip()]
        for text in script_lines:
            env.safe_append(shrec_script, text, use_sudo=env.use_sudo)
        env.safe_sudo("chmod a+rwx %s" % shrec_script)
        env.safe_sudo("ln -s %s %s/bin/shrec" % (shrec_script, env.system_install))
def _download_qsignature(env, gid, gconfig):
    """Download the qsignature position file used to detect sample problems.

    For GRCh37, or for hg19 when no sibling ``../../GRCh37`` directory exists,
    the archive is downloaded and its positions file is stored as
    ``qsignature.vcf``. For hg19 with a GRCh37 directory present, a symlink to
    the GRCh37 copy is created instead. Other genomes are ignored, and nothing
    is done when ``qsignature.vcf`` already exists.

    :param env: environment object providing ``safe_exists`` and ``safe_run``
    :param gid: str genome id
    :param gconfig: not used by this function
    :returns: None
    """
    base_url = "http://downloads.sourceforge.net/project/adamajava/qsignature.tar.bz2"
    outfile = "qsignature.vcf"
    needs_download = gid == "GRCh37" or (
        gid == "hg19" and not env.safe_exists("../../GRCh37"))
    if not needs_download and gid != "hg19":
        return
    if env.safe_exists(outfile):
        return
    if not needs_download:
        # hg19 alongside GRCh37: symlink to GRCh37 download
        env.safe_run("ln -sf ../../GRCh37/variation/%s %s" % (outfile, outfile))
        return
    archive = shared._remote_fetch(env, base_url, samedir=True)
    outdir = "qsignature"
    env.safe_run("mkdir -p %s" % outdir)
    env.safe_run("tar -jxf %s -C %s" % (archive, outdir))
    env.safe_run("mv %s/qsignature_positions.txt %s" % (outdir, outfile))
    env.safe_run("rm -rf %s" % outdir)
    env.safe_run("rm -rf %s" % archive)
def _download_broad_bundle(gid, bundle_version, name, ext):
    """Prepare a bgzipped, tabix-indexed VCF resource from the Broad bundle.

    An existing uncompressed local copy is bgzipped and indexed. Otherwise the
    file is downloaded from the Broad FTP site, recompressed with bgzip and
    indexed. Leftover uncompressed ``.vcf`` and ``.vcf.idx`` files are removed.
    Returns the local ``.gz`` file name.
    """
    # Broad bundle directories have uneven use of ".sites" in VCF files:
    # it is only present in hg19 for non-dbSNP resources.
    if gid == "hg19" and not name.startswith("dbsnp"):
        sites = ".sites"
    else:
        sites = ""
    broad_fname = "{name}.{gid}{sites}.vcf{ext}".format(
        gid=gid, name=name, sites=sites, ext=ext)
    local_base = broad_fname.replace(".{0}".format(gid), "").replace(".sites", "")
    fname = local_base + ".gz"
    base_url = "ftp://gsapubftp-anonymous:@ftp.broadinstitute.org/bundle/" + \
               "{bundle}/{gid}/{fname}.gz".format(
                   bundle=bundle_version, fname=broad_fname, gid=gid)
    # Compress and index an existing uncompressed version.
    plain_vcf = fname.replace(".vcf.gz", ".vcf")
    if env.safe_exists(plain_vcf):
        env.safe_run("bgzip %s" % plain_vcf)
        env.safe_run("tabix -f -p vcf %s" % fname)
    # Otherwise download, recompress with bgzip and index with tabix.
    if not env.safe_exists(fname):
        downloaded = shared._remote_fetch(env, base_url)
        env.safe_run("gunzip -c %s | bgzip -c > %s" % (downloaded, fname))
        env.safe_run("tabix -f -p vcf %s" % fname)
        env.safe_run("rm -f %s" % downloaded)
    # Clean up old uncompressed files.
    for old_suffix in [".vcf", ".vcf.idx"]:
        stale = fname.replace(".vcf.gz", old_suffix)
        if env.safe_exists(stale):
            env.safe_run("rm -f %s" % stale)
    return fname
def _get_samtools(env):
    """Download the samtools source tarball, unpack it and add a symlink.

    The archive at ``samtools_url`` is fetched, extracted, and the versioned
    directory is linked as ``samtools``.
    """
    shared._remote_fetch(env, samtools_url)
    command_templates = (
        "tar jxf samtools-{0}.tar.bz2",
        "ln -s samtools-{0} samtools",
    )
    for template in command_templates:
        env.safe_run(template.format(samtools_version))