Example 1
 def download(self, seq_dir):
     genome_file = "%s.fa" % self._name
     if not self._exists(self._get_file, seq_dir):
         shared._remote_fetch(None, "%s%s" % (self._url, self._get_file))
     if not self._exists(genome_file, seq_dir):
         subprocess.check_call("gunzip -c %s > %s" % (self._get_file, genome_file), shell=True)
     return genome_file, [self._get_file]
Example 2
def _data_uniref():
    """Retrieve and index UniRef databases for protein searches.

    http://www.ebi.ac.uk/uniref/

    These are currently indexed for FASTA searches. Are other indexes desired?
    Should this be separated out and organized by program like genome data?
    This should also check the release note and automatically download and
    replace older versions.
    """
    site = "ftp://ftp.uniprot.org"
    base_url = site + "/pub/databases/uniprot/" \
               "current_release/uniref/%s/%s"
    for uniref_db in ["uniref50", "uniref90", "uniref100"]:
        work_dir = os.path.join(env.data_files, "uniref", uniref_db)
        if not env.safe_exists(work_dir):
            env.safe_run("mkdir -p %s" % work_dir)
        base_work_url = base_url % (uniref_db, uniref_db)
        fasta_url = base_work_url + ".fasta.gz"
        base_file = os.path.splitext(os.path.basename(fasta_url))[0]
        with cd(work_dir):
            if not env.safe_exists(base_file):
                out_file = shared._remote_fetch(env, fasta_url)
                env.safe_run("gunzip %s" % out_file)
                shared._remote_fetch(env, base_work_url + ".release_note")
        _index_blast_db(work_dir, base_file, "prot")
Example 3
 def download(self, seq_dir):
     org_file = "%s.fa" % self._name
     if not self._exists(org_file, seq_dir):
         shared._remote_fetch(None, "%s%s.gz" % (self._ftp_url, self._target))
         subprocess.check_call("gunzip %s.gz" % self._target, shell=True)
         subprocess.check_call("mv %s %s" % (self._target, org_file), shell=True)
     return org_file, []
Example 4
 def download(self, seq_dir):
     genome_file = "%s.fa" % self._name
     if not self._exists(self._get_file, seq_dir):
         shared._remote_fetch(env, "%s%s" % (self._url, self._get_file))
     if not self._exists(genome_file, seq_dir):
         env.safe_run("gunzip -c %s > %s" % (self._get_file, genome_file))
     return genome_file, [self._get_file]
Example 5
 def download(self, seq_dir):
     org_file = "%s.fa" % self._name
     if not self._exists(org_file, seq_dir):
         shared._remote_fetch(env, "%s%s.gz" % (self._ftp_url, self._target))
         env.safe_run("gunzip %s.gz" % self._target)
         env.safe_run("mv %s %s" % (self._target, org_file))
     return org_file, []
Example 6
def _download_background_vcf(gid):
    """Download background file of variant to use in calling.
    """
    base_url = "https://s3.amazonaws.com/biodata/variants"
    base_name = "background-diversity-1000g.vcf"
    if gid in ["GRCh37"] and not env.safe_exists("{0}.gz".format(base_name)):
        for ext in ["gz", "gz.tbi"]:
            shared._remote_fetch(env, "{0}/{1}.{2}".format(base_url, base_name, ext))
Example 7
 def download(self, seq_dir):
     genome_file = "%s.fa" % self._name
     for fn in self._to_get:
         url = self._base_url + fn
         if not self._exists(fn, seq_dir):
             shared._remote_fetch(env, url)
             env.safe_run("gunzip -c %s >> %s" % (fn, genome_file))
     return genome_file, []
Example 8
 def download(self, seq_dir):
     genome_file = "%s.fa" % self._name
     for fn in self._to_get:
         url = self._base_url + fn
         if not self._exists(fn, seq_dir):
             shared._remote_fetch(None, url)
             subprocess.check_call("gunzip -c %s >> %s" % (fn, genome_file), shell=True)
     return genome_file, []
Example 9
def _download_ensembl_gtf(env, manager):
    """Fetch ensembl gtf file for coresponding genome - release
    """
    fname = "%s.%s.%s.gtf" % (manager._organism, manager._name, manager._release_number)
    download_url = manager._base_url
    download_url += "release-%s/gtf/%s/%s" % (manager._release_number, manager._organism.lower(), fname)
    if not env.safe_exists(fname):
        shared._remote_fetch(env, download_url + ".gz")
        env.safe_run("gunzip %s.gz" % fname)
Example 10
def _download_executables(env, base_url, tools):
    install_dir = shared._get_bin_dir(env)
    with _make_tmp_dir() as work_dir:
        with cd(work_dir):
            for tool in tools:
                final_tool = os.path.join(install_dir, tool)
                if not env.safe_exists(final_tool) and shared._executable_not_on_path(tool):
                    shared._remote_fetch(env, "%s%s" % (base_url, tool))
                    env.safe_sudo("cp -f %s %s" % (tool, install_dir))
Example 11
 def download(self, seq_dir):
     genome_file = "%s.fa" % self._name
     if not self._exists(self._get_file, seq_dir):
         shared._remote_fetch(env, "%s%s" % (self._url, self._get_file))
     if not self._exists(genome_file, seq_dir):
         env.safe_run("gunzip -c %s > %s" % (self._get_file, genome_file))
     if self._convert_to_ucsc:
         #run("sed s/ / /g %s" % genome_file)
         raise NotImplementedError("Replace with chr")
     return genome_file, [self._get_file]
Example 12
def _dbsnp_custom(env, gid):
    """Retrieve resources for dbsnp builds from custom S3 biodata bucket.
    """
    remote_dir = "https://s3.amazonaws.com/biodata/variants/"
    files = {"mm10": ["mm10-dbSNP-2013-09-12.vcf.gz"], "canFam3": ["canFam3-dbSNP-2014-04-10.vcf.gz"]}
    for f in files[gid]:
        for ext in ["", ".tbi"]:
            fname = f + ext
            if not env.safe_exists(fname):
                shared._remote_fetch(env, "%s%s" % (remote_dir, fname))
Example 13
def install_leiningen(env):
    """Clojure tool for project configuration and automation.
    http://github.com/technomancy/leiningen
    """
    bin_dir = os.path.join(env.system_install, "bin")
    with _make_tmp_dir() as work_dir:
        with cd(work_dir):
            shared._remote_fetch(env, "https://raw.github.com/technomancy/leiningen/stable/bin/lein")
            env.safe_run("chmod a+rwx lein")
            env.safe_sudo("mv lein %s" % bin_dir)
            env.safe_run("%s/lein" % bin_dir)
Example 14
 def download(self, seq_dir):
     genome_file = "%s.fa" % self._name
     if not self._exists(genome_file, seq_dir):
         for ref in self._refs:
             shared._remote_fetch(None, self._base_url % ref)
             subprocess.check_call("ls -l", shell=True)
             subprocess.check_call(r"sed -i 's/^>.*$/>%s/' %s.fasta" % (ref, ref), shell=True)
         tmp_file = genome_file.replace(".fa", ".txt")
         subprocess.check_call("cat *.fasta > %s" % tmp_file, shell=True)
         subprocess.check_call("rm -f *.fasta", shell=True)
         subprocess.check_call("rm -f *.bak", shell=True)
         subprocess.check_call("mv %s %s" % (tmp_file, genome_file), shell=True)
     return genome_file, []
Example 15
 def download(self, seq_dir):
     genome_file = "%s.fa" % self._name
     if not self._exists(genome_file, seq_dir):
         for ref in self._refs:
             shared._remote_fetch(env, self._base_url % ref)
             env.safe_run("ls -l")
             env.safe_sed('%s.fasta' % ref, '^>.*$', '>%s' % ref, '1')
         tmp_file = genome_file.replace(".fa", ".txt")
         env.safe_run("cat *.fasta > %s" % tmp_file)
         env.safe_run("rm -f *.fasta")
         env.safe_run("rm -f *.bak")
         env.safe_run("mv %s %s" % (tmp_file, genome_file))
     return genome_file, []
Example 16
def _ensembl_vcf(env, gid, manager):
    """Fetch ensemble vcf file (available from release 71) and do tabix indexing
    """
    fname = "%s.vcf.gz" % (manager._organism)
    download_url = manager._base_url
    section = "variation/"
    if manager._section != "standard":
        section = ""
        fname = fname.lower()
    download_url += "release-%s/%svcf/%s/%s" % (manager._release_number, 
                    section, manager._organism.lower(), fname)
    if not env.safe_exists(fname):
        shared._remote_fetch(env, download_url)
        env.safe_run("tabix -f -p vcf %s" % fname)
Example 17
def _download_lcrs_custom(env, gid):
    """Retrieve low complexity regions from other sources.

    mm10 from Brent Pedersen: http://figshare.com/articles/LCR_mm10_bed_gz/1180124
    """
    urls = {"mm10": "http://files.figshare.com/1688228/LCR_mm10.bed.gz"}
    out_file = "LCR.bed.gz"
    cur_url = urls.get(gid)
    if cur_url and not env.safe_exists(out_file):
        def _bgzip_file(env, orig_file):
            env.safe_run("zcat %s | bgzip -c > %s" % (orig_file, out_file))
            return out_file
        shared._remote_fetch(env, cur_url, fix_fn=_bgzip_file)
        env.safe_run("tabix -p vcf -f %s" % out_file)
Example 18
def _download_sv_repeats(gid):
    """Retrieve telomere and centromere exclusion regions for structural variant calling.
    From Delly: https://github.com/tobiasrausch/delly
    """
    mere_url = "https://raw.githubusercontent.com/chapmanb/delly/master/human.hg19.excl.tsv"
    out_file = "sv_repeat_telomere_centromere.bed"
    if not env.safe_exists(out_file):
        def _select_by_gid(env, orig_file):
            if gid == "hg19":
                env.safe_run("grep ^chr %s > %s" % (orig_file, out_file))
            else:
                assert gid == "GRCh37"
                env.safe_run("grep -v ^chr %s > %s" % (orig_file, out_file))
            return out_file
        shared._remote_fetch(env, mere_url, fix_fn=_select_by_gid)
Example 19
def _download_cosmic(gid):
    """Prepared versions of COSMIC, pre-sorted and indexed.
    utils/prepare_cosmic.py handles the work of creating the VCFs from standard
    COSMIC resources.
    """
    base_url = "https://s3.amazonaws.com/biodata/variants"
    version = "v68"
    supported = ["hg19", "GRCh37"]
    if gid in supported:
        url = "%s/cosmic-%s-%s.vcf.gz" % (base_url, version, gid)
        fname = os.path.basename(url)
        if not env.safe_exists(fname):
            shared._remote_fetch(env, url)
        if not env.safe_exists(fname + ".tbi"):
            shared._remote_fetch(env, url + ".tbi")
Example 20
def _download_dbnsfp(env, gid, gconfig):
    """Download and prepare dbNSFP functional prediction resources if configured.

    Feeds into VEP for annotating VCF files:
    https://sites.google.com/site/jpopgen/dbNSFP
    https://github.com/ensembl-variation/VEP_plugins/blob/master/dbNSFP.pm
    """
    version = "2.5"
    url = "http://dbnsfp.houstonbioinformatics.org/dbNSFPzip/dbNSFPv%s.zip" % version
    if gconfig.get("dbnsfp"):
        outfile = "dbNSFP_v%s.gz" % (version)
        if gid == "GRCh37":  # download and prepare bgzipped output file
            if not env.safe_exists(outfile):
                zipfile = shared._remote_fetch(env, url, samedir=True)
                outdir = "dbNSFPv%s" % version
                env.safe_run("mkdir -p %s" % outdir)
                env.safe_run("unzip %s -d %s" % (zipfile, outdir))
                env.safe_run("cat %s/dbNSFP*_variant.chr* | bgzip -c > %s" % (outdir, outfile))
                env.safe_run("rm -f %s/* && rmdir %s" % (outdir, outdir))
                env.safe_run("rm -f %s" % (zipfile))
            if not env.safe_exists(outfile + ".tbi"):
                env.safe_run("tabix -s 1 -b 2 -e 2 -c '#' %s" % outfile)
        elif gid == "hg19":  # symlink to GRCh37 download
            if not env.safe_exists(outfile):
                env.safe_run("ln -sf ../../GRCh37/variation/%s %s" % (outfile, outfile))
            if not env.safe_exists(outfile + ".tbi"):
                env.safe_run("ln -sf ../../GRCh37/variation/%s.tbi %s.tbi" % (outfile, outfile))
Example 21
def _download_ancestral(env, gid, gconfig):
    """Download ancestral genome sequence for loss of function evaluation.

    Used by LOFTEE VEP plugin: https://github.com/konradjk/loftee
    """
    base_url = "http://www.broadinstitute.org/~konradk/loftee/human_ancestor.fa.rz"
    if gid == "GRCh37":
        for ext in ["", ".fai"]:
            outfile = os.path.basename(base_url) + ext
            if not env.safe_exists(outfile):
                shared._remote_fetch(env, base_url + ext, samedir=True)
    elif gid == "hg19":  # symlink to GRCh37 download
        for ext in ["", ".fai"]:
            outfile = os.path.basename(base_url) + ext
            if not env.safe_exists(outfile):
                env.safe_run("ln -sf ../../GRCh37/variation/%s %s" % (outfile, outfile))
Example 22
def _download_lcrs(gid):
    """Retrieve low complexity regions from Heng Li's variant analysis paper.
    """
    lcr_url = "https://github.com/lh3/varcmp/raw/master/scripts/LCR-hs37d5.bed.gz"
    out_file = "LCR.bed.gz"
    if not env.safe_exists(out_file):
        def _fix_chrom_names(env, orig_file):
            if gid == "hg19":
                convert_cmd = "| grep -v ^GL | grep -v ^NC | grep -v ^hs | sed 's/^/chr/'"
            else:
                assert gid == "GRCh37"
                convert_cmd = ""
            env.safe_run("zcat %s %s | bgzip -c > %s" % (orig_file, convert_cmd, out_file))
            return out_file
        shared._remote_fetch(env, lcr_url, fix_fn=_fix_chrom_names)
        env.safe_run("tabix -p vcf -f %s" % out_file)
Example 23
def _data_liftover(lift_over_genomes):
    """Download chain files for running liftOver.

    Does not install liftOver binaries automatically.
    """
    lo_dir = os.path.join(env.data_files, "liftOver")
    if not env.safe_exists(lo_dir):
        env.safe_run("mkdir %s" % lo_dir)
    lo_base_url = "ftp://hgdownload.cse.ucsc.edu/goldenPath/%s/liftOver/%s"
    lo_base_file = "%sTo%s.over.chain.gz"
    for g1 in lift_over_genomes:
        for g2 in [g for g in lift_over_genomes if g != g1]:
            g2u = g2[0].upper() + g2[1:]
            cur_file = lo_base_file % (g1, g2u)
            non_zip = os.path.splitext(cur_file)[0]
            worked = False
            with cd(lo_dir):
                if not env.safe_exists(non_zip):
                    result = shared._remote_fetch(env, "%s" % (lo_base_url % (g1, cur_file)), allow_fail=True)
                    # Lift over back and forths don't always exist
                    # Only move forward if we found the file
                    if result:
                        worked = True
                        env.safe_run("gunzip %s" % result)
            if worked:
                ref_parts = [g1, g2, os.path.join(lo_dir, non_zip)]
                galaxy.update_loc_file("liftOver.loc", ref_parts)
Example 24
 def download(self, seq_dir):
     genome_file = "%s.fa" % self._name
     if not self._exists(genome_file, seq_dir):
         for ref in self._refs:
             shared._remote_fetch(None, self._base_url % ref)
             subprocess.check_call("ls -l", shell=True)
             subprocess.check_call(r"sed -i 's/^>.*$/>%s/' %s.fasta" %
                                   (ref, ref),
                                   shell=True)
         tmp_file = genome_file.replace(".fa", ".txt")
         subprocess.check_call("cat *.fasta > %s" % tmp_file, shell=True)
         subprocess.check_call("rm -f *.fasta", shell=True)
         subprocess.check_call("rm -f *.bak", shell=True)
         subprocess.check_call("mv %s %s" % (tmp_file, genome_file),
                               shell=True)
     return genome_file, []
Example 25
def install_anaconda(env):
    """Pre-packaged Anaconda Python installed from Continuum.
    http://docs.continuum.io/anaconda/index.html
    """
    version = "2.0.0"
    outdir = os.path.join(env.system_install, "anaconda")
    if env.distribution in [
            "ubuntu", "centos", "scientificlinux", "debian", "arch", "suse"
    ]:
        platform = "Linux"
    elif env.distribution in ["macosx"]:
        platform = "MacOSX"
    else:
        raise ValueError("Unexpected distribution: %s" % env.distribution)
    url = "http://09c8d0b2229f813c1b93-c95ac804525aac4b6dba79b00b39d1d3.r79.cf1.rackcdn.com/" \
          "Anaconda-%s-%s-x86_64.sh" % (version, platform)
    if not env.safe_exists(outdir):
        with _make_tmp_dir() as work_dir:
            with cd(work_dir):
                installer = shared._remote_fetch(env, url)
                env.safe_sed(os.path.basename(url), "more <<EOF", "cat  <<EOF")
                env.safe_sudo("echo -e '\nyes\n%s\nyes\n' | bash %s" %
                              (outdir, installer))
                env.safe_sudo("chown -R %s %s" % (env.user, outdir))
                comment_line = "# added by Ananconda %s installer" % version
                if not env.safe_contains(env.shell_config, comment_line):
                    env.safe_append(env.shell_config, comment_line)
                    env.safe_append(env.shell_config,
                                    "export PATH=%s/bin:$PATH" % outdir)
                # remove curl library with broken certificates
                env.safe_run("%s/bin/conda remove --yes curl" % outdir)
                env.safe_run("%s/bin/conda install --yes pip" % outdir)
Example 26
def _download_dbnsfp(env, gid, gconfig):
    """Download and prepare dbNSFP functional prediction resources if configured.

    Feeds into VEP for annotating VCF files:
    https://sites.google.com/site/jpopgen/dbNSFP
    https://github.com/ensembl-variation/VEP_plugins/blob/master/dbNSFP.pm
    """
    version = "2.8"
    url = "https://onedrive.live.com/download?cid=0D359D171E382137&resid=D359D171E382137%2154761&authkey=AFm7prRqSLLLC9g"
    dl_file = "dbNSFPv%s.zip" % version
    if gconfig.get("dbnsfp"):
        outfile = "dbNSFP_v%s.gz" % (version)
        if gid == "GRCh37" or (gid == "hg19" and not env.safe_exists("../../GRCh37")):
            if not env.safe_exists(outfile):
                zipfile = shared._remote_fetch(env, url, out_file=dl_file, samedir=True)
                outdir = "dbNSFPv%s" % version
                env.safe_run("mkdir -p %s" % outdir)
                env.safe_run("unzip %s -d %s" % (zipfile, outdir))
                env.safe_run("cat %s/dbNSFP*_variant.chr* | bgzip -c > %s" % (outdir, outfile))
                env.safe_run("rm -f %s/* && rmdir %s" % (outdir, outdir))
                env.safe_run("rm -f %s" % (zipfile))
            if not env.safe_exists(outfile + ".tbi"):
                env.safe_run("tabix -s 1 -b 2 -e 2 -c '#' %s" % outfile)
        elif gid == "hg19":  # symlink to GRCh37 download
            if not env.safe_exists(outfile):
                env.safe_run("ln -sf ../../GRCh37/variation/%s %s" % (outfile, outfile))
            if not env.safe_exists(outfile + ".tbi"):
                env.safe_run("ln -sf ../../GRCh37/variation/%s.tbi %s.tbi" % (outfile, outfile))
Example 27
def _download_dbnsfp(env, gid, gconfig):
    """Download and prepare dbNSFP functional prediction resources if configured.

    Feeds into VEP for annotating VCF files:
    https://sites.google.com/site/jpopgen/dbNSFP
    https://github.com/ensembl-variation/VEP_plugins/blob/master/dbNSFP.pm
    """
    version = "2.5"
    url = "http://dbnsfp.houstonbioinformatics.org/dbNSFPzip/dbNSFPv%s.zip" % version
    if gconfig.get("dbnsfp"):
        outfile = "dbNSFP_v%s.gz" % (version)
        if gid == "GRCh37":  # download and prepare bgzipped output file
            if not env.safe_exists(outfile):
                zipfile = shared._remote_fetch(env, url, samedir=True)
                outdir = "dbNSFPv%s" % version
                env.safe_run("mkdir -p %s" % outdir)
                env.safe_run("unzip %s -d %s" % (zipfile, outdir))
                env.safe_run("cat %s/dbNSFP*_variant.chr* | bgzip -c > %s" % (outdir, outfile))
                env.safe_run("rm -f %s/* && rmdir %s" % (outdir, outdir))
                env.safe_run("rm -f %s" % (zipfile))
            if not env.safe_exists(outfile + ".tbi"):
                env.safe_run("tabix -s 1 -b 2 -e 2 -c '#' %s" % outfile)
        elif gid == "hg19":  # symlink to GRCh37 download
            if not env.safe_exists(outfile):
                env.safe_run("ln -sf ../../GRCh37/variation/%s %s" % (outfile, outfile))
            if not env.safe_exists(outfile + ".tbi"):
                env.safe_run("ln -sf ../../GRCh37/variation/%s.tbi %s.tbi" % (outfile, outfile))
Example 28
 def _remove_werror_get_boost(env):
     env.safe_sed("configure", " -Werror", "")
     # http://osdir.com/ml/abyss-users-science/2011-10/msg00108.html
     url = "http://downloads.sourceforge.net/project/boost/boost/1.47.0/boost_1_47_0.tar.bz2"
     dl_file = shared._remote_fetch(env, url)
     env.safe_run("tar jxf %s" % dl_file)
     env.safe_run("ln -s boost_1_47_0/boost boost")
Example 29
def _download_ancestral(env, gid, gconfig):
    """Download ancestral genome sequence for loss of function evaluation.

    Used by LOFTEE VEP plugin: https://github.com/konradjk/loftee
    """
    base_url = "http://www.broadinstitute.org/~konradk/loftee/human_ancestor.fa.rz"
    if gid == "GRCh37":
        for ext in ["", ".fai"]:
            outfile = os.path.basename(base_url) + ext
            if not env.safe_exists(outfile):
                shared._remote_fetch(env, base_url + ext, samedir=True)
    elif gid == "hg19":  # symlink to GRCh37 download
        for ext in ["", ".fai"]:
            outfile = os.path.basename(base_url) + ext
            if not env.safe_exists(outfile):
                env.safe_run("ln -sf ../../GRCh37/variation/%s %s" % (outfile, outfile))
Example 30
def _data_liftover(lift_over_genomes):
    """Download chain files for running liftOver.

    Does not install liftOver binaries automatically.
    """
    lo_dir = os.path.join(env.data_files, "liftOver")
    if not env.safe_exists(lo_dir):
        env.safe_run("mkdir %s" % lo_dir)
    lo_base_url = "ftp://hgdownload.cse.ucsc.edu/goldenPath/%s/liftOver/%s"
    lo_base_file = "%sTo%s.over.chain.gz"
    for g1 in lift_over_genomes:
        for g2 in [g for g in lift_over_genomes if g != g1]:
            g2u = g2[0].upper() + g2[1:]
            cur_file = lo_base_file % (g1, g2u)
            non_zip = os.path.splitext(cur_file)[0]
            worked = False
            with cd(lo_dir):
                if not env.safe_exists(non_zip):
                    result = shared._remote_fetch(env, "%s" % (lo_base_url % (g1, cur_file)), allow_fail=True)
                    # Lift over back and forths don't always exist
                    # Only move forward if we found the file
                    if result:
                        worked = True
                        env.safe_run("gunzip %s" % result)
            if worked:
                ref_parts = [g1, g2, os.path.join(lo_dir, non_zip)]
                galaxy.update_loc_file("liftOver.loc", ref_parts)
Example 31
def install_tassel(env):
    """TASSEL: evaluate traits associations, evolutionary patterns, and linkage disequilibrium.
    http://www.maizegenetics.net/index.php?option=com_content&task=view&id=89&/Itemid=119
    """
    version = "5"
    build_id = "1140d3fceb75"
    url = "https://bitbucket.org/tasseladmin/tassel-{0}-standalone/get/{1}.zip".format(
        version, build_id)
    executables = ["start_tassel.pl", "run_pipeline.pl"]
    install_dir = _symlinked_java_version_dir("tassel", version, env)
    if install_dir:
        with _make_tmp_dir() as work_dir:
            with cd(work_dir):
                dl_file = shared._remote_fetch(env, url)
                env.safe_run("unzip %s" % dl_file)
                with cd("tasseladmin-tassel-{0}-standalone-{1}".format(
                        version, build_id)):
                    for x in executables:
                        env.safe_sed(
                            x, "^my \$top.*;",
                            "use FindBin qw($RealBin); my $top = $RealBin;")
                        env.safe_sudo("chmod a+rwx %s" % x)
                    env.safe_sudo("mv * %s" % install_dir)
                for x in executables:
                    env.safe_sudo("ln -s %s/%s %s/bin/%s" %
                                  (install_dir, x, env.system_install, x))
Example 32
def install_gemini(env):
    """A lightweight db framework for disease and population genetics.
    https://github.com/arq5x/gemini
    """
    version = "0.6.4"
    if versioncheck.up_to_date(env, "gemini -v", version,
                               stdout_flag="gemini"):
        return
    elif not shared._executable_not_on_path("gemini -v"):
        env.safe_run("gemini update")
    else:
        iurl = "https://raw.github.com/arq5x/gemini/master/gemini/scripts/gemini_install.py"
        data_dir = os.path.join(
            env.system_install,
            "local" if env.system_install.find("/local") == -1 else "",
            "share", "gemini")
        with _make_tmp_dir(ext="-gemini") as work_dir:
            with cd(work_dir):
                if env.safe_exists(os.path.basename(iurl)):
                    env.safe_run("rm -f %s" % os.path.basename(iurl))
                installer = shared._remote_fetch(env, iurl)
                env.safe_run("%s %s %s %s %s" %
                             (_python_cmd(env), installer, "" if env.use_sudo
                              else "--nosudo", env.system_install, data_dir))
                env.safe_run("rm -f gemini_install.py")
Example 33
 def _remove_werror_get_boost(env):
     env.safe_sed("configure", " -Werror", "")
     # http://osdir.com/ml/abyss-users-science/2011-10/msg00108.html
     url = "http://downloads.sourceforge.net/project/boost/boost/1.47.0/boost_1_47_0.tar.bz2"
     dl_file = shared._remote_fetch(env, url)
     env.safe_run("tar jxf %s" % dl_file)
     env.safe_run("ln -s boost_1_47_0/boost boost")
Example 34
def _install_bottle(env, brew_cmd, pkg, ipkgs):
    """Install Linux bottles for brew packages that can be tricky to build.
    """
    if env.distribution == "macosx":  # Only Linux bottles, build away on Mac
        return False
    pkg_version, is_linked = _latest_pkg_version(env, brew_cmd, pkg)
    install_version = ipkgs["current"].get(pkg)
    if pkg_version == install_version:  # Up to date
        if not is_linked:
            env.safe_run("%s link --overwrite %s" % (brew_cmd, pkg))
        return True
    elif install_version or pkg in ipkgs["outdated"]:
        env.safe_run("{brew_cmd} remove --force {pkg}".format(**locals()))
    url = BOTTLE_URL.format(pkg=pkg, version=pkg_version)
    brew_cachedir = env.safe_run_output("%s --cache" % brew_cmd)
    brew_cellar = os.path.join(env.safe_run_output("%s --prefix" % brew_cmd), "Cellar")
    with quiet():
        env.safe_run("mkdir -p %s" % brew_cellar)
    out_file = os.path.join(brew_cachedir, os.path.basename(url))
    if env.safe_exists(out_file):
        env.safe_run("rm -f %s" % out_file)
    bottle_file = shared._remote_fetch(env, url, out_file=out_file,
                                       allow_fail=True, samedir=True)
    if bottle_file:
        with cd(brew_cellar):
            env.safe_run("tar -xf %s" % bottle_file)
        env.safe_run("%s link --overwrite %s" % (brew_cmd, pkg))
        return True
    else:
        return False
Example 35
def _download_lcrs(gid):
    """Retrieve low complexity regions from Heng Li's variant analysis paper.
    """
    lcr_url = "https://github.com/lh3/varcmp/raw/master/scripts/LCR-hs37d5.bed.gz"
    out_file = "LCR.bed.gz"
    if not env.safe_exists(out_file):
        def _fix_chrom_names(env, orig_file):
            if gid == "hg19":
                convert_cmd = "| grep -v ^GL | grep -v ^NC | grep -v ^hs | sed 's/^/chr/'"
            else:
                assert gid == "GRCh37"
                convert_cmd = ""
            env.safe_run("zcat %s %s | bgzip -c > %s" % (orig_file, convert_cmd, out_file))
            return out_file
        shared._remote_fetch(env, lcr_url, fix_fn=_fix_chrom_names)
        env.safe_run("tabix -p vcf -f %s" % out_file)
Example 36
def _install_bottle(env, brew_cmd, pkg, ipkgs):
    """Install Linux bottles for brew packages that can be tricky to build.
    """
    if env.distribution == "macosx":  # Only Linux bottles, build away on Mac
        return False
    pkg_version, is_linked = _latest_pkg_version(env, brew_cmd, pkg)
    install_version = ipkgs["current"].get(pkg)
    if pkg_version == install_version:  # Up to date
        if not is_linked:
            env.safe_run("%s link --overwrite %s" % (brew_cmd, pkg))
        return True
    elif install_version or pkg in ipkgs["outdated"]:
        env.safe_run("{brew_cmd} remove --force {pkg}".format(**locals()))
    url = BOTTLE_URL.format(pkg=pkg, version=pkg_version)
    brew_cachedir = env.safe_run_output("%s --cache" % brew_cmd)
    brew_cellar = os.path.join(env.safe_run_output("%s --prefix" % brew_cmd), "Cellar")
    with quiet():
        env.safe_run("mkdir -p %s" % brew_cellar)
    out_file = os.path.join(brew_cachedir, os.path.basename(url))
    if env.safe_exists(out_file):
        env.safe_run("rm -f %s" % out_file)
    bottle_file = shared._remote_fetch(env, url, out_file=out_file,
                                       allow_fail=True, samedir=True)
    if bottle_file:
        with cd(brew_cellar):
            env.safe_run("tar -xf %s" % bottle_file)
        env.safe_run("%s link --overwrite %s" % (brew_cmd, pkg))
        return True
    else:
        return False
Example 37
def _download_broad_bundle(gid, bundle_version, name, ext):
    broad_fname = "{name}.{gid}.vcf{ext}".format(gid=gid, name=name, ext=ext)
    fname = broad_fname.replace(".{0}".format(gid), "").replace(".sites",
                                                                "") + ".gz"
    base_url = "ftp://gsapubftp-anonymous:@ftp.broadinstitute.org/bundle/" + \
               "{bundle}/{gid}/{fname}.gz".format(
                   bundle=bundle_version, fname=broad_fname, gid=gid)
    # compress and prepare existing uncompressed versions
    if env.safe_exists(fname.replace(".vcf.gz", ".vcf")):
        env.safe_run("bgzip %s" % fname.replace(".vcf.gz", ".vcf"))
        env.safe_run("tabix -f -p vcf %s" % fname)
    # otherwise, download and bgzip and tabix index
    if not env.safe_exists(fname):
        out_file = shared._remote_fetch(env, base_url, allow_fail=True)
        if out_file:
            env.safe_run("gunzip -c %s | bgzip -c > %s" % (out_file, fname))
            env.safe_run("tabix -f -p vcf %s" % fname)
            env.safe_run("rm -f %s" % out_file)
        else:
            env.logger.warn("dbSNP resources not available for %s" % gid)
    # clean up old files
    for ext in [".vcf", ".vcf.idx"]:
        if env.safe_exists(fname.replace(".vcf.gz", ext)):
            env.safe_run("rm -f %s" % (fname.replace(".vcf.gz", ext)))
    return fname
Example 38
def install_anaconda(env):
    """Pre-packaged Anaconda Python installed from Continuum.
    http://docs.continuum.io/anaconda/index.html
    """
    version = "2.0.0"
    outdir = os.path.join(env.system_install, "anaconda")
    if env.distribution in ["ubuntu", "centos", "scientificlinux", "debian", "arch", "suse"]:
        platform = "Linux"
    elif env.distribution in ["macosx"]:
        platform = "MacOSX"
    else:
        raise ValueError("Unexpected distribution: %s" % env.distribution)
    url = "http://09c8d0b2229f813c1b93-c95ac804525aac4b6dba79b00b39d1d3.r79.cf1.rackcdn.com/" \
          "Anaconda-%s-%s-x86_64.sh" % (version, platform)
    if not env.safe_exists(outdir):
        with _make_tmp_dir() as work_dir:
            with cd(work_dir):
                installer = shared._remote_fetch(env, url)
                env.safe_sed(os.path.basename(url), "more <<EOF", "cat  <<EOF")
                env.safe_sudo("echo -e '\nyes\n%s\nyes\n' | bash %s" % (outdir, installer))
                env.safe_sudo("chown -R %s %s" % (env.user, outdir))
                comment_line = "# added by Ananconda %s installer" % version
                if not env.safe_contains(env.shell_config, comment_line):
                    env.safe_append(env.shell_config, comment_line)
                    env.safe_append(env.shell_config, "export PATH=%s/bin:$PATH" % outdir)
                # remove curl library with broken certificates
                env.safe_run("%s/bin/conda remove --yes curl" % outdir)
                env.safe_run("%s/bin/conda install --yes pip" % outdir)
Example 39
 def download(self, seq_dir):
     base_seq = os.path.join(os.pardir, self._custom["base"],
                             "seq", "{0}.fa".format(self._custom["base"]))
     assert env.safe_exists(base_seq)
     mask_file = os.path.basename(self._custom["mask"])
     ready_mask = "{0}-complement{1}".format(*os.path.splitext(mask_file))
     out_fasta = "{0}.fa".format(self._custom["dbkey"])
     if not env.safe_exists(os.path.join(seq_dir, out_fasta)):
         if not env.safe_exists(mask_file):
             shared._remote_fetch(env, self._custom["mask"])
         if not env.safe_exists(ready_mask):
             env.safe_run("bedtools complement -i {i} -g {g}.fai > {o}".format(
                 i=mask_file, g=base_seq, o=ready_mask))
         if not env.safe_exists(out_fasta):
             env.safe_run("bedtools maskfasta -fi {fi} -bed {bed} -fo {fo}".format(
                 fi=base_seq, bed=ready_mask, fo=out_fasta))
     return out_fasta, [mask_file, ready_mask]
Example 40
def _download_sv_repeats(gid):
    """Retrieve telomere and centromere exclusion regions for structural variant calling.
    From Delly: https://github.com/tobiasrausch/delly
    """
    mere_url = "https://raw.githubusercontent.com/chapmanb/delly/master/human.hg19.excl.tsv"
    out_file = "sv_repeat_telomere_centromere.bed"
    if not env.safe_exists(out_file):

        def _select_by_gid(env, orig_file):
            if gid == "hg19":
                env.safe_run("grep ^chr %s > %s" % (orig_file, out_file))
            else:
                assert gid == "GRCh37"
                env.safe_run("grep -v ^chr %s > %s" % (orig_file, out_file))
            return out_file

        shared._remote_fetch(env, mere_url, fix_fn=_select_by_gid)
Example 41
 def download(self, seq_dir):
     base_seq = os.path.join(os.pardir, self._custom["base"],
                             "seq", "{0}.fa".format(self._custom["base"]))
     assert env.safe_exists(base_seq)
     mask_file = os.path.basename(self._custom["mask"])
     ready_mask = "{0}-complement{1}".format(*os.path.splitext(mask_file))
     out_fasta = "{0}.fa".format(self._custom["dbkey"])
     if not env.safe_exists(os.path.join(seq_dir, out_fasta)):
         if not env.safe_exists(mask_file):
             shared._remote_fetch(env, self._custom["mask"])
         if not env.safe_exists(ready_mask):
             env.safe_run("bedtools complement -i {i} -g {g}.fai > {o}".format(
                 i=mask_file, g=base_seq, o=ready_mask))
         if not env.safe_exists(out_fasta):
             env.safe_run("bedtools maskfasta -fi {fi} -bed {bed} -fo {fo}".format(
                 fi=base_seq, bed=ready_mask, fo=out_fasta))
     return out_fasta, [mask_file, ready_mask]
Example 42
def _download_s3_index(env, manager, gid, idx):
    print("Downloading genome from s3: {0} {1}".format(gid, idx))
    url = "https://s3.amazonaws.com/biodata/genomes/%s-%s.tar.xz" % (gid, idx)
    if gid in ["GRCh37", "hg19", "mm10"] and idx in ["bowtie2", "bwa", "novoalign"]:
        out_file = shared._remote_fetch(env, url, samedir=True)
        subprocess.check_call("xz -dc %s | tar -xvpf -" % out_file, shell=True)
        subprocess.check_call("rm -f %s" % out_file, shell=True)
    else:
        raise NotImplementedError("No pre-computed indices for %s %s" % (gid, idx))
Example 43
def _download_cosmic(gid):
    """Prepared versions of COSMIC, pre-sorted and indexed.
    utils/prepare_cosmic.py handles the work of creating the VCFs from standard
    COSMIC resources.
    """
    base_url = "https://s3.amazonaws.com/biodata/variants"
    version = "v67_20131024"
    supported = ["hg19", "GRCh37"]
    if gid in supported:
        url = "%s/cosmic-%s-%s.vcf.gz" % (base_url, version, gid)
        gzip_fname = os.path.basename(url)
        fname = os.path.splitext(gzip_fname)[0]
        if not env.safe_exists(fname):
            if not env.safe_exists(gzip_fname):
                shared._remote_fetch(env, url)
            env.safe_run("gunzip %s" % fname)
        if not env.safe_exists(fname + ".idx"):
            shared._remote_fetch(env, url.replace(".gz", ".idx"))
Example 44
def _download_s3_index(env, manager, gid, idx):
    print("Downloading genome from s3: {0} {1}".format(gid, idx))
    url = "https://s3.amazonaws.com/biodata/genomes/%s-%s.tar.xz" % (gid, idx)
    if gid in ["GRCh37", "hg19", "mm10"] and idx in ["bowtie2", "bwa", "novoalign"]:
        out_file = shared._remote_fetch(env, url, samedir=True)
        subprocess.check_call("xz -dc %s | tar -xvpf -" % out_file, shell=True)
        subprocess.check_call("rm -f %s" % out_file, shell=True)
    else:
        raise NotImplementedError("No pre-computed indices for %s %s" % (gid, idx))
Example 45
 def _download_zip(self, seq_dir):
     for zipped_file in ["chromFa.tar.gz", "%s.fa.gz" % self._name,
                         "chromFa.zip"]:
         if not self._exists(zipped_file, seq_dir):
             result = shared._remote_fetch(env, "%s/%s" % (self._url, zipped_file), allow_fail=True)
             if result:
                 break
         else:
             break
     return zipped_file
Example 46
def install_mutect(env):
    version = "1.1.5"
    url = "https://github.com/broadinstitute/mutect/releases/download/" \
          "%s/muTect-%s-bin.zip" % (version, version)
    install_dir = _symlinked_java_version_dir("mutect", version, env)
    if install_dir:
        with _make_tmp_dir() as work_dir:
            with cd(work_dir):
                out_file = shared._remote_fetch(env, url)
                env.safe_run("unzip %s" % out_file)
                env.safe_sudo("mv *.jar version.txt LICENSE* %s" % install_dir)
Example 47
def _dbsnp_mouse(env, gid):
    """Retrieve resources for mouse variant analysis from custom S3 biodata bucket.
    """
    remote_dir = "https://s3.amazonaws.com/biodata/variants/"
    files = {"mm10": ["mm10-dbSNP-2013-09-12.vcf"]}
    for f in files[gid]:
        for ext in ["", ".idx"]:
            fname = f + ext
            if not env.safe_exists(fname):
                out_file = shared._remote_fetch(
                    env, "%s%s.gz" % (remote_dir, fname))
                env.safe_run("gunzip %s" % out_file)
Example 48
def _download_annotation_bundle(env, url, gid):
    """Download bundle of RNA-seq data from S3 biodata/annotation
    """
    tarball = shared._remote_fetch(env, url, allow_fail=True)
    if tarball and env.safe_exists(tarball):
        env.logger.info("Extracting RNA-seq references: %s" % tarball)
        env.safe_run("xz -dc %s | tar -xpf -" % tarball)
        env.safe_run("rm -f %s" % tarball)
        return True
    else:
        env.logger.warn("RNA-seq transcripts not available for %s" % gid)
        return False
Example 49
def install_varscan(env):
    """Variant detection in massively parallel sequencing data
    http://varscan.sourceforge.net/
    """
    version = "2.3.6"
    url = "http://downloads.sourceforge.net/project/varscan/VarScan.v%s.jar" % version
    install_dir = _symlinked_java_version_dir("varscan", version, env)
    if install_dir:
        with _make_tmp_dir() as work_dir:
            with cd(work_dir):
                out_file = shared._remote_fetch(env, url)
                env.safe_sudo("mv %s %s" % (out_file, install_dir))
Example 50
def install_cram(env):
    """Highly efficient and tunable reference-based compression of sequence data.
    http://www.ebi.ac.uk/ena/about/cram_toolkit/
    """
    version = "2.0"
    url = "https://github.com/vadimzalunin/crammer/raw/master/" \
          "cramtools-%s.jar" % version
    install_dir = _symlinked_java_version_dir("cram", version, env)
    if install_dir:
        with _make_tmp_dir() as work_dir:
            with cd(work_dir):
                out_file = shared._remote_fetch(env, url)
                env.safe_sudo("mv %s %s" % (out_file, install_dir))
Example 51
def install_rnaseqc(env):
    """Quality control metrics for RNA-seq data
    https://www.broadinstitute.org/cancer/cga/rna-seqc
    """
    version = "1.1.7"
    url = ("https://github.com/chapmanb/RNA-SeQC/releases/download/"
           "v%s/RNA-SeQC_v%s.jar" % (version, version))
    install_dir = _symlinked_java_version_dir("RNA-SeQC", version, env)
    if install_dir:
        with _make_tmp_dir() as work_dir:
            with cd(work_dir):
                out_file = shared._remote_fetch(env, url)
                env.safe_sudo("mv %s %s" % (out_file, install_dir))
Example 52
def install_bcbio_variation(env):
    """Toolkit to analyze genomic variation data with comparison and ensemble approaches.
    https://github.com/chapmanb/bcbio.variation
    """
    version = "0.1.6"
    url = "https://github.com/chapmanb/bcbio.variation/releases/download/" \
          "v%s/bcbio.variation-%s-standalone.jar" % (version, version)
    install_dir = _symlinked_java_version_dir("bcbio_variation", version, env)
    if install_dir:
        with _make_tmp_dir() as work_dir:
            with cd(work_dir):
                jar_file = shared._remote_fetch(env, url)
                env.safe_sudo("mv %s %s" % (jar_file, install_dir))
Example 53
def install_rnaseqc(env):
    """Quality control metrics for RNA-seq data
    https://www.broadinstitute.org/cancer/cga/rna-seqc
    """
    version = "1.1.7"
    url = ("http://www.broadinstitute.org/cancer/cga/sites/default/files/"
           "data/tools/rnaseqc/RNA-SeQC_v%s.jar" % version)
    install_dir = _symlinked_java_version_dir("RNA-SeQC", version, env)
    if install_dir:
        with _make_tmp_dir() as work_dir:
            with cd(work_dir):
                out_file = shared._remote_fetch(env, url)
                env.safe_sudo("mv %s %s" % (out_file, install_dir))
Example 54
def _download_broad_bundle(gid, bundle_version, name, ext):
    broad_fname = "{name}.{gid}.vcf{ext}".format(gid=gid, name=name, ext=ext)
    fname = broad_fname.replace(".{0}".format(gid), "").replace(".sites", "")
    base_url = "ftp://gsapubftp-anonymous:@ftp.broadinstitute.org/bundle/" + \
               "{bundle}/{gid}/{fname}.gz".format(
                   bundle=bundle_version, fname=broad_fname, gid=gid)
    if not env.safe_exists(fname):
        out_file = shared._remote_fetch(env, base_url, allow_fail=True)
        if out_file:
            env.safe_run("gunzip %s" % out_file)
            env.safe_run("mv %s %s" % (broad_fname, fname))
        else:
            env.logger.warn("dbSNP resources not available for %s" % gid)
    return fname
Example 55
def install_fastq_screen(env):
    """A screening application for high througput sequence data.
    http://www.bioinformatics.babraham.ac.uk/projects/fastq_screen/
    """
    version = "0.4"
    url = "http://www.bioinformatics.babraham.ac.uk/projects/fastq_screen/" \
          "fastq_screen_v%s.tar.gz" % version
    install_dir = shared._symlinked_shared_dir("fastqc_screen", version, env)
    executable = "fastq_screen"
    if install_dir:
        with _make_tmp_dir() as work_dir:
            with cd(work_dir):
                out_file = shared._remote_fetch(env, url)
                env.safe_run("tar -xzvpf %s" % out_file)
                with cd("fastq_screen_v%s" % version):
                    env.safe_sudo("mv * %s" % install_dir)
                env.safe_sudo(
                    "ln -s %s/%s %s/bin/%s" %
                    (install_dir, executable, env.system_install, executable))
Example 56
def install_fastqc(env):
    """A quality control tool for high throughput sequence data.
    http://www.bioinformatics.babraham.ac.uk/projects/fastqc/
    """
    version = "0.10.1"
    url = "http://www.bioinformatics.bbsrc.ac.uk/projects/fastqc/" \
          "fastqc_v%s.zip" % version
    executable = "fastqc"
    install_dir = _symlinked_java_version_dir("fastqc", version, env)
    if install_dir:
        with _make_tmp_dir() as work_dir:
            with cd(work_dir):
                out_file = shared._remote_fetch(env, url)
                env.safe_run("unzip %s" % out_file)
                with cd("FastQC"):
                    env.safe_sudo("chmod a+rwx %s" % executable)
                    env.safe_sudo("mv * %s" % install_dir)
                env.safe_sudo(
                    "ln -s %s/%s %s/bin/%s" %
                    (install_dir, executable, env.system_install, executable))
Example 57
def install_shrec(env):
    """Shrec is a bioinformatics tool for error correction of HTS read data.
    http://sourceforge.net/projects/shrec-ec/
    """
    version = "2.2"
    url = "http://downloads.sourceforge.net/project/shrec-ec/SHREC%%20%s/bin.zip" % version
    install_dir = _symlinked_java_version_dir("shrec", version, env)
    if install_dir:
        shrec_script = "%s/shrec" % install_dir
        with _make_tmp_dir() as work_dir:
            with cd(work_dir):
                out_file = shared._remote_fetch(env, url)
                env.safe_run("unzip %s" % out_file)
                env.safe_sudo("mv *.class %s" % install_dir)
                for line in _shrec_run.split("\n"):
                    if line.strip():
                        env.safe_append(shrec_script,
                                        line,
                                        use_sudo=env.use_sudo)
                env.safe_sudo("chmod a+rwx %s" % shrec_script)
                env.safe_sudo("ln -s %s %s/bin/shrec" %
                              (shrec_script, env.system_install))
Example 58
def _download_qsignature(env, gid, gconfig):
    """Download qsignature position file to detect samples problems

    :param env
    :param gid: str genome id
    :param gconfig: 

    :returns: NULL
    """
    base_url = "http://downloads.sourceforge.net/project/adamajava/qsignature.tar.bz2"
    outfile = "qsignature.vcf"
    if gid == "GRCh37" or (gid == "hg19" and not env.safe_exists("../../GRCh37")):
        if not env.safe_exists(outfile):
            zipfile = shared._remote_fetch(env, base_url, samedir=True)
            outdir = "qsignature"
            env.safe_run("mkdir -p %s" % outdir)
            env.safe_run("tar -jxf %s -C %s" % (zipfile, outdir))
            env.safe_run("mv %s/qsignature_positions.txt %s" % (outdir, outfile))
            env.safe_run("rm -rf %s" % outdir)
            env.safe_run("rm -rf %s" % zipfile)
    elif gid == "hg19":  # symlink to GRCh37 download
        if not env.safe_exists(outfile):
            env.safe_run("ln -sf ../../GRCh37/variation/%s %s" % (outfile, outfile))
Example 59
def _download_broad_bundle(gid, bundle_version, name, ext):
    # Broad bundle directories have uneven use of ".sites" in VCF file names;
    # it is only present in hg19 for non-dbSNP resources
    sites = ".sites" if gid == "hg19" and not name.startswith("dbsnp") else ""
    broad_fname = "{name}.{gid}{sites}.vcf{ext}".format(gid=gid, name=name, sites=sites, ext=ext)
    fname = broad_fname.replace(".{0}".format(gid), "").replace(".sites", "") + ".gz"
    base_url = "ftp://gsapubftp-anonymous:@ftp.broadinstitute.org/bundle/" + \
               "{bundle}/{gid}/{fname}.gz".format(
                   bundle=bundle_version, fname=broad_fname, gid=gid)
    # compress and prepare existing uncompressed versions
    if env.safe_exists(fname.replace(".vcf.gz", ".vcf")):
        env.safe_run("bgzip %s" % fname.replace(".vcf.gz", ".vcf"))
        env.safe_run("tabix -f -p vcf %s" % fname)
    # otherwise, download and bgzip and tabix index
    if not env.safe_exists(fname):
        out_file = shared._remote_fetch(env, base_url)
        env.safe_run("gunzip -c %s | bgzip -c > %s" % (out_file, fname))
        env.safe_run("tabix -f -p vcf %s" % fname)
        env.safe_run("rm -f %s" % out_file)
    # clean up old files
    for ext in [".vcf", ".vcf.idx"]:
        if env.safe_exists(fname.replace(".vcf.gz", ext)):
            env.safe_run("rm -f %s" % (fname.replace(".vcf.gz", ext)))
    return fname
Example 60
 def _get_samtools(env):
     shared._remote_fetch(env, samtools_url)
     env.safe_run("tar jxf samtools-{0}.tar.bz2".format(samtools_version))
     env.safe_run("ln -s samtools-{0} samtools".format(samtools_version))