Ejemplo n.º 1
0
    def get_annotation_download_links(self, name, **kwargs):
        """
        Assemble the Ensembl GTF download link for a genome and verify it.

        Parameters
        ----------
        name : str
            genome name
        **kwargs: dict, optional:
            version : Ensembl version to use. By default the latest version is used

        Returns
        -------
        list
            a one-element list with the http link when it passes ``check_url``,
            otherwise an empty list
        """
        genome = self.genomes[safe(name)]
        division, is_vertebrate = get_division(genome)

        # vertebrates are hosted on a separate server, without a division folder
        if is_vertebrate:
            server, division_dir = "http://ftp.ensembl.org", ""
        else:
            server, division_dir = "http://ftp.ensemblgenomes.org", f"/{division}"
        version = self.get_version(is_vertebrate, kwargs.get("version"))

        def gtf_dir(species):
            # release/division specific gtf directory of one species
            return f"{server}/pub/release-{version}{division_dir}/gtf/{species}"

        species = genome["url_name"].lower()
        directory = gtf_dir(species)
        # some entries don't use url_name in their url, fall back on name.
        # examples:
        #   - EnsemblVertebrates: mus_musculus_nzohlltj
        #   - EnsemblMetazoa: caenorhabditis_elegans
        if not check_url(directory, 2):
            species = genome["name"]
            directory = gtf_dir(species)

        # file name: capitalized species, assembly without patch suffix, version
        species_cap = species.capitalize()
        assembly = re.sub(r"\.p\d+$", "", safe(genome["assembly_name"]))
        file_name = f"{species_cap}.{assembly}.{version}.gtf.gz"

        if name == "GRCh37":
            # this assembly carries its own link template in the genome entry
            link = genome["annotation"].format(version)
        else:
            link = f"{directory}/{file_name}"

        if check_url(link, max_tries=2):
            return [link]
        return []
Ejemplo n.º 2
0
    def _ftp_or_html_link(self, name, file_suffix, skip_check=False):
        """
        NCBI's files are accessible over FTP and HTTPS.

        Builds the file link for both protocols (HTTPS first, FTP second)
        and returns the first one that passes ``check_url``. When
        ``skip_check`` is truthy the first link (HTTPS) is returned without
        checking. Returns None if no link passes the check.
        """
        base_ftp = self.genomes[safe(name)]["ftp_path"]
        base_https = base_ftp.replace("ftp://", "https://")

        for base in (base_https, base_ftp):
            # the file name starts with the last component of its directory path
            last_component = base.split("/")[-1]
            candidate = base + "/" + last_component + file_suffix

            if skip_check:
                return candidate
            if check_url(candidate, max_tries=2, timeout=10):
                return candidate

        return None
Ejemplo n.º 3
0
    def get_genome_download_link(self, name, mask="soft", **kwargs):
        """
        Return UCSC http link to genome sequence

        Parameters
        ----------
        name : str
            Genome name. Current implementation will fail if exact
            name is not found.

        mask : str , optional
            Masking level. Options: soft, hard or none. Default is soft.
            Only "hard" selects the masked files; any other value selects
            the soft masked files.

        Returns
        -------
        str with the http/ftp download link.

        Raises
        ------
        GenomeDownloadError
            if none of the candidate links passes ``check_url``.
        """
        # two candidate locations per masking level, tried in this order
        if mask == "hard":
            paths = (
                "/{0}/bigZips/chromFaMasked.tar.gz",
                "/{0}/bigZips/{0}.fa.masked.gz",
            )
        else:
            # soft masked genomes. can be unmasked in _post _process_download
            paths = (
                "/{0}/bigZips/chromFa.tar.gz",
                "/{0}/bigZips/{0}.fa.gz",
            )
        candidates = [(self._url + path).format(name) for path in paths]

        for link in candidates:
            if check_url(link, 2):
                return link

        broken = ", ".join(candidates)
        raise GenomeDownloadError(
            f"Could not download genome {name} from {self.name}.\n"
            "URLs are broken. Select another genome or provider.\n"
            f"Broken URLs: {broken}")
Ejemplo n.º 4
0
 def ping():
     """Can the provider be reached? Returns check_url's result as a bool."""
     reachable = check_url("ftp.ebi.ac.uk/pub/databases/gencode")
     return bool(reachable)
Ejemplo n.º 5
0
    def get_genome_download_link(self, name, mask="soft", **kwargs):
        """
        Return http link to the genome sequence

        Parameters
        ----------
        name : str
            Genome name. Current implementation will fail if exact
            name is not found.

        mask : str , optional
            Masking level. Options: soft, hard or none. Default is soft.
            Any other value raises a KeyError.

        **kwargs: dict, optional:
            version : Ensembl version, passed on to ``self.get_version``
            toplevel : if truthy, request the toplevel assembly directly
            instead of the primary assembly

        Returns
        -------
        str with the http download link.

        Raises
        ------
        GenomeDownloadError
            if no candidate link passes ``check_url``.
        """
        genome = self.genomes[safe(name)]
        division, is_vertebrate = get_division(genome)

        # vertebrates are hosted on a separate server, without a division folder
        if is_vertebrate:
            server, division_dir = "http://ftp.ensembl.org", ""
        else:
            server, division_dir = "http://ftp.ensemblgenomes.org", f"/{division}"
        version = self.get_version(is_vertebrate, kwargs.get("version"))

        def dna_dir(species):
            # release/division specific fasta directory of one species
            return f"{server}/pub/release-{version}{division_dir}/fasta/{species}/dna"

        species = genome["url_name"].lower()
        directory = dna_dir(species)
        # some entries don't use url_name in their url, fall back on name.
        # examples:
        #   - EnsemblVertebrates: mus_musculus_nzohlltj
        #   - EnsemblMetazoa: caenorhabditis_elegans
        if not check_url(directory, 2):
            species = genome["name"]
            directory = dna_dir(species)

        # this assembly has its own directory
        if name == "GRCh37":
            directory = genome["genome"].format(version)

        # pieces of the fasta file name
        species_cap = species.capitalize()
        assembly = re.sub(r"\.p\d+$", "", safe(genome["assembly_name"]))
        masking = {"soft": "_sm", "hard": "_rm", "none": ""}[mask]
        level = "toplevel" if kwargs.get("toplevel") else "primary_assembly"
        # only releases up to 30 carry the version in the file name
        version_tag = "" if int(version) > 30 else f".{version}"
        file_name = f"{species_cap}.{assembly}{version_tag}.dna{masking}.{level}.fa.gz"

        link = f"{directory}/{file_name}"
        found = check_url(link, 2)

        # primary assemblies do not always exist, retry with the toplevel file
        if not found and level == "primary_assembly":
            link = link.replace("primary_assembly", "toplevel")
            found = check_url(link, 2)

        if found:
            return link

        raise GenomeDownloadError(
            f"Could not download genome {name} from {self.name}.\n"
            "URL is broken. Select another genome or provider.\n"
            f"Broken URL: {link}")
Ejemplo n.º 6
0
 def ping():
     """Can the provider be reached? Returns check_url's result as a bool."""
     reachable = check_url("https://rest.ensembl.org/info/ping?")
     return bool(reachable)
Ejemplo n.º 7
0
 def ping():
     """Can the provider be reached? Returns check_url's result as a bool."""
     reachable = check_url("http://hgdownload.soe.ucsc.edu/goldenPath")
     return bool(reachable)
Ejemplo n.º 8
0
 def ping():
     """Can the provider be reached? Returns check_url's result as a bool."""
     reachable = check_url("https://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/")
     return bool(reachable)