def get_annotation_download_links(self, name, **kwargs):
    """
    Build the gene annotation (GTF) download link and return it if it works.

    Parameters
    ----------
    name : str
        genome name
    **kwargs: dict, optional:
        version : Ensembl version to use, passed on to ``self.get_version``.
        By default the latest version is used

    Returns
    -------
    list
        a one-element list with the http link if ``check_url`` accepts it,
        otherwise an empty list
    """
    record = self.genomes[safe(name)]
    division, is_vertebrate = get_division(record)
    version = self.get_version(is_vertebrate, kwargs.get("version"))

    # vertebrates live on their own host, without a division sub-directory
    if is_vertebrate:
        gtf_root = f"http://ftp.ensembl.org/pub/release-{version}/gtf"
    else:
        gtf_root = (
            f"http://ftp.ensemblgenomes.org/pub/release-{version}/{division}/gtf"
        )

    # first try the directory named after url_name;
    # some entries use the plain name in their url instead, e.g.
    # - EnsemblVertebrates: mus_musculus_nzohlltj
    # - EnsemblMetazoa: caenorhabditis_elegans
    species = record["url_name"].lower()
    directory = f"{gtf_root}/{species}"
    if not check_url(directory, 2):
        species = record["name"]
        directory = f"{gtf_root}/{species}"

    # file name: <Species>.<assembly without patch suffix>.<version>.gtf.gz
    assembly = re.sub(r"\.p\d+$", "", safe(record["assembly_name"]))
    filename = f"{species.capitalize()}.{assembly}.{version}.gtf.gz"

    # GRCh37 carries its own link template in the genome record
    if name == "GRCh37":
        url = record["annotation"].format(version)
    else:
        url = f"{directory}/{filename}"

    if check_url(url, max_tries=2):
        return [url]
    return []
def _ftp_or_html_link(self, name, file_suffix, skip_check=False):
    """
    NCBI's files are accessible over FTP and HTTPS.

    The HTTPS variant of the genome's "ftp_path" is tried first, then the
    FTP path itself. For each, the target is
    ``<base>/<last path component of base><file_suffix>``.

    Returns the first candidate accepted by ``check_url`` (or the first
    candidate outright when ``skip_check`` is truthy), and None if no
    candidate is accepted.
    """
    ftp_base = self.genomes[safe(name)]["ftp_path"]
    https_base = ftp_base.replace("ftp://", "https://")

    for base in (https_base, ftp_base):
        # the file is named after the final component of its directory
        basename = base.rsplit("/", 1)[-1]
        candidate = base + "/" + basename + file_suffix

        if skip_check:
            return candidate
        if check_url(candidate, max_tries=2, timeout=10):
            return candidate

    return None
def get_genome_download_link(self, name, mask="soft", **kwargs):
    """
    Return the first working link to the genome sequence under ``self._url``

    Parameters
    ----------
    name : str
        Genome name. Current implementation will fail if exact name is
        not found.

    mask : str , optional
        Masking level. Options: soft, hard or none. Default is soft.
        Only "hard" selects the masked files; any other value selects
        the regular (soft masked) files.

    Returns
    ------
    str with the http/ftp download link.

    Raises
    ------
    GenomeDownloadError
        if none of the candidate links passes ``check_url``
    """
    if mask == "hard":
        templates = (
            self._url + "/{0}/bigZips/chromFaMasked.tar.gz",
            self._url + "/{0}/bigZips/{0}.fa.masked.gz",
        )
    else:
        # soft masked genomes. can be unmasked in _post_process_download
        templates = (
            self._url + "/{0}/bigZips/chromFa.tar.gz",
            self._url + "/{0}/bigZips/{0}.fa.gz",
        )

    candidates = [template.format(name) for template in templates]
    for candidate in candidates:
        if check_url(candidate, 2):
            return candidate

    broken = ", ".join(candidates)
    raise GenomeDownloadError(
        f"Could not download genome {name} from {self.name}.\n"
        "URLs are broken. Select another genome or provider.\n"
        f"Broken URLs: {broken}"
    )
def ping():
    """Can the provider be reached? True if check_url accepts its address."""
    address = "ftp.ebi.ac.uk/pub/databases/gencode"
    reachable = check_url(address)
    return bool(reachable)
def get_genome_download_link(self, name, mask="soft", **kwargs):
    """
    Return http link to the genome sequence

    Parameters
    ----------
    name : str
        Genome name. Current implementation will fail if exact name is
        not found.

    mask : str , optional
        Masking level. Options: soft, hard or none. Default is soft.
        Any other value raises a KeyError.

    **kwargs: dict, optional:
        version : Ensembl version, passed on to ``self.get_version``.
        toplevel : if truthy, request the toplevel assembly instead of
        the primary assembly.

    Returns
    ------
    str with the http download link.

    Raises
    ------
    GenomeDownloadError
        if no candidate link passes ``check_url``
    """
    record = self.genomes[safe(name)]
    division, is_vertebrate = get_division(record)
    version = self.get_version(is_vertebrate, kwargs.get("version"))

    # vertebrates live on their own host, without a division sub-directory
    if is_vertebrate:
        fasta_root = f"http://ftp.ensembl.org/pub/release-{version}/fasta"
    else:
        fasta_root = (
            f"http://ftp.ensemblgenomes.org/pub/release-{version}/{division}/fasta"
        )

    # first try the directory named after url_name;
    # some entries use the plain name in their url instead, e.g.
    # - EnsemblVertebrates: mus_musculus_nzohlltj
    # - EnsemblMetazoa: caenorhabditis_elegans
    species = record["url_name"].lower()
    directory = f"{fasta_root}/{species}/dna"
    if not check_url(directory, 2):
        species = record["name"]
        directory = f"{fasta_root}/{species}/dna"

    # this assembly has its own directory
    if name == "GRCh37":
        directory = record["genome"].format(version)

    # pieces of the fasta file name
    capitalized = species.capitalize()
    assembly = re.sub(r"\.p\d+$", "", safe(record["assembly_name"]))
    mask_suffixes = {"soft": "_sm", "hard": "_rm", "none": ""}
    mask_suffix = mask_suffixes[mask]
    if kwargs.get("toplevel"):
        level = "toplevel"
    else:
        level = "primary_assembly"
    # only releases up to and including 30 carry the version in the name
    if int(version) > 30:
        version_part = ""
    else:
        version_part = f".{version}"

    filename = f"{capitalized}.{assembly}{version_part}.dna{mask_suffix}.{level}.fa.gz"
    url = f"{directory}/{filename}"
    if check_url(url, 2):
        return url

    # primary assemblies do not always exist: retry with toplevel
    if level == "primary_assembly":
        url = url.replace("primary_assembly", "toplevel")
        if check_url(url, 2):
            return url

    raise GenomeDownloadError(
        f"Could not download genome {name} from {self.name}.\n"
        "URL is broken. Select another genome or provider.\n"
        f"Broken URL: {url}"
    )
def ping():
    """Can the provider be reached? True if check_url accepts its address."""
    address = "https://rest.ensembl.org/info/ping?"
    reachable = check_url(address)
    return bool(reachable)
def ping():
    """Can the provider be reached? True if check_url accepts its address."""
    address = "http://hgdownload.soe.ucsc.edu/goldenPath"
    reachable = check_url(address)
    return bool(reachable)
def ping():
    """Can the provider be reached? True if check_url accepts its address."""
    address = "https://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/"
    reachable = check_url(address)
    return bool(reachable)