Esempio n. 1
0
    def get_genome_download_link(self, name, mask="soft", **kwargs):
        """
        Build and verify the NCBI link to a genome's sequence file.

        Parameters
        ----------
        name : str
            Genome name. It is passed through ``safe`` and looked up in
            ``self.genomes``; the lookup fails if the exact name is absent.

        mask : str , optional
            Masking level. Accepted for interface compatibility; it is not
            used when building the link.

        Returns
        -------
        str
            The https link to the ``*_genomic.fna.gz`` file.

        Raises
        ------
        GenomeDownloadError
            If the constructed link does not pass ``check_url``.
        """
        # only soft masked genomes are available here; (un)masking is left
        # to the post-processing step, so ``mask`` is not consulted
        ftp_path = self.genomes[safe(name)]["ftp_path"]
        base = ftp_path.replace("ftp://", "https://")

        # the file is named after the last component of its directory
        link = base + "/" + base.split("/")[-1] + "_genomic.fna.gz"

        if not check_url(link, 2):
            raise GenomeDownloadError(
                f"Could not download genome {name} from {self.name}.\n"
                "URL is broken. Select another genome or provider.\n"
                f"Broken URL: {link}")
        return link
Esempio n. 2
0
    def get_genome_download_link(self, name, mask="soft", **kwargs):
        """
        Return the first working UCSC link to the genome sequence.

        Parameters
        ----------
        name : str
            Genome name. It is substituted into the provider's URL
            templates via ``str.format``.

        mask : str , optional
            Masking level. ``"hard"`` selects the masked URL templates;
            any other value selects the regular ones. Default is soft.

        Returns
        -------
        str
            The first candidate link that passes ``check_url``.

        Raises
        ------
        GenomeDownloadError
            If none of the candidate links passes ``check_url``.
        """
        # the regular templates point at soft masked genomes, which can be
        # unmasked during post-processing
        if mask == "hard":
            templates = [self.ucsc_url_masked, self.alt_ucsc_url_masked]
        else:
            templates = [self.ucsc_url, self.alt_ucsc_url]

        # primary template first, alternative second
        for template in templates:
            candidate = template.format(name)
            if check_url(candidate, 2):
                return candidate

        tried = ", ".join(template.format(name) for template in templates)
        raise GenomeDownloadError(
            f"Could not download genome {name} from {self.name}.\n"
            "URLs are broken. Select another genome or provider.\n"
            f"Broken URLs: {tried}")
Esempio n. 3
0
    def get_annotation_download_link(self, name, **kwargs):
        """
        Build and test the link to a UCSC annotation file.

        Candidate annotation types are tried in the order UCSC, Ensembl,
        NCBI RefSeq and UCSC RefSeq.
        More info on the annotation files:
        https://genome.ucsc.edu/FAQ/FAQgenes.html#whatdo

        Parameters
        ----------
        name : str
            Genome name
        kwargs : dict , optional
            ucsc_annotation_type : str , optional
                One of ``ucsc``, ``ensembl``, ``ncbi_refseq`` or
                ``ucsc_refseq`` (case-insensitive). When given, only that
                type is tried; an unknown value raises ``KeyError``.

        Returns
        -------
        str or None
            The first link that passes ``check_url``, or None when no
            candidate does.
        """
        gtf_dir = f"http://hgdownload.soe.ucsc.edu/goldenPath/{name}/bigZips/genes/"
        txt_dir = f"http://hgdownload.cse.ucsc.edu/goldenPath/{name}/database/"
        # insertion order doubles as the search priority
        annot_files = {
            "ucsc": "knownGene",
            "ensembl": "ensGene",
            "ncbi_refseq": "ncbiRefSeq",
            "ucsc_refseq": "refGene",
        }

        # prefer the gtf directory; fall back to the txt tables otherwise
        if check_url(gtf_dir, 2):
            prefix, suffix = gtf_dir + name + ".", ".gtf.gz"
        else:
            prefix, suffix = txt_dir, ".txt.gz"

        requested = kwargs.get("ucsc_annotation_type")
        if not requested:
            # no preference given: take the first annotation type that exists
            for table in annot_files.values():
                candidate = prefix + table + suffix
                if check_url(candidate, 2):
                    return candidate
            return None

        # a specific annotation type was requested: try only that one
        candidate = prefix + annot_files[requested.lower()] + suffix
        if check_url(candidate, 2):
            return candidate
        sys.stderr.write(
            f"Specified annotation type ({requested}) not found for {name}.\n")
        return None
Esempio n. 4
0
    def get_annotation_download_link(self, name, **kwargs):
        """
        Validate a user-supplied annotation link.

        Parameters
        ----------
        name : str
            Genome name (not used for the validation itself).
        kwargs : dict , optional
            to_annotation : str , optional
                Link to the annotation file.

        Returns
        -------
        str or None
            The link when it has a supported extension (gtf/gff/gff3/bed)
            and passes ``check_url``; None when no link was supplied or
            the check fails.

        Raises
        ------
        TypeError
            If the extension reported by ``get_file_info`` is unsupported.
        """
        link = kwargs.get("to_annotation")
        if not link:
            # nothing to validate
            return None

        ext = get_file_info(link)[0]
        if ext not in (".gtf", ".gff", ".gff3", ".bed"):
            raise TypeError(
                "Only (gzipped) gtf, gff and bed files are supported.\n")

        return link if check_url(link) else None
Esempio n. 5
0
    def _ftp_or_html_link(self, name, file_suffix, skip_check=False):
        """
        Return the first usable link to one of a genome's NCBI files.

        NCBI's files are accessible over FTP and HTTPS; the HTTPS variant
        is tried first.

        Parameters
        ----------
        name : str
            Genome name, looked up (via ``safe``) in ``self.genomes``.
        file_suffix : str
            Suffix appended to the directory's last path component to
            form the file name.
        skip_check : bool , optional
            When true, return the first (HTTPS) link without testing it.

        Returns
        -------
        str or None
            The first link accepted, or None if neither passes
            ``check_url``.
        """
        ftp_dir = self.genomes[safe(name)]["ftp_path"]
        https_dir = ftp_dir.replace("ftp://", "https://")

        for base in (https_dir, ftp_dir):
            # the file is named after the last component of its directory
            candidate = base + "/" + base.split("/")[-1] + file_suffix
            if skip_check:
                return candidate
            if check_url(candidate, max_tries=2, timeout=10):
                return candidate
        return None
Esempio n. 6
0
    def get_annotation_download_link(self, name, **kwargs):
        """
        Build and test the link to the NCBI annotation file.

        Parameters
        ----------
        name : str
            Genome name, looked up (via ``safe``) in ``self.genomes``.

        Returns
        -------
        str or None
            The https link to the ``*_genomic.gff.gz`` file when it passes
            ``check_url``, otherwise None.
        """
        ftp_path = self.genomes[safe(name)]["ftp_path"]
        base = ftp_path.replace("ftp://", "https://")

        # the file is named after the last component of its directory
        link = base + "/" + base.split("/")[-1] + "_genomic.gff.gz"

        return link if check_url(link, 2) else None
Esempio n. 7
0
    def get_annotation_download_link(self, name, **kwargs):
        """
        Build and test the link to the Ensembl GTF annotation file.

        Parameters
        ----------
        name : str
            Genome name. A trailing patch suffix (``.p<digits>``) is
            stripped before it is used in the file name.
        kwargs: dict , optional:
            Provider specific options.

            version : int , optional
                Ensembl release version. When absent, it is obtained from
                ``self.get_version``.

        Returns
        -------
        str or None
            The link when it passes ``check_url``, otherwise None.
        """
        genome = self.genomes[safe(name)]
        division = genome["division"].lower().replace("ensembl", "")
        is_vertebrate = division == "vertebrates"

        # Ensembl release version
        version = kwargs.get("version")
        if version is None:
            version = self.get_version(self.rest_url, is_vertebrate)

        # vertebrates live on the main site; every other division has its
        # own subdirectory on the ensemblgenomes site
        if is_vertebrate:
            ftp_site = "ftp://ftp.ensembl.org/pub"
        else:
            ftp_site = f"ftp://ftp.ensemblgenomes.org/pub/{division}"

        # assemble the GTF URL
        assembly = re.sub(r"\.p\d+$", "", name)
        species = genome["url_name"]
        link = (
            f"{ftp_site}/release-{version}/gtf/{species.lower()}/"
            f"{species.capitalize()}.{assembly}.{version}.gtf.gz"
        )

        return link if check_url(link, 2) else None
Esempio n. 8
0
    def search_url_for_annotation(url):
        """
        Attempt to find a gtf or gff3 file in the same location as the genome url.

        The directory containing ``url`` is opened and scanned line by line
        for a file name ending in ``.gtf(.gz)`` or in the genome's local
        name followed by ``.gff(.gz)`` / ``.gff3(.gz)``. Matching is
        case-insensitive.

        NOTE(review): there is no ``self`` parameter; presumably this is a
        staticmethod whose decorator is not visible here - confirm.

        Parameters
        ----------
        url : str
            Link to the genome file.

        Returns
        -------
        str or None
            Link to the annotation file when it passes ``check_url``,
            otherwise None.

        Raises
        ------
        FileNotFoundError
            If no annotation file name can be parsed from the remote
            directory listing (including when the listing is empty).
        """
        urldir = os.path.dirname(url)
        sys.stderr.write(
            "You have requested gene annotation to be downloaded.\n"
            "Genomepy will check the remote directory:\n"
            f"{urldir} for annotation files...\n")

        # All comparisons below run against lowercased text, so lowercase
        # the genome name too; otherwise a mixed-case name never matches.
        name = get_localname(url).lower()

        # try to find a GTF or GFF3 file
        # Start from an empty string so that an empty listing falls through
        # to the FileNotFoundError below instead of leaving urlstr unbound.
        urlstr = ""
        with urlopen(urldir) as f:
            for urlline in f.readlines():
                urlstr = str(urlline)
                lowered = urlstr.lower()
                if ".gtf" in lowered or name + ".gff" in lowered:
                    break

        # retrieve the filename from the HTML line
        # (if no line matched, the last line read is inspected)
        suffixes = (
            ".gtf",
            ".gtf.gz",
            name + ".gff",
            name + ".gff.gz",
            name + ".gff3",
            name + ".gff3.gz",
        )
        for split in re.split(r'[<>/"]', urlstr):
            if split.lower().endswith(suffixes):
                fname = split
                break
        else:
            raise FileNotFoundError(
                "Could not parse the remote directory. "
                "Please supply a URL using --url-to-annotation.\n")

        # set variables for downloading
        link = urldir + "/" + fname

        if check_url(link):
            return link
        return None
Esempio n. 9
0
    def get_genome_download_link(self, name, mask="soft", **kwargs):
        """
        Return the Ensembl ftp link to the genome sequence.

        The primary assembly is preferred; the toplevel assembly is used
        when requested or when the primary assembly link does not pass
        ``check_url``.

        Parameters
        ----------
        name : str
            Genome name. It is passed through ``safe`` and looked up in
            ``self.genomes``; the lookup fails if the exact name is absent.

        mask : str , optional
            Masking level. Options: soft, hard or none. Default is soft.

        kwargs : dict , optional
            version : int , optional
                Ensembl release version. When absent, it is obtained from
                ``self.get_version``.
            toplevel : bool , optional
                When truthy, skip the primary assembly and use toplevel.

        Returns
        -------
        str
            The ftp download link.

        Raises
        ------
        NotImplementedError
            If the genome belongs to the bacteria division.
        GenomeDownloadError
            If the final link does not pass ``check_url``.
        """
        genome = self.genomes[safe(name)]

        # parse the division
        division = genome["division"].lower().replace("ensembl", "")
        if division == "bacteria":
            raise NotImplementedError(
                "bacteria from ensembl not yet supported")
        is_vertebrate = division == "vertebrates"

        # Ensembl release version
        version = kwargs.get("version")
        if version is None:
            version = self.get_version(self.rest_url, is_vertebrate)

        # vertebrates live on the main site without a division directory;
        # every other division has its own directory on ensemblgenomes
        species = genome["url_name"]
        if is_vertebrate:
            site_root = "ftp://ftp.ensembl.org/pub"
        else:
            site_root = f"ftp://ftp.ensemblgenomes.org/pub/{division}"
        fasta_dir = f"{site_root}/release-{version}/fasta/{species.lower()}/dna"

        mask_tags = {"soft": "dna_sm", "hard": "dna_rm", "none": "dna"}

        def build_link(level):
            """Return the fasta link for the given assembly level."""
            tag = mask_tags[mask]
            # drop a trailing patch suffix (.p<digits>) from the assembly name
            assembly = re.sub(r"\.p\d+$", "", safe(genome["assembly_name"]))
            return (
                f"{fasta_dir}/{species.capitalize()}.{assembly}"
                f".{tag}.{level}.fa.gz"
            )

        # try to get the (much smaller) primary assembly,
        # unless specified otherwise
        link = build_link("primary_assembly")
        use_toplevel = kwargs.get("toplevel") or not check_url(link, 2)
        if use_toplevel:
            link = build_link("toplevel")

        if not check_url(link, 2):
            raise GenomeDownloadError(
                f"Could not download genome {name} from {self.name}.\n"
                "URL is broken. Select another genome or provider.\n"
                f"Broken URL: {link}")
        return link
Esempio n. 10
0
 def provider_status(self, url, max_tries=1):
     """
     Check whether the provider is online.

     Raises ConnectionError when ``check_url(url, max_tries)`` is falsy;
     returns None otherwise.

     NOTE(review): results are reportedly stored for 10 minutes, but no
     caching is visible in this method - presumably a decorator outside
     this view handles it; confirm.
     """
     if check_url(url, max_tries):
         return
     raise ConnectionError(f"{self.name} appears to be offline.\n")