Esempio n. 1
0
    def get_genome_download_link(self, name, mask="soft", **kwargs):
        """
        Look up the download link of the genome sequence.

        Parameters
        ----------
        name : str
            Genome name. Passed unchanged to ``self._ftp_or_html_link``.

        mask : str , optional
            Masking level. Options: soft, hard or none. Default is soft.
            Not used while building the link.

        Returns
        -------
        str
            the link returned by ``self._ftp_or_html_link`` for the
            ``_genomic.fna.gz`` file.

        Raises
        ------
        GenomeDownloadError
            if no link was returned.
        """
        # only soft masked genomes available. can be (un)masked in _post_process_download
        url = self._ftp_or_html_link(name, file_suffix="_genomic.fna.gz")
        if not url:
            raise GenomeDownloadError(
                f"Could not download genome {name} from {self.name}.\n"
                "URL is broken. Select another genome or provider.\n"
                f"Broken URL: {url}")
        return url
Esempio n. 2
0
    def get_annotation_download_link(self, name: str, **kwargs) -> str:
        """
        Select the annotation link to download.

        A link supplied through ``to_annotation`` takes precedence. Without
        one, the first entry of ``self.get_annotation_download_links(name)``
        is used.

        Parameters
        ----------
        name : str
            genome name
        **kwargs: dict, optional:
            to_annotation : direct URL to the gene annotation

        Returns
        -------
        str
            http/ftp link

        Raises
        ------
        TypeError
            if the supplied link does not have a supported extension
        GenomeDownloadError
            if no functional link was found
        """
        supported = [".gtf", ".gff", ".gff3", ".bed"]

        user_link = kwargs.get("to_annotation")
        if not user_link:
            # no direct link given: fall back to the links found for this genome
            found = self.get_annotation_download_links(name)
            if not found:
                raise GenomeDownloadError(
                    f"No gene annotations found for {get_localname(name)}.\n")
            return found[0]

        # the first element of get_file_info() is used as the file extension
        extension = get_file_info(user_link)[0]
        if extension in supported:
            return user_link
        raise TypeError(
            "Only (gzipped) gtf, gff and bed files are supported.\n")
Esempio n. 3
0
    def attempt_and_report(self, name, localname, link, genomes_dir):
        """
        Download the annotation behind ``link`` and store the URL in the README.

        Progress is reported on stderr. Without a link, a message is written
        and nothing is downloaded.

        Parameters
        ----------
        name : str
            genome name, used in the messages
        localname : str
            name of the genome directory inside ``genomes_dir``
        link : str
            annotation URL. If falsy, the download is skipped.
        genomes_dir : str
            directory containing the genome directories

        Raises
        ------
        GenomeDownloadError
            if ``self.download_and_generate_annotation`` raised any exception.
            The original exception is attached as the cause.
        """
        if not link:
            sys.stderr.write(
                f"Could not download genome annotation for {name} from {self.name}.\n"
            )
            return

        sys.stderr.write(
            f"Downloading annotation from {self.name}.\nTarget URL: {link}...\n"
        )
        try:
            self.download_and_generate_annotation(genomes_dir, link, localname)
        except Exception as err:
            # chain explicitly so the traceback shows what actually went wrong
            raise GenomeDownloadError(
                f"\nCould not download annotation for {name} from {self.name}\n"
                "If you think the annotation should be there, please file a bug report at:\n"
                "https://github.com/vanheeringen-lab/genomepy/issues\n") from err

        # TODO sanity check for genes
        sys.stderr.write("Annotation download successful\n")

        # Update readme annotation URL, or make a new
        readme = os.path.join(genomes_dir, localname, "README.txt")
        metadata, lines = read_readme(readme)
        metadata["annotation url"] = link
        write_readme(readme, metadata, lines)
Esempio n. 4
0
    def get_annotation_download_link(self, name: str, **kwargs) -> str:
        """
        Return the first annotation link found for a genome.

        Parameters
        ----------
        name : str
            genome name
        **kwargs: dict, optional:
            forwarded to ``self.annotation_links``

        Returns
        -------
        str
            http/ftp link

        Raises
        ------
        GenomeDownloadError
            if no functional link was found
        """
        candidates = self.annotation_links(name, **kwargs)
        if not candidates:
            raise GenomeDownloadError(
                f"No gene annotations found for {name} on {self.name}.\n"
                "Check for typos or try\n"
                f"  genomepy search {name} -p {self.name}")
        return candidates[0]
Esempio n. 5
0
    def get_genome_download_link(self, name, mask="soft", **kwargs):
        """
        Build the NCBI https link to the genome sequence.

        The link is derived from the genome's ``ftp_path``: the scheme is
        switched to https and ``<last path element>_genomic.fna.gz`` is
        appended.

        Parameters
        ----------
        name : str
            Genome name. Must be a key of ``self.genomes`` (after ``safe``).

        mask : str , optional
            Masking level. Options: soft, hard or none. Default is soft.
            Not used while building the link.

        Returns
        -------
        str
            the download link.

        Raises
        ------
        GenomeDownloadError
            if ``check_url`` rejects the link.
        """
        record = self.genomes[safe(name)]

        # only soft masked genomes available. can be (un)masked in _post_process_download
        base_url = record["ftp_path"].replace("ftp://", "https://")
        basename = base_url.split("/")[-1]
        link = f"{base_url}/{basename}_genomic.fna.gz"

        if not check_url(link, 2):
            raise GenomeDownloadError(
                f"Could not download genome {name} from {self.name}.\n"
                "URL is broken. Select another genome or provider.\n"
                f"Broken URL: {link}")
        return link
Esempio n. 6
0
def connect_ftp_link(link, timeout=None) -> Tuple[FTP, str]:
    """
    Anonymous login to ftp.

    Accepts link in the form of ftp://ftp.name.domain/...
    and ftp.name.domain/...

    Parameters
    ----------
    link : str
        FTP link
    timeout : int, optional
        timeout in seconds, passed on to ``ftplib.FTP``

    Returns
    -------
    tuple
        ftp: FTP
            object with connection established
        target: str
            target file: everything after the first "/" that follows the
            host. Empty if the link only contains a host.

    Raises
    ------
    GenomeDownloadError
        if the host name could not be resolved
    """
    link = link.replace("ftp://", "")
    # partition() also copes with a host-only link (no "/"), where unpacking
    # the result of split("/", 1) would raise a ValueError
    host, _, target = link.partition("/")
    try:
        ftp = FTP(host, timeout=timeout)
    except socket.gaierror as err:
        raise GenomeDownloadError(f"FTP host not found: {host}") from err

    try:
        ftp.login()
    except Exception:
        # do not leave the connection open when the login fails
        ftp.close()
        raise
    return ftp, target
Esempio n. 7
0
    def get_genome_download_link(self, name, mask="soft", **kwargs):
        """
        Return the first working UCSC link to the genome sequence.

        Two URL templates are tried in order (main, then alternative). The
        masked templates are used for ``mask="hard"``, the regular templates
        for any other value.

        Parameters
        ----------
        name : str
            Genome name, filled into the URL templates.

        mask : str , optional
            Masking level. Options: soft, hard or none. Default is soft.

        Returns
        -------
        str
            the first link accepted by ``check_url``.

        Raises
        ------
        GenomeDownloadError
            if none of the links is accepted.
        """
        # soft masked genomes. can be unmasked in _post_process_download
        if mask == "hard":
            templates = [self.ucsc_url_masked, self.alt_ucsc_url_masked]
        else:
            templates = [self.ucsc_url, self.alt_ucsc_url]

        # lazily format and test each template, stopping at the first hit
        candidates = (template.format(name) for template in templates)
        working = next((url for url in candidates if check_url(url, 2)), None)
        if working is not None:
            return working

        broken = ", ".join([template.format(name) for template in templates])
        raise GenomeDownloadError(
            f"Could not download genome {name} from {self.name}.\n"
            "URLs are broken. Select another genome or provider.\n"
            f"Broken URLs: {broken}")
Esempio n. 8
0
def _lazy_provider_selection(name, provider=None):
    """
    Return the first provider which has genome `name`.

    Providers are obtained from ``_providers(provider)`` and checked in
    order. Raises GenomeDownloadError if none of them lists the genome.
    """
    candidates = _providers(provider)
    for candidate in candidates:
        if name in candidate.genomes:
            return candidate

    tried = ", ".join([p.name for p in candidates])
    raise GenomeDownloadError(f"{name} not found on {tried}.")
Esempio n. 9
0
def _lazy_provider_selection(name, provider=None):
    """
    Return the first provider which has genome `name`.

    A provider matches if `name` is among its genomes. The provider named
    "URL" also matches if ``try_except_pass(ValueError, check_url, name)``
    returns a truthy value. Raises GenomeDownloadError if nothing matches.
    """
    candidates = _providers(provider)
    for candidate in candidates:
        if name in candidate.genomes:
            return candidate
        # only the URL provider gets the (more expensive) url check
        if candidate.name == "URL" and try_except_pass(
                ValueError, check_url, name):
            return candidate

    tried = ", ".join([p.name for p in candidates])
    raise GenomeDownloadError(f"{name} not found on {tried}.")
Esempio n. 10
0
    def _check_name(self, name):
        """
        Check if the genome name can be found for this provider.

        Returns the name after ``safe()`` was applied. Raises
        GenomeDownloadError if that name is not in ``self.genomes``.
        """
        name = safe(name)
        if name not in self.genomes:
            raise GenomeDownloadError(
                f"Could not download genome {name} from {self.name}.\n\n"
                "Check for typos or try\n"
                f"  genomepy search {name} -p {self.name}")
        return name
Esempio n. 11
0
def _lazy_provider_selection(name, provider=None):
    """
    Return the first provider which has genome `name`.

    Iterates over ``online_providers(provider)``. A provider matches if
    `name` is among its genomes, if it is the "URL" provider and the url
    check on `name` succeeds, or if it is the "Local" provider and `name`
    is an existing path. Raises GenomeDownloadError, listing the providers
    that were tried, if nothing matches.
    """
    tried = []
    for candidate in online_providers(provider):
        tried.append(candidate.name)
        # short-circuiting keeps the checks in order of increasing cost
        matches = (
            name in candidate.genomes
            or (candidate.name == "URL"
                and try_except_pass(ValueError, check_url, name))
            or (candidate.name == "Local"
                and os.path.exists(cleanpath(name)))
        )
        if matches:
            return candidate

    raise GenomeDownloadError(f"{name} not found on {', '.join(tried)}.")
Esempio n. 12
0
def connect_ftp_link(link, timeout=None):
    """
    Anonymous login to ftp.

    Accepts link in the form of ftp://ftp.name.domain/... and ftp.name.domain/...

    Parameters
    ----------
    link : str
        FTP link
    timeout : int, optional
        timeout in seconds, passed on to ``ftplib.FTP``

    Returns
    -------
    tuple
        ftp: FTP
            object with connection established
        target: str
            everything after the host, including the leading "/"

    Raises
    ------
    GenomeDownloadError
        if the host name could not be resolved
    """
    link = link.replace("ftp://", "")
    host = link.split("/")[0]
    # slice instead of link.split(host)[1]: splitting on the host truncates
    # the target when the host string occurs again further down the path
    target = link[len(host):]

    try:
        ftp = FTP(host, timeout=timeout)
    except socket.gaierror as err:
        raise GenomeDownloadError(f"FTP host not found: {host}") from err

    try:
        ftp.login()
    except Exception:
        # do not leave the connection open when the login fails
        ftp.close()
        raise
    return ftp, target
Esempio n. 13
0
    def download_annotation(self,
                            name,
                            genomes_dir=None,
                            localname=None,
                            **kwargs):
        """
        Download annotation file to a specific directory

        The assembly report is downloaded first (to
        ``<genomes_dir>/<localname>/assembly_report.txt``), then the
        annotation itself. Afterwards the annotation URL is written to the
        README.

        Parameters
        ----------
        name : str
            Genome / species name

        genomes_dir : str , optional
            Directory to install annotation

        localname : str , optional
            Custom name for your genome

        **kwargs : dict , optional
            forwarded to ``self.get_annotation_download_link``

        Raises
        ------
        GenomeDownloadError
            if anything fails while downloading the assembly report or the
            annotation. The original exception is attached as the cause.
        """
        name = self._check_name(name)
        link = self.get_annotation_download_link(name, **kwargs)

        localname = get_localname(name, localname)
        genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)

        logger.info(
            f"Downloading annotation from {self.name}. Target URL: {link}...")
        try:
            # download exact assembly report to rename the scaffolds
            acc = self.assembly_accession(name)
            fname = os.path.join(genomes_dir, localname, "assembly_report.txt")
            download_assembly_report(acc, fname)

            # NOTE: this resolves to the module-level download_annotation,
            # not to this method
            download_annotation(genomes_dir, link, localname)
            logger.info("Annotation download successful")
        except Exception as e:
            # e.args is empty for exceptions raised without a message;
            # indexing it blindly would raise an IndexError from this handler
            reason = e.args[0] if e.args else repr(e)
            raise GenomeDownloadError(
                f"An error occured while installing the gene annotation for {name} from {self.name}.\n"
                "If you think the annotation should be there, please file a bug report at: "
                "https://github.com/vanheeringen-lab/genomepy/issues\n\n"
                f"Error: {reason}") from e

        # Add annotation URL to readme
        readme = os.path.join(genomes_dir, localname, "README.txt")
        update_readme(readme, updated_metadata={"annotation url": link})
Esempio n. 14
0
    def get_annotation_download_link(self, name: str, **kwargs) -> str:
        """
        Return an available annotation type.

        Without ``ucsc_annotation_type`` the first entry of
        ``self.annotation_links(name)`` is returned. With it, the first entry
        that matches case-insensitively is returned.

        Parameters
        ----------
        name : str
            genome name
        **kwargs: dict, optional:
            ucsc_annotation_type : specific annotation type to download.

        Returns
        -------
        str
            the selected entry of ``self.annotation_links(name)``

        Raises
        ------
        GenomeDownloadError
            if no annotations were found at all
        FileNotFoundError
            if the specified annotation type is unavailable
        """
        options = self.annotation_links(name)
        if not options:
            raise GenomeDownloadError(
                f"No gene annotations found for {name} on {self.name}.\n"
                "Check for typos or try\n"
                f"  genomepy search {name} -p {self.name}")

        requested = kwargs.get("ucsc_annotation_type")
        if not requested:
            return options[0]

        # not all types are available for each genome
        wanted = requested.lower()
        matches = [option for option in options if option.lower() == wanted]
        if not matches:
            raise FileNotFoundError(
                f"{requested} is not available for {name}. "
                f"Options: {', '.join(options)}.\n")
        return matches[0]
Esempio n. 15
0
    def get_annotation_download_link(self, name: str, **kwargs) -> str:
        """
        Return a filepath to a matching annotation.

        A path supplied through ``path_to_annotation`` takes precedence. It
        must exist and have a supported extension. Without one, the first
        entry of ``self.get_annotation_download_links(name)`` is used.

        Parameters
        ----------
        name : str
            genome name
        **kwargs: dict, optional:
            path_to_annotation : direct path to the gene annotation

        Returns
        -------
        str
            path

        Raises
        ------
        FileNotFoundError
            if the supplied path does not exist
        TypeError
            if the supplied path does not have a supported extension
        GenomeDownloadError
            if no functional path was found
        """
        supported = [".gtf", ".gff", ".gff3", ".bed"]

        user_path = kwargs.get("path_to_annotation")
        if not user_path:
            # no direct path given: fall back to the paths found for this genome
            found = self.get_annotation_download_links(name)
            if not found:
                raise GenomeDownloadError(
                    f"No gene annotations found for {get_genomename(name)}.\n")
            return found[0]

        user_path = cleanpath(user_path)
        if not os.path.exists(user_path):
            raise FileNotFoundError(
                f"Local path to annotation does not exist: {user_path}")

        # the first element of get_file_info() is used as the file extension
        extension = get_file_info(user_path)[0]
        if extension in supported:
            return user_path
        raise TypeError(
            "Only (gzipped) gtf, gff and bed files are supported.\n")
Esempio n. 16
0
    def download_annotation(self,
                            name,
                            genomes_dir=None,
                            localname=None,
                            **kwargs):
        """
        Download the UCSC genePred via their MySQL database, and convert to annotations.

        Afterwards the database and table name are written to the README as
        the annotation url.

        Parameters
        ----------
        name : str
            Genome name

        genomes_dir : str , optional
            Directory to install annotation

        localname : str , optional
            Custom name for your genome

        **kwargs : dict , optional
            forwarded to ``self.get_annotation_download_link``

        Raises
        ------
        GenomeDownloadError
            if anything fails while downloading the annotation. The original
            exception is attached as the cause.
        """
        name = self._check_name(name)
        annot = self.get_annotation_download_link(name, **kwargs)

        localname = get_localname(name, localname)
        genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)

        logger.info(
            f"Downloading the {annot} annotation from the UCSC MySQL database."
        )
        try:
            # NOTE: this resolves to the module-level download_annotation,
            # not to this method
            download_annotation(name, annot, genomes_dir, localname)
            logger.info("Annotation download successful")
        except Exception as e:
            # e.args is empty for exceptions raised without a message;
            # indexing it blindly would raise an IndexError from this handler
            reason = e.args[0] if e.args else repr(e)
            raise GenomeDownloadError(
                f"An error occured while installing the gene annotation for {name} from {self.name}.\n"
                "If you think the annotation should be there, please file a bug report at: "
                "https://github.com/vanheeringen-lab/genomepy/issues\n\n"
                f"Error: {reason}") from e

        # Add annotation URL to readme
        readme = os.path.join(genomes_dir, localname, "README.txt")
        update_readme(
            readme,
            updated_metadata={
                "annotation url":
                f"UCSC MySQL database: {name}, table: {annot}"
            },
        )
Esempio n. 17
0
    def get_genome_download_link(self, name, mask="soft", **kwargs):
        """
        Return Ensembl ftp link to the genome sequence

        The primary assembly is preferred. The toplevel assembly is used if
        requested, or if the primary assembly link is not accepted by
        ``check_url``.

        Parameters
        ----------
        name : str
            Genome name. Must be a key of ``self.genomes`` (after ``safe``).

        mask : str , optional
            Masking level. Options: soft, hard or none. Default is soft.

        **kwargs : dict , optional
            version : Ensembl release version. Looked up with
            ``self.get_version`` if not given.
            toplevel : if truthy, skip the primary assembly.

        Returns
        -------
        str
            the download link.

        Raises
        ------
        NotImplementedError
            if the genome belongs to the bacteria division.
        GenomeDownloadError
            if the final link is not accepted by ``check_url``.
        """
        genome = self.genomes[safe(name)]

        # parse the division
        division = genome["division"].lower().replace("ensembl", "")
        if division == "bacteria":
            raise NotImplementedError(
                "bacteria from ensembl not yet supported")
        is_vertebrate = division == "vertebrates"

        # Ensembl release version
        version = kwargs.get("version")
        if version is None:
            version = self.get_version(self.rest_url, is_vertebrate)

        # division dependent url format: vertebrates live on their own
        # server, without a division directory
        species = genome["url_name"].lower()
        if is_vertebrate:
            directory = (
                f"ftp://ftp.ensembl.org/pub/release-{version}"
                f"/fasta/{species}/dna")
        else:
            directory = (
                f"ftp://ftp.ensemblgenomes.org/pub/{division}/release-{version}"
                f"/fasta/{species}/dna")

        # masking level
        mask_tag = {"soft": "dna_sm", "hard": "dna_rm", "none": "dna"}[mask]
        species_cap = genome["url_name"].capitalize()
        # the patch suffix (.p<number>) is dropped from the assembly name
        assembly = re.sub(r"\.p\d+$", "", safe(genome["assembly_name"]))
        file_stem = f"{directory}/{species_cap}.{assembly}.{mask_tag}"

        # try to get the (much smaller) primary assembly,
        # unless specified otherwise
        link = f"{file_stem}.primary_assembly.fa.gz"
        if kwargs.get("toplevel") or not check_url(link, 2):
            link = f"{file_stem}.toplevel.fa.gz"

        if not check_url(link, 2):
            raise GenomeDownloadError(
                f"Could not download genome {name} from {self.name}.\n"
                "URL is broken. Select another genome or provider.\n"
                f"Broken URL: {link}")
        return link
Esempio n. 18
0
    def get_genome_download_link(self, name, mask="soft", **kwargs):
        """
        Return http link to the genome sequence

        The primary assembly is tried first, unless ``toplevel`` is
        requested. If the primary assembly link is not accepted by
        ``check_url``, the toplevel link is tried.

        Parameters
        ----------
        name : str
            Genome name. Must be a key of ``self.genomes`` (after ``safe``).

        mask : str , optional
            Masking level. Options: soft, hard or none. Default is soft.

        **kwargs : dict , optional
            version : passed on to ``self.get_version``.
            toplevel : if truthy, only the toplevel assembly is tried.

        Returns
        -------
        str
            the http download link.

        Raises
        ------
        GenomeDownloadError
            if none of the tried links is accepted by ``check_url``.
        """
        genome = self.genomes[safe(name)]
        division, is_vertebrate = get_division(genome)

        # base directory of the genome
        host = ("http://ftp.ensembl.org"
                if is_vertebrate else "http://ftp.ensemblgenomes.org")
        version = self.get_version(is_vertebrate, kwargs.get("version"))
        division_dir = "" if is_vertebrate else f"/{division}"
        fasta_root = f"{host}/pub/release-{version}{division_dir}/fasta"

        species = genome["url_name"].lower()
        directory = f"{fasta_root}/{species}/dna"
        # some entries don't use url_name in their url... -,-
        # examples:
        #   - EnsemblVertebrates: mus_musculus_nzohlltj
        #   - EnsemblMetazoa: caenorhabditis_elegans
        if not check_url(directory, 2):
            species = genome["name"]
            directory = f"{fasta_root}/{species}/dna"

        # this assembly has its own directory
        if name == "GRCh37":
            directory = genome["genome"].format(version)

        # specific fasta file
        species_cap = species.capitalize()
        # the patch suffix (.p<number>) is dropped from the assembly name
        assembly = re.sub(r"\.p\d+$", "", safe(genome["assembly_name"]))
        mask_suffixes = {"soft": "_sm", "hard": "_rm", "none": ""}
        mask_suffix = mask_suffixes[mask]
        if kwargs.get("toplevel"):
            level = "toplevel"
        else:
            level = "primary_assembly"
        # releases up to and including 30 carry the version in the file name
        version_tag = f".{version}" if int(version) <= 30 else ""

        filename = (
            f"{species_cap}.{assembly}{version_tag}"
            f".dna{mask_suffix}.{level}.fa.gz")

        # combine
        link = f"{directory}/{filename}"
        if check_url(link, 2):
            return link

        # primary assemblies do not always exist
        if level == "primary_assembly":
            link = link.replace("primary_assembly", "toplevel")
            if check_url(link, 2):
                return link

        raise GenomeDownloadError(
            f"Could not download genome {name} from {self.name}.\n"
            "URL is broken. Select another genome or provider.\n"
            f"Broken URL: {link}")