Esempio n. 1
0
    def attempt_and_report(self, name, localname, link, genomes_dir):
        """
        Download the annotation behind a link and record the link in the README.

        Progress and failures are reported on stderr.

        Parameters
        ----------
        name : str
            Genome name, only used in the reported messages

        localname : str
            Name of the genome's subdirectory inside genomes_dir

        link : str
            Annotation URL. If falsy, a message is written to stderr and
            the method returns without downloading anything.

        genomes_dir : str
            Directory that contains the genome directories

        Raises
        ------
        GenomeDownloadError
            If self.download_and_generate_annotation raises any Exception.
            The original exception is attached as the cause.
        """
        if not link:
            sys.stderr.write(
                f"Could not download genome annotation for {name} from {self.name}.\n"
            )
            return

        sys.stderr.write(
            f"Downloading annotation from {self.name}.\nTarget URL: {link}...\n"
        )
        try:
            self.download_and_generate_annotation(genomes_dir, link, localname)
        except Exception as err:
            # chain explicitly so the underlying failure stays visible
            # as the direct cause in the traceback
            raise GenomeDownloadError(
                f"\nCould not download annotation for {name} from {self.name}\n"
                "If you think the annotation should be there, please file a bug report at:\n"
                "https://github.com/vanheeringen-lab/genomepy/issues\n"
            ) from err

        # TODO sanity check for genes
        sys.stderr.write("Annotation download successful\n")

        # Update the annotation URL in the genome's readme
        readme = os.path.join(genomes_dir, localname, "README.txt")
        metadata, lines = read_readme(readme)
        metadata["annotation url"] = link
        write_readme(readme, metadata, lines)
Esempio n. 2
0
    def _read_metadata(self):
        """
        Load the metadata stored in the genome's README file.

        The file at self.readme_file is parsed with read_readme. If the
        provider is missing or "na", or if either "tax_id" or
        "assembly_accession" is absent, and the README file is writable,
        the metadata is handed to self._update_metadata and the README is
        written back.

        Returns
        -------
        The metadata as returned by read_readme.
        """
        metadata, lines = read_readme(self.readme_file)

        # any of these means the README lacks information worth filling in
        incomplete = (
            metadata.get("provider", "na") == "na"
            or "tax_id" not in metadata
            or "assembly_accession" not in metadata
        )

        # only touch the file when there is something to add and we may write
        if incomplete and os.access(self.readme_file, os.W_OK):
            self._update_metadata(metadata)
            write_readme(self.readme_file, metadata, lines)

        return metadata
Esempio n. 3
0
    def download_genome(
        self,
        name,
        genomes_dir=None,
        localname=None,
        mask="soft",
        keep_alt=False,
        regex=None,
        invert_match=False,
        bgzip=None,
        **kwargs,
    ):
        """
        Download a (gzipped) genome file to a specific directory

        The genome is downloaded into a temporary directory inside the
        genome's output directory, unpacked, optionally filtered,
        post-processed and bgzipped, and then moved into place. A README.txt
        with the metadata is written next to it.

        Parameters
        ----------
        name : str
            Genome / species name

        genomes_dir : str , optional
            Directory to install genome

        localname : str , optional
            Custom name for your genome

        mask: str , optional
            Masking, soft, hard or none (all other strings)

        keep_alt : bool , optional
            Set to true to keep these alternative regions. If False, the
            genome is filtered with the regex "alt" (inverted match).

        regex : str , optional
            Regular expression to select specific chromosome / scaffold names.

        invert_match : bool , optional
            Set to True to select all chromosomes that don't match the regex.

        bgzip : bool , optional
            If set to True the genome FASTA file will be compressed using bgzip.
            For any falsy value (None, but also False) the "bgzip" setting
            from the configuration is used.

        **kwargs
            Passed on to self.get_genome_download_link.

        Raises
        ------
        subprocess.CalledProcessError
            If gunzip or bgzip exits with a non-zero status.
        """
        name = safe(name)
        self.check_name(name)

        link = self.get_genome_download_link(name, mask=mask, **kwargs)

        localname = get_localname(name, localname)
        genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)
        out_dir = os.path.join(genomes_dir, localname)
        if not os.path.exists(out_dir):
            mkdir_p(out_dir)

        sys.stderr.write(
            f"Downloading genome from {self.name}.\nTarget URL: {link}...\n")

        # download to tmp dir. Move genome on completion.
        # tmp dir is in genome_dir to prevent moving the genome between disks
        with TemporaryDirectory(dir=out_dir) as tmp_dir:
            fname = os.path.join(tmp_dir, f"{localname}.fa")

            # actual download
            urlcleanup()
            with urlopen(link) as response:
                # the header lookup yields None when the server does not
                # report a size
                content_length = response.info()["Content-Length"]
                with open(fname, "wb") as f_out:
                    if content_length is None:
                        # unknown size: stream with the default buffer size
                        shutil.copyfileobj(response, f_out)
                    else:
                        # check available memory vs file size.
                        available_memory = int(virtual_memory().available)
                        # download file in chunks if >75% of memory would be used
                        cutoff = int(available_memory * 0.75)
                        chunk_size = (
                            None if int(content_length) < cutoff else cutoff)
                        shutil.copyfileobj(response, f_out, chunk_size)
            sys.stderr.write(
                "Genome download successful, starting post processing...\n")

            # unzip genome
            if link.endswith(".tar.gz"):
                tar_to_bigfile(fname, fname)
            elif link.endswith(".gz"):
                os.rename(fname, fname + ".gz")
                # check_call raises CalledProcessError on a non-zero exit,
                # so no return code check is needed
                sp.check_call(["gunzip", "-f", fname])

            def regex_filter(_fname, _regex, _v):
                """Filter _fname in place; return the names that were dropped."""
                infa = _fname + "_to_regex"
                outfa = _fname
                os.rename(_fname, infa)
                filter_fasta(infa, outfa, regex=_regex, v=_v, force=True)

                names_in = Fasta(infa).keys()
                # open the filtered fasta once instead of once per sequence
                names_out = set(Fasta(outfa).keys())
                return [k for k in names_in if k not in names_out]

            not_included = []
            # remove alternative regions
            if not keep_alt:
                not_included.extend(regex_filter(fname, "alt", True))

            # keep/remove user defined regions
            if regex:
                not_included.extend(regex_filter(fname, regex, invert_match))

            # process genome (e.g. masking)
            if hasattr(self, "_post_process_download"):
                self._post_process_download(name=name,
                                            localname=localname,
                                            out_dir=tmp_dir,
                                            mask=mask)

            # bgzip genome if requested
            if bgzip or config.get("bgzip"):
                # raises CalledProcessError on a non-zero exit
                sp.check_call(["bgzip", "-f", fname])
                fname += ".gz"

            # transfer the genome from the tmpdir to the genome_dir
            src = fname
            dst = os.path.join(genomes_dir, localname, os.path.basename(fname))
            shutil.move(src, dst)

        sys.stderr.write("\n")
        sys.stderr.write("name: {}\n".format(name))
        sys.stderr.write("local name: {}\n".format(localname))
        sys.stderr.write("fasta: {}\n".format(dst))

        # Create readme with information
        readme = os.path.join(genomes_dir, localname, "README.txt")
        metadata = {
            "name": localname,
            "provider": self.name,
            "original name": name,
            "original filename": os.path.split(link)[-1],
            "assembly_accession":
            self.assembly_accession(self.genomes.get(name)),
            "tax_id": self.genome_taxid(self.genomes.get(name)),
            "mask": mask,
            "genome url": link,
            "annotation url": "na",
            "date": time.strftime("%Y-%m-%d %H:%M:%S"),
        }

        # report every filter that was applied, not only the user regex:
        # not_included also holds the sequences removed by the "alt" filter
        lines = []
        if not keep_alt or regex:
            applied = []
            if not keep_alt:
                applied.append("'alt' (inverted match)")
            if regex:
                user_filter = f"'{regex}'"
                if invert_match:
                    user_filter += " (inverted match)"
                applied.append(user_filter)

            lines += [
                "",
                "regex: " + " and ".join(applied),
                "sequences that were excluded:",
            ]
            lines.extend(f"\t{seq}" for seq in not_included)
        write_readme(readme, metadata, lines)
Esempio n. 4
0
    def download_genome(
        self,
        name,
        genomes_dir=None,
        localname=None,
        mask="soft",
        keep_alt=False,
        regex=None,
        invert_match=False,
        bgzip=None,
        **kwargs,
    ):
        """
        Download a (gzipped) genome file to a specific directory

        The genome is downloaded into a temporary directory inside the
        genome's output directory, unpacked, optionally filtered,
        post-processed and bgzipped, and then moved into place. The temporary
        directory is removed afterwards, also when a step fails. A README.txt
        with the metadata is written next to the genome.

        Parameters
        ----------
        name : str
            Genome / species name

        genomes_dir : str , optional
            Directory to install genome

        localname : str , optional
            Custom name for your genome

        mask: str , optional
            Masking, soft, hard or none (all other strings)

        keep_alt : bool , optional
            Set to true to keep these alternative regions. If False, the
            genome is filtered with the regex "alt" (inverted match).

        regex : str , optional
            Regular expression to select specific chromosome / scaffold names.

        invert_match : bool , optional
            Set to True to select all chromosomes that don't match the regex.

        bgzip : bool , optional
            If set to True the genome FASTA file will be compressed using bgzip.
            For any falsy value (None, but also False) the "bgzip" setting
            from the configuration is used.

        **kwargs
            Passed on to self.get_genome_download_link.

        Raises
        ------
        subprocess.CalledProcessError
            If gunzip or the bgzip shell pipeline exits with a non-zero status.
        """
        # local import: only needed to quote paths for the shell pipeline
        import shlex

        name = safe(name)
        self.check_name(name)

        link = self.get_genome_download_link(name, mask=mask, **kwargs)

        localname = get_localname(name, localname)
        genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)
        out_dir = os.path.join(genomes_dir, localname)
        if not os.path.exists(out_dir):
            mkdir_p(out_dir)

        sys.stderr.write(
            f"Downloading genome from {self.name}.\nTarget URL: {link}...\n")

        # download to tmp dir. Move genome on completion.
        # tmp dir is in genome_dir to prevent moving the genome between disks
        tmp_dir = mkdtemp(dir=out_dir)
        try:
            fname = os.path.join(tmp_dir, f"{localname}.fa")

            urlcleanup()
            download_file(link, fname)
            sys.stderr.write(
                "Genome download successful, starting post processing...\n")

            # unzip genome
            if link.endswith(".tar.gz"):
                tar_to_bigfile(fname, fname)
            elif link.endswith(".gz"):
                os.rename(fname, fname + ".gz")
                # check_call raises CalledProcessError on a non-zero exit,
                # so no return code check is needed
                sp.check_call(["gunzip", "-f", fname])

            def regex_filer(_fname, _regex, _v):
                """Filter _fname in place; return the names that were dropped."""
                infa = _fname + "_to_regex"
                os.rename(_fname, infa)
                # filter the fasta and store the output's keys
                keys_out = filter_fasta(infa,
                                        outfa=_fname,
                                        regex=_regex,
                                        v=_v,
                                        force=True).keys()
                keys_in = Fasta(infa).keys()
                return [k for k in keys_in if k not in keys_out]

            not_included = []
            # remove alternative regions
            if not keep_alt:
                not_included.extend(regex_filer(fname, "alt", True))

            # keep/remove user defined regions
            if regex:
                not_included.extend(regex_filer(fname, regex, invert_match))

            # process genome (e.g. masking)
            if hasattr(self, "_post_process_download"):
                self._post_process_download(name=name,
                                            localname=localname,
                                            out_dir=tmp_dir,
                                            mask=mask)

            # bgzip genome if requested
            if bgzip or config.get("bgzip"):
                # bgzip to stdout, track progress, and output to file
                fsize = int(os.path.getsize(fname) * 10**-6)
                gz_name = fname + ".gz"
                # paths are quoted: they contain the user supplied
                # genomes_dir and localname and are interpreted by the shell
                cmd = (
                    f"bgzip -fc {shlex.quote(fname)} | "
                    f"tqdm --bytes --desc Bgzipping {fsize}MB fasta --log ERROR | "
                    f"cat > {shlex.quote(gz_name)}")
                # NOTE(review): the status of a shell pipeline is normally
                # that of its last command, so a failing bgzip may go
                # unnoticed here -- confirm whether that is acceptable
                sp.check_call(cmd, shell=True)
                fname = gz_name

            # transfer the genome from the tmpdir to the genome_dir
            src = fname
            dst = os.path.join(genomes_dir, localname, os.path.basename(fname))
            shutil.move(src, dst)
        finally:
            # always clean up, so a failed install leaves no tmp dir behind
            rm_rf(tmp_dir)

        sys.stderr.write("\n")
        sys.stderr.write("name: {}\n".format(name))
        sys.stderr.write("local name: {}\n".format(localname))
        sys.stderr.write("fasta: {}\n".format(dst))

        # Create readme with information
        readme = os.path.join(genomes_dir, localname, "README.txt")
        metadata = {
            "name": localname,
            "provider": self.name,
            "original name": name,
            "original filename": os.path.split(link)[-1],
            "assembly_accession":
            self.assembly_accession(self.genomes.get(name)),
            "tax_id": self.genome_taxid(self.genomes.get(name)),
            "mask": mask,
            "genome url": link,
            "annotation url": "na",
            "date": time.strftime("%Y-%m-%d %H:%M:%S"),
        }

        # describe the applied filters and list the sequences they removed
        lines = []
        if not keep_alt or regex:
            regex_line = "regex: "
            if not keep_alt:
                regex_line += "'alt' (inverted match)"

            if not keep_alt and regex:
                regex_line += " and "

            if regex:
                regex_line += f"'{regex}'"
                if invert_match:
                    regex_line += " (inverted match)"

            lines += ["", regex_line, "sequences that were excluded:"]
            for seq in not_included:
                lines.append(f"\t{seq}")
        write_readme(readme, metadata, lines)