Esempio n. 1
0
            def regex_filer(_fname, _regex, _v):
                """
                Filter a FASTA file in place and report which sequences were removed.

                The file at ``_fname`` is renamed to ``_fname + "_to_regex"`` and the
                filtered result is written back to ``_fname``.

                Parameters
                ----------
                _fname : str
                    Path of the FASTA file to filter.

                _regex : str
                    Passed to filter_fasta as ``regex``.

                _v : bool
                    Passed to filter_fasta as ``v``.

                Returns
                -------
                list
                    Keys found in the unfiltered FASTA but not in the filtered one,
                    in the order yielded by ``Fasta(infa).keys()``.
                """
                infa = _fname + "_to_regex"
                outfa = _fname
                os.rename(_fname, infa)
                filter_fasta(infa, outfa, regex=_regex, v=_v, force=True)

                keys_in = Fasta(infa).keys()
                # Open the filtered FASTA once instead of once per key.
                kept = set(Fasta(outfa).keys())
                return [k for k in keys_in if k not in kept]
Esempio n. 2
0
 def regex_filer(_fname, _regex, _v):
     """
     Filter the FASTA at ``_fname`` in place and list the sequences dropped.

     The input is first moved to ``_fname + "_to_regex"``; filter_fasta then
     writes its output to ``_fname``. Returns the keys of the moved input
     that are absent from the object returned by filter_fasta.
     """
     unfiltered = _fname + "_to_regex"
     os.rename(_fname, unfiltered)
     # run the filter first and remember which keys survived
     filtered = filter_fasta(
         unfiltered, outfa=_fname, regex=_regex, v=_v, force=True
     )
     surviving = filtered.keys()
     dropped = []
     for key in Fasta(unfiltered).keys():
         if key not in surviving:
             dropped.append(key)
     return dropped
Esempio n. 3
0
    def download_genome(
        self,
        name,
        genome_dir,
        localname=None,
        mask="soft",
        regex=None,
        invert_match=False,
        bgzip=None,
        **kwargs
    ):
        """
        Download a (gzipped) genome file to a specific directory

        The genome is downloaded and processed in a temporary directory inside
        the target directory, moved into place, and a README.txt describing
        the download is written next to it.

        Parameters
        ----------
        name : str
            Genome / species name

        genome_dir : str
            Directory to install genome

        localname : str , optional
            Custom name for your genome

        mask: str , optional
            Masking, soft, hard or none (all other strings)

        regex : str , optional
            Regular expression to select specific chromosome / scaffold names.

        invert_match : bool , optional
            Set to True to select all chromosomes that don't match the regex.

        bgzip : bool , optional
            If set to True the genome FASTA file will be compressed using bgzip.
            If not specified, the setting from the configuration file will be used.

        **kwargs
            Passed on to get_genome_download_link.

        Raises
        ------
        Whatever ``sp.check_call`` raises when gunzip or bgzip exits with a
        non-zero status or cannot be started.
        """
        genome_dir = os.path.expanduser(genome_dir)
        if not os.path.exists(genome_dir):
            os.makedirs(genome_dir)

        dbname, link = self.get_genome_download_link(name, mask=mask, **kwargs)
        myname = get_localname(dbname, localname)
        out_dir = os.path.join(genome_dir, myname)
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        sys.stderr.write("Downloading genome from {}...\n".format(link))

        # only filled when a regex filter is applied
        not_included = []

        # download to tmp dir. Move genome on completion.
        # tmp dir is in genome_dir to prevent moving the genome between disks
        with TemporaryDirectory(dir=out_dir) as tmpdir:
            fname = os.path.join(tmpdir, myname + ".fa")

            # actual download
            urlcleanup()
            with urlopen(link) as response:
                # check available memory vs file size.
                available_memory = int(virtual_memory().available)
                # download file in chunks if >75% of memory would be used
                cutoff = int(available_memory * 0.75)
                # The header may be absent; an unknown size is treated like a
                # large file and downloaded in chunks.
                content_length = response.info().get("Content-Length")
                fits_in_memory = (
                    content_length is not None and int(content_length) < cutoff
                )
                chunk_size = None if fits_in_memory else cutoff
                with open(fname, "wb") as f_out:
                    shutil.copyfileobj(response, f_out, chunk_size)

            # unzip genome
            if link.endswith("tar.gz"):
                self.tar_to_bigfile(fname, fname)
            elif link.endswith(".gz"):
                # gunzip will only work with files ending with ".gz"
                os.rename(fname, fname + ".gz")
                # check_call raises on a non-zero exit status, so no separate
                # return-code check is needed
                sp.check_call(["gunzip", "-f", fname])

            # process genome (e.g. masking)
            if hasattr(self, "_post_process_download"):
                self._post_process_download(name, localname, tmpdir, mask)

            if regex:
                infa = fname + "_to_regex"
                outfa = fname
                os.rename(fname, infa)
                filter_fasta(infa, outfa, regex=regex, v=invert_match, force=True)

                keys_in = Fasta(infa).keys()
                # open the filtered FASTA once instead of once per key
                kept = set(Fasta(outfa).keys())
                not_included = [k for k in keys_in if k not in kept]

            # bgzip genome if requested
            if bgzip is None:
                bgzip = config.get("bgzip", False)

            if bgzip:
                # check_call raises if bgzip fails (is tabix installed?)
                sp.check_call(["bgzip", "-f", fname])
                fname += ".gz"

            # transfer the genome from the tmpdir to the genome_dir
            src = fname
            dst = os.path.join(genome_dir, myname, os.path.basename(fname))
            shutil.move(src, dst)

        sys.stderr.write("name: {}\n".format(dbname))
        sys.stderr.write("local name: {}\n".format(myname))
        sys.stderr.write("fasta: {}\n".format(dst))

        # Create readme with information
        readme = os.path.join(genome_dir, myname, "README.txt")
        with open(readme, "w") as f:
            f.write("name: {}\n".format(myname))
            f.write("original name: {}\n".format(dbname))
            f.write("original filename: {}\n".format(os.path.split(link)[-1]))
            f.write("url: {}\n".format(link))
            f.write("mask: {}\n".format(mask))
            f.write("date: {}\n".format(time.strftime("%Y-%m-%d %H:%M:%S")))
            if regex:
                if invert_match:
                    f.write("regex: {} (inverted match)\n".format(regex))
                else:
                    f.write("regex: {}\n".format(regex))
                f.write("sequences that were excluded:\n")
                for seq in not_included:
                    f.write("\t{}\n".format(seq))
Esempio n. 4
0
    def download_genome(self,
                        name,
                        genome_dir,
                        localname=None,
                        mask="soft",
                        regex=None,
                        invert_match=False,
                        version=None):
        """
        Download a (gzipped) genome file to a specific directory

        Parameters
        ----------
        name : str
            Genome / species name

        genome_dir : str
            Directory to install genome

        localname : str , optional
            Custom name for your genome; defaults to the name returned by
            get_genome_download_link.

        mask: str , optional
            Masking, soft, hard or none (all other strings)

        regex : str , optional
            Regular expression passed to filter_fasta. When given, the genome
            is downloaded to a temporary directory and only the filtered
            FASTA is written to genome_dir.

        invert_match : bool , optional
            Passed to filter_fasta as ``v``; recorded in the README as an
            inverted match.

        version : optional
            Passed on to get_genome_download_link.

        Returns
        -------
        str
            The local genome name, with spaces replaced by underscores.
        """
        genome_dir = os.path.expanduser(genome_dir)

        if not os.path.exists(genome_dir):
            os.makedirs(genome_dir)

        dbname, link = self.get_genome_download_link(name,
                                                     mask=mask,
                                                     version=version)
        myname = dbname
        if localname:
            myname = localname

        myname = myname.replace(" ", "_")

        gzipped = link.endswith(".gz")

        out_dir = os.path.join(genome_dir, myname)
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        down_dir = genome_dir
        fname = os.path.join(out_dir, myname + ".fa")
        # only filled when a regex filter is applied
        not_included = []
        tmp_dir = None
        try:
            urlcleanup()
            response = urlopen(link)
            # try/finally instead of "with": keeps the response closed on
            # every path while staying compatible with Python 2.7
            try:
                sys.stderr.write("downloading from {}...\n".format(link))
                if regex:
                    # download outside genome_dir; only the filtered FASTA
                    # ends up there
                    tmp_dir = mkdtemp()
                    down_dir = tmp_dir
                    os.mkdir(os.path.join(down_dir, myname))
                    fname = os.path.join(down_dir, myname, myname + ".fa")
                with open(fname, "wb") as f_out:
                    if gzipped:
                        # Supports both Python 2.7 as well as 3
                        with gzip.GzipFile(
                                fileobj=io.BytesIO(response.read())) as f_in:
                            shutil.copyfileobj(f_in, f_out)
                    else:
                        f_out.write(response.read())
            finally:
                response.close()
            sys.stderr.write("done...\n")

            if link.endswith("tar.gz"):
                self.tar_to_bigfile(fname, fname)

            if hasattr(self, '_post_process_download'):
                self._post_process_download(name, down_dir, mask)

            if regex:
                infa = fname
                outfa = os.path.join(out_dir, myname + ".fa")
                filter_fasta(infa,
                             outfa,
                             regex=regex,
                             v=invert_match,
                             force=True)

                keys_in = Fasta(infa).keys()
                # open the filtered FASTA once instead of once per key
                kept = set(Fasta(outfa).keys())
                not_included = [k for k in keys_in if k not in kept]
                fname = outfa
        finally:
            # remove the temporary download directory even when a step failed
            if tmp_dir is not None:
                shutil.rmtree(tmp_dir)

        sys.stderr.write("name: {}\n".format(dbname))
        sys.stderr.write("local name: {}\n".format(myname))
        sys.stderr.write("fasta: {}\n".format(fname))

        # Create readme with information
        readme = os.path.join(genome_dir, myname, "README.txt")
        with open(readme, "w") as f:
            f.write("name: {}\n".format(myname))
            f.write("original name: {}\n".format(dbname))
            f.write("original filename: {}\n".format(os.path.split(link)[-1]))
            f.write("url: {}\n".format(link))
            f.write("mask: {}\n".format(mask))
            f.write("date: {}\n".format(time.strftime("%Y-%m-%d %H:%M:%S")))
            if regex:
                if invert_match:
                    f.write("regex: {} (inverted match)\n".format(regex))
                else:
                    f.write("regex: {}\n".format(regex))
                f.write("sequences that were excluded:\n")
                for seq in not_included:
                    f.write("\t{}\n".format(seq))

        return myname
Esempio n. 5
0
    def download_genome(
        self,
        name,
        genomes_dir=None,
        localname=None,
        mask="soft",
        regex=None,
        invert_match=False,
        bgzip=None,
        **kwargs,
    ):
        """
        Download a (gzipped) genome file to a specific directory

        The genome is downloaded and processed in a temporary directory inside
        the target directory, moved into place, and a README describing the
        download is written next to it via write_readme.

        Parameters
        ----------
        name : str
            Genome / species name

        genomes_dir : str , optional
            Directory to install genome

        localname : str , optional
            Custom name for your genome

        mask: str , optional
            Masking, soft, hard or none (all other strings)

        regex : str , optional
            Regular expression to select specific chromosome / scaffold names.

        invert_match : bool , optional
            Set to True to select all chromosomes that don't match the regex.

        bgzip : bool , optional
            If set to True the genome FASTA file will be compressed using bgzip.
            If not specified, the setting from the configuration file will be used.

        **kwargs
            Passed on to get_genome_download_link.

        Raises
        ------
        Whatever ``sp.check_call`` raises when gunzip or bgzip exits with a
        non-zero status or cannot be started.
        """
        name = safe(name)
        self.check_name(name)

        link = self.get_genome_download_link(name, mask=mask, **kwargs)

        localname = get_localname(name, localname)
        genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)
        out_dir = os.path.join(genomes_dir, localname)
        if not os.path.exists(out_dir):
            mkdir_p(out_dir)

        sys.stderr.write(
            f"Downloading genome from {self.name}.\nTarget URL: {link}...\n")

        # only filled when a regex filter is applied
        not_included = []

        # download to tmp dir. Move genome on completion.
        # tmp dir is in genome_dir to prevent moving the genome between disks
        with TemporaryDirectory(dir=out_dir) as tmp_dir:
            fname = os.path.join(tmp_dir, f"{localname}.fa")

            # actual download
            urlcleanup()
            with urlopen(link) as response:
                # check available memory vs file size.
                available_memory = int(virtual_memory().available)
                # download file in chunks if >75% of memory would be used
                cutoff = int(available_memory * 0.75)
                # The header may be absent; an unknown size is treated like a
                # large file and downloaded in chunks.
                content_length = response.info().get("Content-Length")
                fits_in_memory = (content_length is not None
                                  and int(content_length) < cutoff)
                chunk_size = None if fits_in_memory else cutoff
                with open(fname, "wb") as f_out:
                    shutil.copyfileobj(response, f_out, chunk_size)
            sys.stderr.write(
                "Genome download successful, starting post processing...\n")

            # unzip genome
            if link.endswith(".tar.gz"):
                tar_to_bigfile(fname, fname)
            elif link.endswith(".gz"):
                # the download is renamed so that gunzip finds a ".gz" file
                os.rename(fname, fname + ".gz")
                # check_call raises on a non-zero exit status, so no separate
                # return-code check is needed
                sp.check_call(["gunzip", "-f", fname])

            # process genome (e.g. masking)
            if hasattr(self, "_post_process_download"):
                self._post_process_download(name=name,
                                            localname=localname,
                                            out_dir=tmp_dir,
                                            mask=mask)

            if regex:
                infa = fname + "_to_regex"
                outfa = fname
                os.rename(fname, infa)
                filter_fasta(infa,
                             outfa,
                             regex=regex,
                             v=invert_match,
                             force=True)

                keys_in = Fasta(infa).keys()
                # open the filtered FASTA once instead of once per key
                kept = set(Fasta(outfa).keys())
                not_included = [k for k in keys_in if k not in kept]

            # bgzip genome if requested; the configuration is only consulted
            # when the caller did not specify bgzip, so an explicit False wins
            if bgzip is None:
                bgzip = config.get("bgzip")

            if bgzip:
                # check_call raises if bgzip fails (is tabix installed?)
                sp.check_call(["bgzip", "-f", fname])
                fname += ".gz"

            # transfer the genome from the tmpdir to the genome_dir
            src = fname
            dst = os.path.join(genomes_dir, localname, os.path.basename(fname))
            shutil.move(src, dst)

        sys.stderr.write("\n")
        sys.stderr.write("name: {}\n".format(name))
        sys.stderr.write("local name: {}\n".format(localname))
        sys.stderr.write("fasta: {}\n".format(dst))

        # Create readme with information
        readme = os.path.join(genomes_dir, localname, "README.txt")
        metadata = {
            "name": localname,
            "provider": self.name,
            "original name": name,
            "original filename": os.path.split(link)[-1],
            "assembly_accession":
            self.assembly_accession(self.genomes.get(name)),
            "tax_id": self.genome_taxid(self.genomes.get(name)),
            "mask": mask,
            "genome url": link,
            "annotation url": "na",
            "date": time.strftime("%Y-%m-%d %H:%M:%S"),
        }
        lines = []
        if regex:
            regex_line = f"regex: {regex}"
            if invert_match:
                regex_line += " (inverted match)"
            lines += ["", regex_line, "sequences that were excluded:"]
            for seq in not_included:
                lines.append(f"\t{seq}")
        write_readme(readme, metadata, lines)
Esempio n. 6
0
    def download_genome(self, name, genome_dir, localname=None, mask="soft", regex=None, invert_match=False, version=None):
        """
        Download a (gzipped) genome file to a specific directory

        Parameters
        ----------
        name : str
            Genome / species name

        genome_dir : str
            Directory to install genome

        localname : str , optional
            Custom name for your genome; defaults to the name returned by
            get_genome_download_link.

        mask: str , optional
            Masking, soft, hard or none (all other strings)

        regex : str , optional
            Regular expression passed to filter_fasta. When given, the genome
            is downloaded to a temporary directory and only the filtered
            FASTA is written to genome_dir.

        invert_match : bool , optional
            Passed to filter_fasta as ``v``; recorded in the README as an
            inverted match.

        version : optional
            Passed on to get_genome_download_link.

        Returns
        -------
        str
            The local genome name, with spaces replaced by underscores.
        """
        genome_dir = os.path.expanduser(genome_dir)

        if not os.path.exists(genome_dir):
            os.makedirs(genome_dir)

        dbname, link = self.get_genome_download_link(name, mask=mask, version=version)
        myname = dbname
        if localname:
            myname = localname

        myname = myname.replace(" ", "_")

        gzipped = link.endswith(".gz")

        out_dir = os.path.join(genome_dir, myname)
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        down_dir = genome_dir
        fname = os.path.join(out_dir, myname + ".fa")
        # only filled when a regex filter is applied
        not_included = []
        tmp_dir = None
        try:
            urlcleanup()
            response = urlopen(link)
            # try/finally instead of "with": keeps the response closed on
            # every path while staying compatible with Python 2.7
            try:
                sys.stderr.write("downloading from {}...\n".format(link))
                if regex:
                    tmp_dir = mkdtemp()
                    down_dir = tmp_dir
                    # Mirror the <dir>/<myname>/<myname>.fa layout used when
                    # no regex is given, so the post-processing hook receives
                    # the same directory structure in both cases.
                    # NOTE(review): the hook is not visible here; confirm it
                    # expects this layout.
                    os.mkdir(os.path.join(down_dir, myname))
                    fname = os.path.join(down_dir, myname, myname + ".fa")
                with open(fname, "wb") as f_out:
                    if gzipped:
                        # Supports both Python 2.7 as well as 3
                        with gzip.GzipFile(fileobj=io.BytesIO(response.read())) as f_in:
                            shutil.copyfileobj(f_in, f_out)
                    else:
                        f_out.write(response.read())
            finally:
                response.close()
            sys.stderr.write("done...\n")

            if link.endswith("tar.gz"):
                self.tar_to_bigfile(fname, fname)

            if hasattr(self, '_post_process_download'):
                self._post_process_download(name, down_dir, mask)

            if regex:
                infa = fname
                outfa = os.path.join(out_dir, myname + ".fa")
                filter_fasta(
                    infa,
                    outfa,
                    regex=regex,
                    v=invert_match,
                    force=True
                    )

                keys_in = Fasta(infa).keys()
                # open the filtered FASTA once instead of once per key
                kept = set(Fasta(outfa).keys())
                not_included = [k for k in keys_in if k not in kept]
                fname = outfa
        finally:
            # remove the temporary download directory even when a step failed
            if tmp_dir is not None:
                shutil.rmtree(tmp_dir)

        sys.stderr.write("name: {}\n".format(dbname))
        sys.stderr.write("local name: {}\n".format(myname))
        sys.stderr.write("fasta: {}\n".format(fname))

        # Create readme with information
        readme = os.path.join(genome_dir, myname, "README.txt")
        with open(readme, "w") as f:
            f.write("name: {}\n".format(myname))
            f.write("original name: {}\n".format(dbname))
            f.write("original filename: {}\n".format(os.path.split(link)[-1]))
            f.write("url: {}\n".format(link))
            f.write("mask: {}\n".format(mask))
            f.write("date: {}\n".format(time.strftime("%Y-%m-%d %H:%M:%S")))
            if regex:
                if invert_match:
                    f.write("regex: {} (inverted match)\n".format(regex))
                else:
                    f.write("regex: {}\n".format(regex))
                f.write("sequences that were excluded:\n")
                for seq in not_included:
                    f.write("\t{}\n".format(seq))

        return myname