def regex_filer(_fname, _regex, _v):
    """
    Filter a FASTA file in place by regex and report the removed sequences

    The file at ``_fname`` is renamed to ``_fname + "_to_regex"`` and
    ``filter_fasta`` writes the filtered result back to ``_fname``.
    The renamed, unfiltered file is not removed by this function.

    Parameters
    ----------
    _fname : str
        Path of the FASTA file to filter

    _regex : str
        Passed to ``filter_fasta`` as ``regex``

    _v : bool
        Passed to ``filter_fasta`` as ``v``

    Returns
    -------
    list
        Keys present in the unfiltered file but absent from the filtered
        file, in the order of the unfiltered file.
    """
    infa = _fname + "_to_regex"
    outfa = _fname
    os.rename(_fname, infa)

    filter_fasta(infa, outfa, regex=_regex, v=_v, force=True)

    # Open the filtered FASTA once. Building it inside the comprehension's
    # condition would re-open the file for every key of the input.
    kept = set(Fasta(outfa).keys())
    return [k for k in Fasta(infa).keys() if k not in kept]
def regex_filer(_fname, _regex, _v):
    """
    Filter a FASTA file in place by regex and list what was dropped

    ``_fname`` is first renamed to ``_fname + "_to_regex"``; the result of
    ``filter_fasta`` is then written back to the original path.

    Parameters
    ----------
    _fname : str
        Path of the FASTA file to filter

    _regex : str
        Passed to ``filter_fasta`` as ``regex``

    _v : bool
        Passed to ``filter_fasta`` as ``v``

    Returns
    -------
    list
        Keys of the unfiltered file that are not among the keys of the
        object returned by ``filter_fasta``.
    """
    source = _fname + "_to_regex"
    os.rename(_fname, source)

    # run the filter first; its return value provides the surviving keys
    filtered = filter_fasta(source, outfa=_fname, regex=_regex, v=_v, force=True)
    retained = filtered.keys()

    dropped = []
    for key in Fasta(source).keys():
        if key not in retained:
            dropped.append(key)
    return dropped
def download_genome(
    self,
    name,
    genome_dir,
    localname=None,
    mask="soft",
    regex=None,
    invert_match=False,
    bgzip=None,
    **kwargs
):
    """
    Download a (gzipped) genome file to a specific directory

    The file is downloaded into a temporary directory inside
    ``genome_dir/<local name>``, decompressed, optionally post-processed,
    filtered and bgzipped there, and then moved into
    ``genome_dir/<local name>``. A ``README.txt`` describing the download
    is written to the same directory.

    Parameters
    ----------
    name : str
        Genome / species name

    genome_dir : str
        Directory to install genome

    localname : str , optional
        Custom name for your genome

    mask: str , optional
        Masking, soft, hard or none (all other strings)

    regex : str , optional
        Regular expression to select specific chromosome / scaffold names.

    invert_match : bool , optional
        Set to True to select all chromosomes that don't match the regex.

    bgzip : bool , optional
        If set to True the genome FASTA file will be compressed using bgzip.
        If not specified, the setting from the configuration file will be used.

    **kwargs
        Passed on to ``self.get_genome_download_link``.
    """
    genome_dir = os.path.expanduser(genome_dir)
    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(genome_dir, exist_ok=True)

    dbname, link = self.get_genome_download_link(name, mask=mask, **kwargs)
    myname = get_localname(dbname, localname)
    out_dir = os.path.join(genome_dir, myname)
    os.makedirs(out_dir, exist_ok=True)

    sys.stderr.write("Downloading genome from {}...\n".format(link))

    not_included = []

    # download to tmp dir. Move genome on completion.
    # tmp dir is in genome_dir to prevent moving the genome between disks
    with TemporaryDirectory(dir=out_dir) as tmpdir:
        fname = os.path.join(tmpdir, myname + ".fa")

        # actual download
        urlcleanup()
        with urlopen(link) as response:
            # check available memory vs file size.
            available_memory = int(virtual_memory().available)
            cutoff = int(available_memory * 0.75)

            # The header may be absent; int(None) would raise a TypeError,
            # so an unknown size takes the same path as a small file.
            content_length = response.info().get("Content-Length")

            # download file in chunks if >75% of memory would be used
            if content_length is not None and int(content_length) >= cutoff:
                chunk_size = cutoff
            else:
                chunk_size = None

            with open(fname, "wb") as f_out:
                shutil.copyfileobj(response, f_out, chunk_size)

        # unzip genome
        if link.endswith("tar.gz"):
            self.tar_to_bigfile(fname, fname)
        elif link.endswith(".gz"):
            # gunzip will only work with files ending with ".gz"
            os.rename(fname, fname + ".gz")
            # NOTE: check_call raises CalledProcessError on a non-zero exit
            # status, so the check below is only a safeguard.
            ret = sp.check_call(["gunzip", "-f", fname])
            if ret != 0:
                raise Exception("Error gunzipping genome {}".format(fname))

        # process genome (e.g. masking)
        if hasattr(self, "_post_process_download"):
            self._post_process_download(name, localname, tmpdir, mask)

        if regex:
            infa = fname + "_to_regex"
            outfa = fname
            os.rename(fname, infa)
            filter_fasta(infa, outfa, regex=regex, v=invert_match, force=True)

            # Open the filtered FASTA once instead of once per input key.
            kept = set(Fasta(outfa).keys())
            not_included = [k for k in Fasta(infa).keys() if k not in kept]

        # bgzip genome if requested
        if bgzip is None:
            bgzip = config.get("bgzip", False)

        if bgzip:
            # NOTE: as above, a failing bgzip surfaces as CalledProcessError.
            ret = sp.check_call(["bgzip", "-f", fname])
            if ret != 0:
                raise Exception(
                    "Error bgzipping {}. ".format(fname) + "Is tabix installed?"
                )
            fname += ".gz"

        # transfer the genome from the tmpdir to the genome_dir
        src = fname
        dst = os.path.join(genome_dir, myname, os.path.basename(fname))
        shutil.move(src, dst)

    sys.stderr.write("name: {}\n".format(dbname))
    sys.stderr.write("local name: {}\n".format(myname))
    sys.stderr.write("fasta: {}\n".format(dst))

    # Create readme with information
    readme = os.path.join(genome_dir, myname, "README.txt")
    with open(readme, "w") as f:
        f.write("name: {}\n".format(myname))
        f.write("original name: {}\n".format(dbname))
        f.write("original filename: {}\n".format(os.path.split(link)[-1]))
        f.write("url: {}\n".format(link))
        f.write("mask: {}\n".format(mask))
        f.write("date: {}\n".format(time.strftime("%Y-%m-%d %H:%M:%S")))
        if regex:
            if invert_match:
                f.write("regex: {} (inverted match)\n".format(regex))
            else:
                f.write("regex: {}\n".format(regex))
            f.write("sequences that were excluded:\n")
            for seq in not_included:
                f.write("\t{}\n".format(seq))
def download_genome(
    self,
    name,
    genome_dir,
    localname=None,
    mask="soft",
    regex=None,
    invert_match=False,
    version=None,
):
    """
    Download a (gzipped) genome file to a specific directory

    Without ``regex`` the genome is written directly to
    ``genome_dir/<local name>/<local name>.fa``. With ``regex`` it is first
    downloaded into a temporary directory, filtered into the final location,
    and the temporary directory is removed afterwards. A ``README.txt``
    describing the download is written next to the genome.

    Parameters
    ----------
    name : str
        Genome / species name

    genome_dir : str
        Directory to install genome

    localname : str , optional
        Name to use instead of the name returned by
        ``get_genome_download_link``. Spaces are replaced by underscores.

    mask: str , optional
        Masking, soft, hard or none (all other strings)

    regex : str , optional
        Passed to ``filter_fasta`` to filter the downloaded sequences.

    invert_match : bool , optional
        Passed to ``filter_fasta`` as ``v``; recorded in the README as an
        inverted match.

    version : optional
        Passed on to ``self.get_genome_download_link``.
    """
    genome_dir = os.path.expanduser(genome_dir)

    if not os.path.exists(genome_dir):
        os.makedirs(genome_dir)

    dbname, link = self.get_genome_download_link(
        name, mask=mask, version=version)
    myname = dbname
    if localname:
        myname = localname
    myname = myname.replace(" ", "_")

    gzipped = link.endswith(".gz")

    if not os.path.exists(os.path.join(genome_dir, myname)):
        os.makedirs(os.path.join(genome_dir, myname))

    urlcleanup()
    response = urlopen(link)

    down_dir = genome_dir
    fname = os.path.join(genome_dir, myname, myname + ".fa")
    tmp_dir = None
    not_included = []
    try:
        try:
            sys.stderr.write("downloading from {}...\n".format(link))
            if regex:
                tmp_dir = mkdtemp()
                down_dir = tmp_dir
                os.mkdir(os.path.join(down_dir, myname))
                fname = os.path.join(down_dir, myname, myname + ".fa")

            with open(fname, "wb") as f_out:
                if gzipped:
                    # Supports both Python 2.7 as well as 3
                    with gzip.GzipFile(
                            fileobj=io.BytesIO(response.read())) as f_in:
                        shutil.copyfileobj(f_in, f_out)
                else:
                    f_out.write(response.read())
        finally:
            # the connection is no longer needed once the data is on disk,
            # and must not stay open if the download fails
            response.close()
        sys.stderr.write("done...\n")

        if link.endswith("tar.gz"):
            self.tar_to_bigfile(fname, fname)

        if hasattr(self, '_post_process_download'):
            self._post_process_download(name, down_dir, mask)

        if regex:
            infa = fname
            outfa = os.path.join(genome_dir, myname, myname + ".fa")
            filter_fasta(
                infa, outfa, regex=regex, v=invert_match, force=True)

            # Open the filtered FASTA once instead of once per input key.
            kept = set(Fasta(outfa).keys())
            not_included = [k for k in Fasta(infa).keys() if k not in kept]
            fname = outfa
    finally:
        # remove the temporary download directory on failure as well
        if tmp_dir is not None:
            shutil.rmtree(tmp_dir)

    sys.stderr.write("name: {}\n".format(dbname))
    sys.stderr.write("local name: {}\n".format(myname))
    sys.stderr.write("fasta: {}\n".format(fname))

    # Create readme with information
    readme = os.path.join(genome_dir, myname, "README.txt")
    with open(readme, "w") as f:
        f.write("name: {}\n".format(myname))
        f.write("original name: {}\n".format(dbname))
        f.write("original filename: {}\n".format(os.path.split(link)[-1]))
        f.write("url: {}\n".format(link))
        f.write("mask: {}\n".format(mask))
        f.write("date: {}\n".format(time.strftime("%Y-%m-%d %H:%M:%S")))
        if regex:
            if invert_match:
                f.write("regex: {} (inverted match)\n".format(regex))
            else:
                f.write("regex: {}\n".format(regex))
            f.write("sequences that were excluded:\n")
            for seq in not_included:
                f.write("\t{}\n".format(seq))
def download_genome(
    self,
    name,
    genomes_dir=None,
    localname=None,
    mask="soft",
    regex=None,
    invert_match=False,
    bgzip=None,
    **kwargs,
):
    """
    Download a (gzipped) genome file to a specific directory

    The file is downloaded into a temporary directory inside
    ``genomes_dir/<localname>``, decompressed, optionally post-processed,
    filtered and bgzipped there, and then moved into
    ``genomes_dir/<localname>``. A README with the download metadata is
    written via ``write_readme``.

    Parameters
    ----------
    name : str
        Genome / species name

    genomes_dir : str , optional
        Directory to install genome

    localname : str , optional
        Custom name for your genome

    mask: str , optional
        Masking, soft, hard or none (all other strings)

    regex : str , optional
        Regular expression to select specific chromosome / scaffold names.

    invert_match : bool , optional
        Set to True to select all chromosomes that don't match the regex.

    bgzip : bool , optional
        If set to True the genome FASTA file will be compressed using bgzip.
        If not specified, the setting from the configuration file will be used.

    **kwargs
        Passed on to ``self.get_genome_download_link``.
    """
    name = safe(name)
    self.check_name(name)

    link = self.get_genome_download_link(name, mask=mask, **kwargs)

    localname = get_localname(name, localname)
    genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)
    out_dir = os.path.join(genomes_dir, localname)
    if not os.path.exists(out_dir):
        mkdir_p(out_dir)

    sys.stderr.write(
        f"Downloading genome from {self.name}.\nTarget URL: {link}...\n")

    not_included = []

    # download to tmp dir. Move genome on completion.
    # tmp dir is in genome_dir to prevent moving the genome between disks
    with TemporaryDirectory(dir=out_dir) as tmp_dir:
        fname = os.path.join(tmp_dir, f"{localname}.fa")

        # actual download
        urlcleanup()
        with urlopen(link) as response:
            # check available memory vs file size.
            available_memory = int(virtual_memory().available)
            cutoff = int(available_memory * 0.75)

            # The header may be absent; int(None) would raise a TypeError,
            # so an unknown size takes the same path as a small file.
            content_length = response.info().get("Content-Length")

            # download file in chunks if >75% of memory would be used
            if content_length is not None and int(content_length) >= cutoff:
                chunk_size = cutoff
            else:
                chunk_size = None

            with open(fname, "wb") as f_out:
                shutil.copyfileobj(response, f_out, chunk_size)
        sys.stderr.write(
            "Genome download successful, starting post processing...\n")

        # unzip genome
        if link.endswith(".tar.gz"):
            tar_to_bigfile(fname, fname)
        elif link.endswith(".gz"):
            # the download is renamed to end in ".gz" before gunzip is run
            os.rename(fname, fname + ".gz")
            # NOTE: check_call raises CalledProcessError on a non-zero exit
            # status, so the check below is only a safeguard.
            ret = sp.check_call(["gunzip", "-f", fname])
            if ret != 0:
                raise Exception(f"Error gunzipping genome {fname}")

        # process genome (e.g. masking)
        if hasattr(self, "_post_process_download"):
            self._post_process_download(
                name=name, localname=localname, out_dir=tmp_dir, mask=mask)

        if regex:
            infa = fname + "_to_regex"
            outfa = fname
            os.rename(fname, infa)
            filter_fasta(infa, outfa, regex=regex, v=invert_match, force=True)

            # Open the filtered FASTA once instead of once per input key.
            kept = set(Fasta(outfa).keys())
            not_included = [k for k in Fasta(infa).keys() if k not in kept]

        # bgzip genome if requested.
        # Only consult the configuration when the caller did not specify
        # bgzip, so that an explicit bgzip=False is respected.
        if bgzip is None:
            bgzip = config.get("bgzip")

        if bgzip:
            # NOTE: as above, a failing bgzip surfaces as CalledProcessError.
            ret = sp.check_call(["bgzip", "-f", fname])
            if ret != 0:
                raise Exception(
                    f"Error bgzipping {name}. Is tabix installed?")
            fname += ".gz"

        # transfer the genome from the tmpdir to the genome_dir
        src = fname
        dst = os.path.join(genomes_dir, localname, os.path.basename(fname))
        shutil.move(src, dst)

    sys.stderr.write("\n")
    sys.stderr.write("name: {}\n".format(name))
    sys.stderr.write("local name: {}\n".format(localname))
    sys.stderr.write("fasta: {}\n".format(dst))

    # Create readme with information
    readme = os.path.join(genomes_dir, localname, "README.txt")
    metadata = {
        "name": localname,
        "provider": self.name,
        "original name": name,
        "original filename": os.path.split(link)[-1],
        "assembly_accession": self.assembly_accession(self.genomes.get(name)),
        "tax_id": self.genome_taxid(self.genomes.get(name)),
        "mask": mask,
        "genome url": link,
        "annotation url": "na",
        "date": time.strftime("%Y-%m-%d %H:%M:%S"),
    }
    lines = []
    if regex:
        regex_line = f"regex: {regex}"
        if invert_match:
            regex_line += " (inverted match)"
        lines += ["", regex_line, "sequences that were excluded:"]
        for seq in not_included:
            lines.append(f"\t{seq}")
    write_readme(readme, metadata, lines)
def download_genome(
    self,
    name,
    genome_dir,
    localname=None,
    mask="soft",
    regex=None,
    invert_match=False,
    version=None,
):
    """
    Download a (gzipped) genome file to a specific directory

    Without ``regex`` the genome is written directly to
    ``genome_dir/<local name>/<local name>.fa``. With ``regex`` it is first
    downloaded to ``<local name>.fa`` inside a temporary directory, filtered
    into the final location, and the temporary directory is removed
    afterwards. A ``README.txt`` describing the download is written next to
    the genome.

    Parameters
    ----------
    name : str
        Genome / species name

    genome_dir : str
        Directory to install genome

    localname : str , optional
        Name to use instead of the name returned by
        ``get_genome_download_link``. Spaces are replaced by underscores.

    mask: str , optional
        Masking, soft, hard or none (all other strings)

    regex : str , optional
        Passed to ``filter_fasta`` to filter the downloaded sequences.

    invert_match : bool , optional
        Passed to ``filter_fasta`` as ``v``; recorded in the README as an
        inverted match.

    version : optional
        Passed on to ``self.get_genome_download_link``.
    """
    genome_dir = os.path.expanduser(genome_dir)

    if not os.path.exists(genome_dir):
        os.makedirs(genome_dir)

    dbname, link = self.get_genome_download_link(
        name, mask=mask, version=version)
    myname = dbname
    if localname:
        myname = localname
    myname = myname.replace(" ", "_")

    gzipped = link.endswith(".gz")

    if not os.path.exists(os.path.join(genome_dir, myname)):
        os.makedirs(os.path.join(genome_dir, myname))

    urlcleanup()
    response = urlopen(link)

    down_dir = genome_dir
    fname = os.path.join(genome_dir, myname, myname + ".fa")
    tmp_dir = None
    not_included = []
    try:
        try:
            sys.stderr.write("downloading from {}...\n".format(link))
            if regex:
                tmp_dir = mkdtemp()
                down_dir = tmp_dir
                fname = os.path.join(down_dir, myname + ".fa")

            with open(fname, "wb") as f_out:
                if gzipped:
                    # Supports both Python 2.7 as well as 3
                    with gzip.GzipFile(
                            fileobj=io.BytesIO(response.read())) as f_in:
                        shutil.copyfileobj(f_in, f_out)
                else:
                    f_out.write(response.read())
        finally:
            # the connection is no longer needed once the data is on disk,
            # and must not stay open if the download fails
            response.close()
        sys.stderr.write("done...\n")

        if link.endswith("tar.gz"):
            self.tar_to_bigfile(fname, fname)

        if hasattr(self, '_post_process_download'):
            self._post_process_download(name, down_dir, mask)

        if regex:
            infa = fname
            outfa = os.path.join(genome_dir, myname, myname + ".fa")
            filter_fasta(
                infa, outfa, regex=regex, v=invert_match, force=True
            )

            # Open the filtered FASTA once instead of once per input key.
            kept = set(Fasta(outfa).keys())
            not_included = [k for k in Fasta(infa).keys() if k not in kept]
            fname = outfa
    finally:
        # remove the temporary download directory on failure as well
        if tmp_dir is not None:
            shutil.rmtree(tmp_dir)

    sys.stderr.write("name: {}\n".format(dbname))
    sys.stderr.write("local name: {}\n".format(myname))
    sys.stderr.write("fasta: {}\n".format(fname))

    # Create readme with information
    readme = os.path.join(genome_dir, myname, "README.txt")
    with open(readme, "w") as f:
        f.write("name: {}\n".format(myname))
        f.write("original name: {}\n".format(dbname))
        f.write("original filename: {}\n".format(os.path.split(link)[-1]))
        f.write("url: {}\n".format(link))
        f.write("mask: {}\n".format(mask))
        f.write("date: {}\n".format(time.strftime("%Y-%m-%d %H:%M:%S")))
        if regex:
            if invert_match:
                f.write("regex: {} (inverted match)\n".format(regex))
            else:
                f.write("regex: {}\n".format(regex))
            f.write("sequences that were excluded:\n")
            for seq in not_included:
                f.write("\t{}\n".format(seq))