def parse_file(fpath, skip: Union[tuple, str] = "#"):
    """
    Lazily read a text file and yield its lines, stripped of whitespace.

    Parameters
    ----------
    fpath
        path to the file; passed through cleanpath before opening
    skip : str or tuple of str, optional
        prefix(es) of lines to leave out. The check is done on the
        stripped line, so indented lines starting with a prefix are
        dropped as well. Blank lines are yielded as empty strings.

    Yields
    ------
    str
        every line that does not start with a skip prefix
    """
    with open(cleanpath(fpath)) as handle:
        for raw in handle:
            stripped = raw.strip()
            if not stripped.startswith(skip):
                yield stripped
def _lazy_provider_selection(name, provider=None):
    """
    Return the first online provider that can supply genome `name`.

    Providers are tried in the order given by online_providers(provider).
    A provider is accepted when one of these holds (checked in this order):
      - `name` is among the provider's genomes
      - the provider is "URL" and check_url(name) passes try_except_pass
      - the provider is "Local" and `name` is an existing path

    Raises
    ------
    GenomeDownloadError
        if none of the tried providers matched; the message lists the
        names of all providers that were tried
    """
    providers = []
    for candidate in online_providers(provider):
        providers.append(candidate.name)
        # `or` short-circuits, so the (possibly costly) checks run in the
        # listed order and stop at the first success
        if (
            name in candidate.genomes
            or (
                candidate.name == "URL"
                and try_except_pass(ValueError, check_url, name)
            )
            or (candidate.name == "Local" and os.path.exists(cleanpath(name)))
        ):
            return candidate

    raise GenomeDownloadError(f"{name} not found on {', '.join(providers)}.")
def __init__(self, name: str, genomes_dir: str = None, quiet: bool = False):
    """
    Collect the paths to a genome's annotation and sequence files.

    Parameters
    ----------
    name : str
        genome name or path, resolved to a name and directory by
        _get_name_and_dir. If it is a path with a bed/gtf/fa suffix,
        that path itself is used for the matching file attribute.
    genomes_dir : str, optional
        forwarded to _get_name_and_dir
    quiet : bool, optional
        its negation is passed as the last argument of _get_file for the
        annotation files (presumably toggles a missing-file warning;
        confirm in _get_file)
    """
    # name and directory
    resolved_name, resolved_dir = _get_name_and_dir(name, genomes_dir)
    self.name = resolved_name
    "genome name"
    self.genome_dir = resolved_dir
    "path to the genome directory"

    # only the last two suffixes matter: the file type and an optional .gz
    fname = cleanpath(name)
    suffixes = Path(fname).suffixes[-2:]
    is_bed = ".bed" in suffixes or ".BED" in suffixes
    is_gtf = ".gtf" in suffixes or ".GTF" in suffixes
    is_fasta = ".fa" in suffixes

    # annotation files
    self.annotation_bed_file = (
        fname
        if is_bed
        else _get_file(self.genome_dir, f"{self.name}.annotation.bed", not quiet)
    )
    "path to the gene annotation BED file"
    self.annotation_gtf_file = (
        fname
        if is_gtf
        else _get_file(self.genome_dir, f"{self.name}.annotation.gtf", not quiet)
    )
    "path to the gene annotation GTF file"

    # genome files
    self.genome_file = (
        fname if is_fasta else _get_file(self.genome_dir, f"{self.name}.fa", False)
    )
    "path to the genome fasta"
    self.readme_file = _get_file(self.genome_dir, "README.txt", False)
    "path to the README file"
    self.index_file = _get_file(self.genome_dir, f"{self.name}.fa.fai", False)
    "path to the genome index"
    self.sizes_file = _get_file(self.genome_dir, f"{self.name}.fa.sizes", False)
    "path to the chromosome sizes file"

    # genome attributes: the README stores "na" when the tax_id is unknown
    metadata = read_readme(str(self.readme_file))[0]
    raw_tax_id = metadata["tax_id"]
    self.tax_id = int(raw_tax_id) if raw_tax_id != "na" else None
    "genome taxonomy identifier"
def get_annotation_download_links(self, name, **kwargs):
    """
    Return all files next to `name` that contain the genome name and end
    in an annotation extension.

    The directory of `name` (after cleanpath) is listed, and every entry
    that contains the genome name (from get_genomename) followed, at the
    end of the filename, by .gtf, .gff or .gff3 (optionally gzipped) is
    returned. Matching is case-insensitive.

    Parameters
    ----------
    name
        path whose directory is searched and from which the genome name
        is derived
    **kwargs
        accepted for interface compatibility; not used here

    Returns
    -------
    list of str
        full paths to the matching files, grouped by extension in the
        order gtf, gff, gff3. Empty if nothing matched.
    """
    name = cleanpath(name)
    genome_dir = os.path.dirname(name)
    search_list = os.listdir(genome_dir)
    # the genome name is literal text: escape regex metacharacters such as
    # "." or "+" so they cannot mis-match or raise re.error
    search_name = re.escape(get_genomename(name))

    hits = []
    for ext in ["gtf", "gff", "gff3"]:
        # .*? = non greedy filler. (\.gz)? = optional .gz
        # $ = the extension must end the filename, so "x.gff3" is not
        # reported as a (nonexistent) "x.gff" as well
        pattern = re.compile(
            rf"{search_name}.*?\.{ext}(\.gz)?$", flags=re.IGNORECASE
        )
        for fname in search_list:
            if pattern.search(fname):
                # join the actual directory entry (not the matched
                # substring), so the returned path always exists
                hits.append(os.path.join(genome_dir, fname))
    return hits
def _parse_filename(self, name: str) -> str:
    """
    Resolve `name` to the path of the genome fasta file.

    `name` may be the path to a fasta file, the path to a folder
    containing the fasta, or a genome name (e.g. hg38). An existing file
    path is returned as is. Otherwise the files reported by
    glob_ext_files for `name` and for the default genome directory
    (self.genomes_dir/self.name) are scanned, in that order, for a
    filename containing "<self.name>.fa".

    Raises
    ------
    FileNotFoundError
        if no matching fasta file was found
    """
    path_name = cleanpath(name)
    if os.path.isfile(path_name):
        return path_name

    default_genome_dir = os.path.join(self.genomes_dir, self.name)
    candidates = glob_ext_files(path_name) + glob_ext_files(default_genome_dir)
    fasta_name = self.name + ".fa"
    found = next(
        (c for c in candidates if fasta_name in os.path.basename(c)),
        None,
    )
    if found is None:
        raise FileNotFoundError(
            f"could not find {self.name}.fa(.gz) in genome_dir {default_genome_dir}"
        )
    return found
def get_annotation_download_link(self, name: str, **kwargs) -> str:
    """
    Return the filepath of a gene annotation for the genome.

    If `path_to_annotation` is given it is validated and returned.
    Otherwise the first hit of get_annotation_download_links is used.

    Parameters
    ----------
    name : str
        genome name
    **kwargs: dict, optional:
        path_to_annotation : direct path to the gene annotation

    Returns
    -------
    str
        path to the annotation file

    Raises
    ------
    FileNotFoundError
        if path_to_annotation does not exist
    TypeError
        if path_to_annotation does not have a supported extension
    GenomeDownloadError
        if no path was given and no annotation was found
    """
    path = kwargs.get("path_to_annotation")

    # no explicit path: fall back to searching for an annotation
    if not path:
        paths = self.get_annotation_download_links(name)
        if not paths:
            raise GenomeDownloadError(
                f"No gene annotations found for {get_genomename(name)}.\n"
            )
        return paths[0]

    # explicit path: it must exist and have a supported extension
    path = cleanpath(path)
    if not os.path.exists(path):
        raise FileNotFoundError(f"Local path to annotation does not exist: {path}")

    supported = [".gtf", ".gff", ".gff3", ".bed"]
    if get_file_info(path)[0] in supported:
        return path
    raise TypeError("Only (gzipped) gtf, gff and bed files are supported.\n")
def _get_name_and_dir(name, genomes_dir=None):
    """
    Returns the name and directory of the genome.

    `name` is resolved in this order:
      1. path to an existing file: the directory is the file's folder and
         the name is the filename without its (.annotation).<ext>(.gz)
         suffix. Only bed, gtf and fa files are accepted.
      2. path to an existing directory: that directory is used and the
         name is its basename.
      3. an entry in the genomes directory (from get_genomes_dir): the
         directory is genomes_dir/name and the name is returned unchanged.

    Parameters
    ----------
    name
        genome name, or path to a genome directory or a supported file
    genomes_dir : optional
        forwarded to get_genomes_dir

    Returns
    -------
    tuple
        (name, genome_dir)

    Raises
    ------
    NotImplementedError
        if `name` is a file without a supported extension
    FileNotFoundError
        if `name` could not be resolved
    """
    fname = cleanpath(name)
    genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)
    if os.path.isfile(fname):
        # Validate the extension at the END of the filename. A substring
        # test on the whole path would accept e.g. "/data/fabian/x.txt"
        # merely because the path contains "fa".
        ext_pattern = r"\.(gtf|GTF|bed|BED|fa)(\.gz)?$"
        basename = os.path.basename(fname)
        if not re.search(ext_pattern, basename):
            raise NotImplementedError(
                "Only (gzipped) bed, gtf or fasta files are supported!"
            )
        genome_dir = os.path.dirname(fname)
        # remove suffixes
        name = re.sub(r"(\.annotation)?" + ext_pattern, "", safe(basename))
    elif os.path.isdir(fname):
        genome_dir = fname
        name = safe(os.path.basename(fname))
    elif name in os.listdir(genomes_dir):
        genome_dir = os.path.join(genomes_dir, name)
    else:
        raise FileNotFoundError(f"Could not find {name}")
    return name, genome_dir
def get_genome_download_link(self, path, mask=None, **kwargs):
    """
    Return the cleaned local path to the genome, if it exists.

    Parameters
    ----------
    path
        local path to the genome; passed through cleanpath
    mask : optional
        accepted for interface compatibility; not used here
    **kwargs
        accepted for interface compatibility; not used here

    Raises
    ------
    FileNotFoundError
        if the path does not exist
    """
    path = cleanpath(path)
    if os.path.exists(path):
        return path
    raise FileNotFoundError(f"Local path to genome does not exist: {path}")