def after_genome_download(self, genome, threads=1, force=False):
    """
    Build a STAR index for a downloaded genome.

    Nothing is done when ``cmd_ok("STAR")`` is falsy, or when the index path
    already exists and ``force`` is not set. Otherwise the index directory
    is removed and recreated before the index command is run.

    Parameters
    ----------
    genome
        object providing ``plugin``, ``filename`` and ``annotation_gtf_file``
    threads : int, optional
        value passed to STAR's ``--runThreadN``
    force : bool, optional
        rebuild the index even if the index path already exists
    """
    star_index = genome.plugin["star"]["index_name"]
    if not cmd_ok("STAR"):
        return
    if os.path.exists(star_index) and not force:
        return

    index_dir = genome.plugin["star"]["index_dir"]
    rm_rf(index_dir)
    mkdir_p(index_dir)

    # extracted_file yields the genome path handed to STAR
    # (presumably a decompressed copy when the genome is bgzipped)
    with extracted_file(genome.filename) as fasta:
        star_cmd = " ".join(
            [
                "STAR --runMode genomeGenerate",
                f"--runThreadN {threads}",
                f"--genomeFastaFiles {fasta}",
                f"--genomeDir {index_dir}",
                f"--outFileNamePrefix {index_dir}",
            ]
        )

        annotation = genome.annotation_gtf_file
        if not annotation:
            logger.info("Creating STAR index without annotation file.")
            run_index_cmd("star", star_cmd)
        else:
            # an annotation is present: generate a splice-aware index
            with extracted_file(annotation) as gtf:
                star_cmd += f" --sjdbGTFfile {gtf}"
                run_index_cmd("star", star_cmd)
def after_genome_download(self, genome, threads=1, force=False):
    """
    Build a gmap index for a downloaded genome.

    Nothing is done when ``cmd_ok("gmap_build")`` is falsy or when the index
    directory already exists (after optionally removing it with ``force``).

    Parameters
    ----------
    genome
        object providing ``plugin``, ``filename`` and ``name``
    threads : int, optional
        accepted for a uniform plugin signature; not used by this command
    force : bool, optional
        remove an existing index directory and rebuild it
    """
    if not cmd_ok("gmap_build"):
        return

    index_dir = genome.plugin["gmap"]["index_dir"]
    if force:
        # Start from scratch
        rm_rf(index_dir)
    if os.path.exists(index_dir):
        return

    # unzip genome if zipped and return up-to-date genome name
    fname, bgzip = gunzip_and_name(genome.filename)
    try:
        # gmap outputs a folder named genome.name
        # its content is moved to index dir, consistent with other plugins
        tmp_dir = mkdtemp(dir=".")
        try:
            cmd = f"gmap_build -D {tmp_dir} -d {genome.name} {fname}"
            run_index_cmd("gmap", cmd)

            # Move files to index_dir
            src = os.path.join(tmp_dir, genome.name)
            move(src, index_dir)
        finally:
            # always drop the scratch directory, also when the move fails
            rm_rf(tmp_dir)
    finally:
        # re-zip genome if unzipped, even when indexing failed
        bgzip_and_name(fname, bgzip)
def download_annotation(name, annot, genomes_dir, localname, n=None):
    """
    Download the extended genePred file from the UCSC MySQL database.

    Next convert this to a BED and GTF file, written to
    ``<genomes_dir>/<localname>/<localname>.annotation.{gtf,bed}``.

    Parameters
    ----------
    name : str
        UCSC database to query
    annot : str
        table to download
    genomes_dir : str
        directory containing the genome folders
    localname : str
        genome folder name and output file prefix
    n : int, optional
        if truthy, limit the query to this many rows

    Raises
    ------
    subprocess.CalledProcessError
        if a genePred conversion tool exits with a non-zero status
    """
    out_dir = os.path.join(genomes_dir, localname)
    mkdir_p(out_dir)
    out_prefix = os.path.join(out_dir, localname)
    gtf_file = f"{out_prefix}.annotation.gtf"
    bed_file = f"{out_prefix}.annotation.bed"

    tmp_dir = mkdtemp(dir=out_dir)
    try:
        pred_file = f"{os.path.join(tmp_dir, localname)}.annotation.extended.gp"

        # MySQL query 1: get column names for this genePred
        command = f"SHOW COLUMNS FROM {annot};"
        cols = list(query_ucsc(command, database=name))

        # drop columns the UCSC tools cannot handle
        # see https://genome.ucsc.edu/FAQ/FAQformat.html#format9
        accepted_cols = [
            "geneName",
            "name",
            "chrom",
            "strand",
            "txStart",
            "txEnd",
            "cdsStart",
            "cdsEnd",
            "exonCount",
            "exonStarts",
            "exonEnds",
            "score",
            "name2",
            "cdsStartStat",
            "cdsEndStat",
            "exonFrames",
        ]
        cols = ",".join(c[0] for c in cols if c[0] in accepted_cols)

        # MySQL query 2: download genePred
        command = f"SELECT {cols} FROM {annot};"
        if n:
            command = f"SELECT {cols} FROM {annot} LIMIT {n};"
        ret = query_ucsc(command, database=name)

        # clean up genePred: decode the columns that hold bytes
        df = pd.DataFrame.from_records(ret)
        for c in [8, 9, 14]:
            if c in df:
                df[c] = df[c].str.decode("utf-8")
        df.to_csv(pred_file, index=False, header=False, sep="\t")

        # convert genePred to GTF and BED
        # argument lists (no shell) keep paths with spaces intact
        sp.check_call(
            ["genePredToGtf", "-source=genomepy", "file", pred_file, gtf_file]
        )
        sp.check_call(["genePredToBed", pred_file, bed_file])
    finally:
        # the scratch dir lives inside the genome folder: never leave it behind
        rm_rf(tmp_dir)
def after_genome_download(self, genome, threads=1, force=False):
    """
    Build a hisat2 index for a downloaded genome.

    Nothing is done when ``cmd_ok("hisat2-build")`` is falsy, or when
    ``<index_name>.1.ht2`` already exists and ``force`` is not set.
    If the genome has a GTF annotation, splice site and exon files are
    generated in ``genome.genome_dir`` and passed to the index command.

    Parameters
    ----------
    genome
        object providing ``plugin``, ``filename``, ``genome_dir`` and
        ``annotation_gtf_file``
    threads : int, optional
        value passed to ``hisat2-build -p``
    force : bool, optional
        rebuild the index even if it already exists

    Raises
    ------
    subprocess.CalledProcessError
        if hisat2 cannot be located or an extraction script fails
    """
    index_name = genome.plugin["hisat2"]["index_name"]
    if not cmd_ok("hisat2-build") or (
        os.path.exists(f"{index_name}.1.ht2") and not force
    ):
        return

    index_dir = genome.plugin["hisat2"]["index_dir"]
    rm_rf(index_dir)
    mkdir_p(index_dir)

    # gunzip genome if bgzipped and return up-to-date genome name
    fname, bgzip = gunzip_and_name(genome.filename)
    try:
        # index command
        cmd = f"hisat2-build -p {threads} {fname} {index_name}"

        # if an annotation is present, generate a splice-aware index
        gtf_file = genome.annotation_gtf_file
        if gtf_file:
            # gunzip if gzipped
            gtf_file, gzip_file = gunzip_and_name(gtf_file)
            try:
                # the helper scripts sit next to the hisat2 executable;
                # check_output waits for `which` and closes its pipe
                hisat_path = (
                    sp.check_output(["which", "hisat2"]).decode("utf8").strip()
                )

                # generate splice and exon site files to enhance indexing
                splice_script = hisat_path + "_extract_splice_sites.py"
                splice_file = os.path.join(genome.genome_dir, "splice_sites.txt")
                sp.check_call(
                    f"python3 {splice_script} {gtf_file} > {splice_file}",
                    shell=True,
                )

                exon_script = hisat_path + "_extract_exons.py"
                exon_file = os.path.join(genome.genome_dir, "exon_sites.txt")
                sp.check_call(
                    f"python3 {exon_script} {gtf_file} > {exon_file}", shell=True
                )
            finally:
                # re-gzip annotation if gunzipped, also on failure
                gzip_and_name(gtf_file, gzip_file)

            # update index command with annotation
            cmd += f" --ss {splice_file} --exon {exon_file}"
        else:
            print("\nCreating Hisat2 index without annotation file.")

        # Create index
        run_index_cmd("hisat2", cmd)
    finally:
        # re-bgzip genome if gunzipped, also on failure
        bgzip_and_name(fname, bgzip)
def after_genome_download(self, genome, threads=1, force=False):
    """
    Build a minimap2 index for a downloaded genome.

    Nothing is done when ``cmd_ok("minimap2")`` is falsy. The index is only
    created when the index directory holds no ``.mmi`` file; ``force``
    empties that directory first.

    Parameters
    ----------
    genome
        object providing ``plugin`` and ``filename``
    threads : int, optional
        value passed to ``minimap2 -t``
    force : bool, optional
        remove the index directory before checking for an index
    """
    if not cmd_ok("minimap2"):
        return

    settings = genome.plugin["minimap2"]
    index_dir = settings["index_dir"]
    index_name = settings["index_name"]

    if force:
        # Start from scratch
        rm_rf(index_dir)
    mkdir_p(index_dir)

    # an existing .mmi file means there is nothing left to do
    for entry in os.listdir(index_dir):
        if entry.endswith(".mmi"):
            return

    run_index_cmd(
        "minimap2", f"minimap2 -t {threads} -d {index_name} {genome.filename}"
    )
def after_genome_download(self, genome, threads=1, force=False):
    """
    Build a bwa index for a downloaded genome.

    Nothing is done when ``cmd_ok("bwa")`` is falsy. The index is only
    created when the index directory holds no ``.bwt`` file; ``force``
    empties that directory first. The genome is symlinked to the index
    name, and ``bwa index`` is run on that link.

    Parameters
    ----------
    genome
        object providing ``plugin`` and ``filename``
    threads : int, optional
        accepted for a uniform plugin signature; not used by this command
    force : bool, optional
        remove the index directory before checking for an index
    """
    if not cmd_ok("bwa"):
        return

    settings = genome.plugin["bwa"]
    index_dir = settings["index_dir"]
    index_name = settings["index_name"]

    if force:
        # Start from scratch
        rm_rf(index_dir)
    mkdir_p(index_dir)

    # an existing .bwt file means there is nothing left to do
    for entry in os.listdir(index_dir):
        if entry.endswith(".bwt"):
            return

    if not os.path.exists(index_name):
        os.symlink(genome.filename, index_name)
    run_index_cmd("bwa", f"bwa index {index_name}")
def extract_tarball(fname, outfile=None, concat=True) -> Union[str, None]:
    """
    Convert tar of multiple FASTAs to one file.

    The archive is unpacked into a temporary directory next to ``outfile``
    and every extracted file is appended to ``outfile``.

    Parameters
    ----------
    fname : str
        path to the tar archive (any compression ``tarfile.open`` detects)
    outfile : str, optional
        path of the combined output file. If None, ``fname`` without its
        tar extension is used (or ``fname + ".extracted"`` if it has none).
    concat : bool, optional
        allow archives containing more than one file

    Returns
    -------
    str
        path of the written output file

    Raises
    ------
    ValueError
        if the archive holds multiple files and ``concat`` is False
    """
    # local import: only this function needs the context-manager variant
    from tempfile import TemporaryDirectory

    if outfile is None:
        for suffix in (".tar.gz", ".tgz", ".tar.bz2", ".tar.xz", ".tar"):
            if fname.endswith(suffix):
                outfile = fname[: -len(suffix)]
                break
        else:
            outfile = fname + ".extracted"

    # the context manager removes the directory on success and on error
    with TemporaryDirectory(dir=os.path.dirname(outfile)) as tmp_dir:
        with tarfile.open(fname) as tar:
            if hasattr(tarfile, "data_filter"):
                # archives are downloaded: refuse members escaping tmp_dir
                tar.extractall(path=tmp_dir, filter="data")
            else:
                # older Python without extraction filters
                tar.extractall(path=tmp_dir)

        extracted = [
            os.path.join(root, member)
            for root, _, files in os.walk(tmp_dir)
            for member in files
        ]
        if len(extracted) > 1 and not concat:
            raise ValueError(
                "tarball contains multiple files, but concat not specified!"
            )

        # Concatenate (also works with one file)
        with open(outfile, "w") as out:
            for infile in extracted:
                with open(infile) as handle:
                    out.writelines(handle)

    return outfile
def generate_annot(template, target, overwrite=False):
    """
    Create an annotation file type from the other file type.

    A GTF template is converted to BED and a BED template to GTF, using the
    UCSC genePred tools. The result is written to a temporary directory next
    to ``target`` and moved into place once complete.

    Parameters
    ----------
    template: str
        a GTF or BED filepath.
    target: str
        filepath to save the new annotation to.
        A name ending in ".gz" requests a gzipped result.
    overwrite: bool, optional
        overwrite existing target file?

    Raises
    ------
    ValueError
        if the template file name contains neither "gtf" nor "bed"
    FileExistsError
        if the target exists and overwrite is False
    subprocess.CalledProcessError
        if the conversion command fails
    """
    exts = os.path.basename(template.lower()).split(".")
    exts = [e for e in exts if e in ["gtf", "bed"]]
    if not exts:
        raise ValueError("Template file must be in GTF or BED format.")
    template_ext = exts[-1]

    if not overwrite and os.path.exists(target):
        raise FileExistsError(f"{target} already exists! Set overwrite=True to ignore.")

    target_dir = os.path.dirname(target)
    tmp_dir = mkdtemp(dir=target_dir)
    try:
        tmp_target = os.path.join(tmp_dir, "new_annot")

        if template_ext == "bed":
            cmd = "bedToGenePred {0} /dev/stdout | genePredToGtf -source=genomepy file /dev/stdin {1}"
        else:
            cmd = "gtfToGenePred -genePredExt -ignoreGroupsWithoutExons {0} /dev/stdout | genePredToBed /dev/stdin {1}"

        # unzip template if needed
        with extracted_file(template) as _template:
            sp.check_call(cmd.format(_template, tmp_target), shell=True)

        # gzip if needed
        tmp_target = gzip_and_name(tmp_target, target.endswith(".gz"))
        shutil.move(tmp_target, target)
    finally:
        # never leave the scratch directory next to the target
        rm_rf(tmp_dir)
def after_genome_download(self, genome, threads=1, force=False):
    """
    Build a STAR index for a downloaded genome.

    Nothing is done when ``cmd_ok("STAR")`` is falsy, or when the index path
    already exists and ``force`` is not set. The genome (and annotation, if
    any) are decompressed for the index command and compressed again after.

    Parameters
    ----------
    genome
        object providing ``plugin``, ``filename`` and ``annotation_gtf_file``
    threads : int, optional
        value passed to STAR's ``--runThreadN``
    force : bool, optional
        rebuild the index even if the index path already exists
    """
    star_index = genome.plugin["star"]["index_name"]
    if not cmd_ok("STAR"):
        return
    if os.path.exists(star_index) and not force:
        return

    index_dir = genome.plugin["star"]["index_dir"]
    rm_rf(index_dir)
    mkdir_p(index_dir)

    # gunzip genome if bgzipped and return up-to-date genome name
    fasta, rezip_genome = gunzip_and_name(genome.filename)

    star_cmd = " ".join(
        [
            "STAR --runMode genomeGenerate",
            f"--runThreadN {threads}",
            f"--genomeFastaFiles {fasta}",
            f"--genomeDir {index_dir}",
            f"--outFileNamePrefix {index_dir}",
        ]
    )

    # an annotation, if present, yields a splice-aware index
    annotation = genome.annotation_gtf_file
    rezip_annotation = False
    if not annotation:
        print("\nCreating STAR index without annotation file.")
    else:
        # gunzip if gzipped
        annotation, rezip_annotation = gunzip_and_name(annotation)
        star_cmd = f"{star_cmd} --sjdbGTFfile {annotation}"

    run_index_cmd("star", star_cmd)

    # restore the compression state of genome and annotation
    bgzip_and_name(fasta, rezip_genome)
    if annotation:
        gzip_and_name(annotation, rezip_annotation)
def head_annotations(name: str, provider=None, n: int = 2):
    """
    Show the first lines of every annotation available for a genome.

    Each online provider that lists the genome is asked to display its
    annotation(s), using a throwaway directory for any downloads.

    For UCSC, up to 4 gene annotation styles are available:
    "ncbiRefSeq", "refGene", "ensGene", "knownGene" (respectively).

    For NCBI, the chromosome names are not yet sanitized.

    Parameters
    ----------
    name: str
        genome name
    provider: str, optional
        only search the specified provider for the genome name
    n: int, optional
        number of lines to show
    """
    for source in online_providers(provider):
        if name not in source.genomes:
            continue

        scratch = mkdtemp()
        source.head_annotation(name, genomes_dir=scratch, n=n)
        rm_rf(scratch)
def _apply_fasta_regex_func(infa, regex_func, outfa=None):
    """
    Filter the contigs of a FASTA file with a predicate on their headers.

    Parameters
    ----------
    infa : str
        path to genome fasta
    regex_func : callable
        takes a contig header line and returns a bool; sequence lines are
        written only while the latest header's result is truthy
    outfa : str, optional
        path to output fasta. If None, infa is overwritten

    Returns
    -------
    list
        names of the contigs for which regex_func returned False
    """
    # all intermediate files live in a scratch dir next to the input
    scratch = mkdtemp(dir=os.path.dirname(infa))
    filtered_path = os.path.join(scratch, "filtered")
    if outfa is None:
        # overwriting in place: park the original in the scratch dir
        source_path = os.path.join(scratch, "original")
    else:
        source_path = infa
    shutil.move(infa, source_path)

    # perform the filtering
    dropped = []
    keep = True
    with open(source_path) as source, open(filtered_path, "w") as sink:
        progress = tqdm(source, desc="Filtering Fasta", unit_scale=1, unit=" lines")
        for line in progress:
            if line[0] == ">":
                keep = regex_func(line)
                if keep is False:
                    contig = line[1:].split(" ")[0].strip()
                    dropped.append(contig)
            if keep:
                sink.write(line)

    # put the filtered file in its final location
    destination = outfa if outfa else infa
    shutil.move(filtered_path, destination)
    rm_rf(scratch)

    return dropped
def download_annotation(genomes_dir, annot_url, localname, n=None):
    """
    Download an annotation file, convert it to an intermediate genePred
    file and generate the GTF and BED output files.

    The work is done in a temporary directory inside the genome folder; the
    resulting ``<localname>.annotation.gtf`` and ``.bed`` are moved to
    ``<genomes_dir>/<localname>``.

    Parameters
    ----------
    genomes_dir : str
        directory containing the genome folders
    annot_url : str
        URL of the annotation, or path to an existing local file
    localname : str
        genome folder name and output file prefix
    n : int, optional
        if given, only the head of the annotation is downloaded
        (via ``download_head``) instead of the whole file

    Raises
    ------
    TypeError
        if the file extension is not a bed, gff, gtf or txt variant
    subprocess.CalledProcessError
        if a conversion command fails
    """
    # create output directory if missing
    out_dir = os.path.join(genomes_dir, localname)
    mkdir_p(out_dir)

    # download to tmp dir. Move genome on completion.
    # tmp dir is in genome_dir to prevent moving the genome between disks
    tmp_dir = mkdtemp(dir=out_dir)
    try:
        ext, is_compressed = get_file_info(annot_url)

        annot_file = os.path.join(tmp_dir, localname + ".annotation" + ext)
        tmp_annot_file = os.path.join(tmp_dir, annot_url.split("/")[-1])
        get_file = shutil.copyfile if os.path.exists(annot_url) else download_file
        if n is None:
            get_file(annot_url, tmp_annot_file)
        else:
            download_head(annot_url, tmp_annot_file, n)
            is_compressed = False

        # unzip input file (if needed)
        if is_compressed:
            annot_file = extract_archive(tmp_annot_file, outfile=annot_file)
        else:
            shutil.move(tmp_annot_file, annot_file)

        def with_ext(new_ext):
            # swap only the trailing extension: str.replace would also
            # rewrite matching text in the directory or localname
            if ext and annot_file.endswith(ext):
                return annot_file[: -len(ext)] + new_ext
            return annot_file.replace(ext, new_ext)

        # generate intermediate file (GenePred)
        pred_file = with_ext(".gp")
        if "bed" in ext:
            cmd = "bedToGenePred {0} {1}"
        elif "gff" in ext:
            # example annotation: GRCh38.p12 from NCBI
            cmd = "gff3ToGenePred -useName -warnAndContinue {0} {1}"
        elif "gtf" in ext:
            cmd = "gtfToGenePred -genePredExt -allErrors -ignoreGroupsWithoutExons {0} {1}"
        elif "txt" in ext:
            # UCSC annotations only
            with open(annot_file) as f:
                cols = f.readline().split("\t")

            # extract the genePred format columns
            # (the strand column is used to locate where they begin)
            start_col = 1
            for i, col in enumerate(cols):
                if col in ["+", "-"]:
                    start_col = i - 1
                    break
            end_col = start_col + 10
            cmd = (
                f"""cat {{0}} | cut -f {start_col}-{end_col} | """
                # knownGene.txt.gz has spotty fields, this replaces non-integer fields with zeroes
                + """awk 'BEGIN {{FS=OFS="\t"}} !($11 ~ /^[0-9]+$/) {{$11="0"}}1' > {1}"""
            )
        else:
            raise TypeError(f"file type extension {ext} not recognized!")

        if n is None and "gencode" in annot_url:
            rename_contigs(annot_file)

        sp.check_call(cmd.format(annot_file, pred_file), shell=True)

        # generate gtf file (if required)
        gtf_file = with_ext(".gtf")
        if "gtf" not in ext:
            cmd = "genePredToGtf -source=genomepy file {0} {1}"
            sp.check_call(cmd.format(pred_file, gtf_file), shell=True)

        # generate bed file (if required)
        bed_file = with_ext(".bed")
        if "bed" not in ext:
            cmd = "genePredToBed {0} {1}"
            sp.check_call(cmd.format(pred_file, bed_file), shell=True)

        # transfer the files from the tmpdir to the genome_dir
        for f in [gtf_file, bed_file]:
            src = f
            dst = os.path.join(out_dir, os.path.basename(f))
            shutil.move(src, dst)
    finally:
        # the scratch dir lives inside the genome folder: never leave it behind
        rm_rf(tmp_dir)
def clean():
    """Empty the provider cache of the running genomepy version."""
    cache_root = user_cache_dir("genomepy")
    versioned_cache = os.path.join(cache_root, __version__)

    # delete the cache directory, then recreate it empty
    rm_rf(versioned_cache)
    mkdir_p(versioned_cache)
    print("All clean!")
def _delete_extensions(directory: str, exts: list):
    """Delete the (gzipped) files in a directory that match any given extension."""
    for extension in exts:
        for path in glob_ext_files(directory, extension):
            rm_rf(path)
def download_and_generate_annotation(genomes_dir, annot_url, localname):
    """
    Download an annotation file, convert it to an intermediate genePred
    file and generate gzipped GTF and BED output files.

    The work is done in a temporary directory inside the genome folder; the
    resulting ``<localname>.annotation.gtf.gz`` and ``.bed.gz`` are moved to
    ``<genomes_dir>/<localname>``.

    Parameters
    ----------
    genomes_dir : str
        directory containing the genome folders
    annot_url : str
        URL of the annotation file
    localname : str
        genome folder name and output file prefix

    Raises
    ------
    TypeError
        if the file extension is not a bed, gff, gtf or txt variant
    subprocess.CalledProcessError
        if a conversion or (de)compression command fails
    """
    # create output directory if missing
    out_dir = os.path.join(genomes_dir, localname)
    if not os.path.exists(out_dir):
        mkdir_p(out_dir)

    # download to tmp dir. Move genome on completion.
    # tmp dir is in genome_dir to prevent moving the genome between disks
    tmp_dir = mkdtemp(dir=out_dir)
    try:
        ext, gz = get_file_info(annot_url)
        annot_file = os.path.join(tmp_dir, localname + ".annotation" + ext)
        download_file(annot_url, annot_file)

        # unzip input file (if needed)
        if gz:
            # the download lacks the .gz suffix gunzip expects: add it first
            cmd = "mv {0} {1} && gunzip -f {1}"
            sp.check_call(cmd.format(annot_file, annot_file + ".gz"), shell=True)

        def with_ext(new_ext):
            # swap only the trailing extension: str.replace would also
            # rewrite matching text in the directory or localname
            if ext and annot_file.endswith(ext):
                return annot_file[: -len(ext)] + new_ext
            return annot_file.replace(ext, new_ext)

        # generate intermediate file (GenePred)
        pred_file = with_ext(".gp")
        if "bed" in ext:
            cmd = "bedToGenePred {0} {1}"
        elif "gff" in ext:
            cmd = "gff3ToGenePred -geneNameAttr=gene {0} {1}"
        elif "gtf" in ext:
            cmd = "gtfToGenePred -ignoreGroupsWithoutExons {0} {1}"
        elif "txt" in ext:
            # UCSC annotations only
            with open(annot_file) as f:
                cols = f.readline().split("\t")

            # extract the genePred format columns
            # (the strand column is used to locate where they begin)
            start_col = 1
            for i, col in enumerate(cols):
                if col in ["+", "-"]:
                    start_col = i - 1
                    break
            end_col = start_col + 10
            cmd = (
                f"""cat {{0}} | cut -f {start_col}-{end_col} | """
                # knownGene.txt.gz has spotty fields, this replaces non-integer fields with zeroes
                + """awk 'BEGIN {{FS=OFS="\t"}} !($11 ~ /^[0-9]+$/) {{$11="0"}}1' > {1}"""
            )
        else:
            raise TypeError(f"file type extension {ext} not recognized!")

        sp.check_call(cmd.format(annot_file, pred_file), shell=True)

        # generate gzipped gtf file (if required)
        gtf_file = with_ext(".gtf")
        if "gtf" not in ext:
            cmd = "genePredToGtf -source=genomepy file {0} {1} && gzip -f {1}"
            sp.check_call(cmd.format(pred_file, gtf_file), shell=True)

        # generate gzipped bed file (if required)
        bed_file = with_ext(".bed")
        if "bed" not in ext:
            cmd = "genePredToBed {0} {1} && gzip -f {1}"
            sp.check_call(cmd.format(pred_file, bed_file), shell=True)

        # if input file was gtf/bed, gzip it
        if ext in [".gtf", ".bed"]:
            cmd = "gzip -f {}"
            sp.check_call(cmd.format(annot_file), shell=True)

        # transfer the files from the tmpdir to the genome_dir
        for f in [gtf_file + ".gz", bed_file + ".gz"]:
            src = f
            dst = os.path.join(out_dir, os.path.basename(f))
            shutil.move(src, dst)
    finally:
        # the scratch dir lives inside the genome folder: never leave it behind
        rm_rf(tmp_dir)
def download_genome(
    self,
    name,
    genomes_dir=None,
    localname=None,
    mask="soft",
    keep_alt=False,
    regex=None,
    invert_match=False,
    bgzip=None,
    **kwargs,
):
    """
    Download a (gzipped) genome file to a specific directory

    The genome is downloaded and processed in a temporary directory inside
    the genome folder, moved into place, and described in a README.txt.

    Parameters
    ----------
    name : str
        Genome / species name

    genomes_dir : str , optional
        Directory to install genome

    localname : str , optional
        Custom name for your genome

    mask: str , optional
        Masking, soft, hard or none (all other strings)

    keep_alt : bool , optional
        Set to true to keep these alternative regions.

    regex : str , optional
        Regular expression to select specific chromosome / scaffold names.

    invert_match : bool , optional
        Set to True to select all chromosomes that don't match the regex.

    bgzip : bool , optional
        If set to True the genome FASTA file will be compressed using bgzip.
        If not specified, the setting from the configuration file will be used.

    **kwargs
        passed on to ``self.get_genome_download_link``

    Raises
    ------
    subprocess.CalledProcessError
        if gunzip or the bgzip command exits with a non-zero status
    """
    name = safe(name)
    self.check_name(name)

    link = self.get_genome_download_link(name, mask=mask, **kwargs)

    localname = get_localname(name, localname)
    genomes_dir = get_genomes_dir(genomes_dir, check_exist=False)
    out_dir = os.path.join(genomes_dir, localname)
    if not os.path.exists(out_dir):
        mkdir_p(out_dir)

    sys.stderr.write(f"Downloading genome from {self.name}.\nTarget URL: {link}...\n")

    def regex_filter(_fname, _regex, _v):
        """Filter _fname in place; return the contig names that were removed."""
        infa = _fname + "_to_regex"
        os.rename(_fname, infa)
        # filter the fasta and store the output's keys
        keys_out = filter_fasta(
            infa, outfa=_fname, regex=_regex, v=_v, force=True
        ).keys()
        keys_in = Fasta(infa).keys()
        return [k for k in keys_in if k not in keys_out]

    # download to tmp dir. Move genome on completion.
    # tmp dir is in genome_dir to prevent moving the genome between disks
    tmp_dir = mkdtemp(dir=out_dir)
    try:
        fname = os.path.join(tmp_dir, f"{localname}.fa")

        urlcleanup()
        download_file(link, fname)
        sys.stderr.write("Genome download successful, starting post processing...\n")

        # unzip genome
        if link.endswith(".tar.gz"):
            tar_to_bigfile(fname, fname)
        elif link.endswith(".gz"):
            os.rename(fname, fname + ".gz")
            # check_call raises CalledProcessError on a non-zero exit status
            sp.check_call(["gunzip", "-f", fname])

        not_included = []
        # remove alternative regions
        if not keep_alt:
            not_included.extend(regex_filter(fname, "alt", True))

        # keep/remove user defined regions
        if regex:
            not_included.extend(regex_filter(fname, regex, invert_match))

        # process genome (e.g. masking)
        if hasattr(self, "_post_process_download"):
            self._post_process_download(
                name=name, localname=localname, out_dir=tmp_dir, mask=mask
            )

        # bgzip genome if requested
        # an explicit argument wins; the config is only the fallback
        if bgzip is None:
            bgzip = config.get("bgzip")
        if bgzip:
            # bgzip to stdout, track progress, and output to file
            fsize = int(os.path.getsize(fname) * 10**-6)
            cmd = (
                f"bgzip -fc {fname} | "
                f"tqdm --bytes --desc Bgzipping {fsize}MB fasta --log ERROR | "
                f"cat > {fname}.gz"
            )
            # check_call raises CalledProcessError on a non-zero exit status
            sp.check_call(cmd, shell=True)
            fname += ".gz"

        # transfer the genome from the tmpdir to the genome_dir
        src = fname
        dst = os.path.join(genomes_dir, localname, os.path.basename(fname))
        shutil.move(src, dst)
    finally:
        # the scratch dir lives inside the genome folder: never leave it behind
        rm_rf(tmp_dir)

    sys.stderr.write("\n")
    sys.stderr.write("name: {}\n".format(name))
    sys.stderr.write("local name: {}\n".format(localname))
    sys.stderr.write("fasta: {}\n".format(dst))

    # Create readme with information
    readme = os.path.join(genomes_dir, localname, "README.txt")
    metadata = {
        "name": localname,
        "provider": self.name,
        "original name": name,
        "original filename": os.path.split(link)[-1],
        "assembly_accession": self.assembly_accession(self.genomes.get(name)),
        "tax_id": self.genome_taxid(self.genomes.get(name)),
        "mask": mask,
        "genome url": link,
        "annotation url": "na",
        "date": time.strftime("%Y-%m-%d %H:%M:%S"),
    }
    # describe the applied filters and list the sequences they removed
    lines = []
    if not keep_alt or regex:
        regex_line = "regex: "
        if not keep_alt:
            regex_line += "'alt' (inverted match)"

        if not keep_alt and regex:
            regex_line += " and "

        if regex:
            regex_line += f"'{regex}'"
            if invert_match:
                regex_line += " (inverted match)"

        lines += ["", regex_line, "sequences that were excluded:"]
        for seq in not_included:
            lines.append(f"\t{seq}")
    write_readme(readme, metadata, lines)