def gmes(self, fasta):
    """
    Predict proteins with GeneMark-ES and derive a BED file from its GTF.

    All files are placed in ``<outdir>/workfiles/gmes``. The GeneMark-ES
    runner is only executed when ``doIneedTorun`` says so (taking the
    ``force`` setting into account). If no GTF file exists afterwards the
    failure is logged, ``self.write_outfile()`` is called and the process
    exits with status 1.

    Side effect: ``self._clean_fasta`` is set to the path of the
    (renamed) input FASTA inside the work directory.

    :param fasta: path to the genome FASTA to predict proteins for
    :return: tuple ``(protein_fasta_path, bed_path)``
    :raises SystemExit: if GeneMark-ES did not produce a GTF file
    """
    logging.debug("Starting gmes function")
    touch = self.cfg["touch"]
    force = self.cfg["force"]

    gmesDir = os.path.join(self.cfg["outdir"], "workfiles", "gmes")
    # project helper; presumably makes sure the directory exists
    file.isdir(gmesDir)
    gmesOut = os.path.join(gmesDir, "prot_seq.faa")
    gtffile = os.path.join(gmesDir, "genemark.gtf")
    inputfasta = os.path.abspath(os.path.join(gmesDir, "input.fna"))

    # GeneMark-ES
    # NOTE: inside this method the name ``gmes`` resolves to the
    # module-level runner, not to this method.
    g = gmes("runGMES", fasta, [gtffile, gmesOut], touch=touch)
    logging.debug("Defined gmes run")
    if g.doIneedTorun(force):
        if touch:
            # touch mode: only point the runner at the expected input path
            g.input = inputfasta
        else:
            # rename fasta entries, so we don't have white spaces in them
            g.input = base.clearFastaNames(fasta, inputfasta)
        logging.info("Running GeneMark-ES")
        g.run(cores=self.cfg["ncores"])
    else:
        logging.debug("I do not need to run gmes, output exists:")
        logging.debug(gtffile)

    # always check if gtffile exists, if not GeneMark-ES failed and
    # we can stop here
    if not file.exists(gtffile):
        # log and document the failure, then stop the pipeline
        logging.error("GeneMark-ES failed on this bin")
        self.write_outfile()
        # SystemExit instead of the site-provided ``exit`` helper, which is
        # meant for interactive use and is not guaranteed to be defined
        raise SystemExit(1)
    elif self.cfg["clean"]:
        # clean temp dirs
        _tmpdirs = ["data", "run", "info", "output/data", "output/gmhmm"]
        tempdirs = [os.path.join(gmesDir, x) for x in _tmpdirs]
        g.cleanup(tempdirs)

    # make a bed file from GTF
    bedf = os.path.join(gmesDir, "proteins.bed")
    # The original condition ``force or isnewer and not touch`` bound as
    # ``force or (isnewer and not touch)``, so force + touch still ran the
    # extraction. In touch mode the BED file is only touched below.
    if not touch and (force or file.isnewer(gtffile, bedf)):
        logging.info("Extracting protein locations")
        bedf = base.gmesBED(gtffile, bedf)

    # touch files expected for next step
    if touch:
        g.touch([bedf, gmesOut])

    self._clean_fasta = inputfasta

    return (gmesOut, bedf)
def concatHMM(self):
    """
    Build, or reuse, the concatenated and pressed HMM file for estimations.

    The profile names are collected from ``<db>/sets/<node>.set`` (one name
    per line) for every placement of the configured ``placementMethod``.
    A SHA-256 hash of the sorted profile names is compared with the hash
    stored in ``all.hash`` by a previous run; if they match, the existing
    ``all.hmm`` is reused. Otherwise the individual profiles from
    ``<db>/hmms/panther/<profile>.hmm`` are concatenated, pressed and the
    new hash is written.

    In touch mode the output file is only touched and returned.

    :return: path to the concatenated HMM file
    :raises SystemExit: if no profile could be collected
    """
    # create a dir for this
    hmmdir = os.path.join(self.cfg["outdir"], "workfiles", "hmmer", "estimations")
    # project helper; presumably makes sure the directory exists
    file.isdir(hmmdir)
    hmmconcat = os.path.join(hmmdir, "all.hmm")

    if self.cfg["touch"]:
        file.touch(hmmconcat)
        return hmmconcat

    # collect the unique profile names of all placements
    profiles = set()
    for p in self.placements[self.cfg["placementMethod"]]:
        localpath = os.path.join(self.cfg["db"], "sets", "{}.set".format(p["node"]))
        with open(localpath) as f:
            # skip blank lines, they would turn into a bogus ".hmm" path
            profiles.update(name for name in (line.strip() for line in f) if name)

    # sorted list, so the hash does not depend on iteration order
    profiles = sorted(profiles)

    # check if we already have the hmm for exactly this set of profiles
    profilehash = hashlib.sha256("_".join(profiles).encode()).hexdigest()
    hashpath = os.path.join(hmmdir, "all.hash")
    if file.exists(hashpath):
        with open(hashpath) as f:
            # readline() yields "" for an empty file, so prevhash is always
            # bound (the former for/break loop left it undefined in that case)
            prevhash = f.readline().strip()
        if prevhash == profilehash:
            # we can use the existing file, so no need to continue
            log("Using pressed hmms from last run")
            return hmmconcat

    if not profiles:
        logging.error("We have no profiles to evaluate")
        # SystemExit instead of the site-provided ``exit`` helper, which is
        # meant for interactive use and is not guaranteed to be defined
        raise SystemExit(1)

    # create all paths for all hmms
    hmmerpaths = [
        os.path.join(self.cfg["db"], "hmms", "panther", "{}.hmm".format(profile))
        for profile in profiles
    ]

    # concatenate
    log("{} hmm profiles need to be used for estimations".format(len(profiles)))
    log("Concatenating hmms, this might take a while (IO limited)")
    hmmconcat = base.concatenate(hmmconcat, hmmerpaths)

    # press
    log("Pressing hmms")
    hp = hmmpress("hmmpress", hmmconcat, None, touch=self.cfg["touch"])
    hp.run()

    # save profile hash so the next run can reuse the pressed file
    with open(hashpath, "w") as f:
        f.write(profilehash)

    return hmmconcat