def pygmes(self, fasta, db):
    """Predict proteins for ``fasta`` with pygmes, reusing existing results.

    Results live in ``<cfg["outdir"]>/workfiles/pygmes``. pygmes is launched
    only when one of the two expected output files is missing or when
    ``file.isnewer(fasta, faafile)`` reports the input as newer than the
    protein file. As a side effect ``self._clean_fasta`` is set to the
    ``gmesclean_<basename of fasta>`` path inside that directory.

    Args:
        fasta: Path of the input FASTA file.
        db: Forwarded unchanged to pygmes as its ``db`` keyword.

    Returns:
        Tuple ``(faafile, bedfile)`` with the paths of the predicted
        proteins and their BED file.

    Raises:
        SystemExit: With code 1, after ``self.write_outfile()`` was called,
            when either output file is missing or empty.
    """
    outdir = os.path.join(self.cfg["outdir"], "workfiles", "pygmes")
    faafile = os.path.join(outdir, "predicted_proteins.faa")
    bedfile = os.path.join(outdir, "predicted_proteins.bed")
    self._clean_fasta = os.path.join(
        outdir, "gmesclean_{}".format(os.path.basename(fasta))
    )

    # Function-scope import under an alias, so the callable is not confused
    # with this method of the same name.
    from pygmes import pygmes as run_pygmes

    # Launch only if an output is missing or the input is newer. The helper
    # is used at class level, like every other `file.*` call in this file.
    need_run = (
        not file.exists(faafile)
        or not file.exists(bedfile)
        or file.isnewer(fasta, faafile)
    )
    if need_run:
        run_pygmes(fasta, outdir, db=db, clean=True, ncores=self.cfg["ncores"])

    # Check that the prediction actually produced both files.
    if not (os.path.exists(faafile) and os.path.exists(bedfile)):
        logging.warning("No predicted proteins, pyfaidx failed")
        self.write_outfile()
        # SystemExit instead of the site-provided exit(), which is meant for
        # interactive use and is not guaranteed to be defined.
        raise SystemExit(1)

    if os.stat(faafile).st_size == 0 or os.stat(bedfile).st_size == 0:
        logging.warning("No predicted proteins")
        self.write_outfile()
        raise SystemExit(1)

    return (faafile, bedfile)
def runPlacedHMM(self, hmmfile, proteinfaa, bedfile):
    """Run hmmsearch with ``hmmfile`` on ``proteinfaa`` and reduce the hits.

    Works inside ``<cfg["outdir"]>/workfiles/hmmer/estimations``. The search
    and the subsequent clean-up are performed when the hmmer step reports
    that it has to run, when ``cfg["fplace"]`` is set, or when
    ``file.isnewer(hmmfile, placement.tsv)`` is true.

    Args:
        hmmfile: HMM file handed to the search as ``hmmfiles``.
        proteinfaa: Protein FASTA given to the ``hmmer`` step as its input.
        bedfile: BED file handed to ``h.clean`` together with the raw hits.

    Returns:
        Whatever ``h.clean`` returns after a fresh run, otherwise the
        ``hits.tsv`` path inside the working directory.
    """
    work_dir = os.path.join(
        self.cfg["outdir"], "workfiles", "hmmer", "estimations"
    )
    # Called for its side effect only; presumably ensures the directory
    # exists -- TODO confirm against the `file` helper.
    file.isdir(work_dir)

    table_path = os.path.join(work_dir, "placement.tsv")
    stdout_path = os.path.join(work_dir, "placement.out")
    hits_path = os.path.join(work_dir, "hits.tsv")

    h = hmmer(
        "hmmsearch",
        proteinfaa,
        table_path,
        self.cfg["debug"],
        touch=self.cfg["touch"],
    )

    # Same left-to-right short-circuit order as a plain `or` chain.
    must_run = (
        h.doIneedTorun(self.cfg["force"])
        or self.cfg["fplace"]
        or file.isnewer(hmmfile, table_path)
    )
    if not must_run:
        return hits_path

    logging.info("Running hmmer for chosen locations")
    h.run(
        stdout_path,
        hmmfiles=hmmfile,
        modus=self.cfg["dbinfo"]["modus"],
        evalue=self.cfg["evalue"],
        cores=self.cfg["ncores"],
        training=self.cfg["training"],
    )

    # Post-process the raw hmmer output.
    logging.info("Processing Hmmer results")
    return h.clean(table_path, bedfile, hits_path, self.cfg["mindist"])
def gmes(self, fasta):
    """Predict proteins with GeneMark-ES and build a BED file from its GTF.

    Works inside ``<cfg["outdir"]>/workfiles/gmes``. As a side effect
    ``self._clean_fasta`` is set to the absolute path of ``input.fna`` in
    that directory.

    Args:
        fasta: Path of the input FASTA file.

    Returns:
        Tuple ``(gmesOut, bedf)``: the ``prot_seq.faa`` path and the BED
        file (the return value of ``base.gmesBED`` when it was regenerated,
        otherwise the ``proteins.bed`` path).

    Raises:
        SystemExit: With code 1, after ``self.write_outfile()`` was called,
            when ``genemark.gtf`` does not exist after the run step.
    """
    logging.debug("Starting gmes function")
    gmesDir = os.path.join(self.cfg["outdir"], "workfiles", "gmes")
    # Called for its side effect only; presumably ensures the directory
    # exists -- TODO confirm against the `file` helper.
    file.isdir(gmesDir)
    gmesOut = os.path.join(gmesDir, "prot_seq.faa")
    gtffile = os.path.join(gmesDir, "genemark.gtf")
    inputfasta = os.path.abspath(os.path.join(gmesDir, "input.fna"))

    # GeneMark-ES step; inside this method `gmes` resolves to the
    # module-level name, not to this method.
    g = gmes("runGMES", fasta, [gtffile, gmesOut], touch=self.cfg["touch"])
    logging.debug("Defined gmes run")

    if g.doIneedTorun(self.cfg["force"]):
        if self.cfg["touch"]:
            g.input = inputfasta
        else:
            # Rename the fasta entries so they contain no white space; the
            # step then runs on the file returned by the helper.
            g.input = base.clearFastaNames(fasta, inputfasta)
        logging.info("Running GeneMark-ES")
        g.run(cores=self.cfg["ncores"])
    else:
        logging.debug("I do not need to run gmes, output exists:")
        logging.debug(gtffile)

    # Always check for the GTF: without it GeneMark-ES failed and the
    # pipeline stops here.
    if not file.exists(gtffile):
        logging.error("GeneMark-ES failed on this bin")
        self.write_outfile()
        # SystemExit instead of the site-provided exit(), which is meant for
        # interactive use and is not guaranteed to be defined.
        raise SystemExit(1)

    if self.cfg["clean"]:
        # Remove GeneMark-ES temporary directories.
        _tmpdirs = ["data", "run", "info", "output/data", "output/gmhmm"]
        g.cleanup([os.path.join(gmesDir, x) for x in _tmpdirs])

    # Make a BED file from the GTF.
    bedf = os.path.join(gmesDir, "proteins.bed")
    # Parentheses spell out the precedence the expression always had:
    # `force` alone triggers extraction, `touch` only suppresses the
    # isnewer-triggered case.
    # NOTE(review): confirm that force is meant to override touch here.
    if self.cfg["force"] or (
        file.isnewer(gtffile, bedf) and not self.cfg["touch"]
    ):
        logging.info("Extracting protein locations")
        bedf = base.gmesBED(gtffile, bedf)

    # Touch the files expected by the next step.
    if self.cfg["touch"]:
        g.touch([bedf, gmesOut])

    self._clean_fasta = inputfasta
    return (gmesOut, bedf)
def doIneedTorun(self, force=False):
    """Decide whether this step has to be executed.

    Args:
        force: When true the step is always reported as needing to run.

    Returns:
        ``True`` when ``force`` or ``self.touchonly`` is set. Otherwise the
        first truthy result of ``file.isnewer(self.input, p)`` over the
        entries of ``self.output_test``; if none is truthy, the last
        (falsy) result, or ``False`` when ``self.output_test`` is empty.
    """
    logging.debug("Testing if I need to run this step")
    if force or self.touchonly:
        return True

    # Start from a defined falsy value so that an empty output list yields
    # False instead of an UnboundLocalError on the final return.
    stale = False
    for p in self.output_test:
        stale = file.isnewer(self.input, p)
        if stale:
            logging.debug(f"Need to run because of file: {p}")
            return stale
    return stale