def estimate(self, hits, outfile, placements):
    """Estimate completeness and contamination for each placement and
    write the results as a TSV table to ``outfile``.

    :param hits: path to a TSV of profile hits (must have a "profile" column)
    :param outfile: path the estimates table is written to
    :param placements: list of placement dicts; each needs a "node" key and
        is updated in place with "completeness", "contamination" and "file"
    :return: True on success, None when cfg["touch"] short-circuits the run
    """
    logging.info("Estimating scores now")
    if self.cfg["touch"]:
        # touch mode: only create the output file, do no work
        file.touch(outfile)
        logging.info("Returning as we only touch")
        return

    # count how often each profile was hit
    hit = {}
    for row in base.readTSV(hits):
        hit[row["profile"]] = hit.get(row["profile"], 0) + 1

    # profiles seen at least once / more than once
    singletons = set(hit)
    multitons = {k for k, v in hit.items() if v > 1}

    # now we can estimate completeness and contamination for each placement
    for p in placements:
        s = self.readSet(p["node"])
        if s:
            # completeness is the overlap of both sets
            cmpl = len(singletons & s) / len(s)
            cont = len(multitons & s) / len(s)
        else:
            # guard: an empty profile set would raise ZeroDivisionError
            cmpl = 0.0
            cont = 0.0
        # make to percentage and round to 2 positions
        p["completeness"] = round(cmpl * 100, 2)
        p["contamination"] = round(cont * 100, 2)

    log("Finished estimating")

    # write to output file
    k = [
        "completeness",
        "contamination",
        "node",
        "n",
        "ngenomes",
        "cover",
        "nPlacements",
        "taxid",
        "lineage",
        "taxidlineage",
        "file",
    ]
    with open(outfile, "w") as f:
        f.write("{}\n".format("\t".join(k)))
        for p in placements:
            # insert the file name
            p["file"] = self.cfg["name"]
            # write to file
            f.write("{}\n".format("\t".join([str(p[key]) for key in k])))

    log("Wrote estimates to: {}".format(outfile))
    # done
    return True
def concatHMM(self):
    """Concatenate the hmm profiles of all placements into one file,
    press it with hmmpress, and return the path to the concatenated file.

    Reuses the previous run's pressed file when the sha256 over the
    sorted profile names matches the hash stored next to it.

    :return: path to the concatenated hmm file
    """
    # create a dir for this
    hmmdir = os.path.join(self.cfg["outdir"], "workfiles", "hmmer", "estimations")
    file.isdir(hmmdir)
    hmmconcat = os.path.join(hmmdir, "all.hmm")
    if self.cfg["touch"]:
        # touch mode: only create the output file, do no work
        file.touch(hmmconcat)
        return hmmconcat

    # collect profile names from the set files of all placements
    profiles = set()
    for p in self.placements[self.cfg["placementMethod"]]:
        localpath = os.path.join(self.cfg["db"], "sets", "{}.set".format(p["node"]))
        with open(localpath) as f:
            for line in f:
                profiles.add(line.strip())
    # make profiles to sorted list (order must be stable for the hash)
    profiles = sorted(profiles)
    # create all paths for all hmms
    hmmerpaths = [
        os.path.join(self.cfg["db"], "hmms", "panther", "{}.hmm".format(profile))
        for profile in profiles
    ]

    # check if we already have the pressed hmm for this exact profile list
    canuseprev = False
    profilehash = hashlib.sha256("_".join(profiles).encode()).hexdigest()
    hashpath = os.path.join(hmmdir, "all.hash")
    if file.exists(hashpath):
        # BUGFIX: initialise prevhash so an empty hash file cannot raise
        # a NameError in the comparison below
        prevhash = None
        with open(hashpath) as f:
            for line in f:
                prevhash = line.strip()
                break
        canuseprev = prevhash == profilehash

    if canuseprev:
        # we can use the existing file, so no need to continue
        log("Using pressed hmms from last run")
        return hmmconcat

    # concatenate
    if len(profiles) == 0:
        logging.error("We have no profiles to evaluate")
        exit(1)

    log("{} hmm profiles need to be used for estimations".format(len(profiles)))
    log("Concatenating hmms, this might take a while (IO limited)")
    hmmconcat = base.concatenate(hmmconcat, hmmerpaths)
    # press
    log("Pressing hmms")
    hp = hmmpress("hmmpress", hmmconcat, None, touch=self.cfg["touch"])
    hp.run()
    # save profile hash so the next run can skip this step
    with open(hashpath, "w") as f:
        f.write(profilehash)
    return hmmconcat
def estimate(self, hits, outfile, placements):
    """Estimate completeness and contamination for each placement, attach
    the profile set and silent-contig stats, and delegate TSV output to
    ``self.write_outfile``.

    :param hits: path to a TSV of profile hits (must have a "profile" column)
    :param outfile: path the estimates table is written to
    :param placements: list of placement dicts; each needs a "node" key and
        is updated in place with "set", "completeness" and "contamination"
    :return: True on success, None when cfg["touch"] short-circuits the run
    """
    logging.info("Estimating scores now")
    if self.cfg["touch"]:
        # touch mode: only create the output file, do no work
        file.touch(outfile)
        logging.info("Returning as we only touch")
        return

    # count how often each profile was hit
    hit = {}
    for row in base.readTSV(hits):
        hit[row["profile"]] = hit.get(row["profile"], 0) + 1

    # profiles seen at least once / more than once
    singletons = set(hit)
    multitons = {k for k, v in hit.items() if v > 1}

    # now we can estimate completeness and contamination for each placement
    for p in placements:
        s = self.readSet(p["node"])
        p["set"] = s
        if s:
            # completeness is the overlap of both sets
            cmpl = len(singletons & s) / len(s)
            cont = len(multitons & s) / len(s)
        else:
            # guard: an empty profile set would raise ZeroDivisionError
            cmpl = 0.0
            cont = 0.0
        # make to percentage and round to 2 positions
        p["completeness"] = round(cmpl * 100, 2)
        p["contamination"] = round(cont * 100, 2)

    # compute silent fraction per placement and set
    self.get_silent_contig(self._clean_fasta, hits, placements)

    log("Finished estimating")
    self.write_outfile(outfile, placements)
    # done
    return True