def write_outfile(self, outfile=None, result=None):
    """
    Write placement estimate rows to a tab-separated output file.

    :param outfile: destination path; defaults to <outdir>/eukcc.tsv
    :param result: list of per-placement dicts holding the column values;
                   each dict is annotated in place with the bin name under "file"
    :raises SystemExit: exit code 11 when there are no results to write
    """
    if outfile is None:
        outfile = os.path.join(self.cfg["outdir"], "eukcc.tsv")
    # column order of the output TSV
    k = [
        "completeness",
        "contamination",
        "max_silent_contamination",
        "node",
        "n",
        "ngenomes",
        "cover",
        "nPlacements",
        "taxid",
        "lineage",
        "taxidlineage",
        "file",
    ]
    # Bail out BEFORE creating the file, so we do not leave a misleading
    # header-only TSV behind when there is nothing to report.
    if result is None:
        logging.warning("No estimates were written")
        exit(11)
    with open(outfile, "w") as f:
        f.write("{}\n".format("\t".join(k)))
        for p in result:
            # insert the file name
            p["file"] = self.cfg["name"]
            # write to file
            f.write("{}\n".format("\t".join(str(p[key]) for key in k)))
    log("Wrote estimates to: {}".format(outfile))
def estimate(self, hits, outfile, placements):
    """
    Estimate completeness and contamination for each placement from
    profile hits, then write the results to a TSV file.

    :param hits: path to a TSV of hmmer hits with a "profile" column
    :param outfile: path the estimate table is written to
    :param placements: list of placement dicts; annotated in place with
                       "completeness", "contamination" and "file"
    :return: True on success; None in touch-only mode
    """
    logging.info("Estimating scores now")
    if self.cfg["touch"]:
        file.touch(outfile)
        logging.info("Returning as we only touch")
        return
    # count how often each profile was hit (dict.get avoids the
    # double lookup of "in hit.keys()" followed by indexing)
    hit = {}
    for row in base.readTSV(hits):
        hit[row["profile"]] = hit.get(row["profile"], 0) + 1
    singletons = set(hit.keys())
    multitons = {profile for profile, count in hit.items() if count > 1}
    # now we can estimate completeness and contamination for each placement
    for placement in placements:
        s = self.readSet(placement["node"])
        # completeness is the overlap of both sets
        cmpl = len(singletons & s) / len(s)
        cont = len(multitons & s) / len(s)
        # make to percentage and round to 2 positions
        placement["completeness"] = round(cmpl * 100, 2)
        placement["contamination"] = round(cont * 100, 2)
    log("Finished estimating")
    # write to output file
    k = [
        "completeness",
        "contamination",
        "node",
        "n",
        "ngenomes",
        "cover",
        "nPlacements",
        "taxid",
        "lineage",
        "taxidlineage",
        "file",
    ]
    with open(outfile, "w") as f:
        f.write("{}\n".format("\t".join(k)))
        for p in placements:
            # insert the file name
            p["file"] = self.cfg["name"]
            # write to file
            f.write("{}\n".format("\t".join(str(p[key]) for key in k)))
    log("Wrote estimates to: {}".format(outfile))
    # done
    return True
def readInfo(self, name):
    """
    Load the CONTENTS.json metadata of a refpkg package.

    :param name: name of the refpkg package inside the database
    :return: parsed JSON content as a dict
    :raises SystemExit: exit code 13 when the metadata file is missing
    """
    path = os.path.join(self.cfg["db"], "refpkg", name, "CONTENTS.json")
    # a missing metadata file is fatal
    if not base.exists(path):
        log("Could not find {}".format(path))
        exit(13)
    # parse and hand back the JSON document
    with open(path) as handle:
        return json.load(handle)
def pkgfile(self, name, t):
    """
    Resolve the path of a file belonging to a refpkg package.

    :param name: name of the refpkg package
    :param t: key of the wanted file in the package's "files" mapping
    :return: absolute path of the requested file
    :raises SystemExit: exit code 12 when the file does not exist
    """
    info = self.readInfo(name)
    path = os.path.join(self.cfg["db"], "refpkg", name, info["files"][t])
    # guard clause: a missing package file is fatal
    if not base.exists(path):
        log("Could not find: {}".format(path))
        exit(12)
    return path
def concatHMM(self):
    """
    Concatenate and press all HMM profiles needed for the current
    placements into a single file, reusing the result of a previous
    run when the profile set (tracked via a sha256 hash) is unchanged.

    :return: path to the concatenated hmm file
    :raises SystemExit: exit code 1 when no profiles are available
    """
    # create a dir for this
    hmmdir = os.path.join(self.cfg["outdir"], "workfiles", "hmmer", "estimations")
    file.isdir(hmmdir)
    hmmconcat = os.path.join(hmmdir, "all.hmm")
    if self.cfg["touch"]:
        file.touch(hmmconcat)
        return hmmconcat
    # collect the union of profiles referenced by all placements
    profiles = set()
    for p in self.placements[self.cfg["placementMethod"]]:
        localpath = os.path.join(self.cfg["db"], "sets", "{}.set".format(p["node"]))
        with open(localpath) as f:
            for line in f:
                profiles.add(line.strip())
    # deterministic order so the hash below is stable across runs
    profiles = sorted(profiles)
    # create all paths for all hmms
    hmmerpaths = [os.path.join(self.cfg["db"], "hmms", "panther", "{}.hmm".format(profile)) for profile in profiles]
    # check if we already pressed the hmms for this exact profile set
    canuseprev = False
    profilehash = hashlib.sha256("_".join(profiles).encode()).hexdigest()
    hashpath = os.path.join(hmmdir, "all.hash")
    if file.exists(hashpath):
        # initialize prevhash: previously it stayed unbound (NameError)
        # when the hash file existed but was empty
        prevhash = None
        with open(hashpath) as f:
            for line in f:
                prevhash = line.strip()
                break
        canuseprev = prevhash == profilehash
    if canuseprev:
        # we can use the existing file, so no need to continue
        log("Using pressed hmms from last run")
        return hmmconcat
    # concatenate
    if len(profiles) == 0:
        logging.error("We have no profiles to evaluate")
        exit(1)
    log("{} hmm profiles need to be used for estimations".format(len(profiles)))
    log("Concatenating hmms, this might take a while (IO limited)")
    hmmconcat = base.concatenate(hmmconcat, hmmerpaths)
    # press
    log("Pressing hmms")
    hp = hmmpress("hmmpress", hmmconcat, None, touch=self.cfg["touch"])
    hp.run()
    # save profile hash
    with open(hashpath, "w") as f:
        f.write(f"{profilehash}")
    return hmmconcat
def estimate(self, hits, outfile, placements):
    """
    Estimate completeness and contamination for each placement from
    profile hits, compute silent contamination per placement, and
    write the results via write_outfile.

    :param hits: path to a TSV of hmmer hits with a "profile" column
    :param outfile: path the estimate table is written to
    :param placements: list of placement dicts; annotated in place with
                       "set", "completeness" and "contamination"
    :return: True on success; None in touch-only mode
    """
    logging.info("Estimating scores now")
    if self.cfg["touch"]:
        file.touch(outfile)
        logging.info("Returning as we only touch")
        return
    # count how often each profile was hit (dict.get avoids the
    # double lookup of "in hit.keys()" followed by indexing)
    hit = {}
    for row in base.readTSV(hits):
        hit[row["profile"]] = hit.get(row["profile"], 0) + 1
    singletons = set(hit.keys())
    multitons = {profile for profile, count in hit.items() if count > 1}
    # now we can estimate completeness and contamination for each placement
    for placement in placements:
        s = self.readSet(placement["node"])
        placement["set"] = s
        # completeness is the overlap of both sets
        cmpl = len(singletons & s) / len(s)
        cont = len(multitons & s) / len(s)
        # make to percentage and round to 2 positions
        placement["completeness"] = round(cmpl * 100, 2)
        placement["contamination"] = round(cont * 100, 2)
    # compute silent fraction per placement and set
    self.get_silent_contig(self._clean_fasta, hits, placements)
    log("Finished estimating")
    self.write_outfile(outfile, placements)
    # done
    return True
parser.add_argument('--quiet', '-q', dest='quiet', action='store_true', default=False, help='silcence most output') parser.add_argument('--debug', '-d', action='store_true', default=False, help='debug and thus ignore safety') args = parser.parse_args() ############################################### # starting the analysis log("Running eukcc for {} bin{}".format(len(args.fasta), "s" if len(args.fasta) > 1 else "")) # create output if not exists if not file.isdir(args.outdir): exit() # check if a protein fasta was passed (implied ) if args.bed is not None: # set no glob args.noglob = True args.isprotein = True else: args.isprotein = False # check if we can expand glob: if len(args.fasta) == 1 and not args.noglob: