def estimate(self, hits, outfile, placements):
    """
    Estimate completeness and contamination for every placement and write
    the result as a tab separated table to ``outfile``.

    hits:       path handed to base.readTSV; every row needs a "profile" key
    outfile:    path of the table to write
    placements: list of dicts, each with at least a "node" key plus all
                columns that are written to the table; the dicts are
                updated in place with "completeness", "contamination"
                and "file"

    Returns None if self.cfg["touch"] is set (the output file is only
    touched), True otherwise.
    """
    logging.info("Estimating scores now")
    if self.cfg["touch"]:
        file.touch(outfile)
        logging.info("Returning as we only touch")
        return

    # count how often each profile was hit
    hit = {}
    for row in base.readTSV(hits):
        profile = row["profile"]
        hit[profile] = hit.get(profile, 0) + 1

    # despite the name, "singletons" holds every profile seen at least once;
    # "multitons" holds the profiles seen more than once
    singletons = set(hit)
    multitons = {profile for profile, count in hit.items() if count > 1}

    # now we can estimate completeness and contamination for each placement
    for placement in placements:
        s = self.readSet(placement["node"])
        if s:
            # completeness is the overlap of the found profiles with the set,
            # contamination the overlap of the duplicated profiles with it
            cmpl = len(singletons & s) / len(s)
            cont = len(multitons & s) / len(s)
        else:
            # an empty set would otherwise raise a ZeroDivisionError
            logging.warning(
                "Set of node %s is empty, reporting 0 for both estimates",
                placement["node"],
            )
            cmpl = cont = 0.0
        # make to percentage and round to 2 positions
        placement["completeness"] = round(cmpl * 100, 2)
        placement["contamination"] = round(cont * 100, 2)

    log("Finished estimating")

    # write to output file, columns in this order
    columns = [
        "completeness",
        "contamination",
        "node",
        "n",
        "ngenomes",
        "cover",
        "nPlacements",
        "taxid",
        "lineage",
        "taxidlineage",
        "file",
    ]
    with open(outfile, "w") as f:
        f.write("{}\n".format("\t".join(columns)))
        for placement in placements:
            # insert the file name
            placement["file"] = self.cfg["name"]
            # write one row per placement
            f.write(
                "{}\n".format("\t".join(str(placement[key]) for key in columns))
            )

    log("Wrote estimates to: {}".format(outfile))

    # done
    return True
def get_silent_contig(self, fasta, hits, placements):
    """
    given hits and the fasta file, we can calculate how many DNA bp
    can not be detected as contaminant, because no markers are on these
    contigs

    fasta:      path of the DNA fasta, or None if no DNA fasta is available
    hits:       path handed to base.readTSV; every row needs the keys
                "profile" and "chrom"
    placements: list of dicts, each with a "set" key holding the profiles
                of that placement

    Every placement gets the key "max_silent_contamination": the percentage
    (rounded to 2 positions) of the total sequence length that lies on
    contigs without any hit for the profiles of that placement. The value
    is the string "NA" if it can not be computed (no fasta given or no
    sequence in the fasta).

    Returns None; placements are changed in place.
    """
    logging.debug("Calculating silent contig fraction")
    if fasta is None:
        logging.debug(
            "As no DNA fasta was given, we can't compute the contig fraction"
        )
        for placement in placements:
            placement["max_silent_contamination"] = "NA"
        return

    # get dict to link profile to contigs
    profile_2_contigs = defaultdict(set)
    for row in base.readTSV(hits):
        profile_2_contigs[row["profile"]].add(row["chrom"])

    # read in fasta once
    dna_lens = {rec.name: len(str(rec)) for rec in Fasta(fasta)}
    total_len = sum(dna_lens.values())

    if total_len == 0:
        # without any sequence the fraction is undefined; dividing would
        # raise a ZeroDivisionError, so report "NA" as in the no-fasta case
        logging.debug(
            "DNA fasta contains no sequence, we can't compute the contig fraction"
        )
        for placement in placements:
            placement["max_silent_contamination"] = "NA"
        return

    all_contigs = set(dna_lens)

    # for each placement compute the silent fraction
    for placement in placements:
        # make union of contigs carrying a hit for one of the profiles
        contigs = set()
        for profile in placement["set"]:
            contigs |= profile_2_contigs[profile]

        # define missing contigs as contig names we did not locate any
        # marker genes on
        m_contigs = all_contigs - contigs
        missing_len = sum(dna_lens[contig] for contig in m_contigs)

        placement["max_silent_contamination"] = round(
            missing_len / total_len * 100, 2
        )
        logging.debug(
            "The silent fraction could be up to {}%".format(
                placement["max_silent_contamination"]
            )
        )
    return
def estimate(self, hits, outfile, placements):
    """
    Estimate completeness, contamination and the silent contig fraction
    for every placement and write the result to ``outfile``.

    hits:       path handed to base.readTSV; every row needs a "profile" key
    outfile:    path handed to self.write_outfile
    placements: list of dicts, each with at least a "node" key; the dicts
                are updated in place with "set", "completeness" and
                "contamination" before self.get_silent_contig and
                self.write_outfile are called with them

    Returns None if self.cfg["touch"] is set (the output file is only
    touched), True otherwise.
    """
    logging.info("Estimating scores now")
    if self.cfg["touch"]:
        file.touch(outfile)
        logging.info("Returning as we only touch")
        return

    # count how often each profile was hit
    hit = {}
    for row in base.readTSV(hits):
        profile = row["profile"]
        hit[profile] = hit.get(profile, 0) + 1

    # despite the name, "singletons" holds every profile seen at least once;
    # "multitons" holds the profiles seen more than once
    singletons = set(hit)
    multitons = {profile for profile, count in hit.items() if count > 1}

    # now we can estimate completeness and contamination for each placement
    for placement in placements:
        s = self.readSet(placement["node"])
        # keep the set, it is needed for the silent fraction below
        placement["set"] = s
        if s:
            # completeness is the overlap of the found profiles with the set,
            # contamination the overlap of the duplicated profiles with it
            cmpl = len(singletons & s) / len(s)
            cont = len(multitons & s) / len(s)
        else:
            # an empty set would otherwise raise a ZeroDivisionError
            logging.warning(
                "Set of node %s is empty, reporting 0 for both estimates",
                placement["node"],
            )
            cmpl = cont = 0.0
        # make to percentage and round to 2 positions
        placement["completeness"] = round(cmpl * 100, 2)
        placement["contamination"] = round(cont * 100, 2)

    # compute silent fraction per placement and set
    self.get_silent_contig(self._clean_fasta, hits, placements)

    log("Finished estimating")
    self.write_outfile(outfile, placements)

    # done
    return True
def prepareAlignment(
    self,
    pplaceAlinment,
    hmmerOutput,
    proteinList,
    proteinFasta,
    config,
    cfg,
    tmpDir,
):
    """
    Align every protein that hit a profile against that profile and
    concatenate the resulting alignments.

    pplaceAlinment: passed as first argument to base.horizontalConcat
                    (presumably the path of the concatenated alignment
                    to create; confirm against base.horizontalConcat)
    hmmerOutput:    path handed to base.readTSV; rows need the keys
                    "profile" and "subject"
    proteinList:    path of a text file that is read line by line
    proteinFasta:   path handed to Fasta(); records are looked up by the
                    "subject" value of the hmmer rows
    config:         object providing pkgfile(refpkg name, key)
    cfg:            not used in the visible part of this method
    tmpDir:         directory for the per profile .faa and .aln files

    Sets self.lenscmgs to the number of rows in the hmmer output and, if
    that number is larger than 0, self.input to the return value of
    base.horizontalConcat.
    """
    # for each profile that we found a SCMG
    # make concatenated fasta (including new sequence)
    # align that to the hmm profile
    # NOTE(review): profiles is filled here but not read again in the
    # visible part of this method
    profiles = []
    with open(proteinList) as pl:
        for l in pl:
            profiles.append(l.strip())
    # read in hmmer output to get seqnames of target proteins
    # cols = []
    # hmmer = []
    # scmgsr = []
    # with open(hmmerOutput) as ho:
    #    for line in ho:
    #        l = line.split()
    #        if len(cols) == 0:
    #            cols = l
    #        else:
    #            n = {}
    #            for k,v in zip(cols, l):
    #                n[k] = v
    #            hmmer.append(n)
    #            scmgsr.append(n['profile'])

    # load hmmer results
    hmmer = base.readTSV(hmmerOutput)
    # extract SCMGs
    scmgsr = [hit["profile"] for hit in hmmer]
    # scmgs = [p for p in set(scmgsr) if scmgsr.count(p) == 1]
    # db = [p for p in set(scmgsr) if scmgsr.count(p) == 2]
    # scmgs is the same list object as scmgsr, so the sort below also
    # reorders scmgsr; profiles hit several times stay in the list
    scmgs = scmgsr
    # scmgs[-1] = db[0]
    scmgs.sort()
    # count the placements
    self.lenscmgs = len(scmgs)

    if self.lenscmgs > 0:
        # get the protein names for each scmgs
        proteinnames = []
        # for all profiles extract proteins
        hmmr = {}
        # load into dict
        # NOTE(review): this is the second pass over hmmer; it assumes
        # base.readTSV returns something that can be iterated twice,
        # confirm against base.readTSV
        # if a profile occurs in several rows, the last row wins
        for row in hmmer:
            hmmr[row["profile"]] = row["subject"]

        # make list of protein names
        for m in scmgs:
            proteinnames.append(hmmr[m])

        # load proteins
        proteins = Fasta(proteinFasta)
        alignments = []
        # for each protein/SCMG we need to make an alignment
        for p, g in zip(scmgs, proteinnames):
            # write seq to file
            seq = proteins[g]
            name = "{}_{}".format(p, g)
            tmpfasta = os.path.join(tmpDir, "{}.faa".format(name))
            geneAlignment = os.path.join(tmpDir, "{}.aln".format(name))
            # tmpfasta is replaced by whatever base.writeFasta returns
            tmpfasta = base.writeFasta(tmpfasta, name, seq)
            profileAlignment = config.pkgfile("{}.refpkg".format(p), "aln_fasta")
            profileHMM = config.pkgfile("{}.refpkg".format(p), "profile")
            # start and run alignment of this profile
            ha = hmmalign("hmmalign", tmpfasta, geneAlignment)
            ha.run(profileAlignment, profileHMM)
            alignments.append(geneAlignment)

        # after this we concatenate the alignments
        # ordered by the profile name
        self.input = base.horizontalConcat(
            pplaceAlinment,
            alignments,
            scmgs,
            config.pkgfile("{}.refpkg".format("concat"), "aln_fasta"),
        )