def characterizeContigs(varsP, xmappath=None):
    """Log simple contig stats, and optionally alignment stats from xmappath.

    Loads the genome maps in varsP.latestMergedCmap and builds a text summary
    (number of maps, total / average / median length, n50). If any .hmap file
    is found in the same directory as that cmap, diploid and haploid figures
    are reported separately. If varsP.ref is set, reference-length lines are
    added; if xmappath is given, the alignments in that xmap are summarized
    (number of aligned maps, aligned length, unique aligned length).

    Side effect: sets varsP.totAssemblyLenMb to the total map length scaled
    by 1e-6.

    NOTE(review): this file contains a second definition of
    characterizeContigs further down; the later definition is the one bound
    at import time -- confirm which one is intended to be live.

    Args:
        varsP: pipeline variables object; .ref and .latestMergedCmap are read.
        xmappath: optional path to an existing .xmap. No aligner is run here;
            the file is only read. It is ignored if util.checkFile rejects it.

    Returns:
        The summary as a single string of newline-terminated lines.
    """
    unitscale = 1e-6
    dorefalign = bool(xmappath)  # never calls the aligner; only reads xmappath
    haveref = bool(varsP.ref)

    # reflen is a denominator in the summary table: keep it non-zero and never
    # bail out, so the contig summary is still produced without a usable ref.
    try:
        reflen = mapClasses.multiCmap(varsP.ref, lengthonly=True).totalLength
        if reflen <= 0:
            reflen = 1.
    except Exception:
        reflen = 1.

    # .hmap files next to the merged cmap switch on the haplotype report.
    hmaps = util.getListOfFilesFromDir(os.path.dirname(varsP.latestMergedCmap), ".hmap")
    haplotype = (len(hmaps) > 0)

    hapcontiglens = []  # haplotype map lengths
    contiglens = []     # lengths of all maps
    totcontiglen = 0
    totalignlen = 0
    nmapcontigs = 0
    # Aligned reference ranges grouped by reference contig id; grouping is
    # needed so ranges on different reference contigs are never merged.
    uniqueseg = {}

    for cpath in [varsP.latestMergedCmap]:
        mapi = mapClasses.multiCmap(cpath)
        totcontiglen += mapi.totalLength
        contiglens += mapi.getAllMapLengths()
        if haplotype:
            hapcontiglens.extend(mapi.getHaplotypeMapLengths())

        # Ids of every map in this cmap; ids seen in the xmap are removed so
        # the difference gives the number of aligned maps. Copy explicitly so
        # removal works even if the ids come back as a view.
        mapids = list(mapi.getAllMapIds())
        ncontigs = len(mapids)

        xmapobj = mapClasses.xmap()  # empty xmap when there is nothing to load
        if dorefalign and util.checkFile(xmappath, ".xmap"):
            xmapobj = mapClasses.xmap(xmappath)

        for xmapentry in xmapobj.xmapLookup.values():
            totalignlen += xmapentry.getMappedRefLen()
            uniqueseg.setdefault(xmapentry.contigRef, []).append(
                [xmapentry.RefStart, xmapentry.RefStop])
            if xmapentry.contigQry in mapids:
                mapids.remove(xmapentry.contigQry)

        nmapcontigs += ncontigs - len(mapids)

    varsP.totAssemblyLenMb = totcontiglen * unitscale
    ncontigs = len(contiglens)  # all maps, not number of files
    avgcontiglen = (float(totcontiglen) / ncontigs if ncontigs > 0 else 0)

    if unitscale > 1e-6:  # not megabases
        fstr = "%9.0f"
    else:  # megabases
        fstr = "%8.3f"

    def _line(label, value):
        # One "label value" report line using the unit-dependent number format.
        return (label + fstr + "\n") % value

    outstr = ""
    if haplotype:
        # Diploid figures are the same numbers as the default format below,
        # only the labels change.
        outstr += "Diploid N Genome Maps: %i\n" % ncontigs
        outstr += _line("Diploid Genome Map Len (Mb): ", totcontiglen * unitscale)
        outstr += _line("Diploid Avg. Genome Map Len (Mb): ", avgcontiglen * unitscale)
        outstr += _line("Diploid Median Genome Map Len (Mb): ", util.getMedian(contiglens) * unitscale)
        outstr += _line("Diploid Genome Map n50 (Mb): ", util.getn50(contiglens) * unitscale)
        # Haploid figures come from the list of haplotype map lengths.
        nhap = len(hapcontiglens)
        haptot = sum(hapcontiglens)
        hapavg = (float(haptot) / nhap if nhap else 0)
        outstr += "Haploid N Genome Maps: %i\n" % nhap
        outstr += _line("Haploid Genome Map Len (Mb): ", haptot * unitscale)
        outstr += _line("Haploid Avg. Genome Map Len (Mb): ", hapavg * unitscale)
        outstr += _line("Haploid Median Genome Map Len (Mb): ", util.getMedian(hapcontiglens) * unitscale)
        outstr += _line("Haploid Genome Map n50 (Mb): ", util.getn50(hapcontiglens) * unitscale)
    else:
        outstr += "N Genome Maps: %i\n" % ncontigs
        outstr += _line("Total Genome Map Len (Mb): ", totcontiglen * unitscale)
        outstr += _line("Avg. Genome Map Len (Mb): ", avgcontiglen * unitscale)
        outstr += _line("Median Genome Map Len (Mb): ", util.getMedian(contiglens) * unitscale)
        outstr += _line("Genome Map n50 (Mb): ", util.getn50(contiglens) * unitscale)

    if haveref:
        outstr += _line("Total Ref Len (Mb): ", reflen * unitscale)
        outstr += _line("Total Genome Map Len / Ref Len : ", float(totcontiglen) / reflen)

    if dorefalign:
        ratio = (float(nmapcontigs) / ncontigs if ncontigs > 0 else 0)
        outstr += "N Genome Maps total align : %i (%.2f)\n" % (nmapcontigs, ratio)
        outstr += _line("Total Aligned Len (Mb) : ", totalignlen * unitscale)
        outstr += _line("Total Aligned Len / Ref Len : ", float(totalignlen) / reflen)
        uniquelen = 0
        for segs in uniqueseg.values():
            util.uniqueRange(segs)  # modifies the list in place
            uniquelen += util.totalLengthFromRanges(segs)
        outstr += _line("Total Unique Aligned Len (Mb) : ", uniquelen * unitscale)
        outstr += _line("Total Unique Aligned Len / Ref Len: ", float(uniquelen) / reflen)

    return outstr
def getAlignStats(varsP, outFileList, reflen=0, isref=False, mergepath="", bnxpath=None):
    '''Standalone fn for alignment statistics for both AlignModule and AlignRefModule.

    Reports molecule statistics of a bnx file and, unless bnxpath was
    supplied, alignment statistics gathered from the .xmap / .err files of
    the alignment jobs. Everything is reported through
    varsP.updateInfoReport / varsP.updatePipeReport; nothing is returned.

    NOTE(review): this file contains a second definition of getAlignStats
    further down; the later definition is the one bound at import time.

    Args:
        varsP: pipeline variables object.
        outFileList: file prefixes; prefix+".xmap" and prefix+".err" are read.
        reflen: reference length, should be in Mb; 0 disables coverage lines.
        isref: True labels the report "Reference"/"Ref", else
            "Assembly"/"Contig".
        mergepath: if supplied, a merged (averaged) .err is written there.
        bnxpath: if None, varsP.sorted_file + ".bnx" is used; otherwise only
            the stats of this file are reported and outFileList is ignored.
    '''
    def _mean(values):
        # Average of a list; 0 for an empty list so no report line divides by zero.
        return (float(sum(values)) / len(values) if values else 0)

    # statonly (bnx stats only) is decided from the caller's argument BEFORE
    # bnxpath is defaulted below; the later decisions must reflect what the
    # caller passed, not the defaulted path.
    statonly = (bnxpath is not None)
    skipbnx = False  # .err/.xmap processing only
    if not statonly:
        if not varsP.sorted_file:  # empty for runAlignMol: no bnx to report on
            skipbnx = True
        else:
            bnxpath = varsP.sorted_file + ".bnx"  # sorted_file lacks the suffix

    if not skipbnx and not util.checkFile(bnxpath):
        varsP.updatePipeReport("Warning in AlignModule.getAlignStats: bnxpath supplied but not found: %s\n" % bnxpath)
        return

    # Find the minlen used for bnx_sort. It only applies to the pipeline's own
    # sorted bnx; for a caller-supplied bnxpath, minlen is ignored (0).
    sortargs = []
    if 'bnx_sort' in varsP.argData:
        sortargs = varsP.argsListed('bnx_sort')
    minlen = 0
    if not statonly:
        validminlen = False
        if "-minlen" in sortargs:
            validx = sortargs.index("-minlen") + 1
            if validx < len(sortargs):  # value missing: treat as unparsable
                parsed = util.getIntFromString(sortargs[validx])  # None if not an int
                if parsed is not None:
                    minlen = parsed
                    validminlen = True
        if not validminlen and sortargs:
            varsP.updatePipeReport("Warning in AlignModule.getAlignStats: unable to obtain minlen from bnx_sort arguments; defaulting to 0\n")

    nmol = 0    # total n mol above minlen
    totlen = 0  # total mol len above minlen
    if bnxpath is not None and util.checkFile(bnxpath):
        outstr = "Reading molecule stats from %s:\n" % bnxpath
        outstr += "Molecule Stats:\n"
        moldict = util.simpleBnxStats(bnxpath, minlen)
        nmol = moldict["nmol"]
        totlen = moldict["totlen"]
        outstr += "N mols: %i\n" % nmol
        outstr += "Total len (Mb): %10.3f\n" % totlen
        outstr += "Avg len (kb) : %10.3f\n" % moldict["avglen"]
        outstr += "Mol N50 (kb) : %10.3f\n" % moldict["n50"]
        outstr += "Lab (/100kb) : %10.3f\n" % moldict["labdensity"]
        if reflen:
            cov = totlen / reflen  # totlen is reported in Mb above
            outstr += "%-6s Cov (x): %10.3f\n" % ("Ref" if isref else "Contig", cov)
        if isref or reflen or statonly:  # otherwise nothing worth printing
            varsP.updateInfoReport(outstr + "\n", printalso=True)
    elif not skipbnx:
        varsP.updatePipeReport("Warning in AlignModule.getAlignStats: missing bnx path:" + bnxpath + "\n")

    if statonly:
        return

    # Lastly, load the .xmap and .err files of the align jobs and report on them.
    totmaplen = 0     # summed mapped length on the reference
    totmapqrylen = 0  # summed mapped length on the query
    totconf = 0       # summed confidence of all alignments
    nalign = 0        # total number of alignments
    errs = []         # one alignParams object per .err file found

    mappref = ""
    if len(outFileList) > 0:
        mappref = getMergeFilename(outFileList[0])

    for outpath in outFileList:  # these are file prefixes
        if util.checkFile(outpath + ".xmap"):
            xmap = mc.xmap(outpath + ".xmap")
            nalign += len(xmap.xmapLookup)
            totmaplen += xmap.getSumMappedRefLen()     # in kb (per original comment)
            totmapqrylen += xmap.getSumMappedQryLen()  # in kb (per original comment)
            totconf += sum([x.Confidence for x in xmap.xmapLookup.values()])
        else:
            varsP.updatePipeReport("Warning in AlignModule.getAlignStats: missing xmap:" + outpath + ".xmap" + "\n")
        if util.checkFile(outpath + ".err"):
            errs.append(mc.alignParams(outpath + ".err"))

    # The last .err loaded is reused as the container for the merged values.
    err = (errs[-1] if errs else None)

    # nalign from the xmaps should equal the goodmaps total from the .err files.
    sumgoodmaps = sum([e.goodmaps for e in errs])
    if sumgoodmaps != nalign:
        varsP.updateInfoReport("Warning in getAlignStats: n mol align differ in .err files (%i) and .xmaps (%i)\n" % (sumgoodmaps, nalign), printalso=True)

    if totmaplen or totconf or nalign:
        outstr = "Molecules Aligned to %s:\n" % ("Reference" if isref else "Assembly")
        outstr += "N mol align : %9i\n" % nalign
        outstr += "Mol fraction align: %13.3f\n" % (float(nalign) / nmol if nmol else 0)
        outstr += "Tot align len (Mb): %11.1f\n" % (totmapqrylen / 1e3)
        if reflen > 0:
            outstr += "Effective Cov (x) : %13.3f\n" % (totmaplen / 1e3 / reflen)
        outstr += "Avg align len (kb): %11.1f\n" % (float(totmapqrylen) / nalign if nalign else 0)
        # totmapqrylen is in kb, totlen is in Mb (per original comments)
        outstr += "Fraction align len: %13.3f\n" % (totmapqrylen / 1e3 / totlen if totlen else 0)
        outstr += "Tot confidence : %11.1f\n" % totconf
        outstr += "Avg confidence : %11.1f\n" % (float(totconf) / nalign if nalign else 0)
        varsP.updateInfoReport(outstr, printalso=True)

    def _avg(attr):
        # Mean of one alignParams attribute over all loaded .err files.
        return _mean([getattr(e, attr) for e in errs])

    avgfp = _avg("fp")
    avgfpr = _avg("fprate")
    avgfn = _avg("fn")
    avgbpp = _avg("bpp")
    avgsf = _avg("sf")
    avgsd = _avg("sd")
    avgsr = _avg("sr")

    if avgfp or avgfn or avgbpp:
        outstr = "Avg FP(/100kb) : %12.2f\n" % avgfp
        outstr += "Avg FP ratio : %13.3f\n" % avgfpr
        outstr += "Avg FN ratio : %13.3f\n" % avgfn
        outstr += "Avg bpp : %11.1f\n" % avgbpp
        outstr += "Avg sf : %13.3f\n" % avgsf
        outstr += "Avg sd : %13.3f\n" % avgsd
        outstr += "Avg sr : %13.3f\n" % avgsr
        varsP.updateInfoReport(outstr + "\n", printalso=True)

    if err and mergepath:  # have at least one .err (alignParams) object
        util.checkDir(mergepath)
        mrgstr = (varsP.alignMolvrefMergeName if varsP else "merge")
        outpath = os.path.join(mergepath, mappref + mrgstr + ".err")
        err.fp = avgfp
        err.fn = avgfn
        err.sf = avgsf
        err.sd = avgsd
        err.bpp = avgbpp
        err.res = _avg("res")
        err.nmaps = sum([e.nmaps for e in errs])
        err.llrm = _avg("llrm")
        err.goodmaps = sumgoodmaps
        err.llrgm = _avg("llrgm")
        err.bppsd = _avg("bppsd")
        err.fprate = avgfpr
        err.sr = avgsr
        err.ressd = _avg("ressd")
        err.writeToFile(outpath)
def characterizeContigs(varsP, xmappath=None, listcontigs=False):
    """Log simple contig stats, and optionally alignment stats from xmappath.

    Loads the genome maps in varsP.latestMergedCmap and builds a text summary
    (number of maps, total / average / median length, n50). If varsP.ref is
    set, reference-length lines are added; if xmappath is given, the
    alignments in that xmap are summarized. With listcontigs, a per-map table
    is emitted before the summary: one row per xmap entry, followed by one
    row (id, length, coverage) for each map without an alignment.

    Side effect: sets varsP.totAssemblyLenMb to the total map length scaled
    by 1e-6.

    NOTE(review): this file contains an earlier definition of
    characterizeContigs; this later definition is the one bound at import
    time -- confirm which one is intended to be live.

    Args:
        varsP: pipeline variables object; .ref, .latestMergedCmap and
            .contigAlignTarget are read.
        xmappath: optional path to an existing .xmap. No aligner is run here;
            the file is only read, and silently skipped if it is not a file.
        listcontigs: if True, include the per-map table.

    Returns:
        The table (if requested) and summary as a single string.
    """
    unitscale = 1e-6
    dorefalign = bool(xmappath)  # never calls the aligner; only reads xmappath
    printrange = False  # developer toggle: add Qry/Ref start/stop columns
    iscluster = True    # developer toggle: True suppresses the AVG row
    # The original divided the range columns by an undefined name `pn`, which
    # would raise NameError if printrange were enabled.
    # NOTE(review): 1. prints the positions unscaled -- confirm intended unit.
    rangescale = 1.
    haveref = bool(varsP.ref)
    aligndir = varsP.contigAlignTarget

    # reflen is a denominator in the summary table: keep it non-zero and never
    # bail out, so the contig summary is still produced without a usable ref.
    try:
        reflen = mapClasses.multiCmap(varsP.ref, lengthonly=True).totalLength
        if reflen <= 0:
            reflen = 1.
    except Exception:
        reflen = 1.

    outstr = ""
    if listcontigs and dorefalign:  # table header
        outstr += "cID len"
        outstr += " Cov"
        outstr += " alignlen alignlen/len"
        if printrange:
            outstr += " Qry1 Qry2 Ref1 Ref2"
        outstr += " Conf Conf/lenkb"
        outstr += " FP FN sf sd bpp"
        outstr += "\n"

    totcontiglen = 0
    totalignlen = 0
    totalignqlen = 0
    nmapcontigs = 0
    contiglens = []  # lengths of all maps
    # Aligned reference ranges grouped by reference contig id; grouping is
    # needed so ranges on different reference contigs are never merged.
    uniqueseg = {}
    avgfplist = []  # FP values, one per listed xmap entry
    avgfnlist = []  # FN values, one per listed xmap entry

    for cpath in [varsP.latestMergedCmap]:
        mapi = mapClasses.multiCmap(cpath)
        totcontiglen += mapi.totalLength
        contiglens += mapi.getAllMapLengths()

        # Ids of every map in this cmap; ids seen in the xmap are removed so
        # the remainder can be listed as non-aligning. Copy explicitly so
        # removal works even if the ids come back as a view.
        mapids = list(mapi.getAllMapIds())
        ncontigs = len(mapids)

        xmapobj = mapClasses.xmap()  # empty xmap when there is nothing to load
        if dorefalign and os.path.isfile(xmappath):
            xmapobj = mapClasses.xmap(xmappath)

        for xmapentry in xmapobj.xmapLookup.values():
            # getMapLength returns 0 on any problem (per original comment);
            # it is a denominator below, so floor it at 1.
            contiglen = mapi.getMapLength(xmapentry.contigQry)
            if contiglen <= 0:
                contiglen = 1.
            contigcov = mapi.getMapAvgCoverage(xmapentry.contigQry)
            if listcontigs:
                outstr += "%5i" % xmapentry.contigQry
                outstr += " %9.1f %2i" % (contiglen, contigcov)

            # Mapped lengths are not printed per reference; they are totalled.
            lenr = xmapentry.getMappedRefLen()
            lenq = xmapentry.getMappedQryLen()
            refid = xmapentry.contigRef
            refidstr = ""  # reference-id columns are disabled
            conf = xmapentry.Confidence

            if listcontigs:
                alignpars = getMappedErrStats(aligndir, cpath)
                avgfplist.append(alignpars.fp)
                avgfnlist.append(alignpars.fn)
                outstr += "%s %9.1f %.3f" % (refidstr, lenq, lenq / contiglen)
                if printrange:
                    outstr += " %5.0f %5.0f %5.0f %5.0f" % (
                        xmapentry.QryStart / rangescale, xmapentry.QryStop / rangescale,
                        xmapentry.RefStart / rangescale, xmapentry.RefStop / rangescale)
                # confidence, and confidence per kb of aligned query length;
                # guard against a zero-length alignment
                outstr += " %3.0f %5.3f" % (conf, (conf * 1000. / lenq if lenq else 0))
                outstr += " " + alignpars.getParamString()
                # Terminate the row. In the original the only row terminator
                # was commented out, so aligned rows ran together.
                if not outstr.endswith("\n"):
                    outstr += "\n"

            totalignlen += lenr
            totalignqlen += lenq
            uniqueseg.setdefault(refid, []).append([xmapentry.RefStart, xmapentry.RefStop])

            if xmapentry.contigQry in mapids:
                mapids.remove(xmapentry.contigQry)

        # Everything still in mapids has no alignment.
        nmapcontigs += ncontigs - len(mapids)

        # Rows for the maps which do not align: just id, length and coverage.
        if listcontigs:
            for ids in mapids:
                outstr += "%5i" % ids
                contiglen = mapi.getMapLength(ids)  # 0 is fine: not a denominator here
                contigcov = mapi.getMapAvgCoverage(ids)
                outstr += " %9.1f %2i\n" % (contiglen, contigcov)

    varsP.totAssemblyLenMb = totcontiglen * unitscale
    ncontigs = len(contiglens)  # all maps, not number of files
    avgcontiglen = (float(totcontiglen) / ncontigs if ncontigs > 0 else 0)

    if listcontigs and not iscluster:  # averages row; skipped for merged results
        avgfp = (float(sum(avgfplist)) / len(avgfplist) if avgfplist else 0)
        avgfn = (float(sum(avgfnlist)) / len(avgfnlist) if avgfnlist else 0)
        avgalignq = (float(totalignqlen) / nmapcontigs if nmapcontigs else 0)
        outstr += "AVG %9.1f %9.1f %5.3f %5.3f\n" % (avgcontiglen, avgalignq, avgfp, avgfn)

    if unitscale > 1e-6:  # not megabases
        fstr = "%9.0f"
    else:  # megabases
        fstr = "%8.3f"

    def _line(label, value):
        # One "label value" report line using the unit-dependent number format.
        return (label + fstr + "\n") % value

    outstr += "N Genome Maps: %i\n" % ncontigs
    outstr += _line("Total Genome Map Len (Mb): ", totcontiglen * unitscale)
    outstr += _line("Avg. Genome Map Len (Mb): ", avgcontiglen * unitscale)
    outstr += _line("Median Genome Map Len(Mb): ", util.getMedian(contiglens) * unitscale)
    outstr += _line("Genome Map n50 (Mb): ", util.getn50(contiglens) * unitscale)

    if haveref:
        outstr += _line("Total Ref Len (Mb): ", reflen * unitscale)
        outstr += _line("Total Genome Map Len / Ref Len : ", float(totcontiglen) / reflen)

    if dorefalign:
        ratio = (float(nmapcontigs) / ncontigs if ncontigs > 0 else 0)
        outstr += "N Genome Maps total align : %i (%.2f)\n" % (nmapcontigs, ratio)
        outstr += _line("Total Aligned Len (Mb) : ", totalignlen * unitscale)
        outstr += _line("Total Aligned Len / Ref Len : ", float(totalignlen) / reflen)
        uniquelen = 0
        for segs in uniqueseg.values():
            util.uniqueRange(segs)  # modifies the list in place
            uniquelen += util.totalLengthFromRanges(segs)
        outstr += _line("Total Unique Aligned Len (Mb) : ", uniquelen * unitscale)
        outstr += _line("Total Unique Aligned Len / Ref Len: ", float(uniquelen) / reflen)

    return outstr
def getAlignStats(varsP, outFileList, reflen=0, isref=False, mergepath="", bnxpath=None):
    '''Standalone fn for alignment statistics for both AlignModule and AlignRefModule.

    Reports molecule statistics of a bnx file and, unless bnxpath was
    supplied, alignment statistics gathered from the .xmap / .err files of
    the alignment jobs. Everything is reported through
    varsP.updateInfoReport / varsP.updatePipeReport; nothing is returned.

    NOTE(review): an earlier definition of getAlignStats exists in this file;
    this later definition is the one bound at import time.

    Args:
        varsP: pipeline variables object.
        outFileList: file prefixes; prefix+".xmap" and prefix+".err" are read.
        reflen: reference length, should be in Mb; 0 disables coverage lines.
        isref: True labels the report "Reference"/"Ref", else
            "Assembly"/"Contig".
        mergepath: if supplied, a merged (averaged) .err is written there.
        bnxpath: if None, varsP.sorted_file + ".bnx" is used; otherwise only
            the stats of this file are reported and outFileList is ignored.
    '''
    def _mean(values):
        # Average of a list; 0 for an empty list so no report line divides by zero.
        return (float(sum(values)) / len(values) if values else 0)

    # statonly (bnx stats only) is decided from the caller's argument BEFORE
    # bnxpath is defaulted below; the later decisions must reflect what the
    # caller passed, not the defaulted path.
    statonly = (bnxpath is not None)
    skipbnx = False  # .err/.xmap processing only
    if not statonly:
        if not varsP.sorted_file:  # empty for runAlignMol: no bnx to report on
            skipbnx = True
        else:
            bnxpath = varsP.sorted_file + ".bnx"  # sorted_file lacks the suffix

    if not skipbnx and not util.checkFile(bnxpath):
        varsP.updatePipeReport(
            "Warning in AlignModule.getAlignStats: bnxpath supplied but not found: %s\n" % bnxpath)
        return

    # Find the minlen used for bnx_sort. It only applies to the pipeline's own
    # sorted bnx; for a caller-supplied bnxpath, minlen is ignored (0).
    sortargs = []
    if 'bnx_sort' in varsP.argData:
        sortargs = varsP.argsListed('bnx_sort')
    minlen = 0
    if not statonly:
        validminlen = False
        if "-minlen" in sortargs:
            validx = sortargs.index("-minlen") + 1
            if validx < len(sortargs):  # value missing: treat as unparsable
                parsed = util.getIntFromString(sortargs[validx])  # None if not an int
                if parsed is not None:
                    minlen = parsed
                    validminlen = True
        if not validminlen and sortargs:
            varsP.updatePipeReport(
                "Warning in AlignModule.getAlignStats: unable to obtain minlen from bnx_sort arguments; defaulting to 0\n")

    nmol = 0    # total n mol above minlen
    totlen = 0  # total mol len above minlen
    if bnxpath is not None and util.checkFile(bnxpath):
        report = ["Reading molecule stats from %s:\n" % bnxpath,
                  "Molecule Stats:\n"]
        moldict = util.simpleBnxStats(bnxpath, minlen)
        nmol = moldict["nmol"]
        totlen = moldict["totlen"]
        report.append("N mols: %i\n" % nmol)
        report.append("Total len (Mb): %10.3f\n" % totlen)
        report.append("Avg len (kb) : %10.3f\n" % moldict["avglen"])
        report.append("Mol N50 (kb) : %10.3f\n" % moldict["n50"])
        report.append("Lab (/100kb) : %10.3f\n" % moldict["labdensity"])
        if reflen:
            cov = totlen / reflen  # totlen is reported in Mb above
            report.append("%-6s Cov (x): %10.3f\n" % ("Ref" if isref else "Contig", cov))
        if isref or reflen or statonly:  # otherwise nothing worth printing
            varsP.updateInfoReport("".join(report) + "\n", printalso=True)
    elif not skipbnx:
        varsP.updatePipeReport(
            "Warning in AlignModule.getAlignStats: missing bnx path:" + bnxpath + "\n")

    if statonly:
        return

    # Lastly, load the .xmap and .err files of the align jobs and report on them.
    totmaplen = 0     # summed mapped length on the reference
    totmapqrylen = 0  # summed mapped length on the query
    totconf = 0       # summed confidence of all alignments
    nalign = 0        # total number of alignments
    errs = []         # one alignParams object per .err file found

    mappref = ""
    if len(outFileList) > 0:
        mappref = getMergeFilename(outFileList[0])

    for outpath in outFileList:  # these are file prefixes
        if util.checkFile(outpath + ".xmap"):
            xmap = mc.xmap(outpath + ".xmap")
            nalign += len(xmap.xmapLookup)
            totmaplen += xmap.getSumMappedRefLen()     # in kb (per original comment)
            totmapqrylen += xmap.getSumMappedQryLen()  # in kb (per original comment)
            totconf += sum([x.Confidence for x in xmap.xmapLookup.values()])
        else:
            varsP.updatePipeReport(
                "Warning in AlignModule.getAlignStats: missing xmap:" + outpath + ".xmap" + "\n")
        if util.checkFile(outpath + ".err"):
            errs.append(mc.alignParams(outpath + ".err"))

    # The last .err loaded is reused as the container for the merged values.
    err = (errs[-1] if errs else None)

    # nalign from the xmaps should equal the goodmaps total from the .err files.
    sumgoodmaps = sum([e.goodmaps for e in errs])
    if sumgoodmaps != nalign:
        varsP.updateInfoReport(
            "Warning in getAlignStats: n mol align differ in .err files (%i) and .xmaps (%i)\n"
            % (sumgoodmaps, nalign), printalso=True)

    if totmaplen or totconf or nalign:
        report = ["Molecules Aligned to %s:\n" % ("Reference" if isref else "Assembly")]
        report.append("N mol align : %9i\n" % nalign)
        report.append("Mol fraction align: %13.3f\n" % (float(nalign) / nmol if nmol else 0))
        report.append("Tot align len (Mb): %11.1f\n" % (totmapqrylen / 1e3))
        if reflen > 0:
            report.append("Effective Cov (x) : %13.3f\n" % (totmaplen / 1e3 / reflen))
        report.append("Avg align len (kb): %11.1f\n"
                      % (float(totmapqrylen) / nalign if nalign else 0))
        # totmapqrylen is in kb, totlen is in Mb (per original comments)
        report.append("Fraction align len: %13.3f\n"
                      % (totmapqrylen / 1e3 / totlen if totlen else 0))
        report.append("Tot confidence : %11.1f\n" % totconf)
        report.append("Avg confidence : %11.1f\n" % (float(totconf) / nalign if nalign else 0))
        varsP.updateInfoReport("".join(report), printalso=True)

    def _avg(attr):
        # Mean of one alignParams attribute over all loaded .err files.
        return _mean([getattr(e, attr) for e in errs])

    avgfp = _avg("fp")
    avgfpr = _avg("fprate")
    avgfn = _avg("fn")
    avgbpp = _avg("bpp")
    avgsf = _avg("sf")
    avgsd = _avg("sd")
    avgsr = _avg("sr")

    if avgfp or avgfn or avgbpp:
        report = ["Avg FP(/100kb) : %12.2f\n" % avgfp,
                  "Avg FP ratio : %13.3f\n" % avgfpr,
                  "Avg FN ratio : %13.3f\n" % avgfn,
                  "Avg bpp : %11.1f\n" % avgbpp,
                  "Avg sf : %13.3f\n" % avgsf,
                  "Avg sd : %13.3f\n" % avgsd,
                  "Avg sr : %13.3f\n" % avgsr]
        varsP.updateInfoReport("".join(report) + "\n", printalso=True)

    if err and mergepath:  # have at least one .err (alignParams) object
        util.checkDir(mergepath)
        mrgstr = (varsP.alignMolvrefMergeName if varsP else "merge")
        outpath = os.path.join(mergepath, mappref + mrgstr + ".err")
        err.fp = avgfp
        err.fn = avgfn
        err.sf = avgsf
        err.sd = avgsd
        err.bpp = avgbpp
        err.res = _avg("res")
        err.nmaps = sum([e.nmaps for e in errs])
        err.llrm = _avg("llrm")
        err.goodmaps = sumgoodmaps
        err.llrgm = _avg("llrgm")
        err.bppsd = _avg("bppsd")
        err.fprate = avgfpr
        err.sr = avgsr
        err.ressd = _avg("ressd")
        err.writeToFile(outpath)