def mergeRcmaps(outFileList, outdir, varsP=None, splitByContig=None, stageName=""):
    """Merge the "_r.cmap" files named by outFileList and write the result to outdir.

    outFileList   -- list of file prefixes (to which "_r.cmap" is appended) or of
                     full paths that already end in "_r.cmap".  The list is sorted
                     in place, so the caller's list is reordered as a side effect.
    outdir        -- output directory; must pass util.checkDir.
    varsP         -- optional; handed to logOrPrintError with every message, and
                     supplies alignMolvrefMergeName / outputContigPrefix.  Without
                     it the merge name is "merge".
    splitByContig -- < 1: write each contig separately; == 1: write only the single
                     merged cmap; > 1: do both.  None (the default) is treated as 0.
    stageName     -- output file prefix.  If empty, varsP.outputContigPrefix is used
                     when varsP is supplied; otherwise the prefix is empty.

    Always returns None; every problem is reported as a warning through
    logOrPrintError and the function returns early.
    """
    if not util.checkDir(outdir):
        err_msg = "Warning in AlignModule.mergeRcmaps: could not make outdir %s, skipping copy number" % outdir
        logOrPrintError(err_msg, varsP)
        return

    if not outFileList:  # just an argument check--presence on disk is checked below
        err_msg = "Warning in AlignModule.mergeRcmaps: no maps supplied"
        logOrPrintError(err_msg, varsP)
        return

    # The default used to rely on Python 2 ordering None below every integer
    # (i.e. it behaved like 0); make that explicit so the comparisons below
    # never see None.
    if splitByContig is None:
        splitByContig = 0

    # for reproducibility with runAlignMerge.py (different order when listing dir)
    outFileList.sort()
    rsuf = "_r.cmap"

    # even though everything in outFileList should exist, a job may have
    # failed--check all, existence only
    present = []
    for outf in outFileList:
        target = outf if outf.endswith(rsuf) else outf + rsuf  # support either form
        if util.checkFile(target):
            present.append(target)
        else:
            err_msg = "Warning in AlignModule.mergeRcmaps: missing _r.cmap %s" % target
            logOrPrintError(err_msg, varsP)

    if not present:  # no _r.cmaps found on disk
        err_msg = "Warning in AlignModule.mergeRcmaps: no _r.cmaps found, skipping copy number"
        logOrPrintError(err_msg, varsP)
        return

    mrgstr = (varsP.alignMolvrefMergeName if varsP else "merge")

    # open the first map and accumulate the remaining ones into it, in memory
    mergedmap = mc.multiCmap(present[0])
    for rmap in present[1:]:  # don't add map 0 to itself
        if mergedmap.addCovOcc(mc.multiCmap(rmap)):  # a true return means failure
            err_msg = "Warning in AlignModule.mergeRcmaps: addCovOcc call failed for map %s" % rmap
            logOrPrintError(err_msg, varsP)

    # the merged map only exists in memory so far; write it back to disk
    filepref = (varsP.outputContigPrefix if varsP and stageName == "" else stageName)
    if splitByContig < 1 or splitByContig > 1:
        mergedmap.writeAllMapsToDisk(os.path.join(outdir, filepref + '_contig'), outsuf="_r")
        report = "mergeRcmaps: wrote %i cmaps" % len(mergedmap.cmapdict)
    if splitByContig > 0:
        mergedmap.writeToFile(os.path.join(outdir, filepref + "_" + mrgstr + rsuf))
        report = "mergeRcmaps: wrote merged cmap with %i contigs" % len(mergedmap.cmapdict)

    # one of the two branches above always runs, so report is always set
    logOrPrintError(report, varsP, warn=False)
def __init__(self, varsP):
    """Find the characterize output file root and set varsP.totAssemblyLenMb.

    varsP -- pipeline variables object.  Reads outputContigFolder,
             characterizeDirName and latestMergedCmap; writes totAssemblyLenMb.

    self.curCharacterizeFileRoots ends up holding the path (without the
    ".err" suffix) of the first ".err" file found in the characterize
    directory.  If that directory does not exist, or holds no ".err" file,
    the list stays empty and totAssemblyLenMb is NOT set.
    """
    self.curCharacterizeFileRoots = []
    self.varsP = varsP  # bc Characterize uses this for totAssemblyLenMb

    # varsP.curCharacterizeCmaps is deliberately not used: it is only set in
    # mergeIntoSingleCmap, which is not called on bypass.
    outdir = os.path.join(varsP.outputContigFolder, self.varsP.characterizeDirName)  # 'alignref'
    if not util.checkDir(outdir, makeIfNotExist=False):
        # if this doesn't exist, we can't get what we need
        return

    errsuf = ".err"
    outfile = None
    for qfile in os.listdir(outdir):
        if qfile.endswith(errsuf):  # just take the first .err file
            outfile = qfile
            break
    if not outfile:  # no .err files found, give up
        return

    # Strip only the trailing suffix; str.replace would also remove any
    # ".err" occurring earlier in the file name.
    outfile = os.path.join(outdir, outfile[:-len(errsuf)])
    self.curCharacterizeFileRoots.append(outfile)

    # also want to get varsP.totAssemblyLenMb
    self.varsP.totAssemblyLenMb = mapClasses.multiCmap(varsP.latestMergedCmap, lengthonly=True).totalLength / 1e6
def getAlignStats(self):
    """Report alignment statistics from the alignment job outputs, then merge and split them.

    Runs, in order: the module-level getAlignStats, mergeMap, mergeRcmaps,
    split_XMap_byContig_new and split_Qcmap_byContig_new, bracketed by
    pipeline-report and status-log entries for the "<stageName>_stats" stage.
    """
    vars_p = self.varsP
    vars_p.updatePipeReport(
        "Starting AlignModule Align Stats stage for %s\n" % self.stageName,
        printalso=True)
    util.LogStatus("progress", "stage_start", "%s_stats" % self.stageName)

    # Denominator in Mb: the reference length when aligning to the reference,
    # otherwise the assembly length stored on varsP (MapClassesRev stores it).
    if self.doref:
        length_mb = mc.multiCmap(vars_p.ref, lengthonly=True).totalLength / 1e6
    else:
        length_mb = vars_p.totAssemblyLenMb

    getAlignStats(vars_p, self.outFileList, length_mb,
                  isref=self.doref, mergepath=self.mergedir)
    mergeMap(vars_p, self.outFileList, mergepath=self.mergedir)

    # see mergeRcmaps for the meaning of the split mode
    if self.doref:
        split_mode = 2
        prefix = vars_p.alignMolvrefName
    else:
        split_mode = 0
        prefix = ""
    mergeRcmaps(self.outFileList, self.mergedir, vars_p, split_mode, prefix)

    xmap_by_contig = split_XMap_byContig_new(
        self.outFileList, self.mergedir, vars_p, prefix)
    split_Qcmap_byContig_new(
        self.outFileList, self.mergedir, xmap_by_contig, vars_p, prefix)

    vars_p.updatePipeReport(
        "Finished AlignModule Align Stats stage for %s\n" % self.stageName,
        printalso=True)
    util.LogStatus("progress", "stage_complete", "%s_stats" % self.stageName)
def getAlignStats(self):
    """Open output files of alignment jobs, report statistics, and merge/split the results.

    The stage is logged as "<stageName>_stats".  When self.doref is set the
    reference cmap length is the denominator and outputs are prefixed with
    varsP.alignMolvrefName; otherwise varsP.totAssemblyLenMb is used and the
    prefix is left empty.
    """
    stage = self.stageName
    start_msg = "Starting AlignModule Align Stats stage for %s\n" % stage
    self.varsP.updatePipeReport(start_msg, printalso=True)
    util.LogStatus("progress", "stage_start", "%s_stats" % stage)

    # MapClassesRev stores totAssemblyLenMb
    reflen = (mc.multiCmap(self.varsP.ref, lengthonly=True).totalLength / 1e6
              if self.doref else self.varsP.totAssemblyLenMb)

    getAlignStats(self.varsP, self.outFileList, reflen, isref=self.doref, mergepath=self.mergedir)
    mergeMap(self.varsP, self.outFileList, mergepath=self.mergedir)

    by_contig_mode = 2 if self.doref else 0  # see mergeRcmaps
    out_prefix = self.varsP.alignMolvrefName if self.doref else ""
    mergeRcmaps(self.outFileList, self.mergedir, self.varsP, by_contig_mode, out_prefix)

    contig_xmaps = split_XMap_byContig_new(self.outFileList, self.mergedir, self.varsP, out_prefix)
    split_Qcmap_byContig_new(self.outFileList, self.mergedir, contig_xmaps, self.varsP, out_prefix)

    end_msg = "Finished AlignModule Align Stats stage for %s\n" % self.stageName
    self.varsP.updatePipeReport(end_msg, printalso=True)
    util.LogStatus("progress", "stage_complete", "%s_stats" % self.stageName)
def __init__(self, varsP):
    """Record the characterize file root and the total assembly length.

    varsP -- pipeline variables object; outputContigFolder,
             characterizeDirName and latestMergedCmap are read, and
             totAssemblyLenMb is written.

    The first ".err" file listed in the characterize directory determines
    the single entry appended to self.curCharacterizeFileRoots (its path
    with the ".err" suffix removed).  When the directory is missing or has
    no ".err" file, the method returns early: the list stays empty and
    varsP.totAssemblyLenMb is left untouched.
    """
    self.curCharacterizeFileRoots = []
    self.varsP = varsP  # bc Characterize uses this for totAssemblyLenMb

    # Not based on varsP.curCharacterizeCmaps: that is set in
    # mergeIntoSingleCmap, which bypass never calls.
    outdir = os.path.join(varsP.outputContigFolder, self.varsP.characterizeDirName)  # 'alignref'
    if not util.checkDir(outdir, makeIfNotExist=False):
        return  # without this dir we can't get what we need

    suffix = ".err"
    # just take the first .err file in listing order
    errname = next((name for name in os.listdir(outdir) if name.endswith(suffix)), None)
    if not errname:
        return  # no .err files found, give up

    # Drop the suffix only; replace(".err", "") would also eat a ".err"
    # appearing in the middle of the name.
    root = os.path.join(outdir, errname[:-len(suffix)])
    self.curCharacterizeFileRoots.append(root)

    # also want to get varsP.totAssemblyLenMb
    merged = mapClasses.multiCmap(varsP.latestMergedCmap, lengthonly=True)
    self.varsP.totAssemblyLenMb = merged.totalLength / 1e6
def characterizeContigs(varsP, xmappath=None):
    """Log simple contig stats, and optionally align stats from xmappath.

    varsP    -- pipeline variables object.  Reads ref and latestMergedCmap;
                sets totAssemblyLenMb to the total map length scaled by 1e-6.
    xmappath -- optional path to an existing .xmap.  No alignment is run
                here; the file is only read.  If it fails util.checkFile the
                alignment totals are reported as zero.

    Returns the summary as one string.  If ".hmap" files are found next to
    latestMergedCmap the diploid/haploid layout is used, otherwise the
    original single-section layout.
    """
    unitscale = 1e-6  # map lengths are in bases; the summary is printed in Mb
    dorefalign = bool(xmappath)  # RefAligner is never called here--this only uses xmappath
    haveref = bool(varsP.ref)

    # reflen is a denominator in the summary table: never let it be zero, and
    # never bail out on a bad reference (the contig summary is still wanted).
    try:
        reflen = mapClasses.multiCmap(varsP.ref, lengthonly=True).totalLength  # total of _all_ contigs
        if reflen <= 0:
            reflen = 1.
    except Exception:
        reflen = 1.

    # .hmaps in the same dir as latestMergedCmap switch on the haplotype layout
    hmaps = util.getListOfFilesFromDir(os.path.dirname(varsP.latestMergedCmap), ".hmap")
    haplotype = (len(hmaps) > 0)

    hapcontiglens = []
    contiglens = []  # lens of all contigs in bases
    totcontiglen = 0
    totalignlen = 0
    totalignqlen = 0
    nmapcontigs = 0
    # reference contig id -> list of [RefStart, RefStop]; kept per reference
    # contig so overlaps are only merged within the same reference contig
    uniqueseg = {}

    for cpath in [varsP.latestMergedCmap]:  # always use contigpaths
        mapi = mapClasses.multiCmap(cpath)
        totcontiglen += mapi.totalLength
        contiglens += mapi.getAllMapLengths()
        if haplotype:
            hapcontiglens.extend(mapi.getHaplotypeMapLengths())

        allids = set(mapi.getAllMapIds())  # contig ids in this cmap
        alignedids = set()  # those ids which also appear as a query in the xmap

        xmapobj = mapClasses.xmap()  # empty xmap, so the loop below is a no-op by default
        if dorefalign and util.checkFile(xmappath, ".xmap"):
            xmapobj = mapClasses.xmap(xmappath)

        for xmapentry in xmapobj.xmapLookup.values():
            # don't print the lengths for each contig--just total them
            totalignlen += xmapentry.getMappedRefLen()
            totalignqlen += xmapentry.getMappedQryLen()
            uniqueseg.setdefault(xmapentry.contigRef, []).append(
                [xmapentry.RefStart, xmapentry.RefStop])
            if xmapentry.contigQry in allids:
                alignedids.add(xmapentry.contigQry)

        nmapcontigs += len(alignedids)  # sum over multiple cmaps

    varsP.totAssemblyLenMb = totcontiglen * unitscale
    ncontigs = len(contiglens)  # contiglens holds all contigs
    avgcontiglen = (float(totcontiglen) / ncontigs if ncontigs > 0 else 0)

    if unitscale > 1e-6:  # not megabases
        fstr = "%9.0f"
    else:  # megabases
        fstr = "%8.3f"

    out = []
    if haplotype:  # new format for haplotype
        # diploid section: same numbers as the default format, different names
        out.append("Diploid N Genome Maps: %i\n" % ncontigs)
        out.append(("Diploid Genome Map Len (Mb): " + fstr + "\n") % (totcontiglen * unitscale))
        out.append(("Diploid Avg. Genome Map Len (Mb): " + fstr + "\n") % (avgcontiglen * unitscale))
        out.append(("Diploid Median Genome Map Len (Mb): " + fstr + "\n") % (util.getMedian(contiglens) * unitscale))
        out.append(("Diploid Genome Map n50 (Mb): " + fstr + "\n") % (util.getn50(contiglens) * unitscale))
        # haploid section: computed from the list hapcontiglens only
        nhap = len(hapcontiglens)
        tot = sum(hapcontiglens)
        avg = (float(tot) / nhap if nhap else 0)  # float(): same as the diploid average
        out.append("Haploid N Genome Maps: %i\n" % nhap)
        out.append(("Haploid Genome Map Len (Mb): " + fstr + "\n") % (tot * unitscale))
        out.append(("Haploid Avg. Genome Map Len (Mb): " + fstr + "\n") % (avg * unitscale))
        out.append(("Haploid Median Genome Map Len (Mb): " + fstr + "\n") % (util.getMedian(hapcontiglens) * unitscale))
        out.append(("Haploid Genome Map n50 (Mb): " + fstr + "\n") % (util.getn50(hapcontiglens) * unitscale))
    else:  # default to old format
        out.append("N Genome Maps: %i\n" % ncontigs)
        out.append(("Total Genome Map Len (Mb): " + fstr + "\n") % (totcontiglen * unitscale))
        out.append(("Avg. Genome Map Len (Mb): " + fstr + "\n") % (avgcontiglen * unitscale))
        out.append(("Median Genome Map Len (Mb): " + fstr + "\n") % (util.getMedian(contiglens) * unitscale))
        out.append(("Genome Map n50 (Mb): " + fstr + "\n") % (util.getn50(contiglens) * unitscale))

    if haveref:
        out.append(("Total Ref Len (Mb): " + fstr + "\n") % (reflen * unitscale))
        out.append(("Total Genome Map Len / Ref Len : " + fstr + "\n") % (totcontiglen / reflen))

    if dorefalign:
        ratio = (float(nmapcontigs) / ncontigs if ncontigs > 0 else 0)
        out.append(("N Genome Maps total align : %i (%.2f)\n") % (nmapcontigs, ratio))
        out.append(("Total Aligned Len (Mb) : " + fstr + "\n") % (totalignlen * unitscale))
        out.append(("Total Aligned Len / Ref Len : " + fstr + "\n") % (totalignlen / reflen))
        uniquelen = 0
        for segs in uniqueseg.values():  # need to sum over the dict entries
            util.uniqueRange(segs)  # this modifies the list in place
            uniquelen += util.totalLengthFromRanges(segs)
        out.append(("Total Unique Aligned Len (Mb) : " + fstr + "\n") % (uniquelen * unitscale))
        out.append(("Total Unique Aligned Len / Ref Len: " + fstr + "\n") % (uniquelen / reflen))

    return "".join(out)
def characterizeContigs(varsP, xmappath=None, listcontigs=False):
    """Log simple contig stats, and optionally align stats from xmappath.

    varsP       -- pipeline variables object.  Reads ref, contigAlignTarget and
                   latestMergedCmap; sets totAssemblyLenMb to the total map
                   length scaled by 1e-6.
    xmappath    -- optional path to an existing xmap.  No alignment is run
                   here; the file is only read, and only if it exists.
    listcontigs -- if true, additionally emit one row per alignment (with a
                   header when xmappath is given) and one row per contig that
                   has no alignment.

    Returns the whole report as one string.
    """
    unitscale = 1e-6  # map lengths are in bases; the summary is printed in Mb
    dorefalign = bool(xmappath)  # RefAligner is never called here--this only uses xmappath
    printrange = False  # hard-coded switch: add Qry/Ref start/stop columns
    iscluster = True  # hard-coded switch: suppresses the AVG row
    haveref = bool(varsP.ref)
    aligndir = varsP.contigAlignTarget

    # reflen is a denominator in the summary table: never let it be zero, and
    # never bail out on a bad reference (the contig summary is still wanted).
    try:
        reflen = mapClasses.multiCmap(varsP.ref, lengthonly=True).totalLength  # total of _all_ contigs
        if reflen <= 0:
            reflen = 1.
    except Exception:
        reflen = 1.

    out = []  # report fragments, joined once at the end

    # header of the per-contig table
    if listcontigs and dorefalign:
        out.append("cID len")
        out.append(" Cov")
        out.append(" alignlen alignlen/len")
        if printrange:
            out.append(" Qry1 Qry2 Ref1 Ref2")
        out.append(" Conf Conf/lenkb")
        out.append(" FP FN sf sd bpp")
        out.append("\n")

    totcontiglen = 0
    totalignlen = 0
    totalignqlen = 0
    nmapcontigs = 0
    contiglens = []  # lens of all contigs in bases
    # reference contig id -> list of [RefStart, RefStop]; kept per reference
    # contig so overlaps are only merged within the same reference contig
    uniqueseg = {}
    avgfplist = []  # FP/FN rates, one entry per listed alignment
    avgfnlist = []
    refidstr = ""  # placeholder for the (disabled) reference-id column

    for cpath in [varsP.latestMergedCmap]:  # always use contigpaths
        mapi = mapClasses.multiCmap(cpath)
        totcontiglen += mapi.totalLength
        contiglens += mapi.getAllMapLengths()

        # Start from all contig ids in this cmap and remove each id found in
        # the xmap; what remains are the contigs without an alignment.
        mapids = list(mapi.getAllMapIds())
        ncontigs = len(mapids)  # contigs in this file only (see below)

        xmapobj = mapClasses.xmap()  # empty xmap, so the loop below is a no-op by default
        if dorefalign and os.path.isfile(xmappath):
            xmapobj = mapClasses.xmap(xmappath)

        # The arguments to getMappedErrStats do not change inside the loop, so
        # it is called at most once per cmap, on first use.
        alignpars = None

        for xmapentry in xmapobj.xmapLookup.values():
            # lengths are totalled for the summary; printed only when listing
            lenr = xmapentry.getMappedRefLen()
            lenq = xmapentry.getMappedQryLen()
            refid = xmapentry.contigRef

            if listcontigs:
                # getMapLength returns 0 on any failure; it is a denominator here
                contiglen = mapi.getMapLength(xmapentry.contigQry)
                if contiglen <= 0:
                    contiglen = 1.
                contigcov = mapi.getMapAvgCoverage(xmapentry.contigQry)
                conf = xmapentry.Confidence  # also reported per kb of aligned query
                if alignpars is None:
                    alignpars = getMappedErrStats(aligndir, cpath)
                avgfplist.append(alignpars.fp)
                avgfnlist.append(alignpars.fn)

                out.append("%5i" % xmapentry.contigQry)
                out.append(" %9.1f %2i" % (contiglen, contigcov))
                out.append("%s %9.1f %.3f" % (refidstr, lenq, float(lenq) / contiglen))
                if printrange:
                    # NOTE(review): pn is not defined anywhere in this function;
                    # presumably a module-level scale factor--confirm before
                    # turning printrange on.
                    out.append(" %5.0f %5.0f %5.0f %5.0f" % (
                        xmapentry.QryStart / pn, xmapentry.QryStop / pn,
                        xmapentry.RefStart / pn, xmapentry.RefStop / pn))
                confperkb = (conf * 1000. / lenq if lenq else 0.)  # 1000 is for kb
                out.append(" %3.0f %5.3f" % (conf, confperkb))
                # NOTE(review): no newline is added after this row here; verify
                # that getParamString() ends the line.
                out.append(" " + alignpars.getParamString())

            totalignlen += lenr
            totalignqlen += lenq
            uniqueseg.setdefault(refid, []).append([xmapentry.RefStart, xmapentry.RefStop])

            if xmapentry.contigQry in mapids:
                mapids.remove(xmapentry.contigQry)
        # end loop on xmap entries

        # every contig with an alignment has been removed from mapids
        nmapcontigs += ncontigs - len(mapids)  # sum over multiple cmaps

        # rows for the contigs which don't align--just id, length, and coverage
        if listcontigs:
            for ids in mapids:
                contiglen = mapi.getMapLength(ids)  # 0 is ok: never a denominator here
                contigcov = mapi.getMapAvgCoverage(ids)
                out.append("%5i" % ids)
                out.append(" %9.1f %2i\n" % (contiglen, contigcov))
    # end loop on contigs

    varsP.totAssemblyLenMb = totcontiglen * unitscale
    ncontigs = len(contiglens)  # contiglens holds all contigs
    avgcontiglen = (float(totcontiglen) / ncontigs if ncontigs > 0 else 0)

    # averages row: only if not merged, otherwise there is just one noise parameter set
    if listcontigs and not iscluster:
        avgfp = (sum(avgfplist) / len(avgfplist) if avgfplist else 0)
        avgfn = (sum(avgfnlist) / len(avgfnlist) if avgfnlist else 0)
        avgalignq = (float(totalignqlen) / nmapcontigs if nmapcontigs > 0 else 0)
        out.append("AVG %9.1f %9.1f %5.3f %5.3f\n" % (avgcontiglen, avgalignq, avgfp, avgfn))

    if unitscale > 1e-6:  # not megabases
        fstr = "%9.0f"
    else:  # megabases
        fstr = "%8.3f"

    out.append("N Genome Maps: %i\n" % ncontigs)
    out.append(("Total Genome Map Len (Mb): " + fstr + "\n") % (totcontiglen * unitscale))
    out.append(("Avg. Genome Map Len (Mb): " + fstr + "\n") % (avgcontiglen * unitscale))
    out.append(("Median Genome Map Len(Mb): " + fstr + "\n") % (util.getMedian(contiglens) * unitscale))
    out.append(("Genome Map n50 (Mb): " + fstr + "\n") % (util.getn50(contiglens) * unitscale))

    if haveref:
        out.append(("Total Ref Len (Mb): " + fstr + "\n") % (reflen * unitscale))
        out.append(("Total Genome Map Len / Ref Len : " + fstr + "\n") % (totcontiglen / reflen))

    if dorefalign:
        ratio = (float(nmapcontigs) / ncontigs if ncontigs > 0 else 0)
        out.append(("N Genome Maps total align : %i (%.2f)\n") % (nmapcontigs, ratio))
        out.append(("Total Aligned Len (Mb) : " + fstr + "\n") % (totalignlen * unitscale))
        out.append(("Total Aligned Len / Ref Len : " + fstr + "\n") % (totalignlen / reflen))
        uniquelen = 0
        for segs in uniqueseg.values():  # need to sum over the dict entries
            util.uniqueRange(segs)  # this modifies the list in place
            uniquelen += util.totalLengthFromRanges(segs)
        out.append(("Total Unique Aligned Len (Mb) : " + fstr + "\n") % (uniquelen * unitscale))
        out.append(("Total Unique Aligned Len / Ref Len: " + fstr + "\n") % (uniquelen / reflen))

    return "".join(out)
def mergeRcmaps(outFileList, outdir, varsP=None, splitByContig=None, stageName=""):
    """Given a list of file prefixes (outFileList), append "_r.cmap" to them, and merge them to outdir.

    Entries of outFileList that already end in "_r.cmap" are used as-is, so
    full paths are supported too.  outFileList is sorted in place (the
    caller's list is reordered).  Entries missing on disk are reported and
    skipped.

    varsP is passed to logOrPrintError together with every message; the
    original contract is: report to varsP if supplied, stdout if not.

    splitByContig selects the output: < 1 writes each contig separately,
    == 1 writes only the single merged cmap, > 1 does both.  None (the
    default) is handled as 0.

    The output prefix is stageName if supplied; if not, varsP must be
    supplied (its outputContigPrefix is used), otherwise the prefix is empty.

    Returns None.
    """
    if not util.checkDir(outdir):
        logOrPrintError(
            "Warning in AlignModule.mergeRcmaps: could not make outdir %s, skipping copy number" % outdir,
            varsP)
        return
    if not outFileList:  # argument check only--presence on disk is checked below
        logOrPrintError("Warning in AlignModule.mergeRcmaps: no maps supplied", varsP)
        return

    # Comparing None with an int only worked through Python 2's arbitrary
    # ordering (None sorts below all numbers, i.e. like 0) and is a TypeError
    # on Python 3; normalise once so both comparisons below are well defined.
    splitmode = 0 if splitByContig is None else splitByContig

    outFileList.sort()  # for reproducibility with runAlignMerge.py (different order when listing dir)
    rsuf = "_r.cmap"

    # A job may have failed, so check that each expected file exists.
    targets = [(outf if outf.endswith(rsuf) else outf + rsuf) for outf in outFileList]
    present = []
    for target in targets:
        if not util.checkFile(target):
            logOrPrintError("Warning in AlignModule.mergeRcmaps: missing _r.cmap %s" % target, varsP)
            continue
        present.append(target)
    if not present:  # no _r.cmaps found
        logOrPrintError("Warning in AlignModule.mergeRcmaps: no _r.cmaps found, skipping copy number", varsP)
        return

    mrgstr = (varsP.alignMolvrefMergeName if varsP else "merge")

    # Open the first map and fold the others into it, in memory.
    mergedmap = mc.multiCmap(present[0])
    for rmap in present[1:]:  # don't add map 0 to itself
        failed = mergedmap.addCovOcc(mc.multiCmap(rmap))  # true return means failure
        if failed:
            logOrPrintError("Warning in AlignModule.mergeRcmaps: addCovOcc call failed for map %s" % rmap, varsP)

    # Now it's merged; the resulting map still needs to be written to disk.
    if varsP and stageName == "":
        filepref = varsP.outputContigPrefix
    else:
        filepref = stageName

    ncmaps = len(mergedmap.cmapdict)
    if splitmode < 1 or splitmode > 1:
        mergedmap.writeAllMapsToDisk(os.path.join(outdir, filepref + '_contig'), outsuf="_r")
        report = "mergeRcmaps: wrote %i cmaps" % ncmaps
    if splitmode > 0:
        mergedmap.writeToFile(os.path.join(outdir, filepref + "_" + mrgstr + rsuf))
        report = "mergeRcmaps: wrote merged cmap with %i contigs" % ncmaps

    # report result (one of the two branches above always runs)
    logOrPrintError(report, varsP, warn=False)
def runAlignMol(): parser = argparse.ArgumentParser(description=description) parser.add_argument( '-q', dest='queryDir', help= 'Path to merged cmap to align molecules (-b) to OR alignmol dir from Pipeline for merge (if latter, no alignments are performed), required', type=str) parser.add_argument( '-b', dest='bnx', help='Input molecule (.bnx) file, required if aligning molecules', type=str) #parser.add_argument('-b', dest='bnx', help='Input molecule (.bnx) file OR path to dir containing split bnx pieces, required if aligning molecules', type=str) #I should add the split feature; for now, just do single bnx parser.add_argument( '-a', dest='optArguments', help= 'Path to optArguments.xml (optional, default optArguments_human.xml in Pipeline dir if found, otherwise required)', default="", type=str) parser.add_argument( '-r', help= 'If this flag is used, alignmolvref arguments are used, otherwise alignmol arguments are used (default alignmol; optional)', dest='ref', action='store_true') parser.add_argument( '-o', dest='outputDir', help= 'output dir (optional, defaults to sub-dir of input map dir called "alignmol")', default="", type=str) parser.add_argument( '-t', dest='RefAligner', help='Path to RefAligner or dir containing it (required)', type=str) parser.add_argument( '-T', dest='numThreads', help='Total number of threads (cores) to use (optional, default 4)', default=4, type=int) parser.add_argument( '-j', dest='maxthreads', help= 'Threads per Job, -maxthreads (non-cluster only;optional, default 4)', default=4, type=int) parser.add_argument( '-e', dest='errFile', help= '.err file to use for noise parameters--will supersede noise parameters in the optArgument supplied (but that file must still be supplied for non-noise parameters)--should be from autoNoise', default="", type=str) parser.add_argument( '-E', dest='errbinFile', help= '.errbin file to use for noise parameters--will supersede noise parameters in the optArgument supplied (but that file must still be 
supplied for non-noise parameters)--should be from autoNoise', default="", type=str) parser.add_argument( '-p', dest='pipelineDir', help= 'Pipeline dir (optional, defaults to script dir, or current directory)', default="", type=str) result = parser.parse_args() outprefix = "exp_refineFinal1" #this is the default; assume for now #check all Pipeline dependencies if result.pipelineDir: cwd = result.pipelineDir else: cwd = os.path.split( os.path.realpath(__file__))[0] #this is path of this script if not os.path.isfile(os.path.join( cwd, "utilities.py")): #if still not here, last try is actual cwd cwd = os.getcwd() #still check this below #this is the only one imported here and in runCharacterize if not os.path.isfile(os.path.join(cwd, "utilities.py")): print "ERROR: utilities.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir" sys.exit(1) import utilities as util if not os.path.isfile(os.path.join(cwd, "AlignModule.py")): print "ERROR: AlignModule.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir" sys.exit(1) import AlignModule as alignmod if not util.checkFile(os.path.join(cwd, "Pipeline.py")): print "ERROR: Pipeline.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir" sys.exit(1) import Pipeline if not util.checkFile(os.path.join(cwd, "mapClasses.py")): print "ERROR: mapClasses.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir" sys.exit(1) import mapClasses as mc #input dir if not result.queryDir: print "ERROR: Query (-q) argument not supplied." sys.exit(1) qrypath = os.path.realpath(result.queryDir) if util.checkDir( qrypath, checkWritable=False, makeIfNotExist=False): #output elsewhere so not writeable is ok runaligns = False elif util.checkCmap(qrypath): runaligns = True else: print "ERROR: Query argument (" + qrypath + ") not found or not a dir or cmap. Check -q argument." 
sys.exit(1) #this check isn't really necessary...make it a warning -- left over from runAlignMerge.py #if not os.path.split(qrypath)[1].endswith("alignmol") : # print "Warning: Query dir ("+qrypath+") does not end with 'alignmol'; please be sure this is a Pipeline alignmol dir\n" #RefAligner -- check for either path to RefAligner, or dir containing it, depending on cluster args rabin = "" #need empty string for generateJobList even though no jobs are run if runaligns: rabin = result.RefAligner #replicate Pipeline behavior: RefAligner is always required if os.path.isdir(rabin): rabin = os.path.join(rabin, "RefAligner") if not util.checkExecutable(rabin): print "ERROR: RefAligner not found or not executable at", rabin, "\nPlease supply RefAligner dir or full path as -t arg." sys.exit(1) #optargs file optargs = None if runaligns and result.optArguments: #supplied on command line optargs = result.optArguments if not util.checkFile(optargs, ".xml"): print "optArguments path is supplied (" + optargs + ") but not found or doesn't end in .xml, check -a argument." sys.exit(1) elif runaligns: #load from Pipeline dir if running alignments optargs = os.path.join(cwd, "optArguments_human.xml") if not util.checkFile(optargs): print "optArguments.xml missing in Pipeline directory (" + cwd + "). Try supplying path explicitly using -a." 
sys.exit(1) #output dir if not result.outputDir: outdir = os.path.join(qrypath, "merge") #should be same as in AlignModule else: outdir = os.path.realpath(result.outputDir) if os.path.isdir(outdir): if not util.checkDir(outdir): #check writeable print "\nERROR: Output dir is not writeable:\n", outdir, "\n" sys.exit(1) #this is ok here #elif outdir == contigdir : # print "\nERROR: Output dir cannot be same as input dir:\n", outdir, "\n" # sys.exit(1) print "\nWARNING: Output dir already exists, results will be overwritten:\n", outdir, "\n" elif not util.checkDir( outdir ): #does not exist, make, if False, can't make or not writeable print "\nERROR: Output dir cannot be created or is not writeable:\n", outdir, "\n" sys.exit(1) #bnx file bnxfile = result.bnx if bnxfile: #must check for empty string BEFORE you do realpath, or it returns cwd bnxfile = os.path.realpath(bnxfile) if not util.checkFile(bnxfile, ".bnx"): print "ERROR: bnx file supplied but not found or incorrect suffix:", bnxfile sys.exit(1) elif runaligns: print "ERROR: bnx file not supplied but running alignments; please supply bnx file as -b argument" sys.exit(1) #nthreads nthreads = result.numThreads if nthreads <= 0: print "ERROR: Number of threads value invalid (must be > 0): %i" % nthreads sys.exit(1) #maxthreads maxthreads = result.maxthreads if maxthreads <= 0: print "ERROR: Max threads value invalid (must be > 0): %i" % maxthreads sys.exit(1) elif nthreads < maxthreads: print "Warning: num threads (-T: %i) < max threads (-j: %i): increasing num threads to equal max threads\n" % ( nthreads, maxthreads) nthreads = maxthreads #.errbin file errbinfile = result.errbinFile if errbinfile: errbinfile = os.path.realpath(result.errbinFile) if not util.checkFile(errbinfile, ".errbin"): print "ERROR: errbin file supplied but not found or incorrect suffix:", errbinfile sys.exit(1) #.err file errfile = result.errFile if errfile and errbinfile: print "Warning: .err and .errbin arguments supplied; ignoring .err 
file" errfile = "" elif errfile: errfile = os.path.realpath(result.errFile) if not util.checkFile(errfile, ".err"): print "err file supplied but not found or incorrect suffix:", errfile sys.exit(1) if errfile and not util.checkFile(os.path.join(cwd, "SampleCharModule.py")): print "SampleCharModule.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir" sys.exit(1) elif errfile: import SampleCharModule as scm doref = result.ref #DONE checking arguments print "Using output dir", outdir if runaligns: print "Aligning", bnxfile, "\nTo", qrypath, "\n" else: print "Merging", qrypath, "\n" startTime = time.time() #time since Epoch memory_log = os.path.join(outdir, "memory_log.txt") util.initMemoryLog(memory_log) varsP = Pipeline.varsPipeline() varsP.RefAlignerBin = rabin varsP.contigFolder = "" #not used but needs to be an attr varsP.outputContigFolder = "" #not used but needs to be a string attr varsP.pipeReportFile = os.path.join(outdir, "alignmol_jobs_log.txt") varsP.infoReportFile = os.path.join(outdir, "alignmol_log.txt") util.InitStatus(os.path.join(outdir, "status.xml")) if runaligns: varsP.optArgumentsFileIn = optargs varsP.latestMergedCmap = qrypath #if !doref, need this one varsP.ref = qrypath #and if doref, need this one varsP.nThreads = nthreads #necessary otherwise job won't start -- max threads per node varsP.maxthreads = maxthreads #threads per job p = os.path.split(qrypath)[1] varsP.outputContigPrefix = p[:p.rfind(".")] #filename prefix varsP.stdoutlog = True #use -stdout -stderr varsP.sorted_file = bnxfile[:bnxfile.rfind( ".")] #enables the mol fraction align in AlignModule.getAlignStats if qrypath.endswith(".cmap"): #enable the mol stats varsP.totAssemblyLenMb = mc.multiCmap( qrypath, lengthonly=True).totalLength / 1e6 varsP.memoryLogpath = os.path.join(outdir, "memory_log.txt") varsP.parseArguments() #parses optArgumentsFile varsP.checkDependencies() varsP.RefAlignerBinOrig = rabin varsP.prerunLog( ) #general information in log 
-- needed for refaligner_version noisep = {} if errbinfile: noisep = {"readparameters": errbinfile} #print "Using noise parameters from "+errbinfile+"\n" #move below elif errfile: noisep = scm.readNoiseParameters(errfile.replace(".err", "")) if noisep.has_key( 'readparameters' ): #remove this because it's redundant, and it can cause problems with RefAligner compatibility del noisep['readparameters'] if not noisep: #readNoiseParameters returns empty dict on failure print "ERROR reading noise parameters, check .err file:", errfile sys.exit(1) #redundant with below? print "Using noise parameters from " + errfile + ":\n" + " ".join( ["-" + str(k) + " " + str(v) for k, v in noisep.iteritems()]) + "\n" #some code from SampleCharModule to load args into noise0 infoReport = "Loaded noise parameters:\n" klist = [ "FP", "FN", "sf", "sd", "sr", "bpp", "readparameters" ] #hardcoding parameters is kind of bad, but it fixes the order without using OrderedDict. #noiseargs = self.varsP.argsListed('noise0') #not necessary for v in klist: if not noisep.has_key(v): continue param = str(noisep[v]) util.LogStatus("parameter", "auto_" + v, param) infoReport += v + ":" + param + "\n" varsP.replaceParam("noise0", "-" + v, param) varsP.updateInfoReport(infoReport + '\n', printalso=True) else: print "Getting file list from", qrypath outFileList = getOutFileList(util, qrypath) if not outFileList: print "ERROR: Query dir (" + qrypath + ") does not contain alignmol data. Check -q argument." 
sys.exit(1) else: print "Found", len(outFileList), "alignment results" #end if runaligns amod = alignmod.AlignModule( varsP, doref, outdir, bnxfile) #constructor will call generateJobList if runaligns: amod.runJobs() amod.checkResults() else: amod.outFileList = outFileList p = os.path.split(outFileList[0])[1] if p.count("_") > 1: #expect something like "EXP_REFINEFINAL1_4" #p = p[:p.rfind("_")+1] #remove integer suffix p = p[:p.rfind("_")] #remove integer suffix (and underscore) #else : # p += "_" #because mrgstr is appended varsP.outputContigPrefix = p if not runaligns or len(amod.jobList) > 0: amod.getAlignStats() if runaligns: print #copy from Pipeline.py if util.SummarizeErrors(varsP=varsP) == 0: varsP.updatePipeReport("Pipeline has successfully completed\n") util.LogStatus("progress", "pipeline", "success") else: varsP.updatePipeReport("Pipeline has completed with errors\n") util.LogStatus("progress", "pipeline", "failure") #BELOW OLD CODE return #in Pipeline, this is called first #print "Calling getAlignStats:" #but it won't work without varsP atm; skip it #getAlignStats(self.varsP, self.outFileList, self.varsP.totAssemblyLenMb, isref=False, mergepath=self.mergedir) #getAlignStats(self.varsP, self.outFileList, self.varsP.totAssemblyLenMb, isref=False, mergepath=self.mergedir) print "Calling mergeMap" print outFileList[0] #, "\n", outputdir #moved above util.logMemory(memory_log, startTime, "mergeMap_start") #mergeMap(self.varsP, self.outFileList, mergepath=self.outputdir) #varsP is optional alignmod.mergeMap(None, outFileList, outputdir) util.logMemory(memory_log, startTime, "mergeMap_end") print "Calling mergeRcmaps" util.logMemory(memory_log, startTime, "mergeRcmaps_start") #mergeRcmaps(outFileList, outdir, varsP=None, splitByContig=None, stageName="alignmol") : alignmod.mergeRcmaps(outFileList, outputdir, splitByContig=True, stageName=outprefix) util.logMemory(memory_log, startTime, "mergeRcmaps_end") print "Calling split_XMap_byContig" 
#split_XMapQcmap_byContig" util.logMemory(memory_log, startTime, "split_XMap_byContig_start") #xmapdict = alignmod.split_XMap_byContig(outFileList, outputdir, stageName=outprefix) #old xmapdict = alignmod.split_XMap_byContig_new(outFileList, outputdir, stageName=outprefix) util.logMemory(memory_log, startTime, "split_XMap_byContig_end") print "Calling split_Qcmap_byContig" util.logMemory(memory_log, startTime, "split_Qcmap_byContig_start") #alignmod.split_Qcmap_byContig(outFileList, outputdir, xmapdict) #old alignmod.split_Qcmap_byContig_new( outFileList, outputdir, xmapdict, stageName=outprefix) #new: better performance util.logMemory(memory_log, startTime, "split_Qcmap_byContig_end") print "AlignMerge successfully completed"