def __init__(self, varsP):
    """Run the SortBNX stage: sort the input bnx ahead of splitting.

    The sorted bnx is meant for the splitBNX class and, eventually, the
    Pairwise class. This constructor itself calls varsP.runJobs and
    doAllPipeReport.

    varsP -- pipeline variables object. varsP.sorted_file is set here to
             varsP.bnxFile with ".bnx" replaced by "_sorted" (no suffix).

    Raises RuntimeError (after reporting and logging the failure) when the
    stage is skipped but the sorted bnx is absent, or when the jobs do not
    yield all expected results.
    """
    self.stageName = "SortBNX"
    self.varsP = varsP

    def abortPipeline(message):
        #report, log as critical, flag the pipeline as failed, then raise
        self.varsP.updatePipeReport(message + "\n")
        util.LogError("critical", message)
        util.LogStatus("progress", "pipeline", "failure")
        raise RuntimeError

    self.varsP.sorted_file = self.varsP.bnxFile.replace(".bnx", "_sorted")
    #must use the unsorted input: the sorted file is not yet made
    checkMinMol(self.varsP, self.varsP.bnxFile)

    skipStage = self.generateJobList()  #0 for success, 1 for skip
    if skipStage:
        #skipping (bypass) is only valid if a sorted bnx already exists
        sortedBnx = self.varsP.sorted_file + ".bnx"
        if util.checkFile(sortedBnx):
            return
        abortPipeline("ERROR: no sorted bnx file found (%s) (check bypass (-B) argument to Pipeline)" % sortedBnx)

    #logged only once we know the stage is really executed
    util.LogStatus("progress", "stage_start", self.stageName)
    self.varsP.runJobs(self, "SortBNX")
    self.doAllPipeReport()
    if not self.allResultsFound():
        abortPipeline("ERROR: sortBNX failed. Check: " + self.varsP.bnxFile)
    util.LogStatus("progress", "stage_complete", self.stageName)
def readCmapFile(self, cmapFile):
    """Load the maps of a tab-separated .cmap file into self.cmapDB.

    Each non-comment line is split on tabs. Column 0 is the map id, column 1
    the map length, column 2 the number of sites, column 3 the site id,
    column 5 the site location and column 7 the coverage. A new Cmap is
    started on the first data line and again after a line whose site id
    exceeds the number of sites of the current map (that line itself is not
    added as a site).

    cmapFile -- path of the file to read. If util.checkFile rejects it, an
                error is printed and nothing is loaded.
    """
    if not util.checkFile(cmapFile):
        print("Error in MapClassesRev.MultiCmap.readCmapFile: missing file %s" % (cmapFile,))
        return
    newCmap = True
    #'with' guarantees the handle is closed, also when a line fails to parse
    with open(cmapFile) as f1:
        for line in f1:
            if line[0] == '#':
                continue
            if not line.strip():  #tolerate blank lines instead of failing in int()
                continue
            tokens = line.split('\t')
            if newCmap:
                cmapID = int(tokens[0])
                cmapLen = float(tokens[1])
                curCmap = Cmap(cmapID, cmapLen)
                self.cmapDB[cmapID] = curCmap
                nSites = int(tokens[2])
                newCmap = False
            siteID = int(tokens[3])
            if siteID > nSites:
                #past the last real site: the next data line starts a new map
                newCmap = True
                continue
            siteLoc = float(tokens[5])
            covg = float(tokens[7])
            curCmap.addSite(siteID, siteLoc, covg)
def mergeRcmaps(outFileList, outdir, varsP=None, splitByContig=None, stageName=""):
    """Given a list of file prefixes (outFileList), append "_r.cmap" to them, and merge them to outdir.

    Report to varsP if supplied, stdout if not (via logOrPrintError).
    Entries of outFileList may also be full paths already ending in "_r.cmap".
    outFileList is sorted in place.

    splitByContig -- if < 1, output each contig separately; if == 1, only
                     output the single merged cmap; if > 1, do both.
                     None (the default) is treated as 0.
    stageName     -- always used as the output file prefix if supplied; if
                     empty, varsP.outputContigPrefix is used when varsP is
                     given, otherwise the prefix is empty.

    Returns None in all cases; problems are reported as warnings.
    """
    if not util.checkDir(outdir):
        err_msg = "Warning in AlignModule.mergeRcmaps: could not make outdir %s, skipping copy number" % outdir
        logOrPrintError(err_msg, varsP)
        return
    if not outFileList:  #just an argument check--check for presence on disk is below
        err_msg = "Warning in AlignModule.mergeRcmaps: no maps supplied"
        logOrPrintError(err_msg, varsP)
        return
    if splitByContig is None:
        #Python 2 orders None below every int, so None used to act like 0;
        #make that explicit rather than rely on the comparison
        splitByContig = 0

    outFileList.sort()  #for reproducibility with runAlignMerge.py (different order when listing dir)
    rsuf = "_r.cmap"

    #even though outFileList should all be there, a job may have failed--check all, just existence
    present = []
    for outf in outFileList:
        target = (outf if outf.endswith(rsuf) else outf + rsuf)  #support prefix or full path
        if util.checkFile(target):
            present.append(target)
        else:
            err_msg = "Warning in AlignModule.mergeRcmaps: missing _r.cmap %s" % target
            logOrPrintError(err_msg, varsP)
    if not present:
        err_msg = "Warning in AlignModule.mergeRcmaps: no _r.cmaps found, skipping copy number"
        logOrPrintError(err_msg, varsP)
        return

    mrgstr = (varsP.alignMolvrefMergeName if varsP else "merge")

    #open the first map and fold the remaining ones into it in memory
    mergedmap = mc.multiCmap(present[0])
    for rmap in present[1:]:  #don't add map 0 to itself
        if mergedmap.addCovOcc(mc.multiCmap(rmap)):  #a true return value signals failure
            err_msg = "Warning in AlignModule.mergeRcmaps: addCovOcc call failed for map %s" % rmap
            logOrPrintError(err_msg, varsP)

    #now it's merged, but the resulting map needs to be written back to disk
    filepref = (varsP.outputContigPrefix if varsP and stageName == "" else stageName)  #see split_XMapQcmap_byContig
    if splitByContig < 1 or splitByContig > 1:
        mergedmap.writeAllMapsToDisk(os.path.join(outdir, filepref + '_contig'), outsuf="_r")
        report = "mergeRcmaps: wrote %i cmaps" % len(mergedmap.cmapdict)
    if splitByContig > 0:
        mergedmap.writeToFile(os.path.join(outdir, filepref + "_" + mrgstr + rsuf))
        report = "mergeRcmaps: wrote merged cmap with %i contigs" % len(mergedmap.cmapdict)
    #report result (for splitByContig > 1 this is the merged-cmap message)
    logOrPrintError(report, varsP, warn=False)
def readMapFile(self, mapFile, verbose=0):
    """Append one SingleMapResult per data line of a .map file to self.hitDB.

    Lines whose first character is '#', 'S' or 'M' are treated as header
    lines and skipped. Every other line is passed to SingleMapResult together
    with self.qryCmap and self.refCmap.

    mapFile -- path of the file; it must pass util.checkFile(mapFile, ".map"),
               otherwise an error is printed and nothing is read.
    verbose -- currently unused; kept for interface compatibility.
    """
    commentChars = ('#', 'S', 'M')
    if not util.checkFile(mapFile, ".map"):
        print("Error in MapResults.readMapFile: missing file %s" % (mapFile,))
        return
    #'with' closes the handle deterministically instead of leaving it to the GC
    with open(mapFile) as mapf:
        for line in mapf:
            if line[0] in commentChars:
                continue
            curResult = SingleMapResult(line, self.qryCmap, self.refCmap)
            self.hitDB.append(curResult)
def mergeMap(varsP, outFileList, mergepath):
    """Merge the .map files of a list of path+prefixes into one .map file in dir mergepath.

    varsP       -- pipeline variables object, or None/falsy; warnings go to
                   varsP.updatePipeReport when given, otherwise to stdout.
    outFileList -- list of path+prefixes, each expected to have a ".map"
                   file; sorted in place to ensure reproducible entry order.
    mergepath   -- output directory (checked with util.checkDir).

    The output name is getMergeFilename(outFileList[0]) followed by
    varsP.alignMolvrefMergeName (or "merge" without varsP) and ".map".
    Header lines (first character '#', 'S' or 'M') are copied from the first
    input file only; the first column of every data line is replaced by a
    running index starting at 1, and columns are re-joined with tabs.
    """
    def _warn(message):
        #route a warning to the pipe report if available, else to stdout
        if varsP:
            varsP.updatePipeReport(message + "\n")
        else:
            print(message + "\n")

    outFileList.sort()  #sort to ensure reproducibility (order of entries)
    maplist = []
    for prefix in outFileList:  #these are file prefixes
        mappath = prefix + ".map"
        if util.checkFile(mappath):
            maplist.append(mappath)
        else:
            _warn("Warning in AlignModule.mergeMap: missing map: " + mappath)
    if not maplist:  #nothing to merge
        return
    if not util.checkDir(mergepath):
        #previously this branch called varsP.updatePipeReport unconditionally
        #and failed with AttributeError when varsP was None
        _warn("Warning in AlignModule.mergeMap: merge path invalid: " + mergepath)
        return

    headstart = ("#", "S", "M")  #last two lines of header start with "Software" and "MappedMoleculeId"
    sep = "\t"
    mappref = getMergeFilename(outFileList[0])  #also in getAlignStats
    mrgstr = (varsP.alignMolvrefMergeName if varsP else "merge")  #same for vref and not
    mergedpath = os.path.join(mergepath, mappref + mrgstr + ".map")

    lineno = 1  #can't just append: need to change index in first column
    headerdone = False
    with open(mergedpath, 'w') as merged:
        for path in maplist:
            with open(path) as mapf:
                for line in mapf:
                    if line[0] in headstart:
                        if not headerdone:
                            merged.write(line)
                        continue
                    tokens = line.split()
                    if not tokens:  #blank line: nothing to renumber
                        continue
                    tokens[0] = str(lineno)
                    merged.write(sep.join(tokens) + "\n")  #newline was stripped by split
                    lineno += 1
            headerdone = True  #only the first file contributes its header
def generateJobList(self):
    """splitBNX.generateJobList: queue the RefAligner job(s) that split the sorted bnx.

    Reads varsP.sorted_file (prefix without ".bnx"), obtains the number of
    splits from calculateNPairwise, initialises the parent job wrapper and
    then adds either one job per split (refaligner_version < 3995) or a
    single job using "-subsetbin 0 N".

    Returns 1 if varsP.executeCurrentStage is false (caller should not
    continue processing); returns None otherwise.
    Raises RuntimeError if the sorted bnx file is not found.
    """
    sortedPrefix = self.varsP.sorted_file
    sortedBnx = sortedPrefix + ".bnx"
    if not util.checkFile(sortedBnx):
        message = "ERROR: splitBNX input file (%s) not found; exiting" % self.varsP.sorted_file
        self.varsP.updatePipeReport(message + "\n")
        util.LogError("critical", message)
        util.LogStatus("progress", "pipeline", "failure")
        raise RuntimeError

    #computed here (not in sortBNX) because it needs the sorted bnx
    nSplits = calculateNPairwise(self.varsP, sortedPrefix)

    self.varsP.updatePipeReport('Splitting BNX\n')
    super(splitBNX, self).__init__(self.varsP, self.stageName, clusterArgs=self.varsP.getClusterArgs('splitting'))

    if not self.varsP.executeCurrentStage:
        return 1  #tell self.__init__ not to continue processing

    scaledNote = " scan-scaled" if self.varsP.doScanScale else ""
    self.varsP.updatePipeReport("Splitting" + scaledNote + " bnx file: %s.bnx\n\n" % self.varsP.sorted_file)

    #threads per job: file size / 1.5 GB rounded up (at least 1), plus 1
    sizeBased = int(math.ceil(os.path.getsize(sortedBnx) / 1.5e9))
    threads = max(1, sizeBased) + 1
    if threads > 1:
        self.varsP.updatePipeReport("Using %i threads per job\n" % threads)

    def refAlignerArgs(binIndex, outPrefix):
        #command line for one -subsetbin invocation writing to outPrefix
        args = [self.varsP.RefAlignerBin, '-f', '-i', sortedBnx,
                "-maxthreads", str(threads), "-merge",
                "-subsetbin", str(binIndex), str(nSplits),
                "-bnx", "-o", outPrefix]
        if self.varsP.stdoutlog:
            args.extend(['-stdout', '-stderr'])
        return args

    #job partitioning depends on the RefAligner version for backward compatibility
    if self.varsP.refaligner_version >= 3995:
        #single command with -subsetbin 0 N
        outPrefix = self.varsP.bnxFile.replace(".bnx", "")
        self.addJob(mthread.singleJob(refAlignerArgs(0, outPrefix),
                                      self.stageName,
                                      outPrefix + ".bnx",
                                      self.stageName,
                                      maxThreads=threads,
                                      clusterLogDir=self.varsP.clusterLogDir,
                                      expectedStdoutFile=outPrefix + ".stdout"))
        return

    for binIndex in range(1, nSplits + 1):
        outPrefix = self.varsP.bnxFile.replace(".bnx", "_%s_of_%s" % (binIndex, self.varsP.nPairwiseJobs))
        jobName = self.stageName + str(binIndex)
        self.addJob(mthread.singleJob(refAlignerArgs(binIndex, outPrefix),
                                      jobName,
                                      outPrefix + ".bnx",
                                      jobName,
                                      maxThreads=threads,
                                      clusterLogDir=self.varsP.clusterLogDir,
                                      expectedStdoutFile=outPrefix + ".stdout"))
def generateJobList(self):
    """Queue one RefAligner job per bnx file, then log the arguments.

    If varsP.bnxTarget exists, the bnx files come from
    parseExperimentFile(varsP.bnxTarget) and each output prefix is the bnx
    path without ".bnx" plus "_sampleChar". Otherwise the single
    varsP.bnxFile is used and results go to <varsP.localRoot>/sampleChar
    (removed first when varsP.wipe is set and the directory exists).

    Each job's expected result is its output prefix + ".err". Returns None;
    returns early (after printing and reporting an error) when the
    experiment file lists no bnx files.
    """
    sharedArgs = self.varsP.argsListed('noise0') + self.varsP.argsListed('sampleChar')

    if util.checkFile(self.varsP.bnxTarget):  #file exists only if image processing was run
        bnxFiles = parseExperimentFile(self.varsP.bnxTarget)
        if not bnxFiles:  #need at least one
            errstr = "ERROR in SampleChar.generateJobList: no bnx files found in: " + self.varsP.bnxTarget
            print(errstr)
            self.varsP.updatePipeReport(errstr + "\n\n")
            return
        resultDir = ""  #no separate results dir in this case
    else:  #otherwise, assume this is the only bnx file
        bnxFiles = [self.varsP.bnxFile]
        resultDir = os.path.join(self.varsP.localRoot, "sampleChar")
        if self.varsP.wipe and os.path.isdir(resultDir):
            shutil.rmtree(resultDir)
        util.checkDir(resultDir)  #will make if not exist, but won't remove anything

    for bnxFile in bnxFiles:
        bnxname = os.path.split(bnxFile)[1].replace(".bnx", "")
        jobname = 'Sample_Char_' + bnxname
        if resultDir:  #bnx input
            outputTarget = os.path.join(resultDir, bnxname)
        else:  #image processing
            outputTarget = bnxFile.replace(".bnx", "") + "_sampleChar"

        jobArgs = [self.varsP.RefAlignerBin, '-i', bnxFile,
                   '-ref', self.varsP.ref, '-o', outputTarget, '-f']
        if self.varsP.stdoutlog:
            jobArgs.extend(['-stdout', '-stderr'])
        jobArgs += ['-maxthreads', str(self.varsP.maxthreads)] + sharedArgs

        #the .err file is what checkResults looks for
        self.addJob(mthread.singleJob(jobArgs, jobname, outputTarget + '.err', jobname,
                                      clusterLogDir=self.varsP.clusterLogDir))
    self.logArguments()
def split_XMap_byContig_new(outFileList, mergepath, varsP=None, stageName=""):
    """Split the .xmap files of a list of path+prefixes into one .xmap per reference contig.

    outFileList -- list of path+prefixes, each expected to have a ".xmap"
                   file; sorted in place BEFORE use so that the order of
                   entries in the output is reproducible.
    mergepath   -- directory receiving <prefix>_contig<refid>.xmap files.
    varsP       -- optional pipeline variables object, passed to
                   logOrPrintError; supplies outputContigPrefix when
                   stageName is empty.
    stageName   -- output file prefix; takes precedence over varsP.

    The '#' header of the first existing input xmap is written to every
    output file, with its "Query Maps" and "Reference Maps" lines rewritten
    to point at <output prefix>_q.cmap / _r.cmap. The first column of each
    entry is renumbered per contig, starting at 1. An output file left over
    from a previous run is overwritten, not appended to.

    Returns a dict mapping reference contig id to the list of unique query
    ids aligned to it (in order of first appearance), or None if no input
    xmap was found.
    """
    logOrPrintError("Start split_XMapQcmap_byContig", varsP, warn=False)
    outFileList.sort()  #sort to ensure reproducibility (order of entries)

    present = []  #prefixes whose .xmap exists
    for outpath in outFileList:  #these are file prefixes
        if util.checkFile(outpath + ".xmap"):
            present.append(outpath)
        else:
            err_msg = "Warning in AlignModule.split_XMapQcmap_byContig: missing xmap: " + outpath + ".xmap"
            logOrPrintError(err_msg, varsP)
    if not present:  #nothing to split
        err_msg = "Error in AlignModule.split_XMapQcmap_byContig: no xmaps found"
        logOrPrintError(err_msg, varsP)
        return

    filepref = (varsP.outputContigPrefix if varsP and stageName == "" else stageName)  #same as line in mergeRcmaps

    #header of the first xmap known to exist (outFileList[0] may be missing)
    headerlines = []
    with open(present[0] + ".xmap") as f1:
        for line in f1:
            if line[0] != '#':
                break
            headerlines.append(line)
    header = "".join(headerlines)

    xmapLineDict = {}  #refid -> number of entries written so far
    xmapMolDict = {}   #refid -> ordered list of unique query ids (returned)
    seenMols = {}      #refid -> set of the same ids, for O(1) membership tests
    newxmaplist = []   #output prefixes, to fix their headers below

    for prefix in present:
        with open(prefix + ".xmap") as f:
            for line in f:
                if line[0] == "#":  #header handled separately above
                    continue
                tokens = line.split()
                try:
                    qryid = int(tokens[1])
                    refid = int(tokens[2])
                except (IndexError, ValueError):  #short or non-numeric line: skip it
                    continue
                outpref = os.path.join(mergepath, filepref + '_contig' + str(refid))
                firstEntry = refid not in xmapLineDict
                if firstEntry:
                    newxmaplist.append(outpref)
                    xmapLineDict[refid] = 1
                    xmapMolDict[refid] = [qryid]
                    seenMols[refid] = set([qryid])
                else:
                    xmapLineDict[refid] += 1
                    #xmapMolDict is used to make the _q.cmap, so its entries must be unique
                    if qryid not in seenMols[refid]:
                        seenMols[refid].add(qryid)
                        xmapMolDict[refid].append(qryid)
                tokens[0] = str(xmapLineDict[refid])
                #re-open per entry to avoid keeping too many handles open;
                #truncate on the first entry so stale output is not appended to
                with open(outpref + ".xmap", "w" if firstEntry else "a") as outf:
                    if firstEntry:
                        outf.write(header)
                    outf.write("\t".join(tokens) + "\n")

    #fix headers: must re-read and re-write each output file
    for path in newxmaplist:
        with open(path + ".xmap", "r") as f:
            lines = f.readlines()
        with open(path + ".xmap", "w") as f:
            for line in lines:
                if "Query Maps" in line:
                    line = line.split(":")[0] + ":\t" + path + "_q.cmap" + "\n"
                elif "Reference Maps" in line:
                    line = line.split(":")[0] + ":\t" + path + "_r.cmap" + "\n"
                f.write(line)

    logOrPrintError("split_XMapQcmap_byContig: wrote %i xmaps" % len(xmapMolDict), varsP, warn=False)
    return xmapMolDict
def getAlignStats(varsP, outFileList, reflen=0, isref=False, mergepath="", bnxpath=None):
    '''Standalone fn for alignment statistics for both AlignModule and AlignRefModule.

    varsP       -- pipeline variables object (used for reporting, argument
                   lookup and the default bnx path).
    outFileList -- list of path+prefixes of alignment jobs; each may have a
                   ".xmap" and a ".err" file.
    reflen      -- reference length, should be in Mb; 0 disables the
                   coverage lines.
    isref       -- True selects the "Ref"/"Reference" labels, otherwise
                   "Contig"/"Assembly".
    mergepath   -- if supplied (and at least one .err was found), the
                   merged .err is written there.
    bnxpath     -- if None, varsP.sorted_file+".bnx" is used; otherwise only
                   the stats of this file are reported and outFileList is
                   ignored.

    Returns None. Output goes to varsP.updateInfoReport / updatePipeReport.
    '''
    def _mean(values):
        #arithmetic mean of a list, 0 if the list is empty
        return (sum(values) / len(values) if len(values) else 0)

    statonly = False  #bnx stats only
    skipbnx = False   #.err file processing only
    if bnxpath is None:
        if not varsP.sorted_file:  #for runAlignMol, this is empty: nothing to do in this case
            skipbnx = True
        else:
            bnxpath = varsP.sorted_file + ".bnx"  #sorted_file carries no suffix
    else:
        statonly = True
    if not skipbnx and not util.checkFile(bnxpath):
        varsP.updatePipeReport("Warning in AlignModule.getAlignStats: bnxpath supplied but not found: %s\n" % bnxpath)
        return

    #find the minlen used for bnx_sort, which is a required arg set
    sortargs = []
    if 'bnx_sort' in varsP.argData:  #for runAlignMol.py
        sortargs = varsP.argsListed('bnx_sort')
    minlen = 0
    validminlen = False
    if "-minlen" in sortargs:
        #next element should be the length; if it is absent the sort job fails anyway
        minlen = util.getIntFromString(sortargs[sortargs.index("-minlen") + 1])  #None if can't cast to int
        if minlen:
            validminlen = True
    if not validminlen and bnxpath is None and sortargs:
        varsP.updatePipeReport("Warning in AlignModule.getAlignStats: unable to obtain minlen from bnx_sort arguments; defaulting to 0\n")
    #NOTE(review): bnxpath is also non-None when it was defaulted from
    #varsP.sorted_file above, so minlen is reset to 0 whenever a bnx is read;
    #behaviour kept as is -- confirm this is intended
    if bnxpath is not None:  #if bnxpath, ignore minlen
        minlen = 0

    nmol = 0    #total n mol above minlen
    totlen = 0  #total mol len above minlen
    if util.checkFile(bnxpath):
        outstr = "Reading molecule stats from %s:\n" % bnxpath
        outstr += "Molecule Stats:\n"
        moldict = util.simpleBnxStats(bnxpath, minlen)
        nmol = moldict["nmol"]
        totlen = moldict["totlen"]
        outstr += "N mols: %i\n" % nmol
        outstr += ("Total len (Mb): %10.3f\n") % totlen
        outstr += ("Avg len (kb) : %10.3f\n") % moldict["avglen"]
        outstr += ("Mol N50 (kb) : %10.3f\n") % moldict["n50"]
        outstr += ("Lab (/100kb) : %10.3f\n") % moldict["labdensity"]
        if reflen:
            cov = totlen / reflen  #totlen is in Mb
            outstr += ("%-6s Cov (x): %10.3f\n") % ("Ref" if isref else "Contig", cov)
        if isref or reflen or statonly:  #if neither, nothing to print
            varsP.updateInfoReport(outstr + "\n", printalso=True)
    elif not skipbnx:
        varsP.updatePipeReport("Warning in AlignModule.getAlignStats: missing bnx path:" + bnxpath + "\n")

    if statonly:
        return

    #lastly, load .xmaps and .errs from alignmol jobs and report on stats
    totmaplen = 0     #sum of lengths of mapped portions of all molecules, on reference
    totmapqrylen = 0  #sum of lengths of mapped portions of all molecules, on query
    totconf = 0       #sum of confidence of all alignments
    nalign = 0        #total number of alignments
    #attributes of the .err (alignParams) objects: averaged vs. summed over all jobs
    meanfields = ("fp", "fprate", "fn", "bpp", "res", "llrm", "llrgm", "bppsd", "sf", "sd", "sr", "ressd")
    sumfields = ("nmaps", "goodmaps")
    errvals = dict((name, []) for name in meanfields + sumfields)
    err = None  #will be the last alignParams object if any .err files are found
    mappref = ""
    if len(outFileList) > 0:
        mappref = getMergeFilename(outFileList[0])  #same convention as in mergeMap

    for outpath in outFileList:  #these are file prefixes
        if util.checkFile(outpath + ".xmap"):
            xmap = mc.xmap(outpath + ".xmap")
            nalign += len(xmap.xmapLookup)
            totmaplen += xmap.getSumMappedRefLen()     #in kb
            totmapqrylen += xmap.getSumMappedQryLen()  #in kb
            totconf += sum([x.Confidence for x in xmap.xmapLookup.values()])
        else:
            varsP.updatePipeReport("Warning in AlignModule.getAlignStats: missing xmap:" + outpath + ".xmap" + "\n")
        if util.checkFile(outpath + ".err"):
            err = mc.alignParams(outpath + ".err")
            for name in meanfields + sumfields:
                errvals[name].append(getattr(err, name))

    #nalign from xmap should be the same as goodmaps from .err
    sumgoodmaps = sum(errvals["goodmaps"])
    if sumgoodmaps != nalign:
        varsP.updateInfoReport("Warning in getAlignStats: n mol align differ in .err files (%i) and .xmaps (%i)\n" % (sumgoodmaps, nalign), printalso=True)

    if totmaplen or totconf or nalign:
        outstr = "Molecules Aligned to %s:\n" % ("Reference" if isref else "Assembly")
        outstr += "N mol align : %9i\n" % nalign
        outstr += "Mol fraction align: %13.3f\n" % (float(nalign) / nmol if nmol else 0)
        outstr += "Tot align len (Mb): %11.1f\n" % (totmapqrylen / 1e3)  #Mb
        if reflen > 0:
            outstr += ("Effective Cov (x) : %13.3f\n") % (totmaplen / 1e3 / reflen)  #totmaplen is in kb
        outstr += "Avg align len (kb): %11.1f\n" % (totmapqrylen / nalign if nalign else 0)
        outstr += "Fraction align len: %13.3f\n" % (totmapqrylen / 1e3 / totlen if totlen else 0)  #totmapqrylen is in kb, totlen is in mb
        outstr += "Tot confidence : %11.1f\n" % totconf
        outstr += "Avg confidence : %11.1f\n" % (totconf / nalign if nalign else 0)
        varsP.updateInfoReport(outstr, printalso=True)

    avg = dict((name, _mean(errvals[name])) for name in meanfields)

    if avg["fp"] or avg["fn"] or avg["bpp"]:
        outstr = "Avg FP(/100kb) : %12.2f\n" % avg["fp"]
        outstr += "Avg FP ratio : %13.3f\n" % avg["fprate"]
        outstr += "Avg FN ratio : %13.3f\n" % avg["fn"]
        outstr += "Avg bpp : %11.1f\n" % avg["bpp"]
        outstr += "Avg sf : %13.3f\n" % avg["sf"]
        outstr += "Avg sd : %13.3f\n" % avg["sd"]
        outstr += "Avg sr : %13.3f\n" % avg["sr"]
        varsP.updateInfoReport(outstr + "\n", printalso=True)

    if err and mergepath:  #have an error file (alignParams) object
        util.checkDir(mergepath)
        mrgstr = (varsP.alignMolvrefMergeName if varsP else "merge")
        outpath = os.path.join(mergepath, mappref + mrgstr + ".err")
        #reuse the last loaded object as carrier for the merged values
        for name in meanfields:
            setattr(err, name, avg[name])
        err.nmaps = sum(errvals["nmaps"])
        err.goodmaps = sumgoodmaps
        err.writeToFile(outpath)
def __init__(self, varsP):
    """Run the two automatic noise stages, "Autonoise0" then "Autonoise1".

    Both stages build their jobs with self.generateJobListChar, run them via
    varsP.runJobs and read the resulting parameters with
    readNoiseParameters(self.output_file). The input bnx is varsP.bnxFile
    when varsP.noiseOnly is set, otherwise varsP.sorted_file+".bnx".

    Effects on varsP:
      noise0 / noise1 -- parameters read after the respective stage;
      param set "noise0" -- "-readparameters" is pointed at the stage 0
                        .errbin, then each of FP, FN, sf, sd, sr, bpp,
                        readparameters present in noise1 is replaced;
      sorted_file     -- if doScanScale is set and the rescaled bnx exists,
                        switched to that file (used by splitBNX);
      doScanScale     -- cleared, with a warning, if the rescaled bnx is
                        missing.

    Returns early, without running anything further, if generateJobListChar
    returns a true value (skip). Raises RuntimeError if a stage does not
    produce all expected results.
    """
    self.stageName = "Autonoise0"
    self.varsP = varsP  #fewer code modifications below

    def _runJobsOrRaise(jobName):
        #run the queued jobs, report, and abort if any result is missing
        self.varsP.runJobs(self, jobName)
        self.doAllPipeReport()
        if not self.allResultsFound():
            self.varsP.updatePipeReport("ERROR: " + jobName + " failed. Check: " + self.output_file + ".stdout\n")
            raise RuntimeError

    util.LogStatus("progress", "stage_start", self.stageName)
    bnxfile = self.varsP.bnxFile if varsP.noiseOnly else self.varsP.sorted_file + ".bnx"

    #---- stage 0 ----
    if self.generateJobListChar({}, bnxfile, "autoNoise0"):  #0 for success, 1 for skip
        return
    _runJobsOrRaise("AutoNoise0")
    util.LogStatus("progress", "stage_complete", self.stageName)
    self.varsP.noise0 = readNoiseParameters(self.output_file)
    self.isBadErrorParams(self.varsP.noise0)

    #---- stage 1: starts from the parameters found in stage 0 ----
    self.stageName = "Autonoise1"
    util.LogStatus("progress", "stage_start", self.stageName)
    self.clearJobs()
    self.varsP.replaceParam("noise0", "-readparameters", self.output_file + ".errbin")
    if self.generateJobListChar(self.varsP.noise0, bnxfile, "autoNoise1"):  #0 for success, 1 for skip
        return
    _runJobsOrRaise("AutoNoise1")
    self.varsP.noise1 = readNoiseParameters(self.output_file)

    infoReport = "Automatically determined noise parameters:\n"
    #hardcoded list fixes the reporting order without needing an OrderedDict
    klist = ["FP", "FN", "sf", "sd", "sr", "bpp", "readparameters"]
    for v in klist:
        if v not in self.varsP.noise1:
            continue
        param = str(self.varsP.noise1[v])
        util.LogStatus("parameter", "auto_" + v, param)
        infoReport += v + ":" + param + "\n"
        self.varsP.replaceParam("noise0", "-" + v, param)
    self.varsP.updateInfoReport(infoReport + '\n')
    self.isBadErrorParams(self.varsP.noise1)

    if self.varsP.doScanScale:  #change the sorted_file to the rescaled bnx file
        rescaledbnx = self.output_file + self.varsP.rescaleSuffix  #no ".bnx" in suffix
        if not util.checkFile(rescaledbnx + ".bnx"):  #not found--not an error if bnx 0.1 is used
            err = "Warning: scan scaled bnx not found after autoNoise1; not performing scan scaling--check that bnx 1.0 or later used in input"
            self.varsP.updatePipeReport(err + "\n\n")
            util.LogError("warning", err)
            self.varsP.doScanScale = False
        else:
            self.varsP.sorted_file = rescaledbnx  #this variable is used in splitBNX (PairwiseModule.py)
    util.LogStatus("progress", "stage_complete", self.stageName)
def getArgs():
    """Parse and validate the command line arguments.

    Options: -t RefAligner, -r reference maps, -q query maps, -x xmap,
    -p Pipeline dir, -a optArguments.xml, -n number of threads (int,
    default 4), -v pvalue (default "1e-12").

    If -x is given no alignment is run (runaligns is False) and -t/-r are
    not checked; the query map (-q) is always required. utilities.py must be
    present in the Pipeline dir (-p, or the current directory); it is
    imported here as util for the file checks.

    Returns the tuple
        (cwd, rabin, refcmap, contigdir, contigbase, runaligns, xmappath,
         optargs, nthreads, pvalue).
    On any invalid argument a message is printed and the process exits with
    status 1.
    """
    def _die(message):
        #print the problem and terminate with a failure status
        print(message)
        sys.exit(1)

    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('-t', dest='RefAligner',
                        help='Path to RefAligner (required unless xmap is specified (-x))')
    parser.add_argument('-r', dest='referenceMap',
                        help='Path to reference maps (.cmap or .spots), 1 file only (required unless xmap specified (-x) and _r.cmap is present in same dir as xmap)',
                        default="")
    parser.add_argument('-q', dest='queryMap',
                        help='Path to query maps (.cmap), 1 file only (required--if xmap specified (-x), this should be input (-i argument) for that command)',
                        default="")
    parser.add_argument('-x', dest='xmap',
                        help='Path to .xmap, 1 file only (optional, if specified, no alignment is done, if not specified, -t, -r, and -q must be specified)')
    parser.add_argument('-p', dest='pipelineDir',
                        help='Pipeline dir (optional, defaults to current directory)')
    parser.add_argument('-a', dest='optArguments',
                        help='Path to optArguments.xml (optional, default in Pipeline dir if found, otherwise required)')
    parser.add_argument('-n', dest='numThreads',
                        help='Number of threads (cores) to use (optional, default 4)',
                        default=4, type=int)
    parser.add_argument('-v', dest='pvalue', help='Pvalue (-T) used for alignment', default="1e-12")
    result = parser.parse_args()

    #check all Pipeline dependencies
    cwd = result.pipelineDir if result.pipelineDir else os.getcwd()

    #this is the only one imported here and in runCharacterize
    if not os.path.isfile(os.path.join(cwd, "utilities.py")):
        _die("utilities.py missing in dir %s check -p argument, or run this script in Pipeline dir" % (cwd,))
    import utilities as util

    #xmap -- optional
    runaligns = True  #default is to run the alignment
    xmappath = None
    if result.xmap:
        xmappath = result.xmap
        if not util.checkFile(xmappath, ".xmap"):
            _die("Xmap path is supplied (" + xmappath + ") but not found or doesn't end in .xmap.")
        runaligns = False

    #RefAligner -- only required if xmap not specified
    rabin = result.RefAligner
    if not xmappath and not util.checkExecutable(rabin):
        _die("RefAligner not found at %s \nPlease supply RefAligner full path as -t arg." % (rabin,))

    #reference maps -- only required if xmap not specified
    refcmap = result.referenceMap
    if runaligns and not util.checkFile(refcmap, ".cmap") and not util.checkFile(refcmap, ".spots"):
        _die("Reference map file (" + refcmap + ") not found or does not end in .cmap or .spots. Check -r argument.")

    #query maps -- always required; only .cmap is accepted
    qrypath = result.queryMap
    if not util.checkFile(qrypath, ".cmap"):
        _die("Query map file (" + qrypath + ") not found or does not end in .cmap. Check -q argument.")
    contigdir, contigbase = os.path.split(qrypath)  #dir of query maps, filename
    contigbase = contigbase[:contigbase.find(".")]  #keep the part before the first "."

    #optargs file
    optargs = None
    if result.optArguments:  #supplied on command line
        optargs = result.optArguments
        if not util.checkFile(optargs, ".xml"):
            _die("optArguments path is supplied (" + optargs + ") but not found or doesn't end in .xml, check -a argument.")
    elif runaligns:  #load from Pipeline dir if running alignments
        optafile = "optArguments_human.xml"
        optargs = os.path.join(cwd, optafile)
        if not util.checkFile(optargs):
            _die("%s missing in Pipeline directory (%s). Try supplying path explicitly using -a." % (optafile, cwd))

    #nthreads: argparse already converted it to int, so format it rather
    #than concatenate (str + int raised TypeError here before)
    nthreads = result.numThreads
    if nthreads <= 0:
        _die("Number of threads value invalid (must be > 0): %i" % nthreads)

    #pvalue
    pvalue = result.pvalue if result.pvalue else "1e-12"

    #a plain tuple: these values just go to runCharacterize
    return cwd, rabin, refcmap, contigdir, contigbase, runaligns, xmappath, optargs, nthreads, pvalue
def split_XMap_byContig_new(outFileList, mergepath, varsP=None, stageName=""):
    """Split a set of xmaps into one xmap per reference contig.

    outFileList is a list of path+prefixes; each should have a .xmap (and a
    _q.cmap) file. Every alignment line is appended to
    <mergepath>/<prefix>_contig<refid>.xmap, with its first column renumbered
    to a per-contig running count starting at 1.

    Arguments:
      outFileList -- list of file prefixes (sorted in place by this call).
      mergepath   -- directory in which the per-contig xmaps are written.
      varsP       -- optional pipeline object; its outputContigPrefix is used
                     as the output prefix when stageName is empty.
      stageName   -- output prefix used when non-empty (or when varsP is None).

    Returns a dict {refid: [qryid, ...]} of the unique query ids aligned to
    each reference contig (in first-seen order), or None if no xmap exists.
    """
    logOrPrintError("Start split_XMapQcmap_byContig", varsP, warn=False)

    # Sort BEFORE collecting the xmaps so the order in which entries are
    # written (and hence their renumbered ids) is reproducible.
    outFileList.sort()
    xmapFilelist = []
    for outpath in outFileList:  #these are file prefixes
        xmappath = outpath + ".xmap"
        if util.checkFile(xmappath):
            xmapFilelist.append(xmappath)
        else:
            err_msg = "Warning in AlignModule.split_XMapQcmap_byContig: missing xmap: " + xmappath
            logOrPrintError(err_msg, varsP)
    if not xmapFilelist:  #nothing to merge
        err_msg = "Error in AlignModule.split_XMapQcmap_byContig: no xmaps found"
        logOrPrintError(err_msg, varsP)
        return

    #same as line in mergeRcmaps
    filepref = (varsP.outputContigPrefix
                if varsP and stageName == "" else stageName)

    # Take the header from the first xmap that actually exists; the first
    # prefix in outFileList may be one of the missing files reported above.
    headerlines = []
    with open(xmapFilelist[0]) as f1:
        for line in f1:
            if not line.startswith("#"):
                break
            headerlines.append(line)
    header = "".join(headerlines)

    xmapLineDict = {}  #refid -> number of entries written so far
    xmapMolDict = {}  #refid -> unique query ids, for split_Qcmap_byContig
    seenMols = {}  #refid -> set of the same ids, for O(1) membership tests
    newxmaplist = []  #prefixes of output xmaps whose headers need fixing

    for path in xmapFilelist:
        with open(path) as f:
            for line in f:
                if line.startswith("#"):  #header is handled separately above
                    continue
                tokens = line.split()
                try:
                    qryid = int(tokens[1])
                    refid = int(tokens[2])
                except (IndexError, ValueError):  #blank or malformed line
                    continue
                outpref = os.path.join(mergepath,
                                       filepref + '_contig' + str(refid))
                firstentry = refid not in xmapLineDict
                if firstentry:
                    newxmaplist.append(outpref)
                    xmapLineDict[refid] = 1
                    xmapMolDict[refid] = [qryid]
                    seenMols[refid] = set([qryid])
                else:
                    xmapLineDict[refid] += 1
                    #xmapMolDict is used to make the _q.cmap: keep entries unique
                    if qryid not in seenMols[refid]:
                        seenMols[refid].add(qryid)
                        xmapMolDict[refid].append(qryid)
                tokens[0] = str(xmapLineDict[refid])
                # Truncate on the first entry so a file left over from a
                # previous run is replaced rather than appended to; afterwards
                # append. Re-opening per line avoids keeping too many file
                # handles open at the expense of re-opening many times.
                with open(outpref + ".xmap", "w" if firstentry else "a") as outf:
                    if firstentry:
                        outf.write(header)
                    outf.write("\t".join(tokens) + "\n")

    #the copied header still names the original maps: re-read and re-write
    for path in newxmaplist:
        with open(path + ".xmap", "r") as f:
            lines = f.readlines()
        with open(path + ".xmap", "w") as f:
            for line in lines:
                if "Query Maps" in line:
                    line = line.split(":")[0] + ":\t" + path + "_q.cmap" + "\n"
                elif "Reference Maps" in line:
                    line = line.split(":")[0] + ":\t" + path + "_r.cmap" + "\n"
                f.write(line)

    logOrPrintError("split_XMapQcmap_byContig: wrote %i xmaps" %
                    len(xmapMolDict), varsP, warn=False)
    return xmapMolDict
def characterizeContigs(varsP, xmappath=None):
    """Build a text summary of genome map stats, optionally with align stats.

    Arguments:
      varsP    -- pipeline object; reads varsP.ref and varsP.latestMergedCmap,
                  and sets varsP.totAssemblyLenMb as a side effect.
      xmappath -- optional path to an .xmap; when given (and it passes
                  util.checkFile with suffix ".xmap") alignment totals are
                  added to the summary. No alignment is run here.

    Returns the summary as a single newline-separated string.
    """
    unitscale = 1e-6
    dorefalign = bool(xmappath)  #RefAligner is never called here
    haveref = bool(varsP.ref)

    # The reference length is only a denominator in the summary table: make
    # sure it is non-zero rather than bailing (the contig summary still works).
    try:
        reflen = mapClasses.multiCmap(varsP.ref, lengthonly=True).totalLength
        if reflen <= 0:
            reflen = 1.
    except Exception:
        reflen = 1.
    reflen = float(reflen)  #avoid integer division in the ratios below

    #any .hmap in the dir of latestMergedCmap switches on the haplotype format
    hmaps = util.getListOfFilesFromDir(
        os.path.dirname(varsP.latestMergedCmap), ".hmap")
    haplotype = (len(hmaps) > 0)

    mapi = mapClasses.multiCmap(varsP.latestMergedCmap)
    totcontiglen = mapi.totalLength
    contiglens = list(mapi.getAllMapLengths())  #lengths of all maps
    hapcontiglens = []
    if haplotype:
        hapcontiglens.extend(mapi.getHaplotypeMapLengths())

    totalignlen = 0
    uniqueseg = {}  #refid -> list of [RefStart, RefStop], for util.uniqueRange
    mapids = set(mapi.getAllMapIds())
    alignedids = set()  #ids in mapids which have at least one xmap entry

    xmapobj = mapClasses.xmap()  #empty xmap unless a valid one is supplied
    if dorefalign and util.checkFile(xmappath, ".xmap"):
        xmapobj = mapClasses.xmap(xmappath)
    for xmapentry in xmapobj.xmapLookup.values():
        totalignlen += xmapentry.getMappedRefLen()
        #segments are kept per reference id so that alignments on different
        #references are never merged with each other
        uniqueseg.setdefault(xmapentry.contigRef, []).append(
            [xmapentry.RefStart, xmapentry.RefStop])
        if xmapentry.contigQry in mapids:
            alignedids.add(xmapentry.contigQry)
    nmapcontigs = len(alignedids)

    varsP.totAssemblyLenMb = totcontiglen * unitscale
    ncontigs = len(contiglens)
    avgcontiglen = (float(totcontiglen) / ncontigs if ncontigs > 0 else 0)

    if unitscale > 1e-6:  #if not megabases
        fstr = "%9.0f"
    else:  #megabases
        fstr = "%8.3f"

    outstr = ""
    if haplotype:  #new format for haplotype
        #diploid is same as the default format below, but names change
        outstr += "Diploid N Genome Maps: %i\n" % ncontigs
        outstr += ("Diploid Genome Map Len (Mb): " + fstr + "\n") % (totcontiglen * unitscale)
        outstr += ("Diploid Avg. Genome Map Len (Mb): " + fstr + "\n") % (avgcontiglen * unitscale)
        outstr += ("Diploid Median Genome Map Len (Mb): " + fstr + "\n") % (util.getMedian(contiglens) * unitscale)
        outstr += ("Diploid Genome Map n50 (Mb): " + fstr + "\n") % (util.getn50(contiglens) * unitscale)
        #haploid: totals come from the list hapcontiglens
        outstr += "Haploid N Genome Maps: %i\n" % len(hapcontiglens)
        tot = sum(hapcontiglens)
        avg = (float(tot) / len(hapcontiglens) if len(hapcontiglens) else 0)
        outstr += ("Haploid Genome Map Len (Mb): " + fstr + "\n") % (tot * unitscale)
        outstr += ("Haploid Avg. Genome Map Len (Mb): " + fstr + "\n") % (avg * unitscale)
        outstr += ("Haploid Median Genome Map Len (Mb): " + fstr + "\n") % (util.getMedian(hapcontiglens) * unitscale)
        outstr += ("Haploid Genome Map n50 (Mb): " + fstr + "\n") % (util.getn50(hapcontiglens) * unitscale)
    else:  #default to old format
        outstr += "N Genome Maps: %i\n" % ncontigs
        outstr += ("Total Genome Map Len (Mb): " + fstr + "\n") % (totcontiglen * unitscale)
        outstr += ("Avg. Genome Map Len (Mb): " + fstr + "\n") % (avgcontiglen * unitscale)
        outstr += ("Median Genome Map Len (Mb): " + fstr + "\n") % (util.getMedian(contiglens) * unitscale)
        outstr += ("Genome Map n50 (Mb): " + fstr + "\n") % (util.getn50(contiglens) * unitscale)

    if haveref:
        outstr += ("Total Ref Len (Mb): " + fstr + "\n") % (reflen * unitscale)
        outstr += ("Total Genome Map Len / Ref Len : " + fstr + "\n") % (totcontiglen / reflen)

    if dorefalign:
        ratio = (float(nmapcontigs) / ncontigs if ncontigs > 0 else 0)
        outstr += ("N Genome Maps total align : %i (%.2f)\n") % (nmapcontigs, ratio)
        outstr += ("Total Aligned Len (Mb) : " + fstr + "\n") % (totalignlen * unitscale)
        outstr += ("Total Aligned Len / Ref Len : " + fstr + "\n") % (totalignlen / reflen)
        uniquelen = 0
        for segs in uniqueseg.values():  #need to sum on dict entries
            util.uniqueRange(segs)  #this modifies list in place
            uniquelen += util.totalLengthFromRanges(segs)
        outstr += ("Total Unique Aligned Len (Mb) : " + fstr + "\n") % (uniquelen * unitscale)
        outstr += ("Total Unique Aligned Len / Ref Len: " + fstr + "\n") % (uniquelen / reflen)

    return outstr
def runAlignMol(): parser = argparse.ArgumentParser(description=description) parser.add_argument( '-q', dest='queryDir', help= 'Path to merged cmap to align molecules (-b) to OR alignmol dir from Pipeline for merge (if latter, no alignments are performed), required', type=str) parser.add_argument( '-b', dest='bnx', help='Input molecule (.bnx) file, required if aligning molecules', type=str) #parser.add_argument('-b', dest='bnx', help='Input molecule (.bnx) file OR path to dir containing split bnx pieces, required if aligning molecules', type=str) #I should add the split feature; for now, just do single bnx parser.add_argument( '-a', dest='optArguments', help= 'Path to optArguments.xml (optional, default optArguments_human.xml in Pipeline dir if found, otherwise required)', default="", type=str) parser.add_argument( '-r', help= 'If this flag is used, alignmolvref arguments are used, otherwise alignmol arguments are used (default alignmol; optional)', dest='ref', action='store_true') parser.add_argument( '-o', dest='outputDir', help= 'output dir (optional, defaults to sub-dir of input map dir called "alignmol")', default="", type=str) parser.add_argument( '-t', dest='RefAligner', help='Path to RefAligner or dir containing it (required)', type=str) parser.add_argument( '-T', dest='numThreads', help='Total number of threads (cores) to use (optional, default 4)', default=4, type=int) parser.add_argument( '-j', dest='maxthreads', help= 'Threads per Job, -maxthreads (non-cluster only;optional, default 4)', default=4, type=int) parser.add_argument( '-e', dest='errFile', help= '.err file to use for noise parameters--will supersede noise parameters in the optArgument supplied (but that file must still be supplied for non-noise parameters)--should be from autoNoise', default="", type=str) parser.add_argument( '-E', dest='errbinFile', help= '.errbin file to use for noise parameters--will supersede noise parameters in the optArgument supplied (but that file must still be 
supplied for non-noise parameters)--should be from autoNoise', default="", type=str) parser.add_argument( '-p', dest='pipelineDir', help= 'Pipeline dir (optional, defaults to script dir, or current directory)', default="", type=str) result = parser.parse_args() outprefix = "exp_refineFinal1" #this is the default; assume for now #check all Pipeline dependencies if result.pipelineDir: cwd = result.pipelineDir else: cwd = os.path.split( os.path.realpath(__file__))[0] #this is path of this script if not os.path.isfile(os.path.join( cwd, "utilities.py")): #if still not here, last try is actual cwd cwd = os.getcwd() #still check this below #this is the only one imported here and in runCharacterize if not os.path.isfile(os.path.join(cwd, "utilities.py")): print "ERROR: utilities.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir" sys.exit(1) import utilities as util if not os.path.isfile(os.path.join(cwd, "AlignModule.py")): print "ERROR: AlignModule.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir" sys.exit(1) import AlignModule as alignmod if not util.checkFile(os.path.join(cwd, "Pipeline.py")): print "ERROR: Pipeline.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir" sys.exit(1) import Pipeline if not util.checkFile(os.path.join(cwd, "mapClasses.py")): print "ERROR: mapClasses.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir" sys.exit(1) import mapClasses as mc #input dir if not result.queryDir: print "ERROR: Query (-q) argument not supplied." sys.exit(1) qrypath = os.path.realpath(result.queryDir) if util.checkDir( qrypath, checkWritable=False, makeIfNotExist=False): #output elsewhere so not writeable is ok runaligns = False elif util.checkCmap(qrypath): runaligns = True else: print "ERROR: Query argument (" + qrypath + ") not found or not a dir or cmap. Check -q argument." 
sys.exit(1) #this check isn't really necessary...make it a warning -- left over from runAlignMerge.py #if not os.path.split(qrypath)[1].endswith("alignmol") : # print "Warning: Query dir ("+qrypath+") does not end with 'alignmol'; please be sure this is a Pipeline alignmol dir\n" #RefAligner -- check for either path to RefAligner, or dir containing it, depending on cluster args rabin = "" #need empty string for generateJobList even though no jobs are run if runaligns: rabin = result.RefAligner #replicate Pipeline behavior: RefAligner is always required if os.path.isdir(rabin): rabin = os.path.join(rabin, "RefAligner") if not util.checkExecutable(rabin): print "ERROR: RefAligner not found or not executable at", rabin, "\nPlease supply RefAligner dir or full path as -t arg." sys.exit(1) #optargs file optargs = None if runaligns and result.optArguments: #supplied on command line optargs = result.optArguments if not util.checkFile(optargs, ".xml"): print "optArguments path is supplied (" + optargs + ") but not found or doesn't end in .xml, check -a argument." sys.exit(1) elif runaligns: #load from Pipeline dir if running alignments optargs = os.path.join(cwd, "optArguments_human.xml") if not util.checkFile(optargs): print "optArguments.xml missing in Pipeline directory (" + cwd + "). Try supplying path explicitly using -a." 
sys.exit(1) #output dir if not result.outputDir: outdir = os.path.join(qrypath, "merge") #should be same as in AlignModule else: outdir = os.path.realpath(result.outputDir) if os.path.isdir(outdir): if not util.checkDir(outdir): #check writeable print "\nERROR: Output dir is not writeable:\n", outdir, "\n" sys.exit(1) #this is ok here #elif outdir == contigdir : # print "\nERROR: Output dir cannot be same as input dir:\n", outdir, "\n" # sys.exit(1) print "\nWARNING: Output dir already exists, results will be overwritten:\n", outdir, "\n" elif not util.checkDir( outdir ): #does not exist, make, if False, can't make or not writeable print "\nERROR: Output dir cannot be created or is not writeable:\n", outdir, "\n" sys.exit(1) #bnx file bnxfile = result.bnx if bnxfile: #must check for empty string BEFORE you do realpath, or it returns cwd bnxfile = os.path.realpath(bnxfile) if not util.checkFile(bnxfile, ".bnx"): print "ERROR: bnx file supplied but not found or incorrect suffix:", bnxfile sys.exit(1) elif runaligns: print "ERROR: bnx file not supplied but running alignments; please supply bnx file as -b argument" sys.exit(1) #nthreads nthreads = result.numThreads if nthreads <= 0: print "ERROR: Number of threads value invalid (must be > 0): %i" % nthreads sys.exit(1) #maxthreads maxthreads = result.maxthreads if maxthreads <= 0: print "ERROR: Max threads value invalid (must be > 0): %i" % maxthreads sys.exit(1) elif nthreads < maxthreads: print "Warning: num threads (-T: %i) < max threads (-j: %i): increasing num threads to equal max threads\n" % ( nthreads, maxthreads) nthreads = maxthreads #.errbin file errbinfile = result.errbinFile if errbinfile: errbinfile = os.path.realpath(result.errbinFile) if not util.checkFile(errbinfile, ".errbin"): print "ERROR: errbin file supplied but not found or incorrect suffix:", errbinfile sys.exit(1) #.err file errfile = result.errFile if errfile and errbinfile: print "Warning: .err and .errbin arguments supplied; ignoring .err 
file" errfile = "" elif errfile: errfile = os.path.realpath(result.errFile) if not util.checkFile(errfile, ".err"): print "err file supplied but not found or incorrect suffix:", errfile sys.exit(1) if errfile and not util.checkFile(os.path.join(cwd, "SampleCharModule.py")): print "SampleCharModule.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir" sys.exit(1) elif errfile: import SampleCharModule as scm doref = result.ref #DONE checking arguments print "Using output dir", outdir if runaligns: print "Aligning", bnxfile, "\nTo", qrypath, "\n" else: print "Merging", qrypath, "\n" startTime = time.time() #time since Epoch memory_log = os.path.join(outdir, "memory_log.txt") util.initMemoryLog(memory_log) varsP = Pipeline.varsPipeline() varsP.RefAlignerBin = rabin varsP.contigFolder = "" #not used but needs to be an attr varsP.outputContigFolder = "" #not used but needs to be a string attr varsP.pipeReportFile = os.path.join(outdir, "alignmol_jobs_log.txt") varsP.infoReportFile = os.path.join(outdir, "alignmol_log.txt") util.InitStatus(os.path.join(outdir, "status.xml")) if runaligns: varsP.optArgumentsFileIn = optargs varsP.latestMergedCmap = qrypath #if !doref, need this one varsP.ref = qrypath #and if doref, need this one varsP.nThreads = nthreads #necessary otherwise job won't start -- max threads per node varsP.maxthreads = maxthreads #threads per job p = os.path.split(qrypath)[1] varsP.outputContigPrefix = p[:p.rfind(".")] #filename prefix varsP.stdoutlog = True #use -stdout -stderr varsP.sorted_file = bnxfile[:bnxfile.rfind( ".")] #enables the mol fraction align in AlignModule.getAlignStats if qrypath.endswith(".cmap"): #enable the mol stats varsP.totAssemblyLenMb = mc.multiCmap( qrypath, lengthonly=True).totalLength / 1e6 varsP.memoryLogpath = os.path.join(outdir, "memory_log.txt") varsP.parseArguments() #parses optArgumentsFile varsP.checkDependencies() varsP.RefAlignerBinOrig = rabin varsP.prerunLog( ) #general information in log 
-- needed for refaligner_version noisep = {} if errbinfile: noisep = {"readparameters": errbinfile} #print "Using noise parameters from "+errbinfile+"\n" #move below elif errfile: noisep = scm.readNoiseParameters(errfile.replace(".err", "")) if noisep.has_key( 'readparameters' ): #remove this because it's redundant, and it can cause problems with RefAligner compatibility del noisep['readparameters'] if not noisep: #readNoiseParameters returns empty dict on failure print "ERROR reading noise parameters, check .err file:", errfile sys.exit(1) #redundant with below? print "Using noise parameters from " + errfile + ":\n" + " ".join( ["-" + str(k) + " " + str(v) for k, v in noisep.iteritems()]) + "\n" #some code from SampleCharModule to load args into noise0 infoReport = "Loaded noise parameters:\n" klist = [ "FP", "FN", "sf", "sd", "sr", "bpp", "readparameters" ] #hardcoding parameters is kind of bad, but it fixes the order without using OrderedDict. #noiseargs = self.varsP.argsListed('noise0') #not necessary for v in klist: if not noisep.has_key(v): continue param = str(noisep[v]) util.LogStatus("parameter", "auto_" + v, param) infoReport += v + ":" + param + "\n" varsP.replaceParam("noise0", "-" + v, param) varsP.updateInfoReport(infoReport + '\n', printalso=True) else: print "Getting file list from", qrypath outFileList = getOutFileList(util, qrypath) if not outFileList: print "ERROR: Query dir (" + qrypath + ") does not contain alignmol data. Check -q argument." 
sys.exit(1) else: print "Found", len(outFileList), "alignment results" #end if runaligns amod = alignmod.AlignModule( varsP, doref, outdir, bnxfile) #constructor will call generateJobList if runaligns: amod.runJobs() amod.checkResults() else: amod.outFileList = outFileList p = os.path.split(outFileList[0])[1] if p.count("_") > 1: #expect something like "EXP_REFINEFINAL1_4" #p = p[:p.rfind("_")+1] #remove integer suffix p = p[:p.rfind("_")] #remove integer suffix (and underscore) #else : # p += "_" #because mrgstr is appended varsP.outputContigPrefix = p if not runaligns or len(amod.jobList) > 0: amod.getAlignStats() if runaligns: print #copy from Pipeline.py if util.SummarizeErrors(varsP=varsP) == 0: varsP.updatePipeReport("Pipeline has successfully completed\n") util.LogStatus("progress", "pipeline", "success") else: varsP.updatePipeReport("Pipeline has completed with errors\n") util.LogStatus("progress", "pipeline", "failure") #BELOW OLD CODE return #in Pipeline, this is called first #print "Calling getAlignStats:" #but it won't work without varsP atm; skip it #getAlignStats(self.varsP, self.outFileList, self.varsP.totAssemblyLenMb, isref=False, mergepath=self.mergedir) #getAlignStats(self.varsP, self.outFileList, self.varsP.totAssemblyLenMb, isref=False, mergepath=self.mergedir) print "Calling mergeMap" print outFileList[0] #, "\n", outputdir #moved above util.logMemory(memory_log, startTime, "mergeMap_start") #mergeMap(self.varsP, self.outFileList, mergepath=self.outputdir) #varsP is optional alignmod.mergeMap(None, outFileList, outputdir) util.logMemory(memory_log, startTime, "mergeMap_end") print "Calling mergeRcmaps" util.logMemory(memory_log, startTime, "mergeRcmaps_start") #mergeRcmaps(outFileList, outdir, varsP=None, splitByContig=None, stageName="alignmol") : alignmod.mergeRcmaps(outFileList, outputdir, splitByContig=True, stageName=outprefix) util.logMemory(memory_log, startTime, "mergeRcmaps_end") print "Calling split_XMap_byContig" 
#split_XMapQcmap_byContig" util.logMemory(memory_log, startTime, "split_XMap_byContig_start") #xmapdict = alignmod.split_XMap_byContig(outFileList, outputdir, stageName=outprefix) #old xmapdict = alignmod.split_XMap_byContig_new(outFileList, outputdir, stageName=outprefix) util.logMemory(memory_log, startTime, "split_XMap_byContig_end") print "Calling split_Qcmap_byContig" util.logMemory(memory_log, startTime, "split_Qcmap_byContig_start") #alignmod.split_Qcmap_byContig(outFileList, outputdir, xmapdict) #old alignmod.split_Qcmap_byContig_new( outFileList, outputdir, xmapdict, stageName=outprefix) #new: better performance util.logMemory(memory_log, startTime, "split_Qcmap_byContig_end") print "AlignMerge successfully completed"
def getArgs(): parser = argparse.ArgumentParser(description=description) parser.add_argument( '-t', dest='RefAligner', help='Path to RefAligner or dir containing it (required)', type=str) parser.add_argument( '-r', dest='referenceMap', help='Path to reference maps (.cmap), 1 file only (required)', type=str) parser.add_argument( '-q', dest='queryDir', help='Path to dir containing query maps (.cmaps) (required)', type=str) #parser.add_argument('-x', dest='xmap', help='Path to .xmap, 1 file only (optional, if specified, no alignment is done, if not specified, -t, -r, and -q must be specified)') #not supported parser.add_argument( '-o', dest='outputDir', help= 'output dir (optional, defaults to input map dir with suffix "_sv")', default="", type=str) parser.add_argument( '-p', dest='pipelineDir', help= 'Pipeline dir (optional, defaults to script dir, or current directory)', default="", type=str) parser.add_argument( '-a', dest='optArguments', help= 'Path to optArguments.xml (optional, default optArguments_human.xml in Pipeline dir if found, otherwise required)', default="", type=str) parser.add_argument( '-T', dest='numThreads', help='Total number of threads (cores) to use (optional, default 4)', default=4, type=int) parser.add_argument( '-j', dest='maxthreads', help= 'Threads per Job, -maxthreads (non-cluster only;optional, default 4)', default=4, type=int) parser.add_argument( '-b', dest='bedFile', help= '.bed file with gaps in reference for flagging SVs which overlap N-base gaps (optional)', default="", type=str) parser.add_argument( '-e', dest='errFile', help= '.err file to use for noise parameters--will supersede noise parameters in the optArgument supplied (but that file must still be supplied for non-noise parameters)', default="", type=str) parser.add_argument( '-E', dest='errbinFile', help= '.errbin file to use for noise parameters--will supersede noise parameters in the optArgument supplied (but that file must still be supplied for non-noise parameters)', 
default="", type=str) parser.add_argument( '-C', help= 'Run on cluster, read XML file for submission arguments (optional--will not use cluster submission if absent)', dest='cxml', default=None) parser.add_argument( '-s', help= 'SV jobs configuration: 0 = single job (required for correct haplotype calls), 1 = single job per contig (not recommended), 2 = grouped (default 0; optional)', dest='groupsv', type=int, default=0) #parser.add_argument('-s', help='Disable grouping of SV jobs (default grouped; optional)', dest='groupsv', action='store_false') #old one result = parser.parse_args() #check all Pipeline dependencies if result.pipelineDir: cwd = result.pipelineDir else: cwd = os.path.split( os.path.realpath(__file__))[0] #this is path of this script if not os.path.isfile(os.path.join( cwd, "utilities.py")): #if still not here, last try is actual cwd cwd = os.getcwd() #still check this below #this is the only one imported here and in runCharacterize if not os.path.isfile(os.path.join(cwd, "utilities.py")): print "utilities.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir" sys.exit(1) import utilities as util #xmap -- don't use this runaligns = True #default is to run the alignment xmappath = None #if result.xmap : # xmappath = result.xmap # if not util.checkFile(xmappath, ".xmap") : # print "Xmap path is supplied ("+xmappath+") but not found or doesn't end in .xmap." # sys.exit(1) # runaligns = False #RefAligner -- check for either path to RefAligner, or dir containing it, depending on cluster args rabin = result.RefAligner #replicate Pipeline behavior: RefAligner is always required if os.path.isdir(rabin): rabin = os.path.join(rabin, "RefAligner") if not util.checkExecutable(rabin): print "RefAligner not found or not executable at", rabin, "\nPlease supply RefAligner dir or full path as -t arg." 
sys.exit(1) #reference maps -- only required if xmap not specified refcmap = os.path.realpath(result.referenceMap) if runaligns and not util.checkFile( refcmap, ".cmap"): #and not util.checkFile(refcmap, ".spots") : print "Reference map file (" + refcmap + ") not found or does not end in .cmap or .spots. Check -r argument." sys.exit(1) #query maps qrypath = os.path.realpath(result.queryDir) #if runaligns and not util.checkFile(qrypath, ".cmap") : # print "Query map file ("+qrypath+") not found or does not end in .cmap or .spots. Check -q argument." # sys.exit(1) if not util.checkDir(qrypath, checkWritable=False, makeIfNotExist=False): #does NOT have to be writeable print "Query dir (" + qrypath + ") not found or not a dir. Check -q argument." sys.exit(1) if runaligns: contigdir = qrypath #os.path.split(qrypath)[0] #dir of query maps contigbase = os.path.split(qrypath)[1] #filename else: contigdir = os.path.split(xmappath)[0] contigbase = os.path.split(xmappath)[1] #filename #contigbase = contigbase[:contigbase.find(".")] #remove suffix #optargs file optargs = None if result.optArguments: #supplied on command line optargs = result.optArguments if not util.checkFile(optargs, ".xml"): print "optArguments path is supplied (" + optargs + ") but not found or doesn't end in .xml, check -a argument." sys.exit(1) elif runaligns: #load from Pipeline dir if running alignments optargs = os.path.join(cwd, "optArguments_human.xml") if not util.checkFile(optargs): print "optArguments.xml missing in Pipeline directory (" + cwd + "). Try supplying path explicitly using -a." sys.exit(1) #cluster args clustargs = None if result.cxml: clustargs = os.path.realpath(result.cxml) if not util.checkFile(clustargs, ".xml"): print "clusterArguments path is supplied (" + clustargs + ") but not found or doesn't end in .xml, check -C argument." 
sys.exit(1) #nthreads nthreads = result.numThreads if nthreads <= 0: print "Number of threads value invalid (must be > 0): %i" % nthreads sys.exit(1) #maxthreads maxthreads = result.maxthreads if maxthreads <= 0: print "Max threads value invalid (must be > 0): %i" % maxthreads sys.exit(1) #bed file bedfile = result.bedFile #must make local for return statement below if bedfile: #must check for empty string BEFORE you do realpath, or it returns cwd bedfile = os.path.realpath(result.bedFile) if not util.checkFile(bedfile, ".bed"): print "bed file supplied but not found or incorrect suffix:", bedfile sys.exit(1) #.errbin file errbinfile = result.errbinFile if errbinfile: errbinfile = os.path.realpath(result.errbinFile) if not util.checkFile(errbinfile, ".errbin"): print "errbin file supplied but not found or incorrect suffix:", errbinfile sys.exit(1) #.err file errfile = result.errFile if errfile and errbinfile: print "Warning: .err and .errbin arguments supplied; ignoring .err file" errfile = "" elif errfile: errfile = os.path.realpath(result.errFile) if not util.checkFile(errfile, ".err"): print "err file supplied but not found or incorrect suffix:", errfile sys.exit(1) outdir = os.path.realpath(result.outputDir) groupsv = result.groupsv if groupsv < 0 or groupsv > 2: print 'ERROR: -s (grouped SV) must be 0, 1, or 2\n' sys.exit(1) #yes, this is messy...but I don't want another class (besides varsPipeline) and they just go to runCharacterize return cwd, rabin, refcmap, contigdir, contigbase, runaligns, xmappath, optargs, nthreads, maxthreads, bedfile, errfile, outdir, errbinfile, clustargs, groupsv
def runSV(cwd, rabin, refcmap, contigdir, contigbase, runaligns, xmappath, optargs, nthreads, maxthreads, bedfile, errfile, outdir, errbinfile, clustargs, groupsv):
    '''Configure a Pipeline.varsPipeline object and run SV detection (SVModule.SVdetect).

    The arguments are the tuple returned by getArgs, in the same order:
    cwd        -- dir which must contain utilities.py, Pipeline.py, SVModule.py
                  (and SampleCharModule.py if errfile is supplied)
    rabin      -- RefAligner binary path
    refcmap    -- reference cmap, stored as varsP.ref
    contigdir  -- dir containing the query cmaps
    contigbase -- base name used for varsP.latestMergedCmap and varsP.stageComplete
    runaligns  -- if False, nothing is run and "ERROR: feature not supported" is printed
    xmappath   -- not used in this function
    optargs    -- optArguments xml path
    nthreads, maxthreads -- stored as varsP.nThreads and varsP.maxthreads
    bedfile    -- optional, stored as varsP.bedFile if not empty
    errfile    -- optional .err file from which noise parameters are read
    outdir     -- output dir; if empty, contigdir + "_sv" is used
    errbinfile -- optional .errbin file, used as 'readparameters' (takes precedence over errfile)
    clustargs  -- optional cluster arguments xml; enables cluster mode if not empty
    groupsv    -- stored as varsP.groupSV

    Calls sys.exit(1) on any failed check.
    '''
    printargs = True
    missingmsg = "%s missing in dir %s check -p argument, or run this script in Pipeline dir"

    if not os.path.isfile(os.path.join(cwd, "utilities.py")):
        print(missingmsg % ("utilities.py", cwd))
        sys.exit(1)
    import utilities as util

    if not util.checkFile(os.path.join(cwd, "Pipeline.py")):
        print(missingmsg % ("Pipeline.py", cwd))
        sys.exit(1)
    import Pipeline

    if not util.checkFile(os.path.join(cwd, "SVModule.py")):
        print(missingmsg % ("SVModule.py", cwd))
        sys.exit(1)
    import SVModule as svm

    #SampleCharModule is only needed to read noise parameters from a .err file
    if errfile and not util.checkFile(os.path.join(cwd, "SampleCharModule.py")):
        print(missingmsg % ("SampleCharModule.py", cwd))
        sys.exit(1)
    elif errfile:
        import SampleCharModule as scm

    #use Pipeline objects
    varsP = Pipeline.varsPipeline()
    varsP.optArgumentsFileIn = optargs
    varsP.RefAlignerBin = rabin
    varsP.latestMergedCmap = os.path.join(contigdir, contigbase + ".cmap") #file suffix required to be .cmap
    varsP.contigFolder = os.path.split(contigdir)[0]
    varsP.nThreads = nthreads #necessary otherwise job won't start -- max threads per node
    varsP.maxthreads = maxthreads #threads per job
    varsP.ref = refcmap
    varsP.stdoutlog = True #enable -stdout -stderr args to RefAligner
    varsP.curCharacterizeCmaps = [varsP.latestMergedCmap]
    varsP.contigSubDirectories = True #needed for prepareContigIO
    varsP.doAlignMolvRef = False #do not look for copy number
    varsP.groupSV = groupsv #mimic Pipeline behavior: group or not

    if not runaligns:
        varsP.contigAlignTarget = contigdir #this is dir in which _q and _r cmaps must be located
        print("ERROR: feature not supported") #not implemented to not run jobs
        return

    varsP.runSV = False
    varsP.groupContigs = False
    varsP.stdoutlog = True #use -stdout -stderr
    varsP.stageComplete = contigbase
    #if outdir is not supplied, this is used as dir prefix; also used as file pref for -o arg
    varsP.outputContigPrefix = getContigPrefix(util, contigdir)
    varsP.outputContigFolder = contigdir #cmaps are copied from here

    if not outdir:
        outdir = contigdir + "_sv" #this will be outdir of sv jobs
    #NOTE: each message below reproduces the output of the former multi-argument
    #print statement, which is why a space precedes the final newline
    if os.path.isdir(outdir):
        if not util.checkDir(outdir): #check writeable
            print("\nERROR: Output dir is not writeable:\n%s \n" % outdir)
            sys.exit(1)
        elif outdir == contigdir:
            print("\nERROR: Output dir cannot be same as input dir:\n%s \n" % outdir)
            sys.exit(1)
        print("\nWARNING: Output dir already exists, results will be overwritten:\n%s \n" % outdir)
    elif not util.checkDir(outdir): #does not exist, make, if False, can't make or not writeable
        print("\nERROR: Output dir cannot be created or is not writeable:\n%s \n" % outdir)
        sys.exit(1)

    if clustargs:
        #SGE_ROOT is deliberately not set here: there is no default, the user is required
        #to set the environment variable; this is consistent with the Pipeline
        varsP.onCluster = True
        varsP.checkCluster() #call varsPipeline method to check SGE_ROOT
        varsP.clusterLogDir = os.path.join(outdir, 'ClusterLogs')
        util.checkDir(varsP.clusterLogDir) #make it
        varsP.checkCluster()
        varsP.clusterArgumentsFileIn = clustargs #required for parseArguments
        varsP.parseArguments(readingClusterFile=True)
        if varsP.error:
            print(varsP.message)
            sys.exit(1)
        varsP.RefAlignerBin += "${BINARY_SUFFIX:=}" #copy from varsPipeline, handled by external script on phi host

    varsP.pipeReportFile = os.path.join(outdir, "sv_jobs_log.txt")
    varsP.infoReportFile = os.path.join(outdir, "sv_log.txt")
    varsP.memoryLogpath = os.path.join(outdir, "memory_log.txt")
    if bedfile:
        varsP.bedFile = bedfile
    util.InitStatus(os.path.join(outdir, "status.xml"))
    varsP.parseArguments() #parses optArgumentsFile
    varsP.checkDependencies()
    varsP.RefAlignerBinOrig = rabin
    varsP.prerunLog() #general information in log -- needed for refaligner_version
    if printargs:
        print("\nRunning SV detection with arguments (" + os.path.split(optargs)[1] + "):\n" + " ".join(varsP.argsListed('svdetect')) + '\n')

    noisep = {}
    if errbinfile:
        noisep = {"readparameters": errbinfile}
        print("Using noise parameters from " + errbinfile + "\n")
    elif errfile:
        #strip only the trailing suffix: str.replace would also remove any ".err"
        #occurring elsewhere in the path (eg, in a directory name)
        errsuf = ".err"
        errpref = (errfile[:-len(errsuf)] if errfile.endswith(errsuf) else errfile)
        noisep = scm.readNoiseParameters(errpref)
        #remove this because it's redundant, and it can cause problems with RefAligner compatibility
        noisep.pop('readparameters', None)
        if not noisep: #readNoiseParameters returns empty dict on failure
            print("ERROR reading noise parameters, check .err file: %s" % errfile)
            sys.exit(1)
        print("Using noise parameters from " + errfile + ":\n" + " ".join(["-" + str(k) + " " + str(v) for k, v in noisep.items()]) + "\n")

    varsP.outputContigFolder = contigdir #cmaps are copied from here

    #make merged cmap to replace merged _q.cmap if not produced by RefAligner
    cmaps = util.getListOfFilesFromDir(varsP.outputContigFolder, suffix=".cmap")
    if len(cmaps) > 1:
        varsP.contigPathTxtFile = os.path.join(outdir, "contig_list.txt") #mergeIntoSingleCmap creates this file
        print("Creating merged cmap")
        varsP.mergeIntoSingleCmap(outdir)
        print("Merged cmap created: %s \n" % varsP.latestMergedCmap)
        if varsP.groupSV == 0: #if it is a single job, use merged map just created
            varsP.outputContigFolder = outdir #input == output
    elif len(cmaps) == 1:
        varsP.latestMergedCmap = cmaps[0]
    else: #this is already checked in getContigPrefix (redundant)
        print("No cmaps found in input dir; check dir %s\n" % varsP.outputContigFolder)
        sys.exit(1)

    svmodule = svm.SVdetect(varsP, noisep, outdir, skipderes=True)
    svmodule.runJobs()
    svmodule.checkResults()
    util.SummarizeErrors(varsP)
def mergeRcmaps(outFileList, outdir, varsP=None, splitByContig=None, stageName=""):
    """Merge a set of _r.cmap files and write the result to outdir.

    outFileList   -- list of file prefixes ("_r.cmap" is appended) or full paths already
                     ending in "_r.cmap"; the list is sorted in place. Entries missing on
                     disk are skipped with a warning.
    outdir        -- output dir (checked with util.checkDir).
    varsP         -- if supplied, messages are reported through it (via logOrPrintError) and
                     its alignMolvrefMergeName / outputContigPrefix are used for file names.
    splitByContig -- if < 1 (or None), output each contig separately; if == 1, only output
                     the single merged cmap; if > 1, do both.
    stageName     -- always used as file prefix if supplied; if not, varsP must be supplied,
                     otherwise the prefix is empty.

    Returns None in all cases; problems are reported as warnings.
    """
    if not util.checkDir(outdir):
        err_msg = "Warning in AlignModule.mergeRcmaps: could not make outdir %s, skipping copy number" % outdir
        logOrPrintError(err_msg, varsP)
        return

    if not outFileList: #just an argument check--check for presence on disk is below
        err_msg = "Warning in AlignModule.mergeRcmaps: no maps supplied"
        logOrPrintError(err_msg, varsP)
        return

    outFileList.sort() #for reproducibility with runAlignMerge.py (different order when listing dir)

    rsuf = "_r.cmap"
    #even though outFileList should all be there, a job may have failed--check all, just existence
    present = []
    for outf in outFileList:
        target = (outf if outf.endswith(rsuf) else outf + rsuf) #support either prefix or full path
        if util.checkFile(target):
            present.append(target)
        else:
            err_msg = "Warning in AlignModule.mergeRcmaps: missing _r.cmap %s" % target
            logOrPrintError(err_msg, varsP)
    if not present: #no _r.cmaps found
        err_msg = "Warning in AlignModule.mergeRcmaps: no _r.cmaps found, skipping copy number"
        logOrPrintError(err_msg, varsP)
        return

    mrgstr = (varsP.alignMolvrefMergeName if varsP else "merge")

    mergedmap = mc.multiCmap(present[0]) #open original, edit in memory
    for rmap in present[1:]: #don't add map 0 to itself
        if mergedmap.addCovOcc(mc.multiCmap(rmap)): #when calling addCovOcc, check return, warn if True
            err_msg = "Warning in AlignModule.mergeRcmaps: addCovOcc call failed for map %s" % rmap
            logOrPrintError(err_msg, varsP)

    #now it's merged, but the resulting map needs to be written back to disk
    filepref = (varsP.outputContigPrefix if varsP and stageName == "" else stageName) #see split_XMapQcmap_byContig

    #None used to fall in the "< 1" case only because python 2 orders None below all
    #numbers; make that explicit so the comparisons below are well defined
    if splitByContig is None:
        splitByContig = 0

    reports = []
    if splitByContig < 1 or splitByContig > 1:
        mergedmap.writeAllMapsToDisk(os.path.join(outdir, filepref + '_contig'), outsuf="_r")
        reports.append("mergeRcmaps: wrote %i cmaps" % len(mergedmap.cmapdict))
    if splitByContig > 0:
        mergedmap.writeToFile(os.path.join(outdir, filepref + "_" + mrgstr + rsuf))
        reports.append("mergeRcmaps: wrote merged cmap with %i contigs" % len(mergedmap.cmapdict))

    #report result: one message per output written
    for report in reports:
        logOrPrintError(report, varsP, warn=False)
def split_Qcmap_byContig_new(inFileList, mergepath, xmapDict, varsP=None, stageName=""):
    """Split the molecules in a set of _q.cmap files into one _q.cmap file per contig.

    inFileList -- file prefixes; "_q.cmap" is appended to each. Files missing on disk are
                  skipped with a warning; if none is found an error is reported and the
                  function returns.
    mergepath  -- dir in which the per-contig _q.cmap files are written.
    xmapDict   -- contig id : list of ids of the molecules aligned to that contig.
    varsP      -- passed to logOrPrintError; its outputContigPrefix is the output file
                  prefix when stageName is empty.
    stageName  -- output file prefix if not empty (or if varsP is not supplied).

    One output file (header only) is created for every contig in xmapDict, then each
    molecule is appended to the file of every contig it aligns to.
    Returns None. Raises KeyError if a molecule in a _q.cmap is not listed in xmapDict.
    """
    # readin all _q.cmap:
    qcmapFilelist = []
    for outpath in sorted(inFileList): #these are file prefixes--sort to ensure reproducibility
        qcmap = outpath + "_q.cmap"
        if util.checkFile(qcmap):
            qcmapFilelist.append(qcmap)
        else:
            err_msg = "Warning in AlignModule.split_XMapQcmap_byContig: missing _q.cmap: " + qcmap
            logOrPrintError(err_msg, varsP)
    if not qcmapFilelist: #nothing to merge
        err_msg = "Error in AlignModule.split_XMapQcmap_byContig: no _q.cmaps found"
        logOrPrintError(err_msg, varsP)
        return

    #header (leading comment lines) of the first qcmap is used for all output files
    header = ""
    with open(qcmapFilelist[0]) as f1:
        for line in f1:
            if not line.startswith('#'):
                break
            header += line

    filepref = (varsP.outputContigPrefix if varsP and stageName == "" else stageName) #same as line in mergeRcmaps

    def contigPath(cid):
        #path of the output _q.cmap for contig cid
        return os.path.join(mergepath, filepref + '_contig' + str(cid) + '_q.cmap')

    #create all output files, header only
    for contigid in xmapDict:
        with open(contigPath(contigid), "w") as fout:
            fout.write(header)

    #convert xmapDict to a molDict: keys are molids, and values are lists of contig ids
    molDict = {}
    for cid, molids in xmapDict.items():
        for molid in molids:
            molDict.setdefault(molid, []).append(cid)

    def writeMol(molid, molstr):
        #append the lines of one molecule to the qcmap of every contig it aligns to
        for cid in molDict[molid]:
            with open(contigPath(cid), "a") as fout:
                fout.write(molstr)

    #read input files, find all contigs to which each molecule aligns, write to that qcmap
    nmol = 0
    for qcmap in qcmapFilelist:
        previd = None #id of the molecule being accumulated; None until the first data line
        molstr = "" #all the lines in the _q.cmap for this molecule
        with open(qcmap) as f1:
            for line in f1:
                if line.startswith('#'):
                    continue
                molid = int(line.split()[0]) #use int bc ids in xmapDict are compared to this
                if molid == previd: #get data for this mol
                    molstr += line
                else: #write previous mol to output qcmap
                    if molstr: #not for first mol
                        writeMol(previd, molstr)
                    #prepare for next mol
                    molstr = line
                    previd = molid
                    nmol += 1
        #get last molecule; skipped if this file had no data lines, in which case there
        #is no molecule id to look up
        if molstr:
            writeMol(previd, molstr)

    logOrPrintError("split_XMapQcmap_byContig: wrote %i _q.cmaps with %i molecules" % (len(xmapDict), nmol), varsP, warn=False)
def split_Qcmap_byContig_new(inFileList, mergepath, xmapDict, varsP=None, stageName=""):
    """Split the molecules in a set of _q.cmap files into one _q.cmap file per contig.

    inFileList -- file prefixes; "_q.cmap" is appended to each. Files missing on disk are
                  skipped with a warning; if none is found an error is reported and the
                  function returns.
    mergepath  -- dir in which the per-contig _q.cmap files are written.
    xmapDict   -- contig id : list of ids of the molecules aligned to that contig.
    varsP      -- passed to logOrPrintError; its outputContigPrefix is the output file
                  prefix when stageName is empty.
    stageName  -- output file prefix if not empty (or if varsP is not supplied).

    One output file (header only) is created for every contig in xmapDict, then each
    molecule is appended to the file of every contig it aligns to.
    Returns None. Raises KeyError if a molecule in a _q.cmap is not listed in xmapDict.
    """
    # readin all _q.cmap:
    qcmapFilelist = []
    for outpath in sorted(inFileList): #these are file prefixes--sort to ensure reproducibility
        qcmap = outpath + "_q.cmap"
        if util.checkFile(qcmap):
            qcmapFilelist.append(qcmap)
        else:
            err_msg = "Warning in AlignModule.split_XMapQcmap_byContig: missing _q.cmap: " + qcmap
            logOrPrintError(err_msg, varsP)
    if not qcmapFilelist: #nothing to merge
        err_msg = "Error in AlignModule.split_XMapQcmap_byContig: no _q.cmaps found"
        logOrPrintError(err_msg, varsP)
        return

    #header (leading comment lines) of the first qcmap is used for all output files
    header = ""
    with open(qcmapFilelist[0]) as f1:
        for line in f1:
            if not line.startswith('#'):
                break
            header += line

    filepref = (varsP.outputContigPrefix if varsP and stageName == "" else stageName) #same as line in mergeRcmaps

    def contigPath(cid):
        #path of the output _q.cmap for contig cid
        return os.path.join(mergepath, filepref + '_contig' + str(cid) + '_q.cmap')

    #create all output files, header only
    for contigid in xmapDict:
        with open(contigPath(contigid), "w") as fout:
            fout.write(header)

    #convert xmapDict to a molDict: keys are molids, and values are lists of contig ids
    molDict = {}
    for cid, molids in xmapDict.items():
        for molid in molids:
            molDict.setdefault(molid, []).append(cid)

    def writeMol(molid, molstr):
        #append the lines of one molecule to the qcmap of every contig it aligns to
        for cid in molDict[molid]:
            with open(contigPath(cid), "a") as fout:
                fout.write(molstr)

    #read input files, find all contigs to which each molecule aligns, write to that qcmap
    nmol = 0
    for qcmap in qcmapFilelist:
        previd = None #id of the molecule being accumulated; None until the first data line
        molstr = "" #all the lines in the _q.cmap for this molecule
        with open(qcmap) as f1:
            for line in f1:
                if line.startswith('#'):
                    continue
                molid = int(line.split()[0]) #use int bc ids in xmapDict are compared to this
                if molid == previd: #get data for this mol
                    molstr += line
                else: #write previous mol to output qcmap
                    if molstr: #not for first mol
                        writeMol(previd, molstr)
                    #prepare for next mol
                    molstr = line
                    previd = molid
                    nmol += 1
        #get last molecule; skipped if this file had no data lines, in which case there
        #is no molecule id to look up
        if molstr:
            writeMol(previd, molstr)

    logOrPrintError("split_XMapQcmap_byContig: wrote %i _q.cmaps with %i molecules" % (len(xmapDict), nmol), varsP, warn=False)
def getArgs():
    """Parse and validate the command line; return the arguments for runCharacterize.

    Every failed check prints a message and calls sys.exit(1).

    Returns the tuple
    (cwd, rabin, refcmap, contigdir, contigbase, runaligns, xmappath, optargs, nthreads):
    cwd        -- Pipeline dir (-p, or the current directory); must contain utilities.py
    rabin      -- RefAligner path (-t); only checked if no xmap is supplied
    refcmap    -- reference map path (-r); only checked if alignments will be run
    contigdir  -- dir of the query map (-q)
    contigbase -- file name of the query map up to its first "."
    runaligns  -- False if an xmap was supplied (-x), otherwise True
    xmappath   -- xmap path (-x) or None
    optargs    -- optArguments xml path (-a); optArguments_human.xml in cwd if not
                  supplied and alignments will be run; None otherwise
    nthreads   -- number of threads (-n), > 0
    """
    parser = argparse.ArgumentParser(description=description)

    parser.add_argument('-t', dest='RefAligner', help='Path to RefAligner (required unless xmap is specified (-x))')
    parser.add_argument('-r', dest='referenceMap', help='Path to reference maps (.cmap or .spots), 1 file only (required unless xmap specified (-x) and _r.cmap is present in same dir as xmap)', default="")
    parser.add_argument('-q', dest='queryMap', help='Path to query maps (.cmap), 1 file only (required--if xmap specified (-x), this should be input (-i argument) for that command)', default="")
    parser.add_argument('-x', dest='xmap', help='Path to .xmap, 1 file only (optional, if specified, no alignment is done, if not specified, -t, -r, and -q must be specified)')
    parser.add_argument('-p', dest='pipelineDir', help='Pipeline dir (optional, defaults to current directory)')
    parser.add_argument('-a', dest='optArguments', help='Path to optArguments.xml (optional, default in Pipeline dir if found, otherwise required)')
    parser.add_argument('-n', dest='numThreads', help='Number of threads (cores) to use (optional, default 4)', default=4, type=int)

    result = parser.parse_args()

    #check all Pipeline dependencies
    cwd = (result.pipelineDir if result.pipelineDir else os.getcwd())

    #this is the only one imported here and in runCharacterize
    if not os.path.isfile(os.path.join(cwd, "utilities.py")):
        print("utilities.py missing in dir %s check -p argument, or run this script in Pipeline dir" % cwd)
        sys.exit(1)
    import utilities as util

    #xmap -- optional
    runaligns = True #default is to run the alignment
    xmappath = None
    if result.xmap:
        xmappath = result.xmap
        if not util.checkFile(xmappath, ".xmap"):
            print("Xmap path is supplied (" + xmappath + ") but not found or doesn't end in .xmap.")
            sys.exit(1)
        runaligns = False

    #RefAligner -- only required if xmap not specified
    rabin = result.RefAligner
    if not xmappath and not util.checkExecutable(rabin):
        print("RefAligner not found at %s \nPlease supply RefAligner full path as -t arg." % rabin)
        sys.exit(1)

    #reference maps -- only required if xmap not specified
    refcmap = result.referenceMap
    if runaligns and not util.checkFile(refcmap, ".cmap") and not util.checkFile(refcmap, ".spots"):
        print("Reference map file (" + refcmap + ") not found or does not end in .cmap or .spots. Check -r argument.")
        sys.exit(1)

    #query maps -- always required
    qrypath = result.queryMap
    if not util.checkFile(qrypath, ".cmap"):
        print("Query map file (" + qrypath + ") not found or does not end in .cmap or .spots. Check -q argument.")
        sys.exit(1)
    contigdir, contigbase = os.path.split(qrypath) #dir of query maps, filename
    contigbase = contigbase[:contigbase.find(".")] #remove suffix (everything from the first ".")

    #optargs file
    optargs = None
    if result.optArguments: #supplied on command line
        optargs = result.optArguments
        if not util.checkFile(optargs, ".xml"):
            print("optArguments path is supplied (" + optargs + ") but not found or doesn't end in .xml, check -a argument.")
            sys.exit(1)
    elif runaligns: #load from Pipeline dir if running alignments
        optafile = "optArguments_human.xml"
        optargs = os.path.join(cwd, optafile)
        if not util.checkFile(optargs):
            print("%s missing in Pipeline directory (%s). Try supplying path explicitly using -a." % (optafile, cwd))
            sys.exit(1)

    #nthreads
    nthreads = result.numThreads
    if nthreads <= 0:
        #nthreads is an int: format it rather than concatenating it to the message,
        #which raised TypeError instead of reporting the problem
        print("Number of threads value invalid (must be > 0): %i" % nthreads)
        sys.exit(1)

    #yes, this is messy...but I don't want another class (besides varsPipeline) and they just go to runCharacterize
    return cwd, rabin, refcmap, contigdir, contigbase, runaligns, xmappath, optargs, nthreads
def getAlignStats(varsP, outFileList, reflen=0, isref=False, mergepath="", bnxpath=None):
    '''Report molecule and alignment statistics; shared by AlignModule and AlignRefModule.

    varsP       -- pipeline variables object; all reporting goes through its
                   updateInfoReport / updatePipeReport methods.
    outFileList -- file prefixes of the alignment jobs; prefix + ".xmap" and prefix + ".err"
                   are loaded if present.
    reflen      -- reference length, should be in Mb; coverage lines are added if non-zero.
    isref       -- selects the "Ref"/"Reference" labels instead of "Contig"/"Assembly".
    mergepath   -- if supplied and at least one .err was found, an .err holding the
                   averaged parameters is written there.
    bnxpath     -- if None, varsP.sorted_file + ".bnx" is used; otherwise only the stats
                   of this bnx are reported and outFileList is ignored.

    Returns None.
    '''
    def _mean(values):
        #average of a list, 0 if the list is empty
        return (sum(values) / len(values) if len(values) else 0)

    statonly = bnxpath is not None #bnx stats only
    skipbnx = False #.err file processing only
    if not statonly:
        if varsP.sorted_file:
            #set in PairwiseModule.sort_BNX even if bypassed, but needs suffix
            bnxpath = varsP.sorted_file + ".bnx"
        else: #for runAlignMol, this is empty: nothing to do in this case
            skipbnx = True

    if not (skipbnx or util.checkFile(bnxpath)):
        varsP.updatePipeReport("Warning in AlignModule.getAlignStats: bnxpath supplied but not found: %s\n" % bnxpath)
        return

    #find the minlen used for bnx_sort, which is a required arg set
    sortargs = (varsP.argsListed('bnx_sort') if 'bnx_sort' in varsP.argData else []) #absent for runAlignMol.py
    minlen = 0
    validminlen = False
    if "-minlen" in sortargs:
        #next ele should be the len; getIntFromString returns None if it can't cast to int
        minlen = util.getIntFromString(sortargs[sortargs.index("-minlen") + 1])
        validminlen = bool(minlen)
    if sortargs and bnxpath is None and not validminlen:
        varsP.updatePipeReport("Warning in AlignModule.getAlignStats: unable to obtain minlen from bnx_sort arguments; defaulting to 0\n")
    #NOTE(review): bnxpath was already replaced by the sorted bnx path above, so this resets
    #minlen whenever any bnx is read, not only when the caller supplied one -- confirm intent
    if bnxpath is not None: #if bnxpath, ignore minlen
        minlen = 0

    nmol = 0 #total n mol above minlen
    totlen = 0 #total mol len above minlen
    if util.checkFile(bnxpath):
        moldict = util.simpleBnxStats(bnxpath, minlen)
        nmol = moldict["nmol"]
        totlen = moldict["totlen"]
        parts = [
            "Reading molecule stats from %s:\n" % bnxpath,
            "Molecule Stats:\n",
            "N mols: %i\n" % nmol,
            "Total len (Mb): %10.3f\n" % totlen,
            "Avg len (kb) : %10.3f\n" % moldict["avglen"],
            "Mol N50 (kb) : %10.3f\n" % moldict["n50"],
            "Lab (/100kb) : %10.3f\n" % moldict["labdensity"],
        ]
        if reflen:
            cov = totlen / reflen #totlen is in Mb
            parts.append("%-6s Cov (x): %10.3f\n" % ("Ref" if isref else "Contig", cov))
        if isref or reflen or statonly: #if neither, nothing to print
            varsP.updateInfoReport("".join(parts) + "\n", printalso=True)
    elif not skipbnx:
        varsP.updatePipeReport("Warning in AlignModule.getAlignStats: missing bnx path:" + bnxpath + "\n")

    if statonly:
        return

    #lastly, load .xmaps and .errs from alignmol jobs and report on stats
    totmaplen = 0 #sum of lengths of mapped portions of all molecules, on reference
    totmapqrylen = 0 #sum of lengths of mapped portions of all molecules, on query
    totconf = 0 #sum of confidence of all alignments
    nalign = 0 #total number of alignments
    #attributes read from each .err (alignParams object), one list of values per attribute
    errfields = ("fp", "fprate", "fn", "bpp", "res", "nmaps", "goodmaps",
                 "llrm", "llrgm", "bppsd", "sf", "sd", "sr", "ressd")
    errvals = dict((name, []) for name in errfields)
    header = ""
    err = None #will be the alignParams object of the last .err found, if any
    mappref = ""
    if len(outFileList) > 0:
        mappref = getMergeFilename(outFileList[0])

    for outpath in outFileList: #these are file prefixes
        xmappath = outpath + ".xmap"
        if util.checkFile(xmappath):
            xmap = mc.xmap(xmappath)
            nalign += len(xmap.xmapLookup)
            totmaplen += xmap.getSumMappedRefLen() #in kb
            totmapqrylen += xmap.getSumMappedQryLen() #in kb
            totconf += sum([entry.Confidence for entry in xmap.xmapLookup.values()])
        else:
            varsP.updatePipeReport("Warning in AlignModule.getAlignStats: missing xmap:" + outpath + ".xmap" + "\n")
        errpath = outpath + ".err"
        if util.checkFile(errpath):
            err = mc.alignParams(errpath)
            if not header:
                header = err.header
            for name in errfields:
                errvals[name].append(getattr(err, name))

    #nalign from xmap should be the same as goodmaps from .err
    sumgoodmaps = sum(errvals["goodmaps"])
    if sumgoodmaps != nalign:
        varsP.updateInfoReport("Warning in getAlignStats: n mol align differ in .err files (%i) and .xmaps (%i)\n" % (sumgoodmaps, nalign), printalso=True)

    if totmaplen or totconf or nalign:
        parts = [
            "Molecules Aligned to %s:\n" % ("Reference" if isref else "Assembly"),
            "N mol align : %9i\n" % nalign,
            "Mol fraction align: %13.3f\n" % (float(nalign) / nmol if nmol else 0),
            "Tot align len (Mb): %11.1f\n" % (totmapqrylen / 1e3), #Mb
        ]
        if reflen > 0:
            parts.append("Effective Cov (x) : %13.3f\n" % (totmaplen / 1e3 / reflen)) #totmaplen is in kb
        parts.extend([
            "Avg align len (kb): %11.1f\n" % (totmapqrylen / nalign if nalign else 0),
            #totmapqrylen is in kb, totlen is in mb
            "Fraction align len: %13.3f\n" % (totmapqrylen / 1e3 / totlen if totlen else 0),
            "Tot confidence : %11.1f\n" % totconf,
            "Avg confidence : %11.1f\n" % (totconf / nalign if nalign else 0),
        ])
        varsP.updateInfoReport("".join(parts), printalso=True)

    avgfp = _mean(errvals["fp"])
    avgfpr = _mean(errvals["fprate"])
    avgfn = _mean(errvals["fn"])
    avgbpp = _mean(errvals["bpp"])
    avgres = _mean(errvals["res"])
    avgllr = _mean(errvals["llrm"])
    avgllg = _mean(errvals["llrgm"])
    avgbps = _mean(errvals["bppsd"])
    avgsf = _mean(errvals["sf"])
    avgsd = _mean(errvals["sd"])
    avgsr = _mean(errvals["sr"])
    avgrsd = _mean(errvals["ressd"])

    if avgfp or avgfn or avgbpp:
        parts = [
            "Avg FP(/100kb) : %12.2f\n" % avgfp,
            "Avg FP ratio : %13.3f\n" % avgfpr,
            "Avg FN ratio : %13.3f\n" % avgfn,
            "Avg bpp : %11.1f\n" % avgbpp,
            "Avg sf : %13.3f\n" % avgsf,
            "Avg sd : %13.3f\n" % avgsd,
            "Avg sr : %13.3f\n" % avgsr,
        ]
        varsP.updateInfoReport("".join(parts) + "\n", printalso=True)

    if not (err and mergepath):
        return

    #have an error file (alignParams) object: overwrite its values with the merged ones and write it
    util.checkDir(mergepath)
    mrgstr = (varsP.alignMolvrefMergeName if varsP else "merge")
    mergederr = os.path.join(mergepath, mappref + mrgstr + ".err")
    err.fp = avgfp
    err.fn = avgfn
    err.sf = avgsf
    err.sd = avgsd
    err.bpp = avgbpp
    err.res = avgres
    err.nmaps = sum(errvals["nmaps"])
    err.llrm = avgllr
    err.goodmaps = sumgoodmaps
    err.llrgm = avgllg
    err.bppsd = avgbps
    err.fprate = avgfpr
    err.sr = avgsr
    err.ressd = avgrsd
    err.writeToFile(mergederr)
def getArgs():
    """Parse and validate the command line; return the arguments for runSV.

    Every failed check prints a message and calls sys.exit(1).

    Returns the tuple
    (cwd, rabin, refcmap, contigdir, contigbase, runaligns, xmappath, optargs, nthreads,
     maxthreads, bedfile, errfile, outdir, errbinfile, clustargs, groupsv):
    cwd        -- Pipeline dir: -p, else the dir of this script, else the current directory;
                  must contain utilities.py
    rabin      -- RefAligner path (-t); if -t is a dir, "RefAligner" inside it
    refcmap    -- real path of the reference cmap (-r)
    contigdir  -- real path of the query dir (-q)
    contigbase -- last path component of the query dir
    runaligns  -- always True (xmap input is not supported)
    xmappath   -- always None
    optargs    -- optArguments xml path (-a), or optArguments_human.xml in cwd
    nthreads, maxthreads -- -T and -j, both > 0
    bedfile, errbinfile, errfile -- real paths, or "" if not supplied; errfile is reset
                  to "" if errbinfile is also supplied
    outdir     -- real path of -o, or "" if not supplied
    clustargs  -- real path of -C, or None
    groupsv    -- False if -s was given, True otherwise
    """
    parser = argparse.ArgumentParser(description=description)

    parser.add_argument('-t', dest='RefAligner', help='Path to RefAligner or dir containing it (required)', type=str)
    parser.add_argument('-r', dest='referenceMap', help='Path to reference maps (.cmap), 1 file only (required)', type=str)
    parser.add_argument('-q', dest='queryDir', help='Path to dir containing query maps (.cmaps) (required)', type=str)
    parser.add_argument('-o', dest='outputDir', help='output dir (optional, defaults to input map dir with suffix "_sv")', default="", type=str)
    parser.add_argument('-p', dest='pipelineDir', help='Pipeline dir (optional, defaults to script dir, or current directory)', default="", type=str)
    parser.add_argument('-a', dest='optArguments', help='Path to optArguments.xml (optional, default optArguments_human.xml in Pipeline dir if found, otherwise required)', default="", type=str)
    parser.add_argument('-T', dest='numThreads', help='Total number of threads (cores) to use (optional, default 4)', default=4, type=int)
    parser.add_argument('-j', dest='maxthreads', help='Threads per Job, -maxthreads (non-cluster only;optional, default 4)', default=4, type=int)
    parser.add_argument('-b', dest='bedFile', help='.bed file with gaps in reference for flagging SVs which overlap N-base gaps (optional)', default="", type=str)
    parser.add_argument('-e', dest='errFile', help='.err file to use for noise parameters--will supersede noise parameters in the optArgument supplied (but that file must still be supplied for non-noise parameters)', default="", type=str)
    parser.add_argument('-E', dest='errbinFile', help='.errbin file to use for noise parameters--will supersede noise parameters in the optArgument supplied (but that file must still be supplied for non-noise parameters)', default="", type=str)
    parser.add_argument('-C', help='Run on cluster, read XML file for submission arguments (optional--will not use cluster submission if absent)', dest='cxml', default=None)
    parser.add_argument('-s', help='Disable grouping of SV jobs (default grouped; optional)', dest='groupsv', action='store_false')

    result = parser.parse_args()

    #check all Pipeline dependencies
    if result.pipelineDir:
        cwd = result.pipelineDir
    else:
        cwd = os.path.split(os.path.realpath(__file__))[0] #this is path of this script
        if not os.path.isfile(os.path.join(cwd, "utilities.py")): #if still not here, last try is actual cwd
            cwd = os.getcwd() #still check this below

    #this is the only one imported here and in runCharacterize
    if not os.path.isfile(os.path.join(cwd, "utilities.py")):
        print("utilities.py missing in dir %s check -p argument, or run this script in Pipeline dir" % cwd)
        sys.exit(1)
    import utilities as util

    #xmap -- not supported: alignments are always run
    runaligns = True
    xmappath = None

    #RefAligner -- either path to RefAligner, or dir containing it; always required
    rabin = result.RefAligner
    if rabin and os.path.isdir(rabin):
        rabin = os.path.join(rabin, "RefAligner")
    #test for a missing -t first: os.path.isdir(None) raises instead of reporting the problem
    if not rabin or not util.checkExecutable(rabin):
        print("RefAligner not found or not executable at %s \nPlease supply RefAligner dir or full path as -t arg." % rabin)
        sys.exit(1)

    #reference maps -- realpath of a missing argument raises, so only resolve if supplied
    refcmap = (os.path.realpath(result.referenceMap) if result.referenceMap else "")
    if runaligns and (not refcmap or not util.checkFile(refcmap, ".cmap")):
        print("Reference map file (" + refcmap + ") not found or does not end in .cmap or .spots. Check -r argument.")
        sys.exit(1)

    #query maps -- same as above for a missing argument
    qrypath = (os.path.realpath(result.queryDir) if result.queryDir else "")
    if not qrypath or not util.checkDir(qrypath, checkWritable=False, makeIfNotExist=False): #does NOT have to be writeable
        print("Query dir (" + qrypath + ") not found or not a dir. Check -q argument.")
        sys.exit(1)
    if runaligns:
        contigdir = qrypath #dir of query maps
        contigbase = os.path.split(qrypath)[1] #last component of the dir
    else:
        contigdir, contigbase = os.path.split(xmappath)

    #optargs file
    optargs = None
    if result.optArguments: #supplied on command line
        optargs = result.optArguments
        if not util.checkFile(optargs, ".xml"):
            print("optArguments path is supplied (" + optargs + ") but not found or doesn't end in .xml, check -a argument.")
            sys.exit(1)
    elif runaligns: #load from Pipeline dir if running alignments
        optafile = "optArguments_human.xml"
        optargs = os.path.join(cwd, optafile)
        if not util.checkFile(optargs):
            #name the file which was actually looked for
            print("%s missing in Pipeline directory (%s). Try supplying path explicitly using -a." % (optafile, cwd))
            sys.exit(1)

    #cluster args
    clustargs = None
    if result.cxml:
        clustargs = os.path.realpath(result.cxml)
        if not util.checkFile(clustargs, ".xml"):
            print("clusterArguments path is supplied (" + clustargs + ") but not found or doesn't end in .xml, check -C argument.")
            sys.exit(1)

    #nthreads
    nthreads = result.numThreads
    if nthreads <= 0:
        print("Number of threads value invalid (must be > 0): %i" % nthreads)
        sys.exit(1)

    #maxthreads
    maxthreads = result.maxthreads
    if maxthreads <= 0:
        print("Max threads value invalid (must be > 0): %i" % maxthreads)
        sys.exit(1)

    #bed file
    bedfile = result.bedFile #must make local for return statement below
    if bedfile: #must check for empty string BEFORE you do realpath, or it returns cwd
        bedfile = os.path.realpath(result.bedFile)
        if not util.checkFile(bedfile, ".bed"):
            print("bed file supplied but not found or incorrect suffix: %s" % bedfile)
            sys.exit(1)

    #.errbin file
    errbinfile = result.errbinFile
    if errbinfile:
        errbinfile = os.path.realpath(result.errbinFile)
        if not util.checkFile(errbinfile, ".errbin"):
            print("errbin file supplied but not found or incorrect suffix: %s" % errbinfile)
            sys.exit(1)

    #.err file
    errfile = result.errFile
    if errfile and errbinfile:
        print("Warning: .err and .errbin arguments supplied; ignoring .err file")
        errfile = ""
    elif errfile:
        errfile = os.path.realpath(result.errFile)
        if not util.checkFile(errfile, ".err"):
            print("err file supplied but not found or incorrect suffix: %s" % errfile)
            sys.exit(1)

    #output dir: realpath of an empty string returns cwd, which would override the
    #documented default (input dir + "_sv", applied by runSV when outdir is empty)
    outdir = (os.path.realpath(result.outputDir) if result.outputDir else "")

    #yes, this is messy...but I don't want another class (besides varsPipeline) and they just go to runSV
    return cwd, rabin, refcmap, contigdir, contigbase, runaligns, xmappath, optargs, nthreads, maxthreads, bedfile, errfile, outdir, errbinfile, clustargs, result.groupsv
def runSV(cwd, rabin, refcmap, contigdir, contigbase, runaligns, xmappath, optargs, nthreads, maxthreads, bedfile, errfile, outdir, errbinfile, clustargs, groupsv): '''Load Pipeline files from first arg; configure CharacterizeModule; run alignments if runaligns; report on those alignments or the xmap provided as xmappath. ''' printargs = True if not os.path.isfile(os.path.join(cwd,"utilities.py")): print "utilities.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir" sys.exit(1) import utilities as util if not util.checkFile(os.path.join(cwd,"Pipeline.py")): print "Pipeline.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir" sys.exit(1) import Pipeline if not util.checkFile(os.path.join(cwd,"SVModule.py")): print "SVModule.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir" sys.exit(1) import SVModule as svm if errfile and not util.checkFile(os.path.join(cwd,"SampleCharModule.py")): print "SampleCharModule.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir" sys.exit(1) elif errfile : import SampleCharModule as scm #use Pipeline objects varsP = Pipeline.varsPipeline() varsP.optArgumentsFileIn = optargs varsP.RefAlignerBin = rabin varsP.latestMergedCmap = os.path.join(contigdir, contigbase+".cmap") #file suffix required to be .cmap varsP.contigFolder = os.path.split(contigdir)[0] varsP.nThreads = nthreads #necessary otherwise job won't start -- max threads per node varsP.maxthreads = maxthreads #threads per job varsP.ref = refcmap varsP.stdoutlog = True #enable -stdout -stderr args to RefAligner varsP.curCharacterizeCmaps = [varsP.latestMergedCmap] varsP.contigSubDirectories = True #needed for prepareContigIO varsP.doAlignMolvRef = False #do not look for copy number varsP.groupSV = groupsv #mimic Pipeline behavior: group or not if runaligns : #varsP.contigAlignTarget = outdir varsP.runSV = False varsP.groupContigs = False varsP.stdoutlog = True #use 
-stdout -stderr varsP.stageComplete = contigbase varsP.outputContigPrefix = getContigPrefix(util, contigdir) #if outdir is not supplied, this is used as dir prefix; also used as file pref for -o arg varsP.outputContigFolder = contigdir #cmaps are copied from here if not outdir : outdir = contigdir+"_sv" #this will be outdir of sv jobs if os.path.isdir(outdir) : if not util.checkDir(outdir) : #check writeable print "\nERROR: Output dir is not writeable:\n", outdir, "\n" sys.exit(1) elif outdir == contigdir : print "\nERROR: Output dir cannot be same as input dir:\n", outdir, "\n" sys.exit(1) print "\nWARNING: Output dir already exists, results will be overwritten:\n", outdir, "\n" elif not util.checkDir(outdir) : #does not exist, make, if False, can't make or not writeable print "\nERROR: Output dir cannot be created or is not writeable:\n", outdir, "\n" sys.exit(1) if clustargs : os.putenv('SGE_ROOT', '/var/lib/gridengine') #do I want this??? varsP.onCluster = True varsP.clusterLogDir = os.path.join(outdir, 'ClusterLogs') util.checkDir(varsP.clusterLogDir) #make it varsP.checkCluster() varsP.clusterArgumentsFileIn = clustargs #required for parseArguments varsP.parseArguments(readingClusterFile=True) if varsP.error : print varsP.message sys.exit(1) varsP.RefAlignerBin += "${BINARY_SUFFIX:=}" #copy from varsPipeline, handled by external script on phi host varsP.pipeReportFile = os.path.join(outdir, "sv_jobs_log.txt") varsP.infoReportFile = os.path.join(outdir, "sv_log.txt") varsP.memoryLogpath = os.path.join(outdir, "memory_log.txt") if bedfile : varsP.bedFile = bedfile util.InitStatus( os.path.join(outdir, "status.xml") ) varsP.parseArguments() #parses optArgumentsFile varsP.checkDependencies() varsP.RefAlignerBinOrig = rabin varsP.prerunLog() #general information in log -- needed for refaligner_version if printargs : print "\nRunning SV detection with arguments ("+os.path.split(optargs)[1]+"):\n" + " ".join(varsP.argsListed('svdetect')) + '\n' noisep = {} if 
errbinfile : noisep = {"readparameters": errbinfile} print "Using noise parameters from "+errbinfile+"\n" elif errfile : noisep = scm.readNoiseParameters(errfile.replace(".err","")) if noisep.has_key('readparameters') : #remove this because it's redundant, and it can cause problems with RefAligner compatibility del noisep['readparameters'] if not noisep : #readNoiseParameters returns empty dict on failure print "ERROR reading noise parameters, check .err file:", errfile sys.exit(1) print "Using noise parameters from "+errfile+":\n" + " ".join(["-"+str(k)+" "+str(v) for k,v in noisep.iteritems()])+"\n" #make merged cmap to replace merged _q.cmap if not produced by RefAligner varsP.contigPathTxtFile = os.path.join(outdir, "contig_list.txt") #mergeIntoSingleCmap creates this file print "Creating merged cmap" varsP.mergeIntoSingleCmap(outdir) print "Merged cmap created:", varsP.latestMergedCmap, "\n" varsP.outputContigFolder = contigdir #cmaps are copied from here svmodule = svm.SVdetect(varsP, noisep, outdir, skipderes=True) #this got duplicated above #if hasattr(util, "InitStatus") : #if old version, skip -- do this after SVdetect.__init__ bc makes outdir # util.InitStatus(os.path.join(outdir, "status.xml")) #needed otherwise call to status_log fails svmodule.runJobs() svmodule.checkResults() util.SummarizeErrors(varsP) else : varsP.contigAlignTarget = contigdir #this is dir in which _q and _r cmaps must be located print "ERROR: feature not supported" #not implemented to not run jobs
def runAlignMol() : parser = argparse.ArgumentParser(description=description) parser.add_argument('-q', dest='queryDir', help='Path to merged cmap to align molecules (-b) to OR alignmol dir from Pipeline for merge (if latter, no alignments are performed), required', type=str) parser.add_argument('-b', dest='bnx', help='Input molecule (.bnx) file, required if aligning molecules', type=str) #parser.add_argument('-b', dest='bnx', help='Input molecule (.bnx) file OR path to dir containing split bnx pieces, required if aligning molecules', type=str) #I should add the split feature; for now, just do single bnx parser.add_argument('-a', dest='optArguments', help='Path to optArguments.xml (optional, default optArguments_human.xml in Pipeline dir if found, otherwise required)', default="", type=str) parser.add_argument('-r', help='If this flag is used, alignmolvref arguments are used, otherwise alignmol arguments are used (default alignmol; optional)', dest='ref', action='store_true') parser.add_argument('-o', dest='outputDir', help='output dir (optional, defaults to sub-dir of input map dir called "alignmol")', default="", type=str) parser.add_argument('-t', dest='RefAligner', help='Path to RefAligner or dir containing it (required)', type=str) parser.add_argument('-T', dest='numThreads', help='Total number of threads (cores) to use (optional, default 4)', default=4, type=int) parser.add_argument('-j', dest='maxthreads', help='Threads per Job, -maxthreads (non-cluster only;optional, default 4)', default=4, type=int) parser.add_argument('-e', dest='errFile', help='.err file to use for noise parameters--will supersede noise parameters in the optArgument supplied (but that file must still be supplied for non-noise parameters)--should be from autoNoise', default="", type=str) parser.add_argument('-E', dest='errbinFile', help='.errbin file to use for noise parameters--will supersede noise parameters in the optArgument supplied (but that file must still be supplied for non-noise 
parameters)--should be from autoNoise', default="", type=str) parser.add_argument('-p', dest='pipelineDir', help='Pipeline dir (optional, defaults to script dir, or current directory)', default="", type=str) parser.add_argument('-v', dest='pvalue', help='Alignment pvalue', default="1e-12") result = parser.parse_args() outprefix = "exp_refineFinal1" #this is the default; assume for now #check all Pipeline dependencies if result.pipelineDir : cwd = result.pipelineDir else : cwd = os.path.split(os.path.realpath(__file__))[0] #this is path of this script if not os.path.isfile(os.path.join(cwd,"utilities.py")) : #if still not here, last try is actual cwd cwd = os.getcwd() #still check this below #this is the only one imported here and in runCharacterize if not os.path.isfile(os.path.join(cwd,"utilities.py")): print "ERROR: utilities.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir" sys.exit(1) import utilities as util if not os.path.isfile(os.path.join(cwd,"AlignModule.py")): print "ERROR: AlignModule.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir" sys.exit(1) import AlignModule as alignmod if not util.checkFile(os.path.join(cwd,"Pipeline.py")): print "ERROR: Pipeline.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir" sys.exit(1) import Pipeline #input dir if not result.queryDir : print "ERROR: Query (-q) argument not supplied." sys.exit(1) qrypath = os.path.realpath(result.queryDir) if util.checkDir(qrypath, checkWritable=False, makeIfNotExist=False) : #output elsewhere so not writeable is ok runaligns = False elif util.checkCmap(qrypath) : runaligns = True else : print "ERROR: Query argument ("+qrypath+") not found or not a dir or cmap. Check -q argument." 
sys.exit(1) #this check isn't really necessary...make it a warning -- left over from runAlignMerge.py #if not os.path.split(qrypath)[1].endswith("alignmol") : # print "Warning: Query dir ("+qrypath+") does not end with 'alignmol'; please be sure this is a Pipeline alignmol dir\n" #RefAligner -- check for either path to RefAligner, or dir containing it, depending on cluster args rabin = "" #need empty string for generateJobList even though no jobs are run if runaligns : rabin = result.RefAligner #replicate Pipeline behavior: RefAligner is always required if os.path.isdir(rabin) : rabin = os.path.join(rabin, "RefAligner") if not util.checkExecutable(rabin): print "ERROR: RefAligner not found or not executable at", rabin, "\nPlease supply RefAligner dir or full path as -t arg." sys.exit(1) #optargs file optargs = None if runaligns and result.optArguments : #supplied on command line optargs = result.optArguments if not util.checkFile(optargs, ".xml") : print "optArguments path is supplied ("+optargs+") but not found or doesn't end in .xml, check -a argument." sys.exit(1) elif runaligns : #load from Pipeline dir if running alignments optargs = os.path.join(cwd,"optArguments_human.xml") if not util.checkFile(optargs): print "optArguments.xml missing in Pipeline directory ("+cwd+"). Try supplying path explicitly using -a." 
sys.exit(1) #output dir if not result.outputDir : outdir = os.path.join(qrypath, "merge") #should be same as in AlignModule else : outdir = os.path.realpath(result.outputDir) if os.path.isdir(outdir) : if not util.checkDir(outdir) : #check writeable print "\nERROR: Output dir is not writeable:\n", outdir, "\n" sys.exit(1) #this is ok here #elif outdir == contigdir : # print "\nERROR: Output dir cannot be same as input dir:\n", outdir, "\n" # sys.exit(1) print "\nWARNING: Output dir already exists, results will be overwritten:\n", outdir, "\n" elif not util.checkDir(outdir) : #does not exist, make, if False, can't make or not writeable print "\nERROR: Output dir cannot be created or is not writeable:\n", outdir, "\n" sys.exit(1) #bnx file bnxfile = result.bnx if bnxfile : #must check for empty string BEFORE you do realpath, or it returns cwd bnxfile = os.path.realpath(bnxfile) if not util.checkFile(bnxfile, ".bnx") : print "ERROR: bnx file supplied but not found or incorrect suffix:", bnxfile sys.exit(1) elif runaligns : print "ERROR: bnx file not supplied but running alignments; please supply bnx file as -b argument" sys.exit(1) #nthreads nthreads = result.numThreads if nthreads <= 0 : print "ERROR: Number of threads value invalid (must be > 0): %i" % nthreads sys.exit(1) #maxthreads maxthreads = result.maxthreads if maxthreads <= 0 : print "ERROR: Max threads value invalid (must be > 0): %i" % maxthreads sys.exit(1) elif nthreads < maxthreads : print "Warning: num threads (-T: %i) < max threads (-j: %i): increasing num threads to equal max threads\n" % (nthreads, maxthreads) nthreads = maxthreads #pvalue if result.pvalue : #supplied on command line pvalue = result.pvalue else : pvalue = "1e-12" #.errbin file errbinfile = result.errbinFile if errbinfile : errbinfile = os.path.realpath(result.errbinFile) if not util.checkFile(errbinfile, ".errbin") : print "ERROR: errbin file supplied but not found or incorrect suffix:", errbinfile sys.exit(1) #.err file errfile = 
result.errFile if errfile and errbinfile : print "Warning: .err and .errbin arguments supplied; ignoring .err file" errfile = "" elif errfile : errfile = os.path.realpath(result.errFile) if not util.checkFile(errfile, ".err") : print "err file supplied but not found or incorrect suffix:", errfile sys.exit(1) if errfile and not util.checkFile(os.path.join(cwd,"SampleCharModule.py")): print "SampleCharModule.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir" sys.exit(1) elif errfile : import SampleCharModule as scm doref = result.ref #DONE checking arguments print "Using output dir", outdir if runaligns : print "Aligning", bnxfile, "\nTo", qrypath, "\n" else : print "Merging", qrypath, "\n" startTime = time.time() #time since Epoch memory_log = os.path.join(outdir, "memory_log.txt") util.initMemoryLog(memory_log) varsP = Pipeline.varsPipeline() varsP.RefAlignerBin = rabin varsP.contigFolder = "" #not used but needs to be an attr varsP.outputContigFolder = "" #not used but needs to be a string attr varsP.pipeReportFile = os.path.join(outdir, "alignmol_jobs_log.txt") varsP.infoReportFile = os.path.join(outdir, "alignmol_log.txt") util.InitStatus( os.path.join(outdir, "status.xml") ) if runaligns : varsP.optArgumentsFileIn = optargs varsP.latestMergedCmap = qrypath #if !doref, need this one varsP.ref = qrypath #and if doref, need this one varsP.nThreads = nthreads #necessary otherwise job won't start -- max threads per node varsP.maxthreads = maxthreads #threads per job p = os.path.split(qrypath)[1] varsP.outputContigPrefix = p[:p.rfind(".")] #filename prefix varsP.stdoutlog = True #use -stdout -stderr varsP.memoryLogpath = os.path.join(outdir, "memory_log.txt") varsP.parseArguments() #parses optArgumentsFile varsP.replaceParam("alignmol", "-T", pvalue) varsP.checkDependencies() varsP.RefAlignerBinOrig = rabin varsP.prerunLog() #general information in log -- needed for refaligner_version noisep = {} if errbinfile : noisep = 
{"readparameters": errbinfile} #print "Using noise parameters from "+errbinfile+"\n" #move below elif errfile : noisep = scm.readNoiseParameters(errfile.replace(".err","")) if noisep.has_key('readparameters') : #remove this because it's redundant, and it can cause problems with RefAligner compatibility del noisep['readparameters'] if not noisep : #readNoiseParameters returns empty dict on failure print "ERROR reading noise parameters, check .err file:", errfile sys.exit(1) #redundant with below? print "Using noise parameters from "+errfile+":\n" + " ".join(["-"+str(k)+" "+str(v) for k,v in noisep.iteritems()])+"\n" #some code from SampleCharModule to load args into noise0 infoReport="Loaded noise parameters:\n" klist = ["FP", "FN", "sf", "sd", "sr", "bpp", "readparameters"] #hardcoding parameters is kind of bad, but it fixes the order without using OrderedDict. #noiseargs = self.varsP.argsListed('noise0') #not necessary for v in klist : if not noisep.has_key(v) : continue param=str(noisep[v]) util.LogStatus("parameter", "auto_"+v, param) infoReport+=v+":"+param+"\n" varsP.replaceParam("noise0", "-"+v, param) varsP.updateInfoReport(infoReport + '\n', printalso=True) else : print "Getting file list from", qrypath outFileList = getOutFileList(util, qrypath) if not outFileList : print "ERROR: Query dir ("+qrypath+") does not contain alignmol data. Check -q argument." 
sys.exit(1) else : print "Found", len(outFileList), "alignment results" #end if runaligns amod = alignmod.AlignModule(varsP, doref, outdir, bnxfile) #constructor will call generateJobList if runaligns : amod.runJobs() amod.checkResults() else : amod.outFileList = outFileList p = os.path.split(outFileList[0])[1] if p.count("_") > 1 : #expect something like "EXP_REFINEFINAL1_4" #p = p[:p.rfind("_")+1] #remove integer suffix p = p[:p.rfind("_")] #remove integer suffix (and underscore) #else : # p += "_" #because mrgstr is appended varsP.outputContigPrefix = p if not runaligns or len(amod.jobList) > 0 : amod.getAlignStats() if runaligns : print #copy from Pipeline.py if util.SummarizeErrors(varsP=varsP)==0: varsP.updatePipeReport("Pipeline has successfully completed\n") util.LogStatus("progress", "pipeline", "success") else: varsP.updatePipeReport("Pipeline has completed with errors\n") util.LogStatus("progress", "pipeline", "failure") #BELOW OLD CODE return #in Pipeline, this is called first #print "Calling getAlignStats:" #but it won't work without varsP atm; skip it #getAlignStats(self.varsP, self.outFileList, self.varsP.totAssemblyLenMb, isref=False, mergepath=self.mergedir) #getAlignStats(self.varsP, self.outFileList, self.varsP.totAssemblyLenMb, isref=False, mergepath=self.mergedir) print "Calling mergeMap" print outFileList[0] #, "\n", outputdir #moved above util.logMemory(memory_log, startTime, "mergeMap_start") #mergeMap(self.varsP, self.outFileList, mergepath=self.outputdir) #varsP is optional alignmod.mergeMap(None, outFileList, outputdir) util.logMemory(memory_log, startTime, "mergeMap_end") print "Calling mergeRcmaps" util.logMemory(memory_log, startTime, "mergeRcmaps_start") #mergeRcmaps(outFileList, outdir, varsP=None, splitByContig=None, stageName="alignmol") : alignmod.mergeRcmaps(outFileList, outputdir, splitByContig=True, stageName=outprefix) util.logMemory(memory_log, startTime, "mergeRcmaps_end") print "Calling split_XMap_byContig" 
#split_XMapQcmap_byContig" util.logMemory(memory_log, startTime, "split_XMap_byContig_start") #xmapdict = alignmod.split_XMap_byContig(outFileList, outputdir, stageName=outprefix) #old xmapdict = alignmod.split_XMap_byContig_new(outFileList, outputdir, stageName=outprefix) util.logMemory(memory_log, startTime, "split_XMap_byContig_end") print "Calling split_Qcmap_byContig" util.logMemory(memory_log, startTime, "split_Qcmap_byContig_start") #alignmod.split_Qcmap_byContig(outFileList, outputdir, xmapdict) #old alignmod.split_Qcmap_byContig_new(outFileList, outputdir, xmapdict, stageName=outprefix) #new: better performance util.logMemory(memory_log, startTime, "split_Qcmap_byContig_end") print "AlignMerge successfully completed"
def __init__(self, varsP) : """splitBNX.__init__: this class is for sorting the input bnx for subsequent splitting by the splitBNX class, and eventually easier processing with the Pairwise class. The constructor (this) will call varsP.runJobs and doAllPipeReport, then instantiate splitBNX, which will do all the splitting required for the Pairwise class. """ self.stageName = "Autonoise0" self.varsP = varsP #fewer code modifications below util.LogStatus("progress", "stage_start", self.stageName) #after above bc check if bypass (executeCurrentStage) self.output_folder = os.path.join(self.varsP.contigFolder, "auto_noise") if not util.checkDir(self.output_folder) : #will make if not exist, only returns False if already exists or can't make print "ERROR in autoNoise: bad dir:", self.output_folder raise RuntimeError # We use assembly section here because the memory usage is higher than pairwise, while the jobs are quite short. #sortJobSet=mthread.jobWrapper(self.varsP,jobName,clusterArgs=self.varsP.getClusterArgs('assembly')) super(autoNoise, self).__init__(self.varsP, self.stageName, clusterArgs=self.varsP.getClusterArgs("assembly")) bnxfile = self.varsP.bnxFile if varsP.noiseOnly else self.varsP.sorted_file+".bnx" #was return if generateJobListChar, but need to get readparameters if bypass if not self.generateJobListChar({}, bnxfile, "autoNoise0") : #return 0 for success, 1 for skip self.varsP.runJobs(self, "AutoNoise0") self.doAllPipeReport() if not self.allResultsFound() : self.varsP.updatePipeReport("ERROR: AutoNoise0 failed. 
Check: "+self.output_file+".stdout\n") raise RuntimeError util.LogStatus("progress", "stage_complete", self.stageName) self.varsP.noise0 = readNoiseParameters(self.output_file) self.isBadErrorParams(self.varsP.noise0, 0) self.stageName = "Autonoise1" self.groupName = self.stageName #fix so that LogStatus call in MultiThreading.multiThreadRunJobs util.LogStatus("progress", "stage_start", self.stageName) self.clearJobs() self.varsP.replaceParam("noise0", "-readparameters", self.output_file+".errbin") #need to call again to set self.output_file if not self.generateJobListChar(self.varsP.noise0, bnxfile, "autoNoise1") : #return 0 for success, 1 for skip self.varsP.runJobs(self, "AutoNoise1") self.doAllPipeReport() if not self.allResultsFound() : self.varsP.updatePipeReport("ERROR: AutoNoise1 failed. Check: "+self.output_file+".stdout\n") raise RuntimeError self.varsP.noise1 = readNoiseParameters(self.output_file) infoReport="Automatically determined noise parameters:\n" klist = ["FP", "FN", "sf", "sd", "sr", "bpp", "readparameters"] #hardcoding parameters is kind of bad, but it fixes the order without using OrderedDict. 
for v in klist : if not self.varsP.noise1.has_key(v) : continue param=str(self.varsP.noise1[v]) util.LogStatus("parameter", "auto_"+v, param) infoReport+=v+":"+param+"\n" self.varsP.replaceParam("noise0", "-"+v, param) self.varsP.updateInfoReport(infoReport + '\n') self.isBadErrorParams(self.varsP.noise1, 1) if self.varsP.doScanScale : #change the sorted_file to the rescaled bnx file rescaledbnx = self.output_file + self.varsP.rescaleSuffix #no ".bnx" in suffix if not util.checkFile(rescaledbnx+".bnx") : #not found--not an error if bnx 0.1 is used err = "Warning: scan scaled bnx not found after autoNoise1; not performing scan scaling--check that bnx 1.0 or later used in input" self.varsP.updatePipeReport( err+"\n\n" ) util.LogError("warning", err) self.varsP.doScanScale = False else : #log that scan scaling is used self.varsP.updatePipeReport( "Using scan scaled bnx: "+rescaledbnx+".bnx\n\n" ) util.LogStatus("parameter", "scanscaled_bnx", rescaledbnx+".bnx") self.varsP.sorted_file = rescaledbnx #this variable is used in splitBNX (PairwiseModule.py) util.LogStatus("progress", "stage_complete", self.stageName)
def runCharacterize(cwd, rabin, refcmap, contigdir, contigbase, runaligns, xmappath, optargs, nthreads): '''Load Pipeline files from first arg; configure CharacterizeModule; run alignments if runaligns; report on those alignments or the xmap provided as xmappath. ''' printargs = True if not os.path.isfile(os.path.join(cwd,"utilities.py")): print "utilities.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir" sys.exit(1) import utilities as util if not util.checkFile(os.path.join(cwd,"Pipeline.py")): print "Pipeline.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir" sys.exit(1) import Pipeline if not util.checkFile(os.path.join(cwd,"CharacterizeModule.py")): print "CharacterizeModule.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir" sys.exit(1) import CharacterizeModule as cm #if not util.checkFile(os.path.join(cwd,"MapClassesRev.py")): # print "MapClassesRev.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir" # sys.exit(1) #import MapClassesRev #use Pipeline objects varsP = Pipeline.varsPipeline() varsP.optArgumentsFileIn = optargs varsP.RefAlignerBin = rabin varsP.latestMergedCmap = os.path.join(contigdir, contigbase+".cmap") #file suffix required to be .cmap varsP.contigFolder = contigdir varsP.nThreads = nthreads #necessary otherwise job won't start varsP.ref = refcmap varsP.stdoutlog = True #enable -stdout -stderr args to RefAligner varsP.curCharacterizeCmaps = [varsP.latestMergedCmap] if runaligns : varsP.contigAlignTarget = contigdir+"/alignref" #this is output dir varsP.runSV = False varsP.groupContigs = False varsP.stageComplete = contigbase varsP.outputContigFolder = contigdir varsP.memoryLogpath = os.path.join(contigdir, "memory_log.txt") varsP.pipeReportFile = os.path.join(contigdir, "pipeReport.txt") varsP.parseArguments() #parses optArgumentsFile if printargs : print "\nRunning Characterization with arguments:\n" + " 
".join(varsP.argsListed('characterizeDefault')) + '\n' if hasattr(util, "InitStatus") : #if old version, skip util.InitStatus(os.path.join(contigdir, "status.xml")) #needed otherwise call to status_log fails charmod = cm.Characterize(varsP) #create Characterize object from CharacterizeModule -- this also calls generateJobList xmappath = charmod.xmapTarget #set in Characterize.generateJobList charmod.runJobs() else : #varsP.contigAlignTarget = contigdir #this is dir in which _q and _r cmaps must be located -- contigdir is from cmap; this should be from xmap varsP.contigAlignTarget = os.path.split(xmappath)[0] print "Loading alignments from\n" + xmappath + "\n" #no longer using this in Pipeline #print MapClassesRev.TopLevelCharacterization(varsP, [os.path.join(varsP.contigAlignTarget, contigbase)]) print cm.characterizeContigs(varsP, xmappath)
def runCharacterize(cwd, rabin, refcmap, contigdir, contigbase, runaligns, xmappath, optargs, nthreads, pvalue): '''Load Pipeline files from first arg; configure CharacterizeModule; run alignments if runaligns; report on those alignments or the xmap provided as xmappath. ''' printargs = True if not os.path.isfile(os.path.join(cwd, "utilities.py")): print "utilities.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir" sys.exit(1) import utilities as util if not util.checkFile(os.path.join(cwd, "Pipeline.py")): print "Pipeline.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir" sys.exit(1) import Pipeline if not util.checkFile(os.path.join(cwd, "CharacterizeModule.py")): print "CharacterizeModule.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir" sys.exit(1) import CharacterizeModule as cm if not util.checkFile(os.path.join(cwd, "MapClassesRev.py")): print "MapClassesRev.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir" sys.exit(1) import MapClassesRev #use Pipeline objects varsP = Pipeline.varsPipeline() varsP.optArgumentsFileIn = optargs varsP.RefAlignerBin = rabin varsP.latestMergedCmap = os.path.join( contigdir, contigbase + ".cmap") #file suffix required to be .cmap varsP.contigFolder = contigdir varsP.nThreads = nthreads #necessary otherwise job won't start varsP.ref = refcmap varsP.stdoutlog = True #enable -stdout -stderr args to RefAligner varsP.curCharacterizeCmaps = [varsP.latestMergedCmap] if runaligns: varsP.contigAlignTarget = contigdir + "/alignref_final" #this is output dir varsP.runSV = False varsP.groupContigs = False varsP.stageComplete = contigbase varsP.outputContigFolder = contigdir varsP.memoryLogpath = os.path.join(contigdir, "memory_log.txt") varsP.stdoutlog = True varsP.pipeReportFile = os.path.join(contigdir, "pipeReport.txt") varsP.parseArguments() #parses optArgumentsFile varsP.replaceParam("characterizeFinal", "-T", 
pvalue) if printargs: print "\nRunning Characterization with arguments:\n" + " ".join( varsP.argsListed('characterizeFinal')) + '\n' if hasattr(util, "InitStatus"): #if old version, skip util.InitStatus(os.path.join( contigdir, "status.xml")) #needed otherwise call to status_log fails charmod = cm.Characterize( varsP, 1 ) #create Characterize object from CharacterizeModule -- this also calls generateJobList xmappath = charmod.xmapTarget #set in Characterize.generateJobList charmod.runJobs() else: #varsP.contigAlignTarget = contigdir #this is dir in which _q and _r cmaps must be located -- contigdir is from cmap; this should be from xmap varsP.contigAlignTarget = os.path.split(xmappath)[0] print "Loading alignments from\n" + xmappath + "\n" #no longer using this in Pipeline #print MapClassesRev.TopLevelCharacterization(varsP, [os.path.join(varsP.contigAlignTarget, contigbase)]) print cm.characterizeContigs(varsP, xmappath) #this is redundant with above