def joinBnxFiles(varsP, bnxFiles):
    """After image processing, merge results into all.bnx."""
    #the old way was to use this fn, which simply copies lines;
    # while this is fine most of the time, RefAligner is more sophisticated,
    # so it should be more robust to use RefAligner
    #molecule.joinBnxFiles(bnxFiles, self.bnxFile)

    #this used to be called writeIntermediate
    varsP.writeListToFile(bnxFiles, varsP.bnxTarget)

    # args, jobName, expectedResultFile, uniqueString
    args = [varsP.RefAlignerBin, "-if", varsP.bnxTarget, "-merge", "-bnx",
            "-o", varsP.bnxFile.replace(".bnx", ""), "-f"]
    if varsP.stdoutlog:
        args.extend(['-stdout', '-stderr'])
    #print "joinBnxFiles: args:", args
    jobwrapper = mthread.jobWrapper(varsP, "joinBnxFiles")
    jobwrapper.addJob(mthread.singleJob(args, "joinBnxFiles", varsP.bnxFile, "joinBnxFiles"))
    jobwrapper.multiThreadRunJobs(1)
    jobwrapper.doAllPipeReport()
    success = jobwrapper.allResultsFound()
    if not success:
        varsP.updatePipeReport("ERROR in performImageAnalysis: joinBnxFiles failed. Check: " + varsP.bnxTarget + "\n")
    #this is just putting the path of bnxFile in bnxTarget;
    # on second thought, if I don't do this, then SampleCharModule will run on each bnx individually
    #if success :
    #    varsP.writeListToFile([varsP.bnxFile], varsP.bnxTarget)

    #the sense of the return of allResultsFound is opposite that of performImageAnalysis:
    # allResultsFound is True if all jobs succeed, False if any job fails;
    # performImageAnalysis returns 1 for failure
    return not success
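# For reference, the list-file + "-if" pattern above in isolation: write one bnx
# path per line, then hand the list to RefAligner with -merge. A minimal
# standalone sketch only -- the direct subprocess call and the helper name are
# illustrative assumptions, not the pipeline's mthread wrappers:
import subprocess

def merge_bnx_sketch(refaligner, bnx_paths, list_file, out_prefix):
    with open(list_file, "w") as f:
        f.write("\n".join(bnx_paths) + "\n")  #one bnx path per line, read via -if
    args = [refaligner, "-if", list_file, "-merge", "-bnx", "-o", out_prefix, "-f"]
    return subprocess.call(args) == 0  #True on success, mirroring allResultsFound

# merge_bnx_sketch("RefAligner", ["scan1.bnx", "scan2.bnx"], "all_bnx.list", "all")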
def generateJobList(self):
    """splitBNX.generateJobList: submit varsP.nPairwiseJobs number of split bnx jobs.
    """
    sorted_file = self.varsP.sorted_file
    if not util.checkFile(sorted_file + ".bnx"):
        err = "ERROR: splitBNX input file (%s) not found; exiting" % self.varsP.sorted_file
        self.varsP.updatePipeReport(err + "\n")
        util.LogError("critical", err)
        util.LogStatus("progress", "pipeline", "failure")
        raise RuntimeError

    N = calculateNPairwise(self.varsP, sorted_file)  #move back here (not sortBNX) bc needs to use sorted bnx
    #N = self.varsP.nPairwiseJobs

    self.varsP.updatePipeReport('Splitting BNX\n')
    #splitJobSet=mthread.jobWrapper(self.varsP,jobName,clusterArgs=self.varsP.getClusterArgs('splitting'))
    super(splitBNX, self).__init__(self.varsP, self.stageName, clusterArgs=self.varsP.getClusterArgs('splitting'))

    #should skip the rest and return 1, like in sortBNX, here:
    if not self.varsP.executeCurrentStage:
        return 1  #tell self.__init__ not to continue processing

    self.varsP.updatePipeReport("Splitting" + (" scan-scaled" if self.varsP.doScanScale else "") +
                                " bnx file: %s.bnx\n\n" % self.varsP.sorted_file)

    #calculate threads per job: used to be fixed at 1, now file size / 1.5 GB rounded up. This was too low, add 1.
    threads = max(1, int(math.ceil(os.path.getsize(sorted_file + ".bnx") / 1.5e9))) + 1
    if threads > 1:
        self.varsP.updatePipeReport("Using %i threads per job\n" % threads)

    #the change in job partitioning breaks backward compatibility and was causing
    # too many problems; make it conditional on refaligner version
    #this is now obsolete: assume binaries are up-to-date
    if False:  #self.varsP.refaligner_version < 3995 :
        for partial in range(1, N + 1):
            output_file = self.varsP.bnxFile.replace(".bnx", "_%s_of_%s" % (partial, self.varsP.nPairwiseJobs))
            cargs = [self.varsP.RefAlignerBin, '-f', '-i', sorted_file + ".bnx",
                     "-maxthreads", str(threads), "-merge", "-subsetbin", str(partial), str(N),
                     "-bnx", "-o", output_file]
            if self.varsP.stdoutlog:
                cargs.extend(['-stdout', '-stderr'])
            #print('%d/%d' % (partial, N), cargs)
            expectedResultFile = output_file + ".bnx"
            self.addJob(mthread.singleJob(cargs, self.stageName + str(partial), expectedResultFile,
                                          self.stageName + str(partial), maxThreads=threads,
                                          clusterLogDir=self.varsP.clusterLogDir,
                                          expectedStdoutFile=output_file + ".stdout"))
    else:  #change above to single command with -subsetbin 0 N
        output_file = self.varsP.bnxFile.replace(".bnx", "")
        cargs = [self.varsP.RefAlignerBin, '-f', '-i', sorted_file + ".bnx",
                 "-maxthreads", str(threads), "-merge", "-subsetbin", "0", str(N),
                 "-bnx", "-o", output_file]
        if self.varsP.stdoutlog:
            cargs.extend(['-stdout', '-stderr'])
        self.addJob(mthread.singleJob(cargs, self.stageName, output_file + ".bnx", self.stageName,
                                      maxThreads=threads, clusterLogDir=self.varsP.clusterLogDir,
                                      expectedStdoutFile=output_file + ".stdout"))
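# The per-job thread heuristic from the comment above (file size / 1.5 GB,
# rounded up, plus 1), pulled out as a standalone calculation; the helper name
# and the example sizes are hypothetical:
import math

def threads_for_split(bnx_bytes, bytes_per_thread=1.5e9):
    return max(1, int(math.ceil(bnx_bytes / bytes_per_thread))) + 1

# threads_for_split(0.5e9) == 2  (ceil(0.33) = 1, then +1)
# threads_for_split(4.0e9) == 4  (ceil(2.67) = 3, then +1)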
def getDetectJobs(self, contingentJob=None):
    self.molFile = self.localTiff + '.mol'
    #if the molFile is already there, the image processing is already done; do not repeat
    if os.path.exists(self.molFile):
        return []
    #print "remoteTiff:", self.remoteTiff #debug
    #print "localTiff:", self.localTiff, os.path.exists(self.localTiff) #debug
    #print "expectedOverlap", self.curExp.ExpectedOverlap, "minOverlap", minOverlap #debug

    #there was an issue with self.curExp.ExpectedOverlap being incorrectly computed
    # due to a bad value in the xml (see manageTargetLocation.py); hopefully this will work
    if self.curExp.ExpectedOverlap > 100:
        oldoverlap = self.curExp.ExpectedOverlap
        self.curExp.ExpectedOverlap = 15
        #print "Warning: calculated expectedOverlap", oldoverlap, "too large; defaulting to", self.curExp.ExpectedOverlap
        self.varsP.updateInfoReport("Warning: " + self.nameStr() +
                                    ": calculated expectedOverlap %i too large; defaulting to %i\n" %
                                    (oldoverlap, self.curExp.ExpectedOverlap), printalso=True)

    expolap = (self.curExp.ExpectedOverlap - 10 if self.curExp.ExpectedOverlap >= 10 else 0)
    minOverlap = '%d' % expolap                             # ExpectedOverlap - 10
    maxOverlap = '%d' % (self.curExp.ExpectedOverlap + 10)  # ExpectedOverlap + 10
    dmOverlapArgs = ['-o', minOverlap, '-O', maxOverlap]

    dmArgs = self.varsP.argsListed('imgDetection')
    dmArgs = util.argumentReplaceList(dmArgs, ['-x', str(self.curExp.ScanColumnCount)])
    dmArgs = util.argumentReplaceList(dmArgs, ['-y', str(self.curExp.ScanRowCount)])
    dmArgs = util.argumentReplaceList(dmArgs, ['-p', str(self.curExp.Pitch)])
    nchan = (self.curExp.nColors - 1 if self.curExp.nColors >= 2 else 1)  #must be at least 1
    colorArgs = ['-n', str(nchan)]

    sJobCpName = 'cp ' + shorten(self.remoteTiff) + ' to ' + shorten(self.localTiff)
    #print "cp\n"+self.remoteTiff, "\n"+self.localTiff #debug
    sJobCp = mthread.singleJob(['cp', self.remoteTiff, self.localTiff], sJobCpName,
                               self.localTiff, 'cpTiff', throttleClass=True)
    if contingentJob:
        sJobCp.addContingentJob(contingentJob)

    sJobDMName = 'Detect ' + shorten(self.localTiff)
    curArgs = [self.varsP.DMstaticBin] + dmOverlapArgs + dmArgs + colorArgs + [self.localTiff]
    argumentString = " ".join(curArgs) + '\n'
    print " ".join(curArgs)  #debug
    sJobDM = mthread.singleJob(curArgs, sJobDMName, self.molFile, 'Detect')
    sJobDM.addContingentJob(sJobCp)
    sJobDM.bpp = self.curExp.basesPerPixel
    sJobDM.molTag = self.molTag
    sJobDM.numLabelChannels = self.numLabelChannels
    #inputMoleculesReport += ' ' + self.molTag + ' ' + self.remoteTiff + '\n'

    dorm = True  #default True (False for debug)
    joblist = [sJobCp, sJobDM]
    if dorm:
        sJobRmImgName = 'Detect Complete, rm ' + shorten(self.localTiff)
        sJobRmImg = mthread.singleJob(['rm', self.localTiff], sJobRmImgName, '', 'rmFile')
        sJobRmImg.addContingentJob(sJobDM)
        joblist.append(sJobRmImg)
    return joblist
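# The overlap band computed above, in isolation: [expected-10, expected+10],
# with the lower bound clamped at 0 and implausible xml-derived values (>100)
# reset to 15 first. Sketch only; the helper name is hypothetical:
def overlap_window(expected):
    if expected > 100:  #bad value in the xml; default as in getDetectJobs
        expected = 15
    lo = expected - 10 if expected >= 10 else 0
    return lo, expected + 10

# overlap_window(15)  == (5, 25)
# overlap_window(250) == (5, 25)  -- reset to 15 first
# overlap_window(4)   == (0, 14)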
def generateJobListLinear(self):
    """Pairwise.generateJobListLinear: This method is the old way of doing
    pairwise comparison of all molecules. It uses the -partial option to
    RefAligner. This option is _incompatible_ with the various hashing options
    to RefAligner.
    """
    baseArgs = self.varsP.argsListed('noise0') + self.varsP.argsListed('pairwise')
    ct = 0
    outputTarget = os.path.join(self.varsP.alignFolder, 'exp')
    cArgs = [self.varsP.RefAlignerBin, '-i', self.varsP.bnxFile]
    for jobNum in range(1, self.varsP.nPairwiseJobs + 1):
        jobName = 'Pairwise %d of %d' % (jobNum, self.varsP.nPairwiseJobs)
        outputString = 'pairwise%dof%d' % (jobNum, self.varsP.nPairwiseJobs)
        expectedResultFile = outputTarget + outputString + '.align'
        partialArgs = ['-partial', str(jobNum), str(self.varsP.nPairwiseJobs)]
        currentArgs = cArgs + baseArgs + ['-o', outputTarget + outputString]
        if self.varsP.stdoutlog:
            currentArgs.extend(['-stdout', '-stderr'])
        if self.varsP.nPairwiseJobs > 1:
            currentArgs += partialArgs
        currentArgs += ['-maxthreads', str(self.varsP.maxthreads)]
        if self.varsP.bnxStatsFile != None:
            currentArgs += ['-XmapStatRead', self.varsP.bnxStatsFile]
        sJob = mthread.singleJob(currentArgs, jobName, expectedResultFile, outputString,
                                 maxThreads=self.varsP.maxthreads,
                                 clusterLogDir=self.varsP.clusterLogDir,
                                 expectedStdoutFile=outputTarget + outputString + ".stdout")
        ct += 1
        self.addJob(sJob)
    self.logArguments()
def generateJobList(self):
    """Defines job parameters for merge. Updates variables for subsequent
    completion test in mergeComplete().
    """
    self.clearJobs()
    self.prevPrefix = self.varsP.inputContigPrefix
    #self.curPrefix = self.prefixIter.next()
    self.curPrefix = self.stagePrefix + self.alphabet[self.iterCount]
    self.groupName = self.stageName + self.alphabet[self.iterCount]  #jobWrapper data member
    utilities.LogStatus("progress", "stage_start", self.groupName)
    self.varsP.updatePipeReport(' PREV PREFIX %s, CUR PREFIX %s' % (self.prevPrefix, self.curPrefix))
    self.iterCount += 1
    outputString = os.path.join(self.varsP.outputContigFolder, self.curPrefix)
    currentArgs = [self.varsP.RefAlignerBin, '-o', outputString]
    #if self.varsP.stdoutlog : #always use this here bc it's the only output which should always be there
    currentArgs.extend(['-f', '-stdout', '-stderr'])
    currentArgs += self.varsP.argsListed('merge')
    currentArgs += ['-maxthreads', str(self.varsP.nThreads)]
    contigsTextFile = os.path.join(self.varsP.inputContigFolder, 'mergeContigs.txt')
    #findContigs creates the mergeContigs.txt file which is necessary for this job
    contigFiles, contigIDs = self.varsP.findContigs(self.varsP.inputContigFolder, self.prevPrefix,
                                                    txtOutput=contigsTextFile)
    self.varsP.prefixUsed.append(self.curPrefix)
    fileArgs = ['-if', contigsTextFile]
    #expoutput = outputString+".align" #don't know which contigs will disappear, but should always
    # get an align file -- with new arg 'pairmergeRepeat', there's no .align; use stdout
    expoutput = outputString + ".stdout"
    s1Job = mthread.singleJob(currentArgs + fileArgs, self.groupName, expoutput, self.groupName,
                              maxThreads=self.varsP.nThreads,
                              clusterLogDir=self.varsP.clusterLogDir,
                              expectedStdoutFile=outputString + ".stdout")
    self.addJob(s1Job)
    self.logArguments()
def generateJobList(self):
    baseArgs1 = self.varsP.argsListed(self.refineStage)
    if self.refineStage != 'refineNGS':  #noise args are in refineNGS
        baseArgs1 += self.varsP.argsListed('noise0')
    contigFiles, contigIDs = self.varsP.findContigs(self.varsP.inputContigFolder, self.varsP.inputContigPrefix)
    #nJobs = len(contigFiles)
    bnx = self.varsP.sorted_file + ".bnx"  #was self.varsP.bnxFile, but need sorted bc ids are different after sorting
    if self.refineStage == 'refineA':  #refineA uses Assembler, all others use RefAligner
        r1args = [self.varsP.AssemblerBin, '-i', bnx]  #need this before -contigs
        r1args += ['-contigs', os.path.join(self.varsP.inputContigFolder, self.varsP.inputContigPrefix) + '.contigs']
    else:  #should be same for refineB/NGS/Final
        r1args = [self.varsP.RefAlignerBin, '-i', bnx]
    self.writeIDFile(len(contigFiles))  #nJobs)
    output1String = os.path.join(self.varsP.outputContigFolder, self.varsP.outputContigPrefix)
    for contigID in contigIDs:
        expectedOutputString = self.varsP.outputContigPrefix + '_contig' + contigID
        expectedResultFile = os.path.join(self.varsP.outputContigFolder, expectedOutputString + '.cmap')  #refineB
        jobName = self.refineStage + ' %5s' % contigID
        if self.refineStage == 'refineA':
            currentArgs = 2 * [str(contigID)]  #this must come after r1args because it's actually an argument to -contigs
        else:  #should be same for refineB/NGS/Final
            r1_cmapFile = self.varsP.inputContigPrefix + '_contig' + str(contigID) + '.cmap'
            r1_cmapFile = os.path.join(self.varsP.inputContigFolder, r1_cmapFile)
            currentArgs = ['-maxthreads', str(self.varsP.maxthreads), '-ref', r1_cmapFile, '-id', contigID]
        currentArgs = r1args + currentArgs + baseArgs1 + ['-o', output1String]
        if self.varsP.stdoutlog:
            currentArgs.extend(['-stdout', '-stderr'])
        s1Job = mthread.singleJob(currentArgs, jobName, expectedResultFile, expectedOutputString,
                                  maxThreads=self.varsP.maxthreads,
                                  clusterLogDir=self.varsP.clusterLogDir)
        self.addJob(s1Job)
    self.logArguments()
def generateJobList(self):
    contigFiles, contigIDs = self.varsP.findContigs(self.varsP.inputContigFolder, self.varsP.inputContigPrefix)
    curargs = [self.varsP.RefAlignerBin, '-i', self.varsP.sorted_file + ".bnx"]  #was bnxFile
    baseArgs = self.varsP.argsListed('noise0') + self.varsP.argsListed('extension')
    nJobs = len(contigFiles)
    ct = 0
    logArguments = ""  #just in case the following loop isn't entered
    for jobNum in range(1, nJobs + 1):
        contigID = contigIDs[jobNum - 1]
        #jobName = 'Extend ' + contigID + ', Job ' + str(jobNum) + ' of ' + str(nJobs)
        expContigString = self.varsP.outputContigPrefix + '_contig' + contigID
        outputString = os.path.join(self.varsP.outputContigFolder, self.varsP.outputContigPrefix)
        expectedResultFile = os.path.join(self.varsP.outputContigFolder, expContigString + '.cmap')  # '_refined.cmap')
        jobName = 'Ext %s' % expContigString  # + ', Job ' + str(jobNum) + ' of ' + str(nJobs)
        currentContig = contigFiles[jobNum - 1]
        currentArgs = curargs + baseArgs
        currentArgs += ['-maxthreads', str(self.varsP.maxthreads), '-o', outputString,
                        '-id', contigID, '-ref', currentContig]
        if self.varsP.stdoutlog:
            currentArgs.extend(['-stdout', '-stderr'])
        s1Job = mthread.singleJob(currentArgs, jobName, expectedResultFile, expContigString,
                                  maxThreads=self.varsP.maxthreads, forceForward=currentContig,
                                  clusterLogDir=self.varsP.clusterLogDir)
        self.addJob(s1Job)
        ct += 1
    self.logArguments()
def generateJobListSubsample(self):
    if not self.varsP.executeCurrentStage:
        return 1  #tell self.__init__ not to continue processing
    sorted_file = self.varsP.sorted_file
    nmols = 1000
    seed = 1
    self.subsampled = sorted_file + "_subsampled"
    self.varsP.updatePipeReport('Subsampling %s\n' % sorted_file)
    jobName = "SubsamplingBNX"
    expectedResultFile = self.subsampled + ".bnx"
    # We use the assembly cluster section here because the memory usage is higher
    # than pairwise, while the jobs are quite short.
    #sortJobSet=mthread.jobWrapper(self.varsP,jobName,clusterArgs=self.varsP.getClusterArgs('assembly'))
    super(autoNoise, self).__init__(self.varsP, jobName, clusterArgs=self.varsP.getClusterArgs("assembly"))
    cargs = [self.varsP.RefAlignerBin, '-f', '-i', sorted_file + ".bnx",
             "-maxthreads", str(self.varsP.maxthreads), "-merge", "-minlen", "200",
             "-randomize", str(seed), "-subset", "1", str(nmols),
             "-bnx", "-o", self.subsampled] + self.varsP.argsListed('bnx_sort')
    if self.varsP.bnxStatsFile != None:
        cargs += ['-XmapStatRead', self.varsP.bnxStatsFile]
    if self.varsP.stdoutlog:
        cargs.extend(['-stdout', '-stderr'])
    self.addJob(mthread.singleJob(cargs, jobName, expectedResultFile, jobName,
                                  clusterLogDir=self.varsP.clusterLogDir,
                                  expectedStdoutFile=self.subsampled + ".stdout"))
    return 0  #success
def generateJobListChar(self, noise_in, input_file, optSection):
    if not self.varsP.executeCurrentStage:
        return 1  #tell self.__init__ not to continue processing
    self.varsP.updatePipeReport('%s\n' % optSection)
    self.output_folder = os.path.join(self.varsP.contigFolder, "auto_noise")
    if not util.checkDir(self.output_folder):  #will make if not exist, only returns False if already exists or can't make
        print "ERROR in autoNoise.varsPipeline.prepareContigIO: bad dir:", self.output_folder
    self.output_file = os.path.join(self.output_folder, optSection)
    expectedResultFile = self.output_file + ".err"
    # We use the assembly cluster section here because the memory usage is higher
    # than pairwise, while the jobs are quite short.
    #sortJobSet=mthread.jobWrapper(self.varsP,jobName,clusterArgs=self.varsP.getClusterArgs('assembly'))
    super(autoNoise, self).__init__(self.varsP, self.stageName, clusterArgs=self.varsP.getClusterArgs("assembly"))
    #cargs=[self.varsP.RefAlignerBin, '-f', '-i', input_file, "-ref", self.varsP.ref, "-maxthreads", str(self.varsP.maxthreads), "-o", self.output_file]
    #remove maxthreads bc this is always running on its own
    cargs = [self.varsP.RefAlignerBin, '-f', '-i', input_file, "-ref", self.varsP.ref, "-o", self.output_file]
    if self.varsP.stdoutlog:
        cargs.extend(['-stdout', '-stderr'])
    for v in noise_in.keys():
        cargs.extend(["-" + v, str(noise_in[v])])
    cargs.extend(self.varsP.argsListed(optSection))
    if self.varsP.bnxStatsFile != None:
        cargs += ['-XmapStatWrite', self.varsP.bnxStatsFile]
    self.addJob(mthread.singleJob(cargs, self.stageName, expectedResultFile, self.stageName,
                                  clusterLogDir=self.varsP.clusterLogDir,
                                  expectedStdoutFile=self.output_file + ".stdout"))
    return 0  #success
def generateJobList(self):
    """Instantiate job wrapper class with queue of single jobs for assembly.
    """
    if self.varsP.pairwiseTriangleMode:
        AssemblerInputFlag = "-if"
        AssemblerInputFile = self.varsP.bnxFileList
    else:
        AssemblerInputFlag = "-i"
        AssemblerInputFile = self.varsP.bnxFile
    cargs = [self.varsP.AssemblerBin, AssemblerInputFlag, AssemblerInputFile, '-af', self.varsP.alignTarget]
    if self.varsP.bnxStatsFile != None:
        cargs += ['-XmapStatRead', self.varsP.bnxStatsFile]
    baseArgs = self.varsP.argsListed('noise0') + self.varsP.argsListed('assembly')
    logFile = os.path.join(self.varsP.localRoot, 'AssemblyLog.txt')
    errFile = os.path.join(self.varsP.localRoot, 'AssemblyLog_stderr.txt')
    outFile = os.path.join(self.varsP.outputContigFolder, self.varsP.outputContigPrefix)  #no suffix for -o arg of Assembler
    self.contigsFile = os.path.join(self.varsP.outputContigFolder, self.varsP.outputContigPrefix + ".contigs")  #Assembler will append this suffix
    currentArgs = cargs + baseArgs + ['-o', outFile]
    if self.varsP.stdoutlog:
        currentArgs.extend(['-stdout', '-stderr'])
    logArguments = " ".join(currentArgs) + 2 * '\n'
    jobName = 'Assembly'
    #sJob = mthread.singleJob(currentArgs, jobName, self.contigsFile, jobName, stdOutFile=logFile, stdErrOutFile=errFile, clusterLogDir=self.varsP.clusterLogDir, expectedStdoutFile=outFile+".stdout")
    sJob = mthread.singleJob(currentArgs, jobName, self.contigsFile, jobName,
                             clusterLogDir=self.varsP.clusterLogDir,
                             expectedStdoutFile=outFile + ".stdout")
    self.addJob(sJob)
    self.logArguments()
def generateJobListChar(self, noise_in, input_file, optSection):
    self.output_file = os.path.join(self.output_folder, optSection)  #must assign before return bc used in constructor
    if not self.varsP.executeCurrentStage:
        return 1  #tell self.__init__ not to continue processing
    self.varsP.updatePipeReport('%s\n' % optSection)
    #moved to constructor:
    #self.output_folder=os.path.join(self.varsP.contigFolder, "auto_noise")
    #if not util.checkDir(self.output_folder) : #will make if not exist, only returns False if already exists or can't make
    #    print "ERROR in autoNoise.varsPipeline.prepareContigIO: bad dir:", self.output_folder
    expectedResultFile = self.output_file + ".err"
    # We use the assembly cluster section here because the memory usage is higher
    # than pairwise, while the jobs are quite short.
    #moved to constructor:
    #super(autoNoise, self).__init__(self.varsP, self.stageName, clusterArgs=self.varsP.getClusterArgs("assembly"))
    #cargs=[self.varsP.RefAlignerBin, '-f', '-i', input_file, "-ref", self.varsP.ref, "-maxthreads", str(self.varsP.maxthreads), "-o", self.output_file]
    #remove maxthreads bc this is always running on its own
    cargs = [self.varsP.RefAlignerBin, '-f', '-i', input_file, "-ref", self.varsP.ref, "-o", self.output_file]
    if self.varsP.stdoutlog:
        cargs.extend(['-stdout', '-stderr'])
    cargs.extend(['-output-veto-filter', 'intervals.txt$'])
    for v in noise_in.keys():
        cargs.extend(["-" + v, str(noise_in[v])])
    cargs.extend(self.varsP.argsListed(optSection))
    if self.varsP.bnxStatsFile != None:
        cargs += ['-XmapStatWrite', self.varsP.bnxStatsFile]
    self.addJob(mthread.singleJob(cargs, self.stageName, expectedResultFile, self.stageName,
                                  clusterLogDir=self.varsP.clusterLogDir,
                                  expectedStdoutFile=self.output_file + ".stdout"))
    return 0  #success
def generateJobList(self, argset=-1):
    if not self.varsP.ref:  #no jobs if no ref
        return
    jobargs = [self.varsP.RefAlignerBin, '-ref', self.varsP.ref]
    if argset == -1 and self.varsP.argData.has_key('characterizeDefault'):  # don't use nominal default
        opta = self.varsP.argsListed('characterizeDefault')
    elif argset == 1 and self.varsP.argData.has_key('characterizeFinal'):  #extend (on default) -- make this default
        opta = self.varsP.argsListed('characterizeFinal')
    else:  #this is an error
        self.varsP.updatePipeReport("ERROR in CharacterizeModule.generateJobList: invalid argset %s\n" % str(argset))
        return
    for i, curCharacterizeCmap in enumerate(self.varsP.curCharacterizeCmaps):
        if self.varsP.numCharacterizeJobs == 1:
            jobName = 'Char' + self.argStr + '_%s' % self.varsP.stageComplete
        else:
            jobName = 'Char' + self.argStr + '_%s_%d' % (self.varsP.stageComplete, i + 1)
        outFileName = os.path.split(curCharacterizeCmap)[-1].replace(".cmap", "")
        outfile = os.path.join(self.varsP.contigAlignTarget, outFileName)
        self.curCharacterizeFileRoots.append(outfile)
        expectedResultFile = outfile + ".xmap"
        self.xmapTarget = expectedResultFile
        currentArgs = jobargs + ["-i", curCharacterizeCmap, "-o", outfile]
        stdoutf = None
        if self.varsP.stdoutlog:
            currentArgs.extend(['-stdout', '-stderr'])
            stdoutf = outfile + ".stdout"
        currentArgs += ['-maxthreads', str(self.varsP.nThreads)]
        currentArgs += ['-output-veto-filter', '_intervals.txt$']
        currentArgs += opta
        s1Job = mthread.singleJob(currentArgs, jobName, expectedResultFile, jobName.replace(' ', ''),
                                  maxThreads=self.varsP.nThreads,
                                  clusterLogDir=self.varsP.clusterLogDir,
                                  expectedStdoutFile=stdoutf)
        self.addJob(s1Job)
        if i == 0:
            self.logArguments()
def getLambdaMapJob(self, snrCutoff=0, verbose=False):
    #Note: verbose will print once per job, so use for debugging only
    # add lambda alignment band to this
    lambdaFilter = self.varsP.argData['lambdaFilter']
    lamMinLen = float(lambdaFilter[lambdaFilter.index('-minlen') + 1]) if '-minlen' in lambdaFilter else 40.
    lamMaxLen = float(lambdaFilter[lambdaFilter.index('-maxlen') + 1]) if '-maxlen' in lambdaFilter else 60.
    lamMinLab = float(lambdaFilter[lambdaFilter.index('-minsites') + 1]) if '-minsites' in lambdaFilter else 6.
    lamMaxLab = float(lambdaFilter[lambdaFilter.index('-maxsites') + 1]) if '-maxsites' in lambdaFilter else 10.
    #old format below (dict, not list)
    #lamMinLen = int(lambdaFilter['-minlen' ]) # 40
    #lamMaxLen = int(lambdaFilter['-maxlen' ]) # 60
    #lamMinLab = int(lambdaFilter['-minsites']) # 6
    #lamMaxLab = int(lambdaFilter['-maxsites']) # 10
    if verbose:
        self.varsP.updateInfoReport("lamMinLen = %.0f\n" % lamMinLen, printalso=True)
        self.varsP.updateInfoReport("lamMaxLen = %.0f\n" % lamMaxLen, printalso=True)
        self.varsP.updateInfoReport("lamMinLab = %.0f\n" % lamMinLab, printalso=True)
        self.varsP.updateInfoReport("lamMaxLab = %.0f\n" % lamMaxLab, printalso=True)

    #need mol file to do this; if it doesn't exist, return with warning
    if not os.path.exists(self.molFile):
        print "Skipping map lambda job", self.molTag, "because mol file missing:", self.molFile
        self.lambdaErrFile = None
        return

    bnxFileLambda = '%s_lambda.bnx' % self.molTag
    bnxFileLambda = os.path.join(os.path.split(self.molFile)[0], bnxFileLambda)
    #if lambda bnx exists, skip the isolation step
    if os.path.exists(bnxFileLambda):
        print "Using lambda bnx", bnxFileLambda
    else:
        print ' Isolating Lambda %s' % self.molTag
        lab2File = self.molFile.replace('.mol', '.0.lab')
        scanDset = molecule.moleculeDataset(self.curExp.basesPerPixel, molTag=int(self.molTag))
        scanDset.readMolFile(self.molFile)
        scanDset.annotateLabels(lab2File)
        # Introduce optArguments for Lambda Band
        scanDsetLambda = molecule.filteredSubset(scanDset, snrCutoff, lamMinLab, lamMaxLab,
                                                 lamMinLen, lamMaxLen, True)
        scanDsetLambda.writeBnxFile(bnxFileLambda, quality=self.quality)
    self.lambdaBnx = bnxFileLambda

    baseArgs = self.varsP.argsListed('mapLambda')
    outputTarget = bnxFileLambda.replace('.bnx', '')
    curArgs = [self.varsP.RefAlignerBin, '-i', bnxFileLambda, '-o', outputTarget,
               '-ref', self.varsP.lambdaRef] + baseArgs
    if self.varsP.stdoutlog:
        curArgs.extend(['-stdout', '-stderr'])
    jobTag = self.molTag + '_lambda'
    self.lambdaErrFile = outputTarget + '.err'
    #if the err file exists, no need to process
    if os.path.exists(self.lambdaErrFile):
        print "Skipping map lambda job ", jobTag, "because err file exists", self.lambdaErrFile
        return
    return mthread.singleJob(curArgs, jobTag, self.lambdaErrFile, jobTag)
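# The repeated "index(flag)+1" lookups above amount to reading the value that
# follows a flag in a flat argument list. The same pattern as a helper
# (hypothetical; the pipeline inlines it):
def flag_value(args, flag, default):
    return float(args[args.index(flag) + 1]) if flag in args else default

# flag_value(['-minlen', '40', '-maxsites', '10'], '-minlen', 40.) == 40.0
# flag_value(['-minlen', '40', '-maxsites', '10'], '-maxlen', 60.) == 60.0  -- default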
def getTargetJobs(self, dormdir=False):
    localDataLocation = os.path.join(self.varsP.localRoot, self.expTag + '/')
    #print "localDataLocation:", localDataLocation #debug
    if dormdir:
        sJobRmName = 'Pre-Remove Folder: ' + shorten(localDataLocation)
        sJobRm = mthread.singleJob(['rm', '-f', '-r', localDataLocation], sJobRmName, '', 'rmDir')
        sJobMkdirName = 'Make Folder: ' + shorten(localDataLocation)
        sJobMkdir = mthread.singleJob(['mkdir', localDataLocation], sJobMkdirName, localDataLocation, 'mkDir')
        sJobMkdir.addContingentJob(sJobRm)
        allJobs = [sJobRm, sJobMkdir]
        contingentjob = sJobMkdir
    else:
        util.checkDir(localDataLocation)  #will make dir localDataLocation
        allJobs = []
        contingentjob = None
    for scan in self.scans:
        scanjobs = scan.getDetectJobs(contingentjob)
        if not scanjobs:  #no scan jobs means the scan has already been processed--skip it
            self.varsP.updatePipeReport("Device.getTargetJobs: skipping path " + scan.nameStr() + "\n")
        else:
            allJobs += scanjobs
    return allJobs
def __init__(self, varsP):
    jobName = "reference_process"
    opta_section = "referenceSvdetect"
    default_mres = "2.9"
    mres = "-mres"
    self.varsP = varsP
    usedefault = False
    if self.varsP.argData.has_key(opta_section):  #check if in optargs
        opta = self.varsP.argsListed(opta_section)
        if not mres in opta:  #must have mres
            self.varsP.updatePipeReport("Warning in referenceProcess: " + mres +
                                        " missing in optArguments section " + opta_section + "\n")
            usedefault = True
    else:
        self.varsP.updatePipeReport("Warning in referenceProcess: optArguments section " +
                                    opta_section + " missing\n")
        usedefault = True
    if usedefault:
        opta = [mres, default_mres]

    mresstr = opta[opta.index(mres) + 1]  #get string for mres value for output name
    mresstr = mresstr.replace(".", "")

    if not util.checkDir(self.varsP.refFolder):
        self.varsP.updatePipeReport("ERROR in referenceProcess: could not make output dir %s\n" % self.varsP.refFolder)
        return None

    refpref = os.path.basename(self.varsP.ref[:self.varsP.ref.rfind(".")]) + "_res" + mresstr
    outarg = os.path.join(self.varsP.refFolder, refpref)  #refFolder is new output folder for this job
    expectedResultFile = outarg + ".cmap"  #if ref is spots, is this spots?
    args = [self.varsP.RefAlignerBin, '-o', outarg, '-i', self.varsP.ref, '-f', '-merge'] + opta
    stdoutf = None
    if self.varsP.stdoutlog:
        args.extend(['-stdout', '-stderr'])
        stdoutf = outarg + ".stdout"
    args += ['-maxthreads', str(self.varsP.nThreads)]

    super(referenceProcess, self).__init__(self.varsP, jobName, clusterArgs=self.varsP.getClusterArgs("assembly"))
    job = mthread.singleJob(args, jobName, expectedResultFile, jobName,
                            maxThreads=self.varsP.nThreads,
                            clusterLogDir=self.varsP.clusterLogDir,
                            expectedStdoutFile=stdoutf)
    self.addJob(job)
    util.LogStatus("progress", "stage_start", jobName)
    self.varsP.runJobs(self, "referenceProcess")
    self.doAllPipeReport()
    if not self.allResultsFound():  #this is an error, but we'll continue processing without SV detect
        err = "ERROR in referenceProcess: job failed, disabling SV detect"
        self.varsP.updatePipeReport(err + "\n")
        util.LogError("error", err)
        #self.varsP.runSV = False #no need since this class is used in SVModule
    else:
        self.varsP.refDeresed = expectedResultFile  #store good result for SV detect
        self.varsP.updatePipeReport("referenceProcess: using reference %s for svdetect\n" % self.varsP.refDeresed)
        util.LogStatus("progress", "stage_complete", jobName)
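# The output prefix above encodes the -mres value into the reference filename.
# Just that transformation, as a sketch (helper name and example paths are
# hypothetical):
import os

def deresed_name(ref_path, mres_value):
    base = os.path.basename(ref_path[:ref_path.rfind(".")])
    return base + "_res" + mres_value.replace(".", "")

# deresed_name("/refs/hg19.cmap", "2.9") == "hg19_res29"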
def generateJobListTriangle(self):
    baseArgs = self.varsP.argsListed('noise0') + self.varsP.argsListed('pairwise')
    cArgs = [self.varsP.RefAlignerBin, '-i', self.varsP.bnxFile]  #unused below; each job builds its own input args
    ct = 0
    outputTarget = os.path.join(self.varsP.alignFolder, 'exp')
    njobs = self.varsP.nPairwiseJobs * (self.varsP.nPairwiseJobs + 1) / 2
    BNX_list = []
    for i in range(1, self.varsP.nPairwiseJobs + 1):
        file1 = self.varsP.bnxFile.replace(".bnx", "_%s_of_%s.bnx" % (i, self.varsP.nPairwiseJobs))
        BNX_list.append(file1 + "\n")
        for j in range(i, self.varsP.nPairwiseJobs + 1):
            file2 = self.varsP.bnxFile.replace(".bnx", "_%s_of_%s.bnx" % (j, self.varsP.nPairwiseJobs))
            jobName = 'Pairwise %d of %d' % (ct + 1, njobs)
            outputString = 'pairwise%dof%d' % (ct + 1, njobs)
            expectedResultFile = outputTarget + outputString + '.align'
            if i == j:
                currentArgs = [self.varsP.RefAlignerBin, '-i', file1] + ['-o', outputTarget + outputString] + baseArgs
            else:
                currentArgs = [self.varsP.RefAlignerBin, "-first", "-1", "-i", file1, "-i", file2] + \
                              ['-o', outputTarget + outputString] + baseArgs
            if self.varsP.stdoutlog:
                currentArgs.extend(['-stdout', '-stderr'])
            #if self.varsP.nPairwiseJobs > 1:
            #    currentArgs += partialArgs
            currentArgs += ['-maxthreads', str(self.varsP.maxthreads)]
            if self.varsP.bnxStatsFile != None:
                currentArgs += ['-XmapStatRead', self.varsP.bnxStatsFile]
            #if ct == 0: #redundant with logArguments below
            #    self.pipeReport += " ".join(currentArgs) + 2 * '\n'
            sJob = mthread.singleJob(currentArgs, jobName, expectedResultFile, outputString,
                                     maxThreads=self.varsP.maxthreads,
                                     clusterLogDir=self.varsP.clusterLogDir,
                                     expectedStdoutFile=outputTarget + outputString + ".stdout")  #, shell=True)
            ct += 1
            self.addJob(sJob)
    self.varsP.bnxFileList = self.varsP.bnxFile.replace(".bnx", ".list")
    f = open(self.varsP.bnxFileList, "w")
    f.writelines(BNX_list)
    f.close()
    self.logArguments()
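# The loop above enumerates every unordered pair of bnx chunks exactly once,
# giving N*(N+1)/2 jobs (self-pairs run without "-first -1"). The pairing in
# isolation (helper name hypothetical):
def triangle_pairs(n):
    return [(i, j) for i in range(1, n + 1) for j in range(i, n + 1)]

# triangle_pairs(3) == [(1,1), (1,2), (1,3), (2,2), (2,3), (3,3)]  -- 3*4/2 = 6 jobs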
def generateJobList(self):
    curArgs = self.varsP.argsListed('noise0') + self.varsP.argsListed('sampleChar')
    if util.checkFile(self.varsP.bnxTarget):  #file exists only if image processing was run
        bnxFiles = parseExperimentFile(self.varsP.bnxTarget)
        if not bnxFiles:  #check that you got at least one
            errstr = "ERROR in SampleChar.generateJobList: no bnx files found in: " + self.varsP.bnxTarget
            print errstr
            self.varsP.updatePipeReport(errstr + "\n\n")
            return
        basepath = ""  #os.path.split(bnxFiles[0])[0] #don't use basepath for this case
    else:  #otherwise, assume this is the only bnx file
        bnxFiles = [self.varsP.bnxFile]
        #here, make a dir for the results--should really check the result of checkDir for errors
        basepath = os.path.join(self.varsP.localRoot, "sampleChar")
        if self.varsP.wipe and os.path.isdir(basepath):
            shutil.rmtree(basepath)
            #util.checkEmptyDir(basepath) #will make if not exist, but if it does, will remove and re-make -- this fn doesn't exist...
        #else :
        util.checkDir(basepath)  #will make if not exist, but won't remove anything
    nJobs = len(bnxFiles)
    #for i, bnxFile in enumerate(bnxFiles):
    for bnxFile in bnxFiles:
        #bnxGroupName = '%02d' % (i+1) #get this from the path, ie, bnxFiles
        cargs = [self.varsP.RefAlignerBin, '-i', bnxFile]
        bnxname = os.path.split(bnxFile)[1].replace(".bnx", "")
        jobname = 'Sample_Char_' + bnxname
        #outputTarget = os.path.join(basepath, bnxGroupName)
        if basepath:  #bnx input
            outputTarget = os.path.join(basepath, bnxname)
        else:  #image processing
            outputTarget = bnxFile.replace(".bnx", "") + "_sampleChar"
        expectedResultFile = outputTarget + '.err'  #this is used in checkResults
        currentArgs = cargs + ['-ref', self.varsP.ref, '-o', outputTarget, '-f']
        if self.varsP.stdoutlog:
            currentArgs.extend(['-stdout', '-stderr'])
        currentArgs += ['-maxthreads', str(self.varsP.maxthreads)] + curArgs
        # peStr is deprecated in favor of clusterargs
        sJob = mthread.singleJob(currentArgs, jobname, expectedResultFile, jobname,
                                 clusterLogDir=self.varsP.clusterLogDir)
        #sJob.expTag = bnxGroupName #removed from checkResults
        self.addJob(sJob)
    self.logArguments()
def generateJobList(self):
    if not self.varsP.executeCurrentStage:
        return 1  #tell self.__init__ not to continue processing
    sorted_file = self.varsP.sorted_file
    self.varsP.updatePipeReport('Sorting %s into %s\n' % (self.varsP.bnxFile, sorted_file))
    expectedResultFile = sorted_file + ".bnx"
    # We use the assembly cluster section here because the memory usage is higher
    # than pairwise, while the jobs are quite short.
    #sortJobSet=mthread.jobWrapper(self.varsP,jobName,clusterArgs=self.varsP.getClusterArgs('assembly'))
    super(sortBNX, self).__init__(self.varsP, self.stageName, clusterArgs=self.varsP.getClusterArgs("assembly"))
    cargs = [self.varsP.RefAlignerBin, '-f', '-i', self.varsP.bnxFile,
             "-maxthreads", str(self.varsP.maxthreads), "-merge", "-sort-idinc",
             "-bnx", "-o", sorted_file] + self.varsP.argsListed('bnx_sort')
    if self.varsP.bnxStatsFile != None:
        cargs += ['-XmapStatWrite', self.varsP.bnxStatsFile]
    if self.varsP.stdoutlog:
        cargs.extend(['-stdout', '-stderr'])
    self.addJob(mthread.singleJob(cargs, self.stageName, expectedResultFile, self.stageName,
                                  clusterLogDir=self.varsP.clusterLogDir,
                                  expectedStdoutFile=sorted_file + ".stdout"))
    return 0  #success
def checkResults(self, stageSuffix=""):
    '''Call jobWrapper (self) .doAllPipeReport, and varsP.mergeIntoSingleCmap.
    stageSuffix, if supplied, is appended to varsP.stageComplete in order to fix
    the stage name reported by the CharacterizeModule in the informaticsReport.
    '''
    self.doAllPipeReport()
    self.varsP.stageComplete = self.refineStage + stageSuffix
    if self.refineStage not in ['refineB0', 'refineFinal0', 'extension0']:
        #assumption: the source is truncated here; per the docstring, the
        # non-grouping stages merge their per-contig output into a single cmap
        self.varsP.mergeIntoSingleCmap()
def generateJobList(self):
    """AlignModule.generateJobList: create RefAligner jobs for aligning molecules to contigs.
    """
    #for runAlignMol, this method is called but not used: exit if RefAlignerBin is empty
    if not self.varsP.RefAlignerBin:
        return

    #the contigs are obtained from varsP.latestMergedCmap--check its validity;
    # a return here means no jobs, and no jobs is now handled in multiThreadRunJobs.
    if not self.doref and (not self.varsP.latestMergedCmap or
                           not util.checkCmap(self.varsP.latestMergedCmap)):
        err = "Error in AlignModule.generateJobList: varsP.latestMergedCmap is not set or not valid cmap; skipping %s" % self.stageName
        self.varsP.updatePipeReport(err + "\n")
        util.LogError("error", err)
        return

    #Note: noise parameters should be fixed because when the bnx is split, -M
    # would find different parameters for different contigs. Use noise0.
    baseargs = [self.varsP.RefAlignerBin]
    if not self.doref:
        baseargs += ['-ref', self.varsP.latestMergedCmap]  #reference is latest merged cmap
        mappref = os.path.split(self.varsP.latestMergedCmap)[1]
        mappref = mappref[:mappref.find(".")]
    else:
        baseargs += ['-ref', self.varsP.ref]
        mappref = self.stageName  #use stageName also for output filename

    noiseargs = self.varsP.argsListed('noise0')
    haverefargs = False
    try:  #argsListed does not check key
        refargs = self.varsP.argsListed(self.stageName)  #'alignmolvref'
        haverefargs = True
    except KeyError:  #this is same as old behavior
        #refargs = self.varsP.argsListed('noise0') + self.varsP.argsListed(self.argStageName) #old
        refargs = self.varsP.argsListed(self.argStageName)  #new
        #refargs = noiseargs + refargs
    if haverefargs:
        self.jobargs = refargs

    #single job with bnxin (constructor)
    if self.bnxin:
        outarg = os.path.join(self.alignTarget, mappref)
        self.outFileList.append(outarg)  #file prefixes
        jobargs = baseargs + ['-o', outarg]
        jobargs += ['-i', self.bnxin]
        stdoutf = None
        if self.varsP.stdoutlog:  #remember, these must be after -o
            jobargs.extend(['-f', '-stdout', '-stderr'])
            stdoutf = outarg + ".stdout"
        jobargs += ['-maxthreads', str(self.varsP.maxthreads)]
        #add noise0 before alignmol (stageName) so that the latter can override the former
        jobargs += noiseargs
        jobargs.extend(['-output-veto-filter', 'intervals.txt$'])  #this feature not in old RefAligner
        jobargs += refargs
        s1Job = mthread.singleJob(jobargs, self.stageName, outarg + ".xmap", self.stageName,
                                  maxThreads=self.varsP.maxthreads,
                                  clusterLogDir=self.varsP.clusterLogDir,
                                  expectedStdoutFile=stdoutf)
        self.addJob(s1Job)
        return  #and this is the only job

    #loop over the split bnxs, make one job per bnx
    for idx in range(1, self.varsP.nPairwiseJobs + 1):
        outarg = os.path.join(self.alignTarget, mappref + "_" + str(idx))
        self.outFileList.append(outarg)  #file prefixes
        jobargs = baseargs + ['-o', outarg]
        idxstr = "_%s_of_%s" % (idx, self.varsP.nPairwiseJobs)
        jobargs += ['-i', self.varsP.bnxFile.replace(".bnx", idxstr + ".bnx")]
        stdoutf = None
        if self.varsP.stdoutlog:  #remember, these must be after -o
            jobargs.extend(['-f', '-stdout', '-stderr'])
            stdoutf = outarg + ".stdout"
        jobargs += ['-maxthreads', str(self.varsP.maxthreads)]
        #add noise0 before alignmol (stageName) so that the latter can override the former
        jobargs += noiseargs
        #if idx != 1 : #keep _r for first job only -- copied from SVModule
        #    jobargs.extend( ['-output-veto-filter', '_r.cmap$'] ) #need this for copy number; do NOT veto
        jobargs.extend(['-output-veto-filter', 'intervals.txt$'])  #this feature not in old RefAligner
        jobargs += refargs
        s1Job = mthread.singleJob(jobargs, self.stageName + idxstr, outarg + ".xmap",
                                  self.stageName + idxstr, maxThreads=self.varsP.maxthreads,
                                  clusterLogDir=self.varsP.clusterLogDir,
                                  expectedStdoutFile=stdoutf)
        self.addJob(s1Job)
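# The "_<i>_of_<N>" suffix used above is the naming convention splitBNX applies
# to its output chunks; several stages reconstruct the names the same way.
# In one place (helper name and example path hypothetical):
def split_bnx_name(bnx_path, idx, njobs):
    return bnx_path.replace(".bnx", "_%s_of_%s.bnx" % (idx, njobs))

# split_bnx_name("/data/all.bnx", 2, 8) == "/data/all_2_of_8.bnx"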
def generateJobList(self):
    baseArgs1 = self.varsP.argsListed(self.refineStage)
    for case in util.switch(self.refineStage):
        if case("refine(B1|Final1)", regexp=True):
            baseArgs1 += self.varsP.argsListed('noise0')
            ContigGroupList = self.findGroupedContigs()
            r1args = [self.varsP.RefAlignerBin]
            break
        if case("refine(B0|Final0)", regexp=True):
            baseArgs1 += self.varsP.argsListed('noise0')
            ContigGroupListFull = self.groupContigs()
            setattr(self.varsP, "count_" + self.varsP.outputContigPrefix, ContigGroupListFull)
            #print self.varsP.outputContigPrefix, getattr(self.varsP, "count_"+self.varsP.outputContigPrefix)
            #r1args = [self.varsP.RefAlignerBin, '-i', self.varsP.bnxFile]
            #InputFileList=[self.varsP.bnxFile]
            r1args = [self.varsP.RefAlignerBin]
            ContigGroupList = zip(range(1, self.varsP.nPairwiseJobs + 1),
                                  range(1, self.varsP.nPairwiseJobs + 1),
                                  [self.varsP.bnxFile.replace(".bnx", "_%s_of_%s.bnx" % (x, self.varsP.nPairwiseJobs))
                                   for x in range(1, self.varsP.nPairwiseJobs + 1)],
                                  [1] * self.varsP.nPairwiseJobs)
            break
        if case("refineA"):
            baseArgs1 += self.varsP.argsListed('noise0')
            ContigGroupList = self.groupContigs()
            #print("Found %d groups for refineA" % (len(ContigGroupList)))
            #r1args = [self.varsP.AssemblerBin, '-i', self.varsP.bnxFile.replace(".bnx", "_sorted.bnx")] #need this before -contigs
            # -- can no longer use all_sorted.bnx due to scan scaling: must refer to varsP.sorted_file
            #r1args = [self.varsP.AssemblerBin, '-i', self.varsP.sorted_file+".bnx"] #need this before -contigs
            #use split files in case splitting changed (eg due to scan scaling producing labels at < 20 bp)
            r1args = [self.varsP.AssemblerBin, '-if', self.varsP.bnxFileList]  #need this before -contigs
            r1args += ['-contigs', os.path.join(self.varsP.inputContigFolder, self.varsP.inputContigPrefix) + '.contigs']
            break
        if case("refineNGS"):
            r1args = [self.varsP.RefAlignerBin, '-i', self.varsP.bnxFile]
            ContigGroupList = self.groupContigs()
            break
        if case("extension0"):
            baseArgs1 += self.varsP.argsListed('noise0')
            ContigGroupList = self.groupContigs()
            setattr(self.varsP, "count_" + self.varsP.outputContigPrefix, ContigGroupList)
            #print self.varsP.outputContigPrefix, getattr(self.varsP, "count_"+self.varsP.outputContigPrefix), self.varsP.inputContigFolder, self.varsP.inputContigPrefix
            #r1args = [self.varsP.RefAlignerBin, '-i', self.varsP.bnxFile]
            #InputFileList=[self.varsP.bnxFile]
            r1args = [self.varsP.RefAlignerBin]
            ContigGroupList = zip(range(1, self.varsP.nPairwiseJobs + 1),
                                  range(1, self.varsP.nPairwiseJobs + 1),
                                  [self.varsP.bnxFile.replace(".bnx", "_%s_of_%s.bnx" % (x, self.varsP.nPairwiseJobs))
                                   for x in range(1, self.varsP.nPairwiseJobs + 1)],
                                  [1] * self.varsP.nPairwiseJobs)
            break
        if case("extension1"):
            baseArgs1 += self.varsP.argsListed('noise0')
            ContigGroupList = self.findGroupedContigs()
            r1args = [self.varsP.RefAlignerBin]
            break
        if case():
            self.varsP.error += 1
            self.varsP.message += ' Error: Refine stage name invalid: ' + str(self.refineStage) + '\n'
            return

    stdarg = []
    if self.varsP.stdoutlog:  #this is the same for all cases below
        stdarg = ['-stdout', '-stderr']

    #contigFiles, contigIDs = self.varsP.findContigs(self.varsP.inputContigFolder, self.varsP.inputContigPrefix)
    #nJobs = len(contigFiles)
    output1String = os.path.join(self.varsP.outputContigFolder, self.varsP.outputContigPrefix)

    #for jobNum in range(1,nJobs + 1): contigID = contigIDs[jobNum - 1]
    for m in range(0, len(ContigGroupList)):
        contigID = ContigGroupList[m][0]
        rawContigID = ContigGroupList[m][1]
        contig = ContigGroupList[m][2]

        # Figure out desired number of threads to use
        threadBoost = ceil(ContigGroupList[m][3])
        if threadBoost < 1:
            threadBoost = 1
        minthreads = self.varsP.getClusterArgs(self.refineStage, category="MinThreads")
        if minthreads:
            minthreads = Template(minthreads).substitute(maxthreads=self.varsP.maxthreads)
        else:
            minthreads = self.varsP.maxthreads
        nthreads = float(minthreads)
        nthreads = int(round(nthreads * threadBoost))
        if nthreads > self.varsP.maxthreads:
            nthreads = self.varsP.maxthreads

        # for contigID, contig in ContigGroupList :
        jobName = self.refineStage + ' %5s' % contigID
        for case in util.switch(self.refineStage):
            if case("refineA"):
                endId = int(rawContigID) + self.bunching - 1
                if m + 1 < len(ContigGroupList):
                    endId = int(ContigGroupList[m + 1][1]) - 1
                #these must come after r1args because they're actually arguments to -contigs
                currentArgs = [str(rawContigID), str(endId), '-maxthreads', str(nthreads)]
                #currentArgs = r1args + currentArgs + baseArgs1 + ['-id', str(contigID), '-i', contig+"_mapped.bnx", '-o', output1String]
                currentArgs = r1args + currentArgs + ['-o', output1String] + stdarg + baseArgs1
                expectedOutputString = self.varsP.outputContigPrefix + '_contig' + str(rawContigID)
                expectedResultFile = os.path.join(self.varsP.outputContigFolder, expectedOutputString + '.cmap')  #refineB
                expectedStdoutFile = output1String + "_id" + str(rawContigID) + ".stdout"
                break
            #if case("refineB1|refineFinal1|extension1", regexp=True):
            #    ## TODO: make thread number configurable from clusterArgs
            #    currentArgs = ['-maxthreads', str(16), self.ref_arg, contig]
            #    currentArgs = r1args + currentArgs + baseArgs1 + ['-id', str(contigID), '-i', contig+"_mapped.bnx", '-o', output1String]
            #    expectedOutputString = self.varsP.outputContigPrefix + '_contig' + str(contigID)
            #    expectedResultFile = os.path.join(self.varsP.outputContigFolder, expectedOutputString + '.cmap') #refineB
            #    break
            if case("refineB1|refineFinal1|extension1", regexp=True):
                Inputs = zip(["-i"] * self.varsP.nPairwiseJobs,
                             [contig.replace("_group", "_group" + str(i) + "_mapped_group") + ".bnx"
                              for i in range(1, self.varsP.nPairwiseJobs + 1)])
                Inputs = [x for t in Inputs for x in t]
                #-id must come before -o, otherwise expectedStdoutFile is wrong
                currentArgs = ['-maxthreads', str(nthreads), '-id', str(contigID), '-o', output1String,
                               self.ref_arg, contig]
                currentArgs = r1args + currentArgs + stdarg + baseArgs1 + Inputs
                expectedOutputString = self.varsP.outputContigPrefix + '_contig' + str(rawContigID)
                expectedResultFile = os.path.join(self.varsP.outputContigFolder, expectedOutputString + '.cmap')  #refineB
                expectedStdoutFile = output1String + "_id" + str(contigID) + ".stdout"
                break
            #if case("refineB0|refineFinal0|extension0", regexp=True):
            #    currentArgs = ['-maxthreads', str(self.varsP.maxthreads), self.ref_arg, contig]
            #    currentArgs = r1args + currentArgs + baseArgs1 + ['-mapped-unsplit', '1', '-refine', '0', '-mapped', contig+"_mapped", "-o", "/dev/null"]
            #    expectedOutputString = self.refineStage + "contig"+str(contigID) + "_mapped.bnx"
            #    expectedResultFile = contig + "_mapped.bnx" #refineB
            #    break
            if case("refineB0|refineFinal0|extension0", regexp=True):
                currentArgs = ['-maxthreads', str(nthreads),
                               "-ref", os.path.join(self.varsP.inputContigFolder,
                                                    util.uniquifyContigName(self.varsP.inputContigPrefix) + ".cmap")]
                outputfile = os.path.join(self.varsP.outputContigFolder,
                                          self.varsP.outputContigPrefix + '_group' + str(contigID))
                #-id must come before -o, otherwise expectedStdoutFile is wrong
                currentArgs = r1args + ['-i', contig, '-id', str(contigID), '-o', outputfile] + stdarg + currentArgs + baseArgs1
                currentArgs += ['-refine', '0',
                                '-grouped', os.path.join(self.varsP.outputContigFolder,
                                                         self.varsP.outputContigPrefix + '_group_manifest'),
                                '-mapped', os.path.join(self.varsP.outputContigFolder,
                                                        self.varsP.outputContigPrefix + '_group' + str(contigID) + "_mapped"),
                                '-output-filter', ".*.bnx"]
                expectedOutputString = self.varsP.outputContigPrefix + '_group' + str(contigID) + "_mapped.bnx"
                expectedResultFile = outputfile + "_mapped_group1.bnx"
                expectedStdoutFile = outputfile + "_id" + str(contigID) + ".stdout"
                break
            if case():
                self.varsP.updatePipeReport("Internal error: cannot handle stage %s" % (self.refineStage))
                raise ValueError

        if self.varsP.bnxStatsFile != None:
            currentArgs.extend(['-XmapStatRead', self.varsP.bnxStatsFile])
        s1Job = mthread.singleJob(currentArgs, jobName, expectedResultFile, expectedOutputString,
                                  maxThreads=nthreads,
                                  clusterLogDir=self.varsP.clusterLogDir,
                                  expectedStdoutFile=expectedStdoutFile)
        self.addJob(s1Job)
    self.logArguments()
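# util.switch is internal to the pipeline; from the call sites above it behaves
# like the well-known Python "switch" iterator recipe, extended with a
# regexp=True match. A plausible minimal reconstruction for illustration only
# (assumption: this is not the pipeline's actual implementation):
import re

class switch_sketch(object):
    def __init__(self, value):
        self.value = value
        self.fall = False  #set after a match so later case() calls fall through
    def __iter__(self):
        yield self.match
    def match(self, *args, **kwargs):
        if self.fall or not args:  #no args: the default case() branch
            return True
        if kwargs.get("regexp"):
            if any(re.match("(%s)$" % p, self.value) for p in args):
                self.fall = True
                return True
            return False
        if self.value in args:
            self.fall = True
            return True
        return False

# for case in switch_sketch("refineFinal1"):
#     if case("refine(B1|Final1)", regexp=True):
#         ...grouped refine path...; break
#     if case("refineA"):
#         ...Assembler refine path...; break
#     if case():
#         ...invalid stage...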