def __init__(self, varsP, doref=False, outputdir=None, bnxin=None):
    """doref determines parameter set from optargs. outputdir not needed for Pipeline,
    but used in runAlignMol.py. If bnxin supplied, will run single job with it.
    """
    self.varsP = varsP  #pipeline-wide settings/state object (presumably Pipeline.varsPipeline -- TODO confirm)
    self.doref = doref  #True: align molecules to reference; False: align to assembled contigs
    self.bnxin = bnxin #see generateJobList
    #always pull RefAligner arguments from the 'alignmol' optArgs section, even for the reference case
    self.argStageName = 'alignmol' #use arguments from alignmol (optArgs, not clusterArgs)
    if not doref:
        self.stageName = 'alignmol' #also name of dir which is sub-dir of varsP.outputContigFolder
        self.alignTarget = os.path.join(varsP.outputContigFolder, self.stageName) #output dir
        self.varsP.alignMolDir = self.alignTarget #store in varsP for subsequent processing
    else:
        self.stageName = self.varsP.alignMolvrefName #also name of dir which is sub-dir of localRoot
        self.alignTarget = os.path.join(self.varsP.contigFolder, self.stageName) #output dir
    #an explicit outputdir (runAlignMol.py) overrides either default chosen above
    if outputdir:
        self.alignTarget = outputdir
    util.checkDir(self.alignTarget) #will make if doesn't exist
    self.mergedir = os.path.join(self.alignTarget, self.varsP.alignMolvrefMergeName) #destination for merged outputs
    #copy from AlignRefModule
    super(AlignModule, self).__init__(self.varsP, self.stageName, clusterArgs=self.varsP.getClusterArgs(self.stageName))
    self.outFileList = []  #output prefixes; see generateJobList
    self.generateJobList()
    self.logArguments()
def __init__(self, varsP, doref=False, outputdir=None, bnxin=None):
    """doref determines parameter set from optargs. outputdir not needed for Pipeline,
    but used in runAlignMol.py. If bnxin supplied, will run single job with it.
    """
    self.varsP = varsP
    self.doref = doref
    self.bnxin = bnxin #see generateJobList
    #arguments always come from the alignmol optArgs section (not clusterArgs)
    self.argStageName = 'alignmol'
    if doref :
        #stage dir lives under contigFolder, named by alignMolvrefName
        self.stageName = self.varsP.alignMolvrefName
        self.alignTarget = os.path.join(self.varsP.contigFolder, self.stageName)
    else :
        #stage dir is a sub-dir of varsP.outputContigFolder
        self.stageName = 'alignmol'
        self.alignTarget = os.path.join(varsP.outputContigFolder, self.stageName)
        self.varsP.alignMolDir = self.alignTarget #remember for downstream stages
    if outputdir : #runAlignMol.py supplies its own output dir
        self.alignTarget = outputdir
    util.checkDir(self.alignTarget) #creates the dir when absent
    self.mergedir = os.path.join(self.alignTarget, self.varsP.alignMolvrefMergeName)
    #same initialization as AlignRefModule
    super(AlignModule, self).__init__(self.varsP, self.stageName,
                                      clusterArgs=self.varsP.getClusterArgs(self.stageName))
    self.outFileList = []
    self.generateJobList()
    self.logArguments()
def generateJobListChar(self, noise_in, input_file, optSection) :
    """Build the single RefAligner characterization job for autoNoise.
    noise_in: dict of noise parameter name -> value, appended as "-key value" args.
    input_file: molecule file to align against varsP.ref.
    optSection: optArguments section name; also used as the output file prefix.
    Returns 1 when the stage is bypassed (tells self.__init__ to stop), 0 on success.
    """
    if not self.varsP.executeCurrentStage:
        return 1 #tell self.__init__ not to continue processing
    self.varsP.updatePipeReport('%s\n' % (optSection))
    self.output_folder=os.path.join(self.varsP.contigFolder, "auto_noise")
    #NOTE(review): on a bad dir this only prints and then continues to use it anyway -- confirm intended
    if not util.checkDir(self.output_folder) : #will make if not exist, only returns False if already exists or can't make
        print "ERROR in autoNoise.varsPipeline.prepareContigIO: bad dir:", self.output_folder
    self.output_file=os.path.join(self.output_folder, optSection)
    expectedResultFile=self.output_file+".err" #RefAligner writes <prefix>.err on completion
    # We use assembly section here because the memory usage is higher than pairwise, while the jobs are quite short.
    #sortJobSet=mthread.jobWrapper(self.varsP,jobName,clusterArgs=self.varsP.getClusterArgs('assembly'))
    super(autoNoise, self).__init__(self.varsP, self.stageName, clusterArgs=self.varsP.getClusterArgs("assembly"))
    #cargs=[self.varsP.RefAlignerBin, '-f', '-i', input_file, "-ref", self.varsP.ref, "-maxthreads", str(self.varsP.maxthreads), "-o", self.output_file]
    cargs=[self.varsP.RefAlignerBin, '-f', '-i', input_file, "-ref", self.varsP.ref, "-o", self.output_file] #remove maxthreads bc this is always running on its own
    if self.varsP.stdoutlog : #capture RefAligner stdout/stderr to files
        cargs.extend( ['-stdout', '-stderr'] )
    #noise parameters become individual RefAligner flags
    for v in noise_in.keys():
        cargs.extend(["-"+v, str(noise_in[v])])
    cargs.extend(self.varsP.argsListed(optSection))
    if self.varsP.bnxStatsFile!=None:
        cargs += ['-XmapStatWrite', self.varsP.bnxStatsFile]
    self.addJob(mthread.singleJob(cargs, self.stageName, expectedResultFile, self.stageName, clusterLogDir=self.varsP.clusterLogDir, expectedStdoutFile=self.output_file+".stdout"))
    return 0 #success
def __init__(self, varsP):
    """Collect the root (path without .err) of an existing characterization result,
    if any, and record the total assembly length (Mb) on varsP."""
    self.curCharacterizeFileRoots = []
    self.varsP = varsP #Characterize uses this for totAssemblyLenMb
    #the characterize dir ('alignref') is a sub-dir of the output contig folder
    outdir = os.path.join(varsP.outputContigFolder, self.varsP.characterizeDirName)
    if not util.checkDir(outdir, makeIfNotExist=False):
        return #dir absent: nothing to collect
    #take the first .err file in directory-listing order
    firsterr = next((name for name in os.listdir(outdir) if name.endswith(".err")), None)
    if not firsterr:
        return #no .err files found: give up
    self.curCharacterizeFileRoots.append(os.path.join(outdir, firsterr.replace(".err", "")))
    #total assembly length in Mb, read from the latest merged cmap
    self.varsP.totAssemblyLenMb = mapClasses.multiCmap(varsP.latestMergedCmap, lengthonly=True).totalLength / 1e6
def checkResults(self):
    """Verify pairwise alignment outputs: gather .align files from alignFolder,
    write the sorted list to alignTarget, and mark the stage complete.
    Returns 1 on error, None on success."""
    if self.varsP.ngsBypass : #pairwise was skipped completely, so do not check anything
        return #return None means success
    self.doAllPipeReport() #loops over self.jobList and calls CheckIfFileFound
    folder = self.varsP.alignFolder
    if not util.checkDir(folder, makeIfNotExist=False) :
        self.varsP.updatePipeReport("ERROR: bad alignFolder:%s\n\n" % self.varsP.alignFolder)
        return 1
    #read results from disk rather than from the singleJob instances
    alignFiles = sorted(os.path.join(folder, name)
                        for name in os.listdir(folder) if name.endswith(".align"))
    if not alignFiles :
        self.varsP.updatePipeReport("ERROR: no align files in alignFolder %s\n\n" % self.varsP.alignFolder)
        return 1
    self.varsP.writeListToFile(alignFiles, self.varsP.alignTarget)
    self.varsP.stageComplete = 'Pairwise'
def mergeRcmaps(outFileList, outdir, varsP=None, splitByContig=None, stageName="") :
    """Given a list of file prefixes (outFileList), append "_r.cmap" to them, and merge them to outdir.
    Report to varsP if supplied, stdout if not.
    Also support outFileList is full paths (including "_r.cmap").
    If splitByContig < 1, output each contig separately, if == 1, only output single merged cmap,
    and if > 1, do both. (Note: under Python 2 the default None compares < 1, so the per-contig
    branch runs by default -- TODO confirm this is intended.)
    Always use stagename if supplied; if not, must supply varsP otherwise prefix is empty.
    """
    if not util.checkDir(outdir) : #will make if missing; False means unusable
        err_msg = "Warning in AlignModule.mergeRcmaps: could not make outdir %s, skipping copy number" % outdir
        logOrPrintError(err_msg, varsP)
        return
    if not outFileList : #just an argument check--check for presence on disk is below
        err_msg = "Warning in AlignModule.mergeRcmaps: no maps supplied"
        logOrPrintError(err_msg, varsP)
        return
    outFileList.sort() #for reproducibility with runAlignMerge.py (different order when listing dir)
    rsuf = "_r.cmap"
    #even though outFileList should all be there, a job may have failed--check all, just existence
    present = []
    for outf in outFileList :
        target = (outf+rsuf if not outf.endswith(rsuf) else outf) #now support either prefix or full path
        if not util.checkFile(target) :
            err_msg = "Warning in AlignModule.mergeRcmaps: missing _r.cmap %s" % target
            logOrPrintError(err_msg, varsP)
        else :
            present.append(target)
    if not present : #no _r.cmaps found (this will also happen for empty outFileList)
        err_msg = "Warning in AlignModule.mergeRcmaps: no _r.cmaps found, skipping copy number"
        logOrPrintError(err_msg, varsP)
        return
    outFileList = present #yes, it's redundant, but now have rsuf appended
    mrgstr = (varsP.alignMolvrefMergeName if varsP else "merge")
    mergedmap = mc.multiCmap(outFileList[0]) #open original, edit in memory
    #now add other maps
    for rmap in outFileList[1:] : #don't add map 0 to itself
        if mergedmap.addCovOcc( mc.multiCmap(rmap) ) : #when calling addCovOcc, check return, warn if True
            err_msg = "Warning in AlignModule.mergeRcmaps: addCovOcc call failed for map %s" % rmap
            logOrPrintError(err_msg, varsP)
    #now it's merged, but the resulting map need to be written back to disk
    filepref = (varsP.outputContigPrefix if varsP and stageName == "" else stageName) #see split_XMapQcmap_byContig
    if splitByContig < 1 or splitByContig > 1 : #i.e. anything except exactly 1
        mergedmap.writeAllMapsToDisk( os.path.join(outdir, filepref+'_contig'), outsuf="_r" )
        report = "mergeRcmaps: wrote %i cmaps" % len(mergedmap.cmapdict)
    if splitByContig > 0 : #also write the single merged cmap
        mergedmap.writeToFile( os.path.join(outdir, filepref+"_"+mrgstr+rsuf) )
        report = "mergeRcmaps: wrote merged cmap with %i contigs" % len(mergedmap.cmapdict) #overwrites the per-contig report when both branches run
    #report result
    logOrPrintError(report, varsP, warn=False)
def mergeMap(varsP, outFileList, mergepath):
    """outFileList is list of path+prefixes--each should have a .map file: merge them
    to a merged .map file in dir mergepath.
    varsP may be None (runAlignMol.py); warnings then go to stdout and the default
    "merge" name suffix is used.
    """
    outFileList.sort() #sort to ensure reproducibility (order of entries)
    maplist = []
    for outpath in outFileList: #these are file prefixes
        if util.checkFile(outpath + ".map"):
            maplist.append(outpath + ".map")
        elif varsP:
            varsP.updatePipeReport(
                "Warning in AlignModule.mergeMap: missing map: " + outpath + ".map" + "\n")
        else:
            print "Warning in AlignModule.mergeMap: missing map: " + outpath + ".map" + "\n"
    if not len(maplist): #nothing to merge
        return
    #NOTE(review): this warning path calls varsP.updatePipeReport unconditionally --
    #would raise AttributeError when varsP is None; confirm callers always pass varsP here
    if not util.checkDir(mergepath):
        varsP.updatePipeReport(
            "Warning in AlignModule.mergeMap: merge path invalid: " + mergepath + "\n")
        return
    headstart = [ "#", "S", "M" ] #last two lines of header start with "Software" and "MappedMoleculeId"
    #header = ""
    headerdone = False #header lines are copied only until the first input file is consumed
    #data = ""
    lineno = 1 #can't just append: need to change index in first column
    sep = "\t"
    mappref = getMergeFilename(outFileList[0]) #also in getAlignStats
    mrgstr = (varsP.alignMolvrefMergeName if varsP else "merge" ) #same for vref and not
    outpath = os.path.join(mergepath, mappref + mrgstr + ".map")
    f1 = open(outpath, 'w')
    for path in maplist:
        f = open(path)
        for line in f:
            if line[0] in headstart and not headerdone:
                #header += line
                f1.write(line) #copy header line verbatim (first file only)
            elif line[0] not in headstart:
                tokens = line.split()
                tokens[0] = str(lineno) #renumber first column continuously across the merged output
                #data += sep.join(tokens)+"\n" #newline was stripped by split
                f1.write(sep.join(tokens) + "\n")
                lineno += 1
        headerdone = True #NOTE(review): set per-file, so headers of all later files are skipped -- confirm original indent
        f.close()
    #f1.write(header+data)
    f1.close()
def generateJobList(self): curArgs = self.varsP.argsListed('noise0') + self.varsP.argsListed('sampleChar') if util.checkFile(self.varsP.bnxTarget) : #file exists only if image processing was run bnxFiles = parseExperimentFile(self.varsP.bnxTarget) if not bnxFiles : #check that you got at least one errstr = "ERROR in SampleChar.generateJobList: no bnx files found in: "+self.varsP.bnxTarget print errstr self.varsP.updatePipeReport(errstr+"\n\n") return basepath = "" #os.path.split(bnxFiles[0])[0] #don't use basepath for this case else : #otherwise, assume this is the only bnx file bnxFiles = [self.varsP.bnxFile] #here, make a dir for the results--should really check results of checkEmptyDir for errors basepath = os.path.join(self.varsP.localRoot, "sampleChar") if self.varsP.wipe and os.path.isdir(basepath) : shutil.rmtree(basepath) #util.checkEmptyDir(basepath) #will make if not exist, but if it does, will remove and re-make -- this fn doesn't exist... #else : util.checkDir(basepath) #will make if not exist, but won't remove anything nJobs = len(bnxFiles) #for i, bnxFile in enumerate(bnxFiles): for bnxFile in bnxFiles : #bnxGroupName = '%02d' % (i+1) #get this from the path, ie, bnxFiles cargs = [self.varsP.RefAlignerBin, '-i', bnxFile] bnxname = os.path.split(bnxFile)[1].replace(".bnx","") jobname = 'Sample_Char_' + bnxname #outputTarget = os.path.join(basepath, bnxGroupName) if basepath : #bnx input outputTarget = os.path.join(basepath, bnxname) else : #image processing outputTarget = bnxFile.replace(".bnx","") + "_sampleChar" expectedResultFile = outputTarget + '.err' #this is used in checkResults currentArgs = cargs + ['-ref', self.varsP.ref, '-o' , outputTarget, '-f'] if self.varsP.stdoutlog : currentArgs.extend( ['-stdout', '-stderr'] ) currentArgs += ['-maxthreads', str(self.varsP.maxthreads)] + curArgs sJob = mthread.singleJob(currentArgs, jobname, expectedResultFile, jobname, clusterLogDir=self.varsP.clusterLogDir) # peStr is deprecated in favor of 
clusterargs #sJob.expTag = bnxGroupName #removed from checkResults self.addJob(sJob) self.logArguments()
def getTargetJobs(self, dormdir=False):
    """Assemble the detect jobs for every scan on this device.
    When dormdir is True, prepend jobs that remove and re-create the local data
    folder (detect jobs then wait on the mkdir); otherwise just ensure the folder
    exists. Returns the list of jobs."""
    dataDir = os.path.join(self.varsP.localRoot, self.expTag + '/')
    allJobs = []
    contingentjob = None
    if dormdir :
        #remove-then-mkdir pair; mkdir is contingent on the remove finishing
        rmJob = mthread.singleJob(['rm', '-f', '-r', dataDir],
                                  'Pre-Remove Folder: ' + shorten(dataDir), '', 'rmDir')
        mkJob = mthread.singleJob(['mkdir', dataDir],
                                  'Make Folder: ' + shorten(dataDir), dataDir, 'mkDir')
        mkJob.addContingentJob(rmJob)
        allJobs = [rmJob, mkJob]
        contingentjob = mkJob
    else :
        util.checkDir(dataDir) #will make dir when missing
    for scan in self.scans:
        scanjobs = scan.getDetectJobs(contingentjob)
        if not scanjobs : #scan already processed: nothing to queue for it
            self.varsP.updatePipeReport("Device.getTargetJobs: skipping path "+scan.nameStr()+"\n")
        else :
            allJobs += scanjobs
    return allJobs
def __init__(self, varsP) :
    """De-res the reference with RefAligner (-mres) for SV detect.
    Runs a single RefAligner -merge job synchronously; on success stores the
    de-resed cmap path in varsP.refDeresed, on failure logs an error (SV detect
    is then effectively disabled by the missing refDeresed).
    """
    jobName = "reference_process"
    opta_section = "referenceSvdetect"
    default_mres = "2.9" #fallback -mres value when optargs lacks one
    mres = "-mres"
    self.varsP = varsP
    usedefault = False
    if self.varsP.argData.has_key(opta_section) : #check if in optargs
        opta = self.varsP.argsListed(opta_section)
        if not mres in opta : #must have mres
            self.varsP.updatePipeReport("Warning in referenceProcess: "+mres+" missing in optArguments section "+opta_section+"\n")
            usedefault = True
    else :
        self.varsP.updatePipeReport("Warning in referenceProcess: optArguments section "+opta_section+" missing\n")
        usedefault = True
    if usedefault :
        opta = [mres, default_mres]
    mresstr = opta[opta.index(mres)+1] #get string for mres value for output name
    mresstr = mresstr.replace(".","") #e.g. "2.9" -> "29" for use in a filename
    if not util.checkDir(self.varsP.refFolder) :
        self.varsP.updatePipeReport( "ERROR in referenceProcess: could not make output dir %s\n" % self.varsP.refFolder )
        return None
    refpref = os.path.basename(self.varsP.ref[:self.varsP.ref.rfind(".")]) + "_res" + mresstr
    outarg = os.path.join(self.varsP.refFolder, refpref) #refFolder is new output folder for this job
    expectedResultFile = outarg+".cmap" #if ref is spots, is this spots?
    args = [self.varsP.RefAlignerBin, '-o', outarg, '-i', self.varsP.ref, '-f', '-merge'] + opta
    stdoutf = None
    if self.varsP.stdoutlog : #capture RefAligner stdout/stderr
        args.extend( ['-stdout', '-stderr'] )
        stdoutf = outarg+".stdout"
    args += ['-maxthreads', str(self.varsP.nThreads)]
    super(referenceProcess, self).__init__(self.varsP, jobName, clusterArgs=self.varsP.getClusterArgs("assembly"))
    job = mthread.singleJob(args, jobName, expectedResultFile, jobName, maxThreads=self.varsP.nThreads, clusterLogDir=self.varsP.clusterLogDir, expectedStdoutFile=stdoutf)
    self.addJob(job)
    util.LogStatus("progress", "stage_start", jobName)
    self.varsP.runJobs(self, "referenceProcess") #blocks until the job finishes
    self.doAllPipeReport()
    if not self.allResultsFound() : #this is an error, but we'll continue processing without SV detect
        err = "ERROR in referenceProcess: job failed, disabling SV detect"
        self.varsP.updatePipeReport( err+"\n" )
        util.LogError("error", err)
        #self.varsP.runSV = False #no need since this class is used in SVModule
    else :
        self.varsP.refDeresed = expectedResultFile #store good result for SV detect
        self.varsP.updatePipeReport( "referenceProcess: using reference %s for svdetect\n" % self.varsP.refDeresed )
    #NOTE(review): placed unconditionally to pair with stage_start above -- confirm original indentation
    util.LogStatus("progress", "stage_complete", jobName)
def mergeMap(varsP, outFileList, mergepath) : """outFileList is list of path+prefixes--each should have a .map file: merge them to a merged .map file in dir mergepath.""" outFileList.sort() #sort to ensure reproducibility (order of entries) maplist = [] for outpath in outFileList : #these are file prefixes if util.checkFile(outpath+".map") : maplist.append(outpath+".map") elif varsP : varsP.updatePipeReport("Warning in AlignModule.mergeMap: missing map: "+outpath+".map"+"\n") else : print "Warning in AlignModule.mergeMap: missing map: "+outpath+".map"+"\n" if not len(maplist) : #nothing to merge return if not util.checkDir(mergepath) : varsP.updatePipeReport("Warning in AlignModule.mergeMap: merge path invalid: "+mergepath+"\n") return headstart = ["#", "S", "M"] #last two lines of header start with "Software" and "MappedMoleculeId" #header = "" headerdone = False #data = "" lineno = 1 #can't just append: need to change index in first column sep = "\t" mappref = getMergeFilename(outFileList[0]) #also in getAlignStats mrgstr = (varsP.alignMolvrefMergeName if varsP else "merge") #same for vref and not outpath = os.path.join(mergepath, mappref+mrgstr+".map") f1 = open(outpath, 'w') for path in maplist : f = open(path) for line in f : if line[0] in headstart and not headerdone : #header += line f1.write(line) elif line[0] not in headstart : tokens = line.split() tokens[0] = str(lineno) #data += sep.join(tokens)+"\n" #newline was stripped by split f1.write(sep.join(tokens)+"\n") lineno += 1 headerdone = True f.close() #f1.write(header+data) f1.close()
def __init__(self, varsP) :
    """Collect existing characterization results for the alignref stage.
    Stores the prefix (path without ".err") of the first .err file found in the
    characterize dir, and sets varsP.totAssemblyLenMb from the latest merged cmap."""
    self.curCharacterizeFileRoots = []
    self.varsP = varsP #bc Characterize uses this for totAssemblyLenMb
    #this is problematic for bypass (because mergeIntoSingleCmap isn't called)--don't need it
    #if not len(varsP.curCharacterizeCmaps) : #need this, set in mergeIntoSingleCmap
    #    return
    #ccc = varsP.curCharacterizeCmaps[0]
    #outFileName = os.path.split(ccc)[1].replace(".cmap", "")
    #outfile = os.path.join(varsP.contigAlignTarget,outFileName) #WRONG bc contigAlignTarget is wrong...try this
    outdir = os.path.join(varsP.outputContigFolder, self.varsP.characterizeDirName) #'alignref'
    if not util.checkDir(outdir, makeIfNotExist=False) : #if this doesn't exist, we can't get what we need
        return
    outfile = None
    for qfile in os.listdir(outdir) :
        if qfile.endswith(".err") : #just take first .err file
            outfile = qfile
            break
    if not outfile : #if no .err files found, give up
        return
    outfile = os.path.join(outdir, outfile.replace(".err","")) #strip suffix: keep the job's output prefix
    self.curCharacterizeFileRoots.append(outfile)
    #also want to get varsP.totAssemblyLenMb
    self.varsP.totAssemblyLenMb = mapClasses.multiCmap(varsP.latestMergedCmap, lengthonly=True).totalLength / 1e6
def runAlignMol() :
    """Standalone driver: align molecules (-b) to a merged cmap (-q), or merge an
    existing Pipeline alignmol dir. Parses command-line args, validates all
    inputs, builds a minimal varsPipeline, then delegates to AlignModule.
    Exits via sys.exit(1) on any argument/validation error."""
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('-q', dest='queryDir', help='Path to merged cmap to align molecules (-b) to OR alignmol dir from Pipeline for merge (if latter, no alignments are performed), required', type=str)
    parser.add_argument('-b', dest='bnx', help='Input molecule (.bnx) file, required if aligning molecules', type=str)
    #parser.add_argument('-b', dest='bnx', help='Input molecule (.bnx) file OR path to dir containing split bnx pieces, required if aligning molecules', type=str) #I should add the split feature; for now, just do single bnx
    parser.add_argument('-a', dest='optArguments', help='Path to optArguments.xml (optional, default optArguments_human.xml in Pipeline dir if found, otherwise required)', default="", type=str)
    parser.add_argument('-r', help='If this flag is used, alignmolvref arguments are used, otherwise alignmol arguments are used (default alignmol; optional)', dest='ref', action='store_true')
    parser.add_argument('-o', dest='outputDir', help='output dir (optional, defaults to sub-dir of input map dir called "alignmol")', default="", type=str)
    parser.add_argument('-t', dest='RefAligner', help='Path to RefAligner or dir containing it (required)', type=str)
    parser.add_argument('-T', dest='numThreads', help='Total number of threads (cores) to use (optional, default 4)', default=4, type=int)
    parser.add_argument('-j', dest='maxthreads', help='Threads per Job, -maxthreads (non-cluster only;optional, default 4)', default=4, type=int)
    parser.add_argument('-e', dest='errFile', help='.err file to use for noise parameters--will supersede noise parameters in the optArgument supplied (but that file must still be supplied for non-noise parameters)--should be from autoNoise', default="", type=str)
    parser.add_argument('-E', dest='errbinFile', help='.errbin file to use for noise parameters--will supersede noise parameters in the optArgument supplied (but that file must still be supplied for non-noise parameters)--should be from autoNoise', default="", type=str)
    parser.add_argument('-p', dest='pipelineDir', help='Pipeline dir (optional, defaults to script dir, or current directory)', default="", type=str)
    parser.add_argument('-v', dest='pvalue', help='Alignment pvalue', default="1e-12")
    result = parser.parse_args()
    outprefix = "exp_refineFinal1" #this is the default; assume for now
    #check all Pipeline dependencies
    if result.pipelineDir :
        cwd = result.pipelineDir
    else :
        cwd = os.path.split(os.path.realpath(__file__))[0] #this is path of this script
        if not os.path.isfile(os.path.join(cwd,"utilities.py")) : #if still not here, last try is actual cwd
            cwd = os.getcwd() #still check this below
    #this is the only one imported here and in runCharacterize
    if not os.path.isfile(os.path.join(cwd,"utilities.py")):
        print "ERROR: utilities.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import utilities as util
    if not os.path.isfile(os.path.join(cwd,"AlignModule.py")):
        print "ERROR: AlignModule.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import AlignModule as alignmod
    if not util.checkFile(os.path.join(cwd,"Pipeline.py")):
        print "ERROR: Pipeline.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import Pipeline
    #input dir: a dir means merge-only; a cmap means run alignments
    if not result.queryDir :
        print "ERROR: Query (-q) argument not supplied."
        sys.exit(1)
    qrypath = os.path.realpath(result.queryDir)
    if util.checkDir(qrypath, checkWritable=False, makeIfNotExist=False) : #output elsewhere so not writeable is ok
        runaligns = False
    elif util.checkCmap(qrypath) :
        runaligns = True
    else :
        print "ERROR: Query argument ("+qrypath+") not found or not a dir or cmap. Check -q argument."
        sys.exit(1)
    #this check isn't really necessary...make it a warning -- left over from runAlignMerge.py
    #if not os.path.split(qrypath)[1].endswith("alignmol") :
    #    print "Warning: Query dir ("+qrypath+") does not end with 'alignmol'; please be sure this is a Pipeline alignmol dir\n"
    #RefAligner -- check for either path to RefAligner, or dir containing it, depending on cluster args
    rabin = "" #need empty string for generateJobList even though no jobs are run
    if runaligns :
        rabin = result.RefAligner
        #replicate Pipeline behavior: RefAligner is always required
        if os.path.isdir(rabin) :
            rabin = os.path.join(rabin, "RefAligner")
        if not util.checkExecutable(rabin):
            print "ERROR: RefAligner not found or not executable at", rabin, "\nPlease supply RefAligner dir or full path as -t arg."
            sys.exit(1)
    #optargs file
    optargs = None
    if runaligns and result.optArguments : #supplied on command line
        optargs = result.optArguments
        if not util.checkFile(optargs, ".xml") :
            print "optArguments path is supplied ("+optargs+") but not found or doesn't end in .xml, check -a argument."
            sys.exit(1)
    elif runaligns : #load from Pipeline dir if running alignments
        optargs = os.path.join(cwd,"optArguments_human.xml")
        if not util.checkFile(optargs):
            print "optArguments.xml missing in Pipeline directory ("+cwd+"). Try supplying path explicitly using -a."
            sys.exit(1)
    #output dir
    if not result.outputDir :
        outdir = os.path.join(qrypath, "merge") #should be same as in AlignModule
    else :
        outdir = os.path.realpath(result.outputDir)
    if os.path.isdir(outdir) :
        if not util.checkDir(outdir) : #check writeable
            print "\nERROR: Output dir is not writeable:\n", outdir, "\n"
            sys.exit(1)
        #this is ok here
        #elif outdir == contigdir :
        #    print "\nERROR: Output dir cannot be same as input dir:\n", outdir, "\n"
        #    sys.exit(1)
        print "\nWARNING: Output dir already exists, results will be overwritten:\n", outdir, "\n"
    elif not util.checkDir(outdir) : #does not exist, make, if False, can't make or not writeable
        print "\nERROR: Output dir cannot be created or is not writeable:\n", outdir, "\n"
        sys.exit(1)
    #bnx file
    bnxfile = result.bnx
    if bnxfile : #must check for empty string BEFORE you do realpath, or it returns cwd
        bnxfile = os.path.realpath(bnxfile)
        if not util.checkFile(bnxfile, ".bnx") :
            print "ERROR: bnx file supplied but not found or incorrect suffix:", bnxfile
            sys.exit(1)
    elif runaligns :
        print "ERROR: bnx file not supplied but running alignments; please supply bnx file as -b argument"
        sys.exit(1)
    #nthreads
    nthreads = result.numThreads
    if nthreads <= 0 :
        print "ERROR: Number of threads value invalid (must be > 0): %i" % nthreads
        sys.exit(1)
    #maxthreads
    maxthreads = result.maxthreads
    if maxthreads <= 0 :
        print "ERROR: Max threads value invalid (must be > 0): %i" % maxthreads
        sys.exit(1)
    elif nthreads < maxthreads :
        print "Warning: num threads (-T: %i) < max threads (-j: %i): increasing num threads to equal max threads\n" % (nthreads, maxthreads)
        nthreads = maxthreads
    #pvalue
    if result.pvalue : #supplied on command line
        pvalue = result.pvalue
    else :
        pvalue = "1e-12"
    #.errbin file
    errbinfile = result.errbinFile
    if errbinfile :
        errbinfile = os.path.realpath(result.errbinFile)
        if not util.checkFile(errbinfile, ".errbin") :
            print "ERROR: errbin file supplied but not found or incorrect suffix:", errbinfile
            sys.exit(1)
    #.err file (.errbin takes precedence over .err when both supplied)
    errfile = result.errFile
    if errfile and errbinfile :
        print "Warning: .err and .errbin arguments supplied; ignoring .err file"
        errfile = ""
    elif errfile :
        errfile = os.path.realpath(result.errFile)
        if not util.checkFile(errfile, ".err") :
            print "err file supplied but not found or incorrect suffix:", errfile
            sys.exit(1)
    if errfile and not util.checkFile(os.path.join(cwd,"SampleCharModule.py")):
        print "SampleCharModule.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    elif errfile :
        import SampleCharModule as scm
    doref = result.ref
    #DONE checking arguments
    print "Using output dir", outdir
    if runaligns :
        print "Aligning", bnxfile, "\nTo", qrypath, "\n"
    else :
        print "Merging", qrypath, "\n"
    startTime = time.time() #time since Epoch
    memory_log = os.path.join(outdir, "memory_log.txt")
    util.initMemoryLog(memory_log)
    varsP = Pipeline.varsPipeline()
    varsP.RefAlignerBin = rabin
    varsP.contigFolder = "" #not used but needs to be an attr
    varsP.outputContigFolder = "" #not used but needs to be a string attr
    varsP.pipeReportFile = os.path.join(outdir, "alignmol_jobs_log.txt")
    varsP.infoReportFile = os.path.join(outdir, "alignmol_log.txt")
    util.InitStatus( os.path.join(outdir, "status.xml") )
    if runaligns :
        varsP.optArgumentsFileIn = optargs
        varsP.latestMergedCmap = qrypath #if !doref, need this one
        varsP.ref = qrypath #and if doref, need this one
        varsP.nThreads = nthreads #necessary otherwise job won't start -- max threads per node
        varsP.maxthreads = maxthreads #threads per job
        p = os.path.split(qrypath)[1]
        varsP.outputContigPrefix = p[:p.rfind(".")] #filename prefix
        varsP.stdoutlog = True #use -stdout -stderr
        varsP.memoryLogpath = os.path.join(outdir, "memory_log.txt")
        varsP.parseArguments() #parses optArgumentsFile
        varsP.replaceParam("alignmol", "-T", pvalue)
        varsP.checkDependencies()
        varsP.RefAlignerBinOrig = rabin
        varsP.prerunLog() #general information in log -- needed for refaligner_version
        noisep = {}
        if errbinfile :
            noisep = {"readparameters": errbinfile}
            #print "Using noise parameters from "+errbinfile+"\n" #move below
        elif errfile :
            noisep = scm.readNoiseParameters(errfile.replace(".err",""))
            if noisep.has_key('readparameters') : #remove this because it's redundant, and it can cause problems with RefAligner compatibility
                del noisep['readparameters']
            if not noisep : #readNoiseParameters returns empty dict on failure
                print "ERROR reading noise parameters, check .err file:", errfile
                sys.exit(1)
            #redundant with below?
            print "Using noise parameters from "+errfile+":\n" + " ".join(["-"+str(k)+" "+str(v) for k,v in noisep.iteritems()])+"\n"
        #some code from SampleCharModule to load args into noise0
        infoReport="Loaded noise parameters:\n"
        klist = ["FP", "FN", "sf", "sd", "sr", "bpp", "readparameters"] #hardcoding parameters is kind of bad, but it fixes the order without using OrderedDict.
        #noiseargs = self.varsP.argsListed('noise0') #not necessary
        for v in klist :
            if not noisep.has_key(v) :
                continue
            param=str(noisep[v])
            util.LogStatus("parameter", "auto_"+v, param)
            infoReport+=v+":"+param+"\n"
            varsP.replaceParam("noise0", "-"+v, param)
        varsP.updateInfoReport(infoReport + '\n', printalso=True)
    else :
        print "Getting file list from", qrypath
        outFileList = getOutFileList(util, qrypath)
        if not outFileList :
            print "ERROR: Query dir ("+qrypath+") does not contain alignmol data. Check -q argument."
            sys.exit(1)
        else :
            print "Found", len(outFileList), "alignment results"
    #end if runaligns
    amod = alignmod.AlignModule(varsP, doref, outdir, bnxfile) #constructor will call generateJobList
    if runaligns :
        amod.runJobs()
        amod.checkResults()
    else :
        amod.outFileList = outFileList
        p = os.path.split(outFileList[0])[1]
        if p.count("_") > 1 : #expect something like "EXP_REFINEFINAL1_4"
            #p = p[:p.rfind("_")+1] #remove integer suffix
            p = p[:p.rfind("_")] #remove integer suffix (and underscore)
        #else :
        #    p += "_" #because mrgstr is appended
        varsP.outputContigPrefix = p
    if not runaligns or len(amod.jobList) > 0 :
        amod.getAlignStats()
    if runaligns :
        print
        #copy from Pipeline.py
        if util.SummarizeErrors(varsP=varsP)==0:
            varsP.updatePipeReport("Pipeline has successfully completed\n")
            util.LogStatus("progress", "pipeline", "success")
        else:
            varsP.updatePipeReport("Pipeline has completed with errors\n")
            util.LogStatus("progress", "pipeline", "failure")
    #BELOW OLD CODE
    #NOTE(review): everything after this return is unreachable legacy code; it also
    #references an undefined name 'outputdir' (the live variable is 'outdir') and
    #would raise NameError if ever re-enabled -- candidate for deletion
    return
    #in Pipeline, this is called first
    #print "Calling getAlignStats:" #but it won't work without varsP atm; skip it
    #getAlignStats(self.varsP, self.outFileList, self.varsP.totAssemblyLenMb, isref=False, mergepath=self.mergedir)
    #getAlignStats(self.varsP, self.outFileList, self.varsP.totAssemblyLenMb, isref=False, mergepath=self.mergedir)
    print "Calling mergeMap"
    print outFileList[0] #, "\n", outputdir
    #moved above
    util.logMemory(memory_log, startTime, "mergeMap_start")
    #mergeMap(self.varsP, self.outFileList, mergepath=self.outputdir) #varsP is optional
    alignmod.mergeMap(None, outFileList, outputdir)
    util.logMemory(memory_log, startTime, "mergeMap_end")
    print "Calling mergeRcmaps"
    util.logMemory(memory_log, startTime, "mergeRcmaps_start")
    #mergeRcmaps(outFileList, outdir, varsP=None, splitByContig=None, stageName="alignmol") :
    alignmod.mergeRcmaps(outFileList, outputdir, splitByContig=True, stageName=outprefix)
    util.logMemory(memory_log, startTime, "mergeRcmaps_end")
    print "Calling split_XMap_byContig" #split_XMapQcmap_byContig
    util.logMemory(memory_log, startTime, "split_XMap_byContig_start")
    #xmapdict = alignmod.split_XMap_byContig(outFileList, outputdir, stageName=outprefix) #old
    xmapdict = alignmod.split_XMap_byContig_new(outFileList, outputdir, stageName=outprefix)
    util.logMemory(memory_log, startTime, "split_XMap_byContig_end")
    print "Calling split_Qcmap_byContig"
    util.logMemory(memory_log, startTime, "split_Qcmap_byContig_start")
    #alignmod.split_Qcmap_byContig(outFileList, outputdir, xmapdict) #old
    alignmod.split_Qcmap_byContig_new(outFileList, outputdir, xmapdict, stageName=outprefix) #new: better performance
    util.logMemory(memory_log, startTime, "split_Qcmap_byContig_end")
    print "AlignMerge successfully completed"
def getAlignStats(varsP, outFileList, reflen=0, isref=False, mergepath="", bnxpath=None):
    '''Standalone fn for alignment statistics for both AlignModule and AlignRefModule.

    varsP       : pipeline state object; used for logging (updatePipeReport/updateInfoReport)
                  and for sorted_file / argData / alignMolvrefMergeName lookups.
    outFileList : list of alignment job output file *prefixes* (".xmap"/".err" are appended).
    reflen      : reference/assembly length in Mb; enables coverage lines when nonzero.
    isref       : True when aligning to a reference (affects labels only).
    mergepath   : if supplied, a merged .err (averaged noise parameters) is written there.
    bnxpath     : if None, use varsP.sorted_file + ".bnx"; otherwise report stats of this
                  file only and skip the .xmap/.err processing (statonly mode).

    NOTE(review): this file contains two identical definitions of getAlignStats; in Python
    the later definition silently shadows the earlier one.
    '''
    statonly = False  #bnx stats only (set when an explicit bnxpath is given)
    skipbnx = False  #.err file processing only (no molecule bnx available)
    if bnxpath == None:
        if not varsP.sorted_file:  #for runAlignMol, this is empty: nothing to do in this case
            skipbnx = True
        else:
            bnxpath = varsP.sorted_file + ".bnx"  #set in PairwiseModule.sort_BNX even if bypassed, but needs suffix
    else:  #bnxpath explicitly supplied
        statonly = True
    if not skipbnx and not util.checkFile(bnxpath):
        varsP.updatePipeReport("Warning in AlignModule.getAlignStats: bnxpath supplied but not found: %s\n" % bnxpath)
        return

    #find the minlen used for bnx_sort, which is a required arg set
    sortargs = []
    if varsP.argData.has_key('bnx_sort'):  #for runAlignMol.py the section may be absent
        sortargs = varsP.argsListed('bnx_sort')
    minlen = 0
    validminlen = False
    if "-minlen" in sortargs:
        #next ele should be the len; if next ele isn't in list, the sort job will fail
        minlen = sortargs[sortargs.index("-minlen") + 1]
        minlen = util.getIntFromString(minlen)  #returns None if can't cast to int
        if minlen:
            validminlen = True
    if not validminlen and bnxpath == None and sortargs:
        varsP.updatePipeReport("Warning in AlignModule.getAlignStats: unable to obtain minlen from bnx_sort arguments; defaulting to 0\n")
    if bnxpath != None:  #if bnxpath, ignore minlen
        minlen = 0

    nmol = 0  #total n mol above minlen
    totlen = 0  #total mol len above minlen (Mb -- see coverage computation below)
    if util.checkFile(bnxpath):
        #the bnxfile class is very wasteful; use simpleBnxStats instead
        #bnx = util.bnxfile(bnxpath, [minlen]) #second arg are minlen thresholds
        outstr = "Reading molecule stats from %s:\n" % bnxpath
        outstr += "Molecule Stats:\n"
        moldict = util.simpleBnxStats(bnxpath, minlen)
        nmol = moldict["nmol"]
        totlen = moldict["totlen"]
        #same report for isref or not; may print twice since there is no easy way to
        #tell whether it was printed previously
        outstr += "N mols: %i\n" % nmol
        outstr += ("Total len (Mb): %10.3f\n") % totlen
        outstr += ("Avg len (kb) : %10.3f\n") % moldict["avglen"]
        outstr += ("Mol N50 (kb) : %10.3f\n") % moldict["n50"]
        outstr += ("Lab (/100kb) : %10.3f\n") % moldict["labdensity"]
        #"Genome Cov" line disabled: redundant with Ref Cov below
        if reflen:
            cov = totlen / reflen  #totlen is in Mb
            outstr += ("%-6s Cov (x): %10.3f\n") % ("Ref" if isref else "Contig", cov)
        if isref or reflen or statonly:  #if none of these, nothing to print
            varsP.updateInfoReport(outstr + "\n", printalso=True)
    elif not skipbnx:
        varsP.updatePipeReport("Warning in AlignModule.getAlignStats: missing bnx path:" + bnxpath + "\n")
    if statonly:
        return

    #lastly, load .xmaps and .errs from alignmol jobs and report on stats
    totmaplen = 0  #sum of lengths of mapped portions of all molecules, on reference (kb)
    totmapqrylen = 0  #sum of lengths of mapped portions of all molecules, on query (kb)
    totconf = 0  #sum of confidence of all alignments
    nalign = 0  #total number of alignments
    fplist = []  #lists of per-job noise/error rates gathered from .err files
    fprlist = []
    fnlist = []
    bpplist = []
    nmaplist = []  #from .err
    gmaplist = []  #from .err
    llrmlist = []
    llrgmlist = []
    bppsdlist = []
    sflist = []
    sdlist = []
    srlist = []
    reslist = []
    resdlist = []
    header = ""
    err = None  #will be the alignParams object if any .err files are found
    mappref = ""
    if len(outFileList) > 0:
        #getMergeFilename: same naming convention as mergeMap
        mappref = getMergeFilename(outFileList[0])
    for outpath in outFileList:  #these are file prefixes
        if util.checkFile(outpath + ".xmap"):
            xmap = mc.xmap(outpath + ".xmap")
            nalign += len(xmap.xmapLookup)
            totmaplen += xmap.getSumMappedRefLen()  #in kb
            totmapqrylen += xmap.getSumMappedQryLen()  #in kb
            totconf += sum([x.Confidence for x in xmap.xmapLookup.values()])
        else:
            varsP.updatePipeReport("Warning in AlignModule.getAlignStats: missing xmap:" + outpath + ".xmap" + "\n")
        if util.checkFile(outpath + ".err"):
            err = mc.alignParams(outpath + ".err")
            if not header:
                header = err.header
            fplist.append(err.fp)
            fprlist.append(err.fprate)
            fnlist.append(err.fn)
            bpplist.append(err.bpp)
            reslist.append(err.res)
            nmaplist.append(err.nmaps)
            gmaplist.append(err.goodmaps)
            llrmlist.append(err.llrm)
            llrgmlist.append(err.llrgm)
            bppsdlist.append(err.bppsd)
            sflist.append(err.sf)
            sdlist.append(err.sd)
            srlist.append(err.sr)
            resdlist.append(err.ressd)

    #nalign from xmap should be the same as goodmaps from .err
    sumgoodmaps = sum(gmaplist)
    if sumgoodmaps != nalign:
        varsP.updateInfoReport("Warning in getAlignStats: n mol align differ in .err files (%i) and .xmaps (%i)\n" % (sumgoodmaps, nalign), printalso=True)
    if totmaplen or totconf or nalign:
        outstr = "Molecules Aligned to %s:\n" % ("Reference" if isref else "Assembly")
        outstr += "N mol align : %9i\n" % nalign
        outstr += "Mol fraction align: %13.3f\n" % (float(nalign) / nmol if nmol else 0)
        outstr += "Tot align len (Mb): %11.1f\n" % (totmapqrylen / 1e3)  #kb -> Mb
        if reflen > 0:
            outstr += ("Effective Cov (x) : %13.3f\n") % (totmaplen / 1e3 / reflen)  #totmaplen is in kb
        outstr += "Avg align len (kb): %11.1f\n" % (totmapqrylen / nalign if nalign else 0)
        outstr += "Fraction align len: %13.3f\n" % (totmapqrylen / 1e3 / totlen if totlen else 0)  #totmapqrylen is in kb, totlen is in Mb
        outstr += "Tot confidence : %11.1f\n" % totconf
        outstr += "Avg confidence : %11.1f\n" % (totconf / nalign if nalign else 0)
        varsP.updateInfoReport(outstr, printalso=True)

    #per-parameter averages across all .err files (0 when no .err was read)
    avgfp = (sum(fplist) / len(fplist) if len(fplist) else 0)
    avgfpr = (sum(fprlist) / len(fprlist) if len(fprlist) else 0)
    avgfn = (sum(fnlist) / len(fnlist) if len(fnlist) else 0)
    avgbpp = (sum(bpplist) / len(bpplist) if len(bpplist) else 0)
    avgres = (sum(reslist) / len(reslist) if len(reslist) else 0)
    avgllr = (sum(llrmlist) / len(llrmlist) if len(llrmlist) else 0)
    avgllg = (sum(llrgmlist) / len(llrgmlist) if len(llrgmlist) else 0)
    avgbps = (sum(bppsdlist) / len(bppsdlist) if len(bppsdlist) else 0)
    avgsf = (sum(sflist) / len(sflist) if len(sflist) else 0)
    avgsd = (sum(sdlist) / len(sdlist) if len(sdlist) else 0)
    avgsr = (sum(srlist) / len(srlist) if len(srlist) else 0)
    avgrsd = (sum(resdlist) / len(resdlist) if len(resdlist) else 0)
    if avgfp or avgfn or avgbpp:
        outstr = "Avg FP(/100kb) : %12.2f\n" % avgfp
        outstr += "Avg FP ratio : %13.3f\n" % avgfpr
        outstr += "Avg FN ratio : %13.3f\n" % avgfn
        outstr += "Avg bpp : %11.1f\n" % avgbpp
        outstr += "Avg sf : %13.3f\n" % avgsf
        outstr += "Avg sd : %13.3f\n" % avgsd
        outstr += "Avg sr : %13.3f\n" % avgsr
        varsP.updateInfoReport(outstr + "\n", printalso=True)
    if err and mergepath:  #have an error file (alignParams) object: write averaged .err
        util.checkDir(mergepath)
        mrgstr = (varsP.alignMolvrefMergeName if varsP else "merge")
        outpath = os.path.join(mergepath, mappref + mrgstr + ".err")
        #overwrite the last-read alignParams object with the averages, then persist
        err.fp = avgfp
        err.fn = avgfn
        err.sf = avgsf
        err.sd = avgsd
        err.bpp = avgbpp
        err.res = avgres
        err.nmaps = sum(nmaplist)
        err.llrm = avgllr
        err.goodmaps = sumgoodmaps
        err.llrgm = avgllg
        err.bppsd = avgbps
        err.fprate = avgfpr
        err.sr = avgsr
        err.ressd = avgrsd
        err.writeToFile(outpath)
def __init__(self, varsP):
    """Build and immediately run the reference de-res job for SV detect.

    Reads the '-mres' value from the 'referenceSvdetect' optArguments section
    (falling back to a default of 2.9 with a warning), runs RefAligner with
    '-merge' on varsP.ref, and on success stores the de-resed reference path in
    varsP.refDeresed for use by SV detect. On job failure only a warning/error
    is logged and processing continues without SV detect.

    varsP : pipeline state object (arguments, paths, job runner, logging).
    """
    jobName = "reference_process"
    opta_section = "referenceSvdetect"
    default_mres = "2.9"  #fallback -mres value when not found in optArguments
    mres = "-mres"
    self.varsP = varsP
    usedefault = False
    if self.varsP.argData.has_key(opta_section):  #check if section present in optargs
        opta = self.varsP.argsListed(opta_section)
        if not mres in opta:  #must have mres
            self.varsP.updatePipeReport("Warning in referenceProcess: " + mres + " missing in optArguments section " + opta_section + "\n")
            usedefault = True
    else:
        self.varsP.updatePipeReport("Warning in referenceProcess: optArguments section " + opta_section + " missing\n")
        usedefault = True
    if usedefault:
        opta = [mres, default_mres]
    mresstr = opta[opta.index(mres) + 1]  #get string for mres value for output name
    mresstr = mresstr.replace(".", "")  #e.g. "2.9" -> "29" for use in a filename
    if not util.checkDir(self.varsP.refFolder):
        self.varsP.updatePipeReport("ERROR in referenceProcess: could not make output dir %s\n" % self.varsP.refFolder)
        #NOTE(review): early return aborts construction before super().__init__ is
        #called -- any later use of this instance's job-module API would likely fail;
        #confirm callers tolerate this path.
        return None
    #output prefix: reference basename without extension, plus the mres tag
    refpref = os.path.basename(self.varsP.ref[:self.varsP.ref.rfind(".")]) + "_res" + mresstr
    outarg = os.path.join(self.varsP.refFolder, refpref)  #refFolder is new output folder for this job
    expectedResultFile = outarg + ".cmap"  #if ref is spots, is this spots?
    args = [self.varsP.RefAlignerBin, '-f', '-o', outarg, '-i', self.varsP.ref, '-merge'] + opta
    stdoutf = None
    if self.varsP.stdoutlog:  #capture RefAligner stdout/stderr next to the output
        args.extend(['-stdout', '-stderr'])
        stdoutf = outarg + ".stdout"
    args += ['-maxthreads', str(self.varsP.nThreads)]
    super(referenceProcess, self).__init__(self.varsP, jobName, clusterArgs=self.varsP.getClusterArgs("assembly"))
    job = mthread.singleJob(args, jobName, expectedResultFile, jobName, maxThreads=self.varsP.nThreads, clusterLogDir=self.varsP.clusterLogDir, expectedStdoutFile=stdoutf)
    self.addJob(job)
    util.LogStatus("progress", "stage_start", jobName)
    self.varsP.runJobs(self, "referenceProcess")  #runs synchronously within __init__
    self.doAllPipeReport()
    if not self.allResultsFound():  #an error, but continue processing without SV detect
        err = "ERROR in referenceProcess: job failed, disabling SV detect"
        self.varsP.updatePipeReport(err + "\n")
        util.LogError("error", err)
        #self.varsP.runSV = False #no need since this class is used in SVModule
    else:
        self.varsP.refDeresed = expectedResultFile  #store good result for SV detect
        self.varsP.updatePipeReport("referenceProcess: using reference %s for svdetect\n" % self.varsP.refDeresed)
        util.LogStatus("progress", "stage_complete", jobName)
def mergeRcmaps(outFileList, outdir, varsP=None, splitByContig=None, stageName=""):
    """Merge per-job _r.cmap files into outdir.

    outFileList entries may be bare file prefixes (``_r.cmap`` is appended) or
    full paths already ending in ``_r.cmap``. If splitByContig < 1, write one
    cmap per contig; if == 1, write only the single merged cmap; if > 1, write
    both. The output file prefix is stageName when supplied, otherwise
    varsP.outputContigPrefix (so one of the two must be provided for a
    non-empty prefix). Warnings/reports go through logOrPrintError (varsP log
    if supplied, stdout otherwise).
    """
    if not util.checkDir(outdir):
        logOrPrintError("Warning in AlignModule.mergeRcmaps: could not make outdir %s, skipping copy number" % outdir, varsP)
        return
    if not outFileList:  #argument sanity only; on-disk presence is verified below
        logOrPrintError("Warning in AlignModule.mergeRcmaps: no maps supplied", varsP)
        return

    #sort in place for reproducibility with runAlignMerge.py (dir listing order varies)
    outFileList.sort()
    rsuf = "_r.cmap"

    #even though every entry should exist, a job may have failed -- keep only
    #the files actually present on disk
    found = []
    for entry in outFileList:
        path = entry if entry.endswith(rsuf) else entry + rsuf  #accept either form
        if util.checkFile(path):
            found.append(path)
        else:
            logOrPrintError("Warning in AlignModule.mergeRcmaps: missing _r.cmap %s" % path, varsP)
    if not found:  #also covers an outFileList of nothing but missing files
        logOrPrintError("Warning in AlignModule.mergeRcmaps: no _r.cmaps found, skipping copy number", varsP)
        return
    outFileList = found  #every entry now carries the _r.cmap suffix

    mrgstr = varsP.alignMolvrefMergeName if varsP else "merge"

    #open the first map, then fold coverage/occurrence of the rest into it in memory
    mergedmap = mc.multiCmap(outFileList[0])
    for extra in outFileList[1:]:  #skip map 0: don't add it to itself
        if mergedmap.addCovOcc(mc.multiCmap(extra)):  #True return signals failure
            logOrPrintError("Warning in AlignModule.mergeRcmaps: addCovOcc call failed for map %s" % extra, varsP)

    #output file prefix -- same convention as split_XMapQcmap_byContig
    filepref = varsP.outputContigPrefix if varsP and stageName == "" else stageName
    if splitByContig < 1 or splitByContig > 1:  #per-contig output
        mergedmap.writeAllMapsToDisk(os.path.join(outdir, filepref + '_contig'), outsuf="_r")
        report = "mergeRcmaps: wrote %i cmaps" % len(mergedmap.cmapdict)
    if splitByContig > 0:  #single merged output
        mergedmap.writeToFile(os.path.join(outdir, filepref + "_" + mrgstr + rsuf))
        report = "mergeRcmaps: wrote merged cmap with %i contigs" % len(mergedmap.cmapdict)
    logOrPrintError(report, varsP, warn=False)
def getAlignStats(varsP, outFileList, reflen=0, isref=False, mergepath="", bnxpath=None) :
    '''Standalone fn for alignment statistics for both AlignModule and AlignRefModule.

    varsP       : pipeline state object; used for logging and for sorted_file /
                  argData / alignMolvrefMergeName lookups.
    outFileList : list of alignment job output file *prefixes* (".xmap"/".err" appended).
    reflen      : reference/assembly length in Mb; enables coverage lines when nonzero.
    isref       : True when aligning to a reference (affects labels only).
    mergepath   : if supplied, a merged .err (averaged noise parameters) is written there.
    bnxpath     : if None, use varsP.sorted_file + ".bnx"; otherwise report stats of this
                  file only and skip .xmap/.err processing (statonly mode).

    NOTE(review): this file contains two identical definitions of getAlignStats; in
    Python the later definition silently shadows the earlier one -- consider removing one.
    '''
    statonly = False  #bnx stats only (set when an explicit bnxpath is given)
    skipbnx = False  #.err file processing only (no molecule bnx available)
    if bnxpath == None :
        if not varsP.sorted_file :  #for runAlignMol, this is empty: nothing to do in this case
            skipbnx = True
        else :
            bnxpath = varsP.sorted_file+".bnx"  #set in PairwiseModule.sort_BNX even if bypassed, but needs suffix
    else :  #bnxpath explicitly supplied
        statonly = True
    if not skipbnx and not util.checkFile(bnxpath) :
        varsP.updatePipeReport("Warning in AlignModule.getAlignStats: bnxpath supplied but not found: %s\n" % bnxpath)
        return

    #find the minlen used for bnx_sort, which is a required arg set
    sortargs = []
    if varsP.argData.has_key('bnx_sort') :  #for runAlignMol.py the section may be absent
        sortargs = varsP.argsListed('bnx_sort')
    minlen = 0
    validminlen = False
    if "-minlen" in sortargs :
        #next ele should be the len; if next ele isn't in list, the sort job will fail
        minlen = sortargs[sortargs.index("-minlen")+1]
        minlen = util.getIntFromString(minlen)  #returns None if can't cast to int
        if minlen :
            validminlen = True
    if not validminlen and bnxpath == None and sortargs :
        varsP.updatePipeReport("Warning in AlignModule.getAlignStats: unable to obtain minlen from bnx_sort arguments; defaulting to 0\n")
    if bnxpath != None :  #if bnxpath, ignore minlen
        minlen = 0

    nmol = 0  #total n mol above minlen
    totlen = 0  #total mol len above minlen (Mb -- see coverage computation below)
    if util.checkFile(bnxpath) :
        #the bnxfile class is very wasteful; use simpleBnxStats instead
        #bnx = util.bnxfile(bnxpath, [minlen]) #second arg are minlen thresholds
        outstr = "Reading molecule stats from %s:\n" % bnxpath
        outstr += "Molecule Stats:\n"
        moldict = util.simpleBnxStats(bnxpath, minlen)
        nmol = moldict["nmol"]
        totlen = moldict["totlen"]
        #same report for isref or not; may print twice since there is no easy way to
        #tell whether it was printed previously
        outstr += "N mols: %i\n" % nmol
        outstr += ("Total len (Mb): %10.3f\n") % totlen
        outstr += ("Avg len (kb) : %10.3f\n") % moldict["avglen"]
        outstr += ("Mol N50 (kb) : %10.3f\n") % moldict["n50"]
        outstr += ("Lab (/100kb) : %10.3f\n") % moldict["labdensity"]
        #"Genome Cov" line disabled: redundant with Ref Cov below
        if reflen :
            cov = totlen / reflen  #totlen is in Mb
            outstr += ("%-6s Cov (x): %10.3f\n") % ("Ref" if isref else "Contig", cov)
        if isref or reflen or statonly :  #if none of these, nothing to print
            varsP.updateInfoReport(outstr + "\n", printalso=True)
    elif not skipbnx :
        varsP.updatePipeReport("Warning in AlignModule.getAlignStats: missing bnx path:"+bnxpath+"\n")
    if statonly :
        return

    #lastly, load .xmaps and .errs from alignmol jobs and report on stats
    totmaplen = 0  #sum of lengths of mapped portions of all molecules, on reference (kb)
    totmapqrylen = 0  #sum of lengths of mapped portions of all molecules, on query (kb)
    totconf = 0  #sum of confidence of all alignments
    nalign = 0  #total number of alignments
    fplist = []  #lists of per-job noise/error rates gathered from .err files
    fprlist = []
    fnlist = []
    bpplist = []
    nmaplist = []  #from .err
    gmaplist = []  #from .err
    llrmlist = []; llrgmlist = []; bppsdlist = []
    sflist = []; sdlist = []; srlist = []; reslist = []; resdlist = []
    header = ""
    err = None  #will be the alignParams object if any .err files are found
    mappref = ""
    if len(outFileList) > 0 :
        #getMergeFilename: same naming convention as mergeMap
        mappref = getMergeFilename(outFileList[0])
    for outpath in outFileList :  #these are file prefixes
        if util.checkFile(outpath+".xmap") :
            xmap = mc.xmap(outpath+".xmap")
            nalign += len(xmap.xmapLookup)
            totmaplen += xmap.getSumMappedRefLen()  #in kb
            totmapqrylen += xmap.getSumMappedQryLen()  #in kb
            totconf += sum([x.Confidence for x in xmap.xmapLookup.values()])
        else :
            varsP.updatePipeReport("Warning in AlignModule.getAlignStats: missing xmap:"+outpath+".xmap"+"\n")
        if util.checkFile(outpath+".err") :
            err = mc.alignParams(outpath+".err")
            if not header :
                header = err.header
            fplist.append(err.fp)
            fprlist.append(err.fprate)
            fnlist.append(err.fn)
            bpplist.append(err.bpp)
            reslist.append(err.res)
            nmaplist.append(err.nmaps)
            gmaplist.append(err.goodmaps)
            llrmlist.append(err.llrm)
            llrgmlist.append(err.llrgm)
            bppsdlist.append(err.bppsd)
            sflist.append(err.sf)
            sdlist.append(err.sd)
            srlist.append(err.sr)
            resdlist.append(err.ressd)

    #nalign from xmap should be the same as goodmaps from .err
    sumgoodmaps = sum(gmaplist)
    if sumgoodmaps != nalign :
        varsP.updateInfoReport("Warning in getAlignStats: n mol align differ in .err files (%i) and .xmaps (%i)\n" % (sumgoodmaps, nalign), printalso=True)
    if totmaplen or totconf or nalign :
        outstr = "Molecules Aligned to %s:\n" % ("Reference" if isref else "Assembly")
        outstr += "N mol align : %9i\n" % nalign
        outstr += "Mol fraction align: %13.3f\n" % (float(nalign)/nmol if nmol else 0)
        outstr += "Tot align len (Mb): %11.1f\n" % (totmapqrylen / 1e3)  #kb -> Mb
        if reflen > 0 :
            outstr += ("Effective Cov (x) : %13.3f\n") % (totmaplen / 1e3 / reflen)  #totmaplen is in kb
        outstr += "Avg align len (kb): %11.1f\n" % (totmapqrylen/nalign if nalign else 0)
        outstr += "Fraction align len: %13.3f\n" % (totmapqrylen/1e3/totlen if totlen else 0)  #totmapqrylen is in kb, totlen is in Mb
        outstr += "Tot confidence : %11.1f\n" % totconf
        outstr += "Avg confidence : %11.1f\n" % (totconf/nalign if nalign else 0)
        varsP.updateInfoReport(outstr, printalso=True)

    #per-parameter averages across all .err files (0 when no .err was read)
    avgfp = (sum(fplist)/len(fplist) if len(fplist) else 0)
    avgfpr = (sum(fprlist)/len(fprlist) if len(fprlist) else 0)
    avgfn = (sum(fnlist)/len(fnlist) if len(fnlist) else 0)
    avgbpp = (sum(bpplist)/len(bpplist) if len(bpplist) else 0)
    avgres = (sum(reslist)/len(reslist) if len(reslist) else 0)
    avgllr = (sum(llrmlist)/len(llrmlist) if len(llrmlist) else 0)
    avgllg = (sum(llrgmlist)/len(llrgmlist) if len(llrgmlist) else 0)
    avgbps = (sum(bppsdlist)/len(bppsdlist) if len(bppsdlist) else 0)
    avgsf = (sum(sflist)/len(sflist) if len(sflist) else 0)
    avgsd = (sum(sdlist)/len(sdlist) if len(sdlist) else 0)
    avgsr = (sum(srlist)/len(srlist) if len(srlist) else 0)
    avgrsd = (sum(resdlist)/len(resdlist) if len(resdlist) else 0)
    if avgfp or avgfn or avgbpp :
        outstr = "Avg FP(/100kb) : %12.2f\n" % avgfp
        outstr += "Avg FP ratio : %13.3f\n" % avgfpr
        outstr += "Avg FN ratio : %13.3f\n" % avgfn
        outstr += "Avg bpp : %11.1f\n" % avgbpp
        outstr += "Avg sf : %13.3f\n" % avgsf
        outstr += "Avg sd : %13.3f\n" % avgsd
        outstr += "Avg sr : %13.3f\n" % avgsr
        varsP.updateInfoReport(outstr + "\n", printalso=True)
    if err and mergepath :  #have an error file (alignParams) object: write averaged .err
        util.checkDir(mergepath)
        mrgstr = (varsP.alignMolvrefMergeName if varsP else "merge")
        outpath = os.path.join(mergepath, mappref+mrgstr+".err")
        #overwrite the last-read alignParams object with the averages, then persist
        err.fp = avgfp
        err.fn = avgfn
        err.sf = avgsf
        err.sd = avgsd
        err.bpp = avgbpp
        err.res = avgres
        err.nmaps = sum(nmaplist)
        err.llrm = avgllr
        err.goodmaps = sumgoodmaps
        err.llrgm = avgllg
        err.bppsd = avgbps
        err.fprate = avgfpr
        err.sr = avgsr
        err.ressd = avgrsd
        err.writeToFile(outpath)
def runSV(cwd, rabin, refcmap, contigdir, contigbase, runaligns, xmappath, optargs, nthreads, maxthreads, bedfile, errfile, outdir, errbinfile, clustargs, groupsv):
    '''Load Pipeline files from first arg; configure CharacterizeModule;
    run alignments if runaligns; report on those alignments or the xmap
    provided as xmappath.

    Arguments are the tuple returned by getArgs(). Exits the process (status 1)
    on missing Pipeline modules or invalid configuration. The
    runaligns == False branch is not implemented (prints an error only).
    '''
    printargs = True
    #Pipeline modules are imported from cwd at call time, so their presence
    #must be verified first (sys.path is assumed to include cwd -- TODO confirm
    #caller sets this up)
    if not os.path.isfile(os.path.join(cwd, "utilities.py")):
        print "utilities.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import utilities as util
    if not util.checkFile(os.path.join(cwd, "Pipeline.py")):
        print "Pipeline.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import Pipeline
    if not util.checkFile(os.path.join(cwd, "SVModule.py")):
        print "SVModule.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import SVModule as svm
    #SampleCharModule is only needed to parse a .err file (-e argument)
    if errfile and not util.checkFile(os.path.join(cwd, "SampleCharModule.py")):
        print "SampleCharModule.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    elif errfile:
        import SampleCharModule as scm

    #use Pipeline objects: populate a varsPipeline instance by hand instead of
    #running the full Pipeline argument machinery
    varsP = Pipeline.varsPipeline()
    varsP.optArgumentsFileIn = optargs
    varsP.RefAlignerBin = rabin
    varsP.latestMergedCmap = os.path.join(contigdir, contigbase + ".cmap")  #file suffix required to be .cmap
    varsP.contigFolder = os.path.split(contigdir)[0]
    varsP.nThreads = nthreads  #necessary otherwise job won't start -- max threads per node
    varsP.maxthreads = maxthreads  #threads per job
    varsP.ref = refcmap
    varsP.stdoutlog = True  #enable -stdout -stderr args to RefAligner
    varsP.curCharacterizeCmaps = [varsP.latestMergedCmap]
    varsP.contigSubDirectories = True  #needed for prepareContigIO
    varsP.doAlignMolvRef = False  #do not look for copy number
    varsP.groupSV = groupsv  #mimic Pipeline behavior: group or not
    if runaligns:
        #varsP.contigAlignTarget = outdir
        varsP.runSV = False
        varsP.groupContigs = False
        varsP.stdoutlog = True  #use -stdout -stderr
        varsP.stageComplete = contigbase
        #if outdir is not supplied, the contig prefix is used as dir prefix;
        #also used as file pref for -o arg
        varsP.outputContigPrefix = getContigPrefix(util, contigdir)
        varsP.outputContigFolder = contigdir  #cmaps are copied from here

        if not outdir:
            outdir = contigdir + "_sv"  #this will be outdir of sv jobs
        if os.path.isdir(outdir):
            if not util.checkDir(outdir):  #check writeable
                print "\nERROR: Output dir is not writeable:\n", outdir, "\n"
                sys.exit(1)
            elif outdir == contigdir:
                print "\nERROR: Output dir cannot be same as input dir:\n", outdir, "\n"
                sys.exit(1)
            print "\nWARNING: Output dir already exists, results will be overwritten:\n", outdir, "\n"
        elif not util.checkDir(outdir):  #does not exist: make it; False means can't make or not writeable
            print "\nERROR: Output dir cannot be created or is not writeable:\n", outdir, "\n"
            sys.exit(1)

        if clustargs:
            #do NOT default SGE_ROOT here (e.g. '/var/lib/gridengine'): it could
            #very well be wrong. No default means the user is required to set the
            #environment variable, consistent with the Pipeline.
            varsP.onCluster = True
            varsP.checkCluster()  #call varsPipeline method to check SGE_ROOT
            varsP.clusterLogDir = os.path.join(outdir, 'ClusterLogs')
            util.checkDir(varsP.clusterLogDir)  #make it
            varsP.checkCluster()  #NOTE(review): second checkCluster call -- appears redundant with the one above; confirm intentional
            varsP.clusterArgumentsFileIn = clustargs  #required for parseArguments
            varsP.parseArguments(readingClusterFile=True)
            if varsP.error:
                print varsP.message
                sys.exit(1)
            varsP.RefAlignerBin += "${BINARY_SUFFIX:=}"  #copy from varsPipeline, handled by external script on phi host

        varsP.pipeReportFile = os.path.join(outdir, "sv_jobs_log.txt")
        varsP.infoReportFile = os.path.join(outdir, "sv_log.txt")
        varsP.memoryLogpath = os.path.join(outdir, "memory_log.txt")
        if bedfile:
            varsP.bedFile = bedfile
        util.InitStatus(os.path.join(outdir, "status.xml"))
        varsP.parseArguments()  #parses optArgumentsFile
        varsP.checkDependencies()
        varsP.RefAlignerBinOrig = rabin
        varsP.prerunLog()  #general information in log -- needed for refaligner_version

        if printargs:
            print "\nRunning SV detection with arguments (" + os.path.split(optargs)[1] + "):\n" + " ".join(varsP.argsListed('svdetect')) + '\n'

        #noise parameters: an .errbin file is passed straight to RefAligner via
        #-readparameters; an .err file is parsed into individual -FP/-FN/... args
        noisep = {}
        if errbinfile:
            noisep = {"readparameters": errbinfile}
            print "Using noise parameters from " + errbinfile + "\n"
        elif errfile:
            noisep = scm.readNoiseParameters(errfile.replace(".err", ""))
            #remove 'readparameters': it's redundant here, and it can cause
            #problems with RefAligner compatibility
            if noisep.has_key('readparameters'):
                del noisep['readparameters']
            if not noisep:  #readNoiseParameters returns empty dict on failure
                print "ERROR reading noise parameters, check .err file:", errfile
                sys.exit(1)
            print "Using noise parameters from " + errfile + ":\n" + " ".join(["-" + str(k) + " " + str(v) for k, v in noisep.iteritems()]) + "\n"

        varsP.outputContigFolder = contigdir  #cmaps are copied from here

        #make merged cmap to replace merged _q.cmap if not produced by RefAligner
        cmaps = util.getListOfFilesFromDir(varsP.outputContigFolder, suffix=".cmap")
        if len(cmaps) > 1:
            varsP.contigPathTxtFile = os.path.join(outdir, "contig_list.txt")  #mergeIntoSingleCmap creates this file
            print "Creating merged cmap"
            varsP.mergeIntoSingleCmap(outdir)
            print "Merged cmap created:", varsP.latestMergedCmap, "\n"
            if varsP.groupSV == 0:  #if it is a single job, use merged map just created
                varsP.outputContigFolder = outdir  #input == output
        elif len(cmaps) == 1:
            varsP.latestMergedCmap = cmaps[0]
        else:  #this is already checked in getContigPrefix (redundant)
            print "No cmaps found in input dir; check dir %s\n" % varsP.outputContigFolder
            sys.exit(1)

        svmodule = svm.SVdetect(varsP, noisep, outdir, skipderes=True)
        #InitStatus was already called above; calling it after SVdetect.__init__
        #(which makes outdir) is the alternative for old utilities versions
        svmodule.runJobs()
        svmodule.checkResults()
        util.SummarizeErrors(varsP)
    else:
        varsP.contigAlignTarget = contigdir  #this is dir in which _q and _r cmaps must be located
        print "ERROR: feature not supported"  #not implemented to not run jobs
def getArgs(): parser = argparse.ArgumentParser(description=description) parser.add_argument( '-t', dest='RefAligner', help='Path to RefAligner or dir containing it (required)', type=str) parser.add_argument( '-r', dest='referenceMap', help='Path to reference maps (.cmap), 1 file only (required)', type=str) parser.add_argument( '-q', dest='queryDir', help='Path to dir containing query maps (.cmaps) (required)', type=str) #parser.add_argument('-x', dest='xmap', help='Path to .xmap, 1 file only (optional, if specified, no alignment is done, if not specified, -t, -r, and -q must be specified)') #not supported parser.add_argument( '-o', dest='outputDir', help= 'output dir (optional, defaults to input map dir with suffix "_sv")', default="", type=str) parser.add_argument( '-p', dest='pipelineDir', help= 'Pipeline dir (optional, defaults to script dir, or current directory)', default="", type=str) parser.add_argument( '-a', dest='optArguments', help= 'Path to optArguments.xml (optional, default optArguments_human.xml in Pipeline dir if found, otherwise required)', default="", type=str) parser.add_argument( '-T', dest='numThreads', help='Total number of threads (cores) to use (optional, default 4)', default=4, type=int) parser.add_argument( '-j', dest='maxthreads', help= 'Threads per Job, -maxthreads (non-cluster only;optional, default 4)', default=4, type=int) parser.add_argument( '-b', dest='bedFile', help= '.bed file with gaps in reference for flagging SVs which overlap N-base gaps (optional)', default="", type=str) parser.add_argument( '-e', dest='errFile', help= '.err file to use for noise parameters--will supersede noise parameters in the optArgument supplied (but that file must still be supplied for non-noise parameters)', default="", type=str) parser.add_argument( '-E', dest='errbinFile', help= '.errbin file to use for noise parameters--will supersede noise parameters in the optArgument supplied (but that file must still be supplied for non-noise parameters)', 
default="", type=str) parser.add_argument( '-C', help= 'Run on cluster, read XML file for submission arguments (optional--will not use cluster submission if absent)', dest='cxml', default=None) parser.add_argument( '-s', help= 'SV jobs configuration: 0 = single job (required for correct haplotype calls), 1 = single job per contig (not recommended), 2 = grouped (default 0; optional)', dest='groupsv', type=int, default=0) #parser.add_argument('-s', help='Disable grouping of SV jobs (default grouped; optional)', dest='groupsv', action='store_false') #old one result = parser.parse_args() #check all Pipeline dependencies if result.pipelineDir: cwd = result.pipelineDir else: cwd = os.path.split( os.path.realpath(__file__))[0] #this is path of this script if not os.path.isfile(os.path.join( cwd, "utilities.py")): #if still not here, last try is actual cwd cwd = os.getcwd() #still check this below #this is the only one imported here and in runCharacterize if not os.path.isfile(os.path.join(cwd, "utilities.py")): print "utilities.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir" sys.exit(1) import utilities as util #xmap -- don't use this runaligns = True #default is to run the alignment xmappath = None #if result.xmap : # xmappath = result.xmap # if not util.checkFile(xmappath, ".xmap") : # print "Xmap path is supplied ("+xmappath+") but not found or doesn't end in .xmap." # sys.exit(1) # runaligns = False #RefAligner -- check for either path to RefAligner, or dir containing it, depending on cluster args rabin = result.RefAligner #replicate Pipeline behavior: RefAligner is always required if os.path.isdir(rabin): rabin = os.path.join(rabin, "RefAligner") if not util.checkExecutable(rabin): print "RefAligner not found or not executable at", rabin, "\nPlease supply RefAligner dir or full path as -t arg." 
sys.exit(1) #reference maps -- only required if xmap not specified refcmap = os.path.realpath(result.referenceMap) if runaligns and not util.checkFile( refcmap, ".cmap"): #and not util.checkFile(refcmap, ".spots") : print "Reference map file (" + refcmap + ") not found or does not end in .cmap or .spots. Check -r argument." sys.exit(1) #query maps qrypath = os.path.realpath(result.queryDir) #if runaligns and not util.checkFile(qrypath, ".cmap") : # print "Query map file ("+qrypath+") not found or does not end in .cmap or .spots. Check -q argument." # sys.exit(1) if not util.checkDir(qrypath, checkWritable=False, makeIfNotExist=False): #does NOT have to be writeable print "Query dir (" + qrypath + ") not found or not a dir. Check -q argument." sys.exit(1) if runaligns: contigdir = qrypath #os.path.split(qrypath)[0] #dir of query maps contigbase = os.path.split(qrypath)[1] #filename else: contigdir = os.path.split(xmappath)[0] contigbase = os.path.split(xmappath)[1] #filename #contigbase = contigbase[:contigbase.find(".")] #remove suffix #optargs file optargs = None if result.optArguments: #supplied on command line optargs = result.optArguments if not util.checkFile(optargs, ".xml"): print "optArguments path is supplied (" + optargs + ") but not found or doesn't end in .xml, check -a argument." sys.exit(1) elif runaligns: #load from Pipeline dir if running alignments optargs = os.path.join(cwd, "optArguments_human.xml") if not util.checkFile(optargs): print "optArguments.xml missing in Pipeline directory (" + cwd + "). Try supplying path explicitly using -a." sys.exit(1) #cluster args clustargs = None if result.cxml: clustargs = os.path.realpath(result.cxml) if not util.checkFile(clustargs, ".xml"): print "clusterArguments path is supplied (" + clustargs + ") but not found or doesn't end in .xml, check -C argument." 
sys.exit(1) #nthreads nthreads = result.numThreads if nthreads <= 0: print "Number of threads value invalid (must be > 0): %i" % nthreads sys.exit(1) #maxthreads maxthreads = result.maxthreads if maxthreads <= 0: print "Max threads value invalid (must be > 0): %i" % maxthreads sys.exit(1) #bed file bedfile = result.bedFile #must make local for return statement below if bedfile: #must check for empty string BEFORE you do realpath, or it returns cwd bedfile = os.path.realpath(result.bedFile) if not util.checkFile(bedfile, ".bed"): print "bed file supplied but not found or incorrect suffix:", bedfile sys.exit(1) #.errbin file errbinfile = result.errbinFile if errbinfile: errbinfile = os.path.realpath(result.errbinFile) if not util.checkFile(errbinfile, ".errbin"): print "errbin file supplied but not found or incorrect suffix:", errbinfile sys.exit(1) #.err file errfile = result.errFile if errfile and errbinfile: print "Warning: .err and .errbin arguments supplied; ignoring .err file" errfile = "" elif errfile: errfile = os.path.realpath(result.errFile) if not util.checkFile(errfile, ".err"): print "err file supplied but not found or incorrect suffix:", errfile sys.exit(1) outdir = os.path.realpath(result.outputDir) groupsv = result.groupsv if groupsv < 0 or groupsv > 2: print 'ERROR: -s (grouped SV) must be 0, 1, or 2\n' sys.exit(1) #yes, this is messy...but I don't want another class (besides varsPipeline) and they just go to runCharacterize return cwd, rabin, refcmap, contigdir, contigbase, runaligns, xmappath, optargs, nthreads, maxthreads, bedfile, errfile, outdir, errbinfile, clustargs, groupsv
def runAlignMol():
    """Command-line driver with two modes, selected by the -q argument:

    1. Align mode (-q is a .cmap): align molecules (-b bnx) to the merged
       cmap using AlignModule, optionally seeding noise parameters from an
       .err/.errbin file, then summarize results.
    2. Merge mode (-q is a Pipeline alignmol dir): no alignments are run;
       existing alignmol output files are collected and merged/split via
       AlignModule.

    Exits the process (sys.exit(1)) on any invalid argument or missing
    Pipeline dependency. Python 2 only (print statements, has_key).
    """
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('-q', dest='queryDir', help='Path to merged cmap to align molecules (-b) to OR alignmol dir from Pipeline for merge (if latter, no alignments are performed), required', type=str)
    parser.add_argument('-b', dest='bnx', help='Input molecule (.bnx) file, required if aligning molecules', type=str)
    #parser.add_argument('-b', dest='bnx', help='Input molecule (.bnx) file OR path to dir containing split bnx pieces, required if aligning molecules', type=str) #I should add the split feature; for now, just do single bnx
    parser.add_argument('-a', dest='optArguments', help='Path to optArguments.xml (optional, default optArguments_human.xml in Pipeline dir if found, otherwise required)', default="", type=str)
    parser.add_argument('-r', help='If this flag is used, alignmolvref arguments are used, otherwise alignmol arguments are used (default alignmol; optional)', dest='ref', action='store_true')
    parser.add_argument('-o', dest='outputDir', help='output dir (optional, defaults to sub-dir of input map dir called "alignmol")', default="", type=str)
    parser.add_argument('-t', dest='RefAligner', help='Path to RefAligner or dir containing it (required)', type=str)
    parser.add_argument('-T', dest='numThreads', help='Total number of threads (cores) to use (optional, default 4)', default=4, type=int)
    parser.add_argument('-j', dest='maxthreads', help='Threads per Job, -maxthreads (non-cluster only;optional, default 4)', default=4, type=int)
    parser.add_argument('-e', dest='errFile', help='.err file to use for noise parameters--will supersede noise parameters in the optArgument supplied (but that file must still be supplied for non-noise parameters)--should be from autoNoise', default="", type=str)
    parser.add_argument('-E', dest='errbinFile', help='.errbin file to use for noise parameters--will supersede noise parameters in the optArgument supplied (but that file must still be supplied for non-noise parameters)--should be from autoNoise', default="", type=str)
    parser.add_argument('-p', dest='pipelineDir', help='Pipeline dir (optional, defaults to script dir, or current directory)', default="", type=str)
    result = parser.parse_args()

    outprefix = "exp_refineFinal1" #this is the default; assume for now

    #check all Pipeline dependencies
    #Resolve the Pipeline dir: -p arg, else this script's dir, else cwd.
    if result.pipelineDir:
        cwd = result.pipelineDir
    else:
        cwd = os.path.split(os.path.realpath(__file__))[0] #this is path of this script
        if not os.path.isfile(os.path.join(cwd, "utilities.py")): #if still not here, last try is actual cwd
            cwd = os.getcwd() #still check this below

    #this is the only one imported here and in runCharacterize
    #NOTE: imports are deliberately done mid-function, after locating the
    #Pipeline dir, so that a helpful error is printed instead of ImportError.
    if not os.path.isfile(os.path.join(cwd, "utilities.py")):
        print "ERROR: utilities.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import utilities as util

    if not os.path.isfile(os.path.join(cwd, "AlignModule.py")):
        print "ERROR: AlignModule.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import AlignModule as alignmod

    if not util.checkFile(os.path.join(cwd, "Pipeline.py")):
        print "ERROR: Pipeline.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import Pipeline

    if not util.checkFile(os.path.join(cwd, "mapClasses.py")):
        print "ERROR: mapClasses.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import mapClasses as mc

    #input dir -- a dir means merge-only mode, a cmap means align mode
    if not result.queryDir:
        print "ERROR: Query (-q) argument not supplied."
        sys.exit(1)
    qrypath = os.path.realpath(result.queryDir)
    if util.checkDir(qrypath, checkWritable=False, makeIfNotExist=False): #output elsewhere so not writeable is ok
        runaligns = False
    elif util.checkCmap(qrypath):
        runaligns = True
    else:
        print "ERROR: Query argument (" + qrypath + ") not found or not a dir or cmap. Check -q argument."
        sys.exit(1)

    #this check isn't really necessary...make it a warning -- left over from runAlignMerge.py
    #if not os.path.split(qrypath)[1].endswith("alignmol") :
    #    print "Warning: Query dir ("+qrypath+") does not end with 'alignmol'; please be sure this is a Pipeline alignmol dir\n"

    #RefAligner -- check for either path to RefAligner, or dir containing it, depending on cluster args
    rabin = "" #need empty string for generateJobList even though no jobs are run
    if runaligns:
        rabin = result.RefAligner
        #replicate Pipeline behavior: RefAligner is always required
        if os.path.isdir(rabin):
            rabin = os.path.join(rabin, "RefAligner")
        if not util.checkExecutable(rabin):
            print "ERROR: RefAligner not found or not executable at", rabin, "\nPlease supply RefAligner dir or full path as -t arg."
            sys.exit(1)

    #optargs file -- only needed when alignments will actually be run
    optargs = None
    if runaligns and result.optArguments: #supplied on command line
        optargs = result.optArguments
        if not util.checkFile(optargs, ".xml"):
            print "optArguments path is supplied (" + optargs + ") but not found or doesn't end in .xml, check -a argument."
            sys.exit(1)
    elif runaligns: #load from Pipeline dir if running alignments
        optargs = os.path.join(cwd, "optArguments_human.xml")
        if not util.checkFile(optargs):
            print "optArguments.xml missing in Pipeline directory (" + cwd + "). Try supplying path explicitly using -a."
            sys.exit(1)

    #output dir
    if not result.outputDir:
        outdir = os.path.join(qrypath, "merge") #should be same as in AlignModule
    else:
        outdir = os.path.realpath(result.outputDir)
    if os.path.isdir(outdir):
        if not util.checkDir(outdir): #check writeable
            print "\nERROR: Output dir is not writeable:\n", outdir, "\n"
            sys.exit(1)
        #this is ok here
        #elif outdir == contigdir :
        #    print "\nERROR: Output dir cannot be same as input dir:\n", outdir, "\n"
        #    sys.exit(1)
        print "\nWARNING: Output dir already exists, results will be overwritten:\n", outdir, "\n"
    elif not util.checkDir(outdir): #does not exist, make, if False, can't make or not writeable
        print "\nERROR: Output dir cannot be created or is not writeable:\n", outdir, "\n"
        sys.exit(1)

    #bnx file -- required only in align mode
    bnxfile = result.bnx
    if bnxfile: #must check for empty string BEFORE you do realpath, or it returns cwd
        bnxfile = os.path.realpath(bnxfile)
        if not util.checkFile(bnxfile, ".bnx"):
            print "ERROR: bnx file supplied but not found or incorrect suffix:", bnxfile
            sys.exit(1)
    elif runaligns:
        print "ERROR: bnx file not supplied but running alignments; please supply bnx file as -b argument"
        sys.exit(1)

    #nthreads
    nthreads = result.numThreads
    if nthreads <= 0:
        print "ERROR: Number of threads value invalid (must be > 0): %i" % nthreads
        sys.exit(1)

    #maxthreads -- also clamp nthreads up to maxthreads if inconsistent
    maxthreads = result.maxthreads
    if maxthreads <= 0:
        print "ERROR: Max threads value invalid (must be > 0): %i" % maxthreads
        sys.exit(1)
    elif nthreads < maxthreads:
        print "Warning: num threads (-T: %i) < max threads (-j: %i): increasing num threads to equal max threads\n" % (nthreads, maxthreads)
        nthreads = maxthreads

    #.errbin file
    errbinfile = result.errbinFile
    if errbinfile:
        errbinfile = os.path.realpath(result.errbinFile)
        if not util.checkFile(errbinfile, ".errbin"):
            print "ERROR: errbin file supplied but not found or incorrect suffix:", errbinfile
            sys.exit(1)

    #.err file -- .errbin takes precedence when both are supplied
    errfile = result.errFile
    if errfile and errbinfile:
        print "Warning: .err and .errbin arguments supplied; ignoring .err file"
        errfile = ""
    elif errfile:
        errfile = os.path.realpath(result.errFile)
        if not util.checkFile(errfile, ".err"):
            print "err file supplied but not found or incorrect suffix:", errfile
            sys.exit(1)

    #SampleCharModule is only needed to parse a .err file
    if errfile and not util.checkFile(os.path.join(cwd, "SampleCharModule.py")):
        print "SampleCharModule.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    elif errfile:
        import SampleCharModule as scm

    doref = result.ref

    #DONE checking arguments

    print "Using output dir", outdir
    if runaligns:
        print "Aligning", bnxfile, "\nTo", qrypath, "\n"
    else:
        print "Merging", qrypath, "\n"

    startTime = time.time() #time since Epoch
    memory_log = os.path.join(outdir, "memory_log.txt")
    util.initMemoryLog(memory_log)

    #build a minimal varsPipeline so AlignModule can run outside the full Pipeline
    varsP = Pipeline.varsPipeline()
    varsP.RefAlignerBin = rabin
    varsP.contigFolder = "" #not used but needs to be an attr
    varsP.outputContigFolder = "" #not used but needs to be a string attr
    varsP.pipeReportFile = os.path.join(outdir, "alignmol_jobs_log.txt")
    varsP.infoReportFile = os.path.join(outdir, "alignmol_log.txt")
    util.InitStatus(os.path.join(outdir, "status.xml"))

    if runaligns:
        varsP.optArgumentsFileIn = optargs
        varsP.latestMergedCmap = qrypath #if !doref, need this one
        varsP.ref = qrypath #and if doref, need this one
        varsP.nThreads = nthreads #necessary otherwise job won't start -- max threads per node
        varsP.maxthreads = maxthreads #threads per job
        p = os.path.split(qrypath)[1]
        varsP.outputContigPrefix = p[:p.rfind(".")] #filename prefix
        varsP.stdoutlog = True #use -stdout -stderr
        varsP.sorted_file = bnxfile[:bnxfile.rfind(".")] #enables the mol fraction align in AlignModule.getAlignStats
        if qrypath.endswith(".cmap"): #enable the mol stats
            varsP.totAssemblyLenMb = mc.multiCmap(qrypath, lengthonly=True).totalLength / 1e6
        varsP.memoryLogpath = os.path.join(outdir, "memory_log.txt")
        varsP.parseArguments() #parses optArgumentsFile
        varsP.checkDependencies()
        varsP.RefAlignerBinOrig = rabin
        varsP.prerunLog() #general information in log -- needed for refaligner_version

        #noise parameters: .errbin is passed through directly; .err is parsed
        noisep = {}
        if errbinfile:
            noisep = {"readparameters": errbinfile}
            #print "Using noise parameters from "+errbinfile+"\n" #move below
        elif errfile:
            noisep = scm.readNoiseParameters(errfile.replace(".err", ""))
            if noisep.has_key('readparameters'): #remove this because it's redundant, and it can cause problems with RefAligner compatibility
                del noisep['readparameters']
            if not noisep: #readNoiseParameters returns empty dict on failure
                print "ERROR reading noise parameters, check .err file:", errfile
                sys.exit(1)
            #redundant with below?
            print "Using noise parameters from " + errfile + ":\n" + " ".join(["-" + str(k) + " " + str(v) for k, v in noisep.iteritems()]) + "\n"

        #some code from SampleCharModule to load args into noise0
        infoReport = "Loaded noise parameters:\n"
        klist = ["FP", "FN", "sf", "sd", "sr", "bpp", "readparameters"] #hardcoding parameters is kind of bad, but it fixes the order without using OrderedDict.
        #noiseargs = self.varsP.argsListed('noise0') #not necessary
        for v in klist:
            if not noisep.has_key(v):
                continue
            param = str(noisep[v])
            util.LogStatus("parameter", "auto_" + v, param)
            infoReport += v + ":" + param + "\n"
            varsP.replaceParam("noise0", "-" + v, param)
        varsP.updateInfoReport(infoReport + '\n', printalso=True)

    else:
        #merge-only mode: collect pre-existing alignmol outputs
        print "Getting file list from", qrypath
        outFileList = getOutFileList(util, qrypath)
        if not outFileList:
            print "ERROR: Query dir (" + qrypath + ") does not contain alignmol data. Check -q argument."
            sys.exit(1)
        else:
            print "Found", len(outFileList), "alignment results"
    #end if runaligns

    amod = alignmod.AlignModule(varsP, doref, outdir, bnxfile) #constructor will call generateJobList

    if runaligns:
        amod.runJobs()
        amod.checkResults()
    else:
        #hand the pre-existing outputs to the module and derive the contig prefix
        amod.outFileList = outFileList
        p = os.path.split(outFileList[0])[1]
        if p.count("_") > 1: #expect something like "EXP_REFINEFINAL1_4"
            #p = p[:p.rfind("_")+1] #remove integer suffix
            p = p[:p.rfind("_")] #remove integer suffix (and underscore)
        #else :
        #    p += "_" #because mrgstr is appended
        varsP.outputContigPrefix = p

    if not runaligns or len(amod.jobList) > 0:
        amod.getAlignStats()

    if runaligns:
        print
        #copy from Pipeline.py
        if util.SummarizeErrors(varsP=varsP) == 0:
            varsP.updatePipeReport("Pipeline has successfully completed\n")
            util.LogStatus("progress", "pipeline", "success")
        else:
            varsP.updatePipeReport("Pipeline has completed with errors\n")
            util.LogStatus("progress", "pipeline", "failure")

    #BELOW OLD CODE
    #NOTE(review): everything below this return is unreachable dead code kept
    #from an older version; it also references `outputdir`, which is never
    #defined in this function (would be a NameError if ever reached).
    return

    #in Pipeline, this is called first
    #print "Calling getAlignStats:" #but it won't work without varsP atm; skip it
    #getAlignStats(self.varsP, self.outFileList, self.varsP.totAssemblyLenMb, isref=False, mergepath=self.mergedir)
    #getAlignStats(self.varsP, self.outFileList, self.varsP.totAssemblyLenMb, isref=False, mergepath=self.mergedir)

    print "Calling mergeMap"
    print outFileList[0] #, "\n", outputdir #moved above
    util.logMemory(memory_log, startTime, "mergeMap_start")
    #mergeMap(self.varsP, self.outFileList, mergepath=self.outputdir) #varsP is optional
    alignmod.mergeMap(None, outFileList, outputdir)
    util.logMemory(memory_log, startTime, "mergeMap_end")

    print "Calling mergeRcmaps"
    util.logMemory(memory_log, startTime, "mergeRcmaps_start")
    #mergeRcmaps(outFileList, outdir, varsP=None, splitByContig=None, stageName="alignmol") :
    alignmod.mergeRcmaps(outFileList, outputdir, splitByContig=True, stageName=outprefix)
    util.logMemory(memory_log, startTime, "mergeRcmaps_end")

    print "Calling split_XMap_byContig" #split_XMapQcmap_byContig"
    util.logMemory(memory_log, startTime, "split_XMap_byContig_start")
    #xmapdict = alignmod.split_XMap_byContig(outFileList, outputdir, stageName=outprefix) #old
    xmapdict = alignmod.split_XMap_byContig_new(outFileList, outputdir, stageName=outprefix)
    util.logMemory(memory_log, startTime, "split_XMap_byContig_end")

    print "Calling split_Qcmap_byContig"
    util.logMemory(memory_log, startTime, "split_Qcmap_byContig_start")
    #alignmod.split_Qcmap_byContig(outFileList, outputdir, xmapdict) #old
    alignmod.split_Qcmap_byContig_new(outFileList, outputdir, xmapdict, stageName=outprefix) #new: better performance
    util.logMemory(memory_log, startTime, "split_Qcmap_byContig_end")

    print "AlignMerge successfully completed"
def __init__(self, varsP) :
    """Run the two-stage automatic noise-parameter characterization.

    Stage "Autonoise0" runs a characterization job on the input bnx (sorted
    bnx unless varsP.noiseOnly), reads the resulting noise parameters into
    varsP.noise0, then stage "Autonoise1" re-runs with those parameters and
    stores the result in varsP.noise1, copying the final values into the
    'noise0' argument set via varsP.replaceParam. If scan scaling is enabled
    and the rescaled bnx exists, varsP.sorted_file is redirected to it.

    Raises RuntimeError if the output dir is bad or either stage fails.

    NOTE(review): a previous docstring here described sorting/splitting the
    bnx (copied from a splitBNX-style class); this constructor does not sort
    -- it only runs the autoNoise0/autoNoise1 characterization jobs.
    """
    self.stageName = "Autonoise0"
    self.varsP = varsP #fewer code modifications below
    util.LogStatus("progress", "stage_start", self.stageName) #after above bc check if bypass (executeCurrentStage)
    self.output_folder = os.path.join(self.varsP.contigFolder, "auto_noise")
    if not util.checkDir(self.output_folder) : #will make if not exist, only returns False if already exists or can't make
        print "ERROR in autoNoise: bad dir:", self.output_folder
        raise RuntimeError
    # We use assembly section here because the memory usage is higher than pairwise, while the jobs are quite short.
    #sortJobSet=mthread.jobWrapper(self.varsP,jobName,clusterArgs=self.varsP.getClusterArgs('assembly'))
    super(autoNoise, self).__init__(self.varsP, self.stageName, clusterArgs=self.varsP.getClusterArgs("assembly"))

    #noiseOnly mode characterizes the raw bnx; otherwise use the sorted bnx
    bnxfile = self.varsP.bnxFile if varsP.noiseOnly else self.varsP.sorted_file+".bnx"

    #was return if generateJobListChar, but need to get readparameters if bypass
    #generateJobListChar also sets self.output_file (used below)
    if not self.generateJobListChar({}, bnxfile, "autoNoise0") : #return 0 for success, 1 for skip
        self.varsP.runJobs(self, "AutoNoise0")
        self.doAllPipeReport()
    if not self.allResultsFound() :
        self.varsP.updatePipeReport("ERROR: AutoNoise0 failed. Check: "+self.output_file+".stdout\n")
        raise RuntimeError
    util.LogStatus("progress", "stage_complete", self.stageName)

    self.varsP.noise0 = readNoiseParameters(self.output_file)
    self.isBadErrorParams(self.varsP.noise0, 0)

    #second stage: refine using stage-0 parameters
    self.stageName = "Autonoise1"
    self.groupName = self.stageName #fix so that LogStatus call in MultiThreading.multiThreadRunJobs
    util.LogStatus("progress", "stage_start", self.stageName)
    self.clearJobs()

    self.varsP.replaceParam("noise0", "-readparameters", self.output_file+".errbin")

    #need to call again to set self.output_file
    if not self.generateJobListChar(self.varsP.noise0, bnxfile, "autoNoise1") : #return 0 for success, 1 for skip
        self.varsP.runJobs(self, "AutoNoise1")
        self.doAllPipeReport()
    if not self.allResultsFound() :
        self.varsP.updatePipeReport("ERROR: AutoNoise1 failed. Check: "+self.output_file+".stdout\n")
        raise RuntimeError

    self.varsP.noise1 = readNoiseParameters(self.output_file)

    #log and install the final noise parameters into the 'noise0' arg set
    infoReport="Automatically determined noise parameters:\n"
    klist = ["FP", "FN", "sf", "sd", "sr", "bpp", "readparameters"] #hardcoding parameters is kind of bad, but it fixes the order without using OrderedDict.
    for v in klist :
        if not self.varsP.noise1.has_key(v) :
            continue
        param=str(self.varsP.noise1[v])
        util.LogStatus("parameter", "auto_"+v, param)
        infoReport+=v+":"+param+"\n"
        self.varsP.replaceParam("noise0", "-"+v, param)
    self.varsP.updateInfoReport(infoReport + '\n')
    self.isBadErrorParams(self.varsP.noise1, 1)

    if self.varsP.doScanScale : #change the sorted_file to the rescaled bnx file
        rescaledbnx = self.output_file + self.varsP.rescaleSuffix #no ".bnx" in suffix
        if not util.checkFile(rescaledbnx+".bnx") : #not found--not an error if bnx 0.1 is used
            err = "Warning: scan scaled bnx not found after autoNoise1; not performing scan scaling--check that bnx 1.0 or later used in input"
            self.varsP.updatePipeReport( err+"\n\n" )
            util.LogError("warning", err)
            self.varsP.doScanScale = False
        else : #log that scan scaling is used
            self.varsP.updatePipeReport( "Using scan scaled bnx: "+rescaledbnx+".bnx\n\n" )
            util.LogStatus("parameter", "scanscaled_bnx", rescaledbnx+".bnx")
            self.varsP.sorted_file = rescaledbnx #this variable is used in splitBNX (PairwiseModule.py)

    util.LogStatus("progress", "stage_complete", self.stageName)
def runSV(cwd, rabin, refcmap, contigdir, contigbase, runaligns, xmappath, optargs, nthreads, maxthreads, bedfile, errfile, outdir, errbinfile, clustargs, groupsv):
    '''Load Pipeline files from first arg; configure CharacterizeModule; run alignments if runaligns; report on those alignments or the xmap provided as xmappath.

    NOTE(review): despite the summary above (which mentions
    CharacterizeModule), this function imports and drives SVModule
    (svm.SVdetect). The non-runaligns branch is not implemented and only
    prints an error. Arguments are the tuple returned by getArgs().
    Exits the process (sys.exit(1)) on missing dependencies or bad dirs.
    '''
    printargs = True

    #mid-function imports so a friendly error is printed instead of ImportError
    if not os.path.isfile(os.path.join(cwd,"utilities.py")):
        print "utilities.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import utilities as util

    if not util.checkFile(os.path.join(cwd,"Pipeline.py")):
        print "Pipeline.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import Pipeline

    if not util.checkFile(os.path.join(cwd,"SVModule.py")):
        print "SVModule.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import SVModule as svm

    #SampleCharModule is only required when a .err file must be parsed
    if errfile and not util.checkFile(os.path.join(cwd,"SampleCharModule.py")):
        print "SampleCharModule.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    elif errfile :
        import SampleCharModule as scm

    #use Pipeline objects
    varsP = Pipeline.varsPipeline()
    varsP.optArgumentsFileIn = optargs
    varsP.RefAlignerBin = rabin
    varsP.latestMergedCmap = os.path.join(contigdir, contigbase+".cmap") #file suffix required to be .cmap
    varsP.contigFolder = os.path.split(contigdir)[0]
    varsP.nThreads = nthreads #necessary otherwise job won't start -- max threads per node
    varsP.maxthreads = maxthreads #threads per job
    varsP.ref = refcmap
    varsP.stdoutlog = True #enable -stdout -stderr args to RefAligner
    varsP.curCharacterizeCmaps = [varsP.latestMergedCmap]
    varsP.contigSubDirectories = True #needed for prepareContigIO
    varsP.doAlignMolvRef = False #do not look for copy number
    varsP.groupSV = groupsv #mimic Pipeline behavior: group or not

    if runaligns :
        #varsP.contigAlignTarget = outdir
        varsP.runSV = False
        varsP.groupContigs = False
        varsP.stdoutlog = True #use -stdout -stderr
        varsP.stageComplete = contigbase
        varsP.outputContigPrefix = getContigPrefix(util, contigdir) #if outdir is not supplied, this is used as dir prefix; also used as file pref for -o arg
        varsP.outputContigFolder = contigdir #cmaps are copied from here

        #validate/create output dir (default: input dir + "_sv")
        if not outdir :
            outdir = contigdir+"_sv" #this will be outdir of sv jobs
        if os.path.isdir(outdir) :
            if not util.checkDir(outdir) : #check writeable
                print "\nERROR: Output dir is not writeable:\n", outdir, "\n"
                sys.exit(1)
            elif outdir == contigdir :
                print "\nERROR: Output dir cannot be same as input dir:\n", outdir, "\n"
                sys.exit(1)
            print "\nWARNING: Output dir already exists, results will be overwritten:\n", outdir, "\n"
        elif not util.checkDir(outdir) : #does not exist, make, if False, can't make or not writeable
            print "\nERROR: Output dir cannot be created or is not writeable:\n", outdir, "\n"
            sys.exit(1)

        #optional cluster (SGE) configuration
        if clustargs :
            os.putenv('SGE_ROOT', '/var/lib/gridengine') #do I want this???
            varsP.onCluster = True
            varsP.clusterLogDir = os.path.join(outdir, 'ClusterLogs')
            util.checkDir(varsP.clusterLogDir) #make it
            varsP.checkCluster()
            varsP.clusterArgumentsFileIn = clustargs #required for parseArguments
            varsP.parseArguments(readingClusterFile=True)
            if varsP.error :
                print varsP.message
                sys.exit(1)
            varsP.RefAlignerBin += "${BINARY_SUFFIX:=}" #copy from varsPipeline, handled by external script on phi host

        varsP.pipeReportFile = os.path.join(outdir, "sv_jobs_log.txt")
        varsP.infoReportFile = os.path.join(outdir, "sv_log.txt")
        varsP.memoryLogpath = os.path.join(outdir, "memory_log.txt")
        if bedfile :
            varsP.bedFile = bedfile
        util.InitStatus( os.path.join(outdir, "status.xml") )
        varsP.parseArguments() #parses optArgumentsFile
        varsP.checkDependencies()
        varsP.RefAlignerBinOrig = rabin
        varsP.prerunLog() #general information in log -- needed for refaligner_version

        if printargs :
            print "\nRunning SV detection with arguments ("+os.path.split(optargs)[1]+"):\n" + " ".join(varsP.argsListed('svdetect')) + '\n'

        #noise parameters: .errbin passed straight through; .err is parsed
        noisep = {}
        if errbinfile :
            noisep = {"readparameters": errbinfile}
            print "Using noise parameters from "+errbinfile+"\n"
        elif errfile :
            noisep = scm.readNoiseParameters(errfile.replace(".err",""))
            if noisep.has_key('readparameters') : #remove this because it's redundant, and it can cause problems with RefAligner compatibility
                del noisep['readparameters']
            if not noisep : #readNoiseParameters returns empty dict on failure
                print "ERROR reading noise parameters, check .err file:", errfile
                sys.exit(1)
            print "Using noise parameters from "+errfile+":\n" + " ".join(["-"+str(k)+" "+str(v) for k,v in noisep.iteritems()])+"\n"

        #make merged cmap to replace merged _q.cmap if not produced by RefAligner
        varsP.contigPathTxtFile = os.path.join(outdir, "contig_list.txt") #mergeIntoSingleCmap creates this file
        print "Creating merged cmap"
        varsP.mergeIntoSingleCmap(outdir)
        print "Merged cmap created:", varsP.latestMergedCmap, "\n"
        varsP.outputContigFolder = contigdir #cmaps are copied from here

        svmodule = svm.SVdetect(varsP, noisep, outdir, skipderes=True)

        #this got duplicated above
        #if hasattr(util, "InitStatus") : #if old version, skip -- do this after SVdetect.__init__ bc makes outdir
        #    util.InitStatus(os.path.join(outdir, "status.xml")) #needed otherwise call to status_log fails

        svmodule.runJobs()
        svmodule.checkResults()
        util.SummarizeErrors(varsP)
    else :
        varsP.contigAlignTarget = contigdir #this is dir in which _q and _r cmaps must be located
        print "ERROR: feature not supported" #not implemented to not run jobs
def getArgs() :
    """Parse and validate command-line arguments for SV detection.

    Locates the Pipeline dir (-p, else script dir, else cwd), checks the
    RefAligner binary, reference cmap, query dir, optional optArguments /
    cluster XML / bed / .err / .errbin files, and thread counts. Any
    validation failure prints a message and calls sys.exit(1).

    Returns the tuple consumed positionally by runSV():
    (cwd, rabin, refcmap, contigdir, contigbase, runaligns, xmappath,
     optargs, nthreads, maxthreads, bedfile, errfile, outdir, errbinfile,
     clustargs, groupsv)
    """
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('-t', dest='RefAligner', help='Path to RefAligner or dir containing it (required)', type=str)
    parser.add_argument('-r', dest='referenceMap', help='Path to reference maps (.cmap), 1 file only (required)', type=str)
    parser.add_argument('-q', dest='queryDir', help='Path to dir containing query maps (.cmaps) (required)', type=str)
    #parser.add_argument('-x', dest='xmap', help='Path to .xmap, 1 file only (optional, if specified, no alignment is done, if not specified, -t, -r, and -q must be specified)') #not supported
    parser.add_argument('-o', dest='outputDir', help='output dir (optional, defaults to input map dir with suffix "_sv")', default="", type=str)
    parser.add_argument('-p', dest='pipelineDir', help='Pipeline dir (optional, defaults to script dir, or current directory)', default="", type=str)
    parser.add_argument('-a', dest='optArguments', help='Path to optArguments.xml (optional, default optArguments_human.xml in Pipeline dir if found, otherwise required)', default="", type=str)
    parser.add_argument('-T', dest='numThreads', help='Total number of threads (cores) to use (optional, default 4)', default=4, type=int)
    parser.add_argument('-j', dest='maxthreads', help='Threads per Job, -maxthreads (non-cluster only;optional, default 4)', default=4, type=int)
    parser.add_argument('-b', dest='bedFile', help='.bed file with gaps in reference for flagging SVs which overlap N-base gaps (optional)', default="", type=str)
    parser.add_argument('-e', dest='errFile', help='.err file to use for noise parameters--will supersede noise parameters in the optArgument supplied (but that file must still be supplied for non-noise parameters)', default="", type=str)
    parser.add_argument('-E', dest='errbinFile', help='.errbin file to use for noise parameters--will supersede noise parameters in the optArgument supplied (but that file must still be supplied for non-noise parameters)', default="", type=str)
    parser.add_argument('-C', help='Run on cluster, read XML file for submission arguments (optional--will not use cluster submission if absent)', dest='cxml', default=None)
    parser.add_argument('-s', help='Disable grouping of SV jobs (default grouped; optional)', dest='groupsv', action='store_false')
    result = parser.parse_args()

    #check all Pipeline dependencies
    #Resolve the Pipeline dir: -p arg, else this script's dir, else cwd.
    if result.pipelineDir :
        cwd = result.pipelineDir
    else :
        cwd = os.path.split(os.path.realpath(__file__))[0] #this is path of this script
        if not os.path.isfile(os.path.join(cwd,"utilities.py")) : #if still not here, last try is actual cwd
            cwd = os.getcwd() #still check this below

    #this is the only one imported here and in runCharacterize
    if not os.path.isfile(os.path.join(cwd,"utilities.py")):
        print "utilities.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import utilities as util

    #xmap -- don't use this
    runaligns = True #default is to run the alignment
    xmappath = None
    #if result.xmap :
    #    xmappath = result.xmap
    #    if not util.checkFile(xmappath, ".xmap") :
    #        print "Xmap path is supplied ("+xmappath+") but not found or doesn't end in .xmap."
    #        sys.exit(1)
    #    runaligns = False

    #RefAligner -- check for either path to RefAligner, or dir containing it, depending on cluster args
    rabin = result.RefAligner
    #replicate Pipeline behavior: RefAligner is always required
    if os.path.isdir(rabin) :
        rabin = os.path.join(rabin, "RefAligner")
    if not util.checkExecutable(rabin):
        print "RefAligner not found or not executable at", rabin, "\nPlease supply RefAligner dir or full path as -t arg."
        sys.exit(1)

    #reference maps -- only required if xmap not specified
    refcmap = os.path.realpath(result.referenceMap)
    if runaligns and not util.checkFile(refcmap, ".cmap") : #and not util.checkFile(refcmap, ".spots") :
        print "Reference map file ("+refcmap+") not found or does not end in .cmap or .spots. Check -r argument."
        sys.exit(1)

    #query maps
    qrypath = os.path.realpath(result.queryDir)
    #if runaligns and not util.checkFile(qrypath, ".cmap") :
    #    print "Query map file ("+qrypath+") not found or does not end in .cmap or .spots. Check -q argument."
    #    sys.exit(1)
    if not util.checkDir(qrypath, checkWritable=False, makeIfNotExist=False) : #does NOT have to be writeable
        print "Query dir ("+qrypath+") not found or not a dir. Check -q argument."
        sys.exit(1)
    if runaligns :
        contigdir  = qrypath #os.path.split(qrypath)[0] #dir of query maps
        contigbase = os.path.split(qrypath)[1] #filename
    else :
        contigdir  = os.path.split(xmappath)[0]
        contigbase = os.path.split(xmappath)[1] #filename
    #contigbase = contigbase[:contigbase.find(".")] #remove suffix

    #optargs file
    optargs = None
    if result.optArguments : #supplied on command line
        optargs = result.optArguments
        if not util.checkFile(optargs, ".xml") :
            print "optArguments path is supplied ("+optargs+") but not found or doesn't end in .xml, check -a argument."
            sys.exit(1)
    elif runaligns : #load from Pipeline dir if running alignments
        optargs = os.path.join(cwd,"optArguments_human.xml")
        if not util.checkFile(optargs):
            print "optArguments.xml missing in Pipeline directory ("+cwd+"). Try supplying path explicitly using -a."
            sys.exit(1)

    #cluster args
    clustargs = None
    if result.cxml :
        clustargs = os.path.realpath(result.cxml)
        if not util.checkFile(clustargs, ".xml") :
            print "clusterArguments path is supplied ("+clustargs+") but not found or doesn't end in .xml, check -C argument."
            sys.exit(1)

    #nthreads
    nthreads = result.numThreads
    if nthreads <= 0 :
        print "Number of threads value invalid (must be > 0): %i" % nthreads
        sys.exit(1)

    #maxthreads
    maxthreads = result.maxthreads
    if maxthreads <= 0 :
        print "Max threads value invalid (must be > 0): %i" % maxthreads
        sys.exit(1)

    #bed file
    bedfile = result.bedFile #must make local for return statement below
    if bedfile : #must check for empty string BEFORE you do realpath, or it returns cwd
        bedfile = os.path.realpath(result.bedFile)
        if not util.checkFile(bedfile, ".bed") :
            print "bed file supplied but not found or incorrect suffix:", bedfile
            sys.exit(1)

    #.errbin file
    errbinfile = result.errbinFile
    if errbinfile :
        errbinfile = os.path.realpath(result.errbinFile)
        if not util.checkFile(errbinfile, ".errbin") :
            print "errbin file supplied but not found or incorrect suffix:", errbinfile
            sys.exit(1)

    #.err file -- .errbin takes precedence when both are supplied
    errfile = result.errFile
    if errfile and errbinfile :
        print "Warning: .err and .errbin arguments supplied; ignoring .err file"
        errfile = ""
    elif errfile :
        errfile = os.path.realpath(result.errFile)
        if not util.checkFile(errfile, ".err") :
            print "err file supplied but not found or incorrect suffix:", errfile
            sys.exit(1)

    outdir = os.path.realpath(result.outputDir)

    #yes, this is messy...but I don't want another class (besides varsPipeline) and they just go to runCharacterize
    return cwd, rabin, refcmap, contigdir, contigbase, runaligns, xmappath, optargs, nthreads, maxthreads, bedfile, errfile, outdir, errbinfile, clustargs, result.groupsv