Ejemplo n.º 1
0
    def __init__(self, varsP, doref=False, outputdir=None, bnxin=None):
        """Configure the molecule-alignment stage and build its job list.

        varsP     -- pipeline variables object; read for folder names and
                     cluster arguments, and (when doref is false) updated
                     with alignMolDir.
        doref     -- if true, the stage is named varsP.alignMolvrefName and
                     lives under varsP.contigFolder; otherwise the stage is
                     'alignmol' under varsP.outputContigFolder.
        outputdir -- optional directory that replaces the computed output
                     dir (not needed for Pipeline, used in runAlignMol.py).
        bnxin     -- stored as self.bnxin; see generateJobList.
        """
        self.varsP = varsP
        self.doref = doref
        self.bnxin = bnxin

        # optArgs (not clusterArgs) are always taken from the 'alignmol'
        # section, whichever stage name is selected below.
        self.argStageName = 'alignmol'

        if doref:
            self.stageName = self.varsP.alignMolvrefName
            defaultTarget = os.path.join(self.varsP.contigFolder,
                                         self.stageName)
        else:
            self.stageName = 'alignmol'
            defaultTarget = os.path.join(varsP.outputContigFolder,
                                         self.stageName)
            # Stored in varsP for subsequent processing; this is the computed
            # dir, recorded before any outputdir override is applied.
            self.varsP.alignMolDir = defaultTarget
        self.alignTarget = outputdir if outputdir else defaultTarget

        # Per the original note, checkDir creates the dir if it is missing.
        util.checkDir(self.alignTarget)
        self.mergedir = os.path.join(self.alignTarget,
                                     self.varsP.alignMolvrefMergeName)

        stageClusterArgs = self.varsP.getClusterArgs(self.stageName)
        super(AlignModule, self).__init__(self.varsP, self.stageName,
                                          clusterArgs=stageClusterArgs)

        self.outFileList = []
        self.generateJobList()
        self.logArguments()
    def __init__(self, varsP, doref=False, outputdir=None, bnxin=None):
        """Set up stage name, output directories and jobs for molecule alignment.

        doref selects between the 'alignmol' stage (under
        varsP.outputContigFolder) and the varsP.alignMolvrefName stage (under
        varsP.contigFolder).
        outputdir, when given, overrides the computed output directory; it is
        not needed for Pipeline, but used in runAlignMol.py.
        bnxin is kept on the instance for generateJobList.
        """
        self.varsP, self.doref, self.bnxin = varsP, doref, bnxin

        # Arguments come from the alignmol optArgs section in both modes.
        self.argStageName = 'alignmol'

        if not doref:
            self.stageName = 'alignmol'
            stageRoot = varsP.outputContigFolder
        else:
            self.stageName = self.varsP.alignMolvrefName
            stageRoot = self.varsP.contigFolder
        self.alignTarget = os.path.join(stageRoot, self.stageName)
        if not doref:
            # Remember the computed dir in varsP for later stages.
            self.varsP.alignMolDir = self.alignTarget
        if outputdir:
            self.alignTarget = outputdir

        util.checkDir(self.alignTarget)  # original note: makes the dir if absent
        mergeName = self.varsP.alignMolvrefMergeName
        self.mergedir = os.path.join(self.alignTarget, mergeName)

        super(AlignModule, self).__init__(
            self.varsP,
            self.stageName,
            clusterArgs=self.varsP.getClusterArgs(self.stageName),
        )

        self.outFileList = []
        self.generateJobList()
        self.logArguments()
    def generateJobListChar(self, noise_in, input_file, optSection) :
        """Queue one RefAligner job for an auto-noise characterization pass.

        noise_in   -- mapping of option name (without the leading '-') to its
                      value; each pair is appended to the command line as
                      '-name', str(value).
        input_file -- file handed to RefAligner with '-i'.
        optSection -- optArguments section whose arguments are appended; also
                      used as the output prefix inside <contigFolder>/auto_noise.

        Returns 1 without queuing anything when varsP.executeCurrentStage is
        false, otherwise 0 after the job has been added.
        """
        if not self.varsP.executeCurrentStage:
            return 1 #tell self.__init__ not to continue processing

        self.varsP.updatePipeReport('%s\n' % (optSection))

        self.output_folder = os.path.join(self.varsP.contigFolder, "auto_noise")
        if not util.checkDir(self.output_folder) :
            # Reported only; processing deliberately continues as before.
            # Single-argument print call behaves the same on Python 2 and 3.
            print("ERROR in autoNoise.varsPipeline.prepareContigIO: bad dir: %s" % (self.output_folder,))

        self.output_file = os.path.join(self.output_folder, optSection)
        expectedResultFile = self.output_file + ".err"

        # We use the assembly cluster-args section here because the memory usage
        # is higher than pairwise, while the jobs are quite short.
        super(autoNoise, self).__init__(self.varsP, self.stageName, clusterArgs=self.varsP.getClusterArgs("assembly"))

        # No -maxthreads: this job always runs on its own.
        cargs = [self.varsP.RefAlignerBin, '-f', '-i', input_file, "-ref", self.varsP.ref, "-o", self.output_file]
        if self.varsP.stdoutlog :
            cargs.extend(['-stdout', '-stderr'])
        for name, value in noise_in.items() :
            cargs.extend(["-" + name, str(value)])

        cargs.extend(self.varsP.argsListed(optSection))
        if self.varsP.bnxStatsFile is not None :
            cargs += ['-XmapStatWrite', self.varsP.bnxStatsFile]
        self.addJob(mthread.singleJob(cargs, self.stageName, expectedResultFile, self.stageName, clusterLogDir=self.varsP.clusterLogDir, expectedStdoutFile=self.output_file+".stdout"))

        return 0 #success
Ejemplo n.º 4
0
    def __init__(self, varsP):
        """Locate existing characterize output on disk instead of running jobs.

        Looks in <varsP.outputContigFolder>/<varsP.characterizeDirName> for a
        file ending in ".err". If the directory is unusable or holds no such
        file, returns early with curCharacterizeFileRoots left empty.
        Otherwise records that file's path with ".err" removed and sets
        varsP.totAssemblyLenMb from varsP.latestMergedCmap.
        """
        self.curCharacterizeFileRoots = []
        self.varsP = varsP  # Characterize uses this for totAssemblyLenMb

        resultDir = os.path.join(varsP.outputContigFolder,
                                 self.varsP.characterizeDirName)
        # Without this directory there is nothing to recover.
        if not util.checkDir(resultDir, makeIfNotExist=False):
            return

        # Only the first .err file, in os.listdir order, is used.
        errName = next((entry for entry in os.listdir(resultDir)
                        if entry.endswith(".err")), None)
        if not errName:
            return

        fileRoot = os.path.join(resultDir, errName.replace(".err", ""))
        self.curCharacterizeFileRoots.append(fileRoot)

        # Total assembly length of the latest merged cmap, divided by 1e6.
        mergedCmap = mapClasses.multiCmap(varsP.latestMergedCmap,
                                          lengthonly=True)
        self.varsP.totAssemblyLenMb = mergedCmap.totalLength / 1e6
    def checkResults(self):
        """Verify pairwise output and record the list of .align files.

        Returns None on success (also when varsP.ngsBypass is set, in which
        case nothing is checked) and 1 when the align folder is unusable or
        contains no ".align" files. On success the sorted list of align file
        paths is written via varsP.writeListToFile to varsP.alignTarget and
        varsP.stageComplete is set to 'Pairwise'.
        """
        if self.varsP.ngsBypass:
            # Pairwise is skipped completely, so there is nothing to check.
            return

        self.doAllPipeReport()  # loops over self.jobList and calls CheckIfFileFound

        alignDir = self.varsP.alignFolder
        if not util.checkDir(alignDir, makeIfNotExist=False):
            self.varsP.updatePipeReport("ERROR: bad alignFolder:%s\n\n" % alignDir)
            return 1

        # Results are read back from disk rather than from the singleJob
        # instances; either source should give the same answer.
        alignFiles = sorted(os.path.join(alignDir, entry)
                            for entry in os.listdir(alignDir)
                            if entry.endswith(".align"))
        if not alignFiles:
            self.varsP.updatePipeReport("ERROR: no align files in alignFolder %s\n\n" % alignDir)
            return 1

        self.varsP.writeListToFile(alignFiles, self.varsP.alignTarget)
        self.varsP.stageComplete = 'Pairwise'
Ejemplo n.º 6
0
    def checkResults(self):
        """Check the pairwise stage results found in varsP.alignFolder.

        Nothing is checked when varsP.ngsBypass is set (returns None, meaning
        success). Returns 1 after logging an error when the align folder
        fails util.checkDir or holds no file ending in ".align". Otherwise
        the sorted full paths of the .align files are written with
        varsP.writeListToFile and the stage is marked complete.
        """
        pipeVars = self.varsP
        if pipeVars.ngsBypass:
            return  # pairwise skipped entirely; None means success

        # Loops over self.jobList and calls CheckIfFileFound.
        self.doAllPipeReport()

        folder = pipeVars.alignFolder
        folderOk = util.checkDir(folder, makeIfNotExist=False)
        if not folderOk:
            pipeVars.updatePipeReport("ERROR: bad alignFolder:%s\n\n" % folder)
            return 1

        # Read the results from disk (the job objects would work as well).
        alignFiles = [os.path.join(folder, fname)
                      for fname in os.listdir(folder)
                      if fname.endswith(".align")]
        if len(alignFiles) < 1:
            pipeVars.updatePipeReport("ERROR: no align files in alignFolder %s\n\n" % folder)
            return 1

        alignFiles.sort()
        pipeVars.writeListToFile(alignFiles, pipeVars.alignTarget)
        pipeVars.stageComplete = 'Pairwise'
def mergeRcmaps(outFileList, outdir, varsP=None, splitByContig=None, stageName="") :
    """Merge a set of "_r.cmap" files and write the result into outdir.

    outFileList   -- list of file prefixes (the "_r.cmap" suffix is appended)
                     or full paths already ending in "_r.cmap". The list is
                     sorted in place for reproducibility.
    outdir        -- destination directory, checked with util.checkDir.
    varsP         -- optional pipeline variables object; messages go through
                     logOrPrintError(msg, varsP) either way.
    splitByContig -- if < 1 (or None), write each contig separately; if == 1,
                     write only the single merged cmap; if > 1, do both.
    stageName     -- output file prefix. If empty and varsP is supplied,
                     varsP.outputContigPrefix is used; if both are missing the
                     prefix is empty.

    Returns None. Problems are logged as warnings and end the merge early.
    """
    if not util.checkDir(outdir) :
        err_msg = "Warning in AlignModule.mergeRcmaps: could not make outdir %s, skipping copy number" % outdir
        logOrPrintError(err_msg, varsP)
        return

    if not outFileList : #just an argument check--presence on disk is checked below
        err_msg = "Warning in AlignModule.mergeRcmaps: no maps supplied"
        logOrPrintError(err_msg, varsP)
        return

    outFileList.sort() #for reproducibility with runAlignMerge.py (different order when listing dir)
    rsuf = "_r.cmap"

    # A job may have failed, so keep only the maps that actually exist.
    present = []
    for outf in outFileList :
        target = outf if outf.endswith(rsuf) else outf + rsuf #support either form
        if util.checkFile(target) :
            present.append(target)
        else :
            err_msg = "Warning in AlignModule.mergeRcmaps: missing _r.cmap %s" % target
            logOrPrintError(err_msg, varsP)
    if not present :
        err_msg = "Warning in AlignModule.mergeRcmaps: no _r.cmaps found, skipping copy number"
        logOrPrintError(err_msg, varsP)
        return

    mrgstr = (varsP.alignMolvrefMergeName if varsP else "merge")

    # Start from the first map and fold the others into it in memory.
    mergedmap = mc.multiCmap(present[0])
    for rmap in present[1:] : #don't add map 0 to itself
        if mergedmap.addCovOcc( mc.multiCmap(rmap) ) : #a true return value signals failure
            err_msg = "Warning in AlignModule.mergeRcmaps: addCovOcc call failed for map %s" % rmap
            logOrPrintError(err_msg, varsP)

    filepref = (varsP.outputContigPrefix if varsP and stageName == "" else stageName) #see split_XMapQcmap_byContig

    # The default None used to take the "< 1" path only because Python 2
    # orders None below every number; make that choice explicit so the
    # comparisons below never see None.
    if splitByContig is None :
        splitByContig = 0

    # Log one report per output actually written, so that splitByContig > 1
    # no longer drops the per-contig report.
    reports = []
    if splitByContig < 1 or splitByContig > 1 :
        mergedmap.writeAllMapsToDisk( os.path.join(outdir, filepref+'_contig'), outsuf="_r" )
        reports.append("mergeRcmaps: wrote %i cmaps" % len(mergedmap.cmapdict))
    if splitByContig > 0 :
        mergedmap.writeToFile( os.path.join(outdir, filepref+"_"+mrgstr+rsuf) )
        reports.append("mergeRcmaps: wrote merged cmap with %i contigs" % len(mergedmap.cmapdict))
    for report in reports :
        logOrPrintError(report, varsP, warn=False)
Ejemplo n.º 8
0
def mergeMap(varsP, outFileList, mergepath):
    """Merge the .map files of several alignment outputs into one .map file.

    varsP       -- pipeline variables object, or a false value; when false,
                   warnings are printed instead of sent to updatePipeReport
                   and the merged file name uses "merge".
    outFileList -- list of path+prefix strings; each should have a
                   "<prefix>.map" file. Sorted in place for reproducibility.
    mergepath   -- directory in which the merged .map file is written.

    The header (lines starting with "#", "S" or "M") is copied from the first
    existing map only; data lines from all maps are renumbered consecutively
    in their first column. Returns None.
    """
    def warn(message):
        # Route warnings to the pipeline report when available, else stdout.
        if varsP:
            varsP.updatePipeReport(message)
        else:
            print(message)

    outFileList.sort()  #sort to ensure reproducibility (order of entries)
    maplist = []
    for prefix in outFileList:  #these are file prefixes
        mapfile = prefix + ".map"
        if util.checkFile(mapfile):
            maplist.append(mapfile)
        else:
            warn("Warning in AlignModule.mergeMap: missing map: " + mapfile +
                 "\n")

    if not maplist:  #nothing to merge
        return

    if not util.checkDir(mergepath):
        # varsP may be absent here too, so use the same fallback as above.
        warn("Warning in AlignModule.mergeMap: merge path invalid: " +
             mergepath + "\n")
        return

    #last two lines of header start with "Software" and "MappedMoleculeId"
    headstart = ("#", "S", "M")
    headerdone = False
    lineno = 1  #can't just append: need to change index in first column
    sep = "\t"
    mappref = getMergeFilename(outFileList[0])  #also in getAlignStats
    mrgstr = (varsP.alignMolvrefMergeName if varsP else "merge")  #same for vref and not
    outpath = os.path.join(mergepath, mappref + mrgstr + ".map")

    # Context managers close both files even if a read or write fails.
    with open(outpath, 'w') as merged:
        for path in maplist:
            with open(path) as source:
                for line in source:
                    if line.startswith(headstart):
                        if not headerdone:
                            merged.write(line)
                        continue
                    tokens = line.split()
                    if not tokens:
                        continue  #blank line: nothing to renumber
                    tokens[0] = str(lineno)
                    #newline was stripped by split
                    merged.write(sep.join(tokens) + "\n")
                    lineno += 1
            headerdone = True
Ejemplo n.º 9
0
 def generateJobList(self):
     """Create one RefAligner sample-characterization job per bnx file.

     If varsP.bnxTarget exists as a file, the bnx files are read from it with
     parseExperimentFile and each output goes next to its bnx file with a
     "_sampleChar" suffix. Otherwise varsP.bnxFile is the only input and
     output goes to <varsP.localRoot>/sampleChar, which is removed first when
     varsP.wipe is set. Returns early, after reporting an error, when
     bnxTarget yields no bnx files; otherwise ends by calling logArguments.
     """
     sharedArgs = self.varsP.argsListed('noise0') + self.varsP.argsListed('sampleChar')

     if util.checkFile(self.varsP.bnxTarget) : #file exists only if image processing was run
         bnxFiles = parseExperimentFile(self.varsP.bnxTarget)
         if not bnxFiles : #need at least one
             errstr = "ERROR in SampleChar.generateJobList: no bnx files found in: "+self.varsP.bnxTarget
             print(errstr)
             self.varsP.updatePipeReport(errstr+"\n\n")
             return
         basepath = "" #empty means: write next to each bnx file
     else : #otherwise, assume this is the only bnx file
         bnxFiles = [self.varsP.bnxFile]
         basepath = os.path.join(self.varsP.localRoot, "sampleChar")
         if self.varsP.wipe and os.path.isdir(basepath) :
             shutil.rmtree(basepath)
         util.checkDir(basepath) #will make if not exist, but won't remove anything

     for bnxFile in bnxFiles :
         bnxname = os.path.basename(bnxFile).replace(".bnx","")
         jobname = 'Sample_Char_' + bnxname
         if basepath : #bnx input
             outputTarget = os.path.join(basepath, bnxname)
         else : #image processing
             outputTarget = bnxFile.replace(".bnx","") + "_sampleChar"

         jobArgs = [self.varsP.RefAlignerBin, '-i', bnxFile,
                    '-ref', self.varsP.ref, '-o', outputTarget, '-f']
         if self.varsP.stdoutlog :
             jobArgs += ['-stdout', '-stderr']
         jobArgs += ['-maxthreads', str(self.varsP.maxthreads)]
         jobArgs += sharedArgs

         resultFile = outputTarget + '.err' #this is used in checkResults
         self.addJob(mthread.singleJob(jobArgs, jobname, resultFile, jobname,
                                       clusterLogDir=self.varsP.clusterLogDir))
     self.logArguments()
 def generateJobList(self):
     """Build the sample-characterization jobs, one per input bnx file.

     Inputs come from parseExperimentFile(varsP.bnxTarget) when that file
     exists; otherwise the single varsP.bnxFile is used and results are
     written under <varsP.localRoot>/sampleChar. Each job runs
     varsP.RefAlignerBin and is expected to produce "<output>.err". Reports an
     error and returns without adding jobs if bnxTarget lists no bnx files.
     """
     pipeVars = self.varsP
     commonArgs = pipeVars.argsListed('noise0') + pipeVars.argsListed('sampleChar')

     fromImages = util.checkFile(pipeVars.bnxTarget) #file exists only if image processing was run
     if fromImages :
         bnxFiles = parseExperimentFile(pipeVars.bnxTarget)
         if not bnxFiles :
             errstr = "ERROR in SampleChar.generateJobList: no bnx files found in: "+pipeVars.bnxTarget
             print(errstr)
             pipeVars.updatePipeReport(errstr+"\n\n")
             return
         basepath = "" #no common output dir in this case
     else :
         bnxFiles = [pipeVars.bnxFile]
         #results get their own dir; wipe removes an existing one first
         basepath = os.path.join(pipeVars.localRoot, "sampleChar")
         if pipeVars.wipe and os.path.isdir(basepath) :
             shutil.rmtree(basepath)
         util.checkDir(basepath) #will make if not exist, but won't remove anything

     for bnxFile in bnxFiles :
         bnxname = os.path.split(bnxFile)[1].replace(".bnx","")
         jobname = 'Sample_Char_' + bnxname
         outputTarget = (os.path.join(basepath, bnxname) if basepath
                         else bnxFile.replace(".bnx","") + "_sampleChar")
         expectedResultFile = outputTarget + '.err' #this is used in checkResults

         currentArgs = [pipeVars.RefAlignerBin, '-i', bnxFile]
         currentArgs.extend(['-ref', pipeVars.ref, '-o', outputTarget, '-f'])
         if pipeVars.stdoutlog :
             currentArgs.extend(['-stdout', '-stderr'])
         currentArgs.extend(['-maxthreads', str(pipeVars.maxthreads)])
         currentArgs.extend(commonArgs)

         sJob = mthread.singleJob(currentArgs, jobname, expectedResultFile, jobname, clusterLogDir=pipeVars.clusterLogDir)
         self.addJob(sJob)
     self.logArguments()
 def getTargetJobs(self, dormdir=False):
     """Return the list of jobs needed to process every scan of this object.

     dormdir -- when true, the list starts with an 'rm -f -r' job and a
                'mkdir' job for <varsP.localRoot>/<expTag>/ (mkdir contingent
                on rm), and the mkdir job is passed to each scan's
                getDetectJobs. When false, the directory is only checked with
                util.checkDir and None is passed instead.

     Scans whose getDetectJobs returns nothing are reported as skipped and
     contribute no jobs.
     """
     localDataLocation = os.path.join(self.varsP.localRoot, self.expTag + '/')

     if not dormdir :
         util.checkDir(localDataLocation) #will make dir localDataLocation
         allJobs = []
         contingentjob = None
     else :
         removeJob = mthread.singleJob(['rm', '-f', '-r', localDataLocation],
                                       'Pre-Remove Folder: ' + shorten(localDataLocation),
                                       '', 'rmDir')
         makeJob = mthread.singleJob(['mkdir', localDataLocation],
                                     'Make Folder: ' + shorten(localDataLocation),
                                     localDataLocation, 'mkDir')
         makeJob.addContingentJob(removeJob)
         allJobs = [removeJob, makeJob]
         contingentjob = makeJob

     for scan in self.scans :
         scanjobs = scan.getDetectJobs(contingentjob)
         if scanjobs :
             allJobs += scanjobs
             continue
         #no scan jobs means the scan has already been processed
         self.varsP.updatePipeReport("Device.getTargetJobs: skipping path "+scan.nameStr()+"\n")
     return allJobs
    def __init__(self, varsP) :
        """Run RefAligner once on varsP.ref to produce the reference used for SV detect.

        Arguments are taken from the "referenceSvdetect" optArguments section;
        if that section is missing or lacks "-mres", a warning is reported and
        ["-mres", "2.9"] is used instead. The job is run immediately through
        varsP.runJobs. On success varsP.refDeresed is set to the resulting
        .cmap path; on failure an error is logged and processing continues.
        Returns early, before the parent initializer is called, if
        varsP.refFolder cannot be prepared.
        """
        jobName = "reference_process"
        opta_section = "referenceSvdetect"
        mres = "-mres"
        fallbackArgs = [mres, "2.9"]
        self.varsP = varsP

        # NOTE(review): has_key is Python-2-only; kept to preserve behavior.
        if not self.varsP.argData.has_key(opta_section) :
            self.varsP.updatePipeReport("Warning in referenceProcess: optArguments section "+opta_section+" missing\n")
            opta = fallbackArgs
        else :
            opta = self.varsP.argsListed(opta_section)
            if mres not in opta : #must have mres
                self.varsP.updatePipeReport("Warning in referenceProcess: "+mres+" missing in optArguments section "+opta_section+"\n")
                opta = fallbackArgs

        # The mres value, with the dot removed, becomes part of the output name.
        mresstr = opta[opta.index(mres)+1].replace(".","")

        if not util.checkDir(self.varsP.refFolder) :
            self.varsP.updatePipeReport( "ERROR in referenceProcess: could not make output dir %s\n" % self.varsP.refFolder )
            return None

        refNoExt = self.varsP.ref[:self.varsP.ref.rfind(".")]
        refpref = os.path.basename(refNoExt) + "_res" + mresstr
        outarg = os.path.join(self.varsP.refFolder, refpref) #refFolder is the output folder for this job
        expectedResultFile = outarg + ".cmap"

        args = [self.varsP.RefAlignerBin, '-o', outarg, '-i', self.varsP.ref, '-f', '-merge']
        args = args + opta
        if self.varsP.stdoutlog :
            args.extend( ['-stdout', '-stderr'] )
            stdoutf = outarg + ".stdout"
        else :
            stdoutf = None
        args += ['-maxthreads', str(self.varsP.nThreads)]

        super(referenceProcess, self).__init__(self.varsP, jobName, clusterArgs=self.varsP.getClusterArgs("assembly"))

        self.addJob(mthread.singleJob(args, jobName, expectedResultFile, jobName,
                                      maxThreads=self.varsP.nThreads,
                                      clusterLogDir=self.varsP.clusterLogDir,
                                      expectedStdoutFile=stdoutf))

        util.LogStatus("progress", "stage_start", jobName)
        self.varsP.runJobs(self, "referenceProcess")
        self.doAllPipeReport()
        if self.allResultsFound() :
            self.varsP.refDeresed = expectedResultFile #store good result for SV detect
            self.varsP.updatePipeReport( "referenceProcess: using reference %s for svdetect\n" % self.varsP.refDeresed )
        else :
            #this is an error, but processing continues without SV detect
            err = "ERROR in referenceProcess: job failed, disabling SV detect"
            self.varsP.updatePipeReport( err+"\n" )
            util.LogError("error", err)
        util.LogStatus("progress", "stage_complete", jobName)
def mergeMap(varsP, outFileList, mergepath) :
    """outFileList is list of path+prefixes--each should have a .map file:
    merge them to a merged .map file in dir mergepath.

    varsP may be a false value: warnings are then printed rather than sent to
    varsP.updatePipeReport, and "merge" is used in the output file name.
    outFileList is sorted in place. Only the header of the first existing map
    (lines starting with "#", "S" or "M") is kept; data lines of all maps are
    renumbered consecutively in the first column. Returns None.
    """
    def report(message) :
        # varsP is optional, so every warning needs the stdout fallback.
        if varsP :
            varsP.updatePipeReport(message)
        else :
            print(message)

    outFileList.sort() #sort to ensure reproducibility (order of entries)
    maplist = []
    for outprefix in outFileList : #these are file prefixes
        if util.checkFile(outprefix+".map") :
            maplist.append(outprefix+".map")
        else :
            report("Warning in AlignModule.mergeMap: missing map: "+outprefix+".map"+"\n")

    if not maplist : #nothing to merge
        return

    if not util.checkDir(mergepath) :
        report("Warning in AlignModule.mergeMap: merge path invalid: "+mergepath+"\n")
        return

    headstart = ("#", "S", "M") #last two lines of header start with "Software" and "MappedMoleculeId"
    headerdone = False
    lineno = 1 #can't just append: need to change index in first column
    sep = "\t"
    mappref = getMergeFilename(outFileList[0]) #also in getAlignStats
    mrgstr  = (varsP.alignMolvrefMergeName if varsP else "merge") #same for vref and not
    outpath = os.path.join(mergepath, mappref+mrgstr+".map")

    # 'with' guarantees both handles are closed if reading or writing fails.
    with open(outpath, 'w') as f1 :
        for path in maplist :
            with open(path) as f :
                for line in f :
                    if line.startswith(headstart) :
                        if not headerdone : #header is copied from the first map only
                            f1.write(line)
                        continue
                    tokens = line.split()
                    if not tokens : #blank line has no index column to rewrite
                        continue
                    tokens[0] = str(lineno)
                    f1.write(sep.join(tokens)+"\n") #newline was stripped by split
                    lineno += 1
            headerdone = True
    def __init__(self, varsP) :
        """Pick up characterize results that are already on disk.

        Sets self.curCharacterizeFileRoots to an empty list, then looks for a
        ".err" file in <varsP.outputContigFolder>/<varsP.characterizeDirName>.
        When one is found, its path with ".err" removed is appended to the
        list and varsP.totAssemblyLenMb is computed from
        varsP.latestMergedCmap. When the directory check fails or no ".err"
        file exists, returns with the list still empty.
        """
        self.varsP = varsP #bc Characterize uses this for totAssemblyLenMb
        self.curCharacterizeFileRoots = []

        outdir = os.path.join(varsP.outputContigFolder, self.varsP.characterizeDirName)
        if not util.checkDir(outdir, makeIfNotExist=False) :
            return #if this doesn't exist, we can't get what we need

        errFiles = [qfile for qfile in os.listdir(outdir) if qfile.endswith(".err")]
        if not errFiles : #no .err files found, give up
            return

        #just take the first .err file, in listing order
        prefix = errFiles[0].replace(".err","")
        self.curCharacterizeFileRoots.append( os.path.join(outdir, prefix) )

        #also want to get varsP.totAssemblyLenMb
        totalLength = mapClasses.multiCmap(varsP.latestMergedCmap, lengthonly=True).totalLength
        self.varsP.totAssemblyLenMb = totalLength / 1e6
def runAlignMol() :
    """Command-line entry point: align molecules to a cmap, or summarize existing alignments.

    Parses the command line with argparse, validates each argument (printing
    a message and calling sys.exit(1) on the first problem), fills in a
    Pipeline.varsPipeline object by hand and drives AlignModule with it.

    The -q argument selects the mode:
      * a cmap file: alignments are run (a bnx file, -b, and RefAligner, -t,
        are then required), the results are checked, and alignment stats are
        reported if any jobs were generated;
      * a directory: no alignments are run; the alignment results already in
        the directory are listed with getOutFileList and only the alignment
        stats are reported.

    Noise parameters from -E (.errbin) or -e (.err) replace the "noise0"
    parameters loaded from the optArguments file; if both are given, the
    .err file is ignored.
    """
    parser = argparse.ArgumentParser(description=description)

    parser.add_argument('-q', dest='queryDir', help='Path to merged cmap to align molecules (-b) to OR alignmol dir from Pipeline for merge (if latter, no alignments are performed), required', type=str)
    parser.add_argument('-b', dest='bnx', help='Input molecule (.bnx) file, required if aligning molecules', type=str)
    #TODO: also accept a dir containing split bnx pieces for -b; for now, just do single bnx
    parser.add_argument('-a', dest='optArguments', help='Path to optArguments.xml (optional, default optArguments_human.xml in Pipeline dir if found, otherwise required)', default="", type=str)
    parser.add_argument('-r', help='If this flag is used, alignmolvref arguments are used, otherwise alignmol arguments are used (default alignmol; optional)', dest='ref', action='store_true')
    parser.add_argument('-o', dest='outputDir', help='output dir (optional, defaults to sub-dir of input map dir called "alignmol")', default="", type=str)
    parser.add_argument('-t', dest='RefAligner', help='Path to RefAligner or dir containing it (required)', type=str) 
    parser.add_argument('-T', dest='numThreads', help='Total number of threads (cores) to use (optional, default 4)', default=4, type=int)
    parser.add_argument('-j', dest='maxthreads', help='Threads per Job, -maxthreads (non-cluster only;optional, default 4)', default=4, type=int)
    parser.add_argument('-e', dest='errFile', help='.err file to use for noise parameters--will supersede noise parameters in the optArgument supplied (but that file must still be supplied for non-noise parameters)--should be from autoNoise', default="", type=str)
    parser.add_argument('-E', dest='errbinFile', help='.errbin file to use for noise parameters--will supersede noise parameters in the optArgument supplied (but that file must still be supplied for non-noise parameters)--should be from autoNoise', default="", type=str)
    parser.add_argument('-p', dest='pipelineDir', help='Pipeline dir (optional, defaults to script dir, or current directory)', default="", type=str)
    parser.add_argument('-v', dest='pvalue', help='Alignment pvalue', default="1e-12")
    result = parser.parse_args()

    #check all Pipeline dependencies
    if result.pipelineDir :
        cwd = result.pipelineDir
    else :
        cwd = os.path.split(os.path.realpath(__file__))[0] #this is path of this script
        if not os.path.isfile(os.path.join(cwd,"utilities.py")) : #if still not here, last try is actual cwd
            cwd = os.getcwd() #still check this below

    #this is the only one imported here and in runCharacterize
    if not os.path.isfile(os.path.join(cwd,"utilities.py")):
        print "ERROR: utilities.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    #the file checks here and below look in cwd, but import searches sys.path: make
    #sure cwd is searched too (appended last, so anything that already resolved is unaffected)
    if cwd not in sys.path :
        sys.path.append(cwd)
    import utilities as util

    if not os.path.isfile(os.path.join(cwd,"AlignModule.py")):
        print "ERROR: AlignModule.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import AlignModule as alignmod

    if not util.checkFile(os.path.join(cwd,"Pipeline.py")):
        print "ERROR: Pipeline.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import Pipeline

    #input dir: a dir means summarize existing results, a cmap means run alignments
    if not result.queryDir :
        print "ERROR: Query (-q) argument not supplied."
        sys.exit(1)
    qrypath = os.path.realpath(result.queryDir)
    if util.checkDir(qrypath, checkWritable=False, makeIfNotExist=False) : #output elsewhere so not writeable is ok
        runaligns = False
    elif util.checkCmap(qrypath) :
        runaligns = True
    else :
        print "ERROR: Query argument ("+qrypath+") not found or not a dir or cmap. Check -q argument."
        sys.exit(1)

    #RefAligner -- check for either path to RefAligner, or dir containing it, depending on cluster args
    rabin = "" #need empty string for generateJobList even though no jobs are run
    if runaligns :
        #replicate Pipeline behavior: RefAligner is always required
        rabin = result.RefAligner or "" #-t has no default; None would make os.path.isdir raise TypeError
        if os.path.isdir(rabin) :
            rabin = os.path.join(rabin, "RefAligner")
        if not util.checkExecutable(rabin):
            print "ERROR: RefAligner not found or not executable at", rabin, "\nPlease supply RefAligner dir or full path as -t arg."
            sys.exit(1)

    #optargs file
    optargs = None
    if runaligns and result.optArguments : #supplied on command line
        optargs = result.optArguments
        if not util.checkFile(optargs, ".xml") :
            print "optArguments path is supplied ("+optargs+") but not found or doesn't end in .xml, check -a argument."
            sys.exit(1)
    elif runaligns : #load from Pipeline dir if running alignments
        optargs = os.path.join(cwd,"optArguments_human.xml")
        if not util.checkFile(optargs):
            print "optArguments.xml missing in Pipeline directory ("+cwd+"). Try supplying path explicitly using -a."
            sys.exit(1)

    #output dir
    #NOTE(review): the -o help text says the default sub-dir is "alignmol" but "merge" is
    #used here, and when -q is a cmap file this default is a path below a file -- confirm
    #the intended default
    if not result.outputDir :
        outdir = os.path.join(qrypath, "merge") #should be same as in AlignModule
    else :
        outdir = os.path.realpath(result.outputDir)
    if os.path.isdir(outdir) :
        if not util.checkDir(outdir) : #check writeable
            print "\nERROR: Output dir is not writeable:\n", outdir, "\n"                
            sys.exit(1)
        #output dir being the same as the input dir is ok here
        print "\nWARNING: Output dir already exists, results will be overwritten:\n", outdir, "\n"
    elif not util.checkDir(outdir) : #does not exist, make, if False, can't make or not writeable
        print "\nERROR: Output dir cannot be created or is not writeable:\n", outdir, "\n"
        sys.exit(1)
    
    #bnx file
    bnxfile = result.bnx
    if bnxfile : #must check for empty string BEFORE you do realpath, or it returns cwd
        bnxfile = os.path.realpath(bnxfile)
        if not util.checkFile(bnxfile, ".bnx") :
            print "ERROR: bnx file supplied but not found or incorrect suffix:", bnxfile
            sys.exit(1)
    elif runaligns :
        print "ERROR: bnx file not supplied but running alignments; please supply bnx file as -b argument"
        sys.exit(1)

    #nthreads
    nthreads = result.numThreads
    if nthreads <= 0 :
        print "ERROR: Number of threads value invalid (must be > 0): %i" % nthreads
        sys.exit(1)

    #maxthreads
    maxthreads = result.maxthreads
    if maxthreads <= 0 :
        print "ERROR: Max threads value invalid (must be > 0): %i" % maxthreads
        sys.exit(1)
    elif nthreads < maxthreads :
        print "Warning: num threads (-T: %i) < max threads (-j: %i): increasing num threads to equal max threads\n" % (nthreads, maxthreads)
        nthreads = maxthreads

    #pvalue: fall back to the default if an empty value was passed on the command line
    pvalue = result.pvalue or "1e-12"

    #.errbin file
    errbinfile = result.errbinFile
    if errbinfile :
        errbinfile = os.path.realpath(result.errbinFile)
        if not util.checkFile(errbinfile, ".errbin") :
            print "ERROR: errbin file supplied but not found or incorrect suffix:", errbinfile
            sys.exit(1)

    #.err file
    errfile = result.errFile
    if errfile and errbinfile :
        print "Warning: .err and .errbin arguments supplied; ignoring .err file"
        errfile = ""
    elif errfile :
        errfile = os.path.realpath(result.errFile)
        if not util.checkFile(errfile, ".err") :
            print "err file supplied but not found or incorrect suffix:", errfile
            sys.exit(1)

    #SampleCharModule is only needed to read noise parameters from a .err file
    if errfile and not util.checkFile(os.path.join(cwd,"SampleCharModule.py")):
        print "SampleCharModule.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    elif errfile :
        import SampleCharModule as scm

    doref = result.ref

    #DONE checking arguments

    print "Using output dir", outdir
    if runaligns :
        print "Aligning", bnxfile, "\nTo", qrypath, "\n"
    else :
        print "Merging", qrypath, "\n"

    memory_log = os.path.join(outdir, "memory_log.txt")
    util.initMemoryLog(memory_log)

    varsP = Pipeline.varsPipeline()
    varsP.RefAlignerBin        = rabin
    varsP.contigFolder         = "" #not used but needs to be an attr
    varsP.outputContigFolder   = "" #not used but needs to be a string attr
    varsP.pipeReportFile = os.path.join(outdir, "alignmol_jobs_log.txt")
    varsP.infoReportFile = os.path.join(outdir, "alignmol_log.txt")
    util.InitStatus( os.path.join(outdir, "status.xml") )

    if runaligns :
        varsP.optArgumentsFileIn   = optargs
        varsP.latestMergedCmap     = qrypath #if !doref, need this one
        varsP.ref                  = qrypath #and if doref, need this one
        varsP.nThreads             = nthreads #necessary otherwise job won't start -- max threads per node
        varsP.maxthreads           = maxthreads #threads per job
        p = os.path.split(qrypath)[1]
        varsP.outputContigPrefix   = p[:p.rfind(".")] #filename prefix
        varsP.stdoutlog    = True #use -stdout -stderr

        varsP.memoryLogpath  = memory_log
        varsP.parseArguments() #parses optArgumentsFile
        varsP.replaceParam("alignmol", "-T", pvalue)
        varsP.checkDependencies()
        varsP.RefAlignerBinOrig = rabin
        varsP.prerunLog() #general information in log -- needed for refaligner_version

        noisep = {}
        if errbinfile :
            noisep = {"readparameters": errbinfile}
        elif errfile :
            noisep = scm.readNoiseParameters(errfile.replace(".err",""))
            if noisep.has_key('readparameters') : #remove this because it's redundant, and it can cause problems with RefAligner compatibility
                del noisep['readparameters']
            if not noisep : #readNoiseParameters returns empty dict on failure
                print "ERROR reading noise parameters, check .err file:", errfile
                sys.exit(1)
            #redundant with below?
            print "Using noise parameters from "+errfile+":\n" + " ".join(["-"+str(k)+" "+str(v) for k,v in noisep.iteritems()])+"\n"

        #some code from SampleCharModule to load args into noise0
        infoReport="Loaded noise parameters:\n"
        klist = ["FP", "FN", "sf", "sd", "sr", "bpp", "readparameters"] #hardcoding parameters is kind of bad, but it fixes the order without using OrderedDict.
        for v in klist :
            if not noisep.has_key(v) :
                continue
            param=str(noisep[v])
            util.LogStatus("parameter", "auto_"+v, param)
            infoReport+=v+":"+param+"\n"
            varsP.replaceParam("noise0", "-"+v, param)
        varsP.updateInfoReport(infoReport + '\n', printalso=True)

    else :
        print "Getting file list from", qrypath
        outFileList = getOutFileList(util, qrypath)
        if not outFileList :
            print "ERROR: Query dir ("+qrypath+") does not contain alignmol data. Check -q argument."
            sys.exit(1)
        else :
            print "Found", len(outFileList), "alignment results"
    #end if runaligns

    amod = alignmod.AlignModule(varsP, doref, outdir, bnxfile) #constructor will call generateJobList

    if runaligns :
        amod.runJobs()
        amod.checkResults()
    else :
        amod.outFileList = outFileList
        p = os.path.split(outFileList[0])[1]
        if p.count("_") > 1 : #expect something like "EXP_REFINEFINAL1_4"
            p = p[:p.rfind("_")] #remove integer suffix (and underscore)
        varsP.outputContigPrefix = p

    #when aligning, only report stats if jobs were actually generated
    if not runaligns or len(amod.jobList) > 0 :
        amod.getAlignStats()

    if runaligns :
        print
        #copy from Pipeline.py
        if util.SummarizeErrors(varsP=varsP)==0:
            varsP.updatePipeReport("Pipeline has successfully completed\n") 
            util.LogStatus("progress", "pipeline", "success")
        else:
            varsP.updatePipeReport("Pipeline has completed with errors\n") 
            util.LogStatus("progress", "pipeline", "failure")
Ejemplo n.º 16
0
def _mean_or_zero(values):
    """Return sum(values) / len(values), or 0 when values is empty."""
    return sum(values) / len(values) if values else 0


def getAlignStats(varsP,
                  outFileList,
                  reflen=0,
                  isref=False,
                  mergepath="",
                  bnxpath=None):
    """Standalone fn for alignment statistics for both AlignModule and AlignRefModule.

    varsP       -- pipeline variables object; used for reporting and for the
                   sorted bnx path / bnx_sort arguments.
    outFileList -- alignment output file prefixes; ".xmap" and ".err" are
                   appended to each.
    reflen      -- reference length, should be in Mb; coverage lines are only
                   reported when it is non-zero.
    isref       -- True to label the report "Reference"/"Ref" instead of
                   "Assembly"/"Contig".
    mergepath   -- if supplied (and at least one .err was read), an averaged
                   .err file is written to this dir.
    bnxpath     -- if None, assume varsP.sorted_file + ".bnx" (or skip the
                   molecule stats when varsP.sorted_file is empty); otherwise
                   just report molecule stats of this file and ignore
                   outFileList.

    Returns None; all results go to the varsP reports and, optionally, to the
    merged .err file.
    """

    statonly = False  #bnx stats only
    skipbnx = False  #.err file processing only
    if bnxpath is None:
        if not varsP.sorted_file:  #for runAlignMol, this is empty: nothing to do in this case
            skipbnx = True
        else:
            bnxpath = varsP.sorted_file + ".bnx"  #set in PairwiseModule.sort_BNX even if bypassed, but needs suffix
    else:
        statonly = True
    if not skipbnx and not util.checkFile(bnxpath):
        varsP.updatePipeReport(
            "Warning in AlignModule.getAlignStats: bnxpath supplied but not found: %s\n"
            % bnxpath)
        return

    #find the minlen used for bnx_sort, which is a required arg set
    sortargs = []
    if varsP.argData.has_key('bnx_sort'):  #for runAlignMol.py
        sortargs = varsP.argsListed('bnx_sort')
    minlen = 0
    validminlen = False
    if "-minlen" in sortargs:
        #next ele should be the len, if next ele isn't in list, the sort job will fail
        minlen = util.getIntFromString(
            sortargs[sortargs.index("-minlen") + 1]
        )  #returns None if can't cast to int
        if minlen:
            validminlen = True

    if not validminlen and bnxpath is None and sortargs:
        varsP.updatePipeReport(
            "Warning in AlignModule.getAlignStats: unable to obtain minlen from bnx_sort arguments; defaulting to 0\n"
        )
    #NOTE(review): bnxpath was already replaced by the sorted bnx path above, so this
    #resets minlen whenever any bnx file is read, not only when the caller supplied
    #bnxpath -- confirm whether "if statonly" was intended
    if bnxpath is not None:  #if bnxpath, ignore minlen
        minlen = 0

    nmol = 0  #total n mol above minlen
    totlen = 0  #total mol len above minlen
    if util.checkFile(bnxpath):
        outstr = "Reading molecule stats from %s:\n" % bnxpath
        outstr += "Molecule Stats:\n"
        moldict = util.simpleBnxStats(bnxpath, minlen)
        nmol = moldict["nmol"]
        totlen = moldict["totlen"]
        #this is the same for isref or not, but just print twice bc no easy way to tell if was printed previously
        outstr += "N mols: %i\n" % nmol
        outstr += ("Total len (Mb): %10.3f\n") % totlen
        outstr += ("Avg len (kb)  : %10.3f\n") % moldict["avglen"]
        outstr += ("Mol N50 (kb)  : %10.3f\n") % moldict["n50"]
        outstr += ("Lab (/100kb)  : %10.3f\n") % moldict["labdensity"]

        if reflen:
            cov = totlen / reflen  #totlen is in Mb
            outstr += ("%-6s Cov (x): %10.3f\n") % ("Ref" if isref else
                                                    "Contig", cov)
        if isref or reflen or statonly:  #if neither, nothing to print
            varsP.updateInfoReport(outstr + "\n", printalso=True)
    elif not skipbnx:
        varsP.updatePipeReport(
            "Warning in AlignModule.getAlignStats: missing bnx path:" +
            bnxpath + "\n")

    if statonly:
        return

    #lastly, load .xmaps and .errs from alignmol jobs and report on stats
    totmaplen = 0  #sum of lengths of mapped portions of all molecules, on reference
    totmapqrylen = 0  #sum of lengths of mapped portions of all molecules, on query
    totconf = 0  #sum of confidence of all alignments
    nalign = 0  #total number of alignments
    #attributes of each .err (alignParams) object that are averaged over all files
    avgfields = ("fp", "fprate", "fn", "bpp", "res", "llrm", "llrgm", "bppsd",
                 "sf", "sd", "sr", "ressd")
    samples = dict((name, []) for name in avgfields)  #one list of values per field
    nmaplist = []  #from .err; summed, not averaged
    gmaplist = []  #from .err; summed, not averaged
    err = None  #will be the alignParams object if any .err files are found
    mappref = ""
    if len(outFileList) > 0:
        mappref = getMergeFilename(
            outFileList[0]
        )  #make function to unify with same convention in mergeMap
    for outpath in outFileList:  #these are file prefixes
        if util.checkFile(outpath + ".xmap"):
            xmap = mc.xmap(outpath + ".xmap")
            nalign += len(xmap.xmapLookup)
            totmaplen += xmap.getSumMappedRefLen()  #in kb
            totmapqrylen += xmap.getSumMappedQryLen()  #in kb
            totconf += sum(x.Confidence for x in xmap.xmapLookup.values())
        else:
            varsP.updatePipeReport(
                "Warning in AlignModule.getAlignStats: missing xmap:" +
                outpath + ".xmap" + "\n")
        if util.checkFile(outpath + ".err"):
            err = mc.alignParams(outpath + ".err")
            for name in avgfields:
                samples[name].append(getattr(err, name))
            nmaplist.append(err.nmaps)
            gmaplist.append(err.goodmaps)

    #nalign from xmap should be the same as goodmaps from .err
    sumgoodmaps = sum(gmaplist)
    if sumgoodmaps != nalign:
        varsP.updateInfoReport(
            "Warning in getAlignStats: n mol align differ in .err files (%i) and .xmaps (%i)\n"
            % (sumgoodmaps, nalign),
            printalso=True)
    if totmaplen or totconf or nalign:
        outstr = "Molecules Aligned to %s:\n" % ("Reference"
                                                 if isref else "Assembly")
        outstr += "N mol align       : %9i\n" % nalign
        outstr += "Mol fraction align: %13.3f\n" % (float(nalign) /
                                                    nmol if nmol else 0)
        outstr += "Tot align len (Mb): %11.1f\n" % (totmapqrylen / 1e3)  #Mb
        if reflen > 0:
            outstr += ("Effective Cov (x) : %13.3f\n") % (
                totmaplen / 1e3 / reflen)  #totmaplen is in kb
        outstr += "Avg align len (kb): %11.1f\n" % (totmapqrylen /
                                                    nalign if nalign else 0)
        outstr += "Fraction align len: %13.3f\n" % (
            totmapqrylen / 1e3 / totlen if totlen else 0
        )  #totmapqrylen is in kb, totlen is in mb
        outstr += "Tot confidence    : %11.1f\n" % totconf
        outstr += "Avg confidence    : %11.1f\n" % (totconf /
                                                    nalign if nalign else 0)
        varsP.updateInfoReport(outstr, printalso=True)

    #per-field average over all .err files read (0 if none were read)
    avg = dict((name, _mean_or_zero(samples[name])) for name in avgfields)
    if avg["fp"] or avg["fn"] or avg["bpp"]:
        outstr = "Avg FP(/100kb)    : %12.2f\n" % avg["fp"]
        outstr += "Avg FP ratio      : %13.3f\n" % avg["fprate"]
        outstr += "Avg FN ratio      : %13.3f\n" % avg["fn"]
        outstr += "Avg bpp           : %11.1f\n" % avg["bpp"]
        outstr += "Avg sf            : %13.3f\n" % avg["sf"]
        outstr += "Avg sd            : %13.3f\n" % avg["sd"]
        outstr += "Avg sr            : %13.3f\n" % avg["sr"]
        varsP.updateInfoReport(outstr + "\n", printalso=True)
    if err and mergepath:  #have an error file (alignParams) object
        util.checkDir(mergepath)
        mrgstr = (varsP.alignMolvrefMergeName if varsP else "merge")
        outpath = os.path.join(mergepath, mappref + mrgstr + ".err")
        #reuse the last alignParams object read as the container for the merged values
        for name in avgfields:
            setattr(err, name, avg[name])
        err.nmaps = sum(nmaplist)
        err.goodmaps = sumgoodmaps
        err.writeToFile(outpath)
Ejemplo n.º 17
0
    def __init__(self, varsP):
        """Run a single RefAligner "-merge" job on varsP.ref for SV detection.

        The RefAligner options come from the "referenceSvdetect" section of the
        optArguments; if that section is missing, or does not contain "-mres",
        a warning is reported and ["-mres", "2.9"] is used instead. The output
        prefix is the reference file name (extension removed) plus "_res" and
        the -mres value with its "." removed, inside varsP.refFolder.

        On success the resulting .cmap path is stored in varsP.refDeresed; on
        failure an error is reported and logged. If varsP.refFolder cannot be
        made, an error is reported and the constructor returns early, before
        the parent constructor has been called.
        """
        self.varsP = varsP
        stage = "reference_process"
        section = "referenceSvdetect"
        mresflag = "-mres"

        #opta stays None until a usable argument list has been found
        opta = None
        if not self.varsP.argData.has_key(section):  #check if in optargs
            self.varsP.updatePipeReport(
                "Warning in referenceProcess: optArguments section " +
                section + " missing\n")
        else:
            listed = self.varsP.argsListed(section)
            if mresflag in listed:  #must have mres
                opta = listed
            else:
                self.varsP.updatePipeReport(
                    "Warning in referenceProcess: " + mresflag +
                    " missing in optArguments section " + section + "\n")
        if opta is None:
            opta = [mresflag, "2.9"]  #default mres

        #string for mres value, without the ".", for the output name
        mresvalue = opta[opta.index(mresflag) + 1]
        mrestag = mresvalue.replace(".", "")

        refdir = self.varsP.refFolder  #refFolder is new output folder for this job
        if not util.checkDir(refdir):
            self.varsP.updatePipeReport(
                "ERROR in referenceProcess: could not make output dir %s\n" %
                self.varsP.refFolder)
            return

        refpath = self.varsP.ref
        refname = os.path.basename(refpath[:refpath.rfind(".")])
        outprefix = os.path.join(self.varsP.refFolder,
                                 refname + "_res" + mrestag)
        resultcmap = outprefix + ".cmap"  #if ref is spots, is this spots?

        cmd = [self.varsP.RefAlignerBin, '-f', '-o', outprefix]
        cmd += ['-i', self.varsP.ref, '-merge']
        cmd += opta
        stdoutpath = None
        if self.varsP.stdoutlog:
            cmd += ['-stdout', '-stderr']
            stdoutpath = outprefix + ".stdout"
        cmd += ['-maxthreads', str(self.varsP.nThreads)]

        super(referenceProcess, self).__init__(
            self.varsP,
            stage,
            clusterArgs=self.varsP.getClusterArgs("assembly"))

        self.addJob(
            mthread.singleJob(cmd,
                              stage,
                              resultcmap,
                              stage,
                              maxThreads=self.varsP.nThreads,
                              clusterLogDir=self.varsP.clusterLogDir,
                              expectedStdoutFile=stdoutpath))

        util.LogStatus("progress", "stage_start", stage)
        self.varsP.runJobs(self, "referenceProcess")
        self.doAllPipeReport()
        if self.allResultsFound():
            self.varsP.refDeresed = resultcmap  #store good result for SV detect
            self.varsP.updatePipeReport(
                "referenceProcess: using reference %s for svdetect\n" %
                self.varsP.refDeresed)
        else:
            #this is an error, but processing continues without SV detect;
            #no need to clear varsP.runSV since this class is used in SVModule
            failure = "ERROR in referenceProcess: job failed, disabling SV detect"
            self.varsP.updatePipeReport(failure + "\n")
            util.LogError("error", failure)
        util.LogStatus("progress", "stage_complete", stage)
Ejemplo n.º 18
0
def mergeRcmaps(outFileList,
                outdir,
                varsP=None,
                splitByContig=None,
                stageName=""):
    """Merge the "_r.cmap" files of a set of alignment jobs and write the result to outdir.

    outFileList   -- list of file prefixes ("_r.cmap" is appended) or of full
                     paths already ending in "_r.cmap". NOTE: the list is
                     sorted in place, so the caller's list is reordered.
    outdir        -- output dir; it is created if necessary.
    varsP         -- if supplied, messages go to its reports; otherwise to
                     stdout (via logOrPrintError).
    splitByContig -- if < 1 (or None, the default), output each contig
                     separately; if == 1, only output a single merged cmap;
                     if > 1, do both.
    stageName     -- output file prefix. Always used if supplied; if not,
                     varsP must be supplied (its outputContigPrefix is used),
                     otherwise the prefix is empty.

    Missing inputs are reported as warnings and skipped; if outdir cannot be
    made or no input is found, a warning is reported and nothing is written.
    Returns None.
    """

    if not util.checkDir(outdir):
        err_msg = "Warning in AlignModule.mergeRcmaps: could not make outdir %s, skipping copy number" % outdir
        logOrPrintError(err_msg, varsP)
        return

    if not outFileList:  #just an argument check--check for presence on disk is below
        err_msg = "Warning in AlignModule.mergeRcmaps: no maps supplied"
        logOrPrintError(err_msg, varsP)
        return

    #for reproducibility with runAlignMerge.py (different order when listing dir)
    outFileList.sort()
    rsuf = "_r.cmap"
    #even though outFileList should all be there, a job may have failed--check all, just existence
    present = []
    for outf in outFileList:
        target = (outf if outf.endswith(rsuf) else outf + rsuf)  #support either form
        if not util.checkFile(target):
            err_msg = "Warning in AlignModule.mergeRcmaps: missing _r.cmap %s" % target
            logOrPrintError(err_msg, varsP)
        else:
            present.append(target)
    if not present:  #no _r.cmaps found
        err_msg = "Warning in AlignModule.mergeRcmaps: no _r.cmaps found, skipping copy number"
        logOrPrintError(err_msg, varsP)
        return

    mrgstr = (varsP.alignMolvrefMergeName if varsP else "merge")

    mergedmap = mc.multiCmap(present[0])  #open original, edit in memory
    #now add other maps
    for rmap in present[1:]:  #don't add map 0 to itself
        #when calling addCovOcc, check return, warn if True
        if mergedmap.addCovOcc(mc.multiCmap(rmap)):
            err_msg = "Warning in AlignModule.mergeRcmaps: addCovOcc call failed for map %s" % rmap
            logOrPrintError(err_msg, varsP)

    #now it's merged, but the resulting map need to be written back to disk
    filepref = (
        varsP.outputContigPrefix if varsP and stageName == "" else stageName
    )  #see split_XMapQcmap_byContig

    #None used to select the per-contig output only because None sorts below every
    #number in Python 2; make that explicit instead of relying on the ordering
    splitmode = 0 if splitByContig is None else splitByContig
    if splitmode < 1 or splitmode > 1:
        mergedmap.writeAllMapsToDisk(os.path.join(outdir,
                                                  filepref + '_contig'),
                                     outsuf="_r")
        report = "mergeRcmaps: wrote %i cmaps" % len(mergedmap.cmapdict)
    if splitmode > 0:
        mergedmap.writeToFile(
            os.path.join(outdir, filepref + "_" + mrgstr + rsuf))
        #when both outputs are written, only this last message is reported
        report = "mergeRcmaps: wrote merged cmap with %i contigs" % len(
            mergedmap.cmapdict)
    #report result
    logOrPrintError(report, varsP, warn=False)
def getAlignStats(varsP, outFileList, reflen=0, isref=False, mergepath="", bnxpath=None):
    '''Standalone fn for alignment statistics for both AlignModule and AlignRefModule.

    Two reports are produced through varsP.updateInfoReport:
      1. molecule statistics of the input bnx (util.simpleBnxStats), and
      2. alignment statistics accumulated over the <prefix>.xmap and
         <prefix>.err files of every prefix in outFileList.

    Arguments:
      varsP       -- Pipeline variables object; sorted_file, argData, argsListed
                     and alignMolvrefMergeName are read, updatePipeReport and
                     updateInfoReport are used for output.
      outFileList -- list of alignment output file prefixes (no suffix).
      reflen      -- reference length; should be in Mb. If 0, coverage lines
                     are omitted.
      isref       -- only changes report labels (Reference vs Assembly/Contig).
      mergepath   -- if supplied (and at least one .err was loaded), a merged
                     .err holding the averaged/summed values is written there.
      bnxpath     -- if None, varsP.sorted_file+".bnx" is used; otherwise only
                     the stats of this file are reported and outFileList is
                     ignored.

    Returns None. Missing input files are reported as warnings, not raised.
    '''

    def averageOrZero(values):
        #mean of a list; 0 for an empty list so the reports never divide by zero
        return sum(values)/len(values) if values else 0

    statonly = bnxpath is not None  #caller supplied a bnx: bnx stats only
    skipbnx = False  #no bnx at all: .xmap/.err processing only
    if bnxpath is None:
        if varsP.sorted_file:
            #set in PairwiseModule.sort_BNX even if bypassed, but needs suffix
            bnxpath = varsP.sorted_file + ".bnx"
        else:  #for runAlignMol, this is empty: nothing to do in this case
            skipbnx = True
    if not skipbnx and not util.checkFile(bnxpath):
        varsP.updatePipeReport("Warning in AlignModule.getAlignStats: bnxpath supplied but not found: %s\n" % bnxpath)
        return

    #find the minlen used for bnx_sort, which is a required arg set
    sortargs = []
    if 'bnx_sort' in varsP.argData:  #for runAlignMol.py
        sortargs = varsP.argsListed('bnx_sort')
    minlen = 0
    validminlen = False
    if "-minlen" in sortargs:
        validx = sortargs.index("-minlen") + 1  #next ele should be the len
        if validx < len(sortargs):  #a dangling -minlen is treated as "no valid minlen" instead of raising IndexError
            minlen = util.getIntFromString(sortargs[validx])  #returns None if can't cast to int
            if minlen:
                validminlen = True

    if not validminlen and bnxpath is None and sortargs:
        varsP.updatePipeReport("Warning in AlignModule.getAlignStats: unable to obtain minlen from bnx_sort arguments; defaulting to 0\n")
    #NOTE(review): bnxpath is also non-None when it was derived from
    #varsP.sorted_file above, so minlen is 0 whenever a bnx is actually read;
    #presumably fine because the sorted bnx is already length-filtered -- confirm
    if bnxpath is not None:  #if bnxpath, ignore minlen
        minlen = 0

    nmol = 0  #total n mol above minlen
    totlen = 0  #total mol len above minlen
    if not skipbnx and util.checkFile(bnxpath):  #skipbnx means bnxpath is None: do not hand None to checkFile
        moldict = util.simpleBnxStats(bnxpath, minlen)
        nmol = moldict["nmol"]
        totlen = moldict["totlen"]
        #same text for isref or not; may be printed twice bc no easy way to tell if it was printed previously
        outstr = "Reading molecule stats from %s:\n" % bnxpath
        outstr += "Molecule Stats:\n"
        outstr += "N mols: %i\n" % nmol
        outstr += ("Total len (Mb): %10.3f\n") % totlen
        outstr += ("Avg len (kb)  : %10.3f\n") % moldict["avglen"]
        outstr += ("Mol N50 (kb)  : %10.3f\n") % moldict["n50"]
        outstr += ("Lab (/100kb)  : %10.3f\n") % moldict["labdensity"]
        if reflen:
            cov = totlen / reflen  #totlen is in Mb
            outstr += ("%-6s Cov (x): %10.3f\n") % ("Ref" if isref else "Contig", cov)
        if isref or reflen or statonly:  #if neither, nothing to print
            varsP.updateInfoReport(outstr + "\n", printalso=True)
    elif not skipbnx:
        varsP.updatePipeReport("Warning in AlignModule.getAlignStats: missing bnx path:"+bnxpath+"\n")

    if statonly:
        return

    #lastly, load .xmaps and .errs from alignmol jobs and report on stats
    totmaplen = 0  #sum of lengths of mapped portions of all molecules, on reference
    totmapqrylen = 0  #sum of lengths of mapped portions of all molecules, on query
    totconf = 0  #sum of confidence of all alignments
    nalign = 0  #total number of alignments
    #attributes collected from every .err (alignParams) object, one list per attribute
    errfields = ("fp", "fprate", "fn", "bpp", "res", "nmaps", "goodmaps",
                 "llrm", "llrgm", "bppsd", "sf", "sd", "sr", "ressd")
    errvals = dict((field, []) for field in errfields)
    err = None  #will be the alignParams object if any .err files are found
    mappref = ""
    if len(outFileList) > 0:
        mappref = getMergeFilename(outFileList[0])  #unify with same convention in mergeMap
    for outpath in outFileList:  #these are file prefixes
        if util.checkFile(outpath+".xmap"):
            xmap = mc.xmap(outpath+".xmap")
            nalign += len(xmap.xmapLookup)
            totmaplen += xmap.getSumMappedRefLen()  #in kb
            totmapqrylen += xmap.getSumMappedQryLen()  #in kb
            totconf += sum([x.Confidence for x in xmap.xmapLookup.values()])
        else:
            varsP.updatePipeReport("Warning in AlignModule.getAlignStats: missing xmap:"+outpath+".xmap"+"\n")
        if util.checkFile(outpath+".err"):
            err = mc.alignParams(outpath+".err")
            for field in errfields:
                errvals[field].append(getattr(err, field))

    #nalign from xmap should be the same as goodmaps from .err
    sumgoodmaps = sum(errvals["goodmaps"])
    if sumgoodmaps != nalign:
        varsP.updateInfoReport("Warning in getAlignStats: n mol align differ in .err files (%i) and .xmaps (%i)\n" % (sumgoodmaps, nalign), printalso=True)
    if totmaplen or totconf or nalign:
        outstr =  "Molecules Aligned to %s:\n" % ("Reference" if isref else "Assembly")
        outstr += "N mol align       : %9i\n" % nalign
        outstr += "Mol fraction align: %13.3f\n" % (float(nalign)/nmol if nmol else 0)
        outstr += "Tot align len (Mb): %11.1f\n" % (totmapqrylen / 1e3)  #Mb
        if reflen > 0:
            outstr += ("Effective Cov (x) : %13.3f\n") % (totmaplen / 1e3 / reflen)  #totmaplen is in kb, reflen in Mb
        outstr += "Avg align len (kb): %11.1f\n" % (totmapqrylen/nalign if nalign else 0)
        outstr += "Fraction align len: %13.3f\n" % (totmapqrylen/1e3/totlen if totlen else 0)  #totmapqrylen is in kb, totlen is in mb
        outstr += "Tot confidence    : %11.1f\n" % totconf
        outstr += "Avg confidence    : %11.1f\n" % (totconf/nalign if nalign else 0)
        varsP.updateInfoReport(outstr, printalso=True)

    #values for the merged .err: averages, except the two map counts which are totals
    merged = dict((field, averageOrZero(errvals[field])) for field in errfields)
    merged["nmaps"] = sum(errvals["nmaps"])
    merged["goodmaps"] = sumgoodmaps
    if merged["fp"] or merged["fn"] or merged["bpp"]:
        outstr =  "Avg FP(/100kb)    : %12.2f\n" % merged["fp"]
        outstr += "Avg FP ratio      : %13.3f\n" % merged["fprate"]
        outstr += "Avg FN ratio      : %13.3f\n" % merged["fn"]
        outstr += "Avg bpp           : %11.1f\n" % merged["bpp"]
        outstr += "Avg sf            : %13.3f\n" % merged["sf"]
        outstr += "Avg sd            : %13.3f\n" % merged["sd"]
        outstr += "Avg sr            : %13.3f\n" % merged["sr"]
        varsP.updateInfoReport(outstr + "\n", printalso=True)
    if err and mergepath:  #have an error file (alignParams) object
        util.checkDir(mergepath)
        mrgstr = (varsP.alignMolvrefMergeName if varsP else "merge")
        outpath = os.path.join(mergepath, mappref+mrgstr+".err")
        #reuse the last loaded alignParams object as the container for the merged values
        for field in errfields:
            setattr(err, field, merged[field])
        err.writeToFile(outpath)
Ejemplo n.º 20
0
def runSV(cwd, rabin, refcmap, contigdir, contigbase, runaligns, xmappath,
          optargs, nthreads, maxthreads, bedfile, errfile, outdir, errbinfile,
          clustargs, groupsv):
    '''Configure a Pipeline.varsPipeline object and run SV detection (SVModule.SVdetect).

    Arguments (stored on varsP unless noted):
      cwd        -- dir in which utilities.py, Pipeline.py, SVModule.py (and
                    SampleCharModule.py if errfile) must exist
      rabin      -- RefAligner binary path (varsP.RefAlignerBin)
      refcmap    -- reference cmap (varsP.ref)
      contigdir  -- dir of query cmaps; contigbase + ".cmap" inside it is the
                    initial varsP.latestMergedCmap
      runaligns  -- if False, only "ERROR: feature not supported" is printed
      xmappath   -- not referenced in this function
      optargs    -- optArguments xml (varsP.optArgumentsFileIn)
      nthreads, maxthreads -- varsP.nThreads / varsP.maxthreads
      bedfile    -- if non-empty, stored as varsP.bedFile
      errfile    -- .err file whose noise parameters are read with
                    SampleCharModule.readNoiseParameters
      errbinfile -- .errbin file passed as the "readparameters" noise entry;
                    checked before errfile
      outdir     -- output dir; if empty, contigdir + "_sv" is used
      clustargs  -- cluster arguments xml; if set, cluster mode is enabled
      groupsv    -- SV job grouping mode (varsP.groupSV)

    Calls sys.exit(1) after printing a message on any setup failure.
    '''

    printargs = True

    #each Pipeline module is checked for in cwd before it is imported; the first check
    #must use os.path because util is not imported yet
    if not os.path.isfile(os.path.join(cwd, "utilities.py")):
        print "utilities.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import utilities as util

    if not util.checkFile(os.path.join(cwd, "Pipeline.py")):
        print "Pipeline.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import Pipeline

    if not util.checkFile(os.path.join(cwd, "SVModule.py")):
        print "SVModule.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import SVModule as svm

    #SampleCharModule is only needed (and only imported) when an .err file is supplied
    if errfile and not util.checkFile(os.path.join(cwd,
                                                   "SampleCharModule.py")):
        print "SampleCharModule.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    elif errfile:
        import SampleCharModule as scm

    #use Pipeline objects

    varsP = Pipeline.varsPipeline()

    varsP.optArgumentsFileIn = optargs
    varsP.RefAlignerBin = rabin
    varsP.latestMergedCmap = os.path.join(
        contigdir, contigbase + ".cmap")  #file suffix required to be .cmap
    varsP.contigFolder = os.path.split(contigdir)[0]
    varsP.nThreads = nthreads  #necessary otherwise job won't start -- max threads per node
    varsP.maxthreads = maxthreads  #threads per job
    varsP.ref = refcmap
    varsP.stdoutlog = True  #enable -stdout -stderr args to RefAligner
    varsP.curCharacterizeCmaps = [varsP.latestMergedCmap]
    varsP.contigSubDirectories = True  #needed for prepareContigIO
    varsP.doAlignMolvRef = False  #do not look for copy number
    varsP.groupSV = groupsv  #mimic Pipeline behavior: group or not

    if runaligns:
        #varsP.contigAlignTarget = outdir
        varsP.runSV = False
        varsP.groupContigs = False
        varsP.stdoutlog = True  #use -stdout -stderr
        varsP.stageComplete = contigbase
        varsP.outputContigPrefix = getContigPrefix(
            util, contigdir
        )  #if outdir is not supplied, this is used as dir prefix; also used as file pref for -o arg
        varsP.outputContigFolder = contigdir  #cmaps are copied from here

        #resolve and validate the output dir: an existing dir must be writeable and
        #differ from the input dir; a missing dir is created by util.checkDir
        if not outdir:
            outdir = contigdir + "_sv"  #this will be outdir of sv jobs
        if os.path.isdir(outdir):
            if not util.checkDir(outdir):  #check writeable
                print "\nERROR: Output dir is not writeable:\n", outdir, "\n"
                sys.exit(1)
            elif outdir == contigdir:
                print "\nERROR: Output dir cannot be same as input dir:\n", outdir, "\n"
                sys.exit(1)
            print "\nWARNING: Output dir already exists, results will be overwritten:\n", outdir, "\n"
        elif not util.checkDir(
                outdir
        ):  #does not exist, make, if False, can't make or not writeable
            print "\nERROR: Output dir cannot be created or is not writeable:\n", outdir, "\n"
            sys.exit(1)

        #cluster mode: parse the cluster arguments xml and abort on a parse error
        if clustargs:
            #os.putenv('SGE_ROOT', '/var/lib/gridengine') #do I want this??? NO! It could very well be wrong.
            varsP.onCluster = True
            varsP.checkCluster()  #call varsPipeline method to check SGE_ROOT
            #note: before, above default is wrong. Now, there is no default--user is required to set environment variable; but this is consistent with the Pipeline
            varsP.clusterLogDir = os.path.join(outdir, 'ClusterLogs')
            util.checkDir(varsP.clusterLogDir)  #make it
            #NOTE(review): second checkCluster call, now with clusterLogDir set -- confirm whether both are needed
            varsP.checkCluster()
            varsP.clusterArgumentsFileIn = clustargs  #required for parseArguments
            varsP.parseArguments(readingClusterFile=True)
            if varsP.error:
                print varsP.message
                sys.exit(1)
            varsP.RefAlignerBin += "${BINARY_SUFFIX:=}"  #copy from varsPipeline, handled by external script on phi host

        #all log/status files are placed in outdir
        varsP.pipeReportFile = os.path.join(outdir, "sv_jobs_log.txt")
        varsP.infoReportFile = os.path.join(outdir, "sv_log.txt")
        varsP.memoryLogpath = os.path.join(outdir, "memory_log.txt")
        if bedfile:
            varsP.bedFile = bedfile
        util.InitStatus(os.path.join(outdir, "status.xml"))
        varsP.parseArguments()  #parses optArgumentsFile
        varsP.checkDependencies()
        varsP.RefAlignerBinOrig = rabin
        varsP.prerunLog(
        )  #general information in log -- needed for refaligner_version
        if printargs:
            print "\nRunning SV detection with arguments (" + os.path.split(
                optargs)[1] + "):\n" + " ".join(
                    varsP.argsListed('svdetect')) + '\n'

        #noise parameters handed to SVdetect: .errbin takes precedence over .err;
        #empty dict if neither is supplied
        noisep = {}
        if errbinfile:
            noisep = {"readparameters": errbinfile}
            print "Using noise parameters from " + errbinfile + "\n"
        elif errfile:
            #readNoiseParameters is given the path with ".err" removed
            noisep = scm.readNoiseParameters(errfile.replace(".err", ""))
            if noisep.has_key(
                    'readparameters'
            ):  #remove this because it's redundant, and it can cause problems with RefAligner compatibility
                del noisep['readparameters']
            if not noisep:  #readNoiseParameters returns empty dict on failure
                print "ERROR reading noise parameters, check .err file:", errfile
                sys.exit(1)
            print "Using noise parameters from " + errfile + ":\n" + " ".join(
                ["-" + str(k) + " " + str(v)
                 for k, v in noisep.iteritems()]) + "\n"

        varsP.outputContigFolder = contigdir  #cmaps are copied from here

        #make merged cmap to replace merged _q.cmap if not produced by RefAligner
        cmaps = util.getListOfFilesFromDir(varsP.outputContigFolder,
                                           suffix=".cmap")
        if len(cmaps) > 1:
            varsP.contigPathTxtFile = os.path.join(
                outdir,
                "contig_list.txt")  #mergeIntoSingleCmap creates this file
            print "Creating merged cmap"
            varsP.mergeIntoSingleCmap(outdir)
            print "Merged cmap created:", varsP.latestMergedCmap, "\n"
            if varsP.groupSV == 0:  #if it is a single job, use merged map just created
                varsP.outputContigFolder = outdir  #input == output
                #print "varsP.outputContigFolder =", varsP.outputContigFolder #debug
        elif len(cmaps) == 1:
            varsP.latestMergedCmap = cmaps[0]
        else:  #this is already checked in getContigPrefix (redundant)
            print "No cmaps found in input dir; check dir %s\n" % varsP.outputContigFolder
            sys.exit(1)

        #build the SV detection stage, run its jobs, check results, then summarize errors
        svmodule = svm.SVdetect(varsP, noisep, outdir, skipderes=True)
        #this got duplicated above
        #if hasattr(util, "InitStatus") : #if old version, skip -- do this after SVdetect.__init__ bc makes outdir
        #    util.InitStatus(os.path.join(outdir, "status.xml")) #needed otherwise call to status_log fails
        svmodule.runJobs()
        svmodule.checkResults()
        util.SummarizeErrors(varsP)

    else:
        #reporting on an existing xmap without running jobs is not implemented;
        #note that this path prints the error but does not exit
        varsP.contigAlignTarget = contigdir  #this is dir in which _q and _r cmaps must be located
        print "ERROR: feature not supported"  #not implemented to not run jobs
Ejemplo n.º 21
0
def getArgs():
    parser = argparse.ArgumentParser(description=description)

    parser.add_argument(
        '-t',
        dest='RefAligner',
        help='Path to RefAligner or dir containing it (required)',
        type=str)
    parser.add_argument(
        '-r',
        dest='referenceMap',
        help='Path to reference maps (.cmap), 1 file only (required)',
        type=str)
    parser.add_argument(
        '-q',
        dest='queryDir',
        help='Path to dir containing query maps (.cmaps) (required)',
        type=str)
    #parser.add_argument('-x', dest='xmap', help='Path to .xmap, 1 file only (optional, if specified, no alignment is done, if not specified, -t, -r, and -q must be specified)') #not supported
    parser.add_argument(
        '-o',
        dest='outputDir',
        help=
        'output dir (optional, defaults to input map dir with suffix "_sv")',
        default="",
        type=str)
    parser.add_argument(
        '-p',
        dest='pipelineDir',
        help=
        'Pipeline dir (optional, defaults to script dir, or current directory)',
        default="",
        type=str)
    parser.add_argument(
        '-a',
        dest='optArguments',
        help=
        'Path to optArguments.xml (optional, default optArguments_human.xml in Pipeline dir if found, otherwise required)',
        default="",
        type=str)
    parser.add_argument(
        '-T',
        dest='numThreads',
        help='Total number of threads (cores) to use (optional, default 4)',
        default=4,
        type=int)
    parser.add_argument(
        '-j',
        dest='maxthreads',
        help=
        'Threads per Job, -maxthreads (non-cluster only;optional, default 4)',
        default=4,
        type=int)
    parser.add_argument(
        '-b',
        dest='bedFile',
        help=
        '.bed file with gaps in reference for flagging SVs which overlap N-base gaps (optional)',
        default="",
        type=str)
    parser.add_argument(
        '-e',
        dest='errFile',
        help=
        '.err file to use for noise parameters--will supersede noise parameters in the optArgument supplied (but that file must still be supplied for non-noise parameters)',
        default="",
        type=str)
    parser.add_argument(
        '-E',
        dest='errbinFile',
        help=
        '.errbin file to use for noise parameters--will supersede noise parameters in the optArgument supplied (but that file must still be supplied for non-noise parameters)',
        default="",
        type=str)
    parser.add_argument(
        '-C',
        help=
        'Run on cluster, read XML file for submission arguments (optional--will not use cluster submission if absent)',
        dest='cxml',
        default=None)
    parser.add_argument(
        '-s',
        help=
        'SV jobs configuration: 0 = single job (required for correct haplotype calls), 1 = single job per contig (not recommended), 2 = grouped (default 0; optional)',
        dest='groupsv',
        type=int,
        default=0)
    #parser.add_argument('-s', help='Disable grouping of SV jobs (default grouped; optional)', dest='groupsv', action='store_false') #old one
    result = parser.parse_args()

    #check all Pipeline dependencies
    if result.pipelineDir:
        cwd = result.pipelineDir
    else:
        cwd = os.path.split(
            os.path.realpath(__file__))[0]  #this is path of this script
        if not os.path.isfile(os.path.join(
                cwd,
                "utilities.py")):  #if still not here, last try is actual cwd
            cwd = os.getcwd()  #still check this below

    #this is the only one imported here and in runCharacterize
    if not os.path.isfile(os.path.join(cwd, "utilities.py")):
        print "utilities.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import utilities as util

    #xmap -- don't use this
    runaligns = True  #default is to run the alignment
    xmappath = None
    #if result.xmap :
    #    xmappath = result.xmap
    #    if not util.checkFile(xmappath, ".xmap") :
    #        print "Xmap path is supplied ("+xmappath+") but not found or doesn't end in .xmap."
    #        sys.exit(1)
    #    runaligns = False

    #RefAligner -- check for either path to RefAligner, or dir containing it, depending on cluster args
    rabin = result.RefAligner
    #replicate Pipeline behavior: RefAligner is always required
    if os.path.isdir(rabin):
        rabin = os.path.join(rabin, "RefAligner")
    if not util.checkExecutable(rabin):
        print "RefAligner not found or not executable at", rabin, "\nPlease supply RefAligner dir or full path as -t arg."
        sys.exit(1)

    #reference maps -- only required if xmap not specified
    refcmap = os.path.realpath(result.referenceMap)
    if runaligns and not util.checkFile(
            refcmap, ".cmap"):  #and not util.checkFile(refcmap, ".spots") :
        print "Reference map file (" + refcmap + ") not found or does not end in .cmap or .spots. Check -r argument."
        sys.exit(1)

    #query maps
    qrypath = os.path.realpath(result.queryDir)
    #if runaligns and not util.checkFile(qrypath, ".cmap") :
    #    print "Query map file ("+qrypath+") not found or does not end in .cmap or .spots. Check -q argument."
    #    sys.exit(1)
    if not util.checkDir(qrypath, checkWritable=False,
                         makeIfNotExist=False):  #does NOT have to be writeable
        print "Query dir (" + qrypath + ") not found or not a dir. Check -q argument."
        sys.exit(1)
    if runaligns:
        contigdir = qrypath  #os.path.split(qrypath)[0] #dir of query maps
        contigbase = os.path.split(qrypath)[1]  #filename
    else:
        contigdir = os.path.split(xmappath)[0]
        contigbase = os.path.split(xmappath)[1]  #filename
    #contigbase = contigbase[:contigbase.find(".")] #remove suffix

    #optargs file
    optargs = None
    if result.optArguments:  #supplied on command line
        optargs = result.optArguments
        if not util.checkFile(optargs, ".xml"):
            print "optArguments path is supplied (" + optargs + ") but not found or doesn't end in .xml, check -a argument."
            sys.exit(1)
    elif runaligns:  #load from Pipeline dir if running alignments
        optargs = os.path.join(cwd, "optArguments_human.xml")
        if not util.checkFile(optargs):
            print "optArguments.xml missing in Pipeline directory (" + cwd + "). Try supplying path explicitly using -a."
            sys.exit(1)

    #cluster args
    clustargs = None
    if result.cxml:
        clustargs = os.path.realpath(result.cxml)
        if not util.checkFile(clustargs, ".xml"):
            print "clusterArguments path is supplied (" + clustargs + ") but not found or doesn't end in .xml, check -C argument."
            sys.exit(1)

    #nthreads
    nthreads = result.numThreads
    if nthreads <= 0:
        print "Number of threads value invalid (must be > 0): %i" % nthreads
        sys.exit(1)

    #maxthreads
    maxthreads = result.maxthreads
    if maxthreads <= 0:
        print "Max threads value invalid (must be > 0): %i" % maxthreads
        sys.exit(1)

    #bed file
    bedfile = result.bedFile  #must make local for return statement below
    if bedfile:  #must check for empty string BEFORE you do realpath, or it returns cwd
        bedfile = os.path.realpath(result.bedFile)
        if not util.checkFile(bedfile, ".bed"):
            print "bed file supplied but not found or incorrect suffix:", bedfile
            sys.exit(1)

    #.errbin file
    errbinfile = result.errbinFile
    if errbinfile:
        errbinfile = os.path.realpath(result.errbinFile)
        if not util.checkFile(errbinfile, ".errbin"):
            print "errbin file supplied but not found or incorrect suffix:", errbinfile
            sys.exit(1)

    #.err file
    errfile = result.errFile
    if errfile and errbinfile:
        print "Warning: .err and .errbin arguments supplied; ignoring .err file"
        errfile = ""
    elif errfile:
        errfile = os.path.realpath(result.errFile)
        if not util.checkFile(errfile, ".err"):
            print "err file supplied but not found or incorrect suffix:", errfile
            sys.exit(1)

    outdir = os.path.realpath(result.outputDir)

    groupsv = result.groupsv
    if groupsv < 0 or groupsv > 2:
        print 'ERROR: -s (grouped SV) must be 0, 1, or 2\n'
        sys.exit(1)

    #yes, this is messy...but I don't want another class (besides varsPipeline) and they just go to runCharacterize
    return cwd, rabin, refcmap, contigdir, contigbase, runaligns, xmappath, optargs, nthreads, maxthreads, bedfile, errfile, outdir, errbinfile, clustargs, groupsv
Ejemplo n.º 22
0
def runAlignMol():
    parser = argparse.ArgumentParser(description=description)

    parser.add_argument(
        '-q',
        dest='queryDir',
        help=
        'Path to merged cmap to align molecules (-b) to OR alignmol dir from Pipeline for merge (if latter, no alignments are performed), required',
        type=str)
    parser.add_argument(
        '-b',
        dest='bnx',
        help='Input molecule (.bnx) file, required if aligning molecules',
        type=str)
    #parser.add_argument('-b', dest='bnx', help='Input molecule (.bnx) file OR path to dir containing split bnx pieces, required if aligning molecules', type=str) #I should add the split feature; for now, just do single bnx
    parser.add_argument(
        '-a',
        dest='optArguments',
        help=
        'Path to optArguments.xml (optional, default optArguments_human.xml in Pipeline dir if found, otherwise required)',
        default="",
        type=str)
    parser.add_argument(
        '-r',
        help=
        'If this flag is used, alignmolvref arguments are used, otherwise alignmol arguments are used (default alignmol; optional)',
        dest='ref',
        action='store_true')
    parser.add_argument(
        '-o',
        dest='outputDir',
        help=
        'output dir (optional, defaults to sub-dir of input map dir called "alignmol")',
        default="",
        type=str)
    parser.add_argument(
        '-t',
        dest='RefAligner',
        help='Path to RefAligner or dir containing it (required)',
        type=str)
    parser.add_argument(
        '-T',
        dest='numThreads',
        help='Total number of threads (cores) to use (optional, default 4)',
        default=4,
        type=int)
    parser.add_argument(
        '-j',
        dest='maxthreads',
        help=
        'Threads per Job, -maxthreads (non-cluster only;optional, default 4)',
        default=4,
        type=int)
    parser.add_argument(
        '-e',
        dest='errFile',
        help=
        '.err file to use for noise parameters--will supersede noise parameters in the optArgument supplied (but that file must still be supplied for non-noise parameters)--should be from autoNoise',
        default="",
        type=str)
    parser.add_argument(
        '-E',
        dest='errbinFile',
        help=
        '.errbin file to use for noise parameters--will supersede noise parameters in the optArgument supplied (but that file must still be supplied for non-noise parameters)--should be from autoNoise',
        default="",
        type=str)
    parser.add_argument(
        '-p',
        dest='pipelineDir',
        help=
        'Pipeline dir (optional, defaults to script dir, or current directory)',
        default="",
        type=str)
    result = parser.parse_args()

    outprefix = "exp_refineFinal1"  #this is the default; assume for now

    #check all Pipeline dependencies
    if result.pipelineDir:
        cwd = result.pipelineDir
    else:
        cwd = os.path.split(
            os.path.realpath(__file__))[0]  #this is path of this script
        if not os.path.isfile(os.path.join(
                cwd,
                "utilities.py")):  #if still not here, last try is actual cwd
            cwd = os.getcwd()  #still check this below

    #this is the only one imported here and in runCharacterize
    if not os.path.isfile(os.path.join(cwd, "utilities.py")):
        print "ERROR: utilities.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import utilities as util

    if not os.path.isfile(os.path.join(cwd, "AlignModule.py")):
        print "ERROR: AlignModule.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import AlignModule as alignmod

    if not util.checkFile(os.path.join(cwd, "Pipeline.py")):
        print "ERROR: Pipeline.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import Pipeline

    if not util.checkFile(os.path.join(cwd, "mapClasses.py")):
        print "ERROR: mapClasses.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import mapClasses as mc

    #input dir
    if not result.queryDir:
        print "ERROR: Query (-q) argument not supplied."
        sys.exit(1)
    qrypath = os.path.realpath(result.queryDir)
    if util.checkDir(
            qrypath, checkWritable=False,
            makeIfNotExist=False):  #output elsewhere so not writeable is ok
        runaligns = False
    elif util.checkCmap(qrypath):
        runaligns = True
    else:
        print "ERROR: Query argument (" + qrypath + ") not found or not a dir or cmap. Check -q argument."
        sys.exit(1)

    #this check isn't really necessary...make it a warning -- left over from runAlignMerge.py
    #if not os.path.split(qrypath)[1].endswith("alignmol") :
    #    print "Warning: Query dir ("+qrypath+") does not end with 'alignmol'; please be sure this is a Pipeline alignmol dir\n"

    #RefAligner -- check for either path to RefAligner, or dir containing it, depending on cluster args
    rabin = ""  #need empty string for generateJobList even though no jobs are run
    if runaligns:
        rabin = result.RefAligner
        #replicate Pipeline behavior: RefAligner is always required
        if os.path.isdir(rabin):
            rabin = os.path.join(rabin, "RefAligner")
        if not util.checkExecutable(rabin):
            print "ERROR: RefAligner not found or not executable at", rabin, "\nPlease supply RefAligner dir or full path as -t arg."
            sys.exit(1)

    #optargs file
    optargs = None
    if runaligns and result.optArguments:  #supplied on command line
        optargs = result.optArguments
        if not util.checkFile(optargs, ".xml"):
            print "optArguments path is supplied (" + optargs + ") but not found or doesn't end in .xml, check -a argument."
            sys.exit(1)
    elif runaligns:  #load from Pipeline dir if running alignments
        optargs = os.path.join(cwd, "optArguments_human.xml")
        if not util.checkFile(optargs):
            print "optArguments.xml missing in Pipeline directory (" + cwd + "). Try supplying path explicitly using -a."
            sys.exit(1)

    #output dir
    if not result.outputDir:
        outdir = os.path.join(qrypath,
                              "merge")  #should be same as in AlignModule
    else:
        outdir = os.path.realpath(result.outputDir)
    if os.path.isdir(outdir):
        if not util.checkDir(outdir):  #check writeable
            print "\nERROR: Output dir is not writeable:\n", outdir, "\n"
            sys.exit(1)
        #this is ok here
        #elif outdir == contigdir :
        #    print "\nERROR: Output dir cannot be same as input dir:\n", outdir, "\n"
        #    sys.exit(1)
        print "\nWARNING: Output dir already exists, results will be overwritten:\n", outdir, "\n"
    elif not util.checkDir(
            outdir
    ):  #does not exist, make, if False, can't make or not writeable
        print "\nERROR: Output dir cannot be created or is not writeable:\n", outdir, "\n"
        sys.exit(1)

    #bnx file
    bnxfile = result.bnx
    if bnxfile:  #must check for empty string BEFORE you do realpath, or it returns cwd
        bnxfile = os.path.realpath(bnxfile)
        if not util.checkFile(bnxfile, ".bnx"):
            print "ERROR: bnx file supplied but not found or incorrect suffix:", bnxfile
            sys.exit(1)
    elif runaligns:
        print "ERROR: bnx file not supplied but running alignments; please supply bnx file as -b argument"
        sys.exit(1)

    #nthreads
    nthreads = result.numThreads
    if nthreads <= 0:
        print "ERROR: Number of threads value invalid (must be > 0): %i" % nthreads
        sys.exit(1)

    #maxthreads
    maxthreads = result.maxthreads
    if maxthreads <= 0:
        print "ERROR: Max threads value invalid (must be > 0): %i" % maxthreads
        sys.exit(1)
    elif nthreads < maxthreads:
        print "Warning: num threads (-T: %i) < max threads (-j: %i): increasing num threads to equal max threads\n" % (
            nthreads, maxthreads)
        nthreads = maxthreads

    #.errbin file
    errbinfile = result.errbinFile
    if errbinfile:
        errbinfile = os.path.realpath(result.errbinFile)
        if not util.checkFile(errbinfile, ".errbin"):
            print "ERROR: errbin file supplied but not found or incorrect suffix:", errbinfile
            sys.exit(1)

    #.err file
    errfile = result.errFile
    if errfile and errbinfile:
        print "Warning: .err and .errbin arguments supplied; ignoring .err file"
        errfile = ""
    elif errfile:
        errfile = os.path.realpath(result.errFile)
        if not util.checkFile(errfile, ".err"):
            print "err file supplied but not found or incorrect suffix:", errfile
            sys.exit(1)

    if errfile and not util.checkFile(os.path.join(cwd,
                                                   "SampleCharModule.py")):
        print "SampleCharModule.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    elif errfile:
        import SampleCharModule as scm

    doref = result.ref

    #DONE checking arguments

    print "Using output dir", outdir
    if runaligns:
        print "Aligning", bnxfile, "\nTo", qrypath, "\n"
    else:
        print "Merging", qrypath, "\n"

    startTime = time.time()  #time since Epoch
    memory_log = os.path.join(outdir, "memory_log.txt")
    util.initMemoryLog(memory_log)

    varsP = Pipeline.varsPipeline()
    varsP.RefAlignerBin = rabin
    varsP.contigFolder = ""  #not used but needs to be an attr
    varsP.outputContigFolder = ""  #not used but needs to be a string attr
    varsP.pipeReportFile = os.path.join(outdir, "alignmol_jobs_log.txt")
    varsP.infoReportFile = os.path.join(outdir, "alignmol_log.txt")
    util.InitStatus(os.path.join(outdir, "status.xml"))

    if runaligns:
        varsP.optArgumentsFileIn = optargs
        varsP.latestMergedCmap = qrypath  #if !doref, need this one
        varsP.ref = qrypath  #and if doref, need this one
        varsP.nThreads = nthreads  #necessary otherwise job won't start -- max threads per node
        varsP.maxthreads = maxthreads  #threads per job
        p = os.path.split(qrypath)[1]
        varsP.outputContigPrefix = p[:p.rfind(".")]  #filename prefix
        varsP.stdoutlog = True  #use -stdout -stderr
        varsP.sorted_file = bnxfile[:bnxfile.rfind(
            ".")]  #enables the mol fraction align in AlignModule.getAlignStats
        if qrypath.endswith(".cmap"):  #enable the mol stats
            varsP.totAssemblyLenMb = mc.multiCmap(
                qrypath, lengthonly=True).totalLength / 1e6

        varsP.memoryLogpath = os.path.join(outdir, "memory_log.txt")
        varsP.parseArguments()  #parses optArgumentsFile
        varsP.checkDependencies()
        varsP.RefAlignerBinOrig = rabin
        varsP.prerunLog(
        )  #general information in log -- needed for refaligner_version

        noisep = {}
        if errbinfile:
            noisep = {"readparameters": errbinfile}
            #print "Using noise parameters from "+errbinfile+"\n" #move below
        elif errfile:
            noisep = scm.readNoiseParameters(errfile.replace(".err", ""))
            if noisep.has_key(
                    'readparameters'
            ):  #remove this because it's redundant, and it can cause problems with RefAligner compatibility
                del noisep['readparameters']
            if not noisep:  #readNoiseParameters returns empty dict on failure
                print "ERROR reading noise parameters, check .err file:", errfile
                sys.exit(1)
            #redundant with below?
            print "Using noise parameters from " + errfile + ":\n" + " ".join(
                ["-" + str(k) + " " + str(v)
                 for k, v in noisep.iteritems()]) + "\n"

        #some code from SampleCharModule to load args into noise0
        infoReport = "Loaded noise parameters:\n"
        klist = [
            "FP", "FN", "sf", "sd", "sr", "bpp", "readparameters"
        ]  #hardcoding parameters is kind of bad, but it fixes the order without using OrderedDict.
        #noiseargs = self.varsP.argsListed('noise0') #not necessary
        for v in klist:
            if not noisep.has_key(v):
                continue
            param = str(noisep[v])
            util.LogStatus("parameter", "auto_" + v, param)
            infoReport += v + ":" + param + "\n"
            varsP.replaceParam("noise0", "-" + v, param)
        varsP.updateInfoReport(infoReport + '\n', printalso=True)

    else:
        print "Getting file list from", qrypath
        outFileList = getOutFileList(util, qrypath)
        if not outFileList:
            print "ERROR: Query dir (" + qrypath + ") does not contain alignmol data. Check -q argument."
            sys.exit(1)
        else:
            print "Found", len(outFileList), "alignment results"
    #end if runaligns

    amod = alignmod.AlignModule(
        varsP, doref, outdir, bnxfile)  #constructor will call generateJobList

    if runaligns:
        amod.runJobs()
        amod.checkResults()
    else:
        amod.outFileList = outFileList
        p = os.path.split(outFileList[0])[1]
        if p.count("_") > 1:  #expect something like "EXP_REFINEFINAL1_4"
            #p = p[:p.rfind("_")+1] #remove integer suffix
            p = p[:p.rfind("_")]  #remove integer suffix (and underscore)
        #else :
        #    p += "_" #because mrgstr is appended
        varsP.outputContigPrefix = p

    if not runaligns or len(amod.jobList) > 0:
        amod.getAlignStats()

    if runaligns:
        print
        #copy from Pipeline.py
        if util.SummarizeErrors(varsP=varsP) == 0:
            varsP.updatePipeReport("Pipeline has successfully completed\n")
            util.LogStatus("progress", "pipeline", "success")
        else:
            varsP.updatePipeReport("Pipeline has completed with errors\n")
            util.LogStatus("progress", "pipeline", "failure")

    #BELOW OLD CODE

    return

    #in Pipeline, this is called first
    #print "Calling getAlignStats:" #but it won't work without varsP atm; skip it
    #getAlignStats(self.varsP, self.outFileList, self.varsP.totAssemblyLenMb, isref=False, mergepath=self.mergedir)
    #getAlignStats(self.varsP, self.outFileList, self.varsP.totAssemblyLenMb, isref=False, mergepath=self.mergedir)

    print "Calling mergeMap"
    print outFileList[0]  #, "\n", outputdir #moved above
    util.logMemory(memory_log, startTime, "mergeMap_start")
    #mergeMap(self.varsP, self.outFileList, mergepath=self.outputdir) #varsP is optional
    alignmod.mergeMap(None, outFileList, outputdir)
    util.logMemory(memory_log, startTime, "mergeMap_end")

    print "Calling mergeRcmaps"
    util.logMemory(memory_log, startTime, "mergeRcmaps_start")
    #mergeRcmaps(outFileList, outdir, varsP=None, splitByContig=None, stageName="alignmol") :
    alignmod.mergeRcmaps(outFileList,
                         outputdir,
                         splitByContig=True,
                         stageName=outprefix)
    util.logMemory(memory_log, startTime, "mergeRcmaps_end")

    print "Calling split_XMap_byContig"  #split_XMapQcmap_byContig"
    util.logMemory(memory_log, startTime, "split_XMap_byContig_start")
    #xmapdict = alignmod.split_XMap_byContig(outFileList, outputdir, stageName=outprefix) #old
    xmapdict = alignmod.split_XMap_byContig_new(outFileList,
                                                outputdir,
                                                stageName=outprefix)
    util.logMemory(memory_log, startTime, "split_XMap_byContig_end")

    print "Calling split_Qcmap_byContig"
    util.logMemory(memory_log, startTime, "split_Qcmap_byContig_start")
    #alignmod.split_Qcmap_byContig(outFileList, outputdir, xmapdict) #old
    alignmod.split_Qcmap_byContig_new(
        outFileList, outputdir, xmapdict,
        stageName=outprefix)  #new: better performance
    util.logMemory(memory_log, startTime, "split_Qcmap_byContig_end")

    print "AlignMerge successfully completed"
Ejemplo n.º 23
0
    def __init__(self, varsP) :
        """splitBNX.__init__: this class is for sorting the input bnx
        for subsequent splitting by the splitBNX class, and eventually
        easier processing with the Pairwise class. The constructor
        (this) will call varsP.runJobs and doAllPipeReport, then
        instantiate splitBNX, which will do all the splitting required
        for the Pairwise class.
        """
        self.stageName = "Autonoise0"
        self.varsP = varsP #fewer code modifications below
        
        util.LogStatus("progress", "stage_start", self.stageName) #after above bc check if bypass (executeCurrentStage)

        self.output_folder = os.path.join(self.varsP.contigFolder, "auto_noise")
        if not util.checkDir(self.output_folder) : #will make if not exist, only returns False if already exists or can't make
            print "ERROR in autoNoise: bad dir:", self.output_folder
            raise RuntimeError
	    
        # We use assembly section here because the memory usage is higher than pairwise, while the jobs are quite short.
        #sortJobSet=mthread.jobWrapper(self.varsP,jobName,clusterArgs=self.varsP.getClusterArgs('assembly'))
        super(autoNoise, self).__init__(self.varsP, self.stageName, clusterArgs=self.varsP.getClusterArgs("assembly"))

        bnxfile = self.varsP.bnxFile if varsP.noiseOnly else self.varsP.sorted_file+".bnx"
        #was return if generateJobListChar, but need to get readparameters if bypass
        if not self.generateJobListChar({}, bnxfile, "autoNoise0") : #return 0 for success, 1 for skip
            self.varsP.runJobs(self, "AutoNoise0")
            self.doAllPipeReport()
        if not self.allResultsFound() :
            self.varsP.updatePipeReport("ERROR: AutoNoise0 failed. Check: "+self.output_file+".stdout\n")
            raise RuntimeError
        util.LogStatus("progress", "stage_complete", self.stageName)
            
        self.varsP.noise0 = readNoiseParameters(self.output_file)
	self.isBadErrorParams(self.varsP.noise0, 0)

        self.stageName = "Autonoise1"
        self.groupName = self.stageName #fix so that LogStatus call in MultiThreading.multiThreadRunJobs
        util.LogStatus("progress", "stage_start", self.stageName)

        self.clearJobs()
        
	self.varsP.replaceParam("noise0", "-readparameters", self.output_file+".errbin")

        #need to call again to set self.output_file
        if not self.generateJobListChar(self.varsP.noise0, bnxfile, "autoNoise1") : #return 0 for success, 1 for skip
            self.varsP.runJobs(self, "AutoNoise1")
            self.doAllPipeReport()
        if not self.allResultsFound() :
            self.varsP.updatePipeReport("ERROR: AutoNoise1 failed. Check: "+self.output_file+".stdout\n")
            raise RuntimeError
            
        self.varsP.noise1 = readNoiseParameters(self.output_file)
        
	infoReport="Automatically determined noise parameters:\n"
        klist = ["FP", "FN", "sf", "sd", "sr", "bpp", "readparameters"] #hardcoding parameters is kind of bad, but it fixes the order without using OrderedDict.
        for v in klist :
            if not self.varsP.noise1.has_key(v) :
                continue
            param=str(self.varsP.noise1[v])
            util.LogStatus("parameter", "auto_"+v, param)
            infoReport+=v+":"+param+"\n"
            self.varsP.replaceParam("noise0", "-"+v, param)
        self.varsP.updateInfoReport(infoReport + '\n')
        self.isBadErrorParams(self.varsP.noise1, 1)

        if self.varsP.doScanScale : #change the sorted_file to the rescaled bnx file
            rescaledbnx = self.output_file + self.varsP.rescaleSuffix #no ".bnx" in suffix
            if not util.checkFile(rescaledbnx+".bnx") : #not found--not an error if bnx 0.1 is used
                err = "Warning: scan scaled bnx not found after autoNoise1; not performing scan scaling--check that bnx 1.0 or later used in input"
                self.varsP.updatePipeReport( err+"\n\n" )
                util.LogError("warning", err)
                self.varsP.doScanScale = False
            else : #log that scan scaling is used
                self.varsP.updatePipeReport( "Using scan scaled bnx: "+rescaledbnx+".bnx\n\n" )
                util.LogStatus("parameter", "scanscaled_bnx", rescaledbnx+".bnx")
                self.varsP.sorted_file = rescaledbnx #this variable is used in splitBNX (PairwiseModule.py)
            
        util.LogStatus("progress", "stage_complete", self.stageName)
Ejemplo n.º 24
0
def runSV(cwd, rabin, refcmap, contigdir, contigbase, runaligns, xmappath, optargs, nthreads, maxthreads, bedfile, errfile, outdir, errbinfile, clustargs, groupsv):
    '''Load Pipeline files from first arg; configure CharacterizeModule; run alignments if runaligns;
    report on those alignments or the xmap provided as xmappath.
    '''

    printargs = True

    if not os.path.isfile(os.path.join(cwd,"utilities.py")):
        print "utilities.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import utilities as util

    if not util.checkFile(os.path.join(cwd,"Pipeline.py")):
        print "Pipeline.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import Pipeline

    if not util.checkFile(os.path.join(cwd,"SVModule.py")):
        print "SVModule.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import SVModule as svm

    if errfile and not util.checkFile(os.path.join(cwd,"SampleCharModule.py")):
        print "SampleCharModule.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    elif errfile :
        import SampleCharModule as scm

    #use Pipeline objects

    varsP = Pipeline.varsPipeline()

    varsP.optArgumentsFileIn   = optargs
    varsP.RefAlignerBin        = rabin
    varsP.latestMergedCmap     = os.path.join(contigdir, contigbase+".cmap") #file suffix required to be .cmap
    varsP.contigFolder         = os.path.split(contigdir)[0]
    varsP.nThreads             = nthreads #necessary otherwise job won't start -- max threads per node
    varsP.maxthreads           = maxthreads #threads per job
    varsP.ref                  = refcmap
    varsP.stdoutlog            = True #enable -stdout -stderr args to RefAligner
    varsP.curCharacterizeCmaps = [varsP.latestMergedCmap]
    varsP.contigSubDirectories = True #needed for prepareContigIO
    varsP.doAlignMolvRef       = False #do not look for copy number
    varsP.groupSV              = groupsv #mimic Pipeline behavior: group or not 

    if runaligns :
        #varsP.contigAlignTarget = outdir
        varsP.runSV = False
        varsP.groupContigs = False
        varsP.stdoutlog    = True #use -stdout -stderr
        varsP.stageComplete = contigbase
        varsP.outputContigPrefix = getContigPrefix(util, contigdir) #if outdir is not supplied, this is used as dir prefix; also used as file pref for -o arg
        varsP.outputContigFolder = contigdir #cmaps are copied from here

        if not outdir :
            outdir = contigdir+"_sv" #this will be outdir of sv jobs
        if os.path.isdir(outdir) :
            if not util.checkDir(outdir) : #check writeable
                print "\nERROR: Output dir is not writeable:\n", outdir, "\n"                
                sys.exit(1)
            elif outdir == contigdir :
                print "\nERROR: Output dir cannot be same as input dir:\n", outdir, "\n"                
                sys.exit(1)                
            print "\nWARNING: Output dir already exists, results will be overwritten:\n", outdir, "\n"
        elif not util.checkDir(outdir) : #does not exist, make, if False, can't make or not writeable
            print "\nERROR: Output dir cannot be created or is not writeable:\n", outdir, "\n"
            sys.exit(1)

        if clustargs :
            os.putenv('SGE_ROOT', '/var/lib/gridengine') #do I want this???
            varsP.onCluster = True
            varsP.clusterLogDir = os.path.join(outdir, 'ClusterLogs')
            util.checkDir(varsP.clusterLogDir) #make it
            varsP.checkCluster()
            varsP.clusterArgumentsFileIn = clustargs #required for parseArguments
            varsP.parseArguments(readingClusterFile=True)
            if varsP.error :
                print varsP.message
                sys.exit(1)
            varsP.RefAlignerBin += "${BINARY_SUFFIX:=}" #copy from varsPipeline, handled by external script on phi host

        varsP.pipeReportFile = os.path.join(outdir, "sv_jobs_log.txt")
        varsP.infoReportFile = os.path.join(outdir, "sv_log.txt")
        varsP.memoryLogpath  = os.path.join(outdir, "memory_log.txt")
        if bedfile :
            varsP.bedFile = bedfile
        util.InitStatus( os.path.join(outdir, "status.xml") )
        varsP.parseArguments() #parses optArgumentsFile
        varsP.checkDependencies()
        varsP.RefAlignerBinOrig = rabin
        varsP.prerunLog() #general information in log -- needed for refaligner_version
        if printargs :
            print "\nRunning SV detection with arguments ("+os.path.split(optargs)[1]+"):\n" + " ".join(varsP.argsListed('svdetect')) + '\n'

        noisep = {}
        if errbinfile :
            noisep = {"readparameters": errbinfile}
            print "Using noise parameters from "+errbinfile+"\n"
        elif errfile :
            noisep = scm.readNoiseParameters(errfile.replace(".err",""))
            if noisep.has_key('readparameters') : #remove this because it's redundant, and it can cause problems with RefAligner compatibility
                del noisep['readparameters']
            if not noisep : #readNoiseParameters returns empty dict on failure
                print "ERROR reading noise parameters, check .err file:", errfile
                sys.exit(1)
            print "Using noise parameters from "+errfile+":\n" + " ".join(["-"+str(k)+" "+str(v) for k,v in noisep.iteritems()])+"\n"

        #make merged cmap to replace merged _q.cmap if not produced by RefAligner
        varsP.contigPathTxtFile = os.path.join(outdir, "contig_list.txt") #mergeIntoSingleCmap creates this file
        print "Creating merged cmap"
        varsP.mergeIntoSingleCmap(outdir)
        print "Merged cmap created:", varsP.latestMergedCmap, "\n"

        varsP.outputContigFolder = contigdir #cmaps are copied from here
        svmodule = svm.SVdetect(varsP, noisep, outdir, skipderes=True)
        #this got duplicated above
        #if hasattr(util, "InitStatus") : #if old version, skip -- do this after SVdetect.__init__ bc makes outdir
        #    util.InitStatus(os.path.join(outdir, "status.xml")) #needed otherwise call to status_log fails
        svmodule.runJobs()
        svmodule.checkResults()
        util.SummarizeErrors(varsP) 

    else :
        varsP.contigAlignTarget = contigdir #this is dir in which _q and _r cmaps must be located
        print "ERROR: feature not supported" #not implemented to not run jobs
Ejemplo n.º 25
0
def getArgs() :    
    parser = argparse.ArgumentParser(description=description)

    parser.add_argument('-t', dest='RefAligner', help='Path to RefAligner or dir containing it (required)', type=str) 
    parser.add_argument('-r', dest='referenceMap', help='Path to reference maps (.cmap), 1 file only (required)', type=str)
    parser.add_argument('-q', dest='queryDir', help='Path to dir containing query maps (.cmaps) (required)', type=str)
    #parser.add_argument('-x', dest='xmap', help='Path to .xmap, 1 file only (optional, if specified, no alignment is done, if not specified, -t, -r, and -q must be specified)') #not supported
    parser.add_argument('-o', dest='outputDir', help='output dir (optional, defaults to input map dir with suffix "_sv")', default="", type=str)
    parser.add_argument('-p', dest='pipelineDir', help='Pipeline dir (optional, defaults to script dir, or current directory)', default="", type=str)
    parser.add_argument('-a', dest='optArguments', help='Path to optArguments.xml (optional, default optArguments_human.xml in Pipeline dir if found, otherwise required)', default="", type=str)
    parser.add_argument('-T', dest='numThreads', help='Total number of threads (cores) to use (optional, default 4)', default=4, type=int)
    parser.add_argument('-j', dest='maxthreads', help='Threads per Job, -maxthreads (non-cluster only;optional, default 4)', default=4, type=int)
    parser.add_argument('-b', dest='bedFile', help='.bed file with gaps in reference for flagging SVs which overlap N-base gaps (optional)', default="", type=str)
    parser.add_argument('-e', dest='errFile', help='.err file to use for noise parameters--will supersede noise parameters in the optArgument supplied (but that file must still be supplied for non-noise parameters)', default="", type=str)
    parser.add_argument('-E', dest='errbinFile', help='.errbin file to use for noise parameters--will supersede noise parameters in the optArgument supplied (but that file must still be supplied for non-noise parameters)', default="", type=str)
    parser.add_argument('-C', help='Run on cluster, read XML file for submission arguments (optional--will not use cluster submission if absent)', dest='cxml', default=None)
    parser.add_argument('-s', help='Disable grouping of SV jobs (default grouped; optional)', dest='groupsv', action='store_false')
    result = parser.parse_args()

    #check all Pipeline dependencies
    if result.pipelineDir :
        cwd = result.pipelineDir
    else :
        cwd = os.path.split(os.path.realpath(__file__))[0] #this is path of this script
        if not os.path.isfile(os.path.join(cwd,"utilities.py")) : #if still not here, last try is actual cwd
            cwd = os.getcwd() #still check this below

    #this is the only one imported here and in runCharacterize
    if not os.path.isfile(os.path.join(cwd,"utilities.py")):
        print "utilities.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import utilities as util

    #xmap -- don't use this
    runaligns = True #default is to run the alignment
    xmappath = None
    #if result.xmap :
    #    xmappath = result.xmap
    #    if not util.checkFile(xmappath, ".xmap") :
    #        print "Xmap path is supplied ("+xmappath+") but not found or doesn't end in .xmap."
    #        sys.exit(1)
    #    runaligns = False

    #RefAligner -- check for either path to RefAligner, or dir containing it, depending on cluster args
    rabin = result.RefAligner
    #replicate Pipeline behavior: RefAligner is always required
    if os.path.isdir(rabin) :
        rabin = os.path.join(rabin, "RefAligner")
    if not util.checkExecutable(rabin):
        print "RefAligner not found or not executable at", rabin, "\nPlease supply RefAligner dir or full path as -t arg."
        sys.exit(1)

    #reference maps -- only required if xmap not specified
    refcmap = os.path.realpath(result.referenceMap)
    if runaligns and not util.checkFile(refcmap, ".cmap") : #and not util.checkFile(refcmap, ".spots") :
        print "Reference map file ("+refcmap+") not found or does not end in .cmap or .spots. Check -r argument."
        sys.exit(1)

    #query maps
    qrypath = os.path.realpath(result.queryDir)
    #if runaligns and not util.checkFile(qrypath, ".cmap") :
    #    print "Query map file ("+qrypath+") not found or does not end in .cmap or .spots. Check -q argument."
    #    sys.exit(1)
    if not util.checkDir(qrypath, checkWritable=False, makeIfNotExist=False) : #does NOT have to be writeable
        print "Query dir ("+qrypath+") not found or not a dir. Check -q argument."
        sys.exit(1)
    if runaligns :
        contigdir  = qrypath #os.path.split(qrypath)[0] #dir of query maps
        contigbase = os.path.split(qrypath)[1] #filename
    else :
        contigdir  = os.path.split(xmappath)[0]
        contigbase = os.path.split(xmappath)[1] #filename
    #contigbase = contigbase[:contigbase.find(".")] #remove suffix

    #optargs file
    optargs = None
    if result.optArguments : #supplied on command line
        optargs = result.optArguments
        if not util.checkFile(optargs, ".xml") :
            print "optArguments path is supplied ("+optargs+") but not found or doesn't end in .xml, check -a argument."
            sys.exit(1)
    elif runaligns : #load from Pipeline dir if running alignments
        optargs = os.path.join(cwd,"optArguments_human.xml")
        if not util.checkFile(optargs):
            print "optArguments.xml missing in Pipeline directory ("+cwd+"). Try supplying path explicitly using -a."
            sys.exit(1)

    #cluster args
    clustargs = None
    if result.cxml :
        clustargs = os.path.realpath(result.cxml)
        if not util.checkFile(clustargs, ".xml") :
            print "clusterArguments path is supplied ("+clustargs+") but not found or doesn't end in .xml, check -C argument."
            sys.exit(1)

    #nthreads
    nthreads = result.numThreads
    if nthreads <= 0 :
        print "Number of threads value invalid (must be > 0): %i" % nthreads
        sys.exit(1)

    #maxthreads
    maxthreads = result.maxthreads
    if maxthreads <= 0 :
        print "Max threads value invalid (must be > 0): %i" % maxthreads
        sys.exit(1)

    #bed file
    bedfile = result.bedFile #must make local for return statement below
    if bedfile : #must check for empty string BEFORE you do realpath, or it returns cwd
        bedfile = os.path.realpath(result.bedFile)
        if not util.checkFile(bedfile, ".bed") :
            print "bed file supplied but not found or incorrect suffix:", bedfile
            sys.exit(1)

    #.errbin file
    errbinfile = result.errbinFile
    if errbinfile :
        errbinfile = os.path.realpath(result.errbinFile)
        if not util.checkFile(errbinfile, ".errbin") :
            print "errbin file supplied but not found or incorrect suffix:", errbinfile
            sys.exit(1)

    #.err file
    errfile = result.errFile
    if errfile and errbinfile :
        print "Warning: .err and .errbin arguments supplied; ignoring .err file"
        errfile = ""
    elif errfile :
        errfile = os.path.realpath(result.errFile)
        if not util.checkFile(errfile, ".err") :
            print "err file supplied but not found or incorrect suffix:", errfile
            sys.exit(1)

    outdir = os.path.realpath(result.outputDir)

    #yes, this is messy...but I don't want another class (besides varsPipeline) and they just go to runCharacterize
    return cwd, rabin, refcmap, contigdir, contigbase, runaligns, xmappath, optargs, nthreads, maxthreads, bedfile, errfile, outdir, errbinfile, clustargs, result.groupsv