def __init__(self, varsP) :
     """sortBNX.__init__: this class is for sorting the input bnx
     for subsequent splitting by the splitBNX class, and eventually
     easier processing with the Pairwise class. The constructor
     (this) will call varsP.runJobs and doAllPipeReport."""
     self.stageName="SortBNX"
     self.varsP = varsP #fewer code modifications below
     self.varsP.sorted_file = self.varsP.bnxFile.replace(".bnx", "_sorted")
     #replace this with checkMinMol; this needs to use sorted file which isn't yet made
     #calculateNPairwise(self.varsP, self.varsP.bnxFile.replace(".bnx","")) #run this here bc it contains check on N mol required to start pipeline
     checkMinMol(self.varsP, self.varsP.bnxFile)
     if self.generateJobList() : #return 0 for success, 1 for skip
         if not util.checkFile(self.varsP.sorted_file+".bnx") : #this happens when accidentally using bypass but no sorted bnx exists--log error
             err = "ERROR: no sorted bnx file found (%s) (check bypass (-B) argument to Pipeline)" % (self.varsP.sorted_file+".bnx")
             self.varsP.updatePipeReport(err+"\n")
             util.LogError("critical", err)
             util.LogStatus("progress", "pipeline", "failure")
             raise RuntimeError
         #calculateNPairwise(self.varsP, self.varsP.sorted_file) #correct varsP.nPairwiseJobs -- already above
         return
     util.LogStatus("progress", "stage_start", self.stageName) #after above bc check if bypass (executeCurrentStage)
     self.varsP.runJobs(self, "SortBNX")
     self.doAllPipeReport()
     if not self.allResultsFound() :
         err = "ERROR: sortBNX failed. Check: "+self.varsP.bnxFile
         self.varsP.updatePipeReport(err+"\n")
         util.LogError("critical", err)
         util.LogStatus("progress", "pipeline", "failure")
         raise RuntimeError
     util.LogStatus("progress", "stage_complete", self.stageName)
 def readCmapFile(self, cmapFile):
     """Read a .cmap file and add each map in it to self.cmapDB.

     cmapFile: path to the cmap file. If util.checkFile rejects it, an
         error is printed and the method returns without changing cmapDB.

     Lines starting with '#' and blank lines are skipped. Data lines are
     split on tabs and read by 0-based column: 0 = cmap id, 1 = cmap
     length, 2 = number of sites, 3 = site id, 5 = site location,
     7 = coverage. A line whose site id exceeds the number of sites ends
     the current map; the next data line starts a new Cmap object.

     Returns None. Raises ValueError/IndexError on a malformed data line.
     """
     if not util.checkFile(cmapFile) :
         #single formatted argument: prints the same text on python 2 and 3
         print("Error in MapClassesRev.MultiCmap.readCmapFile: missing file %s" % (cmapFile,))
         return
     newCmap = True
     with open(cmapFile) as f1 : #'with' closes the handle even if a line fails to parse
         for line in f1 :
             if line[0] == '#' or not line.strip() : #header/comment or blank line
                 continue
             tokens = line.split('\t')
             if newCmap:
                 #first line of a map: create the Cmap and remember its site count
                 cmapID = int(tokens[0])
                 cmapLen = float(tokens[1])
                 curCmap = Cmap(cmapID, cmapLen)
                 self.cmapDB[cmapID] = curCmap
                 nSites = int(tokens[2])
                 newCmap = False
             siteID = int(tokens[3])
             if siteID > nSites:
                 #past the last site of this map: do not store, start a new map next line
                 newCmap = True
                 continue
             siteLoc = float(tokens[5])
             covg = float(tokens[7])
             curCmap.addSite(siteID, siteLoc, covg)
def mergeRcmaps(outFileList, outdir, varsP=None, splitByContig=None, stageName="") :
    """Merge a set of "_r.cmap" files and write the result to outdir.

    outFileList: list of file prefixes ("_r.cmap" is appended) or of full
        paths already ending in "_r.cmap"; both are supported. The list is
        sorted in place. Files missing on disk are reported and skipped.
    outdir: output directory; checked with util.checkDir.
    varsP: optional; messages go through logOrPrintError(msg, varsP). Also
        supplies alignMolvrefMergeName (else "merge") and, when stageName
        is empty, outputContigPrefix.
    splitByContig: if < 1 (or None, the default), output each contig
        separately; if == 1, only output a single merged cmap; if > 1, do
        both.
    stageName: output file prefix. Always used if supplied; if not, varsP
        must be supplied, otherwise the prefix is empty.

    Returns None in all cases (problems are reported, not raised).
    """

    if not util.checkDir(outdir) :
        err_msg = "Warning in AlignModule.mergeRcmaps: could not make outdir %s, skipping copy number" % outdir
        logOrPrintError(err_msg, varsP)
        return

    if not outFileList : #just an argument check--check for presence on disk is below
        err_msg = "Warning in AlignModule.mergeRcmaps: no maps supplied"
        logOrPrintError(err_msg, varsP)
        return

    outFileList.sort() #for reproducibility with runAlignMerge.py (different order when listing dir)
    rsuf = "_r.cmap"
    #even though outFileList should all be there, a job may have failed--check all, just existence
    present = []
    for outf in outFileList :
        target = (outf if outf.endswith(rsuf) else outf+rsuf) #support prefix or full path
        if util.checkFile(target) :
            present.append(target)
        else :
            err_msg = "Warning in AlignModule.mergeRcmaps: missing _r.cmap %s" % target
            logOrPrintError(err_msg, varsP)
    if not present : #no _r.cmaps found on disk
        err_msg = "Warning in AlignModule.mergeRcmaps: no _r.cmaps found, skipping copy number"
        logOrPrintError(err_msg, varsP)
        return

    mrgstr = (varsP.alignMolvrefMergeName if varsP else "merge")

    mergedmap = mc.multiCmap(present[0]) #open first map, edit in memory
    for rmap in present[1:] : #don't add map 0 to itself
        if mergedmap.addCovOcc( mc.multiCmap(rmap) ) : #a true return is treated as failure
            err_msg = "Warning in AlignModule.mergeRcmaps: addCovOcc call failed for map %s" % rmap
            logOrPrintError(err_msg, varsP)

    #now it's merged, but the resulting map needs to be written back to disk
    filepref = (varsP.outputContigPrefix if varsP and stageName == "" else stageName) #see split_XMapQcmap_byContig

    #Handle None explicitly: python 2 orders None below every int (so None
    #behaved like "< 1"), but python 3 raises TypeError on None < 1.
    writeEach = (splitByContig is None or splitByContig != 1)
    writeMerged = (splitByContig is not None and splitByContig > 0)
    if writeEach :
        mergedmap.writeAllMapsToDisk( os.path.join(outdir, filepref+'_contig'), outsuf="_r" )
        report = "mergeRcmaps: wrote %i cmaps" % len(mergedmap.cmapdict)
    if writeMerged :
        mergedmap.writeToFile( os.path.join(outdir, filepref+"_"+mrgstr+rsuf) )
        report = "mergeRcmaps: wrote merged cmap with %i contigs" % len(mergedmap.cmapdict)
    #report result (when both were written, only the merged message is reported)
    logOrPrintError(report, varsP, warn=False)
 def readMapFile(self, mapFile, verbose=0):
     """Read a .map file and append one SingleMapResult per data line to self.hitDB.

     mapFile: path to the map file; checked with util.checkFile(mapFile, ".map").
         If the check fails, an error is printed and nothing is read.
     verbose: accepted for interface compatibility; not used here.

     Lines whose first character is '#', 'S' or 'M' are treated as header
     lines and skipped. Each remaining line is passed, together with
     self.qryCmap and self.refCmap, to SingleMapResult.

     Returns None.
     """
     commentChars = ('#','S','M')
     if not util.checkFile(mapFile, ".map") :
         #single formatted argument: prints the same text on python 2 and 3
         print("Error in MapResults.readMapFile: missing file %s" % (mapFile,))
         return
     with open(mapFile) as mapHandle : #'with' guarantees the handle is closed
         for line in mapHandle :
             if line[0] in commentChars :
                 continue
             self.hitDB.append( SingleMapResult(line, self.qryCmap, self.refCmap) )
def mergeMap(varsP, outFileList, mergepath) :
    """Merge the .map files of a set of alignment jobs into one .map file.

    varsP: pipeline variables object, or None. If supplied, warnings go to
        varsP.updatePipeReport and varsP.alignMolvrefMergeName is used in
        the output name; if None, warnings are printed and "merge" is used.
    outFileList: list of path+prefixes--each should have a "<prefix>.map"
        file. Sorted in place to ensure reproducibility (order of entries).
        Missing .map files are reported and skipped.
    mergepath: directory for the merged file; checked with util.checkDir.

    The output is getMergeFilename(outFileList[0]) + merge name + ".map" in
    mergepath. Header lines (first character '#', 'S' or 'M') are copied
    from the first existing map only; every data line is copied with its
    first column replaced by a running index starting at 1.

    Returns None.
    """

    def warn(msg) :
        #report to the pipeline if we have one, otherwise to stdout
        if varsP :
            varsP.updatePipeReport(msg+"\n")
        else :
            print(msg+"\n")

    outFileList.sort() #sort to ensure reproducibility (order of entries)
    maplist = []
    for outpath in outFileList : #these are file prefixes
        if util.checkFile(outpath+".map") :
            maplist.append(outpath+".map")
        else :
            warn("Warning in AlignModule.mergeMap: missing map: "+outpath+".map")

    if not maplist : #nothing to merge
        return

    if not util.checkDir(mergepath) :
        #goes through warn() so that varsP == None no longer raises AttributeError here
        warn("Warning in AlignModule.mergeMap: merge path invalid: "+mergepath)
        return

    headstart = ("#", "S", "M") #last two lines of header start with "Software" and "MappedMoleculeId"
    headerdone = False
    lineno = 1 #can't just append: need to change index in first column
    sep = "\t"
    mappref = getMergeFilename(outFileList[0]) #also in getAlignStats
    mrgstr  = (varsP.alignMolvrefMergeName if varsP else "merge") #same for vref and not
    outpath = os.path.join(mergepath, mappref+mrgstr+".map")
    with open(outpath, 'w') as merged : #both handles are closed even if a line fails
        for path in maplist :
            with open(path) as f :
                for line in f :
                    if line[0] in headstart :
                        if not headerdone : #header is copied from the first map only
                            merged.write(line)
                        continue
                    tokens = line.split()
                    if not tokens : #blank line: nothing to renumber
                        continue
                    tokens[0] = str(lineno)
                    merged.write(sep.join(tokens)+"\n") #newline was stripped by split
                    lineno += 1
            headerdone = True
    def generateJobList(self) :
        """splitBNX.generateJobList: queue the RefAligner job(s) that split the sorted bnx.

        The input is varsP.sorted_file + ".bnx"; if it is missing the error
        is reported and logged and RuntimeError is raised. The number of
        pieces comes from calculateNPairwise, called on the sorted file.

        Returns 1 if varsP.executeCurrentStage is false (tells __init__ not
        to continue processing); otherwise queues the job(s) with addJob and
        returns None. For refaligner_version < 3995 one job per piece is
        queued ("-subsetbin <i> <N>"); for later versions a single job with
        "-subsetbin 0 <N>" is queued.
        """

        sortedPrefix = self.varsP.sorted_file
        sortedBnx = sortedPrefix+".bnx"
        if not util.checkFile(sortedBnx) :
            err = "ERROR: splitBNX input file (%s) not found; exiting" % self.varsP.sorted_file
            self.varsP.updatePipeReport(err+"\n")
            util.LogError("critical", err)
            util.LogStatus("progress", "pipeline", "failure")
            raise RuntimeError

        #done here (not in sortBNX) because it needs to use the sorted bnx
        nPieces = calculateNPairwise(self.varsP, sortedPrefix)

        self.varsP.updatePipeReport('Splitting BNX\n')
        super(splitBNX, self).__init__(self.varsP, self.stageName, clusterArgs=self.varsP.getClusterArgs('splitting'))

        #skip the rest and return 1, like in sortBNX
        if not self.varsP.executeCurrentStage:
            return 1

        scaleNote = ""
        if self.varsP.doScanScale :
            scaleNote = " scan-scaled"
        self.varsP.updatePipeReport("Splitting"+scaleNote+" bnx file: %s.bnx\n\n" % self.varsP.sorted_file)

        #threads per job: file size / 1.5 GB rounded up (at least 1), plus 1 because that was too low
        sizeUnits = int(math.ceil( os.path.getsize(sortedBnx)/1.5e9 ))
        nThreads = max(1, sizeUnits) + 1
        if nThreads > 1 :
            self.varsP.updatePipeReport("Using %i threads per job\n" % nThreads)

        def queueSplit(binIndex, outPrefix, jobLabel) :
            #build one RefAligner split command and add it as a job
            cmd = [self.varsP.RefAlignerBin, '-f', '-i', sortedBnx, "-maxthreads", str(nThreads), "-merge", "-subsetbin", str(binIndex), str(nPieces), "-bnx", "-o",  outPrefix]
            if self.varsP.stdoutlog :
                cmd.extend( ['-stdout', '-stderr'] )
            self.addJob(mthread.singleJob(cmd, jobLabel, outPrefix+".bnx", jobLabel, maxThreads=nThreads, clusterLogDir=self.varsP.clusterLogDir, expectedStdoutFile=outPrefix+".stdout"))

        #the change in job partitioning breaks backward compatibility, so it is conditional on refaligner version
        if self.varsP.refaligner_version < 3995 :
            for binIndex in range(1, nPieces + 1) :
                piecePrefix = self.varsP.bnxFile.replace(".bnx", "_%s_of_%s" %(binIndex, self.varsP.nPairwiseJobs))
                queueSplit(binIndex, piecePrefix, self.stageName + str(binIndex))
        else :
            #single command with -subsetbin 0 N
            queueSplit(0, self.varsP.bnxFile.replace(".bnx", ""), self.stageName)
 def generateJobList(self):
     """SampleChar.generateJobList: queue one RefAligner job per input bnx file.

     If varsP.bnxTarget exists (only when image processing was run), the
     bnx list comes from parseExperimentFile and each output prefix is the
     bnx path with ".bnx" removed plus "_sampleChar". Otherwise the only
     input is varsP.bnxFile and output goes to <localRoot>/sampleChar,
     which is removed first if varsP.wipe is set and then (re)created.

     Each job's expected result is "<output prefix>.err". The 'noise0' and
     'sampleChar' argument groups are appended to every command. Ends by
     calling logArguments.

     Returns None; returns early (after printing and reporting an error)
     if bnxTarget exists but lists no bnx files.
     """
     extraArgs = self.varsP.argsListed('noise0') + self.varsP.argsListed('sampleChar')
     if util.checkFile(self.varsP.bnxTarget) : #file exists only if image processing was run
         bnxFiles = parseExperimentFile(self.varsP.bnxTarget)
         if not bnxFiles : #need at least one
             errstr = "ERROR in SampleChar.generateJobList: no bnx files found in: "+self.varsP.bnxTarget
             print(errstr)
             self.varsP.updatePipeReport(errstr+"\n\n")
             return
         basepath = "" #empty marks the image processing case below
     else : #otherwise, assume this is the only bnx file
         bnxFiles = [self.varsP.bnxFile]
         basepath = os.path.join(self.varsP.localRoot, "sampleChar")
         if self.varsP.wipe and os.path.isdir(basepath) :
             shutil.rmtree(basepath)
         util.checkDir(basepath) #will make if not exist, but won't remove anything

     for bnxFile in bnxFiles :
         bnxname = os.path.basename(bnxFile).replace(".bnx","")
         jobname = 'Sample_Char_' + bnxname
         if basepath : #bnx input
             outputTarget = os.path.join(basepath, bnxname)
         else : #image processing
             outputTarget = bnxFile.replace(".bnx","") + "_sampleChar"
         expectedResultFile = outputTarget + '.err' #this is used in checkResults

         command = [self.varsP.RefAlignerBin, '-i', bnxFile, '-ref', self.varsP.ref, '-o' , outputTarget, '-f']
         if self.varsP.stdoutlog :
             command.extend( ['-stdout', '-stderr'] )
         command += ['-maxthreads', str(self.varsP.maxthreads)] + extraArgs

         self.addJob(mthread.singleJob(command, jobname, expectedResultFile, jobname, clusterLogDir=self.varsP.clusterLogDir))
     self.logArguments()
def split_XMap_byContig_new(outFileList, mergepath, varsP=None, stageName="") :
    """Split the alignments in a set of .xmap files into one .xmap per reference contig.

    outFileList: list of path+prefixes; "<prefix>.xmap" is read for each one.
        Sorted in place (before use) so the order of entries is reproducible.
        Missing .xmap files are reported and skipped.
    mergepath: directory that receives "<filepref>_contig<refid>.xmap".
    varsP: optional; passed to logOrPrintError, and supplies
        outputContigPrefix as the file prefix when stageName is empty.
    stageName: output file prefix; used whenever it is not empty.

    For each data line, column 1 is read as the query (molecule) id and
    column 2 as the reference contig id (0-based, whitespace separated);
    lines where these are missing or not integers are skipped. The line is
    appended to that contig's output file with column 0 renumbered from 1.
    The '#' header is copied from the first existing input .xmap, and the
    "Query Maps" / "Reference Maps" header lines of each output are then
    rewritten to point to "<output prefix>_q.cmap" / "_r.cmap".

    Returns a dict mapping reference contig id to the list of unique query
    ids aligned to it (in order first seen), or None if no .xmap was found.
    """
    logOrPrintError("Start split_XMapQcmap_byContig", varsP, warn=False)

    #sort BEFORE building the file list: sorting afterwards does not affect the processing order
    outFileList.sort()
    xmapFilelist = []
    for outpath in outFileList : #these are file prefixes
        if util.checkFile(outpath+".xmap") :
            xmapFilelist.append(outpath+".xmap")
        else :
            err_msg = "Warning in AlignModule.split_XMapQcmap_byContig: missing xmap: "+outpath+".xmap"
            logOrPrintError(err_msg, varsP)

    if not xmapFilelist : #nothing to split
        err_msg = "Error in AlignModule.split_XMapQcmap_byContig: no xmaps found"
        logOrPrintError(err_msg, varsP)
        return

    filepref = (varsP.outputContigPrefix if varsP and stageName == "" else stageName) #same as line in mergeRcmaps

    xmapLineDict = {} #refid -> number of lines written so far (avoids re-counting on each open)
    xmapMolDict = {} #refid -> unique molecule IDs, for use in split_Qcmap_byContig
    seenMols = {} #refid -> set of the same IDs, for O(1) duplicate checks
    newxmaplist = [] #prefixes of output xmaps, to fix their headers below

    #header comes from the first xmap known to exist (outFileList[0] may be one of the missing files)
    headerLines = []
    with open(xmapFilelist[0]) as f1 :
        for line in f1 :
            if line[0] != '#' :
                break
            headerLines.append(line)
    header = "".join(headerLines)

    for path in xmapFilelist :
        with open(path) as f :
            for line in f :
                if line[0] == "#" : #header was read separately above
                    continue
                tokens = line.split()
                try :
                    qryid = int(tokens[1])
                    refid = int(tokens[2])
                except (IndexError, ValueError) : #short or non-numeric line: not an alignment entry
                    continue

                outpref = os.path.join(mergepath, filepref+'_contig'+str(refid))
                firstEntry = refid not in xmapLineDict
                #re-open per line to avoid keeping too many file handles open.
                #first entry for a contig truncates, so output left by an earlier run is not appended to
                with open(outpref+".xmap", "w" if firstEntry else "a") as outf :
                    if firstEntry :
                        newxmaplist.append(outpref) #outpref depends only on refid, so this is unique
                        xmapLineDict[refid] = 1
                        xmapMolDict[refid] = [qryid]
                        seenMols[refid] = set([qryid])
                        outf.write(header)
                    else :
                        xmapLineDict[refid] += 1
                        #xmapMolDict is used to make the _q.cmap, so its entries must be unique
                        if qryid not in seenMols[refid] :
                            seenMols[refid].add(qryid)
                            xmapMolDict[refid].append(qryid)
                    tokens[0] = str(xmapLineDict[refid])
                    outf.write("\t".join(tokens)+"\n")

    #fix the headers (Query Maps / Reference Maps paths): must re-read and re-write each file
    for path in newxmaplist :
        with open(path+".xmap", "r") as f :
            lines = f.readlines()
        with open(path+".xmap", "w") as f :
            for line in lines :
                if line.find("Query Maps") != -1 :
                    line = line.split(":")[0] + ":\t" + path + "_q.cmap" + "\n"
                elif line.find("Reference Maps") != -1 :
                    line = line.split(":")[0] + ":\t" + path + "_r.cmap" + "\n"
                f.write(line)
    logOrPrintError("split_XMapQcmap_byContig: wrote %i xmaps" % len(xmapMolDict), varsP, warn=False) #reproduce original fn
    return(xmapMolDict)
def getAlignStats(varsP, outFileList, reflen=0, isref=False, mergepath="", bnxpath=None) :
    '''Standalone fn for alignment statistics for both AlignModule and AlignRefModule.

    varsP: pipeline variables object; used for reporting (updatePipeReport,
        updateInfoReport), for the 'bnx_sort' arguments, sorted_file and
        alignMolvrefMergeName.
    outFileList: list of alignment output path+prefixes; "<prefix>.xmap"
        and "<prefix>.err" are read for each one that exists.
    reflen: reference length, should be in Mb; 0 disables the coverage lines.
    isref: selects the labels "Reference"/"Ref" instead of "Assembly"/"Contig".
    mergepath: if supplied, and at least one .err was read, an .err holding
        the averaged parameters (and summed map counts) is written there.
    bnxpath: if None, assume varsP.sorted_file+".bnx" (nothing bnx-related
        is done if sorted_file is empty); otherwise, just report stats of
        this file and ignore outFileList.

    Returns None; all results are reported through varsP.
    '''

    def mean(values) :
        #float mean, 0 for an empty list; float() guards against python 2 integer division
        return (float(sum(values))/len(values) if values else 0)

    statonly = False #bnx stats only
    skipbnx = False #.err file processing only
    if bnxpath is None :
        if not varsP.sorted_file : #for runAlignMol, this is empty: nothing to do in this case
            skipbnx = True
        else :
            bnxpath = varsP.sorted_file+".bnx" #set in PairwiseModule.sort_BNX even if bypassed, but needs suffix
    else :
        statonly = True
    if not skipbnx and not util.checkFile(bnxpath) :
        varsP.updatePipeReport("Warning in AlignModule.getAlignStats: bnxpath supplied but not found: %s\n" % bnxpath)
        return

    #find the minlen used for bnx_sort, which is a required arg set
    sortargs = []
    if 'bnx_sort' in varsP.argData : #may be absent, eg, for runAlignMol.py
        sortargs = varsP.argsListed('bnx_sort')
    minlen = 0
    validminlen = False
    if "-minlen" in sortargs :
        #next ele should be the len; if there is no next ele, the sort job will fail
        minlen = util.getIntFromString( sortargs[sortargs.index("-minlen")+1] ) #None if can't cast to int
        if minlen :
            validminlen = True

    #NOTE(review): bnxpath was reassigned above for the default case, so from
    #here on it is None only when skipbnx is set. As a result minlen is reset
    #to 0 whenever bnx stats are computed, and the warning can only fire when
    #no bnx is read. Kept as-is so reported numbers do not change; confirm
    #whether only a caller-supplied bnxpath (statonly) was meant to ignore minlen.
    if not validminlen and bnxpath is None and sortargs :
        varsP.updatePipeReport("Warning in AlignModule.getAlignStats: unable to obtain minlen from bnx_sort arguments; defaulting to 0\n")
    if bnxpath is not None : #if bnxpath, ignore minlen
        minlen = 0

    nmol = 0 #total n mol above minlen
    totlen = 0 #total mol len above minlen
    if not skipbnx and util.checkFile(bnxpath) :
        outstr = "Reading molecule stats from %s:\n" % bnxpath
        outstr += "Molecule Stats:\n"
        moldict = util.simpleBnxStats(bnxpath, minlen)
        nmol = moldict["nmol"]
        totlen = moldict["totlen"]
        outstr += "N mols: %i\n" % nmol
        outstr += ("Total len (Mb): %10.3f\n") % totlen
        outstr += ("Avg len (kb)  : %10.3f\n") % moldict["avglen"]
        outstr += ("Mol N50 (kb)  : %10.3f\n") % moldict["n50"]
        outstr += ("Lab (/100kb)  : %10.3f\n") % moldict["labdensity"]

        if reflen :
            cov = float(totlen) / reflen #totlen is in Mb
            outstr += ("%-6s Cov (x): %10.3f\n") % ("Ref" if isref else "Contig", cov)
        if isref or reflen or statonly : #if neither, nothing to print
            varsP.updateInfoReport(outstr + "\n", printalso=True)
    elif not skipbnx :
        varsP.updatePipeReport("Warning in AlignModule.getAlignStats: missing bnx path:"+bnxpath+"\n")

    if statonly :
        return

    #lastly, load .xmaps and .errs from alignmol jobs and report on stats
    totmaplen = 0 #sum of lengths of mapped portions of all molecules, on reference
    totmapqrylen = 0 #sum of lengths of mapped portions of all molecules, on query
    totconf = 0 #sum of confidence of all alignments
    nalign = 0 #total number of alignments
    #attributes of the alignParams (.err) object that are averaged over all jobs
    avgfields = ("fp", "fprate", "fn", "bpp", "res", "llrm", "llrgm", "bppsd", "sf", "sd", "sr", "ressd")
    samples = dict((name, []) for name in avgfields)
    nmaplist = [] #from .err; summed, not averaged
    gmaplist = [] #from .err; summed, not averaged
    err = None #will be the alignParams object if any .err files are found
    mappref = ""
    if len(outFileList) > 0 :
        mappref = getMergeFilename(outFileList[0]) #same convention as in mergeMap
    for outpath in outFileList : #these are file prefixes
        if util.checkFile(outpath+".xmap") :
            xmap = mc.xmap(outpath+".xmap")
            nalign += len(xmap.xmapLookup)
            totmaplen += xmap.getSumMappedRefLen() #in kb
            totmapqrylen += xmap.getSumMappedQryLen() #in kb
            totconf += sum(x.Confidence for x in xmap.xmapLookup.values())
        else :
            varsP.updatePipeReport("Warning in AlignModule.getAlignStats: missing xmap:"+outpath+".xmap"+"\n")
        if util.checkFile(outpath+".err") :
            err = mc.alignParams(outpath+".err")
            for name in avgfields :
                samples[name].append( getattr(err, name) )
            nmaplist.append(err.nmaps)
            gmaplist.append(err.goodmaps)

    #nalign from xmap should be the same as goodmaps from .err
    sumgoodmaps = sum(gmaplist)
    if sumgoodmaps != nalign :
        varsP.updateInfoReport("Warning in getAlignStats: n mol align differ in .err files (%i) and .xmaps (%i)\n" % (sumgoodmaps, nalign), printalso=True)
    if totmaplen or totconf or nalign :
        outstr =  "Molecules Aligned to %s:\n" % ("Reference" if isref else "Assembly")
        outstr += "N mol align       : %9i\n" % nalign
        outstr += "Mol fraction align: %13.3f\n" % (float(nalign)/nmol if nmol else 0)
        outstr += "Tot align len (Mb): %11.1f\n" % (totmapqrylen / 1e3) #Mb
        if reflen > 0 :
            outstr += ("Effective Cov (x) : %13.3f\n") % (totmaplen / 1e3 / reflen) #totmaplen is in kb
        outstr += "Avg align len (kb): %11.1f\n" % (float(totmapqrylen)/nalign if nalign else 0)
        outstr += "Fraction align len: %13.3f\n" % (totmapqrylen/1e3/totlen if totlen else 0) #totmapqrylen is in kb, totlen is in mb
        outstr += "Tot confidence    : %11.1f\n" % totconf
        outstr += "Avg confidence    : %11.1f\n" % (float(totconf)/nalign if nalign else 0)
        varsP.updateInfoReport(outstr, printalso=True)

    avg = dict((name, mean(samples[name])) for name in avgfields)
    if avg["fp"] or avg["fn"] or avg["bpp"] :
        outstr =  "Avg FP(/100kb)    : %12.2f\n" % avg["fp"]
        outstr += "Avg FP ratio      : %13.3f\n" % avg["fprate"]
        outstr += "Avg FN ratio      : %13.3f\n" % avg["fn"]
        outstr += "Avg bpp           : %11.1f\n" % avg["bpp"]
        outstr += "Avg sf            : %13.3f\n" % avg["sf"]
        outstr += "Avg sd            : %13.3f\n" % avg["sd"]
        outstr += "Avg sr            : %13.3f\n" % avg["sr"]
        varsP.updateInfoReport(outstr + "\n", printalso=True)
    if err and mergepath : #have an error file (alignParams) object
        #reuse the last alignParams object read: overwrite its values with
        #the averages/sums and write it out as the merged .err
        util.checkDir(mergepath)
        mrgstr = (varsP.alignMolvrefMergeName if varsP else "merge")
        outpath = os.path.join(mergepath, mappref+mrgstr+".err")
        for name in avgfields :
            setattr(err, name, avg[name])
        err.nmaps = sum(nmaplist)
        err.goodmaps = sumgoodmaps
        err.writeToFile(outpath)
    def __init__(self, varsP) :
        """Run the two automatic noise-estimation stages, Autonoise0 then Autonoise1.

        The input bnx is varsP.bnxFile if varsP.noiseOnly, otherwise
        varsP.sorted_file + ".bnx". For each stage the constructor calls
        generateJobListChar (a true return means skip: return immediately),
        then varsP.runJobs and doAllPipeReport, then checks allResultsFound.

          * Autonoise0: run with an empty parameter dict; the result is read
            with readNoiseParameters into varsP.noise0, and the 'noise0'
            argument "-readparameters" is pointed at output_file + ".errbin".
          * Autonoise1: run with varsP.noise0; the result is read into
            varsP.noise1, and each of FP, FN, sf, sd, sr, bpp,
            readparameters found there is logged, added to the info report
            and written into the 'noise0' arguments.

        If varsP.doScanScale is set, varsP.sorted_file is changed to the
        rescaled bnx (output_file + varsP.rescaleSuffix) when that file
        exists; otherwise a warning is logged and doScanScale is set False.

        self.output_file is not set in this method; presumably it is set by
        generateJobListChar -- verify there.

        Raises RuntimeError if the results of either stage are not found.
        """
        self.stageName = "Autonoise0"
        self.varsP = varsP #fewer code modifications below

        util.LogStatus("progress", "stage_start", self.stageName)
        bnxfile = self.varsP.bnxFile if varsP.noiseOnly else self.varsP.sorted_file+".bnx"
        if self.generateJobListChar({}, bnxfile, "autoNoise0") : #return 0 for success, 1 for skip
            return
        self.varsP.runJobs(self, "AutoNoise0")
        self.doAllPipeReport()
        if not self.allResultsFound() :
            self.varsP.updatePipeReport("ERROR: AutoNoise0 failed. Check: "+self.output_file+".stdout\n")
            raise RuntimeError
        util.LogStatus("progress", "stage_complete", self.stageName)

        self.varsP.noise0 = readNoiseParameters(self.output_file)
        self.isBadErrorParams(self.varsP.noise0)

        self.stageName = "Autonoise1"
        util.LogStatus("progress", "stage_start", self.stageName)

        self.clearJobs()

        #second pass starts from the parameters written by the first pass
        self.varsP.replaceParam("noise0", "-readparameters", self.output_file+".errbin")

        if self.generateJobListChar(self.varsP.noise0, bnxfile, "autoNoise1") : #return 0 for success, 1 for skip
            return
        self.varsP.runJobs(self, "AutoNoise1")
        self.doAllPipeReport()
        if not self.allResultsFound() :
            self.varsP.updatePipeReport("ERROR: AutoNoise1 failed. Check: "+self.output_file+".stdout\n")
            raise RuntimeError

        self.varsP.noise1 = readNoiseParameters(self.output_file)

        infoReport="Automatically determined noise parameters:\n"
        klist = ["FP", "FN", "sf", "sd", "sr", "bpp", "readparameters"] #hardcoding parameters is kind of bad, but it fixes the order without using OrderedDict.
        for v in klist :
            if v not in self.varsP.noise1 : #parameter was not reported: leave the argument alone
                continue
            param=str(self.varsP.noise1[v])
            util.LogStatus("parameter", "auto_"+v, param)
            infoReport+=v+":"+param+"\n"
            self.varsP.replaceParam("noise0", "-"+v, param)
        self.varsP.updateInfoReport(infoReport + '\n')
        self.isBadErrorParams(self.varsP.noise1)

        if self.varsP.doScanScale : #change the sorted_file to the rescaled bnx file
            rescaledbnx = self.output_file + self.varsP.rescaleSuffix #no ".bnx" in suffix
            if not util.checkFile(rescaledbnx+".bnx") : #not found--not an error if bnx 0.1 is used
                err = "Warning: scan scaled bnx not found after autoNoise1; not performing scan scaling--check that bnx 1.0 or later used in input"
                self.varsP.updatePipeReport( err+"\n\n" )
                util.LogError("warning", err)
                self.varsP.doScanScale = False
            else :
                self.varsP.sorted_file = rescaledbnx #this variable is used in splitBNX (PairwiseModule.py)

        util.LogStatus("progress", "stage_complete", self.stageName)
def getArgs():
    parser = argparse.ArgumentParser(description=description)

    parser.add_argument(
        '-t',
        dest='RefAligner',
        help='Path to RefAligner (required unless xmap is specified (-x))')
    parser.add_argument(
        '-r',
        dest='referenceMap',
        help=
        'Path to reference maps (.cmap or .spots), 1 file only (required unless xmap specified (-x) and _r.cmap is present in same dir as xmap)',
        default="")
    parser.add_argument(
        '-q',
        dest='queryMap',
        help=
        'Path to query maps (.cmap), 1 file only (required--if xmap specified (-x), this should be input (-i argument) for that command)',
        default="")
    parser.add_argument(
        '-x',
        dest='xmap',
        help=
        'Path to .xmap, 1 file only (optional, if specified, no alignment is done, if not specified, -t, -r, and -q must be specified)'
    )
    parser.add_argument(
        '-p',
        dest='pipelineDir',
        help='Pipeline dir (optional, defaults to current directory)')
    parser.add_argument(
        '-a',
        dest='optArguments',
        help=
        'Path to optArguments.xml (optional, default in Pipeline dir if found, otherwise required)'
    )
    parser.add_argument(
        '-n',
        dest='numThreads',
        help='Number of threads (cores) to use (optional, default 4)',
        default=4,
        type=int)
    parser.add_argument('-v',
                        dest='pvalue',
                        help='Pvalue (-T) used for alignment',
                        default="1e-12")
    result = parser.parse_args()

    #check all Pipeline dependencies
    if result.pipelineDir:
        cwd = result.pipelineDir
    else:
        cwd = os.getcwd()

    #this is the only one imported here and in runCharacterize
    if not os.path.isfile(os.path.join(cwd, "utilities.py")):
        print "utilities.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import utilities as util

    #xmap -- optional
    runaligns = True  #default is to run the alignment
    xmappath = None
    if result.xmap:
        xmappath = result.xmap
        if not util.checkFile(xmappath, ".xmap"):
            print "Xmap path is supplied (" + xmappath + ") but not found or doesn't end in .xmap."
            sys.exit(1)
        runaligns = False

    #RefAligner -- only required if xmap not specified
    rabin = result.RefAligner
    if not xmappath and not util.checkExecutable(rabin):
        print "RefAligner not found at", rabin, "\nPlease supply RefAligner full path as -t arg."
        sys.exit(1)

    #reference maps -- only required if xmap not specified
    refcmap = result.referenceMap
    if runaligns and not util.checkFile(
            refcmap, ".cmap") and not util.checkFile(refcmap, ".spots"):
        print "Reference map file (" + refcmap + ") not found or does not end in .cmap or .spots. Check -r argument."
        sys.exit(1)

    #query maps -- only required if xmap not specified
    qrypath = result.queryMap
    #if runaligns and not util.checkFile(qrypath, ".cmap") :
    if not util.checkFile(qrypath, ".cmap"):  #always required
        print "Query map file (" + qrypath + ") not found or does not end in .cmap or .spots. Check -q argument."
        sys.exit(1)
    #if runaligns :
    contigdir = os.path.split(qrypath)[0]  #dir of query maps
    contigbase = os.path.split(qrypath)[1]  #filename
    #else :
    #    contigdir  = os.path.split(xmappath)[0]
    #    contigbase = os.path.split(xmappath)[1] #filename
    contigbase = contigbase[:contigbase.find(".")]  #remove suffix

    #optargs file
    optargs = None
    if result.optArguments:  #supplied on command line
        optargs = result.optArguments
        if not util.checkFile(optargs, ".xml"):
            print "optArguments path is supplied (" + optargs + ") but not found or doesn't end in .xml, check -a argument."
            sys.exit(1)
    elif runaligns:  #load from Pipeline dir if running alignments
        optafile = "optArguments_human.xml"
        optargs = os.path.join(cwd, optafile)
        if not util.checkFile(optargs):
            print "%s missing in Pipeline directory (%s). Try supplying path explicitly using -a." % (
                optafile, cwd)
            sys.exit(1)

    #nthreads
    nthreads = result.numThreads
    if nthreads <= 0:
        print "Number of threads value invalid (must be >= 0): " + nthreads
        sys.exit(1)

    #pvalue
    if result.pvalue:  #supplied on command line
        pvalue = result.pvalue
    else:
        pvalue = "1e-12"

    #yes, this is messy...but I don't want another class (besides varsPipeline) and they just go to runCharacterize
    return cwd, rabin, refcmap, contigdir, contigbase, runaligns, xmappath, optargs, nthreads, pvalue
Example #12
0
def split_XMap_byContig_new(outFileList, mergepath, varsP=None, stageName=""):
    """Split the alignments of a set of xmaps into one xmap per reference contig.

    outFileList -- list of path+prefixes; each should have a .xmap and
        _q.cmap file. The list is sorted in place. A prefix whose .xmap is
        missing is logged and skipped.
    mergepath -- directory in which the per-contig xmaps are written.
    varsP -- optional; passed to logOrPrintError, and its outputContigPrefix
        is the output file prefix when stageName is empty.
    stageName -- output file prefix when non-empty (or when varsP is not given).

    Each output file is <mergepath>/<prefix>_contig<refid>.xmap. It starts
    with the header of the first input xmap (with its "Query Maps" and
    "Reference Maps" lines pointed at <prefix>_contig<refid>_q.cmap/_r.cmap),
    followed by every alignment whose third column is refid, renumbered from
    1 in the first column. An existing output file is overwritten.

    Returns a dict {refid: [qryid, ...]} with each query id listed once, in
    order of first appearance, or None if none of the input xmaps exists.
    """
    logOrPrintError("Start split_XMapQcmap_byContig", varsP, warn=False)

    #sort BEFORE collecting the xmaps: this is what makes the order of the
    # entries in the output reproducible
    outFileList.sort()
    xmapFilelist = []
    for outpath in outFileList:  #these are file prefixes
        if util.checkFile(outpath + ".xmap"):
            xmapFilelist.append(outpath + ".xmap")
        else:
            err_msg = "Warning in AlignModule.split_XMapQcmap_byContig: missing xmap: " + outpath + ".xmap"
            logOrPrintError(err_msg, varsP)

    if not xmapFilelist:  #nothing to merge
        err_msg = "Error in AlignModule.split_XMapQcmap_byContig: no xmaps found"
        logOrPrintError(err_msg, varsP)
        return

    filepref = (
        varsP.outputContigPrefix if varsP and stageName == "" else stageName
    )  #same as line in mergeRcmaps

    xmapLineDict = {}  #refid -> number of alignments written so far (avoids re-counting lines on every open)
    xmapMolDict = {}  #refid -> list of unique molecule IDs, for use in split_Qcmap_byContig
    seenMols = {}  #refid -> set of the same IDs, so the uniqueness check is not a list scan
    newxmaplist = []  #prefixes of the output xmaps, to fix their headers below

    #header of the first xmap which actually exists (the first sorted prefix
    # may be one of those reported missing above)
    headerlines = []
    with open(xmapFilelist[0]) as f1:
        for line in f1:
            if line[0] != '#':
                break
            headerlines.append(line)
    header = "".join(headerlines)

    for path in xmapFilelist:
        with open(path) as f:
            for line in f:
                if line[0] == "#":  #header was read separately above
                    continue
                tokens = line.split()
                try:
                    qryid = int(tokens[1])
                    refid = int(tokens[2])
                except (IndexError, ValueError):  #blank or malformed line
                    continue

                outpref = os.path.join(mergepath,
                                       filepref + '_contig' + str(refid))
                firstentry = refid not in xmapLineDict
                if firstentry:
                    newxmaplist.append(outpref)
                    xmapLineDict[refid] = 1
                    xmapMolDict[refid] = [qryid]
                    seenMols[refid] = set([qryid])
                else:
                    xmapLineDict[refid] += 1
                    #because xmapMolDict is used to make the _q.cmap, its entries should be unique
                    if qryid not in seenMols[refid]:
                        seenMols[refid].add(qryid)
                        xmapMolDict[refid].append(qryid)

                tokens[0] = str(xmapLineDict[refid])  #renumber within this contig
                #First entry for a contig: truncate, so a file left by a
                # previous run is replaced rather than appended to. The file
                # is re-opened for every line to avoid keeping too many file
                # handles open, at the expense of re-opening many times.
                with open(outpref + ".xmap", "w" if firstentry else "a") as outf:
                    if firstentry:
                        outf.write(header)
                    outf.write("\t".join(tokens) + "\n")

    #need to fix headers still, ie, the editHeaderMaps/QueryMaps: must re-read and -write files
    for path in newxmaplist:
        with open(path + ".xmap", "r") as f:
            lines = f.readlines()
        with open(path + ".xmap", "w") as f:
            for line in lines:
                if line.find("Query Maps") != -1:
                    line = line.split(":")[0] + ":\t" + path + "_q.cmap" + "\n"
                elif line.find("Reference Maps") != -1:
                    line = line.split(":")[0] + ":\t" + path + "_r.cmap" + "\n"
                f.write(line)

    logOrPrintError("split_XMapQcmap_byContig: wrote %i xmaps" %
                    len(xmapMolDict),
                    varsP,
                    warn=False)  #reproduce original fn
    return xmapMolDict
Example #13
0
def characterizeContigs(varsP, xmappath=None):
    """Build a text summary of genome map (contig) statistics.

    varsP -- pipeline variables; this reads varsP.ref and
        varsP.latestMergedCmap, and sets varsP.totAssemblyLenMb to the total
        length of latestMergedCmap scaled by 1e-6.
    xmappath -- optional path to an .xmap. If non-empty, alignment statistics
        are appended to the summary; they are read from this file if it
        passes util.checkFile(xmappath, ".xmap"), otherwise they are computed
        from an empty xmap. RefAligner is never run here.

    If any .hmap file is found in the directory of latestMergedCmap, the
    "Diploid"/"Haploid" format is used instead of the default one. Lines
    relative to the reference are added only if varsP.ref is non-empty.

    Returns the summary as a single string, one statistic per line.
    """
    unitscale = 1e-6
    dorefalign = bool(xmappath)
    haveref = bool(varsP.ref)

    #reflen is the total length of _all_ reference contigs. In the summary it
    # is a denominator: make sure it's non-zero, and don't bail if the
    # reference can't be loaded (still get contig summary)
    try:
        reflen = float(
            mapClasses.multiCmap(varsP.ref, lengthonly=True).totalLength)
        if reflen <= 0:
            reflen = 1.
    except Exception:  #not a bare except: let KeyboardInterrupt/SystemExit through
        reflen = 1.

    #check for .hmaps in same dir as latestMergedCmap: if any, add lines for haploid genome size
    hmaps = util.getListOfFilesFromDir(os.path.dirname(varsP.latestMergedCmap),
                                       ".hmap")
    haplotype = (len(hmaps) > 0)

    mapi = mapClasses.multiCmap(varsP.latestMergedCmap)
    totcontiglen = mapi.totalLength
    contiglens = list(mapi.getAllMapLengths())  #lens of all contigs
    hapcontiglens = []
    if haplotype:
        hapcontiglens = list(mapi.getHaplotypeMapLengths())

    #ids of the contigs in this cmap: remove the ones which are in the xmap,
    # so that what's left is the contigs with no alignment
    mapids = mapi.getAllMapIds()
    unaligned = set(mapids)

    xmapobj = mapClasses.xmap()  #empty xmap: nothing to loop over unless one is loaded
    if dorefalign and util.checkFile(xmappath, ".xmap"):
        xmapobj = mapClasses.xmap(xmappath)

    totalignlen = 0
    uniqueseg = {}  #reference contig id -> list of [RefStart, RefStop], the argument to util.uniqueRange
    for xmapentry in xmapobj.xmapLookup.values():
        totalignlen += xmapentry.getMappedRefLen()
        #keyed on the reference contig id bc need to separate alignments on different reference contigs
        uniqueseg.setdefault(xmapentry.contigRef, []).append(
            [xmapentry.RefStart, xmapentry.RefStop])
        unaligned.discard(xmapentry.contigQry)
    nmapcontigs = len(mapids) - len(unaligned)  #n contigs with at least one alignment

    varsP.totAssemblyLenMb = totcontiglen * unitscale
    ncontigs = len(contiglens)
    avgcontiglen = (float(totcontiglen) / ncontigs if ncontigs > 0 else 0)

    if unitscale > 1e-6:  #if not megabases
        fstr = "%9.0f"
    else:  #megabases
        fstr = "%8.3f"

    def statline(label, value):
        #one summary line: the label followed by the value formatted with fstr
        return (label + fstr + "\n") % value

    out = []
    if haplotype:  #new format for haplotype
        #diploid is same as else below, but names change
        out.append("Diploid N Genome Maps: %i\n" % ncontigs)
        out.append(statline("Diploid Genome Map Len        (Mb): ",
                            totcontiglen * unitscale))
        out.append(statline("Diploid Avg. Genome Map Len   (Mb): ",
                            avgcontiglen * unitscale))
        out.append(statline("Diploid Median Genome Map Len (Mb): ",
                            util.getMedian(contiglens) * unitscale))
        out.append(statline("Diploid Genome Map n50        (Mb): ",
                            util.getn50(contiglens) * unitscale))
        #haploid : totals come from the list hapcontiglens
        out.append("Haploid N Genome Maps: %i\n" % len(hapcontiglens))
        tot = sum(hapcontiglens)
        #float() as for avgcontiglen above, so integer lengths are not truncated
        avg = (float(tot) / len(hapcontiglens) if hapcontiglens else 0)
        out.append(statline("Haploid Genome Map Len        (Mb): ",
                            tot * unitscale))
        out.append(statline("Haploid Avg. Genome Map Len   (Mb): ",
                            avg * unitscale))
        out.append(statline("Haploid Median Genome Map Len (Mb): ",
                            util.getMedian(hapcontiglens) * unitscale))
        out.append(statline("Haploid Genome Map n50        (Mb): ",
                            util.getn50(hapcontiglens) * unitscale))
    else:  #default to old format
        out.append("N Genome Maps: %i\n" % ncontigs)
        out.append(statline("Total Genome Map Len  (Mb): ",
                            totcontiglen * unitscale))
        out.append(statline("Avg. Genome Map Len   (Mb): ",
                            avgcontiglen * unitscale))
        out.append(statline("Median Genome Map Len (Mb): ",
                            util.getMedian(contiglens) * unitscale))
        out.append(statline("Genome Map n50        (Mb): ",
                            util.getn50(contiglens) * unitscale))

    if haveref:
        out.append(statline("Total Ref Len   (Mb): ", reflen * unitscale))
        out.append(statline("Total Genome Map Len / Ref Len : ",
                            totcontiglen / reflen))
    if dorefalign:
        ratio = (float(nmapcontigs) / ncontigs if ncontigs > 0 else 0)
        out.append("N Genome Maps total align      : %i (%.2f)\n" %
                   (nmapcontigs, ratio))
        out.append(statline("Total Aligned Len (Mb)            : ",
                            totalignlen * unitscale))
        out.append(statline("Total Aligned Len / Ref Len       : ",
                            totalignlen / reflen))
        uniquelen = 0
        for segs in uniqueseg.values():  #need to sum on dict entries
            util.uniqueRange(segs)  #this modifies list in place
            uniquelen += util.totalLengthFromRanges(segs)
        out.append(statline("Total Unique Aligned Len (Mb)     : ",
                            uniquelen * unitscale))
        out.append(statline("Total Unique Aligned Len / Ref Len: ",
                            uniquelen / reflen))

    return "".join(out)
def runAlignMol():
    """Command line entry point: align molecules to a cmap, or process an existing alignmol dir.

    The mode is chosen from the -q argument:
      * -q is a directory: no alignments are run; the alignment results in
        that directory are listed with getOutFileList and handed to the
        AlignModule object.
      * -q passes util.checkCmap: the bnx (-b) is aligned to it using
        RefAligner (-t) and the optArguments xml (-a, or
        optArguments_human.xml in the Pipeline dir).

    Noise parameters from an .errbin (-E) or .err (-e) file, if supplied, are
    loaded into the 'noise0' arguments; .errbin wins if both are given.

    All arguments are validated first; on the first invalid one a message is
    printed and the process exits via sys.exit(1). The Pipeline modules
    (utilities, AlignModule, Pipeline, mapClasses, and SampleCharModule if an
    .err file is used) are imported here, after their files have been found
    in the Pipeline dir. Returns None.
    """
    parser = argparse.ArgumentParser(description=description)

    parser.add_argument(
        '-q',
        dest='queryDir',
        help=
        'Path to merged cmap to align molecules (-b) to OR alignmol dir from Pipeline for merge (if latter, no alignments are performed), required',
        type=str)
    parser.add_argument(
        '-b',
        dest='bnx',
        help='Input molecule (.bnx) file, required if aligning molecules',
        type=str)
    parser.add_argument(
        '-a',
        dest='optArguments',
        help=
        'Path to optArguments.xml (optional, default optArguments_human.xml in Pipeline dir if found, otherwise required)',
        default="",
        type=str)
    parser.add_argument(
        '-r',
        help=
        'If this flag is used, alignmolvref arguments are used, otherwise alignmol arguments are used (default alignmol; optional)',
        dest='ref',
        action='store_true')
    parser.add_argument(
        '-o',
        dest='outputDir',
        help=
        'output dir (optional, defaults to sub-dir of input map dir called "alignmol")',
        default="",
        type=str)
    parser.add_argument(
        '-t',
        dest='RefAligner',
        help='Path to RefAligner or dir containing it (required)',
        type=str)
    parser.add_argument(
        '-T',
        dest='numThreads',
        help='Total number of threads (cores) to use (optional, default 4)',
        default=4,
        type=int)
    parser.add_argument(
        '-j',
        dest='maxthreads',
        help=
        'Threads per Job, -maxthreads (non-cluster only;optional, default 4)',
        default=4,
        type=int)
    parser.add_argument(
        '-e',
        dest='errFile',
        help=
        '.err file to use for noise parameters--will supersede noise parameters in the optArgument supplied (but that file must still be supplied for non-noise parameters)--should be from autoNoise',
        default="",
        type=str)
    parser.add_argument(
        '-E',
        dest='errbinFile',
        help=
        '.errbin file to use for noise parameters--will supersede noise parameters in the optArgument supplied (but that file must still be supplied for non-noise parameters)--should be from autoNoise',
        default="",
        type=str)
    parser.add_argument(
        '-p',
        dest='pipelineDir',
        help=
        'Pipeline dir (optional, defaults to script dir, or current directory)',
        default="",
        type=str)
    result = parser.parse_args()

    #Pipeline dir: -p, else the dir of this script, else the actual cwd
    if result.pipelineDir:
        cwd = result.pipelineDir
    else:
        cwd = os.path.split(
            os.path.realpath(__file__))[0]  #this is path of this script
        if not os.path.isfile(os.path.join(
                cwd,
                "utilities.py")):  #if still not here, last try is actual cwd
            cwd = os.getcwd()  #still check this below

    #check all Pipeline dependencies, importing each only once its file is found
    #this is the only one imported here and in runCharacterize
    if not os.path.isfile(os.path.join(cwd, "utilities.py")):
        print "ERROR: utilities.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import utilities as util

    if not os.path.isfile(os.path.join(cwd, "AlignModule.py")):
        print "ERROR: AlignModule.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import AlignModule as alignmod

    if not util.checkFile(os.path.join(cwd, "Pipeline.py")):
        print "ERROR: Pipeline.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import Pipeline

    if not util.checkFile(os.path.join(cwd, "mapClasses.py")):
        print "ERROR: mapClasses.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import mapClasses as mc

    #input: a dir means use existing results, a cmap means run the alignments
    if not result.queryDir:
        print "ERROR: Query (-q) argument not supplied."
        sys.exit(1)
    qrypath = os.path.realpath(result.queryDir)
    if util.checkDir(
            qrypath, checkWritable=False,
            makeIfNotExist=False):  #output elsewhere so not writeable is ok
        runaligns = False
    elif util.checkCmap(qrypath):
        runaligns = True
    else:
        print "ERROR: Query argument (" + qrypath + ") not found or not a dir or cmap. Check -q argument."
        sys.exit(1)

    #RefAligner -- check for either path to RefAligner, or dir containing it
    rabin = ""  #need empty string for generateJobList even though no jobs are run
    if runaligns:
        rabin = result.RefAligner
        #replicate Pipeline behavior: RefAligner is always required.
        #rabin is None if -t was not supplied: it must not reach os.path.isdir,
        # which raises on None instead of reporting the missing argument
        if rabin and os.path.isdir(rabin):
            rabin = os.path.join(rabin, "RefAligner")
        if not rabin or not util.checkExecutable(rabin):
            print "ERROR: RefAligner not found or not executable at", rabin, "\nPlease supply RefAligner dir or full path as -t arg."
            sys.exit(1)

    #optargs file -- only needed if running alignments
    optargs = None
    if runaligns and result.optArguments:  #supplied on command line
        optargs = result.optArguments
        if not util.checkFile(optargs, ".xml"):
            print "optArguments path is supplied (" + optargs + ") but not found or doesn't end in .xml, check -a argument."
            sys.exit(1)
    elif runaligns:  #load from Pipeline dir if running alignments
        optargs = os.path.join(cwd, "optArguments_human.xml")
        if not util.checkFile(optargs):
            print "optArguments_human.xml missing in Pipeline directory (" + cwd + "). Try supplying path explicitly using -a."
            sys.exit(1)

    #output dir
    #NOTE(review): when -q is a cmap file (runaligns), this default is a path
    # below that file, which presumably cannot be created--confirm whether -o
    # is effectively required when aligning (the -o help text mentions a
    # sub-dir called "alignmol" instead)
    if not result.outputDir:
        outdir = os.path.join(qrypath,
                              "merge")  #should be same as in AlignModule
    else:
        outdir = os.path.realpath(result.outputDir)
    if os.path.isdir(outdir):
        if not util.checkDir(outdir):  #check writeable
            print "\nERROR: Output dir is not writeable:\n", outdir, "\n"
            sys.exit(1)
        #output dir same as input dir is ok here
        print "\nWARNING: Output dir already exists, results will be overwritten:\n", outdir, "\n"
    elif not util.checkDir(
            outdir
    ):  #does not exist, make, if False, can't make or not writeable
        print "\nERROR: Output dir cannot be created or is not writeable:\n", outdir, "\n"
        sys.exit(1)

    #bnx file
    bnxfile = result.bnx
    if bnxfile:  #must check for empty string BEFORE you do realpath, or it returns cwd
        bnxfile = os.path.realpath(bnxfile)
        if not util.checkFile(bnxfile, ".bnx"):
            print "ERROR: bnx file supplied but not found or incorrect suffix:", bnxfile
            sys.exit(1)
    elif runaligns:
        print "ERROR: bnx file not supplied but running alignments; please supply bnx file as -b argument"
        sys.exit(1)

    #nthreads
    nthreads = result.numThreads
    if nthreads <= 0:
        print "ERROR: Number of threads value invalid (must be > 0): %i" % nthreads
        sys.exit(1)

    #maxthreads
    maxthreads = result.maxthreads
    if maxthreads <= 0:
        print "ERROR: Max threads value invalid (must be > 0): %i" % maxthreads
        sys.exit(1)
    elif nthreads < maxthreads:
        print "Warning: num threads (-T: %i) < max threads (-j: %i): increasing num threads to equal max threads\n" % (
            nthreads, maxthreads)
        nthreads = maxthreads

    #.errbin file
    errbinfile = result.errbinFile
    if errbinfile:
        errbinfile = os.path.realpath(result.errbinFile)
        if not util.checkFile(errbinfile, ".errbin"):
            print "ERROR: errbin file supplied but not found or incorrect suffix:", errbinfile
            sys.exit(1)

    #.err file -- ignored if an .errbin is also supplied
    errfile = result.errFile
    if errfile and errbinfile:
        print "Warning: .err and .errbin arguments supplied; ignoring .err file"
        errfile = ""
    elif errfile:
        errfile = os.path.realpath(result.errFile)
        if not util.checkFile(errfile, ".err"):
            print "err file supplied but not found or incorrect suffix:", errfile
            sys.exit(1)

    #SampleCharModule is only needed to read the .err file
    if errfile and not util.checkFile(os.path.join(cwd,
                                                   "SampleCharModule.py")):
        print "SampleCharModule.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    elif errfile:
        import SampleCharModule as scm

    doref = result.ref

    #DONE checking arguments

    print "Using output dir", outdir
    if runaligns:
        print "Aligning", bnxfile, "\nTo", qrypath, "\n"
    else:
        print "Merging", qrypath, "\n"

    memory_log = os.path.join(outdir, "memory_log.txt")
    util.initMemoryLog(memory_log)

    varsP = Pipeline.varsPipeline()
    varsP.RefAlignerBin = rabin
    varsP.contigFolder = ""  #not used but needs to be an attr
    varsP.outputContigFolder = ""  #not used but needs to be a string attr
    varsP.pipeReportFile = os.path.join(outdir, "alignmol_jobs_log.txt")
    varsP.infoReportFile = os.path.join(outdir, "alignmol_log.txt")
    util.InitStatus(os.path.join(outdir, "status.xml"))

    if runaligns:
        varsP.optArgumentsFileIn = optargs
        varsP.latestMergedCmap = qrypath  #if !doref, need this one
        varsP.ref = qrypath  #and if doref, need this one
        varsP.nThreads = nthreads  #necessary otherwise job won't start -- max threads per node
        varsP.maxthreads = maxthreads  #threads per job
        p = os.path.split(qrypath)[1]
        varsP.outputContigPrefix = p[:p.rfind(".")]  #filename prefix
        varsP.stdoutlog = True  #use -stdout -stderr
        varsP.sorted_file = bnxfile[:bnxfile.rfind(
            ".")]  #enables the mol fraction align in AlignModule.getAlignStats
        if qrypath.endswith(".cmap"):  #enable the mol stats
            varsP.totAssemblyLenMb = mc.multiCmap(
                qrypath, lengthonly=True).totalLength / 1e6

        varsP.memoryLogpath = memory_log
        varsP.parseArguments()  #parses optArgumentsFile
        varsP.checkDependencies()
        varsP.RefAlignerBinOrig = rabin
        varsP.prerunLog(
        )  #general information in log -- needed for refaligner_version

        noisep = {}
        if errbinfile:
            noisep = {"readparameters": errbinfile}
        elif errfile:
            noisep = scm.readNoiseParameters(errfile.replace(".err", ""))
            if 'readparameters' in noisep:  #remove this because it's redundant, and it can cause problems with RefAligner compatibility
                del noisep['readparameters']
            if not noisep:  #readNoiseParameters returns empty dict on failure
                print "ERROR reading noise parameters, check .err file:", errfile
                sys.exit(1)
            #redundant with below?
            print "Using noise parameters from " + errfile + ":\n" + " ".join(
                ["-" + str(k) + " " + str(v)
                 for k, v in noisep.items()]) + "\n"

        #some code from SampleCharModule to load args into noise0
        infoReport = "Loaded noise parameters:\n"
        klist = [
            "FP", "FN", "sf", "sd", "sr", "bpp", "readparameters"
        ]  #hardcoding parameters is kind of bad, but it fixes the order without using OrderedDict.
        for v in klist:
            if v not in noisep:
                continue
            param = str(noisep[v])
            util.LogStatus("parameter", "auto_" + v, param)
            infoReport += v + ":" + param + "\n"
            varsP.replaceParam("noise0", "-" + v, param)
        varsP.updateInfoReport(infoReport + '\n', printalso=True)

    else:
        print "Getting file list from", qrypath
        outFileList = getOutFileList(util, qrypath)
        if not outFileList:
            print "ERROR: Query dir (" + qrypath + ") does not contain alignmol data. Check -q argument."
            sys.exit(1)
        else:
            print "Found", len(outFileList), "alignment results"
    #end if runaligns

    amod = alignmod.AlignModule(
        varsP, doref, outdir, bnxfile)  #constructor will call generateJobList

    if runaligns:
        amod.runJobs()
        amod.checkResults()
    else:
        amod.outFileList = outFileList
        p = os.path.split(outFileList[0])[1]
        if p.count("_") > 1:  #expect something like "EXP_REFINEFINAL1_4"
            p = p[:p.rfind("_")]  #remove integer suffix (and underscore)
        varsP.outputContigPrefix = p

    if not runaligns or len(amod.jobList) > 0:
        amod.getAlignStats()

    if runaligns:
        print
        #copy from Pipeline.py
        if util.SummarizeErrors(varsP=varsP) == 0:
            varsP.updatePipeReport("Pipeline has successfully completed\n")
            util.LogStatus("progress", "pipeline", "success")
        else:
            varsP.updatePipeReport("Pipeline has completed with errors\n")
            util.LogStatus("progress", "pipeline", "failure")
Example #15
0
def getArgs():
    parser = argparse.ArgumentParser(description=description)

    parser.add_argument(
        '-t',
        dest='RefAligner',
        help='Path to RefAligner or dir containing it (required)',
        type=str)
    parser.add_argument(
        '-r',
        dest='referenceMap',
        help='Path to reference maps (.cmap), 1 file only (required)',
        type=str)
    parser.add_argument(
        '-q',
        dest='queryDir',
        help='Path to dir containing query maps (.cmaps) (required)',
        type=str)
    #parser.add_argument('-x', dest='xmap', help='Path to .xmap, 1 file only (optional, if specified, no alignment is done, if not specified, -t, -r, and -q must be specified)') #not supported
    parser.add_argument(
        '-o',
        dest='outputDir',
        help=
        'output dir (optional, defaults to input map dir with suffix "_sv")',
        default="",
        type=str)
    parser.add_argument(
        '-p',
        dest='pipelineDir',
        help=
        'Pipeline dir (optional, defaults to script dir, or current directory)',
        default="",
        type=str)
    parser.add_argument(
        '-a',
        dest='optArguments',
        help=
        'Path to optArguments.xml (optional, default optArguments_human.xml in Pipeline dir if found, otherwise required)',
        default="",
        type=str)
    parser.add_argument(
        '-T',
        dest='numThreads',
        help='Total number of threads (cores) to use (optional, default 4)',
        default=4,
        type=int)
    parser.add_argument(
        '-j',
        dest='maxthreads',
        help=
        'Threads per Job, -maxthreads (non-cluster only;optional, default 4)',
        default=4,
        type=int)
    parser.add_argument(
        '-b',
        dest='bedFile',
        help=
        '.bed file with gaps in reference for flagging SVs which overlap N-base gaps (optional)',
        default="",
        type=str)
    parser.add_argument(
        '-e',
        dest='errFile',
        help=
        '.err file to use for noise parameters--will supersede noise parameters in the optArgument supplied (but that file must still be supplied for non-noise parameters)',
        default="",
        type=str)
    parser.add_argument(
        '-E',
        dest='errbinFile',
        help=
        '.errbin file to use for noise parameters--will supersede noise parameters in the optArgument supplied (but that file must still be supplied for non-noise parameters)',
        default="",
        type=str)
    parser.add_argument(
        '-C',
        help=
        'Run on cluster, read XML file for submission arguments (optional--will not use cluster submission if absent)',
        dest='cxml',
        default=None)
    parser.add_argument(
        '-s',
        help=
        'SV jobs configuration: 0 = single job (required for correct haplotype calls), 1 = single job per contig (not recommended), 2 = grouped (default 0; optional)',
        dest='groupsv',
        type=int,
        default=0)
    #parser.add_argument('-s', help='Disable grouping of SV jobs (default grouped; optional)', dest='groupsv', action='store_false') #old one
    result = parser.parse_args()

    #check all Pipeline dependencies
    if result.pipelineDir:
        cwd = result.pipelineDir
    else:
        cwd = os.path.split(
            os.path.realpath(__file__))[0]  #this is path of this script
        if not os.path.isfile(os.path.join(
                cwd,
                "utilities.py")):  #if still not here, last try is actual cwd
            cwd = os.getcwd()  #still check this below

    #this is the only one imported here and in runCharacterize
    if not os.path.isfile(os.path.join(cwd, "utilities.py")):
        print "utilities.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import utilities as util

    #xmap -- don't use this
    runaligns = True  #default is to run the alignment
    xmappath = None
    #if result.xmap :
    #    xmappath = result.xmap
    #    if not util.checkFile(xmappath, ".xmap") :
    #        print "Xmap path is supplied ("+xmappath+") but not found or doesn't end in .xmap."
    #        sys.exit(1)
    #    runaligns = False

    #RefAligner -- check for either path to RefAligner, or dir containing it, depending on cluster args
    rabin = result.RefAligner
    #replicate Pipeline behavior: RefAligner is always required
    if os.path.isdir(rabin):
        rabin = os.path.join(rabin, "RefAligner")
    if not util.checkExecutable(rabin):
        print "RefAligner not found or not executable at", rabin, "\nPlease supply RefAligner dir or full path as -t arg."
        sys.exit(1)

    #reference maps -- only required if xmap not specified
    refcmap = os.path.realpath(result.referenceMap)
    if runaligns and not util.checkFile(
            refcmap, ".cmap"):  #and not util.checkFile(refcmap, ".spots") :
        print "Reference map file (" + refcmap + ") not found or does not end in .cmap or .spots. Check -r argument."
        sys.exit(1)

    #query maps
    qrypath = os.path.realpath(result.queryDir)
    #if runaligns and not util.checkFile(qrypath, ".cmap") :
    #    print "Query map file ("+qrypath+") not found or does not end in .cmap or .spots. Check -q argument."
    #    sys.exit(1)
    if not util.checkDir(qrypath, checkWritable=False,
                         makeIfNotExist=False):  #does NOT have to be writeable
        print "Query dir (" + qrypath + ") not found or not a dir. Check -q argument."
        sys.exit(1)
    if runaligns:
        contigdir = qrypath  #os.path.split(qrypath)[0] #dir of query maps
        contigbase = os.path.split(qrypath)[1]  #filename
    else:
        contigdir = os.path.split(xmappath)[0]
        contigbase = os.path.split(xmappath)[1]  #filename
    #contigbase = contigbase[:contigbase.find(".")] #remove suffix

    #optargs file
    optargs = None
    if result.optArguments:  #supplied on command line
        optargs = result.optArguments
        if not util.checkFile(optargs, ".xml"):
            print "optArguments path is supplied (" + optargs + ") but not found or doesn't end in .xml, check -a argument."
            sys.exit(1)
    elif runaligns:  #load from Pipeline dir if running alignments
        optargs = os.path.join(cwd, "optArguments_human.xml")
        if not util.checkFile(optargs):
            print "optArguments.xml missing in Pipeline directory (" + cwd + "). Try supplying path explicitly using -a."
            sys.exit(1)

    #cluster args
    clustargs = None
    if result.cxml:
        clustargs = os.path.realpath(result.cxml)
        if not util.checkFile(clustargs, ".xml"):
            print "clusterArguments path is supplied (" + clustargs + ") but not found or doesn't end in .xml, check -C argument."
            sys.exit(1)

    #nthreads
    nthreads = result.numThreads
    if nthreads <= 0:
        print "Number of threads value invalid (must be > 0): %i" % nthreads
        sys.exit(1)

    #maxthreads
    maxthreads = result.maxthreads
    if maxthreads <= 0:
        print "Max threads value invalid (must be > 0): %i" % maxthreads
        sys.exit(1)

    #bed file
    bedfile = result.bedFile  #must make local for return statement below
    if bedfile:  #must check for empty string BEFORE you do realpath, or it returns cwd
        bedfile = os.path.realpath(result.bedFile)
        if not util.checkFile(bedfile, ".bed"):
            print "bed file supplied but not found or incorrect suffix:", bedfile
            sys.exit(1)

    #.errbin file
    errbinfile = result.errbinFile
    if errbinfile:
        errbinfile = os.path.realpath(result.errbinFile)
        if not util.checkFile(errbinfile, ".errbin"):
            print "errbin file supplied but not found or incorrect suffix:", errbinfile
            sys.exit(1)

    #.err file
    errfile = result.errFile
    if errfile and errbinfile:
        print "Warning: .err and .errbin arguments supplied; ignoring .err file"
        errfile = ""
    elif errfile:
        errfile = os.path.realpath(result.errFile)
        if not util.checkFile(errfile, ".err"):
            print "err file supplied but not found or incorrect suffix:", errfile
            sys.exit(1)

    outdir = os.path.realpath(result.outputDir)

    groupsv = result.groupsv
    if groupsv < 0 or groupsv > 2:
        print 'ERROR: -s (grouped SV) must be 0, 1, or 2\n'
        sys.exit(1)

    #yes, this is messy...but I don't want another class (besides varsPipeline) and they just go to runCharacterize
    return cwd, rabin, refcmap, contigdir, contigbase, runaligns, xmappath, optargs, nthreads, maxthreads, bedfile, errfile, outdir, errbinfile, clustargs, groupsv
Example #16
0
def runSV(cwd, rabin, refcmap, contigdir, contigbase, runaligns, xmappath,
          optargs, nthreads, maxthreads, bedfile, errfile, outdir, errbinfile,
          clustargs, groupsv):
    '''Load Pipeline files from first arg; configure CharacterizeModule; run alignments if runaligns;
    report on those alignments or the xmap provided as xmappath.
    '''

    printargs = True

    if not os.path.isfile(os.path.join(cwd, "utilities.py")):
        print "utilities.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import utilities as util

    if not util.checkFile(os.path.join(cwd, "Pipeline.py")):
        print "Pipeline.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import Pipeline

    if not util.checkFile(os.path.join(cwd, "SVModule.py")):
        print "SVModule.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import SVModule as svm

    if errfile and not util.checkFile(os.path.join(cwd,
                                                   "SampleCharModule.py")):
        print "SampleCharModule.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    elif errfile:
        import SampleCharModule as scm

    #use Pipeline objects

    varsP = Pipeline.varsPipeline()

    varsP.optArgumentsFileIn = optargs
    varsP.RefAlignerBin = rabin
    varsP.latestMergedCmap = os.path.join(
        contigdir, contigbase + ".cmap")  #file suffix required to be .cmap
    varsP.contigFolder = os.path.split(contigdir)[0]
    varsP.nThreads = nthreads  #necessary otherwise job won't start -- max threads per node
    varsP.maxthreads = maxthreads  #threads per job
    varsP.ref = refcmap
    varsP.stdoutlog = True  #enable -stdout -stderr args to RefAligner
    varsP.curCharacterizeCmaps = [varsP.latestMergedCmap]
    varsP.contigSubDirectories = True  #needed for prepareContigIO
    varsP.doAlignMolvRef = False  #do not look for copy number
    varsP.groupSV = groupsv  #mimic Pipeline behavior: group or not

    if runaligns:
        #varsP.contigAlignTarget = outdir
        varsP.runSV = False
        varsP.groupContigs = False
        varsP.stdoutlog = True  #use -stdout -stderr
        varsP.stageComplete = contigbase
        varsP.outputContigPrefix = getContigPrefix(
            util, contigdir
        )  #if outdir is not supplied, this is used as dir prefix; also used as file pref for -o arg
        varsP.outputContigFolder = contigdir  #cmaps are copied from here

        if not outdir:
            outdir = contigdir + "_sv"  #this will be outdir of sv jobs
        if os.path.isdir(outdir):
            if not util.checkDir(outdir):  #check writeable
                print "\nERROR: Output dir is not writeable:\n", outdir, "\n"
                sys.exit(1)
            elif outdir == contigdir:
                print "\nERROR: Output dir cannot be same as input dir:\n", outdir, "\n"
                sys.exit(1)
            print "\nWARNING: Output dir already exists, results will be overwritten:\n", outdir, "\n"
        elif not util.checkDir(
                outdir
        ):  #does not exist, make, if False, can't make or not writeable
            print "\nERROR: Output dir cannot be created or is not writeable:\n", outdir, "\n"
            sys.exit(1)

        if clustargs:
            #os.putenv('SGE_ROOT', '/var/lib/gridengine') #do I want this??? NO! It could very well be wrong.
            varsP.onCluster = True
            varsP.checkCluster()  #call varsPipeline method to check SGE_ROOT
            #note: before, above default is wrong. Now, there is no default--user is required to set environment variable; but this is consistent with the Pipeline
            varsP.clusterLogDir = os.path.join(outdir, 'ClusterLogs')
            util.checkDir(varsP.clusterLogDir)  #make it
            varsP.checkCluster()
            varsP.clusterArgumentsFileIn = clustargs  #required for parseArguments
            varsP.parseArguments(readingClusterFile=True)
            if varsP.error:
                print varsP.message
                sys.exit(1)
            varsP.RefAlignerBin += "${BINARY_SUFFIX:=}"  #copy from varsPipeline, handled by external script on phi host

        varsP.pipeReportFile = os.path.join(outdir, "sv_jobs_log.txt")
        varsP.infoReportFile = os.path.join(outdir, "sv_log.txt")
        varsP.memoryLogpath = os.path.join(outdir, "memory_log.txt")
        if bedfile:
            varsP.bedFile = bedfile
        util.InitStatus(os.path.join(outdir, "status.xml"))
        varsP.parseArguments()  #parses optArgumentsFile
        varsP.checkDependencies()
        varsP.RefAlignerBinOrig = rabin
        varsP.prerunLog(
        )  #general information in log -- needed for refaligner_version
        if printargs:
            print "\nRunning SV detection with arguments (" + os.path.split(
                optargs)[1] + "):\n" + " ".join(
                    varsP.argsListed('svdetect')) + '\n'

        noisep = {}
        if errbinfile:
            noisep = {"readparameters": errbinfile}
            print "Using noise parameters from " + errbinfile + "\n"
        elif errfile:
            noisep = scm.readNoiseParameters(errfile.replace(".err", ""))
            if noisep.has_key(
                    'readparameters'
            ):  #remove this because it's redundant, and it can cause problems with RefAligner compatibility
                del noisep['readparameters']
            if not noisep:  #readNoiseParameters returns empty dict on failure
                print "ERROR reading noise parameters, check .err file:", errfile
                sys.exit(1)
            print "Using noise parameters from " + errfile + ":\n" + " ".join(
                ["-" + str(k) + " " + str(v)
                 for k, v in noisep.iteritems()]) + "\n"

        varsP.outputContigFolder = contigdir  #cmaps are copied from here

        #make merged cmap to replace merged _q.cmap if not produced by RefAligner
        cmaps = util.getListOfFilesFromDir(varsP.outputContigFolder,
                                           suffix=".cmap")
        if len(cmaps) > 1:
            varsP.contigPathTxtFile = os.path.join(
                outdir,
                "contig_list.txt")  #mergeIntoSingleCmap creates this file
            print "Creating merged cmap"
            varsP.mergeIntoSingleCmap(outdir)
            print "Merged cmap created:", varsP.latestMergedCmap, "\n"
            if varsP.groupSV == 0:  #if it is a single job, use merged map just created
                varsP.outputContigFolder = outdir  #input == output
                #print "varsP.outputContigFolder =", varsP.outputContigFolder #debug
        elif len(cmaps) == 1:
            varsP.latestMergedCmap = cmaps[0]
        else:  #this is already checked in getContigPrefix (redundant)
            print "No cmaps found in input dir; check dir %s\n" % varsP.outputContigFolder
            sys.exit(1)

        svmodule = svm.SVdetect(varsP, noisep, outdir, skipderes=True)
        #this got duplicated above
        #if hasattr(util, "InitStatus") : #if old version, skip -- do this after SVdetect.__init__ bc makes outdir
        #    util.InitStatus(os.path.join(outdir, "status.xml")) #needed otherwise call to status_log fails
        svmodule.runJobs()
        svmodule.checkResults()
        util.SummarizeErrors(varsP)

    else:
        varsP.contigAlignTarget = contigdir  #this is dir in which _q and _r cmaps must be located
        print "ERROR: feature not supported"  #not implemented to not run jobs
Example #17
0
def mergeRcmaps(outFileList,
                outdir,
                varsP=None,
                splitByContig=None,
                stageName=""):
    """Merge a set of _r.cmap files and write the result to outdir.

    outFileList -- list of file prefixes (to which "_r.cmap" is appended) or
        of full paths already ending in "_r.cmap". NOTE: the list is sorted
        in place, so the caller's list is reordered.
    outdir -- output dir; created by util.checkDir if necessary.
    varsP -- optional; messages go to logOrPrintError with it, and it
        supplies alignMolvrefMergeName (default "merge" without varsP) and,
        when stageName is empty, outputContigPrefix.
    splitByContig -- if < 1 (or None, the default), write each contig to its
        own file; if == 1, write only the single merged cmap; if > 1, do both.
    stageName -- output file prefix; always used if supplied. If it is empty,
        varsP must be supplied, otherwise the prefix is empty.

    Files that are missing are reported and skipped. Returns None.
    """

    if not util.checkDir(outdir):
        err_msg = "Warning in AlignModule.mergeRcmaps: could not make outdir %s, skipping copy number" % outdir
        logOrPrintError(err_msg, varsP)
        return

    if not outFileList:  #just an argument check--check for presence on disk is below
        err_msg = "Warning in AlignModule.mergeRcmaps: no maps supplied"
        logOrPrintError(err_msg, varsP)
        return

    #for reproducibility with runAlignMerge.py (different order when listing dir)
    outFileList.sort()
    rsuf = "_r.cmap"
    #even though all of outFileList should be on disk, a job may have
    #failed--check existence of each one
    present = []
    for outf in outFileList:
        target = (outf if outf.endswith(rsuf) else outf + rsuf)  #support either form
        if not util.checkFile(target):
            err_msg = "Warning in AlignModule.mergeRcmaps: missing _r.cmap %s" % target
            logOrPrintError(err_msg, varsP)
        else:
            present.append(target)
    if not present:  #no _r.cmaps found on disk
        err_msg = "Warning in AlignModule.mergeRcmaps: no _r.cmaps found, skipping copy number"
        logOrPrintError(err_msg, varsP)
        return

    mrgstr = (varsP.alignMolvrefMergeName if varsP else "merge")

    mergedmap = mc.multiCmap(present[0])  #open first map, edit in memory
    for rmap in present[1:]:  #don't add map 0 to itself
        #addCovOcc returns True on failure: warn and continue
        if mergedmap.addCovOcc(mc.multiCmap(rmap)):
            err_msg = "Warning in AlignModule.mergeRcmaps: addCovOcc call failed for map %s" % rmap
            logOrPrintError(err_msg, varsP)

    #now it's merged, but the resulting map needs to be written back to disk
    filepref = (
        varsP.outputContigPrefix if varsP and stageName == "" else stageName
    )  #see split_XMapQcmap_byContig

    #The default None means "split only". Make that explicit rather than
    #relying on None comparing smaller than any int.
    if splitByContig is None:
        splitByContig = 0

    #collect one report per output written, so that neither is lost when
    #both are produced (splitByContig > 1)
    reports = []
    if splitByContig < 1 or splitByContig > 1:
        mergedmap.writeAllMapsToDisk(os.path.join(outdir,
                                                  filepref + '_contig'),
                                     outsuf="_r")
        reports.append("mergeRcmaps: wrote %i cmaps" %
                       len(mergedmap.cmapdict))
    if splitByContig > 0:
        mergedmap.writeToFile(
            os.path.join(outdir, filepref + "_" + mrgstr + rsuf))
        reports.append("mergeRcmaps: wrote merged cmap with %i contigs" %
                       len(mergedmap.cmapdict))
    for report in reports:
        logOrPrintError(report, varsP, warn=False)
Example #18
0
def split_Qcmap_byContig_new(inFileList,
                             mergepath,
                             xmapDict,
                             varsP=None,
                             stageName=""):
    """Distribute the molecules of a set of _q.cmap files into one _q.cmap per contig.

    inFileList -- file prefixes; "_q.cmap" is appended to each. Missing
        files are reported and skipped.
    mergepath -- dir in which the per-contig files are written.
    xmapDict -- dict of contig id : list of ids of the molecules aligned to
        that contig.
    varsP -- optional; passed to logOrPrintError, and supplies
        outputContigPrefix as file prefix when stageName is empty.
    stageName -- file prefix; always used if not empty.

    One file <prefix>_contig<id>_q.cmap is created per key of xmapDict,
    starting with the leading '#' lines of the first input _q.cmap. Each
    molecule (a run of consecutive lines sharing the id in the first column)
    is appended to the file of every contig listed for it. Molecules which
    are in no contig of xmapDict are skipped and reported in a warning.
    Returns None.
    """
    qsuf = "_q.cmap"
    qcmapFilelist = []
    #these are file prefixes--sort to ensure reproducibility
    for outpath in sorted(inFileList):
        if util.checkFile(outpath + qsuf):
            qcmapFilelist.append(outpath + qsuf)
        else:
            err_msg = "Warning in AlignModule.split_XMapQcmap_byContig: missing _q.cmap: " + outpath + "_q.cmap"
            logOrPrintError(err_msg, varsP)

    if not qcmapFilelist:  #nothing to merge
        err_msg = "Error in AlignModule.split_XMapQcmap_byContig: no _q.cmaps found"
        logOrPrintError(err_msg, varsP)
        return

    header = ""  #leading comment lines of first qcmap
    with open(qcmapFilelist[0]) as f1:
        for line in f1:
            if line[0] != '#':
                break
            header += line

    filepref = (
        varsP.outputContigPrefix if varsP and stageName == "" else stageName
    )  #same as line in mergeRcmaps

    def contigQcmapPath(cid):
        """Path of the output _q.cmap of contig cid."""
        return os.path.join(mergepath,
                            filepref + '_contig' + str(cid) + '_q.cmap')

    #create all output files, header only
    for contigid in xmapDict:
        with open(contigQcmapPath(contigid), "w") as fout:
            fout.write(header)

    #invert xmapDict (contig id : mol ids) to molDict (mol id : contig ids)
    #so that the contigs of a molecule are a single lookup below
    molDict = {}
    for cid, molids in xmapDict.items():
        for molid in molids:
            molDict.setdefault(molid, []).append(cid)

    def writeMol(molid, mollines):
        """Append a molecule to the _q.cmap of each of its contigs.
        Return False if the molecule is in no contig."""
        cids = molDict.get(molid)
        if not cids:
            return False
        molstr = "".join(mollines)
        for cid in cids:
            #open per write: there may be too many contigs to keep all
            #output files open at once
            with open(contigQcmapPath(cid), "a") as fout:
                fout.write(molstr)
        return True

    #read input files, find all contigs to which each molecule aligns, write to that qcmap
    nmol = 0
    nskipped = 0
    for qcmap in qcmapFilelist:
        previd = None  #id of current molecule; None until first data line of this file
        mollines = []  #all the lines in the _q.cmap for this molecule
        with open(qcmap) as f1:
            for line in f1:
                if line[0] == '#':
                    continue
                fields = line.split()
                if not fields:  #blank line
                    continue
                molid = int(fields[0])  #int because xmapDict mol ids are compared as ints
                if molid == previd:  #more data for this mol
                    mollines.append(line)
                    continue
                #new molecule: write the previous one, if any
                if previd is not None and not writeMol(previd, mollines):
                    nskipped += 1
                mollines = [line]
                previd = molid
                nmol += 1
        #last molecule of this file; nothing to do if the file had no data
        if previd is not None and not writeMol(previd, mollines):
            nskipped += 1

    if nskipped:
        logOrPrintError(
            "Warning in AlignModule.split_XMapQcmap_byContig: skipped %i molecules not assigned to any contig"
            % nskipped, varsP)
    logOrPrintError(
        "split_XMapQcmap_byContig: wrote %i _q.cmaps with %i molecules" %
        (len(xmapDict), nmol),
        varsP,
        warn=False)
def split_Qcmap_byContig_new(inFileList, mergepath, xmapDict, varsP=None, stageName="") :
    """Distribute the molecules of a set of _q.cmap files into one _q.cmap per contig.

    inFileList -- file prefixes; "_q.cmap" is appended to each. Missing
        files are reported and skipped.
    mergepath -- dir in which the per-contig files are written.
    xmapDict -- dict of contig id : list of ids of the molecules aligned to
        that contig.
    varsP -- optional; passed to logOrPrintError, and supplies
        outputContigPrefix as file prefix when stageName is empty.
    stageName -- file prefix; always used if not empty.

    One file <prefix>_contig<id>_q.cmap is created per key of xmapDict,
    starting with the leading '#' lines of the first input _q.cmap. Each
    molecule (a run of consecutive lines sharing the id in the first column)
    is appended to the file of every contig listed for it. Molecules which
    are in no contig of xmapDict are skipped and reported in a warning.
    Returns None.
    """
    qsuf = "_q.cmap"
    qcmapFilelist = []
    for outpath in sorted(inFileList) : #these are file prefixes--sort to ensure reproducibility
        if util.checkFile(outpath+qsuf) :
            qcmapFilelist.append(outpath+qsuf)
        else :
            err_msg = "Warning in AlignModule.split_XMapQcmap_byContig: missing _q.cmap: "+outpath+"_q.cmap"
            logOrPrintError(err_msg, varsP)

    if not qcmapFilelist : #nothing to merge
        err_msg = "Error in AlignModule.split_XMapQcmap_byContig: no _q.cmaps found"
        logOrPrintError(err_msg, varsP)
        return

    header = "" #leading comment lines of first qcmap
    with open(qcmapFilelist[0]) as f1 :
        for line in f1 :
            if line[0] != '#' :
                break
            header += line

    filepref = (varsP.outputContigPrefix if varsP and stageName == "" else stageName) #same as line in mergeRcmaps

    def contigQcmapPath(cid) :
        """Path of the output _q.cmap of contig cid."""
        return os.path.join(mergepath, filepref+'_contig'+str(cid)+'_q.cmap')

    #create all output files, header only
    for contigid in xmapDict :
        with open(contigQcmapPath(contigid), "w") as fout :
            fout.write(header)

    #invert xmapDict (contig id : mol ids) to molDict (mol id : contig ids)
    #so that the contigs of a molecule are a single lookup below
    molDict = {}
    for cid,molids in xmapDict.items() :
        for molid in molids :
            molDict.setdefault(molid, []).append(cid)

    def writeMol(molid, mollines) :
        """Append a molecule to the _q.cmap of each of its contigs.
        Return False if the molecule is in no contig."""
        cids = molDict.get(molid)
        if not cids :
            return False
        molstr = "".join(mollines)
        for cid in cids :
            #open per write: there may be too many contigs to keep all
            #output files open at once
            with open(contigQcmapPath(cid), "a") as fout :
                fout.write(molstr)
        return True

    #read input files, find all contigs to which each molecule aligns, write to that qcmap
    nmol = 0
    nskipped = 0
    for qcmap in qcmapFilelist :
        previd = None #id of current molecule; None until first data line of this file
        mollines = [] #all the lines in the _q.cmap for this molecule
        with open(qcmap) as f1 :
            for line in f1 :
                if line[0] == '#' :
                    continue
                fields = line.split()
                if not fields : #blank line
                    continue
                molid = int(fields[0]) #int because xmapDict mol ids are compared as ints
                if molid == previd : #more data for this mol
                    mollines.append(line)
                    continue
                #new molecule: write the previous one, if any
                if previd is not None and not writeMol(previd, mollines) :
                    nskipped += 1
                mollines = [line]
                previd = molid
                nmol += 1
        #last molecule of this file; nothing to do if the file had no data
        if previd is not None and not writeMol(previd, mollines) :
            nskipped += 1

    if nskipped :
        logOrPrintError("Warning in AlignModule.split_XMapQcmap_byContig: skipped %i molecules not assigned to any contig" % nskipped, varsP)
    logOrPrintError("split_XMapQcmap_byContig: wrote %i _q.cmaps with %i molecules" % (len(xmapDict), nmol), varsP, warn=False)
def getArgs() :    
    parser = argparse.ArgumentParser(description=description)

    parser.add_argument('-t', dest='RefAligner', help='Path to RefAligner (required unless xmap is specified (-x))')
    parser.add_argument('-r', dest='referenceMap', help='Path to reference maps (.cmap or .spots), 1 file only (required unless xmap specified (-x) and _r.cmap is present in same dir as xmap)', default="")
    parser.add_argument('-q', dest='queryMap', help='Path to query maps (.cmap), 1 file only (required--if xmap specified (-x), this should be input (-i argument) for that command)', default="")
    parser.add_argument('-x', dest='xmap', help='Path to .xmap, 1 file only (optional, if specified, no alignment is done, if not specified, -t, -r, and -q must be specified)')
    parser.add_argument('-p', dest='pipelineDir', help='Pipeline dir (optional, defaults to current directory)')
    parser.add_argument('-a', dest='optArguments', help='Path to optArguments.xml (optional, default in Pipeline dir if found, otherwise required)')
    parser.add_argument('-n', dest='numThreads', help='Number of threads (cores) to use (optional, default 4)', default=4, type=int)
    result = parser.parse_args()

    #check all Pipeline dependencies
    if result.pipelineDir :
        cwd = result.pipelineDir
    else :
        cwd = os.getcwd()

    #this is the only one imported here and in runCharacterize
    if not os.path.isfile(os.path.join(cwd,"utilities.py")):
        print "utilities.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import utilities as util

    #xmap -- optional
    runaligns = True #default is to run the alignment
    xmappath = None
    if result.xmap :
        xmappath = result.xmap
        if not util.checkFile(xmappath, ".xmap") :
            print "Xmap path is supplied ("+xmappath+") but not found or doesn't end in .xmap."
            sys.exit(1)
        runaligns = False

    #RefAligner -- only required if xmap not specified
    rabin = result.RefAligner
    if not xmappath and not util.checkExecutable(rabin):
        print "RefAligner not found at", rabin, "\nPlease supply RefAligner full path as -t arg."
        sys.exit(1)

    #reference maps -- only required if xmap not specified
    refcmap = result.referenceMap
    if runaligns and not util.checkFile(refcmap, ".cmap") and not util.checkFile(refcmap, ".spots") :
        print "Reference map file ("+refcmap+") not found or does not end in .cmap or .spots. Check -r argument."
        sys.exit(1)

    #query maps -- only required if xmap not specified
    qrypath = result.queryMap
    #if runaligns and not util.checkFile(qrypath, ".cmap") :
    if not util.checkFile(qrypath, ".cmap") : #always required
        print "Query map file ("+qrypath+") not found or does not end in .cmap or .spots. Check -q argument."
        sys.exit(1)
    #if runaligns :
    contigdir  = os.path.split(qrypath)[0] #dir of query maps
    contigbase = os.path.split(qrypath)[1] #filename
    #else :
    #    contigdir  = os.path.split(xmappath)[0]
    #    contigbase = os.path.split(xmappath)[1] #filename
    contigbase = contigbase[:contigbase.find(".")] #remove suffix

    #optargs file
    optargs = None
    if result.optArguments : #supplied on command line
        optargs = result.optArguments
        if not util.checkFile(optargs, ".xml") :
            print "optArguments path is supplied ("+optargs+") but not found or doesn't end in .xml, check -a argument."
            sys.exit(1)
    elif runaligns : #load from Pipeline dir if running alignments
        optafile = "optArguments_human.xml"
        optargs = os.path.join(cwd, optafile)
        if not util.checkFile(optargs):
            print "%s missing in Pipeline directory (%s). Try supplying path explicitly using -a." % (optafile, cwd)
            sys.exit(1)

    #nthreads
    nthreads = result.numThreads
    if nthreads <= 0 :
        print "Number of threads value invalid (must be >= 0): "+nthreads
        sys.exit(1)

    #yes, this is messy...but I don't want another class (besides varsPipeline) and they just go to runCharacterize
    return cwd, rabin, refcmap, contigdir, contigbase, runaligns, xmappath, optargs, nthreads
Example #21
0
def getAlignStats(varsP,
                  outFileList,
                  reflen=0,
                  isref=False,
                  mergepath="",
                  bnxpath=None):
    '''Standalone fn for alignment statistics for both AlignModule and AlignRefModule.

    Reports molecule stats of a bnx through varsP.updateInfoReport, then alignment
    stats summed over the .xmap files and noise parameters averaged over the .err
    files named by outFileList. If mergepath is supplied and at least one .err was
    found, the last alignParams object loaded is overwritten with the averaged
    (nmaps/goodmaps: summed) values and written to mergepath as a merged .err.

    varsP       -- pipeline variables object; this fn uses sorted_file, argData,
                   argsListed, alignMolvrefMergeName, updatePipeReport and
                   updateInfoReport
    outFileList -- alignment output file prefixes; ".xmap" and ".err" are appended
    reflen      -- reference length, should be in Mb; coverage lines are only
                   reported if it is non-zero
    isref       -- True if molecules were aligned to the reference rather than to
                   the assembly (only changes labels in the report)
    mergepath   -- dir in which to put the merged .err; if empty, none is written
    bnxpath     -- if None, assume varsP.sorted_file (plus ".bnx"); otherwise, just
                   report stats of this file and ignore outFileList

    Returns None.
    '''

    def mean(values):
        #average of a list; 0 if the list is empty (ie, no .err files were found)
        return sum(values) / len(values) if values else 0

    statonly = bnxpath is not None  #bnx stats only (caller supplied the bnx)
    skipbnx = False  #.err file processing only
    if not statonly:
        if not varsP.sorted_file:  #for runAlignMol, this is empty: nothing to do in this case
            skipbnx = True
        else:
            bnxpath = varsP.sorted_file + ".bnx"  #set in PairwiseModule.sort_BNX even if bypassed, but needs suffix
    if not skipbnx and not util.checkFile(bnxpath):
        varsP.updatePipeReport(
            "Warning in AlignModule.getAlignStats: bnxpath supplied but not found: %s\n"
            % bnxpath)
        return

    #find the minlen used for bnx_sort, which is a required arg set
    sortargs = []
    if 'bnx_sort' in varsP.argData:  #for runAlignMol.py
        sortargs = varsP.argsListed('bnx_sort')
    minlen = 0
    validminlen = False
    if "-minlen" in sortargs:
        #next ele should be the len, if next ele isn't in list, the sort job will fail
        minlen = sortargs[sortargs.index("-minlen") + 1]
        minlen = util.getIntFromString(minlen)  #returns None if can't cast to int
        if minlen:
            validminlen = True

    #NOTE(review): bnxpath is only None here when skipbnx, so this warning is limited
    #to that case, and minlen is reset to 0 for every bnx which is actually read
    #(supplied or sorted_file); kept as is -- confirm this is intended
    if not validminlen and bnxpath is None and sortargs:
        varsP.updatePipeReport(
            "Warning in AlignModule.getAlignStats: unable to obtain minlen from bnx_sort arguments; defaulting to 0\n"
        )
    if bnxpath is not None:  #if bnxpath, ignore minlen
        minlen = 0

    nmol = 0  #total n mol above minlen
    totlen = 0  #total mol len above minlen
    #bnxpath is None if skipbnx: do not hand None to checkFile
    if bnxpath is not None and util.checkFile(bnxpath):
        #simpleBnxStats is used bc the util.bnxfile class is very wasteful
        outstr = "Reading molecule stats from %s:\n" % bnxpath
        outstr += "Molecule Stats:\n"
        moldict = util.simpleBnxStats(bnxpath, minlen)
        nmol = moldict["nmol"]
        totlen = moldict["totlen"]
        #this is the same for isref or not, but just print twice bc no easy way to tell if was printed previously
        outstr += "N mols: %i\n" % nmol
        outstr += ("Total len (Mb): %10.3f\n") % totlen
        outstr += ("Avg len (kb)  : %10.3f\n") % moldict["avglen"]
        outstr += ("Mol N50 (kb)  : %10.3f\n") % moldict["n50"]
        outstr += ("Lab (/100kb)  : %10.3f\n") % moldict["labdensity"]

        if reflen:
            cov = totlen / reflen  #totlen is in Mb
            outstr += ("%-6s Cov (x): %10.3f\n") % ("Ref" if isref else
                                                    "Contig", cov)
        if isref or reflen or statonly:  #if neither, nothing to print
            varsP.updateInfoReport(outstr + "\n", printalso=True)
    elif not skipbnx:
        #only reachable if the bnx disappeared since the check above
        varsP.updatePipeReport(
            "Warning in AlignModule.getAlignStats: missing bnx path:" +
            bnxpath + "\n")

    if statonly:
        return

    #lastly, load .xmaps and .errs from alignmol jobs and report on stats
    totmaplen = 0  #sum of lengths of mapped portions of all molecules, on reference
    totmapqrylen = 0  #sum of lengths of mapped portions of all molecules, on query
    totconf = 0  #sum of confidence of all alignments
    nalign = 0  #total number of alignments
    #alignParams attributes which are averaged over all .err files
    avgattrs = ("fp", "fprate", "fn", "bpp", "res", "llrm", "llrgm", "bppsd",
                "sf", "sd", "sr", "ressd")
    #alignParams attributes which are summed over all .err files
    sumattrs = ("nmaps", "goodmaps")
    errvals = dict((attr, []) for attr in avgattrs + sumattrs)  #one list of values per attribute
    err = None  #will be the alignParams object if any .err files are found
    #make function to unify with same convention in mergeMap
    mappref = getMergeFilename(outFileList[0]) if outFileList else ""
    for outpath in outFileList:  #these are file prefixes
        if util.checkFile(outpath + ".xmap"):
            xmap = mc.xmap(outpath + ".xmap")
            nalign += len(xmap.xmapLookup)
            totmaplen += xmap.getSumMappedRefLen()  #in kb
            totmapqrylen += xmap.getSumMappedQryLen()  #in kb
            totconf += sum(x.Confidence for x in xmap.xmapLookup.values())
        else:
            varsP.updatePipeReport(
                "Warning in AlignModule.getAlignStats: missing xmap:" +
                outpath + ".xmap" + "\n")
        if util.checkFile(outpath + ".err"):
            err = mc.alignParams(outpath + ".err")
            for attr in avgattrs + sumattrs:
                errvals[attr].append(getattr(err, attr))

    #nalign from xmap should be the same as goodmaps from .err
    sumgoodmaps = sum(errvals["goodmaps"])
    if sumgoodmaps != nalign:
        varsP.updateInfoReport(
            "Warning in getAlignStats: n mol align differ in .err files (%i) and .xmaps (%i)\n"
            % (sumgoodmaps, nalign),
            printalso=True)
    if totmaplen or totconf or nalign:
        outstr = "Molecules Aligned to %s:\n" % ("Reference"
                                                 if isref else "Assembly")
        outstr += "N mol align       : %9i\n" % nalign
        outstr += "Mol fraction align: %13.3f\n" % (float(nalign) /
                                                    nmol if nmol else 0)
        outstr += "Tot align len (Mb): %11.1f\n" % (totmapqrylen / 1e3)  #Mb
        if reflen > 0:
            outstr += ("Effective Cov (x) : %13.3f\n") % (
                totmaplen / 1e3 / reflen)  #totmaplen is in kb, reflen is in Mb
        outstr += "Avg align len (kb): %11.1f\n" % (totmapqrylen /
                                                    nalign if nalign else 0)
        outstr += "Fraction align len: %13.3f\n" % (
            totmapqrylen / 1e3 / totlen if totlen else 0
        )  #totmapqrylen is in kb, totlen is in mb
        outstr += "Tot confidence    : %11.1f\n" % totconf
        outstr += "Avg confidence    : %11.1f\n" % (totconf /
                                                    nalign if nalign else 0)
        varsP.updateInfoReport(outstr, printalso=True)

    avgs = dict((attr, mean(errvals[attr])) for attr in avgattrs)
    if avgs["fp"] or avgs["fn"] or avgs["bpp"]:
        outstr = "Avg FP(/100kb)    : %12.2f\n" % avgs["fp"]
        outstr += "Avg FP ratio      : %13.3f\n" % avgs["fprate"]
        outstr += "Avg FN ratio      : %13.3f\n" % avgs["fn"]
        outstr += "Avg bpp           : %11.1f\n" % avgs["bpp"]
        outstr += "Avg sf            : %13.3f\n" % avgs["sf"]
        outstr += "Avg sd            : %13.3f\n" % avgs["sd"]
        outstr += "Avg sr            : %13.3f\n" % avgs["sr"]
        varsP.updateInfoReport(outstr + "\n", printalso=True)
    if err and mergepath:  #have an error file (alignParams) object
        util.checkDir(mergepath)
        mrgstr = (varsP.alignMolvrefMergeName if varsP else "merge")
        outpath = os.path.join(mergepath, mappref + mrgstr + ".err")
        #reuse the last alignParams object loaded as the container for the merged values
        for attr in avgattrs:
            setattr(err, attr, avgs[attr])
        err.nmaps = sum(errvals["nmaps"])
        err.goodmaps = sumgoodmaps
        err.writeToFile(outpath)
Example #22
0
def getArgs() :    
    parser = argparse.ArgumentParser(description=description)

    parser.add_argument('-t', dest='RefAligner', help='Path to RefAligner or dir containing it (required)', type=str) 
    parser.add_argument('-r', dest='referenceMap', help='Path to reference maps (.cmap), 1 file only (required)', type=str)
    parser.add_argument('-q', dest='queryDir', help='Path to dir containing query maps (.cmaps) (required)', type=str)
    #parser.add_argument('-x', dest='xmap', help='Path to .xmap, 1 file only (optional, if specified, no alignment is done, if not specified, -t, -r, and -q must be specified)') #not supported
    parser.add_argument('-o', dest='outputDir', help='output dir (optional, defaults to input map dir with suffix "_sv")', default="", type=str)
    parser.add_argument('-p', dest='pipelineDir', help='Pipeline dir (optional, defaults to script dir, or current directory)', default="", type=str)
    parser.add_argument('-a', dest='optArguments', help='Path to optArguments.xml (optional, default optArguments_human.xml in Pipeline dir if found, otherwise required)', default="", type=str)
    parser.add_argument('-T', dest='numThreads', help='Total number of threads (cores) to use (optional, default 4)', default=4, type=int)
    parser.add_argument('-j', dest='maxthreads', help='Threads per Job, -maxthreads (non-cluster only;optional, default 4)', default=4, type=int)
    parser.add_argument('-b', dest='bedFile', help='.bed file with gaps in reference for flagging SVs which overlap N-base gaps (optional)', default="", type=str)
    parser.add_argument('-e', dest='errFile', help='.err file to use for noise parameters--will supersede noise parameters in the optArgument supplied (but that file must still be supplied for non-noise parameters)', default="", type=str)
    parser.add_argument('-E', dest='errbinFile', help='.errbin file to use for noise parameters--will supersede noise parameters in the optArgument supplied (but that file must still be supplied for non-noise parameters)', default="", type=str)
    parser.add_argument('-C', help='Run on cluster, read XML file for submission arguments (optional--will not use cluster submission if absent)', dest='cxml', default=None)
    parser.add_argument('-s', help='Disable grouping of SV jobs (default grouped; optional)', dest='groupsv', action='store_false')
    result = parser.parse_args()

    #check all Pipeline dependencies
    if result.pipelineDir :
        cwd = result.pipelineDir
    else :
        cwd = os.path.split(os.path.realpath(__file__))[0] #this is path of this script
        if not os.path.isfile(os.path.join(cwd,"utilities.py")) : #if still not here, last try is actual cwd
            cwd = os.getcwd() #still check this below

    #this is the only one imported here and in runCharacterize
    if not os.path.isfile(os.path.join(cwd,"utilities.py")):
        print "utilities.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import utilities as util

    #xmap -- don't use this
    runaligns = True #default is to run the alignment
    xmappath = None
    #if result.xmap :
    #    xmappath = result.xmap
    #    if not util.checkFile(xmappath, ".xmap") :
    #        print "Xmap path is supplied ("+xmappath+") but not found or doesn't end in .xmap."
    #        sys.exit(1)
    #    runaligns = False

    #RefAligner -- check for either path to RefAligner, or dir containing it, depending on cluster args
    rabin = result.RefAligner
    #replicate Pipeline behavior: RefAligner is always required
    if os.path.isdir(rabin) :
        rabin = os.path.join(rabin, "RefAligner")
    if not util.checkExecutable(rabin):
        print "RefAligner not found or not executable at", rabin, "\nPlease supply RefAligner dir or full path as -t arg."
        sys.exit(1)

    #reference maps -- only required if xmap not specified
    refcmap = os.path.realpath(result.referenceMap)
    if runaligns and not util.checkFile(refcmap, ".cmap") : #and not util.checkFile(refcmap, ".spots") :
        print "Reference map file ("+refcmap+") not found or does not end in .cmap or .spots. Check -r argument."
        sys.exit(1)

    #query maps
    qrypath = os.path.realpath(result.queryDir)
    #if runaligns and not util.checkFile(qrypath, ".cmap") :
    #    print "Query map file ("+qrypath+") not found or does not end in .cmap or .spots. Check -q argument."
    #    sys.exit(1)
    if not util.checkDir(qrypath, checkWritable=False, makeIfNotExist=False) : #does NOT have to be writeable
        print "Query dir ("+qrypath+") not found or not a dir. Check -q argument."
        sys.exit(1)
    if runaligns :
        contigdir  = qrypath #os.path.split(qrypath)[0] #dir of query maps
        contigbase = os.path.split(qrypath)[1] #filename
    else :
        contigdir  = os.path.split(xmappath)[0]
        contigbase = os.path.split(xmappath)[1] #filename
    #contigbase = contigbase[:contigbase.find(".")] #remove suffix

    #optargs file
    optargs = None
    if result.optArguments : #supplied on command line
        optargs = result.optArguments
        if not util.checkFile(optargs, ".xml") :
            print "optArguments path is supplied ("+optargs+") but not found or doesn't end in .xml, check -a argument."
            sys.exit(1)
    elif runaligns : #load from Pipeline dir if running alignments
        optargs = os.path.join(cwd,"optArguments_human.xml")
        if not util.checkFile(optargs):
            print "optArguments.xml missing in Pipeline directory ("+cwd+"). Try supplying path explicitly using -a."
            sys.exit(1)

    #cluster args
    clustargs = None
    if result.cxml :
        clustargs = os.path.realpath(result.cxml)
        if not util.checkFile(clustargs, ".xml") :
            print "clusterArguments path is supplied ("+clustargs+") but not found or doesn't end in .xml, check -C argument."
            sys.exit(1)

    #nthreads
    nthreads = result.numThreads
    if nthreads <= 0 :
        print "Number of threads value invalid (must be > 0): %i" % nthreads
        sys.exit(1)

    #maxthreads
    maxthreads = result.maxthreads
    if maxthreads <= 0 :
        print "Max threads value invalid (must be > 0): %i" % maxthreads
        sys.exit(1)

    #bed file
    bedfile = result.bedFile #must make local for return statement below
    if bedfile : #must check for empty string BEFORE you do realpath, or it returns cwd
        bedfile = os.path.realpath(result.bedFile)
        if not util.checkFile(bedfile, ".bed") :
            print "bed file supplied but not found or incorrect suffix:", bedfile
            sys.exit(1)

    #.errbin file
    errbinfile = result.errbinFile
    if errbinfile :
        errbinfile = os.path.realpath(result.errbinFile)
        if not util.checkFile(errbinfile, ".errbin") :
            print "errbin file supplied but not found or incorrect suffix:", errbinfile
            sys.exit(1)

    #.err file
    errfile = result.errFile
    if errfile and errbinfile :
        print "Warning: .err and .errbin arguments supplied; ignoring .err file"
        errfile = ""
    elif errfile :
        errfile = os.path.realpath(result.errFile)
        if not util.checkFile(errfile, ".err") :
            print "err file supplied but not found or incorrect suffix:", errfile
            sys.exit(1)

    outdir = os.path.realpath(result.outputDir)

    #yes, this is messy...but I don't want another class (besides varsPipeline) and they just go to runCharacterize
    return cwd, rabin, refcmap, contigdir, contigbase, runaligns, xmappath, optargs, nthreads, maxthreads, bedfile, errfile, outdir, errbinfile, clustargs, result.groupsv
Example #23
0
def runSV(cwd, rabin, refcmap, contigdir, contigbase, runaligns, xmappath, optargs, nthreads, maxthreads, bedfile, errfile, outdir, errbinfile, clustargs, groupsv):
    '''Load Pipeline modules found in cwd, configure a Pipeline.varsPipeline object, and
    run SV detection (SVModule.SVdetect) on the query maps if runaligns.

    cwd        -- dir which must contain utilities.py, Pipeline.py and SVModule.py
                  (and SampleCharModule.py if errfile is supplied)
    rabin      -- RefAligner path (varsP.RefAlignerBin)
    refcmap    -- reference map (varsP.ref)
    contigdir  -- dir of query maps; also the default prefix of the output dir
    contigbase -- contigdir/contigbase.cmap is used as varsP.latestMergedCmap
    runaligns  -- if False, nothing is run and "ERROR: feature not supported" is printed
    xmappath   -- not used in this function
    optargs    -- optArguments xml path (varsP.optArgumentsFileIn)
    nthreads   -- varsP.nThreads
    maxthreads -- varsP.maxthreads
    bedfile    -- if not empty, assigned to varsP.bedFile
    errfile    -- .err file with noise parameters; only used if errbinfile is empty
    outdir     -- output dir; if empty, contigdir+"_sv" is used
    errbinfile -- .errbin file, passed on as the "readparameters" noise parameter
    clustargs  -- cluster arguments xml; if supplied, cluster submission is configured
    groupsv    -- varsP.groupSV

    Exits with status 1 on a missing module, an unusable output dir, a cluster
    argument error (varsP.error), or unreadable noise parameters. Returns None.
    '''

    printargs = True

    #check for and import Pipeline modules; the file checks are done relative to cwd
    if not os.path.isfile(os.path.join(cwd,"utilities.py")):
        print "utilities.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import utilities as util

    if not util.checkFile(os.path.join(cwd,"Pipeline.py")):
        print "Pipeline.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import Pipeline

    if not util.checkFile(os.path.join(cwd,"SVModule.py")):
        print "SVModule.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import SVModule as svm

    #SampleCharModule is only needed (and scm only bound) if an .err file was supplied
    if errfile and not util.checkFile(os.path.join(cwd,"SampleCharModule.py")):
        print "SampleCharModule.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    elif errfile :
        import SampleCharModule as scm

    #use Pipeline objects

    varsP = Pipeline.varsPipeline()

    varsP.optArgumentsFileIn   = optargs
    varsP.RefAlignerBin        = rabin
    varsP.latestMergedCmap     = os.path.join(contigdir, contigbase+".cmap") #file suffix required to be .cmap
    varsP.contigFolder         = os.path.split(contigdir)[0]
    varsP.nThreads             = nthreads #necessary otherwise job won't start -- max threads per node
    varsP.maxthreads           = maxthreads #threads per job
    varsP.ref                  = refcmap
    varsP.stdoutlog            = True #enable -stdout -stderr args to RefAligner
    varsP.curCharacterizeCmaps = [varsP.latestMergedCmap]
    varsP.contigSubDirectories = True #needed for prepareContigIO
    varsP.doAlignMolvRef       = False #do not look for copy number
    varsP.groupSV              = groupsv #mimic Pipeline behavior: group or not 

    if runaligns :
        #varsP.contigAlignTarget = outdir
        varsP.runSV = False
        varsP.groupContigs = False
        varsP.stdoutlog    = True #use -stdout -stderr
        varsP.stageComplete = contigbase
        varsP.outputContigPrefix = getContigPrefix(util, contigdir) #if outdir is not supplied, this is used as dir prefix; also used as file pref for -o arg
        varsP.outputContigFolder = contigdir #cmaps are copied from here

        #output dir: default to <contigdir>_sv; an existing dir must be writeable and
        #must not be the input dir, a missing dir is created by util.checkDir
        if not outdir :
            outdir = contigdir+"_sv" #this will be outdir of sv jobs
        if os.path.isdir(outdir) :
            if not util.checkDir(outdir) : #check writeable
                print "\nERROR: Output dir is not writeable:\n", outdir, "\n"                
                sys.exit(1)
            elif outdir == contigdir :
                print "\nERROR: Output dir cannot be same as input dir:\n", outdir, "\n"                
                sys.exit(1)                
            print "\nWARNING: Output dir already exists, results will be overwritten:\n", outdir, "\n"
        elif not util.checkDir(outdir) : #does not exist, make, if False, can't make or not writeable
            print "\nERROR: Output dir cannot be created or is not writeable:\n", outdir, "\n"
            sys.exit(1)

        #cluster submission is only configured if a cluster arguments xml was supplied
        if clustargs :
            os.putenv('SGE_ROOT', '/var/lib/gridengine') #do I want this???
            varsP.onCluster = True
            varsP.clusterLogDir = os.path.join(outdir, 'ClusterLogs')
            util.checkDir(varsP.clusterLogDir) #make it
            varsP.checkCluster()
            varsP.clusterArgumentsFileIn = clustargs #required for parseArguments
            varsP.parseArguments(readingClusterFile=True)
            if varsP.error :
                print varsP.message
                sys.exit(1)
            varsP.RefAlignerBin += "${BINARY_SUFFIX:=}" #copy from varsPipeline, handled by external script on phi host

        #all logs and the status file go in outdir
        varsP.pipeReportFile = os.path.join(outdir, "sv_jobs_log.txt")
        varsP.infoReportFile = os.path.join(outdir, "sv_log.txt")
        varsP.memoryLogpath  = os.path.join(outdir, "memory_log.txt")
        if bedfile :
            varsP.bedFile = bedfile
        util.InitStatus( os.path.join(outdir, "status.xml") )
        varsP.parseArguments() #parses optArgumentsFile
        varsP.checkDependencies()
        varsP.RefAlignerBinOrig = rabin
        varsP.prerunLog() #general information in log -- needed for refaligner_version
        if printargs :
            print "\nRunning SV detection with arguments ("+os.path.split(optargs)[1]+"):\n" + " ".join(varsP.argsListed('svdetect')) + '\n'

        #noise parameters: the .errbin file takes precedence over the .err file;
        #if neither is supplied, noisep stays an empty dict
        noisep = {}
        if errbinfile :
            noisep = {"readparameters": errbinfile}
            print "Using noise parameters from "+errbinfile+"\n"
        elif errfile :
            noisep = scm.readNoiseParameters(errfile.replace(".err",""))
            if noisep.has_key('readparameters') : #remove this because it's redundant, and it can cause problems with RefAligner compatibility
                del noisep['readparameters']
            if not noisep : #readNoiseParameters returns empty dict on failure
                print "ERROR reading noise parameters, check .err file:", errfile
                sys.exit(1)
            print "Using noise parameters from "+errfile+":\n" + " ".join(["-"+str(k)+" "+str(v) for k,v in noisep.iteritems()])+"\n"

        #make merged cmap to replace merged _q.cmap if not produced by RefAligner
        varsP.contigPathTxtFile = os.path.join(outdir, "contig_list.txt") #mergeIntoSingleCmap creates this file
        print "Creating merged cmap"
        varsP.mergeIntoSingleCmap(outdir)
        print "Merged cmap created:", varsP.latestMergedCmap, "\n"

        #NOTE(review): same assignment as above; presumably repeated in case mergeIntoSingleCmap changes it -- confirm
        varsP.outputContigFolder = contigdir #cmaps are copied from here
        svmodule = svm.SVdetect(varsP, noisep, outdir, skipderes=True)
        #this got duplicated above
        #if hasattr(util, "InitStatus") : #if old version, skip -- do this after SVdetect.__init__ bc makes outdir
        #    util.InitStatus(os.path.join(outdir, "status.xml")) #needed otherwise call to status_log fails
        svmodule.runJobs()
        svmodule.checkResults()
        util.SummarizeErrors(varsP) 

    else :
        #no jobs are run in this case: only the message is printed (no exit call here)
        varsP.contigAlignTarget = contigdir #this is dir in which _q and _r cmaps must be located
        print "ERROR: feature not supported" #not implemented to not run jobs
def runAlignMol() :    
    parser = argparse.ArgumentParser(description=description)

    parser.add_argument('-q', dest='queryDir', help='Path to merged cmap to align molecules (-b) to OR alignmol dir from Pipeline for merge (if latter, no alignments are performed), required', type=str)
    parser.add_argument('-b', dest='bnx', help='Input molecule (.bnx) file, required if aligning molecules', type=str)
    #parser.add_argument('-b', dest='bnx', help='Input molecule (.bnx) file OR path to dir containing split bnx pieces, required if aligning molecules', type=str) #I should add the split feature; for now, just do single bnx
    parser.add_argument('-a', dest='optArguments', help='Path to optArguments.xml (optional, default optArguments_human.xml in Pipeline dir if found, otherwise required)', default="", type=str)
    parser.add_argument('-r', help='If this flag is used, alignmolvref arguments are used, otherwise alignmol arguments are used (default alignmol; optional)', dest='ref', action='store_true')
    parser.add_argument('-o', dest='outputDir', help='output dir (optional, defaults to sub-dir of input map dir called "alignmol")', default="", type=str)
    parser.add_argument('-t', dest='RefAligner', help='Path to RefAligner or dir containing it (required)', type=str) 
    parser.add_argument('-T', dest='numThreads', help='Total number of threads (cores) to use (optional, default 4)', default=4, type=int)
    parser.add_argument('-j', dest='maxthreads', help='Threads per Job, -maxthreads (non-cluster only;optional, default 4)', default=4, type=int)
    parser.add_argument('-e', dest='errFile', help='.err file to use for noise parameters--will supersede noise parameters in the optArgument supplied (but that file must still be supplied for non-noise parameters)--should be from autoNoise', default="", type=str)
    parser.add_argument('-E', dest='errbinFile', help='.errbin file to use for noise parameters--will supersede noise parameters in the optArgument supplied (but that file must still be supplied for non-noise parameters)--should be from autoNoise', default="", type=str)
    parser.add_argument('-p', dest='pipelineDir', help='Pipeline dir (optional, defaults to script dir, or current directory)', default="", type=str)
    parser.add_argument('-v', dest='pvalue', help='Alignment pvalue', default="1e-12")
    result = parser.parse_args()

    outprefix = "exp_refineFinal1" #this is the default; assume for now

    #check all Pipeline dependencies
    if result.pipelineDir :
        cwd = result.pipelineDir
    else :
        cwd = os.path.split(os.path.realpath(__file__))[0] #this is path of this script
        if not os.path.isfile(os.path.join(cwd,"utilities.py")) : #if still not here, last try is actual cwd
            cwd = os.getcwd() #still check this below

    #this is the only one imported here and in runCharacterize
    if not os.path.isfile(os.path.join(cwd,"utilities.py")):
        print "ERROR: utilities.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import utilities as util

    if not os.path.isfile(os.path.join(cwd,"AlignModule.py")):
        print "ERROR: AlignModule.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import AlignModule as alignmod

    if not util.checkFile(os.path.join(cwd,"Pipeline.py")):
        print "ERROR: Pipeline.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import Pipeline

    #input dir
    if not result.queryDir :
        print "ERROR: Query (-q) argument not supplied."
        sys.exit(1)
    qrypath = os.path.realpath(result.queryDir)
    if util.checkDir(qrypath, checkWritable=False, makeIfNotExist=False) : #output elsewhere so not writeable is ok
        runaligns = False
    elif util.checkCmap(qrypath) :
        runaligns = True
    else :
        print "ERROR: Query argument ("+qrypath+") not found or not a dir or cmap. Check -q argument."
        sys.exit(1)

    #this check isn't really necessary...make it a warning -- left over from runAlignMerge.py
    #if not os.path.split(qrypath)[1].endswith("alignmol") :
    #    print "Warning: Query dir ("+qrypath+") does not end with 'alignmol'; please be sure this is a Pipeline alignmol dir\n"

    #RefAligner -- check for either path to RefAligner, or dir containing it, depending on cluster args
    rabin = "" #need empty string for generateJobList even though no jobs are run
    if runaligns :
        rabin = result.RefAligner
        #replicate Pipeline behavior: RefAligner is always required
        if os.path.isdir(rabin) :
            rabin = os.path.join(rabin, "RefAligner")
        if not util.checkExecutable(rabin):
            print "ERROR: RefAligner not found or not executable at", rabin, "\nPlease supply RefAligner dir or full path as -t arg."
            sys.exit(1)

    #optargs file
    optargs = None
    if runaligns and result.optArguments : #supplied on command line
        optargs = result.optArguments
        if not util.checkFile(optargs, ".xml") :
            print "optArguments path is supplied ("+optargs+") but not found or doesn't end in .xml, check -a argument."
            sys.exit(1)
    elif runaligns : #load from Pipeline dir if running alignments
        optargs = os.path.join(cwd,"optArguments_human.xml")
        if not util.checkFile(optargs):
            print "optArguments.xml missing in Pipeline directory ("+cwd+"). Try supplying path explicitly using -a."
            sys.exit(1)

    #output dir
    if not result.outputDir :
        outdir = os.path.join(qrypath, "merge") #should be same as in AlignModule
    else :
        outdir = os.path.realpath(result.outputDir)
    if os.path.isdir(outdir) :
        if not util.checkDir(outdir) : #check writeable
            print "\nERROR: Output dir is not writeable:\n", outdir, "\n"                
            sys.exit(1)
        #this is ok here
        #elif outdir == contigdir :
        #    print "\nERROR: Output dir cannot be same as input dir:\n", outdir, "\n"                
        #    sys.exit(1)                
        print "\nWARNING: Output dir already exists, results will be overwritten:\n", outdir, "\n"
    elif not util.checkDir(outdir) : #does not exist, make, if False, can't make or not writeable
        print "\nERROR: Output dir cannot be created or is not writeable:\n", outdir, "\n"
        sys.exit(1)
    
    #bnx file
    bnxfile = result.bnx
    if bnxfile : #must check for empty string BEFORE you do realpath, or it returns cwd
        bnxfile = os.path.realpath(bnxfile)
        if not util.checkFile(bnxfile, ".bnx") :
            print "ERROR: bnx file supplied but not found or incorrect suffix:", bnxfile
            sys.exit(1)
    elif runaligns :
        print "ERROR: bnx file not supplied but running alignments; please supply bnx file as -b argument"
        sys.exit(1)

    #nthreads
    nthreads = result.numThreads
    if nthreads <= 0 :
        print "ERROR: Number of threads value invalid (must be > 0): %i" % nthreads
        sys.exit(1)

    #maxthreads
    maxthreads = result.maxthreads
    if maxthreads <= 0 :
        print "ERROR: Max threads value invalid (must be > 0): %i" % maxthreads
        sys.exit(1)
    elif nthreads < maxthreads :
        print "Warning: num threads (-T: %i) < max threads (-j: %i): increasing num threads to equal max threads\n" % (nthreads, maxthreads)
        nthreads = maxthreads

    #pvalue
    if result.pvalue : #supplied on command line
        pvalue = result.pvalue
    else :
        pvalue = "1e-12"    

    #.errbin file
    errbinfile = result.errbinFile
    if errbinfile :
        errbinfile = os.path.realpath(result.errbinFile)
        if not util.checkFile(errbinfile, ".errbin") :
            print "ERROR: errbin file supplied but not found or incorrect suffix:", errbinfile
            sys.exit(1)

    #.err file
    errfile = result.errFile
    if errfile and errbinfile :
        print "Warning: .err and .errbin arguments supplied; ignoring .err file"
        errfile = ""
    elif errfile :
        errfile = os.path.realpath(result.errFile)
        if not util.checkFile(errfile, ".err") :
            print "err file supplied but not found or incorrect suffix:", errfile
            sys.exit(1)

    if errfile and not util.checkFile(os.path.join(cwd,"SampleCharModule.py")):
        print "SampleCharModule.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    elif errfile :
        import SampleCharModule as scm

    doref = result.ref

    #DONE checking arguments

    print "Using output dir", outdir
    if runaligns :
        print "Aligning", bnxfile, "\nTo", qrypath, "\n"
    else :
        print "Merging", qrypath, "\n"

    startTime = time.time() #time since Epoch
    memory_log = os.path.join(outdir, "memory_log.txt")
    util.initMemoryLog(memory_log)

    varsP = Pipeline.varsPipeline()
    varsP.RefAlignerBin        = rabin
    varsP.contigFolder         = "" #not used but needs to be an attr
    varsP.outputContigFolder   = "" #not used but needs to be a string attr
    varsP.pipeReportFile = os.path.join(outdir, "alignmol_jobs_log.txt")
    varsP.infoReportFile = os.path.join(outdir, "alignmol_log.txt")
    util.InitStatus( os.path.join(outdir, "status.xml") )

    if runaligns :
        varsP.optArgumentsFileIn   = optargs
        varsP.latestMergedCmap     = qrypath #if !doref, need this one
        varsP.ref                  = qrypath #and if doref, need this one
        varsP.nThreads             = nthreads #necessary otherwise job won't start -- max threads per node
        varsP.maxthreads           = maxthreads #threads per job
        p = os.path.split(qrypath)[1]
        varsP.outputContigPrefix   = p[:p.rfind(".")] #filename prefix
        varsP.stdoutlog    = True #use -stdout -stderr

        varsP.memoryLogpath  = os.path.join(outdir, "memory_log.txt")
        varsP.parseArguments() #parses optArgumentsFile
        varsP.replaceParam("alignmol", "-T", pvalue)
        varsP.checkDependencies()
        varsP.RefAlignerBinOrig = rabin
        varsP.prerunLog() #general information in log -- needed for refaligner_version

        noisep = {}
        if errbinfile :
            noisep = {"readparameters": errbinfile}
            #print "Using noise parameters from "+errbinfile+"\n" #move below
        elif errfile :
            noisep = scm.readNoiseParameters(errfile.replace(".err",""))
            if noisep.has_key('readparameters') : #remove this because it's redundant, and it can cause problems with RefAligner compatibility
                del noisep['readparameters']
            if not noisep : #readNoiseParameters returns empty dict on failure
                print "ERROR reading noise parameters, check .err file:", errfile
                sys.exit(1)
            #redundant with below?
            print "Using noise parameters from "+errfile+":\n" + " ".join(["-"+str(k)+" "+str(v) for k,v in noisep.iteritems()])+"\n"

        #some code from SampleCharModule to load args into noise0
        infoReport="Loaded noise parameters:\n"
        klist = ["FP", "FN", "sf", "sd", "sr", "bpp", "readparameters"] #hardcoding parameters is kind of bad, but it fixes the order without using OrderedDict.
        #noiseargs = self.varsP.argsListed('noise0') #not necessary
        for v in klist :
            if not noisep.has_key(v) :
                continue
            param=str(noisep[v])
            util.LogStatus("parameter", "auto_"+v, param)
            infoReport+=v+":"+param+"\n"
            varsP.replaceParam("noise0", "-"+v, param)
        varsP.updateInfoReport(infoReport + '\n', printalso=True)

    else :
        print "Getting file list from", qrypath
        outFileList = getOutFileList(util, qrypath)
        if not outFileList :
            print "ERROR: Query dir ("+qrypath+") does not contain alignmol data. Check -q argument."
            sys.exit(1)
        else :
            print "Found", len(outFileList), "alignment results"
    #end if runaligns

    amod = alignmod.AlignModule(varsP, doref, outdir, bnxfile) #constructor will call generateJobList

    if runaligns :
        amod.runJobs()
	amod.checkResults()
    else :
        amod.outFileList = outFileList
        p = os.path.split(outFileList[0])[1]
        if p.count("_") > 1 : #expect something like "EXP_REFINEFINAL1_4"
            #p = p[:p.rfind("_")+1] #remove integer suffix
            p = p[:p.rfind("_")] #remove integer suffix (and underscore)
        #else :
        #    p += "_" #because mrgstr is appended
        varsP.outputContigPrefix = p

    if not runaligns or len(amod.jobList) > 0 :
        amod.getAlignStats()

    if runaligns :
        print
        #copy from Pipeline.py
        if util.SummarizeErrors(varsP=varsP)==0:
            varsP.updatePipeReport("Pipeline has successfully completed\n") 
            util.LogStatus("progress", "pipeline", "success")
        else:
            varsP.updatePipeReport("Pipeline has completed with errors\n") 
            util.LogStatus("progress", "pipeline", "failure")

    #BELOW OLD CODE

    return

    #in Pipeline, this is called first
    #print "Calling getAlignStats:" #but it won't work without varsP atm; skip it
    #getAlignStats(self.varsP, self.outFileList, self.varsP.totAssemblyLenMb, isref=False, mergepath=self.mergedir)
    #getAlignStats(self.varsP, self.outFileList, self.varsP.totAssemblyLenMb, isref=False, mergepath=self.mergedir)

    print "Calling mergeMap"
    print outFileList[0] #, "\n", outputdir #moved above
    util.logMemory(memory_log, startTime, "mergeMap_start")
    #mergeMap(self.varsP, self.outFileList, mergepath=self.outputdir) #varsP is optional
    alignmod.mergeMap(None, outFileList, outputdir) 
    util.logMemory(memory_log, startTime, "mergeMap_end")

    print "Calling mergeRcmaps"
    util.logMemory(memory_log, startTime, "mergeRcmaps_start")
    #mergeRcmaps(outFileList, outdir, varsP=None, splitByContig=None, stageName="alignmol") :
    alignmod.mergeRcmaps(outFileList, outputdir, splitByContig=True, stageName=outprefix) 
    util.logMemory(memory_log, startTime, "mergeRcmaps_end")

    print "Calling split_XMap_byContig" #split_XMapQcmap_byContig"
    util.logMemory(memory_log, startTime, "split_XMap_byContig_start")
    #xmapdict = alignmod.split_XMap_byContig(outFileList, outputdir, stageName=outprefix) #old
    xmapdict = alignmod.split_XMap_byContig_new(outFileList, outputdir, stageName=outprefix)
    util.logMemory(memory_log, startTime, "split_XMap_byContig_end")

    print "Calling split_Qcmap_byContig" 
    util.logMemory(memory_log, startTime, "split_Qcmap_byContig_start")
    #alignmod.split_Qcmap_byContig(outFileList, outputdir, xmapdict) #old
    alignmod.split_Qcmap_byContig_new(outFileList, outputdir, xmapdict, stageName=outprefix) #new: better performance
    util.logMemory(memory_log, startTime, "split_Qcmap_byContig_end")

    print "AlignMerge successfully completed"
Example #25
0
    def __init__(self, varsP) :
        """splitBNX.__init__: this class is for sorting the input bnx
        for subsequent splitting by the splitBNX class, and eventually
        easier processing with the Pairwise class. The constructor
        (this) will call varsP.runJobs and doAllPipeReport, then
        instantiate splitBNX, which will do all the splitting required
        for the Pairwise class.
        """
        self.stageName = "Autonoise0"
        self.varsP = varsP #fewer code modifications below
        
        util.LogStatus("progress", "stage_start", self.stageName) #after above bc check if bypass (executeCurrentStage)

        self.output_folder = os.path.join(self.varsP.contigFolder, "auto_noise")
        if not util.checkDir(self.output_folder) : #will make if not exist, only returns False if already exists or can't make
            print "ERROR in autoNoise: bad dir:", self.output_folder
            raise RuntimeError
	    
        # We use assembly section here because the memory usage is higher than pairwise, while the jobs are quite short.
        #sortJobSet=mthread.jobWrapper(self.varsP,jobName,clusterArgs=self.varsP.getClusterArgs('assembly'))
        super(autoNoise, self).__init__(self.varsP, self.stageName, clusterArgs=self.varsP.getClusterArgs("assembly"))

        bnxfile = self.varsP.bnxFile if varsP.noiseOnly else self.varsP.sorted_file+".bnx"
        #was return if generateJobListChar, but need to get readparameters if bypass
        if not self.generateJobListChar({}, bnxfile, "autoNoise0") : #return 0 for success, 1 for skip
            self.varsP.runJobs(self, "AutoNoise0")
            self.doAllPipeReport()
        if not self.allResultsFound() :
            self.varsP.updatePipeReport("ERROR: AutoNoise0 failed. Check: "+self.output_file+".stdout\n")
            raise RuntimeError
        util.LogStatus("progress", "stage_complete", self.stageName)
            
        self.varsP.noise0 = readNoiseParameters(self.output_file)
	self.isBadErrorParams(self.varsP.noise0, 0)

        self.stageName = "Autonoise1"
        self.groupName = self.stageName #fix so that LogStatus call in MultiThreading.multiThreadRunJobs
        util.LogStatus("progress", "stage_start", self.stageName)

        self.clearJobs()
        
	self.varsP.replaceParam("noise0", "-readparameters", self.output_file+".errbin")

        #need to call again to set self.output_file
        if not self.generateJobListChar(self.varsP.noise0, bnxfile, "autoNoise1") : #return 0 for success, 1 for skip
            self.varsP.runJobs(self, "AutoNoise1")
            self.doAllPipeReport()
        if not self.allResultsFound() :
            self.varsP.updatePipeReport("ERROR: AutoNoise1 failed. Check: "+self.output_file+".stdout\n")
            raise RuntimeError
            
        self.varsP.noise1 = readNoiseParameters(self.output_file)
        
	infoReport="Automatically determined noise parameters:\n"
        klist = ["FP", "FN", "sf", "sd", "sr", "bpp", "readparameters"] #hardcoding parameters is kind of bad, but it fixes the order without using OrderedDict.
        for v in klist :
            if not self.varsP.noise1.has_key(v) :
                continue
            param=str(self.varsP.noise1[v])
            util.LogStatus("parameter", "auto_"+v, param)
            infoReport+=v+":"+param+"\n"
            self.varsP.replaceParam("noise0", "-"+v, param)
        self.varsP.updateInfoReport(infoReport + '\n')
        self.isBadErrorParams(self.varsP.noise1, 1)

        if self.varsP.doScanScale : #change the sorted_file to the rescaled bnx file
            rescaledbnx = self.output_file + self.varsP.rescaleSuffix #no ".bnx" in suffix
            if not util.checkFile(rescaledbnx+".bnx") : #not found--not an error if bnx 0.1 is used
                err = "Warning: scan scaled bnx not found after autoNoise1; not performing scan scaling--check that bnx 1.0 or later used in input"
                self.varsP.updatePipeReport( err+"\n\n" )
                util.LogError("warning", err)
                self.varsP.doScanScale = False
            else : #log that scan scaling is used
                self.varsP.updatePipeReport( "Using scan scaled bnx: "+rescaledbnx+".bnx\n\n" )
                util.LogStatus("parameter", "scanscaled_bnx", rescaledbnx+".bnx")
                self.varsP.sorted_file = rescaledbnx #this variable is used in splitBNX (PairwiseModule.py)
            
        util.LogStatus("progress", "stage_complete", self.stageName)
def runCharacterize(cwd, rabin, refcmap, contigdir, contigbase, runaligns, xmappath, optargs, nthreads):
    '''Load Pipeline files from first arg; configure CharacterizeModule; run alignments if runaligns;
    report on those alignments or the xmap provided as xmappath.
    '''

    printargs = True

    if not os.path.isfile(os.path.join(cwd,"utilities.py")):
        print "utilities.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import utilities as util

    if not util.checkFile(os.path.join(cwd,"Pipeline.py")):
        print "Pipeline.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import Pipeline

    if not util.checkFile(os.path.join(cwd,"CharacterizeModule.py")):
        print "CharacterizeModule.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import CharacterizeModule as cm

    #if not util.checkFile(os.path.join(cwd,"MapClassesRev.py")):
    #    print "MapClassesRev.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
    #    sys.exit(1)
    #import MapClassesRev

    #use Pipeline objects

    varsP = Pipeline.varsPipeline()

    varsP.optArgumentsFileIn   = optargs
    varsP.RefAlignerBin        = rabin
    varsP.latestMergedCmap     = os.path.join(contigdir, contigbase+".cmap") #file suffix required to be .cmap
    varsP.contigFolder         = contigdir
    varsP.nThreads             = nthreads #necessary otherwise job won't start
    varsP.ref                  = refcmap
    varsP.stdoutlog            = True #enable -stdout -stderr args to RefAligner
    varsP.curCharacterizeCmaps = [varsP.latestMergedCmap]

    if runaligns :
        varsP.contigAlignTarget = contigdir+"/alignref" #this is output dir
        varsP.runSV = False
        varsP.groupContigs = False
        varsP.stageComplete = contigbase
        varsP.outputContigFolder = contigdir
        varsP.memoryLogpath  = os.path.join(contigdir, "memory_log.txt")
        varsP.pipeReportFile = os.path.join(contigdir, "pipeReport.txt")
        varsP.parseArguments() #parses optArgumentsFile
        if printargs :
            print "\nRunning Characterization with arguments:\n" + " ".join(varsP.argsListed('characterizeDefault')) + '\n'
        if hasattr(util, "InitStatus") : #if old version, skip
            util.InitStatus(os.path.join(contigdir, "status.xml")) #needed otherwise call to status_log fails
        charmod = cm.Characterize(varsP) #create Characterize object from CharacterizeModule -- this also calls generateJobList
        xmappath = charmod.xmapTarget #set in Characterize.generateJobList
        charmod.runJobs()
    else :
        #varsP.contigAlignTarget = contigdir #this is dir in which _q and _r cmaps must be located -- contigdir is from cmap; this should be from xmap
        varsP.contigAlignTarget = os.path.split(xmappath)[0]
        print "Loading alignments from\n" + xmappath + "\n"

    #no longer using this in Pipeline
    #print MapClassesRev.TopLevelCharacterization(varsP, [os.path.join(varsP.contigAlignTarget, contigbase)])

    print cm.characterizeContigs(varsP, xmappath) 
def runCharacterize(cwd, rabin, refcmap, contigdir, contigbase, runaligns,
                    xmappath, optargs, nthreads, pvalue):
    '''Load Pipeline files from first arg; configure CharacterizeModule; run alignments if runaligns;
    report on those alignments or the xmap provided as xmappath.
    '''

    printargs = True

    if not os.path.isfile(os.path.join(cwd, "utilities.py")):
        print "utilities.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import utilities as util

    if not util.checkFile(os.path.join(cwd, "Pipeline.py")):
        print "Pipeline.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import Pipeline

    if not util.checkFile(os.path.join(cwd, "CharacterizeModule.py")):
        print "CharacterizeModule.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import CharacterizeModule as cm

    if not util.checkFile(os.path.join(cwd, "MapClassesRev.py")):
        print "MapClassesRev.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import MapClassesRev

    #use Pipeline objects

    varsP = Pipeline.varsPipeline()

    varsP.optArgumentsFileIn = optargs
    varsP.RefAlignerBin = rabin
    varsP.latestMergedCmap = os.path.join(
        contigdir, contigbase + ".cmap")  #file suffix required to be .cmap
    varsP.contigFolder = contigdir
    varsP.nThreads = nthreads  #necessary otherwise job won't start
    varsP.ref = refcmap
    varsP.stdoutlog = True  #enable -stdout -stderr args to RefAligner
    varsP.curCharacterizeCmaps = [varsP.latestMergedCmap]

    if runaligns:
        varsP.contigAlignTarget = contigdir + "/alignref_final"  #this is output dir
        varsP.runSV = False
        varsP.groupContigs = False
        varsP.stageComplete = contigbase
        varsP.outputContigFolder = contigdir
        varsP.memoryLogpath = os.path.join(contigdir, "memory_log.txt")
        varsP.stdoutlog = True
        varsP.pipeReportFile = os.path.join(contigdir, "pipeReport.txt")
        varsP.parseArguments()  #parses optArgumentsFile
        varsP.replaceParam("characterizeFinal", "-T", pvalue)
        if printargs:
            print "\nRunning Characterization with arguments:\n" + " ".join(
                varsP.argsListed('characterizeFinal')) + '\n'
        if hasattr(util, "InitStatus"):  #if old version, skip
            util.InitStatus(os.path.join(
                contigdir,
                "status.xml"))  #needed otherwise call to status_log fails
        charmod = cm.Characterize(
            varsP, 1
        )  #create Characterize object from CharacterizeModule -- this also calls generateJobList
        xmappath = charmod.xmapTarget  #set in Characterize.generateJobList
        charmod.runJobs()
    else:
        #varsP.contigAlignTarget = contigdir #this is dir in which _q and _r cmaps must be located -- contigdir is from cmap; this should be from xmap
        varsP.contigAlignTarget = os.path.split(xmappath)[0]
        print "Loading alignments from\n" + xmappath + "\n"

#no longer using this in Pipeline
#print MapClassesRev.TopLevelCharacterization(varsP, [os.path.join(varsP.contigAlignTarget, contigbase)])

    print cm.characterizeContigs(varsP,
                                 xmappath)  #this is redundant with above