def getMergeFilename(mappref):
    '''Return the file-name prefix to which a merge-name string is appended.

    mappref is a path + filename prefix, possibly with an integer suffix.
    The directory part is dropped. The returned prefix always ends in "_":
      - "name_<int>" -> "name_"  (integer suffix removed, underscore kept)
      - "name_"      -> "name_"  (unchanged, so no "__" is created)
      - anything else gets one "_" appended.
    '''
    mappref = os.path.split(mappref)[1]  # keep only the file-name part
    pos = mappref.rfind("_")
    if pos == -1:
        return mappref + "_"  # no underscore at all: add the separator
    suf = mappref[pos + 1:]
    if not suf:
        # Already ends in "_". Whatever the integer check would say, the
        # result is the unchanged name, so the check is skipped.
        return mappref
    # per the usage elsewhere in this file, getIntFromString returns None
    # when the string cannot be cast to int
    if util.getIntFromString(suf) is not None:
        return mappref[:pos + 1]  # remove integer suffix, keep the "_"
    return mappref + "_"  # non-integer suffix: treat like no suffix
# Example no. 2
# 0
def getMergeFilename(mappref):
    '''Build the "_"-terminated file-name prefix used for merged output.

    mappref is a path + filename prefix, possibly with an integer suffix
    after the last underscore. Only the file-name part is used. An integer
    suffix is stripped (its underscore is kept); a name that already ends
    in "_" is returned as is; any other name has "_" appended, because the
    caller appends its merge string directly after the result.
    '''
    name = os.path.split(mappref)[1]
    # rpartition yields ("", "", name) when there is no underscore
    stem, sep, suf = name.rpartition("_")
    if sep and not suf:
        return name  # trailing "_" already present: don't make "__"
    # getIntFromString is expected to return None for non-integer text
    # (see its other use in this file)
    if sep and util.getIntFromString(suf) is not None:
        return stem + sep  # remove integer suffix
    return name + "_"
def _meanOrZero(values):
    '''Return sum(values)/len(values), or 0 if values is empty.'''
    return sum(values) / len(values) if values else 0


def getAlignStats(varsP, outFileList, reflen=0, isref=False, mergepath="", bnxpath=None):
    '''Standalone fn for alignment statistics for both AlignModule and AlignRefModule.

    varsP       -- pipeline variables object. This function reads its
                   sorted_file, argData and alignMolvrefMergeName attributes and
                   calls argsListed, updatePipeReport and updateInfoReport on it.
    outFileList -- alignment output file prefixes; ".xmap" and ".err" are
                   appended to each to find the files to load.
    reflen      -- should be in Mb; if 0, the coverage lines are not reported.
    isref       -- if True, label output "Ref"/"Reference", else "Contig"/"Assembly".
    mergepath   -- if supplied (and at least one .err was read), put merged .err there.
    bnxpath     -- if None, assume varsP.sorted_file + ".bnx"; otherwise, just
                   report stats of this file and ignore outFileList.

    Returns None; all results go to the reports and the optional merged .err.
    '''
    # .err fields which are summed / averaged over all jobs when merging
    errsumfields = ("nmaps", "goodmaps")
    erravgfields = ("fp", "fprate", "fn", "bpp", "res", "llrm", "llrgm",
                    "bppsd", "sf", "sd", "sr", "ressd")

    statonly = bnxpath is not None  # bnx stats only
    skipbnx = False  # .err file processing only
    if not statonly:
        if varsP.sorted_file:
            bnxpath = varsP.sorted_file + ".bnx"  # set in PairwiseModule.sort_BNX even if bypassed, but needs suffix
        else:  # for runAlignMol, this is empty: nothing to do in this case
            skipbnx = True
    if not skipbnx and not util.checkFile(bnxpath):
        varsP.updatePipeReport("Warning in AlignModule.getAlignStats: bnxpath supplied but not found: %s\n" % bnxpath)
        return

    # Find the minlen used for bnx_sort, which is a required arg set. It only
    # applies to the sorted bnx; if bnxpath was supplied, minlen is ignored.
    # (This must be decided on statonly: bnxpath itself is no longer None
    # once it has been filled in from varsP.sorted_file above.)
    # NOTE(review): assumes util.simpleBnxStats takes minlen in the same units
    # as the bnx_sort -minlen argument -- confirm.
    minlen = 0
    if not statonly:
        sortargs = []
        if 'bnx_sort' in varsP.argData:  # for runAlignMol.py
            sortargs = varsP.argsListed('bnx_sort')
        parsed = None
        if "-minlen" in sortargs:
            # next ele should be the len; if next ele isn't in list, the sort job will fail
            parsed = util.getIntFromString(sortargs[sortargs.index("-minlen") + 1])  # None if can't cast to int
        if parsed:
            minlen = parsed
        elif sortargs:
            varsP.updatePipeReport("Warning in AlignModule.getAlignStats: unable to obtain minlen from bnx_sort arguments; defaulting to 0\n")

    nmol = 0  # total n mol above minlen
    totlen = 0  # total mol len above minlen
    if not skipbnx:  # bnxpath was checked to exist above
        moldict = util.simpleBnxStats(bnxpath, minlen)
        nmol = moldict["nmol"]
        totlen = moldict["totlen"]
        lines = [
            "Reading molecule stats from %s:\n" % bnxpath,
            "Molecule Stats:\n",
            "N mols: %i\n" % nmol,
            "Total len (Mb): %10.3f\n" % totlen,
            "Avg len (kb)  : %10.3f\n" % moldict["avglen"],
            "Mol N50 (kb)  : %10.3f\n" % moldict["n50"],
            "Lab (/100kb)  : %10.3f\n" % moldict["labdensity"],
        ]
        if reflen:
            cov = totlen / reflen  # totlen is in Mb
            lines.append("%-6s Cov (x): %10.3f\n" % ("Ref" if isref else "Contig", cov))
        if isref or reflen or statonly:  # if neither, nothing to print
            varsP.updateInfoReport("".join(lines) + "\n", printalso=True)

    if statonly:
        return

    # lastly, load .xmaps and .errs from alignmol jobs and report on stats
    totmaplen = 0  # sum of lengths of mapped portions of all molecules, on reference
    totmapqrylen = 0  # sum of lengths of mapped portions of all molecules, on query
    totconf = 0  # sum of confidence of all alignments
    nalign = 0  # total number of alignments
    errvals = dict((name, []) for name in errsumfields + erravgfields)  # per-job values from .err
    err = None  # will be the alignParams object if any .err files are found
    for outpath in outFileList:  # these are file prefixes
        if util.checkFile(outpath + ".xmap"):
            xmap = mc.xmap(outpath + ".xmap")
            nalign += len(xmap.xmapLookup)
            totmaplen += xmap.getSumMappedRefLen()  # in kb
            totmapqrylen += xmap.getSumMappedQryLen()  # in kb
            totconf += sum(x.Confidence for x in xmap.xmapLookup.values())
        else:
            varsP.updatePipeReport("Warning in AlignModule.getAlignStats: missing xmap:" + outpath + ".xmap" + "\n")
        if util.checkFile(outpath + ".err"):
            err = mc.alignParams(outpath + ".err")
            for name in errvals:
                errvals[name].append(getattr(err, name))

    # nalign from xmap should be the same as goodmaps from .err
    sumgoodmaps = sum(errvals["goodmaps"])
    if sumgoodmaps != nalign:
        varsP.updateInfoReport("Warning in getAlignStats: n mol align differ in .err files (%i) and .xmaps (%i)\n" % (sumgoodmaps, nalign), printalso=True)
    if totmaplen or totconf or nalign:
        outstr = "Molecules Aligned to %s:\n" % ("Reference" if isref else "Assembly")
        outstr += "N mol align       : %9i\n" % nalign
        outstr += "Mol fraction align: %13.3f\n" % (float(nalign) / nmol if nmol else 0)
        outstr += "Tot align len (Mb): %11.1f\n" % (totmapqrylen / 1e3)  # Mb
        if reflen > 0:
            outstr += "Effective Cov (x) : %13.3f\n" % (totmaplen / 1e3 / reflen)  # totmaplen is in kb
        outstr += "Avg align len (kb): %11.1f\n" % (totmapqrylen / nalign if nalign else 0)
        outstr += "Fraction align len: %13.3f\n" % (totmapqrylen / 1e3 / totlen if totlen else 0)  # totmapqrylen is in kb, totlen is in mb
        outstr += "Tot confidence    : %11.1f\n" % totconf
        outstr += "Avg confidence    : %11.1f\n" % (totconf / nalign if nalign else 0)
        varsP.updateInfoReport(outstr, printalso=True)

    avg = dict((name, _meanOrZero(errvals[name])) for name in erravgfields)
    if avg["fp"] or avg["fn"] or avg["bpp"]:
        outstr = "Avg FP(/100kb)    : %12.2f\n" % avg["fp"]
        outstr += "Avg FP ratio      : %13.3f\n" % avg["fprate"]
        outstr += "Avg FN ratio      : %13.3f\n" % avg["fn"]
        outstr += "Avg bpp           : %11.1f\n" % avg["bpp"]
        outstr += "Avg sf            : %13.3f\n" % avg["sf"]
        outstr += "Avg sd            : %13.3f\n" % avg["sd"]
        outstr += "Avg sr            : %13.3f\n" % avg["sr"]
        varsP.updateInfoReport(outstr + "\n", printalso=True)

    if err and mergepath:  # have an error file (alignParams) object
        # Reuse the last loaded alignParams object as the container for the
        # merged values: averages for rates, sums for the map counts.
        util.checkDir(mergepath)
        mrgstr = (varsP.alignMolvrefMergeName if varsP else "merge")
        # err is only set inside the loop, so outFileList is non-empty here;
        # use the same naming convention as mergeMap
        mappref = getMergeFilename(outFileList[0])
        outpath = os.path.join(mergepath, mappref + mrgstr + ".err")
        for name in erravgfields:
            setattr(err, name, avg[name])
        err.nmaps = sum(errvals["nmaps"])
        err.goodmaps = sumgoodmaps
        err.writeToFile(outpath)
# Example no. 4
# 0
def getAlignStats(varsP,
                  outFileList,
                  reflen=0,
                  isref=False,
                  mergepath="",
                  bnxpath=None):
    '''Standalone fn for alignment statistics for both AlignModule and AlignRefModule.

    Reports molecule stats of a bnx file, then (unless bnxpath was supplied)
    alignment stats summed over the .xmap files and error parameters averaged
    over the .err files of outFileList; both go to varsP.updateInfoReport.
    Warnings go to varsP.updatePipeReport.

    reflen should be in Mb. If mergepath supplied, put merged .err there.
    If bnxpath is None, assume varsP.sorted_file; otherwise, just report stats
    of this file and ignore outFileList.
    Returns None.
    '''

    def mean(vals):
        # average over all jobs; 0 if no .err file was loaded
        return sum(vals) / len(vals) if vals else 0

    statonly = False  #bnx stats only
    skipbnx = False  #.err file processing only
    if bnxpath is None:
        if not varsP.sorted_file:  #for runAlignMol, this is empty: nothing to do in this case
            skipbnx = True
        else:
            bnxpath = varsP.sorted_file + ".bnx"  #set in PairwiseModule.sort_BNX even if bypassed, but needs suffix
    else:
        statonly = True
    if not skipbnx and not util.checkFile(bnxpath):
        varsP.updatePipeReport(
            "Warning in AlignModule.getAlignStats: bnxpath supplied but not found: %s\n"
            % bnxpath)
        return

    #find the minlen used for bnx_sort, which is a required arg set
    sortargs = []
    if 'bnx_sort' in varsP.argData:  #for runAlignMol.py
        sortargs = varsP.argsListed('bnx_sort')
    minlen = 0
    #If bnxpath was supplied, ignore minlen. Test statonly rather than bnxpath,
    #because bnxpath has been filled in from varsP.sorted_file by now.
    #NOTE(review): assumes util.simpleBnxStats expects minlen in the units of
    #the bnx_sort -minlen argument -- confirm.
    if not statonly:
        if "-minlen" in sortargs:
            #next ele should be the len, if next ele isn't in list, the sort job will fail
            minlen = util.getIntFromString(
                sortargs[sortargs.index("-minlen") + 1])  #None if can't cast to int
        if not minlen:
            minlen = 0  #never hand None on to the stats helper
            if sortargs:
                varsP.updatePipeReport(
                    "Warning in AlignModule.getAlignStats: unable to obtain minlen from bnx_sort arguments; defaulting to 0\n"
                )

    nmol = 0  #total n mol above minlen
    totlen = 0  #total mol len above minlen
    if not skipbnx:  #the file was found by the check above
        outstr = "Reading molecule stats from %s:\n" % bnxpath
        outstr += "Molecule Stats:\n"
        moldict = util.simpleBnxStats(bnxpath, minlen)
        nmol = moldict["nmol"]
        totlen = moldict["totlen"]
        outstr += "N mols: %i\n" % nmol
        outstr += "Total len (Mb): %10.3f\n" % totlen
        outstr += "Avg len (kb)  : %10.3f\n" % moldict["avglen"]
        outstr += "Mol N50 (kb)  : %10.3f\n" % moldict["n50"]
        outstr += "Lab (/100kb)  : %10.3f\n" % moldict["labdensity"]
        if reflen:
            cov = totlen / reflen  #totlen is in Mb
            outstr += "%-6s Cov (x): %10.3f\n" % ("Ref" if isref else "Contig",
                                                  cov)
        if isref or reflen or statonly:  #if neither, nothing to print
            varsP.updateInfoReport(outstr + "\n", printalso=True)

    if statonly:
        return

    #lastly, load .xmaps and .errs from alignmol jobs and report on stats
    totmaplen = 0  #sum of lengths of mapped portions of all molecules, on reference
    totmapqrylen = 0  #sum of lengths of mapped portions of all molecules, on query
    totconf = 0  #sum of confidence of all alignments
    nalign = 0  #total number of alignments
    fplist = []  #lists for error rates
    fprlist = []
    fnlist = []
    bpplist = []
    nmaplist = []  #from .err
    gmaplist = []  #from .err
    llrmlist = []
    llrgmlist = []
    bppsdlist = []
    sflist = []
    sdlist = []
    srlist = []
    reslist = []
    resdlist = []
    err = None  #will be the alignParams object if any .err files are found
    mappref = ""
    if len(outFileList) > 0:
        #function unifies this with the same convention in mergeMap
        mappref = getMergeFilename(outFileList[0])
    for outpath in outFileList:  #these are file prefixes
        if util.checkFile(outpath + ".xmap"):
            xmap = mc.xmap(outpath + ".xmap")
            nalign += len(xmap.xmapLookup)
            totmaplen += xmap.getSumMappedRefLen()  #in kb
            totmapqrylen += xmap.getSumMappedQryLen()  #in kb
            totconf += sum(x.Confidence for x in xmap.xmapLookup.values())
        else:
            varsP.updatePipeReport(
                "Warning in AlignModule.getAlignStats: missing xmap:" +
                outpath + ".xmap" + "\n")
        if util.checkFile(outpath + ".err"):
            err = mc.alignParams(outpath + ".err")
            fplist.append(err.fp)
            fprlist.append(err.fprate)
            fnlist.append(err.fn)
            bpplist.append(err.bpp)
            reslist.append(err.res)
            nmaplist.append(err.nmaps)
            gmaplist.append(err.goodmaps)
            llrmlist.append(err.llrm)
            llrgmlist.append(err.llrgm)
            bppsdlist.append(err.bppsd)
            sflist.append(err.sf)
            sdlist.append(err.sd)
            srlist.append(err.sr)
            resdlist.append(err.ressd)

    #nalign from xmap should be the same as goodmaps from .err
    sumgoodmaps = sum(gmaplist)
    if sumgoodmaps != nalign:
        varsP.updateInfoReport(
            "Warning in getAlignStats: n mol align differ in .err files (%i) and .xmaps (%i)\n"
            % (sumgoodmaps, nalign),
            printalso=True)
    if totmaplen or totconf or nalign:
        outstr = "Molecules Aligned to %s:\n" % ("Reference"
                                                 if isref else "Assembly")
        outstr += "N mol align       : %9i\n" % nalign
        outstr += "Mol fraction align: %13.3f\n" % (float(nalign) /
                                                    nmol if nmol else 0)
        outstr += "Tot align len (Mb): %11.1f\n" % (totmapqrylen / 1e3)  #Mb
        if reflen > 0:
            outstr += "Effective Cov (x) : %13.3f\n" % (
                totmaplen / 1e3 / reflen)  #totmaplen is in kb
        outstr += "Avg align len (kb): %11.1f\n" % (totmapqrylen /
                                                    nalign if nalign else 0)
        outstr += "Fraction align len: %13.3f\n" % (
            totmapqrylen / 1e3 / totlen if totlen else 0
        )  #totmapqrylen is in kb, totlen is in mb
        outstr += "Tot confidence    : %11.1f\n" % totconf
        outstr += "Avg confidence    : %11.1f\n" % (totconf /
                                                    nalign if nalign else 0)
        varsP.updateInfoReport(outstr, printalso=True)
    avgfp = mean(fplist)
    avgfpr = mean(fprlist)
    avgfn = mean(fnlist)
    avgbpp = mean(bpplist)
    avgsf = mean(sflist)
    avgsd = mean(sdlist)
    avgsr = mean(srlist)
    if avgfp or avgfn or avgbpp:
        outstr = "Avg FP(/100kb)    : %12.2f\n" % avgfp
        outstr += "Avg FP ratio      : %13.3f\n" % avgfpr
        outstr += "Avg FN ratio      : %13.3f\n" % avgfn
        outstr += "Avg bpp           : %11.1f\n" % avgbpp
        outstr += "Avg sf            : %13.3f\n" % avgsf
        outstr += "Avg sd            : %13.3f\n" % avgsd
        outstr += "Avg sr            : %13.3f\n" % avgsr
        varsP.updateInfoReport(outstr + "\n", printalso=True)
    if err and mergepath:  #have an error file (alignParams) object
        #the last loaded alignParams object is overwritten with the merged
        #values (averages, and sums for the map counts) and written out
        util.checkDir(mergepath)
        mrgstr = (varsP.alignMolvrefMergeName if varsP else "merge")
        outpath = os.path.join(mergepath, mappref + mrgstr + ".err")
        err.fp = avgfp
        err.fn = avgfn
        err.sf = avgsf
        err.sd = avgsd
        err.bpp = avgbpp
        err.res = mean(reslist)
        err.nmaps = sum(nmaplist)
        err.llrm = mean(llrmlist)
        err.goodmaps = sumgoodmaps
        err.llrgm = mean(llrgmlist)
        err.bppsd = mean(bppsdlist)
        err.fprate = avgfpr
        err.sr = avgsr
        err.ressd = mean(resdlist)
        err.writeToFile(outpath)