Ejemplo n.º 1
0
def characterizeContigs(varsP, xmappath=None):
    """Build the genome map summary text, optionally with alignment stats.

    Loads the cmap at varsP.latestMergedCmap and formats the number of maps
    and their total / average / median / n50 lengths. If any .hmap file is
    found in the same directory as that cmap, the diploid and haploid variants
    of those lines are written instead. If varsP.ref is set, reference length
    lines are added. If xmappath is given and util.checkFile accepts it, the
    xmap is read and aligned-length lines are added; no alignment is run here.

    Side effect: sets varsP.totAssemblyLenMb to the total map length * 1e-6.

    Args:
        varsP: pipeline variables object. Reads .ref and .latestMergedCmap,
            writes .totAssemblyLenMb.
        xmappath: path of an existing .xmap; None or empty skips align stats.

    Returns:
        The summary as one string; every line in it ends with a newline.
    """
    unitscale = 1e-6  #lengths are multiplied by this for the "(Mb)" lines
    dorefalign = bool(
        xmappath)  #refaligner is never called here--xmappath is only read
    haveref = bool(varsP.ref)

    #reflen is a denominator in the summary table: keep it non-zero, and don't
    # bail if the reference can't be loaded (the contig summary is still wanted)
    try:
        reflen = mapClasses.multiCmap(varsP.ref, lengthonly=True).totalLength
        if reflen <= 0:
            reflen = 1.
    except Exception:  #was a bare except, which also hid KeyboardInterrupt/SystemExit
        reflen = 1.

    #any .hmap in the same dir as latestMergedCmap switches on the haplotype format
    hmaps = util.getListOfFilesFromDir(os.path.dirname(varsP.latestMergedCmap),
                                       ".hmap")
    haplotype = (len(hmaps) > 0)
    hapcontiglens = []

    totcontiglen = 0
    totalignlen = 0
    nmapcontigs = 0
    contiglens = []  #lens of all contigs in bases
    uniqueseg = {}  #ref contig id -> list of [RefStart, RefStop], fed to util.uniqueRange
    for cpath in [varsP.latestMergedCmap]:  #always a single merged cmap
        mapi = mapClasses.multiCmap(cpath)
        totcontiglen += mapi.totalLength
        contiglens += mapi.getAllMapLengths()
        if haplotype:
            hapcontiglens.extend(mapi.getHaplotypeMapLengths())

        #ids still in this list after the xmap loop are the maps with no alignment
        mapids = mapi.getAllMapIds()
        nmapsinfile = len(mapids)

        if dorefalign and util.checkFile(xmappath, ".xmap"):
            xmapobj = mapClasses.xmap(xmappath)
        else:
            xmapobj = mapClasses.xmap()  #empty xmap: the loop below is a no-op

        for xmapentry in xmapobj.xmapLookup.values():
            totalignlen += xmapentry.getMappedRefLen()

            #group ranges by reference contig so that overlaps are only merged
            # between alignments on the same reference contig
            uniqueseg.setdefault(xmapentry.contigRef, []).append(
                [xmapentry.RefStart, xmapentry.RefStop])

            if xmapentry.contigQry in mapids:
                mapids.remove(xmapentry.contigQry)

        nmapcontigs += nmapsinfile - len(mapids)  #sum multiple cmaps

    varsP.totAssemblyLenMb = totcontiglen * unitscale
    ncontigs = len(contiglens)
    avgcontiglen = (float(totcontiglen) / ncontigs if ncontigs > 0 else 0)

    if unitscale > 1e-6:  #if not megabases
        fstr = "%9.0f"
    else:  #megabases
        fstr = "%8.3f"

    outstr = ""
    if haplotype:  #new format for haplotype
        #diploid is the same data as the else branch below, only the names change
        outstr += "Diploid N Genome Maps: %i\n" % ncontigs
        outstr += ("Diploid Genome Map Len        (Mb): " + fstr +
                   "\n") % (totcontiglen * unitscale)
        outstr += ("Diploid Avg. Genome Map Len   (Mb): " + fstr +
                   "\n") % (avgcontiglen * unitscale)
        outstr += ("Diploid Median Genome Map Len (Mb): " + fstr +
                   "\n") % (util.getMedian(contiglens) * unitscale)
        outstr += ("Diploid Genome Map n50        (Mb): " + fstr +
                   "\n") % (util.getn50(contiglens) * unitscale)
        #haploid: all values come from the list hapcontiglens
        haptot = sum(hapcontiglens)
        #float() so the average is not truncated if the lengths are ints
        hapavg = (float(haptot) / len(hapcontiglens) if hapcontiglens else 0)
        outstr += "Haploid N Genome Maps: %i\n" % len(hapcontiglens)
        outstr += ("Haploid Genome Map Len        (Mb): " + fstr +
                   "\n") % (haptot * unitscale)
        outstr += ("Haploid Avg. Genome Map Len   (Mb): " + fstr +
                   "\n") % (hapavg * unitscale)
        outstr += ("Haploid Median Genome Map Len (Mb): " + fstr +
                   "\n") % (util.getMedian(hapcontiglens) * unitscale)
        outstr += ("Haploid Genome Map n50        (Mb): " + fstr +
                   "\n") % (util.getn50(hapcontiglens) * unitscale)
    else:  #default to old format
        outstr += "N Genome Maps: %i\n" % ncontigs
        outstr += ("Total Genome Map Len  (Mb): " + fstr +
                   "\n") % (totcontiglen * unitscale)
        outstr += ("Avg. Genome Map Len   (Mb): " + fstr +
                   "\n") % (avgcontiglen * unitscale)
        outstr += ("Median Genome Map Len (Mb): " + fstr +
                   "\n") % (util.getMedian(contiglens) * unitscale)
        outstr += ("Genome Map n50        (Mb): " + fstr +
                   "\n") % (util.getn50(contiglens) * unitscale)

    if haveref:
        outstr += ("Total Ref Len   (Mb): " + fstr + "\n") % (reflen *
                                                              unitscale)
        outstr += ("Total Genome Map Len / Ref Len : " + fstr +
                   "\n") % (totcontiglen / reflen)
    if dorefalign:
        ratio = (float(nmapcontigs) / ncontigs if ncontigs > 0 else 0)
        outstr += ("N Genome Maps total align      : %i (%.2f)\n") % (
            nmapcontigs, ratio)
        outstr += ("Total Aligned Len (Mb)            : " + fstr +
                   "\n") % (totalignlen * unitscale)
        outstr += ("Total Aligned Len / Ref Len       : " + fstr +
                   "\n") % (totalignlen / reflen)
        uniquelen = 0
        for segs in uniqueseg.values():  #sum over reference contigs
            util.uniqueRange(segs)  #this modifies list in place
            uniquelen += util.totalLengthFromRanges(segs)
        outstr += ("Total Unique Aligned Len (Mb)     : " + fstr +
                   "\n") % (uniquelen * unitscale)
        outstr += ("Total Unique Aligned Len / Ref Len: " + fstr +
                   "\n") % (uniquelen / reflen)

    return outstr
def getAlignStats(varsP, outFileList, reflen=0, isref=False, mergepath="", bnxpath=None) :
    '''Standalone fn for alignment statistics for both AlignModule and AlignRefModule.
    reflen should be in Mb. If mergepath supplied, put merged .err there.
    If bnxpath == None, assume varsP.sorted_file; otherwise, just report stats of this
    file and ignore outFileList.
    '''

    statonly = False #bnx stats only
    skipbnx = False #.err file processing only
    if bnxpath == None :
        if not varsP.sorted_file : #for runAlignMol, this is empty: nothing to do in this case
            skipbnx = True
        else :
            bnxpath = varsP.sorted_file+".bnx" #set in PairwiseModule.sort_BNX even if bypassed, but needs suffix
    else : #if bnxpath != None :
        statonly = True
    if not skipbnx and not util.checkFile(bnxpath) :
        varsP.updatePipeReport("Warning in AlignModule.getAlignStats: bnxpath supplied but not found: %s\n" % bnxpath)
        return

    #find the minlen used for bnx_sort, which is a required arg set
    sortargs = []
    if varsP.argData.has_key('bnx_sort') : #for runAlignMol.py
        sortargs = varsP.argsListed('bnx_sort')
    minlen = 0
    validminlen = False
    if "-minlen" in sortargs :
        minlen = sortargs[sortargs.index("-minlen")+1] #next ele should be the len, if next ele isn't in list, the sort job will fail
        minlen = util.getIntFromString(minlen) #returns None if can't cast to int
        if minlen :
            validminlen = True

    if not validminlen and bnxpath == None and sortargs :
        varsP.updatePipeReport("Warning in AlignModule.getAlignStats: unable to obtain minlen from bnx_sort arguments; defaulting to 0\n")
    if bnxpath != None : #if bnxpath, ignore minlen
        minlen = 0

    nmol = 0 #total n mol above minlen
    totlen = 0 #total mol len above minlen
    if util.checkFile(bnxpath) :
        #the bnxfile class is very wasteful. replace with below
        #bnx = util.bnxfile(bnxpath, [minlen]) #second arg are minlen thresholds, just use one for now
        outstr = "Reading molecule stats from %s:\n" % bnxpath
        outstr += "Molecule Stats:\n"
        moldict = util.simpleBnxStats(bnxpath, minlen)
        nmol = moldict["nmol"]
        totlen = moldict["totlen"]
        #if isref : #this is the same for isref or not, but just print twice bc no easy way to tell if was printed previously
        outstr += "N mols: %i\n" % nmol
        outstr += ("Total len (Mb): %10.3f\n") % totlen
        outstr += ("Avg len (kb)  : %10.3f\n") % moldict["avglen"]
        outstr += ("Mol N50 (kb)  : %10.3f\n") % moldict["n50"]
        outstr += ("Lab (/100kb)  : %10.3f\n") % moldict["labdensity"]
        #    if reflen : #disable the "Genome Cov" line bc its redundant with Ref Cov below
        #        bnx.molstats[minlen].genomesizemb = 0 
        #    outstr += str(bnx.molstats[minlen]) 
        #nmol = bnx.molstats[minlen].nmol
        #totlen = bnx.molstats[minlen].totlen

        if reflen : 
            cov = totlen / reflen #totlen is in Mb
            outstr += ("%-6s Cov (x): %10.3f\n") % ("Ref" if isref else "Contig", cov)
        if isref or reflen or statonly : #if neither, nothing to print
            varsP.updateInfoReport(outstr + "\n", printalso=True)
    elif not skipbnx :
        varsP.updatePipeReport("Warning in AlignModule.getAlignStats: missing bnx path:"+bnxpath+"\n")

    if statonly :
        return

    #lastly, load .xmaps and .errs from alignmol jobs and report on stats
    totmaplen = 0 #sum of lengths of mapped portions of all molecules, on reference
    totmapqrylen = 0 #sum of lengths of mapped portions of all molecules, on query
    totconf = 0 #sum of confidence of all alignments
    nalign = 0 #total number of alignments
    fplist = [] #lists for error rates
    fprlist = []
    fnlist = []
    bpplist = []
    nmaplist = [] #from .err
    gmaplist = [] #from .err
    llrmlist  = []; llrgmlist = []; bppsdlist = []
    sflist = []; sdlist = []; srlist = []; reslist = []; resdlist = []
    header = ""
    err = None #will be the alignParams object if any .err files are found
    mappref = ""
    if len(outFileList) > 0 :
        mappref = getMergeFilename(outFileList[0]) #make function to unify with same convention in mergeMap
    for outpath in outFileList : #these are file prefixes
        if util.checkFile(outpath+".xmap") :
            xmap = mc.xmap(outpath+".xmap")
            nalign += len(xmap.xmapLookup)
            totmaplen += xmap.getSumMappedRefLen() #in kb
            totmapqrylen += xmap.getSumMappedQryLen() #in kb
            totconf += sum([x.Confidence for x in xmap.xmapLookup.values()])
        else :
            varsP.updatePipeReport("Warning in AlignModule.getAlignStats: missing xmap:"+outpath+".xmap"+"\n")
        if util.checkFile(outpath+".err") :
            err = mc.alignParams(outpath+".err")
            if not header :
                header = err.header
            fplist.append(err.fp)
            fprlist.append(err.fprate)
            fnlist.append(err.fn)
            bpplist.append(err.bpp)
            reslist.append(err.res)
            nmaplist.append(err.nmaps)
            gmaplist.append(err.goodmaps)
            llrmlist.append(err.llrm)
            llrgmlist.append(err.llrgm)
            bppsdlist.append(err.bppsd)
            sflist.append(err.sf)
            sdlist.append(err.sd)
            srlist.append(err.sr)
            resdlist.append(err.ressd)

    #nalign from xmap should be the same as goodmaps from .err
    sumgoodmaps = sum(gmaplist)
    if sumgoodmaps != nalign :
        varsP.updateInfoReport("Warning in getAlignStats: n mol align differ in .err files (%i) and .xmaps (%i)\n" % (sumgoodmaps, nalign), printalso=True)
    if totmaplen or totconf or nalign : 
        outstr =  "Molecules Aligned to %s:\n" % ("Reference" if isref else "Assembly")
        outstr += "N mol align       : %9i\n" % nalign
        outstr += "Mol fraction align: %13.3f\n" % (float(nalign)/nmol if nmol else 0)
        outstr += "Tot align len (Mb): %11.1f\n" % (totmapqrylen / 1e3) #Mb
        if reflen > 0 : 
            outstr += ("Effective Cov (x) : %13.3f\n") % (totmaplen / 1e3 / reflen) #totlen is in kb
        outstr += "Avg align len (kb): %11.1f\n" % (totmapqrylen/nalign if nalign else 0)
        outstr += "Fraction align len: %13.3f\n" % (totmapqrylen/1e3/totlen if totlen else 0) #totmapqrylen is in kb, totlen is in mb
        outstr += "Tot confidence    : %11.1f\n" % totconf
        outstr += "Avg confidence    : %11.1f\n" % (totconf/nalign if nalign else 0)
        varsP.updateInfoReport(outstr, printalso=True)
    avgfp  = (sum(fplist)/len(fplist)   if len(fplist) else 0)
    avgfpr = (sum(fprlist)/len(fprlist) if len(fprlist) else 0)
    avgfn  = (sum(fnlist)/len(fnlist)   if len(fnlist) else 0)
    avgbpp = (sum(bpplist)/len(bpplist) if len(bpplist) else 0)
    avgres = (sum(reslist)/len(reslist) if len(reslist) else 0)
    avgllr = (sum(llrmlist)/len(llrmlist) if len(llrmlist) else 0)
    avgllg = (sum(llrgmlist)/len(llrgmlist) if len(llrgmlist) else 0)
    avgbps = (sum(bppsdlist)/len(bppsdlist) if len(bppsdlist) else 0)
    avgsf  = (sum(sflist)/len(sflist) if len(sflist) else 0)
    avgsd  = (sum(sdlist)/len(sdlist) if len(sdlist) else 0)
    avgsr  = (sum(srlist)/len(srlist) if len(srlist) else 0)
    avgrsd = (sum(resdlist)/len(resdlist) if len(resdlist) else 0)
    if avgfp or avgfn or avgbpp :
        outstr =  "Avg FP(/100kb)    : %12.2f\n" % avgfp
        outstr += "Avg FP ratio      : %13.3f\n" % avgfpr
        outstr += "Avg FN ratio      : %13.3f\n" % avgfn
        outstr += "Avg bpp           : %11.1f\n" % avgbpp
        outstr += "Avg sf            : %13.3f\n" % avgsf
        outstr += "Avg sd            : %13.3f\n" % avgsd
        outstr += "Avg sr            : %13.3f\n" % avgsr
        varsP.updateInfoReport(outstr + "\n", printalso=True)
    if err and mergepath : #have an error file (alignParams) object
        util.checkDir(mergepath)
        mrgstr = (varsP.alignMolvrefMergeName if varsP else "merge")
        outpath = os.path.join(mergepath, mappref+mrgstr+".err")
        err.fp = avgfp
        err.fn = avgfn
        err.sf = avgsf
        err.sd = avgsd
        err.bpp = avgbpp
        err.res = avgres
        err.nmaps = sum(nmaplist)
        err.llrm  = avgllr
        err.goodmaps = sumgoodmaps
        err.llrgm = avgllg
        err.bppsd = avgbps
        err.fprate = avgfpr
        err.sr = avgsr
        err.ressd = avgrsd
        err.writeToFile(outpath)
def characterizeContigs(varsP, xmappath=None, listcontigs=False) :
    """Build the genome map summary text, optionally with alignment stats.

    Loads the cmap at varsP.latestMergedCmap and formats the number of maps
    and their total / average / median / n50 lengths. If varsP.ref is set,
    reference length lines are added. If xmappath is given and is an existing
    file, the xmap is read and aligned-length lines are added; no alignment is
    run here.

    With listcontigs, a table precedes the summary: one row per xmap entry
    (contig id, length, coverage, aligned length, confidence and the noise
    parameters from getMappedErrStats), then one row per contig that has no
    xmap entry. The table header is only written if xmappath is also given.

    Side effect: sets varsP.totAssemblyLenMb to the total map length * 1e-6.

    Args:
        varsP: pipeline variables object. Reads .ref, .latestMergedCmap and
            .contigAlignTarget, writes .totAssemblyLenMb.
        xmappath: path of an existing .xmap; None or empty skips align stats.
        listcontigs: if True, also write the per-contig table.

    Returns:
        The table (if any) and summary as one string.
    """
    unitscale = 1e-6 #lengths are multiplied by this for the "(Mb)" lines
    dorefalign = bool(xmappath) #refaligner is never called here--xmappath is only read
    printrange = False #hard-coded off: Qry/Ref range columns in the table
    iscluster = True #hard-coded on: suppresses the AVG row of the table
    haveref = bool(varsP.ref)
    aligndir = varsP.contigAlignTarget

    #reflen is a denominator in the summary table: keep it non-zero, and don't
    # bail if the reference can't be loaded (the contig summary is still wanted)
    try :
        reflen = mapClasses.multiCmap(varsP.ref, lengthonly=True).totalLength
        if reflen <= 0 :
            reflen = 1.
    except Exception : #was a bare except, which also hid KeyboardInterrupt/SystemExit
        reflen = 1.

    outstr = ""

    #print header of table
    if listcontigs and dorefalign :
        outstr += "cID  len"
        outstr += "  Cov"
        outstr += "  alignlen  alignlen/len"
        if printrange :
            outstr += "  Qry1  Qry2  Ref1  Ref2"
        outstr += "  Conf  Conf/lenkb"
        outstr += "  FP  FN  sf  sd  bpp" #"  res" #--ditch res (not bpp)
        outstr += "\n"

    totcontiglen = 0; totalignlen = 0; nmapcontigs = 0; totalignqlen = 0
    contiglens = [] #lens of all contigs in bases
    uniqueseg = {} #ref contig id -> list of [RefStart, RefStop], fed to util.uniqueRange
    avgfplist = []; avgfnlist = [] #FP/FN rates, one entry per listed xmap entry
    for cpath in [varsP.latestMergedCmap] : #always a single merged cmap
        mapi = mapClasses.multiCmap(cpath)
        totcontiglen += mapi.totalLength
        contiglens += mapi.getAllMapLengths() #getAllMapLengths is list of all map lengths

        #ids still in this list after the xmap loop are the maps with no alignment
        mapids = mapi.getAllMapIds()
        nmapsinfile = len(mapids)

        xmapobj = mapClasses.xmap() #empty xmap: the loop below is a no-op
        if dorefalign and os.path.isfile(xmappath) : #if xmappath isn't a file, nothing is loaded
            xmapobj = mapClasses.xmap(xmappath)

        alignpars = None #loaded on first use: it depends only on aligndir and cpath
        for xmapentry in xmapobj.xmapLookup.values() :
            lenr = xmapentry.getMappedRefLen()
            lenq = xmapentry.getMappedQryLen()

            if listcontigs :
                #getMapLength returns 0 for any exception, but it's a denominator below
                contiglen = mapi.getMapLength(xmapentry.contigQry)
                if contiglen <= 0 :
                    contiglen = 1.
                contigcov = mapi.getMapAvgCoverage(xmapentry.contigQry)
                conf = xmapentry.Confidence
                if alignpars is None :
                    alignpars = getMappedErrStats(aligndir, cpath) #an empty err file is produced for case of no align
                avgfplist.append( alignpars.fp )
                avgfnlist.append( alignpars.fn )

                outstr += "%5i" % xmapentry.contigQry
                outstr += "  %9.1f  %2i" % (contiglen, contigcov)
                outstr += "  %9.1f  %.3f" % (lenq, lenq/contiglen)
                if printrange :
                    #NOTE(review): pn is not defined in this function; presumably a
                    # module-level scale--confirm before enabling printrange
                    outstr += "  %5.0f  %5.0f  %5.0f  %5.0f" % (xmapentry.QryStart/pn, xmapentry.QryStop/pn, xmapentry.RefStart/pn, xmapentry.RefStop/pn)
                #confidence, and its ratio to aligned length in kb (0 if nothing aligned)
                outstr += "  %3.0f  %5.3f" % (conf, (conf*1000./lenq if lenq else 0))
                #NOTE(review): no "\n" is appended to this row here; presumably
                # getParamString() ends the line--confirm
                outstr += "  " + alignpars.getParamString()

            totalignlen  += lenr
            totalignqlen += lenq

            #group ranges by reference contig so that overlaps are only merged
            # between alignments on the same reference contig
            uniqueseg.setdefault(xmapentry.contigRef, []).append( [xmapentry.RefStart, xmapentry.RefStop] )

            if xmapentry.contigQry in mapids :
                mapids.remove(xmapentry.contigQry)

        #end loop on xmap entries

        nmapcontigs += nmapsinfile - len(mapids) #sum multiple cmaps

        #and print the data for the contigs which don't align--just id, length, and coverage
        if listcontigs :
            for ids in mapids :
                outstr += "%5i" % ids
                contiglen = mapi.getMapLength(ids) #it's ok if it's 0 bc it's never a denominator here
                contigcov = mapi.getMapAvgCoverage(ids)
                outstr += "  %9.1f  %2i\n" % (contiglen, contigcov)

    #end loop on contigs

    varsP.totAssemblyLenMb = totcontiglen*unitscale
    ncontigs = len(contiglens) #contiglens is all contigs
    avgcontiglen = (float(totcontiglen)/ncontigs if ncontigs > 0 else 0)

    #print averages
    if listcontigs and not iscluster : #only do avg if not merged, otherwise just one noise parameter
        #all three denominators are 0 when nothing aligned
        avgfp = (sum(avgfplist)/len(avgfplist) if avgfplist else 0)
        avgfn = (sum(avgfnlist)/len(avgfnlist) if avgfnlist else 0)
        avgalignq = (totalignqlen/nmapcontigs if nmapcontigs else 0)
        outstr += "AVG    %9.1f           %9.1f                     %5.3f  %5.3f\n" % (avgcontiglen, avgalignq, avgfp, avgfn)

    if unitscale > 1e-6 : #if not megabases
        fstr = "%9.0f"
    else : #megabases
        fstr = "%8.3f"

    outstr += "N Genome Maps: %i\n" % ncontigs
    outstr += ("Total Genome Map Len (Mb): "+fstr+"\n") % (totcontiglen*unitscale)
    outstr += ("Avg. Genome Map Len  (Mb): "+fstr+"\n") % (avgcontiglen*unitscale)
    outstr += ("Median Genome Map Len(Mb): "+fstr+"\n") % (util.getMedian(contiglens)*unitscale)
    outstr += ("Genome Map n50       (Mb): "+fstr+"\n") % (util.getn50(contiglens)*unitscale)

    if haveref :
        outstr += ("Total Ref Len   (Mb): "+fstr+"\n") % (reflen*unitscale)
        outstr += ("Total Genome Map Len / Ref Len : "+fstr+"\n") % (totcontiglen/reflen)
    if dorefalign :
        ratio = (float(nmapcontigs)/ncontigs if ncontigs > 0 else 0)
        outstr += ("N Genome Maps total align      : %i (%.2f)\n") % (nmapcontigs, ratio)
        outstr += ("Total Aligned Len (Mb)            : "+fstr+"\n") % (totalignlen*unitscale)
        outstr += ("Total Aligned Len / Ref Len       : "+fstr+"\n") % (totalignlen/reflen)
        uniquelen = 0
        for segs in uniqueseg.values() : #sum over reference contigs
            util.uniqueRange(segs) #this modifies list in place
            uniquelen += util.totalLengthFromRanges( segs )
        outstr += ("Total Unique Aligned Len (Mb)     : "+fstr+"\n") % (uniquelen*unitscale)
        outstr += ("Total Unique Aligned Len / Ref Len: "+fstr+"\n") % (uniquelen/reflen)

    return outstr
Ejemplo n.º 4
0
def getAlignStats(varsP,
                  outFileList,
                  reflen=0,
                  isref=False,
                  mergepath="",
                  bnxpath=None):
    '''Standalone fn for alignment statistics for both AlignModule and AlignRefModule.

    Reports molecule (bnx) stats first, then alignment stats collected from the
    ".xmap" and ".err" file of every prefix in outFileList, and finally writes
    an averaged .err file if mergepath is given.

    varsP: pipeline variables object. This function reads sorted_file, argData,
        argsListed() and alignMolvrefMergeName from it, and reports through
        updatePipeReport() / updateInfoReport().
    outFileList: alignment output file prefixes (".xmap"/".err" are appended).
    reflen: should be in Mb. If 0 (the default), the coverage lines are omitted.
    isref: if True, output is labelled "Ref"/"Reference", otherwise
        "Contig"/"Assembly".
    mergepath: if supplied, put merged .err there.
    bnxpath: if None, assume varsP.sorted_file; otherwise, just report stats of
        this file and ignore outFileList.

    Returns None in all cases; results go to the reports and the merged .err.
    '''

    def _mean(values):
        #average of a list, or 0 if the list is empty
        return sum(values) / len(values) if values else 0

    statonly = bnxpath is not None  #bnx stats only
    skipbnx = False  #.err file processing only
    if bnxpath is None:
        if not varsP.sorted_file:  #for runAlignMol, this is empty: nothing to do in this case
            skipbnx = True
        else:
            bnxpath = varsP.sorted_file + ".bnx"  #set in PairwiseModule.sort_BNX even if bypassed, but needs suffix
    #from here on, bnxpath is None if and only if skipbnx is True
    if not skipbnx and not util.checkFile(bnxpath):
        varsP.updatePipeReport(
            "Warning in AlignModule.getAlignStats: bnxpath supplied but not found: %s\n"
            % bnxpath)
        return

    #find the minlen used for bnx_sort, which is a required arg set
    sortargs = []
    if 'bnx_sort' in varsP.argData:  #for runAlignMol.py
        sortargs = varsP.argsListed('bnx_sort')
    minlen = 0
    validminlen = False
    if "-minlen" in sortargs:
        #next ele should be the len, if next ele isn't in list, the sort job will fail
        minlen = util.getIntFromString(
            sortargs[sortargs.index("-minlen") + 1])  #returns None if can't cast to int
        if minlen:
            validminlen = True

    if not validminlen and skipbnx and sortargs:
        varsP.updatePipeReport(
            "Warning in AlignModule.getAlignStats: unable to obtain minlen from bnx_sort arguments; defaulting to 0\n"
        )
    #NOTE(review): whenever there is a bnx to read, minlen is reset to 0, so the
    # value parsed above never reaches simpleBnxStats. The original comment
    # ("if bnxpath, ignore minlen") suggests this was meant to apply only to a
    # caller-supplied bnxpath (statonly) -- confirm before changing.
    if not skipbnx:
        minlen = 0

    nmol = 0  #total n mol above minlen
    totlen = 0  #total mol len above minlen
    if not skipbnx and util.checkFile(bnxpath):
        outstr = "Reading molecule stats from %s:\n" % bnxpath
        outstr += "Molecule Stats:\n"
        moldict = util.simpleBnxStats(bnxpath, minlen)
        nmol = moldict["nmol"]
        totlen = moldict["totlen"]
        #this is the same for isref or not, but just print twice bc no easy way to tell if was printed previously
        outstr += "N mols: %i\n" % nmol
        outstr += ("Total len (Mb): %10.3f\n") % totlen
        outstr += ("Avg len (kb)  : %10.3f\n") % moldict["avglen"]
        outstr += ("Mol N50 (kb)  : %10.3f\n") % moldict["n50"]
        outstr += ("Lab (/100kb)  : %10.3f\n") % moldict["labdensity"]

        if reflen:
            cov = totlen / reflen  #totlen is in Mb
            outstr += ("%-6s Cov (x): %10.3f\n") % ("Ref" if isref else
                                                    "Contig", cov)
        if isref or reflen or statonly:  #if neither, nothing to print
            varsP.updateInfoReport(outstr + "\n", printalso=True)
    elif not skipbnx:
        #only reachable if the file disappeared since the check above
        varsP.updatePipeReport(
            "Warning in AlignModule.getAlignStats: missing bnx path:" +
            bnxpath + "\n")

    if statonly:
        return

    #lastly, load .xmaps and .errs from alignmol jobs and report on stats
    totmaplen = 0  #sum of lengths of mapped portions of all molecules, on reference
    totmapqrylen = 0  #sum of lengths of mapped portions of all molecules, on query
    totconf = 0  #sum of confidence of all alignments
    nalign = 0  #total number of alignments
    fplist = []  #lists for error rates
    fprlist = []
    fnlist = []
    bpplist = []
    nmaplist = []  #from .err
    gmaplist = []  #from .err
    llrmlist = []
    llrgmlist = []
    bppsdlist = []
    sflist = []
    sdlist = []
    srlist = []
    reslist = []
    resdlist = []
    err = None  #will be the alignParams object if any .err files are found
    mappref = ""
    if len(outFileList) > 0:
        mappref = getMergeFilename(
            outFileList[0]
        )  #make function to unify with same convention in mergeMap
    for outpath in outFileList:  #these are file prefixes
        if util.checkFile(outpath + ".xmap"):
            xmap = mc.xmap(outpath + ".xmap")
            nalign += len(xmap.xmapLookup)
            totmaplen += xmap.getSumMappedRefLen()  #in kb
            totmapqrylen += xmap.getSumMappedQryLen()  #in kb
            totconf += sum(x.Confidence for x in xmap.xmapLookup.values())
        else:
            varsP.updatePipeReport(
                "Warning in AlignModule.getAlignStats: missing xmap:" +
                outpath + ".xmap" + "\n")
        if util.checkFile(outpath + ".err"):
            #the last .err loaded is kept and reused below as the merged object
            err = mc.alignParams(outpath + ".err")
            fplist.append(err.fp)
            fprlist.append(err.fprate)
            fnlist.append(err.fn)
            bpplist.append(err.bpp)
            reslist.append(err.res)
            nmaplist.append(err.nmaps)
            gmaplist.append(err.goodmaps)
            llrmlist.append(err.llrm)
            llrgmlist.append(err.llrgm)
            bppsdlist.append(err.bppsd)
            sflist.append(err.sf)
            sdlist.append(err.sd)
            srlist.append(err.sr)
            resdlist.append(err.ressd)

    #nalign from xmap should be the same as goodmaps from .err
    sumgoodmaps = sum(gmaplist)
    if sumgoodmaps != nalign:
        varsP.updateInfoReport(
            "Warning in getAlignStats: n mol align differ in .err files (%i) and .xmaps (%i)\n"
            % (sumgoodmaps, nalign),
            printalso=True)
    if totmaplen or totconf or nalign:
        outstr = "Molecules Aligned to %s:\n" % ("Reference"
                                                 if isref else "Assembly")
        outstr += "N mol align       : %9i\n" % nalign
        outstr += "Mol fraction align: %13.3f\n" % (float(nalign) /
                                                    nmol if nmol else 0)
        outstr += "Tot align len (Mb): %11.1f\n" % (totmapqrylen / 1e3)  #Mb
        if reflen > 0:
            outstr += ("Effective Cov (x) : %13.3f\n") % (
                totmaplen / 1e3 / reflen)  #totmaplen is in kb
        outstr += "Avg align len (kb): %11.1f\n" % (totmapqrylen /
                                                    nalign if nalign else 0)
        outstr += "Fraction align len: %13.3f\n" % (
            totmapqrylen / 1e3 / totlen if totlen else 0
        )  #totmapqrylen is in kb, totlen is in mb
        outstr += "Tot confidence    : %11.1f\n" % totconf
        outstr += "Avg confidence    : %11.1f\n" % (totconf /
                                                    nalign if nalign else 0)
        varsP.updateInfoReport(outstr, printalso=True)

    #per-job error parameters are combined by a plain (unweighted) average
    avgfp = _mean(fplist)
    avgfpr = _mean(fprlist)
    avgfn = _mean(fnlist)
    avgbpp = _mean(bpplist)
    avgres = _mean(reslist)
    avgllr = _mean(llrmlist)
    avgllg = _mean(llrgmlist)
    avgbps = _mean(bppsdlist)
    avgsf = _mean(sflist)
    avgsd = _mean(sdlist)
    avgsr = _mean(srlist)
    avgrsd = _mean(resdlist)
    if avgfp or avgfn or avgbpp:
        outstr = "Avg FP(/100kb)    : %12.2f\n" % avgfp
        outstr += "Avg FP ratio      : %13.3f\n" % avgfpr
        outstr += "Avg FN ratio      : %13.3f\n" % avgfn
        outstr += "Avg bpp           : %11.1f\n" % avgbpp
        outstr += "Avg sf            : %13.3f\n" % avgsf
        outstr += "Avg sd            : %13.3f\n" % avgsd
        outstr += "Avg sr            : %13.3f\n" % avgsr
        varsP.updateInfoReport(outstr + "\n", printalso=True)
    if err and mergepath:  #have an error file (alignParams) object
        util.checkDir(mergepath)
        mrgstr = (varsP.alignMolvrefMergeName if varsP else "merge")
        outpath = os.path.join(mergepath, mappref + mrgstr + ".err")
        #overwrite the last-loaded .err object with the combined values
        err.fp = avgfp
        err.fn = avgfn
        err.sf = avgsf
        err.sd = avgsd
        err.bpp = avgbpp
        err.res = avgres
        err.nmaps = sum(nmaplist)
        err.llrm = avgllr
        err.goodmaps = sumgoodmaps
        err.llrgm = avgllg
        err.bppsd = avgbps
        err.fprate = avgfpr
        err.sr = avgsr
        err.ressd = avgrsd
        err.writeToFile(outpath)