Python getn50 Examples

Programming Language: Python

Namespace/Package Name: utilities

Method/Function: getn50

Examples at hotexamples.com: 4

Python getn50 - 4 examples found. These are the top-rated real-world Python examples of utilities.getn50, extracted from open-source projects. You can rate the examples to help us improve their quality.

Example #1

Show file

File: MapClassesRev.py Project: alexharkess/BNG-FragileSiteRepair

 def profileMapLengths(self):
     """Record summary statistics of the map lengths held in self.cmapDB.

     Sets self.n (number of maps), self.n50 (util.getn50 of the lengths),
     self.totalLen (sum of the lengths) and self.aveLen (totalLen / n).
     When cmapDB holds no maps, none of these attributes is touched.
     """
     lengths = [entry.cmapLen for entry in self.cmapDB.values()]
     count = len(lengths)
     if not count:
         return  # nothing to profile--leave the attributes as they were

     lengthSum = sum(lengths)
     self.n = count
     self.n50 = util.getn50(lengths)
     self.totalLen = lengthSum
     self.aveLen = lengthSum / count

Example #2

Show file

    def profileMapLengths(self):
        """Profile the lengths of all maps stored in self.cmapDB.

        For a non-empty cmapDB this stores the map count in self.n, the
        result of util.getn50 over the lengths in self.n50, the summed
        length in self.totalLen and totalLen / n in self.aveLen.  An empty
        cmapDB leaves all four attributes unmodified.
        """
        mapLengths = [cmap.cmapLen for cmap in self.cmapDB.values()]
        if mapLengths:
            mapCount = len(mapLengths)
            summedLen = sum(mapLengths)
            self.n = mapCount
            self.n50 = util.getn50(mapLengths)
            self.totalLen = summedLen
            self.aveLen = summedLen / mapCount

Example #3

Show file

def characterizeContigs(varsP, xmappath=None):
    """Build a text summary of genome map (contig) stats, plus align stats.

    The maps are read from ``varsP.latestMergedCmap``.  The summary lists the
    number of maps and their total, average, median and n50 length; lengths
    are multiplied by ``unitscale`` (1e-6) and labelled "Mb".  If any
    ``.hmap`` file is found in the directory of the merged cmap, separate
    "Diploid" and "Haploid" sections are written instead of the single
    default section.  If ``varsP.ref`` is set, the reference length and the
    map/reference length ratio are appended.  If *xmappath* is given, the
    alignment totals read from that xmap are appended.  No aligner is run
    here--an existing xmap is only read.

    Side effect: sets ``varsP.totAssemblyLenMb``.

    Args:
        varsP: settings object; ``ref`` and ``latestMergedCmap`` are read.
        xmappath: optional path of an xmap; it is loaded only if
            ``util.checkFile(xmappath, ".xmap")`` accepts it.

    Returns:
        The summary as a single newline-separated string.
    """
    unitscale = 1e-6
    dorefalign = bool(xmappath)  #the xmap is only read, never produced here
    haveref = bool(varsP.ref)

    try:
        reflen = mapClasses.multiCmap(varsP.ref, lengthonly=True).totalLength
        #in summary table, this is a denominator--make sure it's non-zero, don't bail (still get contig summary)
        if reflen <= 0:
            reflen = 1.
    except Exception:  #best effort, but let KeyboardInterrupt/SystemExit through
        reflen = 1.

    outstr = ""

    #check for .hmaps in same dir as latestMergedCmap: if any, use the diploid/haploid format
    hmaps = util.getListOfFilesFromDir(os.path.dirname(varsP.latestMergedCmap),
                                       ".hmap")
    haplotype = (len(hmaps) > 0)
    hapcontiglens = []

    totcontiglen = 0
    totalignlen = 0
    nmapcontigs = 0
    contiglens = []  #lens of all contigs in bases
    uniqueseg = {}  #reference contig id -> list of [RefStart, RefStop]; each list is the argument to util.uniqueRange
    for cpath in [varsP.latestMergedCmap]:
        mapi = mapClasses.multiCmap(cpath)
        totcontiglen += mapi.totalLength
        contiglens += mapi.getAllMapLengths()  #list of all map lengths
        if haplotype:
            hapcontiglens.extend(mapi.getHaplotypeMapLengths())

        #ids of all contigs in this cmap; the ones found in the xmap are removed below
        # (the original note says getAllMapIds already returns a copy, so removing is safe)
        mapids = mapi.getAllMapIds()
        ncontigs = len(mapids)  #this is ncontigs in this file, ie, in mapi

        xmapobj = mapClasses.xmap()  #empty xmap, so the loop below is a no-op without alignments
        if dorefalign and util.checkFile(xmappath, ".xmap"):
            xmapobj = mapClasses.xmap(xmappath)

        for xmapentry in xmapobj.xmapLookup.values():
            #don't report the aligned length of each contig--just total them
            totalignlen += xmapentry.getMappedRefLen()

            #group the aligned ranges by reference contig id so that overlaps
            # are only merged within the same reference contig
            uniqueseg.setdefault(xmapentry.contigRef, []).append(
                [xmapentry.RefStart, xmapentry.RefStop])

            #drop aligned contig ids so that only non-aligning ones remain
            if xmapentry.contigQry in mapids:
                mapids.remove(xmapentry.contigQry)

        #all contigs with an alignment are now removed from mapids,
        # so the difference to ncontigs is the number of aligned contigs
        nmapcontigs += ncontigs - len(mapids)  #sum multiple cmaps

    varsP.totAssemblyLenMb = totcontiglen * unitscale
    ncontigs = len(contiglens)  #from here on: number of all contigs
    avgcontiglen = (float(totcontiglen) / ncontigs if ncontigs > 0 else 0)

    if unitscale > 1e-6:  #if not megabases
        fstr = "%9.0f"
    else:  #megabases
        fstr = "%8.3f"

    if haplotype:  #new format for haplotype
        #diploid is same as else below, but names change
        outstr += "Diploid N Genome Maps: %i\n" % ncontigs
        outstr += ("Diploid Genome Map Len        (Mb): " + fstr +
                   "\n") % (totcontiglen * unitscale)
        outstr += ("Diploid Avg. Genome Map Len   (Mb): " + fstr +
                   "\n") % (avgcontiglen * unitscale)
        outstr += ("Diploid Median Genome Map Len (Mb): " + fstr +
                   "\n") % (util.getMedian(contiglens) * unitscale)
        outstr += ("Diploid Genome Map n50        (Mb): " + fstr +
                   "\n") % (util.getn50(contiglens) * unitscale)
        #haploid: all values are derived from the list hapcontiglens
        outstr += "Haploid N Genome Maps: %i\n" % len(hapcontiglens)
        tot = sum(hapcontiglens)
        #float() as for avgcontiglen above, so integer lengths are not truncated
        avg = (float(tot) / len(hapcontiglens) if hapcontiglens else 0)
        outstr += ("Haploid Genome Map Len        (Mb): " + fstr +
                   "\n") % (tot * unitscale)
        outstr += ("Haploid Avg. Genome Map Len   (Mb): " + fstr +
                   "\n") % (avg * unitscale)
        outstr += ("Haploid Median Genome Map Len (Mb): " + fstr +
                   "\n") % (util.getMedian(hapcontiglens) * unitscale)
        outstr += ("Haploid Genome Map n50        (Mb): " + fstr +
                   "\n") % (util.getn50(hapcontiglens) * unitscale)
    else:  #default to old format
        outstr += "N Genome Maps: %i\n" % ncontigs
        outstr += ("Total Genome Map Len  (Mb): " + fstr +
                   "\n") % (totcontiglen * unitscale)
        outstr += ("Avg. Genome Map Len   (Mb): " + fstr +
                   "\n") % (avgcontiglen * unitscale)
        outstr += ("Median Genome Map Len (Mb): " + fstr +
                   "\n") % (util.getMedian(contiglens) * unitscale)
        outstr += ("Genome Map n50        (Mb): " + fstr +
                   "\n") % (util.getn50(contiglens) * unitscale)

    if haveref:
        outstr += ("Total Ref Len   (Mb): " + fstr + "\n") % (reflen *
                                                              unitscale)
        outstr += ("Total Genome Map Len / Ref Len : " + fstr +
                   "\n") % (totcontiglen / reflen)
    if dorefalign:
        ratio = (float(nmapcontigs) / ncontigs if ncontigs > 0 else 0)
        outstr += ("N Genome Maps total align      : %i (%.2f)\n") % (
            nmapcontigs, ratio)
        outstr += ("Total Aligned Len (Mb)            : " + fstr +
                   "\n") % (totalignlen * unitscale)
        outstr += ("Total Aligned Len / Ref Len       : " + fstr +
                   "\n") % (totalignlen / reflen)
        uniquelen = 0
        for segs in uniqueseg.values():  #need to sum on dict entries
            util.uniqueRange(segs)  #this modifies list in place
            uniquelen += util.totalLengthFromRanges(segs)
        outstr += ("Total Unique Aligned Len (Mb)     : " + fstr +
                   "\n") % (uniquelen * unitscale)
        outstr += ("Total Unique Aligned Len / Ref Len: " + fstr +
                   "\n") % (uniquelen / reflen)

    return outstr

Example #4

Show file

File: CharacterizeModule.py Project: alexharkess/BNG-FragileSiteRepair

def characterizeContigs(varsP, xmappath=None, listcontigs=False):
    """Build a text summary of genome map (contig) stats, plus align stats.

    The maps are read from ``varsP.latestMergedCmap``.  The summary lists the
    number of maps and their total, average, median and n50 length; lengths
    are multiplied by ``unitscale`` (1e-6) and labelled "Mb".  If
    ``varsP.ref`` is set, the reference length and the map/reference length
    ratio are appended.  If *xmappath* is given, the alignment totals read
    from that xmap are appended.  No aligner is run here--an existing xmap is
    only read.

    With *listcontigs*, a per-contig table is written before the summary:
    a header (only when *xmappath* is also given), one row per xmap entry,
    then one row (id, length, coverage) per contig without an xmap entry.

    Side effect: sets ``varsP.totAssemblyLenMb``.

    Args:
        varsP: settings object; ``ref``, ``contigAlignTarget`` and
            ``latestMergedCmap`` are read.
        xmappath: optional path of an xmap; it is loaded only if it is an
            existing file.
        listcontigs: if true, include the per-contig table.

    Returns:
        The table (if requested) and summary as a single string.
    """
    unitscale = 1e-6
    dorefalign = bool(xmappath)  #the xmap is only read, never produced here
    printrange = False  #hard-coded: Qry/Ref range columns are disabled
    iscluster = True  #hard-coded: disables the AVG row below
    haveref = bool(varsP.ref)

    aligndir = varsP.contigAlignTarget

    try:
        reflen = mapClasses.multiCmap(varsP.ref, lengthonly=True).totalLength
        #in summary table, this is a denominator--make sure it's non-zero, don't bail (still get contig summary)
        if reflen <= 0:
            reflen = 1.
    except Exception:  #best effort, but let KeyboardInterrupt/SystemExit through
        reflen = 1.

    outstr = ""

    if listcontigs and dorefalign:
        #header of the per-contig table
        outstr += "cID  len"
        outstr += "  Cov"
        outstr += "  alignlen  alignlen/len"
        if printrange:
            outstr += "  Qry1  Qry2  Ref1  Ref2"
        outstr += "  Conf  Conf/lenkb"
        outstr += "  FP  FN  sf  sd  bpp"
        outstr += "\n"

    totcontiglen = 0
    totalignlen = 0
    nmapcontigs = 0
    totalignqlen = 0
    contiglens = []  #lens of all contigs in bases
    uniqueseg = {}  #reference contig id -> list of [RefStart, RefStop]; each list is the argument to util.uniqueRange
    avgfplist = []  #FP value of each listed alignment
    avgfnlist = []  #FN value of each listed alignment
    for cpath in [varsP.latestMergedCmap]:
        mapi = mapClasses.multiCmap(cpath)
        totcontiglen += mapi.totalLength
        contiglens += mapi.getAllMapLengths()  #list of all map lengths

        #ids of all contigs in this cmap; the ones found in the xmap are removed below
        # (the original note says getAllMapIds already returns a copy, so removing is safe)
        mapids = mapi.getAllMapIds()
        ncontigs = len(mapids)  #this is ncontigs in this file, ie, in mapi

        xmapobj = mapClasses.xmap()  #empty xmap, so the loop below is a no-op without alignments
        if dorefalign and os.path.isfile(xmappath):  #if xmappath isn't a file, nothing is loaded
            xmapobj = mapClasses.xmap(xmappath)

        for xmapentry in xmapobj.xmapLookup.values():
            #the original note says getMapLength returns 0 on failure;
            # contiglen is a denominator below, so force it to be positive
            contiglen = mapi.getMapLength(xmapentry.contigQry)
            if contiglen <= 0:
                contiglen = 1.
            contigcov = mapi.getMapAvgCoverage(xmapentry.contigQry)

            if listcontigs:
                outstr += "%5i" % xmapentry.contigQry
                outstr += "  %9.1f  %2i" % (contiglen, contigcov)

            lenr = xmapentry.getMappedRefLen()
            lenq = xmapentry.getMappedQryLen()
            refid = xmapentry.contigRef
            refidstr = ""  #reference id column is disabled, but its format slot remains

            conf = xmapentry.Confidence  #confidence from the xmap
            if listcontigs:
                alignpars = getMappedErrStats(aligndir, cpath)
                avgfplist.append(alignpars.fp)
                avgfnlist.append(alignpars.fn)
                outstr += "%s  %9.1f  %.3f" % (refidstr, lenq, lenq/contiglen)
                if printrange:
                    #NOTE(review): `pn` is not defined in this function; unless it exists
                    # at module level, enabling printrange raises NameError--confirm
                    outstr += "  %5.0f  %5.0f  %5.0f  %5.0f" % (xmapentry.QryStart/pn, xmapentry.QryStop/pn, xmapentry.RefStart/pn, xmapentry.RefStop/pn)
                #1000 is for kb; guard the denominator like contiglen above
                outstr += "  %3.0f  %5.3f" % (conf, (conf*1000./lenq if lenq else 0))
                #NOTE(review): no "\n" is appended to this row here; presumably
                # getParamString() ends the line--confirm
                outstr += "  " + alignpars.getParamString()

            totalignlen += lenr
            totalignqlen += lenq

            #group the aligned ranges by reference contig id so that overlaps
            # are only merged within the same reference contig
            uniqueseg.setdefault(refid, []).append([xmapentry.RefStart, xmapentry.RefStop])

            #drop aligned contig ids so that only non-aligning ones remain
            if xmapentry.contigQry in mapids:
                mapids.remove(xmapentry.contigQry)

        #all contigs with an alignment are now removed from mapids,
        # so the difference to ncontigs is the number of aligned contigs
        nmapcontigs += ncontigs - len(mapids)  #sum multiple cmaps

        #rows for the contigs which don't align--just id, length, and coverage
        if listcontigs:
            for ids in mapids:
                outstr += "%5i" % ids
                contiglen = mapi.getMapLength(ids)  #it's ok if it's 0 bc it's never a denominator here
                contigcov = mapi.getMapAvgCoverage(ids)
                outstr += "  %9.1f  %2i\n" % (contiglen, contigcov)

    varsP.totAssemblyLenMb = totcontiglen*unitscale
    ncontigs = len(contiglens)  #from here on: number of all contigs
    avgcontiglen = (float(totcontiglen)/ncontigs if ncontigs > 0 else 0)

    #averages row; unreachable while iscluster is hard-coded True
    if listcontigs and not iscluster:
        nerr = len(avgfplist)  #avgfnlist is appended in lockstep, so same length
        avgfp = (sum(avgfplist)/nerr if nerr else 0)
        avgfn = (sum(avgfnlist)/nerr if nerr else 0)
        avgalignqlen = (totalignqlen/nmapcontigs if nmapcontigs else 0)
        outstr += "AVG    %9.1f           %9.1f                     %5.3f  %5.3f\n" % (avgcontiglen, avgalignqlen, avgfp, avgfn)

    if unitscale > 1e-6:  #if not megabases
        fstr = "%9.0f"
    else:  #megabases
        fstr = "%8.3f"

    outstr += "N Genome Maps: %i\n" % ncontigs
    outstr += ("Total Genome Map Len (Mb): "+fstr+"\n") % (totcontiglen*unitscale)
    outstr += ("Avg. Genome Map Len  (Mb): "+fstr+"\n") % (avgcontiglen*unitscale)
    outstr += ("Median Genome Map Len(Mb): "+fstr+"\n") % (util.getMedian(contiglens)*unitscale)
    outstr += ("Genome Map n50       (Mb): "+fstr+"\n") % (util.getn50(contiglens)*unitscale)

    if haveref:
        outstr += ("Total Ref Len   (Mb): "+fstr+"\n") % (reflen*unitscale)
        outstr += ("Total Genome Map Len / Ref Len : "+fstr+"\n") % (totcontiglen/reflen)
    if dorefalign:
        ratio = (float(nmapcontigs)/ncontigs if ncontigs > 0 else 0)
        outstr += ("N Genome Maps total align      : %i (%.2f)\n") % (nmapcontigs, ratio)
        outstr += ("Total Aligned Len (Mb)            : "+fstr+"\n") % (totalignlen*unitscale)
        outstr += ("Total Aligned Len / Ref Len       : "+fstr+"\n") % (totalignlen/reflen)
        uniquelen = 0
        for segs in uniqueseg.values():  #need to sum on dict entries
            util.uniqueRange(segs)  #this modifies list in place
            uniquelen += util.totalLengthFromRanges(segs)
        outstr += ("Total Unique Aligned Len (Mb)     : "+fstr+"\n") % (uniquelen*unitscale)
        outstr += ("Total Unique Aligned Len / Ref Len: "+fstr+"\n") % (uniquelen/reflen)

    return outstr