def mergeRcmaps(outFileList, outdir, varsP=None, splitByContig=None, stageName="") :
    """Merge the "_r.cmap" outputs of alignment jobs and write the result to outdir.

    outFileList   -- list of file prefixes ("_r.cmap" is appended) or of full paths
                     already ending in "_r.cmap". Note: the list is sorted in place.
    outdir        -- output directory, checked with util.checkDir.
    varsP         -- optional pipeline variables object; supplies alignMolvrefMergeName and
                     outputContigPrefix, and is passed to logOrPrintError with every message.
    splitByContig -- if < 1 (or None), output each contig separately; if == 1, only output
                     the single merged cmap; if > 1, do both.
    stageName     -- always used as the file prefix if supplied; if not, varsP must be
                     supplied (varsP.outputContigPrefix is used), otherwise the prefix is empty.

    Returns None. A bad outdir, an empty list, or no input present on disk is reported
    through logOrPrintError and the function returns early.
    """

    if not util.checkDir(outdir) :
        err_msg = "Warning in AlignModule.mergeRcmaps: could not make outdir %s, skipping copy number" % outdir
        logOrPrintError(err_msg, varsP)
        return

    if not outFileList : #just an argument check--check for presence on disk is below
        err_msg = "Warning in AlignModule.mergeRcmaps: no maps supplied"
        logOrPrintError(err_msg, varsP)
        return

    #Python 2 orders None below every int, so the default used to behave like 0 by accident;
    # make that explicit so the comparisons below do not depend on it (Python 3 raises TypeError)
    if splitByContig is None :
        splitByContig = 0

    outFileList.sort() #for reproducibility with runAlignMerge.py (different order when listing dir)
    rsuf = "_r.cmap"
    #even though outFileList should all be there, a job may have failed--check all, just existence
    present = []
    for outf in outFileList :
        target = (outf if outf.endswith(rsuf) else outf+rsuf) #support prefix or full path
        if util.checkFile(target) :
            present.append(target)
        else :
            err_msg = "Warning in AlignModule.mergeRcmaps: missing _r.cmap %s" % target
            logOrPrintError(err_msg, varsP)
    if not present : #no _r.cmaps found on disk
        err_msg = "Warning in AlignModule.mergeRcmaps: no _r.cmaps found, skipping copy number"
        logOrPrintError(err_msg, varsP)
        return

    mrgstr = (varsP.alignMolvrefMergeName if varsP else "merge")

    mergedmap = mc.multiCmap(present[0]) #open original, edit in memory
    for rmap in present[1:] : #don't add map 0 to itself
        if mergedmap.addCovOcc( mc.multiCmap(rmap) ) : #a True return is treated as failure: warn and continue
            err_msg = "Warning in AlignModule.mergeRcmaps: addCovOcc call failed for map %s" % rmap
            logOrPrintError(err_msg, varsP)

    #now it's merged, but the resulting map needs to be written back to disk
    filepref = (varsP.outputContigPrefix if varsP and stageName == "" else stageName) #see split_XMapQcmap_byContig
    reports = [] #one entry per output written, so neither is lost when both are produced
    if splitByContig < 1 or splitByContig > 1 :
        mergedmap.writeAllMapsToDisk( os.path.join(outdir, filepref+'_contig'), outsuf="_r" )
        reports.append("mergeRcmaps: wrote %i cmaps" % len(mergedmap.cmapdict))
    if splitByContig > 0 :
        mergedmap.writeToFile( os.path.join(outdir, filepref+"_"+mrgstr+rsuf) )
        reports.append("mergeRcmaps: wrote merged cmap with %i contigs" % len(mergedmap.cmapdict))
    #report result
    for report in reports :
        logOrPrintError(report, varsP, warn=False)
Beispiel #2
0
    def __init__(self, varsP):
        """Record the characterize output file root and the total assembly length.

        Looks in os.path.join(varsP.outputContigFolder, varsP.characterizeDirName)
        for the first ".err" file returned by os.listdir and appends its path,
        without the ".err" suffix, to self.curCharacterizeFileRoots.

        If that directory does not pass util.checkDir, or holds no ".err" file,
        the constructor returns early: curCharacterizeFileRoots stays empty and
        varsP.totAssemblyLenMb is not set here.
        """
        self.curCharacterizeFileRoots = []
        self.varsP = varsP  #bc Characterize uses this for totAssemblyLenMb
        errsuf = ".err"

        outdir = os.path.join(varsP.outputContigFolder,
                              self.varsP.characterizeDirName)  #'alignref'
        if not util.checkDir(outdir, makeIfNotExist=False):
            return  #if this doesn't exist, we can't get what we need

        errfile = None
        for qfile in os.listdir(outdir):
            if qfile.endswith(errsuf):  #just take first .err file
                errfile = qfile
                break
        if not errfile:  #if no .err files found, give up
            return

        #strip only the trailing suffix; str.replace would also remove any
        # other ".err" occurring earlier in the file name
        fileroot = os.path.join(outdir, errfile[:-len(errsuf)])
        self.curCharacterizeFileRoots.append(fileroot)

        #also want to get varsP.totAssemblyLenMb (total length / 1e6)
        self.varsP.totAssemblyLenMb = mapClasses.multiCmap(
            varsP.latestMergedCmap, lengthonly=True).totalLength / 1e6
Beispiel #3
0
 def getAlignStats(self):
     """Report alignment statistics for this stage, then merge and split its outputs.

     Logs the start of the stats stage, picks the length passed on as reflen
     (from varsP.ref when self.doref, otherwise varsP.totAssemblyLenMb), and
     calls, in order, the module-level getAlignStats, mergeMap, mergeRcmaps,
     split_XMap_byContig_new and split_Qcmap_byContig_new on
     self.outFileList / self.mergedir before logging completion.
     """
     pipevars = self.varsP
     startmsg = "Starting AlignModule Align Stats stage for %s\n" % self.stageName
     pipevars.updatePipeReport(startmsg, printalso=True)
     util.LogStatus("progress", "stage_start", "%s_stats" % self.stageName)

     #MapClassesRev stores totAssemblyLenMb
     if self.doref:
         refmap = mc.multiCmap(pipevars.ref, lengthonly=True)
         reflen = refmap.totalLength / 1e6
     else:
         reflen = pipevars.totAssemblyLenMb

     getAlignStats(pipevars,
                   self.outFileList,
                   reflen,
                   isref=self.doref,
                   mergepath=self.mergedir)
     mergeMap(pipevars, self.outFileList, mergepath=self.mergedir)

     #splitByContig values are interpreted by mergeRcmaps
     if self.doref:
         splitByContig = 2
         stageName = pipevars.alignMolvrefName
     else:
         splitByContig = 0
         stageName = ""
     mergeRcmaps(self.outFileList, self.mergedir, pipevars, splitByContig,
                 stageName)

     #the xmap split result is needed to split the query cmaps
     xmapDict = split_XMap_byContig_new(self.outFileList, self.mergedir,
                                        pipevars, stageName)
     split_Qcmap_byContig_new(self.outFileList, self.mergedir, xmapDict,
                              pipevars, stageName)

     endmsg = "Finished AlignModule Align Stats stage for %s\n" % self.stageName
     pipevars.updatePipeReport(endmsg, printalso=True)
     util.LogStatus("progress", "stage_complete",
                    "%s_stats" % self.stageName)
    def getAlignStats(self):
        """Report alignment statistics for this stage, then merge and split its outputs.

        Logs the start of the stats stage, picks the length passed on as reflen
        (from varsP.ref when self.doref, otherwise varsP.totAssemblyLenMb), and
        calls, in order, the module-level getAlignStats, mergeMap, mergeRcmaps,
        split_XMap_byContig_new and split_Qcmap_byContig_new on
        self.outFileList / self.mergedir before logging completion.
        """
        #MapClassesRev stores totAssemblyLenMb
        self.varsP.updatePipeReport("Starting AlignModule Align Stats stage for %s\n" % self.stageName, printalso=True)
        util.LogStatus("progress", "stage_start", "%s_stats" % self.stageName)
        if self.doref :
            reflen = mc.multiCmap(self.varsP.ref, lengthonly=True).totalLength / 1e6
        else :
            reflen = self.varsP.totAssemblyLenMb
        getAlignStats(self.varsP, self.outFileList, reflen, isref=self.doref, mergepath=self.mergedir)
        mergeMap(self.varsP, self.outFileList, mergepath=self.mergedir)
        splitByContig = (2 if self.doref else 0) #see mergeRcmaps
        stageName = (self.varsP.alignMolvrefName if self.doref else "")
        mergeRcmaps(self.outFileList, self.mergedir, self.varsP, splitByContig, stageName)
        #the xmap split result is needed to split the query cmaps
        xmapDict = split_XMap_byContig_new( self.outFileList, self.mergedir, self.varsP, stageName)
        split_Qcmap_byContig_new(self.outFileList, self.mergedir, xmapDict, self.varsP, stageName)
        self.varsP.updatePipeReport("Finished AlignModule Align Stats stage for %s\n" % self.stageName, printalso=True)
        util.LogStatus("progress", "stage_complete", "%s_stats" % self.stageName)
    def __init__(self, varsP) :
        """Record the characterize output file root and the total assembly length.

        Looks in os.path.join(varsP.outputContigFolder, varsP.characterizeDirName)
        for the first ".err" file returned by os.listdir and appends its path,
        without the ".err" suffix, to self.curCharacterizeFileRoots.

        If that directory does not pass util.checkDir, or holds no ".err" file,
        the constructor returns early: curCharacterizeFileRoots stays empty and
        varsP.totAssemblyLenMb is not set here.
        """
        self.curCharacterizeFileRoots = []
        self.varsP = varsP #bc Characterize uses this for totAssemblyLenMb
        errsuf = ".err"

        outdir = os.path.join(varsP.outputContigFolder, self.varsP.characterizeDirName) #'alignref'
        if not util.checkDir(outdir, makeIfNotExist=False) : #if this doesn't exist, we can't get what we need
            return
        errfile = None
        for qfile in os.listdir(outdir) :
            if qfile.endswith(errsuf) : #just take first .err file
                errfile = qfile
                break
        if not errfile : #if no .err files found, give up
            return
        #strip only the trailing suffix; str.replace would also remove any
        # other ".err" occurring earlier in the file name
        fileroot = os.path.join(outdir, errfile[:-len(errsuf)])
        self.curCharacterizeFileRoots.append(fileroot)
        #also want to get varsP.totAssemblyLenMb (total length / 1e6)
        self.varsP.totAssemblyLenMb = mapClasses.multiCmap(varsP.latestMergedCmap, lengthonly=True).totalLength / 1e6
Beispiel #6
0
def characterizeContigs(varsP, xmappath=None):
    """Build a text summary of genome map (contig) stats, optionally with align stats.

    varsP    -- pipeline variables object; reads ref and latestMergedCmap, and
                sets totAssemblyLenMb as a side effect.
    xmappath -- optional path to an xmap of the contigs aligned to the
                reference. No alignment is run here; the file is only read.
                The align lines are added whenever this is non-empty, even if
                the file fails util.checkFile (the totals are then zero).

    If any ".hmap" file sits next to latestMergedCmap, the diploid/haploid
    layout is used instead of the default one.

    Returns the summary as a string, one "label: value" line per statistic.
    """
    unitscale = 1e-6  #contig lengths are in bases; the summary is in megabases
    dorefalign = bool(
        xmappath
    )  #refaligner is never called here--this is just using xmappath
    haveref = bool(varsP.ref)

    #reflen is a denominator in the summary--make sure it's non-zero, don't
    # bail (still get contig summary)
    try:
        reflen = mapClasses.multiCmap(varsP.ref, lengthonly=True).totalLength
        if reflen <= 0:
            reflen = 1.
    except Exception:  #reference could not be loaded: use the dummy denominator
        reflen = 1.

    #check for .hmaps in same dir as latestMergedCmap: if any, report haploid stats too
    hmaps = util.getListOfFilesFromDir(os.path.dirname(varsP.latestMergedCmap),
                                       ".hmap")
    haplotype = (len(hmaps) > 0)
    hapcontiglens = []

    totcontiglen = 0
    totalignlen = 0
    nmapcontigs = 0
    contiglens = []  #lens of all contigs in bases
    #ref contig id -> list of [RefStart, RefStop]; kept per ref contig so that
    # ranges on different ref contigs are never merged by util.uniqueRange
    uniqueseg = {}
    for cpath in [varsP.latestMergedCmap]:  #single merged cmap; totals accumulate
        mapi = mapClasses.multiCmap(cpath)
        totcontiglen += mapi.totalLength
        contiglens += mapi.getAllMapLengths()
        if haplotype:
            hapcontiglens.extend(mapi.getHaplotypeMapLengths())

        #ids of all contigs in this cmap; each id seen in the xmap is removed,
        # so what is left afterwards are the contigs with no alignment
        mapids = mapi.getAllMapIds()
        ncontigs = len(mapids)  #n contigs in this file only (see below)

        xmapobj = mapClasses.xmap()  #empty xmap: nothing to loop over by default
        if dorefalign and util.checkFile(xmappath, ".xmap"):
            xmapobj = mapClasses.xmap(xmappath)

        for xmapentry in xmapobj.xmapLookup.values():
            totalignlen += xmapentry.getMappedRefLen()
            uniqueseg.setdefault(xmapentry.contigRef, []).append(
                [xmapentry.RefStart, xmapentry.RefStop])
            if xmapentry.contigQry in mapids:
                mapids.remove(xmapentry.contigQry)

        nmapcontigs += ncontigs - len(mapids)  #sum multiple cmaps

    varsP.totAssemblyLenMb = totcontiglen * unitscale
    ncontigs = len(contiglens)  #all contigs, not just those of the last file
    avgcontiglen = (float(totcontiglen) / ncontigs if ncontigs > 0 else 0)

    if unitscale > 1e-6:  #if not megabases
        fstr = "%9.0f"
    else:  #megabases
        fstr = "%8.3f"

    def statline(label, value):
        #one "label value" line using the number format chosen above
        return (label + fstr + "\n") % value

    outstr = ""
    if haplotype:  #new format for haplotype
        #diploid is same as else below, but names change
        outstr += "Diploid N Genome Maps: %i\n" % ncontigs
        outstr += statline("Diploid Genome Map Len        (Mb): ",
                           totcontiglen * unitscale)
        outstr += statline("Diploid Avg. Genome Map Len   (Mb): ",
                           avgcontiglen * unitscale)
        outstr += statline("Diploid Median Genome Map Len (Mb): ",
                           util.getMedian(contiglens) * unitscale)
        outstr += statline("Diploid Genome Map n50        (Mb): ",
                           util.getn50(contiglens) * unitscale)
        #haploid: everything is derived from the list hapcontiglens
        nhap = len(hapcontiglens)
        haptot = sum(hapcontiglens)
        hapavg = (float(haptot) / nhap if nhap else 0)  #float: same as avgcontiglen
        outstr += "Haploid N Genome Maps: %i\n" % nhap
        outstr += statline("Haploid Genome Map Len        (Mb): ",
                           haptot * unitscale)
        outstr += statline("Haploid Avg. Genome Map Len   (Mb): ",
                           hapavg * unitscale)
        outstr += statline("Haploid Median Genome Map Len (Mb): ",
                           util.getMedian(hapcontiglens) * unitscale)
        outstr += statline("Haploid Genome Map n50        (Mb): ",
                           util.getn50(hapcontiglens) * unitscale)
    else:  #default to old format
        outstr += "N Genome Maps: %i\n" % ncontigs
        outstr += statline("Total Genome Map Len  (Mb): ",
                           totcontiglen * unitscale)
        outstr += statline("Avg. Genome Map Len   (Mb): ",
                           avgcontiglen * unitscale)
        outstr += statline("Median Genome Map Len (Mb): ",
                           util.getMedian(contiglens) * unitscale)
        outstr += statline("Genome Map n50        (Mb): ",
                           util.getn50(contiglens) * unitscale)

    if haveref:
        outstr += statline("Total Ref Len   (Mb): ", reflen * unitscale)
        outstr += statline("Total Genome Map Len / Ref Len : ",
                           float(totcontiglen) / reflen)
    if dorefalign:
        ratio = (float(nmapcontigs) / ncontigs if ncontigs > 0 else 0)
        outstr += ("N Genome Maps total align      : %i (%.2f)\n") % (
            nmapcontigs, ratio)
        outstr += statline("Total Aligned Len (Mb)            : ",
                           totalignlen * unitscale)
        outstr += statline("Total Aligned Len / Ref Len       : ",
                           float(totalignlen) / reflen)
        uniquelen = 0
        for segs in uniqueseg.values():  #need to sum on dict entries
            util.uniqueRange(segs)  #this modifies list in place
            uniquelen += util.totalLengthFromRanges(segs)
        outstr += statline("Total Unique Aligned Len (Mb)     : ",
                           uniquelen * unitscale)
        outstr += statline("Total Unique Aligned Len / Ref Len: ",
                           float(uniquelen) / reflen)

    return outstr
def characterizeContigs(varsP, xmappath=None, listcontigs=False) :
    """Build a text summary of genome map (contig) stats, optionally with align stats.

    varsP       -- pipeline variables object; reads ref, contigAlignTarget and
                   latestMergedCmap, and sets totAssemblyLenMb as a side effect.
    xmappath    -- optional path to an xmap of the contigs aligned to the reference.
                   No alignment is run here; the file is only read, and only if it
                   is a file. The align lines are added whenever this is non-empty.
    listcontigs -- if True, also emit per-contig lines: the contigs found in the
                   xmap (in the order of xmapLookup.values()) followed by the
                   contigs with no alignment. The column header is written only
                   when xmappath is also given.

    Returns the summary as a string.
    """
    unitscale = 1e-6 #contig lengths are in bases; the summary is in megabases
    dorefalign = bool(xmappath) #refaligner is never called here--this is just using xmappath
    printrange = False #manual switch: add Qry/Ref range columns to the per-contig lines
    iscluster = True #manual switch: only do avg if not merged, otherwise just one noise parameter
    haveref = bool(varsP.ref)

    aligndir = varsP.contigAlignTarget

    #reflen is a denominator in the summary--make sure it's non-zero, don't bail (still get contig summary)
    try :
        reflen = mapClasses.multiCmap(varsP.ref, lengthonly=True).totalLength
        if reflen <= 0 :
            reflen = 1.
    except Exception : #reference could not be loaded: use the dummy denominator
        reflen = 1.

    outstr = ""

    if listcontigs and dorefalign : #header of the per-contig table
        outstr += "cID  len"
        outstr += "  Cov"
        outstr += "  alignlen  alignlen/len"
        if printrange :
            outstr += "  Qry1  Qry2  Ref1  Ref2"
        outstr += "  Conf  Conf/lenkb"
        outstr += "  FP  FN  sf  sd  bpp"
        outstr += "\n"

    totcontiglen = 0
    totalignlen = 0
    nmapcontigs = 0
    totalignqlen = 0
    contiglens = [] #lens of all contigs in bases
    #ref contig id -> list of [RefStart, RefStop]; kept per ref contig so that
    # ranges on different ref contigs are never merged by util.uniqueRange
    uniqueseg = {}
    avgfplist = [] #FP rates, one per listed alignment
    avgfnlist = [] #FN rates, one per listed alignment
    for cpath in [varsP.latestMergedCmap] : #single merged cmap; totals accumulate
        mapi = mapClasses.multiCmap(cpath)
        totcontiglen += mapi.totalLength
        contiglens += mapi.getAllMapLengths() #getAllMapLengths is list of all map lengths

        #ids of all contigs in this cmap; each id seen in the xmap is removed,
        # so what is left afterwards are the contigs with no alignment
        mapids = mapi.getAllMapIds()
        ncontigs = len(mapids) #this is ncontigs in this file, ie, in mapi (see below)

        xmapobj = mapClasses.xmap() #empty xmap: nothing to loop over by default
        if dorefalign and os.path.isfile( xmappath ) : #if xmappath isn't a file, nothing is loaded
            xmapobj = mapClasses.xmap(xmappath)

        for xmapentry in xmapobj.xmapLookup.values() :
            #get map length from multicmap.getMapLength--returns 0 for any exception
            contiglen = mapi.getMapLength(xmapentry.contigQry)
            if contiglen <= 0 : #it is a denominator below, so it must be non-zero
                contiglen = 1.
            contigcov = mapi.getMapAvgCoverage(xmapentry.contigQry)

            #lenr is not printed for each contig--just totaled
            lenr = xmapentry.getMappedRefLen()
            lenq = xmapentry.getMappedQryLen()
            refid = xmapentry.contigRef #int
            refidstr = "" #placeholder for the ref id columns, which are no longer printed
            conf = xmapentry.Confidence #confidence from the xmap, and ratio of it to length in kb

            if listcontigs :
                outstr += "%5i" % xmapentry.contigQry
                outstr += "  %9.1f  %2i" % (contiglen, contigcov)
                alignpars = getMappedErrStats(aligndir, cpath) #an empty err file is produced for case of no align
                avgfplist.append( alignpars.fp )
                avgfnlist.append( alignpars.fn )
                outstr += "%s  %9.1f  %.3f" % (refidstr, lenq, float(lenq)/contiglen)
                if printrange :
                    #NOTE(review): pn is not defined in this function; this branch is
                    # unreachable while printrange is False--define pn before enabling it
                    outstr += "  %5.0f  %5.0f  %5.0f  %5.0f" % (xmapentry.QryStart/pn, xmapentry.QryStop/pn, xmapentry.RefStart/pn, xmapentry.RefStop/pn)
                outstr += "  %3.0f  %5.3f" % (conf, conf*1000./lenq) #1000 is for kb
                #NOTE(review): no newline is appended here; presumably getParamString
                # ends the line--confirm, otherwise the aligned contig lines run together
                outstr += "  " + alignpars.getParamString()

            totalignlen  += lenr
            totalignqlen += lenq

            uniqueseg.setdefault(refid, []).append( [xmapentry.RefStart, xmapentry.RefStop] )

            if xmapentry.contigQry in mapids :
                mapids.remove(xmapentry.contigQry)

        #end loop on xmap entries

        #all contigs with an alignment have been removed from mapids,
        # so n contigs aligned follows from this and ncontigs
        nmapcontigs += ncontigs - len(mapids) #sum multiple cmaps

        #and print the data for the contigs which don't align--just id, length, and coverage
        if listcontigs :
            for ids in mapids :
                outstr += "%5i" % ids
                contiglen = mapi.getMapLength(ids) #it's ok if it's 0 bc it's never a denominator here
                contigcov = mapi.getMapAvgCoverage(ids)
                outstr += "  %9.1f  %2i\n" % (contiglen, contigcov)

    #end loop on contigs

    varsP.totAssemblyLenMb = totcontiglen*unitscale
    ncontigs = len(contiglens) #all contigs, not just those of the last file
    avgcontiglen = (float(totcontiglen)/ncontigs if ncontigs > 0 else 0)

    #print averages; skipped when there is nothing to average (would divide by zero)
    if listcontigs and not iscluster and avgfplist and nmapcontigs :
        avgfp    = sum(avgfplist)/len(avgfplist)
        avgfn    = sum(avgfnlist)/len(avgfnlist)
        outstr += "AVG    %9.1f           %9.1f                     %5.3f  %5.3f\n" % (avgcontiglen, float(totalignqlen)/nmapcontigs, avgfp, avgfn)

    if unitscale > 1e-6 : #if not megabases
        fstr = "%9.0f"
    else : #megabases
        fstr = "%8.3f"

    outstr += "N Genome Maps: %i\n" % ncontigs
    outstr += ("Total Genome Map Len (Mb): "+fstr+"\n") % (totcontiglen*unitscale)
    outstr += ("Avg. Genome Map Len  (Mb): "+fstr+"\n") % (avgcontiglen*unitscale)
    outstr += ("Median Genome Map Len(Mb): "+fstr+"\n") % (util.getMedian(contiglens)*unitscale)
    outstr += ("Genome Map n50       (Mb): "+fstr+"\n") % (util.getn50(contiglens)*unitscale)

    if haveref :
        outstr += ("Total Ref Len   (Mb): "+fstr+"\n") % (reflen*unitscale)
        outstr += ("Total Genome Map Len / Ref Len : "+fstr+"\n") % (float(totcontiglen)/reflen)
    if dorefalign :
        ratio = (float(nmapcontigs)/ncontigs if ncontigs > 0 else 0)
        outstr += ("N Genome Maps total align      : %i (%.2f)\n") % (nmapcontigs, ratio)
        outstr += ("Total Aligned Len (Mb)            : "+fstr+"\n") % (totalignlen*unitscale)
        outstr += ("Total Aligned Len / Ref Len       : "+fstr+"\n") % (float(totalignlen)/reflen)
        uniquelen = 0
        for segs in uniqueseg.values() : #need to sum on dict entries
            util.uniqueRange(segs) #this modifies list in place
            uniquelen += util.totalLengthFromRanges( segs )
        outstr += ("Total Unique Aligned Len (Mb)     : "+fstr+"\n") % (uniquelen*unitscale)
        outstr += ("Total Unique Aligned Len / Ref Len: "+fstr+"\n") % (float(uniquelen)/reflen)

    return outstr
Beispiel #8
0
def mergeRcmaps(outFileList,
                outdir,
                varsP=None,
                splitByContig=None,
                stageName=""):
    """Merge the "_r.cmap" outputs of alignment jobs and write the result to outdir.

    outFileList   -- list of file prefixes ("_r.cmap" is appended) or of full
                     paths already ending in "_r.cmap". Note: the list is
                     sorted in place.
    outdir        -- output directory, checked with util.checkDir.
    varsP         -- optional pipeline variables object; supplies
                     alignMolvrefMergeName and outputContigPrefix, and is
                     passed to logOrPrintError with every message.
    splitByContig -- if < 1 (or None), output each contig separately; if == 1,
                     only output the single merged cmap; if > 1, do both.
    stageName     -- always used as the file prefix if supplied; if not, varsP
                     must be supplied (varsP.outputContigPrefix is used),
                     otherwise the prefix is empty.

    Returns None. A bad outdir, an empty list, or no input present on disk is
    reported through logOrPrintError and the function returns early.
    """

    if not util.checkDir(outdir):
        err_msg = "Warning in AlignModule.mergeRcmaps: could not make outdir %s, skipping copy number" % outdir
        logOrPrintError(err_msg, varsP)
        return

    if not outFileList:  #just an argument check--check for presence on disk is below
        err_msg = "Warning in AlignModule.mergeRcmaps: no maps supplied"
        logOrPrintError(err_msg, varsP)
        return

    #Python 2 orders None below every int, so the default used to behave like 0
    # by accident; make that explicit so the comparisons below do not depend on
    # it (Python 3 raises TypeError)
    if splitByContig is None:
        splitByContig = 0

    outFileList.sort(
    )  #for reproducibility with runAlignMerge.py (different order when listing dir)
    rsuf = "_r.cmap"
    #even though outFileList should all be there, a job may have failed--check all, just existence
    present = []
    for outf in outFileList:
        target = (outf if outf.endswith(rsuf) else outf + rsuf
                  )  #support prefix or full path
        if util.checkFile(target):
            present.append(target)
        else:
            err_msg = "Warning in AlignModule.mergeRcmaps: missing _r.cmap %s" % target
            logOrPrintError(err_msg, varsP)
    if not present:  #no _r.cmaps found on disk
        err_msg = "Warning in AlignModule.mergeRcmaps: no _r.cmaps found, skipping copy number"
        logOrPrintError(err_msg, varsP)
        return

    mrgstr = (varsP.alignMolvrefMergeName if varsP else "merge")

    mergedmap = mc.multiCmap(present[0])  #open original, edit in memory
    for rmap in present[1:]:  #don't add map 0 to itself
        #a True return from addCovOcc is treated as failure: warn and continue
        if mergedmap.addCovOcc(mc.multiCmap(rmap)):
            err_msg = "Warning in AlignModule.mergeRcmaps: addCovOcc call failed for map %s" % rmap
            logOrPrintError(err_msg, varsP)

    #now it's merged, but the resulting map needs to be written back to disk
    filepref = (
        varsP.outputContigPrefix if varsP and stageName == "" else stageName
    )  #see split_XMapQcmap_byContig
    reports = []  #one entry per output written, so neither is lost when both are produced
    if splitByContig < 1 or splitByContig > 1:
        mergedmap.writeAllMapsToDisk(os.path.join(outdir,
                                                  filepref + '_contig'),
                                     outsuf="_r")
        reports.append("mergeRcmaps: wrote %i cmaps" %
                       len(mergedmap.cmapdict))
    if splitByContig > 0:
        mergedmap.writeToFile(
            os.path.join(outdir, filepref + "_" + mrgstr + rsuf))
        reports.append("mergeRcmaps: wrote merged cmap with %i contigs" %
                       len(mergedmap.cmapdict))
    #report result
    for report in reports:
        logOrPrintError(report, varsP, warn=False)
def runAlignMol():
    """Command-line driver: align molecules to a cmap, or merge existing alignmol results.

    The mode is chosen from the -q argument:
      * -q is a cmap file: alignments are run (requires -b bnx and -t RefAligner;
        optArguments are taken from -a or optArguments_human.xml in the Pipeline dir).
      * -q is a directory: no alignments are run; the alignment results found in that
        dir by getOutFileList are handed to AlignModule and only getAlignStats is called.

    All arguments are validated up front; any invalid argument prints a message and
    terminates the process with sys.exit(1). Logs and status files are written to the
    output dir (-o). Nothing is returned.
    """
    parser = argparse.ArgumentParser(description=description)

    parser.add_argument(
        '-q',
        dest='queryDir',
        help=
        'Path to merged cmap to align molecules (-b) to OR alignmol dir from Pipeline for merge (if latter, no alignments are performed), required',
        type=str)
    #TODO: -b could also accept a dir containing split bnx pieces; for now, just do single bnx
    parser.add_argument(
        '-b',
        dest='bnx',
        help='Input molecule (.bnx) file, required if aligning molecules',
        type=str)
    parser.add_argument(
        '-a',
        dest='optArguments',
        help=
        'Path to optArguments.xml (optional, default optArguments_human.xml in Pipeline dir if found, otherwise required)',
        default="",
        type=str)
    parser.add_argument(
        '-r',
        help=
        'If this flag is used, alignmolvref arguments are used, otherwise alignmol arguments are used (default alignmol; optional)',
        dest='ref',
        action='store_true')
    parser.add_argument(
        '-o',
        dest='outputDir',
        help=
        'output dir (optional, defaults to sub-dir of input map dir called "alignmol")',
        default="",
        type=str)
    parser.add_argument(
        '-t',
        dest='RefAligner',
        help='Path to RefAligner or dir containing it (required)',
        type=str)
    parser.add_argument(
        '-T',
        dest='numThreads',
        help='Total number of threads (cores) to use (optional, default 4)',
        default=4,
        type=int)
    parser.add_argument(
        '-j',
        dest='maxthreads',
        help=
        'Threads per Job, -maxthreads (non-cluster only;optional, default 4)',
        default=4,
        type=int)
    parser.add_argument(
        '-e',
        dest='errFile',
        help=
        '.err file to use for noise parameters--will supersede noise parameters in the optArgument supplied (but that file must still be supplied for non-noise parameters)--should be from autoNoise',
        default="",
        type=str)
    parser.add_argument(
        '-E',
        dest='errbinFile',
        help=
        '.errbin file to use for noise parameters--will supersede noise parameters in the optArgument supplied (but that file must still be supplied for non-noise parameters)--should be from autoNoise',
        default="",
        type=str)
    parser.add_argument(
        '-p',
        dest='pipelineDir',
        help=
        'Pipeline dir (optional, defaults to script dir, or current directory)',
        default="",
        type=str)
    result = parser.parse_args()

    #check all Pipeline dependencies
    if result.pipelineDir:
        cwd = result.pipelineDir
    else:
        cwd = os.path.split(
            os.path.realpath(__file__))[0]  #this is path of this script
        if not os.path.isfile(os.path.join(
                cwd,
                "utilities.py")):  #if still not here, last try is actual cwd
            cwd = os.getcwd()  #still check this below

    #this is the only one imported here and in runCharacterize
    if not os.path.isfile(os.path.join(cwd, "utilities.py")):
        print "ERROR: utilities.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    #the existence checks look in cwd, so the imports below must be able to find
    # modules there too; appending (not prepending) only affects imports that
    # would otherwise fail
    if cwd not in sys.path:
        sys.path.append(cwd)
    import utilities as util

    if not os.path.isfile(os.path.join(cwd, "AlignModule.py")):
        print "ERROR: AlignModule.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import AlignModule as alignmod

    if not util.checkFile(os.path.join(cwd, "Pipeline.py")):
        print "ERROR: Pipeline.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import Pipeline

    if not util.checkFile(os.path.join(cwd, "mapClasses.py")):
        print "ERROR: mapClasses.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    import mapClasses as mc

    #input dir: a dir means merge only, a cmap means run alignments
    if not result.queryDir:
        print "ERROR: Query (-q) argument not supplied."
        sys.exit(1)
    qrypath = os.path.realpath(result.queryDir)
    if util.checkDir(
            qrypath, checkWritable=False,
            makeIfNotExist=False):  #output elsewhere so not writeable is ok
        runaligns = False
    elif util.checkCmap(qrypath):
        runaligns = True
    else:
        print "ERROR: Query argument (" + qrypath + ") not found or not a dir or cmap. Check -q argument."
        sys.exit(1)

    #RefAligner -- check for either path to RefAligner, or dir containing it, depending on cluster args
    rabin = ""  #need empty string for generateJobList even though no jobs are run
    if runaligns:
        rabin = result.RefAligner
        #-t has no default, so it is None when omitted; os.path.isdir(None)
        # would raise a TypeError traceback instead of a readable error
        if not rabin:
            print "ERROR: RefAligner (-t) argument not supplied."
            sys.exit(1)
        #replicate Pipeline behavior: RefAligner is always required
        if os.path.isdir(rabin):
            rabin = os.path.join(rabin, "RefAligner")
        if not util.checkExecutable(rabin):
            print "ERROR: RefAligner not found or not executable at", rabin, "\nPlease supply RefAligner dir or full path as -t arg."
            sys.exit(1)

    #optargs file
    optargs = None
    if runaligns and result.optArguments:  #supplied on command line
        optargs = result.optArguments
        if not util.checkFile(optargs, ".xml"):
            print "optArguments path is supplied (" + optargs + ") but not found or doesn't end in .xml, check -a argument."
            sys.exit(1)
    elif runaligns:  #load from Pipeline dir if running alignments
        optargs = os.path.join(cwd, "optArguments_human.xml")
        if not util.checkFile(optargs):
            print "optArguments.xml missing in Pipeline directory (" + cwd + "). Try supplying path explicitly using -a."
            sys.exit(1)

    #output dir
    if result.outputDir:
        outdir = os.path.realpath(result.outputDir)
    elif runaligns:
        #qrypath is a cmap file here, so a sub-dir of it cannot be created;
        # use the documented default: "alignmol" next to the input map
        outdir = os.path.join(os.path.dirname(qrypath), "alignmol")
    else:
        outdir = os.path.join(qrypath,
                              "merge")  #should be same as in AlignModule
    if os.path.isdir(outdir):
        if not util.checkDir(outdir):  #check writeable
            print "\nERROR: Output dir is not writeable:\n", outdir, "\n"
            sys.exit(1)
        #output dir being the same as the input dir is ok here
        print "\nWARNING: Output dir already exists, results will be overwritten:\n", outdir, "\n"
    elif not util.checkDir(
            outdir
    ):  #does not exist, make, if False, can't make or not writeable
        print "\nERROR: Output dir cannot be created or is not writeable:\n", outdir, "\n"
        sys.exit(1)

    #bnx file
    bnxfile = result.bnx
    if bnxfile:  #must check for empty string BEFORE you do realpath, or it returns cwd
        bnxfile = os.path.realpath(bnxfile)
        if not util.checkFile(bnxfile, ".bnx"):
            print "ERROR: bnx file supplied but not found or incorrect suffix:", bnxfile
            sys.exit(1)
    elif runaligns:
        print "ERROR: bnx file not supplied but running alignments; please supply bnx file as -b argument"
        sys.exit(1)

    #nthreads
    nthreads = result.numThreads
    if nthreads <= 0:
        print "ERROR: Number of threads value invalid (must be > 0): %i" % nthreads
        sys.exit(1)

    #maxthreads
    maxthreads = result.maxthreads
    if maxthreads <= 0:
        print "ERROR: Max threads value invalid (must be > 0): %i" % maxthreads
        sys.exit(1)
    elif nthreads < maxthreads:
        print "Warning: num threads (-T: %i) < max threads (-j: %i): increasing num threads to equal max threads\n" % (
            nthreads, maxthreads)
        nthreads = maxthreads

    #.errbin file
    errbinfile = result.errbinFile
    if errbinfile:
        errbinfile = os.path.realpath(result.errbinFile)
        if not util.checkFile(errbinfile, ".errbin"):
            print "ERROR: errbin file supplied but not found or incorrect suffix:", errbinfile
            sys.exit(1)

    #.err file: only used if no .errbin is given
    errfile = result.errFile
    if errfile and errbinfile:
        print "Warning: .err and .errbin arguments supplied; ignoring .err file"
        errfile = ""
    elif errfile:
        errfile = os.path.realpath(result.errFile)
        if not util.checkFile(errfile, ".err"):
            print "err file supplied but not found or incorrect suffix:", errfile
            sys.exit(1)

    #SampleCharModule is only needed to read the .err file
    if errfile and not util.checkFile(os.path.join(cwd,
                                                   "SampleCharModule.py")):
        print "SampleCharModule.py missing in dir", cwd, "check -p argument, or run this script in Pipeline dir"
        sys.exit(1)
    elif errfile:
        import SampleCharModule as scm

    doref = result.ref

    #DONE checking arguments

    print "Using output dir", outdir
    if runaligns:
        print "Aligning", bnxfile, "\nTo", qrypath, "\n"
    else:
        print "Merging", qrypath, "\n"

    memory_log = os.path.join(outdir, "memory_log.txt")
    util.initMemoryLog(memory_log)

    varsP = Pipeline.varsPipeline()
    varsP.RefAlignerBin = rabin
    varsP.contigFolder = ""  #not used but needs to be an attr
    varsP.outputContigFolder = ""  #not used but needs to be a string attr
    varsP.pipeReportFile = os.path.join(outdir, "alignmol_jobs_log.txt")
    varsP.infoReportFile = os.path.join(outdir, "alignmol_log.txt")
    util.InitStatus(os.path.join(outdir, "status.xml"))

    if runaligns:
        varsP.optArgumentsFileIn = optargs
        varsP.latestMergedCmap = qrypath  #if !doref, need this one
        varsP.ref = qrypath  #and if doref, need this one
        varsP.nThreads = nthreads  #necessary otherwise job won't start -- max threads per node
        varsP.maxthreads = maxthreads  #threads per job
        p = os.path.split(qrypath)[1]
        varsP.outputContigPrefix = p[:p.rfind(".")]  #filename prefix
        varsP.stdoutlog = True  #use -stdout -stderr
        varsP.sorted_file = bnxfile[:bnxfile.rfind(
            ".")]  #enables the mol fraction align in AlignModule.getAlignStats
        if qrypath.endswith(".cmap"):  #enable the mol stats
            varsP.totAssemblyLenMb = mc.multiCmap(
                qrypath, lengthonly=True).totalLength / 1e6

        varsP.memoryLogpath = memory_log
        varsP.parseArguments()  #parses optArgumentsFile
        varsP.checkDependencies()
        varsP.RefAlignerBinOrig = rabin
        varsP.prerunLog(
        )  #general information in log -- needed for refaligner_version

        noisep = {}
        if errbinfile:
            noisep = {"readparameters": errbinfile}
        elif errfile:
            #strip only the trailing ".err"; str.replace would also remove any
            # ".err" occurring earlier in the path
            errpref = (errfile[:-len(".err")]
                       if errfile.endswith(".err") else errfile)
            noisep = scm.readNoiseParameters(errpref)
            if 'readparameters' in noisep:  #remove this because it's redundant, and it can cause problems with RefAligner compatibility
                del noisep['readparameters']
            if not noisep:  #readNoiseParameters returns empty dict on failure
                print "ERROR reading noise parameters, check .err file:", errfile
                sys.exit(1)
            #redundant with below?
            print "Using noise parameters from " + errfile + ":\n" + " ".join(
                ["-" + str(k) + " " + str(v)
                 for k, v in noisep.iteritems()]) + "\n"

        #some code from SampleCharModule to load args into noise0
        infoReport = "Loaded noise parameters:\n"
        klist = [
            "FP", "FN", "sf", "sd", "sr", "bpp", "readparameters"
        ]  #hardcoding parameters is kind of bad, but it fixes the order without using OrderedDict.
        for v in klist:
            if v not in noisep:
                continue
            param = str(noisep[v])
            util.LogStatus("parameter", "auto_" + v, param)
            infoReport += v + ":" + param + "\n"
            varsP.replaceParam("noise0", "-" + v, param)
        varsP.updateInfoReport(infoReport + '\n', printalso=True)

    else:
        print "Getting file list from", qrypath
        outFileList = getOutFileList(util, qrypath)
        if not outFileList:
            print "ERROR: Query dir (" + qrypath + ") does not contain alignmol data. Check -q argument."
            sys.exit(1)
        else:
            print "Found", len(outFileList), "alignment results"
    #end if runaligns

    amod = alignmod.AlignModule(
        varsP, doref, outdir, bnxfile)  #constructor will call generateJobList

    if runaligns:
        amod.runJobs()
        amod.checkResults()
    else:
        amod.outFileList = outFileList
        p = os.path.split(outFileList[0])[1]
        if p.count("_") > 1:  #expect something like "EXP_REFINEFINAL1_4"
            p = p[:p.rfind("_")]  #remove integer suffix (and underscore)
        varsP.outputContigPrefix = p

    if not runaligns or len(amod.jobList) > 0:
        amod.getAlignStats()

    if runaligns:
        print
        #copy from Pipeline.py
        if util.SummarizeErrors(varsP=varsP) == 0:
            varsP.updatePipeReport("Pipeline has successfully completed\n")
            util.LogStatus("progress", "pipeline", "success")
        else:
            varsP.updatePipeReport("Pipeline has completed with errors\n")
            util.LogStatus("progress", "pipeline", "failure")

    return  #no return value