Esempio n. 1
0
def trimMatchOverlapsInBoth(inpfile, outfile, trim_subtype):
    """Trim match overlaps along the X axis and then along the Y axis.

    The matches of inpfile are sorted in X order, coalesced, trimmed in X,
    re-sorted in Y order and finally trimmed in Y into outfile.  Two
    scratch names are recycled for the intermediate MyFile.myfile()
    objects so that each one is released as soon as its successor exists.
    """
    # Only the 'x' and 'u' subtypes turn on the flag given to coalesceMatches.
    coalesceFlag = trim_subtype in ('x', 'u')

    front = MyFile.myfile()
    MatchRecord.sortInXorderAP(inpfile, front)

    # The following coalescing assumes perfect runs.
    back = MyFile.myfile()
    coalesceMatches(front, back, coalesceFlag)

    front = MyFile.myfile()
    trimMatchOverlapsInX(back, front, trim_subtype)

    back = MyFile.myfile()
    MatchRecord.sortInYorderAP(front, back)

    trimMatchOverlapsInY(back, outfile, trim_subtype)
def trimMatchOverlapsInBoth(inpfile,outfile,trim_subtype):
    """Write to outfile the matches of inpfile trimmed on both axes.

    Pipeline: X sort -> coalesce -> trim in X -> Y sort -> trim in Y.
    Intermediate results are held in MyFile.myfile() scratch objects.
    """
    # Stage 1: order the matches along the X axis.
    stageA = MyFile.myfile()
    MatchRecord.sortInXorderAP(inpfile,stageA)

    # Stage 2: coalesce.  The following coalescing assumes perfect runs.
    # The flag is set exactly for the 'x' and 'u' subtypes.
    stageB = MyFile.myfile()
    coalesceMatches( stageA, stageB,
                     not ((trim_subtype != 'x') and (trim_subtype != 'u')) )

    # Stage 3: trim overlaps in X (stageA is recycled for the result).
    stageA = MyFile.myfile()
    trimMatchOverlapsInX(stageB,stageA,trim_subtype)

    # Stage 4: re-order along Y and trim overlaps in Y into the caller's file.
    stageB = MyFile.myfile()
    MatchRecord.sortInYorderAP(stageA,stageB)
    trimMatchOverlapsInY(stageB,outfile,trim_subtype)
Esempio n. 3
0
 def __init__( self, runName):
     """Load the ATAC file whose path is runName.

     runName is passed to open() unchanged (no suffix is appended), and
     every line of the file is handed to self.atac_file_parse_line().
     The containers below start out empty; presumably the line parser
     fills them -- confirm against atac_file_parse_line.

     Raises IOError when the file cannot be opened.
     """
     self.runName = runName
     self.comments = []
     self.metacommands = []
     self.globals = {}
     self.tableformat = {}
     self.tabledata = {}
     self.matches = MyFile.myfile()
     self.runs    = MyFile.myfile()

     fp = open(runName,"r")
     try:
         for line in fp:
             self.atac_file_parse_line(line)
     finally:
         # Release the descriptor even when a line fails to parse.
         fp.close()
Esempio n. 4
0
def findCoverageIntervals( inpfile, outfile, processFirstAxis):
    """Compute the coverage intervals of one axis of an ATAC matches file.

    inpfile          -- ATAC matches file; rewound before it is read.
    outfile          -- receives the ATAC coverage intervals written by
                        findUniformCoverageIntervals; rewound on entry and
                        again before returning.
    processFirstAxis -- true: take the axis id, start and length from
                        fields[4], fields[5], fields[6] of each match line;
                        false: take them from fields[8], fields[9], fields[10].

    Only "M" lines whose subtype (fields[1]) is "u" or "x" contribute.  Each
    contributes a +1 event at its start and a -1 event at its end; the
    events are sorted with the external sort(1) command before being
    scanned.

    Raises AssertionError when the sort command exits with a non-zero status.
    """
    inpfile.seek(0)
    outfile.seek(0)
    t0 = time.time()
    tmpfile3 = MyFile.myfile()
    for line in inpfile:
        if(line[0]=="M"):
            fields = line.split()
            if(fields[1]=="u" or fields[1]=="x"):
                if(processFirstAxis):
                    axis = fields[4]
                    bgn = int(fields[5])
                    end = bgn+int(fields[6])
                else:
                    axis = fields[8]
                    bgn = int(fields[9])
                    end = bgn+int(fields[10])
                print >>tmpfile3, "E", axis,bgn,1
                print >>tmpfile3, "E", axis,end,-1
    tmpfile3.close()

    # mkstemp() creates the file atomically, unlike the race-prone mktemp().
    # Only the name is needed here because the shell redirect reopens it.
    fd, tmpname = tempfile.mkstemp()
    os.close(fd)
    try:
        # Sort keys: record tag, axis id, numeric position, then the numeric
        # event in reverse so that +1 precedes -1 at the same position.
        cmd = "sort -T . -k 1,1 -k 2,2 -k 3n -k 4nr  %s > %s" % (tmpfile3.name, tmpname)
        print >>sys.stderr, cmd
        iret = os.system(cmd)
        assert iret == 0, "sort command failed: " + cmd
        print >>sys.stderr,"time elapsed is ", (time.time() - t0)
        tmpfile4 = open(tmpname)
        try:
            t0 = time.time()
            findUniformCoverageIntervals( tmpfile4, outfile)
            print >>sys.stderr,"time elapsed is ", (time.time() - t0)
        finally:
            tmpfile4.close()
    finally:
        # Best-effort cleanup (the former "rm -f"), now also on failure paths.
        try:
            os.remove(tmpname)
        except OSError:
            pass
    outfile.seek(0)
def main( inpname, outname):
    """Run mainLoop over the matches of an ATAC file and checkpoint the result.

    inpname -- path given to AtacFile.AtacFile().
    outname -- name given to obj.checkpoint() once the matches are replaced.

    The 'fillIntraRunGapsErate' and 'fillIntraRunGapsMaxGap' globals are
    given defaults (0.10 and 100000) when the file does not define them.
    """
    obj = AtacFile.AtacFile(inpname)
    settings = obj.globals

    firstId = settings['assemblyId1']
    secondId = settings['assemblyId2']
    firstPrefix = settings['assemblyFilePrefix1']
    secondPrefix = settings['assemblyFilePrefix2']

    # Install the defaults in the globals themselves so that they are part
    # of what obj carries forward.
    if 'fillIntraRunGapsErate' not in settings:
        settings['fillIntraRunGapsErate'] = 0.10
    if 'fillIntraRunGapsMaxGap' not in settings:
        settings['fillIntraRunGapsMaxGap'] = 100000
    erate = float(settings['fillIntraRunGapsErate'])
    maxGap = int(settings['fillIntraRunGapsMaxGap'])

    # mismatches = checkExactMatches( x, y, inpfile)
    # sys.stderr.write("mismatches = %d\n" % mismatches)

    xIdx = IdxStore.IdxStore(firstPrefix,firstId)
    yIdx = IdxStore.IdxStore(secondPrefix,secondId)

    filled = MyFile.myfile()
    mainLoop( obj.matches, filled, xIdx, yIdx, maxGap, erate)
    obj.matches = filled
    obj.checkpoint(outname)
Esempio n. 6
0
def main(inpname, outname):
    """Load an ATAC file, run mainLoop on its matches and checkpoint it.

    inpname is handed to AtacFile.AtacFile(); outname is handed to the
    checkpoint() call made after obj.matches has been replaced.
    """
    obj = AtacFile.AtacFile(inpname)
    assemblyId1 = obj.globals['assemblyId1']
    assemblyId2 = obj.globals['assemblyId2']
    assemblyFilePrefix1 = obj.globals['assemblyFilePrefix1']
    assemblyFilePrefix2 = obj.globals['assemblyFilePrefix2']

    # Supply a default for each tuning global that the file left undefined.
    fallbacks = (
        ('fillIntraRunGapsErate', 0.10),
        ('fillIntraRunGapsMaxGap', 100000),
    )
    for key, fallback in fallbacks:
        if key not in obj.globals:
            obj.globals[key] = fallback
    errorRate = float(obj.globals['fillIntraRunGapsErate'])
    gapLimit = int(obj.globals['fillIntraRunGapsMaxGap'])

    # mismatches = checkExactMatches( x, y, inpfile)
    # sys.stderr.write("mismatches = %d\n" % mismatches)

    xIdx = IdxStore.IdxStore(assemblyFilePrefix1, assemblyId1)
    yIdx = IdxStore.IdxStore(assemblyFilePrefix2, assemblyId2)

    result = MyFile.myfile()
    mainLoop(obj.matches, result, xIdx, yIdx, gapLimit, errorRate)
    obj.matches = result
    obj.checkpoint(outname)
Esempio n. 7
0
def addMyFile(crtFile, nrFiles):
    """Register crtFile as a new 'File' node and return the new file count.

    The module-level node counter nrNodes is advanced by one and the new
    node id is recorded in Label, fileNodes, fileDict, fileIssues and
    nrFileIssues.  A MyFile entry numbered nrFiles + 1 is appended to files.
    """
    global nrNodes
    nodeId = nrNodes + 1
    fileCount = nrFiles + 1
    nrNodes = nodeId

    # Node bookkeeping, all keyed by the freshly allocated node id.
    Label[nodeId] = (crtFile, 'File')
    fileNodes.append(nodeId)
    fileDict[crtFile] = nodeId
    fileIssues[nodeId] = {}
    nrFileIssues[nodeId] = 0

    files.append(MyFile(fileCount, crtFile, 0, 0, 0, 0))
    return fileCount
Esempio n. 8
0
def boxRecovery( inpfile, rawfile, outname):
    """Re-insert raw matches lying in the box between consecutive run members.

    inpfile -- matches file; each "M" line becomes the current right match.
    rawfile -- raw matches file, consumed through one shared iterator, so
               both files must be sorted in the same manner.
    outname -- not used by this function.

    Returns a new MyFile.myfile() holding the output matches.

    This is a modified merge: for every match of inpfile the raw matches
    are scanned forward until one is sameAs() it.  Raw matches passed on
    the way are kept only when the previous and the current match belong
    to the same run and the raw match isInsideBox() of the two.
    """
    inpfile.seek(0)
    rawfile.seek(0)
    recovered = MyFile.myfile()

    rawLines = iter(rawfile)

    previous = None
    for line in inpfile:
        if line[0] != 'M':
            continue
        current = MatchRecord.MatchRecord(line)

        # Decide once per match whether skipped raw matches may be kept.
        if previous != None and previous.inSameRunAs(current):
            withinRun = 1
        else:
            withinRun = 0

        for rawline in rawLines:
            if rawline[0] != 'M':
                continue
            candidate = MatchRecord.MatchRecord(rawline)
            if candidate.sameAs(current):
                print >>recovered, current
                break
            # Between runs the raw match is simply discarded.
            if withinRun and candidate.isInsideBox(previous,current):
                print >>recovered, candidate
        # We should die here if there is no raw match that matched the
        # current match ...

        previous = current
    return recovered
Esempio n. 9
0
def formPerfectRuns ( inpfile, firstSort, secondSort, maxJump, runIdPrefix ):
    """Sort, enumerate, re-sort and then group the matches into perfect runs.

    inpfile     -- matches file; rewound before use.
    firstSort   -- callable(source, destination) applied before the signed
                   enumeration is created.
    secondSort  -- callable(source, destination) applied to the enumerated
                   matches.
    maxJump, runIdPrefix -- passed through to findPerfectRuns.

    Returns whatever findPerfectRuns returns.  A progress line is written
    to sys.stderr before each of the four stages.
    """
    def report(stage):
        print >>sys.stderr, 'formPerfectRuns step=' + str(stage)

    inpfile.seek(0)

    report(0)
    scratch = MyFile.myfile()
    firstSort( inpfile, scratch)

    report(1)
    result = createSignedEnumeration(scratch)

    report(2)
    scratch = MyFile.myfile()
    secondSort( result, scratch)

    report(3)
    result = findPerfectRuns( scratch, maxJump, runIdPrefix)
    return result
Esempio n. 10
0
def formPerfectRuns(inpfile, firstSort, secondSort, maxJump, runIdPrefix):
    """Build perfect runs from the matches of inpfile.

    Stages, each announced on sys.stderr with its step number:
      0. firstSort(inpfile, scratch)
      1. createSignedEnumeration(scratch)
      2. secondSort(enumerated, scratch)
      3. findPerfectRuns(scratch, maxJump, runIdPrefix)

    Returns the result of the last stage.
    """
    inpfile.seek(0)
    stage = 0

    # Stage 0: first ordering.
    print >> sys.stderr, 'formPerfectRuns step=' + str(stage)
    stage = stage + 1
    work = MyFile.myfile()
    firstSort(inpfile, work)

    # Stage 1: attach the signed rank to every match.
    print >> sys.stderr, 'formPerfectRuns step=' + str(stage)
    stage = stage + 1
    runs = createSignedEnumeration(work)

    # Stage 2: second ordering.
    print >> sys.stderr, 'formPerfectRuns step=' + str(stage)
    stage = stage + 1
    work = MyFile.myfile()
    secondSort(runs, work)

    # Stage 3: group into runs.
    print >> sys.stderr, 'formPerfectRuns step=' + str(stage)
    stage = stage + 1
    runs = findPerfectRuns(work, maxJump, runIdPrefix)

    return runs
Esempio n. 11
0
def onlyKeepLongRuns ( inpfile, outname, lengthThreshold ):
    """Keep the matches of runs whose summed x_length reaches the threshold.

    inpfile         -- matches file; matches sharing a runid are expected
                       to be adjacent, since a run ends when the runid
                       changes from one "M" line to the next.
    outname         -- not used by this function.
    lengthThreshold -- minimum accumulated x_length for a run to be kept.

    Returns a new MyFile.myfile() with the kept matches.  Matches of runs
    that end while still too short are written to a rejects file that is
    closed and not returned.
    """
    kept = MyFile.myfile()
    rejects = MyFile.myfile()

    previous = None
    pending = []      # matches of the current run seen while it was too short
    runLength = 0
    inpfile.seek(0)
    for line in inpfile:
        if line[0] != 'M':
            continue
        match = MatchRecord.MatchRecord(line)
        matchLength = match.x_length

        if previous != None and previous.runid != match.runid:
            # The previous run ended without reaching the threshold.
            for short in pending:
                print >>rejects, short
            pending = []
            runLength = matchLength
        else:
            runLength += matchLength

        if lenAtLeast(runLength, lengthThreshold):
            # The run is long enough: release what was held back, then
            # the current match.
            for held in pending:
                print >>kept, held
            pending = []
            print >>kept, match
        else:
            pending.append(match)

        previous = match
    rejects.close()
    return kept


def lenAtLeast(length, threshold):
    """Return true when length is not below threshold."""
    return not (length < threshold)
Esempio n. 12
0
def createSignedEnumeration(inpfile):
    """Copy the matches of inpfile, tagging each with a signed rank.

    The n-th "M" line (counting from 1) gets extend['srank'] set to
    cvm(same, n, -n), where same tells whether x_orientation equals
    y_orientation.  Other lines are dropped.

    Returns a new MyFile.myfile() with the tagged matches.
    """
    ranked = MyFile.myfile()
    rank = 0
    inpfile.seek(0)
    for line in inpfile:
        if line[0] != 'M':
            continue
        rank += 1
        record = MatchRecord.MatchRecord(line)
        sameOrientation = (record.x_orientation == record.y_orientation)
        record.extend['srank'] = cvm(sameOrientation, rank, -rank)
        print >> ranked, record
    return ranked
Esempio n. 13
0
def createSignedEnumeration(inpfile):
    """Number the "M" lines of inpfile and store the signed number in each.

    For the k-th match (k starts at 1) the value cvm(forward, k, -k) is
    stored under extend['srank'], forward being the comparison of the
    match's x_orientation with its y_orientation.

    Returns the new MyFile.myfile() the matches were printed to.
    """
    numbered = MyFile.myfile()
    position = 1
    inpfile.seek(0)
    for line in inpfile:
        isMatchLine = (line[0] == 'M')
        if isMatchLine:
            match = MatchRecord.MatchRecord(line)
            forward = (match.x_orientation == match.y_orientation)
            signedRank = cvm(forward, position, -position)
            match.extend['srank'] = signedRank
            position = position + 1
            print >>numbered, match
    return numbered
Esempio n. 14
0
def newmain():
    """Command-line entry point: process an ATAC file and checkpoint it.

    sys.argv[1] is the input ATAC file and sys.argv[2] the checkpoint
    name.  The matches of the input are run through main() together with
    one IdxStore per assembly, and the result replaces obj.matches before
    obj.checkpoint() is called.

    Raises IndexError when fewer than two arguments are given and KeyError
    when a required global is missing from the ATAC file.
    """
    inpname = sys.argv[1]
    outname = sys.argv[2]

    obj = AtacFile.AtacFile(inpname)
    xname = obj.globals["assemblyFilePrefix1"]
    # The second index must be built from the second assembly's prefix;
    # the key "assemblyFilePrefix1" used here before paired the first
    # assembly's files with assemblyId2.
    yname = obj.globals["assemblyFilePrefix2"]
    assemblyId1 = obj.globals["assemblyId1"]
    assemblyId2 = obj.globals["assemblyId2"]

    xIdx = IdxStore.IdxStore(xname, assemblyId1)
    yIdx = IdxStore.IdxStore(yname, assemblyId2)

    inpfile = obj.matches
    outfile = MyFile.myfile()
    main(inpfile, outfile, xIdx, yIdx)
    obj.matches = outfile
    obj.checkpoint(outname)
    outfile.close()
Esempio n. 15
0
def newmain():
    """Script entry point: read argv, process the matches, write a checkpoint.

    Arguments (from sys.argv):
      1 -- path of the ATAC file to load.
      2 -- name passed to obj.checkpoint() for the result.

    Raises IndexError when an argument is missing and KeyError when the
    ATAC file lacks one of the assembly globals read below.
    """
    inpname = sys.argv[1]
    outname = sys.argv[2]

    obj = AtacFile.AtacFile(inpname)
    xname = obj.globals["assemblyFilePrefix1"]
    # Fixed copy-paste slip: the Y index belongs to the second assembly,
    # so it needs "assemblyFilePrefix2" to go with assemblyId2.
    yname = obj.globals["assemblyFilePrefix2"]
    assemblyId1 = obj.globals["assemblyId1"]
    assemblyId2 = obj.globals["assemblyId2"]

    xIdx = IdxStore.IdxStore(xname,assemblyId1)
    yIdx = IdxStore.IdxStore(yname,assemblyId2)

    inpfile = obj.matches
    outfile = MyFile.myfile()
    main( inpfile, outfile, xIdx, yIdx)
    obj.matches = outfile
    obj.checkpoint(outname)
    outfile.close()
Esempio n. 16
0
def findCoverageIntervals(inpfile, outfile, processFirstAxis):
    """Derive the coverage intervals of one axis from an ATAC matches file.

    inpfile          -- ATAC matches file (input); rewound before reading.
    outfile          -- ATAC coverage intervals file (output); rewound on
                        entry and again on return.
    processFirstAxis -- selects the columns used from each match line:
                        true  -> fields[4] (axis), fields[5] (start),
                                 fields[6] (length);
                        false -> fields[8], fields[9], fields[10].

    Every "M" line of subtype "u" or "x" yields two event lines, +1 at its
    start and -1 at its end.  The events are sorted by the external
    sort(1) command and handed to findUniformCoverageIntervals.

    Raises AssertionError when the sort command returns a non-zero status.
    """
    inpfile.seek(0)
    outfile.seek(0)
    t0 = time.time()
    tmpfile3 = MyFile.myfile()
    for line in inpfile:
        if (line[0] == "M"):
            fields = line.split()
            if (fields[1] == "u" or fields[1] == "x"):
                if (processFirstAxis):
                    axis = fields[4]
                    bgn = int(fields[5])
                    end = bgn + int(fields[6])
                else:
                    axis = fields[8]
                    bgn = int(fields[9])
                    end = bgn + int(fields[10])
                print >> tmpfile3, "E", axis, bgn, 1
                print >> tmpfile3, "E", axis, end, -1
    tmpfile3.close()

    # tempfile.mktemp() only invents a name, leaving a window in which
    # another process can create that path; mkstemp() creates the file
    # itself.  The descriptor is closed at once because the shell redirect
    # below reopens the file by name.
    handle, tmpname = tempfile.mkstemp()
    os.close(handle)
    try:
        cmd = "sort -T . -k 1,1 -k 2,2 -k 3n -k 4nr  %s > %s" % (tmpfile3.name,
                                                                 tmpname)
        print >> sys.stderr, cmd
        iret = os.system(cmd)
        assert iret == 0, "sort command failed: " + cmd
        print >> sys.stderr, "time elapsed is ", (time.time() - t0)
        tmpfile4 = open(tmpname)
        try:
            t0 = time.time()
            findUniformCoverageIntervals(tmpfile4, outfile)
            print >> sys.stderr, "time elapsed is ", (time.time() - t0)
        finally:
            tmpfile4.close()
    finally:
        # Remove the sorted events even when a step above failed; a file
        # that is already gone is ignored, as "rm -f" did.
        try:
            os.remove(tmpname)
        except OSError:
            pass
    outfile.seek(0)
Esempio n. 17
0
def applyBothKeepMasks( inpfile, outfile ):
    """Apply the keep mask of the first axis, then that of the second axis.

    For each axis the coverage intervals of inpfile are computed, the
    matches are sorted along that axis and applyOneKeepMask is run.  The
    output of the first pass is the match input of the second pass, whose
    output goes to outfile.  Both files are rewound on entry.
    """

    # Maybe we can think of a masking implementation where each ATAC match
    # is treated atomically.  Assume that the keep mask intervals are sorted
    # by start position.  Assume that the ATAC matches are sorted by start
    # position.  Assert that all keep mask intervals are non-overlapping and
    # were cut from only one ATAC match.  Thus the mapping from keep mask
    # intervals is a function.  Note that this requires that we do not
    # coalesce abutting keep mask intervals that originate from multiple
    # matches.  Note this still allows an ATAC match to overlap more than
    # one keep mask interval.  Ignore all keep mask intervals with zero
    # length; their creation has tie breaking problems.  See notes on 2003
    # Jul 29.

    def dumpForDebug(source, number, rewind):
        # Copy source line by line into the file debugfile.<number>.
        if rewind:
            source.seek(0)
        sink = open("debugfile.%d" % number, "w")
        for text in source:
            print >>sink, text,

    debug = 0
    debugnum = 0
    inpfile.seek(0)
    outfile.seek(0)

    # Apply the keepMask for the first axis.
    # Make the sorted keep mask intervals for the first axis.
    onFirstAxis = 1
    keepMask = MyFile.myfile()
    unsorted = inpfile
    ordered = MyFile.myfile()
    masked = MyFile.myfile()

    findCoverageIntervals( inpfile, keepMask, onFirstAxis)
    if debug:
        debugnum += 1
        dumpForDebug(keepMask, debugnum, 0)

    MatchRecord.sortInXorderAP(unsorted,ordered)
    if debug:
        debugnum += 1
        dumpForDebug(ordered, debugnum, 1)

    applyOneKeepMask( ordered, masked, keepMask, onFirstAxis)
    if debug:
        debugnum += 1
        dumpForDebug(masked, debugnum, 1)

    # Apply the keepMask for the second axis.
    # Make the sorted keep mask intervals for the second axis.
    onFirstAxis = 0
    keepMask = MyFile.myfile()
    unsorted = masked
    ordered = MyFile.myfile()
    masked = outfile

    # The coverage intervals are computed from the original input again,
    # not from the output of the first pass.
    findCoverageIntervals( inpfile, keepMask, onFirstAxis)
    if debug:
        debugnum += 1
        dumpForDebug(keepMask, debugnum, 0)

    MatchRecord.sortInYorderAP(unsorted,ordered)
    if debug:
        debugnum += 1
        dumpForDebug(ordered, debugnum, 1)

    applyOneKeepMask( ordered, masked, keepMask, onFirstAxis)
    if debug:
        debugnum += 1
        dumpForDebug(masked, debugnum, 1)
Esempio n. 18
0
    def runOld(self):
        """Run the older ATAC pipeline on self.matches, one guarded stage at a time.

        Every stage increments ``step`` and runs only when ``redo`` is
        already set, or when the 'ckpKeep' global is smaller than the step
        number and the 'AllDone' global is absent.  Once one stage has run,
        ``redo`` stays 1, so every later stage runs too.  Most stages
        replace self.matches with their output, append a suffix to
        ``outprefix`` and call self.checkpoint(outprefix).

        Globals read: globalMatchMinSize, globalPerfectRunMinLen,
        globalPerfectRunMaxGapLen, assemblyId1/2, assemblyFile1/2, and the
        optional boxRecoveryOn, ckpKeep, uniqueFilterOn, fillIntraRunGapsOn,
        fillIntraRunGapsErate and fillIntraRunGapsMaxGap.
        """
        self.globals['atacAlgorithmVersion'] = str(17)
        print >>STDERR, "runName = %s\n" % self.runName

        # The ATAC globals used by this script:
        opt_t = int(self.globals['globalMatchMinSize'])
        opt_l = int(self.globals['globalPerfectRunMinLen'])
        maxdiff = int(self.globals['globalPerfectRunMaxGapLen'])

        assemblyId1 = self.globals['assemblyId1']
        assemblyId2 = self.globals['assemblyId2']

        assemblyFile1 = self.globals['assemblyFile1']
        assemblyFile2 = self.globals['assemblyFile2']

        boxRecoveryOn = 0  # Deprecated for same species comparisons 2003/09/09.
        if(self.globals.has_key("boxRecoveryOn")):
            boxRecoveryOn = int(self.globals['boxRecoveryOn'])
            
        t0 = time.time()

        assemblyIdx1 = IdxStore.IdxStore(assemblyFile1,assemblyId1)
        assemblyIdx2 = IdxStore.IdxStore(assemblyFile2,assemblyId2)
        rawfile = None
        
        ###################################################################
        # Setup for checkpointing scheme.        
        # redo: set to 1 by the first stage that runs; forces later stages.
        # keep: value of the 'ckpKeep' global; stages numbered <= keep are
        #       skipped until redo is set.
        # step: number of the stage being considered.
        redo = 0
        keep = 0
        step = 0
        if(self.globals.has_key("ckpKeep")):
            keep = int(self.globals['ckpKeep'])
        # While this key is present in the globals, no stage starts on its own.
        ckpName = "AllDone"
        ###################################################################

        print >>STDERR, 'Keep step=' + str(keep)
        print >>STDERR, 'At step=' + str(step)
        print >>STDERR, 'Time elapsed=' + str(time.time()-t0)

        # Accumulates one suffix per executed stage; used as checkpoint name.
        outprefix = self.runName

        step += 1
        print >>STDERR, 'At uniqueFilter, step=' + str(step)
        print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
        if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
            # redo is set even when the filter itself is switched off below.
            redo = 1
            # The filter is skipped only when 'uniqueFilterOn' is the string "0".
            if(not(self.globals.has_key('uniqueFilterOn') and self.globals['uniqueFilterOn']=="0")):
                print >>STDERR, 'Running UniqueFilter'
                outfile = MyFile.myfile()
                UniqueFilter.main( self.matches, outfile)
                self.matches = outfile
                outprefix += '.uniq'
                self.checkpoint(outprefix)

        step += 1
        print >>STDERR, 'At filterByMatchLength, step=' + str(step)
        print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
        if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
            redo = 1
            print >>STDERR, 'Running filterByMatchLength'
            outfile = MyFile.myfile()
            filterByMatchLength( self.matches, outfile, opt_t)
            self.matches = outfile
            outprefix += '.t' + str(opt_t)
            self.checkpoint(outprefix)

        step += 1
        print >>STDERR, 'At trimMatchOverlaps, step=' + str(step)
        print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
        if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
            redo = 1
            print >>STDERR, "Start trimming for bp one-to-one-ness"
            tempdata = MyFile.myfile()
            TrimMatchOverlaps.trimMatchOverlapsInBoth(self.matches,tempdata,'u')
            self.matches = tempdata
            print >>STDERR, "Finished trimming for bp one-to-one-ness"
            outprefix += '.trim'
            self.checkpoint(outprefix)

        if( boxRecoveryOn == 1 ):
            # For box recovery later ... but what if we start from a checkpoint?
            # rawfile is whatever self.matches holds at this point, whether
            # or not the stages above actually ran in this invocation.
            rawfile = self.matches

        step += 1
        print >>STDERR, 'At formPerfectRuns, step=' + str(step)
        print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
        if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
            redo = 1
            print >>STDERR, 'from ' + outprefix + ' making ' + outprefix + '.p6'
            tempdata = PerfectRuns.formPerfectRuns(self.matches,
                                                   MatchRecord.sortInXorderAP,
                                                   MatchRecord.sortInYorderAP,
                                                   maxdiff,
                                                   'r')
            self.matches = tempdata
            outprefix += ".p6"
            # NOTE(review): no self.checkpoint() call in this stage, unlike
            # most others -- confirm this is intended.
        # end if

        step += 1
        print >>STDERR, 'At onlyKeepLongRuns, step=' + str(step)
        print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
        if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
            redo = 1
            print >>STDERR, 'from ' + outprefix + ' making ' + outprefix + '.l' + str(opt_l)
            tempdata = onlyKeepLongRuns( self.matches, outprefix, opt_l)
            self.matches = tempdata
            outprefix += '.l' + str(opt_l)
            self.checkpoint(outprefix)

        step += 1
        print >>STDERR, 'At formPerfectRuns, step=' + str(step) 
        print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
        if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
            redo = 1
            print >>STDERR, 'Heal the perfect runs'
            # Same call as the earlier formPerfectRuns stage, but with the
            # Y sort first and the X sort second.
            tempdata = PerfectRuns.formPerfectRuns(self.matches,
                                       MatchRecord.sortInYorderAP,
                                       MatchRecord.sortInXorderAP, maxdiff, 'r')
            self.matches = tempdata
            outprefix += '.pr'
            self.checkpoint(outprefix)

        if(boxRecoveryOn == 1): 

            # This is a box recovery step.
            # The two stages in this branch consume step numbers only when
            # box recovery is on, so later step numbers depend on this flag.
            step += 1
            print >>STDERR, 'At boxRecovery, step=' + str(step) 
            print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
            if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
                redo = 1
                print >>STDERR, 'from ' + outprefix + ' making ' + outprefix + '.br'
                print >>STDERR, "Make sorted raw matches"
                outfile = MyFile.myfile()
                MatchRecord.sortInXorderAP( rawfile, outfile)
                rawfile = outfile
                print >>STDERR, "perform box recovery"
                tempdata = boxRecovery( self.matches, rawfile, outprefix)
                self.matches = tempdata
                outprefix += '.br'
                self.checkpoint(outprefix)
            # end if

            step += 1
            print >>STDERR, 'At formPerfectRuns, step=' + str(step)
            print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
            if (redo or ( (keep < step) and not self.globals.has_key(ckpName))):
                print >>STDERR, "form perfect runs"
                redo = 1
                # NOTE(review): the message announces '.p6' but the suffix
                # appended below is '.pr' -- confirm which one is meant.
                print >>STDERR, 'from ' + outprefix + ' to ' + outprefix + '.p6'
                tempdata = PerfectRuns.formPerfectRuns(self.matches,
                                       MatchRecord.sortInXorderAP,
                                       MatchRecord.sortInYorderAP, maxdiff, 'r')
                self.matches = tempdata
                outprefix += '.pr'
                self.checkpoint(outprefix)

        step += 1
        print >>STDERR, 'At squeezeIntraRunGaps, step=' + str(step)
        print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
        if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
            redo = 1
            print >>STDERR, 'from ' + outprefix + ' to ' + outprefix + '.sq'
            tempdata = MyFile.myfile()
            squeezeIntraRunGaps.mainLoop(
                self.matches,
                tempdata,
                assemblyIdx1, assemblyIdx2)
            tempy = MyFile.myfile()
            # Beware the current match subtypes are 'x', 'L', and 'R'!
            coalesceMatches( tempdata, tempy, 1)
            self.matches = tempy
            outprefix += '.sq'
            self.checkpoint(outprefix)

        step += 1
        print >>STDERR, 'At TrimMatchOverlaps, step=' + str(step)
        print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
        if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
            redo = 1
            print >>STDERR, "Start trimming for bp one-to-one-ness"
            tempdata = MyFile.myfile()
            TrimMatchOverlaps.trimMatchOverlapsInBoth(self.matches,tempdata,'u')
            self.matches = tempdata
            outprefix += '.trim'
            print >>STDERR, "Finished trimming for bp one-to-one-ness"
            # NOTE(review): no self.checkpoint() call in this stage either;
            # the next stage checkpoints under the extended prefix.

        step += 1
        print >>STDERR, 'At RunsAsMatches, step=' + str(step)
        print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
        if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
            redo = 1
            # This stage fills self.runs and leaves self.matches untouched.
            self.runs = PerfectRuns.runsAsMatches( self.matches)
            outprefix += '.runs'
            self.checkpoint(outprefix)
        # end if

        # The remaining stages run only when 'fillIntraRunGapsOn' is the
        # string "1".
        if(self.globals.has_key('fillIntraRunGapsOn') and self.globals['fillIntraRunGapsOn']=="1" ):
        
            # Next comes the DNA sequence dependent stuff.
            step += 1
            print >>STDERR, 'At fillIntraRunGaps, step=' + str(step)
            print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
            if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
                redo = 1
                print >>STDERR, "fill the intrarun gaps"
                # Defaults are stored in the globals themselves when absent.
                if(not self.globals.has_key('fillIntraRunGapsErate')):
                    self.globals['fillIntraRunGapsErate'] = 0.10
                if(not self.globals.has_key('fillIntraRunGapsMaxGap')):
                    self.globals['fillIntraRunGapsMaxGap'] = 100000
                fillIntraRunGapsErate = float(self.globals['fillIntraRunGapsErate'])
                fillIntraRunGapsMaxGap = int(self.globals['fillIntraRunGapsMaxGap'])
                tempdata = MyFile.myfile()
                fillIntraRunGaps.mainLoop(self.matches, tempdata,
                                          assemblyIdx1, assemblyIdx2,
                                          fillIntraRunGapsMaxGap, fillIntraRunGapsErate)
                self.matches = tempdata
                outprefix += '.fill'
                self.checkpoint(outprefix)

            step += 1
            print >>STDERR, 'At TrimMatchOverlaps, step=' + str(step)
            print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
            if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
                redo = 1
                print >>STDERR, "trim the overlaps"
                tempdata = MyFile.myfile()
                TrimMatchOverlaps.trimMatchOverlapsInBoth(self.matches,tempdata,'u')
                self.matches = tempdata
                outprefix += '.trim'
                self.checkpoint(outprefix)
Esempio n. 19
0
def runsAsMatches(inpfile):
    """Collapse every run of inpfile into a single match of subtype 'r'.

    Consecutive "M" lines sharing a runid form a run.  For each run the
    last match is rewritten to span from the smaller start to the larger
    end of the run's first and last matches on both axes; its matchid
    becomes the runid, its runid becomes ".", and runFill is set to the
    sum of the x_length values of the run.

    Returns a new MyFile.myfile() with one line per run.
    Raises AssertionError when the first and last match of a run differ
    in x_scaf_uid or y_scaf_uid.
    """
    outfile = MyFile.myfile()

    def emitRun(head, tail, fill):
        # Turn tail into the summary record of the run head..tail and print it.
        xLow1 = head.x_start
        xLow2 = tail.x_start
        startX = cvm(xLow1 < xLow2, xLow1, xLow2)
        xHigh1 = xLow1 + head.x_length
        xHigh2 = xLow2 + tail.x_length
        endX = cvm(xHigh1 > xHigh2, xHigh1, xHigh2)

        yLow1 = head.y_start
        yLow2 = tail.y_start
        startY = cvm(yLow1 < yLow2, yLow1, yLow2)
        yHigh1 = yLow1 + head.y_length
        yHigh2 = yLow2 + tail.y_length
        endY = cvm(yHigh1 > yHigh2, yHigh1, yHigh2)

        # All reads from head happen above: head and tail are the same
        # object when the run has a single match.
        tail.subtype = 'r'
        tail.matchid = tail.runid
        tail.runid = "."  # the agreed NULL value
        tail.x_start = startX
        tail.y_start = startY
        tail.x_length = endX - startX
        tail.y_length = endY - startY
        tail.runFill = fill
        print >> outfile, tail

    runHead = None
    previous = None
    runFill = 0
    inpfile.seek(0)
    for line in inpfile:
        if line[0] != 'M':
            continue
        current = MatchRecord.MatchRecord(line)
        if (previous == None) or (current.runid != previous.runid):
            # A new run starts here; close the one that just ended, if any.
            if previous != None:
                if runHead.x_scaf_uid != previous.x_scaf_uid:
                    print >> sys.stderr, runHead
                    print >> sys.stderr, previous
                assert (runHead.x_scaf_uid == previous.x_scaf_uid)
                assert (runHead.y_scaf_uid == previous.y_scaf_uid)
                emitRun(runHead, previous, runFill)
            runHead = current
            runFill = 0
        runFill += current.x_length
        previous = current

    # Close the run that was still open when the input ended.
    if None != previous:
        emitRun(runHead, previous, runFill)
    return outfile
Esempio n. 20
0
def applyBothKeepMasks(inpfile, outfile):
    """Mask the matches of inpfile on the first axis, then on the second.

    Each pass computes the coverage intervals of inpfile for its axis,
    sorts the current matches along that axis (X for the first pass, Y for
    the second) and runs applyOneKeepMask.  The first pass writes into a
    scratch file that feeds the second pass; the second pass writes into
    outfile.  Both inpfile and outfile are rewound on entry.
    """

    # Maybe we can think of a masking implementation where each ATAC match
    # is treated atomically.  Assume that the keep mask intervals are sorted
    # by start position.  Assume that the ATAC matches are sorted by start
    # position.  Assert that all keep mask intervals are non-overlapping and
    # were cut from only one ATAC match.  Thus the mapping from keep mask
    # intervals is a function.  Note that this requires that we do not
    # coalesce abutting keep mask intervals that originate from multiple
    # matches.  Note this still allows an ATAC match to overlap more than
    # one keep mask interval.  Ignore all keep mask intervals with zero
    # length; their creation has tie breaking problems.  See notes on 2003
    # Jul 29.

    debug = 0
    debugnum = 0
    inpfile.seek(0)
    outfile.seek(0)

    # The first pass reads the caller's input; every later pass reads the
    # output of the pass before it.
    source = inpfile
    for processFirstAxis in (1, 0):
        # Make the sorted keep mask intervals for this axis.
        keepMaskFile = MyFile.myfile()
        ordered = MyFile.myfile()
        if processFirstAxis:
            masked = MyFile.myfile()
        else:
            masked = outfile

        # The intervals always come from the original input file.
        findCoverageIntervals(inpfile, keepMaskFile, processFirstAxis)
        if debug:
            debugnum += 1
            debugfile = open("debugfile.%d" % debugnum, "w")
            for line in keepMaskFile:
                print >> debugfile, line,

        if processFirstAxis:
            MatchRecord.sortInXorderAP(source, ordered)
        else:
            MatchRecord.sortInYorderAP(source, ordered)
        if debug:
            ordered.seek(0)
            debugnum += 1
            debugfile = open("debugfile.%d" % debugnum, "w")
            for line in ordered:
                print >> debugfile, line,

        # Apply the keepMask for this axis.
        applyOneKeepMask(ordered, masked, keepMaskFile, processFirstAxis)
        if debug:
            masked.seek(0)
            debugnum += 1
            debugfile = open("debugfile.%d" % debugnum, "w")
            for line in masked:
                print >> debugfile, line,

        source = masked
Esempio n. 21
0
def findPerfectRuns(inpfile, maxJump, runIdPrefix):
    """Label every match record with a run id, opening a new run at each break.

    inpfile is rewound and read line by line.  Only lines that start with
    'M' are parsed (with MatchRecord.MatchRecord); every other line is
    dropped.  For each parsed record the 'srank' entry of its ``extend``
    mapping is read as an int and then deleted, and the record's ``runid``
    is set to ``runIdPrefix`` followed by the current run counter.

    The run counter starts at 1 and is incremented when, compared with the
    previous match, any of the following holds:
      * the x or the y scaffold uid differs,
      * the larger of the x-axis and y-axis gaps exceeds maxJump
        (gap == 0 is abutting, gap < 0 is overlapping),
      * the srank is not exactly the previous srank plus one.

    Returns a new MyFile.myfile() holding the relabelled records in input
    order.

    Raises AssertionError if a match has non-positive length on an axis, or
    if two consecutive matches on the same scaffold pair are ordered by
    neither their x starts nor their y starts.
    """
    outfile = MyFile.myfile()
    left = None      # previous match record, None until the first one is seen
    lastpr = None    # srank of the previous match
    runid = 1
    inpfile.seek(0)
    for line in inpfile:
        if not line.startswith('M'):
            continue
        right = MatchRecord.MatchRecord(line)
        pr = int(right.extend['srank'])
        del right.extend['srank']
        if left is not None:
            sameScaffolds = (left.x_scaf_uid == right.x_scaf_uid
                             and left.y_scaf_uid == right.y_scaf_uid)
            maxGapInXandY = 0
            if sameScaffolds:
                # Find the maximum of the gap in x and y axis.
                x_rs = right.x_start
                x_re = x_rs + right.x_length
                x_ls = left.x_start
                x_le = x_ls + left.x_length
                # All matches are positive length.
                assert (x_rs < x_re)
                assert (x_ls < x_le)
                x_gapLeftBeforeRight = x_rs - x_le
                x_gapRightBeforeLeft = x_ls - x_re
                assert (not (x_gapLeftBeforeRight > 0
                             and x_gapRightBeforeLeft > 0))
                # x_gap == 0 is abutting, x_gap < 0 is overlapping.
                x_gap = max(x_gapLeftBeforeRight, x_gapRightBeforeLeft)

                y_rs = right.y_start
                y_re = y_rs + right.y_length
                y_ls = left.y_start
                y_le = y_ls + left.y_length
                assert (y_rs < y_re)
                assert (y_ls < y_le)
                y_gapLeftBeforeRight = y_rs - y_le
                y_gapRightBeforeLeft = y_ls - y_re
                assert (not (y_gapLeftBeforeRight > 0
                             and y_gapRightBeforeLeft > 0))
                # y_gap == 0 is abutting, y_gap < 0 is overlapping.
                y_gap = max(y_gapLeftBeforeRight, y_gapRightBeforeLeft)

                maxGapInXandY = max(x_gap, y_gap)

                # Check the sorting of the matches.
                sorted_by_x = (x_ls <= x_rs)
                sorted_by_y = (y_ls <= y_rs)
                if (not (sorted_by_x or sorted_by_y)):
                    print >> sys.stderr, "bad sorting in findPerfectRuns"
                    print >> sys.stderr, left
                    print >> sys.stderr, right
                assert (sorted_by_x or sorted_by_y)

                # A dovetail means left both starts and ends no later than
                # right; otherwise right is contained in left on that axis.
                # The y test must compare the two *ends* (y_le, y_re), the
                # same way the x test does.
                dovetail_in_x = (x_ls <= x_rs) and (x_le <= x_re)
                dovetail_in_y = (y_ls <= y_rs) and (y_le <= y_re)
                if (sorted_by_x and not dovetail_in_x):
                    print >> sys.stderr, "contained in x in findPerfectRuns"
                    print >> sys.stderr, left
                    print >> sys.stderr, right
                if (sorted_by_y and not dovetail_in_y):
                    print >> sys.stderr, "contained in y in findPerfectRuns"
                    print >> sys.stderr, left
                    print >> sys.stderr, right

            # Using the signed rank NOT the run id !!!!
            if (not sameScaffolds
                    or maxGapInXandY > maxJump
                    or pr != lastpr + 1):
                runid += 1

        lastpr = pr
        # Assign the run id in the same slot as the signed rank.
        right.runid = "%s%d" % (runIdPrefix, runid)
        print >> outfile, right
        left = right
    return outfile
Esempio n. 22
0
def _emitRunAsMatch(outfile, firstF, lastF, runFill):
    # Rewrite lastF in place into a single 'r' record covering the run that
    # goes from firstF to lastF, then print it to outfile.
    # NOTE(review): cvm(cond, a, b) is defined elsewhere; presumably it
    # yields a when cond is true and b otherwise -- confirm.
    x1 = firstF.x_start
    x2 = lastF.x_start
    startX = cvm(x1 < x2, x1, x2)
    x1 += firstF.x_length
    x2 += lastF.x_length
    endX = cvm(x1 > x2, x1, x2)
    y1 = firstF.y_start
    y2 = lastF.y_start
    startY = cvm(y1 < y2, y1, y2)
    y1 += firstF.y_length
    y2 += lastF.y_length
    endY = cvm(y1 > y2, y1, y2)
    lastF.subtype = 'r'
    lastF.matchid = lastF.runid
    lastF.runid = "."   # the agreed NULL value
    lastF.x_start = startX
    lastF.y_start = startY
    lastF.x_length = endX - startX
    lastF.y_length = endY - startY
    lastF.runFill = runFill
    print >>outfile, lastF


def runsAsMatches(inpfile):
    """Collapse each group of consecutive same-runid matches into one record.

    inpfile is rewound and read line by line; only lines that start with
    'M' are parsed (with MatchRecord.MatchRecord).  Consecutive records
    sharing a ``runid`` form a run.  For every run one record is written:
    the last match of the run, modified so that

      * ``subtype`` is 'r',
      * ``matchid`` is the old run id and ``runid`` is ".",
      * the x/y start and length are derived from the starts and ends of
        the first and last match of the run,
      * ``runFill`` is the sum of ``x_length`` over the matches in the run.

    Returns a new MyFile.myfile() holding one record per run.

    Raises AssertionError if, when a run is closed by the start of the next
    one, its first and last match disagree on the x or y scaffold uid.  The
    final run of the file is not checked.
    """
    outfile = MyFile.myfile()
    lastF = None
    firstF = None
    runFill = 0
    inpfile.seek(0)
    for line in inpfile:
        if not line.startswith('M'):
            continue
        curF = MatchRecord.MatchRecord(line)
        if (lastF is None) or (curF.runid != lastF.runid):
            if lastF is not None:
                # Dump the offending pair before either assert fires.
                if ((firstF.x_scaf_uid != lastF.x_scaf_uid)
                        or (firstF.y_scaf_uid != lastF.y_scaf_uid)):
                    print >>sys.stderr, firstF
                    print >>sys.stderr, lastF
                assert(firstF.x_scaf_uid == lastF.x_scaf_uid)
                assert(firstF.y_scaf_uid == lastF.y_scaf_uid)
                _emitRunAsMatch(outfile, firstF, lastF, runFill)
            # curF opens a new run.
            firstF = curF
            runFill = 0
        runFill += curF.x_length
        lastF = curF

    # Flush the run that was still open when the input ended.
    if lastF is not None:
        _emitRunAsMatch(outfile, firstF, lastF, runFill)
    return outfile
Esempio n. 23
0
def findPerfectRuns ( inpfile, maxJump, runIdPrefix ):
    """Stamp a run id on every match, bumping the id wherever a run breaks.

    Reads inpfile from the beginning.  Lines that do not start with 'M' are
    skipped; the others are parsed with MatchRecord.MatchRecord.  The
    'srank' value in each record's ``extend`` mapping is converted to int
    and removed from the mapping, and ``runid`` is overwritten with
    runIdPrefix followed by the run counter.

    The counter begins at 1.  It is incremented before stamping a record
    when the record and its predecessor:
      * sit on a different x or y scaffold uid, or
      * are separated by more than maxJump on the x or the y axis
        (a gap of 0 is abutting, a negative gap is overlapping), or
      * do not have consecutive srank values (previous + 1).

    Returns a new MyFile.myfile() with the stamped records, in input order.

    Raises AssertionError when a match has non-positive x or y length, or
    when two consecutive same-scaffold matches are sorted by neither x start
    nor y start.
    """
    outfile = MyFile.myfile()
    left = None     # the previous match; None before the first 'M' line
    lastpr = None   # signed rank of the previous match
    runid = 1
    inpfile.seek(0)
    for line in inpfile:
        if not line.startswith('M'):
            continue
        right = MatchRecord.MatchRecord(line)
        pr = int(right.extend['srank'])
        del right.extend['srank']
        if left is not None:
            onSameScaffolds = (left.x_scaf_uid == right.x_scaf_uid
                               and
                               left.y_scaf_uid == right.y_scaf_uid)
            maxGapInXandY = 0
            if onSameScaffolds:
                # Find the maximum of the gap in x and y axis.

                x_rs = right.x_start
                x_re = x_rs + right.x_length
                x_ls = left.x_start
                x_le = x_ls + left.x_length
                # All matches are positive length.
                assert(x_rs < x_re)
                assert(x_ls < x_le)
                x_gapLeftBeforeRight = x_rs - x_le
                x_gapRightBeforeLeft = x_ls - x_re
                assert(not(x_gapLeftBeforeRight>0 and x_gapRightBeforeLeft>0))
                x_gap = max(x_gapLeftBeforeRight,x_gapRightBeforeLeft)
                # x_gap == 0 is abutting
                # x_gap < 0  is overlapping

                y_rs = right.y_start
                y_re = y_rs + right.y_length
                y_ls = left.y_start
                y_le = y_ls + left.y_length
                assert(y_rs < y_re)
                assert(y_ls < y_le)
                y_gapLeftBeforeRight = y_rs - y_le
                y_gapRightBeforeLeft = y_ls - y_re
                assert(not(y_gapLeftBeforeRight>0 and y_gapRightBeforeLeft>0))
                y_gap = max(y_gapLeftBeforeRight,y_gapRightBeforeLeft)
                # y_gap == 0 is abutting
                # y_gap < 0  is overlapping

                maxGapInXandY = max(x_gap,y_gap)

                # Check the sorting of the matches.
                sorted_by_x = (x_ls <= x_rs)
                sorted_by_y = (y_ls <= y_rs)

                if(not(sorted_by_x or sorted_by_y)):
                    print >>sys.stderr, "bad sorting in findPerfectRuns"
                    print >>sys.stderr, left
                    print >>sys.stderr, right
                assert(sorted_by_x or sorted_by_y)

                # Dovetail: left starts no later AND ends no later than
                # right.  The y check has to compare the interval ends
                # (y_le vs y_re) just as the x check does; comparing y_ls
                # with y_re is always true here and hides containment.
                dovetail_in_x = (x_ls <= x_rs) and (x_le <= x_re)
                dovetail_in_y = (y_ls <= y_rs) and (y_le <= y_re)
                if(sorted_by_x and not(dovetail_in_x)):
                    print >>sys.stderr, "contained in x in findPerfectRuns"
                    print >>sys.stderr, left
                    print >>sys.stderr, right
                if(sorted_by_y and not(dovetail_in_y)):
                    print >>sys.stderr, "contained in y in findPerfectRuns"
                    print >>sys.stderr, left
                    print >>sys.stderr, right
            # endif
            if(
                (not onSameScaffolds) or      # check both axis ids
                (maxGapInXandY > maxJump) or
                (pr != lastpr + 1)  # Using the signed rank NOT the run id !!!!
                ):
                runid += 1
            # end if
        # end if
        lastpr = pr
        # Assign the run id in the same slot as the signed rank.
        right.runid = "%s%d" % (runIdPrefix,runid,)
        print >>outfile, right
        left = right
    # end for
    return outfile