def trimMatchOverlapsInBoth(inpfile, outfile, trim_subtype):
    """Run the X-axis and then the Y-axis overlap trimming on inpfile.

    The pipeline is: sortInXorderAP -> coalesceMatches ->
    trimMatchOverlapsInX -> sortInYorderAP -> trimMatchOverlapsInY, with the
    last stage writing into outfile.  Every intermediate result is stored in
    a fresh MyFile.myfile().  Nothing is returned.
    """
    # coalesceMatches receives a true third argument only for the subtypes
    # 'x' and 'u'.
    coalesceFlag = trim_subtype in ('x', 'u')

    ping = MyFile.myfile()
    MatchRecord.sortInXorderAP(inpfile, ping)

    # The following coalescing assumes perfect runs.
    pong = MyFile.myfile()
    coalesceMatches(ping, pong, coalesceFlag)

    ping = MyFile.myfile()
    trimMatchOverlapsInX(pong, ping, trim_subtype)

    pong = MyFile.myfile()
    MatchRecord.sortInYorderAP(ping, pong)

    trimMatchOverlapsInY(pong, outfile, trim_subtype)
def trimMatchOverlapsInBoth(inpfile, outfile, trim_subtype):
    """Trim match overlaps along X and then along Y, writing to outfile.

    Stages, in order: sortInXorderAP, coalesceMatches, trimMatchOverlapsInX,
    sortInYorderAP, trimMatchOverlapsInY.  Each stage reads the previous
    stage's scratch file and writes into a new MyFile.myfile(); only the
    final stage writes into outfile.  Returns None.
    """
    # Only subtypes 'x' and 'u' switch on the third coalesceMatches argument.
    wantCoalesceFlag = trim_subtype in ('x', 'u')

    stageA = MyFile.myfile()
    MatchRecord.sortInXorderAP(inpfile, stageA)

    # The following coalescing assumes perfect runs.
    stageB = MyFile.myfile()
    coalesceMatches(stageA, stageB, wantCoalesceFlag)

    stageA = MyFile.myfile()
    trimMatchOverlapsInX(stageB, stageA, trim_subtype)

    stageB = MyFile.myfile()
    MatchRecord.sortInYorderAP(stageA, stageB)

    trimMatchOverlapsInY(stageB, outfile, trim_subtype)
def __init__(self, runName):
    """Load an ATAC file.

    runName is the path handed unchanged to open(); no suffix is appended.
    Every line of the file is passed to self.atac_file_parse_line() after
    the containers it may fill (comments, metacommands, globals, table
    format/data, matches, runs) have been created empty.
    """
    self.runName = runName
    self.comments = []
    self.metacommands = []
    self.globals = {}
    self.tableformat = {}
    self.tabledata = {}
    self.matches = MyFile.myfile()
    self.runs = MyFile.myfile()
    fp = open(runName, "r")
    # try/finally (rather than "with") keeps this valid on old Python 2
    # interpreters while still guaranteeing the handle is released, even
    # when parsing a line raises.
    try:
        for line in fp:
            self.atac_file_parse_line(line)
    finally:
        fp.close()
def findCoverageIntervals(inpfile, outfile, processFirstAxis):
    """Write the coverage intervals of an ATAC matches file to outfile.

    For every 'M' line whose subtype field is 'u' or 'x', a +1 event is
    emitted at the start and a -1 event at the end of the match on the
    selected axis (fields 4-6 when processFirstAxis is true, fields 8-10
    otherwise).  The events are sorted with the external sort(1) command and
    the sorted stream is passed to findUniformCoverageIntervals().

    Both files are rewound before processing and outfile is rewound again
    before returning.  Raises AssertionError if sort exits non-zero.
    """
    # The input file is an ATAC matches file.
    # The output file is an ATAC coverage intervals file.
    inpfile.seek(0)
    outfile.seek(0)
    t0 = time.time()

    events = MyFile.myfile()
    for line in inpfile:
        if line[0] == "M":
            fields = line.split()
            if fields[1] == "u" or fields[1] == "x":
                if processFirstAxis:
                    axis = fields[4]
                    bgn = int(fields[5])
                    end = bgn + int(fields[6])
                else:
                    axis = fields[8]
                    bgn = int(fields[9])
                    end = bgn + int(fields[10])
                print >>events, "E", axis, bgn, 1
                print >>events, "E", axis, end, -1
    events.close()

    # mkstemp() creates the scratch file atomically; mktemp() only returned a
    # name, which another process could claim before sort wrote to it.
    fd, tmpname = tempfile.mkstemp()
    os.close(fd)
    try:
        cmd = "sort -T . -k 1,1 -k 2,2 -k 3n -k 4nr %s > %s" % (events.name, tmpname)
        print >>sys.stderr, cmd
        iret = os.system(cmd)
        assert iret == 0, "sort exited with status %r" % (iret,)
        print >>sys.stderr, "time elapsed is ", (time.time() - t0)

        sortedEvents = open(tmpname)
        try:
            t0 = time.time()
            findUniformCoverageIntervals(sortedEvents, outfile)
            print >>sys.stderr, "time elapsed is ", (time.time() - t0)
        finally:
            sortedEvents.close()
    finally:
        # Always drop the scratch file, also on failure.  Errors are ignored
        # to match the "rm -f" this replaces.
        try:
            os.remove(tmpname)
        except OSError:
            pass
    outfile.seek(0)
def main(inpname, outname):
    """Load the ATAC file inpname, run mainLoop() on its matches and
    checkpoint the result under outname.

    The globals 'fillIntraRunGapsErate' and 'fillIntraRunGapsMaxGap' are
    given the defaults 0.10 and 100000 when the file does not define them.
    """
    obj = AtacFile.AtacFile(inpname)
    assemblyId1 = obj.globals['assemblyId1']
    assemblyId2 = obj.globals['assemblyId2']
    assemblyFilePrefix1 = obj.globals['assemblyFilePrefix1']
    assemblyFilePrefix2 = obj.globals['assemblyFilePrefix2']

    # Install the defaults for any gap-filling parameter that is missing.
    for key, default in (('fillIntraRunGapsErate', 0.10),
                         ('fillIntraRunGapsMaxGap', 100000)):
        if not obj.globals.has_key(key):
            obj.globals[key] = default
    fillIntraRunGapsErate = float(obj.globals['fillIntraRunGapsErate'])
    fillIntraRunGapsMaxGap = int(obj.globals['fillIntraRunGapsMaxGap'])

    # mismatches = checkExactMatches( x, y, inpfile)
    # sys.stderr.write("mismatches = %d\n" % mismatches)

    xIdx = IdxStore.IdxStore(assemblyFilePrefix1, assemblyId1)
    yIdx = IdxStore.IdxStore(assemblyFilePrefix2, assemblyId2)

    filled = MyFile.myfile()
    mainLoop(obj.matches, filled, xIdx, yIdx,
             fillIntraRunGapsMaxGap, fillIntraRunGapsErate)
    obj.matches = filled
    obj.checkpoint(outname)
def main(inpname, outname):
    """Read the ATAC file inpname, pass its matches through mainLoop() and
    write a checkpoint named outname.

    Missing 'fillIntraRunGapsErate' / 'fillIntraRunGapsMaxGap' globals are
    set to 0.10 and 100000 respectively before they are read back.
    """
    obj = AtacFile.AtacFile(inpname)
    assemblyId1 = obj.globals['assemblyId1']
    assemblyId2 = obj.globals['assemblyId2']
    assemblyFilePrefix1 = obj.globals['assemblyFilePrefix1']
    assemblyFilePrefix2 = obj.globals['assemblyFilePrefix2']

    # Fill in defaults for the gap-filling parameters the file left out.
    defaults = (
        ('fillIntraRunGapsErate', 0.10),
        ('fillIntraRunGapsMaxGap', 100000),
    )
    for name, value in defaults:
        if not obj.globals.has_key(name):
            obj.globals[name] = value
    fillIntraRunGapsErate = float(obj.globals['fillIntraRunGapsErate'])
    fillIntraRunGapsMaxGap = int(obj.globals['fillIntraRunGapsMaxGap'])

    # mismatches = checkExactMatches( x, y, inpfile)
    # sys.stderr.write("mismatches = %d\n" % mismatches)

    xIdx = IdxStore.IdxStore(assemblyFilePrefix1, assemblyId1)
    yIdx = IdxStore.IdxStore(assemblyFilePrefix2, assemblyId2)

    result = MyFile.myfile()
    mainLoop(obj.matches, result, xIdx, yIdx,
             fillIntraRunGapsMaxGap, fillIntraRunGapsErate)
    obj.matches = result
    obj.checkpoint(outname)
def addMyFile(crtFile, nrFiles):
    """Register crtFile as a new node labelled 'File'.

    Increments the module-level nrNodes counter, records the new node in
    Label, fileNodes, fileDict, fileIssues and nrFileIssues, appends a
    MyFile entry to files, and returns nrFiles + 1.
    """
    global nrNodes
    nrNodes += 1
    node = nrNodes
    fileCount = nrFiles + 1

    Label[node] = (crtFile, 'File')
    fileNodes.append(node)
    fileDict[crtFile] = node
    fileIssues[node] = {}
    nrFileIssues[node] = 0
    files.append(MyFile(fileCount, crtFile, 0, 0, 0, 0))
    return fileCount
def boxRecovery(inpfile, rawfile, outname):
    """Merge-like pass over the run matches (inpfile) and raw matches (rawfile).

    For each 'M' line of inpfile the raw stream is advanced until a raw match
    that is sameAs() the current match is found; the current match is then
    written out.  While advancing, raw matches are also written when the
    previous and current match are in the same run and the raw match
    isInsideBox() of the two; all other raw matches are discarded.

    The two input files must be sorted the same manner.  outname is not used.
    Returns a new MyFile.myfile() with the result.
    """
    inpfile.seek(0)
    rawfile.seek(0)
    outfile = MyFile.myfile()
    rawStream = iter(rawfile)  # shared across matches: never rewound

    prevMatch = None
    for line in inpfile:
        if line[0] != 'M':
            continue
        curMatch = MatchRecord.MatchRecord(line)
        # Decided once per match: inside a run we recover boxed raw matches,
        # between runs we only skip ahead.
        withinRun = (prevMatch != None and prevMatch.inSameRunAs(curMatch))
        for rawline in rawStream:
            if rawline[0] != 'M':
                continue
            rawMatch = MatchRecord.MatchRecord(rawline)
            if rawMatch.sameAs(curMatch):
                print >>outfile, curMatch
                break
            if withinRun and rawMatch.isInsideBox(prevMatch, curMatch):
                print >>outfile, rawMatch
        # We should die here if there is no rawMatch that matched curMatch ...
        prevMatch = curMatch
    return outfile
def formPerfectRuns(inpfile, firstSort, secondSort, maxJump, runIdPrefix):
    """Sort, rank, re-sort and split the matches of inpfile into runs.

    firstSort and secondSort are callables taking (source, destination).
    The steps are: firstSort, createSignedEnumeration, secondSort,
    findPerfectRuns(…, maxJump, runIdPrefix).  A progress line is written to
    stderr before each step.  Returns the file produced by findPerfectRuns.
    """
    def announce(step):
        print >>sys.stderr, 'formPerfectRuns step=' + str(step)

    inpfile.seek(0)

    announce(0)
    sortedFirst = MyFile.myfile()
    firstSort(inpfile, sortedFirst)

    announce(1)
    ranked = createSignedEnumeration(sortedFirst)

    announce(2)
    sortedSecond = MyFile.myfile()
    secondSort(ranked, sortedSecond)

    announce(3)
    return findPerfectRuns(sortedSecond, maxJump, runIdPrefix)
def formPerfectRuns(inpfile, firstSort, secondSort, maxJump, runIdPrefix):
    """Build runs from inpfile in four logged steps.

    Step 0 applies firstSort(source, destination), step 1 calls
    createSignedEnumeration, step 2 applies secondSort(source, destination)
    and step 3 calls findPerfectRuns with maxJump and runIdPrefix.  The step
    number is reported on stderr before each step.  Returns the
    findPerfectRuns output file.
    """
    def logStep(number):
        print >>sys.stderr, 'formPerfectRuns step=' + str(number)

    inpfile.seek(0)

    logStep(0)
    afterFirstSort = MyFile.myfile()
    firstSort(inpfile, afterFirstSort)

    logStep(1)
    enumerated = createSignedEnumeration(afterFirstSort)

    logStep(2)
    afterSecondSort = MyFile.myfile()
    secondSort(enumerated, afterSecondSort)

    logStep(3)
    return findPerfectRuns(afterSecondSort, maxJump, runIdPrefix)
def onlyKeepLongRuns(inpfile, outname, lengthThreshold):
    """Keep the matches of runs whose accumulated x_length reaches
    lengthThreshold.

    Matches are buffered while the running total for the current run is
    below the threshold.  Once the total reaches it, the buffer and every
    further match of that run are written to the result.  When the runid
    changes, whatever is still buffered goes to a rejects file that is
    closed and not returned.  outname is not used.

    Returns a new MyFile.myfile() holding the kept matches.
    """
    outfile = MyFile.myfile()
    rejectsfile = MyFile.myfile()
    previous = None
    pending = []       # matches of the current run not yet known to be kept
    runLength = 0      # summed x_length of the current run so far
    inpfile.seek(0)
    for line in inpfile:
        if line[0] != 'M':
            continue
        match = MatchRecord.MatchRecord(line)
        matchLength = match.x_length

        if previous != None and previous.runid != match.runid:
            # A new run starts: the old run never reached the threshold.
            for rejected in pending:
                print >>rejectsfile, rejected
            pending = []
            runLength = matchLength
        else:
            runLength += matchLength

        if runLength < lengthThreshold:
            pending.append(match)
        else:
            for kept in pending:
                print >>outfile, kept
            pending = []
            print >>outfile, match
        previous = match
    rejectsfile.close()
    return outfile
def createSignedEnumeration(inpfile):
    """Number the 'M' lines of inpfile 1, 2, 3, ... and store the number as
    extend['srank'] on each match.

    The value stored is cvm(x_orientation == y_orientation, rank, -rank).
    Returns a new MyFile.myfile() with the annotated matches; other lines
    are dropped.
    """
    outfile = MyFile.myfile()
    inpfile.seek(0)
    rank = 0
    for line in inpfile:
        if line[0] != 'M':
            continue
        rank += 1
        match = MatchRecord.MatchRecord(line)
        sameOrientation = (match.x_orientation == match.y_orientation)
        match.extend['srank'] = cvm(sameOrientation, rank, -rank)
        print >>outfile, match
    return outfile
def createSignedEnumeration(inpfile):
    """Attach a signed rank to every 'M' line of inpfile.

    Matches are counted from 1 in file order; each gets
    extend['srank'] = cvm(x_orientation == y_orientation, count, -count).
    Lines that do not start with 'M' are skipped.  Returns a new
    MyFile.myfile() containing the annotated matches.
    """
    outfile = MyFile.myfile()
    inpfile.seek(0)
    count = 0
    for line in inpfile:
        if line[0] != 'M':
            continue
        count += 1
        record = MatchRecord.MatchRecord(line)
        orientationsAgree = (record.x_orientation == record.y_orientation)
        record.extend['srank'] = cvm(orientationsAgree, count, -count)
        print >>outfile, record
    return outfile
def newmain():
    """Command-line driver.

    sys.argv[1] is the ATAC file to load and sys.argv[2] the name passed to
    obj.checkpoint().  An IdxStore is built for each axis from the file's
    assemblyFilePrefix/assemblyId globals, main() is run on the matches and
    the result replaces obj.matches before the checkpoint is written.
    """
    inpname = sys.argv[1]
    outname = sys.argv[2]
    obj = AtacFile.AtacFile(inpname)
    xname = obj.globals["assemblyFilePrefix1"]
    # The second axis needs its own prefix: it is paired with assemblyId2
    # below.  Reading assemblyFilePrefix1 here indexed the first assembly
    # twice.
    yname = obj.globals["assemblyFilePrefix2"]
    assemblyId1 = obj.globals["assemblyId1"]
    assemblyId2 = obj.globals["assemblyId2"]
    xIdx = IdxStore.IdxStore(xname, assemblyId1)
    yIdx = IdxStore.IdxStore(yname, assemblyId2)
    inpfile = obj.matches
    outfile = MyFile.myfile()
    main(inpfile, outfile, xIdx, yIdx)
    obj.matches = outfile
    obj.checkpoint(outname)
    outfile.close()
def newmain():
    """Command-line entry point.

    Loads the ATAC file named by sys.argv[1], builds one IdxStore per axis
    from the assemblyFilePrefix/assemblyId globals, runs main() on the
    matches, stores the result in obj.matches and checkpoints it under the
    name given in sys.argv[2].
    """
    inpname = sys.argv[1]
    outname = sys.argv[2]
    obj = AtacFile.AtacFile(inpname)
    xname = obj.globals["assemblyFilePrefix1"]
    # Was assemblyFilePrefix1: the y index must be built from the second
    # assembly's prefix to match assemblyId2.
    yname = obj.globals["assemblyFilePrefix2"]
    assemblyId1 = obj.globals["assemblyId1"]
    assemblyId2 = obj.globals["assemblyId2"]
    xIdx = IdxStore.IdxStore(xname, assemblyId1)
    yIdx = IdxStore.IdxStore(yname, assemblyId2)
    inpfile = obj.matches
    outfile = MyFile.myfile()
    main(inpfile, outfile, xIdx, yIdx)
    obj.matches = outfile
    obj.checkpoint(outname)
    outfile.close()
def findCoverageIntervals(inpfile, outfile, processFirstAxis):
    """Compute coverage intervals for one axis of an ATAC matches file.

    Each 'M' line with subtype 'u' or 'x' contributes a +1 event at its
    start and a -1 event at its end on the chosen axis (fields 4-6 if
    processFirstAxis is true, else fields 8-10).  The event list is sorted
    by the external sort(1) command and fed to
    findUniformCoverageIntervals(), which writes into outfile.

    inpfile and outfile are rewound on entry; outfile is rewound again on
    exit.  Raises AssertionError if the sort command fails.
    """
    # The input file is an ATAC matches file.
    # The output file is an ATAC coverage intervals file.
    inpfile.seek(0)
    outfile.seek(0)
    t0 = time.time()

    eventFile = MyFile.myfile()
    for line in inpfile:
        if line[0] == "M":
            fields = line.split()
            if fields[1] == "u" or fields[1] == "x":
                if processFirstAxis:
                    axis = fields[4]
                    bgn = int(fields[5])
                    end = bgn + int(fields[6])
                else:
                    axis = fields[8]
                    bgn = int(fields[9])
                    end = bgn + int(fields[10])
                print >>eventFile, "E", axis, bgn, 1
                print >>eventFile, "E", axis, end, -1
    eventFile.close()

    # tempfile.mktemp() only picks a name and is open to races; mkstemp()
    # creates the file itself.  The descriptor is closed right away because
    # sort writes to the file through shell redirection.
    fd, tmpname = tempfile.mkstemp()
    os.close(fd)
    try:
        cmd = "sort -T . -k 1,1 -k 2,2 -k 3n -k 4nr %s > %s" % (eventFile.name, tmpname)
        print >>sys.stderr, cmd
        iret = os.system(cmd)
        assert iret == 0, "sort exited with status %r" % (iret,)
        print >>sys.stderr, "time elapsed is ", (time.time() - t0)

        sortedFile = open(tmpname)
        try:
            t0 = time.time()
            findUniformCoverageIntervals(sortedFile, outfile)
            print >>sys.stderr, "time elapsed is ", (time.time() - t0)
        finally:
            sortedFile.close()
    finally:
        # Remove the scratch file on every exit path; ignore errors just as
        # the former "rm -f" did.
        try:
            os.remove(tmpname)
        except OSError:
            pass
    outfile.seek(0)
def applyBothKeepMasks(inpfile, outfile):
    """Apply the first-axis keep mask and then the second-axis keep mask.

    For each axis the coverage intervals of inpfile are computed with
    findCoverageIntervals(), the matches are sorted for that axis and
    applyOneKeepMask() writes the masked matches.  The first pass reads
    inpfile and writes a scratch file; the second pass reads that scratch
    file and writes outfile.  Returns None.
    """
    # Maybe we can think of a masking implementation where each ATAC match
    # is treated atomicly.  Assume that the keep mask intervals are sorted
    # by start postition.  Assume that the ATAC matches are sorted by start
    # postion.  Assert that all keep mask intervals are non-overlapping and
    # were cut from only one ATAC match.  Thus the mapping from keep mask
    # intervals is a function.  Note that this requires that we do not
    # coalesce abutting keep mask intervals that originate from multiple
    # matches.  Note this still allows an ATAC match to overlap more than
    # one keep mask interval.  Ignore all keep mask intervals with zero
    # length their creation has tie breaking problems.  See notes on 2003
    # Jul 29.
    debug = 0
    dumpCount = [0]  # one-element list so the nested helper can update it

    def dumpForDebug(stream, rewind):
        # Copy stream line by line into the next numbered debug file.
        if rewind:
            stream.seek(0)
        dumpCount[0] += 1
        debugfile = open("debugfile.%d" % dumpCount[0], "w")
        for line in stream:
            print >>debugfile, line,

    def maskOneAxis(unsorted, masked, sorter, processFirstAxis):
        # Make the sorted keep mask intervals for this axis and apply them.
        keepMaskFile = MyFile.myfile()
        ordered = MyFile.myfile()
        # The mask is always derived from the original inpfile, for both axes.
        findCoverageIntervals(inpfile, keepMaskFile, processFirstAxis)
        if debug:
            dumpForDebug(keepMaskFile, 0)
        sorter(unsorted, ordered)
        if debug:
            dumpForDebug(ordered, 1)
        applyOneKeepMask(ordered, masked, keepMaskFile, processFirstAxis)
        if debug:
            dumpForDebug(masked, 1)

    inpfile.seek(0)
    outfile.seek(0)

    # Apply the keepMask for the first axis.
    firstAxisResult = MyFile.myfile()
    maskOneAxis(inpfile, firstAxisResult, MatchRecord.sortInXorderAP, 1)

    # Apply the keepMask for the second axis.
    maskOneAxis(firstAxisResult, outfile, MatchRecord.sortInYorderAP, 0)
def runOld(self):
    # Checkpointed driver for the older chaining pipeline.  Each step
    # increments the step counter and logs to STDERR.  A step body runs when
    # redo is already set, or when step > ckpKeep and the global "AllDone"
    # is absent; running any step sets redo so all later steps run too.
    # Most executed steps replace self.matches, append a suffix to outprefix
    # and call self.checkpoint(outprefix).
    #
    # NOTE(review): this layout was rebuilt from a copy of the source in
    # which indentation had been lost.  The places where the nesting could
    # not be decided from the tokens alone carry their own NOTE(review).
    self.globals['atacAlgorithmVersion'] = str(17)
    print >>STDERR, "runName = %s\n" % self.runName

    # The ATAC globals used by this script:
    opt_t = int(self.globals['globalMatchMinSize'])
    opt_l = int(self.globals['globalPerfectRunMinLen'])
    maxdiff = int(self.globals['globalPerfectRunMaxGapLen'])
    assemblyId1 = self.globals['assemblyId1']
    assemblyId2 = self.globals['assemblyId2']
    assemblyFile1 = self.globals['assemblyFile1']
    assemblyFile2 = self.globals['assemblyFile2']
    boxRecoveryOn = 0 # Deprecated for same species comparisons 2003/09/09.
    if(self.globals.has_key("boxRecoveryOn")):
        boxRecoveryOn = int(self.globals['boxRecoveryOn'])

    t0 = time.time()
    assemblyIdx1 = IdxStore.IdxStore(assemblyFile1,assemblyId1)
    assemblyIdx2 = IdxStore.IdxStore(assemblyFile2,assemblyId2)
    rawfile = None

    ###################################################################
    # Setup for checkpointing scheme.
    redo = 0
    keep = 0
    step = 0
    if(self.globals.has_key("ckpKeep")):
        keep = int(self.globals['ckpKeep'])
    ckpName = "AllDone"
    ###################################################################

    print >>STDERR, 'Keep step=' + str(keep)
    print >>STDERR, 'At step=' + str(step)
    print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
    outprefix = self.runName

    step += 1
    print >>STDERR, 'At uniqueFilter, step=' + str(step)
    print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
    if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
        redo = 1
        # The filter is skipped only when uniqueFilterOn is explicitly "0".
        # NOTE(review): the suffix/checkpoint lines are assumed to belong to
        # this inner "if" -- confirm against the original layout.
        if(not(self.globals.has_key('uniqueFilterOn') and self.globals['uniqueFilterOn']=="0")):
            print >>STDERR, 'Running UniqueFilter'
            outfile = MyFile.myfile()
            UniqueFilter.main( self.matches, outfile)
            self.matches = outfile
            outprefix += '.uniq'
            self.checkpoint(outprefix)

    step += 1
    print >>STDERR, 'At filterByMatchLength, step=' + str(step)
    print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
    if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
        redo = 1
        print >>STDERR, 'Running filterByMatchLength'
        outfile = MyFile.myfile()
        filterByMatchLength( self.matches, outfile, opt_t)
        self.matches = outfile
        outprefix += '.t' + str(opt_t)
        self.checkpoint(outprefix)

    step += 1
    print >>STDERR, 'At trimMatchOverlaps, step=' + str(step)
    print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
    if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
        redo = 1
        print >>STDERR, "Start trimming for bp one-to-one-ness"
        tempdata = MyFile.myfile()
        TrimMatchOverlaps.trimMatchOverlapsInBoth(self.matches,tempdata,'u')
        self.matches = tempdata
        print >>STDERR, "Finished trimming for bp one-to-one-ness"
        outprefix += '.trim'
        self.checkpoint(outprefix)

    # NOTE(review): assumed to sit at method level (outside the redo block
    # above) -- confirm against the original layout.
    if( boxRecoveryOn == 1 ):
        # For box recovery later ... but what if we start from a checkpoint?
        rawfile = self.matches

    step += 1
    print >>STDERR, 'At formPerfectRuns, step=' + str(step)
    print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
    if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
        redo = 1
        print >>STDERR, 'from ' + outprefix + ' making ' + outprefix + '.p6'
        tempdata = PerfectRuns.formPerfectRuns(self.matches,
                                 MatchRecord.sortInXorderAP,
                                 MatchRecord.sortInYorderAP,
                                 maxdiff,
                                 'r')
        self.matches = tempdata
        # No checkpoint is written for this step.
        outprefix += ".p6"
    # end if

    step += 1
    print >>STDERR, 'At onlyKeepLongRuns, step=' + str(step)
    print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
    if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
        redo = 1
        print >>STDERR, 'from ' + outprefix + ' making ' + outprefix + '.l' + str(opt_l)
        tempdata = onlyKeepLongRuns( self.matches, outprefix, opt_l)
        self.matches = tempdata
        outprefix += '.l' + str(opt_l)
        self.checkpoint(outprefix)

    step += 1
    print >>STDERR, 'At formPerfectRuns, step=' + str(step)
    print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
    if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
        redo = 1
        print >>STDERR, 'Heal the perfect runs'
        # Same call as the earlier formPerfectRuns step but with the two
        # sort functions swapped (Y order first, then X order).
        tempdata = PerfectRuns.formPerfectRuns(self.matches,
                                 MatchRecord.sortInYorderAP,
                                 MatchRecord.sortInXorderAP,
                                 maxdiff,
                                 'r')
        self.matches = tempdata
        outprefix += '.pr'
        self.checkpoint(outprefix)

    if(boxRecoveryOn == 1):
        # This is a box recovery step.
        step += 1
        print >>STDERR, 'At boxRecovery, step=' + str(step)
        print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
        if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
            redo = 1
            print >>STDERR, 'from ' + outprefix + ' making ' + outprefix + '.br'
            print >>STDERR, "Make sorted raw matches"
            outfile = MyFile.myfile()
            # rawfile is still None here if it was never assigned above.
            MatchRecord.sortInXorderAP( rawfile, outfile)
            rawfile = outfile
            print >>STDERR, "perform box recovery"
            tempdata = boxRecovery( self.matches, rawfile, outprefix)
            self.matches = tempdata
            outprefix += '.br'
            self.checkpoint(outprefix)
    # end if

    # NOTE(review): this step is assumed to run regardless of boxRecoveryOn;
    # it may instead belong inside the box recovery block above -- confirm.
    step += 1
    print >>STDERR, 'At formPerfectRuns, step=' + str(step)
    print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
    if (redo or ( (keep < step) and not self.globals.has_key(ckpName))):
        print >>STDERR, "form perfect runs"
        redo = 1
        # The message names '.p6' but the suffix appended below is '.pr'.
        print >>STDERR, 'from ' + outprefix + ' to ' + outprefix + '.p6'
        tempdata = PerfectRuns.formPerfectRuns(self.matches,
                                 MatchRecord.sortInXorderAP,
                                 MatchRecord.sortInYorderAP,
                                 maxdiff,
                                 'r')
        self.matches = tempdata
        outprefix += '.pr'
        self.checkpoint(outprefix)

    step += 1
    print >>STDERR, 'At squeezeIntraRunGaps, step=' + str(step)
    print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
    if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
        redo = 1
        print >>STDERR, 'from ' + outprefix + ' to ' + outprefix + '.sq'
        tempdata = MyFile.myfile()
        squeezeIntraRunGaps.mainLoop(
            self.matches,
            tempdata,
            assemblyIdx1, assemblyIdx2)
        tempy = MyFile.myfile()
        # Beware the current match subtypes are 'x', 'L', and 'R'!
        coalesceMatches( tempdata, tempy, 1)
        self.matches = tempy
        outprefix += '.sq'
        self.checkpoint(outprefix)

    step += 1
    print >>STDERR, 'At TrimMatchOverlaps, step=' + str(step)
    print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
    if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
        redo = 1
        print >>STDERR, "Start trimming for bp one-to-one-ness"
        tempdata = MyFile.myfile()
        TrimMatchOverlaps.trimMatchOverlapsInBoth(self.matches,tempdata,'u')
        self.matches = tempdata
        # No checkpoint is written for this step.
        outprefix += '.trim'
        print >>STDERR, "Finished trimming for bp one-to-one-ness"

    step += 1
    print >>STDERR, 'At RunsAsMatches, step=' + str(step)
    print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
    if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
        redo = 1
        # This step fills self.runs; self.matches is left unchanged.
        self.runs = PerfectRuns.runsAsMatches( self.matches)
        outprefix += '.runs'
        self.checkpoint(outprefix)
    # end if

    if(self.globals.has_key('fillIntraRunGapsOn') and self.globals['fillIntraRunGapsOn']=="1" ):
        # Next comes the DNA sequence dependent stuff.
        step += 1
        print >>STDERR, 'At fillIntraRunGaps, step=' + str(step)
        print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
        if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
            redo = 1
            print >>STDERR, "fill the intrarun gaps"
            # Defaults are stored back into self.globals before being read.
            if(not self.globals.has_key('fillIntraRunGapsErate')):
                self.globals['fillIntraRunGapsErate'] = 0.10
            if(not self.globals.has_key('fillIntraRunGapsMaxGap')):
                self.globals['fillIntraRunGapsMaxGap'] = 100000
            fillIntraRunGapsErate = float(self.globals['fillIntraRunGapsErate'])
            fillIntraRunGapsMaxGap = int(self.globals['fillIntraRunGapsMaxGap'])
            tempdata = MyFile.myfile()
            fillIntraRunGaps.mainLoop(self.matches, tempdata,
                                      assemblyIdx1, assemblyIdx2,
                                      fillIntraRunGapsMaxGap, fillIntraRunGapsErate)
            self.matches = tempdata
            outprefix += '.fill'
            self.checkpoint(outprefix)

        # NOTE(review): assumed to be part of the fillIntraRunGapsOn block
        # (it re-trims after filling) -- confirm against the original layout.
        step += 1
        print >>STDERR, 'At TrimMatchOverlaps, step=' + str(step)
        print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
        if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
            redo = 1
            print >>STDERR, "trim the overlaps"
            tempdata = MyFile.myfile()
            TrimMatchOverlaps.trimMatchOverlapsInBoth(self.matches,tempdata,'u')
            self.matches = tempdata
            outprefix += '.trim'
            self.checkpoint(outprefix)
def runsAsMatches(inpfile):
    """Collapse every run of consecutive 'M' lines sharing a runid into one
    match record of subtype 'r'.

    The emitted record is the last match of the run, rewritten so that its
    x/y start and length span the first and the last match of the run, its
    matchid is the old runid, its runid is ".", and runFill is the sum of
    the x_length of all matches in the run.  Returns a new MyFile.myfile().
    """
    outfile = MyFile.myfile()

    def emitRun(head, tail, fill):
        # All reads of head/tail happen before tail is modified; head and
        # tail are the same object for a run of one match.
        xa = head.x_start
        xb = tail.x_start
        startX = cvm(xa < xb, xa, xb)
        xa += head.x_length
        xb += tail.x_length
        endX = cvm(xa > xb, xa, xb)

        ya = head.y_start
        yb = tail.y_start
        startY = cvm(ya < yb, ya, yb)
        ya += head.y_length
        yb += tail.y_length
        endY = cvm(ya > yb, ya, yb)

        tail.subtype = 'r'
        tail.matchid = tail.runid
        tail.runid = "."  # the agreed NULL value
        tail.x_start = startX
        tail.y_start = startY
        tail.x_length = endX - startX
        tail.y_length = endY - startY
        tail.runFill = fill
        print >>outfile, tail

    lastF = None
    firstF = None
    runFill = 0
    inpfile.seek(0)
    for line in inpfile:
        if line[0] != 'M':
            continue
        curF = MatchRecord.MatchRecord(line)
        if (lastF == None) or (curF.runid != lastF.runid):
            # A new run begins; close the previous one, if any.
            if (lastF != None) and (firstF.x_scaf_uid != lastF.x_scaf_uid):
                print >>sys.stderr, firstF
                print >>sys.stderr, lastF
            assert ((lastF == None) or (firstF.x_scaf_uid == lastF.x_scaf_uid))
            assert ((lastF == None) or (firstF.y_scaf_uid == lastF.y_scaf_uid))
            if None != lastF:
                emitRun(firstF, lastF, runFill)
            firstF = curF
            runFill = 0
        runFill += curF.x_length
        lastF = curF

    # Close the final run.
    if None != lastF:
        emitRun(firstF, lastF, runFill)
    return outfile
def applyBothKeepMasks(inpfile, outfile):
    """Mask the matches of inpfile on the first axis, then on the second.

    Each pass calls findCoverageIntervals() on inpfile to build the keep
    mask, sorts the pass input for the axis and lets applyOneKeepMask()
    write the masked matches.  Pass one reads inpfile and fills a scratch
    file; pass two reads that scratch file and fills outfile.  Returns None.
    """
    # Maybe we can think of a masking implementation where each ATAC match
    # is treated atomicly.  Assume that the keep mask intervals are sorted
    # by start postition.  Assume that the ATAC matches are sorted by start
    # postion.  Assert that all keep mask intervals are non-overlapping and
    # were cut from only one ATAC match.  Thus the mapping from keep mask
    # intervals is a function.  Note that this requires that we do not
    # coalesce abutting keep mask intervals that originate from multiple
    # matches.  Note this still allows an ATAC match to overlap more than
    # one keep mask interval.  Ignore all keep mask intervals with zero
    # length their creation has tie breaking problems.  See notes on 2003
    # Jul 29.
    debug = 0
    debugSerial = [0]  # boxed so the nested helper can advance it

    def writeDebugCopy(source, rewindFirst):
        # Dump source into the next "debugfile.N".
        if rewindFirst:
            source.seek(0)
        debugSerial[0] += 1
        debugfile = open("debugfile.%d" % debugSerial[0], "w")
        for line in source:
            print >>debugfile, line,

    def runPass(passInput, passOutput, sortForAxis, processFirstAxis):
        # Make the sorted keep mask intervals for one axis and apply them.
        keepMaskFile = MyFile.myfile()
        sortedInput = MyFile.myfile()
        # Both passes take their mask from the original inpfile.
        findCoverageIntervals(inpfile, keepMaskFile, processFirstAxis)
        if debug:
            writeDebugCopy(keepMaskFile, 0)
        sortForAxis(passInput, sortedInput)
        if debug:
            writeDebugCopy(sortedInput, 1)
        applyOneKeepMask(sortedInput, passOutput, keepMaskFile, processFirstAxis)
        if debug:
            writeDebugCopy(passOutput, 1)

    inpfile.seek(0)
    outfile.seek(0)

    # Apply the keepMask for the first axis.
    maskedOnFirstAxis = MyFile.myfile()
    runPass(inpfile, maskedOnFirstAxis, MatchRecord.sortInXorderAP, 1)

    # Apply the keepMask for the second axis.
    runPass(maskedOnFirstAxis, outfile, MatchRecord.sortInYorderAP, 0)
def findPerfectRuns(inpfile, maxJump, runIdPrefix):
    """Assign a run id to every 'M' line of inpfile.

    Each match must carry extend['srank'] (its signed rank); the entry is
    read and then deleted.  A match starts a new run when, compared with the
    previous match, it is on a different x or y scaffold, the larger of the
    x and y gaps exceeds maxJump, or its signed rank is not the previous
    rank plus one.  Diagnostics are written to stderr when two neighbouring
    matches are badly sorted or one contains the other.

    Returns a new MyFile.myfile() with each match's runid set to runIdPrefix
    followed by the run counter (starting at 1).
    """
    outfile = MyFile.myfile()
    left = None
    lastpr = None  # signed rank of the previous match, set after the first
    runid = 1
    inpfile.seek(0)
    for line in inpfile:
        if line[0] != 'M':
            continue
        right = MatchRecord.MatchRecord(line)
        pr = int(right.extend['srank'])
        del right.extend['srank']

        if left is not None:
            maxGapInXandY = 0
            if (left.x_scaf_uid == right.x_scaf_uid and
                    left.y_scaf_uid == right.y_scaf_uid):
                # Find the maximum of the gap in x and y axis.
                x_rs = right.x_start
                x_re = x_rs + right.x_length
                x_ls = left.x_start
                x_le = x_ls + left.x_length
                assert (x_rs < x_re)
                assert (x_ls < x_le)
                # All matches are positive length.
                x_gapLeftBeforeRight = x_rs - x_le
                x_gapRightBeforeLeft = x_ls - x_re
                assert (not (x_gapLeftBeforeRight > 0 and x_gapRightBeforeLeft > 0))
                x_gap = max(x_gapLeftBeforeRight, x_gapRightBeforeLeft)
                # x_gap == 0 is abutting
                # x_gap < 0 is overlapping

                y_rs = right.y_start
                y_re = y_rs + right.y_length
                y_ls = left.y_start
                y_le = y_ls + left.y_length
                assert (y_rs < y_re)
                assert (y_ls < y_le)
                y_gapLeftBeforeRight = y_rs - y_le
                y_gapRightBeforeLeft = y_ls - y_re
                assert (not (y_gapLeftBeforeRight > 0 and y_gapRightBeforeLeft > 0))
                y_gap = max(y_gapLeftBeforeRight, y_gapRightBeforeLeft)
                # y_gap == 0 is abutting
                # y_gap < 0 is overlapping

                maxGapInXandY = max(x_gap, y_gap)

                # Check the sorting of the matches.
                sorted_by_x = (x_ls <= x_rs)
                sorted_by_y = (y_ls <= y_rs)
                if not (sorted_by_x or sorted_by_y):
                    print >>sys.stderr, "bad sorting in findPerfectRuns"
                    print >>sys.stderr, left
                    print >>sys.stderr, right
                assert (sorted_by_x or sorted_by_y)

                dovetail_in_x = (x_ls <= x_rs) and (x_le <= x_re)
                # Compare the two END coordinates, as on the x axis.  The
                # previous test (y_ls <= y_re) was always true once the
                # matches were sorted by y, so containment in y was never
                # reported.
                dovetail_in_y = (y_ls <= y_rs) and (y_le <= y_re)
                if sorted_by_x and not dovetail_in_x:
                    print >>sys.stderr, "contained in x in findPerfectRuns"
                    print >>sys.stderr, left
                    print >>sys.stderr, right
                if sorted_by_y and not dovetail_in_y:
                    print >>sys.stderr, "contained in y in findPerfectRuns"
                    print >>sys.stderr, left
                    print >>sys.stderr, right

            if ((left.x_scaf_uid != right.x_scaf_uid) or   # check first axis id
                    (left.y_scaf_uid != right.y_scaf_uid) or   # check second axis id
                    (maxGapInXandY > maxJump) or
                    (pr != lastpr + 1)):   # Using the signed rank NOT the run id !!!!
                runid += 1

        lastpr = pr
        # Assign the run id in the same slot as the signed rank.
        right.runid = "%s%d" % (runIdPrefix, runid,)
        print >>outfile, right
        left = right
    return outfile
def runsAsMatches(inpfile):
    """Write one subtype-'r' record per run found in inpfile.

    Consecutive 'M' lines with the same runid form a run.  For each run the
    last match is rewritten to span the first and last match on both axes
    (start is the smaller start, length reaches the larger end), gets
    matchid = old runid, runid = ".", and runFill = summed x_length of the
    run's matches, and is printed.  Returns a new MyFile.myfile().
    """
    outfile = MyFile.myfile()

    def flushRun(runFirst, runLast, filled):
        # Read every coordinate before writing: runFirst and runLast are the
        # same object when the run holds a single match.
        lowX = runFirst.x_start
        highX = runLast.x_start
        startX = cvm(lowX < highX, lowX, highX)
        lowX += runFirst.x_length
        highX += runLast.x_length
        endX = cvm(lowX > highX, lowX, highX)

        lowY = runFirst.y_start
        highY = runLast.y_start
        startY = cvm(lowY < highY, lowY, highY)
        lowY += runFirst.y_length
        highY += runLast.y_length
        endY = cvm(lowY > highY, lowY, highY)

        runLast.subtype = 'r'
        runLast.matchid = runLast.runid
        runLast.runid = "."  # the agreed NULL value
        runLast.x_start = startX
        runLast.y_start = startY
        runLast.x_length = endX - startX
        runLast.y_length = endY - startY
        runLast.runFill = filled
        print >>outfile, runLast

    lastF = None
    firstF = None
    runFill = 0
    inpfile.seek(0)
    for line in inpfile:
        if line[0] != 'M':
            continue
        curF = MatchRecord.MatchRecord(line)
        if (lastF == None) or (curF.runid != lastF.runid):
            # The runid changed: finish the run collected so far.
            if (lastF != None) and (firstF.x_scaf_uid != lastF.x_scaf_uid):
                print >>sys.stderr, firstF
                print >>sys.stderr, lastF
            assert((lastF==None) or (firstF.x_scaf_uid == lastF.x_scaf_uid))
            assert((lastF==None) or (firstF.y_scaf_uid == lastF.y_scaf_uid))
            if None != lastF:
                flushRun(firstF, lastF, runFill)
            firstF = curF
            runFill = 0
        runFill += curF.x_length
        lastF = curF

    # The last run is still open when the input ends.
    if None != lastF:
        flushRun(firstF, lastF, runFill)
    return outfile
def findPerfectRuns(inpfile, maxJump, runIdPrefix):
    """Split the 'M' lines of inpfile into runs and label them.

    Every match is expected to have extend['srank'], which is read as an
    int and removed.  The run counter (starting at 1) is incremented when a
    match is on a different x or y scaffold than its predecessor, when the
    larger of the x and y gaps between them exceeds maxJump, or when its
    signed rank differs from the predecessor's rank plus one.  Sorting and
    containment problems between neighbours are reported on stderr.

    Returns a new MyFile.myfile() in which each match has
    runid = runIdPrefix + str(run counter).
    """
    outfile = MyFile.myfile()
    left = None
    lastpr = None  # signed rank of the previous match, set after the first
    runid = 1
    inpfile.seek(0)
    for line in inpfile:
        if line[0] != 'M':
            continue
        right = MatchRecord.MatchRecord(line)
        pr = int(right.extend['srank'])
        del right.extend['srank']

        if left is not None:
            maxGapInXandY = 0
            if (left.x_scaf_uid == right.x_scaf_uid and
                    left.y_scaf_uid == right.y_scaf_uid):
                # Find the maximum of the gap in x and y axis.
                x_rs = right.x_start
                x_re = x_rs + right.x_length
                x_ls = left.x_start
                x_le = x_ls + left.x_length
                assert(x_rs < x_re)
                assert(x_ls < x_le)
                # All matches are positive length.
                x_gapLeftBeforeRight = x_rs - x_le
                x_gapRightBeforeLeft = x_ls - x_re
                assert(not(x_gapLeftBeforeRight > 0 and x_gapRightBeforeLeft > 0))
                x_gap = max(x_gapLeftBeforeRight, x_gapRightBeforeLeft)
                # x_gap == 0 is abutting
                # x_gap < 0 is overlapping

                y_rs = right.y_start
                y_re = y_rs + right.y_length
                y_ls = left.y_start
                y_le = y_ls + left.y_length
                assert(y_rs < y_re)
                assert(y_ls < y_le)
                y_gapLeftBeforeRight = y_rs - y_le
                y_gapRightBeforeLeft = y_ls - y_re
                assert(not(y_gapLeftBeforeRight > 0 and y_gapRightBeforeLeft > 0))
                y_gap = max(y_gapLeftBeforeRight, y_gapRightBeforeLeft)
                # y_gap == 0 is abutting
                # y_gap < 0 is overlapping

                maxGapInXandY = max(x_gap, y_gap)

                # Check the sorting of the matches.
                sorted_by_x = (x_ls <= x_rs)
                sorted_by_y = (y_ls <= y_rs)
                if not (sorted_by_x or sorted_by_y):
                    print >>sys.stderr, "bad sorting in findPerfectRuns"
                    print >>sys.stderr, left
                    print >>sys.stderr, right
                assert(sorted_by_x or sorted_by_y)

                dovetail_in_x = (x_ls <= x_rs) and (x_le <= x_re)
                # The y test must mirror the x test and compare the ends
                # (y_le <= y_re).  It used to compare y_ls with y_re, which
                # holds for any y-sorted pair, so "contained in y" could
                # never be printed.
                dovetail_in_y = (y_ls <= y_rs) and (y_le <= y_re)
                if sorted_by_x and not dovetail_in_x:
                    print >>sys.stderr, "contained in x in findPerfectRuns"
                    print >>sys.stderr, left
                    print >>sys.stderr, right
                if sorted_by_y and not dovetail_in_y:
                    print >>sys.stderr, "contained in y in findPerfectRuns"
                    print >>sys.stderr, left
                    print >>sys.stderr, right

            if ((left.x_scaf_uid != right.x_scaf_uid) or   # check first axis id
                    (left.y_scaf_uid != right.y_scaf_uid) or   # check second axis id
                    (maxGapInXandY > maxJump) or
                    (pr != lastpr + 1)):   # Using the signed rank NOT the run id !!!!
                runid += 1

        lastpr = pr
        # Assign the run id in the same slot as the signed rank.
        right.runid = "%s%d" % (runIdPrefix, runid,)
        print >>outfile, right
        left = right
    return outfile