def trimMatchOverlapsInBoth(inpfile, outfile, trim_subtype):
    """Trim match overlaps in X, then in Y, writing the result to outfile.

    Stages, in order:
      1. MatchRecord.sortInXorderAP(inpfile, scratch)
      2. coalesceMatches(..., flag), where flag is true only when
         trim_subtype is 'x' or 'u'
      3. trimMatchOverlapsInX(..., trim_subtype)
      4. MatchRecord.sortInYorderAP(...)
      5. trimMatchOverlapsInY(..., outfile, trim_subtype)

    Every intermediate result lives in a fresh MyFile.myfile().  Nothing is
    returned.
    """
    # Two scratch names are used alternately, exactly like the original
    # gp/hp pair, so each intermediate file loses its last reference at the
    # same point in the pipeline.
    ping = MyFile.myfile()
    MatchRecord.sortInXorderAP(inpfile, ping)

    # The following coalescing assumes perfect runs.
    pong = MyFile.myfile()
    subtypeIsXorU = (trim_subtype == 'x') or (trim_subtype == 'u')
    coalesceMatches(ping, pong, subtypeIsXorU)

    ping = MyFile.myfile()
    trimMatchOverlapsInX(pong, ping, trim_subtype)

    pong = MyFile.myfile()
    MatchRecord.sortInYorderAP(ping, pong)

    trimMatchOverlapsInY(pong, outfile, trim_subtype)
def trimMatchOverlapsInBoth(inpfile, outfile, trim_subtype):
    """Run the X-then-Y overlap trimming pipeline from inpfile to outfile.

    The input is passed through MatchRecord.sortInXorderAP, coalesceMatches,
    trimMatchOverlapsInX, MatchRecord.sortInYorderAP and finally
    trimMatchOverlapsInY, which writes to outfile.  trim_subtype is forwarded
    to both trimming passes, and the third argument given to coalesceMatches
    is true exactly when trim_subtype equals 'x' or 'u'.  Returns None.
    """
    # stageA / stageB alternate as output and input of consecutive stages;
    # rebinding a name lets go of the scratch file that is no longer needed.
    stageA = MyFile.myfile()
    MatchRecord.sortInXorderAP(inpfile, stageA)

    # The following coalescing assumes perfect runs.
    stageB = MyFile.myfile()
    coalesceFlag = ((trim_subtype == 'x') or (trim_subtype == 'u'))
    coalesceMatches(stageA, stageB, coalesceFlag)

    stageA = MyFile.myfile()
    trimMatchOverlapsInX(stageB, stageA, trim_subtype)

    stageB = MyFile.myfile()
    MatchRecord.sortInYorderAP(stageA, stageB)

    trimMatchOverlapsInY(stageB, outfile, trim_subtype)
    return None
def applyBothKeepMasks(inpfile, outfile):
    """Apply the keep mask for the first axis, then for the second axis.

    Each pass does the same three things:
      * findCoverageIntervals(inpfile, keepMask, axisFlag) builds the keep
        mask intervals (always from the original inpfile),
      * the current matches are sorted with MatchRecord.sortInXorderAP
        (first axis) or MatchRecord.sortInYorderAP (second axis),
      * applyOneKeepMask(sorted, result, keepMask, axisFlag) writes the
        masked matches.
    The first pass reads inpfile and writes a scratch file; the second pass
    reads that scratch file and writes outfile.  Nothing is returned.

    Design notes from the original author:
    Maybe we can think of a masking implementation where each ATAC match is
    treated atomically.  Assume that the keep mask intervals are sorted by
    start position.  Assume that the ATAC matches are sorted by start
    position.  Assert that all keep mask intervals are non-overlapping and
    were cut from only one ATAC match.  Thus the mapping from keep mask
    intervals is a function.  Note that this requires that we do not
    coalesce abutting keep mask intervals that originate from multiple
    matches.  Note this still allows an ATAC match to overlap more than one
    keep mask interval.  Ignore all keep mask intervals with zero length;
    their creation has tie breaking problems.  See notes on 2003 Jul 29.
    """
    debug = 0      # set non-zero to dump every intermediate file
    debugnum = 0   # running suffix of the debugfile.<n> dumps

    inpfile.seek(0)
    outfile.seek(0)

    # (axis flag, sorter) for the first axis and then the second axis.
    axisPasses = (
        (1, MatchRecord.sortInXorderAP),
        (0, MatchRecord.sortInYorderAP),
    )

    currentMatches = inpfile
    for processFirstAxis, sortMatches in axisPasses:
        # Scratch files are created in the same order as before:
        # keep mask, sorted matches, then the pass output.
        keepMaskFile = MyFile.myfile()
        sortedMatches = MyFile.myfile()
        if processFirstAxis:
            maskedMatches = MyFile.myfile()
        else:
            # The second pass writes straight into the caller's file.
            maskedMatches = outfile

        # Make the sorted keep mask intervals for this axis.
        findCoverageIntervals(inpfile, keepMaskFile, processFirstAxis)
        if debug:
            debugnum += 1
            debugfile = open("debugfile.%d" % debugnum, "w")
            for line in keepMaskFile:
                print >>debugfile, line,

        sortMatches(currentMatches, sortedMatches)
        if debug:
            sortedMatches.seek(0)
            debugnum += 1
            debugfile = open("debugfile.%d" % debugnum, "w")
            for line in sortedMatches:
                print >>debugfile, line,

        applyOneKeepMask(sortedMatches, maskedMatches, keepMaskFile,
                         processFirstAxis)
        if debug:
            maskedMatches.seek(0)
            debugnum += 1
            debugfile = open("debugfile.%d" % debugnum, "w")
            for line in maskedMatches:
                print >>debugfile, line,

        # The output of this pass feeds the next one.
        currentMatches = maskedMatches
def runOld(self):
    """Run the older step-by-step match-processing pipeline with checkpoint skipping.

    Parameters are read from self.globals.  The pipeline starts from
    self.matches and rebinds self.matches to the output of every step that
    runs; the RunsAsMatches step stores its result in self.runs instead.
    Progress and elapsed wall-clock time are written to STDERR.

    Step skipping: each step increments ``step`` and its body runs when
    ``redo`` is already set, or when ``keep < step`` and self.globals has no
    "AllDone" key.  ``keep`` comes from self.globals['ckpKeep'] (default 0).
    The first step that runs sets ``redo = 1``, so every later step runs too.

    ``outprefix`` starts as self.runName and grows by one suffix per executed
    step; it is the argument handed to self.checkpoint().

    NOTE(review): the block nesting below was reconstructed from a
    whitespace-collapsed copy of this method (guided by the statement order
    and the "# end if" markers) -- confirm against the pristine file.
    """
    self.globals['atacAlgorithmVersion'] = str(17)
    print >>STDERR, "runName = %s\n" % self.runName

    # The ATAC globals used by this script:
    opt_t = int(self.globals['globalMatchMinSize'])
    opt_l = int(self.globals['globalPerfectRunMinLen'])
    maxdiff = int(self.globals['globalPerfectRunMaxGapLen'])

    assemblyId1 = self.globals['assemblyId1']
    assemblyId2 = self.globals['assemblyId2']
    assemblyFile1 = self.globals['assemblyFile1']
    assemblyFile2 = self.globals['assemblyFile2']

    boxRecoveryOn = 0  # Deprecated for same species comparisons 2003/09/09.
    if(self.globals.has_key("boxRecoveryOn")):
        boxRecoveryOn = int(self.globals['boxRecoveryOn'])

    # Reference point for every "Time elapsed" message below.
    t0 = time.time()

    assemblyIdx1 = IdxStore.IdxStore(assemblyFile1,assemblyId1)
    assemblyIdx2 = IdxStore.IdxStore(assemblyFile2,assemblyId2)
    # Only assigned later when boxRecoveryOn == 1; consumed by the
    # box recovery step.
    rawfile = None

    ###################################################################
    # Setup for checkpointing scheme.
    redo = 0   # becomes 1 as soon as one step has actually been executed
    keep = 0   # steps numbered <= keep are skipped unless redo is set
    step = 0   # running step counter
    if(self.globals.has_key("ckpKeep")):
        keep = int(self.globals['ckpKeep'])
    # While this key is present in self.globals, steps are skipped unless
    # redo is set.
    ckpName = "AllDone"
    ###################################################################

    print >>STDERR, 'Keep step=' + str(keep)
    print >>STDERR, 'At step=' + str(step)
    print >>STDERR, 'Time elapsed=' + str(time.time()-t0)

    outprefix = self.runName

    # ---- uniqueFilter ----
    step += 1
    print >>STDERR, 'At uniqueFilter, step=' + str(step)
    print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
    if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
        redo = 1
        # The filter runs unless uniqueFilterOn is explicitly the string "0".
        if(not(self.globals.has_key('uniqueFilterOn') and self.globals['uniqueFilterOn']=="0")):
            print >>STDERR, 'Running UniqueFilter'
            outfile = MyFile.myfile()
            UniqueFilter.main( self.matches, outfile)
            self.matches = outfile
            outprefix += '.uniq'
            self.checkpoint(outprefix)

    # ---- filterByMatchLength (threshold opt_t) ----
    step += 1
    print >>STDERR, 'At filterByMatchLength, step=' + str(step)
    print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
    if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
        redo = 1
        print >>STDERR, 'Running filterByMatchLength'
        outfile = MyFile.myfile()
        filterByMatchLength( self.matches, outfile, opt_t)
        self.matches = outfile
        outprefix += '.t' + str(opt_t)
        self.checkpoint(outprefix)

    # ---- first overlap trimming ----
    step += 1
    print >>STDERR, 'At trimMatchOverlaps, step=' + str(step)
    print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
    if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
        redo = 1
        print >>STDERR, "Start trimming for bp one-to-one-ness"
        tempdata = MyFile.myfile()
        TrimMatchOverlaps.trimMatchOverlapsInBoth(self.matches,tempdata,'u')
        self.matches = tempdata
        print >>STDERR, "Finished trimming for bp one-to-one-ness"
        outprefix += '.trim'
        self.checkpoint(outprefix)

    if( boxRecoveryOn == 1 ):
        # For box recovery later ... but what if we start from a checkpoint?
        # NOTE(review): whether this assignment sits inside or outside the
        # trimming guard above cannot be told from the collapsed source;
        # if it is skipped, rawfile is still None at the box recovery step.
        rawfile = self.matches

    # ---- formPerfectRuns (X sorter first, then Y sorter) ----
    step += 1
    print >>STDERR, 'At formPerfectRuns, step=' + str(step)
    print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
    if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
        redo = 1
        print >>STDERR, 'from ' + outprefix + ' making ' + outprefix + '.p6'
        tempdata = PerfectRuns.formPerfectRuns(self.matches, MatchRecord.sortInXorderAP, MatchRecord.sortInYorderAP, maxdiff, 'r')
        self.matches = tempdata
        outprefix += ".p6"
        # This step does not call self.checkpoint().
    # end if

    # ---- onlyKeepLongRuns (threshold opt_l) ----
    step += 1
    print >>STDERR, 'At onlyKeepLongRuns, step=' + str(step)
    print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
    if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
        redo = 1
        print >>STDERR, 'from ' + outprefix + ' making ' + outprefix + '.l' + str(opt_l)
        tempdata = onlyKeepLongRuns( self.matches, outprefix, opt_l)
        self.matches = tempdata
        outprefix += '.l' + str(opt_l)
        self.checkpoint(outprefix)

    # ---- formPerfectRuns again, this time Y sorter first, then X sorter ----
    step += 1
    print >>STDERR, 'At formPerfectRuns, step=' + str(step)
    print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
    if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
        redo = 1
        print >>STDERR, 'Heal the perfect runs'
        tempdata = PerfectRuns.formPerfectRuns(self.matches, MatchRecord.sortInYorderAP, MatchRecord.sortInXorderAP, maxdiff, 'r')
        self.matches = tempdata
        outprefix += '.pr'
        self.checkpoint(outprefix)

    if(boxRecoveryOn == 1):
        # This is a box recovery step.
        # The step counter only advances here when box recovery is enabled,
        # so the numbers of all later steps depend on boxRecoveryOn.
        step += 1
        print >>STDERR, 'At boxRecovery, step=' + str(step)
        print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
        if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
            redo = 1
            print >>STDERR, 'from ' + outprefix + ' making ' + outprefix + '.br'
            print >>STDERR, "Make sorted raw matches"
            outfile = MyFile.myfile()
            # NOTE(review): rawfile may still be None here if it was never
            # assigned above -- confirm the resume-from-checkpoint path.
            MatchRecord.sortInXorderAP( rawfile, outfile)
            rawfile = outfile
            print >>STDERR, "perform box recovery"
            tempdata = boxRecovery( self.matches, rawfile, outprefix)
            self.matches = tempdata
            outprefix += '.br'
            self.checkpoint(outprefix)
    # end if

    # ---- formPerfectRuns (X sorter first, then Y sorter) ----
    step += 1
    print >>STDERR, 'At formPerfectRuns, step=' + str(step)
    print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
    if (redo or ( (keep < step) and not self.globals.has_key(ckpName))):
        print >>STDERR, "form perfect runs"
        redo = 1
        # NOTE(review): the message announces a '.p6' suffix but the suffix
        # actually appended below is '.pr'.
        print >>STDERR, 'from ' + outprefix + ' to ' + outprefix + '.p6'
        tempdata = PerfectRuns.formPerfectRuns(self.matches, MatchRecord.sortInXorderAP, MatchRecord.sortInYorderAP, maxdiff, 'r')
        self.matches = tempdata
        outprefix += '.pr'
        self.checkpoint(outprefix)

    # ---- squeezeIntraRunGaps, followed by coalescing ----
    step += 1
    print >>STDERR, 'At squeezeIntraRunGaps, step=' + str(step)
    print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
    if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
        redo = 1
        print >>STDERR, 'from ' + outprefix + ' to ' + outprefix + '.sq'
        tempdata = MyFile.myfile()
        squeezeIntraRunGaps.mainLoop( self.matches, tempdata, assemblyIdx1, assemblyIdx2)
        tempy = MyFile.myfile()
        # Beware the current match subtypes are 'x', 'L', and 'R'!
        coalesceMatches( tempdata, tempy, 1)
        self.matches = tempy
        outprefix += '.sq'
        self.checkpoint(outprefix)

    # ---- second overlap trimming ----
    step += 1
    print >>STDERR, 'At TrimMatchOverlaps, step=' + str(step)
    print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
    if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
        redo = 1
        print >>STDERR, "Start trimming for bp one-to-one-ness"
        tempdata = MyFile.myfile()
        TrimMatchOverlaps.trimMatchOverlapsInBoth(self.matches,tempdata,'u')
        self.matches = tempdata
        outprefix += '.trim'
        # No self.checkpoint() call in this step; the next step checkpoints
        # with the extended prefix.
        print >>STDERR, "Finished trimming for bp one-to-one-ness"

    # ---- RunsAsMatches: result goes to self.runs, self.matches is kept ----
    step += 1
    print >>STDERR, 'At RunsAsMatches, step=' + str(step)
    print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
    if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
        redo = 1
        self.runs = PerfectRuns.runsAsMatches( self.matches)
        outprefix += '.runs'
        self.checkpoint(outprefix)
    # end if

    # Optional stage, enabled only when fillIntraRunGapsOn is the string "1".
    if(self.globals.has_key('fillIntraRunGapsOn') and self.globals['fillIntraRunGapsOn']=="1" ):
        # Next comes the DNA sequence dependent stuff.
        step += 1
        print >>STDERR, 'At fillIntraRunGaps, step=' + str(step)
        print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
        if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
            redo = 1
            print >>STDERR, "fill the intrarun gaps"
            # Missing settings get their defaults written back into
            # self.globals (as numbers, not strings) before being read.
            if(not self.globals.has_key('fillIntraRunGapsErate')):
                self.globals['fillIntraRunGapsErate'] = 0.10
            if(not self.globals.has_key('fillIntraRunGapsMaxGap')):
                self.globals['fillIntraRunGapsMaxGap'] = 100000
            fillIntraRunGapsErate = float(self.globals['fillIntraRunGapsErate'])
            fillIntraRunGapsMaxGap = int(self.globals['fillIntraRunGapsMaxGap'])
            tempdata = MyFile.myfile()
            fillIntraRunGaps.mainLoop(self.matches, tempdata, assemblyIdx1, assemblyIdx2, fillIntraRunGapsMaxGap, fillIntraRunGapsErate)
            self.matches = tempdata
            outprefix += '.fill'
            self.checkpoint(outprefix)

        # NOTE(review): this final trimming step is assumed to belong to the
        # fillIntraRunGapsOn branch -- confirm against the pristine file.
        step += 1
        print >>STDERR, 'At TrimMatchOverlaps, step=' + str(step)
        print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
        if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
            redo = 1
            print >>STDERR, "trim the overlaps"
            tempdata = MyFile.myfile()
            TrimMatchOverlaps.trimMatchOverlapsInBoth(self.matches,tempdata,'u')
            self.matches = tempdata
            outprefix += '.trim'
            self.checkpoint(outprefix)
def applyBothKeepMasks(inpfile, outfile):
    """Mask the matches of inpfile on both axes and write them to outfile.

    Two passes are made, first with processFirstAxis = 1 and then with
    processFirstAxis = 0.  In each pass findCoverageIntervals() derives the
    keep mask intervals from the original inpfile, the matches are sorted
    (MatchRecord.sortInXorderAP in the first pass,
    MatchRecord.sortInYorderAP in the second) and applyOneKeepMask() is run
    on the sorted matches.  The output of the first pass is a scratch file
    that becomes the input of the second pass, whose output is outfile.
    Returns None.
    """
    # Notes from the original author:
    # Maybe we can think of a masking implementation where each ATAC match
    # is treated atomically.  Assume that the keep mask intervals are sorted
    # by start position.  Assume that the ATAC matches are sorted by start
    # position.  Assert that all keep mask intervals are non-overlapping and
    # were cut from only one ATAC match.  Thus the mapping from keep mask
    # intervals is a function.  Note that this requires that we do not
    # coalesce abutting keep mask intervals that originate from multiple
    # matches.  Note this still allows an ATAC match to overlap more than
    # one keep mask interval.  Ignore all keep mask intervals with zero
    # length; their creation has tie breaking problems.  See notes on 2003
    # Jul 29.

    debug = 0      # flip to a true value to write debugfile.<n> dumps
    debugnum = 0   # number used in the name of the latest dump

    inpfile.seek(0)
    outfile.seek(0)

    passInput = inpfile
    for axisFlag, sorter in ((1, MatchRecord.sortInXorderAP),
                             (0, MatchRecord.sortInYorderAP)):
        # Same allocation order as the unrolled code: mask, sorted, output.
        maskIntervals = MyFile.myfile()
        sortedInput = MyFile.myfile()
        if axisFlag:
            passOutput = MyFile.myfile()
        else:
            # Last pass: write directly to the caller-supplied file.
            passOutput = outfile

        # Make the sorted keep mask intervals for this axis.
        findCoverageIntervals(inpfile, maskIntervals, axisFlag)
        if debug:
            debugnum += 1
            debugfile = open("debugfile.%d" % debugnum, "w")
            for line in maskIntervals:
                print >>debugfile, line,

        sorter(passInput, sortedInput)
        if debug:
            sortedInput.seek(0)
            debugnum += 1
            debugfile = open("debugfile.%d" % debugnum, "w")
            for line in sortedInput:
                print >>debugfile, line,

        # Apply the keep mask for this axis.
        applyOneKeepMask(sortedInput, passOutput, maskIntervals, axisFlag)
        if debug:
            passOutput.seek(0)
            debugnum += 1
            debugfile = open("debugfile.%d" % debugnum, "w")
            for line in passOutput:
                print >>debugfile, line,

        # Chain the passes: this output is the next pass's input.
        passInput = passOutput