def trimMatchOverlapsInBoth(inpfile, outfile, trim_subtype):
    """Trim match overlaps along X and then along Y, writing to outfile.

    Stages, each writing into a fresh MyFile.myfile() scratch file:
    sort in X order, coalesce, trim in X, sort in Y order, trim in Y.

    inpfile      -- match file to read
    outfile      -- receives the output of the final trimMatchOverlapsInY
    trim_subtype -- subtype handed to both trimming stages; 'x' and 'u'
                    additionally switch on the shared-diagonal requirement
                    of coalesceMatches
    """
    share_diagonal = trim_subtype in ('x', 'u')

    x_sorted = MyFile.myfile()
    MatchRecord.sortInXorderAP(inpfile, x_sorted)

    # The following coalescing assumes perfect runs.
    coalesced = MyFile.myfile()
    coalesceMatches(x_sorted, coalesced, share_diagonal)
    del x_sorted  # scratch file no longer needed

    x_trimmed = MyFile.myfile()
    trimMatchOverlapsInX(coalesced, x_trimmed, trim_subtype)
    del coalesced

    y_sorted = MyFile.myfile()
    MatchRecord.sortInYorderAP(x_trimmed, y_sorted)
    del x_trimmed

    trimMatchOverlapsInY(y_sorted, outfile, trim_subtype)
    return
def trimMatchOverlapsInBoth(inpfile, outfile, trim_subtype):
    """Sort, coalesce and trim the matches of inpfile in X and then in Y.

    Intermediate results live in MyFile.myfile() scratch files; the final
    Y-trimmed matches are written to outfile.  For trim_subtype 'x' or 'u'
    the coalescing step is asked to require a shared diagonal.
    """
    # Two scratch handles are rotated: 'src' feeds a stage, 'dst' receives it.
    src = MyFile.myfile()
    MatchRecord.sortInXorderAP(inpfile, src)

    # The following coalescing assumes perfect runs.
    dst = MyFile.myfile()
    needs_diagonal = (trim_subtype == 'x') or (trim_subtype == 'u')
    coalesceMatches(src, dst, needs_diagonal)

    src, dst = dst, MyFile.myfile()
    trimMatchOverlapsInX(src, dst, trim_subtype)

    src, dst = dst, MyFile.myfile()
    MatchRecord.sortInYorderAP(src, dst)

    trimMatchOverlapsInY(dst, outfile, trim_subtype)
    return
def boxRecovery(inpfile, rawfile, outname):
    """Merge raw matches back into the boxes between consecutive run-mates.

    Both files are read from the start and must be sorted the same way.
    For every match of inpfile the raw file is advanced until the raw
    record that is sameAs() that match is found; that match is then
    written.  Raw records passed on the way are written only when the
    previous and current match are in the same run and the raw record
    isInsideBox(previous, current); otherwise they are discarded.

    outname is accepted but not used.  Returns a new MyFile.myfile()
    holding the output.
    """
    inpfile.seek(0)
    rawfile.seek(0)
    outfile = MyFile.myfile()
    rawRecords = iter(rawfile)  # shared cursor: never rewound between matches

    previous = None
    for text in inpfile:
        if text[0] != 'M':
            continue
        current = MatchRecord.MatchRecord(text)

        # Decided once per match: only gaps inside a run are recovered.
        recovering = (previous != None and previous.inSameRunAs(current))

        for rawText in rawRecords:
            if rawText[0] != 'M':
                continue
            candidate = MatchRecord.MatchRecord(rawText)
            if candidate.sameAs(current):
                print >>outfile, current
                break
            if recovering and candidate.isInsideBox(previous, current):
                print >>outfile, candidate
        # NOTE: nothing is reported when the raw file runs out before a
        # record matching 'current' is found.

        previous = current

    return outfile
def mainLoop(inpfile, outfile, xIdx, yIdx, maxgap, erate):
    """Walk consecutive 'u' matches and hand every adjacent pair to analyzeGap.

    The first match record of inpfile is copied to outfile unchanged.
    Every later match whose subtype is 'u' is paired with the previously
    emitted match, passed to analyzeGap (which also receives outfile), and
    then written out itself.  Progress counters go to sys.stderr.

    NOTE(review): the source of this function was flattened; the pairing
    logic is taken to be nested under the subtype == 'u' test -- confirm.
    """
    margin = 20  # This should be set by an ATAC global.

    inpfile.seek(0)
    records = iter(inpfile)
    sys.stderr.write("begin\n")

    written = 0
    gapTotal = 0

    # Prime 'left' with the first match record.
    left = None
    for text in records:
        if text[0] == 'M':
            left = MatchRecord.MatchRecord(text)
            print >> outfile, left
            written += 1
            break
    sys.stderr.write("countLines=%d\n" % written)

    for text in records:
        if text[0] != 'M':
            continue
        candidate = MatchRecord.MatchRecord(text)
        if candidate.subtype != 'u':
            continue
        right = candidate

        (gapCount, ) = analyzeGap(xIdx, yIdx, left, right, outfile,
                                  maxgap, erate, margin)
        gapTotal += gapCount

        # Output the record which was possibly trimmed.
        print >> outfile, right
        written += 1
        left = right

    sys.stderr.write("countLines %d inter_run_gap_count %d \n" %
                     (written, gapTotal))
def main(inpfile, outfile, xIdx, yIdx):
    """Replace every 'g' match by the 'u' segments produced by the dedasher.

    Lines that are not match records, and match records whose subtype is
    not 'g', are copied to outfile verbatim.  For a 'g' match the two
    sequence substrings are fetched from xIdx / yIdx, upper-cased and
    handed to halign; every segment returned by halign.halignDedash is
    written as a match of subtype 'u' whose id is the parent id followed
    by 'u' and a running index.  The number of mismatching positions in
    the segment is stored in extend['mm'].

    A progress line is printed to sys.stderr every 10000 input lines.
    """
    inpfile.seek(0)
    outfile.seek(0)
    lineCount = 0
    t0 = time.time()
    for line in inpfile:
        lineCount += 1
        if (lineCount % 10000) == 0:
            print >> sys.stderr, "lineCount=", lineCount, " time=", time.time() - t0

        if line[0] != 'M':
            print >> outfile, line,
            continue

        FM = MatchRecord.MatchRecord(line)
        if FM.subtype != 'g':
            print >> outfile, line,
            continue

        parentid = FM.matchid
        parent_x_forward = (FM.x_orientation == 1)
        parent_y_forward = (FM.y_orientation == 1)
        parent_x_start = FM.x_start
        parent_y_start = FM.y_start
        parent_x_length = FM.x_length
        parent_y_length = FM.y_length

        # Why two orientations and not just a flipped flag?
        # Because we want the resulting matches to come out in
        # the same sorted order as the input matches.
        x_substring = string.upper(
            xIdx.getStringFromFasta(parent_x_forward, FM.x_scaf_uid,
                                    FM.x_start, FM.x_length))
        y_substring = string.upper(
            yIdx.getStringFromFasta(parent_y_forward, FM.y_scaf_uid,
                                    FM.y_start, FM.y_length))

        ii = 0
        # Here we call the dedasher.
        halign.halignStart(x_substring, y_substring)
        for segment in iter(halign.halignDedash, None):
            (bgn1, bgn2, len1, len2, nmat) = segment
            # Filter by a minimum length? say four bp.
            ii += 1
            FM.subtype = 'u'
            FM.matchid = parentid + 'u' + str(ii)

            # Segment offsets are relative to the fetched substring; on a
            # reversed axis they are measured from the far end of the parent.
            if parent_x_forward:
                FM.x_start = parent_x_start + bgn1
            else:
                FM.x_start = parent_x_start + (parent_x_length - bgn1 - len1)
            if parent_y_forward:
                FM.y_start = parent_y_start + bgn2
            else:
                FM.y_start = parent_y_start + (parent_y_length - bgn2 - len2)

            FM.x_length = len1
            FM.y_length = len2
            assert (len1 == len2)

            # Count mismatches in the substrings that were aligned.  This
            # loop previously indexed x_seq / y_seq, names never bound in
            # this function, so it raised NameError.
            mismatches = 0
            for ic in range(len1):
                if x_substring[bgn1 + ic] != y_substring[bgn2 + ic]:
                    mismatches += 1
            FM.extend['mm'] = str(mismatches)
            FM.identifier = ""  # BEWARE
            print >> outfile, FM
def filterByMatchLength(inpfile, outfile, minimum_length):
    """Only keep matches that are long enough.

    A match record is written to outfile when both its x_length and its
    y_length are at least minimum_length.  Non-match lines are dropped.
    """
    inpfile.seek(0)
    for text in inpfile:
        if text[0] != 'M':
            continue
        record = MatchRecord.MatchRecord(text)
        tooShort = (record.x_length < minimum_length or
                    record.y_length < minimum_length)
        if not tooShort:
            print >>outfile, record
def createSignedEnumeration(inpfile):
    """Tag every match with a signed rank in extend['srank'].

    Matches are numbered 1, 2, 3, ... in file order; cvm() picks the
    positive rank when the x and y orientations are equal and the negated
    rank otherwise.  Non-match lines are dropped.  Returns a new
    MyFile.myfile() holding the tagged matches.
    """
    outfile = MyFile.myfile()
    inpfile.seek(0)

    rank = 0
    for text in inpfile:
        if text[0] != 'M':
            continue
        rank += 1
        record = MatchRecord.MatchRecord(text)
        sameOrientation = (record.x_orientation == record.y_orientation)
        record.extend['srank'] = cvm(sameOrientation, rank, -rank)
        print >> outfile, record

    return outfile
def onlyKeepLongRuns(inpfile, outname, lengthThreshold):
    """Keep only the matches of runs whose summed x_length reaches the threshold.

    Matches are buffered while the running x_length total of the current
    run is below lengthThreshold.  Once the total reaches it, the buffered
    matches and all further matches of that run go to the output.  Matches
    still buffered when the runid changes go to a rejects file that is
    closed and discarded on return.

    outname is accepted but not used.  Returns a new MyFile.myfile().
    """
    outfile = MyFile.myfile()
    rejectsfile = MyFile.myfile()

    previous = None
    pending = []      # matches of the current run not yet known to be kept
    runLength = 0     # summed x_length of the current run so far

    inpfile.seek(0)
    for text in inpfile:
        if text[0] != 'M':
            continue
        record = MatchRecord.MatchRecord(text)

        runChanged = (previous != None and previous.runid != record.runid)
        if runChanged:
            # The previous run never got long enough: reject its leftovers.
            for held in pending:
                print >>rejectsfile, held
            pending = []
            runLength = record.x_length
        else:
            runLength += record.x_length

        if runLength < lengthThreshold:
            pending.append(record)
        else:
            for held in pending:
                print >>outfile, held
            pending = []
            print >>outfile, record

        previous = record

    rejectsfile.close()
    return outfile
def applyOneKeepMask(inpfile, outfile, keepMaskFile, processFirstAxis):
    """Clip matches to the coverage-1 intervals of a keep mask on one axis.

    keepMaskFile lines have five whitespace-separated fields
    ('C', axis id, start, length, coverage); intervals whose coverage is
    not 1 are skipped.  inpfile holds match records (subtype 'u' or 'x')
    plus other lines, which are passed through unchanged.  Each match
    yields zero, one or more trimmed copies, one per overlapping interval.
    processFirstAxis selects the x fields (true) or the y fields (false).
    """
    # Note that the following merge-like control structure is
    # influenced by the function property of keep intervals to matches.
    debug = 0
    inpfile.seek(0)
    outfile.seek(0)
    keepMaskFile.seek(0)

    intervals = iter(keepMaskFile)   # masking intervals (the q* variables)
    records = iter(inpfile)          # matches being masked (the m* variables)
    iline = None                     # current mask line, None = fetch next
    mline = None                     # current match line, None = fetch next
    previousId = None
    subcount = 0

    try:
        # StopIteration from either iterator ends the merge.
        while True:
            if iline is None:
                iline = intervals.next()
                (subtype, qa, qs, ql, cov) = iline.split()
                assert (subtype == 'C')
                cov = int(cov)
                if cov != 1:
                    iline = None
                    continue
                qs = int(qs)
                ql = int(ql)
                qe = qs + ql

            if mline is None:
                mline = records.next()
                if mline[0] != 'M':
                    # not a match record, so just pass it through
                    print >> outfile, mline,
                    mline = None
                    continue
                FM = MatchRecord.MatchRecord(mline)
                assert (FM.subtype == "u" or FM.subtype == "x")
                if processFirstAxis:
                    ma = FM.x_scaf_uid
                    ms = FM.x_start           # match start
                    me = ms + FM.x_length     # match end
                else:
                    ma = FM.y_scaf_uid
                    ms = FM.y_start           # match start
                    me = ms + FM.y_length     # match end

            # Holding a valid interval and a valid match now.
            if ma != qa:
                # Different axis ids: advance whichever sorts first.
                if ma < qa:
                    mline = None
                else:
                    iline = None
                continue

            if ms >= qe or qs >= me:
                # Same axis but disjoint: advance whichever starts first.
                if ms < qs:
                    mline = None
                else:
                    iline = None
                continue

            # Overlap: emit the part of the match inside the interval.
            FT = FM.copy()
            clipBgn = max(ms, qs)
            clipEnd = min(me, qe)
            trimFromStart = clipBgn - ms
            trimFromEnd = me - clipEnd
            trimmedLength = clipEnd - clipBgn
            if FT.x_orientation == FT.y_orientation:
                FT.x_start += trimFromStart
                FT.y_start += trimFromStart
            elif processFirstAxis:
                # Opposite orientations: the other axis is clipped from its
                # far end.
                FT.x_start += trimFromStart
                FT.y_start += trimFromEnd
            else:
                FT.y_start += trimFromStart
                FT.x_start += trimFromEnd
            FT.x_length = trimmedLength
            FT.y_length = trimmedLength
            if debug:
                print >> sys.stdout, "# trimmed "
                print >> sys.stdout, FT

            # We must ensure that the match identifier is still unique.
            if previousId == FM.matchid:
                subcount += 1
            else:
                subcount = 0
            previousId = FM.matchid
            if subcount > 0:
                if processFirstAxis:
                    FT.matchid = FT.matchid + "x" + str(subcount)
                else:
                    FT.matchid = FT.matchid + "y" + str(subcount)
            print >> outfile, FT

            # Advance whichever of the two ends first.
            if qe < me:
                iline = None
            else:
                mline = None
    except StopIteration:
        # If there are any left over non-match lines, then output them!
        for mline in records:
            if mline[0] != "M":
                print >> outfile, mline,
def applyBothKeepMasks(inpfile, outfile):
    """Apply the keep mask of the first axis, then of the second, to inpfile.

    For each axis: coverage intervals are computed from the original
    inpfile, the current matches are sorted along that axis, and
    applyOneKeepMask clips them.  The first pass writes to a scratch file,
    the second pass reads that scratch file and writes to outfile.
    """
    # Maybe we can think of a masking implementation where each ATAC match
    # is treated atomically.  Assume that the keep mask intervals are sorted
    # by start position.  Assume that the ATAC matches are sorted by start
    # position.  Assert that all keep mask intervals are non-overlapping and
    # were cut from only one ATAC match.  Thus the mapping from keep mask
    # intervals is a function.  Note that this requires that we do not
    # coalesce abutting keep mask intervals that originate from multiple
    # matches.  Note this still allows an ATAC match to overlap more than
    # one keep mask interval.  Ignore all keep mask intervals with zero
    # length; their creation has tie breaking problems.  See notes on 2003
    # Jul 29.
    debug = 0
    dumpCount = [0]  # boxed so the nested helper can increment it

    def dumpForDebug(stream, rewind):
        # Copy stream into the next numbered debugfile.N.
        if rewind:
            stream.seek(0)
        dumpCount[0] += 1
        debugfile = open("debugfile.%d" % dumpCount[0], "w")
        for text in stream:
            print >> debugfile, text,

    def maskAlongAxis(source, sink, processFirstAxis, sorter):
        # One masking pass; sink=None means "use a fresh scratch file".
        keepMaskFile = MyFile.myfile()
        ordered = MyFile.myfile()
        if sink is None:
            sink = MyFile.myfile()
        findCoverageIntervals(inpfile, keepMaskFile, processFirstAxis)
        if debug:
            dumpForDebug(keepMaskFile, 0)
        sorter(source, ordered)
        if debug:
            dumpForDebug(ordered, 1)
        applyOneKeepMask(ordered, sink, keepMaskFile, processFirstAxis)
        if debug:
            dumpForDebug(sink, 1)
        return sink

    inpfile.seek(0)
    outfile.seek(0)

    # Apply the keepMask for the first axis.
    firstPass = maskAlongAxis(inpfile, None, 1, MatchRecord.sortInXorderAP)

    # Apply the keepMask for the second axis.
    maskAlongAxis(firstPass, outfile, 0, MatchRecord.sortInYorderAP)
def trimMatchOverlapsInY(inpfile, outfile, trim_subtype):
    """Trim the match overlaps with respect to the Y assembly.

    Matches of subtype trim_subtype are scanned in file order.  A picket
    marks the end of the last kept match on the current y_scaf_uid: a
    match starting left of the picket has its overlapping part cut off,
    and a match ending at or before the picket is dropped.  All other
    lines (non-matches and matches of other subtypes) are copied through.
    A summary line is printed to sys.stderr.
    """
    overlaps = 0
    abuts = 0
    posgaps = 0
    contained = 0
    trimmed = 0

    left = None
    # For each genomic axis we scan left to right using this picket
    # position to annihilate any part of the current match to the
    # left of this picket.
    picket = 0

    inpfile.seek(0)
    for line in iter(inpfile):
        if line[0] != 'M':
            print >> outfile, line,
            continue

        right = MatchRecord.MatchRecord(line)
        if right.subtype != trim_subtype:
            print >> outfile, line,
            continue

        if left == None or left.y_scaf_uid != right.y_scaf_uid:
            # First match, or a new Y sequence: restart the picket.
            picket = 0
        else:
            assert (left != None)
            assert (right != None)
            if left.y_scaf_uid > right.y_scaf_uid:
                print >> sys.stderr, "sequence ids out of y sorted order"
                print >> sys.stderr, left
                print >> sys.stderr, right
            assert (left.subtype == right.subtype)
            assert (left.y_scaf_uid == right.y_scaf_uid)
            if not (left.y_start <= right.y_start):
                print >> sys.stderr, "trimMatchOverlapsInY: Woops not sorted anymore!"
                print >> sys.stderr, left
                print >> sys.stderr, right

        thisbgn = right.y_start
        thisend = right.y_start + right.y_length

        if picket >= thisend:
            # Picketed region contains right: remove this match.
            contained += 1
            continue

        gaplen = thisbgn - picket
        if gaplen > 0:
            posgaps += 1
        elif gaplen == 0:
            abuts += 1
        else:
            # gaplen is negative: shave -gaplen bases off the match.
            overlaps += 1
            trimmed -= gaplen
            right.y_start -= gaplen
            right.y_length += gaplen
            right.x_length += gaplen
            if right.x_orientation == right.y_orientation:
                right.x_start -= gaplen

        print >> outfile, right
        newpicket = right.y_start + right.y_length
        assert (picket < newpicket)
        picket = newpicket
        left = right

    print >> sys.stderr, "trimMatchOverlapsInY:\n",
    print >>sys.stderr, "#posgaps, #abuts, #overlaps, #contained, bp_trimmed= %d %d %d %d %d\n" \
        % (posgaps, abuts, overlaps, contained, trimmed, )
    return
def coalesceMatches(inpfile, outfile, needs_to_share_diagonal):
    """Coalesce overlapping and abutting matches within the same run.

    Consecutive matches are merged into one bounding box while each new
    match has a corner inside the current box on the same pair of
    sequences (and, when needs_to_share_diagonal is set, lies on the same
    diagonal as its predecessor).  Each box is written as the first match
    of the group with its coordinates replaced and its subtype set to 'u'
    (shared diagonal required) or 'g' (not required).
    """

    def emitBox(record, loX, loY, hiX, hiY):
        # Rewrite record as the bounding box and write it out.
        record.subtype = ('g', 'u')[needs_to_share_diagonal]
        record.x_start = loX
        record.y_start = loY
        record.x_length = hiX - loX
        record.y_length = hiY - loY
        print >> outfile, record

    firstF = None
    lastF = None
    lastLX = -3
    lastLY = -4
    lastForward = 0
    lowHitPX = None
    lowHitPY = None
    hghHitPX = None
    hghHitPY = None

    inpfile.seek(0)
    outfile.seek(0)
    for line in inpfile:
        if line[0] != 'M':
            continue
        curF = MatchRecord.MatchRecord(line)
        px, nx = curF.x_start, curF.x_length
        py, ny = curF.y_start, curF.y_length
        assert (px >= 0)
        assert (nx >= 0)
        assert (py >= 0)
        assert (ny >= 0)

        lengthsAgree = (not needs_to_share_diagonal) or nx == ny
        if not lengthsAgree:
            print >> sys.stderr, 'Bombed on:'
            print >> sys.stderr, str(curF)
            print >> sys.stderr, 'needs_to_share_diagonal=' + str(
                needs_to_share_diagonal)
            print >> sys.stderr, 'nx=' + str(nx) + ' ny=' + str(ny)
        assert (hghHitPX == None or lengthsAgree)

        # Left and right corners of the match; y is flipped when reversed.
        forward = (curF.x_orientation == curF.y_orientation)
        lx = px
        ly = cvm(forward, py, py + ny)
        rx = px + nx
        ry = cvm(forward, py + ny, py)

        overlapping = ((lastF != None) and
                       (curF.x_scaf_uid == lastF.x_scaf_uid) and
                       (curF.y_scaf_uid == lastF.y_scaf_uid) and
                       (((lowHitPX <= lx <= hghHitPX) and
                         (lowHitPY <= ly <= hghHitPY)) or
                        ((lowHitPX <= rx <= hghHitPX) and
                         (lowHitPY <= ry <= hghHitPY))))
        on_diagonal = ((forward == lastForward) and
                       ((lx - lastLX) ==
                        ((ly - lastLY) * cvm(forward, 1, -1))))

        # Bounding box of the current match alone.
        lowMerPX = px
        lowMerPY = py
        hghMerPX = px + nx
        hghMerPY = py + ny

        mergeable = overlapping and (not needs_to_share_diagonal
                                     or on_diagonal)
        if not mergeable:
            # Close the box collected so far and start a new one here.
            if firstF != None:
                emitBox(firstF, lowHitPX, lowHitPY, hghHitPX, hghHitPY)
            firstF = curF
            lowHitPX = lowMerPX
            lowHitPY = lowMerPY
            hghHitPX = hghMerPX
            hghHitPY = hghMerPY

        # Grow the box to cover the current match.
        lowHitPX = cvm(lowHitPX < lowMerPX, lowHitPX, lowMerPX)
        lowHitPY = cvm(lowHitPY < lowMerPY, lowHitPY, lowMerPY)
        hghHitPX = cvm(hghHitPX > hghMerPX, hghHitPX, hghMerPX)
        hghHitPY = cvm(hghHitPY > hghMerPY, hghHitPY, hghMerPY)

        lastLX = lx
        lastLY = ly
        lastForward = forward
        lastF = curF

    # Flush the final box.
    if firstF != None:
        emitBox(firstF, lowHitPX, lowHitPY, hghHitPX, hghHitPY)
    return
def findPerfectRuns(inpfile, maxJump, runIdPrefix):
    """Assign run ids to matches that carry a signed rank in extend['srank'].

    Matches are read in file order.  A new run starts whenever the match
    is on a different x or y sequence than its predecessor, the larger of
    the x and y gaps to the predecessor exceeds maxJump, or its signed
    rank is not exactly the predecessor's rank plus one.  The 'srank'
    entry is removed and runid is set to runIdPrefix followed by the run
    number (starting at 1).  Non-match lines are dropped.

    Diagnostics about sorting and containment go to sys.stderr.
    Returns a new MyFile.myfile() with the annotated matches.
    """
    outfile = MyFile.myfile()
    left = None
    runid = 1
    inpfile.seek(0)
    for line in inpfile:
        if line[0] != 'M':
            continue
        right = MatchRecord.MatchRecord(line)
        pr = int(right.extend['srank'])
        del right.extend['srank']

        if left != None:
            maxGapInXandY = 0
            if (left.x_scaf_uid == right.x_scaf_uid and
                    left.y_scaf_uid == right.y_scaf_uid):
                # Find the maximum of the gap in x and y axis.
                x_rs = right.x_start
                x_re = x_rs + right.x_length
                x_ls = left.x_start
                x_le = x_ls + left.x_length
                assert (x_rs < x_re)
                assert (x_ls < x_le)
                # All matches are positive length.
                x_gapLeftBeforeRight = x_rs - x_le
                x_gapRightBeforeLeft = x_ls - x_re
                assert (not (x_gapLeftBeforeRight > 0 and
                             x_gapRightBeforeLeft > 0))
                x_gap = max(x_gapLeftBeforeRight, x_gapRightBeforeLeft)
                # x_gap == 0 is abutting
                # x_gap < 0 is overlapping

                y_rs = right.y_start
                y_re = y_rs + right.y_length
                y_ls = left.y_start
                y_le = y_ls + left.y_length
                assert (y_rs < y_re)
                assert (y_ls < y_le)
                y_gapLeftBeforeRight = y_rs - y_le
                y_gapRightBeforeLeft = y_ls - y_re
                assert (not (y_gapLeftBeforeRight > 0 and
                             y_gapRightBeforeLeft > 0))
                y_gap = max(y_gapLeftBeforeRight, y_gapRightBeforeLeft)
                # y_gap == 0 is abutting
                # y_gap < 0 is overlapping

                maxGapInXandY = max(x_gap, y_gap)

                # Check the sorting of the matches.  This check stays in
                # the same-sequence branch: the coordinates it reads are
                # only bound here.
                sorted_by_x = (x_ls <= x_rs)
                sorted_by_y = (y_ls <= y_rs)
                if not (sorted_by_x or sorted_by_y):
                    print >> sys.stderr, "bad sorting in findPerfectRuns"
                    print >> sys.stderr, left
                    print >> sys.stderr, right
                assert (sorted_by_x or sorted_by_y)

                dovetail_in_x = (x_ls <= x_rs) and (x_le <= x_re)
                # Compare the END of left with the end of right, as the X
                # test does.  The old test (y_ls <= y_re) was implied by
                # sorted_by_y, so the warning below could not fire.
                dovetail_in_y = (y_ls <= y_rs) and (y_le <= y_re)
                if sorted_by_x and not dovetail_in_x:
                    print >> sys.stderr, "contained in x in findPerfectRuns"
                    print >> sys.stderr, left
                    print >> sys.stderr, right
                if sorted_by_y and not dovetail_in_y:
                    print >> sys.stderr, "contained in y in findPerfectRuns"
                    print >> sys.stderr, left
                    print >> sys.stderr, right

            if ((left.x_scaf_uid != right.x_scaf_uid) or  # check first axis id
                    (left.y_scaf_uid != right.y_scaf_uid) or  # check second axis id
                    (maxGapInXandY > maxJump) or
                    (pr != lastpr + 1)):
                # Using the signed rank NOT the run id !!!!
                runid += 1

        lastpr = pr
        # Assign the run id in the same slot as the signed rank.
        right.runid = "%s%d" % (runIdPrefix, runid)
        print >> outfile, right
        left = right

    return outfile
def runsAsMatches(inpfile):
    """Collapse every run of matches into a single match of subtype 'r'.

    Consecutive matches with the same runid form a run.  For each run the
    last match is rewritten to span from the smaller start to the larger
    end of the run's first and last match on both axes; its matchid
    becomes the runid, its runid becomes ".", and runFill is set to the
    summed x_length of the run's matches.  Returns a new MyFile.myfile().
    """
    outfile = MyFile.myfile()

    def emitRun(first, last, fill):
        # All coordinates are read before 'last' is modified, because
        # 'first' and 'last' are the same object for one-match runs.
        x1 = first.x_start
        x2 = last.x_start
        startX = cvm(x1 < x2, x1, x2)
        x1 += first.x_length
        x2 += last.x_length
        endX = cvm(x1 > x2, x1, x2)

        y1 = first.y_start
        y2 = last.y_start
        startY = cvm(y1 < y2, y1, y2)
        y1 += first.y_length
        y2 += last.y_length
        endY = cvm(y1 > y2, y1, y2)

        last.subtype = 'r'
        last.matchid = last.runid
        last.runid = "."  # the agreed NULL value
        last.x_start = startX
        last.y_start = startY
        last.x_length = endX - startX
        last.y_length = endY - startY
        last.runFill = fill
        print >> outfile, last

    lastF = None
    firstF = None
    runFill = 0

    inpfile.seek(0)
    for line in inpfile:
        if line[0] != 'M':
            continue
        curF = MatchRecord.MatchRecord(line)

        startsRun = (lastF == None) or (curF.runid != lastF.runid)
        if startsRun:
            if lastF != None:
                # Close the run that just ended; it must stay on one pair
                # of sequences.
                if firstF.x_scaf_uid != lastF.x_scaf_uid:
                    print >> sys.stderr, firstF
                    print >> sys.stderr, lastF
                assert (firstF.x_scaf_uid == lastF.x_scaf_uid)
                assert (firstF.y_scaf_uid == lastF.y_scaf_uid)
                emitRun(firstF, lastF, runFill)
            firstF = curF
            runFill = 0

        runFill += curF.x_length
        lastF = curF

    # Close the final run.
    if lastF != None:
        emitRun(firstF, lastF, runFill)

    return outfile
def runOld(self):
    """Run the chained filtering / run-forming pipeline on self.matches.

    Steps, in order: unique filter, filter by match length, trim overlaps,
    form perfect runs, keep long runs, re-form perfect runs, optional box
    recovery, form perfect runs again, squeeze intra-run gaps, trim
    overlaps, runs-as-matches, and (when the 'fillIntraRunGapsOn' global
    is "1") fill intra-run gaps followed by a trim.

    Checkpointing: a step executes when 'redo' is already set, or when
    'keep' (the 'ckpKeep' global, default 0) is below the step number and
    self.globals has no "AllDone" key.  Once one step executes, 'redo'
    forces every later step to execute as well.

    Reads and replaces self.matches, sets self.runs, reads self.runName and
    self.globals, and calls self.checkpoint(outprefix) after most steps.

    NOTE(review): the source of this method was flattened onto a few
    physical lines.  The nesting below was reconstructed from the statement
    order and the surviving '# end if' markers -- confirm against the
    original file before relying on it.
    """
    self.globals['atacAlgorithmVersion'] = str(17)
    print >>STDERR, "runName = %s\n" % self.runName

    # The ATAC globals used by this script:
    opt_t = int(self.globals['globalMatchMinSize'])
    opt_l = int(self.globals['globalPerfectRunMinLen'])
    maxdiff = int(self.globals['globalPerfectRunMaxGapLen'])

    assemblyId1 = self.globals['assemblyId1']
    assemblyId2 = self.globals['assemblyId2']
    assemblyFile1 = self.globals['assemblyFile1']
    assemblyFile2 = self.globals['assemblyFile2']

    boxRecoveryOn = 0  # Deprecated for same species comparisons 2003/09/09.
    if(self.globals.has_key("boxRecoveryOn")):
        boxRecoveryOn = int(self.globals['boxRecoveryOn'])

    t0 = time.time()

    assemblyIdx1 = IdxStore.IdxStore(assemblyFile1,assemblyId1)
    assemblyIdx2 = IdxStore.IdxStore(assemblyFile2,assemblyId2)
    rawfile = None

    ###################################################################
    # Setup for checkpointing scheme.
    redo = 0
    keep = 0
    step = 0
    if(self.globals.has_key("ckpKeep")):
        keep = int(self.globals['ckpKeep'])
    ckpName = "AllDone"
    ###################################################################

    print >>STDERR, 'Keep step=' + str(keep)
    print >>STDERR, 'At step=' + str(step)
    print >>STDERR, 'Time elapsed=' + str(time.time()-t0)

    # outprefix accumulates one suffix per executed step and names the
    # checkpoints.
    outprefix = self.runName

    step += 1
    print >>STDERR, 'At uniqueFilter, step=' + str(step)
    print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
    if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
        redo = 1
        # The unique filter runs unless the 'uniqueFilterOn' global is "0".
        if(not(self.globals.has_key('uniqueFilterOn') and self.globals['uniqueFilterOn']=="0")):
            print >>STDERR, 'Running UniqueFilter'
            outfile = MyFile.myfile()
            UniqueFilter.main( self.matches, outfile)
            self.matches = outfile
            outprefix += '.uniq'
            self.checkpoint(outprefix)

    step += 1
    print >>STDERR, 'At filterByMatchLength, step=' + str(step)
    print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
    if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
        redo = 1
        print >>STDERR, 'Running filterByMatchLength'
        outfile = MyFile.myfile()
        filterByMatchLength( self.matches, outfile, opt_t)
        self.matches = outfile
        outprefix += '.t' + str(opt_t)
        self.checkpoint(outprefix)

    step += 1
    print >>STDERR, 'At trimMatchOverlaps, step=' + str(step)
    print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
    if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
        redo = 1
        print >>STDERR, "Start trimming for bp one-to-one-ness"
        tempdata = MyFile.myfile()
        TrimMatchOverlaps.trimMatchOverlapsInBoth(self.matches,tempdata,'u')
        self.matches = tempdata
        print >>STDERR, "Finished trimming for bp one-to-one-ness"
        outprefix += '.trim'
        self.checkpoint(outprefix)

    # NOTE(review): placed outside the step guard above; the flattened
    # source does not show which level this belongs to -- confirm.
    if( boxRecoveryOn == 1 ):
        # For box recovery later ... but what if we start from a checkpoint?
        rawfile = self.matches

    step += 1
    print >>STDERR, 'At formPerfectRuns, step=' + str(step)
    print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
    if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
        redo = 1
        print >>STDERR, 'from ' + outprefix + ' making ' + outprefix + '.p6'
        tempdata = PerfectRuns.formPerfectRuns(self.matches,
                               MatchRecord.sortInXorderAP,
                               MatchRecord.sortInYorderAP,
                               maxdiff,
                               'r')
        self.matches = tempdata
        outprefix += ".p6"
        # No checkpoint is written after this step.
    # end if

    step += 1
    print >>STDERR, 'At onlyKeepLongRuns, step=' + str(step)
    print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
    if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
        redo = 1
        print >>STDERR, 'from ' + outprefix + ' making ' + outprefix + '.l' + str(opt_l)
        tempdata = onlyKeepLongRuns( self.matches, outprefix, opt_l)
        self.matches = tempdata
        outprefix += '.l' + str(opt_l)
        self.checkpoint(outprefix)

    step += 1
    print >>STDERR, 'At formPerfectRuns, step=' + str(step)
    print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
    if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
        redo = 1
        print >>STDERR, 'Heal the perfect runs'
        # Same call as the earlier formPerfectRuns step but with the Y
        # sorter passed first.
        tempdata = PerfectRuns.formPerfectRuns(self.matches,
                               MatchRecord.sortInYorderAP,
                               MatchRecord.sortInXorderAP,
                               maxdiff,
                               'r')
        self.matches = tempdata
        outprefix += '.pr'
        self.checkpoint(outprefix)

    if(boxRecoveryOn == 1):
        # This is a box recovery step.
        step += 1
        print >>STDERR, 'At boxRecovery, step=' + str(step)
        print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
        if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
            redo = 1
            print >>STDERR, 'from ' + outprefix + ' making ' + outprefix + '.br'
            print >>STDERR, "Make sorted raw matches"
            outfile = MyFile.myfile()
            MatchRecord.sortInXorderAP( rawfile, outfile)
            rawfile = outfile
            print >>STDERR, "perform box recovery"
            tempdata = boxRecovery( self.matches, rawfile, outprefix)
            self.matches = tempdata
            outprefix += '.br'
            self.checkpoint(outprefix)
        # end if

    step += 1
    print >>STDERR, 'At formPerfectRuns, step=' + str(step)
    print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
    if (redo or ( (keep < step) and not self.globals.has_key(ckpName))):
        print >>STDERR, "form perfect runs"
        redo = 1
        # NOTE(review): the message says '.p6' but the suffix appended
        # below is '.pr'.
        print >>STDERR, 'from ' + outprefix + ' to ' + outprefix + '.p6'
        tempdata = PerfectRuns.formPerfectRuns(self.matches,
                               MatchRecord.sortInXorderAP,
                               MatchRecord.sortInYorderAP,
                               maxdiff,
                               'r')
        self.matches = tempdata
        outprefix += '.pr'
        self.checkpoint(outprefix)

    step += 1
    print >>STDERR, 'At squeezeIntraRunGaps, step=' + str(step)
    print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
    if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
        redo = 1
        print >>STDERR, 'from ' + outprefix + ' to ' + outprefix + '.sq'
        tempdata = MyFile.myfile()
        squeezeIntraRunGaps.mainLoop(
            self.matches,
            tempdata,
            assemblyIdx1, assemblyIdx2)
        tempy = MyFile.myfile()
        # Beware the current match subtypes are 'x', 'L', and 'R'!
        coalesceMatches( tempdata, tempy, 1)
        self.matches = tempy
        outprefix += '.sq'
        self.checkpoint(outprefix)

    step += 1
    print >>STDERR, 'At TrimMatchOverlaps, step=' + str(step)
    print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
    if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
        redo = 1
        print >>STDERR, "Start trimming for bp one-to-one-ness"
        tempdata = MyFile.myfile()
        TrimMatchOverlaps.trimMatchOverlapsInBoth(self.matches,tempdata,'u')
        self.matches = tempdata
        outprefix += '.trim'
        print >>STDERR, "Finished trimming for bp one-to-one-ness"
        # No checkpoint is written after this step.

    step += 1
    print >>STDERR, 'At RunsAsMatches, step=' + str(step)
    print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
    if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
        redo = 1
        self.runs = PerfectRuns.runsAsMatches( self.matches)
        outprefix += '.runs'
        self.checkpoint(outprefix)
    # end if

    # NOTE(review): both the fill step and the trim that follows it are
    # placed under the 'fillIntraRunGapsOn' test -- confirm.
    if(self.globals.has_key('fillIntraRunGapsOn') and self.globals['fillIntraRunGapsOn']=="1" ):
        # Next comes the DNA sequence dependent stuff.
        step += 1
        print >>STDERR, 'At fillIntraRunGaps, step=' + str(step)
        print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
        if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
            redo = 1
            print >>STDERR, "fill the intrarun gaps"
            # Defaults are stored into self.globals when the keys are absent.
            if(not self.globals.has_key('fillIntraRunGapsErate')):
                self.globals['fillIntraRunGapsErate'] = 0.10
            if(not self.globals.has_key('fillIntraRunGapsMaxGap')):
                self.globals['fillIntraRunGapsMaxGap'] = 100000
            fillIntraRunGapsErate = float(self.globals['fillIntraRunGapsErate'])
            fillIntraRunGapsMaxGap = int(self.globals['fillIntraRunGapsMaxGap'])
            tempdata = MyFile.myfile()
            fillIntraRunGaps.mainLoop(self.matches, tempdata,
                                      assemblyIdx1, assemblyIdx2,
                                      fillIntraRunGapsMaxGap, fillIntraRunGapsErate)
            self.matches = tempdata
            outprefix += '.fill'
            self.checkpoint(outprefix)

        step += 1
        print >>STDERR, 'At TrimMatchOverlaps, step=' + str(step)
        print >>STDERR, 'Time elapsed=' + str(time.time()-t0)
        if (redo or ((keep < step) and not self.globals.has_key(ckpName))):
            redo = 1
            print >>STDERR, "trim the overlaps"
            tempdata = MyFile.myfile()
            TrimMatchOverlaps.trimMatchOverlapsInBoth(self.matches,tempdata,'u')
            self.matches = tempdata
            outprefix += '.trim'
            self.checkpoint(outprefix)
def mainLoop(inpfile, outfile, xIdx, yIdx):
    """Run analyzeGap over every pair of consecutive matches in inpfile.

    The first match is written to outfile as-is.  Every later match is
    paired with its predecessor and passed to analyzeGap (which also
    receives outfile), then written out itself.  The six counters that
    analyzeGap returns are accumulated and reported on sys.stderr,
    followed by five counters that are not bound in this function
    (theIsolatedSNPcount and the completefilled* names).
    """
    maxgap = 100000  # This should be set by an ATAC global.
    margin = 20      # This should be set by an ATAC global.

    written = 0
    gapTotal = 0
    closedTotal = 0
    squeezedTotal = 0
    xLenTotal = 0
    yLenTotal = 0
    xNonACGTTotal = 0
    yNonACGTTotal = 0

    inpfile.seek(0)
    records = iter(inpfile)

    # Prime 'left' with the first match record.
    left = None
    for text in records:
        if text[0] == 'M':
            left = MatchRecord.MatchRecord(text)
            print >>outfile, left
            written += 1
            break

    for text in records:
        if text[0] != 'M':
            continue
        right = MatchRecord.MatchRecord(text)

        result = analyzeGap(xIdx, yIdx, left, right, outfile, maxgap, margin)
        (gapCount, squeezed, x_len, y_len, x_notACGT, y_notACGT) = result

        gapTotal += gapCount
        squeezedTotal += squeezed
        xLenTotal += x_len
        yLenTotal += y_len
        xNonACGTTotal += x_notACGT
        yNonACGTTotal += y_notACGT
        if x_len == 0 and y_len == 0:
            closedTotal += 1

        # Output the record which was possibly trimmed.
        print >>outfile, right
        written += 1
        left = right

    sys.stderr.write(
        "countLines %d inter_run_gap_count %d closed_gap_count %d squeezed %d x_len %d y_len %d x_nonACGT %d y_nonACGT %d\n" %
        (written, gapTotal, closedTotal,
         squeezedTotal, xLenTotal, yLenTotal, xNonACGTTotal, yNonACGTTotal))

    sys.stderr.write("theIsolatedSNPcount = %d\n" % theIsolatedSNPcount)
    sys.stderr.write("completefillednotXY = %d\n" % completefillednotXY)
    sys.stderr.write("completefilledXnotY = %d\n" % completefilledXnotY)
    sys.stderr.write("completefilledYnotX = %d\n" % completefilledYnotX)
    sys.stderr.write("completefilledXandY = %d\n" % completefilledXandY)
def applyBothKeepMasks(inpfile, outfile):
    """Mask the matches of inpfile on the first axis and then on the second.

    Each pass computes coverage intervals from the original inpfile, sorts
    the current matches along the axis being processed and clips them with
    applyOneKeepMask.  The X pass writes to a scratch file; the Y pass
    reads that scratch file and writes to outfile.
    """
    # Maybe we can think of a masking implementation where each ATAC match
    # is treated atomically.  Assume that the keep mask intervals are sorted
    # by start position.  Assume that the ATAC matches are sorted by start
    # position.  Assert that all keep mask intervals are non-overlapping and
    # were cut from only one ATAC match.  Thus the mapping from keep mask
    # intervals is a function.  Note that this requires that we do not
    # coalesce abutting keep mask intervals that originate from multiple
    # matches.  Note this still allows an ATAC match to overlap more than
    # one keep mask interval.  Ignore all keep mask intervals with zero
    # length; their creation has tie breaking problems.  See notes on 2003
    # Jul 29.
    debug = 0
    debugnum = 0

    def writeDebugCopy(stream, number):
        # Copy every line of stream into debugfile.<number>.
        debugfile = open("debugfile.%d" % number, "w")
        for text in stream:
            print >>debugfile, text,

    inpfile.seek(0)
    outfile.seek(0)

    # ---- First axis ----
    xKeepMask = MyFile.myfile()
    xSorted = MyFile.myfile()
    xMasked = MyFile.myfile()

    findCoverageIntervals(inpfile, xKeepMask, 1)
    if debug:
        debugnum += 1
        writeDebugCopy(xKeepMask, debugnum)

    MatchRecord.sortInXorderAP(inpfile, xSorted)
    if debug:
        xSorted.seek(0)
        debugnum += 1
        writeDebugCopy(xSorted, debugnum)

    applyOneKeepMask(xSorted, xMasked, xKeepMask, 1)
    if debug:
        xMasked.seek(0)
        debugnum += 1
        writeDebugCopy(xMasked, debugnum)

    # ---- Second axis ----
    yKeepMask = MyFile.myfile()
    ySorted = MyFile.myfile()

    findCoverageIntervals(inpfile, yKeepMask, 0)
    if debug:
        debugnum += 1
        writeDebugCopy(yKeepMask, debugnum)

    MatchRecord.sortInYorderAP(xMasked, ySorted)
    if debug:
        ySorted.seek(0)
        debugnum += 1
        writeDebugCopy(ySorted, debugnum)

    applyOneKeepMask(ySorted, outfile, yKeepMask, 0)
    if debug:
        outfile.seek(0)
        debugnum += 1
        writeDebugCopy(outfile, debugnum)