def plotPatternBeyondRepeat(genomeSource1, genomeSource2, startpt1, endpt1,
                            startpt2, endpt2, plotRange):
    """Plot windowed Hamming distances immediately after a pair of repeats.

    Reads ``plotRange`` consecutive 10-character windows from
    ``genomeSource1`` starting at offset ``endpt1`` and from
    ``genomeSource2`` starting at offset ``endpt2``.  Each pair of windows
    is scored with ``distanceComputeLib.hammingDistance`` and the scores are
    plotted against the offset past the end points on subplot 211.

    Args:
        genomeSource1: path of the first sequence file.
        genomeSource2: path of the second sequence file.
        startpt1: currently unused (see note below).
        endpt1: file offset in ``genomeSource1`` where scanning starts.
        startpt2: currently unused (see note below).
        endpt2: file offset in ``genomeSource2`` where scanning starts.
        plotRange: number of windows to read and score.

    Returns:
        None.  The function only draws on ``plt``; it does not call
        ``plt.show()``.

    Note:
        The original source carried a commented-out mirror image of the
        loop that walked backwards from ``startpt1``/``startpt2`` and drew
        on subplot 212.  It was disabled, so the start points are accepted
        only to keep the signature stable.
    """
    windowSize = 10
    distanceList = []

    # Context managers guarantee both files are closed even if a seek, a
    # read, or the distance computation raises part-way through.
    with open(genomeSource1, 'r') as f1, open(genomeSource2, 'r') as f2:
        for index in range(plotRange):
            offset = index * windowSize
            f1.seek(endpt1 + offset)
            f2.seek(endpt2 + offset)
            str1 = f1.read(windowSize)
            str2 = f2.read(windowSize)
            # A read close to the end of a file can come back short, so only
            # compare as many characters as both windows actually contain.
            comparedLength = min(windowSize, len(str1), len(str2))
            distanceList.append(
                distanceComputeLib.hammingDistance(str1, str2, comparedLength))

    plt.subplot(211)
    plt.plot(range(0, len(distanceList) * windowSize, windowSize),
             distanceList)
def plotPatternBeyondRepeat(genomeSource1, genomeSource2, startpt1, endpt1,
                            startpt2, endpt2, plotRange):
    """Plot windowed Hamming distances immediately after a pair of repeats.

    Reads ``plotRange`` consecutive 10-character windows from
    ``genomeSource1`` starting at offset ``endpt1`` and from
    ``genomeSource2`` starting at offset ``endpt2``.  Each pair of windows
    is scored with ``distanceComputeLib.hammingDistance`` and the scores are
    plotted against the offset past the end points on subplot 211.

    Args:
        genomeSource1: path of the first sequence file.
        genomeSource2: path of the second sequence file.
        startpt1: currently unused (see note below).
        endpt1: file offset in ``genomeSource1`` where scanning starts.
        startpt2: currently unused (see note below).
        endpt2: file offset in ``genomeSource2`` where scanning starts.
        plotRange: number of windows to read and score.

    Returns:
        None.  The function only draws on ``plt``; it does not call
        ``plt.show()``.

    Note:
        The original source carried a commented-out mirror image of the
        loop that walked backwards from ``startpt1``/``startpt2`` and drew
        on subplot 212.  It was disabled, so the start points are accepted
        only to keep the signature stable.
    """
    windowSize = 10
    distanceList = []

    # Context managers guarantee both files are closed even if a seek, a
    # read, or the distance computation raises part-way through.
    with open(genomeSource1, 'r') as f1, open(genomeSource2, 'r') as f2:
        for index in range(plotRange):
            offset = index * windowSize
            f1.seek(endpt1 + offset)
            f2.seek(endpt2 + offset)
            str1 = f1.read(windowSize)
            str2 = f2.read(windowSize)
            # A read close to the end of a file can come back short, so only
            # compare as many characters as both windows actually contain.
            comparedLength = min(windowSize, len(str1), len(str2))
            distanceList.append(
                distanceComputeLib.hammingDistance(str1, str2, comparedLength))

    plt.subplot(211)
    plt.plot(range(0, len(distanceList) * windowSize, windowSize),
             distanceList)
def effectOfInterleaving(genomeSource1, Lrepeat, Liid,
                         totalNumberOfRounds=100000):
    """Randomly sample position pairs and report the smallest flank distance.

    For each round two offsets ``i < j`` are drawn at random.  The
    ``Liid``-character windows at ``i`` and ``j`` are compared, then the
    windows at ``i + Lrepeat`` and ``j + Lrepeat``; the round's score ``h``
    is the larger of the two ``distanceComputeLib.hammingDistance`` values.
    Every round prints ``i j h`` and, after the last round, the smallest
    ``h`` seen (starting from an initial value of ``Liid``) is printed.

    Args:
        genomeSource1: path of the sequence file to sample from.
        Lrepeat: distance between the first and the second compared window.
        Liid: length of each compared window.
        totalNumberOfRounds: number of random pairs to draw.  Defaults to
            100000, the value that used to be hard-coded.

    Returns:
        None.  Results are written to standard output only.

    Raises:
        ValueError: propagated from ``random.randint`` when the file is too
            short for the requested ``Lrepeat`` and ``Liid``.
    """
    # The context manager closes the file even when a round raises.
    with open(genomeSource1, 'r') as f1:
        G = len(f1.read())
        print(G)

        oldh = Liid
        # Largest offset from which both windows still fit inside the file.
        lastStart = G - Lrepeat - Liid

        for numberOfRounds in range(totalNumberOfRounds):
            i = random.randint(0, lastStart - 1)
            j = random.randint(i + 1, lastStart)

            # Same comparison twice: at the sampled offsets, then again
            # Lrepeat characters further along.
            roundDistances = []
            for shift in (0, Lrepeat):
                f1.seek(i + shift)
                substring1 = f1.read(Liid)
                f1.seek(j + shift)
                substring2 = f1.read(Liid)
                roundDistances.append(distanceComputeLib.hammingDistance(
                    substring1, substring2, len(substring1)))

            h = max(roundDistances)
            # Single-argument print produces the same text as the former
            # "print i, j, h" statement under Python 2 and Python 3.
            print(" ".join([str(i), str(j), str(h)]))

            if h < oldh:
                oldh = h

    print(" ".join(["Minimum hamming distance over ",
                    str(totalNumberOfRounds), " is ", str(oldh), "\n"]))
def effectOfInterleaving(genomeSource1, Lrepeat, Liid,
                         totalNumberOfRounds=100000):
    """Randomly sample position pairs and report the smallest flank distance.

    For each round two offsets ``i < j`` are drawn at random.  The
    ``Liid``-character windows at ``i`` and ``j`` are compared, then the
    windows at ``i + Lrepeat`` and ``j + Lrepeat``; the round's score ``h``
    is the larger of the two ``distanceComputeLib.hammingDistance`` values.
    Every round prints ``i j h`` and, after the last round, the smallest
    ``h`` seen (starting from an initial value of ``Liid``) is printed.

    Args:
        genomeSource1: path of the sequence file to sample from.
        Lrepeat: distance between the first and the second compared window.
        Liid: length of each compared window.
        totalNumberOfRounds: number of random pairs to draw.  Defaults to
            100000, the value that used to be hard-coded.

    Returns:
        None.  Results are written to standard output only.

    Raises:
        ValueError: propagated from ``random.randint`` when the file is too
            short for the requested ``Lrepeat`` and ``Liid``.
    """
    # The context manager closes the file even when a round raises.
    with open(genomeSource1, 'r') as f1:
        G = len(f1.read())
        print(G)

        oldh = Liid
        # Largest offset from which both windows still fit inside the file.
        lastStart = G - Lrepeat - Liid

        for numberOfRounds in range(totalNumberOfRounds):
            i = random.randint(0, lastStart - 1)
            j = random.randint(i + 1, lastStart)

            # Same comparison twice: at the sampled offsets, then again
            # Lrepeat characters further along.
            roundDistances = []
            for shift in (0, Lrepeat):
                f1.seek(i + shift)
                substring1 = f1.read(Liid)
                f1.seek(j + shift)
                substring2 = f1.read(Liid)
                roundDistances.append(distanceComputeLib.hammingDistance(
                    substring1, substring2, len(substring1)))

            h = max(roundDistances)
            # Single-argument print produces the same text as the former
            # "print i, j, h" statement under Python 2 and Python 3.
            print(" ".join([str(i), str(j), str(h)]))

            if h < oldh:
                oldh = h

    print(" ".join(["Minimum hamming distance over ",
                    str(totalNumberOfRounds), " is ", str(oldh), "\n"]))
def checking(temp, genomeSource1, genomeSource2):
    """Print Hamming distances for a reported match and for a padded copy.

    ``temp`` is printed first.  The function then compares ``temp[2]``
    characters read from offset ``temp[0] - 1`` of ``genomeSource1`` against
    the same number read from offset ``temp[1] - 1`` of ``genomeSource2``.
    It repeats the comparison with the windows widened by one character on
    each side (offsets ``- 2``, length ``+ 2``).  Each distance is printed
    on its own line, prefixed with ``Hamming distance``.

    Args:
        temp: indexable with at least three items: position in the first
            file, position in the second file, and match length.  The
            ``- 1`` offsets suggest the positions are 1-based -- TODO
            confirm against the caller.
        genomeSource1: path of the first sequence file.
        genomeSource2: path of the second sequence file.

    Returns:
        None.  Results are written to standard output only.
    """
    print(temp)

    # Context managers close both files even if a seek or read fails (for
    # instance a negative offset when a position is too close to the start).
    with open(genomeSource1, 'r') as f1, open(genomeSource2, 'r') as f2:
        # padding 0: the match exactly as reported;
        # padding 1: one extra character before and after it.
        for padding in (0, 1):
            f1.seek(temp[0] - 1 - padding)
            f2.seek(temp[1] - 1 - padding)
            str1 = f1.read(temp[2] + 2 * padding)
            str2 = f2.read(temp[2] + 2 * padding)
            distance = distanceComputeLib.hammingDistance(
                str1, str2, len(str1))
            print(" ".join(["Hamming distance", str(distance)]))
def reportPatternBeyondRepeat(genomeSource1, genomeSource2, startpt1, endpt1,
                              startpt2, endpt2, plotRange, outputResult):
    """Write windowed Hamming distances past a pair of repeats to a file.

    Prints the length of ``genomeSource1``, then reads up to ``plotRange``
    consecutive 10-character windows from ``genomeSource1`` starting at
    ``endpt1`` and from ``genomeSource2`` starting at ``endpt2``.  Each pair
    is scored with ``distanceComputeLib.hammingDistance`` and the scores are
    written to ``outputResult``, one per line.  Scanning stops as soon as
    either offset reaches the length of ``genomeSource1``.

    Args:
        genomeSource1: path of the first sequence file.
        genomeSource2: path of the second sequence file.
        startpt1: unused by this function.
        endpt1: file offset in ``genomeSource1`` where scanning starts.
        startpt2: unused by this function.
        endpt2: file offset in ``genomeSource2`` where scanning starts.
        plotRange: maximum number of windows to score.
        outputResult: path of the text file to create or overwrite.

    Returns:
        None.
    """
    windowSize = 10
    distanceList = []

    with open(genomeSource1, 'r') as Gchecker:
        G = len(Gchecker.read())
    print(G)

    # Context managers close the inputs even if a read or the distance
    # computation raises part-way through.
    with open(genomeSource1, 'r') as f1, open(genomeSource2, 'r') as f2:
        pointerLocation1 = endpt1
        pointerLocation2 = endpt2
        for index in range(plotRange):
            # NOTE(review): both offsets are checked against the length of
            # genomeSource1 only -- confirm the two files are expected to
            # have the same length.
            if pointerLocation1 >= G or pointerLocation2 >= G:
                # The offsets only ever grow, so no later window can pass
                # this check either.
                break

            f1.seek(pointerLocation1)
            f2.seek(pointerLocation2)
            str1 = f1.read(windowSize)
            str2 = f2.read(windowSize)
            pointerLocation1 += windowSize
            pointerLocation2 += windowSize

            # The final window can be shorter than windowSize; clamp the
            # compared length the same way plotPatternBeyondRepeat does so
            # the distance routine is never asked to look past the data.
            comparedLength = min(windowSize, len(str1), len(str2))
            distanceList.append(
                distanceComputeLib.hammingDistance(str1, str2, comparedLength))

    with open(outputResult, 'w') as f:
        f.writelines(str(eachitem) + "\n" for eachitem in distanceList)
def reportPatternBeyondRepeat(genomeSource1, genomeSource2, startpt1, endpt1,
                              startpt2, endpt2, plotRange, outputResult):
    """Write windowed Hamming distances past a pair of repeats to a file.

    Prints the length of ``genomeSource1``, then reads up to ``plotRange``
    consecutive 10-character windows from ``genomeSource1`` starting at
    ``endpt1`` and from ``genomeSource2`` starting at ``endpt2``.  Each pair
    is scored with ``distanceComputeLib.hammingDistance`` and the scores are
    written to ``outputResult``, one per line.  Scanning stops as soon as
    either offset reaches the length of ``genomeSource1``.

    Args:
        genomeSource1: path of the first sequence file.
        genomeSource2: path of the second sequence file.
        startpt1: unused by this function.
        endpt1: file offset in ``genomeSource1`` where scanning starts.
        startpt2: unused by this function.
        endpt2: file offset in ``genomeSource2`` where scanning starts.
        plotRange: maximum number of windows to score.
        outputResult: path of the text file to create or overwrite.

    Returns:
        None.
    """
    windowSize = 10
    distanceList = []

    with open(genomeSource1, 'r') as Gchecker:
        G = len(Gchecker.read())
    print(G)

    # Context managers close the inputs even if a read or the distance
    # computation raises part-way through.
    with open(genomeSource1, 'r') as f1, open(genomeSource2, 'r') as f2:
        pointerLocation1 = endpt1
        pointerLocation2 = endpt2
        for index in range(plotRange):
            # NOTE(review): both offsets are checked against the length of
            # genomeSource1 only -- confirm the two files are expected to
            # have the same length.
            if pointerLocation1 >= G or pointerLocation2 >= G:
                # The offsets only ever grow, so no later window can pass
                # this check either.
                break

            f1.seek(pointerLocation1)
            f2.seek(pointerLocation2)
            str1 = f1.read(windowSize)
            str2 = f2.read(windowSize)
            pointerLocation1 += windowSize
            pointerLocation2 += windowSize

            # The final window can be shorter than windowSize; clamp the
            # compared length the same way plotPatternBeyondRepeat does so
            # the distance routine is never asked to look past the data.
            comparedLength = min(windowSize, len(str1), len(str2))
            distanceList.append(
                distanceComputeLib.hammingDistance(str1, str2, comparedLength))

    with open(outputResult, 'w') as f:
        f.writelines(str(eachitem) + "\n" for eachitem in distanceList)
def findapproxrepeatLength(filename1, filename2, start1, start2,
                           lengthOfExactRepeat):
    """Grow an exact repeat into an approximate one and estimate its error rate.

    A 100-character window is slid one character at a time, first to the
    right from the tail of the exact repeat and then to the left from its
    head.  Each direction stops as soon as the window holds at least
    ``threshold`` (25) mismatching positions between the two files.  The
    positions where the two scans stop give the approximate repeat length
    ``lapprox``.  The mismatches counted along the way, minus the
    ``2 * threshold`` that triggered the two stops, give the mutation rate.

    Args:
        filename1: path of the file containing the first repeat copy.
        filename2: path of the file containing the second repeat copy.
        start1: offset of the exact repeat in ``filename1``.
        start2: offset of the exact repeat in ``filename2``.
        lengthOfExactRepeat: length of the exact repeat.

    Returns:
        A 4-tuple.  When ``lapprox <= lengthOfExactRepeat`` it is
        ``(start1, start2, lengthOfExactRepeat + 1,
        1 / float(lengthOfExactRepeat))``.  Otherwise it is
        ``(startIndex1, startIndex2, lapprox, mutationRate)``, with the two
        start indices ordered smallest first.

    Note:
        NOTE(review): neither scan checks file boundaries.  The leftward
        scan can seek to a negative offset (which raises), and the rightward
        scan reads empty strings past the end of a file -- confirm callers
        never pass repeats that close to either end.
    """
    windowSize = 100
    threshold = 25
    ### Decision rule : stop once the latest window of `windowSize`
    ### characters holds at least `threshold` mismatches.

    # Each boundary is pulled back into the final window by this amount.
    # Presumably this keeps the error-dense tail that triggered the stop out
    # of the reported repeat -- TODO confirm the 1.3333 factor.
    boundaryTrim = int(threshold * 1.3333)

    totalNumberOfError = 0

    # The original never closed these handles; the context manager releases
    # them on every exit path.
    with open(filename1, 'r') as f1, open(filename2, 'r') as f2:
        ### Compute the RHS
        # lastPosition* is the left edge of the current window.
        lastPosition1 = start1 + lengthOfExactRepeat - windowSize - 1
        lastPosition2 = start2 + lengthOfExactRepeat - windowSize - 1
        f1.seek(lastPosition1)
        f2.seek(lastPosition2)
        temp1 = f1.read(windowSize)
        temp2 = f2.read(windowSize)

        numberOfError = distanceComputeLib.hammingDistance(
            temp1, temp2, len(temp1))
        print(" ".join(["CheckPoint 1 : ", str(numberOfError)]))
        totalNumberOfError = totalNumberOfError + numberOfError

        while numberOfError < threshold:
            # Character leaving the window on the left.
            f1.seek(lastPosition1)
            char1 = f1.read(1)
            f2.seek(lastPosition2)
            char2 = f2.read(1)
            if char1 != char2:
                numberOfError = numberOfError - 1

            # Character entering the window on the right.
            f1.seek(lastPosition1 + windowSize)
            f2.seek(lastPosition2 + windowSize)
            char1 = f1.read(1)
            char2 = f2.read(1)
            if char1 != char2:
                numberOfError = numberOfError + 1
                totalNumberOfError = totalNumberOfError + 1

            lastPosition1 = lastPosition1 + 1
            lastPosition2 = lastPosition2 + 1

        endIndex1 = lastPosition1 + windowSize - boundaryTrim

        ### Compute the LHS
        f1.seek(start1)
        f2.seek(start2)
        temp1 = f1.read(windowSize)
        temp2 = f2.read(windowSize)
        # Here lastPosition* is the right edge of the current window.
        lastPosition1 = start1 + windowSize - 1
        lastPosition2 = start2 + windowSize - 1

        numberOfError = distanceComputeLib.hammingDistance(
            temp1, temp2, len(temp1))
        print(" ".join(["checkPoint2 :", str(numberOfError)]))
        totalNumberOfError = totalNumberOfError + numberOfError

        while numberOfError < threshold:
            # Character leaving the window on the right.
            f1.seek(lastPosition1)
            char1 = f1.read(1)
            f2.seek(lastPosition2)
            char2 = f2.read(1)
            if char1 != char2:
                numberOfError = numberOfError - 1

            # Character entering the window on the left.
            f1.seek(lastPosition1 - windowSize)
            f2.seek(lastPosition2 - windowSize)
            char1 = f1.read(1)
            char2 = f2.read(1)
            if char1 != char2:
                numberOfError = numberOfError + 1
                totalNumberOfError = totalNumberOfError + 1

            lastPosition1 = lastPosition1 - 1
            lastPosition2 = lastPosition2 - 1

    startIndex1 = lastPosition1 - windowSize + boundaryTrim
    startIndex2 = lastPosition2 - windowSize + boundaryTrim

    # Measured on the first file, before the start indices are reordered.
    lapprox = endIndex1 - startIndex1
    print(" ".join([str(lapprox), str(totalNumberOfError), str(threshold),
                    str(lengthOfExactRepeat)]))

    # Both scans stop on `threshold` mismatches, so those 2 * threshold
    # errors are subtracted before computing the rate.
    excessError = totalNumberOfError - 2 * threshold
    mutationRate = excessError / float(lapprox)
    if mutationRate == 0:
        # Report one error over the whole length rather than a zero rate.
        mutationRate = 1 / float(lapprox)
    print(" ".join(["mutationRate", str(mutationRate), str(excessError)]))

    if startIndex1 > startIndex2:
        startIndex1, startIndex2 = startIndex2, startIndex1

    if lapprox <= lengthOfExactRepeat:
        return (start1, start2, lengthOfExactRepeat + 1,
                1 / float(lengthOfExactRepeat))
    else:
        return startIndex1, startIndex2, lapprox, mutationRate
def findapproxrepeatLength(filename1, filename2, start1, start2,
                           lengthOfExactRepeat):
    """Grow an exact repeat into an approximate one and estimate its error rate.

    A 100-character window is slid one character at a time, first to the
    right from the tail of the exact repeat and then to the left from its
    head.  Each direction stops as soon as the window holds at least
    ``threshold`` (25) mismatching positions between the two files.  The
    positions where the two scans stop give the approximate repeat length
    ``lapprox``.  The mismatches counted along the way, minus the
    ``2 * threshold`` that triggered the two stops, give the mutation rate.

    Args:
        filename1: path of the file containing the first repeat copy.
        filename2: path of the file containing the second repeat copy.
        start1: offset of the exact repeat in ``filename1``.
        start2: offset of the exact repeat in ``filename2``.
        lengthOfExactRepeat: length of the exact repeat.

    Returns:
        A 4-tuple.  When ``lapprox <= lengthOfExactRepeat`` it is
        ``(start1, start2, lengthOfExactRepeat + 1,
        1 / float(lengthOfExactRepeat))``.  Otherwise it is
        ``(startIndex1, startIndex2, lapprox, mutationRate)``, with the two
        start indices ordered smallest first.

    Note:
        NOTE(review): neither scan checks file boundaries.  The leftward
        scan can seek to a negative offset (which raises), and the rightward
        scan reads empty strings past the end of a file -- confirm callers
        never pass repeats that close to either end.
    """
    windowSize = 100
    threshold = 25
    ### Decision rule : stop once the latest window of `windowSize`
    ### characters holds at least `threshold` mismatches.

    # Each boundary is pulled back into the final window by this amount.
    # Presumably this keeps the error-dense tail that triggered the stop out
    # of the reported repeat -- TODO confirm the 1.3333 factor.
    boundaryTrim = int(threshold * 1.3333)

    totalNumberOfError = 0

    # The original never closed these handles; the context manager releases
    # them on every exit path.
    with open(filename1, 'r') as f1, open(filename2, 'r') as f2:
        ### Compute the RHS
        # lastPosition* is the left edge of the current window.
        lastPosition1 = start1 + lengthOfExactRepeat - windowSize - 1
        lastPosition2 = start2 + lengthOfExactRepeat - windowSize - 1
        f1.seek(lastPosition1)
        f2.seek(lastPosition2)
        temp1 = f1.read(windowSize)
        temp2 = f2.read(windowSize)

        numberOfError = distanceComputeLib.hammingDistance(
            temp1, temp2, len(temp1))
        print(" ".join(["CheckPoint 1 : ", str(numberOfError)]))
        totalNumberOfError = totalNumberOfError + numberOfError

        while numberOfError < threshold:
            # Character leaving the window on the left.
            f1.seek(lastPosition1)
            char1 = f1.read(1)
            f2.seek(lastPosition2)
            char2 = f2.read(1)
            if char1 != char2:
                numberOfError = numberOfError - 1

            # Character entering the window on the right.
            f1.seek(lastPosition1 + windowSize)
            f2.seek(lastPosition2 + windowSize)
            char1 = f1.read(1)
            char2 = f2.read(1)
            if char1 != char2:
                numberOfError = numberOfError + 1
                totalNumberOfError = totalNumberOfError + 1

            lastPosition1 = lastPosition1 + 1
            lastPosition2 = lastPosition2 + 1

        endIndex1 = lastPosition1 + windowSize - boundaryTrim

        ### Compute the LHS
        f1.seek(start1)
        f2.seek(start2)
        temp1 = f1.read(windowSize)
        temp2 = f2.read(windowSize)
        # Here lastPosition* is the right edge of the current window.
        lastPosition1 = start1 + windowSize - 1
        lastPosition2 = start2 + windowSize - 1

        numberOfError = distanceComputeLib.hammingDistance(
            temp1, temp2, len(temp1))
        print(" ".join(["checkPoint2 :", str(numberOfError)]))
        totalNumberOfError = totalNumberOfError + numberOfError

        while numberOfError < threshold:
            # Character leaving the window on the right.
            f1.seek(lastPosition1)
            char1 = f1.read(1)
            f2.seek(lastPosition2)
            char2 = f2.read(1)
            if char1 != char2:
                numberOfError = numberOfError - 1

            # Character entering the window on the left.
            f1.seek(lastPosition1 - windowSize)
            f2.seek(lastPosition2 - windowSize)
            char1 = f1.read(1)
            char2 = f2.read(1)
            if char1 != char2:
                numberOfError = numberOfError + 1
                totalNumberOfError = totalNumberOfError + 1

            lastPosition1 = lastPosition1 - 1
            lastPosition2 = lastPosition2 - 1

    startIndex1 = lastPosition1 - windowSize + boundaryTrim
    startIndex2 = lastPosition2 - windowSize + boundaryTrim

    # Measured on the first file, before the start indices are reordered.
    lapprox = endIndex1 - startIndex1
    print(" ".join([str(lapprox), str(totalNumberOfError), str(threshold),
                    str(lengthOfExactRepeat)]))

    # Both scans stop on `threshold` mismatches, so those 2 * threshold
    # errors are subtracted before computing the rate.
    excessError = totalNumberOfError - 2 * threshold
    mutationRate = excessError / float(lapprox)
    if mutationRate == 0:
        # Report one error over the whole length rather than a zero rate.
        mutationRate = 1 / float(lapprox)
    print(" ".join(["mutationRate", str(mutationRate), str(excessError)]))

    if startIndex1 > startIndex2:
        startIndex1, startIndex2 = startIndex2, startIndex1

    if lapprox <= lengthOfExactRepeat:
        return (start1, start2, lengthOfExactRepeat + 1,
                1 / float(lengthOfExactRepeat))
    else:
        return startIndex1, startIndex2, lapprox, mutationRate