def recurseSubtract(sList, otherList):
    '''Subtract the tccs of otherList from the tccs of sList.

    Subtracting one tcc can leave pieces that still overlap other entries of
    otherList.  Each pass therefore removes at most one overlapping tcc per
    kept coordinate, and the function calls itself on the pieces until a
    whole pass finds no overlap at all.

    sList     -- tccs to keep (the ones being subtracted from)
    otherList -- tccs to subtract

    Returns the list of tccs that is left after the final pass.
    '''
    foundOverlap = False
    remaining = []

    for keepTcc in sList:
        for otherTcc in otherList:
            if not bioLibCG.tccOverlap(keepTcc, otherTcc):
                continue
            # Only the first overlapping tcc is handled in this pass; the
            # recursive call below takes care of any further overlaps.
            foundOverlap = True
            for piece in subtractTwoTcc(keepTcc, otherTcc):
                if piece not in remaining:
                    remaining.append(piece)
            break
        else:
            # Loop ended without break --> keepTcc overlapped nothing and
            # is carried over unchanged.
            remaining.append(keepTcc)

    if not foundOverlap:
        # no overlaps, return all the way to top
        return remaining
    # once all other recursions have ended return the final list
    return recurseSubtract(remaining, otherList)
def makeConnectionsDict(tccList, complexity=None):
    '''Map every coordinate in tccList to the other coordinates it overlaps.

    The result has the form  tcc : [tcc1, tcc2, etc].  Every coordinate of
    tccList is a key, even when nothing overlaps it (empty list), and a
    coordinate is never listed as connected to itself.

    tccList    -- coordinates as colon separated strings; fields 2 and 3 are
                  read as the integer low and high positions
    complexity -- handed to returnTccIndex (presumably the number of index
                  bins --> confirm there); defaults to len(tccList)
    '''
    #make sure complexity is set
    if complexity is None:
        complexity = len(tccList)

    #make the index
    (tccIndex, indexMin, indexStep) = returnTccIndex(tccList, complexity)

    #compare vs sequences
    connectionsDict = {}  # format is coord: [tcc, tcc, etc]
    for coord in tccList:
        #Add coord to connections dict --> every coord is a key
        connections = []
        connectionsDict[coord] = connections
        # Same content as connections, kept as a set so that a coordinate
        # found again in another bin is skipped without scanning the list.
        alreadyConnected = set()

        fields = coord.strip().split(':')
        numCheckLow = int(fields[2])
        numCheckHigh = int(fields[3])
        indexCheckLow = getStepIndex(numCheckLow, indexMin, indexStep)
        indexCheckHigh = getStepIndex(numCheckHigh, indexMin, indexStep)

        for indexCheck in range(indexCheckLow, indexCheckHigh + 1, indexStep):
            if indexCheck not in tccIndex:
                continue
            for indexCoord in tccIndex[indexCheck]:
                if indexCoord == coord or indexCoord in alreadyConnected:
                    continue
                # No break after a hit: all overlaps have to be collected.
                if bioLibCG.tccOverlap(coord, indexCoord):
                    alreadyConnected.add(indexCoord)
                    connections.append(indexCoord)

    return connectionsDict
def makeConnectionsDict(tccList, complexity = None):
    '''Map every coordinate in tccList to the other coordinates it overlaps.

    The result has the form  tcc : [tcc1, tcc2, etc].  Every coordinate of
    tccList is a key, even when nothing overlaps it (empty list), and a
    coordinate is never listed as connected to itself.

    tccList    -- coordinates as colon separated strings; fields 2 and 3 are
                  read as the integer low and high positions
    complexity -- handed to returnTccIndex (presumably the number of index
                  bins --> confirm there); defaults to len(tccList)
    '''
    #make sure complexity is set
    if complexity is None:
        complexity = len(tccList)

    #make the index
    (tccIndex, indexMin, indexStep) = returnTccIndex(tccList, complexity)

    #compare vs sequences
    connectionsDict = {}  # format is coord: [tcc, tcc, etc]
    for coord in tccList:
        #Add coord to connections dict --> every coord is a key
        connections = []
        connectionsDict[coord] = connections
        # Same content as connections, kept as a set so that a coordinate
        # found again in another bin is skipped without scanning the list.
        alreadyConnected = set()

        fields = coord.strip().split(':')
        numCheckLow = int(fields[2])
        numCheckHigh = int(fields[3])
        indexCheckLow = getStepIndex(numCheckLow, indexMin, indexStep)
        indexCheckHigh = getStepIndex(numCheckHigh, indexMin, indexStep)

        for indexCheck in range(indexCheckLow, indexCheckHigh + 1, indexStep):
            if indexCheck not in tccIndex:
                continue
            for indexCoord in tccIndex[indexCheck]:
                if indexCoord == coord or indexCoord in alreadyConnected:
                    continue
                # No break after a hit: all overlaps have to be collected.
                if bioLibCG.tccOverlap(coord, indexCoord):
                    alreadyConnected.add(indexCoord)
                    connections.append(indexCoord)

    return connectionsDict
def compareTwoTcc(tccListOne, tccListTwo, order = 0, complexity = None, amount = False):
    '''Return the coordinates of one TCC list that overlap something in the other.

    One list is binned into an index (returnTccIndex) and each coordinate of
    the other list is only tested against the index entries of the bins it
    spans --> runs quicker than an all-vs-all comparison.  Only coordinates
    of the NON INDEXED list are returned, each one at most once and paired
    with the first overlapping index entry that was found.

    order      -- decides which list is indexed:
                  0 = the shorter list is indexed (on a tie tccListTwo is)
                  1 = tccListTwo is indexed, tccListOne coords are returned
                  2 = tccListOne is indexed, tccListTwo coords are returned
    complexity -- handed to returnTccIndex (roughly # of bins); defaults to
                  the length of the indexed list
    amount     -- if True every entry is [coord, overlap] instead of coord,
                  where overlap is the value returned by
                  bioLibCG.tccOverlap(coord, indexCoord, True) (presumably
                  the amount of overlap --> confirm in bioLibCG)

    Raises ValueError if order is not 0, 1 or 2.
    '''
    #decide which list is to be indexed
    if order == 0:
        #shortest list is indexed
        if len(tccListOne) < len(tccListTwo):
            indexList, otherList = tccListOne, tccListTwo
        else:
            indexList, otherList = tccListTwo, tccListOne
    elif order == 1:
        indexList, otherList = tccListTwo, tccListOne
    elif order == 2:
        indexList, otherList = tccListOne, tccListTwo
    else:
        # Used to surface later as an UnboundLocalError on indexList.
        raise ValueError('order must be 0, 1 or 2, got %r' % (order,))

    #make sure complexity is set, should set max complexity here...
    if complexity is None:
        complexity = len(indexList)

    #make the index
    (tccIndex, indexMin, indexStep) = returnTccIndex(indexList, complexity)

    #compare vs sequences
    reducedList = []
    # Coordinates already reported.  Tracked separately from reducedList
    # because with amount=True the entries are [coord, overlap] pairs, so a
    # membership test on reducedList itself can never find the coordinate.
    reported = set()
    for coord in otherList:
        fields = coord.strip().split(':')
        numCheckLow = int(fields[2])
        numCheckHigh = int(fields[3])
        indexCheckLow = getStepIndex(numCheckLow, indexMin, indexStep)
        indexCheckHigh = getStepIndex(numCheckHigh, indexMin, indexStep)

        for indexCheck in range(indexCheckLow, indexCheckHigh + 1, indexStep):
            if coord in reported:
                break #Already overlapped from another indexCheck...
            if indexCheck not in tccIndex:
                continue
            for indexCoord in tccIndex[indexCheck]:
                overlap = bioLibCG.tccOverlap(coord, indexCoord, True)
                if overlap: #if there is any overlap...
                    if amount:
                        reducedList.append([coord, overlap])
                    else:
                        reducedList.append(coord)
                    reported.add(coord)
                    break #already found this coords overlap --> don't check other indexCoords

    return reducedList
def compareTwoTcc(tccListOne, tccListTwo, order=0, complexity=None):
    '''Return the coordinates of one TCC list that overlap something in the other.

    One list is binned into an index (returnTccIndex) and each coordinate of
    the other list is only tested against the index entries of the bins it
    spans --> runs quicker than an all-vs-all comparison.  Only coordinates
    of the NON INDEXED list are returned, each one at most once.

    order      -- decides which list is indexed:
                  0 = the shorter list is indexed (on a tie tccListTwo is)
                  1 = tccListTwo is indexed, tccListOne coords are returned
                  2 = tccListOne is indexed, tccListTwo coords are returned
    complexity -- handed to returnTccIndex (roughly # of bins); defaults to
                  the length of the indexed list

    Raises ValueError if order is not 0, 1 or 2.
    '''
    #decide which list is to be indexed
    if order == 0:
        #shortest list is indexed
        if len(tccListOne) < len(tccListTwo):
            indexList, otherList = tccListOne, tccListTwo
        else:
            indexList, otherList = tccListTwo, tccListOne
    elif order == 1:
        indexList, otherList = tccListTwo, tccListOne
    elif order == 2:
        indexList, otherList = tccListOne, tccListTwo
    else:
        # Used to surface later as an UnboundLocalError on indexList.
        raise ValueError('order must be 0, 1 or 2, got %r' % (order,))

    #make sure complexity is set, should set max complexity here...
    if complexity is None:
        complexity = len(indexList)

    #make the index
    (tccIndex, indexMin, indexStep) = returnTccIndex(indexList, complexity)

    #compare vs sequences
    reducedList = []
    # Mirrors reducedList as a set so the "already reported" test does not
    # scan the whole result list for every bin.
    reported = set()
    for coord in otherList:
        fields = coord.strip().split(':')
        numCheckLow = int(fields[2])
        numCheckHigh = int(fields[3])
        indexCheckLow = getStepIndex(numCheckLow, indexMin, indexStep)
        indexCheckHigh = getStepIndex(numCheckHigh, indexMin, indexStep)

        for indexCheck in range(indexCheckLow, indexCheckHigh + 1, indexStep):
            if coord in reported:
                break #Already overlapped from another indexCheck...
            if indexCheck not in tccIndex:
                continue
            for indexCoord in tccIndex[indexCheck]:
                if bioLibCG.tccOverlap(coord, indexCoord):
                    reducedList.append(coord)
                    reported.add(coord)
                    break #already found this coords overlap --> don't check other indexCoords

    return reducedList
def getOverlappingElements(self, tcc):
    '''Given region, which elements (5UTR, EXON, INTRON, 3UTR) overlap it.

    tcc -- region to test; compared with bioLibCG.tccOverlap against a tcc
           built (bioLibCG.makeTcc) from self.chromosome, self.strand and
           the first two items of each segment

    Returns a list of [segment, label] pairs, in the order 5UTR, EXON,
    INTRON, 3UTR, where segment is the entry taken from self.utr5,
    self.exonList, self.intronList or self.utr3.
    '''
    overlappingElements = []

    def overlapsRegion(segment):
        #build the segment's tcc and test it against the requested region
        segmentTcc = bioLibCG.makeTcc(self.chromosome, self.strand, segment[0], segment[1])
        return bioLibCG.tccOverlap(segmentTcc, tcc)

    # An IndexError while scanning the UTR segments is tolerated on purpose;
    # it ends the scan of that UTR list and keeps what was found so far.
    try:
        for utrSegment in self.utr5:
            if overlapsRegion(utrSegment):
                overlappingElements.append([utrSegment, '5UTR'])
    except IndexError:
        pass

    for exon in self.exonList:
        if overlapsRegion(exon):
            overlappingElements.append([exon, 'EXON'])

    for intron in self.intronList:
        if overlapsRegion(intron):
            overlappingElements.append([intron, 'INTRON'])

    try:
        for utrSegment in self.utr3:
            if overlapsRegion(utrSegment):
                overlappingElements.append([utrSegment, '3UTR'])
    except IndexError:
        pass

    #!!!Eventually add a way to find if overlapping EXON_UTR as well
    # NOTE: an EXON_INTRON consolidation was attempted here, but it looked for
    # the bare strings 'EXON' / 'INTRON' in a list that only holds
    # [segment, label] pairs, so it could never trigger.  It is left out;
    # callers keep receiving the plain pairs.
    return overlappingElements
def getOverlappingElements(self, tcc):
    """Given region, which elements (5UTR, EXON, INTRON, 3UTR) overlap it.

    tcc -- region to test; compared with bioLibCG.tccOverlap against a tcc
           built (bioLibCG.makeTcc) from self.chromosome, self.strand and
           the first two items of each segment

    Returns a list of [segment, label] pairs, in the order 5UTR, EXON,
    INTRON, 3UTR, where segment is the entry taken from self.utr5,
    self.exonList, self.intronList or self.utr3.
    """
    overlappingElements = []

    def overlapsRegion(segment):
        # build the segment's tcc and test it against the requested region
        segmentTcc = bioLibCG.makeTcc(self.chromosome, self.strand, segment[0], segment[1])
        return bioLibCG.tccOverlap(segmentTcc, tcc)

    # An IndexError while scanning the UTR segments is tolerated on purpose;
    # it ends the scan of that UTR list and keeps what was found so far.
    try:
        for utrSegment in self.utr5:
            if overlapsRegion(utrSegment):
                overlappingElements.append([utrSegment, "5UTR"])
    except IndexError:
        pass

    for exon in self.exonList:
        if overlapsRegion(exon):
            overlappingElements.append([exon, "EXON"])

    for intron in self.intronList:
        if overlapsRegion(intron):
            overlappingElements.append([intron, "INTRON"])

    try:
        for utrSegment in self.utr3:
            if overlapsRegion(utrSegment):
                overlappingElements.append([utrSegment, "3UTR"])
    except IndexError:
        pass

    #!!!Eventually add a way to find if overlapping EXON_UTR as well
    # NOTE: an EXON_INTRON consolidation was attempted here, but it looked for
    # the bare strings "EXON" / "INTRON" in a list that only holds
    # [segment, label] pairs, so it could never trigger.  It is left out;
    # callers keep receiving the plain pairs.
    return overlappingElements
def updateDensity(cName=None):
    '''Rewrite the results file with field 5 set to each prediction's cVal.

    The cVal of a prediction is the highest count among the blocks of the
    'hitsPerFrame' file that overlap it (bioLibCG.tccOverlap), or 0 when no
    block overlaps.

    cName -- handed to cgConfig.getConfig to select the configuration

    Files (paths come from conf.conf):
      'hitsPerFrame' -- read; tab separated, field 0 = block tcc,
                        field 1 = integer count
      'results'      -- read completely, then overwritten; tab separated,
                        field 1 = prediction tcc, field 5 = replaced by cVal
    '''
    conf = cgConfig.getConfig(cName)

    #Create hitmap for blocks, cValdict for block
    blocksList = []
    cValBlockDict = {}
    # created in defineCluster script folder
    with open(conf.conf['hitsPerFrame'], 'r') as blockFile:
        for line in blockFile:
            fields = line.strip().split('\t')
            blocksList.append(fields[0])
            cValBlockDict[fields[0]] = int(fields[1])

    blockHitmap = bioLibCG.createHitMap(blocksList)

    #Now append the cVal for each predicted line:
    resultsFileName = conf.conf['results']
    newFileList = []
    with open(resultsFileName, 'r') as predictedFile:
        for line in predictedFile:
            fields = line.strip().split('\t')
            tccPrediction = fields[1]  #This should be mature?
            coordsPrediction = bioLibCG.stripTripleColon(tccPrediction)

            #what blocks does this prediction overlap?
            cVal = 0
            # NOTE(review): the 'end' position itself is not looked up
            # (range excludes it) --> confirm this matches the hitmap.
            for i in range(int(coordsPrediction['start']), int(coordsPrediction['end'])):
                if i not in blockHitmap:
                    continue
                for block in blockHitmap[i]:
                    if bioLibCG.tccOverlap(tccPrediction, block):
                        cVal = max(cVal, cValBlockDict[block])

            fields[5] = str(cVal)
            newFileList.append('\t'.join(fields) + '\n')

    # The results file is only reopened for writing once every line has been
    # read and processed, so an error above leaves it untouched.
    with open(resultsFileName, 'w') as newFile:
        newFile.writelines(newFileList)
def getIndividualOverlaps(tccListOne, tccListTwo, order, complexity=None):
    '''For each coordinate of one TCC list, collect the overlapping coordinates of the other.

    One list is binned into an index (returnTccIndex) and each coordinate of
    the other list is only tested against the index entries of the bins it
    spans --> runs quicker than an all-vs-all comparison.

    order      -- decides which list is indexed:
                  1 = tccListTwo is indexed, tccListOne coords are the keys
                  2 = tccListOne is indexed, tccListTwo coords are the keys
                  (there is no "0 = shortest is indexed" mode here)
    complexity -- handed to returnTccIndex (roughly # of bins); defaults to
                  the length of the indexed list

    Returns a dict  tcc : [tcc, tcc].  Every coordinate of the non indexed
    list is a key; its value lists, without repeats, the indexed coordinates
    for which bioLibCG.tccOverlap(coord, indexCoord, True) is truthy.

    Raises ValueError if order is not 1 or 2.
    '''
    #decide which list is to be indexed
    if order == 1:
        indexList, otherList = tccListTwo, tccListOne
    elif order == 2:
        indexList, otherList = tccListOne, tccListTwo
    else:
        # Used to surface later as an UnboundLocalError on indexList.
        raise ValueError('order must be 1 or 2, got %r' % (order,))

    #make sure complexity is set, should set max complexity here...
    if complexity is None:
        complexity = len(indexList)

    #make the index
    (tccIndex, indexMin, indexStep) = returnTccIndex(indexList, complexity)

    #compare vs sequences
    individualOverlaps = {}  # tcc : [tcc, tcc]
    for coord in otherList:
        overlapping = []
        individualOverlaps[coord] = overlapping

        fields = coord.strip().split(':')
        numCheckLow = int(fields[2])
        numCheckHigh = int(fields[3])
        indexCheckLow = getStepIndex(numCheckLow, indexMin, indexStep)
        indexCheckHigh = getStepIndex(numCheckHigh, indexMin, indexStep)

        for indexCheck in range(indexCheckLow, indexCheckHigh + 1, indexStep):
            if indexCheck not in tccIndex:
                continue
            for indexCoord in tccIndex[indexCheck]:
                overlap = bioLibCG.tccOverlap(coord, indexCoord, True)
                #if there is any overlap, record it once even when the same
                #indexCoord shows up in several bins
                if overlap and indexCoord not in overlapping:
                    overlapping.append(indexCoord)

    return individualOverlaps
def getIndividualOverlaps(tccListOne, tccListTwo, order, complexity = None):
    '''For each coordinate of one TCC list, collect the overlapping coordinates of the other.

    One list is binned into an index (returnTccIndex) and each coordinate of
    the other list is only tested against the index entries of the bins it
    spans --> runs quicker than an all-vs-all comparison.

    order      -- decides which list is indexed:
                  1 = tccListTwo is indexed, tccListOne coords are the keys
                  2 = tccListOne is indexed, tccListTwo coords are the keys
                  (there is no "0 = shortest is indexed" mode here)
    complexity -- handed to returnTccIndex (roughly # of bins); defaults to
                  the length of the indexed list

    Returns a dict  tcc : [tcc, tcc].  Every coordinate of the non indexed
    list is a key; its value lists, without repeats, the indexed coordinates
    for which bioLibCG.tccOverlap(coord, indexCoord, True) is truthy.

    Raises ValueError if order is not 1 or 2.
    '''
    #decide which list is to be indexed
    if order == 1:
        indexList, otherList = tccListTwo, tccListOne
    elif order == 2:
        indexList, otherList = tccListOne, tccListTwo
    else:
        # Used to surface later as an UnboundLocalError on indexList.
        raise ValueError('order must be 1 or 2, got %r' % (order,))

    #make sure complexity is set, should set max complexity here...
    if complexity is None:
        complexity = len(indexList)

    #make the index
    (tccIndex, indexMin, indexStep) = returnTccIndex(indexList, complexity)

    #compare vs sequences
    individualOverlaps = {}  # tcc : [tcc, tcc]
    for coord in otherList:
        overlapping = []
        individualOverlaps[coord] = overlapping

        fields = coord.strip().split(':')
        numCheckLow = int(fields[2])
        numCheckHigh = int(fields[3])
        indexCheckLow = getStepIndex(numCheckLow, indexMin, indexStep)
        indexCheckHigh = getStepIndex(numCheckHigh, indexMin, indexStep)

        for indexCheck in range(indexCheckLow, indexCheckHigh + 1, indexStep):
            if indexCheck not in tccIndex:
                continue
            for indexCoord in tccIndex[indexCheck]:
                overlap = bioLibCG.tccOverlap(coord, indexCoord, True)
                #if there is any overlap, record it once even when the same
                #indexCoord shows up in several bins
                if overlap and indexCoord not in overlapping:
                    overlapping.append(indexCoord)

    return individualOverlaps
def updateDensity(cName=None):
    """Rewrite the results file with field 5 set to each prediction's cVal.

    The cVal of a prediction is the highest count among the blocks of the
    "hitsPerFrame" file that overlap it (bioLibCG.tccOverlap), or 0 when no
    block overlaps.

    cName -- handed to cgConfig.getConfig to select the configuration

    Files (paths come from conf.conf):
      "hitsPerFrame" -- read; tab separated, field 0 = block tcc,
                        field 1 = integer count
      "results"      -- read completely, then overwritten; tab separated,
                        field 1 = prediction tcc, field 5 = replaced by cVal
    """
    conf = cgConfig.getConfig(cName)

    # Create hitmap for blocks, cValdict for block
    blocksList = []
    cValBlockDict = {}
    # created in defineCluster script folder
    with open(conf.conf["hitsPerFrame"], "r") as blockFile:
        for line in blockFile:
            fields = line.strip().split("\t")
            blocksList.append(fields[0])
            cValBlockDict[fields[0]] = int(fields[1])

    blockHitmap = bioLibCG.createHitMap(blocksList)

    # Now append the cVal for each predicted line:
    resultsFileName = conf.conf["results"]
    newFileList = []
    with open(resultsFileName, "r") as predictedFile:
        for line in predictedFile:
            fields = line.strip().split("\t")
            tccPrediction = fields[1]  # This should be mature?
            coordsPrediction = bioLibCG.stripTripleColon(tccPrediction)

            # what blocks does this prediction overlap?
            cVal = 0
            # NOTE(review): the "end" position itself is not looked up
            # (range excludes it) --> confirm this matches the hitmap.
            for i in range(int(coordsPrediction["start"]), int(coordsPrediction["end"])):
                if i not in blockHitmap:
                    continue
                for block in blockHitmap[i]:
                    if bioLibCG.tccOverlap(tccPrediction, block):
                        cVal = max(cVal, cValBlockDict[block])

            fields[5] = str(cVal)
            newFileList.append("\t".join(fields) + "\n")

    # The results file is only reopened for writing once every line has been
    # read and processed, so an error above leaves it untouched.
    with open(resultsFileName, "w") as newFile:
        newFile.writelines(newFileList)
def consolidatePeaksByTcc(dFN, outFN):
    '''Collapse every run of consecutive overlapping peaks to its last line.

    Each line is compared (bioLibCG.tccOverlap) with the line right before
    it; as long as they overlap the run continues, and when a line does not
    overlap its predecessor the predecessor is written out.  The last line
    of the final run is written once the input is exhausted.

    dFN   -- input file, tab separated, field 1 is the peak's tcc
    outFN -- output file, receives one input line per run
    '''
    lastLine = ''
    # Placeholder "previous" tcc for the first line.  Whatever the overlap
    # test says about it, nothing is lost: lastLine is still empty then.
    lastTcc = 'chr100:1:100:1000'

    with open(outFN, 'w') as fOut:
        with open(dFN, 'r') as f:
            for line in f:
                tcc = line.strip().split('\t')[1]
                if not bioLibCG.tccOverlap(tcc, lastTcc):
                    #previous run is over --> keep its last line
                    fOut.write(lastLine)
                lastLine = line
                lastTcc = tcc

        # The run that is still open when the file ends has to be written
        # too, otherwise the final peak is silently dropped.
        fOut.write(lastLine)