Beispiel #1
0
def recurseSubtract(sList, otherList):
    """Subtract the tccs in otherList from the tccs in sList.

    Each tcc in sList is compared with otherList.  At its first overlap it
    is replaced by the pieces subtractTwoTcc returns for that pair (a piece
    already collected is not added a second time); a tcc that overlaps
    nothing is carried over unchanged.  A leftover piece may still overlap
    another tcc in otherList, so the pass is repeated recursively until a
    whole pass finds no overlap.

    Returns the list of tccs left after the final pass.
    """
    remaining = []
    anyOverlap = False

    for keepTcc in sList:
        for otherTcc in otherList:
            if not bioLibCG.tccOverlap(keepTcc, otherTcc):
                continue
            anyOverlap = True
            for piece in subtractTwoTcc(keepTcc, otherTcc):
                if piece not in remaining:
                    remaining.append(piece)
            # only the first overlapping tcc is handled in this pass
            break
        else:
            # the inner loop never hit break: keepTcc overlapped nothing
            remaining.append(keepTcc)

    if not anyOverlap:
        # a clean pass ends the recursion
        return remaining
    return recurseSubtract(remaining, otherList)
def recurseSubtract(sList, otherList):
    """Remove from the tccs in sList everything covered by otherList.

    One pass walks sList: a tcc that overlaps some tcc in otherList is
    swapped for the result of subtractTwoTcc against the first such tcc
    (duplicate pieces are skipped), and a tcc without any overlap is kept
    as it is.  Whenever a pass found at least one overlap the function
    calls itself on the new list, because the pieces just produced may
    overlap further tccs in otherList.

    Returns the list produced by the first pass that finds no overlap.
    """
    result = []
    changed = False

    for known in sList:
        # lazy scan: tccOverlap is not called past the first overlap
        overlapping = (other for other in otherList
                       if bioLibCG.tccOverlap(known, other))
        found = False
        for other in overlapping:
            found = True
            for piece in subtractTwoTcc(known, other):
                if piece not in result:
                    result.append(piece)
            break

        if found:
            changed = True
        else:
            result.append(known)

    return recurseSubtract(result, otherList) if changed else result
Beispiel #3
0
def makeConnectionsDict(tccList, complexity=None):
    """Map every tcc in tccList to the other tccs in the list it overlaps.

    tccList    -- list of ':'-separated tcc strings; the third and fourth
                  fields are parsed as integers (presumably start and end)
                  to pick the index bins to search
    complexity -- handed to returnTccIndex; defaults to len(tccList)

    Returns a dict {tcc: [tcc, tcc, ...]}.  Every tcc in tccList is a key,
    a tcc is never listed as its own connection, and each connection
    appears once per key.
    """
    if complexity is None:
        complexity = len(tccList)

    # bin the list so each tcc is only compared with tccs in nearby bins
    tccIndex, indexMin, indexStep = returnTccIndex(tccList, complexity)

    connectionsDict = {}
    for coord in tccList:
        # every coord is a key, even when it ends up with no connections
        connections = connectionsDict[coord] = []

        fields = coord.strip().split(':')
        numCheckLow = int(fields[2])
        numCheckHigh = int(fields[3])
        indexCheckLow = getStepIndex(numCheckLow, indexMin, indexStep)
        indexCheckHigh = getStepIndex(numCheckHigh, indexMin, indexStep)

        for indexCheck in range(indexCheckLow, indexCheckHigh + 1, indexStep):
            if indexCheck not in tccIndex:
                continue
            for indexCoord in tccIndex[indexCheck]:
                if indexCoord == coord:
                    continue
                # no break here: all overlaps are wanted, not just the first
                if (bioLibCG.tccOverlap(coord, indexCoord)
                        and indexCoord not in connections):
                    connections.append(indexCoord)
    return connectionsDict
def makeConnectionsDict(tccList, complexity=None):
    """Build a connections dictionary: tcc -> [overlapping tccs in tccList].

    tccList    -- list of ':'-separated tcc strings; fields three and four
                  are converted to int (presumably start and end) and used
                  to select the index bins that are searched
    complexity -- forwarded to returnTccIndex; len(tccList) when None

    Returns {tcc: [tcc1, tcc2, ...]}.  Each tcc in tccList becomes a key;
    a tcc equal to the key is skipped and no connection is listed twice.
    """
    if complexity is None:
        complexity = len(tccList)

    # make the index
    tccIndex, indexMin, indexStep = returnTccIndex(tccList, complexity)

    connectionsDict = {}
    for coord in tccList:
        # add coord up front so that every coord is a key
        connectionsDict[coord] = []

        # parse the coordinate string once instead of once per field
        parts = coord.strip().split(':')
        numCheckLow = int(parts[2])
        numCheckHigh = int(parts[3])
        indexCheckLow = getStepIndex(numCheckLow, indexMin, indexStep)
        indexCheckHigh = getStepIndex(numCheckHigh, indexMin, indexStep)
        indexChecks = range(indexCheckLow, indexCheckHigh + 1, indexStep)

        for indexCheck in indexChecks:
            if indexCheck in tccIndex:
                for indexCoord in tccIndex[indexCheck]:
                    if indexCoord != coord and bioLibCG.tccOverlap(coord, indexCoord):
                        # keep scanning after a hit so all overlaps are found
                        if indexCoord not in connectionsDict[coord]:
                            connectionsDict[coord].append(indexCoord)
    return connectionsDict
def compareTwoTcc(tccListOne, tccListTwo, order=0, complexity=None, amount=False):
    """Return the tccs of one list that overlap at least one tcc of the other.

    One list is binned into an index with returnTccIndex; every tcc of the
    other list is then compared only with the indexed tccs in the bins its
    range touches.  The NON-indexed list is the one that is reported.

    order      -- 0 = the shorter list is indexed (tccListTwo on a tie)
                  1 = tccListOne is reported (tccListTwo is indexed)
                  2 = tccListTwo is reported (tccListOne is indexed)
    complexity -- forwarded to returnTccIndex; defaults to the length of
                  the indexed list
    amount     -- when true each result is [coord, overlap], where overlap
                  is the truthy value bioLibCG.tccOverlap(coord, indexCoord,
                  True) returned for the first overlapping indexed tcc
                  found; otherwise each result is just coord

    Returns a list with one entry per distinct overlapping tcc.
    Raises ValueError when order is not 0, 1 or 2.
    """
    # decide which list is to be indexed
    if order == 0:
        if len(tccListOne) < len(tccListTwo):
            indexList, otherList = tccListOne, tccListTwo
        else:
            indexList, otherList = tccListTwo, tccListOne
    elif order == 1:
        indexList, otherList = tccListTwo, tccListOne
    elif order == 2:
        indexList, otherList = tccListOne, tccListTwo
    else:
        # previously this fell through and failed later on an unbound name
        raise ValueError('order must be 0, 1 or 2, got %r' % (order,))

    if complexity is None:
        complexity = len(indexList)
    tccIndex, indexMin, indexStep = returnTccIndex(indexList, complexity)

    reducedList = []
    # coords already reported.  Testing "coord in reducedList" never matched
    # in amount mode (entries are [coord, overlap] pairs), so a coord that
    # spans several bins was reported once per bin.
    reported = set()
    for coord in otherList:
        fields = coord.strip().split(':')
        numCheckLow = int(fields[2])
        numCheckHigh = int(fields[3])
        indexCheckLow = getStepIndex(numCheckLow, indexMin, indexStep)
        indexCheckHigh = getStepIndex(numCheckHigh, indexMin, indexStep)

        for indexCheck in range(indexCheckLow, indexCheckHigh + 1, indexStep):
            if coord in reported:
                break  # already overlapped in an earlier bin
            if indexCheck not in tccIndex:
                continue
            for indexCoord in tccIndex[indexCheck]:
                overlap = bioLibCG.tccOverlap(coord, indexCoord, True)
                if overlap:
                    reported.add(coord)
                    reducedList.append([coord, overlap] if amount else coord)
                    break  # one overlap is enough for this coord
    return reducedList
Beispiel #6
0
def compareTwoTcc(tccListOne, tccListTwo, order=0, complexity=None):
    """Return the tccs of one list that overlap at least one tcc of the other.

    One list is binned into an index with returnTccIndex; every tcc of the
    other list is then compared only with the indexed tccs in the bins its
    range touches.  The NON-indexed list is the one that is returned.

    order      -- 0 = the shorter list is indexed (tccListTwo on a tie)
                  1 = tccListOne is returned (tccListTwo is indexed)
                  2 = tccListTwo is returned (tccListOne is indexed)
    complexity -- forwarded to returnTccIndex; defaults to the length of
                  the indexed list

    Returns a list holding each overlapping tcc once.
    Raises ValueError when order is not 0, 1 or 2.
    """
    # decide which list is to be indexed
    if order == 0:
        if len(tccListOne) < len(tccListTwo):
            indexList, otherList = tccListOne, tccListTwo
        else:
            indexList, otherList = tccListTwo, tccListOne
    elif order == 1:
        indexList, otherList = tccListTwo, tccListOne
    elif order == 2:
        indexList, otherList = tccListOne, tccListTwo
    else:
        # previously this fell through and failed later on an unbound name
        raise ValueError('order must be 0, 1 or 2, got %r' % (order,))

    if complexity is None:
        complexity = len(indexList)
    tccIndex, indexMin, indexStep = returnTccIndex(indexList, complexity)

    reducedList = []
    # mirrors reducedList so the "already reported" check is a set lookup
    # instead of a scan over the whole result list
    reported = set()
    for coord in otherList:
        fields = coord.strip().split(':')
        numCheckLow = int(fields[2])
        numCheckHigh = int(fields[3])
        indexCheckLow = getStepIndex(numCheckLow, indexMin, indexStep)
        indexCheckHigh = getStepIndex(numCheckHigh, indexMin, indexStep)

        for indexCheck in range(indexCheckLow, indexCheckHigh + 1, indexStep):
            if coord in reported:
                break  # already overlapped in an earlier bin
            if indexCheck not in tccIndex:
                continue
            for indexCoord in tccIndex[indexCheck]:
                if bioLibCG.tccOverlap(coord, indexCoord):
                    reported.add(coord)
                    reducedList.append(coord)
                    break  # one overlap is enough for this coord
    return reducedList
Beispiel #7
0
    def getOverlappingElements(self, tcc):
        """Return the elements of this transcript that overlap ``tcc``.

        Every segment in self.utr5, self.exonList, self.intronList and
        self.utr3 (checked in that order) is converted with
        bioLibCG.makeTcc(self.chromosome, self.strand, segment[0], segment[1])
        and kept when bioLibCG.tccOverlap reports an overlap with ``tcc``.

        Returns a list of [segment, label] pairs, where label is one of
        '5UTR', 'EXON', 'INTRON' or '3UTR'.
        """
        overlappingElements = []

        def collect(segments, label):
            # Append [segment, label] for each segment that overlaps tcc.
            for segment in segments:
                segmentTcc = bioLibCG.makeTcc(self.chromosome, self.strand,
                                              segment[0], segment[1])
                if bioLibCG.tccOverlap(segmentTcc, tcc):
                    overlappingElements.append([segment, label])

        # An IndexError during a UTR scan ends that scan quietly; pairs
        # found before the error are kept.
        try:
            collect(self.utr5, '5UTR')
        except IndexError:
            pass

        collect(self.exonList, 'EXON')
        collect(self.intronList, 'INTRON')

        try:
            collect(self.utr3, '3UTR')
        except IndexError:
            pass

        # NOTE: an EXON + INTRON -> 'EXON_INTRON' merge used to follow here.
        # It tested for the bare strings 'EXON'/'INTRON' in a list that only
        # holds [segment, label] pairs, so it could never run and was removed.
        # TODO: combined labels (EXON_INTRON, EXON_UTR) are still not reported.
        return overlappingElements
Beispiel #8
0
    def getOverlappingElements(self, tcc):
        """List the 5'UTR, exon, intron and 3'UTR segments overlapping ``tcc``.

        The segments of self.utr5, self.exonList, self.intronList and
        self.utr3 are visited in that order.  Each one is turned into a tcc
        with bioLibCG.makeTcc(self.chromosome, self.strand, start, end) and
        tested against ``tcc`` with bioLibCG.tccOverlap.

        Returns a list of [segment, label] pairs; label is "5UTR", "EXON",
        "INTRON" or "3UTR".
        """
        # (attribute holding the segments, label, swallow IndexError?)
        scans = (
            ("utr5", "5UTR", True),
            ("exonList", "EXON", False),
            ("intronList", "INTRON", False),
            ("utr3", "3UTR", True),
        )

        overlappingElements = []
        for attrName, label, ignoreIndexError in scans:
            try:
                for segment in getattr(self, attrName):
                    segmentTcc = bioLibCG.makeTcc(self.chromosome, self.strand, segment[0], segment[1])
                    if bioLibCG.tccOverlap(segmentTcc, tcc):
                        overlappingElements.append([segment, label])
            except IndexError:
                # only the UTR scans tolerate an IndexError; results that
                # were collected before it are kept
                if not ignoreIndexError:
                    raise

        # NOTE: the former "EXON" + "INTRON" -> "EXON_INTRON" merge was dead
        # code: it looked for bare label strings in a list that only contains
        # [segment, label] pairs, so the condition was never true.  Removed.
        # TODO: combined labels (EXON_INTRON, EXON_UTR) are still not reported.
        return overlappingElements
Beispiel #9
0
def updateDensity(cName=None):
    """Rewrite the sixth column of the results file with a block density.

    Reads two tab-separated files named in the config returned by
    cgConfig.getConfig(cName):
      'hitsPerFrame' -- block tcc in column 1, integer value in column 2
      'results'      -- prediction tcc in column 2

    For each prediction the largest value among the blocks overlapping it
    (never below 0) is stored, as a string, in column 6.  The results file
    is then overwritten in place with the updated lines.
    """
    conf = cgConfig.getConfig(cName)

    # block tcc -> value, plus the hit map used to find candidate blocks
    blocksList = []
    cValBlockDict = {}
    with open(conf.conf['hitsPerFrame'], 'r') as blockFile:
        for line in blockFile:
            fields = line.strip().split('\t')
            blocksList.append(fields[0])
            cValBlockDict[fields[0]] = int(fields[1])
    blockHitmap = bioLibCG.createHitMap(blocksList)

    resultsFileName = conf.conf['results']
    newFileList = []
    with open(resultsFileName, 'r') as predictedFile:
        for line in predictedFile:
            fields = line.strip().split('\t')
            tccPrediction = fields[1]
            coordsPrediction = bioLibCG.stripTripleColon(tccPrediction)

            cVal = 0
            # NOTE(review): range() stops one short of 'end' -- confirm
            # whether the end coordinate is meant to be inclusive.
            for i in range(int(coordsPrediction['start']),
                           int(coordsPrediction['end'])):
                if i not in blockHitmap:
                    continue
                for block in blockHitmap[i]:
                    if (bioLibCG.tccOverlap(tccPrediction, block)
                            and cValBlockDict[block] > cVal):
                        cVal = cValBlockDict[block]

            fields[5] = str(cVal)
            newFileList.append('\t'.join(fields) + '\n')

    # everything was read above, so the same file can now be rewritten
    with open(resultsFileName, 'w') as newFile:
        newFile.writelines(newFileList)
Beispiel #10
0
def getIndividualOverlaps(tccListOne, tccListTwo, order, complexity=None):
    """Map each tcc of one list to the tccs of the other list it overlaps.

    One list is binned into an index with returnTccIndex; every tcc of the
    other list is compared only with the indexed tccs in the bins its range
    touches.

    order      -- 1 = keys come from tccListOne (tccListTwo is indexed)
                  2 = keys come from tccListTwo (tccListOne is indexed)
    complexity -- forwarded to returnTccIndex; defaults to the length of
                  the indexed list

    Returns a dict {tcc: [overlapping tcc, ...]}.  Every tcc of the keyed
    list is present; its list is empty when nothing overlaps and never
    holds the same tcc twice.
    Raises ValueError when order is not 1 or 2.
    """
    # decide which list is to be indexed
    if order == 1:
        indexList, otherList = tccListTwo, tccListOne
    elif order == 2:
        indexList, otherList = tccListOne, tccListTwo
    else:
        # there is no "shortest list" mode here: the caller has to say which
        # list supplies the keys.  This used to fail on an unbound name.
        raise ValueError('order must be 1 or 2, got %r' % (order,))

    if complexity is None:
        complexity = len(indexList)
    tccIndex, indexMin, indexStep = returnTccIndex(indexList, complexity)

    individualOverlaps = {}  # tcc : [tcc, tcc]
    for coord in otherList:
        overlaps = individualOverlaps[coord] = []

        fields = coord.strip().split(':')
        numCheckLow = int(fields[2])
        numCheckHigh = int(fields[3])
        indexCheckLow = getStepIndex(numCheckLow, indexMin, indexStep)
        indexCheckHigh = getStepIndex(numCheckHigh, indexMin, indexStep)

        for indexCheck in range(indexCheckLow, indexCheckHigh + 1, indexStep):
            if indexCheck not in tccIndex:
                continue
            for indexCoord in tccIndex[indexCheck]:
                # an indexed tcc can sit in several bins; list it only once
                if (bioLibCG.tccOverlap(coord, indexCoord, True)
                        and indexCoord not in overlaps):
                    overlaps.append(indexCoord)
    return individualOverlaps
Beispiel #11
0
def getIndividualOverlaps(tccListOne, tccListTwo, order, complexity=None):
    """Collect, for each tcc of one list, the overlapping tccs of the other.

    The indexed list is binned with returnTccIndex and each tcc of the
    other list is compared only with the indexed tccs found in the bins
    its range touches.

    order      -- 1 = tccListOne supplies the keys (tccListTwo is indexed)
                  2 = tccListTwo supplies the keys (tccListOne is indexed)
    complexity -- forwarded to returnTccIndex; defaults to the length of
                  the indexed list

    Returns a dict {tcc: [overlapping tcc, ...]} with an entry for every
    tcc of the keyed list (an empty list when nothing overlaps); no tcc is
    listed twice under the same key.
    Raises ValueError when order is not 1 or 2.
    """
    # The body is indented with spaces only; the earlier mix of tabs and
    # spaces is rejected by Python 3 with a TabError.
    if order == 1:
        indexList = tccListTwo
        otherList = tccListOne
    elif order == 2:
        indexList = tccListOne
        otherList = tccListTwo
    else:
        # the caller must say which list supplies the keys; any other value
        # used to fail later on an unbound name
        raise ValueError('order must be 1 or 2, got %r' % (order,))

    # make sure complexity is set
    if complexity is None:
        complexity = len(indexList)
    # make the index
    tccIndex, indexMin, indexStep = returnTccIndex(indexList, complexity)

    individualOverlaps = {}  # tcc : [tcc, tcc]
    for coord in otherList:
        individualOverlaps[coord] = []

        parts = coord.strip().split(':')
        numCheckLow = int(parts[2])
        numCheckHigh = int(parts[3])
        indexCheckLow = getStepIndex(numCheckLow, indexMin, indexStep)
        indexCheckHigh = getStepIndex(numCheckHigh, indexMin, indexStep)
        indexChecks = range(indexCheckLow, indexCheckHigh + 1, indexStep)

        for indexCheck in indexChecks:
            if indexCheck in tccIndex:
                for indexCoord in tccIndex[indexCheck]:
                    overlap = bioLibCG.tccOverlap(coord, indexCoord, True)
                    # skip tccs already recorded from another bin
                    if overlap and indexCoord not in individualOverlaps[coord]:
                        individualOverlaps[coord].append(indexCoord)
    return individualOverlaps
def updateDensity(cName=None):
    """Update column 6 of the results file with each prediction's density.

    The config returned by cgConfig.getConfig(cName) names two
    tab-separated files:
      "hitsPerFrame" -- block tcc in column 1, integer value in column 2
      "results"      -- prediction tcc in column 2

    For every prediction, column 6 is set to the string form of the
    largest value among the blocks that overlap it (never below 0).  The
    results file is read completely and then overwritten in place.
    """
    conf = cgConfig.getConfig(cName)

    # Create hitmap for blocks, value dict for block
    blocksList = []
    cValBlockDict = {}
    with open(conf.conf["hitsPerFrame"], "r") as blockFile:
        for line in blockFile:
            columns = line.strip().split("\t")
            blocksList.append(columns[0])
            cValBlockDict[columns[0]] = int(columns[1])
    blockHitmap = bioLibCG.createHitMap(blocksList)

    # Now work out the value for each predicted line
    resultsFileName = conf.conf["results"]
    newFileList = []
    with open(resultsFileName, "r") as predictedFile:
        for line in predictedFile:
            columns = line.strip().split("\t")
            tccPrediction = columns[1]
            coordsPrediction = bioLibCG.stripTripleColon(tccPrediction)
            start = int(coordsPrediction["start"])
            end = int(coordsPrediction["end"])

            cVal = 0
            # NOTE(review): range() stops one short of "end" -- confirm
            # whether the end coordinate is meant to be inclusive.
            for i in range(start, end):
                if i in blockHitmap:
                    for block in blockHitmap[i]:
                        if bioLibCG.tccOverlap(tccPrediction, block):
                            cVal = max(cVal, cValBlockDict[block])

            columns[5] = str(cVal)
            newFileList.append("\t".join(columns) + "\n")

    # the input is fully read and closed, so it is safe to overwrite it
    with open(resultsFileName, "w") as newFile:
        newFile.writelines(newFileList)
Beispiel #13
0
def consolidatePeaksByTcc(dFN, outFN):
    """Collapse runs of consecutive overlapping peaks to a single line.

    dFN   -- input file, tab-separated, with the peak's tcc in column 2
    outFN -- output file (overwritten)

    Each line is compared with the line directly before it.  While
    consecutive tccs overlap, only the most recent line is remembered; it
    is written once a non-overlapping line, or the end of the file, is
    reached.
    """
    lastLine = ''
    # placeholder for the first comparison; presumably chosen so that no
    # real peak overlaps it -- verify against the data
    lastTcc = 'chr100:1:100:1000'

    with open(outFN, 'w') as fOut:
        with open(dFN, 'r') as f:
            for line in f:
                tcc = line.strip().split('\t')[1]
                if not bioLibCG.tccOverlap(tcc, lastTcc):
                    # the previous run is finished: emit its last line
                    fOut.write(lastLine)
                lastLine = line
                lastTcc = tcc

        # Flush the final run.  Without this the last peak of the file was
        # silently dropped.
        if lastLine:
            fOut.write(lastLine)
def consolidatePeaksByTcc(dFN, outFN):
    """Write one line per run of consecutive, overlapping peaks.

    dFN   -- input file, tab-separated, with the peak's tcc in column 2
    outFN -- output file (overwritten)

    A line whose tcc overlaps the tcc of the line before it replaces that
    line as the run's representative.  The representative is written when
    the next line does not overlap it, and once more at the end of the
    file for the final run.
    """
    pendingLine = ''
    # placeholder for the first comparison; presumably chosen so that no
    # real peak overlaps it -- verify against the data
    pendingTcc = 'chr100:1:100:1000'

    with open(outFN, 'w') as fOut:
        with open(dFN, 'r') as f:
            for line in f:
                ls = line.strip().split('\t')
                tcc = ls[1]

                if bioLibCG.tccOverlap(tcc, pendingTcc):
                    # same run: the newer line takes over below
                    pass
                else:
                    fOut.write(pendingLine)

                pendingLine = line
                pendingTcc = tcc

        # The loop only writes when a new run starts, so the last run has
        # to be written here; it used to be lost.
        if pendingLine:
            fOut.write(pendingLine)