Ejemplo n.º 1
0
def makePeakInputQ(cName, minExpression=2000):
    '''Uses shell script and qsub to get peaks quickly'''

    mConf = c.getConfig('Main.conf')
    conf = c.getConfig(cName)

    assembly = conf.conf['assembly']

    tccList = []

    chromLens = cg.returnChromLengthDict(assembly)

    for chrom in chromLens:
        if chrom not in cg.acceptableChroms: continue
        for strand in ['1', '-1']:
            print 'Getting Peaks for ', chrom, strand
            prevI = 0
            for i in rangePoints(1, chromLens[chrom], 30):
                if i == 1:
                    prevI = i
                    continue

                start = prevI
                end = i
                prevI = i

                tcc = cg.makeTcc(chrom, strand, start, end)

                log = 'logs/o-' + str(start)
                elog = 'logs/e-%s-%s-%s-%s' % (chrom, strand, start, end)
                subprocess.Popen([
                    'qsub', '-V', '-cwd', '-e', elog, '-o', log, '-l',
                    'mem=3G', '-l', 'rt=3600', 'q.sh', tcc, cName,
                    str(minExpression)
                ]).wait()
Ejemplo n.º 2
0
def makePeakInputQ(cName, minExpression = 2000):
	'''Uses shell script and qsub to get peaks quickly'''
	
	mConf = c.getConfig('Main.conf')
	conf = c.getConfig(cName)
	
	assembly = conf.conf['assembly']
	
	tccList = []
	
	chromLens = cg.returnChromLengthDict(assembly)
	
	for chrom in chromLens:
		if chrom not in cg.acceptableChroms: continue
		for strand in ['1','-1']:
			print 'Getting Peaks for ', chrom, strand
			prevI = 0
			for i in rangePoints(1, chromLens[chrom], 30):
				if i == 1:
					prevI = i
					continue
				
				start = prevI
				end = i
				prevI = i
				
				tcc = cg.makeTcc(chrom, strand, start, end)
								
				log = 'logs/o-' + str(start)
				elog = 'logs/e-%s-%s-%s-%s' % (chrom, strand, start, end)
				subprocess.Popen(['qsub', '-V', '-cwd', '-e', elog, '-o', log, '-l', 'mem=3G', '-l', 'rt=3600', 'q.sh', tcc, cName, str(minExpression)]).wait()
Ejemplo n.º 3
0
def sortResults(cName=None):
    '''Filter the results file by cluster density and write it sorted.

    Reads the tab-separated file named by conf['results'].  For every
    cluster ID (field 7) the highest density (field 5, parsed as int) is
    determined.  A row is kept when its cluster's highest density is at
    least 4 and field 8 is '0' (the cluster does not overlap anything
    known).  Kept rows are written to '<results>.sorted' in descending
    order of field 5, and 'statFile.data' receives the total number of
    clusters and the number of clusters with at least one kept row.
    '''
    conf = cgConfig.getConfig(cName)
    pFileName = conf.conf['results']
    minDensity = 4
    densityField = 5

    with open(pFileName, 'r') as pFile:
        fileLines = [line.strip().split('\t') for line in pFile]

    # CID -> highest density seen for that cluster
    densityDict = {}
    for row in fileLines:
        CID = row[7]
        pDensity = int(row[densityField])
        if CID not in densityDict or pDensity > densityDict[CID]:
            densityDict[CID] = pDensity

    # Drop rows of clusters that miss the density cut or overlap known ones.
    keptLines = []
    CIDpassed = set()  # a set keeps the membership bookkeeping O(1)
    for row in fileLines:
        CID = row[7]
        if densityDict[CID] >= minDensity and row[8] == '0':
            keptLines.append(row)
            CIDpassed.add(CID)

    sortedData = sorted(keptLines,
                        key=lambda row: int(row[densityField]),
                        reverse=True)

    with open(pFileName + '.sorted', 'w') as sortedFile:
        for row in sortedData:
            # The density is written back through int(), as before, so its
            # textual form is normalised.
            row[densityField] = str(int(row[densityField]))
            sortedFile.write('\t'.join(row) + '\n')

    # The stats file used to be left open; the with-block closes it.
    with open('statFile.data', 'w') as statFile:
        statFile.write('Total Clusters: %s\n' % len(densityDict))
        statFile.write('Passed: %s\n' % len(CIDpassed))
Ejemplo n.º 4
0
def updateOverlaps(cName=None):
    '''Flag each results row by whether its tcc is a known overlap.

    The tccs listed one per line in conf['matureOverlaps'] are loaded, then
    every row of the tab-separated conf['results'] file gets field 6 set to
    '1' when its field 1 is among them and '0' otherwise.  The results
    file is rewritten in place.
    '''
    conf = cgConfig.getConfig(cName)
    pFileName = conf.conf['results']
    overlapsFileName = conf.conf['matureOverlaps']

    # A set gives constant-time membership tests for the per-row check.
    with open(overlapsFileName, 'r') as overlapFile:
        overlaps = set(tcc.strip() for tcc in overlapFile)

    # Read everything first: the same path is reopened for writing below.
    newFileLines = []
    with open(pFileName, 'r') as predFile:
        for line in predFile:
            fields = line.strip().split('\t')
            fields[6] = '1' if fields[1] in overlaps else '0'
            newFileLines.append('\t'.join(fields) + '\n')

    # The output handle was previously never closed.
    with open(pFileName, 'w') as newFile:
        newFile.writelines(newFileLines)
Ejemplo n.º 5
0
def returnChromLengthDict(assembly):
    '''Return {chromosome name: length} for the given assembly.

    The lengths are read from the file '<chromosomeLengths>/<assembly>',
    where chromosomeLengths comes from Main.conf.  Each line holds the
    name and the integer length in its first two tab-separated fields.
    Blank lines are skipped.
    '''
    mConf = c.getConfig('Main.conf')
    lenFileName = mConf.conf['chromosomeLengths'] + '/' + assembly

    lenDict = {}
    # The with-block closes the handle, which used to be leaked.
    with open(lenFileName, 'r') as f:
        for line in f:
            if not line.strip():
                continue  # tolerate empty / trailing blank lines
            fields = line.split('\t')
            lenDict[fields[0]] = int(fields[1])

    return lenDict
Ejemplo n.º 6
0
def returnChromLengthDict(assembly):
    '''Build a dictionary of chromosome lengths for an assembly.

    Reads '<chromosomeLengths>/<assembly>' (directory taken from
    Main.conf).  The first tab-separated field of each line is used as the
    key and the second, converted to int, as the value.  Blank lines are
    ignored.
    '''
    mConf = c.getConfig('Main.conf')
    lenFileName = mConf.conf['chromosomeLengths'] + '/' + assembly

    lenDict = {}
    # Close the file deterministically instead of leaving it open.
    with open(lenFileName, 'r') as f:
        for line in f:
            if not line.strip():
                continue  # a blank line would otherwise raise IndexError
            name, length = line.split('\t')[0:2]
            lenDict[name] = int(length)

    return lenDict
Ejemplo n.º 7
0
def scanVectorsOrganism(tccList, config=None):
    '''Return {coordinate: value} for the tccs, read from organism wig files.

    For each tcc the file
    '<wigDir>/Merge.<organism>.<strand>.wig.<chrom>.wig' is opened through
    cgIndex.lineIndex, positioned with binarySearch(tcc) and read line by
    line.  Fields 1 and 2 of a line are its begin/end and field 3 its
    value (the part before any '.', as int).  Each line's range is clipped
    to the tcc, and every coordinate in range(begin, end) is mapped to the
    value.  Reading stops at the first line that ends past the tcc end.

    wigDir is the 'wig<organism>' entry of Main.conf; the organism comes
    from the given config.
    '''
    config = c.getConfig(config)

    # These do not depend on the tcc, so they are looked up once.
    org = config.conf['organism']
    mConf = c.getConfig('Main.conf')
    wigDir = mConf.conf['wig%s' % org]

    coordDict = {}  # coordinate -> wig value
    for tcc in tccList:
        chrom, strand, tccStart, tccEnd = cg.tccSplit(tcc)
        fN = wigDir + '/Merge.%s.%s.wig.%s.wig' % (org.lower(), strand, chrom)

        fIndex = cgIndex.lineIndex(fN, header=True)
        try:
            fIndex.passCheckFunction(cgIndex.wigCheckFunction)
            # places the file pointer at the beginning of the tcc
            fIndex.binarySearch(tcc)

            for line in fIndex.file:
                fields = cg.ss(line)  # split once instead of per field
                lBeg = int(fields[1])
                lEnd = int(fields[2])
                lValue = int(fields[3].split('.')[0])

                stop = False
                if tccStart > lBeg:
                    lBeg = tccStart
                if tccEnd < lEnd:
                    lEnd = tccEnd
                    stop = True

                for i in range(lBeg, lEnd):
                    coordDict[i] = lValue

                if stop:
                    break
        finally:
            # Release the file and index after each tcc, as svCoord does.
            fIndex.close()
    return coordDict
Ejemplo n.º 8
0
def writeWigFromHitDict(hitDict, assembly, name, directory=None):
    '''Write one bedGraph-style .wig file per chromosome/strand of hitDict.

    hitDict is indexed as hitDict[chrom][strand][coord] -> value.  Runs of
    adjacent coordinates carrying the same value are merged into a single
    'chrom<TAB>start<TAB>end<TAB>value' row, and a gap between two covered
    coordinates is written as an explicit zero-valued row.  Files are named
    '<directory>/<name>.<chrom>.<strand>.wig'; directory defaults to the
    'wigs' entry of Main.conf.

    Side effect: hitDict[chrom][strand] receives a 0 entry at the chromosome
    length (from cg.returnChromLengthDict).  As the block opened by the
    highest coordinate is never written, this entry serves as a terminator
    for the rows before it.
    '''
    mConf = c.getConfig('Main.conf')
    if not directory:
        directory = mConf.conf['wigs']
    # NOTE(review): this derives a base name from a name that is already
    # empty/None -- presumably another argument was intended; confirm.
    if not name:
        name = cg.getBaseFileName(name, naked=True)
    lDict = cg.returnChromLengthDict(assembly)

    cg.clearDirectory(directory, overwrite=False)

    rowFormat = '%s\t%s\t%s\t%s\n'
    for chrom in hitDict:
        for strand in hitDict[chrom]:
            wigName = directory + '/%s.%s.%s.wig' % (name, chrom, strand)
            # The with-block closes the file even if writing fails.
            with open(wigName, 'w') as oF:
                oF.write('track type=bedGraph name=%s.%s.%s\n' %
                         (name, chrom, strand))

                coverage = hitDict[chrom][strand]
                coverage[lDict[chrom]] = 0  # terminator at chromosome end

                prevVal = 0
                prevCoord = 0
                blockStart = 0
                blockEnd = 1
                for key in sorted(coverage):
                    val = coverage[key]

                    if prevCoord == key - 1:
                        if val == prevVal:
                            # adjacent and equal: extend the current block
                            blockEnd = key + 1
                        else:
                            # adjacent but different: close the block and
                            # start the next one directly after it
                            oF.write(rowFormat %
                                     (chrom, blockStart, blockEnd, prevVal))
                            blockStart = key
                            blockEnd = key + 1
                    else:
                        # gap: close the block, fill the gap with zeros
                        oF.write(rowFormat %
                                 (chrom, blockStart, blockEnd, prevVal))
                        oF.write(rowFormat % (chrom, blockEnd, key, 0))
                        blockStart = key
                        blockEnd = key + 1

                    prevVal = val
                    prevCoord = key
Ejemplo n.º 9
0
def updateClusterInfo(cName=None):
    '''Write cluster IDs and cluster-level overlap flags into the results.

    Each line of conf['sortedClusters'] is a comma-separated cluster of
    tccs; the zero-based line number (as a string) is the cluster ID.  For
    every row of the tab-separated conf['results'] file, field 7 is set to
    the cluster ID of the tcc in field 1, and field 8 to '1' when any row
    of the same cluster has field 6 equal to '1', else '0'.  The results
    file is rewritten in place.

    Raises KeyError when a results row names a tcc absent from the
    clusters file.
    '''
    conf = cgConfig.getConfig(cName)
    pFileName = conf.conf['results']
    sortedClustersFileName = conf.conf['sortedClusters']

    # tcc -> cluster ID
    idDict = {}
    with open(sortedClustersFileName, 'r') as sortedFile:
        for i, line in enumerate(sortedFile):
            clusterID = str(i)
            for tcc in line.strip().split(','):
                # Skip empty items (e.g. from a trailing comma); this no
                # longer fails when a line has no trailing comma.
                if tcc:
                    idDict[tcc] = clusterID

    # Read the results once; both passes work on the parsed rows.
    with open(pFileName, 'r') as predFile:
        rows = [line.strip().split('\t') for line in predFile]

    # clusters with at least one member overlapping a known sequence
    overlapping = set(idDict[row[1]] for row in rows if row[6] == '1')

    for row in rows:
        clusterID = idDict[row[1]]
        row[7] = clusterID
        row[8] = '1' if clusterID in overlapping else '0'

    # The output handle was previously never closed.
    with open(pFileName, 'w') as newFile:
        newFile.writelines('\t'.join(row) + '\n' for row in rows)
Ejemplo n.º 10
0
def scanVectorsOrganism(tccList, config=None):
    '''Scan the organism wig files and return {coordinate: value}.

    Every tcc selects the file
    '<wigDir>/Merge.<organism>.<strand>.wig.<chrom>.wig', which is opened
    with cgIndex.lineIndex and positioned via binarySearch(tcc).  Lines are
    then read in order: fields 1 and 2 give the line's begin/end, field 3
    its value (integer part).  The line's range is clipped to the tcc and
    each coordinate in range(begin, end) is assigned the value.  The scan
    of a file ends with the first line reaching past the tcc end.

    The organism is read from the given config, wigDir from the
    'wig<organism>' entry of Main.conf.
    '''
    config = c.getConfig(config)

    # Loop-invariant configuration, resolved once up front.
    org = config.conf['organism']
    wigDir = c.getConfig('Main.conf').conf['wig%s' % org]

    coordDict = {}  # coordinate -> wig value
    for tcc in tccList:
        chrom, strand, tccStart, tccEnd = cg.tccSplit(tcc)
        fN = wigDir + '/Merge.%s.%s.wig.%s.wig' % (org.lower(), strand, chrom)

        fIndex = cgIndex.lineIndex(fN, header=True)
        try:
            fIndex.passCheckFunction(cgIndex.wigCheckFunction)
            fIndex.binarySearch(tcc)  # file pointer now at the tcc start

            for line in fIndex.file:
                fields = cg.ss(line)  # parsed once per line
                lBeg = int(fields[1])
                lEnd = int(fields[2])
                lValue = int(fields[3].split('.')[0])

                if tccStart > lBeg:
                    lBeg = tccStart
                lastLine = tccEnd < lEnd
                if lastLine:
                    lEnd = tccEnd

                for i in range(lBeg, lEnd):
                    coordDict[i] = lValue

                if lastLine:
                    break
        finally:
            # Close the file and index after use, matching svCoord.
            fIndex.close()
    return coordDict
Ejemplo n.º 11
0
def svCoord(tccList, config=None):
    '''Return {coordinate: value} for the tccs from the configured wig set.

    When conf['wigChromSplit'] is the string 'True' the file
    '<wigSetDir>/<wigSetName>.<chrom>.<strand>.wig' is used, otherwise
    '<wigSetDir>/Merge.<organism>.<strand>.wig'.  Each file is positioned
    with binarySearch(tcc) and read line by line.  A line's begin is field
    1 plus one, its end field 2, and its value the integer part of field 3.
    Begin and end are clipped to the tcc and every coordinate from begin to
    end inclusive is assigned the value.  Reading stops with the first line
    that ends past the tcc end.
    '''
    config = c.getConfig(config)
    settings = config.conf
    org = settings['organism']
    wigDir = settings['wigSetDir']
    wigSetName = settings['wigSetName']
    splitIntoChroms = (settings['wigChromSplit'] == 'True')

    coordDict = {}  # coordinate -> wig value
    for tcc in tccList:
        chrom, strand, tccStart, tccEnd = cg.tccSplit(tcc)

        if splitIntoChroms:
            fN = wigDir + '/%s.%s.%s.wig' % (wigSetName, chrom, strand)
        else:
            fN = wigDir + '/Merge.%s.%s.wig' % (org.lower(), strand)

        fIndex = cgIndex.lineIndex(fN, header=True)
        fIndex.passCheckFunction(cgIndex.wigCheckFunction)
        fIndex.binarySearch(tcc)  # file pointer now at the tcc start

        for line in fIndex.file:
            fields = cg.ss(line)
            lBeg = int(fields[1]) + 1
            lEnd = int(fields[2])
            lValue = int(fields[3].split('.')[0])

            if tccStart > lBeg:
                lBeg = tccStart
            pastEnd = tccEnd < lEnd
            if pastEnd:
                lEnd = tccEnd

            position = lBeg
            while position <= lEnd:
                coordDict[position] = lValue
                position += 1

            if pastEnd:
                break
        fIndex.close()  # release the file and the index after use

    return coordDict
Ejemplo n.º 12
0
def writeWigFromHitDict(hitDict, assembly, name, directory=None):
    '''Dump hitDict as bedGraph-style .wig files, one per chrom/strand.

    hitDict maps chrom -> strand -> coordinate -> value.  For each
    chrom/strand pair the coordinates are visited in ascending order:
    neighbouring coordinates with the same value are collapsed into one
    'chrom<TAB>start<TAB>end<TAB>value' row and gaps between coordinates
    are emitted as zero-valued rows.  Output goes to
    '<directory>/<name>.<chrom>.<strand>.wig' (directory defaults to
    Main.conf's 'wigs' entry).

    Side effect: a 0 entry at the chromosome length (taken from
    cg.returnChromLengthDict) is stored into hitDict[chrom][strand]; the
    block opened by the highest coordinate is not written, so this entry
    terminates the rows that precede it.
    '''
    mConf = c.getConfig('Main.conf')
    if not directory:
        directory = mConf.conf['wigs']
    # NOTE(review): name is falsy here, yet it is also the argument passed
    # to getBaseFileName -- looks unintended; confirm the intended source.
    if not name:
        name = cg.getBaseFileName(name, naked=True)
    lDict = cg.returnChromLengthDict(assembly)

    cg.clearDirectory(directory, overwrite=False)

    def formatRow(chromName, start, end, value):
        return '%s\t%s\t%s\t%s\n' % (chromName, start, end, value)

    for chrom in hitDict:
        for strand in hitDict[chrom]:
            path = directory + '/%s.%s.%s.wig' % (name, chrom, strand)
            # Context manager: the handle is closed on errors as well.
            with open(path, 'w') as oF:
                oF.write('track type=bedGraph name=%s.%s.%s\n' %
                         (name, chrom, strand))

                hits = hitDict[chrom][strand]
                hits[lDict[chrom]] = 0  # terminator at chromosome end

                prevVal = 0
                prevCoord = 0
                blockStart = 0
                blockEnd = 1
                # sorted() replaces the Python-2-only keys()/.sort() pair
                for key in sorted(hits):
                    val = hits[key]

                    if prevCoord == key - 1 and val == prevVal:
                        # contiguous and equal: grow the current block
                        blockEnd = key + 1
                    else:
                        oF.write(formatRow(chrom, blockStart, blockEnd,
                                           prevVal))
                        if prevCoord != key - 1:
                            # not contiguous: cover the gap with a zero row
                            oF.write(formatRow(chrom, blockEnd, key, 0))
                        blockStart = key
                        blockEnd = key + 1

                    prevVal = val
                    prevCoord = key
Ejemplo n.º 13
0
def filterOut(cName=None):
    '''Write the predictions that overlap known sequences to a file.

    The tccs in column 1 of conf['resultsRaw'] are passed to
    compare.filterOutTccs together with conf['knownDirectory'].  The third
    argument (True) asks for the filtered-out tccs rather than the
    remaining ones.  The returned tccs are written one per line to
    conf['matureOverlaps'].
    '''
    conf = cgConfig.getConfig(cName)

    predictionList = compare.tccFileToList(conf.conf['resultsRaw'], 1)
    overlapped = compare.filterOutTccs(predictionList,
                                       conf.conf['knownDirectory'], True)

    # The with-block closes (and flushes) the file, which was left open.
    with open(conf.conf['matureOverlaps'], 'w') as matureOverlaps:
        for tcc in overlapped:
            matureOverlaps.write(tcc + '\n')
Ejemplo n.º 14
0
def filterOut(cName=None):
    """Record which predicted tccs overlap the known sequences.

    Column 1 of conf["resultsRaw"] supplies the predicted tccs.
    compare.filterOutTccs is called with conf["knownDirectory"] and True,
    which makes it return the tccs that were filtered out instead of the
    list without them.  Those tccs are written, one per line, to
    conf["matureOverlaps"].
    """
    conf = cgConfig.getConfig(cName)

    predictionList = compare.tccFileToList(conf.conf["resultsRaw"], 1)
    overlapped = compare.filterOutTccs(
        predictionList, conf.conf["knownDirectory"], True
    )

    # Close the output deterministically; it used to stay open.
    with open(conf.conf["matureOverlaps"], "w") as matureOverlaps:
        matureOverlaps.writelines(tcc + "\n" for tcc in overlapped)
Ejemplo n.º 15
0
def addPeriods(cName=None):
    '''Create the results file from the raw results with placeholder fields.

    Each line of conf['resultsRaw'] is cut down to its first five
    tab-separated fields and six '.' placeholder fields are appended; the
    result is written to conf['results'].
    '''
    conf = cgConfig.getConfig(cName)

    pFileName = conf.conf['resultsRaw']
    nFileName = conf.conf['results']

    # Both handles were previously left open; nested with-blocks close them.
    with open(pFileName, 'r') as pFile:
        with open(nFileName, 'w') as nFile:
            for line in pFile:
                keptFields = line.strip().split('\t')[0:5]
                nFile.write('\t'.join(keptFields) + '\t.\t.\t.\t.\t.\t.\n')
Ejemplo n.º 16
0
def splitExonsIntrons(cName = None):
	mConf = c.cgConfig('Main.conf')
	conf = c.getConfig(cName)
	
	#init
	organism = conf.conf['organism']
	minOverlap = 50
	cHairs = getHairpins.getHairpins() #CID: HAIRPIN
	exonList = compare.tccFileToList('%sExons.tcc' % organism, 0)
	hairpins = []
	for CID in cHairs:
		hairpins.append(cHairs[CID])
	
	print 'checking overlaps'
	#check which hairpins overlap exons and by how much
	exonOverlapped = compare.compareTwoTcc(hairpins, exonList, 1, amount = True)
	print '  ', len(exonOverlapped)
	
	print 'removing partial introns'
	#remove the ones that didn't overlap more than X:
	remList = []
	for tcc, oAmount in exonOverlapped:
		if oAmount < minOverlap:
			remList.append([tcc, oAmount])
	
	for item in remList:
		exonOverlapped.remove(item)
	print '  ', len(exonOverlapped), 'out of', len(cHairs.keys())
		
	#get CIDs of exons
	exonCIDs = []
	for tcc, oAmount in exonOverlapped:
		for CID in cHairs:
			if cHairs[CID] == tcc:
				exonCIDs.append(str(CID))
	
	
	#Open sorted predictions and write lines with CIDs to respective files
	predFile = open(conf.conf['resultsSorted'], 'r')
	exonFile = open(conf.conf['resultsSorted'] + '.exons', 'w')
	intronFile = open(conf.conf['resultsSorted'] + '.introns', 'w')
	for line in predFile:
		if line.split('\t')[7] in exonCIDs:
			exonFile.write(line)
		else:
			intronFile.write(line)
	predFile.close()
	exonFile.close()
	intronFile.close()
Ejemplo n.º 17
0
def updateDensity(cName=None):
    '''Store, for each prediction, the highest value of an overlapping block.

    conf['hitsPerFrame'] lists blocks as 'tcc<TAB>integer value'.  For each
    row of conf['results'], the blocks registered in the hit map for any
    coordinate between the start and end of the tcc in field 1 are checked
    with bioLibCG.tccOverlap; the largest value among the overlapping
    blocks (0 when there is none) is written into field 5.  The results
    file is rewritten in place.
    '''
    conf = cgConfig.getConfig(cName)

    # Load the blocks: their order for the hit map, their values by tcc.
    blocksList = []
    cValBlockDict = {}
    blockFile = open(conf.conf['hitsPerFrame'], 'r')
    for line in blockFile:
        blockFields = line.strip().split('\t')
        blocksList.append(blockFields[0])
        cValBlockDict[blockFields[0]] = int(blockFields[1])
    blockFile.close()
    blockHitmap = bioLibCG.createHitMap(blocksList)

    # Recompute field 5 of every prediction row.
    rewritten = []
    predictedFile = open(conf.conf['results'], 'r')
    for line in predictedFile:
        fields = line.strip().split('\t')
        tccPrediction = fields[1]
        span = bioLibCG.stripTripleColon(tccPrediction)

        best = 0
        for position in range(int(span['start']), int(span['end'])):
            if position not in blockHitmap:
                continue
            for block in blockHitmap[position]:
                if not bioLibCG.tccOverlap(tccPrediction, block):
                    continue
                if cValBlockDict[block] > best:
                    best = cValBlockDict[block]

        fields[5] = str(best)
        rewritten.append('\t'.join(fields) + '\n')
    predictedFile.close()

    newFile = open(conf.conf['results'], 'w')
    newFile.writelines(rewritten)
    newFile.close()
Ejemplo n.º 18
0
def splitExonsIntrons(cName=None):
    mConf = c.cgConfig('Main.conf')
    conf = c.getConfig(cName)

    #init
    organism = conf.conf['organism']
    minOverlap = 50
    cHairs = getHairpins.getHairpins()  #CID: HAIRPIN
    exonList = compare.tccFileToList('%sExons.tcc' % organism, 0)
    hairpins = []
    for CID in cHairs:
        hairpins.append(cHairs[CID])

    print 'checking overlaps'
    #check which hairpins overlap exons and by how much
    exonOverlapped = compare.compareTwoTcc(hairpins, exonList, 1, amount=True)
    print '  ', len(exonOverlapped)

    print 'removing partial introns'
    #remove the ones that didn't overlap more than X:
    remList = []
    for tcc, oAmount in exonOverlapped:
        if oAmount < minOverlap:
            remList.append([tcc, oAmount])

    for item in remList:
        exonOverlapped.remove(item)
    print '  ', len(exonOverlapped), 'out of', len(cHairs.keys())

    #get CIDs of exons
    exonCIDs = []
    for tcc, oAmount in exonOverlapped:
        for CID in cHairs:
            if cHairs[CID] == tcc:
                exonCIDs.append(str(CID))

    #Open sorted predictions and write lines with CIDs to respective files
    predFile = open(conf.conf['resultsSorted'], 'r')
    exonFile = open(conf.conf['resultsSorted'] + '.exons', 'w')
    intronFile = open(conf.conf['resultsSorted'] + '.introns', 'w')
    for line in predFile:
        if line.split('\t')[7] in exonCIDs:
            exonFile.write(line)
        else:
            intronFile.write(line)
    predFile.close()
    exonFile.close()
    intronFile.close()
Ejemplo n.º 19
0
def finalSort(pType, cName = None):
	
	#INIT
	conf = cgConfig.getConfig(cName)

	if pType == "E":
		pFileName = conf.conf['resultsExons']
	else:
		pFileName = conf.conf['resultsIntrons']
		
	minDensity = 4
	maxPVal = float(.05)

	pFile = open(pFileName, 'r')
	fileLines = [] #This will hold the lists to be sorted...
	for line in pFile:
		fileLines.append(line.strip().split('\t'))		
	pFile.close()


	keptLines = []
	for line in fileLines:
		CID = line[7]
		cDensity = int(line[12])
		pVal = float(line[15])
		
		#print pVal, cDensity
		if cDensity > 5 and pVal < maxPVal:
			#print '  kept'
			keptLines.append(line)

	print len(keptLines)
	#remake keptLines with float(pVal)
	i = 12
	for line in keptLines:
		line[i] = float(line[i])
		
	sortedData = sorted(keptLines, key=itemgetter(i), reverse = True) #sort by i

	for line in sortedData:
		line[i] = str(line[i])

	#print len(sortedData)
	#output
	sortedFile = open(pFileName + '.sorted', 'w')
	for line in sortedData:
		sortedFile.write('\t'.join(line) + '\n')
	sortedFile.close()
Ejemplo n.º 20
0
def addPeriods(cName=None):
    '''Write conf['results'] from conf['resultsRaw'], padding with periods.

    Only the first five tab-separated fields of each raw line are kept;
    six further fields, each holding '.', are appended to every line.
    '''
    # gets the current configuration instructions
    conf = cgConfig.getConfig(cName)

    pFileName = conf.conf['resultsRaw']
    nFileName = conf.conf['results']

    # Neither file used to be closed; the with-blocks take care of that.
    with open(pFileName, 'r') as pFile:
        with open(nFileName, 'w') as nFile:
            for line in pFile:
                head = '\t'.join(line.strip().split('\t')[0:5])
                nFile.write(head + '\t.\t.\t.\t.\t.\t.\n')
Ejemplo n.º 21
0
def finalSort(pType, cName=None):

    #INIT
    conf = cgConfig.getConfig(cName)

    if pType == "E":
        pFileName = conf.conf['resultsExons']
    else:
        pFileName = conf.conf['resultsIntrons']

    minDensity = 4
    maxPVal = float(.05)

    pFile = open(pFileName, 'r')
    fileLines = []  #This will hold the lists to be sorted...
    for line in pFile:
        fileLines.append(line.strip().split('\t'))
    pFile.close()

    keptLines = []
    for line in fileLines:
        CID = line[7]
        cDensity = int(line[12])
        pVal = float(line[15])

        #print pVal, cDensity
        if cDensity > 5 and pVal < maxPVal:
            #print '  kept'
            keptLines.append(line)

    print len(keptLines)
    #remake keptLines with float(pVal)
    i = 12
    for line in keptLines:
        line[i] = float(line[i])

    sortedData = sorted(keptLines, key=itemgetter(i), reverse=True)  #sort by i

    for line in sortedData:
        line[i] = str(line[i])

    #print len(sortedData)
    #output
    sortedFile = open(pFileName + '.sorted', 'w')
    for line in sortedData:
        sortedFile.write('\t'.join(line) + '\n')
    sortedFile.close()
Ejemplo n.º 22
0
def updateDensity(cName=None):
    """Fill field 5 of each results row with its best overlapping block value.

    Blocks come from conf["hitsPerFrame"] (tcc, tab, integer value) and are
    indexed with bioLibCG.createHitMap.  For every results row, each
    coordinate in range(start, end) of the tcc in field 1 is looked up in
    the hit map; among the blocks found there that bioLibCG.tccOverlap
    confirms, the largest value wins (0 if none).  conf["results"] is
    overwritten with the updated rows.
    """
    conf = cgConfig.getConfig(cName)

    blockFile = open(conf.conf["hitsPerFrame"], "r")
    blockEntries = [line.strip().split("\t") for line in blockFile]
    blockFile.close()

    blocksList = []
    cValBlockDict = {}
    for entry in blockEntries:
        blocksList.append(entry[0])
        cValBlockDict[entry[0]] = int(entry[1])
    blockHitmap = bioLibCG.createHitMap(blocksList)

    predictedFile = open(conf.conf["results"], "r")
    newFileList = []
    for line in predictedFile:
        row = line.strip().split("\t")
        tccPrediction = row[1]
        coords = bioLibCG.stripTripleColon(tccPrediction)
        first, last = int(coords["start"]), int(coords["end"])

        cVal = 0
        for coordinate in range(first, last):
            if coordinate in blockHitmap:
                for block in blockHitmap[coordinate]:
                    overlaps = bioLibCG.tccOverlap(tccPrediction, block)
                    if overlaps and cValBlockDict[block] > cVal:
                        cVal = cValBlockDict[block]

        row[5] = str(cVal)
        newFileList.append("\t".join(row) + "\n")
    predictedFile.close()

    newFile = open(conf.conf["results"], "w")
    for updatedLine in newFileList:
        newFile.write(updatedLine)
    newFile.close()
Ejemplo n.º 23
0
def mergeInputs(cName, eLevel):
	
	conf = c.getConfig(cName)
	assembly = conf.conf['assembly']
	ending = '%s.%s' % (eLevel, assembly)
	
	print 'merging all files with ending', ending
	
	newLines = []
	for fN in cg.recurseDir('out', end = ending):
		print os.getcwd(), fN
		fN = os.getcwd() + '/' + fN
		f = open(fN, 'r')
		newLines.extend(f.readlines())
		f.close()
	
	f = open('peakData.%s.%s' % (eLevel, assembly), 'w')
	f.writelines(newLines)
	f.close()
Ejemplo n.º 24
0
def mergeInputs(cName, eLevel):

    conf = c.getConfig(cName)
    assembly = conf.conf['assembly']
    ending = '%s.%s' % (eLevel, assembly)

    print 'merging all files with ending', ending

    newLines = []
    for fN in cg.recurseDir('out', end=ending):
        print os.getcwd(), fN
        fN = os.getcwd() + '/' + fN
        f = open(fN, 'r')
        newLines.extend(f.readlines())
        f.close()

    f = open('peakData.%s.%s' % (eLevel, assembly), 'w')
    f.writelines(newLines)
    f.close()
Ejemplo n.º 25
0
def parallelMakePeaks(tcc, cName, minExpression):
    '''Find peaks inside one tcc window and write the ones passing the test.

    A cgPeaks.stretch is built for the tcc and its peaks are created with
    span 1 and a minimum value of int(minExpression).  Each peak position
    x is wrapped into the one-base tcc (x, x + 1) and passed to
    extendPeakTest; every truthy result is written as one line to
    'out/peakData.<tcc>.<minExpression>.<assembly>'.

    The indentation is normalised to spaces (the original mixed tabs and
    spaces) and the output file is closed even if a step raises.
    '''
    conf = c.getConfig(cName)
    outName = 'out/peakData.%s.%s.%s' % (tcc, minExpression,
                                         conf.conf['assembly'])
    with open(outName, 'w') as f:
        chrom, strand, start, end = cg.tccSplit(tcc)
        peaks = cgPeaks.stretch(tcc, cName)

        print('getting peaks')
        peaks.createPeaks(span=1, minVal=int(minExpression))

        for x in peaks.peaks:
            print(x)

            newTcc = cg.makeTcc(chrom, strand, x, x + 1)
            testedPeak = extendPeakTest(newTcc, 20, .2, .05, 0, 6, cName)
            #testedPeak = roofPeakTest(newTcc, 30, .85, .9, .2, 6, 17, 24, cName)

            if testedPeak:
                f.write('%s\n' % testedPeak)
Ejemplo n.º 26
0
def parallelMakePeaks(tcc, cName, minExpression):
	conf = c.getConfig(cName)
	f = open('out/peakData.%s.%s.%s' % (tcc, minExpression, conf.conf['assembly']), 'w')
	chrom, strand, start, end = cg.tccSplit(tcc)
        peaks = cgPeaks.stretch(tcc, cName)
		
	print 'getting peaks'
	peaks.createPeaks(span = 1, minVal = int(minExpression))
        	
	for x in peaks.peaks:
                
                print ""
		print chrom, strand, x,
                
                newTcc = cg.makeTcc(chrom, strand, x, x + 1)
                testedPeak = extendPeakTest(newTcc, 20, .2, .05, 0, 6, cName) 
                #testedPeak = roofPeakTest(newTcc, 30, .85, .9, .2, 8, 16, 25, cName)

                if testedPeak:
                        f.write('%s\n' % testedPeak)
	

	f.close()
Ejemplo n.º 27
0
def updateOverlaps(cName=None):
    '''Mark every results row as overlapping a known sequence or not.

    conf['matureOverlaps'] holds one overlapping tcc per line.  Each row
    of the tab-separated conf['results'] file has field 6 set to '1' when
    the tcc in field 1 appears in that list and to '0' when it does not.
    The results file is then overwritten with the updated rows.
    '''
    conf = cgConfig.getConfig(cName)
    pFileName = conf.conf['results']
    overlapsFileName = conf.conf['matureOverlaps']

    # Collect the overlapping tccs in a set for O(1) lookups per row.
    overlaps = set()
    with open(overlapsFileName, 'r') as overlapFile:
        for tcc in overlapFile:
            overlaps.add(tcc.strip())

    # All rows are buffered because the input path is also the output path.
    newFileLines = []
    with open(pFileName, 'r') as predFile:
        for line in predFile:
            newLine = line.strip().split('\t')
            if newLine[1] in overlaps:
                newLine[6] = str(1)
            else:
                newLine[6] = str(0)
            newFileLines.append('\t'.join(newLine) + '\n')

    # The rewritten file is now closed explicitly (it was left open).
    with open(pFileName, 'w') as newFile:
        for line in newFileLines:
            newFile.write(line)
Ejemplo n.º 28
0
def intronNoisy(cName=None):
    mConf = c.cgConfig('Main.conf')
    conf = c.getConfig(cName)

    #init
    cHairs = getHairpins.getHairpins(
        conf.conf['resultsIntrons'])  #CID: HAIRPIN
    organism = conf.conf['organism']
    exonList = compare.tccFileToList('%sExons.tcc' % organism, 0)
    slide = 1000

    #make prediction overlap hitmap
    predMap = {}
    predList = []
    for CID in cHairs:
        hPin = cHairs[CID]
        predList.append(hPin)

    #collapse Overlaps
    print ' collapsing predictions'
    predList = compare.collapseOverlaps(predList)
    print ' collapsing exons'
    exonList = compare.collapseOverlaps(exonList)

    #collect levels for each hairpin region
    cidLevels = {}
    for CID in cHairs:
        print CID
        hPin = cHairs[CID]
        chrom = ss(hPin, ':')[0]
        strand = ss(hPin, ':')[1]
        start = int(ss(hPin, ':')[2])
        end = int(ss(hPin, ':')[3])

        scanStart = start - slide
        scanEnd = end + slide

        scanRange = []
        scanRange.append('%s:%s:%s:%s' % (chrom, strand, scanStart, start))
        scanRange.append('%s:%s:%s:%s' % (chrom, strand, end, scanEnd))

        print scanRange
        scanRange = compare.subtractTwoTccLists(scanRange, predList)
        scanRange = compare.subtractTwoTccLists(scanRange, exonList)

        levels = []

        print '  Retrieving Expression levels:', cg.getTccListTotalLength(
            scanRange)
        levels = []

        hPinLevels = stepVectorScan.scanVectorsHist(scanRange, cName)
        for hPin in hPinLevels:
            levels.extend(hPinLevels[hPin])

        cidLevels[CID] = levels

    #output levels to file

    #find longest
    longest = 0
    for CID in cidLevels:
        length = len(cidLevels[CID])
        if length > longest:
            longest = length

    sortedKeys = cidLevels.keys()
    sortedKeys.sort()

    newLines = []
    for j in range(0, longest):  #how many lines are there
        newLine = []
        for CID in sortedKeys:
            if len(cidLevels[CID]) > j:  # add it
                newLine.append(str(cidLevels[CID][j]))
            else:
                newLine.append('NA')

        newLines.append('\t'.join(newLine) + '\n')

    outFileN = conf.conf['intronNoiseData']
    outFile = open(outFileN, 'w')
    outFile.write('\t'.join(sortedKeys) + '\n')
    outFile.writelines(newLines)
    outFile.close()
Ejemplo n.º 29
0
def parallelMakePeaks(tcc, cName, minExpression):
	conf = c.getConfig(cName)
	f = open('out/peakData.%s.%s.%s' % (tcc, minExpression, conf.conf['assembly']), 'w')
	print 'scanning range', tcc
	chrom, strand, start, end = cg.tccSplit(tcc)
	peaks = cgPeaks.stretch(tcc, cName)
	
	
	
	#print 'getting peaks'
	peaks.createPeaks(span = 1, minVal = int(minExpression))
	
	print 'len peaks', len(peaks.peaks)
	endCheck = 0
	for x in peaks.peaks:
		print x, endCheck
                
                '''
		if x < endCheck:
                        print 'endChecked'
			continue
	        '''

		#scan a 30 bp range around this point and find the best roof...
		pRange = 40
		rTcc = cg.makeTcc(chrom, strand, x, x + 1)
		

		#now make profile for roof...
		cProfile = stepVectorScan.profileAroundPoint(rTcc, pRange, cName, ratio = True)
		
		#now get highest stretch length and the rNext coord.
		minVal = .70
		highest = 0
		stretch = 0
		startCurrent = None
		startFinal = None
		endFinal = None
		for i in range(1 - pRange, pRange):
                        print ' ', x + i, cProfile[i] 
			if cProfile[i] > minVal:
				print '  extending stretch'
                                stretch += 1
				if startCurrent == None:
					startCurrent = i
			else:
				if stretch > 0:
					print 'end of stretch'
                                        if stretch > highest: #stretch ended and was higher than previous
						highest = stretch
						endFinal = i - 1
						startFinal = startCurrent
						startCurrent = None
					else:
						startCurrent = None
				stretch = 0
		
		#get +/- extend value...
		val = [1.0, 1.0]
                extend = 1
		if (startFinal) and (endFinal):
			low = startFinal - extend
			high = endFinal + extend
			if low > (1 - pRange) and high < pRange:
					val[0] = float(cProfile[startFinal - extend])
					val[1] = float(cProfile[endFinal + extend])
			else:
                                print 'out of range'
				continue
		else:
                        print 'no start and end of peak'
			continue
	        print low, high, x, endFinal
		endCheck = x + endFinal
		
                #avg expression around peak check...
                #get total expression before peak
                noiseExpression = 0
                lowRange = range(1 - pRange, low)
                highRange = range(high + 1, pRange) 
                totalLength = len(lowRange) + len(highRange)
                for i in lowRange:
                        noiseExpression += cProfile[i]
                for i in highRange:
                        noiseExpression += cProfile[i]
                avgNoise = noiseExpression/float(totalLength)


		#filter out peaks that look a certain way.
                print highest, val[0], val[1], avgNoise
		if 0 < highest < 5: #rooflength 14/26
			if val[0] < 0.20 and val[1] < .20: #drop values
                                if avgNoise < .3:
                                        goodTcc = cg.makeTcc(chrom, strand, x + low, x + high)
				        print '*KEEPER'
				        f.write('%s\n' % goodTcc)
	

	f.close()
	print 'DONE', tcc
Ejemplo n.º 30
0
import getHairpins
import cgGenes
import cgConfig as c
import bioLibCG as cg

mConf = c.getConfig('Main.conf')
geneSetFolder = mConf.conf['geneSetsHuman']

fN = '/home/chrisgre/projects/NoncodingHuman/results/NChuman-s3k8b17.results.sorted.introns.sorted'
cHairs = getHairpins.getHairpins(fN)

ensGenes = cgGenes.createGeneSetFromFile(geneSetFolder + '/ensemblAllTranscripts.tsv')

cDesc = {} #CID:gDesc
for CID in cHairs:
	tcc = cHairs[CID]
	
	cDesc[CID] = "NONE"
	
	overlappingGenes = ensGenes.geneOverlaps([tcc])
	if len(overlappingGenes) > 0:
		print overlappingGenes[0].type
		cDesc[CID] = overlappingGenes[0].type

f = open(fN, 'r')
newLines = []
for line in f:
	CID = line.strip().split('\t')[7]
	newLines.append(cg.appendToLine(line, cDesc[CID], 16))
f.close()
Ejemplo n.º 31
0
import cgGenes
import compareData as compare
import cgConfig as c

cName = 'mm9.conf'
mConf = c.getConfig('Main.conf')
conf = c.getConfig(cName)
organism = conf.conf['organism']
geneSetFolder = mConf.conf['geneSets%s' % organism]
genes = cgGenes.createGeneSetFromFile(geneSetFolder + '/allTransciptsType.tsv')
peakTccs = compare.tccFileToList('peakData.500.mm9', 0)


tOverlaps = genes.transcriptOverlaps(peakTccs)
typeDict = {}
for transcript in tOverlaps:
	if transcript.type not in typeDict:
		typeDict[transcript.type] = 1
	else:
		typeDict[transcript.type] += 1

#count the amounts of each type for each transcript
amount = {}
for gene in genes.genes:
	for t in gene.transcripts:
		if t.type in amount:
			amount[t.type] += 1
		else:
			amount[t.type] = 1

print 'Total Peaks:', len(peakTccs)
Ejemplo n.º 32
0
def intronNoisy(cName = None):
	mConf = c.cgConfig('Main.conf')
	conf = c.getConfig(cName)
	
	#init
	cHairs = getHairpins.getHairpins(conf.conf['resultsIntrons']) #CID: HAIRPIN
	organism = conf.conf['organism']
	exonList = compare.tccFileToList('%sExons.tcc' % organism, 0)
	slide = 1000
	
	#make prediction overlap hitmap
	predMap = {}
	predList = []
	for CID in cHairs:
		hPin = cHairs[CID]
		predList.append(hPin)
	
	#collapse Overlaps
	print ' collapsing predictions'
	predList = compare.collapseOverlaps(predList)
	print ' collapsing exons'
	exonList = compare.collapseOverlaps(exonList)
	
	
	#collect levels for each hairpin region
	cidLevels = {}
	for CID in cHairs:
		print CID
		hPin = cHairs[CID]
		chrom = ss(hPin, ':')[0]
		strand = ss(hPin, ':')[1]
		start = int(ss(hPin, ':')[2])
		end = int(ss(hPin, ':')[3])
		
		scanStart = start - slide
		scanEnd = end + slide
		
		scanRange = []
		scanRange.append('%s:%s:%s:%s' % (chrom, strand, scanStart, start))
		scanRange.append('%s:%s:%s:%s' % (chrom, strand, end, scanEnd))
		
		print scanRange
		scanRange = compare.subtractTwoTccLists(scanRange, predList)
		scanRange = compare.subtractTwoTccLists(scanRange, exonList)
			
		levels = []
		
		print '  Retrieving Expression levels:', cg.getTccListTotalLength(scanRange)
		levels = []
		
		
		hPinLevels = stepVectorScan.scanVectorsHist(scanRange, cName)
		for hPin in hPinLevels:
			levels.extend(hPinLevels[hPin])
		
			
		cidLevels[CID] = levels
		
	#output levels to file
	
	#find longest
	longest = 0
	for CID in cidLevels:
		length = len(cidLevels[CID])
		if length > longest:
			longest = length
	
	sortedKeys = cidLevels.keys()
	sortedKeys.sort()
	
	newLines = []
	for j in range(0, longest): #how many lines are there
		newLine = []
		for CID in sortedKeys:
			if len(cidLevels[CID]) > j:# add it
				newLine.append(str(cidLevels[CID][j]))
			else:
				newLine.append('NA')
	
		newLines.append('\t'.join(newLine) + '\n')
	
	outFileN = conf.conf['intronNoiseData']
	outFile = open(outFileN, 'w')
	outFile.write('\t'.join(sortedKeys) + '\n')
	outFile.writelines(newLines)
	outFile.close()
Ejemplo n.º 33
0
def scanVectorsSingleCoord(tccList, cName):
    '''Given tcc list --> scan wig files and return {coordinate: value}.

    tccList -- iterable of 'chrom:strand:start:end' strings.
    cName   -- configuration name; its 'organism' entry selects the wig
               directory ('wig<organism>' in Main.conf) and the file names.

    For each tcc the per-chromosome index file is searched for the line
    whose [begin, end) interval contains the tcc start; its first field is
    used as the byte offset to seek to in the wig file.  Wig lines are then
    read until one extends past the tcc end, and every coordinate in the
    overlap is mapped to the integer part of that line's value.  Keys are
    bare coordinates, so entries from different chromosomes/strands in
    tccList overwrite each other.

    Raises ValueError when no index line covers the start of a tcc.
    '''
    conf = c.getConfig(cName)
    org = conf.conf['organism']
    mConf = c.getConfig('Main.conf')
    wigDir = mConf.conf['wig%s' % org]

    coordDict = {}  # coordinate: value
    for tcc in tccList:
        theSplit = ss(tcc, ':')
        chrom, strand = theSplit[0], theSplit[1]
        tccStart, tccEnd = int(theSplit[2]), int(theSplit[3])

        #goto correct file, correct line in index
        fN = wigDir + '/Merge.%s.%s.wig.%s.wig' % (org.lower(), strand, chrom)
        fNindex = fN + '.index'

        #get line in index file
        startByte = None
        with open(fNindex, 'r') as iFile:
            for line in iFile:
                fields = cg.ss(line)
                beg = int(fields[1])
                end = int(fields[2])
                if beg <= tccStart < end:
                    startByte = int(fields[0])
                    break

        if startByte is None:
            # fail with a clear message instead of handing a sentinel to seek()
            raise ValueError('no index entry in %s covers position %s' % (fNindex, tccStart))

        #grab value
        with open(fN, 'r') as f:
            f.seek(startByte, 0)
            for line in f:
                fields = cg.ss(line)
                lBeg = max(int(fields[1]), tccStart)  #clip to the requested start
                lEnd = int(fields[2])
                lValue = int(fields[3].split('.')[0])

                stop = tccEnd < lEnd  #this wig line runs past the requested end
                if stop:
                    lEnd = tccEnd

                for i in range(lBeg, lEnd):
                    coordDict[i] = lValue

                if stop: break

    return coordDict
Ejemplo n.º 34
0
def exonNoisy(cName = None):
	#init
	mConf = c.cgConfig('Main.conf')
	conf = c.getConfig(cName)
	cHairs = getHairpins.getHairpins(conf.conf['resultsExons']) #CID: HAIRPIN
	organism = conf.conf['organism']
	geneSetFolder = mConf.conf['geneSets%s' % organism]
	
	#make prediction overlap hitmap
	print 'Making prediction list'
	predList = [] 
	for CID in cHairs:
		hPin = cHairs[CID]
		predList.append(hPin)
	
	if compare.checkIfOverlaps(predList):
		predList = compare.collapseOverlaps(predList)
	
	
	#make genes for Ensemble/make list of tccs for exons.
	print 'Creating gene set'
	
	ensGenes = cgGenes.createGeneSetFromFile(geneSetFolder + '/ensemblAllExons.tsv')
	print '  loaded # genes:', len(ensGenes.set)
	
	
	#collect levels for each haipin region
	print '[Checking all levels]'
	cidLevels = {}
	for CID in cHairs:
		print CID
		hPin = cHairs[CID]
			
		#for each hairpin, --> find overlapping transcripts in same gene
		overlappingGenes = ensGenes.geneOverlaps([hPin])
		if len(overlappingGenes) > 0:
			gIDs = [gene.id for gene in overlappingGenes]
			allTccs = ensGenes.getTccsFromGIDs(gIDs)
			if compare.checkIfOverlaps:
				print '  Overlaps...collapsing'
				allTccs = compare.collapseOverlaps(allTccs)
		else:
			print 'NO GENE OVERLAPS!!!!!', CID, hPin
		
		
		#filter out my predictions.
		print '  Filtering out predictions'
		checkList = compare.subtractTwoTccLists(allTccs, predList)
			
		
		#Get Expression level for gene.
		print '  Retrieving Expression levels:', cg.getTccListTotalLength(checkList)
		levels = []
		
		
		hPinLevels = stepVectorScan.scanVectorsHist(checkList, cName)
		for hPin in hPinLevels:
			levels.extend(hPinLevels[hPin])
		
			
		cidLevels[CID] = levels
		
	
	
	
	#output levels to file
	print 'Outputting to file'
	#find longest
	longest = 0
	for CID in cidLevels:
		length = len(cidLevels[CID])
		if length > longest:
			longest = length
	
	sortedKeys = cidLevels.keys()
	sortedKeys.sort()
	#print sortedKeys
	
	newLines = []
	for j in range(0, longest): #how many lines are there
		newLine = []
		for CID in sortedKeys:
			if len(cidLevels[CID]) > j:# add it
				newLine.append(str(cidLevels[CID][j]))
			else:
				newLine.append('NA')
	
		newLines.append('\t'.join(newLine) + '\n')
	
	outFileN = conf.conf['exonNoiseData']
	outFile = open(outFileN, 'w')
	outFile.write('\t'.join(sortedKeys) + '\n')
	outFile.writelines(newLines)
	outFile.close()
Ejemplo n.º 35
0
def findPeaks(pType, cName = None):
	
	#init
	mConf = c.cgConfig('Main.conf')
	conf = c.getConfig(cName)

	if pType == 'E':
		predName = conf.conf['resultsExonsSorted']
	else:
		predName = conf.conf['resultsIntronsSorted']
	
	print predName
	#make CID:hairpin:peak dictionary
	cHairs = getHairpins.getHairpins(predName)
	peakDict = {}
	for CID in cHairs:
		peakDict[CID] = [cHairs[CID],'None']
		

	timer = cg.cgTimer()
	timer.start()

	#put peaks in memory
	print 'Creating peak data'
	peaks = {} # chr:peak:value
	for CID in cHairs:
		chrom, strand, start, end = cg.tccSplit(cHairs[CID])
		tcc = cHairs[CID]
		
		#init dictionary
		if chrom not in peaks:
			peaks[chrom] = {}
		
		if strand not in peaks[chrom]:
			peaks[chrom][strand] = {}
		
		#create peaks for tcc and add to peak dictionary
		stretch = cgPeaks.stretch(tcc, cName)
		stretch.createPeaks()
		for peakCoord in stretch.peaks:
			peaks[chrom][strand][peakCoord] = 0
	print timer.split()

	print 'finding best combos'
	bestCombos = []
	aPass = 0
	bPass = 0
	cPass = 0
	numT = 0
	for CID in peakDict:
		cgFlag = False
		if CID == '538':cgFlag = True
		
		tcc = peakDict[CID][0]
		#print tcc
		tccPeaks = []
		chrom = cg.ss(tcc, ':')[0]
		strand = cg.ss(tcc, ':')[1]
		start = int(cg.ss(tcc, ':')[2])
		end = int(cg.ss(tcc, ':')[3])
		
		#get all peaks
		for i in range(start, end + 1):
			if i in peaks[chrom][strand]:
				#print '  peak added', i
				tccPeaks.append(i)
		
		#Calculate parameters...
		pairStrings = [] #used to check if pair already added
		peakCombos = []
		for x in tccPeaks:
				
								
				#scan a 30 bp range around this point and find the best roof...
				pRange = 30
				rTcc = cg.makeTcc(chrom, strand, x, x + 1)
				
				#quickly get max value...kinda a long way to do it but whatever
				cProfile = stepVectorScan.profileAroundPoint(rTcc, 1, cName, ratio = False)
				xval = cProfile[0]
				max = xval
				highestValueCoord = x
				
				#now make profile for roof...
				cProfile = stepVectorScan.profileAroundPoint(rTcc, pRange, cName, ratio = True)
				
				
				
				#now get highest stretch length and the rNext coord.
				minVal = .80
				highest = 0
				stretch = 0
				startCurrent = None
				startFinal = None
				endFinal = None
				for i in range(1 - pRange, pRange):
					if cProfile[i] > minVal:
						stretch += 1
						if startCurrent == None:
							startCurrent = i
					else:
						if stretch > 0:
							if stretch > highest: #stretch ended and was higher than previous
								highest = stretch
								endFinal = i - 1
								startFinal = startCurrent
								startCurrent = None
							else:
								startCurrent = None
						stretch = 0
				
				#get +/- 4 value...
				val = [1.0, 1.0]
				if (startFinal) and (endFinal):
					low = startFinal - 4
					high = endFinal + 4
					if low > (1 - pRange):
						if high < pRange:
							val[0] = float(cProfile[startFinal - 4])
							val[1] = float(cProfile[endFinal + 4])
				
				#fill in other details...
				y = 'S'
				dist = 'S'
				ratio = 'S'
				
				peakCombos.append([tcc,x,y,dist,ratio,max,highest,val])
				#print '  ', peakCombos[-1]
		
		#find best combo...
		topCombo = None
		for combo in peakCombos:
			roofLength = combo[6]
			dropValue = combo[7][0]
			if combo[7][1] > dropValue:
				dropValue = combo[7][1]
			
			#print roofLength, dropValue
			if 14 < roofLength < 26:
				if 0.0 < dropValue < 0.2:
					#pick one with rooflength nearest 20:
					if topCombo:
						if (math.fabs(22 - roofLength)) < (math.fabs(22 - topCombo[6])):
							topCombo = combo
					else:
						topCombo = combo
		
		if topCombo:
			peakDict[CID][1] = topCombo
			bestCombos.append(topCombo)
			print bestCombos[-1]
		else:
			#print 'None'
			pass

	print timer.split()


	#now update predFile (SLOT 13)
	predFile = open(predName, 'r')
	newLines = []
	for line in predFile:
		CID = cg.ss(line)[7]
		if peakDict[CID][1] == 'None':
			peakInfo = 'None'
		else:
			peakInfo = '%s:%s:%s:%s:%s:%s' % (str(peakDict[CID][1][1])[-3:], 'S', str(peakDict[CID][1][4]).split('.')[0], peakDict[CID][1][5],peakDict[CID][1][6], peakDict[CID][1][7])
		newLines.append(cg.appendToLine(line, peakInfo, 13))
	predFile.close()

	predFile = open(predName, 'w')
	predFile.writelines(newLines)
	predFile.close()
Ejemplo n.º 36
0
def updateReadDensity(tType, cName):
    #go through wig each chromosome and check the mature seqs
    mainConf = cgConfig.cgConfig('Main.conf')
    conf = cgConfig.getConfig(cName)
    organism = conf.conf['organism']
    wigFolder = mainConf.conf['wig%s' % organism]
    newLines = []

    #Differentiate between exon or intron...
    if tType == 'E':
        pFileName = conf.conf['resultsExons']
    elif tType == 'I':
        pFileName = conf.conf['resultsIntrons']
    else:
        print 'READ UPDATE FAIL'

    print '  Updating Read Density:', tType

    #get read density for each line...
    print '  calculating hits for mature seqs'
    #calculate total hits per mature
    mirFile = open(pFileName, 'r')
    for line in mirFile:
        mTcc = line.strip().split('\t')[1]
        mirID = line.strip().split('\t')[0]

        tccStretch = cgPeaks.stretch(mTcc, cName)
        highestHit = 0
        for i in range(int(mTcc.split(':')[2]), int(mTcc.split(':')[3])):
            if i in tccStretch.profile:
                if tccStretch.profile[i] > highestHit:
                    highestHit = tccStretch.profile[i]

        newLines.append(cg.appendToLine(line, str(highestHit), 11))

    mirFile.close()

    print 'Writing New File'
    #write new results file
    outFile = open(pFileName, 'w')
    for line in newLines:
        outFile.write(line)
    outFile.close()

    ####NOW UPDATE HIGHEST HIT PER CLUSTER####

    clusterCount = {}

    pFile = open(pFileName, 'r')
    for line in pFile:
        predictionCount = int(line.strip().split('\t')[11])
        CID = line.strip().split('\t')[7]
        if CID in clusterCount:
            if clusterCount[CID] < predictionCount:
                clusterCount[CID] = predictionCount
        else:
            clusterCount[CID] = predictionCount
    pFile.close()

    #update the file --> cluster small count
    newLines = []
    predFile = open(pFileName, 'r')
    for line in predFile:
        CID = line.strip().split('\t')[7]
        numMax = clusterCount[CID]
        newLines.append(cg.appendToLine(line, str(numMax), 12))
    predFile.close()

    #sort newLines by clusterID
    sortDict = {}
    CIDs = []
    for line in newLines:
        CID = int(line.strip().split('\t')[7])
        if CID not in CIDs:
            CIDs.append(CID)
        if CID in sortDict:
            sortDict[CID].append(line)
        else:
            sortDict[CID] = [line]

    CIDs.sort()

    newLines = []
    for CID in CIDs:
        for line in sortDict[CID]:
            newLines.append(line)

    #write new File
    newFile = open(pFileName, 'w')
    for line in newLines:
        newFile.write(line)
    newFile.close()
Ejemplo n.º 37
0
def updateNoise(pType, cName=None):

    #init
    mainConf = c.cgConfig('Main.conf')
    conf = c.getConfig(cName)

    if pType == 'E':
        predName = conf.conf['resultsExons']
    else:
        predName = conf.conf['resultsIntrons']

    #populate cid: exon dist
    print 'Populating CID/INtron/exon distribution data'
    if pType == 'E':
        noiseFN = conf.conf['exonNoiseData']
        f = open(noiseFN, 'r')
    else:
        noiseFN = conf.conf['intronNoiseData']
        f = open(noiseFN, 'r')

    exonDists = {}  #cid: [exon dist]
    header = f.readline()
    order = {}  # num:CID
    for i, CID in enumerate(header.strip().split('\t')):
        order[i] = CID
        exonDists[CID] = []

    for line in f:
        data = line.strip().split('\t')
        for i, dataPoint in enumerate(data):
            if dataPoint == 'NA' or dataPoint == '':
                continue
            else:
                dataPoint = float(dataPoint)
                CID = order[i]
                exonDists[CID].append(dataPoint)

    #get highest expression level for each cluster
    print 'Populating highest expression levels'
    predExpression = {}  # CID; highest level
    exonFile = open(predName, 'r')
    for line in exonFile:
        CID = line.strip().split('\t')[7]
        hDensity = line.strip().split('\t')[12]

        predExpression[CID] = hDensity

    #get pVals for each CID
    print 'Getting pvals for each cluster'
    pVals = {}  # CID; [lam,pVal]
    for CID in exonDists:
        if not len(exonDists[CID]) > 0:  #no data in 2kb range.
            lam = 'NA'
            pVal = 'NA'
        else:
            lam = cgStats.getLam(exonDists[CID])
            pVal = cgStats.getPValExp(predExpression[CID], lam)

        pVals[CID] = [
            lam, pVal
        ]  #lam gives a good approximation of noise levels in region...

    print 'Updating the file'
    #update file...
    predFile = open(predName, 'r')
    newLines = []
    for line in predFile:
        CID = line.split('\t')[7]
        newLine = cg.appendToLine(line, pVals[CID][0], 14)
        newLine = cg.appendToLine(newLine, pVals[CID][1], 15)
        newLines.append(newLine)
    predFile.close()

    predFile = open(predName, 'w')
    predFile.writelines(newLines)
    predFile.close()
Ejemplo n.º 38
0
def sortResults(cName=None):
    '''Filter the results file by cluster density and write it sorted by density.

    cName -- configuration name handed to cgConfig.getConfig.

    A line is kept when the highest density (column 5) found in its cluster
    (column 7) is at least 4 and column 8 is '0' (the cluster does not
    overlap anything known).  Kept lines are written to
    conf['results'] + '.sorted', ordered by their own column 5, highest
    first; lines with equal density keep their file order.  The number of
    clusters seen and the number of clusters with at least one kept line
    are written to 'statFile.data'.

    Returns None.
    '''
    #INIT
    conf = cgConfig.getConfig(cName)

    pFileName = conf.conf['results']
    minDensity = 4
    sID = 5  #column holding the prediction density

    with open(pFileName, 'r') as pFile:
        fileLines = [line.strip().split('\t') for line in pFile]

    #highest prediction density
    densityDict = {}  #CID: highest density --> used to sort out clusters without proper density
    for line in fileLines:
        CID = line[7]
        pDensity = int(line[sID])
        if CID not in densityDict or pDensity > densityDict[CID]:
            densityDict[CID] = pDensity

    #take out clusters that didn't make the cut
    CIDpassed = set()
    keptLines = []
    for line in fileLines:
        CID = line[7]
        if densityDict[CID] >= minDensity and line[8] == '0':
            keptLines.append(line)
            CIDpassed.add(CID)

    #the density column is written back in canonical integer form
    for line in keptLines:
        line[sID] = str(int(line[sID]))

    #sort by density, highest first (sorted is stable)
    sortedData = sorted(keptLines, key=lambda line: int(line[sID]), reverse=True)

    #output
    with open(pFileName + '.sorted', 'w') as sortedFile:
        sortedFile.writelines('\t'.join(line) + '\n' for line in sortedData)

    #Now output stats; "with" makes sure the file is flushed and closed
    with open('statFile.data', 'w') as statFile:
        statFile.write('Total Clusters: %s\n' % len(densityDict))
        statFile.write('Passed: %s\n' % len(CIDpassed))
Ejemplo n.º 39
0
def findPeaks(pType, cName=None):

    #init
    mConf = c.cgConfig('Main.conf')
    conf = c.getConfig(cName)

    if pType == 'E':
        predName = conf.conf['resultsExonsSorted']
    else:
        predName = conf.conf['resultsIntronsSorted']

    print predName
    #make CID:hairpin:peak dictionary
    cHairs = getHairpins.getHairpins(predName)
    peakDict = {}
    for CID in cHairs:
        peakDict[CID] = [cHairs[CID], 'None']

    timer = cg.cgTimer()
    timer.start()

    #put peaks in memory
    print 'Creating peak data'
    peaks = {}  # chr:peak:value
    for CID in cHairs:
        chrom, strand, start, end = cg.tccSplit(cHairs[CID])
        tcc = cHairs[CID]

        #init dictionary
        if chrom not in peaks:
            peaks[chrom] = {}

        if strand not in peaks[chrom]:
            peaks[chrom][strand] = {}

        #create peaks for tcc and add to peak dictionary
        stretch = cgPeaks.stretch(tcc, cName)
        stretch.createPeaks()
        for peakCoord in stretch.peaks:
            peaks[chrom][strand][peakCoord] = 0
    print timer.split()

    print 'finding best combos'
    bestCombos = []
    aPass = 0
    bPass = 0
    cPass = 0
    numT = 0
    for CID in peakDict:
        cgFlag = False
        if CID == '538': cgFlag = True

        tcc = peakDict[CID][0]
        #print tcc
        tccPeaks = []
        chrom = cg.ss(tcc, ':')[0]
        strand = cg.ss(tcc, ':')[1]
        start = int(cg.ss(tcc, ':')[2])
        end = int(cg.ss(tcc, ':')[3])

        #get all peaks
        for i in range(start, end + 1):
            if i in peaks[chrom][strand]:
                #print '  peak added', i
                tccPeaks.append(i)

        #Calculate parameters...
        pairStrings = []  #used to check if pair already added
        peakCombos = []
        for x in tccPeaks:

            #scan a 30 bp range around this point and find the best roof...
            pRange = 30
            rTcc = cg.makeTcc(chrom, strand, x, x + 1)

            #quickly get max value...kinda a long way to do it but whatever
            cProfile = stepVectorScan.profileAroundPoint(rTcc,
                                                         1,
                                                         cName,
                                                         ratio=False)
            xval = cProfile[0]
            max = xval
            highestValueCoord = x

            #now make profile for roof...
            cProfile = stepVectorScan.profileAroundPoint(rTcc,
                                                         pRange,
                                                         cName,
                                                         ratio=True)

            #now get highest stretch length and the rNext coord.
            minVal = .80
            highest = 0
            stretch = 0
            startCurrent = None
            startFinal = None
            endFinal = None
            for i in range(1 - pRange, pRange):
                if cProfile[i] > minVal:
                    stretch += 1
                    if startCurrent == None:
                        startCurrent = i
                else:
                    if stretch > 0:
                        if stretch > highest:  #stretch ended and was higher than previous
                            highest = stretch
                            endFinal = i - 1
                            startFinal = startCurrent
                            startCurrent = None
                        else:
                            startCurrent = None
                    stretch = 0

            #get +/- 4 value...
            val = [1.0, 1.0]
            if (startFinal) and (endFinal):
                low = startFinal - 4
                high = endFinal + 4
                if low > (1 - pRange):
                    if high < pRange:
                        val[0] = float(cProfile[startFinal - 4])
                        val[1] = float(cProfile[endFinal + 4])

            #fill in other details...
            y = 'S'
            dist = 'S'
            ratio = 'S'

            peakCombos.append([tcc, x, y, dist, ratio, max, highest, val])
            #print '  ', peakCombos[-1]

        #find best combo...
        topCombo = None
        for combo in peakCombos:
            roofLength = combo[6]
            dropValue = combo[7][0]
            if combo[7][1] > dropValue:
                dropValue = combo[7][1]

            #print roofLength, dropValue
            if 14 < roofLength < 26:
                if 0.0 < dropValue < 0.2:
                    #pick one with rooflength nearest 20:
                    if topCombo:
                        if (math.fabs(22 - roofLength)) < (
                                math.fabs(22 - topCombo[6])):
                            topCombo = combo
                    else:
                        topCombo = combo

        if topCombo:
            peakDict[CID][1] = topCombo
            bestCombos.append(topCombo)
            print bestCombos[-1]
        else:
            #print 'None'
            pass

    print timer.split()

    #now update predFile (SLOT 13)
    predFile = open(predName, 'r')
    newLines = []
    for line in predFile:
        CID = cg.ss(line)[7]
        if peakDict[CID][1] == 'None':
            peakInfo = 'None'
        else:
            peakInfo = '%s:%s:%s:%s:%s:%s' % (
                str(peakDict[CID][1][1])[-3:], 'S', str(
                    peakDict[CID][1][4]).split('.')[0], peakDict[CID][1][5],
                peakDict[CID][1][6], peakDict[CID][1][7])
        newLines.append(cg.appendToLine(line, peakInfo, 13))
    predFile.close()

    predFile = open(predName, 'w')
    predFile.writelines(newLines)
    predFile.close()
Ejemplo n.º 40
0
def scanVectorsSingleCoord(tccList, cName):
    '''Given tcc list --> scan wig files and return {coordinate: value}.

    tccList -- iterable of 'chrom:strand:start:end' strings.
    cName   -- configuration name; its 'organism' entry selects the wig
               directory ('wig<organism>' in Main.conf) and the file names.

    For each tcc the per-chromosome index file is searched for the line
    whose [begin, end) interval contains the tcc start; its first field is
    used as the byte offset to seek to in the wig file.  Wig lines are then
    read until one extends past the tcc end, and every coordinate in the
    overlap is mapped to the integer part of that line's value.  Keys are
    bare coordinates, so entries from different chromosomes/strands in
    tccList overwrite each other.

    Raises ValueError when no index line covers the start of a tcc.
    '''
    conf = c.getConfig(cName)
    org = conf.conf['organism']
    mConf = c.getConfig('Main.conf')
    wigDir = mConf.conf['wig%s' % org]

    coordDict = {} # coordinate: value
    for tcc in tccList:
        theSplit = ss(tcc, ':')
        chrom, strand = theSplit[0], theSplit[1]
        tccStart, tccEnd = int(theSplit[2]), int(theSplit[3])

        #goto correct file, correct line in index
        fN = wigDir + '/Merge.%s.%s.wig.%s.wig' % (org.lower(), strand, chrom)
        fNindex = fN + '.index'

        #get line in index file
        startByte = None
        with open(fNindex, 'r') as iFile:
            for line in iFile:
                fields = cg.ss(line)
                beg = int(fields[1])
                end = int(fields[2])
                if beg <= tccStart < end:
                    startByte = int(fields[0])
                    break

        if startByte is None:
            # fail with a clear message instead of handing a sentinel to seek()
            raise ValueError('no index entry in %s covers position %s' % (fNindex, tccStart))

        #grab value
        with open(fN, 'r') as f:
            f.seek(startByte, 0)
            for line in f:
                fields = cg.ss(line)
                lBeg = max(int(fields[1]), tccStart) #clip to the requested start
                lEnd = int(fields[2])
                lValue = int(fields[3].split('.')[0])

                stop = tccEnd < lEnd #this wig line runs past the requested end
                if stop:
                    lEnd = tccEnd

                for i in range(lBeg, lEnd):
                    coordDict[i] = lValue

                if stop: break

    return coordDict
Ejemplo n.º 41
0
def scanVectorsHist(tccList, cName):
    '''Given tcc list --> scan wig files and get histogram values
    can be modified to do single/total values...
    THIS USES INDEXES!!! = BAD...

    tccList -- iterable of 'chrom:strand:start:end' strings.
    cName   -- configuration name; its 'organism' entry selects the wig
               directory ('wig<organism>' in Main.conf) and the file names.

    Returns {tcc: [value, value, ...]} with one entry per coordinate of
    the tcc that the scanned wig lines cover (the integer part of the wig
    value, repeated for every covered coordinate).  A tcc without any
    covered coordinate gets no key.

    Raises ValueError when no index line covers the start of a tcc.
    '''
    conf = c.getConfig(cName)
    org = conf.conf['organism']
    mConf = c.getConfig('Main.conf')
    wigDir = mConf.conf['wig%s' % org]

    histDict = {}  # tcc: [list values]
    for tcc in tccList:
        theSplit = ss(tcc, ':')
        chrom, strand = theSplit[0], theSplit[1]
        tccStart, tccEnd = int(theSplit[2]), int(theSplit[3])

        #goto correct file, correct line in index
        fN = wigDir + '/Merge.%s.%s.wig.%s.wig' % (org.lower(), strand, chrom)
        fNindex = fN + '.index'

        #get line in index file
        startByte = None
        with open(fNindex, 'r') as iFile:
            for line in iFile:
                fields = cg.ss(line)
                beg = int(fields[1])
                end = int(fields[2])
                if beg <= tccStart < end:
                    startByte = int(fields[0])
                    break

        if startByte is None:
            # fail with a clear message instead of handing a sentinel to seek()
            raise ValueError('no index entry in %s covers position %s' % (fNindex, tccStart))

        #grab value
        with open(fN, 'r') as f:
            f.seek(startByte, 0)
            for line in f:
                fields = cg.ss(line)
                lBeg = max(int(fields[1]), tccStart)  #clip to the requested start
                lEnd = int(fields[2])
                lValue = int(fields[3].split('.')[0])

                stop = tccEnd < lEnd  #this wig line runs past the requested end
                if stop:
                    lEnd = tccEnd

                #one copy of the value per covered coordinate; the key is
                #only created when there is something to add
                covered = lEnd - lBeg
                if covered > 0:
                    histDict.setdefault(tcc, []).extend([lValue] * covered)

                if stop: break

    return histDict
Ejemplo n.º 42
0
def parallelMakePeaks(tcc, cName, minExpression):
    conf = c.getConfig(cName)
    f = open(
        'out/peakData.%s.%s.%s' % (tcc, minExpression, conf.conf['assembly']),
        'w')
    print 'scanning range', tcc
    chrom, strand, start, end = cg.tccSplit(tcc)
    peaks = cgPeaks.stretch(tcc, cName)

    #print 'getting peaks'
    peaks.createPeaks(span=1, minVal=int(minExpression))

    print 'len peaks', len(peaks.peaks)
    endCheck = 0
    for x in peaks.peaks:
        print x, endCheck
        '''
		if x < endCheck:
                        print 'endChecked'
			continue
	        '''

        #scan a 30 bp range around this point and find the best roof...
        pRange = 40
        rTcc = cg.makeTcc(chrom, strand, x, x + 1)

        #now make profile for roof...
        cProfile = stepVectorScan.profileAroundPoint(rTcc,
                                                     pRange,
                                                     cName,
                                                     ratio=True)

        #now get highest stretch length and the rNext coord.
        minVal = .70
        highest = 0
        stretch = 0
        startCurrent = None
        startFinal = None
        endFinal = None
        for i in range(1 - pRange, pRange):
            print ' ', x + i, cProfile[i]
            if cProfile[i] > minVal:
                print '  extending stretch'
                stretch += 1
                if startCurrent == None:
                    startCurrent = i
            else:
                if stretch > 0:
                    print 'end of stretch'
                    if stretch > highest:  #stretch ended and was higher than previous
                        highest = stretch
                        endFinal = i - 1
                        startFinal = startCurrent
                        startCurrent = None
                    else:
                        startCurrent = None
                stretch = 0

        #get +/- extend value...
        val = [1.0, 1.0]
        extend = 1
        if (startFinal) and (endFinal):
            low = startFinal - extend
            high = endFinal + extend
            if low > (1 - pRange) and high < pRange:
                val[0] = float(cProfile[startFinal - extend])
                val[1] = float(cProfile[endFinal + extend])
            else:
                print 'out of range'
                continue
        else:
            print 'no start and end of peak'
            continue
        print low, high, x, endFinal
        endCheck = x + endFinal

        #avg expression around peak check...
        #get total expression before peak
        noiseExpression = 0
        lowRange = range(1 - pRange, low)
        highRange = range(high + 1, pRange)
        totalLength = len(lowRange) + len(highRange)
        for i in lowRange:
            noiseExpression += cProfile[i]
        for i in highRange:
            noiseExpression += cProfile[i]
        avgNoise = noiseExpression / float(totalLength)

        #filter out peaks that look a certain way.
        print highest, val[0], val[1], avgNoise
        if 0 < highest < 5:  #rooflength 14/26
            if val[0] < 0.20 and val[1] < .20:  #drop values
                if avgNoise < .3:
                    goodTcc = cg.makeTcc(chrom, strand, x + low, x + high)
                    print '*KEEPER'
                    f.write('%s\n' % goodTcc)

    f.close()
    print 'DONE', tcc
Ejemplo n.º 43
0
def makePeakInput(cName, minExpression = 2000):
	
	mConf = c.getConfig('Main.conf')
	conf = c.getConfig(cName)
	
	assembly = conf.conf['assembly']
	
	tccList = []
	
	chromLens = cg.returnChromLengthDict(assembly)
	f = open('peakData.%s' % minExpression, 'w')
	for chrom in chromLens:
		if chrom not in cg.acceptableChroms: continue
		for strand in ['1', '-1']:
			print 'Getting Peaks for ', chrom, strand
			prevI = 0
			endCheck = 0
			for i in rangePoints(1, chromLens[chrom], 1000):
				if i == 1:
					prevI = i
					continue
				
				start = prevI
				end = i
				prevI = i
				
				tcc = cg.makeTcc(chrom, strand, start, end)
				#print 'scanning range', tcc
				peaks = cgPeaks.stretch(tcc, cName)
				peaks.createPeaks(span = 3, minVal = minExpression)
				
				for x in peaks.peaks:
					
					if x < endCheck:
						continue
				
					#scan a 30 bp range around this point and find the best roof...
					pRange = 30
					rTcc = cg.makeTcc(chrom, strand, x, x + 1)
					
	
					#now make profile for roof...
					cProfile = stepVectorScan.profileAroundPoint(rTcc, pRange, cName, ratio = True)
					
					
					
					#now get highest stretch length and the rNext coord.
					minVal = .80
					highest = 0
					stretch = 0
					startCurrent = None
					startFinal = None
					endFinal = None
					for i in range(1 - pRange, pRange):
						if cProfile[i] > minVal:
							stretch += 1
							if startCurrent == None:
								startCurrent = i
						else:
							if stretch > 0:
								if stretch > highest: #stretch ended and was higher than previous
									highest = stretch
									endFinal = i - 1
									startFinal = startCurrent
									startCurrent = None
								else:
									startCurrent = None
							stretch = 0
					
					#get +/- 4 value...
					val = [1.0, 1.0]
					if (startFinal) and (endFinal):
						low = startFinal - 4
						high = endFinal + 4
						if low > (1 - pRange) and high < pRange:
								val[0] = float(cProfile[startFinal - 4])
								val[1] = float(cProfile[endFinal + 4])
						else:
							continue
					else:
						continue
					
					endCheck = x + high
					
					#filter out peaks that look a certain way.
					if 14 < highest < 26: #rooflength
						if val[0] < 0.2 and val[1] < .2: #drop values
							goodTcc = cg.makeTcc(chrom, strand, x + low, x + high)
							#print goodTcc
							f.write('%s\n' % goodTcc)
	f.close()
Ejemplo n.º 44
0
def makePeakInput(cName, minExpression=2000):

    mConf = c.getConfig('Main.conf')
    conf = c.getConfig(cName)

    assembly = conf.conf['assembly']

    tccList = []

    chromLens = cg.returnChromLengthDict(assembly)
    f = open('peakData.%s' % minExpression, 'w')
    for chrom in chromLens:
        if chrom not in cg.acceptableChroms: continue
        for strand in ['1', '-1']:
            print 'Getting Peaks for ', chrom, strand
            prevI = 0
            endCheck = 0
            for i in rangePoints(1, chromLens[chrom], 1000):
                if i == 1:
                    prevI = i
                    continue

                start = prevI
                end = i
                prevI = i

                tcc = cg.makeTcc(chrom, strand, start, end)
                #print 'scanning range', tcc
                peaks = cgPeaks.stretch(tcc, cName)
                peaks.createPeaks(span=3, minVal=minExpression)

                for x in peaks.peaks:

                    if x < endCheck:
                        continue

                    #scan a 30 bp range around this point and find the best roof...
                    pRange = 30
                    rTcc = cg.makeTcc(chrom, strand, x, x + 1)

                    #now make profile for roof...
                    cProfile = stepVectorScan.profileAroundPoint(rTcc,
                                                                 pRange,
                                                                 cName,
                                                                 ratio=True)

                    #now get highest stretch length and the rNext coord.
                    minVal = .80
                    highest = 0
                    stretch = 0
                    startCurrent = None
                    startFinal = None
                    endFinal = None
                    for i in range(1 - pRange, pRange):
                        if cProfile[i] > minVal:
                            stretch += 1
                            if startCurrent == None:
                                startCurrent = i
                        else:
                            if stretch > 0:
                                if stretch > highest:  #stretch ended and was higher than previous
                                    highest = stretch
                                    endFinal = i - 1
                                    startFinal = startCurrent
                                    startCurrent = None
                                else:
                                    startCurrent = None
                            stretch = 0

                    #get +/- 4 value...
                    val = [1.0, 1.0]
                    if (startFinal) and (endFinal):
                        low = startFinal - 4
                        high = endFinal + 4
                        if low > (1 - pRange) and high < pRange:
                            val[0] = float(cProfile[startFinal - 4])
                            val[1] = float(cProfile[endFinal + 4])
                        else:
                            continue
                    else:
                        continue

                    endCheck = x + high

                    #filter out peaks that look a certain way.
                    if 14 < highest < 26:  #rooflength
                        if val[0] < 0.2 and val[1] < .2:  #drop values
                            goodTcc = cg.makeTcc(chrom, strand, x + low,
                                                 x + high)
                            #print goodTcc
                            f.write('%s\n' % goodTcc)
    f.close()
Ejemplo n.º 45
0
def scanVectorsHist(tccList, cName):
    '''Collect per-position wig values for every tcc in tccList.

    For each tcc ("chrom:strand:start:end") the matching per-chromosome wig
    file under the configured wig directory is opened.  Its companion
    ".index" file is searched for the row whose [begin, end) interval
    contains the tcc start; the first field of that row is used as the byte
    offset to seek to in the wig file.  Wig rows are then read until one
    extends past the tcc end, and the integer part of each row's value is
    repeated once per covered position.

    tccList -- iterable of tcc strings
    cName   -- configuration name handed to c.getConfig

    Returns a dict mapping tcc -> list of int values.  A tcc for which no
    position was covered has no entry.

    Raises ValueError when the index file has no row containing the tcc
    start (previously this surfaced as a TypeError from seek()).
    '''
    conf = c.getConfig(cName)
    org = conf.conf['organism']
    mConf = c.getConfig('Main.conf')
    wigDir = mConf.conf['wig%s' % org]

    histDict = {}  # tcc: [list values]
    for tcc in tccList:
        theSplit = ss(tcc, ':')
        chrom, strand = theSplit[0], theSplit[1]
        tccStart, tccEnd = int(theSplit[2]), int(theSplit[3])

        #goto correct file, correct line in index
        fN = wigDir + '/Merge.%s.%s.wig.%s.wig' % (org.lower(), strand, chrom)
        fNindex = fN + '.index'

        #get byte offset from the index file
        startByte = None
        with open(fNindex, 'r') as iFile:
            for line in iFile:
                fields = cg.ss(line)  # parse each row once
                beg = int(fields[1])
                end = int(fields[2])
                if beg <= tccStart < end:
                    startByte = int(fields[0])
                    break

        if startByte is None:
            raise ValueError('no index entry in %s covers position %s (%s)'
                             % (fNindex, tccStart, tcc))

        #grab values
        with open(fN, 'r') as f:
            f.seek(startByte, 0)
            for line in f:
                fields = cg.ss(line)
                lBeg = max(int(fields[1]), tccStart)
                lEnd = int(fields[2])
                lValue = int(fields[3].split('.')[0])

                # a row reaching past the tcc end is the last one needed
                stop = tccEnd < lEnd
                if stop:
                    lEnd = tccEnd

                # one copy of the value per covered position; the key is
                # only created once there is something to store
                count = lEnd - lBeg
                if count > 0:
                    histDict.setdefault(tcc, []).extend([lValue] * count)
                if stop:
                    break

    return histDict
Ejemplo n.º 46
0
def updateReadDensity(tType, cName):
	#go through wig each chromosome and check the mature seqs
	mainConf = cgConfig.cgConfig('Main.conf')
	conf = cgConfig.getConfig(cName)
	organism = conf.conf['organism']
	wigFolder = mainConf.conf['wig%s' % organism]	
	newLines = []
	
	
	#Differentiate between exon or intron...
	if tType == 'E':
		pFileName = conf.conf['resultsExons']
	elif tType == 'I':
		pFileName = conf.conf['resultsIntrons']
	else:
		print 'READ UPDATE FAIL'

	print '  Updating Read Density:', tType

	
	#get read density for each line...
	print '  calculating hits for mature seqs'
	#calculate total hits per mature
	mirFile = open(pFileName, 'r')
	for line in mirFile:
		mTcc = line.strip().split('\t')[1]
		mirID = line.strip().split('\t')[0]
		
		tccStretch = cgPeaks.stretch(mTcc, cName)
		highestHit = 0
		for i in range(int(mTcc.split(':')[2]), int(mTcc.split(':')[3])):
			if i in tccStretch.profile:
				if tccStretch.profile[i] > highestHit:
					highestHit = tccStretch.profile[i]		
		
		newLines.append(cg.appendToLine(line, str(highestHit), 11))
	
	mirFile.close()

	print 'Writing New File'
	#write new results file
	outFile = open(pFileName, 'w')
	for line in newLines:
		outFile.write(line)
	outFile.close()

	####NOW UPDATE HIGHEST HIT PER CLUSTER####

	clusterCount = {}

	pFile = open(pFileName, 'r')
	for line in pFile:
		predictionCount = int(line.strip().split('\t')[11])
		CID = line.strip().split('\t')[7]
		if CID in clusterCount:
			if clusterCount[CID] < predictionCount:
				clusterCount[CID] = predictionCount
		else:
			clusterCount[CID] = predictionCount
	pFile.close()

	#update the file --> cluster small count
	newLines = []
	predFile = open(pFileName, 'r')
	for line in predFile:
		CID = line.strip().split('\t')[7]
		numMax = clusterCount[CID]
		newLines.append(cg.appendToLine(line, str(numMax), 12))
	predFile.close()

	#sort newLines by clusterID
	sortDict = {}
	CIDs = []
	for line in newLines:
		CID = int(line.strip().split('\t')[7])
		if CID not in CIDs:
			CIDs.append(CID)
		if CID in sortDict:
			sortDict[CID].append(line)
		else:
			sortDict[CID] = [line]
		
	CIDs.sort()

	newLines = []
	for CID in CIDs:
		for line in sortDict[CID]:
			newLines.append(line)

	#write new File
	newFile = open(pFileName, 'w')
	for line in newLines:
		newFile.write(line)
	newFile.close()
Ejemplo n.º 47
0
def defineClusters(cName=None):
    #Start Timer
    timer = cg.cgTimer()
    timer.start()

    #Get list of mature tccs
    conf = cgConfig.getConfig(cName)  #passed or default
    finalMirFileName = conf.conf['resultsRaw']
    matureTccs = compare.tccFileToList(finalMirFileName,
                                       1)  # list of all mature micro in tcc
    print 'List getting', timer.split()

    #make connections dict
    matureConnections = compare.makeConnectionsDict(matureTccs)
    print 'Make connections:', timer.split()

    #Now have to define Clusters...
    clusters = []
    addedList = []

    #I don't think python passes by reference? also I think this function is in the middle because it uses a global variable :P
    def createClusters(item=None, mode=None):

        if item in addedList:
            return 0
        elif mode == "top":
            clusters.append([item])
            addedList.append(
                item)  ##creates new cluster with the item already stored in it
            for connectedItem in matureConnections[item]:
                createClusters(connectedItem, "neighbor")
        elif mode == "neighbor":
            clusters[-1].append(
                item)  #add this item to the last cluster created
            addedList.append(item)
            for connectedItem in matureConnections[item]:
                createClusters(connectedItem, "neighbor")

    for tcc in matureTccs:
        createClusters(tcc, "top")

    print 'Make Clusters', timer.split()

    #Sort Clusters.
    sortedClusters = []

    for cluster in clusters:
        sortedClusters.append(cg.sortTccList(cluster))

    print 'Sort Clusters:', timer.split()

    #Output sorted cluster file
    clusterFileName = conf.conf['sortedClusters']
    clusterFile = open(clusterFileName, 'w')
    for cluster in sortedClusters:
        for hit in cluster:
            clusterFile.write('%s,' % hit)
        clusterFile.write('\n')
    clusterFile.close()
    '''
	#re-create sortedClusters list:
	clusterFileName = 'sortedClusters.data'
	clusterFile = open(clusterFileName, 'r')
	sortedClusters = []
	
	
	for line in clusterFile:
		sortedClusters.append([])
		line = line.strip()[0:-1] #take off last comma ;P
		for hit in (line.strip().split(',')):
			sortedClusters[-1].append(hit)
	'''

    print 'Store intermediate data:', timer.split()

    #output hitsAround file
    outputFile = open(conf.conf['hitsPerFrame'], 'w')

    frameLength = 200
    frameShift = 1
    for cluster in sortedClusters:
        #grab first and last coordinate from cluster, for each cluster deduce how many theoretical microRNAs were in hitScope
        clusterChrom = cluster[0].split(":")[0]
        clusterStrand = cluster[0].split(":")[1]
        firstCoord = int(cluster[0].split(":")[2])
        #print cluster[-1]
        lastCoord = int(cluster[-1].split(":")[3])

        startCoord = firstCoord
        while startCoord < lastCoord:
            #count how many hits there are in this range
            rangeStart = startCoord - (frameLength / 2)
            rangeEnd = startCoord + (frameLength / 2)
            rangeTcc = '%s:%s:%s:%s' % (clusterChrom, clusterStrand,
                                        rangeStart, rangeEnd)
            overlappedList = compare.compareTwoTcc([rangeTcc], cluster, 2)
            hitCount = len(overlappedList)

            #output
            outputFile.write('%s\t%s\n' % (rangeTcc, hitCount))
            startCoord = startCoord + frameShift  #check overlap with range
    outputFile.close()

    print 'Output Hits per Frame:', timer.split()
    print 'Overall Time:', timer.report()
Ejemplo n.º 48
0
import getHairpins
import cgGenes
import cgConfig as c
import bioLibCG as cg

# Script: look up, for every cluster ID returned by getHairpins, the type of
# the first overlapping gene and append it to the matching results lines.

# The gene-set folder comes from the main configuration.
mConf = c.getConfig('Main.conf')
geneSetFolder = mConf.conf['geneSetsHuman']

# Results file to annotate (hard-coded absolute path).
fN = '/home/chrisgre/projects/NoncodingHuman/results/NChuman-s3k8b17.results.sorted.introns.sorted'
# Indexed below by cluster ID to obtain a tcc.
cHairs = getHairpins.getHairpins(fN)

ensGenes = cgGenes.createGeneSetFromFile(geneSetFolder +
                                         '/ensemblAllTranscripts.tsv')

cDesc = {}  #CID:gDesc
for CID in cHairs:
    tcc = cHairs[CID]

    # Default when no gene overlaps this cluster's tcc.
    cDesc[CID] = "NONE"

    overlappingGenes = ensGenes.geneOverlaps([tcc])
    if len(overlappingGenes) > 0:
        # Only the first overlapping gene is used.
        print overlappingGenes[0].type
        cDesc[CID] = overlappingGenes[0].type

# Build the annotated lines; the cluster ID is read from tab-separated
# column 7 and must be a key of cDesc, otherwise a KeyError is raised.
# NOTE(review): newLines is not written back anywhere in this snippet -
# confirm whether the write step was lost.
f = open(fN, 'r')
newLines = []
for line in f:
    CID = line.strip().split('\t')[7]
    newLines.append(cg.appendToLine(line, cDesc[CID], 16))
f.close()
Ejemplo n.º 49
0
def defineClusters(cName = None):
	#Start Timer
	timer = cg.cgTimer()
	timer.start()
	
	#Get list of mature tccs
	conf = cgConfig.getConfig(cName) #passed or default
	finalMirFileName = conf.conf['resultsRaw']
	matureTccs = compare.tccFileToList(finalMirFileName, 1) # list of all mature micro in tcc
	print 'List getting', timer.split()
	
	
	#make connections dict
	matureConnections = compare.makeConnectionsDict(matureTccs)
	print 'Make connections:', timer.split()
	
	#Now have to define Clusters...
	clusters = []
	addedList = []
	
	#I don't think python passes by reference? also I think this function is in the middle because it uses a global variable :P
	def createClusters(item = None, mode = None):
			
		if item in addedList:
			return 0
		elif mode == "top":
			clusters.append([item])
			addedList.append(item) ##creates new cluster with the item already stored in it
			for connectedItem in matureConnections[item]:
				createClusters(connectedItem, "neighbor")
		elif mode == "neighbor":
			clusters[-1].append(item) #add this item to the last cluster created
			addedList.append(item)
			for connectedItem in matureConnections[item]:
				createClusters(connectedItem, "neighbor")
		
	for tcc in matureTccs:
		createClusters(tcc, "top")
	
	print 'Make Clusters', timer.split()
	
	
	#Sort Clusters.
	sortedClusters = []
	
	for cluster in clusters:
		sortedClusters.append(cg.sortTccList(cluster))
	
	print 'Sort Clusters:', timer.split()
	
	
	#Output sorted cluster file
	clusterFileName = conf.conf['sortedClusters']
	clusterFile = open(clusterFileName, 'w')
	for cluster in sortedClusters:
		for hit in cluster:
			clusterFile.write('%s,' % hit)
		clusterFile.write('\n')
	clusterFile.close()
	
	'''
	#re-create sortedClusters list:
	clusterFileName = 'sortedClusters.data'
	clusterFile = open(clusterFileName, 'r')
	sortedClusters = []
	
	
	for line in clusterFile:
		sortedClusters.append([])
		line = line.strip()[0:-1] #take off last comma ;P
		for hit in (line.strip().split(',')):
			sortedClusters[-1].append(hit)
	'''
	
	
	print 'Store intermediate data:', timer.split()
	
	
	#output hitsAround file
	outputFile = open(conf.conf['hitsPerFrame'], 'w')
	
	frameLength = 200
	frameShift = 1
	for cluster in sortedClusters:
		#grab first and last coordinate from cluster, for each cluster deduce how many theoretical microRNAs were in hitScope
		clusterChrom = cluster[0].split(":")[0]
		clusterStrand = cluster[0].split(":")[1]
		firstCoord = int(cluster[0].split(":")[2])
		#print cluster[-1]
		lastCoord = int(cluster[-1].split(":")[3])
		
		
		startCoord = firstCoord
		while startCoord < lastCoord:
			#count how many hits there are in this range
			rangeStart = startCoord - (frameLength/2)
			rangeEnd = startCoord + (frameLength/2)
			rangeTcc = '%s:%s:%s:%s' % (clusterChrom, clusterStrand, rangeStart, rangeEnd)
			overlappedList = compare.compareTwoTcc([rangeTcc], cluster, 2)
			hitCount = len(overlappedList) 
			
			#output 
			outputFile.write('%s\t%s\n' % (rangeTcc, hitCount))
			startCoord = startCoord + frameShift #check overlap with range
	outputFile.close()
	
	print 'Output Hits per Frame:', timer.split()
	print 'Overall Time:', timer.report()