def scanVectorsFile(fN, tccList):
    '''Scan an indexed wig file and return a {coordinate: value} dictionary.

    fN      -- path of the wig file handed to cgIndex.lineIndex
    tccList -- iterable of tcc strings (split with cg.tccSplit)

    For every tcc the file is read from the position chosen by
    fIndex.binarySearch.  Each line assigns the integer part of its fourth
    field to every coordinate in [begin, end), clipped to the tcc.  When
    tccs overlap, the value written last wins.
    '''
    coordDict = {}  # coordinate: value
    for tcc in tccList:
        chrom, strand, tccStart, tccEnd = cg.tccSplit(tcc)

        # A fresh index is opened for every tcc; it is now always closed
        # again (the original left every one of them open).
        fIndex = cgIndex.lineIndex(fN, header = True)
        try:
            fIndex.passCheckFunction(cgIndex.wigCheckFunction)
            fIndex.binarySearch(tcc)  # positions fIndex.file for this tcc

            for line in fIndex.file:
                fields = cg.ss(line)  # split once instead of once per field
                lBeg = int(fields[1])
                lEnd = int(fields[2])
                lValue = int(fields[3].split('.')[0])

                # Clip the line's interval to the requested region.
                stop = False
                if tccStart > lBeg:
                    lBeg = tccStart
                if tccEnd < lEnd:
                    lEnd = tccEnd
                    stop = True  # this line reaches past the tcc: last one needed

                for i in range(lBeg, lEnd):
                    coordDict[i] = lValue

                if stop:
                    break
        finally:
            fIndex.close()
    return coordDict
Esempio n. 2
0
def scanVectorsFile(fN, tccList):
    '''Scan an indexed wig file and return a {coordinate: value} dictionary.

    fN      -- path of the wig file handed to cgIndex.lineIndex
    tccList -- iterable of tcc strings (split with cg.tccSplit)

    For every tcc the file is read from the position chosen by
    fIndex.binarySearch.  Each line assigns the integer part of its fourth
    field to every coordinate in [begin, end), clipped to the tcc.  When
    tccs overlap, the value written last wins.
    '''
    coordDict = {}  # coordinate: value
    for tcc in tccList:
        chrom, strand, tccStart, tccEnd = cg.tccSplit(tcc)

        # A fresh index is opened for every tcc; it is now always closed
        # again (the original left every one of them open).
        fIndex = cgIndex.lineIndex(fN, header = True)
        try:
            fIndex.passCheckFunction(cgIndex.wigCheckFunction)
            fIndex.binarySearch(tcc)  # positions fIndex.file for this tcc

            for line in fIndex.file:
                fields = cg.ss(line)  # split once instead of once per field
                lBeg = int(fields[1])
                lEnd = int(fields[2])
                lValue = int(fields[3].split('.')[0])

                # Clip the line's interval to the requested region.
                stop = False
                if tccStart > lBeg:
                    lBeg = tccStart
                if tccEnd < lEnd:
                    lEnd = tccEnd
                    stop = True  # this line reaches past the tcc: last one needed

                for i in range(lBeg, lEnd):
                    coordDict[i] = lValue

                if stop:
                    break
        finally:
            fIndex.close()
    return coordDict
Esempio n. 3
0
def svCoord(tccList, config = None):
    '''Return {coordinate: value} for all coordinates covered by tccList.

    The wig file is chosen per tcc from the configuration: a per-chromosome
    file '<wigSetName>.<chrom>.<strand>.wig' when 'wigChromSplit' is the
    string 'True', otherwise 'Merge.<organism>.<strand>.wig'.  Line begins
    are shifted by +1 and the end is inclusive, so every line fills the
    coordinates begin+1 .. end, clipped to the tcc.
    '''
    # configuration
    config = c.getConfig(config)
    settings = config.conf
    org = settings['organism']
    wigDir = settings['wigSetDir']
    wigSetName = settings['wigSetName']
    splitIntoChroms = (settings['wigChromSplit'] == 'True')

    coordDict = {}  # coordinate: value
    for tcc in tccList:
        chrom, strand, tccStart, tccEnd = cg.tccSplit(tcc)

        if splitIntoChroms:
            baseName = '/%s.%s.%s.wig' % (wigSetName, chrom, strand)
        else:
            baseName = '/Merge.%s.%s.wig' % (org.lower(), strand)
        fN = wigDir + baseName

        wigIndex = cgIndex.lineIndex(fN, header = True)
        wigIndex.passCheckFunction(cgIndex.wigCheckFunction)
        wigIndex.binarySearch(tcc)  # positions wigIndex.file for this tcc

        for line in wigIndex.file:
            fields = cg.ss(line)
            first = int(fields[1]) + 1
            last = int(fields[2])
            lValue = int(fields[3].split('.')[0])

            # clip to the requested region
            if first < tccStart:
                first = tccStart
            reachedEnd = last > tccEnd
            if reachedEnd:
                last = tccEnd

            for coord in range(first, last + 1):
                coordDict[coord] = lValue

            if reachedEnd:
                break
        wigIndex.close()  # close the file and the index after use

    return coordDict
Esempio n. 4
0
def getHairpins(fN):
    '''Map each cluster ID found in fN to one merged hairpin string.

    Every line of fN is split with ss(); field 7 is used as the cluster ID
    (CID) and field 2 as the hairpin, a ':'-separated string whose fields 2
    and 3 are read as integer start and end.  When a CID occurs more than
    once the stored range is stretched to the smallest start and largest
    end seen.

    Returns a dictionary {CID: hairpin string}.
    '''
    cHairs = {}  # CID: hairpin range
    # 'with' closes the file even when a line fails to parse.
    with open(fN, 'r') as predFile:
        for line in predFile:
            fields = ss(line)
            CID = fields[7]
            hairpin = fields[2]

            if CID not in cHairs:
                cHairs[CID] = hairpin
                continue

            # stretch the stored range if this hairpin reaches further
            known = ss(cHairs[CID], ':')
            current = ss(hairpin, ':')
            hStart = min(int(known[2]), int(current[2]))
            hEnd = max(int(known[3]), int(current[3]))

            # NOTE(review): the prefix is split with delimiter 1 (not ':'),
            # exactly as before -- confirm this is what ss() expects.
            prefix = ss(hairpin, 1)
            cHairs[CID] = '%s:%s:%s:%s' % (prefix[0], prefix[1], hStart, hEnd)

    return cHairs
Esempio n. 5
0
def getHairpins():
    '''Map each cluster ID in the configured results file to one hairpin.

    The file named by conf.conf['resultsSorted'] is read line by line.
    Every line is split with ss(); field 7 is used as the cluster ID (CID)
    and field 2 as the hairpin, a ':'-separated string whose fields 2 and 3
    are read as integer start and end.  When a CID occurs more than once
    the stored range is stretched to the smallest start and largest end
    seen.

    Returns a dictionary {CID: hairpin string}.
    '''
    cHairs = {}  # CID: hairpin range
    # 'with' closes the file even when a line fails to parse.
    with open(conf.conf['resultsSorted'], 'r') as predFile:
        for line in predFile:
            fields = ss(line)
            CID = fields[7]
            hairpin = fields[2]

            if CID not in cHairs:
                cHairs[CID] = hairpin
                continue

            # stretch the stored range if this hairpin reaches further
            known = ss(cHairs[CID], ':')
            current = ss(hairpin, ':')
            hStart = min(int(known[2]), int(current[2]))
            hEnd = max(int(known[3]), int(current[3]))

            # NOTE(review): the prefix is split with delimiter 1 (not ':'),
            # exactly as before -- confirm this is what ss() expects.
            prefix = ss(hairpin, 1)
            cHairs[CID] = '%s:%s:%s:%s' % (prefix[0], prefix[1], hStart, hEnd)

    return cHairs
Esempio n. 6
0
def getHairpins(fN):
    '''Map each cluster ID found in fN to one merged hairpin string.

    Every line of fN is split with ss(); field 7 is used as the cluster ID
    (CID) and field 2 as the hairpin, a ':'-separated string whose fields 2
    and 3 are read as integer start and end.  When a CID occurs more than
    once the stored range is stretched to the smallest start and largest
    end seen.

    Returns a dictionary {CID: hairpin string}.
    '''
    cHairs = {}  # CID: hairpin range
    # 'with' closes the file even when a line fails to parse.
    with open(fN, "r") as predFile:
        for line in predFile:
            fields = ss(line)
            CID = fields[7]
            hairpin = fields[2]

            if CID not in cHairs:
                cHairs[CID] = hairpin
                continue

            # stretch the stored range if this hairpin reaches further
            known = ss(cHairs[CID], ":")
            current = ss(hairpin, ":")
            hStart = min(int(known[2]), int(current[2]))
            hEnd = max(int(known[3]), int(current[3]))

            # NOTE(review): the prefix is split with delimiter 1 (not ':'),
            # exactly as before -- confirm this is what ss() expects.
            prefix = ss(hairpin, 1)
            cHairs[CID] = "%s:%s:%s:%s" % (prefix[0], prefix[1], hStart, hEnd)

    return cHairs
def scanVectorsOrganism(tccList, config=None):
    '''Scan the organism's per-chromosome wig files for the given tccs.

    tccList -- iterable of tcc strings (split with cg.tccSplit)
    config  -- passed to c.getConfig; its 'organism' entry selects the
               'wig<organism>' directory from Main.conf

    Returns {coordinate: value}.  Each line assigns the integer part of its
    fourth field to every coordinate in [begin, end), clipped to the tcc.
    '''
    config = c.getConfig(config)

    # These lookups do not depend on the tcc, so they are done once here
    # instead of once per tcc.
    org = config.conf['organism']
    mConf = c.getConfig('Main.conf')
    wigDir = mConf.conf['wig%s' % org]

    coordDict = {}  # coordinate: value
    for tcc in tccList:
        chrom, strand, tccStart, tccEnd = cg.tccSplit(tcc)
        fN = wigDir + '/Merge.%s.%s.wig.%s.wig' % (org.lower(), strand, chrom)

        # The index is now always closed again (the original never closed it).
        fIndex = cgIndex.lineIndex(fN, header=True)
        try:
            fIndex.passCheckFunction(cgIndex.wigCheckFunction)
            fIndex.binarySearch(tcc)  # positions fIndex.file for this tcc

            for line in fIndex.file:
                fields = cg.ss(line)  # split once instead of once per field
                lBeg = int(fields[1])
                lEnd = int(fields[2])
                lValue = int(fields[3].split('.')[0])

                # Clip the line's interval to the requested region.
                stop = False
                if tccStart > lBeg:
                    lBeg = tccStart
                if tccEnd < lEnd:
                    lEnd = tccEnd
                    stop = True  # this line reaches past the tcc: last one needed

                for i in range(lBeg, lEnd):
                    coordDict[i] = lValue

                if stop:
                    break
        finally:
            fIndex.close()
    return coordDict
def scanVectorsOrganism(tccList, config = None):
    '''Scan the organism's per-chromosome wig files for the given tccs.

    tccList -- iterable of tcc strings (split with cg.tccSplit)
    config  -- passed to c.getConfig; its 'organism' entry selects the
               'wig<organism>' directory from Main.conf

    Returns {coordinate: value}.  Each line assigns the integer part of its
    fourth field to every coordinate in [begin, end), clipped to the tcc.
    '''
    config = c.getConfig(config)

    # These lookups do not depend on the tcc, so they are done once here
    # instead of once per tcc.
    org = config.conf['organism']
    mConf = c.getConfig('Main.conf')
    wigDir = mConf.conf['wig%s' % org]

    coordDict = {}  # coordinate: value
    for tcc in tccList:
        chrom, strand, tccStart, tccEnd = cg.tccSplit(tcc)
        fN = wigDir + '/Merge.%s.%s.wig.%s.wig' % (org.lower(), strand, chrom)

        # The index is now always closed again (the original never closed it).
        fIndex = cgIndex.lineIndex(fN, header = True)
        try:
            fIndex.passCheckFunction(cgIndex.wigCheckFunction)
            fIndex.binarySearch(tcc)  # positions fIndex.file for this tcc

            for line in fIndex.file:
                fields = cg.ss(line)  # split once instead of once per field
                lBeg = int(fields[1])
                lEnd = int(fields[2])
                lValue = int(fields[3].split('.')[0])

                # Clip the line's interval to the requested region.
                stop = False
                if tccStart > lBeg:
                    lBeg = tccStart
                if tccEnd < lEnd:
                    lEnd = tccEnd
                    stop = True  # this line reaches past the tcc: last one needed

                for i in range(lBeg, lEnd):
                    coordDict[i] = lValue

                if stop:
                    break
        finally:
            fIndex.close()
    return coordDict
Esempio n. 9
0
def getAll(chrom, strand, point):
    '''Print every small-library wig file with a positive value at point.

    All '.wig' files below mConf.conf['smallPath'] are considered, except
    paths containing 'WIG' and the two merged mouse/human files.  For each
    remaining file whose second-to-last '.'-separated name part equals
    strand, getWigValue(chrom, point, fN) is called and "<file> <value>" is
    printed when the value is greater than zero.
    '''
    excluded = ('/home/chrisgre/smallLibs/WIGS/Merge.mouse.1.wig',
                '/home/chrisgre/smallLibs/WIGS/Merge.human.1.wig')

    # Build a filtered list instead of removing entries while iterating
    # (which skipped files), and test membership properly: the original
    # "file == A or B" was always true.
    fNs = [fN for fN in cg.recurseDir(mConf.conf['smallPath'], end='.wig')
           if 'WIG' not in fN and fN not in excluded]

    for fN in fNs:
        fStrand = cg.ss(fN, '.')[-2]
        if str(fStrand) == str(strand):
            val = getWigValue(chrom, point, fN)
            if val > 0:
                # same text as "print fN, val"; valid in Python 2 and 3
                print('%s %s' % (fN, val))
Esempio n. 10
0
def getWigValue(chrom, point, fN):
    '''Return the value of the first line in fN that covers chrom:point.

    The first line of the file is skipped (presumably a header).  A line
    matches when begin <= point < end and its first field equals chrom; the
    integer part of its fourth field is returned as a float.  Returns 0
    when no line matches.
    '''
    # no strand specification in sequencing
    point = int(point)

    wigValue = 0
    f = open(fN, 'r')
    f.readline()
    for line in f:
        fields = cg.ss(line)
        beg = int(fields[1])
        end = int(fields[2])
        fChrom = fields[0]

        covered = beg <= point < end
        if covered and chrom == fChrom:
            wigValue += float(fields[3].split('.')[0])
            break
    f.close()

    return wigValue
Esempio n. 11
0
def getWigValue(chrom, point, fN):
    '''Return the value of the first line in fN that covers chrom:point.

    The first line of the file is skipped (presumably a header).  A line
    matches when begin <= point < end and its first field equals chrom; the
    integer part of its fourth field is returned as a float.  Returns 0
    when no line matches.
    '''
    # no strand specification in sequencing
    point = int(point)

    wigValue = 0
    f = open(fN, 'r')
    f.readline()
    for line in f:
        fields = cg.ss(line)
        beg = int(fields[1])
        end = int(fields[2])
        fChrom = fields[0]

        covered = beg <= point < end
        if covered and chrom == fChrom:
            wigValue += float(fields[3].split('.')[0])
            break
    f.close()

    return wigValue
Esempio n. 12
0
def getAll(chrom, strand, point):
    '''Print every small-library wig file with a positive value at point.

    All '.wig' files below mConf.conf['smallPath'] are considered, except
    paths containing 'WIG' and the two merged mouse/human files.  For each
    remaining file whose second-to-last '.'-separated name part equals
    strand, getWigValue(chrom, point, fN) is called and "<file> <value>" is
    printed when the value is greater than zero.
    '''
    excluded = ('/home/chrisgre/smallLibs/WIGS/Merge.mouse.1.wig',
                '/home/chrisgre/smallLibs/WIGS/Merge.human.1.wig')

    # Build a filtered list instead of removing entries while iterating
    # (which skipped files), and test membership properly: the original
    # "file == A or B" was always true.
    fNs = [fN for fN in cg.recurseDir(mConf.conf['smallPath'], end = '.wig')
           if 'WIG' not in fN and fN not in excluded]

    for fN in fNs:
        fStrand = cg.ss(fN, '.')[-2]
        if str(fStrand) == str(strand):
            val = getWigValue(chrom, point, fN)
            if val > 0:
                # same text as "print fN, val"; valid in Python 2 and 3
                print('%s %s' % (fN, val))
Esempio n. 13
0
def getWigValue(chrom, strand, point):
    '''Return the mouse wig value at point, using the byte index.

    chrom  -- chromosome name used in the file names
    strand -- int(strand) == 1 selects the 'Merge.mouse.1' files, anything
              else the 'Merge.mouse.-1' files
    point  -- coordinate (converted with int())

    The '<wig file>.index' file is scanned for the first entry with
    begin <= point < end; its first field is the byte offset to seek to in
    the wig file.  From there the first line with begin <= point < end
    supplies the value (integer part of its fourth field, as a float).
    Returns 0 when no index entry or no wig line covers the point.
    '''
    # no strand specification in sequencing
    point = int(point)
    if int(strand) == 1:
        strandTag = '1'
    else:
        strandTag = '-1'
    fN = wigDir + '/Merge.mouse.%s.wig.%s.wig' % (strandTag, chrom)
    fNindex = fN + '.index'

    # get line in index file
    startByte = None
    with open(fNindex, 'r') as iFile:
        for line in iFile:
            fields = cg.ss(line)
            beg = int(fields[1])
            end = int(fields[2])
            if beg <= point < end:
                startByte = int(fields[0])
                break

    if startByte is None:
        # No index entry covers the point.  The original passed the string
        # 'None' to seek() here, which raises TypeError; report "no value".
        return 0

    # grab value
    wigValue = 0
    with open(fN, 'r') as f:
        f.seek(startByte, 0)
        for line in f:
            fields = cg.ss(line)
            beg = int(fields[1])
            end = int(fields[2])
            if beg <= point < end:
                wigValue += float(fields[3].split('.')[0])
                break

    return wigValue
Esempio n. 14
0
def getWigValue(chrom, strand, point):
    '''Return the mouse wig value at point, using the byte index.

    chrom  -- chromosome name used in the file names
    strand -- int(strand) == 1 selects the 'Merge.mouse.1' files, anything
              else the 'Merge.mouse.-1' files
    point  -- coordinate (converted with int())

    The '<wig file>.index' file is scanned for the first entry with
    begin <= point < end; its first field is the byte offset to seek to in
    the wig file.  From there the first line with begin <= point < end
    supplies the value (integer part of its fourth field, as a float).
    Returns 0 when no index entry or no wig line covers the point.
    '''
    # no strand specification in sequencing
    point = int(point)
    if int(strand) == 1:
        strandTag = '1'
    else:
        strandTag = '-1'
    fN = wigDir + '/Merge.mouse.%s.wig.%s.wig' % (strandTag, chrom)
    fNindex = fN + '.index'

    # get line in index file
    startByte = None
    with open(fNindex, 'r') as iFile:
        for line in iFile:
            fields = cg.ss(line)
            beg = int(fields[1])
            end = int(fields[2])
            if beg <= point < end:
                startByte = int(fields[0])
                break

    if startByte is None:
        # No index entry covers the point.  The original passed the string
        # 'None' to seek() here, which raises TypeError; report "no value".
        return 0

    # grab value
    wigValue = 0
    with open(fN, 'r') as f:
        f.seek(startByte, 0)
        for line in f:
            fields = cg.ss(line)
            beg = int(fields[1])
            end = int(fields[2])
            if beg <= point < end:
                wigValue += float(fields[3].split('.')[0])
                break

    return wigValue
Esempio n. 15
0
def getWigValueLINE(chrom, strand, point):
    '''Old line-index lookup -> use the "Byte" one instead.

    Finds the first index entry with begin <= point <= end, skips that many
    lines of the wig file, then returns the value of the first line with
    begin <= point < end (integer part of the fourth field, as a float).
    Returns 0 when no wig line matches.
    '''
    # no strand specification in sequencing
    point = int(point)
    strandLabel = '1' if int(strand) == 1 else '-1'
    fN = wigDir + '/Merge.mouse.' + strandLabel + '.wig.%s.wig' % chrom
    fNindex = fN + '.index'

    # look up the starting line number in the index file
    startLine = 0
    iFile = open(fNindex, 'r')
    for entry in iFile:
        cols = cg.ss(entry)
        beg = int(cols[1])
        end = int(cols[2])
        if beg <= point <= end:
            startLine = int(cols[0])
            break
    iFile.close()

    # grab value
    f = open(fN, 'r')
    for _ in range(startLine):
        f.readline()  # skip header and lines till indexed line...

    wigValue = 0
    for row in f:
        cols = cg.ss(row)
        beg = int(cols[1])
        end = int(cols[2])
        if beg <= point < end:
            wigValue += float(cols[3].split('.')[0])
            break
    f.close()

    return wigValue
Esempio n. 16
0
def getWigValueLINE(chrom, strand, point):
    '''Old line-index lookup -> use the "Byte" one instead.

    Finds the first index entry with begin <= point <= end, skips that many
    lines of the wig file, then returns the value of the first line with
    begin <= point < end (integer part of the fourth field, as a float).
    Returns 0 when no wig line matches.
    '''
    # no strand specification in sequencing
    point = int(point)
    strandLabel = '1' if int(strand) == 1 else '-1'
    fN = wigDir + '/Merge.mouse.' + strandLabel + '.wig.%s.wig' % chrom
    fNindex = fN + '.index'

    # look up the starting line number in the index file
    startLine = 0
    iFile = open(fNindex, 'r')
    for entry in iFile:
        cols = cg.ss(entry)
        beg = int(cols[1])
        end = int(cols[2])
        if beg <= point <= end:
            startLine = int(cols[0])
            break
    iFile.close()

    # grab value
    f = open(fN, 'r')
    for _ in range(startLine):
        f.readline()  # skip header and lines till indexed line...

    wigValue = 0
    for row in f:
        cols = cg.ss(row)
        beg = int(cols[1])
        end = int(cols[2])
        if beg <= point < end:
            wigValue += float(cols[3].split('.')[0])
            break
    f.close()

    return wigValue
Esempio n. 17
0
def scanVectorsHist(tccList, cName):
    '''Scan byte-indexed wig files and return per-tcc lists of values.

    tccList -- iterable of 'chrom:strand:start:end' strings
    cName   -- passed to c.getConfig; its 'organism' entry selects the
               'wig<organism>' directory from Main.conf

    Returns {tcc: [value, value, ...]} with one entry per coordinate of the
    tcc that is covered by a wig line (each line contributes the integer
    part of its fourth field once per coordinate in [begin, end), clipped
    to the tcc).  A tcc with no covered coordinate gets no key.
    '''
    conf = c.getConfig(cName)
    org = conf.conf['organism']
    mConf = c.getConfig('Main.conf')
    wigDir = mConf.conf['wig%s' % org]

    histDict = {}  # tcc: [list values]
    for tcc in tccList:
        theSplit = ss(tcc, ':')
        chrom, strand = theSplit[0], theSplit[1]
        tccStart, tccEnd = int(theSplit[2]), int(theSplit[3])

        # goto correct file, correct line in index
        fN = wigDir + '/Merge.%s.%s.wig.%s.wig' % (org.lower(), strand, chrom)
        fNindex = fN + '.index'

        # The index entry covering tccStart gives the byte offset to seek to.
        startByte = None
        with open(fNindex, 'r') as iFile:
            for line in iFile:
                fields = cg.ss(line)
                beg = int(fields[1])
                end = int(fields[2])
                if beg <= tccStart < end:
                    startByte = int(fields[0])
                    break

        if startByte is None:
            # No index entry covers the start of this tcc.  The original
            # passed the string 'None' to seek() (TypeError); skip the tcc.
            continue

        with open(fN, 'r') as f:
            f.seek(startByte, 0)
            for line in f:
                fields = cg.ss(line)
                lBeg = int(fields[1])
                lEnd = int(fields[2])
                lValue = int(fields[3].split('.')[0])

                # Clip the line's interval to the requested region.
                stop = False
                if tccStart > lBeg:
                    lBeg = tccStart
                if tccEnd < lEnd:
                    lEnd = tccEnd
                    stop = True  # this line reaches past the tcc: last one needed

                # One copy of the value per covered coordinate; the key is
                # only created when there is something to add.
                count = lEnd - lBeg
                if count > 0:
                    histDict.setdefault(tcc, []).extend([lValue] * count)

                if stop:
                    break
    return histDict
Esempio n. 18
0
def intronNoisy(cName=None):
    '''Write expression levels of the regions flanking each intron hairpin.

    For every cluster ID from the 'resultsIntrons' file, the 1000 positions
    before and after the hairpin are taken, other predictions and exons are
    subtracted, and the remaining levels are collected with
    stepVectorScan.scanVectorsHist.  The result is written to the
    'intronNoiseData' file as a tab-separated table: one column per sorted
    cluster ID, shorter columns padded with 'NA'.
    '''
    mConf = c.cgConfig('Main.conf')  # not used below; call kept as before
    conf = c.getConfig(cName)

    # init
    cHairs = getHairpins.getHairpins(conf.conf['resultsIntrons'])  # CID: HAIRPIN
    organism = conf.conf['organism']
    exonList = compare.tccFileToList('%sExons.tcc' % organism, 0)
    slide = 1000

    # every predicted hairpin, later subtracted from the flanks
    predList = [cHairs[CID] for CID in cHairs]

    # collapse overlaps
    print(' collapsing predictions')
    predList = compare.collapseOverlaps(predList)
    print(' collapsing exons')
    exonList = compare.collapseOverlaps(exonList)

    # collect levels for each hairpin region
    cidLevels = {}
    for CID in cHairs:
        print(CID)
        parts = ss(cHairs[CID], ':')
        chrom = parts[0]
        strand = parts[1]
        start = int(parts[2])
        end = int(parts[3])

        # the two flanks, before any subtraction
        scanRange = [
            '%s:%s:%s:%s' % (chrom, strand, start - slide, start),
            '%s:%s:%s:%s' % (chrom, strand, end, end + slide),
        ]

        print(scanRange)
        scanRange = compare.subtractTwoTccLists(scanRange, predList)
        scanRange = compare.subtractTwoTccLists(scanRange, exonList)

        print('  Retrieving Expression levels: %s' %
              (cg.getTccListTotalLength(scanRange),))

        levels = []
        hPinLevels = stepVectorScan.scanVectorsHist(scanRange, cName)
        for region in hPinLevels:
            levels.extend(hPinLevels[region])

        cidLevels[CID] = levels

    # output levels to file

    # the longest column decides how many rows are written
    longest = max([0] + [len(cidLevels[CID]) for CID in cidLevels])

    sortedKeys = sorted(cidLevels.keys())

    newLines = []
    for j in range(longest):
        cells = []
        for CID in sortedKeys:
            column = cidLevels[CID]
            cells.append(str(column[j]) if len(column) > j else 'NA')
        newLines.append('\t'.join(cells) + '\n')

    outFile = open(conf.conf['intronNoiseData'], 'w')
    outFile.write('\t'.join(sortedKeys) + '\n')
    outFile.writelines(newLines)
    outFile.close()
Esempio n. 19
0
import bioLibCG as cg
import cgConfig as c

# Script: for every mouse wig file, load the value at each line's midpoint and
# start scanning the sorted midpoints for peaks.
# NOTE(review): the snippet ends inside the scan loop -- nothing is appended to
# `peaks` in the visible code.
mConf = c.cgConfig('Main.conf')

# every '.wig' file below the configured 'wigMouse' directory
fileNames = cg.recurseDir(mConf.conf['wigMouse'], end='.wig')

for fN in fileNames:
    file = open(fN, 'r')
    file.readline()  #header

    #get all points in midpoint form
    pointsDict = {}  # midpoint: value

    for line in file:
        start = int(cg.ss(line)[1])
        end = int(cg.ss(line)[2])
        # NOTE(review): '/' floors for ints only under Python 2; under Python 3
        # the midpoint would become a float.
        point = start + (end - start) / 2  #midpoint
        # value = text of the fourth field before its first '.', as an int
        pointsDict[point] = int(cg.ss(line)[3].split('.')[0])
    file.close()

    #determine peaks based off of neighbors of each point
    # sorted midpoints (Python 2: keys() returns a list that is sorted in place)
    lowest = pointsDict.keys()
    lowest.sort()
    peaks = []
    span = 2  #must be > 0
    # the first and last span + 1 midpoints are left out of the scan
    for i in range(span + 1, len(lowest) - span - 1):

        val = pointsDict[lowest[i]]
        # points with a value below 5 are skipped
        if val < 5:  #minimum
            continue
Esempio n. 20
0
    if strand not in peaks[chrom]:
        peaks[chrom][strand] = {}

    #get peaks and values and put in dictionary
    pFile = open(pN, 'r')
    for line in pFile:
        peaks[chrom][strand][int(line.strip().split('\t')[0])] = int(
            line.strip().split('\t')[1].split('.')[0])
print timer.split()

print 'finding best combos'
bestCombos = []
for tcc in tccList:
    print tcc
    tccPeaks = []
    chrom = cg.ss(tcc, ':')[0]
    strand = cg.ss(tcc, ':')[1]
    start = int(cg.ss(tcc, ':')[2])
    end = int(cg.ss(tcc, ':')[3])

    #get all peaks
    for i in range(start, end + 1):
        if i in peaks[chrom][strand]:
            print '  peak added', i
            tccPeaks.append(i)

    #get all combos
    pairStrings = []  #used to check if pair already added
    peakCombos = []
    for x in tccPeaks:
        for y in tccPeaks:
Esempio n. 21
0
import bioLibCG as cg
import cgConfig as c

mConf = c.cgConfig('Main.conf')

fileNames = cg.recurseDir(mConf.conf['wigMouse'],  end = '.wig')

for fN in fileNames:
	file = open(fN, 'r')
	file.readline() #header
	
	#get all points in midpoint form
	pointsDict = {}
	
	for line in file:
		start = int(cg.ss(line)[1])
		end = int(cg.ss(line)[2])
		point = start + (end-start)/2 #midpoint
		pointsDict[point] = int(cg.ss(line)[3].split('.')[0])
	file.close()
	
	
	#determine peaks based off of neighbors of each point
	lowest = pointsDict.keys()
	lowest.sort()
	peaks = []
	span = 2 #must be > 0
	for i in range(span + 1,len(lowest) - span - 1):
			
		val = pointsDict[lowest[i]]
		if val < 5: #minimum
Esempio n. 22
0
	
	if strand not in peaks[chrom]:
		peaks[chrom][strand] = {}
	
	#get peaks and values and put in dictionary
	pFile = open(pN, 'r')
	for line in pFile:
		peaks[chrom][strand][int(line.strip().split('\t')[0])] = int(line.strip().split('\t')[1].split('.')[0])
print timer.split()

print 'finding best combos'
bestCombos = []
for tcc in tccList:
	print tcc
	tccPeaks = []
	chrom = cg.ss(tcc, ':')[0]
	strand = cg.ss(tcc, ':')[1]
	start = int(cg.ss(tcc, ':')[2])
	end = int(cg.ss(tcc, ':')[3])
	
	#get all peaks
	for i in range(start, end + 1):
		if i in peaks[chrom][strand]:
			print '  peak added', i
			tccPeaks.append(i)
	
	#get all combos
	pairStrings = [] #used to check if pair already added
	peakCombos = []
	for x in tccPeaks:
		for y in tccPeaks:
def _longestRoof(profile, pRange, minVal):
    '''Return (length, start, end) of the longest run of offsets above minVal.

    Offsets 1 - pRange .. pRange - 1 of profile are visited in order.  A run
    is only recorded once a later offset is at or below minVal, so a run
    still open at the last offset is not counted.  Of equally long runs the
    first one is kept.  start and end are None when no run was recorded.
    '''
    highest = 0
    runLength = 0
    runStart = None
    startFinal = None
    endFinal = None
    for i in range(1 - pRange, pRange):
        if profile[i] > minVal:
            if runStart is None:
                runStart = i
            runLength += 1
            continue
        # the run (if any) ended at i - 1; keep it only if strictly longer
        if runLength > highest:
            highest = runLength
            startFinal = runStart
            endFinal = i - 1
        runStart = None
        runLength = 0
    return highest, startFinal, endFinal


def _pickTopCombo(peakCombos):
    '''Return the combo whose roof length is closest to 22, or None.

    Only combos with 14 < roof length < 26 and with the larger of their two
    drop values strictly between 0.0 and 0.2 qualify; on a tie the earlier
    combo is kept.
    '''
    topCombo = None
    for combo in peakCombos:
        roofLength = combo[6]
        dropValue = max(combo[7])
        if not (14 < roofLength < 26):
            continue
        if not (0.0 < dropValue < 0.2):
            continue
        if topCombo is None or \
                math.fabs(22 - roofLength) < math.fabs(22 - topCombo[6]):
            topCombo = combo
    return topCombo


def findPeaks(pType, cName = None):
    '''Pick the best peak for every hairpin and store it in the results file.

    pType -- 'E' selects conf 'resultsExonsSorted', anything else
             'resultsIntronsSorted'
    cName -- configuration name passed to c.getConfig, cgPeaks.stretch and
             stepVectorScan.profileAroundPoint

    For every peak inside a hairpin the ratio profile within 30 positions is
    scanned for its longest "roof" (run above 0.80) and for the profile
    values 4 positions outside that roof.  The best combo per cluster ID
    (see _pickTopCombo) is printed and written into slot 13 of every line
    of the results file, which is rewritten in place; lines without a best
    combo get 'None'.
    '''
    conf = c.getConfig(cName)

    if pType == 'E':
        predName = conf.conf['resultsExonsSorted']
    else:
        predName = conf.conf['resultsIntronsSorted']
    print(predName)

    # CID: [hairpin, best combo or 'None']
    cHairs = getHairpins.getHairpins(predName)
    peakDict = {}
    for CID in cHairs:
        peakDict[CID] = [cHairs[CID], 'None']

    timer = cg.cgTimer()
    timer.start()

    # put peaks in memory
    print('Creating peak data')
    peaks = {}  # chrom: strand: peak coordinate: 0
    for CID in cHairs:
        tcc = cHairs[CID]
        chrom, strand, start, end = cg.tccSplit(tcc)
        strandPeaks = peaks.setdefault(chrom, {}).setdefault(strand, {})

        # create peaks for tcc and add to peak dictionary
        peakStretch = cgPeaks.stretch(tcc, cName)
        peakStretch.createPeaks()
        for peakCoord in peakStretch.peaks:
            strandPeaks[peakCoord] = 0
    print(timer.split())

    print('finding best combos')
    pRange = 30   # offsets scanned on each side of a peak
    minVal = .80  # ratio a position must exceed to belong to the roof
    bestCombos = []
    for CID in peakDict:
        tcc = peakDict[CID][0]
        fields = cg.ss(tcc, ':')
        chrom = fields[0]
        strand = fields[1]
        start = int(fields[2])
        end = int(fields[3])

        # all peaks inside the hairpin
        strandPeaks = peaks[chrom][strand]
        tccPeaks = [i for i in range(start, end + 1) if i in strandPeaks]

        peakCombos = []
        for x in tccPeaks:
            rTcc = cg.makeTcc(chrom, strand, x, x + 1)

            # value at the peak itself (a profile of range 1, no ratio)
            peakValue = stepVectorScan.profileAroundPoint(
                rTcc, 1, cName, ratio = False)[0]

            # ratio profile used to measure the roof
            roofProfile = stepVectorScan.profileAroundPoint(
                rTcc, pRange, cName, ratio = True)
            highest, startFinal, endFinal = _longestRoof(
                roofProfile, pRange, minVal)

            # profile values 4 positions outside the roof, when inside the
            # scanned window.  Compared against None: the original truth
            # test also rejected a roof starting or ending at offset 0.
            val = [1.0, 1.0]
            if startFinal is not None and endFinal is not None:
                low = startFinal - 4
                high = endFinal + 4
                if low > (1 - pRange) and high < pRange:
                    val[0] = float(roofProfile[low])
                    val[1] = float(roofProfile[high])

            # slots 2-4 (y, dist, ratio) are fixed placeholders
            peakCombos.append([tcc, x, 'S', 'S', 'S', peakValue, highest, val])

        topCombo = _pickTopCombo(peakCombos)
        if topCombo:
            peakDict[CID][1] = topCombo
            bestCombos.append(topCombo)
            print(bestCombos[-1])

    print(timer.split())

    # now update predFile (SLOT 13)
    newLines = []
    with open(predName, 'r') as predFile:
        for line in predFile:
            CID = cg.ss(line)[7]
            combo = peakDict[CID][1]
            if combo == 'None':
                peakInfo = 'None'
            else:
                peakInfo = '%s:%s:%s:%s:%s:%s' % (
                    str(combo[1])[-3:], 'S', str(combo[4]).split('.')[0],
                    combo[5], combo[6], combo[7])
            newLines.append(cg.appendToLine(line, peakInfo, 13))

    with open(predName, 'w') as predFile:
        predFile.writelines(newLines)
Esempio n. 24
0
def scanVectorsSingleCoord(tccList, cName):
    '''Scan byte-indexed wig files and return a {coordinate: value} dictionary.

    tccList -- iterable of 'chrom:strand:start:end' strings
    cName   -- passed to c.getConfig; its 'organism' entry selects the
               'wig<organism>' directory from Main.conf

    Each wig line assigns the integer part of its fourth field to every
    coordinate in [begin, end), clipped to the tcc.  When tccs overlap, the
    value written last wins.
    '''
    conf = c.getConfig(cName)
    org = conf.conf['organism']
    mConf = c.getConfig('Main.conf')
    wigDir = mConf.conf['wig%s' % org]

    coordDict = {}  # coordinate: value
    for tcc in tccList:
        theSplit = ss(tcc, ':')
        chrom, strand = theSplit[0], theSplit[1]
        tccStart, tccEnd = int(theSplit[2]), int(theSplit[3])

        # goto correct file, correct line in index
        fN = wigDir + '/Merge.%s.%s.wig.%s.wig' % (org.lower(), strand, chrom)
        fNindex = fN + '.index'

        # The index entry covering tccStart gives the byte offset to seek to.
        startByte = None
        with open(fNindex, 'r') as iFile:
            for line in iFile:
                fields = cg.ss(line)
                beg = int(fields[1])
                end = int(fields[2])
                if beg <= tccStart < end:
                    startByte = int(fields[0])
                    break

        if startByte is None:
            # No index entry covers the start of this tcc.  The original
            # passed the string 'None' to seek() (TypeError); skip the tcc.
            continue

        with open(fN, 'r') as f:
            f.seek(startByte, 0)
            for line in f:
                fields = cg.ss(line)
                lBeg = int(fields[1])
                lEnd = int(fields[2])
                lValue = int(fields[3].split('.')[0])

                # Clip the line's interval to the requested region.
                stop = False
                if tccStart > lBeg:
                    lBeg = tccStart
                if tccEnd < lEnd:
                    lEnd = tccEnd
                    stop = True  # this line reaches past the tcc: last one needed

                for i in range(lBeg, lEnd):
                    coordDict[i] = lValue

                if stop:
                    break
    return coordDict
Esempio n. 25
0
def _longestRoof(profile, pRange, minVal):
    '''Return (length, start, end) of the longest run of offsets above minVal.

    Offsets 1 - pRange .. pRange - 1 of profile are visited in order.  A run
    is only recorded once a later offset is at or below minVal, so a run
    still open at the last offset is not counted.  Of equally long runs the
    first one is kept.  start and end are None when no run was recorded.
    '''
    highest = 0
    runLength = 0
    runStart = None
    startFinal = None
    endFinal = None
    for i in range(1 - pRange, pRange):
        if profile[i] > minVal:
            if runStart is None:
                runStart = i
            runLength += 1
            continue
        # the run (if any) ended at i - 1; keep it only if strictly longer
        if runLength > highest:
            highest = runLength
            startFinal = runStart
            endFinal = i - 1
        runStart = None
        runLength = 0
    return highest, startFinal, endFinal


def _pickTopCombo(peakCombos):
    '''Return the combo whose roof length is closest to 22, or None.

    Only combos with 14 < roof length < 26 and with the larger of their two
    drop values strictly between 0.0 and 0.2 qualify; on a tie the earlier
    combo is kept.
    '''
    topCombo = None
    for combo in peakCombos:
        roofLength = combo[6]
        dropValue = max(combo[7])
        if not (14 < roofLength < 26):
            continue
        if not (0.0 < dropValue < 0.2):
            continue
        if topCombo is None or \
                math.fabs(22 - roofLength) < math.fabs(22 - topCombo[6]):
            topCombo = combo
    return topCombo


def findPeaks(pType, cName=None):
    '''Pick the best peak for every hairpin and store it in the results file.

    pType -- 'E' selects conf 'resultsExonsSorted', anything else
             'resultsIntronsSorted'
    cName -- configuration name passed to c.getConfig, cgPeaks.stretch and
             stepVectorScan.profileAroundPoint

    For every peak inside a hairpin the ratio profile within 30 positions is
    scanned for its longest "roof" (run above 0.80) and for the profile
    values 4 positions outside that roof.  The best combo per cluster ID
    (see _pickTopCombo) is printed and written into slot 13 of every line
    of the results file, which is rewritten in place; lines without a best
    combo get 'None'.
    '''
    conf = c.getConfig(cName)

    if pType == 'E':
        predName = conf.conf['resultsExonsSorted']
    else:
        predName = conf.conf['resultsIntronsSorted']
    print(predName)

    # CID: [hairpin, best combo or 'None']
    cHairs = getHairpins.getHairpins(predName)
    peakDict = {}
    for CID in cHairs:
        peakDict[CID] = [cHairs[CID], 'None']

    timer = cg.cgTimer()
    timer.start()

    # put peaks in memory
    print('Creating peak data')
    peaks = {}  # chrom: strand: peak coordinate: 0
    for CID in cHairs:
        tcc = cHairs[CID]
        chrom, strand, start, end = cg.tccSplit(tcc)
        strandPeaks = peaks.setdefault(chrom, {}).setdefault(strand, {})

        # create peaks for tcc and add to peak dictionary
        peakStretch = cgPeaks.stretch(tcc, cName)
        peakStretch.createPeaks()
        for peakCoord in peakStretch.peaks:
            strandPeaks[peakCoord] = 0
    print(timer.split())

    print('finding best combos')
    pRange = 30   # offsets scanned on each side of a peak
    minVal = .80  # ratio a position must exceed to belong to the roof
    bestCombos = []
    for CID in peakDict:
        tcc = peakDict[CID][0]
        fields = cg.ss(tcc, ':')
        chrom = fields[0]
        strand = fields[1]
        start = int(fields[2])
        end = int(fields[3])

        # all peaks inside the hairpin
        strandPeaks = peaks[chrom][strand]
        tccPeaks = [i for i in range(start, end + 1) if i in strandPeaks]

        peakCombos = []
        for x in tccPeaks:
            rTcc = cg.makeTcc(chrom, strand, x, x + 1)

            # value at the peak itself (a profile of range 1, no ratio)
            peakValue = stepVectorScan.profileAroundPoint(
                rTcc, 1, cName, ratio=False)[0]

            # ratio profile used to measure the roof
            roofProfile = stepVectorScan.profileAroundPoint(
                rTcc, pRange, cName, ratio=True)
            highest, startFinal, endFinal = _longestRoof(
                roofProfile, pRange, minVal)

            # profile values 4 positions outside the roof, when inside the
            # scanned window.  Compared against None: the original truth
            # test also rejected a roof starting or ending at offset 0.
            val = [1.0, 1.0]
            if startFinal is not None and endFinal is not None:
                low = startFinal - 4
                high = endFinal + 4
                if low > (1 - pRange) and high < pRange:
                    val[0] = float(roofProfile[low])
                    val[1] = float(roofProfile[high])

            # slots 2-4 (y, dist, ratio) are fixed placeholders
            peakCombos.append([tcc, x, 'S', 'S', 'S', peakValue, highest, val])

        topCombo = _pickTopCombo(peakCombos)
        if topCombo:
            peakDict[CID][1] = topCombo
            bestCombos.append(topCombo)
            print(bestCombos[-1])

    print(timer.split())

    # now update predFile (SLOT 13)
    newLines = []
    with open(predName, 'r') as predFile:
        for line in predFile:
            CID = cg.ss(line)[7]
            combo = peakDict[CID][1]
            if combo == 'None':
                peakInfo = 'None'
            else:
                peakInfo = '%s:%s:%s:%s:%s:%s' % (
                    str(combo[1])[-3:], 'S', str(combo[4]).split('.')[0],
                    combo[5], combo[6], combo[7])
            newLines.append(cg.appendToLine(line, peakInfo, 13))

    with open(predName, 'w') as predFile:
        predFile.writelines(newLines)
Esempio n. 26
0
def intronNoisy(cName = None):
    '''Collect expression levels flanking each intronic hairpin and write them out.

    For every hairpin returned by getHairpins.getHairpins (keyed by CID, each a
    "chrom:strand:start:end" string) the 1000 positions before its start and
    after its end are taken as the scan window.  The collapsed prediction list
    and the collapsed exon list are then removed from that window via
    compare.subtractTwoTccLists, and the values returned by
    stepVectorScan.scanVectorsHist for what remains are pooled per CID.

    The result is written to the path stored under conf['intronNoiseData'] as a
    tab-delimited table: the header row holds the sorted CIDs, every column
    holds that CID's levels, and shorter columns are padded with 'NA'.

    cName -- configuration name handed to c.getConfig and to
             stepVectorScan.scanVectorsHist.
    '''
    conf = c.getConfig(cName)

    #init
    cHairs = getHairpins.getHairpins(conf.conf['resultsIntrons']) #CID: HAIRPIN
    organism = conf.conf['organism']
    exonList = compare.tccFileToList('%sExons.tcc' % organism, 0)
    slide = 1000  # flank size scanned on each side of a hairpin

    #every predicted hairpin, later subtracted from the scan windows
    predList = [cHairs[CID] for CID in cHairs]

    #collapse Overlaps
    print(' collapsing predictions')
    predList = compare.collapseOverlaps(predList)
    print(' collapsing exons')
    exonList = compare.collapseOverlaps(exonList)

    #collect levels for each hairpin region
    cidLevels = {}
    for CID in cHairs:
        print(CID)
        pieces = ss(cHairs[CID], ':')
        chrom, strand = pieces[0], pieces[1]
        start, end = int(pieces[2]), int(pieces[3])

        # NOTE(review): start - slide is not clamped, so it can go negative
        # for a hairpin within `slide` of the chromosome start -- confirm
        # the downstream helpers cope with that.
        scanRange = [
            '%s:%s:%s:%s' % (chrom, strand, start - slide, start),
            '%s:%s:%s:%s' % (chrom, strand, end, end + slide),
        ]

        print(scanRange)
        scanRange = compare.subtractTwoTccLists(scanRange, predList)
        scanRange = compare.subtractTwoTccLists(scanRange, exonList)

        # Single-argument form prints the same text as the two-item print
        # statement (items separated by one space).
        print('  Retrieving Expression levels: %s'
              % (cg.getTccListTotalLength(scanRange),))

        #pool the values of every surviving sub-range into one list
        levels = []
        regionLevels = stepVectorScan.scanVectorsHist(scanRange, cName)
        for region in regionLevels:
            levels.extend(regionLevels[region])

        cidLevels[CID] = levels

    #output levels to file

    #the longest column decides how many rows the table has
    longest = max([len(cidLevels[CID]) for CID in cidLevels] or [0])

    sortedKeys = sorted(cidLevels)

    newLines = []
    for j in range(longest):
        row = []
        for CID in sortedKeys:
            column = cidLevels[CID]
            if j < len(column):
                row.append(str(column[j]))
            else:
                row.append('NA')  # this CID ran out of values
        newLines.append('\t'.join(row) + '\n')

    outFileN = conf.conf['intronNoiseData']
    # `with` guarantees the handle is closed even if a write fails.
    with open(outFileN, 'w') as outFile:
        outFile.write('\t'.join(sortedKeys) + '\n')
        outFile.writelines(newLines)
Esempio n. 27
0
def scanVectorsSingleCoord(tccList, cName):
    '''Given tcc list --> scan wig files and return {coordinate: value}.

    Each tcc is a "chrom:strand:start:end" string.  For every tcc the matching
    per-chromosome wig file (and its ".index" companion) under the configured
    wig directory is used: the index supplies the byte offset of the wig line
    whose [begin, end) interval contains the tcc start, and the wig file is
    read from there until the tcc end is reached.

    tccList -- iterable of tcc strings.
    cName   -- configuration name handed to c.getConfig; its 'organism' entry
               selects the 'wig<organism>' directory from Main.conf.

    Returns a dict mapping every covered integer coordinate in
    [start, end) to the integer part of the wig value covering it.  All tccs
    share the one dict, so a coordinate seen twice keeps the last value.

    Raises ValueError when no index line covers the start of a tcc (the
    original code failed at this point with a TypeError from seek()).
    '''
    conf = c.getConfig(cName)
    org = conf.conf['organism']
    mConf = c.getConfig('Main.conf')
    wigDir = mConf.conf['wig%s' % org]

    coordDict = {}  # coordinate: value
    for tcc in tccList:
        theSplit = ss(tcc, ':')
        chrom, strand = theSplit[0], theSplit[1]
        tccStart, tccEnd = int(theSplit[2]), int(theSplit[3])

        #goto correct file, correct line in index
        fN = wigDir + '/Merge.%s.%s.wig.%s.wig' % (org.lower(), strand, chrom)
        fNindex = fN + '.index'

        #index fields: [0] byte offset, [1] begin, [2] end
        startByte = None
        with open(fNindex, 'r') as iFile:
            for line in iFile:
                fields = cg.ss(line)
                beg = int(fields[1])
                end = int(fields[2])
                if beg <= tccStart < end:
                    startByte = int(fields[0])
                    break

        if startByte is None:
            raise ValueError('no index entry in %s covers start %s of %s'
                             % (fNindex, tccStart, tcc))

        #grab values, starting at the indexed line
        with open(fN, 'r') as f:
            f.seek(startByte, 0)
            for line in f:
                fields = cg.ss(line)
                lBeg = int(fields[1])
                lEnd = int(fields[2])
                lValue = int(fields[3].split('.')[0])  # keep integer part only

                #clip the wig interval to the requested range
                if tccStart > lBeg:
                    lBeg = tccStart
                stop = tccEnd < lEnd  # this line reaches past the tcc end
                if stop:
                    lEnd = tccEnd

                for i in range(lBeg, lEnd):
                    coordDict[i] = lValue

                if stop:
                    break
    return coordDict
Esempio n. 28
0
def scanVectorsHist(tccList, cName):
    '''Given tcc list --> scan wig files and get histogram values.

    Each tcc is a "chrom:strand:start:end" string.  For every tcc the matching
    per-chromosome wig file under the configured wig directory is read,
    starting at the byte offset that the companion ".index" file gives for the
    line whose [begin, end) interval contains the tcc start, and stopping once
    the tcc end is reached.

    NOTE: this relies on the precomputed ".index" files, which the original
    author flagged as undesirable.

    tccList -- iterable of tcc strings.
    cName   -- configuration name handed to c.getConfig; its 'organism' entry
               selects the 'wig<organism>' directory from Main.conf.

    Returns a dict mapping each tcc to a list holding one value per covered
    position (the integer part of the wig value, repeated for every position
    of the interval).  A tcc for which nothing was covered gets no key.

    Raises ValueError when no index line covers the start of a tcc (the
    original code failed at this point with a TypeError from seek()).
    '''
    conf = c.getConfig(cName)
    org = conf.conf['organism']
    mConf = c.getConfig('Main.conf')
    wigDir = mConf.conf['wig%s' % org]

    histDict = {}  # tcc: [list values]
    for tcc in tccList:
        theSplit = ss(tcc, ':')
        chrom, strand = theSplit[0], theSplit[1]
        tccStart, tccEnd = int(theSplit[2]), int(theSplit[3])

        #goto correct file, correct line in index
        fN = wigDir + '/Merge.%s.%s.wig.%s.wig' % (org.lower(), strand, chrom)
        fNindex = fN + '.index'

        #index fields: [0] byte offset, [1] begin, [2] end
        startByte = None
        with open(fNindex, 'r') as iFile:
            for line in iFile:
                fields = cg.ss(line)
                beg = int(fields[1])
                end = int(fields[2])
                if beg <= tccStart < end:
                    startByte = int(fields[0])
                    break

        if startByte is None:
            raise ValueError('no index entry in %s covers start %s of %s'
                             % (fNindex, tccStart, tcc))

        #grab values, starting at the indexed line
        with open(fN, 'r') as f:
            f.seek(startByte, 0)
            for line in f:
                fields = cg.ss(line)
                lBeg = int(fields[1])
                lEnd = int(fields[2])
                lValue = int(fields[3].split('.')[0])  # keep integer part only

                #clip the wig interval to the requested range
                if tccStart > lBeg:
                    lBeg = tccStart
                stop = tccEnd < lEnd  # this line reaches past the tcc end
                if stop:
                    lEnd = tccEnd

                #one copy of the value per covered position; the guard keeps
                #an empty interval from creating an empty entry
                span = lEnd - lBeg
                if span > 0:
                    histDict.setdefault(tcc, []).extend([lValue] * span)

                if stop:
                    break
    return histDict