コード例 #1
0
ファイル: siRnaPredict.py プロジェクト: sknyx/ResearchScripts
def updateSignificant(resultsFN, simulationAverageFN, outFN):
    """Flag every results row as 'SIG' or 'NON' and write the rows to outFN.

    A row gets 'NON' when its number of targets (the comma-separated
    entries of column 4) is lower than the average stored for the row's id
    in simulationAverageFN; otherwise it gets 'SIG'. Ids that are missing
    from the simulation file are compared against an expected count of 0.

    Args:
        resultsFN: tab-separated file; column 0 holds an integer id and
            column 4 a comma-separated list.
        simulationAverageFN: tab-separated file with an integer id in
            column 0 and a float in column 1.
        outFN: path that receives the updated lines.

    Raises:
        IndexError, ValueError: if a row lacks the columns above or they do
            not parse as numbers.
    """
    #id -> simulated average ('with' also closes this handle, which used
    #to be left open)
    id_avgNum = {}
    with open(simulationAverageFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            id_avgNum[int(ls[0])] = float(ls[1])

    newLines = []
    with open(resultsFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            rowID = int(ls[0])
            #an empty column still counts as one entry: ''.split(',') == ['']
            numTargets = float(len(ls[4].split(',')))
            numExpected = id_avgNum.get(rowID, 0)

            sigFlag = 'SIG'
            if numTargets < numExpected:
                sigFlag = 'NON'

            #the flag is handed to cg.appendToLine with position 8
            newLines.append(cg.appendToLine(line, sigFlag, 8))

    #update file
    with open(outFN, 'w') as f:
        f.writelines(newLines)
コード例 #2
0
ファイル: cgConvert.py プロジェクト: JasonAng/ResearchScripts
def convertFile(fN, inFormat, outFormat, oFN=None):
    """Rewrite a coordinate file from one column layout into another.

    Both layouts come from returnOrderList(); its result is indexed here as
    [0] chromosome column, [1] strand column, [2] start column, [3] end
    column, [4] strand style (0 -> '1'/'-1', anything else -> '+'/'-'),
    [5] field separator and [6] chromosome prefix.

    Args:
        fN: path of the input file.
        inFormat: format name describing the layout of fN.
        outFormat: format name describing the layout to produce.
        oFN: optional output path. When omitted the result is written to
            fN + '.' + outFormat. (This argument used to be ignored.)
    """
    #get input order and extract info
    iO = returnOrderList(inFormat)
    oO = returnOrderList(outFormat)
    outSep = oO[5]

    newLines = []
    with open(fN, 'r') as f:
        for line in f:
            ls = line.strip().split(iO[5])
            chrom, strand = ls[iO[0]], ls[iO[1]]
            start, end = ls[iO[2]], ls[iO[3]]

            #switch to appropriate chromosome type if needed
            #NOTE(review): only one-character names receive the prefix, so
            #a bare '10' would stay unprefixed -- confirm this is intended
            if len(chrom) == 1:
                chrom = oO[6] + chrom

            #switch strand if need be
            isPlus = strand in ('1', '+')
            if oO[4] == 0:
                strand = '1' if isPlus else '-1'
            else:
                strand = '+' if isPlus else '-'

            #construct new Line
            newLine = '\n'
            newLine = cg.appendToLine(newLine, chrom, oO[0], sep=outSep)
            newLine = cg.appendToLine(newLine, strand, oO[1], sep=outSep)
            newLine = cg.appendToLine(newLine, start, oO[2], sep=outSep)
            newLine = cg.appendToLine(newLine, end, oO[3], sep=outSep)

            newLines.append(newLine)

    #output file
    if oFN is None:
        oFN = fN + '.' + outFormat
    with open(oFN, 'w') as f:
        f.writelines(newLines)
コード例 #3
0
ファイル: cgConvert.py プロジェクト: sknyx/ResearchScripts
def convertFile(fN, inFormat, outFormat, oFN=None):
    """Convert a coordinate file between two column layouts.

    The layouts are looked up with returnOrderList(). The returned sequence
    is used as: [0] chromosome column, [1] strand column, [2] start column,
    [3] end column, [4] strand style (0 selects '1'/'-1', any other value
    selects '+'/'-'), [5] field separator, [6] chromosome prefix.

    Args:
        fN: input file path.
        inFormat: name of the input layout.
        outFormat: name of the output layout.
        oFN: output path; defaults to fN + '.' + outFormat. Earlier
            versions accepted this argument but never used it.
    """
    #get input order and extract info
    iO = returnOrderList(inFormat)
    oO = returnOrderList(outFormat)

    #columns and separator of the target layout, looked up once
    chromCol, strandCol, startCol, endCol = oO[0], oO[1], oO[2], oO[3]
    numericStrand = (oO[4] == 0)
    sep = oO[5]

    newLines = []
    with open(fN, 'r') as f:
        for line in f:
            ls = line.strip().split(iO[5])
            chrom = ls[iO[0]]
            strand = ls[iO[1]]
            start = ls[iO[2]]
            end = ls[iO[3]]

            #switch to appropriate chromosome type if needed
            #NOTE(review): names longer than one character are never
            #prefixed -- confirm that matches the supported formats
            if len(chrom) == 1:
                chrom = oO[6] + chrom

            #switch strand if need be
            if strand == '1' or strand == '+':
                strand = '1' if numericStrand else '+'
            else:
                strand = '-1' if numericStrand else '-'

            #construct new Line
            newLine = '\n'
            for value, column in ((chrom, chromCol), (strand, strandCol),
                                  (start, startCol), (end, endCol)):
                newLine = cg.appendToLine(newLine, value, column, sep=sep)

            newLines.append(newLine)

    #output file
    outPath = oFN if oFN is not None else fN + '.' + outFormat
    with open(outPath, 'w') as f:
        f.writelines(newLines)
コード例 #4
0
ファイル: siRnaPredict.py プロジェクト: sknyx/ResearchScripts
def updateTargetsExpression(resultsFN, targetsFN, inputPosition,
                            updatePosition, outFN):
    """Append the highest and the summed target expression to each row.

    For every row of resultsFN the comma-separated target ids found in
    column inputPosition are looked up in targetsFN (integer id in column
    0, integer expression level in column 2). The largest level (never
    below 0) is passed to cg.appendToLine with position updatePosition and
    the sum of all levels with position updatePosition + 1.

    Args:
        resultsFN: tab-separated results file.
        targetsFN: tab-separated target expression file.
        inputPosition: column of resultsFN holding the target ids; anything
            accepted by int().
        updatePosition: position of the first appended value; anything
            accepted by int().
        outFN: path that receives the updated lines.

    Raises:
        KeyError: if a target id is missing from targetsFN.
        ValueError: if an id or a level is not an integer.
    """
    #load target expression dict
    targetsDict = {}  # tID: eLevel
    with open(targetsFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            targetsDict[int(ls[0])] = int(ls[2])

    #the positions never change, so convert them once
    inputPosition = int(inputPosition)
    updatePosition = int(updatePosition)

    #For each sRNA, get target Expression.
    newLines = []
    with open(resultsFN, 'r') as f:
        for line in f:
            targets = line.strip().split('\t')[inputPosition]
            levels = [targetsDict[int(tID)]
                      for tID in targets.strip().split(',')]

            #the maximum starts from 0, so negative levels never win
            maxExpressionLevel = max([0] + levels)
            totalExpressionLevel = sum(levels)

            #update newLines
            newLine = cg.appendToLine(line, maxExpressionLevel,
                                      updatePosition)
            newLines.append(
                cg.appendToLine(newLine, totalExpressionLevel,
                                updatePosition + 1))

    #update file
    with open(outFN, 'w') as f:
        f.writelines(newLines)
コード例 #5
0
ファイル: addZeroes.py プロジェクト: JasonAng/ResearchScripts
def addFiller(fN, filler, zeroPosition, outFN):
    """Copy fN to outFN, adding the same filler value to every line.

    Each line is passed through bioLibCG.appendToLine together with
    str(filler) and int(zeroPosition).

    Args:
        fN: path of the file to read.
        filler: value to add; converted with str().
        zeroPosition: position handed to appendToLine; converted with int().
        outFN: path of the file to write.
    """
    fillerText = str(filler)

    source = open(fN, "r")
    updated = [
        bioLibCG.appendToLine(row, fillerText, int(zeroPosition))
        for row in source
    ]
    source.close()

    sink = open(outFN, "w")
    sink.writelines(updated)
    sink.close()
コード例 #6
0
ファイル: addZeroes.py プロジェクト: sknyx/ResearchScripts
def addFiller(fN, filler, zeroPosition, outFN):
    """Write a copy of fN to outFN with a constant filler added per line.

    Every input line goes through bioLibCG.appendToLine with str(filler)
    as the value and int(zeroPosition) as the position.

    Args:
        fN: input file path.
        filler: value to add to each line (stringified first).
        zeroPosition: position argument for appendToLine (int-converted).
        outFN: output file path.
    """
    text = str(filler)

    inFile = open(fN, 'r')
    padded = []
    for record in inFile:
        padded.append(bioLibCG.appendToLine(record, text, int(zeroPosition)))
    inFile.close()

    outFile = open(outFN, 'w')
    outFile.writelines(padded)
    outFile.close()
コード例 #7
0
def transcriptSetOverlapDegFileHitmap(degFile, runningChrom, runningStrand):

	geneSetFN = '/home/chrisgre/dataSources/known/Human/geneSets/ensemblAllTranscripts.tsv'
	allExons = cgGenes.createGeneSetFromFile(geneSetFN)
        transcriptTccs = []
        for gene in allExons.set.values():
                for transcript in gene.transcripts:
                        transcriptTccs.append(transcript.tcc)

        #create hitmap
        coordSet = set()
        for tcc in transcriptTccs:
                chrom, strand, start, end = cg.tccSplit(tcc)
                
                if chrom != runningChrom:
                        continue

                if strand != runningStrand:
                        continue

                for i in range(start, end + 1):
                        coordSet.add(i)

        #find overlapping degTccs
        print 'done creating hitmap'
        

        f = open(degFile, 'r')
	newLines = []
	for line in f:
	        ls = line.strip().split('\t') 
                degTcc = cg.convertToAS(ls[1])
                chrom, strand, start, end = cg.tccSplit(degTcc)
                if chrom != runningChrom:
                        continue

                if strand != runningStrand:
                        continue

                inTran = '0'
                for i in xrange(start, end + 1):
                        if i in coordSet:
                                inTran = '1'
                                break

		#update newLines
                newLine = cg.appendToLine(line, inTran, 3)
                newLines.append(newLine)         
	f.close()

        f = open(degFile + '.%s.%s' % (runningChrom, runningStrand), 'w')
        f.writelines(newLines)
        f.close()
コード例 #8
0
ファイル: siRnaPredict.py プロジェクト: sknyx/ResearchScripts
def transcriptSetOverlapDegFileHitmap(degFile, runningChrom, runningStrand):

    geneSetFN = '/home/chrisgre/dataSources/known/Human/geneSets/ensemblAllTranscripts.tsv'
    allExons = cgGenes.createGeneSetFromFile(geneSetFN)
    transcriptTccs = []
    for gene in allExons.set.values():
        for transcript in gene.transcripts:
            transcriptTccs.append(transcript.tcc)

#create hitmap
    coordSet = set()
    for tcc in transcriptTccs:
        chrom, strand, start, end = cg.tccSplit(tcc)

        if chrom != runningChrom:
            continue

        if strand != runningStrand:
            continue

        for i in range(start, end + 1):
            coordSet.add(i)

#find overlapping degTccs
    print 'done creating hitmap'

    f = open(degFile, 'r')
    newLines = []
    for line in f:
        ls = line.strip().split('\t')
        degTcc = cg.convertToAS(ls[1])
        chrom, strand, start, end = cg.tccSplit(degTcc)
        if chrom != runningChrom:
            continue

        if strand != runningStrand:
            continue

        inTran = '0'
        for i in xrange(start, end + 1):
            if i in coordSet:
                inTran = '1'
                break

    #update newLines
        newLine = cg.appendToLine(line, inTran, 3)
        newLines.append(newLine)
    f.close()

    f = open(degFile + '.%s.%s' % (runningChrom, runningStrand), 'w')
    f.writelines(newLines)
    f.close()
コード例 #9
0
def updateTargetsExpression(resultsFN, targetsFN, inputPosition, updatePosition, outFN):
    """Add the maximum and total expression of each row's targets.

    targetsFN supplies an integer expression level (column 2) per integer
    target id (column 0). For each row of resultsFN, the comma-separated
    ids in column inputPosition are looked up; the largest level (at least
    0) is passed to cg.appendToLine with position updatePosition, and the
    sum of the levels with position updatePosition + 1.

    Args:
        resultsFN: tab-separated results file.
        targetsFN: tab-separated target expression file.
        inputPosition: column holding the target ids (int-convertible).
        updatePosition: position of the first added value (int-convertible).
        outFN: output path.

    Raises:
        KeyError: if a listed target id is absent from targetsFN.
        ValueError: if an id or level cannot be parsed as an integer.
    """
    #load target expression dict
    targetsDict = {}  # tID: eLevel
    with open(targetsFN, 'r') as f:
        for line in f:
            fields = line.strip().split('\t')
            targetsDict[int(fields[0])] = int(fields[2])

    #convert the loop-invariant positions a single time
    inPos = int(inputPosition)
    maxPos = int(updatePosition)
    totalPos = maxPos + 1

    #For each sRNA, get target Expression.
    newLines = []
    with open(resultsFN, 'r') as f:
        for line in f:
            targets = line.strip().split('\t')[inPos].strip().split(',')

            maxExpressionLevel = 0
            totalExpressionLevel = 0
            for tID in targets:
                tExpressionLevel = targetsDict[int(tID)]
                totalExpressionLevel += tExpressionLevel
                if tExpressionLevel > maxExpressionLevel:
                    maxExpressionLevel = tExpressionLevel

            #update newLines
            newLine = cg.appendToLine(line, maxExpressionLevel, maxPos)
            newLines.append(cg.appendToLine(newLine, totalExpressionLevel, totalPos))

    #update file
    with open(outFN, 'w') as f:
        f.writelines(newLines)
コード例 #10
0
ファイル: addZeroes.py プロジェクト: sknyx/ResearchScripts
def addFiller(fN, filler, zeroPosition, outFN):
    """Copy fN to outFN, adding a filler value (or a running id) per line.

    Each line is passed to bioLibCG.appendToLine with int(zeroPosition).
    The value added is str(filler), except when that string is 'ID': then
    every line receives its own zero-based line number instead.

    Args:
        fN: input file path.
        filler: value to add, or 'ID' to number the lines.
        zeroPosition: position argument for appendToLine (int-converted).
        outFN: output file path.
    """
    fillerText = str(filler)
    numberLines = (fillerText == 'ID')

    source = open(fN, 'r')
    updated = []
    for lineNumber, row in enumerate(source):
        value = str(lineNumber) if numberLines else fillerText
        updated.append(bioLibCG.appendToLine(row, value, int(zeroPosition)))
    source.close()

    sink = open(outFN, 'w')
    sink.writelines(updated)
    sink.close()
コード例 #11
0
def transcriptSetOverlapDegFile(degFile):
    """Flag each degradome row by whether its tcc overlaps a transcript.

    The tcc in column 1 of every row is converted with cg.convertToAS, the
    converted tccs are compared against the transcripts of the gene set,
    and each row is passed to cg.appendToLine with '1' (overlap) or '0'
    (no overlap) at position 3.

    Two defects of the earlier version are fixed: the second pass tested
    the tcc of the LAST row for every row (it never re-split the current
    line), and the updated lines were computed but thrown away.

    Args:
        degFile: tab-separated degradome file with a tcc in column 1.

    Returns:
        The list of updated lines, in file order. Nothing is written to
        disk; NOTE(review): no output path is visible for this function,
        so the caller has to persist the result.
    """
    geneSetFN = '/home/chrisgre/dataSources/known/Human/geneSets/ensemblAllTranscripts.tsv'
    allExons = cgGenes.createGeneSetFromFile(geneSetFN)

    #get degradome TCCS
    #note that you need to test the AS peaks, this is the location of the targetted transcript
    with open(degFile, 'r') as f:
        lines = f.readlines()

    degTccs = [cg.convertToAS(line.strip().split('\t')[1]) for line in lines]

    #find all overlapping exons/transcripts, then all results sequences that overlap exons
    overlappingExons = allExons.transcriptOverlaps(degTccs)
    overlappingExonTccs = [x.tcc for x in overlappingExons]
    overlappingDegTccs = compare.compareTwoTcc(degTccs, overlappingExonTccs, 1)

    newLines = []
    for line in lines:
        #recomputed per row rather than read back from degTccs, in case the
        #helpers above reorder that list
        degTcc = cg.convertToAS(line.strip().split('\t')[1])

        inTran = '0'
        if degTcc in overlappingDegTccs:
            inTran = '1'

        #update newLines
        newLines.append(cg.appendToLine(line, inTran, 3))

    return newLines
コード例 #12
0
ファイル: siRnaPredict.py プロジェクト: sknyx/ResearchScripts
def transcriptSetOverlapDegFile(degFile):
    """Mark every degradome row with a transcript-overlap flag.

    Column 1 of each row holds a tcc; it is converted with cg.convertToAS
    and compared against the transcripts of the gene set. Every row is then
    handed to cg.appendToLine with '1' when its converted tcc is among the
    overlapping ones and '0' otherwise (position 3).

    Fixes over the earlier version: the flagging loop now splits the
    current line (it used to reuse the fields of the last row read by the
    first loop, so all rows got the same flag), and the updated lines are
    kept instead of being discarded.

    Args:
        degFile: tab-separated degradome file; column 1 is a tcc.

    Returns:
        List of updated lines in file order. The function does not write
        them anywhere; NOTE(review): confirm where callers expect the
        output to go.
    """
    geneSetFN = '/home/chrisgre/dataSources/known/Human/geneSets/ensemblAllTranscripts.tsv'
    allExons = cgGenes.createGeneSetFromFile(geneSetFN)

    def asTcc(row):
        #tcc of a row (column 1) after conversion with cg.convertToAS
        return cg.convertToAS(row.strip().split('\t')[1])

    #get degradome TCCS
    #note that you need to test the AS peaks, this is the location of the targetted transcript
    with open(degFile, 'r') as f:
        rows = f.readlines()

    degTccs = [asTcc(row) for row in rows]

    #find all overlapping exons/transcripts, then all results sequences that overlap exons
    overlappingExons = allExons.transcriptOverlaps(degTccs)
    overlappingExonTccs = [x.tcc for x in overlappingExons]
    overlappingDegTccs = compare.compareTwoTcc(degTccs, overlappingExonTccs, 1)

    newLines = []
    for row in rows:
        #converted again per row so the result does not depend on whether
        #the helpers above reordered degTccs
        if asTcc(row) in overlappingDegTccs:
            inTran = '1'
        else:
            inTran = '0'

        #update newLines
        newLines.append(cg.appendToLine(row, inTran, 3))

    return newLines
コード例 #13
0
def updateSignificant(resultsFN, simulationAverageFN, outFN):
    """Tag each results row with 'SIG' or 'NON' and save the rows to outFN.

    The number of comma-separated entries in column 4 of a row is compared
    with the average stored for the row's id in simulationAverageFN. Rows
    with fewer entries than that average are tagged 'NON', all others
    'SIG'. An id without a simulation entry is compared against 0.

    Args:
        resultsFN: tab-separated file with an integer id in column 0 and a
            comma-separated list in column 4.
        simulationAverageFN: tab-separated file mapping an integer id
            (column 0) to a float (column 1).
        outFN: output path.

    Raises:
        IndexError, ValueError: if a row is missing those columns or they
            are not numeric.
    """
    #the simulation file handle was never closed before; 'with' fixes that
    with open(simulationAverageFN, 'r') as f:
        id_avgNum = {}
        for line in f:
            fields = line.strip().split('\t')
            id_avgNum[int(fields[0])] = float(fields[1])

    newLines = []
    with open(resultsFN, 'r') as f:
        for line in f:
            fields = line.strip().split('\t')
            rowID = int(fields[0])
            #note: an empty column yields one (empty) entry, not zero
            numTargets = float(len(fields[4].split(',')))

            try:
                numExpected = id_avgNum[rowID]
            except KeyError:
                numExpected = 0

            if numTargets < numExpected:
                updateVal = 'NON'
            else:
                updateVal = 'SIG'

            #update newLines (value goes to cg.appendToLine with position 8)
            newLines.append(cg.appendToLine(line, updateVal, 8))

    #update file
    with open(outFN, 'w') as f:
        f.writelines(newLines)
コード例 #14
0
def updateReadDensity(tType, cName):
	#go through wig each chromosome and check the mature seqs
	mainConf = cgConfig.cgConfig('Main.conf')
	conf = cgConfig.getConfig(cName)
	organism = conf.conf['organism']
	wigFolder = mainConf.conf['wig%s' % organism]	
	newLines = []
	
	
	#Differentiate between exon or intron...
	if tType == 'E':
		pFileName = conf.conf['resultsExons']
	elif tType == 'I':
		pFileName = conf.conf['resultsIntrons']
	else:
		print 'READ UPDATE FAIL'

	print '  Updating Read Density:', tType

	
	#get read density for each line...
	print '  calculating hits for mature seqs'
	#calculate total hits per mature
	mirFile = open(pFileName, 'r')
	for line in mirFile:
		mTcc = line.strip().split('\t')[1]
		mirID = line.strip().split('\t')[0]
		
		tccStretch = cgPeaks.stretch(mTcc, cName)
		highestHit = 0
		for i in range(int(mTcc.split(':')[2]), int(mTcc.split(':')[3])):
			if i in tccStretch.profile:
				if tccStretch.profile[i] > highestHit:
					highestHit = tccStretch.profile[i]		
		
		newLines.append(cg.appendToLine(line, str(highestHit), 11))
	
	mirFile.close()

	print 'Writing New File'
	#write new results file
	outFile = open(pFileName, 'w')
	for line in newLines:
		outFile.write(line)
	outFile.close()

	####NOW UPDATE HIGHEST HIT PER CLUSTER####

	clusterCount = {}

	pFile = open(pFileName, 'r')
	for line in pFile:
		predictionCount = int(line.strip().split('\t')[11])
		CID = line.strip().split('\t')[7]
		if CID in clusterCount:
			if clusterCount[CID] < predictionCount:
				clusterCount[CID] = predictionCount
		else:
			clusterCount[CID] = predictionCount
	pFile.close()

	#update the file --> cluster small count
	newLines = []
	predFile = open(pFileName, 'r')
	for line in predFile:
		CID = line.strip().split('\t')[7]
		numMax = clusterCount[CID]
		newLines.append(cg.appendToLine(line, str(numMax), 12))
	predFile.close()

	#sort newLines by clusterID
	sortDict = {}
	CIDs = []
	for line in newLines:
		CID = int(line.strip().split('\t')[7])
		if CID not in CIDs:
			CIDs.append(CID)
		if CID in sortDict:
			sortDict[CID].append(line)
		else:
			sortDict[CID] = [line]
		
	CIDs.sort()

	newLines = []
	for CID in CIDs:
		for line in sortDict[CID]:
			newLines.append(line)

	#write new File
	newFile = open(pFileName, 'w')
	for line in newLines:
		newFile.write(line)
	newFile.close()
コード例 #15
0
def updateReadDensity(tType):
    #go through wig each chromosome and check the mature seqs
    mainConf = cgConfig.cgConfig('Main.conf')
    conf = cgConfig.cgConfig()
    organism = conf.conf['organism']
    wigFolder = mainConf.conf['wig%s' % organism]
    newLines = []

    if tType == 'E':
        pFileName = conf.conf['resultsExons']
    elif tType == 'I':
        pFileName = conf.conf['resultsIntrons']
    else:
        print 'READ UPDATE FAIL'

    print '  Updating Read Density:', tType

    for wigFileN in cg.recurseDir(wigFolder, end='.wig'):

        #init
        chrom = wigFileN.strip().split('.')[-2]
        strand = wigFileN.strip().split('.')[-4]
        wigFile = open(wigFileN, 'r')
        mirFile = open(pFileName, 'r')
        print wigFileN

        #get rid of header
        wigFile.readline()

        print '  populating hitmap'
        #populate hitmap
        wigMap = {}
        for line in wigFile:
            value = int(line.strip().split('\t')[3].split('.')[0])
            if value > 0:
                start = int(line.strip().split('\t')[1])
                end = int(line.strip().split('\t')[2])
                for i in range(start, end):
                    wigMap[i] = value
        wigFile.close()

        print '  calculating hits for mature seqs'
        #calculate total hits per mature
        for line in mirFile:
            mTcc = line.strip().split('\t')[1]
            mirID = line.strip().split('\t')[0]
            if (mTcc.split(':')[0] == chrom) and (mTcc.split(':')[1]
                                                  == strand):
                #if mirID == '26477.30.106643972': print 'Starting Total Count'
                highestHit = 0
                for i in range(int(mTcc.split(':')[2]),
                               int(mTcc.split(':')[3])):
                    #if mirID == '26477.30.106643972': print '  ', i
                    if i in wigMap:
                        if wigMap[i] > highestHit:
                            highestHit = wigMap[i]
                        #if mirID == '26477.30.106643972': print '    ', i, totalHits, wigMap[i]

                newLines.append(cg.appendToLine(line, str(highestHit), 11))

        mirFile.close()

    print 'Writing New File'
    #write new results file
    outFile = open(pFileName, 'w')
    for line in newLines:
        outFile.write(line)
    outFile.close()

    ####NOW UPDATE HIGHEST HIT PER CLUSTER####

    clusterCount = {}

    pFile = open(pFileName, 'r')
    for line in pFile:
        predictionCount = int(line.strip().split('\t')[11])
        CID = line.strip().split('\t')[7]
        if CID in clusterCount:
            if clusterCount[CID] < predictionCount:
                clusterCount[CID] = predictionCount
        else:
            clusterCount[CID] = predictionCount
    pFile.close()

    #update the file --> cluster small count
    newLines = []
    predFile = open(pFileName, 'r')
    for line in predFile:
        CID = line.strip().split('\t')[7]
        numMax = clusterCount[CID]
        newLines.append(cg.appendToLine(line, str(numMax), 12))
    predFile.close()

    #sort newLines by clusterID
    sortDict = {}
    CIDs = []
    for line in newLines:
        CID = int(line.strip().split('\t')[7])
        if CID not in CIDs:
            CIDs.append(CID)
        if CID in sortDict:
            sortDict[CID].append(line)
        else:
            sortDict[CID] = [line]

    CIDs.sort()

    newLines = []
    for CID in CIDs:
        for line in sortDict[CID]:
            newLines.append(line)

    #write new File
    newFile = open(pFileName, 'w')
    for line in newLines:
        newFile.write(line)
    newFile.close()
コード例 #16
0
	
	print '  calculating hits for mature seqs'
	#calculate total hits per mature
	for line in mirFile:
		mTcc = line.strip().split('\t')[1]
		mirID = line.strip().split('\t')[0]
		if (mTcc.split(':')[0] == chrom) and (mTcc.split(':')[1] == strand):
			#if mirID == '26477.30.106643972': print 'Starting Total Count'
			totalHits = 0
			for i in range(int(mTcc.split(':')[2]), int(mTcc.split(':')[3])):
				#if mirID == '26477.30.106643972': print '  ', i 
				if i in wigMap:
					totalHits += wigMap[i]
					#if mirID == '26477.30.106643972': print '    ', i, totalHits, wigMap[i]
		
			newLines.append(cg.appendToLine(line, str(totalHits), 11))
	
	mirFile.close()

#Persist the lines collected by the loop above, overwriting the results
#file they were read from.
print 'Writing New File'
#write new results file
outFile = open(pFileName, 'w')
for line in newLines:
	outFile.write(line)
outFile.close()

####NOW UPDATE MAX HITS PER CLUSTER####

#cluster -> count; starts empty here (the code that fills it is outside
#this excerpt)
clusterCount = {}

#re-open the file that was just written, this time for reading
pFile = open(pFileName, 'r')
コード例 #17
0
print 'Total', numT
print 'A', aPass
print 'B', bPass
print 'C', cPass

#output results to a file for R
outFile = open('mousePeaksResults.R.data', 'w')
outFile.write('tcc\tpeakOne\tpeakTwo\tdistance\tratio\tmax\thRatio\n')
for peak in bestCombos:
    outFile.write('%s\t%s\t%s\t%s\t%s\t%s\n' %
                  (peak[0], peak[1], peak[2], peak[3], peak[4], peak[5]))
outFile.close()

#now update predFile (SLOT 13)
predFile = open(predName, 'r')
newLines = []
for line in predFile:
    CID = cg.ss(line)[7]
    if peakDict[CID][1] == 'None':
        peakInfo = 'None'
    else:
        peakInfo = '%s:%s:%s:%s' % (
            str(peakDict[CID][1][1])[-3:], str(peakDict[CID][1][2])[-3:],
            str(peakDict[CID][1][4]).split('.')[0], peakDict[CID][1][5])
    newLines.append(cg.appendToLine(line, peakInfo, 13))
predFile.close()

predFile = open(predName, 'w')
predFile.writelines(newLines)
predFile.close()
コード例 #18
0
def updateReadDensity(tType, cName):
    #go through wig each chromosome and check the mature seqs
    mainConf = cgConfig.cgConfig('Main.conf')
    conf = cgConfig.getConfig(cName)
    organism = conf.conf['organism']
    wigFolder = mainConf.conf['wig%s' % organism]
    newLines = []

    #Differentiate between exon or intron...
    if tType == 'E':
        pFileName = conf.conf['resultsExons']
    elif tType == 'I':
        pFileName = conf.conf['resultsIntrons']
    else:
        print 'READ UPDATE FAIL'

    print '  Updating Read Density:', tType

    #get read density for each line...
    print '  calculating hits for mature seqs'
    #calculate total hits per mature
    mirFile = open(pFileName, 'r')
    for line in mirFile:
        mTcc = line.strip().split('\t')[1]
        mirID = line.strip().split('\t')[0]

        tccStretch = cgPeaks.stretch(mTcc, cName)
        highestHit = 0
        for i in range(int(mTcc.split(':')[2]), int(mTcc.split(':')[3])):
            if i in tccStretch.profile:
                if tccStretch.profile[i] > highestHit:
                    highestHit = tccStretch.profile[i]

        newLines.append(cg.appendToLine(line, str(highestHit), 11))

    mirFile.close()

    print 'Writing New File'
    #write new results file
    outFile = open(pFileName, 'w')
    for line in newLines:
        outFile.write(line)
    outFile.close()

    ####NOW UPDATE HIGHEST HIT PER CLUSTER####

    clusterCount = {}

    pFile = open(pFileName, 'r')
    for line in pFile:
        predictionCount = int(line.strip().split('\t')[11])
        CID = line.strip().split('\t')[7]
        if CID in clusterCount:
            if clusterCount[CID] < predictionCount:
                clusterCount[CID] = predictionCount
        else:
            clusterCount[CID] = predictionCount
    pFile.close()

    #update the file --> cluster small count
    newLines = []
    predFile = open(pFileName, 'r')
    for line in predFile:
        CID = line.strip().split('\t')[7]
        numMax = clusterCount[CID]
        newLines.append(cg.appendToLine(line, str(numMax), 12))
    predFile.close()

    #sort newLines by clusterID
    sortDict = {}
    CIDs = []
    for line in newLines:
        CID = int(line.strip().split('\t')[7])
        if CID not in CIDs:
            CIDs.append(CID)
        if CID in sortDict:
            sortDict[CID].append(line)
        else:
            sortDict[CID] = [line]

    CIDs.sort()

    newLines = []
    for CID in CIDs:
        for line in sortDict[CID]:
            newLines.append(line)

    #write new File
    newFile = open(pFileName, 'w')
    for line in newLines:
        newFile.write(line)
    newFile.close()
コード例 #19
0
	else:
		print 'None'

print timer.split()


#output results to a file for R
outFile = open('mousePeaksResults.R.data', 'w')
outFile.write('tcc\tpeakOne\tpeakTwo\tdistance\tratio\tmax\thRatio\n')
for peak in bestCombos:
	outFile.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (peak[0], peak[1],peak[2],peak[3],peak[4],peak[5]))
outFile.close()


#now update predFile (SLOT 13)
predFile = open(predName, 'r')
newLines = []
for line in predFile:
	CID = cg.ss(line)[7]
	if peakDict[CID][1] == 'None':
		peakInfo = 'None'
	else:
		peakInfo = '%s:%s:%s:%s' % (str(peakDict[CID][1][1])[-3:], str(peakDict[CID][1][2])[-3:], str(peakDict[CID][1][4]).split('.')[0], peakDict[CID][1][5])
	newLines.append(cg.appendToLine(line, peakInfo, 13))
predFile.close()

predFile = open(predName, 'w')
predFile.writelines(newLines)
predFile.close()
	
コード例 #20
0
ensGenes = cgGenes.createGeneSetFromFile(geneSetFolder + '/ensemblAllTranscripts.tsv')

cDesc = {} #CID:gDesc
for CID in cHairs:
	tcc = cHairs[CID]
	
	cDesc[CID] = "NONE"
	
	overlappingGenes = ensGenes.geneOverlaps([tcc])
	if len(overlappingGenes) > 0:
		print overlappingGenes[0].type
		cDesc[CID] = overlappingGenes[0].type

f = open(fN, 'r')
newLines = []
for line in f:
	CID = line.strip().split('\t')[7]
	newLines.append(cg.appendToLine(line, cDesc[CID], 16))
f.close()

f = open(fN + '.FINAL', 'w')
f.writelines(newLines)
f.close()
		
		
	
	
		
	
コード例 #21
0
ファイル: appendGDesc.py プロジェクト: sknyx/ResearchScripts
mConf = c.getConfig('Main.conf')
geneSetFolder = mConf.conf['geneSetsHuman']

fN = '/home/chrisgre/projects/NoncodingHuman/results/NChuman-s3k8b17.results.sorted.introns.sorted'
cHairs = getHairpins.getHairpins(fN)

ensGenes = cgGenes.createGeneSetFromFile(geneSetFolder +
                                         '/ensemblAllTranscripts.tsv')

cDesc = {}  #CID:gDesc
for CID in cHairs:
    tcc = cHairs[CID]

    cDesc[CID] = "NONE"

    overlappingGenes = ensGenes.geneOverlaps([tcc])
    if len(overlappingGenes) > 0:
        print overlappingGenes[0].type
        cDesc[CID] = overlappingGenes[0].type

f = open(fN, 'r')
newLines = []
for line in f:
    CID = line.strip().split('\t')[7]
    newLines.append(cg.appendToLine(line, cDesc[CID], 16))
f.close()

f = open(fN + '.FINAL', 'w')
f.writelines(newLines)
f.close()
コード例 #22
0
def findPeaks(pType, cName=None):

    #init
    mConf = c.cgConfig('Main.conf')
    conf = c.getConfig(cName)

    if pType == 'E':
        predName = conf.conf['resultsExonsSorted']
    else:
        predName = conf.conf['resultsIntronsSorted']

    print predName
    #make CID:hairpin:peak dictionary
    cHairs = getHairpins.getHairpins(predName)
    peakDict = {}
    for CID in cHairs:
        peakDict[CID] = [cHairs[CID], 'None']

    timer = cg.cgTimer()
    timer.start()

    #put peaks in memory
    print 'Creating peak data'
    peaks = {}  # chr:peak:value
    for CID in cHairs:
        chrom, strand, start, end = cg.tccSplit(cHairs[CID])
        tcc = cHairs[CID]

        #init dictionary
        if chrom not in peaks:
            peaks[chrom] = {}

        if strand not in peaks[chrom]:
            peaks[chrom][strand] = {}

        #create peaks for tcc and add to peak dictionary
        stretch = cgPeaks.stretch(tcc, cName)
        stretch.createPeaks()
        for peakCoord in stretch.peaks:
            peaks[chrom][strand][peakCoord] = 0
    print timer.split()

    print 'finding best combos'
    bestCombos = []
    aPass = 0
    bPass = 0
    cPass = 0
    numT = 0
    for CID in peakDict:
        cgFlag = False
        if CID == '538': cgFlag = True

        tcc = peakDict[CID][0]
        #print tcc
        tccPeaks = []
        chrom = cg.ss(tcc, ':')[0]
        strand = cg.ss(tcc, ':')[1]
        start = int(cg.ss(tcc, ':')[2])
        end = int(cg.ss(tcc, ':')[3])

        #get all peaks
        for i in range(start, end + 1):
            if i in peaks[chrom][strand]:
                #print '  peak added', i
                tccPeaks.append(i)

        #Calculate parameters...
        pairStrings = []  #used to check if pair already added
        peakCombos = []
        for x in tccPeaks:

            #scan a 30 bp range around this point and find the best roof...
            pRange = 30
            rTcc = cg.makeTcc(chrom, strand, x, x + 1)

            #quickly get max value...kinda a long way to do it but whatever
            cProfile = stepVectorScan.profileAroundPoint(rTcc,
                                                         1,
                                                         cName,
                                                         ratio=False)
            xval = cProfile[0]
            max = xval
            highestValueCoord = x

            #now make profile for roof...
            cProfile = stepVectorScan.profileAroundPoint(rTcc,
                                                         pRange,
                                                         cName,
                                                         ratio=True)

            #now get highest stretch length and the rNext coord.
            minVal = .80
            highest = 0
            stretch = 0
            startCurrent = None
            startFinal = None
            endFinal = None
            for i in range(1 - pRange, pRange):
                if cProfile[i] > minVal:
                    stretch += 1
                    if startCurrent == None:
                        startCurrent = i
                else:
                    if stretch > 0:
                        if stretch > highest:  #stretch ended and was higher than previous
                            highest = stretch
                            endFinal = i - 1
                            startFinal = startCurrent
                            startCurrent = None
                        else:
                            startCurrent = None
                    stretch = 0

            #get +/- 4 value...
            val = [1.0, 1.0]
            if (startFinal) and (endFinal):
                low = startFinal - 4
                high = endFinal + 4
                if low > (1 - pRange):
                    if high < pRange:
                        val[0] = float(cProfile[startFinal - 4])
                        val[1] = float(cProfile[endFinal + 4])

            #fill in other details...
            y = 'S'
            dist = 'S'
            ratio = 'S'

            peakCombos.append([tcc, x, y, dist, ratio, max, highest, val])
            #print '  ', peakCombos[-1]

        #find best combo...
        topCombo = None
        for combo in peakCombos:
            roofLength = combo[6]
            dropValue = combo[7][0]
            if combo[7][1] > dropValue:
                dropValue = combo[7][1]

            #print roofLength, dropValue
            if 14 < roofLength < 26:
                if 0.0 < dropValue < 0.2:
                    #pick one with rooflength nearest 20:
                    if topCombo:
                        if (math.fabs(22 - roofLength)) < (
                                math.fabs(22 - topCombo[6])):
                            topCombo = combo
                    else:
                        topCombo = combo

        if topCombo:
            peakDict[CID][1] = topCombo
            bestCombos.append(topCombo)
            print bestCombos[-1]
        else:
            #print 'None'
            pass

    print timer.split()

    #now update predFile (SLOT 13)
    predFile = open(predName, 'r')
    newLines = []
    for line in predFile:
        CID = cg.ss(line)[7]
        if peakDict[CID][1] == 'None':
            peakInfo = 'None'
        else:
            peakInfo = '%s:%s:%s:%s:%s:%s' % (
                str(peakDict[CID][1][1])[-3:], 'S', str(
                    peakDict[CID][1][4]).split('.')[0], peakDict[CID][1][5],
                peakDict[CID][1][6], peakDict[CID][1][7])
        newLines.append(cg.appendToLine(line, peakInfo, 13))
    predFile.close()

    predFile = open(predName, 'w')
    predFile.writelines(newLines)
    predFile.close()
コード例 #23
0
def updateNoise(pType, cName=None):
    """Append a noise estimate and a p-value to every line of a prediction file.

    pType -- 'E' selects conf.conf['resultsExons'] / conf.conf['exonNoiseData'];
             any other value selects the intron equivalents.
    cName -- passed straight to c.getConfig.

    The noise file is read as a tab-separated table whose header row holds one
    CID per column; every following row contributes at most one float per
    column ('NA' and empty cells are skipped).  For each CID, lam is
    cgStats.getLam(values) and pVal is cgStats.getPValExp(hDensity, lam), where
    hDensity is column 12 (kept as a string) of the last prediction line seen
    for that CID.  CIDs with no values get 'NA' for both.

    The prediction file is rewritten in place with lam placed through
    cg.appendToLine(..., 14) and pVal through cg.appendToLine(..., 15).

    Raises KeyError if a prediction line's CID is missing from the noise file
    header, or if a CID with noise data has no prediction line.
    """

    #init
    mainConf = c.cgConfig('Main.conf')  # not read below; call kept in case construction matters
    conf = c.getConfig(cName)

    if pType == 'E':
        predName = conf.conf['resultsExons']
    else:
        predName = conf.conf['resultsIntrons']

    #populate cid: exon dist
    print('Populating CID/INtron/exon distribution data')
    if pType == 'E':
        noiseFN = conf.conf['exonNoiseData']
    else:
        noiseFN = conf.conf['intronNoiseData']

    exonDists = {}  #cid: [exon dist]
    order = {}  # column number: CID
    with open(noiseFN, 'r') as f:
        header = f.readline()
        for i, CID in enumerate(header.strip().split('\t')):
            order[i] = CID
            exonDists[CID] = []

        for line in f:
            # Only the line terminator is removed: stripping all surrounding
            # whitespace would drop leading empty cells and shift every
            # remaining value into the wrong CID's column.
            for i, dataPoint in enumerate(line.rstrip('\r\n').split('\t')):
                dataPoint = dataPoint.strip()
                if dataPoint == 'NA' or dataPoint == '':
                    continue
                value = float(dataPoint)
                exonDists[order[i]].append(value)

    #get highest expression level for each cluster
    print('Populating highest expression levels')
    predExpression = {}  # CID: highest level (string, as read from the file)
    with open(predName, 'r') as exonFile:
        for line in exonFile:
            fields = line.strip().split('\t')
            CID = fields[7]
            predExpression[CID] = fields[12]

    #get pVals for each CID
    print('Getting pvals for each cluster')
    pVals = {}  # CID: [lam, pVal]
    for CID in exonDists:
        if not len(exonDists[CID]) > 0:  #no data in 2kb range.
            lam = 'NA'
            pVal = 'NA'
        else:
            lam = cgStats.getLam(exonDists[CID])
            pVal = cgStats.getPValExp(predExpression[CID], lam)

        #lam gives a good approximation of noise levels in region...
        pVals[CID] = [lam, pVal]

    print('Updating the file')
    #update file...
    newLines = []
    with open(predName, 'r') as predFile:
        for line in predFile:
            CID = line.split('\t')[7]
            newLine = cg.appendToLine(line, pVals[CID][0], 14)
            newLine = cg.appendToLine(newLine, pVals[CID][1], 15)
            newLines.append(newLine)

    # The read handle is closed before the same path is reopened for writing.
    with open(predName, 'w') as predFile:
        predFile.writelines(newLines)
コード例 #24
0
def updateReadDensity(tType):
    """Record per-prediction and per-cluster peak read density in a results file.

    tType -- 'E' for conf.conf['resultsExons'], 'I' for
             conf.conf['resultsIntrons'].  Any other value prints
             'READ UPDATE FAIL' and raises ValueError (previously the function
             carried on and failed later on an unbound file name).

    Pass 1: for every '.wig' file under the configured wig folder, build a
    position -> value map from rows whose value is > 0, then for every results
    line whose coordinate (column 1, 'chrom:strand:start:end') matches the
    chromosome and strand parsed from the wig file name, store the highest
    value found in [start, end) through cg.appendToLine(..., 11).  The results
    file is overwritten with those lines.

    Pass 2: the highest column-11 value within each cluster (column 7) is
    stored through cg.appendToLine(..., 12), the lines are ordered by integer
    cluster ID (original relative order kept inside a cluster), and the file is
    overwritten again.

    NOTE(review): a results line whose chrom/strand matches no wig file is not
    carried into the rewritten file, and one matching several wig files is
    emitted once per match.  This mirrors the original behaviour -- confirm it
    is intended.
    """
    #go through wig each chromosome and check the mature seqs
    mainConf = cgConfig.cgConfig('Main.conf')
    conf = cgConfig.cgConfig()
    organism = conf.conf['organism']
    wigFolder = mainConf.conf['wig%s' % organism]

    if tType == 'E':
        pFileName = conf.conf['resultsExons']
    elif tType == 'I':
        pFileName = conf.conf['resultsIntrons']
    else:
        print('READ UPDATE FAIL')
        raise ValueError("tType must be 'E' or 'I', got %r" % (tType,))

    print('  Updating Read Density: %s' % tType)

    newLines = []
    for wigFileN in cg.recurseDir(wigFolder, end = '.wig'):

        #init -- chromosome and strand are encoded in the dot-separated file name
        nameParts = wigFileN.strip().split('.')
        chrom = nameParts[-2]
        strand = nameParts[-4]
        print(wigFileN)

        print('  populating hitmap')
        #populate hitmap
        wigMap = {}
        with open(wigFileN, 'r') as wigFile:
            #get rid of header
            wigFile.readline()
            for line in wigFile:
                fields = line.strip().split('\t')
                # values may be written like '12.0'; only the integer part is used
                value = int(fields[3].split('.')[0])
                if value > 0:
                    for i in range(int(fields[1]), int(fields[2])):
                        wigMap[i] = value

        print('  calculating hits for mature seqs')
        #calculate highest hit per mature
        with open(pFileName, 'r') as mirFile:
            for line in mirFile:
                tccParts = line.strip().split('\t')[1].split(':')
                if tccParts[0] != chrom or tccParts[1] != strand:
                    continue

                highestHit = 0
                for i in range(int(tccParts[2]), int(tccParts[3])):
                    # only positive values are stored, so 0 is a safe default
                    hit = wigMap.get(i, 0)
                    if hit > highestHit:
                        highestHit = hit

                newLines.append(cg.appendToLine(line, str(highestHit), 11))

    print('Writing New File')
    #write new results file
    with open(pFileName, 'w') as outFile:
        outFile.writelines(newLines)

    ####NOW UPDATE HIGHEST HIT PER CLUSTER####

    with open(pFileName, 'r') as pFile:
        densityLines = pFile.readlines()

    clusterCount = {}  # CID: highest column-11 value in the cluster
    for line in densityLines:
        fields = line.strip().split('\t')
        predictionCount = int(fields[11])
        CID = fields[7]
        if CID not in clusterCount or clusterCount[CID] < predictionCount:
            clusterCount[CID] = predictionCount

    #update the file --> cluster small count
    newLines = []
    for line in densityLines:
        CID = line.strip().split('\t')[7]
        newLines.append(cg.appendToLine(line, str(clusterCount[CID]), 12))

    #sort newLines by clusterID; list.sort is stable, so lines that share a
    #cluster keep their original relative order
    newLines.sort(key=lambda ln: int(ln.strip().split('\t')[7]))

    #write new File
    with open(pFileName, 'w') as newFile:
        newFile.writelines(newLines)
コード例 #25
0
ファイル: filtering.py プロジェクト: JasonAng/ResearchScripts
def filterOutTargets(resultsFN, centerFN, mismatchFN, targetFN, tranCheck, mPick, cPick, minCenterLevel, inputPosition, updatePosition, outFN):
    '''Filter the target list of every results line and write surviving lines to outFN.

    resultsFN      -- tab-separated results; column 0 is an integer sID and
                      column inputPosition is a comma-separated list of integer tIDs.
    centerFN       -- tab-separated "sID, tID, values..."; the value used is
                      column 2 + cPick, read as float.
    mismatchFN     -- tab-separated "sID, tID, values..."; the value used is
                      column 2 + mPick, read as int.
    targetFN       -- tab-separated; column 0 is tID, column 3 the transcript flag.
    tranCheck      -- when true, targets whose transcript flag is '0' are dropped.
    mPick, cPick   -- integer column offsets (original note: "Pick is the range
                      in which you need the values for. 0 is 4bp around, 1 is 6...").
    minCenterLevel -- targets whose center value is below this are dropped.
    inputPosition, updatePosition -- column indexes; converted with int().
    outFN          -- output path, overwritten.

    A target is also dropped when its mismatch value equals 1.  Lines left with
    no targets are omitted; the others get the filtered list stored through
    bioLibCG.appendToLine(line, targets, updatePosition).

    Raises KeyError when an (sID, tID) pair is missing from the mismatch or
    center data (or a tID from the target data while tranCheck is set), and
    ValueError/IndexError on malformed rows.
    '''

    #make mismatch dict
    mmDict = {} # sID: tID: mmVal
    with open(mismatchFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            sID = int(ls[0])
            tID = int(ls[1])
            mmVal = int(ls[2 + mPick])
            mmDict.setdefault(sID, {})[tID] = mmVal

    #make center dict
    centerDict = {} # sID: tID: centerVal
    with open(centerFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            sID = int(ls[0])
            tID = int(ls[1])
            centerVal = float(ls[2 + cPick])
            centerDict.setdefault(sID, {})[tID] = centerVal

    #make transcript target dict
    tranVals = {} # tID: tranValue (kept as the raw string)
    with open(targetFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tranVals[int(ls[0])] = ls[3]

    #go through and test all the targets.
    newLines = []
    with open(resultsFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            sID = int(ls[0])
            targets = ls[int(inputPosition)].strip().split(',')

            newTargetList = []
            for tID in targets:
                tID = int(tID)

                #Check for inside transcript
                if tranCheck and tranVals[tID] == '0':
                    continue

                #check mismatches
                if mmDict[sID][tID] == 1:
                    continue

                #check center Expression
                if centerDict[sID][tID] < minCenterLevel:
                    continue

                newTargetList.append(str(tID))

            # a line with no surviving target is left out of the output
            if not newTargetList:
                continue

            #update newLines
            newTargets = ','.join(newTargetList)
            newLines.append(bioLibCG.appendToLine(line, newTargets, int(updatePosition)))

    #update file
    with open(outFN, 'w') as f:
        f.writelines(newLines)
コード例 #26
0
ファイル: filtering.py プロジェクト: sknyx/ResearchScripts
def filterOutTargets(resultsFN, centerFN, mismatchFN, targetFN, tranCheck,
                     mPick, cPick, minCenterLevel, inputPosition,
                     updatePosition, outFN):
    '''Keep only the targets of each results line that pass every filter.

    Inputs are tab-separated text files:
      mismatchFN -- sID, tID, then values; column 2 + mPick is read as int.
      centerFN   -- sID, tID, then values; column 2 + cPick is read as float.
      targetFN   -- tID in column 0, transcript flag in column 3.
      resultsFN  -- sID in column 0; column inputPosition holds a
                    comma-separated list of tIDs.

    mPick / cPick are integer column offsets (original note: "Pick is the
    range in which you need the values for. 0 is 4bp around, 1 is 6...").

    A target is removed when
      * tranCheck is true and its transcript flag is '0', or
      * its mismatch value equals 1, or
      * its center value is below minCenterLevel.

    Lines with at least one remaining target are written to outFN with the
    filtered list stored through
    bioLibCG.appendToLine(line, targets, int(updatePosition)); all other lines
    are omitted.

    Raises KeyError for ids missing from the lookup files and
    ValueError/IndexError for malformed rows.
    '''

    def loadPairValues(fileName, column, convert):
        # Build {sID: {tID: convert(row[column])}} from one lookup file.
        table = {}
        with open(fileName, 'r') as handle:
            for row in handle:
                cells = row.strip().split('\t')
                sID = int(cells[0])
                tID = int(cells[1])
                value = convert(cells[column])
                table.setdefault(sID, {})[tID] = value
        return table

    #make mismatch dict
    mmDict = loadPairValues(mismatchFN, 2 + mPick, int)  # sID: tID: mmVal

    #make center dict
    centerDict = loadPairValues(centerFN, 2 + cPick, float)  # sID: tID: centerVal

    #make transcript target dict
    tranVals = {}  # tID: tranValue (raw string)
    with open(targetFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tranVals[int(ls[0])] = ls[3]

    #go through and test all the targets.
    newLines = []
    with open(resultsFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            sID = int(ls[0])

            newTargetList = []
            for rawID in ls[int(inputPosition)].strip().split(','):
                tID = int(rawID)

                #Check for inside transcript
                if tranCheck and tranVals[tID] == '0':
                    continue

                #check mismatches
                if mmDict[sID][tID] == 1:
                    continue

                #check center Expression
                if centerDict[sID][tID] < minCenterLevel:
                    continue

                newTargetList.append(str(tID))

            # nothing survived: the whole line is dropped from the output
            if len(newTargetList) < 1:
                continue

            #update newLines
            newLines.append(
                bioLibCG.appendToLine(line, ','.join(newTargetList),
                                      int(updatePosition)))

    #update file
    with open(outFN, 'w') as f:
        f.writelines(newLines)
コード例 #27
0
def findPeaks(pType, cName = None):
	'''Pick the best peak for each cluster hairpin and record it in the prediction file.

	pType -- 'E' selects conf.conf['resultsExonsSorted']; any other value
	         selects conf.conf['resultsIntronsSorted'].
	cName -- passed to c.getConfig, cgPeaks.stretch and
	         stepVectorScan.profileAroundPoint.

	For every CID returned by getHairpins.getHairpins, the peaks inside the
	hairpin coordinate are collected; around each peak a profile is scanned
	for the longest run of values above 0.80 (the "roof").  The candidate
	whose roof length is between 15 and 25 and closest to 22, with a drop
	value strictly between 0.0 and 0.2, is kept.

	Side effects: prints progress and timing, and rewrites the prediction
	file in place with a peak summary (or 'None') stored through
	cg.appendToLine(line, peakInfo, 13).  Returns nothing.
	'''
	
	#init
	# NOTE(review): mConf is never read in this function.
	mConf = c.cgConfig('Main.conf')
	conf = c.getConfig(cName)

	if pType == 'E':
		predName = conf.conf['resultsExonsSorted']
	else:
		predName = conf.conf['resultsIntronsSorted']
	
	print predName
	#make CID:hairpin:peak dictionary
	# peakDict[CID] = [hairpin coordinate, best combo]; the string 'None' is the
	# "no combo chosen" sentinel tested again when the file is rewritten.
	cHairs = getHairpins.getHairpins(predName)
	peakDict = {}
	for CID in cHairs:
		peakDict[CID] = [cHairs[CID],'None']
		

	timer = cg.cgTimer()
	timer.start()

	#put peaks in memory
	print 'Creating peak data'
	peaks = {} # chr:peak:value
	for CID in cHairs:
		# only chrom and strand are used from this split; start/end are unused here
		chrom, strand, start, end = cg.tccSplit(cHairs[CID])
		tcc = cHairs[CID]
		
		#init dictionary
		if chrom not in peaks:
			peaks[chrom] = {}
		
		if strand not in peaks[chrom]:
			peaks[chrom][strand] = {}
		
		#create peaks for tcc and add to peak dictionary
		stretch = cgPeaks.stretch(tcc, cName)
		stretch.createPeaks()
		for peakCoord in stretch.peaks:
			# the stored value is a placeholder; only key membership is tested later
			peaks[chrom][strand][peakCoord] = 0
	print timer.split()

	print 'finding best combos'
	bestCombos = []
	# NOTE(review): aPass, bPass, cPass and numT are assigned but never read.
	aPass = 0
	bPass = 0
	cPass = 0
	numT = 0
	for CID in peakDict:
		# NOTE(review): cgFlag looks like a leftover debug switch; it is never read.
		cgFlag = False
		if CID == '538':cgFlag = True
		
		tcc = peakDict[CID][0]
		#print tcc
		tccPeaks = []
		chrom = cg.ss(tcc, ':')[0]
		strand = cg.ss(tcc, ':')[1]
		start = int(cg.ss(tcc, ':')[2])
		end = int(cg.ss(tcc, ':')[3])
		
		#get all peaks
		for i in range(start, end + 1):
			if i in peaks[chrom][strand]:
				#print '  peak added', i
				tccPeaks.append(i)
		
		#Calculate parameters...
		# NOTE(review): pairStrings is never read.
		pairStrings = [] #used to check if pair already added
		peakCombos = []
		for x in tccPeaks:
				
								
				#scan a 30 bp range around this point and find the best roof...
				pRange = 30
				rTcc = cg.makeTcc(chrom, strand, x, x + 1)
				
				#quickly get max value...kinda a long way to do it but whatever
				cProfile = stepVectorScan.profileAroundPoint(rTcc, 1, cName, ratio = False)
				xval = cProfile[0]
				# NOTE(review): 'max' shadows the builtin for the rest of the function;
				# highestValueCoord is never read.
				max = xval
				highestValueCoord = x
				
				#now make profile for roof...
				# presumably indexable by offsets from 1 - pRange to pRange - 1,
				# given how it is indexed below -- TODO confirm against stepVectorScan
				cProfile = stepVectorScan.profileAroundPoint(rTcc, pRange, cName, ratio = True)
				
				
				
				#now get highest stretch length and the rNext coord.
				# Walk the offsets, counting consecutive values above minVal.  A run is
				# only compared with the best so far when a value at or below minVal
				# ends it.
				# NOTE(review): a run still open at the last offset is never recorded.
				minVal = .80
				highest = 0
				stretch = 0
				startCurrent = None
				startFinal = None
				endFinal = None
				for i in range(1 - pRange, pRange):
					if cProfile[i] > minVal:
						stretch += 1
						if startCurrent == None:
							startCurrent = i
					else:
						if stretch > 0:
							if stretch > highest: #stretch ended and was higher than previous
								highest = stretch
								endFinal = i - 1
								startFinal = startCurrent
								startCurrent = None
							else:
								startCurrent = None
						stretch = 0
				
				#get +/- 4 value...
				# val holds the profile values 4 positions outside each end of the roof;
				# it stays [1.0, 1.0] when no roof was found or the window would leave
				# the scanned range.
				# NOTE(review): this truthiness test is also False when startFinal or
				# endFinal is the offset 0, so such roofs keep the default values.
				val = [1.0, 1.0]
				if (startFinal) and (endFinal):
					low = startFinal - 4
					high = endFinal + 4
					if low > (1 - pRange):
						if high < pRange:
							val[0] = float(cProfile[startFinal - 4])
							val[1] = float(cProfile[endFinal + 4])
				
				#fill in other details...
				# 'S' placeholders keep the combo layout:
				# [tcc, peak, y, dist, ratio, max value, roof length, [low val, high val]]
				y = 'S'
				dist = 'S'
				ratio = 'S'
				
				peakCombos.append([tcc,x,y,dist,ratio,max,highest,val])
				#print '  ', peakCombos[-1]
		
		#find best combo...
		topCombo = None
		for combo in peakCombos:
			roofLength = combo[6]
			# dropValue is the larger of the two edge values
			dropValue = combo[7][0]
			if combo[7][1] > dropValue:
				dropValue = combo[7][1]
			
			#print roofLength, dropValue
			if 14 < roofLength < 26:
				if 0.0 < dropValue < 0.2:
					#pick one with rooflength nearest 22 (this comment previously said 20):
					# ties keep the earlier combo because the comparison is strict
					if topCombo:
						if (math.fabs(22 - roofLength)) < (math.fabs(22 - topCombo[6])):
							topCombo = combo
					else:
						topCombo = combo
		
		if topCombo:
			peakDict[CID][1] = topCombo
			bestCombos.append(topCombo)
			print bestCombos[-1]
		else:
			#print 'None'
			pass

	print timer.split()


	#now update predFile (SLOT 13)
	# peakInfo fields, colon separated: last three characters of the peak
	# coordinate, 'S', combo[4] up to its first '.', max value, roof length,
	# and the [low, high] edge values.
	predFile = open(predName, 'r')
	newLines = []
	for line in predFile:
		CID = cg.ss(line)[7]
		if peakDict[CID][1] == 'None':
			peakInfo = 'None'
		else:
			peakInfo = '%s:%s:%s:%s:%s:%s' % (str(peakDict[CID][1][1])[-3:], 'S', str(peakDict[CID][1][4]).split('.')[0], peakDict[CID][1][5],peakDict[CID][1][6], peakDict[CID][1][7])
		newLines.append(cg.appendToLine(line, peakInfo, 13))
	predFile.close()

	predFile = open(predName, 'w')
	predFile.writelines(newLines)
	predFile.close()