def updateSignificant(resultsFN, simulationAverageFN, outFN):
    '''Flag every results row as significant ('SIG') or not ('NON').

    A row is flagged 'NON' when the number of comma-separated targets in
    its column 4 is lower than the simulated average recorded for the
    row's id (column 0).  Ids that do not occur in the simulation file
    are given an expected count of 0, so they are always 'SIG'.  The
    flag is inserted at position 8 through cg.appendToLine and all rows
    are written to outFN.

    resultsFN           -- tab-delimited results file (id in column 0,
                           comma-separated targets in column 4)
    simulationAverageFN -- tab-delimited file: id, average target count
    outFN               -- path of the file to write
    '''
    # id -> average number of targets observed in the simulations
    id_avgNum = {}
    with open(simulationAverageFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            id_avgNum[int(ls[0])] = float(ls[1])

    newLines = []
    with open(resultsFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            sID = int(ls[0])
            numTargets = float(len(ls[4].split(',')))
            numExpected = id_avgNum.get(sID, 0)

            sigFlag = 'NON' if numTargets < numExpected else 'SIG'
            newLines.append(cg.appendToLine(line, sigFlag, 8))

    # All input is read before the output is opened, so outFN may be the
    # same path as resultsFN.
    with open(outFN, 'w') as f:
        f.writelines(newLines)
def convertFile(fN, inFormat, outFormat, oFN = None):
    '''Rewrite the coordinate columns of fN from inFormat to outFormat.

    The layout of each format comes from returnOrderList(), whose result
    is used here as:
        [0]..[3]  column positions of chrom, strand, start, end
        [4]       strand style (0 -> '1'/'-1', anything else -> '+'/'-')
        [5]       field separator
        [6]       prefix put in front of one-character chromosome names

    fN        -- input file
    inFormat  -- format name describing fN
    outFormat -- format name of the file to produce
    oFN       -- output path; defaults to fN + '.' + outFormat
    '''
    iO = returnOrderList(inFormat)
    oO = returnOrderList(outFormat)

    # Honour an explicit output name; the default keeps the old location.
    if oFN is None:
        oFN = fN + '.' + outFormat

    newLines = []
    with open(fN, 'r') as f:
        for line in f:
            ls = line.strip().split(iO[5])
            chrom, strand, start, end = ls[iO[0]], ls[iO[1]], ls[iO[2]], ls[iO[3]]

            # switch to appropriate chromosome type if needed
            # NOTE(review): only one-character names get the prefix, so a
            # bare '10' would stay unprefixed -- confirm this is intended.
            if len(chrom) == 1:
                chrom = oO[6] + chrom

            # switch strand notation if need be
            isPlus = strand in ('1', '+')
            if oO[4] == 0:
                strand = '1' if isPlus else '-1'
            else:
                strand = '+' if isPlus else '-'

            # construct the new line field by field
            newLine = '\n'
            for value, position in ((chrom, oO[0]), (strand, oO[1]),
                                    (start, oO[2]), (end, oO[3])):
                newLine = cg.appendToLine(newLine, value, position, sep = oO[5])
            newLines.append(newLine)

    with open(oFN, 'w') as f:
        f.writelines(newLines)
def convertFile(fN, inFormat, outFormat, oFN=None):
    '''Rewrite the coordinate columns of fN from inFormat to outFormat.

    The layout of each format comes from returnOrderList(), whose result
    is used here as:
        [0]..[3]  column positions of chrom, strand, start, end
        [4]       strand style (0 -> '1'/'-1', anything else -> '+'/'-')
        [5]       field separator
        [6]       prefix put in front of one-character chromosome names

    fN        -- input file
    inFormat  -- format name describing fN
    outFormat -- format name of the file to produce
    oFN       -- output path; defaults to fN + '.' + outFormat
    '''
    iO = returnOrderList(inFormat)
    oO = returnOrderList(outFormat)

    # Honour an explicit output name; the default keeps the old location.
    if oFN is None:
        oFN = fN + '.' + outFormat

    newLines = []
    with open(fN, 'r') as f:
        for line in f:
            ls = line.strip().split(iO[5])
            chrom, strand, start, end = ls[iO[0]], ls[iO[1]], ls[iO[2]], ls[iO[3]]

            # switch to appropriate chromosome type if needed
            # NOTE(review): only one-character names get the prefix, so a
            # bare '10' would stay unprefixed -- confirm this is intended.
            if len(chrom) == 1:
                chrom = oO[6] + chrom

            # switch strand notation if need be
            isPlus = strand in ('1', '+')
            if oO[4] == 0:
                strand = '1' if isPlus else '-1'
            else:
                strand = '+' if isPlus else '-'

            # construct the new line field by field
            newLine = '\n'
            for value, position in ((chrom, oO[0]), (strand, oO[1]),
                                    (start, oO[2]), (end, oO[3])):
                newLine = cg.appendToLine(newLine, value, position, sep=oO[5])
            newLines.append(newLine)

    with open(oFN, 'w') as f:
        f.writelines(newLines)
def updateTargetsExpression(resultsFN, targetsFN, inputPosition, updatePosition, outFN):
    '''Add the max and the summed target expression to every results row.

    targetsFN supplies an integer expression level (column 2) per target
    id (column 0).  For each row of resultsFN the comma-separated target
    ids found at inputPosition are looked up; the highest level (never
    below 0) goes to updatePosition and the total to updatePosition + 1,
    both through cg.appendToLine.  The rows are written to outFN.
    '''
    # target id -> expression level
    expressionByTarget = {}
    handle = open(targetsFN, 'r')
    for row in handle:
        cols = row.strip().split('\t')
        expressionByTarget[int(cols[0])] = int(cols[2])
    handle.close()

    updated = []
    handle = open(resultsFN, 'r')
    for row in handle:
        idField = row.strip().split('\t')[int(inputPosition)]
        levels = [expressionByTarget[int(t)] for t in idField.strip().split(',')]

        # the leading 0 reproduces the floor the running maximum starts from
        peakLevel = max([0] + levels)
        totalLevel = sum(levels)

        withMax = cg.appendToLine(row, peakLevel, int(updatePosition))
        updated.append(
            cg.appendToLine(withMax, totalLevel, int(updatePosition) + 1))
    handle.close()

    out = open(outFN, 'w')
    out.writelines(updated)
    out.close()
def addFiller(fN, filler, zeroPosition, outFN):
    '''Put the same filler value at zeroPosition of every line of fN.

    filler is converted to a string, inserted into each line with
    bioLibCG.appendToLine, and the resulting lines are written to outFN.
    '''
    fillText = str(filler)

    src = open(fN, "r")
    padded = [bioLibCG.appendToLine(row, fillText, int(zeroPosition))
              for row in src]
    src.close()

    dst = open(outFN, "w")
    dst.writelines(padded)
    dst.close()
def addFiller(fN, filler, zeroPosition, outFN):
    '''Put the same filler value at zeroPosition of every line of fN.

    filler is converted to a string, inserted into each line with
    bioLibCG.appendToLine, and the resulting lines are written to outFN.
    '''
    fillText = str(filler)

    src = open(fN, 'r')
    padded = [bioLibCG.appendToLine(row, fillText, int(zeroPosition))
              for row in src]
    src.close()

    dst = open(outFN, 'w')
    dst.writelines(padded)
    dst.close()
def transcriptSetOverlapDegFileHitmap(degFile, runningChrom, runningStrand): geneSetFN = '/home/chrisgre/dataSources/known/Human/geneSets/ensemblAllTranscripts.tsv' allExons = cgGenes.createGeneSetFromFile(geneSetFN) transcriptTccs = [] for gene in allExons.set.values(): for transcript in gene.transcripts: transcriptTccs.append(transcript.tcc) #create hitmap coordSet = set() for tcc in transcriptTccs: chrom, strand, start, end = cg.tccSplit(tcc) if chrom != runningChrom: continue if strand != runningStrand: continue for i in range(start, end + 1): coordSet.add(i) #find overlapping degTccs print 'done creating hitmap' f = open(degFile, 'r') newLines = [] for line in f: ls = line.strip().split('\t') degTcc = cg.convertToAS(ls[1]) chrom, strand, start, end = cg.tccSplit(degTcc) if chrom != runningChrom: continue if strand != runningStrand: continue inTran = '0' for i in xrange(start, end + 1): if i in coordSet: inTran = '1' break #update newLines newLine = cg.appendToLine(line, inTran, 3) newLines.append(newLine) f.close() f = open(degFile + '.%s.%s' % (runningChrom, runningStrand), 'w') f.writelines(newLines) f.close()
def updateTargetsExpression(resultsFN, targetsFN, inputPosition, updatePosition, outFN):
    '''Add the max and the summed target expression to every results row.

    targetsFN supplies an integer expression level (column 2) per target
    id (column 0).  For each row of resultsFN the comma-separated target
    ids found at inputPosition are looked up; the highest level (never
    below 0) goes to updatePosition and the total to updatePosition + 1,
    both through cg.appendToLine.  The rows are written to outFN.
    '''
    # target id -> expression level
    expressionByTarget = {}
    handle = open(targetsFN, 'r')
    for row in handle:
        cols = row.strip().split('\t')
        expressionByTarget[int(cols[0])] = int(cols[2])
    handle.close()

    updated = []
    handle = open(resultsFN, 'r')
    for row in handle:
        idField = row.strip().split('\t')[int(inputPosition)]
        levels = [expressionByTarget[int(t)] for t in idField.strip().split(',')]

        # the leading 0 reproduces the floor the running maximum starts from
        peakLevel = max([0] + levels)
        totalLevel = sum(levels)

        withMax = cg.appendToLine(row, peakLevel, int(updatePosition))
        updated.append(
            cg.appendToLine(withMax, totalLevel, int(updatePosition) + 1))
    handle.close()

    out = open(outFN, 'w')
    out.writelines(updated)
    out.close()
def addFiller(fN, filler, zeroPosition, outFN):
    '''Put a filler value at zeroPosition of every line of fN.

    The filler is converted to a string.  When it equals 'ID' each line
    receives its own zero-based line number instead; otherwise every
    line receives the same filler.  Lines are updated with
    bioLibCG.appendToLine and written to outFN.
    '''
    filler = str(filler)
    useLineNumber = (filler == 'ID')

    src = open(fN, 'r')
    outRows = []
    for rowNum, row in enumerate(src):
        value = str(rowNum) if useLineNumber else filler
        outRows.append(bioLibCG.appendToLine(row, value, int(zeroPosition)))
    src.close()

    dst = open(outFN, 'w')
    dst.writelines(outRows)
    dst.close()
def transcriptSetOverlapDegFile(degFile, outFN=None):
    '''Mark every degradome row with whether it overlaps a transcript.

    The tcc in column 1 of each row is converted with cg.convertToAS
    (the antisense peak is the location of the targeted transcript).
    Rows whose converted tcc is reported by compare.compareTwoTcc as
    overlapping a transcript get '1' at position 3, all others '0'.

    degFile -- tab-delimited degradome file
    outFN   -- optional path; when given the updated rows are written
               there.  Nothing is written by default.

    Returns the list of updated rows.
    '''
    geneSetFN = '/home/chrisgre/dataSources/known/Human/geneSets/ensemblAllTranscripts.tsv'
    allExons = cgGenes.createGeneSetFromFile(geneSetFN)

    with open(degFile, 'r') as f:
        lines = f.readlines()

    # One converted tcc per row.  Keeping them aligned with the rows means
    # each row is tested against its own tcc rather than a value left over
    # from an earlier loop.
    degTccs = [cg.convertToAS(line.strip().split('\t')[1]) for line in lines]

    # find all overlapping transcripts, then all degradome tccs that
    # overlap those transcripts
    overlappingExons = allExons.transcriptOverlaps(degTccs)
    overlappingExonTccs = [x.tcc for x in overlappingExons]
    # assumes compareTwoTcc returns an iterable of hashable tccs -- TODO confirm
    overlappingDegTccs = set(
        compare.compareTwoTcc(degTccs, overlappingExonTccs, 1))

    newLines = []
    for line, degTcc in zip(lines, degTccs):
        inTran = '1' if degTcc in overlappingDegTccs else '0'
        newLines.append(cg.appendToLine(line, inTran, 3))

    if outFN is not None:
        with open(outFN, 'w') as f:
            f.writelines(newLines)

    return newLines
def updateReadDensity(tType, cName): #go through wig each chromosome and check the mature seqs mainConf = cgConfig.cgConfig('Main.conf') conf = cgConfig.getConfig(cName) organism = conf.conf['organism'] wigFolder = mainConf.conf['wig%s' % organism] newLines = [] #Differentiate between exon or intron... if tType == 'E': pFileName = conf.conf['resultsExons'] elif tType == 'I': pFileName = conf.conf['resultsIntrons'] else: print 'READ UPDATE FAIL' print ' Updating Read Density:', tType #get read density for each line... print ' calculating hits for mature seqs' #calculate total hits per mature mirFile = open(pFileName, 'r') for line in mirFile: mTcc = line.strip().split('\t')[1] mirID = line.strip().split('\t')[0] tccStretch = cgPeaks.stretch(mTcc, cName) highestHit = 0 for i in range(int(mTcc.split(':')[2]), int(mTcc.split(':')[3])): if i in tccStretch.profile: if tccStretch.profile[i] > highestHit: highestHit = tccStretch.profile[i] newLines.append(cg.appendToLine(line, str(highestHit), 11)) mirFile.close() print 'Writing New File' #write new results file outFile = open(pFileName, 'w') for line in newLines: outFile.write(line) outFile.close() ####NOW UPDATE HIGHEST HIT PER CLUSTER#### clusterCount = {} pFile = open(pFileName, 'r') for line in pFile: predictionCount = int(line.strip().split('\t')[11]) CID = line.strip().split('\t')[7] if CID in clusterCount: if clusterCount[CID] < predictionCount: clusterCount[CID] = predictionCount else: clusterCount[CID] = predictionCount pFile.close() #update the file --> cluster small count newLines = [] predFile = open(pFileName, 'r') for line in predFile: CID = line.strip().split('\t')[7] numMax = clusterCount[CID] newLines.append(cg.appendToLine(line, str(numMax), 12)) predFile.close() #sort newLines by clusterID sortDict = {} CIDs = [] for line in newLines: CID = int(line.strip().split('\t')[7]) if CID not in CIDs: CIDs.append(CID) if CID in sortDict: sortDict[CID].append(line) else: sortDict[CID] = [line] CIDs.sort() 
newLines = [] for CID in CIDs: for line in sortDict[CID]: newLines.append(line) #write new File newFile = open(pFileName, 'w') for line in newLines: newFile.write(line) newFile.close()
def updateReadDensity(tType):
    '''Add read-density columns to the exon or intron results file.

    For each .wig file found under the configured wig folder a
    position -> value map is built, and every results row whose tcc
    (column 1) is on that file's chromosome/strand gets the highest
    mapped value inside its range stored at position 11.  The highest
    such value per cluster (column 7) is then stored at position 12 and
    the rows are written back ordered by integer cluster id.

    tType -- 'E' selects conf 'resultsExons', 'I' selects
             'resultsIntrons'.

    NOTE(review): any other tType only prints 'READ UPDATE FAIL';
    pFileName is then unbound when it is first used.
    NOTE(review): the source indentation was lost; the nesting below
    (mirFile closed at the end of each wig-file iteration) is the most
    plausible reading -- confirm against the original file.
    '''
    #go through wig each chromosome and check the mature seqs
    mainConf = cgConfig.cgConfig('Main.conf')
    conf = cgConfig.cgConfig()
    organism = conf.conf['organism']
    wigFolder = mainConf.conf['wig%s' % organism]
    newLines = []

    if tType == 'E':
        pFileName = conf.conf['resultsExons']
    elif tType == 'I':
        pFileName = conf.conf['resultsIntrons']
    else:
        print 'READ UPDATE FAIL'

    print ' Updating Read Density:', tType

    for wigFileN in cg.recurseDir(wigFolder, end='.wig'):
        #init: chromosome and strand are taken from dot-separated parts
        #of the wig file name
        chrom = wigFileN.strip().split('.')[-2]
        strand = wigFileN.strip().split('.')[-4]
        wigFile = open(wigFileN, 'r')
        mirFile = open(pFileName, 'r')
        print wigFileN

        #get rid of header
        wigFile.readline()

        print ' populating hitmap'
        #populate hitmap: every position in [start, end) of a record with
        #a positive value maps to that value (integer part of column 3)
        wigMap = {}
        for line in wigFile:
            value = int(line.strip().split('\t')[3].split('.')[0])
            if value > 0:
                start = int(line.strip().split('\t')[1])
                end = int(line.strip().split('\t')[2])
                for i in range(start, end):
                    wigMap[i] = value
        wigFile.close()

        print ' calculating hits for mature seqs'
        #highest hit per mature sequence; only rows on this file's
        #chrom/strand are appended, so rows matching no wig file never
        #reach newLines
        for line in mirFile:
            mTcc = line.strip().split('\t')[1]
            mirID = line.strip().split('\t')[0]
            if (mTcc.split(':')[0] == chrom) and (mTcc.split(':')[1] == strand):
                highestHit = 0
                for i in range(int(mTcc.split(':')[2]), int(mTcc.split(':')[3])):
                    if i in wigMap:
                        if wigMap[i] > highestHit:
                            highestHit = wigMap[i]
                newLines.append(cg.appendToLine(line, str(highestHit), 11))
        mirFile.close()

    print 'Writing New File'
    #write new results file
    outFile = open(pFileName, 'w')
    for line in newLines:
        outFile.write(line)
    outFile.close()

    ####NOW UPDATE HIGHEST HIT PER CLUSTER####
    #CID (string) -> highest position-11 value seen in that cluster
    clusterCount = {}
    pFile = open(pFileName, 'r')
    for line in pFile:
        predictionCount = int(line.strip().split('\t')[11])
        CID = line.strip().split('\t')[7]
        if CID in clusterCount:
            if clusterCount[CID] < predictionCount:
                clusterCount[CID] = predictionCount
        else:
            clusterCount[CID] = predictionCount
    pFile.close()

    #update the file --> cluster maximum goes to position 12
    newLines = []
    predFile = open(pFileName, 'r')
    for line in predFile:
        CID = line.strip().split('\t')[7]
        numMax = clusterCount[CID]
        newLines.append(cg.appendToLine(line, str(numMax), 12))
    predFile.close()

    #sort newLines by clusterID (as integer), keeping file order inside
    #each cluster
    sortDict = {}
    CIDs = []
    for line in newLines:
        CID = int(line.strip().split('\t')[7])
        if CID not in CIDs:
            CIDs.append(CID)
        if CID in sortDict:
            sortDict[CID].append(line)
        else:
            sortDict[CID] = [line]
    CIDs.sort()
    newLines = []
    for CID in CIDs:
        for line in sortDict[CID]:
            newLines.append(line)

    #write new File
    newFile = open(pFileName, 'w')
    for line in newLines:
        newFile.write(line)
    newFile.close()
print ' calculating hits for mature seqs' #calculate total hits per mature for line in mirFile: mTcc = line.strip().split('\t')[1] mirID = line.strip().split('\t')[0] if (mTcc.split(':')[0] == chrom) and (mTcc.split(':')[1] == strand): #if mirID == '26477.30.106643972': print 'Starting Total Count' totalHits = 0 for i in range(int(mTcc.split(':')[2]), int(mTcc.split(':')[3])): #if mirID == '26477.30.106643972': print ' ', i if i in wigMap: totalHits += wigMap[i] #if mirID == '26477.30.106643972': print ' ', i, totalHits, wigMap[i] newLines.append(cg.appendToLine(line, str(totalHits), 11)) mirFile.close() print 'Writing New File' #write new results file outFile = open(pFileName, 'w') for line in newLines: outFile.write(line) outFile.close() ####NOW UPDATE MAX HITS PER CLUSTER#### clusterCount = {} pFile = open(pFileName, 'r')
print 'Total', numT print 'A', aPass print 'B', bPass print 'C', cPass #output results to a file for R outFile = open('mousePeaksResults.R.data', 'w') outFile.write('tcc\tpeakOne\tpeakTwo\tdistance\tratio\tmax\thRatio\n') for peak in bestCombos: outFile.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (peak[0], peak[1], peak[2], peak[3], peak[4], peak[5])) outFile.close() #now update predFile (SLOT 13) predFile = open(predName, 'r') newLines = [] for line in predFile: CID = cg.ss(line)[7] if peakDict[CID][1] == 'None': peakInfo = 'None' else: peakInfo = '%s:%s:%s:%s' % ( str(peakDict[CID][1][1])[-3:], str(peakDict[CID][1][2])[-3:], str(peakDict[CID][1][4]).split('.')[0], peakDict[CID][1][5]) newLines.append(cg.appendToLine(line, peakInfo, 13)) predFile.close() predFile = open(predName, 'w') predFile.writelines(newLines) predFile.close()
else: print 'None' print timer.split() #output results to a file for R outFile = open('mousePeaksResults.R.data', 'w') outFile.write('tcc\tpeakOne\tpeakTwo\tdistance\tratio\tmax\thRatio\n') for peak in bestCombos: outFile.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (peak[0], peak[1],peak[2],peak[3],peak[4],peak[5])) outFile.close() #now update predFile (SLOT 13) predFile = open(predName, 'r') newLines = [] for line in predFile: CID = cg.ss(line)[7] if peakDict[CID][1] == 'None': peakInfo = 'None' else: peakInfo = '%s:%s:%s:%s' % (str(peakDict[CID][1][1])[-3:], str(peakDict[CID][1][2])[-3:], str(peakDict[CID][1][4]).split('.')[0], peakDict[CID][1][5]) newLines.append(cg.appendToLine(line, peakInfo, 13)) predFile.close() predFile = open(predName, 'w') predFile.writelines(newLines) predFile.close()
ensGenes = cgGenes.createGeneSetFromFile(geneSetFolder + '/ensemblAllTranscripts.tsv') cDesc = {} #CID:gDesc for CID in cHairs: tcc = cHairs[CID] cDesc[CID] = "NONE" overlappingGenes = ensGenes.geneOverlaps([tcc]) if len(overlappingGenes) > 0: print overlappingGenes[0].type cDesc[CID] = overlappingGenes[0].type f = open(fN, 'r') newLines = [] for line in f: CID = line.strip().split('\t')[7] newLines.append(cg.appendToLine(line, cDesc[CID], 16)) f.close() f = open(fN + '.FINAL', 'w') f.writelines(newLines) f.close()
mConf = c.getConfig('Main.conf') geneSetFolder = mConf.conf['geneSetsHuman'] fN = '/home/chrisgre/projects/NoncodingHuman/results/NChuman-s3k8b17.results.sorted.introns.sorted' cHairs = getHairpins.getHairpins(fN) ensGenes = cgGenes.createGeneSetFromFile(geneSetFolder + '/ensemblAllTranscripts.tsv') cDesc = {} #CID:gDesc for CID in cHairs: tcc = cHairs[CID] cDesc[CID] = "NONE" overlappingGenes = ensGenes.geneOverlaps([tcc]) if len(overlappingGenes) > 0: print overlappingGenes[0].type cDesc[CID] = overlappingGenes[0].type f = open(fN, 'r') newLines = [] for line in f: CID = line.strip().split('\t')[7] newLines.append(cg.appendToLine(line, cDesc[CID], 16)) f.close() f = open(fN + '.FINAL', 'w') f.writelines(newLines) f.close()
def findPeaks(pType, cName=None): #init mConf = c.cgConfig('Main.conf') conf = c.getConfig(cName) if pType == 'E': predName = conf.conf['resultsExonsSorted'] else: predName = conf.conf['resultsIntronsSorted'] print predName #make CID:hairpin:peak dictionary cHairs = getHairpins.getHairpins(predName) peakDict = {} for CID in cHairs: peakDict[CID] = [cHairs[CID], 'None'] timer = cg.cgTimer() timer.start() #put peaks in memory print 'Creating peak data' peaks = {} # chr:peak:value for CID in cHairs: chrom, strand, start, end = cg.tccSplit(cHairs[CID]) tcc = cHairs[CID] #init dictionary if chrom not in peaks: peaks[chrom] = {} if strand not in peaks[chrom]: peaks[chrom][strand] = {} #create peaks for tcc and add to peak dictionary stretch = cgPeaks.stretch(tcc, cName) stretch.createPeaks() for peakCoord in stretch.peaks: peaks[chrom][strand][peakCoord] = 0 print timer.split() print 'finding best combos' bestCombos = [] aPass = 0 bPass = 0 cPass = 0 numT = 0 for CID in peakDict: cgFlag = False if CID == '538': cgFlag = True tcc = peakDict[CID][0] #print tcc tccPeaks = [] chrom = cg.ss(tcc, ':')[0] strand = cg.ss(tcc, ':')[1] start = int(cg.ss(tcc, ':')[2]) end = int(cg.ss(tcc, ':')[3]) #get all peaks for i in range(start, end + 1): if i in peaks[chrom][strand]: #print ' peak added', i tccPeaks.append(i) #Calculate parameters... pairStrings = [] #used to check if pair already added peakCombos = [] for x in tccPeaks: #scan a 30 bp range around this point and find the best roof... pRange = 30 rTcc = cg.makeTcc(chrom, strand, x, x + 1) #quickly get max value...kinda a long way to do it but whatever cProfile = stepVectorScan.profileAroundPoint(rTcc, 1, cName, ratio=False) xval = cProfile[0] max = xval highestValueCoord = x #now make profile for roof... cProfile = stepVectorScan.profileAroundPoint(rTcc, pRange, cName, ratio=True) #now get highest stretch length and the rNext coord. 
minVal = .80 highest = 0 stretch = 0 startCurrent = None startFinal = None endFinal = None for i in range(1 - pRange, pRange): if cProfile[i] > minVal: stretch += 1 if startCurrent == None: startCurrent = i else: if stretch > 0: if stretch > highest: #stretch ended and was higher than previous highest = stretch endFinal = i - 1 startFinal = startCurrent startCurrent = None else: startCurrent = None stretch = 0 #get +/- 4 value... val = [1.0, 1.0] if (startFinal) and (endFinal): low = startFinal - 4 high = endFinal + 4 if low > (1 - pRange): if high < pRange: val[0] = float(cProfile[startFinal - 4]) val[1] = float(cProfile[endFinal + 4]) #fill in other details... y = 'S' dist = 'S' ratio = 'S' peakCombos.append([tcc, x, y, dist, ratio, max, highest, val]) #print ' ', peakCombos[-1] #find best combo... topCombo = None for combo in peakCombos: roofLength = combo[6] dropValue = combo[7][0] if combo[7][1] > dropValue: dropValue = combo[7][1] #print roofLength, dropValue if 14 < roofLength < 26: if 0.0 < dropValue < 0.2: #pick one with rooflength nearest 20: if topCombo: if (math.fabs(22 - roofLength)) < ( math.fabs(22 - topCombo[6])): topCombo = combo else: topCombo = combo if topCombo: peakDict[CID][1] = topCombo bestCombos.append(topCombo) print bestCombos[-1] else: #print 'None' pass print timer.split() #now update predFile (SLOT 13) predFile = open(predName, 'r') newLines = [] for line in predFile: CID = cg.ss(line)[7] if peakDict[CID][1] == 'None': peakInfo = 'None' else: peakInfo = '%s:%s:%s:%s:%s:%s' % ( str(peakDict[CID][1][1])[-3:], 'S', str( peakDict[CID][1][4]).split('.')[0], peakDict[CID][1][5], peakDict[CID][1][6], peakDict[CID][1][7]) newLines.append(cg.appendToLine(line, peakInfo, 13)) predFile.close() predFile = open(predName, 'w') predFile.writelines(newLines) predFile.close()
def updateNoise(pType, cName=None): #init mainConf = c.cgConfig('Main.conf') conf = c.getConfig(cName) if pType == 'E': predName = conf.conf['resultsExons'] else: predName = conf.conf['resultsIntrons'] #populate cid: exon dist print 'Populating CID/INtron/exon distribution data' if pType == 'E': noiseFN = conf.conf['exonNoiseData'] f = open(noiseFN, 'r') else: noiseFN = conf.conf['intronNoiseData'] f = open(noiseFN, 'r') exonDists = {} #cid: [exon dist] header = f.readline() order = {} # num:CID for i, CID in enumerate(header.strip().split('\t')): order[i] = CID exonDists[CID] = [] for line in f: data = line.strip().split('\t') for i, dataPoint in enumerate(data): if dataPoint == 'NA' or dataPoint == '': continue else: dataPoint = float(dataPoint) CID = order[i] exonDists[CID].append(dataPoint) #get highest expression level for each cluster print 'Populating highest expression levels' predExpression = {} # CID; highest level exonFile = open(predName, 'r') for line in exonFile: CID = line.strip().split('\t')[7] hDensity = line.strip().split('\t')[12] predExpression[CID] = hDensity #get pVals for each CID print 'Getting pvals for each cluster' pVals = {} # CID; [lam,pVal] for CID in exonDists: if not len(exonDists[CID]) > 0: #no data in 2kb range. lam = 'NA' pVal = 'NA' else: lam = cgStats.getLam(exonDists[CID]) pVal = cgStats.getPValExp(predExpression[CID], lam) pVals[CID] = [ lam, pVal ] #lam gives a good approximation of noise levels in region... print 'Updating the file' #update file... predFile = open(predName, 'r') newLines = [] for line in predFile: CID = line.split('\t')[7] newLine = cg.appendToLine(line, pVals[CID][0], 14) newLine = cg.appendToLine(newLine, pVals[CID][1], 15) newLines.append(newLine) predFile.close() predFile = open(predName, 'w') predFile.writelines(newLines) predFile.close()
def updateReadDensity(tType):
    '''Add read-density columns to the exon or intron results file.

    For each .wig file found under the configured wig folder a
    position -> value map is built, and every results row whose tcc
    (column 1) is on that file's chromosome/strand gets the highest
    mapped value inside its range stored at position 11.  The highest
    such value per cluster (column 7) is then stored at position 12 and
    the rows are written back ordered by integer cluster id.

    tType -- 'E' selects conf 'resultsExons', 'I' selects
             'resultsIntrons'.

    NOTE(review): any other tType only prints 'READ UPDATE FAIL';
    pFileName is then unbound when it is first used.
    NOTE(review): the source indentation was lost; the nesting below
    (mirFile closed at the end of each wig-file iteration) is the most
    plausible reading -- confirm against the original file.
    '''
    #go through wig each chromosome and check the mature seqs
    mainConf = cgConfig.cgConfig('Main.conf')
    conf = cgConfig.cgConfig()
    organism = conf.conf['organism']
    wigFolder = mainConf.conf['wig%s' % organism]
    newLines = []

    if tType == 'E':
        pFileName = conf.conf['resultsExons']
    elif tType == 'I':
        pFileName = conf.conf['resultsIntrons']
    else:
        print 'READ UPDATE FAIL'

    print ' Updating Read Density:', tType

    for wigFileN in cg.recurseDir(wigFolder, end = '.wig'):
        #init: chromosome and strand are taken from dot-separated parts
        #of the wig file name
        chrom = wigFileN.strip().split('.')[-2]
        strand = wigFileN.strip().split('.')[-4]
        wigFile = open(wigFileN, 'r')
        mirFile = open(pFileName, 'r')
        print wigFileN

        #get rid of header
        wigFile.readline()

        print ' populating hitmap'
        #populate hitmap: every position in [start, end) of a record with
        #a positive value maps to that value (integer part of column 3)
        wigMap = {}
        for line in wigFile:
            value = int(line.strip().split('\t')[3].split('.')[0])
            if value > 0:
                start = int(line.strip().split('\t')[1])
                end = int(line.strip().split('\t')[2])
                for i in range(start, end):
                    wigMap[i] = value
        wigFile.close()

        print ' calculating hits for mature seqs'
        #highest hit per mature sequence; only rows on this file's
        #chrom/strand are appended, so rows matching no wig file never
        #reach newLines
        for line in mirFile:
            mTcc = line.strip().split('\t')[1]
            mirID = line.strip().split('\t')[0]
            if (mTcc.split(':')[0] == chrom) and (mTcc.split(':')[1] == strand):
                highestHit = 0
                for i in range(int(mTcc.split(':')[2]), int(mTcc.split(':')[3])):
                    if i in wigMap:
                        if wigMap[i] > highestHit:
                            highestHit = wigMap[i]
                newLines.append(cg.appendToLine(line, str(highestHit), 11))
        mirFile.close()

    print 'Writing New File'
    #write new results file
    outFile = open(pFileName, 'w')
    for line in newLines:
        outFile.write(line)
    outFile.close()

    ####NOW UPDATE HIGHEST HIT PER CLUSTER####
    #CID (string) -> highest position-11 value seen in that cluster
    clusterCount = {}
    pFile = open(pFileName, 'r')
    for line in pFile:
        predictionCount = int(line.strip().split('\t')[11])
        CID = line.strip().split('\t')[7]
        if CID in clusterCount:
            if clusterCount[CID] < predictionCount:
                clusterCount[CID] = predictionCount
        else:
            clusterCount[CID] = predictionCount
    pFile.close()

    #update the file --> cluster maximum goes to position 12
    newLines = []
    predFile = open(pFileName, 'r')
    for line in predFile:
        CID = line.strip().split('\t')[7]
        numMax = clusterCount[CID]
        newLines.append(cg.appendToLine(line, str(numMax), 12))
    predFile.close()

    #sort newLines by clusterID (as integer), keeping file order inside
    #each cluster
    sortDict = {}
    CIDs = []
    for line in newLines:
        CID = int(line.strip().split('\t')[7])
        if CID not in CIDs:
            CIDs.append(CID)
        if CID in sortDict:
            sortDict[CID].append(line)
        else:
            sortDict[CID] = [line]
    CIDs.sort()
    newLines = []
    for CID in CIDs:
        for line in sortDict[CID]:
            newLines.append(line)

    #write new File
    newFile = open(pFileName, 'w')
    for line in newLines:
        newFile.write(line)
    newFile.close()
def filterOutTargets(resultsFN, centerFN, mismatchFN, targetFN, tranCheck, mPick, cPick, minCenterLevel, inputPosition, updatePosition, outFN):
    '''Keep only the targets of each sRNA that pass all filters.

    Pick is the range in which you need the values for. 0 is 4bp around,
    1 is 6...

    A target listed at inputPosition of a results row is dropped when
      * tranCheck is set and its transcript value (column 3 of targetFN)
        is '0',
      * its mismatch value (column 2 + mPick of mismatchFN) equals 1, or
      * its center value (column 2 + cPick of centerFN) is below
        minCenterLevel.
    The surviving ids are joined with ',' and stored at updatePosition;
    rows without any surviving target are left out.  The result is
    written to outFN.

    Raises KeyError when a (sRNA, target) pair or a target id is missing
    from the lookup files.
    '''
    #make mismatch dict
    mmDict = {}  # sID: tID: mmVal
    with open(mismatchFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            sID = int(ls[0])
            tID = int(ls[1])
            mmVal = int(ls[2 + mPick])
            mmDict.setdefault(sID, {})[tID] = mmVal

    #make center dict
    centerDict = {}  # sID: tID: centerVal
    with open(centerFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            sID = int(ls[0])
            tID = int(ls[1])
            centerVal = float(ls[2 + cPick])
            centerDict.setdefault(sID, {})[tID] = centerVal

    #make transcript target dict
    tranVals = {}  # tID: tranValue
    with open(targetFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tranVals[int(ls[0])] = ls[3]

    #go through and test all the targets.
    newLines = []
    with open(resultsFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            sID = int(ls[0])
            targets = ls[int(inputPosition)].strip().split(',')

            newTargetList = []
            for tID in targets:
                tID = int(tID)

                #Check for inside transcript
                if tranCheck and tranVals[tID] == '0':
                    continue

                #check mismatches
                if mmDict[sID][tID] == 1:
                    continue

                #check center Expression
                if centerDict[sID][tID] < minCenterLevel:
                    continue

                newTargetList.append(str(tID))

            if not newTargetList:
                continue

            newTargets = ','.join(newTargetList)
            newLines.append(bioLibCG.appendToLine(line, newTargets, int(updatePosition)))

    #update file
    with open(outFN, 'w') as f:
        f.writelines(newLines)
def filterOutTargets(resultsFN, centerFN, mismatchFN, targetFN, tranCheck, mPick, cPick, minCenterLevel, inputPosition, updatePosition, outFN):
    '''Keep only the targets of each sRNA that pass all filters.

    Pick is the range in which you need the values for. 0 is 4bp around,
    1 is 6...

    A target listed at inputPosition of a results row is dropped when
      * tranCheck is set and its transcript value (column 3 of targetFN)
        is '0',
      * its mismatch value (column 2 + mPick of mismatchFN) equals 1, or
      * its center value (column 2 + cPick of centerFN) is below
        minCenterLevel.
    The surviving ids are joined with ',' and stored at updatePosition;
    rows without any surviving target are left out.  The result is
    written to outFN.

    Raises KeyError when a (sRNA, target) pair or a target id is missing
    from the lookup files.
    '''
    #make mismatch dict
    mmDict = {}  # sID: tID: mmVal
    with open(mismatchFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            sID = int(ls[0])
            tID = int(ls[1])
            mmVal = int(ls[2 + mPick])
            mmDict.setdefault(sID, {})[tID] = mmVal

    #make center dict
    centerDict = {}  # sID: tID: centerVal
    with open(centerFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            sID = int(ls[0])
            tID = int(ls[1])
            centerVal = float(ls[2 + cPick])
            centerDict.setdefault(sID, {})[tID] = centerVal

    #make transcript target dict
    tranVals = {}  # tID: tranValue
    with open(targetFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            tranVals[int(ls[0])] = ls[3]

    #go through and test all the targets.
    newLines = []
    with open(resultsFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            sID = int(ls[0])
            targets = ls[int(inputPosition)].strip().split(',')

            newTargetList = []
            for tID in targets:
                tID = int(tID)

                #Check for inside transcript
                if tranCheck and tranVals[tID] == '0':
                    continue

                #check mismatches
                if mmDict[sID][tID] == 1:
                    continue

                #check center Expression
                if centerDict[sID][tID] < minCenterLevel:
                    continue

                newTargetList.append(str(tID))

            if not newTargetList:
                continue

            newTargets = ','.join(newTargetList)
            newLines.append(
                bioLibCG.appendToLine(line, newTargets, int(updatePosition)))

    #update file
    with open(outFN, 'w') as f:
        f.writelines(newLines)
def findPeaks(pType, cName = None): #init mConf = c.cgConfig('Main.conf') conf = c.getConfig(cName) if pType == 'E': predName = conf.conf['resultsExonsSorted'] else: predName = conf.conf['resultsIntronsSorted'] print predName #make CID:hairpin:peak dictionary cHairs = getHairpins.getHairpins(predName) peakDict = {} for CID in cHairs: peakDict[CID] = [cHairs[CID],'None'] timer = cg.cgTimer() timer.start() #put peaks in memory print 'Creating peak data' peaks = {} # chr:peak:value for CID in cHairs: chrom, strand, start, end = cg.tccSplit(cHairs[CID]) tcc = cHairs[CID] #init dictionary if chrom not in peaks: peaks[chrom] = {} if strand not in peaks[chrom]: peaks[chrom][strand] = {} #create peaks for tcc and add to peak dictionary stretch = cgPeaks.stretch(tcc, cName) stretch.createPeaks() for peakCoord in stretch.peaks: peaks[chrom][strand][peakCoord] = 0 print timer.split() print 'finding best combos' bestCombos = [] aPass = 0 bPass = 0 cPass = 0 numT = 0 for CID in peakDict: cgFlag = False if CID == '538':cgFlag = True tcc = peakDict[CID][0] #print tcc tccPeaks = [] chrom = cg.ss(tcc, ':')[0] strand = cg.ss(tcc, ':')[1] start = int(cg.ss(tcc, ':')[2]) end = int(cg.ss(tcc, ':')[3]) #get all peaks for i in range(start, end + 1): if i in peaks[chrom][strand]: #print ' peak added', i tccPeaks.append(i) #Calculate parameters... pairStrings = [] #used to check if pair already added peakCombos = [] for x in tccPeaks: #scan a 30 bp range around this point and find the best roof... pRange = 30 rTcc = cg.makeTcc(chrom, strand, x, x + 1) #quickly get max value...kinda a long way to do it but whatever cProfile = stepVectorScan.profileAroundPoint(rTcc, 1, cName, ratio = False) xval = cProfile[0] max = xval highestValueCoord = x #now make profile for roof... cProfile = stepVectorScan.profileAroundPoint(rTcc, pRange, cName, ratio = True) #now get highest stretch length and the rNext coord. 
minVal = .80 highest = 0 stretch = 0 startCurrent = None startFinal = None endFinal = None for i in range(1 - pRange, pRange): if cProfile[i] > minVal: stretch += 1 if startCurrent == None: startCurrent = i else: if stretch > 0: if stretch > highest: #stretch ended and was higher than previous highest = stretch endFinal = i - 1 startFinal = startCurrent startCurrent = None else: startCurrent = None stretch = 0 #get +/- 4 value... val = [1.0, 1.0] if (startFinal) and (endFinal): low = startFinal - 4 high = endFinal + 4 if low > (1 - pRange): if high < pRange: val[0] = float(cProfile[startFinal - 4]) val[1] = float(cProfile[endFinal + 4]) #fill in other details... y = 'S' dist = 'S' ratio = 'S' peakCombos.append([tcc,x,y,dist,ratio,max,highest,val]) #print ' ', peakCombos[-1] #find best combo... topCombo = None for combo in peakCombos: roofLength = combo[6] dropValue = combo[7][0] if combo[7][1] > dropValue: dropValue = combo[7][1] #print roofLength, dropValue if 14 < roofLength < 26: if 0.0 < dropValue < 0.2: #pick one with rooflength nearest 20: if topCombo: if (math.fabs(22 - roofLength)) < (math.fabs(22 - topCombo[6])): topCombo = combo else: topCombo = combo if topCombo: peakDict[CID][1] = topCombo bestCombos.append(topCombo) print bestCombos[-1] else: #print 'None' pass print timer.split() #now update predFile (SLOT 13) predFile = open(predName, 'r') newLines = [] for line in predFile: CID = cg.ss(line)[7] if peakDict[CID][1] == 'None': peakInfo = 'None' else: peakInfo = '%s:%s:%s:%s:%s:%s' % (str(peakDict[CID][1][1])[-3:], 'S', str(peakDict[CID][1][4]).split('.')[0], peakDict[CID][1][5],peakDict[CID][1][6], peakDict[CID][1][7]) newLines.append(cg.appendToLine(line, peakInfo, 13)) predFile.close() predFile = open(predName, 'w') predFile.writelines(newLines) predFile.close()