def smallAnalyze(inFile = None):
    '''Print, for every id in inFile, how many lines each library contributed.

    Every line of inFile is split on ':'; field 0 is taken as the library
    and field 1 as the id.  Ids are printed in sorted order, each followed
    by one "library<TAB>count" line per library seen with that id.

    inFile -- path of the file to read.

    Returns 1 (after printing 'need inFile') when no inFile is given,
    otherwise None.  A line with no ':' raises IndexError.
    '''
    #Caste and defaults
    if not inFile:
        print('need inFile')
        return 1

    # for every id in outfile, count how many matches there are
    countDict = {}  # id -> {library: number of lines}
    with open(inFile, 'r') as countFile:
        for line in countFile:
            fields = line.strip().split(':')
            library, seqId = fields[0], fields[1]
            # Count the first line of an id as well: the previous version only
            # created the empty dict for it, so every total was one short.
            libCounts = countDict.setdefault(seqId, {})
            libCounts[library] = libCounts.get(library, 0) + 1

    for seqId in sorted(countDict):
        print('%s' % seqId)
        for lib in countDict[seqId]:
            print('%s\t%s' % (lib, countDict[seqId][lib]))
def markCenterExpression(aFN, wigDir, rn = None, tn = None):
    '''Store, per alignment, the share of expression found in three central windows.

    For every id in the alignment table a scan range of sLength positions is
    built from the target tcc, the tStart offset and a fixed 25-position
    extension.  The expression profile of that range is fetched from the wig
    data, and the fraction of the total expression falling into the position
    slices [8:12], [7:13] and [6:14] is written to centerExpression[id].
    Entries stay [0.0, 0.0, 0.0] when the profile sums to zero or when
    tELevel is not one of the profile values.

    aFN    -- alignment file handed to cgNexusFlat.Nexus
    wigDir -- directory handed to cgWig.loadWigDict
    rn, tn -- passed through unchanged to Nexus.load
    '''
    extend = 25
    timer = bioLibCG.cgTimer()
    timer.start()

    aNX = cgNexusFlat.Nexus(aFN, cgAlignmentFlat.cgAlignment)
    aNX.load(['centerExpression', 'tTcc', 'tStart', 'sLength', 'tELevel'], [rn, tn])

    #load expression of degradome
    wigDict = cgWig.loadWigDict(wigDir)

    # slice bounds of the three nested windows, narrowest first
    centerWindows = ((8, 12), (7, 13), (6, 14))

    for aID in aNX.centerExpression:
        aNX.centerExpression[aID] = [0.0, 0.0, 0.0]

        chrom, strand, start, end = bioLibCG.tccSplit(aNX.tTcc[aID])
        offset = aNX.tStart[aID]
        sLen = aNX.sLength[aID]

        if strand == '1':
            start = start - extend + offset
            end = start + sLen
        else:
            end = end + extend - offset
            start = end - sLen

        scanRange = bioLibCG.makeTcc(chrom, strand, start, end)
        stretch = cgWig.getExpressionProfile(scanRange, wigDict)

        #make sure peak is in the small range
        peakInRange = aNX.tELevel[aID] in stretch.values()
        expressionSum = sum(stretch.values())

        # walk the positions in ascending order, descending for the '-1' strand
        orderedCoords = sorted(stretch.keys(), reverse = (strand == '-1'))

        if expressionSum == 0 or not peakInRange:
            continue

        for slot, (lo, hi) in enumerate(centerWindows):
            windowSum = 0.0
            for coord in orderedCoords[lo:hi]:
                windowSum += stretch[coord]
            aNX.centerExpression[aID][slot] = windowSum / expressionSum

    aNX.save()
def alignSeqs(seqsFN, dbName, wordSize, outFN, maxNumMismatches, sendExitSignal = False):
    '''Align every sequence listed in seqsFN and write the alignments to outFN.

    seqsFN           -- tab-separated file; fields 0 and 1 of each line are
                        handed to cgAlign.cgSeq
    dbName           -- database prefix; '<dbName>.sDB' and '<dbName>.wDB'
                        are loaded
    wordSize         -- converted with int()
    outFN            -- output file, opened for writing (truncated)
    maxNumMismatches -- converted with int()
    sendExitSignal   -- when truthy, cgExit.sendExitSignal(seqsFN) is called
                        after the alignment finished

    The timer split is printed after each database load and after aligning.
    '''
    maxNumMismatches = int(maxNumMismatches)
    # NOTE(review): bool() of any non-empty string is True, so a caller that
    # passes the text 'False' still triggers the exit signal -- confirm how
    # this argument is supplied.
    sendExitSignal = bool(sendExitSignal)

    timer = bioLibCG.cgTimer()
    timer.start()

    #put seqs in cgSeq object, align
    wName = dbName + '.wDB'
    sName = dbName + '.sDB'
    wordSize = int(wordSize)

    #load dbs
    sDB = cgAlign.loadSequenceDatabase(sName)
    print(timer.split())

    wDB = cgAlign.loadWordDatabase(wName)
    print(timer.split())

    #align each seq
    # 'with' closes both files even when a malformed line or the aligner raises
    with open(seqsFN, 'r') as f:
        with open(outFN, 'w') as fOut:
            for line in f:
                fields = line.strip().split('\t')
                qSeq = cgAlign.cgSeq(fields[0], fields[1])
                #write out the alignments
                cgAlign.alignQuery(qSeq, wDB, sDB, wordSize, maxNumMismatches, fOut)
    print(timer.split())

    if sendExitSignal:
        cgExit.sendExitSignal(seqsFN)
def scanVectorsFile(fN, tccList):
    '''Given tcc list --> scan wig files and return coord:value...

    For each tcc a cgIndex.lineIndex is opened on fN and positioned with
    binarySearch(tcc); the following lines are read until one extends past
    the end of the tcc.  Fields 1 and 2 of a line are its begin/end, field 3
    its value (truncated at the first '.').  Every coordinate of the line
    that lies inside the tcc is mapped to that value.

    Returns a dict {coordinate: value} shared by all tccs in tccList.
    '''
    timer = cg.cgTimer()
    timer.start()

    coordDict = {}
    for tcc in tccList:
        chrom, strand, tccStart, tccEnd = cg.tccSplit(tcc)

        #goto correct line in index
        fIndex = cgIndex.lineIndex(fN, header = True) #!!!there actually is a header...have to deal with this...
        fIndex.passCheckFunction(cgIndex.wigCheckFunction)
        fIndex.binarySearch(tcc) #places file pointer at beginning of tcc as beginning

        for line in fIndex.file:
            # assumes cg.ss only strips and splits the line -- TODO confirm
            fields = cg.ss(line)
            lBeg = int(fields[1])
            lEnd = int(fields[2])
            lValue = int(fields[3].split('.')[0])

            # clip the line's interval to the requested tcc
            lBeg = max(lBeg, tccStart)
            reachedEnd = tccEnd < lEnd
            if reachedEnd:
                lEnd = tccEnd

            coordDict.update(dict.fromkeys(range(lBeg, lEnd), lValue))

            if reachedEnd:
                break
        #fIndex.close()

    return coordDict
def updatePolySeqs(mFN, readsFN, alignFN):
    '''Print a tabIt record for reads made of a micro sequence plus a one-letter tail.

    mFN      -- file loaded as a Nexus of miR ('sequence', 'polySeqs')
    readsFN  -- file loaded as a one-column 'read' table
    alignFN  -- file loaded as a Nexus of cgAlignment ('sID', 'tID')

    For every alignment the read (looked up through sID) is compared with the
    micro sequence (looked up through tID).  When the read contains the
    sequence, or the sequence shortened by 1..7 trailing characters, the text
    after the match is tested against runs of 1..19 identical A/G/T/C
    letters, and a match is printed through tabIt.  Nothing is returned and
    the loaded tables are not saved here.
    '''
    tim = bioLibCG.cgTimer()
    tim.start()

    # runs of one repeated letter, lengths 1 through 19
    variousAs = ["A" * x for x in range(1,20)]
    variousGs = ["G" * x for x in range(1,20)]
    variousTs = ["T" * x for x in range(1,20)]
    variousCs = ["C" * x for x in range(1,20)]
    letter_variousLetters = [ ("A", variousAs), ("G", variousGs), ("T", variousTs), ("C", variousCs)]
    # how many trailing characters of the micro sequence may be trimmed
    checkRange = range(1,8)

    NX = cgNexusFlat.Nexus(mFN, miR)
    NX.load(['sequence', 'polySeqs'])
    #print 'load micro', tim.split()

    reads = cgNexusFlat.quickTable(('read','string', '.', 1))
    rNX = cgNexusFlat.Nexus(readsFN, reads)
    rNX.load(['read'])
    #print 'load reads', tim.split()

    aNX = cgNexusFlat.Nexus(alignFN, cgAlignment)
    aNX.load(['sID', 'tID'])
    #print 'load alignments', tim.split()

    for id in aNX.ids:
        theRead = rNX.read[aNX.sID[id]]
        mID = aNX.tID[id]
        microSeq = NX.sequence[mID]

        #may be a read for expression, but wont count...
        if theRead in microSeq: continue

        #just for expression
        # NOTE(review): unreachable -- an identical read is already skipped by
        # the containment test above.
        if microSeq == theRead:
            print tabIt(microSeq, theRead, 0, 0, "N")

        #first check full
        elif microSeq in theRead and (len(theRead) != len(microSeq)):
            # text between the first and second occurrence of the sequence
            # (everything after it when it occurs once)
            tail = theRead.split(microSeq)[1]
            for let, variousLetters in letter_variousLetters:
                if tail in variousLetters:
                    print tabIt(microSeq, theRead, 0, len(tail), let)

        #now check trimmed (cant do [:-0])
        else:
            for i in checkRange:
                if microSeq[:-i] in theRead and (len(theRead) != len(microSeq[:-i])):
                    tail = theRead.split(microSeq[:-i])[1]
                    for let, variousLetters in letter_variousLetters:
                        if tail in variousLetters:
                            print tabIt(microSeq, theRead, i, len(tail), let)
                            print "TRIMMED"
                    # NOTE(review): nesting of this break was reconstructed from
                    # whitespace-collapsed source -- confirm against the original.
                    break #dont trim after the first trimmed one works
def markCenterExpression(aFN, wigDir, rn=None, tn=None):
    '''Store, per alignment, the share of expression found in three central windows.

    For every id in the alignment table a scan range of sLength positions is
    built from the target tcc, the tStart offset and a fixed 25-position
    extension.  The expression profile of that range is fetched from the wig
    data, and the fraction of the total expression falling into the position
    slices [8:12], [7:13] and [6:14] is written to centerExpression[id].
    Entries stay [0.0, 0.0, 0.0] when the profile sums to zero or when
    tELevel is not one of the profile values.

    aFN    -- alignment file handed to cgNexusFlat.Nexus
    wigDir -- directory handed to cgWig.loadWigDict
    rn, tn -- passed through unchanged to Nexus.load
    '''
    extend = 25
    timer = bioLibCG.cgTimer()
    timer.start()

    aNX = cgNexusFlat.Nexus(aFN, cgAlignmentFlat.cgAlignment)
    aNX.load(['centerExpression', 'tTcc', 'tStart', 'sLength', 'tELevel'],
             [rn, tn])

    #load expression of degradome
    wigDict = cgWig.loadWigDict(wigDir)

    # slice bounds of the three nested windows, narrowest first
    centerWindows = ((8, 12), (7, 13), (6, 14))

    for aID in aNX.centerExpression:
        aNX.centerExpression[aID] = [0.0, 0.0, 0.0]

        chrom, strand, start, end = bioLibCG.tccSplit(aNX.tTcc[aID])
        offset = aNX.tStart[aID]
        sLen = aNX.sLength[aID]

        if strand == '1':
            start = start - extend + offset
            end = start + sLen
        else:
            end = end + extend - offset
            start = end - sLen

        scanRange = bioLibCG.makeTcc(chrom, strand, start, end)
        stretch = cgWig.getExpressionProfile(scanRange, wigDict)

        #make sure peak is in the small range
        peakInRange = aNX.tELevel[aID] in stretch.values()
        expressionSum = sum(stretch.values())

        # walk the positions in ascending order, descending for the '-1' strand
        orderedCoords = sorted(stretch.keys(), reverse=(strand == '-1'))

        if expressionSum == 0 or not peakInRange:
            continue

        for slot, (lo, hi) in enumerate(centerWindows):
            windowSum = 0.0
            for coord in orderedCoords[lo:hi]:
                windowSum += stretch[coord]
            aNX.centerExpression[aID][slot] = windowSum / expressionSum

    aNX.save()
def alignSeqs(seqsFN, dbName, wordSize, outFN, maxNumMismatches, sendExitSignal=False):
    '''Align every sequence listed in seqsFN and write the alignments to outFN.

    seqsFN           -- tab-separated file; fields 0 and 1 of each line are
                        handed to cgAlign.cgSeq
    dbName           -- database prefix; '<dbName>.sDB' and '<dbName>.wDB'
                        are loaded
    wordSize         -- converted with int()
    outFN            -- output file, opened for writing (truncated)
    maxNumMismatches -- converted with int()
    sendExitSignal   -- when truthy, cgExit.sendExitSignal(seqsFN) is called
                        after the alignment finished

    The timer split is printed after each database load and after aligning.
    '''
    maxNumMismatches = int(maxNumMismatches)
    # NOTE(review): bool() of any non-empty string is True, so a caller that
    # passes the text 'False' still triggers the exit signal -- confirm how
    # this argument is supplied.
    sendExitSignal = bool(sendExitSignal)

    timer = bioLibCG.cgTimer()
    timer.start()

    #put seqs in cgSeq object, align
    wName = dbName + '.wDB'
    sName = dbName + '.sDB'
    wordSize = int(wordSize)

    #load dbs
    sDB = cgAlign.loadSequenceDatabase(sName)
    print(timer.split())

    wDB = cgAlign.loadWordDatabase(wName)
    print(timer.split())

    #align each seq
    # 'with' closes both files even when a malformed line or the aligner raises
    with open(seqsFN, 'r') as f:
        with open(outFN, 'w') as fOut:
            for line in f:
                fields = line.strip().split('\t')
                qSeq = cgAlign.cgSeq(fields[0], fields[1])
                #write out the alignments
                cgAlign.alignQuery(qSeq, wDB, sDB, wordSize,
                                   maxNumMismatches, fOut)
    print(timer.split())

    if sendExitSignal:
        cgExit.sendExitSignal(seqsFN)
def smallAnalyze(inFile=None):
    '''Print, for every id in inFile, how many lines each library contributed.

    Every line of inFile is split on ':'; field 0 is taken as the library
    and field 1 as the id.  Ids are printed in sorted order, each followed
    by one "library<TAB>count" line per library seen with that id.

    inFile -- path of the file to read.

    Returns 1 (after printing 'need inFile') when no inFile is given,
    otherwise None.  A line with no ':' raises IndexError.
    '''
    #Caste and defaults
    if not inFile:
        print('need inFile')
        return 1

    # for every id in outfile, count how many matches there are
    countDict = {}  # id -> {library: number of lines}
    with open(inFile, 'r') as countFile:
        for line in countFile:
            fields = line.strip().split(':')
            library, seqId = fields[0], fields[1]
            # Count the first line of an id as well: the previous version only
            # created the empty dict for it, so every total was one short.
            libCounts = countDict.setdefault(seqId, {})
            libCounts[library] = libCounts.get(library, 0) + 1

    for seqId in sorted(countDict):
        print('%s' % seqId)
        for lib in countDict[seqId]:
            print('%s\t%s' % (lib, countDict[seqId][lib]))
def scanVectorsSingleCoord(tccList, cName):
    '''Given tcc list --> scan wig files and coord:value...

    tccList -- iterable of 'chrom:strand:start:end' strings
    cName   -- config name; its 'organism' entry selects the wig directory
               ('wig<organism>' in Main.conf) and the file names

    For each tcc the per-chromosome index file is scanned for the first row
    whose [begin, end) range holds the tcc start; field 0 of that row is the
    byte offset at which reading of the wig file begins.  Wig lines are then
    read until one extends past the tcc end, and every coordinate inside the
    tcc is mapped to the line value (field 3, truncated at the first '.').

    Returns a dict {coordinate: value} shared by all tccs.
    '''
    conf = c.getConfig(cName)
    org = conf.conf['organism']
    mConf = c.getConfig('Main.conf')
    wigDir = mConf.conf['wig%s' % org]

    coordDict = {}
    for tcc in tccList:
        theSplit = ss(tcc, ':')
        chrom, strand = theSplit[0], theSplit[1]
        tccStart, tccEnd = int(theSplit[2]), int(theSplit[3])

        #goto correct fild, correct line in index
        fN = wigDir + '/Merge.%s.%s.wig.%s.wig' % (org.lower(), strand, chrom)
        fNindex = wigDir + '/Merge.%s.%s.wig.%s.wig.index' % (org.lower(), strand, chrom)

        #get line in index file
        # NOTE(review): when no index row covers tccStart this stays 'None' and
        # the seek below fails with a TypeError, exactly as before -- confirm
        # whether such tccs should be skipped instead.
        startByte = 'None'
        with open(fNindex, 'r') as iFile:
            for line in iFile:
                fields = cg.ss(line)
                beg = int(fields[1])
                end = int(fields[2])
                if beg <= tccStart < end:
                    startByte = int(fields[0])
                    break

        #grab value
        # 'with' releases the wig file even when the seek or a parse fails
        with open(fN, 'r') as f:
            f.seek(startByte, 0)
            for line in f:
                fields = cg.ss(line)
                lBeg = int(fields[1])
                lEnd = int(fields[2])
                lValue = int(fields[3].split('.')[0])

                # clip the line's interval to the requested tcc
                if tccStart > lBeg:
                    lBeg = tccStart
                stop = tccEnd < lEnd
                if stop:
                    lEnd = tccEnd

                for i in range(lBeg, lEnd):
                    coordDict[i] = lValue

                if stop:
                    break

    return coordDict
def scanVectorsHist(tccList, cName):
    '''Given tcc list --> scan wig files and get histogram values
    can be modified to do single/total values...
    THIS USES INDEXES!!! = BAD...

    tccList -- iterable of 'chrom:strand:start:end' strings
    cName   -- config name; its 'organism' entry selects the wig directory
               ('wig<organism>' in Main.conf) and the file names

    For each tcc the index file supplies the byte offset of the first wig
    line to read (first row whose [begin, end) range holds the tcc start).
    Each wig line contributes its value once per coordinate it shares with
    the tcc; reading stops at the first line that extends past the tcc end.

    Returns a dict {tcc: [value, value, ...]}; a tcc that shares no
    coordinate with the lines read gets no entry.
    '''
    conf = c.getConfig(cName)
    org = conf.conf['organism']
    mConf = c.getConfig('Main.conf')
    wigDir = mConf.conf['wig%s' % org]

    histDict = {} # tcc: [list values]
    for tcc in tccList:
        theSplit = ss(tcc, ':')
        chrom, strand = theSplit[0], theSplit[1]
        tccStart, tccEnd = int(theSplit[2]), int(theSplit[3])

        #goto correct fild, correct line in index
        fN = wigDir + '/Merge.%s.%s.wig.%s.wig' % (org.lower(), strand, chrom)
        fNindex = wigDir + '/Merge.%s.%s.wig.%s.wig.index' % (org.lower(), strand, chrom)

        #get line in index file
        # NOTE(review): when no index row covers tccStart this stays 'None' and
        # the seek below fails with a TypeError, exactly as before -- confirm
        # whether such tccs should be skipped instead.
        startByte = 'None'
        with open(fNindex, 'r') as iFile:
            for line in iFile:
                fields = cg.ss(line)
                beg = int(fields[1])
                end = int(fields[2])
                if beg <= tccStart < end:
                    startByte = int(fields[0])
                    break

        #grab value
        # 'with' releases the wig file even when the seek or a parse fails
        with open(fN, 'r') as f:
            f.seek(startByte, 0)
            for line in f:
                fields = cg.ss(line)
                lBeg = int(fields[1])
                lEnd = int(fields[2])
                lValue = int(fields[3].split('.')[0])

                # clip the line's interval to the requested tcc
                if tccStart > lBeg:
                    lBeg = tccStart
                stop = tccEnd < lEnd
                if stop:
                    lEnd = tccEnd

                # one copy of the value per covered coordinate; the guard keeps
                # an empty overlap from creating an empty entry
                numCoords = lEnd - lBeg
                if numCoords > 0:
                    histDict.setdefault(tcc, []).extend([lValue] * numCoords)

                if stop:
                    break

    return histDict
def findPeaks(pType, cName = None):
    '''Pick the best peak per hairpin and append it to the prediction file.

    pType -- 'E' selects conf 'resultsExonsSorted'; anything else selects
             'resultsIntronsSorted'
    cName -- config name handed to c.getConfig and the scan helpers

    Steps, as visible in this function:
      1. load the hairpin tcc per CID from the prediction file
      2. collect the peak coordinates cgPeaks.stretch reports per hairpin
      3. for each peak, measure the longest run of ratio-profile values above
         0.80 within +/-30 positions (the "roof") and the profile values 4
         positions outside that run
      4. keep, per CID, the peak with 14 < roof length < 26 and
         0.0 < drop value < 0.2 whose roof length is closest to 22
      5. rewrite the prediction file with the peak info placed via
         cg.appendToLine(line, peakInfo, 13)

    Returns None; the prediction file is overwritten in place.
    '''
    #init
    mConf = c.cgConfig('Main.conf')
    conf = c.getConfig(cName)

    if pType == 'E':
        predName = conf.conf['resultsExonsSorted']
    else:
        predName = conf.conf['resultsIntronsSorted']
    print predName

    #make CID:hairpin:peak dictionary
    cHairs = getHairpins.getHairpins(predName)
    peakDict = {}
    for CID in cHairs:
        # [hairpin tcc, best combo]; the string 'None' marks "no peak chosen"
        peakDict[CID] = [cHairs[CID],'None']

    timer = cg.cgTimer()
    timer.start()

    #put peaks in memory
    print 'Creating peak data'
    peaks = {} # chr:peak:value
    for CID in cHairs:
        chrom, strand, start, end = cg.tccSplit(cHairs[CID])
        tcc = cHairs[CID]

        #init dictionary
        if chrom not in peaks:
            peaks[chrom] = {}
        if strand not in peaks[chrom]:
            peaks[chrom][strand] = {}

        #create peaks for tcc and add to peak dictionary
        stretch = cgPeaks.stretch(tcc, cName)
        stretch.createPeaks()
        for peakCoord in stretch.peaks:
            peaks[chrom][strand][peakCoord] = 0
    print timer.split()

    print 'finding best combos'
    bestCombos = []
    # the four counters and cgFlag below are never read in this function
    aPass = 0
    bPass = 0
    cPass = 0
    numT = 0
    for CID in peakDict:
        cgFlag = False
        if CID == '538':cgFlag = True

        tcc = peakDict[CID][0]
        #print tcc
        tccPeaks = []
        chrom = cg.ss(tcc, ':')[0]
        strand = cg.ss(tcc, ':')[1]
        start = int(cg.ss(tcc, ':')[2])
        end = int(cg.ss(tcc, ':')[3])

        #get all peaks
        for i in range(start, end + 1):
            if i in peaks[chrom][strand]:
                #print ' peak added', i
                tccPeaks.append(i)

        #Calculate parameters...
        pairStrings = [] #used to check if pair already added
        peakCombos = []
        for x in tccPeaks:
            #scan a 30 bp range around this point and find the best roof...
            pRange = 30
            rTcc = cg.makeTcc(chrom, strand, x, x + 1)

            #quickly get max value...kinda a long way to do it but whatever
            cProfile = stepVectorScan.profileAroundPoint(rTcc, 1, cName, ratio = False)
            xval = cProfile[0]
            # NOTE(review): shadows the builtin max() for the rest of the function
            max = xval
            highestValueCoord = x

            #now make profile for roof...
            cProfile = stepVectorScan.profileAroundPoint(rTcc, pRange, cName, ratio = True)

            #now get highest stretch length and the rNext coord.
            # NOTE(review): here 'stretch' is reused as a run-length counter
            minVal = .80
            highest = 0
            stretch = 0
            startCurrent = None
            startFinal = None
            endFinal = None
            # NOTE(review): a run still open at the last offset is never
            # recorded, because runs are only closed in the else branch.
            for i in range(1 - pRange, pRange):
                if cProfile[i] > minVal:
                    stretch += 1
                    if startCurrent == None:
                        startCurrent = i
                else:
                    if stretch > 0:
                        if stretch > highest: #stretch ended and was higher than previous
                            highest = stretch
                            endFinal = i - 1
                            startFinal = startCurrent
                            startCurrent = None
                        else:
                            startCurrent = None
                    stretch = 0

            #get +/- 4 value...
            val = [1.0, 1.0]
            # NOTE(review): truthiness test -- a roof that starts or ends at
            # offset 0 is treated like "no roof found"; confirm this is intended.
            if (startFinal) and (endFinal):
                low = startFinal - 4
                high = endFinal + 4
                if low > (1 - pRange):
                    if high < pRange:
                        val[0] = float(cProfile[startFinal - 4])
                        val[1] = float(cProfile[endFinal + 4])

            #fill in other details...
            y = 'S'
            dist = 'S'
            ratio = 'S'
            # combo layout: [tcc, peak coord, 'S', 'S', 'S', peak value, roof length, drop values]
            peakCombos.append([tcc,x,y,dist,ratio,max,highest,val])
            #print ' ', peakCombos[-1]

        #find best combo...
        topCombo = None
        for combo in peakCombos:
            roofLength = combo[6]
            # the larger of the two flanking values
            dropValue = combo[7][0]
            if combo[7][1] > dropValue:
                dropValue = combo[7][1]

            #print roofLength, dropValue
            if 14 < roofLength < 26:
                if 0.0 < dropValue < 0.2:
                    #pick one with rooflength nearest 22 (an older comment said 20):
                    if topCombo:
                        if (math.fabs(22 - roofLength)) < (math.fabs(22 - topCombo[6])):
                            topCombo = combo
                    else:
                        topCombo = combo

        if topCombo:
            peakDict[CID][1] = topCombo
            bestCombos.append(topCombo)
            print bestCombos[-1]
        else:
            #print 'None'
            pass
    print timer.split()

    #now update predFile (SLOT 13)
    predFile = open(predName, 'r')
    newLines = []
    for line in predFile:
        # field 7 of a prediction line is used as the CID
        CID = cg.ss(line)[7]
        if peakDict[CID][1] == 'None':
            peakInfo = 'None'
        else:
            # last 3 characters of the peak coord : 'S' : combo[4] up to its first '.' : peak value : roof length : drop values
            peakInfo = '%s:%s:%s:%s:%s:%s' % (str(peakDict[CID][1][1])[-3:], 'S', str(peakDict[CID][1][4]).split('.')[0], peakDict[CID][1][5],peakDict[CID][1][6], peakDict[CID][1][7])
        newLines.append(cg.appendToLine(line, peakInfo, 13))
    predFile.close()

    # the same file is reopened for writing, replacing its contents
    predFile = open(predName, 'w')
    predFile.writelines(newLines)
    predFile.close()
def scanVectorsSingleCoord(tccList, cName):
    '''Given tcc list --> scan wig files and coord:value...

    tccList -- iterable of 'chrom:strand:start:end' strings
    cName   -- config name; its 'organism' entry selects the wig directory
               ('wig<organism>' in Main.conf) and the file names

    For each tcc the per-chromosome index file is scanned for the first row
    whose [begin, end) range holds the tcc start; field 0 of that row is the
    byte offset at which reading of the wig file begins.  Wig lines are then
    read until one extends past the tcc end, and every coordinate inside the
    tcc is mapped to the line value (field 3, truncated at the first '.').

    Returns a dict {coordinate: value} shared by all tccs.
    '''
    conf = c.getConfig(cName)
    org = conf.conf['organism']
    mConf = c.getConfig('Main.conf')
    wigDir = mConf.conf['wig%s' % org]

    coordDict = {}
    for tcc in tccList:
        theSplit = ss(tcc, ':')
        chrom, strand = theSplit[0], theSplit[1]
        tccStart, tccEnd = int(theSplit[2]), int(theSplit[3])

        #goto correct fild, correct line in index
        fN = wigDir + '/Merge.%s.%s.wig.%s.wig' % (org.lower(),strand,chrom)
        fNindex = wigDir + '/Merge.%s.%s.wig.%s.wig.index' % (org.lower(),strand,chrom)

        #get line in index file
        # NOTE(review): when no index row covers tccStart this stays 'None' and
        # the seek below fails with a TypeError, exactly as before -- confirm
        # whether such tccs should be skipped instead.
        startByte = 'None'
        with open(fNindex, 'r') as iFile:
            for line in iFile:
                fields = cg.ss(line)
                beg = int(fields[1])
                end = int(fields[2])
                if beg <= tccStart < end:
                    startByte = int(fields[0])
                    break

        #grab value
        # 'with' releases the wig file even when the seek or a parse fails
        with open(fN, 'r') as f:
            f.seek(startByte, 0)
            for line in f:
                fields = cg.ss(line)
                lBeg = int(fields[1])
                lEnd = int(fields[2])
                lValue = int(fields[3].split('.')[0])

                # clip the line's interval to the requested tcc
                if tccStart > lBeg:
                    lBeg = tccStart
                stop = tccEnd < lEnd
                if stop:
                    lEnd = tccEnd

                for i in range(lBeg, lEnd):
                    coordDict[i] = lValue

                if stop:
                    break

    return coordDict
#puts final hits into clusters... ##Clusters are based off of overlapping neighbors, if you have an overlapping neighbor than you are part of that cluster. import bioLibCG as cg import subprocess import compareData as compare import cgConfig #Start Timer timer = cg.cgTimer() timer.start() #Get list of mature tccs conf = cgConfig.returnConfDict() finalMirFileName = '/u/home8/gxxiao/chrisgre/projects/PipeRuns/LanderHuman/out/LanderHuman-s3k8b17.ALL.FINAL.mirs.tsv' finalMirFileName = conf['resultsRaw'] matureTccs = compare.tccFileToList(finalMirFileName, 1) # list of all mature micro in tcc print 'List getting', timer.split() #make connections dict matureConnections = compare.makeConnectionsDict(matureTccs) print 'Make connections:', timer.split() #Now have to define Clusters... clusters = [] addedList = [] #I don't think python passes by reference? also I think this function is in the middle because it uses a global variable :P def createClusters(item = None, mode = None):
def defineClusters(cName = None):
    '''Group overlapping mature tccs into clusters and count hits per frame.

    cName -- config name handed to cgConfig.getConfig (passed or default)

    Reads the tcc list from conf 'resultsRaw', builds the neighbour map with
    compare.makeConnectionsDict and collects every set of transitively
    connected tccs into one cluster.  Each cluster is sorted with
    cg.sortTccList and written to conf 'sortedClusters' as one line of
    comma-terminated hits.  Then, for every coordinate from the start of a
    cluster's first hit up to (not including) the end of its last hit, a
    200-wide frame centred on that coordinate is compared against the
    cluster and "frameTcc<TAB>hitCount" is written to conf 'hitsPerFrame'.

    Timer splits are printed after each stage.  Returns None.
    '''
    #Start Timer
    timer = cg.cgTimer()
    timer.start()

    #Get list of mature tccs
    conf = cgConfig.getConfig(cName) #passed or default
    finalMirFileName = conf.conf['resultsRaw']
    matureTccs = compare.tccFileToList(finalMirFileName, 1) # list of all mature micro in tcc
    print('%s %s' % ('List getting', timer.split()))

    #make connections dict
    matureConnections = compare.makeConnectionsDict(matureTccs)
    print('%s %s' % ('Make connections:', timer.split()))

    #Now have to define Clusters...
    clusters = []
    added = set()  # set membership is O(1); the former list made this quadratic
    for seed in matureTccs:
        if seed in added:
            continue
        cluster = [seed]
        added.add(seed)
        # Depth-first walk with an explicit stack of neighbour iterators: same
        # visiting order as the former recursive helper, but a very large
        # cluster can no longer hit the interpreter's recursion limit.
        pending = [iter(matureConnections[seed])]
        while pending:
            for neighbor in pending[-1]:
                if neighbor not in added:
                    added.add(neighbor)
                    cluster.append(neighbor)
                    pending.append(iter(matureConnections[neighbor]))
                    break
            else:
                pending.pop()
        clusters.append(cluster)
    print('%s %s' % ('Make Clusters', timer.split()))

    #Sort Clusters.
    sortedClusters = [cg.sortTccList(cluster) for cluster in clusters]
    print('%s %s' % ('Sort Clusters:', timer.split()))

    #Output sorted cluster file
    with open(conf.conf['sortedClusters'], 'w') as clusterFile:
        for cluster in sortedClusters:
            for hit in cluster:
                clusterFile.write('%s,' % hit)
            clusterFile.write('\n')
    print('%s %s' % ('Store intermediate data:', timer.split()))

    #output hitsAround file
    frameLength = 200
    frameShift = 1
    halfFrame = frameLength // 2
    with open(conf.conf['hitsPerFrame'], 'w') as outputFile:
        for cluster in sortedClusters:
            #grab first and last coordinate from cluster, for each cluster deduce how many theoretical microRNAs were in hitScope
            firstFields = cluster[0].split(":")
            clusterChrom = firstFields[0]
            clusterStrand = firstFields[1]
            firstCoord = int(firstFields[2])
            lastCoord = int(cluster[-1].split(":")[3])

            startCoord = firstCoord
            while startCoord < lastCoord:
                #count how many hits there are in this range
                rangeTcc = '%s:%s:%s:%s' % (clusterChrom, clusterStrand, startCoord - halfFrame, startCoord + halfFrame)
                overlappedList = compare.compareTwoTcc([rangeTcc], cluster, 2)
                hitCount = len(overlappedList)

                #output
                outputFile.write('%s\t%s\n' % (rangeTcc, hitCount))
                startCoord = startCoord + frameShift

    print('%s %s' % ('Output Hits per Frame:', timer.split()))
    print('%s %s' % ('Overall Time:', timer.report()))
def findPeaks(pType, cName=None):
    '''Pick the best peak per hairpin and append it to the prediction file.

    pType -- 'E' selects conf 'resultsExonsSorted'; anything else selects
             'resultsIntronsSorted'
    cName -- config name handed to c.getConfig and the scan helpers

    Steps, as visible in this function:
      1. load the hairpin tcc per CID from the prediction file
      2. collect the peak coordinates cgPeaks.stretch reports per hairpin
      3. for each peak, measure the longest run of ratio-profile values above
         0.80 within +/-30 positions (the "roof") and the profile values 4
         positions outside that run
      4. keep, per CID, the peak with 14 < roof length < 26 and
         0.0 < drop value < 0.2 whose roof length is closest to 22
      5. rewrite the prediction file with the peak info placed via
         cg.appendToLine(line, peakInfo, 13)

    Returns None; the prediction file is overwritten in place.
    '''
    #init
    mConf = c.cgConfig('Main.conf')
    conf = c.getConfig(cName)

    if pType == 'E':
        predName = conf.conf['resultsExonsSorted']
    else:
        predName = conf.conf['resultsIntronsSorted']
    print predName

    #make CID:hairpin:peak dictionary
    cHairs = getHairpins.getHairpins(predName)
    peakDict = {}
    for CID in cHairs:
        # [hairpin tcc, best combo]; the string 'None' marks "no peak chosen"
        peakDict[CID] = [cHairs[CID], 'None']

    timer = cg.cgTimer()
    timer.start()

    #put peaks in memory
    print 'Creating peak data'
    peaks = {} # chr:peak:value
    for CID in cHairs:
        chrom, strand, start, end = cg.tccSplit(cHairs[CID])
        tcc = cHairs[CID]

        #init dictionary
        if chrom not in peaks:
            peaks[chrom] = {}
        if strand not in peaks[chrom]:
            peaks[chrom][strand] = {}

        #create peaks for tcc and add to peak dictionary
        stretch = cgPeaks.stretch(tcc, cName)
        stretch.createPeaks()
        for peakCoord in stretch.peaks:
            peaks[chrom][strand][peakCoord] = 0
    print timer.split()

    print 'finding best combos'
    bestCombos = []
    # the four counters and cgFlag below are never read in this function
    aPass = 0
    bPass = 0
    cPass = 0
    numT = 0
    for CID in peakDict:
        cgFlag = False
        if CID == '538': cgFlag = True

        tcc = peakDict[CID][0]
        #print tcc
        tccPeaks = []
        chrom = cg.ss(tcc, ':')[0]
        strand = cg.ss(tcc, ':')[1]
        start = int(cg.ss(tcc, ':')[2])
        end = int(cg.ss(tcc, ':')[3])

        #get all peaks
        for i in range(start, end + 1):
            if i in peaks[chrom][strand]:
                #print ' peak added', i
                tccPeaks.append(i)

        #Calculate parameters...
        pairStrings = [] #used to check if pair already added
        peakCombos = []
        for x in tccPeaks:
            #scan a 30 bp range around this point and find the best roof...
            pRange = 30
            rTcc = cg.makeTcc(chrom, strand, x, x + 1)

            #quickly get max value...kinda a long way to do it but whatever
            cProfile = stepVectorScan.profileAroundPoint(rTcc, 1, cName, ratio=False)
            xval = cProfile[0]
            # NOTE(review): shadows the builtin max() for the rest of the function
            max = xval
            highestValueCoord = x

            #now make profile for roof...
            cProfile = stepVectorScan.profileAroundPoint(rTcc, pRange, cName, ratio=True)

            #now get highest stretch length and the rNext coord.
            # NOTE(review): here 'stretch' is reused as a run-length counter
            minVal = .80
            highest = 0
            stretch = 0
            startCurrent = None
            startFinal = None
            endFinal = None
            # NOTE(review): a run still open at the last offset is never
            # recorded, because runs are only closed in the else branch.
            for i in range(1 - pRange, pRange):
                if cProfile[i] > minVal:
                    stretch += 1
                    if startCurrent == None:
                        startCurrent = i
                else:
                    if stretch > 0:
                        if stretch > highest: #stretch ended and was higher than previous
                            highest = stretch
                            endFinal = i - 1
                            startFinal = startCurrent
                            startCurrent = None
                        else:
                            startCurrent = None
                    stretch = 0

            #get +/- 4 value...
            val = [1.0, 1.0]
            # NOTE(review): truthiness test -- a roof that starts or ends at
            # offset 0 is treated like "no roof found"; confirm this is intended.
            if (startFinal) and (endFinal):
                low = startFinal - 4
                high = endFinal + 4
                if low > (1 - pRange):
                    if high < pRange:
                        val[0] = float(cProfile[startFinal - 4])
                        val[1] = float(cProfile[endFinal + 4])

            #fill in other details...
            y = 'S'
            dist = 'S'
            ratio = 'S'
            # combo layout: [tcc, peak coord, 'S', 'S', 'S', peak value, roof length, drop values]
            peakCombos.append([tcc, x, y, dist, ratio, max, highest, val])
            #print ' ', peakCombos[-1]

        #find best combo...
        topCombo = None
        for combo in peakCombos:
            roofLength = combo[6]
            # the larger of the two flanking values
            dropValue = combo[7][0]
            if combo[7][1] > dropValue:
                dropValue = combo[7][1]

            #print roofLength, dropValue
            if 14 < roofLength < 26:
                if 0.0 < dropValue < 0.2:
                    #pick one with rooflength nearest 22 (an older comment said 20):
                    if topCombo:
                        if (math.fabs(22 - roofLength)) < (
                                math.fabs(22 - topCombo[6])):
                            topCombo = combo
                    else:
                        topCombo = combo

        if topCombo:
            peakDict[CID][1] = topCombo
            bestCombos.append(topCombo)
            print bestCombos[-1]
        else:
            #print 'None'
            pass
    print timer.split()

    #now update predFile (SLOT 13)
    predFile = open(predName, 'r')
    newLines = []
    for line in predFile:
        # field 7 of a prediction line is used as the CID
        CID = cg.ss(line)[7]
        if peakDict[CID][1] == 'None':
            peakInfo = 'None'
        else:
            # last 3 characters of the peak coord : 'S' : combo[4] up to its first '.' : peak value : roof length : drop values
            peakInfo = '%s:%s:%s:%s:%s:%s' % (
                str(peakDict[CID][1][1])[-3:], 'S', str(
                    peakDict[CID][1][4]).split('.')[0], peakDict[CID][1][5],
                peakDict[CID][1][6], peakDict[CID][1][7])
        newLines.append(cg.appendToLine(line, peakInfo, 13))
    predFile.close()

    # the same file is reopened for writing, replacing its contents
    predFile = open(predName, 'w')
    predFile.writelines(newLines)
    predFile.close()
def defineClusters(cName=None):
    '''Group overlapping mature tccs into clusters and count hits per frame.

    cName -- config name handed to cgConfig.getConfig (passed or default)

    Reads the tcc list from conf 'resultsRaw', builds the neighbour map with
    compare.makeConnectionsDict and collects every set of transitively
    connected tccs into one cluster.  Each cluster is sorted with
    cg.sortTccList and written to conf 'sortedClusters' as one line of
    comma-terminated hits.  Then, for every coordinate from the start of a
    cluster's first hit up to (not including) the end of its last hit, a
    200-wide frame centred on that coordinate is compared against the
    cluster and "frameTcc<TAB>hitCount" is written to conf 'hitsPerFrame'.

    Timer splits are printed after each stage.  Returns None.
    '''
    #Start Timer
    timer = cg.cgTimer()
    timer.start()

    #Get list of mature tccs
    conf = cgConfig.getConfig(cName) #passed or default
    finalMirFileName = conf.conf['resultsRaw']
    matureTccs = compare.tccFileToList(finalMirFileName, 1) # list of all mature micro in tcc
    print('%s %s' % ('List getting', timer.split()))

    #make connections dict
    matureConnections = compare.makeConnectionsDict(matureTccs)
    print('%s %s' % ('Make connections:', timer.split()))

    #Now have to define Clusters...
    clusters = []
    added = set()  # set membership is O(1); the former list made this quadratic
    for seed in matureTccs:
        if seed in added:
            continue
        cluster = [seed]
        added.add(seed)
        # Depth-first walk with an explicit stack of neighbour iterators: same
        # visiting order as the former recursive helper, but a very large
        # cluster can no longer hit the interpreter's recursion limit.
        pending = [iter(matureConnections[seed])]
        while pending:
            for neighbor in pending[-1]:
                if neighbor not in added:
                    added.add(neighbor)
                    cluster.append(neighbor)
                    pending.append(iter(matureConnections[neighbor]))
                    break
            else:
                pending.pop()
        clusters.append(cluster)
    print('%s %s' % ('Make Clusters', timer.split()))

    #Sort Clusters.
    sortedClusters = [cg.sortTccList(cluster) for cluster in clusters]
    print('%s %s' % ('Sort Clusters:', timer.split()))

    #Output sorted cluster file
    with open(conf.conf['sortedClusters'], 'w') as clusterFile:
        for cluster in sortedClusters:
            for hit in cluster:
                clusterFile.write('%s,' % hit)
            clusterFile.write('\n')
    print('%s %s' % ('Store intermediate data:', timer.split()))

    #output hitsAround file
    frameLength = 200
    frameShift = 1
    halfFrame = frameLength // 2
    with open(conf.conf['hitsPerFrame'], 'w') as outputFile:
        for cluster in sortedClusters:
            #grab first and last coordinate from cluster, for each cluster deduce how many theoretical microRNAs were in hitScope
            firstFields = cluster[0].split(":")
            clusterChrom = firstFields[0]
            clusterStrand = firstFields[1]
            firstCoord = int(firstFields[2])
            lastCoord = int(cluster[-1].split(":")[3])

            startCoord = firstCoord
            while startCoord < lastCoord:
                #count how many hits there are in this range
                rangeTcc = '%s:%s:%s:%s' % (clusterChrom, clusterStrand,
                                            startCoord - halfFrame,
                                            startCoord + halfFrame)
                overlappedList = compare.compareTwoTcc([rangeTcc], cluster, 2)
                hitCount = len(overlappedList)

                #output
                outputFile.write('%s\t%s\n' % (rangeTcc, hitCount))
                startCoord = startCoord + frameShift

    print('%s %s' % ('Output Hits per Frame:', timer.split()))
    print('%s %s' % ('Overall Time:', timer.report()))
import bioLibCG
import sys

# Small timing script: reads the file named by the first command-line
# argument and accumulates two timer totals -- the split taken right after a
# line is fetched and the split taken after the line was stripped, split on
# tabs and its first field converted to int.
# presumably cgTimer.split() returns the time since the previous split -- verify
fN = sys.argv[1]

timer = bioLibCG.cgTimer()
timer.start()

loadTime = 0.0
splitTime = 0.0

# 'with' closes the input file, which was previously left open
with open(fN, 'r') as f:
    for line in f:
        loadTime += timer.split()
        a = line.strip().split('\t')
        b = int(a[0])  # result unused; the conversion is part of the timed work
        splitTime += timer.split()

print(loadTime)
print(splitTime)
def scanVectorsHist(tccList, cName):
    '''Given tcc list --> scan wig files and get histogram values
    can be modified to do single/total values...
    THIS USES INDEXES!!! = BAD...

    tccList -- iterable of 'chrom:strand:start:end' strings
    cName   -- config name; its 'organism' entry selects the wig directory
               ('wig<organism>' in Main.conf) and the file names

    For each tcc the index file supplies the byte offset of the first wig
    line to read (first row whose [begin, end) range holds the tcc start).
    Each wig line contributes its value once per coordinate it shares with
    the tcc; reading stops at the first line that extends past the tcc end.

    Returns a dict {tcc: [value, value, ...]}; a tcc that shares no
    coordinate with the lines read gets no entry.
    '''
    conf = c.getConfig(cName)
    org = conf.conf['organism']
    mConf = c.getConfig('Main.conf')
    wigDir = mConf.conf['wig%s' % org]

    histDict = {} # tcc: [list values]
    for tcc in tccList:
        theSplit = ss(tcc, ':')
        chrom, strand = theSplit[0], theSplit[1]
        tccStart, tccEnd = int(theSplit[2]), int(theSplit[3])

        #goto correct fild, correct line in index
        fN = wigDir + '/Merge.%s.%s.wig.%s.wig' % (org.lower(),strand,chrom)
        fNindex = wigDir + '/Merge.%s.%s.wig.%s.wig.index' % (org.lower(),strand,chrom)

        #get line in index file
        # NOTE(review): when no index row covers tccStart this stays 'None' and
        # the seek below fails with a TypeError, exactly as before -- confirm
        # whether such tccs should be skipped instead.
        startByte = 'None'
        with open(fNindex, 'r') as iFile:
            for line in iFile:
                fields = cg.ss(line)
                beg = int(fields[1])
                end = int(fields[2])
                if beg <= tccStart < end:
                    startByte = int(fields[0])
                    break

        #grab value
        # 'with' releases the wig file even when the seek or a parse fails
        with open(fN, 'r') as f:
            f.seek(startByte, 0)
            for line in f:
                fields = cg.ss(line)
                lBeg = int(fields[1])
                lEnd = int(fields[2])
                lValue = int(fields[3].split('.')[0])

                # clip the line's interval to the requested tcc
                if tccStart > lBeg:
                    lBeg = tccStart
                stop = tccEnd < lEnd
                if stop:
                    lEnd = tccEnd

                # one copy of the value per covered coordinate; the guard keeps
                # an empty overlap from creating an empty entry
                numCoords = lEnd - lBeg
                if numCoords > 0:
                    histDict.setdefault(tcc, []).extend([lValue] * numCoords)

                if stop:
                    break

    return histDict