def makePeakInputQ(cName, minExpression=2000): '''Uses shell script and qsub to get peaks quickly''' mConf = c.getConfig('Main.conf') conf = c.getConfig(cName) assembly = conf.conf['assembly'] tccList = [] chromLens = cg.returnChromLengthDict(assembly) for chrom in chromLens: if chrom not in cg.acceptableChroms: continue for strand in ['1', '-1']: print 'Getting Peaks for ', chrom, strand prevI = 0 for i in rangePoints(1, chromLens[chrom], 30): if i == 1: prevI = i continue start = prevI end = i prevI = i tcc = cg.makeTcc(chrom, strand, start, end) log = 'logs/o-' + str(start) elog = 'logs/e-%s-%s-%s-%s' % (chrom, strand, start, end) subprocess.Popen([ 'qsub', '-V', '-cwd', '-e', elog, '-o', log, '-l', 'mem=3G', '-l', 'rt=3600', 'q.sh', tcc, cName, str(minExpression) ]).wait()
def makePeakInputQ(cName, minExpression = 2000):
    '''Uses shell script and qsub to get peaks quickly.

    NOTE(review): duplicate of the makePeakInputQ defined above; in a
    module, this later definition is the one in effect at import time.
    '''
    mConf = c.getConfig('Main.conf')  # unused here
    conf = c.getConfig(cName)
    assembly = conf.conf['assembly']
    tccList = []  # unused here
    chromLens = cg.returnChromLengthDict(assembly)
    for chrom in chromLens:
        if chrom not in cg.acceptableChroms:
            continue  # skip scaffolds / unplaced contigs
        for strand in ['1','-1']:
            print 'Getting Peaks for ', chrom, strand
            prevI = 0
            # cut the chromosome into 30 windows; one qsub job per window
            for i in rangePoints(1, chromLens[chrom], 30):
                if i == 1:
                    prevI = i
                    continue
                start = prevI
                end = i
                prevI = i
                tcc = cg.makeTcc(chrom, strand, start, end)
                log = 'logs/o-' + str(start)
                elog = 'logs/e-%s-%s-%s-%s' % (chrom, strand, start, end)
                # wait() blocks until the qsub submission itself returns
                subprocess.Popen(['qsub', '-V', '-cwd', '-e', elog, '-o', log, '-l', 'mem=3G', '-l', 'rt=3600', 'q.sh', tcc, cName, str(minExpression)]).wait()
def sortResults(cName=None):
    '''Filter and sort the results file by cluster density.

    Keeps only lines whose cluster reaches minDensity and whose field 8 is
    '0' (no overlap with known annotation), sorts them by density (field 5,
    descending), writes <results>.sorted and a statFile.data summary.
    Fix: statFile was previously never closed.
    '''
    #INIT
    conf = cgConfig.getConfig(cName)
    pFileName = conf.conf['results']
    minDensity = 4

    pFile = open(pFileName, 'r')
    fileLines = [line.strip().split('\t') for line in pFile]
    pFile.close()

    # highest prediction density per cluster: CID -> max density;
    # used to sort out clusters without proper density
    densityDict = {}
    for line in fileLines:
        CID = line[7]
        pDensity = int(line[5])
        if CID in densityDict:
            if pDensity > densityDict[CID]:
                densityDict[CID] = pDensity
        else:
            densityDict[CID] = pDensity

    # take out clusters that didn't make the cut
    CIDpassed = []
    keptLines = []
    for line in fileLines:
        CID = line[7]
        if densityDict[CID] >= minDensity:  # density cut
            if line[8] == '0':  # cluster overlaps nothing known
                keptLines.append(line)
                if CID not in CIDpassed:
                    CIDpassed.append(CID)

    # sort by cluster density (field 5), highest first
    sID = 5
    for line in keptLines:
        line[sID] = int(line[sID])
    sortedData = sorted(keptLines, key=itemgetter(sID), reverse=True)
    for line in sortedData:
        line[sID] = str(line[sID])

    #output
    sortedFile = open(conf.conf['results'] + '.sorted', 'w')
    for line in sortedData:
        sortedFile.write('\t'.join(line) + '\n')
    sortedFile.close()

    #Now output stats
    statFile = open('statFile.data', 'w')
    statFile.write('Total Clusters: %s\n' % len(densityDict))
    statFile.write('Passed: %s\n' % len(CIDpassed))
    statFile.close()  # previously left unclosed
def updateOverlaps(cName=None):
    '''Rewrite the results file, flagging known-sequence overlaps in field 6.

    Field 6 becomes 1 when the line's mature tcc (field 1) appears in the
    matureOverlaps file, else 0. Fixes: list membership was O(n) per line
    (now a set), the duplicated branches are collapsed, and the output file
    is closed.
    '''
    #init
    conf = cgConfig.getConfig(cName)
    pFileName = conf.conf['results']
    overlapsFileName = conf.conf['matureOverlaps']

    # put overlapping sequences in a set for O(1) membership tests
    overlapFile = open(overlapsFileName, 'r')
    overlaps = set(tcc.strip() for tcc in overlapFile)
    overlapFile.close()

    # check each line of pred file for overlap: 1 for overlap, 0 for non
    predFile = open(pFileName, 'r')
    newFileLines = []
    for line in predFile:
        newLine = line.strip().split('\t')
        mTcc = newLine[1]
        newLine[6] = str(1) if mTcc in overlaps else str(0)
        newFileLines.append('\t'.join(newLine) + '\n')
    predFile.close()

    #write new File
    newFile = open(pFileName, 'w')
    for line in newFileLines:
        newFile.write(line)
    newFile.close()  # previously left unclosed
def returnChromLengthDict(assembly):
    '''Return {chromName: length} parsed from the assembly's length file.

    The file lives at <chromosomeLengths>/<assembly> and is tab-separated:
    chromName<TAB>length. Fix: the file handle was previously never closed.
    '''
    mConf = c.getConfig('Main.conf')
    lenFileName = mConf.conf['chromosomeLengths'] + '/' + assembly
    f = open(lenFileName, 'r')
    lenDict = {}
    for line in f:
        fields = line.split('\t')
        lenDict[fields[0]] = int(fields[1])
    f.close()  # previously leaked
    return lenDict
def scanVectorsOrganism(tccList, config=None):
    '''Given a tcc list, scan the organism's wig files; return {coord: value}.

    Fixes: org/mConf/wigDir were recomputed on every tcc although they are
    loop-invariant (hoisted), and the line index was never closed (the
    svCoord variant of this function does close it).
    '''
    config = c.getConfig(config)
    # loop-invariant configuration, hoisted out of the per-tcc loop
    org = config.conf['organism']
    mConf = c.getConfig('Main.conf')
    wigDir = mConf.conf['wig%s' % org]

    coordDict = {}  # coord -> wig value
    for tcc in tccList:
        chrom, strand, tccStart, tccEnd = cg.tccSplit(tcc)
        fN = wigDir + '/Merge.%s.%s.wig.%s.wig' % (org.lower(), strand, chrom)

        # goto correct line in index; there actually is a header
        fIndex = cgIndex.lineIndex(fN, header=True)
        fIndex.passCheckFunction(cgIndex.wigCheckFunction)
        fIndex.binarySearch(tcc)  # places file pointer at beginning of tcc

        stop = False
        for line in fIndex.file:
            lBeg = int(cg.ss(line)[1])
            lEnd = int(cg.ss(line)[2])
            lValue = int(cg.ss(line)[3].split('.')[0])
            # clamp the wig block to the query range
            if tccStart > lBeg:
                lBeg = tccStart
            if tccEnd < lEnd:
                lEnd = tccEnd
                stop = True  # query exhausted after this block
            for i in range(lBeg, lEnd):
                coordDict[i] = lValue
            if stop:
                break
        fIndex.close()  # previously leaked
    return coordDict
def writeWigFromHitDict(hitDict, assembly, name, directory=None):
    '''Write hitDict ({chrom: {strand: {coord: value}}}) as bedGraph wigs.

    One file per chrom/strand pair, named <name>.<chrom>.<strand>.wig under
    directory.  Runs of adjacent coords with equal values are merged into
    one block; gaps between coords get an explicit zero block.
    '''
    mConf = c.getConfig('Main.conf')
    if not directory:
        directory = mConf.conf['wigs']
    if not name:
        # NOTE(review): when name is falsy, this derives the name from that
        # same falsy value -- looks wrong; confirm the intended filename source.
        name = cg.getBaseFileName(name, naked=True)

    lDict = cg.returnChromLengthDict(assembly)
    cg.clearDirectory(directory, overwrite=False)

    #write results to wig file
    for chrom in hitDict:
        for strand in hitDict[chrom]:
            oF = open(directory + '/%s.%s.%s.wig' % (name, chrom, strand), 'w')
            oF.write('track type=bedGraph name=%s.%s.%s\n' % (name, chrom, strand))
            chromEnd = lDict[chrom]  # chromosome length (currently unused)
            # hitDict[chrom][strand][chromEnd] = 0
            keys = hitDict[chrom][strand].keys()
            keys.sort()  # coords must be ascending for the block merge below

            prevVal = 0
            prevCoord = 0
            blockStart = 0
            blockEnd = 1
            for key in keys:
                val = hitDict[chrom][strand][key]
                if prevCoord == key - 1:  # contiguous with previous coord
                    if val == prevVal:  # same value: extend current block
                        blockEnd = key + 1
                    else:  # value changed, no gap: flush and restart
                        #write old block
                        oF.write('%s\t%s\t%s\t%s\n' % (chrom, blockStart, blockEnd, prevVal))  #!make it a float value?
                        #start new block
                        blockStart = key
                        blockEnd = key + 1
                else:  # gap: flush old block and emit an explicit zero block
                    #write old block
                    oF.write('%s\t%s\t%s\t%s\n' % (chrom, blockStart, blockEnd, prevVal))
                    #write zero block
                    oF.write('%s\t%s\t%s\t%s\n' % (chrom, blockEnd, key, 0))
                    #start new block
                    blockStart = key
                    blockEnd = key + 1
                prevVal = val
                prevCoord = key
            oF.close()
def updateClusterInfo(cName=None):
    '''Annotate each results line with its cluster ID and overlap status.

    Field 7 becomes the line's cluster ID (position of its cluster in the
    sortedClusters file); field 8 becomes 1 when that cluster contains any
    line already flagged as overlapping a known sequence (field 6 == '1'),
    else 0. Fixes: repeated line splits hoisted, duplicated branches
    collapsed, output file closed.
    '''
    #init
    conf = cgConfig.getConfig(cName)
    pFileName = conf.conf['results']
    sortedClustersFileName = conf.conf['sortedClusters']

    # map tcc -> clusterID; one comma-separated cluster per line, IDs are
    # the 0-based line numbers
    sortedFile = open(sortedClustersFileName, 'r')
    idDict = {}
    i = 0
    for line in sortedFile:
        clusterID = str(i)
        i = i + 1
        cluster = line.strip().split(',')
        cluster.remove('')  # pesky last comma leaves an empty item
        for tcc in cluster:
            idDict[tcc] = clusterID
    sortedFile.close()

    # clusters containing at least one known-overlap line
    overDict = {}
    predFile = open(pFileName, 'r')
    for line in predFile:
        fields = line.strip().split('\t')
        if fields[6] == '1':
            overDict[idDict[fields[1]]] = 1
    predFile.close()

    #now remake file
    newLines = []
    predFile = open(pFileName, 'r')
    for line in predFile:
        newLine = line.strip().split('\t')
        clusterID = idDict[newLine[1]]
        newLine[7] = str(clusterID)
        newLine[8] = str(1) if clusterID in overDict else str(0)
        newLines.append('\t'.join(newLine) + '\n')
    predFile.close()

    #write new File
    newFile = open(pFileName, 'w')
    for line in newLines:
        newFile.write(line)
    newFile.close()  # previously left unclosed
def scanVectorsOrganism(tccList, config = None):
    '''Given tcc list --> scan Organism wig files and coord:value...

    NOTE(review): duplicate of the scanVectorsOrganism defined earlier;
    this later definition shadows it at import time.
    '''
    config = c.getConfig(config)
    coordDict = {} # coord -> wig value
    for tcc in tccList:
        chrom, strand, tccStart, tccEnd = cg.tccSplit(tcc)
        # NOTE(review): org/mConf/wigDir are loop-invariant and could be
        # hoisted out of the per-tcc loop.
        org = config.conf['organism']
        mConf = c.getConfig('Main.conf')
        wigDir = mConf.conf['wig%s' % org]
        fN = wigDir + '/Merge.%s.%s.wig.%s.wig' % (org.lower(),strand,chrom)

        #goto correct line in index
        fIndex = cgIndex.lineIndex(fN, header = True) #!!!there actually is a header...have to deal with this...
        fIndex.passCheckFunction(cgIndex.wigCheckFunction)
        fIndex.binarySearch(tcc) #places file pointer at beginning of tcc as beginning

        stop = False
        for line in fIndex.file:
            lBeg = int(cg.ss(line)[1])
            lEnd = int(cg.ss(line)[2])
            lValue = int(cg.ss(line)[3].split('.')[0])
            # clamp the wig block to the query range
            if tccStart > lBeg:
                lBeg = tccStart
            if tccEnd < lEnd:
                lEnd = tccEnd
                stop = True  # query exhausted after this block
            for i in range(lBeg, lEnd):
                coordDict[i] = lValue
            if stop:
                break
        # NOTE(review): fIndex is never closed here (the svCoord variant
        # closes it) -- possible file-handle leak.
    return coordDict
def svCoord(tccList, config = None): '''Given tcc list --> scan Organism wig files and coord:value... ''' #init config = c.getConfig(config) org = config.conf['organism'] wigDir = config.conf['wigSetDir'] wigSetName = config.conf['wigSetName'] splitIntoChroms = config.conf['wigChromSplit'] if splitIntoChroms == 'True': splitIntoChroms = True else: splitIntoChroms = False coordDict = {} # tcc: [list values] for tcc in tccList: chrom, strand, tccStart, tccEnd = cg.tccSplit(tcc) if splitIntoChroms: fN = wigDir + '/%s.%s.%s.wig' % (wigSetName, chrom, strand) else: fN = wigDir + '/Merge.%s.%s.wig' % (org.lower(), strand) fIndex = cgIndex.lineIndex(fN, header = True) fIndex.passCheckFunction(cgIndex.wigCheckFunction) fIndex.binarySearch(tcc) #places file pointer at beginning of tcc as beginning stop = False for line in fIndex.file: #print 'Line:', line.strip() lBeg = int(cg.ss(line)[1]) + 1 #print 'lBeg', lBeg lEnd = int(cg.ss(line)[2]) #print 'lEnd', lEnd #print '--' lValue = int(cg.ss(line)[3].split('.')[0]) if tccStart > lBeg: lBeg = tccStart if tccEnd < lEnd: lEnd = tccEnd stop = True #print timer.split() for i in range(lBeg, lEnd + 1): coordDict[i] = lValue if stop: break fIndex.close() #close the file and the index after use... return coordDict
def writeWigFromHitDict(hitDict, assembly, name, directory = None):
    '''Write hitDict ({chrom: {strand: {coord: value}}}) as bedGraph wigs.

    NOTE(review): duplicate of the writeWigFromHitDict defined earlier;
    this later definition shadows it at import time.
    '''
    mConf = c.getConfig('Main.conf')
    if not directory:
        directory = mConf.conf['wigs']
    if not name:
        # NOTE(review): when name is falsy, this derives the name from that
        # same falsy value -- looks wrong; confirm the intended filename source.
        name = cg.getBaseFileName(name, naked = True)

    lDict = cg.returnChromLengthDict(assembly)
    cg.clearDirectory(directory, overwrite = False)

    #write results to wig file
    for chrom in hitDict:
        for strand in hitDict[chrom]:
            oF = open(directory + '/%s.%s.%s.wig' % (name, chrom, strand), 'w')
            oF.write('track type=bedGraph name=%s.%s.%s\n' % (name, chrom, strand))
            chromEnd = lDict[chrom]  # chromosome length (currently unused)
            # hitDict[chrom][strand][chromEnd] = 0
            keys = hitDict[chrom][strand].keys()
            keys.sort()  # coords must be ascending for the block merge below

            prevVal = 0
            prevCoord = 0
            blockStart = 0
            blockEnd = 1
            for key in keys:
                val = hitDict[chrom][strand][key]
                if prevCoord == key - 1:  # contiguous with previous coord
                    if val == prevVal:#should be combined
                        blockEnd = key + 1
                    else: #no zero block
                        #write old block
                        oF.write('%s\t%s\t%s\t%s\n' % (chrom, blockStart, blockEnd, prevVal)) #!make it a float value?
                        #start new block
                        blockStart = key
                        blockEnd = key + 1
                    # value changed without a gap: flush, restart block
                else:  # gap: flush old block, emit explicit zero block
                    #write old block
                    oF.write('%s\t%s\t%s\t%s\n' % (chrom, blockStart, blockEnd, prevVal))
                    #write zero block
                    oF.write('%s\t%s\t%s\t%s\n' % (chrom, blockEnd, key, 0))
                    #start new block
                    blockStart = key
                    blockEnd = key + 1
                prevVal = val
                prevCoord = key
            oF.close()
def filterOut(cName=None):
    '''Write predictions that overlap known sequences to the matureOverlaps file.

    Fix: the output file handle was previously never closed.
    '''
    #Init
    conf = cgConfig.getConfig(cName)
    predictionList = compare.tccFileToList(conf.conf['resultsRaw'], 1)
    #predictionList = compare.tccFileToList(conf.conf['resultsRaw'], 0)
    # True -> return the tccs that WERE filtered out, not the survivors
    overlapped = compare.filterOutTccs(predictionList, conf.conf['knownDirectory'], True)
    matureOverlaps = open(conf.conf['matureOverlaps'], 'w')
    for tcc in overlapped:
        matureOverlaps.write(tcc + '\n')
    matureOverlaps.close()  # previously left unclosed
def filterOut(cName=None):
    """Write predictions that overlap known sequences to the matureOverlaps file.

    NOTE(review): duplicate of the filterOut defined earlier; this later
    definition shadows it. Fix: the output file was previously never closed.
    """
    # Init
    conf = cgConfig.getConfig(cName)
    predictionList = compare.tccFileToList(conf.conf["resultsRaw"], 1)
    # predictionList = compare.tccFileToList(conf.conf['resultsRaw'], 0)
    # True -> return the tccs that WERE filtered out, not the survivors
    overlapped = compare.filterOutTccs(
        predictionList, conf.conf["knownDirectory"], True
    )
    matureOverlaps = open(conf.conf["matureOverlaps"], "w")
    for tcc in overlapped:
        matureOverlaps.write(tcc + "\n")
    matureOverlaps.close()  # previously left unclosed
def addPeriods(cName=None):
    '''Copy resultsRaw to results, keeping the first five tab fields and
    padding six "." placeholder columns on each line.

    Fixes: both file handles were previously never closed, and an unused
    newLines list has been removed.
    '''
    #init
    conf = cgConfig.getConfig(cName)  # gets the current configuration instructions
    pFileName = conf.conf['resultsRaw']
    nFileName = conf.conf['results']
    pFile = open(pFileName, 'r')
    nFile = open(nFileName, 'w')
    for line in pFile:
        newLine = '\t'.join(line.strip().split('\t')[0:5]) + '\t.\t.\t.\t.\t.\t.\n'
        nFile.write(newLine)
    pFile.close()  # previously left unclosed
    nFile.close()  # previously left unclosed
def splitExonsIntrons(cName = None):
    '''Split sorted predictions into exonic and intronic files.

    A hairpin counts as exonic when it overlaps the organism's exon list by
    at least minOverlap bases; its result lines go to <resultsSorted>.exons,
    everything else to <resultsSorted>.introns.
    '''
    mConf = c.cgConfig('Main.conf')  # unused here
    conf = c.getConfig(cName)

    #init
    organism = conf.conf['organism']
    minOverlap = 50  # minimum exon overlap (bases) to call a hairpin exonic
    cHairs = getHairpins.getHairpins() #CID: HAIRPIN
    exonList = compare.tccFileToList('%sExons.tcc' % organism, 0)
    hairpins = []
    for CID in cHairs:
        hairpins.append(cHairs[CID])

    print 'checking overlaps'
    #check which hairpins overlap exons and by how much
    exonOverlapped = compare.compareTwoTcc(hairpins, exonList, 1, amount = True)
    print ' ', len(exonOverlapped)

    print 'removing partial introns'
    #remove the ones that didn't overlap more than X:
    remList = []
    for tcc, oAmount in exonOverlapped:
        if oAmount < minOverlap:
            remList.append([tcc, oAmount])
    for item in remList:
        exonOverlapped.remove(item)
    print ' ', len(exonOverlapped), 'out of', len(cHairs.keys())

    #get CIDs of exons (reverse lookup tcc -> CID by scanning cHairs)
    exonCIDs = []
    for tcc, oAmount in exonOverlapped:
        for CID in cHairs:
            if cHairs[CID] == tcc:
                exonCIDs.append(str(CID))

    #Open sorted predictions and write lines with CIDs to respective files
    predFile = open(conf.conf['resultsSorted'], 'r')
    exonFile = open(conf.conf['resultsSorted'] + '.exons', 'w')
    intronFile = open(conf.conf['resultsSorted'] + '.introns', 'w')
    for line in predFile:
        if line.split('\t')[7] in exonCIDs:  # field 7 = cluster ID
            exonFile.write(line)
        else:
            intronFile.write(line)
    predFile.close()
    exonFile.close()
    intronFile.close()
def updateDensity(cName=None):
    '''Recompute field 5 (cluster density, cVal) of every results line.

    Loads per-frame hit counts, builds a coordinate hitmap of the blocks,
    then for each prediction takes the maximum hit count over all blocks it
    overlaps, and rewrites the results file in place.
    '''
    #Create hitmap for blocks, cValdict for block
    conf = cgConfig.getConfig(cName)
    blockFileName = conf.conf['hitsPerFrame'] # created in defineCluster script folder
    blockFile = open(blockFileName, 'r')
    blocksList = []
    cValBlockDict = {}  # block tcc -> hit count
    for line in blockFile:
        blocksList.append(line.strip().split('\t')[0])
        cValBlockDict[line.strip().split('\t')[0]] = int(line.strip().split('\t')[1])
    blockFile.close()
    blockHitmap = bioLibCG.createHitMap(blocksList)

    #Now append the cVal for each predicted line:
    predictedFileName = conf.conf['results']
    predictedFile = open(predictedFileName, 'r')
    newFileList = []
    counter = 0
    for line in predictedFile:
        counter = counter + 1
        #print counter
        cVal = 0
        #what blocks does this prediction overlap?
        tccPrediction = line.strip().split('\t')[1] #This should be mature?
        coordsPrediction = bioLibCG.stripTripleColon(tccPrediction)
        for i in range(int(coordsPrediction['start']), int(coordsPrediction['end'])):
            if i in blockHitmap:
                for block in blockHitmap[i]:
                    if bioLibCG.tccOverlap(tccPrediction, block):
                        # keep the highest density among overlapping blocks
                        if cValBlockDict[block] > cVal:
                            cVal = cValBlockDict[block]
        newLine = line.strip().split('\t')
        newLine[5] = str(cVal)
        newLine = '\t'.join(newLine) + '\n'
        newFileList.append(newLine)
    predictedFile.close()

    # rewrite the results file in place with the updated densities
    newFileName = conf.conf['results']
    newFile = open(newFileName, 'w')
    for line in newFileList:
        newFile.write(line)
    newFile.close()
def splitExonsIntrons(cName=None):
    '''Split sorted predictions into exonic and intronic files.

    NOTE(review): duplicate of the splitExonsIntrons defined earlier; this
    later definition shadows it at import time.
    '''
    mConf = c.cgConfig('Main.conf')  # unused here
    conf = c.getConfig(cName)

    #init
    organism = conf.conf['organism']
    minOverlap = 50  # minimum exon overlap (bases) to call a hairpin exonic
    cHairs = getHairpins.getHairpins() #CID: HAIRPIN
    exonList = compare.tccFileToList('%sExons.tcc' % organism, 0)
    hairpins = []
    for CID in cHairs:
        hairpins.append(cHairs[CID])

    print 'checking overlaps'
    #check which hairpins overlap exons and by how much
    exonOverlapped = compare.compareTwoTcc(hairpins, exonList, 1, amount=True)
    print ' ', len(exonOverlapped)

    print 'removing partial introns'
    #remove the ones that didn't overlap more than X:
    remList = []
    for tcc, oAmount in exonOverlapped:
        if oAmount < minOverlap:
            remList.append([tcc, oAmount])
    for item in remList:
        exonOverlapped.remove(item)
    print ' ', len(exonOverlapped), 'out of', len(cHairs.keys())

    #get CIDs of exons (reverse lookup tcc -> CID by scanning cHairs)
    exonCIDs = []
    for tcc, oAmount in exonOverlapped:
        for CID in cHairs:
            if cHairs[CID] == tcc:
                exonCIDs.append(str(CID))

    #Open sorted predictions and write lines with CIDs to respective files
    predFile = open(conf.conf['resultsSorted'], 'r')
    exonFile = open(conf.conf['resultsSorted'] + '.exons', 'w')
    intronFile = open(conf.conf['resultsSorted'] + '.introns', 'w')
    for line in predFile:
        if line.split('\t')[7] in exonCIDs:  # field 7 = cluster ID
            exonFile.write(line)
        else:
            intronFile.write(line)
    predFile.close()
    exonFile.close()
    intronFile.close()
def finalSort(pType, cName = None): #INIT conf = cgConfig.getConfig(cName) if pType == "E": pFileName = conf.conf['resultsExons'] else: pFileName = conf.conf['resultsIntrons'] minDensity = 4 maxPVal = float(.05) pFile = open(pFileName, 'r') fileLines = [] #This will hold the lists to be sorted... for line in pFile: fileLines.append(line.strip().split('\t')) pFile.close() keptLines = [] for line in fileLines: CID = line[7] cDensity = int(line[12]) pVal = float(line[15]) #print pVal, cDensity if cDensity > 5 and pVal < maxPVal: #print ' kept' keptLines.append(line) print len(keptLines) #remake keptLines with float(pVal) i = 12 for line in keptLines: line[i] = float(line[i]) sortedData = sorted(keptLines, key=itemgetter(i), reverse = True) #sort by i for line in sortedData: line[i] = str(line[i]) #print len(sortedData) #output sortedFile = open(pFileName + '.sorted', 'w') for line in sortedData: sortedFile.write('\t'.join(line) + '\n') sortedFile.close()
def addPeriods(cName=None):
    '''Copy resultsRaw to results, keeping the first five tab fields and
    padding six "." placeholder columns.

    NOTE(review): duplicate of the addPeriods defined earlier; this later
    definition shadows it. Neither file handle is closed and the newLines
    list is never used.
    '''
    #init
    conf = cgConfig.getConfig(cName) #gets the current configuration instructions
    pFileName = conf.conf['resultsRaw']
    nFileName = conf.conf['results']
    pFile = open(pFileName, 'r')
    nFile = open(nFileName, 'w')
    newLines = []  # unused
    for line in pFile:
        newLine = '\t'.join(line.strip().split('\t')[0:5]) + '\t.\t.\t.\t.\t.\t.\n'
        nFile.write(newLine)
def finalSort(pType, cName=None):
    '''Filter and sort exon ("E") or intron results, writing <file>.sorted.

    NOTE(review): duplicate of the finalSort defined earlier; this later
    definition shadows it at import time.
    '''
    #INIT
    conf = cgConfig.getConfig(cName)
    if pType == "E":
        pFileName = conf.conf['resultsExons']
    else:
        pFileName = conf.conf['resultsIntrons']
    minDensity = 4  # unused; the filter below hard-codes 5
    maxPVal = float(.05)

    pFile = open(pFileName, 'r')
    fileLines = [] #This will hold the lists to be sorted...
    for line in pFile:
        fileLines.append(line.strip().split('\t'))
    pFile.close()

    keptLines = []
    for line in fileLines:
        CID = line[7]  # unused
        cDensity = int(line[12])
        pVal = float(line[15])
        #print pVal, cDensity
        if cDensity > 5 and pVal < maxPVal:
            #print ' kept'
            keptLines.append(line)
    print len(keptLines)

    # convert field 12 (density) to float for a numeric sort
    i = 12
    for line in keptLines:
        line[i] = float(line[i])
    sortedData = sorted(keptLines, key=itemgetter(i), reverse=True) #sort by i
    for line in sortedData:
        line[i] = str(line[i])
    #print len(sortedData)

    #output
    sortedFile = open(pFileName + '.sorted', 'w')
    for line in sortedData:
        sortedFile.write('\t'.join(line) + '\n')
    sortedFile.close()
def updateDensity(cName=None):
    """Recompute field 5 (cluster density, cVal) of every results line.

    NOTE(review): duplicate of the updateDensity defined earlier; this
    later definition shadows it at import time.
    """
    # Create hitmap for blocks, cValdict for block
    conf = cgConfig.getConfig(cName)
    blockFileName = conf.conf["hitsPerFrame"]  # created in defineCluster script folder
    blockFile = open(blockFileName, "r")
    blocksList = []
    cValBlockDict = {}  # block tcc -> hit count
    for line in blockFile:
        blocksList.append(line.strip().split("\t")[0])
        cValBlockDict[line.strip().split("\t")[0]] = int(line.strip().split("\t")[1])
    blockFile.close()
    blockHitmap = bioLibCG.createHitMap(blocksList)

    # Now append the cVal for each predicted line:
    predictedFileName = conf.conf["results"]
    predictedFile = open(predictedFileName, "r")
    newFileList = []
    counter = 0
    for line in predictedFile:
        counter = counter + 1
        # print counter
        cVal = 0
        # what blocks does this prediction overlap?
        tccPrediction = line.strip().split("\t")[1]  # This should be mature?
        coordsPrediction = bioLibCG.stripTripleColon(tccPrediction)
        for i in range(int(coordsPrediction["start"]), int(coordsPrediction["end"])):
            if i in blockHitmap:
                for block in blockHitmap[i]:
                    if bioLibCG.tccOverlap(tccPrediction, block):
                        # keep the highest density among overlapping blocks
                        if cValBlockDict[block] > cVal:
                            cVal = cValBlockDict[block]
        newLine = line.strip().split("\t")
        newLine[5] = str(cVal)
        newLine = "\t".join(newLine) + "\n"
        newFileList.append(newLine)
    predictedFile.close()

    # rewrite the results file in place with the updated densities
    newFileName = conf.conf["results"]
    newFile = open(newFileName, "w")
    for line in newFileList:
        newFile.write(line)
    newFile.close()
def mergeInputs(cName, eLevel): conf = c.getConfig(cName) assembly = conf.conf['assembly'] ending = '%s.%s' % (eLevel, assembly) print 'merging all files with ending', ending newLines = [] for fN in cg.recurseDir('out', end = ending): print os.getcwd(), fN fN = os.getcwd() + '/' + fN f = open(fN, 'r') newLines.extend(f.readlines()) f.close() f = open('peakData.%s.%s' % (eLevel, assembly), 'w') f.writelines(newLines) f.close()
def mergeInputs(cName, eLevel):
    '''Concatenate every file under out/ ending in <eLevel>.<assembly>
    into a single peakData.<eLevel>.<assembly> file.

    NOTE(review): duplicate of the mergeInputs defined earlier; this later
    definition shadows it at import time.
    '''
    conf = c.getConfig(cName)
    assembly = conf.conf['assembly']
    ending = '%s.%s' % (eLevel, assembly)
    print 'merging all files with ending', ending

    newLines = []
    for fN in cg.recurseDir('out', end=ending):
        print os.getcwd(), fN
        fN = os.getcwd() + '/' + fN
        f = open(fN, 'r')
        newLines.extend(f.readlines())
        f.close()

    f = open('peakData.%s.%s' % (eLevel, assembly), 'w')
    f.writelines(newLines)
    f.close()
def parallelMakePeaks(tcc, cName, minExpression):
    '''Worker: find expression peaks in one genomic window and test each.

    Every peak position passing extendPeakTest is written to
    out/peakData.<tcc>.<minExpression>.<assembly>.
    NOTE(review): two more variants of this function appear later in the
    file; the last definition is the one in effect at import time.
    '''
    conf = c.getConfig(cName)
    f = open('out/peakData.%s.%s.%s' % (tcc, minExpression, conf.conf['assembly']), 'w')
    chrom, strand, start, end = cg.tccSplit(tcc)
    peaks = cgPeaks.stretch(tcc, cName)
    print 'getting peaks'
    peaks.createPeaks(span = 1, minVal = int(minExpression))
    for x in peaks.peaks:
        print x
        # test a 1-bp tcc at the peak position (extendPeakTest defined elsewhere)
        newTcc = cg.makeTcc(chrom, strand, x, x + 1)
        testedPeak = extendPeakTest(newTcc, 20, .2, .05, 0, 6, cName)
        #testedPeak = roofPeakTest(newTcc, 30, .85, .9, .2, 6, 17, 24, cName)
        if testedPeak:
            f.write('%s\n' % testedPeak)
    f.close()
def parallelMakePeaks(tcc, cName, minExpression):
    '''Worker: find expression peaks in one genomic window and test each.

    NOTE(review): duplicate variant of the parallelMakePeaks defined above;
    differs only in what it prints and in the commented-out roofPeakTest
    parameters. A third variant appears later; the last wins at import time.
    '''
    conf = c.getConfig(cName)
    f = open('out/peakData.%s.%s.%s' % (tcc, minExpression, conf.conf['assembly']), 'w')
    chrom, strand, start, end = cg.tccSplit(tcc)
    peaks = cgPeaks.stretch(tcc, cName)
    print 'getting peaks'
    peaks.createPeaks(span = 1, minVal = int(minExpression))
    for x in peaks.peaks:
        print ""
        print chrom, strand, x,
        # test a 1-bp tcc at the peak position (extendPeakTest defined elsewhere)
        newTcc = cg.makeTcc(chrom, strand, x, x + 1)
        testedPeak = extendPeakTest(newTcc, 20, .2, .05, 0, 6, cName)
        #testedPeak = roofPeakTest(newTcc, 30, .85, .9, .2, 8, 16, 25, cName)
        if testedPeak:
            f.write('%s\n' % testedPeak)
    f.close()
def updateOverlaps(cName = None):
    '''Set field 6 of every results line: 1 if the line's mature tcc appears
    in the matureOverlaps file, else 0; rewrites the file in place.

    NOTE(review): duplicate of the updateOverlaps defined earlier; this
    later definition shadows it. The output handle is never closed and the
    list membership test is O(n) per line.
    '''
    #init
    conf = cgConfig.getConfig(cName)
    pFileName = conf.conf['results']
    overlapsFileName = conf.conf['matureOverlaps']

    #put overlapping sequences in list:
    overlaps = []
    overlapFile = open(overlapsFileName, 'r')
    for tcc in overlapFile:
        overlaps.append(tcc.strip())
    overlapFile.close()

    #check each line of pred file for overlap, add 1 for overlap and 0 for non
    predFile = open(pFileName, 'r')
    newFileLines = []
    for line in predFile:
        mTcc = line.strip().split('\t')[1]
        if mTcc in overlaps:
            newLine = line.strip().split('\t')
            newLine[6] = str(1)
            newLine = '\t'.join(newLine) + '\n'
            newFileLines.append(newLine)
        else:
            newLine = line.strip().split('\t')
            newLine[6] = str(0)
            newLine = '\t'.join(newLine) + '\n'
            newFileLines.append(newLine)
    predFile.close()

    #write new File
    newFile = open(pFileName, 'w')
    for line in newFileLines:
        newFile.write(line)
def intronNoisy(cName=None):
    '''Measure background expression in the 1kb flanks of intronic hairpins.

    For each cluster's hairpin, scans `slide` bases on each side, excluding
    all predictions and annotated exons, and writes the per-CID expression
    level columns (NA-padded) to conf['intronNoiseData'].
    '''
    mConf = c.cgConfig('Main.conf')  # unused here
    conf = c.getConfig(cName)

    #init
    cHairs = getHairpins.getHairpins(conf.conf['resultsIntrons']) #CID: HAIRPIN
    organism = conf.conf['organism']
    exonList = compare.tccFileToList('%sExons.tcc' % organism, 0)
    slide = 1000  # flank size (bases) scanned on each side of the hairpin

    #make prediction overlap hitmap
    predMap = {}  # unused
    predList = []
    for CID in cHairs:
        hPin = cHairs[CID]
        predList.append(hPin)

    #collapse Overlaps
    print ' collapsing predictions'
    predList = compare.collapseOverlaps(predList)
    print ' collapsing exons'
    exonList = compare.collapseOverlaps(exonList)

    #collect levels for each hairpin region
    cidLevels = {}
    for CID in cHairs:
        print CID
        hPin = cHairs[CID]
        chrom = ss(hPin, ':')[0]
        strand = ss(hPin, ':')[1]
        start = int(ss(hPin, ':')[2])
        end = int(ss(hPin, ':')[3])

        scanStart = start - slide
        scanEnd = end + slide
        # two flanking windows: upstream and downstream of the hairpin
        scanRange = []
        scanRange.append('%s:%s:%s:%s' % (chrom, strand, scanStart, start))
        scanRange.append('%s:%s:%s:%s' % (chrom, strand, end, scanEnd))
        print scanRange
        # exclude our own predictions and annotated exons from the flanks
        scanRange = compare.subtractTwoTccLists(scanRange, predList)
        scanRange = compare.subtractTwoTccLists(scanRange, exonList)

        levels = []
        print ' Retrieving Expression levels:', cg.getTccListTotalLength(scanRange)
        levels = []
        hPinLevels = stepVectorScan.scanVectorsHist(scanRange, cName)
        for hPin in hPinLevels:
            levels.extend(hPinLevels[hPin])
        cidLevels[CID] = levels

    #output levels to file
    #find longest level list -> number of matrix rows
    longest = 0
    for CID in cidLevels:
        length = len(cidLevels[CID])
        if length > longest:
            longest = length

    sortedKeys = cidLevels.keys()
    sortedKeys.sort()

    newLines = []
    for j in range(0, longest): #how many lines are there
        newLine = []
        for CID in sortedKeys:
            if len(cidLevels[CID]) > j: # add it
                newLine.append(str(cidLevels[CID][j]))
            else:
                newLine.append('NA')  # pad shorter columns
        newLines.append('\t'.join(newLine) + '\n')

    outFileN = conf.conf['intronNoiseData']
    outFile = open(outFileN, 'w')
    outFile.write('\t'.join(sortedKeys) + '\n')
    outFile.writelines(newLines)
    outFile.close()
def parallelMakePeaks(tcc, cName, minExpression):
    '''Worker: find peaks in one window and keep those shaped like "roofs".

    Third variant of parallelMakePeaks (this last definition wins at import
    time). For each raw peak it profiles a +/- pRange window, finds the
    longest stretch of positions above minVal (the roof), then keeps the
    peak only if the roof is short (1-4), drops off steeply on both sides,
    and the surrounding background expression is low. Kept regions go to
    out/peakData.<tcc>.<minExpression>.<assembly>.
    '''
    conf = c.getConfig(cName)
    f = open('out/peakData.%s.%s.%s' % (tcc, minExpression, conf.conf['assembly']), 'w')
    print 'scanning range', tcc
    chrom, strand, start, end = cg.tccSplit(tcc)
    peaks = cgPeaks.stretch(tcc, cName)
    #print 'getting peaks'
    peaks.createPeaks(span = 1, minVal = int(minExpression))
    print 'len peaks', len(peaks.peaks)
    endCheck = 0
    for x in peaks.peaks:
        print x, endCheck
        '''
        if x < endCheck:
            print 'endChecked'
            continue
        '''
        #scan a 30 bp range around this point and find the best roof...
        pRange = 40
        rTcc = cg.makeTcc(chrom, strand, x, x + 1)

        #now make profile for roof...
        cProfile = stepVectorScan.profileAroundPoint(rTcc, pRange, cName, ratio = True)

        #now get highest stretch length and the rNext coord.
        minVal = .70  # ratio threshold for a position to count as roof
        highest = 0
        stretch = 0
        startCurrent = None
        startFinal = None
        endFinal = None
        for i in range(1 - pRange, pRange):
            print ' ', x + i, cProfile[i]
            if cProfile[i] > minVal:
                print ' extending stretch'
                stretch += 1
                if startCurrent == None:
                    startCurrent = i
            else:
                if stretch > 0:
                    print 'end of stretch'
                    if stretch > highest:
                        #stretch ended and was higher than previous
                        highest = stretch
                        endFinal = i - 1
                        startFinal = startCurrent
                    startCurrent = None
                else:
                    startCurrent = None
                stretch = 0

        #get +/- extend value (drop-off just outside the roof)...
        val = [1.0, 1.0]
        extend = 1
        if (startFinal) and (endFinal):
            low = startFinal - extend
            high = endFinal + extend
            if low > (1 - pRange) and high < pRange:
                val[0] = float(cProfile[startFinal - extend])
                val[1] = float(cProfile[endFinal + extend])
            else:
                print 'out of range'
                continue
        else:
            print 'no start and end of peak'
            continue
        print low, high, x, endFinal
        endCheck = x + endFinal

        #avg expression around peak check...
        #get total expression before peak
        noiseExpression = 0
        lowRange = range(1 - pRange, low)
        highRange = range(high + 1, pRange)
        totalLength = len(lowRange) + len(highRange)
        for i in lowRange:
            noiseExpression += cProfile[i]
        for i in highRange:
            noiseExpression += cProfile[i]
        avgNoise = noiseExpression/float(totalLength)

        #filter out peaks that look a certain way.
        print highest, val[0], val[1], avgNoise
        if 0 < highest < 5: #rooflength 14/26
            if val[0] < 0.20 and val[1] < .20: #drop values
                if avgNoise < .3:
                    goodTcc = cg.makeTcc(chrom, strand, x + low, x + high)
                    print '*KEEPER'
                    f.write('%s\n' % goodTcc)
    f.close()
    print 'DONE', tcc
import getHairpins
import cgGenes
import cgConfig as c
import bioLibCG as cg

# Script: label each cluster's hairpin with the type of the first
# overlapping Ensembl transcript gene, then build result lines with that
# description appended as field 16.
mConf = c.getConfig('Main.conf')
geneSetFolder = mConf.conf['geneSetsHuman']

fN = '/home/chrisgre/projects/NoncodingHuman/results/NChuman-s3k8b17.results.sorted.introns.sorted'
cHairs = getHairpins.getHairpins(fN)  # CID -> hairpin tcc

ensGenes = cgGenes.createGeneSetFromFile(geneSetFolder + '/ensemblAllTranscripts.tsv')

cDesc = {} #CID:gDesc
for CID in cHairs:
    tcc = cHairs[CID]
    cDesc[CID] = "NONE"  # default when no gene overlaps
    overlappingGenes = ensGenes.geneOverlaps([tcc])
    if len(overlappingGenes) > 0:
        print overlappingGenes[0].type
        cDesc[CID] = overlappingGenes[0].type  # first overlapping gene wins

f = open(fN, 'r')
newLines = []
for line in f:
    CID = line.strip().split('\t')[7]  # field 7 = cluster ID
    newLines.append(cg.appendToLine(line, cDesc[CID], 16))
f.close()
# NOTE(review): newLines is built but never written anywhere -- confirm
# whether an output step was lost.
import cgGenes
import compareData as compare
import cgConfig as c

# Script: classify mm9 peak tccs by the transcript types they overlap, and
# count how many transcripts of each type exist in the whole gene set.
cName = 'mm9.conf'
mConf = c.getConfig('Main.conf')
conf = c.getConfig(cName)
organism = conf.conf['organism']
geneSetFolder = mConf.conf['geneSets%s' % organism]

genes = cgGenes.createGeneSetFromFile(geneSetFolder + '/allTransciptsType.tsv')
peakTccs = compare.tccFileToList('peakData.500.mm9', 0)

# tally overlapping transcripts by type
tOverlaps = genes.transcriptOverlaps(peakTccs)
typeDict = {}
for transcript in tOverlaps:
    if transcript.type not in typeDict:
        typeDict[transcript.type] = 1
    else:
        typeDict[transcript.type] += 1

#count the amounts of each type for each transcript
amount = {}
for gene in genes.genes:
    for t in gene.transcripts:
        if t.type in amount:
            amount[t.type] += 1
        else:
            amount[t.type] = 1

# NOTE(review): typeDict and amount are computed but only the peak total is
# printed -- confirm whether the per-type report was lost.
print 'Total Peaks:', len(peakTccs)
def intronNoisy(cName = None):
    '''Measure background expression in the 1kb flanks of intronic hairpins.

    NOTE(review): duplicate of the intronNoisy defined earlier; this later
    definition shadows it at import time.
    '''
    mConf = c.cgConfig('Main.conf')  # unused here
    conf = c.getConfig(cName)

    #init
    cHairs = getHairpins.getHairpins(conf.conf['resultsIntrons']) #CID: HAIRPIN
    organism = conf.conf['organism']
    exonList = compare.tccFileToList('%sExons.tcc' % organism, 0)
    slide = 1000  # flank size (bases) scanned on each side of the hairpin

    #make prediction overlap hitmap
    predMap = {}  # unused
    predList = []
    for CID in cHairs:
        hPin = cHairs[CID]
        predList.append(hPin)

    #collapse Overlaps
    print ' collapsing predictions'
    predList = compare.collapseOverlaps(predList)
    print ' collapsing exons'
    exonList = compare.collapseOverlaps(exonList)

    #collect levels for each hairpin region
    cidLevels = {}
    for CID in cHairs:
        print CID
        hPin = cHairs[CID]
        chrom = ss(hPin, ':')[0]
        strand = ss(hPin, ':')[1]
        start = int(ss(hPin, ':')[2])
        end = int(ss(hPin, ':')[3])

        scanStart = start - slide
        scanEnd = end + slide
        # two flanking windows: upstream and downstream of the hairpin
        scanRange = []
        scanRange.append('%s:%s:%s:%s' % (chrom, strand, scanStart, start))
        scanRange.append('%s:%s:%s:%s' % (chrom, strand, end, scanEnd))
        print scanRange
        # exclude our own predictions and annotated exons from the flanks
        scanRange = compare.subtractTwoTccLists(scanRange, predList)
        scanRange = compare.subtractTwoTccLists(scanRange, exonList)

        levels = []
        print ' Retrieving Expression levels:', cg.getTccListTotalLength(scanRange)
        levels = []
        hPinLevels = stepVectorScan.scanVectorsHist(scanRange, cName)
        for hPin in hPinLevels:
            levels.extend(hPinLevels[hPin])
        cidLevels[CID] = levels

    #output levels to file
    #find longest level list -> number of matrix rows
    longest = 0
    for CID in cidLevels:
        length = len(cidLevels[CID])
        if length > longest:
            longest = length

    sortedKeys = cidLevels.keys()
    sortedKeys.sort()

    newLines = []
    for j in range(0, longest): #how many lines are there
        newLine = []
        for CID in sortedKeys:
            if len(cidLevels[CID]) > j:# add it
                newLine.append(str(cidLevels[CID][j]))
            else:
                newLine.append('NA')  # pad shorter columns
        newLines.append('\t'.join(newLine) + '\n')

    outFileN = conf.conf['intronNoiseData']
    outFile = open(outFileN, 'w')
    outFile.write('\t'.join(sortedKeys) + '\n')
    outFile.writelines(newLines)
    outFile.close()
def scanVectorsSingleCoord(tccList, cName):
    '''Given tcc list --> scan wig files and return {coord: value}.

    Variant that reads the plain-text .index files with a linear scan
    instead of the cgIndex binary search used by the other scanners.
    '''
    conf = c.getConfig(cName)
    org = conf.conf['organism']
    mConf = c.getConfig('Main.conf')
    wigDir = mConf.conf['wig%s' % org]
    timer = cg.cgTimer()
    timer.start()

    coordDict = {} # coord -> wig value
    for tcc in tccList:
        theSplit = ss(tcc, ':')
        chrom, strand, tccStart, tccEnd = theSplit[0], theSplit[1], int(theSplit[2]), int(theSplit[3])

        #goto correct file, correct line in index
        fN = wigDir + '/Merge.%s.%s.wig.%s.wig' % (org.lower(), strand, chrom)
        fNindex = wigDir + '/Merge.%s.%s.wig.%s.wig.index' % (org.lower(), strand, chrom)

        #get line in index file: rows are startByte, begCoord, endCoord
        iFile = open(fNindex, 'r')
        # NOTE(review): string sentinel -- if no index row matches,
        # f.seek('None') below raises TypeError; consider None plus an
        # explicit error instead.
        startByte = 'None'
        for line in iFile:
            beg = int(cg.ss(line)[1])
            end = int(cg.ss(line)[2])
            if beg <= tccStart < end:
                startByte = int(cg.ss(line)[0])
                #print 'INDEX', line.strip()
                break
        iFile.close()

        #grab value
        f = open(fN, 'r')
        f.seek(startByte, 0)
        stop = False
        for line in f:
            #print 'Line:', line.strip()
            lBeg = int(cg.ss(line)[1])
            lEnd = int(cg.ss(line)[2])
            lValue = int(cg.ss(line)[3].split('.')[0])
            # clamp the wig block to the query range
            if tccStart > lBeg:
                lBeg = tccStart
            if tccEnd < lEnd:
                lEnd = tccEnd
                stop = True  # query exhausted after this block
            for i in range(lBeg, lEnd):
                coordDict[i] = lValue
            if stop:
                break
        f.close()
    return coordDict
def exonNoisy(cName = None): #init mConf = c.cgConfig('Main.conf') conf = c.getConfig(cName) cHairs = getHairpins.getHairpins(conf.conf['resultsExons']) #CID: HAIRPIN organism = conf.conf['organism'] geneSetFolder = mConf.conf['geneSets%s' % organism] #make prediction overlap hitmap print 'Making prediction list' predList = [] for CID in cHairs: hPin = cHairs[CID] predList.append(hPin) if compare.checkIfOverlaps(predList): predList = compare.collapseOverlaps(predList) #make genes for Ensemble/make list of tccs for exons. print 'Creating gene set' ensGenes = cgGenes.createGeneSetFromFile(geneSetFolder + '/ensemblAllExons.tsv') print ' loaded # genes:', len(ensGenes.set) #collect levels for each haipin region print '[Checking all levels]' cidLevels = {} for CID in cHairs: print CID hPin = cHairs[CID] #for each hairpin, --> find overlapping transcripts in same gene overlappingGenes = ensGenes.geneOverlaps([hPin]) if len(overlappingGenes) > 0: gIDs = [gene.id for gene in overlappingGenes] allTccs = ensGenes.getTccsFromGIDs(gIDs) if compare.checkIfOverlaps: print ' Overlaps...collapsing' allTccs = compare.collapseOverlaps(allTccs) else: print 'NO GENE OVERLAPS!!!!!', CID, hPin #filter out my predictions. print ' Filtering out predictions' checkList = compare.subtractTwoTccLists(allTccs, predList) #Get Expression level for gene. 
print ' Retrieving Expression levels:', cg.getTccListTotalLength(checkList) levels = [] hPinLevels = stepVectorScan.scanVectorsHist(checkList, cName) for hPin in hPinLevels: levels.extend(hPinLevels[hPin]) cidLevels[CID] = levels #output levels to file print 'Outputting to file' #find longest longest = 0 for CID in cidLevels: length = len(cidLevels[CID]) if length > longest: longest = length sortedKeys = cidLevels.keys() sortedKeys.sort() #print sortedKeys newLines = [] for j in range(0, longest): #how many lines are there newLine = [] for CID in sortedKeys: if len(cidLevels[CID]) > j:# add it newLine.append(str(cidLevels[CID][j])) else: newLine.append('NA') newLines.append('\t'.join(newLine) + '\n') outFileN = conf.conf['exonNoiseData'] outFile = open(outFileN, 'w') outFile.write('\t'.join(sortedKeys) + '\n') outFile.writelines(newLines) outFile.close()
def findPeaks(pType, cName = None):
    '''Find the best expression "roof" (flat-topped peak) for every predicted
    cluster and append a summary of it to slot 13 of the sorted results file.

    pType: 'E' selects the sorted exon results file, anything else selects
    the sorted intron results file.

    NOTE: an identical second definition of findPeaks appears later in this
    file; at import time that later definition is the one in effect.
    '''
    #init
    mConf = c.cgConfig('Main.conf')
    conf = c.getConfig(cName)
    if pType == 'E':
        predName = conf.conf['resultsExonsSorted']
    else:
        predName = conf.conf['resultsIntronsSorted']
    print predName

    #make CID:hairpin:peak dictionary
    cHairs = getHairpins.getHairpins(predName)
    peakDict = {}
    for CID in cHairs:
        # [hairpin tcc, best combo or the string 'None']
        peakDict[CID] = [cHairs[CID], 'None']

    timer = cg.cgTimer()
    timer.start()

    #put peaks in memory
    print 'Creating peak data'
    peaks = {} # chr:peak:value
    for CID in cHairs:
        chrom, strand, start, end = cg.tccSplit(cHairs[CID])
        tcc = cHairs[CID]
        #init dictionary
        if chrom not in peaks:
            peaks[chrom] = {}
        if strand not in peaks[chrom]:
            peaks[chrom][strand] = {}
        #create peaks for tcc and add to peak dictionary
        stretch = cgPeaks.stretch(tcc, cName)
        stretch.createPeaks()
        for peakCoord in stretch.peaks:
            peaks[chrom][strand][peakCoord] = 0
    print timer.split()

    print 'finding best combos'
    bestCombos = []
    aPass = 0
    bPass = 0
    cPass = 0
    numT = 0
    for CID in peakDict:
        cgFlag = False  # debug flag for one specific cluster
        if CID == '538':
            cgFlag = True
        tcc = peakDict[CID][0]
        #print tcc
        tccPeaks = []
        chrom = cg.ss(tcc, ':')[0]
        strand = cg.ss(tcc, ':')[1]
        start = int(cg.ss(tcc, ':')[2])
        end = int(cg.ss(tcc, ':')[3])

        #get all peaks that fall inside this cluster's span
        for i in range(start, end + 1):
            if i in peaks[chrom][strand]:
                #print ' peak added', i
                tccPeaks.append(i)

        #Calculate parameters...
        pairStrings = [] #used to check if pair already added
        peakCombos = []
        for x in tccPeaks:
            #scan a 30 bp range around this point and find the best roof...
            pRange = 30
            rTcc = cg.makeTcc(chrom, strand, x, x + 1)

            #quickly get max value...kinda a long way to do it but whatever
            cProfile = stepVectorScan.profileAroundPoint(rTcc, 1, cName, ratio = False)
            xval = cProfile[0]
            # NOTE(review): `max` shadows the builtin within this loop body.
            max = xval
            highestValueCoord = x

            #now make profile for roof...
            cProfile = stepVectorScan.profileAroundPoint(rTcc, pRange, cName, ratio = True)

            #now get highest stretch length and the rNext coord.
            minVal = .80  # profile ratio threshold for being "on the roof"
            highest = 0
            stretch = 0
            startCurrent = None
            startFinal = None
            endFinal = None
            for i in range(1 - pRange, pRange):
                if cProfile[i] > minVal:
                    stretch += 1
                    if startCurrent == None:
                        startCurrent = i
                else:
                    if stretch > 0:
                        if stretch > highest:
                            #stretch ended and was higher than previous
                            highest = stretch
                            endFinal = i - 1
                            startFinal = startCurrent
                            startCurrent = None
                        else:
                            startCurrent = None
                    stretch = 0

            #get +/- 4 value... (profile just outside the roof's shoulders)
            val = [1.0, 1.0]
            if (startFinal) and (endFinal):
                low = startFinal - 4
                high = endFinal + 4
                if low > (1 - pRange):
                    if high < pRange:
                        val[0] = float(cProfile[startFinal - 4])
                        val[1] = float(cProfile[endFinal + 4])

            #fill in other details...
            y = 'S'
            dist = 'S'
            ratio = 'S'
            peakCombos.append([tcc, x, y, dist, ratio, max, highest, val])
            #print ' ', peakCombos[-1]

        #find best combo: roof length 15-25 with sharp drops on both sides,
        #preferring the roof length nearest 22.
        topCombo = None
        for combo in peakCombos:
            roofLength = combo[6]
            dropValue = combo[7][0]
            if combo[7][1] > dropValue:
                dropValue = combo[7][1]
            #print roofLength, dropValue
            if 14 < roofLength < 26:
                if 0.0 < dropValue < 0.2:
                    #pick one with rooflength nearest 20:
                    if topCombo:
                        if (math.fabs(22 - roofLength)) < (math.fabs(22 - topCombo[6])):
                            topCombo = combo
                    else:
                        topCombo = combo

        if topCombo:
            peakDict[CID][1] = topCombo
            bestCombos.append(topCombo)
            print bestCombos[-1]
        else:
            #print 'None'
            pass
    print timer.split()

    #now update predFile (SLOT 13)
    predFile = open(predName, 'r')
    newLines = []
    for line in predFile:
        CID = cg.ss(line)[7]
        if peakDict[CID][1] == 'None':
            peakInfo = 'None'
        else:
            peakInfo = '%s:%s:%s:%s:%s:%s' % (str(peakDict[CID][1][1])[-3:], 'S', str(peakDict[CID][1][4]).split('.')[0], peakDict[CID][1][5], peakDict[CID][1][6], peakDict[CID][1][7])
        newLines.append(cg.appendToLine(line, peakInfo, 13))
    predFile.close()

    predFile = open(predName, 'w')
    predFile.writelines(newLines)
    predFile.close()
def updateReadDensity(tType, cName):
    '''Rewrite the results file in place, adding the highest read density per
    mature sequence (slot 11) and the highest density per cluster (slot 12),
    then re-sort the lines by numeric cluster ID.

    tType: 'E' for exon results, 'I' for intron results; anything else only
    prints a failure message (pFileName would then be unbound — the function
    would raise NameError shortly after).
    '''
    #go through wig each chromosome and check the mature seqs
    mainConf = cgConfig.cgConfig('Main.conf')
    conf = cgConfig.getConfig(cName)
    organism = conf.conf['organism']
    wigFolder = mainConf.conf['wig%s' % organism]
    newLines = []

    #Differentiate between exon or intron...
    if tType == 'E':
        pFileName = conf.conf['resultsExons']
    elif tType == 'I':
        pFileName = conf.conf['resultsIntrons']
    else:
        print 'READ UPDATE FAIL'
    print ' Updating Read Density:', tType

    #get read density for each line...
    print ' calculating hits for mature seqs'
    #calculate total hits per mature: the max profile value across the
    #mature tcc's span (slot 11)
    mirFile = open(pFileName, 'r')
    for line in mirFile:
        mTcc = line.strip().split('\t')[1]
        mirID = line.strip().split('\t')[0]
        tccStretch = cgPeaks.stretch(mTcc, cName)
        highestHit = 0
        for i in range(int(mTcc.split(':')[2]), int(mTcc.split(':')[3])):
            if i in tccStretch.profile:
                if tccStretch.profile[i] > highestHit:
                    highestHit = tccStretch.profile[i]
        newLines.append(cg.appendToLine(line, str(highestHit), 11))
    mirFile.close()

    print 'Writing New File'
    #write new results file (in place)
    outFile = open(pFileName, 'w')
    for line in newLines:
        outFile.write(line)
    outFile.close()

    ####NOW UPDATE HIGHEST HIT PER CLUSTER####
    #max of slot 11 across each cluster (keyed by CID in slot 7)
    clusterCount = {}
    pFile = open(pFileName, 'r')
    for line in pFile:
        predictionCount = int(line.strip().split('\t')[11])
        CID = line.strip().split('\t')[7]
        if CID in clusterCount:
            if clusterCount[CID] < predictionCount:
                clusterCount[CID] = predictionCount
        else:
            clusterCount[CID] = predictionCount
    pFile.close()

    #update the file --> cluster small count (slot 12)
    newLines = []
    predFile = open(pFileName, 'r')
    for line in predFile:
        CID = line.strip().split('\t')[7]
        numMax = clusterCount[CID]
        newLines.append(cg.appendToLine(line, str(numMax), 12))
    predFile.close()

    #sort newLines by clusterID (numeric)
    sortDict = {}
    CIDs = []
    for line in newLines:
        CID = int(line.strip().split('\t')[7])
        if CID not in CIDs:
            CIDs.append(CID)
        if CID in sortDict:
            sortDict[CID].append(line)
        else:
            sortDict[CID] = [line]
    CIDs.sort()

    newLines = []
    for CID in CIDs:
        for line in sortDict[CID]:
            newLines.append(line)

    #write new File
    newFile = open(pFileName, 'w')
    for line in newLines:
        newFile.write(line)
    newFile.close()
def updateNoise(pType, cName=None):
    '''Fit a noise model to each cluster's flanking-expression distribution
    and append lambda (slot 14) and p-value (slot 15) to the results file.

    pType: 'E' uses exon results + exonNoiseData, anything else uses intron
    results + intronNoiseData.  The noise-data file is the column-per-CID
    table written by exonNoisy/intronNoisy ('NA' entries are skipped).
    '''
    #init
    mainConf = c.cgConfig('Main.conf')
    conf = c.getConfig(cName)
    if pType == 'E':
        predName = conf.conf['resultsExons']
    else:
        predName = conf.conf['resultsIntrons']

    #populate cid: exon dist
    print 'Populating CID/INtron/exon distribution data'
    if pType == 'E':
        noiseFN = conf.conf['exonNoiseData']
        f = open(noiseFN, 'r')
    else:
        noiseFN = conf.conf['intronNoiseData']
        f = open(noiseFN, 'r')

    exonDists = {} #cid: [exon dist]
    header = f.readline()
    order = {} # num:CID  (column index -> CID, from the header row)
    for i, CID in enumerate(header.strip().split('\t')):
        order[i] = CID
        exonDists[CID] = []
    for line in f:
        data = line.strip().split('\t')
        for i, dataPoint in enumerate(data):
            if dataPoint == 'NA' or dataPoint == '':
                continue
            else:
                dataPoint = float(dataPoint)
                CID = order[i]
                exonDists[CID].append(dataPoint)

    #get highest expression level for each cluster (slot 12)
    print 'Populating highest expression levels'
    predExpression = {} # CID; highest level
    exonFile = open(predName, 'r')
    for line in exonFile:
        CID = line.strip().split('\t')[7]
        hDensity = line.strip().split('\t')[12]
        predExpression[CID] = hDensity

    #get pVals for each CID
    print 'Getting pvals for each cluster'
    pVals = {} # CID; [lam,pVal]
    for CID in exonDists:
        if not len(exonDists[CID]) > 0: #no data in 2kb range.
            lam = 'NA'
            pVal = 'NA'
        else:
            lam = cgStats.getLam(exonDists[CID])
            pVal = cgStats.getPValExp(predExpression[CID], lam)
        pVals[CID] = [ lam, pVal ] #lam gives a good approximation of noise levels in region...

    print 'Updating the file'
    #update file...
    predFile = open(predName, 'r')
    newLines = []
    for line in predFile:
        CID = line.split('\t')[7]
        newLine = cg.appendToLine(line, pVals[CID][0], 14)
        newLine = cg.appendToLine(newLine, pVals[CID][1], 15)
        newLines.append(newLine)
    predFile.close()

    predFile = open(predName, 'w')
    predFile.writelines(newLines)
    predFile.close()
def sortResults(cName=None):
    '''Filter predicted clusters by minimum density and novelty, sort the
    survivors by density (descending), and write them to '<results>.sorted'.
    Also writes cluster counts to 'statFile.data' in the working directory.

    Keeps a line only if its cluster's highest density (field 5) is at least
    minDensity AND the line does not overlap anything known (field 8 == '0').
    '''
    #INIT
    conf = cgConfig.getConfig(cName)
    pFileName = conf.conf['results']
    minDensity = 4

    fileLines = [] #This will hold the lists to be sorted...
    pFile = open(pFileName, 'r')
    for line in pFile:
        fileLines.append(line.strip().split('\t'))
    pFile.close()

    #highest prediction density per cluster --> used to sort out clusters
    #without proper density
    densityDict = {} #CID: highest density
    for line in fileLines:
        CID = line[7]
        pDensity = int(line[5])
        if CID not in densityDict or pDensity > densityDict[CID]:
            densityDict[CID] = pDensity

    #take out clusters that didn't make the cut
    #(small-cluster-hit count in field 10 is no longer used as a metric)
    CIDpassed = []
    keptLines = []
    for line in fileLines:
        CID = line[7]
        if densityDict[CID] >= minDensity: #Density
            if line[8] == '0': #doesn't overlap with anything known (the cluster doesn't...that is)
                keptLines.append(line)
                if CID not in CIDpassed:
                    CIDpassed.append(CID)

    #sort by cluster density (field 5); convert to int for a numeric sort,
    #then back to str for output
    sID = 5
    for line in keptLines:
        line[sID] = int(line[sID])
    sortedData = sorted(keptLines, key=itemgetter(sID), reverse=True)
    for line in sortedData:
        line[sID] = str(line[sID])

    #output
    #BUGFIX: statFile was never closed (buffered writes could be lost) and
    #sortedFile was closed manually; both now use context managers.
    with open(conf.conf['results'] + '.sorted', 'w') as sortedFile:
        for line in sortedData:
            sortedFile.write('\t'.join(line) + '\n')

    #Now output stats
    with open('statFile.data', 'w') as statFile:
        statFile.write('Total Clusters: %s\n' % len(densityDict))
        statFile.write('Passed: %s\n' % len(CIDpassed))
def findPeaks(pType, cName=None):
    '''Find the best expression "roof" (flat-topped peak) for every predicted
    cluster and append a summary of it to slot 13 of the sorted results file.

    pType: 'E' selects the sorted exon results file, anything else selects
    the sorted intron results file.

    NOTE: this is a byte-for-byte duplicate of the findPeaks defined earlier
    in this file; being the later definition, this one is in effect.
    '''
    #init
    mConf = c.cgConfig('Main.conf')
    conf = c.getConfig(cName)
    if pType == 'E':
        predName = conf.conf['resultsExonsSorted']
    else:
        predName = conf.conf['resultsIntronsSorted']
    print predName

    #make CID:hairpin:peak dictionary
    cHairs = getHairpins.getHairpins(predName)
    peakDict = {}
    for CID in cHairs:
        # [hairpin tcc, best combo or the string 'None']
        peakDict[CID] = [cHairs[CID], 'None']

    timer = cg.cgTimer()
    timer.start()

    #put peaks in memory
    print 'Creating peak data'
    peaks = {} # chr:peak:value
    for CID in cHairs:
        chrom, strand, start, end = cg.tccSplit(cHairs[CID])
        tcc = cHairs[CID]
        #init dictionary
        if chrom not in peaks:
            peaks[chrom] = {}
        if strand not in peaks[chrom]:
            peaks[chrom][strand] = {}
        #create peaks for tcc and add to peak dictionary
        stretch = cgPeaks.stretch(tcc, cName)
        stretch.createPeaks()
        for peakCoord in stretch.peaks:
            peaks[chrom][strand][peakCoord] = 0
    print timer.split()

    print 'finding best combos'
    bestCombos = []
    aPass = 0
    bPass = 0
    cPass = 0
    numT = 0
    for CID in peakDict:
        cgFlag = False  # debug flag for one specific cluster
        if CID == '538':
            cgFlag = True
        tcc = peakDict[CID][0]
        #print tcc
        tccPeaks = []
        chrom = cg.ss(tcc, ':')[0]
        strand = cg.ss(tcc, ':')[1]
        start = int(cg.ss(tcc, ':')[2])
        end = int(cg.ss(tcc, ':')[3])

        #get all peaks that fall inside this cluster's span
        for i in range(start, end + 1):
            if i in peaks[chrom][strand]:
                #print ' peak added', i
                tccPeaks.append(i)

        #Calculate parameters...
        pairStrings = [] #used to check if pair already added
        peakCombos = []
        for x in tccPeaks:
            #scan a 30 bp range around this point and find the best roof...
            pRange = 30
            rTcc = cg.makeTcc(chrom, strand, x, x + 1)

            #quickly get max value...kinda a long way to do it but whatever
            cProfile = stepVectorScan.profileAroundPoint(rTcc, 1, cName, ratio=False)
            xval = cProfile[0]
            # NOTE(review): `max` shadows the builtin within this loop body.
            max = xval
            highestValueCoord = x

            #now make profile for roof...
            cProfile = stepVectorScan.profileAroundPoint(rTcc, pRange, cName, ratio=True)

            #now get highest stretch length and the rNext coord.
            minVal = .80  # profile ratio threshold for being "on the roof"
            highest = 0
            stretch = 0
            startCurrent = None
            startFinal = None
            endFinal = None
            for i in range(1 - pRange, pRange):
                if cProfile[i] > minVal:
                    stretch += 1
                    if startCurrent == None:
                        startCurrent = i
                else:
                    if stretch > 0:
                        if stretch > highest:
                            #stretch ended and was higher than previous
                            highest = stretch
                            endFinal = i - 1
                            startFinal = startCurrent
                            startCurrent = None
                        else:
                            startCurrent = None
                    stretch = 0

            #get +/- 4 value... (profile just outside the roof's shoulders)
            val = [1.0, 1.0]
            if (startFinal) and (endFinal):
                low = startFinal - 4
                high = endFinal + 4
                if low > (1 - pRange):
                    if high < pRange:
                        val[0] = float(cProfile[startFinal - 4])
                        val[1] = float(cProfile[endFinal + 4])

            #fill in other details...
            y = 'S'
            dist = 'S'
            ratio = 'S'
            peakCombos.append([tcc, x, y, dist, ratio, max, highest, val])
            #print ' ', peakCombos[-1]

        #find best combo: roof length 15-25 with sharp drops on both sides,
        #preferring the roof length nearest 22.
        topCombo = None
        for combo in peakCombos:
            roofLength = combo[6]
            dropValue = combo[7][0]
            if combo[7][1] > dropValue:
                dropValue = combo[7][1]
            #print roofLength, dropValue
            if 14 < roofLength < 26:
                if 0.0 < dropValue < 0.2:
                    #pick one with rooflength nearest 20:
                    if topCombo:
                        if (math.fabs(22 - roofLength)) < (
                                math.fabs(22 - topCombo[6])):
                            topCombo = combo
                    else:
                        topCombo = combo

        if topCombo:
            peakDict[CID][1] = topCombo
            bestCombos.append(topCombo)
            print bestCombos[-1]
        else:
            #print 'None'
            pass
    print timer.split()

    #now update predFile (SLOT 13)
    predFile = open(predName, 'r')
    newLines = []
    for line in predFile:
        CID = cg.ss(line)[7]
        if peakDict[CID][1] == 'None':
            peakInfo = 'None'
        else:
            peakInfo = '%s:%s:%s:%s:%s:%s' % (
                str(peakDict[CID][1][1])[-3:], 'S', str(
                    peakDict[CID][1][4]).split('.')[0], peakDict[CID][1][5],
                peakDict[CID][1][6], peakDict[CID][1][7])
        newLines.append(cg.appendToLine(line, peakInfo, 13))
    predFile.close()

    predFile = open(predName, 'w')
    predFile.writelines(newLines)
    predFile.close()
def scanVectorsSingleCoord(tccList, cName):
    '''Given a list of tcc range strings ("chrom:strand:start:end"), scan the
    per-chromosome wig files and return {genomic coordinate: expression value}
    for every position covered by the ranges.

    Uses the ".index" sidecar file to seek straight to the wig block that
    contains the range start instead of reading the wig file from the top.

    Raises ValueError if no index entry covers a requested range start.

    NOTE: this duplicates the scanVectorsSingleCoord defined earlier in the
    file; being the later definition, this one is in effect at import time.
    '''
    conf = c.getConfig(cName)
    org = conf.conf['organism']
    mConf = c.getConfig('Main.conf')
    wigDir = mConf.conf['wig%s' % org]

    coordDict = {} # coordinate: value
    for tcc in tccList:
        theSplit = ss(tcc, ':')
        chrom, strand = theSplit[0], theSplit[1]
        tccStart, tccEnd = int(theSplit[2]), int(theSplit[3])

        #goto correct file, correct line in index
        fN = wigDir + '/Merge.%s.%s.wig.%s.wig' % (org.lower(), strand, chrom)
        fNindex = wigDir + '/Merge.%s.%s.wig.%s.wig.index' % (org.lower(), strand, chrom)

        #BUGFIX: the original used the *string* 'None' as the not-found
        #sentinel, which produced an opaque TypeError from f.seek() when the
        #index lookup missed; use None and raise a clear error instead.
        #Files are now closed even if parsing raises.
        startByte = None
        iFile = open(fNindex, 'r')
        try:
            for line in iFile:
                beg = int(cg.ss(line)[1])
                end = int(cg.ss(line)[2])
                if beg <= tccStart < end:
                    startByte = int(cg.ss(line)[0])
                    break
        finally:
            iFile.close()
        if startByte is None:
            raise ValueError('no index entry covers %s in %s' % (tcc, fNindex))

        #grab values; stop once the wig block runs past tccEnd
        f = open(fN, 'r')
        try:
            f.seek(startByte, 0)
            stop = False
            for line in f:
                lBeg = int(cg.ss(line)[1])
                lEnd = int(cg.ss(line)[2])
                lValue = int(cg.ss(line)[3].split('.')[0])
                #clip the wig block to the requested range
                if tccStart > lBeg:
                    lBeg = tccStart
                if tccEnd < lEnd:
                    lEnd = tccEnd
                    stop = True
                for i in range(lBeg, lEnd):
                    coordDict[i] = lValue
                if stop:
                    break
        finally:
            f.close()
    return coordDict
def scanVectorsHist(tccList, cName):
    '''Given a list of tcc range strings ("chrom:strand:start:end"), scan the
    per-chromosome wig files and return {tcc: [per-base expression values]}
    (one value appended per covered base — histogram input).

    Uses the ".index" sidecar file to seek straight to the wig block that
    contains the range start instead of reading the wig file from the top.

    Raises ValueError if no index entry covers a requested range start.
    '''
    conf = c.getConfig(cName)
    org = conf.conf['organism']
    mConf = c.getConfig('Main.conf')
    wigDir = mConf.conf['wig%s' % org]

    histDict = {} # tcc: [list values]
    for tcc in tccList:
        theSplit = ss(tcc, ':')
        chrom, strand = theSplit[0], theSplit[1]
        tccStart, tccEnd = int(theSplit[2]), int(theSplit[3])

        #goto correct file, correct line in index
        fN = wigDir + '/Merge.%s.%s.wig.%s.wig' % (org.lower(), strand, chrom)
        fNindex = wigDir + '/Merge.%s.%s.wig.%s.wig.index' % (org.lower(), strand, chrom)

        #BUGFIX: the original used the *string* 'None' as the not-found
        #sentinel, which produced an opaque TypeError from f.seek() when the
        #index lookup missed; use None and raise a clear error instead.
        #Files are now closed even if parsing raises.
        startByte = None
        iFile = open(fNindex, 'r')
        try:
            for line in iFile:
                beg = int(cg.ss(line)[1])
                end = int(cg.ss(line)[2])
                if beg <= tccStart < end:
                    startByte = int(cg.ss(line)[0])
                    break
        finally:
            iFile.close()
        if startByte is None:
            raise ValueError('no index entry covers %s in %s' % (tcc, fNindex))

        #grab values; stop once the wig block runs past tccEnd
        f = open(fN, 'r')
        try:
            f.seek(startByte, 0)
            stop = False
            for line in f:
                lBeg = int(cg.ss(line)[1])
                lEnd = int(cg.ss(line)[2])
                lValue = int(cg.ss(line)[3].split('.')[0])
                #clip the wig block to the requested range
                if tccStart > lBeg:
                    lBeg = tccStart
                if tccEnd < lEnd:
                    lEnd = tccEnd
                    stop = True
                for i in range(lBeg, lEnd):
                    #one entry per covered base (setdefault replaces the
                    #original try/except KeyError first-append dance)
                    histDict.setdefault(tcc, []).append(lValue)
                if stop:
                    break
        finally:
            f.close()
    return histDict
def parallelMakePeaks(tcc, cName, minExpression):
    '''Scan one genomic range for short expression "roofs" (narrow peaks with
    sharp drops and a quiet neighborhood) and write each keeper's tcc to
    out/peakData.<tcc>.<minExpression>.<assembly>, one per line.

    Designed to be run per-range from the qsub wrapper (q.sh); very verbose
    debug printing is intentional for log inspection.
    '''
    conf = c.getConfig(cName)
    f = open(
        'out/peakData.%s.%s.%s' % (tcc, minExpression, conf.conf['assembly']),
        'w')
    print 'scanning range', tcc
    chrom, strand, start, end = cg.tccSplit(tcc)
    peaks = cgPeaks.stretch(tcc, cName)
    #print 'getting peaks'
    peaks.createPeaks(span=1, minVal=int(minExpression))
    print 'len peaks', len(peaks.peaks)
    endCheck = 0
    for x in peaks.peaks:
        print x, endCheck
        #(disabled) skip peaks already covered by the previous roof
        '''
        if x < endCheck:
            print 'endChecked'
            continue
        '''
        #scan a 30 bp range around this point and find the best roof...
        pRange = 40
        rTcc = cg.makeTcc(chrom, strand, x, x + 1)

        #now make profile for roof...
        cProfile = stepVectorScan.profileAroundPoint(rTcc, pRange, cName,
                                                     ratio=True)

        #now get highest stretch length and the rNext coord.
        minVal = .70  # profile ratio threshold for being "on the roof"
        highest = 0
        stretch = 0
        startCurrent = None
        startFinal = None
        endFinal = None
        for i in range(1 - pRange, pRange):
            print ' ', x + i, cProfile[i]
            if cProfile[i] > minVal:
                print ' extending stretch'
                stretch += 1
                if startCurrent == None:
                    startCurrent = i
            else:
                if stretch > 0:
                    print 'end of stretch'
                    if stretch > highest:
                        #stretch ended and was higher than previous
                        highest = stretch
                        endFinal = i - 1
                        startFinal = startCurrent
                        startCurrent = None
                    else:
                        startCurrent = None
                stretch = 0

        #get +/- extend value... (profile just outside the roof's shoulders)
        val = [1.0, 1.0]
        extend = 1
        if (startFinal) and (endFinal):
            low = startFinal - extend
            high = endFinal + extend
            if low > (1 - pRange) and high < pRange:
                val[0] = float(cProfile[startFinal - extend])
                val[1] = float(cProfile[endFinal + extend])
            else:
                print 'out of range'
                continue
        else:
            print 'no start and end of peak'
            continue
        print low, high, x, endFinal
        endCheck = x + endFinal

        #avg expression around peak check...
        #get total expression before peak (both flanks outside the roof)
        noiseExpression = 0
        lowRange = range(1 - pRange, low)
        highRange = range(high + 1, pRange)
        totalLength = len(lowRange) + len(highRange)
        for i in lowRange:
            noiseExpression += cProfile[i]
        for i in highRange:
            noiseExpression += cProfile[i]
        avgNoise = noiseExpression / float(totalLength)

        #filter out peaks that look a certain way.
        print highest, val[0], val[1], avgNoise
        if 0 < highest < 5: #rooflength 14/26
            if val[0] < 0.20 and val[1] < .20: #drop values
                if avgNoise < .3:
                    goodTcc = cg.makeTcc(chrom, strand, x + low, x + high)
                    print '*KEEPER'
                    f.write('%s\n' % goodTcc)
    f.close()
    print 'DONE', tcc
def makePeakInput(cName, minExpression = 2000):
    '''Scan every acceptable chromosome (both strands, in 1000-point windows)
    for expression "roofs" and write each keeper's tcc to
    peakData.<minExpression>, one per line.

    NOTE: an essentially identical second definition of makePeakInput appears
    immediately after this one; that later definition is the one in effect.
    '''
    mConf = c.getConfig('Main.conf')
    conf = c.getConfig(cName)
    assembly = conf.conf['assembly']
    tccList = []
    chromLens = cg.returnChromLengthDict(assembly)
    f = open('peakData.%s' % minExpression, 'w')
    for chrom in chromLens:
        if chrom not in cg.acceptableChroms:
            continue
        for strand in ['1', '-1']:
            print 'Getting Peaks for ', chrom, strand
            prevI = 0
            endCheck = 0
            #walk the chromosome in consecutive [prevI, i) windows
            for i in rangePoints(1, chromLens[chrom], 1000):
                if i == 1:
                    prevI = i
                    continue
                start = prevI
                end = i
                prevI = i
                tcc = cg.makeTcc(chrom, strand, start, end)
                #print 'scanning range', tcc
                peaks = cgPeaks.stretch(tcc, cName)
                peaks.createPeaks(span = 3, minVal = minExpression)
                for x in peaks.peaks:
                    #skip peaks already covered by the previous roof
                    if x < endCheck:
                        continue
                    #scan a 30 bp range around this point and find the best roof...
                    pRange = 30
                    rTcc = cg.makeTcc(chrom, strand, x, x + 1)
                    #now make profile for roof...
                    cProfile = stepVectorScan.profileAroundPoint(rTcc, pRange, cName, ratio = True)
                    #now get highest stretch length and the rNext coord.
                    minVal = .80  # profile ratio threshold for being "on the roof"
                    highest = 0
                    stretch = 0
                    startCurrent = None
                    startFinal = None
                    endFinal = None
                    for i in range(1 - pRange, pRange):
                        if cProfile[i] > minVal:
                            stretch += 1
                            if startCurrent == None:
                                startCurrent = i
                        else:
                            if stretch > 0:
                                if stretch > highest:
                                    #stretch ended and was higher than previous
                                    highest = stretch
                                    endFinal = i - 1
                                    startFinal = startCurrent
                                    startCurrent = None
                                else:
                                    startCurrent = None
                            stretch = 0
                    #get +/- 4 value... (profile just outside the roof)
                    val = [1.0, 1.0]
                    if (startFinal) and (endFinal):
                        low = startFinal - 4
                        high = endFinal + 4
                        if low > (1 - pRange) and high < pRange:
                            val[0] = float(cProfile[startFinal - 4])
                            val[1] = float(cProfile[endFinal + 4])
                        else:
                            continue
                    else:
                        continue
                    endCheck = x + high
                    #filter out peaks that look a certain way.
                    if 14 < highest < 26: #rooflength
                        if val[0] < 0.2 and val[1] < .2: #drop values
                            goodTcc = cg.makeTcc(chrom, strand, x + low, x + high)
                            #print goodTcc
                            f.write('%s\n' % goodTcc)
    f.close()
def makePeakInput(cName, minExpression=2000):
    '''Scan every acceptable chromosome (both strands, in 1000-point windows)
    for expression "roofs" and write each keeper's tcc to
    peakData.<minExpression>, one per line.

    A roof is kept when its high-ratio stretch (profile > .80) is 15-25 bases
    long and the profile 4 bases outside each shoulder has dropped below 0.2.

    NOTE: duplicate of the makePeakInput defined just above; being the later
    definition, this one is in effect at import time.
    '''
    mConf = c.getConfig('Main.conf')
    conf = c.getConfig(cName)
    assembly = conf.conf['assembly']
    tccList = []
    chromLens = cg.returnChromLengthDict(assembly)
    f = open('peakData.%s' % minExpression, 'w')
    for chrom in chromLens:
        if chrom not in cg.acceptableChroms:
            continue
        for strand in ['1', '-1']:
            print 'Getting Peaks for ', chrom, strand
            prevI = 0
            endCheck = 0
            #walk the chromosome in consecutive [prevI, i) windows
            for i in rangePoints(1, chromLens[chrom], 1000):
                if i == 1:
                    prevI = i
                    continue
                start = prevI
                end = i
                prevI = i
                tcc = cg.makeTcc(chrom, strand, start, end)
                #print 'scanning range', tcc
                peaks = cgPeaks.stretch(tcc, cName)
                peaks.createPeaks(span=3, minVal=minExpression)
                for x in peaks.peaks:
                    #skip peaks already covered by the previous roof
                    if x < endCheck:
                        continue
                    #scan a 30 bp range around this point and find the best roof...
                    pRange = 30
                    rTcc = cg.makeTcc(chrom, strand, x, x + 1)
                    #now make profile for roof...
                    cProfile = stepVectorScan.profileAroundPoint(rTcc, pRange, cName, ratio=True)
                    #now get highest stretch length and the rNext coord.
                    minVal = .80  # profile ratio threshold for being "on the roof"
                    highest = 0
                    stretch = 0
                    startCurrent = None
                    startFinal = None
                    endFinal = None
                    for i in range(1 - pRange, pRange):
                        if cProfile[i] > minVal:
                            stretch += 1
                            if startCurrent == None:
                                startCurrent = i
                        else:
                            if stretch > 0:
                                if stretch > highest:
                                    #stretch ended and was higher than previous
                                    highest = stretch
                                    endFinal = i - 1
                                    startFinal = startCurrent
                                    startCurrent = None
                                else:
                                    startCurrent = None
                            stretch = 0
                    #get +/- 4 value... (profile just outside the roof)
                    val = [1.0, 1.0]
                    if (startFinal) and (endFinal):
                        low = startFinal - 4
                        high = endFinal + 4
                        if low > (1 - pRange) and high < pRange:
                            val[0] = float(cProfile[startFinal - 4])
                            val[1] = float(cProfile[endFinal + 4])
                        else:
                            continue
                    else:
                        continue
                    endCheck = x + high
                    #filter out peaks that look a certain way.
                    if 14 < highest < 26: #rooflength
                        if val[0] < 0.2 and val[1] < .2: #drop values
                            goodTcc = cg.makeTcc(chrom, strand, x + low, x + high)
                            #print goodTcc
                            f.write('%s\n' % goodTcc)
    f.close()
def scanVectorsHist(tccList, cName):
    '''Given a list of tcc range strings ("chrom:strand:start:end"), scan the
    per-chromosome wig files and return {tcc: [per-base expression values]}
    (one value appended per covered base — histogram input).

    Uses the ".index" sidecar file to seek straight to the wig block that
    contains the range start instead of reading the wig file from the top.

    Raises ValueError if no index entry covers a requested range start.

    NOTE: this duplicates the scanVectorsHist defined earlier in the file;
    being the later definition, this one is in effect at import time.
    '''
    conf = c.getConfig(cName)
    org = conf.conf['organism']
    mConf = c.getConfig('Main.conf')
    wigDir = mConf.conf['wig%s' % org]

    histDict = {} # tcc: [list values]
    for tcc in tccList:
        theSplit = ss(tcc, ':')
        chrom, strand = theSplit[0], theSplit[1]
        tccStart, tccEnd = int(theSplit[2]), int(theSplit[3])

        #goto correct file, correct line in index
        fN = wigDir + '/Merge.%s.%s.wig.%s.wig' % (org.lower(), strand, chrom)
        fNindex = wigDir + '/Merge.%s.%s.wig.%s.wig.index' % (org.lower(), strand, chrom)

        #BUGFIX: the original used the *string* 'None' as the not-found
        #sentinel, which produced an opaque TypeError from f.seek() when the
        #index lookup missed; use None and raise a clear error instead.
        #Files are now closed even if parsing raises.
        startByte = None
        iFile = open(fNindex, 'r')
        try:
            for line in iFile:
                beg = int(cg.ss(line)[1])
                end = int(cg.ss(line)[2])
                if beg <= tccStart < end:
                    startByte = int(cg.ss(line)[0])
                    break
        finally:
            iFile.close()
        if startByte is None:
            raise ValueError('no index entry covers %s in %s' % (tcc, fNindex))

        #grab values; stop once the wig block runs past tccEnd
        f = open(fN, 'r')
        try:
            f.seek(startByte, 0)
            stop = False
            for line in f:
                lBeg = int(cg.ss(line)[1])
                lEnd = int(cg.ss(line)[2])
                lValue = int(cg.ss(line)[3].split('.')[0])
                #clip the wig block to the requested range
                if tccStart > lBeg:
                    lBeg = tccStart
                if tccEnd < lEnd:
                    lEnd = tccEnd
                    stop = True
                for i in range(lBeg, lEnd):
                    #one entry per covered base (setdefault replaces the
                    #original try/except KeyError first-append dance)
                    histDict.setdefault(tcc, []).append(lValue)
                if stop:
                    break
        finally:
            f.close()
    return histDict
def defineClusters(cName=None):
    '''Group mature tcc hits into connected clusters, write them (sorted) to
    the 'sortedClusters' file, then write a sliding-window hit count per
    cluster to the 'hitsPerFrame' file.

    NOTE: an identical second definition of defineClusters appears later in
    this file; at import time that later definition is the one in effect.
    '''
    #Start Timer
    timer = cg.cgTimer()
    timer.start()

    #Get list of mature tccs
    conf = cgConfig.getConfig(cName) #passed or default
    finalMirFileName = conf.conf['resultsRaw']
    matureTccs = compare.tccFileToList(finalMirFileName, 1) # list of all mature micro in tcc
    print 'List getting', timer.split()

    #make connections dict
    matureConnections = compare.makeConnectionsDict(matureTccs)
    print 'Make connections:', timer.split()

    #Now have to define Clusters...
    clusters = []
    addedList = []

    #I don't think python passes by reference? also I think this function is in the middle because it uses a global variable :P
    #(closure over clusters/addedList/matureConnections; recursive
    # flood-fill over the connection graph.
    # NOTE(review): deep connection chains could hit Python's recursion
    # limit — confirm cluster sizes stay small.)
    def createClusters(item=None, mode=None):
        if item in addedList:
            return 0
        elif mode == "top":
            clusters.append([item])
            addedList.append(item) ##creates new cluster with the item already stored in it
            for connectedItem in matureConnections[item]:
                createClusters(connectedItem, "neighbor")
        elif mode == "neighbor":
            clusters[-1].append(item) #add this item to the last cluster created
            addedList.append(item)
            for connectedItem in matureConnections[item]:
                createClusters(connectedItem, "neighbor")

    for tcc in matureTccs:
        createClusters(tcc, "top")
    print 'Make Clusters', timer.split()

    #Sort Clusters.
    sortedClusters = []
    for cluster in clusters:
        sortedClusters.append(cg.sortTccList(cluster))
    print 'Sort Clusters:', timer.split()

    #Output sorted cluster file (comma-separated hits, one cluster per line)
    clusterFileName = conf.conf['sortedClusters']
    clusterFile = open(clusterFileName, 'w')
    for cluster in sortedClusters:
        for hit in cluster:
            clusterFile.write('%s,' % hit)
        clusterFile.write('\n')
    clusterFile.close()
    '''
    #re-create sortedClusters list:
    clusterFileName = 'sortedClusters.data'
    clusterFile = open(clusterFileName, 'r')
    sortedClusters = []
    for line in clusterFile:
        sortedClusters.append([])
        line = line.strip()[0:-1] #take off last comma ;P
        for hit in (line.strip().split(',')):
            sortedClusters[-1].append(hit)
    '''
    print 'Store intermediate data:', timer.split()

    #output hitsAround file
    outputFile = open(conf.conf['hitsPerFrame'], 'w')
    frameLength = 200
    frameShift = 1
    for cluster in sortedClusters:
        #grab first and last coordinate from cluster, for each cluster deduce how many theoretical microRNAs were in hitScope
        clusterChrom = cluster[0].split(":")[0]
        clusterStrand = cluster[0].split(":")[1]
        firstCoord = int(cluster[0].split(":")[2])
        #print cluster[-1]
        lastCoord = int(cluster[-1].split(":")[3])
        startCoord = firstCoord
        while startCoord < lastCoord:
            #count how many hits there are in this range
            rangeStart = startCoord - (frameLength / 2)
            rangeEnd = startCoord + (frameLength / 2)
            rangeTcc = '%s:%s:%s:%s' % (clusterChrom, clusterStrand, rangeStart, rangeEnd)
            overlappedList = compare.compareTwoTcc([rangeTcc], cluster, 2)
            hitCount = len(overlappedList)
            #output
            outputFile.write('%s\t%s\n' % (rangeTcc, hitCount))
            startCoord = startCoord + frameShift #check overlap with range
    outputFile.close()
    print 'Output Hits per Frame:', timer.split()
    print 'Overall Time:', timer.report()
def defineClusters(cName = None):
    '''Group mature tcc hits into connected clusters, write them (sorted) to
    the 'sortedClusters' file, then write a sliding-window hit count per
    cluster to the 'hitsPerFrame' file.

    NOTE: duplicate of the defineClusters defined just above; being the later
    definition, this one is in effect at import time.
    '''
    #Start Timer
    timer = cg.cgTimer()
    timer.start()

    #Get list of mature tccs
    conf = cgConfig.getConfig(cName) #passed or default
    finalMirFileName = conf.conf['resultsRaw']
    matureTccs = compare.tccFileToList(finalMirFileName, 1) # list of all mature micro in tcc
    print 'List getting', timer.split()

    #make connections dict
    matureConnections = compare.makeConnectionsDict(matureTccs)
    print 'Make connections:', timer.split()

    #Now have to define Clusters...
    clusters = []
    addedList = []

    #I don't think python passes by reference? also I think this function is in the middle because it uses a global variable :P
    #(closure over clusters/addedList/matureConnections; recursive
    # flood-fill over the connection graph.
    # NOTE(review): deep connection chains could hit Python's recursion
    # limit — confirm cluster sizes stay small.)
    def createClusters(item = None, mode = None):
        if item in addedList:
            return 0
        elif mode == "top":
            clusters.append([item])
            addedList.append(item) ##creates new cluster with the item already stored in it
            for connectedItem in matureConnections[item]:
                createClusters(connectedItem, "neighbor")
        elif mode == "neighbor":
            clusters[-1].append(item) #add this item to the last cluster created
            addedList.append(item)
            for connectedItem in matureConnections[item]:
                createClusters(connectedItem, "neighbor")

    for tcc in matureTccs:
        createClusters(tcc, "top")
    print 'Make Clusters', timer.split()

    #Sort Clusters.
    sortedClusters = []
    for cluster in clusters:
        sortedClusters.append(cg.sortTccList(cluster))
    print 'Sort Clusters:', timer.split()

    #Output sorted cluster file (comma-separated hits, one cluster per line)
    clusterFileName = conf.conf['sortedClusters']
    clusterFile = open(clusterFileName, 'w')
    for cluster in sortedClusters:
        for hit in cluster:
            clusterFile.write('%s,' % hit)
        clusterFile.write('\n')
    clusterFile.close()
    '''
    #re-create sortedClusters list:
    clusterFileName = 'sortedClusters.data'
    clusterFile = open(clusterFileName, 'r')
    sortedClusters = []
    for line in clusterFile:
        sortedClusters.append([])
        line = line.strip()[0:-1] #take off last comma ;P
        for hit in (line.strip().split(',')):
            sortedClusters[-1].append(hit)
    '''
    print 'Store intermediate data:', timer.split()

    #output hitsAround file
    outputFile = open(conf.conf['hitsPerFrame'], 'w')
    frameLength = 200
    frameShift = 1
    for cluster in sortedClusters:
        #grab first and last coordinate from cluster, for each cluster deduce how many theoretical microRNAs were in hitScope
        clusterChrom = cluster[0].split(":")[0]
        clusterStrand = cluster[0].split(":")[1]
        firstCoord = int(cluster[0].split(":")[2])
        #print cluster[-1]
        lastCoord = int(cluster[-1].split(":")[3])
        startCoord = firstCoord
        while startCoord < lastCoord:
            #count how many hits there are in this range
            rangeStart = startCoord - (frameLength/2)
            rangeEnd = startCoord + (frameLength/2)
            rangeTcc = '%s:%s:%s:%s' % (clusterChrom, clusterStrand, rangeStart, rangeEnd)
            overlappedList = compare.compareTwoTcc([rangeTcc], cluster, 2)
            hitCount = len(overlappedList)
            #output
            outputFile.write('%s\t%s\n' % (rangeTcc, hitCount))
            startCoord = startCoord + frameShift #check overlap with range
    outputFile.close()
    print 'Output Hits per Frame:', timer.split()
    print 'Overall Time:', timer.report()