def transcriptOverlaps(self, tccs):
    '''Return the transcripts of this gene set whose tcc overlaps any of tccs.

    tccs -- a list of tccs to compare against every transcript tcc.

    Returns a list of transcript objects, ordered by gene and then by
    transcript.  If tccs is not a list, a warning is printed and the
    integer 1 is returned instead (kept so existing callers that check
    for it keep working).
    '''
    if not isinstance(tccs, list):
        print('transcript overlaps: NEED TCC LIST, not a single tcc!')
        return 1

    print('num of tccs being compared %s' % len(tccs))

    # gather all transcript tccs (make tcc --> ids).  Only the keys are
    # handed to the comparison; the dict also collapses duplicate tccs.
    tccDict = {}
    for gene in self.genes:
        for transcript in gene.transcripts:
            tccDict.setdefault(transcript.tcc, []).append(transcript.id)

    print('num of tcc transcript tccs %s' % len(tccDict))

    # A set makes the membership test below O(1) per transcript instead of
    # a linear scan of the comparison result for every transcript.
    overlapped = set(compare.compareTwoTcc(list(tccDict), tccs, 1))

    return [transcript
            for gene in self.genes
            for transcript in gene.transcripts
            if transcript.tcc in overlapped]
def transcriptSetOverlap(aDir, AS):
    '''Flag every OriginRNA under aDir with whether it overlaps a transcript.

    aDir -- passed to cgDB.dataController to load and commit the
            cgOriginRNA.OriginRNA objects.
    AS   -- if truthy, each object's tcc is passed through
            cg.convertToAS() before it is tested.

    Sets obj.transcriptOverlap to True or False on every loaded object and
    commits the objects back through the data controller.  Returns None.
    '''
    AS = bool(AS)

    geneSetFN = '/home/chrisgre/dataSources/known/Human/geneSets/ensemblAllTranscripts.tsv'
    allExons = cgGenes.createGeneSetFromFile(geneSetFN)

    # get degradome TCCS
    # note that you need to test the AS peaks, this is the location of the
    # targetted transcript
    oRNA_DC = cgDB.dataController(aDir, cgOriginRNA.OriginRNA)
    id_oRNA = oRNA_DC.load()

    # Work out the tcc to test once per object; the same value is needed for
    # the comparison and for the final per-object lookup.
    oRNAs = list(id_oRNA.values())
    if AS:
        degTccs = [cg.convertToAS(obj.tcc) for obj in oRNAs]
    else:
        degTccs = [obj.tcc for obj in oRNAs]

    # find all overlapping exons/transcripts, then all results sequences
    # that overlap exons
    overlappingExons = allExons.transcriptOverlaps(degTccs)
    overlappingExonTccs = [x.tcc for x in overlappingExons]
    # set: O(1) membership in the loop below
    overlappingDegTccs = set(compare.compareTwoTcc(degTccs, overlappingExonTccs, 1))

    # store the flag on every object
    for obj, degTcc in zip(oRNAs, degTccs):
        obj.transcriptOverlap = degTcc in overlappingDegTccs

    oRNA_DC.commit(id_oRNA)
def transcriptSetOverlap(aDir, AS):
    '''Flag every OriginRNA under aDir with whether it overlaps a transcript.

    aDir -- passed to cgNexusFlat.dataController to load and commit the
            cgOriginRNA.OriginRNA objects.
    AS   -- if truthy, each object's tcc is passed through
            cg.convertToAS() before it is tested.

    Sets obj.transcriptOverlap to True or False on every loaded object and
    commits the objects back through the data controller.  Returns None.
    '''
    AS = bool(AS)

    geneSetFN = '/home/chrisgre/dataSources/known/Human/geneSets/ensemblAllTranscripts.tsv'
    allExons = cgGenes.createGeneSetFromFile(geneSetFN)

    # get degradome TCCS
    # note that you need to test the AS peaks, this is the location of the
    # targetted transcript
    oRNA_DC = cgNexusFlat.dataController(aDir, cgOriginRNA.OriginRNA)
    id_oRNA = oRNA_DC.load()

    # Work out the tcc to test once per object; the same value is needed for
    # the comparison and for the final per-object lookup.
    oRNAs = list(id_oRNA.values())
    if AS:
        degTccs = [cg.convertToAS(obj.tcc) for obj in oRNAs]
    else:
        degTccs = [obj.tcc for obj in oRNAs]

    # find all overlapping exons/transcripts, then all results sequences
    # that overlap exons
    overlappingExons = allExons.transcriptOverlaps(degTccs)
    overlappingExonTccs = [x.tcc for x in overlappingExons]
    # set: O(1) membership in the loop below
    overlappingDegTccs = set(compare.compareTwoTcc(degTccs, overlappingExonTccs, 1))

    # store the flag on every object
    for obj, degTcc in zip(oRNAs, degTccs):
        obj.transcriptOverlap = degTcc in overlappingDegTccs

    oRNA_DC.commit(id_oRNA)
def compareMouseHuman(mFN, hFNLift):
    '''Print the overlap between the tccs stored in mFN and those in hFNLift.

    mFN     -- file loaded through cgNexusFlat.Nexus as
               cgOriginRNAFlat.OriginRNA records; only 'tcc' is loaded.
    hFNLift -- tab-separated text file; the first column of every line is
               taken as a tcc.

    Prints whatever compareTwoTcc(aTccs, bTccs, amount=True) returns.
    Returns None.
    '''
    aDC = cgNexusFlat.Nexus(mFN, cgOriginRNAFlat.OriginRNA)
    aDC.load(['tcc'])

    # 'with' closes the lift file even if a line fails to parse; the handle
    # was previously never closed.
    with open(hFNLift, 'r') as f:
        bTccs = [line.strip().split('\t')[0] for line in f]

    aTccs = [aDC.tcc[oID] for oID in aDC.tcc]

    print(compareTwoTcc(aTccs, bTccs, amount=True))
def compareTwoMouse(aFN, bFN):
    '''Print the tcc comparison between two OriginRNA files.

    aFN, bFN -- files loaded through cgNexusFlat.Nexus as
                cgOriginRNAFlat.OriginRNA records; only 'tcc' is loaded.

    Prints the result of compareTwoTcc(..., amount=True).  Returns None.
    '''
    firstNX = cgNexusFlat.Nexus(aFN, cgOriginRNAFlat.OriginRNA)
    firstNX.load(['tcc'])

    secondNX = cgNexusFlat.Nexus(bFN, cgOriginRNAFlat.OriginRNA)
    secondNX.load(['tcc'])

    # pull the tcc of every record out of each container
    firstTccs = [firstNX.tcc[key] for key in firstNX.tcc]
    secondTccs = [secondNX.tcc[key] for key in secondNX.tcc]

    comparison = compareTwoTcc(firstTccs, secondTccs, amount=True)
    print(comparison)
def compareMouseHuman(mFN, hFNLift):
    '''Print the overlap between the tccs stored in mFN and those in hFNLift.

    mFN     -- file loaded through cgNexusFlat.Nexus as
               cgOriginRNAFlat.OriginRNA records; only 'tcc' is loaded.
    hFNLift -- tab-separated text file; the first column of every line is
               taken as a tcc.

    Prints whatever compareTwoTcc(aTccs, bTccs, amount=True) returns.
    Returns None.
    '''
    aDC = cgNexusFlat.Nexus(mFN, cgOriginRNAFlat.OriginRNA)
    aDC.load(['tcc'])

    # 'with' closes the lift file even if a line fails to parse; the handle
    # was previously never closed.
    with open(hFNLift, 'r') as f:
        bTccs = [line.strip().split('\t')[0] for line in f]

    aTccs = [aDC.tcc[oID] for oID in aDC.tcc]

    print(compareTwoTcc(aTccs, bTccs, amount=True))
def _alignmentTargetTcc(alignment):
    '''Return the tcc of the sub-region of alignment.tTcc given by tStart/sLength.'''
    chrom, strand, start, end = cg.tccSplit(alignment.tTcc)
    offset = alignment.tStart
    sLen = alignment.sLength

    # NOTE(review): the constant 19 comes from the original code; what it
    # represents is not visible from here -- confirm before changing it.
    if strand == '1':
        start = start - 19 + offset
        end = start + sLen
    else:
        end = end + 19 - offset
        start = end - sLen

    return cg.makeTcc(chrom, strand, start, end)


def transcriptSetOverlapTargets(aDir):
    '''Flag every alignment under aDir with whether its target overlaps a transcript.

    aDir -- passed to cgDB.dataController to load and commit the
            cgAlignment.cgAlignment objects.

    For each alignment the target region is derived from tTcc, tStart and
    sLength, converted with cg.convertToAS() and tested against the
    transcripts of the gene set.  Sets obj.transcriptOverlap to True or
    False on every alignment and commits them.  Returns None.
    '''
    geneSetFN = '/home/chrisgre/dataSources/known/Human/geneSets/ensemblAllTranscripts.tsv'
    allExons = cgGenes.createGeneSetFromFile(geneSetFN)

    # get degradome TCCS
    # note that you need to test the AS peaks, this is the location of the
    # targetted transcript
    aDC = cgDB.dataController(aDir, cgAlignment.cgAlignment)
    id_alignment = aDC.load()

    alignments = list(id_alignment.values())
    targetTccs = [_alignmentTargetTcc(alignment) for alignment in alignments]

    # create list of unique tccs (first-seen order; the set avoids the
    # quadratic "not in list" scan)
    seen = set()
    uniqTccs = []
    for tcc in targetTccs:
        if tcc not in seen:
            seen.add(tcc)
            uniqTccs.append(tcc)

    degTccs = [cg.convertToAS(x) for x in uniqTccs]

    # find all overlapping exons/transcripts, then all results sequences
    # that overlap exons
    overlappingExons = allExons.transcriptOverlaps(degTccs)
    overlappingExonTccs = [x.tcc for x in overlappingExons]
    overlappingDegTccs = set(compare.compareTwoTcc(degTccs, overlappingExonTccs, 1))

    # update.  Each object is tested with ITS OWN target tcc; previously the
    # loop read the stale 'alignment' variable left over from the loop
    # above, so every object received the verdict of the last alignment.
    for obj, tcc in zip(alignments, targetTccs):
        obj.transcriptOverlap = cg.convertToAS(tcc) in overlappingDegTccs

    aDC.commit(id_alignment)
def compareTwoMouse(aFN, bFN):
    '''Print the tcc comparison between two OriginRNA files.

    aFN, bFN -- files loaded through cgNexusFlat.Nexus as
                cgOriginRNAFlat.OriginRNA records; only 'tcc' is loaded.

    Prints the result of compareTwoTcc(..., amount=True).  Returns None.
    '''
    firstNX = cgNexusFlat.Nexus(aFN, cgOriginRNAFlat.OriginRNA)
    firstNX.load(['tcc'])

    secondNX = cgNexusFlat.Nexus(bFN, cgOriginRNAFlat.OriginRNA)
    secondNX.load(['tcc'])

    # pull the tcc of every record out of each container
    firstTccs = [firstNX.tcc[key] for key in firstNX.tcc]
    secondTccs = [secondNX.tcc[key] for key in secondNX.tcc]

    comparison = compareTwoTcc(firstTccs, secondTccs, amount=True)
    print(comparison)
def splitExonsIntrons(cName = None):
    '''Split the sorted prediction file into '.exons' and '.introns' files.

    cName -- configuration name passed to c.getConfig().

    A hairpin counts as exonic when compare.compareTwoTcc() reports an
    overlap amount of at least minOverlap (50) with the organism's exon
    list.  Every line of conf.conf['resultsSorted'] whose 8th tab-separated
    column is the CID of such a hairpin is written to
    '<resultsSorted>.exons'; every other line goes to
    '<resultsSorted>.introns'.  Returns None.
    '''
    # Not used below; kept because constructing it may have side effects.
    mConf = c.cgConfig('Main.conf')
    conf = c.getConfig(cName)

    # init
    organism = conf.conf['organism']
    minOverlap = 50

    cHairs = getHairpins.getHairpins()  # CID: HAIRPIN
    exonList = compare.tccFileToList('%sExons.tcc' % organism, 0)
    hairpins = [cHairs[CID] for CID in cHairs]

    print('checking overlaps')
    # check which hairpins overlap exons and by how much
    exonOverlapped = compare.compareTwoTcc(hairpins, exonList, 1, amount = True)
    print('  %s' % len(exonOverlapped))

    print('removing partial introns')
    # remove the ones that didn't overlap more than X.  Filtering in one
    # pass replaces the collect-then-list.remove() approach, which was
    # quadratic and relied on rebuilt [tcc, amount] lists comparing equal
    # to the original pairs.
    exonOverlapped = [(tcc, oAmount) for tcc, oAmount in exonOverlapped
                      if oAmount >= minOverlap]
    print('  %s out of %s' % (len(exonOverlapped), len(cHairs.keys())))

    # get CIDs of exons (set: only membership is needed below)
    exonTccs = set(tcc for tcc, oAmount in exonOverlapped)
    exonCIDs = set(str(CID) for CID in cHairs if cHairs[CID] in exonTccs)

    # Open sorted predictions and write lines with CIDs to respective files;
    # 'with' closes all three files even if a line is malformed.
    resultsFN = conf.conf['resultsSorted']
    with open(resultsFN, 'r') as predFile:
        with open(resultsFN + '.exons', 'w') as exonFile:
            with open(resultsFN + '.introns', 'w') as intronFile:
                for line in predFile:
                    if line.split('\t')[7] in exonCIDs:
                        exonFile.write(line)
                    else:
                        intronFile.write(line)
def splitExonsIntrons(cName=None):
    '''Split the sorted prediction file into '.exons' and '.introns' files.

    cName -- configuration name passed to c.getConfig().

    A hairpin counts as exonic when compare.compareTwoTcc() reports an
    overlap amount of at least minOverlap (50) with the organism's exon
    list.  Every line of conf.conf['resultsSorted'] whose 8th tab-separated
    column is the CID of such a hairpin is written to
    '<resultsSorted>.exons'; every other line goes to
    '<resultsSorted>.introns'.  Returns None.
    '''
    # Not used below; kept because constructing it may have side effects.
    mConf = c.cgConfig('Main.conf')
    conf = c.getConfig(cName)

    # init
    organism = conf.conf['organism']
    minOverlap = 50

    cHairs = getHairpins.getHairpins()  # CID: HAIRPIN
    exonList = compare.tccFileToList('%sExons.tcc' % organism, 0)
    hairpins = [cHairs[CID] for CID in cHairs]

    print('checking overlaps')
    # check which hairpins overlap exons and by how much
    exonOverlapped = compare.compareTwoTcc(hairpins, exonList, 1, amount=True)
    print('  %s' % len(exonOverlapped))

    print('removing partial introns')
    # remove the ones that didn't overlap more than X.  Filtering in one
    # pass replaces the collect-then-list.remove() approach, which was
    # quadratic and relied on rebuilt [tcc, amount] lists comparing equal
    # to the original pairs.
    exonOverlapped = [(tcc, oAmount) for tcc, oAmount in exonOverlapped
                      if oAmount >= minOverlap]
    print('  %s out of %s' % (len(exonOverlapped), len(cHairs.keys())))

    # get CIDs of exons (set: only membership is needed below)
    exonTccs = set(tcc for tcc, oAmount in exonOverlapped)
    exonCIDs = set(str(CID) for CID in cHairs if cHairs[CID] in exonTccs)

    # Open sorted predictions and write lines with CIDs to respective files;
    # 'with' closes all three files even if a line is malformed.
    resultsFN = conf.conf['resultsSorted']
    with open(resultsFN, 'r') as predFile:
        with open(resultsFN + '.exons', 'w') as exonFile:
            with open(resultsFN + '.introns', 'w') as intronFile:
                for line in predFile:
                    if line.split('\t')[7] in exonCIDs:
                        exonFile.write(line)
                    else:
                        intronFile.write(line)
def reportDifference(oldFN, newFN):
    '''Print which lines of two coordinate files are shared, old-only or new-only.

    oldFN, newFN -- tab-separated text files; the first column of each line
                    is used as the coordinate (tcc).

    Output has three sections headed 'Both', 'old' and 'new':
      Both -- for every coordinate returned by
              compareData.compareTwoTcc(oldCoords, newCoords, 1), the first
              line (new file searched first, then old) containing it.
      old  -- lines of the old file not printed under 'Both'.
      new  -- lines of the new file not printed under 'Both'.
    Returns None.
    '''
    with open(oldFN, 'r') as f:
        oldList = list(f)
    with open(newFN, 'r') as f:
        newList = list(f)

    oldCoords = [x.strip().split('\t')[0] for x in oldList]
    newCoords = [x.strip().split('\t')[0] for x in newList]

    # new lines are searched before old lines when locating a coordinate
    bothList = newList + oldList

    bothCoords = compareData.compareTwoTcc(oldCoords, newCoords, 1)

    print('Both')
    knownList = set()  # lines already reported; set gives O(1) lookups below
    for x in bothCoords:
        # NOTE(review): this is a substring test, so a coordinate can also
        # match a longer coordinate that merely starts with it.
        for y in bothList:
            if x in y:
                print(y.strip())
                knownList.add(y)
                break

    print('old')
    for y in oldList:
        if y not in knownList:
            print(y.strip())

    print('new')
    for y in newList:
        if y not in knownList:
            print(y.strip())
def overlapWithDegradome(dFN, eFN):
    '''Print how many editing sites overlap a (padded) degradome tcc.

    dFN -- tab-separated degradome file; the second column of each line is
           a tcc.  Each tcc is widened by 3 on both sides before comparing.
    eFN -- file handed to cgEdit.loadEditingSites().

    Prints the first five padded degradome tccs, then the length of the
    result of compareData.compareTwoTcc(eTccs, degTccs, 1).  Returns None.
    '''
    eSites = cgEdit.loadEditingSites(eFN)

    degTccs = []
    # 'with' closes the degradome file; the handle was previously leaked.
    with open(dFN, 'r') as f:
        for line in f:
            ls = line.strip().split('\t')
            chrom, strand, start, end = bioLibCG.tccSplit(ls[1])
            # pad the region by 3 on each side
            degTccs.append(bioLibCG.makeTcc(chrom, strand, start - 3, end + 3))
    print(degTccs[0:5])

    eTccs = [eSite.tcc for eSite in eSites]
    overlaps = compareData.compareTwoTcc(eTccs, degTccs, 1)
    print(len(overlaps))
def overlapWithDegradome(dFN, eFN):
    '''Print how many editing sites overlap a (padded) degradome tcc.

    dFN -- tab-separated degradome file; the second column of each line is
           a tcc.  Each tcc is widened by 3 on both sides before comparing.
    eFN -- file handed to cgEdit.loadEditingSites().

    Prints the first five padded degradome tccs, then the length of the
    result of compareData.compareTwoTcc(eTccs, degTccs, 1).  Returns None.
    '''
    eSites = cgEdit.loadEditingSites(eFN)

    degTccs = []
    # 'with' closes the degradome file; the handle was previously leaked.
    with open(dFN, "r") as f:
        for line in f:
            ls = line.strip().split("\t")
            chrom, strand, start, end = bioLibCG.tccSplit(ls[1])
            # pad the region by 3 on each side
            degTccs.append(bioLibCG.makeTcc(chrom, strand, start - 3, end + 3))
    print(degTccs[0:5])

    eTccs = [eSite.tcc for eSite in eSites]
    overlaps = compareData.compareTwoTcc(eTccs, degTccs, 1)
    print(len(overlaps))
def countWithBins(dFN, binDir, type = 'INTRON'):
    '''Print, for each of 50 bins, the degradome positions overlapping that bin.

    dFN    -- degradome file loaded through cgNexusFlat.Nexus as
              cgDegPeak.Peak records; only 'tcc' is loaded.
    binDir -- directory holding '<type>.<chrom>.<strand>.bins' files for
              every chromosome in bioLibCG.humanChromosomes and both
              strands.  Columns 2..51 of each tab-separated line are the
              tccs of bins 0..49.
    type   -- prefix of the bin file names (default 'INTRON').

    For every bin index, prints the index, the number of overlaps returned
    by compareData.compareTwoTcc(dTccs, bins[i], 1) and the overlaps
    themselves.  Returns None.
    '''
    numBins = 50

    print('loading degradome')
    dNX = cgNexusFlat.Nexus(dFN, cgDegPeak.Peak)
    dNX.load(['tcc'])

    print('loading bins')
    bins = [[] for _ in range(numBins)]

    for chrom in bioLibCG.humanChromosomes:
        for strand in ('1', '-1'):
            binFN = binDir + '/%s.%s.%s.bins' % (type, chrom, strand)
            # 'with' closes each bin file; previously one handle was leaked
            # per chromosome/strand combination.
            with open(binFN, 'r') as f:
                for line in f:
                    ls = line.strip().split('\t')
                    tccs = ls[1:numBins + 1]
                    for i in range(numBins):
                        bins[i].append(tccs[i])

    # collect dTccs in list: each peak is moved to the opposite strand and
    # collapsed to a single coordinate (its start when it was on strand
    # '1', its end otherwise).
    dTccs = []
    for dID in dNX.tcc:
        c, s, st, en = bioLibCG.tccSplit(dNX.tcc[dID])
        if s == '1':
            s = '-1'
            en = st
        else:
            s = '1'
            st = en
        dTccs.append(bioLibCG.makeTcc(c, s, st, en))

    print('%s %s' % (len(dTccs), len(bins[0])))

    for i in range(numBins):
        print(i)
        overlaps = compareData.compareTwoTcc(dTccs, bins[i], 1)
        print(len(overlaps))
        print(overlaps)
def compareTccs(humanFN, liftCoords, rn = None, tn = None):
    '''compare if there is an overlap between a mouse alignment and any human

    humanFN    -- file loaded through cgNexusFlat.Nexus as
                  cgOriginRNAFlat.OriginRNA records; only 'tcc' is loaded.
    liftCoords -- tab-separated text file; the first column of every line
                  is taken as a (mouse) tcc.
    rn, tn     -- forwarded to DC.load() as [rn, tn].

    Duplicate tccs are dropped from both lists, then every item returned by
    compareData.compareTwoTcc(humanList, mouseList) is printed on its own
    line.  Returns None.
    '''
    # 'with' closes the lift file; the handle was previously leaked.
    with open(liftCoords, 'r') as f:
        mouseList = [line.strip().split('\t')[0] for line in f]

    DC = cgNexusFlat.Nexus(humanFN, cgOriginRNAFlat.OriginRNA)
    DC.load(['tcc'], [rn, tn])
    humanList = [DC.tcc[oID] for oID in DC.tcc]

    # de-duplicate both sides before comparing
    mouseList = list(set(mouseList))
    humanList = list(set(humanList))

    x = compareData.compareTwoTcc(humanList, mouseList)
    for i in x:
        print(i)
def compareTccs(humanFN, liftCoords, rn=None, tn=None):
    '''compare if there is an overlap between a mouse alignment and any human

    humanFN    -- file loaded through cgNexusFlat.Nexus as
                  cgOriginRNAFlat.OriginRNA records; only 'tcc' is loaded.
    liftCoords -- tab-separated text file; the first column of every line
                  is taken as a (mouse) tcc.
    rn, tn     -- forwarded to DC.load() as [rn, tn].

    Duplicate tccs are dropped from both lists, then every item returned by
    compareData.compareTwoTcc(humanList, mouseList) is printed on its own
    line.  Returns None.
    '''
    # 'with' closes the lift file; the handle was previously leaked.
    with open(liftCoords, 'r') as f:
        mouseList = [line.strip().split('\t')[0] for line in f]

    DC = cgNexusFlat.Nexus(humanFN, cgOriginRNAFlat.OriginRNA)
    DC.load(['tcc'], [rn, tn])
    humanList = [DC.tcc[oID] for oID in DC.tcc]

    # de-duplicate both sides before comparing
    mouseList = list(set(mouseList))
    humanList = list(set(humanList))

    x = compareData.compareTwoTcc(humanList, mouseList)
    for i in x:
        print(i)
def updateMicroRNAOverlap(aDir, microFN):
    '''Store on every OriginRNA under aDir whether it overlaps a microRNA.

    aDir    -- passed to cgDB.dataController to load and commit the
               cgOriginRNA.OriginRNA objects.
    microFN -- text file with one microRNA coordinate per line.

    Sets oRNA.microOverlap to True when the object's tcc is among the
    results of compare.compareTwoTcc(microCoords, smallCoords, 2), False
    otherwise, then commits the objects.  Returns None.
    '''
    controller = cgDB.dataController(aDir, cgOriginRNA.OriginRNA)
    id_oRNA = controller.load()

    # read the microRNA coordinates, one per line
    microFile = open(microFN, 'r')
    microCoords = []
    for rawLine in microFile:
        microCoords.append(rawLine.strip())
    microFile.close()

    # coordinates of the loaded small RNAs
    smallCoords = [small.tcc for small in id_oRNA.values()]

    # overlap them
    smallOverlaps = compare.compareTwoTcc(microCoords, smallCoords, 2)

    # for each sRNA, save overlap value
    for small in id_oRNA.values():
        if small.tcc in smallOverlaps:
            small.microOverlap = True
        else:
            small.microOverlap = False

    controller.commit(id_oRNA)
def countWithBins(dFN, binDir, type='INTRON'):
    '''Print, for each of 50 bins, the degradome positions overlapping that bin.

    dFN    -- degradome file loaded through cgNexusFlat.Nexus as
              cgDegPeak.Peak records; only 'tcc' is loaded.
    binDir -- directory holding '<type>.<chrom>.<strand>.bins' files for
              every chromosome in bioLibCG.humanChromosomes and both
              strands.  Columns 2..51 of each tab-separated line are the
              tccs of bins 0..49.
    type   -- prefix of the bin file names (default 'INTRON').

    For every bin index, prints the index, the number of overlaps returned
    by compareData.compareTwoTcc(dTccs, bins[i], 1) and the overlaps
    themselves.  Returns None.
    '''
    numBins = 50

    print('loading degradome')
    dNX = cgNexusFlat.Nexus(dFN, cgDegPeak.Peak)
    dNX.load(['tcc'])

    print('loading bins')
    bins = [[] for _ in range(numBins)]

    for chrom in bioLibCG.humanChromosomes:
        for strand in ('1', '-1'):
            binFN = binDir + '/%s.%s.%s.bins' % (type, chrom, strand)
            # 'with' closes each bin file; previously one handle was leaked
            # per chromosome/strand combination.
            with open(binFN, 'r') as f:
                for line in f:
                    ls = line.strip().split('\t')
                    tccs = ls[1:numBins + 1]
                    for i in range(numBins):
                        bins[i].append(tccs[i])

    # collect dTccs in list: each peak is moved to the opposite strand and
    # collapsed to a single coordinate (its start when it was on strand
    # '1', its end otherwise).
    dTccs = []
    for dID in dNX.tcc:
        c, s, st, en = bioLibCG.tccSplit(dNX.tcc[dID])
        if s == '1':
            s = '-1'
            en = st
        else:
            s = '1'
            st = en
        dTccs.append(bioLibCG.makeTcc(c, s, st, en))

    print('%s %s' % (len(dTccs), len(bins[0])))

    for i in range(numBins):
        print(i)
        overlaps = compareData.compareTwoTcc(dTccs, bins[i], 1)
        print(len(overlaps))
        print(overlaps)
def transcriptSetOverlapDegFile(degFile):
    '''Work out, per line of degFile, whether its tcc overlaps a transcript.

    degFile -- tab-separated text file; the second column of each line is
               a tcc, which is converted with cg.convertToAS() before being
               tested against the gene set's transcripts.

    Returns the list of lines produced by cg.appendToLine(line, flag, 3),
    where flag is '1' for an overlapping line and '0' otherwise.

    NOTE(review): the original computed these lines but neither stored nor
    wrote them, and tested every line with the tcc of the LAST line of the
    file (stale loop variable).  The per-line parsing is fixed here and the
    lines are returned; nothing is written back to degFile -- confirm
    whether the file was meant to be rewritten.
    '''
    geneSetFN = '/home/chrisgre/dataSources/known/Human/geneSets/ensemblAllTranscripts.tsv'
    allExons = cgGenes.createGeneSetFromFile(geneSetFN)

    # get degradome TCCS
    # note that you need to test the AS peaks, this is the location of the
    # targetted transcript
    with open(degFile, 'r') as f:
        lines = list(f)
    degTccs = [cg.convertToAS(line.strip().split('\t')[1]) for line in lines]

    # find all overlapping exons/transcripts, then all results sequences
    # that overlap exons
    overlappingExons = allExons.transcriptOverlaps(degTccs)
    overlappingExonTccs = [x.tcc for x in overlappingExons]
    overlappingDegTccs = set(compare.compareTwoTcc(degTccs, overlappingExonTccs, 1))

    # update newLines: each line is paired with its own converted tcc
    newLines = []
    for line, degTcc in zip(lines, degTccs):
        inTran = '1' if degTcc in overlappingDegTccs else '0'
        newLines.append(cg.appendToLine(line, inTran, 3))

    return newLines
def updateMicroRNAOverlap(aDir, microFN):
    '''Store on every OriginRNA under aDir whether it overlaps a microRNA.

    aDir    -- passed to cgNexusFlat.dataController to load and commit the
               cgOriginRNA.OriginRNA objects.
    microFN -- text file with one microRNA coordinate per line.

    Sets oRNA.microOverlap to True when the object's tcc is among the
    results of compare.compareTwoTcc(microCoords, smallCoords, 2), False
    otherwise, then commits the objects.  Returns None.
    '''
    controller = cgNexusFlat.dataController(aDir, cgOriginRNA.OriginRNA)
    id_oRNA = controller.load()

    # read the microRNA coordinates, one per line
    microFile = open(microFN, 'r')
    microCoords = []
    for rawLine in microFile:
        microCoords.append(rawLine.strip())
    microFile.close()

    # coordinates of the loaded small RNAs
    smallCoords = [small.tcc for small in id_oRNA.values()]

    # overlap them
    smallOverlaps = compare.compareTwoTcc(microCoords, smallCoords, 2)

    # for each sRNA, save overlap value
    for small in id_oRNA.values():
        if small.tcc in smallOverlaps:
            small.microOverlap = True
        else:
            small.microOverlap = False

    controller.commit(id_oRNA)
def getOverlappingTranscriptList(self, tccs):
    """Return, as one flat list, the transcripts that overlap any of tccs.

    tccs must be a list; for any other type the integer 1 is returned.
    Transcripts come back in gene order, then transcript order.
    """
    if not isinstance(tccs, list):
        return 1

    # Flatten every transcript of every gene, keeping gene/transcript order.
    allTranscripts = []
    for gene in self.genes:
        allTranscripts.extend(gene.transcripts)

    # Dict used purely to collapse duplicate transcript tccs.
    uniqueTccs = {}
    for candidate in allTranscripts:
        uniqueTccs[candidate.tcc] = 1

    hits = compare.compareTwoTcc(uniqueTccs.keys(), tccs, 1)

    return [candidate for candidate in allTranscripts if candidate.tcc in hits]
def getOverlappingTranscriptList(self, tccs):
    '''Return, as one flat list, the transcripts that overlap any of tccs.

    tccs must be a list; for any other type the integer 1 is returned.
    Transcripts come back in gene order, then transcript order.
    '''
    if not isinstance(tccs, list):
        return 1

    # Flatten every transcript of every gene, keeping gene/transcript order.
    allTranscripts = []
    for gene in self.genes:
        allTranscripts.extend(gene.transcripts)

    # Dict used purely to collapse duplicate transcript tccs.
    uniqueTccs = {}
    for candidate in allTranscripts:
        uniqueTccs[candidate.tcc] = 1

    hits = compare.compareTwoTcc(uniqueTccs.keys(), tccs, 1)

    return [candidate for candidate in allTranscripts if candidate.tcc in hits]
# get results that are only noncoding
#
# Reads the prediction file, keeps every line that contains one of the
# prediction tccs returned by compare.compareTwoTcc(predList, keepList, 1),
# and writes those lines to 'NCmouse.noncoding.results'.
import bioLibCG as cg
import compareData as compare

predName = '/home/chrisgre/projects/NoncodingMouse/results/NCmouse-s3k8b17.bothNCandC.results'

keepList = compare.tccFileToList('keepNoncoding.tcc', 0)
predList = compare.tccFileToList(predName, 1)

keepers = compare.compareTwoTcc(predList, keepList, 1)
print(len(keepers))

# now go back through pred file and create a new file with only lines that
# have noncoding in them
with open(predName, 'r') as predFile:
    predLines = predFile.readlines()

# Lines are written in their original file order (previously the order was
# whatever the intermediate dict happened to yield), each distinct line is
# written once, and 'with' makes sure the output is flushed and closed.
written = set()
with open('NCmouse.noncoding.results', 'w') as outFile:
    for line in predLines:
        if line in written:
            continue
        # substring test, as before; any() stops at the first matching keeper
        if any(keeper in line for keeper in keepers):
            outFile.write(line)
            written.add(line)
# Sliding-window hit count: for every cluster, step a coordinate from the
# start of the first hit up to (not including) the end of the last hit and
# record how many hits compare.compareTwoTcc reports for a window of
# frameLength centred on that coordinate.
# Relies on sortedClusters, outputFile, compare and timer being defined by
# the surrounding code.
frameLength = 200
frameShift = 1
halfFrame = frameLength / 2

for cluster in sortedClusters:
    # tccs are 'chrom:strand:start:end'; chromosome, strand and first
    # coordinate come from the first hit, the last coordinate from the end
    # of the final hit
    firstFields = cluster[0].split(":")
    clusterChrom = firstFields[0]
    clusterStrand = firstFields[1]
    firstCoord = int(firstFields[2])
    lastCoord = int(cluster[-1].split(":")[3])

    startCoord = firstCoord
    while startCoord < lastCoord:
        # window centred on the current coordinate
        rangeStart = startCoord - halfFrame
        rangeEnd = startCoord + halfFrame
        rangeTcc = '%s:%s:%s:%s' % (clusterChrom, clusterStrand, rangeStart, rangeEnd)

        # count how many hits there are in this range
        hitCount = len(compare.compareTwoTcc([rangeTcc], cluster, 2))

        # output
        outputFile.write('%s\t%s\n' % (rangeTcc, hitCount))

        startCoord = startCoord + frameShift

outputFile.close()
print('Output Hits per Frame: %s' % (timer.split(),))
print('Overall Time: %s' % (timer.report(),))
def defineClusters(cName=None):
    '''Group connected mature tccs into clusters and write two report files.

    cName -- configuration name passed to cgConfig.getConfig().

    Steps:
      1. Load the mature tccs from conf.conf['resultsRaw'] and build the
         connection dict with compare.makeConnectionsDict().
      2. Walk the connections so that every tcc reachable from another ends
         up in the same cluster.
      3. Sort each cluster with cg.sortTccList() and write the clusters to
         conf.conf['sortedClusters'], one cluster per line, each hit
         followed by a comma.
      4. For every cluster, slide a window of frameLength (200) in steps of
         frameShift (1) from the start of the first hit up to the end of
         the last hit, and write '<window tcc>\\t<hit count>' lines to
         conf.conf['hitsPerFrame'].
    Timing information is printed after each step.  Returns None.
    '''
    # Start Timer
    timer = cg.cgTimer()
    timer.start()

    # Get list of mature tccs
    conf = cgConfig.getConfig(cName)  # passed or default
    finalMirFileName = conf.conf['resultsRaw']
    matureTccs = compare.tccFileToList(finalMirFileName, 1)  # list of all mature micro in tcc
    print('List getting %s' % (timer.split(),))

    # make connections dict
    matureConnections = compare.makeConnectionsDict(matureTccs)
    print('Make connections: %s' % (timer.split(),))

    # Now have to define Clusters...
    # Depth-first walk with an explicit stack: same visiting order as the
    # former recursive helper, but no recursion-depth limit on large
    # clusters, and the 'added' set gives O(1) "already clustered" checks.
    clusters = []
    added = set()
    for tcc in matureTccs:
        if tcc in added:
            continue
        cluster = []
        stack = [tcc]
        while stack:
            item = stack.pop()
            if item in added:
                continue
            added.add(item)
            cluster.append(item)
            # reversed so neighbours are popped in their original order
            stack.extend(reversed(list(matureConnections[item])))
        clusters.append(cluster)
    print('Make Clusters %s' % (timer.split(),))

    # Sort Clusters.
    sortedClusters = [cg.sortTccList(cluster) for cluster in clusters]
    print('Sort Clusters: %s' % (timer.split(),))

    # Output sorted cluster file.  The file can be read back by stripping
    # the trailing comma of each line and splitting on ','.
    with open(conf.conf['sortedClusters'], 'w') as clusterFile:
        for cluster in sortedClusters:
            for hit in cluster:
                clusterFile.write('%s,' % hit)
            clusterFile.write('\n')
    print('Store intermediate data: %s' % (timer.split(),))

    # output hitsAround file
    frameLength = 200
    frameShift = 1
    halfFrame = frameLength // 2  # explicit integer division keeps coordinates integral
    with open(conf.conf['hitsPerFrame'], 'w') as outputFile:
        for cluster in sortedClusters:
            # grab first and last coordinate from cluster
            # (tcc layout is 'chrom:strand:start:end')
            firstFields = cluster[0].split(":")
            clusterChrom = firstFields[0]
            clusterStrand = firstFields[1]
            firstCoord = int(firstFields[2])
            lastCoord = int(cluster[-1].split(":")[3])

            startCoord = firstCoord
            while startCoord < lastCoord:
                # count how many hits there are in this range
                rangeStart = startCoord - halfFrame
                rangeEnd = startCoord + halfFrame
                rangeTcc = '%s:%s:%s:%s' % (clusterChrom, clusterStrand, rangeStart, rangeEnd)
                overlappedList = compare.compareTwoTcc([rangeTcc], cluster, 2)
                hitCount = len(overlappedList)

                # output
                outputFile.write('%s\t%s\n' % (rangeTcc, hitCount))
                startCoord = startCoord + frameShift

    print('Output Hits per Frame: %s' % (timer.split(),))
    print('Overall Time: %s' % (timer.report(),))
def defineClusters(cName = None):
    '''Group connected mature tccs into clusters and write two report files.

    cName -- configuration name passed to cgConfig.getConfig().

    Steps:
      1. Load the mature tccs from conf.conf['resultsRaw'] and build the
         connection dict with compare.makeConnectionsDict().
      2. Walk the connections so that every tcc reachable from another ends
         up in the same cluster.
      3. Sort each cluster with cg.sortTccList() and write the clusters to
         conf.conf['sortedClusters'], one cluster per line, each hit
         followed by a comma.
      4. For every cluster, slide a window of frameLength (200) in steps of
         frameShift (1) from the start of the first hit up to the end of
         the last hit, and write '<window tcc>\\t<hit count>' lines to
         conf.conf['hitsPerFrame'].
    Timing information is printed after each step.  Returns None.
    '''
    # Start Timer
    timer = cg.cgTimer()
    timer.start()

    # Get list of mature tccs
    conf = cgConfig.getConfig(cName)  # passed or default
    finalMirFileName = conf.conf['resultsRaw']
    matureTccs = compare.tccFileToList(finalMirFileName, 1)  # list of all mature micro in tcc
    print('List getting %s' % (timer.split(),))

    # make connections dict
    matureConnections = compare.makeConnectionsDict(matureTccs)
    print('Make connections: %s' % (timer.split(),))

    # Now have to define Clusters...
    # Depth-first walk with an explicit stack: same visiting order as the
    # former recursive helper, but no recursion-depth limit on large
    # clusters, and the 'added' set gives O(1) "already clustered" checks.
    clusters = []
    added = set()
    for tcc in matureTccs:
        if tcc in added:
            continue
        cluster = []
        stack = [tcc]
        while stack:
            item = stack.pop()
            if item in added:
                continue
            added.add(item)
            cluster.append(item)
            # reversed so neighbours are popped in their original order
            stack.extend(reversed(list(matureConnections[item])))
        clusters.append(cluster)
    print('Make Clusters %s' % (timer.split(),))

    # Sort Clusters.
    sortedClusters = [cg.sortTccList(cluster) for cluster in clusters]
    print('Sort Clusters: %s' % (timer.split(),))

    # Output sorted cluster file.  The file can be read back by stripping
    # the trailing comma of each line and splitting on ','.
    with open(conf.conf['sortedClusters'], 'w') as clusterFile:
        for cluster in sortedClusters:
            for hit in cluster:
                clusterFile.write('%s,' % hit)
            clusterFile.write('\n')
    print('Store intermediate data: %s' % (timer.split(),))

    # output hitsAround file
    frameLength = 200
    frameShift = 1
    halfFrame = frameLength // 2  # explicit integer division keeps coordinates integral
    with open(conf.conf['hitsPerFrame'], 'w') as outputFile:
        for cluster in sortedClusters:
            # grab first and last coordinate from cluster
            # (tcc layout is 'chrom:strand:start:end')
            firstFields = cluster[0].split(":")
            clusterChrom = firstFields[0]
            clusterStrand = firstFields[1]
            firstCoord = int(firstFields[2])
            lastCoord = int(cluster[-1].split(":")[3])

            startCoord = firstCoord
            while startCoord < lastCoord:
                # count how many hits there are in this range
                rangeStart = startCoord - halfFrame
                rangeEnd = startCoord + halfFrame
                rangeTcc = '%s:%s:%s:%s' % (clusterChrom, clusterStrand, rangeStart, rangeEnd)
                overlappedList = compare.compareTwoTcc([rangeTcc], cluster, 2)
                hitCount = len(overlappedList)

                # output
                outputFile.write('%s\t%s\n' % (rangeTcc, hitCount))
                startCoord = startCoord + frameShift

    print('Output Hits per Frame: %s' % (timer.split(),))
    print('Overall Time: %s' % (timer.report(),))