def testmerge(masterDir, parDir):
    '''Fold the objects of every run directory into the master directory.

    Layout described by the original author:

        oRNA            (master)
        aDir            (master)
        pRuns
        --run.00
        ----oRNA        (slave: pRuns/run.00/oRNA)
        ----aDir
        --run.01

    masterDir -- directory whose objects are loaded, merged into and committed.
    parDir    -- root handed to bioLibCG.recursePaths; presumably the paths it
                 yields are the run directories ending in masterDir's base
                 name (TODO confirm against bioLibCG).
    '''
    masterController = cgDB.dataController(masterDir, cgAlignment.cgAlignment)
    merged = masterController.load()

    # Run directories are matched on the base name of the master directory.
    targetName = bioLibCG.getBaseFileName(masterDir)
    for runDir in bioLibCG.recursePaths(parDir, end=targetName):
        runController = cgDB.dataController(runDir, cgAlignment.cgAlignment)
        runObjects = runController.load()
        # NOTE(review): objects are loaded as cgAlignment but merged with the
        # OriginRNA type -- confirm this combination is intended.
        merged = cgDB.mergeTwoObjects(merged, runObjects, cgOriginRNA.OriginRNA)

    masterController.commit(merged)
def filterTargets(oRNADir, aDir, inTranscript, misLevel, centerLevel, minCenterLevel):
    '''Rebuild oRNA.filteredTargets from oRNA.targets for every origin RNA.

    A target alignment is rejected when any of these holds, tested in order:
      * inTranscript is true and alignment.transcriptOverlap is false,
      * alignment.mismatchStatus[misLevel] is true,
      * alignment.centerExpression[centerLevel] is below minCenterLevel.

    inTranscript may be a bool or one of the strings 'True'/'False'.
    misLevel and centerLevel are converted with int(), minCenterLevel with
    float().  Only the origin RNAs are committed.
    '''
    if inTranscript == 'True':
        inTranscript = True
    elif inTranscript == 'False':
        inTranscript = False
    misLevel = int(misLevel)
    centerLevel = int(centerLevel)
    minCenterLevel = float(minCenterLevel)

    oDC = cgDB.dataController(oRNADir, cgOriginRNA.OriginRNA)
    id_oRNA = oDC.load()
    aDC = cgDB.dataController(aDir, cgAlignment.cgAlignment)
    id_alignment = aDC.load()

    def passes(alignment):
        # Checks run in the same order as the rejection rules listed above.
        if inTranscript and not alignment.transcriptOverlap:
            return False
        if alignment.mismatchStatus[misLevel]:
            return False
        return not (alignment.centerExpression[centerLevel] < minCenterLevel)

    for oRNA in id_oRNA.values():
        oRNA.filteredTargets = [aID for aID in oRNA.targets
                                if passes(id_alignment[aID])]

    oDC.commit(id_oRNA)
def testmerge(masterDir, parDir):
    '''Merge the objects found in each run directory into the master set.

    Directory layout, as sketched by the original author:

        oRNA            (master)
        aDir            (master)
        pRuns
        --run.00
        ----oRNA        (slave: pRuns/run.00/oRNA)
        ----aDir
        --run.01

    masterDir -- directory whose objects receive the merge and are committed.
    parDir    -- root passed to bioLibCG.recursePaths together with the base
                 name of masterDir; presumably yields the matching run
                 directories (TODO confirm).
    '''
    mDC = cgDB.dataController(masterDir, cgAlignment.cgAlignment)
    combined = mDC.load()

    baseName = bioLibCG.getBaseFileName(masterDir)
    slaveDirs = bioLibCG.recursePaths(parDir, end=baseName)
    for slaveDir in slaveDirs:
        slaveObjects = cgDB.dataController(slaveDir, cgAlignment.cgAlignment).load()
        # NOTE(review): controllers use cgAlignment while the merge is told
        # OriginRNA -- verify which type is meant here.
        combined = cgDB.mergeTwoObjects(combined, slaveObjects, cgOriginRNA.OriginRNA)

    mDC.commit(combined)
def plotPairs(oDir, aDir, cName): oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA) id_oRNA = oDC.load() aDC = cgDB.dataController(aDir, cgAlignment.cgAlignment) id_alignment = aDC.load() for oID, oRNA in id_oRNA.items(): if not oRNA.passedFilter: continue for aID in oRNA.filteredTargets: alignment = id_alignment[aID] chrom, strand, start, end = bioLibCG.tccSplit(alignment.tTcc) offset = alignment.tStart sLen = alignment.sLength print sLen print oRNA.sequence print oRNA.tcc print alignment.tTcc if strand == '1': start = start - 19 + offset end = start + sLen else: end = end + 19 - offset start = end - sLen print chrom, strand, start, end scanRange = bioLibCG.makeTcc(chrom, strand, start, end) stretch = cgPeaks.stretch(scanRange, cName) sortedKeys = stretch.profile.keys() sortedKeys.sort() if strand == '-1': sortedKeys.reverse() xVals = range(1, sLen + 2) xVals = sortedKeys yVals = [stretch.profile[x] for x in sortedKeys] print xVals, len(xVals) print yVals, len(yVals) plt.plot(xVals, yVals) plt.show() return 0
def markMismatchedPairs(aDir):
    '''Flag alignments that have a mismatch inside three nested windows.

    alignment.mismatchStatus becomes a list of three booleans.  Entry k is
    True when alignment.mismatchPositions contains any position of window k:

        k = 0: 9..12      k = 1: 8..13      k = 2: 7..14   (inclusive)

    The alignments are committed afterwards.
    '''
    aDC = cgDB.dataController(aDir, cgAlignment.cgAlignment)
    id_alignment = aDC.load()

    # low, mid and high stringency windows
    windows = (range(9, 13), range(8, 14), range(7, 15))

    for alignment in id_alignment.values():
        alignment.mismatchStatus = [False, False, False]
        positions = alignment.mismatchPositions
        for level, window in enumerate(windows):
            alignment.mismatchStatus[level] = any(p in positions for p in window)

    aDC.commit(id_alignment)
def totalSNR(oDir, filterList): fList = [] f = open(filterList, 'r') for line in f: ls = line.strip().split('\t') fList.append(int(line.strip())) oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA) id_oRNA = oDC.load() totalRun = 0 totalSim = 0 simsTotals = [] n = 0 for oRNA in id_oRNA.values(): if oRNA.id in fList: if len(oRNA.filteredTargets) == 0: continue totalRun += len(oRNA.filteredTargets) simsTotal = oRNA.avgNumSimulationTargets * 10 simsTotals.append(simsTotal) totalSim += oRNA.avgNumSimulationTargets n +=1 print 'Total Number Targets for my run:', totalRun print 'Total Number Targets for Simulations:', totalSim print 'SNR', float(totalRun)/float(totalSim) print 'stderr(%s)' % n, stdv(simsTotals)/sqrt(10)
def uniqueTargets(oDir): oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA) id_oRNA = oDC.load() aID_numHit = {} uniqueTargets = [] totalTargets = [] for oRNA in id_oRNA.values(): if not oRNA.passedFilter: continue for aID in oRNA.filteredTargets: numHits = aID_numHit.get(aID, 0) aID_numHit[aID] = numHits + 1 if aID not in uniqueTargets: uniqueTargets.append(aID) totalTargets.append(aID) for aID, numHit in aID_numHit.items(): print aID, numHit print len(uniqueTargets) print len(totalTargets)
def loadAlignments2(aDir, alignmentFN):
    '''Create alignment objects from a file that carries explicit ids.

    Each line of alignmentFN is split on single spaces into:

        id sID tID sStart sEnd tStart tEnd sLength tLength numMismatches
        [mismatchPositions]

    where the optional last column is a comma-separated list of integers.
    The original author added the leading id column "to parallel things".

    aDir        -- directory handed to cgDB.dataController for committing.
    alignmentFN -- path of the alignment text file.
    '''
    # NOTE(review): other functions in this file pass cgAlignment.cgAlignment;
    # here the bare name is used as both type and constructor -- confirm
    # which import is in effect.
    aDC = cgDB.dataController(aDir, cgAlignment)
    id_alignment = {}

    # "with" closes the file even if a malformed line raises.
    with open(alignmentFN, 'r') as f:
        for line in f:
            ls = line.strip().split(' ')
            alignmentID = int(ls[0])
            a = cgAlignment(alignmentID)
            a.sID, a.tID = int(ls[1]), int(ls[2])
            a.sStart, a.sEnd = int(ls[3]), int(ls[4])
            a.tStart, a.tEnd = int(ls[5]), int(ls[6])
            a.sLength, a.tLength = int(ls[7]), int(ls[8])
            a.numMismatches = int(ls[9])
            # The mismatch column is missing when the line has no 11th field.
            if len(ls) > 10:
                a.mismatchPositions = [int(x) for x in ls[10].split(',')]
            else:
                a.mismatchPositions = []
            id_alignment[alignmentID] = a

    aDC.commit(id_alignment)
def updateEndContig(oDir):
    '''Store the longer terminal run of identical letters on each origin RNA.

    For oRNA.sequence the run of identical letters starting at the 5' end and
    the run ending at the 3' end are measured; the larger of the two lengths
    is written to oRNA.endContigLength.  The origin RNAs are then committed.
    '''
    def runLength(letters):
        # Length of the run of equal items starting at letters[0].  An empty
        # input yields 1.
        count = 1
        while count < len(letters) and letters[count] == letters[count - 1]:
            count += 1
        return count

    oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
    id_oRNA = oDC.load()

    for oRNA in id_oRNA.values():
        seq = oRNA.sequence
        fivePrime = runLength(seq)
        threePrime = runLength(list(reversed(seq)))
        oRNA.endContigLength = max(fivePrime, threePrime)

    oDC.commit(id_oRNA)
def transcriptSetOverlap(aDir, AS):
    '''Mark every origin RNA with whether its tcc overlaps a known transcript.

    aDir -- directory of OriginRNA objects (via cgDB.dataController).
    AS   -- when true, each tcc is passed through cg.convertToAS before the
            comparison.  May be a bool or the strings 'True'/'False'.

    Sets obj.transcriptOverlap to True or False on every object and commits.
    '''
    # bool('False') is True, so the original bool(AS) switched antisense mode
    # on for the string 'False'.  Parse the two strings explicitly, as the
    # other entry points in this file do; anything else keeps its bool().
    if AS == 'True':
        AS = True
    elif AS == 'False':
        AS = False
    else:
        AS = bool(AS)

    geneSetFN = '/home/chrisgre/dataSources/known/Human/geneSets/ensemblAllTranscripts.tsv'
    allExons = cgGenes.createGeneSetFromFile(geneSetFN)

    def degTccOf(obj):
        # Original author's note: the AS peaks need testing because that is
        # the location of the targeted transcript.
        if AS:
            return cg.convertToAS(obj.tcc)
        return obj.tcc

    # get degradome TCCS
    oRNA_DC = cgDB.dataController(aDir, cgOriginRNA.OriginRNA)
    id_oRNA = oRNA_DC.load()
    degTccs = [degTccOf(x) for x in id_oRNA.values()]

    # find all overlapping exons/transcripts, then the tccs that overlap them
    overlappingExons = allExons.transcriptOverlaps(degTccs)
    overlappingExonTccs = [x.tcc for x in overlappingExons]
    overlappingDegTccs = compare.compareTwoTcc(degTccs, overlappingExonTccs, 1)

    # record the result on each object
    for obj in id_oRNA.values():
        obj.transcriptOverlap = degTccOf(obj) in overlappingDegTccs

    oRNA_DC.commit(id_oRNA)
def loadAlignments(aDir, alignmentFN):
    '''Create alignment objects from a text file, numbering them by line.

    Each line of alignmentFN is split on single spaces into:

        sID tID sStart sEnd tStart tEnd sLength tLength numMismatches
        [mismatchPositions]

    where the optional last column is a comma-separated list of integers.
    The zero-based line index becomes the alignment id.

    aDir        -- directory handed to cgDB.dataController for committing.
    alignmentFN -- path of the alignment text file.
    '''
    # NOTE(review): other functions in this file pass cgAlignment.cgAlignment;
    # here the bare name is used as both type and constructor -- confirm
    # which import is in effect.
    aDC = cgDB.dataController(aDir, cgAlignment)
    id_alignment = {}

    # "with" closes the file even if a malformed line raises.
    with open(alignmentFN, 'r') as f:
        for i, line in enumerate(f):
            ls = line.strip().split(' ')
            a = cgAlignment(i)
            a.sID, a.tID = int(ls[0]), int(ls[1])
            a.sStart, a.sEnd = int(ls[2]), int(ls[3])
            a.tStart, a.tEnd = int(ls[4]), int(ls[5])
            a.sLength, a.tLength = int(ls[6]), int(ls[7])
            a.numMismatches = int(ls[8])
            # The mismatch column is missing when the line has no 10th field.
            if len(ls) > 9:
                a.mismatchPositions = [int(x) for x in ls[9].split(',')]
            else:
                a.mismatchPositions = []
            id_alignment[i] = a

    aDC.commit(id_alignment)
def markMismatchedPairs(aDir):
    '''Record, per alignment, whether a mismatch hits each central window.

    alignment.mismatchStatus is rebuilt as three booleans, one per window:

        index 0: positions 9..12
        index 1: positions 8..13
        index 2: positions 7..14      (all inclusive)

    An entry is True when alignment.mismatchPositions holds at least one
    position of that window.  The alignments are committed at the end.
    '''
    aDC = cgDB.dataController(aDir, cgAlignment.cgAlignment)
    id_alignment = aDC.load()

    lowRange, midRange, highRange = range(9, 13), range(8, 14), range(7, 15)

    def touches(window, positions):
        # True as soon as one window position is a mismatch position.
        return any(i in positions for i in window)

    for alignment in id_alignment.values():
        alignment.mismatchStatus = [False, False, False]
        mismatches = alignment.mismatchPositions
        alignment.mismatchStatus[0] = touches(lowRange, mismatches)
        alignment.mismatchStatus[1] = touches(midRange, mismatches)
        alignment.mismatchStatus[2] = touches(highRange, mismatches)

    aDC.commit(id_alignment)
def updateEndContig(oDir):
    '''Set oRNA.endContigLength to the longest run at either sequence end.

    The run of identical letters is measured from the first letter (5') and
    from the last letter (3') of oRNA.sequence.  The greater length is
    stored, then all origin RNAs are committed.
    '''
    def leadingRun(letters):
        # Count how many items repeat the first one, starting at 1 so that an
        # empty input still reports 1.
        length = 1
        for previous, current in zip(letters, letters[1:]):
            if previous != current:
                break
            length += 1
        return length

    oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
    id_oRNA = oDC.load()

    for oRNA in id_oRNA.values():
        forward = list(oRNA.sequence)
        backward = forward[::-1]
        cLength5 = leadingRun(forward)
        cLength3 = leadingRun(backward)
        oRNA.endContigLength = cLength3 if cLength3 > cLength5 else cLength5

    oDC.commit(id_oRNA)
def probeORNA(oDir):
    '''Pretty-print every origin RNA in oDir whose passedFilter flag is set.'''
    id_oRNA = cgDB.dataController(oDir, cgOriginRNA.OriginRNA).load()
    for oRNA in id_oRNA.values():
        if not oRNA.passedFilter:
            continue
        cgOriginRNA.prettyPrint(oRNA)
def transcriptSetOverlapTargets(aDir):
    '''Flag every alignment whose target window overlaps a known transcript.

    The target window of an alignment is derived from alignment.tTcc,
    alignment.tStart and alignment.sLength, converted with cg.convertToAS
    and compared against the transcripts loaded from the gene-set file.
    alignment.transcriptOverlap is set to True or False and the alignments
    are committed.

    aDir -- directory of cgAlignment objects (via cgDB.dataController).
    '''
    geneSetFN = '/home/chrisgre/dataSources/known/Human/geneSets/ensemblAllTranscripts.tsv'
    allExons = cgGenes.createGeneSetFromFile(geneSetFN)

    def targetTcc(alignment):
        # Window of the aligned site on the target, before the antisense
        # conversion.  Shared by both passes so they cannot drift apart.
        chrom, strand, start, end = cg.tccSplit(alignment.tTcc)
        offset = alignment.tStart
        sLen = alignment.sLength
        if strand == '1':
            start = start - 19 + offset
            end = start + sLen
        else:
            end = end + 19 - offset
            start = end - sLen
        return cg.makeTcc(chrom, strand, start, end)

    # Original author's note: the AS peaks need testing because that is the
    # location of the targeted transcript.
    aDC = cgDB.dataController(aDir, cgAlignment.cgAlignment)
    id_alignment = aDC.load()

    # Unique tccs in first-seen order; the set replaces a quadratic list scan.
    # assumes cg.makeTcc returns a hashable value such as a string -- TODO confirm
    seen = set()
    uniqTccs = []
    for alignment in id_alignment.values():
        tcc = targetTcc(alignment)
        if tcc not in seen:
            seen.add(tcc)
            uniqTccs.append(tcc)

    degTccs = [cg.convertToAS(x) for x in uniqTccs]

    # find all overlapping exons/transcripts, then the tccs that overlap them
    overlappingExons = allExons.transcriptOverlaps(degTccs)
    overlappingExonTccs = [x.tcc for x in overlappingExons]
    overlappingDegTccs = compare.compareTwoTcc(degTccs, overlappingExonTccs, 1)

    # The original read the stale loop variable "alignment" here, giving every
    # object the result of the last alignment.  Each object is now evaluated
    # from its own coordinates.
    for obj in id_alignment.values():
        degTcc = cg.convertToAS(targetTcc(obj))
        obj.transcriptOverlap = degTcc in overlappingDegTccs

    aDC.commit(id_alignment)
def test(oDir): oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA) id_oRNA = oDC.load() print id_oRNA[1].targets id_oRNA[1].targets.append(13) print id_oRNA[1].targets print id_oRNA[2].targets
def probeMicro(oDir): oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA) id_oRNA = oDC.load() for oRNA in id_oRNA.values(): if oRNA.passedFilter: print oRNA.id, oRNA.sequence, oRNA.tcc, oRNA.tccs
def updateEntropy(aDir):
    '''Store getEntropy(sequence) as the entropy of every origin RNA.

    aDir -- directory of OriginRNA objects (via cgDB.dataController); the
            updated objects are committed back through the same controller.
    '''
    controller = cgDB.dataController(aDir, cgOriginRNA.OriginRNA)
    id_oRNA = controller.load()

    for oRNA in id_oRNA.values():
        oRNA.entropy = getEntropy(oRNA.sequence)

    controller.commit(id_oRNA)
def pPrint(aDir): aDC = cgDB.dataController(aDir, cgAlignment) id_alignment = aDC.load() for alignment in id_alignment.values(): attName_att = alignment.__dict__ attVals = [attName_att['id'], attName_att['tID'], attName_att['tTcc']] attVals = [str(x) for x in attVals] print '\t'.join(attVals)
def probeAlignments(aDir): probePairs = [[6, 35934]] aDC = cgDB.dataController(aDir, cgAlignment.cgAlignment) id_alignment = aDC.load() for alignment in id_alignment.values(): for sID, tID in probePairs: if alignment.sID == sID and alignment.tID == tID: print alignment.id, alignment.sID, alignment.tID, alignment.centerExpression, alignment.mismatchStatus, alignment.numMismatches, alignment.transcriptOverlap
def updateTargetIDs(oDir, aDir): #load the data aDC = cgDB.dataController(aDir, cgAlignment.cgAlignment) id_alignment = aDC.load() oRNA_DC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA) id_oRNA = oRNA_DC.load() #clear targets that are there. for oRNA in id_oRNA.values(): oRNA.targets = [] #update the targets for oRNAs for alignment in id_alignment.values(): try: id_oRNA[alignment.sID].targets.append(alignment.id) except KeyError: print 'oRNA key missing', alignment.sID, alignment.id #save oRNA_DC.commit(id_oRNA)
def updateID(aDir, peakFN):
    '''Initialise the database with one OriginRNA per line of peakFN.

    The line content is ignored; only the zero-based line index is used, as
    the id of the new object.

    aDir   -- directory handed to cgDB.dataController for committing.
    peakFN -- path of the peak file whose lines are counted.
    '''
    oRNA_DC = cgDB.dataController(aDir, cgOriginRNA.OriginRNA)
    id_oRNA = {}

    # "with" closes the file; the original left the handle open.
    with open(peakFN, 'r') as f:
        for i, _line in enumerate(f):
            id_oRNA[i] = cgOriginRNA.OriginRNA(i)

    oRNA_DC.commit(id_oRNA)
def updateIDFromQuery(aDir, queryFN):
    '''Initialise the database with one OriginRNA per id listed in queryFN.

    The first tab-separated field of every line is read as the integer id of
    a new OriginRNA object.

    aDir    -- directory handed to cgDB.dataController for committing.
    queryFN -- path of the tab-separated query file.

    Raises ValueError when a first field is not an integer.
    '''
    oRNA_DC = cgDB.dataController(aDir, cgOriginRNA.OriginRNA)
    id_oRNA = {}

    # "with" closes the file; the original left the handle open.
    with open(queryFN, 'r') as f:
        for line in f:
            # named oID so the builtin id() is not shadowed
            oID = int(line.strip().split('\t')[0])
            id_oRNA[oID] = cgOriginRNA.OriginRNA(oID)

    oRNA_DC.commit(id_oRNA)
def updateSmallExpression(aDir, cName):
    '''Store the highest stretch level over each origin RNA's tcc as eLevel.

    aDir  -- directory of OriginRNA objects (via cgDB.dataController).
    cName -- forwarded to cgPeaks.stretch; per the original comment the
             stretch holds values for the small library.
    '''
    controller = cgDB.dataController(aDir, cgOriginRNA.OriginRNA)
    id_oRNA = controller.load()

    for oRNA in id_oRNA.values():
        stretch = cgPeaks.stretch(oRNA.tcc, cName)
        oRNA.eLevel = stretch.getHighestLevel()

    controller.commit(id_oRNA)
def filterORNA(oDir, maxEndContig, maxTotalContig, minSNR, minNumTargets, keepDuplicates=False): if keepDuplicates == 'True': keepDuplicates = True if keepDuplicates == 'False': keepDuplicates = False maxEndContig, maxTotalContig = int(maxEndContig), int(maxTotalContig) minNumTargets = int(minNumTargets) minSNR = float(minSNR) oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA) id_oRNA = oDC.load() for oRNA in id_oRNA.values(): oRNA.passedFilter = True if len(oRNA.filteredTargets) < minNumTargets: oRNA.passedFilter = False cgOriginRNA.prettyPrint(oRNA, 'numTargets') continue if oRNA.endContigLength > maxEndContig: cgOriginRNA.prettyPrint(oRNA, 'endContig') oRNA.passedFilter = False continue if oRNA.totalContigLength > maxTotalContig: cgOriginRNA.prettyPrint(oRNA, 'totalContig') oRNA.passedFilter = False continue if oRNA.snr < minSNR: cgOriginRNA.prettyPrint(oRNA, 'SNR fail') oRNA.passedFilter = False continue if not keepDuplicates: if oRNA.sequenceDuplicate: cgOriginRNA.prettyPrint(oRNA, 'Duplicate Fail') oRNA.passedFilter = False continue print 'PASSED:', oRNA.id, ','.join( str(x) for x in oRNA.filteredTargets ), oRNA.entropy, oRNA.avgNumSimulationTargets, oRNA.snr, oRNA.endContigLength, oRNA.sequence oDC.commit(id_oRNA)
def updateSmallExpression(aDir, cName):
    '''Set oRNA.eLevel to the highest level of the stretch over oRNA.tcc.

    aDir  -- directory of OriginRNA objects, loaded and committed through
             cgDB.dataController.
    cName -- passed to cgPeaks.stretch; the original comment says the
             stretch holds values for the small library.
    '''
    oRNA_DC = cgDB.dataController(aDir, cgOriginRNA.OriginRNA)
    id_oRNA = oRNA_DC.load()

    for oRNA in id_oRNA.values():
        oRNA.eLevel = cgPeaks.stretch(oRNA.tcc, cName).getHighestLevel()

    oRNA_DC.commit(id_oRNA)
def updateSequence(aDir, seqFN):
    '''Assign sequences from seqFN to origin RNAs by line number.

    The first tab-separated field of line i (zero-based) becomes
    id_oRNA[i].sequence.

    aDir  -- directory of OriginRNA objects (via cgDB.dataController).
    seqFN -- path of the sequence file.

    Raises KeyError when a line index has no origin RNA with that id.
    '''
    oRNA_DC = cgDB.dataController(aDir, cgOriginRNA.OriginRNA)
    id_oRNA = oRNA_DC.load()

    # "with" closes the file; the original left the handle open.
    with open(seqFN, 'r') as f:
        for i, line in enumerate(f):
            id_oRNA[i].sequence = line.strip().split('\t')[0]

    oRNA_DC.commit(id_oRNA)
def updateTcc(aDir, tccFN):
    '''Assign tcc coordinates from tccFN to origin RNAs by line number.

    The first tab-separated field of line i (zero-based) becomes
    id_oRNA[i].tcc.

    aDir  -- directory of OriginRNA objects (via cgDB.dataController).
    tccFN -- path of the coordinate file.

    Raises KeyError when a line index has no origin RNA with that id.
    '''
    oRNA_DC = cgDB.dataController(aDir, cgOriginRNA.OriginRNA)
    id_oRNA = oRNA_DC.load()

    # "with" closes the file; the original left the handle open.
    with open(tccFN, 'r') as f:
        for i, line in enumerate(f):
            id_oRNA[i].tcc = line.strip().split('\t')[0]

    oRNA_DC.commit(id_oRNA)
def filterTargets(oRNADir, aDir, inTranscript, misLevel, centerLevel, minCenterLevel):
    '''Select, per origin RNA, the targets that survive three filters.

    oRNA.filteredTargets is rebuilt from oRNA.targets.  An alignment is kept
    only if all of the following hold:
      * inTranscript is false, or alignment.transcriptOverlap is true,
      * alignment.mismatchStatus[misLevel] is false,
      * alignment.centerExpression[centerLevel] is not below minCenterLevel.

    inTranscript accepts a bool or the strings 'True'/'False'; the level
    arguments are converted with int() and minCenterLevel with float().
    Only the origin RNAs are committed.
    '''
    if inTranscript == 'True':
        inTranscript = True
    elif inTranscript == 'False':
        inTranscript = False

    misLevel = int(misLevel)
    centerLevel = int(centerLevel)
    minCenterLevel = float(minCenterLevel)

    oDC = cgDB.dataController(oRNADir, cgOriginRNA.OriginRNA)
    id_oRNA = oDC.load()
    id_alignment = cgDB.dataController(aDir, cgAlignment.cgAlignment).load()

    for oRNA in id_oRNA.values():
        oRNA.filteredTargets = []
        for aID in oRNA.targets:
            alignment = id_alignment[aID]

            # "or" short-circuits, so the tests run in the listed order.
            rejected = (
                (inTranscript and not alignment.transcriptOverlap)
                or alignment.mismatchStatus[misLevel]
                or alignment.centerExpression[centerLevel] < minCenterLevel
            )
            if not rejected:
                oRNA.filteredTargets.append(aID)

    oDC.commit(id_oRNA)
def getSeqs(oDir): oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA) id_oRNA = oDC.load() for id, oRNA in id_oRNA.items(): if oRNA.sequenceDuplicate: continue if oRNA.totalContigLength > 6: continue if oRNA.endContigLength > 6: continue print "%s" % id
def updateAvgNumTargets(oDir): oID_numTargets = {} for i in range(0,10): print i simDirRNA = '/home/chrisgre/scripts/simulations/simsk50Filtered/simulation.%s/oRNA' % i oDC = cgDB.dataController(simDirRNA, cgOriginRNA.OriginRNA) id_sRNA = oDC.load() for id, sRNA in id_sRNA.items(): currTargets = oID_numTargets.get(id, 0) oID_numTargets[id] = currTargets + len(sRNA.filteredTargets) #now save it oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA) id_oRNA = oDC.load() for oRNA in id_oRNA.values(): totalNum = oID_numTargets.get(oRNA.id, 0) avgNum = float(totalNum)/float(10.0) oRNA.avgNumSimulationTargets = avgNum oDC.commit(id_oRNA)
def quickScript(oDir): oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA) id_oRNA = oDC.load() failedList = [] for id, oRNA in id_oRNA.items(): if oRNA.endContigLength > 7 or oRNA.totalContigLength > 7: failedList.append(id) for id in failedList: print id
def getSeqs(oDir): oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA) id_oRNA = oDC.load() for id, oRNA in id_oRNA.items(): if oRNA.sequenceDuplicate: continue if oRNA.totalContigLength > 6: continue if oRNA.endContigLength > 6: continue print '%s' % id
def updateSNR(oDir):
    '''Store filtered-target count over simulated average as oRNA.snr.

    When avgNumSimulationTargets equals 0 the divisor is replaced by .01 so
    the ratio stays defined.  The origin RNAs are committed afterwards.
    '''
    oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
    id_oRNA = oDC.load()

    for oRNA in id_oRNA.values():
        signal = float(len(oRNA.filteredTargets))
        noise = oRNA.avgNumSimulationTargets
        oRNA.snr = signal / (.01 if noise == 0 else noise)

    oDC.commit(id_oRNA)
def updateAvgNumTargets(oDir): oID_numTargets = {} for i in range(0, 10): print i simDirRNA = '/home/chrisgre/scripts/simulations/simsk50Filtered/simulation.%s/oRNA' % i oDC = cgDB.dataController(simDirRNA, cgOriginRNA.OriginRNA) id_sRNA = oDC.load() for id, sRNA in id_sRNA.items(): currTargets = oID_numTargets.get(id, 0) oID_numTargets[id] = currTargets + len(sRNA.filteredTargets) #now save it oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA) id_oRNA = oDC.load() for oRNA in id_oRNA.values(): totalNum = oID_numTargets.get(oRNA.id, 0) avgNum = float(totalNum) / float(10.0) oRNA.avgNumSimulationTargets = avgNum oDC.commit(id_oRNA)
def updateSNR(oDir):
    '''Compute oRNA.snr as filtered targets divided by the simulated average.

    A simulated average equal to 0 is replaced by .01 before dividing.  All
    origin RNAs are committed at the end.
    '''
    controller = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
    id_oRNA = controller.load()

    for oRNA in id_oRNA.values():
        observed = len(oRNA.filteredTargets)
        expected = oRNA.avgNumSimulationTargets
        if expected == 0:
            # avoid dividing by zero when no simulation produced a target
            expected = .01
        oRNA.snr = float(observed) / expected

    controller.commit(id_oRNA)
def markCenterExpression(aDir, cName):
    '''Record the fraction of target-site expression in three central windows.

    The target window is built from alignment.tTcc, alignment.tStart and
    alignment.sLength.  The sorted profile positions are reversed for strand
    '-1'.  alignment.centerExpression[k] is the sum of profile levels in the
    slice for window k, divided by stretch.getSumOfLevels():

        k = 0: [8:12]      k = 1: [7:13]      k = 2: [6:14]

    All three values stay 0.0 when the level sum is zero.  The alignments
    are committed at the end.
    '''
    aDC = cgDB.dataController(aDir, cgAlignment.cgAlignment)
    id_alignment = aDC.load()

    centerSlices = ((8, 12), (7, 13), (6, 14))

    for alignment in id_alignment.values():
        alignment.centerExpression = [0.0, 0.0, 0.0]

        chrom, strand, start, end = bioLibCG.tccSplit(alignment.tTcc)
        offset = alignment.tStart
        sLen = alignment.sLength

        if strand == '1':
            start = start - 19 + offset
            end = start + sLen
        else:
            end = end + 19 - offset
            start = end - sLen

        scanRange = bioLibCG.makeTcc(chrom, strand, start, end)
        stretch = cgPeaks.stretch(scanRange, cName)
        expressionSum = stretch.getSumOfLevels()

        positions = sorted(stretch.profile.keys())
        if strand == '-1':
            positions.reverse()

        if expressionSum == 0:
            continue

        for level, (lo, hi) in enumerate(centerSlices):
            # float accumulator keeps the division below a true division
            windowTotal = 0.0
            for key in positions[lo:hi]:
                windowTotal += stretch.profile[key]
            alignment.centerExpression[level] = windowTotal / expressionSum

    aDC.commit(id_alignment)
def markCenterExpression(aDir, cName):
    '''Store how much of each target site's expression sits near its centre.

    For every alignment a window is derived from tTcc, tStart and sLength
    and a cgPeaks.stretch is built over it.  centerExpression holds three
    floats: the summed profile levels of slices [8:12], [7:13] and [6:14] of
    the ordered positions, each divided by the stretch's level sum.
    Positions are ordered ascending, or descending for strand '-1'.  If the
    level sum is zero the values remain 0.0.  Alignments are committed last.
    '''
    aDC = cgDB.dataController(aDir, cgAlignment.cgAlignment)
    id_alignment = aDC.load()

    def fraction(profile, keys, total):
        # Start from 0.0 so the result is a float ratio.
        accumulated = 0.0
        for key in keys:
            accumulated += profile[key]
        return accumulated / total

    for alignment in id_alignment.values():
        alignment.centerExpression = [0.0, 0.0, 0.0]

        chrom, strand, start, end = bioLibCG.tccSplit(alignment.tTcc)
        offset = alignment.tStart
        sLen = alignment.sLength
        if strand == '1':
            start = start - 19 + offset
            end = start + sLen
        else:
            end = end + 19 - offset
            start = end - sLen

        stretch = cgPeaks.stretch(bioLibCG.makeTcc(chrom, strand, start, end), cName)
        expressionSum = stretch.getSumOfLevels()

        sortedKeys = sorted(stretch.profile.keys())
        if strand == '-1':
            sortedKeys.reverse()

        if expressionSum != 0:
            profile = stretch.profile
            alignment.centerExpression[0] = fraction(profile, sortedKeys[8:12], expressionSum)
            alignment.centerExpression[1] = fraction(profile, sortedKeys[7:13], expressionSum)
            alignment.centerExpression[2] = fraction(profile, sortedKeys[6:14], expressionSum)

    aDC.commit(id_alignment)
def signalHisto(oDir, title='SNR'):
    '''Show a 30-bin histogram of log2(snr) for origin RNAs that passed.

    oDir  -- directory of OriginRNA objects (via cgDB.dataController).
    title -- plot title.

    math.log raises ValueError if a passing origin RNA has an snr of 0 or
    less.
    '''
    id_oRNA = cgDB.dataController(oDir, cgOriginRNA.OriginRNA).load()

    histVals = [math.log(oRNA.snr, 2)
                for oRNA in id_oRNA.values()
                if oRNA.passedFilter]

    plt.hist(histVals, 30, facecolor='b', alpha=.75)
    plt.axis([-4, 10, 0, 30])
    plt.title('%s' % title)
    plt.xlabel('log2(Signal/Noise)')
    plt.ylabel('Number of Origin RNAs')
    plt.show()
def signalHisto(oDir, title='SNR'):
    '''Plot the distribution of log2 signal-to-noise over passing origin RNAs.

    Origin RNAs without passedFilter are ignored.  The histogram uses 30
    bins and fixed axes [-4, 10] by [0, 30].

    oDir  -- directory of OriginRNA objects (via cgDB.dataController).
    title -- text shown as the plot title.

    math.log raises ValueError for an snr of 0 or less.
    '''
    oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
    passing = [oRNA for oRNA in oDC.load().values() if oRNA.passedFilter]
    histVals = [math.log(oRNA.snr, 2) for oRNA in passing]

    plt.hist(histVals, 30, facecolor='b', alpha=.75)
    plt.axis([-4, 10, 0, 30])
    plt.title('%s' % title)
    plt.xlabel('log2(Signal/Noise)')
    plt.ylabel('Number of Origin RNAs')
    plt.show()
def filterORNA(oDir, maxEndContig, maxTotalContig, minSNR, minNumTargets, keepDuplicates = False): if keepDuplicates == 'True': keepDuplicates = True if keepDuplicates == 'False': keepDuplicates = False maxEndContig, maxTotalContig = int(maxEndContig), int(maxTotalContig) minNumTargets = int(minNumTargets) minSNR = float(minSNR) oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA) id_oRNA = oDC.load() for oRNA in id_oRNA.values(): oRNA.passedFilter = True if len(oRNA.filteredTargets) < minNumTargets: oRNA.passedFilter = False cgOriginRNA.prettyPrint(oRNA, 'numTargets') continue if oRNA.endContigLength > maxEndContig: cgOriginRNA.prettyPrint(oRNA, 'endContig') oRNA.passedFilter = False continue if oRNA.totalContigLength > maxTotalContig: cgOriginRNA.prettyPrint(oRNA, 'totalContig') oRNA.passedFilter = False continue if oRNA.snr < minSNR: cgOriginRNA.prettyPrint(oRNA, 'SNR fail') oRNA.passedFilter = False continue if not keepDuplicates: if oRNA.sequenceDuplicate: cgOriginRNA.prettyPrint(oRNA, 'Duplicate Fail') oRNA.passedFilter = False continue print 'PASSED:', oRNA.id, ','.join(str(x) for x in oRNA.filteredTargets), oRNA.entropy, oRNA.avgNumSimulationTargets, oRNA.snr, oRNA.endContigLength, oRNA.sequence oDC.commit(id_oRNA)