Ejemplo n.º 1
0
def testmerge(masterDir, parDir):
    '''Merge the objects of every run directory into the master directory.

    Expected layout (from the original notes):
        oRNA (master)
        aDir (master)
        pRuns
        --run.00
        ----oRNA (slave: pRuns/run.00/oRNA)
        ----aDir
        --run.01

    masterDir -- directory whose objects are loaded, merged into and
                 committed back
    parDir    -- directory handed to bioLibCG.recursePaths, together with
                 end=<base file name of masterDir>, to find the run
                 ("slave") directories
    '''
    masterController = cgDB.dataController(masterDir, cgAlignment.cgAlignment)
    merged = masterController.load()

    # The slave directories are selected by the base name of the master one.
    suffix = bioLibCG.getBaseFileName(masterDir)

    for runDir in bioLibCG.recursePaths(parDir, end = suffix):
        runController = cgDB.dataController(runDir, cgAlignment.cgAlignment)
        runObjects = runController.load()

        # NOTE(review): the controllers are built for cgAlignment.cgAlignment
        # while the merge is given cgOriginRNA.OriginRNA -- confirm intended.
        merged = cgDB.mergeTwoObjects(merged, runObjects, cgOriginRNA.OriginRNA)

    masterController.commit(merged)
Ejemplo n.º 2
0
def filterTargets(oRNADir, aDir, inTranscript, misLevel, centerLevel, minCenterLevel):
    '''Rebuild oRNA.filteredTargets from oRNA.targets and commit the oRNAs.

    oRNADir        -- directory of the OriginRNA objects
    aDir           -- directory of the cgAlignment objects
    inTranscript   -- True/False (or the strings 'True'/'False'); when true,
                      alignments without transcriptOverlap are dropped
    misLevel       -- index into alignment.mismatchStatus; a true entry
                      drops the alignment
    centerLevel    -- index into alignment.centerExpression
    minCenterLevel -- alignments whose centerExpression[centerLevel] is
                      below this value are dropped

    The numeric arguments may be given as strings; they are converted here.
    '''
    if inTranscript == 'True':
        inTranscript = True
    elif inTranscript == 'False':
        inTranscript = False
    misLevel = int(misLevel)
    centerLevel = int(centerLevel)
    minCenterLevel = float(minCenterLevel)

    originController = cgDB.dataController(oRNADir, cgOriginRNA.OriginRNA)
    id_oRNA = originController.load()

    alignmentController = cgDB.dataController(aDir, cgAlignment.cgAlignment)
    id_alignment = alignmentController.load()

    def survives(alignment):
        # The checks run in the same order as the filters are listed above.
        if inTranscript and not alignment.transcriptOverlap:
            return False
        if alignment.mismatchStatus[misLevel]:
            return False
        if alignment.centerExpression[centerLevel] < minCenterLevel:
            return False
        return True

    for oRNA in id_oRNA.values():
        oRNA.filteredTargets = []
        for aID in oRNA.targets:
            if survives(id_alignment[aID]):
                oRNA.filteredTargets.append(aID)

    originController.commit(id_oRNA)
Ejemplo n.º 3
0
def testmerge(masterDir, parDir):
    '''Merge the objects of every run directory into the master directory.

    Expected layout (from the original notes):
        oRNA (master)
        aDir (master)
        pRuns
        --run.00
        ----oRNA (slave: pRuns/run.00/oRNA)
        ----aDir
        --run.01

    masterDir -- directory whose objects are loaded, merged into and
                 committed back
    parDir    -- directory handed to bioLibCG.recursePaths, together with
                 end=<base file name of masterDir>, to find the run
                 ("slave") directories
    '''
    masterController = cgDB.dataController(masterDir, cgAlignment.cgAlignment)
    merged = masterController.load()

    # The slave directories are selected by the base name of the master one.
    suffix = bioLibCG.getBaseFileName(masterDir)

    for runDir in bioLibCG.recursePaths(parDir, end=suffix):
        runController = cgDB.dataController(runDir, cgAlignment.cgAlignment)
        runObjects = runController.load()

        # NOTE(review): the controllers are built for cgAlignment.cgAlignment
        # while the merge is given cgOriginRNA.OriginRNA -- confirm intended.
        merged = cgDB.mergeTwoObjects(merged, runObjects,
                                      cgOriginRNA.OriginRNA)

    masterController.commit(merged)
def plotPairs(oDir, aDir, cName):

        oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
        id_oRNA = oDC.load()

        aDC = cgDB.dataController(aDir, cgAlignment.cgAlignment)
        id_alignment = aDC.load()

        for oID, oRNA in id_oRNA.items():
                
                if not oRNA.passedFilter:
                        continue

                for aID in oRNA.filteredTargets:

                        alignment = id_alignment[aID]
                        chrom, strand, start, end = bioLibCG.tccSplit(alignment.tTcc)
                        offset = alignment.tStart
                        sLen = alignment.sLength
                        print sLen
                        print oRNA.sequence
                        print oRNA.tcc
                        print alignment.tTcc
                        if strand == '1':
                                start = start - 19 + offset
                                end = start + sLen
                        else:
                                end = end + 19 - offset
                                start = end - sLen

                        print chrom, strand, start, end
                        scanRange = bioLibCG.makeTcc(chrom, strand, start, end)
                        
                        stretch = cgPeaks.stretch(scanRange, cName)
                        sortedKeys = stretch.profile.keys()
                        sortedKeys.sort()

                        if strand == '-1':
                                sortedKeys.reverse()
                        

                        xVals = range(1, sLen + 2)
                        xVals = sortedKeys
                        yVals = [stretch.profile[x] for x in sortedKeys]
                        print xVals, len(xVals)
                        print yVals, len(yVals)
                        
                        plt.plot(xVals, yVals)
                        plt.show()

                        return 0
def plotPairs(oDir, aDir, cName):

    oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
    id_oRNA = oDC.load()

    aDC = cgDB.dataController(aDir, cgAlignment.cgAlignment)
    id_alignment = aDC.load()

    for oID, oRNA in id_oRNA.items():

        if not oRNA.passedFilter:
            continue

        for aID in oRNA.filteredTargets:

            alignment = id_alignment[aID]
            chrom, strand, start, end = bioLibCG.tccSplit(alignment.tTcc)
            offset = alignment.tStart
            sLen = alignment.sLength
            print sLen
            print oRNA.sequence
            print oRNA.tcc
            print alignment.tTcc
            if strand == '1':
                start = start - 19 + offset
                end = start + sLen
            else:
                end = end + 19 - offset
                start = end - sLen

            print chrom, strand, start, end
            scanRange = bioLibCG.makeTcc(chrom, strand, start, end)

            stretch = cgPeaks.stretch(scanRange, cName)
            sortedKeys = stretch.profile.keys()
            sortedKeys.sort()

            if strand == '-1':
                sortedKeys.reverse()

            xVals = range(1, sLen + 2)
            xVals = sortedKeys
            yVals = [stretch.profile[x] for x in sortedKeys]
            print xVals, len(xVals)
            print yVals, len(yVals)

            plt.plot(xVals, yVals)
            plt.show()

            return 0
def markMismatchedPairs(aDir):
    '''Flag, per alignment, whether a mismatch falls inside three windows.

    alignment.mismatchStatus becomes a three-item list of booleans; item k
    is True when any position of window k occurs in
    alignment.mismatchPositions:
        0 -> positions 9..12
        1 -> positions 8..13
        2 -> positions 7..14
    The alignments are then committed back through the data controller.
    '''
    windows = (range(9, 13), range(8, 14), range(7, 15))

    aDC = cgDB.dataController(aDir, cgAlignment.cgAlignment)
    id_alignment = aDC.load()

    for alignment in id_alignment.values():
        alignment.mismatchStatus = [False, False, False]
        for level, window in enumerate(windows):
            alignment.mismatchStatus[level] = any(
                pos in alignment.mismatchPositions for pos in window)

    aDC.commit(id_alignment)
Ejemplo n.º 7
0
def totalSNR(oDir, filterList):
        
        fList = []
        f = open(filterList, 'r')
        for line in f:
                ls = line.strip().split('\t')
                fList.append(int(line.strip()))

        oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
        id_oRNA = oDC.load()

        totalRun = 0 
        totalSim = 0
        simsTotals = []
        n = 0
        for oRNA in id_oRNA.values():
                if oRNA.id in fList:
                        if len(oRNA.filteredTargets) == 0:
                                continue
                        totalRun += len(oRNA.filteredTargets)
                        simsTotal = oRNA.avgNumSimulationTargets * 10
                        simsTotals.append(simsTotal)
                        totalSim += oRNA.avgNumSimulationTargets
                        n +=1
        
        print 'Total Number Targets for my run:', totalRun
        print 'Total Number Targets for Simulations:',  totalSim
        print 'SNR', float(totalRun)/float(totalSim)
        print 'stderr(%s)' % n, stdv(simsTotals)/sqrt(10)
Ejemplo n.º 8
0
def uniqueTargets(oDir):

    oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
    id_oRNA = oDC.load()

    aID_numHit = {}
    uniqueTargets = []
    totalTargets = []
    for oRNA in id_oRNA.values():
        if not oRNA.passedFilter:
            continue
        for aID in oRNA.filteredTargets:
            numHits = aID_numHit.get(aID, 0)
            aID_numHit[aID] = numHits + 1

            if aID not in uniqueTargets:
                uniqueTargets.append(aID)

            totalTargets.append(aID)

    for aID, numHit in aID_numHit.items():
        print aID, numHit

    print len(uniqueTargets)
    print len(totalTargets)
Ejemplo n.º 9
0
def loadAlignments2(aDir, alignmentFN):
    '''Build alignment objects from a space-delimited file and commit them.

    aDir        -- directory the alignments are committed to
    alignmentFN -- path of the alignment file; each line holds, in order:
                   id, sID, tID, sStart, sEnd, tStart, tEnd, sLength,
                   tLength, numMismatches and, optionally, a
                   comma-separated list of mismatch positions

    The id is read from the first column of each line.
    '''
    aDC = cgDB.dataController(aDir, cgAlignment)

    id_alignment = {}
    # The context manager closes the file even if a line fails to parse.
    with open(alignmentFN, 'r') as f:
        for line in f:
            ls = line.strip().split(' ')
            aID = int(ls[0])
            a = cgAlignment(aID)

            a.sID, a.tID = int(ls[1]), int(ls[2])
            a.sStart, a.sEnd = int(ls[3]), int(ls[4])
            a.tStart, a.tEnd = int(ls[5]), int(ls[6])
            a.sLength, a.tLength = int(ls[7]), int(ls[8])
            a.numMismatches = int(ls[9])

            # The positions column may be missing (presumably when there
            # are no mismatches).
            if len(ls) > 10:
                a.mismatchPositions = [int(x) for x in ls[10].split(',')]
            else:
                a.mismatchPositions = []

            id_alignment[aID] = a

    aDC.commit(id_alignment)
Ejemplo n.º 10
0
def updateEndContig(oDir):
    '''Store on each oRNA the longer single-letter run found at its ends.

    oDir -- directory of the OriginRNA objects

    For every oRNA the run of identical letters starting at the first
    position of oRNA.sequence and the run ending at its last position are
    measured; the larger length is written to oRNA.endContigLength and the
    objects are committed.
    '''
    def leadingRun(letters):
        # Length of the run of equal items at the front; 1 when there are
        # fewer than two items.
        count = 1
        for previous, current in zip(letters, letters[1:]):
            if current != previous:
                break
            count += 1
        return count

    oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
    id_oRNA = oDC.load()

    for oRNA in id_oRNA.values():
        seq = oRNA.sequence
        fivePrime = leadingRun(seq)
        threePrime = leadingRun(list(reversed(seq)))
        oRNA.endContigLength = max(fivePrime, threePrime)

    oDC.commit(id_oRNA)
Ejemplo n.º 11
0
def transcriptSetOverlap(aDir, AS):
    '''Mark every oRNA with whether its tcc overlaps a known transcript.

    aDir -- directory of the OriginRNA objects
    AS   -- when true, each tcc is converted with cg.convertToAS before
            the comparison; accepts a bool or the strings 'True'/'False'

    Sets obj.transcriptOverlap to True or False on every object and
    commits them.  The gene set is read from a hard-coded file path.
    '''
    # bool('False') is True, so the string forms are decoded explicitly,
    # the same way the other command-line style entry points do it.
    if AS == 'True':
        AS = True
    elif AS == 'False':
        AS = False
    else:
        AS = bool(AS)

    geneSetFN = '/home/chrisgre/dataSources/known/Human/geneSets/ensemblAllTranscripts.tsv'
    allExons = cgGenes.createGeneSetFromFile(geneSetFN)

    #get degradome TCCS
    #note that you need to test the AS peaks, this is the location of the targetted transcript
    oRNA_DC = cgDB.dataController(aDir, cgOriginRNA.OriginRNA)
    id_oRNA = oRNA_DC.load()
    if AS:
        degTccs = [cg.convertToAS(x.tcc) for x in id_oRNA.values()]
    else:
        degTccs = [x.tcc for x in id_oRNA.values()]

    #find all overlapping exons/transcripts, then all results sequences that overlap exons
    overlappingExons = allExons.transcriptOverlaps(degTccs)
    overlappingExonTccs = [x.tcc for x in overlappingExons]
    overlappingDegTccs = compare.compareTwoTcc(degTccs, overlappingExonTccs, 1)

    #record the verdict on every object
    for obj in id_oRNA.values():
        if AS:
            degTcc = cg.convertToAS(obj.tcc)
        else:
            degTcc = obj.tcc

        obj.transcriptOverlap = degTcc in overlappingDegTccs

    oRNA_DC.commit(id_oRNA)
Ejemplo n.º 12
0
def loadAlignments(aDir, alignmentFN):
    '''Build alignment objects from a space-delimited file and commit them.

    aDir        -- directory the alignments are committed to
    alignmentFN -- path of the alignment file; each line holds, in order:
                   sID, tID, sStart, sEnd, tStart, tEnd, sLength, tLength,
                   numMismatches and, optionally, a comma-separated list
                   of mismatch positions

    Alignment ids are the 0-based line numbers of the file.
    '''
    aDC = cgDB.dataController(aDir, cgAlignment)

    id_alignment = {}
    # The context manager closes the file even if a line fails to parse.
    with open(alignmentFN, 'r') as f:
        for i, line in enumerate(f):
            ls = line.strip().split(' ')

            a = cgAlignment(i)

            a.sID, a.tID = int(ls[0]), int(ls[1])
            a.sStart, a.sEnd = int(ls[2]), int(ls[3])
            a.tStart, a.tEnd = int(ls[4]), int(ls[5])
            a.sLength, a.tLength = int(ls[6]), int(ls[7])
            a.numMismatches = int(ls[8])

            # The positions column may be missing (presumably when there
            # are no mismatches).
            if len(ls) > 9:
                a.mismatchPositions = [int(x) for x in ls[9].split(',')]
            else:
                a.mismatchPositions = []

            id_alignment[i] = a

    aDC.commit(id_alignment)
Ejemplo n.º 13
0
def markMismatchedPairs(aDir):
    '''Flag, per alignment, whether a mismatch falls inside three windows.

    alignment.mismatchStatus becomes a three-item list of booleans; item k
    is True when any position of window k occurs in
    alignment.mismatchPositions:
        0 -> positions 9..12
        1 -> positions 8..13
        2 -> positions 7..14
    The alignments are then committed back through the data controller.
    '''
    windows = (range(9, 13), range(8, 14), range(7, 15))

    aDC = cgDB.dataController(aDir, cgAlignment.cgAlignment)
    id_alignment = aDC.load()

    for alignment in id_alignment.values():
        alignment.mismatchStatus = [False, False, False]
        for level, window in enumerate(windows):
            alignment.mismatchStatus[level] = any(
                pos in alignment.mismatchPositions for pos in window)

    aDC.commit(id_alignment)
Ejemplo n.º 14
0
def uniqueTargets(oDir):

    oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
    id_oRNA = oDC.load()

    aID_numHit = {}
    uniqueTargets = []
    totalTargets = []
    for oRNA in id_oRNA.values():
        if not oRNA.passedFilter:
            continue
        for aID in oRNA.filteredTargets:
            numHits = aID_numHit.get(aID, 0)
            aID_numHit[aID] = numHits + 1

            if aID not in uniqueTargets:
                uniqueTargets.append(aID)

            totalTargets.append(aID)

    for aID, numHit in aID_numHit.items():
        print aID, numHit

    print len(uniqueTargets)
    print len(totalTargets)
Ejemplo n.º 15
0
def updateEndContig(oDir):
    '''Store on each oRNA the longer single-letter run found at its ends.

    oDir -- directory of the OriginRNA objects

    For every oRNA the run of identical letters starting at the first
    position of oRNA.sequence and the run ending at its last position are
    measured; the larger length is written to oRNA.endContigLength and the
    objects are committed.
    '''
    def leadingRun(letters):
        # Length of the run of equal items at the front; 1 when there are
        # fewer than two items.
        count = 1
        for previous, current in zip(letters, letters[1:]):
            if current != previous:
                break
            count += 1
        return count

    oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
    id_oRNA = oDC.load()

    for oRNA in id_oRNA.values():
        seq = oRNA.sequence
        fivePrime = leadingRun(seq)
        threePrime = leadingRun(list(reversed(seq)))
        oRNA.endContigLength = max(fivePrime, threePrime)

    oDC.commit(id_oRNA)
Ejemplo n.º 16
0
def loadAlignments(aDir, alignmentFN):
    '''Build alignment objects from a space-delimited file and commit them.

    aDir        -- directory the alignments are committed to
    alignmentFN -- path of the alignment file; each line holds, in order:
                   sID, tID, sStart, sEnd, tStart, tEnd, sLength, tLength,
                   numMismatches and, optionally, a comma-separated list
                   of mismatch positions

    Alignment ids are the 0-based line numbers of the file.
    '''
    aDC = cgDB.dataController(aDir, cgAlignment)

    id_alignment = {}
    # The context manager closes the file even if a line fails to parse.
    with open(alignmentFN, 'r') as f:
        for i, line in enumerate(f):
            ls = line.strip().split(' ')

            a = cgAlignment(i)

            a.sID, a.tID = int(ls[0]), int(ls[1])
            a.sStart, a.sEnd = int(ls[2]), int(ls[3])
            a.tStart, a.tEnd = int(ls[4]), int(ls[5])
            a.sLength, a.tLength = int(ls[6]), int(ls[7])
            a.numMismatches = int(ls[8])

            # The positions column may be missing (presumably when there
            # are no mismatches).
            if len(ls) > 9:
                a.mismatchPositions = [int(x) for x in ls[9].split(',')]
            else:
                a.mismatchPositions = []

            id_alignment[i] = a

    aDC.commit(id_alignment)
Ejemplo n.º 17
0
def loadAlignments2(aDir, alignmentFN):
    '''Build alignment objects from a space-delimited file and commit them.

    aDir        -- directory the alignments are committed to
    alignmentFN -- path of the alignment file; each line holds, in order:
                   id, sID, tID, sStart, sEnd, tStart, tEnd, sLength,
                   tLength, numMismatches and, optionally, a
                   comma-separated list of mismatch positions

    The id is read from the first column of each line.
    '''
    aDC = cgDB.dataController(aDir, cgAlignment)

    id_alignment = {}
    # The context manager closes the file even if a line fails to parse.
    with open(alignmentFN, 'r') as f:
        for line in f:
            ls = line.strip().split(' ')
            aID = int(ls[0])
            a = cgAlignment(aID)

            a.sID, a.tID = int(ls[1]), int(ls[2])
            a.sStart, a.sEnd = int(ls[3]), int(ls[4])
            a.tStart, a.tEnd = int(ls[5]), int(ls[6])
            a.sLength, a.tLength = int(ls[7]), int(ls[8])
            a.numMismatches = int(ls[9])

            # The positions column may be missing (presumably when there
            # are no mismatches).
            if len(ls) > 10:
                a.mismatchPositions = [int(x) for x in ls[10].split(',')]
            else:
                a.mismatchPositions = []

            id_alignment[aID] = a

    aDC.commit(id_alignment)
Ejemplo n.º 18
0
def transcriptSetOverlap(aDir, AS):
    '''Mark every oRNA with whether its tcc overlaps a known transcript.

    aDir -- directory of the OriginRNA objects
    AS   -- when true, each tcc is converted with cg.convertToAS before
            the comparison; accepts a bool or the strings 'True'/'False'

    Sets obj.transcriptOverlap to True or False on every object and
    commits them.  The gene set is read from a hard-coded file path.
    '''
    # bool('False') is True, so the string forms are decoded explicitly,
    # the same way the other command-line style entry points do it.
    if AS == 'True':
        AS = True
    elif AS == 'False':
        AS = False
    else:
        AS = bool(AS)

    geneSetFN = '/home/chrisgre/dataSources/known/Human/geneSets/ensemblAllTranscripts.tsv'
    allExons = cgGenes.createGeneSetFromFile(geneSetFN)

    #get degradome TCCS
    #note that you need to test the AS peaks, this is the location of the targetted transcript
    oRNA_DC = cgDB.dataController(aDir, cgOriginRNA.OriginRNA)
    id_oRNA = oRNA_DC.load()
    if AS:
        degTccs = [cg.convertToAS(x.tcc) for x in id_oRNA.values()]
    else:
        degTccs = [x.tcc for x in id_oRNA.values()]

    #find all overlapping exons/transcripts, then all results sequences that overlap exons
    overlappingExons = allExons.transcriptOverlaps(degTccs)
    overlappingExonTccs = [x.tcc for x in overlappingExons]
    overlappingDegTccs = compare.compareTwoTcc(degTccs, overlappingExonTccs, 1)

    #record the verdict on every object
    for obj in id_oRNA.values():
        if AS:
            degTcc = cg.convertToAS(obj.tcc)
        else:
            degTcc = obj.tcc

        obj.transcriptOverlap = degTcc in overlappingDegTccs

    oRNA_DC.commit(id_oRNA)
Ejemplo n.º 19
0
def probeORNA(oDir):
    '''Pretty-print every oRNA in oDir whose passedFilter flag is set.'''
    id_oRNA = cgDB.dataController(oDir, cgOriginRNA.OriginRNA).load()

    for oRNA in id_oRNA.values():
        if not oRNA.passedFilter:
            continue
        cgOriginRNA.prettyPrint(oRNA)
Ejemplo n.º 20
0
def probeORNA(oDir):
    '''Pretty-print every oRNA in oDir whose passedFilter flag is set.'''
    id_oRNA = cgDB.dataController(oDir, cgOriginRNA.OriginRNA).load()

    for oRNA in id_oRNA.values():
        if not oRNA.passedFilter:
            continue
        cgOriginRNA.prettyPrint(oRNA)
Ejemplo n.º 21
0
def transcriptSetOverlapTargets(aDir):
    '''Mark every alignment with whether its target site overlaps a transcript.

    aDir -- directory of the cgAlignment objects

    For each alignment a site tcc is derived from alignment.tTcc,
    alignment.tStart and alignment.sLength, converted with cg.convertToAS
    and compared against the transcripts of a hard-coded gene set file.
    alignment.transcriptOverlap is set to True or False accordingly and
    the alignments are committed.
    '''
    def targetSiteTcc(alignment):
        # Coordinates of the site inside the target region of `alignment`.
        chrom, strand, start, end = cg.tccSplit(alignment.tTcc)
        offset = alignment.tStart
        sLen = alignment.sLength

        # NOTE(review): 19 is a hard-coded shift whose meaning is not
        # visible in this function -- confirm.
        if strand == '1':
            start = start - 19 + offset
            end = start + sLen
        else:
            end = end + 19 - offset
            start = end - sLen

        return cg.makeTcc(chrom, strand, start, end)

    geneSetFN = '/home/chrisgre/dataSources/known/Human/geneSets/ensemblAllTranscripts.tsv'
    allExons = cgGenes.createGeneSetFromFile(geneSetFN)

    #get degradome TCCS
    #note that you need to test the AS peaks, this is the location of the targetted transcript

    aDC = cgDB.dataController(aDir, cgAlignment.cgAlignment)
    id_alignment = aDC.load()

    #create list of unique tccs.
    uniqTccs = []
    for alignment in id_alignment.values():
        tcc = targetSiteTcc(alignment)
        if tcc not in uniqTccs:
            uniqTccs.append(tcc)

    degTccs = [cg.convertToAS(x) for x in uniqTccs]

    #find all overlapping exons/transcripts, then all results sequences that overlap exons
    overlappingExons = allExons.transcriptOverlaps(degTccs)
    overlappingExonTccs = [x.tcc for x in overlappingExons]
    overlappingDegTccs = compare.compareTwoTcc(degTccs, overlappingExonTccs, 1)

    #update
    for obj in id_alignment.values():
        # The site must come from `obj` itself; reading the loop variable
        # left over from the loop above gave every object the same verdict.
        degTcc = cg.convertToAS(targetSiteTcc(obj))
        obj.transcriptOverlap = degTcc in overlappingDegTccs

    aDC.commit(id_alignment)
Ejemplo n.º 22
0
def test(oDir):

    oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
    id_oRNA = oDC.load()

    print id_oRNA[1].targets
    id_oRNA[1].targets.append(13)
    print id_oRNA[1].targets
    print id_oRNA[2].targets
Ejemplo n.º 23
0
def test(oDir):
        
        oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
        id_oRNA = oDC.load()

        print id_oRNA[1].targets
        id_oRNA[1].targets.append(13)
        print id_oRNA[1].targets
        print id_oRNA[2].targets
Ejemplo n.º 24
0
def probeMicro(oDir):
        
        oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
        id_oRNA = oDC.load()

        for oRNA in id_oRNA.values():

                if oRNA.passedFilter:
                        print oRNA.id, oRNA.sequence, oRNA.tcc, oRNA.tccs
Ejemplo n.º 25
0
def probeMicro(oDir):

    oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
    id_oRNA = oDC.load()

    for oRNA in id_oRNA.values():

        if oRNA.passedFilter:
            print oRNA.id, oRNA.sequence, oRNA.tcc, oRNA.tccs
Ejemplo n.º 26
0
def transcriptSetOverlapTargets(aDir):
    '''Mark every alignment with whether its target site overlaps a transcript.

    aDir -- directory of the cgAlignment objects

    For each alignment a site tcc is derived from alignment.tTcc,
    alignment.tStart and alignment.sLength, converted with cg.convertToAS
    and compared against the transcripts of a hard-coded gene set file.
    alignment.transcriptOverlap is set to True or False accordingly and
    the alignments are committed.
    '''
    def targetSiteTcc(alignment):
        # Coordinates of the site inside the target region of `alignment`.
        chrom, strand, start, end = cg.tccSplit(alignment.tTcc)
        offset = alignment.tStart
        sLen = alignment.sLength

        # NOTE(review): 19 is a hard-coded shift whose meaning is not
        # visible in this function -- confirm.
        if strand == '1':
            start = start - 19 + offset
            end = start + sLen
        else:
            end = end + 19 - offset
            start = end - sLen

        return cg.makeTcc(chrom, strand, start, end)

    geneSetFN = '/home/chrisgre/dataSources/known/Human/geneSets/ensemblAllTranscripts.tsv'
    allExons = cgGenes.createGeneSetFromFile(geneSetFN)

    #get degradome TCCS
    #note that you need to test the AS peaks, this is the location of the targetted transcript

    aDC = cgDB.dataController(aDir, cgAlignment.cgAlignment)
    id_alignment = aDC.load()

    #create list of unique tccs.
    uniqTccs = []
    for alignment in id_alignment.values():
        tcc = targetSiteTcc(alignment)
        if tcc not in uniqTccs:
            uniqTccs.append(tcc)

    degTccs = [cg.convertToAS(x) for x in uniqTccs]

    #find all overlapping exons/transcripts, then all results sequences that overlap exons
    overlappingExons = allExons.transcriptOverlaps(degTccs)
    overlappingExonTccs = [x.tcc for x in overlappingExons]
    overlappingDegTccs = compare.compareTwoTcc(degTccs, overlappingExonTccs, 1)

    #update
    for obj in id_alignment.values():
        # The site must come from `obj` itself; reading the loop variable
        # left over from the loop above gave every object the same verdict.
        degTcc = cg.convertToAS(targetSiteTcc(obj))
        obj.transcriptOverlap = degTcc in overlappingDegTccs

    aDC.commit(id_alignment)
Ejemplo n.º 27
0
def updateEntropy(aDir):
    '''Set entropy on every oRNA in aDir from its sequence and commit.

    The value is whatever getEntropy returns for obj.sequence.
    '''
    controller = cgDB.dataController(aDir, cgOriginRNA.OriginRNA)
    id_oRNA = controller.load()

    for oRNA in id_oRNA.values():
        oRNA.entropy = getEntropy(oRNA.sequence)

    controller.commit(id_oRNA)
Ejemplo n.º 28
0
def pPrint(aDir):

        aDC = cgDB.dataController(aDir, cgAlignment)
        id_alignment = aDC.load()
        
        for alignment in id_alignment.values():
                attName_att = alignment.__dict__
                attVals = [attName_att['id'], attName_att['tID'], attName_att['tTcc']]
                attVals = [str(x) for x in attVals]
                print '\t'.join(attVals)
Ejemplo n.º 29
0
def pPrint(aDir):

    aDC = cgDB.dataController(aDir, cgAlignment)
    id_alignment = aDC.load()

    for alignment in id_alignment.values():
        attName_att = alignment.__dict__
        attVals = [attName_att['id'], attName_att['tID'], attName_att['tTcc']]
        attVals = [str(x) for x in attVals]
        print '\t'.join(attVals)
Ejemplo n.º 30
0
def updateEntropy(aDir):
    '''Set entropy on every oRNA in aDir from its sequence and commit.

    The value is whatever getEntropy returns for obj.sequence.
    '''
    # Indentation uses spaces only; the tab/space mix it replaces is
    # rejected by Python 3 and depends on tab width in Python 2.
    oRNA_DC = cgDB.dataController(aDir, cgOriginRNA.OriginRNA)
    id_oRNA = oRNA_DC.load()

    for obj in id_oRNA.values():
        obj.entropy = getEntropy(obj.sequence)

    oRNA_DC.commit(id_oRNA)
Ejemplo n.º 31
0
def probeAlignments(aDir):

    probePairs = [[6, 35934]]

    aDC = cgDB.dataController(aDir, cgAlignment.cgAlignment)
    id_alignment = aDC.load()

    for alignment in id_alignment.values():
        for sID, tID in probePairs:

            if alignment.sID == sID and alignment.tID == tID:
                print alignment.id, alignment.sID, alignment.tID, alignment.centerExpression, alignment.mismatchStatus, alignment.numMismatches, alignment.transcriptOverlap
Ejemplo n.º 32
0
def updateTargetIDs(oDir, aDir):
       
        #load the data 
        aDC = cgDB.dataController(aDir, cgAlignment.cgAlignment)
        id_alignment = aDC.load()

        oRNA_DC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
        id_oRNA = oRNA_DC.load()

        #clear targets that are there.
        for oRNA in id_oRNA.values():
                oRNA.targets = []

        #update the targets for oRNAs
        for alignment in id_alignment.values():
                try:
                        id_oRNA[alignment.sID].targets.append(alignment.id)
                except KeyError:
                        print 'oRNA key missing', alignment.sID, alignment.id
        #save
        oRNA_DC.commit(id_oRNA)
Ejemplo n.º 33
0
def probeAlignments(aDir):
        
        probePairs = [[6, 35934]]

        aDC = cgDB.dataController(aDir, cgAlignment.cgAlignment)
        id_alignment = aDC.load()

        for alignment in id_alignment.values():
                for sID, tID in probePairs:

                        if alignment.sID == sID and alignment.tID == tID:
                                print alignment.id, alignment.sID, alignment.tID, alignment.centerExpression, alignment.mismatchStatus, alignment.numMismatches, alignment.transcriptOverlap
Ejemplo n.º 34
0
def updateTargetIDs(oDir, aDir):

    #load the data
    aDC = cgDB.dataController(aDir, cgAlignment.cgAlignment)
    id_alignment = aDC.load()

    oRNA_DC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
    id_oRNA = oRNA_DC.load()

    #clear targets that are there.
    for oRNA in id_oRNA.values():
        oRNA.targets = []

    #update the targets for oRNAs
    for alignment in id_alignment.values():
        try:
            id_oRNA[alignment.sID].targets.append(alignment.id)
        except KeyError:
            print 'oRNA key missing', alignment.sID, alignment.id
    #save
    oRNA_DC.commit(id_oRNA)
Ejemplo n.º 35
0
def updateID(aDir, peakFN):
    '''Initialize the database with one fresh OriginRNA per line of peakFN.

    aDir   -- directory the objects are committed to
    peakFN -- path of the peak file; only its number of lines is used

    Ids are the 0-based line numbers.
    '''
    oRNA_DC = cgDB.dataController(aDir, cgOriginRNA.OriginRNA)
    id_oRNA = {}

    # The context manager guarantees the file is closed.
    with open(peakFN, 'r') as f:
        for i, _ in enumerate(f):
            id_oRNA[i] = cgOriginRNA.OriginRNA(i)

    oRNA_DC.commit(id_oRNA)
Ejemplo n.º 36
0
def updateID(aDir, peakFN):
    '''Initialize the database with one fresh OriginRNA per line of peakFN.

    aDir   -- directory the objects are committed to
    peakFN -- path of the peak file; only its number of lines is used

    Ids are the 0-based line numbers.
    '''
    oRNA_DC = cgDB.dataController(aDir, cgOriginRNA.OriginRNA)
    id_oRNA = {}

    # The context manager guarantees the file is closed.
    with open(peakFN, 'r') as f:
        for i, _ in enumerate(f):
            id_oRNA[i] = cgOriginRNA.OriginRNA(i)

    oRNA_DC.commit(id_oRNA)
Ejemplo n.º 37
0
def updateIDFromQuery(aDir, queryFN):
    '''Initialize the database with one fresh OriginRNA per line of queryFN.

    aDir    -- directory the objects are committed to
    queryFN -- path of a tab-separated file whose first column is the
               integer id of the object to create
    '''
    oRNA_DC = cgDB.dataController(aDir, cgOriginRNA.OriginRNA)
    id_oRNA = {}

    # The context manager guarantees the file is closed.
    with open(queryFN, 'r') as f:
        for line in f:
            oID = int(line.strip().split('\t')[0])
            id_oRNA[oID] = cgOriginRNA.OriginRNA(oID)

    oRNA_DC.commit(id_oRNA)
Ejemplo n.º 38
0
def updateIDFromQuery(aDir, queryFN):
    '''Initialize the database with one fresh OriginRNA per line of queryFN.

    aDir    -- directory the objects are committed to
    queryFN -- path of a tab-separated file whose first column is the
               integer id of the object to create
    '''
    oRNA_DC = cgDB.dataController(aDir, cgOriginRNA.OriginRNA)
    id_oRNA = {}

    # The context manager guarantees the file is closed.
    with open(queryFN, 'r') as f:
        for line in f:
            oID = int(line.strip().split('\t')[0])
            id_oRNA[oID] = cgOriginRNA.OriginRNA(oID)

    oRNA_DC.commit(id_oRNA)
Ejemplo n.º 39
0
def updateSmallExpression(aDir, cName):
    '''Store on every oRNA the highest level found across its tcc.

    aDir  -- directory of the OriginRNA objects
    cName -- passed to cgPeaks.stretch together with each oRNA's tcc

    oRNA.eLevel receives stretch.getHighestLevel() and the objects are
    committed.
    '''
    # Indentation uses spaces only; the tab/space mix it replaces is
    # rejected by Python 3 and depends on tab width in Python 2.
    oRNA_DC = cgDB.dataController(aDir, cgOriginRNA.OriginRNA)
    id_oRNA = oRNA_DC.load()

    for oRNA in id_oRNA.values():
        stretch = cgPeaks.stretch(oRNA.tcc, cName) #this stretch contains values for small library...
        oRNA.eLevel = stretch.getHighestLevel()

    oRNA_DC.commit(id_oRNA)
Ejemplo n.º 40
0
def filterORNA(oDir,
               maxEndContig,
               maxTotalContig,
               minSNR,
               minNumTargets,
               keepDuplicates=False):
    if keepDuplicates == 'True': keepDuplicates = True
    if keepDuplicates == 'False': keepDuplicates = False

    maxEndContig, maxTotalContig = int(maxEndContig), int(maxTotalContig)
    minNumTargets = int(minNumTargets)
    minSNR = float(minSNR)

    oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
    id_oRNA = oDC.load()

    for oRNA in id_oRNA.values():

        oRNA.passedFilter = True

        if len(oRNA.filteredTargets) < minNumTargets:
            oRNA.passedFilter = False
            cgOriginRNA.prettyPrint(oRNA, 'numTargets')
            continue

        if oRNA.endContigLength > maxEndContig:
            cgOriginRNA.prettyPrint(oRNA, 'endContig')
            oRNA.passedFilter = False
            continue

        if oRNA.totalContigLength > maxTotalContig:
            cgOriginRNA.prettyPrint(oRNA, 'totalContig')
            oRNA.passedFilter = False
            continue

        if oRNA.snr < minSNR:
            cgOriginRNA.prettyPrint(oRNA, 'SNR fail')
            oRNA.passedFilter = False
            continue

        if not keepDuplicates:
            if oRNA.sequenceDuplicate:
                cgOriginRNA.prettyPrint(oRNA, 'Duplicate Fail')
                oRNA.passedFilter = False
                continue

        print 'PASSED:', oRNA.id, ','.join(
            str(x) for x in oRNA.filteredTargets
        ), oRNA.entropy, oRNA.avgNumSimulationTargets, oRNA.snr, oRNA.endContigLength, oRNA.sequence

    oDC.commit(id_oRNA)
Ejemplo n.º 41
0
def updateSmallExpression(aDir, cName):
    """Store the highest level of each origin RNA's stretch in ``eLevel``.

    For every record loaded from ``aDir`` a ``cgPeaks.stretch`` is built from
    the record's ``tcc`` and ``cName``; the value returned by its
    ``getHighestLevel()`` is written to ``eLevel``.  All records are then
    committed back through the same data controller.
    """
    controller = cgDB.dataController(aDir, cgOriginRNA.OriginRNA)
    records = controller.load()

    # Only the records themselves are needed; their ids are not used here.
    for rna in records.values():
        # Per the original author's note, this stretch contains values for
        # the small library.
        rna.eLevel = cgPeaks.stretch(rna.tcc, cName).getHighestLevel()

    controller.commit(records)
Ejemplo n.º 42
0
def updateSequence(aDir, seqFN):
    """Assign sequences read from ``seqFN`` to the origin RNAs in ``aDir``.

    Line ``i`` of the file (counting from 0) supplies the sequence for the
    record stored under key ``i``: the line is stripped and the text before
    the first tab is used.  The updated records are committed afterwards.

    Args:
        aDir: location handed to ``cgDB.dataController``.
        seqFN: path of the tab-separated text file to read.
    """
    oRNA_DC = cgDB.dataController(aDir, cgOriginRNA.OriginRNA)
    id_oRNA = oRNA_DC.load()

    # The context manager closes the file even if a lookup below fails.
    with open(seqFN, 'r') as seqFile:
        for i, line in enumerate(seqFile):
            id_oRNA[i].sequence = line.strip().split('\t')[0]

    oRNA_DC.commit(id_oRNA)
Ejemplo n.º 43
0
def updateTcc(aDir, tccFN):
    """Assign ``tcc`` values read from ``tccFN`` to the origin RNAs in ``aDir``.

    Line ``i`` of the file (counting from 0) supplies the ``tcc`` for the
    record stored under key ``i``: the line is stripped and the text before
    the first tab is used.  The updated records are committed afterwards.

    Args:
        aDir: location handed to ``cgDB.dataController``.
        tccFN: path of the tab-separated text file to read.
    """
    oRNA_DC = cgDB.dataController(aDir, cgOriginRNA.OriginRNA)
    id_oRNA = oRNA_DC.load()

    # The context manager closes the file even if a lookup below fails.
    with open(tccFN, 'r') as tccFile:
        for i, line in enumerate(tccFile):
            id_oRNA[i].tcc = line.strip().split('\t')[0]

    oRNA_DC.commit(id_oRNA)
Ejemplo n.º 44
0
def updateSequence(aDir, seqFN):
    """Assign sequences read from ``seqFN`` to the origin RNAs in ``aDir``.

    Line ``i`` of the file (counting from 0) supplies the sequence for the
    record stored under key ``i``: the line is stripped and the text before
    the first tab is used.  The updated records are committed afterwards.

    Args:
        aDir: location handed to ``cgDB.dataController``.
        seqFN: path of the tab-separated text file to read.
    """
    oRNA_DC = cgDB.dataController(aDir, cgOriginRNA.OriginRNA)
    id_oRNA = oRNA_DC.load()

    # The context manager closes the file even if a lookup below fails.
    with open(seqFN, 'r') as seqFile:
        for i, line in enumerate(seqFile):
            id_oRNA[i].sequence = line.strip().split('\t')[0]

    oRNA_DC.commit(id_oRNA)
Ejemplo n.º 45
0
def updateTcc(aDir, tccFN):
    """Assign ``tcc`` values read from ``tccFN`` to the origin RNAs in ``aDir``.

    Line ``i`` of the file (counting from 0) supplies the ``tcc`` for the
    record stored under key ``i``: the line is stripped and the text before
    the first tab is used.  The updated records are committed afterwards.

    Args:
        aDir: location handed to ``cgDB.dataController``.
        tccFN: path of the tab-separated text file to read.
    """
    oRNA_DC = cgDB.dataController(aDir, cgOriginRNA.OriginRNA)
    id_oRNA = oRNA_DC.load()

    # The context manager closes the file even if a lookup below fails.
    with open(tccFN, 'r') as tccFile:
        for i, line in enumerate(tccFile):
            id_oRNA[i].tcc = line.strip().split('\t')[0]

    oRNA_DC.commit(id_oRNA)
Ejemplo n.º 46
0
def filterTargets(oRNADir, aDir, inTranscript, misLevel, centerLevel,
                  minCenterLevel):
    """Rebuild ``filteredTargets`` for every origin RNA from its ``targets``.

    An alignment id taken from ``targets`` is dropped when any of these
    holds, tested in this order:

    * ``inTranscript`` is truthy and the alignment's ``transcriptOverlap``
      is falsy
    * the alignment's ``mismatchStatus[misLevel]`` is truthy
    * the alignment's ``centerExpression[centerLevel]`` is below
      ``minCenterLevel``

    Only the origin RNA records are committed; alignments are read but not
    modified.

    Args:
        oRNADir: origin RNA location handed to ``cgDB.dataController``.
        aDir: alignment location handed to ``cgDB.dataController``.
        inTranscript: a bool, or the strings ``'True'`` / ``'False'``.
        misLevel: index into ``mismatchStatus``; converted with ``int()``.
        centerLevel: index into ``centerExpression``; converted with
            ``int()``.
        minCenterLevel: threshold; converted with ``float()``.
    """
    if inTranscript == 'True':
        inTranscript = True
    elif inTranscript == 'False':
        inTranscript = False

    misLevel = int(misLevel)
    centerLevel = int(centerLevel)
    minCenterLevel = float(minCenterLevel)

    rnaController = cgDB.dataController(oRNADir, cgOriginRNA.OriginRNA)
    rnaByID = rnaController.load()

    alignmentController = cgDB.dataController(aDir, cgAlignment.cgAlignment)
    alignmentByID = alignmentController.load()

    for rna in rnaByID.values():
        kept = []
        rna.filteredTargets = kept

        for targetID in rna.targets:
            hit = alignmentByID[targetID]

            # Transcript overlap is only required when requested.
            if inTranscript and not hit.transcriptOverlap:
                continue

            # Mismatch flag at the requested level rejects the target.
            if hit.mismatchStatus[misLevel]:
                continue

            # Center expression at the requested level must reach the minimum.
            if hit.centerExpression[centerLevel] < minCenterLevel:
                continue

            kept.append(targetID)

    rnaController.commit(rnaByID)
def getSeqs(oDir):
    """Print the id of every origin RNA in ``oDir`` that survives three checks.

    A record is skipped when its ``sequenceDuplicate`` is truthy, or when its
    ``totalContigLength`` or ``endContigLength`` exceeds 6.  The key of each
    remaining record is printed on its own line.  Nothing is committed.
    """
    maxContigLength = 6

    oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
    id_oRNA = oDC.load()

    for oID, oRNA in id_oRNA.items():
        if (oRNA.sequenceDuplicate
                or oRNA.totalContigLength > maxContigLength
                or oRNA.endContigLength > maxContigLength):
            continue

        # Wrapping the key in a 1-tuple keeps %-formatting safe for any key
        # type; the parenthesized single argument prints the same on
        # Python 2 and Python 3.
        print('%s' % (oID,))
Ejemplo n.º 48
0
def updateAvgNumTargets(oDir,
                        simDirTemplate='/home/chrisgre/scripts/simulations/simsk50Filtered/simulation.%s/oRNA',
                        numSimulations=10):
    """Store the average simulated target count on every origin RNA in ``oDir``.

    For each simulation index ``0 .. numSimulations - 1`` the records at
    ``simDirTemplate % index`` are loaded and ``len(filteredTargets)`` is
    accumulated per record key.  Each record in ``oDir`` then receives
    ``avgNumSimulationTargets`` = accumulated total for its ``id`` (0 when
    the id never appeared) divided by ``numSimulations``, and the records
    are committed.

    Args:
        oDir: location of the records to update.
        simDirTemplate: ``%``-format string with one slot for the simulation
            index; defaults to the previously hard-coded path.
        numSimulations: number of simulation runs to read; converted with
            ``int()``.  Defaults to the previously hard-coded 10.

    Raises:
        ValueError: if ``numSimulations`` is smaller than 1.
    """
    numSimulations = int(numSimulations)
    if numSimulations < 1:
        raise ValueError('numSimulations must be at least 1, got %s' % numSimulations)

    oID_numTargets = {}

    for i in range(numSimulations):
        # Progress output; single-argument form works on Python 2 and 3.
        print(i)
        simDC = cgDB.dataController(simDirTemplate % i, cgOriginRNA.OriginRNA)

        for simID, sRNA in simDC.load().items():
            oID_numTargets[simID] = oID_numTargets.get(simID, 0) + len(sRNA.filteredTargets)

    # now save it
    oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
    id_oRNA = oDC.load()

    for oRNA in id_oRNA.values():
        totalNum = oID_numTargets.get(oRNA.id, 0)
        oRNA.avgNumSimulationTargets = float(totalNum) / float(numSimulations)

    oDC.commit(id_oRNA)
Ejemplo n.º 49
0
def quickScript(oDir):
    """Print the key of each origin RNA whose contig lengths exceed 7.

    A record is listed when its ``endContigLength`` or its
    ``totalContigLength`` is greater than 7.  All records are examined
    first; the collected keys are printed afterwards, one per line.
    Nothing is committed.
    """
    oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
    id_oRNA = oDC.load()

    failedList = [
        oID for oID, oRNA in id_oRNA.items()
        if oRNA.endContigLength > 7 or oRNA.totalContigLength > 7
    ]

    for oID in failedList:
        # Single-argument form prints the same on Python 2 and Python 3.
        print(oID)
Ejemplo n.º 50
0
def getSeqs(oDir):
    """Print the id of every origin RNA in ``oDir`` that survives three checks.

    A record is skipped when its ``sequenceDuplicate`` is truthy, or when its
    ``totalContigLength`` or ``endContigLength`` exceeds 6.  The key of each
    remaining record is printed on its own line.  Nothing is committed.
    """
    maxContigLength = 6

    oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
    id_oRNA = oDC.load()

    for oID, oRNA in id_oRNA.items():
        if (oRNA.sequenceDuplicate
                or oRNA.totalContigLength > maxContigLength
                or oRNA.endContigLength > maxContigLength):
            continue

        # Wrapping the key in a 1-tuple keeps %-formatting safe for any key
        # type; the parenthesized single argument prints the same on
        # Python 2 and Python 3.
        print('%s' % (oID,))
Ejemplo n.º 51
0
def updateSNR(oDir):
    """Recompute ``snr`` for every origin RNA stored in ``oDir``.

    ``snr`` is ``len(filteredTargets)`` divided by
    ``avgNumSimulationTargets``; an average of exactly 0 is replaced by .01
    so the division stays defined.  The records are committed afterwards.
    """
    controller = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
    records = controller.load()

    for rna in records.values():
        observed = len(rna.filteredTargets)
        simulated = rna.avgNumSimulationTargets

        # Substitute a small stand-in for a zero average to avoid dividing
        # by zero.
        rna.snr = float(observed) / (.01 if simulated == 0 else simulated)

    controller.commit(records)
Ejemplo n.º 52
0
def updateAvgNumTargets(oDir,
                        simDirTemplate='/home/chrisgre/scripts/simulations/simsk50Filtered/simulation.%s/oRNA',
                        numSimulations=10):
    """Store the average simulated target count on every origin RNA in ``oDir``.

    For each simulation index ``0 .. numSimulations - 1`` the records at
    ``simDirTemplate % index`` are loaded and ``len(filteredTargets)`` is
    accumulated per record key.  Each record in ``oDir`` then receives
    ``avgNumSimulationTargets`` = accumulated total for its ``id`` (0 when
    the id never appeared) divided by ``numSimulations``, and the records
    are committed.

    Args:
        oDir: location of the records to update.
        simDirTemplate: ``%``-format string with one slot for the simulation
            index; defaults to the previously hard-coded path.
        numSimulations: number of simulation runs to read; converted with
            ``int()``.  Defaults to the previously hard-coded 10.

    Raises:
        ValueError: if ``numSimulations`` is smaller than 1.
    """
    numSimulations = int(numSimulations)
    if numSimulations < 1:
        raise ValueError('numSimulations must be at least 1, got %s' % numSimulations)

    oID_numTargets = {}

    for i in range(numSimulations):
        # Progress output; single-argument form works on Python 2 and 3.
        print(i)
        simDC = cgDB.dataController(simDirTemplate % i, cgOriginRNA.OriginRNA)

        for simID, sRNA in simDC.load().items():
            oID_numTargets[simID] = oID_numTargets.get(simID, 0) + len(sRNA.filteredTargets)

    # now save it
    oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
    id_oRNA = oDC.load()

    for oRNA in id_oRNA.values():
        totalNum = oID_numTargets.get(oRNA.id, 0)
        oRNA.avgNumSimulationTargets = float(totalNum) / float(numSimulations)

    oDC.commit(id_oRNA)
Ejemplo n.º 53
0
def updateSNR(oDir):
    """Recompute ``snr`` for every origin RNA stored in ``oDir``.

    ``snr`` is ``len(filteredTargets)`` divided by
    ``avgNumSimulationTargets``; an average of exactly 0 is replaced by .01
    so the division stays defined.  The records are committed afterwards.
    """
    controller = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
    records = controller.load()

    for rna in records.values():
        observed = len(rna.filteredTargets)
        simulated = rna.avgNumSimulationTargets

        # Substitute a small stand-in for a zero average to avoid dividing
        # by zero.
        rna.snr = float(observed) / (.01 if simulated == 0 else simulated)

    controller.commit(records)
def markCenterExpression(aDir, cName):
    """Fill ``centerExpression`` on every alignment stored in ``aDir``.

    For each alignment a scan range of ``sLength`` positions is derived from
    ``tTcc`` and ``tStart``, and a ``cgPeaks.stretch`` is built over it with
    ``cName``.  ``centerExpression`` is reset to three zeros; when the
    stretch's ``getSumOfLevels()`` is non-zero, slot ``i`` receives the sum
    of profile values at sorted-key positions 8-11, 7-12 and 6-13
    respectively, divided by that total.  Keys are walked in descending
    order for strand ``'-1'``.  The alignments are committed afterwards.
    """
    controller = cgDB.dataController(aDir, cgAlignment.cgAlignment)
    alignments = controller.load()

    # Slice bounds into the ordered profile keys, narrowest window first;
    # the window at index i feeds centerExpression[i].
    windows = ((8, 12), (7, 13), (6, 14))

    for aln in alignments.values():
        aln.centerExpression = [0.0, 0.0, 0.0]
        chrom, strand, start, end = bioLibCG.tccSplit(aln.tTcc)
        offset = aln.tStart
        sLen = aln.sLength

        # Anchor the scan range at the start for strand '1', at the end
        # otherwise.
        if strand == '1':
            start = start - 19 + offset
            end = start + sLen
        else:
            end = end + 19 - offset
            start = end - sLen

        region = cgPeaks.stretch(bioLibCG.makeTcc(chrom, strand, start, end), cName)
        total = region.getSumOfLevels()

        positions = sorted(region.profile.keys())
        if strand == '-1':
            positions = positions[::-1]

        # With no expression at all the zeros set above are left in place.
        if total == 0:
            continue

        for slot, (lo, hi) in enumerate(windows):
            # Manual float accumulator: keeps float division and the same
            # left-to-right addition order as before.
            windowSum = 0.0
            for pos in positions[lo:hi]:
                windowSum += region.profile[pos]
            aln.centerExpression[slot] = windowSum / total

    controller.commit(alignments)
Ejemplo n.º 55
0
def markCenterExpression(aDir, cName):
    """Fill ``centerExpression`` on every alignment stored in ``aDir``.

    For each alignment a scan range of ``sLength`` positions is derived from
    ``tTcc`` and ``tStart``, and a ``cgPeaks.stretch`` is built over it with
    ``cName``.  ``centerExpression`` is reset to three zeros; when the
    stretch's ``getSumOfLevels()`` is non-zero, slot ``i`` receives the sum
    of profile values at sorted-key positions 8-11, 7-12 and 6-13
    respectively, divided by that total.  Keys are walked in descending
    order for strand ``'-1'``.  The alignments are committed afterwards.
    """
    controller = cgDB.dataController(aDir, cgAlignment.cgAlignment)
    alignments = controller.load()

    # Slice bounds into the ordered profile keys, narrowest window first;
    # the window at index i feeds centerExpression[i].
    windows = ((8, 12), (7, 13), (6, 14))

    for aln in alignments.values():
        aln.centerExpression = [0.0, 0.0, 0.0]
        chrom, strand, start, end = bioLibCG.tccSplit(aln.tTcc)
        offset = aln.tStart
        sLen = aln.sLength

        # Anchor the scan range at the start for strand '1', at the end
        # otherwise.
        if strand == '1':
            start = start - 19 + offset
            end = start + sLen
        else:
            end = end + 19 - offset
            start = end - sLen

        region = cgPeaks.stretch(bioLibCG.makeTcc(chrom, strand, start, end), cName)
        total = region.getSumOfLevels()

        positions = sorted(region.profile.keys())
        if strand == '-1':
            positions = positions[::-1]

        # With no expression at all the zeros set above are left in place.
        if total == 0:
            continue

        for slot, (lo, hi) in enumerate(windows):
            # Manual float accumulator: keeps float division and the same
            # left-to-right addition order as before.
            windowSum = 0.0
            for pos in positions[lo:hi]:
                windowSum += region.profile[pos]
            aln.centerExpression[slot] = windowSum / total

    controller.commit(alignments)
def signalHisto(oDir, title = 'SNR'):
    """Show a histogram of log2(``snr``) for origin RNAs that passed filtering.

    Records whose ``passedFilter`` is falsy are ignored.  Records with a
    non-positive ``snr`` are also left out, because the logarithm is
    undefined there and ``math.log`` would abort the whole plot with a
    ValueError.  The histogram uses 30 bins and fixed axis limits, and is
    displayed with ``plt.show()``.

    Args:
        oDir: location handed to ``cgDB.dataController``.
        title: plot title.
    """
    oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
    id_oRNA = oDC.load()

    histVals = []
    for oRNA in id_oRNA.values():
        if not oRNA.passedFilter:
            continue

        # An snr of 0 (no filtered targets) has no log2 value to plot.
        if oRNA.snr <= 0:
            continue

        histVals.append(math.log(oRNA.snr, 2))

    plt.hist(histVals, 30, facecolor='b', alpha=.75)
    plt.axis([-4, 10, 0, 30])
    plt.title('%s' % title)
    plt.xlabel('log2(Signal/Noise)')
    plt.ylabel('Number of Origin RNAs')
    plt.show()
Ejemplo n.º 57
0
def signalHisto(oDir, title='SNR'):
    """Show a histogram of log2(``snr``) for origin RNAs that passed filtering.

    Records whose ``passedFilter`` is falsy are ignored.  Records with a
    non-positive ``snr`` are also left out, because the logarithm is
    undefined there and ``math.log`` would abort the whole plot with a
    ValueError.  The histogram uses 30 bins and fixed axis limits, and is
    displayed with ``plt.show()``.

    Args:
        oDir: location handed to ``cgDB.dataController``.
        title: plot title.
    """
    oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
    id_oRNA = oDC.load()

    histVals = []
    for oRNA in id_oRNA.values():
        if not oRNA.passedFilter:
            continue

        # An snr of 0 (no filtered targets) has no log2 value to plot.
        if oRNA.snr <= 0:
            continue

        histVals.append(math.log(oRNA.snr, 2))

    plt.hist(histVals, 30, facecolor='b', alpha=.75)
    plt.axis([-4, 10, 0, 30])
    plt.title('%s' % title)
    plt.xlabel('log2(Signal/Noise)')
    plt.ylabel('Number of Origin RNAs')
    plt.show()
Ejemplo n.º 58
0
def filterORNA(oDir, maxEndContig, maxTotalContig, minSNR, minNumTargets, keepDuplicates = False):
    """Mark every origin RNA stored in ``oDir`` as passing or failing the filters.

    Each record loaded through ``cgDB.dataController`` gets ``passedFilter``
    set, and the full set of records is committed back at the end.  A record
    fails on the first of these checks that applies, tested in this order:

    * fewer than ``minNumTargets`` entries in ``filteredTargets``
    * ``endContigLength`` greater than ``maxEndContig``
    * ``totalContigLength`` greater than ``maxTotalContig``
    * ``snr`` lower than ``minSNR``
    * ``sequenceDuplicate`` is truthy while ``keepDuplicates`` is falsy

    Failing records are reported through ``cgOriginRNA.prettyPrint`` with a
    short reason tag; passing records are printed on a ``PASSED:`` line.

    Args:
        oDir: location handed to ``cgDB.dataController``.
        maxEndContig: limit for ``endContigLength``; converted with ``int()``.
        maxTotalContig: limit for ``totalContigLength``; converted with
            ``int()``.
        minSNR: lower bound for ``snr``; converted with ``float()``.
        minNumTargets: lower bound for ``len(filteredTargets)``; converted
            with ``int()``.
        keepDuplicates: a bool, or the strings ``'True'`` / ``'False'``
            (presumably because arguments may arrive as command-line text).
    """
    if keepDuplicates == 'True':
        keepDuplicates = True
    elif keepDuplicates == 'False':
        keepDuplicates = False

    maxEndContig, maxTotalContig = int(maxEndContig), int(maxTotalContig)
    minNumTargets = int(minNumTargets)
    minSNR = float(minSNR)

    oDC = cgDB.dataController(oDir, cgOriginRNA.OriginRNA)
    id_oRNA = oDC.load()

    for oRNA in id_oRNA.values():
        oRNA.passedFilter = True

        # NOTE: the first branch clears the flag before reporting while the
        # others report first; that order is kept as-is in case prettyPrint
        # reads passedFilter.
        if len(oRNA.filteredTargets) < minNumTargets:
            oRNA.passedFilter = False
            cgOriginRNA.prettyPrint(oRNA, 'numTargets')
        elif oRNA.endContigLength > maxEndContig:
            cgOriginRNA.prettyPrint(oRNA, 'endContig')
            oRNA.passedFilter = False
        elif oRNA.totalContigLength > maxTotalContig:
            cgOriginRNA.prettyPrint(oRNA, 'totalContig')
            oRNA.passedFilter = False
        elif oRNA.snr < minSNR:
            cgOriginRNA.prettyPrint(oRNA, 'SNR fail')
            oRNA.passedFilter = False
        elif not keepDuplicates and oRNA.sequenceDuplicate:
            cgOriginRNA.prettyPrint(oRNA, 'Duplicate Fail')
            oRNA.passedFilter = False
        else:
            fields = (
                'PASSED:',
                oRNA.id,
                ','.join(str(x) for x in oRNA.filteredTargets),
                oRNA.entropy,
                oRNA.avgNumSimulationTargets,
                oRNA.snr,
                oRNA.endContigLength,
                oRNA.sequence,
            )
            # A single pre-joined string prints identically whether print is
            # a statement (Python 2) or a function (Python 3).
            print(' '.join(str(field) for field in fields))

    oDC.commit(id_oRNA)