Example #1
0
def updateAvgNumSS(oFN):

    bn = os.path.basename(oFN)
    print 'basename', bn

    oID_numSS = {}

    numSims = 100

    print 'getting avg for %s simulations' % numSims
    for i in range(0, numSims):

        #simFN = '/home/chrisgre/scripts/simulations/simsk50FilteredMasked/simulation.%s/%s' % (i, bn)
        #simFN = '/home/chrisgre/scripts/simulations/simsk50Fix/simulation.%s/%s' % (i, bn)
        #simFN = '/home/chrisgre/scripts/simulations/mm9/simulation.%s/%s' % (i, bn)
        #simFN = '/home/chrisgre/scripts/simulations/hg19.hela/simulation.%s/%s' % (i, bn)
        simFN = '/home/chrisgre/scripts/simulations/hg19.U87/simulation.%s/%s' % (
            i, bn)
        osNX = cgNexusFlat.Nexus(simFN, cgOriginRNAFlat.OriginRNA)
        osNX.load(['numSignificantSequences'])
        for oID in osNX.numSignificantSequences:
            oID_numSS[oID] = oID_numSS.get(
                oID, 0) + osNX.numSignificantSequences[oID]

    #now save it
    oNX = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA)
    oNX.load(['avgNumSS'])

    for oID in oNX.avgNumSS:
        totalNum = oID_numSS.get(oID, 0)
        avgNum = float(totalNum) / float(numSims)
        oNX.avgNumSS[oID] = avgNum
    oNX.save()
Example #2
0
def updateTargetIDsFiltered(oFN, aFN, rn=None, tn=None):
    '''CAUTION: NO SELECTION BEING MADE!!!

    Rebuild filteredTargets in oFN from the sID column of aFN.

    Every alignment in aFN is loaded (no condition is passed to load()).
    Each loaded oRNA gets its filteredTargets list emptied and then refilled
    with the IDs of the alignments whose sID equals that oRNA ID.  Alignments
    whose sID is not among the loaded oRNAs are skipped.  [rn, tn] is
    forwarded to the oFN load() call only.
    '''
    oNX = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA)
    oNX.load(['filteredTargets'], [rn, tn])

    aNX = cgNexusFlat.Nexus(aFN, cgAlignmentFlat.cgAlignment)
    aNX.load(['sID'])

    # start every oRNA from an empty target list
    for oID in oNX.filteredTargets:
        oNX.filteredTargets[oID] = []

    # hand each alignment to the oRNA it belongs to
    for aID in aNX.sID:
        ownerID = aNX.sID[aID]
        try:
            ownerTargets = oNX.filteredTargets[ownerID]
        except KeyError:
            # another process is taking care of this one
            continue
        ownerTargets.append(aID)

    oNX.save()
Example #3
0
def appendTInfoFlat(aFN, dFN, rn = None, tn = None):
    '''Copy peak attributes from dFN onto the alignments in aFN that target them.

    Alignments are grouped by their tID; for every peak ID found in dFN's
    tcc column, each alignment whose tID equals that peak ID receives the
    peak's tcc, eLevel, tOverlap, context, repeatStatus, gScore and sequence
    (stored as tTcc, tELevel, transcriptOverlap, context, repeat, gScore and
    targetSequence).  aFN is saved afterwards.  [rn, tn] is forwarded to the
    aFN load() call only.
    '''
    aNX = cgNexusFlat.Nexus(aFN, cgAlignment)
    aNX.load(['tID', 'tTcc', 'transcriptOverlap', 'tELevel', 'context', 'repeat', 'targetSequence', 'gScore'], [rn, tn])

    dNX = cgNexusFlat.Nexus(dFN, cgDegPeak.Peak)
    dNX.load(['tOverlap', 'eLevel', 'tcc', 'context', 'repeatStatus', 'sequence', 'gScore'])

    # (alignment attribute, peak attribute) pairs, in copy order
    fieldPairs = [('tTcc', 'tcc'),
                  ('tELevel', 'eLevel'),
                  ('transcriptOverlap', 'tOverlap'),
                  ('context', 'context'),
                  ('repeat', 'repeatStatus'),
                  ('gScore', 'gScore'),
                  ('targetSequence', 'sequence')]

    # group alignment IDs by the peak they point to
    tID_aIDs = {}
    for aID in aNX.tID:
        peakID = aNX.tID[aID]
        if peakID not in tID_aIDs:
            tID_aIDs[peakID] = []
        tID_aIDs[peakID].append(aID)

    for dID in dNX.tcc:
        for aID in tID_aIDs.get(dID, []):
            for aField, dField in fieldPairs:
                getattr(aNX, aField)[aID] = getattr(dNX, dField)[dID]

    aNX.save()
Example #4
0
def updateAvgNumTargets(oFN):

    bn = os.path.basename(oFN)
    print 'basename', bn

    oID_numTargets = {}

    for i in range(0, 10):

        #simFN = '/home/chrisgre/scripts/simulations/simsk50FilteredMasked/simulation.%s/%s' % (i, bn)
        simFN = '/home/chrisgre/scripts/simulations/simsk50/simulation.%s/%s' % (
            i, bn)
        print simFN
        osNX = cgNexusFlat.Nexus(simFN, cgOriginRNAFlat.OriginRNA)
        osNX.load(['filteredTargets'])
        for oID in osNX.filteredTargets:
            currTargets = oID_numTargets.get(oID, 0)
            oID_numTargets[oID] = currTargets + len(osNX.filteredTargets[oID])

    #now save it
    oNX = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA)
    oNX.load(['avgNumSimulationTargets'])

    for oID in oNX.avgNumSimulationTargets:
        totalNum = oID_numTargets.get(oID, 0)
        avgNum = float(totalNum) / float(10.0)
        oNX.avgNumSimulationTargets[oID] = avgNum

    oNX.save()
Example #5
0
def eLevelHistogram(oFN, aFN, oRNA=True):
    '''Show a 50-bin histogram of log10 expression levels.

    oFN  -- origin RNA file; 'filteredTargets' and 'eLevel' are loaded
    aFN  -- alignment file; 'tELevel' is loaded
    oRNA -- True or a string containing 'True': plot the eLevel of every
            oRNA.  Anything else (e.g. False, 'False'): plot the tELevel of
            every filtered target of every oRNA instead.

    Every plotted level goes through math.log(x, 10), so a level <= 0
    raises ValueError.
    '''
    # The flag may arrive as a real boolean or as a command-line string.
    # The previous "'True' in oRNA" raised TypeError for the boolean default.
    oRNA = 'True' in str(oRNA)

    oNX = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA)
    oNX.load(['filteredTargets', 'eLevel'])

    aNX = cgNexusFlat.Nexus(aFN, cgAlignmentFlat.cgAlignment)
    aNX.load(['tELevel'])

    histValues = []
    for oID in oNX.eLevel:
        if oRNA:
            histValues.append(oNX.eLevel[oID])
        else:
            for aID in oNX.filteredTargets[oID]:
                histValues.append(aNX.tELevel[aID])

    histVals = [math.log(x, 10) for x in histValues]
    plt.hist(histVals, 50)

    # named 'label' so the builtin type() is not shadowed
    label = 'oRNA'
    if not oRNA:
        label = 'Targets (degradome)'
    plt.title('Expression Level for %s' % label)
    plt.xlabel('log(Expression Level)')
    plt.ylabel('Number of %s' % label)

    plt.show()
Example #6
0
def countRepeatStatusTargets(oFN, aFN, oContext=None, oType=None):
    if oContext == 'None': oContext = None
    if oType == 'None': oType = None

    oNX = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA)
    oNX.load(
        ['snrSS', 'context', 'transcriptType', 'filteredTargets', 'gScore'],
        [rn, tn])

    aNX = cgNexusFlat.Nexus(aFN, cgAlignmentFlat.cgAlignment)
    aNX.load(['context', 'gScore', 'repeat'], [rn, tn])

    context_rStatuss = {}

    for oID in oNX.context:

        if oNX.snrSS[oID] < 2.00: continue
        #gather targets' context info if oRNA is okay
        for aID in oNX.filteredTargets[oID]:
            aCon = aNX.context[aID]
            rStatus = aNX.repeat[aID]
            context_rStatuss.setdefault(aCon, []).append(rStatus)

    #plot
    for context in context_rStatuss:
        print context, context_rStatuss[context].count(
            True), context_rStatuss[context].count(False)
Example #7
0
def gZipContextECDF(oFN, aFN, imgName, oContext=None, oType=None, rn=None,
                    tn=None):
    '''Save a cumulative, normalised step histogram of target gScores per context.

    Only oRNAs whose snrSS is at least 2.00 contribute.  The gScore of each
    of their filtered targets is grouped by the alignment's context; one
    curve per context is drawn and the figure is written to imgName.

    oContext, oType -- accepted for interface compatibility; no filtering on
                       them is performed here.
    rn, tn          -- forwarded as [rn, tn] to both load() calls.  They were
                       previously referenced without being defined in this
                       function, which raises NameError unless module-level
                       names happen to exist.
    '''
    oNX = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA)
    oNX.load(
        ['snrSS', 'context', 'transcriptType', 'filteredTargets', 'gScore'],
        [rn, tn])

    aNX = cgNexusFlat.Nexus(aFN, cgAlignmentFlat.cgAlignment)
    aNX.load(['context', 'gScore'], [rn, tn])

    context_gzips = {}

    for oID in oNX.context:

        if oNX.snrSS[oID] < 2.00: continue
        #gather targets' gScores if oRNA is okay
        for aID in oNX.filteredTargets[oID]:
            aCon = aNX.context[aID]
            gScore = aNX.gScore[aID]
            context_gzips.setdefault(aCon, []).append(gScore)

    #plot
    for context in context_gzips:
        plt.hist(context_gzips[context],
                 bins=10000,
                 cumulative=True,
                 histtype='step',
                 normed=True,
                 label='%s' % context)

    plt.legend()
    plt.savefig(imgName, bbox_inches='tight', pad_inches=1)
Example #8
0
def updateTargetIDs(oFN, aFN, rn=None, tn=None):
    '''Rebuild the targets list of every loaded oRNA from the sID column of aFN.

    The alignment load() is given a condition on 'sID' that accepts only the
    IDs of the loaded oRNAs.  Each oRNA's targets list is emptied and then
    refilled with the IDs of the alignments whose sID equals that oRNA ID,
    after which oFN is saved.  [rn, tn] is forwarded to the oFN load() only.
    '''
    oNX = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA)
    oNX.load(['targets'], [rn, tn])

    # IDs of the oRNAs handled by this call; used to select alignments
    wantedIDs = set(oNX.targets)
    selection = {'sID': lambda sID: sID in wantedIDs}

    aNX = cgNexusFlat.Nexus(aFN, cgAlignmentFlat.cgAlignment)
    aNX.load(['sID'], conditions=selection)

    # drop whatever targets were stored before
    for oID in oNX.targets:
        oNX.targets[oID] = []

    # attach each alignment to the oRNA named by its sID
    for aID in aNX.sID:
        oNX.targets[aNX.sID[aID]].append(aID)

    oNX.save()
Example #9
0
def updatePolySeqs(mFN, readsFN, alignFN):
    '''Print reads that look like a micro sequence followed by a homopolymer tail.

    mFN      -- file loaded with the miR definition ('sequence', 'polySeqs')
    readsFN  -- file loaded as a one-column quickTable named 'read'
    alignFN  -- alignment file; sID indexes the reads, tID indexes mFN

    For every alignment the read is compared to the micro sequence and, when
    the read is the (possibly 3'-trimmed) micro sequence followed by 1..19
    copies of a single letter (A, G, T or C), a line built by tabIt() is
    printed.  Nothing is written back: no save() is called and 'polySeqs'
    is loaded but never assigned.
    '''

    # NOTE(review): the timer is started but its splits are only used in the
    # commented-out debug prints below.
    tim = bioLibCG.cgTimer()
    tim.start()
    # homopolymer runs of length 1..19 for each nucleotide
    variousAs = ["A" * x for x in range(1,20)]
    variousGs = ["G" * x for x in range(1,20)]
    variousTs = ["T" * x for x in range(1,20)]
    variousCs = ["C" * x for x in range(1,20)]

    letter_variousLetters = [ ("A", variousAs),
                            ("G", variousGs),
                            ("T", variousTs),
                            ("C", variousCs)]


    # number of nucleotides to trim from the end of the micro sequence (1..7)
    checkRange = range(1,8)

    NX = cgNexusFlat.Nexus(mFN, miR)
    NX.load(['sequence', 'polySeqs'])
    #print 'load micro', tim.split()

    reads = cgNexusFlat.quickTable(('read','string', '.', 1))
    rNX = cgNexusFlat.Nexus(readsFN, reads)
    rNX.load(['read'])
    #print 'load reads', tim.split()

    aNX = cgNexusFlat.Nexus(alignFN, cgAlignment)
    aNX.load(['sID', 'tID'])
    #print 'load alignments', tim.split()

    for id in aNX.ids:

        theRead = rNX.read[aNX.sID[id]]
        mID = aNX.tID[id]
        microSeq = NX.sequence[mID]

        #may be a read for expression, but wont count...
        # (skips every read that is a substring of the micro sequence,
        #  which includes the read being identical to it)
        if theRead in microSeq: continue

        #just for expression
        # NOTE(review): assuming both values are strings, this branch cannot
        # run -- an identical read was already skipped by the check above.
        if microSeq == theRead: 
            print tabIt(microSeq, theRead, 0, 0, "N")

        #first check full
        elif microSeq in theRead and (len(theRead) != len(microSeq)):
            # split()[1] is what follows the first occurrence of microSeq
            # (up to the next occurrence, if there is one)
            tail = theRead.split(microSeq)[1]
            for let, variousLetters in letter_variousLetters:
                if tail in variousLetters:
                    print tabIt(microSeq, theRead, 0, len(tail), let)

        #now check trimmed (cant do [:-0])
        else:
            for i in checkRange:
                if microSeq[:-i] in theRead and (len(theRead) != len(microSeq[:-i])):
                    tail = theRead.split(microSeq[:-i])[1]
                    for let, variousLetters in letter_variousLetters:
                        if tail in variousLetters:
                            print tabIt(microSeq, theRead, i, len(tail), let)
                            print "TRIMMED"
                    # stop at the shortest trim that is found in the read,
                    # whether or not its tail was a homopolymer
                    break #dont trim after the first trimmed one works
Example #10
0
def targetContextPercentageVsExpression(oFN,
                                        aFN,
                                        oContext=None,
                                        oType=None,
                                        rn=None,
                                        tn=None):
    '''Plot, per target context, the number of targets at or above each expression cutoff.

    Cutoffs run from 50 to 450 in steps of 50.  For every cutoff, each
    filtered target whose tELevel is not below the cutoff adds one to the
    count of its alignment context.  One line per context is drawn
    (cutoff on x, count on y) and the figure is shown.

    oContext -- when truthy, only oRNAs with this context are used
    oType    -- when truthy, only oRNAs with this transcriptType are used
    rn, tn   -- accepted but not used; both files are loaded in full
    '''
    oNX = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA)
    oNX.load(['context', 'transcriptType', 'filteredTargets'])

    aNX = cgNexusFlat.Nexus(aFN, cgAlignmentFlat.cgAlignment)
    aNX.load(['context', 'tELevel'])

    context_level_count = {}

    for lev in range(50, 500, 50):
        for oID in oNX.context:

            #filter oID types here
            con = oNX.context[oID]
            typ = oNX.transcriptType[oID]

            if oContext and oContext != con: continue
            if oType and oType != typ: continue

            #gather targets' context info if oRNA is okay
            for aID in oNX.filteredTargets[oID]:
                if aNX.tELevel[aID] < lev: continue
                level_count = context_level_count.setdefault(
                    aNX.context[aID], {})
                level_count[lev] = level_count.get(lev, 0) + 1

    handles = []
    labels = []
    for con in context_level_count:
        level_count = context_level_count[con]
        sortedLevs = sorted(level_count.keys())
        counts = [level_count[lev] for lev in sortedLevs]
        # plt.plot returns a list of line objects; the legend wants the line
        # itself, not the list wrapping it
        handles.append(plt.plot(sortedLevs, counts)[0])
        labels.append(con)

    #plot
    plt.legend(handles, labels)
    plt.title(
        'oRNA Targets\' Context Proportion Stability w/ Expression Increase')
    plt.xlabel('Degradome Expression Cutoff')
    plt.ylabel('Number of oRNA Targets')
    plt.show()
def phastScoreByNT(oFN, aFN, oIDFilter=None):
    '''Plot PhastCons score against nucleotide position for conserved oRNAs.

    An oRNA is used when its mean phastScore is at least .90 and its snrSS
    is at least 2.  Positions that are a mismatch position in any of the
    oRNA's filtered targets are drawn in red at position + 1.2; all other
    positions are drawn in blue at position + 1.  A shaded band is drawn
    behind every nucleotide position up to the highest one plotted.

    oIDFilter -- when truthy, only the oRNA whose ID equals int(oIDFilter)
                 is plotted.

    oRNAs with an empty phastScores list are skipped, and an empty selection
    yields an empty plot instead of an exception.
    '''
    oNX = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA)
    oNX.load(['phastScores', 'snrSS', 'filteredTargets'])

    aNX = cgNexusFlat.Nexus(aFN, cgAlignmentFlat.cgAlignment)
    aNX.load(['mismatchPositions'])

    # convert the filter once instead of once per oRNA
    wantedID = None
    if oIDFilter:
        wantedID = int(oIDFilter)

    misPositionsX = []
    misScoresY = []
    positionsX = []
    scoresY = []

    for oID in oNX.phastScores:

        pScores = oNX.phastScores[oID]
        if not pScores:
            # no scores: the average is undefined, nothing to plot
            continue
        avgScore = sum(pScores) / float(len(pScores))

        #filter
        if (avgScore < .90) or (oNX.snrSS[oID] < 2):
            continue

        if wantedID is not None and oID != wantedID:
            continue

        #get consolidated mismatches
        misPositions = set()
        for aID in oNX.filteredTargets[oID]:
            misPositions.update(aNX.mismatchPositions[aID])

        for i, pScore in enumerate(pScores):
            if i in misPositions:
                #1 is for 0BASE, .2 is for differentiating between mis and reg
                misPositionsX.append(i + 1.2)
                misScoresY.append(pScore)
            else:
                positionsX.append(i + 1)
                scoresY.append(pScore)

    # The bands must cover mismatch-only positions too, and max() of an
    # empty list raises ValueError.
    plottedNTs = positionsX + [int(x) for x in misPositionsX]
    highestNT = 0
    if plottedNTs:
        highestNT = max(plottedNTs)
    for i in range(1, highestNT + 1):
        plt.axvspan(i - .15, i + .35, facecolor='g', alpha=.25)

    plt.plot(positionsX, scoresY, 'bo')
    plt.plot(misPositionsX, misScoresY, 'ro')
    plt.ylim(0, 1.1)
    plt.xlim(0, 24)
    plt.title('Conservation by Position of Conserved oRNA')
    plt.ylabel('PhastCons Score')
    plt.xlabel('Nucleotide Position')

    plt.show()
Example #12
0
def filterTargets(oFN,
                  aFN,
                  inTranscript,
                  misLevel,
                  centerLevel,
                  minCenterLevel,
                  rn=None,
                  tn=None):
    if inTranscript == 'True': inTranscript = True
    if inTranscript == 'False': inTranscript = False
    misLevel, centerLevel, minCenterLevel = int(misLevel), int(
        centerLevel), float(minCenterLevel)

    oNX = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA)
    oNX.load(['filteredTargets', 'targets'], [rn, tn])

    print inTranscript, misLevel, centerLevel, minCenterLevel, oFN, aFN, rn, tn

    #make selection set
    targets = set()
    for oID in oNX.targets:
        for target in oNX.targets[oID]:
            targets.add(target)

    c = {'ID': lambda x: x in targets}
    aNX = cgNexusFlat.Nexus(aFN, cgAlignmentFlat.cgAlignment)
    aNX.load(['transcriptOverlap', 'mismatchStatus', 'centerExpression'],
             conditions=c)

    for oID in oNX.filteredTargets:
        oNX.filteredTargets[oID] = []
        for aID in oNX.targets[oID]:

            #transcriptOverlap
            if inTranscript:
                if not aNX.transcriptOverlap[aID]:
                    #print 'tOverlap Fail', cgAlignment.pretty#print(alignment)
                    continue

            #misLevel
            if aNX.mismatchStatus[aID][misLevel]:
                #print 'mismatch Fail', cgAlignment.pretty#print(alignment)
                continue

            #centerLevel
            if aNX.centerExpression[aID][centerLevel] < minCenterLevel:
                #print 'expression Fail', cgAlignment.pretty#print(alignment)
                continue

            oNX.filteredTargets[oID].append(aID)

    oNX.save()
def conservedHisto(oFN):

    oNX = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA)
    oNX.load(['phastScores'])

    scores = []
    for oID in oNX.phastScores:

        avgScore = sum(oNX.phastScores[oID]) / float(len(oNX.phastScores[oID]))

        #scores.extend(oNX.phastScores[oID])
        scores.append(avgScore)

    print len(scores)
    plt.title('PhastCons Scores By Nucleotide')
    plt.title('PhastCons Scores By oRNA')

    plt.ylabel('Number of Nucleotides')
    plt.ylabel('Number of oRNA')

    plt.xlabel('PhastCons Score')
    plt.xlabel('PhastCons Score Average')

    plt.hist(scores, 50)
    plt.show()
Example #14
0
def updateTranscriptOverlap(oFN, wigDir, chrom, strand, rn=None, tn=None):
    '''Set tOverlap for the peaks of one chromosome/strand from a transcript wig.

    The strand argument is flipped ('1' becomes '-1', anything else becomes
    '1') before the wig is loaded.  Each peak's own strand is flipped the
    same way, and only peaks whose chromosome and flipped strand match are
    updated: tOverlap becomes True when any coordinate from start to end
    (inclusive) is present in the loaded wig data, False otherwise.
    oFN is saved afterwards.
    '''
    oNX = cgNexusFlat.Nexus(oFN, cgDegPeak.Peak)
    oNX.load(['tOverlap', 'tcc'], [rn, tn])

    #load the AS wig file for this degradome strand
    strand = '-1' if strand == '1' else '1'
    coord_transcripts = cgWig.loadSingleWigTranscript(wigDir, chrom, strand,
                                                      'transcript')

    for oID in oNX.tOverlap:
        tChrom, tStrand, start, end = bioLibCG.tccSplit(oNX.tcc[oID])
        flippedStrand = '-1' if tStrand == '1' else '1'

        if tChrom != chrom or flippedStrand != strand:
            continue

        oNX.tOverlap[oID] = any(
            pos in coord_transcripts for pos in xrange(start, end + 1))

    oNX.save()
Example #15
0
def markMismatchedPairs(aFN, rn=None, tn=None):
    '''Flag, per alignment, whether a mismatch falls in three central windows.

    mismatchStatus[aID] becomes a three-element list of booleans; element k
    is True when any position of window k occurs in mismatchPositions[aID].
    The windows are positions 8-11, 7-12 and 6-13.  aFN is saved afterwards.
    '''
    aNX = cgNexusFlat.Nexus(aFN, cgAlignmentFlat.cgAlignment)
    aNX.load(['mismatchStatus', 'mismatchPositions'], [rn, tn])

    # remember the small locations are 0-based, so 10 is 9
    windows = (range(8, 12), range(7, 13), range(6, 14))

    for aID in aNX.mismatchStatus:
        positions = aNX.mismatchPositions[aID]
        aNX.mismatchStatus[aID] = [
            any(pos in positions for pos in window) for window in windows
        ]

    aNX.save()
def getGeneGC(genePropFN, rn=None, tn=None):
    '''Store the GC fraction of every gene's spans in geneGCContent.

    For each gene the (start, end) pairs from geneStarts/geneEnds are turned
    into 'chrom:strand:start:end' strings, the hg19 sequence of each is
    fetched, and the number of 'G' plus 'C' characters is divided by the
    total sequence length.  The file is saved afterwards.  A gene whose
    spans add up to length zero raises ZeroDivisionError.
    '''
    fetcher = gf.GenomeFetch('hg19')

    NX = cgNexusFlat.Nexus(genePropFN, geneProperty)
    NX.load([
        'geneName', 'geneChrom', 'geneStrand', 'geneStarts', 'geneEnds',
        'geneGCContent'
    ], [rn, tn])

    for geneID in NX.ids:
        # one tcc string per (start, end) span of the gene
        spanTccs = []
        for start, end in zip(NX.geneStarts[geneID], NX.geneEnds[geneID]):
            spanTccs.append('%s:%s:%s:%s' % (NX.geneChrom[geneID],
                                             NX.geneStrand[geneID], start,
                                             end))

        numGC = 0
        totalLength = 0
        for tcc in spanTccs:
            spanSeq = fetcher.getSequence(tcc)
            totalLength += len(spanSeq)
            numGC += spanSeq.count('G') + spanSeq.count('C')

        NX.geneGCContent[geneID] = float(numGC) / totalLength

    NX.save()
def updateGeneData(geneRanges, genePropFN):
    '''Fill geneChrom/geneStrand/geneStarts/geneEnds from a tab-separated file.

    geneRanges -- text file with one gene per line:
                  name <tab> chrom <tab> strand <tab> comma-separated starts
                  <tab> comma-separated ends
    genePropFN -- gene property file; genes are matched on geneName

    Lines whose gene name is not present in genePropFN are ignored, as are
    blank lines.  The file is saved afterwards.
    '''
    NX = cgNexusFlat.Nexus(genePropFN, geneProperty)
    NX.load(['geneName', 'geneChrom', 'geneStrand', 'geneStarts', 'geneEnds'])

    #make inverse dictionary
    gName_nID = {}
    for nID in NX.ids:
        gName_nID[NX.geneName[nID]] = nID

    # 'with' closes the file even when a malformed line raises
    with open(geneRanges, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            ls = line.split('\t')

            geneName = ls[0]
            sChrom, sStrand = ls[1], ls[2]
            geneStarts = [int(x) for x in ls[3].split(',')]
            geneEnds = [int(x) for x in ls[4].split(',')]

            # compare against None: an ID of 0 is valid but falsy
            nID = gName_nID.get(geneName)
            if nID is not None:
                NX.geneChrom[nID] = sChrom
                NX.geneStrand[nID] = sStrand
                NX.geneStarts[nID] = geneStarts
                NX.geneEnds[nID] = geneEnds

    NX.save()
Example #18
0
def filterOrigin(oFN, rn=None, tn=None):
    '''Set passedFilter for every oRNA and save the file.

    An oRNA passes only when none of these rejection criteria applies:
    entropy below 1.15, endContigLength above 6, totalContigLength above 6,
    a true sequenceDuplicate flag, or an empty filteredTargets value.
    '''
    oNX = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA)
    oNX.load([
        'filteredTargets', 'endContigLength', 'totalContigLength',
        'sequenceDuplicate', 'passedFilter', 'entropy'
    ], [rn, tn])

    for oID in oNX.entropy:
        # evaluated left to right, stopping at the first criterion that hits
        rejected = (oNX.entropy[oID] < 1.15
                    or oNX.endContigLength[oID] > 6
                    or oNX.totalContigLength[oID] > 6
                    or oNX.sequenceDuplicate[oID]
                    or not oNX.filteredTargets[oID])
        oNX.passedFilter[oID] = not rejected

    oNX.save()
def getGeneLength(geneRanges, genePropFN):
    '''Store the summed span length of every known gene in geneLength.

    geneRanges -- text file with one gene per line:
                  name <tab> chrom <tab> strand <tab> comma-separated starts
                  <tab> comma-separated ends
    genePropFN -- gene property file; genes are matched on geneName

    The length of a gene is the sum of (end - start) over its spans.  Lines
    whose gene name is not present in genePropFN are ignored, as are blank
    lines.  The file is saved afterwards.
    '''
    NX = cgNexusFlat.Nexus(genePropFN, geneProperty)
    NX.load(['geneName', 'geneLength'])

    #make inverse dictionary
    gName_nID = {}
    for nID in NX.ids:
        gName_nID[NX.geneName[nID]] = nID

    # 'with' closes the file even when a malformed line raises
    with open(geneRanges, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            ls = line.split('\t')

            geneName = ls[0]
            geneStarts = [int(x) for x in ls[3].split(',')]
            geneEnds = [int(x) for x in ls[4].split(',')]
            totalLength = sum(
                [end - start for start, end in zip(geneStarts, geneEnds)])

            # compare against None: an ID of 0 is valid but falsy
            nID = gName_nID.get(geneName)
            if nID is not None:
                NX.geneLength[nID] = totalLength

    NX.save()
Example #20
0
def totalSNRSS(oFN, SNRToggle=False):

    if SNRToggle == 'True':
        SNRToggle = True
    else:
        SNRToggle = False

    oNX = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA)
    oNX.load(['snrSS', 'numSignificantSequences', 'avgNumSS'])

    highSNRs = []
    for oID in oNX.snrSS:

        snrSS = oNX.snrSS[oID]
        if snrSS > 2:
            highSNRs.append(snrSS)

    if SNRToggle:
        try:
            avgSNR = sum(highSNRs) / len(highSNRs)
            print avgSNR,
        except:
            print '0.0',
    else:
        print len(highSNRs),
Example #21
0
def totalSNR(oFN):

    oNX = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA)
    oNX.load(
        ['snr', 'filteredTargets', 'avgNumSimulationTargets', 'passedFilter'])

    totalRun = 0
    totalSim = 0
    simsTotals = []
    n = 0
    highSNR = 0
    for oID in oNX.avgNumSimulationTargets:

        #filter out
        if not oNX.passedFilter[oID]:
            continue

        #collect stats
        totalRun += len(oNX.filteredTargets[oID])
        simsTotal = oNX.avgNumSimulationTargets[oID] * 10
        simsTotals.append(simsTotal)
        totalSim += oNX.avgNumSimulationTargets[oID]

        if oNX.snr[oID] > 2:
            highSNR += 1
        n += 1

    print oFN
    print 'Total Number Targets for my run:', totalRun
    print 'Total Number Targets for Simulations:', totalSim
    print 'SNR', float(totalRun) / float(totalSim)
    print 'Total oRNA:', n, 'Total oRNA w/ SNR > 2:', highSNR
    print '\n'
def updateFiltered(oFN):
    '''Set passedFilter for every oRNA and save the file.

    An oRNA passes only when none of these rejection criteria applies:
    no filtered targets, endContigLength above 6, totalContigLength above 6,
    entropy below 1.2, or a true sequenceDuplicate flag.
    '''
    oNX = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA)
    oNX.load([
        'filteredTargets', 'endContigLength', 'totalContigLength', 'entropy',
        'sequenceDuplicate', 'passedFilter'
    ])

    for oID in oNX.passedFilter:
        # evaluated left to right, stopping at the first criterion that hits
        rejected = (len(oNX.filteredTargets[oID]) == 0
                    or oNX.endContigLength[oID] > 6
                    or oNX.totalContigLength[oID] > 6
                    or oNX.entropy[oID] < 1.2
                    or oNX.sequenceDuplicate[oID])
        oNX.passedFilter[oID] = not rejected

    oNX.save()
def correlationSNR(oFN):

    oNX = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA)
    oNX.load(['phastScores', 'snrSS'])

    snrX = []
    scoreY = []
    for oID in oNX.phastScores:

        snr = oNX.snrSS[oID]
        avgScore = sum(oNX.phastScores[oID]) / float(len(oNX.phastScores[oID]))

        snrX.append(snr)
        scoreY.append(avgScore)
        '''                
                for pScore in oNX.phastScores[oID]:
                        snrX.append(snr)
                        scoreY.append(pScore)
                '''

    conserved = [True if x > .8 else False for x in scoreY]
    print conserved.count(True)
    print len(conserved)
    plt.title('SNR vs PhastCons Score')

    plt.ylabel('Avg PhastCons Score of oRNA')

    plt.xlabel('SNR')

    plt.plot(snrX, scoreY, 'ro')
    plt.show()
Example #24
0
def oRNAContextPie(oFN, imgName):
    '''REMEMBER!!! Have to do with grouped results...'''
    oNX = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA)
    oNX.load(['context', 'snrSS'])

    context_count = {}

    for oID in oNX.context:

        if oNX.snrSS[oID] < 2.00: continue
        con = oNX.context[oID]
        print con
        context_count[con] = context_count.get(con, 0) + 1

    labels = sorted(context_count.keys())
    counts = [context_count[x] for x in labels]
    fracs = pieFractions(counts)

    #add numbers to labels
    labels = ['%s (%s)' % (x, context_count[x]) for x in labels]

    #plot
    plt.title('Context of oRNA (results > 2.00 SNR)')
    plt.pie(fracs, labels=labels, shadow=True)
    plt.savefig(imgName, bbox_inches='tight', pad_inches=1)
def updateEndContig(oFN, rn = None, tn = None):
    '''Store in endContigLength the longer of the two end runs of each sequence.

    The 5' run is the number of leading characters equal to the first one;
    the 3' run is the same measured on the reversed sequence.  The minimum
    value is 1, which is also what an empty sequence gets.  oFN is saved
    afterwards.
    '''
    def leadingRun(letters):
        # length of the run of identical items at the start of letters
        runLength = 1
        for i in range(1, len(letters)):
            if letters[i] != letters[i - 1]:
                break
            runLength += 1
        return runLength

    oNX = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA)
    oNX.load(['sequence', 'endContigLength'], [rn, tn])

    for oID in oNX.sequence:
        seq = oNX.sequence[oID]
        run5 = leadingRun(seq)
        run3 = leadingRun(list(reversed(seq)))
        oNX.endContigLength[oID] = max(run5, run3)

    oNX.save()
Example #26
0
def loadSeqs(seqFN):

    seqNX = cgNexusFlat.Nexus(seqFN, Seq)
    seqNX.load(['length', 'sequence'])

    print seqNX.length[100000], seqNX.sequence[100000]
    print 'done loading'
def markMismatchedPairs(aFN, rn=None, tn=None):
    '''Flag, per alignment, whether a mismatch falls in three central windows.

    mismatchStatus[aID] becomes a three-element list of booleans; element k
    is True when any position of window k occurs in mismatchPositions[aID].
    The windows are positions 9-12, 8-13 and 7-14.  aFN is saved afterwards.
    '''
    aNX = cgNexusFlat.Nexus(aFN, cgAlignmentFlat.cgAlignment)
    aNX.load(['mismatchStatus', 'mismatchPositions'], [rn, tn])

    # the windows never change, so build them once
    lowRange = range(9, 13)
    midRange = range(8, 14)
    highRange = range(7, 15)

    for aID in aNX.mismatchStatus:

        positions = aNX.mismatchPositions[aID]
        status = [False, False, False]

        #check mismatches
        # The widest window used to be tested against the whole
        # mismatchPositions container (missing [aID]) instead of this
        # alignment's positions; all three now use the same list.
        for level, window in enumerate((lowRange, midRange, highRange)):
            for i in window:
                if i in positions:
                    status[level] = True
                    break

        aNX.mismatchStatus[aID] = status

    aNX.save()
def updateSharedTargets(oFN, rn=None, tn=None):
    '''Give every oRNA the union of the targets of all oRNAs with its sequence.

    Just because there are duplicate sequences does not mean that the
    genomic position of each result is the correct one, so the targets for
    each genomic position should be the same as the targets for each
    duplicate sequence.

    The filteredTargets of all oRNAs sharing a sequence are merged into one
    set, and each oRNA's filteredTargets is replaced by a list of that set
    (an empty list when no oRNA with that sequence had targets).  oFN is
    saved afterwards.
    '''
    oNX = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA)
    oNX.load(['sequence', 'filteredTargets'], [rn, tn])

    # sequence -> set of every target seen for that sequence
    seq_targets = {}
    for oID in oNX.sequence:
        ownTargets = oNX.filteredTargets[oID]
        if not ownTargets:
            continue
        shared = seq_targets.setdefault(oNX.sequence[oID], set())
        for tID in ownTargets:
            shared.add(tID)

    # hand the merged set back to every oRNA carrying that sequence
    for oID in oNX.sequence:
        shared = seq_targets.get(oNX.sequence[oID], set())
        oNX.filteredTargets[oID] = list(shared)

    oNX.save()
Example #29
0
def oRNATypePie(oFN, imgName, rn=None, tn=None):
    '''Save a pie chart of the transcript types of oRNAs with snrSS of at least 2.00.

    Every entry of an oRNA's transcriptTypes adds one to that type's count,
    so an oRNA with several types is counted once per type.  Slices are
    ordered by sorted type name and labelled "<type> (<count>)".

    rn, tn -- forwarded as [rn, tn] to load().  They were previously
              referenced without being defined in this function, which
              raises NameError unless module-level names happen to exist.
    '''
    oNX = cgNexusFlat.Nexus(oFN, cgOriginRNAFlat.OriginRNA)
    oNX.load(['snrSS', 'transcriptType', 'transcriptTypes'], [rn, tn])

    type_count = {}

    for oID in oNX.transcriptType:

        if oNX.snrSS[oID] < 2.00: continue
        for tType in oNX.transcriptTypes[oID]:
            type_count[tType] = type_count.get(tType, 0) + 1

    labels = sorted(type_count.keys())
    counts = [type_count[x] for x in labels]
    fracs = pieFractions(counts)

    #add numbers to labels
    labels = ['%s (%s)' % (x, type_count[x]) for x in labels]

    #plot
    plt.pie(fracs, labels=labels, shadow=True)
    plt.savefig(imgName, bbox_inches='tight', pad_inches=1)
Example #30
0
def updateContext(oFN, wigDir, chrom, strand, rn=None, tn=None):

    oNX = cgNexusFlat.Nexus(oFN, degPeak.degPeak)
    oNX.load(['context', 'tcc'], [rn, tn])

    print 'loading wig'
    coord_contexts = cgWig.loadSingleWigContext(wigDir, chrom, strand,
                                                'context')
    print 'done loading'

    ds = bioLibCG.dominantSpotter([
        'C_EXON', 'C_3UTR', 'C_5UTR', 'NC_EXON', 'NC_3UTR', 'NC_5UTR',
        'C_INTRON', 'NC_INTRON', 'INTER'
    ])

    for oID in oNX.tcc:

        oChrom, oStrand, start, end = bioLibCG.tccSplit(oNX.tcc[oID])

        #deg wigs is AS to actual clipping site
        if oStrand == '1':
            oStrand = '-1'
        else:
            oStrand = '1'

        if oChrom == chrom and oStrand == strand:

            contexts = coord_contexts.get(start, 'INTER').split(',')
            oNX.context[oID] = ds.spotItem(contexts)

    oNX.save()