Esempio n. 1
0
def updateAvgSS(dataFN, oFF, simDir, simBase, mm, numSims=100):
    '''Write per-record simulation summaries into the data file.

    Reads ``numSims`` simulation files named
    ``<simDir>/simulation.<i>/<simBase>.<mm>`` for i in 0..numSims-1.  For
    each record id, the ``numUFBS`` value is collected once per distinct
    ``sequence`` seen for that id across all of those files.

    Every record of ``dataFN`` then receives:
      numUniqueSims   -- how many values were collected for its id
      totalNumUFBSSim -- the sum of those values
      avgNumSimSS     -- their mean, or -1 when nothing was collected
    '''
    simPaths = []
    for simIndex in range(numSims):
        simPaths.append(
            '%s/simulation.%s/%s.%s' % (simDir, simIndex, simBase, mm))

    ufbsByID = {}  # id -> numUFBS values, one per distinct sequence
    seqsByID = {}  # id -> sequences already counted for that id

    for simPath in simPaths:
        simNX = Nexus(simPath, oFF)
        simNX.load(['numUFBS', 'sequence'])
        while simNX.nextID():
            seenSeqs = seqsByID.setdefault(simNX.id, set())
            if simNX.sequence not in seenSeqs:
                ufbsByID.setdefault(simNX.id, []).append(simNX.numUFBS)
                seenSeqs.add(simNX.sequence)

    # fold the collected values into the data file
    dataNX = Nexus(dataFN, oFF)
    dataNX.load(['avgNumSimSS', 'numUniqueSims', 'totalNumUFBSSim'])
    while dataNX.nextID():
        simValues = ufbsByID.get(dataNX.id, [])
        simCount = len(simValues)
        simTotal = sum(simValues)
        if simCount != 0:
            simAvg = simTotal / float(simCount)
        else:
            simAvg = -1  # marker for "no simulations for this id"
        dataNX.avgNumSimSS = simAvg
        dataNX.numUniqueSims = simCount
        dataNX.totalNumUFBSSim = simTotal

    dataNX.save()
Esempio n. 2
0
def updateAvgSS(dataFN, oFF, simDir, simBase, mm, numSims = 100):
    '''Summarise simulation results per record id and store them in dataFN.

    Simulation files are ``<simDir>/simulation.<i>/<simBase>.<mm>`` for
    i in 0..numSims-1.  A record's ``numUFBS`` is counted only the first
    time its (id, sequence) combination is seen.  Each data record gets the
    count (``numUniqueSims``), the sum (``totalNumUFBSSim``) and the mean
    (``avgNumSimSS``, -1 when the count is zero) of the counted values.
    '''
    pathTemplate = '%s/simulation.%s/%s.%s'
    simFiles = [pathTemplate % (simDir, n, simBase, mm) for n in range(numSims)]

    countedValues = {}  # id -> list of numUFBS
    countedSeqs = {}    # id -> set of sequences already used
    for simFile in simFiles:
        simNX = Nexus(simFile, oFF)
        simNX.load(['numUFBS', 'sequence'])
        while simNX.nextID():
            recID, seq = simNX.id, simNX.sequence
            if seq in countedSeqs.get(recID, ()):
                continue  # dont count again
            countedValues.setdefault(recID, []).append(simNX.numUFBS)
            countedSeqs.setdefault(recID, set()).add(seq)

    # update data based on sim info
    dataNX = Nexus(dataFN, oFF)
    dataNX.load(['avgNumSimSS', 'numUniqueSims', 'totalNumUFBSSim'])
    while dataNX.nextID():
        values = countedValues.get(dataNX.id, [])
        nUnique, total = len(values), sum(values)
        dataNX.avgNumSimSS = -1 if nUnique == 0 else total / float(nUnique)
        dataNX.numUniqueSims = nUnique
        dataNX.totalNumUFBSSim = total

    dataNX.save()
def updateContext(fN, fF, wigDir, chrom, strand, switchStrand = False):
    '''Set the ``context`` field of records lying on one chromosome/strand.

    fN, fF       -- Nexus data file and its format description
    wigDir       -- directory handed to cgWig.loadSingleWigContext
    chrom        -- chromosome to process
    strand       -- strand to process (anything int() / str() accepts)
    switchStrand -- when true, both the wig strand and each record's strand
                    are negated before use, so records on ``strand`` are
                    annotated from the wig of the opposite strand

    For each matching record the comma-separated value stored at the
    record's start coordinate ('INTER' when absent) is split and passed to
    a bioLibCG.dominantSpotter, whose result is stored as the context.
    Records on other chromosomes/strands are left untouched.
    '''
    NX = Nexus(fN, fF)
    NX.load(['tcc', 'context'])

    strand = str(-int(strand)) if switchStrand else str(strand)

    # single-argument parenthesised print behaves the same on Python 2 and 3
    print('loading wig')
    coord_contexts = cgWig.loadSingleWigContext(wigDir, chrom, strand, 'context')
    print('done loading')

    # NOTE(review): the label order presumably encodes priority -- confirm
    # against bioLibCG.dominantSpotter
    ds = bioLibCG.dominantSpotter(['C_EXON', 'C_3UTR', 'C_5UTR', 'NC_EXON', 'NC_3UTR', 'NC_5UTR', 'C_INTRON', 'NC_INTRON', 'INTER'])

    while NX.nextID():

        oChrom, oStrand, start, end = bioLibCG.tccSplit(NX.tcc)

        # deg wigs is AS to actual clipping site: flip the RECORD's strand.
        # The previous code negated the already-flipped ``strand`` parameter
        # here, which made the comparison below impossible to satisfy.
        if switchStrand:
            oStrand = str(-int(oStrand))
        else:
            oStrand = str(oStrand)

        if oChrom == chrom and oStrand == strand:
            contexts = coord_contexts.get(start, 'INTER').split(',')
            NX.context = ds.spotItem(contexts)

    NX.save()
Esempio n. 4
0
def updateGeneName(dFN,
                   fFN,
                   wigDir,
                   chrom,
                   strand,
                   prefix,
                   switchStrand=False):
    '''Fill ``geneNames`` from a transcript wig, keyed by record start.

    dFN, fFN     -- Nexus data file and its format description
    wigDir, chrom, strand, prefix
                 -- passed to cgWig.loadSingleWigTranscript (strand as str)
    switchStrand -- negate ``strand`` before loading the wig

    For each record the wig value at the record's start coordinate is
    looked up ("." when absent).  The value "NONE" yields an empty list;
    anything else is split on commas.

    NOTE(review): unlike updateContext, records are not filtered by
    chromosome/strand here -- every record is looked up in this one wig.
    Confirm that callers rely on that.
    '''
    NX = Nexus(dFN, fFN)
    NX.load(['geneNames', 'tcc'])

    if switchStrand:
        # int() so a strand given as a string ('1' / '-1') can be negated
        # too; plain -strand raised TypeError for strings
        strand = -int(strand)

    strand = str(strand)
    coord_gName = cgWig.loadSingleWigTranscript(wigDir, chrom, strand, prefix)

    while NX.nextID():

        # separate names so the chrom/strand parameters are not overwritten
        oChrom, oStrand, start, end = bioLibCG.tccSplit(NX.tcc)

        overlappingGenes = coord_gName.get(start, ".")
        if overlappingGenes == "NONE":
            NX.geneNames = []
        else:
            NX.geneNames = overlappingGenes.split(',')

    NX.save()
def updateScores(fN, fFN):
    '''Recompute ``score`` for every record with calculateAlignmentScore.

    The seven count fields listed below are read from each record and
    passed, in this order, as positional arguments.
    '''
    scoreInputs = ['numNormMatches', 'numGUs', 'numMismatches', 'numQGaps',
                   'numRGaps', 'numExtensionsQ', 'numExtensionsR']

    NX = Nexus(fN, fFN)
    NX.load(scoreInputs + ['score'])

    while NX.nextID():
        counts = [getattr(NX, fieldName) for fieldName in scoreInputs]
        NX.score = calculateAlignmentScore(*counts)

    NX.save()
Esempio n. 6
0
def testPeaks(degFN, dForm, allGeneInfo, gForm, switchStrand = False):
    '''Store a binomial survival-function p-value (``pValBin``) per record.

    degFN, dForm       -- Nexus file of records (tcc, eLevel, geneNames)
    allGeneInfo, gForm -- Nexus file of genes (geneName, numReads, numSpots)
    switchStrand       -- negate the record strand before it is used to
                          build the alternative gene name

    For every gene name on a record, binom.sf(eLevel, numReads + 1,
    1.0 / (numSpots + 1)) is evaluated.  A gene name missing from the gene
    file is retried as ``<name>_RE_<chrom>_<strand>``; if that is missing
    too, a message is printed and the gene is skipped.  The record gets the
    largest p-value found, or -1.0 when no gene produced one.
    '''
    #load/configure gene Info
    gNX = Nexus(allGeneInfo, gForm)
    gNX.load(['geneName', 'numReads', 'numSpots'])

    # both maps are filled together, so they always share the same keys
    gName_numReads = {}
    gName_numSpots = {}
    while gNX.nextID():
        gName_numReads[gNX.geneName] = gNX.numReads
        gName_numSpots[gNX.geneName] = gNX.numSpots

    #load degFN info
    dNX = Nexus(degFN, dForm)
    dNX.load(['tcc', 'eLevel', 'geneNames', 'pValBin'])

    while dNX.nextID():

        gNames, readsForPeak = dNX.geneNames, dNX.eLevel
        chrom, strand, start, end = bioLibCG.tccSplit(dNX.tcc)
        if switchStrand:
            strand = -int(strand)

        pVals = []
        for gName in gNames:

            #may have to change gene name cuz of multiple spans
            if gName not in gName_numReads:
                gName = gName + '_RE_%s_%s' % (chrom, strand)
                if gName not in gName_numReads:
                    # one formatted argument prints identically on Python 2
                    # and 3 (the old statement form only parses on 2)
                    print('FIX THIS GENE NAME %s' % gName)
                    continue

            #add psuedocount
            totGeneReads = gName_numReads[gName] + 1
            numSpotsForGene = gName_numSpots[gName] + 1 # not sure whether to do this yet...

            #check for hidden intron gene overlap
            try:
                q = 1.0/numSpotsForGene
            except ZeroDivisionError:
                continue #intron gene

            #add p val
            pVals.append(binom.sf(readsForPeak, totGeneReads, q))

        dNX.pValBin = max(pVals) if pVals else -1.0

    dNX.save()
Esempio n. 7
0
def updateSNR(oFN, oFF):
    '''Store ``snr`` = numUFBS / avgNumSimSS on every record.

    When avgNumSimSS is zero the division uses 0.01 instead.
    '''
    NX = Nexus(oFN, oFF)
    NX.load(['avgNumSimSS', 'numUFBS', 'snr'])

    while NX.nextID():
        simAvg = NX.avgNumSimSS
        if simAvg == 0:
            # a zero average would raise ZeroDivisionError; use 0.01
            simAvg = 0.01
        NX.snr = NX.numUFBS/simAvg

    NX.save()
Esempio n. 8
0
def testNX(fN, fF):
    '''Overwrite four fields on every record of a Nexus file with test values.

    The Nexus is created with the field list 'geneName numReads isCoding
    otherIDs' passed as a third constructor argument and is iterated
    directly; each yielded object gets fixed values, then the file is saved.
    '''
    NX = Nexus(fN, fF, 'geneName numReads isCoding otherIDs')

    # parenthesised single-argument print: same output on Python 2 and 3
    print('START LOOPING')
    for gene in NX:
        gene.isCoding = True
        # explicit list: range() is only a list on Python 2
        gene.otherIDs = list(range(10))
        gene.geneName = "testAuto"
        gene.numReads = 300

    NX.save()
Esempio n. 9
0
def updateSNR(oFN, oFF):
    '''Compute the ``snr`` field as numUFBS divided by avgNumSimSS.

    A zero avgNumSimSS is replaced by 0.01 as the divisor.
    '''
    NX = Nexus(oFN, oFF)
    NX.load(['avgNumSimSS', 'numUFBS', 'snr'])

    while NX.nextID():
        # dividing by zero would raise; fall back to 0.01 in that case
        divisor = 0.01 if NX.avgNumSimSS == 0 else NX.avgNumSimSS
        NX.snr = NX.numUFBS / divisor

    NX.save()
Esempio n. 10
0
def updateSimilarSiblings(oFN, oFF, frameLength):
    '''Store, on every grouped record, the full membership of its group.

    Builds an id -> sequence map, passes it with ``frameLength`` to
    getSimilarORNASets, and for each returned set writes a list copy of
    that set into the ``siblingSet`` field of every id it contains.
    '''
    dataNX = Nexus(oFN, oFF)
    dataNX.load(['sequence', 'siblingSet'])

    seqByID = dataNX.createMap('id', 'sequence')
    for siblingGroup in getSimilarORNASets(seqByID, frameLength):
        for memberID in siblingGroup:
            # assigning .id appears to select the record that the next
            # attribute write applies to -- confirm against Nexus
            dataNX.id = memberID
            dataNX.siblingSet = list(siblingGroup)

    dataNX.save()
Esempio n. 11
0
def filterCenterProperties(fN, fFN):
    '''Fill ``mismatchPass`` and ``centerPass`` for every record.

    Both checks receive query, reference, qLen, rLen and the
    [start, end] ranges of query and reference; checkMismatchCenter
    additionally receives sigMask.
    '''
    fieldNames = ['query', 'reference', 'qStart', 'qEnd', 'rStart', 'rEnd', 'qLen', 'rLen', 'sigMask', 'centerPass', 'mismatchPass']
    NX = Nexus(fN, fFN)
    NX.load(fieldNames)

    while NX.nextID():
        # arguments shared by both checks, in call order
        shared = (NX.query, NX.reference, NX.qLen, NX.rLen,
                  [NX.qStart, NX.qEnd], [NX.rStart, NX.rEnd])

        NX.mismatchPass = checkMismatchCenter(*(shared + (NX.sigMask,)))
        NX.centerPass = checkPeakCenter(*shared)

    NX.save()
Esempio n. 12
0
def updateSimilarSiblings(oFN, oFF, frameLength):
    '''Write each similarity group onto all of its member records.

    getSimilarORNASets is called with the id -> sequence map of the file
    and ``frameLength``; every id in a returned set gets that set, as a
    fresh list, in its ``siblingSet`` field.
    '''
    dataNX = Nexus(oFN, oFF)
    dataNX.load(['sequence', 'siblingSet'])

    groups = getSimilarORNASets(dataNX.createMap('id', 'sequence'), frameLength)

    for group in groups:
        for recID in group:
            # presumably selects record recID for the write below --
            # TODO confirm against Nexus
            dataNX.id = recID
            # a new list per record, so records do not share one object
            dataNX.siblingSet = list(group)

    dataNX.save()
Esempio n. 13
0
def testAutoLoad(fN, ff):
    '''Overwrite four fields on every record without calling load() first.

    Walks the file with nextID() and assigns fixed test values to isCoding,
    otherIDs, geneName and numReads, then saves.
    '''
    NX = Nexus(fN, ff)

    # parenthesised single-argument print: same output on Python 2 and 3
    print('START LOOPING')
    while NX.nextID():

        NX.isCoding = True
        # explicit list: range() is only a list on Python 2
        NX.otherIDs = list(range(10))
        NX.geneName = "testAuto"
        NX.numReads = 300

    NX.save()
Esempio n. 14
0
def updateAdjustedMismatches(fN, fF, guValue = .5, otherValue = 1.0):
    '''Store a weighted mismatch count derived from ``sigMask``.

    adjustedNumMismatches = (#'G' in sigMask) * guValue
                          + (#'X' in sigMask) * otherValue

    NOTE(review): judging by the original variable names, 'G' presumably
    marks G-U pairs and 'X' gaps/mismatches -- confirm against the code
    that builds sigMask.
    '''
    NX = Nexus(fN, fF)
    NX.load(['sigMask', 'adjustedNumMismatches'])

    while NX.nextID():
        sigMask = NX.sigMask
        guPenalty = sigMask.count('G') * guValue
        otherPenalty = sigMask.count('X') * otherValue
        NX.adjustedNumMismatches = guPenalty + otherPenalty

    NX.save()
Esempio n. 15
0
def updateELevel2(dFN, dForm, wigDir):
    '''Set ``eLevel`` to the largest value in each record's expression profile.

    Original author's notes: no need to do this per chromosome because the
    data is small enough, and no need to flip the strand because the wig is
    opposite as well.
    '''
    NX = Nexus(dFN, dForm)
    NX.load(['tcc', 'eLevel'])

    expressionWigs = cgWig.loadWigDictFloat(wigDir)

    while NX.nextID():
        profile = cgWig.getExpressionProfile(NX.tcc, expressionWigs)
        peakLevel = max(profile.values())
        NX.eLevel = peakLevel

    NX.save()
Esempio n. 16
0
def updateSequence(oFN, oFF, extend, assembly):
    '''Fetch and store the sequence of each record's region, widened by ``extend``.

    The record's tcc is split, ``extend`` is subtracted from the start and
    added to the end, and the sequence for the resulting tcc is fetched
    from a GenomeFetch built for ``assembly``.
    '''
    NX = Nexus(oFN, oFF)
    NX.load(['sequence', 'tcc'])

    fetcher = GenomeFetch.GenomeFetch(assembly)

    while NX.nextID():
        chrom, strand, start, end = bioLibCG.tccSplit(NX.tcc)
        widenedTcc = bioLibCG.makeTcc(chrom, strand, start - extend,
                                      end + extend)
        NX.sequence = fetcher.getSequence(widenedTcc)

    NX.save()
Esempio n. 17
0
def updateScores(fN, fFN):
    '''Set ``score`` on every record from its seven alignment counts.

    calculateAlignmentScore receives numNormMatches, numGUs, numMismatches,
    numQGaps, numRGaps, numExtensionsQ and numExtensionsR, in that order.
    '''
    countFields = ('numNormMatches', 'numGUs', 'numMismatches', 'numQGaps',
                   'numRGaps', 'numExtensionsQ', 'numExtensionsR')

    NX = Nexus(fN, fFN)
    NX.load(list(countFields) + ['score'])

    while NX.nextID():
        scoreArgs = tuple(getattr(NX, name) for name in countFields)
        NX.score = calculateAlignmentScore(*scoreArgs)

    NX.save()
Esempio n. 18
0
def updateELevel2(dFN, dForm, wigDir):
    '''Store the maximum of each record's expression profile as ``eLevel``.

    Notes carried over from the original: processing per chromosome is not
    needed because the data is small enough, and the strand does not need
    flipping because the wig is opposite as well.
    '''
    NX = Nexus(dFN, dForm)
    NX.load(['tcc', 'eLevel'])

    wigValues = cgWig.loadWigDictFloat(wigDir)

    while NX.nextID():
        levelsByCoord = cgWig.getExpressionProfile(NX.tcc, wigValues)
        NX.eLevel = max(levelsByCoord.values())

    NX.save()
Esempio n. 19
0
def testQuickLoad(fN):
    '''Overwrite three fields on every record, using an inline format spec.

    The format is given to Nexus as a list of strings instead of a format
    file; no load() call is made before the fields are assigned.
    '''
    formatLines = [
        '1 geneName string .',
        '3 otherIDs intList .',
        '4 isCoding bool F',
    ]
    NX = Nexus(fN, formatLines)

    while NX.nextID():
        NX.isCoding = True
        NX.otherIDs = range(18)
        NX.geneName = "testAuto"

    NX.save()
Esempio n. 20
0
def updateSequence(oFN, oFF, extend, assembly):
    '''Replace each record's ``sequence`` with that of its extended region.

    Each region's start is moved down and its end moved up by ``extend``
    before the sequence is requested from GenomeFetch for ``assembly``.
    '''
    NX = Nexus(oFN, oFF)
    NX.load(['sequence', 'tcc'])

    gf = GenomeFetch.GenomeFetch(assembly)

    while NX.nextID():
        chrom, strand, regionStart, regionEnd = bioLibCG.tccSplit(NX.tcc)
        regionStart -= extend
        regionEnd += extend
        NX.sequence = gf.getSequence(
            bioLibCG.makeTcc(chrom, strand, regionStart, regionEnd))

    NX.save()
def updateTargetIDs(oFN, oFF, aFN, aFF):
    '''Append every record id of the second file to its sID's ``filteredTargets``.

    For each record of aFN, the record of oFN whose id equals that
    record's ``sID`` is selected and the aFN record's id is appended to
    the ``filteredTargets`` list it already holds.

    NOTE(review): the list is extended in place rather than reassigned;
    whether Nexus.save() picks up in-place changes is not visible here --
    confirm.
    '''
    NX = Nexus(oFN, oFF)
    NX.load(['filteredTargets'])

    aNX = Nexus(aFN, aFF)
    aNX.load(['sID'])

    while aNX.nextID():
        NX.id = aNX.sID
        currentTargets = NX.filteredTargets
        currentTargets.append(aNX.id)

    NX.save()
Esempio n. 22
0
def updateSimSeqsForUnique(oFN, oFF, seqFN):
    '''Set each record's ``sequence`` from a tab-separated id/sequence file.

    oFN, oFF -- Nexus data file and its format description
    seqFN    -- text file with one ``<int id>\\t<sequence>`` pair per line

    Records whose id does not occur in seqFN get '.'.

    Raises ValueError / IndexError for a non-blank line that does not
    start with an integer id followed by a second column.
    '''
    NX = Nexus(oFN, oFF)
    NX.load(['sequence'])

    id_seq = {}
    # 'with' closes the file even if a line fails to parse
    with open(seqFN, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # blank lines (e.g. a trailing newline) carry no data
            ls = line.split('\t')
            id_seq[int(ls[0])] = ls[1]

    while NX.nextID():
        NX.sequence = id_seq.get(NX.id, '.')

    NX.save()
Esempio n. 23
0
def updateSimSeqsForUnique(oFN, oFF, seqFN):
    '''Copy sequences from a tab-separated file into the ``sequence`` field.

    oFN, oFF -- Nexus data file and its format description
    seqFN    -- text file; each line holds an integer id, a tab, and the
                sequence for that id

    A record whose id is absent from seqFN is given '.'.

    Raises ValueError / IndexError when a non-blank line lacks an integer
    first column or a second column.
    '''
    NX = Nexus(oFN, oFF)
    NX.load(['sequence'])

    id_seq = {}
    # context manager: the handle is released even when parsing raises
    with open(seqFN, 'r') as seqFile:
        for rawLine in seqFile:
            stripped = rawLine.strip()
            if not stripped:
                continue  # skip empty lines instead of failing in int('')
            columns = stripped.split('\t')
            id_seq[int(columns[0])] = columns[1]

    while NX.nextID():
        NX.sequence = id_seq.get(NX.id, '.')

    NX.save()
Esempio n. 24
0
def filterCenterProperties(fN, fFN):
    '''Evaluate the two center checks for every record and store the results.

    ``mismatchPass`` comes from checkMismatchCenter and ``centerPass`` from
    checkPeakCenter.  Both are given query, reference, qLen, rLen and the
    query/reference [start, end] pairs; the mismatch check also gets
    sigMask.
    '''
    NX = Nexus(fN, fFN)
    NX.load(['query', 'reference', 'qStart', 'qEnd', 'rStart', 'rEnd',
             'qLen', 'rLen', 'sigMask', 'centerPass', 'mismatchPass'])

    while NX.nextID():
        queryRange = [NX.qStart, NX.qEnd]
        refRange = [NX.rStart, NX.rEnd]
        # positional arguments common to both checks
        commonArgs = [NX.query, NX.reference, NX.qLen, NX.rLen,
                      queryRange, refRange]

        NX.mismatchPass = checkMismatchCenter(*(commonArgs + [NX.sigMask]))
        NX.centerPass = checkPeakCenter(*commonArgs)

    NX.save()
def updateRepeatStatus(fN, fF, wigDir, chrom, strand):
    '''Flag records on one chromosome/strand that touch a 'REPEAT' wig entry.

    A record whose tcc chromosome and strand equal ``chrom`` and ``strand``
    gets ``repeat = True`` when any coordinate from its start to its end
    (inclusive) is a key of the loaded wig, otherwise ``repeat = False``.
    Records on other chromosomes/strands are not written.
    '''
    NX = Nexus(fN, fF)
    NX.load(['repeat', 'tcc'])

    repeatCoords = cgWig.loadSingleWig(wigDir, chrom, strand, 'REPEAT')

    while NX.nextID():
        oChrom, oStrand, start, end = bioLibCG.tccSplit(NX.tcc)
        if oChrom == chrom and oStrand == strand:
            # any() stops at the first coordinate found in the wig
            NX.repeat = any(pos in repeatCoords
                            for pos in range(start, end + 1))

    NX.save()
Esempio n. 26
0
def updateGeneName(dFN, fFN, wigDir, chrom, strand, prefix, switchStrand = False):
    '''Assign ``geneNames`` to records from a transcript wig.

    dFN, fFN     -- Nexus data file and its format description
    wigDir, chrom, strand, prefix
                 -- arguments for cgWig.loadSingleWigTranscript; strand is
                    converted to a string first
    switchStrand -- negate ``strand`` before the wig is loaded

    The wig value stored at each record's start coordinate ("." when the
    coordinate is absent) becomes the gene-name list: "NONE" gives an
    empty list, any other value is split on commas.

    NOTE(review): every record is looked up in this one wig, whatever its
    own chromosome or strand is (updateContext filters on both) -- confirm
    callers expect that.
    '''
    NX = Nexus(dFN, fFN)
    NX.load(['geneNames', 'tcc'])

    if switchStrand:
        # -int(...) also handles a strand passed as text, matching
        # updateContext; a bare -strand fails with TypeError on strings
        strand = -int(strand)

    strand = str(strand)
    coord_gName = cgWig.loadSingleWigTranscript(wigDir, chrom, strand, prefix)

    while NX.nextID():

        # record-local names: do not clobber the chrom/strand parameters
        oChrom, oStrand, start, end = bioLibCG.tccSplit(NX.tcc)

        overlappingGenes = coord_gName.get(start, ".")
        if overlappingGenes == "NONE":
            NX.geneNames = []
        else:
            NX.geneNames = overlappingGenes.split(',')

    NX.save()
Esempio n. 27
0
def linkTargetIDs(oFN, oFF, aFN, aFF):
    '''Store, per record of oFN, the ids of the aFN records that point at it.

    When aFN is an empty file, every record's ``filteredTargets`` is set
    to an empty list.  Otherwise a map from ``sID`` to the matching aFN
    ids is built with createMap('sID', 'id', False) and each mapped value
    is assigned to the record whose id equals the sID.
    '''
    oNX = Nexus(oFN, oFF)
    oNX.load(['filteredTargets'])

    if os.path.getsize(aFN) == 0:
        #just give it some blanks
        while oNX.nextID():
            oNX.filteredTargets = []
    else:
        aNX = Nexus(aFN, aFF)
        aNX.load(['sID'])

        # NOTE(review): the False argument presumably makes createMap
        # collect several ids per sID -- confirm against Nexus.createMap
        targetsBySID = aNX.createMap('sID', 'id', False)
        for sourceID, targetIDs in targetsBySID.iteritems():
            oNX.id = sourceID
            oNX.filteredTargets = targetIDs

    oNX.save()
Esempio n. 28
0
def cleanForSNR(dataFN, oFF):
    '''Set ``snrClean`` on every record.

    Within each sibling set of more than one member, members are ordered
    by (numUFBS, id); the last one is kept and all others are marked
    unused.  A record is clean (True) unless it is unused or its
    ``numUniqueSims`` is below 10.
    '''
    dataNX = Nexus(dataFN, oFF)
    dataNX.load(['numUniqueSims', 'numUFBS', 'snrClean', 'siblingSet'])
    id_numUFBS = dataNX.createMap('id', 'numUFBS')
    id_siblingSet = dataNX.createMap('id', 'siblingSet')

    # a set, because membership is tested once per record below
    unusedSiblings = set()
    for siblingSet in id_siblingSet.values():
        # NOTE: oRNA IDs are in their own sibling set, so a set of one has
        # nothing to discard; an empty set is skipped too (it used to
        # raise IndexError on pop())
        if len(siblingSet) <= 1:
            continue
        ranked = sorted((id_numUFBS[x], x) for x in siblingSet)
        # everything but the last (highest-ranked) entry is discarded
        unusedSiblings.update(sibID for _, sibID in ranked[:-1])

    #tag unclean oRNA
    while dataNX.nextID():
        if (dataNX.id in unusedSiblings) or (dataNX.numUniqueSims < 10):
            dataNX.snrClean = False
        else:
            dataNX.snrClean = True
    dataNX.save()
Esempio n. 29
0
def cleanForSNR(dataFN, oFF):
    '''Mark which records are usable, via the ``snrClean`` field.

    For every sibling set with at least two members, the member that sorts
    last by (numUFBS, id) is kept and the rest are discarded.  ``snrClean``
    is False for discarded records and for records with fewer than 10
    ``numUniqueSims``; it is True for all others.
    '''
    dataNX = Nexus(dataFN, oFF)
    dataNX.load(['numUniqueSims', 'numUFBS', 'snrClean', 'siblingSet'])
    id_numUFBS = dataNX.createMap('id', 'numUFBS')
    id_siblingSet = dataNX.createMap('id', 'siblingSet')

    # set instead of list: the per-record 'in' test below stays O(1)
    discardedIDs = set()
    for siblings in id_siblingSet.values():
        # NOTE: oRNA IDs are in their own sibling set.  Sets of size 0 or 1
        # have no member to discard (size 0 previously crashed in pop()).
        if len(siblings) <= 1:
            continue
        byUFBS = sorted((id_numUFBS[sib], sib) for sib in siblings)
        byUFBS.pop()  #take last one (one we're keeping) out of list
        discardedIDs.update(entry[1] for entry in byUFBS)

    #tag unclean oRNA
    while dataNX.nextID():
        if (dataNX.id in discardedIDs) or (dataNX.numUniqueSims < 10):
            dataNX.snrClean = False
        else:
            dataNX.snrClean = True
    dataNX.save()
Esempio n. 30
0
def updateNumUFBS(oFN, oFF, aFN, aFF):
    '''Set ``numUFBS`` to the number of distinct sigMasks among a record's targets.

    When aFN is an empty file every record gets 0.  Otherwise, for each
    record of oFN, the ``sigMask`` of every id in its ``filteredTargets``
    is read from aFN and the count of distinct values is stored.
    '''
    oNX = Nexus(oFN, oFF)
    oNX.load(['filteredTargets', 'numUFBS'])

    if os.path.getsize(aFN) == 0:
        #just give it some blanks
        while oNX.nextID():
            oNX.numUFBS = 0
    else:
        aNX = Nexus(aFN, aFF)
        aNX.load(['sigMask'])

        while oNX.nextID():
            distinctMasks = set()
            for targetID in oNX.filteredTargets:
                # select the target record, then read its mask
                aNX.id = targetID
                distinctMasks.add(aNX.sigMask)
            oNX.numUFBS = len(distinctMasks)

    oNX.save()
Esempio n. 31
0
def linkTargetIDs(oFN, oFF, aFN, aFF):
    '''Fill ``filteredTargets`` of oFN records from the sID links in aFN.

    An empty aFN gives every record an empty list.  Otherwise the map
    returned by createMap('sID', 'id', False) is walked and each value is
    stored on the oFN record whose id is the map key.
    '''
    oNX = Nexus(oFN, oFF)
    oNX.load(['filteredTargets'])

    alignmentsMissing = os.path.getsize(aFN) == 0
    if alignmentsMissing:
        #just give it some blanks
        while oNX.nextID():
            oNX.filteredTargets = []
        oNX.save()
        return

    aNX = Nexus(aFN, aFF)
    aNX.load(['sID'])

    # NOTE(review): presumably sID -> list of ids because of the False
    # flag -- confirm against Nexus.createMap
    linkedIDs = aNX.createMap('sID', 'id', False)

    for entry in linkedIDs.iteritems():
        oNX.id = entry[0]
        oNX.filteredTargets = entry[1]

    oNX.save()
Esempio n. 32
0
def updateNumUFBS(oFN, oFF, aFN, aFF):
    '''Count, per record, the distinct ``sigMask`` values of its filtered targets.

    The count is stored in ``numUFBS``.  If aFN has size zero, all records
    are given 0 and the target file is never opened as a Nexus.
    '''
    oNX = Nexus(oFN, oFF)
    oNX.load(['filteredTargets', 'numUFBS'])

    noAlignments = os.path.getsize(aFN) == 0
    if noAlignments:
        #just give it some blanks
        while oNX.nextID():
            oNX.numUFBS = 0
        oNX.save()
        return

    aNX = Nexus(aFN, aFF)
    aNX.load(['sigMask'])

    while oNX.nextID():
        masksSeen = set()
        for aID in oNX.filteredTargets:
            aNX.id = aID  # move to the target record before reading it
            mask = aNX.sigMask
            masksSeen.add(mask)
        oNX.numUFBS = len(masksSeen)

    oNX.save()
Esempio n. 33
0
def pickBestAlignment(fN, fFN):
    '''Flag the highest-scoring record of every (sID, dID) pair as ``best``.

    First pass: per 'sID_dID' key, remember the id of the record whose
    score is strictly greater than the best seen so far (0.0 before any
    record qualifies).  Second pass: set ``best = True`` on every
    remembered id; other records are not written.

    NOTE(review): because the starting threshold is 0.0, a pair whose
    scores are all <= 0 ends up with no best record -- confirm intended.
    NOTE(review): the second pass assumes nextID() iterates again after
    the first pass ran to completion -- confirm against Nexus.
    '''
    NX = Nexus(fN, fFN)
    NX.load(['sID', 'dID', 'score', 'best'])

    leaderByPair = {}  # 'sID_dID' -> (score, id) of the current leader
    while NX.nextID():
        pairKey = '%s_%s' % (NX.sID, NX.dID)
        candidateScore = NX.score
        leaderScore = leaderByPair.get(pairKey, (0.0, None))[0]
        if candidateScore > leaderScore:
            leaderByPair[pairKey] = (candidateScore, NX.id)

    winnerIDs = set([recID for _, recID in leaderByPair.values()])
    while NX.nextID():
        if NX.id in winnerIDs:
            NX.best = True

    NX.save()
Esempio n. 34
0
def pickBestAlignment(fN, fFN):
    '''Set ``best = True`` on the top-scoring record of each sID/dID pair.

    A record replaces the pair's current top record only when its score
    is strictly greater than the stored one; the stored score starts at
    0.0.  Records that are not the top of their pair are left unwritten.

    NOTE(review): pairs with no score above 0.0 get no best record, and
    the second loop relies on nextID() being usable again after the first
    loop finished -- both should be confirmed.
    '''
    NX = Nexus(fN, fFN)
    NX.load(['sID', 'dID', 'score', 'best'])

    topScoreByPair = {}
    topIDByPair = {}
    while NX.nextID():
        key = '%s_%s' % (NX.sID, NX.dID)
        currentScore = NX.score
        if not (currentScore > topScoreByPair.get(key, 0.0)):
            continue  # does not beat the stored score for this pair
        topScoreByPair[key] = currentScore
        topIDByPair[key] = NX.id

    # update best id
    topIDs = set(topIDByPair.values())
    while NX.nextID():
        isTop = NX.id in topIDs
        if isTop:
            NX.best = True

    NX.save()