Beispiel #1
0
def testOverlaps(dataFN, oFF):

    dataNX = Nexus(dataFN, oFF)
    dataNX.load(['tcc'])

    #check for overlaps
    overlappingIDs = set()
    chrom_strand_range = {}
    while dataNX.nextID():
        chrom, strand, start, end = bioLibCG.tccSplit(dataNX.tcc)

        #check if overlap
        chrom_strand_range.setdefault(chrom, {}).setdefault(strand, set())
        overlap = False
        for i in range(start, end + 1):
            if i in chrom_strand_range[chrom][strand]:
                overlap = True
                break

        #tag or add these coordinates
        if overlap:
            overlappingIDs.add(dataNX.id)
        else:
            for i in range(start, end + 1):
                chrom_strand_range[chrom][strand].add(i)

    print "THESE OVERLAP", overlappingIDs
Beispiel #2
0
def testOverlaps(dataFN, oFF):

    dataNX = Nexus(dataFN, oFF)
    dataNX.load(['tcc'])

    #check for overlaps
    overlappingIDs = set()
    chrom_strand_range = {}
    while dataNX.nextID():
        chrom, strand, start, end = bioLibCG.tccSplit(dataNX.tcc)

        #check if overlap
        chrom_strand_range.setdefault(chrom, {}).setdefault(strand, set())
        overlap = False
        for i in range(start, end + 1):
            if i in chrom_strand_range[chrom][strand]:
                overlap = True
                break

        #tag or add these coordinates 
        if overlap:
            overlappingIDs.add(dataNX.id)
        else:
            for i in range(start, end + 1):
                chrom_strand_range[chrom][strand].add(i)

    print "THESE OVERLAP", overlappingIDs
Beispiel #3
0
def check_ORNA_in_ago(oFN, oFF, agoFN, clippingAmount = 1):
   
    NX = Nexus(oFN, oFF)
    NX.load(['sequence', 'geneNames'])
    
    #make truncated sequences
    id_sequence = NX.createMap('id', 'sequence')
    if clippingAmount > 0:
        id_sequence = dict( (i, j[clippingAmount:-clippingAmount]) for i,j in id_sequence.items())
   
    #get fastq sequences
    agoF = open(agoFN, 'r')
    agoSeqs = []
    while True:
        fPacket = nextFilePacket(agoF, 4)
        if not fPacket: break
        agoSeqs.append(fPacket[1])
    agoF.close()

    #count for each oRNA
    id_count = {}
    for id, seq in id_sequence.items():
        for agoSeq in agoSeqs:
            if seq in agoSeq:
                id_count[id] = id_count.get(id, 0) + 1

    #out
    totalCount = 0
    for id, count in id_count.items():
        NX.id = id
        print '%s\t%s\t%s' % (id, count, NX.geneNames)
        totalCount += count

    print totalCount
Beispiel #4
0
def testMap(fN, fF):

    NX = Nexus(fN, fF)
    NX.load(['geneName', 'numReads', 'otherIDs'])

    geneName_numReads = NX.createMap('otherIDs', 'geneName', False) #not 1to1

    for k,v in geneName_numReads.iteritems():
        print k, v[:5]
        return
def updateGeneName(dFN,
                   fFN,
                   wigDir,
                   chrom,
                   strand,
                   prefix,
                   switchStrand=False):
    """Set geneNames on every record from a single transcript wig and save.

    dFN, fFN      -- Nexus data file and format holding 'geneNames' and 'tcc'
    wigDir, chrom, strand, prefix
                  -- passed to cgWig.loadSingleWigTranscript
    strand        -- 1 / -1, given either as an int or as a string
    switchStrand  -- load the wig of the opposite strand

    The wig value at a record's start coordinate is looked up: "NONE" gives
    an empty list, anything else is split on commas.  A missing coordinate
    falls back to "." and therefore yields ['.'].
    """
    NX = Nexus(dFN, fFN)
    NX.load(['geneNames', 'tcc'])

    # int() accepts both 1 and '1'; negating a string directly raises
    # TypeError
    if switchStrand:
        strand = -int(strand)

    strand = str(strand)
    coord_gName = cgWig.loadSingleWigTranscript(wigDir, chrom, strand, prefix)

    while NX.nextID():

        # separate names so the chrom/strand arguments are not overwritten;
        # only the start coordinate is used for the lookup
        oChrom, oStrand, start, end = bioLibCG.tccSplit(NX.tcc)

        # NOTE(review): records are not filtered by chrom/strand here, unlike
        # updateContext -- confirm the data file is already split that way
        overlappingGenes = coord_gName.get(start, ".")
        if overlappingGenes == "NONE":
            NX.geneNames = []
        else:
            NX.geneNames = overlappingGenes.split(',')

    NX.save()
Beispiel #6
0
def testNX(fN, fF):

    NX = Nexus(fN, fF, 'geneName numReads isCoding otherIDs')

    print 'START LOOPING'
    for gene in NX:
        gene.isCoding = True
        gene.otherIDs = range(10)
        gene.geneName = "testAuto"
        gene.numReads = 300
    
    NX.save()
Beispiel #7
0
def getTotalSpots(allFN, formatFN):

    NX = Nexus(allFN, formatFN)
    NX.load(['numNSpots', 'numReads'])

    totalSpots = 0
    totalReads = 0
    while NX.nextID():

        totalSpots += NX.numNSpots
        totalReads += NX.numReads

    print 'spotsNSpots', totalSpots
    print 'numReads', totalReads
def updateContext(fN, fF, wigDir, chrom, strand, switchStrand = False):
        
    NX = Nexus(fN, fF)
    NX.load(['tcc', 'context'])
    
    if switchStrand:
        strand = str(-int(strand))
    else:
        strand = str(strand)
    
    print 'loading wig'
    coord_contexts = cgWig.loadSingleWigContext(wigDir, chrom, strand, 'context') 
    print 'done loading'

    ds = bioLibCG.dominantSpotter(['C_EXON', 'C_3UTR', 'C_5UTR', 'NC_EXON', 'NC_3UTR', 'NC_5UTR', 'C_INTRON', 'NC_INTRON', 'INTER']) 


    while NX.nextID():

        oChrom, oStrand, start, end = bioLibCG.tccSplit(NX.tcc)
        
        #deg wigs is AS to actual clipping site
        if switchStrand:
            oStrand = str(-int(strand))
        else:
            oStrand = str(oStrand)

        if oChrom == chrom and oStrand == strand:

            contexts = coord_contexts.get(start, 'INTER').split(',')
            NX.context = ds.spotItem(contexts)

    
    NX.save()
Beispiel #9
0
def updateSNR(oFN, oFF):
    """Set snr = numUFBS / avgNumSimSS on every record and save.

    When the division raises ZeroDivisionError, 0.01 is used as the
    denominator instead.
    """
    NX = Nexus(oFN, oFF)
    NX.load(['avgNumSimSS', 'numUFBS', 'snr'])

    while NX.nextID():
        numerator = NX.numUFBS
        try:
            ratio = numerator / NX.avgNumSimSS
        except ZeroDivisionError:
            ratio = numerator / 0.01
        NX.snr = ratio

    NX.save()
Beispiel #10
0
def updateSimilarSiblings(oFN, oFF, frameLength):
    """Store on each oRNA the IDs of the consolidated set it belongs to.

    The sets come from getSimilarORNASets(id -> sequence map, frameLength).
    """
    dataNX = Nexus(oFN, oFF)
    dataNX.load(['sequence', 'siblingSet'])

    siblingGroups = getSimilarORNASets(dataNX.createMap('id', 'sequence'),
                                       frameLength)

    for group in siblingGroups:
        for memberID in group:
            # every member receives its own list copy of the group
            dataNX.id = memberID
            dataNX.siblingSet = list(group)

    dataNX.save()
Beispiel #11
0
def updateSNR(oFN, oFF):
    """Recompute snr for all records as numUFBS divided by avgNumSimSS.

    A ZeroDivisionError falls back to dividing by 0.01.  The result is
    written back with save().
    """
    NX = Nexus(oFN, oFF)
    NX.load(['avgNumSimSS', 'numUFBS', 'snr'])

    while NX.nextID():
        observed = NX.numUFBS
        try:
            snrValue = observed / NX.avgNumSimSS
        except ZeroDivisionError:
            snrValue = observed / 0.01
        NX.snr = snrValue

    NX.save()
def updateScores(fN, fFN):
    """Recompute every record's score from its alignment counts and save."""
    scoreInputs = ['numNormMatches', 'numGUs', 'numMismatches', 'numQGaps',
                   'numRGaps', 'numExtensionsQ', 'numExtensionsR']

    NX = Nexus(fN, fFN)
    NX.load(scoreInputs + ['score'])

    while NX.nextID():
        # arguments are passed in the order listed in scoreInputs
        counts = [getattr(NX, name) for name in scoreInputs]
        NX.score = calculateAlignmentScore(*counts)

    NX.save()
def filterCenterProperties(fN, fFN):
    """Fill in mismatchPass and centerPass for every record and save.

    Both checks receive query, reference, qLen, rLen and the [start, end]
    ranges of query and reference; checkMismatchCenter additionally gets
    sigMask.
    """
    NX = Nexus(fN, fFN)
    NX.load(['query', 'reference', 'qStart', 'qEnd', 'rStart', 'rEnd',
             'qLen', 'rLen', 'sigMask', 'centerPass', 'mismatchPass'])

    while NX.nextID():
        qRange = [NX.qStart, NX.qEnd]
        rRange = [NX.rStart, NX.rEnd]
        shared = (NX.query, NX.reference, NX.qLen, NX.rLen, qRange, rRange)

        NX.mismatchPass = checkMismatchCenter(*(shared + (NX.sigMask,)))
        NX.centerPass = checkPeakCenter(*shared)

    NX.save()
Beispiel #14
0
def updateSimilarSiblings(oFN, oFF, frameLength):
    """Write each consolidated set of similar oRNAs onto all of its members.

    getSimilarORNASets is called with the id -> sequence map and frameLength;
    every ID in a returned set gets siblingSet set to a list of that set.
    """
    dataNX = Nexus(oFN, oFF)
    dataNX.load(['sequence', 'siblingSet'])

    sequenceByID = dataNX.createMap('id', 'sequence')

    for similarSet in getSimilarORNASets(sequenceByID, frameLength):
        for oID in similarSet:
            # a fresh list per member, so members never share one object
            dataNX.id = oID
            dataNX.siblingSet = list(similarSet)

    dataNX.save()
Beispiel #15
0
def testAutoLoad(fN, ff):

    NX = Nexus(fN, ff)

    print 'START LOOPING'
    while NX.nextID():

        NX.isCoding = True
        NX.otherIDs = range(10)
        NX.geneName = "testAuto"
        NX.numReads = 300

    NX.save()
Beispiel #16
0
def check_ORNA_in_ago(oFN, oFF, agoFN, clippingAmount=1):

    NX = Nexus(oFN, oFF)
    NX.load(['sequence', 'geneNames'])

    #make truncated sequences
    id_sequence = NX.createMap('id', 'sequence')
    if clippingAmount > 0:
        id_sequence = dict((i, j[clippingAmount:-clippingAmount])
                           for i, j in id_sequence.items())

    #get fastq sequences
    agoF = open(agoFN, 'r')
    agoSeqs = []
    while True:
        fPacket = nextFilePacket(agoF, 4)
        if not fPacket: break
        agoSeqs.append(fPacket[1])
    agoF.close()

    #count for each oRNA
    id_count = {}
    for id, seq in id_sequence.items():
        for agoSeq in agoSeqs:
            if seq in agoSeq:
                id_count[id] = id_count.get(id, 0) + 1

    #out
    totalCount = 0
    for id, count in id_count.items():
        NX.id = id
        print '%s\t%s\t%s' % (id, count, NX.geneNames)
        totalCount += count

    print totalCount
Beispiel #17
0
def collectData(fN, fFN, outFN, logHeight = False):
    '''x(pHeight) v. y(-log10(pval)).  0.0 is 1.0e-100'''

    NX = Nexus(fN, fFN)
    NX.load(['eLevel', 'pValBin'])

    f = open(outFN, 'w')
    while NX.nextID():

        x = math.log(NX.eLevel, 10) if logHeight else NX.eLevel
        pVal = NX.pValBin
        if pVal < 0: continue
        pVal = pVal if (pVal != 0.0) else float("1.0e-100") 
                
        try:
            y = -math.log(pVal, 10)
        except ValueError:
            print x, pVal
            return

        f.write('%s\t%s\n' % (x,y))
    f.close()
Beispiel #18
0
def testConsolidation(oFN, oFF, frameLength):

    dataNX = Nexus(oFN, oFF)
    dataNX.load(['sequence'])
    oID_sequence = dataNX.createMap('id', 'sequence')
    consolidatedSets = getSimilarORNASets(oID_sequence, frameLength)

    #check if all oIDs are in set
    allConsolidatedIDs = set()
    [allConsolidatedIDs.add(x) for theSet in consolidatedSets for x in theSet]
    oIDsSet = set(oID_sequence.keys())
    print "DIFFERENCE"
    print oIDsSet.symmetric_difference(allConsolidatedIDs)

    #check Duplicates

    #print out sets to verify that they work
    for oIDSet in consolidatedSets:
        print
        print oIDSet
        for oID in oIDSet:
            print oID, oID_sequence[oID]
Beispiel #19
0
def cleanForSNR(dataFN, oFF):
    """Set snrClean on every record and save.

    Within each sibling set of two or more IDs only the member that sorts
    last by (numUFBS, id) is kept; all other members are marked unused.
    A record is clean when it is not unused and has numUniqueSims >= 10.
    """
    dataNX = Nexus(dataFN, oFF)
    dataNX.load(['numUniqueSims', 'numUFBS', 'snrClean', 'siblingSet'])
    id_numUFBS = dataNX.createMap('id', 'numUFBS')
    id_siblingSet = dataNX.createMap('id', 'siblingSet')

    # A set gives constant-time membership tests in the loop below and
    # absorbs repeats when the same sibling set is stored on several records.
    unusedSiblings = set()
    for siblingSet in id_siblingSet.itervalues():
        #NOTE: oRNA IDs are in their own sibling set; an empty set has
        #nothing to rank either
        if len(siblingSet) < 2: continue
        numUFBS__id = sorted((id_numUFBS[x], x) for x in siblingSet)
        numUFBS__id.pop() #take last one (one we're keeping) out of list
        unusedSiblings.update(x[1] for x in numUFBS__id)

    #tag unclean oRNA
    while dataNX.nextID():
        if (dataNX.id in unusedSiblings) or (dataNX.numUniqueSims < 10):
            dataNX.snrClean = False
        else:
            dataNX.snrClean = True
    dataNX.save()
Beispiel #20
0
def testConsolidation(oFN, oFF, frameLength):

    dataNX = Nexus(oFN, oFF)
    dataNX.load(['sequence'])
    oID_sequence = dataNX.createMap('id', 'sequence')
    consolidatedSets = getSimilarORNASets(oID_sequence, frameLength)
    
    #check if all oIDs are in set
    allConsolidatedIDs = set()
    [allConsolidatedIDs.add(x) for theSet in consolidatedSets for x in theSet]    
    oIDsSet = set(oID_sequence.keys())
    print "DIFFERENCE"
    print oIDsSet.symmetric_difference(allConsolidatedIDs)

    #check Duplicates
     
    #print out sets to verify that they work 
    for oIDSet in consolidatedSets:
        print 
        print oIDSet
        for oID in oIDSet:
            print oID, oID_sequence[oID]
Beispiel #21
0
def updateSequence(oFN, oFF, extend, assembly):
    """Replace each record's sequence with that of its widened tcc and save.

    The start is moved down and the end up by `extend` before the sequence
    is fetched through GenomeFetch for the given assembly.
    """
    NX = Nexus(oFN, oFF)
    NX.load(['sequence', 'tcc'])

    fetcher = GenomeFetch.GenomeFetch(assembly)

    while NX.nextID():
        chrom, strand, left, right = bioLibCG.tccSplit(NX.tcc)
        widenedTcc = bioLibCG.makeTcc(chrom, strand,
                                      left - extend, right + extend)
        NX.sequence = fetcher.getSequence(widenedTcc)

    NX.save()
Beispiel #22
0
def updateELevel2(dFN, dForm, wigDir):
    '''Set eLevel to the largest value of each record's expression profile.

    Dont need to do it by chromosome because it is small enough.
    Also dont need to flip the strand because the wig is opposite as well.
    '''
    NX = Nexus(dFN, dForm)
    NX.load(['tcc', 'eLevel'])

    wigDict = cgWig.loadWigDictFloat(wigDir)

    while NX.nextID():
        profile = cgWig.getExpressionProfile(NX.tcc, wigDict)
        levels = profile.values()
        NX.eLevel = max(levels)

    NX.save()
Beispiel #23
0
def updateScores(fN, fFN):
    """Store calculateAlignmentScore of each record's counts in score; save."""
    countFields = ('numNormMatches', 'numGUs', 'numMismatches', 'numQGaps',
                   'numRGaps', 'numExtensionsQ', 'numExtensionsR')

    NX = Nexus(fN, fFN)
    NX.load(list(countFields) + ['score'])

    while NX.nextID():
        # positional arguments follow the order of countFields
        args = [getattr(NX, field) for field in countFields]
        NX.score = calculateAlignmentScore(*args)

    NX.save()
Beispiel #24
0
def filterCenterProperties(fN, fFN):
    """Evaluate the mismatch-center and peak-center checks per record; save.

    mismatchPass comes from checkMismatchCenter (which also receives
    sigMask), centerPass from checkPeakCenter.
    """
    fields = ['query', 'reference', 'qStart', 'qEnd', 'rStart', 'rEnd',
              'qLen', 'rLen', 'sigMask', 'centerPass', 'mismatchPass']

    NX = Nexus(fN, fFN)
    NX.load(fields)

    while NX.nextID():
        queryRange = [NX.qStart, NX.qEnd]
        refRange = [NX.rStart, NX.rEnd]

        mismatchOK = checkMismatchCenter(NX.query, NX.reference, NX.qLen,
                                         NX.rLen, queryRange, refRange,
                                         NX.sigMask)
        NX.mismatchPass = mismatchOK

        centerOK = checkPeakCenter(NX.query, NX.reference, NX.qLen, NX.rLen,
                                   queryRange, refRange)
        NX.centerPass = centerOK

    NX.save()
Beispiel #25
0
def updateSimSeqsForUnique(oFN, oFF, seqFN):
    """Replace every record's sequence with the one listed in seqFN and save.

    seqFN holds tab-separated lines: an integer ID in the first column and a
    sequence in the second.  Records whose ID is not listed get '.'.
    """
    NX = Nexus(oFN, oFF)
    NX.load(['sequence'])

    id_seq = {}
    # the with-block closes the file even when a line fails to parse
    with open(seqFN, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # tolerate blank lines, e.g. at the end of the file
            ls = line.split('\t')
            id_seq[int(ls[0])] = ls[1]

    while NX.nextID():
        NX.sequence = id_seq.get(NX.id, '.')

    NX.save()
Beispiel #26
0
def calculateTotalSNR(dataFN, oFF, mm, iSNRCutoff):
    mm = str(mm)
    
    dataNX = Nexus(dataFN, oFF)
    dataNX.load(['snr', 'numUFBS', 'numUniqueSims', 'totalNumUFBSSim']) 

    #Check SNR Cutoffs
    dataIDs = set()
    lowSNRORNA = set()
    while dataNX.nextID():
        dataIDs.add(dataNX.id)
        if dataNX.snr < iSNRCutoff:
            lowSNRORNA.add(dataNX.id)

    #NOTE sum of avgs != total avg
    #get total numUFBS for data and simulation total,num unique sims
    totalUFBSData = 0.0
    totalUFBSSim = 0.0
    totalPassingORNA = 0
    totalUniqueSims = 0
    while dataNX.nextID():
        if (dataNX.id in lowSNRORNA): continue
        totalUFBSData += dataNX.numUFBS
        totalPassingORNA += 1
        totalUFBSSim += dataNX.totalNumUFBSSim
        totalUniqueSims += dataNX.numUniqueSims

    totalAvgSimUFBS, totalAvgDataUFBS = 0.0, 0.0
    try:
        totalAvgSimUFBS = totalUFBSSim / float(totalUniqueSims)
        totalAvgDataUFBS = totalUFBSData / float(totalPassingORNA)
        totalSNR = totalAvgDataUFBS/totalAvgSimUFBS
        oS = [str(x) for x in [mm, iSNRCutoff, totalUFBSData, totalAvgSimUFBS, totalSNR, totalPassingORNA, float(totalUFBSData)/totalPassingORNA]]
    except ZeroDivisionError:
        oS = [str(x) for x in [mm, iSNRCutoff, totalUFBSData, totalAvgSimUFBS, "NA", totalPassingORNA, "NA"]]
        
    print '\t'.join(oS)
def updateRepeatStatus(fN, fF, wigDir, chrom, strand):
    """Flag records on chrom/strand that touch a coordinate of the REPEAT wig.

    Records on another chromosome or strand are skipped unchanged.  For the
    others, repeat is True when any position of the inclusive start..end
    range is a key of the loaded wig, else False.
    """
    #load oRNAs
    NX = Nexus(fN, fF)
    NX.load(['repeat', 'tcc'])

    #load wig file for chrom, strand
    repeatCoords = cgWig.loadSingleWig(wigDir, chrom, strand, 'REPEAT')

    while NX.nextID():
        oChrom, oStrand, start, end = bioLibCG.tccSplit(NX.tcc)
        if oChrom == chrom and oStrand == strand:
            NX.repeat = any(pos in repeatCoords
                            for pos in range(start, end + 1))

    NX.save()
def updateAdjustedMismatches(fN, fF, guValue = .5, otherValue = 1.0):
    """Set adjustedNumMismatches from the characters of sigMask and save.

    Each 'G' in the mask contributes guValue and each 'X' contributes
    otherValue.
    """
    NX = Nexus(fN, fF)
    NX.load(['sigMask', 'adjustedNumMismatches'])

    while NX.nextID():
        sigMask = NX.sigMask
        guPenalty = sigMask.count('G') * guValue
        otherPenalty = sigMask.count('X') * otherValue
        NX.adjustedNumMismatches = guPenalty + otherPenalty

    NX.save()
def updateSequence(oFN, oFF, extend, assembly):
    """Fetch the sequence of every record's tcc, widened by `extend`; save.

    `extend` is subtracted from the start and added to the end before the
    new tcc is built and passed to GenomeFetch.getSequence.
    """
    NX = Nexus(oFN, oFF)
    NX.load(['sequence', 'tcc'])

    genome = GenomeFetch.GenomeFetch(assembly)

    while NX.nextID():
        chrom, strand, start, end = bioLibCG.tccSplit(NX.tcc)
        start -= extend
        end += extend
        NX.sequence = genome.getSequence(
            bioLibCG.makeTcc(chrom, strand, start, end))

    NX.save()
Beispiel #30
0
def testQuickLoad(fN):
    """Write fixed test values to every record using an inline format list.

    The format is passed to Nexus as a list of strings instead of a file
    name.
    """
    inlineFormat = [
        '1 geneName string .',
        '3 otherIDs intList .',
        '4 isCoding bool F',
    ]
    NX = Nexus(fN, inlineFormat)

    while NX.nextID():
        NX.geneName = "testAuto"
        NX.isCoding = True
        NX.otherIDs = range(18)

    NX.save()
def updateELevel2(dFN, dForm, wigDir):
    '''Store the maximum of each record's expression profile as eLevel.

    Dont need to do it by chromosome because it is small enough.
    Also dont need to flip the strand because the wig is opposite as well.
    '''
    NX = Nexus(dFN, dForm)
    NX.load(['tcc', 'eLevel'])

    allWigs = cgWig.loadWigDictFloat(wigDir)

    while NX.nextID():
        # profile of the record's tcc, looked up in the loaded wig dict
        coordLevels = cgWig.getExpressionProfile(NX.tcc, allWigs)
        NX.eLevel = max(coordLevels.values())

    NX.save()
Beispiel #32
0
def pickBestAlignment(fN, fFN):
    """Mark the highest-scoring record of every (sID, dID) pair as best.

    A record only becomes a candidate when its score is strictly greater
    than the best seen so far for its pair, starting from 0.0; on ties the
    earlier record wins.  Records that are not selected keep their current
    `best` value.
    """
    NX = Nexus(fN, fFN)
    NX.load(['sID', 'dID', 'score', 'best'])

    #find best id
    topScore = {}  # pair key -> best score so far
    topID = {}     # pair key -> id of the record holding that score
    while NX.nextID():
        key = '%s_%s' % (NX.sID, NX.dID)
        current = NX.score
        if current > topScore.get(key, 0.0):
            topScore[key] = current
            topID[key] = NX.id

    # update best id
    winners = set(topID.values())
    while NX.nextID():
        if NX.id in winners:
            NX.best = True

    NX.save()
Beispiel #33
0
def updateSimSeqsForUnique(oFN, oFF, seqFN):
    """Replace every record's sequence with the one listed in seqFN and save.

    seqFN holds tab-separated lines: an integer ID in the first column and a
    sequence in the second.  Records whose ID is not listed get '.'.
    """
    NX = Nexus(oFN, oFF)
    NX.load(['sequence'])

    id_seq = {}
    # the with-block closes the file even when a line fails to parse
    with open(seqFN, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # tolerate blank lines, e.g. at the end of the file
            ls = line.split('\t')
            id_seq[int(ls[0])] = ls[1]

    while NX.nextID():
        NX.sequence = id_seq.get(NX.id, '.')

    NX.save()
def updateGeneName(dFN, fFN, wigDir, chrom, strand, prefix, switchStrand = False):
    """Set geneNames on every record from a single transcript wig and save.

    dFN, fFN      -- Nexus data file and format holding 'geneNames' and 'tcc'
    wigDir, chrom, strand, prefix
                  -- passed to cgWig.loadSingleWigTranscript
    strand        -- 1 / -1, given either as an int or as a string
    switchStrand  -- load the wig of the opposite strand

    The wig value at a record's start coordinate is looked up: "NONE" gives
    an empty list, anything else is split on commas.  A missing coordinate
    falls back to "." and therefore yields ['.'].
    """
    NX = Nexus(dFN, fFN)
    NX.load(['geneNames', 'tcc'])

    # int() accepts both 1 and '1'; negating a string directly raises
    # TypeError
    if switchStrand:
        strand = -int(strand)

    strand = str(strand)
    coord_gName = cgWig.loadSingleWigTranscript(wigDir, chrom, strand, prefix)

    while NX.nextID():

        # separate names so the chrom/strand arguments are not overwritten;
        # only the start coordinate is used for the lookup
        oChrom, oStrand, start, end = bioLibCG.tccSplit(NX.tcc)

        # NOTE(review): records are not filtered by chrom/strand here, unlike
        # updateContext -- confirm the data file is already split that way
        overlappingGenes = coord_gName.get(start, ".")
        if overlappingGenes == "NONE":
            NX.geneNames = []
        else:
            NX.geneNames = overlappingGenes.split(',')

    NX.save()
Beispiel #35
0
def calculateTotalSNR(dataFN, oFF, mm, iSNRCutoff):
    mm = str(mm)

    dataNX = Nexus(dataFN, oFF)
    dataNX.load(['snr', 'numUFBS', 'numUniqueSims', 'totalNumUFBSSim'])

    #Check SNR Cutoffs
    dataIDs = set()
    lowSNRORNA = set()
    while dataNX.nextID():
        dataIDs.add(dataNX.id)
        if dataNX.snr < iSNRCutoff:
            lowSNRORNA.add(dataNX.id)

    #NOTE sum of avgs != total avg
    #get total numUFBS for data and simulation total,num unique sims
    totalUFBSData = 0.0
    totalUFBSSim = 0.0
    totalPassingORNA = 0
    totalUniqueSims = 0
    while dataNX.nextID():
        if (dataNX.id in lowSNRORNA): continue
        totalUFBSData += dataNX.numUFBS
        totalPassingORNA += 1
        totalUFBSSim += dataNX.totalNumUFBSSim
        totalUniqueSims += dataNX.numUniqueSims

    totalAvgSimUFBS, totalAvgDataUFBS = 0.0, 0.0
    try:
        totalAvgSimUFBS = totalUFBSSim / float(totalUniqueSims)
        totalAvgDataUFBS = totalUFBSData / float(totalPassingORNA)
        totalSNR = totalAvgDataUFBS / totalAvgSimUFBS
        oS = [
            str(x) for x in [
                mm, iSNRCutoff, totalUFBSData, totalAvgSimUFBS, totalSNR,
                totalPassingORNA,
                float(totalUFBSData) / totalPassingORNA
            ]
        ]
    except ZeroDivisionError:
        oS = [
            str(x) for x in [
                mm, iSNRCutoff, totalUFBSData, totalAvgSimUFBS, "NA",
                totalPassingORNA, "NA"
            ]
        ]

    print '\t'.join(oS)
def pickBestAlignment(fN, fFN):
    """Set best = True on the top-scoring record of each sID/dID pair; save.

    Scores are compared with a strict '>' against the running maximum of the
    pair, which starts at 0.0.  Records that do not win are not modified.
    """
    NX = Nexus(fN, fFN)
    NX.load(['sID', 'dID', 'score', 'best'])

    #find best id
    leaders = {}  # pair key -> (score, id) of the current leader
    while NX.nextID():
        pairKey = '%s_%s' % (NX.sID, NX.dID)
        candidate = NX.score
        leadingScore = leaders[pairKey][0] if pairKey in leaders else 0.0
        if candidate > leadingScore:
            leaders[pairKey] = (candidate, NX.id)

    # update best id
    bestIDs = set(leaderID for _, leaderID in leaders.values())
    while NX.nextID():
        if NX.id in bestIDs:
            NX.best = True

    NX.save()
def updateTargetIDs(oFN, oFF, aFN, aFF):
    """Append each alignment's ID to the filteredTargets of its sID record.

    oFN, oFF -- Nexus file/format of the records receiving the target IDs
    aFN, aFF -- Nexus file/format of the alignments (attribute sID)
    """
    oNX = Nexus(oFN, oFF)
    oNX.load(['filteredTargets'])

    alignNX = Nexus(aFN, aFF)
    alignNX.load(['sID'])

    while alignNX.nextID():
        # select the record named by sID, then extend its list in place
        oNX.id = alignNX.sID
        targets = oNX.filteredTargets
        targets.append(alignNX.id)

    oNX.save()
Beispiel #38
0
def cleanForSNR(dataFN, oFF):
    """Set snrClean on every record and save.

    Within each sibling set of two or more IDs only the member that sorts
    last by (numUFBS, id) is kept; all other members are marked unused.
    A record is clean when it is not unused and has numUniqueSims >= 10.
    """
    dataNX = Nexus(dataFN, oFF)
    dataNX.load(['numUniqueSims', 'numUFBS', 'snrClean', 'siblingSet'])
    id_numUFBS = dataNX.createMap('id', 'numUFBS')
    id_siblingSet = dataNX.createMap('id', 'siblingSet')

    # A set gives constant-time membership tests in the loop below and
    # absorbs repeats when the same sibling set is stored on several records.
    unusedSiblings = set()
    for siblingSet in id_siblingSet.itervalues():
        if len(siblingSet) < 2:
            #NOTE: oRNA IDs are in their own sibling set; an empty set has
            #nothing to rank either
            continue
        numUFBS__id = sorted((id_numUFBS[x], x) for x in siblingSet)
        numUFBS__id.pop()  #take last one (one we're keeping) out of list
        unusedSiblings.update(x[1] for x in numUFBS__id)

    #tag unclean oRNA
    while dataNX.nextID():
        if (dataNX.id in unusedSiblings) or (dataNX.numUniqueSims < 10):
            dataNX.snrClean = False
        else:
            dataNX.snrClean = True
    dataNX.save()
Beispiel #39
0
def updateNumUFBS(oFN, oFF, aFN, aFF):
    """Set numUFBS to the number of distinct sigMasks among filteredTargets.

    When the alignment file aFN is empty (size 0), every record gets
    numUFBS = 0 and the alignment file is never opened.
    """
    oNX = Nexus(oFN, oFF)
    oNX.load(['filteredTargets', 'numUFBS'])

    #just give it some blanks
    if not os.path.getsize(aFN):
        while oNX.nextID():
            oNX.numUFBS = 0
        oNX.save()
        return

    aNX = Nexus(aFN, aFF)
    aNX.load(['sigMask'])

    while oNX.nextID():
        distinctMasks = set()
        for targetID in oNX.filteredTargets:
            # point the alignment Nexus at the target, then read its mask
            aNX.id = targetID
            distinctMasks.add(aNX.sigMask)
        oNX.numUFBS = len(distinctMasks)

    oNX.save()
Beispiel #40
0
def linkTargetIDs(oFN, oFF, aFN, aFF):
    """Store on each record the IDs of the alignments whose sID names it.

    With an empty alignment file (size 0) every record's filteredTargets is
    set to [].  Otherwise only records that appear as an sID are assigned;
    the others keep their current value.
    """
    oNX = Nexus(oFN, oFF)
    oNX.load(['filteredTargets'])

    #just give it some blanks
    if not os.path.getsize(aFN):
        while oNX.nextID():
            oNX.filteredTargets = []
        oNX.save()
        return

    aNX = Nexus(aFN, aFF)
    aNX.load(['sID'])

    # third argument False: not 1-to-1, each sID maps to several alignment ids
    targetsBySource = aNX.createMap('sID', 'id', False)

    for sourceID, alignmentIDs in targetsBySource.iteritems():
        oNX.id = sourceID
        oNX.filteredTargets = alignmentIDs

    oNX.save()
Beispiel #41
0
def linkTargetIDs(oFN, oFF, aFN, aFF):
    """Attach alignment IDs to the records referenced by their sID.

    If aFN has size 0, all records get an empty filteredTargets list.
    Otherwise the sID -> alignment ids map is written onto the matching
    records and all other records are left as they are.
    """
    oNX = Nexus(oFN, oFF)
    oNX.load(['filteredTargets'])

    hasAlignments = os.path.getsize(aFN) != 0
    if hasAlignments:
        aNX = Nexus(aFN, aFF)
        aNX.load(['sID'])

        # grouped map (not 1-to-1): one sID collects all of its alignment ids
        sID_aIDs = aNX.createMap('sID', 'id', False)
        for sID in sID_aIDs:
            oNX.id = sID
            oNX.filteredTargets = sID_aIDs[sID]
    else:
        #just give it some blanks
        while oNX.nextID():
            oNX.filteredTargets = []

    oNX.save()
Beispiel #42
0
def updateNumUFBS(oFN, oFF, aFN, aFF):
    """Count the distinct sigMasks of every record's filtered targets.

    The count is stored in numUFBS.  An alignment file of size 0 results in
    numUFBS = 0 for all records.
    """
    oNX = Nexus(oFN, oFF)
    oNX.load(['filteredTargets', 'numUFBS'])

    if os.path.getsize(aFN) != 0:
        aNX = Nexus(aFN, aFF)
        aNX.load(['sigMask'])

        while oNX.nextID():
            masks = set()
            for aID in oNX.filteredTargets:
                # select the alignment before reading its sigMask
                aNX.id = aID
                masks.add(aNX.sigMask)
            oNX.numUFBS = len(masks)
    else:
        #just give it some blanks
        while oNX.nextID():
            oNX.numUFBS = 0

    oNX.save()
Beispiel #43
0
def testPeaks(degFN, dForm, allGeneInfo, gForm, switchStrand = False):

    #load/configure gene Info
    gNX = Nexus(allGeneInfo, gForm)
    gNX.load(['geneName', 'numReads', 'numSpots'])
    
    gName_numReads = {}
    gName_numSpots = {}
    while gNX.nextID():
        gName_numReads[gNX.geneName] = gNX.numReads
        gName_numSpots[gNX.geneName] = gNX.numSpots
   
   
    #load degFN info
    dNX = Nexus(degFN, dForm)
    dNX.load(['tcc', 'eLevel', 'geneNames', 'pValBin'])

    while dNX.nextID():
       
        gNames, readsForPeak = dNX.geneNames, dNX.eLevel
        chrom, strand, start, end = bioLibCG.tccSplit(dNX.tcc)
        if switchStrand:
            strand = -int(strand)
      
        pVals = []
        for gName in gNames:
            
            #may have to change gene name cuz of multiple spans
            try:
                totGeneReads = gName_numReads[gName]
                numSpotsForGene = gName_numSpots[gName]
            except KeyError:

                try:
                    gName = gName + '_RE_%s_%s' % (chrom, strand)
                    totGeneReads = gName_numReads[gName]
                    numSpotsForGene = gName_numSpots[gName]
                except KeyError:
                    print "FIX THIS GENE NAME", gName
                    continue

            #add psuedocount
            totGeneReads += 1
            numSpotsForGene += 1 # not sure whether to do this yet...

            #check for hidden intron gene overlap
            try:
                q = 1.0/numSpotsForGene
            except ZeroDivisionError:
                continue #intron gene

            #add p val
            pVals.append(binom.sf(readsForPeak, totGeneReads, q))

        dNX.pValBin = max(pVals) if pVals else -1.0

    dNX.save()
Beispiel #44
0
def updateAvgSS(dataFN, oFF, simDir, simBase, mm, numSims=100):
    """Summarise the simulation runs per ID and write the result to dataFN.

    Simulation files are read from
    '<simDir>/simulation.<i>/<simBase>.<mm>' for i in 0..numSims-1.  For each
    ID, a simulated sequence is counted only the first time it is seen; its
    numUFBS is collected.  Each data record then receives:
      numUniqueSims    -- number of collected values
      totalNumUFBSSim  -- their sum
      avgNumSimSS      -- sum / count, or -1 when nothing was collected
    """
    #get simulation information (# unique sims, num UFBS)
    sID_numSimUFBS = {}
    sID_simSeqs = {}
    for simIndex in range(numSims):
        simFN = '%s/simulation.%s/%s.%s' % (simDir, simIndex, simBase, mm)
        oNX = Nexus(simFN, oFF)
        oNX.load(['numUFBS', 'sequence'])
        while oNX.nextID():
            seenSeqs = sID_simSeqs.setdefault(oNX.id, set())
            if oNX.sequence not in seenSeqs:
                #dont count a repeated sequence again
                seenSeqs.add(oNX.sequence)
                sID_numSimUFBS.setdefault(oNX.id, []).append(oNX.numUFBS)

    #update data based on sim info
    dataNX = Nexus(dataFN, oFF)
    dataNX.load(['avgNumSimSS', 'numUniqueSims', 'totalNumUFBSSim'])
    while dataNX.nextID():
        simCounts = sID_numSimUFBS.get(dataNX.id, [])
        numUniqueSims = len(simCounts)
        totalSimUFBS = sum(simCounts)

        if numUniqueSims != 0:
            avgSimUFBS = totalSimUFBS / float(numUniqueSims)
        else:
            avgSimUFBS = -1

        dataNX.avgNumSimSS = avgSimUFBS
        dataNX.numUniqueSims = numUniqueSims
        dataNX.totalNumUFBSSim = totalSimUFBS

    dataNX.save()
Beispiel #45
0
def updateAvgSS(dataFN, oFF, simDir, simBase, mm, numSims = 100):
    """Aggregate numUFBS over the simulation files and store it per record.

    Reads '<simDir>/simulation.<i>/<simBase>.<mm>' for every i below numSims.
    Per ID, each distinct simulated sequence contributes its numUFBS once.
    The data file then gets numUniqueSims (how many values were kept),
    totalNumUFBSSim (their sum) and avgNumSimSS (their mean, -1 if none).
    """
    #get simulation information (# unique sims, num UFBS)
    simFiles = []
    for i in range(numSims):
        simFiles.append('%s/simulation.%s/%s.%s' % (simDir, i, simBase, mm))

    ufbsByID = {}   # id -> list of numUFBS, one per distinct sequence
    seqsByID = {}   # id -> set of sequences already counted
    for simFile in simFiles:
        oNX = Nexus(simFile, oFF)
        oNX.load(['numUFBS', 'sequence'])
        while oNX.nextID():
            alreadyCounted = oNX.sequence in seqsByID.get(oNX.id, set())
            if alreadyCounted:
                continue #dont count again
            ufbsByID.setdefault(oNX.id, []).append(oNX.numUFBS)
            seqsByID.setdefault(oNX.id, set()).add(oNX.sequence)

    #update data based on sim info
    dataNX = Nexus(dataFN, oFF)
    dataNX.load(['avgNumSimSS', 'numUniqueSims', 'totalNumUFBSSim'])
    while dataNX.nextID():
        kept = ufbsByID.get(dataNX.id, [])
        keptCount = len(kept)
        keptTotal = sum(kept)

        average = -1
        if keptCount != 0:
            average = keptTotal/float(keptCount)

        dataNX.avgNumSimSS = average
        dataNX.numUniqueSims = keptCount
        dataNX.totalNumUFBSSim = keptTotal

    dataNX.save()