Ejemplo n.º 1
0
def checkDiFrequency(dataSeqsFN, simSeqsFN, outFN):
    """Write dinucleotide frequencies of data vs. simulated sequences.

    Column 0 of dataSeqsFN and of simSeqsFN is loaded with
    cgDL.listFromColumns.  Every sequence is split with
    bioLibCG.returnFrames(seq, 2) (presumably its width-2 windows --
    confirm against bioLibCG) and the relative frequency of each frame is
    computed per file.

    One tab-separated line is written to outFN for each dinucleotide that
    occurs in the data file: dinucleotide, data frequency, simulated
    frequency (0.0 when it never occurs in the simulated sequences).
    Dinucleotides found only in the simulated file are not reported.
    """
    observed = cgDL.listFromColumns(dataSeqsFN, [0], ['string'])
    simulated = cgDL.listFromColumns(simSeqsFN, [0], ['string'])

    def frameFractions(sequences):
        # Tally each frame and the overall number of frames seen.
        tally = {}
        nFrames = 0.0
        for sequence in sequences:
            frames = bioLibCG.returnFrames(sequence, 2)
            nFrames += len(frames)
            for frame in frames:
                tally[frame] = tally.get(frame, 0) + 1.0

        # Convert raw counts to frequencies.
        fractions = {}
        for frame in tally:
            fractions[frame] = tally[frame] / nFrames
        return fractions

    observedFreqs = frameFractions(observed)
    simulatedFreqs = frameFractions(simulated)

    with open(outFN, 'w') as out:
        for frame in observedFreqs:
            row = (frame, observedFreqs[frame], simulatedFreqs.get(frame, 0.0))
            out.write('%s\t%s\t%s\n' % row)
Ejemplo n.º 2
0
def checkDiFrequency(dataSeqsFN, simSeqsFN, outFN):
    """Compare dinucleotide frequencies between two sequence files.

    Sequences come from column 0 of dataSeqsFN and simSeqsFN (loaded via
    cgDL.listFromColumns).  Each sequence is broken up by
    bioLibCG.returnFrames(seq, 2); the count of every returned frame is
    divided by the total number of frames in that file.

    outFN receives one line per dinucleotide present in the data file, in
    the form "<dinucleotide>\\t<data freq>\\t<sim freq>", using 0.0 for
    the simulated frequency when the dinucleotide is absent there.
    """
    dataSequences = cgDL.listFromColumns(dataSeqsFN, [0], ['string'])
    simSequences = cgDL.listFromColumns(simSeqsFN, [0], ['string'])

    def toFrequencies(sequenceList):
        # First pass: raw counts per frame plus the grand total.
        frameCounts = {}
        grandTotal = 0.0
        for oneSeq in sequenceList:
            pieces = bioLibCG.returnFrames(oneSeq, 2)
            grandTotal += len(pieces)
            for piece in pieces:
                frameCounts[piece] = frameCounts.get(piece, 0) + 1.0

        # Second pass: normalise the counts into frequencies.
        normalised = {}
        for piece in frameCounts:
            normalised[piece] = frameCounts[piece] / grandTotal
        return normalised

    dataTable = toFrequencies(dataSequences)
    simTable = toFrequencies(simSequences)

    with open(outFN, 'w') as handle:
        for piece in dataTable:
            simValue = simTable.get(piece, 0.0)
            handle.write('%s\t%s\t%s\n' % (piece, dataTable[piece], simValue))
Ejemplo n.º 3
0
def plotTotalSNR(fN):
    """Display a 3D bar chart built from the first three columns of fN.

    Columns 0, 1 and 2 are loaded as floats through cgDL.listFromColumns
    (with naToZero=True) and used as the x position, y position and
    height of each bar.  Bars start at z = 0 and have a 0.2 x 0.2
    footprint; a height of exactly 0.0 is replaced by 1 before plotting.
    """
    figure = PLT.figure()
    axes = figure.add_subplot(111, projection='3d')

    columns = cgDL.listFromColumns(fN, [0, 1, 2],
                                   ['float', 'float', 'float'],
                                   naToZero=True)
    xPositions, yPositions, rawHeights = columns

    nBars = len(rawHeights)
    zBases = [0] * nBars

    # Zero heights are drawn as 1.
    barHeights = []
    for height in rawHeights:
        if height == 0.0:
            barHeights.append(1)
        else:
            barHeights.append(height)

    # The same width list is used for both horizontal dimensions.
    barWidths = [.2] * nBars
    axes.bar3d(xPositions, yPositions, zBases, barWidths, barWidths,
               barHeights, color='#8E4585', zsort='max')
    PLT.show()
Ejemplo n.º 4
0
def getMicroHistExpression(microFN, fqFile, outFN):
    """Count how often each known sequence appears as a whole line of fqFile.

    Args:
        microFN: file whose column 0 (loaded via cgDL.listFromColumns)
            lists the sequences to count.
        fqFile: text file scanned line by line; a line counts for a
            sequence only when, after stripping surrounding whitespace,
            it equals that sequence exactly.
        outFN: output path; one "<sequence>\\t<count>" line is written per
            sequence from microFN, including those with a count of 0.
    """
    microSeqs = cgDL.listFromColumns(microFN, [0], ['string'])
    microSeq_count = dict((seq, 0) for seq in microSeqs)

    # 'with' releases the input handle even if reading raises midway.
    with open(fqFile, 'r') as f:
        for line in f:
            possibleSeq = line.strip()
            if possibleSeq in microSeq_count:
                microSeq_count[possibleSeq] += 1

    with open(outFN, 'w') as f:
        # items() rather than iteritems(): available on Python 2 and 3.
        for seq, count in microSeq_count.items():
            f.write('%s\t%s\n' % (seq, count))
Ejemplo n.º 5
0
def checkMaskEnds(maskPerLineFN):
    """Print how often 'X' occurs at each of the last 10 mask positions.

    Masks are read from column 0 of maskPerLineFN via
    cgDL.listFromColumns.  Each mask is reversed, so index 0 refers to its
    final character, and only the first 10 reversed positions are
    inspected.  One "<index> <count>" line is printed per index 0-9.
    """
    window = 10
    xTally = dict.fromkeys(range(window), 0)

    for mask in cgDL.listFromColumns(maskPerLineFN, [0], ['string']):
        # Walk from the end of the mask, at most `window` characters deep.
        tail = mask[::-1][:window]
        for offset, symbol in enumerate(tail):
            if symbol == 'X':
                xTally[offset] += 1

    for offset, total in xTally.items():
        print('%s %s' % (offset, total))
Ejemplo n.º 6
0
def getMicroHistExpression(microFN, fqFile, outFN):
    """Tally exact whole-line matches of known sequences in fqFile.

    Args:
        microFN: file whose column 0 (loaded via cgDL.listFromColumns)
            provides the sequences of interest.
        fqFile: text file read line by line; each line is stripped of
            surrounding whitespace and counted when it is identical to
            one of the sequences.
        outFN: destination file; receives one "<sequence>\\t<count>" line
            for every sequence from microFN, zero counts included.
    """
    microSeqs = cgDL.listFromColumns(microFN, [0], ['string'])
    microSeq_count = dict((seq, 0) for seq in microSeqs)

    # Context manager so the handle is closed even when iteration fails.
    with open(fqFile, 'r') as f:
        for line in f:
            possibleSeq = line.strip()
            if possibleSeq in microSeq_count:
                microSeq_count[possibleSeq] += 1

    with open(outFN, 'w') as f:
        # items() works on both Python 2 and 3, unlike iteritems().
        for seq, count in microSeq_count.items():
            f.write('%s\t%s\n' % (seq, count))
Ejemplo n.º 7
0
def checkMaskEnds(maskPerLineFN):
    """Report per-position 'X' counts over the trailing 10 mask characters.

    Column 0 of maskPerLineFN is loaded with cgDL.listFromColumns.  Every
    mask is read back to front; position 0 is the last character and
    positions beyond 9 are ignored.  For each position 0-9 a line
    "<position> <number of masks with 'X' there>" is printed.
    """
    masks = cgDL.listFromColumns(maskPerLineFN, [0], ['string'])

    depth = 10
    hitsByPosition = {}
    for position in range(depth):
        hitsByPosition[position] = 0

    for mask in masks:
        backwards = mask[::-1]
        # zip() stops after `depth` characters or at the end of the mask.
        for position, letter in zip(range(depth), backwards):
            if letter != 'X':
                continue
            hitsByPosition[position] += 1

    for position, hits in hitsByPosition.items():
        print('%s %s' % (position, hits))
Ejemplo n.º 8
0
def grepBC(grepList, inFile, column, word=False):
    """Print lines of inFile whose `column` field matches a pattern.

    Args:
        grepList: path of the pattern file; its column `column` is loaded
            through cgDL.listFromColumns with type "int".
        inFile: tab-separated text file to search.
        column: zero-based index of the field to test, used for both the
            pattern file and inFile.
        word: when True a pattern must equal the whole field; otherwise
            it only has to occur somewhere inside the field.

    A line is written to stdout once for every pattern that matches it,
    so a line matched by several patterns is emitted several times.

    Raises:
        IndexError: if a line of inFile has fewer than column + 1 fields.
    """
    import sys

    # Fields read from inFile are strings.  The patterns are loaded as
    # "int" (presumably yielding ints -- confirm against cgDL), which could
    # never equal a string field and would raise TypeError in the substring
    # test, so normalise them to strings once up front.
    patterns = [str(w) for w in
                cgDL.listFromColumns(grepList, [column], ["int"])]

    # 'with' closes the input even if a short line raises IndexError.
    with open(inFile, "r") as f:
        for line in f:
            field = line.strip().split("\t")[column]

            for w in patterns:
                if word:
                    matched = (w == field)
                else:
                    matched = (w in field)
                if matched:
                    # The line keeps its own newline; write it unchanged.
                    sys.stdout.write(line)
Ejemplo n.º 9
0
def plotTotalSNR(fN):
    """Render the first three float columns of fN as a 3D bar plot.

    cgDL.listFromColumns loads columns 0, 1 and 2 as floats with
    naToZero=True; they supply each bar's x position, y position and
    height.  Every bar rises from z = 0 with a 0.2 by 0.2 base, and any
    height equal to 0.0 is plotted as 1 instead.
    """
    fig = PLT.figure()
    plotAxes = fig.add_subplot(111, projection='3d')

    columnIndexes = [0, 1, 2]
    columnTypes = ['float', 'float', 'float']
    xs, ys, heights = cgDL.listFromColumns(fN, columnIndexes, columnTypes,
                                           naToZero=True)

    count = len(heights)
    floor = [0] * count
    # Substitute 1 for zero heights.
    shown = [(1 if value == 0.0 else value) for value in heights]
    # One shared list provides both the x and y extent of each bar.
    side = [.2] * count

    plotAxes.bar3d(xs, ys, floor, side, side, shown,
                   color='#8E4585', zsort='max')
    PLT.show()
Ejemplo n.º 10
0
def generateLeftRightChimera(oRNAFN, outFNBase):
    """Map each middle 4-mer to the left and right flanks seen with it.

    Sequences are read from column 1 of oRNAFN (an id/sequence file) via
    cgDL.listFromColumns.  For every sequence the middle is seq[9:13],
    the left flank is seq[:9] and the right flank is seq[13:].
    NOTE: the left and right flanks do not include the middle.

    Two files are written, outFNBase + '.left' and outFNBase + '.right',
    each holding one "<4-mer>\\t<comma-joined flanks>" line per distinct
    middle.  Each 4-mer and its flank set is also echoed to stdout.
    """
    # FN is id/sequence
    sequences = cgDL.listFromColumns(oRNAFN, [1], ['string'])

    # Collect the distinct flanks observed for every middle 4-mer.
    fourMer_left = {}
    fourMer_right = {}
    for seq in sequences:
        middle = seq[9:13]
        left, right = seq[:9], seq[13:]
        fourMer_left.setdefault(middle, set()).add(left)
        fourMer_right.setdefault(middle, set()).add(right)

    # Pair each suffix with its dict explicitly rather than resolving the
    # variable name through eval().
    sides = (('left', fourMer_left), ('right', fourMer_right))
    for side, outDict in sides:
        with open(outFNBase + '.' + side, 'w') as f:
            # items() rather than iteritems(): works on Python 2 and 3.
            for mer, seqs in outDict.items():
                print('%s %s' % (mer, seqs))
                f.write('%s\t%s\n' % (mer, ','.join(seqs)))
Ejemplo n.º 11
0
def maskAll(fN):

    allSeqs = cgDL.listFromColumns(fN, [2], ['string'])
    allMasks = [multiMask(x, list(), True) for x in allSeqs]