Exemple #1
0
 def readBicluster(self):
     """Load this bicluster's score, members, residuals, trait associations and PSSMs.

     Reads 'cluster.summary.csv' and the files under 'biclust/<k>/' (with
     <k> = str(self.k)), all relative to the current working directory, and
     sets self.score, self.genes, self.conditions, self.resid, self.residNorm,
     self.cor, self.surv, self.pssmsUpstream and self.pssms3pUTR.

     Values parsed from the CSV files are stored as the raw strings found in
     the file (no numeric conversion is done here).

     Raises:
         IOError/OSError: if an expected file or directory is missing.
         KeyError: if int(self.k) has no row in 'cluster.summary.csv'.
         IndexError: if a CSV row has fewer fields than expected.
     """
     biclustDir = 'biclust/'+str(self.k)

     # Read in the bicluster score: first field is the (quoted) cluster id,
     # third field is the score
     scoreDict = {}
     with open('cluster.summary.csv','r') as summaryFile:
         summaryFile.readline() # Skip the header
         for sumLine in summaryFile.readlines():
             splitUp = sumLine.strip().split(',')
             scoreDict[int(splitUp[0].strip('"'))] = splitUp[2]
     self.score = scoreDict[int(self.k)]

     # Read in genes (second field of each row, quotes removed)
     with open(biclustDir+'/genes.csv','r') as geneFile:
         geneFile.readline() # Skip header
         self.genes = [line.strip().split(',')[1].strip('"') for line in geneFile.readlines()]

     # Read in conditions (second field of each row, quotes removed)
     with open(biclustDir+'/conditions.csv','r') as condFile:
         condFile.readline() # Skip header
         self.conditions = [line.strip().split(',')[1].strip('"') for line in condFile.readlines()]

     # Read in residuals: first data row is the residual, second the normalized one
     with open(biclustDir+'/resid.csv','r') as residFile:
         residFile.readline() # Skip header
         self.resid = residFile.readline().strip().split(',')[1]
         self.residNorm = residFile.readline().strip().split(',')[1]

     # Read in associations with clinical traits
     self.cor = {}
     with open(biclustDir+'/correlation.csv','r') as corFile:
         corFile.readline() # Skip header
         for line in corFile.readlines():
             splitUp = line.strip().split(',')
             self.cor[splitUp[0].strip('"')] = {'cor':splitUp[1],'pValue':splitUp[2]}

     # Survival keys keep any surrounding quotes, unlike the correlation keys
     self.surv = {}
     with open(biclustDir+'/survival.csv','r') as survFile:
         survFile.readline() # Skip header
         for line in survFile.readlines():
             splitUp = line.strip().split(',')
             self.surv[splitUp[0]] = {'zScore':splitUp[1],'pValue':splitUp[2]}

     # Now load up the upstream PSSMs, one per file in the directory
     pssmUpFiles = os.listdir(biclustDir+'/upstream')
     self.pssmsUpstream = []
     for pssmUpFile in pssmUpFiles:
         self.pssmsUpstream.append(pssm(biclustDir+'/upstream/'+pssmUpFile,self.k))

     # Now read in the 3' UTR PSSMs, one per file in the directory
     pssmUTRFiles = os.listdir(biclustDir+'/3pUTR')
     self.pssms3pUTR = []
     for pssmUTRFile in pssmUTRFiles:
         self.pssms3pUTR.append(pssm(biclustDir+'/3pUTR/'+pssmUTRFile,self.k))
def meme(num, seqFile=None, bgFile=None, nMotifs=1, minMotifWidth=6, maxMotifWidth=12, revComp=True, seed=None):
    """Run the 'meme' command on seqFile and store the parsed motifs in clusterMemeRuns[num].

    The command line is assembled from seqFile, the motif width bounds and
    nMotifs; ' -revcomp' is appended when revComp is True and ' -cons <seed>'
    when seed is given. The background file passed to meme is always
    'tmp/meme/bgFile.meme'; the bgFile argument is not read in this function.

    The text output is scanned for 'Motif <id> position-specific probability'
    header lines. For each one, the width, number of sites and E-value are
    taken from the summary line two lines below, followed by <width> rows of
    probabilities. Nothing is returned; the list of pssm objects is written
    to the module-level clusterMemeRuns under key num.
    """
    memeDir = 'tmp/meme'
    if not os.path.exists(memeDir):
        os.makedirs(memeDir)

    # Assemble the meme command-line arguments
    argParts = [str(seqFile),
                ' -bfile tmp/meme/bgFile.meme -nostatus -text -time 600 -dna -maxsize 9999999 -evt 1e9 -mod zoops -minw ',
                str(minMotifWidth),
                ' -maxw ', str(maxMotifWidth),
                ' -nmotifs ', str(nMotifs)]
    if revComp==True:
        argParts.append(' -revcomp')
    if not seed==None:
        argParts.extend([' -cons ', str(seed)])
    memeArgs = ''.join(argParts)
    # Single parenthesised argument: prints the same text as the print statement
    print(memeArgs)

    memeProc = Popen("meme " + memeArgs, shell=True,stdout=PIPE)
    stdoutText = memeProc.communicate()[0]
    output = stdoutText.split('\n')

    # Walk the output and collect one PSSM per probability-matrix section
    PSSMs = []
    for lineNum, rawLine in enumerate(output):
        header = rawLine.strip().split(' ')
        isMatrixHeader = header[0]=='Motif' and header[2]=='position-specific' and header[3]=='probability'
        if not isMatrixHeader:
            continue
        # Skip the separator line; the summary line sits two lines below the header
        summary = output[lineNum+2].strip().split(' ')
        width = int(summary[5])
        sites = summary[7]
        eValue = summary[9]
        # The matrix rows follow directly after the summary line
        firstRow = lineNum + 3
        matrix = []
        for rowNum in range(firstRow, firstRow+width):
            matrix.append([float(let) for let in output[rowNum].strip().split(' ') if let])
        PSSMs.append(pssm(biclusterName=str(header[1]),nsites=sites,eValue=eValue,pssm=matrix,genes=[]))
    clusterMemeRuns[num] = PSSMs
def merge2PSSMs(name, pssm1, pssm2, weight1, weight2, offset, overlap, orientation):
    """Merge two PSSMs into one weighted, re-normalized PSSM.

    Args:
        name: biclusterName given to the merged pssm object.
        pssm1, pssm2: objects providing getMatrix(), getName() and
            getConsensusMotif(); each matrix row holds four values.
        weight1, weight2: per-row weights for pssm1 and pssm2. They are only
            read; earlier versions consumed them with pop(0), which emptied
            the caller's lists as a side effect.
        offset: if positive, the first `offset` rows of pssm1 precede pssm2;
            if negative, the first `-offset` rows of pssm2 precede pssm1.
        overlap: only echoed in the progress print.
        orientation: '-' means pssm2's matrix is passed through flip() first.

    Returns:
        [newPSSM, newWeight]: the merged pssm object and the per-row summed
        weights.

    Raises:
        IndexError: if a weight list is shorter than the rows it must cover,
            or abs(offset) exceeds the length of the leading matrix.
        ZeroDivisionError: if a merged row sums to zero (e.g. zero weights).
    """
    pssm1Mat = deepcopy(pssm1.getMatrix())
    pssm2Mat = deepcopy(pssm2.getMatrix())
    # Flip PSSM matrix values for nucleotides if orientation=='-'
    if orientation=='-':
        pssm2Mat = flip(pssm2Mat)

    # Number of leading rows contributed by only one of the two matrices
    lead1 = max(offset, 0)
    lead2 = max(-offset, 0)

    newPSSMMat = []
    newWeight = []
    # Unpaired leading rows of pssm1 (offset > 0)
    for pos in range(lead1):
        row1 = pssm1Mat[pos]
        w1 = weight1[pos]
        newPSSMMat.append([row1[k]*100*w1 for k in range(4)])
        newWeight.append(w1)
    # Unpaired leading rows of pssm2 (offset < 0)
    for pos in range(lead2):
        row2 = pssm2Mat[pos]
        w2 = weight2[pos]
        newPSSMMat.append([row2[k]*100*w2 for k in range(4)])
        newWeight.append(w2)

    # Merge the remaining rows pairwise; whichever matrix runs out first
    # contributes zero rows with zero weight
    rest1 = len(pssm1Mat) - lead1
    rest2 = len(pssm2Mat) - lead2
    for pos in range(max(rest1, rest2)):
        if pos < rest1:
            row1 = pssm1Mat[lead1+pos]
            w1 = weight1[lead1+pos]
        else:
            row1 = [0, 0, 0, 0]
            w1 = 0
        if pos < rest2:
            row2 = pssm2Mat[lead2+pos]
            w2 = weight2[lead2+pos]
        else:
            row2 = [0, 0, 0, 0]
            w2 = 0
        newPSSMMat.append([row1[k]*100*w1+row2[k]*100*w2 for k in range(4)])
        newWeight.append(w1+w2)

    # Normalize each row to sum to 1; float() avoids integer division when
    # every input happens to be an int under Python 2
    for pos in range(len(newPSSMMat)):
        row = newPSSMMat[pos]
        total = float(row[0]+row[1]+row[2]+row[3])
        newPSSMMat[pos] = [row[0]/total, row[1]/total, row[2]/total, row[3]/total]

    # Now build the new PSSM object
    newPSSM = pssm(biclusterName=name, nsites='30', eValue='0', pssm=newPSSMMat, genes=[])
    # Space-joined so the output matches the comma-separated print statement
    print(' '.join([str(item) for item in (name, newPSSM.getConsensusMotif(), pssm1.getName(), pssm1.getConsensusMotif(), pssm2.getName(), pssm2.getConsensusMotif(), offset, overlap, orientation)]))
    return [newPSSM, newWeight]
def weeder(config, seqFile=None, percTargets=50, revComp=False, bgModel='HS'):
    """Run weederlauncher on seqFile and parse '<seqFile>.wee' into pssm objects.

    Args:
        config: object whose get('General', 'tmp_dir') gives the temp directory;
            also passed through to rnaDuplex().
        seqFile: path of the sequence file handed to weederlauncher.
        percTargets: not used by this function.
        revComp: when True, ' -S' is appended to the weederlauncher arguments.
        bgModel: background model code passed to weederlauncher.

    Returns:
        A list of pssm objects, one per motif listed in the
        'Interesting motifs (highest-ranking)' section of the report.

    Raises:
        IndexError: if the report ends before an expected section marker.
        KeyError: if a frequency matrix is not 6 or 8 columns long (only the
            top hits for those two lengths are recorded).
    """
    weeder_dir = os.path.join(config.get('General', 'tmp_dir'), 'weeder')
    if not os.path.exists(weeder_dir):
        os.makedirs(weeder_dir)

    # Note that we use a slightly hacked version of weeder. Weederlauncher has
    # hard-coded paths to the other executables, and weederlauncher and
    # weederTFBS have hard-coded paths to the FreqFiles directory. We hacked
    # those to point to our directories.

    # Run weederlauncher, sending its stderr to a log file
    weederArgs = ' '+str(seqFile)+' '+str(bgModel)+' small T50'
    if revComp==True:
        weederArgs += ' -S'
    # 'with' makes sure the log handle is closed once the process has finished
    with open(os.path.join(weeder_dir, 'stderr.out'), 'w') as errOut:
        weederProc = Popen("weederlauncher " + weederArgs, shell=True, stdout=PIPE, stderr=errOut)
        weederProc.communicate()

    # Now parse output from weeder (blank lines dropped)
    with open(str(seqFile)+'.wee','r') as weeFile:
        outLines = [line for line in weeFile.readlines() if line.strip()]

    def popUntil(marker):
        # Discard lines up to and including the first one containing marker,
        # and return that line
        while 1:
            outLine = outLines.pop(0)
            if outLine.find(marker) != -1:
                return outLine

    PSSMs = []
    hitBp = {}
    # Get top hit of 6bp look for "1)"
    hitBp[6] = popUntil('1) ').strip().split(' ')[1:]
    # Scroll to where the 8bp reads will be
    popUntil('Searching for motifs of length 8')
    # Get top hit of 8bp look for "1)"
    hitBp[8] = popUntil('1) ').strip().split(' ')[1:]
    # Scroll to the list of input sequences
    popUntil('Your sequences:')

    # Map the sequence label used in the report to the sequence name
    seqDict = {}
    while 1:
        outLine = outLines.pop(0)
        if outLine.find('**** MY ADVICE ****') != -1:
            break
        splitUp = outLine.strip().split(' ')
        seqDict[splitUp[1]] = splitUp[3].lstrip('>')

    # Get into the highest ranking motifs
    popUntil('Interesting motifs (highest-ranking)')
    while 1:
        name = outLines.pop(0).strip() # Get match
        if name.find('(not highest-ranking)') != -1:
            break
        # Get redundant motifs
        outLines.pop(0)
        redMotifs = [tok for tok in outLines.pop(0).strip().split(' ') if not tok=='-']
        outLines.pop(0)
        outLines.pop(0)
        # Read motif instances until the frequency matrix header
        line = outLines.pop(0)
        instances = []
        while line.find('Frequency Matrix') == -1:
            splitUp = [tok for tok in line.strip().split(' ') if tok]
            instances.append({'gene':seqDict[splitUp[0]],
                              'strand':splitUp[1],
                              'site':splitUp[2],
                              'start':splitUp[3],
                              'match':splitUp[4].lstrip('(').rstrip(')'),
                              'mfe':rnaDuplex(config, name,[splitUp[2]])[0] })
            line = outLines.pop(0)
        # Read in Frequency Matrix (two header lines, then one line per column)
        outLines.pop(0)
        outLines.pop(0)
        matrix = []
        col = outLines.pop(0)
        while col.find('======') == -1:
            nums = [tok for tok in col.strip().split('\t')[1].split(' ') if tok]
            colSum = float(sum([int(tok.strip()) for tok in nums]))
            # Convert the first four counts to frequencies
            matrix.append([float(nums[k])/colSum for k in range(4)])
            col = outLines.pop(0)
        PSSMs.append(pssm(biclusterName=name,nsites=instances,eValue=hitBp[len(matrix)][1],pssm=matrix,genes=redMotifs))
    return PSSMs
Exemple #5
0
 def readBicluster(self,
                   de_novo_method_upstream=None,
                   de_novo_method_3pUTR=None):
     """Load this bicluster's genes, conditions, residual and PSSMs from SQLite.

     Connects to self.sqliteDb and, for cluster self.k at iteration
     self.maxIter, sets self.genes, self.conditions and self.residNorm. It then
     builds one pssm object per row of motif_infos for this cluster and stores
     it in self.pssmsUpstream (seqtype 'upstream') or self.pssms3pUTR (any
     other seqtype).

     Args:
         de_novo_method_upstream: method label used in the PSSM names when it
             is not None.
         de_novo_method_3pUTR: method label used otherwise.

     Raises:
         IndexError: if cluster_stats has no row for this cluster/iteration.
         TypeError: if both method arguments are None and at least one motif
             exists (the name is built by string concatenation).
     """
     # Attach to the database
     con = lite.connect(self.sqliteDb)
     # try/finally so the connection is closed even if a query or parse fails
     try:
         con.row_factory = lite.Row
         cur = con.cursor()
         # Read in genes
         q1 = 'SELECT name FROM row_members, row_names WHERE row_members.cluster=' + str(
             self.k) + ' and row_members.iteration=' + str(
                 self.maxIter
             ) + ' and row_members.order_num=row_names.order_num'
         cur.execute(q1)
         data = cur.fetchall()
         self.genes = [str(i['name']) for i in data]
         # Read in conditions
         q1 = 'SELECT name FROM column_members, column_names WHERE column_members.cluster=' + str(
             self.k) + ' and column_members.iteration=' + str(
                 self.maxIter
             ) + ' and column_members.order_num=column_names.order_num'
         cur.execute(q1)
         data = cur.fetchall()
         self.conditions = [i['name'] for i in data]
         # Read in residuals
         q1 = 'SELECT residual FROM cluster_stats WHERE cluster=' + str(
             self.k) + ' and iteration=' + str(self.maxIter)
         cur.execute(q1)
         data = cur.fetchall()
         self.residNorm = float(data[0]['residual'])
         # At some point might be nice to figure out how to include the
         # associations with clinical traits (self.cor from
         # 'biclust/<k>/correlation.csv' and self.surv from
         # 'biclust/<k>/survival.csv'), as the CSV-based reader did.

         # Now load up the PSSMs
         # NOTE(review): the iteration is hard-coded to 2001 here, while the
         # queries above use self.maxIter - confirm this is intended.
         q1 = 'SELECT rowid, seqtype, motif_num, evalue FROM motif_infos WHERE cluster=' + str(
             self.k) + ' and iteration=2001'
         cur.execute(q1)
         motifs = cur.fetchall()
         # Set the de novo method: the upstream label wins when given
         if de_novo_method_upstream is not None:
             de_novo_method = de_novo_method_upstream
         else:
             de_novo_method = de_novo_method_3pUTR
         self.pssmsUpstream = []
         self.pssms3pUTR = []
         for motif in motifs:
             name = str(self.k) + '_motif' + str(
                 motif['motif_num']) + '_' + de_novo_method
             # Sites of this motif: [gene name, position, reverse, pvalue]
             q1 = 'SELECT row_names.name, motif_annotations.position, motif_annotations.reverse, motif_annotations.pvalue FROM motif_annotations, row_names WHERE motif_annotations.motif_info_id=' + str(
                 motif['rowid']
             ) + ' and motif_annotations.gene_num=row_names.order_num'
             cur.execute(q1)
             sites = [[
                 str(i['name']), i['position'], i['reverse'], i['pvalue']
             ] for i in cur.fetchall()]
             # Matrix rows in a, c, g, t column order
             q1 = 'SELECT a, c, g, t FROM motif_pssm_rows WHERE motif_info_id=' + str(
                 motif['rowid'])
             cur.execute(q1)
             matrix = [[i['a'], i['c'], i['g'], i['t']] for i in cur.fetchall()]
             # Unique gene names that carry at least one site
             genes = list(set([i[0] for i in sites]))
             newPssm = pssm(pssmFileName=None,
                            biclusterName=name,
                            nsites=sites,
                            eValue=motif['evalue'],
                            pssm=matrix,
                            genes=genes,
                            de_novo_method=de_novo_method)
             if motif['seqtype'] == 'upstream':
                 self.pssmsUpstream.append(newPssm)
             else:
                 self.pssms3pUTR.append(newPssm)
     finally:
         con.close()
        else:
            genes[gene] = [gene+'.1']
            gene = gene+'.1'
        # Get rid of the header crap
        for i in range(len(inLines)-4):
            crap = inLines.pop(0)
        tmpPssm = []
        for line in inLines:
            splitUp = [i for i in line.strip().split('\t') if i]
            catchMe = splitUp.pop(0)
            for j in range(len(splitUp)):
                if catchMe=='A:':
                    tmpPssm.append([float(splitUp[j])])
                else:
                    tmpPssm[j].append(float(splitUp[j]))
        uniprobePSSMs[gene] = pssm(biclusterName=gene, pssm=deepcopy(tmpPssm), nsites=str(100), eValue=str(0.01))
# Report how many PSSMs and genes were collected
print('PSSMs recovered: ' + str(len(uniprobePSSMs)))
print('Genes recovered: ' + str(len(genes)))
# Count the genes that have more than one entry recorded
red1 = len([gene for gene in genes if len(genes[gene])>1])
print('Redundant genes: ' + str(red1))

# Test to show that the information content produces the expected values
# The code for this function was obtained from the Biopython Bio.motif.ic function
#tmpPssm = [[0.05, 0.05, 0.85, 0.05],
#           [0.85, 0.05, 0.05, 0.05],
#           [0.05, 0.05, 0.85, 0.05],
#           [0.65, 0.05, 0.25, 0.05],
#           [0.85, 0.05, 0.05, 0.05]]
Exemple #7
0
def weeder(seqFile=None, revComp=False):
    """Run progs/weederlauncher on seqFile and parse '<seqFile>.wee' into pssm objects.

    Args:
        seqFile: path of the sequence file handed to weederlauncher.
        revComp: when True, ' S' is appended to the weederlauncher arguments.

    Returns:
        A list of pssm objects, one per motif listed in the
        'Interesting motifs (highest-ranking)' section of the report.

    Raises:
        IndexError: if the report ends before an expected section marker.
        KeyError: if a frequency matrix is not 6 or 8 columns long (only the
            top hits for those two lengths are recorded).
    """
    if not os.path.exists('tmp/weeder'):
        os.makedirs('tmp/weeder')

    # Run weederlauncher, sending its stderr to a log file
    weederArgs = str(seqFile) + ' HS3P small T50'
    if revComp == True:
        weederArgs += ' S'
    # 'with' closes the log handle even if launching the process fails
    with open('tmp/weeder/stderr.out', 'w') as errOut:
        weederProc = Popen("progs/weederlauncher " + weederArgs,
                           shell=True,
                           stdout=PIPE,
                           stderr=errOut)
        weederProc.communicate()

    # Now parse output from weeder (blank lines dropped)
    with open(str(seqFile) + '.wee', 'r') as weeFile:
        outLines = [line for line in weeFile.readlines() if line.strip()]

    def popUntil(marker):
        # Discard lines up to and including the first one containing marker,
        # and return that line
        while 1:
            outLine = outLines.pop(0)
            if outLine.find(marker) != -1:
                return outLine

    PSSMs = []
    hitBp = {}
    # Get top hit of 6bp look for "1)"
    hitBp[6] = popUntil('1) ').strip().split(' ')[1:]
    # Scroll to where the 8bp reads will be
    popUntil('Searching for motifs of length 8')
    # Get top hit of 8bp look for "1)"
    hitBp[8] = popUntil('1) ').strip().split(' ')[1:]
    # Scroll to the list of input sequences
    popUntil('Your sequences:')

    # Map the sequence label used in the report to the sequence name
    seqDict = {}
    while 1:
        outLine = outLines.pop(0)
        if outLine.find('**** MY ADVICE ****') != -1:
            break
        splitUp = outLine.strip().split(' ')
        seqDict[splitUp[1]] = splitUp[3].lstrip('>')

    # Get into the highest ranking motifs
    popUntil('Interesting motifs (highest-ranking)')
    while 1:
        name = outLines.pop(0).strip()  # Get match
        if name.find('(not highest-ranking)') != -1:
            break
        # Get redundant motifs
        outLines.pop(0)
        redMotifs = [
            tok for tok in outLines.pop(0).strip().split(' ') if not tok == '-'
        ]
        outLines.pop(0)
        outLines.pop(0)
        # Read motif instances until the frequency matrix header
        line = outLines.pop(0)
        instances = []
        while line.find('Frequency Matrix') == -1:
            splitUp = [tok for tok in line.strip().split(' ') if tok]
            instances.append({
                'gene': seqDict[splitUp[0]],
                'strand': splitUp[1],
                'site': splitUp[2],
                'start': splitUp[3],
                'match': splitUp[4].lstrip('(').rstrip(')')
            })
            line = outLines.pop(0)
        # Read in Frequency Matrix (two header lines, then one line per column)
        outLines.pop(0)
        outLines.pop(0)
        matrix = []
        col = outLines.pop(0)
        while col.find('======') == -1:
            nums = [tok for tok in col.strip().split('\t')[1].split(' ') if tok]
            colSum = float(sum([int(tok.strip()) for tok in nums]))
            # Convert the first four counts to frequencies
            matrix.append([float(nums[k]) / colSum for k in range(4)])
            col = outLines.pop(0)
        PSSMs.append(
            pssm(biclusterName=name,
                 nsites=instances,
                 eValue=hitBp[len(matrix)][1],
                 pssm=matrix,
                 genes=redMotifs))
    return PSSMs
Exemple #8
0
def weeder(i=None, percTargets=50, revComp=False):
    """Run weederlauncher on fastaFiles[i] and append the parsed motifs to weederPSSMs1.

    Args:
        i: index into the module-level fastaFiles list.
        percTargets: not used by this function.
        revComp: when True, ' -S' is appended to the weederlauncher arguments.

    Each motif in the 'Interesting motifs (highest-ranking)' section of
    '<seqFile>.wee' becomes a pssm object named
    '<seqFile basename up to the first dot>_<motif line>' and is appended to
    the module-level weederPSSMs1. Nothing is returned.

    Raises:
        IndexError: if the report ends before an expected section marker.
        KeyError: if a frequency matrix is not 6 or 8 columns long (only the
            top hits for those two lengths are recorded).
    """
    seqFile = fastaFiles[i]
    # Single parenthesised argument: prints the same text as the print statement
    print(seqFile)
    if not os.path.exists('tmp/weeder'):
        os.makedirs('tmp/weeder')

    # Run weederlauncher, sending its stderr to a log file
    weederArgs = ' '+str(seqFile)+' HS3P small T50'
    if revComp==True:
        weederArgs += ' -S'
    # 'with' makes sure the log handle is closed once the process has finished
    with open('tmp/weeder/stderr.out','w') as errOut:
        weederProc = Popen("weederlauncher " + weederArgs, shell=True,stdout=PIPE,stderr=errOut)
        weederProc.communicate()

    # Now parse output from weeder (blank lines dropped)
    with open(str(seqFile)+'.wee','r') as weeFile:
        outLines = [line for line in weeFile.readlines() if line.strip()]

    def popUntil(marker):
        # Discard lines up to and including the first one containing marker,
        # and return that line
        while 1:
            outLine = outLines.pop(0)
            if outLine.find(marker) != -1:
                return outLine

    hitBp = {}
    # Get top hit of 6bp look for "1)"
    hitBp[6] = popUntil('1) ').strip().split(' ')[1:]
    # Scroll to where the 8bp reads will be
    popUntil('Searching for motifs of length 8')
    # Get top hit of 8bp look for "1)"
    hitBp[8] = popUntil('1) ').strip().split(' ')[1:]
    # Scroll to the list of input sequences
    popUntil('Your sequences:')

    # Map the sequence label used in the report to the sequence name
    seqDict = {}
    while 1:
        outLine = outLines.pop(0)
        if outLine.find('**** MY ADVICE ****') != -1:
            break
        splitUp = outLine.strip().split(' ')
        seqDict[splitUp[1]] = splitUp[3].lstrip('>')

    # Get into the highest ranking motifs
    popUntil('Interesting motifs (highest-ranking)')
    namePrefix = seqFile.split('/')[-1].split('.')[0] + '_'
    while 1:
        name = namePrefix + outLines.pop(0).strip() # Get match
        if name.find('(not highest-ranking)') != -1:
            break
        # Get redundant motifs
        outLines.pop(0)
        redMotifs = [tok for tok in outLines.pop(0).strip().split(' ') if not tok=='-']
        outLines.pop(0)
        outLines.pop(0)
        # Read motif instances until the frequency matrix header
        line = outLines.pop(0)
        instances = []
        while line.find('Frequency Matrix') == -1:
            splitUp = [tok for tok in line.strip().split(' ') if tok]
            instances.append({'gene':seqDict[splitUp[0]],
                              'strand':splitUp[1],
                              'site':splitUp[2],
                              'start':splitUp[3],
                              'match':splitUp[4].lstrip('(').rstrip(')') })
            line = outLines.pop(0)
        # Read in Frequency Matrix (two header lines, then one line per column)
        outLines.pop(0)
        outLines.pop(0)
        matrix = []
        col = outLines.pop(0)
        while col.find('======') == -1:
            nums = [tok for tok in col.strip().split('\t')[1].split(' ') if tok]
            colSum = float(sum([int(tok.strip()) for tok in nums]))
            # Convert the first four counts to frequencies
            matrix.append([float(nums[k])/colSum for k in range(4)])
            col = outLines.pop(0)
        weederPSSMs1.append(pssm(biclusterName=name,nsites=instances,eValue=hitBp[len(matrix)][1],pssm=matrix,genes=redMotifs))
    for cur in ['A','C','G','T']:
        line = inFile.readline()
        splitUp = [i for i in line.strip().split(',') if i]
        catchMe = splitUp.pop(0)
        for j in range(len(splitUp)):
            if cur=='A':
                tmpPssm.append([splitUp[j]])
            else:
                tmpPssm[j].append(splitUp[j])
    # Convert counts to frequencies
    for i in range(len(tmpPssm)):
        tmpSum = float(tmpPssm[i][0])+float(tmpPssm[i][1])+float(tmpPssm[i][2])+float(tmpPssm[i][3])
        for j in [0,1,2,3]:
            tmpPssm[i][j] = float(tmpPssm[i][j])/tmpSum
    # Instantiate PSSM object
    selexPSSMs[name] = pssm(biclusterName=name, pssm=deepcopy(tmpPssm), nsites=str(100), eValue=str(0.01))
# Finished with the input file; report how many PSSMs were built
inFile.close()
print('PSSMs recovered: ' + str(len(selexPSSMs)))

# Test to show that the information content produces the expected values
# The code for this function was obtained from the Biopython Bio.motif.ic function
#tmpPssm = [[0.05, 0.05, 0.85, 0.05],
#           [0.85, 0.05, 0.05, 0.05],
#           [0.05, 0.05, 0.85, 0.05],
#           [0.65, 0.05, 0.25, 0.05],
#           [0.85, 0.05, 0.05, 0.05]]
#testPssm = pssm(biclusterName='test', pssm=deepcopy(tmpPssm), nsites=str(100), eValue=str(0.01))
#print 'ic.norm = ',ic(testPssm,norm=True)
#print 'ic = ', ic(testPssm,norm=False) # Biopython Bio.motif.ic gives 5.27 as the IC

# 3. Write out pickle of the PSSMs for the systematic SELEX experiments
def weeder(seqFile=None, percTargets=50, revComp=False, bgModel='HS'):
    """Run weederlauncher on seqFile and parse '<seqFile>.wee' into pssm objects.

    Args:
        seqFile: path of the sequence file handed to weederlauncher.
        percTargets: not used by this function.
        revComp: when True, ' -S' is appended to the weederlauncher arguments.
        bgModel: background model code passed to weederlauncher.

    Returns:
        A list of pssm objects, one per motif listed in the
        'Interesting motifs (highest-ranking)' section of the report.

    Raises:
        IndexError: if the report ends before an expected section marker.
        KeyError: if a frequency matrix is not 6 or 8 columns long (only the
            top hits for those two lengths are recorded).
    """
    if not os.path.exists(conf.tmp_dir+'/weeder'):
        os.makedirs(conf.tmp_dir+'/weeder')

    # Note that we use a slightly hacked version of weeder. Weederlauncher has
    # hard-coded paths to the other executables, and weederlauncher and
    # weederTFBS have hard-coded paths to the FreqFiles directory. We hacked
    # those to point to our directories.
    # (An earlier variant ran 'weeder' separately for 6bp and 8bp motifs and
    # then 'adviser'; weederlauncher is used instead.)

    # Run weederlauncher, sending its stderr to a log file
    weederArgs = ' '+str(seqFile)+' '+str(bgModel)+' small T50'
    if revComp==True:
        weederArgs += ' -S'
    # 'with' makes sure the log handle is closed once the process has finished
    with open(conf.tmp_dir+'/weeder/stderr.out','w') as errOut:
        weederProc = Popen("weederlauncher " + weederArgs, shell=True,stdout=PIPE,stderr=errOut)
        weederProc.communicate()

    # Now parse output from weeder (blank lines dropped)
    with open(str(seqFile)+'.wee','r') as weeFile:
        outLines = [line for line in weeFile.readlines() if line.strip()]

    def popUntil(marker):
        # Discard lines up to and including the first one containing marker,
        # and return that line
        while 1:
            outLine = outLines.pop(0)
            if outLine.find(marker) != -1:
                return outLine

    PSSMs = []
    hitBp = {}
    # Get top hit of 6bp look for "1)"
    hitBp[6] = popUntil('1) ').strip().split(' ')[1:]
    # Scroll to where the 8bp reads will be
    popUntil('Searching for motifs of length 8')
    # Get top hit of 8bp look for "1)"
    hitBp[8] = popUntil('1) ').strip().split(' ')[1:]
    # Scroll to the list of input sequences
    popUntil('Your sequences:')

    # Map the sequence label used in the report to the sequence name
    seqDict = {}
    while 1:
        outLine = outLines.pop(0)
        if outLine.find('**** MY ADVICE ****') != -1:
            break
        splitUp = outLine.strip().split(' ')
        seqDict[splitUp[1]] = splitUp[3].lstrip('>')

    # Get into the highest ranking motifs
    popUntil('Interesting motifs (highest-ranking)')
    while 1:
        name = outLines.pop(0).strip() # Get match
        if name.find('(not highest-ranking)') != -1:
            break
        # Get redundant motifs
        outLines.pop(0)
        redMotifs = [tok for tok in outLines.pop(0).strip().split(' ') if not tok=='-']
        outLines.pop(0)
        outLines.pop(0)
        # Read motif instances until the frequency matrix header
        line = outLines.pop(0)
        instances = []
        while line.find('Frequency Matrix') == -1:
            splitUp = [tok for tok in line.strip().split(' ') if tok]
            instances.append({'gene':seqDict[splitUp[0]],
                              'strand':splitUp[1],
                              'site':splitUp[2],
                              'start':splitUp[3],
                              'match':splitUp[4].lstrip('(').rstrip(')'),
                              'mfe':rnaDuplex(name,[splitUp[2]])[0] })
            line = outLines.pop(0)
        # Read in Frequency Matrix (two header lines, then one line per column)
        outLines.pop(0)
        outLines.pop(0)
        matrix = []
        col = outLines.pop(0)
        while col.find('======') == -1:
            nums = [tok for tok in col.strip().split('\t')[1].split(' ') if tok]
            colSum = float(sum([int(tok.strip()) for tok in nums]))
            # Convert the first four counts to frequencies
            matrix.append([float(nums[k])/colSum for k in range(4)])
            col = outLines.pop(0)
        PSSMs.append(pssm(biclusterName=name,nsites=instances,eValue=hitBp[len(matrix)][1],pssm=matrix,genes=redMotifs))
    return PSSMs
Exemple #11
0
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from pssm import pssm
from malign import malign

# multiple alignment on clusters
# for i in range(3):
# 	# malign("neg/PR_neg_%d.fasta" % (i))

# pssm
# Build one centroid per aligned cluster by passing its sequences to pssm()
centroids = []
for cluster in range(3):
	alignment = SeqIO.parse("neg/PR_neg_%d.clustal" % (cluster), "clustal")
	centroids.append(pssm([entry.seq for entry in alignment]))

# Write the centroids out as FASTA records whose ids are their cluster index
centroid_records = []
for idx, centroid in enumerate(centroids):
	centroid_records.append(SeqRecord(Seq(centroid), id=str(idx)))
SeqIO.write(centroid_records, "neg/centroids.fasta", "fasta")

# comparison: align the centroids against each other
malign("neg/centroids.fasta")