def readBicluster(self): # Read in the bicluster score summaryFile = open('cluster.summary.csv','r') summaryFile.readline() # Skip the header scoreDict = {} for sumLine in summaryFile.readlines(): splitUp = sumLine.strip().split(',') scoreDict[int(splitUp[0].strip('"'))] = splitUp[2] self.score = scoreDict[int(self.k)] # Read in genes geneFile = open('biclust/'+str(self.k)+'/genes.csv','r') geneFile.readline() # Skip header self.genes = [line.strip().split(',')[1].strip('"') for line in geneFile.readlines()] geneFile.close() # Read in conditions condFile = open('biclust/'+str(self.k)+'/conditions.csv','r') condFile.readline() # Skip header self.conditions = [line.strip().split(',')[1].strip('"') for line in condFile.readlines()] condFile.close() # Read in residuals residFile = open('biclust/'+str(self.k)+'/resid.csv','r') residFile.readline() # Skip header self.resid = residFile.readline().strip().split(',')[1] self.residNorm = residFile.readline().strip().split(',')[1] residFile.close() # Read in associations with clinical traits self.cor = {} corFile = open('biclust/'+str(self.k)+'/correlation.csv','r') corFile.readline() # Skip header for line in corFile.readlines(): splitUp = line.strip().split(',') self.cor[splitUp[0].strip('"')] = {'cor':splitUp[1],'pValue':splitUp[2]} corFile.close() self.surv = {} survFile = open('biclust/'+str(self.k)+'/survival.csv','r') survFile.readline() # Skip header for line in survFile.readlines(): splitUp = line.strip().split(',') self.surv[splitUp[0]] = {'zScore':splitUp[1],'pValue':splitUp[2]} survFile.close() # Now load up the PSSMs pssmUpFiles = os.listdir('biclust/'+str(self.k)+'/upstream') self.pssmsUpstream = [] for pssmUpFile in pssmUpFiles: self.pssmsUpstream.append(pssm('biclust/'+str(self.k)+'/upstream/'+pssmUpFile,self.k)) # Now read in the pssms pssmUTRFiles = os.listdir('biclust/'+str(self.k)+'/3pUTR') self.pssms3pUTR = [] for pssmUTRFile in pssmUTRFiles: 
self.pssms3pUTR.append(pssm('biclust/'+str(self.k)+'/3pUTR/'+pssmUTRFile,self.k))
def meme(num, seqFile=None, bgFile=None, nMotifs=1, minMotifWidth=6, maxMotifWidth=12, revComp=True, seed=None):
    """Run ``meme`` on ``seqFile`` and store the parsed motifs under ``num``.

    The command is launched through the shell, its text output is scanned
    for "Motif ... position-specific probability" header lines, and one
    ``pssm`` object is built per matrix found.  The resulting list is
    written to the module-level ``clusterMemeRuns[num]``; nothing is
    returned.

    NOTE(review): ``bgFile`` is never read -- the background file path
    ``tmp/meme/bgFile.meme`` is hard-coded on the command line.
    """
    memeDir = 'tmp/meme'
    if not os.path.exists(memeDir):
        os.makedirs(memeDir)

    # Command-line arguments for meme
    memeArgs = str(seqFile)+' -bfile tmp/meme/bgFile.meme -nostatus -text -time 600 -dna -maxsize 9999999 -evt 1e9 -mod zoops -minw ' + str(minMotifWidth) + ' -maxw ' + str(maxMotifWidth) + ' -nmotifs ' + str(nMotifs)
    if revComp == True:
        memeArgs = memeArgs + ' -revcomp'
    if seed is not None:
        memeArgs = memeArgs + ' -cons ' + str(seed)
    print(memeArgs)

    stdoutText = Popen("meme " + memeArgs, shell=True, stdout=PIPE).communicate()[0]
    output = stdoutText.split('\n')

    # Walk every output line; a matrix header is followed by a separator
    # line, a summary line, and then one line per matrix position.
    PSSMs = []
    for lineNo, rawLine in enumerate(output):
        header = rawLine.strip().split(' ')
        isMatrixHeader = (header[0] == 'Motif'
                          and header[2] == 'position-specific'
                          and header[3] == 'probability')
        if not isMatrixHeader:
            continue
        summary = output[lineNo + 2].strip().split(' ')
        width = int(summary[5])
        sites = summary[7]
        eValue = summary[9]
        firstRow = lineNo + 3
        matrix = []
        for rowOffset in range(width):
            rowText = output[firstRow + rowOffset].strip()
            matrix.append([float(let) for let in rowText.split(' ') if let])
        PSSMs.append(pssm(biclusterName=str(header[1]), nsites=sites,
                          eValue=eValue, pssm=matrix, genes=[]))
    clusterMemeRuns[num] = PSSMs
def merge2PSSMs(name, pssm1, pssm2, weight1, weight2, offset, overlap, orientation):
    """Merge two PSSMs into one weighted, re-normalized PSSM.

    Args:
        name: name given to the merged ``pssm`` object.
        pssm1, pssm2: objects exposing ``getMatrix()``, ``getName()`` and
            ``getConsensusMotif()``.
        weight1, weight2: per-position weights for ``pssm1`` / ``pssm2``.
            They are read but no longer modified (the previous
            implementation emptied the caller's lists via ``pop(0)``).
        offset: if positive, that many leading rows of ``pssm1`` precede the
            aligned region; if negative, ``-offset`` leading rows of
            ``pssm2`` do.
        overlap: only echoed in the printed summary line.
        orientation: ``'-'`` passes the second matrix through ``flip``
            before merging.

    Returns:
        ``[newPSSM, newWeight]`` -- the merged ``pssm`` object and the list
        of summed weights, one per merged position.

    Raises:
        IndexError: if ``abs(offset)`` exceeds the leading matrix length or
            a weight list is shorter than its matrix.
        ZeroDivisionError: if a merged row sums to zero.
    """
    pssm1Mat = deepcopy(pssm1.getMatrix())
    pssm2Mat = deepcopy(pssm2.getMatrix())
    # Flip PSSM matrix values for nucleotides if orientation=='-'
    if orientation == '-':
        pssm2Mat = flip(pssm2Mat)
    # Consume copies so the caller's weight lists survive the merge.
    weights1 = list(weight1)
    weights2 = list(weight2)

    def scaledRow(row, weight):
        """Return the four nucleotide values of ``row`` scaled by weight."""
        return [row[n] * 100 * weight for n in range(4)]

    newPSSMMat = []
    newWeight = []

    # Leading overhang: rows that belong to only one of the two matrices.
    if offset > 0:
        for _ in range(offset):
            w1 = weights1.pop(0)
            newPSSMMat.append(scaledRow(pssm1Mat.pop(0), w1))
            newWeight.append(w1)
    elif offset < 0:
        for _ in range(-offset):
            w2 = weights2.pop(0)
            newPSSMMat.append(scaledRow(pssm2Mat.pop(0), w2))
            newWeight.append(w2)

    # Aligned region plus trailing overhang: a matrix that has run out
    # contributes an all-zero row with zero weight.
    for _ in range(max(len(pssm1Mat), len(pssm2Mat))):
        if len(pssm1Mat) > 0:
            row1 = pssm1Mat.pop(0)
            w1 = weights1.pop(0)
        else:
            row1 = [0, 0, 0, 0]
            w1 = 0
        if len(pssm2Mat) > 0:
            row2 = pssm2Mat.pop(0)
            w2 = weights2.pop(0)
        else:
            row2 = [0, 0, 0, 0]
            w2 = 0
        newPSSMMat.append([a + b for a, b in zip(scaledRow(row1, w1),
                                                 scaledRow(row2, w2))])
        newWeight.append(w1 + w2)

    # Normalize each row so its four values sum to one.
    for rowIdx, row in enumerate(newPSSMMat):
        total = row[0] + row[1] + row[2] + row[3]
        newPSSMMat[rowIdx] = [value / total for value in row]

    # Now build the new PSSM object
    newPSSM = pssm(biclusterName=name, nsites='30', eValue='0', pssm=newPSSMMat, genes=[])
    summary = (name, newPSSM.getConsensusMotif(),
               pssm1.getName(), pssm1.getConsensusMotif(),
               pssm2.getName(), pssm2.getConsensusMotif(),
               offset, overlap, orientation)
    # Space-separated summary line (same text as a multi-argument print).
    print(' '.join(str(field) for field in summary))
    return [newPSSM, newWeight]
def weeder(config, seqFile=None, percTargets=50, revComp=False, bgModel='HS'):
    """Run weederlauncher on ``seqFile`` and parse motifs from its report.

    The launcher is started through the shell with ``small T50`` settings;
    its stderr goes to ``<tmp_dir>/weeder/stderr.out`` where ``tmp_dir`` is
    ``config.get('General', 'tmp_dir')``.  The report is then read from
    ``<seqFile>.wee``.

    Args:
        config: object providing ``get('General', 'tmp_dir')``; it is also
            forwarded to ``rnaDuplex``.
        seqFile: path of the input sequence file.
        percTargets: not used here (``T50`` is hard-coded); kept so existing
            callers keep working.
        revComp: when ``True``, ``-S`` is appended to the arguments.
        bgModel: value placed after the file name on the command line
            (presumably the Weeder background/species code -- confirm).

    Returns:
        list of ``pssm`` objects, one per motif in the report's
        "Interesting motifs (highest-ranking)" section.

    Raises:
        IndexError: if the report ends before an expected marker line.
        KeyError: if a motif matrix is neither 6 nor 8 positions long, or a
            site names a sequence missing from the "Your sequences:" table.
    """
    from collections import deque

    def skipPast(lines, marker):
        """Drop lines up to and including the first one containing marker."""
        while True:
            current = lines.popleft()
            if marker in current:
                return current

    weeder_dir = os.path.join(config.get('General', 'tmp_dir'), 'weeder')
    if not os.path.exists(weeder_dir):
        os.makedirs(weeder_dir)

    # Note that we use a slightly hacked version of weeder. Weederlauncher has
    # hard-coded paths to the other executables, and weederlauncher and
    # weederTFBS have hard-coded paths to the FreqFiles directory. We hacked
    # those to point to our directories.
    weederArgs = ' ' + str(seqFile) + ' ' + str(bgModel) + ' small T50'
    if revComp == True:
        weederArgs += ' -S'
    with open(os.path.join(weeder_dir, 'stderr.out'), 'w') as errOut:
        weederProc = Popen("weederlauncher " + weederArgs, shell=True,
                           stdout=PIPE, stderr=errOut)
        weederProc.communicate()

    # Now parse output from weeder (blank lines are ignored throughout).
    with open(str(seqFile) + '.wee', 'r') as weeFile:
        outLines = deque(line for line in weeFile if line.strip())

    # Top hits for the 6bp and 8bp searches: the first line containing "1) ".
    hitBp = {}
    hitBp[6] = skipPast(outLines, '1) ').strip().split(' ')[1:]
    skipPast(outLines, 'Searching for motifs of length 8')
    hitBp[8] = skipPast(outLines, '1) ').strip().split(' ')[1:]

    # Map sequence numbers to sequence names.
    skipPast(outLines, 'Your sequences:')
    seqDict = {}
    while True:
        outLine = outLines.popleft()
        if '**** MY ADVICE ****' in outLine:
            break
        splitUp = outLine.strip().split(' ')
        seqDict[splitUp[1]] = splitUp[3].lstrip('>')

    # Get into the highest ranking motifs
    skipPast(outLines, 'Interesting motifs (highest-ranking)')
    PSSMs = []
    while True:
        name = outLines.popleft().strip()  # Motif sequence
        if '(not highest-ranking)' in name:
            break
        # Get redundant motifs
        outLines.popleft()
        redMotifs = [tok for tok in outLines.popleft().strip().split(' ')
                     if not tok == '-']
        outLines.popleft()
        outLines.popleft()

        # One line per motif instance until the frequency-matrix header.
        instances = []
        line = outLines.popleft()
        while line.find('Frequency Matrix') == -1:
            splitUp = [tok for tok in line.strip().split(' ') if tok]
            instances.append({'gene': seqDict[splitUp[0]],
                              'strand': splitUp[1],
                              'site': splitUp[2],
                              'start': splitUp[3],
                              'match': splitUp[4].lstrip('(').rstrip(')'),
                              'mfe': rnaDuplex(config, name, [splitUp[2]])[0]})
            line = outLines.popleft()

        # Read in Frequency Matrix: the first four counts of each row are
        # divided by the sum of all counts on that row.
        outLines.popleft()
        outLines.popleft()
        matrix = []
        col = outLines.popleft()
        while col.find('======') == -1:
            nums = [tok for tok in col.strip().split('\t')[1].split(' ') if tok]
            colSum = float(sum(int(tok.strip()) for tok in nums))
            matrix.append([float(nums[k]) / colSum for k in range(4)])
            col = outLines.popleft()
        PSSMs.append(pssm(biclusterName=name, nsites=instances,
                          eValue=hitBp[len(matrix)][1], pssm=matrix,
                          genes=redMotifs))
    return PSSMs
def readBicluster(self, de_novo_method_upstream=None, de_novo_method_3pUTR=None):
    """Populate this bicluster from the SQLite database ``self.sqliteDb``.

    Sets ``genes``, ``conditions``, ``residNorm``, ``pssmsUpstream`` and
    ``pssms3pUTR`` on ``self`` for cluster ``self.k`` at iteration
    ``self.maxIter``.  The connection is always closed, even when a query
    or a conversion fails.

    Args:
        de_novo_method_upstream: method label used in motif names; takes
            precedence when not ``None``.
        de_novo_method_3pUTR: method label used when the upstream label is
            ``None``.

    Raises:
        IndexError: if ``cluster_stats`` has no row for this cluster.
        TypeError: if both labels are ``None`` and the cluster has motifs
            (the label is concatenated into the motif name).

    NOTE(review): the selected label is applied to motifs of *both*
    sequence types -- confirm this is intended when both are supplied.
    """
    # Attach to the database
    con = lite.connect(self.sqliteDb)
    try:
        con.row_factory = lite.Row
        cur = con.cursor()

        # Read in genes (row members of this cluster)
        q1 = 'SELECT name FROM row_members, row_names WHERE row_members.cluster=' + str(
            self.k) + ' and row_members.iteration=' + str(
                self.maxIter
            ) + ' and row_members.order_num=row_names.order_num'
        cur.execute(q1)
        self.genes = [str(row['name']) for row in cur.fetchall()]

        # Read in conditions
        q1 = 'SELECT name FROM column_members, column_names WHERE column_members.cluster=' + str(
            self.k) + ' and column_members.iteration=' + str(
                self.maxIter
            ) + ' and column_members.order_num=column_names.order_num'
        cur.execute(q1)
        self.conditions = [row['name'] for row in cur.fetchall()]

        # Read in residuals
        q1 = 'SELECT residual FROM cluster_stats WHERE cluster=' + str(
            self.k) + ' and iteration=' + str(self.maxIter)
        cur.execute(q1)
        self.residNorm = float(cur.fetchall()[0]['residual'])

        # TODO: associations with clinical traits and survival (self.cor /
        # self.surv) are not loaded by this database-backed reader.

        # Now load up the PSSMs
        # NOTE(review): iteration is hard-coded to 2001 here while every
        # other query uses self.maxIter -- confirm.
        q1 = 'SELECT rowid, seqtype, motif_num, evalue FROM motif_infos WHERE cluster=' + str(
            self.k) + ' and iteration=2001'
        cur.execute(q1)
        motifs = cur.fetchall()

        # Set the de novo method
        if not de_novo_method_upstream == None:
            de_novo_method = de_novo_method_upstream
        else:
            de_novo_method = de_novo_method_3pUTR

        self.pssmsUpstream = []
        self.pssms3pUTR = []
        for motif in motifs:
            name = str(self.k) + '_motif' + str(
                motif['motif_num']) + '_' + de_novo_method

            # Sites are [gene name, position, reverse flag, p-value] rows.
            q1 = 'SELECT row_names.name, motif_annotations.position, motif_annotations.reverse, motif_annotations.pvalue FROM motif_annotations, row_names WHERE motif_annotations.motif_info_id=' + str(
                motif['rowid']
            ) + ' and motif_annotations.gene_num=row_names.order_num'
            cur.execute(q1)
            sites = [[str(row['name']), row['position'], row['reverse'],
                      row['pvalue']] for row in cur.fetchall()]

            q1 = 'SELECT a, c, g, t FROM motif_pssm_rows WHERE motif_info_id=' + str(
                motif['rowid'])
            cur.execute(q1)
            matrix = [[row['a'], row['c'], row['g'], row['t']]
                      for row in cur.fetchall()]

            # Unique gene names across the sites (order is not defined).
            genes = list(set(site[0] for site in sites))
            motifPssm = pssm(pssmFileName=None,
                             biclusterName=name,
                             nsites=sites,
                             eValue=motif['evalue'],
                             pssm=matrix,
                             genes=genes,
                             de_novo_method=de_novo_method)
            # Anything that is not 'upstream' is filed as a 3' UTR motif.
            if motif['seqtype'] == 'upstream':
                self.pssmsUpstream.append(motifPssm)
            else:
                self.pssms3pUTR.append(motifPssm)
    finally:
        con.close()
else: genes[gene] = [gene+'.1'] gene = gene+'.1' # Get rid of the header crap for i in range(len(inLines)-4): crap = inLines.pop(0) tmpPssm = [] for line in inLines: splitUp = [i for i in line.strip().split('\t') if i] catchMe = splitUp.pop(0) for j in range(len(splitUp)): if catchMe=='A:': tmpPssm.append([float(splitUp[j])]) else: tmpPssm[j].append(float(splitUp[j])) uniprobePSSMs[gene] = pssm(biclusterName=gene, pssm=deepcopy(tmpPssm), nsites=str(100), eValue=str(0.01)) print 'PSSMs recovered:', len(uniprobePSSMs) print 'Genes recovered:',len(genes) red1 = 0 for gene in genes: if len(genes[gene])>1: red1 += 1 print 'Redundant genes:',red1 # Test to show that the information content produces the expected values # The code for this funciton was obtained from the Biopyhton Bio.motif.ic funciton #tmpPssm = [[0.05, 0.05, 0.85, 0.05], # [0.85, 0.05, 0.05, 0.05], # [0.05, 0.05, 0.85, 0.05], # [0.65, 0.05, 0.25, 0.05], # [0.85, 0.05, 0.05, 0.05]]
def weeder(seqFile=None, revComp=False):
    """Run ``progs/weederlauncher`` on ``seqFile`` and parse its report.

    The launcher is started through the shell with ``HS3P small T50``
    settings; its stderr goes to ``tmp/weeder/stderr.out``.  The report is
    then read from ``<seqFile>.wee``.

    Args:
        seqFile: path of the input sequence file.
        revComp: when ``True``, `` S`` is appended to the arguments.

    Returns:
        list of ``pssm`` objects, one per motif in the report's
        "Interesting motifs (highest-ranking)" section.

    Raises:
        IndexError: if the report ends before an expected marker line.
        KeyError: if a motif matrix is neither 6 nor 8 positions long, or a
            site names a sequence missing from the "Your sequences:" table.
    """
    from collections import deque

    def skipPast(lines, marker):
        """Drop lines up to and including the first one containing marker."""
        while True:
            current = lines.popleft()
            if marker in current:
                return current

    if not os.path.exists('tmp/weeder'):
        os.makedirs('tmp/weeder')

    # First run weederlauncher
    weederArgs = str(seqFile) + ' HS3P small T50'
    if revComp == True:
        weederArgs += ' S'
    with open('tmp/weeder/stderr.out', 'w') as errOut:
        weederProc = Popen("progs/weederlauncher " + weederArgs, shell=True,
                           stdout=PIPE, stderr=errOut)
        weederProc.communicate()

    # Now parse output from weeder (blank lines are ignored throughout).
    with open(str(seqFile) + '.wee', 'r') as weeFile:
        outLines = deque(line for line in weeFile if line.strip())

    # Top hits for the 6bp and 8bp searches: the first line containing "1) ".
    hitBp = {}
    hitBp[6] = skipPast(outLines, '1) ').strip().split(' ')[1:]
    skipPast(outLines, 'Searching for motifs of length 8')
    hitBp[8] = skipPast(outLines, '1) ').strip().split(' ')[1:]

    # Map sequence numbers to sequence names.
    skipPast(outLines, 'Your sequences:')
    seqDict = {}
    while True:
        outLine = outLines.popleft()
        if '**** MY ADVICE ****' in outLine:
            break
        splitUp = outLine.strip().split(' ')
        seqDict[splitUp[1]] = splitUp[3].lstrip('>')

    # Get into the highest ranking motifs
    skipPast(outLines, 'Interesting motifs (highest-ranking)')
    PSSMs = []
    while True:
        name = outLines.popleft().strip()  # Motif sequence
        if '(not highest-ranking)' in name:
            break
        # Get redundant motifs
        outLines.popleft()
        redMotifs = [tok for tok in outLines.popleft().strip().split(' ')
                     if not tok == '-']
        outLines.popleft()
        outLines.popleft()

        # One line per motif instance until the frequency-matrix header.
        instances = []
        line = outLines.popleft()
        while line.find('Frequency Matrix') == -1:
            splitUp = [tok for tok in line.strip().split(' ') if tok]
            instances.append({'gene': seqDict[splitUp[0]],
                              'strand': splitUp[1],
                              'site': splitUp[2],
                              'start': splitUp[3],
                              'match': splitUp[4].lstrip('(').rstrip(')')})
            line = outLines.popleft()

        # Read in Frequency Matrix: the first four counts of each row are
        # divided by the sum of all counts on that row.
        outLines.popleft()
        outLines.popleft()
        matrix = []
        col = outLines.popleft()
        while col.find('======') == -1:
            nums = [tok for tok in col.strip().split('\t')[1].split(' ') if tok]
            colSum = float(sum(int(tok.strip()) for tok in nums))
            matrix.append([float(nums[k]) / colSum for k in range(4)])
            col = outLines.popleft()
        PSSMs.append(pssm(biclusterName=name, nsites=instances,
                          eValue=hitBp[len(matrix)][1], pssm=matrix,
                          genes=redMotifs))
    return PSSMs
def weeder(i=None, percTargets=50, revComp=False):
    """Run weederlauncher on ``fastaFiles[i]`` and collect its motifs.

    The launcher is started through the shell with ``HS3P small T50``
    settings; its stderr goes to ``tmp/weeder/stderr.out``.  The report is
    read from ``<seqFile>.wee`` and one ``pssm`` per highest-ranking motif
    is appended to the module-level ``weederPSSMs1``.  Nothing is returned.

    Args:
        i: index into the module-level ``fastaFiles`` list.
        percTargets: not used here (``T50`` is hard-coded); kept so existing
            callers keep working.
        revComp: when ``True``, ``-S`` is appended to the arguments.

    Raises:
        IndexError: if the report ends before an expected marker line.
        KeyError: if a motif matrix is neither 6 nor 8 positions long, or a
            site names a sequence missing from the "Your sequences:" table.
    """
    from collections import deque

    def skipPast(lines, marker):
        """Drop lines up to and including the first one containing marker."""
        while True:
            current = lines.popleft()
            if marker in current:
                return current

    seqFile = fastaFiles[i]
    print(seqFile)
    if not os.path.exists('tmp/weeder'):
        os.makedirs('tmp/weeder')

    # First run weederlauncher
    weederArgs = ' ' + str(seqFile) + ' HS3P small T50'
    if revComp == True:
        weederArgs += ' -S'
    with open('tmp/weeder/stderr.out', 'w') as errOut:
        weederProc = Popen("weederlauncher " + weederArgs, shell=True,
                           stdout=PIPE, stderr=errOut)
        weederProc.communicate()

    # Now parse output from weeder (blank lines are ignored throughout).
    with open(str(seqFile) + '.wee', 'r') as weeFile:
        outLines = deque(line for line in weeFile if line.strip())

    # Top hits for the 6bp and 8bp searches: the first line containing "1) ".
    hitBp = {}
    hitBp[6] = skipPast(outLines, '1) ').strip().split(' ')[1:]
    skipPast(outLines, 'Searching for motifs of length 8')
    hitBp[8] = skipPast(outLines, '1) ').strip().split(' ')[1:]

    # Map sequence numbers to sequence names.
    skipPast(outLines, 'Your sequences:')
    seqDict = {}
    while True:
        outLine = outLines.popleft()
        if '**** MY ADVICE ****' in outLine:
            break
        splitUp = outLine.strip().split(' ')
        seqDict[splitUp[1]] = splitUp[3].lstrip('>')

    # Get into the highest ranking motifs
    skipPast(outLines, 'Interesting motifs (highest-ranking)')
    # Motif names are prefixed with the input file's base name (the text
    # before the first '.' of the last path component).
    namePrefix = seqFile.split('/')[-1].split('.')[0]
    while True:
        name = namePrefix + '_' + outLines.popleft().strip()
        if '(not highest-ranking)' in name:
            break
        # Get redundant motifs
        outLines.popleft()
        redMotifs = [tok for tok in outLines.popleft().strip().split(' ')
                     if not tok == '-']
        outLines.popleft()
        outLines.popleft()

        # One line per motif instance until the frequency-matrix header.
        instances = []
        line = outLines.popleft()
        while line.find('Frequency Matrix') == -1:
            splitUp = [tok for tok in line.strip().split(' ') if tok]
            instances.append({'gene': seqDict[splitUp[0]],
                              'strand': splitUp[1],
                              'site': splitUp[2],
                              'start': splitUp[3],
                              'match': splitUp[4].lstrip('(').rstrip(')')})
            line = outLines.popleft()

        # Read in Frequency Matrix: the first four counts of each row are
        # divided by the sum of all counts on that row.
        outLines.popleft()
        outLines.popleft()
        matrix = []
        col = outLines.popleft()
        while col.find('======') == -1:
            nums = [tok for tok in col.strip().split('\t')[1].split(' ') if tok]
            colSum = float(sum(int(tok.strip()) for tok in nums))
            matrix.append([float(nums[k]) / colSum for k in range(4)])
            col = outLines.popleft()
        weederPSSMs1.append(pssm(biclusterName=name, nsites=instances,
                                 eValue=hitBp[len(matrix)][1], pssm=matrix,
                                 genes=redMotifs))
for cur in ['A','C','G','T']: line = inFile.readline() splitUp = [i for i in line.strip().split(',') if i] catchMe = splitUp.pop(0) for j in range(len(splitUp)): if cur=='A': tmpPssm.append([splitUp[j]]) else: tmpPssm[j].append(splitUp[j]) # Convert counts to frequencies for i in range(len(tmpPssm)): tmpSum = float(tmpPssm[i][0])+float(tmpPssm[i][1])+float(tmpPssm[i][2])+float(tmpPssm[i][3]) for j in [0,1,2,3]: tmpPssm[i][j] = float(tmpPssm[i][j])/tmpSum # Instantiate PSSM object selexPSSMs[name] = pssm(biclusterName=name, pssm=deepcopy(tmpPssm), nsites=str(100), eValue=str(0.01)) inFile.close() print 'PSSMs recovered:',len(selexPSSMs) # Test to show that the information content produces the expected values # The code for this funciton was obtained from the Biopyhton Bio.motif.ic funciton #tmpPssm = [[0.05, 0.05, 0.85, 0.05], # [0.85, 0.05, 0.05, 0.05], # [0.05, 0.05, 0.85, 0.05], # [0.65, 0.05, 0.25, 0.05], # [0.85, 0.05, 0.05, 0.05]] #testPssm = pssm(biclusterName='test', pssm=deepcopy(tmpPssm), nsites=str(100), eValue=str(0.01)) #print 'ic.norm = ',ic(testPssm,norm=True) #print 'ic = ', ic(testPssm,norm=False) # Biopython Bio.moitf.ic gives 5.27 as the IC # 3. Write out pickle of the PSSMs for the systematic SELEX experiemnts
def weeder(seqFile=None, percTargets=50, revComp=False, bgModel='HS'):
    """Run weederlauncher on ``seqFile`` and parse motifs from its report.

    The launcher is started through the shell with ``small T50`` settings;
    its stderr goes to ``<conf.tmp_dir>/weeder/stderr.out``.  The report is
    then read from ``<seqFile>.wee``.

    Args:
        seqFile: path of the input sequence file.
        percTargets: not used here (``T50`` is hard-coded); kept so existing
            callers keep working.
        revComp: when ``True``, ``-S`` is appended to the arguments.
        bgModel: value placed after the file name on the command line
            (presumably the Weeder background/species code -- confirm).

    Returns:
        list of ``pssm`` objects, one per motif in the report's
        "Interesting motifs (highest-ranking)" section.

    Raises:
        IndexError: if the report ends before an expected marker line.
        KeyError: if a motif matrix is neither 6 nor 8 positions long, or a
            site names a sequence missing from the "Your sequences:" table.
    """
    from collections import deque

    def skipPast(lines, marker):
        """Drop lines up to and including the first one containing marker."""
        while True:
            current = lines.popleft()
            if marker in current:
                return current

    weederDir = conf.tmp_dir + '/weeder'
    if not os.path.exists(weederDir):
        os.makedirs(weederDir)

    # Note that we use a slightly hacked version of weeder. Weederlauncher has
    # hard-coded paths to the other executables, and weederlauncher and
    # weederTFBS have hard-coded paths to the FreqFiles directory. We hacked
    # those to point to our directories.
    weederArgs = ' ' + str(seqFile) + ' ' + str(bgModel) + ' small T50'
    if revComp == True:
        weederArgs += ' -S'
    with open(weederDir + '/stderr.out', 'w') as errOut:
        weederProc = Popen("weederlauncher " + weederArgs, shell=True,
                           stdout=PIPE, stderr=errOut)
        weederProc.communicate()

    # Now parse output from weeder (blank lines are ignored throughout).
    with open(str(seqFile) + '.wee', 'r') as weeFile:
        outLines = deque(line for line in weeFile if line.strip())

    # Top hits for the 6bp and 8bp searches: the first line containing "1) ".
    hitBp = {}
    hitBp[6] = skipPast(outLines, '1) ').strip().split(' ')[1:]
    skipPast(outLines, 'Searching for motifs of length 8')
    hitBp[8] = skipPast(outLines, '1) ').strip().split(' ')[1:]

    # Map sequence numbers to sequence names.
    skipPast(outLines, 'Your sequences:')
    seqDict = {}
    while True:
        outLine = outLines.popleft()
        if '**** MY ADVICE ****' in outLine:
            break
        splitUp = outLine.strip().split(' ')
        seqDict[splitUp[1]] = splitUp[3].lstrip('>')

    # Get into the highest ranking motifs
    skipPast(outLines, 'Interesting motifs (highest-ranking)')
    PSSMs = []
    while True:
        name = outLines.popleft().strip()  # Motif sequence
        if '(not highest-ranking)' in name:
            break
        # Get redundant motifs
        outLines.popleft()
        redMotifs = [tok for tok in outLines.popleft().strip().split(' ')
                     if not tok == '-']
        outLines.popleft()
        outLines.popleft()

        # One line per motif instance until the frequency-matrix header.
        instances = []
        line = outLines.popleft()
        while line.find('Frequency Matrix') == -1:
            splitUp = [tok for tok in line.strip().split(' ') if tok]
            instances.append({'gene': seqDict[splitUp[0]],
                              'strand': splitUp[1],
                              'site': splitUp[2],
                              'start': splitUp[3],
                              'match': splitUp[4].lstrip('(').rstrip(')'),
                              'mfe': rnaDuplex(name, [splitUp[2]])[0]})
            line = outLines.popleft()

        # Read in Frequency Matrix: the first four counts of each row are
        # divided by the sum of all counts on that row.
        outLines.popleft()
        outLines.popleft()
        matrix = []
        col = outLines.popleft()
        while col.find('======') == -1:
            nums = [tok for tok in col.strip().split('\t')[1].split(' ') if tok]
            colSum = float(sum(int(tok.strip()) for tok in nums))
            matrix.append([float(nums[k]) / colSum for k in range(4)])
            col = outLines.popleft()
        PSSMs.append(pssm(biclusterName=name, nsites=instances,
                          eValue=hitBp[len(matrix)][1], pssm=matrix,
                          genes=redMotifs))
    return PSSMs
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from pssm import pssm
from malign import malign

# Step 1 (disabled): multiple alignment on clusters
# for i in range(3):
#     malign("neg/PR_neg_%d.fasta" % (i))
# NOTE(review): the .clustal files read below are presumably the output of
# that step -- confirm they exist before running this script.

# Step 2: build one centroid per cluster from its Clustal alignment.
centroids = []
for i in range(3):
    alignmentPath = "neg/PR_neg_%d.clustal" % (i)
    records = list(SeqIO.parse(alignmentPath, "clustal"))
    alignedSeqs = [record.seq for record in records]
    centroids.append(pssm(alignedSeqs))

# Step 3: write the centroids as FASTA records whose ids are their indices.
centroidRecords = []
for index, centroid in enumerate(centroids):
    centroidRecords.append(SeqRecord(Seq(centroid), id=str(index)))
SeqIO.write(centroidRecords, "neg/centroids.fasta", "fasta")

# Step 4: comparison -- align the centroids against each other.
malign("neg/centroids.fasta")