def memefiles2tamo(files, tamoname):
    """Convert AlignAce (.ace) / MEME (.meme) output files into one TAMO file.

    Loads each file's motifs, fills in missing enrichment statistics using a
    ProbeSet, and saves all motifs to `tamoname`.

    Globals:
        probefile/PROBESET -- probe set used for statistics; a 'YEAST'
            ProbeSet is created when no probefile was supplied.
        fsafile -- optional explicit FASTA path overriding Fasta.find().
    """
    global probefile, PROBESET, fsafile
    motifs = []
    for filename in files:
        print ">>>SDFSD>F ",filename
        if re.search('\.ace$',filename):
            mdobject = AlignAce.AlignAce(filename)
            # Derive a FASTA name from the .ace name if none was recorded.
            if not mdobject.fastafile: mdobject.fastafile=filename.replace('.ace','.fsa')
        elif re.search('\.meme.*$',filename):
            mdobject = Meme.Meme(filename)
            # Strip any single-char infix (e.g. ".1.meme") before mapping to .fsa.
            if not mdobject.fastafile: mdobject.fastafile=re.sub('\..\.meme','.meme',filename).replace('.meme','.fsa')
        motifs.extend(mdobject.motifs)
    # NOTE(review): everything below uses the LAST loop iteration's mdobject
    # and filename -- this assumes all input files share one FASTA; confirm.
    #fsaname = find_fsa(mdobject.fastafile)
    print mdobject.fastafile
    if fsafile:
        fsaname = fsafile
    else:
        fsaname = Fasta.find(mdobject.fastafile)
    fsaD = Fasta.load(fsaname)
    probes = fsaD.keys()
    if not probefile:
        PROBESET = MotifMetrics.ProbeSet('YEAST')
        #PROBESET= pick_genome(fsaname)
    # Register the loaded sequences with the probe set.
    for key,seq in fsaD.items():
        PROBESET.probes[key] = seq
    for motif in motifs:
        # Compute each statistic only if it still holds its "unset" sentinel
        # (1 for pvalue/church, None for the others).
        if motif.pvalue == 1: motif.pvalue = PROBESET.p_value(motif,probes,'v')
        if motif.church == 1: motif.church = PROBESET.church(motif,probes,'v')
        #if motif.E_site == None: motif.E_site = PROBESET.E_sitef(motif,probes,3,'v')
        #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v')
        #if motif.E_seq == None: motif.E_seq = PROBESET.E_seq(motif,probes,'v')
        if motif.ROC_auc== None: motif.ROC_auc= PROBESET.ROC_AUC(motif,probes,'v')
        #if motif.MNCP == None: motif.MNCP = PROBESET.MNCP(motif,probes,'v')
        if motif.frac == None: motif.frac = PROBESET.frac(motif,probes,'v',0.7)
        if re.search('\.meme$',filename):
            # MEME supplies an E-value; store it as a -log10 MAP-like score.
            motif.MAP = -math.log(motif.evalue)/math.log(10)
        if 0 and (motif.CRA == None):
            # Dead code (guarded by "0 and"): consensus ROC-AUC computation.
            try:
                pass
                CRA, Cfrac = PROBESET.cons_ROC_AUC(motif,probes,'v',tuple='YES')
                motif.CRA = CRA
                motif.Cfrac = Cfrac
            except: pass
    # Sort the last file's motif list (MEME by p-value, otherwise by church
    # score); note the SAVED list is `motifs`, which is not re-sorted here.
    if re.search('\.meme$',filename):
        mdobject.motifs.sort(lambda x,y: cmp(x.pvalue, y.pvalue))
    else:
        mdobject.motifs.sort(lambda x,y: cmp(x.church, y.church))
    MotifTools.save_motifs(motifs,tamoname)
def tamo2tamo(file, outname): global probefile, PROBESET, fsafile motifs = MotifTools.load(file) if fsafile: fsaname = fsafile else: fsaname = find_fsa(file) print '# FSA ', fsaname fsaD = MotifMetrics.fasta2seqs(fsaname, 'want_dict') probes = fsaD.keys() if not probefile: PROBESET = MotifMetrics.ProbeSet('YEAST') #PROBESET= pick_genome(fsaname) #for key,seq in fsaD.items(): # PROBESET.probes[key] = seq print "# %d motifs" % len(motifs) for motif in motifs: #motif.pvalue, motif.church = 1,1 #Comment this! if motif.pvalue == 1: motif.pvalue = PROBESET.p_value(motif, probes, 'v') if motif.church == 1: motif.church = PROBESET.church(motif, probes, 'v') #if motif.E_site == None: motif.E_site = PROBESET.E_sitef(motif,probes,3,'v') #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v') #if motif.E_seq == None: motif.E_seq = PROBESET.E_seq(motif,probes,'v') if motif.ROC_auc == None: motif.ROC_auc = PROBESET.ROC_AUC(motif, probes, 'v') #if motif.MNCP == None: motif.MNCP = PROBESET.MNCP(motif,probes,'v') if motif.frac == None: motif.frac = PROBESET.frac(motif, probes, 'v', 0.7) if motif.numbound == 0: matching = PROBESET.matching_ids(motif, [], factor=0.7) matchbound = [x for x in matching if x in probes] motif.numbound = len(probes) motif.nummotif = len(matching) motif.numboundmotif = len(matchbound) if 0 and motif.CRA == None: try: pass CRA, Cfrac = PROBESET.cons_ROC_AUC(motif, probes, 'v', tuple='YES') motif.CRA = CRA motif.Cfrac = Cfrac except: pass MotifTools.save_motifs(motifs, outname)
def tamo2tamo(file, outname):
    """Load a TAMO motif file, fill in missing statistics, and re-save.

    NOTE(review): this is a second, near-identical definition of
    tamo2tamo; being defined later in the module, it shadows the earlier
    one. The two should probably be deduplicated.
    """
    global probefile, PROBESET, fsafile
    motifs = MotifTools.load(file)
    # Explicit FASTA file wins; otherwise derive one from the input name.
    if fsafile:
        fsaname = fsafile
    else:
        fsaname = find_fsa(file)
    print '# FSA ',fsaname
    fsaD = MotifMetrics.fasta2seqs(fsaname,'want_dict')
    probes = fsaD.keys()
    if not probefile:
        PROBESET = MotifMetrics.ProbeSet('YEAST')
        #PROBESET= pick_genome(fsaname)
    #for key,seq in fsaD.items():
    #    PROBESET.probes[key] = seq
    print "# %d motifs"%len(motifs)
    for motif in motifs:
        #motif.pvalue, motif.church = 1,1   #Comment this!
        # Compute each statistic only if it still holds its "unset" sentinel.
        if motif.pvalue == 1: motif.pvalue = PROBESET.p_value(motif,probes,'v')
        if motif.church == 1: motif.church = PROBESET.church(motif,probes,'v')
        #if motif.E_site == None: motif.E_site = PROBESET.E_sitef(motif,probes,3,'v')
        #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v')
        #if motif.E_seq == None: motif.E_seq = PROBESET.E_seq(motif,probes,'v')
        if motif.ROC_auc== None: motif.ROC_auc= PROBESET.ROC_AUC(motif,probes,'v')
        #if motif.MNCP == None: motif.MNCP = PROBESET.MNCP(motif,probes,'v')
        if motif.frac == None: motif.frac = PROBESET.frac(motif,probes,'v',0.7)
        if motif.numbound == 0:
            # Count probes matching the motif and how many are bound probes.
            matching = PROBESET.matching_ids(motif,[],factor=0.7)
            matchbound = [x for x in matching if x in probes]
            motif.numbound = len(probes)
            motif.nummotif = len(matching)
            motif.numboundmotif = len(matchbound)
        if 0 and motif.CRA == None:
            # Dead code (guarded by "0 and"): consensus ROC-AUC computation.
            try:
                pass
                CRA, Cfrac = PROBESET.cons_ROC_AUC(motif,probes,'v',tuple='YES')
                motif.CRA = CRA
                motif.Cfrac = Cfrac
            except: pass
    MotifTools.save_motifs(motifs,outname)
def ace2tamo(filename, tamoname): global probefile, PROBESET if re.search('\.ace$',filename): mdobject = AlignAce.AlignAce(filename) elif re.search('\.meme$',filename): mdobject = Meme.Meme(filename) fsaname = find_fsa(mdobject.fastafile) fsaD = MotifMetrics.fasta2seqs(fsaname,'want_dict') probes = fsaD.keys() if not probefile: PROBESET = MotifMetrics.ProbeSet('HUMAN_250') #PROBESET= pick_genome(fsaname) for key,seq in fsaD.items(): PROBESET.probes[key] = seq for motif in mdobject.motifs: if motif.pvalue == 1: motif.pvalue = PROBESET.p_value(motif,probes,'v') if motif.church == 1: motif.church = PROBESET.church(motif,probes,'v') if motif.E_site == None: motif.E_site = PROBESET.E_sitef(motif,probes,3,'v') #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v') if motif.E_seq == None: motif.E_seq = PROBESET.E_seq(motif,probes,'v') if motif.ROC_auc== None: motif.ROC_auc= PROBESET.ROC_AUC(motif,probes,'v') if motif.MNCP == None: motif.MNCP = PROBESET.MNCP(motif,probes,'v') if re.search('\.meme$',filename): motif.MAP = -math.log(motif.evalue)/math.log(10) sys.stdout.flush() i = 0 for motif in mdobject.motifs: motif.seednum = i ; i=i+1 kmers = motif.bogus_kmers(100) motif.maxscore = -100 scores = [motif.scan(kmer)[2][0] for kmer in kmers] print Arith.avestd(scores) if re.search('\.meme$',filename): mdobject.motifs.sort(lambda x,y: cmp(x.pvalue, y.pvalue)) else: mdobject.motifs.sort(lambda x,y: cmp(x.church, y.church)) MotifTools.save_motifs(mdobject.motifs,tamoname)
def motifs2tamo(motifs, outname): global probefile, PROBESET fsaname = find_fsa(outname) fsaD = MotifMetrics.fasta2seqs(fsaname,'want_dict') probes = fsaD.keys() if not probefile: PROBESET = MotifMetrics.ProbeSet('YEAST') #PROBESET= pick_genome(fsaname) #for key,seq in fsaD.items(): # PROBESET.probes[key] = seq print "# %d motifs"%len(motifs) for motif in motifs: if motif.pvalue == 1: motif.pvalue = PROBESET.p_value(motif,probes,'v') if motif.church == 1: motif.church = PROBESET.church(motif,probes,'v') if motif.E_site == None: motif.E_site = PROBESET.E_sitef(motif,probes,3,'v') #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v') if motif.E_seq == None: motif.E_seq = PROBESET.E_seq(motif,probes,'v') if motif.ROC_auc== None: motif.ROC_auc= PROBESET.ROC_AUC(motif,probes,'v') if motif.MNCP == None: motif.MNCP = PROBESET.MNCP(motif,probes,'v') MotifTools.save_motifs(motifs,outname)
# Convert a MEME text output file into a TAMO motif file.
# Each MOTIF section's letter-probability matrix is scaled back into an
# approximate integer count matrix using its nsites value.
filename = sys.argv[1]
motif_list = open(filename).read().split('\nMOTIF')[1:]
tamo_list = []
motif_counter = 1
# BUGFIX: the MEME header reads
#   "letter-probability matrix: alength= 4 w= <width> nsites= <n> E= ..."
# The previous pattern "(w= [0-9]+)" captured the motif WIDTH, not the
# number of contributing sites, so counts were scaled by the wrong value.
nsites_pat = re.compile("(nsites= [0-9]+)")
for motif in motif_list:
    m_info1, m_info2 = motif.split('letter-probability matrix: ')
    # The matrix section ends at the 80-dash separator line.
    m_mat = m_info2.split('--------------------------------------------------------------------------------', 1)[0]
    m_mat_header, m_prob_mat = m_mat.split('\n', 1)
    nsites = int(nsites_pat.findall(m_mat_header)[0].split('= ')[1])
    count_mat = []
    for count in m_prob_mat.split('\n')[:-1]:
        sites = [float(i) for i in count.split()]
        # Scale each probability by nsites to approximate integer counts
        # (truncation toward zero, matching the original behavior).
        count_dict = {'A': int(sites[0] * nsites),
                      'C': int(sites[1] * nsites),
                      'G': int(sites[2] * nsites),
                      'T': int(sites[3] * nsites)}
        count_mat.append(count_dict)
    m = MotifTools.Motif_from_counts(count_mat)
    m.source = "Motif%s | %s" % (motif_counter, m_mat_header)
    tamo_list.append(m)
    motif_counter += 1
MotifTools.save_motifs(tamo_list, "MEME_motifs_%s.tamo" % filename.split('.')[0])
# Sort on log'd pVals testMotifs.sort(key=lambda x: x[1]) comboMotifs = [] for i in range(0,int(len(testMotifs)*0.2)): simMotifs = getKmersWithOneMisMtch(testMotifs[i][0],testMotifs) alndMotifs = alignSimilarMotifs([x[0] for x in simMotifs]) #for m in simMotifs: #print m[0].oneletter comboMotifs.append(MotifTools.sum(alndMotifs,[-x[1] for x in simMotifs])) # -x[1] to convert neg logs to pos weights print len(comboMotifs) t2 = time.time() oFile = '/Users/biggus/Documents/James/Collaborations/Campbell/data/Results_HyperGeoScreen/masked/Results_gGEMS/CCupAt4Days.gte2x.5-16mers.shfSeq.3.gGEMS.tmo' pFile = '/Users/biggus/Documents/James/Collaborations/Campbell/data/Results_HyperGeoScreen/masked/Results_gGEMS/CCupAt4Days.gte2x.5-16mers.shfSeq.3.gGEMS.pkl' MotifTools.save_motifs(comboMotifs,oFile,kmer_count=60) pFile = open(pFile, 'w') cPickle.dump(comboMotifs,pFile) t3 = time.time() print 'Calculations took %.3f min.\nWriting/Pickling took %.3f min.' % ((float(t2)-t1)/60, (float(t3)-t2)/60) None
# Create a general list with all the motifs from all algorithms genlist = [] genlist.extend(seederlist) genlist.extend(memelist) genlist.extend(weederlist) # Perform clustering on the general list of motifs clusterinf = clusterinfo(genlist) averages = clusteravg(genlist,clusterinf) # Trim the final average list # averages = trim(averages,0.5) # print clusterinf # Save new list of cluster averages MotifTools.save_motifs(averages,tamooutput) # WEBLOGO IMAGE GENERATION # Generate giflogos of all average motifs for index in range(len(averages)): cluster = 'Cluster ' + str(index + 1) clustergif = argv[1] + '/other/cluster' + str(index + 1) averages[index].giflogo(clustergif,title=cluster,scale=2) # SUMMARY REPORT # Determine location of the markdown file for the summary report reportout = open(str(argv[1] + '/final/' + listname + '_cluster_report.md'), 'w') # Write the header of the report rundate = date.today() header = "# Summary report for `" + listname + "`\nThis analysis was run on: " + str(rundate) + \
def memefiles2tamo(files, tamoname):
    """Convert AlignAce (.ace) / MEME (.meme) output files into one TAMO file.

    NOTE(review): a second definition of memefiles2tamo exists in this
    module; this variant enables E_site/MNCP and the consensus ROC-AUC
    ("if 1 and ...") computations, and does not consult the fsafile global.
    """
    global probefile, PROBESET
    motifs = []
    for filename in files:
        print ">>>SDFSD>F ", filename
        if re.search('\.ace$', filename):
            mdobject = AlignAce.AlignAce(filename)
            # Derive a FASTA name from the .ace name if none was recorded.
            if not mdobject.fastafile:
                mdobject.fastafile = filename.replace('.ace', '.fsa')
        elif re.search('\.meme.*$', filename):
            mdobject = Meme.Meme(filename)
            # Strip any single-char infix (e.g. ".1.meme") before mapping to .fsa.
            if not mdobject.fastafile:
                mdobject.fastafile = re.sub('\..\.meme', '.meme', filename).replace('.meme', '.fsa')
        motifs.extend(mdobject.motifs)
    # NOTE(review): everything below uses the LAST loop iteration's mdobject
    # and filename -- this assumes all input files share one FASTA; confirm.
    #fsaname = find_fsa(mdobject.fastafile)
    print mdobject.fastafile
    fsaname = Fasta.find(mdobject.fastafile)
    fsaD = Fasta.load(fsaname)
    probes = fsaD.keys()
    if not probefile:
        PROBESET = MotifMetrics.ProbeSet('YEAST')
        #PROBESET= pick_genome(fsaname)
    # Register the loaded sequences with the probe set.
    for key, seq in fsaD.items():
        PROBESET.probes[key] = seq
    for motif in motifs:
        # Compute each statistic only if it still holds its "unset" sentinel
        # (1 for pvalue/church, None for the others).
        if motif.pvalue == 1: motif.pvalue = PROBESET.p_value(motif, probes, 'v')
        if motif.church == 1: motif.church = PROBESET.church(motif, probes, 'v')
        if motif.E_site == None: motif.E_site = PROBESET.E_sitef(motif, probes, 3, 'v')
        #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v')
        #if motif.E_seq == None: motif.E_seq = PROBESET.E_seq(motif,probes,'v')
        if motif.ROC_auc == None: motif.ROC_auc = PROBESET.ROC_AUC(motif, probes, 'v')
        if motif.MNCP == None: motif.MNCP = PROBESET.MNCP(motif, probes, 'v')
        if motif.frac == None: motif.frac = PROBESET.frac(motif, probes, 'v', 0.7)
        if re.search('\.meme$', filename):
            # MEME supplies an E-value; store it as a -log10 MAP-like score.
            motif.MAP = -math.log(motif.evalue) / math.log(10)
        if 1 and (motif.CRA == None):
            # Best-effort consensus ROC-AUC; failures are deliberately ignored.
            try:
                pass
                CRA, Cfrac = PROBESET.cons_ROC_AUC(motif, probes, 'v', tuple='YES')
                motif.CRA = CRA
                motif.Cfrac = Cfrac
            except: pass
    # Sort the last file's motif list (MEME by p-value, otherwise by church
    # score); note the SAVED list is `motifs`, which is not re-sorted here.
    if re.search('\.meme$', filename):
        mdobject.motifs.sort(lambda x, y: cmp(x.pvalue, y.pvalue))
    else:
        mdobject.motifs.sort(lambda x, y: cmp(x.church, y.church))
    MotifTools.save_motifs(motifs, tamoname)
from gusPyCode.MDAP_proj.MDAP_defs import alignAndCombineMotifs from TAMO import MotifTools Motif = MotifTools.Motif outFile = '/Users/biggus/Documents/James/Collaborations/Campbell/data/Results_HyperGeoScreen/masked/Results_gGEMS/CCupAt4Days.6-8mers.gGEMS.top6PlusCombos.motifs.stdThresh.tmo' m = MotifTools.load('/Users/biggus/Documents/James/Collaborations/Campbell/data/Results_HyperGeoScreen/masked/Results_gGEMS/CCupAt4Days.6-8mers.gGEMS.top6.motifs.stdThresh.tmo') w = [5.8952, 5.6523, 5.0585, 4.9788, 4.9678, 4.7688] toTmo = [] toTmo.append(alignAndCombineMotifs([m[0],m[1]],[w[0],w[1]])) toTmo.append(alignAndCombineMotifs([m[0],m[4]],[w[0],w[4]])) toTmo.append(alignAndCombineMotifs([m[1],m[4]],[w[1],w[4]])) toTmo.append(alignAndCombineMotifs([m[2],m[3]],[w[2],w[3]])) toTmo.append(alignAndCombineMotifs([m[2],m[5]],[w[2],w[5]])) for e in toTmo: print e.oneletter MotifTools.save_motifs(m+toTmo,outFile) None
from gusPyCode.MDAP_proj.MDAP_defs import loadMotifsFromOutFile from TAMO import MotifTools mdOutFiles = ['/Users/biggus/Documents/James/Data/ReClustering/PrelimData_Grant_Feb09/RandSplitFastas/AceResults/Clus2_247gene_0.8_Apr16_14-46-33.ace.1.txt', '/Users/biggus/Documents/James/Data/ReClustering/PrelimData_Grant_Feb09/RandSplitFastas/AceResults/Clus2_247gene_0.8_Apr16_14-46-33.ace.2.txt', '/Users/biggus/Documents/James/Data/ReClustering/PrelimData_Grant_Feb09/RandSplitFastas/AceResults/Clus2_247gene_0.8_Apr16_14-46-33.ace.3.txt'] for mdFile in mdOutFiles: motifs = loadMotifsFromOutFile(mdFile,'list') # ['Meme', 'AlignAce', 'MDscan', 'Weeder','list'] MotifTools.save_motifs(motifs,mdFile+'.tmo') print 'Done.'
# generate PWM for indicated motifs
#module load TAMO (on hpc)
import sys
from TAMO import MotifTools

# Input file (argv[1]): one motif text per line; '#' lines are comments.
motif = []
input = open(sys.argv[1], 'r')
for line in input:
    if line.startswith("#"):
        continue
    line = line.strip()   # removes trailing \n / \r and surrounding spaces
    if line:              # skip blank lines (previously "" reached Motif_from_text)
        motif.append(line)
input.close()
print (motif)

# Build a TAMO Motif (PWM) from each motif string and save them all.
pw = [MotifTools.Motif_from_text(text) for text in motif]
MotifTools.save_motifs(pw, sys.argv[1] + '.tamo')
def run_CV(self,models):
    """Run self.cv_level-fold cross-validation over the candidate models.

    For each trial, foreground/background probe sets are chosen (randomly
    when self.randomize is set, otherwise via self.undersample) and split
    into cv_level groups; each model is refined and scored through
    self.train_classifier().  In the non-randomized case the best
    parameters are used to retrain each model on the full data via
    self.train_final(), matches are printed, and the refined motifs are
    saved to self.motif_file.

    Returns:
        (self.best_motif, min) -- the refined motifs and the lowest mean
        cross-validation error found for the last hypothesis.
    """
    num_bg = len(self.all_probes.keys())
    num_fg = len(self.fg_seqs.keys())
    # Merge the foreground sequences into the full probe dictionary.
    for key in self.fg_seqs.keys():
        self.all_probes[key] = self.fg_seqs[key]
    self.probes = self.all_probes.keys()
    # Randomized runs repeat 50 times to build an error distribution.
    if (self.randomize):
        trials = 50
    else:
        trials = 1
    self.models = models
    std = 0.0
    mean = 0.0
    sum_sq = 0.0
    sum_mean = 0.0
    for trial in range(trials):
        if (self.randomize):
            # Random fg/bg selection; background capped at 10x foreground.
            N = num_bg/num_fg
            if (N>10):
                N = 10
            fg = []
            bg = []
            total = len(self.probes)
            # Sample foreground probes without replacement.
            while(len(fg)<num_fg):
                sp = random.randint(0,(total-1))
                if (not(fg.count(self.probes[sp])>0)):
                    fg.append(self.probes[sp])
            # Sample background probes disjoint from fg, without replacement.
            while(len(bg)<(N*num_fg)):
                sp = random.randint(0,(total-1))
                if ( (not(bg.count(self.probes[sp])>0)) and (not(fg.count(self.probes[sp])>0)) ):
                    bg.append(self.probes[sp])
        else:
            #select a random under-sampled set of background sequences
            N = num_bg/num_fg
            (N, bg) = self.undersample(N)
            num_bg = N*num_fg
            fg = self.fg_seqs.keys()
        fg_group_size = len(fg)/self.cv_level #determine foreground group size
        bg_group_size = len(bg)/self.cv_level #determine background group size
        #separate sequences into groups
        fg_groups = {}
        bg_groups = {}
        temp_fg = []
        temp_bg = []
        for a in fg:
            temp_fg.append(a)
        for b in bg:
            temp_bg.append(b)
        for i in range(self.cv_level):
            fg_groups[i] = []
            bg_groups[i] = []
            if (i==(self.cv_level-1)):
                # Last group takes whatever remains (absorbs rounding).
                fg_groups[i] = temp_fg[0:]
                bg_groups[i] = temp_bg[0:]
            else:
                for j in range(fg_group_size):
                    entry = temp_fg[random.randint(0,len(temp_fg)-1)]
                    temp_fg.remove(entry)
                    fg_groups[i].append(entry)
                for j in range(bg_group_size):
                    entry = temp_bg[random.randint(0,len(temp_bg)-1)]
                    temp_bg.remove(entry)
                    bg_groups[i].append(entry)
        ####################################################################################
        #for each seed we run EM to train a probability model, calculate the maximum LLR for
        #each input sequence in a group, train a SVM classifier on the training data, and
        #calculate classification error on the test set.  We repeat this for each cv-group
        #and determine the mean cross-validation error for each hypothesis.
        ####################################################################################
        self.classification_errors = {}
        self.refined_motifs = {}
        classifier = {}
        self.best_motif = []
        num_models = len(self.models)
        for k in range(num_models):
            self.best_motif.append(None)
            (self.refined_motifs[k], self.classification_errors[k]) = self.train_classifier(k, fg_groups, bg_groups, N, trial)
            #print out some information
            best_c = 0
            best_beta = 0.0
            min = 1.0    # NOTE(review): shadows the builtin min() within this method
            mean = 0.0
            # Pick the (beta, C-index) pair with the lowest mean cv error.
            for beta in self.classification_errors[k].keys():
                for j in range(len(self.classification_errors[k][beta][0])):
                    mean = 0.0
                    for i in range(self.cv_level):
                        mean = mean + self.classification_errors[k][beta][i][j]
                    mean = mean/self.cv_level
                    if (mean<min):
                        min = mean
                        best_c = j
                        best_beta = beta
            if (self.randomize):
                # Accumulate mean/variance of the error across random trials.
                sum_sq = sum_sq + (min*min)
                sum_mean = sum_mean + min
                cv_mean = sum_mean/(trial+1)
                if (trial>0):
                    try:
                        std = math.sqrt((sum_sq - (sum_mean*sum_mean)/(trial+1))/(trial))
                    except:
                        std = 0.0
                    stddev_err = 0.71*std/math.sqrt(trial+1)
            else:
                # Retrain on the full fg/bg using the best beta found above.
                (self.best_motif[k], classifier[k], fn) = self.train_final(k, fg, bg, N, best_beta)
                print "\r\r---------------New Hypothesis---------------"
                print "Unrefined Hypothesis: %s"%(self.models[k])
                print "Refined Hypothesis: %s Optimal Beta: %f"%(self.best_motif[k], best_beta/(best_beta+1.0))
                print "Mean %i-fold cv error: %f"%(self.cv_level,min)
                LLR_thresh = (classifier[k][2]*classifier[k][1]/classifier[k][0] - classifier[k][3])
                print "LLR match threshold: %f True positives: %i False Negatives: %i"%(LLR_thresh/self.best_motif[k].maxscore, (len(fg)-fn), fn)
                # Record provenance and tuned parameters on the refined motif.
                self.best_motif[k].source = self.models[k].source
                if (self.family!=''):
                    self.best_motif[k].family = self.family
                self.best_motif[k].dataset = self.datafiles[0]
                self.best_motif[k].bgfile = self.datafiles[1]
                self.best_motif[k].beta = best_beta/(best_beta+1.0)
                self.best_motif[k].match_thresh = LLR_thresh
                self.best_motif[k].cverror = min
                print "Motif matches in positive input set:"
                best_pssm = MDsupport.Motif2c_PSSM(self.best_motif[k])
                for seq in fg:
                    sites = []
                    matches = best_pssm.matchstarts(self.all_probes[seq],LLR_thresh)
                    if (matches):
                        line = seq + '------> '
                        for match in matches:
                            entry = str(match) + ': ' + self.all_probes[seq][match:match+self.best_motif[k].width] + ' '
                            line = line + entry
                        print line
    if (self.randomize):
        print "Mean: %f Std.Dev: %f Error: %f Percent error: %f"%(cv_mean,std,stddev_err,(stddev_err/std))
    else:
        MotifTools.save_motifs(self.best_motif, self.motif_file)
    return((self.best_motif, min))
motiflist.pop(blankindex)

# Replace each raw matrix (a list of tab-separated strings) in motiflist
# with a list of per-position {nucleotide: probability} dictionaries.
for idx, raw in enumerate(motiflist):
    # Width = number of value columns on the first data row.
    width = len(raw[1].strip().split('\t')) - 1
    columns = [{} for _ in range(width)]
    # Rows 1-4 each hold one nucleotide: its label then its probabilities.
    for row in range(1, 5):
        fields = raw[row].strip().split('\t')
        base = fields[0]
        values = fields[1:]
        for pos in range(width):
            columns[pos][base] = float(values[pos])
    motiflist[idx] = columns
#print motiflist

# TAMO CONVERSION
# Convert the dictionary matrices into TAMO motifs and save them.
tamomotifs = [MotifTools.Motif_from_counts(matrix[:], beta=0.01, bg=bkgrddict)
              for matrix in motiflist]
MotifTools.save_motifs(tamomotifs, output)