def trim_motif(TAMO_file, cut=0.4):
    """Save a trimmed copy of every motif found in TAMO_file.

    Each loaded motif is replaced by the result of its ``trimmed(cut)``
    method (which, per the original description, eliminates
    low-information flanks).  The trimmed motifs are written, in their
    original order, to ``<TAMO_file>_<cut>.trim``.
    """
    loaded = MotifTools.load(TAMO_file)
    out_path = "".join([TAMO_file, "_", str(cut), ".trim"])
    trimmed = [entry.trimmed(cut) for entry in loaded]
    save_motifs(trimmed, out_path)
def opentamo(fileloc):
    """Load the motifs stored in a TAMO file, tolerating an unreadable file.

    Has 1 argument:
      - fileloc: a string with the location of the file

    Returns whatever MotifTools.load produces for that file, or an empty
    list when the load raises IOError (e.g. the file does not exist).
    """
    try:
        motifs = MotifTools.load(fileloc)
    except IOError:
        motifs = []
    return motifs
def tamo2tamo(file, outname): global probefile, PROBESET, fsafile motifs = MotifTools.load(file) if fsafile: fsaname = fsafile else: fsaname = find_fsa(file) print '# FSA ', fsaname fsaD = MotifMetrics.fasta2seqs(fsaname, 'want_dict') probes = fsaD.keys() if not probefile: PROBESET = MotifMetrics.ProbeSet('YEAST') #PROBESET= pick_genome(fsaname) #for key,seq in fsaD.items(): # PROBESET.probes[key] = seq print "# %d motifs" % len(motifs) for motif in motifs: #motif.pvalue, motif.church = 1,1 #Comment this! if motif.pvalue == 1: motif.pvalue = PROBESET.p_value(motif, probes, 'v') if motif.church == 1: motif.church = PROBESET.church(motif, probes, 'v') #if motif.E_site == None: motif.E_site = PROBESET.E_sitef(motif,probes,3,'v') #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v') #if motif.E_seq == None: motif.E_seq = PROBESET.E_seq(motif,probes,'v') if motif.ROC_auc == None: motif.ROC_auc = PROBESET.ROC_AUC(motif, probes, 'v') #if motif.MNCP == None: motif.MNCP = PROBESET.MNCP(motif,probes,'v') if motif.frac == None: motif.frac = PROBESET.frac(motif, probes, 'v', 0.7) if motif.numbound == 0: matching = PROBESET.matching_ids(motif, [], factor=0.7) matchbound = [x for x in matching if x in probes] motif.numbound = len(probes) motif.nummotif = len(matching) motif.numboundmotif = len(matchbound) if 0 and motif.CRA == None: try: pass CRA, Cfrac = PROBESET.cons_ROC_AUC(motif, probes, 'v', tuple='YES') motif.CRA = CRA motif.Cfrac = Cfrac except: pass MotifTools.save_motifs(motifs, outname)
def tamo2tamo(file, outname): global probefile, PROBESET, fsafile motifs = MotifTools.load(file) if fsafile: fsaname = fsafile else: fsaname = find_fsa(file) print '# FSA ',fsaname fsaD = MotifMetrics.fasta2seqs(fsaname,'want_dict') probes = fsaD.keys() if not probefile: PROBESET = MotifMetrics.ProbeSet('YEAST') #PROBESET= pick_genome(fsaname) #for key,seq in fsaD.items(): # PROBESET.probes[key] = seq print "# %d motifs"%len(motifs) for motif in motifs: #motif.pvalue, motif.church = 1,1 #Comment this! if motif.pvalue == 1: motif.pvalue = PROBESET.p_value(motif,probes,'v') if motif.church == 1: motif.church = PROBESET.church(motif,probes,'v') #if motif.E_site == None: motif.E_site = PROBESET.E_sitef(motif,probes,3,'v') #if motif.E_chi2 == None: motif.E_chi2 = PROBESET.E_chi2(motif,probes,None,'v') #if motif.E_seq == None: motif.E_seq = PROBESET.E_seq(motif,probes,'v') if motif.ROC_auc== None: motif.ROC_auc= PROBESET.ROC_AUC(motif,probes,'v') #if motif.MNCP == None: motif.MNCP = PROBESET.MNCP(motif,probes,'v') if motif.frac == None: motif.frac = PROBESET.frac(motif,probes,'v',0.7) if motif.numbound == 0: matching = PROBESET.matching_ids(motif,[],factor=0.7) matchbound = [x for x in matching if x in probes] motif.numbound = len(probes) motif.nummotif = len(matching) motif.numboundmotif = len(matchbound) if 0 and motif.CRA == None: try: pass CRA, Cfrac = PROBESET.cons_ROC_AUC(motif,probes,'v',tuple='YES') motif.CRA = CRA motif.Cfrac = Cfrac except: pass MotifTools.save_motifs(motifs,outname)
def main(): try: opts, args = getopt.getopt(sys.argv[1:], "f:m:n:L:t:a:S:i:", ["help", "output="]) # AD added 'i' except getopt.GetoptError: usage() sys.exit(1) if not opts: usage() sys.exit(1) print "#" + ' '.join(sys.argv) fastafile, motiffile, motifnums, labels, thresh = (None, None, [], None, 0.75) # AD changed thresh val to 0.75 from 0.7 ambigs = [] scale = 50.0 / 1000.0 motifs = [] for opt, value in opts: #print opt, value if opt == '-f': fastafile = value elif opt == '-m': motifs.extend(MotifTools.txt2motifs(value)) elif opt == '-n': motifnums = [int(x) for x in value.split(',')] elif opt == '-L': labels = list(value) elif opt == '-t': thresh = float(value) elif opt == '-a': ambigs.extend(value.split(',')) elif opt == '-S': scale = float(value) elif opt == '-i': motiffile = value # AD added this option to ACTUALLY supply the tamo motif file at the command-line. The code to deal with motiffiles already existed. There was just no code for User to supply one. probes = Fasta.load(fastafile) if motiffile: for f in motiffile.split(','): # AD added this to allow supplying multiple tamo files at the prompt like you can supply multiple motifs motifs.extend(MotifTools.load(f)) if ambigs: for ambig in ambigs: motifs.append( MotifTools.Motif_from_text(ambig,0.1) ) if not motifnums: motifnums = range(len(motifs)) print '# %d: %s'%(len(motifs),motifnums) for i in range(len(motifnums)): motif = motifs[motifnums[i]] if labels and i < len(labels): txt = labels[i] else: txt = '%d'%i print '%-3s : %s %5.2f (%4.2f)'%(txt,motif,thresh*motif.maxscore,thresh) probehits = {} for key in probes.keys(): hits_by_motif = [] save_flag = 0 if re.search('[BDHU]',probes[key]): continue for num in motifnums: result = motifs[num].scan(probes[key],thresh*motif.maxscore) if result[0]: hits_by_motif.append(result) save_flag = 1 else: hits_by_motif.append(None) if save_flag: probehits[key]=hits_by_motif #scale = .1 maxw = 40 for key in probehits.keys(): l = len(probes[key]) a = list('-'* 
int(scale*l) ) a.extend( list(' '*10 ) ) desc = [] matches = probehits[key] for i in range(len(matches)): if matches[i]: subseqs,endpoints,scores = matches[i] for idx in range(len(subseqs)): start,stop = endpoints[idx] subseq = subseqs[idx] score = scores[idx] if labels and (i<len(labels)): ID = labels[i] else : ID = '%d'%i desc.append('%s %s %d-%d %4.2f '%(ID,subseq,start,stop,score)) start = int(start*scale) for offset in range(10): if a[start+offset] == '-': if labels and (i < len(labels)): a[start+offset] = labels[i] else: a[start+offset] = '%d'%i break print '%-14s %s'%(key,''.join(a)), print ' '*max(0,maxw-len(a)), '| '.join(['%-27s'%x for x in desc]) print print "Found matches in %d of %d input probes"%(len(probehits),len(probes))
#!/usr/bin/python ''' This opens a general TAMO cluster list and outputs **TO STANDARD OUT** the probability matrices of all items there. Separated by a line with the name of each cluster. *It is recommended to be used in a bash pipeline where the standard out can be written into a file.* Has 1 argument: - motiflist: a TAMO motif list that will be outputed Returns: - A series of strings that represet the probability matrices of all motifs in the input list Author: Hector Galvez ''' from sys import argv from TAMO import MotifTools # Open list motiflist = MotifTools.load(argv[1]) # Start printing information for each motif for num in range(len(motiflist)): print '>Cluster_' + str(num + 1) motiflist[num]._print_p()
import os,sys,string from TAMO import MotifTools from TAMO.seq import Fasta from TAMO.MotifMetrics import ProbeSet promoters = ProbeSet(sys.argv[1]) geneset_ids = open(sys.argv[2]).read().split('\n')[:-1] match_ids = [] prom_ids = promoters.probes.keys() for id in geneset_ids: if id in prom_ids: match_ids.append(id) motifs = MotifTools.load(sys.argv[3]) church = 0.05 rocauc = 0.1 pvalue = 0.05 print "Name\tMotif\tChurch\tRoc-auc\tP-value" for m in motifs: m.church = promoters.church (m, match_ids) # m.ROC_auc = promoters.ROC_AUC (m, match_ids) m.pvalue = promoters.p_value (m, match_ids) if m.church <= church and m.pvalue <= pvalue: print "%s\t%s\t%s\t%s" %\ (m.source, m, m.church, m.pvalue)
def motif_matrix(fsa, motif, outfile, genome='mm9'):
    """Write a motifs-by-sequences matrix of scaled best-match scores.

    Arguments:
      - fsa:     FASTA file of sequences to score
      - motif:   TAMO motif file
      - outfile: path handed to np.savetxt (values formatted as '%1.3f')
      - genome:  'hg18' selects the human Markov background file; any other
                 value selects the mouse one

    Every motif's log-likelihood matrix is adjusted by subtracting log2 of
    the zeroth-order background frequency of each letter, and a new motif is
    built from it.  For each upper-cased sequence the adjusted motif's
    ``bestscore`` is min-max scaled with that motif's minscore/maxscore.
    Row i of the output corresponds to motif i, column j to sequence j
    (in the order returned by the FASTA dictionary's ``values()``).
    """
    if genome == 'hg18':
        markov = "/nfs/genomes/human_gp_mar_06/hg18_promoters_3000_1000.markov"
    else:
        markov = "/nfs/data/cwng/chipseq/hypotheses/Mouse.markov"

    # Load the motifs and the background once; the background file is the
    # same for every motif, so there is no need to re-read it per motif.
    m = MotifTools.load(motif)
    EM.loadMarkovBackground(markov)
    bg = EM.theMarkovBackground.zeroth()

    F = Fasta.load(fsa, key_func=lambda x: x)
    # Upper-case each sequence once rather than once per motif.
    seqs = [seq.upper() for seq in F.values()]

    SCORES = np.zeros((len(m), len(seqs)), dtype='float')

    for i, M in enumerate(m):
        # NOTE: M.logP is adjusted in place, so the loaded motif is modified.
        ll = M.logP
        for pos in ll:
            for letter in pos.keys():
                pos[letter] = pos[letter] - math.log(bg[letter]) / math.log(2.0)
        AM = MotifTools.Motif_from_ll(ll)
        mi, ma = AM.minscore, AM.maxscore

        for j, seq_fwd in enumerate(seqs):
            max_score = AM.bestscore(seq_fwd)
            SCORES[i, j] = (max_score - mi) / (ma - mi)

    np.savetxt(outfile, SCORES, fmt='%1.3f')
# # Compare motifs in tamo format # from TAMO import MotifTools from TAMO.MotifMetrics import ProbeSet from TAMO.Clustering import MotifCompare from TAMO.Clustering import Kmedoids import sys import pickle import pprint file_unknown = sys.argv[1]# Unknown file_tfbs = sys.argv[2]# TF db motifs_unknown = MotifTools.load(file_unknown) motifs_tfbs = MotifTools.load(file_tfbs) match_dict = {} for unknown in motifs_unknown: tf_list = [] for tfbs in motifs_tfbs: #print #print "Comparing motifs:" #print " %s vs %s" % (unknown.source, tfbs.source) #print " Unknown motif ( %s ) vs TFBS ( %s ) " % (unknown, tfbs) #print joined_motifs = [] joined_motifs.append(unknown) joined_motifs.append(tfbs) print joined_motifs
from gusPyCode.MDAP_proj.MDAP_defs import alignAndCombineMotifs from TAMO import MotifTools Motif = MotifTools.Motif outFile = '/Users/biggus/Documents/James/Collaborations/Campbell/data/Results_HyperGeoScreen/masked/Results_gGEMS/CCupAt4Days.6-8mers.gGEMS.top6PlusCombos.motifs.stdThresh.tmo' m = MotifTools.load('/Users/biggus/Documents/James/Collaborations/Campbell/data/Results_HyperGeoScreen/masked/Results_gGEMS/CCupAt4Days.6-8mers.gGEMS.top6.motifs.stdThresh.tmo') w = [5.8952, 5.6523, 5.0585, 4.9788, 4.9678, 4.7688] toTmo = [] toTmo.append(alignAndCombineMotifs([m[0],m[1]],[w[0],w[1]])) toTmo.append(alignAndCombineMotifs([m[0],m[4]],[w[0],w[4]])) toTmo.append(alignAndCombineMotifs([m[1],m[4]],[w[1],w[4]])) toTmo.append(alignAndCombineMotifs([m[2],m[3]],[w[2],w[3]])) toTmo.append(alignAndCombineMotifs([m[2],m[5]],[w[2],w[5]])) for e in toTmo: print e.oneletter MotifTools.save_motifs(m+toTmo,outFile) None
# Prepare the inputs for a site map: derive the file names from the gene-list
# directory given as argv[1], load its cluster motifs and write one symbol
# per cluster to the symbols file.

# The gene-list name is the last component of the directory path
genelist = argv[1].split('/')[-1]
allclusters = argv[1] + '/' + genelist + '_allclusters.tamo'
#print genelist
oneletters = argv[1] + '/other/' + genelist + '_oneletter.tmp'
symbols = argv[1] + '/other/' + genelist + '_symbols.tmp'

# Open output files for writing
# (both names are rebound here from path strings to open file objects)
oneletters = open(oneletters, 'w')
symbols = open(symbols, 'w')

# Define output variables
oneletterlist = []
# Pool of symbols available for clusters; its length caps the cluster count
symbolstring = '1234567890ABCDEFGHIJKLMNOPQRSTUVWXYZ*+.,:;!'

# Open list
motiflist = MotifTools.load(allclusters)

# Try to verify the initial list is not too long
if len(motiflist) > len(symbolstring):
    # If the list is too long, raise an exception so that the program quits
    # NOTE(review): both output files are still open when this is raised.
    raise ValueError("The cluster list is too long for sitemap.py")
# If the list is not too long, adjust the symbols string to the appropriate length
else:
    symbolstring = symbolstring[:len(motiflist)]

# Save symbol string in the symbols file and close that file
symbols.write(symbolstring)
symbols.close()

# Add oneletter summaries to the list
# NOTE(review): the body of this loop is not visible in this excerpt.
for num in range(len(motiflist)):
- Requires the clustering.py script to be in the same directory Author: Hector Galvez """ from sys import argv,exit # To be able to parse arguments in the command line from clustering import clusterinfo,clusteravg # Import clustering functions from clustering.py script # Import all relevant TAMO modules from TAMO import MotifTools from TAMO import Clustering from TAMO.Clustering import MotifCompare from TAMO.Clustering import Kmedoids try: # Import the motif lists from the tamo file provided as the first argument inputlist = MotifTools.load(argv[1]) except IOError: print "Couldn't find %s, moving on..." % str(argv[1].split('/')[-2] + '/' + argv[1].split('/')[-1]) exit() # Create the name of a new TAMO file for clustered motifs genelist = str(argv[1].split('/')[-3]) output = argv[1].rstrip(genelist + '.tamo') + genelist + '_clusters.tamo' # Create output information clusterinf = clusterinfo(inputlist) averages = clusteravg(inputlist,clusterinf) # Save new list of cluster averages MotifTools.save_motifs(averages,output)
def main(): ########################################################################################## #THEME.py: THEME module for performing cross-validated hypothesis testing on transcription #factor binding data. #Usage: python THEME.py foreground_fasta_file (file path) background_fasta_file (file path) #hypothesis_index (integer) -fse hypothesis_file (file path) -markov markov_background (file path) #-motif_file output_file (file path) -cv fold cross-validation (integer) ########################################################################################## if (len(sys.argv)<4): print "Usage: THEME.py foreground.fsa background.fsa hypotheses.txt" sys.exit(1) fg_file = sys.argv[1] #get fasta file with foreground sequences bg_file = sys.argv[2] #get fasta file with background sequences test_indices = sys.argv[3] #colon separated indices into fse file cv_level = 2 #default 2-fold cross-validation refine = 1 randomize = 0 beta = 0.0 delta = 0.001 motif_file = 'dummy.out' dump_categories_to_file = 0 test_family = '' #read in any command line options for arg, i in zip(sys.argv,range(len(sys.argv))): if (arg == '-cv'): cv_level = int(sys.argv[i+1]) if (arg == '-markov'): markov_file = sys.argv[i+1] if (arg == '-fse'): fse_file = sys.argv[i+1] if (arg == '-norefine'): refine = 0 if (arg == '-beta'): beta = float(sys.argv[i+1]) if (arg == '-delta'): delta = float(sys.argv[i+1]) if (arg == '-randomization'): randomize = 1 if (arg == '-motif_file'): motif_file = sys.argv[i+1] if (arg == '-dump'): dump_categories_to_file = 1 if (arg == '-family'): test_family = family FH = open(motif_file, 'w') FH.write("******THEME Motif Output******") FH.close() random.seed() cross_val = THEME(fg_file, bg_file, cv_level, markov_file) if ((beta>0.0)and(beta<1.0)) : cross_val.beta = beta/(1-beta) cross_val.delta = delta cross_val.refine = refine cross_val.randomize = randomize cross_val.motif_file = motif_file if (test_family): cross_val.family = test_family if 
(dump_categories_to_file): cross_val.dump = 1 ################################################################################### #get seed sequences that will be tested ################################################################################### models = [] fses = MotifTools.load(fse_file) if (test_indices=='all'): indices = range(len(fses)) else: indices = [] ivals = test_indices.split(':') for v in ivals: indices.append(int(v)) for i in indices: ll = fses[i].logP bg = EM.theMarkovBackground.zeroth() for pos in ll: for letter in pos.keys(): pos[letter] = pos[letter] - math.log(bg[letter])/math.log(2.0) adj_bg_model = MotifTools.Motif_from_ll(ll) adj_bg_model.source = fses[i].source models.append(adj_bg_model) (m, err) = cross_val.run_CV(models)
def motif_matrix(fsa,motif,outfile,genome='mm9'):
    """Write a motifs-by-sequences matrix of scaled best-match scores.

    Arguments:
      - fsa:     FASTA file of sequences to score
      - motif:   TAMO motif file
      - outfile: path handed to np.savetxt (values formatted as '%1.3f')
      - genome:  'hg18' selects the human Markov background file; any other
                 value selects the mouse one

    Every motif's log-likelihood matrix is adjusted by subtracting log2 of
    the zeroth-order background frequency of each letter, and a new motif is
    built from it.  For each upper-cased sequence the adjusted motif's
    ``bestscore`` is min-max scaled with that motif's minscore/maxscore.
    Row i of the output corresponds to motif i, column j to sequence j
    (in the order returned by the FASTA dictionary's ``values()``).
    """
    if genome=='hg18':
        markov="/nfs/genomes/human_gp_mar_06/hg18_promoters_3000_1000.markov"
    else:
        markov="/nfs/data/cwng/chipseq/hypotheses/Mouse.markov"

    # Load the motifs and the background once; the background file is the
    # same for every motif, so there is no need to re-read it per motif.
    m=MotifTools.load(motif)
    EM.loadMarkovBackground(markov)
    bg = EM.theMarkovBackground.zeroth()

    F=Fasta.load(fsa,key_func=lambda x:x)
    # Upper-case each sequence once rather than once per motif.
    seqs=[seq.upper() for seq in F.values()]

    SCORES=np.zeros((len(m),len(seqs)),dtype='float')

    for i,M in enumerate(m):
        # NOTE: M.logP is adjusted in place, so the loaded motif is modified.
        ll = M.logP
        for pos in ll:
            for letter in pos.keys():
                pos[letter] = pos[letter] - math.log(bg[letter])/math.log(2.0)
        AM = MotifTools.Motif_from_ll(ll)
        mi,ma=AM.minscore,AM.maxscore

        for j,seq_fwd in enumerate(seqs):
            max_score=AM.bestscore(seq_fwd)
            SCORES[i,j]=(max_score-mi)/(ma-mi)

    np.savetxt(outfile,SCORES,fmt='%1.3f')