def tamofile2motifs(filename):
    """Parse a motif text file and return the list of motifs found in it.

    The file is scanned line by line and the following line prefixes are
    recognised:

    * ``Log-odds m...`` -- start of a log-odds matrix.  The next line gives
      the column header (its token count minus one is the motif width) and
      the four lines after that hold one row per letter.  A new motif is
      built with ``MotifTools.Motif_from_ll`` and appended to the result.
    * ``Motif `` -- summary line; positional tokens populate ``nseqs``,
      ``totalbits``, ``MAP``, ``seeddist``, ``seednum``, ``pvalue`` and,
      when a ``ch:`` token is present, ``church`` on the most recent motif.
    * ``Threshold:``, ``Source:``, ``Gamma:``, ``Evalue`` -- set the
      matching attribute on the most recent motif.
    * ``Seed `` -- records ``seed number -> seed text`` for later lookup.
    * any line containing both ``Using`` and ``as seeds`` -- names the seed
      file (third token from the end).

    After the scan every motif gets ``seedfile`` (if one was seen) and
    ``seedtxt`` (if its ``seednum`` has a recorded seed).

    Args:
        filename: path of the text file to read.

    Returns:
        list of motif objects, in file order (empty if no matrix was found).

    Raises:
        IndexError: if an attribute line appears before any matrix, or a
            line has fewer tokens than its format requires.
        ValueError: if a numeric field cannot be converted.
    """
    # 'with' guarantees the handle is closed even if reading fails.
    with open(filename, 'r') as handle:
        lines = handle.readlines()

    motifs = []
    seedD = {}       # seed number -> seed text, from 'Seed N: text' lines
    seedfile = ''

    for i, line in enumerate(lines):
        # Prefixes are deliberately truncated exactly as the original
        # slice comparisons were (e.g. first 10 chars of 'Log-odds matrix').
        if line.startswith('Log-odds m'):
            w = len(lines[i + 1].split()) - 1
            ll = [{} for _ in range(w)]
            for j in range(4):
                toks = lines[i + j + 2].split()
                # Second character of the row label is the letter
                # (presumably labels look like '#A' -- confirm with format).
                letter = toks[0][1]
                for pos in range(w):
                    ll[pos][letter] = float(toks[pos + 1])
            motifs.append(MotifTools.Motif_from_ll(ll))
        elif line.startswith('Motif '):
            toks = line.split()
            current = motifs[-1]
            # Sequence count is wrapped in parentheses; strip them.
            current.nseqs = float(re.sub(r'[\(\)]', '', toks[3]))
            current.totalbits = float(toks[5])
            current.MAP = float(toks[7])
            current.seeddist = float(toks[9])
            current.seednum = int(toks[10][0:-1])  # drop trailing character
            # The file stores -log10 values; convert back to probabilities.
            current.pvalue = math.pow(10, -float(toks[12]))
            if 'ch:' in toks:
                current.church = math.pow(10, -float(toks[14]))
        elif line.startswith('Threshold:'):
            motifs[-1].threshold = float(line.split()[1])
        elif line.startswith('Seed '):
            toks = line.split()
            seed_id = int(toks[1][0:-1])  # '10:' -> 10
            seedD[seed_id] = toks[2]
        elif line.startswith('Source:'):
            motifs[-1].source = line[7:].strip()
        elif line.startswith('Gamma:'):
            motifs[-1].gamma = float(line[6:])
        elif line.startswith('Evalue'):
            # Value starts after the 7-character 'Evalue:' label.
            motifs[-1].evalue = float(line[7:].strip())

        # Independent of the prefix checks above.
        # Example: '#Using all (132) motifs in SLT_081503.seeds as seeds:'
        if line.find('Using') >= 0 and line.find('as seeds') >= 0:
            seedfile = line.split()[-3]

    for motif in motifs:
        if seedfile:
            motif.seedfile = seedfile
        seednum = motif.seednum
        # 'in' replaces dict.has_key(), which no longer exists in Python 3.
        if seednum in seedD:
            motif.seedtxt = seedD[seednum]
    return motifs
def motif_matrix(fsa, motif, outfile, genome='mm9'):
    """Score every sequence against every motif and save the score matrix.

    For each motif loaded from ``motif`` the log-probabilities are adjusted
    by subtracting the log2 of the zeroth-order background frequency of each
    letter, a new motif is built from the adjusted values, and its
    ``bestscore`` on each upper-cased sequence is rescaled to
    ``(score - minscore) / (maxscore - minscore)``.

    Args:
        fsa: path of the FASTA file, passed to ``Fasta.load``.
        motif: path of the motif file, passed to ``MotifTools.load``.
        outfile: destination for the matrix, written with ``np.savetxt``
            using ``'%1.3f'``; rows are motifs, columns are sequences.
        genome: ``'hg18'`` selects the human background file; any other
            value (default ``'mm9'``) selects the mouse background file.

    Returns:
        None.  The result is written to ``outfile``.

    Note:
        The adjustment is applied in place to each motif's ``logP`` entries.
    """
    if genome == 'hg18':
        markov = "/nfs/genomes/human_gp_mar_06/hg18_promoters_3000_1000.markov"
    else:
        markov = "/nfs/data/cwng/chipseq/hypotheses/Mouse.markov"

    # Load motifs, then the background model.  The background file does not
    # change between motifs, so it is loaded once here rather than once per
    # loop iteration.
    m = MotifTools.load(motif)
    EM.loadMarkovBackground(markov)
    bg = EM.theMarkovBackground.zeroth()

    F = Fasta.load(fsa, key_func=lambda x: x)
    seqs = list(F.values())
    SCORES = np.zeros((len(m), len(seqs)), dtype='float')

    log_of_two = math.log(2.0)
    for i, M in enumerate(m):
        ll = M.logP
        # Background-adjust: subtract log2(background frequency) per letter.
        for pos in ll:
            for letter in pos:
                pos[letter] = pos[letter] - math.log(bg[letter]) / log_of_two
        AM = MotifTools.Motif_from_ll(ll)
        mi, ma = AM.minscore, AM.maxscore

        for j, seq in enumerate(seqs):
            max_score = AM.bestscore(seq.upper())
            # Rescale relative to the motif's own min/max score range.
            SCORES[i, j] = (max_score - mi) / (ma - mi)

    np.savetxt(outfile, SCORES, fmt='%1.3f')
def main():
    """Command-line entry point for THEME cross-validated hypothesis testing.

    Usage:
        python THEME.py foreground_fasta_file background_fasta_file
            hypothesis_index -fse hypothesis_file -markov markov_background
            [-motif_file output_file] [-cv folds] [-beta value]
            [-delta value] [-norefine] [-randomization] [-dump]
            [-family name]

    Positional arguments:
        1. FASTA file with foreground sequences.
        2. FASTA file with background sequences.
        3. Colon-separated indices into the ``-fse`` file, or ``all``.

    ``-fse`` and ``-markov`` are required: their values are used
    unconditionally below.  The process exits with status 1 when the
    positional arguments or either required option is missing.
    """
    if len(sys.argv) < 4:
        print("Usage: THEME.py foreground.fsa background.fsa hypotheses.txt")
        sys.exit(1)

    fg_file = sys.argv[1]       # fasta file with foreground sequences
    bg_file = sys.argv[2]       # fasta file with background sequences
    test_indices = sys.argv[3]  # colon separated indices into fse file

    # Defaults for the optional settings.
    cv_level = 2                # default 2-fold cross-validation
    refine = 1
    randomize = 0
    beta = 0.0
    delta = 0.001
    motif_file = 'dummy.out'
    dump_categories_to_file = 0
    test_family = ''
    markov_file = None
    fse_file = None

    # Read in any command line options.
    for i, arg in enumerate(sys.argv):
        if arg == '-cv':
            cv_level = int(sys.argv[i + 1])
        if arg == '-markov':
            markov_file = sys.argv[i + 1]
        if arg == '-fse':
            fse_file = sys.argv[i + 1]
        if arg == '-norefine':
            refine = 0
        if arg == '-beta':
            beta = float(sys.argv[i + 1])
        if arg == '-delta':
            delta = float(sys.argv[i + 1])
        if arg == '-randomization':
            randomize = 1
        if arg == '-motif_file':
            motif_file = sys.argv[i + 1]
        if arg == '-dump':
            dump_categories_to_file = 1
        if arg == '-family':
            # Previously assigned an undefined name; the value follows the
            # flag, like every other valued option.
            test_family = sys.argv[i + 1]

    # Fail early with a clear message instead of an unbound-variable error
    # after the output file has already been truncated.
    if markov_file is None or fse_file is None:
        print("Error: both -markov <file> and -fse <file> must be supplied")
        sys.exit(1)

    # Start the motif output file with its header line.
    with open(motif_file, 'w') as FH:
        FH.write("******THEME Motif Output******")

    random.seed()
    cross_val = THEME(fg_file, bg_file, cv_level, markov_file)
    # beta is only applied when strictly inside (0, 1); stored as the ratio.
    if 0.0 < beta < 1.0:
        cross_val.beta = beta / (1 - beta)
    cross_val.delta = delta
    cross_val.refine = refine
    cross_val.randomize = randomize
    cross_val.motif_file = motif_file
    if test_family:
        cross_val.family = test_family
    if dump_categories_to_file:
        cross_val.dump = 1

    ###########################################################################
    # get seed sequences that will be tested
    ###########################################################################
    models = []
    fses = MotifTools.load(fse_file)
    if test_indices == 'all':
        indices = range(len(fses))
    else:
        indices = [int(v) for v in test_indices.split(':')]

    log_of_two = math.log(2.0)
    for i in indices:
        ll = fses[i].logP
        bg = EM.theMarkovBackground.zeroth()
        # Subtract log2(background frequency) per letter; note this updates
        # the entries of fses[i].logP in place.
        for pos in ll:
            for letter in pos:
                pos[letter] = pos[letter] - math.log(bg[letter]) / log_of_two
        adj_bg_model = MotifTools.Motif_from_ll(ll)
        adj_bg_model.source = fses[i].source
        models.append(adj_bg_model)

    (m, err) = cross_val.run_CV(models)