#first sequence set includes randomly distributed sites #second sequence set includes only one site #experiment is repeated 100 times random.seed(None) #write csv header out_file.write('Post_motif,Post_single_site\n') #loop experiments for cnt in range(0, 100): print "Experiment: ", cnt #create background sequence set: 100 seqs 283 bp long set1 = MGlib.random_DNA(283, {'A': 0.3, 'C': 0.2, 'G': 0.2, 'T': 0.3}, 100) set2 = set1[:] #compute softmax scores for background sequences in dataset gscr = MGlib.esfmax_score_seqs(set1, pssm, rpssm) #compute softmax scores for motif sequences mscr = MGlib.esfmax_score_seqs(mot.instances, pssm, rpssm) #get normal distributions for background and motif n_g = norm(mean(gscr), std(gscr)) n_m = norm(mean(mscr), std(mscr)) #create motif instances pmot1 = MGlib.sample_motif(mot, 100) pmot2 = MGlib.sample_motif(mot, 1) #insert sites in sequences e = 0 while (e < len(set1)): #insert random site in first 10 sequences
#all sites are inserted at the first position of the sequence #get individual sequence posteriors and write them together with the #score of the site inserted #repeat 100 times random.seed(None) #write csv header out_file.write('Ins_site_score,Posterior\n') #loop experiments for cnt in range(0,100): print "Experiment: ", cnt #create background sequence set: 100 seqs 283 bp long set1 = MGlib.random_DNA(283,{'A': 0.3,'C': 0.2,'G': 0.2,'T': 0.3},100) #compute softmax scores for background sequences in dataset gscr = MGlib.esfmax_score_seqs(set1,pssm,rpssm) #compute softmax scores for motif sequences mscr = MGlib.esfmax_score_seqs(mot.instances,pssm,rpssm) #get normal distributions for background and motif n_g=norm(mean(gscr), std(gscr)) n_m=norm(mean(mscr), std(mscr)) #create motif instances pmot1 = MGlib.sample_motif(mot,100) #insert sites in sequences e=0 while (e<len(set1)): set1[e] = pmot1[e] + set1[e] e = e+1
def main(): ############################################################################### #set default parameters motif_filename = "CsoR.txt" #input file out_filename = "cog_exp_sym2_c" #prefix for output verbose = 0 #verbose mode alpha = 1.0 / 300.0 #mixing ratio for regulated model rproms = 3.0 #number of regulated promoters [prior] tproms = 1811.0 #total number of promoters in genome [prior] # control number of cogs and number of permutations num_cogs = 10000 neg_cutoff = 9900 # Cog #'s less than this are negative num_perms = 100 cog_sample_size = 1000 #verbose if verbose: print "Using: ", motif_filename, " as input" if verbose: print "Writing to (suffix): ", "[void]" if out_filename==""\ else out_filename #open file for ouput try: out_file = open( out_filename + str(num_cogs) + "_s" + str(cog_sample_size) + "_p" + str(num_perms) + ".csv", "w") except (IOError, OSError) as file_open_exception: print "*** Something went wrong while opening the output file" print "*** Error: ", file_open_exception.errno, " - ",\ file_open_exception.strerror sys.exit() #compute priors PR = rproms / tproms #prior probability of regulation PB = 1.0 - PR #prior probability of non-regulation PPR = PB / PR #prior probability ratio # read motif and assign 0.25 pseudocounts to PSWM # also assign background uniform distribution for the PSSM (default) mot = MGlib.read_motif(motif_filename) mot.pseudocounts = 1 mot.background = None # save the pssm for the motif and the reverse complement #(so that they are not recalculated everytime we invoke motif.pssm) pssm = mot.pssm rpssm = pssm.reverse_complement() # Save the motif itself as a list of strings for later permuting motif_sites = [] num_motif_sites = len(mot.instances) for i in range(num_motif_sites): motif_sites.append(str(mot.instances[i])) random.seed(None) # Create the COGS all_cogs = [] the_neg_seqs = [] neg_cog_nums = [i for i in range(0, neg_cutoff)] ran_neg_cog_nums = sample(cog_sample_size, neg_cog_nums, replace=False) cog_file = open( "seqs_sym2_" + str(num_cogs) + "_s" + str(cog_sample_size) + "_p" + str(num_perms) + ".csv", "w") for i in range(0, num_cogs): label = get_cog_type(i, neg_cutoff) #print "Create cog #", i, label cur_cog = create_COG(label, mot) all_cogs.append(cur_cog) if i in ran_neg_cog_nums: # A negatively regulated cog for s in cur_cog: the_neg_seqs.append(s) cog_file.write("%d,%s\n" % (i, s)) else: for s in cur_cog: cog_file.write("%d,%s\n" % (i, s)) cog_file.close() # compute softmax scores for sampled background sequences gscr = MGlib.esfmax_score_seqs(the_neg_seqs, pssm, rpssm) # compute softmax scores for motif sequences mscr = MGlib.esfmax_score_seqs(mot.instances, pssm, rpssm) # get normal distributions for background and motif mean_gscr = mean(gscr) std_gscr = std(gscr) n_g = norm(mean_gscr, std_gscr) mean_mscr = mean(mscr) std_mscr = std(mscr) n_m = norm(mean(mscr), std(mscr)) smeans_file = open( "smeans_stds_sym2_" + str(num_cogs) + "_s" + str(cog_sample_size) + "_p" + str(num_perms) + ".csv", "w") smeans_file.write("PSSM n_g,%13.10f,%13.10f\n" % (mean_gscr, std_gscr)) smeans_file.write("PSSM n_m,%13.10f,%13.10f\n" % (mean_mscr, std_mscr)) # Create the permuted pssm and n_m and n_g for the permutation tests new_pssm_list = [] rnew_pssm_list = [] n_m_perms = [] n_g_perms = [] for j in range(0, num_perms): #print "\n***************** Create permutation #", j # permute the columns of the motif new_mot = sym_permute_motif(motif_sites) new_pssm = new_mot.pssm # rnew_pssm = new_pssm.reverse_complement() new_pssm_list.append(new_pssm) rnew_pssm_list.append(rnew_pssm) # compute score for the negative sequences gscr = MGlib.esfmax_score_seqs(the_neg_seqs, new_pssm, rnew_pssm) mean_gscr = mean(gscr) std_gscr = std(gscr) # compute softmax scores for new motif sequences mscr = MGlib.esfmax_score_seqs(new_mot.instances, new_pssm, rnew_pssm) mean_mscr = mean(mscr) std_mscr = std(mscr) smeans_file.write("PermPSSM n_g,%13.10f,%13.10f\n" % (mean_gscr, std_gscr)) smeans_file.write("PermPSSM n_m,%13.10f,%13.10f\n" % (mean_mscr, std_mscr)) # get normal distributions for background and motif n_g_temp = norm(mean_gscr, std_gscr) n_g_perms.append(n_g_temp) n_m_temp = norm(mean_mscr, std_mscr) n_m_perms.append(n_m_temp) smeans_file.close() # write csv header out_file.write( 'COG Num,Pos/Neg Regulated,Posterior,LogLikelihood,True Model LL,LL Pval\n' ) # For each cog, do the posterior calculation and the permutation tests for i in range(0, num_cogs): label = get_cog_type(i, neg_cutoff) #print "Test Cog:", i,label # The original posterior computation #compute softmax scores for sequences in dataset scrs = MGlib.esfmax_score_seqs(all_cogs[i], pssm, rpssm) #print np.min(scrs[0]), np.max(scrs[0]) # Compute posterior # get log-likelihoods for sequences in dataset llrs = MGlib.ll_ratios(scrs, n_g, n_m, alpha) # get per-sequence posterior for the sequences in dataset fpost = MGlib.PostP(llrs, PPR, 0) true_model_ll = compute_log_l(scrs, n_g, n_m, alpha) ##################################### # Permutation test log_ls = [] for j in range(0, num_perms): #print " ... perm test", j # Compute score and log likelihood for each permutation. scrs = MGlib.esfmax_score_seqs(all_cogs[i], new_pssm_list[j], rnew_pssm_list[j]) log_l = compute_log_l(scrs, n_g_perms[j], n_m_perms[j], alpha) log_ls.append(log_l) rev_pval = compute_p_val(log_ls, true_model_ll) pval = 1.0 - rev_pval out_file.write("%d,%s,%10.7f,%10.7f,%10.7f,%10.7f\n" % (i, label, fpost, rev_pval, true_model_ll, pval)) out_file.close()
def main(): ############################################################################### # set default parameters motif_filename = "CsoR.txt" # input file out_filename = "cog_exp_sym2_c" # prefix for output verbose = 0 # verbose mode alpha = 1.0 / 300.0 # mixing ratio for regulated model rproms = 3.0 # number of regulated promoters [prior] tproms = 1811.0 # total number of promoters in genome [prior] # control number of cogs and number of permutations num_cogs = 10000 neg_cutoff = 9900 # Cog #'s less than this are negative num_perms = 100 cog_sample_size = 1000 # verbose if verbose: print "Using: ", motif_filename, " as input" if verbose: print "Writing to (suffix): ", "[void]" if out_filename == "" else out_filename # open file for ouput try: out_file = open( out_filename + str(num_cogs) + "_s" + str(cog_sample_size) + "_p" + str(num_perms) + ".csv", "w" ) except (IOError, OSError) as file_open_exception: print "*** Something went wrong while opening the output file" print "*** Error: ", file_open_exception.errno, " - ", file_open_exception.strerror sys.exit() # compute priors PR = rproms / tproms # prior probability of regulation PB = 1.0 - PR # prior probability of non-regulation PPR = PB / PR # prior probability ratio # read motif and assign 0.25 pseudocounts to PSWM # also assign background uniform distribution for the PSSM (default) mot = MGlib.read_motif(motif_filename) mot.pseudocounts = 1 mot.background = None # save the pssm for the motif and the reverse complement # (so that they are not recalculated everytime we invoke motif.pssm) pssm = mot.pssm rpssm = pssm.reverse_complement() # Save the motif itself as a list of strings for later permuting motif_sites = [] num_motif_sites = len(mot.instances) for i in range(num_motif_sites): motif_sites.append(str(mot.instances[i])) random.seed(None) # Create the COGS all_cogs = [] the_neg_seqs = [] neg_cog_nums = [i for i in range(0, neg_cutoff)] ran_neg_cog_nums = sample(cog_sample_size, neg_cog_nums, replace=False) cog_file = open("seqs_sym2_" + str(num_cogs) + "_s" + str(cog_sample_size) + "_p" + str(num_perms) + ".csv", "w") for i in range(0, num_cogs): label = get_cog_type(i, neg_cutoff) # print "Create cog #", i, label cur_cog = create_COG(label, mot) all_cogs.append(cur_cog) if i in ran_neg_cog_nums: # A negatively regulated cog for s in cur_cog: the_neg_seqs.append(s) cog_file.write("%d,%s\n" % (i, s)) else: for s in cur_cog: cog_file.write("%d,%s\n" % (i, s)) cog_file.close() # compute softmax scores for sampled background sequences gscr = MGlib.esfmax_score_seqs(the_neg_seqs, pssm, rpssm) # compute softmax scores for motif sequences mscr = MGlib.esfmax_score_seqs(mot.instances, pssm, rpssm) # get normal distributions for background and motif mean_gscr = mean(gscr) std_gscr = std(gscr) n_g = norm(mean_gscr, std_gscr) mean_mscr = mean(mscr) std_mscr = std(mscr) n_m = norm(mean(mscr), std(mscr)) smeans_file = open( "smeans_stds_sym2_" + str(num_cogs) + "_s" + str(cog_sample_size) + "_p" + str(num_perms) + ".csv", "w" ) smeans_file.write("PSSM n_g,%13.10f,%13.10f\n" % (mean_gscr, std_gscr)) smeans_file.write("PSSM n_m,%13.10f,%13.10f\n" % (mean_mscr, std_mscr)) # Create the permuted pssm and n_m and n_g for the permutation tests new_pssm_list = [] rnew_pssm_list = [] n_m_perms = [] n_g_perms = [] for j in range(0, num_perms): # print "\n***************** Create permutation #", j # permute the columns of the motif new_mot = sym_permute_motif(motif_sites) new_pssm = new_mot.pssm # rnew_pssm = new_pssm.reverse_complement() new_pssm_list.append(new_pssm) rnew_pssm_list.append(rnew_pssm) # compute score for the negative sequences gscr = MGlib.esfmax_score_seqs(the_neg_seqs, new_pssm, rnew_pssm) mean_gscr = mean(gscr) std_gscr = std(gscr) # compute softmax scores for new motif sequences mscr = MGlib.esfmax_score_seqs(new_mot.instances, new_pssm, rnew_pssm) mean_mscr = mean(mscr) std_mscr = std(mscr) smeans_file.write("PermPSSM n_g,%13.10f,%13.10f\n" % (mean_gscr, std_gscr)) smeans_file.write("PermPSSM n_m,%13.10f,%13.10f\n" % (mean_mscr, std_mscr)) # get normal distributions for background and motif n_g_temp = norm(mean_gscr, std_gscr) n_g_perms.append(n_g_temp) n_m_temp = norm(mean_mscr, std_mscr) n_m_perms.append(n_m_temp) smeans_file.close() # write csv header out_file.write("COG Num,Pos/Neg Regulated,Posterior,LogLikelihood,True Model LL,LL Pval\n") # For each cog, do the posterior calculation and the permutation tests for i in range(0, num_cogs): label = get_cog_type(i, neg_cutoff) # print "Test Cog:", i,label # The original posterior computation # compute softmax scores for sequences in dataset scrs = MGlib.esfmax_score_seqs(all_cogs[i], pssm, rpssm) # print np.min(scrs[0]), np.max(scrs[0]) # Compute posterior # get log-likelihoods for sequences in dataset llrs = MGlib.ll_ratios(scrs, n_g, n_m, alpha) # get per-sequence posterior for the sequences in dataset fpost = MGlib.PostP(llrs, PPR, 0) true_model_ll = compute_log_l(scrs, n_g, n_m, alpha) ##################################### # Permutation test log_ls = [] for j in range(0, num_perms): # print " ... perm test", j # Compute score and log likelihood for each permutation. scrs = MGlib.esfmax_score_seqs(all_cogs[i], new_pssm_list[j], rnew_pssm_list[j]) log_l = compute_log_l(scrs, n_g_perms[j], n_m_perms[j], alpha) log_ls.append(log_l) rev_pval = compute_p_val(log_ls, true_model_ll) pval = 1.0 - rev_pval out_file.write("%d,%s,%10.7f,%10.7f,%10.7f,%10.7f\n" % (i, label, fpost, rev_pval, true_model_ll, pval)) out_file.close()