import numpy as np


def best_model_sel(prob_treshold, survey_type="random",
                   pro_path="/home/jemejia/CosmicVarianceLAES/"):
    ID_file = pro_path + "data/mock_survey/" + "ID_" + survey_type + "_surveys.dat"
    p_values_file = pro_path + "data/mock_survey/" + "p_values_FOF_ID_" + survey_type + "_surveys.dat"

    # Columns: m_min, m_max, f_occ, survey ID, KS-test probability.
    ks_data = np.loadtxt(p_values_file)
    m_min_arr = ks_data[:, 0]
    m_max_arr = ks_data[:, 1]
    f_occ_arr = ks_data[:, 2]
    ID_survey_arr = ks_data[:, 3]
    model_prob_arr = ks_data[:, 4]
    print np.size(m_min_arr)

    # Choosing the models with KS-test probabilities greater than prob_treshold.
    index = np.where(model_prob_arr >= prob_treshold)
    print np.size(index)

    best_models = np.empty([np.size(index), np.size(ks_data[0, :])])
    del ks_data
    best_models[:, 0] = m_min_arr[index]
    best_models[:, 1] = m_max_arr[index]
    best_models[:, 2] = f_occ_arr[index]
    best_models[:, 3] = ID_survey_arr[index]
    best_models[:, 4] = model_prob_arr[index]
    print "the number of selected models is", np.size(best_models[:, 0])
    return best_models
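if __name__ == '__main__':
    # Minimal usage sketch. The 0.9 threshold and the "random" survey type are
    # illustrative assumptions, not values taken from this pipeline.
    best_models = best_model_sel(0.9, survey_type="random")
    # Each row holds [m_min, m_max, f_occ, survey ID, KS probability].
    print best_models[:, 0:3]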
import numpy
import cjson


def read(filename):
    """Accepts a filename string and dispatches on its extension.

    '.npy.list.npz' files are loaded as a list of numpy arrays, '.npy'
    files (possibly .gz compressed) are read with numpy.loadtxt, and any
    other file is treated as one JSON object per line. Returns a list of
    the loaded objects.
    """
    if '.npy.list.npz' in filename:
        a = numpy.load(filename)
        return [a['arr_%s' % i] for i in xrange(len(a.keys()))]
    elif '.npy' in filename:
        # use numpy to read in; may be .gz compressed
        return numpy.loadtxt(filename)
    else:
        with open(filename, 'r') as f:
            return [cjson.decode(r) for r in f.xreadlines()]
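if __name__ == '__main__':
    # Minimal round-trip sketch (the file name 'example.json' is an arbitrary
    # placeholder): write two JSON records, then read them back with read().
    with open('example.json', 'w') as f:
        f.write(cjson.encode({'id': 1}) + '\n')
        f.write(cjson.encode({'id': 2}) + '\n')
    print read('example.json')   # -> [{'id': 1}, {'id': 2}]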
        [1.7, 2.0, 1.2, 4.8, 5, 4.2, ])
    #var = SupervisedLDAVars(test_data, K=3)
    #var = SupervisedLDAVars(noisy_test_data, K=3)

    # use my big generated dataset
    n = 9994
    labeled_documents = topiclib.read_sparse('data/synthbigtlc/labeled.dat')[:100]
    y = np.loadtxt('data/synthbigtlc/yL.npy')[:100]
    real_data = (labeled_documents, y)

    var = PartialSupervisedLDAVars(real_data, Ks=5, Kb=20)

    try:
        output = run_partial_slda(var)
    except Exception, e:
        print e
        import pdb; pdb.post_mortem()
def best_model_correlation(best_model_array, theta_min, theta_max, theta_bins,
                           survey_type="random", field="full", distance=6558.3,
                           obs_surveys=12, x_width=46.0, y_width=35.0,
                           z_depth=41.0, box_length=250, random_cat_number=16,
                           pro_path="/home/jemejia/CosmicVariance/"):
    print "computing correlation functions of the selected models"
    dmh_path = pro_path + "data/dark_matter/FOF/"
    laes_path = pro_path + "data/laes/FOF/"

    n_i = int(box_length / x_width)
    n_j = int(box_length / y_width)
    n_k = int(box_length / z_depth)
    n_models = n_i * n_j * n_k

    ID_file = pro_path + "data/mock_survey/" + "ID_" + survey_type + "_surveys.dat"
    ID_data = np.loadtxt(ID_file, dtype='int')
    survey_ID = ID_data[:, 0]
    field_ID = ID_data[:, 1]
    i_field = ID_data[:, 2]
    j_field = ID_data[:, 3]
    k_field = ID_data[:, 4]
    moc_surveys = survey_ID[-1]

    ID_arr = best_model_array[:, 3]
    index_eq_ID = np.where(ID_arr == 1)
    cat_number = index_eq_ID[0] - 1

    i_fields_to_measure = []
    j_fields_to_measure = []
    k_fields_to_measure = []
    m_min_to_measure = []
    m_max_to_measure = []
    f_occ_to_measure = []

    # choosing the subcatalogs of the best fields.
    best_correlation = np.empty([len(ID_arr), theta_bins])
    std_correlation = np.empty([len(ID_arr), theta_bins])

    #for w in range( len(ID_arr) ):
    for w in range(7):
        index = np.where(survey_ID == int(ID_arr[w]))
        S_ID = survey_ID[index]
        ID_ini = S_ID[0]
        ID_end = int(ID_ini + obs_surveys)

        m_min = best_model_array[w, 0]
        m_max = best_model_array[w, 1]
        f_occ = best_model_array[w, 2]
        print "model:", w, "parameters:", m_min, m_max, f_occ

        i_s = i_field[ID_ini:ID_end]
        j_s = j_field[ID_ini:ID_end]
        k_s = k_field[ID_ini:ID_end]

        corr = np.zeros((len(i_s), theta_bins))
        corr_peebles = np.zeros((len(i_s), theta_bins))
        corr_standard = np.zeros((len(i_s), theta_bins))
        corr_laes = np.zeros(theta_bins)

        if field == "large":
            i_range = 7
            print "large field"
        else:
            i_range = np.size(i_s)
            print "full field"
        print "number of sub-catalogs=", i_range

        for i in range(i_range):
            dmh_filename = dmh_path + "halos_bolshoi_" + str(i_s[i]) + "-" + str(j_s[i]) + "-" + str(k_s[i]) + ".csv"
            halos_prop = np.loadtxt(dmh_filename, delimiter=",", skiprows=12)
            halo_mass = halos_prop[:, 4]
            x_halos = halos_prop[:, 0]
            y_halos = halos_prop[:, 1]
            z_halos = halos_prop[:, 2]

            # select halos in the mass range and occupy a random fraction f_occ with LAEs
            numbers = np.arange(len(halo_mass))
            halo_index = np.where((halo_mass < m_max) & (halo_mass > m_min))
            halo_mass_sel = halo_mass[halo_index]
            halo_index = numbers[halo_index]
            np.random.shuffle(halo_index)
            n_halos = np.size(halo_mass_sel)
            del halo_mass_sel
            n_laes = int(f_occ * n_halos)
            lae_index = halo_index[0:n_laes]
            x_laes = x_halos[lae_index]
            y_laes = y_halos[lae_index]
            del x_halos
            del y_halos
            del z_halos
            del halo_mass

            # random catalog histogram generation
            #P.xlabel(r'$\theta$', fontsize=16)
            #P.ylabel(r"$\xi(\theta)$",fontsize=16)
            if w == 0:
                if i == 0:
                    print w, i
                    print "computing RR (it takes much time but it is only computed once)"
                    x_random = x_width * np.random.random_sample(n_laes * random_cat_number)
                    y_random = y_width * np.random.random_sample(n_laes * random_cat_number)
                    RR, bins = RR_histogram(x_laes, y_laes, x_random, y_random, distance,
                                            theta_min, theta_max, theta_bins,
                                            cat_number=random_cat_number)
                    print RR
            print "subcat number ", i, "i j k=", i_s[i], j_s[i], k_s[i]

            # random-survey histogram generation
            Xmin = x_width * i_s[i]
            Xmax = Xmin + x_width
            Ymin = y_width * j_s[i]
            Ymax = Ymin + y_width
            x_random = Xmin + (Xmax - Xmin) * np.random.random_sample(n_laes)
            y_random = Ymin + (Ymax - Ymin) * np.random.random_sample(n_laes)
            DR, bins = DR_histogram(x_laes, y_laes, x_random, y_random, distance,
                                    theta_min, theta_max, theta_bins, cat_number=1)

            # survey histogram generation
            DD, bins = DD_histogram(x_laes, y_laes, distance, theta_min, theta_max, theta_bins)

            corr[i, :] = landy_correlation(DD, RR, DR)
            print "CORR_landy=", corr[i, :]

        corr_laes = np.mean(corr, axis=0)
        std_corr = np.std(corr, axis=0)
        print "corr_landy=", corr_laes, "std_landy=", std_corr
        best_correlation[w, :] = corr_laes
        std_correlation[w, :] = std_corr

        # save the mean correlation function and its scatter for this model
        dtheta = (theta_max - theta_min) / theta_bins
        correlation_data = np.empty((np.size(corr_laes), 3))
        model = '{0}_{1}_{2}'.format(m_min, m_max, f_occ)
        model_name = 'model_{0}_{1}_{2}'.format(m_min, m_max, f_occ)
        filename = pro_path + "data/mock_survey/" + "correlation_best_models/" + survey_type + "_correlation_" + model_name + ".dat"
        angles = np.linspace(theta_min + dtheta / 2.0, theta_max - dtheta / 2.0, theta_bins)
        correlation_data[:, 0] = angles
        correlation_data[:, 1] = best_correlation[w, :]
        correlation_data[:, 2] = std_correlation[w, :]
        np.savetxt(filename, correlation_data)
        #P.errorbar(correlation_data[:,0]+2.0*w, correlation_data[:,1], correlation_data[:,2],label=model,elinewidth=2.0)

    file_plot = pro_path + "data/mock_survey/" + "correlation_best_models/" + survey_type + "_" + field + "_" + "correlation_plots" + ".png"
    #P.legend(shadow=False)
    obs_correlation_file = pro_path + "data/obs/hayashino_whole_SSA22_field.txt"
    obs_correlation = np.loadtxt(obs_correlation_file, skiprows=4)
    #P.ylim(ymax=0.6)
    #P.xlim(xmax=1040)
    #P.errorbar(obs_correlation[0:theta_bins,0]-3.0, obs_correlation[0:theta_bins,1], obs_correlation[0:theta_bins,2],label="Hayashino et al 2004",elinewidth=3.0,fmt="o-")
    #P.legend(shadow=False)
    #P.title(survey_type)
    #P.savefig(file_plot)
    #P.figure()

    return best_correlation, std_correlation, angles
run_tlc = partial(graphlib.run_variational_em,
                  e_step_func=tlc_e_step,
                  m_step_func=tlc_m_step,
                  global_elbo_func=tlc_global_elbo,
                  print_func=tlc_print_func)

if __name__ == '__main__':
    dirname = 'synthtlc'
    dirname = 'synthbig'

    # use my tlc synthetically generated dataset
    documents = topiclib.read_sparse(dirname + '/documents.dat')
    comments = topiclib.read_sparse(dirname + '/comments.dat')
    labeled_documents = topiclib.read_sparse(dirname + '/labeled.dat')
    background = topiclib.read_sparse(dirname + '/background.dat')
    y = np.loadtxt(dirname + '/yL.npy')
    real_data = (documents, comments, labeled_documents, background, y)

    var = TLCVars(real_data, Ku=29, Ks=5, Kb=24)

    try:
        output = run_tlc(var)
    except Exception, e:
        print e
        import pdb; pdb.post_mortem()
             [(5, 2), (6, 1), (8, 1), (9, 1), ],
         ],
        [1.7, 2.0, 1.2, 4.8, 5, 4.2, ])
    #var = SupervisedLDAVars(test_data, K=3)
    #var = SupervisedLDAVars(noisy_test_data, K=3)

    # use my big generated dataset
    labeled_documents = topiclib.read_sparse('synthtlc/labeled.dat')
    y = np.loadtxt('synthtlc/yL.npy')
    real_data = (labeled_documents, y)

    var = SupervisedLDAVars(real_data, K=13)

    try:
        output = run_slda(var)
    except Exception, e:
        print e
        import pdb; pdb.post_mortem()