def simqso_predict_dr7dr12():
    """Predict DR7DR12Q redshifts with a forest trained on simulated quasars.

    The simulated catalog (label column ``z``) is the training set and the
    DR7DR12Q catalog, restricted to ``0.3 < Z_VI < 5.5``, is the test set.
    The frame returned by ``rf.rf_reg_predict`` is compared through its
    ``rf_photoz`` column against ``Z_VI`` (metrics plus two plots).
    """
    # ----------------------------------------------------------------------
    # Load both catalogs and prepare the feature matrix
    # ----------------------------------------------------------------------
    observed = pd.read_hdf(
        '../class_photoz/data/DR7DR12Q_clean_flux_cat.hdf5', 'data')
    simulated = pd.read_hdf(
        '../class_photoz/data/brightqsos_sim_2k_new.hdf5', 'data')

    # Only SDSS ugriz and WISE W1/W2 are used in this example.
    passband_names = [
        'SDSS_u', 'SDSS_g', 'SDSS_r', 'SDSS_i', 'SDSS_z',
        'WISE_w1', 'WISE_w2',
    ]

    observed = observed.query('0.3 < Z_VI < 5.5')

    # Rename the simulated flux / flux-error columns to the plain passband
    # name and 'sigma_' + passband name before calling the qs helper.
    column_map = {}
    for band in passband_names:
        column_map['obsFlux_' + band] = band
        column_map['obsFluxErr_' + band] = 'sigma_' + band
    simulated.rename(columns=column_map, inplace=True)

    # Infinite values become NaN before the flux ratios are built.
    observed.replace(np.inf, np.nan, inplace=True)
    simulated.replace(np.inf, np.nan, inplace=True)

    # The feature list returned by the helper is ignored; a fixed set of
    # flux-ratio columns is selected below instead.
    observed, _ = qs.prepare_flux_ratio_catalog(observed, passband_names)
    simulated, _ = qs.prepare_flux_ratio_catalog(simulated, passband_names)

    print('%s %s' % (observed.shape, simulated.shape))

    # ----------------------------------------------------------------------
    # Random forest regression: train on simulation, predict observations
    # ----------------------------------------------------------------------
    features = ['ug', 'gr', 'ri', 'iz', 'zw1', 'w1w2']
    label = 'z'
    seed = 1

    params = {
        'n_estimators': 300,
        'max_depth': 30,
        'min_samples_split': 4,
        'n_jobs': 2,
        'random_state': seed
    }

    observed = rf.rf_reg_predict(simulated, observed, features, label,
                                 params, 'rf_photoz')

    # Compare the predicted redshift with the catalog redshift Z_VI.
    z_true = observed['Z_VI']
    z_pred = observed['rf_photoz']
    ml_an.evaluate_regression(z_true, z_pred)
    pz_an.plot_redshifts(z_true, z_pred)
    pz_an.plot_error_hist(z_true, z_pred)

    plt.show()
def predict_example():
    """Train an SVR on ``brightqsos_2`` and predict DR7DR14Q redshifts.

    The training label is the ``z`` column of the training catalog. The
    frame returned by ``svr.svm_reg_predict`` is compared through its
    ``svm_photoz`` column against the ``Z`` column of the DR7DR14Q catalog.
    """
    # UNRESOLVED ISSUES WITH PREDICTION

    # ----------------------------------------------------------------------
    # Load both catalogs and prepare the feature matrix
    # ----------------------------------------------------------------------
    target = pd.read_hdf('../class_photoz/data/DR7DR14Q_flux_cat.hdf5',
                         'data')
    training = pd.read_hdf('../class_photoz/data/brightqsos_2.hdf5', 'data')

    # Only SDSS ugriz and WISE W1/W2 are used in this example.
    passband_names = [
        'SDSS_u', 'SDSS_g', 'SDSS_r', 'SDSS_i', 'SDSS_z',
        'WISE_w1', 'WISE_w2',
    ]

    # frac=1.0 keeps every row; lower it to try a fraction of the file first.
    training = training.sample(frac=1.0)

    # Rename the training flux / flux-error columns to the plain passband
    # name and 'sigma_' + passband name before calling the qs helper.
    column_map = {}
    for band in passband_names:
        column_map['obsFlux_' + band] = band
        column_map['obsFluxErr_' + band] = 'sigma_' + band
    training.rename(columns=column_map, inplace=True)

    # Infinite values become NaN before the flux ratios are built.
    target.replace(np.inf, np.nan, inplace=True)
    training.replace(np.inf, np.nan, inplace=True)

    # The feature list returned by the helper is ignored; see below.
    target, _ = qs.prepare_flux_ratio_catalog(target, passband_names)
    training, _ = qs.prepare_flux_ratio_catalog(training, passband_names)

    print('%s %s' % (target.shape, training.shape))

    # ----------------------------------------------------------------------
    # Support vector regression: train and predict
    # ----------------------------------------------------------------------
    features = ['SDSS_i', 'WISE_w1', 'ug', 'gr', 'ri', 'iz', 'zw1', 'w1w2']
    label = 'z'

    params = {
        'kernel': 'rbf',
        'C': 1.0,
        'gamma': 0.001,
        'epsilon': 0.2,
        'cache_size': 1200
    }

    target = svr.svm_reg_predict(training, target, features, label,
                                 params, 'svm_photoz')

    z_pred = target['svm_photoz']
    print(z_pred.describe())

    # Compare the predicted redshift with the catalog redshift Z.
    z_true = target['Z']
    ml_an.evaluate_regression(z_true, z_pred)
    pz_an.plot_redshifts(z_true, z_pred)
    pz_an.plot_error_hist(z_true, z_pred)

    plt.show()
def dr7dr12_predict_simqso():
    """Train an SVR on DR7DR12Q and predict the simulated quasar redshifts.

    Mirror image of the simulation-to-observation example: the DR7DR12Q
    catalog (label ``Z_VI``) is the training set and the simulated catalog
    is the test set. Both are cut at an i-band magnitude of 18.5. The
    ``svm_photoz`` column of the returned frame is compared against the
    simulated ``z`` column.
    """
    # ----------------------------------------------------------------------
    # Load both catalogs and prepare the feature matrix
    # ----------------------------------------------------------------------
    observed = pd.read_hdf(
        '../class_photoz/data/DR7DR12Q_clean_flux_cat.hdf5', 'data')
    simulated = pd.read_hdf(
        '../class_photoz/data/brightqsos_sim_2k_new.hdf5', 'data')

    # Only SDSS ugriz and WISE W1/W2 are used in this example.
    passband_names = [
        'SDSS_u', 'SDSS_g', 'SDSS_r', 'SDSS_i', 'SDSS_z',
        'WISE_w1', 'WISE_w2',
    ]

    # Redshift window on the training set, then the same magnitude limit on
    # both sets (the magnitude column is named differently in each catalog).
    observed = observed.query('0.3 < Z_VI < 5.5')
    simulated.query('obsMag_SDSS_i <= 18.5', inplace=True)
    observed.query('SDSS_mag_i <= 18.5', inplace=True)

    # Rename the simulated flux / flux-error columns to the plain passband
    # name and 'sigma_' + passband name before calling the qs helper.
    column_map = {}
    for band in passband_names:
        column_map['obsFlux_' + band] = band
        column_map['obsFluxErr_' + band] = 'sigma_' + band
    simulated.rename(columns=column_map, inplace=True)

    # Infinite values become NaN before the flux ratios are built.
    simulated.replace(np.inf, np.nan, inplace=True)
    observed.replace(np.inf, np.nan, inplace=True)

    # The feature list returned by the helper is ignored; see below.
    simulated, _ = qs.prepare_flux_ratio_catalog(simulated, passband_names)
    observed, _ = qs.prepare_flux_ratio_catalog(observed, passband_names)

    print('%s %s' % (simulated.shape, observed.shape))

    # ----------------------------------------------------------------------
    # Support vector regression: train on observations, predict simulation
    # ----------------------------------------------------------------------
    features = ['SDSS_i', 'WISE_w1', 'ug', 'gr', 'ri', 'iz', 'zw1', 'w1w2']
    label = 'Z_VI'

    params = {
        'kernel': 'rbf',
        'C': 1.0,
        'gamma': 0.001,
        'epsilon': 0.2,
        'cache_size': 1200
    }

    simulated = svr.svm_reg_predict(observed, simulated, features, label,
                                    params, 'svm_photoz')

    # Compare the predicted redshift with the simulated redshift z.
    z_true = simulated['z']
    z_pred = simulated['svm_photoz']
    ml_an.evaluate_regression(z_true, z_pred)
    pz_an.plot_redshifts(z_true, z_pred)
    pz_an.plot_error_hist(z_true, z_pred)

    plt.show()
def grid_search_example():
    """Run a random forest classification grid search on a star/QSO sample.

    A 20% random subset of the DR13 star catalog and of the DR7DR12Q quasar
    catalog is combined with ``qs.build_full_sample`` and classified on the
    ``label`` column using WISE/2MASS magnitudes and flux ratios.
    """
    stars = pd.read_hdf(
        '../class_photoz/data/DR13_stars_clean_flux_cat.hdf5', 'data')
    quasars = pd.read_hdf(
        '../class_photoz/data/DR7DR12Q_clean_flux_cat.hdf5', 'data')

    # SDSS ugriz, 2MASS JHK and WISE W1/W2 (W3/W4 are left out).
    passband_names = [
        'SDSS_u', 'SDSS_g', 'SDSS_r', 'SDSS_i', 'SDSS_z',
        'TMASS_j', 'TMASS_h', 'TMASS_k',
        'WISE_w1', 'WISE_w2',
    ]

    # The feature list returned by the helper is ignored; see below.
    stars, _ = qs.prepare_flux_ratio_catalog(stars, passband_names)
    quasars, _ = qs.prepare_flux_ratio_catalog(quasars, passband_names)

    # Reduce the total set of objects for testing the routines.
    stars = stars.sample(frac=0.2)
    quasars = quasars.sample(frac=0.2)

    # Build a test sample; the last argument (100) is presumably the
    # star-to-QSO ratio -- TODO confirm against qs.build_full_sample.
    sample = qs.build_full_sample(stars, quasars, 100)

    print(sample.label.value_counts())

    # Column to classify on and the features used for it.
    label = 'label'
    features = ['WISE_w1', 'TMASS_j', 'jh', 'hk', 'kw1', 'w1w2']

    param_grid = [{
        'n_estimators': [25, 50, 100, 200],
        'min_samples_split': [2, 3, 4],
        'max_depth': [10, 15, 20]
    }]

    rf_class.rf_class_grid_search(sample, features, label, param_grid)
def test_star_fit():
    """Run ``star_fit_test`` on bright DR13 stars with a known star class.

    Rows whose ``star_class`` equals the string "null" are removed and the
    sample is limited to ``SDSS_mag_i < 17.5`` before the flux ratios are
    prepared with ``sigma=True``.
    """
    # Load the catalog from which the stellar model is made.
    stars = pd.read_hdf(
        '../class_photoz/data/DR13_stars_clean_flux_cat.hdf5', 'data')

    # Drop unclassified stars by index label, then apply the magnitude cut.
    unclassified = stars.query('star_class == "null"').index
    stars.drop(unclassified, inplace=True)
    stars = stars.query('SDSS_mag_i < 17.5')

    # Passbands and other column names for the model file.
    passband_names = [
        'SDSS_u', 'SDSS_g', 'SDSS_r', 'SDSS_i', 'SDSS_z',
        'WISE_w1', 'WISE_w2',
    ]

    # Here the feature list returned by the helper is passed on unchanged.
    stars, features = qs.prepare_flux_ratio_catalog(
        stars, passband_names, sigma=True)

    label = 'star_class'
    params = {
        'binning': 'minimum',
        'bin_param': 100,
        'model_type': 'median'
    }

    star_fit_test(stars, features, label, params,
                  rand_state=1, save_data=True, save_name='test')
def test_photoz():
    """Run ``photoz_fit_test`` on bright DR7DR12Q quasars.

    The catalog is limited to ``0 < Z_VI < 10`` and ``SDSS_mag_i < 17.5``
    before the flux ratios are prepared with ``sigma=True``. ``Z_VI`` is
    the label column handed to the fit.
    """
    # Load the catalog from which the quasar model is made.
    quasars = pd.read_hdf(
        '../class_photoz/data/DR7DR12Q_clean_flux_cat.hdf5', 'data')

    # Redshift window first, then the magnitude cut.
    for cut in ('0 < Z_VI < 10', 'SDSS_mag_i < 17.5'):
        quasars = quasars.query(cut)

    # Passbands and other column names for the model file.
    passband_names = [
        'SDSS_u', 'SDSS_g', 'SDSS_r', 'SDSS_i', 'SDSS_z',
        'WISE_w1', 'WISE_w2',
    ]

    # Here the feature list returned by the helper is passed on unchanged.
    quasars, features = qs.prepare_flux_ratio_catalog(
        quasars, passband_names, sigma=True)

    label = 'Z_VI'
    params = {
        'binning': 'minimum',
        'bin_param': 100,
        'model_type': 'median'
    }

    photoz_fit_test(quasars, features, label, params,
                    rand_state=1, save_data=True, save_name='test2')
def simqso_test():
    """Run the random forest regression example on the simulated quasars.

    Uses SDSS ugriz and WISE W1/W2 from ``brightqsos_sim_2k_new`` with the
    simulated redshift ``z`` as label and hands the frame to
    ``rf.rf_reg_example`` with ``save_filename='rf_sim_sdssw1w2'``.
    """
    # ----------------------------------------------------------------------
    # Load the catalog and prepare the feature matrix
    # ----------------------------------------------------------------------
    simulated = pd.read_hdf(
        '../class_photoz/data/brightqsos_sim_2k_new.hdf5', 'data')

    passband_names = [
        'SDSS_u', 'SDSS_g', 'SDSS_r', 'SDSS_i', 'SDSS_z',
        'WISE_w1', 'WISE_w2',
    ]

    # Rename the simulated flux / flux-error columns to the plain passband
    # name and 'sigma_' + passband name before calling the qs helper.
    column_map = {}
    for band in passband_names:
        column_map['obsFlux_' + band] = band
        column_map['obsFluxErr_' + band] = 'sigma_' + band
    simulated.rename(columns=column_map, inplace=True)

    # Infinite values become NaN before the flux ratios are built.
    simulated.replace(np.inf, np.nan, inplace=True)

    # The feature list returned by the helper is ignored; see below.
    simulated, _ = qs.prepare_flux_ratio_catalog(simulated, passband_names)

    # ----------------------------------------------------------------------
    # Random forest regression
    # ----------------------------------------------------------------------
    features = ['SDSS_i', 'WISE_w1', 'ug', 'gr', 'ri', 'iz', 'zw1', 'w1w2']
    label = 'z'
    seed = 1

    params = {
        'n_estimators': 300,
        'max_depth': 20,
        'min_samples_split': 4,
        'n_jobs': 2,
        'random_state': seed
    }

    rf.rf_reg_example(simulated, features, label, params, seed,
                      save=True, save_filename='rf_sim_sdssw1w2')
def dr7dr12_test():
    """Run the random forest regression example on the DR7DR12Q catalog.

    The catalog is limited to ``0 < Z_VI < 10``; ``Z_VI`` is the label and
    the frame is handed to ``rf.rf_reg_example`` with
    ``save_filename='rf_sdssw1w2'``.
    """
    # ----------------------------------------------------------------------
    # Load the catalog and prepare the feature matrix
    # ----------------------------------------------------------------------
    quasars = pd.read_hdf(
        '../class_photoz/data/DR7DR12Q_clean_flux_cat.hdf5', 'data')

    passband_names = [
        'SDSS_u', 'SDSS_g', 'SDSS_r', 'SDSS_i', 'SDSS_z',
        'WISE_w1', 'WISE_w2',
    ]

    # Infinite values become NaN, then the redshift window is applied.
    quasars.replace(np.inf, np.nan, inplace=True)
    quasars = quasars.query('0 < Z_VI < 10')

    # The feature list returned by the helper is ignored; see below.
    quasars, _ = qs.prepare_flux_ratio_catalog(quasars, passband_names)

    # ----------------------------------------------------------------------
    # Random forest regression
    # ----------------------------------------------------------------------
    features = ['SDSS_i', 'WISE_w1', 'ug', 'gr', 'ri', 'iz', 'zw1', 'w1w2']
    label = 'Z_VI'
    seed = 1

    params = {
        'n_estimators': 300,
        'max_depth': 20,
        'min_samples_split': 2,
        'n_jobs': 2,
        'random_state': seed
    }

    # Number of quasars that go into the example.
    n_rows = quasars.shape[0]
    print(n_rows)

    rf.rf_reg_example(quasars, features, label, params, seed,
                      save=True, save_filename='rf_sdssw1w2')
def test_example():
    """Run the random forest regression example on bright DR7DR12Q quasars.

    The catalog is limited to ``0.0 < Z_VI < 10`` and ``PSFMAG_I < 18.5``,
    flux ratios are prepared for SDSS ugriz and WISE W1/W2, and the frame
    is handed to ``rf.rf_reg_example`` with ``Z_VI`` as the regression
    label and ``save_filename='test'``.
    """
    # ----------------------------------------------------------------------
    # Preparing the feature matrix
    # ----------------------------------------------------------------------
    df_train = pd.read_hdf(
        '../class_photoz/data/DR7DR12Q_clean_flux_cat.hdf5', 'data')

    passband_names = [
        'SDSS_u', 'SDSS_g', 'SDSS_r', 'SDSS_i', 'SDSS_z',
        'WISE_w1', 'WISE_w2',
    ]

    # Infinite values become NaN before the cuts and the flux ratios.
    df_train.replace(np.inf, np.nan, inplace=True)

    df_train.query('10 > Z_VI > 0.0 and PSFMAG_I < 18.5', inplace=True)

    # The feature list returned by the helper is ignored; see below.
    df_train, _ = qs.prepare_flux_ratio_catalog(df_train, passband_names)

    # ----------------------------------------------------------------------
    # Random Forest Regression
    # ----------------------------------------------------------------------
    features = ['SDSS_i', 'WISE_w1', 'ug', 'gr', 'ri', 'iz', 'zw1', 'w1w2']

    # The DR7DR12Q redshift column is 'Z_VI' (it is the column filtered on
    # above). 'z' is the column name used with the simulated catalogs and
    # was left over from that configuration.
    label = 'Z_VI'
    rand_state = 1

    params = {
        'n_estimators': 200,
        'max_depth': 25,
        'min_samples_split': 2,
        'n_jobs': 2,
        'random_state': rand_state,
    }

    rf.rf_reg_example(df_train, features, label, params, rand_state,
                      save=True, save_filename='test')
def dr7dr12_test():
    """Run the support vector regression example on the DR7DR12Q catalog.

    The catalog is limited to ``0 < Z_VI < 10``; ``Z_VI`` is the label and
    the frame is handed to ``svr.svm_reg_example`` with
    ``save_filename='svr_sdssw1w2'``.
    """
    # ----------------------------------------------------------------------
    # Load the catalog and prepare the feature matrix
    # ----------------------------------------------------------------------
    quasars = pd.read_hdf(
        '../class_photoz/data/DR7DR12Q_clean_flux_cat.hdf5', 'data')

    passband_names = [
        'SDSS_u', 'SDSS_g', 'SDSS_r', 'SDSS_i', 'SDSS_z',
        'WISE_w1', 'WISE_w2',
    ]

    # Infinite values become NaN, then the redshift window is applied.
    quasars.replace(np.inf, np.nan, inplace=True)
    quasars = quasars.query('0 < Z_VI < 10')

    # The feature list returned by the helper is ignored; see below.
    quasars, _ = qs.prepare_flux_ratio_catalog(quasars, passband_names)

    # ----------------------------------------------------------------------
    # Support vector regression
    # ----------------------------------------------------------------------
    features = ['SDSS_i', 'WISE_w1', 'ug', 'gr', 'ri', 'iz', 'zw1', 'w1w2']
    label = 'Z_VI'
    seed = 1

    params = {
        'kernel': 'rbf',
        'epsilon': 0.1,
        'C': 10,
        'gamma': 0.1,
        'cache_size': 2000
    }

    svr.svm_reg_example(quasars, features, label, params, seed,
                        save=True, save_filename='svr_sdssw1w2')
def test_example():
    """Run the support vector regression example on 10% of DR7DR12Q.

    A random 10% sample of the catalog is prepared with SDSS ugriz and
    WISE W1/W2 flux ratios and handed to ``svr.svm_reg_example`` with
    ``Z_VI`` as the regression label and ``save_filename='test'``.
    """
    # ----------------------------------------------------------------------
    # Preparing the feature matrix
    # ----------------------------------------------------------------------
    df_train = pd.read_hdf(
        '../class_photoz/data/DR7DR12Q_clean_flux_cat.hdf5', 'data')

    # Try a fraction of the whole datafile first
    df_train = df_train.sample(frac=0.1)

    passband_names = [
        'SDSS_u', 'SDSS_g', 'SDSS_r', 'SDSS_i', 'SDSS_z',
        'WISE_w1', 'WISE_w2',
    ]

    # Infinite values become NaN before the flux ratios are built.
    df_train.replace(np.inf, np.nan, inplace=True)

    # The feature list returned by the helper is ignored; see below.
    df_train, _ = qs.prepare_flux_ratio_catalog(df_train, passband_names)

    # ----------------------------------------------------------------------
    # Support Vector Regression
    # ----------------------------------------------------------------------
    features = ['SDSS_i', 'WISE_w1', 'ug', 'gr', 'ri', 'iz', 'zw1', 'w1w2']

    # The redshift column of the DR7DR12Q catalog is 'Z_VI'; 'z' is the
    # column name used with the simulated catalogs.
    label = 'Z_VI'
    rand_state = 1

    # NOTE(review): 'gamma' is kept from the original parameter set even
    # though the kernel is linear -- confirm whether it is intended.
    params = {
        'kernel': 'linear',
        'C': 1.0,
        'gamma': 0.001,
        'epsilon': 0.2,
        'cache_size': 1200
    }

    svr.svm_reg_example(df_train, features, label, params, rand_state,
                        save=True, save_filename='test')
def simqso_test():
    """Run the support vector regression example on bright simulated quasars.

    Uses SDSS ugriz and WISE W1/W2 from ``brightqsos_sim_2k_new``, cut at
    ``obsMag_SDSS_i <= 18.5``, with the simulated redshift ``z`` as label.
    """
    # ----------------------------------------------------------------------
    # Load the catalog and prepare the feature matrix
    # ----------------------------------------------------------------------
    simulated = pd.read_hdf(
        '../class_photoz/data/brightqsos_sim_2k_new.hdf5', 'data')

    passband_names = [
        'SDSS_u', 'SDSS_g', 'SDSS_r', 'SDSS_i', 'SDSS_z',
        'WISE_w1', 'WISE_w2',
    ]

    # Rename the simulated flux / flux-error columns to the plain passband
    # name and 'sigma_' + passband name before calling the qs helper.
    column_map = {}
    for band in passband_names:
        column_map['obsFlux_' + band] = band
        column_map['obsFluxErr_' + band] = 'sigma_' + band
    simulated.rename(columns=column_map, inplace=True)

    # Infinite values become NaN, then the magnitude limit is applied.
    simulated.replace(np.inf, np.nan, inplace=True)
    simulated.query('obsMag_SDSS_i <= 18.5', inplace=True)

    # The feature list returned by the helper is ignored; see below.
    simulated, _ = qs.prepare_flux_ratio_catalog(simulated, passband_names)

    # ----------------------------------------------------------------------
    # Support vector regression
    # ----------------------------------------------------------------------
    features = ['SDSS_i', 'WISE_w1', 'ug', 'gr', 'ri', 'iz', 'zw1', 'w1w2']
    label = 'z'
    seed = 1

    params = {
        'kernel': 'rbf',
        'epsilon': 0.1,
        'C': 10,
        'gamma': 0.1,
        'cache_size': 1200
    }

    svr.svm_reg_example(simulated, features, label, params, seed)
def grid_search_example():
    """Run a small SVR hyper-parameter grid search on 10% of DR7DR14Q.

    A random 10% sample of the catalog is prepared with SDSS ugriz and
    WISE W1/W2 flux ratios and handed to ``svr.svm_reg_grid_search`` with
    ``Z`` as the regression label and the run name ``'example'``.
    """
    # ----------------------------------------------------------------------
    # Preparing the feature matrix
    # ----------------------------------------------------------------------
    df_train = pd.read_hdf('../class_photoz/data/DR7DR14Q_flux_cat.hdf5',
                           'data')

    passband_names = [
        'SDSS_u', 'SDSS_g', 'SDSS_r', 'SDSS_i', 'SDSS_z',
        'WISE_w1', 'WISE_w2',
    ]

    # Try a fraction of the whole datafile first
    df_train = df_train.sample(frac=0.1)

    # Infinite values become NaN before the flux ratios are built.
    df_train.replace(np.inf, np.nan, inplace=True)

    # The feature list returned by the helper is ignored; see below.
    df_train, _ = qs.prepare_flux_ratio_catalog(df_train, passband_names)

    # ----------------------------------------------------------------------
    # Support Vector Regression Grid Search
    # ----------------------------------------------------------------------
    features = ['SDSS_i', 'WISE_w1', 'ug', 'gr', 'ri', 'iz', 'zw1', 'w1w2']
    label = 'Z'
    rand_state = 1

    param_grid = [{
        'C': [1, 10],
        'gamma': [0.001],
        'kernel': ('rbf', 'linear')
    }]

    # scikit-learn renamed the error scorers to their 'neg_' form in 0.18
    # and later removed the bare 'mean_absolute_error' and
    # 'mean_squared_error' names. The list is presumably forwarded as the
    # scoring argument of the grid search -- TODO confirm in svr.
    scores = ['neg_mean_absolute_error', 'neg_mean_squared_error', 'r2']

    svr.svm_reg_grid_search(df_train, features, label, param_grid,
                            rand_state, scores, 'example')
def grid_search_example():
    """Run a small random forest grid search on 10% of DR7DR14Q.

    A random 10% sample of the catalog is prepared with SDSS ugriz and
    WISE W1/W2 flux ratios and handed to ``rf.rf_reg_grid_search`` with
    ``Z`` as the regression label, ``r2`` as the only score and the run
    name ``'example'``.
    """
    # ----------------------------------------------------------------------
    # Load the catalog and prepare the feature matrix
    # ----------------------------------------------------------------------
    catalog = pd.read_hdf('../class_photoz/data/DR7DR14Q_flux_cat.hdf5',
                          'data')

    # Work on a fraction of the whole data file.
    catalog = catalog.sample(frac=0.1)

    passband_names = [
        'SDSS_u', 'SDSS_g', 'SDSS_r', 'SDSS_i', 'SDSS_z',
        'WISE_w1', 'WISE_w2',
    ]

    # Infinite values become NaN before the flux ratios are built.
    catalog.replace(np.inf, np.nan, inplace=True)

    # The feature list returned by the helper is ignored; see below.
    catalog, _ = qs.prepare_flux_ratio_catalog(catalog, passband_names)

    # ----------------------------------------------------------------------
    # Random forest regression grid search
    # ----------------------------------------------------------------------
    features = ['SDSS_i', 'WISE_w1', 'ug', 'gr', 'ri', 'iz', 'zw1', 'w1w2']
    label = 'Z'
    seed = 1

    param_grid = [{
        'n_estimators': [5, 10],
        'min_samples_split': [2],
        'max_depth': [15]
    }]

    scores = ['r2']

    rf.rf_reg_grid_search(catalog, features, label, param_grid, seed,
                          scores, 'example')
def simqsos_grid_search():
    """Run three random forest classification grid searches (star vs. QSO).

    Stars (DR13) and quasars (DR7DR12Q) brighter than ``SDSS_mag_i = 18.5``
    are labelled and then classified on ``class_label`` with three passband
    sets of increasing size: SDSS only, SDSS + WISE, and
    SDSS + 2MASS + WISE. Each set gets its own call to
    ``rf_class.rf_class_grid_search``.
    """
    # TODO This needs to be adjusted for the simulated QSOS

    # ----------------------------------------------------------------------
    # Read data files and input parameters
    # ----------------------------------------------------------------------
    stars = pd.read_hdf(
        '../class_photoz/data/DR13_stars_clean_flux_cat.hdf5', 'data')
    quasars = pd.read_hdf(
        '../class_photoz/data/DR7DR12Q_clean_flux_cat.hdf5', 'data')

    param_grid = [{
        'n_estimators': [50, 100, 200, 300],
        'min_samples_split': [2, 3, 4],
        'max_depth': [15, 20, 25]
    }]

    seed = 1
    scores = ['f1_weighted']

    # Restrict the data set
    stars.query('SDSS_mag_i <= 18.5', inplace=True)
    quasars.query('SDSS_mag_i <=18.5', inplace=True)

    # Basic two-class labels
    quasars['label'] = 'QSO'
    stars['label'] = 'STAR'

    # More detailed classes
    stars, quasars = create_labels(stars, quasars, 'Z_VI')

    # Choose label: 'label' = 2 classes, 'class_label' = multiple classes
    label = 'class_label'

    # ----------------------------------------------------------------------
    # Passband sets, matching features and run names
    # ----------------------------------------------------------------------
    sdss = ['SDSS_u', 'SDSS_g', 'SDSS_r', 'SDSS_i', 'SDSS_z']
    tmass = ['TMASS_j', 'TMASS_h', 'TMASS_k']
    wise = ['WISE_w1', 'WISE_w2']

    runs = [
        (list(sdss),
         ['SDSS_i', 'ug', 'gr', 'ri', 'iz'],
         'SDSS'),
        (sdss + wise,
         ['SDSS_i', 'WISE_w1', 'ug', 'gr', 'ri', 'iz', 'zw1', 'w1w2'],
         'SDSSW1W2'),
        (sdss + tmass + wise,
         ['SDSS_i', 'WISE_w1', 'TMASS_j', 'ug', 'gr', 'ri', 'iz', 'zj',
          'jh', 'hk', 'kw1', 'w1w2'],
         'SDSSTMASSW1W2'),
    ]

    # ----------------------------------------------------------------------
    # Random forest classification grid search, one per passband set
    # ----------------------------------------------------------------------
    for passband_names, features, run_name in runs:
        # Every run starts from fresh deep copies of the labelled catalogs.
        stars_train = stars.copy(deep=True)
        qsos_train = quasars.copy(deep=True)

        # The feature list returned by the helper is ignored.
        stars_train, _ = qs.prepare_flux_ratio_catalog(
            stars_train, passband_names)
        qsos_train, _ = qs.prepare_flux_ratio_catalog(
            qsos_train, passband_names)

        combined = pd.concat([stars_train, qsos_train])

        rf_class.rf_class_grid_search(combined, features, label,
                                      param_grid, seed, scores, run_name)
def dr7dr12q_grid_search():
    """Build star/QSO train and prediction sets for three passband sets.

    Stars (DR13, with a non-missing ``star_class``) and quasars (DR7DR12Q)
    brighter than ``SDSS_mag_i = 21.5`` are labelled and split with
    ``qs.make_train_pred_set`` for SDSS only, SDSS + WISE, and
    SDSS + 2MASS + WISE. The grid-search call itself is currently disabled
    in every stage, so the parameter grid, scores, label and feature lists
    below are defined but not consumed.
    """
    # ----------------------------------------------------------------------
    # Read data files and input parameters
    # ----------------------------------------------------------------------
    stars = pd.read_hdf(
        '../class_photoz/data/DR13_stars_clean_flux_cat.hdf5', 'data')
    quasars = pd.read_hdf(
        '../class_photoz/data/DR7DR12Q_clean_flux_cat.hdf5', 'data')

    stars.dropna(subset=['star_class'], inplace=True)

    param_grid = [{
        'n_estimators': [100, 200, 300],
        'min_samples_split': [2, 3, 4],
        'max_depth': [15, 20, 25]
    }]

    seed = 1
    scores = ['f1_weighted']

    # Restrict the data set
    stars.query('SDSS_mag_i <= 21.5', inplace=True)
    quasars.query('SDSS_mag_i <= 21.5', inplace=True)

    # Basic two-class labels
    quasars['label'] = 'QSO'
    stars['label'] = 'STAR'

    # More detailed classes
    quasars = qs.create_qso_labels(quasars, 'class_label', 'Z_VI')
    stars = qs.create_star_labels(stars, 'class_label', 'star_class')

    # Choose label: 'label' = 2 classes, 'class_label' = multiple classes
    label = 'class_label'

    def _make_sets(passband_names, *extra_args):
        # Deep-copy both catalogs, add flux ratios, and split them.
        # 0.2 is presumably the prediction-set fraction -- TODO confirm
        # against qs.make_train_pred_set.
        stars_train = stars.copy(deep=True)
        qsos_train = quasars.copy(deep=True)
        stars_train, _ = qs.prepare_flux_ratio_catalog(
            stars_train, passband_names)
        qsos_train, _ = qs.prepare_flux_ratio_catalog(
            qsos_train, passband_names)
        return qs.make_train_pred_set(stars_train, qsos_train, 0.2, seed,
                                      *extra_args)

    sdss = ['SDSS_u', 'SDSS_g', 'SDSS_r', 'SDSS_i', 'SDSS_z']
    tmass = ['TMASS_j', 'TMASS_h', 'TMASS_k']
    wise = ['WISE_w1', 'WISE_w2']

    # NOTE: in every stage the call
    #   rf_class.rf_class_grid_search(df_train, df_pred, features, label,
    #                                 param_grid, rand_state, scores, 'test')
    # is disabled.

    # ----------------------------------------------------------------------
    # Stage 1: SDSS only
    # ----------------------------------------------------------------------
    # NOTE(review): only this stage passes the extra 'i19_5_' argument,
    # although the magnitude cut above is 21.5 -- confirm what it controls.
    df_train, df_pred = _make_sets(list(sdss), 'i19_5_')
    features = ['SDSS_i', 'ug', 'gr', 'ri', 'iz']

    # ----------------------------------------------------------------------
    # Stage 2: SDSS + WISE
    # ----------------------------------------------------------------------
    df_train, df_pred = _make_sets(sdss + wise)
    features = ['SDSS_i', 'WISE_w1', 'ug', 'gr', 'ri', 'iz', 'zw1', 'w1w2']

    print('%s %s' % (df_train.shape, df_pred.shape))

    # ----------------------------------------------------------------------
    # Stage 3: SDSS + 2MASS + WISE
    # ----------------------------------------------------------------------
    df_train, df_pred = _make_sets(sdss + tmass + wise)
    features = ['SDSS_i', 'WISE_w1', 'TMASS_j', 'ug', 'gr', 'ri', 'iz',
                'zj', 'jh', 'hk', 'kw1', 'w1w2']
def simqsos_grid_search():
    """Run five random forest regression grid searches on simulated quasars.

    All runs use the Pan-STARRS grizy and WISE W1/W2 passbands of
    ``brightqsos_sim_2k_new`` with the simulated redshift ``z`` as label.
    They differ in the feature set (with or without the WISE features) and
    in the cuts applied before the flux ratios are prepared: none, an
    i-band magnitude cut, or the magnitude cut plus a ``kw2``/``jk`` colour
    cut. Each run calls ``rf.rf_reg_grid_search`` with its own run name.
    """
    # ----------------------------------------------------------------------
    # Read data file and input parameters
    # ----------------------------------------------------------------------
    catalog = pd.read_hdf(
        '../class_photoz/data/brightqsos_sim_2k_new.hdf5', 'data')

    label = 'z'
    seed = 1

    param_grid = [{
        'n_estimators': [50, 100, 200, 300],
        'min_samples_split': [2, 3, 4],
        'max_depth': [15, 20, 25]
    }]

    scores = ['r2']

    # Infinite values become NaN before the flux ratios are built.
    catalog.replace(np.inf, np.nan, inplace=True)

    # ----------------------------------------------------------------------
    # Passbands, feature sets and run table
    # ----------------------------------------------------------------------
    passband_names = ['PS_g', 'PS_r', 'PS_i', 'PS_z', 'PS_y',
                      'WISE_w1', 'WISE_w2']

    # Rename map from the simulated flux / flux-error columns to the plain
    # passband name and 'sigma_' + passband name.
    column_map = {}
    for band in passband_names:
        column_map['obsFlux_' + band] = band
        column_map['obsFluxErr_' + band] = 'sigma_' + band

    ps_wise_features = ['PS_i', 'WISE_w1', 'gr', 'ri', 'iz', 'zy', 'yw1',
                        'w1w2']
    ps_features = ['PS_i', 'gr', 'ri', 'iz', 'zy']

    # (apply i-band cut, apply colour cut, features, run name)
    runs = [
        (False, False, ps_wise_features, 'simqsos_PS5W1W2'),
        (False, False, ps_features, 'simqsos_PS5a'),
        (True, False, ps_wise_features, 'simqsos_PS5W1W2_icut'),
        (True, False, ps_features, 'simqsos_PS5_icut'),
        (True, True, ps_wise_features, 'simqsos_PS5W1W2_icut_colorcut'),
    ]

    # ----------------------------------------------------------------------
    # Random forest regression grid search, one per run
    # ----------------------------------------------------------------------
    for i_cut, color_cut, features, run_name in runs:
        # Every run starts from a fresh deep copy of the catalog.
        df_train = catalog.copy(deep=True)
        df_train.rename(columns=column_map, inplace=True)

        if i_cut:
            df_train.query('obsMag_SDSS_i <= 18.5', inplace=True)

        if color_cut:
            # Colours built from the simulated magnitudes for the cut below.
            df_train['kw2'] = (df_train.obsMag_TMASS_k
                               - df_train.obsMag_WISE_w2)
            df_train['jk'] = (df_train.obsMag_TMASS_j
                              - df_train.obsMag_TMASS_k)
            df_train.query('kw2 >= -0.501208-0.848*jk', inplace=True)

        # The feature list returned by the helper is ignored. Fresh list
        # copies are passed so that no run can affect another one.
        df_train, _ = qs.prepare_flux_ratio_catalog(df_train,
                                                    list(passband_names))

        rf.rf_reg_grid_search(df_train, list(features), label, param_grid,
                              seed, scores, run_name)
def _simqsos_training_set(df, passband_names, cuts=()):
    """Build one training catalog from the simulated-quasar table.

    Works on a deep copy of ``df``: the ``obsFlux_<band>`` and
    ``obsFluxErr_<band>`` columns are renamed to ``<band>`` and
    ``sigma_<band>``, every expression in ``cuts`` is applied with
    ``DataFrame.query`` and the result is passed through
    ``qs.prepare_flux_ratio_catalog``.

    Parameters
    ----------
    df : pandas.DataFrame
        Simulated quasar catalog; left unmodified.
    passband_names : list of str
        Passbands handed to ``qs.prepare_flux_ratio_catalog``.
    cuts : iterable of str, optional
        Query expressions applied, in order, before the catalog is prepared.

    Returns
    -------
    pandas.DataFrame
        First element of the pair returned by
        ``qs.prepare_flux_ratio_catalog``.
    """
    df_train = df.copy(deep=True)

    renames = {}
    for name in passband_names:
        renames['obsFlux_' + name] = name
        renames['obsFluxErr_' + name] = 'sigma_' + name
    df_train.rename(columns=renames, inplace=True)

    for cut in cuts:
        df_train.query(cut, inplace=True)

    # The feature list returned alongside the catalog is not used; every run
    # selects its own feature columns explicitly.
    df_train, _ = qs.prepare_flux_ratio_catalog(df_train, passband_names)
    return df_train


def simqsos_grid_search():
    """Run the SVR hyper-parameter grid searches on the simulated quasars.

    Reads ``brightqsos_sim_2k_new.hdf5``, replaces ``inf`` by ``NaN`` and
    calls ``svr.svm_reg_grid_search`` once per enabled entry of the run table
    below, with redshift column ``'z'`` as the regression label.

    Disabled runs are skipped entirely, so no training catalog is copied and
    prepared for them.
    """
    # ----------------------------------------------------------------------
    # Read data file and input parameters
    # ----------------------------------------------------------------------
    df = pd.read_hdf('../class_photoz/data/brightqsos_sim_2k_new.hdf5', 'data')
    df.replace(np.inf, np.nan, inplace=True)

    label = 'z'
    rand_state = 1
    param_grid = [{'C': [10, 1.0, 0.1],
                   'gamma': [0.01, 0.1, 1.0],
                   'kernel': ['rbf'],
                   'epsilon': [0.1, 0.2, 0.3]}]
    scores = ['r2']

    # ----------------------------------------------------------------------
    # Run table
    # ----------------------------------------------------------------------
    passband_names = ['SDSS_u', 'SDSS_g', 'SDSS_r', 'SDSS_i', 'SDSS_z',
                      'WISE_w1', 'WISE_w2']
    sdss_wise_features = ['SDSS_i', 'WISE_w1',
                          'ug', 'gr', 'ri', 'iz', 'zw1', 'w1w2']
    sdss_features = ['SDSS_i', 'ug', 'gr', 'ri', 'iz']
    i_cut = 'obsMag_SDSS_i <= 18.5'

    # (output title, query cuts, feature columns, enabled)
    # NOTE: an i <= 18.5 sample with the extra colour cut
    # kw2 >= -0.501208 - 0.848 * jk
    # (kw2 = obsMag_TMASS_k - obsMag_WISE_w2,
    #  jk = obsMag_TMASS_j - obsMag_TMASS_k)
    # is currently not searched and therefore has no entry here.
    runs = [
        ('simqsos_SDSS5W1W2', (), sdss_wise_features, True),
        ('simqsos_SDSS5a', (), sdss_features, True),
        ('simqsos_SDSS5W1W2_icut', (i_cut,), sdss_wise_features, False),
        ('simqsos_SDSS5_icut', (i_cut,), sdss_features, False),
    ]

    # ----------------------------------------------------------------------
    # Support Vector Regression Grid Search
    # ----------------------------------------------------------------------
    for title, cuts, features, enabled in runs:
        if not enabled:
            continue
        df_train = _simqsos_training_set(df, passband_names, cuts)
        svr.svm_reg_grid_search(df_train, features, label, param_grid,
                                rand_state, scores, title)
def photofit_full_emp(df_pred):
    """Classify ``df_pred`` with the empirical photometric fitting models.

    A star model is built from the DR13 star catalog and a quasar model from
    the DR7/DR12 quasar catalog, both restricted to ``SDSS_mag_i < 18.5``.
    ``photoz_fit`` and ``star_fit`` are then applied to ``df_pred`` and the
    prediction classes are set with ``pf_an.set_redshift_classes`` and
    ``pf_an.set_pred_classes``.

    Parameters
    ----------
    df_pred : pandas.DataFrame
        Catalog to be fitted; passed through ``photoz_fit`` and ``star_fit``.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``pf_an.set_pred_classes``.
    """
    z_label = 'Z_VI'
    star_label = 'class_label'
    params = {'binning': 'minimum',
              'bin_param': 50,
              'model_type': 'median'}

    # Load the catalog from which to make the star model
    df_stars = pd.read_hdf(
        '../class_photoz/data/DR13_stars_clean_flux_cat.hdf5', 'data')
    df_stars.drop(df_stars.query('star_class == "null"').index, inplace=True)

    # Load the catalog from which to make the quasar model
    df_qsos = pd.read_hdf(
        '../class_photoz/data/DR7DR12Q_clean_flux_cat.hdf5', 'data')
    df_qsos = df_qsos.query('0 <= Z_VI <= 10')
    print(df_qsos.shape)

    # Explicit copies: new columns are assigned to these filtered frames
    # below, which would otherwise be assignments on a slice of the catalog.
    df_qsos = df_qsos.query('SDSS_mag_i < 18.5').copy()
    df_stars = df_stars.query('SDSS_mag_i < 18.5').copy()

    df_stars = qs.create_star_labels(df_stars, star_label, 'star_class')

    # Set binary and multi class columns for evaluation routines
    df_stars['bin_class_true'] = 'STAR'
    df_stars['mult_class_true'] = df_stars[star_label]
    df_qsos['bin_class_true'] = 'QSO'
    df_qsos = pf_an.set_redshift_classes(df_qsos, z_label, 'mult_class_true')

    # Specify passband and other column names for model file
    passband_names = ['SDSS_u', 'SDSS_g', 'SDSS_r', 'SDSS_i', 'SDSS_z',
                      'WISE_w1', 'WISE_w2']

    # NOTE(review): stars are prepared with sigma=True but quasars with
    # sigma=False, whereas sim_test_full_fit uses sigma=True for both.
    # Kept as found - confirm that the asymmetry is intended.
    df_stars, features = qs.prepare_flux_ratio_catalog(
        df_stars, passband_names, sigma=True)
    df_qsos, features = qs.prepare_flux_ratio_catalog(
        df_qsos, passband_names, sigma=False)

    # Same text as the former two-argument print statement.
    print("{0} {1}".format(df_qsos.shape, features))

    df_pred, _qso_prob, _qso_chisq = photoz_fit(
        df_qsos, df_pred, features, z_label, params)
    df_pred, _star_prob, _star_chisq, _star_model = star_fit(
        df_stars, df_pred, features, star_label, params)

    # Classify the test set according to the lowest chi-squared value
    df_pred = pf_an.set_redshift_classes(df_pred, 'pf_photoz', 'pf_qso_class')
    df_pred = pf_an.set_pred_classes(df_pred)

    return df_pred
def test_example():
    """Run the random forest classification example on the empirical data.

    Loads the DR13 star and DR7/DR12 quasar catalogs, prepares them with
    ``qs.prepare_flux_ratio_catalog`` for the SDSS + WISE W1/W2 passbands,
    keeps objects with ``SDSS_mag_i <= 21.5``, attaches multi-class and
    binary labels, splits into training and prediction sets with
    ``qs.make_train_pred_set`` and calls ``rf_class.rf_class_example``.

    Returns
    -------
    None
    """
    rand_state = 1

    df_stars = pd.read_hdf(
        '../class_photoz/data/DR13_stars_clean_flux_cat.hdf5', 'data')
    df_quasars = pd.read_hdf(
        '../class_photoz/data/DR7DR12Q_clean_flux_cat.hdf5', 'data')

    passband_names = ['SDSS_u', 'SDSS_g', 'SDSS_r', 'SDSS_i', 'SDSS_z',
                      'WISE_w1', 'WISE_w2']

    df_stars, features = qs.prepare_flux_ratio_catalog(
        df_stars, passband_names)
    df_quasars, features = qs.prepare_flux_ratio_catalog(
        df_quasars, passband_names)

    # Reduce the total set of objects for testing the routines
    df_stars.query('SDSS_mag_i <= 21.5', inplace=True)
    df_quasars.query('SDSS_mag_i <= 21.5', inplace=True)

    # Same text as the former two-argument print statements, which put a
    # separator space after the label.
    print("Stars:  {0}".format(df_stars.shape))
    print("Quasars:  {0}".format(df_quasars.shape))

    # Create detailed classes.
    # The redshift column of the DR7/DR12 quasar catalog is 'Z_VI' (as used
    # by full_test and the DR7DR12 grid searches on the same file); 'z' is
    # the column of the simulated quasar catalog.
    df_quasars = qs.create_qso_labels(df_quasars, 'mult_class_true', 'Z_VI')
    df_stars = qs.create_star_labels(df_stars, 'mult_class_true',
                                     'star_class')

    # Create binary classes
    df_quasars['bin_class_true'] = 'QSO'
    df_stars['bin_class_true'] = 'STAR'

    # Make test and training set
    df_train, df_pred = qs.make_train_pred_set(df_stars, df_quasars, 0.2,
                                               rand_state=rand_state)

    features = ['SDSS_i', 'WISE_w1', 'ug', 'gr', 'ri', 'iz', 'zw1', 'w1w2']
    label = 'mult_class_true'
    params = {'n_estimators': 300,
              'max_depth': 25,
              'min_samples_split': 3,
              'n_jobs': 2,
              'random_state': 1}

    y_true, y_pred, df_prob = rf_class.rf_class_example(
        df_train, df_pred, features, label, params, rand_state)
def _binary_class_frame(y_true, y_pred):
    """Collect multi-class labels and derive binary QSO/STAR labels.

    Rows whose multi-class label is one of ``vlowz``, ``lowz``, ``midz`` or
    ``highz`` are marked ``'QSO'``; all others ``'STAR'``. The assignment
    uses a boolean row mask, so it stays correct when ``y_true``/``y_pred``
    carry an index with repeated labels.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted multi-class labels.

    Returns
    -------
    pandas.DataFrame
        Columns ``mult_class_true``, ``mult_class_pred``,
        ``bin_class_pred`` and ``bin_class_true``.
    """
    qso_classes = ['vlowz', 'lowz', 'midz', 'highz']

    df = pd.DataFrame({'mult_class_true': y_true,
                       'mult_class_pred': y_pred})

    for kind in ('pred', 'true'):
        is_qso = df['mult_class_' + kind].isin(qso_classes)
        df['bin_class_' + kind] = 'STAR'
        df.loc[is_qso, 'bin_class_' + kind] = 'QSO'

    return df


def full_test():
    """Train and evaluate the random forest star/quasar classifier.

    Loads the DR13 star and DR7/DR12 quasar catalogs, prepares them for the
    SDSS + WISE W1/W2 passbands, keeps objects with ``SDSS_mag_i <= 21.5``,
    labels them, splits them with ``qs.make_train_pred_set`` and runs
    ``rf_class.rf_class_example`` on the SDSS-only feature set. The
    multi-class result is reduced to QSO/STAR, passed to
    ``pf_an.classification_analysis`` and written to ``fitted_classes.hdf5``.

    Returns
    -------
    None
    """
    # ----------------------------------------------------------------------
    # Loading and preparing the data files
    # ----------------------------------------------------------------------
    df_stars = pd.read_hdf(
        '../class_photoz/data/DR13_stars_clean_flux_cat.hdf5', 'data')
    df_quasars = pd.read_hdf(
        '../class_photoz/data/DR7DR12Q_clean_flux_cat.hdf5', 'data')

    passband_names = ['SDSS_u', 'SDSS_g', 'SDSS_r', 'SDSS_i', 'SDSS_z',
                      'WISE_w1', 'WISE_w2']

    # embed this in the sim qso conversion file!
    # Only has an effect when the quasar table carries obsFlux_* columns.
    for name in passband_names:
        df_quasars.rename(columns={'obsFlux_' + name: name}, inplace=True)
        df_quasars.rename(columns={'obsFluxErr_' + name: 'sigma_' + name},
                          inplace=True)

    df_stars, features = qs.prepare_flux_ratio_catalog(
        df_stars, passband_names)
    df_quasars, features = qs.prepare_flux_ratio_catalog(
        df_quasars, passband_names)

    # Introducing selection criteria
    df_stars.query('SDSS_mag_i <= 21.5', inplace=True)
    df_quasars.query('SDSS_mag_i <= 21.5', inplace=True)

    # Same text as the former two-argument print statements.
    print("Stars:  {0}".format(df_stars.shape))
    print("Quasars:  {0}".format(df_quasars.shape))

    # ----------------------------------------------------------------------
    # Preparing test and training sets
    # ----------------------------------------------------------------------
    # Create detailed classes
    df_quasars = qs.create_qso_labels(df_quasars, 'mult_class_true', 'Z_VI')
    df_stars = qs.create_star_labels(df_stars, 'mult_class_true',
                                     'star_class')

    # Create binary classes
    df_quasars['bin_class_true'] = 'QSO'
    df_stars['bin_class_true'] = 'STAR'

    # Make test and training set
    df_train, df_pred = qs.make_train_pred_set(df_stars, df_quasars, 0.2,
                                               rand_state=1)

    print("{0} {1}".format(df_train.shape, df_pred.shape))

    # ----------------------------------------------------------------------
    # Running the Random Forest method
    # ----------------------------------------------------------------------
    features = ['SDSS_i', 'ug', 'gr', 'ri', 'iz']
    label = 'mult_class_true'
    params = {'n_estimators': 300,
              'max_depth': 20,
              'min_samples_split': 2,
              'n_jobs': 2,
              'random_state': 1}
    rand_state = 1

    print(features)
    print(params)

    y_true, y_pred, df_prob = rf_class.rf_class_example(
        df_train, df_pred, features, label, params, rand_state)

    # ----------------------------------------------------------------------
    # Additional analysis
    # ----------------------------------------------------------------------
    df = _binary_class_frame(y_true, y_pred)

    labels = ('QSO', 'STAR')
    pf_an.classification_analysis(df.bin_class_true.values,
                                  df.bin_class_pred.values,
                                  labels)

    df.to_hdf('fitted_classes.hdf5', 'data')
def _dr7dr12_svr_training_set(df, passband_names, cuts=()):
    """Build one training catalog from the DR7/DR12 quasar table.

    Works on a deep copy of ``df``: every expression in ``cuts`` is applied
    with ``DataFrame.query`` and the result is passed through
    ``qs.prepare_flux_ratio_catalog``.

    Parameters
    ----------
    df : pandas.DataFrame
        Quasar catalog; left unmodified.
    passband_names : list of str
        Passbands handed to ``qs.prepare_flux_ratio_catalog``.
    cuts : iterable of str, optional
        Query expressions applied, in order, before the catalog is prepared.

    Returns
    -------
    pandas.DataFrame
        First element of the pair returned by
        ``qs.prepare_flux_ratio_catalog``.
    """
    df_train = df.copy(deep=True)
    for cut in cuts:
        df_train.query(cut, inplace=True)
    # The returned feature list is not used; each run names its own columns.
    df_train, _ = qs.prepare_flux_ratio_catalog(df_train, passband_names)
    return df_train


def DR7DR12_grid_search():
    """Run the SVR hyper-parameter grid searches on the DR7/DR12 quasars.

    Reads ``DR7DR12Q_clean_flux_cat.hdf5``, keeps ``0 < Z_VI < 10``, replaces
    ``inf`` by ``NaN`` and calls ``svr.svm_reg_grid_search`` once per enabled
    entry of the run table below, with ``Z_VI`` as the regression label.

    Disabled runs are skipped entirely, so no training catalog is copied and
    prepared for them.
    """
    # ----------------------------------------------------------------------
    # Read data file and input parameters
    # ----------------------------------------------------------------------
    df = pd.read_hdf('../class_photoz/data/DR7DR12Q_clean_flux_cat.hdf5',
                     'data')
    # Not in-place: the frame returned by query() is a selection of the
    # loaded catalog.
    df = df.query('0 < Z_VI < 10').replace(np.inf, np.nan)

    scores = ['r2']
    label = 'Z_VI'
    rand_state = 1
    param_grid = [{'C': [10, 1.0, 0.1],
                   'gamma': [0.01, 0.1, 1.0],
                   'kernel': ['rbf'],
                   'epsilon': [0.1, 0.2, 0.3]}]

    # ----------------------------------------------------------------------
    # Run table
    # ----------------------------------------------------------------------
    sdss_bands = ['SDSS_u', 'SDSS_g', 'SDSS_r', 'SDSS_i', 'SDSS_z']
    sdss_wise_bands = sdss_bands + ['WISE_w1', 'WISE_w2']
    sdss_features = ['SDSS_i', 'ug', 'gr', 'ri', 'iz']
    sdss_wise_features = ['SDSS_i', 'WISE_w1',
                          'ug', 'gr', 'ri', 'iz', 'zw1', 'w1w2']
    i_cut = 'SDSS_mag_i <= 18.5'

    # (output title, passbands, query cuts, feature columns, enabled)
    # "SDSS5a" runs prepare the catalog with the WISE bands but fit SDSS
    # features only; "SDSS5b" runs prepare it with the SDSS bands alone.
    runs = [
        ('DR7DR12_SDSS5W1W2', sdss_wise_bands, (),
         sdss_wise_features, False),
        ('DR7DR12_SDSS5a', sdss_wise_bands, (),
         sdss_features, False),
        ('DR7DR12_SDSS5b', sdss_bands, (),
         sdss_features, False),
        ('DR7DR12_SDSS5W1W2_icut', sdss_wise_bands, (i_cut,),
         sdss_wise_features, True),
        ('DR7DR12_SDSS5b_icut', sdss_bands, (i_cut,),
         sdss_features, True),
    ]

    # ----------------------------------------------------------------------
    # Support Vector Regression Grid Search
    # ----------------------------------------------------------------------
    for title, passband_names, cuts, features, enabled in runs:
        if not enabled:
            continue
        df_train = _dr7dr12_svr_training_set(df, passband_names, cuts)
        svr.svm_reg_grid_search(df_train, features, label, param_grid,
                                rand_state, scores, title)
def _dr7dr12_rf_training_set(df, passband_names, cuts=()):
    """Build one training catalog from the DR7/DR12 quasar table.

    Works on a deep copy of ``df``: every expression in ``cuts`` is applied
    with ``DataFrame.query`` and the result is passed through
    ``qs.prepare_flux_ratio_catalog``.

    Parameters
    ----------
    df : pandas.DataFrame
        Quasar catalog; left unmodified.
    passband_names : list of str
        Passbands handed to ``qs.prepare_flux_ratio_catalog``.
    cuts : iterable of str, optional
        Query expressions applied, in order, before the catalog is prepared.

    Returns
    -------
    pandas.DataFrame
        First element of the pair returned by
        ``qs.prepare_flux_ratio_catalog``.
    """
    df_train = df.copy(deep=True)
    for cut in cuts:
        df_train.query(cut, inplace=True)
    # The returned feature list is not used; each run names its own columns.
    df_train, _ = qs.prepare_flux_ratio_catalog(df_train, passband_names)
    return df_train


def dr7dr12_grid_search():
    """Run the random forest grid searches on the DR7/DR12 quasars.

    Reads ``DR7DR12Q_clean_flux_cat.hdf5``, keeps ``0 < Z_VI < 10``, replaces
    ``inf`` by ``NaN`` and calls ``rf.rf_reg_grid_search`` once per enabled
    entry of the run table below, with ``Z_VI`` as the regression label.

    Disabled runs are skipped entirely, so no training catalog is copied and
    prepared for them.
    """
    # ----------------------------------------------------------------------
    # Read data file and input parameters
    # ----------------------------------------------------------------------
    df = pd.read_hdf('../class_photoz/data/DR7DR12Q_clean_flux_cat.hdf5',
                     'data')
    # Not in-place: the frame returned by query() is a selection of the
    # loaded catalog.
    df = df.query('0 < Z_VI < 10').replace(np.inf, np.nan)

    scores = ['r2']
    label = 'Z_VI'
    rand_state = 1
    param_grid = [{'n_estimators': [50, 100, 200, 300],
                   'min_samples_split': [2, 3, 4],
                   'max_depth': [15, 20, 25]}]

    # ----------------------------------------------------------------------
    # Run table
    # ----------------------------------------------------------------------
    sdss_bands = ['SDSS_u', 'SDSS_g', 'SDSS_r', 'SDSS_i', 'SDSS_z']
    sdss_wise_bands = sdss_bands + ['WISE_w1', 'WISE_w2']
    sdss_features = ['SDSS_i', 'ug', 'gr', 'ri', 'iz']
    sdss_wise_features = ['SDSS_i', 'WISE_w1',
                          'ug', 'gr', 'ri', 'iz', 'zw1', 'w1w2']
    i_cut = 'SDSS_mag_i <= 18.5'

    # (output title, passbands, query cuts, feature columns, enabled)
    # "SDSS5a" runs prepare the catalog with the WISE bands but fit SDSS
    # features only; "SDSS5b" runs prepare it with the SDSS bands alone.
    runs = [
        ('DR7DR12_SDSS5W1W2', sdss_wise_bands, (),
         sdss_wise_features, False),
        ('DR7DR12_SDSS5a', sdss_wise_bands, (),
         sdss_features, False),
        ('DR7DR12_SDSS5b', sdss_bands, (),
         sdss_features, False),
        ('DR7DR12_SDSS5W1W2_icut', sdss_wise_bands, (i_cut,),
         sdss_wise_features, True),
        ('DR7DR12_SDSS5b_icut', sdss_bands, (i_cut,),
         sdss_features, True),
        ('DR7DR12_SDSS5a_icut', sdss_wise_bands, (i_cut,),
         sdss_features, True),
    ]

    # ----------------------------------------------------------------------
    # Random Forest Regression Grid Search
    # ----------------------------------------------------------------------
    for title, passband_names, cuts, features, enabled in runs:
        if not enabled:
            continue
        df_train = _dr7dr12_rf_training_set(df, passband_names, cuts)
        rf.rf_reg_grid_search(df_train, features, label, param_grid,
                              rand_state, scores, title)
def sim_test_full_fit():
    """Fit star and simulated-quasar models and analyse a held-out test set.

    Stars come from the DR13 star catalog (``SDSS_mag_i < 18.5``, "null"
    star classes dropped), quasars from the simulated catalog
    (``obsMag_SDSS_i <= 18.5``). Both are labelled, prepared with
    ``qs.prepare_flux_ratio_catalog`` and split by
    ``qs.make_train_pred_set``. ``photoz_fit`` and ``star_fit`` are applied
    to the test set, which is then classified, written to
    ``photofit_SDSSW1W2_bin50_sim_i18_5.hdf5`` and handed to
    ``full_analysis_sim``.
    """
    star_catalog = '../class_photoz/data/DR13_stars_clean_flux_cat.hdf5'
    qso_catalog = '../class_photoz/data/brightqsos_sim_2k_new.hdf5'

    # Catalog from which the star model is made
    df_stars = pd.read_hdf(star_catalog, 'data')
    null_rows = df_stars.query('star_class == "null"').index
    df_stars.drop(null_rows, inplace=True)

    # Catalog from which the quasar model is made
    df_qsos = pd.read_hdf(qso_catalog, 'data')

    z_label = 'z'
    star_label = 'class_label'
    rand_state = 1
    params = {'binning': 'minimum', 'bin_param': 50, 'model_type': 'median'}

    # Magnitude limits
    df_qsos.query('obsMag_SDSS_i <= 18.5', inplace=True)
    df_stars = df_stars.query('SDSS_mag_i < 18.5')

    df_stars = qs.create_star_labels(df_stars, star_label, 'star_class')

    # Binary and multi class columns for the evaluation routines
    df_stars['bin_class_true'] = 'STAR'
    df_stars['mult_class_true'] = df_stars[star_label]
    df_qsos['bin_class_true'] = 'QSO'
    df_qsos = pf_an.set_redshift_classes(df_qsos, 'z', 'mult_class_true')

    # Passbands used for the model file
    passband_names = ['SDSS_u', 'SDSS_g', 'SDSS_r', 'SDSS_i', 'SDSS_z',
                      'WISE_w1', 'WISE_w2']

    # embed this in the sim qso conversion file!
    column_map = {}
    for band in passband_names:
        column_map['obsFlux_' + band] = band
        column_map['obsFluxErr_' + band] = 'sigma_' + band
    df_qsos.rename(columns=column_map, inplace=True)

    df_stars, features = qs.prepare_flux_ratio_catalog(
        df_stars, passband_names, sigma=True)
    df_qsos, features = qs.prepare_flux_ratio_catalog(
        df_qsos, passband_names, sigma=True)

    split = qs.make_train_pred_set(df_stars, df_qsos, 0.2, rand_state,
                                   'SDSSW1W2_sim_i18_5_',
                                   concat=False, save=True)
    df_train_stars, df_train_qsos, df_test = split

    print(df_train_stars.mult_class_true.value_counts())
    print(df_train_qsos.mult_class_true.value_counts())
    print(df_test.mult_class_true.value_counts())

    df_test, _qso_prob, _qso_chisq = photoz_fit(
        df_train_qsos, df_test, features, z_label, params)
    df_test, _star_prob, _star_chisq, _star_model = star_fit(
        df_train_stars, df_test, features, star_label, params)

    # Classify the test set according to the lowest chi-squared value
    df_test = pf_an.set_redshift_classes(df_test, 'pf_photoz', 'qso_class')
    df_test = pf_an.set_pred_classes(df_test)

    df_test.to_hdf('photofit_SDSSW1W2_bin50_sim_i18_5.hdf5', 'data')

    full_analysis_sim(df_test)
def rf_full_emp(df_pred):
    """Add empirical random forest photo-z and class predictions to a catalog.

    First a random forest regressor is trained on the DR7/DR12 quasars
    (``0 < Z_VI < 10``, ``SDSS_mag_i <= 18.5``) and applied to ``df_pred``
    through ``rf_reg.rf_reg_predict`` with output name ``'rf_emp_photoz'``.
    Then a random forest classifier is trained on the DR13 stars plus the
    DR7/DR12 quasars (``SDSS_mag_i <= 18.5``) and its predictions are stored
    in ``rf_emp_mult_class_pred`` together with the derived QSO/STAR label
    ``rf_emp_bin_class_pred``.

    Parameters
    ----------
    df_pred : pandas.DataFrame
        Catalog to predict on.

    Returns
    -------
    pandas.DataFrame
        ``df_pred`` as returned by ``rf_reg.rf_reg_predict`` with the two
        classification columns added.
    """
    passband_names = ['SDSS_u', 'SDSS_g', 'SDSS_r', 'SDSS_i', 'SDSS_z',
                      'WISE_w1', 'WISE_w2']
    features = ['SDSS_i', 'WISE_w1', 'ug', 'gr', 'ri', 'iz', 'zw1', 'w1w2']
    rand_state = 1

    # ----------------------------------------------------------------------
    # PHOTOMETRIC REDSHIFT ESTIMATION
    # ----------------------------------------------------------------------
    df_train = pd.read_hdf(
        '../class_photoz/data/DR7DR12Q_clean_flux_cat.hdf5', 'data')
    df_train.replace(np.inf, np.nan, inplace=True)
    df_train = df_train.query('0 < Z_VI < 10')
    df_train = df_train.query('SDSS_mag_i <= 18.5')
    df_train, _ = qs.prepare_flux_ratio_catalog(df_train, passband_names)

    # The regression label was previously read before any assignment; it is
    # the redshift column this training set is selected on.
    z_label = 'Z_VI'
    reg_params = {'n_estimators': 200,
                  'max_depth': 25,
                  'min_samples_split': 2,
                  'n_jobs': 4,
                  'random_state': rand_state}

    df_pred = rf_reg.rf_reg_predict(df_train, df_pred, features, z_label,
                                    reg_params, 'rf_emp_photoz')

    # ----------------------------------------------------------------------
    # QSO-STAR-CLASSIFICATION
    # ----------------------------------------------------------------------
    # Loading and preparing the data files
    df_stars = pd.read_hdf(
        '../class_photoz/data/DR13_stars_clean_flux_cat.hdf5', 'data')
    df_quasars = pd.read_hdf(
        '../class_photoz/data/DR7DR12Q_clean_flux_cat.hdf5', 'data')

    df_stars, _ = qs.prepare_flux_ratio_catalog(df_stars, passband_names)
    df_quasars, _ = qs.prepare_flux_ratio_catalog(df_quasars, passband_names)

    df_stars.query('SDSS_mag_i <= 18.5', inplace=True)
    df_quasars.query('SDSS_mag_i <= 18.5', inplace=True)

    # Same text as the former two-argument print statements.
    print("Stars:  {0}".format(df_stars.shape))
    print("Quasars:  {0}".format(df_quasars.shape))

    # Create detailed classes. The empirical quasar catalog stores its
    # redshift in 'Z_VI' ('z' belongs to the simulated catalog).
    df_quasars = qs.create_qso_labels(df_quasars, 'mult_class_true', z_label)
    df_stars = qs.create_star_labels(df_stars, 'mult_class_true',
                                     'star_class')

    # Create binary classes
    df_quasars['bin_class_true'] = 'QSO'
    df_stars['bin_class_true'] = 'STAR'

    # Concatenate training set
    df_train = pd.concat([df_stars, df_quasars])

    # Running the Random Forest method
    class_label = 'mult_class_true'
    class_params = {'n_estimators': 300,
                    'max_depth': 25,
                    'min_samples_split': 4,
                    'n_jobs': 4,
                    'random_state': 1}

    # NOTE(review): called unqualified as found; other functions here go
    # through the rf_class module - confirm this name is imported.
    clf, y_pred = rf_class_predict(df_train, df_pred, features, class_label,
                                   class_params, rand_state)

    # One column name for both storing and reading the multi-class result.
    df_pred['rf_emp_mult_class_pred'] = y_pred

    # Boolean mask instead of query(...).index, so rows sharing an index
    # label with a quasar are not relabelled by accident.
    qso_classes = ['vlowz', 'lowz', 'midz', 'highz']
    is_qso = df_pred['rf_emp_mult_class_pred'].isin(qso_classes)
    df_pred['rf_emp_bin_class_pred'] = 'STAR'
    df_pred.loc[is_qso, 'rf_emp_bin_class_pred'] = 'QSO'

    return df_pred