def full_test():
    """Run the random forest star/quasar classification on SDSS colours.

    Reads the star and quasar flux catalogues from HDF5, builds the flux
    ratio features, labels both samples, splits them into a training and a
    prediction set and runs ``rf_class.rf_class_example`` on the multi-class
    label.  The multi-class result is then collapsed to a binary QSO/STAR
    label, passed to ``pf_an.classification_analysis`` and written to
    ``fitted_classes.hdf5``.

    Takes no arguments and returns nothing; all input/output paths are
    hard-coded relative to the working directory.
    """
    # ----------------------------------------------------------------------
    # Loading and preparing the data files
    # ----------------------------------------------------------------------
    df_stars = pd.read_hdf(
        '../class_photoz/data/DR13_stars_clean_flux_cat.hdf5', 'data')
    df_quasars = pd.read_hdf(
        '../class_photoz/data/DR7DR12Q_clean_flux_cat.hdf5', 'data')

    # The 2MASS bands (TMASS_j, TMASS_h, TMASS_k) are currently disabled.
    passband_names = [
        'SDSS_u', 'SDSS_g', 'SDSS_r', 'SDSS_i', 'SDSS_z',
        'WISE_w1',
        'WISE_w2',
    ]

    # embed this in the sim qso conversion file!
    # Map 'obsFlux_<band>' / 'obsFluxErr_<band>' columns onto the
    # '<band>' / 'sigma_<band>' naming used for the feature preparation.
    # Columns that do not exist are left untouched by ``rename``.
    flux_renames = {}
    for name in passband_names:
        flux_renames['obsFlux_' + name] = name
        flux_renames['obsFluxErr_' + name] = 'sigma_' + name
    df_quasars.rename(columns=flux_renames, inplace=True)

    # The returned feature list is not used; an explicit list is set below.
    df_stars, _ = qs.prepare_flux_ratio_catalog(df_stars, passband_names)
    df_quasars, _ = qs.prepare_flux_ratio_catalog(df_quasars, passband_names)

    # Introducing selection criteria
    df_stars.query('SDSS_mag_i <= 21.5', inplace=True)
    df_quasars.query('SDSS_mag_i <= 21.5', inplace=True)

    # Single-argument print calls behave the same on Python 2 and 3.
    print("Stars:  {}".format(df_stars.shape))
    print("Quasars:  {}".format(df_quasars.shape))

    # ----------------------------------------------------------------------
    # Preparing test and training sets
    # ----------------------------------------------------------------------
    # Create detailed classes
    df_quasars = qs.create_qso_labels(df_quasars, 'mult_class_true', 'Z_VI')
    df_stars = qs.create_star_labels(df_stars, 'mult_class_true', 'star_class')

    # Create binary classes
    df_quasars['bin_class_true'] = 'QSO'
    df_stars['bin_class_true'] = 'STAR'

    # Make test and training set
    df_train, df_pred = qs.make_train_pred_set(
        df_stars, df_quasars, 0.2, rand_state=1)

    print("{} {}".format(df_train.shape, df_pred.shape))

    # ----------------------------------------------------------------------
    # Running the Random Forest method
    # ----------------------------------------------------------------------
    # Only the SDSS i-band flux and the SDSS colours are used here, even
    # though the WISE bands were prepared above.
    features = ['SDSS_i', 'ug', 'gr', 'ri', 'iz']
    label = 'mult_class_true'
    params = {
        'n_estimators': 300,
        'max_depth': 20,
        'min_samples_split': 2,
        'n_jobs': 2,
        'random_state': 1,
    }
    rand_state = 1

    print(features)
    print(params)

    y_true, y_pred, df_prob = rf_class.rf_class_example(
        df_train, df_pred, features, label, params, rand_state)

    # ----------------------------------------------------------------------
    # Additional analysis
    # ----------------------------------------------------------------------
    df = pd.DataFrame({'mult_class_true': y_true, 'mult_class_pred': y_pred})

    # Every object is a STAR unless its multi-class label is one of the
    # quasar redshift classes.  A positional boolean mask is used so that
    # the assignment stays correct even if the index holds duplicates.
    qso_classes = ['vlowz', 'lowz', 'midz', 'highz']
    df['bin_class_pred'] = 'STAR'
    df['bin_class_true'] = 'STAR'
    df.loc[df['mult_class_pred'].isin(qso_classes).values,
           'bin_class_pred'] = 'QSO'
    df.loc[df['mult_class_true'].isin(qso_classes).values,
           'bin_class_true'] = 'QSO'

    labels = ('QSO', 'STAR')
    y_true = df.bin_class_true.values
    y_pred = df.bin_class_pred.values

    pf_an.classification_analysis(y_true, y_pred, labels)

    df.to_hdf('fitted_classes.hdf5', 'data')
def test_example():
    """Run the random forest classification example with SDSS + WISE features.

    Reads the star and quasar flux catalogues from HDF5, builds the flux
    ratio features, labels both samples, splits them into a training and a
    prediction set and calls ``rf_class.rf_class_example`` on the
    multi-class label.

    Takes no arguments and returns nothing; the input paths are hard-coded
    relative to the working directory.
    """
    df_stars = pd.read_hdf(
        '../class_photoz/data/DR13_stars_clean_flux_cat.hdf5', 'data')
    df_quasars = pd.read_hdf(
        '../class_photoz/data/DR7DR12Q_clean_flux_cat.hdf5', 'data')

    # TMASS_j/h/k and WISE_w3/w4 are currently disabled.
    passband_names = [
        'SDSS_u', 'SDSS_g', 'SDSS_r', 'SDSS_i', 'SDSS_z',
        'WISE_w1',
        'WISE_w2',
    ]

    # The returned feature list is not used; an explicit list is set below.
    df_stars, _ = qs.prepare_flux_ratio_catalog(df_stars, passband_names)
    df_quasars, _ = qs.prepare_flux_ratio_catalog(df_quasars, passband_names)

    # To reduce the total set of objects while testing the routines,
    # subsample here, e.g. df_stars = df_stars.sample(frac=0.2).

    df_stars.query('SDSS_mag_i <= 21.5', inplace=True)
    df_quasars.query('SDSS_mag_i <= 21.5', inplace=True)

    # Single-argument print calls behave the same on Python 2 and 3.
    print("Stars:  {}".format(df_stars.shape))
    print("Quasars:  {}".format(df_quasars.shape))

    # Create detailed classes
    # NOTE(review): other routines in this file label the same quasar
    # catalogue with the 'Z_VI' column instead of 'z' -- confirm which
    # redshift column is intended here.
    df_quasars = qs.create_qso_labels(df_quasars, 'mult_class_true', 'z')
    df_stars = qs.create_star_labels(df_stars, 'mult_class_true', 'star_class')

    # Create binary classes
    df_quasars['bin_class_true'] = 'QSO'
    df_stars['bin_class_true'] = 'STAR'

    # Make test and training set
    df_train, df_pred = qs.make_train_pred_set(
        df_stars, df_quasars, 0.2, rand_state=1)

    features = ['SDSS_i', 'WISE_w1', 'ug', 'gr', 'ri', 'iz', 'zw1', 'w1w2']
    label = 'mult_class_true'
    params = {
        'n_estimators': 300,
        'max_depth': 25,
        'min_samples_split': 3,
        'n_jobs': 2,
        'random_state': 1,
    }
    rand_state = 1

    # The returned (y_true, y_pred, df_prob) are not needed here.
    rf_class.rf_class_example(
        df_train, df_pred, features, label, params, rand_state)
def dr7dr12q_grid_search():
    """Prepare the data sets for the random forest classification grid search.

    Reads the star and quasar flux catalogues from HDF5, labels both
    samples and builds a training/prediction split for three passband
    configurations in turn (SDSS only, SDSS + WISE, SDSS + 2MASS + WISE).

    The ``rf_class.rf_class_grid_search`` calls are currently commented
    out, so as written the function only builds the sets; ``param_grid``,
    ``scores``, ``label`` and ``features`` are kept for when those calls
    are re-enabled.

    Takes no arguments and returns nothing; the input paths are hard-coded
    relative to the working directory.
    """
    # ----------------------------------------------------------------------
    # Read data file and input parameters
    # ----------------------------------------------------------------------
    df_stars = pd.read_hdf(
        '../class_photoz/data/DR13_stars_clean_flux_cat.hdf5', 'data')
    df_quasars = pd.read_hdf(
        '../class_photoz/data/DR7DR12Q_clean_flux_cat.hdf5', 'data')

    df_stars.dropna(subset=['star_class'], inplace=True)

    param_grid = [{
        'n_estimators': [100, 200, 300],
        'min_samples_split': [2, 3, 4],
        'max_depth': [15, 20, 25],
    }]
    rand_state = 1
    scores = ['f1_weighted']

    # Restrict the data set
    df_stars.query('SDSS_mag_i <= 21.5', inplace=True)
    df_quasars.query('SDSS_mag_i <= 21.5', inplace=True)

    # Create basic classes
    df_quasars['label'] = 'QSO'
    df_stars['label'] = 'STAR'

    # Create more detailed classes
    df_quasars = qs.create_qso_labels(df_quasars, 'class_label', 'Z_VI')
    df_stars = qs.create_star_labels(df_stars, 'class_label', 'star_class')

    # FOR TESTING PURPOSES subsample here,
    # e.g. df_stars = df_stars.sample(frac=0.2).

    # Choose label: 'label' = 2 classes, 'class_label' = multiple classes
    label = 'class_label'

    def build_sets(passband_names, *extra_args):
        """Return (df_train, df_pred) for one passband configuration.

        Works on deep copies so every configuration starts from the same
        labelled catalogues.  ``extra_args`` are forwarded positionally to
        ``qs.make_train_pred_set`` after the random state.
        """
        stars = df_stars.copy(deep=True)
        qsos = df_quasars.copy(deep=True)
        stars, _ = qs.prepare_flux_ratio_catalog(stars, passband_names)
        qsos, _ = qs.prepare_flux_ratio_catalog(qsos, passband_names)
        return qs.make_train_pred_set(
            stars, qsos, 0.2, rand_state, *extra_args)

    # ----------------------------------------------------------------------
    # Configuration 1: SDSS bands only
    # ----------------------------------------------------------------------
    passband_names = ['SDSS_u', 'SDSS_g', 'SDSS_r', 'SDSS_i', 'SDSS_z']
    # NOTE(review): only this split passes the extra 'i19_5_' argument;
    # its meaning is defined by qs.make_train_pred_set -- confirm it is
    # still wanted.
    df_train, df_pred = build_sets(passband_names, 'i19_5_')
    features = ['SDSS_i', 'ug', 'gr', 'ri', 'iz']

    # rf_class.rf_class_grid_search(df_train, df_pred, features, label,
    #                               param_grid, rand_state, scores, 'test')

    # ----------------------------------------------------------------------
    # Configuration 2: SDSS + WISE bands
    # ----------------------------------------------------------------------
    passband_names = [
        'SDSS_u', 'SDSS_g', 'SDSS_r', 'SDSS_i', 'SDSS_z',
        'WISE_w1', 'WISE_w2',
    ]
    df_train, df_pred = build_sets(passband_names)
    features = ['SDSS_i', 'WISE_w1', 'ug', 'gr', 'ri', 'iz', 'zw1', 'w1w2']

    # Single-argument print calls behave the same on Python 2 and 3.
    print("{} {}".format(df_train.shape, df_pred.shape))

    # rf_class.rf_class_grid_search(df_train, df_pred, features, label,
    #                               param_grid, rand_state, scores, 'test')

    # ----------------------------------------------------------------------
    # Configuration 3: SDSS + 2MASS + WISE bands
    # ----------------------------------------------------------------------
    passband_names = [
        'SDSS_u', 'SDSS_g', 'SDSS_r', 'SDSS_i', 'SDSS_z',
        'TMASS_j',
        'TMASS_h',
        'TMASS_k',
        'WISE_w1', 'WISE_w2',
    ]
    df_train, df_pred = build_sets(passband_names)
    features = ['SDSS_i', 'WISE_w1', 'TMASS_j', 'ug', 'gr', 'ri', 'iz',
                'zj', 'jh', 'hk', 'kw1', 'w1w2']
def rf_full_emp(df_pred):
    """Add empirical random forest photo-z and star/quasar classes to a catalogue.

    Two steps are run on ``df_pred``:

    1. A random forest regression trained on the quasar catalogue
       (``rf_reg.rf_reg_predict``, output tagged ``'rf_emp_photoz'``).
    2. A random forest multi-class classification trained on the combined
       star and quasar catalogues.  The predicted class is stored in
       ``rf_emp_mult_class_pred`` and collapsed to a binary QSO/STAR label
       in ``rf_emp_bin_class_pred``.

    Parameters
    ----------
    df_pred : pandas.DataFrame
        Catalogue to predict on.  Presumably it must already contain the
        feature columns listed below -- this function does not prepare
        them for ``df_pred``.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``rf_reg.rf_reg_predict`` with the two
        classification columns added.
    """
    # ----------------------------------------------------------------------
    # PHOTOMETRIC REDSHIFT ESTIMATION
    # ----------------------------------------------------------------------
    # Preparing the feature matrix
    df_train = pd.read_hdf(
        '../class_photoz/data/DR7DR12Q_clean_flux_cat.hdf5', 'data')

    # TMASS_j/h/k and WISE_w3 are currently disabled.
    passband_names = [
        'SDSS_u', 'SDSS_g', 'SDSS_r', 'SDSS_i', 'SDSS_z',
        'WISE_w1', 'WISE_w2',
    ]

    df_train.replace(np.inf, np.nan, inplace=True)
    # Chained queries avoid an in-place filter on the result of a query.
    df_train = df_train.query('0 < Z_VI < 10').query('SDSS_mag_i <= 18.5')

    df_train, _ = qs.prepare_flux_ratio_catalog(df_train, passband_names)

    features = ['SDSS_i', 'WISE_w1', 'ug', 'gr', 'ri', 'iz', 'zw1', 'w1w2']
    # NOTE(review): `label` was previously undefined at this point
    # (NameError).  'Z_VI' is the redshift column the training set is
    # filtered on above -- confirm it is the intended regression target.
    label = 'Z_VI'
    rand_state = 1
    params = {
        'n_estimators': 200,
        'max_depth': 25,
        'min_samples_split': 2,
        'n_jobs': 4,
        'random_state': rand_state,
    }

    df_pred = rf_reg.rf_reg_predict(
        df_train, df_pred, features, label, params, 'rf_emp_photoz')

    # ----------------------------------------------------------------------
    # QSO-STAR-CLASSIFICATION
    # ----------------------------------------------------------------------
    # Loading and preparing the data files
    df_stars = pd.read_hdf(
        '../class_photoz/data/DR13_stars_clean_flux_cat.hdf5', 'data')
    df_quasars = pd.read_hdf(
        '../class_photoz/data/DR7DR12Q_clean_flux_cat.hdf5', 'data')

    df_stars, _ = qs.prepare_flux_ratio_catalog(df_stars, passband_names)
    df_quasars, _ = qs.prepare_flux_ratio_catalog(df_quasars, passband_names)

    df_stars.query('SDSS_mag_i <= 18.5', inplace=True)
    df_quasars.query('SDSS_mag_i <= 18.5', inplace=True)

    # Single-argument print calls behave the same on Python 2 and 3.
    print("Stars:  {}".format(df_stars.shape))
    print("Quasars:  {}".format(df_quasars.shape))

    # Create detailed classes
    # NOTE(review): the regression step above uses 'Z_VI' for the same
    # quasar catalogue; confirm that 'z' is the intended column here.
    df_quasars = qs.create_qso_labels(df_quasars, 'mult_class_true', 'z')
    df_stars = qs.create_star_labels(df_stars, 'mult_class_true', 'star_class')

    # Create binary classes
    df_quasars['bin_class_true'] = 'QSO'
    df_stars['bin_class_true'] = 'STAR'

    # Concatenate training set (was `df_star`, an undefined name)
    df_train = pd.concat([df_stars, df_quasars])

    # Running the Random Forest method
    features = ['SDSS_i', 'WISE_w1', 'ug', 'gr', 'ri', 'iz', 'zw1', 'w1w2']
    label = 'mult_class_true'
    params = {
        'n_estimators': 300,
        'max_depth': 25,
        'min_samples_split': 4,
        'n_jobs': 4,
        'random_state': 1,
    }
    rand_state = 1

    # NOTE(review): the call was unqualified (`rf_class_predict`); it is
    # assumed to live in the `rf_class` module like the other
    # classification helpers used in this file -- confirm.
    clf, y_pred = rf_class.rf_class_predict(
        df_train, df_pred, features, label, params, rand_state)

    # The multi-class column must carry the name that the binary
    # collapse below reads; it was previously written as
    # 'rf_emp_mult_label_pred' and then looked up as
    # 'rf_emp_mult_class_pred'.
    df_pred['rf_emp_mult_class_pred'] = y_pred

    # Every object is a STAR unless its multi-class label is one of the
    # quasar redshift classes.  A positional boolean mask is used so that
    # the assignment stays correct even if the index holds duplicates.
    qso_classes = ['vlowz', 'lowz', 'midz', 'highz']
    df_pred['rf_emp_bin_class_pred'] = 'STAR'
    is_qso = df_pred['rf_emp_mult_class_pred'].isin(qso_classes).values
    df_pred.loc[is_qso, 'rf_emp_bin_class_pred'] = 'QSO'

    return df_pred