DEFAULT_LOG_LS_WEIGHT = np.log(0.35).astype(np.float32) DEFAULT_LOG_LS_RESID = np.log(0.1).astype(np.float32) os.makedirs(os.path.join(_SAVE_ADDR_PREFIX, 'base'), exist_ok=True) if not os.path.isdir(_DATA_ADDR_PREFIX): raise ValueError( "Data diretory {} doesn't exist!".format(_DATA_ADDR_PREFIX)) """ 0. prepare training data dictionary """ y_obs_2010 = pd.read_csv("{}/training_data_2010.csv".format(_DATA_ADDR_PREFIX)) X_train = np.asarray(y_obs_2010[["lon", "lat"]].values.tolist()).astype(np.float32) """ 1. prepare prediction data dictionary """ base_valid_feat = dict() base_valid_pred = dict() for model_name in tail_free.get_leaf_model_names(_MODEL_DICTIONARY): data_pd = pd.read_csv("{}/{}_2010_align.csv".format( _DATA_ADDR_PREFIX, model_name)) base_valid_feat[model_name] = np.asarray( data_pd[["lon", "lat"]].values.tolist()).astype(np.float32) base_valid_pred[model_name] = np.asarray(data_pd["pm25"].tolist()).astype( np.float32) X_valid = base_valid_feat[model_name] N_pred = X_valid.shape[0] # standardize X_centr = np.mean(X_valid, axis=0) X_scale = np.max(X_valid, axis=0) - np.min(X_valid, axis=0) X_valid = (X_valid - X_centr) / X_scale
def main(argv): n = argv # n is subset number os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' _DATA_ADDR_PREFIX = "./example/data_monthly" _SAVE_ADDR_PREFIX = "./result_ca_2010_monthly/modified_calibre_2d_annual_pm25_example_ca_20101" _MODEL_DICTIONARY = {"root": ["AV_clean", "GS_clean", 'GM_clean']} family_name = "hmc" + str(n) os.makedirs("{}/{}".format(_SAVE_ADDR_PREFIX, family_name), exist_ok=True) family_tree_dict = _MODEL_DICTIONARY DEFAULT_LOG_LS_WEIGHT = np.log(0.35).astype(np.float32) DEFAULT_LOG_LS_RESID = np.log(0.1).astype(np.float32) """""" """""" """""" """""" """""" """ # 0. Prepare data """ """""" """""" """""" """""" """""" """ 0. prepare training data dictionary """ y_obs_2010 = pd.read_csv( "{}/training_data_clean_2010_fake.csv".format(_DATA_ADDR_PREFIX)) X_train = np.asarray(y_obs_2010[["lon", "lat" ]].values.tolist()).astype(np.float32) base_train_feat = dict() for model_name in tail_free.get_leaf_model_names(_MODEL_DICTIONARY): base_train_feat[model_name] = X_train """ 1. prepare prediction data dictionary """ base_valid_feat = dict() base_valid_pred = dict() for model_name in tail_free.get_leaf_model_names(_MODEL_DICTIONARY): data_pd = pd.read_csv(_DATA_ADDR_PREFIX + "/" + model_name + "_20101_align." + str(n) + ".csv") base_valid_feat[model_name] = np.asarray( data_pd[["lon", "lat"]].values.tolist()).astype(np.float32) base_valid_pred[model_name] = np.asarray( data_pd["pm25"].tolist()).astype(np.float32) X_valid = base_valid_feat[model_name] """ 3. standardize data """ # standardize X_centr = np.mean(X_valid, axis=0) X_scale = np.max(X_valid, axis=0) - np.min(X_valid, axis=0) X_valid = (X_valid - X_centr) / X_scale X_train = (X_train - X_centr) / X_scale """""" """""" """""" """""" """""" """ # 2. Perform Model Prediction """ """""" """""" """""" """""" """""" # load mcmc posterior samples with open( os.path.join( _SAVE_ADDR_PREFIX, '{}/ensemble_posterior_train_parameter_samples_dict.pkl'. format('hmc')), 'rb') as file: parameter_samples_val = pk.load(file) # extract parameters sigma_sample_val = parameter_samples_val["sigma_sample"] resid_sample_val = parameter_samples_val["ensemble_resid_sample"] temp_sample_val = parameter_samples_val["temp_sample"] weight_sample_val = parameter_samples_val["weight_sample"] print(sigma_sample_val.shape) # (5000,) print(resid_sample_val.shape) # (5000, 80) print(temp_sample_val[0].shape) #(5000,) print(weight_sample_val[0].shape) #(5000, 80) # since validation data is very large, perform prediction by data into batch, kf = KFold(n_splits=20) # prepare output container # ensemble_sample_val = np.zeros(shape=(X_valid.shape[0], num_mcmc_steps)) # ensemble_mean_val = np.zeros(shape=(X_valid.shape[0], num_mcmc_steps)) # ensemble_sample_val_mean = np.zeros(shape=(X_valid.shape[0], )) # ensemble_mean_val_mean = np.zeros(shape=(X_valid.shape[0], )) # mean_resid = np.zeros(shape=(X_valid.shape[0], )) # ensemble_sample_val_var = np.zeros(shape=(X_valid.shape[0], )) # ensemble_mean_val_var = np.zeros(shape=(X_valid.shape[0], )) # uncn_resid = np.zeros(shape=(X_valid.shape[0], )) # uncn_noise = np.zeros(shape=(X_valid.shape[0], )) # ensemble_weights_val = np.zeros(shape=(X_valid.shape[0], 3)) # 3 is the number of models # print(ensemble_sample_val_mean.shape) #(20000, 5000) # print(ensemble_mean_val_var.shape) #(20000, 5000) #need to do something here to store all 20k rows #(5000, 20000, 3) #cond_weights_dict_val = np.zeros(shape=(X_valid.shape[0], num_mcmc_steps)) # the above is a dictionary where the keys are the models for fold_id, (_, pred_index) in enumerate(kf.split(X_valid)): print("Running fold {} out of {}".format(fold_id + 1, kf.n_splits)) # prepare X_pred and base_pred_dict for each batch X_pred_fold = X_valid[pred_index] base_pred_dict_fold = { model_name: model_pred_val[pred_index] for (model_name, model_pred_val) in base_valid_pred.items() } # added new returned parameters here # run prediction routine (ensemble_sample_fold, ensemble_mean_fold, ensemble_weights_fold, _, _) = (pred_util.prediction_tailfree( X_pred=X_pred_fold, base_pred_dict=base_pred_dict_fold, X_train=X_train, family_tree=family_tree_dict, weight_sample_list=weight_sample_val, resid_sample=resid_sample_val, temp_sample=temp_sample_val, default_log_ls_weight=DEFAULT_LOG_LS_WEIGHT, default_log_ls_resid=DEFAULT_LOG_LS_RESID, )) # print(ensemble_sample_fold.shape) #(5000, 200) # print(ensemble_mean_fold.shape) #(5000, 200) # print(ensemble_weights_fold.shape) #(5000, 200, 3) t = np.mean(np.exp(2 * sigma_sample_val)) # save to output container # ensemble_sample_val_mean[pred_index] = np.mean(ensemble_sample_fold.T, axis=1) # ensemble_mean_val_mean[pred_index] = np.mean(ensemble_mean_fold.T, axis=1) # mean_resid[pred_index] = np.mean(ensemble_sample_fold.T - ensemble_mean_fold.T, axis=1) # ensemble_sample_val_var[pred_index] = np.var(ensemble_sample_fold.T, axis=1) + t # ensemble_mean_val_var[pred_index] = np.var(ensemble_mean_fold.T, axis=1) # uncn_resid[pred_index]= np.var(ensemble_sample_fold.T - ensemble_mean_fold.T, axis=1) # uncn_noise[pred_index] = t * np.ones(shape=(ensemble_sample_fold.T.shape[0])) ensemble_sample_val_mean_fold = np.mean(ensemble_sample_fold.T, axis=1) ensemble_mean_val_mean_fold = np.mean(ensemble_mean_fold.T, axis=1) mean_resid_fold = np.mean(ensemble_sample_fold.T - ensemble_mean_fold.T, axis=1) ensemble_sample_val_var_fold = np.var(ensemble_sample_fold.T, axis=1) + t ensemble_mean_val_var_fold = np.var(ensemble_mean_fold.T, axis=1) uncn_resid_fold = np.var(ensemble_sample_fold.T - ensemble_mean_fold.T, axis=1) uncn_noise_fold = t * np.ones(shape=(ensemble_sample_fold.T.shape[0])) #model weights # ensemble_weights_val[pred_index, :] = np.mean(ensemble_weights_fold, axis=0) # cond_weights_dict_val[pred_index] = cond_weights_dict_fold.T ensemble_weights_val_fold = np.mean(ensemble_weights_fold, axis=0) with open( os.path.join( _SAVE_ADDR_PREFIX, '{}/ensemble_posterior_sigma_sample.pkl'.format( family_name)), 'ab') as file: pk.dump(sigma_sample_val, file, protocol=pk.HIGHEST_PROTOCOL) with open( os.path.join( _SAVE_ADDR_PREFIX, '{}/ensemble_sample_val_mean.pkl'.format(family_name)), 'ab') as file: pk.dump(ensemble_sample_val_mean_fold, file, protocol=pk.HIGHEST_PROTOCOL) with open( os.path.join( _SAVE_ADDR_PREFIX, '{}/ensemble_mean_val_mean.pkl'.format(family_name)), 'ab') as file: pk.dump(ensemble_mean_val_mean_fold, file, protocol=pk.HIGHEST_PROTOCOL) with open( os.path.join(_SAVE_ADDR_PREFIX, '{}/mean_resid.pkl'.format(family_name)), 'ab') as file: pk.dump(mean_resid_fold, file, protocol=pk.HIGHEST_PROTOCOL) with open( os.path.join( _SAVE_ADDR_PREFIX, '{}/ensemble_sample_val_var.pkl'.format(family_name)), 'ab') as file: pk.dump(ensemble_sample_val_var_fold, file, protocol=pk.HIGHEST_PROTOCOL) with open( os.path.join( _SAVE_ADDR_PREFIX, '{}/ensemble_mean_val_var.pkl'.format(family_name)), 'ab') as file: pk.dump(ensemble_mean_val_var_fold, file, protocol=pk.HIGHEST_PROTOCOL) with open( os.path.join(_SAVE_ADDR_PREFIX, '{}/uncn_resid.pkl'.format(family_name)), 'ab') as file: pk.dump(uncn_resid_fold, file, protocol=pk.HIGHEST_PROTOCOL) with open( os.path.join(_SAVE_ADDR_PREFIX, '{}/uncn_noise.pkl'.format(family_name)), 'ab') as file: pk.dump(uncn_noise_fold, file, protocol=pk.HIGHEST_PROTOCOL) with open( os.path.join( _SAVE_ADDR_PREFIX, '{}/ensemble_weights_val.pkl'.format(family_name)), 'ab') as file: pk.dump(ensemble_weights_val_fold, file, protocol=pk.HIGHEST_PROTOCOL) print("Estimated ls_weight {:.4f}, ls_resid {:.4f}".format( np.exp(DEFAULT_LOG_LS_WEIGHT), np.exp(DEFAULT_LOG_LS_RESID)))