def autoExecReg(self):
    # Fix random seed for reproducibility:
    np.random.seed(self.seed)
    # ------------------------------------------------------------------------------
    df_orig = pd.read_csv(os.path.join(self.path, self.file),
                          sep=',', decimal='.')

    df_input = df_orig.loc[:, ['10V', '10H', '18V', '18H', '36V', '36H',
                               '89V', '89H', '166V', '166H', '183VH',
                               'sfccode', 'T2m', 'tcwv', 'PCT36', 'PCT89',
                               '89VH', 'lat']]

    colunas = ['10V', '10H', '18V', '18H', '36V', '36H', '89V', '89H',
               '166V', '166H', '183VH', 'sfccode', 'T2m', 'tcwv', 'PCT36',
               'PCT89', '89VH', 'lat']

    scaler = StandardScaler()
    normed_input = scaler.fit_transform(df_input)
    df_normed_input = pd.DataFrame(normed_input, columns=colunas)

    ancillary = df_normed_input.loc[:, ['183VH', 'sfccode', 'T2m', 'tcwv',
                                        'PCT36', 'PCT89', '89VH', 'lat']]
    # regions = df_orig.loc[:, ['R1', 'R2', 'R3', 'R4', 'R5']]
    # ------------------------------------------------------------------------------
    # Choosing the number of components:
    TB1 = df_normed_input.loc[:, ['10V', '10H', '18V', '18H']]
    TB2 = df_normed_input.loc[:, ['36V', '36H', '89V', '89H', '166V', '166H']]
    # ------------------------------------------------------------------------------
    # Verifying the number of components that contribute the most:
    pca = self.PCA
    pca1 = pca.fit(TB1)
    plt.plot(np.cumsum(pca1.explained_variance_ratio_))
    plt.xlabel('Number of components for TB1')
    plt.ylabel('Cumulative explained variance')
    plt.savefig(self.path_fig + self.version + 'PCA_TB1.png')
    # ---
    pca_trans1 = PCA(n_components=2)
    pca1 = pca_trans1.fit(TB1)
    TB1_transformed = pca_trans1.transform(TB1)
    print("original shape: ", TB1.shape)
    print("transformed shape:", TB1_transformed.shape)
    # ------------------------------------------------------------------------------
    pca = PCA()
    pca2 = pca.fit(TB2)
    plt.plot(np.cumsum(pca2.explained_variance_ratio_))
    plt.xlabel('Number of components for TB2')
    plt.ylabel('Cumulative explained variance')
    plt.savefig(self.path_fig + self.version + 'PCA_TB2.png')
    # ---
    pca_trans2 = PCA(n_components=2)
    pca2 = pca_trans2.fit(TB2)
    TB2_transformed = pca_trans2.transform(TB2)
    print("original shape: ", TB2.shape)
    print("transformed shape:", TB2_transformed.shape)
    # ------------------------------------------------------------------------------
    # JOIN THE TREATED VARIABLES INTO A SINGLE DATASET AGAIN:
    PCA1 = pd.DataFrame(TB1_transformed, columns=['pca1_1', 'pca1_2'])
    PCA2 = pd.DataFrame(TB2_transformed, columns=['pca2_1', 'pca2_2'])
    dataset = PCA1.join(PCA2, how='right')
    dataset = dataset.join(ancillary, how='right')
    dataset = dataset.join(df_orig.loc[:, ['sfcprcp']], how='right')
    # ------------------------------------------------------------------------------
    dataset = self.keep_interval(0.2, 60, dataset, 'sfcprcp')
    # ------------------------------------------------------------------------------
    # SUBSET BY SPECIFIC CLASS (UNDERSAMPLING)
    # n = 0.98
    # to_remove = np.random.choice(dataset.index,
    #                              size=int(dataset.shape[0] * n),
    #                              replace=False)
    # dataset = dataset.drop(to_remove)
    # ------------------------------------------------------------------------------
    # Split the data into train and test:
    # Now split the dataset into a training set and a test set.
    # We will use the test set in the final evaluation of our model.
    train_dataset = dataset.sample(frac=0.8, random_state=0)
    test_dataset = dataset.drop(train_dataset.index)
    # ------------------------------------------------------------------------------
    # Inspect the data:
    # Have a quick look at the joint distribution of a few pairs of columns
    # from the training set.
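    # A minimal sketch of such an inspection, assuming seaborn is available
    # (seaborn is not imported or used anywhere else in this file):
    # import seaborn as sns
    # sns.pairplot(train_dataset[['pca1_1', 'pca1_2', 'pca2_1', 'pca2_2']])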
    colunas = list(dataset.columns.values)
    # ------------------------------------------------------------------------------
    # Also look at the overall statistics:
    train_stats = train_dataset.describe()
    train_stats.pop("sfcprcp")
    train_stats = train_stats.transpose()
    # ------------------------------------------------------------------------------
    # Split features from labels:
    # Separate the target value, or "label", from the features.
    # This label is the value that the model will be trained to predict.
    y_train = train_dataset.pop('sfcprcp')
    y_test = test_dataset.pop('sfcprcp')
    # ------------------------------------------------------------------------------
    # Normalize the data:
    scaler = StandardScaler()
    normed_train_data = scaler.fit_transform(train_dataset)
    # Reuse the scaler fitted on the training data for the test data, so that
    # no test-set statistics leak into the normalization:
    normed_test_data = scaler.transform(test_dataset)
    # ------------------------------------------------------------------------------
    # Build the model:
    model = self.build_reg_model(len(train_dataset.keys()))
    # ------------------------------------------------------------------------------
    # Inspect the model:
    # Use the .summary method to print a simple description of the model.
    model.summary()
    # It seems to be working, and it produces a result
    # of the expected shape and type.
    # ------------------------------------------------------------------------------
    # Train the model:
    # Train the model for 1000 epochs, and record the training and validation
    # metrics in the history object.
    # ------------------------------------------------------------------------------
    # Display training progress by printing a single dot for each completed epoch.
    class PrintDot(keras.callbacks.Callback):
        def on_epoch_end(self, epoch, logs=None):
            if epoch % 100 == 0:
                print('')
            print('.', end='')

    EPOCHS = 1000

    history = model.fit(normed_train_data, y_train,
                        epochs=EPOCHS, validation_split=0.2,
                        verbose=0, callbacks=[PrintDot()])
    print(history.history.keys())
    # ------------------------------------------------------------------------------
    # Visualize the model's training progress using the stats
    # stored in the history object.
    hist = pd.DataFrame(history.history)
    hist['epoch'] = history.epoch
    hist.tail()

    self.plot_history(history)
    # ------------------------------------------------------------------------------
    model = self.build_reg_model(len(train_dataset.keys()))

    # The patience parameter is the number of epochs to wait for improvement.
    early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)

    history = model.fit(normed_train_data, y_train,
                        epochs=EPOCHS, validation_split=0.2,
                        verbose=0, callbacks=[early_stop, PrintDot()])
    # ------------------------------------------------------------------------------
    # Plotting again, but with EarlyStopping applied:
    self.plot_history_EarlyStopping(history)
    # The graph shows the typical size of the validation error in sfcprcp units.
    # Is this good? We'll leave that decision up to you.
    # ------------------------------------------------------------------------------
    # Let's see how well the model generalizes by using the test set,
    # which we did not use when training the model.
    # This tells us how well we can expect the model to predict
    # when we use it in the real world.
    loss, mae, mse = model.evaluate(normed_test_data, y_test, verbose=0)
    print("Testing set Mean Abs Error: {:5.2f} sfcprcp".format(mae))
    # ------------------------------------------------------------------------------
    # Make predictions:
    # Finally, predict sfcprcp values using the data in the testing set:
    test_predictions = model.predict(normed_test_data).flatten()

    # Applying meteorological skill scores to verify the performance of the
    # TRAIN/TEST model, in this case continuous scores:
    skills = ContinuousScores()
    val_y_pred_mean, val_y_test_mean, val_mae, val_rmse, val_std, \
        val_fseperc, val_fse, val_corr, val_num_pixels = skills.metrics(
            y_test, test_predictions)

    # Saving the scores to a text file:
    print("converting arrays to text files")
    my_scores = {'val_y_pred_mean': val_y_pred_mean,
                 'val_y_test_mean': val_y_test_mean,
                 'val_mae': val_mae,
                 'val_rmse': val_rmse,
                 'val_std': val_std,
                 'val_fseperc': val_fseperc,
                 'val_fse': val_fse,
                 'val_corr': val_corr,
                 'val_num_pixels': val_num_pixels}

    with open(self.path_fig + 'continuous_scores_TEST_TRAIN_' + self.version +
              '.txt', 'w') as myfile:
        myfile.write(str(my_scores))
    print("Text file saved!")

    # Scatter plot of observed vs. predicted values:
    plt.figure()
    plt.scatter(y_test, test_predictions)
    plt.xlabel('True Values [sfcprcp]')
    plt.ylabel('Predictions [sfcprcp]')
    plt.axis('equal')
    plt.axis('square')
    plt.xlim([0, plt.xlim()[1]])
    plt.ylim([0, plt.ylim()[1]])
    plt.plot([-100, 100], [-100, 100])
    fig_name = self.fig_title + "_plot_scatter_y_test_vs_y_pred.png"
    plt.savefig(self.path_fig + fig_name)
    plt.clf()
    # ------------------------------------------------------------------------------
    # Same scatter plot on logarithmic axes:
    ax = plt.gca()
    ax.plot(y_test, test_predictions, 'o', c='blue', alpha=0.07,
            markeredgecolor='none')
    ax.set_yscale('log')
    ax.set_xscale('log')
    ax.set_xlabel('True Values [sfcprcp]')
    ax.set_ylabel('Predictions [sfcprcp]')
    plt.plot([-100, 100], [-100, 100])
    fig_name = self.fig_title + "_plot_scatter_LOG_y_test_vs_y_pred.png"
    plt.savefig(self.path_fig + fig_name)
    plt.clf()
    # ------------------------------------------------------------------------------
    # It looks like our model predicts reasonably well.
    # Let's take a look at the error distribution.
    error = test_predictions - y_test
    plt.hist(error, bins=25)
    plt.xlabel("Prediction Error [sfcprcp]")
    plt.ylabel("Count")
    fig_name = self.fig_title + "_prediction_error.png"
    plt.savefig(self.path_fig + fig_name)
    plt.clf()
    # ------------------------------------------------------------------------------
    # 2D HISTOGRAM
    plt.hist2d(y_test, test_predictions,
               cmin=1, bins=(50, 50), cmap=plt.cm.jet,
               range=np.array([(0.2, 110), (0.2, 110)]))
    plt.axis('equal')
    plt.axis('square')
    plt.plot([0, 100], [0, 100], ls="--", c=".3")
    plt.xlim([0, max(y_test)])
    plt.ylim([0, max(y_test)])
    plt.colorbar()
    plt.xlabel("Observed rain rate (mm/h) - Training")
    plt.ylabel("Predicted rain rate (mm/h) - Training")
    fig_name = self.fig_title + "_hist2D.png"
    plt.savefig(self.path_fig + fig_name)
    plt.clf()
    # ------------------------------------------------------------------------------
    # Saving the model architecture to YAML:
    model_yaml = model.to_yaml()
    with open(self.mod_out_pth + self.mod_out_name + '.yaml', 'w') as yaml_file:
        yaml_file.write(model_yaml)
    # Serialize the weights to HDF5:
    model.save_weights(self.mod_out_pth + self.mod_out_name + '.h5')
    print("Saved model to disk")

    # Saving the complete model in HDF5:
    model.save(self.mod_out_pth + self.mod_out_name + '_tf.h5')
def PredictRetrieval(self):
    # ------------------------------------------------------------------------------
    # Load the YAML and create the model:
    yaml_file = open(self.ymlp + 'final_' + self.ymlv + '.yaml', 'r')
    loaded_model_yaml = yaml_file.read()
    yaml_file.close()
    loaded_model = model_from_yaml(loaded_model_yaml)
    # Load the weights into the new model:
    loaded_model.load_weights(self.ymlp + 'final_' + self.ymlv + '.h5')
    print("Loaded model yaml and h5 from disk!")
    # ------------------------------------------------------------------------------
    # Fix random seed for reproducibility:
    np.random.seed(self.seed)
    # ------------------------------------------------------------------------------
    df_orig = pd.read_csv(os.path.join(self.path, self.file),
                          sep=',', decimal='.')

    df_input = df_orig.loc[:, ['10V', '10H', '18V', '18H', '36V', '36H',
                               '89V', '89H', '166V', '166H', '183VH',
                               'sfccode', 'T2m', 'tcwv', 'PCT36']]

    colunas = ['10V', '10H', '18V', '18H', '36V', '36H', '89V', '89H',
               '166V', '166H', '183VH', 'sfccode', 'T2m', 'tcwv', 'PCT36']

    scaler = StandardScaler()
    normed_input = scaler.fit_transform(df_input)
    df_normed_input = pd.DataFrame(normed_input, columns=colunas)

    ancillary = df_normed_input.loc[:, ['183VH', 'sfccode', 'T2m', 'tcwv',
                                        'PCT36']]
    # regions = df_orig.loc[:, ['R1', 'R2', 'R3', 'R4', 'R5']]
    # ------------------------------------------------------------------------------
    # Choosing the number of components:
    TB1 = df_normed_input.loc[:, ['10V', '10H', '18V', '18H']]
    TB2 = df_normed_input.loc[:, ['36V', '36H', '89V', '89H', '166V', '166H']]
    # ------------------------------------------------------------------------------
    # Verifying the number of components that contribute the most:
    pca = self.PCA
    pca1 = pca.fit(TB1)
    TB1_pca = pca1.transform(TB1)
    # plt.plot(np.cumsum(pca1.explained_variance_ratio_))
    # plt.xlabel('Number of components for TB1')
    # plt.ylabel('Cumulative explained variance')
    # plt.savefig(self.path_fig + self.version + 'PCA_TB1.png')
    # ---
    # pca_trans1 = PCA(n_components=2)
    # pca1 = pca_trans1.fit(TB1)
    # TB1_transformed = pca_trans1.transform(TB1)
    # print("original shape: ", TB1.shape)
    # print("transformed shape:", TB1_transformed.shape)
    # ------------------------------------------------------------------------------
    pca2 = pca.fit(TB2)
    TB2_pca = pca2.transform(TB2)
    # plt.plot(np.cumsum(pca2.explained_variance_ratio_))
    # plt.xlabel('Number of components for TB2')
    # plt.ylabel('Cumulative explained variance')
    # plt.savefig(self.path_fig + self.version + 'PCA_TB2.png')
    # ---
    # pca_trans2 = PCA(n_components=2)
    # pca2 = pca_trans2.fit(TB2)
    # TB2_transformed = pca_trans2.transform(TB2)
    # print("original shape: ", TB2.shape)
    # print("transformed shape:", TB2_transformed.shape)
    # ------------------------------------------------------------------------------
    # JOIN THE TREATED VARIABLES INTO A SINGLE DATASET AGAIN:
    PCA1 = pd.DataFrame(TB1_pca,
                        columns=['pca1_1', 'pca1_2', 'pca1_3', 'pca1_4'])
    PCA2 = pd.DataFrame(TB2_pca,
                        columns=['pca2_1', 'pca2_2', 'pca2_3',
                                 'pca2_4', 'pca2_5', 'pca2_6'])
    dataset = PCA1.join(PCA2, how='right')
    dataset = dataset.join(ancillary, how='right')
    dataset = dataset.join(df_orig.loc[:, ['sfcprcp']], how='right')
    # ------------------------------------------------------------------------------
    # dataset = self.keep_interval(0.2, 60, dataset, 'sfcprcp')
    # ------------------------------------------------------------------------------
    # dataset = self.keep_interval(0.2, 60, dataset,
    #                             'sfcprcp')
    # NaN_pixels = np.where((dataset['sfcprcp'] != -9999.0))
    # dataset = dataset.iloc[NaN_pixels]
    dataset = dataset.dropna()

    # Keep only the pixels flagged by the screening algorithm (SCR01 == 1):
    SCR_pixels = np.where((df_orig['SCR01'] == 1))
    dataset = dataset.iloc[SCR_pixels]
    dataset_index = dataset.index.values
    # SCR = dataset.pop('SCR01')

    y_true = dataset.pop('sfcprcp')
    # SCR = dataset.pop('SCR01')
    x_normed = dataset.values
    y_pred = loaded_model.predict(x_normed).flatten()
    # ------------------------------------------------------------------------------
    # Applying meteorological skill scores to verify the performance of the
    # model, in this case continuous scores:
    skills = ContinuousScores()
    val_y_pred_mean, val_y_true_mean, val_mae, val_rmse, val_std, \
        val_fseperc, val_fse, val_corr, val_num_pixels = skills.metrics(
            y_true, y_pred)

    # Saving the scores to a text file:
    print("converting arrays to text files")
    my_scores = {'val_y_pred_mean': val_y_pred_mean,
                 'val_y_true_mean': val_y_true_mean,
                 'val_mae': val_mae,
                 'val_rmse': val_rmse,
                 'val_std': val_std,
                 'val_fseperc': val_fseperc,
                 'val_fse': val_fse,
                 'val_corr': val_corr,
                 'val_num_pixels': val_num_pixels}

    with open(self.path + 'continuous_scores_SCR01_' + self.tver + '_' +
              self.ymlv + '.txt', 'w') as myfile:
        myfile.write(str(my_scores))
    print("Text file saved!")
    # ------------------------------------------------------------------------------
    df_final = df_orig.iloc[dataset_index]
    df_final['y_true'] = y_true.values
    df_final['y_pred'] = y_pred

    # filename = self.file[21:58]
    filename = 'retrieval_SCR01_' + self.tver + '_' + self.ymlv + '.csv'
    df_final.to_csv(os.path.join(self.path, filename),
                    index=False, sep=",", decimal='.')

    return df_final
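# ------------------------------------------------------------------------------
# Minimal usage sketch (illustrative only): the class name `Retrieval` and its
# constructor arguments are assumptions, not defined in this file.
#
#   run = Retrieval(...)                  # paths, file names, seed, PCA, etc.
#   run.autoExecReg()                     # train, evaluate and save the model
#   df_final = run.PredictRetrieval()     # reload the saved model, run retrieval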