def prediction(img_list, pctls, feat_list_new, data_path, batch, remove_perm):
    for j, img in enumerate(img_list):
        times = []
        accuracy, precision, recall, f1 = [], [], [], []
        preds_path = data_path / batch / 'predictions' / img
        bin_file = preds_path / 'predictions.h5'
        metrics_path = data_path / batch / 'metrics' / 'testing' / img

        try:
            metrics_path.mkdir(parents=True)
        except FileExistsError:
            print('Metrics directory already exists')

        for i, pctl in enumerate(pctls):
            print('Preprocessing', img, pctl, '% cloud cover')
            data_test, data_vector_test, data_ind_test, feat_keep = preprocessing(
                data_path, img, pctl, feat_list_new, test=True)
            perm_index = feat_keep.index('GSW_perm')
            flood_index = feat_keep.index('flooded')
            if remove_perm:
                data_vector_test[data_vector_test[:, perm_index] == 1, flood_index] = 0  # Remove flood water that is perm water
            data_vector_test = np.delete(data_vector_test, perm_index, axis=1)  # Remove GSW_perm column
            data_shape = data_vector_test.shape
            X_test, y_test = data_vector_test[:, 0:data_shape[1] - 1], data_vector_test[:, data_shape[1] - 1]

            print('Predicting for {} at {}% cloud cover'.format(img, pctl))
            start_time = time.time()
            model_path = data_path / batch / 'models' / img / '{}'.format(img + '_clouds_' + str(pctl) + '.sav')
            trained_model = joblib.load(model_path)
            pred_probs = trained_model.predict_proba(X_test)
            preds = np.argmax(pred_probs, axis=1)

            try:
                preds_path.mkdir(parents=True)
            except FileExistsError:
                pass

            with h5py.File(bin_file, 'a') as f:
                if str(pctl) in f:
                    print('Deleting earlier mean predictions')
                    del f[str(pctl)]
                f.create_dataset(str(pctl), data=pred_probs)

            times.append(timer(start_time, time.time(), False))  # Elapsed time for prediction

            print('Evaluating predictions')
            accuracy.append(accuracy_score(y_test, preds))
            precision.append(precision_score(y_test, preds))
            recall.append(recall_score(y_test, preds))
            f1.append(f1_score(y_test, preds))

            del preds, pred_probs, X_test, y_test, trained_model, data_test, data_vector_test, data_ind_test

        metrics = pd.DataFrame(np.column_stack([pctls, accuracy, precision, recall, f1]),
                               columns=['cloud_cover', 'accuracy', 'precision', 'recall', 'f1'])
        metrics.to_csv(metrics_path / 'metrics.csv', index=False)
        times = [float(i) for i in times]  # Convert time objects to float so the CSV values are numeric
        times_df = pd.DataFrame(np.column_stack([pctls, times]), columns=['cloud_cover', 'testing_time'])
        times_df.to_csv(metrics_path / 'testing_times.csv', index=False)
def training_bnn(img_list, pctls, feat_list_new, data_path, batch, **model_params):
    for j, img in enumerate(img_list):
        print(img + ': stacking tif, generating clouds')
        times = []
        tif_stacker(data_path, img, feat_list_new, features=True, overwrite=False)
        cloud_generator(img, data_path, overwrite=False)

        for i, pctl in enumerate(pctls):
            print(img, pctl, '% CLOUD COVER')
            print('Preprocessing')
            tf.keras.backend.clear_session()
            data_train, data_vector_train, data_ind_train, feat_keep = preprocessing(
                data_path, img, pctl, feat_list_new, test=False)
            perm_index = feat_keep.index('GSW_perm')
            flood_index = feat_keep.index('flooded')
            data_vector_train[data_vector_train[:, perm_index] == 1, flood_index] = 0
            data_vector_train = np.delete(data_vector_train, perm_index, axis=1)
            shape = data_vector_train.shape
            X_train, y_train = data_vector_train[:, 0:shape[1] - 1], data_vector_train[:, shape[1] - 1]
            y_train = to_categorical(y_train)
            D = len(set(y_train[:, 0]))  # Target classes

            model_path = data_path / batch / 'models' / img
            metrics_path = data_path / batch / 'metrics' / 'training' / img / '{}'.format(
                img + '_clouds_' + str(pctl))
            try:
                metrics_path.mkdir(parents=True)
                model_path.mkdir(parents=True)
            except FileExistsError:
                pass
            model_path = model_path / '{}'.format(img + '_clouds_' + str(pctl) + '.h5')

            print('Training model')
            start_time = time.time()
            aleatoric_model = get_aleatoric_uncertainty_model(X_train, y_train, **model_params, D=D)
            end_time = time.time()
            times.append(timer(start_time, end_time, False))
            aleatoric_model.save(model_path)

        metrics_path = metrics_path.parent
        times = [float(i) for i in times]
        times = np.column_stack([pctls, times])
        times_df = pd.DataFrame(times, columns=['cloud_cover', 'training_time'])
        times_df.to_csv(metrics_path / 'training_times.csv', index=False)
def log_reg_training(img_list, pctls, feat_list_new, data_path, batch):
    for j, img in enumerate(img_list):
        print(img + ': stacking tif, generating clouds')
        times = []
        tif_stacker(data_path, img, feat_list_new, features=True, overwrite=False)
        cloud_generator(img, data_path, overwrite=False)

        for i, pctl in enumerate(pctls):
            print(img, pctl, '% CLOUD COVER')
            print('Preprocessing')
            data_train, data_vector_train, data_ind_train, feat_keep = preprocessing(
                data_path, img, pctl, feat_list_new, test=False)
            perm_index = feat_keep.index('GSW_perm')
            flood_index = feat_keep.index('flooded')
            # data_vector_train[data_vector_train[:, perm_index] == 1, flood_index] = 0
            data_vector_train = np.delete(data_vector_train, perm_index, axis=1)
            shape = data_vector_train.shape
            X_train, y_train = data_vector_train[:, 0:shape[1] - 1], data_vector_train[:, shape[1] - 1]

            model_path = data_path / batch / 'models' / img
            metrics_path = data_path / batch / 'metrics' / 'training' / img / '{}'.format(
                img + '_clouds_' + str(pctl))
            if not model_path.exists():
                model_path.mkdir(parents=True)
            if not metrics_path.exists():
                metrics_path.mkdir(parents=True)
            model_path = model_path / '{}'.format(img + '_clouds_' + str(pctl) + '.sav')

            print('Training')
            start_time = time.time()
            logreg = LogisticRegression(n_jobs=-1, solver='sag')
            logreg.fit(X_train, y_train)
            end_time = time.time()
            times.append(timer(start_time, end_time, False))
            joblib.dump(logreg, model_path)

        metrics_path = metrics_path.parent
        times = [float(i) for i in times]
        times = np.column_stack([pctls, times])
        times_df = pd.DataFrame(times, columns=['cloud_cover', 'training_time'])
        times_df.to_csv(metrics_path / 'training_times.csv', index=False)
def log_reg_gen_training(img_list_train, feat_list_new, data_path, batch):
    times = []
    print('Preprocessing')
    data_vector_train = preprocessing_gen_model(data_path, img_list_train)
    perm_index = feat_list_new.index('GSW_perm')
    flood_index = feat_list_new.index('flooded')
    gsw_index = feat_list_new.index('GSW_maxExtent')
    # data_vector_train[data_vector_train[:, perm_index] == 1, flood_index] = 0
    data_vector_train = np.delete(data_vector_train, perm_index, axis=1)
    data_vector_train = np.delete(data_vector_train, gsw_index, axis=1)
    shape = data_vector_train.shape
    X_train, y_train = data_vector_train[:, 0:shape[1] - 1], data_vector_train[:, shape[1] - 1]

    model_path = data_path / batch / 'models'
    metrics_path = data_path / batch / 'metrics' / 'training'
    if not model_path.exists():
        model_path.mkdir(parents=True)
    if not metrics_path.exists():
        metrics_path.mkdir(parents=True)

    with open(metrics_path / 'training_images.csv', 'w') as f:
        for listitem in img_list_train:
            f.write('%s\n' % listitem)

    model_path = model_path / 'gen_model.sav'
    print('Training')
    start_time = time.time()
    logreg = LogisticRegression(n_jobs=-1, solver='sag')
    logreg.fit(X_train, y_train)
    end_time = time.time()
    times.append(timer(start_time, end_time, False))
    joblib.dump(logreg, model_path)

    metrics_path = metrics_path.parent
    times = [float(i) for i in times]
    times_df = pd.DataFrame(times, columns=['training_time'])
    times_df.to_csv(metrics_path / 'training_times.csv', index=False)
def NN_training(img_list, pctls, model_func, feat_list_new, data_path, batch, **model_params):
    get_model = model_func
    for j, img in enumerate(img_list):
        print(img + ': stacking tif, generating clouds')
        times = []
        lr_mins = []
        lr_maxes = []
        tif_stacker(data_path, img, feat_list_new, features=True, overwrite=False)
        cloud_generator(img, data_path, overwrite=False)

        for i, pctl in enumerate(pctls):
            print(img, pctl, '% CLOUD COVER')
            print('Preprocessing')
            tf.keras.backend.clear_session()
            data_train, data_vector_train, data_ind_train, feat_keep = preprocessing(
                data_path, img, pctl, feat_list_new, test=False)
            perm_index = feat_keep.index('GSW_perm')
            flood_index = feat_keep.index('flooded')
            # data_vector_train[data_vector_train[:, perm_index] == 1, flood_index] = 0
            data_vector_train = np.delete(data_vector_train, perm_index, axis=1)
            shape = data_vector_train.shape
            X_train, y_train = data_vector_train[:, 0:shape[1] - 1], data_vector_train[:, shape[1] - 1]
            INPUT_DIMS = X_train.shape[1]

            model_path = data_path / batch / 'models' / img
            metrics_path = data_path / batch / 'metrics' / 'training' / img / '{}'.format(
                img + '_clouds_' + str(pctl))
            lr_plots_path = metrics_path.parents[1] / 'lr_plots'
            lr_vals_path = metrics_path.parents[1] / 'lr_vals'
            try:
                metrics_path.mkdir(parents=True)
                model_path.mkdir(parents=True)
                lr_plots_path.mkdir(parents=True)
                lr_vals_path.mkdir(parents=True)
            except FileExistsError:
                pass

            # ---------------------------------------------------------------------------------------------------
            # Determine learning rate by finding max loss decrease during single epoch training
            lrRangeFinder = LrRangeFinder(start_lr=0.1, end_lr=2)
            lr_model_params = {'batch_size': model_params['batch_size'],
                               'epochs': 1,
                               'verbose': 2,
                               'callbacks': [lrRangeFinder],
                               'use_multiprocessing': True}
            model = model_func(INPUT_DIMS)
            print('Finding learning rate')
            model.fit(X_train, y_train, **lr_model_params)
            lr_min, lr_max, lr, losses = lr_plots(lrRangeFinder, lr_plots_path, img, pctl)
            lr_mins.append(lr_min)
            lr_maxes.append(lr_max)

            # ---------------------------------------------------------------------------------------------------
            # Training the model with cyclical learning rate scheduler
            model_path = model_path / '{}'.format(img + '_clouds_' + str(pctl) + '.h5')
            scheduler = SGDRScheduler(min_lr=lr_min, max_lr=lr_max, lr_decay=0.9,
                                      cycle_length=3, mult_factor=1.5)
            callbacks = [tf.keras.callbacks.EarlyStopping(monitor='sparse_categorical_accuracy',
                                                          min_delta=0.0001, patience=10),
                         tf.keras.callbacks.ModelCheckpoint(filepath=str(model_path), monitor='loss',
                                                            save_best_only=True),
                         CSVLogger(metrics_path / 'training_log.log'),
                         scheduler]

            model = get_model(INPUT_DIMS)
            print('Training full model with best LR')
            start_time = time.time()
            model.fit(X_train, y_train, **model_params, callbacks=callbacks)
            end_time = time.time()
            times.append(timer(start_time, end_time, False))
            # model.save(model_path)

        metrics_path = metrics_path.parent
        times = [float(i) for i in times]
        times = np.column_stack([pctls, times])
        times_df = pd.DataFrame(times, columns=['cloud_cover', 'training_time'])
        times_df.to_csv(metrics_path / 'training_times.csv', index=False)

        lr_range = np.column_stack([pctls, lr_mins, lr_maxes])
        lr_avg = np.mean(lr_range[:, 1:2], axis=1)
        lr_range = np.column_stack([lr_range, lr_avg])
        lr_range_df = pd.DataFrame(lr_range, columns=['cloud_cover', 'lr_min', 'lr_max', 'lr_avg'])
        lr_range_df.to_csv((lr_vals_path / img).with_suffix('.csv'), index=False)

        losses_path = lr_vals_path / img / '{}'.format('losses_' + str(pctl) + '.csv')
        try:
            losses_path.parent.mkdir(parents=True)
        except FileExistsError:
            pass
        lr_losses = np.column_stack([lr, losses])
        lr_losses = pd.DataFrame(lr_losses, columns=['lr', 'losses'])
        lr_losses.to_csv(losses_path, index=False)
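# LrRangeFinder, SGDRScheduler, and lr_plots above are project-specific helpers defined elsewhere.
# As a hedged illustration only, the following is a minimal stand-in showing the learning-rate
# range test idea: sweep the LR from start_lr to end_lr over one epoch and record the loss per step.
import tensorflow as tf


class LrRangeFinderSketch(tf.keras.callbacks.Callback):
    """Minimal LR range-test callback (illustrative; not the project's LrRangeFinder)."""

    def __init__(self, start_lr=0.1, end_lr=2.0):
        super().__init__()
        self.start_lr, self.end_lr = start_lr, end_lr
        self.lrs, self.losses = [], []
        self.step = 0

    def on_train_begin(self, logs=None):
        self.total_steps = self.params['steps'] * self.params['epochs']

    def on_train_batch_begin(self, batch, logs=None):
        frac = self.step / max(self.total_steps - 1, 1)
        lr = self.start_lr + (self.end_lr - self.start_lr) * frac  # linear LR sweep
        tf.keras.backend.set_value(self.model.optimizer.lr, lr)
        self.lrs.append(lr)

    def on_train_batch_end(self, batch, logs=None):
        self.losses.append(logs['loss'])  # loss recorded at each LR for later plotting
        self.step += 1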
def log_reg_gen_prediction(img_list, pctls, feat_list_new, data_path, batch):
    for j, img in enumerate(img_list):
        times = []
        accuracy, precision, recall, f1, roc_auc = [], [], [], [], []
        preds_path = data_path / batch / 'predictions' / img
        bin_file = preds_path / 'predictions.h5'
        uncertainties_path = data_path / batch / 'uncertainties' / img
        se_lower_bin_file = uncertainties_path / 'se_lower.h5'
        se_upper_bin_file = uncertainties_path / 'se_upper.h5'
        metrics_path = data_path / batch / 'metrics' / 'testing' / img

        try:
            metrics_path.mkdir(parents=True)
        except FileExistsError:
            print('Metrics directory already exists')

        for i, pctl in enumerate(pctls):
            print('Preprocessing', img, pctl, '% cloud cover')
            data_test, data_vector_test, data_ind_test, feat_keep = preprocessing(
                data_path, img, pctl, feat_list_new, test=True)
            perm_index = feat_keep.index('GSW_perm')
            flood_index = feat_keep.index('flooded')
            data_vector_test[data_vector_test[:, perm_index] == 1, flood_index] = 0
            data_vector_test = np.delete(data_vector_test, perm_index, axis=1)  # Remove GSW_perm column
            data_shape = data_vector_test.shape
            X_test, y_test = data_vector_test[:, 0:data_shape[1] - 1], data_vector_test[:, data_shape[1] - 1]

            print('Predicting for {} at {}% cloud cover'.format(img, pctl))
            start_time = time.time()
            model_path = data_path / batch / 'models' / 'gen_model.sav'
            trained_model = joblib.load(model_path)
            pred_probs = trained_model.predict_proba(X_test)
            preds = np.argmax(pred_probs, axis=1)

            try:
                preds_path.mkdir(parents=True)
            except FileExistsError:
                pass

            with h5py.File(bin_file, 'a') as f:
                if str(pctl) in f:
                    print('Deleting earlier mean predictions')
                    del f[str(pctl)]
                f.create_dataset(str(pctl), data=pred_probs)

            # Compute standard errors
            SE_est = get_se(X_test, y_test, trained_model)
            probs, upper, lower = get_probs(trained_model, X_test, SE_est, z=1.96)  # probs is redundant, predicted above

            try:
                uncertainties_path.mkdir(parents=True)
            except FileExistsError:
                pass

            with h5py.File(se_lower_bin_file, 'a') as f:
                if str(pctl) in f:
                    print('Deleting earlier lower SEs')
                    del f[str(pctl)]
                f.create_dataset(str(pctl), data=lower)

            with h5py.File(se_upper_bin_file, 'a') as f:
                if str(pctl) in f:
                    print('Deleting earlier upper SEs')
                    del f[str(pctl)]
                f.create_dataset(str(pctl), data=upper)

            times.append(timer(start_time, time.time(), False))

            print('Evaluating predictions')
            perm_mask = data_test[:, :, perm_index]
            perm_mask = perm_mask.reshape([perm_mask.shape[0] * perm_mask.shape[1]])
            perm_mask = perm_mask[~np.isnan(perm_mask)]
            preds[perm_mask.astype('bool')] = 0
            y_test[perm_mask.astype('bool')] = 0

            accuracy.append(accuracy_score(y_test, preds))
            precision.append(precision_score(y_test, preds))
            recall.append(recall_score(y_test, preds))
            f1.append(f1_score(y_test, preds))
            roc_auc.append(roc_auc_score(y_test, pred_probs[:, 1]))

            del preds, probs, pred_probs, upper, lower, X_test, y_test, \
                trained_model, data_test, data_vector_test, data_ind_test

        metrics = pd.DataFrame(np.column_stack([pctls, accuracy, precision, recall, f1, roc_auc]),
                               columns=['cloud_cover', 'accuracy', 'precision', 'recall', 'f1', 'auc'])
        metrics.to_csv(metrics_path / 'metrics.csv', index=False)
        times = [float(i) for i in times]  # Convert time objects to float so the CSV values are numeric
        times_df = pd.DataFrame(np.column_stack([pctls, times]), columns=['cloud_cover', 'testing_time'])
        times_df.to_csv(metrics_path / 'testing_times.csv', index=False)
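# get_se and get_probs above are project helpers not shown in this file. As an assumption-labeled
# sketch only, one common way to produce such interval estimates for logistic regression is the
# delta method (Wald-style SEs of predicted probabilities); the project's helpers may differ,
# e.g., they may estimate the coefficient covariance from the training design matrix instead.
import numpy as np


def get_se_sketch(X, model):
    """Delta-method SEs of predicted flood probabilities from a fitted sklearn LogisticRegression."""
    p = model.predict_proba(X)[:, 1]
    X1 = np.hstack([np.ones((X.shape[0], 1)), X])      # design matrix with intercept column
    W = p * (1 - p)                                     # Bernoulli variance terms
    cov = np.linalg.inv(X1.T @ (X1 * W[:, None]))       # approximate covariance of coefficients
    grad = X1 * W[:, None]                              # gradient of p with respect to the coefficients
    return np.sqrt(np.einsum('ij,jk,ik->i', grad, cov, grad))


def get_probs_sketch(model, X, se, z=1.96):
    """Point estimate with symmetric z * SE bounds, clipped to [0, 1]."""
    p = model.predict_proba(X)[:, 1]
    return p, np.clip(p + z * se, 0, 1), np.clip(p - z * se, 0, 1)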
def training2(img_list, pctls, model_func, feat_list_new, data_path, batch,
              DROPOUT_RATE=0, HOLDOUT=0.3, **model_params):
    '''Removes flood water that is permanent water'''
    get_model = model_func
    for j, img in enumerate(img_list):
        times = []
        tif_stacker(data_path, img, feat_list_new, features=True, overwrite=True)
        cloud_generator(img, data_path, overwrite=False)

        for i, pctl in enumerate(pctls):
            data_train, data_vector_train, data_ind_train = preprocessing(data_path, img, pctl, gaps=False)
            perm_index = feat_list_new.index('GSW_perm')
            flood_index = feat_list_new.index('flooded')
            data_vector_train[data_vector_train[:, perm_index] == 1, flood_index] = 0  # Remove flood water that is perm water
            data_vector_train = np.delete(data_vector_train, perm_index, axis=1)  # Remove perm water column
            training_data, validation_data = train_val(data_vector_train, holdout=HOLDOUT)
            X_train, y_train = training_data[:, 0:14], training_data[:, 14]
            X_val, y_val = validation_data[:, 0:14], validation_data[:, 14]
            INPUT_DIMS = X_train.shape[1]

            model_path = data_path / batch / 'models' / img
            metrics_path = data_path / batch / 'metrics' / 'training' / img / '{}'.format(
                img + '_clouds_' + str(pctl))
            try:
                metrics_path.mkdir(parents=True)
                model_path.mkdir(parents=True)
            except FileExistsError:
                pass
            model_path = model_path / '{}'.format(img + '_clouds_' + str(pctl) + '.h5')

            csv_logger = CSVLogger(metrics_path / 'training_log.log')
            model_params['callbacks'].append(csv_logger)

            print('~~~~~', img, pctl, '% CLOUD COVER')
            model = get_model(INPUT_DIMS)
            start_time = time.time()
            model.fit(X_train, y_train, **model_params, validation_data=(X_val, y_val))
            end_time = time.time()
            times.append(timer(start_time, end_time, False))
            model.save(model_path)

        metrics_path = metrics_path.parent
        times = [float(i) for i in times]
        times = np.column_stack([pctls, times])
        times_df = pd.DataFrame(times, columns=['cloud_cover', 'training_time'])
        times_df.to_csv(metrics_path / 'training_times.csv', index=False)
# ======================================================================================================================
img_list = ['4514_LC08_027033_20170826_1']
pctls = [50]

times = []

start_time = time.time()
rf_training(img_list, pctls, feat_list_new, data_path, batch, n_jobs=20)
end_time = time.time()
times.append(timer(start_time, end_time, True))

start_time = time.time()
rf_training(img_list, pctls, feat_list_new, data_path, batch, n_jobs=40)
end_time = time.time()
times.append(timer(start_time, end_time, True))

start_time = time.time()
rf_training(img_list, pctls, feat_list_new, data_path, batch, n_jobs=60)
end_time = time.time()
times.append(timer(start_time, end_time, True))

np.savetxt(data_path / 'v31' / 'times20_40_60_njobs.csv', times, delimiter=",")

# prediction(img_list, pctls, feat_list_new, data_path, batch, remove_perm=True)
def training_BNN_gen_model(img_list_train, feat_list_new, model_func, data_path, batch,
                           dropout_rate, **model_params):
    get_model = model_func
    times = []
    lr_mins = []
    lr_maxes = []

    print('Preprocessing')
    tf.keras.backend.clear_session()
    data_vector_train = preprocessing_gen_model(data_path, img_list_train)
    perm_index = feat_list_new.index('GSW_perm')
    flood_index = feat_list_new.index('flooded')
    print(data_vector_train.shape)
    data_vector_train[data_vector_train[:, perm_index] == 1, flood_index] = 0
    data_vector_train = np.delete(data_vector_train, perm_index, axis=1)
    shape = data_vector_train.shape
    X_train, y_train = data_vector_train[:, 0:shape[1] - 1], data_vector_train[:, shape[1] - 1]
    input_dims = X_train.shape[1]

    model_path = data_path / batch / 'models'
    metrics_path = data_path / batch / 'metrics' / 'training'
    lr_plots_path = metrics_path / 'lr_plots'
    lr_vals_path = metrics_path / 'lr_vals'
    try:
        metrics_path.mkdir(parents=True)
        model_path.mkdir(parents=True)
        lr_plots_path.mkdir(parents=True)
        lr_vals_path.mkdir(parents=True)
    except FileExistsError:
        pass

    # ---------------------------------------------------------------------------------------------------
    # Determine learning rate by finding max loss decrease during single epoch training
    lrRangeFinder = LrRangeFinder(start_lr=0.1, end_lr=2)
    lr_model_params = {'batch_size': model_params['batch_size'],
                       'epochs': 1,
                       'verbose': 2,
                       'callbacks': [lrRangeFinder],
                       'use_multiprocessing': True}
    model = model_func(input_dims, dropout_rate)
    print('Finding learning rate')
    model.fit(X_train, y_train, **lr_model_params)
    lr_min, lr_max, lr, losses = lr_plots(lrRangeFinder, lr_plots_path)
    lr_mins.append(lr_min)
    lr_maxes.append(lr_max)

    # ---------------------------------------------------------------------------------------------------
    # Training the model with cyclical learning rate scheduler
    model_path = model_path / 'gen_model.h5'
    scheduler = SGDRScheduler(min_lr=lr_min, max_lr=lr_max, lr_decay=0.9,
                              cycle_length=3, mult_factor=1.5)
    callbacks = [tf.keras.callbacks.EarlyStopping(monitor='sparse_categorical_accuracy',
                                                  min_delta=0.001, patience=10),
                 tf.keras.callbacks.ModelCheckpoint(filepath=str(model_path), monitor='loss',
                                                    save_best_only=True),
                 CSVLogger(metrics_path / 'training_log.log'),
                 scheduler]

    model = get_model(input_dims, dropout_rate)
    print('Training full model with best LR')
    start_time = time.time()
    model.fit(X_train, y_train, **model_params, callbacks=callbacks)
    end_time = time.time()
    times.append(timer(start_time, end_time, False))

    metrics_path = metrics_path.parent
    times = [float(i) for i in times]
    times_df = pd.DataFrame(times, columns=['training_time'])
    times_df.to_csv(metrics_path / 'training_times.csv', index=False)

    lr_range = np.column_stack([lr_mins, lr_maxes])
    lr_avg = np.mean(lr_range, axis=1)
    lr_range = np.column_stack([lr_range, lr_avg])
    lr_range_df = pd.DataFrame(lr_range, columns=['lr_min', 'lr_max', 'lr_avg'])
    lr_range_df.to_csv(lr_vals_path.with_suffix('.csv'), index=False)

    losses_path = lr_vals_path / 'gen_model_losses.csv'
    try:
        losses_path.parent.mkdir(parents=True)
    except FileExistsError:
        pass
    lr_losses = np.column_stack([lr, losses])
    lr_losses = pd.DataFrame(lr_losses, columns=['lr', 'losses'])
    lr_losses.to_csv(losses_path, index=False)
def prediction_gen_model(img_list, pctls, feat_list_new, data_path, batch, **model_params):
    model_path = data_path / batch / 'models' / 'gen_model.h5'
    for j, img in enumerate(img_list):
        times = []
        accuracy, precision, recall, f1 = [], [], [], []
        preds_path = data_path / batch / 'predictions' / img
        bin_file = preds_path / 'predictions.h5'
        metrics_path = data_path / batch / 'metrics' / 'testing' / img

        try:
            metrics_path.mkdir(parents=True)
        except FileExistsError:
            print('Metrics directory already exists')

        for i, pctl in enumerate(pctls):
            # Fine-tune the pretrained generalized model: freeze all but the last pretrained layer
            # and stack two new Dense layers on top
            pretrained_model = tf.keras.models.load_model(model_path)
            for layer in pretrained_model.layers[:6]:
                layer.trainable = False
            pretrained_model.layers[6].trainable = True
            ll = pretrained_model.layers[6].output
            ll = tf.keras.layers.Dense(6)(ll)
            ll = tf.keras.layers.Dense(6)(ll)
            new_model = Model(pretrained_model.input, outputs=ll)

            print('Training')
            data_train, data_vector_train, data_ind_train, feat_keep = preprocessing(
                data_path, img, pctl, feat_list_new, test=False)
            perm_index = feat_keep.index('GSW_perm')
            flood_index = feat_keep.index('flooded')
            gsw_index = feat_keep.index('GSW_maxExtent')
            data_vector_train = np.delete(data_vector_train, perm_index, axis=1)  # Remove GSW_perm column
            data_vector_train = np.delete(data_vector_train, gsw_index, axis=1)
            data_shape = data_vector_train.shape
            X_train, y_train = data_vector_train[:, 0:data_shape[1] - 1], data_vector_train[:, data_shape[1] - 1]
            # The optimizer/loss below are assumed; the original compile settings are not shown here
            new_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy',
                              metrics=['sparse_categorical_accuracy'])
            new_model.fit(X_train, y_train)

            print('Preprocessing', img, pctl, '% cloud cover')
            data_test, data_vector_test, data_ind_test, feat_keep = preprocessing(
                data_path, img, pctl, feat_list_new, test=True)
            perm_index = feat_keep.index('GSW_perm')
            flood_index = feat_keep.index('flooded')
            gsw_index = feat_list_new.index('GSW_maxExtent')
            data_vector_test[data_vector_test[:, perm_index] == 1, flood_index] = 0  # Remove flood water that is perm water
            data_vector_test = np.delete(data_vector_test, perm_index, axis=1)  # Remove GSW_perm column
            data_vector_test = np.delete(data_vector_test, gsw_index, axis=1)
            data_shape = data_vector_test.shape
            X_test, y_test = data_vector_test[:, 0:data_shape[1] - 1], data_vector_test[:, data_shape[1] - 1]

            print('Predicting for {} at {}% cloud cover'.format(img, pctl))
            start_time = time.time()
            preds = new_model.predict(X_test, batch_size=model_params['batch_size'], use_multiprocessing=True)
            preds = np.argmax(preds, axis=1)  # Most probable class

            try:
                preds_path.mkdir(parents=True)
            except FileExistsError:
                pass

            with h5py.File(bin_file, 'a') as f:
                if str(pctl) in f:
                    print('Deleting earlier mean predictions')
                    del f[str(pctl)]
                f.create_dataset(str(pctl), data=preds)

            times.append(timer(start_time, time.time(), False))  # Elapsed time for prediction

            print('Evaluating predictions')
            accuracy.append(accuracy_score(y_test, preds))
            precision.append(precision_score(y_test, preds))
            recall.append(recall_score(y_test, preds))
            f1.append(f1_score(y_test, preds))

            del preds, X_test, y_test, data_test, data_vector_test, data_ind_test

        metrics = pd.DataFrame(np.column_stack([pctls, accuracy, precision, recall, f1]),
                               columns=['cloud_cover', 'accuracy', 'precision', 'recall', 'f1'])
        metrics.to_csv(metrics_path / 'metrics.csv', index=False)
        times = [float(i) for i in times]  # Convert time objects to float so the CSV values are numeric
        times_df = pd.DataFrame(np.column_stack([pctls, times]), columns=['cloud_cover', 'testing_time'])
        times_df.to_csv(metrics_path / 'testing_times.csv', index=False)
def rf_training(img_list, pctls, feat_list_new, data_path, batch, n_jobs):
    for j, img in enumerate(img_list):
        print(img + ': stacking tif, generating clouds')
        times = []
        tif_stacker(data_path, img, feat_list_new, features=True, overwrite=False)
        cloud_generator(img, data_path, overwrite=False)

        for i, pctl in enumerate(pctls):
            print(img, pctl, '% CLOUD COVER')
            print('Preprocessing')
            tf.keras.backend.clear_session()
            data_train, data_vector_train, data_ind_train, feat_keep = preprocessing(
                data_path, img, pctl, feat_list_new, test=False)
            perm_index = feat_keep.index('GSW_perm')
            flood_index = feat_keep.index('flooded')
            data_vector_train[data_vector_train[:, perm_index] == 1, flood_index] = 0  # Remove flood water that is perm water
            data_vector_train = np.delete(data_vector_train, perm_index, axis=1)  # Remove perm water column
            shape = data_vector_train.shape
            X_train, y_train = data_vector_train[:, 0:shape[1] - 1], data_vector_train[:, shape[1] - 1]

            model_path = data_path / batch / 'models' / img
            metrics_path = data_path / batch / 'metrics' / 'training' / img / '{}'.format(
                img + '_clouds_' + str(pctl))
            try:
                metrics_path.mkdir(parents=True)
                model_path.mkdir(parents=True)
            except FileExistsError:
                pass
            param_path = data_path / batch / 'models' / '4514_LC08_027033_20170826_1' / '{}'.format(
                '4514_LC08_027033_20170826_1_clouds_50params.pkl')
            model_path = model_path / '{}'.format(img + '_clouds_' + str(pctl) + '.sav')

            # # Hyperparameter optimization
            # print('Hyperparameter search')
            # base_rf = RandomForestClassifier(random_state=0, n_estimators=100, max_leaf_nodes=10)
            # space = [skopt.space.Integer(2, 1000, name="max_leaf_nodes"),
            #          skopt.space.Integer(2, 200, name="n_estimators"),
            #          skopt.space.Integer(2, 3000, name="max_depth")]
            # @use_named_args(space)
            # def objective(**params):
            #     base_rf.set_params(**params)
            #     return -np.mean(cross_val_score(base_rf, X_train, y_train, cv=5, n_jobs=n_jobs, scoring="f1"))
            # res_rf = forest_minimize(objective, space, base_estimator='RF', n_calls=11,
            #                          random_state=0, verbose=True, n_jobs=n_jobs)
            # print(type(res_rf))
            # skopt.utils.dump(res_rf, param_path, store_objective=False)

            res_rf = skopt.utils.load(param_path)

            # Training
            print('Training with optimized hyperparameters')
            start_time = time.time()
            rf = RandomForestClassifier(random_state=0,
                                        max_leaf_nodes=res_rf.x[0],
                                        n_estimators=res_rf.x[1],
                                        max_depth=res_rf.x[2],
                                        n_jobs=-1)
            rf.fit(X_train, y_train)
            end_time = time.time()
            times.append(timer(start_time, end_time, False))
            joblib.dump(rf, model_path)

        metrics_path = metrics_path.parent
        times = [float(i) for i in times]
        times = np.column_stack([pctls, times])
        times_df = pd.DataFrame(times, columns=['cloud_cover', 'training_time'])
        times_df.to_csv(metrics_path / 'training_times.csv', index=False)
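# The commented-out search above saves a skopt OptimizeResult that rf_training later reloads.
# For reference, a short sketch of how such a saved result is consumed, assuming the same
# three-parameter search space order as defined above:
import skopt

res_rf = skopt.utils.load(param_path)                                # result saved by skopt.utils.dump
best_max_leaf_nodes, best_n_estimators, best_max_depth = res_rf.x    # order matches the search space
best_cv_f1 = -res_rf.fun                                             # objective was the negated CV F1 score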
def log_reg_training_sample(img_list, pctls, feat_list_new, feat_list_all, data_path, batch,
                            n_flood, n_nonflood):
    for img in img_list:
        print(img + ': stacking tif, generating clouds')
        times = []
        tif_stacker(data_path, img, feat_list_new, overwrite=False)
        cloud_generator(img, data_path, overwrite=False)

        for pctl in pctls:
            print(img, pctl, '% CLOUD COVER')
            print('Preprocessing')
            sample_coords, data_train = get_sample_coords(img, pctl, n_flood, n_nonflood)
            perm_index = data_train.shape[2] - 2
            flood_index = data_train.shape[2] - 1
            data_vector_train = get_sample_data(sample_coords, data_train)
            data_vector_train, scaler = standardize_data(data_vector_train)
            data_vector_train = np.delete(data_vector_train, perm_index, axis=1)  # Remove perm water column
            shape = data_vector_train.shape
            X_train, y_train = data_vector_train[:, 0:shape[1] - 1], data_vector_train[:, shape[1] - 1]

            model_path = data_path / batch / 'models' / img
            metrics_path = data_path / batch / 'metrics' / 'training' / img / '{}'.format(
                img + '_clouds_' + str(pctl))
            scaler_dir = data_path / 'scalers' / img
            if not model_path.exists():
                model_path.mkdir(parents=True)
            if not metrics_path.exists():
                metrics_path.mkdir(parents=True)
            if not scaler_dir.exists():
                scaler_dir.mkdir(parents=True)

            model_path = data_path / batch / 'models' / img / '{}'.format(img + '_clouds_' + str(pctl) + '.sav')
            scaler_path = scaler_dir / '{}_clouds_{}_scaler_.sav'.format(img, str(pctl))
            joblib.dump(scaler, scaler_path)

            print('Training')
            start_time = time.time()
            logreg = LogisticRegression(solver='lbfgs')
            logreg.fit(X_train, y_train)
            end_time = time.time()
            times.append(timer(start_time, end_time, False))
            joblib.dump(logreg, model_path)
            del data_train, data_vector_train, logreg

        metrics_path = metrics_path.parent
        times = [float(i) for i in times]
        times = np.column_stack([pctls, times])
        times_df = pd.DataFrame(times, columns=['cloud_cover', 'training_time'])
        times_df.to_csv(metrics_path / 'training_times.csv', index=False)
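# get_sample_coords, get_sample_data, and standardize_data above are project helpers defined
# elsewhere. Purely as a hedged sketch of the likely standardization pattern (assuming the last
# two columns are the GSW_perm and flooded labels, as implied by perm_index/flood_index above):
import numpy as np
from sklearn.preprocessing import StandardScaler


def standardize_data_sketch(data_vector):
    """Scale feature columns only, returning the fitted scaler for reuse at prediction time."""
    data_vector = data_vector.copy()
    scaler = StandardScaler()
    data_vector[:, :-2] = scaler.fit_transform(data_vector[:, :-2])  # leave label columns untouched
    return data_vector, scaler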
def prediction_with_uncertainty(img_list, pctls, feat_list_new, data_path, batch, DROPOUT_RATE, MC_PASSES,
                                remove_perm, weight_decay=0.005, length_scale=0.00001, **model_params):
    for j, img in enumerate(img_list):
        times = []
        accuracy, precision, recall, f1 = [], [], [], []
        preds_path = data_path / batch / 'predictions' / img
        vars_path = data_path / batch / 'variances' / img
        mc_bin_file = preds_path / 'mc_preds.h5'
        preds_bin_file = preds_path / 'predictions.h5'
        vars_bin_file = vars_path / 'variances.h5'
        metrics_path = data_path / batch / 'metrics' / 'testing' / img

        try:
            metrics_path.mkdir(parents=True)
        except FileExistsError:
            print('Metrics directory already exists')
        try:
            preds_path.mkdir(parents=True)
        except FileExistsError:
            print('Predictions directory already exists')
        try:
            vars_path.mkdir(parents=True)
        except FileExistsError:
            print('Variances directory already exists')

        for i, pctl in enumerate(pctls):
            print('Preprocessing', img, pctl, '% cloud cover')
            data_test, data_vector_test, data_ind_test, feat_keep = preprocessing(data_path, img, pctl, gaps=True)
            feat_list_keep = [feat_list_new[i] for i in feat_keep]  # Removed if feat was deleted in preprocessing
            if remove_perm:
                perm_index = feat_list_keep.index('GSW_perm')
                flood_index = feat_list_keep.index('flooded')
                data_vector_test[data_vector_test[:, perm_index] == 1, flood_index] = 0  # Remove flood water that is perm water
                data_vector_test = np.delete(data_vector_test, perm_index, axis=1)  # Remove GSW_perm column
            data_shape = data_vector_test.shape
            X_test, y_test = data_vector_test[:, 0:data_shape[1] - 1], data_vector_test[:, data_shape[1] - 1]

            # Initialize binary file to hold predictions
            with h5py.File(mc_bin_file, 'w') as f:
                f.create_dataset('mc_preds', shape=(X_test.shape[0], 1),
                                 maxshape=(X_test.shape[0], None),
                                 chunks=True, compression='gzip')  # Create empty dataset with shape of data

            start_time = time.time()
            model_path = data_path / batch / 'models' / img / '{}'.format(img + '_clouds_' + str(pctl) + '.h5')
            trained_model = tf.keras.models.load_model(model_path)

            for k in range(MC_PASSES):
                if k % 10 == 0 or k == MC_PASSES - 1:
                    print('Running MC {}/{} for {} at {}% cloud cover'.format(k, MC_PASSES, img, pctl))
                flood_prob = trained_model.predict(X_test, batch_size=model_params['batch_size'],
                                                   use_multiprocessing=True)  # Predict
                flood_prob = flood_prob[:, 1]  # Drop probability of not flooded (0) to save space
                with h5py.File(mc_bin_file, 'a') as f:
                    f['mc_preds'][:, -1] = flood_prob  # Append preds to h5 file
                    if k < MC_PASSES - 1:  # Resize to append next pass, if there is one
                        f['mc_preds'].resize((f['mc_preds'].shape[1] + 1), axis=1)
            tf.keras.backend.clear_session()
            del flood_prob

            # Calculate MC statistics
            print('Calculating MC statistics for {} at {}% cloud cover'.format(img, pctl))
            with h5py.File(mc_bin_file, 'r') as f:
                dset = f['mc_preds']
                preds_da = da.from_array(dset, chunks="250 MiB")  # Open h5 file as dask array
                means = preds_da.mean(axis=1)
                means = means.compute()
                variance = preds_da.var(axis=1)
                variance = variance.compute()
                tau = (length_scale ** 2 * (1 - DROPOUT_RATE)) / (2 * data_shape[0] * weight_decay)
                variance = variance + tau
                preds = means.round()
                del f, means, preds_da, dset

            os.remove(mc_bin_file)  # Delete predictions to save space on disk

            print('Saving mean preds/vars for {} at {}% cloud cover'.format(img, pctl))
            with h5py.File(preds_bin_file, 'a') as f:
                if str(pctl) in f:
                    print('Deleting earlier mean predictions')
                    del f[str(pctl)]
                f.create_dataset(str(pctl), data=preds)
            with h5py.File(vars_bin_file, 'a') as f:
                if str(pctl) in f:
                    print('Deleting earlier variances')
                    del f[str(pctl)]
                f.create_dataset(str(pctl), data=variance)

            times.append(timer(start_time, time.time(), False))  # Elapsed time for MC simulations

            print('Evaluating predictions for {} at {}% cloud cover'.format(img, pctl))
            accuracy.append(accuracy_score(y_test, preds))
            precision.append(precision_score(y_test, preds))
            recall.append(recall_score(y_test, preds))
            f1.append(f1_score(y_test, preds))

            del preds, X_test, y_test, trained_model, data_test, data_vector_test, data_ind_test

        metrics = pd.DataFrame(np.column_stack([pctls, accuracy, precision, recall, f1]),
                               columns=['cloud_cover', 'accuracy', 'precision', 'recall', 'f1'])
        metrics.to_csv(metrics_path / 'metrics.csv', index=False)
        times = [float(i) for i in times]  # Convert time objects to float so the CSV values are numeric
        times_df = pd.DataFrame(np.column_stack([pctls, times]), columns=['cloud_cover', 'testing_time'])
        times_df.to_csv(metrics_path / 'testing_times.csv', index=False)
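# prediction_with_uncertainty streams the MC dropout passes through HDF5 and dask to keep memory
# bounded. For readability, here is a minimal in-memory restatement of the same statistics, using
# a hypothetical array of passes and example values for length_scale, dropout rate, and weight decay:
import numpy as np

mc_preds = np.random.rand(10000, 50)       # hypothetical: 10,000 pixels x 50 MC dropout passes
means = mc_preds.mean(axis=1)              # mean flood probability per pixel
variance = mc_preds.var(axis=1)            # spread across stochastic forward passes
tau = (0.00001 ** 2 * (1 - 0.2)) / (2 * mc_preds.shape[0] * 0.005)   # same precision term as above
variance = variance + tau
preds = means.round()                      # hard 0/1 predictions from the mean probability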
def NN_prediction(img_list, pctls, feat_list_all, data_path, batch, **model_params):
    for j, img in enumerate(img_list):
        times = []
        accuracy, precision, recall, f1, roc_auc = [], [], [], [], []
        preds_path = data_path / batch / 'predictions' / img
        bin_file = preds_path / 'predictions.h5'
        metrics_path = data_path / batch / 'metrics' / 'testing' / img

        try:
            metrics_path.mkdir(parents=True)
        except FileExistsError:
            print('Metrics directory already exists')

        for i, pctl in enumerate(pctls):
            print('Preprocessing', img, pctl, '% cloud cover')
            data_test, data_vector_test, data_ind_test, feat_keep = preprocessing(
                data_path, img, pctl, feat_list_all, test=True)
            perm_index = feat_keep.index('GSW_perm')
            flood_index = feat_keep.index('flooded')
            data_vector_test[data_vector_test[:, perm_index] == 1, flood_index] = 0  # Remove flood water that is perm water
            data_vector_test = np.delete(data_vector_test, perm_index, axis=1)  # Remove GSW_perm column
            data_shape = data_vector_test.shape
            X_test, y_test = data_vector_test[:, 0:data_shape[1] - 1], data_vector_test[:, data_shape[1] - 1]

            print('Predicting for {} at {}% cloud cover'.format(img, pctl))
            start_time = time.time()
            model_path = data_path / batch / 'models' / img / '{}'.format(img + '_clouds_' + str(pctl) + '.h5')
            trained_model = load_macro_soft_f1_model(model_path)
            pred_probs = trained_model.predict(X_test, batch_size=model_params['batch_size'],
                                               use_multiprocessing=True)
            preds = np.argmax(pred_probs, axis=1)  # Most probable class

            try:
                preds_path.mkdir(parents=True)
            except FileExistsError:
                pass

            with h5py.File(bin_file, 'a') as f:
                if str(pctl) in f:
                    print('Deleting earlier mean predictions')
                    del f[str(pctl)]
                f.create_dataset(str(pctl), data=preds)

            times.append(timer(start_time, time.time(), False))  # Elapsed time for prediction

            print('Evaluating predictions')
            perm_mask = data_test[:, :, perm_index]
            perm_mask = perm_mask.reshape([perm_mask.shape[0] * perm_mask.shape[1]])
            perm_mask = perm_mask[~np.isnan(perm_mask)]
            preds[perm_mask.astype('bool')] = 0
            y_test[perm_mask.astype('bool')] = 0

            accuracy.append(accuracy_score(y_test, preds))
            precision.append(precision_score(y_test, preds))
            recall.append(recall_score(y_test, preds))
            f1.append(f1_score(y_test, preds))
            roc_auc.append(roc_auc_score(y_test, pred_probs[:, 1]))

            del preds, pred_probs, X_test, y_test, trained_model, data_test, data_vector_test, data_ind_test

        metrics = pd.DataFrame(np.column_stack([pctls, accuracy, precision, recall, f1, roc_auc]),
                               columns=['cloud_cover', 'accuracy', 'precision', 'recall', 'f1', 'auc'])
        metrics.to_csv(metrics_path / 'metrics.csv', index=False)
        times = [float(i) for i in times]  # Convert time objects to float so the CSV values are numeric
        times_df = pd.DataFrame(np.column_stack([pctls, times]), columns=['cloud_cover', 'testing_time'])
        times_df.to_csv(metrics_path / 'testing_times.csv', index=False)
# end_time = time.time()
# print('statsmodel training time:', timer(start_time, end_time, False))
# result.save(str(model_path))

# Logistic regression using statsmodel
model_path = data_path / batch / 'models' / img
if not model_path.exists():
    model_path.mkdir(parents=True)
model_path = model_path / '{}'.format(img + '_statsmodel2.pickle')

print('Training statsmodel')
start_time = time.time()
logreg_sm = sm.Logit(y_train, X_train)
result = logreg_sm.fit_regularized()
print(result.summary())
end_time = time.time()
print('statsmodel training time:', timer(start_time, end_time, False))
result.save(str(model_path))

# Prediction
preds_path = data_path / batch / 'predictions' / img
if not preds_path.exists():
    preds_path.mkdir(parents=True)
bin_file = preds_path / 'predictions.h5'

data_test, data_vector_test, data_ind_test, feat_keep = preprocessing(
    data_path, img, pctl, feat_list_new, test=True)
perm_index = feat_keep.index('GSW_perm')
flood_index = feat_keep.index('flooded')
data_vector_test[data_vector_test[:, perm_index] == 1, flood_index] = 0  # Remove flood water that is perm water
data_vector_test = np.delete(data_vector_test, perm_index, axis=1)  # Remove GSW_perm column
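# The fragment above stops before the statsmodels prediction step. As a hedged sketch of how the
# fitted result could be applied (assuming X_test is built from data_vector_test as in the other
# prediction functions), statsmodels Logit results expose predict() on the exogenous matrix:
pred_probs_sm = result.predict(X_test)          # P(flooded) from the regularized Logit fit
preds_sm = (pred_probs_sm >= 0.5).astype(int)   # hard 0/1 predictions at a 0.5 threshold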
def log_reg_training_buffer(img_list, pctls, feat_list_new, data_path, batch, buffer_iters, buffer_flood_only):
    from imageio import imwrite
    for img in img_list:
        print(img + ': stacking tif, generating clouds')
        times = []
        tif_stacker(data_path, img, feat_list_new, features=True, overwrite=False)
        cloud_generator(img, data_path, overwrite=False)

        for pctl in pctls:
            print('Preprocessing')
            data_train_full, data_vector_train, data_ind_train, feat_keep = preprocessing(
                data_path, img, pctl, feat_list_new, test=False)

            for buffer_iter in buffer_iters:
                perm_index = feat_keep.index('GSW_perm')
                flood_index = feat_keep.index('flooded')
                data_train = data_train_full.copy()
                if buffer_flood_only:
                    data_train[data_train[:, :, perm_index] == 1, flood_index] = 0
                    mask = data_train[:, :, flood_index]
                    buffer_mask = np.invert(binary_dilation(mask, iterations=buffer_iter))
                else:
                    mask = data_train[:, :, flood_index]
                    buffer_mask = np.invert(binary_dilation(mask, iterations=buffer_iter))
                    data_train[data_train[:, :, perm_index] == 1, flood_index] = 0
                data_train[buffer_mask] = np.nan

                data_vector_train = data_train.reshape(
                    [data_train.shape[0] * data_train.shape[1], data_train.shape[2]])
                data_vector_train = data_vector_train[~np.isnan(data_vector_train).any(axis=1)]
                data_vector_train = np.delete(data_vector_train, perm_index, axis=1)  # Remove perm water column
                shape = data_vector_train.shape
                X_train, y_train = data_vector_train[:, 0:shape[1] - 1], data_vector_train[:, shape[1] - 1]

                model_path = data_path / batch / 'models' / img
                metrics_path = data_path / batch / 'metrics' / 'training' / img / '{}'.format(
                    img + '_clouds_' + str(pctl))
                if not model_path.exists():
                    model_path.mkdir(parents=True)
                if not metrics_path.exists():
                    metrics_path.mkdir(parents=True)
                model_path = model_path / '{}'.format(
                    img + '_clouds_' + str(pctl) + 'buff' + str(buffer_iter) + '.sav')

                # Save data flooding image to check that buffering is working correctly
                # imwrite(model_path.parents[0] / '{}'.format('buff' + str(buffer_iter) + '.jpg'), data_train[:, :, 6])

                print('Training')
                start_time = time.time()
                logreg = LogisticRegression(n_jobs=-1, solver='sag')
                logreg.fit(X_train, y_train)
                end_time = time.time()
                times.append(timer(start_time, end_time, False))
                joblib.dump(logreg, model_path)

        metrics_path = metrics_path.parent
        times = [float(i) for i in times]
        times = np.column_stack([np.repeat(pctls, len(buffer_iters)),
                                 np.tile(buffer_iters, len(pctls)),
                                 times])
        times_df = pd.DataFrame(times, columns=['cloud_cover', 'buffer_iters', 'training_time'])
        times_df.to_csv(metrics_path / 'training_times.csv', index=False)
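# For reference, a tiny toy illustration (not part of the pipeline) of the buffering step above.
# scipy.ndimage.binary_dilation grows the flood mask by buffer_iter pixels (cross-shaped
# structuring element by default); inverting it marks everything outside that buffer.
import numpy as np
from scipy.ndimage import binary_dilation

mask = np.zeros((5, 5), dtype=bool)
mask[2, 2] = True                                              # one flooded pixel
buffer_mask = np.invert(binary_dilation(mask, iterations=1))   # True outside the dilated flood area
# Assigning np.nan where buffer_mask is True, as above, keeps only pixels within the buffer
# around flooding in the training vector once NaN rows are dropped.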
def training6(img_list, pctls, model_func, feat_list_new, data_path, batch, T,
              dropout_rate=0.2, **model_params):
    '''
    1. Removes ALL pixels that are over permanent water
    2. Finds the optimum learning rate and uses cyclic LR scheduler to train the model
    3. No validation set for training
    '''
    get_model = model_func
    for j, img in enumerate(img_list):
        print(img + ': stacking tif, generating clouds')
        times = []
        tif_stacker(data_path, img, feat_list_new, features=True, overwrite=False)
        cloud_generator(img, data_path, overwrite=False)

        for i, pctl in enumerate(pctls):
            print(img, pctl, '% CLOUD COVER')
            print('Preprocessing')
            tf.keras.backend.clear_session()
            data_train, data_vector_train, data_ind_train, feat_keep = preprocessing(
                data_path, img, pctl, feat_list_new, gaps=False)
            perm_index = feat_keep.index('GSW_perm')
            flood_index = feat_keep.index('flooded')
            data_vector_train[data_vector_train[:, perm_index] == 1, flood_index] = 0  # Remove flood water that is perm water
            data_vector_train = np.delete(data_vector_train, perm_index, axis=1)  # Remove perm water column
            shape = data_vector_train.shape
            X_train, y_train = data_vector_train[:, 0:shape[1] - 1], data_vector_train[:, shape[1] - 1]
            y_train = to_categorical(y_train)
            INPUT_DIMS = X_train.shape[1]

            model_path = data_path / batch / 'models' / img
            metrics_path = data_path / batch / 'metrics' / 'training' / img / '{}'.format(
                img + '_clouds_' + str(pctl))
            try:
                metrics_path.mkdir(parents=True)
                model_path.mkdir(parents=True)
            except FileExistsError:
                pass
            model_path = model_path / '{}'.format(img + '_clouds_' + str(pctl) + '.h5')

            callbacks = [tf.keras.callbacks.EarlyStopping(monitor='softmax_output_categorical_accuracy',
                                                          min_delta=0.005, patience=5),
                         tf.keras.callbacks.ModelCheckpoint(filepath=str(model_path), monitor='loss',
                                                            save_best_only=True),
                         CSVLogger(metrics_path / 'training_log.log')]

            start_time = time.time()
            model = get_model(model_params['epochs'], X_train, y_train, X_train.shape, T, D=2,
                              batch_size=model_params['batch_size'], dropout_rate=dropout_rate,
                              callbacks=callbacks)
            end_time = time.time()
            times.append(timer(start_time, end_time, False))
            # model.save(model_path)

        metrics_path = metrics_path.parent
        times = [float(i) for i in times]
        times = np.column_stack([pctls, times])
        times_df = pd.DataFrame(times, columns=['cloud_cover', 'training_time'])
        times_df.to_csv(metrics_path / 'training_times.csv', index=False)
def prediction_BNN_gen_model(img_list, pctls, feat_list_new, data_path, batch, MC_passes, **model_params):
    for j, img in enumerate(img_list):
        times = []
        accuracy, precision, recall, f1 = [], [], [], []
        preds_path = data_path / batch / 'predictions' / img
        bin_file = preds_path / 'predictions.h5'
        model_path = data_path / batch / 'models' / 'gen_model.h5'
        uncertainties_path = data_path / batch / 'uncertainties' / img
        aleatoric_bin_file = uncertainties_path / 'aleatoric_uncertainties.h5'
        epistemic_bin_file = uncertainties_path / 'epistemic_uncertainties.h5'
        metrics_path = data_path / batch / 'metrics' / 'testing' / img

        try:
            metrics_path.mkdir(parents=True)
        except FileExistsError:
            print('Metrics directory already exists')

        for i, pctl in enumerate(pctls):
            print('Preprocessing', img, pctl, '% cloud cover')
            data_test, data_vector_test, data_ind_test, feat_keep = preprocessing(
                data_path, img, pctl, feat_list_new, test=True)
            perm_index = feat_keep.index('GSW_perm')
            flood_index = feat_keep.index('flooded')
            data_vector_test[data_vector_test[:, perm_index] == 1, flood_index] = 0  # Remove flood water that is perm water
            data_vector_test = np.delete(data_vector_test, perm_index, axis=1)  # Remove GSW_perm column
            data_shape = data_vector_test.shape
            X_test, y_test = data_vector_test[:, 0:data_shape[1] - 1], data_vector_test[:, data_shape[1] - 1]

            print('Predicting for {} at {}% cloud cover'.format(img, pctl))
            start_time = time.time()
            model = tf.keras.models.load_model(model_path)
            p_hat = []
            for t in range(MC_passes):
                p_hat.append(model.predict(X_test, batch_size=model_params['batch_size'],
                                           use_multiprocessing=True)[:, 1])
            p_hat = np.array(p_hat)
            preds = np.round(np.mean(p_hat, axis=0))
            aleatoric = np.mean(p_hat * (1 - p_hat), axis=0)
            epistemic = np.mean(p_hat ** 2, axis=0) - np.mean(p_hat, axis=0) ** 2

            try:
                preds_path.mkdir(parents=True)
            except FileExistsError:
                pass
            with h5py.File(bin_file, 'a') as f:
                if str(pctl) in f:
                    print('Deleting earlier mean predictions')
                    del f[str(pctl)]
                f.create_dataset(str(pctl), data=preds)

            try:
                uncertainties_path.mkdir(parents=True)
            except FileExistsError:
                pass
            with h5py.File(epistemic_bin_file, 'a') as f:
                if str(pctl) in f:
                    print('Deleting earlier epistemic uncertainties')
                    del f[str(pctl)]
                f.create_dataset(str(pctl), data=epistemic)
            with h5py.File(aleatoric_bin_file, 'a') as f:
                if str(pctl) in f:
                    print('Deleting earlier aleatoric uncertainties')
                    del f[str(pctl)]
                f.create_dataset(str(pctl), data=aleatoric)

            times.append(timer(start_time, time.time(), False))

            print('Evaluating predictions')
            accuracy.append(accuracy_score(y_test, preds))
            precision.append(precision_score(y_test, preds))
            recall.append(recall_score(y_test, preds))
            f1.append(f1_score(y_test, preds))

            del preds, p_hat, aleatoric, epistemic, X_test, y_test, model, data_test, data_vector_test, data_ind_test

        metrics = pd.DataFrame(np.column_stack([pctls, accuracy, precision, recall, f1]),
                               columns=['cloud_cover', 'accuracy', 'precision', 'recall', 'f1'])
        metrics.to_csv(metrics_path / 'metrics.csv', index=False)
        times = [float(i) for i in times]
        times_df = pd.DataFrame(np.column_stack([pctls, times]), columns=['cloud_cover', 'testing_time'])
        times_df.to_csv(metrics_path / 'testing_times.csv', index=False)
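# The uncertainty split above decomposes MC-dropout predictive uncertainty into the expected
# Bernoulli variance (aleatoric, data noise) and the variance of the mean prediction across passes
# (epistemic, model disagreement). A standalone numpy restatement with a hypothetical p_hat array:
import numpy as np

p_hat = np.random.rand(30, 1000)                    # hypothetical: 30 MC passes x 1,000 pixels
preds = np.round(p_hat.mean(axis=0))                # mean prediction across passes
aleatoric = np.mean(p_hat * (1 - p_hat), axis=0)    # expected Bernoulli variance (data noise)
epistemic = np.mean(p_hat ** 2, axis=0) - p_hat.mean(axis=0) ** 2   # variance of the mean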
def prediction_bnn(img_list, pctls, feat_list_new, data_path, batch, MC_passes):
    for j, img in enumerate(img_list):
        epistemic_times = []
        aleatoric_times = []
        accuracy, precision, recall, f1 = [], [], [], []
        preds_path = data_path / batch / 'predictions' / img
        bin_file = preds_path / 'predictions.h5'
        aleatoric_bin_file = preds_path / 'aleatoric_predictions.h5'
        uncertainties_path = data_path / batch / 'uncertainties' / img
        aleatoric_uncertainty_file = uncertainties_path / 'aleatoric_uncertainties.h5'
        epistemic_uncertainty_file = uncertainties_path / 'epistemic_uncertainties.h5'
        metrics_path = data_path / batch / 'metrics' / 'testing' / img

        try:
            metrics_path.mkdir(parents=True)
        except FileExistsError:
            print('Metrics directory already exists')

        for i, pctl in enumerate(pctls):
            print('Preprocessing', img, pctl, '% cloud cover')
            data_test, data_vector_test, data_ind_test, feat_keep = preprocessing(
                data_path, img, pctl, feat_list_new, test=True)
            perm_index = feat_keep.index('GSW_perm')
            flood_index = feat_keep.index('flooded')
            data_vector_test[data_vector_test[:, perm_index] == 1, flood_index] = 0
            data_vector_test = np.delete(data_vector_test, perm_index, axis=1)
            data_shape = data_vector_test.shape
            X_test, y_test = data_vector_test[:, 0:data_shape[1] - 1], data_vector_test[:, data_shape[1] - 1]
            y_test = to_categorical(y_test)
            D = len(set(y_test[:, 0]))  # Target classes
            iterable = K.variable(np.ones(MC_passes))

            print('Predicting (aleatoric) for {} at {}% cloud cover'.format(img, pctl))
            model_path = data_path / batch / 'models' / img / '{}'.format(img + '_clouds_' + str(pctl) + '.h5')
            start_time = time.time()
            # aleatoric_model = tf.keras.models.load_model(model_path)
            aleatoric_model = load_bayesian_model(model_path, MC_passes, D, iterable)
            aleatoric_results = aleatoric_model.predict(X_test, verbose=1)
            aleatoric_uncertainties = np.reshape(aleatoric_results[0][:, D:], (-1))

            try:
                uncertainties_path.mkdir(parents=True)
            except FileExistsError:
                pass
            with h5py.File(aleatoric_uncertainty_file, 'a') as f:
                if str(pctl) in f:
                    print('Deleting earlier aleatoric uncertainties')
                    del f[str(pctl)]
                f.create_dataset(str(pctl), data=aleatoric_uncertainties)

            logits = aleatoric_results[0][:, 0:D]
            aleatoric_preds = np.argmax(aleatoric_results[1], axis=1)
            aleatoric_times.append(timer(start_time, time.time(), False))

            try:
                preds_path.mkdir(parents=True)
            except FileExistsError:
                pass
            with h5py.File(aleatoric_bin_file, 'a') as f:
                if str(pctl) in f:
                    print('Deleting earlier aleatoric predictions')
                    del f[str(pctl)]
                f.create_dataset(str(pctl), data=aleatoric_preds)

            print('Predicting (epistemic) for {} at {}% cloud cover'.format(img, pctl))
            start_time = time.time()
            epistemic_model = get_epistemic_uncertainty_model(model_path, T=MC_passes, D=D)
            epistemic_results = epistemic_model.predict(X_test, verbose=2, use_multiprocessing=True)
            epistemic_uncertainties = epistemic_results[0]

            with h5py.File(epistemic_uncertainty_file, 'a') as f:
                if str(pctl) in f:
                    print('Deleting earlier epistemic uncertainties')
                    del f[str(pctl)]
                f.create_dataset(str(pctl), data=epistemic_uncertainties)

            epistemic_preds = np.argmax(epistemic_results[1], axis=1)
            epistemic_times.append(timer(start_time, time.time(), False))

            with h5py.File(bin_file, 'a') as f:
                if str(pctl) in f:
                    print('Deleting earlier epistemic predictions')
                    del f[str(pctl)]
                f.create_dataset(str(pctl), data=epistemic_preds)

            print('Evaluating predictions')
            accuracy.append(accuracy_score(y_test[:, 1], epistemic_preds))
            precision.append(precision_score(y_test[:, 1], epistemic_preds))
            recall.append(recall_score(y_test[:, 1], epistemic_preds))
            f1.append(f1_score(y_test[:, 1], epistemic_preds))

            del aleatoric_model, aleatoric_results, aleatoric_uncertainties, logits, aleatoric_preds, \
                epistemic_model, epistemic_uncertainties, epistemic_preds, epistemic_results, \
                data_test, data_vector_test, data_ind_test

        metrics = pd.DataFrame(np.column_stack([pctls, accuracy, precision, recall, f1]),
                               columns=['cloud_cover', 'accuracy', 'precision', 'recall', 'f1'])
        metrics.to_csv(metrics_path / 'metrics.csv', index=False)
        epistemic_times = [float(i) for i in epistemic_times]
        aleatoric_times = [float(i) for i in aleatoric_times]
        times_df = pd.DataFrame(np.column_stack([pctls, epistemic_times, aleatoric_times]),
                                columns=['cloud_cover', 'epistemic_testing_time', 'aleatoric_testing_time'])
        times_df.to_csv(metrics_path / 'testing_times.csv', index=False)