def training_bnn(img_list, pctls, feat_list_new, data_path, batch, **model_params):
    for j, img in enumerate(img_list):
        print(img + ': stacking tif, generating clouds')
        times = []
        tif_stacker(data_path, img, feat_list_new, features=True, overwrite=False)
        cloud_generator(img, data_path, overwrite=False)

        for i, pctl in enumerate(pctls):
            print(img, pctl, '% CLOUD COVER')
            print('Preprocessing')
            tf.keras.backend.clear_session()
            data_train, data_vector_train, data_ind_train, feat_keep = preprocessing(
                data_path, img, pctl, feat_list_new, test=False)
            perm_index = feat_keep.index('GSW_perm')
            flood_index = feat_keep.index('flooded')
            data_vector_train[data_vector_train[:, perm_index] == 1, flood_index] = 0  # Remove flood water that is perm water
            data_vector_train = np.delete(data_vector_train, perm_index, axis=1)  # Remove perm water column
            shape = data_vector_train.shape
            X_train, y_train = data_vector_train[:, 0:shape[1] - 1], data_vector_train[:, shape[1] - 1]
            y_train = to_categorical(y_train)
            D = len(set(y_train[:, 0]))  # Target classes

            model_path = data_path / batch / 'models' / img
            metrics_path = data_path / batch / 'metrics' / 'training' / img / '{}'.format(
                img + '_clouds_' + str(pctl))
            try:
                metrics_path.mkdir(parents=True)
                model_path.mkdir(parents=True)
            except FileExistsError:
                pass
            model_path = model_path / '{}'.format(img + '_clouds_' + str(pctl) + '.h5')

            print('Training model')
            start_time = time.time()
            aleatoric_model = get_aleatoric_uncertainty_model(X_train, y_train, **model_params, D=D)
            end_time = time.time()
            times.append(timer(start_time, end_time, False))
            aleatoric_model.save(model_path)

        metrics_path = metrics_path.parent
        times = [float(i) for i in times]
        times = np.column_stack([pctls, times])
        times_df = pd.DataFrame(times, columns=['cloud_cover', 'training_time'])
        times_df.to_csv(metrics_path / 'training_times.csv', index=False)
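
# A hypothetical invocation of training_bnn (a sketch: the batch name and the
# model_params keys are assumptions; the actual keys depend on the signature of
# get_aleatoric_uncertainty_model):
#
#   model_params = {'epochs': 100, 'batch_size': 8192, 'verbose': 2}
#   training_bnn(img_list, pctls, feat_list_new, data_path, batch='BNN_test', **model_params)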
def log_reg_training(img_list, pctls, feat_list_new, data_path, batch):
    for j, img in enumerate(img_list):
        print(img + ': stacking tif, generating clouds')
        times = []
        tif_stacker(data_path, img, feat_list_new, features=True, overwrite=False)
        cloud_generator(img, data_path, overwrite=False)

        for i, pctl in enumerate(pctls):
            print(img, pctl, '% CLOUD COVER')
            print('Preprocessing')
            data_train, data_vector_train, data_ind_train, feat_keep = preprocessing(
                data_path, img, pctl, feat_list_new, test=False)
            perm_index = feat_keep.index('GSW_perm')
            flood_index = feat_keep.index('flooded')
            # data_vector_train[data_vector_train[:, perm_index] == 1, flood_index] = 0
            data_vector_train = np.delete(data_vector_train, perm_index, axis=1)  # Remove perm water column
            shape = data_vector_train.shape
            X_train, y_train = data_vector_train[:, 0:shape[1] - 1], data_vector_train[:, shape[1] - 1]

            model_path = data_path / batch / 'models' / img
            metrics_path = data_path / batch / 'metrics' / 'training' / img / '{}'.format(
                img + '_clouds_' + str(pctl))
            if not model_path.exists():
                model_path.mkdir(parents=True)
            if not metrics_path.exists():
                metrics_path.mkdir(parents=True)
            model_path = model_path / '{}'.format(img + '_clouds_' + str(pctl) + '.sav')

            print('Training')
            start_time = time.time()
            logreg = LogisticRegression(n_jobs=-1, solver='sag')
            logreg.fit(X_train, y_train)
            end_time = time.time()
            times.append(timer(start_time, end_time, False))
            joblib.dump(logreg, model_path)

        metrics_path = metrics_path.parent
        times = [float(i) for i in times]
        times = np.column_stack([pctls, times])
        times_df = pd.DataFrame(times, columns=['cloud_cover', 'training_time'])
        times_df.to_csv(metrics_path / 'training_times.csv', index=False)
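
# Sketch of reloading a model saved by log_reg_training for inference. The path layout
# mirrors the one used above; X_new is a hypothetical array with the same feature columns
# (minus GSW_perm) that the model was trained on.
def load_and_predict_logreg(data_path, batch, img, pctl, X_new):
    model_path = data_path / batch / 'models' / img / '{}'.format(img + '_clouds_' + str(pctl) + '.sav')
    logreg = joblib.load(model_path)  # joblib round-trips fitted scikit-learn estimators
    return logreg.predict(X_new)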
def log_reg_training_buffer(img_list, pctls, feat_list_new, data_path, batch,
                            buffer_iters, buffer_flood_only):
    from imageio import imwrite
    for img in img_list:
        print(img + ': stacking tif, generating clouds')
        times = []
        tif_stacker(data_path, img, feat_list_new, features=True, overwrite=False)
        cloud_generator(img, data_path, overwrite=False)

        for pctl in pctls:
            print('Preprocessing')
            data_train_full, data_vector_train, data_ind_train, feat_keep = preprocessing(
                data_path, img, pctl, feat_list_new, test=False)

            for buffer_iter in buffer_iters:
                perm_index = feat_keep.index('GSW_perm')
                flood_index = feat_keep.index('flooded')
                data_train = data_train_full.copy()
                # buffer_flood_only controls whether permanent water is zeroed out before
                # (True) or after (False) the dilation mask is built from the flood layer
                if buffer_flood_only:
                    data_train[data_train[:, :, perm_index] == 1, flood_index] = 0
                    mask = data_train[:, :, flood_index]
                    buffer_mask = np.invert(binary_dilation(mask, iterations=buffer_iter))
                else:
                    mask = data_train[:, :, flood_index]
                    buffer_mask = np.invert(binary_dilation(mask, iterations=buffer_iter))
                    data_train[data_train[:, :, perm_index] == 1, flood_index] = 0
                data_train[buffer_mask] = np.nan

                data_vector_train = data_train.reshape(
                    [data_train.shape[0] * data_train.shape[1], data_train.shape[2]])
                data_vector_train = data_vector_train[~np.isnan(data_vector_train).any(axis=1)]
                data_vector_train = np.delete(data_vector_train, perm_index, axis=1)  # Remove perm water column
                shape = data_vector_train.shape
                X_train, y_train = data_vector_train[:, 0:shape[1] - 1], data_vector_train[:, shape[1] - 1]

                model_path = data_path / batch / 'models' / img
                metrics_path = data_path / batch / 'metrics' / 'training' / img / '{}'.format(
                    img + '_clouds_' + str(pctl))
                if not model_path.exists():
                    model_path.mkdir(parents=True)
                if not metrics_path.exists():
                    metrics_path.mkdir(parents=True)
                model_path = model_path / '{}'.format(
                    img + '_clouds_' + str(pctl) + 'buff' + str(buffer_iter) + '.sav')

                # Save flooding image to check that buffering is working correctly
                # imwrite(model_path.parents[0] / '{}'.format('buff' + str(buffer_iter) + '.jpg'), data_train[:, :, 6])

                print('Training')
                start_time = time.time()
                logreg = LogisticRegression(n_jobs=-1, solver='sag')
                logreg.fit(X_train, y_train)
                end_time = time.time()
                times.append(timer(start_time, end_time, False))
                joblib.dump(logreg, model_path)

        metrics_path = metrics_path.parent
        times = [float(i) for i in times]
        times = np.column_stack([np.repeat(pctls, len(buffer_iters)),
                                 np.tile(buffer_iters, len(pctls)),
                                 times])
        times_df = pd.DataFrame(times, columns=['cloud_cover', 'buffer_iters', 'training_time'])
        times_df.to_csv(metrics_path / 'training_times.csv', index=False)
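
# What the buffering above does: binary_dilation grows the flood mask outward by
# buffer_iter pixels; inverting it flags every pixel outside that buffer, and those
# pixels are set to NaN and dropped from training. A self-contained toy demonstration:
def _buffer_demo(buffer_iter=2):
    mask = np.zeros((7, 7), dtype=bool)
    mask[3, 3] = True  # single flooded pixel
    buffer_mask = np.invert(binary_dilation(mask, iterations=buffer_iter))
    # buffer_mask is False within buffer_iter pixels of the flood pixel and True
    # (i.e., excluded from training) everywhere else
    return buffer_mask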
os.environ["OMP_NUM_THREADS"] = str(NUM_PARALLEL_EXEC_UNITS) os.environ["KMP_BLOCKTIME"] = "30" os.environ["KMP_SETTINGS"] = "1" os.environ["KMP_AFFINITY"] = "granularity=fine,verbose,compact,1,0" # ====================================================================================================================== img = img_list[0] pctl = 30 batch = 'test' import statsmodels.api as sm print(img + ': stacking tif, generating clouds') times = [] tif_stacker(data_path, img, feat_list_new, features=True, overwrite=False) cloud_generator(img, data_path, overwrite=False) print(img, pctl, '% CLOUD COVER') print('Preprocessing') tf.keras.backend.clear_session() data_train, data_vector_train, data_ind_train, feat_keep = preprocessing( data_path, img, pctl, feat_list_new, test=False) perm_index = feat_keep.index('GSW_perm') flood_index = feat_keep.index('flooded') data_vector_train[data_vector_train[:, perm_index] == 1, flood_index] = 0 # Remove flood water that is perm water data_vector_train = np.delete(data_vector_train, perm_index, axis=1) # Remove perm water column shape = data_vector_train.shape X_train, y_train = data_vector_train[:, 0:shape[1] - 1], data_vector_train[:, shape[1] - 1]
def NN_training(img_list, pctls, model_func, feat_list_new, data_path, batch, **model_params):
    get_model = model_func
    for j, img in enumerate(img_list):
        print(img + ': stacking tif, generating clouds')
        times = []
        lr_mins = []
        lr_maxes = []
        tif_stacker(data_path, img, feat_list_new, features=True, overwrite=False)
        cloud_generator(img, data_path, overwrite=False)

        for i, pctl in enumerate(pctls):
            print(img, pctl, '% CLOUD COVER')
            print('Preprocessing')
            tf.keras.backend.clear_session()
            data_train, data_vector_train, data_ind_train, feat_keep = preprocessing(
                data_path, img, pctl, feat_list_new, test=False)
            perm_index = feat_keep.index('GSW_perm')
            flood_index = feat_keep.index('flooded')
            # data_vector_train[data_vector_train[:, perm_index] == 1, flood_index] = 0
            data_vector_train = np.delete(data_vector_train, perm_index, axis=1)  # Remove perm water column
            shape = data_vector_train.shape
            X_train, y_train = data_vector_train[:, 0:shape[1] - 1], data_vector_train[:, shape[1] - 1]
            INPUT_DIMS = X_train.shape[1]

            model_path = data_path / batch / 'models' / img
            metrics_path = data_path / batch / 'metrics' / 'training' / img / '{}'.format(
                img + '_clouds_' + str(pctl))
            lr_plots_path = metrics_path.parents[1] / 'lr_plots'
            lr_vals_path = metrics_path.parents[1] / 'lr_vals'
            try:
                metrics_path.mkdir(parents=True)
                model_path.mkdir(parents=True)
                lr_plots_path.mkdir(parents=True)
                lr_vals_path.mkdir(parents=True)
            except FileExistsError:
                pass

            # ---------------------------------------------------------------------------------------------------
            # Determine learning rate by finding max loss decrease during single epoch training
            lrRangeFinder = LrRangeFinder(start_lr=0.1, end_lr=2)
            lr_model_params = {'batch_size': model_params['batch_size'],
                               'epochs': 1,
                               'verbose': 2,
                               'callbacks': [lrRangeFinder],
                               'use_multiprocessing': True}
            model = model_func(INPUT_DIMS)
            print('Finding learning rate')
            model.fit(X_train, y_train, **lr_model_params)
            lr_min, lr_max, lr, losses = lr_plots(lrRangeFinder, lr_plots_path, img, pctl)
            lr_mins.append(lr_min)
            lr_maxes.append(lr_max)

            # ---------------------------------------------------------------------------------------------------
            # Train the model with cyclical learning rate scheduler
            model_path = model_path / '{}'.format(img + '_clouds_' + str(pctl) + '.h5')
            scheduler = SGDRScheduler(min_lr=lr_min, max_lr=lr_max, lr_decay=0.9,
                                      cycle_length=3, mult_factor=1.5)
            callbacks = [tf.keras.callbacks.EarlyStopping(monitor='sparse_categorical_accuracy',
                                                          min_delta=0.0001, patience=10),
                         tf.keras.callbacks.ModelCheckpoint(filepath=str(model_path),
                                                            monitor='loss', save_best_only=True),
                         CSVLogger(metrics_path / 'training_log.log'),
                         scheduler]

            model = get_model(INPUT_DIMS)
            print('Training full model with best LR')
            start_time = time.time()
            model.fit(X_train, y_train, **model_params, callbacks=callbacks)
            end_time = time.time()
            times.append(timer(start_time, end_time, False))
            # model.save(model_path)  # Best model is already saved by the ModelCheckpoint callback

        metrics_path = metrics_path.parent
        times = [float(i) for i in times]
        times = np.column_stack([pctls, times])
        times_df = pd.DataFrame(times, columns=['cloud_cover', 'training_time'])
        times_df.to_csv(metrics_path / 'training_times.csv', index=False)

        lr_range = np.column_stack([pctls, lr_mins, lr_maxes])
        lr_avg = np.mean(lr_range[:, 1:3], axis=1)  # lr_avg = mean of lr_min and lr_max
        lr_range = np.column_stack([lr_range, lr_avg])
        lr_range_df = pd.DataFrame(lr_range, columns=['cloud_cover', 'lr_min', 'lr_max', 'lr_avg'])
        lr_range_df.to_csv((lr_vals_path / img).with_suffix('.csv'), index=False)

        # Save the LR/loss trace from the range finder; lr, losses, and pctl hold the
        # values from the last percentile in the loop
        losses_path = lr_vals_path / img / '{}'.format('losses_' + str(pctl) + '.csv')
        try:
            losses_path.parent.mkdir(parents=True)
        except FileExistsError:
            pass
        lr_losses = np.column_stack([lr, losses])
        lr_losses = pd.DataFrame(lr_losses, columns=['lr', 'losses'])
        lr_losses.to_csv(losses_path, index=False)
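
# LrRangeFinder and SGDRScheduler are imported from elsewhere in the project. For
# reference, a minimal LR range finder (Smith-style sweep) can be written as a Keras
# callback that ramps the learning rate from start_lr to end_lr over the one-epoch fit
# while recording the loss. This is a sketch, not the project's implementation:
class LrRangeFinderSketch(tf.keras.callbacks.Callback):
    def __init__(self, start_lr=0.1, end_lr=2):
        super().__init__()
        self.start_lr, self.end_lr = start_lr, end_lr
        self.lrs, self.losses = [], []

    def on_train_begin(self, logs=None):
        # Keras fills self.params with 'steps' and 'epochs'; together they give the sweep length
        self.total_steps = self.params['steps'] * self.params['epochs']
        self.step = 0
        tf.keras.backend.set_value(self.model.optimizer.lr, self.start_lr)

    def on_batch_end(self, batch, logs=None):
        self.lrs.append(float(tf.keras.backend.get_value(self.model.optimizer.lr)))
        self.losses.append(logs['loss'])
        self.step += 1
        lr = self.start_lr + (self.end_lr - self.start_lr) * (self.step / self.total_steps)  # linear ramp
        tf.keras.backend.set_value(self.model.optimizer.lr, lr)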
def training2(img_list, pctls, model_func, feat_list_new, data_path, batch,
              DROPOUT_RATE=0, HOLDOUT=0.3, **model_params):
    '''
    Removes flood water that is permanent water
    '''
    get_model = model_func
    for j, img in enumerate(img_list):
        times = []
        tif_stacker(data_path, img, feat_list_new, features=True, overwrite=True)
        cloud_generator(img, data_path, overwrite=False)

        for i, pctl in enumerate(pctls):
            data_train, data_vector_train, data_ind_train = preprocessing(data_path, img, pctl, gaps=False)
            perm_index = feat_list_new.index('GSW_perm')
            flood_index = feat_list_new.index('flooded')
            data_vector_train[data_vector_train[:, perm_index] == 1, flood_index] = 0  # Remove flood water that is perm water
            data_vector_train = np.delete(data_vector_train, perm_index, axis=1)  # Remove perm water column
            training_data, validation_data = train_val(data_vector_train, holdout=HOLDOUT)
            X_train, y_train = training_data[:, 0:14], training_data[:, 14]  # assumes 14 feature columns + target
            X_val, y_val = validation_data[:, 0:14], validation_data[:, 14]
            INPUT_DIMS = X_train.shape[1]

            model_path = data_path / batch / 'models' / img
            metrics_path = data_path / batch / 'metrics' / 'training' / img / '{}'.format(
                img + '_clouds_' + str(pctl))
            try:
                metrics_path.mkdir(parents=True)
                model_path.mkdir(parents=True)
            except FileExistsError:
                pass
            model_path = model_path / '{}'.format(img + '_clouds_' + str(pctl) + '.h5')

            csv_logger = CSVLogger(metrics_path / 'training_log.log')
            model_params['callbacks'].append(csv_logger)  # NB: appends a new logger on every iteration

            print('~~~~~', img, pctl, '% CLOUD COVER')
            model = get_model(INPUT_DIMS)
            start_time = time.time()
            model.fit(X_train, y_train, **model_params, validation_data=(X_val, y_val))
            end_time = time.time()
            times.append(timer(start_time, end_time, False))
            model.save(model_path)

        metrics_path = metrics_path.parent
        times = [float(i) for i in times]
        times = np.column_stack([pctls, times])
        times_df = pd.DataFrame(times, columns=['cloud_cover', 'training_time'])
        times_df.to_csv(metrics_path / 'training_times.csv', index=False)
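
# train_val is a project helper; a plausible implementation, assuming it just shuffles
# the pixel rows and splits off a holdout fraction for validation (a sketch, not the
# project's actual code):
def train_val_sketch(data_vector, holdout=0.3):
    rng = np.random.default_rng(0)
    idx = rng.permutation(data_vector.shape[0])  # shuffle row indices
    split = int(data_vector.shape[0] * (1 - holdout))
    return data_vector[idx[:split]], data_vector[idx[split:]]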
def training6(img_list, pctls, model_func, feat_list_new, data_path, batch, T,
              dropout_rate=0.2, **model_params):
    '''
    1. Removes ALL pixels that are over permanent water
    2. Finds the optimum learning rate and uses cyclic LR scheduler to train the model
    3. No validation set for training
    '''
    get_model = model_func
    for j, img in enumerate(img_list):
        print(img + ': stacking tif, generating clouds')
        times = []
        tif_stacker(data_path, img, feat_list_new, features=True, overwrite=False)
        cloud_generator(img, data_path, overwrite=False)

        for i, pctl in enumerate(pctls):
            print(img, pctl, '% CLOUD COVER')
            print('Preprocessing')
            tf.keras.backend.clear_session()
            data_train, data_vector_train, data_ind_train, feat_keep = preprocessing(
                data_path, img, pctl, feat_list_new, gaps=False)
            perm_index = feat_keep.index('GSW_perm')
            flood_index = feat_keep.index('flooded')
            data_vector_train[data_vector_train[:, perm_index] == 1, flood_index] = 0  # Remove flood water that is perm water
            data_vector_train = np.delete(data_vector_train, perm_index, axis=1)  # Remove perm water column
            shape = data_vector_train.shape
            X_train, y_train = data_vector_train[:, 0:shape[1] - 1], data_vector_train[:, shape[1] - 1]
            y_train = to_categorical(y_train)
            INPUT_DIMS = X_train.shape[1]

            model_path = data_path / batch / 'models' / img
            metrics_path = data_path / batch / 'metrics' / 'training' / img / '{}'.format(
                img + '_clouds_' + str(pctl))
            try:
                metrics_path.mkdir(parents=True)
                model_path.mkdir(parents=True)
            except FileExistsError:
                pass
            model_path = model_path / '{}'.format(img + '_clouds_' + str(pctl) + '.h5')

            callbacks = [tf.keras.callbacks.EarlyStopping(monitor='softmax_output_categorical_accuracy',
                                                          min_delta=0.005, patience=5),
                         tf.keras.callbacks.ModelCheckpoint(filepath=str(model_path),
                                                            monitor='loss', save_best_only=True),
                         CSVLogger(metrics_path / 'training_log.log')]

            start_time = time.time()
            model = get_model(model_params['epochs'], X_train, y_train, X_train.shape, T, D=2,
                              batch_size=model_params['batch_size'], dropout_rate=dropout_rate,
                              callbacks=callbacks)
            end_time = time.time()
            times.append(timer(start_time, end_time, False))
            # model.save(model_path)  # Best model is already saved by the ModelCheckpoint callback

        metrics_path = metrics_path.parent
        times = [float(i) for i in times]
        times = np.column_stack([pctls, times])
        times_df = pd.DataFrame(times, columns=['cloud_cover', 'training_time'])
        times_df.to_csv(metrics_path / 'training_times.csv', index=False)
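
# training6 hands T to the model builder; in this codebase T is presumably the number of
# Monte Carlo dropout samples used to estimate uncertainty. The standard MC-dropout
# prediction loop for a Keras model with dropout layers looks like this (a sketch under
# that assumption):
def mc_dropout_predict(model, X, T=50):
    # training=True keeps dropout active at inference, so each pass is a stochastic sample
    preds = np.stack([model(X, training=True).numpy() for _ in range(T)])
    return preds.mean(axis=0), preds.var(axis=0)  # predictive mean and per-class variance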
def rf_training(img_list, pctls, feat_list_new, data_path, batch, n_jobs):
    for j, img in enumerate(img_list):
        print(img + ': stacking tif, generating clouds')
        times = []
        tif_stacker(data_path, img, feat_list_new, features=True, overwrite=False)
        cloud_generator(img, data_path, overwrite=False)

        for i, pctl in enumerate(pctls):
            print(img, pctl, '% CLOUD COVER')
            print('Preprocessing')
            tf.keras.backend.clear_session()
            data_train, data_vector_train, data_ind_train, feat_keep = preprocessing(
                data_path, img, pctl, feat_list_new, test=False)
            perm_index = feat_keep.index('GSW_perm')
            flood_index = feat_keep.index('flooded')
            data_vector_train[data_vector_train[:, perm_index] == 1, flood_index] = 0  # Remove flood water that is perm water
            data_vector_train = np.delete(data_vector_train, perm_index, axis=1)  # Remove perm water column
            shape = data_vector_train.shape
            X_train, y_train = data_vector_train[:, 0:shape[1] - 1], data_vector_train[:, shape[1] - 1]

            model_path = data_path / batch / 'models' / img
            metrics_path = data_path / batch / 'metrics' / 'training' / img / '{}'.format(
                img + '_clouds_' + str(pctl))
            try:
                metrics_path.mkdir(parents=True)
                model_path.mkdir(parents=True)
            except FileExistsError:
                pass

            # Optimized hyperparameters are loaded from a single reference image/percentile
            param_path = data_path / batch / 'models' / '4514_LC08_027033_20170826_1' / '{}'.format(
                '4514_LC08_027033_20170826_1_clouds_50params.pkl')
            model_path = model_path / '{}'.format(img + '_clouds_' + str(pctl) + '.sav')

            # # Hyperparameter optimization
            # print('Hyperparameter search')
            # base_rf = RandomForestClassifier(random_state=0, n_estimators=100, max_leaf_nodes=10)
            # space = [skopt.space.Integer(2, 1000, name="max_leaf_nodes"),
            #          skopt.space.Integer(2, 200, name="n_estimators"),
            #          skopt.space.Integer(2, 3000, name="max_depth")]
            #
            # @use_named_args(space)
            # def objective(**params):
            #     base_rf.set_params(**params)
            #     return -np.mean(cross_val_score(base_rf, X_train, y_train, cv=5, n_jobs=n_jobs, scoring="f1"))
            #
            # res_rf = forest_minimize(objective, space, base_estimator='RF', n_calls=11,
            #                          random_state=0, verbose=True, n_jobs=n_jobs)
            # skopt.utils.dump(res_rf, param_path, store_objective=False)

            res_rf = skopt.utils.load(param_path)

            # Training
            print('Training with optimized hyperparameters')
            start_time = time.time()
            rf = RandomForestClassifier(random_state=0,
                                        max_leaf_nodes=res_rf.x[0],
                                        n_estimators=res_rf.x[1],
                                        max_depth=res_rf.x[2],
                                        n_jobs=-1)
            rf.fit(X_train, y_train)
            end_time = time.time()
            times.append(timer(start_time, end_time, False))
            joblib.dump(rf, model_path)

        metrics_path = metrics_path.parent
        times = [float(i) for i in times]
        times = np.column_stack([pctls, times])
        times_df = pd.DataFrame(times, columns=['cloud_cover', 'training_time'])
        times_df.to_csv(metrics_path / 'training_times.csv', index=False)
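
# The skopt OptimizeResult loaded above exposes the best point as res_rf.x (in
# search-space order) and the best objective value as res_rf.fun. A small helper to
# sanity-check the stored hyperparameters (a sketch; param_path as constructed in
# rf_training):
def inspect_rf_params(param_path):
    res = skopt.utils.load(param_path)
    print('max_leaf_nodes={}, n_estimators={}, max_depth={}'.format(*res.x))
    print('best objective (negated mean CV f1): {:.4f}'.format(res.fun))
    return res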
def log_reg_training_sample(img_list, pctls, feat_list_new, feat_list_all, data_path, batch,
                            n_flood, n_nonflood):
    for img in img_list:
        print(img + ': stacking tif, generating clouds')
        times = []
        tif_stacker(data_path, img, feat_list_new, overwrite=False)
        cloud_generator(img, data_path, overwrite=False)

        for pctl in pctls:
            print(img, pctl, '% CLOUD COVER')
            print('Preprocessing')
            sample_coords, data_train = get_sample_coords(img, pctl, n_flood, n_nonflood)
            perm_index = data_train.shape[2] - 2  # perm and flooded are the last two bands
            flood_index = data_train.shape[2] - 1
            data_vector_train = get_sample_data(sample_coords, data_train)
            data_vector_train, scaler = standardize_data(data_vector_train)
            data_vector_train = np.delete(data_vector_train, perm_index, axis=1)  # Remove perm water column
            shape = data_vector_train.shape
            X_train, y_train = data_vector_train[:, 0:shape[1] - 1], data_vector_train[:, shape[1] - 1]

            model_path = data_path / batch / 'models' / img
            metrics_path = data_path / batch / 'metrics' / 'training' / img / '{}'.format(
                img + '_clouds_' + str(pctl))
            scaler_dir = data_path / 'scalers' / img
            if not model_path.exists():
                model_path.mkdir(parents=True)
            if not metrics_path.exists():
                metrics_path.mkdir(parents=True)
            if not scaler_dir.exists():
                scaler_dir.mkdir(parents=True)

            model_path = data_path / batch / 'models' / img / '{}'.format(
                img + '_clouds_' + str(pctl) + '.sav')
            scaler_path = scaler_dir / '{}_clouds_{}_scaler_.sav'.format(img, str(pctl))
            joblib.dump(scaler, scaler_path)

            print('Training')
            start_time = time.time()
            logreg = LogisticRegression(solver='lbfgs')
            logreg.fit(X_train, y_train)
            end_time = time.time()
            times.append(timer(start_time, end_time, False))
            joblib.dump(logreg, model_path)
            del data_train, data_vector_train, logreg

        metrics_path = metrics_path.parent
        times = [float(i) for i in times]
        times = np.column_stack([pctls, times])
        times_df = pd.DataFrame(times, columns=['cloud_cover', 'training_time'])
        times_df.to_csv(metrics_path / 'training_times.csv', index=False)
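
# Sketch of reusing the persisted scaler and model at prediction time: new features must
# be standardized with the SAME scaler fit during training before calling predict (the
# exact column handling inside standardize_data is assumed here; X_new is hypothetical).
def predict_with_saved_scaler(data_path, batch, img, pctl, X_new):
    scaler = joblib.load(data_path / 'scalers' / img / '{}_clouds_{}_scaler_.sav'.format(img, str(pctl)))
    logreg = joblib.load(data_path / batch / 'models' / img / '{}'.format(img + '_clouds_' + str(pctl) + '.sav'))
    return logreg.predict(scaler.transform(X_new))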