Example 1
def prediction(img_list, pctls, feat_list_new, data_path, batch, remove_perm):
    for j, img in enumerate(img_list):
        times = []
        accuracy, precision, recall, f1 = [], [], [], []
        preds_path = data_path / batch / 'predictions' / img
        bin_file = preds_path / 'predictions.h5'
        metrics_path = data_path / batch / 'metrics' / 'testing' / img

        try:
            metrics_path.mkdir(parents=True)
        except FileExistsError:
            print('Metrics directory already exists')

        for i, pctl in enumerate(pctls):
            print('Preprocessing', img, pctl, '% cloud cover')
            data_test, data_vector_test, data_ind_test, feat_keep = preprocessing(data_path, img, pctl, feat_list_new,
                                                                                  test=True)
            perm_index = feat_keep.index('GSW_perm')
            flood_index = feat_keep.index('flooded')
            if remove_perm:
                data_vector_test[data_vector_test[:, perm_index] == 1, flood_index] = 0  # Remove flood water that is perm water
            data_vector_test = np.delete(data_vector_test, perm_index, axis=1)  # Remove GSW_perm column
            data_shape = data_vector_test.shape
            X_test, y_test = data_vector_test[:, 0:data_shape[1]-1], data_vector_test[:, data_shape[1]-1]

            print('Predicting for {} at {}% cloud cover'.format(img, pctl))
            start_time = time.time()
            model_path = data_path / batch / 'models' / img / '{}'.format(img + '_clouds_' + str(pctl) + '.sav')
            trained_model = joblib.load(model_path)
            pred_probs = trained_model.predict_proba(X_test)
            preds = np.argmax(pred_probs, axis=1)

            try:
                preds_path.mkdir(parents=True)
            except FileExistsError:
                pass

            with h5py.File(bin_file, 'a') as f:
                if str(pctl) in f:
                    print('Deleting earlier mean predictions')
                    del f[str(pctl)]
                f.create_dataset(str(pctl), data=pred_probs)

            times.append(timer(start_time, time.time(), False))  # Elapsed prediction time

            print('Evaluating predictions')
            accuracy.append(accuracy_score(y_test, preds))
            precision.append(precision_score(y_test, preds))
            recall.append(recall_score(y_test, preds))
            f1.append(f1_score(y_test, preds))

            del preds, pred_probs, X_test, y_test, trained_model, data_test, data_vector_test, data_ind_test

        metrics = pd.DataFrame(np.column_stack([pctls, accuracy, precision, recall, f1]),
                               columns=['cloud_cover', 'accuracy', 'precision', 'recall', 'f1'])
        metrics.to_csv(metrics_path / 'metrics.csv', index=False)
        times = [float(i) for i in times]  # Convert time objects to float, otherwise valMetrics will be non-numeric
        times_df = pd.DataFrame(np.column_stack([pctls, times]),
                                columns=['cloud_cover', 'testing_time'])
        times_df.to_csv(metrics_path / 'testing_times.csv', index=False)
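
For reference, a driver for prediction() would look like the (commented-out) call at the end of Example 8 below. The paths, batch name, and feature list in this sketch are illustrative assumptions, not values from the original project.

# Hypothetical driver script; data_path, batch, and feat_list_new are placeholders.
from pathlib import Path

data_path = Path('data')
batch = 'v30'
img_list = ['4514_LC08_027033_20170826_1']
pctls = [10, 30, 50, 70, 90]
feat_list_new = ['GSW_maxExtent', 'GSW_perm', 'flooded']  # the real list contains more features

prediction(img_list, pctls, feat_list_new, data_path, batch, remove_perm=True)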
Example 2
def training_bnn(img_list, pctls, feat_list_new, data_path, batch,
                 **model_params):
    for j, img in enumerate(img_list):
        print(img + ': stacking tif, generating clouds')
        times = []
        tif_stacker(data_path,
                    img,
                    feat_list_new,
                    features=True,
                    overwrite=False)
        cloud_generator(img, data_path, overwrite=False)

        for i, pctl in enumerate(pctls):
            print(img, pctl, '% CLOUD COVER')
            print('Preprocessing')
            tf.keras.backend.clear_session()
            data_train, data_vector_train, data_ind_train, feat_keep = preprocessing(
                data_path, img, pctl, feat_list_new, test=False)
            perm_index = feat_keep.index('GSW_perm')
            flood_index = feat_keep.index('flooded')
            data_vector_train[data_vector_train[:, perm_index] == 1,
                              flood_index] = 0
            data_vector_train = np.delete(data_vector_train,
                                          perm_index,
                                          axis=1)
            shape = data_vector_train.shape
            X_train, y_train = data_vector_train[:, 0:shape[1] - 1], data_vector_train[:, shape[1] - 1]
            y_train = to_categorical(y_train)
            D = len(set(y_train[:, 0]))  # Target classes

            model_path = data_path / batch / 'models' / img
            metrics_path = data_path / batch / 'metrics' / 'training' / img / '{}'.format(
                img + '_clouds_' + str(pctl))
            try:
                metrics_path.mkdir(parents=True)
                model_path.mkdir(parents=True)
            except FileExistsError:
                pass
            model_path = model_path / '{}'.format(img + '_clouds_' +
                                                  str(pctl) + '.h5')
            print('Training model')
            start_time = time.time()
            aleatoric_model = get_aleatoric_uncertainty_model(X_train,
                                                              y_train,
                                                              **model_params,
                                                              D=D)
            end_time = time.time()
            times.append(timer(start_time, end_time, False))
            aleatoric_model.save(model_path)

        metrics_path = metrics_path.parent
        times = [float(i) for i in times]
        times = np.column_stack([pctls, times])
        times_df = pd.DataFrame(times,
                                columns=['cloud_cover', 'training_time'])
        times_df.to_csv(metrics_path / 'training_times.csv', index=False)
Example 3
def log_reg_training(img_list, pctls, feat_list_new, data_path, batch):
    for j, img in enumerate(img_list):
        print(img + ': stacking tif, generating clouds')
        times = []
        tif_stacker(data_path,
                    img,
                    feat_list_new,
                    features=True,
                    overwrite=False)
        cloud_generator(img, data_path, overwrite=False)

        for i, pctl in enumerate(pctls):
            print(img, pctl, '% CLOUD COVER')
            print('Preprocessing')
            data_train, data_vector_train, data_ind_train, feat_keep = preprocessing(
                data_path, img, pctl, feat_list_new, test=False)
            perm_index = feat_keep.index('GSW_perm')
            flood_index = feat_keep.index('flooded')
            # data_vector_train[data_vector_train[:, perm_index] == 1, flood_index] = 0
            data_vector_train = np.delete(data_vector_train,
                                          perm_index,
                                          axis=1)
            shape = data_vector_train.shape
            X_train, y_train = data_vector_train[:, 0:shape[1] - 1], data_vector_train[:, shape[1] - 1]

            model_path = data_path / batch / 'models' / img
            metrics_path = data_path / batch / 'metrics' / 'training' / img / '{}'.format(
                img + '_clouds_' + str(pctl))

            if not model_path.exists():
                model_path.mkdir(parents=True)
            if not metrics_path.exists():
                metrics_path.mkdir(parents=True)

            model_path = model_path / '{}'.format(img + '_clouds_' +
                                                  str(pctl) + '.sav')

            print('Training')
            start_time = time.time()
            logreg = LogisticRegression(n_jobs=-1, solver='sag')
            logreg.fit(X_train, y_train)
            end_time = time.time()
            times.append(timer(start_time, end_time, False))
            joblib.dump(logreg, model_path)

        metrics_path = metrics_path.parent
        times = [float(i) for i in times]
        times = np.column_stack([pctls, times])
        times_df = pd.DataFrame(times,
                                columns=['cloud_cover', 'training_time'])
        times_df.to_csv(metrics_path / 'training_times.csv', index=False)
Example 4
def log_reg_gen_training(img_list_train, feat_list_new, data_path, batch):
    times = []

    print('Preprocessing')
    data_vector_train = preprocessing_gen_model(data_path, img_list_train)
    perm_index = feat_list_new.index('GSW_perm')
    flood_index = feat_list_new.index('flooded')
    gsw_index = feat_list_new.index('GSW_maxExtent')
    # data_vector_train[data_vector_train[:, perm_index] == 1, flood_index] = 0
    data_vector_train = np.delete(data_vector_train, perm_index, axis=1)
    data_vector_train = np.delete(data_vector_train, gsw_index, axis=1)
    shape = data_vector_train.shape
    X_train, y_train = data_vector_train[:, 0:shape[1] - 1], data_vector_train[:, shape[1] - 1]

    model_path = data_path / batch / 'models'
    metrics_path = data_path / batch / 'metrics' / 'training'

    if not model_path.exists():
        model_path.mkdir(parents=True)
    if not metrics_path.exists():
        metrics_path.mkdir(parents=True)

    with open(metrics_path / 'training_images.csv', 'w') as f:
        for listitem in img_list_train:
            f.write('%s\n' % listitem)

    model_path = model_path / 'gen_model.sav'

    print('Training')
    start_time = time.time()
    logreg = LogisticRegression(n_jobs=-1, solver='sag')
    logreg.fit(X_train, y_train)
    end_time = time.time()
    times.append(timer(start_time, end_time, False))
    joblib.dump(logreg, model_path)

    metrics_path = metrics_path.parent
    times = [float(i) for i in times]
    times_df = pd.DataFrame(times, columns=['training_time'])
    times_df.to_csv(metrics_path / 'training_times.csv', index=False)
Example 5
def NN_training(img_list, pctls, model_func, feat_list_new, data_path, batch,
                **model_params):
    get_model = model_func
    for j, img in enumerate(img_list):
        print(img + ': stacking tif, generating clouds')
        times = []
        lr_mins = []
        lr_maxes = []
        tif_stacker(data_path,
                    img,
                    feat_list_new,
                    features=True,
                    overwrite=False)
        cloud_generator(img, data_path, overwrite=False)

        for i, pctl in enumerate(pctls):
            print(img, pctl, '% CLOUD COVER')
            print('Preprocessing')
            tf.keras.backend.clear_session()
            data_train, data_vector_train, data_ind_train, feat_keep = preprocessing(
                data_path, img, pctl, feat_list_new, test=False)
            perm_index = feat_keep.index('GSW_perm')
            flood_index = feat_keep.index('flooded')
            # data_vector_train[data_vector_train[:, perm_index] == 1, flood_index] = 0
            data_vector_train = np.delete(data_vector_train,
                                          perm_index,
                                          axis=1)
            shape = data_vector_train.shape
            X_train, y_train = data_vector_train[:, 0:shape[1] - 1], data_vector_train[:, shape[1] - 1]
            INPUT_DIMS = X_train.shape[1]

            model_path = data_path / batch / 'models' / img
            metrics_path = data_path / batch / 'metrics' / 'training' / img / '{}'.format(
                img + '_clouds_' + str(pctl))

            lr_plots_path = metrics_path.parents[1] / 'lr_plots'
            lr_vals_path = metrics_path.parents[1] / 'lr_vals'
            try:
                metrics_path.mkdir(parents=True)
                model_path.mkdir(parents=True)
                lr_plots_path.mkdir(parents=True)
                lr_vals_path.mkdir(parents=True)
            except FileExistsError:
                pass

            # ---------------------------------------------------------------------------------------------------
            # Determine learning rate by finding max loss decrease during single epoch training
            lrRangeFinder = LrRangeFinder(start_lr=0.1, end_lr=2)

            lr_model_params = {
                'batch_size': model_params['batch_size'],
                'epochs': 1,
                'verbose': 2,
                'callbacks': [lrRangeFinder],
                'use_multiprocessing': True
            }

            model = model_func(INPUT_DIMS)

            print('Finding learning rate')
            model.fit(X_train, y_train, **lr_model_params)
            lr_min, lr_max, lr, losses = lr_plots(lrRangeFinder, lr_plots_path,
                                                  img, pctl)
            lr_mins.append(lr_min)
            lr_maxes.append(lr_max)
            # ---------------------------------------------------------------------------------------------------
            # Training the model with cyclical learning rate scheduler
            model_path = model_path / '{}'.format(img + '_clouds_' +
                                                  str(pctl) + '.h5')
            scheduler = SGDRScheduler(min_lr=lr_min,
                                      max_lr=lr_max,
                                      lr_decay=0.9,
                                      cycle_length=3,
                                      mult_factor=1.5)

            callbacks = [
                tf.keras.callbacks.EarlyStopping(
                    monitor='sparse_categorical_accuracy',
                    min_delta=0.0001,
                    patience=10),
                tf.keras.callbacks.ModelCheckpoint(filepath=str(model_path),
                                                   monitor='loss',
                                                   save_best_only=True),
                CSVLogger(metrics_path / 'training_log.log'), scheduler
            ]

            model = get_model(INPUT_DIMS)

            print('Training full model with best LR')
            start_time = time.time()
            model.fit(X_train, y_train, **model_params, callbacks=callbacks)
            end_time = time.time()
            times.append(timer(start_time, end_time, False))
            # model.save(model_path)

        metrics_path = metrics_path.parent
        times = [float(i) for i in times]
        times = np.column_stack([pctls, times])
        times_df = pd.DataFrame(times,
                                columns=['cloud_cover', 'training_time'])
        times_df.to_csv(metrics_path / 'training_times.csv', index=False)

        lr_range = np.column_stack([pctls, lr_mins, lr_maxes])
        lr_avg = np.mean(lr_range[:, 1:3], axis=1)  # Average of lr_min and lr_max
        lr_range = np.column_stack([lr_range, lr_avg])
        lr_range_df = pd.DataFrame(
            lr_range, columns=['cloud_cover', 'lr_min', 'lr_max', 'lr_avg'])
        lr_range_df.to_csv((lr_vals_path / img).with_suffix('.csv'),
                           index=False)

        losses_path = lr_vals_path / img / '{}'.format('losses_' + str(pctl) +
                                                       '.csv')
        try:
            losses_path.parent.mkdir(parents=True)
        except FileExistsError:
            pass
        lr_losses = np.column_stack([lr, losses])
        lr_losses = pd.DataFrame(lr_losses, columns=['lr', 'losses'])
        lr_losses.to_csv(losses_path, index=False)
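
Examples 5 and 9 both depend on an LrRangeFinder callback and an lr_plots helper defined elsewhere in the project. As orientation only, the sketch below shows one plausible implementation of such a learning-rate range test (sweep the learning rate linearly across one epoch and record the per-batch loss); the project's actual callback may differ.

import tensorflow as tf

class LrRangeFinderSketch(tf.keras.callbacks.Callback):
    """Sweep the learning rate from start_lr to end_lr over one epoch and
    record (lr, loss) per batch so a usable LR range can be read off a plot."""

    def __init__(self, start_lr=0.1, end_lr=2):
        super().__init__()
        self.start_lr = start_lr
        self.end_lr = end_lr
        self.lrs, self.losses = [], []

    def on_train_begin(self, logs=None):
        # Number of batches in the one-epoch sweep (fall back to 100 if unknown)
        self.steps = self.params.get('steps') or 100
        tf.keras.backend.set_value(self.model.optimizer.lr, self.start_lr)

    def on_train_batch_end(self, batch, logs=None):
        logs = logs or {}
        self.lrs.append(float(tf.keras.backend.get_value(self.model.optimizer.lr)))
        self.losses.append(logs.get('loss'))
        # Linear ramp toward end_lr
        new_lr = self.start_lr + (self.end_lr - self.start_lr) * (batch + 1) / self.steps
        tf.keras.backend.set_value(self.model.optimizer.lr, new_lr)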
Example 6
def log_reg_gen_prediction(img_list, pctls, feat_list_new, data_path, batch):
    for j, img in enumerate(img_list):
        times = []
        accuracy, precision, recall, f1, roc_auc = [], [], [], [], []
        preds_path = data_path / batch / 'predictions' / img
        bin_file = preds_path / 'predictions.h5'
        uncertainties_path = data_path / batch / 'uncertainties' / img
        se_lower_bin_file = uncertainties_path / 'se_lower.h5'
        se_upper_bin_file = uncertainties_path / 'se_upper.h5'
        metrics_path = data_path / batch / 'metrics' / 'testing' / img

        try:
            metrics_path.mkdir(parents=True)
        except FileExistsError:
            print('Metrics directory already exists')

        for i, pctl in enumerate(pctls):
            print('Preprocessing', img, pctl, '% cloud cover')
            data_test, data_vector_test, data_ind_test, feat_keep = preprocessing(
                data_path, img, pctl, feat_list_new, test=True)
            perm_index = feat_keep.index('GSW_perm')
            flood_index = feat_keep.index('flooded')
            data_vector_test[data_vector_test[:, perm_index] == 1,
                             flood_index] = 0
            data_vector_test = np.delete(data_vector_test, perm_index,
                                         axis=1)  # Remove GSW_perm column
            data_shape = data_vector_test.shape
            X_test, y_test = data_vector_test[:, 0:data_shape[1] - 1], data_vector_test[:, data_shape[1] - 1]

            print('Predicting for {} at {}% cloud cover'.format(img, pctl))
            start_time = time.time()
            model_path = data_path / batch / 'models' / 'gen_model.sav'
            trained_model = joblib.load(model_path)
            pred_probs = trained_model.predict_proba(X_test)
            preds = np.argmax(pred_probs, axis=1)

            try:
                preds_path.mkdir(parents=True)
            except FileExistsError:
                pass

            with h5py.File(bin_file, 'a') as f:
                if str(pctl) in f:
                    print('Deleting earlier mean predictions')
                    del f[str(pctl)]
                f.create_dataset(str(pctl), data=pred_probs)

            # Compute standard errors
            SE_est = get_se(X_test, y_test, trained_model)
            probs, upper, lower = get_probs(trained_model, X_test, SE_est,
                                            z=1.96)  # probs is redundant, predicted above

            try:
                uncertainties_path.mkdir(parents=True)
            except FileExistsError:
                pass

            with h5py.File(se_lower_bin_file, 'a') as f:
                if str(pctl) in f:
                    print('Deleting earlier lower SEs')
                    del f[str(pctl)]
                f.create_dataset(str(pctl), data=lower)

            with h5py.File(se_upper_bin_file, 'a') as f:
                if str(pctl) in f:
                    print('Deleting earlier upper SEs')
                    del f[str(pctl)]
                f.create_dataset(str(pctl), data=upper)

            times.append(timer(start_time, time.time(), False))

            print('Evaluating predictions')
            perm_mask = data_test[:, :, perm_index]
            perm_mask = perm_mask.reshape(
                [perm_mask.shape[0] * perm_mask.shape[1]])
            perm_mask = perm_mask[~np.isnan(perm_mask)]
            preds[perm_mask.astype('bool')] = 0
            y_test[perm_mask.astype('bool')] = 0

            accuracy.append(accuracy_score(y_test, preds))
            precision.append(precision_score(y_test, preds))
            recall.append(recall_score(y_test, preds))
            f1.append(f1_score(y_test, preds))
            roc_auc.append(roc_auc_score(y_test, pred_probs[:, 1]))

            del preds, probs, pred_probs, upper, lower, X_test, y_test, \
                trained_model, data_test, data_vector_test, data_ind_test

        metrics = pd.DataFrame(
            np.column_stack([pctls, accuracy, precision, recall, f1, roc_auc]),
            columns=[
                'cloud_cover', 'accuracy', 'precision', 'recall', 'f1', 'auc'
            ])
        metrics.to_csv(metrics_path / 'metrics.csv', index=False)
        times = [
            float(i) for i in times
        ]  # Convert time objects to float, otherwise valMetrics will be non-numeric
        times_df = pd.DataFrame(np.column_stack([pctls, times]),
                                columns=['cloud_cover', 'testing_time'])
        times_df.to_csv(metrics_path / 'testing_times.csv', index=False)
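
Example 6 relies on get_se and get_probs, project helpers that are not shown here. A plausible delta-method version is sketched below for orientation: the Fisher information is approximated from the design matrix, and the z = 1.96 bounds on the linear predictor are mapped through the sigmoid. The real helpers may be implemented differently.

import numpy as np

def get_se_sketch(X, y, model):
    """Standard error of the linear predictor for a fitted sklearn LogisticRegression
    (y is unused; kept only to mirror the get_se signature above)."""
    X_design = np.hstack([np.ones((X.shape[0], 1)), X])  # prepend intercept column
    p = model.predict_proba(X)[:, 1]
    # Fisher information approximation: X' W X with W = diag(p * (1 - p))
    cov = np.linalg.inv((X_design * (p * (1 - p))[:, None]).T @ X_design)
    # Var(x' beta) = x' Cov(beta) x for each row x
    var_logit = np.einsum('ij,jk,ik->i', X_design, cov, X_design)
    return np.sqrt(var_logit)

def get_probs_sketch(model, X, se, z=1.96):
    """Predicted probability with upper/lower bounds mapped back to the probability scale."""
    X_design = np.hstack([np.ones((X.shape[0], 1)), X])
    beta = np.concatenate([model.intercept_, model.coef_.ravel()])
    logit = X_design @ beta
    sigmoid = lambda t: 1.0 / (1.0 + np.exp(-t))
    return sigmoid(logit), sigmoid(logit + z * se), sigmoid(logit - z * se)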
Example 7
def training2(img_list,
              pctls,
              model_func,
              feat_list_new,
              data_path,
              batch,
              DROPOUT_RATE=0,
              HOLDOUT=0.3,
              **model_params):
    '''
    Removes flood water that is permanent water
    '''

    get_model = model_func

    for j, img in enumerate(img_list):
        times = []
        tif_stacker(data_path,
                    img,
                    feat_list_new,
                    features=True,
                    overwrite=True)
        cloud_generator(img, data_path, overwrite=False)

        for i, pctl in enumerate(pctls):
            data_train, data_vector_train, data_ind_train = preprocessing(
                data_path, img, pctl, gaps=False)
            perm_index = feat_list_new.index('GSW_perm')
            flood_index = feat_list_new.index('flooded')
            data_vector_train[
                data_vector_train[:, perm_index] == 1,
                flood_index] = 0  # Remove flood water that is perm water
            data_vector_train = np.delete(data_vector_train,
                                          perm_index,
                                          axis=1)  # Remove perm water column

            training_data, validation_data = train_val(data_vector_train,
                                                       holdout=HOLDOUT)
            X_train, y_train = training_data[:, 0:14], training_data[:, 14]
            X_val, y_val = validation_data[:, 0:14], validation_data[:, 14]
            INPUT_DIMS = X_train.shape[1]

            model_path = data_path / batch / 'models' / img
            metrics_path = data_path / batch / 'metrics' / 'training' / img / '{}'.format(
                img + '_clouds_' + str(pctl))

            try:
                metrics_path.mkdir(parents=True)
                model_path.mkdir(parents=True)
            except FileExistsError:
                pass

            model_path = model_path / '{}'.format(img + '_clouds_' +
                                                  str(pctl) + '.h5')

            csv_logger = CSVLogger(metrics_path / 'training_log.log')
            model_params['callbacks'].append(csv_logger)

            print('~~~~~', img, pctl, '% CLOUD COVER')

            model = get_model(INPUT_DIMS)

            start_time = time.time()
            model.fit(X_train,
                      y_train,
                      **model_params,
                      validation_data=(X_val, y_val))

            end_time = time.time()
            times.append(timer(start_time, end_time, False))

            model.save(model_path)

        metrics_path = metrics_path.parent
        times = [float(i) for i in times]
        times = np.column_stack([pctls, times])
        times_df = pd.DataFrame(times,
                                columns=['cloud_cover', 'training_time'])
        times_df.to_csv(metrics_path / 'training_times.csv', index=False)
Example 8
        ]  # Convert time objects to float, otherwise valMetrics will be non-numeric
        times_df = pd.DataFrame(np.column_stack([pctls, times]),
                                columns=['cloud_cover', 'testing_time'])
        times_df.to_csv(metrics_path / 'testing_times.csv', index=False)


# ======================================================================================================================

img_list = ['4514_LC08_027033_20170826_1']
pctls = [50]

times = []
start_time = time.time()
rf_training(img_list, pctls, feat_list_new, data_path, batch, n_jobs=20)
end_time = time.time()
times.append(timer(start_time, end_time, True))

start_time = time.time()
rf_training(img_list, pctls, feat_list_new, data_path, batch, n_jobs=40)
end_time = time.time()
times.append(timer(start_time, end_time, True))

start_time = time.time()
rf_training(img_list, pctls, feat_list_new, data_path, batch, n_jobs=60)
end_time = time.time()
times.append(timer(start_time, end_time, True))

np.savetxt(data_path / 'v31' / 'times20_40_60_njobs.csv', times, delimiter=",")

# prediction(img_list, pctls, feat_list_new, data_path, batch, remove_perm=True)
#
Example 9
def training_BNN_gen_model(img_list_train, feat_list_new, model_func,
                           data_path, batch, dropout_rate, **model_params):
    get_model = model_func
    times = []
    lr_mins = []
    lr_maxes = []

    print('Preprocessing')
    tf.keras.backend.clear_session()
    data_vector_train = preprocessing_gen_model(data_path, img_list_train)
    perm_index = feat_list_new.index('GSW_perm')
    flood_index = feat_list_new.index('flooded')
    print(data_vector_train.shape)
    data_vector_train[data_vector_train[:, perm_index] == 1, flood_index] = 0
    data_vector_train = np.delete(data_vector_train, perm_index, axis=1)
    shape = data_vector_train.shape
    X_train, y_train = data_vector_train[:, 0:shape[1] - 1], data_vector_train[:, shape[1] - 1]
    input_dims = X_train.shape[1]

    model_path = data_path / batch / 'models'
    metrics_path = data_path / batch / 'metrics' / 'training'

    lr_plots_path = metrics_path / 'lr_plots'
    lr_vals_path = metrics_path / 'lr_vals'
    try:
        metrics_path.mkdir(parents=True)
        model_path.mkdir(parents=True)
        lr_plots_path.mkdir(parents=True)
        lr_vals_path.mkdir(parents=True)
    except FileExistsError:
        pass

    # ---------------------------------------------------------------------------------------------------
    # Determine learning rate by finding max loss decrease during single epoch training
    lrRangeFinder = LrRangeFinder(start_lr=0.1, end_lr=2)

    lr_model_params = {
        'batch_size': model_params['batch_size'],
        'epochs': 1,
        'verbose': 2,
        'callbacks': [lrRangeFinder],
        'use_multiprocessing': True
    }

    model = model_func(input_dims, dropout_rate)

    print('Finding learning rate')
    model.fit(X_train, y_train, **lr_model_params)
    lr_min, lr_max, lr, losses = lr_plots(lrRangeFinder, lr_plots_path)
    lr_mins.append(lr_min)
    lr_maxes.append(lr_max)
    # ---------------------------------------------------------------------------------------------------
    # Training the model with cyclical learning rate scheduler
    model_path = model_path / 'gen_model.h5'
    scheduler = SGDRScheduler(min_lr=lr_min,
                              max_lr=lr_max,
                              lr_decay=0.9,
                              cycle_length=3,
                              mult_factor=1.5)

    callbacks = [
        tf.keras.callbacks.EarlyStopping(monitor='sparse_categorical_accuracy',
                                         min_delta=0.001,
                                         patience=10),
        tf.keras.callbacks.ModelCheckpoint(filepath=str(model_path),
                                           monitor='loss',
                                           save_best_only=True),
        CSVLogger(metrics_path / 'training_log.log'), scheduler
    ]

    model = get_model(input_dims, dropout_rate)

    print('Training full model with best LR')
    start_time = time.time()
    model.fit(X_train, y_train, **model_params, callbacks=callbacks)
    end_time = time.time()
    times.append(timer(start_time, end_time, False))

    metrics_path = metrics_path.parent
    times = [float(i) for i in times]
    times_df = pd.DataFrame(times, columns=['training_time'])
    times_df.to_csv(metrics_path / 'training_times.csv', index=False)

    lr_range = np.column_stack([lr_mins, lr_maxes])
    lr_avg = np.mean(lr_range, axis=1)
    lr_range = np.column_stack([lr_range, lr_avg])
    lr_range_df = pd.DataFrame(lr_range,
                               columns=['lr_min', 'lr_max', 'lr_avg'])
    lr_range_df.to_csv((lr_vals_path).with_suffix('.csv'), index=False)

    losses_path = lr_vals_path / 'gen_model_losses.csv'
    try:
        losses_path.parent.mkdir(parents=True)
    except FileExistsError:
        pass
    lr_losses = np.column_stack([lr, losses])
    lr_losses = pd.DataFrame(lr_losses, columns=['lr', 'losses'])
    lr_losses.to_csv(losses_path, index=False)
Example 10
def prediction_gen_model(img_list, pctls, feat_list_new, data_path, batch,
                         **model_params):
    model_path = data_path / batch / 'models' / 'gen_model.h5'
    for j, img in enumerate(img_list):
        times = []
        accuracy, precision, recall, f1 = [], [], [], []
        preds_path = data_path / batch / 'predictions' / img
        bin_file = preds_path / 'predictions.h5'
        metrics_path = data_path / batch / 'metrics' / 'testing' / img

        try:
            metrics_path.mkdir(parents=True)
        except FileExistsError:
            print('Metrics directory already exists')

        for i, pctl in enumerate(pctls):
            pretrained_model = tf.keras.models.load_model(model_path)
            for i in range(6):
                pretrained_model.layers[i].trainable = False
            pretrained_model.layers[6].trainable = True
            ll = pretrained_model.layers[6].output
            ll = tf.keras.layers.Dense(6)(ll)
            ll = tf.keras.layers.Dense(6)(ll)
            new_model = Model(pretrained_model.input, outputs=ll)

            print('Training')
            data_train, data_vector_train, data_ind_train, feat_keep = preprocessing(
                data_path, img, pctl, feat_list_new, test=False)
            perm_index = feat_keep.index('GSW_perm')
            flood_index = feat_keep.index('flooded')
            gsw_index = feat_keep.index('GSW_maxExtent')
            data_vector_train = np.delete(data_vector_train,
                                          perm_index,
                                          axis=1)  # Remove GSW_perm column
            data_vector_train = np.delete(data_vector_train, gsw_index, axis=1)
            data_shape = data_vector_train.shape
            X_train, y_train = data_vector_train[:, 0:data_shape[1] - 1], data_vector_train[:, data_shape[1] - 1]
            # NOTE: compile settings below are assumptions; the original snippet called fit() without compiling
            new_model.compile(optimizer='adam',
                              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))
            new_model.fit(X_train, y_train)
            trained_model = new_model  # Model.fit() returns a History object, so keep the model itself for prediction

            print('Preprocessing', img, pctl, '% cloud cover')
            data_test, data_vector_test, data_ind_test, feat_keep = preprocessing(
                data_path, img, pctl, feat_list_new, test=True)
            perm_index = feat_keep.index('GSW_perm')
            flood_index = feat_keep.index('flooded')
            gsw_index = feat_keep.index('GSW_maxExtent')  # Use feat_keep, as above, so the index matches data_vector_test
            data_vector_test[
                data_vector_test[:, perm_index] == 1,
                flood_index] = 0  # Remove flood water that is perm water
            data_vector_test = np.delete(data_vector_test, perm_index,
                                         axis=1)  # Remove GSW_perm column
            data_vector_test = np.delete(data_vector_test, gsw_index, axis=1)
            data_shape = data_vector_test.shape
            X_test, y_test = data_vector_test[:, 0:data_shape[1] - 1], data_vector_test[:, data_shape[1] - 1]

            print('Predicting for {} at {}% cloud cover'.format(img, pctl))
            start_time = time.time()
            preds = trained_model.predict(
                X_test,
                batch_size=model_params['batch_size'],
                use_multiprocessing=True)
            preds = np.argmax(preds, axis=1)  # Display most probable value

            try:
                preds_path.mkdir(parents=True)
            except FileExistsError:
                pass

            with h5py.File(bin_file, 'a') as f:
                if str(pctl) in f:
                    print('Deleting earlier mean predictions')
                    del f[str(pctl)]
                f.create_dataset(str(pctl), data=preds)

            times.append(timer(start_time, time.time(), False))  # Elapsed prediction time

            print('Evaluating predictions')
            accuracy.append(accuracy_score(y_test, preds))
            precision.append(precision_score(y_test, preds))
            recall.append(recall_score(y_test, preds))
            f1.append(f1_score(y_test, preds))

            del preds, X_test, y_test, data_test, data_vector_test, data_ind_test

        metrics = pd.DataFrame(
            np.column_stack([pctls, accuracy, precision, recall, f1]),
            columns=['cloud_cover', 'accuracy', 'precision', 'recall', 'f1'])
        metrics.to_csv(metrics_path / 'metrics.csv', index=False)
        times = [
            float(i) for i in times
        ]  # Convert time objects to float, otherwise valMetrics will be non-numeric
        times_df = pd.DataFrame(np.column_stack([pctls, times]),
                                columns=['cloud_cover', 'testing_time'])
        times_df.to_csv(metrics_path / 'testing_times.csv', index=False)
Example 11
File: RF.py Project: moghimis/CPR
def rf_training(img_list, pctls, feat_list_new, data_path, batch, n_jobs):
    for j, img in enumerate(img_list):
        print(img + ': stacking tif, generating clouds')
        times = []
        tif_stacker(data_path, img, feat_list_new, features=True, overwrite=False)
        cloud_generator(img, data_path, overwrite=False)

        for i, pctl in enumerate(pctls):
            print(img, pctl, '% CLOUD COVER')
            print('Preprocessing')
            tf.keras.backend.clear_session()
            data_train, data_vector_train, data_ind_train, feat_keep = preprocessing(data_path, img, pctl,
                                                                                     feat_list_new,
                                                                                     test=False)
            perm_index = feat_keep.index('GSW_perm')
            flood_index = feat_keep.index('flooded')
            data_vector_train[
                data_vector_train[:, perm_index] == 1, flood_index] = 0  # Remove flood water that is perm water
            data_vector_train = np.delete(data_vector_train, perm_index, axis=1)  # Remove perm water column
            shape = data_vector_train.shape
            X_train, y_train = data_vector_train[:, 0:shape[1] - 1], data_vector_train[:, shape[1] - 1]

            model_path = data_path / batch / 'models' / img
            metrics_path = data_path / batch / 'metrics' / 'training' / img / '{}'.format(
                img + '_clouds_' + str(pctl))

            try:
                metrics_path.mkdir(parents=True)
                model_path.mkdir(parents=True)
            except FileExistsError:
                pass

            param_path = data_path / batch / 'models' / '4514_LC08_027033_20170826_1' / '{}'.format(
                '4514_LC08_027033_20170826_1_clouds_50params.pkl')
            model_path = model_path / '{}'.format(img + '_clouds_' + str(pctl) + '.sav')

            # # Hyperparameter optimization
            # print('Hyperparameter search')
            # base_rf = RandomForestClassifier(random_state=0, n_estimators=100, max_leaf_nodes=10)

            # space = [skopt.space.Integer(2, 1000, name="max_leaf_nodes"),
            # skopt.space.Integer(2, 200, name="n_estimators"),
            # skopt.space.Integer(2, 3000, name="max_depth")]

            # @use_named_args(space)
            # def objective(**params):
            # base_rf.set_params(**params)
            # return -np.mean(cross_val_score(base_rf, X_train, y_train, cv=5, n_jobs=n_jobs, scoring="f1"))

            # res_rf = forest_minimize(objective, space, base_estimator='RF', n_calls=11,
            # random_state=0, verbose=True, n_jobs=n_jobs)
            # print(type(res_rf))
            # skopt.utils.dump(res_rf, param_path, store_objective=False)

            res_rf = skopt.utils.load(param_path)
            # Training
            print('Training with optimized hyperparameters')
            start_time = time.time()
            rf = RandomForestClassifier(random_state=0,
                                        max_leaf_nodes=res_rf.x[0],
                                        n_estimators=res_rf.x[1],
                                        max_depth=res_rf.x[2],
                                        n_jobs=-1)
            rf.fit(X_train, y_train)
            end_time = time.time()
            times.append(timer(start_time, end_time, False))
            joblib.dump(rf, model_path)

        metrics_path = metrics_path.parent
        times = [float(i) for i in times]
        times = np.column_stack([pctls, times])
        times_df = pd.DataFrame(times, columns=['cloud_cover', 'training_time'])
        times_df.to_csv(metrics_path / 'training_times.csv', index=False)
Example 12
def log_reg_training_sample(img_list, pctls, feat_list_new, feat_list_all,
                            data_path, batch, n_flood, n_nonflood):
    for img in img_list:
        print(img + ': stacking tif, generating clouds')
        times = []
        tif_stacker(data_path, img, feat_list_new, overwrite=False)
        cloud_generator(img, data_path, overwrite=False)

        for pctl in pctls:
            print(img, pctl, '% CLOUD COVER')
            print('Preprocessing')

            sample_coords, data_train = get_sample_coords(
                img, pctl, n_flood, n_nonflood)
            perm_index = data_train.shape[2] - 2
            flood_index = data_train.shape[2] - 1
            data_vector_train = get_sample_data(sample_coords, data_train)
            data_vector_train, scaler = standardize_data(data_vector_train)
            data_vector_train = np.delete(data_vector_train,
                                          perm_index,
                                          axis=1)  # Remove perm water column
            shape = data_vector_train.shape
            X_train, y_train = data_vector_train[:, 0:shape[1] - 1], data_vector_train[:, shape[1] - 1]

            model_path = data_path / batch / 'models' / img
            metrics_path = data_path / batch / 'metrics' / 'training' / img / '{}'.format(
                img + '_clouds_' + str(pctl))
            scaler_dir = data_path / 'scalers' / img

            if not model_path.exists():
                model_path.mkdir(parents=True)
            if not metrics_path.exists():
                metrics_path.mkdir(parents=True)
            if not scaler_dir.exists():
                scaler_dir.mkdir(parents=True)

            model_path = data_path / batch / 'models' / img / '{}'.format(
                img + '_clouds_' + str(pctl) + '.sav')
            scaler_path = scaler_dir / '{}_clouds_{}_scaler_.sav'.format(
                img, str(pctl))
            joblib.dump(scaler, scaler_path)

            print('Training')
            start_time = time.time()
            logreg = LogisticRegression(solver='lbfgs')
            logreg.fit(X_train, y_train)
            end_time = time.time()
            times.append(timer(start_time, end_time, False))
            joblib.dump(logreg, model_path)

            del data_train, data_vector_train, logreg

        metrics_path = metrics_path.parent
        times = [float(i) for i in times]
        times = np.column_stack([pctls, times])
        times_df = pd.DataFrame(times,
                                columns=['cloud_cover', 'training_time'])
        times_df.to_csv(metrics_path / 'training_times.csv', index=False)
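
get_sample_coords, get_sample_data, and standardize_data above are project helpers. Since the returned scaler is saved with joblib for reuse at prediction time, standardize_data is presumably a thin wrapper around an sklearn scaler; a minimal sketch under that assumption (and assuming the last two columns, GSW_perm and flooded, are labels left unscaled) is:

from sklearn.preprocessing import StandardScaler

def standardize_data_sketch(data_vector):
    """Fit a StandardScaler on the feature columns and return it alongside the scaled data."""
    data_vector = data_vector.copy()
    scaler = StandardScaler()
    data_vector[:, :-2] = scaler.fit_transform(data_vector[:, :-2])
    return data_vector, scaler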
Example 13
flood_index = feat_keep.index('flooded')
# data_vector_train[data_vector_train[:, perm_index] == 1, flood_index] = 0
data_vector_train = np.delete(data_vector_train, perm_index, axis=1)
shape = data_vector_train.shape
X_train, y_train = data_vector_train[:, 0:shape[1] - 1], data_vector_train[:, shape[1] - 1]

model_path = data_path / batch / 'models' / img
metrics_path = data_path / batch / 'metrics' / 'training' / img / '{}'.format(
    img + '_clouds_' + str(pctl))

if not model_path.exists():
    model_path.mkdir(parents=True)
if not metrics_path.exists():
    metrics_path.mkdir(parents=True)

model_path = model_path / '{}'.format(img + '_clouds_' + str(pctl) + '.sav')

print('Training')
start_time = time.time()
logreg = LogisticRegression(n_jobs=-1, solver='sag')
logreg.fit(X_train, y_train)
end_time = time.time()
times.append(timer(start_time, end_time, False))
joblib.dump(logreg, model_path)

metrics_path = metrics_path.parent
times = [float(i) for i in times]
times = np.column_stack([pctls, times])
times_df = pd.DataFrame(times, columns=['cloud_cover', 'training_time'])
times_df.to_csv(metrics_path / 'training_times.csv', index=False)
Example 14
def prediction_with_uncertainty(img_list,
                                pctls,
                                feat_list_new,
                                data_path,
                                batch,
                                DROPOUT_RATE,
                                MC_PASSES,
                                remove_perm,
                                weight_decay=0.005,
                                length_scale=0.00001,
                                **model_params):
    for j, img in enumerate(img_list):
        times = []
        accuracy, precision, recall, f1 = [], [], [], []
        preds_path = data_path / batch / 'predictions' / img
        vars_path = data_path / batch / 'variances' / img
        mc_bin_file = preds_path / 'mc_preds.h5'
        preds_bin_file = preds_path / 'predictions.h5'
        vars_bin_file = vars_path / 'variances.h5'
        metrics_path = data_path / batch / 'metrics' / 'testing' / img

        try:
            metrics_path.mkdir(parents=True)
        except FileExistsError:
            print('Metrics directory already exists')
        try:
            preds_path.mkdir(parents=True)
        except FileExistsError:
            print('Metrics directory already exists')
        try:
            vars_path.mkdir(parents=True)
        except FileExistsError:
            print('Metrics directory already exists')

        for i, pctl in enumerate(pctls):
            print('Preprocessing', img, pctl, '% cloud cover')
            data_test, data_vector_test, data_ind_test, feat_keep = preprocessing(
                data_path, img, pctl, gaps=True)
            feat_list_keep = [feat_list_new[i] for i in feat_keep]  # Removed if feat was deleted in preprocessing
            perm_index = feat_list_keep.index('GSW_perm')
            flood_index = feat_list_keep.index('flooded')
            if remove_perm:
                data_vector_test[data_vector_test[:, perm_index] == 1,
                                 flood_index] = 0  # Remove flood water that is perm water
            data_vector_test = np.delete(data_vector_test, perm_index,
                                         axis=1)  # Remove GSW_perm column
            data_shape = data_vector_test.shape
            X_test, y_test = data_vector_test[:, 0:data_shape[1] - 1], data_vector_test[:, data_shape[1] - 1]

            # Initialize binary file to hold predictions
            with h5py.File(mc_bin_file, 'w') as f:
                f.create_dataset('mc_preds',
                                 shape=(X_test.shape[0], 1),
                                 maxshape=(X_test.shape[0], None),
                                 chunks=True,
                                 compression='gzip'
                                 )  # Create empty dataset with shape of data

            start_time = time.time()
            model_path = data_path / batch / 'models' / img / '{}'.format(
                img + '_clouds_' + str(pctl) + '.h5')
            trained_model = tf.keras.models.load_model(model_path)

            for k in range(MC_PASSES):
                if k % 10 == 0 or k == MC_PASSES - 1:
                    print('Running MC {}/{} for {} at {}% cloud cover'.format(
                        k, MC_PASSES, img, pctl))
                flood_prob = trained_model.predict(
                    X_test,
                    batch_size=model_params['batch_size'],
                    use_multiprocessing=True)  # Predict
                flood_prob = flood_prob[:, 1]  # Drop probability of not flooded (0) to save space
                with h5py.File(mc_bin_file, 'a') as f:
                    f['mc_preds'][:, -1] = flood_prob  # Append preds to h5 file
                    if k < MC_PASSES - 1:  # Resize to append next pass, if there is one
                        f['mc_preds'].resize((f['mc_preds'].shape[1] + 1), axis=1)
                tf.keras.backend.clear_session()
                del flood_prob

            # Calculate MC statistics
            print('Calculating MC statistics for {} at {}% cloud cover'.format(
                img, pctl))
            with h5py.File(mc_bin_file, 'r') as f:
                dset = f['mc_preds']
                preds_da = da.from_array(
                    dset, chunks="250 MiB")  # Open h5 file as dask array
                means = preds_da.mean(axis=1)
                means = means.compute()
                variance = preds_da.var(axis=1)
                variance = variance.compute()
                tau = (length_scale**2 *
                       (1 - DROPOUT_RATE)) / (2 * data_shape[0] * weight_decay)
                variance = variance + tau
                preds = means.round()
                del f, means, preds_da, dset

            os.remove(mc_bin_file)  # Delete predictions to save space on disk

            print('Saving mean preds/vars for {} at {}% cloud cover'.format(
                img, pctl))
            with h5py.File(preds_bin_file, 'a') as f:
                if str(pctl) in f:
                    print('Deleting earlier mean predictions')
                    del f[str(pctl)]
                f.create_dataset(str(pctl), data=preds)
            with h5py.File(vars_bin_file, 'a') as f:
                if str(pctl) in f:
                    print('Deleting earlier variances')
                    del f[str(pctl)]
                f.create_dataset(str(pctl), data=variance)

            times.append(timer(start_time, time.time(),
                               False))  # Elapsed time for MC simulations

            print('Evaluating predictions for {} at {}% cloud cover'.format(
                img, pctl))
            accuracy.append(accuracy_score(y_test, preds))
            precision.append(precision_score(y_test, preds))
            recall.append(recall_score(y_test, preds))
            f1.append(f1_score(y_test, preds))

            del preds, X_test, y_test, trained_model, data_test, data_vector_test, data_ind_test

        metrics = pd.DataFrame(
            np.column_stack([pctls, accuracy, precision, recall, f1]),
            columns=['cloud_cover', 'accuracy', 'precision', 'recall', 'f1'])
        metrics.to_csv(metrics_path / 'metrics.csv', index=False)
        times = [
            float(i) for i in times
        ]  # Convert time objects to float, otherwise valMetrics will be non-numeric
        times_df = pd.DataFrame(np.column_stack([pctls, times]),
                                columns=['cloud_cover', 'testing_time'])
        times_df.to_csv(metrics_path / 'testing_times.csv', index=False)
Example 15
def NN_prediction(img_list, pctls, feat_list_all, data_path, batch,
                  **model_params):
    for j, img in enumerate(img_list):
        times = []
        accuracy, precision, recall, f1, roc_auc = [], [], [], [], []
        preds_path = data_path / batch / 'predictions' / img
        bin_file = preds_path / 'predictions.h5'
        metrics_path = data_path / batch / 'metrics' / 'testing' / img

        try:
            metrics_path.mkdir(parents=True)
        except FileExistsError:
            print('Metrics directory already exists')

        for i, pctl in enumerate(pctls):
            print('Preprocessing', img, pctl, '% cloud cover')
            data_test, data_vector_test, data_ind_test, feat_keep = preprocessing(
                data_path, img, pctl, feat_list_all, test=True)
            perm_index = feat_keep.index('GSW_perm')
            flood_index = feat_keep.index('flooded')
            data_vector_test[
                data_vector_test[:, perm_index] == 1,
                flood_index] = 0  # Remove flood water that is perm water
            data_vector_test = np.delete(data_vector_test, perm_index,
                                         axis=1)  # Remove GSW_perm column
            data_shape = data_vector_test.shape
            X_test, y_test = data_vector_test[:, 0:data_shape[1] - 1], data_vector_test[:, data_shape[1] - 1]

            print('Predicting for {} at {}% cloud cover'.format(img, pctl))
            start_time = time.time()
            model_path = data_path / batch / 'models' / img / '{}'.format(
                img + '_clouds_' + str(pctl) + '.h5')
            trained_model = load_macro_soft_f1_model(model_path)

            pred_probs = trained_model.predict(
                X_test,
                batch_size=model_params['batch_size'],
                use_multiprocessing=True)
            preds = np.argmax(pred_probs,
                              axis=1)  # Display most probable value

            try:
                preds_path.mkdir(parents=True)
            except FileExistsError:
                pass

            with h5py.File(bin_file, 'a') as f:
                if str(pctl) in f:
                    print('Deleting earlier mean predictions')
                    del f[str(pctl)]
                f.create_dataset(str(pctl), data=preds)

            times.append(timer(start_time, time.time(), False))  # Elapsed prediction time

            print('Evaluating predictions')
            perm_mask = data_test[:, :, perm_index]
            perm_mask = perm_mask.reshape(
                [perm_mask.shape[0] * perm_mask.shape[1]])
            perm_mask = perm_mask[~np.isnan(perm_mask)]
            preds[perm_mask.astype('bool')] = 0
            y_test[perm_mask.astype('bool')] = 0

            accuracy.append(accuracy_score(y_test, preds))
            precision.append(precision_score(y_test, preds))
            recall.append(recall_score(y_test, preds))
            f1.append(f1_score(y_test, preds))
            roc_auc.append(roc_auc_score(y_test, pred_probs[:, 1]))

            del preds, pred_probs, X_test, y_test, trained_model, data_test, data_vector_test, data_ind_test

        metrics = pd.DataFrame(
            np.column_stack([pctls, accuracy, precision, recall, f1, roc_auc]),
            columns=[
                'cloud_cover', 'accuracy', 'precision', 'recall', 'f1', 'auc'
            ])
        metrics.to_csv(metrics_path / 'metrics.csv', index=False)
        times = [
            float(i) for i in times
        ]  # Convert time objects to float, otherwise valMetrics will be non-numeric
        times_df = pd.DataFrame(np.column_stack([pctls, times]),
                                columns=['cloud_cover', 'testing_time'])
        times_df.to_csv(metrics_path / 'testing_times.csv', index=False)
Example 16
# end_time = time.time()
# print('statsmodel training time:', timer(start_time, end_time, False))
# result.save(str(model_path))

# Logistic regression using statsmodel
model_path = data_path / batch / 'models' / img
if not model_path.exists():
    model_path.mkdir(parents=True)
model_path = model_path / '{}'.format(img + '_statsmodel2.pickle')
print('Training statsmodel')
start_time = time.time()
logreg_sm = sm.Logit(y_train, X_train)
result = logreg_sm.fit_regularized()
print(result.summary())
end_time = time.time()
print('statsmodel training time:', timer(start_time, end_time, False))
result.save(str(model_path))

# Prediction
preds_path = data_path / batch / 'predictions' / img
if not preds_path.exists():
    preds_path.mkdir(parents=True)
bin_file = preds_path / 'predictions.h5'
data_test, data_vector_test, data_ind_test, feat_keep = preprocessing(
    data_path, img, pctl, feat_list_new, test=True)
perm_index = feat_keep.index('GSW_perm')
flood_index = feat_keep.index('flooded')
data_vector_test[data_vector_test[:, perm_index] == 1,
                 flood_index] = 0  # Remove flood water that is perm water
data_vector_test = np.delete(data_vector_test, perm_index,
                             axis=1)  # Remove GSW_perm column
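
The fragment above stops after preparing the test vector. Assuming prediction then follows the same pattern as the other examples, loading the saved statsmodels result and predicting could look like the sketch below (variable names reuse those defined above; the 0.5 threshold is an assumption):

import statsmodels.api as sm

# Load the regularized Logit results saved above and predict flood probabilities
result = sm.load(str(model_path))
data_shape = data_vector_test.shape
X_test, y_test = data_vector_test[:, 0:data_shape[1] - 1], data_vector_test[:, data_shape[1] - 1]
pred_probs = result.predict(X_test)      # probability of the positive (flooded) class
preds = (pred_probs >= 0.5).astype(int)  # hard class labels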
Example 17
def log_reg_training_buffer(img_list, pctls, feat_list_new, data_path, batch,
                            buffer_iters, buffer_flood_only):
    from imageio import imwrite

    for img in img_list:
        print(img + ': stacking tif, generating clouds')
        times = []
        tif_stacker(data_path,
                    img,
                    feat_list_new,
                    features=True,
                    overwrite=False)
        cloud_generator(img, data_path, overwrite=False)

        for pctl in pctls:
            print('Preprocessing')
            data_train_full, data_vector_train, data_ind_train, feat_keep = preprocessing(
                data_path, img, pctl, feat_list_new, test=False)
            for buffer_iter in buffer_iters:
                perm_index = feat_keep.index('GSW_perm')
                flood_index = feat_keep.index('flooded')
                data_train = data_train_full.copy()
                if buffer_flood_only:
                    data_train[data_train[:, :, perm_index] == 1,
                               flood_index] = 0
                    mask = data_train[:, :, flood_index]
                    buffer_mask = np.invert(
                        binary_dilation(mask, iterations=buffer_iter))
                else:
                    mask = data_train[:, :, flood_index]
                    buffer_mask = np.invert(
                        binary_dilation(mask, iterations=buffer_iter))
                    data_train[data_train[:, :, perm_index] == 1,
                               flood_index] = 0
                data_train[buffer_mask] = np.nan

                data_vector_train = data_train.reshape([
                    data_train.shape[0] * data_train.shape[1],
                    data_train.shape[2]
                ])
                data_vector_train = data_vector_train[
                    ~np.isnan(data_vector_train).any(axis=1)]
                data_vector_train = np.delete(
                    data_vector_train, perm_index,
                    axis=1)  # Remove perm water column
                shape = data_vector_train.shape
                X_train, y_train = data_vector_train[:, 0:shape[1] - 1], data_vector_train[:, shape[1] - 1]

                model_path = data_path / batch / 'models' / img
                metrics_path = data_path / batch / 'metrics' / 'training' / img / '{}'.format(
                    img + '_clouds_' + str(pctl))

                if not model_path.exists():
                    model_path.mkdir(parents=True)
                if not metrics_path.exists():
                    metrics_path.mkdir(parents=True)

                model_path = model_path / '{}'.format(img + '_clouds_' + str(
                    pctl) + 'buff' + str(buffer_iter) + '.sav')

                # Save data flooding image to check that buffering is working correctly
                # imwrite(model_path.parents[0] / '{}'.format('buff' + str(buffer_iter) + '.jpg'), data_train[:, :, 6])

                print('Training')
                start_time = time.time()
                logreg = LogisticRegression(n_jobs=-1, solver='sag')
                logreg.fit(X_train, y_train)
                end_time = time.time()
                times.append(timer(start_time, end_time, False))
                joblib.dump(logreg, model_path)

        metrics_path = metrics_path.parent
        times = [float(i) for i in times]
        times = np.column_stack([
            np.repeat(pctls, len(buffer_iters)),
            np.tile(buffer_iters, len(pctls)), times
        ])
        times_df = pd.DataFrame(
            times, columns=['cloud_cover', 'buffer_iters', 'training_time'])
        times_df.to_csv(metrics_path / 'training_times.csv', index=False)
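A minimal sketch of the buffering step used above, assuming binary_dilation is scipy.ndimage.binary_dilation (the call signature matches): the flood mask is dilated buffer_iter times and inverted, so pixels outside the buffer can be set to NaN and dropped from training. The 5x5 toy mask, the buffer_iter value, and the printed count are illustrative only, not taken from the original code.

import numpy as np
from scipy.ndimage import binary_dilation

# Toy 5x5 flood mask with a single flooded pixel in the centre (illustrative only)
mask = np.zeros((5, 5), dtype=bool)
mask[2, 2] = True

buffer_iter = 1  # one dilation pass, like a single element of buffer_iters
# Dilate the flooded area, then invert: True marks pixels *outside* the buffer,
# which the training code above sets to NaN and later discards
buffer_mask = np.invert(binary_dilation(mask, iterations=buffer_iter))
print(buffer_mask.sum())  # 20 of 25 pixels fall outside the cross-shaped buffer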
Example n. 18
0
def training6(img_list,
              pctls,
              model_func,
              feat_list_new,
              data_path,
              batch,
              T,
              dropout_rate=0.2,
              **model_params):
    '''
    1. Removes ALL pixels that are over permanent water
    2. Finds the optimum learning rate and uses a cyclic LR scheduler
       to train the model (a minimal cyclic LR sketch follows this example)
    3. No validation set is used during training
    '''
    get_model = model_func
    for j, img in enumerate(img_list):
        print(img + ': stacking tif, generating clouds')
        times = []
        tif_stacker(data_path,
                    img,
                    feat_list_new,
                    features=True,
                    overwrite=False)
        cloud_generator(img, data_path, overwrite=False)

        for i, pctl in enumerate(pctls):
            print(img, pctl, '% CLOUD COVER')
            print('Preprocessing')
            tf.keras.backend.clear_session()
            data_train, data_vector_train, data_ind_train, feat_keep = preprocessing(
                data_path, img, pctl, feat_list_new, gaps=False)
            perm_index = feat_keep.index('GSW_perm')
            flood_index = feat_keep.index('flooded')
            data_vector_train[data_vector_train[:, perm_index] == 1, flood_index] = 0  # Remove flood water that is perm water
            data_vector_train = np.delete(data_vector_train, perm_index, axis=1)  # Remove perm water column
            shape = data_vector_train.shape
            X_train, y_train = data_vector_train[:, 0:shape[1] - 1], data_vector_train[:, shape[1] - 1]
            y_train = to_categorical(y_train)
            INPUT_DIMS = X_train.shape[1]

            model_path = data_path / batch / 'models' / img
            metrics_path = data_path / batch / 'metrics' / 'training' / img / '{}'.format(
                img + '_clouds_' + str(pctl))

            try:
                metrics_path.mkdir(parents=True)
                model_path.mkdir(parents=True)
            except FileExistsError:
                pass

            model_path = model_path / '{}'.format(img + '_clouds_' +
                                                  str(pctl) + '.h5')

            callbacks = [
                tf.keras.callbacks.EarlyStopping(
                    monitor='softmax_output_categorical_accuracy',
                    min_delta=0.005,
                    patience=5),
                tf.keras.callbacks.ModelCheckpoint(filepath=str(model_path),
                                                   monitor='loss',
                                                   save_best_only=True),
                CSVLogger(str(metrics_path / 'training_log.log'))  # cast Path to str, matching the ModelCheckpoint filepath above
            ]

            start_time = time.time()
            model = get_model(model_params['epochs'],
                              X_train,
                              y_train,
                              X_train.shape,
                              T,
                              D=2,
                              batch_size=model_params['batch_size'],
                              dropout_rate=dropout_rate,
                              callbacks=callbacks)
            end_time = time.time()
            times.append(timer(start_time, end_time, False))
            # model.save(model_path)

        metrics_path = metrics_path.parent
        times = [float(i) for i in times]
        times = np.column_stack([pctls, times])
        times_df = pd.DataFrame(times,
                                columns=['cloud_cover', 'training_time'])
        times_df.to_csv(metrics_path / 'training_times.csv', index=False)
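The docstring above mentions a cyclic LR scheduler, but the implementation lives inside model_func and is not shown here. Below is a minimal sketch of a triangular cyclic schedule built with tf.keras.callbacks.LearningRateScheduler; the helper name triangular_clr, the base_lr/max_lr/step_size defaults, and the per-epoch (rather than per-batch) cycle are assumptions for illustration, not the original implementation.

import numpy as np
import tensorflow as tf

def triangular_clr(base_lr=1e-4, max_lr=1e-2, step_size=5):
    """Triangular cyclic learning rate; step_size is epochs per half-cycle.
    Hypothetical helper, not part of the original code."""
    def schedule(epoch, lr):
        cycle = np.floor(1 + epoch / (2 * step_size))
        x = np.abs(epoch / step_size - 2 * cycle + 1)
        return float(base_lr + (max_lr - base_lr) * max(0.0, 1 - x))
    return tf.keras.callbacks.LearningRateScheduler(schedule)

# Could be appended to the callbacks list passed to get_model above:
# callbacks.append(triangular_clr())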
Example n. 19
0
def prediction_BNN_gen_model(img_list, pctls, feat_list_new, data_path, batch,
                             MC_passes, **model_params):
    for j, img in enumerate(img_list):
        times = []
        accuracy, precision, recall, f1 = [], [], [], []
        preds_path = data_path / batch / 'predictions' / img
        bin_file = preds_path / 'predictions.h5'
        model_path = data_path / batch / 'models' / 'gen_model.h5'

        uncertainties_path = data_path / batch / 'uncertainties' / img
        aleatoric_bin_file = uncertainties_path / 'aleatoric_uncertainties.h5'
        epistemic_bin_file = uncertainties_path / 'epistemic_uncertainties.h5'
        metrics_path = data_path / batch / 'metrics' / 'testing' / img

        try:
            metrics_path.mkdir(parents=True)
        except FileExistsError:
            print('Metrics directory already exists')

        for i, pctl in enumerate(pctls):
            print('Preprocessing', img, pctl, '% cloud cover')
            data_test, data_vector_test, data_ind_test, feat_keep = preprocessing(
                data_path, img, pctl, feat_list_new, test=True)
            perm_index = feat_keep.index('GSW_perm')
            flood_index = feat_keep.index('flooded')
            data_vector_test[data_vector_test[:, perm_index] == 1, flood_index] = 0  # Remove flood water that is perm water
            data_vector_test = np.delete(data_vector_test, perm_index, axis=1)  # Remove GSW_perm column
            data_shape = data_vector_test.shape
            X_test, y_test = data_vector_test[:, 0:data_shape[1] - 1], data_vector_test[:, data_shape[1] - 1]

            print('Predicting for {} at {}% cloud cover'.format(img, pctl))
            start_time = time.time()
            model = tf.keras.models.load_model(model_path)
            p_hat = []
            for t in range(MC_passes):
                p_hat.append(
                    model.predict(X_test,
                                  batch_size=model_params['batch_size'],
                                  use_multiprocessing=True)[:, 1])
            p_hat = np.array(p_hat)
            preds = np.round(np.mean(p_hat, axis=0))  # Mean flood probability over MC passes, thresholded at 0.5
            aleatoric = np.mean(p_hat * (1 - p_hat), axis=0)  # Mean per-pass Bernoulli variance (data noise)
            epistemic = np.mean(p_hat ** 2, axis=0) - np.mean(p_hat, axis=0) ** 2  # Variance of probabilities across passes (model uncertainty)

            try:
                preds_path.mkdir(parents=True)
            except FileExistsError:
                pass

            with h5py.File(bin_file, 'a') as f:
                if str(pctl) in f:
                    print('Deleting earlier mean predictions')
                    del f[str(pctl)]
                f.create_dataset(str(pctl), data=preds)

            try:
                uncertainties_path.mkdir(parents=True)
            except FileExistsError:
                pass

            with h5py.File(epistemic_bin_file, 'a') as f:
                if str(pctl) in f:
                    print('Deleting earlier epistemic uncertainties')
                    del f[str(pctl)]
                f.create_dataset(str(pctl), data=epistemic)

            with h5py.File(aleatoric_bin_file, 'a') as f:
                if str(pctl) in f:
                    print('Deleting earlier aleatoric uncertainties')
                    del f[str(pctl)]
                f.create_dataset(str(pctl), data=aleatoric)

            times.append(timer(start_time, time.time(), False))

            print('Evaluating predictions')
            accuracy.append(accuracy_score(y_test, preds))
            precision.append(precision_score(y_test, preds))
            recall.append(recall_score(y_test, preds))
            f1.append(f1_score(y_test, preds))

            del preds, p_hat, aleatoric, epistemic, X_test, y_test, model, data_test, data_vector_test, data_ind_test

        metrics = pd.DataFrame(
            np.column_stack([pctls, accuracy, precision, recall, f1]),
            columns=['cloud_cover', 'accuracy', 'precision', 'recall', 'f1'])
        metrics.to_csv(metrics_path / 'metrics.csv', index=False)
        times = [float(i) for i in times]
        times_df = pd.DataFrame(np.column_stack([pctls, times]),
                                columns=['cloud_cover', 'testing_time'])
        times_df.to_csv(metrics_path / 'testing_times.csv', index=False)
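A minimal NumPy sketch of the uncertainty split used in the loop above: each row of p_hat is the flood probability from one stochastic forward pass, aleatoric uncertainty is the mean Bernoulli variance within passes, and epistemic uncertainty is the variance of the probability across passes. The two terms add up to the Bernoulli variance of the mean probability, which the assertion checks. The toy array is illustrative only.

import numpy as np

# Toy p_hat: 3 MC passes over 4 pixels (rows = passes), illustrative values only
p_hat = np.array([[0.1, 0.8, 0.5, 0.9],
                  [0.2, 0.7, 0.4, 0.9],
                  [0.1, 0.9, 0.6, 0.8]])

p_mean = np.mean(p_hat, axis=0)
aleatoric = np.mean(p_hat * (1 - p_hat), axis=0)       # data noise
epistemic = np.mean(p_hat ** 2, axis=0) - p_mean ** 2  # disagreement between passes

# The two components sum to the Bernoulli variance of the mean probability
assert np.allclose(aleatoric + epistemic, p_mean * (1 - p_mean))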
Example n. 20
0
def prediction_bnn(img_list, pctls, feat_list_new, data_path, batch, MC_passes):
    for j, img in enumerate(img_list):
        epistemic_times = []
        aleatoric_times = []
        accuracy, precision, recall, f1 = [], [], [], []
        preds_path = data_path / batch / 'predictions' / img
        bin_file = preds_path / 'predictions.h5'
        aleatoric_bin_file = preds_path / 'aleatoric_predictions.h5'
        uncertainties_path = data_path / batch / 'uncertainties' / img
        aleatoric_uncertainty_file = uncertainties_path / 'aleatoric_uncertainties.h5'
        epistemic_uncertainty_file = uncertainties_path / 'epistemic_uncertainties.h5'
        metrics_path = data_path / batch / 'metrics' / 'testing' / img

        try:
            metrics_path.mkdir(parents=True)
        except FileExistsError:
            print('Metrics directory already exists')

        for i, pctl in enumerate(pctls):
            print('Preprocessing', img, pctl, '% cloud cover')
            data_test, data_vector_test, data_ind_test, feat_keep = preprocessing(data_path, img, pctl, feat_list_new,
                                                                                  test=True)
            perm_index = feat_keep.index('GSW_perm')
            flood_index = feat_keep.index('flooded')
            data_vector_test[data_vector_test[:, perm_index] == 1, flood_index] = 0
            data_vector_test = np.delete(data_vector_test, perm_index, axis=1)
            data_shape = data_vector_test.shape
            X_test, y_test = data_vector_test[:, 0:data_shape[1] - 1], data_vector_test[:, data_shape[1] - 1]
            y_test = to_categorical(y_test)
            D = len(set(y_test[:, 0]))  # Target classes
            iterable = K.variable(np.ones(MC_passes))

            print('Predicting (aleatoric) for {} at {}% cloud cover'.format(img, pctl))
            model_path = data_path / batch / 'models' / img / '{}'.format(img + '_clouds_' + str(pctl) + '.h5')
            start_time = time.time()
            # aleatoric_model = tf.keras.models.load_model(model_path)
            aleatoric_model = load_bayesian_model(model_path, MC_passes, D, iterable)
            aleatoric_results = aleatoric_model.predict(X_test, verbose=1)
            aleatoric_uncertainties = np.reshape(aleatoric_results[0][:, D:], (-1))
            try:
                uncertainties_path.mkdir(parents=True)
            except FileExistsError:
                pass
            with h5py.File(aleatoric_uncertainty_file, 'a') as f:
                if str(pctl) in f:
                    print('Deleting earlier aleatoric uncertainties')
                    del f[str(pctl)]
                f.create_dataset(str(pctl), data=aleatoric_uncertainties)
            logits = aleatoric_results[0][:, 0:D]
            aleatoric_preds = np.argmax(aleatoric_results[1], axis=1)
            aleatoric_times.append(timer(start_time, time.time(), False))
            try:
                preds_path.mkdir(parents=True)
            except FileExistsError:
                pass
            with h5py.File(aleatoric_bin_file, 'a') as f:
                if str(pctl) in f:
                    print('Deleting earlier aleatoric predictions')
                    del f[str(pctl)]
                f.create_dataset(str(pctl), data=aleatoric_preds)

            print('Predicting (epistemic) for {} at {}% cloud cover'.format(img, pctl))
            start_time = time.time()
            epistemic_model = get_epistemic_uncertainty_model(model_path, T=MC_passes, D=D)
            epistemic_results = epistemic_model.predict(X_test, verbose=2, use_multiprocessing=True)
            epistemic_uncertainties = epistemic_results[0]
            with h5py.File(epistemic_uncertainty_file, 'a') as f:
                if str(pctl) in f:
                    print('Deleting earlier epistemic uncertainties')
                    del f[str(pctl)]
                f.create_dataset(str(pctl), data=epistemic_uncertainties)
            epistemic_preds = np.argmax(epistemic_results[1], axis=1)
            epistemic_times.append(timer(start_time, time.time(), False))
            with h5py.File(bin_file, 'a') as f:
                if str(pctl) in f:
                    print('Deleting earlier epistemic predictions')
                    del f[str(pctl)]
                f.create_dataset(str(pctl), data=epistemic_preds)

            print('Evaluating predictions')
            accuracy.append(accuracy_score(y_test[:, 1], epistemic_preds))
            precision.append(precision_score(y_test[:, 1], epistemic_preds))
            recall.append(recall_score(y_test[:, 1], epistemic_preds))
            f1.append(f1_score(y_test[:, 1], epistemic_preds))

            del aleatoric_model, aleatoric_results, aleatoric_uncertainties, logits, aleatoric_preds, \
                epistemic_model, epistemic_uncertainties, epistemic_preds, epistemic_results, \
                data_test, data_vector_test, data_ind_test

        metrics = pd.DataFrame(np.column_stack([pctls, accuracy, precision, recall, f1]),
                               columns=['cloud_cover', 'accuracy', 'precision', 'recall', 'f1'])
        metrics.to_csv(metrics_path / 'metrics.csv', index=False)
        epistemic_times = [float(i) for i in epistemic_times]
        aleatoric_times = [float(i) for i in aleatoric_times]
        times_df = pd.DataFrame(np.column_stack([pctls, epistemic_times, aleatoric_times]),
                                columns=['cloud_cover', 'epistemic_testing_time', 'aleatoric_testing_time'])
        times_df.to_csv(metrics_path / 'testing_times.csv', index=False)
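The prediction functions above repeat the same h5py pattern several times: open the file in append mode, delete any stale dataset keyed by cloud-cover percentile, then write the new array. A small helper along these lines could factor that out; write_h5_dataset and the usage arguments are hypothetical, not part of the original code.

import h5py
import numpy as np

def write_h5_dataset(h5_path, key, data):
    """Hypothetical helper: overwrite-or-create a dataset keyed by percentile."""
    with h5py.File(h5_path, 'a') as f:
        if key in f:
            del f[key]  # drop the stale dataset before rewriting
        f.create_dataset(key, data=data)

# e.g. write_h5_dataset(bin_file, str(pctl), epistemic_preds)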