Example #1
    def load_dataset(input_filename,
                     target_filename,
                     matching_key='relative_path',
                     target_key='mean_slope',
                     latent_name_prefix='latent_'):
        Console.info("load_dataset called for: ", input_filename)

        df = pd.read_csv(
            input_filename, index_col=0
        )  # use 1st column as ID, the 2nd (relative_path) can be used as part of UUID
        # 1) Data validation, remove invalid entries (e.g. NaN)
        print(df.head())
        df = df.dropna()
        Console.info("Total valid entries: ", len(df))
        # df.reset_index(drop=True)  # not sure if we prefer to reset the index, as the column index was defined externally

        # 2) Determine the number of latent-space dimensions
        # The number of 'features' is given by the columns whose names start with latent_name_prefix ('latent_'xx, where xx is the 0-based index of the latent vector)
        # Example (8 dimensions: h0, h1, ..., h7):
        # relative_path northing [m] easting [m] ... latitude [deg] longitude [deg] recon_loss h0 h1 h2 h3 h4 h5 h6 h7
        n_latents = len(df.filter(regex=latent_name_prefix).columns)
        Console.info("Latent dimensions: ", n_latents)

        # 3) Key matching
        # Each 'relative_path' entry has the format slo/20181121_depthmap_1050_0251_no_slo.tif,
        # where the filename is composed of [date_type_tilex_tiley_mod_type]. Input and target tables differ only in the 'type' field.
        # Let's use a regex to extract the common part of the filename
        df['filename_base'] = df[matching_key].str.extract(
            r'(?:/)(.*_)')  # this could probably be done in a single regex
        df['filename_base'] = df['filename_base'].str.rstrip('_')

        tdf = pd.read_csv(
            target_filename
        )  # expected header: relative_path	mean_slope [ ... ] mean_rugosity
        tdf = tdf.dropna()
        # target_key='mean_rugosity'
        tdf['filename_base'] = tdf[matching_key].str.extract(
            r'(?:/)(.*_)')  # this could probably be done in a single regex
        # remove the trailing '_r002' suffix; note that rstrip('_r002') would strip a character set, not the literal suffix
        tdf['filename_base'] = tdf['filename_base'].str.replace(r'_r002_?$', '', regex=True)

        # print (tdf.head())
        Console.info("Target entries: ", len(tdf))
        merged_df = pd.merge(df, tdf, how='right', on='filename_base')
        merged_df = merged_df.dropna()

        latent_df = merged_df.filter(regex=latent_name_prefix)
        Console.info("Latent size: ", latent_df.shape)
        target_df = merged_df[target_key]

        np_latent = latent_df.to_numpy(dtype='float')
        np_target = target_df.to_numpy(dtype='float')
        # input-output datasets are linked using the key provided by matching_key
        return np_latent, np_target, merged_df['filename_base']
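
A minimal usage sketch for load_dataset, assuming it is exposed as a static method of CustomDataloader (as in Example #3). The CSV paths below are placeholders, not files from the original project:

# Hypothetical file paths; any pair of CSVs sharing the 'relative_path' naming scheme should work
latent_csv = "data/latents.csv"   # relative_path, metadata and latent_0 ... latent_N columns
target_csv = "data/targets.csv"   # relative_path and mean_slope (among other statistics)

X, y, keys = CustomDataloader.load_dataset(latent_csv,
                                           target_csv,
                                           matching_key='relative_path',
                                           target_key='mean_slope')
print("Latent matrix:", X.shape, "targets:", y.shape, "matched keys:", len(keys))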
Example #2
    def load_toydataset(input_filename,
                        target_key='mean_slope',
                        input_prefix='latent_',
                        matching_key='relative_path'):
        Console.info("load_toydataset called for: ", input_filename)

        df = pd.read_csv(
            input_filename, index_col=0
        )  # use 1st column as ID, the 2nd (relative_path) can be used as part of UUID
        # 1) Data validation, remove invalid entries (e.g. NaN)
        print(df.head())
        df = df.dropna()
        Console.info("Total valid entries: ", len(df))
        # df.reset_index(drop=True)  # not sure if we prefer to reset the index, as the column index was defined externally

        # 2) Determine the number of latent-space dimensions
        # The number of 'features' is given by the columns whose names start with input_prefix ('latent_'xx, where xx is the 0-based index of the latent vector)
        # Example (8 dimensions: h0, h1, ..., h7):
        # relative_path northing [m] easting [m] ... latitude [deg] longitude [deg] recon_loss h0 h1 h2 h3 h4 h5 h6 h7
        n_latents = len(df.filter(regex=input_prefix).columns)
        Console.info("Latent dimensions: ", n_latents)

        latent_df = df.filter(regex=input_prefix)
        target_df = df[target_key]
        Console.info("Latent size: ", latent_df.shape)

        np_latent = latent_df.to_numpy(dtype='float')
        np_target = target_df.to_numpy(dtype='float')
        np_uuid = df[matching_key].to_numpy()
        # input-output datasets are linked using the key provided by matching_key
        return np_latent, np_target, np_uuid
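
For reference, a hedged usage sketch of load_toydataset, assuming a single CSV that already contains both the latent columns and the target; the file path and the 'uuid' matching column are placeholders taken from the commented-out call in Example #3:

toy_csv = "data/toy_dataset.csv"  # hypothetical path: one CSV with latent_* columns, mean_slope and a uuid column

X, y, uuids = CustomDataloader.load_toydataset(toy_csv,
                                               target_key='mean_slope',
                                               input_prefix='latent_',
                                               matching_key='uuid')
print("Loaded", X.shape[0], "samples with", X.shape[1], "latent dimensions")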
Example #3
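main() below assumes a BayesianRegressor class whose sample_elbo(inputs, labels, criterion, sample_nbr, complexity_cost_weight) method matches the variational_estimator API of the blitz-bayesian-pytorch library. A minimal sketch of what such a class could look like; the hidden width and layer count are assumptions, not taken from the original code:

import torch
import torch.nn as nn
from blitz.modules import BayesianLinear
from blitz.utils import variational_estimator

@variational_estimator          # decorator adds sample_elbo(), used in the training loop below
class BayesianRegressor(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        # hidden width of 128 is an arbitrary choice for this sketch
        self.blinear1 = BayesianLinear(input_dim, 128)
        self.blinear2 = BayesianLinear(128, output_dim)

    def forward(self, x):
        x = torch.relu(self.blinear1(x))
        return self.blinear2(x)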
def main(args=None):
    parser = argparse.ArgumentParser()
    add_arguments(parser)

    if len(sys.argv) == 1 and args is None:  # no arguments passed? some parameters were expected
        # Show help if no args provided
        parser.print_help(sys.stderr)
        sys.exit(2)
    args = parser.parse_args(args)  # retrieve parsed arguments

    Console.info("Bayesian Neural Network for hi-res inference from low res acoustic priors (LGA-Bathymetry)")
    # let's check if input files exist
    if os.path.isfile(args.target):
        Console.info("Target input file: ", args.target)
    else:
        Console.error("Target input file [" + args.target + "] not found. Please check the provided input path (-t, --target)")

    if os.path.isfile(args.latent):
        Console.info("Latent input file: ", args.latent)
    else:
        Console.error("Latent input file [" + args.latent + "] not found. Please check the provided input path (-l, --latent)")
    # check for pre-trained network
    # if output file exists, warn user
    if os.path.isfile(args.network):
        Console.warn("Destination trained network file [", args.network, "] already exists. It will be overwritten (default action)")
    else:
        Console.info("Destination trained network: ", args.network)

    if os.path.isfile(args.output):
        Console.warn("Output file [", args.output, "] already exists. It will be overwritten (default action)")
    else:
        Console.info("Output file: ", args.output)
    # it can be "none"

    if (args.epochs):
        num_epochs = args.epochs
    else:
        num_epochs = 150

    if (args.samples):
        n_samples = args.samples
    else:
        n_samples = 20

    if (args.key):
        col_key = args.key
    else:
        col_key = 'mean_slope'

    if (args.xinput):
        input_key = args.xinput
    else:
        input_key = 'latent_'

    # TODO: extend the arg parser to accept an input file (dataset), a config file, a validation dataset file, and a mode (train, validate, predict)
    Console.info("Geotech landability/measurability predictor from low-res acoustics. Uses Bayesian Neural Networks as predictive engine")
    dataset_filename = args.latent # dataset containing the predictive input. e.g. the latent vector
    target_filename = args.target  # output variable to be predicted, e.g. mean_slope
    # dataset_filename = "data/output-201811-merged-h14.xls"     # dataset containing the predictive input
    # target_filename = "data/target/koyo20181121-stat-r002-slo.csv"  # output variable to be predicted
    Console.info("Loading dataset: " + dataset_filename)

    X, y, index_df = CustomDataloader.load_dataset(dataset_filename, target_filename, matching_key='relative_path', target_key = col_key)    # relative_path is the common key in both tables
    # X, y, index_df = CustomDataloader.load_toydataset(dataset_filename, target_key = col_key, input_prefix= input_key, matching_key='uuid')    # relative_path is the common key in both tables

    Console.info("Data loaded...")
    # y = y/10    #some rescale    WARNING

    #X = X/10.0
    # n_sample = X.shape[0]
    n_latents = X.shape[1]
    # X = StandardScaler().fit_transform(X)
    # y = StandardScaler().fit_transform(np.expand_dims(y, -1)) # this is resizing the array so it can match Size (D,1) expected by pytorch

    # norm = MinMaxScaler().fit(y)
    # y_norm = norm.transform(y)      # min max normalization of our input data
    # y_norm = (y - 5.0)/30.0
    y_norm = y

    norm = MinMaxScaler().fit(X)
    X_norm = norm.transform(X)      # min max normalization of our input data

    print ("X [min,max]", np.amin(X),"/", np.amax(X))
    print ("X_norm [min,max]", np.amin(X_norm),"/", np.amax(X_norm))
    print ("Y [min,max]", np.amin(y),"/", np.amax(y))

    X_train, X_test, y_train, y_test = train_test_split(X_norm,
                                                        y_norm,
                                                        test_size=.25, # 3:1 ratio
                                                        shuffle = True) 

    X_train, y_train = torch.tensor(X_train).float(), torch.tensor(y_train).float()
    X_test, y_test = torch.tensor(X_test).float(), torch.tensor(y_test).float()

    y_train = torch.unsqueeze(y_train, -1)  # PyTorch will complain if we feed an (N,) tensor rather than an (N, 1) tensor,
    y_test = torch.unsqueeze(y_test, -1)    # so we add an extra singleton dimension
    # sys.exit(1)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    regressor = BayesianRegressor(n_latents, 1).to(device)  # Single output being predicted
    # regressor.init
    optimizer = optim.Adam(regressor.parameters(), lr=0.002) # learning rate
    criterion = torch.nn.MSELoss()

    # print("Model's state_dict:")
    # for param_tensor in regressor.state_dict():
    #     print(param_tensor, "\t", regressor .state_dict()[param_tensor].size())

    ds_train = torch.utils.data.TensorDataset(X_train, y_train)
    dataloader_train = torch.utils.data.DataLoader(ds_train, batch_size=16, shuffle=True)

    ds_test = torch.utils.data.TensorDataset(X_test, y_test)
    dataloader_test = torch.utils.data.DataLoader(ds_test, batch_size=16, shuffle=True)

    iteration = 0
    # Training time
    test_hist = []
    uncert_hist = []
    train_hist = []
    fit_hist = []
    ufit_hist = []

    elbo_kld = 1.0

    print ("ELBO KLD factor: ", elbo_kld/X_train.shape[0]);

    for epoch in range(num_epochs):
        train_loss = []
        for i, (datapoints, labels) in enumerate(dataloader_train):
            optimizer.zero_grad()
            
            loss = regressor.sample_elbo(inputs=datapoints.to(device),
                            labels=labels.to(device),
                            criterion=criterion,    # MSELoss
                            sample_nbr=n_samples,
                            complexity_cost_weight=elbo_kld/X_train.shape[0])  # normalize the complexity cost by the number of input points
            loss.backward() # the returned loss combines the fit loss (MSELoss) and the complexity cost (KL divergence against the prior)
            optimizer.step()
            train_loss.append(loss.item())
            
        test_loss = []
        fit_loss = []

        for k, (test_datapoints, test_labels) in enumerate(dataloader_test):
            sample_loss = regressor.sample_elbo(inputs=test_datapoints.to(device),
                                labels=test_labels.to(device),
                                criterion=criterion,
                                sample_nbr=n_samples,
                                complexity_cost_weight=elbo_kld/X_test.shape[0])

            fit_loss_sample = regressor.sample_elbo(inputs=test_datapoints.to(device),
                                labels=test_labels.to(device),
                                criterion=criterion,
                                sample_nbr=n_samples,
                                complexity_cost_weight=0)   # we are interested in the reconstruction/prediction loss only (no KL cost)

            test_loss.append(sample_loss.item())
            fit_loss.append(fit_loss_sample.item())

        mean_test_loss = statistics.mean(test_loss)
        stdv_test_loss = statistics.stdev(test_loss)

        mean_train_loss = statistics.mean(train_loss)

        mean_fit_loss = statistics.mean(fit_loss)
        stdv_fit_loss = statistics.stdev(fit_loss)

        Console.info("Epoch [" + str(epoch) + "] Train loss: {:.4f}".format(mean_train_loss) + " Valid. loss: {:.4f}".format(mean_test_loss) + " Fit loss: {:.4f}  ***".format(mean_fit_loss) )
        Console.progress(epoch, num_epochs)

        test_hist.append(mean_test_loss)
        uncert_hist.append(stdv_test_loss)
        train_hist.append(mean_train_loss)

        fit_hist.append(mean_fit_loss)
        ufit_hist.append(stdv_fit_loss)

        # train_hist.append(statistics.mean(train_loss))

        # if (epoch % 50) == 0:   # every 50 epochs, we save a network snapshot
        #     temp_name = "bnn_model_" + str(epoch) + ".pth"
        #     torch.save(regressor.state_dict(), temp_name)

    Console.info("Training completed!")
    # torch.save(regressor.state_dict(), "bnn_model_N" + str (num_epochs) + ".pth")
    torch.save(regressor.state_dict(), args.network)

    export_df = pd.DataFrame([train_hist, test_hist, uncert_hist, fit_hist, ufit_hist]).transpose()
    export_df.columns = ['train_error', 'test_error', 'test_error_stdev', 'test_loss', 'test_loss_stdev']

    print ("head", export_df.head())
    output_name = "bnn_training_S" + str(n_samples) + "_E" + str(num_epochs) + "_H" + str(n_latents) + ".csv"
    export_df.to_csv(output_name)
    # export_df.to_csv("bnn_train_report.csv")
    # df = pd.read_csv(input_filename, index_col=0) # use 1st column as ID, the 2nd (relative_path) can be used as part of UUID

    # Once trained, we start inferring
    expected = []
    uncertainty = []
    predicted = [] # == y

    Console.info("testing predictions...")
    idx = 0 
    # for x in X_test:
    Xp_ = torch.tensor(X_norm).float()

    for x in Xp_:
        predictions = []
        for n in range(n_samples):
            p = regressor(x.to(device)).item()
            # print ("p.type", type(p)) ----> float
            # print ("p.len", len(p))
            predictions.append(p)  # 1D output, retrieve the single item

        # print ("pred.type", type(predictions))
        # print ("pred.len", len(predictions))    ---> 10 (n_samples)

        p_mean = statistics.mean(predictions)
        p_stdv = statistics.stdev(predictions)
        idx = idx + 1

        # print ("p_mean", type(p_mean))  --> float

        predicted.append(p_mean)
        uncertainty.append(p_stdv)


        Console.progress(idx, len(Xp_))

    # print ("predicted:" , predicted)
    # print ("predicted.type", type(predicted))
    # print ("predicted.len", len(predicted))
    # print ("X.len:" , len(X_test))
    # y_list = y_train.squeeze().tolist()
    y_list = y_norm.squeeze().tolist()
    # y_list = y_test.squeeze().tolist()

    # y_list = [element.item() for element in y_test.flatten()]

    xl = np.squeeze(X_norm).tolist()

    # print ("y_list.len", len(y_list))
    # predicted.len = X.len (as desired)
    # pred_df  = pd.DataFrame ([xl, y_list, predicted, uncertainty, index_df]).transpose()
    pred_df  = pd.DataFrame ([y_list, predicted, uncertainty, index_df]).transpose()
    # pred_df  = pd.DataFrame ([y_list, predicted, uncertainty, index_df.values.tolist() ]).transpose()
    # pred_df.columns = ['Xp_', 'y', 'predicted', 'uncertainty', 'index']
    pred_df.columns = ['y', 'predicted', 'uncertainty', 'index']

    output_name = "bnn_predictions_S" + str(n_samples) + "_E" + str(num_epochs) + "_H" + str(n_latents) + ".csv"
    # output_name = args.output
    pred_df.to_csv(output_name)
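
The examples call add_arguments(parser) but do not include its definition. Based on the attributes main() reads from args, a hedged sketch of what it might look like; only -t/--target and -l/--latent short flags are confirmed by the error messages above, so the remaining flag spellings, defaults and help strings are assumptions:

def add_arguments(parser):
    # Flags inferred from how main() uses args
    parser.add_argument('-t', '--target', type=str, help='CSV with the target variable (e.g. mean_slope)')
    parser.add_argument('-l', '--latent', type=str, help='CSV with the latent vectors used as predictive input')
    parser.add_argument('--network', type=str, help='Destination file for the trained network')
    parser.add_argument('--output', type=str, help='Output file for predictions (can be "none")')
    parser.add_argument('--epochs', type=int, help='Number of training epochs (default: 150)')
    parser.add_argument('--samples', type=int, help='Number of Monte Carlo samples per ELBO/prediction step (default: 20)')
    parser.add_argument('--key', type=str, help="Target column name (default: 'mean_slope')")
    parser.add_argument('--xinput', type=str, help="Prefix of the input (latent) columns (default: 'latent_')")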