def load_dataset(input_filename, target_filename, matching_key='relative_path',
                 target_key='mean_slope', latent_name_prefix='latent_'):
    Console.info("load_dataset called for: ", input_filename)
    # Use the 1st column as ID; the 2nd (relative_path) can be used as part of the UUID
    df = pd.read_csv(input_filename, index_col=0)

    # 1) Data validation: remove invalid entries (e.g. NaN)
    print(df.head())
    df = df.dropna()
    Console.info("Total valid entries: ", len(df))
    # df.reset_index(drop=True)  # not sure if we prefer to reset the index, as the column index was externally defined

    # 2) Determine the number of latent-space dimensions.
    # The 'features' are the columns whose names start with latent_name_prefix,
    # followed by the 0-based index of the h-latent space vector.
    # Example (8 dimensions: h0, h1, ..., h7):
    # relative_path northing [m] easting [m] ... latitude [deg] longitude [deg] recon_loss h0 h1 h2 h3 h4 h5 h6 h7
    n_latents = len(df.filter(regex=latent_name_prefix).columns)
    Console.info("Latent dimensions: ", n_latents)

    # 3) Key matching.
    # Each 'relative_path' entry has the format slo/20181121_depthmap_1050_0251_no_slo.tif
    # where the filename is composed of [date_type_tilex_tiley_mod_type].
    # Input and target tables differ only in the 'type' field, so match on the common base name via regex.
    df['filename_base'] = df[matching_key].str.extract(r'(?:/)(.*_)')  # could probably be done in a single regex
    df['filename_base'] = df['filename_base'].str.rstrip('_')

    # Expected header: relative_path mean_slope [...] mean_rugosity
    tdf = pd.read_csv(target_filename)
    tdf = tdf.dropna()
    # target_key='mean_rugosity'
    tdf['filename_base'] = tdf[matching_key].str.extract(r'(?:/)(.*_)')
    tdf['filename_base'] = tdf['filename_base'].str.rstrip('_r002')  # note: rstrip removes trailing characters from the set {_, r, 0, 2}
    # print(tdf.head())
    Console.info("Target entries: ", len(tdf))

    merged_df = pd.merge(df, tdf, how='right', on='filename_base')
    merged_df = merged_df.dropna()

    latent_df = merged_df.filter(regex=latent_name_prefix)
    Console.info("Latent size: ", latent_df.shape)
    target_df = merged_df[target_key]

    np_latent = latent_df.to_numpy(dtype='float')
    np_target = target_df.to_numpy(dtype='float')
    # Input and output datasets are linked using the key provided by matching_key
    return np_latent, np_target, merged_df['filename_base']
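
# A small, hedged illustration of the key-matching step in load_dataset, using the example
# path from the comment above. The values here are illustrative only, and this helper is
# not called anywhere in the pipeline.
def _key_matching_example():
    demo = pd.DataFrame({'relative_path': ['slo/20181121_depthmap_1050_0251_no_slo.tif']})
    base = demo['relative_path'].str.extract(r'(?:/)(.*_)', expand=False).str.rstrip('_')
    print(base.iloc[0])  # -> 20181121_depthmap_1050_0251_no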
def load_toydataset(input_filename, target_key='mean_slope', input_prefix='latent_',
                    matching_key='relative_path'):
    Console.info("load_toydataset called for: ", input_filename)
    # Use the 1st column as ID; the 2nd (relative_path) can be used as part of the UUID
    df = pd.read_csv(input_filename, index_col=0)

    # 1) Data validation: remove invalid entries (e.g. NaN)
    print(df.head())
    df = df.dropna()
    Console.info("Total valid entries: ", len(df))
    # df.reset_index(drop=True)  # not sure if we prefer to reset the index, as the column index was externally defined

    # 2) Determine the number of latent-space dimensions.
    # The 'features' are the columns whose names start with input_prefix,
    # followed by the 0-based index of the h-latent space vector.
    # Example (8 dimensions: h0, h1, ..., h7):
    # relative_path northing [m] easting [m] ... latitude [deg] longitude [deg] recon_loss h0 h1 h2 h3 h4 h5 h6 h7
    n_latents = len(df.filter(regex=input_prefix).columns)
    Console.info("Latent dimensions: ", n_latents)

    latent_df = df.filter(regex=input_prefix)
    target_df = df[target_key]
    Console.info("Latent size: ", latent_df.shape)

    np_latent = latent_df.to_numpy(dtype='float')
    np_target = target_df.to_numpy(dtype='float')
    np_uuid = df[matching_key].to_numpy()
    # Input and output datasets are linked using the key provided by matching_key
    return np_latent, np_target, np_uuid
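
# A minimal, self-contained usage sketch for load_toydataset. The column values and the
# 'toy_latents_example.csv' file name are made up for illustration; the table layout
# (index in the first column, 'relative_path', 'latent_*' and 'mean_slope' columns)
# follows what the loader above expects.
if __name__ == '__main__':
    toy = pd.DataFrame({
        'relative_path': ['tile_000', 'tile_001', 'tile_002', 'tile_003'],
        'latent_0': [0.1, 0.4, 0.7, 0.9],
        'latent_1': [1.2, 0.8, 0.3, 0.5],
        'mean_slope': [3.5, 7.1, 12.4, 9.8],
    })
    toy.to_csv('toy_latents_example.csv')  # index is written as the first column
    X, y, uuids = load_toydataset('toy_latents_example.csv', target_key='mean_slope',
                                  input_prefix='latent_', matching_key='relative_path')
    print("X:", X.shape, " y:", y.shape, " keys:", uuids[:2])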
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
"""This temporary script contains a light demonstration of the POC's abilities.

It needs the 'datas' subdirectory, created in this directory, and the
user.yml file included in the repository.
"""

from model import *
from dc.sqlite3 import Sqlite3Connector
from tools.console import Console


class User(Model):

    """A user model."""

    username = String()

    def __repr__(self):
        return "<user id={}, username={}>".format(self.id, repr(self.username))


# Load the stored data
connector = Sqlite3Connector()
Model.data_connector = connector
connector.setup("data.db")
connector.record_tables([User])

console = Console({"Model": Model, "User": User})
console.launch()
def execute(self, namespace):
    """Execute the command."""
    console = ConsoleTool({"server": self.server})
    console.launch()
    self.server.data_connector.loop()
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
# OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
"""This temporary script contains a light demonstration of the POC's abilities.

It needs the 'datas' subdirectory, created in this directory, and the
user.yml file included in the repository.
"""

from dc import connectors
from model import *
from tests.model import User
from tools.console import Console

# Load the stored data
connector = connectors["mongo"]()
Model.data_connector = connector
connector.setup_test()
connector.record_models([User])

console = Console({"Model": Model, "User": User})
console.launch()
def main(args=None):
    parser = argparse.ArgumentParser()
    add_arguments(parser)

    if len(sys.argv) == 1 and args is None:
        # No arguments passed? Some parameters were expected: show the help and exit with an error.
        parser.print_help(sys.stderr)
        sys.exit(2)
    args = parser.parse_args(args)  # retrieve parsed arguments

    Console.info("Bayesian Neural Network for hi-res inference from low-res acoustic priors (LGA-Bathymetry)")

    # Check that the input files exist
    if os.path.isfile(args.target):
        Console.info("Target input file: ", args.target)
    else:
        Console.error("Target input file [" + args.target + "] not found. Please check the provided input path (-t, --target)")

    if os.path.isfile(args.latent):
        Console.info("Latent input file: ", args.latent)
    else:
        Console.error("Latent input file [" + args.latent + "] not found. Please check the provided input path (-l, --latent)")

    # Check for a pre-trained network: if the output file already exists, warn the user
    if os.path.isfile(args.network):
        Console.warn("Destination trained network file [", args.network, "] already exists. It will be overwritten (default action)")
    else:
        Console.info("Destination trained network: ", args.network)

    if os.path.isfile(args.output):
        Console.warn("Output file [", args.output, "] already exists. It will be overwritten (default action)")
    else:
        Console.info("Output file: ", args.output)  # it can be "none"

    num_epochs = args.epochs if args.epochs else 150
    n_samples = args.samples if args.samples else 20
    col_key = args.key if args.key else 'mean_slope'
    input_key = args.xinput if args.xinput else 'latent_'

    # TODO: extend the arg parser to admit an input file (dataset), a config file,
    # a validation dataset file and a mode (train, validate, predict)
    Console.info("Geotech landability/measurability predictor from low-res acoustics. Uses Bayesian Neural Networks as the predictive engine")

    dataset_filename = args.latent  # dataset containing the predictive input, e.g. the latent vector
    target_filename = args.target   # output variable to be predicted, e.g. mean_slope
    # dataset_filename = "data/output-201811-merged-h14.xls"          # dataset containing the predictive input
    # target_filename = "data/target/koyo20181121-stat-r002-slo.csv"  # output variable to be predicted

    Console.info("Loading dataset: " + dataset_filename)
    # 'relative_path' is the common key in both tables
    X, y, index_df = CustomDataloader.load_dataset(dataset_filename, target_filename,
                                                   matching_key='relative_path', target_key=col_key)
    # Toy-dataset variant, keyed on 'uuid':
    # X, y, index_df = CustomDataloader.load_toydataset(dataset_filename, target_key=col_key,
    #                                                   input_prefix=input_key, matching_key='uuid')
    Console.info("Data loaded...")

    # y = y/10    # some rescale WARNING
    # X = X/10.0
    # n_sample = X.shape[0]
    n_latents = X.shape[1]

    # X = StandardScaler().fit_transform(X)
    # y = StandardScaler().fit_transform(np.expand_dims(y, -1))  # resize the array so it matches the (D, 1) size expected by PyTorch
    # norm = MinMaxScaler().fit(y)
    # y_norm = norm.transform(y)  # min-max normalization of the input data
    # y_norm = (y - 5.0)/30.0
    y_norm = y
    norm = MinMaxScaler().fit(X)
    X_norm = norm.transform(X)  # min-max normalization of the input data

    print("X      [min,max]", np.amin(X), "/", np.amax(X))
    print("X_norm [min,max]", np.amin(X_norm), "/", np.amax(X_norm))
    print("Y      [min,max]", np.amin(y), "/", np.amax(y))

    X_train, X_test, y_train, y_test = train_test_split(X_norm, y_norm,
                                                        test_size=0.25,  # 3:1 ratio
                                                        shuffle=True)

    X_train, y_train = torch.tensor(X_train).float(), torch.tensor(y_train).float()
    X_test, y_test = torch.tensor(X_test).float(), torch.tensor(y_test).float()
    y_train = torch.unsqueeze(y_train, -1)  # PyTorch will complain if we feed an (N) tensor rather than an (N, 1) tensor,
    y_test = torch.unsqueeze(y_test, -1)    # so we add an additional dummy dimension
    # sys.exit(1)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    regressor = BayesianRegressor(n_latents, 1).to(device)  # single output being predicted
    # regressor.init
    optimizer = optim.Adam(regressor.parameters(), lr=0.002)  # learning rate
    criterion = torch.nn.MSELoss()

    # print("Model's state_dict:")
    # for param_tensor in regressor.state_dict():
    #     print(param_tensor, "\t", regressor.state_dict()[param_tensor].size())

    ds_train = torch.utils.data.TensorDataset(X_train, y_train)
    dataloader_train = torch.utils.data.DataLoader(ds_train, batch_size=16, shuffle=True)
    ds_test = torch.utils.data.TensorDataset(X_test, y_test)
    dataloader_test = torch.utils.data.DataLoader(ds_test, batch_size=16, shuffle=True)

    iteration = 0

    # Training time
    test_hist = []
    uncert_hist = []
    train_hist = []
    fit_hist = []
    ufit_hist = []

    elbo_kld = 1.0
    print("ELBO KLD factor: ", elbo_kld / X_train.shape[0])

    for epoch in range(num_epochs):
        train_loss = []
        for i, (datapoints, labels) in enumerate(dataloader_train):
            optimizer.zero_grad()
            # The returned loss combines the fit loss (MSELoss) and the complexity cost
            # (KL divergence against the prior), normalized by the number of input points.
            loss = regressor.sample_elbo(inputs=datapoints.to(device),
                                         labels=labels.to(device),
                                         criterion=criterion,  # MSELoss
                                         sample_nbr=n_samples,
                                         complexity_cost_weight=elbo_kld / X_train.shape[0])
            loss.backward()
            optimizer.step()
            train_loss.append(loss.item())

        test_loss = []
        fit_loss = []
        for k, (test_datapoints, test_labels) in enumerate(dataloader_test):
            sample_loss = regressor.sample_elbo(inputs=test_datapoints.to(device),
                                                labels=test_labels.to(device),
                                                criterion=criterion,
                                                sample_nbr=n_samples,
                                                complexity_cost_weight=elbo_kld / X_test.shape[0])
            # For the fit loss we are interested in the reconstruction/prediction loss only (no KL cost)
            fit_loss_sample = regressor.sample_elbo(inputs=test_datapoints.to(device),
                                                    labels=test_labels.to(device),
                                                    criterion=criterion,
                                                    sample_nbr=n_samples,
                                                    complexity_cost_weight=0)
            test_loss.append(sample_loss.item())
            fit_loss.append(fit_loss_sample.item())

        mean_test_loss = statistics.mean(test_loss)
        stdv_test_loss = statistics.stdev(test_loss)
        mean_train_loss = statistics.mean(train_loss)
        mean_fit_loss = statistics.mean(fit_loss)
        stdv_fit_loss = statistics.stdev(fit_loss)

        Console.info("Epoch [" + str(epoch) + "] Train loss: {:.4f}".format(mean_train_loss) +
                     " Valid. loss: {:.4f}".format(mean_test_loss) +
                     " Fit loss: {:.4f} ***".format(mean_fit_loss))
        Console.progress(epoch, num_epochs)

        test_hist.append(mean_test_loss)
        uncert_hist.append(stdv_test_loss)
        train_hist.append(mean_train_loss)
        fit_hist.append(mean_fit_loss)
        ufit_hist.append(stdv_fit_loss)
        # train_hist.append(statistics.mean(train_loss))

        # if (epoch % 50) == 0:  # every 50 epochs, save a network snapshot
        #     temp_name = "bnn_model_" + str(epoch) + ".pth"
        #     torch.save(regressor.state_dict(), temp_name)

    Console.info("Training completed!")

    # torch.save(regressor.state_dict(), "bnn_model_N" + str(num_epochs) + ".pth")
    torch.save(regressor.state_dict(), args.network)

    export_df = pd.DataFrame([train_hist, test_hist, uncert_hist, fit_hist, ufit_hist]).transpose()
    export_df.columns = ['train_error', 'test_error', 'test_error_stdev', 'test_loss', 'test_loss_stdev']
    print("head", export_df.head())

    output_name = "bnn_training_S" + str(n_samples) + "_E" + str(num_epochs) + "_H" + str(n_latents) + ".csv"
    export_df.to_csv(output_name)
    # export_df.to_csv("bnn_train_report.csv")
    # df = pd.read_csv(input_filename, index_col=0)  # use the 1st column as ID; the 2nd (relative_path) can be used as part of the UUID

    # Once trained, we start inferring
    expected = []
    uncertainty = []
    predicted = []  # == y

    Console.info("testing predictions...")
    idx = 0
    # for x in X_test:
    Xp_ = torch.tensor(X_norm).float()
    for x in Xp_:
        predictions = []
        for n in range(n_samples):
            p = regressor(x.to(device)).item()  # 1D output: retrieve the single item (a float)
            predictions.append(p)
        # len(predictions) ---> n_samples
        p_mean = statistics.mean(predictions)
        p_stdv = statistics.stdev(predictions)
        idx = idx + 1

        predicted.append(p_mean)
        uncertainty.append(p_stdv)
        Console.progress(idx, len(Xp_))

    # print("predicted:", predicted)
    # print("X.len:", len(X_test))
    # len(predicted) == len(X), as desired
    # y_list = y_train.squeeze().tolist()
    y_list = y_norm.squeeze().tolist()
    # y_list = y_test.squeeze().tolist()
    # y_list = [element.item() for element in y_test.flatten()]
    xl = np.squeeze(X_norm).tolist()

    # pred_df = pd.DataFrame([xl, y_list, predicted, uncertainty, index_df]).transpose()
    pred_df = pd.DataFrame([y_list, predicted, uncertainty, index_df]).transpose()
    # pred_df = pd.DataFrame([y_list, predicted, uncertainty, index_df.values.tolist()]).transpose()
    # pred_df.columns = ['Xp_', 'y', 'predicted', 'uncertainty', 'index']
    pred_df.columns = ['y', 'predicted', 'uncertainty', 'index']

    output_name = "bnn_predictions_S" + str(n_samples) + "_E" + str(num_epochs) + "_H" + str(n_latents) + ".csv"
    # output_name = args.output
    pred_df.to_csv(output_name)
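
# A hedged sketch (not called anywhere above) of how the network saved with
# torch.save(regressor.state_dict(), args.network) could be reloaded for a later
# inference-only run. The default file name and latent size are illustrative
# assumptions; BayesianRegressor is the same class instantiated in main().
def load_trained_regressor(network_path='bnn_latest.pth', n_latents=8, device='cpu'):
    model = BayesianRegressor(n_latents, 1).to(device)
    model.load_state_dict(torch.load(network_path, map_location=device))
    model.eval()
    # Bayesian layers of this kind typically keep sampling weights at every forward pass,
    # so repeated calls per input still yield the predictive mean/stdev computed in main().
    return model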