def plot_train_(self, model, x_train, y_train, x_val, y_val):
    """Plot the learning curve of the BDT. Function used internally."""
    plt.figure()
    train_errors, val_errors = [], []
    data_size = len(x_train)
    size_per_event = int(data_size / self.config.train_events)
    step = int(data_size / self.config.train_plot_npoints)
    checkpoints = np.arange(start=size_per_event, stop=data_size, step=step)
    for ind, checkpoint in enumerate(checkpoints):
        model.fit(x_train[:checkpoint], y_train[:checkpoint])
        y_train_predict = model.predict(x_train[:checkpoint])
        y_val_predict = model.predict(x_val)
        train_errors.append(
            mean_squared_error(y_train_predict, y_train[:checkpoint]))
        val_errors.append(mean_squared_error(y_val_predict, y_val))
        if ind in (0, self.config.train_plot_npoints // 2,
                   self.config.train_plot_npoints - 1):
            self.plot_results_(y_train[:checkpoint], y_train_predict,
                               "train-%d" % ind)
            self.plot_results_(y_val, y_val_predict, "val-%d" % ind)
        self.config.logger.info("Memory usage during plot train")
        log_total_memory_usage()
    plt.plot(checkpoints, np.sqrt(train_errors), ".", label="train")
    plt.plot(checkpoints, np.sqrt(val_errors), ".", label="validation")
    plt.ylim([0, np.amax(np.sqrt(val_errors)) * 2])
    plt.title("Learning curve BDT")
    plt.xlabel("Training set size")
    plt.ylabel("RMSE")
    plt.legend(loc="lower left")
    plt.savefig("%s/learning_plot_%s_nEv%d.png" %
                (self.config.dirplots, self.config.suffix,
                 self.config.train_events))
    plt.clf()
def apply(self):
    """Apply the trained model to the apply data and plot the results."""
    self.config.logger.info("XGBoostOptimiser::apply, input size: %d",
                            self.config.dim_input)
    loaded_model = self.load_model()
    inputs, exp_outputs = self.get_data_("apply")
    log_memory_usage(
        ((inputs, "Input apply data"), (exp_outputs, "Output apply data")))
    log_total_memory_usage("Memory usage after loading apply data")
    start = timer()
    pred_outputs = loaded_model.predict(inputs)
    end = timer()
    log_time(start, end, "actual predict")
    self.plot_apply_(exp_outputs, pred_outputs)
    self.config.logger.info("Done apply")
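# --- Illustrative sketch: persisting a trained model with the native
# XGBoost sklearn-API calls. Whether this project's save_model() /
# load_model() helpers wrap exactly these calls is an assumption; the
# file name and data below are hypothetical.
#
# import numpy as np
# from xgboost import XGBRFRegressor
#
# x = np.random.rand(100, 3)
# y = np.random.rand(100)
# model = XGBRFRegressor(n_estimators=10)
# model.fit(x, y)
# model.save_model("xgbmodel.json")        # serialise the trained model
#
# restored = XGBRFRegressor()
# restored.load_model("xgbmodel.json")     # ready for predict(), cf. apply()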
def __plot_train(self, model, x_train, y_train, x_val, y_val):
    """
    Plot the learning curve for 1D calibration. Function used internally.

    :param xgboost.sklearn.XGBModel model: the XGBoost model to be checked
    :param np.ndarray x_train: input data for training
    :param np.ndarray y_train: expected training output
    :param np.ndarray x_val: input data for validation
    :param np.ndarray y_val: expected validation output
    """
    plt.figure()
    train_errors, val_errors = [], []
    data_size = len(x_train)
    size_per_event = int(data_size / self.config.train_events)
    step = int(data_size / self.config.train_plot_npoints)
    checkpoints = np.arange(start=size_per_event, stop=data_size, step=step)
    for ind, checkpoint in enumerate(checkpoints):
        model.fit(x_train[:checkpoint], y_train[:checkpoint])
        y_train_predict = model.predict(x_train[:checkpoint])
        y_val_predict = model.predict(x_val)
        train_errors.append(
            mean_squared_error(y_train_predict, y_train[:checkpoint]))
        val_errors.append(mean_squared_error(y_val_predict, y_val))
        if ind in (0, self.config.train_plot_npoints // 2,
                   self.config.train_plot_npoints - 1):
            self.__plot_results(y_train[:checkpoint], y_train_predict,
                                "train-%d" % ind)
            self.__plot_results(y_val, y_val_predict, "val-%d" % ind)
        self.config.logger.info("Memory usage during plot train")
        log_total_memory_usage()
    plt.plot(checkpoints, np.sqrt(train_errors), ".", label="train")
    plt.plot(checkpoints, np.sqrt(val_errors), ".", label="validation")
    plt.ylim([0, np.amax(np.sqrt(val_errors)) * 2])
    plt.title("Learning curve BDT")
    plt.xlabel("Training set size")
    plt.ylabel("RMSE")
    plt.legend(loc="lower left")
    plt.savefig("%s/learning_plot_%s_nEv%d.png" %
                (self.config.dirplots, self.config.suffix,
                 self.config.train_events))
    plt.clf()
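# --- Illustrative sketch (not part of the class): the checkpoint
# arithmetic used by the learning curve above. The model is refit on
# growing slices of the training set and RMSE = sqrt(MSE) is plotted per
# slice. All numbers here are hypothetical, chosen only for illustration.
#
# import numpy as np
#
# data_size = 10_000        # len(x_train)
# train_events = 10         # self.config.train_events
# train_plot_npoints = 5    # self.config.train_plot_npoints
# size_per_event = data_size // train_events   # rows per event: 1000
# step = data_size // train_plot_npoints       # slice growth: 2000
# checkpoints = np.arange(start=size_per_event, stop=data_size, step=step)
# # -> array([1000, 3000, 5000, 7000, 9000]); one fit per checkpoint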
def train(self):
    """Train the optimizer."""
    self.config.logger.info("XGBoostOptimiser::train")
    model = XGBRFRegressor(verbosity=1, **(self.config.params))
    start = timer()
    inputs, exp_outputs = self.get_data_("train")
    end = timer()
    log_time(start, end, "for loading training data")
    log_memory_usage(
        ((inputs, "Input train data"), (exp_outputs, "Output train data")))
    log_total_memory_usage("Memory usage after loading data")
    if self.config.plot_train:
        inputs_val, outputs_val = self.get_data_("validation")
        log_memory_usage(((inputs_val, "Input val data"),
                          (outputs_val, "Output val data")))
        log_total_memory_usage("Memory usage after loading val data")
        self.plot_train_(model, inputs, exp_outputs, inputs_val, outputs_val)
    start = timer()
    model.fit(inputs, exp_outputs)
    end = timer()
    log_time(start, end, "actual train")
    self.save_model(model)
def train(self):
    """
    Train the optimizer.
    """
    self.config.logger.info("XGBoostOptimiser::train")
    if self.config.dim_output > 1:
        logger = get_logger()
        logger.fatal("Only one distortion can be predicted, but"
                     " dim_output is bigger than 1.")
    model = XGBRFRegressor(verbosity=1, **(self.config.params))
    start = timer()
    inputs, exp_outputs, *_ = self.__get_data("train")
    end = timer()
    log_time(start, end, "for loading training data")
    log_memory_usage(
        ((inputs, "Input train data"), (exp_outputs, "Output train data")))
    log_total_memory_usage("Memory usage after loading data")
    if self.config.plot_train:
        inputs_val, outputs_val, *_ = self.__get_data("validation")
        log_memory_usage(((inputs_val, "Input validation data"),
                          (outputs_val, "Output validation data")))
        log_total_memory_usage("Memory usage after loading validation data")
        self.__plot_train(model, inputs, exp_outputs, inputs_val,
                          outputs_val)
    start = timer()
    model.fit(inputs, exp_outputs)
    end = timer()
    log_time(start, end, "actual train")
    model.get_booster().feature_names = get_input_names_oned_idc(
        self.config.opt_usederivative, self.config.num_fourier_coeffs_train)
    self.__plot_feature_importance(model)
    self.save_model(model)
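# --- Illustrative sketch: a minimal XGBRFRegressor round trip on synthetic
# data, mirroring the fit / predict / feature-name pattern in train() above.
# The data, parameter values and feature names are hypothetical stand-ins.
#
# import numpy as np
# from xgboost import XGBRFRegressor
#
# rng = np.random.default_rng(42)
# x = rng.normal(size=(1000, 4))               # 4 input features
# y = x @ np.array([0.5, -1.0, 2.0, 0.0])      # toy linear target
#
# model = XGBRFRegressor(n_estimators=50, max_depth=4, verbosity=1)
# model.fit(x, y)
# model.get_booster().feature_names = ["r", "phi", "z", "idc0"]
# pred = model.predict(x)
# print("train RMSE:", np.sqrt(np.mean((pred - y) ** 2)))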
def main():
    """
    The global main function
    """
    logger = get_logger()
    logger.info("Starting TPC ML...")
    log_total_memory_usage("Initial memory usage")

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("-c", "--config", dest="config_file",
                        default="config_model_parameters.yml", type=str,
                        help="path to the *.yml configuration file")
    parser.add_argument("-s", "--steer", dest="steer_file",
                        default="default.yml", type=str,
                        help="path to the *.yml steering file")
    # parameters for steer file
    parser.add_argument("--dotrain", action='store_true',
                        default=argparse.SUPPRESS,
                        help="Perform the training")
    parser.add_argument("--docreateinputdata", action='store_true',
                        default=argparse.SUPPRESS,
                        help="Create input data trees")
    parser.add_argument("--docreatevaldata", action='store_true',
                        default=argparse.SUPPRESS,
                        help="Create validation data trees")
    # parameters for config file
    parser.add_argument("--rndaugment", action='store_true',
                        default=argparse.SUPPRESS,
                        help="Use random-random augmentation for training")
    parser.add_argument("--ntrain1d", dest='train_events_oned', type=int,
                        default=argparse.SUPPRESS,
                        help="Set custom number of training events")
    parser.add_argument("--nval", dest='val_events', type=int,
                        default=argparse.SUPPRESS,
                        help="Set custom number of validation events")
    parser.add_argument("--frac", dest='downsample_fraction', type=float,
                        default=argparse.SUPPRESS,
                        help="Set downsampling fraction if --downsample is set")
    parser.add_argument("--nestimators", dest='n_estimators', type=int,
                        default=argparse.SUPPRESS,
                        help="Set the number of trees for xgboost models")
    parser.add_argument("--maxdepth", dest='max_depth', type=int,
                        default=argparse.SUPPRESS,
                        help="Set maximum depth of trees for xgboost models")
    args = parser.parse_args()
    logger.info("Using configuration: %s steer file: %s", args.config_file,
                args.steer_file)

    with open(args.steer_file, "r") as steer_data:
        default = yaml.safe_load(steer_data)
    with open(args.config_file, "r") as config_data:
        config_parameters = yaml.safe_load(config_data)

    logger.info("Arguments provided: %s", str(args))
    if "dotrain" in args:
        default['dotrain'] = True
    if "docreateinputdata" in args or "docreatevaldata" in args:
        default['docreatevaldata'] = True
        config_parameters['common']['nd_validate_model'] = False
    if "docreatevaldata" in args:
        config_parameters['common']['nd_validate_model'] = True
    if "rndaugment" in args:
        config_parameters['common']['rnd_augment'] = True
    if "train_events_oned" in args:
        config_parameters['xgboost']['train_events'] = [args.train_events_oned]
    if "val_events" in args:
        config_parameters['common']['val_events'] = args.val_events
    if "downsample_fraction" in args:
        config_parameters['xgboost']['downsample'] = True
        config_parameters['xgboost'][
            'downsample_fraction'] = args.downsample_fraction
    if "n_estimators" in args:
        config_parameters['xgboost']['params'][
            'n_estimators'] = args.n_estimators
    if "max_depth" in args:
        config_parameters['xgboost']['params']['max_depth'] = args.max_depth

    models, corr, dataval = init_models(config_parameters)
    events_counts = (get_events_counts(
        config_parameters[model.name]["train_events"],
        config_parameters[model.name]["validation_events"],
        config_parameters[model.name]["apply_events"]) for model in models)

    ranges_rnd = config_parameters["common"]["range_rnd_index_train"]
    ranges_mean = config_parameters["common"]["range_mean_index"]
    if config_parameters["common"]["rnd_augment"]:
        max_available_events = (ranges_rnd[1] + 1 - ranges_rnd[0]) * \
            (ranges_rnd[1] - ranges_rnd[0])
    else:
        max_available_events = (ranges_rnd[1] + 1 - ranges_rnd[0]) * \
            (ranges_mean[1] + 1 - ranges_mean[0])

    for model, model_events_counts in zip(models, events_counts):
        all_events_counts = []
        for (train_events, val_events, apply_events) in model_events_counts:
            total_events = train_events + val_events + apply_events
            if total_events > max_available_events:
                logger.warning(
                    "Too many events requested: %d, available: %d",
                    total_events, max_available_events)
                continue
            all_events_counts.append(
                (train_events, val_events, apply_events, total_events))
            ranges = {
                "train": [0, train_events],
                "val": [train_events, train_events + val_events],
                "apply": [train_events + val_events, total_events]
            }
            model.config.set_ranges(ranges, total_events, train_events,
                                    val_events, apply_events)
            run_model_and_val(model, dataval, default,
                              config_parameters["common"])
        # TODO: apply the correction and save in files
        if corr is not None:
            pass
        if default["doprofile"] is True:
            model.draw_profile(all_events_counts)
    logger.info("Program finished.")
def main():
    """
    The global main function
    """
    logger = get_logger()
    logger.info("Starting TPC ML...")
    log_total_memory_usage("Initial memory usage")

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("-c", "--config", dest="config_file",
                        default="config_model_parameters.yml", type=str,
                        help="path to the *.yml configuration file")
    parser.add_argument("-s", "--steer", dest="steer_file",
                        default="default.yml", type=str,
                        help="path to the *.yml steering file")
    # parameters for steer file
    parser.add_argument("--dotrain", action="store_true",
                        default=argparse.SUPPRESS,
                        help="Perform the training")
    parser.add_argument("--docreatendvaldata", action="store_true",
                        default=argparse.SUPPRESS,
                        help="Create validation data trees")
    parser.add_argument("--docache", action="store_true",
                        default=argparse.SUPPRESS,
                        help="Cache training data if not already existing")
    # parameters for config file
    parser.add_argument("--rndaugment", action="store_true",
                        default=argparse.SUPPRESS,
                        help="Use random-random augmentation for training")
    parser.add_argument("--ntrain1d", dest="train_events_oned", type=int,
                        default=argparse.SUPPRESS,
                        help="Set custom number of training events")
    parser.add_argument("--nval", dest="nd_val_events", type=int,
                        default=argparse.SUPPRESS,
                        help="Set custom number of max nd validation events")
    parser.add_argument("--dnpoints", dest="downsample_npoints", type=int,
                        default=argparse.SUPPRESS,
                        help="Set number of downsampling points")
    parser.add_argument("--nestimators", dest="n_estimators", type=int,
                        default=argparse.SUPPRESS,
                        help="Set the number of trees for xgboost models")
    parser.add_argument("--maxdepth", dest="max_depth", type=int,
                        default=argparse.SUPPRESS,
                        help="Set maximum depth of trees for xgboost models")
    parser.add_argument("--nfftidcs", dest="num_fft_idcs", type=int,
                        default=argparse.SUPPRESS,
                        help="Set number of 1D IDCs used for the FFT."
                        " Corresponds to the ion drift time (ms) used"
                        " in simulation.")
    parser.add_argument("--nfouriertrain", dest="num_fourier_coeffs_train",
                        type=int, default=argparse.SUPPRESS,
                        help="Set number of Fourier coefficients to take"
                        " from the 1D IDC train input")
    parser.add_argument("--nfourierapply", dest="num_fourier_coeffs_apply",
                        type=int, default=argparse.SUPPRESS,
                        help="Set number of Fourier coefficients to take"
                        " from the 1D IDC apply input")
    # parameters for caching
    parser.add_argument("--cache-events", dest="cache_events", type=int,
                        default=argparse.SUPPRESS,
                        help="Set the number of events to cache")
    parser.add_argument("--cache-train", action="store_true",
                        default=argparse.SUPPRESS,
                        help="Use cached data for training")
    parser.add_argument("--cache-file-size", dest="cache_file_size", type=int,
                        default=argparse.SUPPRESS,
                        help="Set the number of events per single temporary"
                        " cache file")
    args = parser.parse_args()
    logger.info("Using configuration: %s steer file: %s", args.config_file,
                args.steer_file)

    with open(args.steer_file, "r", encoding="utf-8") as steer_data:
        default = yaml.safe_load(steer_data)
    with open(args.config_file, "r", encoding="utf-8") as config_data:
        config_parameters = yaml.safe_load(config_data)

    logger.info("Arguments provided: %s", str(args))
    if "dotrain" in args:
        default["dotrain"] = True
    if "docreatendvaldata" in args:
        default["docreatendvaldata"] = True
    if "docache" in args:
        default["docache"] = True

    if "rndaugment" in args:
        config_parameters["common"]["rnd_augment"] = True
    if "train_events_oned" in args:
        config_parameters["xgboost"]["train_events"] = [args.train_events_oned]
    if "nd_val_events" in args:
        config_parameters["common"]["nd_val_events"] = args.nd_val_events
    if "downsample_npoints" in args:
        config_parameters["xgboost"]["downsample"] = True
        config_parameters["xgboost"][
            "downsample_npoints"] = args.downsample_npoints
    if "n_estimators" in args:
        config_parameters["xgboost"]["params"][
            "n_estimators"] = args.n_estimators
    if "max_depth" in args:
        config_parameters["xgboost"]["params"]["max_depth"] = args.max_depth
    if "num_fft_idcs" in args:
        config_parameters["common"]["num_fft_idcs"] = args.num_fft_idcs
    if "num_fourier_coeffs_train" in args:
        config_parameters["common"][
            "num_fourier_coeffs_train"] = args.num_fourier_coeffs_train
    if "num_fourier_coeffs_apply" in args:
        config_parameters["common"][
            "num_fourier_coeffs_apply"] = args.num_fourier_coeffs_apply

    if "cache_events" in args:
        config_parameters["xgboost"]["cache_events"] = args.cache_events
    if "cache_train" in args:
        config_parameters["xgboost"]["cache_train"] = True
    if "cache_file_size" in args:
        config_parameters["xgboost"]["cache_file_size"] = args.cache_file_size

    models, corr, dataval = init_models(config_parameters)
    events_counts = (get_events_counts(
        config_parameters[model.name]["train_events"],
        config_parameters[model.name]["validation_events"],
        config_parameters[model.name]["apply_events"]) for model in models)

    ranges_rnd = config_parameters["common"]["range_rnd_index_train"]
    ranges_mean = config_parameters["common"]["range_mean_index"]
    if config_parameters["common"]["rnd_augment"]:
        max_available_events = (ranges_rnd[1] + 1 - ranges_rnd[0]) * \
            (ranges_rnd[1] - ranges_rnd[0])
    else:
        max_available_events = (ranges_rnd[1] + 1 - ranges_rnd[0]) * \
            (ranges_mean[1] + 1 - ranges_mean[0])

    for model in models:
        if default["docache"] is True and model.name == "xgboost":
            start = timer()
            model.cache_train_data()
            end = timer()
            log_time(start, end, "cache")

    for model, model_events_counts in zip(models, events_counts):
        all_events_counts = []
        for (train_events, val_events, apply_events) in model_events_counts:
            total_events = train_events + val_events + apply_events
            if total_events > max_available_events:
                logger.warning(
                    "Too many events requested: %d, available: %d",
                    total_events, max_available_events)
                continue
            all_events_counts.append(
                (train_events, val_events, apply_events, total_events))
            ranges = {
                "train": [0, train_events],
                "validation": [train_events, train_events + val_events],
                "apply": [train_events + val_events, total_events]
            }
            model.config.set_ranges(ranges, total_events, train_events,
                                    val_events, apply_events)
            run_model_and_val(model, dataval, default,
                              config_parameters["common"])
        # TODO: apply the correction and save in files
        if corr is not None:
            pass
        if default["doprofile"] is True:
            model.draw_profile(all_events_counts)
    logger.info("Program finished.")
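# --- Example invocations of the steering script above. The script name is
# hypothetical and the flag values are placeholders; the flags themselves
# are the ones defined in main().
#
#   python steer_analysis.py -c config_model_parameters.yml -s default.yml \
#       --dotrain --ntrain1d 1000 --nestimators 200 --maxdepth 6
#   python steer_analysis.py --docache --cache-events 5000 --cache-train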