Example #1
 def plot_train_(self, model, x_train, y_train, x_val, y_val):
     plt.figure()
     train_errors, val_errors = [], []
     data_size = len(x_train)
     size_per_event = int(data_size / self.config.train_events)
     step = int(data_size / self.config.train_plot_npoints)
     checkpoints = np.arange(start=size_per_event,
                             stop=data_size,
                             step=step)
     for ind, checkpoint in enumerate(checkpoints):
         model.fit(x_train[:checkpoint], y_train[:checkpoint])
         y_train_predict = model.predict(x_train[:checkpoint])
         y_val_predict = model.predict(x_val)
         train_errors.append(
             mean_squared_error(y_train_predict, y_train[:checkpoint]))
         val_errors.append(mean_squared_error(y_val_predict, y_val))
         if ind in (0, self.config.train_plot_npoints // 2,
                    self.config.train_plot_npoints - 1):
             self.plot_results_(y_train[:checkpoint], y_train_predict,
                                "train-%d" % ind)
             self.plot_results_(y_val, y_val_predict, "val-%d" % ind)
     self.config.logger.info("Memory usage during plot train")
     log_total_memory_usage()
     plt.plot(checkpoints, np.sqrt(train_errors), ".", label="train")
     plt.plot(checkpoints, np.sqrt(val_errors), ".", label="validation")
     plt.ylim([0, np.amax(np.sqrt(val_errors)) * 2])
     plt.title("Learning curve BDT")
     plt.xlabel("Training set size")
     plt.ylabel("RMSE")
     plt.legend(loc="lower left")
     plt.savefig("%s/learning_plot_%s_nEv%d.png" %
                 (self.config.dirplots, self.config.suffix,
                  self.config.train_events))
     plt.clf()
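
The method above retrains the model on growing prefixes of the training set and records train/validation error at each checkpoint, plotting RMSE as the square root of the stored MSE values. A minimal standalone sketch of the same learning-curve idea on synthetic data, with a plain scikit-learn regressor standing in for the configured model (the helper name learning_curve_points is illustrative, not part of the code above):

import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

def learning_curve_points(model, x_train, y_train, x_val, y_val, npoints=10):
    # Growing training subsets, as in the checkpoint loop above.
    sizes = np.linspace(len(x_train) // npoints, len(x_train), npoints, dtype=int)
    train_rmse, val_rmse = [], []
    for size in sizes:
        model.fit(x_train[:size], y_train[:size])
        train_rmse.append(np.sqrt(
            mean_squared_error(y_train[:size], model.predict(x_train[:size]))))
        val_rmse.append(np.sqrt(mean_squared_error(y_val, model.predict(x_val))))
    return sizes, train_rmse, val_rmse

rng = np.random.default_rng(42)
x = rng.uniform(-1.0, 1.0, size=(1000, 3))
y = x[:, 0] ** 2 + 0.1 * rng.normal(size=1000)
sizes, train_rmse, val_rmse = learning_curve_points(
    DecisionTreeRegressor(max_depth=3), x[:800], y[:800], x[800:], y[800:])
print(sizes, train_rmse[-1], val_rmse[-1])
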
Example #2
 def apply(self):
     self.config.logger.info("XGBoostOptimiser::apply, input size: %d",
                             self.config.dim_input)
     loaded_model = self.load_model()
     inputs, exp_outputs = self.get_data_("apply")
     log_memory_usage(
         ((inputs, "Input apply data"), (exp_outputs, "Output apply data")))
     log_total_memory_usage("Memory usage after loading apply data")
     start = timer()
     pred_outputs = loaded_model.predict(inputs)
     end = timer()
     log_time(start, end, "actual predict")
     self.plot_apply_(exp_outputs, pred_outputs)
     self.config.logger.info("Done apply")
    def __plot_train(self, model, x_train, y_train, x_val, y_val):
        """
        Plot the learning curve for 1D calibration.
        Function used internally.

        :param xgboost.sklearn.XGBModel model: the XGBoost model to be checked
        :param np.ndarray x_train: input data for training
        :param np.ndarray y_train: expected training output
        :param np.ndarray x_val: input data for validation
        :param np.ndarray y_val: expected validation output
        """
        plt.figure()
        train_errors, val_errors = [], []
        data_size = len(x_train)
        size_per_event = int(data_size / self.config.train_events)
        step = int(data_size / self.config.train_plot_npoints)
        checkpoints = np.arange(start=size_per_event,
                                stop=data_size,
                                step=step)
        for ind, checkpoint in enumerate(checkpoints):
            model.fit(x_train[:checkpoint], y_train[:checkpoint])
            y_train_predict = model.predict(x_train[:checkpoint])
            y_val_predict = model.predict(x_val)
            train_errors.append(
                mean_squared_error(y_train_predict, y_train[:checkpoint]))
            val_errors.append(mean_squared_error(y_val_predict, y_val))
            if ind in (0, self.config.train_plot_npoints // 2,
                       self.config.train_plot_npoints - 1):
                self.__plot_results(y_train[:checkpoint], y_train_predict,
                                    "train-%d" % ind)
                self.__plot_results(y_val, y_val_predict, "val-%d" % ind)
        self.config.logger.info("Memory usage during plot train")
        log_total_memory_usage()
        plt.plot(checkpoints, np.sqrt(train_errors), ".", label="train")
        plt.plot(checkpoints, np.sqrt(val_errors), ".", label="validation")
        plt.ylim([0, np.amax(np.sqrt(val_errors)) * 2])
        plt.title("Learning curve BDT")
        plt.xlabel("Training set size")
        plt.ylabel("RMSE")
        plt.legend(loc="lower left")
        plt.savefig("%s/learning_plot_%s_nEv%d.png" %
                    (self.config.dirplots, self.config.suffix,
                     self.config.train_events))
        plt.clf()
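
Apart from the double leading underscore, this is the same learning-curve method as Example #1. The __ prefix triggers Python's name mangling, which is why the docstring says "used internally": outside the class the method is only reachable under its mangled name. A small illustration:

class Optimiser:
    def __plot_train(self):          # stored as _Optimiser__plot_train
        return "internal plot"

    def train(self):
        return self.__plot_train()   # resolves normally inside the class

opt = Optimiser()
print(opt.train())                   # internal plot
print(opt._Optimiser__plot_train())  # reachable only via the mangled name
# opt.__plot_train() would raise AttributeError
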
Example #4
 def train(self):
     self.config.logger.info("XGBoostOptimiser::train")
     model = XGBRFRegressor(verbosity=1, **(self.config.params))
     start = timer()
     inputs, exp_outputs = self.get_data_("train")
     end = timer()
     log_time(start, end, "for loading training data")
     log_memory_usage(
         ((inputs, "Input train data"), (exp_outputs, "Output train data")))
     log_total_memory_usage("Memory usage after loading data")
     if self.config.plot_train:
         inputs_val, outputs_val = self.get_data_("validation")
         log_memory_usage(((inputs_val, "Input val data"),
                           (outputs_val, "Output val data")))
         log_total_memory_usage("Memory usage after loading val data")
         self.plot_train_(model, inputs, exp_outputs, inputs_val,
                          outputs_val)
     start = timer()
     model.fit(inputs, exp_outputs)
     end = timer()
     log_time(start, end, "actual train")
     self.save_model(model)
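
The training itself is a plain scikit-learn-style fit on xgboost's random-forest regressor. A minimal self-contained sketch on synthetic data; the params dict stands in for self.config.params:

import numpy as np
from xgboost import XGBRFRegressor

params = {"n_estimators": 50, "max_depth": 4}  # stand-in for self.config.params
rng = np.random.default_rng(0)
inputs = rng.uniform(size=(500, 5))
exp_outputs = inputs @ np.array([1.0, -2.0, 0.5, 0.0, 3.0])

model = XGBRFRegressor(verbosity=1, **params)
model.fit(inputs, exp_outputs)
print(model.predict(inputs[:3]))

Example #5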
    def train(self):
        """
        Train the optimizer.
        """
        self.config.logger.info("XGBoostOptimiser::train")
        if self.config.dim_output > 1:
            logger = get_logger()
            logger.fatal(
                "Only one distortion can be predicted: dim_output is larger than 1.")

        model = XGBRFRegressor(verbosity=1, **(self.config.params))
        start = timer()
        inputs, exp_outputs, *_ = self.__get_data("train")
        end = timer()
        log_time(start, end, "for loading training data")
        log_memory_usage(
            ((inputs, "Input train data"), (exp_outputs, "Output train data")))
        log_total_memory_usage("Memory usage after loading data")
        if self.config.plot_train:
            inputs_val, outputs_val, *_ = self.__get_data("validation")
            log_memory_usage(((inputs_val, "Input validation data"),
                              (outputs_val, "Output validation data")))
            log_total_memory_usage(
                "Memory usage after loading validation data")
            self.__plot_train(model, inputs, exp_outputs, inputs_val,
                              outputs_val)
        start = timer()
        model.fit(inputs, exp_outputs)
        end = timer()
        log_time(start, end, "actual train")
        model.get_booster().feature_names = get_input_names_oned_idc(
            self.config.opt_usederivative,
            self.config.num_fourier_coeffs_train)
        self.__plot_feature_importance(model)
        self.save_model(model)
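
This later variant additionally guards against multi-dimensional output, attaches readable feature names to the underlying booster, and plots feature importances. __plot_feature_importance is internal to the class, but the booster part can be sketched with xgboost's own plot_importance helper (the feature names here are illustrative):

import matplotlib.pyplot as plt
import numpy as np
from xgboost import XGBRFRegressor, plot_importance

rng = np.random.default_rng(1)
x = rng.uniform(size=(300, 3))
y = 2.0 * x[:, 0] + 0.1 * rng.normal(size=300)

model = XGBRFRegressor(n_estimators=20, max_depth=3)
model.fit(x, y)
model.get_booster().feature_names = ["r", "phi", "z"]  # illustrative names
plot_importance(model.get_booster())
plt.savefig("feature_importance.png")
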
Example #6
def main():
    """ The global main function """
    logger = get_logger()
    logger.info("Starting TPC ML...")

    log_total_memory_usage("Initial memory usage")

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("-c",
                        "--config",
                        dest="config_file",
                        default="config_model_parameters.yml",
                        type=str,
                        help="path to the *.yml configuration file")
    parser.add_argument("-s",
                        "--steer",
                        dest="steer_file",
                        default="default.yml",
                        type=str,
                        help="path to the *.yml steering file")
    # parameters for steer file
    parser.add_argument("--dotrain",
                        action='store_true',
                        default=argparse.SUPPRESS,
                        help="Perform the training")
    parser.add_argument("--docreateinputdata",
                        action='store_true',
                        default=argparse.SUPPRESS,
                        help="Create input data trees")
    parser.add_argument("--docreatevaldata",
                        action='store_true',
                        default=argparse.SUPPRESS,
                        help="Create validation data trees")
    # parameters for config file
    parser.add_argument("--rndaugment",
                        action='store_true',
                        default=argparse.SUPPRESS,
                        help="Use random-random augmentation for training")
    parser.add_argument("--ntrain1d",
                        dest='train_events_oned',
                        type=int,
                        default=argparse.SUPPRESS,
                        help="Set custom number of training events")
    parser.add_argument("--nval",
                        dest='val_events',
                        type=int,
                        default=argparse.SUPPRESS,
                        help="Set custom number of validation events")
    parser.add_argument(
        "--frac",
        dest='downsample_fraction',
        type=float,
        default=argparse.SUPPRESS,
        help="Set downsampling fraction if --downsample is set")
    parser.add_argument("--nestimators",
                        dest='n_estimators',
                        type=int,
                        default=argparse.SUPPRESS,
                        help="Set the number of trees for xgboost models")
    parser.add_argument("--maxdepth",
                        dest='max_depth',
                        type=int,
                        default=argparse.SUPPRESS,
                        help="Set maximum depth of trees for xgboost models")
    args = parser.parse_args()

    logger.info("Using configuration: %s steer file: %s", args.config_file,
                args.steer_file)

    with open(args.steer_file, "r") as steer_data:
        default = yaml.safe_load(steer_data)
    with open(args.config_file, "r") as config_data:
        config_parameters = yaml.safe_load(config_data)

    logger.info("Arguments provided: %s", str(args))
    if "dotrain" in args:
        default['dotrain'] = True
    if "docreateinputdata" in args or "docreatevaldata" in args:
        default['docreatevaldata'] = True
        config_parameters['common']['nd_validate_model'] = False
    if "docreatevaldata" in args:
        config_parameters['common']['nd_validate_model'] = True
    if "rndaugment" in args:
        config_parameters['common']['rnd_augment'] = True
    if "train_events_oned" in args:
        config_parameters['xgboost']['train_events'] = [args.train_events_oned]
    if "val_events" in args:
        config_parameters['common']['val_events'] = args.val_events
    if "downsample_fraction" in args:
        config_parameters['xgboost']['downsample'] = True
        config_parameters['xgboost'][
            'downsample_fraction'] = args.downsample_fraction
    if "n_estimators" in args:
        config_parameters['xgboost']['params'][
            'n_estimators'] = args.n_estimators
    if "max_depth" in args:
        config_parameters['xgboost']['params']['max_depth'] = args.max_depth

    models, corr, dataval = init_models(config_parameters)
    events_counts = (get_events_counts(
        config_parameters[model.name]["train_events"],
        config_parameters[model.name]["validation_events"],
        config_parameters[model.name]["apply_events"]) for model in models)
    ranges_rnd = config_parameters["common"]["range_rnd_index_train"]
    ranges_mean = config_parameters["common"]["range_mean_index"]
    if config_parameters["common"]["rnd_augment"]:
        max_available_events = (ranges_rnd[1] + 1 - ranges_rnd[0]) * (
            ranges_rnd[1] - ranges_rnd[0])
    else:
        max_available_events = (ranges_rnd[1] + 1 - ranges_rnd[0]) * \
            (ranges_mean[1] + 1 - ranges_mean[0])

    for model, model_events_counts in zip(models, events_counts):
        all_events_counts = []
        for (train_events, val_events, apply_events) in model_events_counts:
            total_events = train_events + val_events + apply_events
            if total_events > max_available_events:
                logger.warning(
                    "Too big number of events requested: %d available: %d",
                    total_events, max_available_events)
                continue

            all_events_counts.append(
                (train_events, val_events, apply_events, total_events))

            ranges = {
                "train": [0, train_events],
                "val": [train_events, train_events + val_events],
                "apply": [train_events + val_events, total_events]
            }
            model.config.set_ranges(ranges, total_events, train_events,
                                    val_events, apply_events)

            run_model_and_val(model, dataval, default,
                              config_parameters["common"])

            # TODO: apply the correction and save in files
            if corr is not None:
                pass

        if default["doprofile"] is True:
            model.draw_profile(all_events_counts)

    logger.info("Program finished.")
def main():
    """ The global main function """
    logger = get_logger()
    logger.info("Starting TPC ML...")

    log_total_memory_usage("Initial memory usage")

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("-c",
                        "--config",
                        dest="config_file",
                        default="config_model_parameters.yml",
                        type=str,
                        help="path to the *.yml configuration file")
    parser.add_argument("-s",
                        "--steer",
                        dest="steer_file",
                        default="default.yml",
                        type=str,
                        help="path to the *.yml steering file")
    # parameters for steer file
    parser.add_argument("--dotrain",
                        action="store_true",
                        default=argparse.SUPPRESS,
                        help="Perform the training")
    parser.add_argument("--docreatendvaldata",
                        action="store_true",
                        default=argparse.SUPPRESS,
                        help="Create validation data trees")
    parser.add_argument("--docache",
                        action="store_true",
                        default=argparse.SUPPRESS,
                        help="Cache training data if not already existing")
    # parameters for config file
    parser.add_argument("--rndaugment",
                        action="store_true",
                        default=argparse.SUPPRESS,
                        help="Use random-random augmentation for training")
    parser.add_argument("--ntrain1d",
                        dest="train_events_oned",
                        type=int,
                        default=argparse.SUPPRESS,
                        help="Set custom number of training events")
    parser.add_argument("--nval",
                        dest="nd_val_events",
                        type=int,
                        default=argparse.SUPPRESS,
                        help="Set custom number of max nd validation events")
    parser.add_argument("--dnpoints",
                        dest="downsample_npoints",
                        type=int,
                        default=argparse.SUPPRESS,
                        help="Set number of downsampling points")
    parser.add_argument("--nestimators",
                        dest="n_estimators",
                        type=int,
                        default=argparse.SUPPRESS,
                        help="Set the number of trees for xgboost models")
    parser.add_argument("--maxdepth",
                        dest="max_depth",
                        type=int,
                        default=argparse.SUPPRESS,
                        help="Set maximum depth of trees for xgboost models")
    parser.add_argument("--nfftidcs", dest="num_fft_idcs", type=int,
                        default=argparse.SUPPRESS, help="Set number of 1D IDCs used for" \
                        " the FFT. Corresponds to the ion drift time (ms) used in simulation.")
    parser.add_argument("--nfouriertrain", dest="num_fourier_coeffs_train", type=int,
                        default=argparse.SUPPRESS, help="Set number of Fourier coefficients" \
                        " to take from the 1D IDC train input")
    parser.add_argument("--nfourierapply", dest="num_fourier_coeffs_apply", type=int,
                        default=argparse.SUPPRESS, help="Set number of Fourier coefficients" \
                        " to take from the 1D IDC apply input")
    # parameters for caching
    parser.add_argument("--cache-events",
                        dest="cache_events",
                        type=int,
                        default=argparse.SUPPRESS,
                        help="Set the number of events to cache")
    parser.add_argument("--cache-train",
                        action="store_true",
                        default=argparse.SUPPRESS,
                        help="Use cached data for training")
    parser.add_argument(
        "--cache-file-size",
        dest="cache_file_size",
        type=int,
        default=argparse.SUPPRESS,
        help="Set the number of events per single temporary cache file")
    args = parser.parse_args()

    logger.info("Using configuration: %s steer file: %s", args.config_file,
                args.steer_file)

    with open(args.steer_file, "r", encoding="utf-8") as steer_data:
        default = yaml.safe_load(steer_data)
    with open(args.config_file, "r", encoding="utf-8") as config_data:
        config_parameters = yaml.safe_load(config_data)

    logger.info("Arguments provided: %s", str(args))
    if "dotrain" in args:
        default["dotrain"] = True
    if "docreatendvaldata" in args:
        default["docreatendvaldata"] = True
    if "docache" in args:
        default["docache"] = True
    # parameters for config file
    if "rndaugment" in args:
        config_parameters["common"]["rnd_augment"] = True
    if "train_events_oned" in args:
        config_parameters["xgboost"]["train_events"] = [args.train_events_oned]
    if "nd_val_events" in args:
        config_parameters["common"]["nd_val_events"] = args.nd_val_events
    if "downsample_npoints" in args:
        config_parameters["xgboost"]["downsample"] = True
        config_parameters["xgboost"][
            "downsample_npoints"] = args.downsample_npoints
    if "n_estimators" in args:
        config_parameters["xgboost"]["params"][
            "n_estimators"] = args.n_estimators
    if "max_depth" in args:
        config_parameters["xgboost"]["params"]["max_depth"] = args.max_depth
    if "num_fft_idcs" in args:
        config_parameters["common"]["num_fft_idcs"] = args.num_fft_idcs
    if "num_fourier_coeffs_train" in args:
        config_parameters["common"][
            "num_fourier_coeffs_train"] = args.num_fourier_coeffs_train
    if "num_fourier_coeffs_apply" in args:
        config_parameters["common"][
            "num_fourier_coeffs_apply"] = args.num_fourier_coeffs_apply
    # parameters for caching
    if "cache_events" in args:
        config_parameters["xgboost"]["cache_events"] = args.cache_events
    if "cache_train" in args:
        config_parameters["xgboost"]["cache_train"] = True
    if "cache_file_size" in args:
        config_parameters["xgboost"]["cache_file_size"] = args.cache_file_size

    models, corr, dataval = init_models(config_parameters)
    events_counts = (get_events_counts(
        config_parameters[model.name]["train_events"],
        config_parameters[model.name]["validation_events"],
        config_parameters[model.name]["apply_events"]) for model in models)
    ranges_rnd = config_parameters["common"]["range_rnd_index_train"]
    ranges_mean = config_parameters["common"]["range_mean_index"]
    if config_parameters["common"]["rnd_augment"]:
        max_available_events = (ranges_rnd[1] + 1 - ranges_rnd[0]) * (
            ranges_rnd[1] - ranges_rnd[0])
    else:
        max_available_events = (ranges_rnd[1] + 1 - ranges_rnd[0]) * \
            (ranges_mean[1] + 1 - ranges_mean[0])

    for model in models:
        if default["docache"] is True and model.name == "xgboost":
            start = timer()
            model.cache_train_data()
            end = timer()
            log_time(start, end, "cache")
    for model, model_events_counts in zip(models, events_counts):
        all_events_counts = []
        for (train_events, val_events, apply_events) in model_events_counts:
            total_events = train_events + val_events + apply_events
            if total_events > max_available_events:
                logger.warning(
                    "Too big number of events requested: %d available: %d",
                    total_events, max_available_events)
                continue

            all_events_counts.append(
                (train_events, val_events, apply_events, total_events))

            ranges = {
                "train": [0, train_events],
                "validation": [train_events, train_events + val_events],
                "apply": [train_events + val_events, total_events]
            }
            model.config.set_ranges(ranges, total_events, train_events,
                                    val_events, apply_events)

            run_model_and_val(model, dataval, default,
                              config_parameters["common"])

            # TODO: apply the correction and save in files
            if corr is not None:
                pass

        if default["doprofile"] is True:
            model.draw_profile(all_events_counts)

    logger.info("Program finished.")