コード例 #1
0
    def setup_grid_params(self):
        """
        This function setup the randomized gridsearch parameters that will be used later on:

        1. It will first try to grab all the parameters that are griddable and parameters used by GLM.
        2. It will find the intersection of parameters that are both griddable and used by GLM.
        3. There are several extra parameters that are used by GLM that are denoted as griddable but actually is not.
        These parameters have to be discovered manually and they These are captured in self.exclude_parameter_lists.
        4. We generate the gridsearch hyper-parameter.  For numerical parameters, we will generate those randomly.
        For enums, we will include all of them.

        :return: None
        """
        # build bare bone model to get all parameters
        model = H2OGeneralizedLinearEstimator(family=self.family, nfolds=self.nfolds)
        model.train(x=self.x_indices, y=self.y_index, training_frame=self.training1_data)

        self.one_model_time = pyunit_utils.find_grid_runtime([model])  # find model train time
        print("Time taken to build a base barebone model is {0}".format(self.one_model_time))

        # grab all gridable parameters and its type
        (self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
            pyunit_utils.get_gridables(model._model_json["parameters"])

        # give the user opportunity to pre-assign hyper parameters for fixed values
        self.hyper_params = {}
        self.hyper_params["fold_assignment"] = ['AUTO', 'Random', 'Modulo']
        self.hyper_params["missing_values_handling"] = ['MeanImputation', 'Skip']

        # randomly generate griddable parameters
        (self.hyper_params, self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
            pyunit_utils.gen_grid_search(model.full_parameters.keys(), self.hyper_params, self.exclude_parameter_lists,
                                         self.gridable_parameters, self.gridable_types, self.gridable_defaults,
                                         random.randint(1, self.max_int_number), self.max_int_val, self.min_int_val,
                                         random.randint(1, self.max_real_number), self.max_real_val, self.min_real_val)


        # change the value of lambda parameters to be from 0 to self.lambda_scale instead of 0 to 1.
        if "lambda" in list(self.hyper_params):
            self.hyper_params["lambda"] = [self.lambda_scale * x for x in self.hyper_params["lambda"]]

        time_scale = self.max_runtime_scale * self.one_model_time
        # change the value of runtime parameters to be from 0 to self.lambda_scale instead of 0 to 1.
        if "max_runtime_secs" in list(self.hyper_params):
            self.hyper_params["max_runtime_secs"] = [time_scale * x for x in
                                                     self.hyper_params["max_runtime_secs"]]

        # number of possible models being built:
        self.possible_number_models = pyunit_utils.count_models(self.hyper_params)

        # save hyper-parameters in sandbox and current test directories.
        pyunit_utils.write_hyper_parameters_json(self.current_dir, self.sandbox_dir, self.json_filename,
                                                 self.hyper_params)
    def setup_model(self):
        """
        This function setup the gridsearch hyper-parameters that will be used later on:

        1. It will first try to grab all the parameters that are griddable and parameters used by GBM.
        2. It will find the intersection of parameters that are both griddable and used by GBM.
        3. There are several extra parameters that are used by GBM that are denoted as griddable but actually is not.
        These parameters have to be discovered manually and they These are captured in self.exclude_parameter_lists.
        4. We generate the gridsearch hyper-parameter.  For numerical parameters, we will generate those randomly.
        For enums, we will include all of them.

        :return: None
        """
        # build bare bone model to get all parameters
        model = H2OGradientBoostingEstimator(distribution=self.family, seed=self.seed, nfolds=self.nfolds)
        model.train(x=self.x_indices, y=self.y_index, training_frame=self.training1_data)

        self.model_run_time = pyunit_utils.find_grid_runtime([model])  # find model train time
        print("Time taken to build a base barebone model is {0}".format(self.model_run_time))

        summary_list = model._model_json["output"]["model_summary"]
        num_trees = summary_list.cell_values[0][summary_list.col_header.index('number_of_trees')]

        if num_trees == 0:
            self.min_runtime_per_tree = self.model_run_time
        else:
            self.min_runtime_per_tree = self.model_run_time / num_trees

        # grab all gridable parameters and its type
        (self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
            pyunit_utils.get_gridables(model._model_json["parameters"])

        # randomly generate griddable parameters including values outside legal range, like setting alpha values to
        # be outside legal range of 0 and 1 and etc
        (self.hyper_params, self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
            pyunit_utils.gen_grid_search(model.full_parameters.keys(), self.hyper_params,
                                         self.exclude_parameter_lists,
                                         self.gridable_parameters, self.gridable_types, self.gridable_defaults,
                                         random.randint(1, self.max_int_number),
                                         self.max_int_val, self.min_int_val,
                                         random.randint(1, self.max_real_number),
                                         self.max_real_val, self.min_real_val)

        # scale the max_runtime_secs parameters
        time_scale = self.time_scale * self.model_run_time
        if "max_runtime_secs" in list(self.hyper_params):
            self.hyper_params["max_runtime_secs"] = [time_scale * x for x
                                                     in self.hyper_params["max_runtime_secs"]]

        # generate a new final_hyper_params which only takes a subset of all griddable parameters while
        # hyper_params take all griddable parameters and generate the grid search hyper-parameters
        [self.possible_number_models, self.final_hyper_params] = \
            pyunit_utils.check_and_count_models(self.hyper_params, self.params_zero_one, self.params_more_than_zero,
                                                self.params_more_than_one, self.params_zero_positive,
                                                self.max_grid_model)

        # must add max_runtime_secs to restrict unit test run time and as a promise to Arno to test for this
        if ("max_runtime_secs" not in list(self.final_hyper_params)) and \
                ("max_runtime_secs" in list(self.hyper_params)):
            self.final_hyper_params["max_runtime_secs"] = self.hyper_params["max_runtime_secs"]
            len_good_time = len([x for x in self.hyper_params["max_runtime_secs"] if (x >= 0)])
            self.possible_number_models = self.possible_number_models*len_good_time

        if "fold_assignment" in list(self.final_hyper_params):
            self.possible_number_models = self.possible_number_models * self.scale_model

        self.final_hyper_params["seed"] = [self.seed]     # added see to make test more repeatable

        # write out the hyper-parameters used into json files.
        pyunit_utils.write_hyper_parameters_json(self.current_dir, self.sandbox_dir, self.json_filename,
                                                 self.final_hyper_params)
    def setup_model(self):
        """
        This function setup the gridsearch hyper-parameters that will be used later on:

        1. It will first try to grab all the parameters that are griddable and parameters used by deeplearning.
        2. It will find the intersection of parameters that are both griddable and used by deeplearning.
        3. There are several extra parameters that are used by deeplearning that are denoted as griddable but actually
        is not.  These parameters have to be discovered manually and they These are captured in
        self.exclude_parameter_lists.
        4. We generate the gridsearch hyper-parameter.  For numerical parameters, we will generate those randomly.
        For enums, we will include all of them.

        :return: None
        """
        # build bare bone model to get all parameters
        model = H2ODeepLearningEstimator(distribution=self.family, seed=self.seed, nfolds=self.nfolds,
                                         hidden=[10, 10, 10])
        model.train(x=self.x_indices, y=self.y_index, training_frame=self.training1_data)

        self.model_run_time = pyunit_utils.find_grid_runtime([model])  # find model train time
        print("Time taken to build a base barebone model is {0}".format(self.model_run_time))

        summary_list = model._model_json["output"]["scoring_history"]
        num_iterations = summary_list.cell_values[2][summary_list.col_header.index('iterations')]

        if num_iterations == 0:
            self.min_runtime_per_iteration = self.model_run_time
        else:
            self.min_runtime_per_iteration = self.model_run_time / num_iterations

        # grab all gridable parameters and its type
        (self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
            pyunit_utils.get_gridables(model._model_json["parameters"])

        # randomly generate griddable parameters including values outside legal range, like setting alpha values to
        # be outside legal range of 0 and 1 and etc
        (self.hyper_params, self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
            pyunit_utils.gen_grid_search(model.full_parameters.keys(), self.hyper_params,
                                         self.exclude_parameter_lists,
                                         self.gridable_parameters, self.gridable_types, self.gridable_defaults,
                                         random.randint(1, self.max_int_number),
                                         self.max_int_val, self.min_int_val,
                                         random.randint(1, self.max_real_number),
                                         self.max_real_val, self.min_real_val)

        # scale the max_runtime_secs parameter and others as well to make sure they make sense
        time_scale = self.time_scale * self.model_run_time
        if "max_runtime_secs" in list(self.hyper_params):
            self.hyper_params["max_runtime_secs"] = [time_scale * x for x
                                                     in self.hyper_params["max_runtime_secs"]]

        if "epsilon" in list(self.hyper_params):
            self.hyper_params["epsilon"] = [1e-4 * x for x in self.hyper_params["epsilon"]]

        if "input_dropout_ratio" in list(self.hyper_params):
            self.hyper_params["input_dropout_ratio"] = [0.5 * x for x in self.hyper_params["input_dropout_ratio"]]

        if "hidden_dropout_ratio" in list(self.hyper_params):
            self.hyper_params["hidden_dropout_ratio"] = [0.5 * x for x in self.hyper_params["hidden_dropout_ratio"]]

        if "hidden" in list(self.hyper_params):     # need to change this up
            # randomly generate the number of layers in the network
            num_layer = random.randint(1,3)

            # for each layer, randomly generate the number of nodes in it
            self.hyper_params["hidden"] = [random.randint(1, self.max_int_val) for p in range(0, num_layer)]

        if "epochs" in self.hyper_params:
            self.hyper_params["epochs"] = [random.randint(self.min_int_val, self.max_int_val) for p in
                                           range(0, self.max_int_number)]

        # generate a new final_hyper_params which only takes a subset of all griddable parameters while
        [self.possible_number_models, self.final_hyper_params] = \
            pyunit_utils.check_and_count_models(self.hyper_params, self.params_zero_one, self.params_more_than_zero,
                                                self.params_more_than_one, self.params_zero_positive,
                                                self.max_grid_model)
        #
        # # must add max_runtime_secs to restrict unit test run time and as a promise to Arno to test for this
        if ("max_runtime_secs" not in list(self.final_hyper_params)) and \
                ("max_runtime_secs" in list(self.hyper_params)):
            self.final_hyper_params["max_runtime_secs"] = self.hyper_params["max_runtime_secs"]
            len_good_time = len([x for x in self.hyper_params["max_runtime_secs"] if (x >= 0)])
            self.possible_number_models = self.possible_number_models*len_good_time

        # make correction for stratified not being a legal argument
        if "fold_assignment" in list(self.final_hyper_params):
            self.possible_number_models = self.possible_number_models * 3/4

        # write out the hyper-parameters used into json files.
        pyunit_utils.write_hyper_parameters_json(self.current_dir, self.sandbox_dir, self.json_filename,
                                                 self.final_hyper_params)
コード例 #4
0
    def setup_model(self):
        """
        This function setup the gridsearch hyper-parameters that will be used later on:

        1. It will first try to grab all the parameters that are griddable and parameters used by GBM.
        2. It will find the intersection of parameters that are both griddable and used by GBM.
        3. There are several extra parameters that are used by GBM that are denoted as griddable but actually is not.
        These parameters have to be discovered manually and they These are captured in self.exclude_parameter_lists.
        4. We generate the gridsearch hyper-parameter.  For numerical parameters, we will generate those randomly.
        For enums, we will include all of them.

        :return: None
        """
        # build bare bone model to get all parameters
        model = H2OGradientBoostingEstimator(distribution=self.family, seed=self.seed, nfolds=self.nfolds)
        model.train(x=self.x_indices, y=self.y_index, training_frame=self.training1_data)

        self.model_run_time = pyunit_utils.find_grid_runtime([model])  # find model train time
        print("Time taken to build a base barebone model is {0}".format(self.model_run_time))

        summary_list = model._model_json["output"]["model_summary"]
        num_trees = summary_list["number_of_trees"][0]

        if num_trees == 0:
            self.min_runtime_per_tree = self.model_run_time
        else:
            self.min_runtime_per_tree = self.model_run_time / num_trees

        # grab all gridable parameters and its type
        (self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
            pyunit_utils.get_gridables(model._model_json["parameters"])

        # randomly generate griddable parameters including values outside legal range, like setting alpha values to
        # be outside legal range of 0 and 1 and etc
        (self.hyper_params, self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
            pyunit_utils.gen_grid_search(model.full_parameters.keys(), self.hyper_params,
                                         self.exclude_parameter_lists,
                                         self.gridable_parameters, self.gridable_types, self.gridable_defaults,
                                         random.randint(1, self.max_int_number),
                                         self.max_int_val, self.min_int_val,
                                         random.randint(1, self.max_real_number),
                                         self.max_real_val, self.min_real_val)

        # scale the max_runtime_secs parameters
        time_scale = self.time_scale * self.model_run_time
        if "max_runtime_secs" in list(self.hyper_params):
            self.hyper_params["max_runtime_secs"] = [time_scale * x for x
                                                     in self.hyper_params["max_runtime_secs"]]

        # generate a new final_hyper_params which only takes a subset of all griddable parameters while
        # hyper_params take all griddable parameters and generate the grid search hyper-parameters
        [self.possible_number_models, self.final_hyper_params] = \
            pyunit_utils.check_and_count_models(self.hyper_params, self.params_zero_one, self.params_more_than_zero,
                                                self.params_more_than_one, self.params_zero_positive,
                                                self.max_grid_model)

        # must add max_runtime_secs to restrict unit test run time and as a promise to Arno to test for this
        if ("max_runtime_secs" not in list(self.final_hyper_params)) and \
                ("max_runtime_secs" in list(self.hyper_params)):
            self.final_hyper_params["max_runtime_secs"] = self.hyper_params["max_runtime_secs"]
            len_good_time = len([x for x in self.hyper_params["max_runtime_secs"] if (x >= 0)])
            self.possible_number_models = self.possible_number_models*len_good_time

        if "fold_assignment" in list(self.final_hyper_params):
            self.possible_number_models = self.possible_number_models * self.scale_model

        self.final_hyper_params["seed"] = [self.seed]     # added see to make test more repeatable

        # write out the hyper-parameters used into json files.
        pyunit_utils.write_hyper_parameters_json(self.current_dir, self.sandbox_dir, self.json_filename,
                                                 self.final_hyper_params)
コード例 #5
0
    def setup_model(self):
        """
        This function setup the gridsearch hyper-parameters that will be used later on:

        1. It will first try to grab all the parameters that are griddable and parameters used by PCA.
        2. It will find the intersection of parameters that are both griddable and used by PCA.
        3. There are several extra parameters that are used by PCA that are denoted as griddable but actually is not.
        These parameters have to be discovered manually and they These are captured in self.exclude_parameter_lists.
        4. We generate the gridsearch hyper-parameter.  For numerical parameters, we will generate those randomly.
        For enums, we will include all of them.

        :return: None
        """
        # build bare bone model to get all parameters
        model = H2OPCA(k=10, transform="NONE", pca_method=self.pca_method)
        model.train(x=self.x_indices, training_frame=self.training1_data)

        self.model_run_time = pyunit_utils.find_grid_runtime([model])  # find model train time
        print("Time taken to build a base barebone model is {0}".format(self.model_run_time))

        # grab all gridable parameters and its type
        (self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
            pyunit_utils.get_gridables(model._model_json["parameters"])

        # randomly generate griddable parameters including values outside legal range, like setting alpha values to
        # be outside legal range of 0 and 1 and etc
        (self.hyper_params, self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
            pyunit_utils.gen_grid_search(model.full_parameters.keys(), self.hyper_params,
                                         self.exclude_parameter_lists,
                                         self.gridable_parameters, self.gridable_types, self.gridable_defaults,
                                         random.randint(1, self.max_int_number),
                                         self.max_int_val, self.min_int_val,
                                         random.randint(1, self.max_real_number),
                                         self.max_real_val, self.min_real_val)

        # scale the max_runtime_secs parameters
        time_scale = self.time_scale * self.model_run_time
        if "max_runtime_secs" in list(self.hyper_params):
            self.hyper_params["max_runtime_secs"] = [time_scale * x for x
                                                     in self.hyper_params["max_runtime_secs"]]

        if 'max_iterations' in list(self.hyper_params):
            self.hyper_params['max_iterations'] = [self.max_iter_scale * x for x in self.hyper_params['max_iterations']]

        # generate a new final_hyper_params which only takes a subset of all griddable parameters while
        # hyper_params take all griddable parameters and generate the grid search hyper-parameters
        [self.possible_number_models, self.final_hyper_params] = \
            pyunit_utils.check_and_count_models(self.hyper_params, self.params_zero_one, self.params_more_than_zero,
                                                self.params_more_than_one, self.params_zero_positive,
                                                self.max_grid_model)

        # must add max_runtime_secs to restrict unit test run time and as a promise to Arno to test for this
        if ("max_runtime_secs" not in list(self.final_hyper_params)) and \
                ("max_runtime_secs" in list(self.hyper_params)):
            self.final_hyper_params["max_runtime_secs"] = self.hyper_params["max_runtime_secs"]
            len_good_time = len([x for x in self.hyper_params["max_runtime_secs"] if (x >= 0)])
            self.possible_number_models = self.possible_number_models*len_good_time

        # must include k in hyper-parameters
        if ('k' not in list(self.final_hyper_params)) and ('k' in list(self.hyper_params)):
            self.final_hyper_params["k"] = self.hyper_params["k"]
            len_good_k = len([x for x in self.hyper_params["k"] if (x > 0)])
            self.possible_number_models = self.possible_number_models*len_good_k

        # write out the hyper-parameters used into json files.
        pyunit_utils.write_hyper_parameters_json(self.current_dir, self.sandbox_dir, self.json_filename,
                                                 self.final_hyper_params)
コード例 #6
0
    def setup_model(self):
        """
        This function setup the gridsearch hyper-parameters that will be used later on:

        1. It will first try to grab all the parameters that are griddable and parameters used by GBM.
        2. It will find the intersection of parameters that are both griddable and used by GBM.
        3. There are several extra parameters that are used by GBM that are denoted as griddable but actually is not.
        These parameters have to be discovered manually and they These are captured in self.exclude_parameter_lists.
        4. We generate the gridsearch hyper-parameter.  For numerical parameters, we will generate those randomly.
        For enums, we will include all of them.

        :return: None
        """
        # build bare bone model to get all parameters
        model = H2OGradientBoostingEstimator(distribution=self.family)
        model.train(x=self.x_indices,
                    y=self.y_index,
                    training_frame=self.training1_data)

        run_time = pyunit_utils.find_grid_runtime([model
                                                   ])  # find model train time
        print("Time taken to build a base barebone model is {0}".format(
            run_time))

        summary_list = model._model_json["output"]["model_summary"]
        num_trees = summary_list.cell_values[0][summary_list.col_header.index(
            'number_of_trees')]

        if num_trees == 0:
            self.min_runtime_per_tree = run_time
        else:
            self.min_runtime_per_tree = run_time / num_trees

        # grab all gridable parameters and its type
        (self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
            pyunit_utils.get_gridables(model._model_json["parameters"])

        # randomly generate griddable parameters including values outside legal range, like setting alpha values to
        # be outside legal range of 0 and 1 and etc
        (self.hyper_params, self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
            pyunit_utils.gen_grid_search(model.full_parameters.keys(), self.hyper_params,
                                         self.exclude_parameter_lists,
                                         self.gridable_parameters, self.gridable_types, self.gridable_defaults,
                                         random.randint(1, self.max_int_number),
                                         self.max_int_val, self.min_int_val,
                                         random.randint(1, self.max_real_number),
                                         self.max_real_val, self.min_real_val)

        # scale the max_runtime_secs parameters
        time_scale = self.time_scale * run_time
        if "max_runtime_secs" in list(self.hyper_params):
            self.hyper_params["max_runtime_secs"] = [
                time_scale * x for x in self.hyper_params["max_runtime_secs"]
            ]

        self.possible_number_models = self.check_and_count_models()

        self.final_hyper_params["max_runtime_secs"] = self.hyper_params[
            "max_runtime_secs"]

        # calculate true possible_number_models and exclude the bad parameters since they will not
        # result in any models being built
        # alpha_len = len(self.hyper_params["alpha"])
        # lambda_len = len(self.hyper_params["lambda"])
        time_len = len(self.hyper_params["max_runtime_secs"])
        # len_good_alpha = len([x for x in self.hyper_params["alpha"] if (x >= 0) and (x <= 1)])
        # len_good_lambda = len([x for x in self.hyper_params["lambda"] if (x >= 0)])
        len_good_time = len(
            [x for x in self.hyper_params["max_runtime_secs"] if (x >= 0)])

        self.possible_number_models = int(self.possible_number_models *
                                          len_good_time / time_len)

        # write out the hyper-parameters used into json files.
        pyunit_utils.write_hyper_parameters_json(self.current_dir,
                                                 self.sandbox_dir,
                                                 self.json_filename,
                                                 self.final_hyper_params)
    def setup_model(self):
        """
        This function setup the gridsearch hyper-parameters that will be used later on:

        1. It will first try to grab all the parameters that are griddable and parameters used by GLM.
        2. It will find the intersection of parameters that are both griddable and used by GLM.
        3. There are several extra parameters that are used by GLM that are denoted as griddable but actually is not.
        These parameters have to be discovered manually and they These are captured in self.exclude_parameter_lists.
        4. We generate the gridsearch hyper-parameter.  For numerical parameters, we will generate those randomly.
        For enums, we will include all of them.

        :return: None
        """
        # build bare bone model to get all parameters
        model = H2OGeneralizedLinearEstimator(family=self.family, nfolds=self.nfolds)
        model.train(x=self.x_indices, y=self.y_index, training_frame=self.training1_data)

        run_time = pyunit_utils.find_grid_runtime([model])  # find model train time
        print("Time taken to build a base barebone model is {0}".format(run_time))

        summary_list = model._model_json["output"]["model_summary"]
        num_iteration = summary_list.cell_values[0][summary_list.col_header.index('number_of_iterations')]

        if num_iteration == 0:
            self.min_runtime_per_epoch = run_time
        else:
            self.min_runtime_per_epoch = run_time/num_iteration

        # grab all gridable parameters and its type
        (self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
            pyunit_utils.get_gridables(model._model_json["parameters"])

        # randomly generate griddable parameters including values outside legal range, like setting alpha values to
        # be outside legal range of 0 and 1 and etc
        (self.hyper_params_bad, self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
            pyunit_utils.gen_grid_search(model.full_parameters.keys(), self.hyper_params_bad,
                                         self.exclude_parameter_lists,
                                         self.gridable_parameters, self.gridable_types, self.gridable_defaults,
                                         random.randint(1, self.max_int_number),
                                         self.max_int_val, self.min_int_val,
                                         random.randint(1, self.max_real_number),
                                         self.max_real_val*self.alpha_scale, self.min_real_val*self.alpha_scale)

        # scale the value of lambda parameters
        if "lambda" in list(self.hyper_params_bad):
            self.hyper_params_bad["lambda"] = [self.lambda_scale * x for x in self.hyper_params_bad["lambda"]]

        # scale the max_runtime_secs parameters
        time_scale = self.time_scale * run_time
        if "max_runtime_secs" in list(self.hyper_params_bad):
            self.hyper_params_bad["max_runtime_secs"] = [time_scale * x for x
                                                         in self.hyper_params_bad["max_runtime_secs"]]

        [self.possible_number_models, self.final_hyper_params_bad] = \
            pyunit_utils.check_and_count_models(self.hyper_params_bad, self.params_zero_one, self.params_more_than_zero,
                                                self.params_more_than_one, self.params_zero_positive,
                                                self.max_grid_model)

        if ("max_runtime_secs" not in list(self.final_hyper_params_bad)) and \
                ("max_runtime_secs" in list(self.hyper_params_bad)):
            self.final_hyper_params_bad["max_runtime_secs"] = self.hyper_params_bad["max_runtime_secs"]
            len_good_time = len([x for x in self.hyper_params_bad["max_runtime_secs"] if (x >= 0)])
            self.possible_number_models = self.possible_number_models * len_good_time

        # Stratified is illegal for Gaussian GLM
        self.possible_number_models = self.possible_number_models * self.scale_model

        # randomly generate griddable parameters with only good values
        (self.hyper_params, self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
            pyunit_utils.gen_grid_search(model.full_parameters.keys(), self.hyper_params, self.exclude_parameter_lists,
                                         self.gridable_parameters, self.gridable_types, self.gridable_defaults,
                                         random.randint(1, self.max_int_number),
                                         self.max_int_val, 0,
                                         random.randint(1, self.max_real_number),
                                         self.max_real_val, 0)

        # scale the value of lambda parameters
        if "lambda" in list(self.hyper_params):
            self.hyper_params["lambda"] = [self.lambda_scale * x for x in self.hyper_params["lambda"]]

        # scale the max_runtime_secs parameters
        if "max_runtime_secs" in list(self.hyper_params):
            self.hyper_params["max_runtime_secs"] = [time_scale * x for x
                                                     in self.hyper_params["max_runtime_secs"]]

        [self.true_correct_model_number, self.final_hyper_params] = \
            pyunit_utils.check_and_count_models(self.hyper_params, self.params_zero_one, self.params_more_than_zero,
                                                self.params_more_than_one, self.params_zero_positive,
                                                self.max_grid_model)

        if ("max_runtime_secs" not in list(self.final_hyper_params)) and \
                ("max_runtime_secs" in list(self.hyper_params)):
            self.final_hyper_params["max_runtime_secs"] = self.hyper_params["max_runtime_secs"]
            self.true_correct_model_number = self.true_correct_model_number * \
                                             len(self.final_hyper_params["max_runtime_secs"])

        # write out the hyper-parameters used into json files.
        pyunit_utils.write_hyper_parameters_json(self.current_dir, self.sandbox_dir, self.json_filename_bad,
                                                 self.final_hyper_params_bad)

        pyunit_utils.write_hyper_parameters_json(self.current_dir, self.sandbox_dir, self.json_filename,
                                                 self.final_hyper_params)
    def setup_model(self):
        """
        This function setup the gridsearch hyper-parameters that will be used later on:

        1. It will first try to grab all the parameters that are griddable and parameters used by GBM.
        2. It will find the intersection of parameters that are both griddable and used by GBM.
        3. There are several extra parameters that are used by GBM that are denoted as griddable but actually is not.
        These parameters have to be discovered manually and they These are captured in self.exclude_parameter_lists.
        4. We generate the gridsearch hyper-parameter.  For numerical parameters, we will generate those randomly.
        For enums, we will include all of them.

        :return: None
        """
        # build bare bone model to get all parameters
        model = H2OGradientBoostingEstimator(distribution=self.family)
        model.train(x=self.x_indices, y=self.y_index, training_frame=self.training1_data)

        run_time = pyunit_utils.find_grid_runtime([model])  # find model train time
        print("Time taken to build a base barebone model is {0}".format(run_time))

        summary_list = model._model_json["output"]["model_summary"]
        num_trees = summary_list.cell_values[0][summary_list.col_header.index('number_of_trees')]

        if num_trees == 0:
            self.min_runtime_per_tree = run_time
        else:
            self.min_runtime_per_tree = run_time / num_trees

        # grab all gridable parameters and its type
        (self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
            pyunit_utils.get_gridables(model._model_json["parameters"])

        # randomly generate griddable parameters including values outside legal range, like setting alpha values to
        # be outside legal range of 0 and 1 and etc
        (self.hyper_params, self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
            pyunit_utils.gen_grid_search(model.full_parameters.keys(), self.hyper_params,
                                         self.exclude_parameter_lists,
                                         self.gridable_parameters, self.gridable_types, self.gridable_defaults,
                                         random.randint(1, self.max_int_number),
                                         self.max_int_val, self.min_int_val,
                                         random.randint(1, self.max_real_number),
                                         self.max_real_val, self.min_real_val)

        # scale the max_runtime_secs parameters
        time_scale = self.time_scale * run_time
        if "max_runtime_secs" in list(self.hyper_params):
            self.hyper_params["max_runtime_secs"] = [time_scale * x for x
                                                     in self.hyper_params["max_runtime_secs"]]

        self.possible_number_models = self.check_and_count_models()

        self.final_hyper_params["max_runtime_secs"] = self.hyper_params["max_runtime_secs"]

        # calculate true possible_number_models and exclude the bad parameters since they will not
        # result in any models being built
        # alpha_len = len(self.hyper_params["alpha"])
        # lambda_len = len(self.hyper_params["lambda"])
        time_len = len(self.hyper_params["max_runtime_secs"])
        # len_good_alpha = len([x for x in self.hyper_params["alpha"] if (x >= 0) and (x <= 1)])
        # len_good_lambda = len([x for x in self.hyper_params["lambda"] if (x >= 0)])
        len_good_time = len([x for x in self.hyper_params["max_runtime_secs"] if (x >= 0)])

        self.possible_number_models = int(self.possible_number_models*len_good_time/time_len)

        # write out the hyper-parameters used into json files.
        pyunit_utils.write_hyper_parameters_json(self.current_dir, self.sandbox_dir, self.json_filename,
                                                 self.final_hyper_params)
コード例 #9
0
    def setup_model(self):
        """
        This function setup the gridsearch hyper-parameters that will be used later on:

        1. It will first try to grab all the parameters that are griddable and parameters used by GLRM.
        2. It will find the intersection of parameters that are both griddable and used by GLRM.
        3. There are several extra parameters that are used by GLRM that are denoted as griddable but actually is not.
        These parameters have to be discovered manually and they These are captured in self.exclude_parameter_lists.
        4. We generate the gridsearch hyper-parameter.  For numerical parameters, we will generate those randomly.
        For enums, we will include all of them.

        :return: None
        """
        # build bare bone model to get all parameters
        model = H2OGeneralizedLowRankEstimator(k=10, loss="Quadratic", gamma_x=random.uniform(0, 1),
                                               gamma_y=random.uniform(0, 1), transform="DEMEAN")
        model.train(x=self.training1_data.names, training_frame=self.training1_data)

        self.model_run_time = pyunit_utils.find_grid_runtime([model])  # find model train time
        print("Time taken to build a base barebone model is {0}".format(self.model_run_time))

        summary_list = model._model_json["output"]["model_summary"]
        num_iter = summary_list["number_of_iterations"][0]

        self.min_runtime_per_iter = self.model_run_time / num_iter

        # grab all gridable parameters and its type
        (self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
            pyunit_utils.get_gridables(model._model_json["parameters"])

        # randomly generate griddable parameters including values outside legal range, like setting alpha values to
        # be outside legal range of 0 and 1 and etc
        (self.hyper_params, self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
            pyunit_utils.gen_grid_search(model.full_parameters.keys(), self.hyper_params,
                                         self.exclude_parameter_lists,
                                         self.gridable_parameters, self.gridable_types, self.gridable_defaults,
                                         random.randint(1, self.max_int_number),
                                         self.max_int_val, self.min_int_val,
                                         random.randint(1, self.max_real_number),
                                         self.max_real_val, self.min_real_val)

        hyper_params_list = list(self.hyper_params)

        # scale the max_runtime_secs parameters
        time_scale = self.time_scale * self.model_run_time
        if "max_runtime_secs" in hyper_params_list:
            self.hyper_params["max_runtime_secs"] = [time_scale * x for x
                                                     in self.hyper_params["max_runtime_secs"]]

        # scale up the max_iterations to 100
        if "max_iterations" in hyper_params_list:
            self.hyper_params["max_iterations"] = [self.iter_scale * x for x in self.hyper_params["max_iterations"]]

        # generate a new final_hyper_params which only takes a subset of all griddable parameters while
        # hyper_params take all griddable parameters and generate the grid search hyper-parameters
        [self.possible_number_models, self.final_hyper_params] = \
            pyunit_utils.check_and_count_models(self.hyper_params, self.params_zero_one, self.params_more_than_zero,
                                                self.params_more_than_one, self.params_zero_positive,
                                                self.max_grid_model)

        # must add max_runtime_secs to restrict unit test run time and as a promise to Arno to test for this
        if ("max_runtime_secs" not in list(self.final_hyper_params)) and \
                ("max_runtime_secs" in list(self.hyper_params)):
            self.final_hyper_params["max_runtime_secs"] = self.hyper_params["max_runtime_secs"]
            len_good_time = len([x for x in self.hyper_params["max_runtime_secs"] if (x >= 0)])
            self.possible_number_models = self.possible_number_models*len_good_time

        if "k" not in list(self.final_hyper_params):    # must add this one
            self.final_hyper_params["k"] = self.hyper_params["k"]
            len_good_k = len([x for x in self.final_hyper_params["k"] if (x >= 1)])
            self.possible_number_models = self.possible_number_models*len_good_k

        self.final_hyper_params["seed"] = [self.seed]     # added see to make test more repeatable

        # write out the hyper-parameters used into json files.
        pyunit_utils.write_hyper_parameters_json(self.current_dir, self.sandbox_dir, self.json_filename,
                                                 self.final_hyper_params)
    def setup_model(self):
        """
        This function setup the gridsearch hyper-parameters that will be used later on:

        1. It will first try to grab all the parameters that are griddable and parameters used by GLRM.
        2. It will find the intersection of parameters that are both griddable and used by GLRM.
        3. There are several extra parameters that are used by GLRM that are denoted as griddable but actually is not.
        These parameters have to be discovered manually and they These are captured in self.exclude_parameter_lists.
        4. We generate the gridsearch hyper-parameter.  For numerical parameters, we will generate those randomly.
        For enums, we will include all of them.

        :return: None
        """
        # build bare bone model to get all parameters
        model = H2OGeneralizedLowRankEstimator(k=10,
                                               loss="Quadratic",
                                               gamma_x=random.uniform(0, 1),
                                               gamma_y=random.uniform(0, 1),
                                               transform="DEMEAN")
        model.train(x=self.training1_data.names,
                    training_frame=self.training1_data)

        self.model_run_time = pyunit_utils.find_grid_runtime(
            [model])  # find model train time
        print("Time taken to build a base barebone model is {0}".format(
            self.model_run_time))

        summary_list = model._model_json["output"]["model_summary"]
        num_iter = summary_list.cell_values[0][summary_list.col_header.index(
            'number_of_iterations')]

        self.min_runtime_per_iter = self.model_run_time / num_iter

        # grab all gridable parameters and its type
        (self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
            pyunit_utils.get_gridables(model._model_json["parameters"])

        # randomly generate griddable parameters including values outside legal range, like setting alpha values to
        # be outside legal range of 0 and 1 and etc
        (self.hyper_params, self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
            pyunit_utils.gen_grid_search(model.full_parameters.keys(), self.hyper_params,
                                         self.exclude_parameter_lists,
                                         self.gridable_parameters, self.gridable_types, self.gridable_defaults,
                                         random.randint(1, self.max_int_number),
                                         self.max_int_val, self.min_int_val,
                                         random.randint(1, self.max_real_number),
                                         self.max_real_val, self.min_real_val)

        hyper_params_list = list(self.hyper_params)

        # scale the max_runtime_secs parameters
        time_scale = self.time_scale * self.model_run_time
        if "max_runtime_secs" in hyper_params_list:
            self.hyper_params["max_runtime_secs"] = [
                time_scale * x for x in self.hyper_params["max_runtime_secs"]
            ]

        # scale up the max_iterations to 100
        if "max_iterations" in hyper_params_list:
            self.hyper_params["max_iterations"] = [
                self.iter_scale * x
                for x in self.hyper_params["max_iterations"]
            ]

        # generate a new final_hyper_params which only takes a subset of all griddable parameters while
        # hyper_params take all griddable parameters and generate the grid search hyper-parameters
        [self.possible_number_models, self.final_hyper_params] = \
            pyunit_utils.check_and_count_models(self.hyper_params, self.params_zero_one, self.params_more_than_zero,
                                                self.params_more_than_one, self.params_zero_positive,
                                                self.max_grid_model)

        # must add max_runtime_secs to restrict unit test run time and as a promise to Arno to test for this
        if ("max_runtime_secs" not in list(self.final_hyper_params)) and \
                ("max_runtime_secs" in list(self.hyper_params)):
            self.final_hyper_params["max_runtime_secs"] = self.hyper_params[
                "max_runtime_secs"]
            len_good_time = len(
                [x for x in self.hyper_params["max_runtime_secs"] if (x >= 0)])
            self.possible_number_models = self.possible_number_models * len_good_time

        if "k" not in list(self.final_hyper_params):  # must add this one
            self.final_hyper_params["k"] = self.hyper_params["k"]
            len_good_k = len(
                [x for x in self.final_hyper_params["k"] if (x >= 1)])
            self.possible_number_models = self.possible_number_models * len_good_k

        self.final_hyper_params["seed"] = [
            self.seed
        ]  # added see to make test more repeatable

        # write out the hyper-parameters used into json files.
        pyunit_utils.write_hyper_parameters_json(self.current_dir,
                                                 self.sandbox_dir,
                                                 self.json_filename,
                                                 self.final_hyper_params)
コード例 #11
0
    def setup_model(self):
        """
        This function setup the gridsearch hyper-parameters that will be used later on:

        1. It will first try to grab all the parameters that are griddable and parameters used by deeplearning.
        2. It will find the intersection of parameters that are both griddable and used by deeplearning.
        3. There are several extra parameters that are used by deeplearning that are denoted as griddable but actually
        is not.  These parameters have to be discovered manually and they These are captured in
        self.exclude_parameter_lists.
        4. We generate the gridsearch hyper-parameter.  For numerical parameters, we will generate those randomly.
        For enums, we will include all of them.

        :return: None
        """
        # build bare bone model to get all parameters
        model = H2ODeepLearningEstimator(distribution=self.family,
                                         seed=self.seed,
                                         nfolds=self.nfolds,
                                         hidden=[10, 10, 10])
        model.train(x=self.x_indices,
                    y=self.y_index,
                    training_frame=self.training1_data)

        self.model_run_time = pyunit_utils.find_grid_runtime(
            [model])  # find model train time
        print("Time taken to build a base barebone model is {0}".format(
            self.model_run_time))

        summary_list = model._model_json["output"]["scoring_history"]
        num_iterations = summary_list.cell_values[2][
            summary_list.col_header.index('iterations')]

        if num_iterations == 0:
            self.min_runtime_per_iteration = self.model_run_time
        else:
            self.min_runtime_per_iteration = self.model_run_time / num_iterations

        # grab all gridable parameters and its type
        (self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
            pyunit_utils.get_gridables(model._model_json["parameters"])

        # randomly generate griddable parameters including values outside legal range, like setting alpha values to
        # be outside legal range of 0 and 1 and etc
        (self.hyper_params, self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
            pyunit_utils.gen_grid_search(model.full_parameters.keys(), self.hyper_params,
                                         self.exclude_parameter_lists,
                                         self.gridable_parameters, self.gridable_types, self.gridable_defaults,
                                         random.randint(1, self.max_int_number),
                                         self.max_int_val, self.min_int_val,
                                         random.randint(1, self.max_real_number),
                                         self.max_real_val, self.min_real_val)

        # scale the max_runtime_secs parameter and others as well to make sure they make sense
        time_scale = self.time_scale * self.model_run_time
        if "max_runtime_secs" in list(self.hyper_params):
            self.hyper_params["max_runtime_secs"] = [
                time_scale * x for x in self.hyper_params["max_runtime_secs"]
            ]

        if "epsilon" in list(self.hyper_params):
            self.hyper_params["epsilon"] = [
                1e-4 * x for x in self.hyper_params["epsilon"]
            ]

        if "input_dropout_ratio" in list(self.hyper_params):
            self.hyper_params["input_dropout_ratio"] = [
                0.5 * x for x in self.hyper_params["input_dropout_ratio"]
            ]

        if "hidden_dropout_ratio" in list(self.hyper_params):
            self.hyper_params["hidden_dropout_ratio"] = [
                0.5 * x for x in self.hyper_params["hidden_dropout_ratio"]
            ]

        if "hidden" in list(self.hyper_params):  # need to change this up
            # randomly generate the number of layers in the network
            num_layer = random.randint(1, 3)

            # for each layer, randomly generate the number of nodes in it
            self.hyper_params["hidden"] = [
                random.randint(1, self.max_int_val)
                for p in range(0, num_layer)
            ]

        if "epochs" in self.hyper_params:
            self.hyper_params["epochs"] = [
                random.randint(self.min_int_val, self.max_int_val)
                for p in range(0, self.max_int_number)
            ]

        # generate a new final_hyper_params which only takes a subset of all griddable parameters
        [self.possible_number_models, self.final_hyper_params] = \
            pyunit_utils.check_and_count_models(self.hyper_params, [], [],
                                                [], [],
                                                self.max_grid_model)

        # must add max_runtime_secs to restrict unit test run time and as a promise to Arno to test for this
        if ("max_runtime_secs" not in list(self.final_hyper_params)) and \
                ("max_runtime_secs" in list(self.hyper_params)):
            self.final_hyper_params["max_runtime_secs"] = self.hyper_params[
                "max_runtime_secs"]

        self.final_hyper_params["seed"] = [
            self.seed
        ]  # added see to make test more repeatable

        # write out the hyper-parameters used into json files.
        pyunit_utils.write_hyper_parameters_json(self.current_dir,
                                                 self.sandbox_dir,
                                                 self.json_filename,
                                                 self.final_hyper_params)
コード例 #12
0
    def setup_model(self):
        """
        This function setup the gridsearch hyper-parameters that will be used later on:

        1. It will first try to grab all the parameters that are griddable and parameters used by GLM.
        2. It will find the intersection of parameters that are both griddable and used by GLM.
        3. There are several extra parameters that are used by GLM that are denoted as griddable but actually is not.
        These parameters have to be discovered manually and they These are captured in self.exclude_parameter_lists.
        4. We generate the gridsearch hyper-parameter.  For numerical parameters, we will generate those randomly.
        For enums, we will include all of them.

        :return: None
        """
        # build bare bone model to get all parameters
        model = H2OGeneralizedLinearEstimator(family=self.family,
                                              nfolds=self.nfolds)
        model.train(x=self.x_indices,
                    y=self.y_index,
                    training_frame=self.training1_data)

        run_time = pyunit_utils.find_grid_runtime([model
                                                   ])  # find model train time
        print("Time taken to build a base barebone model is {0}".format(
            run_time))

        summary_list = model._model_json["output"]["model_summary"]
        num_iteration = summary_list.cell_values[0][
            summary_list.col_header.index("number_of_iterations")]

        if num_iteration == 0:
            self.min_runtime_per_epoch = run_time
        else:
            self.min_runtime_per_epoch = run_time / num_iteration

        # grab all gridable parameters and its type
        (self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
            pyunit_utils.get_gridables(model._model_json["parameters"])

        # randomly generate griddable parameters including values outside legal range, like setting alpha values to
        # be outside legal range of 0 and 1 and etc
        (self.hyper_params_bad, self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
            pyunit_utils.gen_grid_search(model.full_parameters.keys(), self.hyper_params_bad,
                                         self.exclude_parameter_lists,
                                         self.gridable_parameters, self.gridable_types, self.gridable_defaults,
                                         random.randint(1, self.max_int_number),
                                         self.max_int_val, self.min_int_val,
                                         random.randint(1, self.max_real_number),
                                         self.max_real_val*self.alpha_scale, self.min_real_val*self.alpha_scale)

        # scale the value of lambda parameters
        if "lambda" in list(self.hyper_params_bad):
            self.hyper_params_bad["lambda"] = [
                self.lambda_scale * x for x in self.hyper_params_bad["lambda"]
            ]

        # scale the max_runtime_secs parameters
        time_scale = self.time_scale * run_time
        if "max_runtime_secs" in list(self.hyper_params_bad):
            self.hyper_params_bad["max_runtime_secs"] = [
                time_scale * x
                for x in self.hyper_params_bad["max_runtime_secs"]
            ]

        [self.possible_number_models, self.final_hyper_params_bad] = \
            pyunit_utils.check_and_count_models(self.hyper_params_bad, self.params_zero_one, self.params_more_than_zero,
                                                self.params_more_than_one, self.params_zero_positive,
                                                self.max_grid_model)

        if ("max_runtime_secs" not in list(self.final_hyper_params_bad)) and \
                ("max_runtime_secs" in list(self.hyper_params_bad)):
            self.final_hyper_params_bad[
                "max_runtime_secs"] = self.hyper_params_bad["max_runtime_secs"]
            len_good_time = len([
                x for x in self.hyper_params_bad["max_runtime_secs"]
                if (x >= 0)
            ])
            self.possible_number_models = self.possible_number_models * len_good_time

        # Stratified is illegal for Gaussian GLM
        self.possible_number_models = self.possible_number_models * self.scale_model

        # randomly generate griddable parameters with only good values
        (self.hyper_params, self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
            pyunit_utils.gen_grid_search(model.full_parameters.keys(), self.hyper_params, self.exclude_parameter_lists,
                                         self.gridable_parameters, self.gridable_types, self.gridable_defaults,
                                         random.randint(1, self.max_int_number),
                                         self.max_int_val, 0,
                                         random.randint(1, self.max_real_number),
                                         self.max_real_val, 0)

        # scale the value of lambda parameters
        if "lambda" in list(self.hyper_params):
            self.hyper_params["lambda"] = [
                self.lambda_scale * x for x in self.hyper_params["lambda"]
            ]

        # scale the max_runtime_secs parameters
        if "max_runtime_secs" in list(self.hyper_params):
            self.hyper_params["max_runtime_secs"] = [
                time_scale * x for x in self.hyper_params["max_runtime_secs"]
            ]

        [self.true_correct_model_number, self.final_hyper_params] = \
            pyunit_utils.check_and_count_models(self.hyper_params, self.params_zero_one, self.params_more_than_zero,
                                                self.params_more_than_one, self.params_zero_positive,
                                                self.max_grid_model)

        if ("max_runtime_secs" not in list(self.final_hyper_params)) and \
                ("max_runtime_secs" in list(self.hyper_params)):
            self.final_hyper_params["max_runtime_secs"] = self.hyper_params[
                "max_runtime_secs"]
            self.true_correct_model_number = \
                self.true_correct_model_number * len(self.final_hyper_params["max_runtime_secs"])

        # write out the hyper-parameters used into json files.
        pyunit_utils.write_hyper_parameters_json(self.current_dir,
                                                 self.sandbox_dir,
                                                 self.json_filename_bad,
                                                 self.final_hyper_params_bad)

        pyunit_utils.write_hyper_parameters_json(self.current_dir,
                                                 self.sandbox_dir,
                                                 self.json_filename,
                                                 self.final_hyper_params)
    def setup_model(self):
        """
        This function setup the gridsearch hyper-parameters that will be used later on:

        1. It will first try to grab all the parameters that are griddable and parameters used by naivebayes.
        2. It will find the intersection of parameters that are both griddable and used by naivebayes.
        3. There are several extra parameters that are used by naivebayes that are denoted as griddable but actually
        are not.  These parameters have to be discovered manually and they are captured in
        self.exclude_parameter_lists.
        4. We generate the gridsearch hyper-parameter.  For numerical parameters, we will generate those randomly.
        For enums, we will include all of them.

        :return: None
        """
        # build bare bone model to get all parameters
        model = H2ONaiveBayesEstimator(nfolds=self.nfolds, compute_metrics=True)
        model.train(x=self.x_indices, y=self.y_index, training_frame=self.training1_data)

        self.model_run_time = pyunit_utils.find_grid_runtime([model])  # find model train time
        print("Time taken to build a base barebone model is {0}".format(self.model_run_time))

        # grab all gridable parameters and its type
        (self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
            pyunit_utils.get_gridables(model._model_json["parameters"])

        # randomly generate griddable parameters including values outside legal range, like setting alpha values to
        # be outside legal range of 0 and 1 and etc
        (self.hyper_params, self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
            pyunit_utils.gen_grid_search(model.full_parameters.keys(), self.hyper_params,
                                         self.exclude_parameter_lists,
                                         self.gridable_parameters, self.gridable_types, self.gridable_defaults,
                                         random.randint(1, self.max_int_number),
                                         self.max_int_val, self.min_int_val,
                                         random.randint(1, self.max_real_number),
                                         self.max_real_val, self.min_real_val)

        # scale the max_runtime_secs parameter and others as well to make sure they make sense
        time_scale = self.time_scale * self.model_run_time
        if "max_runtime_secs" in list(self.hyper_params):
            self.hyper_params["max_runtime_secs"] = [time_scale * x for x
                                                     in self.hyper_params["max_runtime_secs"]]

        # generate a new final_hyper_params which only takes a subset of all griddable parameters while
        # hyper_params take all griddable parameters and generate the grid search hyper-parameters
        [self.possible_number_models, self.final_hyper_params] = \
            pyunit_utils.check_and_count_models(self.hyper_params, self.params_zero_one, self.params_more_than_zero,
                                                self.params_more_than_one, self.params_zero_positive,
                                                self.max_grid_model)

        final_hyper_params_keys = list(self.final_hyper_params)
        # must add max_runtime_secs to restrict unit test run time and as a promise to Arno to test for this
        if ("max_runtime_secs" not in final_hyper_params_keys) and \
                ("max_runtime_secs" in list(self.hyper_params)):
            self.final_hyper_params["max_runtime_secs"] = self.hyper_params["max_runtime_secs"]
            len_good_time = len([x for x in self.hyper_params["max_runtime_secs"] if (x >= 0)])
            self.possible_number_models = self.possible_number_models*len_good_time

        # need to check that min_prob >= 1e-10
        if "min_prob" in final_hyper_params_keys:
            old_len_prob = len([x for x in self.final_hyper_params["max_runtime_secs"] if (x >= 0)])
            good_len_prob = len([x for x in self.final_hyper_params["max_runtime_secs"] if (x >= 1e-10)])
            if (old_len_prob > 0):
                self.possible_number_models = self.possible_number_models*good_len_prob/old_len_prob
            else:
                self.possible_number_models = 0

        if "laplace" in final_hyper_params_keys:
            self.final_hyper_params["laplace"] = [self.laplace_scale * x for x
                                                  in self.hyper_params["laplace"]]

            # write out the hyper-parameters used into json files.
        pyunit_utils.write_hyper_parameters_json(self.current_dir, self.sandbox_dir, self.json_filename,
                                                 self.final_hyper_params)
    def setup_model(self):
        """
        This function setup the gridsearch hyper-parameters that will be used later on:

        1. It will first try to grab all the parameters that are griddable and parameters used by naivebayes.
        2. It will find the intersection of parameters that are both griddable and used by naivebayes.
        3. There are several extra parameters that are used by naivebayes that are denoted as griddable but actually
        are not.  These parameters have to be discovered manually and they are captured in
        self.exclude_parameter_lists.
        4. We generate the gridsearch hyper-parameter.  For numerical parameters, we will generate those randomly.
        For enums, we will include all of them.

        :return: None
        """
        # build bare bone model to get all parameters
        model = H2ONaiveBayesEstimator(nfolds=self.nfolds, compute_metrics=True)
        model.train(x=self.x_indices, y=self.y_index, training_frame=self.training1_data)

        self.model_run_time = pyunit_utils.find_grid_runtime([model])  # find model train time
        print("Time taken to build a base barebone model is {0}".format(self.model_run_time))

        # grab all gridable parameters and its type
        (self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
            pyunit_utils.get_gridables(model._model_json["parameters"])

        # randomly generate griddable parameters including values outside legal range, like setting alpha values to
        # be outside legal range of 0 and 1 and etc
        (self.hyper_params, self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
            pyunit_utils.gen_grid_search(model.full_parameters.keys(), self.hyper_params,
                                         self.exclude_parameter_lists,
                                         self.gridable_parameters, self.gridable_types, self.gridable_defaults,
                                         random.randint(1, self.max_int_number),
                                         self.max_int_val, self.min_int_val,
                                         random.randint(1, self.max_real_number),
                                         self.max_real_val, self.min_real_val)

        # scale the max_runtime_secs parameter and others as well to make sure they make sense
        time_scale = self.time_scale * self.model_run_time
        if "max_runtime_secs" in list(self.hyper_params):
            self.hyper_params["max_runtime_secs"] = [time_scale * x for x
                                                     in self.hyper_params["max_runtime_secs"]]

        # generate a new final_hyper_params which only takes a subset of all griddable parameters while
        # hyper_params take all griddable parameters and generate the grid search hyper-parameters
        [self.possible_number_models, self.final_hyper_params] = \
            pyunit_utils.check_and_count_models(self.hyper_params, self.params_zero_one, self.params_more_than_zero,
                                                self.params_more_than_one, self.params_zero_positive,
                                                self.max_grid_model)

        final_hyper_params_keys = list(self.final_hyper_params)
        # must add max_runtime_secs to restrict unit test run time and as a promise to Arno to test for this
        if ("max_runtime_secs" not in final_hyper_params_keys) and \
                ("max_runtime_secs" in list(self.hyper_params)):
            self.final_hyper_params["max_runtime_secs"] = self.hyper_params["max_runtime_secs"]
            len_good_time = len([x for x in self.hyper_params["max_runtime_secs"] if (x >= 0)])
            self.possible_number_models = self.possible_number_models*len_good_time

        # need to check that min_prob >= 1e-10
        if "min_prob" in final_hyper_params_keys:
            old_len_prob = len([x for x in self.final_hyper_params["max_runtime_secs"] if (x >= 0)])
            good_len_prob = len([x for x in self.final_hyper_params["max_runtime_secs"] if (x >= 1e-10)])
            if (old_len_prob > 0):
                self.possible_number_models = self.possible_number_models*good_len_prob/old_len_prob
            else:
                self.possible_number_models = 0

        if "laplace" in final_hyper_params_keys:
            self.final_hyper_params["laplace"] = [self.laplace_scale * x for x
                                                  in self.hyper_params["laplace"]]

            # write out the hyper-parameters used into json files.
        pyunit_utils.write_hyper_parameters_json(self.current_dir, self.sandbox_dir, self.json_filename,
                                                 self.final_hyper_params)