Example 1
    def setup_grid_params(self):
        """
        This function sets up the randomized gridsearch parameters that will be used later on:

        1. It first grabs all the parameters that are gridable and all the parameters used by GLM.
        2. It finds the intersection of the parameters that are both gridable and used by GLM.
        3. Several extra parameters used by GLM are marked as gridable but actually are not.  These
        parameters have to be discovered manually and are captured in self.exclude_parameter_lists.
        4. It generates the gridsearch hyper-parameters.  Numerical parameters are generated randomly;
        for enums, all levels are included.

        :return: None
        """
        # build bare bone model to get all parameters
        model = H2OGeneralizedLinearEstimator(family=self.family, nfolds=self.nfolds)
        model.train(x=self.x_indices, y=self.y_index, training_frame=self.training1_data)

        self.one_model_time = pyunit_utils.find_grid_runtime([model])  # find model train time
        print("Time taken to build a base barebone model is {0}".format(self.one_model_time))

        # grab all gridable parameters and their types
        (self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
            pyunit_utils.get_gridables(model._model_json["parameters"])

        # give the user opportunity to pre-assign hyper parameters for fixed values
        self.hyper_params = {}
        self.hyper_params["fold_assignment"] = ['AUTO', 'Random', 'Modulo']
        self.hyper_params["missing_values_handling"] = ['MeanImputation', 'Skip']

        # randomly generate griddable parameters
        (self.hyper_params, self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
            pyunit_utils.gen_grid_search(model.full_parameters.keys(), self.hyper_params, self.exclude_parameter_lists,
                                         self.gridable_parameters, self.gridable_types, self.gridable_defaults,
                                         random.randint(1, self.max_int_number), self.max_int_val, self.min_int_val,
                                         random.randint(1, self.max_real_number), self.max_real_val, self.min_real_val)


        # change the value of lambda parameters to be from 0 to self.lambda_scale instead of 0 to 1.
        if "lambda" in list(self.hyper_params):
            self.hyper_params["lambda"] = [self.lambda_scale * x for x in self.hyper_params["lambda"]]

        time_scale = self.max_runtime_scale * self.one_model_time
        # change the value of max_runtime_secs to be from 0 to time_scale instead of from 0 to 1.
        if "max_runtime_secs" in list(self.hyper_params):
            self.hyper_params["max_runtime_secs"] = [time_scale * x for x in
                                                     self.hyper_params["max_runtime_secs"]]

        # number of possible models being built:
        self.possible_number_models = pyunit_utils.count_models(self.hyper_params)

        # save hyper-parameters in sandbox and current test directories.
        pyunit_utils.write_hyper_parameters_json(self.current_dir, self.sandbox_dir, self.json_filename,
                                                 self.hyper_params)
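
After setup_grid_params has populated self.hyper_params, the random grid search itself is launched in the test functions further below. The following is a minimal, self-contained sketch of that usage; the hyper-parameter values and search criteria shown here are illustrative assumptions, not output of the test class.

# Minimal sketch: consuming randomly generated hyper-parameters in a random grid search.
# The concrete hyper_params values below are illustrative assumptions.
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.grid.grid_search import H2OGridSearch

hyper_params = {
    "alpha": [0.1, 0.5, 0.9],                         # numerical parameters are generated randomly
    "fold_assignment": ['AUTO', 'Random', 'Modulo'],  # all enum levels are included
}
search_criteria = {"strategy": "RandomDiscrete", "max_models": 10, "seed": 1234}

grid = H2OGridSearch(H2OGeneralizedLinearEstimator(family="binomial", nfolds=5),
                     hyper_params=hyper_params, search_criteria=search_criteria)
# grid.train(x=x_indices, y=y_index, training_frame=training_frame)  # requires a running H2O cluster
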
Example 2
    def setup_model(self):
        """
        This function sets up the gridsearch parameters that will be used later on:

        1. It first grabs all the parameters that are gridable and all the parameters used by GLM.
        2. It finds the intersection of the parameters that are both gridable and used by GLM.
        3. Several extra parameters used by GLM are marked as gridable but actually are not.  These
        parameters have to be discovered manually and are captured in self.exclude_parameter_lists.
        4. It generates the gridsearch hyper-parameters.  Numerical parameters are generated randomly;
        for enums, all levels are included.

        :return: None
        """
        # build bare bone model to get all parameters
        model = H2OGeneralizedLinearEstimator(family=self.family)
        model.train(x=self.x_indices,
                    y=self.y_index,
                    training_frame=self.training1_data)

        # grab all gridable parameters and their types
        (self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
            pyunit_utils.get_gridables(model._model_json["parameters"])

        # randomly generate griddable parameters
        (self.hyper_params, self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
            pyunit_utils.gen_grid_search(model._parms.keys(), self.hyper_params, self.exclude_parameter_lists,
                                         self.gridable_parameters, self.gridable_types, self.gridable_defaults,
                                         random.randint(1, self.max_int_number),
                                         self.max_int_val, self.min_int_val,
                                         random.randint(1, self.max_real_number),
                                         self.max_real_val, self.min_real_val)

        # change the value of lambda parameters to be from 0 to self.lambda_scale instead of 0 to 1.
        if "lambda" in list(self.hyper_params):
            self.hyper_params["lambda"] = [
                self.lambda_scale * x for x in self.hyper_params["lambda"]
            ]

        # fix the float precision again; it might have changed with the scaling
        self.hyper_params["lambda"] = pyunit_utils.fix_float_precision(
            self.hyper_params["lambda"])
        self.possible_number_models = pyunit_utils.count_models(
            self.hyper_params)

        # write out the jenkins job info into log files.
        json_file = os.path.join(self.sandbox_dir, self.json_filename)

        with open(json_file, 'w') as test_file:
            json.dump(self.hyper_params, test_file)
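
pyunit_utils.count_models reports above how many models the grid could build; conceptually that is just the size of the Cartesian product of the hyper-parameter lists. A minimal sketch of the idea follows (the helper's real implementation is not shown in this listing and may differ):

# Hedged sketch: the Cartesian-product model count implied by a hyper-parameter dict.
# The real pyunit_utils.count_models may differ in details.
from functools import reduce

def count_models_sketch(hyper_params):
    return reduce(lambda acc, values: acc * len(values), hyper_params.values(), 1)

# e.g. 3 fold_assignment values x 2 missing_values_handling values -> 6 models
print(count_models_sketch({"fold_assignment": ['AUTO', 'Random', 'Modulo'],
                           "missing_values_handling": ['MeanImputation', 'Skip']}))
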
    def setup_model(self):
        """
        This function sets up the gridsearch parameters that will be used later on:

        1. It first grabs all the parameters that are gridable and all the parameters used by GLM.
        2. It finds the intersection of the parameters that are both gridable and used by GLM.
        3. Several extra parameters used by GLM are marked as gridable but actually are not.  These
        parameters have to be discovered manually and are captured in self.exclude_parameter_lists.
        4. It generates the gridsearch hyper-parameters.  Numerical parameters are generated randomly;
        for enums, all levels are included.

        :return: None
        """
        # build bare bone model to get all parameters
        model = H2OGeneralizedLinearEstimator(family=self.family)
        model.train(x=self.x_indices, y=self.y_index, training_frame=self.training1_data)

        # grab all gridable parameters and their types
        (self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
            pyunit_utils.get_gridables(model._model_json["parameters"])

        # randomly generate griddable parameters
        (self.hyper_params, self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
            pyunit_utils.gen_grid_search(model._parms.keys(), self.hyper_params, self.exclude_parameter_lists,
                                         self.gridable_parameters, self.gridable_types, self.gridable_defaults,
                                         random.randint(1, self.max_int_number),
                                         self.max_int_val, self.min_int_val,
                                         random.randint(1, self.max_real_number),
                                         self.max_real_val, self.min_real_val)

        # change the value of lambda parameters to be from 0 to self.lambda_scale instead of 0 to 1.
        if "lambda" in list(self.hyper_params):
            self.hyper_params["lambda"] = [self.lambda_scale * x for x in self.hyper_params["lambda"]]

        # fix the float precision again; it might have changed with the scaling
        self.hyper_params["lambda"] = pyunit_utils.fix_float_precision(self.hyper_params["lambda"])
        self.possible_number_models = pyunit_utils.count_models(self.hyper_params)

        # write out the jenkins job info into log files.
        json_file = os.path.join(self.sandbox_dir, self.json_filename)

        with open(json_file, 'w') as test_file:
            json.dump(self.hyper_params, test_file)
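
pyunit_utils.fix_float_precision is called above after scaling lambda; its implementation is not part of this listing. Presumably it rounds the scaled values so that floating-point noise from the multiplication does not turn equal grid points into distinct ones. A hedged sketch of that idea:

# Hedged sketch of rounding scaled hyper-parameter values to a fixed precision so that
# floating-point noise from the scaling does not create near-duplicate grid points.
# The real pyunit_utils.fix_float_precision may behave differently.
def fix_float_precision_sketch(values, digits=6):
    return [round(v, digits) for v in values]

print(fix_float_precision_sketch([0.1 * 3, 0.30000000000000004]))  # -> [0.3, 0.3]
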
    def test3_glm_random_grid_search_max_runtime_secs(self):
        """
        This function tests the stopping criterion max_runtime_secs.  For each model built, the field
        run_time denotes the time in ms used to build the model.  We add up the run_time from all
        models and check the total against the stopping criterion max_runtime_secs.  Since each model checks its
        run time differently, there is some inaccuracy in the actual run time.  For example, if we give a model
        10 ms to build, GLM may only check whether it has used up its time every 10 epochs it has run, whereas
        deeplearning may check the time it has spent after every epoch of training.

        If the actual runtime does not exceed the specified max_runtime_secs by more than a certain
        percentage, we will consider the test a success.

        :return: None
        """
        print("*******************************************************************************************")
        print("test3_glm_random_grid_search_max_runtime_secs for GLM " + self.family)
        h2o.cluster_info()

        if "max_runtime_secs" in list(self.hyper_params):
            del self.hyper_params['max_runtime_secs']
            # number of possible models being built:
            self.possible_number_models = pyunit_utils.count_models(self.hyper_params)

        # set up our stopping condition here
        max_run_time_secs = random.uniform(self.one_model_time, self.allowed_scaled_time*self.max_grid_runtime)
        search_criteria = {'strategy': 'RandomDiscrete', 'max_runtime_secs': max_run_time_secs,
                           "seed": int(round(time.time()))}
        # search_criteria = {'strategy': 'RandomDiscrete', 'max_runtime_secs': 1/1e8}

        print("GLM Binomial grid search_criteria: {0}".format(search_criteria))

        # fire off random grid-search
        grid_model = \
            H2OGridSearch(H2OGeneralizedLinearEstimator(family=self.family, nfolds=self.nfolds),
                          hyper_params=self.hyper_params, search_criteria=search_criteria)
        grid_model.train(x=self.x_indices, y=self.y_index, training_frame=self.training1_data)

        actual_run_time_secs = pyunit_utils.find_grid_runtime(grid_model)

        print("Maximum time limit is {0}.  Time taken to build all model is "
              "{1}".format(search_criteria["max_runtime_secs"], actual_run_time_secs))

        print("Maximum model number is {0}.  Actual number of models built is {1}".format(self.possible_number_models,
                                                                                          len(grid_model)))

        if actual_run_time_secs <= search_criteria["max_runtime_secs"]*(1+self.allowed_diff):
            print("test3_glm_random_grid_search_max_runtime_secs: passed!")

            if len(grid_model) > self.possible_number_models:   # generated too many models; something is wrong
                self.test_failed += 1
                self.test_failed_array[self.test_num] = 1
                print("test3_glm_random_grid_search_max_runtime_secs: failed.  Generated {0} models "
                      " which exceeds maximum possible model number {1}".format(len(grid_model),
                                                                                self.possible_number_models))
        elif len(grid_model) == 1:  # the grid search always builds at least 1 model
            print("test3_glm_random_grid_search_max_runtime_secs: passed!")
        else:
            self.test_failed += 1
            self.test_failed_array[self.test_num] = 1
            print("test3_glm_random_grid_search_max_runtime_secs: failed.  Model takes time {0}"
                  " seconds which exceeds allowed time {1}".format(actual_run_time_secs,
                                                                   max_run_time_secs*(1+self.allowed_diff)))
        self.test_num += 1
        sys.stdout.flush()
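
pyunit_utils.find_grid_runtime is used above to obtain the total build time of the grid. As the docstring explains, each model's run_time field records its build time in milliseconds; the sketch below sums those values and applies the tolerance check. The JSON field path and the helper's exact behaviour (e.g. handling of cross-validation models) are assumptions.

# Hedged sketch: sum per-model run_time (milliseconds) over a grid and compare the total,
# in seconds, against the max_runtime_secs stopping criterion with a relative tolerance.
def total_grid_runtime_secs(models):
    total_ms = sum(m._model_json["output"]["run_time"] for m in models)  # run_time is assumed to be in ms
    return total_ms / 1000.0

def within_time_budget(actual_secs, max_runtime_secs, allowed_diff=0.1):
    return actual_secs <= max_runtime_secs * (1 + allowed_diff)
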
    def test3_glm_random_grid_search_max_runtime_secs(self):
        """
        This function tests the stopping criterion max_runtime_secs.  For each model built, the field
        run_time denotes the time in ms used to build the model.  We add up the run_time from all
        models and check the total against the stopping criterion max_runtime_secs.  Since each model checks its
        run time differently, there is some inaccuracy in the actual run time.  For example, if we give a model
        10 ms to build, GLM may only check whether it has used up its time every 10 epochs it has run, whereas
        deeplearning may check the time it has spent after every epoch of training.

        If the actual runtime does not exceed the specified max_runtime_secs by more than a certain
        percentage, we will consider the test a success.

        :return: None
        """
        print("*******************************************************************************************")
        print("test3_glm_random_grid_search_max_runtime_secs for GLM " + self.family)
        h2o.cluster_info()

        if "max_runtime_secs" in list(self.hyper_params):
            del self.hyper_params['max_runtime_secs']
            # number of possible models being built:
            self.possible_number_models = pyunit_utils.count_models(self.hyper_params)

        # set up our stopping condition here
        max_run_time_secs = random.uniform(self.one_model_time, self.allowed_scaled_time*self.max_grid_runtime)

        search_criteria = {'strategy': 'RandomDiscrete', 'max_runtime_secs': max_run_time_secs,
                           "seed": round(time.time())}
        # search_criteria = {'strategy': 'RandomDiscrete', 'max_runtime_secs': 1/1e8}

        print("GLM Gaussian grid search_criteria: {0}".format(search_criteria))

        # fire off random grid-search
        grid_model = \
            H2OGridSearch(H2OGeneralizedLinearEstimator(family=self.family, nfolds=self.nfolds),
                          hyper_params=self.hyper_params, search_criteria=search_criteria)
        grid_model.train(x=self.x_indices, y=self.y_index, training_frame=self.training1_data)

        actual_run_time_secs = pyunit_utils.find_grid_runtime(grid_model)

        print("Maximum time limit is {0}.  Time taken to build all model is "
              "{1}".format(search_criteria["max_runtime_secs"], actual_run_time_secs))

        print("Maximum model number is {0}.  Actual number of models built is {1}".format(self.possible_number_models,
                                                                                          len(grid_model)))

        if actual_run_time_secs <= search_criteria["max_runtime_secs"]*(1+self.allowed_diff):
            print("test3_glm_random_grid_search_max_runtime_secs: passed!")

            if len(grid_model) > self.possible_number_models:   # generated too many models; something is wrong
                self.test_failed += 1
                self.test_failed_array[self.test_num] = 1
                print("test3_glm_random_grid_search_max_runtime_secs: failed.  Generated {0} models "
                      " which exceeds maximum possible model number {1}".format(len(grid_model),
                                                                                self.possible_number_models))
        elif len(grid_model) == 1:  # the grid search always builds at least 1 model
            print("test3_glm_random_grid_search_max_runtime_secs: passed!")
        else:
            self.test_failed += 1
            self.test_failed_array[self.test_num] = 1
            print("test3_glm_random_grid_search_max_runtime_secs: failed.  Model takes time {0}"
                  " seconds which exceeds allowed time {1}".format(actual_run_time_secs,
                                                                   max_run_time_secs*(1+self.allowed_diff)))
        self.test_num += 1
        sys.stdout.flush()
    def setup_model(self):
        """
        This function sets up the gridsearch hyper-parameters that will be used later on:

        1. It first grabs all the parameters that are gridable and all the parameters used by GLM.
        2. It finds the intersection of the parameters that are both gridable and used by GLM.
        3. Several extra parameters used by GLM are marked as gridable but actually are not.  These
        parameters have to be discovered manually and are captured in self.exclude_parameter_lists.
        4. It generates the gridsearch hyper-parameters.  Numerical parameters are generated randomly;
        for enums, all levels are included.

        :return: None
        """
        # build bare bone model to get all parameters
        model = H2OGeneralizedLinearEstimator(family=self.family, nfolds=self.nfolds)
        model.train(x=self.x_indices, y=self.y_index, training_frame=self.training1_data)

        run_time = pyunit_utils.find_grid_runtime([model])  # find model train time
        print("Time taken to build a base barebone model is {0}".format(run_time))

        summary_list = model._model_json["output"]["model_summary"]
        num_iteration = summary_list.cell_values[0][summary_list.col_header.index('number_of_iterations')]

        if num_iteration == 0:
            self.min_runtime_per_epoch = run_time
        else:
            self.min_runtime_per_epoch = run_time/num_iteration

        # grab all gridable parameters and their types
        (self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
            pyunit_utils.get_gridables(model._model_json["parameters"])

        # randomly generate gridable parameters, including values outside the legal range (e.g. alpha values
        # outside the legal range of 0 to 1)
        (self.hyper_params_bad, self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
            pyunit_utils.gen_grid_search(model.full_parameters.keys(), self.hyper_params_bad,
                                         self.exclude_parameter_lists,
                                         self.gridable_parameters, self.gridable_types, self.gridable_defaults,
                                         random.randint(1, self.max_int_number),
                                         self.max_int_val, self.min_int_val,
                                         random.randint(1, self.max_real_number),
                                         self.max_real_val*self.alpha_scale, self.min_real_val*self.alpha_scale)

        # scale the value of lambda parameters
        if "lambda" in list(self.hyper_params_bad):
            self.hyper_params_bad["lambda"] = [self.lambda_scale * x for x in self.hyper_params_bad["lambda"]]

        # scale the max_runtime_secs parameters
        time_scale = self.time_scale * run_time
        if "max_runtime_secs" in list(self.hyper_params_bad):
            self.hyper_params_bad["max_runtime_secs"] = [time_scale * x for x
                                                         in self.hyper_params_bad["max_runtime_secs"]]

        self.possible_number_models = pyunit_utils.count_models(self.hyper_params_bad)

        # calculate the true possible_number_models by excluding the bad parameter values, since they will
        # not result in any models being built
        alpha_len = len(self.hyper_params_bad["alpha"])
        lambda_len = len(self.hyper_params_bad["lambda"])
        time_len = len(self.hyper_params_bad["max_runtime_secs"])
        len_good_alpha = len([x for x in self.hyper_params_bad["alpha"] if (x >= 0) and (x <= 1)])
        len_good_lambda = len([x for x in self.hyper_params_bad["lambda"] if (x >= 0)])
        len_good_time = len([x for x in self.hyper_params_bad["max_runtime_secs"] if (x >= 0)])

        self.possible_number_models = int(self.possible_number_models * len_good_alpha * len_good_lambda *
                                          len_good_time / (alpha_len * lambda_len * time_len))

        # randomly generate griddable parameters with only good values
        (self.hyper_params, self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
            pyunit_utils.gen_grid_search(model.full_parameters.keys(), self.hyper_params, self.exclude_parameter_lists,
                                         self.gridable_parameters, self.gridable_types, self.gridable_defaults,
                                         random.randint(1, self.max_int_number),
                                         self.max_int_val, 0,
                                         random.randint(1, self.max_real_number),
                                         self.max_real_val, 0)

        self.true_correct_model_number = pyunit_utils.count_models(self.hyper_params)

        # scale the value of lambda parameters
        if "lambda" in list(self.hyper_params):
            self.hyper_params["lambda"] = [self.lambda_scale * x for x in self.hyper_params["lambda"]]

        # scale the max_runtime_secs parameters
        if "max_runtime_secs" in list(self.hyper_params):
            self.hyper_params["max_runtime_secs"] = [time_scale * x for x
                                                     in self.hyper_params["max_runtime_secs"]]

        # write out the hyper-parameters used into json files.
        pyunit_utils.write_hyper_parameters_json(self.current_dir, self.sandbox_dir, self.json_filename_bad,
                                                 self.hyper_params_bad)

        pyunit_utils.write_hyper_parameters_json(self.current_dir, self.sandbox_dir, self.json_filename,
                                                 self.hyper_params)
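
The adjustment of possible_number_models above discounts hyper-parameter values that fall outside their legal range and therefore never produce a model. A small numeric illustration of that arithmetic (all numbers are assumptions, not taken from an actual run):

# Illustrative arithmetic only: discount the raw Cartesian count by the fraction of
# alpha / lambda / max_runtime_secs values that are actually legal.
raw_count = 60                    # raw Cartesian count over all hyper-parameters
good_alpha, alpha_len = 2, 4      # alpha must lie in [0, 1]
good_lambda, lambda_len = 4, 5    # lambda must be >= 0
good_time, time_len = 3, 3        # max_runtime_secs must be >= 0
adjusted = int(raw_count * good_alpha * good_lambda * good_time / (alpha_len * lambda_len * time_len))
print(adjusted)                   # 60 * 2 * 4 * 3 / (4 * 5 * 3) = 24
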
Example 7
    def setup_model(self):
        """
        This function sets up the gridsearch hyper-parameters that will be used later on:

        1. It first grabs all the parameters that are gridable and all the parameters used by GLM.
        2. It finds the intersection of the parameters that are both gridable and used by GLM.
        3. Several extra parameters used by GLM are marked as gridable but actually are not.  These
        parameters have to be discovered manually and are captured in self.exclude_parameter_lists.
        4. It generates the gridsearch hyper-parameters.  Numerical parameters are generated randomly;
        for enums, all levels are included.

        :return: None
        """
        # build bare bone model to get all parameters
        model = H2OGeneralizedLinearEstimator(family=self.family)
        model.train(x=self.x_indices, y=self.y_index, training_frame=self.training1_data)

        run_time = pyunit_utils.find_grid_runtime([model])  # find model train time
        print("Time taken to build a base barebone model is {0}".format(run_time))

        summary_list = model._model_json["output"]["model_summary"]
        num_iteration = summary_list.cell_values[0][summary_list.col_header.index('number_of_iterations')]

        if num_iteration == 0:
            self.min_runtime_per_epoch = run_time
        else:
            self.min_runtime_per_epoch = run_time/num_iteration

        # grab all gridable parameters and their types
        (self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
            pyunit_utils.get_gridables(model._model_json["parameters"])

        # randomly generate gridable parameters, including values outside the legal range (e.g. alpha values
        # outside the legal range of 0 to 1)
        (self.hyper_params_bad, self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
            pyunit_utils.gen_grid_search(model.full_parameters.keys(), self.hyper_params_bad,
                                         self.exclude_parameter_lists,
                                         self.gridable_parameters, self.gridable_types, self.gridable_defaults,
                                         random.randint(1, self.max_int_number),
                                         self.max_int_val, self.min_int_val,
                                         random.randint(1, self.max_real_number),
                                         self.max_real_val*self.alpha_scale, self.min_real_val*self.alpha_scale)

        # scale the value of lambda parameters
        if "lambda" in list(self.hyper_params_bad):
            self.hyper_params_bad["lambda"] = [self.lambda_scale * x for x in self.hyper_params_bad["lambda"]]

        # scale the max_runtime_secs parameters
        time_scale = self.time_scale * run_time
        if "max_runtime_secs" in list(self.hyper_params_bad):
            self.hyper_params_bad["max_runtime_secs"] = [time_scale * x for x
                                                         in self.hyper_params_bad["max_runtime_secs"]]

        self.possible_number_models = pyunit_utils.count_models(self.hyper_params_bad)

        # calculate the true possible_number_models by excluding the bad parameter values, since they will
        # not result in any models being built
        alpha_len = len(self.hyper_params_bad["alpha"])
        lambda_len = len(self.hyper_params_bad["lambda"])
        time_len = len(self.hyper_params_bad["max_runtime_secs"])
        len_good_alpha = len([x for x in self.hyper_params_bad["alpha"] if (x >= 0) and (x <= 1)])
        len_good_lambda = len([x for x in self.hyper_params_bad["lambda"] if (x >= 0)])
        len_good_time = len([x for x in self.hyper_params_bad["max_runtime_secs"] if (x >= 0)])

        self.possible_number_models = int(self.possible_number_models * len_good_alpha * len_good_lambda *
                                          len_good_time / (alpha_len * lambda_len * time_len))

        # randomly generate griddable parameters with only good values
        (self.hyper_params, self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
            pyunit_utils.gen_grid_search(model.full_parameters.keys(), self.hyper_params, self.exclude_parameter_lists,
                                         self.gridable_parameters, self.gridable_types, self.gridable_defaults,
                                         random.randint(1, self.max_int_number),
                                         self.max_int_val, 0,
                                         random.randint(1, self.max_real_number),
                                         self.max_real_val, 0)

        self.true_correct_model_number = pyunit_utils.count_models(self.hyper_params)

        # scale the value of lambda parameters
        if "lambda" in list(self.hyper_params):
            self.hyper_params["lambda"] = [self.lambda_scale * x for x in self.hyper_params["lambda"]]

        # scale the max_runtime_secs parameters
        if "max_runtime_secs" in list(self.hyper_params):
            self.hyper_params["max_runtime_secs"] = [time_scale * x for x
                                                     in self.hyper_params["max_runtime_secs"]]

        # write out the hyper-parameters used into json files.
        pyunit_utils.write_hyper_parameters_json(self.current_dir, self.sandbox_dir, self.json_filename_bad,
                                                 self.hyper_params_bad)

        pyunit_utils.write_hyper_parameters_json(self.current_dir, self.sandbox_dir, self.json_filename,
                                                 self.hyper_params)