def setup_grid_params(self):
    """
    Build the hyper-parameter dictionary used by the randomized grid-search tests.

    The method:
        1. trains a bare-bones GLM and records its training time in
           self.one_model_time;
        2. asks pyunit_utils.get_gridables for the griddable parameters, their types
           and their defaults, based on the trained model's parameter list;
        3. pre-assigns fixed value lists for "fold_assignment" and
           "missing_values_handling";
        4. lets pyunit_utils.gen_grid_search generate the remaining hyper-parameters,
           skipping the names in self.exclude_parameter_lists (parameters reported as
           griddable that actually are not);
        5. rescales "lambda" by self.lambda_scale and "max_runtime_secs" by
           self.max_runtime_scale * self.one_model_time;
        6. stores the number of possible models and writes the hyper-parameters out
           as JSON.

    :return: None
    """
    # bare-bones model: its only purpose is to expose the full parameter list
    base_model = H2OGeneralizedLinearEstimator(family=self.family, nfolds=self.nfolds)
    base_model.train(x=self.x_indices, y=self.y_index, training_frame=self.training1_data)

    # training time of the base model, used below as the unit for max_runtime_secs
    self.one_model_time = pyunit_utils.find_grid_runtime([base_model])
    print("Time taken to build a base barebone model is {0}".format(self.one_model_time))

    # griddable parameters together with their types and default values
    gridables = pyunit_utils.get_gridables(base_model._model_json["parameters"])
    (self.gridable_parameters, self.gridable_types, self.gridable_defaults) = gridables

    # hyper-parameters whose candidate values are fixed up front instead of generated
    self.hyper_params = {
        "fold_assignment": ['AUTO', 'Random', 'Modulo'],
        "missing_values_handling": ['MeanImputation', 'Skip'],
    }

    # how many integer / real candidate values to generate (drawn in this order)
    int_value_count = random.randint(1, self.max_int_number)
    real_value_count = random.randint(1, self.max_real_number)
    generated = pyunit_utils.gen_grid_search(
        base_model.full_parameters.keys(),
        self.hyper_params,
        self.exclude_parameter_lists,
        self.gridable_parameters,
        self.gridable_types,
        self.gridable_defaults,
        int_value_count,
        self.max_int_val,
        self.min_int_val,
        real_value_count,
        self.max_real_val,
        self.min_real_val,
    )
    (self.hyper_params, self.gridable_parameters,
     self.gridable_types, self.gridable_defaults) = generated

    # lambda values are multiplied by self.lambda_scale
    if "lambda" in self.hyper_params:
        self.hyper_params["lambda"] = [
            self.lambda_scale * lambda_value for lambda_value in self.hyper_params["lambda"]
        ]

    # max_runtime_secs values are multiplied by a multiple of the base model's training time
    time_scale = self.max_runtime_scale * self.one_model_time
    if "max_runtime_secs" in self.hyper_params:
        self.hyper_params["max_runtime_secs"] = [
            time_scale * runtime for runtime in self.hyper_params["max_runtime_secs"]
        ]

    # number of possible models being built
    self.possible_number_models = pyunit_utils.count_models(self.hyper_params)

    # save hyper-parameters in sandbox and current test directories
    pyunit_utils.write_hyper_parameters_json(
        self.current_dir, self.sandbox_dir, self.json_filename, self.hyper_params
    )
def setup_model(self):
    """
    Prepare the grid-search hyper-parameters used by the later tests.

    The method:
        1. trains a bare-bones GLM so that its parameter list can be inspected;
        2. asks pyunit_utils.get_gridables for the griddable parameters, their types
           and their defaults;
        3. lets pyunit_utils.gen_grid_search extend self.hyper_params, skipping the
           names in self.exclude_parameter_lists (parameters reported as griddable
           that actually are not);
        4. rescales the "lambda" values by self.lambda_scale and re-applies the
           float-precision fix;
        5. stores the number of possible models and dumps the hyper-parameters as
           JSON into the sandbox directory.

    :return: None
    """
    # bare-bones model: its only purpose is to expose the parameter list
    glm_model = H2OGeneralizedLinearEstimator(family=self.family)
    glm_model.train(x=self.x_indices, y=self.y_index, training_frame=self.training1_data)

    # griddable parameters together with their types and default values
    gridables = pyunit_utils.get_gridables(glm_model._model_json["parameters"])
    (self.gridable_parameters, self.gridable_types, self.gridable_defaults) = gridables

    # how many integer / real candidate values to generate (drawn in this order)
    int_value_count = random.randint(1, self.max_int_number)
    real_value_count = random.randint(1, self.max_real_number)
    generated = pyunit_utils.gen_grid_search(
        glm_model._parms.keys(),
        self.hyper_params,
        self.exclude_parameter_lists,
        self.gridable_parameters,
        self.gridable_types,
        self.gridable_defaults,
        int_value_count,
        self.max_int_val,
        self.min_int_val,
        real_value_count,
        self.max_real_val,
        self.min_real_val,
    )
    (self.hyper_params, self.gridable_parameters,
     self.gridable_types, self.gridable_defaults) = generated

    if "lambda" in self.hyper_params:
        # lambda values are multiplied by self.lambda_scale
        self.hyper_params["lambda"] = [
            self.lambda_scale * lambda_value for lambda_value in self.hyper_params["lambda"]
        ]
        # fix the float precision again; the scaling may have changed it
        self.hyper_params["lambda"] = pyunit_utils.fix_float_precision(self.hyper_params["lambda"])

    self.possible_number_models = pyunit_utils.count_models(self.hyper_params)

    # write the hyper-parameters into the sandbox directory for the job logs
    json_path = os.path.join(self.sandbox_dir, self.json_filename)
    with open(json_path, 'w') as json_out:
        json.dump(self.hyper_params, json_out)
def setup_model(self):
    """
    Set up the grid-search hyper-parameters that the tests use later on.

    A bare-bones GLM is trained first so that its parameters can be examined.
    pyunit_utils.get_gridables reports which of them are griddable (with types and
    defaults), and pyunit_utils.gen_grid_search then fills self.hyper_params while
    leaving out everything named in self.exclude_parameter_lists, i.e. parameters
    that are flagged as griddable but are known not to be.  Finally the "lambda"
    values are rescaled by self.lambda_scale, the model count is recorded in
    self.possible_number_models, and the hyper-parameters are saved as JSON in the
    sandbox directory.

    :return: None
    """
    # build the bare-bones model to get hold of all parameters
    bare_model = H2OGeneralizedLinearEstimator(family=self.family)
    bare_model.train(x=self.x_indices, y=self.y_index, training_frame=self.training1_data)

    # grab all griddable parameters, their types and their defaults
    (self.gridable_parameters,
     self.gridable_types,
     self.gridable_defaults) = pyunit_utils.get_gridables(bare_model._model_json["parameters"])

    # randomly generate the griddable parameters
    (self.hyper_params,
     self.gridable_parameters,
     self.gridable_types,
     self.gridable_defaults) = pyunit_utils.gen_grid_search(
        bare_model._parms.keys(),
        self.hyper_params,
        self.exclude_parameter_lists,
        self.gridable_parameters,
        self.gridable_types,
        self.gridable_defaults,
        random.randint(1, self.max_int_number),
        self.max_int_val,
        self.min_int_val,
        random.randint(1, self.max_real_number),
        self.max_real_val,
        self.min_real_val)

    has_lambda = "lambda" in self.hyper_params
    if has_lambda:
        # multiply every lambda value by self.lambda_scale
        scale = self.lambda_scale
        self.hyper_params["lambda"] = [scale * lam for lam in self.hyper_params["lambda"]]

        # the scaling may have disturbed the float precision, so fix it again
        self.hyper_params["lambda"] = pyunit_utils.fix_float_precision(self.hyper_params["lambda"])

    self.possible_number_models = pyunit_utils.count_models(self.hyper_params)

    # dump the hyper-parameters into the sandbox directory for the job logs
    with open(os.path.join(self.sandbox_dir, self.json_filename), 'w') as params_file:
        json.dump(self.hyper_params, params_file)
def test3_glm_random_grid_search_max_runtime_secs(self):
    """
    Test the random grid-search stopping criterion max_runtime_secs.

    A random time budget is drawn between self.one_model_time and
    self.allowed_scaled_time * self.max_grid_runtime and handed to the grid search
    as max_runtime_secs.  After training, the total model-building time reported by
    pyunit_utils.find_grid_runtime is compared against that budget.  Models do not
    check the clock continuously, so the budget may be exceeded by a fraction of
    self.allowed_diff before the run is considered a failure.

    The test is recorded as
        - passed when the time is within the allowed budget and no more than
          self.possible_number_models models were built;
        - passed when the budget was exceeded but exactly one model was built (the
          grid search always builds at least one model);
        - failed otherwise, by incrementing self.test_failed and flagging
          self.test_failed_array[self.test_num].

    self.test_num is advanced in every case.

    :return: None
    """
    test_name = "test3_glm_random_grid_search_max_runtime_secs"

    print("*******************************************************************************************")
    print("test3_glm_random_grid_search_max_runtime_secs for GLM " + self.family)
    h2o.cluster_info()

    # max_runtime_secs is the stopping criterion here, so it must not also be gridded over
    self.hyper_params.pop('max_runtime_secs', None)

    # number of possible models being built
    self.possible_number_models = pyunit_utils.count_models(self.hyper_params)

    # set up our stopping condition here
    max_run_time_secs = random.uniform(self.one_model_time, self.allowed_scaled_time*self.max_grid_runtime)
    search_criteria = {'strategy': 'RandomDiscrete', 'max_runtime_secs': max_run_time_secs,
                       "seed": int(round(time.time()))}
    print("GLM Binomial grid search_criteria: {0}".format(search_criteria))

    # fire off random grid-search
    grid_model = H2OGridSearch(H2OGeneralizedLinearEstimator(family=self.family, nfolds=self.nfolds),
                               hyper_params=self.hyper_params, search_criteria=search_criteria)
    grid_model.train(x=self.x_indices, y=self.y_index, training_frame=self.training1_data)

    actual_run_time_secs = pyunit_utils.find_grid_runtime(grid_model)
    models_built = len(grid_model)
    # the time budget plus the tolerated overshoot
    allowed_run_time_secs = max_run_time_secs*(1+self.allowed_diff)

    print("Maximum time limit is {0}. Time taken to build all model is "
          "{1}".format(search_criteria["max_runtime_secs"], actual_run_time_secs))
    print("Maximum model number is {0}. Actual number of models built is {1}".format(self.possible_number_models,
                                                                                     models_built))

    # decide the outcome first so that exactly one verdict is printed
    failure_message = None
    if actual_run_time_secs <= allowed_run_time_secs:
        if models_built > self.possible_number_models:
            # generated too many models, something is wrong
            failure_message = ("test3_glm_random_grid_search_max_runtime_secs: failed. Generated {0} models "
                               " which exceeds maximum possible model number {1}".format(models_built,
                                                                                         self.possible_number_models))
    elif models_built != 1:
        # over budget is only acceptable for the single model that is always built
        failure_message = ("test3_glm_random_grid_search_max_runtime_secs: failed. Model takes time {0}"
                           " seconds which exceeds allowed time {1}".format(actual_run_time_secs,
                                                                            allowed_run_time_secs))

    if failure_message is None:
        print(test_name + ": passed!")
    else:
        self.test_failed += 1
        self.test_failed_array[self.test_num] = 1
        print(failure_message)

    self.test_num += 1
    sys.stdout.flush()
def test3_glm_random_grid_search_max_runtime_secs(self):
    """
    Test the random grid-search stopping criterion max_runtime_secs.

    A random time budget is drawn between self.one_model_time and
    self.allowed_scaled_time * self.max_grid_runtime and handed to the grid search
    as max_runtime_secs.  After training, the total model-building time reported by
    pyunit_utils.find_grid_runtime is compared against that budget.  Models do not
    check the clock continuously, so the budget may be exceeded by a fraction of
    self.allowed_diff before the run is considered a failure.

    The test is recorded as
        - passed when the time is within the allowed budget and no more than
          self.possible_number_models models were built;
        - passed when the budget was exceeded but exactly one model was built (the
          grid search always builds at least one model);
        - failed otherwise, by incrementing self.test_failed and flagging
          self.test_failed_array[self.test_num].

    self.test_num is advanced in every case.

    :return: None
    """
    test_name = "test3_glm_random_grid_search_max_runtime_secs"

    print("*******************************************************************************************")
    print("test3_glm_random_grid_search_max_runtime_secs for GLM " + self.family)
    h2o.cluster_info()

    # max_runtime_secs is the stopping criterion here, so it must not also be gridded over
    self.hyper_params.pop('max_runtime_secs', None)

    # number of possible models being built
    self.possible_number_models = pyunit_utils.count_models(self.hyper_params)

    # set up our stopping condition here
    max_run_time_secs = random.uniform(self.one_model_time, self.allowed_scaled_time*self.max_grid_runtime)
    # int() keeps the seed integral even where round() returns a float (Python 2)
    search_criteria = {'strategy': 'RandomDiscrete', 'max_runtime_secs': max_run_time_secs,
                       "seed": int(round(time.time()))}
    print("GLM Gaussian grid search_criteria: {0}".format(search_criteria))

    # fire off random grid-search
    grid_model = H2OGridSearch(H2OGeneralizedLinearEstimator(family=self.family, nfolds=self.nfolds),
                               hyper_params=self.hyper_params, search_criteria=search_criteria)
    grid_model.train(x=self.x_indices, y=self.y_index, training_frame=self.training1_data)

    actual_run_time_secs = pyunit_utils.find_grid_runtime(grid_model)
    models_built = len(grid_model)
    # the time budget plus the tolerated overshoot
    allowed_run_time_secs = max_run_time_secs*(1+self.allowed_diff)

    print("Maximum time limit is {0}. Time taken to build all model is "
          "{1}".format(search_criteria["max_runtime_secs"], actual_run_time_secs))
    print("Maximum model number is {0}. Actual number of models built is {1}".format(self.possible_number_models,
                                                                                     models_built))

    # decide the outcome first so that exactly one verdict is printed
    failure_message = None
    if actual_run_time_secs <= allowed_run_time_secs:
        if models_built > self.possible_number_models:
            # generated too many models, something is wrong
            failure_message = ("test3_glm_random_grid_search_max_runtime_secs: failed. Generated {0} models "
                               " which exceeds maximum possible model number {1}".format(models_built,
                                                                                         self.possible_number_models))
    elif models_built != 1:
        # over budget is only acceptable for the single model that is always built
        failure_message = ("test3_glm_random_grid_search_max_runtime_secs: failed. Model takes time {0}"
                           " seconds which exceeds allowed time {1}".format(actual_run_time_secs,
                                                                            allowed_run_time_secs))

    if failure_message is None:
        print(test_name + ": passed!")
    else:
        self.test_failed += 1
        self.test_failed_array[self.test_num] = 1
        print(failure_message)

    self.test_num += 1
    sys.stdout.flush()
def setup_model(self):
    """
    Set up the grid-search hyper-parameters that the tests use later on.

    Two hyper-parameter sets are produced from one bare-bones GLM:
        - self.hyper_params_bad: generated over a range widened by self.alpha_scale,
          so it may contain values outside the legal range (for instance alpha
          outside [0, 1]);
        - self.hyper_params: generated with a lower bound of 0 for both integer and
          real values, i.e. only good values.

    In both sets "lambda" is multiplied by self.lambda_scale and "max_runtime_secs"
    by self.time_scale times the base model's training time, when present.

    Attributes written besides the two sets:
        - self.min_runtime_per_epoch: base training time divided by the number of
          iterations (the training time itself when no iterations are reported);
        - self.gridable_parameters / self.gridable_types / self.gridable_defaults;
        - self.possible_number_models: model count of the bad set, reduced to the
          combinations whose alpha, lambda and max_runtime_secs values are legal,
          since illegal values do not result in any model being built;
        - self.true_correct_model_number: model count of the good set.

    Both sets are finally written out as JSON files.

    :return: None
    """
    # build bare bone model to get all parameters
    model = H2OGeneralizedLinearEstimator(family=self.family, nfolds=self.nfolds)
    model.train(x=self.x_indices, y=self.y_index, training_frame=self.training1_data)

    run_time = pyunit_utils.find_grid_runtime([model])  # find model train time
    print("Time taken to build a base barebone model is {0}".format(run_time))

    summary_list = model._model_json["output"]["model_summary"]
    num_iteration = summary_list.cell_values[0][summary_list.col_header.index('number_of_iterations')]

    # with zero iterations there is nothing to divide by; fall back to the whole run time
    if num_iteration == 0:
        self.min_runtime_per_epoch = run_time
    else:
        self.min_runtime_per_epoch = run_time/num_iteration

    # grab all gridable parameters and its type
    (self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
        pyunit_utils.get_gridables(model._model_json["parameters"])

    # randomly generate griddable parameters including values outside legal range, like setting
    # alpha values to be outside legal range of 0 and 1 and etc
    (self.hyper_params_bad, self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
        pyunit_utils.gen_grid_search(model.full_parameters.keys(), self.hyper_params_bad,
                                     self.exclude_parameter_lists,
                                     self.gridable_parameters, self.gridable_types, self.gridable_defaults,
                                     random.randint(1, self.max_int_number),
                                     self.max_int_val, self.min_int_val,
                                     random.randint(1, self.max_real_number),
                                     self.max_real_val*self.alpha_scale, self.min_real_val*self.alpha_scale)

    # scaling factors applied to the generated lambda and max_runtime_secs values
    time_scale = self.time_scale * run_time
    scale_factors = (("lambda", self.lambda_scale), ("max_runtime_secs", time_scale))

    for name, factor in scale_factors:
        if name in self.hyper_params_bad:
            self.hyper_params_bad[name] = [factor * x for x in self.hyper_params_bad[name]]

    self.possible_number_models = pyunit_utils.count_models(self.hyper_params_bad)

    # calculate true possible_number_models and exclude the bad parameters since they will not
    # result in any models being built.  A parameter that is absent from the grid, or has no
    # values, is skipped instead of raising KeyError / ZeroDivisionError.
    legal_value_checks = (
        ("alpha", lambda x: (x >= 0) and (x <= 1)),
        ("lambda", lambda x: x >= 0),
        ("max_runtime_secs", lambda x: x >= 0),
    )
    good_combinations = 1
    all_combinations = 1
    for name, is_legal in legal_value_checks:
        values = self.hyper_params_bad.get(name)
        if not values:
            continue
        good_combinations *= len([x for x in values if is_legal(x)])
        all_combinations *= len(values)

    self.possible_number_models = int(self.possible_number_models * good_combinations / all_combinations)

    # randomly generate griddable parameters with only good values
    (self.hyper_params, self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
        pyunit_utils.gen_grid_search(model.full_parameters.keys(), self.hyper_params,
                                     self.exclude_parameter_lists,
                                     self.gridable_parameters, self.gridable_types, self.gridable_defaults,
                                     random.randint(1, self.max_int_number), self.max_int_val, 0,
                                     random.randint(1, self.max_real_number), self.max_real_val, 0)

    self.true_correct_model_number = pyunit_utils.count_models(self.hyper_params)

    for name, factor in scale_factors:
        if name in self.hyper_params:
            self.hyper_params[name] = [factor * x for x in self.hyper_params[name]]

    # write out the hyper-parameters used into json files.
    pyunit_utils.write_hyper_parameters_json(self.current_dir, self.sandbox_dir, self.json_filename_bad,
                                             self.hyper_params_bad)
    pyunit_utils.write_hyper_parameters_json(self.current_dir, self.sandbox_dir, self.json_filename,
                                             self.hyper_params)
def setup_model(self):
    """
    Set up the grid-search hyper-parameters that the tests use later on.

    Two hyper-parameter sets are produced from one bare-bones GLM:
        - self.hyper_params_bad: generated over a range widened by self.alpha_scale,
          so it may contain values outside the legal range (for instance alpha
          outside [0, 1]);
        - self.hyper_params: generated with a lower bound of 0 for both integer and
          real values, i.e. only good values.

    In both sets "lambda" is multiplied by self.lambda_scale and "max_runtime_secs"
    by self.time_scale times the base model's training time, when present.

    Attributes written besides the two sets:
        - self.min_runtime_per_epoch: base training time divided by the number of
          iterations (the training time itself when no iterations are reported);
        - self.gridable_parameters / self.gridable_types / self.gridable_defaults;
        - self.possible_number_models: model count of the bad set, reduced to the
          combinations whose alpha, lambda and max_runtime_secs values are legal,
          since illegal values do not result in any model being built;
        - self.true_correct_model_number: model count of the good set.

    Both sets are finally written out as JSON files.

    :return: None
    """
    # build bare bone model to get all parameters
    model = H2OGeneralizedLinearEstimator(family=self.family)
    model.train(x=self.x_indices, y=self.y_index, training_frame=self.training1_data)

    run_time = pyunit_utils.find_grid_runtime([model])  # find model train time
    print("Time taken to build a base barebone model is {0}".format(run_time))

    summary_list = model._model_json["output"]["model_summary"]
    num_iteration = summary_list.cell_values[0][summary_list.col_header.index('number_of_iterations')]

    # with zero iterations there is nothing to divide by; fall back to the whole run time
    if num_iteration == 0:
        self.min_runtime_per_epoch = run_time
    else:
        self.min_runtime_per_epoch = run_time/num_iteration

    # grab all gridable parameters and its type
    (self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
        pyunit_utils.get_gridables(model._model_json["parameters"])

    # randomly generate griddable parameters including values outside legal range, like setting
    # alpha values to be outside legal range of 0 and 1 and etc
    (self.hyper_params_bad, self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
        pyunit_utils.gen_grid_search(model.full_parameters.keys(), self.hyper_params_bad,
                                     self.exclude_parameter_lists,
                                     self.gridable_parameters, self.gridable_types, self.gridable_defaults,
                                     random.randint(1, self.max_int_number),
                                     self.max_int_val, self.min_int_val,
                                     random.randint(1, self.max_real_number),
                                     self.max_real_val*self.alpha_scale, self.min_real_val*self.alpha_scale)

    # scaling factors applied to the generated lambda and max_runtime_secs values
    time_scale = self.time_scale * run_time
    scale_factors = (("lambda", self.lambda_scale), ("max_runtime_secs", time_scale))

    for name, factor in scale_factors:
        if name in self.hyper_params_bad:
            self.hyper_params_bad[name] = [factor * x for x in self.hyper_params_bad[name]]

    self.possible_number_models = pyunit_utils.count_models(self.hyper_params_bad)

    # calculate true possible_number_models and exclude the bad parameters since they will not
    # result in any models being built.  A parameter that is absent from the grid, or has no
    # values, is skipped instead of raising KeyError / ZeroDivisionError.
    legal_value_checks = (
        ("alpha", lambda x: (x >= 0) and (x <= 1)),
        ("lambda", lambda x: x >= 0),
        ("max_runtime_secs", lambda x: x >= 0),
    )
    good_combinations = 1
    all_combinations = 1
    for name, is_legal in legal_value_checks:
        values = self.hyper_params_bad.get(name)
        if not values:
            continue
        good_combinations *= len([x for x in values if is_legal(x)])
        all_combinations *= len(values)

    self.possible_number_models = int(self.possible_number_models * good_combinations / all_combinations)

    # randomly generate griddable parameters with only good values
    (self.hyper_params, self.gridable_parameters, self.gridable_types, self.gridable_defaults) = \
        pyunit_utils.gen_grid_search(model.full_parameters.keys(), self.hyper_params,
                                     self.exclude_parameter_lists,
                                     self.gridable_parameters, self.gridable_types, self.gridable_defaults,
                                     random.randint(1, self.max_int_number), self.max_int_val, 0,
                                     random.randint(1, self.max_real_number), self.max_real_val, 0)

    self.true_correct_model_number = pyunit_utils.count_models(self.hyper_params)

    for name, factor in scale_factors:
        if name in self.hyper_params:
            self.hyper_params[name] = [factor * x for x in self.hyper_params[name]]

    # write out the hyper-parameters used into json files.
    pyunit_utils.write_hyper_parameters_json(self.current_dir, self.sandbox_dir, self.json_filename_bad,
                                             self.hyper_params_bad)
    pyunit_utils.write_hyper_parameters_json(self.current_dir, self.sandbox_dir, self.json_filename,
                                             self.hyper_params)