def setup_data(self):
    """
    This function performs all initializations necessary:
    1. generates all the random values for our dynamic tests like the Gaussian noise std,
       column count and row count for training/test data sets.
    2. generate the appropriate data sets.
    """
    # clean out the sandbox directory first
    self.sandbox_dir = pyunit_utils.make_Rsandbox_dir(self.current_dir, self.test_name, True)

    # DEBUGGING setup_data, remember to comment them out once done.
    # self.max_real_number = 5
    # self.max_int_number = 5
    # end DEBUGGING

    # preload data sets
    self.training1_data = h2o.import_file(path=pyunit_utils.locate(self.training1_filename))

    # set data set indices for predictors and response
    self.y_index = self.training1_data.ncol - 1
    self.x_indices = list(range(self.y_index))

    # save the training data files just in case the code crashed.
    pyunit_utils.remove_csv_files(self.current_dir, ".csv", action='copy',
                                  new_dir_path=self.sandbox_dir)

def tear_down(self):
    """
    This function performs teardown after the dynamic test is completed.  If all tests
    passed, it will delete all data sets generated since they can be quite large.  It
    will move the training/validation/test data sets into an Rsandbox directory so that
    we can re-run the failed test.
    """
    if self.test_failed:    # some tests have failed, need to save data sets for later re-runs
        # create Rsandbox directory to keep data sets and weight information
        self.sandbox_dir = pyunit_utils.make_Rsandbox_dir(self.current_dir, self.test_name, True)

        # do not want to save all data sets; only save data sets that are needed for failed tests
        pyunit_utils.move_files(self.sandbox_dir, self.training1_data_file, self.training1_filename)

        # write out the jenkins job info into log files.
        json_file = os.path.join(self.sandbox_dir, self.json_filename)

        # json.dump writes str, so the log file is opened in text mode
        with open(json_file, 'w') as test_file:
            json.dump(self.hyper_params, test_file)
    else:   # all tests have passed, delete sandbox if it was not wiped before
        pyunit_utils.make_Rsandbox_dir(self.current_dir, self.test_name, False)

        # remove any csv and json files left in test directory
        pyunit_utils.remove_csv_files(self.current_dir, ".csv")
        pyunit_utils.remove_csv_files(self.current_dir, ".json")

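# Illustrative sketch only, not taken from the tests above: one plausible way a
# pyunit driver pairs setup_data() with tear_down().  The class name
# GridSearchTest and the method test_grid_search() are hypothetical placeholders.
def run_dynamic_test():
    test = GridSearchTest()      # hypothetical test class defining the methods above
    test.setup_data()            # build the sandbox and load/generate the data sets
    try:
        test.test_grid_search()  # hypothetical test body; sets test.test_failed on failure
    finally:
        test.tear_down()         # save artifacts on failure, clean up on success
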
def setup_data(self):
    """
    This function performs all initializations necessary:
    1. generates all the random parameter values for our dynamic tests like the Gaussian
       noise std, column count and row count for training/test data sets.
    2. with the chosen distribution family, generate the appropriate data sets
    3. load the data sets and set the training set indices and response column index
    """
    # create and clean out the sandbox directory first
    self.sandbox_dir = pyunit_utils.make_Rsandbox_dir(self.current_dir, self.test_name, True)

    # preload data sets
    self.training1_data = h2o.import_file(path=pyunit_utils.locate(self.training1_filename))

    # set data set indices for predictors and response
    self.y_index = self.training1_data.ncol - 1
    self.x_indices = list(range(self.y_index))

    # set response to be categorical for classification tasks
    self.training1_data[self.y_index] = self.training1_data[self.y_index].round().asfactor()

    # save the training data files just in case the code crashed.
    pyunit_utils.remove_csv_files(self.current_dir, ".csv", action='copy',
                                  new_dir_path=self.sandbox_dir)

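# Minimal, self-contained sketch of the response-conversion idiom used above,
# assuming a running H2O cluster.  The CSV path is a hypothetical placeholder.
import h2o

h2o.init()
frame = h2o.import_file("training1_set.csv")        # hypothetical data file
y_index = frame.ncol - 1                            # response is the last column
frame[y_index] = frame[y_index].round().asfactor()  # numeric -> categorical response
print(frame[y_index].nlevels()[0])                  # number of response classes found
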
def setup_data(self):
    """
    This function performs all initializations necessary:
    load the data sets and set the training set indices and response column index
    """
    # create and clean out the sandbox directory first
    self.sandbox_dir = pyunit_utils.make_Rsandbox_dir(self.current_dir, self.test_name, True)

    # preload data sets
    self.training1_data = h2o.import_file(path=pyunit_utils.locate(self.training1_filename))

    # set data set indices for predictors and response
    self.y_index = self.training1_data.ncol - 1
    self.x_indices = list(range(self.y_index))

    # set response to be categorical for classification tasks
    self.training1_data[self.y_index] = self.training1_data[self.y_index].round().asfactor()

    # save the training data files just in case the code crashed.
    pyunit_utils.remove_csv_files(self.current_dir, ".csv", action='copy',
                                  new_dir_path=self.sandbox_dir)

def setup_data(self):
    """
    This function performs all initializations necessary:
    1. generates all the random parameter values for our dynamic tests like the Gaussian
       noise std, column count and row count for training/test data sets.
    2. randomly choose the distribution family (gaussian, binomial, multinomial) to test.
    3. with the chosen distribution family, generate the appropriate data sets
    4. load the data sets and set the training set indices and response column index
    """
    # create and clean out the sandbox directory first
    self.sandbox_dir = pyunit_utils.make_Rsandbox_dir(self.current_dir, self.test_name, True)

    # DEBUGGING setup_data, remember to comment them out once done.
    # self.max_real_number = 1
    # self.max_int_number = 1
    # end DEBUGGING

    # This is used to generate dataset for regression or classification.  Nothing to do
    # with setting the distribution family in this case.

    # preload data sets
    self.training1_data = h2o.import_file(path=pyunit_utils.locate(self.training1_filename))

    # set data set indices for predictors and response
    self.y_index = self.training1_data.ncol - 1
    self.x_indices = list(range(self.y_index))

    # save the training data files just in case the code crashed.
    pyunit_utils.remove_csv_files(self.current_dir, ".csv", action='copy',
                                  new_dir_path=self.sandbox_dir)

def setup_data(self):
    """
    This function performs all initializations necessary:
    load the data sets and set the training set indices and response column index
    """
    # create and clean out the sandbox directory first
    self.sandbox_dir = pyunit_utils.make_Rsandbox_dir(self.current_dir, self.test_name, True)

    # randomly choose which family of GBM algo to use
    self.family = self.families[random.randint(0, len(self.families) - 1)]

    # preload data sets, set x_indices, y_index and change response to factor for classification
    if 'multinomial' in self.family:
        self.training_metric = 'logloss'
        self.training1_data = h2o.import_file(path=pyunit_utils.locate(self.training1_filenames[1]))
        self.y_index = self.training1_data.ncol - 1
        self.x_indices = list(range(self.y_index))
        self.training1_data[self.y_index] = self.training1_data[self.y_index].round().asfactor()
        self.scale_model = 1
    else:
        self.training1_data = h2o.import_file(path=pyunit_utils.locate(self.training1_filenames[0]))
        self.y_index = self.training1_data.ncol - 1
        self.x_indices = list(range(self.y_index))
        self.scale_model = 0.75

    # save the training data files just in case the code crashed.
    pyunit_utils.remove_csv_files(self.current_dir, ".csv", action='copy',
                                  new_dir_path=self.sandbox_dir)

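# Illustrative sketch only: how the pieces prepared above (family, x_indices,
# y_index, training1_data) typically feed an actual grid search.  Assumes a
# running H2O cluster; the CSV path and hyper-parameter grid are hypothetical.
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.grid.grid_search import H2OGridSearch

h2o.init()
training1_data = h2o.import_file("multinomial_training1_set.csv")   # hypothetical file
y_index = training1_data.ncol - 1
x_indices = list(range(y_index))
training1_data[y_index] = training1_data[y_index].round().asfactor()

hyper_params = {"ntrees": [5, 10], "max_depth": [2, 3]}             # hypothetical grid
grid = H2OGridSearch(H2OGradientBoostingEstimator, hyper_params=hyper_params)
grid.train(x=x_indices, y=y_index, training_frame=training1_data)
print(grid.get_grid(sort_by="logloss", decreasing=False))           # matches training_metric above
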
def setup_data(self):
    """
    This function performs all initializations necessary:
    load the data sets and set the training set indices
    """
    # create and clean out the sandbox directory first
    self.sandbox_dir = pyunit_utils.make_Rsandbox_dir(self.current_dir, self.test_name, True)

    self.training1_data = h2o.import_file(path=pyunit_utils.locate(self.training1_filenames))
    self.x_indices = list(range(self.training1_data.ncol))

    # save the training data files just in case the code crashed.
    pyunit_utils.remove_csv_files(self.current_dir, ".csv", action='copy',
                                  new_dir_path=self.sandbox_dir)

def setup_data(self):
    """
    This function performs all initializations necessary:
    1. generates all the random parameter values for our dynamic tests like the Gaussian
       noise std, column count and row count for training/test data sets.
    2. randomly choose the distribution family (gaussian, binomial, multinomial) to test.
    3. with the chosen distribution family, generate the appropriate data sets
    4. load the data sets and set the training set indices and response column index
    """
    # create and clean out the sandbox directory first
    self.sandbox_dir = pyunit_utils.make_Rsandbox_dir(self.current_dir, self.test_name, True)

    # DEBUGGING setup_data, remember to comment them out once done.
    # self.max_real_number = 1
    # self.max_int_number = 1
    # end DEBUGGING

    # This is used to generate dataset for regression or classification.  Nothing to do
    # with setting the distribution family in this case.

    # randomly choose which family of GBM algo to use
    self.family = self.families[random.randint(0, len(self.families) - 1)]

    # load the data set matching the chosen family
    if 'multinomial' in self.family:
        self.training_metric = 'logloss'
        self.training1_data = h2o.import_file(path=pyunit_utils.locate(self.training1_filenames[1]))
        self.y_index = self.training1_data.ncol - 1
        self.x_indices = list(range(self.y_index))
        self.training1_data[self.y_index] = self.training1_data[self.y_index].round().asfactor()
        self.scale_model = 1
    else:
        # preload data sets
        self.training1_data = h2o.import_file(path=pyunit_utils.locate(self.training1_filenames[0]))

        # set data set indices for predictors and response
        self.y_index = self.training1_data.ncol - 1
        self.x_indices = list(range(self.y_index))
        self.scale_model = 0.75

    # save the training data files just in case the code crashed.
    pyunit_utils.remove_csv_files(self.current_dir, ".csv", action='copy',
                                  new_dir_path=self.sandbox_dir)

def setup_data(self):
    """
    This function performs all initializations necessary:
    1. Randomly choose which distribution family to use
    2. load the correct data sets and set the training set indices and response column index
    """
    # create and clean out the sandbox directory first
    self.sandbox_dir = pyunit_utils.make_Rsandbox_dir(self.current_dir, self.test_name, True)

    # randomly choose which family of GLM algo to use
    self.family = self.families[random.randint(0, len(self.families) - 1)]

    # load the data sets matching the chosen family
    if 'binomial' in self.family:
        self.training1_data = h2o.import_file(path=pyunit_utils.locate(self.training1_filename[1]))
        self.training2_data = h2o.import_file(path=pyunit_utils.locate(self.training2_filename[1]))
    elif 'multinomial' in self.family:
        self.training1_data = h2o.import_file(path=pyunit_utils.locate(self.training1_filename[2]))
        self.training2_data = h2o.import_file(path=pyunit_utils.locate(self.training2_filename[2]))
    else:
        self.training1_data = h2o.import_file(path=pyunit_utils.locate(self.training1_filename[0]))
        self.training2_data = h2o.import_file(path=pyunit_utils.locate(self.training2_filename[0]))
        self.scale_model = 0.75
        self.hyper_params["fold_assignment"] = ['AUTO', 'Random', 'Modulo']

    # set data set indices for predictors and response
    self.y_index = self.training1_data.ncol - 1
    self.x_indices = list(range(self.y_index))

    # set response to be categorical for classification tasks
    if ('binomial' in self.family) or ('multinomial' in self.family):
        self.training1_data[self.y_index] = self.training1_data[self.y_index].round().asfactor()
        self.training2_data[self.y_index] = self.training2_data[self.y_index].round().asfactor()

    # save the training data files just in case the code crashed.
    pyunit_utils.remove_csv_files(self.current_dir, ".csv", action='copy',
                                  new_dir_path=self.sandbox_dir)

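# Illustrative note, not taken from the snippets above: the indexed
# training1_filename/training2_filename lists used by the family-aware
# setup_data() variants are assumed to line up as 0 = gaussian, 1 = binomial,
# 2 = multinomial.  The file names below are hypothetical placeholders.
import random

families = ['gaussian', 'binomial', 'multinomial']
training1_filename = ['gridsearch_gaussian_training1.csv',      # index 0: regression
                      'gridsearch_binomial_training1.csv',      # index 1: two-class
                      'gridsearch_multinomial_training1.csv']   # index 2: multi-class

# random.choice is equivalent to families[random.randint(0, len(families) - 1)]
family = random.choice(families)
data_file = training1_filename[families.index(family)]
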
def setup_data(self):
    """
    This function performs all initializations necessary:
    1. generates all the random parameter values for our dynamic tests like the Gaussian
       noise std, column count and row count for training/test data sets.
    2. randomly choose the distribution family (gaussian, binomial, multinomial) to test.
    3. with the chosen distribution family, generate the appropriate data sets
    4. load the data sets and set the training set indices and response column index
    """
    # create and clean out the sandbox directory first
    self.sandbox_dir = pyunit_utils.make_Rsandbox_dir(self.current_dir, self.test_name, True)

    # randomly determine data set size in terms of column and row counts
    # DEBUGGING setup_data, remember to comment them out once done.
    # self.max_real_number = 3
    # self.max_int_number = 3
    # end DEBUGGING

    # randomly choose which family of GLM algo to use
    self.family = self.families[random.randint(0, len(self.families) - 1)]
    # DEBUGGING: pinning the family defeats the random choice above; keep this
    # override commented out unless a fixed family is wanted.
    # self.family = 'gaussian'

    # load the data sets matching the chosen family
    if 'binomial' in self.family:
        self.training1_data = h2o.import_file(path=pyunit_utils.locate(self.training1_filename[1]))
        self.training2_data = h2o.import_file(path=pyunit_utils.locate(self.training2_filename[1]))
    elif 'multinomial' in self.family:
        self.training1_data = h2o.import_file(path=pyunit_utils.locate(self.training1_filename[2]))
        self.training2_data = h2o.import_file(path=pyunit_utils.locate(self.training2_filename[2]))
    else:
        self.training1_data = h2o.import_file(path=pyunit_utils.locate(self.training1_filename[0]))
        self.training2_data = h2o.import_file(path=pyunit_utils.locate(self.training2_filename[0]))

    # set data set indices for predictors and response
    self.y_index = self.training1_data.ncol - 1
    self.x_indices = list(range(self.y_index))

    # set response to be categorical for classification tasks
    if ('binomial' in self.family) or ('multinomial' in self.family):
        self.training1_data[self.y_index] = self.training1_data[self.y_index].round().asfactor()
        self.training2_data[self.y_index] = self.training2_data[self.y_index].round().asfactor()

    # save the training data files just in case the code crashed.
    pyunit_utils.remove_csv_files(self.current_dir, ".csv", action='copy',
                                  new_dir_path=self.sandbox_dir)

def setup_data(self):
    """
    This function performs all initializations necessary:
    load the data sets and set the training set indices and response column index
    """
    # clean out the sandbox directory first
    self.sandbox_dir = pyunit_utils.make_Rsandbox_dir(self.current_dir, self.test_name, True)

    # preload data sets
    self.training1_data = h2o.import_file(path=pyunit_utils.locate(self.training1_filename))

    # set data set indices for predictors and response
    self.y_index = self.training1_data.ncol - 1
    self.x_indices = list(range(self.y_index))

    # set response to be categorical for classification tasks
    self.training1_data[self.y_index] = self.training1_data[self.y_index].round().asfactor()

    # save the training data files just in case the code crashed.
    pyunit_utils.remove_csv_files(self.current_dir, ".csv", action='copy',
                                  new_dir_path=self.sandbox_dir)

def setup_data(self):
    """
    This function performs all initializations necessary:
    1. generates all the random parameter values for our dynamic tests like the Gaussian
       noise std, column count and row count for training/test data sets.
    2. randomly choose the distribution family (gaussian, binomial, multinomial) to test.
    3. with the chosen distribution family, generate the appropriate data sets
    4. load the data sets and set the training set indices and response column index
    """
    # create and clean out the sandbox directory first
    self.sandbox_dir = pyunit_utils.make_Rsandbox_dir(self.current_dir, self.test_name, True)

    # randomly set Gaussian noise standard deviation as a fraction of actual predictor standard deviation
    self.noise_std = random.uniform(0, math.sqrt(pow((self.max_p_value - self.min_p_value), 2) / 12))
    self.noise_var = self.noise_std * self.noise_std

    # randomly determine data set size in terms of column and row counts
    self.train_col_count = random.randint(1, self.max_col_count)
    self.train_row_count = round(self.train_col_count * random.uniform(self.min_col_count_ratio,
                                                                       self.max_col_count_ratio))

    # DEBUGGING setup_data, remember to comment them out once done.
    # self.train_col_count = 3
    # self.train_row_count = 200
    # self.max_real_number = 1
    # self.max_int_number = 1
    # end DEBUGGING

    # This is used to generate dataset for regression or classification.  Nothing to do
    # with setting the distribution family in this case.

    # randomly choose which family of GLM algo to use
    self.family = self.families[random.randint(0, len(self.families) - 1)]

    # set class number for classification
    if 'multinomial' in self.family:
        self.class_number = random.randint(2, self.max_class_number)   # randomly set number of classes K

    # generate real value weight vector and training/validation/test data sets for GLM
    pyunit_utils.write_syn_floating_point_dataset_glm(self.training1_data_file, self.training2_data_file,
                                                      self.training3_data_file, self.weight_data_file,
                                                      self.train_row_count, self.train_col_count, 2,
                                                      self.max_p_value, self.min_p_value, self.max_w_value,
                                                      self.min_w_value, self.noise_std, self.family,
                                                      self.train_row_count, self.train_row_count,
                                                      class_number=self.class_number,
                                                      class_method=['probability', 'probability', 'probability'])

    # preload data sets
    self.training1_data = h2o.import_file(pyunit_utils.locate(self.training1_data_file))
    self.training2_data = h2o.import_file(pyunit_utils.locate(self.training2_data_file))
    self.training3_data = h2o.import_file(pyunit_utils.locate(self.training3_data_file))

    # set data set indices for predictors and response
    self.y_index = self.training1_data.ncol - 1
    self.x_indices = list(range(self.y_index))

    # set response to be categorical for classification tasks
    if 'multinomial' in self.family:
        self.training1_data[self.y_index] = self.training1_data[self.y_index].round().asfactor()

        # check to make sure all response classes are represented, otherwise, quit
        if self.training1_data[self.y_index].nlevels()[0] < self.class_number:
            print("Response classes are not represented in training dataset.")
            sys.exit(0)

        self.training2_data[self.y_index] = self.training2_data[self.y_index].round().asfactor()
        self.training3_data[self.y_index] = self.training3_data[self.y_index].round().asfactor()

    # self.hyper_params["validation_frame"] = [self.training1_data.frame_id, self.training2_data.frame_id,
    #                                          self.training3_data.frame_id]

    # save the training data files just in case the code crashed.
    pyunit_utils.remove_csv_files(self.current_dir, ".csv", action='copy',
                                  new_dir_path=self.sandbox_dir)

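# Note on the noise bound used above: a value distributed uniformly on
# [min_p, max_p] has variance (max_p - min_p)**2 / 12, so the upper limit
# math.sqrt(pow((max_p - min_p), 2) / 12) is exactly the predictors' standard
# deviation.  Drawing noise_std from [0, that bound] keeps the Gaussian noise
# no larger than the predictor spread.  The numeric range below is a
# hypothetical example.
import math
import random

min_p, max_p = -1.0, 1.0                        # hypothetical predictor value range
predictor_std = math.sqrt((max_p - min_p) ** 2 / 12)
noise_std = random.uniform(0, predictor_std)    # noise std <= predictor std
noise_var = noise_std * noise_std
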
def setup_data(self):
    """
    This function performs all initializations necessary:
    1. generates all the random values for our dynamic tests like the Gaussian noise std,
       column count and row count for training/test data sets.
    2. generate the appropriate data sets.
    """
    # clean out the sandbox directory first
    self.sandbox_dir = pyunit_utils.make_Rsandbox_dir(self.current_dir, self.test_name, True)

    # randomly set Gaussian noise standard deviation as a fraction of actual predictor standard deviation
    self.noise_std = random.uniform(0, math.sqrt(pow((self.max_p_value - self.min_p_value), 2) / 12))
    self.noise_var = self.noise_std * self.noise_std

    # randomly determine data set size in terms of column and row counts
    self.train_col_count = random.randint(1, self.max_col_count)
    self.train_row_count = round(self.train_col_count * random.uniform(self.min_col_count_ratio,
                                                                       self.max_col_count_ratio))

    # DEBUGGING setup_data, remember to comment them out once done.
    # self.train_col_count = 3
    # self.train_row_count = 200
    # self.max_real_number = 5
    # self.max_int_number = 5
    # end DEBUGGING

    if 'gaussian' in self.family:   # increase data range
        self.max_p_value *= 50
        self.min_p_value *= 50
        self.max_w_value *= 50
        self.min_w_value *= 50

    # generate real value weight vector and training/validation/test data sets for GLM
    pyunit_utils.write_syn_floating_point_dataset_glm(self.training1_data_file, "", "",
                                                      self.weight_data_file, self.train_row_count,
                                                      self.train_col_count, self.data_type,
                                                      self.max_p_value, self.min_p_value,
                                                      self.max_w_value, self.min_w_value,
                                                      self.noise_std, self.family,
                                                      self.train_row_count, self.train_row_count,
                                                      class_number=self.class_number,
                                                      class_method=[self.class_method, self.class_method,
                                                                    self.test_class_method],
                                                      class_margin=[self.margin, self.margin,
                                                                    self.test_class_margin])

    # preload data sets
    self.training1_data = h2o.import_file(pyunit_utils.locate(self.training1_data_file))

    # set data set indices for predictors and response
    self.y_index = self.training1_data.ncol - 1
    self.x_indices = list(range(self.y_index))

    # set response to be categorical for classification tasks
    if 'binomial' in self.family:
        self.training1_data[self.y_index] = self.training1_data[self.y_index].round().asfactor()

        # check to make sure all response classes are represented, otherwise, quit
        if self.training1_data[self.y_index].nlevels()[0] < self.class_number:
            print("Response classes are not represented in training dataset.")
            sys.exit(0)

    # save the training data files just in case the code crashed.
    pyunit_utils.remove_csv_files(self.current_dir, ".csv", action='copy',
                                  new_dir_path=self.sandbox_dir)