def _fit_learner(self, verbose=False):
    def time_seconds(): return default_timer()

    # Create learner from configuration.
    learner = self.model(**self.configurations)
    # Train learner. Scikit-learn estimators take only (X, y); the remaining
    # learners also receive the metric and the verbosity flag.
    if self.__class__.__bases__[0] == EvaluatorSklearn:
        start_time = time_seconds()
        learner.fit(
            get_input_variables(self.training_set).values,
            get_target_variable(self.training_set).values)
        training_time = time_seconds() - start_time
    else:
        start_time = time_seconds()
        learner.fit(
            get_input_variables(self.training_set).values,
            get_target_variable(self.training_set).values,
            self.metric, verbose)
        training_time = time_seconds() - start_time
    # testing_value = self._calculate_value(learner, self.testing_set)
    return {
        'learner': learner,
        # 'testing_value': testing_value,
        'training_time': training_time
    }
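# Illustrative sketch only (not part of the original module): the same
# timing-and-dispatch pattern used by _fit_learner above, written as a
# standalone helper. The name fit_with_timing and the use of a `metric is None`
# check to distinguish scikit-learn-style learners are assumptions made for
# illustration; the evaluator itself dispatches on its base class instead.
from timeit import default_timer


def fit_with_timing(learner, X, y, metric=None, verbose=False):
    """Fit a learner and return it together with the elapsed training time."""
    start_time = default_timer()
    if metric is None:
        # scikit-learn-style signature: fit(X, y)
        learner.fit(X, y)
    else:
        # SLM-style signature: fit(X, y, metric, verbose)
        learner.fit(X, y, metric, verbose)
    return {'learner': learner, 'training_time': default_timer() - start_time}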
def test_fit(self):
    X = get_input_variables(self.training).as_matrix()
    y = get_target_variable(self.training).as_matrix()
    self.ftne.fit(X, y, RootMeanSquaredError, verbose=True)
    self.assertTrue(expr=self.ftne.champion)
    prediction = self.ftne.predict(get_input_variables(self.validation).as_matrix())
    self.assertEqual(len(prediction), len(get_target_variable(self.validation).as_matrix()))
def test_predict(self):
    print("testing predict()...")
    self.ensemble_learner.fit(get_input_variables(self.training).values,
                              get_target_variable(self.training).values,
                              RootMeanSquaredError, verbose=True)
    prediction = self.ensemble_learner.predict(get_input_variables(self.validation).values)
    self.assertTrue(expr=len(prediction) == len(get_target_variable(self.validation).values))
def _select_best_learner(self, time_limit=TIME_LIMIT_SECONDS, time_buffer=TIME_BUFFER, verbose=False):
    # Best learner found (lowest validation error).
    best_learner = None
    # Best validation value found so far.
    best_validation_value = float('-Inf') if self.metric.greater_is_better else float('Inf')
    # List of (configuration, validation value) pairs.
    validation_value_list = list()
    # Current time in seconds.
    time_seconds = lambda: default_timer()
    # Random order of configurations.
    shuffle(self.configurations)
    # Number of configurations run.
    number_of_runs = 0
    # Start of run.
    run_start = time_seconds()
    # Time left.
    time_left = lambda: time_limit - (time_seconds() - run_start)
    # Iterate through all configurations.
    for configuration in tqdm(self.configurations):
        # Create learner from configuration.
        learner = self.model(**configuration)
        # Train learner.
        if self.__class__.__bases__[0] == EvaluatorSklearn:
            learner.fit(
                get_input_variables(self.training_set).as_matrix(),
                get_target_variable(self.training_set).as_matrix())
        else:
            learner.fit(
                get_input_variables(self.training_set).as_matrix(),
                get_target_variable(self.training_set).as_matrix(),
                self.metric, verbose)
        # Calculate validation value.
        validation_value = self._calculate_value(learner, self.validation_set)
        # If the validation value beats the best one found so far, store the
        # learner and its validation value as the current best.
        if is_better(validation_value, best_validation_value, self.metric):
            best_learner = learner
            best_validation_value = validation_value
        # Add configuration and validation value to the validation value list.
        validation_value_list.append((configuration, validation_value))
        # Increase number of runs.
        number_of_runs += 1
        # Calculate time left.
        run_end = time_left()
        # Calculate time expected for next run (average duration of the runs so far).
        run_expected = (time_limit - run_end) / number_of_runs
        # If no time is left, or the next run is expected to exceed the remaining
        # time (with a relative safety buffer), stop the search.
        if run_end < 0 or run_end * (1 + time_buffer) < run_expected:
            break
    # When all configurations are tested (or the time budget is hit), return the best learner.
    return {
        'best_learner': best_learner,
        'validation_value_list': validation_value_list
    }
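# Illustrative sketch only: the stopping rule applied at the end of each
# iteration in _select_best_learner above, isolated as a pure function so the
# arithmetic is easier to follow. `time_left` is the remaining budget in
# seconds, `elapsed` the time spent so far, and `runs_done` the number of
# configurations already evaluated; the expected cost of the next run is the
# average cost of the runs completed so far.
def should_stop(time_left, elapsed, runs_done, time_buffer):
    """Return True when the configuration search should stop."""
    expected_next_run = elapsed / runs_done
    # Stop when the budget is exhausted or the next run is expected to
    # overshoot the remaining time, allowing a relative safety buffer.
    return time_left < 0 or time_left * (1 + time_buffer) < expected_next_run


# For example, with 10 seconds left, 90 seconds spent over 3 runs and a 10%
# buffer, the expected next run (30 s) exceeds 10 * 1.1 = 11 s, so the loop breaks.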
def test_predict(self):
    self.ensemble_learner.fit(
        get_input_variables(self.training).as_matrix(),
        get_target_variable(self.training).as_matrix(),
        RootMeanSquaredError, verbose=True)
    prediction = self.ensemble_learner.predict(
        get_input_variables(self.validation).as_matrix())
    self.assertTrue(expr=len(prediction) == len(
        get_target_variable(self.validation).as_matrix()))
def test_predict(self):
    print("testing predict()...")
    base_learner = SemanticLearningMachine(50, ErrorDeviationVariationCriterion(0.25), 2, 1, 10, Mutation2())
    ensemble_learner = EnsembleRandomIndependentWeighting(base_learner, 100, weight_range=2)
    X = get_input_variables(self.training).values
    y = get_target_variable(self.training).values

    def time_seconds(): return default_timer()
    start_time = time_seconds()
    ensemble_learner.fit(X, y, RootMeanSquaredError, verbose=False)
    print("time to train algorithm: ", (time_seconds() - start_time))
    start_time = time_seconds()
    prediction = ensemble_learner.predict(get_input_variables(self.validation).values)
    print("time to predict algorithm: ", (time_seconds() - start_time))
    self.assertTrue(expr=len(prediction) == len(get_target_variable(self.validation).values))
    print()
def pickup(self):
    print('Entering ensemble pickup for dataset:', self.data_set_name)
    outer_cv = 0
    outer_folds = self._get_outer_folds(outer_cv)
    for training_outer_index, testing_index in outer_folds.split(
            get_input_variables(self.samples).values,
            get_target_variable(self.samples).values):
        print('\tIndex of outer fold:', outer_cv)
        training_outer, testing = pd.DataFrame(
            self.samples.values[training_outer_index]), pd.DataFrame(
                self.samples.values[testing_index])
        algorithm = self.best_result[outer_cv]['best_overall_algorithm']
        configuration = self.best_result[outer_cv]['best_overall_configuration']
        self._run_ensembles(outer_cv, algorithm.get_corresponding_algo(),
                            configuration, training_outer, testing, self.metric)
        outer_cv += 1
    benchmark_to_pickle(self)
    print('Leaving ensemble pickup for dataset:', self.data_set_name)
def test_fit(self):
    self.ensemble_learner.fit(
        get_input_variables(self.training).as_matrix(),
        get_target_variable(self.training).as_matrix(),
        RootMeanSquaredError, verbose=True)
    self.assertTrue(expr=self.ensemble_learner.learners)
def _calculate_solution_value(self, solution, data_set, learner):
    X = get_input_variables(data_set).as_matrix()
    target = get_target_variable(data_set).as_matrix()
    neural_network = FeedForwardNetwork.create(solution, learner.configuration)
    prediction = self._predict_neural_network(neural_network, X)
    return self.metric.evaluate(prediction, target)
def test_benchmark_slm(self):
    print('Test BenchmarkSLM()...')
    algorithm = BenchmarkSLM(10, MaxGenerationsCriterion(10), 3, 0.01, 50, Mutation2())
    X = get_input_variables(self.training).as_matrix()
    y = get_target_variable(self.training).as_matrix()
    log = algorithm.fit(X, y, RootMeanSquaredError, verbose=True)
    self.assertTrue(expr=log)
    print()
def test_ols(self):
    print('OLS tests of fit()...')
    algorithm = SemanticLearningMachine(100, MaxGenerationsCriterion(200), 3, 'optimized', 50, Mutation2())
    X = get_input_variables(self.training).as_matrix()
    y = get_target_variable(self.training).as_matrix()
    algorithm.fit(X, y, RootMeanSquaredError, verbose=True)
    self.assertTrue(expr=algorithm.champion)
    print()
def test_benchmark_neat(self):
    print('Test BenchmarkNEAT()...')
    algorithm = BenchmarkNEAT(10, MaxGenerationsCriterion(10), 4, 1, 1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1)
    X = get_input_variables(self.training).as_matrix()
    y = get_target_variable(self.training).as_matrix()
    log = algorithm.fit(X, y, RootMeanSquaredError, verbose=True)
    self.assertTrue(expr=log)
    print()
def test_edv(self):
    print('EDV tests of fit()...')
    algorithm = SemanticLearningMachine(
        100, ErrorDeviationVariationCriterion(0.25), 3, 0.01, 50, Mutation2())
    X = get_input_variables(self.training).as_matrix()
    y = get_target_variable(self.training).as_matrix()
    algorithm.fit(X, y, RootMeanSquaredError, verbose=True)
    self.assertTrue(expr=algorithm.champion)
    print()
def test_ols(self):
    print('OLS tests of fit()...')

    def time_seconds(): return default_timer()
    algorithm = SemanticLearningMachine(100, MaxGenerationsCriterion(200), 3, 'optimized',
                                        50, Mutation2(), RootMeanSquaredError, True)
    X = get_input_variables(self.training).values
    y = get_target_variable(self.training).values
    start_time = time_seconds()
    algorithm.fit(X, y, RootMeanSquaredError, verbose=False)
    print("time to train algorithm: ", (time_seconds() - start_time))
    self.assertTrue(expr=algorithm.champion)
    print()
def test_slm_ols_wo_edv(self):
    print("testing fit() for SLM (OLS) without EDV ...")
    base_learner = SemanticLearningMachine(50, MaxGenerationsCriterion(20), 2, 'optimized', 10, Mutation2())
    ensemble_learner = EnsembleBoosting(base_learner, 100, meta_learner=median, learning_rate=1)
    X = get_input_variables(self.training).values
    y = get_target_variable(self.training).values

    def time_seconds(): return default_timer()
    start_time = time_seconds()
    ensemble_learner.fit(X, y, RootMeanSquaredError, verbose=False)
    print("time to train algorithm: ", (time_seconds() - start_time))
    self.assertTrue(expr=ensemble_learner.learners)
    print()
def test_tie(self):
    print('TIE tests of fit()...')

    def time_seconds(): return default_timer()
    algorithm = SemanticLearningMachine(100, TrainingImprovementEffectivenessCriterion(0.25),
                                        3, 0.01, 50, Mutation2(), RootMeanSquaredError, True)
    X = get_input_variables(self.training).values
    y = get_target_variable(self.training).values
    start_time = time_seconds()
    algorithm.fit(X, y, RootMeanSquaredError, verbose=False)
    print("time to train algorithm: ", (time_seconds() - start_time))
    self.assertTrue(expr=algorithm.champion)
    print()
def test_benchmark_sga(self):
    print('Test BenchmarkSGA()...')
    topology = create_network_from_topology([2, 2])
    algorithm = BenchmarkSGA(10, MaxGenerationsCriterion(10), topology,
                             SelectionOperatorTournament(5), MutationOperatorGaussian(0.1),
                             CrossoverOperatorArithmetic(), 0.01, 0.25)
    X = get_input_variables(self.training).as_matrix()
    y = get_target_variable(self.training).as_matrix()
    log = algorithm.fit(X, y, RootMeanSquaredError, verbose=True)
    self.assertTrue(expr=log)
    print()
def test_slm_fls(self):
    print("testing fit() for SLM (FLS) ...")
    base_learner = SemanticLearningMachine(50, MaxGenerationsCriterion(100), 2, 1, 10, Mutation2())
    ensemble_learner = EnsembleRandomIndependentWeighting(base_learner, 100, weight_range=1)
    X = get_input_variables(self.training).values
    y = get_target_variable(self.training).values

    def time_seconds(): return default_timer()
    start_time = time_seconds()
    ensemble_learner.fit(X, y, RootMeanSquaredError, verbose=False)
    print("time to train algorithm: ", (time_seconds() - start_time))
    self.assertTrue(expr=ensemble_learner.learners)
    print()
def test_fit(self):
    print("testing fit()...")
    self.ensemble_learner.fit(get_input_variables(self.training).values,
                              get_target_variable(self.training).values,
                              RootMeanSquaredError, verbose=True)
    self.assertTrue(expr=self.ensemble_learner.learners)
def _calculate_value(self, learner, data_set):
    prediction = learner.predict(get_input_variables(data_set).as_matrix())
    target = get_target_variable(data_set).as_matrix()
    return self.metric.evaluate(prediction, target)
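# Illustrative sketch only: how _calculate_value above is typically combined
# with is_better during model selection (see _select_best_learner). The
# comparison semantics assumed here (higher is better when
# metric.greater_is_better is set, lower otherwise) are inferred from how
# best_validation_value is initialised in this codebase, not copied from the
# actual is_better implementation.
def better_of(value_a, value_b, metric):
    """Return the preferable of two metric values under the given metric."""
    if metric.greater_is_better:
        return value_a if value_a > value_b else value_b
    return value_a if value_a < value_b else value_b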
def run_nested_cv(self):
    """ runs benchmark study on a nested cross-validation environment """
    print('Entering run_nested_cv for dataset:', self.data_set_name)
    outer_cv = 0
    outer_folds = self._get_outer_folds(outer_cv)
    for training_outer_index, testing_index in outer_folds.split(
            get_input_variables(self.samples).values,
            get_target_variable(self.samples).values):
        print('\tIndex of outer fold:', outer_cv)
        training_outer, testing = pd.DataFrame(
            self.samples.values[training_outer_index]), pd.DataFrame(
                self.samples.values[testing_index])
        if self.classification:
            best_overall_validation_value = float('-Inf')
        else:
            best_overall_validation_value = float(
                '-Inf') if self.metric.greater_is_better else float('Inf')
        for key in self.models.keys():
            print('\t\tAlgorithm with key:', key)
            if not self.results[key][outer_cv]:
                if self.classification:
                    best_validation_value = float('-Inf')
                else:
                    best_validation_value = float(
                        '-Inf') if self.metric.greater_is_better else float('Inf')
                validation_value_list = list()
                for configuration in range(self.models[key]['max_combinations']):
                    print('\t\t\tIndex of algorithm configuration:',
                          len(validation_value_list))
                    if len(self.models[key]['algorithms']) > 1:
                        option = randint(0, 2)
                        algorithm = self.models[key]['algorithms'][option]
                        config = self.models[key]['configuration_method'](option)
                    else:
                        algorithm = self.models[key]['algorithms'][0]
                        if (key == 'mlpc_sgd' or key == 'mlpc_adam'
                                or key == 'mlpr_sgd' or key == 'mlpr_adam'):
                            # version from 01-22
                            # config = self.models[key]['configuration_method'](self.get_data_set_size(training_outer))
                            # version from 01-25
                            batch_size = int(training_outer.shape[0] / _INNER_FOLDS)
                            # batch_size = int(training_outer.shape[0] / _INNER_FOLDS) * 2
                            config = self.models[key]['configuration_method'](batch_size)
                        else:
                            config = self.models[key]['configuration_method']()
                    inner_folds = self._get_inner_folds(outer_cv)
                    tmp_valid_training_values_list = list()
                    for training_inner_index, validation_index in inner_folds.split(
                            get_input_variables(training_outer).values,
                            get_target_variable(training_outer).values):
                        print('\t\t\t\tIndex of inner fold:',
                              len(tmp_valid_training_values_list))
                        training_inner, validation = pd.DataFrame(
                            training_outer.values[training_inner_index]), pd.DataFrame(
                                training_outer.values[validation_index])
                        results = self._evaluate_algorithm(
                            algorithm=algorithm,
                            configurations=config,
                            training_set=training_inner,
                            validation_set=None,
                            testing_set=validation,
                            metric=self.metric)
                        # print('results[testing_value] =', results['testing_value'], ', results[training_value] =', results['training_value'])
                        if self.classification:
                            tmp_valid_training_values_list.append(
                                (results['testing_accuracy'], results['training_accuracy']))
                        else:
                            tmp_valid_training_values_list.append(
                                (results['testing_value'], results['training_value']))
                    # Calculate average validation value and check if the current value is better than the best one.
                    average_validation_value = mean(tmp_valid_training_values_list, axis=0)[0]
                    average_training_value = mean(tmp_valid_training_values_list, axis=0)[1]
                    if self.classification:
                        if average_validation_value > best_validation_value:
                            best_algorithm = algorithm
                            best_key = key
                            best_configuration = config
                            best_validation_value = average_validation_value
                            best_training_value = average_training_value
                    else:
                        if is_better(average_validation_value, best_validation_value, self.metric):
                            best_algorithm = algorithm
                            best_key = key
                            best_configuration = config
                            best_validation_value = average_validation_value
                            best_training_value = average_training_value
                    # Add configuration and validation error to validation error list.
                    validation_value_list.append((configuration, average_validation_value))
                self.results[key][outer_cv] = self._evaluate_algorithm(
                    algorithm=best_algorithm,
                    configurations=best_configuration,
                    training_set=training_outer,
                    validation_set=None,
                    testing_set=testing,
                    metric=self.metric)
                self.results[key][outer_cv]['best_configuration'] = best_configuration
                self.results[key][outer_cv]['avg_inner_validation_error'] = best_validation_value
                self.results[key][outer_cv]['avg_inner_training_error'] = best_training_value
                if self.classification:
                    self.results[key][outer_cv]['avg_inner_validation_accuracy'] = best_validation_value
                    self.results[key][outer_cv]['avg_inner_training_accuracy'] = best_training_value
                # # Serialize benchmark
                # benchmark_to_pickle(self)
                if self.classification:
                    if best_validation_value > best_overall_validation_value:
                        best_overall_key = best_key
                        best_overall_algorithm = best_algorithm
                        best_overall_configuration = best_configuration
                        best_overall_validation_value = best_validation_value
                else:
                    if is_better(best_validation_value, best_overall_validation_value, self.metric):
                        best_overall_key = best_key
                        best_overall_algorithm = best_algorithm
                        best_overall_configuration = best_configuration
                        best_overall_validation_value = best_validation_value
        print('\tBest overall configuration found for outer fold with index', outer_cv)
        self.best_result[outer_cv] = self._evaluate_algorithm(
            algorithm=best_overall_algorithm,
            configurations=best_overall_configuration,
            training_set=training_outer,
            validation_set=None,
            testing_set=testing,
            metric=self.metric)
        self.best_result[outer_cv]['best_overall_algorithm'] = best_overall_algorithm
        self.best_result[outer_cv]['best_overall_configuration'] = best_overall_configuration
        self.best_result[outer_cv]['best_overall_key'] = best_overall_key
        if self.ensembles is not None:
            print('\tCreating ensembles')
            self._run_ensembles(outer_cv, best_overall_algorithm.get_corresponding_algo(),
                                best_overall_configuration, training_outer, testing, self.metric)
        else:
            print('\tNo ensembles to create')
        outer_cv += 1
    # Serialize benchmark
    benchmark_to_pickle(self)
    print('Leaving run_nested_cv for dataset:', self.data_set_name)
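# Illustrative sketch only: the overall shape of the nested cross-validation
# performed by run_nested_cv above, reduced to plain scikit-learn KFold objects
# and NumPy arrays. The outer loop reserves a test fold per iteration; the inner
# loop scores each candidate configuration on validation folds drawn from the
# outer training split, and only the best configuration is refit on the full
# outer training split and evaluated on the held-out test fold. Names such as
# `candidates`, `make_learner` and `score` (assumed higher-is-better here) are
# placeholders for this sketch, not identifiers from the benchmark code.
from sklearn.model_selection import KFold
import numpy as np


def nested_cv(X, y, candidates, make_learner, score, outer_folds=5, inner_folds=3):
    outer = KFold(n_splits=outer_folds, shuffle=True, random_state=0)
    results = []
    for train_idx, test_idx in outer.split(X):
        X_train, y_train = X[train_idx], y[train_idx]
        X_test, y_test = X[test_idx], y[test_idx]
        # Inner loop: pick the configuration with the best average validation score.
        inner = KFold(n_splits=inner_folds, shuffle=True, random_state=0)
        best_config, best_value = None, -np.inf
        for config in candidates:
            values = []
            for fit_idx, val_idx in inner.split(X_train):
                learner = make_learner(config)
                learner.fit(X_train[fit_idx], y_train[fit_idx])
                values.append(score(learner, X_train[val_idx], y_train[val_idx]))
            if np.mean(values) > best_value:
                best_config, best_value = config, np.mean(values)
        # Refit the winning configuration on the whole outer training split.
        learner = make_learner(best_config)
        learner.fit(X_train, y_train)
        results.append((best_config, score(learner, X_test, y_test)))
    return results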
def _calculate_network_value(self, network, data_set):
    predictions = network.predict(get_input_variables(data_set).as_matrix())
    target = get_target_variable(data_set).as_matrix()
    return self.metric.evaluate(predictions, target)
def run_nested_cv(self):
    """ runs benchmark study on a nested cross-validation environment """
    #=======================================================================
    # print('self.learning_metric =', self.learning_metric)
    # print('self.selection_metric =', self.selection_metric)
    #=======================================================================
    print('Entering run_nested_cv for dataset:', self.dataset_name)
    outer_cv = 0
    outer_folds = self._get_outer_folds(outer_cv)
    for training_outer_index, testing_index in outer_folds.split(
            get_input_variables(self.samples).values,
            get_target_variable(self.samples).values):
        print('\n\tIndex of outer fold:', outer_cv)
        training_outer, testing = pd.DataFrame(
            self.samples.values[training_outer_index]), pd.DataFrame(
                self.samples.values[testing_index])
        if self.classification:
            best_overall_validation_value = float('-Inf')
        else:
            best_overall_validation_value = float(
                '-Inf') if self.selection_metric.greater_is_better else float('Inf')
        for key in self.models.keys():
            print('\t\tAlgorithm with key:', key)
            if not self.results[key][outer_cv]:
                if self.classification:
                    best_validation_value = float('-Inf')
                else:
                    best_validation_value = float(
                        '-Inf') if self.selection_metric.greater_is_better else float('Inf')
                validation_value_list = list()
                for configuration in range(self.models[key]['max_combinations']):
                    print('\n\t\t\tIndex of algorithm configuration:',
                          len(validation_value_list))
                    if len(self.models[key]['algorithms']) > 1:
                        option = randint(0, 2)
                        algorithm = self.models[key]['algorithms'][option]
                        config = self.models[key]['configuration_method'](option)
                    else:
                        algorithm = self.models[key]['algorithms'][0]
                        #===================================================
                        # if (key == 'mlpc_sgd' or key == 'mlpc_adam' or key == 'mlpr_sgd' or key == 'mlpr_adam'):
                        #===================================================
                        if key.startswith('mlp'):
                            # version from 01-22
                            # config = self.models[key]['configuration_method'](self.get_dataset_size(training_outer))
                            # version from 01-25
                            batch_size = int(training_outer.shape[0] / _INNER_FOLDS)
                            # batch_size = int(training_outer.shape[0] / _INNER_FOLDS) * 2
                            config = self.models[key]['configuration_method'](batch_size)
                        else:
                            config = self.models[key]['configuration_method']()
                    inner_folds = self._get_inner_folds(outer_cv)
                    tmp_valid_training_values_list = list()
                    for training_inner_index, validation_index in inner_folds.split(
                            get_input_variables(training_outer).values,
                            get_target_variable(training_outer).values):
                        print('\t\t\t\tIndex of inner fold:',
                              len(tmp_valid_training_values_list))
                        training_inner, validation = pd.DataFrame(
                            training_outer.values[training_inner_index]), pd.DataFrame(
                                training_outer.values[validation_index])
                        results = self._evaluate_algorithm(
                            algorithm=algorithm,
                            configurations=config,
                            training_set=training_inner,
                            validation_set=None,
                            testing_set=validation,
                            metric=self.learning_metric)
                        # print('results[testing_value] =', results['testing_value'], ', results[training_value] =', results['training_value'])
                        if self.classification:
                            tmp_valid_training_values_list.append(
                                (results['testing_accuracy'], results['training_accuracy']))
                        else:
                            tmp_valid_training_values_list.append(
                                (results['testing_value'], results['training_value']))
                    # Calculate average validation value and check if the current value is better than the best one.
                    average_validation_value = mean(tmp_valid_training_values_list, axis=0)[0]
                    average_training_value = mean(tmp_valid_training_values_list, axis=0)[1]
                    if self.classification:
                        print("\t\t\tAverage AUROC training vs. validation: %.3f vs. %.3f"
                              % (average_training_value, average_validation_value))
                    else:
                        print("\t\t\tAverage RMSE training vs. validation: %.3f vs. %.3f"
                              % (average_training_value, average_validation_value))
                    if self.classification:
                        if average_validation_value > best_validation_value:
                            #===============================================
                            # print('\n\t\t\t\t\tClassification: %.3f is better than %.3f\n' % (average_validation_value, best_validation_value))
                            #===============================================
                            best_algorithm = algorithm
                            best_key = key
                            best_configuration = config
                            best_validation_value = average_validation_value
                            best_training_value = average_training_value
                        #===================================================
                        # else:
                        #     print('\n\t\t\t\t\tClassification: %.3f is worse (!) than %.3f\n' % (average_validation_value, best_validation_value))
                        #===================================================
                    else:
                        if is_better(average_validation_value, best_validation_value,
                                     self.selection_metric):
                            #===============================================
                            # print('\n\t\t\t\t\tRegression: %.3f is better than %.3f\n' % (average_validation_value, best_validation_value))
                            #===============================================
                            best_algorithm = algorithm
                            best_key = key
                            best_configuration = config
                            best_validation_value = average_validation_value
                            best_training_value = average_training_value
                        #===================================================
                        # else:
                        #     print('\n\t\t\t\t\tRegression: %.3f is worse (!) than %.3f\n' % (average_validation_value, best_validation_value))
                        #===================================================
                    # Add configuration and validation error to validation error list.
                    validation_value_list.append((configuration, average_validation_value))
                """ all allowed configurations assessed of a given variant/algorithm/method (key) """
                print('\n\t\tEvaluating best configuration in outer fold with index', outer_cv)
                self.results[key][outer_cv] = self._evaluate_algorithm(
                    algorithm=best_algorithm,
                    configurations=best_configuration,
                    training_set=training_outer,
                    validation_set=None,
                    testing_set=testing,
                    metric=self.learning_metric)
                self.results[key][outer_cv]['best_configuration'] = best_configuration
                self.results[key][outer_cv]['avg_inner_validation_error'] = best_validation_value
                self.results[key][outer_cv]['avg_inner_training_error'] = best_training_value
                if self.classification:
                    self.results[key][outer_cv]['avg_inner_validation_accuracy'] = best_validation_value
                    self.results[key][outer_cv]['avg_inner_training_accuracy'] = best_training_value
                if self.classification:
                    print("\n\t\tAUROC training vs. test: %.3f vs. %.3f"
                          % (self.results[key][outer_cv]['training_accuracy'],
                             self.results[key][outer_cv]['testing_accuracy']))
                    #=======================================================
                    # print("\n\t\tAlgorithm %s, AUROC training vs. test: %.3f vs. %.3f" % (key, self.results[key][outer_cv]['training_accuracy'], self.results[key][outer_cv]['testing_accuracy']))
                    #=======================================================
                else:
                    print("\n\t\tRMSE training vs. test: %.3f vs. %.3f"
                          % (self.results[key][outer_cv]['training_value'],
                             self.results[key][outer_cv]['testing_value']))
                    #=======================================================
                    # print("\n\t\tAlgorithm %s, RMSE training vs. test: %.3f vs. %.3f" % (key, self.results[key][outer_cv]['training_value'], self.results[key][outer_cv]['testing_value']))
                    #=======================================================
                best_overall_algorithm = best_algorithm
                best_overall_configuration = best_configuration
                best_overall_key = best_key
                self.best_result[outer_cv] = self.results[key][outer_cv]
                self.best_result[outer_cv]['best_overall_algorithm'] = best_overall_algorithm
                self.best_result[outer_cv]['best_overall_configuration'] = best_overall_configuration
                self.best_result[outer_cv]['best_overall_key'] = best_overall_key
                # # Serialize benchmark
                # benchmark_to_pickle(self)
        outer_cv += 1
    # Serialize benchmark
    benchmark_to_pickle(self)
    print('Leaving run_nested_cv for dataset:', self.dataset_name)
def _calculate_accuracy(self, learner, dataset):
    prediction = learner.predict(get_input_variables(dataset).values)
    target = get_target_variable(dataset).values
    return Accuracy.evaluate(prediction, target.astype(int))
def test_fit(self):
    X = get_input_variables(self.training).as_matrix()
    y = get_target_variable(self.training).as_matrix()
    self.neat.fit(X, y, Accuracy, verbose=True)
    self.assertTrue(expr=self.neat.champion)