def make_mode_holdout_iterative_fit(data, seed, configuration, num_run):
    global evaluator
    evaluator = HoldoutEvaluator(data, configuration,
                                 seed=seed,
                                 num_run=num_run,
                                 **_get_base_dict())
    evaluator.iterative_fit()
    # 15 is SIGTERM: once fitting is through, ignore further termination
    # requests so that finish_up() can write its outputs undisturbed.
    signal.signal(15, empty_signal_handler)
    evaluator.finish_up()

    backend = Backend(None, os.getcwd())
    if os.path.exists(backend.get_model_dir()):
        backend.save_model(evaluator.model, num_run, seed)
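# For context, a minimal sketch of the helpers the function above assumes.
# empty_signal_handler, _get_base_dict and the module-level evaluator are
# not defined in this file; the bodies below are plausible placeholders,
# not the actual implementations:

evaluator = None  # set by the make_mode_* functions, read on SIGTERM


def empty_signal_handler(signum, frame):
    # Swallow the signal so a kill from the runsolver cannot interrupt
    # finish_up() while predictions are being written.
    pass


def _get_base_dict():
    # Hypothetical: keyword arguments shared by all evaluator constructors.
    return {'with_predictions': True,
            'all_scoring_functions': False,
            'output_y_test': True}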
class AbstractEvaluator(object):
    __metaclass__ = abc.ABCMeta

    @abc.abstractmethod
    def __init__(self, Datamanager, output_dir,
                 configuration=None,
                 with_predictions=False,
                 all_scoring_functions=False,
                 seed=1,
                 output_y_test=False,
                 num_run=None):
        self.starttime = time.time()

        self.output_dir = output_dir
        self.configuration = configuration
        self.D = Datamanager

        self.X_valid = Datamanager.data.get('X_valid')
        self.X_test = Datamanager.data.get('X_test')

        self.metric = Datamanager.info['metric']
        self.task_type = Datamanager.info['task']
        self.seed = seed

        self.output_y_test = output_y_test
        self.with_predictions = with_predictions
        self.all_scoring_functions = all_scoring_functions

        if self.task_type in REGRESSION_TASKS:
            if self.configuration is None:
                self.model_class = MyDummyRegressor
            else:
                self.model_class = \
                    autosklearn.pipeline.regression.SimpleRegressionPipeline
            self.predict_function = self._predict_regression
        else:
            if self.configuration is None:
                self.model_class = MyDummyClassifier
            else:
                self.model_class = \
                    autosklearn.pipeline.classification.SimpleClassificationPipeline
            self.predict_function = self._predict_proba

        if num_run is None:
            num_run = get_new_run_num()
        self.num_run = num_run

        self.backend = Backend(None, self.output_dir)
        self.model = self.model_class(self.configuration, self.seed)

    def fit_predict_and_loss(self):
        """Fit model(s) according to the resampling strategy, predict for
        the validation set and return the loss and the predictions on the
        validation set.

        Provides a closed interface in which all steps of the target
        algorithm are performed without any communication with other
        processes. Useful for cross-validation because it allows training a
        model, predicting for the validation set and then forgetting the
        model in order to save main memory.
        """
        raise NotImplementedError()

    def iterative_fit(self):
        """Fit a model iteratively.

        Fitting can be interrupted in order to use a partially trained
        model."""
        raise NotImplementedError()

    def predict_and_loss(self):
        """Use the current model to predict on the validation set and
        calculate the loss. Should be used when using iterative fitting."""
        raise NotImplementedError()

    def predict(self):
        """Use the current model to predict on the validation set.

        Should only be used to create dummy predictions."""
        raise NotImplementedError()

    def _loss(self, y_true, y_hat):
        if self.configuration is None:
            if self.all_scoring_functions:
                return {self.metric: 1.0}
            else:
                return 1.0

        score = calculate_score(
            y_true, y_hat, self.task_type, self.metric,
            self.D.info['label_num'],
            all_scoring_functions=self.all_scoring_functions)

        if hasattr(score, '__len__'):
            err = {key: 1 - score[key] for key in score}
        else:
            err = 1 - score

        return err

    def finish_up(self, loss=None, opt_pred=None, valid_pred=None,
                  test_pred=None):
        """This function does everything necessary after fitting is done:

        * predicting
        * saving the files for the ensembles_statistics
        * generating output for SMAC

        We use it as the signal handler so we can recycle the code for the
        normal use case and for when the runsolver kills us here :)"""
        try:
            self.duration = time.time() - self.starttime
            if loss is None:
                loss, opt_pred, valid_pred, test_pred = \
                    self.predict_and_loss()
            self.file_output(loss, opt_pred, valid_pred, test_pred)
            self.duration = time.time() - self.starttime

            num_run = str(self.num_run).zfill(5)
            if isinstance(loss, dict):
                loss_ = loss
                loss = loss_[self.D.info['metric']]
            else:
                loss_ = {}

            additional_run_info = ';'.join(
                ['%s: %s' % (METRIC_TO_STRING[metric]
                             if metric in METRIC_TO_STRING else metric,
                             value)
                 for metric, value in loss_.items()])
            additional_run_info += ';' + 'duration: ' + str(self.duration)
            additional_run_info += ';' + 'num_run:' + num_run

            if self.configuration is not None:
                self._output_SMAC_string(self.duration, loss, self.seed,
                                         additional_run_info)
        except Exception as e:
            self.duration = time.time() - self.starttime
            print(traceback.format_exc())
            self._output_SMAC_string(
                self.duration, 2.0, self.seed,
                'No results were produced! Error is %s' % str(e))

    def _output_SMAC_string(self, duration, loss, seed,
                            additional_run_info):
        print('Result for ParamILS: %s, %f, 1, %f, %d, %s' %
              ('SAT', abs(duration), loss, seed, additional_run_info))

    def file_output(self, loss, Y_optimization_pred, Y_valid_pred,
                    Y_test_pred):
        seed = os.environ.get('AUTOSKLEARN_SEED')

        if self.Y_optimization.shape[0] != Y_optimization_pred.shape[0]:
            return 2, "Targets %s and prediction %s don't have the same " \
                      "length. Probably training didn't finish" % (
                          self.Y_optimization.shape,
                          Y_optimization_pred.shape)

        num_run = str(self.num_run).zfill(5)

        if os.path.exists(self.backend.get_model_dir()):
            self.backend.save_model(self.model, self.num_run, seed)

        if self.output_y_test:
            try:
                os.makedirs(self.output_dir)
            except OSError:
                pass
            self.backend.save_targets_ensemble(self.Y_optimization)

        self.backend.save_predictions_as_npy(Y_optimization_pred,
                                             'ensemble', seed, num_run)

        if Y_valid_pred is not None:
            self.backend.save_predictions_as_npy(Y_valid_pred, 'valid',
                                                 seed, num_run)

        if Y_test_pred is not None:
            self.backend.save_predictions_as_npy(Y_test_pred, 'test',
                                                 seed, num_run)

    def _predict_proba(self, X, model, task_type, Y_train):
        Y_pred = model.predict_proba(X, batch_size=1000)
        Y_pred = self._ensure_prediction_array_sizes(Y_pred, Y_train)
        return Y_pred

    def _predict_regression(self, X, model, task_type, Y_train=None):
        Y_pred = model.predict(X)

        if len(Y_pred.shape) == 1:
            Y_pred = Y_pred.reshape((-1, 1))

        return Y_pred

    def _ensure_prediction_array_sizes(self, prediction, Y_train):
        num_classes = self.D.info['label_num']

        if self.task_type == MULTICLASS_CLASSIFICATION and \
                prediction.shape[1] < num_classes:
            if Y_train is None:
                raise ValueError('Y_train must not be None!')
            classes = list(np.unique(Y_train))

            mapping = dict()
            for class_number in range(num_classes):
                if class_number in classes:
                    index = classes.index(class_number)
                    mapping[index] = class_number
            new_predictions = np.zeros((prediction.shape[0], num_classes),
                                       dtype=np.float32)
            for index in mapping:
                class_index = mapping[index]
                new_predictions[:, class_index] = prediction[:, index]

            return new_predictions

        return prediction
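# The class-remapping logic in _ensure_prediction_array_sizes above is easy
# to check in isolation. A self-contained sketch (plain NumPy; remap is an
# illustrative name, equivalent to the mapping dict built above):

import numpy as np


def remap(prediction, y_train, num_classes):
    # prediction has one column per class seen in y_train; pad it out to
    # num_classes columns, with zero probability for classes that were
    # missing from the training split.
    classes = list(np.unique(y_train))
    new_predictions = np.zeros((prediction.shape[0], num_classes),
                               dtype=np.float32)
    for index, class_number in enumerate(classes):
        new_predictions[:, class_number] = prediction[:, index]
    return new_predictions


# The training split only contained classes 0 and 2 of a 3-class problem:
pred = np.array([[0.7, 0.3],
                 [0.2, 0.8]])
print(remap(pred, y_train=[0, 2], num_classes=3))
# [[0.7 0.  0.3]
#  [0.2 0.  0.8]]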
class AbstractEvaluator(object):
    __metaclass__ = abc.ABCMeta

    @abc.abstractmethod
    def __init__(self, Datamanager, configuration=None,
                 with_predictions=False,
                 all_scoring_functions=False,
                 seed=1,
                 output_dir=None,
                 output_y_test=False,
                 num_run=None):
        self.starttime = time.time()

        self.configuration = configuration
        self.D = Datamanager

        self.X_valid = Datamanager.data.get('X_valid')
        self.X_test = Datamanager.data.get('X_test')

        self.metric = Datamanager.info['metric']
        self.task_type = Datamanager.info['task']
        self.seed = seed

        if output_dir is None:
            self.output_dir = os.getcwd()
        else:
            self.output_dir = output_dir
        self.output_y_test = output_y_test
        self.with_predictions = with_predictions
        self.all_scoring_functions = all_scoring_functions

        if self.task_type in REGRESSION_TASKS:
            if self.configuration is None:
                self.model_class = MyDummyRegressor
            else:
                self.model_class = ParamSklearnRegressor
            self.predict_function = self.predict_regression
        else:
            if self.configuration is None:
                self.model_class = MyDummyClassifier
            else:
                self.model_class = ParamSklearnClassifier
            self.predict_function = self.predict_proba

        if num_run is None:
            num_run = get_new_run_num()
        self.num_run = num_run

        self.backend = Backend(None, self.output_dir)
        self.model = self.model_class(self.configuration, self.seed)

    @abc.abstractmethod
    def fit(self):
        pass

    @abc.abstractmethod
    def predict(self):
        pass

    # This function does everything necessary after fitting is done:
    # * predicting
    # * saving the files for the ensembles_statistics
    # * generating output for SMAC
    # We use it as the signal handler so we can recycle the code for the
    # normal use case and for when the runsolver kills us here :)
    def finish_up(self):
        try:
            self.duration = time.time() - self.starttime
            result, additional_run_info = self.file_output()

            if self.configuration is not None:
                print('Result for ParamILS: %s, %f, 1, %f, %d, %s' %
                      ('SAT', abs(self.duration), result, self.seed,
                       additional_run_info))
        except Exception as e:
            self.duration = time.time() - self.starttime
            print(traceback.format_exc())
            print('Result for ParamILS: %s, %f, 1, %f, %d, %s' %
                  ('TIMEOUT', abs(self.duration), 1.0, self.seed,
                   'No results were produced! Error is %s' % str(e)))

    def file_output(self):
        seed = os.environ.get('AUTOSKLEARN_SEED')
        errs, Y_optimization_pred, Y_valid_pred, Y_test_pred = self.predict()

        if self.Y_optimization.shape[0] != Y_optimization_pred.shape[0]:
            return 2, "Targets %s and prediction %s don't have the same " \
                      "length. Probably training didn't finish" % (
                          self.Y_optimization.shape,
                          Y_optimization_pred.shape)

        num_run = str(self.num_run).zfill(5)

        if os.path.exists(self.backend.get_model_dir()):
            self.backend.save_model(self.model, self.num_run, seed)

        if self.output_y_test:
            try:
                os.makedirs(self.output_dir)
            except OSError:
                pass
            self.backend.save_targets_ensemble(self.Y_optimization)

        self.backend.save_predictions_as_npy(Y_optimization_pred,
                                             'ensemble', seed, num_run)

        if Y_valid_pred is not None:
            self.backend.save_predictions_as_npy(Y_valid_pred, 'valid',
                                                 seed, num_run)

        if Y_test_pred is not None:
            self.backend.save_predictions_as_npy(Y_test_pred, 'test',
                                                 seed, num_run)

        self.duration = time.time() - self.starttime
        err = errs[self.D.info['metric']]
        additional_run_info = ';'.join(
            ['%s: %s' % (METRIC_TO_STRING[metric]
                         if metric in METRIC_TO_STRING else metric,
                         value)
             for metric, value in errs.items()])
        additional_run_info += ';' + 'duration: ' + str(self.duration)
        additional_run_info += ';' + 'num_run:' + num_run

        return err, additional_run_info

    def predict_proba(self, X, model, task_type, Y_train=None):
        Y_pred = model.predict_proba(X, batch_size=1000)

        if task_type == MULTILABEL_CLASSIFICATION:
            Y_pred = np.hstack([Y_pred[i][:, -1].reshape((-1, 1))
                                for i in range(len(Y_pred))])

        elif task_type == BINARY_CLASSIFICATION:
            if len(Y_pred.shape) != 1:
                Y_pred = Y_pred[:, 1].reshape(-1, 1)

        elif task_type == MULTICLASS_CLASSIFICATION:
            pass

        Y_pred = self._ensure_prediction_array_sizes(Y_pred, Y_train)
        return Y_pred

    def predict_regression(self, X, model, task_type, Y_train=None):
        Y_pred = model.predict(X)

        if len(Y_pred.shape) == 1:
            Y_pred = Y_pred.reshape((-1, 1))

        return Y_pred

    def _ensure_prediction_array_sizes(self, prediction, Y_train):
        num_classes = self.D.info['label_num']

        if self.task_type == MULTICLASS_CLASSIFICATION and \
                prediction.shape[1] < num_classes:
            classes = list(np.unique(self.D.data['Y_train']))
            if num_classes == prediction.shape[1]:
                return prediction

            if Y_train is not None:
                classes = list(np.unique(Y_train))

            mapping = dict()
            for class_number in range(num_classes):
                if class_number in classes:
                    index = classes.index(class_number)
                    mapping[index] = class_number
            new_predictions = np.zeros((prediction.shape[0], num_classes))
            for index in mapping:
                class_index = mapping[index]
                new_predictions[:, class_index] = prediction[:, index]

            return new_predictions

        return prediction
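# The task-dependent reshaping in predict_proba above mirrors what
# scikit-learn estimators return. A sketch of the two non-trivial cases
# (the probability values are made up for illustration):

import numpy as np

# Binary: predict_proba returns (n_samples, 2); only the positive-class
# column is kept, as an (n_samples, 1) array.
binary = np.array([[0.9, 0.1],
                   [0.4, 0.6]])
print(binary[:, 1].reshape(-1, 1))  # -> [[0.1], [0.6]]

# Multilabel: predict_proba returns one (n_samples, 2) array per label;
# the last column of each (probability of the label being active) is
# stacked into a single (n_samples, n_labels) array.
multilabel = [np.array([[0.9, 0.1], [0.4, 0.6]]),
              np.array([[0.2, 0.8], [0.7, 0.3]])]
print(np.hstack([multilabel[i][:, -1].reshape((-1, 1))
                 for i in range(len(multilabel))]))
# -> [[0.1, 0.8], [0.6, 0.3]]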
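# The 'Result for ParamILS: ...' line printed by finish_up appears to
# follow the SMAC/ParamILS target-algorithm wrapper convention: status,
# runtime, runlength, quality (the loss), seed, then free-form additional
# run info. A hedged sketch of how a caller might parse it (parse_result
# is illustrative, not part of auto-sklearn):

def parse_result(line):
    prefix = 'Result for ParamILS: '
    assert line.startswith(prefix)
    status, runtime, runlength, quality, seed, info = \
        line[len(prefix):].split(', ', 5)
    return {'status': status,
            'runtime': float(runtime),
            'runlength': int(runlength),
            'loss': float(quality),
            'seed': int(seed),
            'additional_run_info': info}


print(parse_result(
    'Result for ParamILS: SAT, 3.141593, 1, 0.250000, 1, duration: 3.14'))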