def _loss(
    self,
    y_true: np.ndarray,
    y_hat: np.ndarray,
    scoring_functions: Optional[List[Scorer]] = None
) -> Union[float, Dict[str, float]]:
    """Auto-sklearn follows a minimization goal.

    calculate_loss internally translates a score function into a
    minimization problem. For a dummy prediction, the worst result
    is assumed.

    Parameters
    ----------
    y_true : np.ndarray
        The ground-truth targets.
    y_hat : np.ndarray
        The predictions to score against y_true.
    scoring_functions : Optional[List[Scorer]]
        Additional metrics to compute; when given, a dict of losses
        keyed by metric name is returned instead of a single float.
    """
    scoring_functions = (
        self.scoring_functions
        if scoring_functions is None
        else scoring_functions
    )

    # A dummy configuration is not a Configuration object,
    # so score it with the worst possible result.
    if not isinstance(self.configuration, Configuration):
        if scoring_functions:
            return {self.metric.name: self.metric._worst_possible_result}
        else:
            return self.metric._worst_possible_result

    return calculate_loss(
        y_true, y_hat, self.task_type, self.metric,
        scoring_functions=scoring_functions)
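# --- Illustration (not auto-sklearn's actual implementation) ---
# A minimal sketch of the score-to-loss translation the docstring above
# describes, assuming the usual Scorer convention of an optimum value and
# a sign (+1 for maximized metrics, -1 for minimized ones). The helper
# name `toy_calculate_loss` is hypothetical.
import math

def toy_calculate_loss(score: float, optimum: float, sign: int) -> float:
    # Maximized 0-1 metric (accuracy: optimum=1, sign=+1) -> 1 - score;
    # minimized error (MSE: optimum=0, sign=-1) -> the error unchanged.
    return optimum - sign * score

assert math.isclose(toy_calculate_loss(0.90, optimum=1.0, sign=+1), 0.10)  # accuracy
assert math.isclose(toy_calculate_loss(0.25, optimum=0.0, sign=-1), 0.25)  # MSE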
def _slow(
    self,
    predictions: List[np.ndarray],
    labels: np.ndarray
) -> None:
    """Rich Caruana's ensemble selection method."""
    self.num_input_models_ = len(predictions)

    ensemble = []
    trajectory = []
    order = []

    ensemble_size = self.ensemble_size

    for i in range(ensemble_size):
        losses = np.zeros(
            [np.shape(predictions)[0]],
            dtype=np.float64,
        )
        for j, pred in enumerate(predictions):
            ensemble.append(pred)
            ensemble_prediction = np.mean(np.array(ensemble), axis=0)
            # calculate_loss is versatile and can return a dict of losses;
            # when scoring_functions=None, we know it will be a float
            losses[j] = cast(
                float,
                calculate_loss(
                    solution=labels,
                    prediction=ensemble_prediction,
                    task_type=self.task_type,
                    metric=self.metric,
                    scoring_functions=None
                )
            )
            ensemble.pop()
        best = np.nanargmin(losses)
        ensemble.append(predictions[best])
        trajectory.append(losses[best])
        order.append(best)

        # Handle special case
        if len(predictions) == 1:
            break

    self.indices_ = np.array(
        order,
        dtype=np.int64,
    )
    self.trajectory_ = np.array(
        trajectory,
        dtype=np.float64,
    )
    self.train_loss_ = trajectory[-1]
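# --- Illustration (hypothetical, self-contained) ---
# A standalone demo of the greedy loop above, swapping calculate_loss for
# a plain mean-squared-error so the snippet runs without auto-sklearn.
# The name `greedy_selection` is illustrative only.
import numpy as np

def greedy_selection(predictions, labels, ensemble_size=5):
    ensemble, order, trajectory = [], [], []
    for _ in range(ensemble_size):
        # loss of the candidate ensemble mean for every model, with replacement
        losses = np.array([
            np.mean((np.mean(ensemble + [p], axis=0) - labels) ** 2)
            for p in predictions
        ])
        best = int(np.nanargmin(losses))
        ensemble.append(predictions[best])
        order.append(best)
        trajectory.append(float(losses[best]))
    return order, trajectory

rng = np.random.default_rng(0)
labels = rng.random(10)
preds = [labels + rng.normal(0.0, 0.1, 10) for _ in range(4)]
order, trajectory = greedy_selection(preds, labels)
print(order, trajectory[-1])  # repeated indices act as integer weights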
def predict_and_loss(
    self, train: bool = False
) -> Tuple[Union[Dict[str, float], float], np.ndarray, Any, Any]:

    if train:
        Y_pred = self.predict_function(self.X_train, self.model,
                                       self.task_type, self.Y_train)
        err = calculate_loss(
            solution=self.Y_train,
            prediction=Y_pred,
            task_type=self.task_type,
            metric=self.metric,
            scoring_functions=self.scoring_functions)
    else:
        Y_pred = self.predict_function(self.X_test, self.model,
                                       self.task_type, self.Y_train)
        err = calculate_loss(
            solution=self.Y_test,
            prediction=Y_pred,
            task_type=self.task_type,
            metric=self.metric,
            scoring_functions=self.scoring_functions)

    return err, Y_pred, None, None
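# --- Illustration (hypothetical mock) ---
# The method returns a 4-tuple (loss, predictions, None, None); the two
# trailing Nones are presumably placeholders kept for compatibility with
# the wider evaluator interface. TinyEvaluator below only mimics that
# contract and is not part of auto-sklearn.
from typing import Any, Dict, Tuple, Union
import numpy as np

class TinyEvaluator:
    def __init__(self, y: np.ndarray) -> None:
        self.y = y

    def predict_and_loss(
        self, train: bool = False
    ) -> Tuple[Union[Dict[str, float], float], np.ndarray, Any, Any]:
        y_pred = self.y.copy()                  # pretend-perfect predictions
        err = float(np.mean(y_pred != self.y))  # 0-1 loss
        return err, y_pred, None, None

err, y_pred, _, _ = TinyEvaluator(np.array([0, 1, 1])).predict_and_loss()
assert err == 0.0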
def _loss(
    self,
    y_true: np.ndarray,
    y_hat: np.ndarray,
    idx: np.ndarray,
    scoring_functions: Optional[List[Scorer]] = None
) -> Union[float, Dict[str, float]]:
    """Auto-sklearn follows a minimization goal.

    calculate_loss internally translates a score function into a
    minimization problem. For a dummy prediction, the worst result
    is assumed.

    Parameters
    ----------
    y_true : np.ndarray
        The ground-truth targets.
    y_hat : np.ndarray
        The predictions to score against y_true.
    idx : np.ndarray
        Indices of the rows of self.X_train that y_true/y_hat
        correspond to; used to look up protected attributes.
    scoring_functions : Optional[List[Scorer]]
        Additional metrics to compute; when given, a dict of losses
        keyed by metric name is returned instead of a single float.
    """
    scoring_functions = (
        self.scoring_functions
        if scoring_functions is None
        else scoring_functions
    )

    # A dummy configuration is not a Configuration object,
    # so score it with the worst possible result.
    if not isinstance(self.configuration, Configuration):
        if scoring_functions:
            return {self.metric.name: self.metric._worst_possible_result}
        else:
            return self.metric._worst_possible_result

    # Handle protected attributes
    if self.metric.needs_prot:
        sensitive_features = self.X_train[idx, 0]
        metric = copy.copy(self.metric)
        metric._kwargs.update({'sensitive_features': sensitive_features})
    else:
        metric = self.metric

    return calculate_loss(
        y_true, y_hat, self.task_type, metric,
        scoring_functions=scoring_functions)
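# --- Illustration (hypothetical stand-in) ---
# The kwargs-injection pattern above, shown with a toy scorer. Note that
# copy.copy is shallow, so this sketch deep-copies to keep the shared
# scorer's _kwargs dict untouched between calls. ToyScorer and
# group_accuracy are illustrative names, not auto-sklearn APIs.
import copy
import numpy as np

class ToyScorer:
    def __init__(self, fn, **kwargs):
        self.fn = fn
        self._kwargs = kwargs

    def __call__(self, y_true, y_pred):
        return self.fn(y_true, y_pred, **self._kwargs)

def group_accuracy(y_true, y_pred, sensitive_features=None):
    # worst per-group accuracy: a stand-in fairness-aware metric
    groups = np.unique(sensitive_features)
    return min(
        np.mean(y_true[sensitive_features == g] == y_pred[sensitive_features == g])
        for g in groups
    )

base = ToyScorer(group_accuracy)
metric = copy.deepcopy(base)  # deep copy: _kwargs is otherwise shared
metric._kwargs.update({'sensitive_features': np.array([0, 0, 1, 1])})
print(metric(np.array([1, 0, 1, 0]), np.array([1, 0, 1, 1])))  # 0.5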
def _fast(
    self,
    predictions: List[np.ndarray],
    labels: np.ndarray,
) -> None:
    """Fast version of Rich Caruana's ensemble selection method."""
    self.num_input_models_ = len(predictions)

    ensemble = []  # type: List[np.ndarray]
    trajectory = []
    order = []

    ensemble_size = self.ensemble_size

    weighted_ensemble_prediction = np.zeros(
        predictions[0].shape,
        dtype=np.float64,
    )
    fant_ensemble_prediction = np.zeros(
        weighted_ensemble_prediction.shape,
        dtype=np.float64,
    )
    for i in range(ensemble_size):
        losses = np.zeros(
            (len(predictions)),
            dtype=np.float64,
        )
        s = len(ensemble)
        if s > 0:
            np.add(
                weighted_ensemble_prediction,
                ensemble[-1],
                out=weighted_ensemble_prediction,
            )

        # Memory-efficient averaging!
        for j, pred in enumerate(predictions):
            # fant_ensemble_prediction is the prediction of the candidate ensemble
            # and should be (sum(predictions[selected_prev_iterations]) + predictions[j]) / (s + 1).
            # We overwrite the contents of fant_ensemble_prediction directly with
            # weighted_ensemble_prediction + new_prediction and then scale to the average.
            np.add(
                weighted_ensemble_prediction,
                pred,
                out=fant_ensemble_prediction
            )
            np.multiply(
                fant_ensemble_prediction,
                (1. / float(s + 1)),
                out=fant_ensemble_prediction
            )

            # calculate_loss is versatile and can return a dict of losses;
            # when scoring_functions=None, we know it will be a float
            losses[j] = cast(
                float,
                calculate_loss(
                    solution=labels,
                    prediction=fant_ensemble_prediction,
                    task_type=self.task_type,
                    metric=self.metric,
                    scoring_functions=None
                )
            )

        all_best = np.argwhere(losses == np.nanmin(losses)).flatten()
        best = self.random_state.choice(all_best)

        ensemble.append(predictions[best])
        trajectory.append(losses[best])
        order.append(best)

        # Handle special case
        if len(predictions) == 1:
            break

    self.indices_ = order
    self.trajectory_ = trajectory
    self.train_loss_ = trajectory[-1]
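# --- Illustration (equivalence check) ---
# A quick, self-contained check that the running-sum trick above matches
# naively re-averaging the selected predictions plus one candidate.
import numpy as np

rng = np.random.default_rng(0)
preds = [rng.random(6) for _ in range(5)]
chosen = [preds[0], preds[3]]  # pretend these were selected in earlier rounds
candidate = preds[4]

# naive: rebuild the mean from scratch every time
naive = np.mean(np.array(chosen + [candidate]), axis=0)

# fast: keep a running sum of selected predictions, add the candidate, scale once
running_sum = np.sum(np.array(chosen), axis=0)
fast = np.empty_like(running_sum)
np.add(running_sum, candidate, out=fast)
np.multiply(fast, 1.0 / (len(chosen) + 1), out=fast)

assert np.allclose(naive, fast)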
def test_calculate_loss():
    # In a 0-1 ranged scorer, make sure that the loss
    # has an expected positive value
    y_pred = np.array([0, 1, 0, 1, 1, 1, 0, 0, 0, 0])
    y_true = np.array([0, 1, 0, 1, 1, 0, 0, 0, 0, 0])
    score = sklearn.metrics.accuracy_score(y_true, y_pred)
    assert pytest.approx(score) == calculate_score(
        solution=y_true,
        prediction=y_pred,
        task_type=BINARY_CLASSIFICATION,
        metric=autosklearn.metrics.accuracy,
    )

    loss = 1.0 - score
    assert pytest.approx(loss) == calculate_loss(
        solution=y_true,
        prediction=y_pred,
        task_type=BINARY_CLASSIFICATION,
        metric=autosklearn.metrics.accuracy,
    )

    # Test the dictionary case
    score_dict = calculate_score(
        solution=y_true,
        prediction=y_pred,
        task_type=BINARY_CLASSIFICATION,
        metric=autosklearn.metrics.accuracy,
        scoring_functions=[
            autosklearn.metrics.accuracy,
            autosklearn.metrics.balanced_accuracy,
        ],
    )
    expected_score_dict = {
        'accuracy': 0.9,
        'balanced_accuracy': 0.9285714285714286,
    }
    loss_dict = calculate_loss(
        solution=y_true,
        prediction=y_pred,
        task_type=BINARY_CLASSIFICATION,
        metric=autosklearn.metrics.accuracy,
        scoring_functions=[
            autosklearn.metrics.accuracy,
            autosklearn.metrics.balanced_accuracy,
        ],
    )
    for expected_metric, expected_score in expected_score_dict.items():
        assert pytest.approx(expected_score) == score_dict[expected_metric]
        assert pytest.approx(1 - expected_score) == loss_dict[expected_metric]

    # Lastly, make sure that metrics whose optimum is zero
    # are also handled properly
    y_true = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6])
    y_pred = np.array([0.11, 0.22, 0.33, 0.44, 0.55, 0.66])
    score = sklearn.metrics.mean_squared_error(y_true, y_pred)
    assert pytest.approx(score) == calculate_score(
        solution=y_true,
        prediction=y_pred,
        task_type=REGRESSION,
        metric=autosklearn.metrics.mean_squared_error,
    )

    loss = score
    assert pytest.approx(loss) == calculate_loss(
        solution=y_true,
        prediction=y_pred,
        task_type=REGRESSION,
        metric=autosklearn.metrics.mean_squared_error,
    )