def test_create_model_quantile_model(self):
    """Check that the XGB quantile model type yields a quantile regressor.

    The created model must implement the generic regressor interface, be the
    XGB quantile implementation, and expose the quantiles that were requested.
    """
    # NOTE(review): 0.5 appears twice in this tuple - confirm that is intended.
    requested_quantiles = (0.5, 0.2, 0.5)

    regressor = ModelCreator.create_model(
        MLModelType.XGB_QUANTILE, quantiles=requested_quantiles
    )

    for expected_class in (OpenstfRegressorInterface, XGBQuantileOpenstfRegressor):
        self.assertIsInstance(regressor, expected_class)
    self.assertEqual(regressor.quantiles, requested_quantiles)
def test_create_model_happy_flow(self):
    """Create every known model type, given both as string value and as enum.

    Each created model must implement the regressor interface and expose a
    ``can_predict_quantiles`` attribute, which is expected to be truthy only
    for the ``xgb_quantile`` type.
    """
    quantile_capable = ("xgb_quantile", MLModelType("xgb_quantile"))
    as_strings = [member.value for member in MLModelType]
    as_enums = list(MLModelType)

    for candidate in as_strings + as_enums:
        created = ModelCreator.create_model(candidate)
        self.assertIsInstance(created, OpenstfRegressorInterface)
        self.assertTrue(hasattr(created, "can_predict_quantiles"))

        # Pick the assertion matching the expectation for this model type.
        check = self.assertTrue if candidate in quantile_capable else self.assertFalse
        check(created.can_predict_quantiles)
def test_create_custom_model(self):
    """Load and instantiate a custom model addressed by its dotted path.

    ``DummyRegressor`` (looked up in this module) must be returned by
    ``load_custom_model`` and instantiated by ``ModelCreator``. A path to
    ``UnknownRegressor`` must raise ``AttributeError`` and a path to
    ``InvalidRegressor`` must raise ``ValueError``.
    """
    model_path = __name__ + ".DummyRegressor"

    # unittest assertions are used instead of bare ``assert`` so the checks
    # are not stripped under ``python -O`` and report both operands on failure.
    model_class = load_custom_model(model_path)
    self.assertEqual(model_class, DummyRegressor)

    model = ModelCreator().create_model(model_path)
    self.assertIsInstance(model, DummyRegressor)

    # Only the call expected to raise lives inside each context manager.
    # NOTE(review): presumably no ``UnknownRegressor`` exists in this module.
    unknown_path = __name__ + ".UnknownRegressor"
    with self.assertRaises(AttributeError):
        load_custom_model(unknown_path)

    # NOTE(review): presumably ``InvalidRegressor`` exists but is rejected
    # by the loader's validation - confirm against load_custom_model.
    invalid_path = __name__ + ".InvalidRegressor"
    with self.assertRaises(ValueError):
        load_custom_model(invalid_path)
def optuna_optimization(
    pj: PredictionJobDataClass,
    objective: RegressorObjective,
    validated_data_with_features: pd.DataFrame,
    n_trials: int,
) -> Tuple[optuna.study.Study, RegressorObjective]:
    """Perform hyperparameter optimization with optuna.

    Args:
        pj: Prediction job; ``pj["model"]`` selects the model to create and
            is also used as the name of the optuna study.
        objective: Objective for optuna. Its ``get_default_values()`` seeds
            the first trial, and it is called with the created model and the
            input data to build the objective that optuna evaluates.
        validated_data_with_features: Cleaned input dataframe.
        n_trials: Number of optuna trials.

    Returns:
        study (optuna.study.Study): Optimization study from optuna.
        objective: The objective object that was handed to optuna.
    """
    model_name = pj["model"]
    model = ModelCreator.create_model(model_name)

    median_pruner = optuna.pruners.MedianPruner(n_warmup_steps=5)
    study = optuna.create_study(
        study_name=model_name,
        pruner=median_pruner,
        direction="minimize",
    )

    # Queue the default parameter set as the first trial so the search result
    # is never worse than the defaults.
    default_values = objective.get_default_values()
    study.enqueue_trial(default_values)

    objective_instance = objective(model, validated_data_with_features)

    # Optuna drives the objective; the study is limited both by the number of
    # trials and by TIMEOUT.
    study.optimize(
        objective_instance,
        n_trials=n_trials,
        callbacks=[_log_study_progress_and_save_best_model],
        show_progress_bar=False,
        timeout=TIMEOUT,
    )

    return study, objective_instance
def test_call(self):
    """Optimizing the XGB objective on ``input_data`` must raise RuntimeError.

    NOTE(review): presumably the failure is due to ``input_data`` not having
    gone through feature engineering - confirm against the fixture.
    """
    model_name = "xgb"
    xgb_objective = XGBRegressorObjective(
        ModelCreator.create_model(model_name),
        input_data,
    )

    median_pruner = optuna.pruners.MedianPruner(n_warmup_steps=5)
    study = optuna.create_study(
        study_name=model_name,
        pruner=median_pruner,
        direction="minimize",
    )

    with self.assertRaises(RuntimeError):
        study.optimize(xgb_objective, n_trials=N_TRIALS)
def test_call(self):
    """Run an optuna study on the linear objective for ``N_TRIALS`` trials.

    Afterwards the objective must still be a ``LinearRegressorObjective`` and
    the study must contain exactly ``N_TRIALS`` trials.
    """
    model_name = "linear"
    linear_objective = LinearRegressorObjective(
        ModelCreator.create_model(model_name),
        input_data_with_features,
    )

    median_pruner = optuna.pruners.MedianPruner(n_warmup_steps=5)
    study = optuna.create_study(
        study_name=model_name,
        pruner=median_pruner,
        direction="minimize",
    )
    study.optimize(linear_objective, n_trials=N_TRIALS)

    self.assertIsInstance(linear_objective, LinearRegressorObjective)
    self.assertEqual(len(study.trials), N_TRIALS)
def test_call(self):
    """Run a single optuna trial on the ProLoaf objective.

    The reference training set is loaded and enriched with features for a
    24.0 horizon. After one trial the objective must still be a
    ``ProLoafRegressorObjective`` and the study must hold exactly one trial.
    """
    raw_data = TestData.load("reference_sets/307-train-data.csv")
    pj = {"model": "proloaf"}
    feature_applicator = TrainFeatureApplicator(horizons=[24.0])
    data_with_features = feature_applicator.add_features(raw_data, pj=pj)

    model_name = "proloaf"
    proloaf_objective = ProLoafRegressorObjective(
        ModelCreator.create_model(model_name),
        data_with_features,
    )

    median_pruner = optuna.pruners.MedianPruner(n_warmup_steps=5)
    study = optuna.create_study(
        study_name=model_name,
        pruner=median_pruner,
        direction="minimize",
    )
    study.optimize(proloaf_objective, n_trials=1)

    self.assertIsInstance(proloaf_objective, ProLoafRegressorObjective)
    self.assertEqual(len(study.trials), 1)
def train_pipeline_step_train_model(
    pj: PredictionJobDataClass,
    model_specs: ModelSpecificationDataClass,
    train_data: pd.DataFrame,
    validation_data: pd.DataFrame,
) -> OpenstfRegressor:
    """Train the model.

    Args:
        pj (PredictionJobDataClass): Prediction job; supplies the model type,
            the quantiles and the id used in error messages.
        model_specs (ModelSpecificationDataClass): Dataclass containing model
            specifications; its ``hyper_params`` are applied to the model.
        train_data (pd.DataFrame): The training data, with "load" as first
            column and "horizon" as last column.
        validation_data (pd.DataFrame): The validation data, used as second
            evaluation set during fitting and for the standard deviation step.
            It is sliced with the same column layout as ``train_data``.

    Returns:
        trained_model (OpenstfRegressor): The trained model

    Raises:
        InputDataWrongColumnOrderError: If the first column of ``train_data``
            is not "load" or its last column is not "horizon".
    """
    # Test if first column is "load" and last column is "horizon"
    if train_data.columns[0] != "load" or train_data.columns[-1] != "horizon":
        raise InputDataWrongColumnOrderError(
            f"Wrong column order for {pj['id']} "
            "'load' column should be first and 'horizon' column last."
        )

    # Create relevant model
    model = ModelCreator.create_model(
        pj["model"],
        quantiles=pj["quantiles"],
    )

    # Split x and y data: target is the first column, features are everything
    # between the first and the last column.
    train_x, train_y = train_data.iloc[:, 1:-1], train_data.iloc[:, 0]
    validation_x, validation_y = (
        validation_data.iloc[:, 1:-1],
        validation_data.iloc[:, 0],
    )

    # Configure evals for early stopping
    eval_set = [(train_x, train_y), (validation_x, validation_y)]

    # Set relevant hyperparameters.
    # Protected hyperparams are derived from the prediction job and must not
    # be overridden by the stored model specifications.
    protected_hyperparams = {"quantiles"}
    # Query the model's parameters once instead of once per hyperparameter.
    accepted_params = model.get_params()
    valid_hyper_parameters = {
        key: value
        for key, value in model_specs.hyper_params.items()
        if key in accepted_params and key not in protected_hyperparams
    }
    model.set_params(**valid_hyper_parameters)

    model.fit(
        train_x,
        train_y,
        eval_set=eval_set,
        early_stopping_rounds=DEFAULT_EARLY_STOPPING_ROUNDS,
        verbose=False,
    )

    # Gets the feature importance df or None if we don't have feature importance
    model.feature_importance_dataframe = model.set_feature_importance()

    logging.info("Fitted a new model, not yet stored")

    # Do confidence interval determination
    model = StandardDeviationGenerator(
        validation_data
    ).generate_standard_deviation_data(model)

    return model
def test_create_model_unknown_model(self):
    """An unrecognised model type must make the factory raise NotImplementedError."""
    unknown_type = "Unknown"
    with self.assertRaises(NotImplementedError):
        ModelCreator.create_model(unknown_type)