    def test_create_model_quantile_model(self):
        # Test if quantile model is properly returned
        model_type = MLModelType.XGB_QUANTILE
        quantiles = tuple([0.5, 0.2, 0.5])
        # Create relevant model
        model = ModelCreator.create_model(model_type, quantiles=quantiles)

        self.assertIsInstance(model, OpenstfRegressorInterface)
        self.assertIsInstance(model, XGBQuantileOpenstfRegressor)
        self.assertEqual(model.quantiles, quantiles)
    def test_create_model_happy_flow(self):
        # Test happy flow (both str and enum model_type arguments)
        valid_types = [t.value for t in MLModelType] + [t for t in MLModelType]
        for model_type in valid_types:
            model = ModelCreator.create_model(model_type)
            self.assertIsInstance(model, OpenstfRegressorInterface)
            self.assertTrue(hasattr(model, "can_predict_quantiles"))
            if model_type in ["xgb_quantile", MLModelType("xgb_quantile")]:
                self.assertTrue(model.can_predict_quantiles)
            else:
                self.assertFalse(model.can_predict_quantiles)
    def test_create_custom_model(self):
        model_path = __name__ + ".DummyRegressor"
        model_class = load_custom_model(model_path)
        assert model_class == DummyRegressor

        model = ModelCreator().create_model(model_path)
        assert isinstance(model, DummyRegressor)

        with self.assertRaises(AttributeError):
            model_path = __name__ + ".UnknownRegressor"
            load_custom_model(model_path)

        with self.assertRaises(ValueError):
            model_path = __name__ + ".InvalidRegressor"
            load_custom_model(model_path)
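
The test above relies on load_custom_model resolving a dotted "module.ClassName" path to a class. A minimal sketch of such a loader, covering only the behavior the test asserts (the real openstef implementation may differ, and the validation step here is an assumption):

import importlib


def load_custom_model_sketch(model_path: str):
    """Resolve 'package.module.ClassName' to the class object."""
    module_name, class_name = model_path.rsplit(".", 1)
    module = importlib.import_module(module_name)
    # getattr raises AttributeError for an unknown class name,
    # matching the first assertRaises above.
    model_class = getattr(module, class_name)
    # Assumed validation step: reject classes that do not look like
    # regressors, matching the ValueError the second assertRaises expects.
    if not hasattr(model_class, "fit") or not hasattr(model_class, "predict"):
        raise ValueError(f"{model_path} is not a valid regressor class")
    return model_class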
def optuna_optimization(
    pj: PredictionJobDataClass,
    objective: Type[RegressorObjective],
    validated_data_with_features: pd.DataFrame,
    n_trials: int,
) -> Tuple[optuna.study.Study, RegressorObjective]:
    """Perform hyperparameter optimization with optuna

    Args:
        pj: Prediction job
        objective: Objective function for optuna
        validated_data_with_features: cleaned input dataframe
        n_trials: number of optuna trials

    Returns:
        model (OpenstfRegressor): Optimized model
        study (optuna.study.Study): Optimization study from optuna
        objective : The objective object used by optuna

    """
    model = ModelCreator.create_model(pj["model"])

    study = optuna.create_study(
        study_name=pj["model"],
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
        direction="minimize",
    )

    # Start by evaluating the default set of parameters,
    # so the optimization never gets worse than the default values
    study.enqueue_trial(objective.get_default_values())

    objective = objective(
        model,
        validated_data_with_features,
    )

    # Optuna updates the model itself during optimization;
    # once this finishes, the model is the optimized one
    study.optimize(
        objective,
        n_trials=n_trials,
        callbacks=[_log_study_progress_and_save_best_model],
        show_progress_bar=False,
        timeout=TIMEOUT,
    )

    return study, objective
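
Note that the objective argument is a class, not an instance: optuna_optimization instantiates it with the freshly created model and the training data. A hedged call-site sketch (the prediction job, the objective class, and the dataframe are assumed to come from the surrounding project):

# Hypothetical call site for optuna_optimization.
study, objective = optuna_optimization(
    pj=pj,
    objective=XGBRegressorObjective,  # pass the class; it is instantiated inside
    validated_data_with_features=validated_data_with_features,
    n_trials=8,
)
print(study.best_trial.params)  # best hyperparameters found by the study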
    def test_call(self):
        model_type = "xgb"
        model = ModelCreator.create_model(model_type)

        objective = XGBRegressorObjective(
            model,
            input_data,
        )

        study = optuna.create_study(
            study_name=model_type,
            pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
            direction="minimize",
        )

        with self.assertRaises(RuntimeError):
            study.optimize(objective, n_trials=N_TRIALS)
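
The objective instances in these tests follow optuna's callable protocol: study.optimize calls the object once per trial with a trial argument and minimizes the returned float. A generic, self-contained sketch of that protocol (not the openstef implementation):

import optuna


class SketchObjective:
    """Minimal optuna objective: a callable taking a trial and returning a float."""

    def __init__(self, model, input_data):
        self.model = model
        self.input_data = input_data

    def __call__(self, trial: optuna.trial.Trial) -> float:
        # Sample a hyperparameter and return the value to minimize.
        x = trial.suggest_float("x", -10.0, 10.0)
        return (x - 2.0) ** 2


study = optuna.create_study(direction="minimize")
study.optimize(SketchObjective(model=None, input_data=None), n_trials=5)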
    def test_call(self):
        model_type = "linear"
        model = ModelCreator.create_model(model_type)

        objective = LinearRegressorObjective(
            model,
            input_data_with_features,
        )
        study = optuna.create_study(
            study_name=model_type,
            pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
            direction="minimize",
        )
        study.optimize(objective, n_trials=N_TRIALS)

        self.assertIsInstance(objective, LinearRegressorObjective)
        self.assertEqual(len(study.trials), N_TRIALS)
    def test_call(self):
        input_data = TestData.load("reference_sets/307-train-data.csv")
        pj = {"model": "proloaf"}
        input_data_with_features = TrainFeatureApplicator(
            horizons=[24.0]).add_features(input_data, pj=pj)

        model_type = "proloaf"
        model = ModelCreator.create_model(model_type)

        objective = ProLoafRegressorObjective(
            model,
            input_data_with_features,
        )
        study = optuna.create_study(
            study_name=model_type,
            pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
            direction="minimize",
        )
        study.optimize(objective, n_trials=1)

        self.assertIsInstance(objective, ProLoafRegressorObjective)
        self.assertEqual(len(study.trials), 1)
def train_pipeline_step_train_model(
    pj: PredictionJobDataClass,
    model_specs: ModelSpecificationDataClass,
    train_data: pd.DataFrame,
    validation_data: pd.DataFrame,
) -> OpenstfRegressor:
    """Train the model
    Args:
        pj (PredictionJobDataClass): Prediction job
        model_specs (ModelSpecificationDataClass): Dataclass containing model specifications
        train_data (pd.DataFrame): The training data
        validation_data (pd.DataFrame): The test data

    Returns:
        trained_model (OpenstfRegressor): The trained model
    """
    # Test if the first column is "load" and the last column is "horizon"
    if train_data.columns[0] != "load" or train_data.columns[-1] != "horizon":
        raise InputDataWrongColumnOrderError(
            f"Wrong column order for {pj['id']}: "
            "'load' column should be first and 'horizon' column last."
        )

    # Create relevant model
    model = ModelCreator.create_model(
        pj["model"],
        quantiles=pj["quantiles"],
    )

    # split x and y data
    train_x, train_y = train_data.iloc[:, 1:-1], train_data.iloc[:, 0]
    validation_x, validation_y = (
        validation_data.iloc[:, 1:-1],
        validation_data.iloc[:, 0],
    )

    # Configure evals for early stopping
    eval_set = [(train_x, train_y), (validation_x, validation_y)]

    # Set relevant hyperparameters
    # define protected hyperparams which are derived from prediction_job
    protected_hyperparams = ["quantiles"]
    valid_hyper_parameters = {
        key: value
        for key, value in model_specs.hyper_params.items() if
        key in model.get_params().keys() and key not in protected_hyperparams
    }

    model.set_params(**valid_hyper_parameters)
    model.fit(
        train_x,
        train_y,
        eval_set=eval_set,
        early_stopping_rounds=DEFAULT_EARLY_STOPPING_ROUNDS,
        verbose=False,
    )
    # Get the feature importance dataframe, or None if the model has no feature importance
    model.feature_importance_dataframe = model.set_feature_importance()

    logging.info("Fitted a new model, not yet stored")

    # Do confidence interval determination
    model = StandardDeviationGenerator(
        validation_data).generate_standard_deviation_data(model)

    return model
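
The column-order check and the iloc-based split above imply a frame with the target first, features in between, and the horizon last. A minimal sketch with made-up feature names:

import numpy as np
import pandas as pd

# Hypothetical frame satisfying the 'load'-first / 'horizon'-last contract.
index = pd.date_range("2021-01-01", periods=96, freq="15min")
train_data = pd.DataFrame(
    {
        "load": np.random.rand(96),          # target: must be the first column
        "temperature": np.random.rand(96),   # made-up feature column
        "radiation": np.random.rand(96),     # made-up feature column
        "horizon": 24.0,                     # must be the last column
    },
    index=index,
)
# The pipeline then splits features and target exactly as above:
train_x, train_y = train_data.iloc[:, 1:-1], train_data.iloc[:, 0]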
    def test_create_model_unknown_model(self):
        # Test if NotImplementedError is raised when the model type is unknown
        model_type = "Unknown"
        with self.assertRaises(NotImplementedError):
            ModelCreator.create_model(model_type)