コード例 #1
0
 def __validate_project_name(self, project_name):
     """
     Run ``project_name`` through ``check_id`` and hand the same value back.

     :param project_name: candidate AutoML project name.
     :returns: ``project_name``, unchanged.
     """
     # The result of check_id is ignored; presumably it raises on an invalid id -- TODO confirm.
     candidate = project_name
     check_id(candidate, "H2OAutoML")
     return candidate
コード例 #2
0
    def __init__(self,
                 nfolds=5,
                 balance_classes=False,
                 class_sampling_factors=None,
                 max_after_balance_size=5.0,
                 max_runtime_secs=3600,
                 max_runtime_secs_per_model=None,
                 max_models=None,
                 stopping_metric="AUTO",
                 stopping_tolerance=None,
                 stopping_rounds=3,
                 seed=None,
                 project_name=None,
                 exclude_algos=None,
                 include_algos=None,
                 modeling_plan=None,
                 keep_cross_validation_predictions=False,
                 keep_cross_validation_models=False,
                 keep_cross_validation_fold_assignment=False,
                 sort_metric="AUTO",
                 export_checkpoints_dir=None,
                 verbosity="warn"):
        """
        Create a new H2OAutoML instance.

        The constructor validates its arguments and collects them into two request payloads,
        ``self.build_control`` and ``self.build_models``; nothing is trained here.

        :param int nfolds: Number of folds for k-fold cross-validation. Defaults to ``5``. Use ``0`` to disable cross-validation; this will also
          disable Stacked Ensemble (thus decreasing the overall model performance). ``1`` and negative values are rejected.
        :param bool balance_classes: Balance training data class counts via over/under-sampling (for imbalanced data).  Defaults to ``False``.
        :param class_sampling_factors: Desired over/under-sampling ratios per class (in lexicographic order). If not specified, sampling
          factors will be automatically computed to obtain class balance during training. Requires ``balance_classes``.
        :param float max_after_balance_size: Maximum relative size of the training data after balancing class counts (can be less than 1.0).
          Requires ``balance_classes``. Defaults to ``5.0``.
        :param int max_runtime_secs: This argument controls how long the AutoML run will execute. Defaults to ``3600`` seconds (1 hour).
        :param int max_runtime_secs_per_model: This argument controls the max time the AutoML run will dedicate to each individual model.
          Defaults to ``None``, in which case the setting is not added to the build control.
        :param int max_models: Specify the maximum number of models to build in an AutoML run. (Does not include the Stacked Ensemble models.)
        :param str stopping_metric: Specifies the metric to use for early stopping. Defaults to ``"AUTO"``.
          The available options are:
          ``"AUTO"`` (This defaults to ``"logloss"`` for classification, ``"deviance"`` for regression),
          ``"deviance"``, ``"logloss"``, ``"mse"``, ``"rmse"``, ``"mae"``, ``"rmsle"``, ``"auc"``, ``"lift_top_group"``,
          ``"misclassification"``, ``"mean_per_class_error"``, ``"r2"``.
        :param float stopping_tolerance: This option specifies the relative tolerance for the metric-based stopping
          to stop the AutoML run if the improvement is less than this value. This value defaults to ``0.001``
          if the dataset is at least 1 million rows; otherwise it defaults to a value determined by the size of the dataset
          and the non-NA-rate.  In that case, the value is computed as 1/sqrt(nrows * non-NA-rate).
        :param int stopping_rounds: This argument stops training new models in the AutoML run when the option selected for
          stopping_metric doesn't improve for the specified number of models, based on a simple moving average.
          To disable this feature, set it to ``0``. Defaults to ``3`` and must be a non-negative integer.
        :param int seed: Set a seed for reproducibility. AutoML can only guarantee reproducibility if ``max_models`` or
          early stopping is used because ``max_runtime_secs`` is resource limited, meaning that if the resources are
          not the same between runs, AutoML may be able to train more models on one run vs another.  Defaults to ``None``.
        :param str project_name: Character string to identify an AutoML project. Defaults to ``None``, which means
          a project name will be auto-generated based on the training frame ID.  More models can be trained on an
          existing AutoML project by specifying the same project name in multiple calls to the AutoML function
          (as long as the same training frame is used in subsequent runs).
        :param exclude_algos: List of character strings naming the algorithms to skip during the model-building phase.
          An example use is ``exclude_algos = ["GLM", "DeepLearning", "DRF"]``, and the full list of options is: ``"DRF"``
          (Random Forest and Extremely-Randomized Trees), ``"GLM"``, ``"XGBoost"``, ``"GBM"``, ``"DeepLearning"`` and ``"StackedEnsemble"``.
          Defaults to ``None``, which means that all appropriate H2O algorithms will be used, if the search stopping criteria allow. Optional.
        :param include_algos: List of character strings naming the algorithms to restrict to during the model-building phase.
          This can't be used in combination with `exclude_algos` param.
          Defaults to ``None``, which means that all appropriate H2O algorithms will be used, if the search stopping criteria allow. Optional.
        :param modeling_plan: Optional list of step definitions. Each entry is either a name (``str``), a ``dict`` with a ``'name'``
          key plus at most one of ``'alias'`` / ``'steps'``, or a 1- or 2-tuple ``(name,)`` / ``(name, alias)`` / ``(name, [step ids])``.
          Aliases must be one of ``'all'``, ``'defaults'``, ``'grids'``. Defaults to ``None``.
        :param keep_cross_validation_predictions: Whether to keep the predictions of the cross-validation predictions.
          This needs to be set to ``True`` if running the same AutoML object for repeated runs because CV predictions are required to build
          additional Stacked Ensemble models in AutoML. This option defaults to ``False``.
        :param keep_cross_validation_models: Whether to keep the cross-validated models. Keeping cross-validation models may consume
          significantly more memory in the H2O cluster. Defaults to ``False``.
        :param keep_cross_validation_fold_assignment: Whether to keep fold assignments in the models. Deleting them will save memory
          in the H2O cluster. This option defaults to ``False``. Forced to ``False`` when ``nfolds`` is ``0``.
        :param sort_metric: Metric to sort the leaderboard by. Defaults to ``"AUTO"`` (This defaults to ``auc`` for binomial classification,
          ``mean_per_class_error`` for multinomial classification, ``deviance`` for regression). For binomial classification choose between
          ``auc``, ``"logloss"``, ``"mean_per_class_error"``, ``"rmse"``, ``"mse"``.  For regression choose between ``"deviance"``, ``"rmse"``,
          ``"mse"``, ``"mae"``, ``"rmsle"``. For multinomial classification choose between ``"mean_per_class_error"``, ``"logloss"``, ``"rmse"``, ``"mse"``.
        :param export_checkpoints_dir: Path to a directory where every model will be stored in binary form.
        :param verbosity: Verbosity of the backend messages printed during training.
            Available options are None (live log disabled), 'debug', 'info' or 'warn'. Defaults to 'warn'.
        """
        # Check if H2O jar contains AutoML.
        # Best effort only: a failed probe is reported on stdout and construction continues.
        try:
            h2o.api("GET /3/Metadata/schemas/AutoMLV99")
        except h2o.exceptions.H2OResponseError as e:
            print(e)
            print("*******************************************************************\n" \
                  "*Please verify that your H2O jar has the proper AutoML extensions.*\n" \
                  "*******************************************************************\n" \
                  "\nVerbose Error Message:")

        # Make bare minimum build_control (max_runtime_secs is type-checked further below)
        self.build_control = {
            'stopping_criteria': {
                'max_runtime_secs': max_runtime_secs,
            }
        }

        # Make bare minimum build_models
        self.build_models = {
            'exclude_algos': None
        }

        # nfolds must be a non-negative integer and not equal to 1.
        # The type check runs unconditionally (the default passes it), and values are compared with
        # ``!=`` rather than ``is not``: identity against a literal depends on interpreter interning.
        assert_is_type(nfolds, int)
        assert nfolds >= 0, "nfolds set to " + str(nfolds) + "; nfolds cannot be negative. Use nfolds >=2 if you want cross-valiated metrics and Stacked Ensembles or use nfolds = 0 to disable."
        assert nfolds != 1, "nfolds set to " + str(nfolds) + "; nfolds = 1 is an invalid value. Use nfolds >=2 if you want cross-valiated metrics and Stacked Ensembles or use nfolds = 0 to disable."
        self.build_control["nfolds"] = nfolds
        self.nfolds = nfolds

        # Pass through to all algorithms.
        # NOTE: these attributes (and build_control entries) are only created when the
        # corresponding argument differs from its default.
        if balance_classes is True:
            self.build_control["balance_classes"] = balance_classes
            self.balance_classes = balance_classes
        if class_sampling_factors is not None:
            self.build_control["class_sampling_factors"] = class_sampling_factors
            self.class_sampling_factors = class_sampling_factors
        if max_after_balance_size != 5.0:
            assert_is_type(max_after_balance_size, float)
            self.build_control["max_after_balance_size"] = max_after_balance_size
            self.max_after_balance_size = max_after_balance_size

        # The value was already stored in stopping_criteria above; the default (3600) passes this check.
        assert_is_type(max_runtime_secs, int)
        self.max_runtime_secs = max_runtime_secs

        assert_is_type(max_runtime_secs_per_model, None, int)
        self.max_runtime_secs_per_model = max_runtime_secs_per_model
        if self.max_runtime_secs_per_model is not None:
            self.build_control["stopping_criteria"]["max_runtime_secs_per_model"] = self.max_runtime_secs_per_model

        # Add other parameters to build_control if available
        if max_models is not None:
            assert_is_type(max_models, int)
            self.build_control["stopping_criteria"]["max_models"] = max_models
        self.max_models = max_models

        assert_is_type(stopping_metric, str)
        self.build_control["stopping_criteria"]["stopping_metric"] = stopping_metric
        self.stopping_metric = stopping_metric

        if stopping_tolerance is not None:
            assert_is_type(stopping_tolerance, float)
            self.build_control["stopping_criteria"]["stopping_tolerance"] = stopping_tolerance
        # Misspelled attribute kept so existing readers of ``stopping_tolerence`` keep working;
        # the correctly spelled name is provided alongside it.
        self.stopping_tolerence = stopping_tolerance
        self.stopping_tolerance = stopping_tolerance

        assert_is_type(stopping_rounds, int)
        self.build_control["stopping_criteria"]["stopping_rounds"] = stopping_rounds
        self.stopping_rounds = stopping_rounds

        if seed is not None:
            assert_is_type(seed, int)
            self.build_control["stopping_criteria"]["seed"] = seed
            self.seed = seed

        # Set project name if provided. If None, then we set in .train() to "automl_" + training_frame.frame_id
        if project_name is not None:
            assert_is_type(project_name, str)
            check_id(project_name, "H2OAutoML")
            self.build_control["project_name"] = project_name
            self.project_name = project_name
        else:
            self.project_name = None

        if exclude_algos is not None:
            assert_is_type(exclude_algos, list)
            for elem in exclude_algos:
                assert_is_type(elem, str)
            self.build_models['exclude_algos'] = exclude_algos

        if include_algos is not None:
            assert exclude_algos is None, "Use either include_algos or exclude_algos, not both."
            assert_is_type(include_algos, list)
            for elem in include_algos:
                assert_is_type(elem, str)
            self.build_models['include_algos'] = include_algos

        if modeling_plan is not None:
            assert_is_type(modeling_plan, list)
            supported_aliases = ['all', 'defaults', 'grids']

            def assert_is_step_def(sd):
                # A dict definition is {'name': ...} plus at most one of 'alias' or 'steps'.
                assert 'name' in sd, "each definition must have a 'name' key"
                assert 0 < len(sd) < 3, "each definition must have only 1 or 2 keys: name, name+alias or name+steps"
                assert len(sd) == 1 or 'alias' in sd or 'steps' in sd, "steps definitions support only the following keys: name, alias, steps"
                assert 'alias' not in sd or sd['alias'] in supported_aliases, "alias must be one of %s" % supported_aliases
                assert 'steps' not in sd or (is_type(sd['steps'], list) and all(assert_is_step(s) for s in sd['steps']))

            def assert_is_step(s):
                # Returns True so it can be used inside all(...) above; failures raise AssertionError.
                assert is_type(s, dict), "each step must be a dict with an 'id' key and an optional 'weight' key"
                assert 'id' in s, "each step must have an 'id' key"
                assert len(s) == 1 or ('weight' in s and is_type(s['weight'], int)), "weight must be an integer"
                return True

            # Normalise every accepted input form (dict / str / tuple) into a dict definition.
            plan = []
            for step_def in modeling_plan:
                assert_is_type(step_def, dict, tuple, str)
                if is_type(step_def, dict):
                    assert_is_step_def(step_def)
                    plan.append(step_def)
                elif is_type(step_def, str):
                    plan.append(dict(name=step_def))
                else:
                    # tuple form: (name,), (name, alias) or (name, [step ids])
                    assert 0 < len(step_def) < 3
                    assert_is_type(step_def[0], str)
                    name = step_def[0]
                    if len(step_def) == 1:
                        plan.append(dict(name=name))
                    else:
                        assert_is_type(step_def[1], str, list)
                        ids = step_def[1]
                        if is_type(ids, str):
                            assert_is_type(ids, *supported_aliases)
                            plan.append(dict(name=name, alias=ids))
                        else:
                            plan.append(dict(name=name, steps=[dict(id=i) for i in ids]))
            self.build_models['modeling_plan'] = plan

        assert_is_type(keep_cross_validation_predictions, bool)
        self.build_control["keep_cross_validation_predictions"] = keep_cross_validation_predictions

        assert_is_type(keep_cross_validation_models, bool)
        self.build_control["keep_cross_validation_models"] = keep_cross_validation_models

        assert_is_type(keep_cross_validation_fold_assignment, bool)
        # Fold assignments are only requested when cross-validation is enabled.
        self.build_control["keep_cross_validation_fold_assignment"] = self.nfolds != 0 and keep_cross_validation_fold_assignment

        self._job = None
        self._leader_id = None
        self._leaderboard = None
        self._verbosity = verbosity
        self._event_log = None
        self._training_info = None
        self._state_json = None
        # "AUTO" is stored as None on the instance.
        if sort_metric == "AUTO":
            self.sort_metric = None
        else:
            self.sort_metric = sort_metric

        if export_checkpoints_dir is not None:
            assert_is_type(export_checkpoints_dir, str)
            self.build_control["export_checkpoints_dir"] = export_checkpoints_dir
コード例 #3
0
    def __init__(self,
                 nfolds=5,
                 balance_classes=False,
                 class_sampling_factors=None,
                 max_after_balance_size=5.0,
                 max_runtime_secs=None,
                 max_runtime_secs_per_model=None,
                 max_models=None,
                 stopping_metric="AUTO",
                 stopping_tolerance=None,
                 stopping_rounds=3,
                 seed=None,
                 project_name=None,
                 exclude_algos=None,
                 include_algos=None,
                 exploitation_ratio=0,
                 modeling_plan=None,
                 monotone_constraints=None,
                 algo_parameters=None,
                 keep_cross_validation_predictions=False,
                 keep_cross_validation_models=False,
                 keep_cross_validation_fold_assignment=False,
                 sort_metric="AUTO",
                 export_checkpoints_dir=None,
                 verbosity="warn"):
        """
        Create a new H2OAutoML instance.

        The constructor validates its arguments and collects them into three request payloads:
        ``self.build_control``, ``self.build_models`` and ``self.input_spec``; nothing is trained here.

        :param int nfolds: Number of folds for k-fold cross-validation. Defaults to ``5``. Use ``0`` to disable cross-validation; this will also
          disable Stacked Ensemble (thus decreasing the overall model performance). ``1`` and negative values are rejected.
        :param bool balance_classes: Balance training data class counts via over/under-sampling (for imbalanced data).  Defaults to ``False``.
        :param class_sampling_factors: Desired over/under-sampling ratios per class (in lexicographic order). If not specified, sampling
          factors will be automatically computed to obtain class balance during training. Requires ``balance_classes``.
        :param float max_after_balance_size: Maximum relative size of the training data after balancing class counts (can be less than 1.0).
          Requires ``balance_classes``. Defaults to ``5.0``.
        :param int max_runtime_secs: This argument specifies the maximum time that the AutoML process will run for, prior to training the
          final Stacked Ensemble models. If neither ``max_runtime_secs`` nor ``max_models`` are specified by the user, then
          ``max_runtime_secs`` defaults to 3600 seconds (1 hour).
        :param int max_runtime_secs_per_model: This argument controls the max time the AutoML run will dedicate to each individual model.
          Defaults to ``None``.
        :param int max_models: Specify the maximum number of models to build in an AutoML run. Not limited by default. (Does not include the Stacked Ensemble models.)
        :param str stopping_metric: Specifies the metric to use for early stopping. Defaults to ``"AUTO"``.
          The available options are:
          ``"AUTO"`` (This defaults to ``"logloss"`` for classification, ``"deviance"`` for regression),
          ``"deviance"``, ``"logloss"``, ``"mse"``, ``"rmse"``, ``"mae"``, ``"rmsle"``, ``"auc"``, ``aucpr``, ``"lift_top_group"``,
          ``"misclassification"``, ``"mean_per_class_error"``, ``"r2"``.
        :param float stopping_tolerance: This option specifies the relative tolerance for the metric-based stopping
          to stop the AutoML run if the improvement is less than this value. This value defaults to ``0.001``
          if the dataset is at least 1 million rows; otherwise it defaults to a value determined by the size of the dataset
          and the non-NA-rate.  In that case, the value is computed as 1/sqrt(nrows * non-NA-rate).
        :param int stopping_rounds: This argument stops training new models in the AutoML run when the option selected for
          stopping_metric doesn't improve for the specified number of models, based on a simple moving average.
          To disable this feature, set it to ``0``. Defaults to ``3`` and must be a non-negative integer.
        :param int seed: Set a seed for reproducibility. AutoML can only guarantee reproducibility if ``max_models`` or
          early stopping is used because ``max_runtime_secs`` is resource limited, meaning that if the resources are
          not the same between runs, AutoML may be able to train more models on one run vs another.  Defaults to ``None``.
        :param str project_name: Character string to identify an AutoML project. Defaults to ``None``, which means
          a project name will be auto-generated based on the training frame ID.  More models can be trained on an
          existing AutoML project by specifying the same project name in multiple calls to the AutoML function
          (as long as the same training frame is used in subsequent runs).
        :param exclude_algos: List of character strings naming the algorithms to skip during the model-building phase.
          An example use is ``exclude_algos = ["GLM", "DeepLearning", "DRF"]``, and the full list of options is: ``"DRF"``
          (Random Forest and Extremely-Randomized Trees), ``"GLM"``, ``"XGBoost"``, ``"GBM"``, ``"DeepLearning"`` and ``"StackedEnsemble"``.
          Defaults to ``None``, which means that all appropriate H2O algorithms will be used, if the search stopping criteria allow. Optional.
        :param include_algos: List of character strings naming the algorithms to restrict to during the model-building phase.
          This can't be used in combination with `exclude_algos` param.
          Defaults to ``None``, which means that all appropriate H2O algorithms will be used, if the search stopping criteria allow. Optional.
        :param exploitation_ratio: The budget ratio (between 0 and 1) dedicated to the exploitation (vs exploration) phase. By default,
          the exploitation phase is disabled (exploitation_ratio=0) as this is still experimental; to activate it, it is recommended
          to try a ratio around 0.1. Note that the current exploitation phase only tries to fine-tune the best XGBoost and the best
          GBM found during exploration.
        :param modeling_plan: List of modeling steps to be used by the AutoML engine (they may not all get executed, depending on other constraints).
          Each entry is either a name (``str``), a ``dict`` with a ``'name'`` key plus at most one of ``'alias'`` / ``'steps'``, or a
          1- or 2-tuple ``(name,)`` / ``(name, alias)`` / ``(name, [step ids])``. Defaults to None (Expert usage only).
        :param monotone_constraints: Dict representing monotonic constraints.
          Use +1 to enforce an increasing constraint and -1 to specify a decreasing constraint.
          When given, it is merged into ``algo_parameters`` under the ``"monotone_constraints"`` key.
        :param algo_parameters: Dict of ``param_name=param_value`` to be passed to internal models. Defaults to none (Expert usage only).
          By default, params are set only to algorithms accepting them, and ignored by others.
          A key of the form ``scope__name`` restricts the parameter to ``scope``; otherwise the scope is ``any``.
          Only following parameters are currently allowed: ``"monotone_constraints"``. The dict passed in is not modified.
        :param keep_cross_validation_predictions: Whether to keep the predictions of the cross-validation predictions.
          This needs to be set to ``True`` if running the same AutoML object for repeated runs because CV predictions are required to build
          additional Stacked Ensemble models in AutoML. This option defaults to ``False``.
        :param keep_cross_validation_models: Whether to keep the cross-validated models. Keeping cross-validation models may consume
          significantly more memory in the H2O cluster. Defaults to ``False``.
        :param keep_cross_validation_fold_assignment: Whether to keep fold assignments in the models. Deleting them will save memory
          in the H2O cluster. This option defaults to ``False``.
        :param sort_metric: Metric to sort the leaderboard by. Defaults to ``"AUTO"`` (This defaults to ``auc`` for binomial classification,
          ``mean_per_class_error`` for multinomial classification, ``deviance`` for regression). For binomial classification choose between
          ``auc``, ``aucpr``, ``"logloss"``, ``"mean_per_class_error"``, ``"rmse"``, ``"mse"``.  For regression choose between ``"deviance"``, ``"rmse"``,
          ``"mse"``, ``"mae"``, ``"rmsle"``. For multinomial classification choose between ``"mean_per_class_error"``, ``"logloss"``, ``"rmse"``, ``"mse"``.
        :param export_checkpoints_dir: Path to a directory where every model will be stored in binary form.
        :param verbosity: Verbosity of the backend messages printed during training.
            Available options are None (live log disabled), 'debug', 'info' or 'warn'. Defaults to 'warn'.
        """
        # Check if H2O jar contains AutoML.
        # Best effort only: a failed probe is reported on stdout and construction continues.
        try:
            h2o.api("GET /3/Metadata/schemas/AutoMLV99")
        except h2o.exceptions.H2OResponseError as e:
            print(e)
            print("*******************************************************************\n" \
                  "*Please verify that your H2O jar has the proper AutoML extensions.*\n" \
                  "*******************************************************************\n" \
                  "\nVerbose Error Message:")

        self._job = None
        self._leader_id = None
        self._leaderboard = None
        self._verbosity = verbosity
        self._event_log = None
        self._training_info = None
        self._state_json = None
        self._build_resp = None  # contains all the actual parameters used on backend

        # Make bare minimum params containers
        self.build_control = dict(stopping_criteria=dict())
        self.build_models = dict()
        self.input_spec = dict()

        # build_control params #

        assert_is_type(project_name, None, str)
        check_id(project_name, "H2OAutoML")
        self._project_name = self.build_control["project_name"] = project_name

        assert_is_type(nfolds, int)
        assert nfolds >= 0, "nfolds set to " + str(nfolds) + "; nfolds cannot be negative. Use nfolds >=2 if you want cross-valiated metrics and Stacked Ensembles or use nfolds = 0 to disable."
        # Compare by value: ``is not 1`` tests object identity, which depends on interpreter interning.
        assert nfolds != 1, "nfolds set to " + str(nfolds) + "; nfolds = 1 is an invalid value. Use nfolds >=2 if you want cross-valiated metrics and Stacked Ensembles or use nfolds = 0 to disable."
        self.nfolds = self.build_control["nfolds"] = nfolds

        assert_is_type(balance_classes, bool)
        self.balance_classes = self.build_control["balance_classes"] = balance_classes

        assert_is_type(class_sampling_factors, None, [numeric])
        self.class_sampling_factors = self.build_control["class_sampling_factors"] = class_sampling_factors

        assert_is_type(max_after_balance_size, None, numeric)
        self.max_after_balance_size = self.build_control["max_after_balance_size"] = max_after_balance_size

        assert_is_type(keep_cross_validation_models, bool)
        self.keep_cross_validation_models = self.build_control["keep_cross_validation_models"] = keep_cross_validation_models

        assert_is_type(keep_cross_validation_fold_assignment, bool)
        self.keep_cross_validation_fold_assignment = self.build_control["keep_cross_validation_fold_assignment"] = keep_cross_validation_fold_assignment

        assert_is_type(keep_cross_validation_predictions, bool)
        self.keep_cross_validation_predictions = self.build_control["keep_cross_validation_predictions"] = keep_cross_validation_predictions

        assert_is_type(export_checkpoints_dir, None, str)
        self.export_checkpoints_dir = self.build_control["export_checkpoints_dir"] = export_checkpoints_dir

        # stopping criteria params #
        stopping_criteria = self.build_control["stopping_criteria"]

        assert_is_type(max_runtime_secs, None, int)
        self.max_runtime_secs = stopping_criteria["max_runtime_secs"] = max_runtime_secs

        assert_is_type(max_runtime_secs_per_model, None, int)
        self.max_runtime_secs_per_model = stopping_criteria["max_runtime_secs_per_model"] = max_runtime_secs_per_model

        assert_is_type(max_models, None, int)
        self.max_models = stopping_criteria["max_models"] = max_models

        assert_is_type(stopping_metric, None, str)
        self.stopping_metric = stopping_criteria["stopping_metric"] = stopping_metric

        assert_is_type(stopping_tolerance, None, numeric)
        self.stopping_tolerance = stopping_criteria["stopping_tolerance"] = stopping_tolerance

        assert_is_type(stopping_rounds, None, int)
        self.stopping_rounds = stopping_criteria["stopping_rounds"] = stopping_rounds

        assert_is_type(seed, None, int)
        self.seed = stopping_criteria["seed"] = seed

        # build models params #

        assert_is_type(exclude_algos, None, [str])
        self.exclude_algos = self.build_models['exclude_algos'] = exclude_algos

        assert_is_type(include_algos, None, [str])
        if include_algos is not None:
            assert exclude_algos is None, "Use either include_algos or exclude_algos, not both."
        self.include_algos = self.build_models['include_algos'] = include_algos

        assert_is_type(exploitation_ratio, None, numeric)
        self.exploitation_ratio = self.build_models['exploitation_ratio'] = exploitation_ratio

        assert_is_type(modeling_plan, None, list)
        if modeling_plan is not None:
            supported_aliases = ['all', 'defaults', 'grids']

            def assert_is_step_def(sd):
                # A dict definition is {'name': ...} plus at most one of 'alias' or 'steps'.
                assert 'name' in sd, "each definition must have a 'name' key"
                assert 0 < len(sd) < 3, "each definition must have only 1 or 2 keys: name, name+alias or name+steps"
                assert len(sd) == 1 or 'alias' in sd or 'steps' in sd, "steps definitions support only the following keys: name, alias, steps"
                assert 'alias' not in sd or sd['alias'] in supported_aliases, "alias must be one of %s" % supported_aliases
                assert 'steps' not in sd or (is_type(sd['steps'], list) and all(assert_is_step(s) for s in sd['steps']))

            def assert_is_step(s):
                # Returns True so it can be used inside all(...) above; failures raise AssertionError.
                assert is_type(s, dict), "each step must be a dict with an 'id' key and an optional 'weight' key"
                assert 'id' in s, "each step must have an 'id' key"
                assert len(s) == 1 or ('weight' in s and is_type(s['weight'], int)), "weight must be an integer"
                return True

            # Normalise every accepted input form (dict / str / tuple) into a dict definition.
            plan = []
            for step_def in modeling_plan:
                assert_is_type(step_def, dict, tuple, str)
                if is_type(step_def, dict):
                    assert_is_step_def(step_def)
                    plan.append(step_def)
                elif is_type(step_def, str):
                    plan.append(dict(name=step_def))
                else:
                    # tuple form: (name,), (name, alias) or (name, [step ids])
                    assert 0 < len(step_def) < 3
                    assert_is_type(step_def[0], str)
                    name = step_def[0]
                    if len(step_def) == 1:
                        plan.append(dict(name=name))
                    else:
                        assert_is_type(step_def[1], str, list)
                        ids = step_def[1]
                        if is_type(ids, str):
                            assert_is_type(ids, *supported_aliases)
                            plan.append(dict(name=name, alias=ids))
                        else:
                            plan.append(dict(name=name, steps=[dict(id=i) for i in ids]))
            self.modeling_plan = self.build_models['modeling_plan'] = plan
        else:
            self.modeling_plan = None

        assert_is_type(algo_parameters, None, dict)
        if monotone_constraints is not None:
            # Merge into a shallow copy so the dict supplied by the caller is never mutated.
            algo_parameters = dict(algo_parameters) if algo_parameters is not None else {}
            self.monotone_constraints = algo_parameters['monotone_constraints'] = monotone_constraints
        else:
            self.monotone_constraints = None

        if algo_parameters is not None:
            algo_parameters_json = []
            for param_key, param_value in algo_parameters.items():
                # "scope__name" targets one scope; a bare "name" applies to 'any'.
                scope, _sep, name = param_key.partition('__')
                if len(name) == 0:
                    name, scope = scope, 'any'
                if isinstance(param_value, dict):
                    # we can't use stringify_dict here as this will be converted into a JSON string
                    value = [dict(key=k, value=v) for k, v in param_value.items()]
                else:
                    value = param_value
                algo_parameters_json.append(dict(scope=scope, name=name, value=value))

            self.algo_parameters = self.build_models['algo_parameters'] = algo_parameters_json
        else:
            self.algo_parameters = None

        # input spec params #

        assert_is_type(sort_metric, None, str)
        self.sort_metric = self.input_spec['sort_metric'] = sort_metric