    def predict(self, X, **kwargs):
        model, _, _, _ = self.get_model_properties()
        X = dt.Frame(X)
        X = self.inf_impute(X)
        h2o.init(port=config.h2o_recipes_port, log_dir=self.my_log_dir)
        model_path = os.path.join(exp_dir(), self.id)
        model_file = os.path.join(model_path, "h2o_model." + str(uuid.uuid4()) + ".bin")
        os.makedirs(model_path, exist_ok=True)
        with open(model_file, "wb") as f:
            f.write(model)
        model = h2o.load_model(os.path.abspath(model_file))
        test_frame = h2o.H2OFrame(X.to_pandas(), column_types=self.col_types)
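        # reuse the column types captured at fit time so h2o-3 parses the test
        # frame the same way as the training frame (avoids enum/real type drift)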
        preds_frame = None

        try:
            if kwargs.get("pred_contribs"):
                orig_cols = list(X.names)
                df_varimp_orig, df_varimp, df_varimp_merged = self.get_df_varimp(model, orig_cols)
                dfmap = {k: v for k, v in zip(df_varimp_orig.index, df_varimp.index)}
                preds_df = model.predict_contributions(test_frame).as_data_frame(header=False)
                # this only has to work for regression and binary since h2o-3 does not support multiclass shapley
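                # dfmap renames derived/encoded column names back to their original features,
                # and the groupby(axis=1).sum() below collapses all contributions that map to
                # the same original column into a single Shapley column per feature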
                preds_df.columns = [dfmap.get(x, x) for x in preds_df.columns]
                preds_df = preds_df.groupby(preds_df.columns, axis=1).sum()
                return preds_df.values
            preds_frame = model.predict(test_frame)
            preds = preds_frame.as_data_frame(header=False)

            is_final = 'IS_FINAL' in kwargs
            struuid = str(uuid.uuid4())
            json_file = os.path.join(exp_dir(), 'stderr_is_final_%s_%s.json' % (is_final, struuid))

            if self.num_classes == 1:
                if self.doing_p_values():
                    df = preds.iloc[:, 1]
                    with open(json_file, "wt") as f:
                        pd.set_option('precision', 16)
                        f.write(json.dumps(json.loads(df.to_json()), indent=4))
                        pd.set_option('precision', 6)
                    return preds.iloc[:, 0].values.ravel()
                else:
                    return preds.values.ravel()
            elif self.num_classes == 2:
                if self.doing_p_values():
                    df = preds.iloc[:, 2]
                    with open(json_file, "wt") as f:
                        pd.set_option('precision', 16)
                        f.write(json.dumps(json.loads(df.to_json()), indent=4))
                        pd.set_option('precision', 6)
                    return preds.iloc[:, -2].values.ravel()
                else:
                    return preds.iloc[:, -1].values.ravel()
            else:
                return preds.iloc[:, 1:].values
        finally:
            # h2o.remove(self.id) # Cannot remove id, do multiple predictions on same model
            h2o.remove(test_frame)
            remove(model_file)
            if preds_frame is not None:
                h2o.remove(preds_frame)
    def logger(self):
        from h2oaicore import application_context
        from h2oaicore.systemutils import exp_dir
        # Don't assign to self, not picklable
        return make_experiment_logger(
            experiment_id=application_context.context.experiment_id,
            tmp_dir=None,
            experiment_tmp_dir=exp_dir())
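    # Minimal usage sketch (hypothetical call site): pass the returned logger to the
    # h2oaicore.systemutils logging helpers used elsewhere in these recipes, e.g.:
    #   loggerinfo(self.logger(), "predict started")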
Example 3
    def predict(self, X, y=None, **kwargs):
        model, features, importances, iterations = self.get_model_properties()
        if not self._save_by_pickle:
            from catboost import CatBoostClassifier, CatBoostRegressor, EFstrType
            if self.num_classes >= 2:
                from_file = CatBoostClassifier()
            else:
                from_file = CatBoostRegressor()
            with open(self.model_path, mode='wb') as f:
                f.write(model)
            model = from_file.load_model(self.model_path)

        # FIXME: Do equivalent throttling of predict size like def _predict_internal(self, X, **kwargs), wrap-up.
        if isinstance(X, dt.Frame) and len(self.params['cat_features']) == 0:
            # dt -> catboost internally using buffer leaks, so convert here
            # assume predict is after pipeline collection or in subprocess so needs no protection
            X = X.to_numpy()  # don't assign back to X so don't damage during predict
            X = np.ascontiguousarray(X, dtype=np.float32 if config.data_precision == "float32" else np.float64)

        X, eval_set = self.process_cats(X, None, self.feature_names_fitted)

        pred_contribs = kwargs.get('pred_contribs', False)
        output_margin = kwargs.get('output_margin', False)
        fast_approx = kwargs.pop('fast_approx', False)
        if fast_approx:
            iterations = min(config.fast_approx_num_trees, iterations)
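        # fast_approx simply truncates the ensemble: ntree_end below stops scoring at
        # `iterations`, trading a little accuracy for proportionally cheaper predictions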

        # implicit import
        from catboost import CatBoostClassifier, CatBoostRegressor, EFstrType, Pool
        n_jobs = max(1, physical_cores_count)
        if not pred_contribs and not output_margin:
            if self.num_classes >= 2:
                preds = model.predict_proba(
                    X,
                    ntree_start=0,
                    ntree_end=iterations,  # index of first tree *not* to be used
                    thread_count=self.params_base.get(
                        'n_jobs', n_jobs),  # -1 is not supported
                )

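                # binary case: return only the positive-class probability column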
                if preds.shape[1] == 2:
                    return preds[:, 1]
                else:
                    return preds
            else:
                return model.predict(
                    X,
                    ntree_start=0,
                    ntree_end=iterations,  # index of first tree *not* to be used
                    thread_count=self.params_base.get(
                        'n_jobs', n_jobs),  # -1 is not supported
                )
        elif output_margin:
            # uses "predict" for raw for any class
            preds = model.predict(
                X,
                prediction_type="RawFormulaVal",
                ntree_start=0,
                ntree_end=iterations,  # index of first tree *not* to be used
                thread_count=self.params_base.get(
                    'n_jobs', n_jobs),  # -1 is not supported
            )
            if len(preds.shape) > 1 and preds.shape[1] == 2 and self.num_classes == 2:
                return preds[:, 1]
            else:
                return preds
        elif pred_contribs:
            # For Shapley, doesn't come from predict
            # For regression/binary, shap is shape of (rows, features + bias)
            # for multiclass, shap is shape of (rows, classes, features + bias)
            data = Pool(X, label=y, cat_features=self.params['cat_features'])
            if fast_approx:
                # https://github.com/catboost/catboost/issues/1146
                # https://github.com/catboost/catboost/issues/1535
                # can't specify trees, but they have approx version
                # Regular, Exact, or Approximate
                shap_calc_type = "Approximate"
            else:
                shap_calc_type = "Regular"
            # See also shap_mode
            # help(CatBoostClassifier.get_feature_importance)
            print_debug("shap_calc_type: %s" % shap_calc_type)

            pickle_path = None
            if config.debug_daimodel_level >= 2:
                self.uuid = str(uuid.uuid4())[:6]
                pickle_path = os.path.join(
                    exp_dir(), "catboost_shappredict%s.tmp.pickle" % self.uuid)
                model.save_model(
                    os.path.join(exp_dir(), "catshapproblem%s.catboost.model" %
                                 self.uuid))
                # save_obj((self, self.model, model, X, y, kwargs, shap_calc_type, self.params['cat_features']), pickle_path)
                save_obj((model, X, y, kwargs, shap_calc_type,
                          self.params['cat_features']), pickle_path)

            preds_shap = model.get_feature_importance(
                data=data,
                thread_count=self.params_base.get(
                    'n_jobs', n_jobs),  # -1 is not supported,
                type=EFstrType.ShapValues,
                shap_calc_type=shap_calc_type,
            )
            # repair broken shap sum: https://github.com/catboost/catboost/issues/1125
            print_debug("shap_fix")
            preds_raw = model.predict(
                X,
                prediction_type="RawFormulaVal",
                ntree_start=0,
                ntree_end=iterations,  # index of first tree *not* to be used
                thread_count=self.params_base.get(
                    'n_jobs', n_jobs),  # -1 is not supported
            )
            if self.num_classes <= 2:
                axis = 1
            else:
                axis = 2
            orig_sum = np.sum(preds_shap, axis=axis)
            print_debug("shap_fix2")
            # avoid division by 0, need different trick, e.g. change baseline, to fix that case
            if axis == 1:
                orig_sum[orig_sum == 0.0] = 1.0
                preds_shap = preds_shap * preds_raw[:, None] / orig_sum[:, None]
            else:
                # each feature and each class must sum up
                orig_sum[orig_sum == 0.0] = 1.0
                preds_shap = preds_shap * preds_raw[:, :, None] / orig_sum[:, :, None]
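            # e.g. if a row's raw prediction is 2.0 but its shap values sum to 1.6, every
            # contribution (including the bias column) is scaled by 2.0 / 1.6 = 1.25 so the
            # row again sums exactly to the raw prediction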

            if config.hard_asserts and config.debug_daimodel_level >= 2:
                print_debug("shap_check")
                model.save_model(os.path.join(exp_dir(), "catshapproblem"))
                pickle.dump((X, y, self.params['cat_features']),
                            open(os.path.join(exp_dir(), "catshapproblem.pkl"),
                                 "wb"))
                preds_raw = model.predict(
                    X,
                    prediction_type="RawFormulaVal",
                    ntree_start=0,
                    ntree_end=iterations,  # index of first tree *not* to be used
                    thread_count=self.params_base.get(
                        'n_jobs', n_jobs),  # -1 is not supported
                )

                assert np.isclose(preds_raw, np.sum(preds_shap, axis=axis)).all(), \
                    "catboost shapley does not sum up correctly"

            if config.debug_daimodel_level <= 2:
                remove(pickle_path)

            if axis == 1:
                return preds_shap
            else:
                # DAI expects shape (rows, classes * (features + 1)), with columns laid out
                # in per-class blocks: feature_0_class_0, feature_1_class_0, ..., bias_class_0,
                # feature_0_class_1, feature_1_class_1, ...
                return preds_shap.reshape(preds_shap.shape[0], preds_shap.shape[1] * preds_shap.shape[2])
        else:
            raise RuntimeError("No such case")
Example 4
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):
        logger = None
        if self._make_logger:
            # Example use of logger, with required import of:
            #  from h2oaicore.systemutils import make_experiment_logger, loggerinfo
            # Can use loggerwarning, loggererror, etc. for different levels
            if self.context and self.context.experiment_id:
                logger = make_experiment_logger(
                    experiment_id=self.context.experiment_id,
                    tmp_dir=self.context.tmp_dir,
                    experiment_tmp_dir=self.context.experiment_tmp_dir)

        if self._show_logger_test:
            loggerinfo(logger, "TestLOGGER: Fit CatBoost")

        if self._show_task_test:
            # Example task sync operations
            if hasattr(self, 'test_count'):
                self.test_count += 1
            else:
                self.test_count = 0

            # The below generates a message in the GUI notifications panel
            if self.test_count == 0 and self.context and self.context.experiment_id:
                warning = "TestWarning: First CatBoost fit for this model instance"
                loggerwarning(logger, warning)
                task = kwargs.get('task')
                if task:
                    task.sync(key=self.context.experiment_id,
                              progress=dict(type='warning', data=warning))
                    task.flush()

            # The below generates a message in the GUI top-middle panel above the progress wheel
            if self.test_count == 0 and self.context and self.context.experiment_id:
                message = "Tuning CatBoost"
                loggerinfo(logger, message)
                task = kwargs.get('task')
                if task:
                    task.sync(key=self.context.experiment_id,
                              progress=dict(type='update', message=message))
                    task.flush()

        from catboost import CatBoostClassifier, CatBoostRegressor, EFstrType

        # label encode target and setup type of problem
        lb = LabelEncoder()
        if self.num_classes >= 2:
            lb.fit(self.labels)
            y = lb.transform(y)
            if eval_set is not None:
                valid_X = eval_set[0][0]
                valid_y = eval_set[0][1]
                valid_y = lb.transform(valid_y)
                eval_set = [(valid_X, valid_y)]
            self.params.update({'objective': 'Logloss'})
        if self.num_classes > 2:
            self.params.update({'objective': 'MultiClass'})

        if isinstance(X, dt.Frame):
            orig_cols = list(X.names)
            numeric_cols = list(X[:, [bool, int, float]].names)
        else:
            orig_cols = list(X.columns)
            numeric_cols = list(X.select_dtypes([np.number]).columns)

        # unlike lightgbm, which needs label-encoded categoricals, catboost can take raw strings etc.
        self.params['cat_features'] = [
            i for i, x in enumerate(orig_cols)
            if 'CatOrig:' in x or 'Cat:' in x or x not in numeric_cols
        ]
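        # e.g. a column named "CatOrig:city" or any non-numeric column is flagged here by
        # positional index; catboost accepts categorical features as column indices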

        if not self.get_uses_gpus(self.params):
            # monotonicity constraints not available for GPU for catboost
            # get names of columns in same order
            X_names = list(dt.Frame(X).names)
            X_numeric = self.get_X_ordered_numerics(X)
            X_numeric_names = list(X_numeric.names)
            constraints = self.set_monotone_constraints(X=X_numeric, y=y)
            # if non-numerics, then fix those to have 0 constraint
            self.params['monotone_constraints'] = [0] * len(X_names)
            colnumi = 0
            for coli, col in enumerate(X_names):
                if col in X_numeric_names:
                    self.params['monotone_constraints'][coli] = constraints[colnumi]
                    colnumi += 1
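            # e.g. X_names = ['num1', 'cat1', 'num2'] with numeric constraints [1, -1]
            # yields monotone_constraints = [1, 0, -1]; categoricals stay unconstrained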

        if isinstance(X, dt.Frame) and len(self.params['cat_features']) == 0:
            # dt -> catboost internally using buffer leaks, so convert here
            # assume predict is after pipeline collection or in subprocess so needs no protection
            X = X.to_numpy()  # don't assign back to X so don't damage during predict
            X = np.ascontiguousarray(X, dtype=np.float32 if config.data_precision == "float32" else np.float64)
            if eval_set is not None:
                valid_X = eval_set[0][0].to_numpy()  # don't assign back so don't damage during predict
                valid_X = np.ascontiguousarray(valid_X, dtype=np.float32 if config.data_precision == "float32" else np.float64)
                valid_y = eval_set[0][1]
                eval_set = [(valid_X, valid_y)]

        if eval_set is not None:
            valid_X_shape = eval_set[0][0].shape
        else:
            valid_X_shape = None

        X, eval_set = self.process_cats(X, eval_set, orig_cols)

        # modify self.params_base['gpu_id'] based upon actually-available GPU based upon training and valid shapes
        self.acquire_gpus_function(train_shape=X.shape,
                                   valid_shape=valid_X_shape)

        params = copy.deepcopy(self.params)  # keep separate, since they can then be pulled from lightgbm params
        params = self.transcribe_params(params=params, **kwargs)

        if logger is not None:
            loggerdata(
                logger,
                "CatBoost parameters: params_base : %s params: %s catboost_params: %s"
                % (str(self.params_base), str(self.params), str(params)))

        if self.num_classes == 1:
            self.model = CatBoostRegressor(**params)
        else:
            self.model = CatBoostClassifier(**params)
        # Hit sometimes: Exception: catboost/libs/data_new/quantization.cpp:779: All features are either constant or ignored.
        if self.num_classes == 1:
            # assume not mae, which would use median
            # baseline = [np.mean(y)] * len(y)
            baseline = None
        else:
            baseline = None

        kwargs_fit = dict(baseline=baseline, eval_set=eval_set)
        pickle_path = None
        if config.debug_daimodel_level >= 2:
            self.uuid = str(uuid.uuid4())[:6]
            pickle_path = os.path.join(exp_dir(),
                                       "catboost%s.tmp.pickle" % self.uuid)
            save_obj((self.model, X, y, sample_weight, kwargs_fit),
                     pickle_path)

        # FIT (with migration safety before hyperopt/Optuna function added)
        try:
            if hasattr(self, 'dask_or_hyper_or_normal_fit'):
                self.dask_or_hyper_or_normal_fit(X,
                                                 y,
                                                 sample_weight=sample_weight,
                                                 kwargs=kwargs,
                                                 **kwargs_fit)
            else:
                self.model.fit(X, y, sample_weight=sample_weight, **kwargs_fit)
        except Exception as e:
            if "All features are either constant or ignored" in str(e):
                raise IgnoreEntirelyError(str(e))
            raise

        if config.debug_daimodel_level <= 2:
            remove(pickle_path)

        # https://catboost.ai/docs/concepts/python-reference_catboostclassifier.html
        # need to move to wrapper
        if self.model.get_best_iteration() is not None:
            iterations = self.model.get_best_iteration() + 1
        else:
            iterations = self.params['n_estimators']
        # must always set best_iterations
        self.model_path = None
        importances = copy.deepcopy(self.model.feature_importances_)
        if not self._save_by_pickle:
            self.uuid = str(uuid.uuid4())[:6]
            model_file = "catboost_%s.bin" % str(self.uuid)
            self.model_path = os.path.join(self.context.experiment_tmp_dir,
                                           model_file)
            self.model.save_model(self.model_path)
            with open(self.model_path, mode='rb') as f:
                model = f.read()
        else:
            model = self.model
        self.set_model_properties(
            model=model,  # overwrites self.model object with bytes if not using pickle
            features=orig_cols,
            importances=importances,
            iterations=iterations)
    def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
        X = dt.Frame(X)
        X = self.inf_impute(X)
        self.transcribe(X=X)

        h2o.init(port=config.h2o_recipes_port, log_dir=self.my_log_dir)
        model_path = None

        if isinstance(self, H2ONBModel):
            # NB can only handle weights of 0 / 1
            if sample_weight is not None:
                sample_weight = (sample_weight != 0).astype(int)
            if sample_weight_eval_set is not None and len(sample_weight_eval_set) > 0 \
                    and sample_weight_eval_set[0] is not None:
                sample_weight_eval_set1 = sample_weight_eval_set[0]
                sample_weight_eval_set1[sample_weight_eval_set1 != 0] = 1
                sample_weight_eval_set1 = sample_weight_eval_set1.astype(int)
                sample_weight_eval_set = [sample_weight_eval_set1]

        X_pd = X.to_pandas()

        # force "enum" type when a numeric column has few levels; h2o-3 auto-typing is too greedy and only looks at the very first rows
        np_real_types = [np.int8, np.int16, np.int32, np.int64, np.float16, np.float32, np.float64]
        column_types = {}
        for col in X_pd.columns:
            if X_pd[col].dtype.type in np_real_types:
                column_types[col] = 'real'
        nuniques = {}
        for col in X_pd.columns:
            nuniques[col] = len(pd.unique(X_pd[col]))
            print_debug("NumUniques for col: %s: %d" % (col, nuniques[col]))
            if nuniques[col] <= config.max_int_as_cat_uniques and X_pd[col].dtype.type in np_real_types:
                # override original "real"
                column_types[col] = 'enum'
        # if column_types is partially filled, that is ok to h2o-3
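        # e.g. an int64 column with a handful of distinct values (<= config.max_int_as_cat_uniques)
        # is sent to h2o-3 as 'enum' so it is modeled as categorical rather than numeric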

        train_X = h2o.H2OFrame(X_pd, column_types=column_types)
        self.col_types = train_X.types

        # log uniques and types per column
        nuniques_and_types = {}
        for col, typ in self.col_types.items():
            nuniques_and_types[col] = [typ, nuniques[col]]
            print_debug("NumUniques and types for col: %s : %s" % (col, nuniques_and_types[col]))

        train_y = h2o.H2OFrame(y,
                               column_names=[self.target],
                               column_types=['categorical' if self.num_classes >= 2 else 'numeric'])
        train_frame = train_X.cbind(train_y)
        if sample_weight is not None:
            train_w = h2o.H2OFrame(sample_weight,
                                   column_names=[self.weight],
                                   column_types=['numeric'])
            train_frame = train_frame.cbind(train_w)
        valid_frame = None
        valid_X = None
        valid_y = None
        model = None
        if eval_set is not None:
            valid_X = h2o.H2OFrame(eval_set[0][0].to_pandas(), column_types=self.col_types)
            valid_y = h2o.H2OFrame(eval_set[0][1],
                                   column_names=[self.target],
                                   column_types=['categorical' if self.num_classes >= 2 else 'numeric'])
            valid_frame = valid_X.cbind(valid_y)
            if sample_weight is not None:
                if sample_weight_eval_set is None:
                    sample_weight_eval_set = [np.ones(len(eval_set[0][1]))]
                valid_w = h2o.H2OFrame(sample_weight_eval_set[0],
                                       column_names=[self.weight],
                                       column_types=['numeric'])
                valid_frame = valid_frame.cbind(valid_w)

        try:
            train_kwargs = dict()
            params = copy.deepcopy(self.params)
            if not isinstance(self, H2OAutoMLModel):
                # AutoML needs max_runtime_secs in initializer, all others in train() method
                max_runtime_secs = params.pop('max_runtime_secs', 0)
                train_kwargs = dict(max_runtime_secs=max_runtime_secs)
            if valid_frame is not None:
                train_kwargs['validation_frame'] = valid_frame
            if sample_weight is not None:
                train_kwargs['weights_column'] = self.weight

            # Don't ever use the offset column as a feature
            offset_col = None  # if no column is called offset we will pass "None" and not use this feature
            cols_to_train = []  # list of all non-offset columns

            for col in list(train_X.names):
                if not col.lower() == "offset":
                    cols_to_train.append(col)
                else:
                    offset_col = col

            orig_cols = cols_to_train  # not training on offset

            if self.doing_p_values():
                # https://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-science/algo-params/compute_p_values.html
                # take a look at the coefficients_table to see the p_values
                params['remove_collinear_columns'] = True
                params['compute_p_values'] = True
                # h2o-3 only supports p-values if lambda=0
                params['lambda_'] = 0
                if self.num_classes == 2:
                    params['family'] = 'binomial'
                params['solver'] = 'IRLSM'
                params.pop('beta_constraints', None)

            trials = 2
            for trial in range(0, trials):
                try:
                    # Models that can use an offset column
                    loggerinfo(self.get_logger(**kwargs), "%s (%s) fit parameters: %s" % (
                        self.display_name, self.__class__.__module__, dict(params)))
                    model = self.make_instance(**params)
                    if isinstance(model, (H2OGBMModel, H2ODLModel, H2OGLMModel)):
                        model.train(x=cols_to_train, y=self.target, training_frame=train_frame,
                                    offset_column=offset_col,
                                    **train_kwargs)
                    else:
                        model.train(x=train_X.names, y=self.target, training_frame=train_frame, **train_kwargs)
                    break
                except Exception as e:
                    print(str(e))
                    t, v, tb = sys.exc_info()
                    ex = ''.join(traceback.format_exception(t, v, tb))
                    if 'Training data must have at least 2 features' in str(ex) and X.ncols != 0:
                        # if had non-zero features but h2o-3 saw as constant, ignore h2o-3 in that case
                        raise IgnoreEntirelyError
                    elif "min_rows: The dataset size is too small to split for min_rows" in str(e) and trial == 0:
                        # then h2o-3 counted as rows some reduced set, since we already protect against actual rows vs. min_rows
                        params['min_rows'] = 1  # go down to lowest value
                        # permit another trial
                    elif "min_rows: The dataset size is too small to split for min_rows" in str(e) and trial == 1:
                        raise IgnoreEntirelyError
                    elif " java.lang.AssertionError" in str(ex):
                        # bug in h2o-3, nothing can be done
                        raise IgnoreEntirelyError
                    elif "NotStrictlyPositiveException" in str(ex):
                        # bad input data for given hyperparameters
                        raise IgnoreEntirelyError
                    else:
                        raise
                    if trial == trials - 1:
                        # if at end of trials, raise no matter what
                        raise

            if self._show_performance:
                # retrieve the model performance
                perf_train = model.model_performance(train_frame)
                loggerinfo(self.get_logger(**kwargs), self.perf_to_list(perf_train, which="training"))
                if valid_frame is not None:
                    perf_valid = model.model_performance(valid_frame)
                    loggerinfo(self.get_logger(**kwargs), self.perf_to_list(perf_valid, which="validation"))

            struuid = str(uuid.uuid4())

            if self._show_coefficients:
                coeff_table = model._model_json['output']['coefficients_table']
                # convert table to a pandas dataframe
                coeff_table = coeff_table.as_data_frame()
                is_final = 'IS_FINAL' in kwargs
                json_file = os.path.join(exp_dir(), 'coefficients_table_is_final_%s_%s.json' % (is_final, struuid))
                with open(json_file, "wt") as f:
                    pd.set_option('precision', 16)
                    f.write(json.dumps(json.loads(coeff_table.to_json()), indent=4))
                    pd.set_option('precision', 6)

            if isinstance(model, H2OAutoML):
                model = model.leader
            self.id = model.model_id
            model_path = os.path.join(exp_dir(), "h2o_model." + struuid)
            model_path = h2o.save_model(model=model, path=model_path)
            with open(model_path, "rb") as f:
                raw_model_bytes = f.read()

        finally:
            if model_path is not None:
                remove(model_path)
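            # h2o.remove only frees the server-side objects; the local model's
            # _model_json remains available for the varimp handling below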
            for xx in [train_frame, train_X, train_y, model, valid_frame, valid_X, valid_y]:
                if xx is not None:
                    if isinstance(xx, H2OAutoML):
                        h2o.remove(xx.project_name)
                    else:
                        h2o.remove(xx)

        df_varimp = model.varimp(True)
        if df_varimp is None:
            varimp = np.ones(len(orig_cols))
        else:
            _, _, df_varimp = self.get_df_varimp(model, orig_cols)

            missing_features_set = set([x for x in orig_cols if x not in list(df_varimp.index)])
            # must not keep "missing features", even as zero, since h2o-3 won't have them in pred_contribs output
            orig_cols = [x for x in orig_cols if x not in missing_features_set]
            self.col_types = {k: v for k, v in self.col_types.items() if k not in missing_features_set}
            varimp = df_varimp[orig_cols].values  # order by (and select) fitted features
            varimp = np.nan_to_num(varimp)

        self.set_model_properties(model=raw_model_bytes,
                                  features=orig_cols,
                                  importances=varimp,
                                  iterations=self.get_iterations(model))