def preprocess_image(self, source_img_path):
    try:
        final_img_path = os.path.join(temporary_files_path, self.uuid, os.path.basename(source_img_path))
    except TypeError:  # source_img_path is sometimes np.float32 (a missing value), not a string
        return None
    delete = False
    if not os.path.exists(final_img_path):
        if not os.path.exists(source_img_path):
            try:
                self.download(source_img_path, final_img_path)
            except requests.RequestException as e:
                print_debug("Error: %s for source_img_path: %s" % (str(e), str(source_img_path)))
                return None
            delete = False  # keep the downloaded file to avoid re-download and race conditions between multiple procs
        else:
            final_img_path = source_img_path
    import h2oaicore.keras as keras
    importlib.reload(keras)
    img = keras.preprocessing.image.load_img(final_img_path, target_size=(224, 224))
    if delete:
        remove(final_img_path)
    x = keras.preprocessing.image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    x = keras.applications.resnet50.preprocess_input(x)
    return x
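# A minimal standalone sketch of the same preprocessing pipeline, assuming plain
# tf.keras in place of the bundled h2oaicore.keras (an assumption; the recipe
# reloads its own packaged Keras). ResNet50 expects 224x224 RGB input, and
# preprocess_input applies the ImageNet channel normalization the network was
# trained with.
import numpy as np
from tensorflow import keras

def load_resnet50_input(path):
    img = keras.preprocessing.image.load_img(path, target_size=(224, 224))
    x = keras.preprocessing.image.img_to_array(img)  # (224, 224, 3) float array
    x = np.expand_dims(x, axis=0)                    # add batch dim: (1, 224, 224, 3)
    return keras.applications.resnet50.preprocess_input(x)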
def transform(self, X: dt.Frame):
    if not os.path.exists(self.model_path):
        with open(self.model_path, 'wb') as f:
            f.write(self.model_bytes)
    import h2oaicore.keras as keras
    importlib.reload(keras)
    self.model = keras.models.load_model(self.model_path)
    # can't remove(self.model_path): the file is used by other procs or later calls
    values = X[:, self.col_name].to_numpy().ravel()
    batch_size = max(1, min(len(values), self.batch_size))
    values_ = np.array_split(values, int(len(values) / batch_size) + 1)
    print_debug(str(values_))
    results = []
    for v in values_:
        images = []
        for x in v:
            # extension check disabled on purpose; note x[-4:] could never match the 5-char ".jpeg"
            if True or x.lower().endswith((".jpg", ".jpeg", ".png")):
                image = self.preprocess_image(x)
                images.append(image)
            else:
                raise NotImplementedError
        # deal with missing images (None in images): find the first good image
        good_imagei = None
        for imagei, image in enumerate(images):
            if image is not None:
                good_imagei = imagei
                break
        if len(images) > 0 and good_imagei is None:
            # an assert here would be too strict for production; just log and skip the chunk
            print_debug("no good images out of %d images" % len(images))
        if good_imagei is not None:
            for imagei, image in enumerate(images):
                if image is None:
                    images[imagei] = images[good_imagei] * 0  # impute all-zeros for missing images
            images = np.vstack(images)
            results.append(self.model.predict(images))
    if len(results) > 0:
        return np.vstack(results)
    else:
        return [0] * X.shape[0]  # fallback when no image could be processed
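# A minimal sketch of the chunked-predict pattern in transform() above, with
# hypothetical preprocess_fn/predict_fn arguments standing in for
# self.preprocess_image and self.model.predict. Unreadable images (None) are
# imputed as all-zero arrays shaped like the first good image in the chunk;
# chunks with no good image at all are skipped.
import numpy as np

def batched_predict(paths, preprocess_fn, predict_fn, batch_size=32):
    batch_size = max(1, min(len(paths), batch_size))
    chunks = np.array_split(np.asarray(paths), int(len(paths) / batch_size) + 1)
    results = []
    for chunk in chunks:
        images = [preprocess_fn(p) for p in chunk]
        good = next((img for img in images if img is not None), None)
        if good is None:
            continue  # nothing usable in this chunk
        images = [img if img is not None else np.zeros_like(good) for img in images]
        results.append(predict_fn(np.vstack(images)))
    return np.vstack(results) if results else None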
# Example 3
    def predict(self, X, y=None, **kwargs):
        model, features, importances, iterations = self.get_model_properties()
        if not self._save_by_pickle:
            from catboost import CatBoostClassifier, CatBoostRegressor, EFstrType
            if self.num_classes >= 2:
                from_file = CatBoostClassifier()
            else:
                from_file = CatBoostRegressor()
            with open(self.model_path, mode='wb') as f:
                f.write(model)
            model = from_file.load_model(self.model_path)

        # FIXME: throttle predict size as in _predict_internal(self, X, **kwargs)
        if isinstance(X, dt.Frame) and len(self.params['cat_features']) == 0:
            # dt -> lightgbm conversion internally leaks buffers, so convert to numpy here
            # assume predict runs after pipeline collection or in a subprocess, so it needs no extra protection
            X = X.to_numpy()  # rebinds the local X only; the caller's frame is not modified
            X = np.ascontiguousarray(X, dtype=np.float32 if config.data_precision == "float32" else np.float64)

        X, eval_set = self.process_cats(X, None, self.feature_names_fitted)

        pred_contribs = kwargs.get('pred_contribs', False)
        output_margin = kwargs.get('output_margin', False)
        fast_approx = kwargs.pop('fast_approx', False)
        if fast_approx:
            iterations = min(config.fast_approx_num_trees, iterations)

        # implicit import
        from catboost import CatBoostClassifier, CatBoostRegressor, EFstrType, Pool
        n_jobs = max(1, physical_cores_count)
        if not pred_contribs and not output_margin:
            if self.num_classes >= 2:
                preds = model.predict_proba(
                    X,
                    ntree_start=0,
                    ntree_end=iterations,  # index of first tree *not* to be used
                    thread_count=self.params_base.get('n_jobs', n_jobs),  # -1 is not supported
                )
                if preds.shape[1] == 2:
                    return preds[:, 1]
                else:
                    return preds
            else:
                return model.predict(
                    X,
                    ntree_start=0,
                    ntree_end=iterations,  # index of first tree *not* to be used
                    thread_count=self.params_base.get('n_jobs', n_jobs),  # -1 is not supported
                )
        elif output_margin:
            # uses "predict" to get raw values for any number of classes
            preds = model.predict(
                X,
                prediction_type="RawFormulaVal",
                ntree_start=0,
                ntree_end=iterations,  # index of first tree *not* to be used
                thread_count=self.params_base.get('n_jobs', n_jobs),  # -1 is not supported
            )
            if len(preds.shape) > 1 and preds.shape[1] == 2 and self.num_classes == 2:
                return preds[:, 1]
            else:
                return preds
        elif pred_contribs:
            # Shapley values don't come from predict
            # For regression/binary, shap has shape (rows, features + bias)
            # For multiclass, shap has shape (rows, classes, features + bias)
            data = Pool(X, label=y, cat_features=self.params['cat_features'])
            if fast_approx:
                # https://github.com/catboost/catboost/issues/1146
                # https://github.com/catboost/catboost/issues/1535
                # can't specify trees, but they have approx version
                # Regular, Exact, or Approximate
                shap_calc_type = "Approximate"
            else:
                shap_calc_type = "Regular"
            # See also shap_mode
            # help(CatBoostClassifier.get_feature_importance)
            print_debug("shap_calc_type: %s" % shap_calc_type)

            pickle_path = None
            if config.debug_daimodel_level >= 2:
                self.uuid = str(uuid.uuid4())[:6]
                pickle_path = os.path.join(exp_dir(), "catboost_shappredict%s.tmp.pickle" % self.uuid)
                model.save_model(os.path.join(exp_dir(), "catshapproblem%s.catboost.model" % self.uuid))
                # save_obj((self, self.model, model, X, y, kwargs, shap_calc_type, self.params['cat_features']), pickle_path)
                save_obj((model, X, y, kwargs, shap_calc_type, self.params['cat_features']), pickle_path)

            preds_shap = model.get_feature_importance(
                data=data,
                thread_count=self.params_base.get('n_jobs', n_jobs),  # -1 is not supported
                type=EFstrType.ShapValues,
                shap_calc_type=shap_calc_type,
            )
            # repair broken shap sum: https://github.com/catboost/catboost/issues/1125
            print_debug("shap_fix")
            preds_raw = model.predict(
                X,
                prediction_type="RawFormulaVal",
                ntree_start=0,
                ntree_end=iterations,  # index of first tree *not* to be used
                thread_count=self.params_base.get('n_jobs', n_jobs),  # -1 is not supported
            )
            if self.num_classes <= 2:
                axis = 1
            else:
                axis = 2
            orig_sum = np.sum(preds_shap, axis=axis)
            print_debug("shap_fix2")
            # avoid division by 0; properly fixing that case would need a different trick, e.g. changing the baseline
            if axis == 1:
                orig_sum[orig_sum == 0.0] = 1.0
                preds_shap = preds_shap * preds_raw[:, None] / orig_sum[:, None]
            else:
                # for multiclass, each class's shap row must sum to that class's raw prediction
                orig_sum[orig_sum == 0.0] = 1.0
                preds_shap = preds_shap * preds_raw[:, :, None] / orig_sum[:, :, None]

            if config.hard_asserts and config.debug_daimodel_level >= 2:
                print_debug("shap_check")
                model.save_model(os.path.join(exp_dir(), "catshapproblem"))
                pickle.dump((X, y, self.params['cat_features']),
                            open(os.path.join(exp_dir(), "catshapproblem.pkl"), "wb"))
                preds_raw = model.predict(
                    X,
                    prediction_type="RawFormulaVal",
                    ntree_start=0,
                    ntree_end=iterations,  # index of first tree *not* to be used
                    thread_count=self.params_base.get('n_jobs', n_jobs),  # -1 is not supported
                )
                assert np.isclose(preds_raw, np.sum(preds_shap, axis=axis)).all(), \
                    "catboost shapley does not sum up correctly"

            if pickle_path is not None and config.debug_daimodel_level <= 2:
                remove(pickle_path)

            if axis == 1:
                return preds_shap
            else:
                # DAI expects shape (rows, classes * (features + 1)), with columns laid out in blocks:
                # feature_0_class_0, feature_1_class_0, ..., feature_0_class_1, feature_1_class_1, ...
                return preds_shap.reshape(preds_shap.shape[0],
                                          preds_shap.shape[1] * preds_shap.shape[2])
        else:
            raise RuntimeError("No such case")
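# A minimal numeric sketch of the shap-sum repair above
# (https://github.com/catboost/catboost/issues/1125): rescale each row of the
# Shapley matrix so that its total (bias column included) matches the raw
# prediction. Toy numbers only; shapes mirror the binary/regression case
# (rows, features + bias).
import numpy as np

preds_shap = np.array([[0.2, 0.3, 0.1],    # (rows, features + bias)
                       [0.5, -0.2, 0.1]])
preds_raw = np.array([0.9, 0.2])           # raw margins from model.predict
orig_sum = np.sum(preds_shap, axis=1)
orig_sum[orig_sum == 0.0] = 1.0            # avoid division by zero
preds_shap = preds_shap * preds_raw[:, None] / orig_sum[:, None]
assert np.allclose(np.sum(preds_shap, axis=1), preds_raw)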
    def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
        X = dt.Frame(X)
        X = self.inf_impute(X)
        self.transcribe(X=X)

        h2o.init(port=config.h2o_recipes_port, log_dir=self.my_log_dir)
        model_path = None

        if isinstance(self, H2ONBModel):
            # NB can only handle weights of 0 / 1
            if sample_weight is not None:
                sample_weight = (sample_weight != 0).astype(int)
            if sample_weight_eval_set is not None and len(sample_weight_eval_set) > 0 and sample_weight_eval_set[0] is not None:
                sample_weight_eval_set1 = sample_weight_eval_set[0]
                sample_weight_eval_set1[sample_weight_eval_set1 != 0] = 1
                sample_weight_eval_set1 = sample_weight_eval_set1.astype(int)
                sample_weight_eval_set = [sample_weight_eval_set1]

        X_pd = X.to_pandas()

        # fix types when a numeric column has few levels: h2o-3 auto-typing is too greedy and only looks at the very first rows
        np_real_types = [np.int8, np.int16, np.int32, np.int64, np.float16, np.float32, np.float64]
        column_types = {}
        for col in X_pd.columns:
            if X_pd[col].dtype.type in np_real_types:
                column_types[col] = 'real'
        nuniques = {}
        for col in X_pd.columns:
            nuniques[col] = len(pd.unique(X_pd[col]))
            print_debug("NumUniques for col: %s: %d" % (col, nuniques[col]))
            if nuniques[col] <= config.max_int_as_cat_uniques and X_pd[col].dtype.type in np_real_types:
                # override original "real"
                column_types[col] = 'enum'
        # if column_types is partially filled, that is ok to h2o-3

        train_X = h2o.H2OFrame(X_pd, column_types=column_types)
        self.col_types = train_X.types

        # log per-column type together with its number of uniques
        nuniques_and_types = {}
        for col, typ in self.col_types.items():
            nuniques_and_types[col] = [typ, nuniques[col]]
            print_debug("NumUniques and types for col: %s : %s" % (col, nuniques_and_types[col]))

        train_y = h2o.H2OFrame(y,
                               column_names=[self.target],
                               column_types=['categorical' if self.num_classes >= 2 else 'numeric'])
        train_frame = train_X.cbind(train_y)
        if sample_weight is not None:
            train_w = h2o.H2OFrame(sample_weight,
                                   column_names=[self.weight],
                                   column_types=['numeric'])
            train_frame = train_frame.cbind(train_w)
        valid_frame = None
        valid_X = None
        valid_y = None
        model = None
        if eval_set is not None:
            valid_X = h2o.H2OFrame(eval_set[0][0].to_pandas(), column_types=self.col_types)
            valid_y = h2o.H2OFrame(eval_set[0][1],
                                   column_names=[self.target],
                                   column_types=['categorical' if self.num_classes >= 2 else 'numeric'])
            valid_frame = valid_X.cbind(valid_y)
            if sample_weight is not None:
                if sample_weight_eval_set is None:
                    sample_weight_eval_set = [np.ones(len(eval_set[0][1]))]
                valid_w = h2o.H2OFrame(sample_weight_eval_set[0],
                                       column_names=[self.weight],
                                       column_types=['numeric'])
                valid_frame = valid_frame.cbind(valid_w)

        try:
            train_kwargs = dict()
            params = copy.deepcopy(self.params)
            if not isinstance(self, H2OAutoMLModel):
                # AutoML needs max_runtime_secs in initializer, all others in train() method
                max_runtime_secs = params.pop('max_runtime_secs', 0)
                train_kwargs = dict(max_runtime_secs=max_runtime_secs)
            if valid_frame is not None:
                train_kwargs['validation_frame'] = valid_frame
            if sample_weight is not None:
                train_kwargs['weights_column'] = self.weight

            # Don't ever use the offset column as a feature
            offset_col = None  # if no column is called offset we will pass "None" and not use this feature
            cols_to_train = []  # list of all non-offset columns

            for col in list(train_X.names):
                if not col.lower() == "offset":
                    cols_to_train.append(col)
                else:
                    offset_col = col

            orig_cols = cols_to_train  # not training on offset

            if self.doing_p_values():
                # https://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-science/algo-params/compute_p_values.html
                # take a look at the coefficients_table to see the p_values
                params['remove_collinear_columns'] = True
                params['compute_p_values'] = True
                # h2o-3 only supports p-values if lambda=0
                params['lambda_'] = 0
                if self.num_classes == 2:
                    params['family'] = 'binomial'
                params['solver'] = 'IRLSM'
                params.pop('beta_constraints', None)

            trials = 2
            for trial in range(0, trials):
                try:
                    loggerinfo(self.get_logger(**kwargs), "%s (%s) fit parameters: %s" % (
                        self.display_name, self.__class__.__module__, dict(params)))
                    model = self.make_instance(**params)
                    # models that can use an offset column get offset_column passed
                    if isinstance(model, (H2OGBMModel, H2ODLModel, H2OGLMModel)):
                        model.train(x=cols_to_train, y=self.target, training_frame=train_frame,
                                    offset_column=offset_col,
                                    **train_kwargs)
                    else:
                        model.train(x=train_X.names, y=self.target, training_frame=train_frame, **train_kwargs)
                    break
                except Exception as e:
                    print(str(e))
                    t, v, tb = sys.exc_info()
                    ex = ''.join(traceback.format_exception(t, v, tb))
                    if 'Training data must have at least 2 features' in str(ex) and X.ncols != 0:
                        # had non-zero features but h2o-3 saw them all as constant; ignore h2o-3 in that case
                        raise IgnoreEntirelyError
                    elif "min_rows: The dataset size is too small to split for min_rows" in str(e) and trial == 0:
                        # then h2o-3 counted as rows some reduced set, since we already protect against actual rows vs. min_rows
                        params['min_rows'] = 1  # go down to lowest value
                        # permit another trial
                    elif "min_rows: The dataset size is too small to split for min_rows" in str(e) and trial == 1:
                        raise IgnoreEntirelyError
                    elif " java.lang.AssertionError" in str(ex):
                        # bug in h2o-3, nothing can be done
                        raise IgnoreEntirelyError
                    elif "NotStrictlyPositiveException" in str(ex):
                        # bad input data for given hyperparameters
                        raise IgnoreEntirelyError
                    else:
                        raise
                    if trial == trials - 1:
                        # if at end of trials, raise no matter what
                        raise

            if self._show_performance:
                # retrieve the model performance
                perf_train = model.model_performance(train_frame)
                loggerinfo(self.get_logger(**kwargs), self.perf_to_list(perf_train, which="training"))
                if valid_frame is not None:
                    perf_valid = model.model_performance(valid_frame)
                    loggerinfo(self.get_logger(**kwargs), self.perf_to_list(perf_valid, which="validation"))

            struuid = str(uuid.uuid4())

            if self._show_coefficients:
                coeff_table = model._model_json['output']['coefficients_table']
                # convert table to a pandas dataframe
                coeff_table = coeff_table.as_data_frame()
                is_final = 'IS_FINAL' in kwargs
                json_file = os.path.join(exp_dir(), 'coefficients_table_is_final_%s_%s.json' % (is_final, struuid))
                with open(json_file, "wt") as f:
                    pd.set_option('precision', 16)
                    f.write(json.dumps(json.loads(coeff_table.to_json()), indent=4))
                    pd.set_option('precision', 6)

            if isinstance(model, H2OAutoML):
                model = model.leader
            self.id = model.model_id
            model_path = os.path.join(exp_dir(), "h2o_model." + struuid)
            model_path = h2o.save_model(model=model, path=model_path)
            with open(model_path, "rb") as f:
                raw_model_bytes = f.read()

        finally:
            if model_path is not None:
                remove(model_path)
            for xx in [train_frame, train_X, train_y, model, valid_frame, valid_X, valid_y]:
                if xx is not None:
                    if isinstance(xx, H2OAutoML):
                        h2o.remove(xx.project_name)
                    else:
                        h2o.remove(xx)

        df_varimp = model.varimp(True)
        if df_varimp is None:
            varimp = np.ones(len(orig_cols))
        else:
            _, _, df_varimp = self.get_df_varimp(model, orig_cols)

            missing_features_set = set([x for x in orig_cols if x not in list(df_varimp.index)])
            # must not keep "missing features", even as zero, since h2o-3 won't have them in pred_contribs output
            orig_cols = [x for x in orig_cols if x not in missing_features_set]
            self.col_types = {k: v for k, v in self.col_types.items() if k not in missing_features_set}
            varimp = df_varimp[orig_cols].values  # order by (and select) fitted features
            varimp = np.nan_to_num(varimp)

        self.set_model_properties(model=raw_model_bytes,
                                  features=orig_cols,
                                  importances=varimp,
                                  iterations=self.get_iterations(model))
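# A minimal sketch of the column-typing heuristic used in fit() above: numeric
# columns default to 'real', but low-cardinality numeric columns are overridden
# to 'enum' so h2o-3 treats them as categoricals. The threshold argument stands
# in for config.max_int_as_cat_uniques; pandas alone is enough to see the decision.
import numpy as np
import pandas as pd

def infer_column_types(df, max_int_as_cat_uniques=50):
    np_real_types = [np.int8, np.int16, np.int32, np.int64, np.float16, np.float32, np.float64]
    column_types = {}
    for col in df.columns:
        if df[col].dtype.type in np_real_types:
            column_types[col] = 'real'
            if len(pd.unique(df[col])) <= max_int_as_cat_uniques:
                column_types[col] = 'enum'  # few levels: treat as categorical
    return column_types

df = pd.DataFrame({'age': np.linspace(18, 80, 100),  # 100 uniques -> stays 'real'
                   'flag': np.tile([0, 1], 50)})     # 2 uniques   -> 'enum'
print(infer_column_types(df))  # {'age': 'real', 'flag': 'enum'}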
# Example 5
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):
        X = dt.Frame(X)
        X = self.inf_impute(X)
        self.transcribe(X=X)

        h2o.init(port=config.h2o_recipes_port, log_dir=self.my_log_dir)
        model_path = None

        if isinstance(self, H2ONBModel):
            # NB can only handle weights of 0 / 1
            if sample_weight is not None:
                sample_weight = (sample_weight != 0).astype(int)
            if sample_weight_eval_set is not None and len(sample_weight_eval_set) > 0 and sample_weight_eval_set[0] is not None:
                sample_weight_eval_set1 = sample_weight_eval_set[0]
                sample_weight_eval_set1[sample_weight_eval_set1 != 0] = 1
                sample_weight_eval_set1 = sample_weight_eval_set1.astype(int)
                sample_weight_eval_set = [sample_weight_eval_set1]

        X_pd = X.to_pandas()

        # fix types when a numeric column has few levels: h2o-3 auto-typing is too greedy and only looks at the very first rows
        np_real_types = [np.int8, np.int16, np.int32, np.int64, np.float16, np.float32, np.float64]
        column_types = {}
        for col in X_pd.columns:
            if X_pd[col].dtype.type in np_real_types:
                column_types[col] = 'real'
        nuniques = {}
        for col in X_pd.columns:
            nuniques[col] = len(pd.unique(X_pd[col]))
            print_debug("NumUniques for col: %s: %d" % (col, nuniques[col]))
            if nuniques[col] <= config.max_int_as_cat_uniques and X_pd[col].dtype.type in np_real_types:
                # override original "real"
                column_types[col] = 'enum'
        # if column_types is partially filled, that is ok to h2o-3

        train_X = h2o.H2OFrame(X_pd, column_types=column_types)
        self.col_types = train_X.types

        # log per-column type together with its number of uniques
        nuniques_and_types = {}
        for col, typ in self.col_types.items():
            nuniques_and_types[col] = [typ, nuniques[col]]
            print_debug("NumUniques and types for col: %s : %s" % (col, nuniques_and_types[col]))

        train_y = h2o.H2OFrame(y,
                               column_names=[self.target],
                               column_types=['categorical' if self.num_classes >= 2 else 'numeric'])
        train_frame = train_X.cbind(train_y)
        if sample_weight is not None:
            train_w = h2o.H2OFrame(sample_weight,
                                   column_names=[self.weight],
                                   column_types=['numeric'])
            train_frame = train_frame.cbind(train_w)
        valid_frame = None
        valid_X = None
        valid_y = None
        model = None
        if eval_set is not None:
            valid_X = h2o.H2OFrame(eval_set[0][0].to_pandas(), column_types=self.col_types)
            valid_y = h2o.H2OFrame(eval_set[0][1],
                                   column_names=[self.target],
                                   column_types=['categorical' if self.num_classes >= 2 else 'numeric'])
            valid_frame = valid_X.cbind(valid_y)
            if sample_weight is not None:
                if sample_weight_eval_set is None:
                    sample_weight_eval_set = [np.ones(len(eval_set[0][1]))]
                valid_w = h2o.H2OFrame(sample_weight_eval_set[0],
                                       column_names=[self.weight],
                                       column_types=['numeric'])
                valid_frame = valid_frame.cbind(valid_w)

        try:
            train_kwargs = dict()
            params = copy.deepcopy(self.params)
            if not isinstance(self, H2OAutoMLModel):
                # AutoML needs max_runtime_secs in initializer, all others in train() method
                max_runtime_secs = params.pop('max_runtime_secs', 0)
                train_kwargs = dict(max_runtime_secs=max_runtime_secs)
            if valid_frame is not None:
                train_kwargs['validation_frame'] = valid_frame
            if sample_weight is not None:
                train_kwargs['weights_column'] = self.weight

            # Don't ever use the offset column as a feature
            offset_col = None  # if no column is called offset we will pass "None" and not use this feature
            cols_to_train = []  # list of all non-offset columns

            for col in list(train_X.names):
                if not col.lower() == "offset":
                    cols_to_train.append(col)
                else:
                    offset_col = col

            orig_cols = cols_to_train  # not training on offset

            trials = 2
            for trial in range(0, trials):
                try:
                    model = self.make_instance(**params)
                    # models that can use an offset column get offset_column passed
                    if isinstance(model, (H2OGBMModel, H2ODLModel, H2OGLMModel)):
                        model.train(x=cols_to_train,
                                    y=self.target,
                                    training_frame=train_frame,
                                    offset_column=offset_col,
                                    **train_kwargs)
                    else:
                        model.train(x=train_X.names,
                                    y=self.target,
                                    training_frame=train_frame,
                                    **train_kwargs)
                    break
                except Exception as e:
                    print(str(e))
                    t, v, tb = sys.exc_info()
                    ex = ''.join(traceback.format_exception(t, v, tb))
                    if 'Training data must have at least 2 features' in str(ex) and X.ncols != 0:
                        # had non-zero features but h2o-3 saw them all as constant; ignore h2o-3 in that case
                        raise IgnoreEntirelyError
                    elif "min_rows: The dataset size is too small to split for min_rows" in str(e):
                        # then h2o-3 counted as rows some reduced set, since we already protect against actual rows vs. min_rows
                        params['min_rows'] = 1  # go down to lowest value
                        # permit another trial
                    else:
                        raise
                    if trial == trials - 1:
                        # if at end of trials, raise no matter what
                        raise

            if isinstance(model, H2OAutoML):
                model = model.leader
            self.id = model.model_id
            model_path = os.path.join(user_dir(), "h2o_model." + str(uuid.uuid4()))
            model_path = h2o.save_model(model=model, path=model_path)
            with open(model_path, "rb") as f:
                raw_model_bytes = f.read()

        finally:
            if model_path is not None:
                remove(model_path)
            for xx in [train_frame, train_X, train_y, model, valid_frame, valid_X, valid_y]:
                if xx is not None:
                    if isinstance(xx, H2OAutoML):
                        h2o.remove(xx.project_name)
                    else:
                        h2o.remove(xx)

        df_varimp = model.varimp(True)
        if df_varimp is None:
            varimp = np.ones(len(orig_cols))
        else:
            df_varimp.index = df_varimp['variable']
            df_varimp = df_varimp.iloc[:, 1]  # relative importance
            for missing in [x for x in orig_cols if x not in list(df_varimp.index)]:
                # h2o3 doesn't handle raw strings all the time, can hit:
                # KeyError: "None of [Index(['0_Str:secret_ChangeTemp'], dtype='object', name='variable')] are in the [index]"
                df_varimp[missing] = 0
            varimp = df_varimp[orig_cols].values  # order by fitted features
            varimp = np.nan_to_num(varimp)

        self.set_model_properties(model=raw_model_bytes,
                                  features=orig_cols,
                                  importances=varimp,
                                  iterations=self.get_iterations(model))
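# A minimal sketch of the variable-importance alignment at the end of fit():
# index the h2o-3 varimp table by variable name, backfill any feature h2o-3
# dropped (importance 0), and reorder to the original column order. The toy
# frame stands in for model.varimp(True).
import numpy as np
import pandas as pd

orig_cols = ['a', 'b', 'c']
df_varimp = pd.DataFrame({'variable': ['b', 'a'],
                          'relative_importance': [10.0, 4.0]})
df_varimp.index = df_varimp['variable']
df_varimp = df_varimp['relative_importance']  # keep the relative importance column
for missing in [x for x in orig_cols if x not in list(df_varimp.index)]:
    df_varimp[missing] = 0  # h2o-3 omitted this column entirely (e.g. raw strings)
varimp = np.nan_to_num(df_varimp[orig_cols].values)  # -> array([ 4., 10., 0.])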