Example 1
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):
        X = dt.Frame(X)

        orig_cols = list(X.names)

        if self.num_classes >= 2:
            mod = linsvc(random_state=self.random_state,
                         C=self.params["C"],
                         penalty=self.params["penalty"],
                         loss=self.params["loss"],
                         dual=self.params["dual"])
            kf = StratifiedKFold(n_splits=3,
                                 shuffle=True,
                                 random_state=self.random_state)
            model = CalibratedClassifierCV(base_estimator=mod,
                                           method='isotonic',
                                           cv=kf)
            lb = LabelEncoder()
            lb.fit(self.labels)
            y = lb.transform(y)
        else:
            model = LinearSVR(epsilon=self.params["epsilon"],
                              C=self.params["C"],
                              loss=self.params["loss"],
                              dual=self.params["dual"],
                              random_state=self.random_state)
        self.means = dict()
        self.standard_scaler = StandardScaler()
        X = self.basic_impute(X)
        X = X.to_numpy()
        X = self.standard_scaler.fit_transform(X)
        try:
            model.fit(X, y, sample_weight=sample_weight)
        except Exception as e:
            if 'cross-validation but provided less than' in str(e):
                raise IgnoreEntirelyError(str(e))
            raise
        importances = np.zeros(len(orig_cols))
        if self.num_classes >= 2:
            for classifier in model.calibrated_classifiers_:
                # each CV fold keeps its own fitted LinearSVC; coef_ holds its weights
                importances += np.abs(classifier.base_estimator.coef_[0])
        else:
            # LinearSVR.coef_ is already 1-D: one weight per feature
            importances += np.abs(model.coef_)

        self.set_model_properties(
            model=model,
            features=orig_cols,
            importances=importances.tolist(),
            iterations=0)
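
A minimal standalone sketch of the importance aggregation above, outside the recipe class. It assumes an older scikit-learn where CalibratedClassifierCV still accepts base_estimator (newer releases renamed it to estimator): each CV fold keeps its own fitted LinearSVC, and summing the absolute coefficients across folds gives a rough per-feature importance.

import numpy as np
from sklearn.calibration import CalibratedClassifierCV
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

X, y = make_classification(n_samples=300, n_features=5, random_state=42)
X = StandardScaler().fit_transform(X)  # same scaling step as the recipe

kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
model = CalibratedClassifierCV(base_estimator=LinearSVC(random_state=42),
                               method='isotonic', cv=kf)
model.fit(X, y)

# one fitted LinearSVC per fold; average |coef_| as a feature-importance proxy
importances = np.zeros(X.shape[1])
for classifier in model.calibrated_classifiers_:
    importances += np.abs(classifier.base_estimator.coef_[0])
print(importances / len(model.calibrated_classifiers_))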
Example 2
    def transform(self, X: dt.Frame, y: np.array = None):
        if ngpus_vis == 0:
            raise IgnoreEntirelyError("Transformer cannot run without GPUs")

        import cudf
        import cuml
        cuml.common.memory_utils.set_global_output_type('numpy')
        X = X.to_pandas().fillna(0)
        X = cudf.DataFrame(X)
        return self.model.predict(X)
Example 3
    def fit_transform(self, X: dt.Frame, y: np.array = None):
        if ngpus_vis == 0:
            raise IgnoreEntirelyError("Transformer cannot run without GPUs")

        import cudf
        import cuml
        cuml.common.memory_utils.set_global_output_type('numpy')
        self.n_clusters = min(self.n_clusters, X.nrows)
        self.model = cuml.cluster.KMeans(n_clusters=self.n_clusters,
                                         max_iter=self.max_iters,
                                         tol=self.tol)
        X = X.to_pandas().fillna(0)
        X = cudf.DataFrame(X)
        return self.model.fit_predict(X)
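
A condensed sketch of the two transformer methods above, assuming a RAPIDS installation with a visible GPU and a cuml version where cuml.common.memory_utils.set_global_output_type exists (the same call the recipe makes): fit_transform trains the KMeans model via fit_predict, and transform later reuses the fitted model via predict.

import cudf
import cuml

# return labels as numpy arrays rather than GPU-resident objects
cuml.common.memory_utils.set_global_output_type('numpy')

df = cudf.DataFrame({'a': [0.0, 0.1, 5.0, 5.1],
                     'b': [0.0, 0.2, 5.0, 5.2]})
model = cuml.cluster.KMeans(n_clusters=2, max_iter=300, tol=1e-4)
print(model.fit_predict(df))  # fit_transform path: fits, then labels
print(model.predict(df))      # transform path: reuses the fitted model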
Example 4
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):
        orig_cols = list(X.names)

        self.params_override()
        params = self.params.copy()
        if params.get('model_type', 'lda') == 'lda':
            model_class = LinearDiscriminantAnalysis
            params.pop('reg_param', None)
        else:
            model_class = QuadraticDiscriminantAnalysis
            params.pop('solver', None)
        params.pop('model_type', None)

        if self.num_classes >= 2:
            lb = LabelEncoder()
            lb.fit(self.labels)
            y = lb.transform(y)
        model = model_class(**params)

        X = self.basic_impute(X)
        X = X.to_numpy()

        try:
            model.fit(X, y)
        except np.linalg.LinAlgError as e:
            # nothing can be done, just revert to constant predictions
            raise IgnoreEntirelyError(str(e))

        importances = np.ones(len(orig_cols), dtype=int)
        self.set_model_properties(model=model,
                                  features=orig_cols,
                                  importances=importances.tolist(),
                                  iterations=1)
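
A condensed sketch of the parameter-dispatch pattern above, with a made-up params dict: the model_type key picks the estimator class, and parameters that only apply to the other estimator are popped before construction.

from sklearn.discriminant_analysis import (LinearDiscriminantAnalysis,
                                           QuadraticDiscriminantAnalysis)

params = {'model_type': 'qda', 'solver': 'svd', 'reg_param': 0.1}
if params.get('model_type', 'lda') == 'lda':
    model_class = LinearDiscriminantAnalysis
    params.pop('reg_param', None)  # QDA-only parameter
else:
    model_class = QuadraticDiscriminantAnalysis
    params.pop('solver', None)     # LDA-only parameter
params.pop('model_type', None)     # dispatch key, not an estimator kwarg

model = model_class(**params)      # QuadraticDiscriminantAnalysis(reg_param=0.1)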
Example 5
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):
        y_ = y.copy()
        orig_cols = list(X.names)
        text_names = X[:, [str]].names

        self.loaded = False

        self.load_path = get_value(config, self.load_key)
        self.save_path = get_value(config, self.save_key)

        if self.load_path:
            data = joblib.load(self.load_path)
            self.tfidf_objs = data["tf_idf_obj"]
            self.tf_idf_data = data["tf_idf_data"]
            self.prev_params = data["params"]
            self.target = data["target"]
            self.loaded = True

        if not self.loaded:
            if self.num_classes >= 2:
                lb = LabelEncoder()
                lb.fit(self.labels)
                y = lb.transform(y)

            self.tfidf_objs = {}
            self.tf_idf_data = {}
            new_X = None
            for col in text_names:
                XX = X[:, col].to_pandas()
                XX = XX[col].astype(str).fillna("NA").values.tolist()
                tfidf_vec = TfidfVectorizer(**self.return_tfidf_params())
                try:
                    tfidf_vec = self._fit_vectorizer(tfidf_vec, XX)
                except ValueError as e:
                    if 'vocab' in str(e):
                        # skip non-text-like column
                        continue
                    else:
                        raise
                XX = tfidf_vec.transform(XX)
                tfidf_vec.N_ = XX.shape[0]
                self.tfidf_objs[col] = tfidf_vec
                self.tf_idf_data[col] = XX
                if new_X is None:
                    new_X = XX
                else:
                    new_X = sp.sparse.hstack([new_X, XX])
        else:
            y_ = np.hstack([self.target, y_])
            y = y_.copy()
            if self.num_classes >= 2:
                lb = LabelEncoder()
                lb.fit(self.labels)
                y = lb.transform(y)

            new_X = None
            for col in text_names:
                XX = X[:, col].to_pandas()
                XX = XX[col].astype(str).fillna("NA").values.tolist()
                N_ = len(XX)
                tfidf_vec = TfidfVectorizer()
                tfidf_vec.set_params(**self.tfidf_objs[col].get_params())
                try:
                    tfidf_vec.fit(XX)
                    new_data_avail = True
                except ValueError as e:
                    if 'vocab' in str(e):
                        # skip non-text-like column
                        continue
                    new_data_avail = False
                if new_data_avail:
                    tfidf_vec.N_ = N_
                    pre_trained = self.tfidf_objs[col]
                    pre_trained = self.sync_vectorizers(pre_trained, tfidf_vec)
                else:
                    pre_trained = self.tfidf_objs[col]

                XX = pre_trained.transform(XX)
                self.tfidf_objs[col] = pre_trained

                XX = self.sync_tfidf(self.tf_idf_data[col], XX)
                self.tf_idf_data[col] = XX
                if new_X is None:
                    new_X = XX
                else:
                    new_X = sp.sparse.hstack([new_X, XX])

        models = [LogisticRegression(**self.return_lin_params())]
        if self.params["add_rf"]:
            from h2oaicore.lightgbm_dynamic import import_lightgbm
            lgbm = import_lightgbm()  # makes the dynamic lightgbm build importable before the plain import below
            import lightgbm as lgbm
            models.append(lgbm.LGBMClassifier(
                boosting_type='rf',
                colsample_bytree=.5,
                subsample=.632,  # Standard RF bagging fraction
                min_child_weight=2.5,
                min_child_samples=5,
                subsample_freq=1,
                min_split_gain=0,
                n_jobs=-1,
                **self.return_rf_params()
            ))

        for mi, m in enumerate(models):
            try:
                m.fit(new_X, y)
            except ValueError as e:
                # general mutation as specified is not allowed, see logistic_regression recipe.
                # Could use restricted choices there, but for simplicity just ignore the error entirely
                if mi == 0:
                    raise IgnoreEntirelyError(str(e))
                raise

        importances = [1] * len(orig_cols)
        self.set_model_properties(
            model={
                "model": models,
                "tf-idfs": self.tfidf_objs
            },
            features=orig_cols,
            importances=importances,
            iterations=0
        )
        if self.save_path:
            joblib.dump({
                "tf_idf_obj": self.tfidf_objs,
                "tf_idf_data": self.tf_idf_data,
                "params": self.params,
                "target": y_,
            },
                self.save_path
            )
        # clear large objects to avoid large data in subprocess pipe
        self.tfidf_objs = None
        self.tf_idf_data = None
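
The incremental branch above relies on custom helpers (sync_vectorizers, sync_tfidf) whose implementations are not shown. Purely as a hypothetical illustration of what vocabulary syncing could look like, one plausible approach merges newly seen terms into the pre-trained vocabulary while keeping existing column indices stable:

# Hypothetical helper, not the recipe's actual sync_vectorizers:
# merge a newly fitted vocabulary into a pre-trained one so that
# previously seen terms keep their column indices.
def merge_vocabularies(old_vocab, new_vocab):
    merged = dict(old_vocab)
    for term in new_vocab:
        if term not in merged:
            merged[term] = len(merged)  # append unseen terms at the end
    return merged

old = {'cat': 0, 'dog': 1}
new = {'dog': 0, 'fish': 1}
print(merge_vocabularies(old, new))  # {'cat': 0, 'dog': 1, 'fish': 2}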
Example 6
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):
        logger = None
        if self._make_logger:
            # Example use of logger, with required import of:
            #  from h2oaicore.systemutils import make_experiment_logger, loggerinfo
            # Can use loggerwarning, loggererror, etc. for different levels
            if self.context and self.context.experiment_id:
                logger = make_experiment_logger(
                    experiment_id=self.context.experiment_id,
                    tmp_dir=self.context.tmp_dir,
                    experiment_tmp_dir=self.context.experiment_tmp_dir)

        if self._show_logger_test:
            loggerinfo(logger, "TestLOGGER: Fit CatBoost")

        if self._show_task_test:
            # Example task sync operations
            if hasattr(self, 'test_count'):
                self.test_count += 1
            else:
                self.test_count = 0

            # The below generates a message in the GUI notifications panel
            if self.test_count == 0 and self.context and self.context.experiment_id:
                warning = "TestWarning: First CatBoost fit for this model instance"
                loggerwarning(logger, warning)
                task = kwargs.get('task')
                if task:
                    task.sync(key=self.context.experiment_id,
                              progress=dict(type='warning', data=warning))
                    task.flush()

            # The below generates a message in the GUI top-middle panel above the progress wheel
            if self.test_count == 0 and self.context and self.context.experiment_id:
                message = "Tuning CatBoost"
                loggerinfo(logger, message)
                task = kwargs.get('task')
                if task:
                    task.sync(key=self.context.experiment_id,
                              progress=dict(type='update', message=message))
                    task.flush()

        from catboost import CatBoostClassifier, CatBoostRegressor, EFstrType

        # label encode target and setup type of problem
        lb = LabelEncoder()
        if self.num_classes >= 2:
            lb.fit(self.labels)
            y = lb.transform(y)
            if eval_set is not None:
                valid_X = eval_set[0][0]
                valid_y = eval_set[0][1]
                valid_y = lb.transform(valid_y)
                eval_set = [(valid_X, valid_y)]
            self.params.update({'objective': 'Logloss'})
        if self.num_classes > 2:
            self.params.update({'objective': 'MultiClass'})

        if isinstance(X, dt.Frame):
            orig_cols = list(X.names)
            numeric_cols = list(X[:, [bool, int, float]].names)
        else:
            orig_cols = list(X.columns)
            numeric_cols = list(X.select_dtypes([np.number]).columns)

        # unlike LightGBM, which needs label-encoded categoricals, CatBoost can take raw strings etc.
        self.params['cat_features'] = [
            i for i, x in enumerate(orig_cols)
            if 'CatOrig:' in x or 'Cat:' in x or x not in numeric_cols
        ]

        if not self.get_uses_gpus(self.params):
            # monotonicity constraints not available on GPU for CatBoost
            # get names of columns in same order
            X_names = list(dt.Frame(X).names)
            X_numeric = self.get_X_ordered_numerics(X)
            X_numeric_names = list(X_numeric.names)
            constraints = self.set_monotone_constraints(X=X_numeric, y=y)
            # non-numeric columns get a 0 (unconstrained) entry
            self.params['monotone_constraints'] = [0] * len(X_names)
            colnumi = 0
            for coli, coln in enumerate(X_names):
                if coln in X_numeric_names:
                    self.params['monotone_constraints'][coli] = constraints[colnumi]
                    colnumi += 1

        if isinstance(X, dt.Frame) and len(self.params['cat_features']) == 0:
            # dt -> catboost internally using buffer leaks, so convert here
            # assume predict is after pipeline collection or in subprocess so needs no protection
            dtype = np.float32 if config.data_precision == "float32" else np.float64
            # work on contiguous numpy copies so the original frames stay intact for predict
            X = np.ascontiguousarray(X.to_numpy(), dtype=dtype)
            if eval_set is not None:
                valid_X = np.ascontiguousarray(eval_set[0][0].to_numpy(), dtype=dtype)
                valid_y = eval_set[0][1]
                eval_set = [(valid_X, valid_y)]

        if eval_set is not None:
            valid_X_shape = eval_set[0][0].shape
        else:
            valid_X_shape = None

        X, eval_set = self.process_cats(X, eval_set, orig_cols)

        # modify self.params_base['gpu_id'] based upon actually-available GPU based upon training and valid shapes
        self.acquire_gpus_function(train_shape=X.shape,
                                   valid_shape=valid_X_shape)

        params = copy.deepcopy(self.params)  # keep separate, since these can later be pulled from lightgbm params
        params = self.transcribe_params(params=params, **kwargs)

        if logger is not None:
            loggerdata(
                logger,
                "CatBoost parameters: params_base : %s params: %s catboost_params: %s"
                % (str(self.params_base), str(self.params), str(params)))

        if self.num_classes == 1:
            self.model = CatBoostRegressor(**params)
        else:
            self.model = CatBoostClassifier(**params)
        # Hit sometimes: Exception: catboost/libs/data_new/quantization.cpp:779: All features are either constant or ignored.
        if self.num_classes == 1:
            # assume not mae, which would use median
            # baseline = [np.mean(y)] * len(y)
            baseline = None
        else:
            baseline = None

        kwargs_fit = dict(baseline=baseline, eval_set=eval_set)
        pickle_path = None
        if config.debug_daimodel_level >= 2:
            self.uuid = str(uuid.uuid4())[:6]
            pickle_path = os.path.join(exp_dir(),
                                       "catboost%s.tmp.pickle" % self.uuid)
            save_obj((self.model, X, y, sample_weight, kwargs_fit),
                     pickle_path)

        # FIT (hasattr guard below is migration safety from before the hyperopt/Optuna fit function was added)
        try:
            if hasattr(self, 'dask_or_hyper_or_normal_fit'):
                self.dask_or_hyper_or_normal_fit(X,
                                                 y,
                                                 sample_weight=sample_weight,
                                                 kwargs=kwargs,
                                                 **kwargs_fit)
            else:
                self.model.fit(X, y, sample_weight=sample_weight, **kwargs_fit)
        except Exception as e:
            if "All features are either constant or ignored" in str(e):
                raise IgnoreEntirelyError(str(e))
            raise

        if pickle_path is not None and config.debug_daimodel_level <= 2:
            remove(pickle_path)

        # https://catboost.ai/docs/concepts/python-reference_catboostclassifier.html
        # need to move to wrapper
        if self.model.get_best_iteration() is not None:
            iterations = self.model.get_best_iteration() + 1
        else:
            iterations = self.params['n_estimators']
        # must always set best_iterations
        self.model_path = None
        importances = copy.deepcopy(self.model.feature_importances_)
        if not self._save_by_pickle:
            self.uuid = str(uuid.uuid4())[:6]
            model_file = "catboost_%s.bin" % str(self.uuid)
            self.model_path = os.path.join(self.context.experiment_tmp_dir,
                                           model_file)
            self.model.save_model(self.model_path)
            with open(self.model_path, mode='rb') as f:
                model = f.read()
        else:
            model = self.model
        self.set_model_properties(
            model=model,  # overwrites self.model object with bytes if not using pickle
            features=orig_cols,
            importances=importances,
            iterations=iterations)
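
A standalone sketch of the monotone-constraint alignment performed in the CPU branch above: per-numeric-column constraints are spread into a full-width list, with non-numeric columns pinned to 0 (unconstrained). Column names here are illustrative.

X_names = ['age', 'city', 'income']  # all columns, original order
X_numeric_names = ['age', 'income']  # numeric subset, same relative order
constraints = [1, -1]                # one entry per numeric column

monotone_constraints = [0] * len(X_names)
colnumi = 0
for coli, coln in enumerate(X_names):
    if coln in X_numeric_names:
        monotone_constraints[coli] = constraints[colnumi]
        colnumi += 1
print(monotone_constraints)  # [1, 0, -1]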