class myNGBoostClassifier:
    """Small convenience wrapper around ``NGBClassifier``.

    The wrapper accepts pandas objects as well as array-likes for training and
    validation data and forwards everything to the wrapped model stored in
    ``self.model`` (created by :meth:`make`).
    """

    @staticmethod
    def _as_array(data):
        """Return ``data.values`` for pandas objects; pass anything else through."""
        if isinstance(data, (pd.DataFrame, pd.Series)):
            return data.values
        return data

    def make(self, params):
        """Create the wrapped ``NGBClassifier`` from ``params`` and return ``self``."""
        self.model = NGBClassifier(**params)
        return self

    def fit(self, xtrain, ytrain, xtest=None, ytest=None, fit_params=None):
        """Fit the wrapped model.

        Args:
            xtrain: Training features (pandas object or array-like).
            ytrain: Training labels (pandas object or array-like).
            xtest: Optional validation features.
            ytest: Optional validation labels.
            fit_params: Optional mapping of extra keyword arguments forwarded
                to the model's ``fit``.

        Validation data is passed on (as ``X_val`` / ``Y_val``) only when both
        ``xtest`` and ``ytest`` are given.
        """
        # A fresh dict per call: a ``{}`` default would be shared between calls.
        fit_params = {} if fit_params is None else dict(fit_params)

        # Convert every input independently so that mixed inputs (e.g. a
        # DataFrame of features with an ndarray of labels) are handled.
        xtrain = self._as_array(xtrain)
        ytrain = self._as_array(ytrain)

        if xtest is None or ytest is None:
            self.model.fit(xtrain, ytrain, **fit_params)
        else:
            self.model.fit(
                xtrain,
                ytrain,
                X_val=self._as_array(xtest),
                Y_val=self._as_array(ytest),
                **fit_params,
            )

    def predict(self, xs):
        """Return the wrapped model's predictions for ``xs``."""
        return self.model.predict(xs)

    def predict_proba(self, xs):
        """Return the wrapped model's class probabilities for ``xs``.

        A one-dimensional ``xs`` is treated as a single sample and reshaped to
        one row before being handed to the model.
        """
        if len(xs.shape) == 1:
            xs = xs.reshape(1, -1)
        return self.model.predict_proba(xs)
class NGBoost(BaseEstimator, ClassifierMixin):
    """Estimator wrapper that trains an ``NGBClassifier`` on numpy-converted data.

    Keyword arguments given to the constructor are stored and forwarded
    unchanged to ``NGBClassifier`` when :meth:`fit` is called.
    """

    def __init__(self, **params):
        logger.info('Initializing NGBoost...')
        self.params_ = params
        # Class labels are fixed to {0, 1} here.
        self.classes_ = np.array([0, 1])

    def get_params(self, deep=True):
        """Return the keyword arguments the wrapper was constructed with."""
        return self.params_

    def _to_numpy(self, X):
        """Convert ``X`` to a numpy array.

        Args:
            X: pandas DataFrame, pandas Series or numpy ndarray.

        Returns:
            The data as a numpy ndarray (ndarrays are returned unchanged).

        Raises:
            ValueError: If ``X`` has an unsupported type or the pandas
                conversion fails.
        """
        if isinstance(X, (pd.DataFrame, pd.Series)):
            try:
                return X.to_numpy()
            except Exception as err:
                # Previously the error object was created but never raised,
                # so the method silently returned None.
                raise ValueError('There is error when converting to numpy') from err
        if isinstance(X, np.ndarray):
            return X
        raise ValueError('X must be pandas DataFrame, Series or numpy ndarray')

    def fit(self, X, y, *args, **kwargs):
        """Fit a fresh ``NGBClassifier`` on ``X`` / ``y`` and return ``self``.

        Labels are cast to ``int`` before fitting. Extra positional and
        keyword arguments are accepted but not used.
        """
        logger.info('NGBoost, fit')
        logger.info(f'NGBoost, training data shape {X.shape}')
        logger.info(f'NGBoost, training label shape {y.shape}')
        X_np = self._to_numpy(X)
        y_np = self._to_numpy(y).astype(int)
        # Report the labels actually seen (the old debug print emitted the
        # literal text instead of the values).
        logger.info(f'NGBoost, unique labels {np.unique(y_np)}')
        self.estimator_ = NGBClassifier(**self.params_)
        self.estimator_.fit(X_np, y_np)
        logger.info('NGBoost, done fit')
        return self

    def transform(self, X, *args, **kwargs):
        """Return column 1 of the fitted model's ``predict_proba`` output, flattened."""
        logger.info('NGBoost, transform')
        logger.info(f'NGBoost, transform, testing shape: {X.shape}')
        X_np = self._to_numpy(X)
        pred = self.estimator_.predict_proba(X_np)[:, 1].reshape(-1)
        logger.info(f'NGBoost, transform, predictions shape: {pred.shape}')
        logger.info('NGBoost, done transform')
        return pred

    def score(self, X, y, *args, **kwargs):
        """Return ``roc_auc_score`` of ``y`` against :meth:`transform` of ``X``."""
        return roc_auc_score(y, self.transform(X))

    def predict_proba(self, X, *args, **kwargs):
        """Return the fitted model's ``predict_proba`` output for ``X``."""
        logger.info('NGBoost, predict_proba')
        logger.info(f'NGBoost, predict_proba, testing shape: {X.shape}')
        X_np = self._to_numpy(X)
        pred = self.estimator_.predict_proba(X_np)
        logger.info('NGBoost, predict_proba, done')
        return pred
def test_classification():
    """Check NGBClassifier quality on the breast-cancer dataset.

    Fits a Bernoulli ``NGBClassifier`` on an 80/20 split (fixed seed) and
    asserts thresholds on ROC AUC, log loss and the model's own ``score``,
    and that ``pred_dist`` returns a ``Bernoulli`` instance.
    """
    from sklearn.datasets import load_breast_cancer
    from sklearn.metrics import roc_auc_score, log_loss

    # ``return_X_y`` is keyword-only in current scikit-learn; passing it
    # positionally raises a TypeError there.
    data, target = load_breast_cancer(return_X_y=True)
    x_train, x_test, y_train, y_test = train_test_split(
        data, target, test_size=0.2, random_state=42
    )

    ngb = NGBClassifier(Dist=Bernoulli, verbose=False)
    ngb.fit(x_train, y_train)

    # ROC AUC computed from the hard class predictions.
    preds = ngb.predict(x_test)
    score = roc_auc_score(y_test, preds)
    assert score >= 0.95

    preds = ngb.predict_proba(x_test)
    score = log_loss(y_test, preds)
    assert score <= 0.20

    # The model's own score is held to the same bound as the log loss.
    score = ngb.score(x_test, y_test)
    assert score <= 0.20

    dist = ngb.pred_dist(x_test)
    assert isinstance(dist, Bernoulli)

    # ROC AUC computed from column 1 of the predicted probabilities.
    score = roc_auc_score(y_test, preds[:, 1])
    assert score >= 0.95
def test_classification(breast_cancer_data):
    """Check NGBClassifier quality using the ``breast_cancer_data`` fixture.

    The fixture is unpacked as (x_train, x_test, y_train, y_test). The test
    asserts thresholds on ROC AUC, log loss and the model's own ``score``,
    and that ``pred_dist`` returns a ``Bernoulli`` instance.
    """
    from sklearn.metrics import roc_auc_score, log_loss

    train_x, test_x, train_y, test_y = breast_cancer_data

    classifier = NGBClassifier(Dist=Bernoulli, verbose=False)
    classifier.fit(train_x, train_y)

    # loose score requirement so it isn't failing all the time
    hard_labels = classifier.predict(test_x)
    assert roc_auc_score(test_y, hard_labels) >= 0.85

    probabilities = classifier.predict_proba(test_x)
    assert log_loss(test_y, probabilities) <= 0.30

    assert classifier.score(test_x, test_y) <= 0.30

    assert isinstance(classifier.pred_dist(test_x), Bernoulli)

    # Same AUC bound, this time on column 1 of the predicted probabilities.
    assert roc_auc_score(test_y, probabilities[:, 1]) >= 0.85
class ModelNgbClassifier(Model):
    """``Model`` implementation backed by ``NGBClassifier``."""

    def train(self, tr_x, tr_y, va_x=None, va_y=None, te_x=None):
        """Fit an ``NGBClassifier`` on the training data.

        Args:
            tr_x: Training features; its ``.values`` is passed to the model.
            tr_y: Training labels; cast to ``int`` before fitting.
            va_x: Optional validation features.
            va_y: Optional validation labels.
            te_x: Accepted but not used by this method.

        ``early_stopping_rounds`` is taken out of ``self.params`` (if present)
        and is only forwarded when validation data is supplied.
        """
        # Set up the hyperparameters (copy so self.params is left untouched).
        params = dict(self.params)
        early_stopping_rounds = params.pop('early_stopping_rounds', None)

        self.model = NGBClassifier(**params)

        if va_x is not None and va_y is not None:
            self.model.fit(
                tr_x.values,
                tr_y.astype(int).values,
                va_x.values,
                va_y.astype(int).values,
                early_stopping_rounds=early_stopping_rounds,
            )
        else:
            # The signature allows omitting validation data; fit on the
            # training set alone instead of dereferencing None.
            self.model.fit(tr_x.values, tr_y.astype(int).values)

    def predict(self, te_x):
        """Return column 1 of the model's ``predict_proba`` output for ``te_x``."""
        return self.model.predict_proba(te_x.values)[:, 1]

    def _model_path(self):
        """Return the file path used to persist this run/fold's model."""
        return os.path.join('../output/model', f'{self.run_fold_name}.model')

    def save_model(self):
        """Write the fitted model to disk, creating the directory if needed."""
        model_path = self._model_path()
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        Data.dump(self.model, model_path)

    def load_model(self):
        """Load the model for this run/fold from disk into ``self.model``."""
        self.model = Data.load(self._model_path())
def test_bernoulli(learner, breast_cancer_data: Tuple4Array):
    """Smoke-test a Bernoulli NGBClassifier with the given base learner.

    Only checks that fitting and the three prediction entry points run
    without raising; no values are asserted.
    """
    train_x, test_x, train_y, _test_y = breast_cancer_data

    # test early stopping features
    # test other args, n_trees, LR, minibatching- args as fixture
    model = NGBClassifier(
        Dist=Bernoulli,
        Score=LogScore,
        Base=learner,
        verbose=False,
    )
    model.fit(train_x, train_y)

    # Results are intentionally discarded.
    model.predict(test_x)
    model.predict_proba(test_x)
    model.pred_dist(test_x)
def test_categorical(k: int, learner, breast_cancer_data: Tuple4Array):
    """Smoke-test a ``k``-class categorical NGBClassifier with the given learner.

    The fixture's training labels are replaced by random integers in
    ``[0, k)`` of the same length. Only checks that fitting and prediction
    run without raising.
    """
    train_x, test_x, original_labels, _ = breast_cancer_data

    distribution = k_categorical(k)
    random_labels = np.random.randint(0, k, len(original_labels))

    # test early stopping features
    model = NGBClassifier(
        Dist=distribution,
        Score=LogScore,
        Base=learner,
        verbose=False,
    )
    model.fit(train_x, random_labels)

    # Results are intentionally discarded.
    model.predict(test_x)
    model.predict_proba(test_x)
    model.pred_dist(test_x)
def test_bernoulli(self, learners, cls_data):
    """Smoke-test a Bernoulli NGBClassifier once per base learner.

    Only checks that fitting and the three prediction entry points run
    without raising; no values are asserted.
    """
    train_x, test_x, train_y, _test_y = cls_data

    for base_learner in learners:
        # test early stopping features
        # test other args, n_trees, LR, minibatching- args as fixture
        model = NGBClassifier(
            Dist=Bernoulli,
            Score=LogScore,
            Base=base_learner,
            verbose=False,
        )
        model.fit(train_x, train_y)

        # Results are intentionally discarded.
        model.predict(test_x)
        model.predict_proba(test_x)
        model.pred_dist(test_x)
def test_categorical(self, learners, cls_data):
    """Smoke-test categorical NGBClassifiers for 2, 4 and 7 classes.

    For each class count, random integer labels of the training-set length
    are drawn and every base learner is fitted on them. Only checks that
    fitting and prediction run without raising.
    """
    train_x, test_x, train_y, _test_y = cls_data
    n_samples = len(train_y)

    for n_classes in (2, 4, 7):
        distribution = k_categorical(n_classes)
        random_labels = np.random.randint(0, n_classes, n_samples)

        for base_learner in learners:
            # test early stopping features
            model = NGBClassifier(
                Dist=distribution,
                Score=LogScore,
                Base=base_learner,
                verbose=False,
            )
            model.fit(train_x, random_labels)

            # Results are intentionally discarded.
            model.predict(test_x)
            model.predict_proba(test_x)
            model.pred_dist(test_x)
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

from ngboost import NGBClassifier
from ngboost.distns import k_categorical

if __name__ == "__main__":
    # Example: multi-class classification with a k-categorical distribution.
    # ``return_X_y`` is keyword-only in current scikit-learn, so it must be
    # passed by name.
    X, y = load_breast_cancer(return_X_y=True)

    # artificially make this a 3-class problem instead of a 2-class problem
    y[0:15] = 2

    X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2)

    # tell ngboost that there are 3 possible outcomes
    ngb = NGBClassifier(Dist=k_categorical(3))

    # Y should have only 3 values: {0,1,2}
    ngb.fit(X_train, Y_train)

    # predicted probabilities of class 0, 1, and 2 (columns) for each observation (row)
    preds = ngb.predict_proba(X_test)