def fixture_learners_data(breast_cancer_data, boston_data, boston_survival_data):
    """
    Returns:
        A list of iterables, each containing a fitted model, the X data it was
        fitted on, and the model's predictions for that X data.
    """
    models_data = []

    X_class_train, _, Y_class_train, _ = breast_cancer_data
    ngb = NGBClassifier(verbose=False, n_estimators=10)
    ngb.fit(X_class_train, Y_class_train)
    models_data.append((ngb, X_class_train, ngb.predict(X_class_train)))

    X_reg_train, _, Y_reg_train, _ = boston_data
    ngb = NGBRegressor(verbose=False, n_estimators=10)
    ngb.fit(X_reg_train, Y_reg_train)
    models_data.append((ngb, X_reg_train, ngb.predict(X_reg_train)))

    X_surv_train, _, T_surv_train, E_surv_train, _ = boston_survival_data
    ngb = NGBSurvival(verbose=False, n_estimators=10)
    ngb.fit(X_surv_train, T_surv_train, E_surv_train)
    models_data.append((ngb, X_surv_train, ngb.predict(X_surv_train)))

    ngb = NGBRegressor(Dist=MultivariateNormal(2), n_estimators=10)
    ngb.fit(X_surv_train, np.vstack([T_surv_train, E_surv_train]).T)
    models_data.append((ngb, X_surv_train, ngb.predict(X_surv_train)))

    return models_data
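# A minimal sketch of a test consuming the fixture above, assuming it is
# registered with pytest under the name "learners_data" (the test name and
# the assertion are illustrative, not part of the original suite).
def test_learners_data_predictions(learners_data):
    for model, X, preds in learners_data:
        assert len(preds) == len(X)  # one prediction per training row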
def test_classification():
    from sklearn.datasets import load_breast_cancer
    from sklearn.metrics import roc_auc_score, log_loss

    data, target = load_breast_cancer(return_X_y=True)
    x_train, x_test, y_train, y_test = train_test_split(
        data, target, test_size=0.2, random_state=42
    )
    ngb = NGBClassifier(Dist=Bernoulli, verbose=False)
    ngb.fit(x_train, y_train)

    preds = ngb.predict(x_test)
    score = roc_auc_score(y_test, preds)
    assert score >= 0.95

    preds = ngb.predict_proba(x_test)
    score = log_loss(y_test, preds)
    assert score <= 0.20

    score = ngb.score(x_test, y_test)
    assert score <= 0.20

    dist = ngb.pred_dist(x_test)
    assert isinstance(dist, Bernoulli)

    score = roc_auc_score(y_test, preds[:, 1])
    assert score >= 0.95
def test_classification(breast_cancer_data):
    from sklearn.metrics import roc_auc_score, log_loss

    x_train, x_test, y_train, y_test = breast_cancer_data
    ngb = NGBClassifier(Dist=Bernoulli, verbose=False)
    ngb.fit(x_train, y_train)

    preds = ngb.predict(x_test)
    score = roc_auc_score(y_test, preds)
    # loose score requirement so it isn't failing all the time
    assert score >= 0.85

    preds = ngb.predict_proba(x_test)
    score = log_loss(y_test, preds)
    assert score <= 0.30

    score = ngb.score(x_test, y_test)
    assert score <= 0.30

    dist = ngb.pred_dist(x_test)
    assert isinstance(dist, Bernoulli)

    score = roc_auc_score(y_test, preds[:, 1])
    assert score >= 0.85
class myNGBoostClassifier:
    def make(self, params):
        self.model = NGBClassifier(**params)
        return self

    def fit(self, xtrain, ytrain, xtest=None, ytest=None, fit_params=None):
        fit_params = fit_params or {}
        # Convert pandas inputs to numpy arrays before fitting.
        if isinstance(xtrain, pd.DataFrame):
            xtrain = xtrain.values
            ytrain = ytrain.values
            if xtest is not None and ytest is not None:
                xtest = xtest.values
                ytest = ytest.values
        if xtest is None or ytest is None:
            self.model.fit(xtrain, ytrain, **fit_params)
        else:
            self.model.fit(xtrain, ytrain, X_val=xtest, Y_val=ytest, **fit_params)

    def predict(self, xs):
        return self.model.predict(xs)

    def predict_proba(self, xs):
        # Accept a single sample (1-D) as well as a batch (2-D).
        if len(xs.shape) == 1:
            return self.model.predict_proba(xs.reshape(1, -1))
        return self.model.predict_proba(xs)
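# A hedged usage sketch for the wrapper above; the dataset, split, and
# parameter values are illustrative assumptions, not part of the original code.
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from ngboost.distns import Bernoulli

X, y = load_breast_cancer(return_X_y=True)
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

clf = myNGBoostClassifier().make({"Dist": Bernoulli, "verbose": False})
clf.fit(X_tr, y_tr, xtest=X_val, ytest=y_val)  # forwards the pair as X_val/Y_val
print(clf.predict_proba(X_val[:5]))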
def test_bernoulli(learner, breast_cancer_data: Tuple4Array):
    X_cls_train, X_cls_test, Y_cls_train, Y_cls_test = breast_cancer_data
    # test early stopping features
    # test other args: n_trees, LR, minibatching -- args as fixture
    ngb = NGBClassifier(Dist=Bernoulli, Score=LogScore, Base=learner, verbose=False)
    ngb.fit(X_cls_train, Y_cls_train)
    y_pred = ngb.predict(X_cls_test)
    y_prob = ngb.predict_proba(X_cls_test)
    y_dist = ngb.pred_dist(X_cls_test)
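# The comment above flags early stopping as untested; a minimal companion
# sketch, assuming the same fixtures. early_stopping_rounds is part of
# NGBoost's fit API; the round count and test name here are assumptions.
def test_bernoulli_early_stopping(learner, breast_cancer_data: Tuple4Array):
    X_train, X_test, Y_train, Y_test = breast_cancer_data
    ngb = NGBClassifier(Dist=Bernoulli, Score=LogScore, Base=learner, verbose=False)
    ngb.fit(X_train, Y_train, X_val=X_test, Y_val=Y_test, early_stopping_rounds=5)
    assert ngb.best_val_loss_itr is not None  # set when a validation set is given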
def test_categorical(k: int, learner, breast_cancer_data: Tuple4Array):
    X_train, X_test, y_train, _ = breast_cancer_data
    dist = k_categorical(k)
    y_train = np.random.randint(0, k, (len(y_train)))  # relabel targets with k classes
    # test early stopping features
    ngb = NGBClassifier(Dist=dist, Score=LogScore, Base=learner, verbose=False)
    ngb.fit(X_train, y_train)
    y_pred = ngb.predict(X_test)
    y_prob = ngb.predict_proba(X_test)
    y_dist = ngb.pred_dist(X_test)
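# These tests receive k and learner as arguments; a minimal sketch of how they
# might be wired with pytest. The fixture below and the parametrized values
# are assumptions -- the real suite may define them differently.
import pytest
from sklearn.tree import DecisionTreeRegressor


@pytest.fixture(
    params=[
        DecisionTreeRegressor(criterion="friedman_mse", max_depth=3),
        DecisionTreeRegressor(criterion="friedman_mse", max_depth=5),
    ]
)
def learner(request):
    return request.param

# test_categorical above would then carry @pytest.mark.parametrize("k", [2, 4, 7]).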
def test_bernoulli(self, learners, cls_data):
    X_cls_train, X_cls_test, Y_cls_train, Y_cls_test = cls_data
    for Learner in learners:
        # test early stopping features
        # test other args: n_trees, LR, minibatching -- args as fixture
        ngb = NGBClassifier(Dist=Bernoulli, Score=LogScore, Base=Learner, verbose=False)
        ngb.fit(X_cls_train, Y_cls_train)
        y_pred = ngb.predict(X_cls_test)
        y_prob = ngb.predict_proba(X_cls_test)
        y_dist = ngb.pred_dist(X_cls_test)
def test_categorical(self, learners, cls_data):
    X_cls_train, X_cls_test, Y_cls_train, Y_cls_test = cls_data
    for K in [2, 4, 7]:
        Dist = k_categorical(K)
        Y_cls_train = np.random.randint(0, K, (len(Y_cls_train)))
        for Learner in learners:
            # test early stopping features
            ngb = NGBClassifier(Dist=Dist, Score=LogScore, Base=Learner, verbose=False)
            ngb.fit(X_cls_train, Y_cls_train)
            y_pred = ngb.predict(X_cls_test)
            y_prob = ngb.predict_proba(X_cls_test)
            y_dist = ngb.pred_dist(X_cls_test)
def NGBoost(self, args):
    ## Natural gradient boosting
    ## https://stanfordmlgroup.github.io/ngboost/1-useage.html
    logger.info("Running Natural Gradient Boosting ... ")
    from ngboost.learners import default_tree_learner
    from ngboost.distns import k_categorical, Bernoulli  ## classifier distributions
    from ngboost.distns import Exponential, Normal, LogNormal  ## regressor distributions
    from ngboost.scores import MLE, LogScore, CRPScore  ## scoring rules
    from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier  ## base learners

    ## Note: if NGBoost fails with a singular-matrix error, increase the size of
    ## the input data from 0.05 to 0.15.
    if args.predictor.lower() == 'classifier':
        from ngboost import NGBClassifier
        learner = DecisionTreeRegressor(criterion='friedman_mse', max_depth=6, random_state=SEED)
        ngb = NGBClassifier(Base=learner, n_estimators=2000, Score=MLE, Dist=Bernoulli,
                            random_state=SEED)
    elif args.predictor.lower() == 'regressor':
        from ngboost import NGBRegressor
        ## NOTE: this learner is defined but unused; Base below is default_tree_learner.
        learner = DecisionTreeRegressor(criterion='friedman_mse', max_depth=3, random_state=SEED)
        ngb = NGBRegressor(Base=default_tree_learner, Dist=Exponential, Score=LogScore,
                           learning_rate=0.01, minibatch_frac=0.6, col_sample=0.6)

    ## Fit the model
    ngb.fit(self.X_train, np.asarray(self.y_train).astype(int))

    ## Predict the labels
    self.y_pred = ngb.predict(self.X_data)
    if args.predictor.lower() == 'regressor':
        self.y_pred = logistic.cdf(self.y_pred)
    self.data['boosting_score'] = self.y_pred
    self.model = ngb
    return self
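# NGBoost also exposes full predictive distributions; a minimal sketch of
# pulling probabilistic scores instead of hard labels, assuming a fitted
# NGBClassifier `ngb` and a feature matrix `X` as in the method above.
proba = ngb.predict_proba(X)   # shape (n_samples, 2): P(y=0), P(y=1)
dist = ngb.pred_dist(X)        # Bernoulli predictive distribution object
positive_scores = proba[:, 1]  # could serve as 'boosting_score' directly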
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

from ngboost import NGBClassifier
from ngboost.distns import k_categorical

if __name__ == "__main__":
    X, y = load_breast_cancer(return_X_y=True)
    y[0:15] = 2  # artificially make this a 3-class problem instead of a 2-class problem
    X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2)

    ngb = NGBClassifier(Dist=k_categorical(3))  # tell ngboost that there are 3 possible outcomes
    ngb.fit(X_train, Y_train)  # Y should have only 3 values: {0, 1, 2}

    # predicted probabilities of class 0, 1, and 2 (columns) for each observation (row)
    preds_proba = ngb.predict_proba(X_test)
    preds = ngb.predict(X_test)

    print(confusion_matrix(Y_test, preds))
    print(classification_report(Y_test, preds))