class ModelNgbClassifier(Model):
    """NGBoost classifier wrapper conforming to the project's Model interface.

    Expects pandas inputs; trains an NGBClassifier with optional early
    stopping against a validation split, and persists via the Data helper.
    """

    def train(self, tr_x, tr_y, va_x=None, va_y=None, te_x=None):
        # Hyperparameter setup: early_stopping_rounds is consumed here and
        # must not be forwarded to the NGBClassifier constructor.
        params = dict(self.params)
        early_stopping_rounds = params.pop('early_stopping_rounds')
        self.model = NGBClassifier(**params)
        if va_x is not None and va_y is not None:
            self.model.fit(tr_x.values, tr_y.astype(int).values,
                           va_x.values, va_y.astype(int).values,
                           early_stopping_rounds=early_stopping_rounds)
        else:
            # Bug fix: the original unconditionally dereferenced va_x/va_y,
            # crashing with AttributeError when no validation split was given
            # despite the parameters defaulting to None.
            self.model.fit(tr_x.values, tr_y.astype(int).values)

    def predict(self, te_x):
        # Probability of the positive class (column 1).
        return self.model.predict_proba(te_x.values)[:, 1]

    def save_model(self):
        """Serialize the fitted model under ../output/model/."""
        model_path = os.path.join('../output/model', f'{self.run_fold_name}.model')
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        Data.dump(self.model, model_path)

    def load_model(self):
        """Load a previously saved model for this run/fold."""
        model_path = os.path.join('../output/model', f'{self.run_fold_name}.model')
        self.model = Data.load(model_path)
def test_classification():
    """End-to-end Bernoulli classification sanity checks on breast-cancer data."""
    from sklearn.datasets import load_breast_cancer
    from sklearn.metrics import roc_auc_score, log_loss

    # Bug fix: passing return_X_y positionally was deprecated in scikit-learn
    # 0.23 and removed in 1.1; it must be passed by keyword.
    data, target = load_breast_cancer(return_X_y=True)
    x_train, x_test, y_train, y_test = train_test_split(
        data, target, test_size=0.2, random_state=42)

    ngb = NGBClassifier(Dist=Bernoulli, verbose=False)
    ngb.fit(x_train, y_train)

    # Hard-label predictions should discriminate well.
    preds = ngb.predict(x_test)
    score = roc_auc_score(y_test, preds)
    assert score >= 0.95

    # Probabilistic predictions should be well calibrated.
    preds = ngb.predict_proba(x_test)
    score = log_loss(y_test, preds)
    assert score <= 0.20

    # NGBClassifier.score is a log-loss here (lower is better).
    score = ngb.score(x_test, y_test)
    assert score <= 0.20

    dist = ngb.pred_dist(x_test)
    assert isinstance(dist, Bernoulli)

    preds = ngb.dist_to_prediction(dist)
    score = roc_auc_score(y_test, preds)
    assert score >= 0.95
class NGBoost(BaseEstimator, ClassifierMixin):
    """Binary-classification sklearn-style wrapper around ngboost's NGBClassifier.

    Constructor keyword arguments are passed straight through to
    NGBClassifier when fit() is called.
    """

    def __init__(self, **params):
        logger.info('Initializing NGBoost...')
        self.params_ = params
        # Binary problem: labels are assumed to be {0, 1}.
        self.classes_ = np.array([0, 1])

    def get_params(self, deep=True):
        # Parameters are forwarded verbatim to NGBClassifier.
        return self.params_

    def _to_numpy(self, X):
        """Coerce a pandas DataFrame/Series or ndarray to a numpy array.

        Raises:
            ValueError: if X is of an unsupported type or conversion fails.
        """
        if isinstance(X, (pd.DataFrame, pd.Series)):
            try:
                return X.to_numpy()
            except Exception as exc:
                # Bug fix: the original used a bare ``except:`` and built a
                # ValueError without ``raise``, silently returning None.
                raise ValueError('There is error when converting to numpy') from exc
        if isinstance(X, np.ndarray):
            return X
        # Bug fix: the original also forgot ``raise`` here.
        raise ValueError('X must be pandas DataFrame, Series or numpy ndarray')

    def fit(self, X, y, *args, **kwargs):
        """Fit an NGBClassifier on X/y; labels are cast to int. Returns self."""
        logger.info('NGBoost, fit')
        logger.info(f'NGBoost, training data shape {X.shape}')
        logger.info(f'NGBoost, training label shape {y.shape}')
        X_np = self._to_numpy(X)
        y_np = self._to_numpy(y).astype(int)
        # Bug fix: the original printed the literal text 'np.unique(y_np)'
        # because the f-string had no placeholder braces.
        print(f'{np.unique(y_np)}')
        self.estimator_ = NGBClassifier(**self.params_)
        self.estimator_.fit(X_np, y_np)
        logger.info('NGBoost, done fit')
        return self

    def transform(self, X, *args, **kwargs):
        """Return the positive-class probability for each row of X."""
        logger.info('NGBoost, transform')
        logger.info(f'NGBoost, transform, testing shape: {X.shape}')
        X_np = self._to_numpy(X)
        pred = self.estimator_.predict_proba(X_np)[:, 1].reshape(-1)
        logger.info(f'NGBoost, transform, predictions shape: {pred.shape}')
        logger.info('NGBoost, done transform')
        return pred

    def score(self, X, y, *args, **kwargs):
        # ROC-AUC on the positive-class probabilities.
        return roc_auc_score(y, self.transform(X))

    def predict_proba(self, X, *args, **kwargs):
        """Return the full (n_samples, 2) class-probability matrix."""
        logger.info('NGBoost, predict_proba')
        logger.info(f'NGBoost, predict_proba, testing shape: {X.shape}')
        X_np = self._to_numpy(X)
        pred = self.estimator_.predict_proba(X_np)
        logger.info('NGBoost, predict_proba, done')
        return pred
def train(self, tr_x, tr_y, va_x=None, va_y=None, te_x=None):
    """Fit an NGBClassifier, with early stopping when a validation split is given.

    tr_x/tr_y are pandas objects; labels are cast to int before fitting.
    """
    # Hyperparameter setup: early_stopping_rounds is consumed here and must
    # not be forwarded to the NGBClassifier constructor.
    params = dict(self.params)
    early_stopping_rounds = params.pop('early_stopping_rounds')
    self.model = NGBClassifier(**params)
    if va_x is not None and va_y is not None:
        self.model.fit(tr_x.values, tr_y.astype(int).values,
                       va_x.values, va_y.astype(int).values,
                       early_stopping_rounds=early_stopping_rounds)
    else:
        # Bug fix: the original unconditionally dereferenced va_x/va_y,
        # crashing when no validation split was supplied despite the
        # parameters defaulting to None.
        self.model.fit(tr_x.values, tr_y.astype(int).values)
def fit(self, X, y, *args, **kwargs):
    """Fit the wrapped NGBClassifier on X/y; labels are cast to int.

    Returns:
        self, for sklearn-style chaining.
    """
    logger.info('NGBoost, fit')
    logger.info(f'NGBoost, training data shape {X.shape}')
    logger.info(f'NGBoost, training label shape {y.shape}')
    X_np = self._to_numpy(X)
    y_np = self._to_numpy(y).astype(int)
    # Bug fix: the original printed the literal text 'np.unique(y_np)'
    # because the f-string had no placeholder braces.
    print(f'{np.unique(y_np)}')
    self.estimator_ = NGBClassifier(**self.params_)
    self.estimator_.fit(X_np, y_np)
    logger.info('NGBoost, done fit')
    return self
def test_classification(breast_cancer_data):
    """Sanity-check Bernoulli NGBClassifier quality on the breast-cancer split."""
    from sklearn.metrics import roc_auc_score, log_loss

    x_tr, x_te, y_tr, y_te = breast_cancer_data
    model = NGBClassifier(Dist=Bernoulli, verbose=False)
    model.fit(x_tr, y_tr)

    labels = model.predict(x_te)
    auc = roc_auc_score(y_te, labels)
    # loose score requirement so it isn't failing all the time
    assert auc >= 0.85

    probabilities = model.predict_proba(x_te)
    loss = log_loss(y_te, probabilities)
    assert loss <= 0.30

    assert model.score(x_te, y_te) <= 0.30

    predicted_dist = model.pred_dist(x_te)
    assert isinstance(predicted_dist, Bernoulli)

    assert roc_auc_score(y_te, probabilities[:, 1]) >= 0.85
def NGBoost(self, args):
    """Natural Gradient Boosting scorer: fit on the training split, score all data.

    args.predictor selects 'classifier' or 'regressor'; the fitted model is
    stored on self.model and boosting scores on self.data['boosting_score'].
    Returns self.
    """
    logger.info("Running Natural Gradient Boosting ... ")
    ## https://stanfordmlgroup.github.io/ngboost/1-useage.html
    from ngboost.learners import default_tree_learner
    from ngboost.distns import k_categorical, Bernoulli  ## Classifier
    from ngboost.distns import Exponential, Normal, LogNormal  ## Regressor
    from ngboost.scores import MLE, LogScore, CRPScore
    ## Base Learner
    from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier

    ## Comment: If error with singular matrix, increase size of input data from 0.05 to 0.15
    if args.predictor.lower() == 'classifier':
        # Naming fix: the original rebound the imported class name to its own
        # instance (`ngb = ngb(...)`); keep class and instance separate.
        from ngboost import NGBClassifier
        base = DecisionTreeRegressor(criterion='friedman_mse', max_depth=6,
                                     random_state=SEED)
        model = NGBClassifier(Base=base, n_estimators=2000, Score=MLE,
                              Dist=Bernoulli, random_state=SEED)
    elif args.predictor.lower() == 'regressor':
        from ngboost import NGBRegressor
        # NOTE(review): the original also built a max_depth=3 tree here but
        # never used it — the regressor trains on default_tree_learner.
        # Behavior preserved; the dead local was removed.
        model = NGBRegressor(Base=default_tree_learner, Dist=Exponential,
                             Score=LogScore, learning_rate=0.01,
                             minibatch_frac=0.6, col_sample=0.6)

    ## Fit on the training split; labels are cast to int.
    model.fit(self.X_train, np.asarray(self.y_train).astype(int))
    ## Predict over the full data set.
    self.y_pred = model.predict(self.X_data)
    if args.predictor.lower() == 'regressor':
        # Squash raw regressor output into [0, 1] via the logistic CDF.
        self.y_pred = logistic.cdf(self.y_pred)
    self.data['boosting_score'] = self.y_pred
    self.model = model
    return self
def start(self):
    """
    01. Initialise the data paths and transformation functions.
    """
    self.data_dir = '../data/raw_data'

    # Featuretools primitive configuration.
    self.trans_primitives = ['weekday', 'hour', 'time_since_previous']
    self.agg_primitives = [
        'mean', 'max', 'min', 'std', 'count', 'percent_true',
        'last', 'time_since_last', 'mode',
    ]
    self.ignore_cols = [
        'num_contacts', 'num_referrals', 'num_successful_referrals',
    ]
    self.feature_windows = [10, 30, 60, 90]
    self.max_feature_depth = 2

    # list of estimators to use
    self.estimators = [
        ('cbc', CatBoostClassifier()),
        ('lgbmc', LGBMClassifier()),
        ('gbc', GradientBoostingClassifier(validation_fraction=0.15,
                                           n_iter_no_change=50)),
        ('et', ExtraTreeClassifier()),
        ('abc', AdaBoostClassifier()),
        ('rfc', RandomForestClassifier()),
        ('bc', BaggingClassifier()),
        ('etc', ExtraTreesClassifier()),
        ('gnb', GaussianNB()),
        ('mlpc', MLPClassifier()),
        ('gpc', GaussianProcessClassifier()),
        ('dtc', DecisionTreeClassifier()),
        ('qda', QuadraticDiscriminantAnalysis()),
        ('lr', LogisticRegression()),
        ('knn3', KNeighborsClassifier(3)),
        ('knn6', KNeighborsClassifier(6)),
        ('knn12', KNeighborsClassifier(12)),
        ('nc', NearestCentroid()),
        ('rnc', RadiusNeighborsClassifier()),
        ('lp', LabelPropagation()),
        ('pac', PassiveAggressiveClassifier()),
        ('rc', RidgeClassifier()),
        ('sgdc', SGDClassifier()),
        ('svg', SVC()),
        ('ngbc', NGBClassifier(Dist=Bernoulli)),
    ]

    # Advance the flow to the data-loading step.
    self.next(self.load_raw_data)
class myNGBoostClassifier:
    """Thin convenience wrapper around ngboost's NGBClassifier.

    make() instantiates the model; fit() accepts pandas or array inputs and
    an optional validation split; predict_proba() handles single samples.
    """

    def make(self, params):
        """Instantiate the underlying NGBClassifier; returns self (fluent)."""
        self.model = NGBClassifier(**params)
        return self

    def fit(self, xtrain, ytrain, xtest=None, ytest=None, fit_params=None):
        """Fit the model, optionally with a validation split for monitoring.

        Idiom fixes vs. the original: isinstance() instead of type() ==,
        `is not None` instead of type(...) != type(None), and no mutable
        default argument for fit_params.
        """
        if fit_params is None:
            fit_params = {}
        if isinstance(xtrain, pd.DataFrame):
            # Convert pandas inputs to raw arrays for ngboost.
            xtrain = xtrain.values
            ytrain = ytrain.values
            if xtest is not None and ytest is not None:
                xtest = xtest.values
                ytest = ytest.values
        if xtest is None or ytest is None:
            self.model.fit(xtrain, ytrain, **fit_params)
        else:
            self.model.fit(xtrain, ytrain, X_val=xtest, Y_val=ytest, **fit_params)

    def predict(self, xs):
        return self.model.predict(xs)

    def predict_proba(self, xs):
        # A single sample (1-D input) must be reshaped to one row first.
        if len(xs.shape) == 1:
            return self.model.predict_proba(xs.reshape(1, -1))
        return self.model.predict_proba(xs)
def test_bernoulli(learner, breast_cancer_data: Tuple4Array):
    """Smoke test: a Bernoulli NGBClassifier fits and produces all prediction kinds."""
    x_tr, x_te, y_tr, y_te = breast_cancer_data
    # test early stopping features
    # test other args, n_trees, LR, minibatching- args as fixture
    model = NGBClassifier(Dist=Bernoulli, Score=LogScore, Base=learner,
                          verbose=False)
    model.fit(x_tr, y_tr)
    labels = model.predict(x_te)
    probabilities = model.predict_proba(x_te)
    distribution = model.pred_dist(x_te)
def test_bernoulli(self, learners, cls_data):
    """Smoke test: every base learner fits a Bernoulli NGBClassifier."""
    x_tr, x_te, y_tr, y_te = cls_data
    for base in learners:
        # test early stopping features
        # test other args, n_trees, LR, minibatching- args as fixture
        model = NGBClassifier(Dist=Bernoulli, Score=LogScore, Base=base,
                              verbose=False)
        model.fit(x_tr, y_tr)
        labels = model.predict(x_te)
        probabilities = model.predict_proba(x_te)
        distribution = model.pred_dist(x_te)
def test_categorical(k: int, learner, breast_cancer_data: Tuple4Array):
    """Smoke test: a k-categorical NGBClassifier fits on random k-class labels."""
    x_tr, x_te, y_tr, _ = breast_cancer_data
    categorical_dist = k_categorical(k)
    # Replace the binary labels with random labels drawn from k classes.
    y_tr = np.random.randint(0, k, (len(y_tr)))
    # test early stopping features
    model = NGBClassifier(Dist=categorical_dist, Score=LogScore, Base=learner,
                          verbose=False)
    model.fit(x_tr, y_tr)
    labels = model.predict(x_te)
    probabilities = model.predict_proba(x_te)
    distribution = model.pred_dist(x_te)
def test_categorical(self, learners, cls_data):
    """Smoke test: k-categorical NGBClassifier across class counts and learners."""
    x_tr, x_te, y_tr, y_te = cls_data
    for num_classes in [2, 4, 7]:
        categorical_dist = k_categorical(num_classes)
        # Replace the labels with random draws from num_classes classes.
        y_tr = np.random.randint(0, num_classes, (len(y_tr)))
        for base in learners:
            # test early stopping features
            model = NGBClassifier(Dist=categorical_dist, Score=LogScore,
                                  Base=base, verbose=False)
            model.fit(x_tr, y_tr)
            labels = model.predict(x_te)
            probabilities = model.predict_proba(x_te)
            distribution = model.pred_dist(x_te)
from ngboost.distns import Bernoulli
from ngboost import NGBClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import GridSearchCV, train_test_split

if __name__ == "__main__":
    # Grid-search NGBClassifier hyperparameters on the breast-cancer data.
    # Bug fix: passing return_X_y positionally was deprecated in scikit-learn
    # 0.23 and removed in 1.1; it must be passed by keyword.
    X, y = load_breast_cancer(return_X_y=True)
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2)

    param_grid = {
        'n_estimators': [200, 500],
        'minibatch_frac': [1.0, 0.5],
    }

    ngb = NGBClassifier(natural_gradient=True, verbose=False, Dist=Bernoulli)
    grid_search = GridSearchCV(ngb, param_grid=param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    print(grid_search.best_params_)
import numpy as np
from ngboost import NGBClassifier
from ngboost.distns import Bernoulli
from ngboost.learners import default_tree_learner
from ngboost.scores import MLE
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

if __name__ == "__main__":
    np.random.seed(12345)
    # Bug fix: passing return_X_y positionally was deprecated in scikit-learn
    # 0.23 and removed in 1.1; it must be passed by keyword.
    X, Y = load_breast_cancer(return_X_y=True)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

    ngb = NGBClassifier(Base=default_tree_learner, Dist=Bernoulli, Score=MLE,
                        verbose=True, natural_gradient=True)
    ngb.fit(X_train, Y_train)

    # Score using the predicted Bernoulli success probabilities.
    preds = ngb.pred_dist(X_test)
    print("ROC:", roc_auc_score(Y_test, preds.prob))
from ngboost.distns import k_categorical
from ngboost import NGBClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge

if __name__ == "__main__":
    # An example where the base learner is also searched over (this is how you would vary tree depth):
    # Bug fix: passing return_X_y positionally was deprecated in scikit-learn
    # 0.23 and removed in 1.1; it must be passed by keyword.
    X, Y = load_breast_cancer(return_X_y=True)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

    # Candidate base learners; ngboost base learners are regressors.
    b1 = DecisionTreeRegressor(criterion='friedman_mse', max_depth=2)
    b2 = DecisionTreeRegressor(criterion='friedman_mse', max_depth=4)
    b3 = Ridge(alpha=0.0)

    param_grid = {
        'n_estimators': [20, 50],
        'minibatch_frac': [1.0, 0.5],
        'Base': [b1, b2]
    }

    ngb = NGBClassifier(natural_gradient=True, verbose=False,
                        Dist=k_categorical(2))
    grid_search = GridSearchCV(ngb, param_grid=param_grid, cv=5)
    grid_search.fit(X_train, Y_train)
    print(grid_search.best_params_)
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

from ngboost import NGBClassifier
from ngboost.distns import k_categorical

if __name__ == "__main__":
    # Bug fix: passing return_X_y positionally was deprecated in scikit-learn
    # 0.23 and removed in 1.1; it must be passed by keyword.
    X, y = load_breast_cancer(return_X_y=True)
    y[0: 15] = 2  # artificially make this a 3-class problem instead of a 2-class problem
    X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2)

    ngb = NGBClassifier(Dist=k_categorical(
        3))  # tell ngboost that there are 3 possible outcomes
    ngb.fit(X_train, Y_train)  # Y should have only 3 values: {0,1,2}

    # predicted probabilities of class 0, 1, and 2 (columns) for each observation (row)
    preds = ngb.predict_proba(X_test)
def fixture_learners_data(breast_cancer_data, boston_data, boston_survival_data):
    """
    Returns:
        A list of iterables, each iterable containing a fitted model and X data
        and the predictions for the X_data
    """
    fitted = []

    # Binary classification model on the breast-cancer training split.
    X_cls, _, y_cls, _ = breast_cancer_data
    clf = NGBClassifier(verbose=False, n_estimators=10)
    clf.fit(X_cls, y_cls)
    fitted.append((clf, X_cls, clf.predict(X_cls)))

    # Regression model on the Boston training split.
    X_reg, _, y_reg, _ = boston_data
    reg = NGBRegressor(verbose=False, n_estimators=10)
    reg.fit(X_reg, y_reg)
    fitted.append((reg, X_reg, reg.predict(X_reg)))

    # Survival model: fit takes time-to-event and event-indicator arrays.
    X_surv, _, T_surv, E_surv, _ = boston_survival_data
    surv = NGBSurvival(verbose=False, n_estimators=10)
    surv.fit(X_surv, T_surv, E_surv)
    fitted.append((surv, X_surv, surv.predict(X_surv)))

    # Bivariate-normal regression on stacked (time, event) targets.
    mvn = NGBRegressor(Dist=MultivariateNormal(2), n_estimators=10)
    mvn.fit(X_surv, np.vstack([T_surv, E_surv]).T)
    fitted.append((mvn, X_surv, mvn.predict(X_surv)))

    return fitted
import numpy as np
from ngboost import NGBClassifier
from ngboost.distns import Bernoulli
from ngboost.learners import default_tree_learner
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

if __name__ == "__main__":
    np.random.seed(12345)
    # Bug fix: passing return_X_y positionally was deprecated in scikit-learn
    # 0.23 and removed in 1.1; it must be passed by keyword.
    X, Y = load_breast_cancer(return_X_y=True)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

    ngb = NGBClassifier(Dist=Bernoulli)
    ngb.fit(X_train, Y_train)

    # Score using the predicted probability of class 1.
    preds = ngb.pred_dist(X_test)
    print("ROC:", roc_auc_score(Y_test, preds.probs[1]))
def ng_model(self):
    """Fit a 2-class NGBClassifier on self.X_t/self.y_t and return it.

    Feature importances are printed as a side effect.
    """
    classifier = NGBClassifier(Dist=k_categorical(2), verbose=True)
    fitted = classifier.fit(self.X_t, self.y_t)
    print(fitted.feature_importances_)
    return fitted
def make(self, params):
    """Build the underlying NGBClassifier from a params dict; returns self."""
    self.model = NGBClassifier(**params)
    return self