def test_add_knots():
    """With random knot sampling disabled, the knot vector is exactly the
    five user-supplied knots."""
    n_points = 5000
    np.random.seed(42)
    scores = np.random.uniform(size=n_points)
    labels = np.random.binomial(n=1, p=scores)
    calib = mli.SplineCalib(knot_sample_size=0, add_knots=[.1, .2, .3, .4, .5])
    calib.fit(scores, labels)
    assert len(calib.knot_vec) == 5
def test_identity_calibration_reg_param():
    """When scores are already perfectly calibrated, a SplineCalib fitted
    with an explicit regularization grid should learn ~the identity map."""
    n_points = 5000
    np.random.seed(42)
    scores = np.random.uniform(size=n_points)
    labels = np.random.binomial(n=1, p=scores)
    calib = mli.SplineCalib(reg_param_vec=np.logspace(-2, 2, 41))
    calib.fit(scores, labels)
    grid = np.linspace(.001, .999, 999)
    worst_deviation = np.max(np.abs(calib.calibrate(grid) - grid))
    assert worst_deviation < .015
def test_knot_ss():
    """Knot sampling with forced endpoints yields exactly `knot_sample_size`
    knots and always includes the min and max of the training scores."""
    n_points = 100
    np.random.seed(42)
    scores = np.random.uniform(size=n_points)
    labels = np.random.binomial(n=1, p=scores)
    calib = mli.SplineCalib(knot_sample_size=20, force_knot_endpts=True)
    calib.fit(scores, labels)
    has_right_count = len(calib.knot_vec) == 20
    has_min_endpoint = np.min(scores) in calib.knot_vec
    has_max_endpoint = np.max(scores) in calib.knot_vec
    assert has_right_count and has_min_endpoint and has_max_endpoint
def test_identity_calibration_unity():
    """A strong unity prior should pull the calibration curve toward the
    identity map on already-calibrated scores."""
    n_points = 1000
    np.random.seed(42)
    scores = np.random.uniform(size=n_points)
    labels = np.random.binomial(n=1, p=scores)
    calib = mli.SplineCalib(
        unity_prior=True,
        unity_prior_weight=2000,
        unity_prior_gridsize=200,
    )
    calib.fit(scores, labels)
    grid = np.linspace(.001, .999, 999)
    worst_deviation = np.max(np.abs(calib.calibrate(grid) - grid))
    assert worst_deviation < .01
def test_random_and_add_knots():
    """Sampled knots (20, with forced endpoints) plus 5 user-added knots
    should yield 25 knots total, containing both score endpoints and the
    added knot value .2."""
    npts = 5000
    np.random.seed(42)
    xvec = np.random.uniform(size=npts)
    yvec = np.random.binomial(n=1, p=xvec)
    sc = mli.SplineCalib(knot_sample_size=20, force_knot_endpts=True,
                         add_knots=[.1, .2, .3, .4, .5])
    sc.fit(xvec, yvec)
    t1 = len(sc.knot_vec) == 25
    t2 = np.min(xvec) in sc.knot_vec
    t3 = np.max(xvec) in sc.knot_vec
    t4 = .2 in sc.knot_vec
    # BUG FIX: t4 was computed but omitted from the assertion, so the
    # added-knot membership check silently never ran.
    assert (t1 and t2 and t3 and t4)
def test_mnist_calib():
    """This tests a multiclass calibration on data derived from MNIST
    (using just the digits 0-4). We test the default settings and ensure
    that the resulting log-loss gives good performance."""
    calib_df = pd.read_csv(op.join(data_path, 'mnist4_calib_set.csv'))
    test_df = pd.read_csv(op.join(data_path, 'mnist4_test_set.csv'))
    # Last column is the label; the rest are uncalibrated class scores.
    calib_preds = calib_df.iloc[:, :-1].to_numpy()
    calib_y = calib_df.iloc[:, -1].to_numpy()
    test_preds = test_df.iloc[:, :-1].to_numpy()
    test_y = test_df.iloc[:, -1].to_numpy()
    calibrator = mli.SplineCalib()
    calibrator.fit(calib_preds, calib_y)
    calibrated = calibrator.calibrate(test_preds)
    assert log_loss(test_y, calibrated) < .2334
def fit(self, X, y, sample_weight=None, eval_set=None,
        sample_weight_eval_set=None, **kwargs):
    """Fit the wrapped classifier on a stratified subset of (X, y), then fit
    a probability calibrator (sigmoid, isotonic, or spline — selected by
    self.params["calib_method"]) on the held-out remainder.

    Calibrator parameters are copied onto `self` (slope/intercept for
    sigmoid, isotonic breakpoints, or spline knots/coefficients) so that
    prediction code can reapply the calibration without the calibrator
    object. Finally stores the fitted model, feature importances and best
    iteration count via self.set_model_properties().

    NOTE(review): X appears to be a datatable Frame (uses `.to_pandas()`
    and `X.names`) — confirm against callers.
    """
    # This class is built by multiple inheritance: exactly three bases,
    # one of which is CalibratedClassifierModel and one CustomModel; the
    # remaining base is the concrete classifier to instantiate below.
    assert len(self.__class__.__bases__) == 3
    assert CalibratedClassifierModel in self.__class__.__bases__
    self.le.fit(self.labels)
    y_ = self.le.transform(y)  # encoded labels
    # Pick the concrete classifier base class ("whoami") out of the MRO.
    whoami = [x for x in self.__class__.__bases__
              if (x != CustomModel and x != CalibratedClassifierModel)][0]
    kwargs_classification = copy.deepcopy(self.params_base)
    kwargs_update = dict(
        num_classes=len(self.le.classes_),
        labels=list(np.arange(len(self.le.classes_))),
    )
    kwargs_classification.update(kwargs_update)
    # Keep caller-supplied kwargs in sync with the classification params.
    for k in kwargs:
        if k in kwargs_classification:
            kwargs[k] = kwargs_classification[k]
    model_classification = whoami(
        context=self.context,
        unfitted_pipeline_path=self.unfitted_pipeline_path,
        transformed_features=self.transformed_features,
        original_user_cols=self.original_user_cols,
        date_format_strings=self.date_format_strings,
        **kwargs_classification)
    eval_set_classification = None
    val_y = None
    if eval_set is not None:
        # Re-encode the validation labels with the same label encoder.
        eval_set_y = self.le.transform(eval_set[0][1])
        val_y = eval_set_y.astype(int)
        eval_set_classification = [(eval_set[0][0], val_y)]
    # Stratified split with classes control - making sure all classes
    # present in both train and test
    unique_cls = np.unique(y_)
    tr_indx, te_indx = [], []
    for c in unique_cls:
        c_indx = np.argwhere(y_ == c).ravel()
        indx = np.random.permutation(c_indx)
        # Per-class calibration slice size: sigmoid/isotonic need at least
        # 1 sample per class, spline needs at least 3.
        if self.params["calib_method"] in ["sigmoid", "isotonic"]:
            start_indx = max(1, int(self.params["calib_perc"] * len(c_indx)))
        else:
            start_indx = max(3, int(self.params["calib_perc"] * len(c_indx)))
        tr_indx += list(indx[start_indx:])
        te_indx += list(indx[:start_indx])
    tr_indx = np.array(tr_indx)
    te_indx = np.array(te_indx)
    X_train, y_train = X[tr_indx, :], y_.astype(int)[tr_indx]
    # sklearn's CalibratedClassifierCV takes the original labels; the
    # spline calibrator takes the encoded integer labels.
    if self.params["calib_method"] in ["sigmoid", "isotonic"]:
        X_calibrate, y_calibrate = X[te_indx, :].to_pandas(), y[te_indx].ravel()
    else:
        X_calibrate, y_calibrate = X[te_indx, :].to_pandas(), y_.astype(
            int)[te_indx].ravel()
    if sample_weight is not None:
        sample_weight_ = sample_weight[tr_indx]
        sample_weight_calib = sample_weight[te_indx]
    else:
        sample_weight_ = sample_weight
        sample_weight_calib = sample_weight
    # mimic rest of fit_base not done:
    # get self.observed_labels
    model_classification.check_labels_and_response(y_train, val_y=val_y)
    model_classification.orig_cols = self.orig_cols
    model_classification.X_shape = self.X_shape
    model_classification.fit(X_train, y_train,
                             sample_weight=sample_weight_,
                             eval_set=eval_set_classification,
                             sample_weight_eval_set=sample_weight_eval_set,
                             **kwargs)
    model_classification.fitted = True
    model_classification.eval_set_used_during_fit = val_y is not None
    # calibration
    # Expose the sklearn-style API CalibratedClassifierCV expects.
    model_classification.predict_proba = model_classification.predict_simple
    model_classification.classes_ = self.le.classes_
    if self.params["calib_method"] in ["sigmoid", "isotonic"]:
        # 'prefit' reuses the model trained above instead of refitting.
        calibrator = CalibratedClassifierCV(
            base_estimator=model_classification,
            method=self.params["calib_method"],
            cv='prefit')
        calibrator.fit(X_calibrate, y_calibrate,
                       sample_weight=sample_weight_calib)
        self.calib_method = calibrator.method
        if calibrator.method == "sigmoid":
            # Store one Platt-scaling (a, b) pair per class.
            self.slope = []
            self.intercept = []
            for c in calibrator.calibrated_classifiers_[0].calibrators_:
                self.slope.append(c.a_)
                self.intercept.append(c.b_)
        elif calibrator.method == "isotonic":
            # Store per-class isotonic breakpoints and input range so the
            # step function can be re-evaluated at prediction time.
            self._necessary_X_ = []
            self._necessary_y_ = []
            self.X_min_ = []
            self.X_max_ = []
            for c in calibrator.calibrated_classifiers_[0].calibrators_:
                self._necessary_X_.append(c._necessary_X_)
                self._necessary_y_.append(c._necessary_y_)
                self.X_min_.append(c.X_min_)
                self.X_max_.append(c.X_max_)
        else:
            raise RuntimeError('Unknown calibration method in fit()')
    elif self.params["calib_method"] in ["spline"]:
        import ml_insights as mli
        self.calib_method = "spline"
        spline = mli.SplineCalib(
            penalty='l2',
            solver='liblinear',
            reg_param_vec='default',
            cv_spline=3,
            random_state=4451,
            knot_sample_size=30,
        )
        preds = model_classification.predict_proba(X_calibrate)
        for c in range(preds.shape[1]):
            if len(
                    np.unique(preds[:, c])
            ) < 3:  # we need at least 3 unique points to form the knots
                # Jitter near-constant score columns so knot sampling works.
                preds[:, c] = preds[:, c] + .0001 * np.random.randn(
                    len(preds[:, c]))
        spline.fit(preds, y_calibrate,
                   verbose=False)  # no weight support so far :(
        # Persist the spline parameters needed to reapply calibration.
        self.calib_logodds_scale = spline.logodds_scale
        self.calib_logodds_eps = spline.logodds_eps
        self.calib_knot_vec_tr = []
        self.calib_basis_coef_vec = []
        if spline.n_classes > 2:
            # Multiclass: one binary spline calibrator per class.
            for calib_ in spline.binary_splinecalibs:
                self.calib_knot_vec_tr.append(calib_.knot_vec_tr)
                self.calib_basis_coef_vec.append(calib_.basis_coef_vec)
        else:
            self.calib_knot_vec_tr.append(spline.knot_vec_tr)
            self.calib_basis_coef_vec.append(spline.basis_coef_vec)
    else:
        raise RuntimeError('Unknown calibration method in fit()')
    # calibration
    # Extract per-feature importances (gain), aligned to X.names order;
    # features never used by the model end up as NaN-dropped/reindexed.
    varimp = model_classification.imp_features(columns=X.names)[[
        'LGain', 'LInteraction'
    ]].dropna(axis=0)
    varimp.index = varimp['LInteraction']
    varimp = varimp['LGain']
    varimp = varimp[:len(X.names)]
    varimp = varimp.reindex(X.names).values
    importances = varimp
    iters = model_classification.best_iterations
    iters = int(max(1, iters))  # at least one iteration
    self.set_model_properties(model=model_classification.model,
                              features=list(X.names),
                              importances=importances,
                              iterations=iters)