# Presumed module-level imports for these tests (not shown in this excerpt);
# `data_path` is assumed to be defined elsewhere in the test module and to
# point at the directory containing the CSV fixtures used below.
import os.path as op

import numpy as np
import pandas as pd
from sklearn.metrics import log_loss

import ml_insights as mli


def test_add_knots():
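    # Explicitly supplied knots should be used verbatim when knot_sample_size=0.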
    npts = 5000
    np.random.seed(42)
    xvec = np.random.uniform(size=npts)
    yvec = np.random.binomial(n=1, p=xvec)
    sc = mli.SplineCalib(knot_sample_size=0, add_knots=[.1, .2, .3, .4, .5])
    sc.fit(xvec, yvec)
    assert (len(sc.knot_vec) == 5)


def test_identity_calibration_reg_param():
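    # Data that is already well calibrated (y ~ Bernoulli(x)) should map back
    # close to the identity when a regularization-parameter grid is supplied.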
    npts = 5000
    np.random.seed(42)
    xvec = np.random.uniform(size=npts)
    yvec = np.random.binomial(n=1, p=xvec)
    sc = mli.SplineCalib(reg_param_vec=np.logspace(-2, 2, 41))
    sc.fit(xvec, yvec)
    tvec = np.linspace(.001, .999, 999)
    max_err = np.max(np.abs(sc.calibrate(tvec) - tvec))
    assert (max_err < .015)


def test_knot_ss():
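    # knot_sample_size=20 with force_knot_endpts=True should yield exactly 20
    # knots, including the smallest and largest observed scores.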
    npts = 100
    np.random.seed(42)
    xvec = np.random.uniform(size=npts)
    yvec = np.random.binomial(n=1, p=xvec)
    sc = mli.SplineCalib(knot_sample_size=20, force_knot_endpts=True)
    sc.fit(xvec, yvec)
    t1 = len(sc.knot_vec) == 20
    t2 = np.min(xvec) in sc.knot_vec
    t3 = np.max(xvec) in sc.knot_vec
    assert (t1 and t2 and t3)


def test_identity_calibration_unity():
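    # With a strong unity prior, the fitted curve should stay close to the
    # identity map even on a modest sample.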
    npts = 1000
    np.random.seed(42)
    xvec = np.random.uniform(size=npts)
    yvec = np.random.binomial(n=1, p=xvec)
    sc = mli.SplineCalib(unity_prior=True,
                         unity_prior_weight=2000,
                         unity_prior_gridsize=200)
    sc.fit(xvec, yvec)
    tvec = np.linspace(.001, .999, 999)
    max_err = np.max(np.abs(sc.calibrate(tvec) - tvec))
    assert (max_err < .01)


def test_random_and_add_knots():
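    # Sampled knots (20, endpoints forced) plus 5 explicitly added knots should
    # give 25 knots in total.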
    npts = 5000
    np.random.seed(42)
    xvec = np.random.uniform(size=npts)
    yvec = np.random.binomial(n=1, p=xvec)
    sc = mli.SplineCalib(knot_sample_size=20,
                         force_knot_endpts=True,
                         add_knots=[.1, .2, .3, .4, .5])
    sc.fit(xvec, yvec)
    t1 = len(sc.knot_vec) == 25
    t2 = np.min(xvec) in sc.knot_vec
    t3 = np.max(xvec) in sc.knot_vec
    t4 = .2 in sc.knot_vec
    assert (t1 and t2 and t3 and t4)


def test_mnist_calib():
    """
    This tests a multiclass calibration on data derived from MNIST 
    (using just the digits 0-4). We test the default settings and 
    ensure that the resulting log-loss gives good performance

    """
    calib_set = pd.read_csv(op.join(data_path, 'mnist4_calib_set.csv'))
    test_set = pd.read_csv(op.join(data_path, 'mnist4_test_set.csv'))

    preds_calib_set = calib_set.iloc[:, :-1].to_numpy()
    y_calib_set = calib_set.iloc[:, -1].to_numpy()
    preds_test = test_set.iloc[:, :-1].to_numpy()
    y_test = test_set.iloc[:, -1].to_numpy()
    sc = mli.SplineCalib()
    sc.fit(preds_calib_set, y_calib_set)
    preds_test_calibrated = sc.calibrate(preds_test)
    ll_calib = log_loss(y_test, preds_test_calibrated)
    assert (ll_calib < .2334)
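

# A minimal usage sketch (not part of the test suite): calibrating a binary
# classifier's probabilities with SplineCalib. Only SplineCalib.fit() and
# .calibrate() are used as shown in the tests above; the model choice, the
# cross_val_predict step, and the function name are illustrative assumptions.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict


def calibrate_model_probs(X_train, y_train, X_new):
    model = RandomForestClassifier(random_state=0).fit(X_train, y_train)
    # Calibrate on cross-validated predictions rather than in-sample scores.
    cv_preds = cross_val_predict(model, X_train, y_train,
                                 method='predict_proba')[:, 1]
    sc = mli.SplineCalib()
    sc.fit(cv_preds, y_train)
    # Return calibrated positive-class probabilities for new data.
    return sc.calibrate(model.predict_proba(X_new)[:, 1])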
Example #7
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):
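        # Sanity-check the mixin hierarchy: this class must combine the
        # calibrated-classifier mixin with exactly one concrete model class.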
        assert len(self.__class__.__bases__) == 3
        assert CalibratedClassifierModel in self.__class__.__bases__

        self.le.fit(self.labels)
        y_ = self.le.transform(y)

        # The concrete (uncalibrated) model class is whichever base class is
        # neither CustomModel nor CalibratedClassifierModel.
        whoami = [
            x for x in self.__class__.__bases__
            if (x != CustomModel and x != CalibratedClassifierModel)
        ][0]

        kwargs_classification = copy.deepcopy(self.params_base)
        kwargs_update = dict(
            num_classes=len(self.le.classes_),
            labels=list(np.arange(len(self.le.classes_))),
        )
        kwargs_classification.update(kwargs_update)
        # Keep explicitly passed kwargs consistent with the classification
        # settings built above.
        for k in kwargs:
            if k in kwargs_classification:
                kwargs[k] = kwargs_classification[k]

        model_classification = whoami(
            context=self.context,
            unfitted_pipeline_path=self.unfitted_pipeline_path,
            transformed_features=self.transformed_features,
            original_user_cols=self.original_user_cols,
            date_format_strings=self.date_format_strings,
            **kwargs_classification)

        # Encode the validation labels (if an eval_set is provided) with the
        # same label encoder used for y.
        eval_set_classification = None
        val_y = None
        if eval_set is not None:
            eval_set_y = self.le.transform(eval_set[0][1])
            val_y = eval_set_y.astype(int)
            eval_set_classification = [(eval_set[0][0], val_y)]

        # Stratified split with class control - make sure all classes are
        # present in both the training and calibration subsets
        unique_cls = np.unique(y_)
        tr_indx, te_indx = [], []

        for c in unique_cls:
            c_indx = np.argwhere(y_ == c).ravel()
            indx = np.random.permutation(c_indx)
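            # Hold out at least 1 calibration sample per class for
            # sigmoid/isotonic, and at least 3 for the spline, which needs
            # several distinct points to form its knots.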
            if self.params["calib_method"] in ["sigmoid", "isotonic"]:
                start_indx = max(1,
                                 int(self.params["calib_perc"] * len(c_indx)))
            else:
                start_indx = max(3,
                                 int(self.params["calib_perc"] * len(c_indx)))

            tr_indx += list(indx[start_indx:])
            te_indx += list(indx[:start_indx])
        tr_indx = np.array(tr_indx)
        te_indx = np.array(te_indx)

        X_train, y_train = X[tr_indx, :], y_.astype(int)[tr_indx]
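        # The sklearn calibrators are fit on the original labels, while the
        # spline calibrator is fit on the label-encoded integer targets.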
        if self.params["calib_method"] in ["sigmoid", "isotonic"]:
            X_calibrate, y_calibrate = X[
                te_indx, :].to_pandas(), y[te_indx].ravel()
        else:
            X_calibrate, y_calibrate = X[te_indx, :].to_pandas(), y_.astype(
                int)[te_indx].ravel()

        if sample_weight is not None:
            sample_weight_ = sample_weight[tr_indx]
            sample_weight_calib = sample_weight[te_indx]
        else:
            sample_weight_ = sample_weight
            sample_weight_calib = sample_weight

        # mimic rest of fit_base not done:
        # get self.observed_labels
        model_classification.check_labels_and_response(y_train, val_y=val_y)
        model_classification.orig_cols = self.orig_cols
        model_classification.X_shape = self.X_shape

        model_classification.fit(X_train,
                                 y_train,
                                 sample_weight=sample_weight_,
                                 eval_set=eval_set_classification,
                                 sample_weight_eval_set=sample_weight_eval_set,
                                 **kwargs)

        model_classification.fitted = True
        model_classification.eval_set_used_during_fit = val_y is not None

        # calibration

        # Expose predict_proba and classes_ on the fitted model; sklearn's
        # CalibratedClassifierCV (cv='prefit') and the spline branch below
        # rely on them.
        model_classification.predict_proba = model_classification.predict_simple
        model_classification.classes_ = self.le.classes_
        if self.params["calib_method"] in ["sigmoid", "isotonic"]:
            calibrator = CalibratedClassifierCV(
                base_estimator=model_classification,
                method=self.params["calib_method"],
                cv='prefit')

            calibrator.fit(X_calibrate,
                           y_calibrate,
                           sample_weight=sample_weight_calib)

            self.calib_method = calibrator.method

            if calibrator.method == "sigmoid":
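                # Keep only the per-class Platt-scaling parameters (slope a_,
                # intercept b_) instead of the full sklearn calibrator objects.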
                self.slope = []
                self.intercept = []

                for c in calibrator.calibrated_classifiers_[0].calibrators_:
                    self.slope.append(c.a_)
                    self.intercept.append(c.b_)

            elif calibrator.method == "isotonic":
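                # Keep each class's isotonic regression support points and
                # input range so predictions can be reproduced later.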
                self._necessary_X_ = []
                self._necessary_y_ = []

                self.X_min_ = []
                self.X_max_ = []
                for c in calibrator.calibrated_classifiers_[0].calibrators_:
                    self._necessary_X_.append(c._necessary_X_)
                    self._necessary_y_.append(c._necessary_y_)

                    self.X_min_.append(c.X_min_)
                    self.X_max_.append(c.X_max_)

            else:
                raise RuntimeError('Unknown calibration method in fit()')

        elif self.params["calib_method"] in ["spline"]:
            import ml_insights as mli
            self.calib_method = "spline"
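            # Spline calibration: L2-regularized logistic spline, 3-fold CV
            # over the default regularization grid, and 30 sampled knots.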
            spline = mli.SplineCalib(
                penalty='l2',
                solver='liblinear',
                reg_param_vec='default',
                cv_spline=3,
                random_state=4451,
                knot_sample_size=30,
            )

            # Uncalibrated probabilities on the held-out calibration split.
            preds = model_classification.predict_proba(X_calibrate)

            for c in range(preds.shape[1]):
                if len(
                        np.unique(preds[:, c])
                ) < 3:  # we need at least 3 unique points to form the knots
                    preds[:, c] = preds[:, c] + .0001 * np.random.randn(
                        len(preds[:, c]))

            spline.fit(preds, y_calibrate,
                       verbose=False)  # no weight support so far :(

            self.calib_logodds_scale = spline.logodds_scale
            self.calib_logodds_eps = spline.logodds_eps

            self.calib_knot_vec_tr = []
            self.calib_basis_coef_vec = []

            # A multiclass SplineCalib holds one binary calibrator per class;
            # store each calibrator's knot vector and basis coefficients.
            if spline.n_classes > 2:
                for calib_ in spline.binary_splinecalibs:
                    self.calib_knot_vec_tr.append(calib_.knot_vec_tr)
                    self.calib_basis_coef_vec.append(calib_.basis_coef_vec)
            else:
                self.calib_knot_vec_tr.append(spline.knot_vec_tr)
                self.calib_basis_coef_vec.append(spline.basis_coef_vec)

        else:
            raise RuntimeError('Unknown calibration method in fit()')
        # end calibration

        # Gain-based feature importances from the fitted model, aligned below
        # to the order of X.names.
        varimp = model_classification.imp_features(columns=X.names)[[
            'LGain', 'LInteraction'
        ]].dropna(axis=0)
        varimp.index = varimp['LInteraction']
        varimp = varimp['LGain']
        varimp = varimp[:len(X.names)]
        varimp = varimp.reindex(X.names).values
        importances = varimp

        iters = model_classification.best_iterations
        iters = int(max(1, iters))
        self.set_model_properties(model=model_classification.model,
                                  features=list(X.names),
                                  importances=importances,
                                  iterations=iters)