def test_isotonic_regression():
    y = np.array([3, 7, 5, 9, 8, 7, 10])
    y_ = np.array([3, 6, 6, 8, 8, 8, 10])
    assert_array_equal(y_, isotonic_regression(y))

    y = np.array([10, 0, 2])
    y_ = np.array([4, 4, 4])
    assert_array_equal(y_, isotonic_regression(y))

    x = np.arange(len(y))
    ir = IsotonicRegression(y_min=0., y_max=1.)
    ir.fit(x, y)
    assert_array_equal(ir.fit(x, y).transform(x), ir.fit_transform(x, y))
    assert_array_equal(ir.transform(x), ir.predict(x))

    # check that it is immune to permutation
    perm = np.random.permutation(len(y))
    ir = IsotonicRegression(y_min=0., y_max=1.)
    assert_array_equal(ir.fit_transform(x[perm], y[perm]),
                       ir.fit_transform(x, y)[perm])
    assert_array_equal(ir.transform(x[perm]), ir.transform(x)[perm])

    # check we don't crash when all x are equal:
    ir = IsotonicRegression()
    assert_array_equal(ir.fit_transform(np.ones(len(x)), y), np.mean(y))
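# For reference, a minimal standalone sketch (not part of the test above) of what the
# first assertion checks: pool-adjacent-violators averages each decreasing run, so the
# violating pair (7, 5) becomes (6, 6) and the run (9, 8, 7) becomes (8, 8, 8).
import numpy as np
from sklearn.isotonic import isotonic_regression

y = np.array([3, 7, 5, 9, 8, 7, 10])
print(isotonic_regression(y))  # [ 3.  6.  6.  8.  8.  8. 10.]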
Example #3
def test_isotonic_regression_oob_bad_after():
    # Set y and x
    y = np.array([3, 7, 5, 9, 8, 7, 10])
    x = np.arange(len(y))

    # Create model and fit
    ir = IsotonicRegression(increasing="auto", out_of_bounds="raise")

    # Make sure that we throw an error for bad out_of_bounds value in transform
    ir.fit(x, y)
    ir.out_of_bounds = "xyz"
    msg = "The argument ``out_of_bounds`` must be in 'nan', 'clip', 'raise'; got xyz"
    with pytest.raises(ValueError, match=msg):
        ir.transform(x)
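# A small standalone sketch (not from the test suite) of how the three accepted
# out_of_bounds values behave on inputs outside the training range, assuming a recent
# scikit-learn: 'clip' maps them to the boundary predictions, 'nan' yields NaN, and
# 'raise' raises the ValueError exercised above.
import numpy as np
from sklearn.isotonic import IsotonicRegression

x = np.arange(5)
y = np.array([1, 2, 6, 8, 10])

ir_clip = IsotonicRegression(out_of_bounds="clip").fit(x, y)
print(ir_clip.predict([-1, 10]))  # clipped to [ 1. 10.]

ir_nan = IsotonicRegression(out_of_bounds="nan").fit(x, y)
print(ir_nan.predict([-1, 10]))   # [nan nan]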
Example #4
def bootstrap_calibrate_prob(labels, weights, probs, n_calibrations=30, group_column=None, threshold=0., symmetrize=False, plot=False):
    """
    Bootstrap isotonic calibration: 
     * randomly divide data into train-test
     * on train isotonic is fitted and applyed to test
     * on test using calibrated probs p(B+) D2 and auc are calculated 
    
    :param probs: probabilities, numpy.array of shape [n_samples]
    :param labels: numpy.array of shape [n_samples] with labels 
    :param weights: numpy.array of shape [n_samples]
    :param threshold: float, to set labels 0/1 
    :param symmetrize: bool, do symmetric calibration, ex. for B+, B-
    
    :return: D2 array and auc array
    """
    aucs = []
    D2_array = []
    labels = (labels > threshold) * 1
    
    for _ in range(n_calibrations):
        if group_column is not None:
            train_probs, test_probs, train_labels, test_labels, train_weights, test_weights = train_test_split_group(
                group_column, probs, labels, weights, train_size=0.5)
        else:
            train_probs, test_probs, train_labels, test_labels, train_weights, test_weights = train_test_split(
                probs, labels, weights, train_size=0.5)
        iso_est = IsotonicRegression(y_min=0, y_max=1, out_of_bounds='clip')
        if symmetrize:
            train_weights = 0.5 * train_weights
            iso_est.fit(numpy.r_[train_probs, 1-train_probs], 
                        numpy.r_[train_labels > 0, train_labels <= 0],
                        numpy.r_[train_weights, train_weights])
        else:
            iso_est.fit(train_probs, train_labels, train_weights)
            
        probs_calib = iso_est.transform(test_probs)

        if plot:
            plt.figure(1, figsize=(6, 5))
            plt.scatter(train_probs, train_labels, color='black', zorder=20)
            X_test = numpy.linspace(0.001, 0.999, 500)
            y_test = iso_est.transform(X_test)
            plt.plot(X_test, y_test, color='blue', linewidth=3)
            plt.show()

        alpha = (1 - 2 * probs_calib) ** 2
        aucs.append(roc_auc_score(test_labels, test_probs, sample_weight=test_weights))
        D2_array.append(numpy.average(alpha, weights=test_weights))
    return D2_array, aucs
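# A minimal usage sketch for bootstrap_calibrate_prob, assuming the module-level imports
# it relies on (numpy, IsotonicRegression, train_test_split, roc_auc_score, and matplotlib
# for plot=True) are available; the labels/weights/probs here are purely synthetic.
import numpy as np

rng = np.random.RandomState(0)
n = 1000
labels = rng.randint(0, 2, size=n)                                  # 0/1 tags
probs = np.clip(0.2 + 0.6 * labels + rng.normal(0, 0.2, n), 0, 1)   # rough scores
weights = np.ones(n)

D2, aucs = bootstrap_calibrate_prob(labels, weights, probs,
                                    n_calibrations=10, symmetrize=True)
print(np.mean(D2), np.mean(aucs))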
Example #5
class SavableIsotonicRegression(object):
    def __init__(self, origvals, nullvals, increasing, min_frac_neg=0.95):
        self.origvals = origvals
        self.nullvals = nullvals
        self.increasing = increasing
        self.min_frac_neg = min_frac_neg
        self.ir = IsotonicRegression(
            out_of_bounds='clip', increasing=increasing).fit(
                X=np.concatenate([self.origvals, self.nullvals], axis=0),
                y=([1.0 for x in self.origvals] + [0.0
                                                   for x in self.nullvals]),
                sample_weight=([1.0 for x in self.origvals] + [
                    float(len(self.origvals)) / len(self.nullvals)
                    for x in self.nullvals
                ]))
        # Infer frac_pos based on the minimum value of the IR probs
        # See derivation in the irval_to_probpos function
        min_prec_x = self.ir.X_min_ if self.increasing else self.ir.X_max_
        min_precision = self.ir.transform([min_prec_x])[0]
        implied_frac_neg = -1 / (1 - (1 / max(min_precision, 1e-7)))
        print("For increasing =", increasing, ", the minimum IR precision was",
              min_precision, "occurring at", min_prec_x, "implying a frac_neg",
              "of", implied_frac_neg)
        if (implied_frac_neg > 1.0 or implied_frac_neg < self.min_frac_neg):
            implied_frac_neg = max(min(1.0, implied_frac_neg),
                                   self.min_frac_neg)
            print("To be conservative, adjusted frac neg is", implied_frac_neg)
        self.implied_frac_neg = implied_frac_neg

    def transform(self, vals):
        return irval_to_probpos(self.ir.transform(vals),
                                frac_neg=self.implied_frac_neg)

    def save_hdf5(self, grp):
        grp.attrs['increasing'] = self.increasing
        grp.attrs['min_frac_neg'] = self.min_frac_neg
        grp.create_dataset('origvals', data=self.origvals)
        grp.create_dataset('nullvals', data=self.nullvals)

    @classmethod
    def from_hdf5(cls, grp):
        increasing = grp.attrs['increasing']
        min_frac_neg = grp.attrs['min_frac_neg']
        origvals = np.array(grp['origvals'])
        nullvals = np.array(grp['nullvals'])
        return cls(origvals=origvals,
                   nullvals=nullvals,
                   increasing=increasing,
                   min_frac_neg=min_frac_neg)
def test_isotonic_regression():
    y = np.array([3, 7, 5, 9, 8, 7, 10])
    y_ = np.array([3, 6, 6, 8, 8, 8, 10])
    assert_array_equal(y_, isotonic_regression(y))

    x = np.arange(len(y))
    ir = IsotonicRegression(y_min=0.0, y_max=1.0)
    ir.fit(x, y)
    assert_array_equal(ir.fit(x, y).transform(x), ir.fit_transform(x, y))
    assert_array_equal(ir.transform(x), ir.predict(x))

    # check that it is immune to permutation
    perm = np.random.permutation(len(y))
    ir = IsotonicRegression(y_min=0.0, y_max=1.0)
    assert_array_equal(ir.fit_transform(x[perm], y[perm]), ir.fit_transform(x, y)[perm])
    assert_array_equal(ir.transform(x[perm]), ir.transform(x)[perm])
Example #7
class HTMLTime(object):
    """
    >>> htmlTime = HTMLTime(pathToIDX)
    >>> t = htmlTime(frameNumber)
    """

    def __init__(self, idx):
        super(HTMLTime, self).__init__()
        self.idx = idx

        # load .idx file using pandas
        df = read_table(
            self.idx, sep=r'\s+',
            names=['frame_number', 'frame_type', 'bytes', 'seconds']
        )
        x = np.array(df['frame_number'], dtype=float)
        y = np.array(df['seconds'], dtype=float)

        # train isotonic regression
        self.ir = IsotonicRegression(y_min=np.min(y), y_max=np.max(y))
        self.ir.fit(x, y)

        # frame number support
        self.xmin = np.min(x)
        self.xmax = np.max(x)

    def __call__(self, frameNumber):

        return self.ir.transform([min(self.xmax,
                                      max(self.xmin, frameNumber)
                                      )])[0]
Example #8
def test_isotonic_2darray_more_than_1_feature():
    # Ensure IsotonicRegression raises error if input has more than 1 feature
    X = np.arange(10)
    X_2d = np.c_[X, X]
    y = np.arange(10)

    msg = "should be a 1d array or 2d array with 1 feature"
    with pytest.raises(ValueError, match=msg):
        IsotonicRegression().fit(X_2d, y)

    iso_reg = IsotonicRegression().fit(X, y)
    with pytest.raises(ValueError, match=msg):
        iso_reg.predict(X_2d)

    with pytest.raises(ValueError, match=msg):
        iso_reg.transform(X_2d)
def calibration_isotonic_regression(model_name, model, prob_model,
                                    X_calibration, y_calibration, X_train):
    # 1. Trains the calibration regressor using the calibration data.
    # 2. Then takes the model's predicted probabilities on the test set and outputs
    #    calibrated probabilities, for further calculation of the calibrated std.
    # ref: https://arxiv.org/abs/1807.00263
    if model_name == 'Bayes_Ridge_model':
        y_hat_calibration, sem_hat_calibration = model.predict(X_calibration,
                                                               return_std=True)

    elif model_name == 'RF_model':
        y_hat_calibration = model.predict(X_calibration)
        sem_hat_calibration = np.sqrt(
            fci.random_forest_error(model, X_train, X_calibration))

    else:
        print('Error: Not able to calculate variance!')
        # y_hat, sem = model.predict(X_calibration)

    prob_per_int_y_calibration, prob_y_calibration, prob_y_calibration_expected, prob = count_entries_per_interval(
        y_calibration, y_hat_calibration, sem_hat_calibration)
    prob_model_y_calibration = predict_prob(y_calibration, y_hat_calibration,
                                            sem_hat_calibration)

    # isotonic regression
    from sklearn.isotonic import IsotonicRegression as IR
    ir = IR(out_of_bounds='clip')
    ir.fit(prob_model_y_calibration, prob_y_calibration)

    prob_test_calibrated = ir.transform(prob_model)
    return prob_test_calibrated
def test_isotonic_regression_ties_secondary_():
    """
    Test isotonic regression fit, transform and fit_transform
    against the "secondary" ties method and "pituitary" data from R
     "isotone" package, as detailed in: J. d. Leeuw, K. Hornik, P. Mair,
     Isotone Optimization in R: Pool-Adjacent-Violators Algorithm
    (PAVA) and Active Set Methods

    Set values based on pituitary example and
     the following R command detailed in the paper above:
    > library("isotone")
    > data("pituitary")
    > res1 <- gpava(pituitary$age, pituitary$size, ties="secondary")
    > res1$x

    `isotone` version: 1.0-2, 2014-09-07
    R version: R version 3.1.1 (2014-07-10)
    """
    x = [8, 8, 8, 10, 10, 10, 12, 12, 12, 14, 14]
    y = [21, 23.5, 23, 24, 21, 25, 21.5, 22, 19, 23.5, 25]
    y_true = [22.22222, 22.22222, 22.22222, 22.22222, 22.22222, 22.22222,
              22.22222, 22.22222, 22.22222, 24.25, 24.25]

    # Check fit, transform and fit_transform
    ir = IsotonicRegression()
    ir.fit(x, y)
    assert_array_almost_equal(ir.transform(x), y_true, 4)
    assert_array_almost_equal(ir.fit_transform(x, y), y_true, 4)
Example #12
class IsotonicCalibrator(BaseEstimator, TransformerMixin):
    """
    Calculates a likelihood ratio of a score value, provided it is from one of
    two distributions. Uses isotonic regression for interpolation.
    """
    def __init__(self, add_one=False):
        self.add_one = add_one
        self._ir = IsotonicRegression()

    def fit(self, X, y, **fit_params):
        # prevent extreme LRs
        if ('add_one' in fit_params and fit_params['add_one']) or self.add_one:
            X = np.append(X, [1, 0])
            y = np.append(y, [0, 1])

        prior = np.sum(y) / y.size
        weight = y * (1 - prior) + (1 - y) * prior
        self._ir.fit(X, y, sample_weight=weight)

        return self

    def transform(self, X):
        self.p1 = self._ir.transform(X)
        self.p0 = 1 - self.p1
        return to_odds(self.p1)
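# A minimal usage sketch for the calibrator above, assuming its module provides to_odds
# (in the lir library this is understood to be p / (1 - p)) and the usual numpy/sklearn
# imports; the scores are synthetic.
import numpy as np

rng = np.random.RandomState(0)
X = np.concatenate([rng.uniform(0.0, 0.6, 200),   # scores under H2 (y == 0)
                    rng.uniform(0.4, 1.0, 200)])  # scores under H1 (y == 1)
y = np.concatenate([np.zeros(200), np.ones(200)])

cal = IsotonicCalibrator(add_one=True)            # add_one guards against extreme LRs
cal.fit(X, y)
print(cal.transform(np.array([0.1, 0.5, 0.9])))   # odds-like likelihood ratios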
Example #13
def test_isotonic_regression():
    y = np.array([3, 7, 5, 9, 8, 7, 10])
    y_ = np.array([3, 6, 6, 8, 8, 8, 10])
    assert_array_equal(y_, isotonic_regression(y))

    x = np.arange(len(y))
    ir = IsotonicRegression(y_min=0., y_max=1.)
    ir.fit(x, y)
    assert_array_equal(ir.fit(x, y).transform(x), ir.fit_transform(x, y))
    assert_array_equal(ir.transform(x), ir.predict(x))

    # check that it is immune to permutation
    perm = np.random.permutation(len(y))
    ir = IsotonicRegression(y_min=0., y_max=1.)
    assert_array_equal(ir.fit_transform(x[perm], y[perm]),
                       ir.fit_transform(x, y)[perm])
    assert_array_equal(ir.transform(x[perm]), ir.transform(x)[perm])
def predict_probs(model, train_class, train_features, test_features, normalize_probs=None):
    """
    Fit a given binary classification model to training sample features
    and return predicted probabilities for the positive class for
    the training and test samples.
    """
    model.fit(train_features, train_class)
    train_prob, test_prob = [model.predict_proba(f)[:, 1] for f in (train_features, test_features)]
    if normalize_probs == "ROCSlope":
        # calibrate probabilities based on the estimated local slope
        # of the ROC curve
        chunk_size = 10  # number of instances for slope estimation
        n_train_pos = 301  # total number of positive (preictal) instances
        n_train_neg = 3766  # total negative (interictal)
        n_chunk_tot = 4000.0 / float(chunk_size)  # estimated total in test data
        # sort training data classes by predicted probability
        sort_order = train_prob.argsort()
        p_sorted = train_prob[sort_order]
        c_sorted = train_class[sort_order]
        ix = np.array(range(len(train_prob)))
        # loop over chunks
        for i_ch in range(1 + (len(train_prob) - 1) // chunk_size):
            p_chunk, c_chunk = [
                x[np.where((ix >= i_ch * chunk_size) & (ix < (i_ch + 1) * chunk_size))[0]] for x in (p_sorted, c_sorted)
            ]
            pmin = np.min(p_chunk)
            pmax = np.max(p_chunk)
            # compute TPR/FPR (relative to the entire training set)
            tpr = np.sum(c_chunk) / float(n_train_pos)
            fpr = np.sum(1 - c_chunk) / float(n_train_neg)
            # compute probability transformation for this chunk
            qc = (2.0 / np.pi) * np.arctan(tpr / (fpr + 1.0e-3 / float(n_train_neg)))
            qmin = np.max((0.0, qc - 0.5 / float(n_chunk_tot)))
            qmax = np.min((1.0, qc + 0.5 / float(n_chunk_tot)))
            # transform probabilities
            tr_p_ch = np.where((train_prob > pmin) & (train_prob <= pmax))[0]
            train_prob[tr_p_ch] = qmin + (train_prob[tr_p_ch] - pmin) * (qmax - qmin) / (pmax - pmin)
            te_p_ch = np.where((test_prob > pmin) & (test_prob <= pmax))[0]
            test_prob[te_p_ch] = qmin + (test_prob[te_p_ch] - pmin) * (qmax - qmin) / (pmax - pmin)
    elif normalize_probs == "LogShift":
        # shift probabilities in log(p/(1-p)) so that a fraction f_pre
        # of the samples has probability > 0.5, where f_pre is the
        # fraction of preictal samples in the training data
        f_pre = len(np.where(train_class)[0]) / float(len(train_class))
        train_th, test_th = [sorted(p)[int((1.0 - f_pre) * len(p))] for p in (train_prob, test_prob)]
        train_prob, test_prob = [
            (1.0 - pth) * p / (pth + p - 2.0 * pth * p)
            for (pth, p) in zip((train_th, test_th), (train_prob, test_prob))
        ]
    elif normalize_probs == "IsoReg":
        # fit an isotonic regression model to training probabilities
        # and use the model to transform all probabilities
        prob_model = IsotonicRegression(out_of_bounds="clip")
        prob_model.fit(train_prob, train_class)
        train_prob, test_prob = [prob_model.transform(p) for p in (train_prob, test_prob)]
    elif normalize_probs is not None:
        sys.exit("Invalid value of normalize_probs: " + str(normalize_probs))
    return (train_prob, test_prob)
Example #15
def test_assert_raises_exceptions():
    ir = IsotonicRegression()
    rng = np.random.RandomState(42)

    msg = "Found input variables with inconsistent numbers of samples"
    with pytest.raises(ValueError, match=msg):
        ir.fit([0, 1, 2], [5, 7, 3], [0.1, 0.6])

    with pytest.raises(ValueError, match=msg):
        ir.fit([0, 1, 2], [5, 7])

    msg = 'X should be a 1d array'
    with pytest.raises(ValueError, match=msg):
        ir.fit(rng.randn(3, 10), [0, 1, 2])

    msg = 'Isotonic regression input X should be a 1d array'
    with pytest.raises(ValueError, match=msg):
        ir.transform(rng.randn(3, 10))
Example #16
class IsotonicCalibrator(BaseEstimator, TransformerMixin):
    """
    Calculates a likelihood ratio of a score value, provided it is from one of
    two distributions. Uses isotonic regression for interpolation.
    """
    def __init__(self, add_one=False, add_misleading=0):
        """
        Arguments:
            add_one: deprecated (same as add_misleading=1)
            add_misleading: int: add misleading data points on both sides (default: 0)
        """
        if add_one:
            warnings.warn(
                'parameter `add_one` is deprecated; use `add_misleading=1` instead'
            )

        self.add_misleading = (1 if add_one else 0) + add_misleading
        self._ir = IsotonicRegression()

    def fit(self, X, y, **fit_params):
        # prevent extreme LRs
        if 'add_misleading' in fit_params:
            n_misleading = fit_params['add_misleading']
        elif 'add_one' in fit_params:
            warnings.warn(
                'parameter `add_one` is deprecated; use `add_misleading=1` instead'
            )
            n_misleading = 1 if fit_params['add_one'] else 0
        else:
            n_misleading = self.add_misleading

        if n_misleading > 0:
            X = np.concatenate([
                X,
                np.ones(n_misleading) * (X.max() + 1),
                np.ones(n_misleading) * (X.min() - 1)
            ])
            y = np.concatenate(
                [y, np.zeros(n_misleading),
                 np.ones(n_misleading)])

        prior = np.sum(y) / y.size
        weight = y * (1 - prior) + (1 - y) * prior
        self._ir.fit(X, y, sample_weight=weight)

        return self

    def transform(self, X):
        self.p1 = self._ir.transform(X)
        self.p0 = 1 - self.p1
        return to_odds(self.p1)
def bootstrap_calibrate_prob(labels,
                             weights,
                             probs,
                             n_calibrations=30,
                             threshold=0.,
                             symmetrize=False):
    """
    Bootstrap isotonic calibration (borrowed from tata-antares/tagging_LHCb):
    * randomly divide data into train-test
    * on train, isotonic regression is fitted and applied to test
    * on test, D2 and auc are calculated using the calibrated probs p(B+)
    
    :param probs: probabilities, numpy.array of shape [n_samples]
    :param labels: numpy.array of shape [n_samples] with labels
    :param weights: numpy.array of shape [n_samples]
    :param threshold: float, used to binarize labels to 0/1
    :param symmetrize: bool, do symmetric calibration, ex. for B+, B-
    
    :return: D2 array and auc array
    """

    import numpy as np
    from sklearn.isotonic import IsotonicRegression
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import roc_auc_score

    aucs = []
    D2_array = []
    labels = (labels > threshold) * 1

    for _ in range(n_calibrations):
        (train_probs, test_probs, train_labels, test_labels, train_weights,
         test_weights) = train_test_split(probs,
                                          labels,
                                          weights,
                                          train_size=0.5)
        iso_reg = IsotonicRegression(y_min=0, y_max=1, out_of_bounds='clip')
        if symmetrize:
            iso_reg.fit(np.r_[train_probs, 1 - train_probs],
                        np.r_[train_labels > 0, train_labels <= 0],
                        np.r_[train_weights, train_weights])
        else:
            iso_reg.fit(train_probs, train_labels, train_weights)

        probs_calib = iso_reg.transform(test_probs)
        alpha = (1 - 2 * probs_calib)**2
        aucs.append(
            roc_auc_score(test_labels, test_probs, sample_weight=test_weights))
        D2_array.append(np.average(alpha, weights=test_weights))
    return np.array(D2_array), np.array(aucs)
Example #18
class IsotonicCalibration(BaseEstimator, TransformerMixin):
    """
    Fits an isotonic regression model on the observations:
    y_pred -> y_target
    """
    def __init__(self):
        self.calibration = IsotonicRegression(out_of_bounds="clip")

    def fit(self, y_pred: pd.Series, y_true: pd.Series):
        self.calibration.fit(y_pred, y_true)
        return self

    def transform(self, y_pred):
        return self.calibration.transform(y_pred)
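# A minimal usage sketch, assuming the class's imports (pandas, sklearn's
# IsotonicRegression and the BaseEstimator/TransformerMixin bases) are in place; the
# series below are toy data.
import pandas as pd

y_pred = pd.Series([0.1, 0.4, 0.35, 0.8, 0.9])
y_true = pd.Series([0, 0, 1, 1, 1])

calib = IsotonicCalibration().fit(y_pred, y_true)
print(calib.transform(pd.Series([0.2, 0.6, 1.5])))  # 1.5 is clipped (out_of_bounds="clip")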
Example #19
    def cal(self, mod_rootdir=None, model_dirpaths=None, example_dirname='clean_example_data', n_samples=100):
        """
        Implements calibration
        :param mod_rootdir: directory containing a bunch of model directories to be used for calibration. Either this or
         model_dirpaths should be set.
        :param model_dirpaths: list of model directories to be used for calibration. Either this or mod_rootdir should
        be set.
        :param example_dirname: name of the (clean) example data directory in each model directory
        :param n_samples: number of noisy samples of each data point
        :return: numpy array of calibrated probabilities, ordered like the model_dirpaths (or sorted directories in
        mod_rootdir)
        """

        assert (mod_rootdir is not None) != (model_dirpaths is not None), "set either mod_rootdir or model_dirpaths"
        if model_dirpaths is None:
            print("deprecation warning: using mod_rootdir is deprecated in favor of explicitly setting model_dirpaths")
            model_dirpaths = utils.get_modeldirs(mod_rootdir)

        # get the data for calibration
        mags = self.get_cal_data(model_dirpaths, example_dirname, n_samples=n_samples)
        mags = mags.reshape(-1)
        y = np.array([utils.get_class(os.path.join(pth, 'config.json'), classtype='binary', file=True) for pth in
                      model_dirpaths])
        if n_samples is not None:
            y = y.reshape(-1, 1) * np.ones([1, n_samples])
            y = y.reshape(-1)



        # check for saved model
        irpath = self.get_irpath()
        if os.path.exists(irpath) and not self.overwrite:
            ir_model = joblib.load(irpath)
        else:
            # run the calibration & save model
            ir_model = IsotonicRegression(out_of_bounds='clip')
            clippedmags = np.clip(mags, np.percentile(mags, 10), np.percentile(mags, 90))
            # clippedmags = np.clip(mags, np.percentile(mags, 25), np.percentile(mags, 75))

            ir_model.fit(clippedmags, y)
            joblib.dump(ir_model, irpath)

        # get & return the calibrated probabilities
        pcal = ir_model.transform(mags)
        return pcal
Example #20
class LLRIsotonicRegression(LLR):
    """Log-likelihood ratio estimation by isotonic regression"""

    def __init__(self, equal_priors=False):
        super(LLRIsotonicRegression, self).__init__()
        self.equal_priors = equal_priors

    def fit(self, X, Y):

        self.prior = self._get_prior(X, Y)

        scores, ratios = self._get_scores_ratios(X, Y)

        y_min = np.min(ratios)
        y_max = np.max(ratios)
        self.ir = IsotonicRegression(y_min=y_min, y_max=y_max)
        self.ir.fit(scores, ratios)

        return self

    def toLogLikelihoodRatio(self, scores):
        """Get log-likelihood ratio given scores

        Parameters
        ----------
        scores : numpy array
            Test scores

        Returns
        -------
        llr : numpy array
            Log-likelihood ratio array with same shape as input `scores`
        """
        x_min = np.min(self.ir.X_)
        x_max = np.max(self.ir.X_)

        oob_min = np.where(scores < x_min)
        oob_max = np.where(scores > x_max)
        ok = np.where((scores >= x_min) * (scores <= x_max))

        calibrated = np.zeros(scores.shape)
        calibrated[ok] = self.ir.transform(scores[ok])
        calibrated[oob_min] = self.ir.y_min
        calibrated[oob_max] = self.ir.y_max
        return calibrated
def calibration_isotonic_regression(data_calibration,
                                    prob_model):  # calibration function

    y_true_calibration, y_hat_calibration, sem_hat_calibration = predict_w_DNN(
        data_calibration)

    prob_per_int_y_calibration, prob_y_calibration, prob_y_calibration_expected, prob = count_entries_per_interval(
        y_true_calibration, y_hat_calibration, sem_hat_calibration)
    prob_model_y_calibration = predict_prob(y_true_calibration,
                                            y_hat_calibration,
                                            sem_hat_calibration)

    # isotonic regression
    from sklearn.isotonic import IsotonicRegression as IR
    ir = IR(out_of_bounds='clip')
    ir.fit(prob_model_y_calibration, prob_y_calibration)

    prob_test_calibrated = ir.transform(prob_model)
    return prob_test_calibrated
Example #22
class IsotonicRecalibrator():
    def __init__(self, c, device):
        self.c = c
        self.ir = IR(out_of_bounds='clip')
        self.device = device

    def fit(self, output, label):
        x = output[:, self.c, :, :].reshape(-1).data.cpu().numpy().astype(
            float)
        y = (label == self.c).reshape(-1).data.cpu().numpy().astype(float)
        self.ir.fit(x, y)

    def predict(self, x):
        shape = x.shape
        x = x.reshape(-1).data.cpu().numpy().astype(float)

        return torch.tensor(self.ir.transform(x),
                            device=self.device,
                            dtype=torch.float).reshape(shape)
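# A minimal usage sketch, assuming torch and the IR alias for sklearn's
# IsotonicRegression used above are imported; the tensors are toy segmentation outputs
# (batch of 2, 3 classes, 4x4 spatial grid).
import torch

probs = torch.softmax(torch.randn(2, 3, 4, 4), dim=1)  # per-pixel class probabilities
labels = torch.randint(0, 3, (2, 4, 4))                # per-pixel ground-truth classes

recal = IsotonicRecalibrator(c=1, device="cpu")
recal.fit(probs, labels)                       # calibrate the class-1 channel
calibrated = recal.predict(probs[:, 1, :, :])  # same shape as the input slice
print(calibrated.shape)                        # torch.Size([2, 4, 4])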
class IDXHack(object):
    """

    Usage
    =====
    >>> from mediaeval_util.repere import IDXHack
    >>> frame2time = IDXHack(args['--idx'])
    >>> trueTime = frame2time(opencvFrame, opencvTime)

    """

    def __init__(self, idx=None):
        super(IDXHack, self).__init__()
        self.idx = idx

        if self.idx:

            # load .idx file using pandas
            df = read_table(
                self.idx, sep=r'\s+',
                names=['frame_number', 'frame_type', 'bytes', 'seconds']
            )
            x = np.array(df['frame_number'], dtype=float)
            y = np.array(df['seconds'], dtype=float)

            # train isotonic regression
            self.ir = IsotonicRegression(y_min=np.min(y), y_max=np.max(y))
            self.ir.fit(x, y)

            # frame number support
            self.xmin = np.min(x)
            self.xmax = np.max(x)

    def __call__(self, opencvFrame, opencvTime):

        if self.idx is None:
            return opencvTime

        return self.ir.transform([min(self.xmax,
                                      max(self.xmin, opencvFrame)
                                      )])[0]
Example #24
def calibrate_probabilities(prob_dict, instance_label_dict):
    labels = []
    probabilities = []
    print(len(prob_dict))
    print(len(instance_label_dict))
    for i in prob_dict:
        labels.append(instance_label_dict[i])
        probabilities.append(prob_dict[i])

    ir = IR(out_of_bounds='clip')
    ir.fit(probabilities, labels)  # fit ir to abstract-level precision and classes
    p_calibrated = ir.transform(probabilities)

    fig, ax = plt.subplots()
    fraction_of_positives, mean_predicted_value = calibration_curve(labels, p_calibrated, n_bins=10)
    ax.plot(mean_predicted_value, fraction_of_positives)
    fraction_of_positives, mean_predicted_value = calibration_curve(labels, probabilities, n_bins=10)
    ax.plot(mean_predicted_value, fraction_of_positives)

    plt.savefig('calibration_curve_on_data.png')
    return ir
Example #25
def bootstrap_calibrate_prob(labels, weights, probs, n_calibrations=30,
                             threshold=0., symmetrize=False):
    """
    Bootstrap isotonic calibration (borrowed from tata-antares/tagging_LHCb):
     * randomly divide data into train-test
     * on train, isotonic regression is fitted and applied to test
     * on test, D2 and auc are calculated using the calibrated probs p(B+)

    :param probs: probabilities, numpy.array of shape [n_samples]
    :param labels: numpy.array of shape [n_samples] with labels
    :param weights: numpy.array of shape [n_samples]
    :param threshold: float, used to binarize labels to 0/1
    :param symmetrize: bool, do symmetric calibration, ex. for B+, B-

    :return: D2 array and auc array
    """
    aucs = []
    D2_array = []
    labels = (labels > threshold) * 1

    for _ in range(n_calibrations):
        (train_probs, test_probs,
         train_labels, test_labels,
         train_weights, test_weights) = train_test_split(
            probs, labels, weights, train_size=0.5)
        iso_reg = IsotonicRegression(y_min=0, y_max=1, out_of_bounds='clip')
        if symmetrize:
            iso_reg.fit(np.r_[train_probs, 1-train_probs],
                        np.r_[train_labels > 0, train_labels <= 0],
                        np.r_[train_weights, train_weights])
        else:
            iso_reg.fit(train_probs, train_labels, train_weights)

        probs_calib = iso_reg.transform(test_probs)
        alpha = (1 - 2 * probs_calib) ** 2
        aucs.append(roc_auc_score(test_labels, test_probs,
                                  sample_weight=test_weights))
        D2_array.append(np.average(alpha, weights=test_weights))
    return np.array(D2_array), np.array(aucs)
def test_isotonic_regression_with_ties_in_differently_sized_groups():
    """
    Non-regression test to handle issue 9432:
    https://github.com/scikit-learn/scikit-learn/issues/9432

    Compare against output in R:
    > library("isotone")
    > x <- c(0, 1, 1, 2, 3, 4)
    > y <- c(0, 0, 1, 0, 0, 1)
    > res1 <- gpava(x, y, ties="secondary")
    > res1$x

    `isotone` version: 1.1-0, 2015-07-24
    R version: R version 3.3.2 (2016-10-31)
    """
    x = np.array([0, 1, 1, 2, 3, 4])
    y = np.array([0, 0, 1, 0, 0, 1])
    y_true = np.array([0., 0.25, 0.25, 0.25, 0.25, 1.])
    ir = IsotonicRegression()
    ir.fit(x, y)
    assert_array_almost_equal(ir.transform(x), y_true)
    assert_array_almost_equal(ir.fit_transform(x, y), y_true)
Example #28
def get_mapping(counts, lengths, bs=None, smoothed=True, verbose=False):
    if verbose:
        print("Computing relationship genomic distance & expected counts")

    if sparse.issparse(counts):
        gdis, means = _get_mapping_sparse(counts, lengths, bs)
    else:
        gdis, means = _get_mapping_dense(counts, lengths, bs)

    if not smoothed:
        return np.array([gdis, means])

    if verbose:
        print("Fitting Isotonic Regression")

    from sklearn.isotonic import IsotonicRegression
    ir = IsotonicRegression(increasing=False, out_of_bounds="clip")
    if gdis.min() > 0:
        y = np.array(means).flatten()
        x = np.arange(y.shape[0])
    elif gdis.min() == 0:
        y = np.array(means)[1:].flatten()
        x = np.arange(y.shape[0])
    else:
        y = np.array(means)[2:].flatten()
        x = np.arange(y.shape[0])

    mask = np.invert(np.isnan(y) | np.isinf(y) | (y == 0))
    ir.fit(x[mask], y[mask])
    means_fitted = ir.transform(x)
    if gdis.min() < 0:
        expected_counts = np.concatenate([[means[0], 0], means_fitted])
    elif gdis.min() == 0:
        expected_counts = np.concatenate([[0], means_fitted])
    else:
        expected_counts = means_fitted

    return np.array([gdis, expected_counts])
Example #29
class IsotonicCalibrator(BaseEstimator, TransformerMixin):
    """
    Calculates a likelihood ratio of a score value, provided it is from one of
    two distributions. Uses isotonic regression for interpolation.
    """
    def __init__(self, add_one=False):
        self.add_one = add_one
        self._ir = IsotonicRegression()

    def fit(self, X, y, **fit_params):
        X0, X1 = Xy_to_Xn(X, y)

        # prevent extreme LRs
        if ('add_one' in fit_params and fit_params['add_one']) or self.add_one:
            X0 = np.append(X0, 1)
            X1 = np.append(X1, 0)

        X0n = X0.shape[0]
        X1n = X1.shape[0]
        X, y = Xn_to_Xy(X0, X1)

        weight = np.concatenate([[X1n] * X0n, [X0n] * X1n])
        self._ir.fit(X, y, sample_weight=weight)

        return self

    def transform(self, X):
        if isinstance(X, np.matrix):
            X = X.A1

        posterior = self._ir.transform(X)

        self.p0 = (1 - posterior)
        self.p1 = posterior
        with np.errstate(divide='ignore'):
            return self.p1 / self.p0
Example #30
           # print("fold: " + str(j))
           idx0 = xfolds[xfolds.fold5 != j + 1].index
           idx1 = xfolds[xfolds.fold5 == j + 1].index
           x0 = xtrain[xtrain.index.isin(idx0)]
           x1 = xtrain[xtrain.index.isin(idx1)]
           y0 = y[y.index.isin(idx0)]
           y1 = y[y.index.isin(idx1)]
       
           y_raw = np.array(x1)[:,wfold]
           storage_mat[j,0] = log_loss(y1, y_raw)
           ymat_valid[idx1,0] = y_raw
           
           # fit an isotonic regression for iso scaling
           ir = IR( out_of_bounds = 'clip' )	
           ir.fit( np.array(x0)[:,wfold], y0 )
           y_iso = ir.transform((np.array(x1)[:,0]))           
           storage_mat[j,1] = log_loss(y1, y_iso)        
           ymat_valid[idx1,1] = y_iso
           storage_mat[j,7] = log_loss(y1, y_iso + y0.mean() - y_iso.mean())
           ymat_valid[idx1,7] = y_iso + y0.mean() - y_iso.mean()

            # fit a logistic regression for Platt scaling           
           lr = LR(C = c_val)														
           lr.fit( np.array(x0)[:,0].reshape( -1, 1 ), y0 )
           y_platt = lr.predict_proba(np.array(x1)[:,0].reshape(-1,1))[:,1]
           storage_mat[j,2] = log_loss(y1, y_platt)
           ymat_valid[idx1,2] = y_platt
           storage_mat[j,8] = log_loss(y1, y_platt + y0.mean() - y_platt.mean())
           ymat_valid[idx1,8] = y_platt + y0.mean() - y_platt.mean()
           
           y_ri = 0.5 * (y_raw + y_iso)
Example #31
p_train_all = read_csv(trainResult)['prob']
oriTrain = read_csv('../data/train.csv')
sameTrain = oriTrain[oriTrain['clickTime'] >= 190000].reset_index()
print(len(sameTrain), len(p_train_all))
part_sameTrain = sameTrain[(sameTrain['clickTime'] >= 200000)
                           & (sameTrain['clickTime'] < 290000)]

p_train = p_train_all.loc[part_sameTrain.index]
y_train = part_sameTrain['label']

ir = IR()
ir.fit(p_train, y_train)

oriResult = read_csv(
    '../data/calibration/ffm_mergeAppUser_s17_preAction_190000_no_Dist_noNum_t150_k8_l2e-05_2017-06-05-20-58-00.csv'
)
p_test = oriResult['prob']
p_calibrated = ir.transform(
    p_test)  # or ir.predict(p_test), which is equivalent

oriResult['new_prob'] = Series(p_calibrated)
oriResult.to_csv('../data/calibration/calib_temp.csv', index=False)
oriResult['nozero_new_prob'] = oriResult.apply(
    lambda x: x['new_prob'] if x['new_prob'] > 0 else x['prob'],
    axis='columns')

del oriResult['prob'], oriResult['new_prob']
oriResult.rename(columns={'nozero_new_prob': 'prob'}, inplace=True)
oriResult.to_csv('../data/calibration/calib_submit.csv', index=False)
Example #32
class InterpolatedIsotonicRegression(BaseEstimator, TransformerMixin,
                                     RegressorMixin):
    """Interpolated Isotonic Regression model.

        apply linear interpolation to transform piecewise constant isotonic
        regression model into piecewise linear model
    """

    def __init__(self, y_min=None, y_max=None, increasing=True,
                 out_of_bounds='nan'):
        self.y_min = y_min
        self.y_max = y_max
        self.increasing = increasing
        self.out_of_bounds = out_of_bounds

    def fit(self, X, y, sample_weight=None):
        """Fit the model using X, y as training data.
        Parameters
        ----------
        X : array-like, shape=(n_samples,)
            Training data.
        y : array-like, shape=(n_samples,)
            Training target.
        sample_weight : array-like, shape=(n_samples,), optional, default: None
            Weights. If set to None, all weights will be set to 1 (equal
            weights).
        Returns
        -------
        self : object
            Returns an instance of self.
        Notes
        -----
        X is stored for future use, as `transform` needs X to interpolate
        new input data.
        """
        self.iso_ = IsotonicRegression(y_min=self.y_min,
                                       y_max=self.y_max,
                                       increasing=self.increasing,
                                       out_of_bounds=self.out_of_bounds)
        self.iso_.fit(X, y, sample_weight=sample_weight)

        p = self.iso_.transform(X)
        change_mask1 = (p - np.roll(p, 1)) > 0
        change_mask2 = np.roll(change_mask1, -1)
        change_mask1[0] = True
        change_mask1[-1] = True
        change_mask2[0] = True
        change_mask2[-1] = True

        self.iso_interp1_ = interp1d(X[change_mask1],
                                     p[change_mask1],
                                     bounds_error=False,
                                     fill_value=(0., 1.))
        self.iso_interp2_ = interp1d(X[change_mask2],
                                     p[change_mask2],
                                     bounds_error=False,
                                     fill_value=(0., 1.))

        return self

    def transform(self, T):
        """Transform new data by linear interpolation
        Parameters
        ----------
        T : array-like, shape=(n_samples,)
            Data to transform.
        Returns
        -------
        T_ : array, shape=(n_samples,)
            The transformed data
        """
        return 0.5 * (self.iso_interp1_(T) + self.iso_interp2_(T))

    def predict(self, T):
        """Predict new data by linear interpolation.
        Parameters
        ----------
        T : array-like, shape=(n_samples,)
            Data to transform.
        Returns
        -------
        T_ : array, shape=(n_samples,)
            Transformed data.
        """
        return self.transform(T)
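# A minimal usage sketch for the wrapper above, assuming its imports
# (IsotonicRegression, scipy's interp1d and the sklearn mixins) are available;
# predict() averages the two breakpoint interpolators built in fit().
import numpy as np

X = np.array([0., 1., 2., 3., 4., 5.])
y = np.array([0., 0., 1., 0., 1., 1.])

iir = InterpolatedIsotonicRegression(y_min=0., y_max=1., out_of_bounds='clip')
iir.fit(X, y)
print(iir.predict(np.array([0.5, 2.5, 4.5])))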
# train/test split (in half)

train_end = y.shape[0] // 2
test_start = train_end + 1

y_train = y[0:train_end]
y_test = y[test_start:]

p_train = p[0:train_end]
p_test = p[test_start:]

###

ir = IR(out_of_bounds="clip")  # out_of_bounds param needs scikit-learn >= 0.15
ir.fit(p_train, y_train)
p_calibrated = ir.transform(p_test)

p_calibrated[np.isnan(p_calibrated)] = 0

###

acc = accuracy_score(y_test, np.round(p_test))
acc_calibrated = accuracy_score(y_test, np.round(p_calibrated))

auc = AUC(y_test, p_test)
auc_calibrated = AUC(y_test, p_calibrated)

ll = log_loss(y_test, p_test)
ll_calibrated = log_loss(y_test, p_calibrated)

print "accuracy - before/after:", acc, "/", acc_calibrated
Example #34
class IsotonicCalibrator(BaseEstimator, RegressorMixin):
    """Probability calibration with isotonic regression.

    Note
    ----
    This class backports and extends `sklearn.isotonic.IsotonicRegression`.
    """
    def __init__(self,
                 y_min=None,
                 y_max=None,
                 increasing=True,
                 interpolation=False):
        """Constructor.

        Parameters
        ----------
        * `y_min` [optional]:
            If not `None`, set the lowest value of the fit to `y_min`.

        * `y_max` [optional]:
            If not `None`, set the highest value of the fit to `y_max`.

        * `increasing` [boolean or string, default=`True`]:
            If boolean, whether or not to fit the isotonic regression with `y`
            increasing or decreasing.
            The string value `"auto"` determines whether `y` should increase or
            decrease based on the Spearman correlation estimate's sign.

        * `interpolation` [boolean, default=`False`]:
            Whether linear interpolation is enabled or not.
        """
        self.y_min = y_min
        self.y_max = y_max
        self.increasing = increasing
        self.interpolation = interpolation

    def fit(self, T, y, sample_weight=None):
        """Fit using `T`, `y` as training data.

        Parameters
        ----------
        * `T` [array-like, shape=(n_samples,)]:
            Training data.

        * `y` [array-like, shape=(n_samples,)]:
            Training target.

        * `sample_weight` [array-like, shape=(n_samples,), optional]:
            Weights. If set to None, all weights will be set to 1.

        Returns
        -------
        * `self` [object]:
            `self`.

        Notes
        -----
        `T` is stored for future use, as `predict` needs T to interpolate
        new input data.
        """
        # Check input
        T = column_or_1d(T)

        # Fit isotonic regression
        self.ir_ = IsotonicRegression(y_min=self.y_min,
                                      y_max=self.y_max,
                                      increasing=self.increasing,
                                      out_of_bounds="clip")
        self.ir_.fit(T, y, sample_weight=sample_weight)

        # Interpolators
        if self.interpolation:
            p = self.ir_.transform(T)

            change_mask1 = (p - np.roll(p, 1)) > 0
            change_mask2 = np.roll(change_mask1, -1)
            change_mask1[0] = True
            change_mask1[-1] = True
            change_mask2[0] = True
            change_mask2[-1] = True

            self.interp1_ = interp1d(T[change_mask1],
                                     p[change_mask1],
                                     bounds_error=False,
                                     fill_value=(0., 1.))
            self.interp2_ = interp1d(T[change_mask2],
                                     p[change_mask2],
                                     bounds_error=False,
                                     fill_value=(0., 1.))

        return self

    def predict(self, T):
        """Calibrate data.

        Parameters
        ----------
        * `T` [array-like, shape=(n_samples,)]:
            Data to calibrate.

        Returns
        -------
        * `Tt` [array, shape=(n_samples,)]:
            Calibrated data.
        """
        if self.interpolation:
            T = column_or_1d(T)
            return 0.5 * (self.interp1_(T) + self.interp2_(T))

        else:
            return self.ir_.transform(T)
Example #35
class CalibratedRegression:
    def __init__(self,
                 X,
                 y,
                 model,
                 cal_prop=0.2,
                 cdf_method='bayesian',
                 pp=None,
                 pp_params=None):
        '''Initializes the class

        Parameters
        ----------
        X : np.array
            Data X
        y : np.array
            Data y
        model : pymc3 model or sklearn or statsmodels model
            The model to be calibrated
        cal_prop : float, optional, default: None
            The proportion of the training set to be used to make the calibration set
        cdf_method : string, optional, default: 'bayesian'
            Whether it is a Bayesian model, statsmodel or sklearn model
            Must be 'bayesian', 'bootstrap' or 'statsmodels'
        pp : function, optional, default: None
            The function to calculate the posterior predictive. Must return a numpy array.
        pp_params : dict, default: None
            Any additional parameters to be passed into the posterior predictive function.
        '''
        # data
        self.X = X
        self.y = y

        # model
        self.model = model
        self.posterior_predictive = pp
        self.pp_params = pp_params

        # calibration features
        self.calibration_dataset = None
        self.isotonic = None

        # split up training and calibration sets
        self.X_train, self.X_cal, self.y_train, self.y_cal = train_test_split(
            X, y, test_size=cal_prop)

        if cdf_method in ['bayesian', 'bootstrap', 'statsmodels']:
            self.cdf_method = cdf_method
        else:
            raise ValueError(
                "cdf_method must be of type 'bayesian', 'bootstrap', or 'statsmodels'"
            )

    def bootstrap():
        '''Utility function to bootstrap.'''
        pass

    def fit(self):
        '''Fit underlying model

        Creates the calibration dataset and fits an IsotonicRegression on this dataset

        Returns
        -------
        self : CalibratedRegression object
        Returns a fit instance of the CalibratedRegression class
        '''

        if self.cdf_method == 'bayesian':
            # there should be a posterior_predictive function
            assert self.posterior_predictive is not None and self.pp_params is not None
            # call the posterior predictive function
            self.posterior_predictive_cal = self.posterior_predictive(
                self.X_cal, **self.pp_params)

        elif self.cdf_method == 'bootstrap':
            # get CDF from bootstrapping
            pass

        elif self.cdf_method == 'statsmodels':
            # get CDF from statsmodels
            pass

        # create the calibration dataset
        self.calibration_dataset, self.predicted_cdf, self.empirical_cdf = self.create_calibration_dataset(
        )

        # fit the isotonic regression
        self.isotonic = IsotonicRegression(out_of_bounds='clip')
        self.isotonic.fit(self.empirical_cdf, self.predicted_cdf)

        return self

    def create_calibration_dataset(self,
                                   X=None,
                                   y=None,
                                   pp=None,
                                   pp_params=None):
        '''Creates a Pandas dataframe which has the calibration dataset

        Parameters
        ----------
        X : np.array, optional, default: None
            Data X. Uses self.X_cal if None
        y : np.array, optional, default: None
            Data y. Uses self.y_cal if None
        pp : function, optional, default: None
            The function to calculate the posterior predictive. Must return a numpy array.
            Uses self.posterior_predictive if None
        pp_params : dict, default: None
            Any additional parameters to be passed into the posterior predictive function.
            Uses self.pp_params if None

        Returns
        -------
        calibration_dataset : Pandas dataframe
            This contains X, y, predicted_cdf and empirical_cdf
        '''
        # check conditions
        X = X if X is not None else self.X_cal
        y = y if y is not None else self.y_cal
        pp = pp if pp is not None else self.posterior_predictive
        pp_params = pp_params if pp_params is not None else self.pp_params

        post_pred = pp(X, **pp_params)
        predicted_cdf = self.pcdf(post_pred, y)  # predicted CDF
        empirical_cdf = self.ecdf(predicted_cdf)  # empirical CDF

        # putting results in a Pandas dataframe
        calibration_dataset = pd.DataFrame({
            'X': X,
            'y': y,
            'predicted_cdf': predicted_cdf,
            'empirical_cdf': empirical_cdf
        })

        return calibration_dataset[[
            'X', 'y', 'predicted_cdf', 'empirical_cdf'
        ]], predicted_cdf, empirical_cdf

    def predict(self, X_test, y_pred, quantiles):
        '''Return point estimates and PIs.

        Parameters
        ----------
        X_test : np.array
            Test data
        y_pred : np.array
            The predictions made by the model
        quantiles : list
            List of floats between 0 and 1 to be calibrated. Example: [0.05, 0.5, 0.95]

        Returns
        -------
        posterior_predictive_test : np.array
            Posterior predictive samples for the test data
        new_quantiles : list
            List of new floats, also between 0 and 1, that are the calibrated version of the input quantiles

        '''
        assert self.isotonic is not None, 'Call fit() first'
        new_quantiles = self.predict_quantiles(quantiles)

        # saving variables
        self.X_test = X_test
        self.y_pred = y_pred
        self.posterior_predictive_test = self.posterior_predictive(
            X_test, **self.pp_params)

        # returning quantiles
        return self.posterior_predictive_test, new_quantiles

    def predict_quantiles(self, quantiles):
        '''Returns transformed quantiles according to the isotonic regression model

        Parameters
        ----------
        quantiles : list
            List of floats between 0 and 1 to be calibrated. Example: [0.05, 0.5, 0.95]

        Returns
        -------
        quantiles_ : list
            List of new floats, also between 0 and 1, that are the calibrated version of the input quantiles

        '''
        assert self.isotonic is not None, 'Call fit() first'
        return self.isotonic.transform(quantiles)

    def pcdf(self, post_pred, y):
        '''Gets Predicted CDF

        Gets the predicted cdf, also represented as H(x_t)(y_t) in the paper.

        Parameters
        ----------
        post_pred : np.array
            Posterior predictive samples generated by the model (at a particular quantile).
        y : np.array
            The true data

        Returns
        -------
        pcdf_ : np.array
            The predicted cdf

        '''
        return np.mean(post_pred <= y.reshape(-1, 1), axis=1)

    def ecdf(self, predicted_cdf):
        '''Empirical CDF.

        Gets the empirical cdf, also represented as $\hat{P}[H(x_t)(y_t)]$ in the paper.
        Counts how many points in the dataset have a pcdf <= to the pcdf of a point for all points in the dataset.

        Parameters
        ----------
        predicted_cdf : np.array
            Predicted cdf. Can be generated by calling self.pcdf for posterior predictive samples at a particular quantile.

        Returns
        -------
        ecdf_ : np.array
            The empirical cdf

        '''
        empirical_cdf = np.zeros(len(predicted_cdf))
        for i, p in enumerate(predicted_cdf):
            empirical_cdf[i] = np.sum(predicted_cdf <= p) / len(predicted_cdf)
        return empirical_cdf

    def plot_calibration_curve(self, ax):
        '''Plot calibration curve as described in paper (figure 3b).

        Parameters
        ----------
        ax : matplotlib axis object
            Axis to plot on

        Returns
        -------
        ax : matplotlib axis object
            Axis after it has been plotted on

        '''
        assert self.empirical_cdf is not None, 'Call fit() first'
        ax.scatter(self.predicted_cdf, self.empirical_cdf, alpha=0.7)
        ax.plot([0, 1], [0, 1],
                '--',
                color='grey',
                label='Perfect calibration')
        ax.set_xlabel('Predicted', fontsize=17)
        ax.set_ylabel('Empirical', fontsize=17)
        ax.set_title('Predicted CDF vs Empirical CDF', fontsize=17)
        ax.legend(fontsize=17)
        return ax

    def plot_diagnostic_curve(self, ax, X_test, y_test):
        '''Plot diagnostic curve as described in paper (figure 3c).

        Parameters
        ----------
        ax : matplotlib axis object
            Axis to plot on
        X_test : np.array
            Test data (X)
        y_test : np.array
            Test data (y). These are the predictions that need to be calibrated.

        Returns
        -------
        ax : matplotlib axis object
            Axis after it has been plotted on

        '''
        conf_level_lower_bounds = np.arange(start=0.025, stop=0.5, step=0.025)
        conf_levels = 1 - 2 * conf_level_lower_bounds
        unc_pcts = []
        cal_pcts = []

        for cl_lower in conf_level_lower_bounds:
            quants = [cl_lower, 1 - cl_lower]
            post_pred_test, new_quantiles = self.predict(
                X_test, y_test, quants)

            cal_lower, cal_upper = np.quantile(post_pred_test,
                                               new_quantiles,
                                               axis=1)
            unc_lower, unc_upper = np.quantile(post_pred_test, quants, axis=1)

            perc_within_unc = np.mean((y_test <= unc_upper)
                                      & (y_test >= unc_lower))
            perc_within_cal = np.mean((y_test <= cal_upper)
                                      & (y_test >= cal_lower))

            unc_pcts.append(perc_within_unc)
            cal_pcts.append(perc_within_cal)

        ax.plot([0, 1], [0, 1], '--', color='grey')
        ax.plot(conf_levels,
                unc_pcts,
                '-o',
                color='purple',
                label='uncalibrated')
        ax.plot(conf_levels, cal_pcts, '-o', color='red', label='calibrated')
        ax.legend(fontsize=14)
        ax.set_title('Diagnostic Plot', fontsize=17)
        ax.set_xlabel('Predicted Confidence Level', fontsize=17)
        ax.set_ylabel('Observed Confidence Level', fontsize=17)
        return ax

    def plot_intervals(self, ax, X_test, y_test, quantiles=[0.05, 0.5, 0.95]):
        '''Plot uncalibrated and calibrated predictive intervals.

        Parameters
        ----------
        ax : matplotlib axis object
            Axis to plot on
        X_test : np.array
            Test data (X)
        y_test : np.array
            Test data (y). These are the predictions that need to be calibrated.
        quantiles : list, optional, default=[0.05, 0.5, 0.95]
            List of floats between 0 and 1 to be calibrated.

        Returns
        -------
        ax : matplotlib axis object
            Axis after it has been plotted on

        '''
        assert len(ax) == 2, 'Need to provide two axes'

        post_pred_test, new_quantiles = self.predict(X_test, y_test, quantiles)
        cal_lower, cal_median, cal_upper = np.quantile(post_pred_test,
                                                       new_quantiles,
                                                       axis=1)
        unc_lower, unc_median, unc_upper = np.quantile(post_pred_test,
                                                       quantiles,
                                                       axis=1)
        perc_within_unc = np.mean((y_test <= unc_upper)
                                  & (y_test >= unc_lower))
        perc_within_cal = np.mean((y_test <= cal_upper)
                                  & (y_test >= cal_lower))

        ax[0].plot(X_test, y_test, 'o', color='black', alpha=0.2, markersize=3)
        ax[0].set_title(
            f'Uncalibrated: {100*perc_within_unc:.2f}% of the test points within {round((1-2*quantiles[0])*100)}% interval',
            fontsize=17)
        ax[0].set_xlabel('X', fontsize=17)
        ax[0].set_ylabel('y', fontsize=17)
        ax[0].fill_between(X_test,
                           unc_lower,
                           unc_upper,
                           color='green',
                           alpha=0.2)
        ax[0].plot(
            X_test,
            unc_median,
            label=f'Median. MSE={mean_squared_error(y_test, unc_median):.2f}')
        ax[0].legend(fontsize=17)

        ax[1].plot(X_test, y_test, 'o', color='black', alpha=0.2, markersize=3)
        ax[1].set_title(
            f'Calibrated: {100*perc_within_cal:.2f}% of the test points within {round((1-2*quantiles[0])*100)}% interval',
            fontsize=17)
        ax[1].set_xlabel('X', fontsize=17)
        ax[1].set_ylabel('y', fontsize=17)
        ax[1].fill_between(X_test,
                           cal_lower,
                           cal_upper,
                           color='yellow',
                           alpha=0.2)
        ax[1].plot(
            X_test,
            cal_median,
            label=f'Median. MSE={mean_squared_error(y_test, cal_median):.2f}')
        ax[1].legend(fontsize=17)

        return ax, (cal_lower, cal_median, cal_upper), (unc_lower, unc_median,
                                                        unc_upper)
def calculate_probability_distribution(tree, instances, index, cal_method=None):
    """Return the (calibrated) class distribution of `tree` for instance `index`.

    `instances` is the calibration set; `cal_method` selects the calibration
    method: None (raw tree output), 'Platt', 'Isotonic', 'ICP' (not
    implemented) or 'Venn1' (Venn-ABERS).
    """
    if cal_method is None:
        return tree.distribution_for_instance(instances.get_instance(index))

    elif cal_method == 'Platt':
        # Collect the tree's scores (rescaled from [0, 1] to [-1, 1]) and the
        # true labels on the calibration set.
        p_train = np.zeros(shape=(instances.num_instances, 1))
        y_train = np.zeros(shape=(instances.num_instances, 1))
        for i, instance in enumerate(instances):
            dist = tree.distribution_for_instance(instance)
            p_train[i] = [(dist[1] - 0.5) * 2.0]
            y_train[i] = [instance.get_value(instance.class_index)]

        score = (tree.distribution_for_instance(instances.get_instance(index))[1] - 0.5) * 2.0
        tmp = np.array([[score]])

        # Degenerate calibration set (only one class present): fall back to
        # the raw tree output.
        if np.sum(y_train) in (len(y_train), 0):
            return tree.distribution_for_instance(instances.get_instance(index))

        warnings.filterwarnings("ignore", category=FutureWarning)
        lr = LR(solver='lbfgs')
        lr.fit(p_train, np.ravel(y_train, order='C'))
        return lr.predict_proba(tmp.reshape(1, -1))[0]

    elif cal_method == 'Isotonic':
        # Collect the tree's P(class 1) scores and the true labels on the
        # calibration set.
        p_train = np.zeros(shape=(instances.num_instances, 1))
        y_train = np.zeros(shape=(instances.num_instances, 1))
        for i, instance in enumerate(instances):
            dist = tree.distribution_for_instance(instance)
            p_train[i] = [dist[1]]
            y_train[i] = [instance.get_value(instance.class_index)]

        score = tree.distribution_for_instance(instances.get_instance(index))[1]

        # Degenerate calibration set: fall back to the raw tree output.
        if np.sum(y_train) in (len(y_train), 0):
            return tree.distribution_for_instance(instances.get_instance(index))

        ir = IR(out_of_bounds='clip')
        ir.fit(np.ravel(p_train, order='C'), np.ravel(y_train, order='C'))
        p = ir.transform(np.asarray([score]))[0]
        # p is the calibrated P(class 1); return [P(class 0), P(class 1)] so
        # the ordering matches the other branches.
        return [1.0 - p, p]

    # elif cal_method == 'ProbabilityCalibrationTree':  # not implemented

    elif cal_method == 'ICP':
        # Inductive conformal prediction: not implemented.
        pass

    elif cal_method == 'Venn1':
        # Venn-ABERS: calibration points are (score, label) pairs, where the
        # score is the larger of the two class probabilities.
        calibr_pts = []
        for instance in instances:
            dist = tree.distribution_for_instance(instance)
            score = dist[0] if dist[1] < dist[0] else dist[1]
            calibr_pts.append((score, instance.get_value(instance.class_index)))

        dist = tree.distribution_for_instance(instances.get_instance(index))
        score = dist[0] if dist[1] < dist[0] else dist[1]

        # p0 and p1 are the lower and upper Venn-ABERS probability estimates.
        p0, p1 = VennABERS.ScoresToMultiProbs(calibr_pts, [score])
        return [p0, p1]
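# Hedged usage note (assumes the python-weka-wrapper style `tree` and
# `instances` objects used above):
#   dist = calculate_probability_distribution(tree, instances, i, cal_method='Isotonic')
# returns the tree's class distribution for instance i, recalibrated on the
# scores the tree assigns to the calibration instances; with cal_method='Venn1'
# the returned pair is the lower/upper Venn-ABERS probability estimate instead.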
# Standalone demo: fit an isotonic regression to noisy, roughly increasing
# data and evaluate it on a dense grid. The imports and the value of N are
# assumptions; the original snippet expected them to be defined earlier.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.isotonic import IsotonicRegression
from sklearn.utils import check_random_state

N = 50  # number of samples (assumed)
X = np.sort(np.random.rand(N))
rs = check_random_state(312312)
Y = rs.randint(-10, 10, size=(N,)) + 10. * np.log1p(np.arange(N))

L = 20 / 0.4  # constant used in the (commented-out) lower-bound computation below
# L = 3 * 100

plt.plot(X, Y, 'o', label="Y")

idx_vector = np.arange(N)

ir = IsotonicRegression()
ir = ir.fit(X, Y)
Y_iso = ir.transform(X)
plt.plot(X, Y_iso, '-d', label="iso(Y)")
plt.legend()

# Evaluate the fit on a dense grid; outside the training range the default
# out_of_bounds='nan' yields NaN, so clamp to the boundary values instead.
T = np.linspace(0.001, 0.999, 50)
f = ir.predict(T)
f[T < X[0]] = Y_iso[0]
f[T > X[-1]] = Y_iso[-1]

delta = 0.1

# for idx in range(len(T)):
#     X_new = T[idx]
#     if X_new < X[0]:
#         lb = -L * np.abs(X_new - X[0]) - np.sqrt(2*np.log((N**2 + N)/delta))
#         lbm = 1
Beispiel #38
0
class Forecaster(nn.Module):
    def __init__(self, args):
        super(Forecaster, self).__init__()
        self.args = args
        self.iso_transform = None  # set by recalibrate(); checked in apply_recalibrate()

    def eval_all(self, bx, by):
        br = torch.rand(bx.shape[0], 1, device=bx.device)
        mean, stddev = self.forward(bx=bx, br=br)
        cdf = 0.5 * (1.0 + torch.erf((by - mean) / stddev / math.sqrt(2)))

        loss_cdf = torch.abs(cdf - br).mean()

        eps = 1e-5
        loss_cdf_kl = cdf * (torch.log(cdf + eps) - torch.log(br + eps)) + \
                      (1 - cdf) * (torch.log(1 - cdf + eps) - torch.log(1 - br + eps))
        loss_cdf_kl = loss_cdf_kl.mean()

        loss_stddev = stddev.mean()

        # loss_l2 = ((by - mean) ** 2).mean()

        # Log likelihood of by under the predicted Gaussian distribution
        loss_nll = torch.log(stddev) + math.log(2 * math.pi) / 2.0 + ((
            (by - mean) / stddev)**2 / 2.0)
        loss_nll = loss_nll.mean()

        return cdf, loss_cdf * (
            1 - self.args.klcoeff
        ) + loss_cdf_kl * self.args.klcoeff, loss_stddev, loss_nll

    def eval_in_batch(self, bx, by, batch_size):
        pass

    def recalibrate(self, bx, by):
        with torch.no_grad():
            cdf = self.eval_all(bx, by)[0].cpu().numpy()[:, 0].astype(float)

        cdf = np.sort(cdf)
        lin = np.linspace(0, 1, int(cdf.shape[0]))

        # Insert an extra 0 and 1 to ensure the range is always [0, 1], and trim CDF for numerical stability
        cdf = np.clip(cdf, a_max=1.0 - 1e-6, a_min=1e-6)
        cdf = np.insert(np.insert(cdf, -1, 1), 0, 0)
        lin = np.insert(np.insert(lin, -1, 1), 0, 0)

        self.iso_transform = IsotonicRegression()
        self.iso_transform.fit_transform(cdf, lin)

    def apply_recalibrate(self, cdf):
        if self.iso_transform is not None:
            # If input tensor output tensor
            # If input numpy array output numpy array
            is_torch = False
            if isinstance(cdf, torch.Tensor):
                device = cdf.device
                cdf = cdf.cpu().numpy()
                is_torch = True

            original_shape = cdf.shape
            new_cdf = np.reshape(self.iso_transform.transform(cdf.flatten()),
                                 original_shape)
            if is_torch:
                new_cdf = torch.from_numpy(new_cdf).to(device)
            return new_cdf
        else:
            return cdf
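# A minimal, self-contained sketch of the same recalibration idea (plain
# numpy/sklearn, not the Forecaster API above): fit an isotonic map from the
# model's PIT values (its predicted CDF evaluated at the observations) to the
# uniform quantiles a perfectly calibrated model would produce.
import numpy as np
from sklearn.isotonic import IsotonicRegression

pit = np.sort(np.random.beta(2.0, 5.0, size=500))    # toy, miscalibrated PIT values
uniform = np.linspace(0.0, 1.0, pit.shape[0])         # target under perfect calibration
recal = IsotonicRegression(out_of_bounds="clip").fit(pit, uniform)
print(recal.transform(np.array([0.1, 0.5, 0.9])))     # recalibrated CDF values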
Beispiel #39
0
class QuantileCalibration:
    """Quantile calibration based on Kuleshov et al. (2018):
    https://arxiv.org/abs/1807.00263

    Learns the relationship between predicted and empirical quantiles of the
    posterior predictive based on observations using isotonic regression.
    """
    def __init__(self):
        self.isotonic = None
        self.isotonic_inverse = None

    def fit(self, y, post_pred):
        """Train isotonic regression on predicted and empirical quantiles

        Constructs a recalibration dataset from the posterior predictive and
        observations of the response variable Y. Learns the inverse relationship
        between the two using isotonic regression.

        Args:
            y: the response variable, array of shape (T,) or (T, 1)
            post_pred: samples of the posterior predictive, array of shape (N, T)

        Returns:
            self: a fitted instance of the QuantileCalibration class
        """

        assert y.shape[0] == post_pred.shape[
            1], "y.shape[0] must match post_pred.shape[1]"

        # Build a recalibration dataset
        predicted, empirical = make_cal_dataset(y, post_pred)

        # Fit the recalibration dataset in forward mode: from predicted to empirical
        self.isotonic = IsotonicRegression(out_of_bounds="clip")
        self.isotonic.fit(predicted, empirical)

        # Fit the recalibration dataset in reverse: from empirical to predicted
        self.isotonic_inverse = IsotonicRegression(out_of_bounds="clip")
        self.isotonic_inverse.fit(empirical, predicted)

        return self

    def transform(self, quantiles):
        """Forward transform the values of the predicted quantiles to the
        empirical quantiles using a previously learned relationship.

        Args:
            quantiles: a 1-dimensional array

        Returns:
            empirical_quantiles: the values of the empirical quantiles corresponding
            to the predicted quantiles in the posterior predictive,
            a 1-dimensional array
        """
        assert self.isotonic is not None, "The calibration instance must be fit first"
        empirical_quantiles = self.isotonic.transform(quantiles)
        return empirical_quantiles

    def inverse_transform(self, quantiles):
        """Inverse transform the values of the desired (empirical) quantiles to the
        predicted quantiles using a previously learned relationship.

        Args:
            quantiles: a 1-dimensional array

        Returns:
            predicted_quantiles: the values of the predicted quantiles corresponding
            to the desired quantiles in the posterior predictive,
            a 1-dimensional array
        """
        assert self.isotonic_inverse is not None, "The calibration instance must be fit first"
        predicted_quantiles = self.isotonic_inverse.transform(quantiles)
        return predicted_quantiles
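# Minimal sketch of the recalibration dataset this class relies on (this is an
# assumption about what `make_cal_dataset` computes, shown only to illustrate
# the idea): the predicted quantile of each observation under the posterior
# predictive, paired with the empirical CDF of those predicted quantiles.
import numpy as np
from sklearn.isotonic import IsotonicRegression

rng = np.random.default_rng(0)
post_pred = rng.normal(size=(1000, 50))               # (N samples, T observations)
y = rng.normal(loc=0.5, size=50)                      # observations, deliberately shifted
predicted = (post_pred <= y).mean(axis=0)             # predicted quantile of each y_t
empirical = np.array([(predicted <= p).mean() for p in predicted])
iso = IsotonicRegression(out_of_bounds="clip").fit(predicted, empirical)
print(iso.transform(np.array([0.05, 0.5, 0.95])))     # predicted -> empirical quantiles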
Beispiel #40
0
 def transform(self, X):
     return IsotonicRegression.transform(self, T=X)
Beispiel #41
0
class LLRIsotonicRegression(BaseEstimator, TransformerMixin):

    def __init__(self, equal_priors=False, y_min=1e-4, y_max=1. - 1e-4,
                 plottable=False):
        super(LLRIsotonicRegression, self).__init__()
        self.equal_priors = equal_priors
        self.y_min = y_min
        self.y_max = y_max
        self.plottable = plottable

    def fit(self, X, y):

        X, y = keepZeroOrOne(X, y, reshape=(-1, ))

        if self.plottable:
            self.X_ = X
            self.y_ = y

        if self.equal_priors:

            positive = X[y == 1]
            n_positive = len(positive)
            negative = X[y == 0]
            n_negative = len(negative)

            if n_positive > n_negative:
                # downsample positive examples
                positive = np.random.choice(positive,
                                            size=(n_negative, ),
                                            replace=False)
                n_positive = len(positive)

            else:
                # downsample negative examples
                negative = np.random.choice(negative,
                                            size=(n_positive, ),
                                            replace=False)
                n_negative = len(negative)

            X = np.hstack([negative, positive])
            y = np.hstack([
                np.zeros((n_negative, ), dtype=int),
                np.ones((n_positive, ), dtype=int)
            ])

        n_samples = X.shape[0]

        # structured-array field names must be plain str (numpy compatibility hack)
        _X_, f8 = str('X'), str('f8')
        _y_, i1 = str('y'), str('i1')

        Xy = np.zeros((n_samples, ), dtype=[(_X_, f8), (_y_, i1)])
        Xy[_X_] = X
        Xy[_y_] = y

        sorted_Xy = np.sort(Xy, order=_X_)

        self.regression_ = IsotonicRegression(y_min=self.y_min,
                                              y_max=self.y_max,
                                              out_of_bounds='clip')

        self.regression_.fit(sorted_Xy[_X_], sorted_Xy[_y_])

        return self

    def transform(self, X):
        shape = X.shape
        p = self.regression_.transform(X.reshape((-1, )))
        p = p.reshape(shape)
        return np.log(p) - np.log(1. - p)

    def _repr_png_(self):

        from pyannote.core.notebook import plt, _render

        # remember current figure size
        figsize = plt.rcParams['figure.figsize']
        # and update it for segment display
        plt.rcParams['figure.figsize'] = (5, 10)

        fig, (ax1, ax2, ax3) = plt.subplots(3, 1)

        _, bins = np.histogram(self.X_, bins=100)

        mu, sigma = np.mean(self.X_), np.std(self.X_)
        m = mu - 3 * sigma
        M = mu + 3 * sigma

        # m, M = np.min(self.X_), np.max(self.X_)

        positive = self.X_[self.y_ == 1]
        negative = self.X_[self.y_ == 0]
        ax1.hist(positive, bins=bins, alpha=0.5, color='g', density=True)
        ax1.hist(negative, bins=bins, alpha=0.5, color='r', density=True)
        ax1.set_xlim(m, M)

        t = np.linspace(m, M, 50)
        ax2.plot(t, self.transform(t))
        ax2.plot([m, M], [0, 0], 'k--')
        ax2.set_xlim(m, M)

        ax3.plot(t, posterior(self.transform(t)))
        ax3.plot([m, M], [0.5, 0.5], 'k--')
        ax3.set_xlim(m, M)
        ax3.set_ylim(-0.1, 1.1)

        data = _render(fig)

        # go back to previous figure size
        plt.rcParams['figure.figsize'] = figsize

        return data
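# Self-contained sketch of the core transform above (plain numpy/sklearn, not
# the pyannote pipeline): calibrate raw scores to posteriors with an isotonic
# fit clipped away from 0 and 1, then map the posteriors to log-likelihood ratios.
import numpy as np
from sklearn.isotonic import IsotonicRegression

rng = np.random.default_rng(1)
scores = np.concatenate([rng.normal(-1, 1, 500), rng.normal(1, 1, 500)])
labels = np.concatenate([np.zeros(500, dtype=int), np.ones(500, dtype=int)])
iso = IsotonicRegression(y_min=1e-4, y_max=1 - 1e-4, out_of_bounds="clip")
p = iso.fit(scores, labels).transform(np.array([-2.0, 0.0, 2.0]))
print(np.log(p) - np.log(1.0 - p))                    # calibrated log-likelihood ratios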
Beispiel #42
0
def calibrated(test_predictions,
               oof_predictions,
               flag_transform=sigmoid,
               type_transform=parse_classifier_probas):
    """
    Update test predictions w.r.t to calibration trained on OOF predictions
    :param test_predictions:
    :param oof_predictions:
    :return:
    """
    from sklearn.isotonic import IsotonicRegression as IR
    import matplotlib.pyplot as plt

    oof_predictions = oof_predictions.copy()
    oof_predictions[OUTPUT_PRED_MODIFICATION_TYPE] = oof_predictions[
        OUTPUT_PRED_MODIFICATION_TYPE].apply(type_transform)
    oof_predictions[OUTPUT_PRED_MODIFICATION_FLAG] = oof_predictions[
        OUTPUT_PRED_MODIFICATION_FLAG].apply(flag_transform)

    test_predictions = test_predictions.copy()
    test_predictions[OUTPUT_PRED_MODIFICATION_TYPE] = test_predictions[
        OUTPUT_PRED_MODIFICATION_TYPE].apply(type_transform)
    test_predictions[OUTPUT_PRED_MODIFICATION_FLAG] = test_predictions[
        OUTPUT_PRED_MODIFICATION_FLAG].apply(flag_transform)

    y_true = oof_predictions["true_modification_flag"].values.astype(int)
    # print("Target", np.bincount(oof_predictions["true_modification_type"].values.astype(int)))

    if True:
        y_pred_raw = oof_predictions[OUTPUT_PRED_MODIFICATION_FLAG].values
        b_auc_before = alaska_weighted_auc(y_true, y_pred_raw)

        ir_flag = IR(out_of_bounds="clip", y_min=0, y_max=1)
        y_pred_cal = ir_flag.fit_transform(y_pred_raw, y_true)
        b_auc_after = alaska_weighted_auc(y_true, y_pred_cal)

        if b_auc_after > b_auc_before:
            test_predictions[
                OUTPUT_PRED_MODIFICATION_FLAG] = ir_flag.transform(
                    test_predictions[OUTPUT_PRED_MODIFICATION_FLAG].values)
        else:
            # test_predictions[OUTPUT_PRED_MODIFICATION_FLAG] = ir_flag.transform(
            #     test_predictions[OUTPUT_PRED_MODIFICATION_FLAG].values
            # )

            warnings.warn(
                f"Failed to train IR flag {b_auc_before} {b_auc_after}")

            plt.figure()
            plt.hist(y_pred_raw,
                     alpha=0.5,
                     bins=100,
                     label=f"non-calibrated {b_auc_before}")
            plt.hist(y_pred_cal,
                     alpha=0.5,
                     bins=100,
                     label=f"calibrated {b_auc_after}")
            plt.yscale("log")
            plt.legend()
            plt.show()

    if True:
        ir_type = IR(out_of_bounds="clip", y_min=0, y_max=1)
        y_pred_raw = oof_predictions[OUTPUT_PRED_MODIFICATION_TYPE].values
        c_auc_before = alaska_weighted_auc(y_true, y_pred_raw)
        y_pred_cal = ir_type.fit_transform(y_pred_raw, y_true)
        c_auc_after = alaska_weighted_auc(y_true, y_pred_cal)
        if c_auc_after > c_auc_before:
            test_predictions[
                OUTPUT_PRED_MODIFICATION_TYPE] = ir_type.transform(
                    test_predictions[OUTPUT_PRED_MODIFICATION_TYPE].values)

            # plt.figure()
            # plt.hist(y_pred_raw, alpha=0.5, bins=100, label=f"non-calibrated {c_auc_before}")
            # plt.hist(y_pred_cal, alpha=0.5, bins=100, label=f"calibrated {c_auc_after}")
            # plt.yscale("log")
            # plt.legend()
            # plt.show()
        else:
            # test_predictions[OUTPUT_PRED_MODIFICATION_TYPE] = ir_type.transform(
            #     test_predictions[OUTPUT_PRED_MODIFICATION_TYPE].values
            # )

            warnings.warn(
                f"Failed to train IR on type {c_auc_before} {c_auc_after}")

            # plt.figure()
            # plt.hist(y_pred_raw, alpha=0.5, bins=100, label=f"non-calibrated {c_auc_before}")
            # plt.hist(y_pred_cal, alpha=0.5, bins=100, label=f"calibrated {c_auc_after}")
            # plt.yscale("log")
            # plt.legend()
            # plt.show()

    results = {
        "b_auc_before": b_auc_before,
        "b_auc_after": b_auc_after,
        "c_auc_before": c_auc_before,
        "c_auc_after": c_auc_after,
    }
    return test_predictions, results
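# Usage note (hedged; alaska_weighted_auc and the OUTPUT_* column names are
# project-specific): the guard pattern above applies generally -- fit the
# isotonic regression on out-of-fold predictions, and only transform the test
# predictions when the calibrated OOF score improves the target metric.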
Beispiel #43
0
class IsotonicCalibrator(BaseEstimator, RegressorMixin):
    """Probability calibration with isotonic regression.

    Note
    ----
    This class backports and extends `sklearn.isotonic.IsotonicRegression`.
    """

    def __init__(self, y_min=None, y_max=None, increasing=True,
                 interpolation=False):
        """Constructor.

        Parameters
        ----------
        * `y_min` [optional]:
            If not `None`, set the lowest value of the fit to `y_min`.

        * `y_max` [optional]:
            If not `None`, set the highest value of the fit to `y_max`.

        * `increasing` [boolean or string, default=`True`]:
            If boolean, whether or not to fit the isotonic regression with `y`
            increasing or decreasing.
            The string value `"auto"` determines whether `y` should increase or
            decrease based on the Spearman correlation estimate's sign.

        * `interpolation` [boolean, default=`False`]:
            Whether linear interpolation is enabled or not.
        """
        self.y_min = y_min
        self.y_max = y_max
        self.increasing = increasing
        self.interpolation = interpolation

    def fit(self, T, y, sample_weight=None):
        """Fit using `T`, `y` as training data.

        Parameters
        ----------
        * `T` [array-like, shape=(n_samples,)]:
            Training data.

        * `y` [array-like, shape=(n_samples,)]:
            Training target.

        * `sample_weight` [array-like, shape=(n_samples,), optional]:
            Weights. If set to None, all weights will be set to 1.

        Returns
        -------
        * `self` [object]:
            `self`.

        Notes
        -----
        When `interpolation` is enabled, piecewise-linear interpolators built
        from `T` are stored, as `predict` needs them to map new input data.
        """
        # Check input
        T = column_or_1d(T)

        # Fit isotonic regression
        self.ir_ = IsotonicRegression(y_min=self.y_min,
                                      y_max=self.y_max,
                                      increasing=self.increasing,
                                      out_of_bounds="clip")
        self.ir_.fit(T, y, sample_weight=sample_weight)

        # Interpolators
        if self.interpolation:
            p = self.ir_.transform(T)

            change_mask1 = (p - np.roll(p, 1)) > 0
            change_mask2 = np.roll(change_mask1, -1)
            change_mask1[0] = True
            change_mask1[-1] = True
            change_mask2[0] = True
            change_mask2[-1] = True

            self.interp1_ = interp1d(T[change_mask1], p[change_mask1],
                                     bounds_error=False,
                                     fill_value=(0., 1.))
            self.interp2_ = interp1d(T[change_mask2], p[change_mask2],
                                     bounds_error=False,
                                     fill_value=(0., 1.))

        return self

    def predict(self, T):
        """Calibrate data.

        Parameters
        ----------
        * `T` [array-like, shape=(n_samples,)]:
            Data to calibrate.

        Returns
        -------
        * `Tt` [array, shape=(n_samples,)]:
            Calibrated data.
        """
        if self.interpolation:
            T = column_or_1d(T)
            return 0.5 * (self.interp1_(T) + self.interp2_(T))

        else:
            return self.ir_.transform(T)
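# Hedged usage sketch (standalone; mirrors the class above): with
# interpolation=True the step-wise isotonic fit is smoothed by averaging two
# piecewise-linear interpolants drawn through the step corners.
#   cal = IsotonicCalibrator(interpolation=True).fit(raw_scores, labels)
#   calibrated = cal.predict(np.array([0.1, 0.4, 0.9]))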
Beispiel #44
0
sc_0 = plt.scatter(gdca_scores[is_contact < 0.5],
                   is_contact[is_contact < 0.5],
                   alpha=0.1,
                   s=SIZE,
                   color='k')
sc_1 = plt.scatter(gdca_scores[is_contact > 0.5],
                   is_contact[is_contact > 0.5],
                   alpha=0.1,
                   s=SIZE,
                   color='k')
sc_0.set_rasterized(True)
sc_1.set_rasterized(True)
print('--', time.time() - t0)

mean, edges, _ = stats.binned_statistic(gdca_scores, is_contact, bins=bins)
centres = (edges[:-1] + edges[1:]) / 2
plt.plot(centres, mean, color=settings.BLUE, alpha=0.5, linestyle='--')
plt.plot(x, iso.transform(x), color=settings.MAROON, linewidth=2)

plt.xlabel('Contact score')
plt.ylabel('Contact probability')
#plt.title('Isotonic regression')

plt.xlim(-1.2, 4.2)
plt.xticks(range(-1, 5))
plt.tight_layout()

print('.', time.time() - t0)
plt.savefig('../figures/isotonic.pdf')
print('!', time.time() - t0)
plt.show()