Example #1
0
def test_train_test_split_group():
    data = list(range(50)) * 2
    group_column = list(range(50)) * 2
    train, test = utils.train_test_split_group(group_column, data)
    assert len(set.intersection(set(test), set(train))) == 0
Example #2
0
def test_train_test_split_group():
    data = list(range(50)) * 2
    group_column = list(range(50)) * 2
    train, test = utils.train_test_split_group(group_column, data)
    assert len(set.intersection(set(test), set(train))) == 0
Example #3
0
def bootstrap_calibrate_prob(labels, weights, probs, n_calibrations=30, group_column=None, threshold=0., symmetrize=False):
    """
    Bootstrap isotonic calibration: 
     * randomly divide data into train-test
     * on train isotonic is fitted and applyed to test
     * on test using calibrated probs p(B+) D2 and auc are calculated 
    
    :param probs: probabilities, numpy.array of shape [n_samples]
    :param labels: numpy.array of shape [n_samples] with labels 
    :param weights: numpy.array of shape [n_samples]
    :param threshold: float, to set labels 0/1 
    :param symmetrize: bool, do symmetric calibration, ex. for B+, B-
    
    :return: D2 array and auc array
    """
    aucs = []
    D2_array = []
    labels = (labels > threshold) * 1
    
    for _ in range(n_calibrations):
        if group_column is not None:
            train_probs, test_probs, train_labels, test_labels, train_weights, test_weights = train_test_split_group(
                group_column, probs, labels, weights, train_size=0.5)
        else:
            train_probs, test_probs, train_labels, test_labels, train_weights, test_weights = train_test_split(
                probs, labels, weights, train_size=0.5)
        iso_est = IsotonicRegression(y_min=0, y_max=1, out_of_bounds='clip')
        if symmetrize:
            iso_est.fit(numpy.r_[train_probs, 1-train_probs], 
                        numpy.r_[train_labels > 0, train_labels <= 0],
                        numpy.r_[train_weights, train_weights])
        else:
            iso_est.fit(train_probs, train_labels, train_weights)
            
        probs_calib = iso_est.transform(test_probs)
        alpha = (1 - 2 * probs_calib) ** 2
        aucs.append(roc_auc_score(test_labels, test_probs, sample_weight=test_weights))
        D2_array.append(numpy.average(alpha, weights=test_weights))
    return D2_array, aucs