def test_train_test_split_group():
    """Check that a group-aware split shares no values between train and test."""
    # Each value 0..49 appears twice, and the values serve as their own
    # group ids, so any group leaking across the split shows up as a value
    # present in both parts.
    values = list(range(50)) * 2
    groups = list(range(50)) * 2
    train_part, test_part = utils.train_test_split_group(groups, values)
    overlap = set(test_part) & set(train_part)
    assert len(overlap) == 0
def bootstrap_calibrate_prob(labels, weights, probs, n_calibrations=30,
                             group_column=None, threshold=0., symmetrize=False):
    """
    Bootstrap isotonic calibration.

    Repeats the following ``n_calibrations`` times:

    * randomly split the data in half into train and test parts
    * fit an isotonic regression on the train part and apply it to the test part
    * on the test part compute D2 (weighted mean of ``(1 - 2 * p_calibrated) ** 2``)
      from the calibrated probabilities, and the weighted ROC AUC from the
      original (uncalibrated) probabilities

    :param labels: numpy.array of shape [n_samples] with labels
    :param weights: numpy.array of shape [n_samples]
    :param probs: probabilities, numpy.array of shape [n_samples]
    :param n_calibrations: int, number of random train/test repetitions
    :param group_column: if not None, passed to ``train_test_split_group`` to drive
        the split (presumably keeps each group on one side - confirm with the helper)
    :param threshold: float, labels strictly above it become 1, the rest 0
    :param symmetrize: bool, do symmetric calibration, ex. for B+, B-
    :return: tuple (list of D2 values, list of auc values), one entry per repetition
    """
    d2_values = []
    auc_values = []
    binary_labels = (labels > threshold) * 1

    for _ in range(n_calibrations):
        if group_column is None:
            split = train_test_split(
                probs, binary_labels, weights, train_size=0.5)
        else:
            split = train_test_split_group(
                group_column, probs, binary_labels, weights, train_size=0.5)
        p_train, p_test, y_train, y_test, w_train, w_test = split

        if symmetrize:
            # Augment the train part with its mirror image: probability
            # 1 - p paired with the flipped label, same weight.
            fit_x = numpy.r_[p_train, 1 - p_train]
            fit_y = numpy.r_[y_train > 0, y_train <= 0]
            fit_w = numpy.r_[w_train, w_train]
        else:
            fit_x, fit_y, fit_w = p_train, y_train, w_train

        calibrator = IsotonicRegression(y_min=0, y_max=1, out_of_bounds='clip')
        calibrator.fit(fit_x, fit_y, fit_w)
        calibrated = calibrator.transform(p_test)

        dilution = (1 - 2 * calibrated) ** 2
        # AUC is evaluated on the raw test probabilities, not the calibrated ones.
        auc_values.append(roc_auc_score(y_test, p_test, sample_weight=w_test))
        d2_values.append(numpy.average(dilution, weights=w_test))

    return d2_values, auc_values