def __init__(self, model, sklearn_model: bool):
        r"""__init__ method

        This method is used to adapt the input `model` so it can be used for creating 
        confidente intervals with conformal prediction.

        Parameters
        ----------
        model:
            Model we want to use as the underlying model to generate predictions and the
            confidence interval. This model can only be a scikit learn model, LGBMRegressor,
            LGBMClassifier, XGBRegressor, XGBClassifier, CatBoostRegressor or CatBoostClassifier.
        sklearn_model: bool
            This variable indicates if the model belongs to scikit learn or not.

        Returns
        -------
        cp: obj: Adapt_to_CP
            The class of the adapted model.

        Examples
        --------
        >>> model = lightgbm.LGBMRegressor()
        >>> cp = Adapt_to_CP(model)
        """
        self.model = model
        if sklearn_model:
            if is_classifier(model):
                self.icp = IcpClassifier(NcFactory.create_nc(model))
            elif is_regressor(model):
                self.icp = IcpRegressor(NcFactory.create_nc(model))
        else:
            model_adapter = NonConformistAdapter(model)
            if is_classifier(model):
                self.icp = IcpClassifier(ClassifierNc(model_adapter))
            elif is_regressor(model) or model.__class__.__name__ == "Booster":
                self.icp = IcpRegressor(RegressorNc(model_adapter))
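# Usage (a minimal sketch; the x_train/x_cal/x_test splits are assumed, and the
# wrapped ICP follows the standard nonconformist fit/calibrate/predict workflow):
#
#   model = lightgbm.LGBMRegressor()
#   cp = Adapt_to_CP(model, sklearn_model=False)
#   cp.icp.fit(x_train, y_train)
#   cp.icp.calibrate(x_cal, y_cal)
#   intervals = cp.icp.predict(x_test, significance=0.05)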
model_name = "Tree"
framework_name = 'BCP'
# ------------------------------------------------------------------------------
# prediction with significance

error_summary = []
for sig in np.arange(0, 1.0001, 0.005):
    print('sig = ' + str(sig))
    s_folder = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
    for k, (train, test) in enumerate(s_folder.split(X, y)):
        x_train, x_test = X[train], X[test]
        y_train, y_test = y[train], y[test]
        truth = y_test.reshape((-1, 1))
        # -----------------------------------------------
        # BCP
        conformal_model = BootstrapConformalClassifier(IcpClassifier(ClassifierNc(ClassifierAdapter(simple_model))),
                                                       n_models=10)
        conformal_model.fit(x_train, y_train)

        # ------------------------------------------
        # ICP
        # x_train_sp, x_cal, y_train_sp, y_cal = train_test_split(x_train, y_train, test_size=0.3, shuffle=True,
        #                                                         random_state=1)
        # nc = NcFactory.create_nc(model=simple_model)
        # conformal_model = IcpClassifier(nc)
        # conformal_model.fit(x_train_sp, y_train_sp)
        # conformal_model.calibrate(x_cal, y_cal)
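        # The remaining step of this ICP variant would be (a sketch):
        # prediction = conformal_model.predict(x_test, significance=None)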

        # ---------------------------------------------------
        # CP
        # nc = NcFactory.create_nc(model=simple_model)
# Example #3
# -----------------------------------------------------------------
# force_prediction

result_summary = []
s_folder = StratifiedKFold(n_splits=10, shuffle=True)
for index, (train, test) in enumerate(s_folder.split(X, y)):
    x_train_std, x_test_std = X[train], X[test]
    y_train, y_test = y[train], y[test]
    truth = y_test.reshape((-1, 1))

    lda = LinearDiscriminantAnalysis(n_components=9)
    x_train_lda = lda.fit_transform(x_train_std, y_train)
    x_test_lda = lda.transform(x_test_std)

    nc_fun = NcFactory.create_nc(model=simple_model)
    model = BootstrapConformalClassifier(IcpClassifier(nc_fun))
    model.fit(x_train_lda, y_train)
    prediction = model.predict(x_test_lda, significance=None)
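    # significance=None makes nonconformist return the per-class p-values
    # rather than a boolean prediction region.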
    table = np.hstack((prediction, truth))
    result = [1 - force_mean_errors(prediction, truth)]

    if index == 0:
        result_summary = result
    else:
        result_summary = np.vstack((result_summary, result))
    print('\nBCP_Force')
    if np.unique(truth).shape[0] == 10:
        print('True')
    else:
        print('Warning: the test fold does not contain all 10 classes!')
# simple_model = KNeighborsClassifier(n_neighbors=1)
# model_name = '1NN'

# ------------------------------------------------------------------------------
# prediction with significance
error_summary = []
for sig in np.arange(0.1, 1.0, 0.002):
    print('sig = ' + str(sig))
    s_folder = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
    for k, (train, test) in enumerate(s_folder.split(X, y)):
        x_train, x_test = X[train], X[test]
        y_train, y_test = y[train], y[test]
        truth = y_test.reshape((-1, 1))

        model = BootstrapConformalClassifier(
            IcpClassifier(ClassifierNc(ClassifierAdapter(simple_model))))
        model.fit(x_train, y_train)
        prediction = model.predict(x_test, significance=None)
        table = np.hstack((prediction, truth))
        result = [
            class_mean_errors(prediction, truth, significance=sig),
            class_avg_c(prediction, truth, significance=sig)
        ]
        if k == 0:
            summary = result
        else:
            summary = np.vstack((summary, result))
        # print('\nBCP')
        # print('Accuracy: {}'.format(result[0]))
        # print('Average count: {}'.format(result[1]))
# force_prediction
s_folder = StratifiedKFold(n_splits=10, shuffle=True)
for index, (train, test) in enumerate(s_folder.split(X, y)):
    X_train, X_test = X[train], X[test]
    y_train, y_test = y[train], y[test]
    x_train_sp, x_cal, y_train_sp, y_cal = train_test_split(
        X_train, y_train, test_size=test_size, shuffle=True)
    y_test = y_test.reshape((-1, 1))

    lda = LinearDiscriminantAnalysis(n_components=9)
    x_train_lda = lda.fit_transform(x_train_sp, y_train_sp)
    x_cal_lda = lda.transform(x_cal)
    x_test_lda = lda.transform(X_test)

    nc = NcFactory.create_nc(model=simple_model)  # wrap the base classifier, not the conformal model
    icp = IcpClassifier(nc)

    icp.fit(x_train_lda, y_train_sp)
    icp.calibrate(x_cal_lda, y_cal)
    prediction = icp.predict(x_test_lda, significance=None)

    result = [1 - force_mean_errors(prediction, y_test)]
    if index == 0:
        result_summary = result
    else:
        result_summary = np.vstack((result_summary, result))
    print('\nICP_Force')
    if np.unique(y_test).shape[0] == 10:
        print('True')
    else:
        print('Warning: the test fold does not contain all 10 classes!')
# Example #6
def run_experiment(cur_test_method,
                   cur_dataset_name,
                   cur_batch_size,
                   cur_lr_loss,
                   cur_lr_dis,
                   cur_loss_steps,
                   cur_dis_steps,
                   cur_mu_val,
                   cur_epochs,
                   cur_model_type,
                   cur_random_state,
                   cur_second_scale,
                   num_experiments):

    method = cur_test_method

    seed = cur_random_state
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    dataset = cur_dataset_name

    batch_size = cur_batch_size

    # step size to minimize loss
    lr_loss = cur_lr_loss

    # step size used to fit binary classifier (discriminator)
    lr_dis = cur_lr_dis

    # inner epochs to fit loss
    loss_steps = cur_loss_steps

    # inner epochs to fit binary classifier (discriminator)
    dis_steps = cur_dis_steps

    # total number of epochs
    epochs = cur_epochs

    # utility loss
    cost_pred = torch.nn.CrossEntropyLoss()

    model_type = cur_model_type

    metric = "equalized_odds"

    print(dataset)
    print(method)
    sys.stdout.flush()


    avg_length_0 = np.zeros(num_experiments)
    avg_length_1 = np.zeros(num_experiments)

    avg_coverage_0 = np.zeros(num_experiments)
    avg_coverage_1 = np.zeros(num_experiments)

    avg_p_val = np.zeros(num_experiments)
    pred_error = np.zeros(num_experiments)

    for i in range(num_experiments):

        # Split into train and test
        X, A, Y, X_cal, A_cal, Y_cal, X_test, A_test, Y_test = get_dataset.get_train_test_data(base_path, dataset, seed+i)
        in_shape = X.shape[1]
        num_classes = len(np.unique(Y))


        print("n train = " + str(X.shape[0]) + " p = " + str(X.shape[1]))
        print("n calibration = " + str(X_cal.shape[0]))
        print("n test = " + str(X_test.shape[0]))

        sys.stdout.flush()

        if method == "AdversarialDebiasing":

            class ClassAdapter(ClassifierAdapter):
                def __init__(self, model=None, fit_params=None, params=None):
                    super(ClassAdapter, self).__init__(model, fit_params)
                    # Instantiate model
                    self.learner = adv_debiasing.AdvDebiasingClassLearner(lr_loss,
                                                                          loss_steps,
                                                                          dis_steps,
                                                                          epochs,
                                                                          cost_pred,
                                                                          in_shape,
                                                                          batch_size,
                                                                          model_type,
                                                                          num_classes,
                                                                          cur_mu_val)

                def fit(self, x, y):
                    self.learner.fit(x, y)

                def predict(self, x):
                    return self.learner.predict(x)

        elif method == "FairDummies":

            class ClassAdapter(ClassifierAdapter):
                def __init__(self, model=None, fit_params=None, params=None):
                    super(ClassAdapter, self).__init__(model, fit_params)
                    # Instantiate model
                    self.learner = fair_dummies_learning.EquiClassLearner(lr=lr_loss,
                                                                       pretrain_pred_epochs=loss_steps,
                                                                       pretrain_dis_epochs=dis_steps,
                                                                       epochs=epochs,
                                                                       loss_steps=1,
                                                                       dis_steps=1,
                                                                       cost_pred=cost_pred,
                                                                       in_shape=in_shape,
                                                                       batch_size=batch_size,
                                                                       model_type=model_type,
                                                                       lambda_vec=cur_mu_val,
                                                                       second_moment_scaling=cur_second_scale,
                                                                       num_classes=num_classes)

                def fit(self, x, y):
                    self.learner.fit(x, y)

                def predict(self, x):
                    return self.learner.predict(x)

        elif method == "HGR":

            class ClassAdapter(ClassifierAdapter):
                def __init__(self, model=None, fit_params=None, params=None):
                    super(ClassAdapter, self).__init__(model, fit_params)
                    # Instantiate model
                    self.learner = continuous_fairness.HGR_Class_Learner(
                        lr=lr_loss,
                        epochs=epochs,
                        mu=cur_mu_val,
                        cost_pred=cost_pred,
                        in_shape=in_shape,
                        out_shape=num_classes,
                        batch_size=batch_size,
                        model_type=model_type)

                def fit(self, x, y):
                    self.learner.fit(x, y)

                def predict(self, x):
                    return self.learner.predict(x)

        elif method == "Baseline":

            class ClassAdapter(ClassifierAdapter):
                def __init__(self, model=None, fit_params=None, params=None):
                    super(ClassAdapter, self).__init__(model, fit_params)
                    # Instantiate model
                    self.learner = fair_dummies_learning.EquiClassLearner(lr=lr_loss,
                                                                       pretrain_pred_epochs=epochs,
                                                                       pretrain_dis_epochs=0,
                                                                       epochs=0,
                                                                       loss_steps=0,
                                                                       dis_steps=0,
                                                                       cost_pred=cost_pred,
                                                                       in_shape=in_shape,
                                                                       batch_size=batch_size,
                                                                       model_type=model_type,
                                                                       lambda_vec=0,
                                                                       second_moment_scaling=0,
                                                                       num_classes=num_classes)

                def fit(self, x, y):
                    self.learner.fit(x, y)

                def predict(self, x):
                    return self.learner.predict(x)

        fairness_class = ClassAdapter(model=None)


        nc = ClassifierNc(fairness_class, InverseProbabilityErrFunc())

        # function that extracts the group identifier
        def condition(x, y=None):
            return int(x[0][0] > 0)
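        # With this condition, IcpClassifier becomes a Mondrian (group-conditional)
        # ICP: calibration is done separately per value of the attribute stored in
        # column 0 of the input, so coverage holds within each group.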

        icp = IcpClassifier(nc,condition=condition)

        input_data_train = np.concatenate((A[:,np.newaxis],X),1)
        icp.fit(input_data_train, Y)

        input_data_cal = np.concatenate((A_cal[:,np.newaxis],X_cal),1)
        icp.calibrate(input_data_cal, Y_cal)

        input_data_test = np.concatenate((A_test[:,np.newaxis],X_test),1)
        Yhat_test = icp.predict(input_data_test, significance=0.1)
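        # significance=0.1 targets 90% coverage; combined with the Mondrian
        # condition above, coverage is calibrated within each group rather than
        # only marginally.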

        avg_coverage, avg_length = class_compute_coverage_len(Y_test, Yhat_test)
        coverage_sample, length_sample = class_compute_coverage_per_sample(Y_test,
                                                                           Yhat_test,
                                                                           0.1,
                                                                           input_data_test,
                                                                           condition)

        avg_length_0[i] = np.mean(length_sample[0])
        avg_coverage_0[i] = np.mean(coverage_sample[0])
        avg_length_1[i] = np.mean(length_sample[1])
        avg_coverage_1[i] = np.mean(coverage_sample[1])


        Yhat_out_cal = fairness_class.learner.predict(input_data_cal)
        Yhat_out_test = fairness_class.learner.predict(input_data_test)

        p_val = utility_functions.fair_dummies_test_classification(Yhat_out_cal,
                                                                   A_cal,
                                                                   Y_cal,
                                                                   Yhat_out_test,
                                                                   A_test,
                                                                   Y_test,
                                                                   num_reps=1,
                                                                   num_p_val_rep=1000,
                                                                   reg_func_name="Net")
        avg_p_val[i] = p_val

        pred_error[i] = 1.0-utility_functions.compute_acc_numpy(Yhat_out_test, Y_test)

        print("experiment = " + str(i+1))
        print("Coverage 0 = " + str(avg_coverage_0[i]))
        print("Coverage 1 = " + str(avg_coverage_1[i]))
        print("Length 0 = " + str(avg_length_0[i]))
        print("Length 1 = " + str(avg_length_1[i]))
        print("Prediction Error = " + str(pred_error[i]))


        print("p_val = " + str(p_val))

        sys.stdout.flush()


        outdir = './results/'
        if not os.path.exists(outdir):
            os.mkdir(outdir)

        out_name = outdir + 'results.csv'

        full_name = cur_test_method + "_" + cur_model_type
        df = pd.DataFrame({'method'    : [cur_test_method],
                           'dataset'   : [cur_dataset_name],
                           'batch_size': [cur_batch_size],
                           'lr_loss'   : [cur_lr_loss],
                           'lr_dis'    : [cur_lr_dis],
                           'loss_steps': [cur_loss_steps],
                           'dis_steps' : [cur_dis_steps],
                           'mu_val'    : [cur_mu_val],
                           'epochs'    : [cur_epochs],
                           'second_scale' : [cur_second_scale],
                           'random_state' : [seed+i],
                           'model_type'   : [cur_model_type],
                           'metric'       : [metric],
                           'avg_length'        : [avg_length],
                           'avg_coverage'        : [avg_coverage],
                           'avg_length_0'      : [avg_length_0[i]],
                           'avg_length_1'      : [avg_length_1[i]],
                           'avg_coverage_0'    : [avg_coverage_0[i]],
                           'avg_coverage_1'    : [avg_coverage_1[i]],
                           'pred_error'          : [pred_error[i]],
                           'p_val'           : [p_val],
                           'full_name'       : [full_name]
                           })

        if os.path.isfile(out_name):
            df2 = pd.read_csv(out_name)
            df = pd.concat([df2, df], ignore_index=True)

        df.to_csv(out_name, index=False)

        print(full_name)
        print("Num experiments %02d | Avg. Pred Err = %.4f | Avg Length 0 = %.4f | Avg Length 1 = %.4f | Avg Coverage 0 = %.4f | Avg Coverage 1 = %.4f | Avg p_val = %.4f | min p_val = %.4f" %
              (i+1, np.mean(pred_error[:i+1]), np.mean(avg_length_0[:i+1]), np.mean(avg_length_1[:i+1]),
                    np.mean(avg_coverage_0[:i+1]), np.mean(avg_coverage_1[:i+1]),
                    np.mean(avg_p_val[:i+1]), np.min(avg_p_val[:i+1])))
        print("======== Done =========")
        sys.stdout.flush()
# Example #7
calX, calY = df_cal.drop(['TARGET'], axis=1), df_cal['TARGET']

model = joblib.load(os.path.join("models", f"{MODEL}.pkl"))
if 'TARGET' in df_test.columns:
    testX, testY = df_test.drop(['id', 'TARGET'], axis=1), df_test['TARGET']
else:
    testX = df_test.drop(['id'], axis=1)

if PROBLEM_TYPE == 'classification':
    if MODEL == 'catboost':
        raise Exception("Can't compute intervals for CatBoostClassifier!")

    nc = NcFactory.create_nc(
        model, normalizer_model=KNeighborsRegressor(
            n_neighbors=11))  # Create a default nonconformity function
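    # The KNeighborsRegressor normalizer scales nonconformity by an estimate of
    # local difficulty, so prediction sets adapt to each test point.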
    icp = IcpClassifier(nc)

    icp.fit(trainX.values, trainY.values)

    # Calibrate the ICP using the calibration set
    icp.calibrate(calX.values, calY.values)

    # Produce predictions for the test set, with confidence 95%
    prediction = icp.predict(testX.to_numpy(), significance=0.05)

else:
    if MODEL == 'catboost':
        params = joblib.load("models/params.pkl")

        model = CatBoostRegressor()
        model.set_params(**params, loss_function='Quantile:alpha=0.025')
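        # 'Quantile:alpha=0.025' fits the 2.5% conditional quantile, i.e. the
        # lower bound of a central 95% interval; an alpha=0.975 counterpart
        # would provide the upper bound.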
# Example #8
    X_test, y_test = X[test_index], y[test_index]

    lda = LinearDiscriminantAnalysis(n_components=9)
    X_train_lda = lda.fit_transform(X_train, y_train)
    X_test_lda = lda.transform(X_test)
    x_anomaly_lda = lda.transform(x_anomaly)

    x_train, x_cal, y_train, y_cal = train_test_split(X_train_lda,
                                                      y_train, test_size=0.3, shuffle=False)

    model = KNeighborsClassifier(n_neighbors=5)
    # -----------------------------------------------------------------------------
    # Train and calibrate
    # -----------------------------------------------------------------------------

    icp = IcpClassifier(ClassifierNc(ClassifierAdapter(model)))
    icp.fit(x_train, y_train)
    icp.calibrate(x_cal, y_cal)

    # -----------------------------------------------------------------------------
    # Predict
    # -----------------------------------------------------------------------------
    SIG = 0.2
    prediction = icp.predict(X_test_lda, significance=SIG)
    result = np.sum(prediction, axis=1)
    zero_sum_correct = (48 - result.sum(axis=0))/48
    correct.append(zero_sum_correct)
    print("the correct prediction")
    print(result)

    prediction_anomaly = icp.predict(x_anomaly_lda, significance=SIG)
x_train, x_val, x_test, y_train, y_val, y_test, x_cvtrain, y_cvtrain, x_train_unbalanced, y_train_unbalanced, x_cvtrain_unbalanced, y_cvtrain_unbalanced, og_features, og_features_train, catcolumns, numcolumns = prepro.Get_sets(
    train, val, test, cvtrain)

# %%
#*loading feature sets from the feature selection results

x_cvtrain, x_cvtrain_unbalanced, x_train, x_train_unbalanced, x_val, x_test = prepro.FeatureSelection(
    og_features, og_features_train, x_cvtrain, x_cvtrain_unbalanced, x_train,
    x_train_unbalanced, x_val, x_test)

# %%
model = RandomForestClassifier(n_estimators=300, n_jobs=-1)

# %%
nc = NcFactory.create_nc(model)  # Create a default nonconformity function
icp = IcpClassifier(nc)  # Create an inductive conformal classifier

x_train_np = x_cvtrain_unbalanced.to_numpy(copy=True)
x_val_np = x_val.to_numpy(copy=True)
x_test_np = x_test.to_numpy(copy=True)

y_train_np = y_cvtrain_unbalanced.to_numpy(copy=True)
y_val_np = y_val.to_numpy(copy=True)
y_test_np = y_test.to_numpy(copy=True)

# %%
print('fitting inductive conformal predictor')
# Fit the ICP using the proper training set
icp.fit(x_train_np, y_train_np)
print('calibrating inductive conformal predictor')
# Calibrate the ICP using the calibration set (assuming the validation split
# serves as the calibration data here)
icp.calibrate(x_val_np, y_val_np)
simple_model = RandomForestClassifier(n_estimators=500, criterion='entropy')
model_name = "RF(500)"

# simple_model = KNeighborsClassifier(n_neighbors=1)
# model_name = '1NN'

# simple_model = SVC(C=6000.0, gamma=0.001, probability=True)
# model_name = "SVM(6000,0.001)"

# -----------------------------------------------------------------------------
# Define models
# -----------------------------------------------------------------------------

models = {
    'ACP-RandomSubSampler':
    AggregatedCp(IcpClassifier(ClassifierNc(ClassifierAdapter(simple_model))),
                 RandomSubSampler()),
    'ACP-CrossSampler':
    AggregatedCp(IcpClassifier(ClassifierNc(ClassifierAdapter(simple_model))),
                 CrossSampler()),
    'ACP-BootstrapSampler':
    AggregatedCp(IcpClassifier(ClassifierNc(ClassifierAdapter(simple_model))),
                 BootstrapSampler()),
    'CCP':
    CrossConformalClassifier(
        IcpClassifier(ClassifierNc(ClassifierAdapter(simple_model)))),
    'BCP':
    BootstrapConformalClassifier(
        IcpClassifier(ClassifierNc(ClassifierAdapter(simple_model)))),
}
error_summary = []
# --------------------------------------------
# prediction
# --------------------------------------------
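# A sketch of the prediction loop the header above introduces (assuming
# x_train/y_train/x_test/truth folds as in the StratifiedKFold loops earlier):
#
# for name, conformal_model in models.items():
#     conformal_model.fit(x_train, y_train)
#     prediction = conformal_model.predict(x_test, significance=None)
#     error_summary.append([name,
#                           class_mean_errors(prediction, truth, significance=0.05),
#                           class_avg_c(prediction, truth, significance=0.05)])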