def __init__(self, model, sklearn_model: bool):
    r"""Adapt `model` so it can be used to build confidence intervals
    with conformal prediction.

    Parameters
    ----------
    model:
        Model we want to use as the underlying model to generate
        predictions and the confidence interval. This model can only be
        a scikit-learn model, LGBMRegressor, LGBMClassifier,
        XGBRegressor, XGBClassifier, CatBoostRegressor,
        CatBoostClassifier or a raw lightgbm ``Booster``.
    sklearn_model: bool
        This variable indicates if the model belongs to scikit-learn
        or not.

    Raises
    ------
    ValueError
        If the model is neither a classifier, a regressor nor a raw
        ``Booster`` — previously ``self.icp`` was silently left unset,
        which deferred the failure to the first prediction call.

    Examples
    --------
    >>> model = lightgbm.LGBMRegressor()
    >>> cp = Adapt_to_CP(model, sklearn_model=False)
    """
    self.model = model
    if sklearn_model:
        # Native sklearn estimators: NcFactory knows how to wrap them directly.
        if is_classifier(model):
            self.icp = IcpClassifier(NcFactory.create_nc(model))
        elif is_regressor(model):
            self.icp = IcpRegressor(NcFactory.create_nc(model))
        else:
            raise ValueError(
                "Unsupported scikit-learn model: expected a classifier "
                "or a regressor, got %r" % type(model).__name__)
    else:
        # Non-sklearn models go through the adapter so nonconformist can
        # call fit/predict with a uniform interface.
        model_adapter = NonConformistAdapter(model)
        if is_classifier(model):
            self.icp = IcpClassifier(ClassifierNc(model_adapter))
        elif is_regressor(model) or model.__class__.__name__ == "Booster":
            # A raw lightgbm Booster is not recognised by is_regressor,
            # but it is treated as a regressor here (same branch as before).
            self.icp = IcpRegressor(RegressorNc(model_adapter))
        else:
            raise ValueError(
                "Unsupported model: expected a classifier, a regressor "
                "or a lightgbm Booster, got %r" % type(model).__name__)
def ccp_predict(self, data_lbld, data_unlbld, new_lbld):
    """Cross-conformal prediction over the unlabeled pool.

    Trains an inductive conformal classifier on each of 5 stratified
    folds (optionally SMOTE-rebalanced) and averages the per-class
    p-values for the unlabeled samples.

    Parameters
    ----------
    data_lbld : pandas.DataFrame
        Labeled data; last column is the target, second-to-last is
        treated as a sample weight, the rest are features.
        # NOTE(review): column layout inferred from the iloc slicing — confirm.
    data_unlbld : pandas.DataFrame
        Unlabeled data with the same column layout.
    new_lbld : pandas.DataFrame
        Newly labeled data appended to the training pool.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``data_unlbld``; columns ``mean_p_0``/``mean_p_1``
        (averaged p-values), ``credibility`` (largest p-value) and
        ``confidence`` (1 - smallest p-value).
    """
    # SMOTE instance for class rebalancing of the training folds
    smote = SMOTE(random_state=self.random_state)

    # Create instance of classifier
    classifier_y = self.classifiers['classifier_y']
    parameters_y = self.clf_parameters['classifier_y']
    clf = classifier_y.set_params(**parameters_y)

    X = data_lbld.iloc[:, :-2]
    y = data_lbld.iloc[:, -1]
    X_new = new_lbld.iloc[:, :-2]
    y_new = new_lbld.iloc[:, -1]

    # FIX: DataFrame.append was deprecated and removed in pandas 2.0;
    # pd.concat is the supported equivalent.
    X = pd.concat([X, X_new], sort=False)
    y = pd.concat([y, y_new])

    X_unlbld = data_unlbld.iloc[:, :-2]

    # FIX: shuffle=True is required for random_state to take effect
    # (recent scikit-learn raises ValueError otherwise).
    sss = StratifiedKFold(n_splits=5, shuffle=True,
                          random_state=self.random_state)

    p_values = []
    for train_index, calib_index in sss.split(X, y):
        X_train, X_calib = X.iloc[train_index], X.iloc[calib_index]
        y_train, y_calib = y.iloc[train_index], y.iloc[calib_index]

        if self.rebalancing_parameters['SMOTE_y']:
            # smote.fit_resample returns numpy arrays, hence plain slicing
            X_train, y_train = smote.fit_resample(X_train, y_train)
            clf.fit(X_train[:, :-1], y_train, sample_weight=X_train[:, -1])
        else:
            # last feature column doubles as the per-sample weight
            clf.fit(X_train.iloc[:, :-1], y_train,
                    sample_weight=X_train.iloc[:, -1])

        nc = NcFactory.create_nc(clf, MarginErrFunc())
        icp = IcpClassifier(nc)

        if self.rebalancing_parameters['SMOTE_y']:
            icp.fit(X_train[:, :-1], y_train)
        else:
            icp.fit(X_train.iloc[:, :-1].values, y_train)
        icp.calibrate(X_calib.iloc[:, :-1].values, y_calib)

        # Predict confidences for the unlabeled sample
        p_values.append(
            icp.predict(X_unlbld.iloc[:, :-1].values, significance=None))

    mean_p_values = np.array(p_values).mean(axis=0)
    ccp_predictions = pd.DataFrame(mean_p_values,
                                   columns=['mean_p_0', 'mean_p_1'])
    # Vectorized replacement of the former iterrows() loops: credibility is
    # the largest p-value, confidence is 1 minus the smallest (binary case),
    # identical to the original row-wise max/min.
    ccp_predictions["credibility"] = mean_p_values.max(axis=1)
    ccp_predictions["confidence"] = 1 - mean_p_values.min(axis=1)
    ccp_predictions.index = X_unlbld.index
    return ccp_predictions
folds=10, scoring_funcs=scoring_funcs, significance_levels=[0.05, 0.1, 0.2], ) print("\n{}: {}".format(icp_name, ds_name)) scores = scores.drop(["fold", "iter"], axis=1) print(scores.groupby(["significance"]).mean()) # ----------------------------------------------------------------------------- # Classification # ----------------------------------------------------------------------------- data = load_iris() nc = NcFactory.create_nc(RandomForestClassifier(n_estimators=100)) icp = IcpClassifier(nc) icp_cv = ClassIcpCvHelper(icp) score_model(icp_cv, "IcpClassifier", data, "iris", [class_mean_errors, class_avg_c]) # ----------------------------------------------------------------------------- # Classification (normalized) # ----------------------------------------------------------------------------- data = load_iris() nc = NcFactory.create_nc(RandomForestClassifier(n_estimators=100), normalizer_model=KNeighborsRegressor()) icp = IcpClassifier(nc) icp_cv = ClassIcpCvHelper(icp)
ds.target, iterations=10, folds=10, scoring_funcs=scoring_funcs, significance_levels=[0.05, 0.1, 0.2]) print('\n{}: {}'.format(icp_name, ds_name)) scores = scores.drop(['fold', 'iter'], axis=1) print(scores.groupby(['significance']).mean()) # ----------------------------------------------------------------------------- # Classification # ----------------------------------------------------------------------------- data = load_iris() nc = NcFactory.create_nc(RandomForestClassifier(n_estimators=100)) icp = IcpClassifier(nc) icp_cv = ClassIcpCvHelper(icp) score_model(icp_cv, 'IcpClassifier', data, 'iris', [class_mean_errors, class_avg_c]) # ----------------------------------------------------------------------------- # Classification (normalized) # ----------------------------------------------------------------------------- data = load_iris() nc = NcFactory.create_nc(RandomForestClassifier(n_estimators=100), normalizer_model=KNeighborsRegressor())
# ----------------------------------------------------------------- # force_prediction result_summary = [] s_folder = StratifiedKFold(n_splits=10, shuffle=True) for index, (train, test) in enumerate(s_folder.split(X, y)): x_train_std, x_test_std = X[train], X[test] y_train, y_test = y[train], y[test] truth = y_test.reshape((-1, 1)) lda = LinearDiscriminantAnalysis(n_components=9) x_train_lda = lda.fit_transform(x_train_std, y_train) x_test_lda = lda.transform(x_test_std) nc_fun = NcFactory.create_nc(model=simple_model) model = BootstrapConformalClassifier(IcpClassifier(nc_fun)) model.fit(x_train_lda, y_train) prediction = model.predict(x_test_lda, significance=None) table = np.hstack((prediction, truth)) result = [1 - force_mean_errors(prediction, truth)] if index == 0: result_summary = result else: result_summary = np.vstack((result_summary, result)) print('\nBCP_Force') if np.unique(truth).shape[0] == 10: print('True') else: print(
def __updatePlot(self):
    """Redraw every enabled diagnostic panel, then reset per-iteration buffers.

    Panels are drawn in order into ``self.ax[plotIdx]``: (1) sampling over
    the ground truth, (2) the averaged predicted design space, (3) the hint
    space, (4) distance histograms for labeled samples, (5) distance
    histograms for unlabeled samples.  Blocks on a button press when
    ``self.blocking`` is set.
    """
    # NOTE(review): assumes self.ax holds enough axes for all enabled panels — confirm.
    plotIdx = 0

    # Plot sampling over ground truth
    if self.groundTruth is not None:
        self.ax[plotIdx].clear()
        self.ax[plotIdx].set_xlim([0, 1.05])
        self.ax[plotIdx].set_ylim([0, 1.05])
        self.ax[plotIdx].set_title("ATNE sampling")
        # All ground-truth points in light gray
        self.ax[plotIdx].plot(self.groundTruth[:, 0],
                              self.groundTruth[:, 1],
                              'x',
                              color="0.7",
                              markeredgewidth=1.8,
                              markersize=5)
        # Points eliminated via hints, only when no elimination weights exist
        if len(self.hintEliminatedIndexes) > 0 and self.elimWeights is None:
            self.ax[plotIdx].plot(
                self.groundTruth[self.hintEliminatedIndexes, 0],
                self.groundTruth[self.hintEliminatedIndexes, 1],
                'x',
                color="indianred",
                markeredgewidth=1.8,
                markersize=5)
        if self.elimWeights is None:
            # No weights: draw the relaxed set uniformly in green
            self.ax[plotIdx].plot(self.groundTruth[self.relaxedIndexes, 0],
                                  self.groundTruth[self.relaxedIndexes, 1],
                                  'x',
                                  color="g",
                                  markeredgewidth=1.8,
                                  markersize=5)
        else:
            #self.ax[plotIdx].plot(self.groundTruth[self.sampledIndexes,0], self.groundTruth[self.sampledIndexes,1], 'o', color="b", markeredgewidth=1.8, markersize=5, alpha=0.6)
            # Collapse elimination weights to one scalar per design, normalized
            # per output dimension before summing.
            weights = np.sum(
                np.mean(self.elimWeights, axis=1) /
                np.max(np.mean(self.elimWeights, axis=1), axis=0),
                axis=1)
            # weights = np.mean(self.elimWeights[:,:,0], axis=1)
            # NOTE(review): `alpha` is computed but never used below — confirm intent.
            alpha = 1 - (weights / np.max(weights)) / 2
            red = weights / np.max(weights)
            # Color each relaxed point on a red(high weight)→green(low) scale
            for i in range(self.groundTruth.shape[0]):
                if i not in self.relaxedIndexes:
                    continue
                if np.isnan(red[i]):
                    red[i] = 0
                self.ax[plotIdx].plot(self.groundTruth[i, 0],
                                      self.groundTruth[i, 1],
                                      'x',
                                      color=[red[i], 1 - red[i], 0],
                                      markeredgewidth=1.8,
                                      markersize=5)
        # Sampled points on top, in blue
        self.ax[plotIdx].plot(self.groundTruth[self.sampledIndexes, 0],
                              self.groundTruth[self.sampledIndexes, 1],
                              'x',
                              color="b",
                              markeredgewidth=1.8,
                              markersize=5)
        plotIdx += 1

    # Plot predicted design space
    if self.predictions is not None and self.doPlotPredictions:
        self.ax[plotIdx].clear()
        labeledMask = np.in1d(self.predictionsIndexes, self.sampledIndexes)
        labeledMaskIdx = np.where(labeledMask)[0]
        # cmap/shapes only feed the commented-out plot variants below
        cmap = self.plt.cm.get_cmap('hsv')
        shapes = ['x', '.', '+']

        # Plot type 1
        #self.ax[plotIdx].set_title("Estimated design spaces by each forest")
        #for f in range(self.predictions.shape[0]):
        #    self.ax[plotIdx].plot(self.predictions[f,:,0], self.predictions[f,:,1], 'x', markeredgewidth=1.8, markersize=5)
        #    #self.ax[plotIdx].plot(self.predictions[f,labeledMask,0], self.predictions[f,labeledMask,1], 'x', markeredgewidth=1.8, markersize=5)

        # Plot type 2
        #import matplotlib
        #for i,p in enumerate(labeledMaskIdx):
        #    color = cmap(i/len(labeledMaskIdx))
        #    predmean = self.predictions[:,p,:].mean(0)
        #    predmed = np.median(self.predictions[:,p,:], 0)
        #    predstd = self.predictions[:,p,:].std(0)
        #    # Plot type 2.1
        #    #self.ax[plotIdx].plot(self.predictions[:,p,0], self.predictions[:,p,1], shapes[i%len(shapes)], markeredgewidth=1.8, markersize=5, color=color)
        #    #self.ax[plotIdx].plot(predmean[0], predmean[1], shapes[0], markeredgewidth=1.8, markersize=5, color=color)
        #    #self.ax[plotIdx].plot(predmed[0], predmed[1], shapes[1], markeredgewidth=1.8, markersize=5, color=color)
        #    # Plot type 2.2
        #    circle = matplotlib.patches.Ellipse(predmean[[0,1]], predstd[0], predstd[1])
        #    self.ax[plotIdx].add_artist(circle)

        # Plot type 3 (Mean predictions)
        # self.ax[plotIdx].set_title("Average estimated P_relaxed")
        # pred_mean = self.predictions.mean(0)
        # self.ax[plotIdx].plot(pred_mean[:,0], pred_mean[:,1], 'x', markeredgewidth=1.8, markersize=5)

        # Plot type 4 (Mean predictions of the entire space)
        self.ax[plotIdx].set_title("Average estimated design space")
        if self.estimators is not None:
            # Re-predict the full design space with every (forest, objective)
            # estimator, then plot the per-design mean over forests.
            predictions = np.empty([
                self.predictions.shape[0],
                self.designs.getNumDesigns(),
                self.predictions.shape[2]
            ])
            for f in range(self.predictions.shape[0]):
                for o in range(self.predictions.shape[2]):
                    predictions[f, :, o] = self.estimators[f][o].predict(
                        self.allKnobs)
            pred_mean = predictions.mean(0)
            self.ax[plotIdx].scatter(pred_mean[:, 0],
                                     pred_mean[:, 1],
                                     marker='x',
                                     c=np.arange(pred_mean.shape[0]) /
                                     pred_mean.shape[0])

        # Some tests here, although I can't remember what I was testing exactly...
        # (dead experimental code, kept disabled on purpose)
        if False:
            from nonconformist.cp import IcpRegressor
            from nonconformist.nc import NcFactory
            from sklearn.ensemble import RandomForestRegressor
            model1 = RandomForestRegressor()
            nc1 = NcFactory.create_nc(model1)
            icp1 = IcpRegressor(nc1)
            model2 = RandomForestRegressor()
            nc2 = NcFactory.create_nc(model2)
            icp2 = IcpRegressor(nc2)
            n = self.sampledIndexes.size
            idx = np.random.permutation(n)
            idx_train, idx_cal = idx[:int(0.8 * n)], idx[int(0.8 * n):]
            icp1.fit(self.allKnobs[self.sampledIndexes][idx_train, :],
                     self.groundTruth[self.sampledIndexes, 0][idx_train])
            icp2.fit(self.allKnobs[self.sampledIndexes][idx_train, :],
                     self.groundTruth[self.sampledIndexes, 1][idx_train])
            icp1.calibrate(self.allKnobs[self.sampledIndexes][idx_cal, :],
                           self.groundTruth[self.sampledIndexes, 0][idx_cal])
            icp2.calibrate(self.allKnobs[self.sampledIndexes][idx_cal, :],
                           self.groundTruth[self.sampledIndexes, 1][idx_cal])
            prediction1 = icp1.predict(self.allKnobs, significance=0.05)
            prediction2 = icp2.predict(self.allKnobs, significance=0.05)
            print(prediction1)
            self.ax[plotIdx].errorbar(pred_mean[:, 0],
                                      pred_mean[:, 1],
                                      xerr=prediction1,
                                      yerr=prediction2,
                                      linestyle="None")

        # Keep this
        #self.ax[plotIdx].set_xlim(left=0, right=2)
        #self.ax[plotIdx].set_ylim(bottom=0, top=2)
        plotIdx += 1

    # Plot hint space if available
    if self.doPlotHintSpace and self.hintSpace is not None:
        self.ax[plotIdx].clear()
        self.ax[plotIdx].set_xlim([0, 1.05])
        self.ax[plotIdx].set_ylim([0, 1.05])
        self.ax[plotIdx].set_title("Hint space")
        self.ax[plotIdx].plot(self.hintSpace[:, 0],
                              self.hintSpace[:, 1],
                              'x',
                              markeredgewidth=1.8,
                              markersize=5)
        plotIdx += 1

    # Plot distances for labeled samples
    if self.selectedDistances and self.doPlotDistances:
        self.ax[plotIdx].clear()
        self.ax[plotIdx].set_title(
            "Estimated distances for labeled samples")
        for d in self.selectedDistances:
            self.ax[plotIdx].hist(d.flatten(), 50, alpha=0.65)
            #self.ax[plotIdx].hist(d.mean(0), 50, alpha=0.65)
        #for d in self.gtDistances:
        #    self.ax[plotIdx].hist(d.flatten(), 50, alpha=0.65)
        plotIdx += 1

    # Plot distances for unlabeled samples
    if self.predictedDistances and self.doPlotDistances:
        self.ax[plotIdx].clear()
        self.ax[plotIdx].set_title(
            "Estimated distances for unlabeled samples (within P_relaxed)")
        predictedDistances = np.array(self.predictedDistances)
        for d in range(predictedDistances.shape[2]):
            self.ax[plotIdx].hist(predictedDistances[:, :, d].flatten(),
                                  50,
                                  alpha=0.65)
        plotIdx += 1

    self.plt.show()
    try:
        # Tiny pause lets the GUI event loop process the redraw; some
        # backends raise here, which is deliberately ignored.
        self.plt.pause(0.00001)
    except:
        pass
    self.fig.canvas.draw()
    if self.blocking:
        self.fig.waitforbuttonpress()

    # Reset per-iteration buffers so the next update starts clean
    self.selectedDistances = []
    self.gtDistances = []
    self.predictedDistances = []
# Generate a multiclass p-value fixture: train a transductive conformal
# classifier (TCP) on part of the iris data and save targets + per-class
# p-values to resources/multiclass.csv.
# NOTE(review): load_iris, SVC and np are not imported in this chunk —
# presumably imported elsewhere in the file; confirm before running standalone.
import sys

sys.path.append('/Users/staffan/git/peptid_studie/experiments/src')

# Nonconformist
from nonconformist.cp import TcpClassifier
from nonconformist.nc import NcFactory

iris = load_iris()
idx = np.random.permutation(iris.target.size)

# Divide the data into training set and test set
idx_train, idx_test = idx[:100], idx[100:]

model = SVC(probability=True)  # Create the underlying model
nc = NcFactory.create_nc(model)  # Create a default nonconformity function
tcp = TcpClassifier(nc)  # Create a transductive conformal classifier

# Fit the TCP using the proper training set
tcp.fit(iris.data[idx_train, :], iris.target[idx_train])

# Produce predictions for the test set
predictions = tcp.predict(iris.data[idx_test, :])

# Stack true labels next to the predicted p-values and persist as CSV
targets = np.array(iris.target[idx_test], copy=True)
targets.shape = (len(targets), 1)
output = np.hstack((targets, predictions))
np.savetxt('resources/multiclass.csv', output, delimiter=',')
#----------------------------------------------------------- # force_prediction s_folder = StratifiedKFold(n_splits=10, shuffle=True) for index, (train, test) in enumerate(s_folder.split(X, y)): X_train, X_test = X[train], X[test] y_train, y_test = y[train], y[test] x_train_sp, x_cal, y_train_sp, y_cal = train_test_split( X_train, y_train, test_size=test_size, shuffle=True) y_test = y_test.reshape((-1, 1)) lda = LinearDiscriminantAnalysis(n_components=9) x_train_lda = lda.fit_transform(x_train_sp, y_train_sp) x_cal_lda = lda.transform(x_cal) x_test_lda = lda.transform(X_test) nc = NcFactory.create_nc(model=model) icp = IcpClassifier(nc) icp.fit(x_train_lda, y_train_sp) icp.calibrate(x_cal_lda, y_cal) prediction = icp.predict(x_test_lda, significance=None) result = [1 - force_mean_errors(prediction, y_test)] if index == 0: result_summary = result else: result_summary = np.vstack((result_summary, result)) print('\nICP_Force') if np.unique(y_test).shape[0] == 10: print('True') else:
def run_experiment(dataset_name, test_method, random_state_train_test, save_to_csv=True):
    """Estimate prediction intervals and print the average length and coverage.

    Parameters
    ----------
    dataset_name : string, name of the dataset to load
    test_method : string, method to be tested, estimating the 90% prediction interval
    random_state_train_test : integer, random seed to be used
    save_to_csv : boolean, save average length and coverage to csv (True) or not (False)
    """
    dataset_name_vec = []
    method_vec = []
    coverage_vec = []
    length_vec = []
    seed_vec = []

    # Seed every RNG in play so a run is fully reproducible
    seed = random_state_train_test
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    # Defaults (reported as zero when the corresponding method is not run)
    coverage_linear = 0
    length_linear = 0
    coverage_linear_local = 0
    length_linear_local = 0
    coverage_net = 0
    length_net = 0
    coverage_net_local = 0
    length_net_local = 0
    coverage_forest = 0
    length_forest = 0
    coverage_forest_local = 0
    length_forest_local = 0
    coverage_cp_qnet = 0
    length_cp_qnet = 0
    coverage_qnet = 0
    length_qnet = 0
    coverage_cp_sign_qnet = 0
    length_cp_sign_qnet = 0
    coverage_cp_re_qnet = 0
    length_cp_re_qnet = 0
    coverage_re_qnet = 0
    length_re_qnet = 0
    coverage_cp_sign_re_qnet = 0
    length_cp_sign_re_qnet = 0
    coverage_cp_qforest = 0
    length_cp_qforest = 0
    coverage_qforest = 0
    length_qforest = 0
    coverage_cp_sign_qforest = 0
    length_cp_sign_qforest = 0

    # determines the size of test set
    test_ratio = 0.2

    # conformal prediction miscoverage level
    significance = 0.1

    # desired quantile levels, used by the quantile regression methods
    quantiles = [0.05, 0.95]

    # Random forests parameters (shared by conditional quantile random forests
    # and conditional mean random forests regression).
    n_estimators = 1000  # usual random forests n_estimators parameter
    min_samples_leaf = 1  # default parameter of sklearn

    # Quantile random forests parameters.
    # See QuantileForestRegressorAdapter class for more details
    quantiles_forest = [5, 95]
    CV_qforest = True
    coverage_factor = 0.85
    cv_test_ratio = 0.05
    cv_random_state = 1
    cv_range_vals = 30
    cv_num_vals = 10

    # Neural network parameters (shared by conditional quantile neural network
    # and conditional mean neural network regression)
    # See AllQNet_RegressorAdapter and MSENet_RegressorAdapter in helper.py
    nn_learn_func = torch.optim.Adam
    epochs = 1000
    lr = 0.0005
    hidden_size = 64
    batch_size = 64
    dropout = 0.1
    wd = 1e-6

    # Ask for a reduced coverage when tuning the network parameters by
    # cross-validation to avoid too conservative initial estimation of the
    # prediction interval. This estimation will be conformalized by CQR.
    quantiles_net = [0.1, 0.9]

    # local conformal prediction parameter.
    # See RegressorNc class for more details.
    beta = 1
    beta_net = 1

    # local conformal prediction parameter. The local ridge regression method
    # uses nearest neighbor regression as the MAD estimator.
    # Number of neighbors used by nearest neighbor regression.
    n_neighbors = 11

    print(dataset_name)
    sys.stdout.flush()

    try:
        # load the dataset
        X, y = datasets.GetDataset(dataset_name, base_dataset_path)
    except Exception:  # narrowed from bare except: don't swallow KeyboardInterrupt
        print("CANNOT LOAD DATASET!")
        return

    # Dataset is divided into test and train data based on test_ratio parameter
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_ratio, random_state=random_state_train_test)

    # zero mean and unit variance scaling of the train and test features
    scalerX = StandardScaler()
    scalerX = scalerX.fit(X_train)
    X_train = scalerX.transform(X_train)
    X_test = scalerX.transform(X_test)

    # scale the labels by dividing each by the mean absolute response
    # (renamed from max_ytrain: the value is a mean, not a max)
    mean_ytrain = np.mean(np.abs(y_train))
    y_train = y_train / mean_ytrain
    y_test = y_test / mean_ytrain

    # fit a simple ridge regression model (sanity check)
    model = linear_model.RidgeCV()
    model = model.fit(X_train, y_train)
    predicted_data = model.predict(X_test).astype(np.float32)

    # calculate the normalized mean squared error
    print("Ridge relative error: %f" %
          (np.sum((y_test - predicted_data) ** 2) / np.sum(y_test ** 2)))
    sys.stdout.flush()

    # reshape the data
    X_train = np.asarray(X_train)
    y_train = np.squeeze(np.asarray(y_train))
    X_test = np.asarray(X_test)
    y_test = np.squeeze(np.asarray(y_test))

    # input dimensions
    n_train = X_train.shape[0]
    in_shape = X_train.shape[1]

    print("Size: train (%d, %d), test (%d, %d)" %
          (X_train.shape[0], X_train.shape[1], X_test.shape[0], X_test.shape[1]))
    sys.stdout.flush()

    # set seed for splitting the data into proper train and calibration
    np.random.seed(seed)
    idx = np.random.permutation(n_train)

    # divide the data into proper training set and calibration set
    n_half = int(np.floor(n_train / 2))
    idx_train, idx_cal = idx[:n_half], idx[n_half:2 * n_half]

    def record(method, coverage, length):
        # Append one (dataset, method, coverage, length, seed) result row
        dataset_name_vec.append(dataset_name)
        method_vec.append(method)
        coverage_vec.append(coverage)
        length_vec.append(length)
        seed_vec.append(seed)

    def make_mse_net():
        # Fresh conditional-mean network adapter with the shared hyperparameters
        return helper.MSENet_RegressorAdapter(model=None,
                                              fit_params=None,
                                              in_shape=in_shape,
                                              hidden_size=hidden_size,
                                              learn_func=nn_learn_func,
                                              epochs=epochs,
                                              batch_size=batch_size,
                                              dropout=dropout,
                                              lr=lr,
                                              wd=wd,
                                              test_ratio=cv_test_ratio,
                                              random_state=cv_random_state)

    def make_qnet(q_list, rearrange):
        # Fresh quantile-network adapter with the shared hyperparameters
        return helper.AllQNet_RegressorAdapter(model=None,
                                               fit_params=None,
                                               in_shape=in_shape,
                                               hidden_size=hidden_size,
                                               quantiles=q_list,
                                               learn_func=nn_learn_func,
                                               epochs=epochs,
                                               batch_size=batch_size,
                                               dropout=dropout,
                                               lr=lr,
                                               wd=wd,
                                               test_ratio=cv_test_ratio,
                                               random_state=cv_random_state,
                                               use_rearrangement=rearrange)

    def make_qforest_params(cv):
        # Shared quantile-forest configuration; `cv` toggles internal CV tuning.
        # NOTE: the original assigned random_state=0 and then immediately
        # overwrote it with cv_random_state; only the final value is kept.
        return {
            "random_state": cv_random_state,
            "min_samples_leaf": min_samples_leaf,
            "n_estimators": n_estimators,
            "max_features": X_train.shape[1],
            "CV": cv,
            "coverage_factor": coverage_factor,
            "test_ratio": cv_test_ratio,
            "range_vals": cv_range_vals,
            "num_vals": cv_num_vals,
        }

    ######################### Linear
    if 'linear' == test_method:
        model = linear_model.RidgeCV()
        nc = RegressorNc(model)
        y_lower, y_upper = helper.run_icp(nc, X_train, y_train, X_test,
                                          idx_train, idx_cal, significance)
        if plot_results:
            helper.plot_func_data(y_test, y_lower, y_upper, "Ridge")
        coverage_linear, length_linear = helper.compute_coverage(
            y_test, y_lower, y_upper, significance, "Ridge")
        record('Ridge', coverage_linear, length_linear)

        # locally adaptive variant: kNN regression estimates the residual scale
        nc = NcFactory.create_nc(
            linear_model.RidgeCV(),
            normalizer_model=KNeighborsRegressor(n_neighbors=n_neighbors))
        y_lower, y_upper = helper.run_icp(nc, X_train, y_train, X_test,
                                          idx_train, idx_cal, significance)
        if plot_results:
            helper.plot_func_data(y_test, y_lower, y_upper, "Ridge-L")
        coverage_linear_local, length_linear_local = helper.compute_coverage(
            y_test, y_lower, y_upper, significance, "Ridge-L")
        record('Ridge-L', coverage_linear_local, length_linear_local)

    ######################### Neural net
    if 'neural_net' == test_method:
        model = make_mse_net()
        nc = RegressorNc(model)
        y_lower, y_upper = helper.run_icp(nc, X_train, y_train, X_test,
                                          idx_train, idx_cal, significance)
        if plot_results:
            helper.plot_func_data(y_test, y_lower, y_upper, "Net")
        coverage_net, length_net = helper.compute_coverage(
            y_test, y_lower, y_upper, significance, "Net")
        record('Net', coverage_net, length_net)

        # locally adaptive variant: a second network estimates the residuals
        normalizer_adapter = make_mse_net()
        adapter = make_mse_net()
        normalizer = RegressorNormalizer(adapter, normalizer_adapter,
                                         AbsErrorErrFunc())
        nc = RegressorNc(adapter, AbsErrorErrFunc(), normalizer, beta=beta_net)
        y_lower, y_upper = helper.run_icp(nc, X_train, y_train, X_test,
                                          idx_train, idx_cal, significance)
        if plot_results:
            helper.plot_func_data(y_test, y_lower, y_upper, "Net-L")
        coverage_net_local, length_net_local = helper.compute_coverage(
            y_test, y_lower, y_upper, significance, "Net-L")
        record('Net-L', coverage_net_local, length_net_local)

    ################## Random Forest
    if 'random_forest' == test_method:
        model = RandomForestRegressor(n_estimators=n_estimators,
                                      min_samples_leaf=min_samples_leaf,
                                      random_state=0)
        nc = RegressorNc(model, AbsErrorErrFunc())
        y_lower, y_upper = helper.run_icp(nc, X_train, y_train, X_test,
                                          idx_train, idx_cal, significance)
        if plot_results:
            helper.plot_func_data(y_test, y_lower, y_upper, "RF")
        coverage_forest, length_forest = helper.compute_coverage(
            y_test, y_lower, y_upper, significance, "RF")
        record('RF', coverage_forest, length_forest)

        # locally adaptive variant: a second forest estimates the residuals
        normalizer_adapter = RandomForestRegressor(
            n_estimators=n_estimators,
            min_samples_leaf=min_samples_leaf,
            random_state=0)
        adapter = RandomForestRegressor(n_estimators=n_estimators,
                                        min_samples_leaf=min_samples_leaf,
                                        random_state=0)
        normalizer = RegressorNormalizer(adapter, normalizer_adapter,
                                         AbsErrorErrFunc())
        nc = RegressorNc(adapter, AbsErrorErrFunc(), normalizer, beta=beta)
        y_lower, y_upper = helper.run_icp(nc, X_train, y_train, X_test,
                                          idx_train, idx_cal, significance)
        if plot_results:
            helper.plot_func_data(y_test, y_lower, y_upper, "RF-L")
        coverage_forest_local, length_forest_local = helper.compute_coverage(
            y_test, y_lower, y_upper, significance, "RF-L")
        record('RF-L', coverage_forest_local, length_forest_local)

    ################## Quantile Net
    if 'quantile_net' == test_method:
        model_full = make_qnet(quantiles, rearrange=False)
        model_full.fit(X_train, y_train)
        tmp = model_full.predict(X_test)
        y_lower = tmp[:, 0]
        y_upper = tmp[:, 1]
        if plot_results:
            helper.plot_func_data(y_test, y_lower, y_upper, "QNet")
        coverage_qnet, length_qnet = helper.compute_coverage(
            y_test, y_lower, y_upper, significance, "QNet")
        record('QNet', coverage_qnet, length_qnet)

    if 'cqr_quantile_net' == test_method:
        model = make_qnet(quantiles_net, rearrange=False)
        nc = RegressorNc(model, QuantileRegErrFunc())
        y_lower, y_upper = helper.run_icp(nc, X_train, y_train, X_test,
                                          idx_train, idx_cal, significance)
        if plot_results:
            helper.plot_func_data(y_test, y_lower, y_upper, "CQR Net")
        coverage_cp_qnet, length_cp_qnet = helper.compute_coverage(
            y_test, y_lower, y_upper, significance, "CQR Net")
        record('CQR Net', coverage_cp_qnet, length_cp_qnet)

    if 'cqr_asymmetric_quantile_net' == test_method:
        model = make_qnet(quantiles_net, rearrange=False)
        nc = RegressorNc(model, QuantileRegAsymmetricErrFunc())
        y_lower, y_upper = helper.run_icp(nc, X_train, y_train, X_test,
                                          idx_train, idx_cal, significance)
        if plot_results:
            helper.plot_func_data(y_test, y_lower, y_upper, "CQR Sign Net")
        coverage_cp_sign_qnet, length_cp_sign_qnet = helper.compute_coverage(
            y_test, y_lower, y_upper, significance, "CQR Sign Net")
        record('CQR Sign Net', coverage_cp_sign_qnet, length_cp_sign_qnet)

    ################### Rearrangement Quantile Net
    if 'rearrangement' == test_method:
        model_full = make_qnet(quantiles, rearrange=True)
        model_full.fit(X_train, y_train)
        tmp = model_full.predict(X_test)
        y_lower = tmp[:, 0]
        y_upper = tmp[:, 1]
        if plot_results:
            helper.plot_func_data(y_test, y_lower, y_upper, "Rearrange QNet")
        coverage_re_qnet, length_re_qnet = helper.compute_coverage(
            y_test, y_lower, y_upper, significance, "Rearrange QNet")
        record('Rearrange QNet', coverage_re_qnet, length_re_qnet)

    if 'cqr_rearrangement' == test_method:
        model = make_qnet(quantiles_net, rearrange=True)
        nc = RegressorNc(model, QuantileRegErrFunc())
        y_lower, y_upper = helper.run_icp(nc, X_train, y_train, X_test,
                                          idx_train, idx_cal, significance)
        if plot_results:
            helper.plot_func_data(y_test, y_lower, y_upper, "Rearrange CQR Net")
        coverage_cp_re_qnet, length_cp_re_qnet = helper.compute_coverage(
            y_test, y_lower, y_upper, significance, "Rearrange CQR Net")
        record('Rearrange CQR Net', coverage_cp_re_qnet, length_cp_re_qnet)

    if 'cqr_asymmetric_rearrangement' == test_method:
        model = make_qnet(quantiles_net, rearrange=True)
        nc = RegressorNc(model, QuantileRegAsymmetricErrFunc())
        y_lower, y_upper = helper.run_icp(nc, X_train, y_train, X_test,
                                          idx_train, idx_cal, significance)
        if plot_results:
            helper.plot_func_data(y_test, y_lower, y_upper,
                                  "Rearrange CQR Sign Net")
        # FIX: label previously said "Rearrange CQR Net" here, mislabeling
        # this method's printed coverage line; use the correct method name.
        coverage_cp_sign_re_qnet, length_cp_sign_re_qnet = helper.compute_coverage(
            y_test, y_lower, y_upper, significance, "Rearrange CQR Sign Net")
        record('Rearrange CQR Sign Net',
               coverage_cp_sign_re_qnet, length_cp_sign_re_qnet)

    ################### Quantile Random Forest
    if 'quantile_forest' == test_method:
        model_full = helper.QuantileForestRegressorAdapter(
            model=None,
            fit_params=None,
            quantiles=np.dot(100, quantiles),
            params=make_qforest_params(cv=False))
        model_full.fit(X_train, y_train)
        tmp = model_full.predict(X_test)
        y_lower = tmp[:, 0]
        y_upper = tmp[:, 1]
        if plot_results:
            helper.plot_func_data(y_test, y_lower, y_upper, "QRF")
        coverage_qforest, length_qforest = helper.compute_coverage(
            y_test, y_lower, y_upper, significance, "QRF")
        record('QRF', coverage_qforest, length_qforest)

    if 'cqr_quantile_forest' == test_method:
        model = helper.QuantileForestRegressorAdapter(
            model=None,
            fit_params=None,
            quantiles=quantiles_forest,
            params=make_qforest_params(cv=CV_qforest))
        nc = RegressorNc(model, QuantileRegErrFunc())
        y_lower, y_upper = helper.run_icp(nc, X_train, y_train, X_test,
                                          idx_train, idx_cal, significance)
        if plot_results:
            helper.plot_func_data(y_test, y_lower, y_upper, "CQR RF")
        coverage_cp_qforest, length_cp_qforest = helper.compute_coverage(
            y_test, y_lower, y_upper, significance, "CQR RF")
        record('CQR RF', coverage_cp_qforest, length_cp_qforest)

    if 'cqr_asymmetric_quantile_forest' == test_method:
        model = helper.QuantileForestRegressorAdapter(
            model=None,
            fit_params=None,
            quantiles=quantiles_forest,
            params=make_qforest_params(cv=CV_qforest))
        nc = RegressorNc(model, QuantileRegAsymmetricErrFunc())
        y_lower, y_upper = helper.run_icp(nc, X_train, y_train, X_test,
                                          idx_train, idx_cal, significance)
        if plot_results:
            helper.plot_func_data(y_test, y_lower, y_upper, "CQR Sign RF")
        coverage_cp_sign_qforest, length_cp_sign_qforest = helper.compute_coverage(
            y_test, y_lower, y_upper, significance, "CQR Sign RF")
        record('CQR Sign RF', coverage_cp_sign_qforest, length_cp_sign_qforest)

    ############### Summary
    coverage_str = 'Coverage (expected ' + str(100 - significance * 100) + '%)'
    results = np.array([
        [dataset_name, coverage_str, 'Avg. Length', 'Seed'],
        ['CP Linear', coverage_linear, length_linear, seed],
        ['CP Linear Local', coverage_linear_local, length_linear_local, seed],
        ['CP Neural Net', coverage_net, length_net, seed],
        ['CP Neural Net Local', coverage_net_local, length_net_local, seed],
        ['CP Random Forest', coverage_forest, length_forest, seed],
        ['CP Random Forest Local', coverage_forest_local, length_forest_local, seed],
        ['CP Quantile Net', coverage_cp_qnet, length_cp_qnet, seed],
        ['CP Asymmetric Quantile Net', coverage_cp_sign_qnet, length_cp_sign_qnet, seed],
        ['Quantile Net', coverage_qnet, length_qnet, seed],
        ['CP Rearrange Quantile Net', coverage_cp_re_qnet, length_cp_re_qnet, seed],
        ['CP Asymmetric Rearrange Quantile Net', coverage_cp_sign_re_qnet, length_cp_sign_re_qnet, seed],
        ['Rearrange Quantile Net', coverage_re_qnet, length_re_qnet, seed],
        ['CP Quantile Random Forest', coverage_cp_qforest, length_cp_qforest, seed],
        ['CP Asymmetric Quantile Random Forest', coverage_cp_sign_qforest, length_cp_sign_qforest, seed],
        ['Quantile Random Forest', coverage_qforest, length_qforest, seed],
    ])

    results_ = pd.DataFrame(data=results[1:, 1:],
                            index=results[1:, 0],
                            columns=results[0, 1:])
    print("== SUMMARY == ")
    print("dataset name: " + dataset_name)
    print(results_)
    sys.stdout.flush()

    if save_to_csv:
        # (dead statement `results = pd.DataFrame(results)` removed — the
        # value was never used; only the per-row vectors are persisted)
        outdir = './results/'
        if not os.path.exists(outdir):
            os.mkdir(outdir)
        out_name = outdir + 'results.csv'
        df = pd.DataFrame({'name': dataset_name_vec,
                           'method': method_vec,
                           coverage_str: coverage_vec,
                           'Avg. Length': length_vec,
                           'seed': seed_vec})
        # append to any existing results file rather than overwriting it
        if os.path.isfile(out_name):
            df2 = pd.read_csv(out_name)
            df = pd.concat([df2, df], ignore_index=True)
        df.to_csv(out_name, index=False)
df_test = pd.read_csv(TEST) trainX, trainY = df_train.drop(['TARGET'], axis=1), df_train['TARGET'] calX, calY = df_cal.drop(['TARGET'], axis=1), df_cal['TARGET'] model = joblib.load(os.path.join("models", f"{MODEL}.pkl")) if 'TARGET' in df_test.columns: testX, testY = df_test.drop(['id', 'TARGET'], axis=1), df_test['TARGET'] else: testX = df_test.drop(['id'], axis=1) if PROBLEM_TYPE == 'classification': if MODEL == 'catboost': raise Exception('Cant compute inervals for CatBoostClassifier!') nc = NcFactory.create_nc( model, normalizer_model=KNeighborsRegressor( n_neighbors=11)) # Create a default nonconformity function icp = IcpClassifier(nc) icp.fit(trainX.values, trainY.values) # Calibrate the ICP using the calibration set icp.calibrate(calX.values, calY.values) # Produce predictions for the test set, with confidence 95% prediction = icp.predict(testX.to_numpy(), significance=0.05) else: if MODEL == 'catboost': params = joblib.load("models/params.pkl")
def test_nc_factory(self):
    """Smoke-test ``NcFactory.create_nc`` across eight configurations.

    Covers classification (iris) and regression (diabetes), each in four
    flavours: plain, normalized (KNN normalizer), out-of-bag, and
    out-of-bag + normalized. Each configuration is cross-validated and its
    mean scores per significance level are printed.
    """

    def evaluate(predictor, label, dataset, dataset_label, metrics):
        # Cross-validate the conformal predictor and print mean scores
        # grouped by significance level.
        result = cross_val_score(
            predictor,
            dataset.data,
            dataset.target,
            iterations=10,
            folds=10,
            scoring_funcs=metrics,
            significance_levels=[0.05, 0.1, 0.2],
        )
        print("\n{}: {}".format(label, dataset_label))
        result = result.drop(["fold", "iter"], axis=1)
        print(result.groupby(["significance"]).mean())

    clf_metrics = [class_mean_errors, class_avg_c]
    reg_metrics = [reg_mean_errors, reg_median_size]

    # --- Classification: plain ---------------------------------------------
    iris = load_iris()
    helper = ClassIcpCvHelper(
        IcpClassifier(NcFactory.create_nc(
            RandomForestClassifier(n_estimators=100))))
    evaluate(helper, "IcpClassifier", iris, "iris", clf_metrics)

    # --- Classification: normalized ----------------------------------------
    iris = load_iris()
    helper = ClassIcpCvHelper(
        IcpClassifier(NcFactory.create_nc(
            RandomForestClassifier(n_estimators=100),
            normalizer_model=KNeighborsRegressor())))
    evaluate(helper, "IcpClassifier (normalized)", iris, "iris", clf_metrics)

    # --- Classification: out-of-bag ----------------------------------------
    iris = load_iris()
    helper = OobCpClassifier(NcFactory.create_nc(
        RandomForestClassifier(n_estimators=100, oob_score=True),
        oob=True))
    evaluate(helper, "IcpClassifier (OOB)", iris, "iris", clf_metrics)

    # --- Classification: out-of-bag, normalized ----------------------------
    iris = load_iris()
    helper = OobCpClassifier(NcFactory.create_nc(
        RandomForestClassifier(n_estimators=100, oob_score=True),
        oob=True,
        normalizer_model=KNeighborsRegressor(),
    ))
    evaluate(
        helper,
        "IcpClassifier (OOB, normalized)",
        iris,
        "iris",
        clf_metrics,
    )

    # --- Regression: plain --------------------------------------------------
    diabetes = load_diabetes()
    helper = RegIcpCvHelper(
        IcpRegressor(NcFactory.create_nc(
            RandomForestRegressor(n_estimators=100))))
    evaluate(helper, "IcpRegressor", diabetes, "diabetes", reg_metrics)

    # --- Regression: normalized --------------------------------------------
    diabetes = load_diabetes()
    helper = RegIcpCvHelper(
        IcpRegressor(NcFactory.create_nc(
            RandomForestRegressor(n_estimators=100),
            normalizer_model=KNeighborsRegressor())))
    evaluate(
        helper,
        "IcpRegressor (normalized)",
        diabetes,
        "diabetes",
        reg_metrics,
    )

    # --- Regression: out-of-bag --------------------------------------------
    diabetes = load_diabetes()
    helper = OobCpRegressor(NcFactory.create_nc(
        RandomForestRegressor(n_estimators=100, oob_score=True),
        oob=True))
    evaluate(helper, "IcpRegressor (OOB)", diabetes, "diabetes", reg_metrics)

    # --- Regression: out-of-bag, normalized --------------------------------
    diabetes = load_diabetes()
    helper = OobCpRegressor(NcFactory.create_nc(
        RandomForestRegressor(n_estimators=100, oob_score=True),
        oob=True,
        normalizer_model=KNeighborsRegressor(),
    ))
    evaluate(
        helper,
        "IcpRegressor (OOB, normalized)",
        diabetes,
        "diabetes",
        reg_metrics,
    )