def test_icp_regression_tree(self):
    """Fit, calibrate and query an ICP regressor, without and with normalization.

    The dataset returned by ``load_boston()`` is shuffled and cut into three
    consecutive thirds (training, calibration, test).  For each variant the
    intervals predicted at significance 0.1 are printed as a table with the
    columns ``min``, ``max``, ``truth`` and ``size``.  Nothing is asserted;
    the test only checks that the pipeline runs.
    """

    def show_intervals(intervals, truth):
        """Print interval bounds next to the true targets and interval widths."""
        columns = ["min", "max", "truth", "size"]
        width = intervals[:, 1] - intervals[:, 0]
        stacked = np.vstack([intervals.T, truth, width.T]).T
        print(pd.DataFrame(stacked, columns=columns))

    # -------------------------------------------------------------------------
    # Shuffle the sample indices and split them into thirds
    # -------------------------------------------------------------------------
    boston = load_boston()
    order = np.random.permutation(boston.target.size)
    first_cut = int(order.size / 3)
    second_cut = int(2 * order.size / 3)
    train_idx = order[:first_cut]
    cal_idx = order[first_cut:second_cut]
    test_idx = order[second_cut:]

    x_train, y_train = boston.data[train_idx, :], boston.target[train_idx]
    x_cal, y_cal = boston.data[cal_idx, :], boston.target[cal_idx]
    x_test, y_test = boston.data[test_idx, :], boston.target[test_idx]

    # -------------------------------------------------------------------------
    # Variant 1: plain absolute-error nonconformity
    # -------------------------------------------------------------------------
    tree_adapter = RegressorAdapter(DecisionTreeRegressor(min_samples_leaf=5))
    plain_icp = IcpRegressor(RegressorNc(tree_adapter, AbsErrorErrFunc()))
    plain_icp.fit(x_train, y_train)
    plain_icp.calibrate(x_cal, y_cal)
    show_intervals(plain_icp.predict(x_test, significance=0.1), y_test)

    # -------------------------------------------------------------------------
    # Variant 2: absolute error scaled by a 1-nearest-neighbour normalizer
    # -------------------------------------------------------------------------
    tree_adapter = RegressorAdapter(DecisionTreeRegressor(min_samples_leaf=5))
    knn_adapter = RegressorAdapter(KNeighborsRegressor(n_neighbors=1))
    scaler = RegressorNormalizer(tree_adapter, knn_adapter, AbsErrorErrFunc())
    normalized_icp = IcpRegressor(
        RegressorNc(tree_adapter, AbsErrorErrFunc(), scaler))
    normalized_icp.fit(x_train, y_train)
    normalized_icp.calibrate(x_cal, y_cal)
    show_intervals(normalized_icp.predict(x_test, significance=0.1), y_test)
def build(self):
    """Build a quantitative PLSR model, optionally wrapped in a conformal predictor.

    Returns:
        tuple: ``(True, results)`` on success, where ``results`` is a list of
        ``(key, label, value)`` tuples describing the model, or
        ``(False, message)`` when the data is not quantitative or the model
        failed to initialise.
    """
    if not self.quantitative:
        print("PLSR only applies to quantitative data")
        return False, "PLSR only applies to quantitative data"

    if self.failed:
        return False, "Error initiating model"

    # Work on copies so the matrices stored on the instance are not modified.
    X = self.X.copy()
    Y = self.Y.copy()

    results = [
        ('nobj', 'number of objects', self.nobj),
        ('nvarx', 'number of predictor variables', self.nvarx),
    ]

    if self.cv:
        # NOTE(review): 46 is presumably a random seed -- confirm against
        # getCrossVal's signature.
        self.cv = getCrossVal(self.cv, 46, self.n, self.p)

    if self.tune:
        if self.optimiz == 'auto':
            # 'auto' uses the parent class optimizer instead of this class's.
            super(PLSR, self).optimize(
                X, Y, PLS_r(**self.estimator_parameters),
                self.tune_parameters)
        elif self.optimiz == 'manual':
            self.optimize(
                X, Y, PLS_r(**self.estimator_parameters),
                self.tune_parameters)

        results.append(
            ('model', 'model type', 'PLSR quantitative (optimized)'))
    else:
        print("Building Quantitative PLSR")
        self.estimator = PLS_r(**self.estimator_parameters)
        results.append(('model', 'model type', 'PLSR quantitative'))

    if self.conformal:
        underlying_model = RegressorAdapter(self.estimator)
        # The original first built a KNeighborsRegressor(n_neighbors=1)
        # adapter here and overwrote it on the next line; only the
        # estimator-based adapter was ever used, so the dead store is gone.
        normalizing_model = RegressorAdapter(self.estimator)
        normalizer = RegressorNormalizer(
            underlying_model, normalizing_model, AbsErrorErrFunc())
        nc = RegressorNc(underlying_model, AbsErrorErrFunc(), normalizer)
        self.conformal_pred = AggregatedCp(IcpRegressor(nc),
                                           BootstrapSampler())
        self.conformal_pred.fit(X, Y)

        # A second 'model' entry is appended after the non-conformal one.
        results.append(
            ('model', 'model type', 'conformal PLSR quantitative'))

    self.estimator.fit(X, Y)

    return True, results
def CF_QuanCal(X, Y, estimator):
    """Fit an aggregated conformal regressor around ``estimator``.

    The nonconformity score is the absolute error, scaled by a normalizer
    that is given ``estimator`` and a shallow copy of it.

    Args:
        X: Training features, passed unchanged to ``AggregatedCp.fit``.
        Y: Training targets, passed unchanged to ``AggregatedCp.fit``.
        estimator: Regressor to wrap.

    Returns:
        The fitted ``AggregatedCp`` instance.
    """
    normalizer = RegressorNormalizer(
        estimator, copy.copy(estimator), AbsErrorErrFunc())
    nc = RegressorNc(RegressorAdapter(estimator), AbsErrorErrFunc(), normalizer)

    # Fix: the sampler used to be passed as the second positional argument
    # of IcpRegressor (a misplaced closing parenthesis), so AggregatedCp never
    # received it.  It is now the sampler of the aggregated predictor.
    acp = AggregatedCp(IcpRegressor(nc), RandomSubSampler())
    acp.fit(X, Y)
    return acp
def build(self):
    '''Build a new XGBOOST model with the X and Y numpy matrices.

    Depending on the 'tune', 'quantitative' and 'conformal' parameters this
    optimizes or fits an XGBRegressor / XGBClassifier and, when requested,
    wraps it in an aggregated conformal predictor.

    Returns a tuple (True, results) on success, where results is a list of
    (key, label, value) tuples, or (False, message) on any failure.
    '''
    try:
        from xgboost.sklearn import XGBClassifier
        from xgboost.sklearn import XGBRegressor
    except Exception:
        return False, 'XGboost not found, please revise your environment'

    # Make a copy of data matrices
    X = self.X.copy()
    Y = self.Y.copy()

    results = []
    results.append(('nobj', 'number of objects', self.nobj))
    results.append(('nvarx', 'number of predictor variables', self.nvarx))

    # If tune then call gridsearch to optimize the estimator
    if self.param.getVal('tune'):
        LOG.info("Optimizing XGBOOST estimator")

        try:
            # Check type of model
            if self.param.getVal('quantitative'):
                self.estimator = XGBRegressor(**self.estimator_parameters)
                self.optimize(X, Y, self.estimator, self.tune_parameters)
                results.append(('model', 'model type',
                                'XGBOOST quantitative (optimized)'))
            else:
                # The original also set 'num_class' on the dict returned by
                # get_params(); that dict was never passed back to the
                # estimator, so the statement had no effect and was removed.
                self.estimator = XGBClassifier(**self.estimator_parameters)
                self.optimize(X, Y, self.estimator, self.tune_parameters)
                results.append(('model', 'model type',
                                'XGBOOST qualitative (optimized)'))
        except Exception as e:
            return False, f'Exception optimizing XGBOOST estimator with exception {e}'
    else:
        try:
            if self.param.getVal('quantitative'):
                LOG.info("Building Quantitative XGBOOST model")
                self.estimator = XGBRegressor(**self.estimator_parameters)
                results.append(('model', 'model type', 'XGBOOST quantitative'))
            else:
                LOG.info("Building Qualitative XGBOOST model")
                self.estimator = XGBClassifier(**self.estimator_parameters)
                results.append(('model', 'model type', 'XGBOOST qualitative'))

            self.estimator.fit(X, Y)
            print(self.estimator)
        except Exception as e:
            # Fix: a leftover `raise e` made this return unreachable.  Errors
            # are reported through the (False, message) convention used by
            # every other failure path of this method.
            return False, f'Exception building XGBOOST estimator with exception {e}'

    # Shallow copy of the base estimator, used as the model inside the
    # conformal wrapper while self.estimator is replaced by that wrapper.
    self.estimator_temp = copy(self.estimator)

    if not self.param.getVal('conformal'):
        return True, results

    # Create the conformal estimator
    try:
        # Conformal regressor
        if self.param.getVal('quantitative'):
            LOG.info("Building conformal Quantitative XGBOOST model")

            underlying_model = RegressorAdapter(self.estimator_temp)
            normalizing_model = RegressorAdapter(self.estimator_temp)
            normalizer = RegressorNormalizer(
                underlying_model, normalizing_model, AbsErrorErrFunc())
            nc = RegressorNc(underlying_model, AbsErrorErrFunc(), normalizer)

            self.estimator = AggregatedCp(IcpRegressor(nc),
                                          BootstrapSampler())
            self.estimator.fit(X, Y)
            results.append(('model', 'model type',
                            'conformal XGBOOST quantitative'))

        # Conformal classifier
        else:
            LOG.info("Building conformal Qualitative XGBOOST model")

            self.estimator = AggregatedCp(
                IcpClassifier(
                    ClassifierNc(
                        ClassifierAdapter(self.estimator_temp),
                        MarginErrFunc()
                    )
                ),
                BootstrapSampler())

            # Fit estimator to the data
            self.estimator.fit(X, Y)
            results.append(('model', 'model type',
                            'conformal XGBOOST qualitative'))
    except Exception as e:
        # Fix: same unreachable-return pattern as above.
        return False, f'Exception building conformal XGBOOST estimator with exception {e}'

    return True, results

## Overriding of parent methods

# def CF_quantitative_validation(self):
#     ''' performs validation for conformal quantitative models '''

# def CF_qualitative_validation(self):
#     ''' performs validation for conformal qualitative models '''

# def quantitativeValidation(self):
#     ''' performs validation for quantitative models '''

# def qualitativeValidation(self):
#     ''' performs validation for qualitative models '''

# def validate(self):
#     ''' Validates the model and computes suitable model quality scoring values'''

# def optimize(self, X, Y, estimator, tune_parameters):
#     ''' optimizes a model using a grid search over a range of values for diverse parameters'''

# def regularProject(self, Xb, results):
#     ''' projects a collection of query objects in a regular model, for obtaining predictions '''

# def conformalProject(self, Xb, results):
#     ''' projects a collection of query objects in a conformal model, for obtaining predictions '''

# def project(self, Xb, results):
#     ''' Uses the X matrix provided as argument to predict Y'''
def build(self): '''Build a new DL model with the X and Y numpy matrices ''' try: from keras.wrappers.scikit_learn import KerasClassifier from keras.wrappers.scikit_learn import KerasRegressor except Exception as e: return False, 'Keras not found, please revise your environment' # Make a copy of data matrices X = self.X.copy() Y = self.Y.copy() results = [] results.append(('nobj', 'number of objects', self.nobj)) results.append(('nvarx', 'number of predictor variables', self.nvarx)) # If tune then call gridsearch to optimize the estimator if self.param.getVal('tune'): LOG.info("Optimizing Keras estimator") try: # Check type of model if self.param.getVal('quantitative'): self.estimator = KerasRegressor( **self.estimator_parameters) self.optimize(X, Y, self.estimator, self.tune_parameters) results.append(('model', 'model type', 'KERAS quantitative (optimized)')) else: self.estimator = KerasClassifier( **self.estimator_parameters) #params = self.estimator.get_params() #params['num_class'] = 2 self.optimize(X, Y, self.estimator, self.tune_parameters) results.append(('model', 'model type', 'KERAS qualitative (optimized)')) except Exception as e: return False, f'Exception optimizing KERAS estimator with exception {e}' else: try: if self.param.getVal('quantitative'): LOG.info("Building Quantitative KERAS mode") self.estimator = KerasRegressor( build_fn=self.create_model, **self.estimator_parameters, verbose=0) results.append( ('model', 'model type', 'Keras quantitative')) else: LOG.info("Building Qualitative Keras model") self.estimator = KerasClassifier( build_fn=self.create_model, dim=self.X.shape[1], **self.estimator_parameters, verbose=0) results.append( ('model', 'model type', 'Keras qualitative')) self.estimator.fit(X, Y) print(self.estimator) except Exception as e: raise e return False, f'Exception building Keras estimator with exception {e}' self.estimator_temp = clone(self.estimator) if not self.param.getVal('conformal'): return True, results # Create the 
conformal estimator try: # Conformal regressor if self.param.getVal('quantitative'): LOG.info("Building conformal Quantitative Keras model") underlying_model = RegressorAdapter(self.estimator_temp) normalizing_model = RegressorAdapter( KNeighborsRegressor(n_neighbors=15)) # normalizing_model = RegressorAdapter(self.estimator_temp) normalizer = RegressorNormalizer(underlying_model, normalizing_model, AbsErrorErrFunc()) nc = RegressorNc(underlying_model, AbsErrorErrFunc(), normalizer) # self.conformal_pred = AggregatedCp(IcpRegressor # (RegressorNc(RegressorAdapter(self.estimator))), # BootstrapSampler()) self.estimator = AggregatedCp(IcpRegressor(nc), BootstrapSampler()) self.estimator.fit(X, Y) results.append( ('model', 'model type', 'conformal Keras quantitative')) # Conformal classifier else: LOG.info("Building conformal Qualitative Keras model") self.estimator = AggregatedCp( IcpClassifier( ClassifierNc(ClassifierAdapter(self.estimator_temp), MarginErrFunc())), BootstrapSampler()) # Fit estimator to the data print('build finished') self.estimator.fit(X, Y) results.append( ('model', 'model type', 'conformal Keras qualitative')) except Exception as e: raise e return False, f'Exception building conformal Keras estimator with exception {e}' return True, []
def train_and_test_cp_algo(i):
    """Train normalized and plain ICP regressors on file ``i`` and dump intervals.

    Reads ``data/EURUSD_NETPOSUSD_hourly_for_regresion<i>.csv``, transforms
    the ``NetPosUsd`` column with ``mlog_trans`` and builds sliding windows of
    96 values to predict the next one.  The last 120 windows are the test
    set, the 3480 before them the calibration set and the rest the training
    set.  For every significance level 5%, 10%, ..., 95% the back-transformed
    interval bounds are written to ``CP_...csv`` (plain) and ``NCP_...csv``
    (normalized): created when ``i == 0``, appended to without a header
    otherwise.

    Args:
        i: Index of the input file; also selects create vs. append mode.

    Raises:
        ValueError: If the series is too short to leave any training window.
    """
    # Local import: the file's import block is not visible from this function.
    import os

    window = 96
    n_test = 120
    n_calibration = 3480

    # Fix: the original literal 'data\EURUSD...' relied on an invalid escape
    # sequence and only resolved on Windows.
    path = os.path.join(
        'data', 'EURUSD_NETPOSUSD_hourly_for_regresion' + str(i) + '.csv')
    df = pd.read_csv(path).drop(['QdfTime', 'Unnamed: 0'], axis=1).fillna(0)

    # Untransformed targets of the test period, reported next to the bounds.
    y_raw_test = df.NetPosUsd[-n_test:]
    median_ = df.NetPosUsd.median()
    mad_ = mad(df.NetPosUsd.values)
    df.NetPosUsd = mlog_trans(df.NetPosUsd.values)
    data = df.NetPosUsd.values

    # Row k holds data[k:k + window]; its target is data[k + window].
    X = np.array(
        [data[stop - window:stop] for stop in range(window, data.shape[0])]
    ).reshape(-1, window)
    y = data[window:]

    n_train = X.shape[0] - n_test - n_calibration
    if n_train <= 0:
        raise ValueError(
            'not enough observations: %d windows available, more than %d '
            'required' % (X.shape[0], n_test + n_calibration))

    train, ytrain = X[:n_train, :], y[:n_train]
    calibrate = X[n_train:n_train + n_calibration, :]
    ycalibrate = y[n_train:n_train + n_calibration]
    test = X[-n_test:]

    p = {'window': window}

    # Normalized conformal predictor (KNN difficulty estimate).
    underlying_model = RegressorAdapter(BiLSTM(p))
    normalizing_model = RegressorAdapter(KNeighborsRegressor(n_neighbors=50))
    normalizer = RegressorNormalizer(underlying_model, normalizing_model,
                                     AbsErrorErrFunc())
    icp = IcpRegressor(
        RegressorNc(underlying_model, AbsErrorErrFunc(), normalizer))
    icp.fit(train, ytrain)
    icp.calibrate(calibrate, ycalibrate)

    # Plain conformal predictor.
    # Fix: it used to wrap the very same BiLSTM object as `icp`, so fitting
    # it re-fitted the model `icp` had already been calibrated against.
    underlying_model2 = RegressorAdapter(BiLSTM(p))
    icp2 = IcpRegressor(RegressorNc(underlying_model2, AbsErrorErrFunc()))
    icp2.fit(train, ytrain)
    icp2.calibrate(calibrate, ycalibrate)

    def interval_frame(model, significance, prefix):
        """Predict test intervals and return them back-transformed as a frame."""
        prediction = model.predict(test, significance=significance)
        lower = mlog_inverse(prediction[:, 0], median_, mad_)
        upper = mlog_inverse(prediction[:, 1], median_, mad_)
        # The 'prediction' column is the midpoint of the interval.
        midpoint = upper / 2 + lower / 2
        table = np.vstack([lower, upper, y_raw_test, midpoint.T]).T
        columns = [prefix + '_lower', prefix + '_upper',
                   'NetPosUsd', 'prediction']
        return pd.DataFrame(table, columns=columns)

    def save(frame, prefix, level):
        """Create the CSV for the first file, append without header afterwards."""
        filename = (prefix + '_' + 'cudaLSTM' + '_' + level + '_'
                    + 'calibrationwindow' + str(n_calibration) + '.csv')
        if i == 0:
            frame.to_csv(filename, encoding='utf-8', index=False)
        else:
            frame.to_csv(filename, mode='a', header=False, index=False)

    # Fix: the original also inverse-transformed `ytest` twice per iteration,
    # compounding the transform on a value that was never used; it is gone.
    for a in tqdm(np.linspace(5, 95, 19)):
        level = str(np.round(a).astype(int))
        dfncp = interval_frame(icp, a / 100, 'NCP')
        dfcp = interval_frame(icp2, a / 100, 'CP')
        save(dfcp, 'CP', level)
        save(dfncp, 'NCP', level)
def evaluate(model_filepath, train_filepath, test_filepath, calibrate_filepath,
             significance=0.05):
    """Evaluate model to estimate power.

    When ``split.calibrate_split`` in ``params.yaml`` is 0 the stored model
    predicts the test set directly.  Otherwise it is wrapped in an inductive
    conformal regressor that is fitted on the train set and calibrated on the
    calibrate set; the midpoints of the predicted intervals are used as point
    predictions, and the intervals are saved and plotted.  In both cases MSE
    and R2 are printed, plotted and written as JSON to ``METRICS_FILE_PATH``.

    Args:
        model_filepath (str): Path to model.
        train_filepath (str): Path to train set.
        test_filepath (str): Path to test set.
        calibrate_filepath (str): Path to calibrate set.
        significance (float): Significance level passed to the conformal
            predictor. Defaults to 0.05, the value that was hard-coded before.
    """

    def load_xy(filepath):
        """Return the "X" and "y" arrays of an archive and close the file."""
        # assumes the file is an .npz archive (key access on np.load result)
        with np.load(filepath) as archive:
            return archive["X"], archive["y"]

    METRICS_FILE_PATH.parent.mkdir(parents=True, exist_ok=True)

    # Load parameters.  Fix: the file used to be opened three times without
    # ever being closed; it is now read once.
    with open("params.yaml") as params_file:
        all_params = yaml.safe_load(params_file)
    params_split = all_params["split"]

    X_test, y_test = load_xy(test_filepath)

    if params_split["calibrate_split"] == 0:
        model = models.load_model(model_filepath)
        y_pred = model.predict(X_test)
    else:
        trained_model = models.load_model(model_filepath)
        mycustommodel = MyCustomModel(trained_model)

        # The original also built a cnn(...) model here that was only
        # referenced from commented-out code; that unused construction and
        # the parameters read solely for it were removed.
        nc = RegressorNc(
            mycustommodel,
            err_func=AbsErrorErrFunc(),  # non-conformity function
        )
        model = IcpRegressor(nc)

        # Fit on the train set, with targets flattened to one dimension.
        X_train, y_train = load_xy(train_filepath)
        model.fit(X_train, y_train.reshape((y_train.shape[0], )))

        # Calibrate model.
        X_calibrate, y_calibrate = load_xy(calibrate_filepath)
        model.calibrate(X_calibrate,
                        y_calibrate.reshape((y_calibrate.shape[0], )))
        print(f"Calibration: {X_calibrate.shape}")

        # Predictions contain the intervals; their midpoints serve as the
        # actual predictions y.
        predictions = model.predict(X_test, significance=significance)
        lower_bound, upper_bound = predictions[:, 0], predictions[:, 1]
        y_pred = lower_bound + (upper_bound - lower_bound) / 2

        # Reshape to put it in the same format as without calibration set.
        y_pred = y_pred.reshape((y_pred.shape[0], 1))

        # Data frame with ground truth, predictions and interval bounds.
        df_predictions = pd.DataFrame({
            'ground_truth': np.reshape(y_test, (y_test.shape[0], )),
            'predicted': np.reshape(y_pred, (y_pred.shape[0], )),
            'lower_bound': lower_bound,
            'upper_bound': upper_bound,
        })

        save_predictions(df_predictions)
        plot_intervals(df_predictions)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print("MSE: {}".format(mse))
    print("R2: {}".format(r2))

    plot_prediction(y_test, y_pred, inputs=X_test, info="(R2: {})".format(r2))
    plot_individual_predictions(y_test, y_pred)

    with open(METRICS_FILE_PATH, "w") as f:
        json.dump(dict(mse=mse, r2=r2), f)
def run_experiment(cur_test_method, cur_dataset_name, cur_batch_size,
                   cur_lr_loss, cur_lr_dis, cur_loss_steps, cur_dis_steps,
                   cur_mu_val, cur_epochs, cur_model_type,
                   cur_regression_type, cur_random_state, cur_second_scale,
                   num_experiments):
    """Run repeated fairness experiments with group-conditional conformal intervals.

    For each of ``num_experiments`` repetitions (data seed
    ``cur_random_state + i``) the selected learner is fitted, wrapped in an
    ``IcpRegressor`` conditioned on the sign of the first input column, and
    evaluated at significance 0.1.  Per-group coverage and length, the MSE
    and the p-value returned by
    ``utility_functions.fair_dummies_test_regression`` are printed and added
    as one row to ``./results/results.csv``.

    Args:
        cur_test_method: One of "AdversarialDebiasing", "FairDummies", "HGR"
            or "Baseline".
        cur_dataset_name: Dataset name forwarded to ``get_dataset``.
        cur_batch_size: Batch size forwarded to the learner.
        cur_lr_loss: Learning rate forwarded to the learner as ``lr``.
        cur_lr_dis: Only recorded in the results file.
        cur_loss_steps: Inner steps for the loss (where the learner uses them).
        cur_dis_steps: Inner steps for the adversary/discriminator.
        cur_mu_val: Fairness penalty weight forwarded to the learner.
        cur_epochs: Number of epochs forwarded to the learner.
        cur_model_type: Model type forwarded to the learner.
        cur_regression_type: Only "mreg" is supported.
        cur_random_state: Base random seed.
        cur_second_scale: Forwarded as ``second_moment_scaling`` (FairDummies).
        num_experiments: Number of repetitions.

    Raises:
        NotImplementedError: If ``cur_regression_type`` is not "mreg".
        ValueError: If ``cur_test_method`` is not a known method.
    """
    method = cur_test_method
    seed = cur_random_state
    significance = 0.1

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    # Fix: the original used a bare `raise` here, which outside an `except`
    # block surfaces as "RuntimeError: No active exception to re-raise".
    # NotImplementedError subclasses RuntimeError, so callers catching
    # RuntimeError keep working.
    if cur_regression_type != "mreg":
        raise NotImplementedError(
            "unsupported regression type: %r" % (cur_regression_type,))

    # Fix: an unknown method used to fail later with a NameError on the
    # adapter class, after the data had already been loaded.
    known_methods = ("AdversarialDebiasing", "FairDummies", "HGR", "Baseline")
    if method not in known_methods:
        raise ValueError("unknown test method: %r" % (method,))

    # utility loss
    cost_pred = torch.nn.MSELoss()
    out_shape = 1
    metric = "equalized_odds"

    def make_learner(in_shape):
        """Instantiate the learner selected by ``method``."""
        shared = dict(cost_pred=cost_pred,
                      in_shape=in_shape,
                      out_shape=out_shape,
                      batch_size=cur_batch_size,
                      model_type=cur_model_type)
        if method == "AdversarialDebiasing":
            return adv_debiasing.AdvDebiasingRegLearner(
                lr=cur_lr_loss,
                N_CLF_EPOCHS=cur_loss_steps,
                N_ADV_EPOCHS=cur_dis_steps,
                N_EPOCH_COMBINED=cur_epochs,
                lambda_vec=cur_mu_val,
                **shared)
        if method == "FairDummies":
            return fair_dummies_learning.EquiRegLearner(
                lr=cur_lr_loss,
                pretrain_pred_epochs=0,
                pretrain_dis_epochs=0,
                epochs=cur_epochs,
                loss_steps=cur_loss_steps,
                dis_steps=cur_dis_steps,
                lambda_vec=cur_mu_val,
                second_moment_scaling=cur_second_scale,
                **shared)
        if method == "HGR":
            return continuous_fairness.HGR_Reg_Learner(
                lr=cur_lr_loss,
                epochs=cur_epochs,
                mu=cur_mu_val,
                **shared)
        # "Baseline": same learner class as FairDummies, with all epochs
        # given to pretrain_pred_epochs and the fairness-related arguments
        # set to zero.
        return fair_dummies_learning.EquiRegLearner(
            lr=cur_lr_loss,
            pretrain_pred_epochs=cur_epochs,
            pretrain_dis_epochs=0,
            epochs=0,
            loss_steps=0,
            dis_steps=0,
            lambda_vec=0,
            second_moment_scaling=0,
            **shared)

    # function that extracts the group identifier
    def condition(x, y=None):
        return int(x[0][0] > 0)

    print(cur_dataset_name)
    print(method)
    sys.stdout.flush()

    avg_length_0 = np.zeros(num_experiments)
    avg_length_1 = np.zeros(num_experiments)
    avg_coverage_0 = np.zeros(num_experiments)
    avg_coverage_1 = np.zeros(num_experiments)
    avg_p_val = np.zeros(num_experiments)
    mse = np.zeros(num_experiments)

    outdir = './results/'
    out_name = outdir + 'results.csv'
    full_name = (cur_test_method + "_" + cur_model_type + "_"
                 + cur_regression_type)

    for i in range(num_experiments):
        # Split into train, calibration and test
        (X, A, Y, X_cal, A_cal, Y_cal,
         X_test, A_test, Y_test) = get_dataset.get_train_test_data(
            base_path, cur_dataset_name, seed + i)

        in_shape = X.shape[1]

        print("n train = " + str(X.shape[0]) + " p = " + str(X.shape[1]))
        print("n calibration = " + str(X_cal.shape[0]))
        print("n test = " + str(X_test.shape[0]))
        sys.stdout.flush()

        class RegAdapter(RegressorAdapter):
            """Adapter exposing the selected learner through fit/predict."""

            def __init__(self, model=None, fit_params=None, params=None):
                super(RegAdapter, self).__init__(model, fit_params)
                # Instantiate model
                self.learner = make_learner(in_shape)

            def fit(self, x, y):
                self.learner.fit(x, y)

            def predict(self, x):
                return self.learner.predict(x)

        fairness_reg = RegAdapter(model=None)
        nc = RegressorNc(fairness_reg, AbsErrorErrFunc())
        icp = IcpRegressor(nc, condition=condition)

        # The attribute A becomes the first input column, which is the one
        # `condition` inspects.
        input_data_train = np.concatenate((A[:, np.newaxis], X), 1)
        icp.fit(input_data_train, Y)

        input_data_cal = np.concatenate((A_cal[:, np.newaxis], X_cal), 1)
        icp.calibrate(input_data_cal, Y_cal)

        input_data_test = np.concatenate((A_test[:, np.newaxis], X_test), 1)
        Yhat_test = icp.predict(input_data_test, significance=significance)

        # compute average coverage and average length, per group and overall
        coverage_sample, length_sample = compute_coverage_per_sample(
            Y_test, Yhat_test[:, 0], Yhat_test[:, 1], significance, method,
            input_data_test, condition)

        avg_coverage, avg_length = compute_coverage_len(
            Y_test, Yhat_test[:, 0], Yhat_test[:, 1])

        avg_length_0[i] = np.mean(length_sample[0])
        avg_coverage_0[i] = np.mean(coverage_sample[0])
        avg_length_1[i] = np.mean(length_sample[1])
        avg_coverage_1[i] = np.mean(coverage_sample[1])

        Yhat_out_cal = fairness_reg.learner.predict(input_data_cal)
        Yhat_out_test = fairness_reg.learner.predict(input_data_test)

        if out_shape == 1:
            mse[i] = np.mean((Yhat_out_test - Y_test)**2)
            MSE_trivial = np.mean((np.mean(Y_test) - Y_test)**2)
            print("MSE = " + str(mse[i]) + "MSE Trivial = " + str(MSE_trivial))

        p_val = utility_functions.fair_dummies_test_regression(
            Yhat_out_cal, A_cal, Y_cal,
            Yhat_out_test, A_test, Y_test,
            num_reps=1, num_p_val_rep=1000, reg_func_name="Net")
        avg_p_val[i] = p_val

        print("experiment = " + str(i + 1))
        print("Coverage 0 = " + str(avg_coverage_0[i]))
        print("Coverage 1 = " + str(avg_coverage_1[i]))
        print("Length 0 = " + str(avg_length_0[i]))
        print("Length 1 = " + str(avg_length_1[i]))
        print("MSE = " + str(mse[i]))
        print("p_val = " + str(p_val))
        sys.stdout.flush()

        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(outdir, exist_ok=True)

        df = pd.DataFrame({
            'method': [cur_test_method],
            'dataset': [cur_dataset_name],
            'batch_size': [cur_batch_size],
            'lr_loss': [cur_lr_loss],
            'lr_dis': [cur_lr_dis],
            'loss_steps': [cur_loss_steps],
            'dis_steps': [cur_dis_steps],
            'mu_val': [cur_mu_val],
            'epochs': [cur_epochs],
            'random_state': [seed + i],
            'model_type': [cur_model_type],
            'metric': [metric],
            'cur_second_scale': [cur_second_scale],
            'regression_type': [cur_regression_type],
            'avg_length': [avg_length],
            'avg_coverage': [avg_coverage],
            'avg_length_0': [avg_length_0[i]],
            'avg_length_1': [avg_length_1[i]],
            'mse': [mse[i]],
            'avg_coverage_0': [avg_coverage_0[i]],
            'avg_coverage_1': [avg_coverage_1[i]],
            'p_val': [p_val],
            'full_name': [full_name]
        })

        # Rows from earlier runs are kept: the file is re-read and rewritten.
        if os.path.isfile(out_name):
            df2 = pd.read_csv(out_name)
            df = pd.concat([df2, df], ignore_index=True)

        df.to_csv(out_name, index=False)

        print(full_name)
        print(
            "Num experiments %02d | Avg MSE = %.4f | Avg Length 0 = %.4f | Avg Length 1 = %.4f | Avg Coverage 0 = %.4f | Avg Coverage 1 = %.4f | Avg p_val = %.4f | min p_val = %.4f"
            % (i + 1, np.mean(mse[:i + 1]), np.mean(avg_length_0[:i + 1]),
               np.mean(avg_length_1[:i + 1]), np.mean(avg_coverage_0[:i + 1]),
               np.mean(avg_coverage_1[:i + 1]), np.mean(avg_p_val[:i + 1]),
               np.min(avg_p_val[:i + 1])))
        print("======== Done =========")
        sys.stdout.flush()
def build(self):
    '''Build a new RF model with the X and Y numpy matrices.

    A random forest regressor or classifier (chosen by the 'quantitative'
    parameter) is either optimized ('tune') or fitted directly.  When
    'conformal' is set, a shallow copy of it is wrapped in an aggregated
    conformal predictor that replaces self.estimator.

    Returns a tuple (True, results) on success, where results is a list of
    (key, label, value) tuples, or (False, message) on failure.
    '''
    # Work on copies of the data matrices
    X = self.X.copy()
    Y = self.Y.copy()

    results = [
        ('nobj', 'number of objects', self.nobj),
        ('nvarx', 'number of predictor variables', self.nvarx),
        ('model', 'model type', 'RF'),
    ]

    conformal = self.param.getVal('conformal')

    if self.param.getVal('tune'):
        # Grid search over the tune parameters
        LOG.info("Optimizing RF estimator")
        try:
            if self.param.getVal('quantitative'):
                self.estimator = RandomForestRegressor(
                    **self.estimator_parameters)
            else:
                self.estimator = RandomForestClassifier(
                    **self.estimator_parameters)
            self.optimize(X, Y, self.estimator, self.tune_parameters)
        except Exception as e:
            return False, f'Exception optimizing RF estimator with exception {e}'
    else:
        try:
            is_quantitative = self.param.getVal('quantitative')
            if is_quantitative:
                self.estimator = RandomForestRegressor(
                    **self.estimator_parameters)
            else:
                self.estimator = RandomForestClassifier(
                    **self.estimator_parameters)

            # The plain-model message is only logged when no conformal
            # wrapper will be built afterwards.
            if not conformal:
                if is_quantitative:
                    LOG.info("Building Quantitative RF model")
                else:
                    LOG.info("Building Qualitative RF model")

            self.estimator.fit(X, Y)
        except Exception as e:
            return False, f'Exception building RF estimator with exception {e}'

    if not conformal:
        return True, results

    self.estimator_temp = copy(self.estimator)

    # Wrap the estimator copy in a conformal predictor
    try:
        if self.param.getVal('quantitative'):
            # Conformal regressor, normalized by a KNN model whose
            # neighbour count comes from the conformal settings
            settings = self.param.getDict('conformal_settings')
            LOG.info("Building conformal Quantitative RF model")

            base_adapter = RegressorAdapter(self.estimator_temp)
            self.normalizing_model = RegressorAdapter(
                KNeighborsRegressor(n_neighbors=settings['KNN_NN']))
            scaler = RegressorNormalizer(
                base_adapter, copy(self.normalizing_model), AbsErrorErrFunc())
            scorer = RegressorNc(base_adapter, AbsErrorErrFunc(), scaler)

            self.estimator = AggregatedCp(IcpRegressor(scorer),
                                          BootstrapSampler())
        else:
            # Conformal classifier
            LOG.info("Building conformal Qualitative RF model")

            scorer = ClassifierNc(ClassifierAdapter(self.estimator_temp),
                                  MarginErrFunc())
            self.estimator = AggregatedCp(IcpClassifier(scorer),
                                          BootstrapSampler())

        # Fit the conformal estimator to the data
        self.estimator.fit(X, Y)
    except Exception as e:
        return False, f'Exception building conformal RF estimator with exception {e}'

    return True, results

## Overriding of parent methods

# def CF_quantitative_validation(self):
#     ''' performs validation for conformal quantitative models '''

# def CF_qualitative_validation(self):
#     ''' performs validation for conformal qualitative models '''

# def quantitativeValidation(self):
#     ''' performs validation for quantitative models '''

# def qualitativeValidation(self):
#     ''' performs validation for qualitative models '''

# def validate(self):
#     ''' Validates the model and computes suitable model quality scoring values'''

# def optimize(self, X, Y, estimator, tune_parameters):
#     ''' optimizes a model using a grid search over a range of values for diverse parameters'''

# def regularProject(self, Xb, results):
#     ''' projects a collection of query objects in a regular model, for obtaining predictions '''

# def conformalProject(self, Xb, results):
#     ''' projects a collection of query objects in a conformal model, for obtaining predictions '''

# def project(self, Xb, results):
#     ''' Uses the X matrix provided as argument to predict Y'''
def test_cross_validation(self):
    """Smoke-test ``cross_val_score`` with ICP classifiers and regressors.

    Five scenarios are cross-validated (5 iterations x 5 folds, at
    significance levels 0.05, 0.1 and 0.2): iris classification, and four
    diabetes regression set-ups combining the absolute/signed error
    functions with and without a normalization model.  For each scenario
    the per-significance mean of every scoring function is printed.

    Nothing is asserted; the test passes as long as no call raises.
    """

    def report(title, icp_cv, data, scoring_funcs):
        # Shared cross-validation + printing boilerplate for all scenarios.
        scores = cross_val_score(
            icp_cv,
            data.data,
            data.target,
            iterations=5,
            folds=5,
            scoring_funcs=list(scoring_funcs),
            significance_levels=[0.05, 0.1, 0.2],
        )
        print(title)
        scores = scores.drop(["fold", "iter"], axis=1)
        print(scores.groupby(["significance"]).mean())

    def forest_adapter():
        # Every regression scenario uses a fresh 100-tree random forest.
        return RegressorAdapter(RandomForestRegressor(n_estimators=100))

    class_scorers = (class_mean_errors, class_avg_c)
    reg_scorers = (reg_mean_errors, reg_median_size)

    # -------------------------------------------------------------------------
    # Classification
    # -------------------------------------------------------------------------
    data = load_iris()
    icp = IcpClassifier(
        ClassifierNc(
            ClassifierAdapter(RandomForestClassifier(n_estimators=100)),
            MarginErrFunc()))
    report("Classification: iris", ClassIcpCvHelper(icp), data, class_scorers)

    # -------------------------------------------------------------------------
    # Regression, absolute error
    # -------------------------------------------------------------------------
    data = load_diabetes()
    icp = IcpRegressor(RegressorNc(forest_adapter(), AbsErrorErrFunc()))
    report("Absolute error regression: diabetes",
           RegIcpCvHelper(icp), data, reg_scorers)

    # -------------------------------------------------------------------------
    # Regression, normalized absolute error
    # -------------------------------------------------------------------------
    data = load_diabetes()
    underlying_model = forest_adapter()
    normalizer_model = forest_adapter()
    normalizer = RegressorNormalizer(underlying_model,
                                     normalizer_model,
                                     AbsErrorErrFunc())
    nc = RegressorNc(underlying_model, AbsErrorErrFunc(), normalizer)
    icp = IcpRegressor(nc)
    report("Normalized absolute error regression: diabetes",
           RegIcpCvHelper(icp), data, reg_scorers)

    # -------------------------------------------------------------------------
    # Regression, signed error
    # -------------------------------------------------------------------------
    data = load_diabetes()
    icp = IcpRegressor(RegressorNc(forest_adapter(), SignErrorErrFunc()))
    report("Signed error regression: diabetes",
           RegIcpCvHelper(icp), data, reg_scorers)

    # -------------------------------------------------------------------------
    # Regression, normalized signed error
    # -------------------------------------------------------------------------
    data = load_diabetes()
    underlying_model = forest_adapter()
    normalizer_model = forest_adapter()
    # The normalization model can use a different error function than is
    # used to measure errors on the underlying model
    normalizer = RegressorNormalizer(underlying_model,
                                     normalizer_model,
                                     AbsErrorErrFunc())
    nc = RegressorNc(underlying_model, SignErrorErrFunc(), normalizer)
    icp = IcpRegressor(nc)
    report("Normalized signed error regression: diabetes",
           RegIcpCvHelper(icp), data, reg_scorers)
def train_and_test_cp_algo(parameters):
    """Fit, calibrate and evaluate an inductive conformal regressor per data file.

    For each of the 29 files ``data/EURUSD_NETPOSUSD_hourly_for_regresion<i>.csv``
    the frame is z-score normalised, split into train / calibration / test
    parts (the last 120 rows are always the test part), an ``IcpRegressor``
    is fitted and calibrated, and the de-normalised prediction interval,
    truth and interval midpoint are written to one CSV file (created for the
    first data file, appended to afterwards).

    Parameters
    ----------
    parameters : dict
        Must contain the control keys ``'algorithm'``,
        ``'randomized_calibration'``, ``'alpha_'`` (significance passed to
        ``icp.predict``), ``'calibration_size'`` and ``'WhichCP'``
        (``'NCP'`` selects the normalized variant, anything else the plain
        one).  Every remaining key is forwarded to the chosen algorithm's
        constructor.  The dict itself is not modified.

    Raises
    ------
    KeyError
        If one of the control keys is missing.
    ValueError
        If ``parameters['algorithm']`` is not one of the supported names.
    """
    import os  # local import: only needed to build a portable data path

    n_files = 29
    test_size = 120

    # Strip the control keys; what remains are the model hyper-parameters.
    p = parameters.copy()
    for key in ('algorithm', 'randomized_calibration', 'alpha_',
                'calibration_size', 'WhichCP'):
        p.pop(key)

    # Lazy factories: a fresh, unfitted model is built for every data file.
    factories = {
        'RandomForest': lambda: RandomForestRegressor(**p),
        'K-NearestNeighbours': lambda: KNeighborsRegressor(**p),
        'LightGBM': lambda: LGBMRegressor(**p),
        'LassoRegression': lambda: Lasso(**p),
        'NeuralNetwork': lambda: NeuralNetworkAlgorithm(p),
        'LSTM': lambda: BiLSTM(**p),
        'GradientBoosting': lambda: GradientBoostingRegressor(**p),
    }
    algorithm_name = parameters.get('algorithm')
    if algorithm_name not in factories:
        raise ValueError(
            'Unknown algorithm {!r}; expected one of {}'.format(
                algorithm_name, sorted(factories)))

    alpha = parameters.get('alpha_')
    calibration_size = parameters.get('calibration_size')
    which_cp = parameters.get('WhichCP')
    # Presumably a bool; any truthy value enables random calibration rows.
    randomized = parameters.get('randomized_calibration')

    out_path = (which_cp + '_' + algorithm_name + '_'
                + str(np.round(alpha * 100).astype(int)) + '_'
                + 'calibrationwindow' + str(calibration_size) + '.csv')

    for i in tqdm(range(n_files)):
        algorithm = factories[algorithm_name]()

        path = os.path.join(
            'data', 'EURUSD_NETPOSUSD_hourly_for_regresion' + str(i) + '.csv')
        df = pd.read_csv(path).drop(['Unnamed: 0', 'QdfTime'], axis=1).fillna(0)

        # Keep the target's raw mean/std to undo the normalisation later.
        m, s = df['NetPosUsd'].mean(), df['NetPosUsd'].std()
        df = (df - df.mean(axis=0)) / df.std(axis=0)
        features = df.drop(['NetPosUsd'], axis=1)
        target = df['NetPosUsd']

        if randomized:
            # Calibration rows are drawn at random from everything before
            # the test window; the rest of that pool is the training set.
            split = len(df) - test_size
            pool_x = features.iloc[:split, :].values
            pool_y = target.iloc[:split].values
            choose = np.random.choice(len(pool_x), calibration_size,
                                      replace=False)
            mask = np.ones(len(pool_x), dtype=bool)
            mask[choose] = False
            train, ytrain = pool_x[mask, :], pool_y[mask]
            calibrate, ycalibrate = pool_x[choose, :], pool_y[choose]
            test = features.iloc[split:, :].values
            ytest = target.iloc[split:].values
        else:
            # Chronological split: train | calibration window | test window.
            split = len(df) - test_size - calibration_size
            train = features.iloc[:split, :].values
            ytrain = target.iloc[:split].values
            calibrate = features.iloc[split:split + calibration_size, :].values
            ycalibrate = target.iloc[split:split + calibration_size].values
            test = features.iloc[-test_size:, :].values
            ytest = target.iloc[-test_size:].values

        underlying_model = RegressorAdapter(algorithm)
        if which_cp == 'NCP':
            normalizing_model = RegressorAdapter(
                KNeighborsRegressor(n_neighbors=50))
            normalizer = RegressorNormalizer(underlying_model,
                                             normalizing_model,
                                             AbsErrorErrFunc())
            nc = RegressorNc(underlying_model, AbsErrorErrFunc(), normalizer)
            prefix = 'NCP'
        else:
            nc = RegressorNc(underlying_model, AbsErrorErrFunc())
            prefix = 'CP'

        icp = IcpRegressor(nc)
        icp.fit(train, ytrain)
        icp.calibrate(calibrate, ycalibrate)

        prediction = icp.predict(test, significance=alpha)
        # Midpoint of the interval, reported in the 'prediction' column.
        midpoint = prediction[:, 1] / 2 + prediction[:, 0] / 2

        # Back to the original scale of NetPosUsd.
        prediction = prediction * s + m
        ytest = ytest * s + m
        midpoint = midpoint * s + m

        header = [prefix + '_lower', prefix + '_upper', 'NetPosUsd',
                  'prediction']
        table = np.vstack([prediction.T, ytest, midpoint]).T
        dfncp = pd.DataFrame(table, columns=header)

        if i == 0:
            # First file creates the CSV (with header) ...
            dfncp.to_csv(out_path, encoding='utf-8', index=False)
        else:
            # ... later files append their rows.
            dfncp.to_csv(out_path, mode='a', header=False, index=False)
def cv(df, parameters):
    """Score one hyper-parameter set with three expanding-window folds.

    The last 120 rows of ``df`` are left out.  On the remaining ``end`` rows,
    for each ratio in (0.5, 0.66, 0.84) the model is evaluated on the
    ``int(end / 6)`` rows starting at ``int(end * ratio)``, using both a
    normalized (NCP) and a plain (CP) inductive conformal regressor.  The
    mean ``qd_objective`` of each variant over the three folds is appended,
    together with the hyper-parameters, as one row to
    ``<algorithm>_cv.csv``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'QdfTime'`` (dropped) and ``'NetPosUsd'``
        (the target); all other columns are used as features.
    parameters : dict
        Must contain ``'algorithm'``, ``'randomized_calibration'`` and
        ``'alpha_'``; every other key is forwarded to the chosen algorithm's
        constructor.  The dict itself is not modified.

    Raises
    ------
    KeyError
        If one of the three control keys is missing.
    ValueError
        If ``parameters['algorithm']`` is not one of the supported names.
    """
    p = parameters.copy()
    for key in ('algorithm', 'randomized_calibration', 'alpha_'):
        p.pop(key)

    factories = {
        'RandomForest': lambda: RandomForestRegressor(**p),
        'K-NearestNeighbours': lambda: KNeighborsRegressor(**p),
        'LightGBM': lambda: LGBMRegressor(**p),
        'LassoRegression': lambda: Lasso(**p),
        'NeuralNetwork': lambda: NeuralNetworkAlgorithm(p),
        'LSTM': lambda: BiLSTM(**p),
    }
    algorithm_name = parameters.get('algorithm')
    if algorithm_name not in factories:
        raise ValueError(
            'Unknown algorithm {!r}; expected one of {}'.format(
                algorithm_name, sorted(factories)))
    algorithm = factories[algorithm_name]()

    alpha = parameters.get('alpha_')
    # Presumably a bool; any truthy value enables random calibration rows.
    randomized = parameters.get('randomized_calibration')

    # Row written to the CSV: the hyper-parameters plus alpha_ and the losses.
    # NOTE(review): `d` deliberately aliases `p`, as the original code did;
    # NeuralNetworkAlgorithm received this same dict object - confirm that it
    # does not read it after construction before switching to a copy.
    # NOTE(review): assumes this assignment applies to every algorithm, not
    # only the LSTM branch - TODO confirm against the original file.
    d = p
    d['alpha_'] = alpha

    end = len(df) - 120
    window = int(end / 6)

    # Keep the target's raw mean/std to undo the normalisation later.
    m, s = df['NetPosUsd'].mean(), df['NetPosUsd'].std()
    df = df.drop(['QdfTime'], axis=1)
    df = (df - df.mean(axis=0)) / df.std(axis=0)
    features = df.drop(['NetPosUsd'], axis=1)
    target = df['NetPosUsd']

    cp_losses = np.zeros(3)
    ncp_losses = np.zeros(3)

    for i, ratio in enumerate([.5, 0.66, .84]):
        split = int(end * ratio)
        test = features.iloc[split:split + window, :].values
        ytest = target.iloc[split:split + window].values

        if randomized:
            # Calibration rows are drawn at random from everything before
            # the test window; the rest of that pool is the training set.
            pool_x = features.iloc[:split, :].values
            pool_y = target.iloc[:split].values
            choose = np.random.choice(len(pool_x), window, replace=False)
            mask = np.ones(len(pool_x), dtype=bool)
            mask[choose] = False
            train, ytrain = pool_x[mask, :], pool_y[mask]
            calibrate, ycalibrate = pool_x[choose, :], pool_y[choose]
        else:
            # Chronological split: train | calibration window | test window.
            train = features.iloc[:split - window, :].values
            ytrain = target.iloc[:split - window].values
            calibrate = features.iloc[split - window:split, :].values
            ycalibrate = target.iloc[split - window:split].values

        # ---------------------------------------------------------------------
        # Normalized conformal predictor (NCP)
        # ---------------------------------------------------------------------
        underlying_model = RegressorAdapter(algorithm)
        normalizing_model = RegressorAdapter(
            KNeighborsRegressor(n_neighbors=50))
        normalizer = RegressorNormalizer(underlying_model,
                                         normalizing_model,
                                         AbsErrorErrFunc())
        nc = RegressorNc(underlying_model, AbsErrorErrFunc(), normalizer)
        icp = IcpRegressor(nc)
        icp.fit(train, ytrain)
        icp.calibrate(calibrate, ycalibrate)

        prediction = icp.predict(test, significance=alpha)
        # Midpoint of the interval, reported in the 'prediction' column.
        midpoint = prediction[:, 1] / 2 + prediction[:, 0] / 2
        prediction = prediction * s + m
        ytest = ytest * s + m
        midpoint = midpoint * s + m
        header = ['NCP_lower', 'NCP_upper', 'NetPosUsd', 'prediction']
        table = np.vstack([prediction.T, ytest, midpoint]).T
        dfncp = pd.DataFrame(table, columns=header)

        # ---------------------------------------------------------------------
        # Plain conformal predictor (CP), same underlying algorithm refitted
        # ---------------------------------------------------------------------
        underlying_model = RegressorAdapter(algorithm)
        nc = RegressorNc(underlying_model, AbsErrorErrFunc())
        icp = IcpRegressor(nc)
        icp.fit(train, ytrain)
        icp.calibrate(calibrate, ycalibrate)

        prediction = icp.predict(test, significance=alpha) * s + m
        dfncp['CP_lower'] = prediction[:, 0]
        dfncp['CP_upper'] = prediction[:, 1]

        cp_losses[i] = qd_objective(dfncp.NetPosUsd, dfncp['CP_lower'],
                                    dfncp['CP_upper'], alpha)
        ncp_losses[i] = qd_objective(dfncp.NetPosUsd, dfncp['NCP_lower'],
                                     dfncp['NCP_upper'], alpha)

    d['CP_loss'] = np.mean(cp_losses)
    d['NCP_loss'] = np.mean(ncp_losses)

    out_path = algorithm_name + '_cv.csv'
    row = pd.DataFrame(data=d, index=[0])
    if os.path.exists(out_path):
        # Results file already exists: append without repeating the header.
        row.to_csv(out_path, mode='a', header=False, index=False)
    else:
        row.to_csv(out_path, encoding='utf-8', index=False)
def build(self):
    '''Build a new RF model with the X and Y numpy matrices.

    Depending on ``self.tune`` the estimator is either obtained through
    ``self.optimize`` or instantiated from ``self.estimator_parameters``
    (a regressor when ``self.quantitative`` is set, a classifier otherwise).
    When ``self.conformal`` is set, an aggregated conformal predictor is
    additionally built around the estimator and fitted, and stored in
    ``self.conformal_pred``.  Finally ``self.estimator`` is fitted on copies
    of ``self.X`` / ``self.Y``.

    Returns:
        ``False`` if ``self.failed`` is set; otherwise ``(True, results)``
        where ``results`` is a list of ``(key, label, value)`` tuples
        describing the model.
    '''
    if self.failed:
        # NOTE(review): this is a bare bool while the success path returns a
        # tuple; left unchanged because callers may rely on it - confirm.
        return False

    X = self.X.copy()
    Y = self.Y.copy()

    results = [
        ('nobj', 'number of objects', self.nobj),
        ('nvarx', 'number of predictor variables', self.nvarx),
    ]

    if self.cv:
        self.cv = getCrossVal(self.cv,
                              self.estimator_parameters["random_state"],
                              self.n, self.p)

    if self.tune:
        if self.quantitative:
            self.optimize(X, Y, RandomForestRegressor(),
                          self.tune_parameters)
            results.append(
                ('model', 'model type', 'RF quantitative (optimized)'))
        else:
            self.optimize(X, Y, RandomForestClassifier(),
                          self.tune_parameters)
            results.append(
                ('model', 'model type', 'RF qualitative (optimized)'))
    else:
        if self.quantitative:
            log.info("Building Quantitative RF model")
            # 'class_weight' is dropped before building the regressor.
            self.estimator_parameters.pop('class_weight', None)
            self.estimator = RandomForestRegressor(
                **self.estimator_parameters)
            results.append(('model', 'model type', 'RF quantitative'))
        else:
            log.info("Building Qualitative RF model")
            self.estimator = RandomForestClassifier(
                **self.estimator_parameters)
            results.append(('model', 'model type', 'RF qualitative'))

    if self.conformal:
        if self.quantitative:
            # The same estimator object serves as both the underlying and
            # the normalizing model.
            underlying_model = RegressorAdapter(self.estimator)
            normalizing_model = RegressorAdapter(self.estimator)
            normalizer = RegressorNormalizer(underlying_model,
                                             normalizing_model,
                                             AbsErrorErrFunc())
            nc = RegressorNc(underlying_model, AbsErrorErrFunc(), normalizer)

            self.conformal_pred = AggregatedCp(IcpRegressor(nc),
                                               BootstrapSampler())
            self.conformal_pred.fit(X, Y)
            # overrides non-conformal
            results.append(
                ('model', 'model type', 'conformal RF quantitative'))
        else:
            self.conformal_pred = AggregatedCp(
                IcpClassifier(
                    ClassifierNc(ClassifierAdapter(self.estimator),
                                 MarginErrFunc())),
                BootstrapSampler())
            self.conformal_pred.fit(X, Y)
            # overrides non-conformal
            results.append(
                ('model', 'model type', 'conformal RF qualitative'))

    self.estimator.fit(X, Y)

    return True, results

#### Overriding of parent methods

# def CF_quantitative_validation(self):
#     ''' performs validation for conformal quantitative models '''

# def CF_qualitative_validation(self):
#     ''' performs validation for conformal qualitative models '''

# def quantitativeValidation(self):
#     ''' performs validation for quantitative models '''

# def qualitativeValidation(self):
#     ''' performs validation for qualitative models '''

# def validate(self):
#     ''' Validates the model and computes suitable model quality scoring values'''

# def optimize(self, X, Y, estimator, tune_parameters):
#     ''' optimizes a model using a grid search over a range of values for diverse parameters'''

# def regularProject(self, Xb, results):
#     ''' projects a collection of query objects in a regular model, for obtaining predictions '''

# def conformalProject(self, Xb, results):
#     ''' projects a collection of query objects in a conformal model, for obtaining predictions '''

# def project(self, Xb, results):
#     ''' Uses the X matrix provided as argument to predict Y'''
def fit(self, rows_treat, labels_treat, rows_control, labels_control):
    """Fit the internal estimator, build the root node and grow the tree.

    Both groups are split 50/50 into a training half and a validation
    half.  The estimator is fitted on the pooled training halves, conformal
    regressors are fitted and calibrated per group, and the root node's
    statistics are evaluated on the pooled validation halves.  The tree is
    then grown with ``self.fit_r`` and stored in ``self.root``.

    Parameters
    ----------
    rows_treat, labels_treat : array-like
        Covariates and outcomes of the treated samples.
    rows_control, labels_control : array-like
        Covariates and outcomes of the control samples.

    Returns
    -------
    ``self.Node()`` when ``rows_treat`` has no rows; otherwise ``None``
    (the result is stored in ``self.root``, ``self.obj`` and
    ``self.curr_leaves``).

    Raises
    ------
    RuntimeError
        If the internal estimator still reports a fitting error after three
        attempts.
    """
    if rows_treat.shape[0] == 0:
        return self.Node()

    if self.seed is not None:
        np.random.seed(self.seed)

    # split for conformal regression
    train_rows_treat, val_rows_treat, train_outcome_treat, val_labels_treat = \
        train_test_split(rows_treat, labels_treat, shuffle=True, test_size=0.5)
    train_rows_control, val_rows_control, train_outcome_control, val_labels_control = \
        train_test_split(rows_control, labels_control, shuffle=True, test_size=0.5)

    # Pooled training data; w_train marks treated rows with 1, controls with 0.
    x_train = np.concatenate([train_rows_treat, train_rows_control])
    y_train = np.concatenate([train_outcome_treat, train_outcome_control])
    w_train = np.zeros(x_train.shape[0])
    w_train[0:train_rows_treat.shape[0]] = 1

    # check estimator internal error: model.fit returns a truthy flag when
    # the fit has to be retried.
    max_fit_attempts = 3
    for _ in range(max_fit_attempts):
        if not self.estimator_treat.model.fit(x_train, y_train, w_train):
            break
    else:
        # error occur request new datasets
        raise RuntimeError('Too many errors occur in internal estimator.')

    # do conformal prediction
    total_val_no_treat = val_rows_treat.shape[0]
    total_val_no_control = val_rows_control.shape[0]

    # NOTE(review): only the "SCR" mode is handled; with any other mode the
    # ICP objects used below are never created.  The extent of this guard is
    # an assumption - TODO confirm against the original file.
    if self.conformal_mode == "SCR":
        nc_treat = RegressorNc_r2p(self.estimator_treat, AbsErrorErrFunc())
        nc_control = RegressorNc_r2p(self.estimator_control, AbsErrorErrFunc())

        icp_treat = IcpRegressor_r2p(nc_treat)
        icp_treat.fit(
            train_rows_treat,
            train_outcome_treat.reshape((train_outcome_treat.shape[0], 1)))
        icp_treat.calibrate(val_rows_treat, val_labels_treat)
        cal_scores_treat = icp_treat.cal_scores

        icp_control = IcpRegressor_r2p(nc_control)
        icp_control.fit(
            train_rows_control,
            train_outcome_control.reshape((train_outcome_control.shape[0], 1)))
        icp_control.calibrate(val_rows_control, val_labels_control)
        cal_scores_control = icp_control.cal_scores

    # Potential-outcome estimates (name pattern: est_<group>_<arm>) and the
    # resulting treatment-effect estimates on both validation halves.
    val_est_treat_treat = self.estimator_treat.predict(val_rows_treat)
    val_est_treat_control = self.estimator_control.predict(val_rows_treat)
    val_est_treat_CATE = val_est_treat_treat - val_est_treat_control

    val_est_control_treat = self.estimator_treat.predict(val_rows_control)
    val_est_control_control = self.estimator_control.predict(val_rows_control)
    val_est_control_CATE = val_est_control_treat - val_est_control_control

    val_est = np.concatenate([val_est_treat_CATE, val_est_control_CATE])
    est_mean = float(np.mean(val_est))

    # calculate partition measure
    val_rows = np.concatenate([val_rows_treat, val_rows_control])
    val_rows_est_treat = np.concatenate(
        [val_est_treat_treat, val_est_control_treat])
    val_rows_est_control = np.concatenate(
        [val_est_treat_control, val_est_control_control])

    # Intervals at the reporting significance level ...
    intv_treat = icp_treat.predict(val_rows,
                                   significance=self.significance,
                                   est_input=val_rows_est_treat)
    intv_control = icp_control.predict(val_rows,
                                       significance=self.significance,
                                       est_input=val_rows_est_control)
    intv = self.get_TE_CI(intv_treat, intv_control)
    intv_len = np.mean(intv[:, 1] - intv[:, 0])

    # ... and at the significance level used for choosing splits.
    intv_treat_split = icp_treat.predict(val_rows,
                                         significance=self.sig_for_split,
                                         est_input=val_rows_est_treat)
    intv_control_split = icp_control.predict(
        val_rows,
        significance=self.sig_for_split,
        est_input=val_rows_est_control)
    intv_split = self.get_TE_CI(intv_treat_split, intv_control_split)

    obj, intv_measure, homogeneity, obj_real = \
        self.eval_func(intv, intv_split, est_mean,
                       total_val_no_treat, total_val_no_control)

    # Re-seed before growing the tree.
    if self.seed is not None:
        np.random.seed(self.seed)

    self.obj = obj
    self.curr_leaves = 1

    self.root = self.Node(col=-1,
                          value=None,
                          obj=obj,
                          homogeneity=homogeneity,
                          intv_len=intv_len,
                          est_treat_treat=val_est_treat_treat,
                          est_treat_control=val_est_treat_control,
                          est_control_treat=val_est_control_treat,
                          est_control_control=val_est_control_control,
                          conf_pred_treat=icp_treat,
                          confl_pred_control=icp_control,
                          cal_scores_treat=cal_scores_treat,
                          cal_scores_control=cal_scores_control,
                          node_depth=0)

    self.root = self.fit_r(rows_treat,
                           labels_treat,
                           rows_control,
                           labels_control,
                           curr_depth=0,
                           node=self.root,
                           val_rows_treat=val_rows_treat,
                           val_labels_treat=val_labels_treat,
                           val_rows_control=val_rows_control,
                           val_labels_control=val_labels_control,
                           total_val_no_treat=total_val_no_treat,
                           total_val_no_control=total_val_no_control)
from nonconformist.base import RegressorAdapter
from nonconformist.icp import IcpRegressor
from nonconformist.nc import RegressorNc, AbsErrorErrFunc

# Example: inductive conformal regression with a decision tree on the Boston
# housing data, printing a 90% prediction interval next to the true value.
# NOTE(review): load_boston was removed in scikit-learn 1.2; this script
# needs an older scikit-learn or a replacement dataset.

# -----------------------------------------------------------------------------
# Setup training, calibration and test indices
# -----------------------------------------------------------------------------
data = load_boston()

# Random three-way split: first third trains, second calibrates, rest tests.
idx = np.random.permutation(data.target.size)
train = idx[:int(idx.size / 3)]
calibrate = idx[int(idx.size / 3):int(2 * idx.size / 3)]
test = idx[int(2 * idx.size / 3):]

# -----------------------------------------------------------------------------
# Train and calibrate
# -----------------------------------------------------------------------------
icp = IcpRegressor(
    RegressorNc(RegressorAdapter(DecisionTreeRegressor()),
                AbsErrorErrFunc()))
icp.fit(data.data[train, :], data.target[train])
icp.calibrate(data.data[calibrate, :], data.target[calibrate])

# -----------------------------------------------------------------------------
# Predict
# -----------------------------------------------------------------------------
prediction = icp.predict(data.data[test, :], significance=0.1)
header = ['min', 'max', 'Truth']
table = np.vstack([prediction.T, data.target[test]]).T
# Use the header as column labels so the values stay numeric (stacking the
# header into the array would coerce every number to a string).
df = pd.DataFrame(table, columns=header)
print(df)
def build(self):
    """Build a PLSR model, optionally tuned and optionally conformal.

    When the ``tune`` parameter is set, the estimator is optimized either
    with scikit-learn's grid search (``optimize == 'auto'``) or with the
    manual method (``optimize == 'manual'``); otherwise a ``PLS_r`` is
    instantiated from ``self.estimator_parameters``.  The estimator is then
    fitted on copies of ``self.X`` / ``self.Y``.  When the ``conformal``
    parameter is set, ``self.estimator`` is replaced by an aggregated
    conformal regressor built around a copy of the fitted estimator, which
    is fitted on the same data.

    Note that the ``'optimize'`` key is removed from
    ``self.estimator_parameters`` as a side effect.

    Returns:
        ``(True, results)`` on success, where ``results`` is a list of
        ``(key, label, value)`` tuples describing the model, or
        ``(False, message)`` when a handled step fails.
    """
    # Make a copy of data matrices
    X = self.X.copy()
    Y = self.Y.copy()

    results = [
        ('nobj', 'number of objects', self.nobj),
        ('nvarx', 'number of predictor variables', self.nvarx),
    ]

    if self.param.getVal('tune'):
        # Optimize estimator using sklearn-gridsearch
        if self.estimator_parameters['optimize'] == 'auto':
            try:
                LOG.info('Optimizing PLSR using SK-LearnGridSearch')

                # Remove optimize key from parameter dictionary
                # to avoid sklearn estimator error (unexpected keyword)
                self.estimator_parameters.pop("optimize")

                super(PLSR, self).optimize(
                    X, Y,
                    PLS_r(**self.estimator_parameters),
                    self.param.getDict('PLSR_optimize'))
            except Exception as e:
                message = (f'Error performing SK-LearnGridSearch'
                           f' on PLSR estimator with exception {e}')
                LOG.error(message)
                return False, message

        # Optimize using flame implementation (recommended)
        elif self.estimator_parameters['optimize'] == 'manual':
            LOG.info('Optimizing PLSR using manual method')

            # Remove optimize key from parameter dictionary
            # to avoid sklearn estimator error (unexpected keyword)
            self.estimator_parameters.pop("optimize")

            success, message = self.optimize(
                X, Y,
                PLS_r(**self.estimator_parameters),
                self.param.getDict('PLSR_optimize'))
            if not success:
                return False, message

        else:
            LOG.error('Type of tune not recognized, check the input')
            return False, 'Type of tune not recognized, check the input'

        results.append(('model', 'model type',
                        'PLSR quantitative (optimized)'))

    else:
        LOG.info('Building Quantitative PLSR with no optimization')
        try:
            # Remove optimize key from parameters to avoid error
            # as the sklearn estimator does not have this key
            self.estimator_parameters.pop("optimize")
            self.estimator = PLS_r(**self.estimator_parameters)
        except Exception as e:
            message = f'Error at PLS_r instantiation with exception {e}'
            LOG.error(message)
            return False, message

        results.append(('model', 'model type', 'PLSR quantitative'))

    # Fit estimator to the data
    self.estimator.fit(X, Y)

    if not self.param.getVal('conformal'):
        return True, results

    # The conformal wrapper is built around a copy of the fitted estimator.
    self.estimator_temp = copy(self.estimator)
    try:
        LOG.info('Building PLSR aggregated conformal predictor')

        # The same copy serves as both the underlying and the normalizing
        # model.
        underlying_model = RegressorAdapter(self.estimator_temp)
        normalizing_model = RegressorAdapter(self.estimator_temp)
        normalizer = RegressorNormalizer(underlying_model,
                                         normalizing_model,
                                         AbsErrorErrFunc())
        nc = RegressorNc(underlying_model, AbsErrorErrFunc(), normalizer)
        self.estimator = AggregatedCp(IcpRegressor(nc), BootstrapSampler())
    except Exception as e:
        message = (f'Error building aggregated PLSR conformal'
                   f' regressor with exception: {e}')
        LOG.error(message)
        return False, message

    # Fit conformal estimator to the data
    self.estimator.fit(X, Y)

    # overrides non-conformal
    results.append(('model', 'model type', 'conformal PLSR quantitative'))

    return True, results
def run(self):
    """
    Runs the model on the validation set in order to fit a probabilistic
    model that will evaluate the accuracy of future predictions.

    Steps, in order:

    1. Predict on the validation and test sets and flag output columns
       whose validation score is no better than guessing.
    2. Re-predict with each ignorable input column left out and derive
       ``lmd['column_importances']`` from the accuracy drop.
    3. Record train / test / validation accuracy per output column.
    4. Fit a ``ProbabilisticValidator`` per output column and store its
       statistics (``accuracy_histogram``, ``confusion_matrices``,
       ``accuracy_samples``, ``validation_set_accuracy``) in ``lmd`` and the
       pickled validator in ``hmd['probabilistic_validators']``.
    5. Fit and calibrate an inductive conformal predictor per eligible
       target, stored in ``hmd['icp']``.
    """
    np.seterr(divide='warn', invalid='warn')

    output_columns = self.transaction.lmd['predict_columns']
    input_columns = [
        col for col in self.transaction.lmd['columns']
        if col not in output_columns
        and col not in self.transaction.lmd['columns_to_ignore']
    ]

    # Make predictions on the validation dataset normally and with various
    # columns missing
    normal_predictions = self.transaction.model_backend.predict('validate')
    normal_predictions_test = self.transaction.model_backend.predict('test')

    normal_accuracy = evaluate_accuracy(
        normal_predictions,
        self.transaction.input_data.validation_df,
        self.transaction.lmd['stats_v2'],
        output_columns,
        backend=self.transaction.model_backend)

    for col in output_columns:
        if self.transaction.lmd['tss']['is_timeseries']:
            # Element-wise pandas comparison: `== True` builds a boolean
            # mask here and must not be replaced by a truthiness test.
            reals = list(self.transaction.input_data.validation_df[
                self.transaction.input_data.
                validation_df['make_predictions'] == True][col])
        else:
            reals = self.transaction.input_data.validation_df[col]

        preds = normal_predictions[col]

        fails = False

        data_type = self.transaction.lmd['stats_v2'][col]['typing'][
            'data_type']
        data_subtype = self.transaction.lmd['stats_v2'][col]['typing'][
            'data_subtype']

        if data_type == DATA_TYPES.CATEGORICAL:
            if data_subtype == DATA_SUBTYPES.TAGS:
                encoder = self.transaction.model_backend.predictor._mixer.encoders[
                    col]
                if balanced_accuracy_score(
                        encoder.encode(reals).argmax(axis=1),
                        encoder.encode(preds).argmax(axis=1)
                ) <= self.transaction.lmd['stats_v2'][col][
                        'balanced_guess_probability']:
                    fails = True
            else:
                if balanced_accuracy_score(
                        reals, preds) <= self.transaction.lmd['stats_v2'][
                            col]['balanced_guess_probability']:
                    fails = True
        elif data_type == DATA_TYPES.NUMERIC:
            if r2_score(reals, preds) < 0:
                fails = True
        else:
            pass

        if fails:
            if not self.transaction.lmd['force_predict']:
                # Replace the session's predict with one that always raises.

                def predict_wrapper(*args, **kwargs):
                    raise Exception('Failed to train model')

                self.session.predict = predict_wrapper
            log.error('Failed to train model to predict {}'.format(col))

    empty_input_predictions = {}
    empty_input_accuracy = {}
    empty_input_predictions_test = {}

    # Input columns that may be left out: everything except file paths and,
    # for time series, the order-by columns.
    ignorable_input_columns = [
        x for x in input_columns
        if self.transaction.lmd['stats_v2'][x]['typing']['data_type'] !=
        DATA_TYPES.FILE_PATH and (
            not self.transaction.lmd['tss']['is_timeseries']
            or x not in self.transaction.lmd['tss']['order_by'])
    ]

    for col in ignorable_input_columns:
        empty_input_predictions[
            col] = self.transaction.model_backend.predict(
                'validate', ignore_columns=[col])
        empty_input_predictions_test[
            col] = self.transaction.model_backend.predict(
                'test', ignore_columns=[col])
        empty_input_accuracy[col] = evaluate_accuracy(
            empty_input_predictions[col],
            self.transaction.input_data.validation_df,
            self.transaction.lmd['stats_v2'],
            output_columns,
            backend=self.transaction.model_backend)

    # Get some information about the importance of each column
    self.transaction.lmd['column_importances'] = {}
    for col in ignorable_input_columns:
        accuracy_increase = (normal_accuracy - empty_input_accuracy[col])
        # normalize from 0 to 10
        self.transaction.lmd['column_importances'][col] = 10 * max(
            0, accuracy_increase)

    # Run Probabilistic Validator
    overall_accuracy_arr = []
    self.transaction.lmd['accuracy_histogram'] = {}
    self.transaction.lmd['confusion_matrices'] = {}
    self.transaction.lmd['accuracy_samples'] = {}
    self.transaction.hmd['probabilistic_validators'] = {}

    self.transaction.lmd['train_data_accuracy'] = {}
    self.transaction.lmd['test_data_accuracy'] = {}
    self.transaction.lmd['valid_data_accuracy'] = {}

    for col in output_columns:
        # Training data accuracy
        predictions = self.transaction.model_backend.predict(
            'predict_on_train_data',
            ignore_columns=self.transaction.lmd['stats_v2']
            ['columns_to_ignore'])
        self.transaction.lmd['train_data_accuracy'][
            col] = evaluate_accuracy(
                predictions,
                self.transaction.input_data.train_df,
                self.transaction.lmd['stats_v2'], [col],
                backend=self.transaction.model_backend)

        # Testing data accuracy
        predictions = self.transaction.model_backend.predict(
            'test',
            ignore_columns=self.transaction.lmd['stats_v2']
            ['columns_to_ignore'])
        self.transaction.lmd['test_data_accuracy'][
            col] = evaluate_accuracy(
                predictions,
                self.transaction.input_data.test_df,
                self.transaction.lmd['stats_v2'], [col],
                backend=self.transaction.model_backend)

        # Validation data accuracy
        predictions = self.transaction.model_backend.predict(
            'validate',
            ignore_columns=self.transaction.lmd['stats_v2']
            ['columns_to_ignore'])
        self.transaction.lmd['valid_data_accuracy'][
            col] = evaluate_accuracy(
                predictions,
                self.transaction.input_data.validation_df,
                self.transaction.lmd['stats_v2'], [col],
                backend=self.transaction.model_backend)

    for col in output_columns:
        pval = ProbabilisticValidator(
            col_stats=self.transaction.lmd['stats_v2'][col],
            col_name=col,
            input_columns=input_columns)
        # First entry: predictions with all columns; then one entry per
        # left-out column, in the same order as the ignored-column list
        # passed to fit() below.
        predictions_arr = [normal_predictions_test] + list(
            empty_input_predictions_test.values())

        pval.fit(self.transaction.input_data.test_df, predictions_arr,
                 [[ignored_column]
                  for ignored_column in empty_input_predictions_test])
        overall_accuracy, accuracy_histogram, cm, accuracy_samples = pval.get_accuracy_stats(
        )
        overall_accuracy_arr.append(overall_accuracy)

        self.transaction.lmd['accuracy_histogram'][
            col] = accuracy_histogram
        self.transaction.lmd['confusion_matrices'][col] = cm
        self.transaction.lmd['accuracy_samples'][col] = accuracy_samples
        self.transaction.hmd['probabilistic_validators'][col] = pickle_obj(
            pval)

    self.transaction.lmd['validation_set_accuracy'] = sum(
        overall_accuracy_arr) / len(overall_accuracy_arr)

    # conformal prediction confidence estimation
    self.transaction.lmd['stats_v2']['train_std_dev'] = {}
    self.transaction.hmd['label_encoders'] = {}
    self.transaction.hmd['icp'] = {'active': False}

    for target in output_columns:
        data_type = self.transaction.lmd['stats_v2'][target]['typing'][
            'data_type']
        data_subtype = self.transaction.lmd['stats_v2'][target]['typing'][
            'data_subtype']
        is_classification = data_type == DATA_TYPES.CATEGORICAL

        fit_params = {
            'target': target,
            'all_columns': self.transaction.lmd['columns'],
            'columns_to_ignore': []
        }
        fit_params['columns_to_ignore'].extend(
            self.transaction.lmd['columns_to_ignore'])
        fit_params['columns_to_ignore'].extend(
            [col for col in output_columns if col != target])

        if is_classification:
            if data_subtype != DATA_SUBTYPES.TAGS:
                # Collect the target values of all three data splits so the
                # encoder is fitted on every class seen in any of them.
                all_targets = [
                    elt[1][target].values for elt in inspect.getmembers(
                        self.transaction.input_data)
                    if elt[0] in {'test_df', 'train_df', 'validation_df'}
                ]
                all_classes = np.unique(
                    np.concatenate([np.unique(arr)
                                    for arr in all_targets]))

                enc = OneHotEncoder(sparse=False, handle_unknown='ignore')
                enc.fit(all_classes.reshape(-1, 1))
                fit_params['one_hot_enc'] = enc
                self.transaction.hmd['label_encoders'][target] = enc
            else:
                fit_params['one_hot_enc'] = None
                self.transaction.hmd['label_encoders'][target] = None

            adapter = ConformalClassifierAdapter
            # better than IPS as we'd need the complete distribution over
            # all classes
            nc_function = MarginErrFunc()
            nc_class = ClassifierNc
            icp_class = IcpClassifier
        else:
            adapter = ConformalRegressorAdapter
            nc_function = AbsErrorErrFunc()
            nc_class = RegressorNc
            icp_class = IcpRegressor

        # ICP is only set up for numeric targets and non-tag categorical
        # targets, and never for time series.
        if (data_type == DATA_TYPES.NUMERIC or
            (is_classification and data_subtype != DATA_SUBTYPES.TAGS)
            ) and not self.transaction.lmd['tss']['is_timeseries']:
            model = adapter(self.transaction.model_backend.predictor,
                            fit_params=fit_params)
            nc = nc_class(model, nc_function)

            X = deepcopy(self.transaction.input_data.train_df)
            y = X.pop(target)

            if is_classification:
                self.transaction.hmd['icp'][target] = icp_class(
                    nc, smoothing=False)
            else:
                self.transaction.hmd['icp'][target] = icp_class(nc)
                self.transaction.lmd['stats_v2']['train_std_dev'][
                    target] = self.transaction.input_data.train_df[
                        target].std()

            X = clean_df(X, self.transaction.lmd['stats_v2'],
                         output_columns)
            self.transaction.hmd['icp'][target].fit(X.values, y.values)
            self.transaction.hmd['icp']['active'] = True

            # calibrate conformal estimator on the validation set
            X = deepcopy(self.transaction.input_data.validation_df)
            y = X.pop(target).values

            if is_classification:
                # String labels are mapped to their index in the encoder's
                # category list before calibration.
                if isinstance(enc.categories_[0][0], str):
                    cats = enc.categories_[0].tolist()
                    y = np.array([cats.index(i) for i in y])
                y = y.astype(int)

            X = clean_df(X, self.transaction.lmd['stats_v2'],
                         output_columns)
            self.transaction.hmd['icp'][target].calibrate(X.values, y)
# Example: inductive conformal regression with a decision tree (no
# normalization), printing each test interval, the truth and the interval size.

# -----------------------------------------------------------------------------
# Setup training, calibration and test indices
# -----------------------------------------------------------------------------
data = load_boston()

# Shuffle all sample indices, then cut them into three consecutive thirds.
idx = np.random.permutation(data.target.size)
first_cut = idx.size // 3
second_cut = 2 * idx.size // 3
train = idx[:first_cut]
calibrate = idx[first_cut:second_cut]
test = idx[second_cut:]

# -----------------------------------------------------------------------------
# Without normalization
# -----------------------------------------------------------------------------
# Train and calibrate
# -----------------------------------------------------------------------------
underlying_model = RegressorAdapter(
    DecisionTreeRegressor(min_samples_leaf=5))
nc = RegressorNc(underlying_model, AbsErrorErrFunc())
icp = IcpRegressor(nc)

train_x, train_y = data.data[train, :], data.target[train]
icp.fit(train_x, train_y)

calibrate_x, calibrate_y = data.data[calibrate, :], data.target[calibrate]
icp.calibrate(calibrate_x, calibrate_y)

# -----------------------------------------------------------------------------
# Predict
# -----------------------------------------------------------------------------
prediction = icp.predict(data.data[test, :], significance=0.1)

# Interval width = upper bound minus lower bound.
size = prediction[:, 1] - prediction[:, 0]

header = ['min','max','truth','size']
rows = [prediction.T, data.target[test], size]
table = np.vstack(rows).T
df = pd.DataFrame(table, columns=header)
print(df)

# -----------------------------------------------------------------------------
def run_experiment(dataset_name, test_method, random_state_train_test,
                   save_to_csv=True):
    """ Estimate prediction intervals and print the average length and coverage

    Loads one dataset, splits it into train and test parts, standardizes the
    features, rescales the labels, runs the method selected by `test_method`
    and prints a summary table. Optionally appends the per-method results to
    './results/results.csv'.

    Relies on the module-level names `base_dataset_path` (where datasets are
    loaded from) and `plot_results` (whether the intervals are plotted).

    Parameters
    ----------

    dataset_name : string, name of the dataset to load
    test_method : string, method to be tested, estimating
                  the 90% prediction interval. Recognized values:
                  'linear', 'neural_net', 'random_forest',
                  'quantile_net', 'cqr_quantile_net',
                  'cqr_asymmetric_quantile_net',
                  'rearrangement', 'cqr_rearrangement',
                  'cqr_asymmetric_rearrangement',
                  'quantile_forest', 'cqr_quantile_forest',
                  'cqr_asymmetric_quantile_forest'.
                  Any other value runs no method and yields an all-zero
                  summary.
    random_state_train_test : integer, random seed to be used
    save_to_csv : boolean, save average length and coverage to csv (True)
                  or not (False)

    Returns
    -------

    None. If the dataset cannot be loaded a message is printed and the
    function returns early.

    """

    # per-method records, later written to the csv file as parallel columns
    dataset_name_vec = []
    method_vec = []
    coverage_vec = []
    length_vec = []
    seed_vec = []

    # (coverage, length) keyed by method label; methods that were not run are
    # reported as 0 in the summary table
    scores = {}

    seed = random_state_train_test
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    # determines the size of test set
    test_ratio = 0.2

    # conformal prediction miscoverage level
    significance = 0.1
    # desired quantile levels, used by the quantile regression methods
    quantiles = [0.05, 0.95]

    # Random forests parameters (shared by conditional quantile random forests
    # and conditional mean random forests regression).
    n_estimators = 1000  # usual random forests n_estimators parameter
    min_samples_leaf = 1  # default parameter of sklearn

    # Quantile random forests parameters.
    # See QuantileForestRegressorAdapter class for more details
    quantiles_forest = [5, 95]
    CV_qforest = True
    coverage_factor = 0.85
    cv_test_ratio = 0.05
    cv_random_state = 1
    cv_range_vals = 30
    cv_num_vals = 10

    # Neural network parameters (shared by conditional quantile neural network
    # and conditional mean neural network regression)
    # See AllQNet_RegressorAdapter and MSENet_RegressorAdapter in helper.py
    nn_learn_func = torch.optim.Adam
    epochs = 1000
    lr = 0.0005
    hidden_size = 64
    batch_size = 64
    dropout = 0.1
    wd = 1e-6

    # Ask for a reduced coverage when tuning the network parameters by
    # cross-validation to avoid too conservative initial estimation of the
    # prediction interval. This estimation will be conformalized by CQR.
    quantiles_net = [0.1, 0.9]

    # local conformal prediction parameter.
    # See RegressorNc class for more details.
    beta = 1
    beta_net = 1

    # local conformal prediction parameter. The local ridge regression method
    # uses nearest neighbor regression as the MAD estimator.
    # Number of neighbors used by nearest neighbor regression.
    n_neighbors = 11

    print(dataset_name)
    sys.stdout.flush()

    try:
        # load the dataset
        X, y = datasets.GetDataset(dataset_name, base_dataset_path)
    except Exception:
        # Only ordinary errors are treated as "dataset unavailable";
        # KeyboardInterrupt / SystemExit are allowed to propagate.
        print("CANNOT LOAD DATASET!")
        return

    # Dataset is divided into test and train data based on test_ratio parameter
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_ratio, random_state=random_state_train_test)

    # zero mean and unit variance scaling of the train and test features
    scalerX = StandardScaler()
    scalerX = scalerX.fit(X_train)
    X_train = scalerX.transform(X_train)
    X_test = scalerX.transform(X_test)

    # scale the labels by dividing each by the mean absolute response
    # (computed on the training labels only)
    mean_abs_ytrain = np.mean(np.abs(y_train))
    y_train = y_train/mean_abs_ytrain
    y_test = y_test/mean_abs_ytrain

    # fit a simple ridge regression model (sanity check)
    model = linear_model.RidgeCV()
    model = model.fit(X_train, y_train)
    predicted_data = model.predict(X_test).astype(np.float32)

    # calculate the normalized mean squared error
    print("Ridge relative error: %f" %
          (np.sum((y_test-predicted_data)**2)/np.sum(y_test**2)))
    sys.stdout.flush()

    # reshape the data
    X_train = np.asarray(X_train)
    y_train = np.squeeze(np.asarray(y_train))
    X_test = np.asarray(X_test)
    y_test = np.squeeze(np.asarray(y_test))

    # input dimensions
    n_train = X_train.shape[0]
    in_shape = X_train.shape[1]

    print("Size: train (%d, %d), test (%d, %d)" %
          (X_train.shape[0], X_train.shape[1],
           X_test.shape[0], X_test.shape[1]))
    sys.stdout.flush()

    # set seed for splitting the data into proper train and calibration
    np.random.seed(seed)
    idx = np.random.permutation(n_train)

    # divide the data into proper training set and calibration set
    # (two equally sized halves; with odd n_train one sample is left out)
    n_half = int(np.floor(n_train/2))
    idx_train, idx_cal = idx[:n_half], idx[n_half:2*n_half]

    # ------------------------------------------------------------------
    # Local helpers: every method below goes through the same
    # "plot, score, record" sequence, so it lives in one place.
    # ------------------------------------------------------------------

    def evaluate(label, y_lower, y_upper):
        """Plot (if enabled), score and record the intervals of one method."""
        if plot_results:
            helper.plot_func_data(y_test, y_lower, y_upper, label)
        coverage, length = helper.compute_coverage(
            y_test, y_lower, y_upper, significance, label)
        dataset_name_vec.append(dataset_name)
        method_vec.append(label)
        coverage_vec.append(coverage)
        length_vec.append(length)
        seed_vec.append(seed)
        scores[label] = (coverage, length)

    def evaluate_icp(label, nc):
        """Run inductive conformal prediction with `nc` and record it."""
        y_lower, y_upper = helper.run_icp(nc, X_train, y_train, X_test,
                                          idx_train, idx_cal, significance)
        evaluate(label, y_lower, y_upper)

    def evaluate_direct(label, quantile_model):
        """Fit a quantile model on all training data and record its raw
        (non-conformalized) lower/upper predictions."""
        quantile_model.fit(X_train, y_train)
        tmp = quantile_model.predict(X_test)
        evaluate(label, tmp[:, 0], tmp[:, 1])

    def make_mse_net():
        """Build a fresh conditional mean neural network adapter."""
        return helper.MSENet_RegressorAdapter(model=None,
                                              fit_params=None,
                                              in_shape=in_shape,
                                              hidden_size=hidden_size,
                                              learn_func=nn_learn_func,
                                              epochs=epochs,
                                              batch_size=batch_size,
                                              dropout=dropout,
                                              lr=lr,
                                              wd=wd,
                                              test_ratio=cv_test_ratio,
                                              random_state=cv_random_state)

    def make_quantile_net(net_quantiles, use_rearrangement):
        """Build a fresh conditional quantile neural network adapter."""
        return helper.AllQNet_RegressorAdapter(
            model=None,
            fit_params=None,
            in_shape=in_shape,
            hidden_size=hidden_size,
            quantiles=net_quantiles,
            learn_func=nn_learn_func,
            epochs=epochs,
            batch_size=batch_size,
            dropout=dropout,
            lr=lr,
            wd=wd,
            test_ratio=cv_test_ratio,
            random_state=cv_random_state,
            use_rearrangement=use_rearrangement)

    def make_forest():
        """Build a fresh conditional mean random forest."""
        return RandomForestRegressor(n_estimators=n_estimators,
                                     min_samples_leaf=min_samples_leaf,
                                     random_state=0)

    def make_quantile_forest(forest_quantiles, use_cv):
        """Build a fresh quantile random forest adapter."""
        # The original code first set "random_state" to 0 and then overwrote
        # it with cv_random_state; the effective value is kept here.
        params_qforest = {
            "random_state": cv_random_state,
            "min_samples_leaf": min_samples_leaf,
            "n_estimators": n_estimators,
            "max_features": X_train.shape[1],
            "CV": use_cv,
            "coverage_factor": coverage_factor,
            "test_ratio": cv_test_ratio,
            "range_vals": cv_range_vals,
            "num_vals": cv_num_vals,
        }
        return helper.QuantileForestRegressorAdapter(model=None,
                                                     fit_params=None,
                                                     quantiles=forest_quantiles,
                                                     params=params_qforest)

    ######################## Linear

    if test_method == 'linear':
        evaluate_icp("Ridge", RegressorNc(linear_model.RidgeCV()))

        # locally adaptive version: nearest neighbors estimate the residuals
        nc = NcFactory.create_nc(
            linear_model.RidgeCV(),
            normalizer_model=KNeighborsRegressor(n_neighbors=n_neighbors)
        )
        evaluate_icp("Ridge-L", nc)

    ######################### Neural net

    elif test_method == 'neural_net':
        evaluate_icp("Net", RegressorNc(make_mse_net()))

        # the normalizer network is built before the main one, matching the
        # original construction order
        normalizer_adapter = make_mse_net()
        adapter = make_mse_net()
        normalizer = RegressorNormalizer(adapter,
                                         normalizer_adapter,
                                         AbsErrorErrFunc())
        nc = RegressorNc(adapter, AbsErrorErrFunc(), normalizer, beta=beta_net)
        evaluate_icp("Net-L", nc)

    ################## Random Forest

    elif test_method == 'random_forest':
        evaluate_icp("RF", RegressorNc(make_forest(), AbsErrorErrFunc()))

        normalizer_adapter = make_forest()
        adapter = make_forest()
        normalizer = RegressorNormalizer(adapter,
                                         normalizer_adapter,
                                         AbsErrorErrFunc())
        nc = RegressorNc(adapter, AbsErrorErrFunc(), normalizer, beta=beta)
        evaluate_icp("RF-L", nc)

    ################## Quantile Net

    elif test_method == 'quantile_net':
        evaluate_direct("QNet", make_quantile_net(quantiles, False))

    elif test_method == 'cqr_quantile_net':
        nc = RegressorNc(make_quantile_net(quantiles_net, False),
                         QuantileRegErrFunc())
        evaluate_icp("CQR Net", nc)

    elif test_method == 'cqr_asymmetric_quantile_net':
        nc = RegressorNc(make_quantile_net(quantiles_net, False),
                         QuantileRegAsymmetricErrFunc())
        evaluate_icp("CQR Sign Net", nc)

    ################### Rearrangement Quantile Net

    elif test_method == 'rearrangement':
        evaluate_direct("Rearrange QNet", make_quantile_net(quantiles, True))

    elif test_method == 'cqr_rearrangement':
        nc = RegressorNc(make_quantile_net(quantiles_net, True),
                         QuantileRegErrFunc())
        evaluate_icp("Rearrange CQR Net", nc)

    elif test_method == 'cqr_asymmetric_rearrangement':
        nc = RegressorNc(make_quantile_net(quantiles_net, True),
                         QuantileRegAsymmetricErrFunc())
        # Scored under its own label; previously the coverage was computed
        # with the label of the symmetric variant ("Rearrange CQR Net").
        evaluate_icp("Rearrange CQR Sign Net", nc)

    ################### Quantile Random Forest

    elif test_method == 'quantile_forest':
        # the forest adapter takes percentiles, so the levels are scaled by 100
        evaluate_direct("QRF",
                        make_quantile_forest(np.dot(100, quantiles), False))

    elif test_method == 'cqr_quantile_forest':
        nc = RegressorNc(make_quantile_forest(quantiles_forest, CV_qforest),
                         QuantileRegErrFunc())
        evaluate_icp("CQR RF", nc)

    elif test_method == 'cqr_asymmetric_quantile_forest':
        nc = RegressorNc(make_quantile_forest(quantiles_forest, CV_qforest),
                         QuantileRegAsymmetricErrFunc())
        evaluate_icp("CQR Sign RF", nc)

    ############### Summary

    # (row title in the summary table, method label used when recording)
    summary_rows = [
        ('CP Linear', 'Ridge'),
        ('CP Linear Local', 'Ridge-L'),
        ('CP Neural Net', 'Net'),
        ('CP Neural Net Local', 'Net-L'),
        ('CP Random Forest', 'RF'),
        ('CP Random Forest Local', 'RF-L'),
        ('CP Quantile Net', 'CQR Net'),
        ('CP Asymmetric Quantile Net', 'CQR Sign Net'),
        ('Quantile Net', 'QNet'),
        ('CP Rearrange Quantile Net', 'Rearrange CQR Net'),
        ('CP Asymmetric Rearrange Quantile Net', 'Rearrange CQR Sign Net'),
        ('Rearrange Quantile Net', 'Rearrange QNet'),
        ('CP Quantile Random Forest', 'CQR RF'),
        ('CP Asymmetric Quantile Random Forest', 'CQR Sign RF'),
        ('Quantile Random Forest', 'QRF'),
    ]

    coverage_str = 'Coverage (expected ' + str(100 - significance*100) + '%)'
    results = np.array(
        [[dataset_name, coverage_str, 'Avg. Length', 'Seed']]
        + [[title, *scores.get(label, (0, 0)), seed]
           for title, label in summary_rows])

    results_ = pd.DataFrame(data=results[1:, 1:],
                            index=results[1:, 0],
                            columns=results[0, 1:])

    print("== SUMMARY == ")
    print("dataset name: " + dataset_name)
    print(results_)
    sys.stdout.flush()

    if save_to_csv:
        outdir = './results/'
        # exist_ok avoids the check-then-create race of exists() + mkdir()
        os.makedirs(outdir, exist_ok=True)

        out_name = outdir + 'results.csv'

        df = pd.DataFrame({'name': dataset_name_vec,
                           'method': method_vec,
                           coverage_str: coverage_vec,
                           'Avg. Length': length_vec,
                           'seed': seed_vec})

        # append to the results of earlier runs, if any
        if os.path.isfile(out_name):
            df2 = pd.read_csv(out_name)
            df = pd.concat([df2, df], ignore_index=True)

        df.to_csv(out_name, index=False)
def build(self):
    '''Build a new SVM model with the X and Y numpy matrices.

    Depending on the 'tune', 'quantitative' and 'conformal' parameters the
    estimator is either optimized through self.optimize or instantiated
    directly (svm.SVR for quantitative, svm.SVC for qualitative models),
    fitted, and optionally replaced by an aggregated conformal predictor
    built around a copy of the fitted estimator.

    Errors raised while optimizing or instantiating the estimator, or while
    building the conformal predictor, are logged and not re-raised.

    Returns
    -------
    tuple
        (True, results), where results is a list of
        (key, description, value) tuples describing the model.
    '''

    # Make a copy of data matrices
    X = self.X.copy()
    Y = self.Y.copy()

    results = [
        ('nobj', 'number of objects', self.nobj),
        ('nvarx', 'number of predictor variables', self.nvarx),
    ]

    # If tune then call gridsearch to optimize the estimator
    if self.param.getVal('tune'):
        try:
            # Check type of model
            if self.param.getVal('quantitative'):
                self.optimize(X, Y, svm.SVR(**self.estimator_parameters),
                              self.tune_parameters)
                results.append(('model', 'model type',
                                'SVM quantitative (optimized)'))
            else:
                self.optimize(X, Y, svm.SVC(**self.estimator_parameters),
                              self.tune_parameters)
                results.append(
                    ('model', 'model type', 'SVM qualitative (optimized)'))
            LOG.debug('SVM estimator optimized')
        except Exception as e:
            # message fixed: the two fragments used to be joined without a
            # space ("SVMestimator")
            LOG.error(f'Exception optimizing SVM '
                      f'estimator with exception {e}')
    else:
        try:
            LOG.info("Building SVM model")
            if self.param.getVal('quantitative'):
                LOG.info("Building Quantitative SVM-R model")
                self.estimator = svm.SVR(**self.estimator_parameters)
                results.append(('model', 'model type', 'SVM quantitative'))
            else:
                self.estimator = svm.SVC(**self.estimator_parameters)
                results.append(('model', 'model type', 'SVM qualitative'))
        except Exception as e:
            LOG.error(f'Exception building SVM '
                      f'estimator with exception {e}')

    # Fit estimator to the data.
    # NOTE(review): block structure was reconstructed from syntax; the fit is
    # assumed to run for both the tuned and the untuned path - confirm.
    self.estimator.fit(X, Y)

    # Shallow copy used as the underlying model of the conformal predictor,
    # because self.estimator itself is replaced below.
    self.estimator_temp = copy(self.estimator)

    if self.param.getVal('conformal'):
        try:
            LOG.info("Building aggregated conformal SVM model")

            if self.param.getVal('quantitative'):
                underlying_model = RegressorAdapter(self.estimator_temp)
                # the same estimator is also used to normalize the errors
                normalizing_model = RegressorAdapter(self.estimator_temp)
                normalizer = RegressorNormalizer(underlying_model,
                                                 normalizing_model,
                                                 AbsErrorErrFunc())
                nc = RegressorNc(underlying_model,
                                 AbsErrorErrFunc(),
                                 normalizer)

                self.estimator = AggregatedCp(IcpRegressor(nc),
                                              BootstrapSampler())

                self.estimator.fit(X, Y)
                # overrides non-conformal
                results.append(
                    ('model', 'model type', 'conformal SVM quantitative'))

            else:
                self.estimator = AggregatedCp(
                    IcpClassifier(
                        ClassifierNc(
                            ClassifierAdapter(self.estimator_temp),
                            MarginErrFunc())),
                    BootstrapSampler())

                self.estimator.fit(X, Y)
                # overrides non-conformal
                results.append(
                    ('model', 'model type', 'conformal SVM qualitative'))

        except Exception as e:
            LOG.error(f'Exception building aggregated conformal SVM '
                      f'estimator with exception {e}')

    return True, results
folds=5, scoring_funcs=[class_mean_errors, class_avg_c], significance_levels=[0.05, 0.1, 0.2]) print('Classification: iris') scores = scores.drop(['fold', 'iter'], axis=1) print(scores.groupby(['significance']).mean()) # ----------------------------------------------------------------------------- # Regression, absolute error # ----------------------------------------------------------------------------- data = load_diabetes() icp = IcpRegressor( RegressorNc(RegressorAdapter(RandomForestRegressor(n_estimators=100)), AbsErrorErrFunc())) icp_cv = RegIcpCvHelper(icp) scores = cross_val_score(icp_cv, data.data, data.target, iterations=5, folds=5, scoring_funcs=[reg_mean_errors, reg_median_size], significance_levels=[0.05, 0.1, 0.2]) print('Absolute error regression: diabetes') scores = scores.drop(['fold', 'iter'], axis=1) print(scores.groupby(['significance']).mean()) # -----------------------------------------------------------------------------