Example #1
0
    def test_icp_regression_tree(self):
        """Smoke-test IcpRegressor on the Boston housing data.

        Fits, calibrates and predicts twice — once with a plain
        nonconformity function and once with a 1-NN-normalized one —
        printing the resulting prediction intervals each time.
        """

        def _predict_and_print(icp, X_test, y_test, significance=0.1):
            # Shared predict/report step: tabulate interval bounds,
            # ground truth and interval width, then print the table.
            prediction = icp.predict(X_test, significance=significance)
            size = prediction[:, 1] - prediction[:, 0]
            table = np.vstack([prediction.T, y_test, size.T]).T
            df = pd.DataFrame(table, columns=["min", "max", "truth", "size"])
            print(df)

        # -----------------------------------------------------------------------------
        # Setup training, calibration and test indices (random thirds)
        # -----------------------------------------------------------------------------
        data = load_boston()

        idx = np.random.permutation(data.target.size)
        train = idx[:int(idx.size / 3)]
        calibrate = idx[int(idx.size / 3):int(2 * idx.size / 3)]
        test = idx[int(2 * idx.size / 3):]

        # -----------------------------------------------------------------------------
        # Without normalization: train, calibrate, predict
        # -----------------------------------------------------------------------------
        underlying_model = RegressorAdapter(
            DecisionTreeRegressor(min_samples_leaf=5))
        nc = RegressorNc(underlying_model, AbsErrorErrFunc())
        icp = IcpRegressor(nc)
        icp.fit(data.data[train, :], data.target[train])
        icp.calibrate(data.data[calibrate, :], data.target[calibrate])
        _predict_and_print(icp, data.data[test, :], data.target[test])

        # -----------------------------------------------------------------------------
        # With normalization (1-NN error model): train, calibrate, predict
        # -----------------------------------------------------------------------------
        underlying_model = RegressorAdapter(
            DecisionTreeRegressor(min_samples_leaf=5))
        normalizing_model = RegressorAdapter(
            KNeighborsRegressor(n_neighbors=1))
        normalizer = RegressorNormalizer(underlying_model, normalizing_model,
                                         AbsErrorErrFunc())
        nc = RegressorNc(underlying_model, AbsErrorErrFunc(), normalizer)
        icp = IcpRegressor(nc)
        icp.fit(data.data[train, :], data.target[train])
        icp.calibrate(data.data[calibrate, :], data.target[calibrate])
        _predict_and_print(icp, data.data[test, :], data.target[test])
Example #2
0
    def build(self):
        """Build a PLSR model, optionally tuned and/or conformal.

        Returns:
            (bool, results): on failure, ``results`` is an error message
            string; on success, a list of (key, description, value)
            tuples describing the model that was built.
        """
        if not self.quantitative:
            print("PLSR only applies to quantitative data")
            return False, "PLSR only applies to quantitative data"

        if self.failed:
            return False, "Error initiating model"

        # Work on copies so the stored matrices are never mutated.
        X = self.X.copy()
        Y = self.Y.copy()

        results = []
        results.append(('nobj', 'number of objects', self.nobj))
        results.append(('nvarx', 'number of predictor variables', self.nvarx))

        if self.cv:
            self.cv = getCrossVal(self.cv, 46, self.n, self.p)

        if self.tune:
            # NOTE(review): this branch never assigns self.estimator
            # explicitly — presumably optimize() sets it; confirm.
            if self.optimiz == 'auto':
                super(PLSR, self).optimize(X, Y, PLS_r(
                    **self.estimator_parameters), self.tune_parameters)
            elif self.optimiz == 'manual':
                self.optimize(X, Y, PLS_r(
                    **self.estimator_parameters), self.tune_parameters)

            results.append(
                ('model', 'model type', 'PLSR quantitative (optimized)'))

        else:
            print("Building  Quantitative PLSR")
            self.estimator = PLS_r(**self.estimator_parameters)
            results.append(('model', 'model type', 'PLSR quantitative'))

        if self.conformal:
            underlying_model = RegressorAdapter(self.estimator)
            # The estimator itself serves as the normalizing (error)
            # model. (A 1-NN normalizer was previously created here but
            # was immediately overwritten, so it has been removed.)
            normalizing_model = RegressorAdapter(self.estimator)
            normalizer = RegressorNormalizer(
                underlying_model, normalizing_model, AbsErrorErrFunc())
            nc = RegressorNc(underlying_model, AbsErrorErrFunc(), normalizer)
            self.conformal_pred = AggregatedCp(IcpRegressor(nc),
                                               BootstrapSampler())

            self.conformal_pred.fit(X, Y)
            # overrides non-conformal
            results.append(
                ('model', 'model type', 'conformal PLSR quantitative'))

        self.estimator.fit(X, Y)

        return True, results
def CF_QuanCal(X, Y, estimator):
    """Fit an aggregated conformal regressor with normalization.

    Both the underlying and the normalizing model are wrapped in
    RegressorAdapter (the normalizer previously received the raw
    estimators, inconsistently with every other conformal setup here),
    and the same adapter instance is shared with RegressorNc.

    Args:
        X, Y: training data matrices.
        estimator: scikit-learn-style regressor; a shallow copy is used
            as the normalizing (error) model.

    Returns:
        The fitted AggregatedCp predictor.
    """
    underlying_model = RegressorAdapter(estimator)
    normalizing_model = RegressorAdapter(copy.copy(estimator))
    normalizer = RegressorNormalizer(underlying_model, normalizing_model,
                                     AbsErrorErrFunc())
    nc = RegressorNc(underlying_model, AbsErrorErrFunc(), normalizer)
    acp = AggregatedCp(IcpRegressor(nc), RandomSubSampler())
    acp.fit(X, Y)
    return acp
Example #4
0
    def build(self):
        '''Build a new XGBOOST model with the X and Y numpy matrices.

        Returns a (success, results) tuple: on failure, results is an
        error message string; on success, a list of
        (key, description, value) tuples describing the model built.
        '''

        try:
            from xgboost.sklearn import XGBClassifier
            from xgboost.sklearn import XGBRegressor
        except Exception:
            return False, 'XGboost not found, please revise your environment'

        # Make a copy of data matrices so the originals are not mutated
        X = self.X.copy()
        Y = self.Y.copy()

        results = []
        results.append(('nobj', 'number of objects', self.nobj))
        results.append(('nvarx', 'number of predictor variables', self.nvarx))

        # If tune then call gridsearch to optimize the estimator
        if self.param.getVal('tune'):

            LOG.info("Optimizing XGBOOST estimator")

            try:
                # Check type of model
                if self.param.getVal('quantitative'):
                    self.estimator = XGBRegressor(
                                        **self.estimator_parameters)
                    self.optimize(X, Y, self.estimator, self.tune_parameters)
                    results.append(('model', 'model type',
                                    'XGBOOST quantitative (optimized)'))
                else:
                    self.estimator = XGBClassifier(
                                        **self.estimator_parameters)
                    # NOTE(review): get_params() returns a copy, so this
                    # 'num_class' assignment never reaches the estimator;
                    # set_params(num_class=2) was probably intended — confirm.
                    params = self.estimator.get_params()
                    params['num_class'] = 2
                    self.optimize(X, Y, self.estimator,
                                  self.tune_parameters)
                    results.append(('model', 'model type',
                                    'XGBOOST qualitative (optimized)'))

            except Exception as e:
                return False, f'Exception optimizing XGBOOST estimator with exception {e}'

        else:
            try:
                if self.param.getVal('quantitative'):

                    LOG.info("Building Quantitative XGBOOST model")
                    self.estimator = XGBRegressor(**self.estimator_parameters)
                    results.append(('model', 'model type', 'XGBOOST quantitative'))
                else:

                    LOG.info("Building Qualitative XGBOOST model")
                    self.estimator = XGBClassifier(**self.estimator_parameters)
                    results.append(('model', 'model type', 'XGBOOST qualitative'))

                self.estimator.fit(X, Y)
                print(self.estimator)

            except Exception as e:
                # The original `raise e` made the return below unreachable;
                # report the failure like the other branches do.
                return False, f'Exception building XGBOOST estimator with exception {e}'

        # Keep a copy of the fitted estimator: the conformal wrappers
        # below replace self.estimator but adapt this copy.
        self.estimator_temp = copy(self.estimator)

        if not self.param.getVal('conformal'):
            return True, results
        # Create the conformal estimator
        try:
            # Conformal regressor
            if self.param.getVal('quantitative'):

                LOG.info("Building conformal Quantitative XGBOOST model")

                underlying_model = RegressorAdapter(self.estimator_temp)
                # The estimator itself is used as the normalizing (error)
                # model for the nonconformity scores.
                normalizing_model = RegressorAdapter(self.estimator_temp)
                normalizer = RegressorNormalizer(
                                underlying_model,
                                normalizing_model,
                                AbsErrorErrFunc())
                nc = RegressorNc(underlying_model,
                                    AbsErrorErrFunc(),
                                    normalizer)

                self.estimator = AggregatedCp(IcpRegressor(nc),
                                                BootstrapSampler())

                self.estimator.fit(X, Y)
                results.append(('model', 'model type', 'conformal XGBOOST quantitative'))

            # Conformal classifier
            else:

                LOG.info("Building conformal Qualitative XGBOOST model")

                self.estimator = AggregatedCp(
                                    IcpClassifier(
                                        ClassifierNc(
                                            ClassifierAdapter(self.estimator_temp),
                                            MarginErrFunc()
                                        )
                                    ),
                                    BootstrapSampler())

                # Fit estimator to the data
                self.estimator.fit(X, Y)
                results.append(('model', 'model type', 'conformal XGBOOST qualitative'))

        except Exception as e:
            # Same fix as above: return the error instead of raising it.
            return False, f'Exception building conformal XGBOOST estimator with exception {e}'

        return True, results



## Overriding of parent methods

    # def CF_quantitative_validation(self):
    #     ''' performs validation for conformal quantitative models '''

      

    # def CF_qualitative_validation(self):
    #     ''' performs validation for conformal qualitative models '''


    # def quantitativeValidation(self):
    #     ''' performs validation for quantitative models '''

    # def qualitativeValidation(self):
    #     ''' performs validation for qualitative models '''


    # def validate(self):
    #     ''' Validates the model and computes suitable model quality scoring values'''


    # def optimize(self, X, Y, estimator, tune_parameters):
    #     ''' optimizes a model using a grid search over a range of values for diverse parameters'''


    # def regularProject(self, Xb, results):
    #     ''' projects a collection of query objects in a regular model, for obtaining predictions '''


    # def conformalProject(self, Xb, results):
    #     ''' projects a collection of query objects in a conformal model, for obtaining predictions '''


    # def project(self, Xb, results):
    #     ''' Uses the X matrix provided as argument to predict Y'''
Example #5
0
    def build(self):
        '''Build a new DL model with the X and Y numpy matrices.

        Returns a (success, results) tuple: on failure, results is an
        error message string; on success, a list of
        (key, description, value) tuples describing the model built.
        '''

        try:
            from keras.wrappers.scikit_learn import KerasClassifier
            from keras.wrappers.scikit_learn import KerasRegressor
        except Exception:
            return False, 'Keras not found, please revise your environment'

        # Make a copy of data matrices so the originals are not mutated
        X = self.X.copy()
        Y = self.Y.copy()

        results = []
        results.append(('nobj', 'number of objects', self.nobj))
        results.append(('nvarx', 'number of predictor variables', self.nvarx))

        # If tune then call gridsearch to optimize the estimator
        if self.param.getVal('tune'):

            LOG.info("Optimizing Keras estimator")

            try:
                # Check type of model
                if self.param.getVal('quantitative'):
                    self.estimator = KerasRegressor(
                        **self.estimator_parameters)
                    self.optimize(X, Y, self.estimator, self.tune_parameters)
                    results.append(('model', 'model type',
                                    'KERAS quantitative (optimized)'))
                else:
                    self.estimator = KerasClassifier(
                        **self.estimator_parameters)
                    self.optimize(X, Y, self.estimator, self.tune_parameters)
                    results.append(('model', 'model type',
                                    'KERAS qualitative (optimized)'))

            except Exception as e:
                return False, f'Exception optimizing KERAS estimator with exception {e}'

        else:
            try:
                if self.param.getVal('quantitative'):

                    LOG.info("Building Quantitative KERAS mode")
                    self.estimator = KerasRegressor(
                        build_fn=self.create_model,
                        **self.estimator_parameters,
                        verbose=0)
                    results.append(
                        ('model', 'model type', 'Keras quantitative'))
                else:

                    LOG.info("Building Qualitative Keras model")
                    self.estimator = KerasClassifier(
                        build_fn=self.create_model,
                        dim=self.X.shape[1],
                        **self.estimator_parameters,
                        verbose=0)
                    results.append(
                        ('model', 'model type', 'Keras qualitative'))

                self.estimator.fit(X, Y)
                print(self.estimator)

            except Exception as e:
                # The original `raise e` made the return below unreachable;
                # report the failure like the other branches do.
                return False, f'Exception building Keras estimator with exception {e}'

        # Clone the configured estimator for the conformal wrappers below.
        self.estimator_temp = clone(self.estimator)

        if not self.param.getVal('conformal'):
            return True, results
        # Create the conformal estimator
        try:
            # Conformal regressor
            if self.param.getVal('quantitative'):

                LOG.info("Building conformal Quantitative Keras model")

                underlying_model = RegressorAdapter(self.estimator_temp)
                # A 15-NN regressor estimates the expected absolute error
                # for interval normalization.
                normalizing_model = RegressorAdapter(
                    KNeighborsRegressor(n_neighbors=15))
                normalizer = RegressorNormalizer(underlying_model,
                                                 normalizing_model,
                                                 AbsErrorErrFunc())
                nc = RegressorNc(underlying_model, AbsErrorErrFunc(),
                                 normalizer)

                self.estimator = AggregatedCp(IcpRegressor(nc),
                                              BootstrapSampler())

                self.estimator.fit(X, Y)
                results.append(
                    ('model', 'model type', 'conformal Keras quantitative'))

            # Conformal classifier
            else:

                LOG.info("Building conformal Qualitative Keras model")

                self.estimator = AggregatedCp(
                    IcpClassifier(
                        ClassifierNc(ClassifierAdapter(self.estimator_temp),
                                     MarginErrFunc())), BootstrapSampler())

                # Fit estimator to the data
                print('build finished')
                self.estimator.fit(X, Y)
                results.append(
                    ('model', 'model type', 'conformal Keras qualitative'))

        except Exception as e:
            # Same fix as above: return the error instead of raising it.
            return False, f'Exception building conformal Keras estimator with exception {e}'

        # Bug fix: previously returned (True, []), discarding results.
        return True, results
Example #6
0
def train_and_test_cp_algo(i):
    """Train plain (CP) and normalized (NCP) conformal BiLSTM regressors
    on one EURUSD data file and write per-significance prediction
    intervals to CSV files.

    Args:
        i: index of the input CSV file; also selects write (i == 0)
           versus append mode for the output files.
    """
    window = 96
    p = {'window': window}
    algorithm = BiLSTM(p)

    # Raw string so the Windows-style backslashes are taken literally
    # ('\E' and '\N' are not escape sequences, but being explicit avoids
    # silent breakage if the filename ever changes).
    path = r'data\EURUSD_NETPOSUSD_hourly_for_regresion' + str(i) + '.csv'
    df = pd.read_csv(path).drop(['QdfTime', 'Unnamed: 0'], axis=1).fillna(0)
    y_raw_test = df.NetPosUsd[-120:]
    median_ = df.NetPosUsd.median()
    mad_ = mad(df.NetPosUsd.values)
    df.NetPosUsd = mlog_trans(df.NetPosUsd.values)

    data = df.NetPosUsd.values

    def generate_index(window, data_matrix):
        '''Yield sliding windows of length `window` over data_matrix,
        each reshaped to a column vector.'''
        num_elements = data_matrix.shape[0]
        for start, stop in zip(range(0, num_elements - window, 1),
                               range(window, num_elements, 1)):
            yield data_matrix[stop - window:stop].reshape((-1, 1))

    X = np.array(list(generate_index(window, data)))
    y = data[window:]
    X = X.reshape(X.shape[0], X.shape[1])

    # Chronological split: train / 3480-sample calibration / 120-sample test.
    train_test_split = X.shape[0] - 120 - 3480
    train = X[:train_test_split, :]
    calibrate = X[train_test_split:train_test_split + 3480, :]
    test = X[-120:]
    ytrain = y[:train_test_split]
    ycalibrate = y[train_test_split:train_test_split + 3480]
    ytest = y[-120:]

    # Normalized conformal regressor (50-NN error model).
    underlying_model = RegressorAdapter(algorithm)
    normalizing_model = RegressorAdapter(KNeighborsRegressor(n_neighbors=50))
    normalizer = RegressorNormalizer(underlying_model, normalizing_model,
                                     AbsErrorErrFunc())
    nc = RegressorNc(underlying_model, AbsErrorErrFunc(), normalizer)
    icp = IcpRegressor(nc)
    icp.fit(train, ytrain)
    icp.calibrate(calibrate, ycalibrate)

    # Plain conformal regressor (no normalization).
    underlying_model2 = RegressorAdapter(algorithm)
    nc2 = RegressorNc(underlying_model2, AbsErrorErrFunc())
    icp2 = IcpRegressor(nc2)
    icp2.fit(train, ytrain)
    icp2.calibrate(calibrate, ycalibrate)

    # Undo the mlog transform exactly once. (The original re-applied
    # mlog_inverse to ytest twice per loop iteration, repeatedly
    # re-transforming it in place; ytest is not used in the output, but
    # the repeated transform was a latent bug.)
    ytest = mlog_inverse(ytest, median_, mad_)

    def interval_frame(icp_model, significance, columns):
        # Predict intervals, map the bounds back to the original scale
        # and tabulate lower/upper bound, raw truth and the interval
        # midpoint (used as the point prediction).
        prediction = icp_model.predict(test, significance=significance)
        lower, upper = prediction[:, 0], prediction[:, 1]
        lower = mlog_inverse(lower, median_, mad_)
        upper = mlog_inverse(upper, median_, mad_)
        midpoint = upper / 2 + lower / 2
        table = np.vstack([lower, upper, y_raw_test, midpoint.T]).T
        return pd.DataFrame(table, columns=columns)

    def save_frame(frame, prefix, significance_pct):
        # First file (i == 0) is written fresh with a header; subsequent
        # calls append without one.
        filename = (prefix + '_' + 'cudaLSTM' + '_' +
                    str(np.round(significance_pct).astype(int)) + '_' +
                    'calibrationwindow' + str(3480) + '.csv')
        if i == 0:
            frame.to_csv(filename, encoding='utf-8', index=False)
        else:
            frame.to_csv(filename, mode='a', header=False, index=False)

    for a in tqdm(np.linspace(5, 95, 19)):
        dfncp = interval_frame(
            icp, a / 100, ['NCP_lower', 'NCP_upper', 'NetPosUsd', 'prediction'])
        dfcp = interval_frame(
            icp2, a / 100, ['CP_lower', 'CP_upper', 'NetPosUsd', 'prediction'])

        save_frame(dfcp, 'CP', a)
        save_frame(dfncp, 'NCP', a)
Example #7
0
def evaluate(model_filepath, train_filepath, test_filepath,
             calibrate_filepath):
    """Evaluate model to estimate power.

    When a calibration split exists, the trained model is wrapped in an
    inductive conformal regressor (ICP), calibrated, and used to predict
    intervals; each interval's midpoint is taken as the point
    prediction. Otherwise the loaded model predicts directly. MSE and R2
    are printed, plotted, and written to METRICS_FILE_PATH as JSON.

    Args:
        model_filepath (str): Path to model.
        train_filepath (str): Path to train set.
        test_filepath (str): Path to test set.
        calibrate_filepath (str): Path to calibrate set.

    """

    METRICS_FILE_PATH.parent.mkdir(parents=True, exist_ok=True)

    # Load parameters once, closing the handle (the original opened
    # params.yaml three times without ever closing the file objects).
    with open("params.yaml") as params_file:
        all_params = yaml.safe_load(params_file)
    params = all_params["evaluate"]  # NOTE(review): currently unused — confirm
    params_train = all_params["train"]
    params_split = all_params["split"]

    test = np.load(test_filepath)
    X_test = test["X"]
    y_test = test["y"]

    # pandas data frame to store predictions and ground truth.
    df_predictions = None

    y_pred = None

    if params_split["calibrate_split"] == 0:
        # No calibration set: plain point predictions.
        model = models.load_model(model_filepath)
        y_pred = model.predict(X_test)
    else:
        trained_model = models.load_model(model_filepath)
        mycustommodel = MyCustomModel(trained_model)

        # NOTE(review): `m` is built but only referenced by the
        # commented-out normalizer configuration below — confirm whether
        # normalization should be enabled before removing it.
        m = cnn(X_test.shape[-2],
                X_test.shape[-1],
                output_length=1,
                kernel_size=params_train["kernel_size"])

        nc = RegressorNc(
            mycustommodel,
            err_func=AbsErrorErrFunc(),  # non-conformity function
            # normalizer_model=KNeighborsRegressor(n_neighbors=15)  # normalizer
            # normalizer=m
        )

        model = IcpRegressor(nc)

        # Fit the underlying model (and normalizer, if configured).
        train = np.load(train_filepath)
        X_train = train["X"]
        y_train = train["y"]
        y_train = y_train.reshape((y_train.shape[0], ))
        model.fit(X_train, y_train)

        # Calibrate model.
        calibrate = np.load(calibrate_filepath)
        X_calibrate = calibrate["X"]
        y_calibrate = calibrate["y"]
        y_calibrate = y_calibrate.reshape((y_calibrate.shape[0], ))
        model.calibrate(X_calibrate, y_calibrate)

        print(f"Calibration: {X_calibrate.shape}")

        # Conformal prediction error (1 - confidence). This should be a
        # parameter specified by the user.
        error = 0.05

        # Predictions contain the interval bounds; the middle point of
        # each interval is the actual prediction y.
        predictions = model.predict(X_test, significance=error)

        # Compute middle points.
        y_pred = predictions[:, 0] + (predictions[:, 1] - predictions[:, 0]) / 2

        # Reshape to put it in the same format as without calibration set.
        y_pred = y_pred.reshape((y_pred.shape[0], 1))

        # Build data frame with predictions.
        my_results = list(
            zip(np.reshape(y_test, (y_test.shape[0], )),
                np.reshape(y_pred, (y_pred.shape[0], )), predictions[:, 0],
                predictions[:, 1]))

        df_predictions = pd.DataFrame(my_results,
                                      columns=[
                                          'ground_truth', 'predicted',
                                          'lower_bound', 'upper_bound'
                                      ])

        save_predictions(df_predictions)

        plot_intervals(df_predictions)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print("MSE: {}".format(mse))
    print("R2: {}".format(r2))

    plot_prediction(y_test, y_pred, inputs=X_test, info="(R2: {})".format(r2))
    plot_individual_predictions(y_test, y_pred)

    with open(METRICS_FILE_PATH, "w") as f:
        json.dump(dict(mse=mse, r2=r2), f)
Example #8
0
def run_experiment(cur_test_method, cur_dataset_name, cur_batch_size,
                   cur_lr_loss, cur_lr_dis, cur_loss_steps, cur_dis_steps,
                   cur_mu_val, cur_epochs, cur_model_type, cur_regression_type,
                   cur_random_state, cur_second_scale, num_experiments):

    method = cur_test_method

    seed = cur_random_state
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    dataset = cur_dataset_name

    batch_size = cur_batch_size

    # step size to minimize loss
    lr_loss = cur_lr_loss

    # step size used to fit GAN's classifier
    lr_dis = cur_lr_dis

    # inner epochs to fit loss
    loss_steps = cur_loss_steps

    # inner epochs to fit GAN's classifier
    dis_steps = cur_dis_steps

    # total number of epochs
    epochs = cur_epochs

    # utility loss
    if cur_regression_type == "mreg":
        cost_pred = torch.nn.MSELoss()
        out_shape = 1
    else:
        raise

    model_type = cur_model_type

    metric = "equalized_odds"

    print(dataset)
    print(method)
    sys.stdout.flush()

    avg_length_0 = np.zeros(num_experiments)
    avg_length_1 = np.zeros(num_experiments)

    avg_coverage_0 = np.zeros(num_experiments)
    avg_coverage_1 = np.zeros(num_experiments)

    avg_p_val = np.zeros(num_experiments)
    mse = np.zeros(num_experiments)

    for i in range(num_experiments):

        # Split into train and test
        X, A, Y, X_cal, A_cal, Y_cal, X_test, A_test, Y_test = get_dataset.get_train_test_data(
            base_path, dataset, seed + i)
        in_shape = X.shape[1]

        print("n train = " + str(X.shape[0]) + " p = " + str(X.shape[1]))
        print("n calibration = " + str(X_cal.shape[0]))
        print("n test = " + str(X_test.shape[0]))

        sys.stdout.flush()

        if method == "AdversarialDebiasing":

            class RegAdapter(RegressorAdapter):
                def __init__(self, model=None, fit_params=None, params=None):
                    super(RegAdapter, self).__init__(model, fit_params)
                    # Instantiate model
                    self.learner = adv_debiasing.AdvDebiasingRegLearner(
                        lr=lr_loss,
                        N_CLF_EPOCHS=loss_steps,
                        N_ADV_EPOCHS=dis_steps,
                        N_EPOCH_COMBINED=epochs,
                        cost_pred=cost_pred,
                        in_shape=in_shape,
                        batch_size=batch_size,
                        model_type=model_type,
                        out_shape=out_shape,
                        lambda_vec=cur_mu_val)

                def fit(self, x, y):
                    self.learner.fit(x, y)

                def predict(self, x):
                    return self.learner.predict(x)

        elif method == 'FairDummies':

            class RegAdapter(RegressorAdapter):
                def __init__(self, model=None, fit_params=None, params=None):
                    super(RegAdapter, self).__init__(model, fit_params)
                    # Instantiate model
                    self.learner = fair_dummies_learning.EquiRegLearner(
                        lr=lr_loss,
                        pretrain_pred_epochs=0,
                        pretrain_dis_epochs=0,
                        epochs=epochs,
                        loss_steps=loss_steps,
                        dis_steps=dis_steps,
                        cost_pred=cost_pred,
                        in_shape=in_shape,
                        batch_size=batch_size,
                        model_type=model_type,
                        lambda_vec=cur_mu_val,
                        second_moment_scaling=cur_second_scale,
                        out_shape=out_shape)

                def fit(self, x, y):
                    self.learner.fit(x, y)

                def predict(self, x):
                    return self.learner.predict(x)

        elif method == 'HGR':

            class RegAdapter(RegressorAdapter):
                def __init__(self, model=None, fit_params=None, params=None):
                    super(RegAdapter, self).__init__(model, fit_params)
                    # Instantiate model

                    self.learner = continuous_fairness.HGR_Reg_Learner(
                        lr=lr_loss,
                        epochs=epochs,
                        mu=cur_mu_val,
                        cost_pred=cost_pred,
                        in_shape=in_shape,
                        out_shape=out_shape,
                        batch_size=batch_size,
                        model_type=model_type)

                def fit(self, x, y):
                    self.learner.fit(x, y)

                def predict(self, x):
                    return self.learner.predict(x)

        elif method == 'Baseline':

            class RegAdapter(RegressorAdapter):
                """Conformal-regressor adapter for the baseline learner.

                Wraps an EquiRegLearner with every fairness mechanism
                switched off (zero adversary epochs/steps and a zero
                fairness penalty), i.e. a plain unconstrained predictor.
                """

                def __init__(self, model=None, fit_params=None, params=None):
                    super(RegAdapter, self).__init__(model, fit_params)
                    # Only prediction pre-training is enabled; all
                    # discriminator/fairness terms are zeroed out.
                    baseline_config = dict(
                        lr=lr_loss,
                        pretrain_pred_epochs=epochs,
                        pretrain_dis_epochs=0,
                        epochs=0,
                        loss_steps=0,
                        dis_steps=0,
                        cost_pred=cost_pred,
                        in_shape=in_shape,
                        batch_size=batch_size,
                        model_type=model_type,
                        lambda_vec=0,
                        second_moment_scaling=0,
                        out_shape=out_shape,
                    )
                    self.learner = fair_dummies_learning.EquiRegLearner(
                        **baseline_config)

                def fit(self, x, y):
                    # Training is handled entirely by the wrapped learner.
                    self.learner.fit(x, y)

                def predict(self, x):
                    # Prediction is handled entirely by the wrapped learner.
                    return self.learner.predict(x)

        # Wrap the method-specific learner in the conformal adapter.
        fairness_reg = RegAdapter(model=None)

        # Nonconformity score: absolute prediction error.
        if cur_regression_type == "mreg":
            nc = RegressorNc(fairness_reg, AbsErrorErrFunc())
        else:
            # NOTE(review): a bare `raise` with no active exception raises
            # "RuntimeError: No active exception to re-raise"; consider
            # `raise ValueError(...)` with a message instead.
            raise

        # function that extracts the group identifier
        # (presumably x is a (features, label) pair and x[0][0] is the
        # protected attribute prepended below -- TODO confirm)
        def condition(x, y=None):
            return int(x[0][0] > 0)

        # Group-conditional inductive conformal regressor: calibration
        # quantiles are computed separately per group.
        icp = IcpRegressor(nc, condition=condition)

        # The protected attribute A is prepended as the first feature
        # column so `condition` can read it.
        input_data_train = np.concatenate((A[:, np.newaxis], X), 1)
        icp.fit(input_data_train, Y)

        input_data_cal = np.concatenate((A_cal[:, np.newaxis], X_cal), 1)
        icp.calibrate(input_data_cal, Y_cal)

        # 90% prediction intervals (significance level 0.1).
        input_data_test = np.concatenate((A_test[:, np.newaxis], X_test), 1)
        Yhat_test = icp.predict(input_data_test, significance=0.1)

        # compute and print average coverage and average length
        coverage_sample, length_sample = compute_coverage_per_sample(
            Y_test, Yhat_test[:, 0], Yhat_test[:, 1], 0.1, method,
            input_data_test, condition)

        # Marginal coverage/length over all test points.
        avg_coverage, avg_length = compute_coverage_len(
            Y_test, Yhat_test[:, 0], Yhat_test[:, 1])
        # Per-group (0/1) averages for experiment i.
        avg_length_0[i] = np.mean(length_sample[0])
        avg_coverage_0[i] = np.mean(coverage_sample[0])
        avg_length_1[i] = np.mean(length_sample[1])
        avg_coverage_1[i] = np.mean(coverage_sample[1])

        # Point predictions from the underlying (non-conformal) learner.
        Yhat_out_cal = fairness_reg.learner.predict(input_data_cal)
        Yhat_out_test = fairness_reg.learner.predict(input_data_test)

        if out_shape == 1:
            mse[i] = np.mean((Yhat_out_test - Y_test)**2)
            # Trivial baseline: always predict the test-set mean.
            MSE_trivial = np.mean((np.mean(Y_test) - Y_test)**2)
            # NOTE(review): separator missing between the two values in
            # this message ("...<mse>MSE Trivial = ...").
            print("MSE = " + str(mse[i]) + "MSE Trivial = " + str(MSE_trivial))

        # p-value of the "fair dummies" equalized-odds hypothesis test.
        p_val = utility_functions.fair_dummies_test_regression(
            Yhat_out_cal,
            A_cal,
            Y_cal,
            Yhat_out_test,
            A_test,
            Y_test,
            num_reps=1,
            num_p_val_rep=1000,
            reg_func_name="Net")

        avg_p_val[i] = p_val

        print("experiment = " + str(i + 1))

        #        if out_shape==2:
        #            init_coverage, init_length = compute_coverage_len(Y_test, Yhat_out_test[:,0], Yhat_out_test[:,1])
        #            print("Init Coverage = " + str(init_coverage))
        #            print("Init Length = " + str(init_length))

        print("Coverage 0 = " + str(avg_coverage_0[i]))
        print("Coverage 1 = " + str(avg_coverage_1[i]))

        print("Length 0 = " + str(avg_length_0[i]))
        print("Length 1 = " + str(avg_length_1[i]))
        print("MSE = " + str(mse[i]))

        print("p_val = " + str(p_val))
        sys.stdout.flush()

        # Persist this experiment's summary row, accumulating across runs.
        outdir = './results/'
        if not os.path.exists(outdir):
            os.mkdir(outdir)

        out_name = outdir + 'results.csv'

        # Compact identifier for this configuration.
        full_name = cur_test_method + "_" + cur_model_type + "_" + cur_regression_type
        df = pd.DataFrame({
            'method': [cur_test_method],
            'dataset': [cur_dataset_name],
            'batch_size': [cur_batch_size],
            'lr_loss': [cur_lr_loss],
            'lr_dis': [cur_lr_dis],
            'loss_steps': [cur_loss_steps],
            'dis_steps': [cur_dis_steps],
            'mu_val': [cur_mu_val],
            'epochs': [cur_epochs],
            'random_state': [seed + i],
            'model_type': [cur_model_type],
            'metric': [metric],
            'cur_second_scale': [cur_second_scale],
            'regression_type': [cur_regression_type],
            'avg_length': [avg_length],
            'avg_coverage': [avg_coverage],
            'avg_length_0': [avg_length_0[i]],
            'avg_length_1': [avg_length_1[i]],
            'mse': [mse[i]],
            'avg_coverage_0': [avg_coverage_0[i]],
            'avg_coverage_1': [avg_coverage_1[i]],
            'p_val': [p_val],
            'full_name': [full_name]
        })

        # Merge with any previous results so the CSV accumulates rows.
        if os.path.isfile(out_name):
            df2 = pd.read_csv(out_name)
            df = pd.concat([df2, df], ignore_index=True)

        df.to_csv(out_name, index=False)

        print(full_name)
        # Running summary over experiments 0..i.
        print(
            "Num experiments %02d | Avg MSE = %.4f | Avg Length 0 = %.4f | Avg Length 1 = %.4f | Avg Coverage 0 = %.4f | Avg Coverage 1 = %.4f | Avg p_val = %.4f | min p_val = %.4f"
            % (i + 1, np.mean(mse[:i + 1]), np.mean(avg_length_0[:i + 1]),
               np.mean(avg_length_1[:i + 1]), np.mean(avg_coverage_0[:i + 1]),
               np.mean(avg_coverage_1[:i + 1]), np.mean(
                   avg_p_val[:i + 1]), np.min(avg_p_val[:i + 1])))
        print("======== Done =========")
        sys.stdout.flush()
# Beispiel #9 (Datei: RF.py, Projekt: e7dal/flame)
    def build(self):
        '''Build a new RF model with the X and Y numpy matrices.

        Trains a quantitative (regressor) or qualitative (classifier)
        Random Forest, optionally grid-search tuned, and -- when the
        'conformal' parameter is set -- wraps it in an aggregated
        conformal predictor fitted with bootstrap sampling.

        Returns
        -------
        (bool, list or str)
            (True, results) on success, where results is a list of
            (key, description, value) triples; (False, message) when an
            exception occurs while building.
        '''

        # Make a copy of data matrices
        X = self.X.copy()
        Y = self.Y.copy()

        results = []
        results.append(('nobj', 'number of objects', self.nobj))
        results.append(('nvarx', 'number of predictor variables', self.nvarx))
        results.append(('model', 'model type', 'RF'))

        conformal = self.param.getVal('conformal')
        # If tune then call gridsearch to optimize the estimator
        if self.param.getVal('tune'):

            LOG.info("Optimizing RF estimator")

            try:
                # Check type of model
                if self.param.getVal('quantitative'):
                    self.estimator = RandomForestRegressor(
                        **self.estimator_parameters)
                    self.optimize(X, Y, self.estimator, self.tune_parameters)
                    # results.append(('model','model type','RF quantitative (optimized)'))
                else:
                    self.estimator = RandomForestClassifier(
                        **self.estimator_parameters)
                    self.optimize(X, Y, self.estimator, self.tune_parameters)
                    # results.append(('model','model type','RF qualitative (optimized)'))

            except Exception as e:
                return False, f'Exception optimizing RF estimator with exception {e}'

        else:
            try:
                if self.param.getVal('quantitative'):

                    self.estimator = RandomForestRegressor(
                        **self.estimator_parameters)

                    if not conformal:
                        LOG.info("Building Quantitative RF model")
                        # results.append(('model', 'model type', 'RF quantitative'))
                else:

                    self.estimator = RandomForestClassifier(
                        **self.estimator_parameters)

                    if not conformal:
                        LOG.info("Building Qualitative RF model")
                        # results.append(('model', 'model type', 'RF qualitative'))

                self.estimator.fit(X, Y)

            except Exception as e:
                return False, f'Exception building RF estimator with exception {e}'

        # Non-conformal models are complete at this point.
        if not conformal:
            return True, results

        # Keep an independent copy of the fitted estimator: the
        # conformal aggregator below rebinds self.estimator.
        self.estimator_temp = copy(self.estimator)

        # Create the conformal estimator
        try:
            # Conformal regressor
            if self.param.getVal('quantitative'):
                conformal_settings = self.param.getDict('conformal_settings')
                LOG.info("Building conformal Quantitative RF model")

                underlying_model = RegressorAdapter(self.estimator_temp)
                # KNN model of the absolute error; used to normalize the
                # nonconformity scores (adaptive interval widths).
                self.normalizing_model = RegressorAdapter(
                    KNeighborsRegressor(
                        n_neighbors=conformal_settings['KNN_NN']))
                # normalizing_model = RegressorAdapter(self.estimator_temp)
                normalizer = RegressorNormalizer(underlying_model,
                                                 copy(self.normalizing_model),
                                                 AbsErrorErrFunc())
                nc = RegressorNc(underlying_model, AbsErrorErrFunc(),
                                 normalizer)

                # self.conformal_pred = AggregatedCp(IcpRegressor
                # (RegressorNc(RegressorAdapter(self.estimator))),
                #                                   BootstrapSampler())

                self.estimator = AggregatedCp(IcpRegressor(nc),
                                              BootstrapSampler())

                self.estimator.fit(X, Y)
                # results.append(('model', 'model type', 'conformal RF quantitative'))

            # Conformal classifier
            else:

                LOG.info("Building conformal Qualitative RF model")

                self.estimator = AggregatedCp(
                    IcpClassifier(
                        ClassifierNc(ClassifierAdapter(self.estimator_temp),
                                     MarginErrFunc())), BootstrapSampler())

                # Fit estimator to the data
                self.estimator.fit(X, Y)
                # results.append(('model', 'model type', 'conformal RF qualitative'))

        except Exception as e:
            return False, f'Exception building conformal RF estimator with exception {e}'

        return True, results


## Overriding of parent methods

# def CF_quantitative_validation(self):
#     ''' performs validation for conformal quantitative models '''

# def CF_qualitative_validation(self):
#     ''' performs validation for conformal qualitative models '''

# def quantitativeValidation(self):
#     ''' performs validation for quantitative models '''

# def qualitativeValidation(self):
#     ''' performs validation for qualitative models '''

# def validate(self):
#     ''' Validates the model and computes suitable model quality scoring values'''

# def optimize(self, X, Y, estimator, tune_parameters):
#     ''' optimizes a model using a grid search over a range of values for diverse parameters'''

# def regularProject(self, Xb, results):
#     ''' projects a collection of query objects in a regular model, for obtaining predictions '''

# def conformalProject(self, Xb, results):
#     ''' projects a collection of query objects in a conformal model, for obtaining predictions '''

# def project(self, Xb, results):
#     ''' Uses the X matrix provided as argument to predict Y'''
# Beispiel #10
    def test_cross_validation(self):
        """Smoke-test `cross_val_score` on five conformal setups:
        ICP classification (iris) and four ICP regression variants on
        diabetes -- absolute and signed error, each with and without a
        normalizing error model -- printing mean scores grouped by
        significance level."""
        # -----------------------------------------------------------------------------
        # Classification
        # -----------------------------------------------------------------------------
        data = load_iris()

        icp = IcpClassifier(
            ClassifierNc(
                ClassifierAdapter(RandomForestClassifier(n_estimators=100)),
                MarginErrFunc()))
        icp_cv = ClassIcpCvHelper(icp)

        scores = cross_val_score(
            icp_cv,
            data.data,
            data.target,
            iterations=5,
            folds=5,
            scoring_funcs=[class_mean_errors, class_avg_c],
            significance_levels=[0.05, 0.1, 0.2],
        )

        print("Classification: iris")
        scores = scores.drop(["fold", "iter"], axis=1)
        print(scores.groupby(["significance"]).mean())

        # -----------------------------------------------------------------------------
        # Regression, absolute error
        # -----------------------------------------------------------------------------
        data = load_diabetes()

        icp = IcpRegressor(
            RegressorNc(
                RegressorAdapter(RandomForestRegressor(n_estimators=100)),
                AbsErrorErrFunc()))
        icp_cv = RegIcpCvHelper(icp)

        scores = cross_val_score(
            icp_cv,
            data.data,
            data.target,
            iterations=5,
            folds=5,
            scoring_funcs=[reg_mean_errors, reg_median_size],
            significance_levels=[0.05, 0.1, 0.2],
        )

        print("Absolute error regression: diabetes")
        scores = scores.drop(["fold", "iter"], axis=1)
        print(scores.groupby(["significance"]).mean())

        # -----------------------------------------------------------------------------
        # Regression, normalized absolute error
        # -----------------------------------------------------------------------------
        data = load_diabetes()

        underlying_model = RegressorAdapter(
            RandomForestRegressor(n_estimators=100))
        normalizer_model = RegressorAdapter(
            RandomForestRegressor(n_estimators=100))
        normalizer = RegressorNormalizer(underlying_model, normalizer_model,
                                         AbsErrorErrFunc())
        nc = RegressorNc(underlying_model, AbsErrorErrFunc(), normalizer)

        icp = IcpRegressor(nc)
        icp_cv = RegIcpCvHelper(icp)

        scores = cross_val_score(
            icp_cv,
            data.data,
            data.target,
            iterations=5,
            folds=5,
            scoring_funcs=[reg_mean_errors, reg_median_size],
            significance_levels=[0.05, 0.1, 0.2],
        )

        print("Normalized absolute error regression: diabetes")
        scores = scores.drop(["fold", "iter"], axis=1)
        print(scores.groupby(["significance"]).mean())

        # -----------------------------------------------------------------------------
        # Regression, signed error
        # (header fixed: this setup has NO normalizer; it was mislabelled
        # "normalized signed error" -- the print below shows the correct label)
        # -----------------------------------------------------------------------------
        data = load_diabetes()

        icp = IcpRegressor(
            RegressorNc(
                RegressorAdapter(RandomForestRegressor(n_estimators=100)),
                SignErrorErrFunc()))
        icp_cv = RegIcpCvHelper(icp)

        scores = cross_val_score(
            icp_cv,
            data.data,
            data.target,
            iterations=5,
            folds=5,
            scoring_funcs=[reg_mean_errors, reg_median_size],
            significance_levels=[0.05, 0.1, 0.2],
        )

        print("Signed error regression: diabetes")
        scores = scores.drop(["fold", "iter"], axis=1)
        print(scores.groupby(["significance"]).mean())

        # -----------------------------------------------------------------------------
        # Regression, normalized signed error
        # (header fixed: this setup DOES use a normalizer; it was
        # mislabelled plain "signed error")
        # -----------------------------------------------------------------------------
        data = load_diabetes()

        underlying_model = RegressorAdapter(
            RandomForestRegressor(n_estimators=100))
        normalizer_model = RegressorAdapter(
            RandomForestRegressor(n_estimators=100))

        # The normalization model can use a different error function than is
        # used to measure errors on the underlying model
        normalizer = RegressorNormalizer(underlying_model, normalizer_model,
                                         AbsErrorErrFunc())
        nc = RegressorNc(underlying_model, SignErrorErrFunc(), normalizer)

        icp = IcpRegressor(nc)
        icp_cv = RegIcpCvHelper(icp)

        scores = cross_val_score(
            icp_cv,
            data.data,
            data.target,
            iterations=5,
            folds=5,
            scoring_funcs=[reg_mean_errors, reg_median_size],
            significance_levels=[0.05, 0.1, 0.2],
        )

        print("Normalized signed error regression: diabetes")
        scores = scores.drop(["fold", "iter"], axis=1)
        print(scores.groupby(["significance"]).mean())
def train_and_test_cp_algo(parameters):
    """Train conformal predictors over 29 hourly EURUSD datasets and
    write per-window prediction intervals to a CSV.

    For each dataset window the data is standardized, split into
    train/calibration/test (randomly or chronologically), and either a
    plain conformal predictor (CP) or a KNN-normalized one (NCP) is
    fitted; the de-standardized intervals are appended to
    '<WhichCP>_<algorithm>_<alpha%>_calibrationwindow<size>.csv'.

    Parameters
    ----------
    parameters : dict
        Must contain 'algorithm', 'randomized_calibration', 'alpha_',
        'calibration_size' and 'WhichCP'; every remaining key is passed
        to the selected estimator's constructor.
    """
    # Keys consumed here must not reach the estimator constructor.
    estimator_params = parameters.copy()
    for key in ('algorithm', 'randomized_calibration', 'alpha_',
                'calibration_size', 'WhichCP'):
        estimator_params.pop(key)

    # Lazily-evaluated factories: only the selected estimator is built.
    factories = {
        'RandomForest': lambda: RandomForestRegressor(**estimator_params),
        'K-NearestNeighbours': lambda: KNeighborsRegressor(**estimator_params),
        'LightGBM': lambda: LGBMRegressor(**estimator_params),
        'LassoRegression': lambda: Lasso(**estimator_params),
        'NeuralNetwork': lambda: NeuralNetworkAlgorithm(estimator_params),
        'LSTM': lambda: BiLSTM(**estimator_params),
        'GradientBoosting': lambda: GradientBoostingRegressor(**estimator_params),
    }
    algo_name = parameters.get('algorithm')
    if algo_name not in factories:
        # Previously an unknown name fell through to a NameError.
        raise ValueError(f'Unknown algorithm: {algo_name!r}')

    alpha = parameters.get('alpha_')
    cal_size = parameters.get('calibration_size')
    # Output filename is invariant across windows; build it once.
    out_file = (parameters.get('WhichCP') + '_' + algo_name + '_'
                + str(np.round(alpha * 100).astype(int)) + '_'
                + 'calibrationwindow' + str(cal_size) + '.csv')

    for i in tqdm(range(29)):
        algorithm = factories[algo_name]()

        # Forward slashes work on every platform and avoid the invalid
        # '\E' escape of the original backslash path.
        path = 'data/EURUSD_NETPOSUSD_hourly_for_regresion' + str(i) + '.csv'
        df = pd.read_csv(path).drop(['Unnamed: 0', 'QdfTime'], axis=1).fillna(0)
        # Target moments, kept to undo the standardization later.
        m, s = df['NetPosUsd'].mean(), df['NetPosUsd'].std()

        # Standardize every column.
        df = (df - df.mean(axis=0)) / df.std(axis=0)

        if parameters.get('randomized_calibration'):
            # Calibration rows are drawn at random from the training window.
            train_test_split = len(df) - 120
            train_ = df.drop(['NetPosUsd'], axis=1).iloc[:train_test_split, :].values
            choose = np.random.choice(len(train_), cal_size, replace=False)
            calibrate = train_[choose, :]
            mask = np.ones(len(train_), dtype=bool)
            mask[choose] = False
            train = train_[mask, :]
            test = df.drop(['NetPosUsd'], axis=1).iloc[train_test_split:, :].values

            ytrain_ = df['NetPosUsd'][:train_test_split].values
            ycalibrate = ytrain_[choose]
            ytrain = ytrain_[mask]
            ytest = df['NetPosUsd'].iloc[train_test_split:]
        else:
            # Chronological split: train | calibration | last 120 rows.
            train_test_split = len(df) - 120 - cal_size
            train = df.drop(['NetPosUsd'], axis=1).iloc[:train_test_split, :].values
            calibrate = df.drop(['NetPosUsd'], axis=1).iloc[
                train_test_split:train_test_split + cal_size, :].values
            test = df.drop(['NetPosUsd'], axis=1).iloc[-120:, :].values

            ytrain = df['NetPosUsd'][:train_test_split].values
            ycalibrate = df['NetPosUsd'][train_test_split:train_test_split + cal_size]
            ytest = df['NetPosUsd'].iloc[-120:]

        if parameters.get('WhichCP') == 'NCP':
            # Normalized CP: interval widths scale with a KNN model of
            # the underlying model's absolute error.
            underlying_model = RegressorAdapter(algorithm)
            normalizing_model = RegressorAdapter(
                KNeighborsRegressor(n_neighbors=50))
            normalizer = RegressorNormalizer(underlying_model,
                                             normalizing_model,
                                             AbsErrorErrFunc())
            nc = RegressorNc(underlying_model, AbsErrorErrFunc(), normalizer)
            header = ['NCP_lower', 'NCP_upper', 'NetPosUsd', 'prediction']
        else:
            # Plain (unnormalized) conformal predictor.
            underlying_model = RegressorAdapter(algorithm)
            nc = RegressorNc(underlying_model, AbsErrorErrFunc())
            header = ['CP_lower', 'CP_upper', 'NetPosUsd', 'prediction']

        icp = IcpRegressor(nc)
        icp.fit(train, ytrain)
        icp.calibrate(calibrate, ycalibrate)

        prediction = icp.predict(test, significance=alpha)
        # Interval midpoint doubles as the point prediction column.
        size = prediction[:, 1] / 2 + prediction[:, 0] / 2

        # Undo the target standardization before persisting.
        prediction = prediction * s + m
        ytest = ytest * s + m
        size = size * s + m

        table = np.vstack([prediction.T, ytest, size.T]).T
        dfncp = pd.DataFrame(table, columns=header)

        # First window creates the file; later windows append headerless.
        if i == 0:
            dfncp.to_csv(out_file, encoding='utf-8', index=False)
        else:
            dfncp.to_csv(out_file, mode='a', header=False, index=False)

        del algorithm
def cv(df, parameters):
    """Cross-validate CP and normalized-CP (NCP) interval quality for
    one hyper-parameter configuration.

    Three walk-forward folds (50%, 66%, 84% of the usable history) are
    evaluated; the qd_objective losses of both interval types are
    averaged and appended to '<algorithm>_cv.csv' together with the
    estimator parameters and the significance level.

    Parameters
    ----------
    df : pandas.DataFrame
        Hourly data with a 'NetPosUsd' target and a 'QdfTime' column
        (dropped before modelling).
    parameters : dict
        Contains 'algorithm', 'randomized_calibration', 'alpha_' plus
        the estimator's constructor keyword arguments.
    """
    end = len(df) - 120
    out = np.zeros(3)    # per-fold CP losses
    out2 = np.zeros(3)   # per-fold NCP losses

    # Strip the keys consumed here; the rest go to the estimator.
    p = parameters.copy()
    p.pop('algorithm')
    p.pop('randomized_calibration')
    p.pop('alpha_')

    # Lazily-evaluated factories: only the selected estimator is built.
    # (The original per-algorithm parameter dicts were dead code: an
    # unconditional `d = p` always overwrote them, so they are removed.)
    factories = {
        'RandomForest': lambda: RandomForestRegressor(**p),
        'K-NearestNeighbours': lambda: KNeighborsRegressor(**p),
        'LightGBM': lambda: LGBMRegressor(**p),
        'LassoRegression': lambda: Lasso(**p),
        'NeuralNetwork': lambda: NeuralNetworkAlgorithm(p),
        'LSTM': lambda: BiLSTM(**p),
    }
    algo_name = parameters.get('algorithm')
    if algo_name not in factories:
        # Previously an unknown name fell through to a NameError.
        raise ValueError(f'Unknown algorithm: {algo_name!r}')
    algorithm = factories[algo_name]()

    # Row to be logged: estimator kwargs plus the significance level.
    d = p
    d['alpha_'] = parameters.get('alpha_')

    # Target moments, kept to undo the standardization for the losses.
    m, s = df['NetPosUsd'].mean(), df['NetPosUsd'].std()
    df = df.drop(['QdfTime'], axis=1)
    df = (df - df.mean(axis=0)) / df.std(axis=0)

    for i, ratio in enumerate([.5, 0.66, .84]):
        fold = int(end / 6)       # calibration/test window length
        split = int(end * ratio)  # end of the training window

        if parameters.get('randomized_calibration'):
            # Calibration rows drawn at random from the training window.
            train_ = df.drop(['NetPosUsd'], axis=1).iloc[:split, :].values
            choose = np.random.choice(len(train_), fold, replace=False)
            calibrate = train_[choose, :]
            mask = np.ones(len(train_), dtype=bool)
            mask[choose] = False
            train = train_[mask, :]
            test = df.drop(['NetPosUsd'], axis=1).iloc[split:split + fold, :].values

            ytrain_ = df['NetPosUsd'][:split].values
            ycalibrate = ytrain_[choose]
            ytrain = ytrain_[mask]
            ytest = df['NetPosUsd'].iloc[split:split + fold]
        else:
            # Chronological split: train | calibration | test windows.
            train = df.drop(['NetPosUsd'], axis=1).iloc[:split - fold, :].values
            calibrate = df.drop(['NetPosUsd'], axis=1).iloc[split - fold:split, :].values
            test = df.drop(['NetPosUsd'], axis=1).iloc[split:split + fold, :].values

            ytrain = df['NetPosUsd'][:split - fold].values
            ycalibrate = df['NetPosUsd'][split - fold:split].values
            ytest = df['NetPosUsd'][split:split + fold].values

        # ---- Normalized conformal predictor (NCP) ----
        underlying_model = RegressorAdapter(algorithm)
        normalizing_model = RegressorAdapter(KNeighborsRegressor(n_neighbors=50))
        normalizer = RegressorNormalizer(underlying_model, normalizing_model,
                                         AbsErrorErrFunc())
        nc = RegressorNc(underlying_model, AbsErrorErrFunc(), normalizer)

        icp = IcpRegressor(nc)
        icp.fit(train, ytrain)
        icp.calibrate(calibrate, ycalibrate)

        prediction = icp.predict(test, significance=parameters.get('alpha_'))
        header = ['NCP_lower', 'NCP_upper', 'NetPosUsd', 'prediction']
        # Interval midpoint doubles as the point prediction column.
        size = prediction[:, 1] / 2 + prediction[:, 0] / 2

        # Undo the target standardization.
        prediction = prediction * s + m
        ytest = ytest * s + m
        size = size * s + m

        table = np.vstack([prediction.T, ytest, size.T]).T
        dfncp = pd.DataFrame(table, columns=header)

        # ---- Plain conformal predictor (CP) ----
        underlying_model = RegressorAdapter(algorithm)
        nc = RegressorNc(underlying_model, AbsErrorErrFunc())
        icp = IcpRegressor(nc)
        icp.fit(train, ytrain)
        icp.calibrate(calibrate, ycalibrate)

        prediction = icp.predict(test, significance=parameters.get('alpha_'))
        prediction = prediction * s + m

        dfcp = pd.DataFrame(np.vstack([prediction.T]).T,
                            columns=['cp_lower', 'cp_upper'])
        dfncp['CP_lower'] = dfcp['cp_lower']
        dfncp['CP_upper'] = dfcp['cp_upper']

        out[i] = qd_objective(dfncp.NetPosUsd, dfncp['CP_lower'],
                              dfncp['CP_upper'], parameters.get('alpha_'))
        out2[i] = qd_objective(dfncp.NetPosUsd, dfncp['NCP_lower'],
                               dfncp['NCP_upper'], parameters.get('alpha_'))

    d['CP_loss'] = np.mean(out)
    d['NCP_loss'] = np.mean(out2)

    # Append to the per-algorithm CV file, creating it on first write.
    out_file = parameters.get('algorithm') + '_cv.csv'
    if os.path.exists(out_file):
        pd.DataFrame(data=d, index=[0]).to_csv(out_file, mode='a',
                                               header=False, index=False)
    else:
        pd.DataFrame(data=d, index=[0]).to_csv(out_file, encoding='utf-8',
                                               index=False)
# Beispiel #13
    def build(self):
        '''Build a new RF model with the X and Y numpy matrices.

        Trains a quantitative (regressor) or qualitative (classifier)
        Random Forest, optionally grid-search tuned, and -- when
        self.conformal is set -- additionally fits an aggregated
        conformal predictor stored in self.conformal_pred.

        Returns
        -------
        False or (bool, list)
            False when the object is flagged as failed; otherwise
            (True, results) where results is a list of
            (key, description, value) triples.
        '''
        if self.failed:
            return False

        # Work on copies so tuning/fitting cannot mutate the originals.
        X = self.X.copy()
        Y = self.Y.copy()

        results = []
        results.append(('nobj', 'number of objects', self.nobj))
        results.append(('nvarx', 'number of predictor variables', self.nvarx))

        if self.cv:
            self.cv = getCrossVal(self.cv,
                                  self.estimator_parameters["random_state"],
                                  self.n, self.p)
        if self.tune:
            # Grid search; optimize() is expected to set self.estimator.
            if self.quantitative:
                self.optimize(X, Y, RandomForestRegressor(),
                              self.tune_parameters)
                results.append(
                    ('model', 'model type', 'RF quantitative (optimized)'))
            else:
                self.optimize(X, Y, RandomForestClassifier(),
                              self.tune_parameters)
                results.append(
                    ('model', 'model type', 'RF qualitative (optimized)'))
        else:
            if self.quantitative:
                log.info("Building Quantitative RF model")
                # class_weight only applies to classifiers.
                self.estimator_parameters.pop('class_weight', None)

                self.estimator = RandomForestRegressor(
                    **self.estimator_parameters)
                results.append(('model', 'model type', 'RF quantitative'))
            else:
                log.info("Building Qualitative RF model")
                self.estimator = RandomForestClassifier(
                    **self.estimator_parameters)
                results.append(('model', 'model type', 'RF qualitative'))

        if self.conformal:
            if self.quantitative:
                underlying_model = RegressorAdapter(self.estimator)
                # The estimator itself models its own absolute error for
                # normalization. A KNN-based normalizer assignment that was
                # immediately overwritten (dead code) has been removed;
                # NOTE(review): the KNN normalizer may have been the original
                # intent -- confirm against newer versions of this module.
                normalizing_model = RegressorAdapter(self.estimator)
                normalizer = RegressorNormalizer(underlying_model,
                                                 normalizing_model,
                                                 AbsErrorErrFunc())
                nc = RegressorNc(underlying_model, AbsErrorErrFunc(),
                                 normalizer)
                # self.conformal_pred = AggregatedCp(IcpRegressor(RegressorNc(RegressorAdapter(self.estimator))),
                #                                   BootstrapSampler())

                self.conformal_pred = AggregatedCp(IcpRegressor(nc),
                                                   BootstrapSampler())
                self.conformal_pred.fit(X, Y)
                # overrides non-conformal
                results.append(
                    ('model', 'model type', 'conformal RF quantitative'))

            else:
                self.conformal_pred = AggregatedCp(
                    IcpClassifier(
                        ClassifierNc(ClassifierAdapter(self.estimator),
                                     MarginErrFunc())), BootstrapSampler())
                self.conformal_pred.fit(X, Y)
                # overrides non-conformal
                results.append(
                    ('model', 'model type', 'conformal RF qualitative'))

        # The plain (non-conformal) estimator is always fitted as well.
        self.estimator.fit(X, Y)

        return True, results


#### Overriding of parent methods

# def CF_quantitative_validation(self):
#     ''' performs validation for conformal quantitative models '''

# def CF_qualitative_validation(self):
#     ''' performs validation for conformal qualitative models '''

# def quantitativeValidation(self):
#     ''' performs validation for quantitative models '''

# def qualitativeValidation(self):
#     ''' performs validation for qualitative models '''

# def validate(self):
#     ''' Validates the model and computes suitable model quality scoring values'''

# def optimize(self, X, Y, estimator, tune_parameters):
#     ''' optimizes a model using a grid search over a range of values for diverse parameters'''

# def regularProject(self, Xb, results):
#     ''' projects a collection of query objects in a regular model, for obtaining predictions '''

# def conformalProject(self, Xb, results):
#     ''' projects a collection of query objects in a conformal model, for obtaining predictions '''

# def project(self, Xb, results):
#     ''' Uses the X matrix provided as argument to predict Y'''
    def fit(self, rows_treat, labels_treat, rows_control, labels_control):
        """Fit the root of the causal tree and grow it recursively.

        Each arm (treated / control) is split 50/50 into a proper-training
        half and a calibration half for split-conformal regression; the
        root node's objective and interval statistics are computed on the
        pooled calibration rows, then ``fit_r`` grows the tree.

        Parameters
        ----------
        rows_treat, labels_treat : ndarray
            Covariates and outcomes of the treated units.
        rows_control, labels_control : ndarray
            Covariates and outcomes of the control units.

        Side effects: sets ``self.obj``, ``self.curr_leaves`` and
        ``self.root``.  Raises ``ValueError`` for an unsupported
        ``conformal_mode`` and ``Exception`` if the internal estimator
        keeps failing.
        """
        if rows_treat.shape[0] == 0:
            # NOTE(review): the empty-input case returns a bare Node while
            # the normal path returns None (the tree lives on self.root) —
            # confirm callers expect this asymmetry.
            return self.Node()

        if self.seed is not None:
            np.random.seed(self.seed)

        # split for conformal regression
        train_rows_treat, val_rows_treat, train_outcome_treat, val_labels_treat = \
            train_test_split(rows_treat, labels_treat, shuffle=True, test_size=0.5)
        train_rows_control, val_rows_control, train_outcome_control, val_labels_control = \
            train_test_split(rows_control, labels_control, shuffle=True, test_size=0.5)

        # Retry the (possibly stochastic) internal estimator a bounded number
        # of times; a truthy return from fit() signals an internal error.
        error_no_tmp = 0
        FIT_FLAG = True
        while (FIT_FLAG):
            x_train = np.concatenate([train_rows_treat, train_rows_control])
            y_train = np.concatenate(
                [train_outcome_treat, train_outcome_control])
            # Binary treatment indicator: 1 for treated rows, 0 for control.
            w_train = np.zeros(x_train.shape[0])
            w_train[0:train_rows_treat.shape[0]] = 1
            FIT_FLAG = self.estimator_treat.model.fit(x_train, y_train,
                                                      w_train)
            error_no_tmp = error_no_tmp + 1
            if error_no_tmp > 2:
                # error occur request new datasets
                raise Exception('Too many errors occur in internal estimator.')

        # do conformal prediction
        total_val_no_treat = val_rows_treat.shape[0]
        total_val_no_control = val_rows_control.shape[0]

        if self.conformal_mode == "SCR":
            nc_treat = RegressorNc_r2p(self.estimator_treat, AbsErrorErrFunc())
            nc_control = RegressorNc_r2p(self.estimator_control,
                                         AbsErrorErrFunc())
        else:
            # Fix: an unrecognized mode previously fell through to an opaque
            # NameError on nc_treat below; fail fast with a clear message.
            raise ValueError(
                'Unsupported conformal_mode: {!r}'.format(self.conformal_mode))

        # Fit + calibrate one inductive conformal regressor per arm.
        icp_treat = IcpRegressor_r2p(nc_treat)
        icp_treat.fit(
            train_rows_treat,
            train_outcome_treat.reshape((train_outcome_treat.shape[0], 1)))
        icp_treat.calibrate(val_rows_treat, val_labels_treat)
        cal_scores_treat = icp_treat.cal_scores

        icp_control = IcpRegressor_r2p(nc_control)
        icp_control.fit(
            train_rows_control,
            train_outcome_control.reshape((train_outcome_control.shape[0], 1)))
        icp_control.calibrate(val_rows_control, val_labels_control)
        cal_scores_control = icp_control.cal_scores

        # Point CATE estimates on the calibration rows:
        # (treated-model prediction) - (control-model prediction).
        val_est_treat_treat = self.estimator_treat.predict(val_rows_treat)
        val_est_treat_control = self.estimator_control.predict(val_rows_treat)
        val_est_treat_CATE = val_est_treat_treat - val_est_treat_control
        val_est_control_treat = self.estimator_treat.predict(val_rows_control)
        val_est_control_control = self.estimator_control.predict(
            val_rows_control)
        val_est_control_CATE = val_est_control_treat - val_est_control_control
        val_est = np.concatenate([val_est_treat_CATE, val_est_control_CATE])
        est_mean = float(np.mean(val_est))

        # calculate partition measure
        val_rows = np.concatenate([val_rows_treat, val_rows_control])
        val_rows_est_treat = np.concatenate(
            [val_est_treat_treat, val_est_control_treat])
        val_rows_est_control = np.concatenate(
            [val_est_treat_control, val_est_control_control])

        # Treatment-effect interval at the reporting significance level.
        intv_treat = icp_treat.predict(val_rows,
                                       significance=self.significance,
                                       est_input=val_rows_est_treat)
        intv_control = icp_control.predict(val_rows,
                                           significance=self.significance,
                                           est_input=val_rows_est_control)
        intv = self.get_TE_CI(intv_treat, intv_control)
        intv_len = np.mean(intv[:, 1] - intv[:, 0])

        # Second interval at the (possibly different) splitting significance,
        # used by the partition objective.
        intv_treat_split = icp_treat.predict(val_rows,
                                             significance=self.sig_for_split,
                                             est_input=val_rows_est_treat)
        intv_control_split = icp_control.predict(
            val_rows,
            significance=self.sig_for_split,
            est_input=val_rows_est_control)
        intv_split = self.get_TE_CI(intv_treat_split, intv_control_split)
        intv_len_split = np.mean(intv_split[:, 1] - intv_split[:, 0])

        obj, intv_measure, homogeneity, obj_real = \
            self.eval_func(intv, intv_split, est_mean, total_val_no_treat, total_val_no_control)

        # Re-seed so tree growth below is reproducible regardless of how many
        # random draws the steps above consumed.
        if self.seed is not None:
            np.random.seed(self.seed)

        self.obj = obj
        self.curr_leaves = 1
        self.root = self.Node(col=-1,
                              value=None,
                              obj=obj,
                              homogeneity=homogeneity,
                              intv_len=intv_len,
                              est_treat_treat=val_est_treat_treat,
                              est_treat_control=val_est_treat_control,
                              est_control_treat=val_est_control_treat,
                              est_control_control=val_est_control_control,
                              conf_pred_treat=icp_treat,
                              confl_pred_control=icp_control,
                              cal_scores_treat=cal_scores_treat,
                              cal_scores_control=cal_scores_control,
                              node_depth=0)

        # Grow the tree recursively from the freshly-built root.
        self.root = self.fit_r(rows_treat,
                               labels_treat,
                               rows_control,
                               labels_control,
                               curr_depth=0,
                               node=self.root,
                               val_rows_treat=val_rows_treat,
                               val_labels_treat=val_labels_treat,
                               val_rows_control=val_rows_control,
                               val_labels_control=val_labels_control,
                               total_val_no_treat=total_val_no_treat,
                               total_val_no_control=total_val_no_control)
from nonconformist.base import RegressorAdapter
from nonconformist.icp import IcpRegressor
from nonconformist.nc import RegressorNc, AbsErrorErrFunc

# -----------------------------------------------------------------------------
# Setup training, calibration and test indices
# -----------------------------------------------------------------------------
data = load_boston()

# Random split into thirds: proper training / calibration / test.
idx = np.random.permutation(data.target.size)
train = idx[:int(idx.size / 3)]
calibrate = idx[int(idx.size / 3):int(2 * idx.size / 3)]
test = idx[int(2 * idx.size / 3):]

# -----------------------------------------------------------------------------
# Train and calibrate
# -----------------------------------------------------------------------------
icp = IcpRegressor(
    RegressorNc(RegressorAdapter(DecisionTreeRegressor()), AbsErrorErrFunc()))
icp.fit(data.data[train, :], data.target[train])
icp.calibrate(data.data[calibrate, :], data.target[calibrate])

# -----------------------------------------------------------------------------
# Predict
# -----------------------------------------------------------------------------
prediction = icp.predict(data.data[test, :], significance=0.1)
# Fix: pass the labels via `columns=` instead of vstacking the string header
# onto the numeric table, which coerced every value to object/str dtype.
header = ['min', 'max', 'Truth']
table = np.vstack([prediction.T, data.target[test]]).T
df = pd.DataFrame(table, columns=header)
print(df)
Beispiel #16
0
    def build(self):
        """Build (and optionally tune) the quantitative PLSR estimator.

        If the 'conformal' parameter is set, the fitted estimator is
        additionally wrapped into an aggregated conformal regressor.

        Returns
        -------
        tuple
            ``(True, results)`` on success, where ``results`` is a list of
            ``(key, description, value)`` tuples, or ``(False, message)``
            on failure.
        """

        # Work on copies so tuning/fitting never mutates the stored matrices.
        X = self.X.copy()
        Y = self.Y.copy()

        results = []
        results.append(('nobj', 'number of objects', self.nobj))
        results.append(('nvarx', 'number of predictor variables', self.nvarx))

        if self.param.getVal('tune'):

            # Optimize estimator using sklearn-gridsearch
            if self.estimator_parameters['optimize'] == 'auto':
                try:
                    LOG.info('Optimizing PLSR using SK-LearnGridSearch')

                    # Remove the 'optimize' meta-key so the sklearn estimator
                    # does not receive an unexpected keyword.  pop with a
                    # default tolerates the key being absent (plain pop would
                    # raise KeyError).
                    self.estimator_parameters.pop('optimize', None)

                    super(PLSR, self).optimize(X, Y, PLS_r(
                        **self.estimator_parameters),
                        self.param.getDict('PLSR_optimize'))

                except Exception as e:
                    LOG.error(f'Error performing SK-LearnGridSearch'
                              f' on PLSR estimator with exception {e}')
                    return False, f'Error performing SK-LearnGridSearch on PLSR estimator with exception {e}'

            # Optimize using flame implementation (recommended)
            elif self.estimator_parameters['optimize'] == 'manual':

                LOG.info('Optimizing PLSR using manual method')

                # Same meta-key removal as in the 'auto' branch.
                self.estimator_parameters.pop('optimize', None)

                success, message = self.optimize(X, Y, PLS_r(
                    **self.estimator_parameters),
                    self.param.getDict('PLSR_optimize'))

                if not success:
                    return False, message

            else:
                LOG.error('Type of tune not recognized, check the input')
                return False, 'Type of tune not recognized, check the input'

            results.append(('model', 'model type', 'PLSR quantitative (optimized)'))

        else:
            LOG.info('Building Quantitative PLSR with no optimization')
            try:
                # Remove the meta-key here as well before instantiation.
                self.estimator_parameters.pop('optimize', None)

                self.estimator = PLS_r(**self.estimator_parameters)
            except Exception as e:
                LOG.error(f'Error at PLS_r instantiation with '
                          f'exception {e}')
                # Fixed: the message previously said 'PLS_da' although this
                # branch instantiates PLS_r.
                return False, f'Error at PLS_r instantiation with exception {e}'

            results.append(('model', 'model type', 'PLSR quantitative'))

        # Fit estimator to the data
        self.estimator.fit(X, Y)

        if not self.param.getVal('conformal'):
            return True, results

        # Keep a copy of the plain estimator: the conformal wrapper below
        # replaces self.estimator and uses this copy as its underlying model.
        self.estimator_temp = copy(self.estimator)
        try:

            LOG.info('Building PLSR aggregated conformal predictor')

            underlying_model = RegressorAdapter(self.estimator_temp)
            # The same model also drives error normalization (a KNN-based
            # normalizer would be a possible alternative).
            normalizing_model = RegressorAdapter(self.estimator_temp)
            normalizer = RegressorNormalizer(underlying_model, normalizing_model, AbsErrorErrFunc())

            nc = RegressorNc(underlying_model, AbsErrorErrFunc(), normalizer)
            self.estimator = AggregatedCp(IcpRegressor(nc), BootstrapSampler())

        except Exception as e:
            LOG.error(f'Error building aggregated PLSR conformal'
                        f' regressor with exception: {e}')
            return False, f'Error building aggregated PLSR conformal regressor with exception: {e}'

        # Fit conformal estimator to the data
        self.estimator.fit(X, Y)

        # overrides non-conformal
        results.append(('model', 'model type', 'conformal PLSR quantitative'))

        return True, results
    def run(self):
        """Validate the trained model and attach confidence machinery.

        Predicts on the validation/test splits, sanity-checks per-column
        accuracy, estimates column importances by re-predicting with each
        input column masked, fits one ProbabilisticValidator per output
        column, and finally trains + calibrates an inductive conformal
        predictor (ICP) per eligible target.  All results are written into
        ``self.transaction.lmd`` / ``self.transaction.hmd``; nothing is
        returned.
        """
        np.seterr(divide='warn', invalid='warn')
        """
        # Runs the model on the validation set in order to fit a probabilistic model that will evaluate the accuracy of future predictions
        """

        # Input columns = every known column that is neither predicted nor
        # explicitly ignored.
        output_columns = self.transaction.lmd['predict_columns']
        input_columns = [
            col for col in self.transaction.lmd['columns']
            if col not in output_columns
            and col not in self.transaction.lmd['columns_to_ignore']
        ]

        # Make predictions on the validation dataset normally and with various columns missing
        normal_predictions = self.transaction.model_backend.predict('validate')

        normal_predictions_test = self.transaction.model_backend.predict(
            'test')
        normal_accuracy = evaluate_accuracy(
            normal_predictions,
            self.transaction.input_data.validation_df,
            self.transaction.lmd['stats_v2'],
            output_columns,
            backend=self.transaction.model_backend)

        # Sanity check per target: fail if the model does no better than a
        # balanced guess (categorical) or has negative R^2 (numeric).
        for col in output_columns:
            if self.transaction.lmd['tss']['is_timeseries']:
                # Timeseries: only rows flagged for prediction are scored.
                reals = list(self.transaction.input_data.validation_df[
                    self.transaction.input_data.
                    validation_df['make_predictions'] == True][col])
            else:
                reals = self.transaction.input_data.validation_df[col]
            preds = normal_predictions[col]

            fails = False

            data_type = self.transaction.lmd['stats_v2'][col]['typing'][
                'data_type']
            data_subtype = self.transaction.lmd['stats_v2'][col]['typing'][
                'data_subtype']

            if data_type == DATA_TYPES.CATEGORICAL:
                if data_subtype == DATA_SUBTYPES.TAGS:
                    # Tags are multi-label: compare one-hot argmax encodings.
                    encoder = self.transaction.model_backend.predictor._mixer.encoders[
                        col]
                    if balanced_accuracy_score(
                            encoder.encode(reals).argmax(axis=1),
                            encoder.encode(preds).argmax(
                                axis=1)) <= self.transaction.lmd['stats_v2'][
                                    col]['balanced_guess_probability']:
                        fails = True
                else:
                    if balanced_accuracy_score(
                            reals, preds) <= self.transaction.lmd['stats_v2'][
                                col]['balanced_guess_probability']:
                        fails = True
            elif data_type == DATA_TYPES.NUMERIC:
                if r2_score(reals, preds) < 0:
                    fails = True
            else:
                pass

            if fails:
                # Unless force_predict is set, replace session.predict with a
                # stub that raises, so the failed model cannot be used.
                if not self.transaction.lmd['force_predict']:

                    def predict_wrapper(*args, **kwargs):
                        raise Exception('Failed to train model')

                    self.session.predict = predict_wrapper
                log.error('Failed to train model to predict {}'.format(col))

        # Re-predict with each ignorable input column masked, to measure how
        # much accuracy each column contributes.
        empty_input_predictions = {}
        empty_input_accuracy = {}
        empty_input_predictions_test = {}

        # File-path columns and timeseries order-by columns cannot be masked.
        ignorable_input_columns = [
            x for x in input_columns if self.transaction.lmd['stats_v2'][x]
            ['typing']['data_type'] != DATA_TYPES.FILE_PATH and (
                not self.transaction.lmd['tss']['is_timeseries']
                or x not in self.transaction.lmd['tss']['order_by'])
        ]

        for col in ignorable_input_columns:
            empty_input_predictions[
                col] = self.transaction.model_backend.predict(
                    'validate', ignore_columns=[col])
            empty_input_predictions_test[
                col] = self.transaction.model_backend.predict(
                    'test', ignore_columns=[col])
            empty_input_accuracy[col] = evaluate_accuracy(
                empty_input_predictions[col],
                self.transaction.input_data.validation_df,
                self.transaction.lmd['stats_v2'],
                output_columns,
                backend=self.transaction.model_backend)

        # Get some information about the importance of each column
        self.transaction.lmd['column_importances'] = {}
        for col in ignorable_input_columns:
            # Importance = accuracy lost when the column is masked.
            accuracy_increase = (normal_accuracy - empty_input_accuracy[col])
            # normalize from 0 to 10
            self.transaction.lmd['column_importances'][col] = 10 * max(
                0, accuracy_increase)

        # Run Probabilistic Validator
        overall_accuracy_arr = []
        self.transaction.lmd['accuracy_histogram'] = {}
        self.transaction.lmd['confusion_matrices'] = {}
        self.transaction.lmd['accuracy_samples'] = {}
        self.transaction.hmd['probabilistic_validators'] = {}

        self.transaction.lmd['train_data_accuracy'] = {}
        self.transaction.lmd['test_data_accuracy'] = {}
        self.transaction.lmd['valid_data_accuracy'] = {}

        # Per-target accuracy on each of the three splits.
        for col in output_columns:

            # Training data accuracy
            predictions = self.transaction.model_backend.predict(
                'predict_on_train_data',
                ignore_columns=self.transaction.lmd['stats_v2']
                ['columns_to_ignore'])
            self.transaction.lmd['train_data_accuracy'][
                col] = evaluate_accuracy(
                    predictions,
                    self.transaction.input_data.train_df,
                    self.transaction.lmd['stats_v2'], [col],
                    backend=self.transaction.model_backend)

            # Testing data accuracy
            predictions = self.transaction.model_backend.predict(
                'test',
                ignore_columns=self.transaction.lmd['stats_v2']
                ['columns_to_ignore'])
            self.transaction.lmd['test_data_accuracy'][
                col] = evaluate_accuracy(
                    predictions,
                    self.transaction.input_data.test_df,
                    self.transaction.lmd['stats_v2'], [col],
                    backend=self.transaction.model_backend)

            # Validation data accuracy
            predictions = self.transaction.model_backend.predict(
                'validate',
                ignore_columns=self.transaction.lmd['stats_v2']
                ['columns_to_ignore'])
            self.transaction.lmd['valid_data_accuracy'][
                col] = evaluate_accuracy(
                    predictions,
                    self.transaction.input_data.validation_df,
                    self.transaction.lmd['stats_v2'], [col],
                    backend=self.transaction.model_backend)

        # Fit one probabilistic validator per target on the test predictions
        # (normal run plus each masked-column run).
        for col in output_columns:
            pval = ProbabilisticValidator(
                col_stats=self.transaction.lmd['stats_v2'][col],
                col_name=col,
                input_columns=input_columns)
            predictions_arr = [normal_predictions_test] + [
                x for x in empty_input_predictions_test.values()
            ]

            pval.fit(self.transaction.input_data.test_df, predictions_arr,
                     [[ignored_column]
                      for ignored_column in empty_input_predictions_test])
            overall_accuracy, accuracy_histogram, cm, accuracy_samples = pval.get_accuracy_stats(
            )
            overall_accuracy_arr.append(overall_accuracy)

            self.transaction.lmd['accuracy_histogram'][
                col] = accuracy_histogram
            self.transaction.lmd['confusion_matrices'][col] = cm
            self.transaction.lmd['accuracy_samples'][col] = accuracy_samples
            # Validators are pickled for persistence in hmd.
            self.transaction.hmd['probabilistic_validators'][col] = pickle_obj(
                pval)

        self.transaction.lmd['validation_set_accuracy'] = sum(
            overall_accuracy_arr) / len(overall_accuracy_arr)

        # conformal prediction confidence estimation
        self.transaction.lmd['stats_v2']['train_std_dev'] = {}
        self.transaction.hmd['label_encoders'] = {}
        self.transaction.hmd['icp'] = {'active': False}

        for target in output_columns:
            data_type = self.transaction.lmd['stats_v2'][target]['typing'][
                'data_type']
            data_subtype = self.transaction.lmd['stats_v2'][target]['typing'][
                'data_subtype']
            is_classification = data_type == DATA_TYPES.CATEGORICAL

            # The ICP adapter must ignore every column that is not an input
            # for this particular target.
            fit_params = {
                'target': target,
                'all_columns': self.transaction.lmd['columns'],
                'columns_to_ignore': []
            }
            fit_params['columns_to_ignore'].extend(
                self.transaction.lmd['columns_to_ignore'])
            fit_params['columns_to_ignore'].extend(
                [col for col in output_columns if col != target])

            if is_classification:
                if data_subtype != DATA_SUBTYPES.TAGS:
                    # Collect every class seen in any split so the one-hot
                    # encoder covers the full label space.
                    all_targets = [
                        elt[1][target].values for elt in inspect.getmembers(
                            self.transaction.input_data)
                        if elt[0] in {'test_df', 'train_df', 'validation_df'}
                    ]
                    all_classes = np.unique(
                        np.concatenate([np.unique(arr)
                                        for arr in all_targets]))

                    enc = OneHotEncoder(sparse=False, handle_unknown='ignore')
                    enc.fit(all_classes.reshape(-1, 1))
                    fit_params['one_hot_enc'] = enc
                    self.transaction.hmd['label_encoders'][target] = enc
                else:
                    fit_params['one_hot_enc'] = None
                    self.transaction.hmd['label_encoders'][target] = None

                adapter = ConformalClassifierAdapter
                nc_function = MarginErrFunc(
                )  # better than IPS as we'd need the complete distribution over all classes
                nc_class = ClassifierNc
                icp_class = IcpClassifier

            else:
                adapter = ConformalRegressorAdapter
                nc_function = AbsErrorErrFunc()
                nc_class = RegressorNc
                icp_class = IcpRegressor

            # ICP is only built for numeric targets and non-TAGS categorical
            # targets, and never for timeseries.
            if (data_type == DATA_TYPES.NUMERIC or
                (is_classification and data_subtype != DATA_SUBTYPES.TAGS)
                ) and not self.transaction.lmd['tss']['is_timeseries']:
                model = adapter(self.transaction.model_backend.predictor,
                                fit_params=fit_params)
                nc = nc_class(model, nc_function)

                X = deepcopy(self.transaction.input_data.train_df)
                y = X.pop(target)

                if is_classification:
                    self.transaction.hmd['icp'][target] = icp_class(
                        nc, smoothing=False)
                else:
                    # Train-set std dev is stored to scale regression
                    # confidence later.
                    self.transaction.hmd['icp'][target] = icp_class(nc)
                    self.transaction.lmd['stats_v2']['train_std_dev'][
                        target] = self.transaction.input_data.train_df[
                            target].std()

                X = clean_df(X, self.transaction.lmd['stats_v2'],
                             output_columns)
                self.transaction.hmd['icp'][target].fit(X.values, y.values)
                self.transaction.hmd['icp']['active'] = True

                # calibrate conformal estimator on test set
                X = deepcopy(self.transaction.input_data.validation_df)
                y = X.pop(target).values

                if is_classification:
                    # Map string labels to integer indices via the encoder
                    # fitted above.  NOTE(review): the classification-only
                    # guard at the branch entry excludes TAGS, so `enc` is
                    # always bound here — confirm if the guard ever changes.
                    if isinstance(enc.categories_[0][0], str):
                        cats = enc.categories_[0].tolist()
                        y = np.array([cats.index(i) for i in y])
                    y = y.astype(int)

                X = clean_df(X, self.transaction.lmd['stats_v2'],
                             output_columns)
                self.transaction.hmd['icp'][target].calibrate(X.values, y)
# Setup training, calibration and test indices
# -----------------------------------------------------------------------------
data = load_boston()

# Shuffle all sample indices, then carve them into three folds:
# proper training / calibration / test.
idx = np.random.permutation(data.target.size)
lo = int(idx.size / 3)
hi = int(2 * idx.size / 3)
train = idx[:lo]
calibrate = idx[lo:hi]
test = idx[hi:]

# -----------------------------------------------------------------------------
# Without normalization
# -----------------------------------------------------------------------------
# Train and calibrate
# -----------------------------------------------------------------------------
underlying_model = RegressorAdapter(DecisionTreeRegressor(min_samples_leaf=5))
nc = RegressorNc(underlying_model, AbsErrorErrFunc())
icp = IcpRegressor(nc)
icp.fit(data.data[train, :], data.target[train])
icp.calibrate(data.data[calibrate, :], data.target[calibrate])

# -----------------------------------------------------------------------------
# Predict
# -----------------------------------------------------------------------------
prediction = icp.predict(data.data[test, :], significance=0.1)
header = ['min', 'max', 'truth', 'size']
# Interval width per test point, shown as the last column.
size = prediction[:, 1] - prediction[:, 0]
table = np.column_stack([prediction, data.target[test], size])
df = pd.DataFrame(table, columns=header)
print(df)

# -----------------------------------------------------------------------------
Beispiel #19
0
def run_experiment(dataset_name,
                   test_method,
                   random_state_train_test,
                   save_to_csv=True):
    """ Estimate prediction intervals and print the average length and coverage

    Parameters
    ----------

    dataset_name : array of strings, list of datasets
    test_method  : string, method to be tested, estimating
                   the 90% prediction interval
    random_state_train_test : integer, random seed to be used
    save_to_csv : boolean, save average length and coverage to csv (True)
                  or not (False)

    """

    dataset_name_vec = []
    method_vec = []
    coverage_vec = []
    length_vec = []
    seed_vec = []

    seed = random_state_train_test
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    coverage_linear=0
    length_linear=0
    coverage_linear_local=0
    length_linear_local=0

    coverage_net=0
    length_net=0
    coverage_net_local=0
    length_net_local=0

    coverage_forest=0
    length_forest=0
    coverage_forest_local=0
    length_forest_local=0

    coverage_cp_qnet=0
    length_cp_qnet=0
    coverage_qnet=0
    length_qnet=0

    coverage_cp_sign_qnet=0
    length_cp_sign_qnet=0

    coverage_cp_re_qnet=0
    length_cp_re_qnet=0
    coverage_re_qnet=0
    length_re_qnet=0

    coverage_cp_sign_re_qnet=0
    length_cp_sign_re_qnet=0

    coverage_cp_qforest=0
    length_cp_qforest=0
    coverage_qforest=0
    length_qforest=0

    coverage_cp_sign_qforest=0
    length_cp_sign_qforest=0


    # determines the size of test set
    test_ratio = 0.2

    # conformal prediction miscoverage level
    significance = 0.1
    # desired quantile levels, used by the quantile regression methods
    quantiles = [0.05, 0.95]

    # Random forests parameters (shared by conditional quantile random forests
    # and conditional mean random forests regression).
    n_estimators = 1000 # usual random forests n_estimators parameter
    min_samples_leaf = 1 # default parameter of sklearn

    # Quantile random forests parameters.
    # See QuantileForestRegressorAdapter class for more details
    quantiles_forest = [5, 95]
    CV_qforest = True
    coverage_factor = 0.85
    cv_test_ratio = 0.05
    cv_random_state = 1
    cv_range_vals = 30
    cv_num_vals = 10

    # Neural network parameters  (shared by conditional quantile neural network
    # and conditional mean neural network regression)
    # See AllQNet_RegressorAdapter and MSENet_RegressorAdapter in helper.py
    nn_learn_func = torch.optim.Adam
    epochs = 1000
    lr = 0.0005
    hidden_size = 64
    batch_size = 64
    dropout = 0.1
    wd = 1e-6

    # Ask for a reduced coverage when tuning the network parameters by
    # cross-validation to avoid too conservative initial estimation of the
    # prediction interval. This estimation will be conformalized by CQR.
    quantiles_net = [0.1, 0.9]


    # local conformal prediction parameter.
    # See RegressorNc class for more details.
    beta = 1
    beta_net = 1

    # local conformal prediction parameter. The local ridge regression method
    # uses nearest neighbor regression as the MAD estimator.
    # Number of neighbors used by nearest neighbor regression.
    n_neighbors = 11

    print(dataset_name)
    sys.stdout.flush()

    try:
        # load the dataset
        X, y = datasets.GetDataset(dataset_name, base_dataset_path)
    except:
        print("CANNOT LOAD DATASET!")
        return

    # Dataset is divided into test and train data based on test_ratio parameter
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=test_ratio,
                                                        random_state=random_state_train_test)

    # zero mean and unit variance scaling of the train and test features
    scalerX = StandardScaler()
    scalerX = scalerX.fit(X_train)
    X_train = scalerX.transform(X_train)
    X_test = scalerX.transform(X_test)

    # scale the labels by dividing each by the mean absolute response
    max_ytrain = np.mean(np.abs(y_train))
    y_train = y_train/max_ytrain
    y_test = y_test/max_ytrain

    # fit a simple ridge regression model (sanity check)
    model = linear_model.RidgeCV()
    model = model.fit(X_train, y_train)
    predicted_data = model.predict(X_test).astype(np.float32)

    # calculate the normalized mean squared error
    print("Ridge relative error: %f" % (np.sum((y_test-predicted_data)**2)/np.sum(y_test**2)))
    sys.stdout.flush()

    # reshape the data
    X_train = np.asarray(X_train)
    y_train = np.squeeze(np.asarray(y_train))
    X_test = np.asarray(X_test)
    y_test = np.squeeze(np.asarray(y_test))

    # input dimensions
    n_train = X_train.shape[0]
    in_shape = X_train.shape[1]

    print("Size: train (%d, %d), test (%d, %d)" % (X_train.shape[0], X_train.shape[1], X_test.shape[0], X_test.shape[1]))
    sys.stdout.flush()

    # set seed for splitting the data into proper train and calibration
    np.random.seed(seed)
    idx = np.random.permutation(n_train)

    # divide the data into proper training set and calibration set
    n_half = int(np.floor(n_train/2))
    idx_train, idx_cal = idx[:n_half], idx[n_half:2*n_half]

    ######################## Linear

    if 'linear' == test_method:

        model = linear_model.RidgeCV()
        nc = RegressorNc(model)

        y_lower, y_upper = helper.run_icp(nc, X_train, y_train, X_test, idx_train, idx_cal, significance)
        if plot_results:
            helper.plot_func_data(y_test,y_lower,y_upper,"Ridge")
        coverage_linear, length_linear = helper.compute_coverage(y_test,y_lower,y_upper,significance,"Ridge")

        dataset_name_vec.append(dataset_name)
        method_vec.append('Ridge')
        coverage_vec.append(coverage_linear)
        length_vec.append(length_linear)
        seed_vec.append(seed)

        nc = NcFactory.create_nc(
            linear_model.RidgeCV(),
            normalizer_model=KNeighborsRegressor(n_neighbors=n_neighbors)
        )

        y_lower, y_upper = helper.run_icp(nc, X_train, y_train, X_test, idx_train, idx_cal, significance)
        if plot_results:
            helper.plot_func_data(y_test,y_lower,y_upper,"Ridge-L")
        coverage_linear_local, length_linear_local = helper.compute_coverage(y_test,y_lower,y_upper,significance,"Ridge-L")

        dataset_name_vec.append(dataset_name)
        method_vec.append('Ridge-L')
        coverage_vec.append(coverage_linear_local)
        length_vec.append(length_linear_local)
        seed_vec.append(seed)

    ######################### Neural net

    if 'neural_net' == test_method:

        model = helper.MSENet_RegressorAdapter(model=None,
                                               fit_params=None,
                                               in_shape = in_shape,
                                               hidden_size = hidden_size,
                                               learn_func = nn_learn_func,
                                               epochs = epochs,
                                               batch_size=batch_size,
                                               dropout=dropout,
                                               lr=lr,
                                               wd=wd,
                                               test_ratio=cv_test_ratio,
                                               random_state=cv_random_state)
        nc = RegressorNc(model)

        y_lower, y_upper = helper.run_icp(nc, X_train, y_train, X_test, idx_train, idx_cal, significance)
        if plot_results:
            helper.plot_func_data(y_test,y_lower,y_upper,"Net")
        coverage_net, length_net = helper.compute_coverage(y_test,y_lower,y_upper,significance,"Net")

        dataset_name_vec.append(dataset_name)
        method_vec.append('Net')
        coverage_vec.append(coverage_net)
        length_vec.append(length_net)
        seed_vec.append(seed)

        normalizer_adapter = helper.MSENet_RegressorAdapter(model=None,
                                                            fit_params=None,
                                                            in_shape = in_shape,
                                                            hidden_size = hidden_size,
                                                            learn_func = nn_learn_func,
                                                            epochs = epochs,
                                                            batch_size=batch_size,
                                                            dropout=dropout,
                                                            lr=lr,
                                                            wd=wd,
                                                            test_ratio=cv_test_ratio,
                                                            random_state=cv_random_state)
        adapter = helper.MSENet_RegressorAdapter(model=None,
                                                fit_params=None,
                                                in_shape = in_shape,
                                                hidden_size = hidden_size,
                                                learn_func = nn_learn_func,
                                                epochs = epochs,
                                                batch_size=batch_size,
                                                dropout=dropout,
                                                lr=lr,
                                                wd=wd,
                                                test_ratio=cv_test_ratio,
                                                random_state=cv_random_state)

        normalizer = RegressorNormalizer(adapter,
                                         normalizer_adapter,
                                         AbsErrorErrFunc())
        nc = RegressorNc(adapter, AbsErrorErrFunc(), normalizer, beta=beta_net)
        y_lower, y_upper = helper.run_icp(nc, X_train, y_train, X_test, idx_train, idx_cal, significance)
        if plot_results:
            helper.plot_func_data(y_test,y_lower,y_upper,"Net-L")
        coverage_net_local, length_net_local = helper.compute_coverage(y_test,y_lower,y_upper,significance,"Net-L")

        dataset_name_vec.append(dataset_name)
        method_vec.append('Net-L')
        coverage_vec.append(coverage_net_local)
        length_vec.append(length_net_local)
        seed_vec.append(seed)

    ################## Random Forest

    if 'random_forest' == test_method:

        model = RandomForestRegressor(n_estimators=n_estimators,min_samples_leaf=min_samples_leaf, random_state=0)
        nc = RegressorNc(model, AbsErrorErrFunc())

        y_lower, y_upper = helper.run_icp(nc, X_train, y_train, X_test, idx_train, idx_cal, significance)
        if plot_results:
            helper.plot_func_data(y_test,y_lower,y_upper,"RF")
        coverage_forest, length_forest = helper.compute_coverage(y_test,y_lower,y_upper,significance,"RF")

        dataset_name_vec.append(dataset_name)
        method_vec.append('RF')
        coverage_vec.append(coverage_forest)
        length_vec.append(length_forest)
        seed_vec.append(seed)

        normalizer_adapter = RandomForestRegressor(n_estimators=n_estimators, min_samples_leaf=min_samples_leaf, random_state=0)
        adapter = RandomForestRegressor(n_estimators=n_estimators, min_samples_leaf=min_samples_leaf, random_state=0)
        normalizer = RegressorNormalizer(adapter,
                                         normalizer_adapter,
                                         AbsErrorErrFunc())
        nc = RegressorNc(adapter, AbsErrorErrFunc(), normalizer, beta=beta)

        y_lower, y_upper = helper.run_icp(nc, X_train, y_train, X_test, idx_train, idx_cal, significance)
        if plot_results:
            helper.plot_func_data(y_test,y_lower,y_upper,"RF-L")
        coverage_forest_local, length_forest_local = helper.compute_coverage(y_test,y_lower,y_upper,significance,"RF-L")

        dataset_name_vec.append(dataset_name)
        method_vec.append('RF-L')
        coverage_vec.append(coverage_forest_local)
        length_vec.append(length_forest_local)
        seed_vec.append(seed)

    ################## Quantile Net

    if 'quantile_net' == test_method:

        model_full = helper.AllQNet_RegressorAdapter(model=None,
                                             fit_params=None,
                                             in_shape = in_shape,
                                             hidden_size = hidden_size,
                                             quantiles = quantiles,
                                             learn_func = nn_learn_func,
                                             epochs = epochs,
                                             batch_size=batch_size,
                                             dropout=dropout,
                                             lr=lr,
                                             wd=wd,
                                             test_ratio=cv_test_ratio,
                                             random_state=cv_random_state,
                                             use_rearrangement=False)
        model_full.fit(X_train, y_train)
        tmp = model_full.predict(X_test)
        y_lower = tmp[:,0]
        y_upper = tmp[:,1]
        if plot_results:
            helper.plot_func_data(y_test,y_lower,y_upper,"QNet")
        coverage_qnet, length_qnet = helper.compute_coverage(y_test,y_lower,y_upper,significance,"QNet")

        dataset_name_vec.append(dataset_name)
        method_vec.append('QNet')
        coverage_vec.append(coverage_qnet)
        length_vec.append(length_qnet)
        seed_vec.append(seed)

    if 'cqr_quantile_net' == test_method:

        model = helper.AllQNet_RegressorAdapter(model=None,
                                             fit_params=None,
                                             in_shape = in_shape,
                                             hidden_size = hidden_size,
                                             quantiles = quantiles_net,
                                             learn_func = nn_learn_func,
                                             epochs = epochs,
                                             batch_size=batch_size,
                                             dropout=dropout,
                                             lr=lr,
                                             wd=wd,
                                             test_ratio=cv_test_ratio,
                                             random_state=cv_random_state,
                                             use_rearrangement=False)
        nc = RegressorNc(model, QuantileRegErrFunc())

        y_lower, y_upper = helper.run_icp(nc, X_train, y_train, X_test, idx_train, idx_cal, significance)
        if plot_results:
            helper.plot_func_data(y_test,y_lower,y_upper,"CQR Net")
        coverage_cp_qnet, length_cp_qnet = helper.compute_coverage(y_test,y_lower,y_upper,significance,"CQR Net")


        dataset_name_vec.append(dataset_name)
        method_vec.append('CQR Net')
        coverage_vec.append(coverage_cp_qnet)
        length_vec.append(length_cp_qnet)
        seed_vec.append(seed)


    if 'cqr_asymmetric_quantile_net' == test_method:

        model = helper.AllQNet_RegressorAdapter(model=None,
                                             fit_params=None,
                                             in_shape = in_shape,
                                             hidden_size = hidden_size,
                                             quantiles = quantiles_net,
                                             learn_func = nn_learn_func,
                                             epochs = epochs,
                                             batch_size=batch_size,
                                             dropout=dropout,
                                             lr=lr,
                                             wd=wd,
                                             test_ratio=cv_test_ratio,
                                             random_state=cv_random_state,
                                             use_rearrangement=False)
        nc = RegressorNc(model, QuantileRegAsymmetricErrFunc())

        y_lower, y_upper = helper.run_icp(nc, X_train, y_train, X_test, idx_train, idx_cal, significance)
        if plot_results:
            helper.plot_func_data(y_test,y_lower,y_upper,"CQR Sign Net")
        coverage_cp_sign_qnet, length_cp_sign_qnet = helper.compute_coverage(y_test,y_lower,y_upper,significance,"CQR Sign Net")


        dataset_name_vec.append(dataset_name)
        method_vec.append('CQR Sign Net')
        coverage_vec.append(coverage_cp_sign_qnet)
        length_vec.append(length_cp_sign_qnet)
        seed_vec.append(seed)


    ################### Rearrangement Quantile Net

    if 'rearrangement' == test_method:

        model_full = helper.AllQNet_RegressorAdapter(model=None,
                                             fit_params=None,
                                             in_shape = in_shape,
                                             hidden_size = hidden_size,
                                             quantiles = quantiles,
                                             learn_func = nn_learn_func,
                                             epochs = epochs,
                                             batch_size=batch_size,
                                             dropout=dropout,
                                             lr=lr,
                                             wd=wd,
                                             test_ratio=cv_test_ratio,
                                             random_state=cv_random_state,
                                             use_rearrangement=True)
        model_full.fit(X_train, y_train)
        tmp = model_full.predict(X_test)
        y_lower = tmp[:,0]
        y_upper = tmp[:,1]
        if plot_results:
            helper.plot_func_data(y_test,y_lower,y_upper,"Rearrange QNet")
        coverage_re_qnet, length_re_qnet = helper.compute_coverage(y_test,y_lower,y_upper,significance,"Rearrange QNet")

        dataset_name_vec.append(dataset_name)
        method_vec.append('Rearrange QNet')
        coverage_vec.append(coverage_re_qnet)
        length_vec.append(length_re_qnet)
        seed_vec.append(seed)

    if 'cqr_rearrangement' == test_method:

        model = helper.AllQNet_RegressorAdapter(model=None,
                                                 fit_params=None,
                                                 in_shape = in_shape,
                                                 hidden_size = hidden_size,
                                                 quantiles = quantiles_net,
                                                 learn_func = nn_learn_func,
                                                 epochs = epochs,
                                                 batch_size=batch_size,
                                                 dropout=dropout,
                                                 lr=lr,
                                                 wd=wd,
                                                 test_ratio=cv_test_ratio,
                                                 random_state=cv_random_state,
                                                 use_rearrangement=True)
        nc = RegressorNc(model, QuantileRegErrFunc())

        y_lower, y_upper = helper.run_icp(nc, X_train, y_train, X_test, idx_train, idx_cal, significance)
        if plot_results:
            helper.plot_func_data(y_test,y_lower,y_upper,"Rearrange CQR Net")
        coverage_cp_re_qnet, length_cp_re_qnet = helper.compute_coverage(y_test,y_lower,y_upper,significance,"Rearrange CQR Net")


        dataset_name_vec.append(dataset_name)
        method_vec.append('Rearrange CQR Net')
        coverage_vec.append(coverage_cp_re_qnet)
        length_vec.append(length_cp_re_qnet)
        seed_vec.append(seed)


    if 'cqr_asymmetric_rearrangement' == test_method:

        # Quantile net trained with monotone rearrangement, conformalized
        # with the asymmetric error function (independent lower/upper
        # conformity corrections).
        model = helper.AllQNet_RegressorAdapter(model=None,
                                                 fit_params=None,
                                                 in_shape = in_shape,
                                                 hidden_size = hidden_size,
                                                 quantiles = quantiles_net,
                                                 learn_func = nn_learn_func,
                                                 epochs = epochs,
                                                 batch_size=batch_size,
                                                 dropout=dropout,
                                                 lr=lr,
                                                 wd=wd,
                                                 test_ratio=cv_test_ratio,
                                                 random_state=cv_random_state,
                                                 use_rearrangement=True)
        nc = RegressorNc(model, QuantileRegAsymmetricErrFunc())

        y_lower, y_upper = helper.run_icp(nc, X_train, y_train, X_test, idx_train, idx_cal, significance)
        if plot_results:
            helper.plot_func_data(y_test,y_lower,y_upper,"Rearrange CQR Sign Net")
        # Fix: the label passed to compute_coverage previously read
        # "Rearrange CQR Net", mislabeling this method's printed coverage as
        # belonging to the symmetric variant.
        coverage_cp_sign_re_qnet, length_cp_sign_re_qnet = helper.compute_coverage(y_test,y_lower,y_upper,significance,"Rearrange CQR Sign Net")

        dataset_name_vec.append(dataset_name)
        method_vec.append('Rearrange CQR Sign Net')
        coverage_vec.append(coverage_cp_sign_re_qnet)
        length_vec.append(length_cp_sign_re_qnet)
        seed_vec.append(seed)

    ################### Quantile Random Forest

    if 'quantile_forest' == test_method:

        params_qforest = dict()
        params_qforest["random_state"] = 0
        params_qforest["min_samples_leaf"] = min_samples_leaf
        params_qforest["n_estimators"] = n_estimators
        params_qforest["max_features"] = X_train.shape[1]

        params_qforest["CV"]=False
        params_qforest["coverage_factor"] = coverage_factor
        params_qforest["test_ratio"]=cv_test_ratio
        params_qforest["random_state"]=cv_random_state
        params_qforest["range_vals"] = cv_range_vals
        params_qforest["num_vals"] = cv_num_vals

        model_full = helper.QuantileForestRegressorAdapter(model = None,
                                                      fit_params=None,
                                                      quantiles=np.dot(100,quantiles),
                                                      params = params_qforest)
        model_full.fit(X_train, y_train)
        tmp = model_full.predict(X_test)
        y_lower = tmp[:,0]
        y_upper = tmp[:,1]
        if plot_results:
            helper.plot_func_data(y_test,y_lower,y_upper,"QRF")
        coverage_qforest, length_qforest = helper.compute_coverage(y_test,y_lower,y_upper,significance,"QRF")

        dataset_name_vec.append(dataset_name)
        method_vec.append('QRF')
        coverage_vec.append(coverage_qforest)
        length_vec.append(length_qforest)
        seed_vec.append(seed)

    if 'cqr_quantile_forest' == test_method:

        params_qforest = dict()
        params_qforest["random_state"] = 0
        params_qforest["min_samples_leaf"] = min_samples_leaf
        params_qforest["n_estimators"] = n_estimators
        params_qforest["max_features"] = X_train.shape[1]

        params_qforest["CV"]=CV_qforest
        params_qforest["coverage_factor"] = coverage_factor
        params_qforest["test_ratio"]=cv_test_ratio
        params_qforest["random_state"]=cv_random_state
        params_qforest["range_vals"] = cv_range_vals
        params_qforest["num_vals"] = cv_num_vals


        model = helper.QuantileForestRegressorAdapter(model = None,
                                                      fit_params=None,
                                                      quantiles=quantiles_forest,
                                                      params = params_qforest)

        nc = RegressorNc(model, QuantileRegErrFunc())
        y_lower, y_upper = helper.run_icp(nc, X_train, y_train, X_test, idx_train, idx_cal, significance)
        if plot_results:
            helper.plot_func_data(y_test,y_lower,y_upper,"CQR RF")
        coverage_cp_qforest, length_cp_qforest = helper.compute_coverage(y_test,y_lower,y_upper,significance,"CQR RF")

        dataset_name_vec.append(dataset_name)
        method_vec.append('CQR RF')
        coverage_vec.append(coverage_cp_qforest)
        length_vec.append(length_cp_qforest)
        seed_vec.append(seed)

    if 'cqr_asymmetric_quantile_forest' == test_method:

        params_qforest = dict()
        params_qforest["random_state"] = 0
        params_qforest["min_samples_leaf"] = min_samples_leaf
        params_qforest["n_estimators"] = n_estimators
        params_qforest["max_features"] = X_train.shape[1]

        params_qforest["CV"]=CV_qforest
        params_qforest["coverage_factor"] = coverage_factor
        params_qforest["test_ratio"]=cv_test_ratio
        params_qforest["random_state"]=cv_random_state
        params_qforest["range_vals"] = cv_range_vals
        params_qforest["num_vals"] = cv_num_vals


        model = helper.QuantileForestRegressorAdapter(model = None,
                                                      fit_params=None,
                                                      quantiles=quantiles_forest,
                                                      params = params_qforest)

        nc = RegressorNc(model, QuantileRegAsymmetricErrFunc())
        y_lower, y_upper = helper.run_icp(nc, X_train, y_train, X_test, idx_train, idx_cal, significance)
        if plot_results:
            helper.plot_func_data(y_test,y_lower,y_upper,"CQR Sign RF")
        coverage_cp_sign_qforest, length_cp_sign_qforest = helper.compute_coverage(y_test,y_lower,y_upper,significance,"CQR Sign RF")

        dataset_name_vec.append(dataset_name)
        method_vec.append('CQR Sign RF')
        coverage_vec.append(coverage_cp_sign_qforest)
        length_vec.append(length_cp_sign_qforest)
        seed_vec.append(seed)


#        tmp = model.predict(X_test)
#        y_lower = tmp[:,0]
#        y_upper = tmp[:,1]
#        if plot_results:
#            helper.plot_func_data(y_test,y_lower,y_upper,"QRF")
#        coverage_qforest, length_qforest = helper.compute_coverage(y_test,y_lower,y_upper,significance,"QRF")
#
#        dataset_name_vec.append(dataset_name)
#        method_vec.append('QRF')
#        coverage_vec.append(coverage_qforest)
#        length_vec.append(length_qforest)
#        seed_vec.append(seed)



    ############### Summary

    # Only the branch selected by `test_method` has executed, so the
    # per-method locals (coverage_linear, coverage_net, ...) for every other
    # method are undefined.  Build the summary from the accumulator lists,
    # which every executed branch appends to; the previous np.array literal
    # referenced all per-method locals and raised NameError whenever fewer
    # than all methods ran.
    coverage_str = 'Coverage (expected ' + str(100 - significance*100) + '%)'

    results_ = pd.DataFrame({coverage_str: coverage_vec,
                             'Avg. Length': length_vec,
                             'Seed': seed_vec},
                            index=method_vec)

    print("== SUMMARY == ")
    print("dataset name: " + dataset_name)
    print(results_)
    sys.stdout.flush()

    if save_to_csv:
        outdir = './results/'
        # makedirs(..., exist_ok=True) avoids the check-then-create race of
        # the previous os.path.exists / os.mkdir pair.
        os.makedirs(outdir, exist_ok=True)

        out_name = outdir + 'results.csv'

        df = pd.DataFrame({'name': dataset_name_vec,
                           'method': method_vec,
                           coverage_str : coverage_vec,
                           'Avg. Length' : length_vec,
                           'seed': seed_vec})

        # Append to an existing results file rather than overwriting it.
        if os.path.isfile(out_name):
            df2 = pd.read_csv(out_name)
            df = pd.concat([df2, df], ignore_index=True)

        df.to_csv(out_name, index=False)
# Beispiel #20 (scrape-artifact separator; original bare tokens were a latent NameError)
    def build(self):
        '''Build a new SVM model with the X and Y numpy matrices.

        Trains an SVR (quantitative) or SVC (qualitative) estimator,
        optionally grid-search tuned, and when the `conformal` parameter is
        set wraps it in an aggregated conformal predictor (which replaces
        self.estimator).

        Returns:
            (True, results) where results is a list of
            (key, description, value) tuples describing the built model.
        '''

        # Work on copies so tuning/fitting cannot mutate the stored matrices.
        X = self.X.copy()
        Y = self.Y.copy()

        results = []
        results.append(('nobj', 'number of objects', self.nobj))
        results.append(('nvarx', 'number of predictor variables', self.nvarx))

        # If tune then call gridsearch to optimize the estimator
        if self.param.getVal('tune'):
            try:
                # Check type of model
                if self.param.getVal('quantitative'):
                    self.optimize(X, Y, svm.SVR(**self.estimator_parameters),
                                  self.tune_parameters)
                    results.append(('model', 'model type',
                                    'SVM quantitative (optimized)'))

                else:
                    self.optimize(X, Y, svm.SVC(**self.estimator_parameters),
                                  self.tune_parameters)
                    results.append(
                        ('model', 'model type', 'SVM qualitative (optimized)'))
                LOG.debug('SVM estimator optimized')
            except Exception as e:
                # Fixed missing space between the adjacent f-strings
                # (previously logged "...SVMestimator...").
                LOG.error(f'Exception optimizing SVM '
                          f'estimator with exception {e}')
        else:
            try:
                LOG.info("Building  SVM model")
                if self.param.getVal('quantitative'):
                    LOG.info("Building Quantitative SVM-R model")
                    self.estimator = svm.SVR(**self.estimator_parameters)
                    results.append(('model', 'model type', 'SVM quantitative'))
                else:
                    self.estimator = svm.SVC(**self.estimator_parameters)
                    results.append(('model', 'model type', 'SVM qualitative'))
            except Exception as e:
                # Fixed missing space between the adjacent f-strings.
                LOG.error(f'Exception building SVM '
                          f'estimator with exception {e}')

        # Fit estimator to the data.
        # NOTE(review): if construction/optimization above raised,
        # self.estimator may be unset or stale here and this call will fail
        # or fit the wrong model -- confirm the intended error handling.
        self.estimator.fit(X, Y)
        # Keep an unwrapped copy: the conformal wrappers below overwrite
        # self.estimator.
        self.estimator_temp = copy(self.estimator)

        if self.param.getVal('conformal'):
            try:
                LOG.info("Building aggregated conformal SVM model")
                if self.param.getVal('quantitative'):
                    underlying_model = RegressorAdapter(self.estimator_temp)
                    # Error-normalizing model: the same SVM fitted to the
                    # absolute residuals (a KNN regressor is a common
                    # alternative normalizer here).
                    normalizing_model = RegressorAdapter(self.estimator_temp)
                    normalizer = RegressorNormalizer(underlying_model,
                                                     normalizing_model,
                                                     AbsErrorErrFunc())
                    nc = RegressorNc(underlying_model, AbsErrorErrFunc(),
                                     normalizer)
                    self.estimator = AggregatedCp(IcpRegressor(nc),
                                                  BootstrapSampler())
                    self.estimator.fit(X, Y)
                    # overrides non-conformal
                    results.append(
                        ('model', 'model type', 'conformal SVM quantitative'))

                else:
                    self.estimator = AggregatedCp(
                        IcpClassifier(
                            ClassifierNc(
                                ClassifierAdapter(self.estimator_temp),
                                MarginErrFunc())), BootstrapSampler())
                    self.estimator.fit(X, Y)
                    # overrides non-conformal
                    results.append(
                        ('model', 'model type', 'conformal SVM qualitative'))
            except Exception as e:
                LOG.error(f'Exception building aggregated conformal SVM '
                          f'estimator with exception {e}')
        return True, results
                         folds=5,
                         scoring_funcs=[class_mean_errors, class_avg_c],
                         significance_levels=[0.05, 0.1, 0.2])

print('Classification: iris')
# Average the scoring functions over folds and iterations for each
# significance level (drop the bookkeeping columns first).
scores = scores.drop(['fold', 'iter'], axis=1)
print(scores.groupby(['significance']).mean())

# -----------------------------------------------------------------------------
# Regression, absolute error
# -----------------------------------------------------------------------------
data = load_diabetes()

# Inductive conformal regressor: random-forest point predictor with the
# absolute-error nonconformity function.
icp = IcpRegressor(
    RegressorNc(RegressorAdapter(RandomForestRegressor(n_estimators=100)),
                AbsErrorErrFunc()))
# Project helper that adapts the ICP to the cross_val_score driver --
# presumably refits and recalibrates per fold; confirm against its definition.
icp_cv = RegIcpCvHelper(icp)

# 5x5 cross-validation, scoring empirical error rate and median interval
# size at several significance levels.
scores = cross_val_score(icp_cv,
                         data.data,
                         data.target,
                         iterations=5,
                         folds=5,
                         scoring_funcs=[reg_mean_errors, reg_median_size],
                         significance_levels=[0.05, 0.1, 0.2])

print('Absolute error regression: diabetes')
# Same aggregation as above: mean metrics per significance level.
scores = scores.drop(['fold', 'iter'], axis=1)
print(scores.groupby(['significance']).mean())

# -----------------------------------------------------------------------------