Esempio n. 1
0
    def test_acp_regression_tree(self):
        """Run three aggregated conformal regressors on the diabetes data.

        One decision-tree based aggregated predictor is built per sampling
        strategy (random sub-sampling, cross sampling, bootstrap).  Each is
        fitted on a random two-thirds of the samples, then the intervals
        predicted at the chosen significance are printed next to the true
        targets together with the value returned by ``reg_mean_errors``.
        """
        significance = 0.1
        columns = ["min", "max", "truth"]

        # Random split: first two thirds of the permutation train, rest test.
        data = load_diabetes()
        idx = np.random.permutation(data.target.size)
        train, test = np.split(idx, [int(2 * idx.size / 3)])
        truth = data.target[test]

        # One aggregated conformal predictor per sampler, each wrapping its
        # own freshly constructed decision tree.
        samplers = (
            ("ACP-RandomSubSampler", RandomSubSampler),
            ("ACP-CrossSampler", CrossSampler),
            ("ACP-BootstrapSampler", BootstrapSampler),
        )
        models = {}
        for label, sampler_cls in samplers:
            icp = IcpRegressor(
                RegressorNc(RegressorAdapter(DecisionTreeRegressor())))
            models[label] = AggregatedCp(icp, sampler_cls())

        # Train, predict and report.
        x_train, y_train = data.data[train, :], data.target[train]
        x_test = data.data[test, :]
        for name, model in models.items():
            model.fit(x_train, y_train)
            # Prediction without a significance level feeds the error rate;
            # the one at `significance` is what gets tabulated.
            prediction = model.predict(x_test)
            prediction_sign = model.predict(x_test,
                                            significance=significance)
            table = np.column_stack((prediction_sign, truth))
            df = pd.DataFrame(table, columns=columns)
            print("\n{}".format(name))
            print("Error rate: {}".format(
                reg_mean_errors(prediction, truth, significance)))
            print(df)
Esempio n. 2
0
    def CF_quantitative_validation(self):
        ''' Performs internal validation for conformal quantitative models.

        Three validation rounds are run.  In each one the data is split
        75/25, an aggregated conformal regressor (bootstrap sampler) wrapping
        ``self.estimator`` is fitted on the training part and intervals are
        predicted for the held-out part at ``self.conformalSignificance``.

        Side effects: sets ``self.conformal_accuracy`` and
        ``self.conformal_mean_interval`` (both rounded to two decimals).

        Returns:
            (True, (results,)) where ``results`` is a list of
            ``(key, label, value)`` tuples with the two metrics.

        Raises:
            Any exception raised while fitting or predicting is logged and
            re-raised unchanged.
        '''

        # Work on copies so the original matrices are never touched.
        X = self.X.copy()
        Y = self.Y.copy()

        # One validation round per seed.
        seeds = [5, 7, 35]
        # Mean interval width of each round.
        interval_means = []
        # Fraction of held-out values inside their interval, per round.
        accuracies = []
        results = []
        try:
            for seed in seeds:
                # NOTE(review): scikit-learn only uses random_state when
                # shuffling, so with shuffle=False every round gets the same
                # split and the seed has no effect. Confirm whether
                # shuffle=True was intended.
                X_train, X_test, Y_train, Y_test = train_test_split(
                    X, Y, test_size=0.25, random_state=seed, shuffle=False)
                # Create the aggregated conformal regressor.
                conformal_pred = AggregatedCp(
                    IcpRegressor(RegressorNc(RegressorAdapter(
                        self.estimator))), BootstrapSampler())
                # Fit conformal regressor to the data
                conformal_pred.fit(X_train, Y_train)

                # Column 0 holds the lower bound, column 1 the upper bound.
                prediction = conformal_pred.predict(X_test,
                                                    self.conformalSignificance)
                lower = prediction[:, 0]
                upper = prediction[:, 1]
                # Interval width is upper - lower; taking absolute values of
                # each bound first gives a wrong size whenever the bounds
                # have different signs.
                interval_means.append(np.mean(upper - lower))

                Y_test = Y_test.reshape(-1, 1)
                # Boolean mask of instances strictly inside their interval.
                inside_interval = ((lower.reshape(-1, 1) < Y_test) &
                                   (upper.reshape(-1, 1) > Y_test))
                # Accuracy = fraction of instances inside the interval.
                accuracy = np.sum(inside_interval) / len(Y_test)
                accuracies.append(accuracy)
        except Exception as e:
            LOG.error(f'Quantitative conformal validation'
                      f' failed with exception: {e}')
            # Bare raise keeps the original traceback intact.
            raise

        # Average over the rounds and keep two decimals.
        interval_means = np.mean(interval_means)
        accuracies = np.mean(accuracies)
        self.conformal_accuracy = float("{0:.2f}".format(accuracies))
        self.conformal_mean_interval = float("{0:.2f}".format(interval_means))

        # Add quality metrics to results.
        results.append(('Conformal_mean_interval', 'Conformal mean interval',
                        self.conformal_mean_interval))
        results.append(('Conformal_accuracy', 'Conformal accuracy',
                        self.conformal_accuracy))

        return True, (results, )
Esempio n. 3
0
def CF_QuanVal(X, Y, estimator, conformalSignificance):
    """Fit an aggregated conformal regressor and print diagnostic output.

    The predictor (bootstrap sampler around ``estimator``) is fitted on the
    first 30 rows of ``X``/``Y``; the remaining rows are predicted twice,
    once without a significance level and once at significance 0.25, and the
    results are printed.

    Parameters
    ----------
    X, Y : indexable by slice
        Feature matrix and target values.
    estimator : object
        Underlying regressor wrapped by ``RegressorAdapter``.
    conformalSignificance : float
        NOTE(review): currently unused; the significance is hard-coded to
        0.25 below. Confirm whether this argument should replace it.

    Returns
    -------
    The fitted ``AggregatedCp`` instance.
    """
    print("Starting quantitative conformal prediction validation")

    icp = AggregatedCp(IcpRegressor(RegressorNc(RegressorAdapter(estimator))),
                       BootstrapSampler())

    # A cross-validated variant based on RegIcpCvHelper and
    # conformal_cross_val_score (with a normalized nonconformity function)
    # was previously sketched here but left disabled.

    icp.fit(X[:30], Y[:30])
    prediction = icp.predict(X[30:])
    prediction_sign = icp.predict(X[30:], significance=0.25)

    # Interval width is upper bound (column 1) minus lower bound (column 0);
    # the reverse order reports negative sizes.
    interval = prediction_sign[:, 1] - prediction_sign[:, 0]
    print(np.mean(interval))
    print(interval)
    print("\n")
    print(prediction)
    print(prediction_sign)
    return icp
Esempio n. 4
0
    def build(self):
        """Build the PLSR estimator and, when requested, a conformal model.

        Returns
        -------
        tuple
            ``(False, message)`` when the data is not quantitative or the
            model failed to initialise; otherwise ``(True, results)`` where
            ``results`` is a list of ``(key, label, value)`` tuples.
        """
        if not self.quantitative:
            print("PLSR only applies to quantitative data")
            return False, "PLSR only applies to quantitative data"

        if self.failed:
            return False, "Error initiating model"

        X = self.X.copy()
        Y = self.Y.copy()

        results = []
        results.append(('nobj', 'number of objects', self.nobj))
        results.append(('nvarx', 'number of predictor variables', self.nvarx))

        if self.cv:
            self.cv = getCrossVal(self.cv, 46, self.n, self.p)

        if self.tune:
            # 'auto' uses the parent class optimizer, 'manual' the one
            # defined on this class.
            if self.optimiz == 'auto':
                super(PLSR, self).optimize(X, Y, PLS_r(
                    **self.estimator_parameters), self.tune_parameters)
            elif self.optimiz == 'manual':
                self.optimize(X, Y, PLS_r(
                    **self.estimator_parameters), self.tune_parameters)

            results.append(
                ('model', 'model type', 'PLSR quantitative (optimized)'))

        else:
            print("Building  Quantitative PLSR")
            self.estimator = PLS_r(**self.estimator_parameters)
            results.append(('model', 'model type', 'PLSR quantitative'))

        if self.conformal:
            underlying_model = RegressorAdapter(self.estimator)
            # The estimator itself also acts as the normalizing model.  A
            # 1-nearest-neighbour adapter used to be assigned here and was
            # overwritten on the next line, so it never had any effect.
            normalizing_model = RegressorAdapter(self.estimator)
            normalizer = RegressorNormalizer(
                underlying_model, normalizing_model, AbsErrorErrFunc())
            nc = RegressorNc(underlying_model, AbsErrorErrFunc(), normalizer)
            self.conformal_pred = AggregatedCp(IcpRegressor(nc),
                                               BootstrapSampler())

            self.conformal_pred.fit(X, Y)
            # overrides non-conformal
            results.append(
                ('model', 'model type', 'conformal PLSR quantitative'))

        # The plain estimator is always fitted, conformal or not.
        self.estimator.fit(X, Y)

        return True, results
Esempio n. 5
0
def CF_QuanCal(X, Y, estimator):
    """Fit a normalized aggregated conformal regressor on ``X``/``Y``.

    The nonconformity function uses the absolute error of ``estimator``
    normalized by a ``RegressorNormalizer`` built from ``estimator`` and a
    shallow copy of it.  Calibration subsets are drawn with
    ``RandomSubSampler``.

    Parameters
    ----------
    X, Y
        Training features and targets, passed straight to ``fit``.
    estimator
        Underlying regressor.

    Returns
    -------
    The fitted ``AggregatedCp`` instance.
    """
    normalizer = RegressorNormalizer(estimator, copy.copy(estimator),
                                     AbsErrorErrFunc())
    nc = RegressorNc(RegressorAdapter(estimator), AbsErrorErrFunc(),
                     normalizer)
    # The sampler belongs to AggregatedCp.  A misplaced parenthesis used to
    # hand it to IcpRegressor as its second positional argument, so the
    # aggregated predictor silently fell back to its default sampler.
    acp = AggregatedCp(IcpRegressor(nc), RandomSubSampler())
    acp.fit(X, Y)
    return acp
Esempio n. 6
0
# -----------------------------------------------------------------------------
# Load iris; the first three columns predict the fourth. Then draw three
# disjoint index sets (training, calibration, test) from one permutation.
# -----------------------------------------------------------------------------
data = Orange.data.Table('iris')
X = data.X[:, :3]
y = data.X[:, 3]

idx = np.random.permutation(y.size)
train, calibrate, test = np.split(idx, [idx.size // 3, 2 * idx.size // 3])

# -----------------------------------------------------------------------------
# Inductive predictor: fit on the training part, calibrate separately
# -----------------------------------------------------------------------------
icp = IcpRegressor(
    RegressorNc(DecisionTreeRegressor(), abs_error, abs_error_inv)
)
icp.fit(X[train, :], y[train])
icp.calibrate(X[calibrate, :], y[calibrate])

# -----------------------------------------------------------------------------
# Aggregated predictor: only fitted here, on the training part
# -----------------------------------------------------------------------------
acp = AggregatedCp(
    IcpRegressor(
        RegressorNc(DecisionTreeRegressor(), abs_error, abs_error_inv)
    ),
    sampler=CrossSampler(),
)
acp.fit(X[train, :], y[train])

# -----------------------------------------------------------------------------
# Print the first five inductive predictions next to the true values
# -----------------------------------------------------------------------------
print('# Inductive')
prediction = icp.predict(X[test, :], significance=0.1)
for pred, actual in zip(prediction[:5], y[test]):
    print(pred, actual)
Esempio n. 7
0
# Score the classifier helper; `icp_cv` and `data` come from code that
# precedes this excerpt.
score_model(icp_cv, "IcpClassifier (OOB, normalized)", data, "iris",
            [class_mean_errors, class_avg_c])

# -----------------------------------------------------------------------------
# Regression: random-forest nonconformity function, no normalization
# -----------------------------------------------------------------------------
data = load_diabetes()

nc = NcFactory.create_nc(RandomForestRegressor(n_estimators=100))
icp = IcpRegressor(nc)
icp_cv = RegIcpCvHelper(icp)

score_model(
    icp_cv,
    "IcpRegressor",
    data,
    "diabetes",
    [reg_mean_errors, reg_median_size],
)

# -----------------------------------------------------------------------------
# Regression (normalized): same forest, k-nearest-neighbours normalizer
# -----------------------------------------------------------------------------
data = load_diabetes()

nc = NcFactory.create_nc(
    RandomForestRegressor(n_estimators=100),
    normalizer_model=KNeighborsRegressor(),
)
icp = IcpRegressor(nc)
icp_cv = RegIcpCvHelper(icp)
from nonconformist.icp import IcpRegressor
from nonconformist.nc import RegressorNc, abs_error, abs_error_inv

data = load_boston()

# -----------------------------------------------------------------------------
# One random permutation cut into thirds: training, calibration and test
# -----------------------------------------------------------------------------
idx = np.random.permutation(data.target.size)
train, calibrate, test = np.split(
    idx, [int(idx.size / 3), int(2 * idx.size / 3)])

# -----------------------------------------------------------------------------
# Fit the inductive conformal regressor, then calibrate it
# -----------------------------------------------------------------------------
icp = IcpRegressor(
    RegressorNc(DecisionTreeRegressor, abs_error, abs_error_inv)
)
icp.fit(data.data[train, :], data.target[train])
icp.calibrate(data.data[calibrate, :], data.target[calibrate])

# -----------------------------------------------------------------------------
# Predict intervals at significance 0.1 and print them beside the truth
# -----------------------------------------------------------------------------
import pandas

prediction = icp.predict(data.data[test, :], significance=0.1)
header = np.array(['min','max','Truth'])
table = np.column_stack([prediction, data.target[test]])
# The header is stacked on top of the values as an ordinary first row.
df = pandas.DataFrame(np.vstack([header, table]))
print(df)
# -----------------------------------------------------------------------------
data = load_boston()

# One random permutation cut into thirds: training, calibration and test.
idx = np.random.permutation(data.target.size)
train, calibrate, test = np.split(
    idx, [int(idx.size / 3), int(2 * idx.size / 3)])

# -----------------------------------------------------------------------------
# Without normalization
# -----------------------------------------------------------------------------
# Fit the inductive conformal regressor, then calibrate it
# -----------------------------------------------------------------------------
underlying_model = RegressorAdapter(
    DecisionTreeRegressor(min_samples_leaf=5)
)
nc = RegressorNc(underlying_model, AbsErrorErrFunc())
icp = IcpRegressor(nc)
icp.fit(data.data[train, :], data.target[train])
icp.calibrate(data.data[calibrate, :], data.target[calibrate])

# -----------------------------------------------------------------------------
# Predict intervals at significance 0.1; tabulate bounds, truth and width
# -----------------------------------------------------------------------------
prediction = icp.predict(data.data[test, :], significance=0.1)
header = ['min','max','truth','size']
size = prediction[:, 1] - prediction[:, 0]
table = np.column_stack([prediction, data.target[test], size])
df = pd.DataFrame(table, columns=header)
print(df)

# -----------------------------------------------------------------------------
# With normalization
Esempio n. 10
0
    def build(self):
        '''Build a new XGBOOST model with the X and Y numpy matrices.

        Depending on the ``tune``, ``quantitative`` and ``conformal``
        parameters this optimizes or fits an XGBoost regressor/classifier
        and optionally wraps a copy of it in an aggregated conformal
        predictor, which then replaces ``self.estimator``.

        Returns:
            (False, message) when xgboost cannot be imported or the
            optimization step fails; otherwise (True, results) where
            ``results`` is a list of ``(key, label, value)`` tuples.

        Raises:
            Exceptions raised while fitting the plain or the conformal
            estimator are logged and re-raised.
        '''

        try:
            from xgboost.sklearn import XGBClassifier
            from xgboost.sklearn import XGBRegressor
        except Exception:
            return False,  'XGboost not found, please revise your environment'

        # Make a copy of data matrices
        X = self.X.copy()
        Y = self.Y.copy()

        results = []
        results.append(('nobj', 'number of objects', self.nobj))
        results.append(('nvarx', 'number of predictor variables', self.nvarx))

        # If tune then call gridsearch to optimize the estimator
        if self.param.getVal('tune'):

            LOG.info("Optimizing XGBOOST estimator")

            try:
                # Check type of model
                if self.param.getVal('quantitative'):
                    self.estimator = XGBRegressor(
                                        **self.estimator_parameters)
                    self.optimize(X, Y, self.estimator, self.tune_parameters)
                    results.append(('model','model type','XGBOOST quantitative (optimized)'))
                else:
                    self.estimator = XGBClassifier(
                                        **self.estimator_parameters)
                    # A 'num_class' entry used to be written into the dict
                    # returned by get_params() here; that dict was never
                    # used again, so the assignment had no effect.
                    self.optimize(X, Y, self.estimator,
                                  self.tune_parameters)
                    results.append(('model','model type','XGBOOST qualitative (optimized)'))

            except Exception as e:
                return False, f'Exception optimizing XGBOOST estimator with exception {e}'

        else:
            try:
                if self.param.getVal('quantitative'):
                    LOG.info("Building Quantitative XGBOOST model")
                    self.estimator = XGBRegressor(**self.estimator_parameters)
                    results.append(('model', 'model type', 'XGBOOST quantitative'))
                else:
                    LOG.info("Building Qualitative XGBOOST model")
                    self.estimator = XGBClassifier(**self.estimator_parameters)
                    results.append(('model', 'model type', 'XGBOOST qualitative'))

                self.estimator.fit(X, Y)
                print(self.estimator)

            except Exception as e:
                # This path has always re-raised; the `return False, ...`
                # that followed the raise was unreachable. Its message is
                # logged instead so it is not lost.
                LOG.error(f'Exception building XGBOOST estimator with exception {e}')
                raise

        # The conformal wrappers work on a copy, leaving the estimator built
        # above untouched until it is replaced below.
        self.estimator_temp = copy(self.estimator)

        if not self.param.getVal('conformal'):
            return True, results

        # Create the conformal estimator
        try:
            # Conformal regressor
            if self.param.getVal('quantitative'):

                LOG.info("Building conformal Quantitative XGBOOST model")

                underlying_model = RegressorAdapter(self.estimator_temp)
                # The same estimator copy doubles as the normalizing model.
                normalizing_model = RegressorAdapter(self.estimator_temp)
                normalizer = RegressorNormalizer(
                                underlying_model,
                                normalizing_model,
                                AbsErrorErrFunc())
                nc = RegressorNc(underlying_model,
                                    AbsErrorErrFunc(),
                                    normalizer)

                self.estimator = AggregatedCp(IcpRegressor(nc),
                                                BootstrapSampler())

                self.estimator.fit(X, Y)
                results.append(('model', 'model type', 'conformal XGBOOST quantitative'))

            # Conformal classifier
            else:

                LOG.info("Building conformal Qualitative XGBOOST model")

                self.estimator = AggregatedCp(
                                    IcpClassifier(
                                        ClassifierNc(
                                            ClassifierAdapter(self.estimator_temp),
                                            MarginErrFunc()
                                        )
                                    ),
                                    BootstrapSampler())

                # Fit estimator to the data
                self.estimator.fit(X, Y)
                results.append(('model', 'model type', 'conformal XGBOOST qualitative'))

        except Exception as e:
            # Same as above: log the message, then re-raise as before.
            LOG.error(f'Exception building conformal XGBOOST estimator with exception {e}')
            raise

        return True, results



## Overriding of parent methods

    # def CF_quantitative_validation(self):
    #     ''' performs validation for conformal quantitative models '''

      

    # def CF_qualitative_validation(self):
    #     ''' performs validation for conformal qualitative models '''


    # def quantitativeValidation(self):
    #     ''' performs validation for quantitative models '''

    # def qualitativeValidation(self):
    #     ''' performs validation for qualitative models '''


    # def validate(self):
    #     ''' Validates the model and computes suitable model quality scoring values'''


    # def optimize(self, X, Y, estimator, tune_parameters):
    #     ''' optimizes a model using a grid search over a range of values for diverse parameters'''


    # def regularProject(self, Xb, results):
    #     ''' projects a collection of query objects in a regular model, for obtaining predictions '''


    # def conformalProject(self, Xb, results):
    #     ''' projects a collection of query objects in a conformal model, for obtaining predictions '''


    # def project(self, Xb, results):
    #     ''' Uses the X matrix provided as argument to predict Y'''
Esempio n. 11
0
def train_and_test_cp_algo(parameters):
    """Fit, calibrate and evaluate a conformal regressor on 29 data files.

    For each numbered CSV file under ``data`` a fresh underlying regressor
    is created, the standardized data is split into training, calibration
    and test parts (the last 120 rows are always the test set), an inductive
    conformal regressor is fitted and calibrated, and the de-standardized
    interval bounds, true values and interval midpoints are written to one
    CSV file (created for the first input file, appended to afterwards).

    Parameters
    ----------
    parameters : dict
        Must contain the control keys ``algorithm``,
        ``randomized_calibration``, ``alpha_`` (significance level),
        ``calibration_size`` and ``WhichCP`` (``'NCP'`` selects the
        normalized nonconformity function).  Every other entry is passed to
        the underlying regressor.

    Raises
    ------
    KeyError
        If one of the control keys is missing.
    ValueError
        If ``algorithm`` is not one of the supported names.
    """
    import os

    # Everything that is not a control key is a model hyper-parameter.
    p = parameters.copy()
    for control_key in ('algorithm', 'randomized_calibration', 'alpha_',
                        'calibration_size', 'WhichCP'):
        p.pop(control_key)

    algorithm_name = parameters.get('algorithm')
    which_cp = parameters.get("WhichCP")
    alpha = parameters.get('alpha_')
    calibration_size = parameters.get("calibration_size")
    # `== True` is kept on purpose so exactly the same values as before
    # (True / 1) enable the randomized split.
    randomized = parameters.get('randomized_calibration') == True  # noqa: E712

    output_path = (
        which_cp + '_' + algorithm_name + '_'
        + str(np.round(alpha * 100).astype(int)) + '_'
        + 'calibrationwindow' + str(calibration_size) + '.csv')

    for i in tqdm(range(29)):
        # A new, unfitted model for every input file.
        if algorithm_name == 'RandomForest':
            algorithm = RandomForestRegressor(**p)
        elif algorithm_name == 'K-NearestNeighbours':
            algorithm = KNeighborsRegressor(**p)
        elif algorithm_name == 'LightGBM':
            algorithm = LGBMRegressor(**p)
        elif algorithm_name == 'LassoRegression':
            algorithm = Lasso(**p)
        elif algorithm_name == 'NeuralNetwork':
            algorithm = NeuralNetworkAlgorithm(p)
        elif algorithm_name == 'LSTM':
            algorithm = BiLSTM(**p)
        elif algorithm_name == 'GradientBoosting':
            algorithm = GradientBoostingRegressor(**p)
        else:
            raise ValueError(
                'Unsupported algorithm: {!r}'.format(algorithm_name))

        # os.path.join avoids the invalid "\E" escape of the former literal
        # and yields the same path on Windows.
        path = os.path.join(
            'data', 'EURUSD_NETPOSUSD_hourly_for_regresion' + str(i) + '.csv')
        df = pd.read_csv(path).drop(['Unnamed: 0', 'QdfTime'], axis=1).fillna(0)

        # Target mean/std are kept to undo the standardization afterwards.
        m, s = df['NetPosUsd'].mean(), df['NetPosUsd'].std()
        df = (df - df.mean(axis=0)) / df.std(axis=0)

        features = df.drop(['NetPosUsd'], axis=1).values
        target = df['NetPosUsd'].values

        if randomized:
            # Calibration rows are drawn at random from everything that
            # precedes the test window; the remainder is used for training.
            train_test_split = len(df) - 120
            train_ = features[:train_test_split]
            ytrain_ = target[:train_test_split]
            choose = np.random.choice(len(train_), calibration_size,
                                      replace=False)
            mask = np.ones(len(train_), dtype=bool)
            mask[choose] = False

            calibrate, ycalibrate = train_[choose, :], ytrain_[choose]
            train, ytrain = train_[mask, :], ytrain_[mask]
            test, ytest = features[train_test_split:], target[train_test_split:]
        else:
            # Chronological split: training, calibration window, test window.
            train_test_split = len(df) - 120 - calibration_size
            calibration_end = train_test_split + calibration_size

            train, ytrain = features[:train_test_split], target[:train_test_split]
            calibrate = features[train_test_split:calibration_end]
            ycalibrate = target[train_test_split:calibration_end]
            test, ytest = features[-120:], target[-120:]

        underlying_model = RegressorAdapter(algorithm)
        if which_cp == 'NCP':
            normalizing_model = RegressorAdapter(
                KNeighborsRegressor(n_neighbors=50))
            normalizer = RegressorNormalizer(
                underlying_model, normalizing_model, AbsErrorErrFunc())
            nc = RegressorNc(underlying_model, AbsErrorErrFunc(), normalizer)
            header = ['NCP_lower', 'NCP_upper', 'NetPosUsd', 'prediction']
        else:
            nc = RegressorNc(underlying_model, AbsErrorErrFunc())
            header = ['CP_lower', 'CP_upper', 'NetPosUsd', 'prediction']

        icp = IcpRegressor(nc)
        icp.fit(train, ytrain)
        icp.calibrate(calibrate, ycalibrate)

        prediction = icp.predict(test, significance=alpha)
        # The point prediction is reported as the interval midpoint.
        midpoint = prediction[:, 1] / 2 + prediction[:, 0] / 2

        # Back to the original scale of the target.
        prediction = prediction * s + m
        ytest = ytest * s + m
        midpoint = midpoint * s + m

        table = np.vstack([prediction.T, ytest, midpoint.T]).T
        dfncp = pd.DataFrame(table, columns=header)

        if i == 0:
            # First file creates the output and writes the header row.
            dfncp.to_csv(output_path, encoding='utf-8', index=False)
        else:
            dfncp.to_csv(output_path, mode='a', header=False, index=False)
Esempio n. 12
0
def cv(df, parameters):
    """Evaluate one hyper-parameter set with three forward-chaining folds.

    The last 120 rows of ``df`` are excluded.  For each of three cut ratios
    the standardized data is split into training, calibration and test
    parts, a normalized and a plain inductive conformal regressor are fitted
    around the same underlying model, and ``qd_objective`` is evaluated on
    the de-standardized intervals.  The mean losses, together with the
    hyper-parameters, are appended as one row to ``<algorithm>_cv.csv``
    (the file is created with a header if it does not exist).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``NetPosUsd`` (target) and ``QdfTime``.
    parameters : dict
        Must contain ``algorithm``, ``randomized_calibration`` and
        ``alpha_`` (significance level); every other entry is passed to the
        underlying regressor.

    Raises
    ------
    KeyError
        If one of the three control keys is missing.
    ValueError
        If ``algorithm`` is not one of the supported names.
    """
    end = len(df) - 120
    fold = int(end / 6)
    alpha = parameters.get('alpha_')
    algorithm_name = parameters.get('algorithm')
    # `== True` is kept on purpose so exactly the same values as before
    # (True / 1) enable the randomized split.
    randomized = parameters.get('randomized_calibration') == True  # noqa: E712

    # Everything that is not a control key is a model hyper-parameter.
    p = parameters.copy()
    p.pop('algorithm')
    p.pop('randomized_calibration')
    p.pop('alpha_')

    if algorithm_name == 'RandomForest':
        algorithm = RandomForestRegressor(**p)
    elif algorithm_name == 'K-NearestNeighbours':
        algorithm = KNeighborsRegressor(**p)
    elif algorithm_name == 'LightGBM':
        algorithm = LGBMRegressor(**p)
    elif algorithm_name == 'LassoRegression':
        algorithm = Lasso(**p)
    elif algorithm_name == 'NeuralNetwork':
        algorithm = NeuralNetworkAlgorithm(p)
    elif algorithm_name == 'LSTM':
        algorithm = BiLSTM(**p)
    else:
        raise ValueError('Unsupported algorithm: {!r}'.format(algorithm_name))

    # Row written to the summary file: hyper-parameters, significance, losses.
    # A copy is used so the dict handed to the model above is not mutated.
    d = dict(p)
    d['alpha_'] = alpha

    # Target mean/std are kept to undo the standardization afterwards.
    m, s = df['NetPosUsd'].mean(), df['NetPosUsd'].std()
    df = df.drop(['QdfTime'], axis=1)
    df = (df - df.mean(axis=0)) / df.std(axis=0)

    features = df.drop(['NetPosUsd'], axis=1).values
    target = df['NetPosUsd'].values

    out = np.zeros(3)   # loss of the plain conformal predictor per fold
    out2 = np.zeros(3)  # loss of the normalized conformal predictor per fold

    for i, ratio in enumerate([.5, 0.66, .84]):
        cut = int(end * ratio)
        test, ytest = features[cut:cut + fold], target[cut:cut + fold]

        if randomized:
            # Calibration rows are drawn at random from everything before
            # the cut; the remainder is used for training.
            train_, ytrain_ = features[:cut], target[:cut]
            choose = np.random.choice(len(train_), fold, replace=False)
            mask = np.ones(len(train_), dtype=bool)
            mask[choose] = False

            calibrate, ycalibrate = train_[choose, :], ytrain_[choose]
            train, ytrain = train_[mask, :], ytrain_[mask]
        else:
            # Chronological: the window right before the cut calibrates.
            train, ytrain = features[:cut - fold], target[:cut - fold]
            calibrate, ycalibrate = features[cut - fold:cut], target[cut - fold:cut]

        # Normalized conformal predictor.
        underlying_model = RegressorAdapter(algorithm)
        normalizing_model = RegressorAdapter(KNeighborsRegressor(n_neighbors=50))
        normalizer = RegressorNormalizer(underlying_model, normalizing_model,
                                         AbsErrorErrFunc())
        icp = IcpRegressor(
            RegressorNc(underlying_model, AbsErrorErrFunc(), normalizer))
        icp.fit(train, ytrain)
        icp.calibrate(calibrate, ycalibrate)

        ncp_bounds = icp.predict(test, significance=alpha)
        # The point prediction is reported as the interval midpoint.
        midpoint = ncp_bounds[:, 1] / 2 + ncp_bounds[:, 0] / 2
        ncp_bounds = ncp_bounds * s + m

        # Plain conformal predictor around the same underlying model.
        icp = IcpRegressor(
            RegressorNc(RegressorAdapter(algorithm), AbsErrorErrFunc()))
        icp.fit(train, ytrain)
        icp.calibrate(calibrate, ycalibrate)
        cp_bounds = icp.predict(test, significance=alpha) * s + m

        # qd_objective keeps receiving pandas Series, as before.
        dfncp = pd.DataFrame({
            'NCP_lower': ncp_bounds[:, 0],
            'NCP_upper': ncp_bounds[:, 1],
            'NetPosUsd': ytest * s + m,
            'prediction': midpoint * s + m,
            'CP_lower': cp_bounds[:, 0],
            'CP_upper': cp_bounds[:, 1],
        })

        out[i] = qd_objective(dfncp.NetPosUsd, dfncp['CP_lower'],
                              dfncp['CP_upper'], alpha)
        out2[i] = qd_objective(dfncp.NetPosUsd, dfncp['NCP_lower'],
                               dfncp['NCP_upper'], alpha)

    d['CP_loss'] = np.mean(out)
    d['NCP_loss'] = np.mean(out2)

    summary_path = algorithm_name + '_cv.csv'
    row = pd.DataFrame(data=d, index=[0])
    if os.path.exists(summary_path):
        # Existing file: append the row without repeating the header.
        row.to_csv(summary_path, mode='a', header=False, index=False)
    else:
        row.to_csv(summary_path, encoding='utf-8', index=False)
Esempio n. 13
0
    def build(self):
        '''Build a new RF model with the X and Y numpy matrices.

        Works on copies of ``self.X`` and ``self.Y``. When ``self.tune`` is
        set the estimator is optimized through ``self.optimize``; otherwise
        it is instantiated from ``self.estimator_parameters``. When
        ``self.conformal`` is set, an aggregated conformal predictor built
        around ``self.estimator`` is fitted as well and stored in
        ``self.conformal_pred``. Finally ``self.estimator`` itself is fitted.

        Returns:
            ``False`` when ``self.failed`` is set; otherwise the tuple
            ``(True, results)`` where ``results`` is a list of
            ``(key, label, value)`` tuples describing the model.
        '''

        if self.failed:
            # NOTE(review): this path returns a bare False while the success
            # path returns a (bool, list) tuple -- confirm callers cope.
            return False

        X = self.X.copy()
        Y = self.Y.copy()

        results = [
            ('nobj', 'number of objects', self.nobj),
            ('nvarx', 'number of predictor variables', self.nvarx),
        ]

        if self.cv:
            self.cv = getCrossVal(self.cv,
                                  self.estimator_parameters["random_state"],
                                  self.n, self.p)

        if self.tune:
            if self.quantitative:
                self.optimize(X, Y, RandomForestRegressor(),
                              self.tune_parameters)
                results.append(
                    ('model', 'model type', 'RF quantitative (optimized)'))
            else:
                self.optimize(X, Y, RandomForestClassifier(),
                              self.tune_parameters)
                results.append(
                    ('model', 'model type', 'RF qualitative (optimized)'))
        elif self.quantitative:
            log.info("Building Quantitative RF model")
            # Dropped before building the regressor; it is only passed on to
            # the classifier in the qualitative branch.
            self.estimator_parameters.pop('class_weight', None)

            self.estimator = RandomForestRegressor(
                **self.estimator_parameters)
            results.append(('model', 'model type', 'RF quantitative'))
        else:
            log.info("Building Qualitative RF model")
            self.estimator = RandomForestClassifier(
                **self.estimator_parameters)
            results.append(('model', 'model type', 'RF qualitative'))

        if self.conformal:
            if self.quantitative:
                underlying_model = RegressorAdapter(self.estimator)
                # The normalizing model wraps the very same estimator. A
                # KNeighborsRegressor-based adapter used to be created here
                # too, but it was overwritten before ever being used.
                normalizing_model = RegressorAdapter(self.estimator)
                normalizer = RegressorNormalizer(underlying_model,
                                                 normalizing_model,
                                                 AbsErrorErrFunc())
                nc = RegressorNc(underlying_model, AbsErrorErrFunc(),
                                 normalizer)

                self.conformal_pred = AggregatedCp(IcpRegressor(nc),
                                                   BootstrapSampler())
                self.conformal_pred.fit(X, Y)
                # overrides non-conformal
                results.append(
                    ('model', 'model type', 'conformal RF quantitative'))

            else:
                self.conformal_pred = AggregatedCp(
                    IcpClassifier(
                        ClassifierNc(ClassifierAdapter(self.estimator),
                                     MarginErrFunc())), BootstrapSampler())
                self.conformal_pred.fit(X, Y)
                # overrides non-conformal
                results.append(
                    ('model', 'model type', 'conformal RF qualitative'))

        self.estimator.fit(X, Y)

        return True, results


#### Overriding of parent methods

# def CF_quantitative_validation(self):
#     ''' performs validation for conformal quantitative models '''

# def CF_qualitative_validation(self):
#     ''' performs validation for conformal qualitative models '''

# def quantitativeValidation(self):
#     ''' performs validation for quantitative models '''

# def qualitativeValidation(self):
#     ''' performs validation for qualitative models '''

# def validate(self):
#     ''' Validates the model and computes suitable model quality scoring values'''

# def optimize(self, X, Y, estimator, tune_parameters):
#     ''' optimizes a model using a grid search over a range of values for diverse parameters'''

# def regularProject(self, Xb, results):
#     ''' projects a collection of query objects in a regular model, for obtaining predictions '''

# def conformalProject(self, Xb, results):
#     ''' projects a collection of query objects in a conformal model, for obtaining predictions '''

# def project(self, Xb, results):
#     ''' Uses the X matrix provided as argument to predict Y'''
Esempio n. 14
0
from nonconformist.nc import RegressorNc, abs_error, abs_error_inv


def split_data(data, n_train, n_test):
    """Randomly split *data* into two parts sized in the ratio n_train:n_test.

    The first part receives ``n_train * len(data) // (n_train + n_test)``
    randomly chosen rows; the second part receives all remaining rows.
    Rows are selected by indexing *data* with arrays of shuffled positions.
    """
    total = len(data)
    first_size = n_train * total // (n_train + n_test)
    shuffled = np.random.permutation(total)
    first_rows = shuffled[:first_size]
    second_rows = shuffled[first_size:]
    return data[first_rows], data[second_rows]

# Load the "auto-mpg" table and run it through the imputer.
data = Orange.data.Table("auto-mpg")
imp = Impute()
data = imp(data)

# For every significance level, repeat a random split / fit / calibrate /
# predict cycle 10 times and print the mean error rate and mean interval
# width of the inductive conformal regressor.
for sig in np.linspace(0.01, 0.1, 10):
    errs = []
    szs = []
    for rep in range(10):
        # Split off the test part first, then carve the calibration part
        # out of what is left for training.
        train, test = split_data(data, 2, 1)
        train, calib = split_data(train, 2, 1)

        nc = RegressorNc(DecisionTreeRegressor(), abs_error, abs_error_inv)
        icp = IcpRegressor(nc)
        icp.fit(train.X, train.Y)
        icp.calibrate(calib.X, calib.Y)
        pred = icp.predict(test.X, significance=sig)

        n_pred = len(pred)
        # Number of test targets that fall inside their predicted interval.
        hits = sum(bounds[0] <= y <= bounds[1]
                   for bounds, y in zip(pred, test.Y))
        width_total = sum(bounds[1] - bounds[0] for bounds in pred)
        errs.append(1 - hits / n_pred)
        szs.append(width_total / n_pred)
    print(sig, np.mean(errs), np.mean(szs))
Esempio n. 15
0
def _count_outside_interval(lower, upper, observed, n_rows):
    """Count rows whose observed value is not strictly between lower and upper.

    Rows that have no entry in one of the three series are skipped.
    """
    # Drop the NaN rows once instead of on every loop iteration.
    lower = lower.dropna()
    upper = upper.dropna()
    outside = 0
    for row in range(n_rows):
        try:
            inside = lower[row] < observed[row] < upper[row]
        except (KeyError, IndexError):
            # No prediction or no observation is stored for this row.
            continue
        if not inside:
            outside += 1
    return outside


def create_conformal_model():
    """Create conformal model - main loop.

    Reads the data named by ``args.i``, calculates descriptors from its
    ``smiles`` column and fits ``args.num_models`` inductive conformal
    regressors, drawing new train/calibration/test indices after each one.
    The per-model interval bounds are combined through their row-wise
    median; statistics for the test and calibration rows are printed, the
    fitted models are handed to ``save_models`` and the descriptors plus
    median intervals are written out through ``write_csv_with_data``.
    Optional plots and summaries are controlled by ``args.plot``,
    ``args.phys`` and ``args.pca``.

    Raises:
        ValueError: if ``args.m`` is neither ``'RF'`` nor ``'SVM'``.
    """

    # Read data from file
    data = read_data(args.i)

    # Calculate descriptors using RD-kit
    descriptors_df = calculate_descriptors(data['smiles'])

    # Create indices for the test, training and calibration sets
    train_i, calibrate_i, test_i = create_indices_test_training_calibration(
        data)
    test_index_total = list(test_i)
    calibrate_index_total = list(calibrate_i)

    # Create inductive conformal prediction regressor
    if args.m == 'RF':
        icp = IcpRegressor(
            NormalizedRegressorNc(RandomForestRegressor,
                                  KNeighborsRegressor,
                                  abs_error,
                                  abs_error_inv,
                                  model_params={'n_estimators': 100}))
    elif args.m == 'SVM':
        # No support vector regressor
        print('error - no SVM-regressor avliable')
        # NOTE(review): 'n_estimators' is not an SVR parameter in
        # scikit-learn, so this branch presumably fails once the model is
        # instantiated -- confirm whether it is meant to be usable.
        icp = IcpRegressor(
            NormalizedRegressorNc(SVR,
                                  KNeighborsRegressor,
                                  abs_error,
                                  abs_error_inv,
                                  model_params={'n_estimators': 100}))
    else:
        # Fail early with a clear message instead of a NameError on the
        # first use of 'icp' further down.
        raise ValueError('Unknown model type: %r' % (args.m, ))

    # DataFrames collecting, per model, the lower (A / iA) and upper
    # (B / iB) interval bounds; C / iC hold the aggregated medians.
    A = pandas.DataFrame(index=range(len(data)))
    B = pandas.DataFrame(index=range(len(data)))
    C = pandas.DataFrame(index=range(len(data)))

    iA = pandas.DataFrame(index=range(len(data)))
    iB = pandas.DataFrame(index=range(len(data)))
    iC = pandas.DataFrame(index=range(len(data)))

    if args.verbose:
        print('Number of models to create: ' + str(args.num_models))
        print('############## Starting calculations ##############')

    icp_s = []

    for i in range(int(args.num_models)):
        Xtrain, Xtest, Xcalibrate, ytrain, ytest, ycalibrate = create_train_test_calibrate_sets(
            data, descriptors_df, train_i, calibrate_i, test_i)

        # Fit and calibrate the model on the current split
        icp.fit(Xtrain, ytrain)
        icp.calibrate(asanyarray(Xcalibrate), asanyarray(ycalibrate))

        # Predict the test and calibration sets
        prediction_test = icp.predict(asanyarray(Xtest),
                                      significance=args.significance)
        prediction_calibrate = icp.predict(asanyarray(Xcalibrate),
                                           significance=args.significance)

        # Store the interval bounds under the row labels they belong to
        blob = pandas.DataFrame(prediction_test, index=test_i)
        iblob = pandas.DataFrame(prediction_calibrate, index=calibrate_i)

        A[i] = blob[0]
        B[i] = blob[1]

        iA[i] = iblob[0]
        iB[i] = iblob[1]

        # Accumulate every row that has been in a test / calibration set
        test_index_total = np.unique(
            np.concatenate((test_index_total, test_i), axis=0))
        calibrate_index_total = np.unique(
            np.concatenate((calibrate_index_total, calibrate_i), axis=0))

        # Create new indices for the next model
        train_i, calibrate_i, test_i = randomize_new_indices(
            train_i, calibrate_i, test_i, data, i)

        # Deep copy: 'icp' is refitted in place on the next iteration, and a
        # shallow copy would keep sharing its inner objects with it.
        icp_s.append(copy.deepcopy(icp))

    ### Save models ###
    save_models(icp_s)

    if args.verbose:
        print(
            '################## Loop finished, model created, test set predicted #################'
        )

    experimental_values = data['Observed'][test_index_total]
    iexperimental_values = data['Observed'][calibrate_index_total]

    # External (test set) prediction: median bounds, midpoint, half-width
    C['median_prediction_0'] = A.median(axis=1)
    C['median_prediction_1'] = B.median(axis=1)
    C['median_prediction'] = (C['median_prediction_0'] +
                              C['median_prediction_1']) / 2
    C['median_prediction_size'] = C['median_prediction'] - C[
        'median_prediction_0']

    Y_pred_median_test = C['median_prediction'].dropna()
    median_prediction_size = C['median_prediction_size'].dropna().tolist()

    num_outside_median = _count_outside_interval(C['median_prediction_0'],
                                                 C['median_prediction_1'],
                                                 experimental_values,
                                                 len(data))

    # Internal prediction (rows used for calibration)
    iC['median_prediction_0'] = iA.median(axis=1)
    iC['median_prediction_1'] = iB.median(axis=1)
    iC['median_prediction'] = (iC['median_prediction_0'] +
                               iC['median_prediction_1']) / 2
    iC['median_prediction_size'] = iC['median_prediction'] - iC[
        'median_prediction_0']

    iY_pred_median_test = iC['median_prediction'].dropna()
    imedian_prediction_size = iC['median_prediction_size'].dropna().tolist()

    inum_outside_median = _count_outside_interval(iC['median_prediction_0'],
                                                  iC['median_prediction_1'],
                                                  iexperimental_values,
                                                  len(data))

    if args.verbose:
        print(
            '########################## Prediction statistics external test ##########################'
        )
        print('')

    print('Number of compounds predicted in test set: ' +
          str(C['median_prediction'].notnull().sum()))

    if args.t != 'full_model':
        ex_r2_score = r2_score(experimental_values, Y_pred_median_test)
        print('R^2 (coefficient of determination):  %.3f' % ex_r2_score)

        ex_mean_squared_error = mean_squared_error(experimental_values,
                                                   Y_pred_median_test)
        ex_rmse = sqrt(ex_mean_squared_error)
        print('RMSE:  %.3f' % ex_rmse)

        ex_MAE = mean_absolute_error(experimental_values, Y_pred_median_test)
        print('Mean absolute error:  %.3f' % ex_MAE)

        print('Mean squared error: %.3f' % ex_mean_squared_error)

        # Average prediction range
        print('Mean of median prediction range: %.3f' %
              mean(median_prediction_size))

        percent_num_outside_median = 100 * float(num_outside_median) / float(
            len(experimental_values))
        print('Number of compounds outside of prediction range: ' +
              str(num_outside_median))
        print('% of compounds predicted outside of prediction range: ' +
              str(percent_num_outside_median) + ' %')
        print(' ')

        ##### Internal prediction ########

        print('Number of compounds predicted in training set: ' +
              str(iC['median_prediction'].notnull().sum()))

        iex_r2_score = r2_score(iexperimental_values, iY_pred_median_test)
        print('R^2 (coefficient of determination):  %.3f' % iex_r2_score)

        iex_mean_squared_error = mean_squared_error(iexperimental_values,
                                                    iY_pred_median_test)
        iex_rmse = sqrt(iex_mean_squared_error)
        print('RMSE:  %.3f' % iex_rmse)

        print('Mean squared error: %.3f' % iex_mean_squared_error)

        iex_MAE = mean_absolute_error(iexperimental_values,
                                      iY_pred_median_test)
        print('Mean absolute error:  %.3f' % iex_MAE)

        # Average prediction range
        print('Mean of median prediction range: %.3f' %
              mean(imedian_prediction_size))

        ipercent_num_outside_median = 100 * float(inum_outside_median) / float(
            len(iexperimental_values))
        print('Number of compounds outside of prediction range: ' +
              str(inum_outside_median))
        print('% of compounds predicted outside of prediction range: ' +
              str(ipercent_num_outside_median) + ' %')
        print(' ')

        #### Plot results - plot test set
        if args.plot:
            if args.verbose:
                print(' ################ Plotting testset #################')
            fig, ax = plt.subplots()

            ax.errorbar(experimental_values,
                        Y_pred_median_test,
                        yerr=median_prediction_size,
                        fmt='o',
                        markeredgecolor='black',
                        markersize=6,
                        mew=1,
                        ecolor='black',
                        elinewidth=0.3,
                        capsize=3,
                        capthick=1,
                        errorevery=1)

            # Set the size
            ax.set_ylim([-10, -3])
            ax.set_xlim([-10, -3])

            # Plot labels
            plt.ylabel('Predicted log Kp')
            plt.xlabel('Experimental log Kp')

            # Draw the identity line across the plotted range
            x = [-10, -3]
            ax.plot(x, x, color='black')

            plt.show()

    # Print data in CSV-file

    descriptors_df['Median prediction low range'] = C['median_prediction_0']
    descriptors_df['Median prediction high range'] = C['median_prediction_1']
    descriptors_df['Median prediction'] = C['median_prediction']
    descriptors_df['size prediction range'] = C['median_prediction_1'] - C[
        'median_prediction_0']
    write_csv_with_data(data, descriptors_df, args.d)

    # Calculate min, max and mean values for descriptors
    if args.phys:
        print(args.phys)
        print('Min: ')
        print(descriptors_df.min())
        print('Max: ')
        print(descriptors_df.max())
        print('Mean:')
        print(descriptors_df.mean())

    if args.pca:
        # Descriptor columns that go into the PCA
        pca_columns = [
            'logP', 'PSA', 'MolWt', 'RingCount', 'HeavyAtomCount',
            'NumRotatableBonds'
        ]

        print('Starting PCA')
        print(descriptors_df[pca_columns].head(3))
        print(len(descriptors_df[['size prediction range']]))

        # Define type of PCA
        pca = PCA(n_components=2)

        # Convert descriptor values to numeric/float
        df_X = descriptors_df[pca_columns].apply(pandas.to_numeric,
                                                 errors='raise')

        # Scale data
        scaler = preprocessing.RobustScaler()
        df_X_scaled = scaler.fit_transform(df_X)

        # Calculate PCA on all compounds
        pca.fit(df_X_scaled)
        X2 = pca.transform(df_X_scaled)

        # Project only the compounds that received a prediction range
        desc_testset_large = descriptors_df.dropna(
            subset=['size prediction range'])
        desc_testset_num = desc_testset_large[pca_columns].apply(
            pandas.to_numeric, errors='raise')

        # Reuse the scaling fitted on the full data set: the PCA was fitted
        # on data scaled that way, so refitting the scaler on this subset
        # would place the points in a different coordinate system.
        desc_testset_scaled = scaler.transform(desc_testset_num)
        X3 = pca.transform(desc_testset_scaled)

        Yerr_num = desc_testset_large[['size prediction range'
                                       ]].apply(pandas.to_numeric,
                                                errors='coerce')

        yerr = list(pandas.Series(Yerr_num['size prediction range']) / 4)

        plt.errorbar(X3[:, 0],
                     X3[:, 1],
                     yerr=yerr,
                     fmt='o',
                     markeredgecolor='black',
                     markersize=6,
                     mew=1,
                     ecolor='black',
                     elinewidth=0.3,
                     capsize=3,
                     capthick=1,
                     errorevery=1)

        plt.scatter(X2[:, 0], X2[:, 1])
        plt.xlabel('PC1')
        plt.ylabel('PC2')

        plt.title('PCA of descriptors')
        plt.show()
Esempio n. 16
0
def train_and_test_cp_algo(i):
    """Fit normalized and plain conformal regressors on data file *i* and dump their intervals.

    Reads ``data/EURUSD_NETPOSUSD_hourly_for_regresion<i>.csv``, transforms
    the ``NetPosUsd`` column with ``mlog_trans`` and turns it into sliding
    windows of 96 values, each used to predict the value that follows it.
    The last 120 windows form the test set, the 3480 before them the
    calibration set and everything earlier the training set.

    Two inductive conformal regressors are fitted: one with a
    KNN-normalized nonconformity function ("NCP") and one without
    normalization ("CP"). For every significance level from 5% to 95% in
    steps of 5%, the back-transformed interval bounds, the raw test values
    and the interval midpoints are written to one CSV file per method and
    level. The file is created (with header) when ``i == 0`` and appended
    to otherwise.

    Raises:
        ValueError: if the file holds too few rows to leave any training
            windows.
    """
    window = 96
    calibration_size = 3480
    test_size = 120
    p = {'window': window}
    algorithm = BiLSTM(p)

    # A forward slash works on every platform; the former 'data\E...'
    # literal relied on an invalid escape sequence and a Windows separator.
    path = 'data/EURUSD_NETPOSUSD_hourly_for_regresion' + str(i) + '.csv'
    df = pd.read_csv(path).drop(['QdfTime', 'Unnamed: 0'], axis=1).fillna(0)
    # Keep the untransformed test targets and the statistics handed to
    # mlog_inverse before the column is overwritten.
    y_raw_test = df.NetPosUsd[-test_size:]
    median_ = df.NetPosUsd.median()
    mad_ = mad(df.NetPosUsd.values)
    df.NetPosUsd = mlog_trans(df.NetPosUsd.values)

    data = df.NetPosUsd.values

    # Row k of X holds data[k:k + window]; y[k] is the value right after it.
    X = np.array([data[stop - window:stop]
                  for stop in range(window, data.shape[0])])
    y = data[window:]

    train_test_split = X.shape[0] - test_size - calibration_size
    if train_test_split <= 0:
        raise ValueError(
            'not enough rows in %s to build training, calibration and '
            'test sets' % path)

    train = X[:train_test_split, :]
    calibrate = X[train_test_split:train_test_split + calibration_size, :]
    test = X[-test_size:]

    ytrain = y[:train_test_split]
    ycalibrate = y[train_test_split:train_test_split + calibration_size]

    # Normalized conformal predictor (NCP)
    underlying_model = RegressorAdapter(algorithm)
    normalizing_model = RegressorAdapter(KNeighborsRegressor(n_neighbors=50))
    normalizer = RegressorNormalizer(underlying_model, normalizing_model, AbsErrorErrFunc())
    nc = RegressorNc(underlying_model, AbsErrorErrFunc(), normalizer)
    icp = IcpRegressor(nc)
    icp.fit(train, ytrain)
    icp.calibrate(calibrate, ycalibrate)

    # Plain conformal predictor (CP). It gets its own model instance:
    # wrapping 'algorithm' again would refit the model 'icp' was calibrated
    # with, so 'icp' would predict with a model its calibration never saw.
    underlying_model2 = RegressorAdapter(BiLSTM(p))
    nc2 = RegressorNc(underlying_model2, AbsErrorErrFunc())
    icp2 = IcpRegressor(nc2)
    icp2.fit(train, ytrain)
    icp2.calibrate(calibrate, ycalibrate)

    def interval_frame(predictor, significance, prefix):
        """Predict the test windows and return the back-transformed table."""
        prediction = predictor.predict(test, significance=significance)
        lower = mlog_inverse(prediction[:, 0], median_, mad_)
        upper = mlog_inverse(prediction[:, 1], median_, mad_)
        # The 'prediction' column is the midpoint of the interval.
        midpoint = upper / 2 + lower / 2
        table = np.vstack([lower, upper, y_raw_test, midpoint.T]).T
        header = [prefix + '_lower', prefix + '_upper', 'NetPosUsd', 'prediction']
        return pd.DataFrame(table, columns=header)

    def dump(frame, prefix, level):
        """Create the CSV for the first data file, append for later ones."""
        filename = (prefix + '_' + 'cudaLSTM' + '_' +
                    str(np.round(level).astype(int)) + '_' +
                    'calibrationwindow' + str(calibration_size) + '.csv')
        if i == 0:
            frame.to_csv(filename, encoding='utf-8', index=False)
        else:
            frame.to_csv(filename, mode='a', header=False, index=False)

    for a in tqdm(np.linspace(5, 95, 19)):
        dfncp = interval_frame(icp, a / 100, 'NCP')
        dfcp = interval_frame(icp2, a / 100, 'CP')

        dump(dfcp, 'CP', a)
        dump(dfncp, 'NCP', a)
Esempio n. 17
0
    def build(self):
        '''Build a new DL model with the X and Y numpy matrices '''

        try:
            from keras.wrappers.scikit_learn import KerasClassifier
            from keras.wrappers.scikit_learn import KerasRegressor
        except Exception as e:
            return False, 'Keras not found, please revise your environment'

        # Make a copy of data matrices
        X = self.X.copy()
        Y = self.Y.copy()

        results = []
        results.append(('nobj', 'number of objects', self.nobj))
        results.append(('nvarx', 'number of predictor variables', self.nvarx))

        # If tune then call gridsearch to optimize the estimator
        if self.param.getVal('tune'):

            LOG.info("Optimizing Keras estimator")

            try:
                # Check type of model
                if self.param.getVal('quantitative'):
                    self.estimator = KerasRegressor(
                        **self.estimator_parameters)
                    self.optimize(X, Y, self.estimator, self.tune_parameters)
                    results.append(('model', 'model type',
                                    'KERAS quantitative (optimized)'))
                else:
                    self.estimator = KerasClassifier(
                        **self.estimator_parameters)
                    #params = self.estimator.get_params()
                    #params['num_class'] = 2
                    self.optimize(X, Y, self.estimator, self.tune_parameters)
                    results.append(('model', 'model type',
                                    'KERAS qualitative (optimized)'))

            except Exception as e:
                return False, f'Exception optimizing KERAS estimator with exception {e}'

        else:
            try:
                if self.param.getVal('quantitative'):

                    LOG.info("Building Quantitative KERAS mode")
                    self.estimator = KerasRegressor(
                        build_fn=self.create_model,
                        **self.estimator_parameters,
                        verbose=0)
                    results.append(
                        ('model', 'model type', 'Keras quantitative'))
                else:

                    LOG.info("Building Qualitative Keras model")
                    self.estimator = KerasClassifier(
                        build_fn=self.create_model,
                        dim=self.X.shape[1],
                        **self.estimator_parameters,
                        verbose=0)
                    results.append(
                        ('model', 'model type', 'Keras qualitative'))

                self.estimator.fit(X, Y)
                print(self.estimator)

            except Exception as e:
                raise e
                return False, f'Exception building Keras estimator with exception {e}'

        self.estimator_temp = clone(self.estimator)

        if not self.param.getVal('conformal'):
            return True, results
        # Create the conformal estimator
        try:
            # Conformal regressor
            if self.param.getVal('quantitative'):

                LOG.info("Building conformal Quantitative Keras model")

                underlying_model = RegressorAdapter(self.estimator_temp)
                normalizing_model = RegressorAdapter(
                    KNeighborsRegressor(n_neighbors=15))
                # normalizing_model = RegressorAdapter(self.estimator_temp)
                normalizer = RegressorNormalizer(underlying_model,
                                                 normalizing_model,
                                                 AbsErrorErrFunc())
                nc = RegressorNc(underlying_model, AbsErrorErrFunc(),
                                 normalizer)

                # self.conformal_pred = AggregatedCp(IcpRegressor
                # (RegressorNc(RegressorAdapter(self.estimator))),
                #                                   BootstrapSampler())

                self.estimator = AggregatedCp(IcpRegressor(nc),
                                              BootstrapSampler())

                self.estimator.fit(X, Y)
                results.append(
                    ('model', 'model type', 'conformal Keras quantitative'))

            # Conformal classifier
            else:

                LOG.info("Building conformal Qualitative Keras model")

                self.estimator = AggregatedCp(
                    IcpClassifier(
                        ClassifierNc(ClassifierAdapter(self.estimator_temp),
                                     MarginErrFunc())), BootstrapSampler())

                # Fit estimator to the data
                print('build finished')
                self.estimator.fit(X, Y)
                results.append(
                    ('model', 'model type', 'conformal Keras qualitative'))

        except Exception as e:
            raise e
            return False, f'Exception building conformal Keras estimator with exception {e}'

        return True, []
Esempio n. 18
0
    def test_cross_validation(self):
        # -----------------------------------------------------------------------------
        # Classification
        # -----------------------------------------------------------------------------
        data = load_iris()

        icp = IcpClassifier(
            ClassifierNc(
                ClassifierAdapter(RandomForestClassifier(n_estimators=100)),
                MarginErrFunc()))
        icp_cv = ClassIcpCvHelper(icp)

        scores = cross_val_score(
            icp_cv,
            data.data,
            data.target,
            iterations=5,
            folds=5,
            scoring_funcs=[class_mean_errors, class_avg_c],
            significance_levels=[0.05, 0.1, 0.2],
        )

        print("Classification: iris")
        scores = scores.drop(["fold", "iter"], axis=1)
        print(scores.groupby(["significance"]).mean())

        # -----------------------------------------------------------------------------
        # Regression, absolute error
        # -----------------------------------------------------------------------------
        data = load_diabetes()

        icp = IcpRegressor(
            RegressorNc(
                RegressorAdapter(RandomForestRegressor(n_estimators=100)),
                AbsErrorErrFunc()))
        icp_cv = RegIcpCvHelper(icp)

        scores = cross_val_score(
            icp_cv,
            data.data,
            data.target,
            iterations=5,
            folds=5,
            scoring_funcs=[reg_mean_errors, reg_median_size],
            significance_levels=[0.05, 0.1, 0.2],
        )

        print("Absolute error regression: diabetes")
        scores = scores.drop(["fold", "iter"], axis=1)
        print(scores.groupby(["significance"]).mean())

        # -----------------------------------------------------------------------------
        # Regression, normalized absolute error
        # -----------------------------------------------------------------------------
        data = load_diabetes()

        underlying_model = RegressorAdapter(
            RandomForestRegressor(n_estimators=100))
        normalizer_model = RegressorAdapter(
            RandomForestRegressor(n_estimators=100))
        normalizer = RegressorNormalizer(underlying_model, normalizer_model,
                                         AbsErrorErrFunc())
        nc = RegressorNc(underlying_model, AbsErrorErrFunc(), normalizer)

        icp = IcpRegressor(nc)
        icp_cv = RegIcpCvHelper(icp)

        scores = cross_val_score(
            icp_cv,
            data.data,
            data.target,
            iterations=5,
            folds=5,
            scoring_funcs=[reg_mean_errors, reg_median_size],
            significance_levels=[0.05, 0.1, 0.2],
        )

        print("Normalized absolute error regression: diabetes")
        scores = scores.drop(["fold", "iter"], axis=1)
        print(scores.groupby(["significance"]).mean())

        # -----------------------------------------------------------------------------
        # Regression, normalized signed error
        # -----------------------------------------------------------------------------
        data = load_diabetes()

        icp = IcpRegressor(
            RegressorNc(
                RegressorAdapter(RandomForestRegressor(n_estimators=100)),
                SignErrorErrFunc()))
        icp_cv = RegIcpCvHelper(icp)

        scores = cross_val_score(
            icp_cv,
            data.data,
            data.target,
            iterations=5,
            folds=5,
            scoring_funcs=[reg_mean_errors, reg_median_size],
            significance_levels=[0.05, 0.1, 0.2],
        )

        print("Signed error regression: diabetes")
        scores = scores.drop(["fold", "iter"], axis=1)
        print(scores.groupby(["significance"]).mean())

        # -----------------------------------------------------------------------------
        # Regression, normalized signed error
        # -----------------------------------------------------------------------------
        data = load_diabetes()

        underlying_model = RegressorAdapter(
            RandomForestRegressor(n_estimators=100))
        normalizer_model = RegressorAdapter(
            RandomForestRegressor(n_estimators=100))

        # The normalization model can use a different error function than is
        # used to measure errors on the underlying model
        normalizer = RegressorNormalizer(underlying_model, normalizer_model,
                                         AbsErrorErrFunc())
        nc = RegressorNc(underlying_model, SignErrorErrFunc(), normalizer)

        icp = IcpRegressor(nc)
        icp_cv = RegIcpCvHelper(icp)

        scores = cross_val_score(
            icp_cv,
            data.data,
            data.target,
            iterations=5,
            folds=5,
            scoring_funcs=[reg_mean_errors, reg_median_size],
            significance_levels=[0.05, 0.1, 0.2],
        )

        print("Normalized signed error regression: diabetes")
        scores = scores.drop(["fold", "iter"], axis=1)
        print(scores.groupby(["significance"]).mean())
Esempio n. 19
0
    def build(self):
        '''Build a quantitative PLSR model with the X and Y numpy matrices.

        When the 'tune' parameter is set, the estimator is optimized either
        with the parent class optimize method ('auto') or with this class
        optimize method ('manual'); otherwise a PLS_r estimator is
        instantiated from self.estimator_parameters. The estimator is then
        fitted and, when the 'conformal' parameter is set, replaced by an
        aggregated conformal regressor fitted on the same data.

        Returns:
            (True, results) on success, results being a list of
            (key, description, value) tuples; (False, message) when the
            optimization, the instantiation or the conformal set-up fails.
        '''

        # Make a copy of data matrices
        X = self.X.copy()
        Y = self.Y.copy()

        results = []
        results.append(('nobj', 'number of objects', self.nobj))
        results.append(('nvarx', 'number of predictor variables', self.nvarx))

        if self.param.getVal('tune'):

            # Optimize estimator using sklearn-gridsearch
            if self.estimator_parameters['optimize'] == 'auto':
                try:

                    LOG.info('Optimizing PLSR using SK-LearnGridSearch')

                    # Remove optimize key from parameter dictionary
                    # to avoid sklearn estimator error (unexpected keyword)
                    self.estimator_parameters.pop("optimize")

                    super(PLSR, self).optimize(X, Y, PLS_r(
                        **self.estimator_parameters),
                        self.param.getDict('PLSR_optimize'))

                except Exception as e:
                    message = (f'Error performing SK-LearnGridSearch'
                               f' on PLSR estimator with exception {e}')
                    LOG.error(message)
                    return False, message

            # Optimize using flame implementation (recommended)
            elif self.estimator_parameters['optimize'] == 'manual':

                LOG.info('Optimizing PLSR using manual method')

                # Remove optimize key from parameter dictionary
                # to avoid sklearn estimator error (unexpected keyword)
                self.estimator_parameters.pop("optimize")

                success, message = self.optimize(X, Y, PLS_r(
                    **self.estimator_parameters),
                    self.param.getDict('PLSR_optimize'))

                if not success:
                    return False, message

            else:
                LOG.error('Type of tune not recognized, check the input')
                return False, 'Type of tune not recognized, check the input'

            results.append(('model', 'model type', 'PLSR quantitative (optimized)'))

        else:
            LOG.info('Building Quantitative PLSR with no optimization')
            try:
                # Remove optimize key from parameters to avoid error
                # as the sklearn estimator does not have this key
                self.estimator_parameters.pop("optimize")

                self.estimator = PLS_r(**self.estimator_parameters)
            except Exception as e:
                # The log and the returned message name the same estimator
                # (the returned one used to say PLS_da).
                message = f'Error at PLS_r instantiation with exception {e}'
                LOG.error(message)
                return False, message

            results.append(('model', 'model type', 'PLSR quantitative'))

        # Fit estimator to the data
        # NOTE(review): in the tune branch self.estimator is presumably set
        # by the optimize call - confirm.
        self.estimator.fit(X, Y)

        if not self.param.getVal('conformal'):
            return True, results

        # Keep a copy of the fitted estimator; the conformal predictor
        # below is built around this copy.
        self.estimator_temp = copy(self.estimator)
        try:

            LOG.info('Building PLSR aggregated conformal predictor')

            # The same estimator copy backs both the underlying model and
            # the normalizing model.
            underlying_model = RegressorAdapter(self.estimator_temp)
            normalizing_model = RegressorAdapter(self.estimator_temp)
            normalizer = RegressorNormalizer(underlying_model, normalizing_model, AbsErrorErrFunc())

            nc = RegressorNc(underlying_model, AbsErrorErrFunc(), normalizer)
            self.estimator = AggregatedCp(IcpRegressor(nc), BootstrapSampler())

        except Exception as e:
            message = (f'Error building aggregated PLSR conformal'
                       f' regressor with exception: {e}')
            LOG.error(message)
            return False, message

        # Fit conformal estimator to the data
        self.estimator.fit(X, Y)

        # overrides non-conformal
        results.append(('model', 'model type', 'conformal PLSR quantitative'))

        return True, results
Esempio n. 20
0
    def test_icp_regression_tree(self):
        """Fit an inductive conformal regressor on the Boston data, with and
        without normalization, and print the predicted intervals."""
        # -----------------------------------------------------------------------------
        # Setup training, calibration and test indices
        # -----------------------------------------------------------------------------
        boston = load_boston()
        features, target = boston.data, boston.target

        # Random permutation cut in thirds: train / calibrate / test.
        shuffled = np.random.permutation(target.size)
        first_cut = int(shuffled.size / 3)
        second_cut = int(2 * shuffled.size / 3)
        train = shuffled[:first_cut]
        calibrate = shuffled[first_cut:second_cut]
        test = shuffled[second_cut:]

        def fit_calibrate_report(nc):
            # Train and calibrate
            icp = IcpRegressor(nc)
            icp.fit(features[train, :], target[train])
            icp.calibrate(features[calibrate, :], target[calibrate])

            # Predict and print one row per test object
            intervals = icp.predict(features[test, :], significance=0.1)
            width = intervals[:, 1] - intervals[:, 0]
            rows = np.vstack([intervals.T, target[test], width.T]).T
            print(pd.DataFrame(rows, columns=["min", "max", "truth", "size"]))

        # -----------------------------------------------------------------------------
        # Without normalization
        # -----------------------------------------------------------------------------
        tree = RegressorAdapter(DecisionTreeRegressor(min_samples_leaf=5))
        fit_calibrate_report(RegressorNc(tree, AbsErrorErrFunc()))

        # -----------------------------------------------------------------------------
        # With normalization
        # -----------------------------------------------------------------------------
        tree = RegressorAdapter(DecisionTreeRegressor(min_samples_leaf=5))
        nearest_neighbor = RegressorAdapter(KNeighborsRegressor(n_neighbors=1))
        normalizer = RegressorNormalizer(tree, nearest_neighbor,
                                         AbsErrorErrFunc())
        fit_calibrate_report(RegressorNc(tree, AbsErrorErrFunc(), normalizer))
Esempio n. 21
0
    def CF_quantitative_validation(self):
        ''' Performs internal validation for conformal quantitative models.

        Runs a shuffled K-fold cross-validation (fixed random_state) with
        'ModelValidationN' splits. In every fold an aggregated conformal
        regressor wrapping self.estimator_temp is fitted on the training
        part and used to predict the held-out part at the
        'conformalSignificance' level.

        Returns:
            (True, results), where results['quality'] is a list of
            (key, description, value) tuples with the mean interval width,
            the fraction of objects whose Y lies strictly inside its
            predicted interval, the interval midpoints and the intervals.

        Raises:
            Exception: anything raised while fitting or predicting is
                logged and re-raised.
        '''

        # Make a copy of original matrices.
        X = self.X.copy()
        Y = self.Y.copy()

        info = []
        kf = KFold(n_splits=self.param.getVal('ModelValidationN'),
                   shuffle=True,
                   random_state=46)
        # Copy Y vector to use it as template to assign predictions
        Y_pred = copy.copy(Y).tolist()
        try:
            for train_index, test_index in kf.split(X):
                # Generate training and test sets
                X_train, X_test = X[train_index], X[test_index]
                Y_train = Y[train_index]
                # Create the aggregated conformal regressor.
                conformal_pred = AggregatedCp(
                    IcpRegressor(
                        RegressorNc(RegressorAdapter(self.estimator_temp))),
                    BootstrapSampler())
                # Fit conformal regressor to the data
                conformal_pred.fit(X_train, Y_train)

                # Perform prediction on test set
                prediction = conformal_pred.predict(
                    X_test, self.param.getVal('conformalSignificance'))
                # Assign the prediction its original index
                for el, interval in zip(test_index, prediction):
                    Y_pred[el] = interval

        except Exception as e:
            LOG.error(f'Quantitative conformal validation'
                      f' failed with exception: {e}')
            # Bare raise keeps the original traceback
            raise

        Y_pred = np.asarray(Y_pred)
        lower = Y_pred[:, 0]
        upper = Y_pred[:, 1]
        # Add the n validation interval means
        interval_mean = np.mean(np.abs(lower - upper))
        # Get boolean mask of instances within the applicability domain.
        # Y is flattened so that the comparison is element-wise: comparing
        # an (n, 1) column against a 1-D Y would broadcast to an (n, n)
        # matrix and inflate the count.
        Y_flat = np.asarray(Y).reshape(-1)
        inside_interval = (lower < Y_flat) & (upper > Y_flat)
        # Compute the accuracy (number of instances within the AD).
        accuracy = np.sum(inside_interval) / len(Y)

        # Midpoint of every predicted interval
        self.conformal_interval_medians = (np.mean(Y_pred, axis=1))
        # Cut into two decimals.
        self.conformal_accuracy = float("{0:.2f}".format(accuracy))
        self.conformal_mean_interval = float("{0:.2f}".format(interval_mean))

        # Add quality metrics to results.
        info.append(('Conformal_mean_interval', 'Conformal mean interval',
                     self.conformal_mean_interval))
        info.append(('Conformal_accuracy', 'Conformal accuracy',
                     self.conformal_accuracy))
        info.append(
            ('Conformal_interval_medians', 'Conformal interval medians',
             self.conformal_interval_medians))
        info.append(('Conformal_prediction_ranges',
                     'Conformal prediction ranges', Y_pred))

        results = {}
        results['quality'] = info
        return True, results
Esempio n. 22
0
    def test_nc_factory(self):
        """Cross-validate every predictor variant built through NcFactory.

        For classification (iris) and regression (diabetes), the plain,
        normalized, out-of-bag and normalized out-of-bag variants are scored
        and the mean score per significance level is printed.
        """

        def score_model(icp, icp_name, ds, ds_name, scoring_funcs):
            raw_scores = cross_val_score(
                icp,
                ds.data,
                ds.target,
                iterations=10,
                folds=10,
                scoring_funcs=scoring_funcs,
                significance_levels=[0.05, 0.1, 0.2],
            )

            print("\n{}: {}".format(icp_name, ds_name))
            per_level = raw_scores.drop(["fold", "iter"], axis=1)
            print(per_level.groupby(["significance"]).mean())

        def make_predictor(family, oob, normalized):
            # Assemble the forest and the NcFactory keyword arguments for
            # the requested variant.
            forest_kwargs = {"n_estimators": 100}
            nc_kwargs = {}
            if oob:
                forest_kwargs["oob_score"] = True
                nc_kwargs["oob"] = True
            forest = family["forest"](**forest_kwargs)
            if normalized:
                nc_kwargs["normalizer_model"] = KNeighborsRegressor()
            nc = NcFactory.create_nc(forest, **nc_kwargs)

            # OOB predictors are scored directly; ICPs go through the
            # cross-validation helper.
            if oob:
                return family["oob_cp"](nc)
            return family["cv_helper"](family["icp"](nc))

        families = [
            {
                "label": "IcpClassifier",
                "loader": load_iris,
                "dataset": "iris",
                "forest": RandomForestClassifier,
                "icp": IcpClassifier,
                "cv_helper": ClassIcpCvHelper,
                "oob_cp": OobCpClassifier,
                "scorers": [class_mean_errors, class_avg_c],
            },
            {
                "label": "IcpRegressor",
                "loader": load_diabetes,
                "dataset": "diabetes",
                "forest": RandomForestRegressor,
                "icp": IcpRegressor,
                "cv_helper": RegIcpCvHelper,
                "oob_cp": OobCpRegressor,
                "scorers": [reg_mean_errors, reg_median_size],
            },
        ]

        # (name suffix, out-of-bag, normalized)
        variants = [
            ("", False, False),
            (" (normalized)", False, True),
            (" (OOB)", True, False),
            (" (OOB, normalized)", True, True),
        ]

        for family in families:
            for suffix, oob, normalized in variants:
                data = family["loader"]()
                predictor = make_predictor(family, oob, normalized)
                score_model(
                    predictor,
                    family["label"] + suffix,
                    data,
                    family["dataset"],
                    list(family["scorers"]),
                )
Esempio n. 23
0
from nonconformist.base import RegressorAdapter
from nonconformist.icp import IcpRegressor
from nonconformist.nc import RegressorNc, AbsErrorErrFunc, SignErrorErrFunc

# -----------------------------------------------------------------------------
# Setup training, calibration and test indices
# -----------------------------------------------------------------------------
data = load_boston()

# Random permutation cut in thirds: train / calibrate / test.
idx = np.random.permutation(data.target.size)
train = idx[:int(idx.size / 3)]
calibrate = idx[int(idx.size / 3):int(2 * idx.size / 3)]
test = idx[int(2 * idx.size / 3):]

# -----------------------------------------------------------------------------
# Train and calibrate
# -----------------------------------------------------------------------------
icp = IcpRegressor(
    RegressorNc(RegressorAdapter(DecisionTreeRegressor()), SignErrorErrFunc()))
icp.fit(data.data[train, :], data.target[train])
icp.calibrate(data.data[calibrate, :], data.target[calibrate])

# -----------------------------------------------------------------------------
# Predict
# -----------------------------------------------------------------------------
prediction = icp.predict(data.data[test, :], significance=0.05)
header = np.array(['min', 'max', 'Truth'])
table = np.vstack([prediction.T, data.target[test]]).T
# Use the labels as column names. Stacking the string header on top of the
# numeric table would coerce every value to a string.
df = pd.DataFrame(table, columns=header)
print(df)
Esempio n. 24
0
                         iterations=5,
                         folds=5,
                         scoring_funcs=[class_mean_errors, class_avg_c],
                         significance_levels=[0.05, 0.1, 0.2])

# Report the classification scores averaged per significance level.
print('Classification: iris')
scores = scores.drop(['fold', 'iter'], axis=1)
print(scores.groupby(['significance']).mean())

# -----------------------------------------------------------------------------
# Regression, absolute error
# -----------------------------------------------------------------------------
data = load_diabetes()

forest = RegressorAdapter(RandomForestRegressor(n_estimators=100))
icp = IcpRegressor(RegressorNc(forest, AbsErrorErrFunc()))
icp_cv = RegIcpCvHelper(icp)

# Same cross-validation settings as the classification run above.
cv_settings = {
    'iterations': 5,
    'folds': 5,
    'scoring_funcs': [reg_mean_errors, reg_median_size],
    'significance_levels': [0.05, 0.1, 0.2],
}
scores = cross_val_score(icp_cv, data.data, data.target, **cv_settings)

# Report the regression scores averaged per significance level.
print('Absolute error regression: diabetes')
scores = scores.drop(['fold', 'iter'], axis=1)
print(scores.groupby(['significance']).mean())
Esempio n. 25
0
idx = np.random.permutation(data.target.size)
train = idx[:int(2 * idx.size / 3)]
test = idx[int(2 * idx.size / 3):]

truth = data.target[test]
columns = ['min', 'max', 'truth']
significance = 0.1

# -----------------------------------------------------------------------------
# Define models
# -----------------------------------------------------------------------------

models = {
    'ACP-RandomSubSampler':
    AggregatedCp(
        IcpRegressor(RegressorNc(RegressorAdapter(DecisionTreeRegressor()))),
        RandomSubSampler()),
    'ACP-CrossSampler':
    AggregatedCp(
        IcpRegressor(RegressorNc(RegressorAdapter(DecisionTreeRegressor()))),
        CrossSampler()),
    'ACP-BootstrapSampler':
    AggregatedCp(
        IcpRegressor(RegressorNc(RegressorAdapter(DecisionTreeRegressor()))),
        BootstrapSampler())
}

# -----------------------------------------------------------------------------
# Train, predict and evaluate
# -----------------------------------------------------------------------------
for name, model in models.iteritems():
Esempio n. 26
0
File: RF.py Progetto: e7dal/flame
    def build(self):
        '''Build a new RF model with the X and Y numpy matrices.

        Returns (True, results) on success, where results is a list of
        (key, description, value) tuples, or (False, message) when the
        optimization, the fit or the conformal set-up raises.
        '''

        # Work on copies of the data matrices
        X = self.X.copy()
        Y = self.Y.copy()

        results = [
            ('nobj', 'number of objects', self.nobj),
            ('nvarx', 'number of predictor variables', self.nvarx),
            ('model', 'model type', 'RF'),
        ]

        conformal = self.param.getVal('conformal')

        if self.param.getVal('tune'):
            # Tuning requested: hand the estimator over to optimize
            LOG.info("Optimizing RF estimator")

            try:
                # Regressor for quantitative models, classifier otherwise
                if self.param.getVal('quantitative'):
                    forest_class = RandomForestRegressor
                else:
                    forest_class = RandomForestClassifier

                self.estimator = forest_class(**self.estimator_parameters)
                self.optimize(X, Y, self.estimator, self.tune_parameters)

            except Exception as e:
                return False, f'Exception optimizing RF estimator with exception {e}'

        else:
            try:
                if self.param.getVal('quantitative'):
                    forest_class = RandomForestRegressor
                    building_message = "Building Quantitative RF model"
                else:
                    forest_class = RandomForestClassifier
                    building_message = "Building Qualitative RF model"

                self.estimator = forest_class(**self.estimator_parameters)

                # Conformal builds log their own message further down
                if not conformal:
                    LOG.info(building_message)

                self.estimator.fit(X, Y)

            except Exception as e:
                return False, f'Exception building RF estimator with exception {e}'

        if not conformal:
            return True, results

        # Copy of the estimator used as underlying model of the conformal
        # predictor
        self.estimator_temp = copy(self.estimator)

        # Create the conformal estimator
        try:
            if self.param.getVal('quantitative'):
                # Conformal regressor, normalized with a KNN model
                conformal_settings = self.param.getDict('conformal_settings')
                LOG.info("Building conformal Quantitative RF model")

                underlying_model = RegressorAdapter(self.estimator_temp)
                knn = KNeighborsRegressor(
                    n_neighbors=conformal_settings['KNN_NN'])
                self.normalizing_model = RegressorAdapter(knn)
                normalizer = RegressorNormalizer(underlying_model,
                                                 copy(self.normalizing_model),
                                                 AbsErrorErrFunc())
                icp = IcpRegressor(
                    RegressorNc(underlying_model, AbsErrorErrFunc(),
                                normalizer))

            else:
                # Conformal classifier
                LOG.info("Building conformal Qualitative RF model")

                adapter = ClassifierAdapter(self.estimator_temp)
                icp = IcpClassifier(ClassifierNc(adapter, MarginErrFunc()))

            # Both flavours are aggregated with a bootstrap sampler and
            # fitted to the data
            self.estimator = AggregatedCp(icp, BootstrapSampler())
            self.estimator.fit(X, Y)

        except Exception as e:
            return False, f'Exception building conformal RF estimator with exception {e}'

        return True, results


## Overriding of parent methods

# def CF_quantitative_validation(self):
#     ''' performs validation for conformal quantitative models '''

# def CF_qualitative_validation(self):
#     ''' performs validation for conformal qualitative models '''

# def quantitativeValidation(self):
#     ''' performs validation for quantitative models '''

# def qualitativeValidation(self):
#     ''' performs validation for qualitative models '''

# def validate(self):
#     ''' Validates the model and computes suitable model quality scoring values'''

# def optimize(self, X, Y, estimator, tune_parameters):
#     ''' optimizes a model using a grid search over a range of values for diverse parameters'''

# def regularProject(self, Xb, results):
#     ''' projects a collection of query objects in a regular model, for obtaining predictions '''

# def conformalProject(self, Xb, results):
#     ''' projects a collection of query objects in a conformal model, for obtaining predictions '''

# def project(self, Xb, results):
#     ''' Uses the X matrix provided as argument to predict Y'''
Esempio n. 27
0
    def build(self):
        '''Build a new SVM model with the X and Y numpy matrices.

        The estimator (SVR for quantitative models, SVC otherwise) is either
        optimized ('tune' parameter set) or instantiated from
        self.estimator_parameters, then fitted. When the 'conformal'
        parameter is set it is replaced by an aggregated conformal predictor
        fitted on the same data.

        Returns:
            (True, results) on success, results being a list of
            (key, description, value) tuples; (False, message) when the
            optimization, the instantiation or the conformal set-up raises.
        '''

        # Make a copy of data matrices
        X = self.X.copy()
        Y = self.Y.copy()

        results = []
        results.append(('nobj', 'number of objects', self.nobj))
        results.append(('nvarx', 'number of predictor variables', self.nvarx))

        # If tune then call gridsearch to optimize the estimator
        if self.param.getVal('tune'):
            try:
                # Check type of model
                if self.param.getVal('quantitative'):
                    self.optimize(X, Y, svm.SVR(**self.estimator_parameters),
                                  self.tune_parameters)
                    results.append(('model', 'model type',
                                    'SVM quantitative (optimized)'))

                else:
                    self.optimize(X, Y, svm.SVC(**self.estimator_parameters),
                                  self.tune_parameters)
                    results.append(
                        ('model', 'model type', 'SVM qualitative (optimized)'))
                LOG.debug('SVM estimator optimized')
            except Exception as e:
                # Report the failure to the caller instead of carrying on
                # with a missing or stale estimator.
                message = (f'Exception optimizing SVM '
                           f'estimator with exception {e}')
                LOG.error(message)
                return False, message
        else:
            try:
                LOG.info("Building  SVM model")
                if self.param.getVal('quantitative'):
                    LOG.info("Building Quantitative SVM-R model")
                    self.estimator = svm.SVR(**self.estimator_parameters)
                    results.append(('model', 'model type', 'SVM quantitative'))
                else:
                    self.estimator = svm.SVC(**self.estimator_parameters)
                    results.append(('model', 'model type', 'SVM qualitative'))
            except Exception as e:
                message = (f'Exception building SVM '
                           f'estimator with exception {e}')
                LOG.error(message)
                return False, message

        # Fit estimator to the data
        self.estimator.fit(X, Y)
        self.estimator_temp = copy(self.estimator)

        if not self.param.getVal('conformal'):
            return True, results

        try:
            LOG.info("Building aggregated conformal SVM model")
            if self.param.getVal('quantitative'):
                # The same estimator copy backs both the underlying model
                # and the normalizing model.
                underlying_model = RegressorAdapter(self.estimator_temp)
                normalizing_model = RegressorAdapter(self.estimator_temp)
                normalizer = RegressorNormalizer(underlying_model,
                                                 normalizing_model,
                                                 AbsErrorErrFunc())
                nc = RegressorNc(underlying_model, AbsErrorErrFunc(),
                                 normalizer)

                self.estimator = AggregatedCp(IcpRegressor(nc),
                                              BootstrapSampler())
                self.estimator.fit(X, Y)
                # overrides non-conformal
                results.append(
                    ('model', 'model type', 'conformal SVM quantitative'))

            else:
                self.estimator = AggregatedCp(
                    IcpClassifier(
                        ClassifierNc(
                            ClassifierAdapter(self.estimator_temp),
                            MarginErrFunc())), BootstrapSampler())
                self.estimator.fit(X, Y)
                # overrides non-conformal
                results.append(
                    ('model', 'model type', 'conformal SVM qualitative'))
        except Exception as e:
            # A failed conformal build must not be reported as a success.
            message = (f'Exception building aggregated conformal SVM '
                       f'estimator with exception {e}')
            LOG.error(message)
            return False, message

        return True, results
def _median_interval_summary(lower_bounds, upper_bounds, n_rows):
    """Collapse per-model interval bounds into one median interval per row.

    lower_bounds / upper_bounds hold one column per trained model; rows that
    a model did not predict are NaN and are ignored by the median.

    Returns a DataFrame indexed 0..n_rows-1 with the columns
    'median_prediction_0' (median lower bound), 'median_prediction_1'
    (median upper bound), 'median_prediction' (interval midpoint) and
    'median_prediction_size' (half-width of the interval).
    """
    summary = pandas.DataFrame(index=range(n_rows))
    summary['median_prediction_0'] = lower_bounds.median(axis=1)
    summary['median_prediction_1'] = upper_bounds.median(axis=1)
    summary['median_prediction'] = (
        summary['median_prediction_0'] + summary['median_prediction_1']) / 2
    summary['median_prediction_size'] = (
        summary['median_prediction'] - summary['median_prediction_0'])
    return summary


def _count_outside_interval(lower, upper, observed, n_rows):
    """Count rows whose observed value is not strictly inside (lower, upper).

    Rows that have no prediction or no observed value are skipped instead of
    being counted. Only lookup failures are treated as "missing"; any other
    error is allowed to propagate so that real data problems stay visible.
    """
    # Drop the NaN rows once instead of on every loop iteration.
    lower = lower.dropna()
    upper = upper.dropna()

    n_outside = 0
    for i in range(n_rows):
        try:
            inside = lower[i] < observed[i] < upper[i]
        except (KeyError, IndexError):
            # Row i was never predicted (or has no observed value).
            continue
        if not inside:
            n_outside += 1
    return n_outside


def create_conformal_model():
    """
    Description - Create conformal model - Main loop

    Driven entirely by the module-level ``args`` namespace:

    * reads the data set from ``args.i`` and computes descriptors from its
      'smiles' column,
    * trains ``args.num_models`` inductive conformal regressors (model type
      chosen by ``args.m``) on re-randomized train/calibration/test splits,
    * stores the trained models with ``save_models``,
    * aggregates the per-model prediction intervals by their median and,
      unless ``args.t == 'full_model'``, prints statistics for the test and
      calibration sets (and plots the test set when ``args.plot`` is set),
    * writes the descriptors plus median intervals via
      ``write_csv_with_data(data, descriptors_df, args.d)``,
    * optionally prints descriptor statistics (``args.phys``) and shows a
      PCA plot of the descriptors (``args.pca``).

    Raises:
        ValueError: if ``args.m`` is neither 'RF' nor 'SVM'.
    """
    descriptor_columns = ['logP', 'PSA', 'MolWt', 'RingCount',
                          'HeavyAtomCount', 'NumRotatableBonds']

    # Read data from file
    data = read_data(args.i)
    n_rows = len(data)

    # Calculate descriptors using RD-kit
    descriptors_df = calculate_descriptors(data['smiles'])

    # Create indices for test, training and calibration sets
    train_i, calibrate_i, test_i = create_indices_test_training_calibration(data)
    test_index_total = list(test_i)
    calibrate_index_total = list(calibrate_i)

    # Create inductive conformal prediction regressor
    if args.m == 'RF':
        icp = IcpRegressor(NormalizedRegressorNc(
            RandomForestRegressor, KNeighborsRegressor, abs_error,
            abs_error_inv, model_params={'n_estimators': 100}))
    elif args.m == 'SVM':
        # NOTE(review): the message below says SVM is unsupported, yet a
        # regressor is still built and 'n_estimators' is forwarded to SVR.
        # If SVR is sklearn.svm.SVR it does not accept that parameter -
        # confirm the intended behavior before relying on this branch.
        print('error - no SVM-regressor avliable')
        icp = IcpRegressor(NormalizedRegressorNc(
            SVR, KNeighborsRegressor, abs_error,
            abs_error_inv, model_params={'n_estimators': 100}))
    else:
        # Fail early with a clear message instead of a NameError on `icp`.
        raise ValueError(
            "Unsupported model type %r (expected 'RF' or 'SVM')" % (args.m,))

    # Per-model interval bounds: column i holds the bounds produced by
    # model i, rows are compounds (NaN where the model did not predict).
    test_lower = pandas.DataFrame(index=range(n_rows))
    test_upper = pandas.DataFrame(index=range(n_rows))
    cal_lower = pandas.DataFrame(index=range(n_rows))
    cal_upper = pandas.DataFrame(index=range(n_rows))

    if args.verbose:
        # str() so this works whether num_models was parsed as str or int.
        print('Number of models to create: ' + str(args.num_models))
        print('############## Starting calculations ##############')

    icp_s = []

    for i in range(int(args.num_models)):
        Xtrain, Xtest, Xcalibrate, ytrain, ytest, ycalibrate = \
            create_train_test_calibrate_sets(
                data, descriptors_df, train_i, calibrate_i, test_i)

        # Train the underlying model, then calibrate it
        icp.fit(Xtrain, ytrain)
        icp.calibrate(asanyarray(Xcalibrate), asanyarray(ycalibrate))

        # Predict test and calibration sets
        prediction_test = icp.predict(
            asanyarray(Xtest), significance=args.significance)
        prediction_calibrate = icp.predict(
            asanyarray(Xcalibrate), significance=args.significance)

        # Column 0 / column 1 of a prediction are used as the lower / upper
        # bound of the interval throughout this function.
        test_frame = pandas.DataFrame(prediction_test, index=test_i)
        cal_frame = pandas.DataFrame(prediction_calibrate, index=calibrate_i)

        test_lower[i] = test_frame[0]
        test_upper[i] = test_frame[1]
        cal_lower[i] = cal_frame[0]
        cal_upper[i] = cal_frame[1]

        # Remember every compound that has been in a test/calibration set
        test_index_total = np.unique(
            np.concatenate((test_index_total, test_i), axis=0))
        calibrate_index_total = np.unique(
            np.concatenate((calibrate_index_total, calibrate_i), axis=0))

        # Create new indices for next model
        train_i, calibrate_i, test_i = randomize_new_indices(
            train_i, calibrate_i, test_i, data, i)

        # Deep copy: `icp` is refitted on the next iteration, and a shallow
        # copy would keep referencing the same nested objects.
        # NOTE(review): presumably the fitted model lives on a nested
        # non-conformity object shared by shallow copies - confirm.
        icp_s.append(copy.deepcopy(icp))

    # Save models
    save_models(icp_s)

    if args.verbose:
        print('################## Loop finished, model created, test set predicted #################')

    experimental_values = data['Observed'][test_index_total]
    iexperimental_values = data['Observed'][calibrate_index_total]

    # External (test set) aggregation
    C = _median_interval_summary(test_lower, test_upper, n_rows)
    Y_pred_median_test = C['median_prediction'].dropna()
    median_prediction_size = C['median_prediction_size'].dropna().tolist()
    num_outside_median = _count_outside_interval(
        C['median_prediction_0'], C['median_prediction_1'],
        experimental_values, n_rows)

    # Internal (calibration set) aggregation
    iC = _median_interval_summary(cal_lower, cal_upper, n_rows)
    iY_pred_median_test = iC['median_prediction'].dropna()
    imedian_prediction_size = iC['median_prediction_size'].dropna().tolist()
    inum_outside_median = _count_outside_interval(
        iC['median_prediction_0'], iC['median_prediction_1'],
        iexperimental_values, n_rows)

    if args.verbose:
        print('########################## Prediction statistics external test ##########################')
        print('')

    print('Number of compounds predicted in test set: '+ str(C['median_prediction'].notnull().sum()))

    if args.t != 'full_model':
        ex_r2_score = r2_score(experimental_values, Y_pred_median_test)
        print('R^2 (coefficient of determination):  %.3f' % ex_r2_score)

        ex_mean_squared_error = mean_squared_error(experimental_values, Y_pred_median_test)
        ex_rmse = sqrt(ex_mean_squared_error)
        print('RMSE:  %.3f' % ex_rmse)

        ex_MAE = mean_absolute_error(experimental_values, Y_pred_median_test)
        print('Mean absolute error:  %.3f' % ex_MAE)

        print('Mean squared error: %.3f' % ex_mean_squared_error)

        # Average prediction range
        print('Mean of median prediction range: %.3f' % mean(median_prediction_size))

        percent_num_outside_median = 100*float(num_outside_median)/float(len(experimental_values))
        print('Number of compounds outside of prediction range: '+str(num_outside_median))
        print('% of compounds predicted outside of prediction range: '+str(percent_num_outside_median) +' %')
        print(' ')

        # ---- Internal prediction ----
        print('Number of compounds predicted in training set: '+ str(iC['median_prediction'].notnull().sum()))

        iex_r2_score = r2_score(iexperimental_values, iY_pred_median_test)
        print('R^2 (coefficient of determination):  %.3f' % iex_r2_score)

        iex_mean_squared_error = mean_squared_error(iexperimental_values, iY_pred_median_test)
        iex_rmse = sqrt(iex_mean_squared_error)
        print('RMSE:  %.3f' % iex_rmse)

        print('Mean squared error: %.3f' % iex_mean_squared_error)

        iex_MAE = mean_absolute_error(iexperimental_values, iY_pred_median_test)
        print('Mean absolute error:  %.3f' % iex_MAE)

        # Average prediction range
        print('Mean of median prediction range: %.3f' % mean(imedian_prediction_size))

        ipercent_num_outside_median = 100*float(inum_outside_median)/float(len(iexperimental_values))
        print('Number of compounds outside of prediction range: '+str(inum_outside_median))
        print('% of compounds predicted outside of prediction range: '+str(ipercent_num_outside_median) +' %')
        print(' ')

        # Plot results - test set
        if args.plot:
            if args.verbose:
                print(' ################ Plotting testset #################')
            fig, ax = plt.subplots()

            ax.errorbar(experimental_values, Y_pred_median_test,
                        yerr=median_prediction_size,
                        fmt='o', markeredgecolor='black', markersize=6,
                        mew=1, ecolor='black', elinewidth=0.3, capsize=3,
                        capthick=1, errorevery=1)

            # Same fixed range on both axes so the identity line is diagonal
            axis_range = [-10, -3]
            ax.set_ylim(axis_range)
            ax.set_xlim(axis_range)

            # Plot labels
            plt.ylabel('Predicted log Kp')
            plt.xlabel('Experimental log Kp')

            # Identity line (perfect prediction)
            ax.plot(axis_range, axis_range, color='black')

            plt.show()

    # Print data in CSV-file
    descriptors_df['Median prediction low range'] = C['median_prediction_0']
    descriptors_df['Median prediction high range'] = C['median_prediction_1']
    descriptors_df['Median prediction'] = C['median_prediction']
    descriptors_df['size prediction range'] = C['median_prediction_1'] - C['median_prediction_0']
    write_csv_with_data(data, descriptors_df, args.d)

    # Calculate min, max and mean values for descriptors
    if args.phys:
        print(args.phys)
        print('Min: ')
        print(descriptors_df.min())
        print('Max: ')
        print(descriptors_df.max())
        print('Mean:')
        print(descriptors_df.mean())

    if args.pca:
        print('Starting PCA')
        print(descriptors_df[descriptor_columns].head(3))
        print(len(descriptors_df[['size prediction range']]))

        # Two-component PCA on the robust-scaled descriptors of all compounds
        pca = PCA(n_components=2)
        df_X = descriptors_df[descriptor_columns].apply(
            pandas.to_numeric, errors='raise')
        scaler = preprocessing.RobustScaler()
        df_X_scaled = scaler.fit_transform(df_X)
        pca.fit(df_X_scaled)
        X2 = pca.transform(df_X_scaled)

        # Compounds that received a prediction interval (the test set)
        desc_testset_large = descriptors_df.dropna(subset=['size prediction range'])
        desc_testset_num = desc_testset_large[descriptor_columns].apply(
            pandas.to_numeric, errors='raise')

        # Reuse the scaler fitted on the full set: refitting it on the
        # subset would project these points with a different scaling than
        # the one the PCA was fitted on.
        desc_testset_scaled = scaler.transform(desc_testset_num)
        X3 = pca.transform(desc_testset_scaled)

        Yerr_num = desc_testset_large[['size prediction range']].apply(
            pandas.to_numeric, errors='coerce')
        yerr = list(pandas.Series(Yerr_num['size prediction range'])/4)

        plt.errorbar(X3[:, 0], X3[:, 1], yerr=yerr, fmt='o',
                     markeredgecolor='black', markersize=6, mew=1,
                     ecolor='black', elinewidth=0.3, capsize=3, capthick=1,
                     errorevery=1)

        plt.scatter(X2[:, 0], X2[:, 1])
        plt.xlabel('PC1')
        plt.ylabel('PC2')

        plt.title('PCA of descriptors')
        plt.show()