Example #1
class MaximumLikelihoodGaussianProcess(object):
    """
    Gaussian Process model which has its own hyperparameters chosen by a maximum likelihood process
    """

    # Can't have instantiation of model without supplying data
    def __init__(self, X, Y, kernel, max_feval):
        # GPRegression is expected to be None here if a guarded
        # `from GPy.models import GPRegression` failed at module import time.
        if not GPRegression:
            raise ImportError('No module named GPy')
        self.X = X
        self.Y = Y
        self.kernel = kernel
        self.model = GPRegression(X=self.X, Y=self.Y, kernel=self.kernel)
        self.max_feval = max_feval
        # TODO make this a variable.
        self.num_restarts = 20

    def fit(self):
        """
        Fits the model with random restarts.
        :return:
        """
        self.model.optimize_restarts(num_restarts=self.num_restarts,
                                     verbose=False)

    def predict(self, x):
        return self.model.predict(Xnew=x)
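Example #1 assumes a guarded import in the enclosing module, so that GPRegression is None when GPy is unavailable. A minimal usage sketch under that assumption; the toy data and kernel below are illustrative, not from the original source:

# Guarded import: GPRegression becomes None if GPy is missing,
# which triggers the ImportError check in __init__ above.
try:
    from GPy.models import GPRegression
    from GPy.kern import RBF
except ImportError:
    GPRegression = None

import numpy as np

if GPRegression is not None:
    X = np.random.rand(50, 1)                           # 50 points, 1 input dimension
    Y = np.sin(6 * X) + 0.05 * np.random.randn(50, 1)   # noisy 2-D targets, as GPy requires
    gp = MaximumLikelihoodGaussianProcess(X, Y, kernel=RBF(input_dim=1), max_feval=100)
    gp.fit()                                            # optimises hyperparameters with 20 random restarts
    mean, variance = gp.predict(np.array([[0.5]]))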
Example #2
class GP(Base):
    """A class that is declared for performing GP interpolation.
    GP interpolation (usually) works on the principle of finding the 
    best unbiased predictor. 

    Parameters
    ----------
    type : str, optional
    This parameter defines the type of Kriging under consideration. This 
    implementation uses PyKrige package  (https://github.com/bsmurphy/PyKrige).
    The user needs to choose between "Ordinary" and "Universal".

    """
    def __init__(
            self,
            kernel=RBF(2, ARD=True),
    ):

        super().__init__()
        self.kernel = kernel

    def _fit(self, X, y, n_restarts=5, verbose=False, random_state=None):
        """ Fit method for GP Interpolation
        This function shouldn't be called directly.
        """
        np.random.seed(random_state)
        if len(y.shape) == 1:
            y = y.reshape(-1, 1)
        self.model = GPRegression(X, y, self.kernel)
        self.model.optimize_restarts(n_restarts, verbose=verbose)
        return self

    def _predict_grid(self, x1lim, x2lim):
        """The function that is called to return the interpolated data in Kriging Interpolation
        in a grid. This method shouldn't be called directly"""
        lims = (*x1lim, *x2lim)
        x1min, x1max, x2min, x2max = lims
        x1 = np.linspace(x1min, x1max, self.resolution)
        x2 = np.linspace(x2min, x2max, self.resolution)

        X1, X2 = np.meshgrid(x1, x2)
        X = np.column_stack([X1.ravel(), X2.ravel()])  # grid points as an (n, 2) array

        predictions = self.model.predict(X)[0].reshape(len(x1), len(x2))

        return predictions.ravel()

    def _predict(self, X, return_variance=False):
        """This function should be called to return the interpolated data in kriging
        in a pointwise manner. This method shouldn't be called directly."""

        predictions, variance = self.model.predict(X)
        if return_variance:
            return predictions.ravel(), variance
        else:
            return predictions.ravel()
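The _predict_grid method above relies on a resolution attribute supplied by the Base parent class, which is not shown. A hedged usage sketch, calling the private methods directly for illustration and setting resolution by hand:

import numpy as np
from GPy.kern import RBF

interp = GP(kernel=RBF(2, ARD=True))
interp.resolution = 25                       # assumed to normally come from Base
X = np.random.rand(40, 2)                    # 40 scattered sample points in 2-D
y = np.sin(X[:, 0]) + np.cos(X[:, 1])
interp._fit(X, y, n_restarts=5, random_state=0)
grid_preds = interp._predict_grid(x1lim=(0, 1), x2lim=(0, 1))   # flattened 25x25 grid
point_preds, var = interp._predict(np.array([[0.5, 0.5]]), return_variance=True)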
Example #3
def fit_GPy_kern(X, Y, kernel, restarts, score=BIC, **kwargs):
    # GPy expects 2-D arrays; promote 1-D inputs to single columns
    if len(np.shape(X)) == 1:
        X = np.array(X)[:, None]
    if len(np.shape(Y)) == 1:
        Y = np.array(Y)[:, None]

    m = GPRegression(X, Y, kernel)
    m.optimize_restarts(num_restarts=restarts, **kwargs)

    m.plot()
    print(m.kern)
    print(f'Log-Likelihood: {m.log_likelihood()}')
    print(f'{score.__name__}: {score(m.log_likelihood(), len(X), m._size_transformed())}')

    plt.show()

    return m
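fit_GPy_kern expects a score callable with the signature (log_likelihood, n, k), matching the call score(m.log_likelihood(), len(X), m._size_transformed()) above; the BIC default is not shown in the snippet. A minimal sketch of a compatible implementation, using the standard formula k*ln(n) - 2*ln(L):

import numpy as np

def BIC(log_likelihood, n, k):
    """Bayesian Information Criterion for a model with the given
    log-likelihood, n data points, and k estimated parameters."""
    return k * np.log(n) - 2 * log_likelihood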
Example #4
    def _fit_model(self, X, Y):
        # Flag all-negative scores for transformation before fitting
        self.transformed = max(Y) < 0

        Y_trans = self._transform_score(Y)

        model = GPRegression(X, Y_trans, self.kernel)
        # Catch fitting error
        try:
            model.optimize_restarts(num_restarts=self.n_init, verbose=False)
            self.model = model
        except np.linalg.LinAlgError:  # public alias; np.linalg.linalg was removed in newer NumPy
            self.model = None
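The try/except in _fit_model guards against numerical failures (e.g. a non-positive-definite covariance) during hyperparameter optimisation. A standalone sketch of the same defensive-fitting pattern; the helper name and kernel choice are illustrative:

import numpy as np
from GPy.models import GPRegression
from GPy.kern import RBF

def fit_gp_or_none(X, Y, n_restarts=5):
    """Return a fitted GPRegression, or None if optimisation raises
    a linear-algebra error on an ill-conditioned kernel matrix."""
    model = GPRegression(X, Y, RBF(input_dim=X.shape[1]))
    try:
        model.optimize_restarts(num_restarts=n_restarts, verbose=False)
        return model
    except np.linalg.LinAlgError:
        return None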
Example #5
class Stationary(Base):
    """
    Stationary-kernel (Matern32, Matern52, or RBF) GP for sensor placement
    """
    def __init__(self, n_restarts, kernel_name, verbose=True):
        super().__init__(verbose)
        self.__n_restarts = n_restarts
        self.__kernel_name = kernel_name

    def _Kernel(self, S1, S2=None):
        return self.__model.kern.K(S1, S2)

    def _fit(self, X, y, ECM=None):
        self._X = X
        self._y = y

        kern_dict = {
            'm32':
            Matern32(input_dim=self._X.shape[1],
                     active_dims=list(range(self._X.shape[1])),
                     ARD=True),
            'm52':
            Matern52(input_dim=self._X.shape[1],
                     active_dims=list(range(self._X.shape[1])),
                     ARD=True),
            'rbf':
            RBF(input_dim=self._X.shape[1],
                active_dims=list(range(self._X.shape[1])),
                ARD=True)
        }

        self.__model = GPRegression(X, y, kern_dict[self.__kernel_name])
        self.__model.optimize_restarts(self.__n_restarts,
                                       verbose=self._verbose)
        return self

    def _predict(self, X, return_cov=True):
        if not return_cov:
            return self.__model.predict(X)[0]
        return self.__model.predict(X, full_cov=True)
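Stationary._fit selects its kernel by name from kern_dict. A hedged usage sketch, assuming Base.__init__(verbose) sets the self._verbose flag that _fit passes to optimize_restarts:

import numpy as np

sensor_gp = Stationary(n_restarts=3, kernel_name='m52', verbose=False)
S = np.random.rand(30, 2)                      # 30 candidate sensor locations in 2-D
y = np.sin(S[:, 0:1]) + 0.01 * np.random.randn(30, 1)
sensor_gp._fit(S, y)
mean_only = sensor_gp._predict(S[:5], return_cov=False)    # posterior mean
mean_and_cov = sensor_gp._predict(S[:5])                   # mean plus full covariance
Example #6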
    def __fit_single_gaussian_process(variable, response_norm, num_restarts=7):
        """
        GP fitting.

        Returns the GP and kernel.

        :param variable: time
        :param response_norm: log-normalized target
        :return [tuple] a tuple:
            - the gp object
            - the kernel
        """

        obs_per_replicate = response_norm.shape[1]

        kernel = RBF(input_dim=1, variance=1., lengthscale=10.)
        variable = np.tile(variable, (response_norm.shape[0], 1))
        response = np.resize(
            response_norm,
            (response_norm.shape[0] * response_norm.shape[1], 1))
        gp = GPRegression(variable, response, kernel)
        gp.optimize_restarts(num_restarts=num_restarts, messages=False)

        return gp, kernel
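The np.tile/np.resize pair above flattens a replicates-by-timepoints response matrix into the single-column layout GPRegression expects, repeating the shared time axis once per replicate. A small worked illustration with made-up numbers:

import numpy as np

response_norm = np.array([[1.0, 2.0, 3.0],     # replicate 0 over 3 time points
                          [4.0, 5.0, 6.0]])    # replicate 1
variable = np.array([[0.], [7.], [14.]])       # shared time points, one column

stacked_time = np.tile(variable, (response_norm.shape[0], 1))     # (6, 1): time repeated per replicate
stacked_resp = np.resize(response_norm, (response_norm.size, 1))  # (6, 1): rows flattened in order
# stacked_time: [[0], [7], [14], [0], [7], [14]]
# stacked_resp: [[1], [2], [3], [4], [5], [6]]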
Example #7
class GaussianProcess(Model):
    """
    Constructs a Gaussian Process Model using GPy
    """
    def __init__(self,
                 dataset_path,
                 input_obj,
                 molecule_type=None,
                 molecule=None,
                 train_path=None,
                 test_path=None):
        super().__init__(dataset_path, input_obj, molecule_type, molecule,
                         train_path, test_path)
        self.set_default_hyperparameters()

    def set_default_hyperparameters(self):
        """
        Set default hyperparameter space. If none is provided, default is used.
        """
        self.hyperparameter_space = {
            'scale_X': hp.choice('scale_X', ['std', 'mm01', 'mm11', None]),
            'scale_y': hp.choice('scale_y', ['std', 'mm01', 'mm11', None]),
        }

        if self.input_obj.keywords['pes_format'] == 'interatomics':
            self.set_hyperparameter(
                'morse_transform',
                hp.choice(
                    'morse_transform',
                    [{
                        'morse': True,
                        'morse_alpha': hp.quniform('morse_alpha', 1, 2, 0.1)
                    }, {
                        'morse': False
                    }]))
        else:
            self.set_hyperparameter(
                'morse_transform',
                hp.choice('morse_transform', [{
                    'morse': False
                }]))
        if self.pip:
            val = hp.choice('pip', [{
                'pip':
                True,
                'degree_reduction':
                hp.choice('degree_reduction', [True, False])
            }])
            self.set_hyperparameter('pip', val)
        else:
            self.set_hyperparameter('pip', hp.choice('pip', [{'pip': False}]))

        if self.input_obj.keywords[
                'gp_ard'] == 'opt':  # automatic relevance determination (independent length scales per feature)
            self.set_hyperparameter('ARD', hp.choice('ARD', [True, False]))
        #TODO add optional space inclusions, something like: if option: self.hyperparameter_space['newoption'] = hp.choice(..)

    def split_train_test(self, params):
        """
        Take raw dataset and apply hyperparameters/input keywords/preprocessing
        and train/test (tr,test) splitting.
        Assigns:
        self.X : complete input data, transformed
        self.y : complete output data, transformed
        self.Xscaler : scaling transformer for inputs 
        self.yscaler : scaling transformer for outputs 
        self.Xtr : training input data, transformed
        self.ytr : training output data, transformed
        self.Xtest : test input data, transformed
        self.ytest : test output data, transformed
        """
        self.X, self.y, self.Xscaler, self.yscaler = self.preprocess(
            params, self.raw_X, self.raw_y)
        if self.sampler == 'user_supplied':
            self.Xtr = self.transform_new_X(self.raw_Xtr, params, self.Xscaler)
            self.ytr = self.transform_new_y(self.raw_ytr, self.yscaler)
            self.Xtest = self.transform_new_X(self.raw_Xtest, params,
                                              self.Xscaler)
            self.ytest = self.transform_new_y(self.raw_ytest, self.yscaler)

        else:
            self.Xtr = self.X[self.train_indices]
            self.ytr = self.y[self.train_indices]
            self.Xtest = self.X[self.test_indices]
            self.ytest = self.y[self.test_indices]

    def build_model(self, params, nrestarts=10, maxit=1000, seed=0):
        print("Hyperparameters: ", params)
        self.split_train_test(params)
        np.random.seed(seed)  # make GPy deterministic for a given hyperparameter config
        dim = self.X.shape[1]
        if self.input_obj.keywords['gp_ard'] == 'opt':
            ard_val = params['ARD']
        elif self.input_obj.keywords['gp_ard'] == 'true':
            ard_val = True
        else:
            ard_val = False
        kernel = RBF(dim, ARD=ard_val)  # TODO add HP control of kernel
        self.model = GPRegression(self.Xtr,
                                  self.ytr,
                                  kernel=kernel,
                                  normalizer=False)
        self.model.optimize_restarts(nrestarts,
                                     optimizer="lbfgsb",
                                     robust=True,
                                     verbose=False,
                                     max_iters=maxit,
                                     messages=False)
        gc.collect(2)  # fixes some memory leak issues with certain BLAS configs

    def hyperopt_model(self, params):
        # skip building this model if hyperparameter combination already attempted
        for i in self.hyperopt_trials.results:
            if 'memo' in i:
                if params == i['memo']:
                    return {
                        'loss': i['loss'],
                        'status': STATUS_OK,
                        'memo': 'repeat'
                    }
        if self.itercount > self.hp_maxit:
            return {
                'loss': 0.0,
                'status': STATUS_FAIL,
                'memo': 'max iters reached'
            }
        self.build_model(params)
        error_test = self.vet_model(self.model)
        self.itercount += 1
        return {'loss': error_test, 'status': STATUS_OK, 'memo': params}

    def predict(self, model, data_in):
        prediction, v1 = model.predict(data_in, full_cov=False)
        return prediction

    def vet_model(self, model):
        """Convenience method for getting model errors of test and full datasets"""
        pred_test = self.predict(model, self.Xtest)
        pred_full = self.predict(model, self.X)
        error_test = self.compute_error(self.ytest, pred_test, self.yscaler)
        error_full, median_error, max_errors = self.compute_error(
            self.y, pred_full, yscaler=self.yscaler, max_errors=5)
        print("Test Dataset {}".format(round(hartree2cm * error_test, 2)),
              end='  ')
        print("Full Dataset {}".format(round(hartree2cm * error_full, 2)),
              end='     ')
        print("Median error: {}".format(np.round(median_error[0], 2)),
              end='  ')
        print(
            "Max 5 errors: {}".format(
                np.sort(np.round(max_errors.flatten(), 1))), '\n')
        error_test_invcm = round(hartree2cm * error_test, 2)
        return error_test_invcm

    def preprocess(self, params, raw_X, raw_y):
        """
        Preprocess raw data according to hyperparameters
        """
        # TODO make more flexible. If keys don't exist, ignore them. Something like: if key: if param['key']: do transform
        if params['morse_transform']['morse']:
            raw_X = morse(raw_X, params['morse_transform']['morse_alpha']
                          )  # Transform to morse variables (exp(-r/alpha))
        # Transform to FIs, degree reduce if called
        if params['pip']['pip']:
            # find path to fundamental invariants for molecule type AxByCz...
            #path = os.path.join(package_directory, "lib", self.molecule_type, "output")
            path = os.path.join(fi_dir, self.molecule_type, "output")
            raw_X, degrees = interatomics_to_fundinvar(raw_X, path)
            if params['pip']['degree_reduction']:
                raw_X = degree_reduce(raw_X, degrees)

        if params['scale_X']:
            X, Xscaler = general_scaler(params['scale_X'], raw_X)
        else:
            X = raw_X
            Xscaler = None
        if params['scale_y']:
            y, yscaler = general_scaler(params['scale_y'], raw_y)
        else:
            y = raw_y
            yscaler = None
        return X, y, Xscaler, yscaler

    def optimize_model(self):
        print("Beginning hyperparameter optimization...")
        print("Trying {} combinations of hyperparameters".format(
            self.hp_maxit))
        print("Training with {} points (Full dataset contains {} points).".
              format(self.ntrain, self.n_datapoints))
        print("Using {} training set point sampling.".format(self.sampler))
        print("Errors are root-mean-square error in wavenumbers (cm-1)")
        self.hyperopt_trials = Trials()
        self.itercount = 1  # keep track of hyperopt iterations
        if self.input_obj.keywords['rseed']:
            rstate = np.random.RandomState(self.input_obj.keywords['rseed'])
        else:
            rstate = None
        best = fmin(self.hyperopt_model,
                    space=self.hyperparameter_space,
                    algo=tpe.suggest,
                    max_evals=self.hp_maxit * 2,
                    rstate=rstate,
                    show_progressbar=False,
                    trials=self.hyperopt_trials)
        hyperopt_complete()
        print("Best performing hyperparameters are:")
        final = space_eval(self.hyperparameter_space, best)
        print(str(sorted(final.items())))
        self.optimal_hyperparameters = dict(final)
        # obtain final model from best hyperparameters
        print("Fine-tuning final model architecture...")
        self.build_model(self.optimal_hyperparameters,
                         nrestarts=10,
                         maxit=1000)
        print("Final model performance (cm-1):")
        self.test_error = self.vet_model(self.model)
        self.save_model(self.optimal_hyperparameters)

    def save_model(self, params):
        # Save model. Currently GPy requires saving training data in model for some reason.
        model_dict = self.model.to_dict(save_data=True)
        print("Saving ML model data...")
        model_path = "model1_data"
        while os.path.isdir(model_path):
            new = int(re.findall(r"\d+", model_path)[0]) + 1
            model_path = re.sub(r"\d+", str(new), model_path)
        os.mkdir(model_path)
        os.chdir(model_path)
        with open('model.json', 'w') as f:
            json.dump(model_dict, f)
        with open('hyperparameters', 'w') as f:
            print(params, file=f)

        if self.sampler == 'user_supplied':
            self.traindata.to_csv('train_set',
                                  sep=',',
                                  index=False,
                                  float_format='%12.12f')
            self.testdata.to_csv('test_set',
                                 sep=',',
                                 index=False,
                                 float_format='%12.12f')
        else:
            self.dataset.iloc[self.train_indices].to_csv(
                'train_set', sep=',', index=False, float_format='%12.12f')
            self.dataset.iloc[self.test_indices].to_csv('test_set',
                                                        sep=',',
                                                        index=False,
                                                        float_format='%12.12f')

        self.dataset.to_csv('PES.dat',
                            sep=',',
                            index=False,
                            float_format='%12.12f')
        # write convenience function
        with open('compute_energy.py', 'w+') as f:
            print(self.write_convenience_function(), file=f)

        # print model performance
        sys.stdout = open('performance', 'w')
        self.vet_model(self.model)
        sys.stdout = sys.__stdout__
        os.chdir("../")

    def transform_new_X(self, newX, params, Xscaler=None):
        """
        Transform a new, raw input according to the model's transformation procedure 
        so that prediction can be made.
        """
        # ensure X dimension is n x m (n new points, m input variables)
        if len(newX.shape) == 1:
            newX = np.expand_dims(newX, 0)
        elif len(newX.shape) > 2:
            raise ValueError("Dimensions of input data are incorrect.")
        if params['morse_transform']['morse']:
            newX = morse(newX, params['morse_transform']['morse_alpha'])
        if params['pip']['pip']:
            # find path to fundamental invariants for an N atom system with molecule type AxByCz...
            path = os.path.join(package_directory, "lib", self.molecule_type,
                                "output")
            newX, degrees = interatomics_to_fundinvar(newX, path)
            if params['pip']['degree_reduction']:
                newX = degree_reduce(newX, degrees)
        if Xscaler:
            newX = Xscaler.transform(newX)
        return newX

    def transform_new_y(self, newy, yscaler=None):
        if yscaler:
            newy = yscaler.transform(newy)
        return newy

    def inverse_transform_new_y(self, newy, yscaler=None):
        if yscaler:
            newy = yscaler.inverse_transform(newy)
        return newy

    def write_convenience_function(self):
        string = "from peslearn.ml import GaussianProcess\nfrom peslearn import InputProcessor\nfrom GPy.core.model import Model\nimport numpy as np\nimport json\nfrom itertools import combinations\n\n"
        if self.pip:
            string += "gp = GaussianProcess('PES.dat', InputProcessor(''), molecule_type='{}')\n".format(
                self.molecule_type)
        else:
            string += "gp = GaussianProcess('PES.dat', InputProcessor(''))\n"
        with open('hyperparameters', 'r') as f:
            hyperparameters = f.read()
        string += "params = {}\n".format(hyperparameters)
        string += "X, y, Xscaler, yscaler =  gp.preprocess(params, gp.raw_X, gp.raw_y)\n"
        string += "model = Model('mymodel')\n"
        string += "with open('model.json', 'r') as f:\n"
        string += "    model_dict = json.load(f)\n"
        string += "final = model.from_dict(model_dict)\n\n"
        string += gp_convenience_function
        return string
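The morse helper used in preprocess and transform_new_X is not shown; the inline comment suggests it maps raw interatomic distances r to Morse variables exp(-r/alpha). A hedged sketch under that assumption (the real peslearn helper may differ in detail):

import numpy as np

def morse(raw_X, alpha):
    """Map interatomic distances to Morse variables exp(-r/alpha).
    Assumed behaviour, inferred from the inline comment in preprocess()."""
    return np.exp(-raw_X / alpha)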
Example #8
class GPModel():
    def __init__(self, X, Y, kernel_expression=SumKE(['WN'])._initialise()):
        self.X = X
        self.Y = Y
        self.kernel_expression = kernel_expression
        self.restarts = None
        self.model = None
        self.cached_utility_function = None
        self.cached_utility_function_type = None

    # Kwargs passed to optimize_restarts, which passes them to optimize
    #   Check comments in optimize's class AND optimization.get_optimizer for real list of optimizers
    # TODO: Eventually set robust to True; see description in optimize_restarts method
    def fit(self,
            restarts=None,
            optimiser='lbfgsb',
            verbose=False,
            robust=False,
            **kwargs):
        if restarts is None:
            if self.restarts is None:
                raise ValueError('No restarts value specified')
        else:
            self.restarts = restarts
        self.model = GPRegression(self.X, self.Y,
                                  self.kernel_expression.to_kernel())
        with warnings.catch_warnings():  # Ignore known numerical warnings
            warnings.simplefilter('ignore')
            self.model.optimize_restarts(num_restarts=self.restarts,
                                         verbose=verbose,
                                         robust=robust,
                                         optimizer=optimiser,
                                         **kwargs)
        return self

    def interpret(self):
        return fit_ker_to_kex_with_params(
            self.model.kern,
            deepcopy(self.kernel_expression)).get_interpretation()

    def predict(self,
                X,
                quantiles=(2.5, 97.5),
                full_cov=False,
                Y_metadata=None,
                kern=None,
                likelihood=None,
                include_likelihood=True):
        mean, cov = self.model.predict(X, full_cov, Y_metadata, kern,
                                       likelihood, include_likelihood)
        qs = self.model.predict_quantiles(X, quantiles, Y_metadata, kern,
                                          likelihood)
        return {
            'mean': mean,
            'covariance': cov,
            'low_quantile': qs[0],
            'high_quantile': qs[1]
        }

    def change_plotting_library(self, library='plotly_offline'):
        '''Wrapper of the GPy.plotting function of the same name;
        supported values are: 'matplotlib', 'plotly', 'plotly_online', 'plotly_offline' and 'none'.
        If 'plotly', a 3-tuple is returned, whose first element is the Figure object requiring a .show() to display.'''
        change_plotting_library(library)
        change_plotting_library(library)

    def plot(self):
        return self.model.plot()

    # Model fit objective criteria & related values:

    def _ll(self):
        return self.model.log_likelihood()

    def _n(self):
        return len(self.model.X)  # number of data points

    def _k(self):
        # number of estimated parameters, i.e. model degrees of freedom
        return self.model._size_transformed()

    def _ordered_score_ps(self):
        return self.model, self._ll(), self._n(), self._k()

    def compute_utility(self, score_f):
        self.cached_utility_function = score_f(*self._ordered_score_ps())
        self.cached_utility_function_type = score_f.__name__
        return self.cached_utility_function
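compute_utility passes (model, log_likelihood, n, k) to the supplied score function. A hedged sketch of fitting and scoring with a BIC-style criterion matching that signature; the data are illustrative and SumKE comes from the enclosing module:

import numpy as np

def bic_score(model, ll, n, k):
    # Criterion with the (model, log-likelihood, n, k) signature that
    # compute_utility supplies; the model argument is unused here.
    return k * np.log(n) - 2 * ll

X = np.linspace(0, 1, 30)[:, None]
Y = np.sin(6 * X) + 0.1 * np.random.randn(30, 1)
m = GPModel(X, Y).fit(restarts=5)       # fit() returns self
print(m.compute_utility(bic_score))     # result is cached on the instance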
Example #9
def main():
    print("######################")
    global target, X0, Y0, values, frac_M, frac_X, bo_flag

    #target_params = np.array([[0.14,0.4],[1.4,0.03]])

    #target = LiX_wrapper(True,'LiF','Rocksalt','JC',
    #                     target_params,False,False,eng)

    target = np.array([[-764.5, 6.012 * 0.99, 6.012 * 0.99, 6.012 * 0.99]])

    if focus == 'energy':
        target_comp = target[0, 0].reshape(1, -1)
    elif focus == 'constant':  # elif: a plain `if` let the else branch overwrite the 'energy' case
        target_comp = target[0, 1].reshape(1, -1)
    else:
        target_comp = target[0, :4].reshape(1, -1)

    print('Target initialized!')

    latin_design = LatinDesign(parameter_space=parameter_space)
    X0 = latin_design.get_samples(INIT_POINTS)
    Y0 = np.array([])
    for x in X0:
        x = np.array([x])
        Y0 = np.append(Y0, f.evaluate(x))
    values = []

    for y in Y0:
        values.append(y.Y)

    values = np.asarray(values, dtype=float)

    ### Redundancy check
    if (values[:, 7:-1] == values[0, 7]).all():
        values = values[:, :7]
        frac_X = False

    if (values[:, 4:7] == values[0, 4]).all():
        values = values[:, :4]
        frac_M = False

    values = values.reshape(-1, np.max(np.shape(target)))
    bo_flag = True

    if focus == 'energy':
        values = values[:, 0].reshape(-1, 1)
    elif focus == 'constant':
        values = values[:, 1:4].reshape(-1, 3)

    ### BO Loop
    kern = Matern52(X0.shape[1], variance=1)
    model = GPRegression(X0,
                         values,
                         kernel=kern,
                         normalizer=True,
                         noise_var=NOISE)  # Kernel = None: RBF default

    model.optimize(optimizer='lbfgsb')
    model.optimize_restarts(num_restarts=50, verbose=False)
    model_wrapped = GPyModelWrapper(model)

    acq = L2_LCB(model=model_wrapped, target=target_comp, beta=np.float64(1.))
    # beta is the exploration constant
    bayesopt_loop = BayesianOptimizationLoop(model=model_wrapped,
                                             space=parameter_space,
                                             acquisition=acq)
    bayesopt_loop.run_loop(f, BO_ITER)

    return save(bayesopt_loop)
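main() relies on several module-level names the snippet does not define: parameter_space, INIT_POINTS, NOISE, BO_ITER, focus, frac_M, frac_X, the objective f, and a save helper. A hedged sketch of plausible definitions so the control flow can be followed; all values are placeholders, not from the original source:

import numpy as np
from emukit.core import ParameterSpace, ContinuousParameter

INIT_POINTS = 10        # size of the Latin-hypercube initial design
BO_ITER = 50            # Bayesian-optimisation iterations
NOISE = 1e-4            # fixed Gaussian noise variance for the GP
focus = 'all'           # 'energy', 'constant', or anything else for both
frac_M = frac_X = True  # redundancy flags toggled by the checks in main()

parameter_space = ParameterSpace([
    ContinuousParameter('p0', 0.0, 1.0),
    ContinuousParameter('p1', 0.0, 1.0),
])
# f would be an emukit user function wrapping the LiX simulation, and
# save a helper that serialises the finished BayesianOptimizationLoop.
Example #10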
class ExperimentalCondition:
    """
    The `ExperimentalCondition` class stores treatment response data for an experimental condition within a `CancerModel`.
    It stores all replicates for all variables of the experimental condition for a given cancer model system.

    For example, in Patient-Derived Xenograft (PDX) experiments it would store the tumour size measurements at each
    exposure time for all mouse models derived from a single patient.

    In cancer cell lines (CCLs) it would store all viability measurements for each dose level for all cultures derived
    from a single cancer cell line and treated with a specific compound.

    Thus the `ExperimentalCondition` class can be thought of as storing response data for a cancer model in two
    dimensions: replicates (e.g., a specific mouse or culture) and variable condition levels (e.g., a specific time
    or dose).

    Common experimental conditions:
        * Control, i.e. no treatment
        * Exposure to a specific drug or compound
        * Treatment with a specific type of ionizing radiation

    It can have multiple replicates (i.e., data for multiple growth curves)
    """
    def __init__(self,
                 name,
                 source_id=None,
                 variable=None,
                 response=None,
                 replicates=None,
                 variable_treatment_start=None,
                 is_control=False):
        """
        Initialize a particular treatment condition within a cancer model. For example, exposure to a given compound
        in set of PDX models derived from a single patient.

        :param name: [string] Name of the experimental/treatment condition (e.g., Control, Erlotinib, Paclitaxel, etc.)
        :param source_id: [string] A unique identifier for the cancer model source. For PDX models this would be the
            name or ID of the patient from which the models were derived. For CCLs this would be the strain from which
            all cell cultures were derived.
        :param variable: [ndarray] The independent variable of the experimental condition. For example, the treatment
            exposure time for each tumour size measurement or the dose variable for each cell viability measurement.
        :param response: [ndarray] The response metric for the experimental condition. E.g., the tumour size in a PDX
            model after variable days of treatment exposure or the cell viability measurements in a CCL at a specific
            compound dose.
        :param replicates: [ndarray] The indexes of replicate values in the response attribute.
        :param is_control: [bool] Whether or not the treatment condition is a control.
        :return [None] Creates the ExperimentalCondition object.
        """

        self.name = name
        self.variable = np.asarray([[var] for var in variable])
        self.response = np.asarray(response.T).astype(float)
        self.response_norm = None
        self.variable_end = self.variable[-1][0]
        # TODO:: Is there any situation where np.array indexing doesn't start at 0?
        self.variable_start = self.variable[0][0]
        self.variable_treatment_start = variable_treatment_start if variable_treatment_start is not None else \
            self.variable_start

        self.variable_start_index = np.where(
            self.variable.ravel() == self.variable_start)[0][0]
        self.variable_end_index = np.where(
            self.variable.ravel() == self.variable_end)[0][0]

        # Assume treatment start is the same as the start of the independent variable, unless the user assigns
        self.variable_treatment_start_index = self.variable_start_index
        self.variable_treatment_end_index = self.variable_end_index

        self.source_id = source_id
        self.replicates = replicates if isinstance(replicates, list) else list(replicates)
        self.is_control = is_control
        self.kl_p_cvsc = None

        # GPs
        self.gp = None
        self.gp_kernel = None

        # all below are between the <experimental_condition> and the control
        self.empirical_kl = None

        # KL divergence stats
        self.kl_divergence = None
        self.kl_p_value = None

        # naive stats
        # {701: 'mCR', 711: 'mPR', ...}
        self.best_avg_response = np.array([], dtype=np.float64)
        self.mrecist = {}
        self.mrecist_counts = None
        self.linear_models = []

        # {701: response angle, ...}
        self.response_angle = {}
        self.response_angle_rel = {}

        self.response_angle_control = {}
        self.response_angle_rel_control = {}

        # response angles based on average of curves
        self.average_angle = None
        self.average_angle_rel = None
        self.average_angle_control = None
        self.average_angle_rel_control = None

        # {701: AUC, ...}
        self.auc = {}
        self.auc_norm = {}

        self.auc_gp = None
        self.auc_gp_control = None
        self.auc_control = {}
        self.auc_control_norm = {}

        self.inverted = False

        # credible intervals stats
        self.credible_intervals = []
        self.percent_credible_intervals = None
        self.responder_pvalue_AUC = None
        self.responder_pvalue_angle = None

        self.rates_list = []
        self.rates_list_control = []

        # Full Data is all of the data of the treatments and control
        self.full_data = np.array([])

        # gp_h0 and gp_h1 depend on the full_data
        self.gp_h0 = None
        self.gp_h0_kernel = None
        self.gp_h1 = None
        self.gp_h1_kernel = None

        self.delta_log_likelihood_h0_h1 = None

        self.tgi = None

    @property
    def responder_AUC(self, p_value=0.05):
        """
        Decide if the cancer model is a responder based on AUC for a specified p-value
        cut-off.

        @param p_value [`float`] The p-value cutoff. Default is 0.05.

        @return [`bool`] True or False, where True means the cancer model is a
            responder to the treatment.
        """
        if self.responder_pvalue_AUC is None:
            self.calculate_responder_pvalue_AUC()
        return self.responder_pvalue_AUC < p_value

    @property
    def responder_angle(self, p_value=0.05):
        """
        Decide if the cancer model is a responder based on response angle for a specified p-value
        cut-off.

        @param p_value [`float`] The p-value cutoff. Default is 0.05.

        @return [`bool`] True or False, where True means the cancer model is a
            responder to the treatment.
        """
        if self.responder_pvalue_angle is None:
            self.calculate_responder_pvalue_angle()
        return self.responder_pvalue_angle < p_value

    # ---- Single Bracket Subsetting
    def __getitem__(self, item):
        """
        Implementation of slicing and single bracket subsetting syntax for this object

        :item [int, list of int, or slice object]

        :return [pd.DataFrame] The variable column alongside the selected replicate columns.
        """
        # Deal with slices
        if isinstance(item, slice):
            # NOTE: assumes explicit start and stop in the slice; open-ended
            # slices (e.g. obj[:3]) would raise a TypeError on the comparison.
            if item.stop > max(self.replicates) or item.start > max(
                    self.replicates):
                raise IndexError(
                    f"Slice indexes out of bounds. Acceptable slice range is from "
                    f"{min(self.replicates)} to {max(self.replicates) + 1}.")
            array = np.hstack([self.variable, self.response[item, :].T])
            return pd.DataFrame.from_records(
                array,
                columns=[
                    'variable', *[
                        'replicate_' + str(idx) for idx in range(
                            item.start, item.stop,
                            item.step if item.step is not None else 1)
                    ]
                ])
        # Deal with numeric indexing
        if not isinstance(item, list):
            item = [item]
        if not all([isinstance(idx, int) for idx in item]):
            raise IndexError(
                "Index must be an int, list of ints or a slice object!")
        else:
            if max(item) > max(self.replicates) or min(item) < min(
                    self.replicates):
                raise IndexError(
                    f"One or more of {item} is an out of bounds index. Acceptable index range is from "
                    f"{min(self.replicates)} to {max(self.replicates)}.")
            array = np.hstack([self.variable, self.response[item, :].T])
            return pd.DataFrame.from_records(
                array,
                columns=[
                    'variable', *['replicate_' + str(idx) for idx in item]
                ])

    def to_dict(self, json=False):
        """
        Convert an ExperimentalCondition object into a dictionary with attributes as keys for their associated values. If
        `json` is True, all values will be coerced to JSON-serializable Python base types.
        """

        # Helper to convert any NumPy types into base types
        def _if_numpy_to_base(obj):
            if isinstance(obj, np.ndarray):
                return obj.tolist()
            elif isinstance(obj, np.generic):
                return obj.item()
            else:
                return obj

        if json:
            return dict(
                zip(list(self.__dict__.keys()), [
                    _if_numpy_to_base(item) for item in self.__dict__.values()
                ]))
        else:
            return self.__dict__

    ## TODO:: Can we implement this in the constructor?
    def find_variable_start_index(self):
        """
        Returns the index in the array of the location of the treatment start value, + or - 1. For a PDX model, this
        corresponds to the index of the day treatment was started.

        :return [int] The index.
        """
        start = None
        start_found = False

        for i in range(len(self.variable.ravel())):
            if (self.variable[i] - 1 <= self.variable_treatment_start
                    <= self.variable[i] + 1 and not start_found):
                start = i
                start_found = True
        return start

    def normalize_data(self):
        """
        Normalizes all growths using normalize_first_day_and_log_transform helper function.

        :return [None] modifies self.response_norm
        """
        logger.info("Normalizing data for " + self.name)
        self.response_norm = self.__normalize_treatment_start_variable_and_log_transform(
            self.response, self.find_variable_start_index())

    def __normalize_treatment_start_variable_and_log_transform(
            self, response, treatment_start_index):
        """
        Normalize by adding a small offset (0.01) to every response, dividing
        element-wise by the value at the treatment start index, then taking
        the log and adding 1.

        :param response [array] the array of values to be normalised:
        :return [array] the normalised array:
        """
        return np.log(
            np.asarray(
                (response.T + 0.01) / response.T[int(treatment_start_index)],
                dtype=float).T) + 1
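    # Worked illustration (comment only, values made up): with a response row
    # [100, 150, 50] and treatment_start_index 0, each entry v becomes
    # log((v + 0.01) / 100) + 1, i.e. roughly [1.0, 1.41, 0.31], anchoring
    # every curve near 1 at the treatment start.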

    def create_full_data(self, control):
        """
        Creates a 2d numpy array with columns time, treatment and tumour size
        :param control [ExperimentalCondition] the corresponding control object:
        :return [None] Creates the full_data array
        """
        # control
        for j, entry in enumerate(control.response_norm.T):
            for y in entry:
                if self.full_data.size == 0:
                    self.full_data = np.array([control.variable[j][0], 0, y])
                else:
                    self.full_data = np.vstack(
                        (self.full_data, [control.variable[j][0], 0, y]))

        # case
        for j, entry in enumerate(self.response_norm.T):
            for y in entry:
                self.full_data = np.vstack(
                    (self.full_data, [self.variable[j][0], 1, y]))

    def calculate_tgi(self, control):
        """
        Calculates the Tumour Growth Inhibition (TGI) index of an ExperimentalCondition object
        :param control [ExperimentalCondition] the corresponding control object:
        :return [None] Writes the calculated value into self.tgi
        """
        def TGI(yt, yc, i, j):
            # calculates TGI between yt (Treatment) and yc (Control) during epoch i, to j
            return 1 - (yt[j] - yt[i]) / (yc[j] - yc[i])

        start = max(self.find_variable_start_index(),
                    control.variable_treatment_start_index)
        end = min(self.variable_treatment_end_index,
                  control.variable_treatment_end_index) + 1

        self.tgi = TGI(
            self.response_norm.mean(axis=0)[start:end],
            control.response_norm.mean(axis=0)[start:end], 0, end - start - 1)

    def fit_gaussian_processes(self, control=None, num_restarts=7):
        """
        This is the new version, which fits only on the 'relevant' interval.
        Fits a GP for both the control and case growth curves,
        H1 with time and treatment, and H0 with only time.

        :param control If None, then just fits one GP - else, fits 3 different GPs
                        (one for case, two for gp_h0 and gp_h1):
        :param num_restarts The number of restarts in the optimisation: 
        :return [None] creates the GP objects:
        """
        logger.info("Fitting Gaussian processes for " + self.name)

        # control for number of measurements per replicate if time not same length
        # self.response_norm.shape[0] is num replicates, [1] is num measurements
        ## TODO:: Can we remove this line?
        obs_per_replicate = self.response_norm.shape[1]
        print("Now attempting to fit:")
        print("self.name:")
        print(self.name)
        print("Self.source_id:")
        print(self.source_id)

        self.gp_kernel = RBF(input_dim=1, variance=1., lengthscale=10.)

        response_norm_trunc = self.response_norm[
            :, self.variable_treatment_start_index:self.variable_treatment_end_index]

        # # Determine index of first mouse death to remove all NaNs before fitting the model
        # first_death_idx = min(np.sum(~np.isnan(response_norm_trunc), axis=1))
        #
        # # Subset the independent variable and response data
        # response_norm_trunc = response_norm_trunc[:, 0:first_death_idx]
        # variable_trunc = self.variable[0:first_death_idx, :]

        # Reshape the data to pass into GPRegression (flatten into a single column)
        variable = np.tile(
            self.variable[self.variable_treatment_start_index:
                          self.variable_treatment_end_index],
            (len(self.replicates), 1))
        response = np.resize(
            response_norm_trunc,
            (response_norm_trunc.shape[0] * response_norm_trunc.shape[1], 1))

        self.gp = GPRegression(variable, response, self.gp_kernel)
        self.gp.optimize_restarts(num_restarts=num_restarts, messages=False)

        if control is not None:
            # Subset full data for control calculations
            # self.full_data = self.full_data[np.isin(self.full_data[:, 0], variable_trunc), :]

            # kernels
            self.gp_h0_kernel = RBF(input_dim=1, variance=1., lengthscale=10.)
            self.gp_h1_kernel = RBF(input_dim=2, variance=1., ARD=True)

            # GPs
            self.gp_h0 = GPRegression(self.full_data[:, 0:1],
                                      self.full_data[:, 2:3],
                                      self.gp_h0_kernel)
            self.gp_h1 = GPRegression(self.full_data[:, 0:2],
                                      self.full_data[:, 2:3],
                                      self.gp_h1_kernel)

            # optimize GPs
            self.gp_h0.optimize_restarts(num_restarts=num_restarts,
                                         messages=False,
                                         robust=True)  # silent exceptions
            self.gp_h1.optimize_restarts(num_restarts=num_restarts,
                                         messages=False,
                                         robust=True)

            self.delta_log_likelihood_h0_h1 = (self.gp_h1.log_likelihood()
                                               - self.gp_h0.log_likelihood())

    def fit_linear_models(self):
        """
        Fits a separate OLS model, "Response ~ Variable + 0", to each replicate in the object.

        :return [list] List of OLS model objects, with each index corresponding to the replicate that model was fit for.
        """
        model_dfs = [
            pd.DataFrame({
                "Response": resp,
                "Variable": self.variable.flatten()
            }) for resp in self.response
        ]
        self.linear_models = [
            smf.ols(formula="Response ~ Variable + 0", data=model_df).fit()
            for model_df in model_dfs
        ]

    def calculate_lm_slopes(self):
        """
        Calculate the slope of each replicate linear model in degrees. The slope is defined as the arctan of the
        coefficient for the independent variable in the linear model. Results are converted to degrees.

        :return [ndarray] Slope of the linear model for each replicate in degrees.
        """
        params = np.array(
            [model.params.values.item() for model in self.linear_models])
        return np.arctan(params) * (180 / np.pi)

    def calculate_kl_divergence(self, control):
        """
        Calculates the KL divergence between the GPs fit for both the
        batched controls and batched cases.

        :param control: The corresponding control ExperimentalCondition object
        :return: The KL divergence
        """

        logger.info("Calculating the KL Divergence for " + self.name)

        def kl_integrand(variable):
            """
            Calculates the KL integrand
            :param variable [int?] The independent variable for the Gaussian Process Model (either time or dose).
            :return [float] The integrand
            """
            mean_control, var_control = control.gp.predict(
                np.asarray([[variable]]))
            mean_case, var_case = self.gp.predict(np.asarray([[variable]]))

            return ((var_control + (mean_control - mean_case)**2) /
                    (2 * var_case)) + ((var_case +
                                        (mean_case - mean_control)**2) /
                                       (2 * var_control)) - 1
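        # The integrand above is the symmetrised KL divergence between two
        # Gaussians, KL(p||q) + KL(q||p)
        #   = (var_q + (mu_q - mu_p)^2) / (2 * var_p)
        #   + (var_p + (mu_p - mu_q)^2) / (2 * var_q) - 1,
        # where the log-variance terms of the two directions cancel.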

        max_x_index = min(self.variable_treatment_end_index,
                          control.variable_treatment_end_index)

        if control.response.shape[1] > self.response.shape[1]:
            self.kl_divergence = abs(
                1 /
                (self.variable[max_x_index] - self.variable_treatment_start) *
                quad(kl_integrand,
                     self.variable_treatment_start,
                     self.variable[max_x_index],
                     limit=100)[0])[0]
        else:
            self.kl_divergence = abs(1 / (control.variable[max_x_index] -
                                          self.variable_treatment_start) *
                                     quad(kl_integrand,
                                          self.variable_treatment_start,
                                          control.variable[max_x_index],
                                          limit=100)[0])[0]

        logger.info(self.kl_divergence)

    def calculate_responder_pvalue_AUC(self):
        """
        Conduct a Mann-Whitney rank test between the AUC values for the treatment
        vs the AUC of the control and return the p-value.
        """
        self.responder_pvalue_AUC = \
            stats.mannwhitneyu(list(self.auc.values()), list(self.auc_control.values()), alternative="less").pvalue

    def calculate_responder_pvalue_angle(self):
        """
        Conduct a Mann-Whitney rank test between the response angle values for the treatment
        vs the response angle of the control and return the p-value.
        """
        self.responder_pvalue_angle = \
            stats.mannwhitneyu(list(self.response_angle.values()),
                               list(self.response_angle_control.values()), alternative="less").pvalue

    @staticmethod
    def __fit_single_gaussian_process(variable, response_norm, num_restarts=7):
        """
        GP fitting.

        Returns the GP and kernel.

        :param variable: time
        :param response_norm: log-normalized target
        :return [tuple] a tuple:
            - the gp object
            - the kernel
        """

        obs_per_replicate = response_norm.shape[1]

        kernel = RBF(input_dim=1, variance=1., lengthscale=10.)
        variable = np.tile(variable, (response_norm.shape[0], 1))
        response = np.resize(
            response_norm,
            (response_norm.shape[0] * response_norm.shape[1], 1))
        gp = GPRegression(variable, response, kernel)
        gp.optimize_restarts(num_restarts=num_restarts, messages=False)

        return gp, kernel

    @staticmethod
    def __relativize(y, start):
        """
        Normalises a numpy array to the start day
        :param response [ndarray] the array to be normalised:
        :param start [int] the start day:
        :return [ndarray] the normalised array:
        """
        return y / y[start] - 1

    @staticmethod
    def __centre(y, start):
        """
        Centres a numpy array to the start day
        :param response [ndarray] the array to be normalised:
        :param start [int] the start day:
        :return [ndarray] the normalised array:
        """
        return y - y[start]

    @staticmethod
    def __compute_response_angle(variable, response, start):
        """
        Calculates the response angle for observations response, given time points variable and start point start
        :param variable [ndarray] the time points
        :param response [ndarray] the observations
        :param start [numpy array] the start point for the angle computation
        :return [float] the angle:
        """
        min_length = min(len(variable), len(response))
        model = sm.OLS(response[start:min_length],
                       variable[start:min_length],
                       missing='drop')  # Drop NaNs
        results = model.fit()
        return np.arctan(results.params[0])

    def calculate_response_angles(self, control):
        """
        Builds the response angle dict.

        :param control [ExperimentalCondition] the corresponding control object
        :return [None] writes to the angle parameters 
        """
        start = self.find_variable_start_index()
        for i in range(len(self.replicates)):

            if start is None:
                raise ValueError(
                    "The `self.variable_start_index` parameter is missing, please initialize this value."
                )
            else:
                self.response_angle[
                    self.replicates[i]] = self.__compute_response_angle(
                        self.variable.ravel(),
                        self.__centre(self.response[i], start), start)
                self.response_angle_rel[
                    self.replicates[i]] = self.__compute_response_angle(
                        self.variable.ravel(),
                        self.__relativize(self.response[i], start), start)

        self.average_angle = self.__compute_response_angle(
            self.variable.ravel(),
            self.__centre(np.nanmean(self.response, axis=0), start), start)
        self.average_angle_rel = self.__compute_response_angle(
            self.variable.ravel(),
            self.__relativize(np.nanmean(self.response, axis=0), start), start)
        self.average_angle_control = self.__compute_response_angle(
            control.variable.ravel(),
            self.__centre(np.nanmean(control.response, axis=0), start), start)
        self.average_angle_rel_control = self.__compute_response_angle(
            control.variable.ravel(),
            self.__relativize(np.nanmean(control.response, axis=0), start),
            start)

    @staticmethod
    def __calculate_AUC(variable, response):
        """
        Calculates the area under the curve of a set of observations 

        :param variable [ndarray] the time points
        :param response [ndarray] the observations
        :return [float] The area under the curve
        """
        min_length = min(len(variable), len(response))
        AUC = sklearn.metrics.auc(x=variable[0:min_length + 1],
                                  y=response[0:min_length + 1])
        return AUC

    def calculate_gp_auc(self):
        """
        Builds the AUC (Area under the curve) with respect to the GP fit.

        :return
        """
        self.auc_gp = self.__calculate_AUC(self.variable,
                                           self.gp.predict(self.variable)[0])

    def calculate_auc(self, control):
        """
        Builds the AUC (Area under the curve) dict for response.
        :param control: the corresponding control object:
        :return [None]:
        """
        start = max(self.find_variable_start_index(),
                    control.find_variable_start_index())
        end = min(self.variable_treatment_end_index,
                  control.variable_treatment_end_index)
        for i in range(len(self.replicates)):
            self.auc[self.replicates[i]] = self.__calculate_AUC(
                self.variable.ravel()[start:end], self.response[i, start:end])

    def calculate_auc_norm(self, control):
        """
        Builds the AUC (Area under the curve) dict. for response_norm
        :param control: the corresponding control object:
        :return [None]:
        """
        start = max(self.find_variable_start_index(),
                    control.find_variable_start_index())
        end = min(self.variable_treatment_end_index,
                  control.variable_treatment_end_index)
        for i in range(len(self.replicates)):
            self.auc_norm[self.replicates[i]] = self.__calculate_AUC(
                self.variable.ravel()[start:end],
                self.response_norm[i, start:end])

    def calculate_mrecist(self):
        """
        Builds the mRECIST dict.

        - **mCR**: BestResponse < -95% AND BestAverageResponse < -40%
        - **mPR**: BestResponse < -50% AND BestAverageResponse < -20%
        - **mSD**: BestResponse < 35% AND BestAverageResponse < 30%
        - **mPD**: everything else

        :return [None]
        """
        start = self.find_variable_start_index()
        end = self.variable_treatment_end_index
        ## FIXME:: Why does this go through almost the same loop twice? Is there a missing if condition?
        for i in range(len(self.replicates) - 1):
            # days_volume = zip(self.variable.ravel(), self.response[i])
            print(i)
            if start is None:
                raise ValueError(
                    "The `start` attribute for this `ExperimentalCondition` object is set to None, "
                    "please reset.")
            else:
                initial_volume = self.response[i][start]

                # array of all responses for t >= 3
                responses = []
                average_responses = []

                for day, volume in zip(self.variable.ravel(),
                                       self.response[i]):
                    if (day - self.variable_treatment_start >=
                            3) and (day <= self.variable[end]):
                        responses.append(
                            ((volume - initial_volume) / initial_volume) * 100)
                        average_responses.append(np.average(responses))

                if min(responses) < -95 and min(average_responses) < -40:
                    self.mrecist[self.replicates[i]] = 'mCR'
                elif min(responses) < -50 and min(average_responses) < -20:
                    self.mrecist[self.replicates[i]] = 'mPR'
                elif min(responses) < 35 and min(average_responses) < 30:
                    self.mrecist[self.replicates[i]] = 'mSD'
                else:
                    self.mrecist[self.replicates[i]] = 'mPD'

        if self.best_avg_response is not None:
            self.best_avg_response = np.array([], dtype=np.float64)
        for i in range(len(self.replicates)):

            days_volume = zip(self.variable.ravel(), self.response[i])
            start = self.find_variable_start_index()

            if start is None:
                raise ValueError(
                    "The `start` attribute for this `ExperimentalCondition` object is set to None, "
                    "please reset.")
            else:
                initial_volume = self.response[i][start]

                # array of all responses for t >= 10
                responses = []
                average_responses = []

                day_diff = 0

                for day, volume in days_volume:
                    day_diff = day - self.variable_treatment_start
                    if day >= self.variable_treatment_start and day_diff >= 3:
                        responses.append(
                            ((volume - initial_volume) / initial_volume) * 100)
                        average_responses.append(np.average(responses))

                self.best_avg_response = np.append(self.best_avg_response,
                                                   min(average_responses))
                if min(responses) < -95 and min(average_responses) < -40:
                    self.mrecist[self.replicates[i]] = 'mCR'
                elif min(responses) < -50 and min(average_responses) < -20:
                    self.mrecist[self.replicates[i]] = 'mPR'
                elif min(responses) < 35 and min(average_responses) < 30:
                    self.mrecist[self.replicates[i]] = 'mSD'
                else:
                    self.mrecist[self.replicates[i]] = 'mPD'

    def enumerate_mrecist(self):
        """
        Builds up the mrecist_counts attribute with number of each occurrence of mRECIST experimental_condition.

        :return:
        """

        # TODO:: Instead of error, we could just call method to calculate mrecist, then give the user a warning?
        if self.mrecist is None:
            raise ValueError(
                "`ExperimentalCondition` object mrecist attribute is None, please calculate mrecist first!"
            )

        self.mrecist_counts = Counter(mCR=0, mPR=0, mSD=0, mPD=0)
        for replicate in self.replicates:
            mrecist = self.mrecist[replicate]
            if mrecist in self.mrecist_counts:
                self.mrecist_counts[mrecist] += 1

    def __credible_interval(self,
                            threshold,
                            variable_2,
                            variable_1=0,
                            control=None):
        """
        Credible interval function, for finding where the two GPs diverge.

        ## FIXME:: Is variable float or int?
        :param threshold [float] The variable of confidence
        :param variable_2 [int] The value of variable at the end of the range (i.e, time 2 or dose 2)
        :param variable_1 [int] The value of variable at the start of the range (i.e., time 1 or dose 1)
        :param control: the corresponding control object:
        :return:
        """
        if control is not None:

            a = np.array([1, -1, -1, 1])
            means = np.array([
                self.gp.predict(np.asarray([[variable_2]])),
                self.gp.predict(np.asarray([[variable_1]])),
                control.gp.predict(np.asarray([[variable_2]])),
                control.gp.predict(np.asarray([[variable_1]]))
            ])[:, 0, 0]

            variances = np.zeros((4, 4))

            variances[0:2, 0:2] = self.gp.predict(np.asarray([[variable_1],
                                                              [variable_2]]),
                                                  full_cov=True)[1]
            variances[2:4,
                      2:4] = control.gp.predict(np.asarray([[variable_1],
                                                            [variable_2]]),
                                                full_cov=True)[1]

            mu = np.dot(a, means)
            sigma = np.dot(np.dot(a, variances), a.T)
            # `sigma` is a variance: Var(a^T x) = a^T . Sigma . a;
            # norm.interval expects a standard deviation
            interval = norm.interval(threshold, mu, np.sqrt(sigma))

            return (interval[0] < 0) and (interval[1] > 0)
        else:
            logger.error(
                "The private function `__credible_interval` requires control.")

    def calculate_credible_intervals(self, control):
        """
        :param control: control ExperimentalCondition object
        :return:
        """

        logger.info("Calculating credible intervals for: " + self.name)

        if control is not None:
            if len(control.variable) > len(self.variable):
                for i in self.variable[1:]:  # Why starting at second value?
                    self.credible_intervals.append(
                        (self.__credible_interval(0.95, i[0], control=control),
                         i[0]))
            else:
                for i in control.variable[1:]:
                    self.credible_intervals.append(
                        (self.__credible_interval(0.95, i[0], control=control),
                         i[0]))
        else:
            logger.error(
                "The function `calculate_credible_intervals` requires control."
            )

    def calculate_credible_intervals_percentage(self):
        """
        :return: [float] The percentage of credible intervals that contain zero; also has the side effect of setting
            the percent_credible_intervals attribute on the object.
        """
        logger.info("Calculating percentage of credible intervals.")

        num_true = sum(1 for interval in self.credible_intervals if interval[0])

        self.percent_credible_intervals = (num_true /
                                           len(self.credible_intervals)) * 100
        return self.percent_credible_intervals

    def __gp_derivative(self, variable, gp):
        """
        Computes the derivative of the Gaussian Process gp
        (with respect to its 'time' variable) and
        returns the values of the derivative at time
        points variable to deal with some weird stuff about
        :param variable [float] The independent variable, either time for PDX models or dose for CCL models
        :param gp [GP] The GaussianProcess to be differentiated
        :return [tuple] A tuple:
            - The mean
            - The covariance
        """

        if variable.ndim == 1:
            variable = variable[:, np.newaxis]

        mu, ignore = gp.predictive_gradients(variable)
        ignore, cov = gp.predict(variable, full_cov=True)
        # FIXME:: How did this not divide by zero previously?
        mult = [[((1. / gp.kern.lengthscale) *
                  (1 - (1. / gp.kern.lengthscale) * (y - z)**2))[0]
                 for y in variable if y != z] for z in variable]
        return mu, mult * cov

    def compute_all_gp_derivatives(self, control):
        """
        :param control [ExperimentalCondition] The control `ExperimentalCondition` for the current `CancerModel`
        :return: [None] Sets the `rates_list` attribute
        """

        if not isinstance(self.rates_list, list):
            self.rates_list = [] if self.rates_list is None else list(self.rates_list)
        if not isinstance(self.rates_list_control, list):
            self.rates_list_control = [] if self.rates_list_control is None else list(self.rates_list_control)

        logger.info("Calculating the GP derivatives for: " + self.name +
                    ' and control')
        for var in self.variable:
            self.rates_list.append(self.__gp_derivative(var, self.gp)[0])
        for var in control.variable:
            self.rates_list_control.append(
                self.__gp_derivative(var, control.gp)[0])
        self.rates_list = np.ravel(self.rates_list)
        self.rates_list_control = np.ravel(self.rates_list_control)
        logger.info("Done calcluating GP derivatives for: " + self.name +
                    ' and control')

    def plot_with_control(self,
                          control=None,
                          output_path=None,
                          show_kl_divergence=True,
                          show_legend=True,
                          file_type=None,
                          output_pdf=None):
        """
        Given all of the data and an output path, saves a PDF
        of the comparison with some statistics as well.


        :param control: The control ExperimentalCondition object
        :param output_path: output filepath - if not specified, doesn't save
        :param show_kl_divergence: flag for displaying calculated kl_divergence
        :param show_legend: flag for displaying legend
        :param file_type: 'pdf' (saves the figure into output_pdf) or 'svg' (saves to output_path); if None, the figure is not saved
        :param output_pdf: an output_pdf object
        :return:
        """
        if control is None:
            logger.error("You need to plot with a control.")
        else:
            logger.info("Plotting with statistics for " + self.name)

            fig, ax = plt.subplots()
            plt.title(
                f"Case (Blue) and Control (Red) Comparison of \n {str(self.source_id)} with {str(self.name)}"
            )

            # set xlim
            gp_x_limit = max(self.variable) + 5

            # Control
            control.gp.plot_data(ax=ax, color='red')
            control.gp.plot_mean(ax=ax,
                                 color='red',
                                 plot_limits=[0, gp_x_limit])
            control.gp.plot_confidence(ax=ax,
                                       color='red',
                                       plot_limits=[0, gp_x_limit])

            # Case
            self.gp.plot_data(ax=ax, color='blue')
            self.gp.plot_mean(ax=ax, color='blue', plot_limits=[0, gp_x_limit])
            self.gp.plot_confidence(ax=ax,
                                    color='blue',
                                    plot_limits=[0, gp_x_limit])

            # Drug Start Line
            plt.plot(
                [self.variable_treatment_start, self.variable_treatment_start],
                [-10, 15],
                'k-',
                lw=1)

            plt.xlabel('Day')
            plt.ylabel('Normalized log tumor size')
            plt.ylim(-10, 15)

            # Always select the longest date + 5
            plt.xlim(0, max(self.variable) + 5)

            if show_kl_divergence:
                plt.text(2, -8, 'KL Divergence: ' + str(self.kl_divergence))

            if show_legend:
                plt.legend(loc=0)

            if file_type == 'pdf':
                output_pdf.savefig(fig)
                plt.close(fig)
            elif file_type == 'svg':
                plt.savefig(output_path, format="svg")

    def __repr__(self):
        """
        Returns a string representation of the experimental_condition object.

        :return [string] The representation.
        """
        return ('\n'.join([
            f"Name: {self.name}",
            f"Treatment Start Date: {self.variable_treatment_start}",
            f"Source Id: {self.source_id}",
            f"K-L Divergence: {self.kl_divergence}",
            f"K-L P-Value: {self.kl_p_value}", f"mRecist: {self.mrecist}",
            f"Percent Credible Interval: {self.percent_credible_intervals}",
            f"Rates List: {self.rates_list}"
        ]))
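
A minimal standalone sketch of the credible-interval test implemented in `__credible_interval` above: the case-control difference of GP predictions is a linear combination a^T x of jointly Gaussian values, so its variance is a^T Sigma a. The toy GPs and data here are stand-ins, not the pipeline's own objects.

import numpy as np
from scipy.stats import norm
from GPy.kern import RBF
from GPy.models import GPRegression

# Toy case/control GPs fit to synthetic growth-like curves (stand-in data)
t = np.linspace(0, 20, 21)[:, None]
case = GPRegression(t, np.log(1 + 0.10 * t), RBF(1))
control = GPRegression(t, np.log(1 + 0.25 * t), RBF(1))

def interval_contains_zero(threshold, t1, t2):
    # mu = (case(t2) - case(t1)) - (control(t2) - control(t1))
    a = np.array([1, -1, -1, 1])
    means = np.array([case.predict(np.array([[t2]]))[0],
                      case.predict(np.array([[t1]]))[0],
                      control.predict(np.array([[t2]]))[0],
                      control.predict(np.array([[t1]]))[0]]).ravel()
    variances = np.zeros((4, 4))
    variances[0:2, 0:2] = case.predict(np.array([[t1], [t2]]), full_cov=True)[1]
    variances[2:4, 2:4] = control.predict(np.array([[t1], [t2]]), full_cov=True)[1]
    mu = a @ means
    var = a @ variances @ a  # Var(a^T x) = a^T Sigma a
    lower, upper = norm.interval(threshold, mu, np.sqrt(var))
    return lower < 0 < upper

print(interval_contains_zero(0.95, 0.0, 15.0))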
Example #12
0
class Gaussfit:
    """Handles GPR of input data."""
    def __init__(self):
        """Initialize a Gaussfit object."""
        self.kernel = None
        self.model = None
        self.scale = None
        self.translate = None
        self._save_fig = False
        self._save_path = None
        self.kernel_name = None  # Used for saving file names

    # Backing attributes (_save_fig, _save_path) prevent the property
    # accessors below from recursing into themselves.
    @property
    def save_fig(self):
        return self._save_fig

    @save_fig.setter
    def save_fig(self, save_fig):
        self._save_fig = save_fig

    @property
    def save_path(self):
        return self._save_path

    @save_path.setter
    def save_path(self, save_path):
        self._save_path = save_path

    def set_gp_kernel(self,
                      kernel=DEFAULTS['kernel'],
                      in_dim=DEFAULTS['input_dim'],
                      variance=DEFAULTS['variance'],
                      lengthscale=DEFAULTS['lengthscale'],
                      multi_dim=False):
        """Sets the kernel of this Gaussfit."""
        self.kernel_name = kernel  # This is used for saving file names
        if kernel == 'RBF':
            self.kernel = RBF(input_dim=in_dim,
                              variance=variance,
                              lengthscale=lengthscale,
                              ARD=multi_dim)
        elif kernel == 'Exponential':
            self.kernel = Exponential(input_dim=in_dim,
                                      variance=variance,
                                      lengthscale=lengthscale,
                                      ARD=multi_dim)
        elif kernel == 'Matern32':
            self.kernel = Matern32(input_dim=in_dim,
                                   variance=variance,
                                   lengthscale=lengthscale,
                                   ARD=multi_dim)
        elif kernel == 'Matern52':
            self.kernel = Matern52(input_dim=in_dim,
                                   variance=variance,
                                   lengthscale=lengthscale,
                                   ARD=multi_dim)
        else:
            print('Kernel not recognized or not implemented')

    def populate_gp_model(self,
                          observable,
                          lecs,
                          energy=None,
                          rescale=False,
                          fixvariance=0):
        """Creates a model based on given data and kernel.
        
        Args:
        observable - numpy array with observable. (1 row for each observable from each lec sample)
        lecs - numpy array with lec parameters fit should be done with regard to (lec 1 coloum 1 and so on, sample 1 on row 1 and so on)
        energy - energy values 
        """
        # Add row with energies to parameters for fit (c for col if that is that is the right way)
        if energy is not None:
            lecs = np.r_(lecs, energy)
        if rescale:
            (lecs, observable) = self.rescale(lecs, observable)
        lecs.transpose()

        observable.transpose()
        self.model = GPRegression(lecs, observable, self.kernel)

        self.model.Gaussian_noise.variance.unconstrain()

        self.model.Gaussian_noise.variance = fixvariance

        self.model.Gaussian_noise.variance.fix()

    def optimize(self, num_restarts=1):
        """Optimize the model."""

        # Something wrong, model doesn't always converge
        self.model.optimize_restarts(num_restarts=num_restarts, messages=True)
        print(self.model)

    def rescale(self, inlecs, inobs):
        """Rescales the input parameters that Gpy handles,
           so that they are in the interval [-1,1] #Remove 16xnr 
        """

        if self.translate is None:
            self.translate = np.append(np.mean(inlecs, axis=0), np.mean(inobs))

        inlecs = inlecs - self.translate[None, :16]
        inobs = inobs - self.translate[16]

        if self.scale is None:
            self.scale = np.append(np.amax(abs(inlecs), axis=0),
                                   max(abs(inobs)))
            self.scale[self.scale <= 1e-10] = 1
        outlecs = inlecs / self.scale[None, :16]
        outobs = inobs / self.scale[16]

        return (outlecs, outobs)

    def calculate_valid(self, Xvalid):
        """Calculates model prediction in validation points"""
        if self.scale is not None:
            Xvalid = (Xvalid -
                      self.translate[None, :16]) / self.scale[None, :16]
            (Ymodel, Variance) = self.model.predict(Xvalid)
            Ymodel = Ymodel * self.scale[16] + self.translate[16]
            Variance = Variance * self.scale[16] * self.scale[16]
            return (Ymodel, Variance)
        else:
            return self.model.predict(Xvalid)

    def plot(self):
        """Plot the GP-model.
        Plot limits only for 1D-case.
        """
        print(self.model)
        self.model.plot()
        plt.show()

    def tags_to_title(self, train_tags, val_tags):
        """Create plot title from tags."""
        title = '_'.join(train_tags)
        title += '_' + '_'.join(val_tags)
        title += '_' + str(self.kernel_name)
        return title

    def save_fig_to_file(self, filename):
        """Saves the last specified global figure to file with filename
        File path specified by self.file_path.
        Also concatenates kernel name used
        """
        plt.savefig(self.save_path + filename)

    def generate_and_save_tikz(self, Ymodel, Yvalid, Variance, train_tags,
                               val_tags):
        fig = plt.figure()
        style.use('seaborn-bright')

        sigma = np.sqrt(Variance)
        Expected, = plt.plot([max(Yvalid), min(Yvalid)],
                             [max(Yvalid), min(Yvalid)],
                             '-',
                             linewidth=2,
                             zorder=10,
                             ms=19,
                             label="Expected")
        Data, = plt.plot(Yvalid,
                         Ymodel,
                         '.',
                         ms=0.5,
                         zorder=3,
                         label="Data points")
        plt.errorbar(Yvalid,
                     Ymodel,
                     yerr=2 * sigma,
                     fmt='none',
                     alpha=0.5,
                     zorder=1,
                     label="Error bars")

        plt.xlabel(r'Simulated value [\si{\milli\barn}]')
        plt.ylabel(r'Emulated value [\si{\milli\barn}]')
        plt.grid(True)

        modelError = str(self.get_model_error(Ymodel, Yvalid))

        # Create a legend for the line.
        first_legend = plt.legend(handles=[Expected, Data], loc=4)

        # The following saves the file to the folder as well as adding 3 rows.
        # The "clip mode=individual" was a bit tricky to add, so this is the
        # ugly way to solve it.
        tikz_save(self.save_path + self.tags_to_title(train_tags, val_tags) +
                  '_predicted_actual.tex',
                  figureheight='\\textwidth*0.8,\nclip mode=individual',
                  figurewidth='\\textwidth*0.8')

        # Last fix of tikz with script
        edit = EditText()
        # Adding tikz file info
        edit.fix_file(
            self.save_path + self.tags_to_title(train_tags, val_tags) +
            '_predicted_actual.tex',
            '% This file was created by matplotlib2tikz v0.6.3.',
            '%  ' + self.save_path + '\n%  ' +
            self.tags_to_title(train_tags, val_tags) + '\n%  Model Error: ' +
            modelError)

        # Adding legend
        edit.fix_file(
            self.save_path + self.tags_to_title(train_tags, val_tags) +
            '_predicted_actual.tex', '\\end{axis}',
            '\\legend{Data,Expected}\n\\end{axis}')
        # Adding forget plot
        edit.fix_file(
            self.save_path + self.tags_to_title(train_tags, val_tags) +
            '_predicted_actual.tex',
            '\\addplot [lightgray!80.0!black, opacity=0.5, mark=-, mark size=3, mark options={solid}, only marks]',
            '\\addplot [lightgray!80.0!black, opacity=0.5, mark=-, mark size=3, mark options={solid}, only marks, forget plot]'
        )

        # Making transformable to PNG
        edit.fix_file(
            self.save_path + self.tags_to_title(train_tags, val_tags) +
            '_predicted_actual.tex', '%  Model Error: ' + modelError,
            '\\documentclass{standalone}\n\\usepackage{tikz}\n'
            '\\usepackage{pgfplots}\n\\usepackage{siunitx}\n\n\\begin{document}')

        edit.fix_file(
            self.save_path + self.tags_to_title(train_tags, val_tags) +
            '_predicted_actual.tex', '\\end{tikzpicture}',
            '\\end{tikzpicture}\n\\end{document}')

    def get_model_error(self, Ymodel, Yvalid, alt=False):
        """A measure of how great the model's error is compared to validation points
        Currently uses the rms of the relative error
        """
        #Sum of a numpy array returns another array, we use the first (and only) element
        #if alt:
        #    return np.sqrt(np.mean(np.square((Ymodel-Yvalid)/np.std(Yvalid))))

        return np.sqrt(np.mean(np.square((Ymodel - Yvalid) / Yvalid)))

    def plot_predicted_actual(self, Ymodel, Yvalid, Variance, train_tags,
                              val_tags):
        """Plots the predicted values vs the actual values, adds a straight line and 2sigma error bars."""
        sigma = np.sqrt(Variance)
        plt.figure(1)
        plt.plot(Yvalid, Ymodel, '.')
        plt.errorbar(Yvalid, Ymodel, yerr=2 * sigma, fmt='none')
        plt.plot([max(Yvalid), min(Yvalid)], [max(Yvalid), min(Yvalid)], '-')

        plt.xlabel('Simulated value [mb]')
        plt.ylabel('Emulated value [mb]')

        # Do we want to save to file?
        if self.save_fig:
            self.save_fig_to_file(
                self.tags_to_title(train_tags, val_tags) +
                "_predicted_actual.png")
        plt.show()

    def get_sigma_intervals(self, Ymodel, Yvalid, Variance):
        """Returns the fraction of errors within 1, 2, and 3 sigma."""
        sigma = np.sqrt(Variance)
        n = np.array([0, 0, 0])
        errors = abs(Yvalid - Ymodel)
        for i, e in enumerate(errors):
            if e <= sigma[i]:
                n[0] = n[0] + 1
            if e <= 2 * sigma[i]:
                n[1] = n[1] + 1
            if e <= 3 * sigma[i]:
                n[2] = n[2] + 1
        return n / float(np.shape(errors)[0])

    def plot_modelerror(self, Xvalid, Xlearn, Ymodel, Yvalid, train_tags,
                        val_tags):
        """ Creates a plot showing the vallidated error """
        alldists = cdist(Xvalid, Xlearn, 'euclidean')
        mindists = np.min(alldists, axis=1)
        plt.figure(1)
        plt.plot(mindists, Ymodel - Yvalid, '.')
        plt.xlabel('Distance to closest training point')
        plt.ylabel('Validated error [mb]')
        plt.axis([
            0, 1.1 * max(mindists), 1.1 * min(Ymodel - Yvalid),
            1.1 * max(Ymodel - Yvalid)
        ])

        # Do we want to save val error to file?
        if self.save_fig:
            self.save_fig_to_file(
                self.tags_to_title(train_tags, val_tags) + "_val_error.png")
        plt.figure(2)
        plt.plot(mindists, (Ymodel - Yvalid) / Yvalid, '.')
        plt.xlabel('Distance to closest training point')
        plt.ylabel('Validated relative error')
        plt.axis([
            0, 1.1 * max(mindists), 1.1 * min((Ymodel - Yvalid) / Yvalid),
            1.1 * max((Ymodel - Yvalid) / Yvalid)
        ])

        # Save the relative-error plot to file?
        if self.save_fig:
            self.save_fig_to_file(
                self.tags_to_title(train_tags, val_tags) +
                "_val_rel_error.png")
        plt.show()

    def plot_model(self, Xvalid, Ymodel, Yvalid):
        """Plot the model of training data with the model of walidation data."""
        plt.figure(3)
        plt.plot(Xvalid, Ymodel, 'bo')
        plt.plot(Xvalid, Yvalid, 'rx')
        plt.show()

    """Plots the kernel function of lec index"""

    def plot_kernel(self, lec_idx):
        plot_covariance(self.kernel, visible_dims=lec_idx)
        plt.show()

    """Plots a slice of of each lec through the center point
    Set energy to None if energy is not a parameter in your model"""

    def plot_lecs(self, center, intervals, energy=None):
        if energy is not None:
            center = np.append(center, energy).reshape(1, 17)
            intervals = np.append(intervals, 0).reshape(1, 17)
        else:
            intervals = np.append(intervals, 0).reshape(1, 16)
        for i in range(16):
            plt.subplot(4, 4, i + 1)
            x = np.linspace(center[0][i] - intervals[0][i],
                            center[0][i] + intervals[0][i],
                            num=200)
            lecs = np.tile(center[0], (200, 1))
            lecs[:, i] = x
            obs, _ = self.calculate_valid(lecs)
            plt.plot(x, obs)
        plt.show()

    def save_model_parameters(self, savepath, traintags, kernel, LEC_LENGTH,
                              lengthscale, multidim, rescale):
        "Saves GPy model hyperparameters as a .pickle file" ""

        params = self.model.param_array

        if (savepath.endswith(".pickle")) and (not os.path.isfile(savepath)):
            with open(savepath, 'w') as f:
                pickle.dump([
                    params, kernel, traintags, LEC_LENGTH, lengthscale,
                    multidim, rescale
                ], f)
        elif (not savepath.endswith(".pickle")):
            print "*****ERROR***** Model properties must be saved as .pickle file *****ERROR*****"
        elif os.path.isfile(savepath):
            print "*****ERROR***** File already exists. Cannot save to existing file. *****ERROR*****"

    def load_model_parameters(self, Ylearn, Xlearn, loadpath):
        """Loads a GPy model with hyperparameters from a .pickle file"""

        with open(loadpath, 'rb') as f:
            filecontents = pickle.load(f)
            if len(filecontents) == 6:
                params, kernel, traintags, LEC_LENGTH, lengthscale, multi_dim = filecontents
                rescale = False
            elif len(filecontents) == 7:
                params, kernel, traintags, LEC_LENGTH, lengthscale, multi_dim, rescale = filecontents
            else:
                raise ValueError('Unexpected number of entries in ' + loadpath)

        self.set_gp_kernel(kernel=kernel,
                           in_dim=LEC_LENGTH,
                           lengthscale=lengthscale,
                           multi_dim=multi_dim)

        if rescale:
            (Xlearn, Ylearn) = self.rescale(Xlearn, Ylearn)
        m_load = GPRegression(Xlearn, Ylearn, self.kernel, initialize=False)
        m_load.update_model(False)
        m_load.initialize_parameter()
        m_load[:] = params
        m_load.update_model(True)
        self.model = m_load

    def plot_energy_curve(self, mod_obs, val_obs, mod_var, val_energy):
        """Plots modeled and validation observables against energy."""
        plt.plot(val_energy, val_obs, 'x')
        plt.plot(val_energy, mod_obs, 'o')
        plt.show()
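
A short standalone sketch of the rescaling round trip used by Gaussfit above: translate to zero mean, divide by the maximum absolute value so every parameter lies in [-1, 1], then undo the transform on predictions. The 2-column toy arrays stand in for the 16 LECs.

import numpy as np

# Toy stand-ins for the LEC design matrix and observables
lecs = np.array([[1.0, 10.0], [3.0, 30.0], [5.0, 50.0]])
obs = np.array([2.0, 4.0, 6.0])

# Forward transform: translate to zero mean, then scale into [-1, 1]
translate = np.append(np.mean(lecs, axis=0), np.mean(obs))
centered_lecs = lecs - translate[None, :-1]
centered_obs = obs - translate[-1]
scale = np.append(np.amax(abs(centered_lecs), axis=0), max(abs(centered_obs)))
scale[scale <= 1e-10] = 1  # Guard against constant columns
scaled_lecs = centered_lecs / scale[None, :-1]
scaled_obs = centered_obs / scale[-1]

# Inverse transform, as in calculate_valid: shift the prediction mean back
# and scale the variance by the square of the observable scale
pred_mean = scaled_obs * scale[-1] + translate[-1]
pred_var = np.full_like(obs, 0.01) * scale[-1] ** 2

assert np.allclose(pred_mean, obs)
print(scaled_lecs.min(), scaled_lecs.max())  # within [-1, 1]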
Example #13
0
#     [ContinuousParameter('x1', 0., 157.), ContinuousParameter('x2', 0., 157.), ContinuousParameter('x3', 0., 157.),
#      ContinuousParameter('x4', 0., 157.), ContinuousParameter('x5', 0., 157.), ContinuousParameter('x6', 0., 5999.),
#      ContinuousParameter('x7', 0., 999.), ContinuousParameter('x8', 0., 699.)])
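# The snippet assumes `parameter_space`, `n_samples`, `training_function`, and
# `user_sample_vector` are defined earlier. A plausible reconstruction of the
# parameter space from the commented-out bounds above would be:
#
#     from emukit.core import ParameterSpace, ContinuousParameter
#     parameter_space = ParameterSpace(
#         [ContinuousParameter('x1', 0., 157.), ContinuousParameter('x2', 0., 157.),
#          ContinuousParameter('x3', 0., 157.), ContinuousParameter('x4', 0., 157.),
#          ContinuousParameter('x5', 0., 157.), ContinuousParameter('x6', 0., 5999.),
#          ContinuousParameter('x7', 0., 999.), ContinuousParameter('x8', 0., 699.)])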

latin_design = LatinDesign(parameter_space=parameter_space)
X0 = latin_design.get_samples(n_samples)
Y0 = training_function(X0)
#D0 = ((Y0 - target)**2).sum(axis=1)
#plotter = BayesOptPlotter(h_noiseless, target, xmin, xmax, X0=X0, Y0=Y0)

model = GPRegression(X0, Y0)
model_wrapped = GPyModelWrapper(model)
target = user_sample_vector
acq = L2_LCB(model=model_wrapped, target=target)

fit_update = lambda a, b: model.optimize_restarts(verbose=False)
bayesopt_loop = BayesianOptimizationLoop(model=model_wrapped,
                                         space=parameter_space,
                                         acquisition=acq)
bayesopt_loop.iteration_end_event.append(fit_update)
bayesopt_loop.run_loop(training_function, 5)

# 5. train and wrap the model in Emukit
# model_gpy = GPRegression(X, Y, normalizer=True)
#
# model_emukit = GPyModelWrapper(model_gpy)
# expected_improvement = ExpectedImprovement(model=model_emukit)
# bayesopt_loop = BayesianOptimizationLoop(model=model_emukit,
#                                          space=parameter_space,
#                                          acquisition=expected_improvement,
#                                          batch_size=5)
Example #14
0
class GaussianProcessRewardModel(RewardModel):
    """
    Models rewards with a Gaussian process regressor.

    Implemented with GPy's Gaussian process regression model.

    The GP is updated online as samples are added. Hyperparameters for the GP
    are fit in batch once a threshold number of samples has been collected,
    and are refined afterwards as more samples are added. Refinement is
    triggered sparingly because fitting has computational complexity of
    O(N^3) in the number of samples.

    Parameters:
    -----------
    min_samples: integer (default 10)
        The number of samples after which initial batch hyperparameter
        fitting is performed.
    batch_retries: integer (default 19)
        The number of random restarts for the initial hyperparameter fit.
    enable_refine: boolean (default True)
        Whether the hyperparameters are refined after the initial batch fit.
    refine_period: integer (default 0)
        The number of samples since the last refinement after which the
        hyperparameters are refined again.
    refine_ll_delta: numeric (default 1.0)
        The hyperparameters are refined after the average GP marginal
        log-likelihood decreases by this much since the last refinement.
    refine_retries: integer (default 0)
        The number of additional random restarts for each refinement fit.
    kernel_type: string (default 'rbf')
        The kernel to use, either 'rbf' or 'matern'.

    Other Keyword Parameters:
    -------------------
    Passed through to GPy.models.GPRegression's __init__
    """

    def __init__(self, min_samples=10, batch_retries=19, enable_refine=True,
                 refine_period=0, refine_ll_delta=1.0, refine_retries=0, 
                 kernel_type='rbf', verbose=False, **kwargs):

        self.min_samples = min_samples
        self.hp_batch_retries = batch_retries
        
        self.enable_refine = enable_refine
        self.hp_refine_ll_delta = float(refine_ll_delta)
        self.hp_refine_retries = refine_retries
        self.hp_refine_period = refine_period
        self.last_refine_iter = 0

        self.hp_init = False
        self.last_ll = None
        self.kwargs = kwargs
        self.verbose = bool(verbose)

        if kernel_type.lower() == 'rbf':
            self.kernel_class = RBF
        elif kernel_type.lower() == 'matern':
            self.kernel_class = Matern32
        else:
            raise ValueError('Unknown kernel_type: ' + kernel_type)

        self.kernel = None
        self.gp = None  # Init later
        self.inputs = []
        self.outputs = []

    def _initialize(self):
        x = np.asarray(self.inputs)
        y = np.asarray(self.outputs).reshape(-1, 1)
        self.kernel = self.kernel_class(input_dim=x.shape[1], ARD=True)
        self.gp = GPRegression(x, y, kernel=self.kernel, **self.kwargs)

    @property
    def num_samples(self):
        return len(self.inputs)

    def average_log_likelihood(self):
        # NOTE Returns the negative of the average marginal log-likelihood
        if self.gp is None or self.num_samples < self.min_samples:
            return None
        return -self.gp.log_likelihood() / self.num_samples

    def report_sample(self, x, reward):
        self.inputs.append(x)
        self.outputs.append(reward)

        if self.gp is None:
            self.batch_optimize()
        else:
            x = np.asarray(self.inputs)
            y = np.asarray(self.outputs).reshape(-1, 1)
            self.gp.set_XY(x, y)

        # Wait until we've initialized
        if not self.hp_init:
            return

        current_ll = self.average_log_likelihood()
        if self.verbose:
            rospy.loginfo('Prev LL: %f Curr LL: %f', self.last_ll, current_ll)

        self.check_refine(current_ll)

    def check_refine(self, current_ll):
        if not self.enable_refine:
            return

        if current_ll > self.last_ll:
            self.last_ll = current_ll

        # If the LL has decreased by refine_ll_delta
        delta_achieved = current_ll < self.last_ll - self.hp_refine_ll_delta

        # If it has been refine_period samples since last refinement
        period_achieved = self.num_samples > self.last_refine_iter + self.hp_refine_period
        if delta_achieved or period_achieved:
            self.batch_optimize(self.hp_refine_retries + 1)
            self.last_refine_iter = self.num_samples

    def batch_optimize(self, n_restarts=None):
        if self.num_samples < self.min_samples:
            return

        if n_restarts is None:
            n_restarts = self.hp_batch_retries + 1

        # NOTE Warm-restarting seems to get stuck in local optima, possibly from mean?
        # if self.gp is None: 
        self._initialize()

        if self.verbose:
            rospy.loginfo('Batch optimizing with %d restarts...', n_restarts)

        self.gp.optimize_restarts(optimizer='bfgs',
                                  messages=False,
                                  num_restarts=n_restarts)

        if self.verbose:
            rospy.loginfo('Optimization complete. Model:\n%s\n Kernel:\n%s', str(self.gp), str(self.kernel.lengthscale))

        self.hp_init = True
        self.last_ll = self.average_log_likelihood()

    def predict(self, x, return_std=False):
        if self.gp is None:
            #raise RuntimeError('Model is not fitted yet!')
            pred_mean = 0
            pred_std = float('inf')
        else:
            x = np.asarray(x)
            if len(x.shape) == 1:
                x = x.reshape(1, -1)
            pred_mean, pred_var = self.gp.predict_noiseless(x)
            # Clamp negative predictive variances (numerical artifacts) to zero
            if np.any(pred_var < 0):
                rospy.logwarn('Negative variance %f rounding to 0', np.min(pred_var))
                pred_var = np.maximum(pred_var, 0)
            pred_std = np.sqrt(pred_var)

        if return_std:
            return np.squeeze(pred_mean), np.squeeze(pred_std)
        else:
            return np.squeeze(pred_mean)

    def clear(self):
        self.inputs = []
        self.outputs = []
        self.kernel = None
        self.gp = None

    def fit(self, X, y):
        """Initialize the model from lists of inputs and corresponding rewards.

        Parameters
        ----------
        X : Iterable of inputs
        y : Iterable of corresponding rewards
        """
        if len(X) != len(y):
            raise RuntimeError('X and y lengths must be the same!')

        self.inputs = list(X)
        self.outputs = list(y)
        self._initialize()
        self.batch_optimize(self.hp_batch_retries)

    @property
    def model(self):
        return self.gp
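
A minimal usage sketch for the reward model above, on synthetic (input, reward) pairs; it assumes GPy is installed and that rospy logging is only exercised on the verbose/warning paths.

import numpy as np

model = GaussianProcessRewardModel(min_samples=5, batch_retries=4)

# Stream samples; hyperparameters are batch-fit once min_samples have been
# reported, then refined as further samples accumulate
rng = np.random.default_rng(0)
for _ in range(20):
    x = rng.uniform(-1, 1, size=2)
    model.report_sample(x, reward=-np.sum(x ** 2))

mean, std = model.predict(np.zeros(2), return_std=True)
print(mean, std)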
Example #15
0
    def gpy_regression(hyperparameters: List[Hyperparameter],
                       gp_config: GPConfig, X_sample: np.ndarray,
                       Y_sample: np.ndarray) -> GPRegression:
        kernel = GPyModel.create_kernel(gp_config.kernel,
                                        X_sample.shape[1],
                                        ARD=gp_config.ard)

        # If there is only one sample, .std() == 0 and Y ends up being NaN.
        model = GPRegression(X_sample,
                             Y_sample,
                             kernel=kernel,
                             normalizer=len(X_sample) > 1)

        logging.debug("GPY hyperparam optimization start")

        min_bound = 1e-2
        max_bound = 1e3

        # TODO: bugbugbug
        if gp_config.informative_prior:
            for i, param in enumerate(hyperparameters):
                prior = GPyModel.prior_for_hyperparam(gp_config, param)
                model.kern.lengthscale[[i]].set_prior(prior)

            variance_prior = GPy.priors.Gamma(gp_config.gamma_a,
                                              gp_config.gamma_b)

            # TODO: Gaussian_noise.variance prior?

            model.kern.variance.unconstrain()
            model.kern.variance.set_prior(variance_prior)
        else:
            model.Gaussian_noise.variance.unconstrain()
            model.Gaussian_noise.variance.constrain_bounded(
                min_bound, max_bound)

            if gp_config.gamma_prior:
                model.kern.variance.set_prior(
                    GPy.priors.Gamma(gp_config.gamma_a, gp_config.gamma_b))
                model.kern.lengthscale.set_prior(
                    GPy.priors.Gamma(gp_config.gamma_a, gp_config.gamma_b))
            else:
                model.kern.variance.unconstrain()
                model.kern.variance.constrain_bounded(min_bound, max_bound)

                model.kern.lengthscale.unconstrain()
                model.kern.lengthscale.constrain_bounded(min_bound, max_bound)

        # TODO: parallel=True?
        # model.optimize_restarts(gp_config.num_optimize_restarts)
        logging.error("TODO: SET PARALLEL BACK FROM 2")
        model.optimize_restarts(2)

        logging.debug("GPY hyperparam optimization DONE, params: %s",
                      model.param_array)

        return GPyModel.wrap_kernel_with_rounding(model, hyperparameters)
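
A self-contained sketch of the hyperparameter treatment in gpy_regression above, on toy data; the bounds and Gamma parameters here are illustrative stand-ins, not the original gp_config values.

import numpy as np
import GPy
from GPy.kern import RBF
from GPy.models import GPRegression

rng = np.random.default_rng(0)
X = rng.uniform(0, 1, size=(20, 2))
Y = np.sin(6 * X[:, :1]) + 0.1 * rng.standard_normal((20, 1))

kernel = RBF(input_dim=2, ARD=True)
model = GPRegression(X, Y, kernel=kernel, normalizer=len(X) > 1)

# Box-constrain the noise, as in the non-informative-prior branch
model.Gaussian_noise.variance.unconstrain()
model.Gaussian_noise.variance.constrain_bounded(1e-2, 1e3)

# Gamma priors on the kernel hyperparameters, as in the gamma_prior branch
model.kern.variance.set_prior(GPy.priors.Gamma(2.0, 0.5))
model.kern.lengthscale.set_prior(GPy.priors.Gamma(2.0, 0.5))

model.optimize_restarts(2, verbose=False)
print(model.param_array)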