class GP(Base):
    """Gaussian-process interpolator built on a ``GPRegression`` model.

    Parameters
    ----------
    kernel : kernel object, optional
        Covariance function handed to ``GPRegression`` when fitting. When
        omitted, a new ``RBF(2, ARD=True)`` kernel is created for this
        instance.
    """

    def __init__(
        self,
        kernel=None,
    ):
        super().__init__()
        # A default built in the signature is evaluated once, at definition
        # time, and would be shared by every instance relying on it; build it
        # per instance instead.
        self.kernel = RBF(2, ARD=True) if kernel is None else kernel

    def _fit(self, X, y, n_restarts=5, verbose=False, random_state=None):
        """Fit the GP regression model to the training data.

        This function shouldn't be called directly.

        Parameters
        ----------
        X : array-like
            Training inputs, passed to ``GPRegression`` unchanged.
        y : array-like
            Training targets; a 1-D array is reshaped into a single column.
        n_restarts : int, optional
            Number of restarts passed to ``optimize_restarts``.
        verbose : bool, optional
            Forwarded to ``optimize_restarts``.
        random_state : int or None, optional
            Seed given to ``np.random.seed``. Note that this reseeds NumPy's
            global random state.

        Returns
        -------
        self
        """
        np.random.seed(random_state)
        if len(y.shape) == 1:
            y = y.reshape(-1, 1)
        self.model = GPRegression(X, y, self.kernel)
        self.model.optimize_restarts(n_restarts, verbose=verbose)
        return self

    def _predict_grid(self, x1lim, x2lim):
        """Predict the posterior mean on a regular 2-D grid.

        This method shouldn't be called directly.

        Parameters
        ----------
        x1lim, x2lim : pair of float
            ``(min, max)`` limits of the grid along each axis. Each axis is
            sampled at ``self.resolution`` evenly spaced points.

        Returns
        -------
        numpy.ndarray
            Flattened predicted means, one per grid node, in the row-major
            order of ``np.meshgrid(x1, x2)``.
        """
        x1min, x1max, x2min, x2max = (*x1lim, *x2lim)
        x1 = np.linspace(x1min, x1max, self.resolution)
        x2 = np.linspace(x2min, x2max, self.resolution)
        X1, X2 = np.meshgrid(x1, x2)
        # One (x1, x2) row per grid node, built without a Python-level loop.
        X = np.column_stack((X1.ravel(), X2.ravel()))
        return self.model.predict(X)[0].ravel()

    def _predict(self, X, return_variance=False):
        """Predict the posterior mean (and optionally variance) at ``X``.

        This method shouldn't be called directly.

        Parameters
        ----------
        X : array-like
            Query points, passed to the fitted model's ``predict``.
        return_variance : bool, optional
            When true, also return the variance exactly as the model
            returned it (it is not flattened).

        Returns
        -------
        numpy.ndarray or tuple
            The flattened mean, or ``(flattened mean, variance)``.
        """
        predictions, variance = self.model.predict(X)
        if return_variance:
            return predictions.ravel(), variance
        return predictions.ravel()
class MaximumLikelihoodGaussianProcess(object):
    """
    Gaussian Process model which has its own hyperparameters chosen
    by a maximum likelihood process
    """

    # Can't have instantiation of model without supplying data
    def __init__(self, X, Y, kernel, max_feval, num_restarts=20):
        """Build the underlying ``GPRegression`` model.

        :param X: training inputs, passed to ``GPRegression`` as ``X``
        :param Y: training targets, passed to ``GPRegression`` as ``Y``
        :param kernel: kernel object, passed to ``GPRegression`` as ``kernel``
        :param max_feval: stored on the instance; not used by the methods
            of this class
        :param num_restarts: number of random restarts used by :meth:`fit`
            (defaults to 20, the value that was previously hard-coded)
        :raises ImportError: if ``GPRegression`` is falsy (presumably set so
            when GPy could not be imported -- confirm at the import site)
        """
        if not GPRegression:
            raise ImportError('No module named GPy')
        self.X = X
        self.Y = Y
        self.kernel = kernel
        self.model = GPRegression(X=self.X, Y=self.Y, kernel=self.kernel)
        self.max_feval = max_feval
        self.num_restarts = num_restarts

    def fit(self):
        """
        Fits the model with random restarts.
        :return: None
        """
        self.model.optimize_restarts(num_restarts=self.num_restarts,
                                     verbose=False)

    def predict(self, x):
        """Return whatever the underlying model's ``predict`` returns for ``x``."""
        return self.model.predict(Xnew=x)
class Stationary(Base):
    """
    GP regression with a selectable ARD kernel for sensor placement.

    ``kernel_name`` selects the kernel: ``'m32'`` (Matern32),
    ``'m52'`` (Matern52) or ``'rbf'`` (RBF).
    """

    def __init__(self, n_restarts, kernel_name, verbose=True):
        """
        :param n_restarts: number of restarts passed to ``optimize_restarts``
        :param kernel_name: one of ``'m32'``, ``'m52'``, ``'rbf'``
        :param verbose: forwarded to the base-class constructor
        """
        super().__init__(verbose)
        self.__n_restarts = n_restarts
        self.__kernel_name = kernel_name

    def _Kernel(self, S1, S2=None):
        """Evaluate the fitted model's kernel matrix ``K(S1, S2)``."""
        return self.__model.kern.K(S1, S2)

    def _fit(self, X, y, ECM=None):
        """Fit a ``GPRegression`` model with the configured kernel.

        :param X: training inputs; ``X.shape[1]`` sets the kernel dimension
        :param y: training targets
        :param ECM: unused here
        :return: self
        :raises KeyError: if ``kernel_name`` is not a supported key
        """
        self._X = X
        self._y = y
        n_dims = self._X.shape[1]
        # Map names to kernel classes so that only the requested kernel is
        # instantiated (previously all three were built on every fit).
        kernel_classes = {'m32': Matern32, 'm52': Matern52, 'rbf': RBF}
        kernel_cls = kernel_classes[self.__kernel_name]
        kernel = kernel_cls(input_dim=n_dims,
                            active_dims=list(range(n_dims)),
                            ARD=True)
        self.__model = GPRegression(X, y, kernel)
        self.__model.optimize_restarts(self.__n_restarts,
                                       verbose=self._verbose)
        return self

    def _predict(self, X, return_cov=True):
        """Predict at ``X``.

        :param X: query points
        :param return_cov: when true, return the model's full
            ``predict(X, full_cov=True)`` result; otherwise only its first
            element (the mean)
        """
        if not return_cov:
            return self.__model.predict(X)[0]
        return self.__model.predict(X, full_cov=True)
def __call__(self,
             gp: GPRegression,
             X: np.ndarray,
             f_s: float,
             xi: float = 0.01) -> np.ndarray:
    """Evaluate the acquisition value at ``X`` from the GP's prediction.

    :param gp: fitted model whose ``predict(X)`` returns ``(mean, variance)``
    :param X: query point(s); must not be None
    :param f_s: forwarded unchanged to ``self.raw_call``
    :param xi: forwarded unchanged to ``self.raw_call``
    :return: the result of ``self.raw_call(mu, sigma, f_s, xi)``
    """
    assert X is not None
    mu, var = gp.predict(X)
    # Floor non-positive variances element-wise so the check also works when
    # several points are predicted at once (a bare ``if var <= 0`` raises on
    # arrays with more than one element).
    non_positive = np.asarray(var) <= 0
    if np.any(non_positive):
        logging.debug(
            "GPy predicted non-positive variance, setting to 1e-6 instead. "
            "This is a known issue. See "
            "https://github.com/SheffieldML/GPy/issues/253 for more details")
        var = np.where(non_positive, 1e-6, var)
    sigma = np.sqrt(var)
    return self.raw_call(mu, sigma, f_s, xi)
class GPModel():
    """Pairs training data and a kernel expression with a ``GPRegression`` model."""

    def __init__(self, X, Y, kernel_expression=SumKE(['WN'])._initialise()):
        self.X, self.Y = X, Y
        self.kernel_expression = kernel_expression
        # Populated by fit() / compute_utility().
        self.restarts = None
        self.model = None
        self.cached_utility_function = None
        self.cached_utility_function_type = None

    # Kwargs passed to optimize_restarts, which passes them to optimize
    # Check comments in optimize's class AND optimization.get_optimizer for real list of optimizers
    # TODO: Eventually set robust to True; see description in optimize_restarts method
    def fit(self, restarts=None, optimiser='lbfgsb', verbose=False, robust=False, **kwargs):
        """Build the regression model and optimise it with restarts.

        An explicit ``restarts`` value is remembered on the instance; when it
        is omitted the previously remembered value is used, and a
        ``ValueError`` is raised if there is none. Returns ``self``.
        """
        if restarts is not None:
            self.restarts = restarts
        elif self.restarts is None:
            raise ValueError('No restarts value specified')

        kernel = self.kernel_expression.to_kernel()
        self.model = GPRegression(self.X, self.Y, kernel)

        # Ignore known numerical warnings while optimising.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            self.model.optimize_restarts(
                num_restarts=self.restarts,
                verbose=verbose,
                robust=robust,
                optimizer=optimiser,
                **kwargs)
        return self

    def interpret(self):
        """Return the interpretation of the fitted kernel.

        The fitted kernel is mapped onto a deep copy of the kernel
        expression, so the stored expression itself is not passed on.
        """
        fitted_kernel = self.model.kern
        expression_copy = deepcopy(self.kernel_expression)
        fitted_expression = fit_ker_to_kex_with_params(fitted_kernel, expression_copy)
        return fitted_expression.get_interpretation()

    def predict(self, X, quantiles=(2.5, 97.5), full_cov=False, Y_metadata=None,
                kern=None, likelihood=None, include_likelihood=True):
        """Return mean, covariance and the first two predicted quantiles at ``X``.

        The result is a dict with keys ``'mean'``, ``'covariance'``,
        ``'low_quantile'`` and ``'high_quantile'``.
        """
        mean, cov = self.model.predict(X, full_cov, Y_metadata, kern,
                                       likelihood, include_likelihood)
        qs = self.model.predict_quantiles(X, quantiles, Y_metadata, kern, likelihood)
        low, high = qs[0], qs[1]
        return {
            'mean': mean,
            'covariance': cov,
            'low_quantile': low,
            'high_quantile': high,
        }

    def change_plotting_library(self, library='plotly_offline'):
        '''Wrapper of GPy.plotting's homonymous function; supported values are:
        'matplotlib', 'plotly', 'plotly_online', 'plotly_offline' and 'none'.

        If 'plotly' then a 3-tuple is returned, with as 1st value the Figure
        object requiring a .show() to display.'''
        # Inside the method body this name resolves to the module-level
        # function, not to this method.
        change_plotting_library(library)

    def plot(self):
        """Return the result of the underlying model's ``plot()``."""
        return self.model.plot()

    # Model fit objective criteria & related values:
    def _ll(self):
        """Log likelihood of the fitted model."""
        return self.model.log_likelihood()

    def _n(self):
        """Number of data points."""
        return len(self.model.X)

    def _k(self):
        """Number of estimated parameters, i.e. model degrees of freedom."""
        return self.model._size_transformed()

    def _ordered_score_ps(self):
        """Arguments handed to a scoring function, in order: model, ll, n, k."""
        return self.model, self._ll(), self._n(), self._k()

    def compute_utility(self, score_f):
        """Score the model with ``score_f``, cache the value and its name, return it."""
        score_args = self._ordered_score_ps()
        self.cached_utility_function = score_f(*score_args)
        self.cached_utility_function_type = score_f.__name__
        return self.cached_utility_function
class EmpiricalStickBreakingGPModel(Model):
    """
    Compute the empirical probability given the counts,
    convert the empirical probability into a real valued
    vector that can be modeled with a GP.
    """

    def __init__(self, K, kernel, D=1, alpha=1):
        """
        :param K: number of count columns expected in ``X``
        :param kernel: kernel object passed to ``GPRegression``
        :param D: number of columns expected in the inputs ``Z``
        :param alpha: value added to every count before normalising
        """
        self.alpha = alpha
        self.K = K
        self.D = D
        self.kernel = kernel

    def add_data(self, Z, X, optimize_hypers=True):
        """Fit the GP to the transformed empirical probabilities.

        :param Z: inputs of shape ``(M, D)``
        :param X: counts of shape ``(M, K)``
        :param optimize_hypers: when true, optimise the model after building it
        """
        assert Z.ndim == 2 and Z.shape[1] == self.D
        M = Z.shape[0]
        assert X.shape == (M, self.K)

        # Get the empirical probabilities (offset by alpha to ensure nonzero).
        # ``np.float`` was removed from NumPy; the builtin ``float`` is the
        # same dtype (float64).
        smoothed = (self.alpha + X).astype(float)
        pi_emp_train = smoothed / smoothed.sum(axis=1)[:, None]

        # Convert these to psi's
        self.Z = Z
        self.psi = np.array([pi_to_psi(pi) for pi in pi_emp_train])

        # Compute the mean value of psi and centre the targets on it
        self.mu = self.psi.mean(axis=0)
        self.psi -= self.mu

        # Create the GP Regression model
        from GPy.models import GPRegression
        self.model = GPRegression(Z, self.psi, self.kernel)

        # Optimize the kernel parameters
        if optimize_hypers:
            self.model.optimize(messages=True)

    def initialize_from_data(self, initialize_to_mle=False):
        "For consistency"
        pass

    def generate(self, keep=True, **kwargs):
        raise NotImplementedError

    def collapsed_predict(self, Z_test):
        """Predict at ``Z_test``.

        :return: ``(pi_pred, psi_pred, psi_pred_var)`` where ``psi_pred`` has
            the training mean added back and ``pi_pred`` is ``psi_to_pi``
            applied row by row
        """
        psi_pred, psi_pred_var = self.model.predict(Z_test, full_cov=False)
        psi_pred += self.mu

        pi_pred = np.array([psi_to_pi(psi) for psi in psi_pred])
        return pi_pred, psi_pred, psi_pred_var

    def predict(self, Z_test):
        """Alias of :meth:`collapsed_predict`."""
        return self.collapsed_predict(Z_test)

    def predictive_log_likelihood(self, Z_test, X_test):
        """Log likelihood of the counts ``X_test`` under the predictions.

        :return: ``(pll, pi_pred)``; NaN terms of ``X_test * log(pi_pred)``
            are ignored by the sum
        """
        pi_pred, _, _ = self.predict(Z_test)

        pll = 0
        pll += gammaln(X_test.sum(axis=1) + 1).sum() - gammaln(X_test + 1).sum()
        pll += np.nansum(X_test * np.log(pi_pred))

        return pll, pi_pred
# Load one gene from the della Gatta TRP63 expression data set.
data = pods.datasets.della_gatta_TRP63_gene_expression(
    data_set='della_gatta', gene_number=937)
x, y = data['X'], data['Y']

# Standardise the targets to zero mean and unit standard deviation.
offset = np.mean(y)
scale = np.std(y)
yhat = (y - offset) / scale

# Alternative kernels tried previously:
#   RBF(input_dim=1, variance=100)
#   Matern32(input_dim=1, variance=2.0, lengthscale=200)
model = GPRegression(x, yhat)
model.kern.lengthscale = 20  # this will widen with 100, 200
# (the likelihood variance was previously also tried at 0.001)

# Log likelihood before and after optimising the hyperparameters.
print(model.log_likelihood())
model.optimize()
print(model.log_likelihood())

# Predict on a dense column of test inputs and plot against the training data.
xt = np.linspace(-20, 260, 100).reshape(-1, 1)
yt_mean, yt_var = model.predict(xt)
plot_gp(yt_mean, yt_var, xt,
        X_train=model.X.flatten(), Y_train=model.Y.flatten())
class DCKE(RegressorMixin):
    r"""
    Dynamically Controlled Kernel Estimation

    Computes the conditional expectation $E[Y \mid X=x]$ from a training set
    $X_i$, $y_i$, $i=1, \ldots, N$ of joint realizations of $X$ and $Y$ for an
    arbitrary prediction set of $x$'s. The DCKE regressor first uses local
    regression on a mesh grid to solve the problem on the mesh grid and then
    uses GPR to evaluate in between the points on the mesh grid.

    Optionally, a control variate $Z$ can be supplied together with
    $\mu_Z = E[Z \mid X=x_k]$ for the points $x_k$ on the mesh grid. In that
    case, the expectation $E[Y + \beta (Z-\mu_Z) \mid X=x_k]$ is computed on
    the mesh grid with variance reduced by the correlation between $Y$ and $Z$.
    """

    def __init__(self, locreg, gpr_kernel):
        """
        Initializes the DCKE object.

        :param locreg: an instance of LocalRegression
        :param gpr_kernel: an instance of GPy.kern
        """
        self.locreg = locreg
        self.gpr_kernel = gpr_kernel
        self.gpr_ = None
        self.gp_var_ = None
        self.X_train_ = None
        self.y_train_ = None
        self.x_mesh_ = None
        self.y_mesh_ = None
        self.Z_ = None
        self.mz_ = None
        self.cov_ = None
        self.var_ = None
        self.beta_ = None

    def fit(self, X, y, x_mesh, Z=None, mz=None, bandwidth=None):
        r"""
        Fits the DCKE to training data.

        :param X: a numpy array of shape (num_samples, num_dimensions)
        :param y: a numpy array of shape (num_samples,)
        :param Z: a numpy array of shape (num_samples,)
        :param x_mesh: a numpy array of shape (num_meshes, num_dimensions)
        :param mz: a numpy array of shape (num_meshes,) where every mz[k]
            satisfies $mz[k] = E[Z \mid X=x_k]$, the x_k being the points in
            x_mesh
        :param bandwidth: bandwidth parameter for the local regression
        :return: self
        :raises ValueError: if exactly one of Z and mz is given
        """
        self.X_train_ = X
        self.y_train_ = y
        self.x_mesh_ = x_mesh

        if (Z is None) != (mz is None):
            raise ValueError(
                'Parameter Z and mz have to be either both None or both not None.'
            )
        if Z is None:
            # No control variate: use an all-zero one, which leaves y unchanged.
            self.Z_ = np.zeros_like(self.y_train_)
            self.mz_ = np.zeros(self.x_mesh_.shape[0])
        else:
            self.Z_ = Z
            self.mz_ = mz

        self.locreg.warm_start = True
        self.locreg.fit(X, y, bandwidth)
        return self

    def _calculate_locregs(self):
        r"""
        Uses the approximate conditional expectation operator
        $\tilde E[_ \mid X=x]$ defined by the local regression in self.locreg
        to compute the approximate optimal beta for the control variate $Z$
        defined by

        $\beta_x = - \tfrac{\Cov[Y, Z \mid X=x]}{\Var[Z \mid X=x]}$

        for all $x$ in self.x_mesh_.

        :return: None; the results are stored in self.cov_, self.var_,
            self.beta_ and self.y_mesh_, each a numpy array of shape
            (num_mesh_points,)
        """
        h = self.locreg.bandwidth
        n = self.x_mesh_.shape[0]
        self.cov_ = np.zeros(n)
        self.var_ = np.zeros(n)
        self.y_mesh_ = np.zeros(n)
        self.beta_ = np.zeros(n)
        m_y = np.zeros(n)
        m_z = np.zeros(n)

        # NOTE(review): fit_partial/predict_partial presumably reuse state set
        # up by the preceding predict() call for mesh point i, so the order of
        # the calls below is kept as is -- confirm against LocalRegression.
        for i in range(n):
            # Conditional means of y and Z at mesh point i.
            m_y[i] = self.locreg.predict(
                np.atleast_2d(self.x_mesh_[i]).T).squeeze()
            self.locreg.fit_partial(np.atleast_2d(self.Z_).T, h)
            m_z[i] = self.locreg.predict_partial().squeeze()

            # Conditional covariance of (y, Z) and variance of Z.
            self.locreg.fit_partial(
                (self.y_train_ - m_y[i]) * (self.Z_ - m_z[i]), h)
            self.cov_[i] = self.locreg.predict_partial().squeeze()
            self.locreg.fit_partial((self.Z_ - m_z[i])**2, h)
            self.var_[i] = self.locreg.predict_partial().squeeze()

            # A zero variance (e.g. the all-zero Z used when no control
            # variate is supplied) would give 0/0 = NaN and poison y_mesh_;
            # a constant control variate carries no information, so beta = 0.
            if self.var_[i] != 0:
                self.beta_[i] = -self.cov_[i] / self.var_[i]
            else:
                self.beta_[i] = 0.0

            # Controlled estimate of E[Y | X = x_i].
            self.locreg.fit_partial(
                self.y_train_ + self.beta_[i] * (self.Z_ - self.mz_[i]), h)
            self.y_mesh_[i] = self.locreg.predict_partial().squeeze()

    def predict(self, X):
        r"""
        Predicts the conditional expectation $E[Y \mid X=x]$ for all x in $X$.

        Every call recomputes the mesh values and refits the GP on them; the
        GP's predictive variance is stored in self.gp_var_.

        :param X: a numpy array of shape (num_predictions, num_dimensions)
        :return: a numpy array of shape (num_predictions,)
        """
        self._calculate_locregs()
        self.gpr_ = GPRegression(self.x_mesh_,
                                 np.atleast_2d(self.y_mesh_).T,
                                 self.gpr_kernel)
        # NOTE: optimize_restarts(num_restarts=10) was tried here previously.
        self.gpr_.optimize(messages=False)

        y_pred, gp_var = self.gpr_.predict(X)
        self.gp_var_ = gp_var.squeeze()
        return y_pred.squeeze()
class KernelKernelGPModel:
    """GP regression model over kernel indices, using a "kernel kernel" covariance."""

    def __init__(self,
                 kernel_kernel: Optional[Covariance] = None,
                 noise_var: Optional[float] = None,
                 exact_f_eval: bool = False,
                 optimizer: Optional[str] = 'lbfgsb',
                 max_iters: int = 1000,
                 optimize_restarts: int = 5,
                 verbose: bool = True,
                 kernel_kernel_hyperpriors: Optional[HyperpriorMap] = None):
        """
        :param kernel_kernel: covariance whose ``raw_kernel`` becomes the
            model's kernel; required by the time the model is created
        :param noise_var: initial noise variance; when None, 1% of the
            variance of the targets is used
        :param exact_f_eval: when true, the noise variance is fixed at 1e-6
        :param optimizer: optimizer name forwarded to the model
        :param max_iters: maximum optimizer iterations; 0 or less disables
            training
        :param optimize_restarts: number of optimisation restarts (a value of
            1 runs a single plain optimisation)
        :param verbose: forwarded to ``optimize_restarts``
        :param kernel_kernel_hyperpriors: optional mapping; its ``'GP'`` entry
            is applied to the likelihood and its ``'SE'`` entry to the kernel
        """
        self.noise_var = noise_var
        self.exact_f_eval = exact_f_eval
        self.optimize_restarts = optimize_restarts
        self.optimizer = optimizer
        self.max_iters = max_iters
        self.verbose = verbose
        self.covariance = kernel_kernel
        self.kernel_hyperpriors = kernel_kernel_hyperpriors
        self.model = None

    def train(self):
        """Train (optimize) the model."""
        if self.max_iters > 0:
            # Update the model maximizing the marginal likelihood.
            if self.optimize_restarts == 1:
                self.model.optimize(optimizer=self.optimizer,
                                    max_iters=self.max_iters,
                                    messages=False,
                                    ipython_notebook=False)
            else:
                self.model.optimize_restarts(
                    num_restarts=self.optimize_restarts,
                    optimizer=self.optimizer,
                    max_iters=self.max_iters,
                    ipython_notebook=False,
                    verbose=self.verbose,
                    robust=True,
                    messages=False)

    def _create_model(self, x: np.ndarray, y: np.ndarray):
        """Create model given input data X and output data Y.

        :param x: 2d array of indices of distance builder
        :param y: model fitness scores
        :return: None; the model is stored in ``self.model``
        :raises AssertionError: if ``x`` is not a non-negative integer array,
            or if no kernel kernel was supplied
        """
        # Make sure input data consists only of positive integers.
        assert np.issubdtype(x.dtype, np.integer) and x.min() >= 0

        # Define kernel
        self.input_dim = x.shape[1]
        # TODO: figure out default kernel kernel initialization
        if self.covariance is None:
            # Raised explicitly (rather than via ``assert``) so that running
            # with -O cannot fall through to an unbound ``kern``.
            raise AssertionError(
                'A kernel kernel (covariance) is required to create the model.')
        kern = self.covariance.raw_kernel
        self.covariance = None

        # Define model
        noise_var = y.var() * 0.01 if self.noise_var is None else self.noise_var
        # Only normalize if more than 1 observation: count rows, not elements,
        # so a single multi-column observation is not normalized.
        normalize = x.shape[0] > 1
        self.model = GPRegression(x, y, kern, noise_var=noise_var,
                                  normalizer=normalize)

        # Set hyperpriors
        if self.kernel_hyperpriors is not None:
            if 'GP' in self.kernel_hyperpriors:
                # Set likelihood hyperpriors.
                likelihood_hyperprior = self.kernel_hyperpriors['GP']
                set_priors(self.model.likelihood, likelihood_hyperprior,
                           in_place=True)
            if 'SE' in self.kernel_hyperpriors:
                # Set kernel hyperpriors.
                se_hyperprior = self.kernel_hyperpriors['SE']
                set_priors(self.model.kern, se_hyperprior, in_place=True)

        # Restrict variance if exact evaluations of the objective.
        if self.exact_f_eval:
            self.model.Gaussian_noise.constrain_fixed(1e-6, warning=False)
        else:
            # --- We make sure we do not get ridiculously small residual noise variance
            if self.model.priors.size > 0:
                # FIXME: shouldn't need this case, but GPy doesn't have log Jacobian implemented for Logistic
                self.model.Gaussian_noise.constrain_positive(warning=False)
            else:
                self.model.Gaussian_noise.constrain_bounded(1e-9, 1e6,
                                                            warning=False)

    def update(self, x_all, y_all, x_new, y_new):
        """Update model with new observations.

        The model is created from ``x_all``/``y_all`` on first use and
        re-pointed at them afterwards, then trained. ``x_new`` and ``y_new``
        are accepted but not used here.
        """
        if self.model is None:
            self._create_model(x_all, y_all)
        else:
            self.model.set_XY(x_all, y_all)

        self.train()

    def _predict(self, x: np.ndarray, full_cov: bool, include_likelihood: bool):
        """Return the model's mean and variance at ``x``.

        A 1-D ``x`` is treated as a single row. Variances are clipped from
        below at 1e-10.
        """
        if x.ndim == 1:
            x = x[None, :]
        m, v = self.model.predict(x, full_cov=full_cov,
                                  include_likelihood=include_likelihood)
        v = np.clip(v, 1e-10, np.inf)
        return m, v

    def predict(self, x: np.ndarray, with_noise: bool = True):
        """Return the predictive mean and standard deviation at ``x``."""
        m, v = self._predict(x, False, with_noise)
        # We can take the square root because v is just a diagonal matrix of variances
        return m, np.sqrt(v)

    def get_f_max(self):
        """
        Returns the maximal value of the posterior mean over the model's
        training inputs.
        """
        return self.model.predict(self.model.X)[0].max()

    def plot(self, **plot_kwargs):
        """Plot the model over the index range of its kernel's models and show it."""
        import matplotlib.pyplot as plt
        self.model.plot(plot_limits=(0, self.model.kern.n_models - 1),
                        resolution=self.model.kern.n_models,
                        **plot_kwargs)
        plt.show()
class ExperimentalCondition: """ The `ExperimentalCondition` class stores treatment response data for an experimental condition within a `CancerModel`. It stores all replicates for all variables of the experimental condition for a given cancer model system. For example, in CancerModel Derived Xenograph (PDX) experiments it would store the tumour size measurements at each exposure time for all mouse models derived from a single patient. In cancer cell lines (CCLs) it would store all viability measurements for each dose level for all cultures derived from a single cancer cell line and treated with a specific compound. Thus the `ExperimentalCondition` class can be though of a storing data response data for a cancer model in two dimensions: replicates (e.g., a specific mouse or culture) variable condition levels (e.g., a specific time or dose). Common experimental conditions: * Control, i.e. no treatment * Exposure to a specific drug or compound * Treatment with a specific type of ionizing radiation It can have multiple replicates (ie. data for multiple growth curves) """ def __init__(self, name, source_id=None, variable=None, response=None, replicates=None, variable_treatment_start=None, is_control=False): """ Initialize a particular treatment condition within a cancer model. For example, exposure to a given compound in set of PDX models derived from a single patient. :param name: [string] Name of the experimental/treatment condition (e.g., Control, Erlotinib, Paclitaxel, etc.) :param source_id: [string] A unique identifier for the cancer model source. For PDX models this would be the name of id of the patient from which the models were derived. For CCLs this would be the strain from which all cell cultures were derived. :param variable: [ndarray] The independent variable of the experimental condition. For example, the treatment exposure time for each tumour size measurement or the dose variable for each cell viability measurement. 
:param response: [ndarray] The response metric for the experimental condition. E.g., the tumour size in a PDX model after variable days of treatment exposure or the cell viability measurements in a CCL at a specific compound dose. :param replicates: [ndarray] The indexes of replicate values in the response attribute. :param is_control: [bool] Whether or not the treatment condition is a control. :return [None] Creates the ExperimentalCondition object. """ self.name = name self.variable = np.asarray([[var] for var in variable]) self.response = np.asarray(response.T).astype(float) self.response_norm = None self.variable_end = self.variable[-1][0] # TODO:: Is there any situation where np.array indexing doesn't start at 0? self.variable_start = self.variable[0][0] self.variable_treatment_start = variable_treatment_start if variable_treatment_start is not None else \ self.variable_start self.variable_start_index = np.where( self.variable.ravel() == self.variable_start)[0][0] self.variable_end_index = np.where( self.variable.ravel() == self.variable_end)[0][0] # Assume treatment start is the same as the start of the independent variable, unless the user assigns self.variable_treatment_start_index = self.variable_start_index self.variable_treatment_end_index = self.variable_end_index self.source_id = source_id self.replicates = replicates if isinstance(replicates, list) else list(replicates) self.is_control = is_control self.kl_p_cvsc = None # GPs self.gp = None self.gp_kernel = None # all below are between the <experimental_condition> and the control self.empirical_kl = None # KL divergence stats self.kl_divergence = None self.kl_p_value = None # naive stats # {701: 'mCR', 711: 'mPR', ...} self.best_avg_response = np.array([], dtype=np.float64) self.mrecist = {} self.mrecist_counts = None self.linear_models = [] # {701: response angle, ...} self.response_angle = {} self.response_angle_rel = {} self.response_angle_control = {} self.response_angle_rel_control = {} # 
response angles based on average of curves self.average_angle = None self.average_angle_rel = None self.average_angle_control = None self.average_angle_rel_control = None # {701: AUC, ...} self.auc = {} self.auc_norm = {} self.auc_gp = None self.auc_gp_control = None self.auc_control = {} self.auc_control_norm = {} self.inverted = False # credible intervals stats self.credible_intervals = [] self.percent_credible_intervals = None self.responder_pvalue_AUC = None self.responder_pvalue_angle = None self.rates_list = [] self.rates_list_control = [] # Full Data is all of the data of the treatments and control self.full_data = np.array([]) # gp_h0 and gp_h1 depend on the full_data self.gp_h0 = None self.gp_h0_kernel = None self.gp_h1 = None self.gp_h1_kernel = None self.delta_log_likelihood_h0_h1 = None self.tgi = None @property def responder_AUC(self, p_value=0.05): """ Decide if the cancer model is a responder based on AUC for a specified p-value cut-off. @param p_value [`float`] The p-value cutoff. Default is 0.05. @return [`bool`] True of False, where True means the cancer model is a responder to the treatment. """ if self.responder_pvalue_AUC is None: self.calculate_responder_pvalue_AUC() return self.responder_pvalue_AUC < p_value @property def responder_angle(self, p_value=0.05): """ Decide if the cancer model is a responder based on response angle for a specified p-value cut-off. @param p_value [`float`] The p-value cutoff. Default is 0.05. @return [`bool`] True of False, where True means the cancer model is a responder to the treatment. 
""" if self.responder_pvalue_angle is None: self.calculate_responder_pvalue_angle() return self.responder_pvalue_angle < p_value # ---- Single Bracket Subsetting def __getitem__(self, item): """ Implementation of slicing and single bracket subsetting syntax for this object :item [int or slice object] :return [] """ # Deal with slices if isinstance(item, slice): if item.stop > max(self.replicates) or item.start > max( self.replicates): raise IndexError( f"Slice indexes out of bounds. Acceptable slice range is from " f"{min(self.replicates)} to {max(self.replicates) + 1}.") array = np.hstack([self.variable, self.response[item, :].T]) return pd.DataFrame.from_records( array, columns=[ 'variable', *[ 'replicate_' + str(idx) for idx in range( item.start, item.stop, item.step if item.step is not None else 1) ] ]) # Deal with numeric indexing if not isinstance(item, list): item = [item] if not all([isinstance(idx, int) for idx in item]): raise IndexError( "Index must be an int, list of ints or a slice object!") else: if max(item) > max(self.replicates) or min(item) < min( self.replicates): raise IndexError( f"One or more of {item} is an out of bounds index. Acceptable index range is from " f"{min(self.replicates)} to {max(self.replicates)}.") array = np.hstack([self.variable, self.response[item, :].T]) return pd.DataFrame.from_records( array, columns=[ 'variable', *['replicate_' + str(idx) for idx in item] ]) def to_dict(self, json=False): """ Convert a ExperimentalCondition object into a dictionary with attributes as keys for their associated values. If `json` is True, all values will be coerced to JSONizable Python base types. 
""" # Helper to convert any NumPy types into base types def _if_numpy_to_base(object): if isinstance(object, np.ndarray): return object.tolist() elif isinstance(object, np.generic): return object.item() else: return object if json: return dict( zip(list(self.__dict__.keys()), [ _if_numpy_to_base(item) for item in self.__dict__.values() ])) else: return self.__dict__ ## TODO:: Can we implement this in the constructor? def find_variable_start_index(self): """ Returns the index in the array of the location of the treatment start value, + or - 1. For a PDX model, this corresponds to the index of the day treatment was started. :return [int] The index. """ start = None start_found = False for i in range(len(self.variable.ravel())): if self.variable[ i] - 1 <= self.variable_treatment_start <= self.variable[ i] + 1 and start_found is False: start = i start_found = True return start def normalize_data(self): """ Normalizes all growths using normalize_first_day_and_log_transform helper function. :return [None] modifies self.response_norm """ logger.info("Normalizing data for " + self.name) self.response_norm = self.__normalize_treatment_start_variable_and_log_transform( self.response, self.find_variable_start_index()) def __normalize_treatment_start_variable_and_log_transform( self, response, treatment_start_index): """ Normalize by dividing every response element-wise by the first day's median and then taking the log. 
:param response [array] the array of values to be normalised: :return [array] the normalised array: """ return np.log( np.asarray( (response.T + 0.01) / response.T[int(treatment_start_index)], dtype=float).T) + 1 def create_full_data(self, control): """ Creates a 2d numpy array with columns time, treatment and tumour size :param control [Boolean] whether the experimental_condition is from the control group: :return [None] Creates the full_data array """ # control for j, entry in enumerate(control.response_norm.T): for y in entry: if self.full_data.size == 0: self.full_data = np.array([control.variable[j][0], 0, y]) else: self.full_data = np.vstack( (self.full_data, [control.variable[j][0], 0, y])) # case for j, entry in enumerate(self.response_norm.T): for y in entry: self.full_data = np.vstack( (self.full_data, [self.variable[j][0], 1, y])) def calculate_tgi(self, control): """ Calculates the Tumour Growth Index of a ExperimentalCondition object :param control [Boolean] whether the experimental_condition is from the control group :return [None] Writes the calculated value into self.tgi """ def TGI(yt, yc, i, j): # calculates TGI between yt (Treatment) and yc (Control) during epoch i, to j return 1 - (yt[j] - yt[i]) / (yc[j] - yc[i]) start = max(self.find_variable_start_index(), control.variable_treatment_start_index) end = min(self.variable_treatment_end_index, control.variable_treatment_end_index) + 1 self.tgi = TGI( self.response_norm.mean(axis=0)[start:end], control.response_norm.mean(axis=0)[start:end], 0, end - start - 1) def fit_gaussian_processes(self, control=None, num_restarts=7): """ This is the new version, which fits only on the `relevant' interval Fits a GP for both the control and case growth curves, H1 with time and treatment, and H0 with only time. 
:param control If None, then just fits one GP - else, fits 3 different GPs (one for case, two for gp_h0 and gp_h1): :param num_restarts The number of restarts in the optimisation: :return [None] creates the GP objects: """ logger.info("Fitting Gaussian processes for " + self.name) # control for number of measurements per replicate if time not same length # self.response_norm.shape[0] is num replicates, [1] is num measurements ## TODO:: Can we remove this line? obs_per_replicate = self.response_norm.shape[1] print("Now attempting to fit:") print("self.name:") print(self.name) print("Self.source_id:") print(self.source_id) self.gp_kernel = RBF(input_dim=1, variance=1., lengthscale=10.) response_norm_trunc = self.response_norm[:, self. variable_treatment_start_index: self. variable_treatment_end_index] # # Determine index of first mouse death to remove all NaNs before fitting the model # first_death_idx = min(np.sum(~np.isnan(response_norm_trunc), axis=1)) # # # Subset the independent variable and response data # response_norm_trunc = response_norm_trunc[:, 0:first_death_idx] # variable_trunc = self.variable[0:first_death_idx, :] # Reshape the data to pass into GPRegression (flatten into a single column) variable = np.tile( self.variable[self.variable_treatment_start_index:self. variable_treatment_end_index], (len(self.replicates), 1)) response = np.resize( response_norm_trunc, (response_norm_trunc.shape[0] * response_norm_trunc.shape[1], 1)) self.gp = GPRegression(variable, response, self.gp_kernel) self.gp.optimize_restarts(num_restarts=num_restarts, messages=False) if control is not None: # Subset full data for control calculations # self.full_data = self.full_data[np.isin(self.full_data[:, 0], variable_trunc), :] # kernels self.gp_h0_kernel = RBF(input_dim=1, variance=1., lengthscale=10.) 
self.gp_h1_kernel = RBF(input_dim=2, variance=1., ARD=True) # GPs self.gp_h0 = GPRegression(self.full_data[:, 0:1], self.full_data[:, 2:3], self.gp_h0_kernel) self.gp_h1 = GPRegression(self.full_data[:, 0:2], self.full_data[:, 2:3], self.gp_h1_kernel) # optimize GPs self.gp_h0.optimize_restarts(num_restarts=num_restarts, messages=False, robust=True) # silent exceptions self.gp_h1.optimize_restarts(num_restarts=num_restarts, messages=False, robust=True) self.delta_log_likelihood_h0_h1 = self.gp_h1.log_likelihood( ) - self.gp_h0.log_likelihood() def fit_linear_models(self): """ Fits a separate OLS model, "Response ~ Variable + 0", to each replicate in the object. :return [list] List of OLS model objects, with each index corresponding to the replicate that model was fit for. """ model_dfs = [ pd.DataFrame({ "Response": resp, "Variable": self.variable.flatten() }) for resp in self.response ] self.linear_models = [ smf.ols(formula="Response ~ Variable + 0", data=model_df).fit() for model_df in model_dfs ] def calculate_lm_slopes(self): """ Calculate the slope of each replicate linear model in degrees. The slope is defined as the arctan of the coefficient for the independent variable in the linear model. Results are converted to degrees. :return [ndarray] Slope of the linear model for each replicate in degrees. """ params = np.array( [model.params.values.item() for model in self.linear_models]) return np.arctan(params) * (180 / np.pi) def calculate_kl_divergence(self, control): """ Calculates the KL divergence between the GPs fit for both the batched controls and batched cases. :param control: The corresponding control ExperimentalCondition object :return: The KL divergence """ logger.info("Calculating the KL Divergence for " + self.name) def kl_integrand(variable): """ Calculates the KL integrand :param variable [int?] The independent variable for the Gaussian Process Model (either time or dose). 
:return [float] The integrand """ mean_control, var_control = control.gp.predict( np.asarray([[variable]])) mean_case, var_case = self.gp.predict(np.asarray([[variable]])) return ((var_control + (mean_control - mean_case)**2) / (2 * var_case)) + ((var_case + (mean_case - mean_control)**2) / (2 * var_control)) - 1 max_x_index = min(self.variable_treatment_end_index, control.variable_treatment_end_index) if control.response.shape[1] > self.response.shape[1]: self.kl_divergence = abs( 1 / (self.variable[max_x_index] - self.variable_treatment_start) * quad(kl_integrand, self.variable_treatment_start, self.variable[max_x_index], limit=100)[0])[0] else: self.kl_divergence = abs(1 / (control.variable[max_x_index] - self.variable_treatment_start) * quad(kl_integrand, self.variable_treatment_start, control.variable[max_x_index], limit=100)[0])[0] logger.info(self.kl_divergence) def calculate_responder_pvalue_AUC(self): """ Conduct a Mann-Whitney rank test between the AUC values for the treatment vs the AUC of the control and return the p-value. """ self.responder_pvalue_AUC = \ stats.mannwhitneyu(list(self.auc.values()), list(self.auc_control.values()), alternative="less").pvalue def calculate_responder_pvalue_angle(self): """ Conduct a Mann-Whitney rank test between the response angle values for the treatment vs the response angle of the control and return the p-value. """ self.responder_pvalue_angle = \ stats.mannwhitneyu(list(self.response_angle.values()), list(self.response_angle_control.values()), alternative="less").pvalue @staticmethod def __fit_single_gaussian_process(variable, response_norm, num_restarts=7): """ GP fitting. Returns the GP and kernel. :param variable: time :param response_norm: log-normalized target :return [tuple] a tuple: - the gp object - the kernel """ obs_per_replicate = response_norm.shape[1] kernel = RBF(input_dim=1, variance=1., lengthscale=10.) 
variable = np.tile(variable, (response_norm.shape[0], 1)) response = np.resize( response_norm, (response_norm.shape[0] * response_norm.shape[1], 1)) gp = GPRegression(variable, response, kernel) gp.optimize_restarts(num_restarts=num_restarts, messages=False) return gp, kernel @staticmethod def __relativize(y, start): """ Normalises a numpy array to the start day :param response [ndarray] the array to be normalised: :param start [int] the start day: :return [ndarray] the normalised array: """ return y / y[start] - 1 @staticmethod def __centre(y, start): """ Centres a numpy array to the start day :param response [ndarray] the array to be normalised: :param start [int] the start day: :return [ndarray] the normalised array: """ return y - y[start] @staticmethod def __compute_response_angle(variable, response, start): """ Calculates the response angle for observations response, given time points variable and start point start :param variable [ndarray] the time points :param response [ndarray] the observations :param start [numpy array] the start point for the angle computation :return [float] the angle: """ min_length = min(len(variable), len(response)) model = sm.OLS(response[start:min_length], variable[start:min_length], missing='drop') # Drop NaNs results = model.fit() return np.arctan(results.params[0]) def calculate_response_angles(self, control): """ Builds the response angle dict. :param control [ExperimentalCondition] the corresponding control object :return [None] writes to the angle parameters """ start = self.find_variable_start_index() for i in range(len(self.replicates)): if start is None: raise ValueError( "The `self.variable_start_index` parameter is missing, please initialize this value." 
) else: self.response_angle[ self.replicates[i]] = self.__compute_response_angle( self.variable.ravel(), self.__centre(self.response[i], start), start) self.response_angle_rel[ self.replicates[i]] = self.__compute_response_angle( self.variable.ravel(), self.__relativize(self.response[i], start), start) self.average_angle = self.__compute_response_angle( self.variable.ravel(), self.__centre(np.nanmean(self.response, axis=0), start), start) self.average_angle_rel = self.__compute_response_angle( self.variable.ravel(), self.__relativize(np.nanmean(self.response, axis=0), start), start) self.average_angle_control = self.__compute_response_angle( control.variable.ravel(), self.__centre(np.nanmean(control.response, axis=0), start), start) self.average_angle_rel_control = self.__compute_response_angle( control.variable.ravel(), self.__relativize(np.nanmean(control.response, axis=0), start), start) @staticmethod def __calculate_AUC(variable, response): """ Calculates the area under the curve of a set of observations :param variable [ndarray] the time points :param response [ndarray] the observations :return [float] The area under the curve """ min_length = min(len(variable), len(response)) AUC = sklearn.metrics.auc(x=variable[0:min_length + 1], y=response[0:min_length + 1]) return AUC def calculate_gp_auc(self): """ Builds the AUC (Area under the curve) with respect to the GP fit. :return """ self.auc_gp = self.__calculate_AUC(self.variable, self.gp.predict(self.variable)[0]) def calculate_auc(self, control): """ Builds the AUC (Area under the curve) dict for response. 
:param control: the corresponding control object: :return [None]: """ start = max(self.find_variable_start_index(), control.find_variable_start_index()) end = min(self.variable_treatment_end_index, control.variable_treatment_end_index) for i in range(len(self.replicates)): self.auc[self.replicates[i]] = self.__calculate_AUC( self.variable.ravel()[start:end], self.response[i, start:end]) def calculate_auc_norm(self, control): """ Builds the AUC (Area under the curve) dict. for response_norm :param control: the corresponding control object: :return [None]: """ start = max(self.find_variable_start_index(), control.find_variable_start_index()) end = min(self.variable_treatment_end_index, control.variable_treatment_end_index) for i in range(len(self.replicates)): self.auc_norm[self.replicates[i]] = self.__calculate_AUC( self.variable.ravel()[start:end], self.response_norm[i, start:end]) def calculate_mrecist(self): """ Builds the mRECIST dict. - **mCR**: BestResponse < -95% AND BestAverageResponse < -40% - **mPR**: BestResponse < -50% AND BestAverageResponse < -20% - **mSD**: BestResponse < 35% AND BestAverageResponse < 30% - **mPD**: everything else :return [None] """ start = self.find_variable_start_index() end = self.variable_treatment_end_index ## FIXME:: Why does this go through almost the same loop twice? Is there a missing if condition? 
for i in range(len(self.replicates) - 1): # days_volume = zip(self.variable.ravel(), self.response[i]) print(i) if start is None: raise ValueError( "The `start` attribute for this `ExperimentalCondition` object is set to None, " "please reset.") else: initial_volume = self.response[i][start] # array of all responses for t >= 3 responses = [] average_responses = [] for day, volume in zip(self.variable.ravel(), self.response[i]): if (day - self.variable_treatment_start >= 3) and (day <= self.variable[end]): responses.append( ((volume - initial_volume) / initial_volume) * 100) average_responses.append(np.average(responses)) if min(responses) < -95 and min(average_responses) < -40: self.mrecist[self.replicates[i]] = 'mCR' elif min(responses) < -50 and min(average_responses) < -20: self.mrecist[self.replicates[i]] = 'mPR' elif min(responses) < 35 and min(average_responses) < 30: self.mrecist[self.replicates[i]] = 'mSD' else: self.mrecist[self.replicates[i]] = 'mPD' if self.best_avg_response is not None: self.best_avg_response = np.array([], dtype=np.float64) for i in range(len(self.replicates)): days_volume = zip(self.variable.ravel(), self.response[i]) start = self.find_variable_start_index() if start is None: raise ValueError( "The `start` attribute for this `ExperimentalCondition` object is set to None, " "please reset.") else: initial_volume = self.response[i][start] # array of all responses for t >= 10 responses = [] average_responses = [] day_diff = 0 for day, volume in days_volume: day_diff = day - self.variable_treatment_start if day >= self.variable_treatment_start and day_diff >= 3: responses.append( ((volume - initial_volume) / initial_volume) * 100) average_responses.append(np.average(responses)) self.best_avg_response = np.append(self.best_avg_response, min(average_responses)) if min(responses) < -95 and min(average_responses) < -40: self.mrecist[self.replicates[i]] = 'mCR' elif min(responses) < -50 and min(average_responses) < -20: 
self.mrecist[self.replicates[i]] = 'mPR' elif min(responses) < 35 and min(average_responses) < 30: self.mrecist[self.replicates[i]] = 'mSD' else: self.mrecist[self.replicates[i]] = 'mPD' def enumerate_mrecist(self): """ Builds up the mrecist_counts attribute with number of each occurrence of mRECIST experimental_condition. :return: """ # TODO:: Instead of error, we could just call method to calculate mrecist, then give the user a warning? if self.mrecist is None: raise ValueError( "`ExperimentalCondition` object mrecist attribute is none, please calculate mrecist first!" ) self.mrecist_counts = Counter(mCR=0, mPR=0, mSD=0, mPD=0) for replicate in self.replicates: mrecist = self.mrecist[replicate] if mrecist == 'mCR': self.mrecist_counts['mCR'] += 1 elif mrecist == 'mPR': self.mrecist_counts['mPR'] += 1 elif mrecist == 'mSD': self.mrecist_counts['mSD'] += 1 elif mrecist == 'mPD': self.mrecist_counts['mPD'] += 1 def __credible_interval(self, threshold, variable_2, variable_1=0, control=None): """ Credible interval function, for finding where the two GPs diverge. ## FIXME:: Is variable float or int? 
:param threshold [float] The variable of confidence :param variable_2 [int] The value of variable at the end of the range (i.e, time 2 or dose 2) :param variable_1 [int] The value of variable at the start of the range (i.e., time 1 or dose 1) :param control: the corresponding control object: :return: """ if control is not None: mu = 0 sigma = 1 a = np.array([1, -1, -1, 1]) means = np.array([ self.gp.predict(np.asarray([[variable_2]])), self.gp.predict(np.asarray([[variable_1]])), control.gp.predict(np.asarray([[variable_2]])), control.gp.predict(np.asarray([[variable_1]])) ])[:, 0, 0] controlp = [ control.gp.predict(np.asarray([[variable_1]])), control.gp.predict(np.asarray([[variable_2]])) ] variances = np.zeros((4, 4)) variances[0:2, 0:2] = self.gp.predict(np.asarray([[variable_1], [variable_2]]), full_cov=True)[1] variances[2:4, 2:4] = control.gp.predict(np.asarray([[variable_1], [variable_2]]), full_cov=True)[1] mu = np.dot(a, means) sigma = np.dot(np.dot(a, variances), a.T) interval = norm.interval(threshold, mu, sigma) return (interval[0] < 0) and (interval[1] > 0) else: logger.error( "The private function `__credible_interval` requires control.") def calculate_credible_intervals(self, control): """ :param control: control ExperimentalCondition object :return: """ logger.info("Calculating credible intervals for: " + self.name) if control is not None: largest_x_index = max(len(control.variable), len(self.variable)) if len(control.variable) > len(self.variable): for i in self.variable[1:]: # Why starting at second value? self.credible_intervals.append( (self.__credible_interval(0.95, i[0], control=control)[0], i[0])) else: for i in control.variable[1:]: self.credible_intervals.append( (self.__credible_interval(0.95, i[0], control=control)[0], i[0])) else: logger.error( "The function `calculate_credible_intervals` requires control." 
) def calculate_credible_intervals_percentage(self): """ :return [float] The credible intervals; also has the side effect of setting the percent_credible_intervals attribute on the object. """ logger.info("Calculating percentage of credible intervals.") num_true = 0 for i in self.credible_intervals: if i[0] == True: num_true += 1 self.percent_credible_intervals = (num_true / len(self.credible_intervals)) * 100 return self.percent_credible_intervals def __gp_derivative(self, variable, gp): """ Computes the derivative of the Gaussian Process gp (with respect to its 'time' variable) and returns the values of the derivative at time points variable to deal with some weird stuff about :param variable [float] The independent variable, either time for PDX models or dose for CCL models :param gp [GP] The GaussianProcess to be differentiated :return [tuple] A tuple: - The mean - The covariance """ if variable.ndim == 1: variable = variable[:, np.newaxis] mu, ignore = gp.predictive_gradients(variable) ignore, cov = gp.predict(variable, full_cov=True) # FIXME:: How did this not divide by zero previously? mult = [[((1. / gp.kern.lengthscale) * (1 - (1. 
/ gp.kern.lengthscale) * (y - z)**2))[0] for y in variable if y != z] for z in variable] return mu, mult * cov def compute_all_gp_derivatives(self, control): """ :param control [ExperimentalCondition] The control `ExperimentalCondition` for the current `CancerModel` :return: [None] Sets the `rates_list` attribute """ if not isinstance(self.rates_list, list): self.rates_list = list(self.rates_list) if not isinstance(self.rates_list_control, list): self.rates_list_control = list(self.rates_list_control) logger.info("Calculating the GP derivatives for: " + self.name + ' and control') for var in self.variable: self.rates_list.append(self.__gp_derivative(var, self.gp)[0]) for var in control.variable: self.rates_list_control.append( self.__gp_derivative(var, control.gp)[0]) self.rates_list = np.ravel(self.rates_list) self.rates_list_control = np.ravel(self.rates_list_control) logger.info("Done calcluating GP derivatives for: " + self.name + ' and control') def plot_with_control(self, control=None, output_path=None, show_kl_divergence=True, show_legend=True, file_type=None, output_pdf=None): """ Given all of the data and an output path, saves a PDF of the comparison with some statistics as well. :param control: The control ExperimentalCondition object :param output_path: output filepath - if not specified, doesn't save :param show_kl_divergence: flag for displaying calculated kl_divergence :param show_legend: flag for displaying legend :param file_type: can be 'svg' or 'pdf', defaults to 'pdf'. 
:param output_pdf: an output_pdf object :return: """ if control is None: logger.error("You need to plot with a control.") else: logger.info("Plotting with statistics for " + self.name) fig, ax = plt.subplots() plt.title( f"Case (Blue) and Control (Red) Comparison of \n {str(self.source_id)} with {str(self.name)}" ) # set xlim gp_x_limit = max(self.variable) + 5 # Control control.gp.plot_data(ax=ax, color='red') control.gp.plot_mean(ax=ax, color='red', plot_limits=[0, gp_x_limit]) control.gp.plot_confidence(ax=ax, color='red', plot_limits=[0, gp_x_limit]) # Case self.gp.plot_data(ax=ax, color='blue') self.gp.plot_mean(ax=ax, color='blue', plot_limits=[0, gp_x_limit]) self.gp.plot_confidence(ax=ax, color='blue', plot_limits=[0, gp_x_limit]) # Drug Start Line plt.plot( [self.variable_treatment_start, self.variable_treatment_start], [-10, 15], 'k-', lw=1) plt.xlabel('Day') plt.ylabel('Normalized log tumor size') plt.ylim(-10, 15) # Always select the longest date + 5 plt.xlim(0, max(self.variable) + 5) if show_kl_divergence: plt.text(2, -8, 'KL Divergence: ' + str(self.kl_divergence)) if show_legend is True: plt.legend(loc=0) if file_type == 'pdf': output_pdf.savefig(fig) plt.close(fig) elif file_type == 'svg': plt.savefig(output_path, format="svg") def __repr__(self): """ Returns a string representation of the experimental_condition object. :return [string] The representation: """ return ('\n'.join([ f"Name: {self.name}", f"Treatment Start Date: {self.variable_treatment_start}", f"Source Id: {self.source_id}", f"K-L Divergence: {self.kl_divergence}", f"K-L P-Value: {self.kl_p_value}", f"mRecist: {self.mrecist}", f"Percent Credible Interval: {self.percent_credible_intervals}", f"Rates List: {self.rates_list}" ]))
class Gaussfit(object):
    """Handles GPR of input data.

    Wraps a GPy ``GPRegression`` model together with optional input/output
    rescaling, validation helpers, plotting and (un)pickling of the
    hyperparameters.
    """

    def __init__(self):
        """Initialize a gaussfit object."""
        self.kernel = None
        self.model = None
        self.scale = None
        self.translate = None
        self.save_fig = False
        self.save_path = None
        self.kernel_name = None  # Used for saving file names

    # The properties are backed by private attributes; reading or writing
    # `self.save_fig` inside its own accessor recurses forever.
    @property
    def save_fig(self):
        """Whether plotting methods also save their figures to file."""
        return self._save_fig

    @save_fig.setter
    def save_fig(self, save_fig):
        self._save_fig = save_fig

    @property
    def save_path(self):
        """Path prefix that is prepended to every saved file name."""
        return self._save_path

    @save_path.setter
    def save_path(self, save_path):
        self._save_path = save_path

    def set_gp_kernel(self,
                      kernel=DEFAULTS['kernel'],
                      in_dim=DEFAULTS['input_dim'],
                      variance=DEFAULTS['variance'],
                      lengthscale=DEFAULTS['lengthscale'],
                      multi_dim=False):
        """Sets the kernel of this Gaussfit.

        Args:
            kernel - one of 'RBF', 'Exponential', 'Matern32', 'Matern52'.
                     Any other name prints a message and leaves the current
                     kernel object untouched.
            in_dim - number of input dimensions
            variance - initial kernel variance
            lengthscale - initial kernel lengthscale
            multi_dim - passed to the kernel as ARD
        """
        self.kernel_name = kernel  # This is used for saving file names
        if kernel == 'RBF':
            kernel_class = RBF
        elif kernel == 'Exponential':
            kernel_class = Exponential
        elif kernel == 'Matern32':
            kernel_class = Matern32
        elif kernel == 'Matern52':
            kernel_class = Matern52
        else:
            print('Kernel not recognized or not implemented')
            return
        self.kernel = kernel_class(input_dim=in_dim,
                                   variance=variance,
                                   lengthscale=lengthscale,
                                   ARD=multi_dim)

    def populate_gp_model(self,
                          observable,
                          lecs,
                          energy=None,
                          rescale=False,
                          fixvariance=0):
        """Creates a model based on given data and kernel.

        Args:
            observable - numpy array with observable.
                         (1 row for each observable from each lec sample)
            lecs - numpy array with lec parameters fit should be done with
                   regard to (lec 1 column 1 and so on, sample 1 on row 1 and
                   so on)
            energy - energy values, one per sample; appended to lecs as an
                     extra input column when given
            rescale - rescale inputs and observable via self.rescale first
            fixvariance - value the Gaussian noise variance is fixed to
        """
        # Energy is one more input dimension, i.e. an extra column.
        # (`np.r_`/`np.c_` are index helpers: square brackets, not a call.)
        if energy is not None:
            lecs = np.c_[lecs, energy]
        if rescale:
            (lecs, observable) = self.rescale(lecs, observable)
        # The bare `.transpose()` calls that used to sit here discarded their
        # result; samples are already expected on rows.
        self.model = GPRegression(lecs, observable, self.kernel)

        # Pin the noise variance so the optimizer cannot change it.
        self.model.Gaussian_noise.variance.unconstrain()
        self.model.Gaussian_noise.variance = fixvariance
        self.model.Gaussian_noise.variance.fix()

    def optimize(self, num_restarts=1):
        """Optimize the model and print the result."""
        # Something wrong, model doesn't always converge
        self.model.optimize_restarts(num_restarts=num_restarts, messages=True)
        print(self.model)

    def rescale(self, inlecs, inobs):
        """Rescales the input parameters that Gpy handles,
        so that they are in the interval [-1,1]

        The translation (means) and scale (max absolute value after
        centring) are computed on the first call and reused afterwards. Both
        are stored as one vector: one entry per input column followed by a
        single entry for the observable.

        Returns:
            (outlecs, outobs) - the rescaled inputs and observable
        """
        if self.translate is None:
            self.translate = np.append(np.mean(inlecs, axis=0),
                                       np.mean(inobs))

        inlecs = inlecs - self.translate[None, :-1]
        inobs = inobs - self.translate[-1]

        if self.scale is None:
            self.scale = np.append(np.amax(abs(inlecs), axis=0),
                                   np.amax(abs(inobs)))
            # Constant columns would otherwise cause a division by ~0.
            self.scale[self.scale <= 1e-10] = 1
        outlecs = inlecs / self.scale[None, :-1]
        outobs = inobs / self.scale[-1]

        return (outlecs, outobs)

    def calculate_valid(self, Xvalid):
        """Calculates model prediction in validation points.

        When the model was trained on rescaled data, Xvalid is rescaled the
        same way and the prediction is mapped back to the original units.

        Returns:
            (Ymodel, Variance) - predicted mean and variance
        """
        if self.scale is None:
            return self.model.predict(Xvalid)

        Xvalid = (Xvalid - self.translate[None, :-1]) / self.scale[None, :-1]
        (Ymodel, Variance) = self.model.predict(Xvalid)
        obs_scale = self.scale[-1]
        Ymodel = Ymodel * obs_scale + self.translate[-1]
        Variance = Variance * obs_scale * obs_scale
        return (Ymodel, Variance)

    def plot(self):
        """Plot the GP-model.
        Plot limits only for 1D-case.
        """
        print(self.model)
        self.model.plot()
        plt.show()

    def tags_to_title(self, train_tags, val_tags):
        """Create plot title from tags."""
        title = '_'.join(train_tags)
        title += '_' + '_'.join(val_tags)
        title += '_' + str(self.kernel_name)
        return title

    def save_fig_to_file(self, filename):
        """Saves the last specified global figure to file with filename.
        File path prefix specified by self.save_path.
        """
        plt.savefig(self.save_path + filename)

    def generate_and_save_tikz(self, Ymodel, Yvalid, Variance, train_tags,
                               val_tags):
        """Plots predicted vs actual values and saves the figure as a
        standalone tikz (.tex) file under self.save_path."""
        plt.figure()
        style.use('seaborn-bright')
        sigma = np.sqrt(Variance)
        Expected, = plt.plot([max(Yvalid), min(Yvalid)],
                             [max(Yvalid), min(Yvalid)],
                             '-',
                             linewidth=2,
                             zorder=10,
                             ms=19,
                             label="Expected")
        Data, = plt.plot(Yvalid,
                         Ymodel,
                         '.',
                         ms=0.5,
                         zorder=3,
                         label="Data points")
        plt.errorbar(Yvalid,
                     Ymodel,
                     yerr=2 * sigma,
                     fmt='none',
                     alpha=0.5,
                     zorder=1,
                     label="Error bars")

        # Raw strings: in a normal literal the "\b" of "\barn" is a backspace.
        plt.xlabel(r'Simulated value [\si{\milli\barn}]')
        plt.ylabel(r'Emulated value [\si{\milli\barn}]')
        plt.grid(True)

        modelError = str(self.get_model_error(Ymodel, Yvalid))

        # Create a legend for the line.
        plt.legend(handles=[Expected, Data], loc=4)

        title = self.tags_to_title(train_tags, val_tags)
        tex_path = self.save_path + title + '_predicted_actual.tex'

        # The following saves the file to folder as well as adding 3 rows.
        # The "clip mode=individual" was a bit tricky to add so this is the
        # ugly way to solve it.
        tikz_save(tex_path,
                  figureheight='\\textwidth*0.8,\nclip mode=individual',
                  figurewidth='\\textwidth*0.8')

        # Last fix of tikz with script.
        edit = EditText()
        # adding tikz file info
        edit.fix_file(
            tex_path, '% This file was created by matplotlib2tikz v0.6.3.',
            '% ' + self.save_path + '\n% ' + title + '\n% Model Error: ' +
            modelError)
        # adding legend
        edit.fix_file(tex_path, '\\end{axis}',
                      '\\legend{Data,Expected}\n\\end{axis}')
        # adding forget plot
        edit.fix_file(
            tex_path,
            '\\addplot [lightgray!80.0!black, opacity=0.5, mark=-, mark size=3, mark options={solid}, only marks]',
            '\\addplot [lightgray!80.0!black, opacity=0.5, mark=-, mark size=3, mark options={solid}, only marks, forget plot]'
        )
        # Making transformable to PNG
        edit.fix_file(
            tex_path, '% Model Error: ' + modelError,
            '\\documentclass{standalone}\n\\usepackage{tikz}\n\\usepackage{pgfplots}\n\\usepackage{siunitx}\n\n\\begin{document}'
        )
        edit.fix_file(tex_path, '\\end{tikzpicture}',
                      '\\end{tikzpicture}\n\\end{document}')

    def get_model_error(self, Ymodel, Yvalid, alt=False):
        """A measure of how great the model's error is compared to validation
        points. Currently uses the rms of the relative error.

        `alt` is accepted for compatibility but currently unused.
        """
        return np.sqrt(np.mean(np.square((Ymodel - Yvalid) / Yvalid)))

    def plot_predicted_actual(self, Ymodel, Yvalid, Variance, train_tags,
                              val_tags):
        """Plots the predicted values vs the actual values, adds a straight
        line and 2sigma error bars."""
        sigma = np.sqrt(Variance)
        plt.figure(1)
        plt.plot(Yvalid, Ymodel, '.')
        plt.errorbar(Yvalid, Ymodel, yerr=2 * sigma, fmt='none')
        plt.plot([max(Yvalid), min(Yvalid)], [max(Yvalid), min(Yvalid)], '-')

        plt.xlabel('Simulated value [mb]')
        plt.ylabel('Emulated value [mb]')

        # Do we want to save to file?
        if self.save_fig:
            self.save_fig_to_file(
                self.tags_to_title(train_tags, val_tags) +
                "_predicted_actual.png")
        plt.show()

    def get_sigma_intervals(self, Ymodel, Yvalid, Variance):
        """Returns the fraction of errors within 1, 2, and 3 sigma."""
        sigma = np.sqrt(Variance)
        n = np.array([0, 0, 0])
        errors = abs(Yvalid - Ymodel)
        for i, e in enumerate(errors):
            for level in range(3):
                if e <= (level + 1) * sigma[i]:
                    n[level] += 1
        return n / float(np.shape(errors)[0])

    def plot_modelerror(self, Xvalid, Xlearn, Ymodel, Yvalid, train_tags,
                        val_tags):
        """Creates plots showing the validated error (absolute and relative)
        against the distance to the closest training point."""
        alldists = cdist(Xvalid, Xlearn, 'euclidean')
        mindists = np.min(alldists, axis=1)
        title = self.tags_to_title(train_tags, val_tags)

        abs_error = Ymodel - Yvalid
        plt.figure(1)
        plt.plot(mindists, abs_error, '.')
        plt.xlabel('Distance to closest training point')
        plt.ylabel('Vallidated error [mb]')
        plt.axis([
            0, 1.1 * max(mindists), 1.1 * min(abs_error),
            1.1 * max(abs_error)
        ])

        # Do we want to save val error to file?
        if self.save_fig:
            self.save_fig_to_file(title + "_val_error.png")

        rel_error = (Ymodel - Yvalid) / Yvalid
        plt.figure(2)
        plt.plot(mindists, rel_error, '.')
        plt.xlabel('Distance to closest training point')
        plt.ylabel('Vallidated relative error')
        plt.axis([
            0, 1.1 * max(mindists), 1.1 * min(rel_error),
            1.1 * max(rel_error)
        ])

        if self.save_fig:
            self.save_fig_to_file(title + "_val_rel_error.png")
        plt.show()

    def plot_model(self, Xvalid, Ymodel, Yvalid):
        """Plot the model of training data with the model of validation
        data."""
        plt.figure(3)
        plt.plot(Xvalid, Ymodel, 'bo')
        plt.plot(Xvalid, Yvalid, 'rx')
        plt.show()

    def plot_kernel(self, lec_idx):
        """Plots the kernel function of lec index"""
        plot_covariance(self.kernel, visible_dims=lec_idx)
        plt.show()

    def plot_lecs(self, center, intervals, energy=None):
        """Plots a slice of each lec through the center point.
        Set energy to None if energy is not a parameter in your model.

        Each of the 16 subplots varies one lec over
        center[i] +- intervals[i] while the other inputs stay at center.
        """
        if energy is not None:
            center = np.append(center, energy)
        center = np.asarray(center).reshape(1, -1)

        # Pad intervals with zeros up to the number of inputs (the energy gets
        # a zero-width interval). Without energy the old code always appended
        # one zero and then failed to reshape 17 values into 16.
        intervals = np.ravel(intervals)
        missing = center.shape[1] - intervals.size
        if missing > 0:
            intervals = np.append(intervals, np.zeros(missing))
        intervals = intervals.reshape(1, -1)

        for i in range(16):
            plt.subplot(4, 4, i + 1)
            x = np.linspace(center[0][i] - intervals[0][i],
                            center[0][i] + intervals[0][i],
                            num=200)
            lecs = np.tile(center[0], (200, 1))
            lecs[:, i] = x
            obs, _ = self.calculate_valid(lecs)
            plt.plot(x, obs)
        plt.show()

    def save_model_parameters(self, savepath, traintags, kernel, LEC_LENGTH,
                              lengthscale, multidim, rescale):
        """Saves GPy model hyperparameters as a .pickle file.

        Nothing is written (an error is printed instead) when savepath does
        not end in ".pickle" or already exists.
        """
        params = self.model.param_array

        if not savepath.endswith(".pickle"):
            print(
                "*****ERROR***** Model properties must be saved as .pickle file *****ERROR*****"
            )
        elif os.path.isfile(savepath):
            print(
                "*****ERROR***** File already exists. Cannot save to existing file. *****ERROR*****"
            )
        else:
            # pickle needs a binary file handle
            with open(savepath, 'wb') as f:
                pickle.dump([
                    params, kernel, traintags, LEC_LENGTH, lengthscale,
                    multidim, rescale
                ], f)

    def load_model_parameters(self, Ylearn, Xlearn, loadpath):
        """Loads a GPy model with hyperparameters from a .pickle file.

        Accepts both the 6-field layout (no rescale flag; treated as False)
        and the 7-field layout written by save_model_parameters.

        Raises:
            ValueError - if the file holds an unexpected number of fields
        """
        # NOTE: pickle.load can execute arbitrary code; only load files that
        # come from a trusted source.
        with open(loadpath, 'rb') as f:
            filecontents = pickle.load(f)

        if len(filecontents) == 6:
            params, kernel, traintags, LEC_LENGTH, lengthscale, multi_dim = filecontents
            rescale = False
        elif len(filecontents) == 7:
            params, kernel, traintags, LEC_LENGTH, lengthscale, multi_dim, rescale = filecontents
        else:
            raise ValueError('Unexpected number of fields in ' +
                             str(loadpath) + ': ' + str(len(filecontents)))

        print(params)
        print(LEC_LENGTH)
        self.set_gp_kernel(kernel=kernel,
                           in_dim=LEC_LENGTH,
                           lengthscale=lengthscale,
                           multi_dim=multi_dim)

        if rescale:
            (Xlearn, Ylearn) = self.rescale(Xlearn, Ylearn)

        # Build the model without initializing, then inject the stored
        # hyperparameters before switching model updates back on.
        m_load = GPRegression(Xlearn, Ylearn, self.kernel, initialize=False)
        m_load.update_model(False)
        m_load.initialize_parameter()
        m_load[:] = params
        m_load.update_model(True)
        self.model = m_load

    def plot_energy_curve(self, mod_obs, val_obs, mod_var, val_energy):
        """Plots validation ('x') and model ('o') observables against energy.

        `mod_var` is accepted but not used.
        """
        plt.plot(val_energy, val_obs, 'x')
        plt.plot(val_energy, mod_obs, 'o')
        plt.show()
if __name__ == "__main__":
    # Compare GPy's GP regression against the mobo implementation on the
    # same randomly sampled data.
    np.random.seed(1)

    dim = 50
    f = sphere

    def evaluate(points):
        """Evaluate f on every row of points and return a column vector."""
        return np.array([f(row) for row in points]).reshape(-1, 1)

    # training data
    X = np.random.uniform(-5, 5, (10, dim))
    y = evaluate(X)

    # GPy model
    gpy_model = GPRegression(X, y, kernel=RBFg(input_dim=dim, ARD=False))
    gpy_model.optimize()

    # mobo model
    mobo_model = moboGP(RBF(ARD=False), X, y)
    Likelihood(mobo_model).evaluate()
    lbfgsb(mobo_model).opt()

    print("gpy:", gpy_model.log_likelihood())
    print("mobo:", mobo_model.log_likelihood)

    # held-out data
    Xtest = np.random.uniform(-5, 5, (1000, dim))
    ytest = evaluate(Xtest)

    gpy_prediction, gpy_cov = gpy_model.predict(Xtest)
    mobo_prediction, mobo_cov = mobo_model.predict(Xtest)

    # mean absolute prediction error of each model
    for label, prediction in (("gpy:", gpy_prediction),
                              ("mobo:", mobo_prediction)):
        print(label, np.mean(abs(ytest - prediction)))
class Kernel(object):
    """LSTM-embedding kernel GP over the objective, paired with a GPy
    regression model on the constraint data.

    Every sample is embedded with an LSTM; an RBF kernel on the embeddings
    gives the Gram matrix `K` used for objective predictions. A separate
    `GPRegression` (`self.model`) is fit on whatever `clean_x` returns for
    the stored samples and constraints; it provides the feasibility factor
    in `acquisition_cons`.
    """

    def __init__(self,
                 x0,
                 y0,
                 cons=None,
                 alpha=opt.ke_alpha,
                 beta=opt.ke_beta,
                 input_size=opt.ke_input_size,
                 hidden_size=opt.ke_hidden_size,
                 num_layers=opt.ke_num_layers,
                 bidirectional=opt.ke_bidirectional,
                 lr=opt.ke_lr,
                 weight_decay=opt.ke_weight_decay):
        """Build the model from a single initial sample.

        :param x0: first input (tensor that can be viewed as (-1, 1, input_size))
        :param y0: objective value of x0
        :param cons: constraint value of x0
        :param alpha: kernel width; k = exp(-d^2 / (2 * alpha))
        :param beta: noise term added to the Gram diagonal and predictive variance
        :param input_size, hidden_size, num_layers, bidirectional: LSTM settings
        :param lr, weight_decay: Adam settings for training the LSTM
        """
        super(Kernel, self).__init__()
        self.alpha = alpha
        self.beta = beta

        self.lstm = nn.LSTM(input_size=input_size,
                            hidden_size=hidden_size,
                            num_layers=num_layers,
                            bidirectional=bidirectional)
        self.lstm = self.lstm.to(opt.device)
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.bi = 2 if bidirectional else 1

        self.x = [x0]
        self.y = torch.tensor([y0],
                              dtype=torch.float,
                              device=opt.device,
                              requires_grad=False)
        self.cons = [cons]

        inp, out = clean_x(self.x, self.cons)
        self.model = GPRegression(inp, out)
        self.model.Gaussian_noise.constrain_fixed(1e-6, warning=False)
        self.model.optimize()

        self.x_best = x0
        self.y_best = y0
        self.i_best = 0
        self.n = 1

        self.E = self.embedding(x0).view(1, -1)
        self.K = self.kernel(self.E[0], self.E[0]).view(1, 1)
        self._refresh_inverse()

        self.optimizer = optim.Adam(self.lstm.parameters(),
                                    lr=lr,
                                    weight_decay=weight_decay)

    def _refresh_inverse(self):
        """Recompute K_inv = (K + beta * I)^-1 for the current K and n."""
        self.K_inv = torch.inverse(
            self.K + self.beta * torch.eye(self.n, device=opt.device))

    def _append_embedding(self, xn):
        """Embed xn and grow E and K by one row/column; increments n.

        Does not refresh K_inv -- callers do that once they are done adding.
        """
        n = self.n
        en = self.embedding(xn)
        k = self.kernel_batch(en)
        kn = self.kernel(en, en)
        self.E = torch.cat((self.E, en.view(1, -1)), 0)
        self.K = torch.cat((torch.cat((self.K, k.view(n, 1)), 1),
                            torch.cat((k.view(1, n), kn.view(1, 1)), 1)), 0)
        self.n += 1

    def _expected_improvement(self, xn):
        """Expected improvement of xn over the incumbent y_best."""
        mu, sigma = self.predict(xn)
        mu = mu.item()
        sigma = sigma.item()
        y_best = self.y_best
        z = (mu - y_best) / sigma
        return (mu - y_best) * norm.cdf(z) + sigma * norm.pdf(z)

    def embedding(self, xi):
        """Return the unit-norm mean of the LSTM outputs for input xi."""
        inputs = xi.view(-1, 1, self.input_size)
        outputs, (hn, cn) = self.lstm(inputs)
        outputs = torch.mean(outputs.squeeze(1), dim=0)
        outputs = outputs / torch.norm(outputs)
        return outputs

    def kernel(self, ei, ej):
        """RBF kernel between two embeddings."""
        d = ei - ej
        d = torch.sum(d * d)
        k = torch.exp(-d / (2 * self.alpha))
        return k

    def kernel_batch(self, en):
        """Kernel values between en and every stored embedding."""
        n = self.n
        k = torch.zeros(n, device=opt.device)
        for i in range(n):
            k[i] = self.kernel(self.E[i], en)
        return k

    def predict(self, xn):
        """GP posterior at xn.

        :return: (mu, sigma) with sigma the predictive standard deviation
            (including the beta noise term)
        """
        n = self.n
        en = self.embedding(xn)
        k = self.kernel_batch(en)
        kn = self.kernel(en, en)
        t = torch.mm(k.view(1, n), self.K_inv)
        mu = torch.mm(t, self.y.view(n, 1))
        sigma = kn - torch.mm(t, k.view(n, 1))
        sigma = torch.sqrt(sigma + self.beta)
        return mu, sigma

    def acquisition_cons(self, xn):
        """Expected improvement at xn weighted by norm.cdf(0, mu, sigma) of
        the constraint model's prediction."""
        with torch.no_grad():
            xn_ = np.array([xn.cpu().numpy().flatten()])
            mu_cons, sigma_cons = self.model.predict(xn_)
            # GPy returns a variance; convert to a standard deviation.
            sigma_cons = sqrt(sigma_cons)
            PoF = norm.cdf(0, mu_cons, sigma_cons)
            ei = self._expected_improvement(xn)
            return ei * PoF

    def acquisition(self, xn):
        """Unconstrained expected improvement at xn."""
        with torch.no_grad():
            return self._expected_improvement(xn)

    def kernel_batch_ex(self, t):
        """Kernel values between sample t and every other stored sample."""
        n = self.n
        k = torch.zeros(n - 1, device=opt.device)
        for i in range(t):
            k[i] = self.kernel(self.E[i], self.E[t])
        for i in range(t + 1, n):
            k[i - 1] = self.kernel(self.E[t], self.E[i])
        return k

    def predict_ex(self, t):
        """Leave-one-out posterior for sample t, using all other samples."""
        n = self.n
        k = self.kernel_batch_ex(t)
        kt = self.kernel(self.E[t], self.E[t])

        indices = list(range(t)) + list(range(t + 1, n))
        indices = torch.tensor(indices, dtype=torch.long, device=opt.device)
        K = self.K
        K = torch.index_select(K, 0, indices)
        K = torch.index_select(K, 1, indices)
        K_inv = torch.inverse(K +
                              self.beta * torch.eye(n - 1, device=opt.device))
        y = torch.index_select(self.y, 0, indices)

        t = torch.mm(k.view(1, n - 1), K_inv)
        mu = torch.mm(t, y.view(n - 1, 1))
        sigma = kt - torch.mm(t, k.view(n - 1, 1))
        sigma = torch.sqrt(sigma + self.beta)
        return mu, sigma

    def add_sample(self, xn, yn, consn):
        """Add one sample, refit the constraint model and extend E, K, K_inv.

        The incumbent is updated only when consn > 0 and yn beats y_best.
        """
        self.x.append(xn)
        self.y = torch.cat((self.y,
                            torch.tensor([yn],
                                         dtype=torch.float,
                                         device=opt.device,
                                         requires_grad=False)))
        self.cons.append(consn)

        inp, out = clean_x(self.x, self.cons)
        self.model.set_XY(inp, out)
        self.model.optimize()

        if consn > 0 and yn > self.y_best:
            self.x_best = xn
            self.y_best = yn
            self.i_best = self.n

        self._append_embedding(xn)
        self._refresh_inverse()

    def add_batch(self, x, y, cons):
        """Add several samples at once.

        :param x: list of inputs
        :param y: 1-D tensor of objective values, one per input
        :param cons: list of constraint values, one per input
        """
        self.x.extend(x)
        self.y = torch.cat((self.y, y))
        self.cons.extend(cons)

        inp, out = clean_x(self.x, self.cons)
        self.model.set_XY(inp, out)
        self.model.optimize()

        # Check each new sample against its OWN constraint value.
        # `self.cons[i]` would index the oldest stored constraints, because
        # the batch was appended to the end of `self.cons`.
        for xi, yi, consi in zip(x, y, cons):
            if consi > 0 and yi.item() > self.y_best:
                self.x_best = xi
                self.y_best = yi.item()
                self.i_best = self.n
            self._append_embedding(xi)

        self._refresh_inverse()

    def update_EK(self):
        """Recompute all embeddings, the Gram matrix and its inverse (needed
        after the LSTM weights change)."""
        n = self.n
        E_ = torch.zeros((n, self.E.size(1)), device=opt.device)
        for i in range(n):
            E_[i] = self.embedding(self.x[i])
        self.E = E_

        K_ = torch.zeros((n, n), device=opt.device)
        for i in range(n):
            for j in range(i, n):
                k = self.kernel(self.E[i], self.E[j])
                K_[i, j] = k
                K_[j, i] = k
        self.K = K_
        self._refresh_inverse()

    def loss(self):
        """Mean negative leave-one-out Gaussian log-likelihood."""
        n = self.n
        l = torch.zeros(n, device=opt.device)
        for i in range(n):
            mu, sigma = self.predict_ex(i)
            d = self.y[i] - mu
            # 0.918939 ~= 0.5 * log(2 * pi)
            l[i] = -(0.918939 + torch.log(sigma) + d * d / (2 * sigma * sigma))
        l = -torch.mean(l)
        return l

    def opt_step(self):
        """Take one Adam step on the LSTM and refresh E, K and K_inv.

        :return: the mean leave-one-out log-likelihood before the step, or
            0.0 when fewer than two samples are stored
        """
        if self.n < 2:
            return 0.0

        self.optimizer.zero_grad()
        l = self.loss()
        ll = -l.item()
        l.backward()
        self.optimizer.step()
        self.update_EK()
        return ll

    def save(self, save_path):
        """Serialize the whole object to save_path with torch.save, creating
        the parent directory if needed."""
        path = os.path.dirname(save_path)
        # dirname is '' for a bare file name; makedirs('') would raise.
        if path and not os.path.exists(path):
            os.makedirs(path)
        torch.save(self, save_path)
# choose a kernel #kernel = Matern32(input_dim=1, variance=2.0) #kernel = GridRBF(input_dim=1) #kernel = RBF(input_dim=1, variance=2.0) # gp regression and optimize the paramters using logliklihood gp_regression = GPRegression(x_train, y_train) #gp_regression.kern.lengthscale = 500 #gp_regression.likelihood.variance = 0.001 print("loglikelihood: ", gp_regression.log_likelihood()) gp_regression.optimize() print("loglikelihood: ", gp_regression.log_likelihood()) # predict new unseen samples x_test = np.linspace(1870, 2030, 200)[:, np.newaxis] yt_mean, yt_var = gp_regression.predict(x_test) yt_sd = np.sqrt(yt_var) # draw some samples from the posterior samples = gp_regression.posterior_samples(x_test, size=1).squeeze(1) # plot plot_gp(yt_mean, yt_var, x_test, X_train=x_train, Y_train=y_train, samples=samples)