def permute(self, varb=None):
    """
    Refit the GP with one input variable shuffled and return the log-likelihood.

    The column(s) of the model input whose key equals ``varb`` are replaced by
    values drawn without replacement from that column, a new GP is built with
    the same steps used below for fitting, optimised, and its log-likelihood
    is returned.

    :param varb: key of the input variable to shuffle; must appear in
        ``self.x_keys``.
    :return: log-likelihood of the GP fitted to the shuffled input.
    :raises ValueError: if ``varb`` does not match any entry of ``self.x_keys``.
    """
    # get model input
    x = self.x.copy()
    y = self.y.copy()

    # which column holds the target variable?
    # np.asarray makes the comparison element-wise even when x_keys is a list.
    col = np.where(np.asarray(self.x_keys) == varb)[0]
    if col.size == 0:
        # Without this guard the failure surfaces inside np.random.choice
        # with a message unrelated to the real cause.
        raise ValueError(
            "Cannot permute {!r}: it is not one of the model inputs {}.".format(
                varb, list(self.x_keys)))

    # shuffle target variable and write it back
    shuffled = np.random.choice(x[:, col].ravel(), size=x.shape[0], replace=False)
    x[:, col] = shuffled[:, np.newaxis]

    # same steps as fit; kept verbatim on purpose so that the permuted model
    # is built exactly the way the fitted one is
    x_dim = x.shape[1]
    y_dim = y.shape[1]

    kern = buildKernel(x_dim, ARD=self.ARD)
    mcopy = GPRegression(x, y, kern)

    if self.heteroscedastic:
        kern = addFixedKernel(kern, y_dim, self.error)
        mcopy = GPRegression(x, y, kern)

    mcopy.optimize()
    return mcopy.log_likelihood()
# Demo: GP regression on the TRP63 gene-expression series (gene 937) of the
# Della Gatta data set.
data = pods.datasets.della_gatta_TRP63_gene_expression(
    data_set='della_gatta',
    gene_number=937,
)
x, y = data['X'], data['Y']

# Standardise the targets: subtract the mean, divide by the standard deviation.
offset = np.mean(y)
scale = np.std(y)
yhat = (y - offset) / scale

# Alternative kernels that were tried:
#kernel = RBF(input_dim=1, variance=100)
#kernel = Matern32(input_dim=1, variance=2.0, lengthscale=200)
model = GPRegression(x, yhat)
model.kern.lengthscale = 20  # this will widen with 100, 200
#gp_regression.likelihood.variance = 0.001

# Log-likelihood before and after hyperparameter optimisation.
print(model.log_likelihood())
model.optimize()
print(model.log_likelihood())

# Predict on a regular grid of 100 points over [-20, 260] and plot the result
# together with the training data.
xt = np.linspace(-20, 260, 100).reshape(-1, 1)
yt_mean, yt_var = model.predict(xt)
plot_gp(
    yt_mean,
    yt_var,
    xt,
    X_train=model.X.flatten(),
    Y_train=model.Y.flatten(),
)
class GPModel():
    """
    Thin wrapper pairing a kernel expression with a GPy ``GPRegression`` model.

    The wrapper stores the data and kernel expression, fits the regression
    with random restarts, and exposes prediction, plotting and the quantities
    needed to score the fit (log-likelihood, number of points, number of
    parameters).
    """

    def __init__(self, X, Y, kernel_expression=None):
        """
        :param X: model inputs, passed unchanged to ``GPRegression``.
        :param Y: model targets, passed unchanged to ``GPRegression``.
        :param kernel_expression: kernel expression providing ``to_kernel()``.
            When omitted, a fresh ``SumKE(['WN'])._initialise()`` is built for
            this instance.
        """
        # Build the default per instance: a default evaluated in the signature
        # would be created once at definition time and shared by every model.
        if kernel_expression is None:
            kernel_expression = SumKE(['WN'])._initialise()

        self.X = X
        self.Y = Y
        self.kernel_expression = kernel_expression
        self.restarts = None
        self.model = None
        self.cached_utility_function = None
        self.cached_utility_function_type = None

    # Kwargs passed to optimize_restarts, which passes them to optimize
    #   Check comments in optimize's class AND optimization.get_optimizer for real list of optimizers
    # TODO: Eventually set robust to True; see description in optimize_restarts method
    def fit(self, restarts=None, optimiser='lbfgsb', verbose=False, robust=False, **kwargs):
        """
        Build the regression model and optimise it with random restarts.

        :param restarts: number of restarts; when ``None`` the value stored by
            a previous call is reused.
        :param optimiser: optimiser name forwarded as ``optimizer``.
        :param verbose: forwarded to ``optimize_restarts``.
        :param robust: forwarded to ``optimize_restarts``.
        :param kwargs: extra keyword arguments forwarded to ``optimize_restarts``.
        :return: ``self``, so calls can be chained.
        :raises ValueError: if no restarts value was given now or earlier.
        """
        if restarts is not None:
            self.restarts = restarts
        elif self.restarts is None:
            raise ValueError('No restarts value specified')

        self.model = GPRegression(self.X, self.Y, self.kernel_expression.to_kernel())
        with warnings.catch_warnings():  # Ignore known numerical warnings
            warnings.simplefilter('ignore')
            self.model.optimize_restarts(num_restarts=self.restarts,
                                         verbose=verbose,
                                         robust=robust,
                                         optimizer=optimiser,
                                         **kwargs)
        return self

    def interpret(self):
        """
        Return the interpretation of the fitted kernel.

        The fitted kernel is mapped onto a deep copy of the kernel expression,
        so the stored expression itself is not passed to the mapping helper.
        """
        return fit_ker_to_kex_with_params(
            self.model.kern, deepcopy(self.kernel_expression)).get_interpretation()

    def predict(self, X, quantiles=(2.5, 97.5), full_cov=False, Y_metadata=None,
                kern=None, likelihood=None, include_likelihood=True):
        """
        Predict at ``X`` and return the mean, covariance and two quantiles.

        All arguments are forwarded to the model's ``predict`` and
        ``predict_quantiles`` methods.

        :return: dict with keys ``'mean'``, ``'covariance'``,
            ``'low_quantile'`` and ``'high_quantile'``.
        """
        mean, cov = self.model.predict(X, full_cov, Y_metadata, kern, likelihood,
                                       include_likelihood)
        qs = self.model.predict_quantiles(X, quantiles, Y_metadata, kern, likelihood)
        return {
            'mean': mean,
            'covariance': cov,
            'low_quantile': qs[0],
            'high_quantile': qs[1]
        }

    def change_plotting_library(self, library='plotly_offline'):
        '''Wrapper of GPy.plotting's homonymous function;
        supported values are: 'matplotlib', 'plotly', 'plotly_online', 'plotly_offline' and 'none'.

        If 'plotly' then a 3-tuple is returned, with as 1st value the Figure object requiring a .show() to display.'''
        # Inside the method this name resolves to the module-level function,
        # not to this method.
        change_plotting_library(library)

    def plot(self):
        """Return the result of the fitted model's own ``plot()``."""
        return self.model.plot()

    # Model fit objective criteria & related values:

    def _ll(self):
        """Log-likelihood of the fitted model."""
        return self.model.log_likelihood()

    def _n(self):
        """Number of data points."""
        return len(self.model.X)

    def _k(self):
        """Number of estimated parameters, i.e. model degrees of freedom."""
        return self.model._size_transformed()

    def _ordered_score_ps(self):
        """Arguments handed to a score function: ``(model, ll, n, k)``."""
        return self.model, self._ll(), self._n(), self._k()

    def compute_utility(self, score_f):
        """
        Evaluate ``score_f(model, ll, n, k)`` and cache the result.

        :param score_f: callable taking ``(model, ll, n, k)``; its ``__name__``
            is stored as the cached utility type.
        :return: the computed score.
        """
        self.cached_utility_function = score_f(*self._ordered_score_ps())
        self.cached_utility_function_type = score_f.__name__
        return self.cached_utility_function
x_half)[:, None] # First cluster of inputs/covariates X[x_half:, :] = np.linspace( 8, 10, x_half)[:, None] # Second cluster of inputs/covariates rbf = RBF(input_dim=1) mu = np.zeros(N) cov = rbf.K(X) + np.eye(N) * np.sqrt(noise_var) y = np.random.multivariate_normal(mu, cov).reshape(-1, 1) # plt.scatter(X, y) # plt.show() gp_regression = GPRegression(X, y) gp_regression.optimize(messages=True) log_likelihood1 = gp_regression.log_likelihood() model_output(gp_regression, title="GP Regression with loglikelihood: " + str(log_likelihood1)) ################################# # inducing variables, u. Each inducing variable has its own associated input index, Z, which lives in the same space as X. Z = np.hstack((np.linspace(2.5, 4., 3), np.linspace(7, 8.5, 3)))[:, None] sparse_regression = SparseGPRegression(X, y, kernel=rbf, Z=Z) sparse_regression.noise_var = noise_var sparse_regression.inducing_inputs.constrain_fixed() sparse_regression.optimize(messages=True)
class ExperimentalCondition:
    """
    The `ExperimentalCondition` class stores treatment response data for an experimental condition within a
    `CancerModel`. It stores all replicates for all variables of the experimental condition for a given cancer
    model system.

    For example, in Patient Derived Xenograph (PDX) experiments it would store the tumour size measurements at each
    exposure time for all mouse models derived from a single patient.

    In cancer cell lines (CCLs) it would store all viability measurements for each dose level for all cultures derived
    from a single cancer cell line and treated with a specific compound.

    Thus the `ExperimentalCondition` class can be thought of as storing response data for a cancer model in two
    dimensions: replicates (e.g., a specific mouse or culture) and variable condition levels (e.g., a specific time
    or dose).

    Common experimental conditions:
        * Control, i.e. no treatment
        * Exposure to a specific drug or compound
        * Treatment with a specific type of ionizing radiation

    It can have multiple replicates (ie. data for multiple growth curves)
    """

    def __init__(self, name, source_id=None, variable=None, response=None, replicates=None,
                 variable_treatment_start=None, is_control=False):
        """
        Initialize a particular treatment condition within a cancer model. For example, exposure to a given compound
        in set of PDX models derived from a single patient.

        :param name: [string] Name of the experimental/treatment condition (e.g., Control, Erlotinib, Paclitaxel, etc.)
        :param source_id: [string] A unique identifier for the cancer model source. For PDX models this would be the
            name or id of the patient from which the models were derived. For CCLs this would be the strain from which
            all cell cultures were derived.
        :param variable: [ndarray] The independent variable of the experimental condition. For example, the treatment
            exposure time for each tumour size measurement or the dose variable for each cell viability measurement.
            Required despite the `None` default; stored as a column (one row per level).
        :param response: [ndarray] The response metric for the experimental condition. E.g., the tumour size in a PDX
            model after variable days of treatment exposure or the cell viability measurements in a CCL at a specific
            compound dose. Required despite the `None` default; it is transposed on storage, so `self.response` has
            one row per replicate.
        :param replicates: [ndarray] The indexes of replicate values in the response attribute. Stored as a list.
        :param variable_treatment_start: The value of the independent variable at which treatment started. Defaults
            to the first value of `variable`.
        :param is_control: [bool] Whether or not the treatment condition is a control.
        :return [None] Creates the ExperimentalCondition object.
        """
        self.name = name
        self.variable = np.asarray([[var] for var in variable])
        self.response = np.asarray(response.T).astype(float)
        self.response_norm = None
        self.variable_end = self.variable[-1][0]
        # TODO:: Is there any situation where np.array indexing doesn't start at 0?
        self.variable_start = self.variable[0][0]
        self.variable_treatment_start = variable_treatment_start if variable_treatment_start is not None else \
            self.variable_start

        self.variable_start_index = np.where(self.variable.ravel() == self.variable_start)[0][0]
        self.variable_end_index = np.where(self.variable.ravel() == self.variable_end)[0][0]

        # Assume treatment start is the same as the start of the independent variable, unless the user assigns
        self.variable_treatment_start_index = self.variable_start_index
        self.variable_treatment_end_index = self.variable_end_index

        self.source_id = source_id
        self.replicates = replicates if isinstance(replicates, list) else list(replicates)
        self.is_control = is_control
        self.kl_p_cvsc = None

        # GPs
        self.gp = None
        self.gp_kernel = None

        # all below are between the <experimental_condition> and the control
        self.empirical_kl = None

        # KL divergence stats
        self.kl_divergence = None
        self.kl_p_value = None

        # naive stats
        # {701: 'mCR', 711: 'mPR', ...}
        self.best_avg_response = np.array([], dtype=np.float64)
        self.mrecist = {}
        self.mrecist_counts = None

        self.linear_models = []

        # {701: response angle, ...}
        self.response_angle = {}
        self.response_angle_rel = {}

        self.response_angle_control = {}
        self.response_angle_rel_control = {}

        # response angles based on average of curves
        self.average_angle = None
        self.average_angle_rel = None
        self.average_angle_control = None
        self.average_angle_rel_control = None

        # {701: AUC, ...}
        self.auc = {}
        self.auc_norm = {}

        self.auc_gp = None
        self.auc_gp_control = None
        self.auc_control = {}
        self.auc_control_norm = {}

        self.inverted = False

        # credible intervals stats
        self.credible_intervals = []
        self.percent_credible_intervals = None

        self.responder_pvalue_AUC = None
        self.responder_pvalue_angle = None

        self.rates_list = []
        self.rates_list_control = []

        # Full Data is all of the data of the treatments and control
        self.full_data = np.array([])

        # gp_h0 and gp_h1 depend on the full_data
        self.gp_h0 = None
        self.gp_h0_kernel = None
        self.gp_h1 = None
        self.gp_h1_kernel = None

        self.delta_log_likelihood_h0_h1 = None

        self.tgi = None

    @property
    def responder_AUC(self, p_value=0.05):
        """
        Decide if the cancer model is a responder based on AUC for a specified p-value cut-off.

        NOTE: this is a property, so callers cannot pass `p_value`; the default of 0.05 is always used.

        :param p_value: [float] The p-value cutoff. Default is 0.05.
        :return [bool] True or False, where True means the cancer model is a responder to the treatment.
        """
        if self.responder_pvalue_AUC is None:
            self.calculate_responder_pvalue_AUC()
        return self.responder_pvalue_AUC < p_value

    @property
    def responder_angle(self, p_value=0.05):
        """
        Decide if the cancer model is a responder based on response angle for a specified p-value cut-off.

        NOTE: this is a property, so callers cannot pass `p_value`; the default of 0.05 is always used.

        :param p_value: [float] The p-value cutoff. Default is 0.05.
        :return [bool] True or False, where True means the cancer model is a responder to the treatment.
        """
        if self.responder_pvalue_angle is None:
            self.calculate_responder_pvalue_angle()
        return self.responder_pvalue_angle < p_value

    # ---- Single Bracket Subsetting
    def __getitem__(self, item):
        """
        Implementation of slicing and single bracket subsetting syntax for this object.

        :param item: [int, list of int or slice object] The replicate index/indexes to select.
        :return [DataFrame] A `variable` column followed by one `replicate_<idx>` column per selected replicate.
        :raises IndexError: if an index is out of bounds or is not an int, list of ints or slice.
        """
        lowest, highest = min(self.replicates), max(self.replicates)

        # Deal with slices
        if isinstance(item, slice):
            # Open-ended slices (`obj[:2]`, `obj[1:]`) carry None bounds; fill them in before comparing.
            start = lowest if item.start is None else item.start
            stop = highest + 1 if item.stop is None else item.stop
            step = 1 if item.step is None else item.step
            # `stop` is exclusive, so the largest acceptable value is one past the last replicate.
            if stop > highest + 1 or start > highest:
                raise IndexError(
                    f"Slice indexes out of bounds. Acceptable slice range is from "
                    f"{min(self.replicates)} to {max(self.replicates) + 1}.")
            array = np.hstack([self.variable, self.response[slice(start, stop, step), :].T])
            return pd.DataFrame.from_records(
                array,
                columns=['variable', *['replicate_' + str(idx) for idx in range(start, stop, step)]])

        # Deal with numeric indexing
        if not isinstance(item, list):
            item = [item]
        if not all([isinstance(idx, int) for idx in item]):
            raise IndexError("Index must be an int, list of ints or a slice object!")
        if max(item) > highest or min(item) < lowest:
            raise IndexError(
                f"One or more of {item} is an out of bounds index. Acceptable index range is from "
                f"{min(self.replicates)} to {max(self.replicates)}.")
        array = np.hstack([self.variable, self.response[item, :].T])
        return pd.DataFrame.from_records(
            array, columns=['variable', *['replicate_' + str(idx) for idx in item]])

    def to_dict(self, json=False):
        """
        Convert a ExperimentalCondition object into a dictionary with attributes as keys for their associated values.
        If `json` is True, NumPy arrays and NumPy scalars are converted to Python base types; other values are
        returned unchanged. If `json` is False the object's own `__dict__` is returned (not a copy).

        :param json: [bool] Whether to convert NumPy values to base types.
        :return [dict] Attribute names mapped to their values.
        """

        # Helper to convert any NumPy types into base types
        def _if_numpy_to_base(value):
            if isinstance(value, np.ndarray):
                return value.tolist()
            if isinstance(value, np.generic):
                return value.item()
            return value

        if json:
            return {key: _if_numpy_to_base(value) for key, value in self.__dict__.items()}
        return self.__dict__

    ## TODO:: Can we implement this in the constructor?
    def find_variable_start_index(self):
        """
        Returns the index in the array of the location of the treatment start value, + or - 1. For a PDX model, this
        corresponds to the index of the day treatment was started.

        :return [int] The index of the first variable level within 1 of the treatment start, or None if there is
            no such level.
        """
        for i, level in enumerate(self.variable.ravel()):
            if level - 1 <= self.variable_treatment_start <= level + 1:
                return i
        return None

    def normalize_data(self):
        """
        Normalizes all growths relative to the treatment start and log transforms them.

        :return [None] modifies self.response_norm
        """
        logger.info("Normalizing data for " + self.name)
        self.response_norm = self.__normalize_treatment_start_variable_and_log_transform(
            self.response, self.find_variable_start_index())

    def __normalize_treatment_start_variable_and_log_transform(self, response, treatment_start_index):
        """
        Normalize each replicate by its own value at the treatment start index, then take the log and add 1.
        A small offset (0.01) is added to the numerator before dividing.

        :param response [array] the array of values to be normalised, one row per replicate:
        :param treatment_start_index [int] index of the treatment start along the variable axis:
        :return [array] the normalised array:
        """
        return np.log(
            np.asarray((response.T + 0.01) / response.T[int(treatment_start_index)], dtype=float).T) + 1

    def create_full_data(self, control):
        """
        Creates a 2d numpy array with columns time, treatment and tumour size. Control rows (treatment = 0) come
        first, followed by this condition's rows (treatment = 1). Rows are appended to any existing `full_data`.

        :param control [ExperimentalCondition] the corresponding control object; its `response_norm` must be set:
        :return [None] Creates the full_data array
        """
        # control
        rows = [[control.variable[j][0], 0, y]
                for j, entry in enumerate(control.response_norm.T)
                for y in entry]
        # case
        rows.extend([self.variable[j][0], 1, y]
                    for j, entry in enumerate(self.response_norm.T)
                    for y in entry)
        if not rows:
            return

        # Stack once rather than once per row; the result is always 2-D.
        new_rows = np.array(rows, dtype=float)
        if self.full_data.size == 0:
            self.full_data = new_rows
        else:
            self.full_data = np.vstack((self.full_data, new_rows))

    def calculate_tgi(self, control):
        """
        Calculates the Tumour Growth Index of a ExperimentalCondition object from the replicate-averaged
        normalised responses of this condition and the control.

        :param control [ExperimentalCondition] the corresponding control object
        :return [None] Writes the calculated value into self.tgi
        """

        def TGI(yt, yc, i, j):
            # calculates TGI between yt (Treatment) and yc (Control) during epoch i, to j
            return 1 - (yt[j] - yt[i]) / (yc[j] - yc[i])

        start = max(self.find_variable_start_index(), control.variable_treatment_start_index)
        end = min(self.variable_treatment_end_index, control.variable_treatment_end_index) + 1

        self.tgi = TGI(self.response_norm.mean(axis=0)[start:end],
                       control.response_norm.mean(axis=0)[start:end],
                       0, end - start - 1)

    def fit_gaussian_processes(self, control=None, num_restarts=7):
        """
        This is the new version, which fits only on the `relevant' interval

        Fits a GP for both the control and case growth curves,
        H1 with time and treatment, and H0 with only time.

        :param control If None, then just fits one GP - else, fits 3 different GPs
                        (one for case, two for gp_h0 and gp_h1):
        :param num_restarts The number of restarts in the optimisation:
        :return [None] creates the GP objects:
        """
        logger.info("Fitting Gaussian processes for " + self.name)

        print("Now attempting to fit:")
        print("self.name:")
        print(self.name)
        print("Self.source_id:")
        print(self.source_id)

        self.gp_kernel = RBF(input_dim=1, variance=1., lengthscale=10.)

        # self.response_norm has one row per replicate and one column per measurement
        start = self.variable_treatment_start_index
        end = self.variable_treatment_end_index
        response_norm_trunc = self.response_norm[:, start:end]

        # # Determine index of first mouse death to remove all NaNs before fitting the model
        # first_death_idx = min(np.sum(~np.isnan(response_norm_trunc), axis=1))
        #
        # # Subset the independent variable and response data
        # response_norm_trunc = response_norm_trunc[:, 0:first_death_idx]
        # variable_trunc = self.variable[0:first_death_idx, :]

        # Reshape the data to pass into GPRegression (flatten into a single column)
        variable = np.tile(self.variable[start:end], (len(self.replicates), 1))
        response = np.resize(response_norm_trunc,
                             (response_norm_trunc.shape[0] * response_norm_trunc.shape[1], 1))

        self.gp = GPRegression(variable, response, self.gp_kernel)
        self.gp.optimize_restarts(num_restarts=num_restarts, messages=False)

        if control is not None:
            # Subset full data for control calculations
            # self.full_data = self.full_data[np.isin(self.full_data[:, 0], variable_trunc), :]

            # kernels
            self.gp_h0_kernel = RBF(input_dim=1, variance=1., lengthscale=10.)
            self.gp_h1_kernel = RBF(input_dim=2, variance=1., ARD=True)

            # GPs
            self.gp_h0 = GPRegression(self.full_data[:, 0:1], self.full_data[:, 2:3], self.gp_h0_kernel)
            self.gp_h1 = GPRegression(self.full_data[:, 0:2], self.full_data[:, 2:3], self.gp_h1_kernel)

            # optimize GPs
            self.gp_h0.optimize_restarts(num_restarts=num_restarts, messages=False, robust=True)  # silent exceptions
            self.gp_h1.optimize_restarts(num_restarts=num_restarts, messages=False, robust=True)

            self.delta_log_likelihood_h0_h1 = self.gp_h1.log_likelihood() - self.gp_h0.log_likelihood()

    def fit_linear_models(self):
        """
        Fits a separate OLS model, "Response ~ Variable + 0", to each replicate in the object.

        :return [None] Stores the fitted models in `self.linear_models`, with each index corresponding to the
            replicate that model was fit for.
        """
        model_dfs = [
            pd.DataFrame({"Response": resp, "Variable": self.variable.flatten()})
            for resp in self.response
        ]
        self.linear_models = [
            smf.ols(formula="Response ~ Variable + 0", data=model_df).fit()
            for model_df in model_dfs
        ]

    def calculate_lm_slopes(self):
        """
        Calculate the slope of each replicate linear model in degrees. The slope is defined as the arctan of the
        coefficient for the independent variable in the linear model. Results are converted to degrees.

        :return [ndarray] Slope of the linear model for each replicate in degrees.
        """
        params = np.array([model.params.values.item() for model in self.linear_models])
        return np.arctan(params) * (180 / np.pi)

    def calculate_kl_divergence(self, control):
        """
        Calculates the KL divergence between the GPs fit for both the batched controls and batched cases.

        :param control: The corresponding control ExperimentalCondition object
        :return: [None] Writes the divergence into `self.kl_divergence`
        """
        logger.info("Calculating the KL Divergence for " + self.name)

        def kl_integrand(variable):
            """
            Calculates the KL integrand

            :param variable [float] The independent variable for the Gaussian Process Model (either time or dose).
            :return [float] The integrand
            """
            point = np.asarray([[variable]])
            mean_control, var_control = control.gp.predict(point)
            mean_case, var_case = self.gp.predict(point)

            value = ((var_control + (mean_control - mean_case)**2) / (2 * var_case)) + \
                ((var_case + (mean_case - mean_control)**2) / (2 * var_control)) - 1
            # quad expects a plain float; the predictions come back as single-element arrays.
            return float(np.squeeze(value))

        max_x_index = min(self.variable_treatment_end_index, control.variable_treatment_end_index)

        if control.response.shape[1] > self.response.shape[1]:
            upper = self.variable[max_x_index][0]
        else:
            upper = control.variable[max_x_index][0]

        integral = quad(kl_integrand, self.variable_treatment_start, upper, limit=100)[0]
        self.kl_divergence = abs(1 / (upper - self.variable_treatment_start) * integral)

        logger.info(self.kl_divergence)

    def calculate_responder_pvalue_AUC(self):
        """
        Conduct a Mann-Whitney rank test between the AUC values for the treatment vs the AUC of the control and
        store the p-value in `self.responder_pvalue_AUC`.
        """
        self.responder_pvalue_AUC = \
            stats.mannwhitneyu(list(self.auc.values()), list(self.auc_control.values()),
                               alternative="less").pvalue

    def calculate_responder_pvalue_angle(self):
        """
        Conduct a Mann-Whitney rank test between the response angle values for the treatment vs the response angle
        of the control and store the p-value in `self.responder_pvalue_angle`.
        """
        self.responder_pvalue_angle = \
            stats.mannwhitneyu(list(self.response_angle.values()), list(self.response_angle_control.values()),
                               alternative="less").pvalue

    @staticmethod
    def __fit_single_gaussian_process(variable, response_norm, num_restarts=7):
        """
        GP fitting.

        Returns the GP and kernel.

        :param variable: time
        :param response_norm: log-normalized target
        :param num_restarts: The number of restarts in the optimisation
        :return [tuple] a tuple:
            - the gp object
            - the kernel
        """
        kernel = RBF(input_dim=1, variance=1., lengthscale=10.)
        variable = np.tile(variable, (response_norm.shape[0], 1))
        response = np.resize(response_norm, (response_norm.shape[0] * response_norm.shape[1], 1))
        gp = GPRegression(variable, response, kernel)
        gp.optimize_restarts(num_restarts=num_restarts, messages=False)

        return gp, kernel

    @staticmethod
    def __relativize(y, start):
        """
        Normalises a numpy array to the start day

        :param y [ndarray] the array to be normalised:
        :param start [int] the start day:
        :return [ndarray] the normalised array:
        """
        return y / y[start] - 1

    @staticmethod
    def __centre(y, start):
        """
        Centres a numpy array to the start day

        :param y [ndarray] the array to be centred:
        :param start [int] the start day:
        :return [ndarray] the centred array:
        """
        return y - y[start]

    @staticmethod
    def __compute_response_angle(variable, response, start):
        """
        Calculates the response angle for observations response, given time points variable and start point start

        :param variable [ndarray] the time points
        :param response [ndarray] the observations
        :param start [int] the start index for the angle computation
        :return [float] the angle, as the arctan of the fitted OLS coefficient:
        """
        min_length = min(len(variable), len(response))
        model = sm.OLS(response[start:min_length], variable[start:min_length], missing='drop')  # Drop NaNs
        results = model.fit()
        return np.arctan(results.params[0])

    def calculate_response_angles(self, control):
        """
        Builds the response angle dict.

        :param control [ExperimentalCondition] the corresponding control object
        :return [None] writes to the angle parameters
        :raises ValueError: if no treatment start index can be found
        """
        start = self.find_variable_start_index()
        if start is None:
            raise ValueError(
                "The `self.variable_start_index` parameter is missing, please initialize this value.")

        times = self.variable.ravel()
        for i, replicate in enumerate(self.replicates):
            self.response_angle[replicate] = self.__compute_response_angle(
                times, self.__centre(self.response[i], start), start)
            self.response_angle_rel[replicate] = self.__compute_response_angle(
                times, self.__relativize(self.response[i], start), start)

        self.average_angle = self.__compute_response_angle(
            times, self.__centre(np.nanmean(self.response, axis=0), start), start)
        self.average_angle_rel = self.__compute_response_angle(
            times, self.__relativize(np.nanmean(self.response, axis=0), start), start)
        self.average_angle_control = self.__compute_response_angle(
            control.variable.ravel(), self.__centre(np.nanmean(control.response, axis=0), start), start)
        self.average_angle_rel_control = self.__compute_response_angle(
            control.variable.ravel(), self.__relativize(np.nanmean(control.response, axis=0), start), start)

    @staticmethod
    def __calculate_AUC(variable, response):
        """
        Calculates the area under the curve of a set of observations

        :param variable [ndarray] the time points
        :param response [ndarray] the observations
        :return [float] The area under the curve
        """
        min_length = min(len(variable), len(response))
        AUC = sklearn.metrics.auc(x=variable[0:min_length + 1], y=response[0:min_length + 1])
        return AUC

    def calculate_gp_auc(self):
        """
        Builds the AUC (Area under the curve) with respect to the GP fit.

        :return [None] Writes the value into `self.auc_gp`
        """
        self.auc_gp = self.__calculate_AUC(self.variable, self.gp.predict(self.variable)[0])

    def calculate_auc(self, control):
        """
        Builds the AUC (Area under the curve) dict for response.

        :param control: the corresponding control object:
        :return [None]:
        """
        start = max(self.find_variable_start_index(), control.find_variable_start_index())
        end = min(self.variable_treatment_end_index, control.variable_treatment_end_index)
        times = self.variable.ravel()[start:end]
        for i, replicate in enumerate(self.replicates):
            self.auc[replicate] = self.__calculate_AUC(times, self.response[i, start:end])

    def calculate_auc_norm(self, control):
        """
        Builds the AUC (Area under the curve) dict. for response_norm

        :param control: the corresponding control object:
        :return [None]:
        """
        start = max(self.find_variable_start_index(), control.find_variable_start_index())
        end = min(self.variable_treatment_end_index, control.variable_treatment_end_index)
        times = self.variable.ravel()[start:end]
        for i, replicate in enumerate(self.replicates):
            self.auc_norm[replicate] = self.__calculate_AUC(times, self.response_norm[i, start:end])

    def calculate_mrecist(self):
        """
        Builds the mRECIST dict and the `best_avg_response` array (one entry per replicate).

        Only measurements taken at least 3 variable units after treatment start are considered.

        - **mCR**: BestResponse < -95% AND BestAverageResponse < -40%
        - **mPR**: BestResponse < -50% AND BestAverageResponse < -20%
        - **mSD**: BestResponse < 35% AND BestAverageResponse < 30%
        - **mPD**: everything else

        :return [None]
        :raises ValueError: if no treatment start index can be found
        """
        start = self.find_variable_start_index()
        if start is None:
            raise ValueError(
                "The `start` attribute for this `ExperimentalCondition` object is set to None, "
                "please reset.")

        # A former preliminary loop over all-but-one replicate was removed: every value it wrote was
        # overwritten by this loop, so it only produced debug output.
        self.best_avg_response = np.array([], dtype=np.float64)
        for i, replicate in enumerate(self.replicates):
            initial_volume = self.response[i][start]

            # percentage change from the initial volume, and its running average
            responses = []
            average_responses = []
            for day, volume in zip(self.variable.ravel(), self.response[i]):
                if day - self.variable_treatment_start >= 3:
                    responses.append(((volume - initial_volume) / initial_volume) * 100)
                    average_responses.append(np.average(responses))

            best_response = min(responses)
            best_average = min(average_responses)
            self.best_avg_response = np.append(self.best_avg_response, best_average)

            if best_response < -95 and best_average < -40:
                self.mrecist[replicate] = 'mCR'
            elif best_response < -50 and best_average < -20:
                self.mrecist[replicate] = 'mPR'
            elif best_response < 35 and best_average < 30:
                self.mrecist[replicate] = 'mSD'
            else:
                self.mrecist[replicate] = 'mPD'

    def enumerate_mrecist(self):
        """
        Builds up the mrecist_counts attribute with number of each occurrence of mRECIST experimental_condition.

        :return: [None]
        :raises ValueError: if mRECIST has not been calculated yet
        """
        # TODO:: Instead of error, we could just call method to calculate mrecist, then give the user a warning?
        # `mrecist` starts out as an empty dict, so test for emptiness rather than for None.
        if not self.mrecist:
            raise ValueError(
                "`ExperimentalCondition` object mrecist attribute is none, please calculate mrecist first!")

        self.mrecist_counts = Counter(mCR=0, mPR=0, mSD=0, mPD=0)
        for replicate in self.replicates:
            category = self.mrecist[replicate]
            # Only the four known categories are counted.
            if category in self.mrecist_counts:
                self.mrecist_counts[category] += 1

    def __credible_interval(self, threshold, variable_2, variable_1=0, control=None):
        """
        Credible interval function, for finding where the two GPs diverge.

        Tests whether 0 lies inside the `threshold` interval of
        (case(variable_2) - case(variable_1)) - (control(variable_2) - control(variable_1)).

        ## FIXME:: Is variable float or int?
        :param threshold [float] The variable of confidence
        :param variable_2 [int] The value of variable at the end of the range (i.e, time 2 or dose 2)
        :param variable_1 [int] The value of variable at the start of the range (i.e., time 1 or dose 1)
        :param control: the corresponding control object:
        :return: single-element boolean array, True when the interval contains 0; None when control is missing
        """
        if control is None:
            logger.error("The private function `__credible_interval` requires control.")
            return None

        a = np.array([1, -1, -1, 1])
        means = np.array([
            self.gp.predict(np.asarray([[variable_2]])),
            self.gp.predict(np.asarray([[variable_1]])),
            control.gp.predict(np.asarray([[variable_2]])),
            control.gp.predict(np.asarray([[variable_1]]))
        ])[:, 0, 0]

        # Block-diagonal covariance: case and control GPs are treated as independent.
        # The blocks are ordered (variable_1, variable_2) while `means` is ordered (variable_2, variable_1);
        # the quadratic form below is unaffected because each block is symmetric.
        variances = np.zeros((4, 4))
        variances[0:2, 0:2] = self.gp.predict(np.asarray([[variable_1], [variable_2]]), full_cov=True)[1]
        variances[2:4, 2:4] = control.gp.predict(np.asarray([[variable_1], [variable_2]]), full_cov=True)[1]

        mu = np.dot(a, means)
        variance = np.dot(np.dot(a, variances), a.T)
        # norm.interval takes a standard deviation as its scale, so take the root of the variance.
        interval = norm.interval(threshold, mu, np.sqrt(variance))

        return (interval[0] < 0) and (interval[1] > 0)

    def calculate_credible_intervals(self, control):
        """
        Evaluates the 95% credible interval test at every variable level after the first, using the shorter of
        the case and control variable arrays, and appends (flag, level) tuples to `self.credible_intervals`.

        :param control: control ExperimentalCondition object
        :return: [None]
        """
        logger.info("Calculating credible intervals for: " + self.name)

        if control is None:
            logger.error("The function `calculate_credible_intervals` requires control.")
            return

        levels = self.variable if len(control.variable) > len(self.variable) else control.variable
        for i in levels[1:]:  # Why starting at second value?
            self.credible_intervals.append(
                (self.__credible_interval(0.95, i[0], control=control)[0], i[0]))

    def calculate_credible_intervals_percentage(self):
        """
        :return [float] The percentage of credible interval tests that were True; also has the side effect of
            setting the percent_credible_intervals attribute on the object.
        """
        logger.info("Calculating percentage of credible intervals.")

        num_true = sum(1 for flag, _ in self.credible_intervals if flag)
        self.percent_credible_intervals = (num_true / len(self.credible_intervals)) * 100
        return self.percent_credible_intervals

    def __gp_derivative(self, variable, gp):
        """
        Computes the derivative of the Gaussian Process gp
        (with respect to its 'time' variable) and
        returns the values of the derivative at time
        points variable.

        :param variable [ndarray] The independent variable, either time for PDX models or dose for CCL models
        :param gp [GP] The GaussianProcess to be differentiated
        :return [tuple] A tuple:
            - The mean
            - The covariance
        """
        if variable.ndim == 1:
            variable = variable[:, np.newaxis]

        mu, ignore = gp.predictive_gradients(variable)
        ignore, cov = gp.predict(variable, full_cov=True)
        # FIXME:: How did this not divide by zero previously?
        mult = [[((1. / gp.kern.lengthscale) * (1 - (1. / gp.kern.lengthscale) * (y - z)**2))[0]
                 for y in variable if y != z] for z in variable]
        return mu, mult * cov

    def compute_all_gp_derivatives(self, control):
        """
        Evaluates the GP derivative at every variable level for this condition and for the control.

        :param control [ExperimentalCondition] The control `ExperimentalCondition` for the current `CancerModel`
        :return: [None] Sets the `rates_list` and `rates_list_control` attributes
        """
        # The lists are flattened to arrays at the end, so convert back before appending again.
        if not isinstance(self.rates_list, list):
            self.rates_list = list(self.rates_list)
        if not isinstance(self.rates_list_control, list):
            self.rates_list_control = list(self.rates_list_control)

        logger.info("Calculating the GP derivatives for: " + self.name + ' and control')
        for var in self.variable:
            self.rates_list.append(self.__gp_derivative(var, self.gp)[0])
        for var in control.variable:
            self.rates_list_control.append(self.__gp_derivative(var, control.gp)[0])
        self.rates_list = np.ravel(self.rates_list)
        self.rates_list_control = np.ravel(self.rates_list_control)
        logger.info("Done calcluating GP derivatives for: " + self.name + ' and control')

    def plot_with_control(self, control=None, output_path=None, show_kl_divergence=True, show_legend=True,
                          file_type=None, output_pdf=None):
        """
        Given all of the data and an output path, saves a PDF of the comparison with some statistics as well.

        :param control: The control ExperimentalCondition object
        :param output_path: output filepath, used when `file_type` is 'svg'
        :param show_kl_divergence: flag for displaying calculated kl_divergence
        :param show_legend: flag for displaying legend
        :param file_type: can be 'svg' or 'pdf'; with any other value (including the default None) nothing is saved
        :param output_pdf: an output_pdf object, used when `file_type` is 'pdf'
        :return: [None]
        """
        if control is None:
            logger.error("You need to plot with a control.")
            return

        logger.info("Plotting with statistics for " + self.name)

        fig, ax = plt.subplots()
        plt.title(
            f"Case (Blue) and Control (Red) Comparison of \n {str(self.source_id)} with {str(self.name)}")

        # set xlim
        gp_x_limit = max(self.variable) + 5

        # Control
        control.gp.plot_data(ax=ax, color='red')
        control.gp.plot_mean(ax=ax, color='red', plot_limits=[0, gp_x_limit])
        control.gp.plot_confidence(ax=ax, color='red', plot_limits=[0, gp_x_limit])

        # Case
        self.gp.plot_data(ax=ax, color='blue')
        self.gp.plot_mean(ax=ax, color='blue', plot_limits=[0, gp_x_limit])
        self.gp.plot_confidence(ax=ax, color='blue', plot_limits=[0, gp_x_limit])

        # Drug Start Line
        plt.plot([self.variable_treatment_start, self.variable_treatment_start], [-10, 15], 'k-', lw=1)

        plt.xlabel('Day')
        plt.ylabel('Normalized log tumor size')
        plt.ylim(-10, 15)

        # Always select the longest date + 5
        plt.xlim(0, max(self.variable) + 5)

        if show_kl_divergence:
            plt.text(2, -8, 'KL Divergence: ' + str(self.kl_divergence))

        if show_legend is True:
            plt.legend(loc=0)

        if file_type == 'pdf':
            output_pdf.savefig(fig)
            plt.close(fig)
        elif file_type == 'svg':
            plt.savefig(output_path, format="svg")

    def __repr__(self):
        """
        Returns a string representation of the experimental_condition object.

        :return [string] The representation:
        """
        return ('\n'.join([
            f"Name: {self.name}",
            f"Treatment Start Date: {self.variable_treatment_start}",
            f"Source Id: {self.source_id}",
            f"K-L Divergence: {self.kl_divergence}",
            f"K-L P-Value: {self.kl_p_value}",
            f"mRecist: {self.mrecist}",
            f"Percent Credible Interval: {self.percent_credible_intervals}",
            f"Rates List: {self.rates_list}"
        ]))
class GaussianProcessRewardModel(RewardModel):
    """
    Models rewards with a Gaussian process regressor.

    Backed by ``GPRegression`` with an ARD kernel (``RBF`` or ``Matern32``).
    The GP is updated online as samples are reported. Hyperparameters are
    fit in batch once ``min_samples`` samples have been collected. Afterwards
    (when ``enable_refine`` is set) they are re-fit whenever the tracked
    average likelihood value drops ``refine_ll_delta`` below its best value,
    or more than ``refine_period`` samples were added since the last
    refinement.

    Parameters:
    -----------
    min_samples: integer (default 10)
        The number of samples after which initial batch hyperparameter fitting
        is performed.
    batch_retries: integer (default 19)
        The number of additional random restarts for the initial
        hyperparameter fit (``batch_retries + 1`` optimizations in total).
    enable_refine: boolean (default True)
        Whether hyperparameters are refined after the initial fit.
    refine_period: integer (default 0)
        Refine once more than this many samples were added since the last
        refinement.
    refine_ll_delta: numeric (default 1.0)
        Refine once the average likelihood value falls this far below the
        best value seen so far.
    refine_retries: integer (default 0)
        The number of additional random restarts for each refinement.
    kernel_type: string (default 'rbf')
        Either 'rbf' or 'matern' (case-insensitive).
    verbose: boolean (default False)
        Log progress through rospy.

    Other Keyword Parameters:
    -------------------
    Forwarded unchanged to the ``GPRegression`` constructor.
    """

    def __init__(self, min_samples=10, batch_retries=19, enable_refine=True,
                 refine_period=0, refine_ll_delta=1.0, refine_retries=0,
                 kernel_type='rbf', verbose=False, **kwargs):
        self.min_samples = min_samples
        self.hp_batch_retries = batch_retries
        self.enable_refine = enable_refine
        self.hp_refine_ll_delta = float(refine_ll_delta)
        self.hp_refine_retries = refine_retries
        self.hp_refine_period = refine_period
        self.last_refine_iter = 0
        self.hp_init = False
        self.last_ll = None
        self.kwargs = kwargs
        self.verbose = bool(verbose)

        kernel_name = kernel_type.lower()
        if kernel_name == 'rbf':
            self.kernel_class = RBF
        elif kernel_name == 'matern':
            self.kernel_class = Matern32
        else:
            raise ValueError('Unknown kernel_type: ' + kernel_type)

        self.kernel = None
        self.gp = None  # Init later
        self.inputs = []
        self.outputs = []

    def _training_arrays(self):
        """Return the stored samples as an input matrix and a column of rewards."""
        x = np.asarray(self.inputs)
        y = np.asarray(self.outputs).reshape(-1, 1)
        return x, y

    def _initialize(self):
        """Build a fresh kernel and GP from all samples stored so far."""
        x, y = self._training_arrays()
        self.kernel = self.kernel_class(input_dim=x.shape[1], ARD=True)
        self.gp = GPRegression(x, y, kernel=self.kernel, **self.kwargs)

    def _sync_gp_data(self):
        """Push the stored samples into the existing GP without re-fitting."""
        x, y = self._training_arrays()
        self.gp.set_XY(x, y)

    @property
    def num_samples(self):
        """Number of samples currently stored."""
        return len(self.inputs)

    @property
    def model(self):
        """The underlying GP object, or None if it has not been built."""
        return self.gp

    def average_log_likelihood(self):
        """
        Return the negated GP log-likelihood divided by the sample count.

        Returns None while the GP does not exist or fewer than
        ``min_samples`` samples are stored.
        """
        # NOTE For some reason this returns the negative log-likelihood
        if self.gp is None or self.num_samples < self.min_samples:
            return None
        return -self.gp.log_likelihood() / self.num_samples

    def report_sample(self, x, reward):
        """
        Add one (input, reward) sample and update the GP.

        Until the hyperparameters have been fit, each call attempts the
        initial batch optimization (a no-op below ``min_samples``). Once fit,
        the new data is pushed into the GP and a refinement check is run.
        """
        self.inputs.append(x)
        self.outputs.append(reward)

        if self.hp_init:
            self._sync_gp_data()
        else:
            # Keyed on hp_init rather than `gp is None`: fit() with too few
            # samples leaves an unoptimized GP behind, which must still get
            # its initial batch fit once enough samples have arrived.
            self.batch_optimize()
            if not self.hp_init:
                # Wait until we've initialized; keep any existing GP in sync.
                if self.gp is not None:
                    self._sync_gp_data()
                return

        current_ll = self.average_log_likelihood()
        if self.verbose:
            rospy.loginfo('Prev LL: %f Curr LL: %f', self.last_ll, current_ll)
        self.check_refine(current_ll)

    def check_refine(self, current_ll):
        """
        Re-fit the hyperparameters if a refinement criterion is met.

        ``current_ll`` is the value returned by ``average_log_likelihood``.
        """
        if not self.enable_refine:
            return
        # Nothing to compare against before the first successful fit.
        if current_ll is None or self.last_ll is None:
            return

        # Track the best value seen so far
        if current_ll > self.last_ll:
            self.last_ll = current_ll

        # If the LL has decreased by refine_ll_delta
        delta_achieved = current_ll < self.last_ll - self.hp_refine_ll_delta
        # If it has been refine_period samples since last refinement
        period_achieved = (self.num_samples >
                           self.last_refine_iter + self.hp_refine_period)

        if delta_achieved or period_achieved:
            self.batch_optimize(self.hp_refine_retries + 1)
            self.last_refine_iter = self.num_samples

    def batch_optimize(self, n_restarts=None):
        """
        Rebuild the GP from scratch and optimize its hyperparameters.

        Does nothing while fewer than ``min_samples`` samples are stored.
        ``n_restarts`` defaults to ``batch_retries + 1``.
        """
        if self.num_samples < self.min_samples:
            return
        if n_restarts is None:
            n_restarts = self.hp_batch_retries + 1

        # NOTE Warm-restarting seems to get stuck in local optima, possibly
        # from mean? Hence the GP is always rebuilt before optimizing.
        self._initialize()

        if self.verbose:
            rospy.loginfo('Batch optimizing with %d restarts...', n_restarts)
        self.gp.optimize_restarts(optimizer='bfgs',
                                  messages=False,
                                  num_restarts=n_restarts)
        if self.verbose:
            rospy.loginfo('Optimization complete. Model:\n%s\n Kernel:\n%s',
                          str(self.gp), str(self.kernel.lengthscale))

        self.hp_init = True
        self.last_ll = self.average_log_likelihood()

    def predict(self, x, return_std=False):
        """
        Predict the noiseless reward mean (and optionally std) at ``x``.

        ``x`` may be a single input vector or a 2-D array with one input per
        row. Without a GP the prediction is mean 0 with infinite std.
        """
        if self.gp is None:
            pred_mean = 0
            pred_std = float('inf')
        else:
            x = np.asarray(x)
            if x.ndim == 1:
                x = x.reshape(1, -1)
            pred_mean, pred_var = self.gp.predict_noiseless(x)

            # To catch negative variances. Done element-wise so that batches
            # of inputs work; a bare `if pred_var < 0` raises for arrays with
            # more than one element.
            pred_var = np.asarray(pred_var)
            if np.any(pred_var < 0):
                rospy.logwarn('Negative variance %f rounding to 0',
                              float(np.min(pred_var)))
                pred_var = np.maximum(pred_var, 0)
            pred_std = np.sqrt(pred_var)

        if return_std:
            return np.squeeze(pred_mean), np.squeeze(pred_std)
        return np.squeeze(pred_mean)

    def clear(self):
        """Drop all samples and return the model to its unfitted state."""
        self.inputs = []
        self.outputs = []
        self.kernel = None
        self.gp = None
        # Reset the fit bookkeeping too; stale values would make the next
        # report_sample compare/format None likelihoods.
        self.hp_init = False
        self.last_ll = None
        self.last_refine_iter = 0

    def fit(self, X, y):
        """Initialize the model from lists of inputs and corresponding rewards.

        Any previous samples and fit state are discarded. With at least
        ``min_samples`` samples the hyperparameters are batch-optimized;
        with fewer, an unoptimized GP is built and the batch fit is deferred
        to ``report_sample``. With no samples at all the model stays unfitted.

        Parameters
        ----------
        X : Iterable of inputs
        Y : Iterable of corresponding rewards
        """
        if len(X) != len(y):
            raise RuntimeError('X and Y lengths must be the same!')

        self.clear()
        self.inputs = list(X)
        self.outputs = list(y)
        if not self.inputs:
            # Nothing to build a GP from
            return

        # Same restart count as the online initial fit (batch_retries + 1).
        self.batch_optimize()
        if not self.hp_init:
            # Too few samples to optimize: still expose an unoptimized GP.
            self._initialize()
if __name__ == "__main__":
    # Compare GPy's GP regression against the mobo implementation on the
    # sphere function, using identical training and test data.

    def _targets(points):
        """Evaluate the benchmark function on each row; returns a column vector."""
        return np.array([sphere(row) for row in points]).reshape(-1, 1)

    def _mean_abs_error(truth, estimate):
        """Mean absolute difference between truth and estimate."""
        return np.mean(abs(truth - estimate))

    np.random.seed(1)
    n_dims = 50
    bounds = (-5, 5)

    # Training data (drawn first so the random stream matches the seed)
    train_x = np.random.uniform(bounds[0], bounds[1], (10, n_dims))
    train_y = _targets(train_x)

    # GPy model
    gpy_model = GPRegression(train_x, train_y,
                             kernel=RBFg(input_dim=n_dims, ARD=False))
    gpy_model.optimize()

    # mobo model: evaluate the likelihood once, then optimize with L-BFGS-B
    mobo_model = moboGP(RBF(ARD=False), train_x, train_y)
    Likelihood(mobo_model).evaluate()
    lbfgsb(mobo_model).opt()

    print("gpy:", gpy_model.log_likelihood())
    print("mobo:", mobo_model.log_likelihood)

    # Held-out data
    test_x = np.random.uniform(bounds[0], bounds[1], (1000, n_dims))
    test_y = _targets(test_x)

    gpy_mean, _gpy_cov = gpy_model.predict(test_x)
    mobo_mean, _mobo_cov = mobo_model.predict(test_x)

    print("gpy:", _mean_abs_error(test_y, gpy_mean))
    print("mobo:", _mean_abs_error(test_y, mobo_mean))
# Training inputs and targets taken from the previously loaded `data` mapping.
x_train = data["X"]
y_train = data["Y"]

# choose a kernel
#kernel = Matern32(input_dim=1, variance=2.0)
#kernel = GridRBF(input_dim=1)
#kernel = RBF(input_dim=1, variance=2.0)

# GP regression; optimize the parameters using the log-likelihood.
# No kernel is passed here, so GPRegression uses whatever its default is.
gp_regression = GPRegression(x_train, y_train)
#gp_regression.kern.lengthscale = 500
#gp_regression.likelihood.variance = 0.001
# Log-likelihood before and after optimization, for comparison.
print("loglikelihood: ", gp_regression.log_likelihood())
gp_regression.optimize()
print("loglikelihood: ", gp_regression.log_likelihood())

# predict new unseen samples: 200 evenly spaced inputs on [1870, 2030],
# shaped as a column
x_test = np.linspace(1870, 2030, 200)[:, np.newaxis]
yt_mean, yt_var = gp_regression.predict(x_test)
# standard deviation from the predictive variance
yt_sd = np.sqrt(yt_var)

# draw some samples from the posterior
# NOTE(review): squeeze(1) presumably drops a singleton output axis; confirm
# against the posterior_samples return shape of the installed library version.
samples = gp_regression.posterior_samples(x_test, size=1).squeeze(1)

# plot