def sample_params(self, n_samples=500):
    if hasattr(self.mr, 'mr_list'):
        # composite model: sample for each submodel
        sample_size_list = self.mr.compute_sample_sizes(n_samples)
        param_samples = [
            LimeTr.sampleSoln(sub_mr.lt, sample_size=ss)
            for sub_mr, ss in zip(self.mr.mr_list, sample_size_list)
        ]
        given_samples = {
            'given_beta_samples_list': [s[0] for s in param_samples],
            'given_gamma_samples_list': [s[1] for s in param_samples]
        }
    else:
        # single model: sample directly from its limetr object
        beta_samples, gamma_samples = LimeTr.sampleSoln(
            self.mr.lt, sample_size=n_samples)
        given_samples = {
            'given_beta_samples': beta_samples,
            'given_gamma_samples': gamma_samples
        }
    self.given_samples = given_samples
def get_parameter_samples(mr, n_samples=1000):
    # sample for each submodel, allocating n_samples across submodels
    sample_size_list = mr.compute_sample_sizes(n_samples)
    param_samples = [
        LimeTr.sampleSoln(sub_mr.lt, sample_size=ss)
        for sub_mr, ss in zip(mr.mr_list, sample_size_list)
    ]
    given_samples = {
        'given_beta_samples_list': [s[0] for s in param_samples],
        'given_gamma_samples_list': [s[1] for s in param_samples]
    }
    return given_samples
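# Usage sketch (not part of the original module): assumes `mr` is an
# already-fitted composite model exposing `mr_list` and
# `compute_sample_sizes`, as `get_parameter_samples` requires above.
#
#     given = get_parameter_samples(mr, n_samples=1000)
#     beta_samples_list = given['given_beta_samples_list']    # one array per submodel
#     gamma_samples_list = given['given_gamma_samples_list']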
class MR_BRT:
    def __init__(self, obs_mean, obs_std, study_sizes,
                 x_cov_list, z_cov_list,
                 spline_list=None,
                 inlier_percentage=1.0,
                 rr_random_slope=False):
        """Initialize the object and pass in the data, require
        - obs_mean: observations
        - obs_std: standard deviations for the observations
        - study_sizes: all study sizes in a list
        - x_cov_list: all x covs in a list
        - z_cov_list: all z covs in a list
        - spline_list: optional, all splines in a list
        - inlier_percentage: optional, used for trimming
        """
        # avoid a mutable default argument
        if spline_list is None:
            spline_list = []

        # pass in data
        self.obs_mean = obs_mean
        self.obs_std = obs_std
        self.study_sizes = study_sizes
        self.num_studies = len(study_sizes)
        self.num_obs = sum(study_sizes)

        # construct x and z "covariates"
        self.spline_list = spline_list
        self.x_cov_list = x_cov_list
        self.z_cov_list = z_cov_list
        (self.F, self.JF, self.F_list, self.JF_list,
         self.id_beta_list, self.id_spline_beta_list,
         self.k_beta_list) = utils.constructXCov(x_cov_list,
                                                 spline_list=spline_list)
        (self.Z, self.Z_list, self.id_gamma_list,
         self.id_spline_gamma_list,
         self.k_gamma_list) = utils.constructZCov(z_cov_list,
                                                  spline_list=spline_list)
        self.k_beta = int(sum(self.k_beta_list))
        self.k_gamma = int(sum(self.k_gamma_list))
        self.k = self.k_beta + self.k_gamma
        self.id_beta = slice(0, self.k_beta)
        self.id_gamma = slice(self.k_beta, self.k)

        # whether to use the random slope model
        self.rr_random_slope = rr_random_slope
        if rr_random_slope:
            valid_x_cov_id = [
                i for i in range(len(x_cov_list))
                if x_cov_list[i]['cov_type'] in
                ['log_ratio_spline', 'log_ratio_spline_integral']
            ]
            if len(valid_x_cov_id) == 0:
                raise Exception(
                    "Error: no suitable x cov for random slope model.")
            if len(valid_x_cov_id) >= 2:
                raise Exception(
                    "Error: multiple x covs for random slope model.")
            x_cov = x_cov_list[valid_x_cov_id[0]]
            mat = x_cov['mat']
            if x_cov['cov_type'] == 'log_ratio_spline':
                scaling = mat[0] - mat[1]
            else:
                scaling = 0.5 * (mat[0] + mat[1] - mat[2] - mat[3])
            self.Z *= scaling.reshape(scaling.size, 1)

        # create limetr object
        self.inlier_percentage = inlier_percentage
        self.lt = LimeTr(self.study_sizes, self.k_beta, self.k_gamma,
                         self.obs_mean, self.F, self.JF, self.Z,
                         self.obs_std,
                         inlier_percentage=inlier_percentage)

    def addPriors(self, prior_list):
        """Add priors to the object, require prior_list contains priors."""
        self.prior_list = prior_list
        (self.C, self.JC, self.c,
         self.H, self.JH, self.h,
         self.uprior, self.gprior, self.lprior,
         self.C_list, self.JC_list, self.c_list,
         self.H_list, self.JH_list, self.h_list,
         self.id_C_list, self.id_C_var_list,
         self.id_H_list, self.id_H_var_list,
         self.num_constraints_list,
         self.num_regularizers_list) = utils.constructPrior(prior_list, self)

        # renew uprior for gamma: keep the gamma bounds positive and ordered
        if self.uprior is None:
            self.uprior = np.array(
                [[-np.inf] * self.k_beta + [1e-7] * self.k_gamma,
                 [np.inf] * self.k])
        else:
            uprior_beta = self.uprior[:, self.id_beta]
            uprior_gamma = self.uprior[:, self.id_gamma]
            uprior_gamma[0] = np.maximum(1e-7, uprior_gamma[0])
            uprior_gamma[1] = np.maximum(uprior_gamma[0], uprior_gamma[1])
            self.uprior = np.hstack((uprior_beta, uprior_gamma))

        # patch the old limetr object, then rebuild it with the priors
        # attached (the rebuild supersedes the patched attributes; the
        # assignments keep any external reference to the old lt in sync)
        self.lt.C, self.lt.JC, self.lt.c = self.C, self.JC, self.c
        self.lt.H, self.lt.JH, self.lt.h = self.H, self.JH, self.h
        (self.lt.uprior, self.lt.gprior,
         self.lt.lprior) = (self.uprior, self.gprior, self.lprior)
        self.lt = LimeTr(self.study_sizes, self.k_beta, self.k_gamma,
                         self.obs_mean, self.F, self.JF, self.Z,
                         self.obs_std,
                         C=self.C, JC=self.JC, c=self.c,
                         H=self.H, JH=self.JH, h=self.h,
                         uprior=self.uprior, gprior=self.gprior,
                         lprior=self.lprior,
                         inlier_percentage=self.inlier_percentage)

    def fitModel(self, x0=None,
                 outer_verbose=False, outer_max_iter=100,
                 outer_step_size=1.0, outer_tol=1e-6,
                 inner_print_level=0, inner_max_iter=20):
        # initialization with gamma pinned (almost) to zero
        gamma_uprior = self.lt.uprior[:, self.lt.idx_gamma].copy()
        self.lt.uprior[:, self.lt.idx_gamma] = 1e-6
        self.lt.n = np.array([1] * self.num_obs)
        norm_z_col = np.linalg.norm(self.lt.Z, axis=0)
        self.lt.Z /= norm_z_col

        if x0 is None:
            if self.lprior is None:
                x0 = np.array([1.0] * self.k_beta + [1e-6] * self.k_gamma)
            else:
                # the laplace prior doubles the variables
                # (absolute-value reformulation)
                x0 = np.array([1.0] * self.k_beta * 2 +
                              [1e-6] * self.k_gamma * 2)
        else:
            if self.lprior is not None:
                beta0 = x0[:self.k_beta]
                gamma0 = x0[self.k_beta:self.k_beta + self.k_gamma]
                x0 = np.hstack((beta0, np.abs(beta0), gamma0, gamma0))

        (beta_0, gamma_0,
         self.w_soln) = self.lt.fitModel(x0=x0,
                                         outer_verbose=outer_verbose,
                                         outer_max_iter=outer_max_iter,
                                         outer_step_size=outer_step_size,
                                         outer_tol=outer_tol,
                                         inner_print_level=inner_print_level,
                                         inner_max_iter=inner_max_iter)
        # print("init obj", self.lt.objective(self.lt.soln))

        # fit the model from the initial point
        self.lt.uprior[:, self.lt.idx_gamma] = gamma_uprior
        self.lt.n = self.study_sizes
        if self.lprior is not None:
            x0 = np.hstack((beta_0, np.abs(beta_0), gamma_0, gamma_0))
        else:
            x0 = np.hstack((beta_0, gamma_0))
        self.lt.optimize(x0=x0,
                         print_level=inner_print_level,
                         max_iter=100)

        # undo the column normalization of Z
        self.lt.Z *= norm_z_col
        self.lt.gamma /= norm_z_col**2
        self.beta_soln = self.lt.beta
        self.gamma_soln = self.lt.gamma
        # print("final obj", self.lt.objective(self.lt.soln))
        # print("------------------------------")

    def predictData(self, pred_x_cov_list, pred_z_cov_list,
                    sample_size,
                    pred_study_sizes=None,
                    given_beta_samples=None,
                    given_gamma_samples=None,
                    ref_point=None,
                    include_random_effect=True):
        # sample solutions
        if given_beta_samples is None or given_gamma_samples is None:
            beta_samples, gamma_samples = LimeTr.sampleSoln(
                self.lt, sample_size=sample_size)
        else:
            beta_samples = given_beta_samples
            gamma_samples = given_gamma_samples

        # calculate the beta and gamma post cov
        # self.beta_samples_mean = np.mean(beta_samples, axis=0)
        # self.gamma_samples_mean = np.mean(gamma_samples, axis=0)
        # self.beta_samples_cov = \
        #     beta_samples.T.dot(beta_samples)/sample_size - \
        #     np.outer(self.beta_samples_mean, self.beta_samples_mean)
        # self.gamma_samples_cov = \
        #     gamma_samples.T.dot(gamma_samples)/sample_size - \
        #     np.outer(self.gamma_samples_mean, self.gamma_samples_mean)

        # create x cov
        (pred_F, pred_JF,
         pred_F_list, pred_JF_list,
         pred_id_beta_list) = utils.constructPredXCov(pred_x_cov_list, self)
        # create z cov
        (pred_Z, pred_Z_list,
         pred_id_gamma_list) = utils.constructPredZCov(pred_z_cov_list, self)

        # num of observations
        pred_num_obs = pred_Z.shape[0]

        # create observation samples
        y_samples = np.vstack([pred_F(beta) for beta in beta_samples])

        if ref_point is not None:
            x_cov_spline_id = [
                x_cov['spline_id'] for x_cov in pred_x_cov_list
                if 'spline' in x_cov['cov_type']
            ]
            if len(x_cov_spline_id) == 0:
                raise Exception("Error: no spline x cov")
            if len(x_cov_spline_id) >= 2:
                raise Exception("Error: multiple spline x covs")
            spline = self.spline_list[x_cov_spline_id[0]]
            ref_risk = spline.designMat(np.array([ref_point])).dot(
                beta_samples[:,
                             self.id_spline_beta_list[x_cov_spline_id[0]]].T)
            y_samples /= ref_risk.reshape(sample_size, 1)

        pred_gamma = np.hstack([
            self.gamma_soln[pred_id_gamma_list[i]]
            for i in range(len(pred_id_gamma_list))
        ])

        if include_random_effect:
            if self.rr_random_slope:
                u = np.random.randn(sample_size, self.k_gamma) * \
                    np.sqrt(self.gamma_soln)
                # zu = np.sum(pred_Z*u, axis=1)
                zu = u[:, 0]
                valid_x_cov_id = [
                    i for i in range(len(pred_x_cov_list))
                    if pred_x_cov_list[i]['cov_type'] == 'spline'
                ]
                if len(valid_x_cov_id) == 0:
                    raise Exception(
                        "Error: no suitable x cov for random slope model.")
                if len(valid_x_cov_id) >= 2:
                    raise Exception(
                        "Error: multiple x covs for random slope model.")
                mat = pred_x_cov_list[valid_x_cov_id[0]]['mat']
                if ref_point is None:
                    y_samples *= np.exp(np.outer(zu, mat - mat[0]))
                else:
                    y_samples *= np.exp(np.outer(zu, mat - ref_point))
            else:
                if pred_study_sizes is None:
                    pred_study_sizes = np.array([1] * pred_num_obs)
                else:
                    assert sum(pred_study_sizes) == pred_num_obs
                pred_num_studies = len(pred_study_sizes)
                pred_Z_sub = np.split(pred_Z,
                                      np.cumsum(pred_study_sizes)[:-1])
                u = [
                    np.random.multivariate_normal(
                        np.zeros(pred_study_sizes[i]),
                        (pred_Z_sub[i] * pred_gamma).dot(pred_Z_sub[i].T),
                        sample_size) for i in range(pred_num_studies)
                ]
                U = np.hstack(u)
                if np.any([
                        'log_ratio' in self.x_cov_list[i]['cov_type']
                        for i in range(len(self.x_cov_list))
                ]):
                    y_samples *= np.exp(U)
                else:
                    y_samples += U

        return y_samples, beta_samples, gamma_samples, pred_F, pred_Z
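# Usage sketch (not part of the original module): a minimal MR_BRT workflow,
# assuming the `x_cov_list`/`z_cov_list` entries follow the dict convention
# used above (each carries 'cov_type' and 'mat') and `prior_list` is
# understood by utils.constructPrior. All data arrays here are hypothetical.
#
#     mr = MR_BRT(obs_mean, obs_std, study_sizes,
#                 x_cov_list, z_cov_list,
#                 spline_list=spline_list,
#                 inlier_percentage=0.9)      # trim 10% of the data
#     mr.addPriors(prior_list)
#     mr.fitModel(outer_verbose=True)
#     y_samples, beta_samples, gamma_samples, pred_F, pred_Z = \
#         mr.predictData(pred_x_cov_list, pred_z_cov_list,
#                        sample_size=1000, ref_point=0.0)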
class MRBRT:
    """MR-BRT Object
    """

    def __init__(self,
                 data: MRData,
                 cov_models: List[CovModel],
                 inlier_pct: float = 1.0):
        """Constructor of MRBRT.

        Args:
            data (MRData): Data for meta-regression.
            cov_models (List[CovModel]): A list of covariate models.
            inlier_pct (float, optional): A float number between 0 and 1
                indicating the percentage of inliers.
        """
        self.data = data
        self.cov_models = cov_models
        self.inlier_pct = inlier_pct
        self.check_input()
        self.cov_model_names = [
            cov_model.name for cov_model in self.cov_models
        ]
        self.num_cov_models = len(self.cov_models)
        self.cov_names = []
        for cov_model in self.cov_models:
            self.cov_names.extend(cov_model.covs)
        self.num_covs = len(self.cov_names)

        # attach data to cov_model
        for cov_model in self.cov_models:
            cov_model.attach_data(self.data)

        # fixed effects size and index
        self.x_vars_sizes = [
            cov_model.num_x_vars for cov_model in self.cov_models
        ]
        self.x_vars_indices = utils.sizes_to_indices(self.x_vars_sizes)
        self.num_x_vars = sum(self.x_vars_sizes)

        # random effects size and index
        self.z_vars_sizes = [
            cov_model.num_z_vars for cov_model in self.cov_models
        ]
        self.z_vars_indices = utils.sizes_to_indices(self.z_vars_sizes)
        self.num_z_vars = sum(self.z_vars_sizes)
        self.num_vars = self.num_x_vars + self.num_z_vars

        # number of constraints
        self.num_constraints = sum(
            [cov_model.num_constraints for cov_model in self.cov_models])

        # number of regularizations
        self.num_regularizations = sum(
            [cov_model.num_regularizations for cov_model in self.cov_models])

        # placeholders for the limetr object and the solutions
        self.lt = None
        self.beta_soln = None
        self.gamma_soln = None
        self.u_soln = None
        self.w_soln = None
        self.re_soln = None

    def check_input(self):
        """Check the input type of the attributes.
        """
        assert isinstance(self.data, MRData)
        assert isinstance(self.cov_models, list)
        assert all(
            [isinstance(cov_model, CovModel)
             for cov_model in self.cov_models])
        assert (self.inlier_pct >= 0.0) and (self.inlier_pct <= 1.0)

    def get_cov_model(self, name: str) -> CovModel:
        """Choose covariate model by name.
        """
        index = self.get_cov_model_index(name)
        return self.cov_models[index]

    def get_cov_model_index(self, name: str) -> int:
        """From the cov_model name get the index.
        """
        matching_index = [
            index
            for index, cov_model_name in enumerate(self.cov_model_names)
            if cov_model_name == name
        ]
        num_matching_index = len(matching_index)
        assert num_matching_index == 1, \
            f"Number of matching indices is {num_matching_index}."
        return matching_index[0]

    def create_x_fun(self, data=None):
        """Create the fixed effects function, link with limetr.
        """
        data = self.data if data is None else data
        # create design functions
        design_funs = [
            cov_model.create_x_fun(data) for cov_model in self.cov_models
        ]
        funs, jac_funs = list(zip(*design_funs))

        def x_fun(beta, funs=funs):
            return sum(
                fun(beta[self.x_vars_indices[i]])
                for i, fun in enumerate(funs))

        def x_jac_fun(beta, jac_funs=jac_funs):
            return np.hstack([
                jac_fun(beta[self.x_vars_indices[i]])
                for i, jac_fun in enumerate(jac_funs)
            ])

        return x_fun, x_jac_fun

    def create_z_mat(self, data=None):
        """Create the random effects matrix, link with limetr.
        """
        data = self.data if data is None else data
        mat = np.hstack(
            [cov_model.create_z_mat(data) for cov_model in self.cov_models])
        return mat

    def create_c_mat(self):
        """Create the constraint matrices.
        """
        c_mat = np.zeros((0, self.num_vars))
        c_vec = np.zeros((2, 0))
        for i, cov_model in enumerate(self.cov_models):
            if cov_model.num_constraints != 0:
                c_mat_sub = np.zeros(
                    (cov_model.num_constraints, self.num_vars))
                (c_mat_sub[:, self.x_vars_indices[i]],
                 c_vec_sub) = cov_model.create_constraint_mat()
                c_mat = np.vstack((c_mat, c_mat_sub))
                c_vec = np.hstack((c_vec, c_vec_sub))
        return c_mat, c_vec

    def create_h_mat(self):
        """Create the regularizer matrices.
        """
        h_mat = np.zeros((0, self.num_vars))
        h_vec = np.zeros((2, 0))
        for i, cov_model in enumerate(self.cov_models):
            if cov_model.num_regularizations != 0:
                h_mat_sub = np.zeros(
                    (cov_model.num_regularizations, self.num_vars))
                (h_mat_sub[:, self.x_vars_indices[i]],
                 h_vec_sub) = cov_model.create_regularization_mat()
                h_mat = np.vstack((h_mat, h_mat_sub))
                h_vec = np.hstack((h_vec, h_vec_sub))
        return h_mat, h_vec

    def create_uprior(self):
        """Create direct uniform prior.
        """
        uprior = np.array([[-np.inf] * self.num_vars,
                           [np.inf] * self.num_vars])
        for i, cov_model in enumerate(self.cov_models):
            uprior[:, self.x_vars_indices[i]] = cov_model.prior_beta_uniform
            uprior[:, self.z_vars_indices[i] +
                   self.num_x_vars] = cov_model.prior_gamma_uniform
        return uprior

    def create_gprior(self):
        """Create direct Gaussian prior.
        """
        gprior = np.array([[0] * self.num_vars, [np.inf] * self.num_vars])
        for i, cov_model in enumerate(self.cov_models):
            gprior[:, self.x_vars_indices[i]] = cov_model.prior_beta_gaussian
            gprior[:, self.z_vars_indices[i] +
                   self.num_x_vars] = cov_model.prior_gamma_gaussian
        return gprior

    def create_lprior(self):
        """Create direct Laplace prior.
        """
        lprior = np.array([[0] * self.num_vars, [np.inf] * self.num_vars])
        for i, cov_model in enumerate(self.cov_models):
            lprior[:, self.x_vars_indices[i]] = cov_model.prior_beta_laplace
            lprior[:, self.z_vars_indices[i] +
                   self.num_x_vars] = cov_model.prior_gamma_laplace
        return lprior

    def fit_model(self, **fit_options):
        """Fit the model through limetr.

        Args:
            x0 (np.ndarray): Initial guess for the optimization problem.
            inner_print_level (int): If non-zero, print iteration
                information of the inner problem.
            inner_max_iter (int): Maximum number of inner iterations.
            inner_tol (float): Tolerance of the inner problem.
            outer_verbose (bool): If `True`, print out iteration information.
            outer_max_iter (int): Maximum number of outer iterations.
            outer_step_size (float): Step size of the outer problem.
            outer_tol (float): Tolerance of the outer problem.
            normalize_trimming_grad (bool): If `True`, normalize the
                gradient of the outer trimming problem.
        """
        # dimensions
        n = self.data.study_sizes
        k_beta = self.num_x_vars
        k_gamma = self.num_z_vars

        # data
        y = self.data.obs
        s = self.data.obs_se

        # create x fun and z mat
        x_fun, x_fun_jac = self.create_x_fun()
        z_mat = self.create_z_mat()

        # scale z_mat so every column has max absolute value 1
        z_scale = np.max(np.abs(z_mat), axis=0)
        z_mat /= z_scale

        # priors; the gamma entries are rescaled by z_scale**2 because gamma
        # is a variance: scaling a column of Z by 1/s scales its gamma by s**2
        c_mat, c_vec = self.create_c_mat()
        h_mat, h_vec = self.create_h_mat()
        c_fun, c_fun_jac = utils.mat_to_fun(c_mat)
        h_fun, h_fun_jac = utils.mat_to_fun(h_mat)
        uprior = self.create_uprior()
        uprior[:, self.num_x_vars:self.num_vars] *= z_scale**2
        gprior = self.create_gprior()
        gprior[:, self.num_x_vars:self.num_vars] *= z_scale**2
        lprior = self.create_lprior()
        lprior[:, self.num_x_vars:self.num_vars] *= z_scale**2

        # drop priors that carry no information
        if np.isneginf(uprior[0]).all() and np.isposinf(uprior[1]).all():
            uprior = None
        if np.isposinf(gprior[1]).all():
            gprior = None
        if np.isposinf(lprior[1]).all():
            lprior = None

        # create limetr object
        self.lt = LimeTr(n, k_beta, k_gamma, y, x_fun, x_fun_jac, z_mat,
                         S=s,
                         C=c_fun, JC=c_fun_jac, c=c_vec,
                         H=h_fun, JH=h_fun_jac, h=h_vec,
                         uprior=uprior, gprior=gprior, lprior=lprior,
                         inlier_percentage=self.inlier_pct)
        self.lt.fitModel(**fit_options)

        # undo the z_mat scaling on the solution and the stored priors
        self.lt.Z *= z_scale
        if hasattr(self.lt, 'gprior'):
            self.lt.gprior[:, self.lt.idx_gamma] /= z_scale**2
        if hasattr(self.lt, 'uprior'):
            self.lt.uprior[:, self.lt.idx_gamma] /= z_scale**2
        if hasattr(self.lt, 'lprior'):
            self.lt.lprior[:, self.lt.idx_gamma] /= z_scale**2
        self.lt.gamma /= z_scale**2

        self.beta_soln = self.lt.beta.copy()
        self.gamma_soln = self.lt.gamma.copy()
        self.w_soln = self.lt.w.copy()
        self.u_soln = self.lt.estimateRE()
        self.re_soln = {
            study: self.u_soln[i]
            for i, study in enumerate(self.data.studies)
        }

    def extract_re(self, study_id: np.ndarray) -> np.ndarray:
        """Extract the random effects for a given dataset.
        """
        re = np.vstack([
            self.re_soln[study]
            if study in self.re_soln else np.zeros(self.num_z_vars)
            for study in study_id
        ])
        return re

    def predict(self,
                data: MRData,
                predict_for_study: bool = False,
                sort_by_data_id: bool = False) -> np.ndarray:
        """Create new prediction with the existing solution.

        Args:
            data (MRData): MRData object containing the prediction data.
            predict_for_study (bool, optional): If `True`, use the random
                effects information to predict for a specific study. If a
                `study_id` in `data` is not contained in the fitting data,
                the corresponding random effects are assumed to be 0.
            sort_by_data_id (bool, optional): If `True`, sort the final
                prediction in the order of the original data frame used to
                create `data`. Default to False.

        Returns:
            np.ndarray: Predicted outcome array.
        """
        assert data.has_covs(
            self.cov_names
        ), "Prediction data do not have covariates used for fitting."
        x_fun, _ = self.create_x_fun(data=data)
        prediction = x_fun(self.beta_soln)
        if predict_for_study:
            z_mat = self.create_z_mat(data=data)
            re = self.extract_re(data.study_id)
            prediction += np.sum(z_mat * re, axis=1)
        if sort_by_data_id:
            prediction = prediction[np.argsort(data.data_id)]
        return prediction

    def sample_soln(self,
                    sample_size: int = 1,
                    sim_prior: bool = True,
                    sim_re: bool = True,
                    print_level: int = 0) -> Tuple[np.ndarray, np.ndarray]:
        """Sample solutions.

        Args:
            sample_size (int, optional): Number of samples.
            sim_prior (bool, optional): If `True`, simulate priors.
            sim_re (bool, optional): If `True`, simulate random effects.
            print_level (int, optional): Level of detail of the
                optimization information printed during the sampling
                process. If 0, no information will be printed out.

        Returns:
            Tuple[np.ndarray, np.ndarray]: Beta samples and gamma samples.
        """
        if self.lt is None:
            raise ValueError('Please fit the model first.')

        beta_soln_samples, gamma_soln_samples = \
            self.lt.sampleSoln(self.lt,
                               sample_size=sample_size,
                               sim_prior=sim_prior,
                               sim_re=sim_re,
                               print_level=print_level)
        return beta_soln_samples, gamma_soln_samples

    def create_draws(self,
                     data: MRData,
                     beta_samples: np.ndarray,
                     gamma_samples: np.ndarray,
                     random_study: bool = True,
                     sort_by_data_id: bool = False) -> np.ndarray:
        """Create draws for the given data set.

        Args:
            data (MRData): MRData object containing the predict data.
            beta_samples (np.ndarray): Samples of beta.
            gamma_samples (np.ndarray): Samples of gamma.
            random_study (bool, optional): If `True`, the draws will include
                uncertainty from study heterogeneity.
            sort_by_data_id (bool, optional): If `True`, sort the final
                prediction in the order of the original data frame used to
                create `data`. Default to False.

        Returns:
            np.ndarray: Outcome sample matrix.
        """
        sample_size = beta_samples.shape[0]
        assert beta_samples.shape == (sample_size, self.num_x_vars)
        assert gamma_samples.shape == (sample_size, self.num_z_vars)

        x_fun, _ = self.create_x_fun(data=data)
        z_mat = self.create_z_mat(data=data)

        y_samples = np.vstack(
            [x_fun(beta_sample) for beta_sample in beta_samples])

        if random_study:
            u_samples = np.random.randn(
                sample_size, self.num_z_vars) * np.sqrt(gamma_samples)
            y_samples += u_samples.dot(z_mat.T)
        else:
            re = self.extract_re(data.study_id)
            y_samples += np.sum(z_mat * re, axis=1)

        if sort_by_data_id:
            y_samples = y_samples[:, np.argsort(data.data_id)]
        return y_samples.T
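# Usage sketch (not part of the original module): fit, sample, and create
# draws with MRBRT. Assumes `data` and `pred_data` are MRData instances and
# `cov_models` a list of CovModel instances built elsewhere; the keyword
# values are illustrative only.
#
#     model = MRBRT(data, cov_models, inlier_pct=0.95)
#     model.fit_model(inner_max_iter=500, outer_max_iter=100)
#     beta_samples, gamma_samples = model.sample_soln(sample_size=1000)
#     draws = model.create_draws(pred_data,
#                                beta_samples, gamma_samples,
#                                random_study=True)   # shape (num_obs, 1000)
#     point_pred = model.predict(pred_data, predict_for_study=False)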
def sampleGlobalWithLimeTr(self, sample_size=100, max_iter=300):
    beta_samples, gamma_samples = LimeTr.sampleSoln(
        self.model, sample_size=sample_size, max_iter=max_iter)
    return beta_samples, gamma_samples
def optimize(self,
             var=None,
             S=None,
             trim_percentage=0.0,
             share_obs_std=True,
             fit_fixed=True,
             inner_print_level=5,
             inner_max_iter=100,
             inner_tol=1e-5,
             inner_verbose=True,
             inner_acceptable_tol=1e-4,
             inner_nlp_scaling_min_value=1e-8,
             outer_verbose=False,
             outer_max_iter=1,
             outer_step_size=1,
             outer_tol=1e-6,
             normalize_Z=False,
             build_X=True,
             random_seed=0):
    """Run optimization routine via LimeTr.

    Args:
        var (numpy.ndarray | None, optional):
            One-dimensional array that gives the initialization for the
            variables. If None, the program will first run without random
            effects to obtain a starting point.
        S (numpy.ndarray | None, optional):
            One-dimensional numpy array that gives the standard deviation
            for each measurement. The size of S should be the same as that
            of the measurements vector. If None, the standard deviation of
            the measurements will be treated as a variable and optimized.
        trim_percentage (float | 0.0, optional):
            A float that gives the percentage of datapoints to trim.
            Default is 0, i.e. no trimming.
        share_obs_std (boolean | True, optional):
            A boolean that indicates whether the model should assume data
            across studies share the same measurement standard deviation.
        fit_fixed (boolean | True, optional):
            A boolean that indicates whether to run a fit without random
            effects first in order to obtain a good starting point.
        inner_print_level (int | 5, optional):
            ``print_level`` for Ipopt.
        inner_max_iter (int | 100, optional):
            Maximum number of iterations for inner optimization.
        inner_tol (float | 1e-5, optional):
            Tolerance level for inner optimization.
        inner_verbose (boolean | True, optional):
            Verbose option for inner optimization.
        inner_acceptable_tol (float | 1e-4, optional):
            Acceptable tolerance level for inner optimization.
        inner_nlp_scaling_min_value (float | 1e-8, optional):
            Min scaling for the objective function.
        outer_verbose (boolean | False, optional):
            Verbose option for outer optimization.
        outer_max_iter (int | 1, optional):
            Maximum number of iterations for outer optimization. When there
            is no trimming, outer optimization is not needed, so the
            default is set to 1.
        outer_step_size (float | 1.0, optional):
            Step size for outer optimization. Used in trimming.
        outer_tol (float | 1e-6, optional):
            Tolerance level for outer optimization.
        normalize_Z (bool | False, optional):
            Whether to normalize the Z matrix before optimization.
        build_X (bool | True, optional):
            Whether to explicitly build and store the X matrix.
        random_seed (int | 0, optional):
            Random seed for choosing an initial point for optimization.
            If equal to 0, the initial point is chosen to be a vector
            of 0.01.
    """
    self.S = S
    self.share_obs_std = share_obs_std
    Z_norm = self.buildZ(normalize_Z)
    k = self.k_beta + self.k_gamma
    if S is None:
        if share_obs_std:
            k += 1
        else:
            k += len(self.grouping)
    print('n_groups', self.n_groups)
    print('k_beta', self.k_beta)
    print('k_gamma', self.k_gamma)
    print('total number of variables', k)

    if self.k_gamma == 0:
        self.add_re = False
        self.k_gamma = 1
        k += 1
    else:
        self.add_re = True

    # equality constraints: within each random effect, force the
    # coefficients across the non-grouping dimensions to be equal
    constraint_blocks = []
    start = self.k_beta
    for ran in self.ran_list:
        _, dims = ran
        m = np.prod(dims[self.n_grouping_dims:])
        c_block = np.zeros((m - 1, k))
        for i in range(m - 1):
            c_block[i, start + i] = 1
            c_block[i, start + i + 1] = -1
        constraint_blocks.append(c_block)
        start += m
    if len(constraint_blocks) > 0:
        self.constraints = np.vstack(constraint_blocks)
        assert self.constraints.shape[1] == k
    else:
        self.constraints = []

    C, JC, c = None, None, None
    if len(self.constraints) > 0:
        def C(x):
            return self.constraints.dot(x)

        def JC(x):
            return self.constraints

        c = np.zeros((2, self.constraints.shape[0]))

    self.uprior = np.array(
        [[-np.inf] * self.k_beta + [1e-7] * self.k_gamma +
         [1e-7] * (k - self.k_beta - self.k_gamma),
         [np.inf] * k])

    if self.global_cov_bounds is not None:
        if self.global_intercept:
            self.uprior[:, 1:len(self.global_ids) + 1] = \
                self.global_cov_bounds
        else:
            self.uprior[:, :len(self.global_ids)] = self.global_cov_bounds

    self.gprior = None
    if self.use_gprior:
        assert len(self.ran_eff_gamma_sd) == self.k_gamma
        self.gprior = np.array(
            [[0] * k,
             [np.inf] * self.k_beta + self.ran_eff_gamma_sd +
             [np.inf] * (k - self.k_beta - self.k_gamma)])

    x0 = np.ones(k) * .01
    if random_seed != 0:
        np.random.seed(random_seed)
        x0 = np.random.randn(k) * .01
    if var is not None:
        if self.add_re is True:
            assert len(var) == k
            x0 = var
        else:
            assert len(var) == self.k_beta
            x0 = np.append(var, [1e-8])
    assert len(x0) == k

    if build_X:
        self._buildX()

    if fit_fixed or self.add_re is False:
        # first fit with the random effects (almost) pinned to zero
        uprior_fixed = copy.deepcopy(self.uprior)
        uprior_fixed[:, self.k_beta:self.k_beta + self.k_gamma] = 1e-8
        if S is None or trim_percentage >= 0.01:
            model_fixed = LimeTr(self.grouping,
                                 int(self.k_beta),
                                 int(self.k_gamma),
                                 self.Y,
                                 self.X,
                                 self.JX,
                                 self.Z,
                                 S=S,
                                 C=C,
                                 JC=JC,
                                 c=c,
                                 inlier_percentage=1.0 - trim_percentage,
                                 share_obs_std=share_obs_std,
                                 uprior=uprior_fixed)
            model_fixed.optimize(
                x0=x0,
                print_level=inner_print_level,
                max_iter=inner_max_iter,
                tol=inner_tol,
                acceptable_tol=inner_acceptable_tol,
                nlp_scaling_min_value=inner_nlp_scaling_min_value)
            x0 = model_fixed.soln
            self.beta_fixed = model_fixed.beta
            if self.add_re is False:
                self.beta_soln = self.beta_fixed
                self.delta_soln = model_fixed.delta
                self.gamma_soln = model_fixed.gamma
                self.w_soln = model_fixed.w
                self.info = model_fixed.info['status_msg']
                self.yfit_no_random = model_fixed.F(model_fixed.beta)
                return
        else:
            self.beta_fixed = self._solveBeta(S)
            x0 = np.append(self.beta_fixed, [1e-8] * self.k_gamma)
            if self.add_re is False:
                self.beta_soln = self.beta_fixed
                self.yfit_no_random = self.Xm.dot(self.beta_fixed)
                return

    model = LimeTr(self.grouping,
                   int(self.k_beta),
                   int(self.k_gamma),
                   self.Y,
                   self.X,
                   self.JX,
                   self.Z,
                   S=S,
                   C=C,
                   JC=JC,
                   c=c,
                   inlier_percentage=1 - trim_percentage,
                   share_obs_std=share_obs_std,
                   uprior=self.uprior,
                   gprior=self.gprior)
    model.fitModel(x0=x0,
                   inner_print_level=inner_print_level,
                   inner_max_iter=inner_max_iter,
                   inner_acceptable_tol=inner_acceptable_tol,
                   inner_nlp_scaling_min_value=inner_nlp_scaling_min_value,
                   inner_tol=inner_tol,
                   outer_verbose=outer_verbose,
                   outer_max_iter=outer_max_iter,
                   outer_step_size=outer_step_size,
                   outer_tol=outer_tol)
    self.beta_soln = model.beta
    self.gamma_soln = model.gamma
    if normalize_Z:
        self.gamma_soln /= Z_norm**2
    self.delta_soln = model.delta
    self.info = model.info
    self.w_soln = model.w
    self.u_soln = model.estimateRE()
    self.solve_status = model.info['status']
    self.solve_status_msg = model.info['status_msg']

    self.yfit_no_random = model.F(model.beta)
    self.yfit = []
    Z_split = np.split(self.Z, self.n_groups)
    yfit_no_random_split = np.split(self.yfit_no_random, self.n_groups)
    for i in range(self.n_groups):
        self.yfit.append(yfit_no_random_split[i] +
                         Z_split[i].dot(self.u_soln[i]))
    self.yfit = np.concatenate(self.yfit)
    self.model = model
    if inner_verbose and self.solve_status != 0:
        print(self.solve_status_msg)
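# Usage sketch (not part of the original module): a typical call with 10%
# trimming. Assumes `model` is an instance of the class that owns
# `optimize` (a linear mixed-effects wrapper around LimeTr) and that `S`
# holds per-measurement standard deviations; all values are illustrative.
#
#     model.optimize(S=S,
#                    trim_percentage=0.1,
#                    outer_max_iter=100,    # trimming needs outer iterations
#                    inner_print_level=0,
#                    random_seed=123)
#     print(model.beta_soln, model.gamma_soln)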
class CWModel:
    """Cross Walk model.
    """

    def __init__(self,
                 cwdata,
                 obs_type='diff_log',
                 cov_models=None,
                 gold_dorm=None,
                 order_prior=None,
                 use_random_intercept=True,
                 prior_gamma_uniform=None,
                 prior_gamma_gaussian=None):
        """Constructor of CWModel.

        Args:
            cwdata (data.CWData): Data for cross walk.
            obs_type (str, optional): Type of observation, can only be
                chosen from `'diff_log'` and `'diff_logit'`.
            cov_models (list{crosswalk.CovModel}): A list of covariate
                models for the definitions/methods.
            gold_dorm (str | None, optional): Gold standard
                definition/method.
            order_prior (list{list{str}} | None, optional): Order priors
                between different definitions.
            use_random_intercept (bool, optional): If ``True``, use a
                random intercept.
            prior_gamma_uniform (Tuple[float, float], optional): If not
                ``None``, use it as the bounds of gamma.
            prior_gamma_gaussian (Tuple[float, float], optional): If not
                ``None``, use it as the Gaussian prior of gamma.
        """
        self.cwdata = cwdata
        self.obs_type = obs_type
        self.cov_models = utils.default_input(cov_models,
                                              [CovModel('intercept')])
        self.gold_dorm = utils.default_input(gold_dorm, cwdata.max_ref_dorm)
        self.order_prior = order_prior
        self.use_random_intercept = use_random_intercept
        if self.cwdata.num_studies == 0 and self.use_random_intercept:
            warnings.warn("Must have study_id to use random intercept."
                          " Reset use_random_intercept to False.")
            self.use_random_intercept = False

        # check input
        self.check()

        # create function for prediction
        if self.obs_type == 'diff_log':
            def obs_fun(x):
                return np.log(x)

            def obs_inv_fun(y):
                return np.exp(y)
        else:
            def obs_fun(x):
                return np.log(x / (1.0 - x))

            def obs_inv_fun(y):
                return 1.0 / (1.0 + np.exp(-y))

        self.obs_fun = obs_fun
        self.obs_inv_fun = obs_inv_fun

        # variable names
        self.vars = [dorm for dorm in self.cwdata.unique_dorms]

        # dimensions
        self.num_vars_per_dorm = sum(
            [model.num_vars for model in self.cov_models])
        self.num_vars = self.num_vars_per_dorm * self.cwdata.num_dorms

        # indices for easy access to the variables
        var_sizes = np.array([self.num_vars_per_dorm] *
                             self.cwdata.num_dorms)
        var_idx = utils.sizes_to_indices(var_sizes)
        self.var_idx = {var: var_idx[i] for i, var in enumerate(self.vars)}

        # create design matrix
        self.relation_mat = self.create_relation_mat()
        self._check_relation_mat()
        self.cov_mat = self.create_cov_mat()
        self._assert_covs_independent()
        self.design_mat = self.create_design_mat()
        self._assert_rank_efficient()
        self.constraint_mat = self.create_constraint_mat()

        # gamma bounds
        self.prior_gamma_uniform = np.array([
            0.0, np.inf
        ]) if prior_gamma_uniform is None else np.array(prior_gamma_uniform)
        if not self.use_random_intercept:
            self.prior_gamma_uniform = np.zeros(2)
        if self.prior_gamma_uniform[0] < 0.0:
            warnings.warn("Lower bound of gamma has to be non-negative, "
                          "reset it to zero.")
            self.prior_gamma_uniform[0] = 0.0

        # gamma Gaussian prior
        self.prior_gamma_gaussian = np.array([
            0.0, np.inf
        ]) if prior_gamma_gaussian is None else np.array(prior_gamma_gaussian)
        if not self.use_random_intercept:
            self.prior_gamma_gaussian = np.array([0.0, np.inf])

        # beta bounds
        uprior = np.repeat(np.array([[-np.inf], [np.inf]]),
                           self.num_vars,
                           axis=1)
        for i, cov_model in enumerate(self.cov_models):
            for dorm, prior in cov_model.prior_beta_uniform.items():
                uprior[:, self.var_idx[dorm][i]] = prior
        uprior[:, self.var_idx[self.gold_dorm]] = 0.0
        self.prior_beta_uniform = uprior

        # beta Gaussian prior
        gprior = np.repeat(np.array([[0.0], [np.inf]]),
                           self.num_vars,
                           axis=1)
        for i, cov_model in enumerate(self.cov_models):
            for dorm, prior in cov_model.prior_beta_gaussian.items():
                gprior[:, self.var_idx[dorm][i]] = prior
        gprior[:, self.var_idx[self.gold_dorm]] = np.array([[0.0], [np.inf]])
        self.prior_beta_gaussian = gprior

        # placeholders for the solutions
        self.beta = None
        self.beta_sd = None
        self.gamma = None
        self.fixed_vars = None
        self.random_vars = None

    def check(self):
        """Check input type, dimension and values.
        """
        assert isinstance(self.cwdata, data.CWData)
        assert self.obs_type in ['diff_log', 'diff_logit'], \
            "Unsupported observation type"
        assert isinstance(self.cov_models, list)
        assert all([isinstance(model, CovModel)
                    for model in self.cov_models])
        assert self.gold_dorm in self.cwdata.unique_dorms
        assert self.order_prior is None or isinstance(self.order_prior, list)

    def _assert_covs_independent(self):
        """Check if the covariates are independent.
        """
        rank = np.linalg.matrix_rank(self.cov_mat)
        if rank < self.cov_mat.shape[1]:
            raise ValueError(
                "Covariates are collinear, that is, some covariate column "
                "is a linear combination of some of the other columns. "
                "Please check them carefully.")

    def _assert_rank_efficient(self):
        """Check the rank of the design matrix.
        """
        rank = np.linalg.matrix_rank(self.design_mat)
        num_unknowns = self.num_vars_per_dorm * (self.cwdata.num_dorms - 1)
        if rank < num_unknowns:
            raise ValueError(
                f"Not enough information in the data to recover parameters. "
                f"Number of effective data points is {rank} and number of "
                f"unknowns is {num_unknowns}. Please include more effective "
                f"data or reduce the number of covariates.")

    def create_relation_mat(self, cwdata=None):
        """Create relation matrix.

        Args:
            cwdata (data.CWData | None, optional): Optional data set; if
                None, use `self.cwdata`.

        Returns:
            numpy.ndarray: Relation matrix where 1 encodes an alternative
            definition and -1 encodes a reference definition.
        """
        cwdata = utils.default_input(cwdata, default=self.cwdata)
        assert isinstance(cwdata, data.CWData)

        relation_mat = np.zeros((cwdata.num_obs, cwdata.num_dorms))
        for i, dorms in enumerate(cwdata.alt_dorms):
            for dorm in dorms:
                relation_mat[i, cwdata.dorm_idx[dorm]] += 1.0
        for i, dorms in enumerate(cwdata.ref_dorms):
            for dorm in dorms:
                relation_mat[i, cwdata.dorm_idx[dorm]] -= 1.0
        return relation_mat

    def _check_relation_mat(self):
        """Check relation matrix, detect unused dorms.
        """
        col_scales = np.max(np.abs(self.relation_mat), axis=0)
        unused_dorms = [
            self.cwdata.unique_dorms[i]
            for i, scale in enumerate(col_scales) if scale == 0.0
        ]
        if len(unused_dorms) != 0:
            raise ValueError(
                f"{unused_dorms} appears to be unused, most likely it is "
                f"(they are) in both alt_dorms and ref_dorms at the same "
                f"time for all its (their) appearance. Please remove "
                f"{unused_dorms} from alt_dorms and ref_dorms.")

    def create_cov_mat(self, cwdata=None):
        """Create covariates matrix for the definitions/methods model.

        Args:
            cwdata (data.CWData | None, optional): Optional data set; if
                None, use `self.cwdata`.

        Returns:
            numpy.ndarray: Covariates matrix.
        """
        cwdata = utils.default_input(cwdata, default=self.cwdata)
        assert isinstance(cwdata, data.CWData)

        return np.hstack(
            [model.create_design_mat(cwdata) for model in self.cov_models])

    def create_design_mat(self, cwdata=None, relation_mat=None,
                          cov_mat=None):
        """Create linear design matrix.

        Args:
            cwdata (data.CWData | None, optional): Optional data set; if
                None, use `self.cwdata`.
            relation_mat (numpy.ndarray | None, optional): Optional
                relation matrix; if None, use `self.relation_mat`.
            cov_mat (numpy.ndarray | None, optional): Optional covariates
                matrix; if None, use `self.cov_mat`.

        Returns:
            numpy.ndarray: Linear design matrix.
        """
        cwdata = utils.default_input(cwdata, default=self.cwdata)
        relation_mat = utils.default_input(relation_mat,
                                           default=self.relation_mat)
        cov_mat = utils.default_input(cov_mat, default=self.cov_mat)

        mat = (relation_mat.ravel()[:, None] *
               np.repeat(cov_mat, cwdata.num_dorms, axis=0)).reshape(
                   cwdata.num_obs, self.num_vars)
        return mat

    def create_constraint_mat(self):
        """Create constraint matrix.

        Returns:
            numpy.ndarray: Constraint matrix.
        """
        mat = np.array([]).reshape(0, self.num_vars)
        if self.order_prior is not None:
            dorm_constraint_mat = []
            cov_mat = self.cov_mat
            min_cov_mat = np.min(cov_mat, axis=0)
            max_cov_mat = np.max(cov_mat, axis=0)

            if np.allclose(min_cov_mat, max_cov_mat):
                design_mat = min_cov_mat[None, :]
            else:
                design_mat = np.vstack((min_cov_mat, max_cov_mat))
            for p in self.order_prior:
                sub_mat = np.zeros((design_mat.shape[0], self.num_vars))
                sub_mat[:, self.var_idx[p[0]]] = design_mat
                sub_mat[:, self.var_idx[p[1]]] = -design_mat
                dorm_constraint_mat.append(sub_mat)
            dorm_constraint_mat = np.vstack(dorm_constraint_mat)
            mat = np.vstack((mat, dorm_constraint_mat))

        if mat.size == 0:
            return None
        return mat

    def fit(self,
            max_iter=100,
            inlier_pct=1.0,
            outer_max_iter=100,
            outer_step_size=1.0):
        """Optimize the model parameters. This is an interface to limetr.

        Args:
            max_iter (int, optional): Maximum number of iterations.
            inlier_pct (float, optional): Percentage of the data points
                that are trusted (the rest can be trimmed).
            outer_max_iter (int, optional): Maximum number of outer
                iterations.
            outer_step_size (float, optional): Step size of the trimming
                problem; the larger the step size, the faster it converges,
                at the cost of trimming quality.
        """
        # dimensions for limetr
        n = self.cwdata.study_sizes
        if n.size == 0:
            n = np.full(self.cwdata.num_obs, 1)
        k_beta = self.num_vars
        k_gamma = 1
        y = self.cwdata.obs
        s = self.cwdata.obs_se
        x = self.design_mat
        z = np.ones((self.cwdata.num_obs, 1))

        uprior = np.hstack(
            (self.prior_beta_uniform, self.prior_gamma_uniform[:, None]))
        gprior = np.hstack(
            (self.prior_beta_gaussian, self.prior_gamma_gaussian[:, None]))

        if self.constraint_mat is None:
            cfun = None
            jcfun = None
            cvec = None
        else:
            num_constraints = self.constraint_mat.shape[0]
            cmat = np.hstack(
                (self.constraint_mat, np.zeros((num_constraints, 1))))
            cvec = np.array([[-np.inf] * num_constraints,
                             [0.0] * num_constraints])

            def cfun(var):
                return cmat.dot(var)

            def jcfun(var):
                return cmat

        def fun(var):
            return x.dot(var)

        def jfun(beta):
            return x

        self.lt = LimeTr(n,
                         k_beta,
                         k_gamma,
                         y,
                         fun,
                         jfun,
                         z,
                         S=s,
                         gprior=gprior,
                         uprior=uprior,
                         C=cfun,
                         JC=jcfun,
                         c=cvec,
                         inlier_percentage=inlier_pct)
        self.beta, self.gamma, self.w = self.lt.fitModel(
            inner_print_level=5,
            inner_max_iter=max_iter,
            outer_max_iter=outer_max_iter,
            outer_step_size=outer_step_size)

        self.fixed_vars = {
            var: self.beta[self.var_idx[var]]
            for var in self.vars
        }
        if self.use_random_intercept:
            u = self.lt.estimateRE()
            self.random_vars = {
                sid: u[i]
                for i, sid in enumerate(self.cwdata.unique_study_id)
            }
        else:
            self.random_vars = dict()

        # compute the posterior distribution of beta
        hessian = self.get_beta_hessian()
        unconstrained_id = np.hstack([
            np.arange(self.lt.k_beta)[self.var_idx[dorm]]
            for dorm in self.cwdata.unique_dorms if dorm != self.gold_dorm
        ])
        self.beta_sd = np.zeros(self.lt.k_beta)
        self.beta_sd[unconstrained_id] = np.sqrt(
            np.diag(np.linalg.inv(hessian)))

    def get_beta_hessian(self) -> np.ndarray:
        # compute the posterior distribution of beta
        x = self.lt.JF(self.lt.beta) * np.sqrt(self.lt.w)[:, None]
        z = self.lt.Z * np.sqrt(self.lt.w)[:, None]
        v = limetr.utils.VarMat(self.lt.V**self.lt.w, z, self.lt.gamma,
                                self.lt.n)

        if hasattr(self.lt, 'gprior'):
            beta_gprior_sd = self.lt.gprior[:, self.lt.idx_beta][1]
        else:
            beta_gprior_sd = np.repeat(np.inf, self.lt.k_beta)

        hessian = x.T.dot(v.invDot(x)) + np.diag(1.0 / beta_gprior_sd**2)
        # the gold dorm variables are fixed to zero, so remove them
        hessian = np.delete(hessian, self.var_idx[self.gold_dorm], axis=0)
        hessian = np.delete(hessian, self.var_idx[self.gold_dorm], axis=1)
        return hessian

    def get_cov_names(self) -> List[str]:
        # column of covariate names
        cov_names = []
        for model in self.cov_models:
            if model.spline is None:
                cov_names.append(model.cov_name)
            else:
                cov_names.extend([
                    f'{model.cov_name}_spline_{i}'
                    for i in range(model.num_vars)
                ])
        return cov_names

    def create_result_df(self) -> pd.DataFrame:
        """Create result data frame.

        Returns:
            pd.DataFrame: Data frame that contains the result.
        """
        # column of dorms
        dorms = np.repeat(self.cwdata.unique_dorms, self.num_vars_per_dorm)
        cov_names = self.get_cov_names()
        cov_names *= self.cwdata.num_dorms

        # create data frame
        df = pd.DataFrame({
            'dorms': dorms,
            'cov_names': cov_names,
            'beta': self.beta,
            'beta_sd': self.beta_sd,
        })
        if self.use_random_intercept:
            gamma = np.hstack((self.lt.gamma,
                               np.full(self.num_vars - 1, np.nan)))
            re = np.hstack(
                (self.lt.u,
                 np.full((self.cwdata.num_studies, self.num_vars - 1),
                         np.nan)))
            df['gamma'] = gamma
            for i, study_id in enumerate(self.cwdata.unique_study_id):
                df[study_id] = re[i]
        return df

    def save_result_df(self, folder: str, filename: str = 'result.csv'):
        """Save result.

        Args:
            folder (str): Path to the result folder.
            filename (str): Name of the result file. Default to
                `'result.csv'`.
        """
        if not filename.endswith('.csv'):
            filename += '.csv'
        df = self.create_result_df()
        df.to_csv(folder + '/' + filename, index=False)

    def adjust_orig_vals(self,
                         df,
                         orig_dorms,
                         orig_vals_mean,
                         orig_vals_se,
                         study_id=None,
                         data_id=None,
                         ref_dorms=None):
        """Adjust alternative values.

        Args:
            df (pd.DataFrame): Data frame of the alternative values that
                need to be adjusted.
            orig_dorms (str): Name of the column in `df` that contains the
                alternative definitions or methods.
            orig_vals_mean (str): Name of the column in `df` that contains
                the alternative values.
            orig_vals_se (str): Name of the column in `df` that contains
                the standard error of the alternative values.
            study_id (str | None, optional): If not `None`, predict with
                the random effects.
            data_id (str | None, optional): If `None`, create data_id from
                the integer sequence.
            ref_dorms (str, optional): Name of the column with reference
                dorms; if ``None``, use the gold_dorm as the reference
                dorm. Default to ``None``.

        Returns:
            pandas.DataFrame: The adjusted values and standard deviations.
        """
        df_copy = df.copy()
        if ref_dorms is None:
            ref_dorms = 'ref_dorms'
            df_copy[ref_dorms] = np.array([self.gold_dorm] *
                                          df_copy.shape[0])
        if 'intercept' not in df_copy.columns:
            df_copy['intercept'] = np.ones(df_copy.shape[0])
        new_cwdata = data.CWData(df_copy,
                                 alt_dorms=orig_dorms,
                                 ref_dorms=ref_dorms,
                                 dorm_separator=self.cwdata.dorm_separator,
                                 covs=list(self.cwdata.covs.columns),
                                 data_id=data_id,
                                 add_intercept=False)

        # transfer the dorm structure of the data to new_cwdata
        new_cwdata.copy_dorm_structure(self.cwdata)

        # create new design matrix
        new_relation_mat = self.create_relation_mat(cwdata=new_cwdata)
        new_cov_mat = self.create_cov_mat(cwdata=new_cwdata)
        new_design_mat = self.create_design_mat(
            cwdata=new_cwdata,
            relation_mat=new_relation_mat,
            cov_mat=new_cov_mat)

        # calculate the random effects
        if study_id is not None:
            random_effects = np.array([
                self.random_vars[sid][0]
                if sid in self.random_vars else 0.0
                for sid in df[study_id]
            ])
        else:
            random_effects = np.zeros(df.shape[0])
        random_effects[df[orig_dorms].values == self.gold_dorm] = 0.0

        # compute the corresponding gold_dorm value
        if self.obs_type == 'diff_log':
            transformed_orig_vals_mean, transformed_orig_vals_se = \
                utils.linear_to_log(df[orig_vals_mean].values,
                                    df[orig_vals_se].values)
        else:
            transformed_orig_vals_mean, transformed_orig_vals_se = \
                utils.linear_to_logit(df[orig_vals_mean].values,
                                      df[orig_vals_se].values)

        pred_diff_mean = new_design_mat.dot(self.beta)
        pred_diff_sd = np.sqrt(
            np.array([(new_design_mat[i]**2).dot(self.beta_sd**2)
                      if dorm != self.gold_dorm else 0.0
                      for i, dorm in enumerate(df[orig_dorms])]))

        transformed_ref_vals_mean = transformed_orig_vals_mean - \
            pred_diff_mean - random_effects
        transformed_ref_vals_sd = np.sqrt(transformed_orig_vals_se**2 +
                                          pred_diff_sd**2 +
                                          self.gamma[0]**2)
        if self.obs_type == 'diff_log':
            ref_vals_mean, ref_vals_sd = utils.log_to_linear(
                transformed_ref_vals_mean, transformed_ref_vals_sd)
        else:
            ref_vals_mean, ref_vals_sd = utils.logit_to_linear(
                transformed_ref_vals_mean, transformed_ref_vals_sd)

        pred_df = pd.DataFrame({
            'ref_vals_mean': ref_vals_mean,
            'ref_vals_sd': ref_vals_sd,
            'pred_diff_mean': pred_diff_mean,
            'pred_diff_sd': pred_diff_sd,
            'data_id': new_cwdata.data_id
        })
        return pred_df