def __init__(self, Y, Cn, G, F=None, A=None, rank=1, Cr=None):
    """
    Assemble the covariance and mean terms and initialise the parent GP.

    Args:
        Y:      [N, P] phenotype matrix
        Cn:     Limix covariance matrix for Cn (dimension P)
        G:      [N, rank_r] numpy covariance matrix for G
        F:      Sample fixed effect design (first dimension must be N)
        A:      Trait fixed effect design (second dimension must be P)
        rank:   rank of column low-rank covariance (default = 1)
        Cr:     forwarded unchanged to Cov2KronSumLR (default = None)
    """
    # type checks on the three mandatory inputs
    assert_type(Y, NP.ndarray, 'Y')
    assert_subtype(Cn, Covariance, 'Cn')
    assert_type(G, NP.ndarray, 'G')

    # covariance term first, then the mean term
    kron_covar = Cov2KronSumLR(Cn=Cn, G=G, rank=rank, Cr=Cr)
    kron_mean = MeanKronSum(Y=Y, F=F, A=A)

    # at most one fixed-effect term is supported by this GP
    single_term_msg = 'GP2KronSum supports MeanKronSum means with maximum 1 term!'
    assert kron_mean.n_terms <= 1, single_term_msg

    GP.__init__(self, covar=kron_covar, mean=kron_mean)
def define_gp(Y, Xr, mean, Ie, type):
    """
    Build a GP whose covariance is a CategoricalLR term.

    Args:
        Y:    not used by this function (kept so existing callers still work)
        Xr:   design handed to CategoricalLR; for ``type == 'null'`` only
              ``Xr.shape[0]`` is used, to size an all-ones column instead
        mean: mean term handed to GP
        Ie:   passed through to CategoricalLR
        type: 'null', 'block', 'rank1' or 'full'; selects which P x P
              covariance (P = 2) is given to CategoricalLR

    Returns:
        The GP built from the CategoricalLR covariance and ``mean``.

    Raises:
        ValueError: if ``type`` is not one of the supported values.
    """
    P = 2
    supported = ('null', 'block', 'rank1', 'full')
    # Fail early and explicitly: previously an unknown type only printed a
    # placeholder message and then crashed on the never-assigned `_Cr`.
    if type not in supported:
        raise ValueError("unknown GP type %r, expected one of %s"
                         % (type, ', '.join(supported)))

    if type == 'null':
        # fixed all-ones covariance with a tiny scale that is not optimised
        _Cr = FixedCov(sp.ones((P, P)))
        _Cr.scale = 1e-9
        _Cr.act_scale = False
        design = sp.ones((Xr.shape[0], 1))
    else:
        if type == 'block':
            _Cr = FixedCov(sp.ones((P, P)))
        elif type == 'rank1':
            _Cr = LowRankCov(P, 1)
        else:  # 'full'
            _Cr = FreeFormCov(P)
        design = Xr

    covar = CategoricalLR(_Cr, design, Ie)
    return GP(covar=covar, mean=mean)
# Interactive debugging script (uses Python 2 print statements).
# `gp`, `GP`, `sp`, `N`, `f`, `copy` and `time` are not defined here and must
# already be in scope. Every check is preceded by an ipdb breakpoint.
import ipdb
print 'Change Params covar:'
ipdb.set_trace()
# presumably `diff` reports what changes when the given callable is invoked
# -- confirm against the `diff` implementation
gp.covar.diff(gp.covar.setRandomParams)
print 'Change Params gp:'
ipdb.set_trace()
gp.diff(gp.covar.setRandomParams)
print 'Change G covar:'
ipdb.set_trace()
# second argument: random N x f matrix of 0./1. entries (draws < 0.2 -> 1.)
gp.covar.diff(gp.covar.setG, 1. * (sp.rand(N, f) < 0.2))
print 'Change G gp:'
ipdb.set_trace()
gp.diff(gp.covar.setG, 1. * (sp.rand(N, f) < 0.2))
ipdb.set_trace()
# reference GP built from deep copies of the same covariance and mean
gp0 = GP(covar=copy.deepcopy(gp.covar), mean=copy.deepcopy(gp.mean))
# time the log marginal likelihood of `gp`
t0 = time.time()
print 'GP2KronSum.LML():', gp.LML()
print 'Time elapsed:', time.time() - t0
# compare with normal gp
# assess compatibility with this GP
t0 = time.time()
print 'GP.LML():', gp0.LML()
print 'Time elapsed:', time.time() - t0
# time the gradient of the log marginal likelihood of `gp`
t0 = time.time()
print 'GP2KronSum.LML_grad():', gp.LML_grad()
print 'Time elapsed:', time.time() - t0
# local_noise_cov.scale = 0 # direct_cov.scale = 0 # cov = SumCov(noise_cov, local_noise_cov) cov = SumCov(cov, environment_cov) # fixing length scale of ZKZ and SE environment_cov.length = N_cells / 50 # environment_cov.scale=0 # environment_cov.act_length = False # local_noise_cov.length = N_cells/10.0 # local_noise_cov.act_length = False # define and optimise GP gp = GP(covar=cov, mean=mean) try: gp.optimize() except: print('optimisation', str(phen), 'failed') continue # rescale each terms to sample variance one # direct cov: unnecessary as fixed covariance rescaled before optimisation # local noise covariance tmp = covar_rescaling_factor(local_noise_cov.K()/local_noise_cov.scale) local_noise_cov.scale /= tmp # env effect tmp = covar_rescaling_factor(environment_cov.K()/environment_cov.scale**2) environment_cov.scale = environment_cov.scale**2/tmp
# debug covarianec cov = Cov2KronSumLR(Cn = Cn, G = X, rank = 1) cov.setRandomParams() pdb.set_trace() print ((cov.inv_debug()-cov.inv())**2).mean()<1e-9 print (cov.logdet_debug()-cov.logdet())**2 print (cov.logdet_grad_i_debug(0)-cov.logdet_grad_i(0))**2 if 0: t0 = time.time() print 'GP2KronSum.LML():', gp.LML() print 'Time elapsed:', time.time() - t0 # compare with normal gp # assess compatibility with this GP gp0 = GP(covar = copy.deepcopy(gp.covar), mean = copy.deepcopy(gp.mean)) t0 = time.time() print 'GP.LML():', gp0.LML() print 'Time elapsed:', time.time() - t0 if 0: pdb.set_trace() print gp.LML() - gp0.LML() print ((gp.LML_grad()['covar'] - gp0.LML_grad()['covar'])**2).mean() pdb.set_trace() gp.covar.setRandomParams() gp0.covar.setParams(gp.covar.getParams()) print gp.LML() - gp0.LML() print ((gp.LML_grad()['covar'] - gp0.LML_grad()['covar'])**2).mean() pdb.set_trace()
# Model 1: fit a GP whose covariance is the sum of all four terms and print
# the fitted parameters. `direct_cov`, `noise_cov`, `local_noise_cov`,
# `environment_cov`, `mean` and `N_cells` must be defined by preceding code.
print('model 1 : complete model ')
print('...........................................................')
# total cov and mean
cov = SumCov(direct_cov, noise_cov)
cov = SumCov(cov, local_noise_cov)
cov = SumCov(cov, environment_cov)
# fixing length scale of ZKZ and SE
environment_cov.length = N_cells / 50
# presumably act_length = False excludes the length from optimisation
# -- confirm against the covariance class
environment_cov.act_length = False
# local_noise_cov.length = N_cells/10.0
# local_noise_cov.act_length = False
# define and optimise GP
gp = GP(covar=cov, mean=mean)
gp.optimize()
# show results
print("inferred parameters ")
print("direct_scale = ", " ", direct_cov.scale)
print("noise_scale = ", " ", noise_cov.scale)
print("local_noise_scale = ", " ", local_noise_cov.scale)
print("local_noise_length = ", " ", local_noise_cov.length)
print("environment_scale = ", " ", environment_cov.scale)
print("environment_length = ", " ", environment_cov.length)
#######################################################################
# MODEL 2: no social effect
#######################################################################
print('...........................................................')
class Model(object):
    """
    Model is a general class for building and training a spatial variance
    model. Contains all the functions which are not specific to a given
    model.

    The methods below read attributes that are not set in ``__init__`` and
    must be provided by the subclass or the caller before use: ``norm``,
    ``Y``, ``X``, ``n_samples``, ``cv_ix`` and ``covar``.
    """

    def __init__(self):
        pass

    ##########################
    # Preprocessing steps
    ##########################
    def preprocess_input(self):
        """
        Normalisation of Y.

        ``self.norm`` selects the method: 'quantile' or 'std'.

        Raises:
            ValueError: if ``self.norm`` is neither of the two.
        """
        if self.norm == 'quantile':
            self.Y = utils.quantile_normalise_phenotype(self.Y)
        elif self.norm == 'std':
            self.Y = utils.normalise_phenotype(self.Y)
        else:
            raise ValueError('normalisation method not understood')

    def def_training_set(self, oos_predictions):
        """
        Define a training set and a test set for out of sample prediction.

        Sets ``self.train_set`` to a boolean mask of length
        ``self.n_samples`` (True = training sample).

        Args:
            oos_predictions: fraction of samples held out for testing.
                Without cross-validation (``self.cv_ix is None``) it must
                lie in [0, 1); 0 keeps every sample for training.
                With cross-validation it is the size of one fold, and
                ``self.cv_ix`` selects which fold is the test set.

        Raises:
            ValueError: if ``self.cv_ix`` is None and ``oos_predictions``
                is outside [0, 1).
        """
        if self.cv_ix is None:
            if not 0. <= oos_predictions < 1.:
                raise ValueError(
                    'oos_predictions out of range, should be in [0;1[')
            # explicit bool dtype so the mask stays boolean even when empty
            train_mask = np.ones(self.n_samples, dtype=bool)
            if oos_predictions > 0.:
                n_test = int(oos_predictions * self.n_samples)
                test_ix = np.random.choice(self.n_samples, n_test,
                                           replace=False)
                train_mask[test_ix] = False
            self.train_set = train_mask
        else:
            # set seed and get an index permutation and step size
            # (this reseeds numpy's global random state, so that every fold
            # sees the same permutation)
            np.random.seed(0)
            permuted_indices = np.random.permutation(self.X.shape[0])
            step_size = len(permuted_indices) * oos_predictions

            # select test set
            first_ix = int(self.cv_ix * step_size)
            last_ix = int(self.cv_ix * step_size + step_size)
            test_set = permuted_indices[first_ix:last_ix]

            # define boolean vector for train set
            train_mask = np.ones(self.n_samples, dtype=bool)
            train_mask[test_set] = False
            self.train_set = train_mask

    ##########################
    # Building Model
    ##########################
    def init_model(self, cov_terms):
        """
        General way of initialising a model: normalise the input, then
        build kinship, covariance, mean and GP, in that order.

        ``build_mean`` reads ``self.train_set``, so ``def_training_set``
        has to be called before this method.

        Args:
            cov_terms: passed on to ``build_cov``.
        """
        self.preprocess_input()
        self.build_Kinship()
        self.build_cov(cov_terms)
        self.build_mean()
        self.build_gp()

    # The following functions are specific to a given model and have to be
    # implemented in the relevant children class.
    def build_Kinship(self):
        pass

    def build_cov(self, cov_terms=None):
        # accepts `cov_terms` because init_model calls build_cov(cov_terms)
        pass

    def add_cov(self):
        pass

    def rm_cov(self):
        pass

    def build_mean(self):
        """
        General way to build the mean term of a GP model for limix, using
        only the training rows of ``self.Y``.
        """
        self.mean = MeanBase(self.Y[self.train_set, :])

    def build_gp(self):
        """Creating a limix GP object from ``self.mean`` and ``self.covar``."""
        self.gp = GP(self.mean, self.covar)

    ##########################
    # Train model
    ##########################
    def train_gp(self):
        """
        The way the model is trained is specific to the model and has to be
        implemented in the relevant classes.
        """
        pass

    ##########################
    # Prediction from model
    ##########################
    def predict(self):
        """
        Return ``self.gp.predict()``; if that fails, return a 1 x 1 array
        holding NaN instead of raising.
        """
        try:
            return self.gp.predict()
        except Exception:
            # best-effort fallback; KeyboardInterrupt/SystemExit propagate
            return np.array([[np.nan]])

    def r2(self):
        """
        Return 1 - (residual sum of squares / total sum of squares) of the
        first column of the prediction against the held-out rows of
        ``self.Y``.
        """
        Y_pred = self.predict()[:, 0]
        Y_true = self.Y[:, 0][~self.train_set]
        res = ((Y_true - Y_pred)**2.).sum()
        var = ((Y_true - Y_true.mean())**2.).sum()
        return 1. - res/var

    def pred(self):
        """
        Return a two-column array: held-out true values (column 0) next to
        their predictions (column 1).
        """
        Y_pred = self.predict()[:, 0]
        Y_true = self.Y[:, 0][~self.train_set]
        return np.concatenate((Y_true[:, None], Y_pred[:, None]), axis=1)
def build_gp(self):
    """Create the GP object from the stored mean and covariance terms."""
    gp_factory = GP
    mean_term = self.mean
    covar_term = self.covar
    # positional order (mean, covar) is kept exactly as before
    self.gp = gp_factory(mean_term, covar_term)
Y = sp.sin(X) + sp.sqrt(v_noise) * sp.randn(N, 1) # for out-of-sample preditions Xstar = sp.linspace(0,2,1000)[:,sp.newaxis] # define mean term W = 1. * (sp.rand(N, 2) < 0.2) mean = lin_mean(Y, W) # define covariance matrices sqexp = SQExpCov(X, Xstar = Xstar) noise = FixedCov(sp.eye(N)) covar = SumCov(sqexp, noise) # define gp gp = GP(covar=covar,mean=mean) # initialize params sqexp.scale = 1e-4 sqexp.length = 1 noise.scale = Y.var() # optimize gp.optimize(calc_ste=True) # predict out-of-sample Ystar = gp.predict() # print optimized values and standard errors print('weights of fixed effects') print(mean.b[0, 0], '+/-', mean.b_ste[0, 0]) print(mean.b[1, 0], '+/-', mean.b_ste[1, 0]) print('scale of sqexp') print(sqexp.scale, '+/-', sqexp.scale_ste)
def run_individual_model(model, expression_file, position_file, output_directory,
                         permute_positions=False, random_start_point=False):
    """
    Fit one GP per phenotype column and write the fitted parameters and
    log marginal likelihoods to text files.

    For each phenotype the covariance is noise + local noise (SQExp) +
    environment (ZKZ), plus a direct (kinship) term when ``model == 'full'``.
    The kinship is built from all the *other* phenotype columns.

    Args:
        model: 'full' or 'env'.
        expression_file: space-delimited text file; the first line holds the
            protein names, the remaining lines the expression values.
        position_file: comma-delimited text file of cell positions, one row
            per cell.
        output_directory: directory that receives
            ``inferred_parameters_<model>[_permuted].txt`` and the matching
            ``..._loglik`` file.
        permute_positions: if True, shuffle the rows of the position matrix.
        random_start_point: if True, draw the initial environment length
            and scale at random instead of using length 200.

    Raises:
        ValueError: if ``model`` is not 'full' or 'env', or if the position
            and expression files disagree on the number of cells.
    """
    rm_diag = True

    # compare by value: identity (`is not`) on string literals is unreliable
    if model not in ('full', 'env'):
        raise ValueError('model not understood. Please specify a model between full and env')

    # read phenotypes data
    with open(expression_file, 'r') as f:
        header_line = f.readline()
    protein_names = header_line.split(' ')
    # drop the trailing newline of the last name, only if there is one
    protein_names[-1] = protein_names[-1].rstrip('\n')
    protein_names = np.reshape(protein_names, [len(protein_names), 1])
    phenotypes = np.loadtxt(expression_file, delimiter=' ', skiprows=1)

    # read position data
    X = np.genfromtxt(position_file, delimiter=',')
    if permute_positions:
        X = X[np.random.permutation(X.shape[0]), :]

    if X.shape[0] != phenotypes.shape[0]:
        raise ValueError('cell number inconsistent between position and epression levels ')

    # define output file name
    output_file = output_directory+'/inferred_parameters_' + model
    output_file += '_permuted.txt' if permute_positions else '.txt'

    N_cells = phenotypes.shape[0]
    n_phenotypes = phenotypes.shape[1]

    # rows of failed optimisations keep their initial zeros
    parameters = np.zeros([n_phenotypes, 6])
    log_lik = np.zeros(n_phenotypes)

    for phen in range(n_phenotypes):
        # NOTE: this is a column view, so the in-place standardisation below
        # also updates `phenotypes` itself
        phenotype = phenotypes[:, phen]
        phenotype -= phenotype.mean()
        phenotype /= phenotype.std()
        phenotype = np.reshape(phenotype, [N_cells, 1])

        # kinship from every other phenotype, shifted by its smallest
        # eigenvalue and then rescaled
        phenotypes_tmp = np.delete(phenotypes, phen, axis=1)
        phenotypes_tmp = normalise(phenotypes_tmp)
        Kinship = phenotypes_tmp.dot(phenotypes_tmp.transpose())
        Kinship -= np.linalg.eigvalsh(Kinship).min() * np.eye(N_cells)
        Kinship *= covar_rescaling_factor(Kinship)

        # create all the covariance terms
        direct_cov = FixedCov(Kinship)

        # noise
        noise_cov = FixedCov(np.eye(N_cells))

        # local_noise
        local_noise_cov = SQExpCov(X)
        local_noise_cov.length = 100
        local_noise_cov.act_length = False

        # environment effect
        environment_cov = ZKZCov(X, Kinship, rm_diag)

        # mean term
        mean = MeanBase(phenotype)

        ###################################################################
        # defining model
        ###################################################################
        cov = SumCov(noise_cov, local_noise_cov)
        cov = SumCov(cov, environment_cov)

        if random_start_point:
            environment_cov.length = np.random.uniform(10, 300)
            environment_cov.scale = np.random.uniform(1, 15)
        else:
            environment_cov.length = 200

        if model == 'full':
            cov = SumCov(cov, direct_cov)
        else:
            # the direct term is not in the model; report its scale as 0
            direct_cov.scale = 0

        # define and optimise GP
        gp = GP(covar=cov, mean=mean)
        try:
            gp.optimize()
        except Exception:
            # best effort: skip this phenotype, but let KeyboardInterrupt
            # and SystemExit propagate
            print('optimisation', str(phen), 'failed')
            continue

        log_lik[phen] = gp.LML()

        # rescale each term to sample variance one
        # direct cov: unnecessary as fixed covariance rescaled before optimisation
        # local noise covariance
        tmp = covar_rescaling_factor(local_noise_cov.K()/local_noise_cov.scale)
        local_noise_cov.scale /= tmp
        # env effect
        tmp = covar_rescaling_factor(environment_cov.K()/environment_cov.scale**2)
        environment_cov.scale = environment_cov.scale**2/tmp

        parameters[phen, :] = [direct_cov.scale,
                               noise_cov.scale,
                               local_noise_cov.scale,
                               local_noise_cov.length,
                               environment_cov.scale,
                               environment_cov.length]

    result_header = ' '.join(['direct_scale',
                              'noise_scale',
                              'local_noise_scale',
                              'local_noise_length',
                              'environment_scale',
                              'environment_length'])

    with open(output_file, 'w') as f:
        np.savetxt(f,
                   np.hstack((protein_names, parameters)),
                   delimiter=' ',
                   header=result_header,
                   fmt='%s',
                   comments='')

    log_lik_file = output_file + '_loglik'
    with open(log_lik_file, 'w') as f:
        np.savetxt(f, log_lik)