def initializeParameters(self):
    """Initialize betas via logistic regression and phi from prior knowledge."""
    # Betas: fit one L2-regularized logistic regression per tissue, where the
    # tissue-specific expression outlier status is the label.
    for tissue_idx in range(self.num_tissues):
        self.beta_children[tissue_idx] = lr.sgd(
            self.train_list[tissue_idx][self.genomic_features].values,
            self.train_list[tissue_idx]['expr_label'].values,
            self.getBetaLeaf(tissue_idx),
            self.beta_parent,
            self.lambda_hp_children[tissue_idx])
    # Phi: hand-set from prior knowledge; layout depends on the noise model.
    if self.e_distribution == 'noisyor':
        self.phi[0] = 0.7  # p(e = 1 | z = 1)
        self.phi[1] = 0.6  # p(e = 1 | eqtl = 1)
    else:
        self.phi[0][0] = .8  # p(e = 0 | z = 0)
        self.phi[1][0] = .2  # p(e = 1 | z = 0)
        self.phi[0][1] = .3  # p(e = 0 | z = 1)
        self.phi[1][1] = .7  # p(e = 1 | z = 1)
    # for simulation '''
def train_tissue_specific_genome_only(self):
    """Train a genome-only logistic regression per tissue and score both splits.

    Stores P(outlier | genomic features) under the column
    "tissue specific genome only" in each tissue's train and test frames.
    """
    for tissue_idx in range(self.num_tissues):
        train_feats = self.train_list[tissue_idx][self.genomic_features].values
        # Flat prior: zero init, zero parent, unit regularization.
        beta = lr.sgd(train_feats,
                      self.train_list[tissue_idx]["expr_label"].values,
                      np.zeros(len(self.genomic_features)),
                      np.zeros(len(self.genomic_features)),
                      1.0)
        self.train_list[tissue_idx]["tissue specific genome only"] = np.exp(
            lr.log_prob(train_feats, beta))
        test_feats = self.test_list[tissue_idx][self.genomic_features].values
        self.test_list[tissue_idx]["tissue specific genome only"] = np.exp(
            lr.log_prob(test_feats, beta))
def train_shared_tissue_genome_only(self):
    """Train one shared genome-only model on median labels; score every tissue.

    A single logistic regression is fit on tissue 0's features against the
    "median_expr_label" column, then its predictions are written to the
    "shared tissue genome only" column of every tissue's train/test frames.
    """
    # Flat prior: zero init, zero parent, unit regularization.
    beta = lr.sgd(self.train_list[0][self.genomic_features].values,
                  self.train_list[0]["median_expr_label"].values,
                  np.zeros(len(self.genomic_features)),
                  np.zeros(len(self.genomic_features)),
                  1.0)
    for tissue_idx in range(self.num_tissues):
        train_feats = self.train_list[tissue_idx][self.genomic_features].values
        self.train_list[tissue_idx]["shared tissue genome only"] = np.exp(
            lr.log_prob(train_feats, beta))
        test_feats = self.test_list[tissue_idx][self.genomic_features].values
        self.test_list[tissue_idx]["shared tissue genome only"] = np.exp(
            lr.log_prob(test_feats, beta))
def _cross_validate(self, G, E):
    """K-fold cross-validate beta MAP estimation to find the optimal lambda.

    :param G: genomic features (array-like, one row per sample)
    :param E: expression labels (array-like, one label per sample)
    :return: the lambda from the fixed grid with the highest mean
             validation AUC across folds
    """
    # Fixed log-spaced regularization grid.
    lambda_set = np.array([
        1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6
    ])
    # initialize beta to zero
    beta_init = np.zeros(len(self.genomic_features))
    # AUC score for each (lambda, fold) pair
    scores_list = np.zeros((len(lambda_set), self.num_folds))
    for k in range(self.num_folds):
        # training data: every sample not in the k-th fold
        training = np.array(
            [x for i, x in enumerate(G) if i % self.num_folds != k])
        training_labels = np.array(
            [x for i, x in enumerate(E) if i % self.num_folds != k])
        # validation data: the k-th fold. NOTE: the original wrapped this
        # list in an extra pair of brackets, producing a spurious leading
        # axis of size 1; a plain 2-D (samples x features) array is correct.
        validation = np.array(
            [x for i, x in enumerate(G) if i % self.num_folds == k])
        validation_labels = np.array(
            [x for i, x in enumerate(E) if i % self.num_folds == k])
        for i in range(len(lambda_set)):
            # train a logistic regression model at this lambda
            beta = lr.sgd(training, training_labels, beta_init, beta_init,
                          float(lambda_set[i]))
            # predictions on the validation fold
            scores = lr.log_prob(validation, beta).reshape(-1)
            # AUC of predictions vs. validation labels
            scores_list[i][k] = sklearn.metrics.roc_auc_score(
                validation_labels, scores)
    # average across folds for each lambda
    lambda_averages = np.mean(scores_list, axis=1)
    # sanity check
    assert len(lambda_averages) == len(lambda_set)
    return lambda_set[np.argmax(lambda_averages)]
def initializeParameters(self):
    """Initialize betas via logistic regression and phi from prior knowledge."""
    # One regularized logistic regression per tissue, shrunk toward the parent.
    for tissue_idx in range(self.num_tissues):
        tissue_data = self.train_list[tissue_idx]
        self.beta_children[tissue_idx] = lr.sgd(
            tissue_data[self.genomic_features].values,
            tissue_data[self.label].values,
            self.getBetaLeaf(tissue_idx),
            self.beta_parent,
            self.lambda_hp_children[tissue_idx])
    # phi[e][z] emission table, set from prior knowledge.
    self.phi = np.zeros((2, 2))
    self.phi[0][0] = .8  # p(e = 0 | z = 0)
    self.phi[1][0] = .2  # p(e = 1 | z = 0)
    self.phi[0][1] = .3  # p(e = 0 | z = 1)
    self.phi[1][1] = .7  # p(e = 1 | z = 1)
def _gradient_descent(self):
    """Refit each tissue's beta by regularized SGD toward the parent beta."""
    for tissue_idx in range(self.num_tissues):
        tissue_data = self.train_list[tissue_idx]
        self.beta_children[tissue_idx] = lr.sgd(
            tissue_data[self.genomic_features].values,
            tissue_data[self.model].values,
            self.getBetaLeaf(tissue_idx),
            self.beta_parent,
            self.lambda_hp_children[tissue_idx])
def _run_bootstrap(self):
    """Estimate transfer-factor hyperparameters via bootstrap resampling.

    Fits an L2-regularized logistic regression on each of
    ``self.num_simulations`` bootstrap resamples per tissue, measures the
    empirical variance of (child beta - parent beta) and of the parent
    beta itself, and inverts those variances into precisions.

    :return: tuple of (dict mapping tissue name -> estimated transfer
             factor, parent transfer factor)
    """
    # beta is S x T x (M-1): S simulations, T tissues, M-1 features
    # (intercept excluded).
    beta = np.zeros(
        (self.num_simulations, self.num_tissues, self.num_features - 1))
    beta_parent = np.zeros(self.num_features - 1)
    beta_init = np.zeros(self.num_features)
    delta = np.zeros(
        (self.num_simulations, self.num_tissues, self.num_features - 1))
    delta_parent = np.zeros((self.num_simulations, self.num_features - 1))

    # For each tissue, fit one model per bootstrap resample.
    for j in range(self.num_tissues):
        optimal_lambda = self.optimal_lambdas[j]
        for i in range(self.num_simulations):
            # generate simulated dataset i for tissue j
            train_sample = self.bootstrap_resample(self.train_list[j])
            g = train_sample[self.genomic_features]
            expr_label = train_sample["expr_label"]
            # L2-regularized logistic regression; keep non-intercept terms.
            beta[i][j] = lr.sgd(g.values, expr_label.values, beta_init,
                                beta_init, optimal_lambda)[1:]

    for i in range(self.num_simulations):
        # parent beta: equally weighted average of its children
        beta_parent = self.estimateBetaParent(beta[i],
                                              np.ones(self.num_tissues), 1)
        # differences between children betas and the parent beta
        for j in range(self.num_tissues):
            delta[i][j] = beta[i][j] - beta_parent
        delta_parent[i] = beta_parent

    # Empirical variance of the child-parent differences. Hoisted out of
    # the simulation loop: only the full-sample estimate (count equal to
    # num_simulations) is ever used, so recomputing per iteration is waste.
    lambda_inverse = self.computeEmpiricalVariance(delta,
                                                   self.num_simulations)
    # average of the feature-specific variances
    lambda_inverse = np.sum(lambda_inverse, axis=1) / lambda_inverse.shape[1]
    # empirical variance of the parent beta around the zero vector
    lambda_parent_inverse = self.computeEmpiricalVarianceParent(
        delta_parent, self.num_simulations)
    lambda_parent_inverse = (np.sum(lambda_parent_inverse) /
                             lambda_parent_inverse.shape[0])

    # invert variances into precisions (transfer factors)
    lambda_hp_children = 1.0 / lambda_inverse
    lambda_hp_parent = 1.0 / lambda_parent_inverse

    # mapping from tissue name to estimated transfer factor
    lambda_hp_children_dict = {
        tissue: lambda_hp_children[i]
        for i, tissue in enumerate(self.tissues)
    }
    return lambda_hp_children_dict, lambda_hp_parent