def train_tissue_specific_genome_only(self):
    """Train one genome-only logistic-regression model per tissue.

    For each tissue, fits coefficients beta via ``lr.sgd`` on that
    tissue's training genomic features and ``expr_label`` targets, then
    stores P(z = 1 | g) predictions in the
    ``"tissue specific genome only"`` column of both the train and test
    data frames for that tissue.
    """
    column = "tissue specific genome only"
    n_features = len(self.genomic_features)
    for tissue in range(self.num_tissues):
        train_df = self.train_list[tissue]
        test_df = self.test_list[tissue]
        # Fit per-tissue coefficients (zero-initialized, penalty 1.0).
        beta = lr.sgd(train_df[self.genomic_features].values,
                      train_df["expr_label"].values,
                      np.zeros(n_features),
                      np.zeros(n_features),
                      1.0)
        # Store P(z = 1 | g) = exp(log-probability) on train and test.
        train_df[column] = np.exp(
            lr.log_prob(train_df[self.genomic_features].values, beta))
        test_df[column] = np.exp(
            lr.log_prob(test_df[self.genomic_features].values, beta))
def train_shared_tissue_genome_only(self):
    """Train a single genome-only model shared across all tissues.

    Fits one beta via ``lr.sgd`` on the first tissue's genomic features
    against the ``median_expr_label`` targets, then writes the resulting
    P(z = 1 | g) predictions into the ``"shared tissue genome only"``
    column of every tissue's train and test data frames.
    """
    column = "shared tissue genome only"
    n_features = len(self.genomic_features)
    # One shared model, trained on tissue 0 with the median label.
    beta = lr.sgd(self.train_list[0][self.genomic_features].values,
                  self.train_list[0]["median_expr_label"].values,
                  np.zeros(n_features),
                  np.zeros(n_features),
                  1.0)
    # Apply the shared coefficients to every tissue.
    for tissue in range(self.num_tissues):
        train_df = self.train_list[tissue]
        test_df = self.test_list[tissue]
        train_df[column] = np.exp(
            lr.log_prob(train_df[self.genomic_features].values, beta))
        test_df[column] = np.exp(
            lr.log_prob(test_df[self.genomic_features].values, beta))
def computeLikelihood(self):
    """Compute the joint log-likelihood of the hierarchical model.

    Sums three pieces:
      1. log p(beta) for the shared (root) coefficients,
      2. log p(beta^c | beta) for each tissue's child coefficients,
      3. per-tissue data log-likelihood, marginalizing the latent
         indicator z:  p(e | g) = p(e|z=0) p(z=0|g) + p(e|z=1) p(z=1|g),
         evaluated with the log-sum-exp trick and summed with
         ``np.nansum`` so NaN entries are ignored.

    :return: total log-likelihood (float).
        BUG FIX: the original computed ``ll`` but never returned it.
    """
    ll = self.log_p_beta()
    # P(beta^c | beta) for each tissue's child node.
    for i in range(self.num_tissues):
        ll += self.log_p_beta_child_given_beta(i)
    for i in range(self.num_tissues):
        try:
            # log P(z = 1 | g) and log P(z = 0 | g) under this tissue's beta.
            log_prob_z_1_g = lr.log_prob(
                self.train_list[i][self.genomic_features],
                self.getBetaLeaf(i))
            log_prob_z_0_g = np.log(1.0 - np.exp(log_prob_z_1_g))
            # log P(e | z = 1); reuse log_prob_z_1_g instead of
            # recomputing lr.log_prob a second time (original recomputed it).
            log_prob_e_z_1 = nb.log_prob(self.train_list[i]['expr_label'],
                                         1, self.phi)
            b = log_prob_e_z_1 + log_prob_z_1_g
        except Exception:
            # Best-effort: skip tissues whose data is missing/invalid,
            # preserving the original behavior but no longer swallowing
            # KeyboardInterrupt/SystemExit via a bare except.
            continue
        # z = 0 branch (reuses log_prob_z_0_g computed above).
        a = nb.log_prob(self.train_list[i]['expr_label'], 0,
                        self.phi) + log_prob_z_0_g
        # log-sum-exp over z in {0, 1} for numerical stability.
        s = np.maximum(a, b)
        unnormalized_prob = s + np.log(np.exp(a - s) + np.exp(b - s))
        ll += np.nansum(unnormalized_prob)
    return ll
def _RIVER_likelihood(e, g, beta, phi):
    """Total log-likelihood of expression labels under the RIVER model.

    Marginalizes the latent indicator z:
        p(e | g) = p(e | z=0) p(z=0 | g) + p(e | z=1) p(z=1 | g)
    using the same log-sum-exp trick as ``computeLikelihood``.

    BUG FIX: the original left a ``pdb.set_trace()`` in place (halting
    execution), kept commented-out stubs, and returned the constant 1;
    this completes the clearly intended computation.

    :param e: expression outlier labels
    :param g: genomic feature matrix
    :param beta: logistic-regression coefficients for p(z | g)
    :param phi: naive-Bayes parameters for p(e | z)
    :return: log-likelihood summed over samples (NaN entries ignored)
    """
    # log p(z = 1 | g) and log p(z = 0 | g)
    log_p_z_1_given_g = lr.log_prob(g, beta)
    log_p_z_0_given_g = np.log(1.0 - np.exp(log_p_z_1_given_g))
    # log p(e | z = 1) and log p(e | z = 0)
    log_p_e_given_z_1 = nb.log_prob(e, 1, phi)
    log_p_e_given_z_0 = nb.log_prob(e, 0, phi)
    # log-sum-exp over z in {0, 1}.
    x_1 = log_p_e_given_z_1 + log_p_z_1_given_g
    x_0 = log_p_e_given_z_0 + log_p_z_0_given_g
    m = np.maximum(x_0, x_1)
    return np.nansum(m + np.log(np.exp(x_0 - m) + np.exp(x_1 - m)))
def _cross_validate(self, G, E):
    """K-fold cross-validation to choose the L2 penalty (lambda) for the
    beta MAP estimate.

    Samples are assigned to folds by index modulo ``self.num_folds``.
    For each candidate lambda, a logistic-regression model is trained on
    the out-of-fold data and scored by ROC AUC on the held-out fold; the
    lambda with the highest mean AUC across folds is returned.

    :param G: genomic feature matrix (rows = samples)
    :param E: binary expression labels, aligned with rows of G
    :return: the optimal lambda from the candidate grid

    Fixes vs. the original: removed the no-op ``G = G; E = E``
    assignments, and built ``validation`` with a single list (the
    original wrapped it in an extra bracket, yielding a 3-D array that
    only worked because of the ``reshape(-1)`` below).
    """
    # Candidate penalties spanning 1e-6 .. 1e6 on a log scale.
    lambda_set = np.array([
        1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4,
        1e5, 1e6
    ])
    # Zero initialization for the coefficient vector.
    beta_init = np.zeros(len(self.genomic_features))
    # AUC for each (lambda, fold) pair.
    scores_list = np.zeros((len(lambda_set), self.num_folds))
    for k in range(self.num_folds):
        # Training data: every sample NOT in fold k.
        training = np.array(
            [x for i, x in enumerate(G) if i % self.num_folds != k])
        training_labels = np.array(
            [x for i, x in enumerate(E) if i % self.num_folds != k])
        # Validation data: fold k only.
        validation = np.array(
            [x for i, x in enumerate(G) if i % self.num_folds == k])
        validation_labels = np.array(
            [x for i, x in enumerate(E) if i % self.num_folds == k])
        for j in range(len(lambda_set)):
            # Train with the candidate penalty.
            beta = lr.sgd(training, training_labels, beta_init,
                          beta_init, float(lambda_set[j]))
            # Score the held-out fold; log-probabilities are monotone in
            # probability, so they are valid AUC scores.
            scores = lr.log_prob(validation, beta).reshape(-1)
            scores_list[j][k] = sklearn.metrics.roc_auc_score(
                validation_labels, scores)
    # Average AUC across folds for each lambda.
    lambda_averages = np.mean(scores_list, axis=1)
    # sanity check
    assert len(lambda_averages) == len(lambda_set)
    return lambda_set[np.argmax(lambda_averages)]
def eStepLocal(self, i, data, beta, phi):
    """E-step: posterior q(z = 1 | g, e) for tissue i.

    i : int
        tissue index
    data : pandas data frame
        core data structure containing genomic features, expression,
        updated posteriors
    beta : numpy array, 1 x M
        coefficients for genomic features
    phi : numpy array
        2 x 2 for the categorical distribution, or 1 x 2 for noisy-OR

    :return: numpy array of posterior probabilities q(z = 1).

    BUG FIX: the noisy-OR z = 0 branch indexed ``data[i]['expr_label']``,
    which selects column ``i`` of the frame rather than the expression
    labels; it now matches the z = 1 branch (``data['expr_label']``).
    """
    # log p(z | g)
    log_prob_z_1_given_g = lr.log_prob(data[self.genomic_features].values,
                                       beta)
    log_prob_z_0_given_g = np.log(1.0 - np.exp(log_prob_z_1_given_g))
    # log p(e | z, q)
    if self.e_distribution == 'noisyor':
        # noisy OR
        log_prob_e_given_z_1 = nb.log_prob_noisyor_2_params(
            data['expr_label'], 1, data["eqtl"], phi)
        log_prob_e_given_z_0 = nb.log_prob_noisyor_2_params(
            data['expr_label'], 0, data["eqtl"], phi)
    # log p(e | z)
    else:
        # naive bayes
        # NOTE(review): this branch uses self.phi while the noisy-OR
        # branch uses the phi argument — confirm that is intentional.
        log_prob_e_given_z_1 = nb.log_prob(data['expr_label'].values, 1,
                                           self.phi)
        log_prob_e_given_z_0 = nb.log_prob(data['expr_label'].values, 0,
                                           self.phi)
    # p(e|z=1) p(z=1|g) / sum_{s in {0,1}} p(z=s|g) p(e|z=s)
    log_q = log_prob_e_given_z_1 + log_prob_z_1_given_g - np.log(
        np.exp(log_prob_e_given_z_0) * np.exp(log_prob_z_0_given_g) +
        np.exp(log_prob_e_given_z_1) * np.exp(log_prob_z_1_given_g))
    return np.exp(log_q)
def eStepLocalTest(self, i, beta, phi):
    """Posterior q(z = 1 | g, e) on the held-out test set for tissue i.

    Same Bayes-rule computation as the training E-step, but evaluated on
    ``self.test_list[i]`` using the label column ``self.label``.
    """
    features = self.test_list[i][self.genomic_features].values
    labels = self.test_list[i][self.label].values
    # log P(Z = 1 | G) and log P(Z = 0 | G)
    log_pz1 = lr.log_prob(features, beta)
    log_pz0 = np.log(1.0 - np.exp(log_pz1))
    # log P(E | Z = 1) and log P(E | Z = 0)
    log_pe_z1 = nb.log_prob(labels, 1, phi)
    log_pe_z0 = nb.log_prob(labels, 0, phi)
    # Bayes rule: q = p(e|z=1) p(z=1|g) / sum_z p(e|z) p(z|g)
    evidence = (np.exp(log_pe_z0) * np.exp(log_pz0) +
                np.exp(log_pe_z1) * np.exp(log_pz1))
    log_q = log_pe_z1 + log_pz1 - np.log(evidence)
    return np.exp(log_q)
def _compute_p_z_given_g(self, beta, g):
    """Return P(z | g; beta) by exponentiating the log-probability."""
    log_p = lr.log_prob(g, beta)
    return np.exp(log_p)