Beispiel #1
0
    def initializeParameters(self):
        """Initialize per-tissue betas via logistic regression and phi from prior knowledge.

        Each tissue's beta is fit with that tissue's expression-outlier
        status as the label; phi is seeded with hand-picked prior
        probabilities whose layout depends on the noise distribution.
        """
        # Fit every tissue's beta with L2-regularized logistic regression,
        # warm-started at the current leaf estimate and shrunk toward the
        # parent beta by the tissue-specific lambda hyperparameter.
        for tissue in range(self.num_tissues):
            features = self.train_list[tissue][self.genomic_features].values
            labels = self.train_list[tissue]['expr_label'].values
            self.beta_children[tissue] = lr.sgd(features, labels,
                                                self.getBetaLeaf(tissue),
                                                self.beta_parent,
                                                self.lambda_hp_children[tissue])

        if self.e_distribution == 'noisyor':
            self.phi[0] = 0.7  # p(e = 1 | z = 1)
            self.phi[1] = 0.6  # p(e = 1 | eqtl = 1)
        else:
            self.phi[0][0] = .8  # p(e = 0 | z = 0)
            self.phi[1][0] = .2  # p(e = 1 | z = 0)
            self.phi[0][1] = .3  # p(e = 0 | z = 1)
            self.phi[1][1] = .7  # p(e = 1 | z = 1)

        # for simulation
        '''
Beispiel #2
0
 def train_tissue_specific_genome_only(self):
     """Fit a genome-only logistic model per tissue and score its data.

     For every tissue, a logistic regression is trained from scratch
     (zero-initialized beta, no transfer from a parent, lambda = 1.0)
     on that tissue's genomic features and expression labels; the
     resulting probabilities are written to the
     "tissue specific genome only" column of the train and test frames.
     """
     for t in range(self.num_tissues):
         train = self.train_list[t]
         test = self.test_list[t]
         # Fresh zero vectors per call, in case sgd mutates them in place.
         beta = lr.sgd(train[self.genomic_features].values,
                       train["expr_label"].values,
                       np.zeros(len(self.genomic_features)),
                       np.zeros(len(self.genomic_features)), 1.0)
         # Store exp(log-prob) for both splits of this tissue.
         for frame in (train, test):
             frame["tissue specific genome only"] = np.exp(
                 lr.log_prob(frame[self.genomic_features].values, beta))
Beispiel #3
0
    def train_shared_tissue_genome_only(self):
        """Fit one genome-only model on shared labels and score every tissue.

        A single logistic regression is trained on tissue 0's genomic
        features against the median expression label (presumably the same
        label in every tissue's frame -- TODO confirm against callers),
        and its probabilities are written to the
        "shared tissue genome only" column of every tissue's frames.
        """
        # Train once; zero-initialized beta with no parent transfer.
        beta = lr.sgd(self.train_list[0][self.genomic_features].values,
                      self.train_list[0]["median_expr_label"].values,
                      np.zeros(len(self.genomic_features)),
                      np.zeros(len(self.genomic_features)), 1.0)

        # Score every tissue's train and test frames with the shared beta.
        for t in range(self.num_tissues):
            for frames in (self.train_list, self.test_list):
                frames[t]["shared tissue genome only"] = np.exp(
                    lr.log_prob(frames[t][self.genomic_features].values,
                                beta))
Beispiel #4
0
    def _cross_validate(self, G, E):
        '''
            K-fold cross-validate the beta MAP estimate to find the optimal
            L2 regularization strength.

            :param G: genomic features, array-like of shape (n_samples, n_features)
            :param E: binary expression labels, array-like of shape (n_samples,)
            :return: the lambda from the candidate grid with the highest
                     mean validation AUC across folds
        '''
        # Candidate regularization strengths: log-spaced grid 1e-6 .. 1e6.
        lambda_set = np.array([
            1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3, 1e4, 1e5,
            1e6
        ])
        # Every model starts from (and is shrunk toward) a zero beta.
        beta_init = np.zeros(len(self.genomic_features))
        # AUC for each (lambda, fold) pair.
        scores_list = np.zeros((len(lambda_set), self.num_folds))
        for k in range(self.num_folds):
            # Training data: every sample except the k-th fold
            # (folds are assigned round-robin by sample index).
            training = np.array(
                [x for i, x in enumerate(G) if i % self.num_folds != k])
            training_labels = np.array(
                [x for i, x in enumerate(E) if i % self.num_folds != k])
            # Validation data: the k-th fold only.  (Fixed: the original
            # wrapped this in an extra list, producing a 3-D array that was
            # only masked by the reshape(-1) below.)
            validation = np.array(
                [x for i, x in enumerate(G) if i % self.num_folds == k])
            validation_labels = np.array(
                [x for i, x in enumerate(E) if i % self.num_folds == k])
            for j, lam in enumerate(lambda_set):
                # Train an L2-regularized logistic regression at this lambda.
                beta = lr.sgd(training, training_labels, beta_init, beta_init,
                              float(lam))
                # Score the held-out fold.
                scores = lr.log_prob(validation, beta).reshape(-1)
                # Record the AUC against the held-out labels.
                scores_list[j][k] = sklearn.metrics.roc_auc_score(
                    validation_labels, scores)
        # Average AUC across folds for each lambda and pick the best one.
        lambda_averages = np.mean(scores_list, axis=1)
        # sanity check
        assert len(lambda_averages) == len(lambda_set)
        return lambda_set[np.argmax(lambda_averages)]
Beispiel #5
0
    def initializeParameters(self):
        """Seed betas from logistic regression and phi from prior knowledge."""
        # Warm-start every tissue's beta with an L2-regularized logistic
        # regression, shrunk toward the current parent beta.
        for t in range(self.num_tissues):
            frame = self.train_list[t]
            self.beta_children[t] = lr.sgd(frame[self.genomic_features].values,
                                           frame[self.label].values,
                                           self.getBetaLeaf(t),
                                           self.beta_parent,
                                           self.lambda_hp_children[t])

        # Hand-picked prior table (presumably phi[e][z] = p(e | z), matching
        # the commented layout used elsewhere in this file -- TODO confirm).
        self.phi = np.zeros((2, 2))
        self.phi[0][0] = .8
        self.phi[1][0] = .2
        self.phi[0][1] = .3
        self.phi[1][1] = .7
Beispiel #6
0
 def _gradient_descent(self):
     for i in range(self.num_tissues):
         self.beta_children[i] = lr.sgd(
             self.train_list[i][self.genomic_features].values,
             self.train_list[i][self.model].values, self.getBetaLeaf(i),
             self.beta_parent, self.lambda_hp_children[i])
Beispiel #7
0
    def _run_bootstrap(self):
        """Estimate transfer hyperparameters from bootstrap resamples.

        For every tissue, fits L2-regularized logistic regressions on
        ``num_simulations`` bootstrap resamples of its training data, then
        uses the spread of the per-tissue betas around their average (and of
        the average around zero) to estimate precision hyperparameters.

        :return: tuple of (dict mapping tissue name -> child lambda,
                 parent lambda scalar)
        """
        # beta[i][j]: simulation i's non-intercept coefficients for tissue j.
        # Shape is (num_simulations, num_tissues, num_features - 1).
        beta = np.zeros(
            (self.num_simulations, self.num_tissues, self.num_features - 1))
        beta_init = np.zeros(self.num_features)

        # Per-simulation differences: children vs. parent, and parent vs. 0.
        delta = np.zeros(
            (self.num_simulations, self.num_tissues, self.num_features - 1))
        delta_parent = np.zeros((self.num_simulations, self.num_features - 1))

        # for each tissue
        for j in range(self.num_tissues):
            # Regularization strength chosen earlier by cross-validation.
            optimal_lambda = self.optimal_lambdas[j]
            # for each simulation
            for i in range(self.num_simulations):
                # Bootstrap-resample tissue j's training data.
                train_sample = self.bootstrap_resample(self.train_list[j])
                g = train_sample[self.genomic_features]
                expr_label = train_sample["expr_label"]
                # L2-regularized logistic regression; drop the intercept
                # (index 0) and keep only the feature coefficients.
                beta[i][j] = lr.sgd(g.values, expr_label.values, beta_init,
                                    beta_init, optimal_lambda)[1:]

        # for each simulation
        for i in range(self.num_simulations):
            # Parent beta: equally weighted average of its children.
            beta_parent = self.estimateBetaParent(beta[i],
                                                  np.ones(self.num_tissues), 1)
            # Difference between each child beta and the parent beta.
            for j in range(self.num_tissues):
                delta[i][j] = beta[i][j] - beta_parent
            delta_parent[i] = beta_parent

        # Empirical variance of the child-parent differences across all
        # simulations.  (Fixed: the original passed the leaked loop variable
        # `i + 1`, which raises NameError when num_simulations == 0.)
        lambda_inverse = self.computeEmpiricalVariance(delta,
                                                       self.num_simulations)
        # Average the feature-specific variances per tissue.
        lambda_inverse = np.mean(lambda_inverse, axis=1)

        # Empirical variance of the parent beta relative to the zero vector.
        lambda_parent_inverse = self.computeEmpiricalVarianceParent(
            delta_parent, self.num_simulations)
        # Average of the feature-specific variances.
        lambda_parent_inverse = np.sum(
            lambda_parent_inverse) / lambda_parent_inverse.shape[0]

        # Precisions are the reciprocals of the averaged variances.
        lambda_hp_children = 1.0 / lambda_inverse
        lambda_hp_parent = 1.0 / lambda_parent_inverse

        # Map each tissue name to its estimated transfer factor.
        lambda_hp_children_dict = {
            tissue: lambda_hp_children[t]
            for t, tissue in enumerate(self.tissues)
        }

        return lambda_hp_children_dict, lambda_hp_parent