def fitPairwiseModel(Y,XX=None,S_XX=None,U_XX=None,verbose=False):
    """
    Initialise P x P covariance parameters from pairwise two-trait fits.

    A single-trait fit (``fitSingleTraitModel``) supplies ``RV['varST']``,
    whose columns 0 and 1 are used as the per-trait variances of the ``Cg``
    and ``Cn`` terms respectively.  Every trait pair (p1, p2) with p2 < p1 is
    then fitted with a two-trait model to obtain the ``Cg``/``Cn``
    correlations.

    Args:
        Y:       2-D phenotype array; the second axis indexes the P traits.
        XX, S_XX, U_XX: forwarded unchanged to ``fitSingleTraitModel`` and
                 ``gp2kronSum``.
        verbose: if True, print one progress line per fitted pair.

    Returns:
        The dictionary returned by ``fitSingleTraitModel`` extended with
        ``Cg0``/``Cn0`` (pairwise covariance estimates), ``conv2`` (the first
        value returned by ``OPT.opt_hyper`` for each pair), ``Cg0_reg``/
        ``Cn0_reg`` (diagonally shifted versions) and ``params0_Cg``/
        ``params0_Cn`` (lower-triangular Cholesky entries of the shifted
        matrices).
    """
    P = Y.shape[1]

    RV = fitSingleTraitModel(Y,XX=XX,S_XX=S_XX,U_XX=U_XX,verbose=verbose)
    Cg = covariance.freeform(2)
    Cn = covariance.freeform(2)
    gp = gp2kronSum(mean(Y[:,0:2]),Cg,Cn,XX=XX,S_XX=S_XX,U_XX=U_XX)

    conv2 = SP.ones((P,P),dtype=bool)
    rho_g = SP.ones((P,P))
    rho_n = SP.ones((P,P))
    for p1 in range(P):
        for p2 in range(p1):
            if verbose:
                # Parenthesised single argument: valid on Python 2 and 3.
                print('.. fitting correlation (%d,%d)'%(p1,p2))
            gp.setY(Y[:,[p1,p2]])
            # Start from the single-trait standard deviations with a tiny
            # random off-diagonal entry.
            Cg_params0 = SP.array([SP.sqrt(RV['varST'][p1,0]),1e-6*SP.randn(),SP.sqrt(RV['varST'][p2,0])])
            Cn_params0 = SP.array([SP.sqrt(RV['varST'][p1,1]),1e-6*SP.randn(),SP.sqrt(RV['varST'][p2,1])])
            params0 = {'Cg':Cg_params0,'Cn':Cn_params0}
            conv2[p1,p2] = OPT.opt_hyper(gp,params0,factr=1e3)[0]
            rho_g[p1,p2] = Cg.K()[0,1]/SP.sqrt(Cg.K().diagonal().prod())
            rho_n[p1,p2] = Cn.K()[0,1]/SP.sqrt(Cn.K().diagonal().prod())
            # Mirror the result into the upper triangle.
            conv2[p2,p1] = conv2[p1,p2]
            rho_g[p2,p1] = rho_g[p1,p2]
            rho_n[p2,p1] = rho_n[p1,p2]

    # Correlation * outer product of standard deviations.
    RV['Cg0'] = rho_g*SP.dot(SP.sqrt(RV['varST'][:,0:1]),SP.sqrt(RV['varST'][:,0:1].T))
    RV['Cn0'] = rho_n*SP.dot(SP.sqrt(RV['varST'][:,1:2]),SP.sqrt(RV['varST'][:,1:2].T))
    RV['conv2'] = conv2

    # Shift the spectrum so the smallest eigenvalue is at least 1e-4.
    offset_g = abs(SP.minimum(LA.eigh(RV['Cg0'])[0].min(),0))+1e-4
    offset_n = abs(SP.minimum(LA.eigh(RV['Cn0'])[0].min(),0))+1e-4
    RV['Cg0_reg'] = RV['Cg0']+offset_g*SP.eye(P)
    RV['Cn0_reg'] = RV['Cn0']+offset_n*SP.eye(P)

    # The lower factor is requested explicitly: the entries are read through
    # tril_indices, and an upper factor would contribute only its diagonal.
    RV['params0_Cg'] = LA.cholesky(RV['Cg0_reg'],lower=True)[SP.tril_indices(P)]
    RV['params0_Cn'] = LA.cholesky(RV['Cn0_reg'],lower=True)[SP.tril_indices(P)]
    return RV
def f1(x):
    """
    Weighted squared distance between the lower triangle of ``C.K()`` and
    ``SSS``, together with its gradient.

    ``C``, ``SSS`` and ``Hi`` come from the enclosing scope.  ``x`` is
    written into ``C`` via ``C.setParams`` before anything is evaluated.

    Returns:
        (val, grad): ``val = delta . (Hi delta)`` with
        ``delta = tril(C.K()) - SSS``; ``grad`` holds
        ``2 * delta . (Hi d tril(K)/dx_k)`` for k = 0, 1, 2.
    """
    C.setParams(x)
    lower = sp.tril_indices(2)
    residual = C.K()[lower] - SSS
    val = (residual * sp.dot(Hi, residual)).sum()
    partials = [C.K_grad_i(k)[lower] for k in range(3)]
    grad = 2 * sp.array([(residual * sp.dot(Hi, db)).sum() for db in partials])
    return val, grad
def f1(x):
    """
    Objective and gradient for matching ``C.K()`` to the target ``SSS``.

    Uses ``C``, ``SSS`` and ``Hi`` from the enclosing scope and updates
    ``C`` in place through ``C.setParams(x)``.

    Returns:
        val:  ``delta . (Hi delta)`` where ``delta`` is the lower triangle of
              ``C.K()`` minus ``SSS``.
        grad: length-3 array, entry k being
              ``2 * delta . (Hi dK_k)`` with ``dK_k`` the lower triangle of
              ``C.K_grad_i(k)``.
    """
    C.setParams(x)
    tril = sp.tril_indices(2)
    delta = C.K()[tril] - SSS
    val = (delta * sp.dot(Hi, delta)).sum()

    terms = []
    for k in (0, 1, 2):
        db_dxk = C.K_grad_i(k)[tril]
        terms.append((delta * sp.dot(Hi, db_dxk)).sum())
    grad = 2 * sp.array(terms)
    return val, grad
def _updateLgrad(self,i):
    """
    Write a one-hot pattern for parameter ``i`` into ``self.Lgrad``.

    ``self.zeros`` is used as a scratch vector: entry ``i`` is set to 1,
    the vector is copied onto the lower triangle of ``self.Lgrad``
    (``tril_indices`` ordering), and entry ``i`` is reset to 0.
    """
    rows, cols = sp.tril_indices(self.dim)
    self.zeros[i] = 1
    self.Lgrad[rows, cols] = self.zeros
    self.zeros[i] = 0
def _updateLgrad(self, i):
    """
    Fill ``self.Lgrad`` with the indicator of the ``i``-th lower-triangular
    entry.

    ``self.zeros`` serves as a temporary one-hot vector: its ``i``-th entry
    is raised to 1, copied onto the lower triangle of ``self.Lgrad``, and
    then cleared again.
    """
    lower_r, lower_c = SP.tril_indices(self.P)
    self.zeros[i] = 1
    self.Lgrad[lower_r, lower_c] = self.zeros
    self.zeros[i] = 0
def K_grad_interParam_i(self, i):
    """
    Return a ``dim x dim`` matrix that is 1 at the ``i``-th lower-triangular
    position (``tril_indices`` ordering) and at its mirror image, and 0
    elsewhere.
    """
    rows, cols = sp.tril_indices(self.dim)
    r, c = rows[i], cols[i]
    grad = sp.zeros((self.dim, self.dim))
    grad[r, c] = 1
    grad[c, r] = 1
    return grad
def K_ste(self):
    """
    Arrange the square roots of the diagonal of ``self.getFIinv()`` as a
    symmetric ``dim x dim`` matrix.

    Returns None when ``self.getFIinv()`` is None.  Otherwise the values
    are placed on the lower triangle in ``tril_indices`` order and mirrored
    onto the upper triangle.
    """
    if self.getFIinv() is None:
        return None
    lower = sp.zeros((self.dim, self.dim))
    lower[sp.tril_indices(self.dim)] = sp.sqrt(self.getFIinv().diagonal())
    # Mirror; the diagonal would otherwise be counted twice.
    return lower + lower.T - sp.diag(lower.diagonal())
def getInterParams(self):
    """
    Return the lower-triangular entries of ``self.K()`` (diagonal included)
    as a flat array in ``tril_indices`` order.
    """
    # Covariance parametrisation.  An alternative variance + correlation
    # parametrisation was sketched here but is not in use:
    #   concatenate([self.variance,
    #                self.correlation[tril_indices(self.dim, k=-1)]])
    rows, cols = sp.tril_indices(self.dim)
    return self.K()[rows, cols]
def _initParams(self,init_method=None):
    """
    Build the initial parameter dictionary and the optimisation filter.

    Args:
        init_method: only consulted when ``self.bgRE`` is true and
            ``self.P > 1``; ``'pairwise'`` uses ``fitPairwiseModel``,
            ``'random'`` draws standard-normal parameters, anything else
            uses the Cholesky factor of half the sample covariance of
            ``self.Y``.

    Returns:
        (params0, Ifilter).  ``params0`` has keys ``'Cg'``/``'Cn'`` when
        ``self.bgRE`` is true and ``'Cr'``/``'Cn'`` otherwise, plus
        ``'mean'`` when ``self.mean.F`` is set and ``self.bgRE`` is true.
        ``Ifilter`` is None when ``self.bgRE`` is true; otherwise it is a
        dict of boolean masks (all False for ``'Cr'``, all True for
        ``'Cn'``).

    Raises:
        ValueError: if ``self.P > 1`` and ``self.colCovarType`` is not
            ``'freeform'``.  No other initialisation is implemented; this
            previously surfaced as an undefined-variable error.
    """
    if self.P==1:
        if self.bgRE:
            params0 = {'Cg':SP.sqrt(0.5)*SP.ones(1),'Cn':SP.sqrt(0.5)*SP.ones(1)}
            Ifilter = None
        else:
            params0 = {'Cr':1e-9*SP.ones(1),'Cn':SP.ones(1)}
            Ifilter = {'Cr':SP.zeros(1,dtype=bool),'Cn':SP.ones(1,dtype=bool)}
    else:
        if self.colCovarType!='freeform':
            # A low-rank initialisation was sketched here in the past but
            # never enabled.
            raise ValueError("no parameter initialisation for colCovarType %r" % (self.colCovarType,))
        if self.bgRE:
            if init_method=='pairwise':
                _RV = fitPairwiseModel(self.Y,XX=self.XX,S_XX=self.S_XX,U_XX=self.U_XX,verbose=False)
                params0 = {'Cg':_RV['params0_Cg'],'Cn':_RV['params0_Cn']}
            elif init_method=='random':
                params0 = {'Cg':SP.randn(self.Cg.getNumberParams()),'Cn':SP.randn(self.Cn.getNumberParams())}
            else:
                # Split the sample covariance evenly between Cg and Cn.
                cov = 0.5*SP.cov(self.Y.T)+1e-4*SP.eye(self.P)
                chol = LA.cholesky(cov,lower=True)
                params = chol[SP.tril_indices(self.P)]
                params0 = {'Cg':params.copy(),'Cn':params.copy()}
            Ifilter = None
        else:
            cov = SP.cov(self.Y.T)+1e-4*SP.eye(self.P)
            chol = LA.cholesky(cov,lower=True)
            params = chol[SP.tril_indices(self.P)]
            params0 = {'Cr':1e-9*SP.ones(self.P),'Cn':params}
            Ifilter = {'Cr':SP.zeros(self.P,dtype=bool),
                       'Cn':SP.ones(params.shape[0],dtype=bool)}
    if self.mean.F is not None and self.bgRE:
        params0['mean'] = 1e-6*SP.randn(self.mean.getParams().shape[0])
        if Ifilter is not None:
            Ifilter['mean'] = SP.ones(self.mean.getParams().shape[0],dtype=bool)
    return params0,Ifilter
def fitPairwiseModel(Y, XX=None, S_XX=None, U_XX=None, verbose=False):
    """
    Initialise P x P covariance parameters from pairwise two-trait fits.

    A single-trait fit (``fitSingleTraitModel``) supplies ``RV['varST']``,
    whose columns 0 and 1 are used as the per-trait variances of the ``Cg``
    and ``Cn`` terms respectively.  Every trait pair (p1, p2) with p2 < p1 is
    then fitted with a two-trait model to obtain the ``Cg``/``Cn``
    correlations.

    Args:
        Y:       2-D phenotype array; the second axis indexes the P traits.
        XX, S_XX, U_XX: forwarded unchanged to ``fitSingleTraitModel`` and
                 ``gp2kronSum``.
        verbose: if True, print one progress line per fitted pair.

    Returns:
        The dictionary returned by ``fitSingleTraitModel`` extended with
        ``Cg0``/``Cn0`` (pairwise covariance estimates), ``conv2`` (the first
        value returned by ``OPT.opt_hyper`` for each pair), ``Cg0_reg``/
        ``Cn0_reg`` (diagonally shifted versions) and ``params0_Cg``/
        ``params0_Cn`` (lower-triangular Cholesky entries of the shifted
        matrices).
    """
    P = Y.shape[1]

    RV = fitSingleTraitModel(Y, XX=XX, S_XX=S_XX, U_XX=U_XX, verbose=verbose)
    Cg = covariance.freeform(2)
    Cn = covariance.freeform(2)
    gp = gp2kronSum(mean(Y[:, 0:2]), Cg, Cn, XX=XX, S_XX=S_XX, U_XX=U_XX)

    conv2 = SP.ones((P, P), dtype=bool)
    rho_g = SP.ones((P, P))
    rho_n = SP.ones((P, P))
    for p1 in range(P):
        for p2 in range(p1):
            if verbose:
                # Parenthesised single argument: valid on Python 2 and 3.
                print('.. fitting correlation (%d,%d)' % (p1, p2))
            gp.setY(Y[:, [p1, p2]])
            # Start from the single-trait standard deviations with a tiny
            # random off-diagonal entry.
            Cg_params0 = SP.array([
                SP.sqrt(RV['varST'][p1, 0]), 1e-6 * SP.randn(),
                SP.sqrt(RV['varST'][p2, 0])
            ])
            Cn_params0 = SP.array([
                SP.sqrt(RV['varST'][p1, 1]), 1e-6 * SP.randn(),
                SP.sqrt(RV['varST'][p2, 1])
            ])
            params0 = {'Cg': Cg_params0, 'Cn': Cn_params0}
            conv2[p1, p2] = OPT.opt_hyper(gp, params0, factr=1e3)[0]
            rho_g[p1, p2] = Cg.K()[0, 1] / SP.sqrt(Cg.K().diagonal().prod())
            rho_n[p1, p2] = Cn.K()[0, 1] / SP.sqrt(Cn.K().diagonal().prod())
            # Mirror the result into the upper triangle.
            conv2[p2, p1] = conv2[p1, p2]
            rho_g[p2, p1] = rho_g[p1, p2]
            rho_n[p2, p1] = rho_n[p1, p2]

    # Correlation * outer product of standard deviations.
    RV['Cg0'] = rho_g * SP.dot(SP.sqrt(RV['varST'][:, 0:1]),
                               SP.sqrt(RV['varST'][:, 0:1].T))
    RV['Cn0'] = rho_n * SP.dot(SP.sqrt(RV['varST'][:, 1:2]),
                               SP.sqrt(RV['varST'][:, 1:2].T))
    RV['conv2'] = conv2

    # Shift the spectrum so the smallest eigenvalue is at least 1e-4.
    offset_g = abs(SP.minimum(LA.eigh(RV['Cg0'])[0].min(), 0)) + 1e-4
    offset_n = abs(SP.minimum(LA.eigh(RV['Cn0'])[0].min(), 0)) + 1e-4
    RV['Cg0_reg'] = RV['Cg0'] + offset_g * SP.eye(P)
    RV['Cn0_reg'] = RV['Cn0'] + offset_n * SP.eye(P)

    # The lower factor is requested explicitly: the entries are read through
    # tril_indices, and an upper factor would contribute only its diagonal.
    RV['params0_Cg'] = LA.cholesky(RV['Cg0_reg'], lower=True)[SP.tril_indices(P)]
    RV['params0_Cn'] = LA.cholesky(RV['Cn0_reg'], lower=True)[SP.tril_indices(P)]
    return RV
def adjacency(self, min_snp2gene_obs=2, fdr_cutoff=0.3, return_genes=False):
    '''
        Return a matrix showing the number of shared HPO genes by Term.
        The diagonal of the matrix is the number of genes discovered by
        that term. The upper triangle shows the overlap between the row
        and column terms and the lower triangle shows the hypergeometric
        p-value for that overlap. The universe used is the number of
        unique genes in the overlap results.

        min_snp2gene_obs : int (default: 2)
            The minimum number of SNP2gene mapping observations needed
            to be HPO
        fdr_cutoff : float (default: 0.3)
            The FDR cutoff to be considered HPO
        return_genes : bool (default: False)
            Return the candidate gene list instead of the overlap table.
            The result then has one row per pair of distinct terms that
            share at least one gene, with columns Term1, Term2,
            num_common, pval and common (comma-joined, sorted gene ids).
    '''
    hpo = self.high_priority_candidates(
        fdr_cutoff=fdr_cutoff,
        min_snp2gene_obs=min_snp2gene_obs,
        original_COB_only=True)
    genes_by_term = {term: set(group.gene) for term, group in hpo.groupby('Term')}
    # Sorted so that "first term <= second term" matches the label order
    # pivot_table produces; the triangular masks below rely on it.
    terms = sorted(genes_by_term)
    num_universe = len(self.results.gene.unique())

    rows = []
    for i, a in enumerate(terms):
        for b in terms[i:]:
            common = genes_by_term[a] & genes_by_term[b]
            num_common = len(common)
            if a != b:
                pval = hypergeom.sf(
                    num_common - 1, num_universe,
                    len(genes_by_term[a]), len(genes_by_term[b]))
            else:
                # The diagonal carries the number of HPO genes of the
                # term, not a p-value.
                pval = len(genes_by_term[a])
            rows.append((a, b, num_common, pval, ','.join(sorted(common))))
    adj = pd.DataFrame(
        rows, columns=['Term1', 'Term2', 'num_common', 'pval', 'common'])

    # Stop early if we just want to return the lists.
    if return_genes:
        adj = adj[adj.num_common > 0]
        adj = adj[np.logical_not(adj.Term1 == adj.Term2)]
        return adj.drop_duplicates()

    overlap = pd.pivot_table(adj, index='Term1', columns='Term2', values='num_common')
    pvals = pd.pivot_table(adj, index='Term2', columns='Term1', values='pval')
    # Mask on private float copies: DataFrame.values is not guaranteed to
    # be a writable view of the frame.
    overlap_vals = np.array(overlap.values, dtype=float)
    overlap_vals[tril_indices(len(overlap))] = 0   # keep strict upper triangle
    pval_vals = np.array(pvals.values, dtype=float)
    pval_vals[triu_indices(len(pvals), 1)] = 0     # keep lower triangle + diagonal
    overlap = pd.DataFrame(overlap_vals, index=overlap.index, columns=overlap.columns)
    pvals = pd.DataFrame(pval_vals, index=pvals.index, columns=pvals.columns)
    return (overlap + pvals).astype(float)
def _initParams(self, init_method=None):
    """
    Return the initial parameter dictionary ``{'covar': ...}``.

    With ``self.bgRE`` true the vector is either random
    (``init_method == 'random'``), ``sqrt(0.5) * ones(2)`` for a single
    trait, or two concatenated copies of the lower-triangular Cholesky
    entries of ``0.5 * cov(Y) + 1e-4 * I``.  With ``self.bgRE`` false it is
    ``[1.]`` for a single trait, otherwise the lower-triangular Cholesky
    entries of ``cov(Y) + 1e-4 * I``; ``init_method`` is ignored.
    """
    def _chol_entries(base_cov):
        # Small ridge on the diagonal before factorising.
        factor = la.cholesky(base_cov + 1e-4*sp.eye(self.P), lower=True)
        return factor[sp.tril_indices(self.P)]

    if not self.bgRE:
        if self.P==1:
            return {'covar': sp.array([1.])}
        return {'covar': _chol_entries(sp.cov(self.Y.T))}

    if init_method=='random':
        return {'covar': sp.randn(self._gpNull.covar.getNumberParams())}
    if self.P==1:
        return {'covar': sp.sqrt(0.5) * sp.ones(2)}
    half = _chol_entries(0.5*sp.cov(self.Y.T))
    return {'covar': sp.concatenate([half, half])}
def __init__(self, dim, jitter=1e-4):
    """
    Set up a free-form covariance of the given dimension.

    Args:
        dim:        dimension of the free-form covariance
        jitter:     extent of diagonal offset which is added for numerical
                    stability (default value: 1e-4); passed on to
                    ``self.set_jitter``
    """
    Covariance.__init__(self, dim)
    self._K_act = True
    # Runs before ``self.dim`` is assigned below, so it presumably relies on
    # state set by ``Covariance.__init__`` and is expected to define
    # ``self.n_params`` -- TODO confirm against the base class.
    self._calcNumberParams()
    self.dim = dim
    # Parameter vector starts at zero.
    self.params = sp.zeros(self.n_params)
    # Row / column indices of the lower triangle (diagonal included).
    self.idx_r, self.idx_c = sp.tril_indices(self.dim)
    self.set_jitter(jitter)
def _initParams(self, init_method=None):
    """
    Build the starting value for the ``"covar"`` parameters.

    * ``self.bgRE`` true and ``init_method == "random"``: standard-normal
      draw of length ``self._gpNull.covar.getNumberParams()``.
    * single trait (``self.P == 1``): ``sqrt(0.5) * ones(2)`` when
      ``self.bgRE`` is true, ``[1.]`` otherwise.
    * several traits: lower-triangular Cholesky entries of the sample
      covariance of ``self.Y`` (halved when ``self.bgRE`` is true) plus
      ``1e-4 * I``; repeated twice when ``self.bgRE`` is true.
    """
    if self.bgRE and init_method == "random":
        covar = sp.randn(self._gpNull.covar.getNumberParams())
    elif self.P == 1:
        covar = sp.sqrt(0.5) * sp.ones(2) if self.bgRE else sp.array([1.])
    else:
        sample_cov = sp.cov(self.Y.T)
        if self.bgRE:
            sample_cov = 0.5 * sample_cov
        factor = la.cholesky(sample_cov + 1e-4 * sp.eye(self.P), lower=True)
        covar = factor[sp.tril_indices(self.P)]
        if self.bgRE:
            covar = sp.concatenate([covar, covar])
    return {"covar": covar}
def _initParams(self, init_method=None):
    """
    Return ``{'covar': initial_parameters}``.

    ``init_method == 'random'`` is honoured only when ``self.bgRE`` is true
    and yields a standard-normal vector.  Otherwise the parameters come
    from the Cholesky factor of the (ridge-stabilised) sample covariance of
    ``self.Y``, or from fixed constants when there is a single trait.
    """
    single_trait = self.P == 1
    ridge = 1e-4 * sp.eye(self.P)
    lower = sp.tril_indices(self.P)

    if self.bgRE:
        if init_method == 'random':
            values = sp.randn(self._gpNull.covar.getNumberParams())
        elif single_trait:
            values = sp.sqrt(0.5) * sp.ones(2)
        else:
            # Same half-covariance factor used for both blocks.
            block_params = la.cholesky(
                0.5 * sp.cov(self.Y.T) + ridge, lower=True)[lower]
            values = sp.concatenate([block_params, block_params])
    elif single_trait:
        values = sp.array([1.])
    else:
        values = la.cholesky(sp.cov(self.Y.T) + ridge, lower=True)[lower]
    return {'covar': values}
def recover_B2_ridge( y, X, reg = 0 ):
    """
    Recover B2 using ridge regression.

    Regresses ``y**2`` on the lower-triangular entries of ``outer(x, x)``
    for every row ``x`` of ``X`` (ridge penalty ``reg``), writes the
    coefficients onto the lower triangle of a D x D matrix and returns its
    symmetric part ``(M + M.T) / 2``.
    """
    _, D = X.shape
    lower = sc.tril_indices(D)
    targets = y**2

    # One feature per lower-triangular entry of the outer product x x^T.
    feats = array( [ outer(row, row)[lower] for row in X ] )
    gram = feats.T.dot(feats) + reg * eye(feats.shape[1])
    coef = inv(gram).dot( feats.T ).dot( targets )

    half = zeros((D, D))
    half[lower] = coef
    return (half + half.T)/2
def correlation_ste(self):
    """
    Standard errors of the pairwise correlations implied by ``self.K()``.

    Returns None when ``self.getFIinv()`` is None.  Otherwise returns a
    symmetric ``dim x dim`` matrix whose (i, j) entry, i != j, is the
    delta-method standard error of

        rho_ij = K_ij / sqrt(K_ii * K_jj)

    and whose diagonal is 0 (a trait's correlation with itself is fixed at 1).

    NOTE(review): this assumes ``self.getFIinv()`` is the covariance matrix
    of the lower-triangular entries of ``K`` in ``tril_indices`` order.
    That matches the "index of cov_ij_ste from fisher" bookkeeping this
    function was originally sketched with, but the original body was left
    unfinished (it raised on ``sp.zeros(idx_M)``) -- confirm before relying
    on the values.
    """
    FIinv = self.getFIinv()
    if FIinv is None:
        return None

    dim = self.dim
    K = self.K()

    # idx_M[i, j] (i >= j) = position of K_ij in the tril-ordered parameters.
    idx_M = sp.zeros((dim, dim), dtype=int)
    idx_M[sp.tril_indices(dim)] = sp.arange(dim * (dim + 1) // 2)

    R = sp.zeros((dim, dim))
    for i in range(dim):
        for j in range(i):
            sel = [idx_M[i, j], idx_M[i, i], idx_M[j, j]]
            scale = sp.sqrt(K[i, i] * K[j, j])
            rho = K[i, j] / scale
            # d rho / d (K_ij, K_ii, K_jj)
            grad = sp.array([1. / scale,
                             -0.5 * rho / K[i, i],
                             -0.5 * rho / K[j, j]])
            cov = FIinv[sel, :][:, sel]
            R[i, j] = R[j, i] = sp.sqrt(sp.dot(grad, sp.dot(cov, grad)))
    return R
def _updateL(self):
    """
    Copy ``self.params`` onto the lower triangle (diagonal included) of
    ``self.L``, in ``tril_indices`` order.
    """
    rows, cols = sp.tril_indices(self.dim)
    self.L[rows, cols] = self.params
def _updateL(self):
    """
    Write ``self.params`` into the lower-triangular part of ``self.L``
    (``tril_indices`` ordering); the rest of ``self.L`` is left untouched.
    """
    lower_r, lower_c = SP.tril_indices(self.P)
    self.L[lower_r, lower_c] = self.params
def setCovariance(self,cov):
    """
    Set the parameters from a covariance matrix.

    ``cov`` is factorised with a lower Cholesky decomposition and the
    lower-triangular entries of the factor (``tril_indices`` order) are
    handed to ``self.setParams``.
    """
    rows, cols = sp.tril_indices(self.dim)
    lower_factor = LA.cholesky(cov,lower=True)
    self.setParams(lower_factor[rows, cols])
def _initParams(self, init_method=None):
    """
    Build the initial parameter dictionary and the optimisation filter.

    Args:
        init_method: only consulted when ``self.bgRE`` is true and
            ``self.P > 1``; ``'pairwise'`` uses ``fitPairwiseModel``,
            ``'random'`` draws standard-normal parameters, anything else
            uses the Cholesky factor of half the sample covariance of
            ``self.Y``.

    Returns:
        (params0, Ifilter).  ``params0`` has keys ``'Cg'``/``'Cn'`` when
        ``self.bgRE`` is true and ``'Cr'``/``'Cn'`` otherwise, plus
        ``'mean'`` when ``self.mean.F`` is set and ``self.bgRE`` is true.
        ``Ifilter`` is None when ``self.bgRE`` is true; otherwise it is a
        dict of boolean masks (all False for ``'Cr'``, all True for
        ``'Cn'``).

    Raises:
        ValueError: if ``self.P > 1`` and ``self.colCovarType`` is not
            ``'freeform'``.  No other initialisation is implemented; this
            previously surfaced as an undefined-variable error.
    """
    if self.P == 1:
        if self.bgRE:
            params0 = {
                'Cg': SP.sqrt(0.5) * SP.ones(1),
                'Cn': SP.sqrt(0.5) * SP.ones(1)
            }
            Ifilter = None
        else:
            params0 = {'Cr': 1e-9 * SP.ones(1), 'Cn': SP.ones(1)}
            Ifilter = {
                'Cr': SP.zeros(1, dtype=bool),
                'Cn': SP.ones(1, dtype=bool)
            }
    else:
        if self.colCovarType != 'freeform':
            # A low-rank initialisation was sketched here in the past but
            # never enabled.
            raise ValueError(
                "no parameter initialisation for colCovarType %r"
                % (self.colCovarType,))
        if self.bgRE:
            if init_method == 'pairwise':
                _RV = fitPairwiseModel(self.Y,
                                       XX=self.XX,
                                       S_XX=self.S_XX,
                                       U_XX=self.U_XX,
                                       verbose=False)
                params0 = {
                    'Cg': _RV['params0_Cg'],
                    'Cn': _RV['params0_Cn']
                }
            elif init_method == 'random':
                params0 = {
                    'Cg': SP.randn(self.Cg.getNumberParams()),
                    'Cn': SP.randn(self.Cn.getNumberParams())
                }
            else:
                # Split the sample covariance evenly between Cg and Cn.
                cov = 0.5 * SP.cov(self.Y.T) + 1e-4 * SP.eye(self.P)
                chol = LA.cholesky(cov, lower=True)
                params = chol[SP.tril_indices(self.P)]
                params0 = {'Cg': params.copy(), 'Cn': params.copy()}
            Ifilter = None
        else:
            cov = SP.cov(self.Y.T) + 1e-4 * SP.eye(self.P)
            chol = LA.cholesky(cov, lower=True)
            params = chol[SP.tril_indices(self.P)]
            params0 = {'Cr': 1e-9 * SP.ones(self.P), 'Cn': params}
            Ifilter = {
                'Cr': SP.zeros(self.P, dtype=bool),
                'Cn': SP.ones(params.shape[0], dtype=bool)
            }
    if self.mean.F is not None and self.bgRE:
        params0['mean'] = 1e-6 * SP.randn(self.mean.getParams().shape[0])
        if Ifilter is not None:
            Ifilter['mean'] = SP.ones(self.mean.getParams().shape[0],
                                      dtype=bool)
    return params0, Ifilter
def getGenoSte(self, DGE, IGE, IEE, cageEffect):
    """
    Standard errors for the genetic terms, derived from the Fisher
    information of ``self._gp.covar`` after a change of variables.

    Args:
        DGE, IGE:   which genetic terms are in the model (flags)
        IEE:        if true, the environmental term contributes three
                    entries (from its ``covff``), otherwise one scale
        cageEffect: if true, a cage term contributes one scale

    Returns:
        dict with
          'R':           2 x 2 array.  With both DGE and IGE it holds the
                         square roots of the first three diagonal entries
                         of the transformed inverse Fisher matrix
                         (diagonal and mirrored off-diagonal).  With only
                         one of them, the matching diagonal entry is
                         ``sqrt(FI[0, 0])`` and everything else is -999.
                         With neither, every entry is -999.
          'corr_params': ``FI[0, 2] / sqrt(FI[0, 0] * FI[2, 2])`` of the
                         geno block with both DGE and IGE, else -999.
    """
    def _scale_term(cov):
        # One variance scale plus the inverse rescaling factor of its K0.
        return [cov.scale], [1. / covar_rescaling_factor(cov.K0)]

    def _freeform_terms(cov):
        # Ordering is direct, covar, indirect, as in the Fisher matrix.
        scales = [cov.covff.K()[0,0], cov.covff.K()[0,1], cov.covff.K()[1,1]]
        inv_factors = [1. / covar_rescaling_factor(cov._K),
                       1. / covar_rescaling_factor(cov._KZ + cov._ZK),
                       1. / covar_rescaling_factor(cov._ZKZ)]
        return scales, inv_factors

    # The original makes this call twice and discards the first result;
    # kept as-is in case the first call has side effects.
    self._gp.covar.getFisherInf()
    F = self._gp.covar.getFisherInf()

    both_geno = DGE and IGE
    any_geno = DGE or IGE

    # Scalar in front of each term (aP) and inverse rescaling factors (vi).
    aP = []
    vi = []
    if both_geno:
        scales, inv_factors = _freeform_terms(self._genoCov)
        aP.extend(scales)
        vi.extend(inv_factors)
    elif any_geno:
        scales, inv_factors = _scale_term(self._genoCov)
        aP.extend(scales)
        vi.extend(inv_factors)

    if IEE:
        scales, inv_factors = _freeform_terms(self._envCov)
    else:
        scales, inv_factors = _scale_term(self._envCov)
    aP.extend(scales)
    vi.extend(inv_factors)

    if cageEffect:
        scales, inv_factors = _scale_term(self._cageCov)
        aP.extend(scales)
        vi.extend(inv_factors)

    aP = sp.array(aP)
    vi = sp.array(vi)
    n_terms = aP.shape[0]

    # Overall variance and the fraction contributed by each term
    # (fractions can be negative).
    v = (aP*vi).sum()
    h = (aP*vi) / v

    # Jacobian of the change of variables.
    J = sp.zeros((n_terms, n_terms))
    J[:, 0] = h / vi
    J[-1, 1:] = -v / vi[-1]
    upper = sp.arange(n_terms - 1)
    J[upper, upper + 1] = v / vi[:-1]

    # Transform the Fisher matrix and invert it on the eigen-directions
    # whose eigenvalue exceeds 1e-9.
    Fnew = sp.dot(J.T, sp.dot(F, J))
    S,U = sp.linalg.eigh(Fnew)
    keep = S>1e-9
    U = U[:,keep]
    S = S[keep]
    FI = sp.dot(U,sp.dot(sp.diag(S**(-1)),U.T))

    # Move the first row/column to the end (reorder to have the same
    # ordering as before).
    order = list(range(1, n_terms)) + [0]
    FI = FI[order, :][:, order]

    # R is a 2x2 matrix: STE_Ad and STE_As on the diagonal, STE_Ads off it.
    R = sp.zeros((2, 2))
    if both_geno:
        FI_geno = FI[:3,:][:,:3]
        # Fill the diagonal and one off-diagonal entry, then mirror.
        R[sp.tril_indices(2)] = sp.sqrt(FI_geno.diagonal())
        R = R + R.T - sp.diag(R.diagonal())
        corr_param_Ad_As = FI_geno[0,2]/(sp.sqrt(FI_geno[0,0])*sp.sqrt(FI_geno[2,2]))
    else:
        # -999 marks entries that are not available for this model.
        R[:] = -999
        corr_param_Ad_As = -999
        if DGE:
            R[0,0] = sp.sqrt(FI[0,0])
        elif IGE:
            R[1,1] = sp.sqrt(FI[0,0])

    return {'R': R, 'corr_params': corr_param_Ad_As}
def adjacency(
    self,
    min_snp2gene_obs=2,
    fdr_cutoff=0.3,
    return_genes=False,
    second_overlap=None,
):
    """
        Return a matrix showing the number of shared HPO genes by Term.

        Without `second_overlap`, the diagonal of the matrix is the number
        of genes discovered by that term, the upper triangle shows the
        overlap between the row and column terms and the lower triangle
        shows the hypergeometric p-value for that overlap. The universe
        used is the number of unique genes in the overlap results.

        min_snp2gene_obs : int (default: 2)
            The minimum number of SNP2gene mapping observations needed
            to be HPO
        fdr_cutoff : float (default: 0.3)
            The FDR cutoff to be considered HPO
        return_genes : bool (default: False)
            Return the candidate gene list instead of the overlap table.
            One row per term pair sharing at least one gene, with columns
            Term1, Term2, num_term1, num_term2, num_common, pval, common
            and bonferoni (pval <= 0.05 / (n_terms1 * n_terms2)).
        second_overlap : Overlap Object (default: None)
            If specified, overlap between terms is calculated between
            this overlap's HPO genes and the second overlap's HPO genes.
            Every (term1, term2) pair is compared -- including terms
            that share a name -- and the returned matrix is rectangular:
            rows are this overlap's terms, columns are the second
            overlap's terms, values are the number of shared genes.
            The universe is the union of both overlaps' result genes.
    """
    cross = second_overlap is not None and second_overlap is not self
    other = second_overlap if cross else self

    hpo1 = self.high_priority_candidates(
        fdr_cutoff=fdr_cutoff,
        min_snp2gene_obs=min_snp2gene_obs,
        original_COB_only=True,
    )
    x = {term: set(group.gene) for term, group in hpo1.groupby("Term")}
    if cross:
        hpo2 = other.high_priority_candidates(
            fdr_cutoff=fdr_cutoff,
            min_snp2gene_obs=min_snp2gene_obs,
            original_COB_only=True,
        )
        y = {term: set(group.gene) for term, group in hpo2.groupby("Term")}
    else:
        y = x

    num_universe = len(
        set(self.results.gene.unique()).union(other.results.gene.unique())
    )

    # Sorted so that positions agree with the label order pivot_table
    # produces; the triangular layout below relies on it.
    x_terms = sorted(x)
    y_terms = sorted(y)
    rows = []
    for i, a in enumerate(x_terms):
        for j, b in enumerate(y_terms):
            # Positions are only comparable when both axes hold the same
            # terms, so the triangular shortcut is for self-comparison only.
            if not cross and j < i:
                continue
            common = x[a] & y[b]
            num_common = len(common)
            if cross or a != b:
                pval = hypergeom.sf(
                    num_common - 1, num_universe, len(x[a]), len(y[b])
                )
            else:
                # The diagonal carries the number of HPO genes of the
                # term, not a p-value.
                pval = len(x[a])
            rows.append(
                (a, b, len(x[a]), len(y[b]), num_common, pval,
                 ",".join(sorted(common)))
            )
    adj = pd.DataFrame(
        rows,
        columns=[
            "Term1",
            "Term2",
            "num_term1",
            "num_term2",
            "num_common",
            "pval",
            "common",
        ],
    )

    # Stop early if we just want to return the lists.
    if return_genes:
        adj = adj[adj.num_common > 0]
        if not cross:
            adj = adj[np.logical_not(adj.Term1 == adj.Term2)]
        adj = adj.drop_duplicates().copy()
        num_tests = max(len(x) * len(y), 1)
        adj["bonferoni"] = adj.pval <= (0.05 / num_tests)
        return adj.drop_duplicates()

    overlap = pd.pivot_table(
        adj, index="Term1", columns="Term2", values="num_common"
    )
    if cross:
        return overlap.astype(float)

    # Transposed relative to `overlap` so the p-values land in the lower
    # triangle.
    pvals = pd.pivot_table(adj, index="Term2", columns="Term1", values="pval")
    # Mask on private float copies: DataFrame.values is not guaranteed to
    # be a writable view of the frame.
    overlap_vals = np.array(overlap.values, dtype=float)
    overlap_vals[tril_indices(len(overlap))] = 0  # keep strict upper triangle
    pval_vals = np.array(pvals.values, dtype=float)
    pval_vals[triu_indices(len(pvals), 1)] = 0  # keep lower triangle + diagonal
    overlap = pd.DataFrame(
        overlap_vals, index=overlap.index, columns=overlap.columns
    )
    pvals = pd.DataFrame(pval_vals, index=pvals.index, columns=pvals.columns)
    return (overlap + pvals).astype(float)
def cholesky_factor(nu):
    """
    Build the 2 x 2 matrix ``L L^T`` from three lower-triangular entries.

    ``nu`` is written onto the lower triangle of ``L`` in ``tril_indices``
    order, i.e. ``L = [[nu[0], 0], [nu[1], nu[2]]]``.  Note that the
    product ``L L^T`` is returned, not the factor ``L`` itself.

    Raises:
        ValueError: if ``nu`` does not have exactly three entries.  This is
            an explicit check rather than an ``assert`` so it also runs
            under ``python -O``, where a single-entry ``nu`` would
            otherwise be silently broadcast.
    """
    if len(nu) != 3:
        raise ValueError("nu must have exactly 3 entries, got %d" % len(nu))
    L = sp.zeros(shape=(2, 2))
    L[sp.tril_indices(2)] = nu
    return L.dot(L.T)