class PyGibbCAMP:
    """Layered Bayesian signalling network trained with Gibbs sampling + EM.

    The network built by the constructor has four kinds of nodes:

    * ``PERTURBATION``         -- observed experimental conditions,
    * ``ACTIVATIONSTATE``      -- hidden activation state of a protein,
    * ``PHOSPHORYLATIONSTATE`` -- hidden state of a site probed by an antibody,
    * ``FLUORESCENCE``         -- observed signal of an antibody (the antibody
      name with an ``'F'`` suffix).

    A fluorescence node is modelled by a two-component Gaussian mixture
    (``mus`` / ``sigmas``); every other node with parents is modelled by a
    logistic regression on its parents (``params``, one row per chain, with
    the intercept in column 0).
    """

    def __init__(self, nodeFile, dataMatrixFile, perturbMatrix=None,
                 missingDataMatrix=None):
        """Load the data matrices and build the layered network.

        Args:
            nodeFile: Path of a text file with one ``protein,antibody`` pair
                per line.
            dataMatrixFile: Path handed to ``NamedMatrix`` holding the
                observed data; ``'F'`` is appended to its column names.
            perturbMatrix: Optional path handed to ``NamedMatrix``; its
                columns become PERTURBATION nodes.
            missingDataMatrix: Optional path handed to ``NamedMatrix``; a
                value of 1 is treated as "missing" by the other methods.

        Raises:
            Exception: if a required file name is empty, the node file
                cannot be opened, or a data column is missing in all cases.
        """
        self.network = None
        self.obsData = None
        self.missingDataMatrix = None
        self.perturbData = None
        # Always a list so the loops below work without a perturbation
        # matrix (iterating over None used to raise a TypeError).
        perturbInstances = []
        self.nChains = 1

        # protein -> [(perturbation column, state the protein is clamped to)]
        self.dictPerturbEffect = {
            'AKT1': [('GSK690693', 0), ('GSK690693_GSK1120212', 0)],
            'MAP2K1': [('GSK690693_GSK1120212', 0)],
            'EGFR': [('EGF', 1), ('FGF1', 1)]}

        # parse data matrix by calling NamedMatrix class
        if not dataMatrixFile:
            raise Exception(
                "Cannot create PyCAMP obj without 'dataMatrixFile'")
        self.obsData = NamedMatrix(dataMatrixFile)
        nCases, nAntibodies = np.shape(self.obsData.data)
        self.obsData.colnames = [s + 'F' for s in self.obsData.colnames]
        self.obsDataFileName = dataMatrixFile

        if perturbMatrix:
            self.perturbData = NamedMatrix(perturbMatrix)
            perturbInstances = self.perturbData.getColnames()
        self.perturbInstances = perturbInstances

        if missingDataMatrix:
            self.missingDataMatrix = NamedMatrix(missingDataMatrix)
            # Column sums are taken on the underlying array, the same way
            # the rest of the class accesses NamedMatrix contents.
            allMissing = np.sum(self.missingDataMatrix.data, 0) == nCases
            if np.any(allMissing):
                raise Exception("Data matrix contain data-less columns")
            self.missingDataMatrix.colnames = [
                s + 'F' for s in self.missingDataMatrix.colnames]

        if not nodeFile:
            raise Exception("Calling 'intiNetwork' with empty nodeFile name")
        try:
            with open(nodeFile, "r") as nf:
                nodeLines = nf.readlines()
        except IOError:
            raise Exception("Failed to open the file containing nodes")
        if len(nodeLines) == 1:
            # Mac files end a line with \r instead of \n
            nodeLines = nodeLines[0].split("\r")

        print("Creating network")
        self.network = nx.DiGraph()
        self.dictProteinToAntibody = dict()
        self.dictAntibodyToProtein = dict()

        # parse nodes
        for line in nodeLines:
            line = line.rstrip()
            if not line:
                # blank lines (e.g. the empty field left by a trailing
                # line terminator) carry no node definition
                continue
            protein, antibody = line.split(',')
            if protein not in self.dictProteinToAntibody:
                self.dictProteinToAntibody[protein] = []
            self.dictProteinToAntibody[protein].append(antibody)
            self.dictAntibodyToProtein[antibody] = protein
            fluo = antibody + 'F'
            if protein not in self.network:
                self.network.add_node(
                    protein,
                    nodeObj=SigNetNode(protein, 'ACTIVATIONSTATE', False))
            self.network.add_node(
                antibody,
                nodeObj=SigNetNode(antibody, 'PHOSPHORYLATIONSTATE', False))
            self.network.add_node(
                fluo, nodeObj=SigNetNode(fluo, 'FLUORESCENCE', True))
            self.network.add_edge(antibody, protein)
            self.network.add_edge(antibody, fluo)

        for perturb in perturbInstances:
            self.network.add_node(
                perturb, nodeObj=SigNetNode(perturb, 'PERTURBATION', True))

        # Add edges between PERTURBATION, protein activity, and
        # phosphorylation layers.  A protein is not connected to its own
        # antibodies in this direction.
        for pro in self.dictProteinToAntibody:
            for phos in self.dictAntibodyToProtein:
                if self.dictAntibodyToProtein[phos] == pro:
                    continue
                self.network.add_edge(pro, phos)
            for perturb in perturbInstances:
                self.network.add_edge(perturb, pro)

    @staticmethod
    def _clampProb(prob):
        """Nudge exact 0/1 probabilities by machine epsilon, in place.

        Keeps both ``log(prob)`` and ``log(1 - prob)`` finite.  Returns the
        same array for convenience.
        """
        eps = np.finfo(float).eps
        prob[prob == 1] -= eps
        prob[prob == 0] += eps
        return prob

    def _initParams(self):
        """Initialise the parameters of every node for every MCMC chain."""
        print("Initialize parameters associated with each node in each MCMC chain")
        for nodeId in self.network:
            self._initNodeParams(nodeId)

    def _initNodeParams(self, nodeId):
        """Initialise the conditional-distribution parameters of one node.

        FLUORESCENCE nodes get ``mus`` and ``sigmas`` (nChains x 2) from a
        two-component mixture fitted with ``normalmixEM`` on the
        non-missing observations.  Other nodes get random-normal logistic
        regression weights ``params`` (nChains x (nParents + 1)), or
        ``None`` when they have no parents.
        """
        nodeObj = self.network.node[nodeId]['nodeObj']
        if nodeObj.type == 'FLUORESCENCE':
            # Estimate mean and sd of fluo signal using mixture model
            nodeData = self.obsData.getValuesByCol(nodeId)
            if self.missingDataMatrix and \
                    nodeId in self.missingDataMatrix.getColnames():
                nodeData = nodeData[
                    self.missingDataMatrix.getValuesByCol(nodeId) == 0]
            # mus and sigmas are represented as nChain x 2 matrices
            nodeObj.mus = np.zeros((self.nChains, 2))
            nodeObj.sigmas = np.zeros((self.nChains, 2))
            for c in range(self.nChains):
                mixGaussians = normalmixEM(
                    robjects.FloatVector(nodeData), k=2)
                # NOTE(review): the rest of the class treats component 1 as
                # the "phosphorylated" mode; confirm normalmixEM orders its
                # components that way.
                nodeObj.mus[c, :] = np.array(mixGaussians[2])
                nodeObj.sigmas[c, :] = np.array(mixGaussians[3])
        else:
            preds = self.network.predecessors(nodeId)
            if len(preds) > 0:
                nodeObj.paramNames = preds
                nodeObj.params = np.random.randn(self.nChains, len(preds) + 1)
            else:
                nodeObj.params = None

    def _initHiddenStates(self):
        """Initialise the latent states of every chain.

        Hidden nodes start as Bernoulli(0.3) draws; perturbation columns are
        appended unchanged.  Each phosphorylation node is then set to the
        mixture component under which its fluorescence reading is more
        likely, and cases flagged as missing are re-drawn at random.
        """
        hiddenNodes = [
            n for n in self.network
            if not self.network.node[n]['nodeObj'].bMeasured]
        phosNodes = [
            n for n in self.network
            if self.network.node[n]['nodeObj'].type == 'PHOSPHORYLATIONSTATE']

        nCases, nAntibody = self.obsData.shape()
        caseNames = self.obsData.getRownames()

        self.nodeStates = list()
        for c in range(self.nChains):
            states = np.zeros((nCases, len(hiddenNodes)))
            states[np.random.rand(nCases, len(hiddenNodes)) < 0.3] = 1
            colnames = list(hiddenNodes)
            if self.perturbData is not None:
                states = np.column_stack((states, self.perturbData.data))
                colnames = colnames + list(self.perturbData.colnames)
            self.nodeStates.append(
                NamedMatrix(npMatrix=states, colnames=colnames,
                            rownames=caseNames))

            # initialize phos state based on the observed fluo
            for node in phosNodes:
                fluoNode = node + 'F'
                fluoNodeObj = self.network.node[fluoNode]['nodeObj']
                fluoData = self.obsData.getValuesByCol(fluoNode)
                # Gaussian log-densities up to a shared additive constant
                phosProbOne = - np.log(fluoNodeObj.sigmas[c, 1]) \
                    - 0.5 * np.square(fluoData - fluoNodeObj.mus[c, 1]) \
                    / np.square(fluoNodeObj.sigmas[c, 1])
                phosProbZero = - np.log(fluoNodeObj.sigmas[c, 0]) \
                    - 0.5 * np.square(fluoData - fluoNodeObj.mus[c, 0]) \
                    / np.square(fluoNodeObj.sigmas[c, 0])
                phosState = np.zeros(nCases)
                phosState[phosProbOne > phosProbZero] = 1
                nodeIndx = self.nodeStates[c].findColIndices(node)
                self.nodeStates[c].data[:, nodeIndx] = phosState

                # take care of missing values by random sampling.  The
                # missing-data columns carry the 'F' suffix (see __init__),
                # so they must be looked up by the fluorescence name.
                if self.missingDataMatrix and \
                        fluoNode in self.missingDataMatrix.getColnames():
                    missingCases = \
                        self.missingDataMatrix.getValuesByCol(fluoNode) == 1
                    resampled = np.zeros(sum(missingCases))
                    resampled[np.random.rand(len(resampled)) <= 0.3] = 1
                    self.nodeStates[c].data[missingCases, nodeIndx] = resampled

    def calcEvidenceLikelihood(self):
        """Return the chain-averaged log likelihood of the phospho states.

        For every PHOSPHORYLATIONSTATE node and chain, the Bernoulli log
        likelihood of the current node states under the logistic model of
        its parents is accumulated; the total is divided by ``nChains``.
        """
        phosNodes = [
            n for n in self.network
            if self.network.node[n]['nodeObj'].type == 'PHOSPHORYLATIONSTATE']
        loglikelihood = 0
        nCases, nAntibodies = np.shape(self.obsData.data)
        for nodeId in phosNodes:
            nodeObj = self.network.node[nodeId]['nodeObj']
            nodeIndx = self.nodeStates[0].findColIndices(nodeId)
            preds = self.network.predecessors(nodeId)
            for c in range(self.nChains):
                nodeData = self.nodeStates[c].data[:, nodeIndx]
                predStates = np.column_stack(
                    (np.ones(nCases),
                     self.nodeStates[c].getValuesByCol(preds)))
                pOneCondOnParents = 1 / (
                    1 + np.exp(-np.dot(predStates, nodeObj.params[c, :])))
                # Clamp both ends: an exact 0 would give 0 * log(0) = NaN.
                self._clampProb(pOneCondOnParents)
                loglikelihood += np.sum(
                    nodeData * np.log(pOneCondOnParents)
                    + (1 - nodeData) * np.log(1 - pOneCondOnParents))
        loglikelihood /= self.nChains
        return loglikelihood

    def trainGibbsEM(self, nChains=10, alpha=0.1, nParents=4, nSamples=5,
                     pickleDumpFile=None, maxIter=1000):
        """Train the model with Gibbs sampling (E-step) and glmnet (M-step).

        Args:
            nChains: Number of MCMC chains.
            alpha: Elastic-net mixing parameter passed to glmnet.
            nParents: Maximum number of non-zero weights kept per node.
            nSamples: Number of collected samples between parameter updates;
                a sample is collected every other iteration.
            pickleDumpFile: Where the best model so far is pickled.
                Defaults to ``<data file>alpha<alpha>.pickle``.
            maxIter: Maximum number of sampling iterations.

        Returns:
            ``self``.

        Raises:
            Exception: if the model cannot be pickled to ``pickleDumpFile``.
        """
        self.nChains = nChains
        self.alpha = alpha
        self.likelihood = list()
        self.nSamples = nSamples
        self.nParents = nParents

        if pickleDumpFile:
            self.pickleDumpFile = pickleDumpFile
        else:
            self.pickleDumpFile = self.obsDataFileName + "alpha" + str(
                self.alpha) + ".pickle"

        # check if the network and data agrees: drop fluorescence nodes
        # without data together with their first predecessor
        nodeToDelete = list()
        for nodeId in self.network:
            if self.network.node[nodeId]['nodeObj'].type == 'FLUORESCENCE' \
                    and nodeId not in self.obsData.getColnames():
                print("Node " + nodeId + " don't has associated data")
                nodeToDelete.append(nodeId)
                nodeToDelete.append(self.network.predecessors(nodeId)[0])
        for nodeId in nodeToDelete:
            if self.network.has_node(nodeId):
                print("removing node " + nodeId)
                self.network.remove_node(nodeId)

        # Starting EM: set up Markov chains
        self._initParams()
        self._initHiddenStates()

        self.likelihood = list()
        # each chain collects expected statistics of nodes from its samples
        self.expectedStates = [
            np.zeros(np.shape(self.nodeStates[c].data))
            for c in range(self.nChains)]

        print("Starting EM: alpha = " + str(self.alpha) + "; nChains = " + str(
            self.nChains) + "; nSamples = " + str(
                self.nSamples) + "; nParents = " + str(self.nParents))
        optLikelihood = float("-inf")
        sampleCount = 0

        likelihood = self.calcEvidenceLikelihood()
        print("nIter: 0" + "; log likelihood of evidence: " + str(likelihood))
        self.likelihood.append(likelihood)

        for nIter in range(maxIter):
            # E-step of EM
            self._updateActivationStates()
            if (nIter + 1) % 2 == 0:
                # we collect sample every other iteration
                sampleCount += 1
                for c in range(self.nChains):
                    self.expectedStates[c] += self.nodeStates[c].data

            # M-step of EM.  We only update parameters after collecting a
            # certain number of samples
            if sampleCount >= self.nSamples:
                sampleCount = 0
                # take expectation of sample states
                self.expectedStates = [
                    acc / self.nSamples for acc in self.expectedStates]
                self._updteParams(self.alpha, nparents=self.nParents)
                likelihood = self.calcEvidenceLikelihood()
                self.likelihood.append(likelihood)
                print("nIter: " + str(
                    nIter + 1) + "; log likelihood of evidence: " + str(likelihood))

                # collect the current best fit models
                if likelihood > optLikelihood:
                    optLikelihood = likelihood
                    try:
                        with open(self.pickleDumpFile, 'wb') as dumpFile:
                            cPickle.dump(self, dumpFile)
                    except Exception:
                        raise Exception("Cannot create pickle dumpfile " +
                                        self.pickleDumpFile)

                if self._checkConvergence():
                    print("EM converged!")
                    break

                # clear expectedStates
                for c in range(self.nChains):
                    self.expectedStates[c] = np.zeros(
                        np.shape(self.nodeStates[c].data))

        return self

    def _checkConvergence(self):
        """Return True once the likelihood trace has flattened out.

        Needs at least 20 recorded likelihoods.  The latest value is
        compared with the mean of the four values before it; convergence
        means a relative change of at most 0.1%.
        """
        if len(self.likelihood) < 20:
            return False
        ml = np.mean(self.likelihood[-5:-1])
        ratio = abs(self.likelihood[-1] - ml) / abs(ml)
        return ratio <= 0.001

    def _updateActivationStates(self):
        """Gibbs-sample every ACTIVATIONSTATE node in every chain.

        After sampling, nodes listed in ``dictPerturbEffect`` are clamped to
        the configured state in the cases where the perturbation is on.
        """
        nCases, antibody = np.shape(self.obsData.data)
        activationNode = [
            n for n in self.network
            if self.network.node[n]['nodeObj'].type == 'ACTIVATIONSTATE']

        for nodeId in activationNode:
            for c in range(self.nChains):
                curNodeMarginal = self.calcNodeCondProb(nodeId, c)

                # sample states of current node based on the prob, and update
                sampleState = np.zeros(nCases)
                sampleState[curNodeMarginal >= np.random.rand(nCases)] = 1.
                curNodeIndx = self.nodeStates[c].findColIndices(nodeId)
                self.nodeStates[c].data[:, curNodeIndx] = sampleState

                # clamp the activationState of perturbed nodes to a fixed
                # value; the dictionary lists the conditions under which the
                # node is perturbed and the state to clamp to
                if nodeId in self.dictPerturbEffect:
                    for condition, state in self.dictPerturbEffect[nodeId]:
                        perturbState = \
                            self.nodeStates[c].getValuesByCol(condition)
                        indx = self.nodeStates[c].findColIndices(nodeId)
                        self.nodeStates[c].data[perturbState == 1, indx] = state

    def calcNodeCondProb(self, nodeId, c):
        """Return P(node == 1 | parents, children) for every case.

        Combines the logistic prior given the node's parents with the
        likelihood of each child's current state when the node is forced to
        1 versus 0.  Children whose weight for this node is exactly 0 are
        treated as unconnected.

        Args:
            nodeId: A string id of the node of interest.
            c: An integer index of the chain whose parameters and states
                are used.

        Raises:
            Exception: if the node is an observed variable.
        """
        nodeObj = self.network.node[nodeId]['nodeObj']
        if nodeObj.bMeasured:
            raise Exception(
                "Call _caclNodeMarginalProb on an observed variable " + nodeId)

        nCases, nAntibody = np.shape(self.obsData.data)

        # contribution of the parents
        preds = self.network.predecessors(nodeId)
        logProbOneCondOnParents = 0
        logProbZeroCondOnParents = 0
        if len(preds) > 0:
            nodeParams = nodeObj.params[c, :]
            predStates = np.column_stack(
                (np.ones(nCases), self.nodeStates[c].getValuesByCol(preds)))
            pOneCondOnParents = self._clampProb(
                1 / (1 + np.exp(-np.dot(predStates, nodeParams))))
            logProbOneCondOnParents = np.log(pOneCondOnParents)
            logProbZeroCondOnParents = np.log(1 - pOneCondOnParents)

        # collect evidence from children
        logProbChildCondOne = 0  # child likelihood given current node == 1
        logProdOfChildCondZeros = 0  # child likelihood given current node == 0
        for child in self.network.successors(nodeId):
            childNodeObj = self.network.node[child]['nodeObj']
            curChildStates = self.nodeStates[c].getValuesByCol(child)

            # Collect states of the predecessors of the child, padded with
            # a column of ones as bias
            childPreds = self.network.predecessors(child)
            childNodeParams = childNodeObj.params[c, :]
            childPredStates = np.column_stack(
                (np.ones(nCases),
                 self.nodeStates[c].getValuesByCol(childPreds)))

            # offset by 1 because of the bias column
            curNodePosInPredList = childPreds.index(nodeId) + 1
            if childNodeParams[curNodePosInPredList] == 0:
                continue  # not a real edge

            # Set the state of current node to ones
            childPredStates[:, curNodePosInPredList] = np.ones(nCases)
            pChildCondCurNodeOnes = self._clampProb(
                1 / (1 + np.exp(-np.dot(childPredStates, childNodeParams))))
            logProbChildCondOne += np.log(
                curChildStates * pChildCondCurNodeOnes
                + (1 - curChildStates) * (1 - pChildCondCurNodeOnes))

            # set the state of the current node (nodeId) to zeros
            childPredStates[:, curNodePosInPredList] = np.zeros(nCases)
            pChildCondCurNodeZeros = self._clampProb(
                1 / (1 + np.exp(-np.dot(childPredStates, childNodeParams))))
            logProdOfChildCondZeros += np.log(
                curChildStates * pChildCondCurNodeZeros
                + (1 - curChildStates) * (1 - pChildCondCurNodeZeros))

        # normalise the two unnormalised log posteriors
        curNodeMarginal = 1 / (
            1 + np.exp(logProbZeroCondOnParents + logProdOfChildCondZeros
                       - logProbOneCondOnParents - logProbChildCondOne))
        return curNodeMarginal

    def parseGlmnetCoef(self, glmnet_res):
        """Parse the printed 'beta' matrix of a glmnet fit obtained via RPy2.

        The text form of ``beta`` is read line by line: the second line
        holds the ``<nVariables> x <nLambda>`` dimensions and coefficient
        rows start with ``V<row number>``.  The matrix is printed in blocks
        of columns, a new block starting at every ``V1`` row; ``.`` marks a
        zero entry.

        Returns:
            ``(a0, betaMatrix)`` -- the intercept vector and the
            nVariables x nLambda coefficient matrix.

        Raises:
            Exception: if the dimensions of ``beta`` cannot be determined.
        """
        # read in intercept; a vector of length of nLambda
        a0 = np.array(glmnet_res.rx('a0'))[0]

        # Since we call glmnet by padding x with a column of 1s, we only
        # work with the 'beta' matrix returned by fit
        betaLines = StringIO(str(glmnet_res.rx('beta'))).readlines()
        dimMatch = re.search(r"(\d+)\s+x\s+(\d+)", betaLines[1])
        if not dimMatch:
            raise Exception(
                "'parse_glmnet_res' could not determine the dims of beta")
        nVariables = int(dimMatch.group(1))
        nLambda = int(dimMatch.group(2))
        betaMatrix = np.zeros((nVariables, nLambda), dtype=float)

        # glmnet prints the beta matrix in multiple blocks of
        # nVariables x blockSize
        blockSize = len(betaLines[4].split()) - 1
        curBlockColStart = -blockSize
        rowLabel = re.compile(r'^V\d+')
        for line in betaLines:
            m = rowLabel.search(line)
            if not m:
                # only lines beginning with 'V<digits>' hold coefficients
                continue
            rowIndx = int(m.group(0)[1:])
            if rowIndx == 1:
                curBlockColStart += blockSize
            rowIndx -= 1  # rows are 0-based in betaMatrix

            fields = line.rstrip().split()
            fields.pop(0)  # drop the row label
            if len(fields) != blockSize:
                blockSize = len(fields)  # the last block may be narrower
            for j in range(blockSize):
                if fields[j] == '.':
                    continue
                betaMatrix[rowIndx, curBlockColStart + j] = float(fields[j])

        return a0, betaMatrix

    def _updteParams(self, alpha=0.1, nparents=None):
        """Refit the logistic regression of every hidden node (M-step).

        For each chain, the expected states of a node's parents (plus a
        column of ones) are regressed on the node's current states with
        glmnet.  The first coefficient column along the lambda path with at
        least ``nparents`` non-zero entries is kept and pruned to its
        ``nparents`` largest magnitudes.

        Args:
            alpha: Elastic-net mixing parameter passed to glmnet.
            nparents: Maximum number of non-zero weights; defaults to
                ``self.nParents``.
        """
        nCases, nVariables = np.shape(self.obsData.data)
        if not nparents:
            nparents = self.nParents

        def randomRows(count):
            # random case indices used to break up constant columns
            return [int(math.floor(z))
                    for z in np.random.rand(count) * nCases]

        for nodeId in self.network:
            nodeObj = self.network.node[nodeId]['nodeObj']
            if nodeObj.type == 'FLUORESCENCE' or nodeObj.type == 'PERTURBATION':
                continue
            nodeObj.fitRes = list()
            preds = self.network.predecessors(nodeId)
            predIndices = self.nodeStates[0].findColIndices(preds)

            for c in range(self.nChains):
                expectedPredState = self.expectedStates[c][:, predIndices]
                x = np.column_stack((np.ones(nCases), expectedPredState))
                y = self.nodeStates[c].getValuesByCol(nodeId)

                # check if all x and y are of same value, which will lead
                # to problem for glmnet
                rIndx = randomRows(50)
                if sum(y) == nCases:  # if every y == 1
                    y[rIndx] = 0
                elif sum(1 - v for v in y) == nCases:  # if every y == 0
                    y[rIndx] = 1
                y = robjects.vectors.IntVector(y)

                allOnes = np.where(np.sum(x, 0) == nCases)[0]
                for col in allOnes:
                    x[randomRows(3), col] = 0
                allZeros = np.where(
                    np.sum(np.ones(np.shape(x)) - x, 0) == nCases)[0]
                for col in allZeros:
                    x[randomRows(3), col] = 1

                # call logistic regression using glmnet from Rpy
                fit = glmnet(x, y, alpha=alpha, family="binomial",
                             intercept=0)
                nodeObj.fitRes.append(fit)

                # extract coefficients from glmnet, keep the first set of
                # beta with nparents non-zero values
                a0, betaMatrix = self.parseGlmnetCoef(fit)
                for j in range(np.shape(betaMatrix)[1]):
                    if sum(betaMatrix[:, j] != 0.) >= nparents:
                        break
                if j >= len(a0):
                    j = len(a0) - 1

                myparams = betaMatrix[:, j]
                if sum(myparams != 0.) > nparents:
                    sortedParams = sorted(np.abs(myparams))
                    # prune with the requested limit, not self.nParents
                    myparams[
                        np.abs(myparams) < sortedParams[-nparents]] = 0.
                nodeObj.params[c, :] = myparams

    def getStimuliSpecificNet(self, stimulus):
        """Extract the antibody-level sub-network responding to a stimulus.

        Fluorescence nodes whose values differ between control and
        stimulated cases (R ``t.test``, p < 0.05) mark their first
        predecessor as active.  Protein -> phospho edges whose weight is
        zero in more than 90% of the chains are dropped; the rest are
        re-attached to the protein's antibodies and labelled ``"+"`` or
        ``"-"`` by the sign of the chain-summed weight.  Inactive nodes
        without successors or without predecessors are pruned repeatedly.

        Returns:
            A ``networkx.DiGraph`` with ``effect`` and ``betaValue`` edge
            attributes.

        Raises:
            Exception: if ``stimulus`` is not a column of the node states,
                no perturbation data was loaded, or a parameter vector does
                not match the node's predecessors.
        """
        self.stimuli = [
            'EGF', 'FGF1', 'HGF', 'IGF1', 'Insulin', 'NRG1', 'PBS', 'Serum']

        if stimulus not in self.nodeStates[0].getColnames():
            raise Exception("Input stimulus '" + stimulus +
                            "' is not in the experiment data")
        if getattr(self, 'perturbData', None) is None:
            raise Exception("No perturbation data loaded for this model")

        stimulusCases = self.perturbData.getValuesByCol(stimulus) == 1
        controlCases = np.sum(
            self.perturbData.getValuesByCol(self.stimuli), 1) == 0

        # identify the nodes to keep by determining if a node responds to
        # the stimulus
        activeNodes = set()
        activeNodes.add(stimulus)
        for nodeId in self.network:
            nodeType = self.network.node[nodeId]['nodeObj'].type
            if nodeType == 'FLUORESCENCE' or nodeType == 'fluorescence':
                nodeValues = self.obsData.getValuesByCol(nodeId)
                ttestRes = R('t.test')(
                    robjects.FloatVector(nodeValues[controlCases]),
                    robjects.FloatVector(nodeValues[stimulusCases]))
                pvalue = np.array(ttestRes.rx('p.value')[0])[0]
                if pvalue < 0.05:
                    activeNodes.add(self.network.predecessors(nodeId)[0])

        # copy network to a tmp, redirecting edges that leave activation
        # state nodes so that they leave the protein's antibodies instead
        tmpNet = nx.DiGraph()
        for u, v in self.network.edges():
            uType = self.network.node[u]['nodeObj'].type
            vType = self.network.node[v]['nodeObj'].type
            # we are only interested in edges from protein to antibody
            if not ((uType == 'ACTIVATIONSTATE' or uType == 'activeState')
                    and (vType == 'PHOSPHORYLATIONSTATE'
                         or vType == 'phosState')):
                continue

            # extract parameters associated with u and v
            vPreds = self.network.predecessors(v)
            uIndx = vPreds.index(u)
            vAllParams = self.network.node[v]['nodeObj'].params
            vParams = np.sum(vAllParams, 0)
            if len(vParams) != (len(vPreds) + 1):
                raise Exception(
                    "Bug in retrieving parameters of node v " + v)
            paramZeros = np.sum(vAllParams == 0, 0)
            if float(paramZeros[uIndx + 1]) / float(self.nChains) > .9:
                continue  # don't add edge with beta == 0

            beta = vParams[uIndx + 1]
            for ab in self.dictProteinToAntibody[u]:
                if ab not in self.network:
                    continue
                # sanity check on the parameters of the activation node
                uPreds = self.network.predecessors(u)
                uParams = np.mean(self.network.node[u]['nodeObj'].params, 0)
                if len(uParams) != (len(uPreds) + 1):
                    raise Exception(
                        "Bug in retrieving parameters of node u " + u)
                if beta > 0.:
                    tmpNet.add_edge(ab, v, effect="+", betaValue=beta)
                elif beta < 0.:
                    tmpNet.add_edge(ab, v, effect="-", betaValue=beta)

        # remove leaf/root nodes that are not in the activeNodes set
        while True:
            leafNodes = [
                nodeId for nodeId in tmpNet
                if nodeId not in activeNodes
                and (len(tmpNet.successors(nodeId)) == 0
                     or len(tmpNet.predecessors(nodeId)) == 0)]
            if len(leafNodes) == 0:
                break
            for leaf in leafNodes:
                tmpNet.remove_node(leaf)

        return tmpNet

    def toGraphML(self, filename):
        """Write the edges of the network (without node objects) as GraphML."""
        tmpNet = nx.DiGraph()
        # add_edge() needs the two end points separately; passing the edge
        # tuple as a single argument raised a TypeError.
        tmpNet.add_edges_from(self.network.edges())
        nx.write_graphml(tmpNet, filename, encoding='utf-8', prettyprint=True)
def calcTCI(mutcnaMatrixFN, degMatrixFN, alphaNull=[1, 1],
            alphaIJKList=[2, 1, 1, 2], v0=0.2, ppiDict=None,
            dictGeneLength=None, outputPath=".", opFlag=None, rowBegin=0,
            rowEnd=None):
    """Calculate the causal scores between each SGA/DEG pair in each tumor.

    For every tumor in the selected block, the posterior probability that
    each of its mutated genes (plus the non-SGA cause ``'A0'``) drives each
    of its differentially expressed genes is computed and written to
    ``<outputPath>/<tumor name>.csv``.  Tumors with fewer than two mutated
    genes are skipped.

    NOTE(review): a later definition of ``calcTCI`` in this file rebinds the
    name; confirm which of the two is meant to be live.

    Inputs:
        mutcnaMatrixFN      A file containing a N x G binary matrix of the
                            mutation and CNA data of all tumors.  N is the
                            number of tumors and G the number of unique
                            genes.  Genes that have SGAs in a tumor are
                            indicated by "1", all others by "0".
        degMatrixFN         A file containing a N x G' binary matrix of DEG
                            status.  A "1" indicates that a gene is
                            differentially expressed in a tumor.
        alphaNull           Hyperparameters passed to calcNullF to score the
                            DEG data under the non-SGA cause A0.
        alphaIJKList        A list of Dirichlet hyperparameters for the
                            prior of the conditional probability parameters.
                            alphaIJK[0]: mut == 0 && deg == 0;
                            alphaIJK[1]: mut == 0 && deg == 1;
                            alphaIJK[2]: mut == 1 && deg == 0;
                            alphaIJK[3]: mut == 1 && deg == 1
        v0                  A float, the prior probability that a DEG is
                            caused by a non-SGA factor.
        ppiDict             A dictionary keeping the PPI network as an
                            adjacency list (a dictionary of dictionaries);
                            used when opFlag == OR.
        dictGeneLength      A dictionary keeping the length of each of the
                            G genes in the mutcnaMatrix (required).
        outputPath          Directory the per-tumor result files go to.
        opFlag              None to use the original mutation events, or
                            the module constants AND / OR to score
                            combinations of SGA events.
        rowBegin, rowEnd    Select the block of tumors to process: rows
                            rowBegin (inclusive) up to rowEnd (exclusive).
                            rowEnd defaults to the number of tumors, so that
                            several blocks can be processed in parallel.

    The process is terminated through sys.exit() when the inputs cannot be
    loaded or are inconsistent.
    """
    # read in data in the form of NamedMatrix
    try:
        mutcnaMatrix = NamedMatrix(mutcnaMatrixFN)
    except Exception:
        print("Failed to import data matrix %s\n" % mutcnaMatrixFN)
        sys.exit()

    try:
        degMatrix = NamedMatrix(degMatrixFN)
    except Exception:
        print("Failed to import data matrix %s\n" % degMatrixFN)
        sys.exit()

    exprsTumorNames = [x.replace("\"", "") for x in degMatrix.getRownames()]
    mutTumorNames = [x.replace("\"", "") for x in mutcnaMatrix.getRownames()]
    if exprsTumorNames != mutTumorNames:
        print("The tumors for mutcnaMatrix and degMatrix do not fully overlap!")
        print(degMatrix.getRownames())
        print(mutcnaMatrix.getRownames())
        sys.exit()

    if not dictGeneLength:
        print("Gene length dictionary not provided, quit\n")
        sys.exit()

    # Reject unknown flags up front instead of failing on an undefined
    # prior vector in the middle of the run.
    if opFlag and opFlag not in (AND, OR):
        print("Unknown opFlag given: " + str(opFlag))
        sys.exit()

    tumorNames = degMatrix.getRownames()
    nTumors, nMutGenes = mutcnaMatrix.shape()

    mutGeneNames = mutcnaMatrix.getColnames()
    degGeneNames = degMatrix.getColnames()

    # rowEnd is exclusive (it bounds the range() below), so "all tumors"
    # means nTumors; using nTumors - 1 silently dropped the last tumor.
    if not rowEnd:
        rowEnd = nTumors
    elif rowEnd >= nTumors:
        rowEnd = nTumors
    elif rowEnd < rowBegin:
        print("Invalid rowEnd < rowBegin arguments given.")
        sys.exit()

    if rowBegin > rowEnd:
        print("Invlid rowBegin > rowEnd argument given.")
        sys.exit()

    print("Done with loading data, start processing tumor " + str(rowBegin))

    # loop through individual tumors and calculate the causal scores
    # between each pair of SGA and DEG
    for t in range(rowBegin, rowEnd):
        # print pacifier
        if t % 50 == 0:
            print("Processed %s tumors" % str(t))

        # collect data related to DEGs: the genes that are differentially
        # expressed in this tumor and their columns across all tumors
        degGeneIndx = [i for i, j in enumerate(degMatrix.data[t, :]) if j == 1]
        tumorDEGGenes = [degGeneNames[i] for i in degGeneIndx]
        nTumorDEGs = len(degGeneIndx)  # n, the number of DEGs in the tumor
        tumorDEGMatrix = degMatrix.data[:, degGeneIndx]

        # collect data related to mutations
        tumormutGeneIndx = [
            i for i, j in enumerate(mutcnaMatrix.data[t, :]) if j == 1]
        if len(tumormutGeneIndx) < 2:
            print(tumorNames[t] + " has less than 2 mutations, skip.")
            continue
        tumorMutGenes = [mutGeneNames[i] for i in tumormutGeneIndx]

        # Sub-matrix of the genes mutated in tumor t.  It is built before
        # the opFlag dispatch because the AND combination takes it as input.
        tumorMutMatrix = mutcnaMatrix.data[:, tumormutGeneIndx]

        # Optionally replace the events by combinations of SGA events and
        # compute the matching log priors (one entry per event).
        if opFlag == AND:
            tmpNamedMat = NamedMatrix(npMatrix=tumorMutMatrix,
                                      colnames=tumorMutGenes,
                                      rownames=tumorNames)
            tumorNamedMatrix = createANDComb(tmpNamedMat, opFlag)
            if not tumorNamedMatrix:
                # this tumor does not have any joint mutations that are
                # observed in 2% of all tumors
                continue
            tumorMutMatrix = tumorNamedMatrix.data
            tumorMutGenes = tumorNamedMatrix.colnames
            lntumorMutPriors = calcLnCombANDPrior(
                tumorMutGenes, dictGeneLength, v0)
        elif opFlag == OR:
            tumorMutMatrix = createORComb(tumorMutGenes, ppiDict, mutcnaMatrix)
            lntumorMutPriors = calcLnCombORPrior(
                tumorMutGenes, ppiDict, dictGeneLength,
                mutcnaMatrix.colnames, v0)
        else:
            lntumorMutPriors = calcLnPrior(tumorMutGenes, dictGeneLength, v0)

        tumorMutGenes.append('A0')

        # calculate the pairwise likelihood that an SGA causes a DEG
        tumorLnFScore = calcF(tumorMutMatrix, tumorDEGMatrix, alphaIJKList)

        # Likelihood of the expression data conditioning on A0, stacked as
        # the last row; equivalent to adding a column of '1' to represent
        # A0 in tumorMutMatrix
        nullFscore = calcNullF(tumorDEGMatrix, alphaNull)
        tumorLnFScore = np.vstack((tumorLnFScore, nullFscore))

        # prior probability that any of the mutated genes can be a cause
        # for a DEG, tiled up to an nTumorMutGenes x nTumorDEG matrix
        tumorMutPriorMatrix = np.tile(lntumorMutPriors, (nTumorDEGs, 1)).T
        lnFScore = add(tumorLnFScore, tumorMutPriorMatrix)

        # normalise lnFScore so that each column sums to one: accumulate
        # the log-sum of every column, skipping -inf entries
        columnAccumLogSum = np.zeros(nTumorDEGs)
        for col in range(nTumorDEGs):
            currLogSum = -np.inf
            for j in range(lnFScore.shape[0]):
                if lnFScore[j, col] == -np.inf:
                    continue
                currLogSum = logSum(currLogSum, lnFScore[j, col])
            columnAccumLogSum[col] = currLogSum

        normalizer = np.tile(columnAccumLogSum, (lnFScore.shape[0], 1))
        posterior = np.exp(add(lnFScore, - normalizer))

        # write out the results
        tumorPosterior = NamedMatrix(npMatrix=posterior,
                                     rownames=tumorMutGenes,
                                     colnames=tumorDEGGenes)
        if "\"" in tumorNames[t]:
            tumorNames[t] = tumorNames[t].replace("\"", "")
        tumorPosterior.writeToText(filePath=outputPath,
                                   filename=tumorNames[t] + ".csv")
def calcTCI (mutcnaMatrixFN, degMatrixFN, tumorTypeFN = None, alphaNull = [1, 1], alphaIJKList = [2, 1, 1, 2],
             v0 = 0.3, ppiDict = None, dictGeneLength = None, outputPath = ".", opFlag = None,
             PANCANFlag = None, rowBegin=0, rowEnd = None):
    """
    calcTCI (mutcnaMatrixFN, degMatrixFN, ...)

    Calculate the causal scores between each pair of SGA and DEG observed in
    each tumor and write one posterior matrix per tumor to
    "<outputPath>/<tumor name>.csv".

    Inputs:
        mutcnaMatrixFN      A file containing a N x G binary matrix of the mutation
                            and CNA data of all tumors.  N is the number of tumors
                            and G is the number of unique genes.  Genes that have
                            SGAs in a tumor are indicated by "1", "0" otherwise.
        degMatrixFN         A file containing a N x G' binary matrix representing
                            DEG status.  A "1" indicates that a gene is
                            differentially expressed in a tumor.
        tumorTypeFN         A filename.  The file contains a N x T matrix in which
                            each row has exactly one element set to 1, indicating
                            the cancer type of the tumor.  Required when
                            PANCANFlag is set.
        alphaNull           Hyperparameters handed to calcNullF for the A0
                            (non-SGA) cause.  The default list is never mutated.
        alphaIJKList        A list of Dirichlet hyperparameters for the prior of
                            the conditional probability parameters.
                            alphaIJK[0]: mut == 0 && deg == 0;
                            alphaIJK[1]: mut == 0 && deg == 1;
                            alphaIJK[2]: mut == 1 && deg == 0;
                            alphaIJK[3]: mut == 1 && deg == 1
        v0                  A float; the prior probability that a DEG is caused by
                            a non-SGA factor.
        ppiDict             A dictionary keeping the PPI network as an adjacency
                            list (a dictionary of dictionaries); used by the OR
                            combination.
        dictGeneLength      A dictionary keeping the length of each of the G genes
                            in the mutcnaMatrix.  Mandatory.
        outputPath          Directory receiving the per-tumor result files.
        opFlag              None for single SGAs, or the module constants AND / OR
                            to select a combination mode.
        PANCANFlag          A boolean flag indicating a PANCAN analysis, in which
                            the tumor-type label is added as an extra cause.
        rowBegin, rowEnd    Select the block of tumors processed by this call so
                            several blocks can be run in parallel.  rowBegin is
                            the first row; rowEnd is EXCLUSIVE (as in range()).
                            A missing/zero rowEnd, or one beyond the number of
                            tumors, means "up to and including the last tumor".

    The function terminates the process through sys.exit() when the inputs
    cannot be loaded or are inconsistent.
    """
    # check if gene length dictionary is set
    if not dictGeneLength:
        print("Gene length dictionary not provided, quit\n")
        sys.exit()

    # read in data in the form of NamedMatrix
    try:
        mutcnaMatrix = NamedMatrix(mutcnaMatrixFN)
    except Exception:
        print("Failed to import data matrix %s\n" % mutcnaMatrixFN)
        sys.exit()

    try:
        degMatrix = NamedMatrix(degMatrixFN)
    except Exception:
        print("Failed to import data matrix %s\n" % degMatrixFN)
        sys.exit()

    mutGeneNames = mutcnaMatrix.getColnames()
    mutTumorNames = mutcnaMatrix.getRownames()
    degGeneNames = degMatrix.getColnames()
    exprsTumorNames = degMatrix.getRownames()

    # check that the tumor names of the two matrices agree
    if exprsTumorNames != mutTumorNames:
        print("The tumors for mutcnaMatrix and degMatrix do not fully overlap!")
        print(degMatrix.getRownames())
        print(mutcnaMatrix.getRownames())
        sys.exit()

    tumorNames = exprsTumorNames
    nTumors, nMutGenes = mutcnaMatrix.shape()

    # now perform PANCAN analysis related tasks
    tumorTypeMatrix = None
    tumorTypes = None
    vtprior = None
    if PANCANFlag:
        if not tumorTypeFN:
            print("Cannot perform PANCAN analysis without tumor-type-indicator matrix")
            sys.exit()

        try:
            tumorTypeMatrix = NamedMatrix(tumorTypeFN)
        except Exception:
            print("Failed to import tumor type file %s" % tumorTypeFN)
            sys.exit()

        tumorTypeTumorNames = [x.replace("\"", "") for x in tumorTypeMatrix.getRownames()]
        if exprsTumorNames != tumorTypeTumorNames:
            print("The tumors for tumorTypeMatrix and degMatrix do not fully overlap!")
            sys.exit()

        tumorTypes = tumorTypeMatrix.getColnames()

        # The prior probability that a tumor-type variable influences a DEG is
        # proportional to the number of tumors of that type.
        vt = np.sum(tumorTypeMatrix.data, 0)      # column sums: number of tumors per type
        vtprior = np.divide(vt, float(nTumors))   # normalize to 1

    # Now loop through a chunk of individual tumors and calculate the causal
    # scores between each pair of SGA and DEG.
    print("Done with loading data, start processing tumor " + str(rowBegin))
    # rowEnd is an exclusive bound.  The previous code clamped it to
    # nTumors - 1 and then used it in range(), so the last tumor was never
    # processed.
    if not rowEnd or rowEnd > nTumors:
        rowEnd = nTumors
    elif rowEnd < rowBegin:
        print("Invalid rowEnd < rowBegin arguments given.")
        sys.exit()

    if rowBegin > rowEnd:
        print("Invlid rowBegin > rowEnd argument given.")
        sys.exit()

    for t in range(rowBegin, rowEnd):
        print("processign tumor " + tumorNames[t])
        # print pacifier
        if t % 50 == 0:
            print("\nProcessed %s tumors" % str(t))

        # collect data related to DEGs: a submatrix containing only the DEGs of tumor t
        degGeneIndx = [i for i, j in enumerate(degMatrix.data[t,:]) if j == 1]
        tumorDEGGenes = [degGeneNames[i] for i in degGeneIndx]
        tumorDEGMatrix = degMatrix.data[:, degGeneIndx]

        # collect the genes that are mutated in tumor t
        tumormutGeneIndx = [i for i, j in enumerate(mutcnaMatrix.data[t,:]) if j == 1]
        tumorMutGenes = [mutGeneNames[i] for i in tumormutGeneIndx]
        nTumorMutGenes = len(tumorMutGenes)

        # Extract the sub-matrix of mutcnaMatrix for those genes, or build the
        # OR-combined matrix when requested.
        if opFlag == OR:
            tumorMutMatrix = createORComb(tumorMutGenes, ppiDict, mutcnaMatrix)
        else:
            # default: columns of mutcnaMatrix corresponding to the altered genes
            tumorMutMatrix = mutcnaMatrix.data[:, tumormutGeneIndx]

        # Include the tumor-type label into tumorMutMatrix as a tissue-specific
        # fake GT to capture the DEGs that have tissue-specific characteristics.
        tumorTypeLabelIndx = None
        if PANCANFlag:
            tumorTypeLabelIndx = np.where(tumorTypeMatrix.data[t,:] == 1)[0]
            if len(tumorTypeLabelIndx) != 1:
                raise Exception("Fail to extract tumor type")
            tumorMutMatrix = np.hstack((tumorMutMatrix, tumorTypeMatrix.data[:, tumorTypeLabelIndx]))
            # index with a plain int; indexing a list with a one-element array
            # is rejected by newer numpy releases
            tumorTypeName = tumorTypes[int(tumorTypeLabelIndx[0])]
            tumorMutGenes.append(tumorTypeName)
            nTumorMutGenes = len(tumorMutGenes)

        # Single pairwise likelihood that an SGA causes a DEG.  Rows are
        # mutGenes (plus the tumor-type label for PANCAN), columns are DEGs.
        tumorLnFScore = calcF(tumorMutMatrix, tumorDEGMatrix, alphaIJKList)

        # For PANCAN, combine the tumor-type label with each GT to get the
        # likelihood of a DEG jointly conditioning on GT and label.
        if PANCANFlag:
            if opFlag == AND:
                raise Exception ("Combination of AND operation with PanCan analysis is not implemented")

            typeColumn = tumorTypeMatrix.data[:, tumorTypeLabelIndx]
            jointGTandTumorLableFScore = np.zeros((tumorMutMatrix.shape[1], tumorDEGMatrix.shape[1]))
            # Multiplication acts as the AND operation.  The four terms cover
            # (GT, label) = (1, 1), (1, 0), (0, 1), (0, 0), in that order.
            jointTerms = ((tumorMutMatrix, typeColumn),
                          (tumorMutMatrix, typeColumn == 0),
                          (tumorMutMatrix == 0, typeColumn),
                          (tumorMutMatrix == 0, typeColumn == 0))
            for gtPart, labelPart in jointTerms:
                # Kept in its own variable: the previous code reused
                # tumorLnFScore here and thereby overwrote the single-SGA
                # scores that are stacked below.
                jointTermScore = calcF(np.multiply(gtPart, labelPart), tumorDEGMatrix, alphaIJKList)
                jointGTandTumorLableFScore = add(jointGTandTumorLableFScore, jointTermScore)

            # Stack the joint log likelihood on top of the single scores; the
            # tumor-type label row of the joint matrix is dropped.
            tumorLnFScore = np.vstack((jointGTandTumorLableFScore[:-1,:], tumorLnFScore))

        # Likelihood of A0, which is 1 for all tumors, as a cause for DEGs;
        # equivalent to adding a column of '1' to tumorMutMatrix.
        nullFscore = calcNullF(tumorDEGMatrix, alphaNull)
        tumorLnFScore = np.vstack((tumorLnFScore, nullFscore))

        # log of the prior probability that any of the mutated genes plus A0
        # can be a cause for a DEG
        if PANCANFlag:
            if not opFlag:
                lntumorMutPriors = calcPanCanLnPrior(tumorMutGenes, dictGeneLength, vtprior[tumorTypeLabelIndx], v0)
            elif opFlag == AND:
                lntumorMutPriors = calcPanCanLnCombANDPrior(tumorMutGenes, dictGeneLength, vtprior[tumorTypeLabelIndx], v0)
            elif opFlag == OR:
                lntumorMutPriors = calcPanCanLnCombORPrior(tumorMutGenes, ppiDict, dictGeneLength, mutcnaMatrix.colnames, vtprior[tumorTypeLabelIndx], v0)
            else:
                raise ValueError("Unknown opFlag: " + str(opFlag))
        else:
            if not opFlag:
                # a m-dimension vector with m being the number of mutations
                lntumorMutPriors = calcLnPrior(tumorMutGenes, dictGeneLength, v0)
            elif opFlag == AND:
                lntumorMutPriors = calcLnCombANDPrior(tumorMutGenes, dictGeneLength, v0)
            elif opFlag == OR:
                lntumorMutPriors = calcLnCombORPrior(tumorMutGenes, ppiDict, dictGeneLength, mutcnaMatrix.colnames, v0)
            else:
                raise ValueError("Unknown opFlag: " + str(opFlag))

        # add the prior to each column; note the double transpose because
        # numpy broadcasts by row
        tumorLnFScore = np.add(tumorLnFScore.T, lntumorMutPriors).T

        # calculate the normalizer for each column (GE)
        colLogSum = calcColNormalizer(tumorLnFScore)
        normalizer = np.tile(colLogSum, (tumorLnFScore.shape[0], 1))
        posteriorAll = np.exp(add(tumorLnFScore, - normalizer))

        if PANCANFlag:
            # Sum the posterior of each single GT with the posterior of the
            # matching joint GT/tumor-type row, then append the rows of the
            # tumor-type label and A0.
            posterior = np.add(posteriorAll[0:nTumorMutGenes-1, :], posteriorAll[nTumorMutGenes - 1:-2, :])
            posterior = np.vstack((posterior, posteriorAll[-2:, :]))
        else:
            # No joint rows exist without PANCAN: the rows already are the
            # mutated genes followed by A0.
            posterior = posteriorAll

        # write out the results
        tumorMutGenes.append('A0')
        tumorPosterior = NamedMatrix(npMatrix = posterior, rownames = tumorMutGenes, colnames = tumorDEGGenes)
        tumorPosterior.writeToText(filePath = outputPath, filename = tumorNames[t] + ".csv")
class PyGibbCAMP:
    """
    Gibbs-sampling / EM model of a signalling network.

    The network has four kinds of nodes: PERTURBATION (observed),
    ACTIVATIONSTATE of a protein (hidden), PHOSPHORYLATIONSTATE of an
    antibody target (hidden) and FLUORESCENCE (observed, named
    "<antibody>F").  The code is written against the networkx 1.x API
    (G.node[...], list-returning predecessors()/successors()) and calls R
    (glmnet, normalmixEM, t.test) through rpy2.
    """

    ## Constructor
    # @param nodeFile           Pathname of a text file with one
    #                           "protein,antibody" pair per line.
    # @param dataMatrixFile     Pathname of the observed data matrix; loaded
    #                           through NamedMatrix.
    # @param perturbMatrix      Optional pathname of the perturbation matrix.
    # @param missingDataMatrix  Optional pathname of a matrix flagging missing
    #                           observations.
    def __init__(self, nodeFile, dataMatrixFile, perturbMatrix=None, missingDataMatrix=None):
        self.network = None
        self.obsData = None
        self.missingDataMatrix = None
        self.perturbData = None
        perturbInstances = None
        self.nChains = 1

        # protein -> list of (perturbation condition, state the protein is clamped to)
        self.dictPerturbEffect = {'AKT1' : [('GSK690693', 0),
                                            ('GSK690693_GSK1120212', 0)],
                                  'MAP2K1' : [('GSK690693_GSK1120212', 0)],
                                  'EGFR': [('EGF' , 1), ('FGF1', 1)]}

        # parse data matrix by calling NamedMatrix class
        if not dataMatrixFile:
            raise Exception("Cannot create PyCAMP obj without 'dataMatrixFile'")
        self.obsData = NamedMatrix(dataMatrixFile)
        nCases, nAntibodies = np.shape(self.obsData.data)
        # observed columns are fluorescence nodes, named "<antibody>F"
        self.obsData.colnames = [s + 'F' for s in self.obsData.colnames]
        self.obsDataFileName = dataMatrixFile

        if perturbMatrix:
            self.perturbData = NamedMatrix(perturbMatrix)
            perturbInstances = self.perturbData.getColnames()
        self.perturbInstances = perturbInstances
        # iterate over an empty list when no perturbation matrix is given
        perturbNodes = perturbInstances if perturbInstances else []

        if missingDataMatrix:
            self.missingDataMatrix = NamedMatrix(missingDataMatrix)
            # Sum the underlying array, not the NamedMatrix wrapper.
            # NOTE(review): assumes NamedMatrix is not itself array-like.
            allMissing = np.sum(self.missingDataMatrix.data, 0) == nCases
            if np.any(allMissing):
                raise Exception ("Data matrix contain data-less columns")
            self.missingDataMatrix.colnames = [s + 'F' for s in self.missingDataMatrix.colnames]

        if not nodeFile:
            raise Exception("Calling 'intiNetwork' with empty nodeFile name")

        try:
            with open(nodeFile, "r") as nf:
                nodeLines = nf.readlines()
        except IOError:
            raise Exception( "Failed to open the file containing nodes")
        if len(nodeLines) == 1:
            # Mac files end a line with \r instead of \n
            nodeLines = nodeLines[0].split("\r")

        print("Creating network")
        self.network = nx.DiGraph()
        self.dictProteinToAntibody = dict()
        self.dictAntibodyToProtein = dict()

        # parse nodes
        for line in nodeLines:
            entry = line.rstrip()
            if not entry:
                # blank line, e.g. the empty trailing piece of a "\r" split
                continue
            protein, antibody = entry.split(',')
            if protein not in self.dictProteinToAntibody:
                self.dictProteinToAntibody[protein] = []
            self.dictProteinToAntibody[protein].append(antibody)
            self.dictAntibodyToProtein[antibody] = protein
            fluo = antibody + 'F'
            if protein not in self.network:
                self.network.add_node(protein, nodeObj = SigNetNode(protein, 'ACTIVATIONSTATE', False))
            self.network.add_node(antibody, nodeObj = SigNetNode(antibody, 'PHOSPHORYLATIONSTATE', False))
            self.network.add_node(fluo, nodeObj = SigNetNode(fluo, 'FLUORESCENCE', True))
            self.network.add_edge(antibody, protein)
            self.network.add_edge(antibody, fluo)

        for perturb in perturbNodes:
            self.network.add_node(perturb, nodeObj = SigNetNode(perturb, 'PERTURBATION', True))

        # Add edges between PERTURBATION, protein activity and phosphorylation
        # layers: every protein points to every antibody target that is not
        # its own, and every perturbation points to every protein.
        for pro in self.dictProteinToAntibody:
            for phos in self.dictAntibodyToProtein:
                if self.dictAntibodyToProtein[phos] == pro:
                    continue
                self.network.add_edge(pro, phos)
            for perturb in perturbNodes:
                self.network.add_edge(perturb, pro)

    ## Init parameters of the model
    # In a Bayesian network the joint probability is the product of a series
    # of conditional probabilities p(x | Pa(x)).  For an observed fluorescence
    # node the conditional probability is a mixture of two Gaussians, so its
    # parameters are two pairs of mu and sigma.  For the hidden
    # phosphorylation and activation nodes the conditional probability is a
    # logistic regression, so the parameters are a vector of real numbers.
    def _initParams(self):
        print("Initialize parameters associated with each node in each MCMC chain")
        for nodeId in self.network:
            self._initNodeParams(nodeId)

    def _initNodeParams(self, nodeId):
        """
        Initialise the parameters of one node for every chain.

        FLUORESCENCE nodes get 'mus' and 'sigmas' (nChains x 2) from a
        two-component normalmixEM fit; other nodes get random logistic
        parameters (nChains x (number of predecessors + 1)), or None when
        they have no predecessors.
        """
        nodeObj = self.network.node[nodeId]['nodeObj']
        if nodeObj.type == 'FLUORESCENCE':
            # Estimate mean and sd of the fluo signal using a mixture model,
            # leaving out the cases flagged as missing.
            nodeData = self.obsData.getValuesByCol(nodeId)
            if self.missingDataMatrix and nodeId in self.missingDataMatrix.getColnames():
                nodeData = nodeData[self.missingDataMatrix.getValuesByCol(nodeId) == 0]
            nodeObj.mus = np.zeros((self.nChains, 2))
            nodeObj.sigmas = np.zeros((self.nChains, 2))
            for c in range(self.nChains):
                mixGaussians = normalmixEM(robjects.FloatVector(nodeData), k = 2)
                # mus and sigmas are represented as nChain x 2 matrices
                nodeObj.mus[c,:] = np.array(mixGaussians[2])
                nodeObj.sigmas[c,:] = np.array(mixGaussians[3])
        else:
            preds = self.network.predecessors(nodeId)
            if len(preds) > 0:
                nodeObj.paramNames = preds
                # one weight per predecessor plus a bias term
                nodeObj.params = np.random.randn(self.nChains, len(preds) + 1)
            else:
                nodeObj.params = None

    ## Initialize latent variables
    def _initHiddenStates(self):
        """
        Create self.nodeStates, one NamedMatrix per chain holding the states
        of all hidden nodes followed by the perturbation columns.

        Hidden states start as random 0/1 draws (P(1) = 0.3); phosphorylation
        states are then set from the observed fluorescence by picking the
        more likely of the two fitted Gaussian components.
        """
        hiddenNodes = [n for n in self.network if not self.network.node[n]['nodeObj'].bMeasured]
        phosNodes = [n for n in self.network if self.network.node[n]['nodeObj'].type == 'PHOSPHORYLATIONSTATE']
        nCases, nAntibody = self.obsData.shape()
        caseNames = self.obsData.getRownames()

        self.nodeStates = list()
        for c in range(self.nChains):
            states = np.zeros((nCases, len(hiddenNodes)))
            states[np.random.rand(nCases, len(hiddenNodes)) < 0.3] = 1
            colnames = list(hiddenNodes)
            if self.perturbData is not None:
                states = np.column_stack((states, self.perturbData.data))
                colnames = colnames + list(self.perturbData.colnames)
            self.nodeStates.append(NamedMatrix(npMatrix = states, colnames = colnames, rownames = caseNames))

            # initialize phos state based on the observed fluo
            for node in phosNodes:
                fluoNode = node + 'F'
                fluoNodeObj = self.network.node[fluoNode]['nodeObj']
                fluoData = self.obsData.getValuesByCol(fluoNode)
                # Gaussian log densities up to a shared constant; component 1
                # is read as "phosphorylated", component 0 as "not".
                phosProbOne = - np.log(fluoNodeObj.sigmas[c, 1])\
                    - 0.5 * np.square(fluoData - fluoNodeObj.mus[c, 1]) / np.square(fluoNodeObj.sigmas[c, 1])
                phosProbZero = - np.log(fluoNodeObj.sigmas[c, 0])\
                    - 0.5 * np.square(fluoData - fluoNodeObj.mus[c, 0]) / np.square(fluoNodeObj.sigmas[c, 0])
                phosState = np.zeros(nCases)
                phosState[phosProbOne > phosProbZero] = 1
                nodeIndx = self.nodeStates[c].findColIndices(node)
                self.nodeStates[c].data[:, nodeIndx] = phosState

                # Take care of missing values by random sampling.  The
                # constructor renamed the missing-data columns to
                # "<antibody>F", so look the column up under the fluorescence
                # name first; the plain name is kept as a fallback.
                if self.missingDataMatrix:
                    missingNames = self.missingDataMatrix.getColnames()
                    missingCol = None
                    if fluoNode in missingNames:
                        missingCol = fluoNode
                    elif node in missingNames:
                        missingCol = node
                    if missingCol is not None:
                        missingCases = self.missingDataMatrix.getValuesByCol(missingCol) == 1
                        filler = np.zeros(int(np.sum(missingCases)))
                        filler[np.random.rand(len(filler)) <= 0.3] = 1
                        self.nodeStates[c].data[missingCases, nodeIndx] = filler

    ## Calculate the log likelihood of the current phosphorylation states
    def calcEvidenceLikelihood(self):
        """
        Return the log likelihood of the phosphorylation-node states given
        the states of their parents under the current logistic parameters,
        summed over nodes and cases and averaged over the chains.
        """
        phosNodes = [n for n in self.network if self.network.node[n]['nodeObj'].type == 'PHOSPHORYLATIONSTATE']
        eps = np.finfo(float).eps
        loglikelihood = 0
        nCases, nAntibodies = np.shape(self.obsData.data)
        for nodeId in phosNodes:
            nodeObj = self.network.node[nodeId]['nodeObj']
            nodeIndx = self.nodeStates[0].findColIndices(nodeId)
            preds = self.network.predecessors(nodeId)
            for c in range(self.nChains):
                nodeData = self.nodeStates[c].data[:, nodeIndx]
                # pad with a column of ones for the bias term
                predStates = np.column_stack((np.ones(nCases), self.nodeStates[c].getValuesByCol(preds)))
                pOneCondOnParents = 1 / (1 + np.exp( - np.dot(predStates, nodeObj.params[c,:])))
                # keep both log() arguments away from 0; the lower clamp was
                # missing here although calcNodeCondProb applies both
                pOneCondOnParents[pOneCondOnParents == 1.] -= eps
                pOneCondOnParents[pOneCondOnParents == 0.] += eps
                loglikelihood += np.sum(nodeData * np.log(pOneCondOnParents)
                                        + (1 - nodeData) * np.log(1 - pOneCondOnParents))

        loglikelihood /= self.nChains
        return loglikelihood

    ## Perform graph search
    def trainGibbsEM(self, nChains = 10, alpha = 0.1, nParents = 4, nSamples = 5, pickleDumpFile = None, maxIter = 1000):
        """
        Train the model by alternating Gibbs sampling of the activation
        states (E-step) and glmnet logistic regression (M-step).

        args:
            nChains         number of Markov chains
            alpha           value passed to glmnet as its 'alpha' argument
            nParents        number of non-zero parameters kept per node
            nSamples        number of collected samples per M-step; a sample
                            is collected every other iteration
            pickleDumpFile  file receiving the best model so far; defaults to
                            "<data file>alpha<alpha>.pickle"
            maxIter         maximum number of sampling iterations

        Returns self.
        """
        self.nChains = nChains
        self.alpha = alpha
        self.likelihood = list()
        self.nSamples = nSamples
        self.nParents = nParents

        if pickleDumpFile:
            self.pickleDumpFile = pickleDumpFile
        else:
            self.pickleDumpFile = self.obsDataFileName + "alpha" + str(self.alpha) + ".pickle"

        # Check that the network and data agree: drop fluorescence nodes
        # without a data column together with their first predecessor.
        obsColnames = self.obsData.getColnames()
        nodeToDelete = list()
        for nodeId in self.network:
            if self.network.node[nodeId]['nodeObj'].type == 'FLUORESCENCE' and nodeId not in obsColnames:
                print("Node " + nodeId + " don't has associated data")
                nodeToDelete.append(nodeId)
                nodeToDelete.append(self.network.predecessors(nodeId)[0])
        for nodeId in nodeToDelete:
            if self.network.has_node(nodeId):
                print("removing node " + nodeId)
                self.network.remove_node(nodeId)

        # Starting EM: set up Markov chains
        self._initParams()
        self._initHiddenStates()

        self.likelihood = list()
        self.expectedStates = list()
        nCases, nAntibodies = np.shape(self.obsData.data)
        for c in range(self.nChains):
            # each chain collects expected statistics of nodes from samples along the chain
            self.expectedStates.append(np.zeros(np.shape(self.nodeStates[c].data)))

        print("Starting EM: alpha = " + str(self.alpha) + "; nChains = " + str(self.nChains) + "; nSamples = " + str (self.nSamples) + "; nParents = " + str(self.nParents))
        optLikelihood = float("-inf")
        bConverged = False
        sampleCount = 0

        likelihood = self.calcEvidenceLikelihood()
        print("nIter: 0" + "; log likelihood of evidence: " + str(likelihood))
        self.likelihood.append(likelihood)
        for nIter in range(maxIter):
            # E-step of EM
            self._updateActivationStates()
            if (nIter+1) % 2 == 0:
                # we collect a sample every other iteration
                sampleCount += 1
                for c in range(self.nChains):
                    self.expectedStates[c] += self.nodeStates[c].data

            # M-step of EM.  Parameters are only updated after collecting a
            # certain number of samples.
            if sampleCount >= self.nSamples:
                sampleCount = 0
                # take expectation of sample states
                self.expectedStates = [acc / self.nSamples for acc in self.expectedStates]
                self._updteParams(self.alpha, nparents = self.nParents)
                likelihood = self.calcEvidenceLikelihood()
                self.likelihood.append(likelihood)
                print("nIter: " + str(nIter + 1) + "; log likelihood of evidence: " + str(likelihood))

                # keep the current best fit model on disk
                if likelihood > optLikelihood:
                    optLikelihood = likelihood
                    try:
                        with open(self.pickleDumpFile, 'wb') as dumpFile:
                            cPickle.dump(self, dumpFile)
                    except Exception:
                        raise Exception("Cannot create pickle dumpfile " + self.pickleDumpFile)

                bConverged = self._checkConvergence()
                if bConverged:
                    print("EM converged!")
                    break

                for c in range(self.nChains):
                    # clear expectedStates
                    self.expectedStates[c] = np.zeros(np.shape(self.nodeStates[c].data))

        return self

    def _checkConvergence(self):
        """
        Return True when at least 20 likelihood values were recorded and the
        latest one differs from the mean of the four preceding values by a
        relative amount of at most 0.001.
        """
        if len(self.likelihood) < 20:
            return False

        ml = np.mean(self.likelihood[-5:-1])
        ratio = abs(self.likelihood[-1] - ml) / abs(ml)
        return ratio <= 0.001

    def _updateActivationStates(self):
        """
        Gibbs-sample the state of every ACTIVATIONSTATE node in every chain,
        then clamp perturbed proteins to the state listed in
        self.dictPerturbEffect for the cases under that condition.
        """
        nCases, antibody = np.shape(self.obsData.data)
        nCases, nHiddenNodes = np.shape(self.nodeStates[0].data)

        activationNode = [n for n in self.network if self.network.node[n]['nodeObj'].type == 'ACTIVATIONSTATE']

        for nodeId in activationNode:
            for c in range(self.nChains):
                curNodeMarginal = self.calcNodeCondProb(nodeId, c)

                # sample states of the current node based on the prob, and update
                sampleState = np.zeros(nCases)
                sampleState[curNodeMarginal >= np.random.rand(nCases)] = 1.
                curNodeIndx = self.nodeStates[c].findColIndices(nodeId)
                self.nodeStates[c].data[:, curNodeIndx] = sampleState

                # clamp the activation state of perturbed nodes to a fixed value
                if nodeId in self.dictPerturbEffect:
                    # the dictionary keeps a list of conditions under which the
                    # node is perturbed and the state to be clamped to
                    for condition, state in self.dictPerturbEffect[nodeId]:
                        perturbState = self.nodeStates[c].getValuesByCol(condition)
                        indx = self.nodeStates[c].findColIndices(nodeId)
                        self.nodeStates[c].data[perturbState==1, indx] = state

    def calcNodeCondProb(self, nodeId, c):
        """
        Calculate the probability of a hidden node's state being "1",
        conditioning on the current states of its parents and children.

        args:
            nodeId  A string id of the node of interest
            c       An integer indicating the chain whose parameter vector
                    and states are used

        Returns an array with one probability per case.  Raises Exception
        when called on an observed node.
        """
        nodeObj = self.network.node[nodeId]['nodeObj']
        if nodeObj.bMeasured:
            raise Exception("Call _caclNodeMarginalProb on an observed variable " + nodeId)

        eps = np.finfo(float).eps
        nCases, nAntibody = np.shape(self.obsData.data)

        # contribution of the parents: p(curNode | parents)
        preds = self.network.predecessors(nodeId)
        logProbOneCondOnParents = 0
        logProbZeroCondOnParents = 0
        if len(preds) > 0:
            nodeParams = nodeObj.params[c,:]
            predStates = np.column_stack((np.ones(nCases), self.nodeStates[c].getValuesByCol(preds)))
            pOneCondOnParents = 1 / (1 + np.exp( - np.dot(predStates, nodeParams)))
            pOneCondOnParents[pOneCondOnParents == 1] -= eps
            pOneCondOnParents[pOneCondOnParents == 0] += eps
            logProbOneCondOnParents = np.log(pOneCondOnParents)
            logProbZeroCondOnParents = np.log(1 - pOneCondOnParents)

        # collect evidence from children
        logProbChildCondOne = 0        # log prob of children given current node == 1
        logProdOfChildCondZeros = 0    # log prob of children given current node == 0
        children = self.network.successors(nodeId)
        for child in children:
            childNodeObj = self.network.node[child]['nodeObj']
            curChildStates = self.nodeStates[c].getValuesByCol(child)

            # collect states of the predecessors of the child
            childPreds = self.network.predecessors(child)
            childNodeParams = childNodeObj.params[c,:]
            childPredStates = self.nodeStates[c].getValuesByCol(childPreds)
            # padding data with a column of ones as bias
            childPredStates = np.column_stack((np.ones(nCases), childPredStates))

            curNodePosInPredList = childPreds.index(nodeId) + 1    # offset by 1 because of padding
            if childNodeParams[curNodePosInPredList] == 0:
                # not a real edge
                continue

            # set the state of the current node to ones
            childPredStates[:, curNodePosInPredList] = np.ones(nCases)
            pChildCondCurNodeOnes = 1 / (1 + np.exp(-np.dot(childPredStates, childNodeParams)))
            pChildCondCurNodeOnes[pChildCondCurNodeOnes==1] -= eps
            pChildCondCurNodeOnes[pChildCondCurNodeOnes==0] += eps
            logProbChildCondOne += np.log(curChildStates * pChildCondCurNodeOnes + (1 - curChildStates) * (1 - pChildCondCurNodeOnes))

            # set the state of the current node to zeros
            childPredStates[:, curNodePosInPredList] = np.zeros(nCases)
            pChildCondCurNodeZeros = 1 / (1 + np.exp(- np.dot(childPredStates, childNodeParams)))
            pChildCondCurNodeZeros[pChildCondCurNodeZeros==1] -= eps
            pChildCondCurNodeZeros[pChildCondCurNodeZeros==0] += eps
            logProdOfChildCondZeros += np.log(curChildStates * pChildCondCurNodeZeros + (1 - curChildStates) * (1 - pChildCondCurNodeZeros))

        # now we can calculate the probability of the current node being 1
        curNodeMarginal = 1 / (1 + np.exp(logProbZeroCondOnParents + logProdOfChildCondZeros - logProbOneCondOnParents - logProbChildCondOne))
        return curNodeMarginal

    def parseGlmnetCoef(self, glmnet_res):
        """
        Parse the printed 'beta' matrix of a glmnet fit obtained through RPy2.

        Returns (a0, betaMatrix): a0 is the intercept vector read from the
        fit, betaMatrix a dense nVariables x nLambda numpy array.  Raises
        Exception when the dimensions cannot be found in the printed text.
        """
        # read in intercept; a vector of length nLambda
        a0 = np.array(glmnet_res.rx('a0'))[0]

        # Read in the lines of the beta matrix text, which is
        # nVariables x nLambda.  x is padded with a column of 1s when glmnet
        # is called, so only the 'beta' matrix of the fit is used.
        betaLines = StringIO(str(glmnet_res.rx('beta'))).readlines()
        # Test the match before using it; calling .group() on a failed search
        # raised AttributeError and made the check below unreachable.
        dimMatch = re.search(r"(\d+)\s+x\s+(\d+)", betaLines[1])
        if not dimMatch:
            raise Exception("'parse_glmnet_res' could not determine the dims of beta")

        nVariables = int(dimMatch.group(1))
        nLambda = int(dimMatch.group(2))
        betaMatrix = np.zeros((nVariables, nLambda), dtype=float)

        # glmnet prints the beta matrix in multiple blocks of
        # nVariables * blockSize
        blockSize = len(betaLines[4].split()) - 1
        curBlockColStart = - blockSize
        for line in betaLines:
            m = re.search(r'^V\d+', line)
            if not m:
                # only lines beginning with 'V<number>' hold values
                continue

            rowIndx = int(m.group(0)[1:])
            if rowIndx == 1:
                # a new block starts; advance by the width of the previous one
                curBlockColStart += blockSize

            rowIndx -= 1    # make 'rowIndx' start from 0

            fields = line.rstrip().split()
            fields.pop(0)
            if len(fields) != blockSize:
                # the last block may be narrower
                blockSize = len(fields)
            for j in range(blockSize):
                if fields[j] == '.':
                    # '.' marks a zero entry of the sparse matrix
                    continue
                betaMatrix[rowIndx, curBlockColStart + j] = float(fields[j])

        return a0, betaMatrix

    def _updteParams(self, alpha = 0.1, nparents=None):
        """
        Update the parameters of each hidden node, p(n | Pa(n)), by logistic
        regression (glmnet), using the expected states of the predecessors as
        x and the current node states as y.

        args:
            alpha     value passed to glmnet as its 'alpha' argument
            nparents  number of non-zero parameters to keep; defaults to
                      self.nParents
        """
        nCases, nVariables = np.shape(self.obsData.data)
        if not nparents:
            nparents = self.nParents

        for nodeId in self.network:
            nodeObj = self.network.node[nodeId]['nodeObj']
            if nodeObj.type == 'FLUORESCENCE' or nodeObj.type == 'PERTURBATION':
                continue

            nodeObj.fitRes = list()
            preds = self.network.predecessors(nodeId)
            predIndices = self.nodeStates[0].findColIndices(preds)

            for c in range(self.nChains):
                expectedPredState = self.expectedStates[c][:, predIndices]
                x = np.column_stack((np.ones(nCases), expectedPredState))
                y = self.nodeStates[c].getValuesByCol(nodeId)

                # glmnet fails when y is constant: flip a few random cases
                rIndx = np.floor(np.random.rand(50) * nCases).astype(int)
                if np.sum(y) == nCases:
                    # every y == 1
                    y[rIndx] = 0
                elif np.sum(1 - y) == nCases:
                    # every y == 0
                    y[rIndx] = 1
                y = robjects.vectors.IntVector(y)

                # the same problem exists for constant columns of x (this
                # includes the padded column of ones)
                allOnes = np.where(np.sum(x, 0) == nCases)[0]
                for col in allOnes:
                    rIndx = np.floor(np.random.rand(3) * nCases).astype(int)
                    x[rIndx, col] = 0
                allZeros = np.where(np.sum(np.ones(np.shape(x)) - x, 0) == nCases)[0]
                for col in allZeros:
                    rIndx = np.floor(np.random.rand(3) * nCases).astype(int)
                    x[rIndx, col] = 1

                # call logistic regression using glmnet from Rpy
                fit = glmnet (x, y, alpha = alpha, family = "binomial", intercept = 0)
                nodeObj.fitRes.append(fit)

                # keep the first set of betas with at least nparents non-zero values
                a0, betaMatrix = self.parseGlmnetCoef(fit)
                for j in range(np.shape(betaMatrix)[1]):
                    if sum(betaMatrix[:, j] != 0.) >= nparents:
                        break
                if j >= len(a0):
                    j = len(a0) - 1

                myparams = betaMatrix[:, j]
                if sum(myparams != 0.) > nparents:
                    # Keep only the nparents largest magnitudes.  The argument
                    # is used here; the previous code read self.nParents and
                    # ignored an explicitly passed nparents.
                    sortedParams = sorted(np.abs(myparams))
                    myparams[np.abs(myparams) < sortedParams[-nparents]] = 0.

                nodeObj.params[c,:] = myparams

    def getStimuliSpecificNet(self, stimulus):
        """
        Build a network of antibody -> phosphorylation-node edges for the
        nodes that respond to 'stimulus'.

        A node is active when the t-test between control and stimulated
        fluorescence values gives p < 0.05.  Edges carry 'effect' ("+"/"-")
        and 'betaValue' attributes.  Nodes that are not active and have no
        successors or no predecessors are pruned repeatedly.

        Returns an nx.DiGraph.  Raises Exception when the stimulus is not a
        column of the node states.
        """
        self.stimuli = ['EGF', 'FGF1', 'HGF', 'IGF1', 'Insulin', 'NRG1', 'PBS', 'Serum']

        if not stimulus in self.nodeStates[0].getColnames():
            raise Exception("Input stimulus '" + stimulus + "' is not in the experiment data")

        stimulusCases = self.perturbData.getValuesByCol(stimulus) == 1
        # control cases received none of the known stimuli
        controlCases = np.sum(self.perturbData.getValuesByCol(self.stimuli), 1) == 0

        # identify the nodes to keep by determining if a node responds to the stimulus
        activeNodes = set()
        activeNodes.add(stimulus)
        for nodeId in self.network:
            nodeType = self.network.node[nodeId]['nodeObj'].type
            if nodeType == 'FLUORESCENCE' or nodeType == 'fluorescence':
                nodeControlValues = self.obsData.getValuesByCol(nodeId)[controlCases]
                nodeStimulValues = self.obsData.getValuesByCol(nodeId)[stimulusCases]
                ttestRes = R('t.test')(robjects.FloatVector(nodeControlValues), robjects.FloatVector(nodeStimulValues))
                pvalue = np.array(ttestRes.rx('p.value')[0])[0]
                if pvalue < 0.05:
                    activeNodes.add(self.network.predecessors(nodeId)[0])

        # Copy the network to a tmp, redirecting the edges that leave
        # activation-state nodes so that they leave the protein's antibodies.
        tmpNet = nx.DiGraph()
        for u, v in self.network.edges():
            uType = self.network.node[u]['nodeObj'].type
            vType = self.network.node[v]['nodeObj'].type
            # we are only interested in edges from a protein to an antibody
            if not ((uType == 'ACTIVATIONSTATE' or uType == 'activeState')
                    and (vType == 'PHOSPHORYLATIONSTATE' or vType == 'phosState')):
                continue

            # extract parameters associated with u and v
            vPreds = self.network.predecessors(v)
            uIndx = vPreds.index(u)
            vParams = np.sum(self.network.node[v]['nodeObj'].params, 0)
            if len(vParams) != (len(vPreds) + 1):
                raise Exception ("Bug in retrieving parameters of node v " + u)
            paramZeros = np.sum(self.network.node[v]['nodeObj'].params == 0, 0)
            if float(paramZeros[uIndx+1]) / float(self.nChains) > .9:
                # beta is zero in more than 90% of the chains: not an edge
                continue

            betaValue = vParams[uIndx+1]
            for ab in self.dictProteinToAntibody[u]:
                if ab not in self.network:
                    continue
                # sanity check on the parameters of the activation node
                uPreds = self.network.predecessors(u)
                uParams = np.mean(self.network.node[u]['nodeObj'].params, 0)
                if len(uParams) != (len(uPreds) + 1):
                    raise Exception ("Bug in retrieving parameters of node v " + u)

                # An earlier variant also required the sign of the
                # antibody -> protein parameter to agree; only the sign of
                # the protein -> phosphorylation parameter is used now.
                if betaValue > 0.:
                    tmpNet.add_edge(ab, v, effect = "+", betaValue = betaValue)
                elif betaValue < 0.:
                    tmpNet.add_edge(ab, v, effect = "-", betaValue = betaValue)

        # remove leaf nodes that are not in the activeNodes list
        while True:
            leafNodes = []
            for nodeId in tmpNet:
                if nodeId in activeNodes:
                    continue
                if len(tmpNet.successors(nodeId)) == 0 or len(tmpNet.predecessors(nodeId)) == 0:
                    leafNodes.append(nodeId)

            if len(leafNodes) == 0:
                break
            for leaf in leafNodes:
                tmpNet.remove_node(leaf)

        return tmpNet

    def toGraphML(self, filename):
        """
        Write the edges of the network to 'filename' in GraphML format.

        The edges are copied into a fresh graph so that the 'nodeObj' node
        attributes are not part of the export.
        """
        tmpNet = nx.DiGraph()
        # add_edge() needs the two end points as separate arguments; the
        # previous call passed each edge tuple as one argument and failed
        tmpNet.add_edges_from(self.network.edges())

        nx.write_graphml(tmpNet, filename, encoding='utf-8', prettyprint=True)
def calcTCI(mutcnaMatrixFN, degMatrixFN, alphaNull=[1, 1],
            alphaIJKList=[2, 1, 1, 2], v0=0.2, dictGeneLength=None,
            outputPath=".", opFlag=None):
    """
    calcTCI(mutcnaMatrixFN, degMatrixFN, alphaNull, alphaIJKList, v0,
            dictGeneLength, outputPath, opFlag)

    Calculate the causal scores between each pair of SGA and DEG observed in
    each tumor, and write one posterior matrix per tumor to
    "<tumor name>-mut-vs-DEG-posterior.csv" via NamedMatrix.writeToText.

    Inputs:
        mutcnaMatrixFN      A file containing a N x G binary matrix with the
                            mutation and CNA data of all tumors.  N is the
                            number of tumors and G is the total number of
                            unique genes.  For a tumor, genes that have SGAs
                            are indicated by "1"s and "0" otherwise.

        degMatrixFN         A file containing a N x G' binary matrix
                            representing DEG status.  A "1" indicates that a
                            gene is differentially expressed in a tumor.

        alphaNull           A list of Dirichlet hyperparameters passed to
                            calcNullF to score the null (non-SGA) cause.

        alphaIJKList        A list of Dirichlet hyperparameters used to
                            calculate the prior of the conditional
                            probability parameters.
                            alphaIJK[0]: mut == 0 && deg == 0;
                            alphaIJK[1]: mut == 0 && deg == 1;
                            alphaIJK[2]: mut == 1 && deg == 0;
                            alphaIJK[3]: mut == 1 && deg == 1

        v0                  A float scalar giving the prior probability that
                            a DEG is caused by a non-SGA factor.

        dictGeneLength      A dictionary keeping the length of each of the G
                            genes in the mutcnaMatrix.  Required.

        outputPath          Destination handed to writeToText for the
                            per-tumor result files.

        opFlag              When set, the per-tumor mutation matrix is
                            replaced by the result of createComb(matrix,
                            opFlag) and priors come from calcLnCombPrior
                            instead of calcLnPrior.

    The function returns nothing.  On invalid input (unreadable matrix file,
    mismatching tumor names, missing dictGeneLength) it prints a message and
    terminates the process through sys.exit with a non-zero status.
    """
    # Read in data in the form of NamedMatrix.  Only ordinary errors are
    # intercepted, so Ctrl-C or a SystemExit raised while loading is not
    # reported as an import failure.
    try:
        mutcnaMatrix = NamedMatrix(mutcnaMatrixFN)
    except Exception:
        print("Failed to import data matrix %s\n" % mutcnaMatrixFN)
        sys.exit(1)

    try:
        degMatrix = NamedMatrix(degMatrixFN)
    except Exception:
        print("Failed to import data matrix %s\n" % degMatrixFN)
        sys.exit(1)

    if degMatrix.getRownames() != mutcnaMatrix.getRownames():
        print("The tumors for mutcnaMatrix and degMatrix do not fully overlap!")
        sys.exit(1)

    if not dictGeneLength:
        print("Gene length dictionary not provided, quit\n")
        sys.exit(1)

    # Iterate through each tumor to infer the causal relationship between
    # each pair of mut - deg.
    tumorNames = degMatrix.getRownames()
    nTumors = mutcnaMatrix.shape()[0]

    mutGeneNames = mutcnaMatrix.getColnames()
    degGeneNames = degMatrix.getColnames()

    for t in range(nTumors):
        # progress indicator
        if t % 50 == 0:
            print("Processed %s tumors" % str(t))

        # Genes mutated in tumor t, and the all-tumor sub-matrix restricted
        # to those genes.
        tumormutGeneIndx = [i for i, j in enumerate(mutcnaMatrix.data[t, :])
                            if j == 1]
        tumorMutGenes = [mutGeneNames[i] for i in tumormutGeneIndx]
        tumorMutMatrix = mutcnaMatrix.data[:, tumormutGeneIndx]

        if opFlag:
            # Replace the single-gene matrix by the combined-mutation matrix.
            tmpNamedMat = NamedMatrix(npMatrix=tumorMutMatrix,
                                      colnames=tumorMutGenes,
                                      rownames=tumorNames)
            tumorNamedMatrix = createComb(tmpNamedMat, opFlag)
            if not tumorNamedMatrix:
                # createComb produced nothing usable for this tumor
                # (presumably no joint mutation is frequent enough).
                continue
            tumorMutGenes = tumorNamedMatrix.colnames
            tumorMutMatrix = tumorNamedMatrix.data
            lntumorMutPriors = calcLnCombPrior(tumorMutGenes, dictGeneLength,
                                               v0)
        else:
            lntumorMutPriors = calcLnPrior(tumorMutGenes, dictGeneLength, v0)

        # Row labels of the result: the candidate causes plus 'A0' for the
        # null cause.  A new list is built so that the colnames of the
        # combined matrix are not modified through an alias.
        posteriorRowNames = list(tumorMutGenes) + ['A0']

        # Collect data related to the DEGs of tumor t.
        degGeneIndx = [i for i, j in enumerate(degMatrix.data[t, :])
                       if j == 1]
        tumorDEGGenes = [degGeneNames[i] for i in degGeneIndx]
        nTumorDEGs = len(degGeneIndx)
        tumorDEGMatrix = degMatrix.data[:, degGeneIndx]

        # Pair-wise scores, with the null-cause scores stacked as last row.
        tumorLnFScore = calcF(tumorMutMatrix, tumorDEGMatrix, alphaIJKList)
        nullFscore = calcNullF(tumorDEGMatrix, alphaNull)
        tumorLnFScore = np.vstack((tumorLnFScore, nullFscore))

        # Tile the log priors so that every DEG column receives the same
        # prior vector (assumes the prior vector has one entry per row of
        # tumorLnFScore, null cause included -- TODO confirm in calcLnPrior).
        tumorMutPriorMatrix = np.tile(lntumorMutPriors, (nTumorDEGs, 1)).T
        lnFScore = add(tumorLnFScore, tumorMutPriorMatrix)

        # Normalise each column in log space; -inf entries contribute
        # nothing and are skipped.
        columnAccumLogSum = np.zeros(nTumorDEGs)
        for col in range(nTumorDEGs):
            currLogSum = -np.inf
            for j in range(lnFScore.shape[0]):
                if lnFScore[j, col] == -np.inf:
                    continue
                currLogSum = logSum(currLogSum, lnFScore[j, col])
            columnAccumLogSum[col] = currLogSum

        normalizer = np.tile(columnAccumLogSum, (lnFScore.shape[0], 1))
        posterior = np.exp(add(lnFScore, -normalizer))

        # Write out the results.
        tumorPosterior = NamedMatrix(npMatrix=posterior,
                                     rownames=posteriorRowNames,
                                     colnames=tumorDEGGenes)
        tumorPosterior.writeToText(
            outputPath,
            filename=tumorNames[t] + "-mut-vs-DEG-posterior.csv")